import re

import torch
import torch.nn as nn
from transformers import CLIPVisionModel


class ClipVisionTransformer(CLIPVisionModel):
    def forward(
        self,
        pixel_values=None,
        output_attentions=None,
        output_hidden_states=True,
        return_dict=None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        image_forward_outs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Use the second-to-last layer, as in LLaVA.
        return image_forward_outs.hidden_states[-2][:, 1:]
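
# Note: the forward above relies on output_hidden_states=True (its default) so
# that hidden_states is populated. hidden_states[-2] picks the penultimate
# transformer layer, and [:, 1:] drops the leading CLS token so only per-patch
# embeddings remain, i.e. the "patch features from the second-to-last layer"
# convention that LLaVA uses for its vision tower.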


def create_clip_vit(
    precision="fp16",
    pretrained_model_name_or_path: str = "",
    low_cpu_mem_usage=False,
    **kwargs,
):
    # Any precision string containing "16" (including "bf16") maps to float16
    # here; only fp16 vs fp32 are actually distinguished.
    dtype = torch.float16 if "16" in precision else torch.float32
    model = ClipVisionTransformer.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,
        ignore_mismatched_sizes=True,
    ).cuda()
    return model
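
# Usage sketch (illustrative, not part of the original module): the checkpoint
# name below is an assumption; any CLIPVisionModel-compatible checkpoint should
# work, and a CUDA device is required because of the .cuda() call above.
#
#   vit = create_clip_vit(
#       precision="fp16",
#       pretrained_model_name_or_path="openai/clip-vit-large-patch14",
#   )
#   feats = vit(pixel_values)  # (batch, num_patches, hidden_size), CLS dropped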


def build_projector(
    type: str = "linear",
    input_hidden_size: int = 1024,
    output_hidden_size: int = 1024,
):
    """Build the vision projector.

    Args:
        type: projector type ("linear", "mlp{N}x_gelu", or "identity")
        input_hidden_size: hidden size of the incoming vision features
        output_hidden_size: hidden size expected by the LLM

    Returns:
        The vision projector (nn.Module).
    """
    if type == "linear":
        return nn.Linear(input_hidden_size, output_hidden_size)

    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(input_hidden_size, output_hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(output_hidden_size, output_hidden_size))
        return nn.Sequential(*modules)

    if type == "identity":
        return nn.Identity()

    raise ValueError(f"Unknown projector type: {type}")
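
# Usage sketch (illustrative): "mlp2x_gelu" with these sizes builds
# Linear(1024, 4096) -> GELU -> Linear(4096, 4096); 4096 stands in for a
# hypothetical LLM hidden size, not a value mandated by this module.
#
#   projector = build_projector(
#       type="mlp2x_gelu", input_hidden_size=1024, output_hidden_size=4096
#   )
#   llm_inputs = projector(vision_features)  # (batch, num_patches, 4096)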