import re

import torch
import torch.nn as nn
from transformers import CLIPVisionModel

|
class ClipVisionTransformer(CLIPVisionModel): |
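    """CLIP ViT wrapper whose forward pass returns penultimate-layer patch
    features with the CLS token removed, a common choice for feeding a
    vision-language projector."""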

    def forward(
        self,
        pixel_values=None,
        output_attentions=None,
        output_hidden_states=True,
        return_dict=None,
    ):
        r"""
        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`:
            the patch embeddings from the second-to-last hidden layer, with the
            CLS token removed.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor

        >>> model = ClipVisionTransformer.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> patch_features = model(**inputs)  # penultimate-layer patches, CLS dropped
        >>> patch_features.shape  # 7x7 = 49 patches for ViT-B/32 at 224x224
        torch.Size([1, 49, 768])
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        image_forward_outs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Take the second-to-last hidden layer and drop the leading CLS token,
        # returning only the patch embeddings.
        return image_forward_outs.hidden_states[-2][:, 1:]


def create_clip_vit(
    precision="fp16", pretrained_model_name_or_path: str = "", low_cpu_mem_usage=False, **kwargs
):
    # Note: "bf16" also contains "16", so it must be checked before the fp16 branch.
    if precision == "bf16":
        dtype = torch.bfloat16
    elif "16" in precision:
        dtype = torch.float16
    else:
        dtype = torch.float32
    model = ClipVisionTransformer.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,
        ignore_mismatched_sizes=True,
    ).cuda()
    return model


def build_projector(
    type: str = "linear", input_hidden_size: int = 1024, output_hidden_size: int = 1024
):
    """Build a vision projector.

    Args:
        type: projector type ("linear", "mlpNx_gelu" such as "mlp2x_gelu", or "identity")
        input_hidden_size: input hidden size from the vision adapter
        output_hidden_size: output hidden size fed to the LLM

    Returns:
        vision projector module (nn.Module)
    """
    if type == "linear":
        return nn.Linear(input_hidden_size, output_hidden_size)

    # "mlpNx_gelu" builds an N-layer MLP with GELU activations between the
    # linear layers, e.g. "mlp2x_gelu" -> Linear -> GELU -> Linear.
    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", type)
    if mlp_gelu_match:
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(input_hidden_size, output_hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(output_hidden_size, output_hidden_size))
        return nn.Sequential(*modules)

    if type == "identity":
        return nn.Identity()

    raise ValueError(f"Unknown projector type: {type}")
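

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a CUDA device and Hub access.
    # Sizes are illustrative: ViT-L/14 patch features are 1024-d, and 4096
    # stands in for a hypothetical LLM hidden size.
    from PIL import Image
    import requests
    from transformers import AutoProcessor

    vit = create_clip_vit("fp16", "openai/clip-vit-large-patch14")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]

    feats = vit(pixel_values.to(device="cuda", dtype=vit.dtype))
    projector = build_projector("mlp2x_gelu", 1024, 4096).to(device="cuda", dtype=feats.dtype)
    print(feats.shape)             # torch.Size([1, 256, 1024])
    print(projector(feats).shape)  # torch.Size([1, 256, 4096])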