elysium_7b / build_vit.py
sty-yyj's picture
Upload ElysiumForCausalLM
5284c75 verified
import re
from transformers import CLIPVisionModel
import torch
import torch.nn as nn
class ClipVisionTransformer(CLIPVisionModel):
    """CLIP vision encoder that yields per-token features instead of a model output object.

    ``forward`` returns the hidden states of the second-to-last layer with the
    first sequence position removed, following the feature selection used in
    LLaVA.
    """

    def forward(
        self,
        pixel_values=None,
        output_attentions=None,
        output_hidden_states=True,
        return_dict=None,
    ):
        """Encode ``pixel_values`` and return second-to-last-layer token features.

        Args:
            pixel_values: image batch forwarded unchanged to ``self.vision_model``.
            output_attentions: forwarded unchanged to ``self.vision_model``.
            output_hidden_states: kept for signature compatibility only. Hidden
                states are always requested because the return value is read
                from them.
            return_dict: kept for signature compatibility only. The inner call
                always asks for a dict-style output because the result is
                accessed through its ``hidden_states`` attribute.

        Returns:
            ``hidden_states[-2][:, 1:]``: the second-to-last entry of the
            hidden-state tuple with index 0 along dimension 1 dropped
            (presumably the class token that CLIP prepends -- confirm against
            the vision embeddings in use).
        """
        # Both flags are pinned: passing the caller's values through made
        # `hidden_states` None (output_hidden_states falsy) or turned the
        # output into a plain tuple (return_dict falsy), and either case broke
        # the attribute access below.
        image_forward_outs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=True,
        )
        # Use second to last layer as in LLaVA; skip the first token.
        return image_forward_outs.hidden_states[-2][:, 1:]
def create_clip_vit(
    precision="fp16", pretrained_model_name_or_path: str = "", low_cpu_mem_usage=False, **kwargs
):
    """Load a ``ClipVisionTransformer`` checkpoint and move it to the GPU when possible.

    Args:
        precision: precision tag. Any value containing ``"16"`` selects
            ``torch.float16`` (note that this includes ``"bf16"``); every other
            value selects ``torch.float32``.
        pretrained_model_name_or_path: checkpoint identifier or path handed to
            ``from_pretrained``.
        low_cpu_mem_usage: forwarded to ``from_pretrained``.
        **kwargs: accepted so callers can pass extra configuration entries;
            they are not used.

    Returns:
        The loaded model, on the current CUDA device when CUDA is available,
        otherwise on whatever device ``from_pretrained`` left it.
    """
    import warnings  # local import keeps this block independent of the file header

    dtype = torch.float16 if "16" in precision else torch.float32
    model = ClipVisionTransformer.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,
        ignore_mismatched_sizes=True,
    )
    if torch.cuda.is_available():
        return model.cuda()
    # An unconditional `.cuda()` raised on hosts without CUDA; return the model
    # as loaded and tell the caller instead of failing.
    warnings.warn(
        "CUDA is not available; returning the CLIP vision model without moving it to a GPU.",
        RuntimeWarning,
        stacklevel=2,
    )
    return model
def build_projector(
    type: str = "linear", input_hidden_size: int = 1024, output_hidden_size: int = 1024
):
    """Create the module that maps vision features into the LLM hidden size.

    Args:
        type: projector kind. ``"linear"`` gives a single ``nn.Linear``,
            ``"identity"`` gives ``nn.Identity`` and ``"mlp<N>x_gelu"`` gives a
            stack of N linear layers with ``nn.GELU`` between consecutive
            layers.
        input_hidden_size: feature width entering the projector.
        output_hidden_size: feature width leaving the projector; also used as
            the width of every layer after the first in the MLP variant.

    Returns:
        The projector as an ``nn.Module``.

    Raises:
        ValueError: if ``type`` is none of the supported kinds.
    """
    if type == "identity":
        return nn.Identity()

    if type == "linear":
        return nn.Linear(input_hidden_size, output_hidden_size)

    depth_match = re.match(r"^mlp(\d+)x_gelu$", type)
    if depth_match is None:
        raise ValueError(f"Unknown projector type: {type}")

    num_linear = int(depth_match.group(1))
    # The first layer changes the width; each further layer keeps the output
    # width and is preceded by a GELU.
    layers = [nn.Linear(input_hidden_size, output_hidden_size)]
    for _ in range(num_linear - 1):
        layers.extend([nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)])
    return nn.Sequential(*layers)