Upload TaiVisionForCausalLM

Browse files

Files changed (4) hide show

config.json +6 -1
generation_config.json +6 -0
model.safetensors +3 -0
modeling_taivisionlm.py +472 -0

config.json CHANGED Viewed

@@ -1,6 +1,10 @@
 {
   "auto_map": {
-    "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig"
   },
   "hidden_size": 2048,
   "ignore_index": -100,
@@ -21,6 +25,7 @@
     "torch_dtype": "bfloat16",
     "vocab_size": 32001
   },
   "transformers_version": "4.44.0",
   "vision_config": {
     "model_type": "siglip_vision_model",

 {
+  "architectures": [
+    "TaiVisionForCausalLM"
+  ],
   "auto_map": {
+    "AutoConfig": "configuration_taivisionlm.TaiVisionLMConfig",
+    "AutoModelForCausalLM": "modeling_taivisionlm.TaiVisionForCausalLM"
   },
   "hidden_size": 2048,
   "ignore_index": -100,
     "torch_dtype": "bfloat16",
     "vocab_size": 32001
   },
+  "torch_dtype": "float32",
   "transformers_version": "4.44.0",
   "vision_config": {
     "model_type": "siglip_vision_model",

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.44.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e3c91245701f070448659cda849d90ef35ea419ad8aa53c459b20a7d516df00
+size 4806424752

modeling_taivisionlm.py ADDED Viewed

	@@ -0,0 +1,472 @@

+"""PyTorch TaiVisionLM"""
+import torch
+from transformers import PreTrainedModel, AutoModel, AutoModelForCausalLM
+from transformers.utils import logging, add_start_docstrings, ModelOutput
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+from torch import nn
+from transformers.cache_utils import Cache, StaticCache
+logger = logging.get_logger(__name__)
+from .configuration_taivisionlm import TaiVisionLMConfig
+_CONFIG_FOR_DOC = "TaiVisionLMConfig"
+@dataclass
+class TaiVisionCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for TaiVision language model (or autoregressive) outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+class TaiVisionMultiModalProjector(nn.Module):
+    """
+    Multimodal projector that cast the image features into the same dimension space as the language model
+    """
+    def __init__(self, config: TaiVisionLMConfig, dropout=0.1):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(config.vision_config.projection_dim, 4*config.vision_config.projection_dim, bias=True),
+            nn.GELU(),
+            nn.Linear(4*config.vision_config.projection_dim, config.hidden_size, bias=True),
+            nn.Dropout(dropout)
+        )
+    def forward(self, image_features):
+        hidden_states = self.net(image_features).to(image_features.dtype)
+        return hidden_states
+TRAVISIONLM_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`TaiVisionLMConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare TaiVision Model outputting raw hidden-states without any specific head on top.",
+    TRAVISIONLM_START_DOCSTRING,
+)
+class TaiVisionPreTrainedModel(PreTrainedModel):
+    config_class = TaiVisionLMConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["TaiVisionMultiModalProjector"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def _init_weights(self, module):
+        # Do NOT init the weights of the model using this class call, this is a ported version,
+        # hence not intended to be trained from scratch.
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    @property
+    def _supports_sdpa(self):
+        """
+        Retrieve language_model's attribute to check whether the model supports
+        SDPA or not.
+        """
+        return self.language_model._supports_sdpa
+@add_start_docstrings(
+    """The TaiVisionLM model which consists of a vision backbone and a language model.""",
+    TRAVISIONLM_START_DOCSTRING,
+)
+class TaiVisionForCausalLM(TaiVisionPreTrainedModel):
+    def __init__(self, config: TaiVisionLMConfig):
+        super(TaiVisionForCausalLM, self).__init__(config)
+        self.vocab_size = config.text_config.vocab_size
+        self.pad_token_id = -1 if config.pad_token_id == None else config.pad_token_id
+        self._attn_implementation = config._attn_implementation
+        self.gradient_checkpointing = False
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.vision_projector = TaiVisionMultiModalProjector(config)
+        language_model = AutoModelForCausalLM.from_config(
+            config=config.text_config, attn_implementation=self._attn_implementation
+        )
+        if language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
+        self.language_model = language_model
+        self.post_init()
+    def load_pretrained(self):
+        '''
+        load the pretrained weights for language model and vision model
+        '''
+        import transformers
+        language_model = AutoModelForCausalLM.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
+        if language_model.vocab_size != self.vocab_size:
+            print("vocab size mismatch, resize the token embeddings for the pretained language model")
+            language_model.resize_token_embeddings(self.vocab_size)
+        self.language_model = language_model
+        vision_model = transformers.SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
+        self.vision_tower = vision_model
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_input_embeddings with PaliGemma->TaiVisionLM
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.set_input_embeddings with PaliGemma->TaiVisionLM
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_output_embeddings with PaliGemma->TaiVisionLM
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.set_output_embeddings with PaliGemma->TaiVisionLM
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.set_decoder with PaliGemma->TaiVisionLM
+    def set_decoder(self, decoder):
+        self.language_model.set_decoder(decoder)
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.get_decoder with PaliGemma->TaiVisionLM
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration.tie_weights with PaliGemma->TaiVisionLM
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        # TODO: config.vocab_size is deprecated and will be removed in v4.43.
+        # `resize_token_embeddings` should work from `modeling_utils.py``
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+    # Copied from transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration._merge_input_ids_with_image_features with PaliGemma->TaiVisionLM
+    def _update_causal_mask(
+        self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False
+    ):
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        dtype, device = inputs_embeds.dtype, inputs_embeds.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = inputs_embeds.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else cache_position[0] + sequence_length + 1
+            )
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+            if sequence_length != 1:
+                if is_training:
+                    causal_mask = torch.triu(causal_mask, diagonal=1)
+                else:
+                    causal_mask = torch.zeros_like(causal_mask)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+            # we are training thus we need to create a full mask on the image + prefix but causal on suffix
+            if is_training:
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
+                )
+        return causal_mask
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TaiVisionCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
+        >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
+        >>> prompt = "answer en Where is the cow standing?"
+        >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "answer en Where is the cow standing?\nbeach"
+        ```"""
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if pixel_values is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
+            )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        is_training = token_type_ids is not None and labels is not None
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0) + 1  # Paligemma positions are 1-indexed
+        # Merge text and images
+        if pixel_values is not None:
+            image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
+            selected_image_feature = image_outputs.last_hidden_state
+            image_features = self.vision_projector(selected_image_feature)
+            image_features = image_features / (self.config.hidden_size**0.5)
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+            if inputs_embeds[special_image_mask].numel() != image_features.numel():
+                image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
+                raise ValueError(
+                    f"Number of images does not match number of special image tokens in the input text. "
+                    f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
+                    "tokens from image embeddings."
+                )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # mask out pad-token-ids in labels for BC
+        if labels is not None and self.pad_token_id in labels:
+            logger.warning_once(
+                "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
+                "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
+            )
+            labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
+        causal_mask = self._update_causal_mask(
+            attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training
+        )
+        outputs = self.language_model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        logits = outputs.logits
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :]
+            shift_labels = labels[..., 1:]
+            if attention_mask is not None:
+                # we use the input attention mask to shift the logits and labels, because it is 2D.
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+            else:
+                shift_logits = shift_logits.contiguous()
+                shift_labels = shift_labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+            flat_labels = shift_labels.view(-1).to(shift_logits.device)
+            loss = loss_fct(flat_logits, flat_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return TaiVisionCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        model_inputs["token_type_ids"] = token_type_ids
+        # position_ids in Paligemma are 1-indexed
+        if model_inputs.get("position_ids") is not None:
+            model_inputs["position_ids"] += 1
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+        return model_inputs
+if __name__ == '__main__':
+    import transformers
+    config = transformers.AutoConfig.from_pretrained("benchang1110/TaiVision-base",trust_remote_code=True)
+    model = TaiVisionForCausalLM(config).to("cuda")
+    print(model)
+    model.save_pretrained
+    # Test forward
+    import torch
+    from PIL import Image
+    import requests
+    # Initialize processor
+    processor = transformers.AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
+    # Load image
+    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
+    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+    # Define prompt and label
+    prompt = "What is the color of the car?"
+    label = "I am fine, thank you."
+    # Process inputs
+    inputs = processor(prompts=prompt,images=image, return_tensors="pt", padding=False, max_length=512).to('cuda')
+    outputs = model.generate(**inputs, max_length=512, do_sample=True, pad_token_id=processor.tokenizer.pad_token_id)
+    print(processor.decode(outputs[0], skip_special_tokens=True))