Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 18

Commit

c80f2b2

verified ·

1 Parent(s): 6fedc6b

Update custom_vlm.py

Browse files

Files changed (1) hide show

custom_vlm.py +27 -48

custom_vlm.py CHANGED Viewed

@@ -6,10 +6,6 @@ from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
 class VLMConfig(PretrainedConfig):
-    """
-    Configuration class for our custom from-scratch Vision Language Model.
-    This holds the configurations for the sub-modules.
-    """
     model_type = "custom_scratch_vlm"
     def __init__(
@@ -22,12 +18,10 @@ class VLMConfig(PretrainedConfig):
         self.vision_config = AutoConfig.from_pretrained(vision_model_name)
         self.language_config = AutoConfig.from_pretrained(language_model_name)
         self.projection_dim = projection_dim
-        # Make language model config aware of vocab size change if tokenizer is updated
         self.language_config.vocab_size = kwargs.get("vocab_size", self.language_config.vocab_size)
         super().__init__(**kwargs)
 class VLMProjector(nn.Module):
-    """Simple MLP to project vision features into the language model's embedding space."""
     def __init__(self, config: VLMConfig):
         super().__init__()
         self.linear1 = nn.Linear(config.vision_config.hidden_size, config.projection_dim)
@@ -38,23 +32,15 @@ class VLMProjector(nn.Module):
         return self.linear2(self.gelu(self.linear1(x)))
 class CustomScratchVLM(PreTrainedModel):
-    """
-    A VLM built from randomly initialized components.
-    """
     config_class = VLMConfig
     def __init__(self, config: VLMConfig):
         super().__init__(config)
         print("Initializing model components from scratch using their configurations...")
-        # 1. Initialize models from their CONFIGURATIONS ONLY (random weights)
         self.vision_tower = AutoModel.from_config(config.vision_config)
         self.language_model = AutoModelForCausalLM.from_config(config.language_config)
-        # 2. Initialize our custom projector
         self.multi_modal_projector = VLMProjector(config)
-        # This will be used to find where image features should be inserted
-        self.image_token_id = -1 # Placeholder, will be set after tokenizer is prepared
     def forward(
         self,
@@ -64,42 +50,29 @@ class CustomScratchVLM(PreTrainedModel):
         labels: torch.LongTensor = None,
         **kwargs
     ):
-        # Step 1: Get image embeddings from the vision tower
         image_features = self.vision_tower(pixel_values).last_hidden_state
-        # Step 2: Project image patch embeddings to the language model's input space
         image_embeds = self.multi_modal_projector(image_features)
-        # Step 3: Get text embeddings
         text_embeds = self.language_model.get_input_embeddings()(input_ids)
-        # Step 4: Find placeholder token indices and replace with image embeddings
-        batch_size = input_ids.shape[0]
-        # Find where the image token placeholder is in the input_ids
-        # It's assumed there is one image token per sequence
-        image_token_indices = torch.where(input_ids == self.image_token_id)
         final_embeds = text_embeds.clone()
         # Replace each placeholder with the corresponding full sequence of image embeddings
-        for i in range(batch_size):
-            # The start index for replacement in the text embeddings
-            start_idx = image_token_indices[1][i]
-            # The corresponding image embeddings for this item in the batch
-            img_embed_item = image_embeds[i]
-            # Construct the new embedding sequence
-            # 1. Part of text before the image
-            pre_img_embed = final_embeds[i, :start_idx]
-            # 2. Part of text after the image
-            post_img_embed = final_embeds[i, start_idx + 1:] # +1 to skip the placeholder
-            # Concatenate them all
-            final_embeds[i] = torch.cat(
-                [pre_img_embed, img_embed_item, post_img_embed], dim=0
-            )
-        # Step 5: Pass combined embeddings to the language model
         outputs = self.language_model(
             inputs_embeds=final_embeds,
             attention_mask=attention_mask,
@@ -108,17 +81,23 @@ class CustomScratchVLM(PreTrainedModel):
         )
         return outputs
-    def generate(self, pixel_values, prompt_ids, **kwargs):
-        """Custom generate function to handle multimodal input."""
-        # This is a simplified generate function. More robust implementations are complex.
         self.eval()
         with torch.no_grad():
             image_features = self.vision_tower(pixel_values).last_hidden_state
             image_embeds = self.multi_modal_projector(image_features)
-            text_embeds = self.language_model.get_input_embeddings()(prompt_ids)
-            # Combine embeddings (simple concatenation for generation)
             inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)
-            output_ids = self.language_model.generate(inputs_embeds=inputs_embeds, **kwargs)
         return output_ids

 from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
 class VLMConfig(PretrainedConfig):
     model_type = "custom_scratch_vlm"
     def __init__(
         self.vision_config = AutoConfig.from_pretrained(vision_model_name)
         self.language_config = AutoConfig.from_pretrained(language_model_name)
         self.projection_dim = projection_dim
         self.language_config.vocab_size = kwargs.get("vocab_size", self.language_config.vocab_size)
         super().__init__(**kwargs)
 class VLMProjector(nn.Module):
     def __init__(self, config: VLMConfig):
         super().__init__()
         self.linear1 = nn.Linear(config.vision_config.hidden_size, config.projection_dim)
         return self.linear2(self.gelu(self.linear1(x)))
 class CustomScratchVLM(PreTrainedModel):
     config_class = VLMConfig
     def __init__(self, config: VLMConfig):
         super().__init__(config)
         print("Initializing model components from scratch using their configurations...")
         self.vision_tower = AutoModel.from_config(config.vision_config)
         self.language_model = AutoModelForCausalLM.from_config(config.language_config)
         self.multi_modal_projector = VLMProjector(config)
+        self.image_token_id = -1
     def forward(
         self,
         labels: torch.LongTensor = None,
         **kwargs
     ):
         image_features = self.vision_tower(pixel_values).last_hidden_state
         image_embeds = self.multi_modal_projector(image_features)
         text_embeds = self.language_model.get_input_embeddings()(input_ids)
         final_embeds = text_embeds.clone()
         # Replace each placeholder with the corresponding full sequence of image embeddings
+        for i in range(input_ids.shape[0]):
+            image_token_idx = torch.where(input_ids[i] == self.image_token_id)[0]
+            if image_token_idx.numel() == 0: continue # Skip if no image token found
+            image_token_idx = image_token_idx[0]
+            pre_img_embed = final_embeds[i, :image_token_idx]
+            post_img_embed = final_embeds[i, image_token_idx + 1:]
+            # Combine parts
+            combined = torch.cat([pre_img_embed, image_embeds[i], post_img_embed], dim=0)
+            # Since lengths can vary, we need to ensure it fits back.
+            # The preprocessor now handles creating correctly sized masks/labels.
+            final_embeds[i] = combined
         outputs = self.language_model(
             inputs_embeds=final_embeds,
             attention_mask=attention_mask,
         )
         return outputs
+    def generate(self, pixel_values, input_ids, attention_mask, **kwargs):
+        """Custom generate function to handle multimodal input for inference."""
         self.eval()
         with torch.no_grad():
             image_features = self.vision_tower(pixel_values).last_hidden_state
             image_embeds = self.multi_modal_projector(image_features)
+            text_embeds = self.language_model.get_input_embeddings()(input_ids)
             inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)
+            # Create a combined attention mask for generation
+            image_attention_mask = torch.ones(image_embeds.shape[:2], dtype=torch.long, device=self.device)
+            combined_attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
+            output_ids = self.language_model.generate(
+                inputs_embeds=inputs_embeds,
+                attention_mask=combined_attention_mask,
+                **kwargs
+            )
         return output_ids