Update train_vlm.py
train_vlm.py  +58 -79

train_vlm.py  CHANGED
@@ -7,147 +7,126 @@ from transformers import (
 )
 from datasets import load_dataset
 from PIL import Image
-
-# Import our custom VLM architecture
 from custom_vlm import CustomScratchVLM, VLMConfig

-# --- Tokenizer and Processor Setup ---
 def get_processors_and_model(config):
-    """Initializes tokenizer, image processor, and the custom VLM."""
-    # Using the sub-model names from our config
     vision_model_name = config.vision_config._name_or_path
     language_model_name = config.language_config._name_or_path

-    # 1. Load standard processors for the chosen sub-models
     image_processor = AutoImageProcessor.from_pretrained(vision_model_name)
     tokenizer = AutoTokenizer.from_pretrained(language_model_name)

-    # 2. Add a special token for the image placeholder
     IMAGE_TOKEN = "<IMAGE>"
     tokenizer.add_special_tokens({"additional_special_tokens": [IMAGE_TOKEN]})
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    # 3. Update the VLM config with the new vocab size
     config.language_config.vocab_size = len(tokenizer)
-
-    # 4. Instantiate our from-scratch model
     model = CustomScratchVLM(config)
     model.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

     return image_processor, tokenizer, model

-
-def load_and_prepare_dataset(stage, image_processor, tokenizer, split="train[:50]"):
     dataset = load_dataset("HuggingFaceM4/TextVQA", split=split)

     IMAGE_TOKEN = "<IMAGE>"
-
-    NUM_IMAGE_PATCHES = (image_processor.size['height'] // image_processor.patch_size)
-

     def preprocess_function(examples):
-        # This function is now much more complex
         image = examples['image'].convert("RGB")
         question = examples.get('question', '')
         answer = examples['answers'][0] if examples.get('answers') else "unknown"

-        # Stage-specific formatting
         if stage == 1:
-            prompt = f"USER: {IMAGE_TOKEN}\
         elif stage == 2:
-            prompt = f"USER: {IMAGE_TOKEN}\
-        else:
-            prompt = f"USER: {IMAGE_TOKEN}\

-        # Tokenize text
         full_text = prompt + tokenizer.eos_token
-        tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

-        #
-

-        #
-        # Find where the assistant's response starts
         try:
-
-
-
-
-
-
-
-
-
-            if
-                labels[
-
-
-

-        # Process image
         pixel_values = image_processor(image, return_tensors="pt").pixel_values

-        # The model's forward pass expects the placeholder to be replaced by N patches.
-        # But for input_ids, we only have one placeholder. The attention mask needs to
-        # be expanded to account for the N patches that will replace the single token.
-        image_token_idx = torch.where(tokenized.input_ids == model.image_token_id)[1]
-
-        # Create a new attention mask
-        new_attention_mask = torch.cat([
-            tokenized.attention_mask[:, :image_token_idx],
-            torch.ones(1, NUM_IMAGE_PATCHES, dtype=torch.long),
-            tokenized.attention_mask[:, image_token_idx+1:]
-        ], dim=1)
-
         return {
-            "pixel_values": pixel_values.squeeze(),
-            "input_ids":
-            "attention_mask":
-            "labels":
         }

-

-# --- Training ---
 def train_vlm_stage(stage, output_dir, resume_from=None):
     print(f"🚀 Starting VLM Stage {stage} Training FROM SCRATCH...")
     device = "cuda" if torch.cuda.is_available() else "cpu"

-    # 1. Get our custom model and its processors
     vlm_config = VLMConfig()
     image_processor, tokenizer, model = get_processors_and_model(vlm_config)
     model.to(device)

-
-    tokenized_dataset = load_and_prepare_dataset(stage, image_processor, tokenizer)

-    is_cuda = (device == "cuda")
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
-        num_train_epochs=5,
         learning_rate=5e-5,
-        fp16=
-        bf16=
         save_strategy="epoch",
-        logging_steps=5,
-        report_to="none",
-        optim="adamw_torch",
         remove_unused_columns=False,
     )

-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset,
-        data_collator=DefaultDataCollator()
-    )
-
-    print("--- Starting Trainer on From-Scratch Model ---")
     trainer.train(resume_from_checkpoint=resume_from)
-    print("--- Training Finished ---")

     model.save_pretrained(output_dir)
     image_processor.save_pretrained(output_dir)
 )
 from datasets import load_dataset
 from PIL import Image
 from custom_vlm import CustomScratchVLM, VLMConfig

 def get_processors_and_model(config):
     vision_model_name = config.vision_config._name_or_path
     language_model_name = config.language_config._name_or_path

     image_processor = AutoImageProcessor.from_pretrained(vision_model_name)
     tokenizer = AutoTokenizer.from_pretrained(language_model_name)

     IMAGE_TOKEN = "<IMAGE>"
     tokenizer.add_special_tokens({"additional_special_tokens": [IMAGE_TOKEN]})
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

     config.language_config.vocab_size = len(tokenizer)
     model = CustomScratchVLM(config)
     model.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

     return image_processor, tokenizer, model
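Note: custom_vlm.py itself is not part of this diff. get_processors_and_model() only assumes that VLMConfig exposes vision_config / language_config sub-configs carrying _name_or_path and vocab_size, and that CustomScratchVLM accepts an image_token_id attribute. A minimal sketch of that assumed interface (the default checkpoints below are placeholders, not values from this Space):

# Sketch only: the shape of VLMConfig that get_processors_and_model() relies on.
from transformers import AutoConfig, PretrainedConfig

class VLMConfig(PretrainedConfig):
    model_type = "custom_scratch_vlm"

    def __init__(self,
                 vision_model="google/vit-base-patch16-224-in21k",   # assumption
                 language_model="gpt2",                              # assumption
                 **kwargs):
        super().__init__(**kwargs)
        # Sub-configs keep _name_or_path, so the training script can load the matching
        # AutoImageProcessor / AutoTokenizer, and language_config.vocab_size can be
        # bumped after the <IMAGE> token is added.
        self.vision_config = AutoConfig.from_pretrained(vision_model)
        self.language_config = AutoConfig.from_pretrained(language_model)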
|
+def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:50]"):
     dataset = load_dataset("HuggingFaceM4/TextVQA", split=split)

     IMAGE_TOKEN = "<IMAGE>"
+    TEXT_MAX_LENGTH = 128
+    NUM_IMAGE_PATCHES = (image_processor.size['height'] // image_processor.patch_size) ** 2
+    FINAL_MAX_LENGTH = TEXT_MAX_LENGTH - 1 + NUM_IMAGE_PATCHES
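The length bookkeeping above: the single <IMAGE> placeholder is dropped (the -1) and replaced by one position per vision patch. With a 224x224 input and 16x16 patches (illustrative numbers, not fixed by this diff), the arithmetic works out as:

# Illustrative numbers only (224x224 input and 16x16 patches are assumptions).
TEXT_MAX_LENGTH = 128
NUM_IMAGE_PATCHES = (224 // 16) ** 2                          # 14 * 14 = 196
FINAL_MAX_LENGTH = TEXT_MAX_LENGTH - 1 + NUM_IMAGE_PATCHES    # 128 - 1 + 196 = 323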
def preprocess_function(examples):
|
|
|
|
| 39 |
image = examples['image'].convert("RGB")
|
| 40 |
question = examples.get('question', '')
|
| 41 |
answer = examples['answers'][0] if examples.get('answers') else "unknown"
|
| 42 |
|
|
|
|
| 43 |
if stage == 1:
|
| 44 |
+
prompt = f"USER: {IMAGE_TOKEN}\nQuestion: {question}\nASSISTANT: {answer}"
|
| 45 |
elif stage == 2:
|
| 46 |
+
prompt = f"USER: {IMAGE_TOKEN}\nQuestion: {question} Think step-by-step.\nASSISTANT: I think the answer is {answer}."
|
| 47 |
+
else:
|
| 48 |
+
prompt = f"USER: {IMAGE_TOKEN}\n{question}\nASSISTANT: The final answer is: {answer}."
|
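For reference, the stage-1 template above renders one sample into a single supervised string; the question and answer here are made up, not taken from TextVQA:

IMAGE_TOKEN = "<IMAGE>"
question = "what is written on the bus?"   # made-up example
answer = "downtown"                        # made-up example
print(f"USER: {IMAGE_TOKEN}\nQuestion: {question}\nASSISTANT: {answer}")
# USER: <IMAGE>
# Question: what is written on the bus?
# ASSISTANT: downtown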

         full_text = prompt + tokenizer.eos_token

+        # Tokenize text part first, up to a max text length
+        tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
+        input_ids = torch.tensor(tokenized.input_ids)
+
+        # --- CRITICAL FIX: Build Labels and Attention Mask for the FINAL sequence length ---

+        # 1. Find the location of the image token placeholder
         try:
+            image_token_idx = torch.where(input_ids == model.image_token_id)[0][0].item()
+        except IndexError:  # If token was truncated out, skip this example
+            return None
+
+        # 2. Build the LABELS tensor
+        labels = input_ids.clone()
+        # Mask out the prompt part (everything before and including "ASSISTANT:")
+        assistant_marker = tokenizer("ASSISTANT:", add_special_tokens=False).input_ids
+        for i in range(len(labels) - len(assistant_marker) + 1):
+            if (labels[i:i+len(assistant_marker)] == torch.tensor(assistant_marker)).all():
+                labels[:i+len(assistant_marker)] = -100
+                break
+
+        # Expand labels to the final length, inserting padding for the image patches
+        pre_labels = labels[:image_token_idx]
+        post_labels = labels[image_token_idx+1:]
+        # The image part of the labels should be all -100 (we don't predict image patches)
+        image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
+
+        # Combine and pad/truncate to FINAL_MAX_LENGTH
+        final_labels = torch.cat([pre_labels, image_labels_pad, post_labels], dim=0)
+        final_labels = torch.nn.functional.pad(final_labels, (0, FINAL_MAX_LENGTH - len(final_labels)), value=-100)
+
+        # 3. Build the ATTENTION MASK in the same way
+        attention_mask = torch.ones_like(input_ids)
+        pre_mask = attention_mask[:image_token_idx]
+        post_mask = attention_mask[image_token_idx+1:]
+        image_mask = torch.ones(NUM_IMAGE_PATCHES, dtype=torch.long)
+
+        final_attention_mask = torch.cat([pre_mask, image_mask, post_mask], dim=0)
+        final_attention_mask = torch.nn.functional.pad(final_attention_mask, (0, FINAL_MAX_LENGTH - len(final_attention_mask)), value=0)
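The -100 value is the index that torch.nn.functional.cross_entropy ignores by default (and that Transformers models conventionally use), so, assuming CustomScratchVLM computes its loss in the usual way, the masked prompt tokens and the image-patch positions contribute nothing to the loss. A tiny self-contained check:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                 # 4 positions, toy vocab of 10
targets = torch.tensor([-100, -100, 3, 7])  # first two positions are masked out
masked = F.cross_entropy(logits, targets)   # ignore_index defaults to -100
manual = F.cross_entropy(logits[2:], targets[2:])
assert torch.allclose(masked, manual)       # only the unmasked positions count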

+        # 4. Process the image
         pixel_values = image_processor(image, return_tensors="pt").pixel_values

         return {
+            "pixel_values": pixel_values.squeeze(0),
+            "input_ids": input_ids,  # Keep original input_ids for placeholder finding
+            "attention_mask": final_attention_mask,
+            "labels": final_labels
         }

+    processed_dataset = dataset.map(preprocess_function, remove_columns=list(dataset.column_names))
+    return processed_dataset.filter(lambda x: x is not None)
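The returned input_ids stay at text length while attention_mask and labels are already built at FINAL_MAX_LENGTH; that only lines up if CustomScratchVLM.forward splices the projected patch embeddings in place of the single <IMAGE> token. Since custom_vlm.py is not in this diff, here is only a toy illustration of that assumed splice:

import torch

# Toy shapes; the real splice lives in custom_vlm.py, which this diff does not touch.
D, T, NUM_IMAGE_PATCHES, image_token_id = 8, 6, 4, 99
input_ids = torch.tensor([[1, 2, 99, 3, 4, 5]])        # one <IMAGE> placeholder (id 99)
text_embeds = torch.randn(1, T, D)                     # token embeddings
image_embeds = torch.randn(1, NUM_IMAGE_PATCHES, D)    # projected vision patches

idx = (input_ids[0] == image_token_id).nonzero()[0].item()
inputs_embeds = torch.cat(
    [text_embeds[:, :idx], image_embeds, text_embeds[:, idx + 1:]], dim=1
)
assert inputs_embeds.shape[1] == T - 1 + NUM_IMAGE_PATCHES  # matches the FINAL_MAX_LENGTH bookkeeping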
 def train_vlm_stage(stage, output_dir, resume_from=None):
     print(f"🚀 Starting VLM Stage {stage} Training FROM SCRATCH...")
     device = "cuda" if torch.cuda.is_available() else "cpu"

     vlm_config = VLMConfig()
     image_processor, tokenizer, model = get_processors_and_model(vlm_config)
     model.to(device)

+    tokenized_dataset = load_and_prepare_dataset(stage, image_processor, tokenizer, model)

     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
+        num_train_epochs=5,
         learning_rate=5e-5,
+        fp16=(device == "cuda" and not torch.cuda.is_bf16_supported()),  # at most one of fp16/bf16 may be True
+        bf16=(device == "cuda" and torch.cuda.is_bf16_supported()),
         save_strategy="epoch",
+        logging_steps=5, report_to="none", optim="adamw_torch",
         remove_unused_columns=False,
     )

+    trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, data_collator=DefaultDataCollator())
     trainer.train(resume_from_checkpoint=resume_from)

     model.save_pretrained(output_dir)
     image_processor.save_pretrained(output_dir)
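The diff does not show how train_vlm_stage() is invoked; a hypothetical entry point (the stage number and output directory are placeholders) would look like:

if __name__ == "__main__":
    # Placeholder arguments; the Space's actual launcher is not shown in this diff.
    train_vlm_stage(stage=1, output_dir="./vlm_stage1_scratch")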