Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 18

Commit

5ab1e4b

verified ·

1 Parent(s): c91df11

Update train_vlm.py

Browse files

Files changed (1) hide show

train_vlm.py +9 -17

train_vlm.py CHANGED Viewed

@@ -28,7 +28,11 @@ def get_processors_and_model(config):
     return image_processor, tokenizer, model
 def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:50]"):
-    dataset = load_dataset("HuggingFaceM4/TextVQA", split=split)
     IMAGE_TOKEN = "<IMAGE>"
     TEXT_MAX_LENGTH = 128
@@ -48,53 +52,41 @@ def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="tr
             prompt = f"USER: {IMAGE_TOKEN}\n{question}\nASSISTANT: The final answer is: {answer}."
         full_text = prompt + tokenizer.eos_token
-        # Tokenize text part first, up to a max text length
         tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
         input_ids = torch.tensor(tokenized.input_ids)
-        # --- CRITICAL FIX: Build Labels and Attention Mask for the FINAL sequence length ---
-        # 1. Find the location of the image token placeholder
         try:
             image_token_idx = torch.where(input_ids == model.image_token_id)[0][0].item()
-        except IndexError: # If token was truncated out, skip this example
             return None
-        # 2. Build the LABELS tensor
         labels = input_ids.clone()
-        # Mask out the prompt part (everything before and including "ASSISTANT:")
         assistant_marker = tokenizer("ASSISTANT:", add_special_tokens=False).input_ids
         for i in range(len(labels) - len(assistant_marker) + 1):
             if (labels[i:i+len(assistant_marker)] == torch.tensor(assistant_marker)).all():
                 labels[:i+len(assistant_marker)] = -100
                 break
-        # Expand labels to the final length, inserting padding for the image patches
         pre_labels = labels[:image_token_idx]
         post_labels = labels[image_token_idx+1:]
-        # The image part of the labels should be all -100 (we don't predict image patches)
         image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
-        # Combine and pad/truncate to FINAL_MAX_LENGTH
         final_labels = torch.cat([pre_labels, image_labels_pad, post_labels], dim=0)
-        final_labels = torch.nn.functional.pad(final_labels, (0, FINAL_MAX_LENGTH - len(final_labels)), value=-100)
-        # 3. Build the ATTENTION MASK in the same way
         attention_mask = torch.ones_like(input_ids)
         pre_mask = attention_mask[:image_token_idx]
         post_mask = attention_mask[image_token_idx+1:]
         image_mask = torch.ones(NUM_IMAGE_PATCHES, dtype=torch.long)
         final_attention_mask = torch.cat([pre_mask, image_mask, post_mask], dim=0)
-        final_attention_mask = torch.nn.functional.pad(final_attention_mask, (0, FINAL_MAX_LENGTH - len(final_attention_mask)), value=0)
-        # 4. Process the image
         pixel_values = image_processor(image, return_tensors="pt").pixel_values
         return {
             "pixel_values": pixel_values.squeeze(0),
-            "input_ids": input_ids, # Keep original input_ids for placeholder finding
             "attention_mask": final_attention_mask,
             "labels": final_labels
         }

     return image_processor, tokenizer, model
 def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:50]"):
+    # --- THIS IS THE FIX ---
+    # Using the official facebook/textvqa dataset with the required trust_remote_code flag.
+    print(f"Attempting to load dataset 'facebook/textvqa' with trust_remote_code=True...")
+    dataset = load_dataset("facebook/textvqa", split=split, trust_remote_code=True)
+    print("Dataset loaded successfully.")
     IMAGE_TOKEN = "<IMAGE>"
     TEXT_MAX_LENGTH = 128
             prompt = f"USER: {IMAGE_TOKEN}\n{question}\nASSISTANT: The final answer is: {answer}."
         full_text = prompt + tokenizer.eos_token
         tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
         input_ids = torch.tensor(tokenized.input_ids)
         try:
             image_token_idx = torch.where(input_ids == model.image_token_id)[0][0].item()
+        except IndexError:
             return None
         labels = input_ids.clone()
         assistant_marker = tokenizer("ASSISTANT:", add_special_tokens=False).input_ids
         for i in range(len(labels) - len(assistant_marker) + 1):
             if (labels[i:i+len(assistant_marker)] == torch.tensor(assistant_marker)).all():
                 labels[:i+len(assistant_marker)] = -100
                 break
         pre_labels = labels[:image_token_idx]
         post_labels = labels[image_token_idx+1:]
         image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
         final_labels = torch.cat([pre_labels, image_labels_pad, post_labels], dim=0)
+        final_labels = torch.nn.functional.pad(final_labels, (0, FINAL_MAX_LENGTH - len(final_labels)), value=-100)[:FINAL_MAX_LENGTH]
         attention_mask = torch.ones_like(input_ids)
         pre_mask = attention_mask[:image_token_idx]
         post_mask = attention_mask[image_token_idx+1:]
         image_mask = torch.ones(NUM_IMAGE_PATCHES, dtype=torch.long)
         final_attention_mask = torch.cat([pre_mask, image_mask, post_mask], dim=0)
+        final_attention_mask = torch.nn.functional.pad(final_attention_mask, (0, FINAL_MAX_LENGTH - len(final_attention_mask)), value=0)[:FINAL_MAX_LENGTH]
         pixel_values = image_processor(image, return_tensors="pt").pixel_values
         return {
             "pixel_values": pixel_values.squeeze(0),
+            "input_ids": input_ids,
             "attention_mask": final_attention_mask,
             "labels": final_labels
         }