Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Community

Keeby-smilyai committed on Sep 18

Commit

589d16b

verified ·

1 Parent(s): 5ab1e4b

Update train_vlm.py

Browse files

Files changed (1) hide show

train_vlm.py +42 -24

train_vlm.py CHANGED Viewed

@@ -27,31 +27,39 @@ def get_processors_and_model(config):
     return image_processor, tokenizer, model
-def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:50]"):
-    # --- THIS IS THE FIX ---
-    # Using the official facebook/textvqa dataset with the required trust_remote_code flag.
-    print(f"Attempting to load dataset 'facebook/textvqa' with trust_remote_code=True...")
-    dataset = load_dataset("facebook/textvqa", split=split, trust_remote_code=True)
     print("Dataset loaded successfully.")
     IMAGE_TOKEN = "<IMAGE>"
-    TEXT_MAX_LENGTH = 128
     NUM_IMAGE_PATCHES = (image_processor.size['height'] // image_processor.patch_size) ** 2
     FINAL_MAX_LENGTH = TEXT_MAX_LENGTH - 1 + NUM_IMAGE_PATCHES
     def preprocess_function(examples):
         image = examples['image'].convert("RGB")
-        question = examples.get('question', '')
-        answer = examples['answers'][0] if examples.get('answers') else "unknown"
-        if stage == 1:
-            prompt = f"USER: {IMAGE_TOKEN}\nQuestion: {question}\nASSISTANT: {answer}"
-        elif stage == 2:
-            prompt = f"USER: {IMAGE_TOKEN}\nQuestion: {question} Think step-by-step.\nASSISTANT: I think the answer is {answer}."
-        else:
-            prompt = f"USER: {IMAGE_TOKEN}\n{question}\nASSISTANT: The final answer is: {answer}."
-        full_text = prompt + tokenizer.eos_token
         tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
         input_ids = torch.tensor(tokenized.input_ids)
@@ -61,12 +69,21 @@ def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="tr
             return None
         labels = input_ids.clone()
-        assistant_marker = tokenizer("ASSISTANT:", add_special_tokens=False).input_ids
-        for i in range(len(labels) - len(assistant_marker) + 1):
-            if (labels[i:i+len(assistant_marker)] == torch.tensor(assistant_marker)).all():
-                labels[:i+len(assistant_marker)] = -100
-                break
         pre_labels = labels[:image_token_idx]
         post_labels = labels[image_token_idx+1:]
         image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
@@ -95,20 +112,21 @@ def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="tr
     return processed_dataset.filter(lambda x: x is not None)
 def train_vlm_stage(stage, output_dir, resume_from=None):
-    print(f"🚀 Starting VLM Stage {stage} Training FROM SCRATCH...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     vlm_config = VLMConfig()
     image_processor, tokenizer, model = get_processors_and_model(vlm_config)
     model.to(device)
-    tokenized_dataset = load_and_prepare_dataset(stage, image_processor, tokenizer, model)
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
-        num_train_epochs=5,
         learning_rate=5e-5,
         fp16=(device == "cuda"),
         bf16=(device == "cuda" and torch.cuda.is_bf16_supported()),

     return image_processor, tokenizer, model
+def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:200]"):
+    # --- USING THE DATASET YOU SPECIFIED ---
+    print("Loading dataset 'zera09/lmarena-ai_VisionArena-Chat-en'...")
+    dataset = load_dataset("zera09/lmarena-ai_VisionArena-Chat-en", split=split)
     print("Dataset loaded successfully.")
     IMAGE_TOKEN = "<IMAGE>"
+    TEXT_MAX_LENGTH = 256
     NUM_IMAGE_PATCHES = (image_processor.size['height'] // image_processor.patch_size) ** 2
     FINAL_MAX_LENGTH = TEXT_MAX_LENGTH - 1 + NUM_IMAGE_PATCHES
     def preprocess_function(examples):
         image = examples['image'].convert("RGB")
+        # --- USING THE CONVERSATION FORMAT YOU PROVIDED ---
+        # We select 'conversation_a' and parse it as a list of lists of dicts.
+        conversation = examples['conversation_a']
+        full_text = ""
+        is_first_user_turn = True
+        for turn_list in conversation:
+            if not turn_list: continue
+            turn = turn_list[0]
+            role = turn['role'].upper()
+            content = turn['content']
+            if role == "USER" and is_first_user_turn:
+                full_text += f"USER: {IMAGE_TOKEN}\n{content}\n"
+                is_first_user_turn = False
+            else:
+                full_text += f"{role}: {content}\n"
+        full_text += tokenizer.eos_token
         tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
         input_ids = torch.tensor(tokenized.input_ids)
             return None
         labels = input_ids.clone()
+        assistant_marker_ids = tokenizer("ASSISTANT:", add_special_tokens=False).input_ids
+        is_assistant_section = torch.zeros_like(labels, dtype=torch.bool)
+        for i in range(len(labels) - len(assistant_marker_ids) + 1):
+            if (labels[i:i+len(assistant_marker_ids)] == torch.tensor(assistant_marker_ids)).all():
+                end_idx = len(labels)
+                user_marker_ids = tokenizer("USER:", add_special_tokens=False).input_ids
+                for j in range(i + 1, len(labels) - len(user_marker_ids) + 1):
+                    if (labels[j:j+len(user_marker_ids)] == torch.tensor(user_marker_ids)).all():
+                        end_idx = j
+                        break
+                is_assistant_section[i:end_idx] = True
+        labels[~is_assistant_section] = -100
         pre_labels = labels[:image_token_idx]
         post_labels = labels[image_token_idx+1:]
         image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
     return processed_dataset.filter(lambda x: x is not None)
 def train_vlm_stage(stage, output_dir, resume_from=None):
+    print(f"🚀 Starting VLM Conversational Training Stage {stage} FROM SCRATCH...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     vlm_config = VLMConfig()
     image_processor, tokenizer, model = get_processors_and_model(vlm_config)
     model.to(device)
+    split = f"train[{200*(stage-1)}:{200*stage}]"
+    tokenized_dataset = load_and_prepare_dataset(stage, image_processor, tokenizer, model, split=split)
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=8,
+        num_train_epochs=3,
         learning_rate=5e-5,
         fp16=(device == "cuda"),
         bf16=(device == "cuda" and torch.cuda.is_bf16_supported()),