Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 18

Commit

b798184

verified ·

1 Parent(s): e5aed01

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -23

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
-# app.py — FIXED: Handles remote code trust and logical error on failure
 import gradio as gr
 import os
 import time
-from train_vlm import train_vlm_stage  # Assuming this file exists and works
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 import torch
@@ -28,14 +28,15 @@ def load_model_for_stage(stage):
     current_stage = stage
     ckpt_path = f"{CHECKPOINT_ROOT}/stage_{stage}"
-    # ✅ FIX 1: Added trust_remote_code=True to all .from_pretrained calls
     if os.path.exists(ckpt_path) and os.path.exists(os.path.join(ckpt_path, "adapter_model.safetensors")):
         print(f"✅ Loading checkpoint: Stage {stage}")
         del model
-        torch.cuda.empty_cache()
         model = LlavaForConditionalGeneration.from_pretrained(
             ckpt_path,
-            torch_dtype=torch.float16,
             trust_remote_code=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=True)
@@ -43,7 +44,7 @@ def load_model_for_stage(stage):
         print(f"⚠️ No checkpoint for Stage {stage} — loading base model")
         model = LlavaForConditionalGeneration.from_pretrained(
             MODEL_NAME,
-            torch_dtype=torch.float16,
             trust_remote_code=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
@@ -51,11 +52,11 @@ def load_model_for_stage(stage):
 def chat_with_image(image, text, chat_history):
     """Handles the user's chat interaction."""
     if model is None or processor is None:
-        return "", chat_history.append({"role": "assistant", "content": "Model is not loaded yet. Please wait for training to start."})
     try:
         conversation = [{"role": "user", "content": f"<image>\n{text}"}]
-        prompt = processor.apply_chat_template(conversation, tokenize=False)
         inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
         output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
@@ -70,12 +71,9 @@ def chat_with_image(image, text, chat_history):
         return "", chat_history
 def run_autonomous_training_and_update_ui():
-    """
-    Generator function that runs the training pipeline and yields status updates.
-    """
     yield "🚀 Initializing COCONUT-VLM Autonomous Trainer..."
-    # ✅ FIX 2: Added a flag to track if training failed
     training_failed = False
     for stage in [1, 2, 3]:
@@ -94,7 +92,6 @@ def run_autonomous_training_and_update_ui():
         yield status_message
         try:
-            # IMPORTANT: Make sure train_vlm_stage also uses trust_remote_code=True
             train_vlm_stage(stage, MODEL_NAME, ckpt_path)
             status_message = f"✅ Stage {stage} completed! Loading new model..."
@@ -112,16 +109,15 @@ def run_autonomous_training_and_update_ui():
             status_message = f"❌ Stage {stage} failed: {e}"
             print(status_message)
             yield status_message
-            training_failed = True # Set the flag to True on failure
-            break # Stop the entire pipeline
-    # ✅ FIX 2: Only show the completion message if the loop finished without failing
     if not training_failed:
         final_message = "🎉 COCONUT-VLM Training Complete — All 3 Stages Finished!"
         print(final_message)
         yield final_message
-# --- Gradio UI (No changes needed here) ---
 with gr.Blocks(title="🥥 COCONUT-VLM Autonomous Trainer") as demo:
     gr.Markdown("# 🥥 COCONUT-VLM: Autonomous Vision-Language Trainer")
     gr.Markdown("Model is training itself in 3 stages automatically. **You can only chat.** Training is backend-only.")
@@ -129,13 +125,10 @@ with gr.Blocks(title="🥥 COCONUT-VLM Autonomous Trainer") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             status = gr.Textbox(
-                label="Training Status",
-                value="Waiting to start...",
-                interactive=False,
-                show_label=False,
-                lines=3
             )
-            gr.Markdown("💡 _Training runs automatically on page load. No buttons needed._")
         with gr.Column(scale=2):
             image_input = gr.Image(type="pil", label="Upload Image")

+# app.py
 import gradio as gr
 import os
 import time
+from train_vlm import train_vlm_stage
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 import torch
     current_stage = stage
     ckpt_path = f"{CHECKPOINT_ROOT}/stage_{stage}"
     if os.path.exists(ckpt_path) and os.path.exists(os.path.join(ckpt_path, "adapter_model.safetensors")):
         print(f"✅ Loading checkpoint: Stage {stage}")
+        # Free up memory before loading the next model
         del model
+        if device == "cuda":
+            torch.cuda.empty_cache()
         model = LlavaForConditionalGeneration.from_pretrained(
             ckpt_path,
+            torch_dtype=torch.float16 if device == "cuda" else torch.bfloat16,
             trust_remote_code=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=True)
         print(f"⚠️ No checkpoint for Stage {stage} — loading base model")
         model = LlavaForConditionalGeneration.from_pretrained(
             MODEL_NAME,
+            torch_dtype=torch.float16 if device == "cuda" else torch.bfloat16,
             trust_remote_code=True
         ).to(device)
         processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
 def chat_with_image(image, text, chat_history):
     """Handles the user's chat interaction."""
     if model is None or processor is None:
+        return "", chat_history + [{"role": "assistant", "content": "Model is not loaded yet. Please wait for training to start."}]
     try:
         conversation = [{"role": "user", "content": f"<image>\n{text}"}]
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
         inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
         output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
         return "", chat_history
 def run_autonomous_training_and_update_ui():
+    """Generator function that runs the training pipeline and yields status updates."""
     yield "🚀 Initializing COCONUT-VLM Autonomous Trainer..."
     training_failed = False
     for stage in [1, 2, 3]:
         yield status_message
         try:
             train_vlm_stage(stage, MODEL_NAME, ckpt_path)
             status_message = f"✅ Stage {stage} completed! Loading new model..."
             status_message = f"❌ Stage {stage} failed: {e}"
             print(status_message)
             yield status_message
+            training_failed = True
+            break
     if not training_failed:
         final_message = "🎉 COCONUT-VLM Training Complete — All 3 Stages Finished!"
         print(final_message)
         yield final_message
+# --- Gradio UI ---
 with gr.Blocks(title="🥥 COCONUT-VLM Autonomous Trainer") as demo:
     gr.Markdown("# 🥥 COCONUT-VLM: Autonomous Vision-Language Trainer")
     gr.Markdown("Model is training itself in 3 stages automatically. **You can only chat.** Training is backend-only.")
     with gr.Row():
         with gr.Column(scale=1):
             status = gr.Textbox(
+                label="Training Status", value="Waiting to start...", interactive=False,
+                show_label=False, lines=3, max_lines=5
             )
+            gr.Markdown("💡 _Training runs automatically on page load._")
         with gr.Column(scale=2):
             image_input = gr.Image(type="pil", label="Upload Image")