Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Files Community

Keeby-smilyai committed on Sep 17

Commit

31126b4

verified ·

1 Parent(s): 5d7aeb6

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -56

app.py CHANGED Viewed

@@ -1,17 +1,15 @@
-# app.py
 import gradio as gr
 import threading
 import os
 from train_vlm import train_vlm_stage
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 import torch
 # --- Config ---
-MODEL_NAME = "bczhou/TinyLLaVA-3.1B"  # or "" for faster training
-HF_USERNAME = "Smilyai-labs-research"
-YOUR_SPACE_REPO = "Smilyai-labs-research/VISION-LLM-COT"
 CHECKPOINT_ROOT = "./checkpoints"
 os.makedirs(CHECKPOINT_ROOT, exist_ok=True)
 # --- Global state ---
@@ -19,16 +17,17 @@ current_stage = 1
 model = None
 processor = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 def load_model_for_stage(stage):
     global model, processor
     ckpt_path = f"{CHECKPOINT_ROOT}/stage_{stage}"
     if os.path.exists(ckpt_path):
-        print(f"Loading checkpoint from {ckpt_path}")
         model = LlavaForConditionalGeneration.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
         processor = AutoProcessor.from_pretrained(ckpt_path)
     else:
-        print(f"No checkpoint for stage {stage}, loading base model")
         model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(device)
         processor = AutoProcessor.from_pretrained(MODEL_NAME)
@@ -36,64 +35,83 @@ def chat_with_image(image, text, chat_history):
     if model is None or processor is None:
         load_model_for_stage(current_stage)
-    conversation = [
-        {"role": "user", "content": f"<image>\n{text}"},
-    ]
-    prompt = processor.apply_chat_template(conversation, tokenize=False)
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
-    output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
-    response = processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    chat_history.append((text, response))
-    return "", chat_history
-def start_training(stage):
-    global current_stage
-    current_stage = stage
-    thread = threading.Thread(target=train_vlm_stage, args=(stage, MODEL_NAME, f"{CHECKPOINT_ROOT}/stage_{stage}"))
-    thread.start()
-    return f"▶️ Training started for Stage {stage}. Check logs."
-def switch_stage(stage):
-    global current_stage
-    current_stage = stage
-    load_model_for_stage(stage)
-    return f"✅ Switched to Stage {stage}. Model reloaded."
-# --- Gradio UI ---
-with gr.Blocks(title="🥥 VLM COCONUT Trainer") as demo:
-    gr.Markdown("# 🥥 Vision-Language COCONUT CoT Trainer (Real Training!)")
-    gr.Markdown("Train a VLM in 3 stages. Chat with the latest stage.")
     with gr.Row():
-        with gr.Column():
-            stage_btn1 = gr.Button("Stage 1: Plain CoT", variant="primary")
-            stage_btn2 = gr.Button("Stage 2: Masked Thought")
-            stage_btn3 = gr.Button("Stage 3: COCONUT Mode")
-            status = gr.Textbox(label="Status", interactive=False)
-        with gr.Column():
             image_input = gr.Image(type="pil", label="Upload Image")
             chatbot = gr.Chatbot(height=400)
             msg = gr.Textbox(label="Ask a question about the image")
             clear = gr.Button("Clear Chat")
-    # Event bindings
-    stage_btn1.click(lambda: switch_stage(1), None, status)
-    stage_btn2.click(lambda: switch_stage(2), None, status)
-    stage_btn3.click(lambda: switch_stage(3), None, status)
     msg.submit(chat_with_image, [image_input, msg, chatbot], [msg, chatbot])
     clear.click(lambda: None, None, chatbot, queue=False)
-    gr.Markdown("## ⚙️ Start Training (Uses your GPU Grant!)")
-    train_btn1 = gr.Button("▶️ Train Stage 1")
-    train_btn2 = gr.Button("▶️ Train Stage 2")
-    train_btn3 = gr.Button("▶️ Train Stage 3")
-    train_btn1.click(lambda: start_training(1), None, status)
-    train_btn2.click(lambda: start_training(2), None, status)
-    train_btn3.click(lambda: start_training(3), None, status)
-demo.queue(max_size=10).launch()

+# app.py — Fully autonomous 3-stage VLM trainer. UI is chat-only.
 import gradio as gr
 import threading
 import os
+import time
 from train_vlm import train_vlm_stage
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 import torch
 # --- Config ---
+MODEL_NAME = "bczhou/TinyLLaVA-3.1B"  # or "llava-hf/llava-1.5-7b-hf"
 CHECKPOINT_ROOT = "./checkpoints"
 os.makedirs(CHECKPOINT_ROOT, exist_ok=True)
 # --- Global state ---
 model = None
 processor = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
+training_status = "🚀 Initializing COCONUT-VLM Autonomous Trainer..."
 def load_model_for_stage(stage):
+    """Load the model and processor for ``stage`` into the module globals.
+
+    Looks for a checkpoint directory at ``{CHECKPOINT_ROOT}/stage_{stage}``.
+    If it exists, both the model and the processor are loaded from it;
+    otherwise both are loaded from the base ``MODEL_NAME``. Either way the
+    model is loaded with ``torch.float16`` weights and moved to ``device``.
+
+    Args:
+        stage: Stage number used to build the checkpoint path.
+    """
     global model, processor
     ckpt_path = f"{CHECKPOINT_ROOT}/stage_{stage}"
     if os.path.exists(ckpt_path):
+        # A checkpoint directory exists for this stage: load model and processor from it.
+        print(f"✅ Loading checkpoint: Stage {stage}")
         model = LlavaForConditionalGeneration.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
         processor = AutoProcessor.from_pretrained(ckpt_path)
     else:
+        # No checkpoint directory yet: fall back to the base model named in the config.
+        print(f"⚠️ No checkpoint for Stage {stage} — loading base model")
         model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(device)
         processor = AutoProcessor.from_pretrained(MODEL_NAME)
     if model is None or processor is None:
         load_model_for_stage(current_stage)
+    try:
+        conversation = [
+            {"role": "user", "content": f"<image>\n{text}"},
+        ]
+        prompt = processor.apply_chat_template(conversation, tokenize=False)
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+        output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
+        response = processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        chat_history.append((text, response))
+        return "", chat_history
+    except Exception as e:
+        chat_history.append((text, f"⚠️ Error: {str(e)}"))
+        return "", chat_history
+# --- Autonomous Training Pipeline ---
+def auto_train_pipeline():
+    """Train stages 1, 2 and 3 in order, loading each stage's model after it trains.
+
+    Progress is published through the module-level ``training_status`` string
+    and echoed to stdout. ``current_stage`` is updated as each stage begins.
+
+    The pipeline stops at the first stage that raises. In that case the
+    failure message is left in ``training_status``; the completion message is
+    only written when all three stages finished without an exception.
+    """
+    global current_stage, training_status
+    for stage in [1, 2, 3]:
+        current_stage = stage
+        training_status = f"▶️ AUTO-TRAINING STARTED: Stage {stage}"
+        print(training_status)
+        try:
+            # Train stage
+            train_vlm_stage(stage, MODEL_NAME, f"{CHECKPOINT_ROOT}/stage_{stage}")
+            # Update status
+            training_status = f"✅ Stage {stage} completed! Loading model..."
+            print(training_status)
+            # Load newly trained model
+            load_model_for_stage(stage)
+            # Brief pause before next stage
+            if stage < 3:
+                training_status = f"⏳ Advancing to Stage {stage + 1} in 5 seconds..."
+                print(training_status)
+                time.sleep(5)
+        except Exception as e:
+            training_status = f"❌ Stage {stage} failed: {str(e)}"
+            print(training_status)
+            break  # Stop pipeline on failure
+    else:
+        # for/else: this branch runs only when the loop was not left via `break`,
+        # so a failed stage's status is no longer overwritten with "Complete".
+        training_status = "🎉 COCONUT-VLM Training Complete — All 3 Stages Finished!"
+        print(training_status)
+# --- Launch training on app start ---
+# Handle of the single background training thread (None until first started),
+# and the lock that makes the start-once check atomic across concurrent calls.
+_training_thread = None
+_training_thread_lock = threading.Lock()
+def initialize_autonomous_trainer():
+    """Start the background training pipeline, at most once per process.
+
+    This function is registered as a page-load handler, so it can be called
+    many times (every visitor and every refresh). Only the first call starts
+    the daemon thread running ``auto_train_pipeline``; later calls return
+    immediately so the stages are never trained concurrently or re-run.
+    """
+    global _training_thread
+    with _training_thread_lock:
+        if _training_thread is not None:
+            # Already started (running, finished or failed): do not start again.
+            return
+        _training_thread = threading.Thread(target=auto_train_pipeline, daemon=True)
+        _training_thread.start()
+# --- Gradio UI (Chat-Only) ---
+# Build the chat-only interface: a read-only status box next to the image/chat widgets.
+with gr.Blocks(title="🥥 COCONUT-VLM Autonomous Trainer") as demo:
+    gr.Markdown("# 🥥 COCONUT-VLM: Autonomous Vision-Language Trainer")
+    gr.Markdown("Model is training itself in 3 stages automatically. **You can only chat.** Training is backend-only.")
     with gr.Row():
+        # Narrow column: non-editable textbox that displays the training status.
+        with gr.Column(scale=1):
+            status = gr.Textbox(label="Training Status", value="Initializing...", interactive=False)
+            gr.Markdown("💡 _Training runs automatically in background. No buttons. No switching._")
+        # Wide column: image upload, chat history, question box and clear button.
+        with gr.Column(scale=2):
             image_input = gr.Image(type="pil", label="Upload Image")
             chatbot = gr.Chatbot(height=400)
             msg = gr.Textbox(label="Ask a question about the image")
             clear = gr.Button("Clear Chat")
+    # Submitting the question calls chat_with_image; its two outputs go to the
+    # question box and the chat history.
     msg.submit(chat_with_image, [image_input, msg, chatbot], [msg, chatbot])
+    # The clear button sends None to the chatbot component, bypassing the queue.
     clear.click(lambda: None, None, chatbot, queue=False)
+    # Start autonomous training from the page-load event.
+    # NOTE(review): Gradio's load event presumably fires on every browser page
+    # load, not once per process — confirm the handler is safe to call repeatedly.
+    demo.load(initialize_autonomous_trainer, inputs=None, outputs=None)
+    # Poll training status every 3 seconds
+    demo.load(lambda: training_status, every=3, outputs=status)
+# Enable the request queue (at most 20 waiting requests) and start the app.
+demo.queue(max_size=20).launch()