Spaces:

Smilyai-labs
/

VISION-LLM-COT

Sleeping

App Files Community

Keeby-smilyai committed on Sep 17

Commit

f746dfb

verified ·

1 Parent(s): 622c2dd

Create app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# app.py
+import gradio as gr
+import threading
+import os
+from train_vlm import train_vlm_stage
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+import torch
+# --- Config ---
+MODEL_NAME = "bczhou/TinyLLaVA-3.1B"  # or "" for faster training
+HF_USERNAME = "Smilyai-labs-research"
+YOUR_SPACE_REPO = "Smilyai-labs-research/VISION-LLM-COT"
+CHECKPOINT_ROOT = "./checkpoints"
+os.makedirs(CHECKPOINT_ROOT, exist_ok=True)
+# --- Global state ---
+current_stage = 1
+model = None
+processor = None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def load_model_for_stage(stage):
+    global model, processor
+    ckpt_path = f"{CHECKPOINT_ROOT}/stage_{stage}"
+    if os.path.exists(ckpt_path):
+        print(f"Loading checkpoint from {ckpt_path}")
+        model = LlavaForConditionalGeneration.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
+        processor = AutoProcessor.from_pretrained(ckpt_path)
+    else:
+        print(f"No checkpoint for stage {stage}, loading base model")
+        model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to(device)
+        processor = AutoProcessor.from_pretrained(MODEL_NAME)
+def chat_with_image(image, text, chat_history):
+    if model is None or processor is None:
+        load_model_for_stage(current_stage)
+    conversation = [
+        {"role": "user", "content": f"<image>\n{text}"},
+    ]
+    prompt = processor.apply_chat_template(conversation, tokenize=False)
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+    output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
+    response = processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    chat_history.append((text, response))
+    return "", chat_history
+def start_training(stage):
+    global current_stage
+    current_stage = stage
+    thread = threading.Thread(target=train_vlm_stage, args=(stage, MODEL_NAME, f"{CHECKPOINT_ROOT}/stage_{stage}"))
+    thread.start()
+    return f"▶️ Training started for Stage {stage}. Check logs."
+def switch_stage(stage):
+    global current_stage
+    current_stage = stage
+    load_model_for_stage(stage)
+    return f"✅ Switched to Stage {stage}. Model reloaded."
+# --- Gradio UI ---
+with gr.Blocks(title="🥥 VLM COCONUT Trainer") as demo:
+    gr.Markdown("# 🥥 Vision-Language COCONUT CoT Trainer (Real Training!)")
+    gr.Markdown("Train a VLM in 3 stages. Chat with the latest stage.")
+    with gr.Row():
+        with gr.Column():
+            stage_btn1 = gr.Button("Stage 1: Plain CoT", variant="primary")
+            stage_btn2 = gr.Button("Stage 2: Masked Thought")
+            stage_btn3 = gr.Button("Stage 3: COCONUT Mode")
+            status = gr.Textbox(label="Status", interactive=False)
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Upload Image")
+            chatbot = gr.Chatbot(height=400)
+            msg = gr.Textbox(label="Ask a question about the image")
+            clear = gr.Button("Clear Chat")
+    # Event bindings
+    stage_btn1.click(lambda: switch_stage(1), None, status)
+    stage_btn2.click(lambda: switch_stage(2), None, status)
+    stage_btn3.click(lambda: switch_stage(3), None, status)
+    msg.submit(chat_with_image, [image_input, msg, chatbot], [msg, chatbot])
+    clear.click(lambda: None, None, chatbot, queue=False)
+    gr.Markdown("## ⚙️ Start Training (Uses your GPU Grant!)")
+    train_btn1 = gr.Button("▶️ Train Stage 1")
+    train_btn2 = gr.Button("▶️ Train Stage 2")
+    train_btn3 = gr.Button("▶️ Train Stage 3")
+    train_btn1.click(lambda: start_training(1), None, status)
+    train_btn2.click(lambda: start_training(2), None, status)
+    train_btn3.click(lambda: start_training(3), None, status)
+demo.queue(max_size=10).launch()