"""Gradio chat demo that streams replies from a causal LM via a background thread."""

import threading
import time

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# ---------------------
# Model Config
# ---------------------
MODEL_ID = "WeiboAI/VibeThinker-1.5B"
SYSTEM_PROMPT = "You are a concise solver. Respond briefly with the correct answer."

print(f"⏳ Loading {MODEL_ID} …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    dtype=torch.bfloat16,
    device_map="auto",
)
print("✅ Model ready.")


# ---------------------
# Chat Function
# ---------------------
@spaces.GPU(duration=60)
def chat_fn(message, history):
    """Stream the model's reply to ``message`` given prior ``history``.

    Args:
        message: The latest user prompt.
        history: Earlier turns as ``(user_msg, bot_msg)`` pairs; ``None`` or
            empty means a fresh conversation. Falsy entries in a pair are
            skipped.

    Yields:
        The reply accumulated so far, growing with each streamed chunk.

    Raises:
        gr.Error: If generation fails in the worker thread.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, bot_msg in history or []:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Decoding is greedy (do_sample=False). The previous temperature/top_p
    # arguments were ignored in that mode and only produced warnings, so they
    # are omitted; the generated text is unchanged.
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=200,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.15,
    )

    errors = []

    def _generate():
        # If generate() raises, the streamer never receives its stop signal
        # and the consumer loop below would block forever. Record the error
        # and end the stream explicitly so the caller can surface it.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller
            errors.append(exc)
            streamer.end()

    thread = threading.Thread(target=_generate)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    thread.join()
    if errors:
        raise gr.Error(f"Generation failed: {errors[0]}") from errors[0]


# ---------------------
# UI
# ---------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 💡 VibeThinker-1.5B · Edge/ZeroGPU (Streaming Stable)")
    chatbot = gr.Chatbot(label="Chatbot", height=500)
    msg_box = gr.Textbox(label="Textbox", placeholder="Type here…")
    send_btn = gr.Button("Send", variant="primary")

    def user_message(message, history):
        """Clear the textbox and append the user's turn with an empty reply slot.

        Blank or whitespace-only input is ignored: the history is returned
        unchanged so no empty prompt is sent to the model.
        """
        history = history or []
        if not message or not message.strip():
            return "", history
        return "", history + [[message, None]]

    def bot_response(history):
        """Fill the last turn's reply slot incrementally, yielding the history."""
        # Nothing to answer: no turns yet, or the last turn already has a
        # reply (e.g. a blank submission that user_message ignored).
        if not history or history[-1][1]:
            yield history
            return

        latest_prompt = history[-1][0]
        for partial in chat_fn(latest_prompt, history[:-1]):
            history[-1][1] = partial
            yield history

    # Enter in the textbox and the Send button trigger the same two-step
    # chain: record the user turn immediately, then stream the reply.
    for trigger in (msg_box.submit, send_btn.click):
        trigger(
            user_message, [msg_box, chatbot], [msg_box, chatbot], queue=False
        ).then(bot_response, chatbot, chatbot)


if __name__ == "__main__":
    demo.queue(max_size=16).launch()