Spaces: Running on Zero
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
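# `spaces` provides the @spaces.GPU decorator used by ZeroGPU Spaces to
# request a GPU on demand; it comes preinstalled on ZeroGPU hardware.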
# Model config
MODEL_ID = "WeiboAI/VibeThinker-1.5B"
SYSTEM_PROMPT = "You are a concise solver. Respond briefly."
# Load model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded!")
@spaces.GPU
def chat_with_stream(message, history, progress=gr.Progress()):
    """Chat handler (returns the full response at once, no token streaming)"""
    # Handle inputs safely
    if message is None:
        message = "Hello"
    if history is None:
        history = []
    # Convert to string
    message = str(message)
    progress(0.1, desc="Building conversation...")
    # Build messages
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    # Add history (assumed to be [user, assistant] pairs)
    for user_msg, assistant_msg in history:
        if user_msg is not None:
            messages.append({"role": "user", "content": str(user_msg)})
        if assistant_msg is not None:
            messages.append({"role": "assistant", "content": str(assistant_msg)})
    progress(0.3, desc="Adding your message...")
    messages.append({"role": "user", "content": message})
    progress(0.5, desc="Formatting input...")
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    progress(0.6, desc="Tokenizing...")
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
| progress(0.7, desc="Starting generation...") | |
| # Generate with streaming | |
| with torch.no_grad(): | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=1000, | |
| do_sample=True, | |
| temperature=0.7, | |
| top_p=0.9, | |
| pad_token_id=tokenizer.eos_token_id, | |
| return_dict_in_generate=True, | |
| output_scores=False, | |
| ) | |
| progress(0.9, desc="Decoding response...") | |
| # Decode | |
| full_text = tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| # Extract assistant response | |
| if "assistant" in full_text: | |
| response = full_text.split("assistant")[-1].strip() | |
| else: | |
| response = full_text | |
| progress(1.0, desc="Complete!") | |
| return response | |
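# gr.ChatInterface builds the chat UI and calls the function above with the
# latest user message plus the running conversation history.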
def create_demo():
    """Create simple demo"""
    demo = gr.ChatInterface(
        fn=chat_with_stream,
        title="VibeThinker Chat",
        description="Simple chat with VibeThinker-1.5B",
        examples=["2+2", "What is AI?", "Write a poem"]
    )
    return demo
| if __name__ == "__main__": | |
| print("Starting...") | |
| demo = create_demo() | |
| demo.launch(share=False) |
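Deployment note: this script assumes ZeroGPU is selected as the Space hardware and that requirements.txt provides torch, transformers, and accelerate (accelerate backs device_map="auto"); gradio and the spaces package are typically available by default on Gradio ZeroGPU Spaces.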