import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
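# Note: `spaces` is the Hugging Face ZeroGPU helper used to request a GPU per call, and
# `device_map="auto"` below relies on the `accelerate` package being installed
# (assumption: both are available in the Space's environment).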


class VibeThinker:
    def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
        self.model_path = model_path
        print("Loading model... This may take a minute.")
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            trust_remote_code=True
        )
        print("Model loaded successfully!")
        print(f"Using device: {self.model.device}")
        if torch.cuda.is_available():
            print(f"CUDA device: {torch.cuda.get_device_name(0)}")

    # This decorator allocates a GPU when the function is called (for ZeroGPU Spaces)
    @spaces.GPU
    def infer_text(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
| """ | |
| Generate response for a given prompt | |
| Args: | |
| prompt: The input question (preferably in English) | |
| temperature: Controls randomness (0.6 or 1.0 recommended) | |
| max_tokens: Maximum tokens to generate | |
| top_p: Nucleus sampling parameter | |
| """ | |
| messages = [ | |
| {"role": "user", "content": prompt} | |
| ] | |
| text = self.tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=False, | |
| add_generation_prompt=True | |
| ) | |
| model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) | |
| generation_config = dict( | |
| max_new_tokens=max_tokens, | |
| do_sample=True, | |
| temperature=temperature, | |
| top_p=top_p, | |
| top_k=None # Set to -1 in vLLM/SGLang | |
| ) | |
| print(f"Generating response with temperature={temperature}, max_tokens={max_tokens}...") | |
| generated_ids = self.model.generate( | |
| **model_inputs, | |
| generation_config=GenerationConfig(**generation_config) | |
| ) | |
| generated_ids = [ | |
| output_ids[len(input_ids):] | |
| for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) | |
| ] | |
| response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] | |
| return response | |


# Initialize model
print("Initializing VibeThinker-1.5B...")
model = VibeThinker()
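# Optional local sanity check (a sketch with a hypothetical prompt; kept commented out so
# the Space does not spend GPU time at import):
#   print(model.infer_text("What is 2 + 2?", max_tokens=256))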


# Create Gradio interface
def generate_response(prompt, temperature, max_tokens, top_p):
    if not prompt.strip():
        return "Please enter a question."
    try:
        response = model.infer_text(
            prompt=prompt,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p
        )
        return response
    except Exception as e:
        return f"Error: {str(e)}"


# Gradio UI
with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
    gr.Markdown("""
# 🧠 VibeThinker-1.5B: Reasoning Model

**Optimized for**: Competitive math problems and algorithm coding challenges

**Note**: This model works best with questions in English. It's specifically trained for
mathematical reasoning and competitive programming tasks.

### Example Prompts:
- "Solve: Find all solutions to x^3 - 3x^2 + 4 = 0"
- "Write a Python function to find the longest palindromic substring in O(n^2) time"
- "Prove that the sum of angles in a triangle equals 180 degrees"

[GitHub](https://github.com/WeiboAI/VibeThinker) | [HuggingFace Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
""")

    with gr.Row():
        with gr.Column(scale=1):
            prompt_input = gr.Textbox(
                label="Your Question",
                placeholder="Enter your math problem or coding challenge here (in English)...",
                lines=5
            )
            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.6,
                    step=0.1,
                    label="Temperature (0.6 or 1.0 recommended)"
                )
                max_tokens_slider = gr.Slider(
                    minimum=512,
                    maximum=40960,
                    value=8192,
                    step=512,
                    label="Max Tokens (model supports up to 40,960)"
                )
                top_p_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P"
                )
            submit_btn = gr.Button("🚀 Generate Solution", variant="primary")
            clear_btn = gr.Button("🗑️ Clear")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Model Response",
                lines=20,
                show_copy_button=True
            )

    # Example questions
    gr.Examples(
        examples=[
            ["Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 8192, 0.95],
            ["Write an efficient algorithm to solve the 0-1 knapsack problem using dynamic programming.", 0.6, 8192, 0.95],
            ["Prove that √2 is irrational using proof by contradiction.", 0.6, 8192, 0.95],
            ["A tank can be filled by pipe A in 3 hours and pipe B in 5 hours. If both pipes are opened together, how long will it take to fill the tank?", 0.6, 8192, 0.95],
        ],
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
        label="Example Problems"
    )

    # Event handlers
    submit_btn.click(
        fn=generate_response,
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
        outputs=output_text
    )
    clear_btn.click(
        fn=lambda: ("", ""),
        inputs=[],
        outputs=[prompt_input, output_text]
    )
| gr.Markdown(""" | |
| --- | |
| ### π Model Performance Highlights: | |
| - **AIME24**: 80.3 (vs DeepSeek R1: 79.8) | |
| - **AIME25**: 74.4 (vs DeepSeek R1: 70.0) | |
| - **LiveCodeBench v6**: 51.1 | |
| - **Parameters**: Only 1.5B (400x smaller than DeepSeek R1!) | |
| **Training Cost**: $7,800 USD | **License**: MIT | |
| """) | |


# Launch the app
if __name__ == "__main__":
    demo.queue()  # Enable queuing for better UX
    demo.launch()
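# A minimal requirements.txt for this Space might look like the following (a sketch based
# only on the imports above; exact versions/pins are an assumption):
#   gradio
#   torch
#   transformers
#   accelerate
#   spaces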