# VibeThinker / app.py: Gradio demo for WeiboAI/VibeThinker-1.5B
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
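# `spaces` (the Hugging Face Spaces SDK) provides the @spaces.GPU decorator used
# below to request GPU time on ZeroGPU hardware; outside a Space it has no effect.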
class VibeThinker:
def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
self.model_path = model_path
print("Loading model... This may take a minute.")
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
trust_remote_code=True
)
print(f"Model loaded successfully!")
print(f"Using device: {self.model.device}")
if torch.cuda.is_available():
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    @spaces.GPU  # Allocates a GPU when the function is called (for ZeroGPU Spaces)
def infer_text(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
"""
Generate response for a given prompt
Args:
prompt: The input question (preferably in English)
temperature: Controls randomness (0.6 or 1.0 recommended)
max_tokens: Maximum tokens to generate
top_p: Nucleus sampling parameter
"""
messages = [
{"role": "user", "content": prompt}
]
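        # Wrap the raw prompt in the model's chat template and append the
        # assistant-turn marker so generation starts at the model's reply.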
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
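        # Tokenize the templated prompt and move the tensors to the model's device.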
model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
generation_config = dict(
max_new_tokens=max_tokens,
do_sample=True,
temperature=temperature,
top_p=top_p,
            top_k=None  # None disables top-k filtering in transformers (set to -1 in vLLM/SGLang)
)
print(f"Generating response with temperature={temperature}, max_tokens={max_tokens}...")
generated_ids = self.model.generate(
**model_inputs,
generation_config=GenerationConfig(**generation_config)
)
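        # generate() returns prompt + completion; slice off the prompt tokens so
        # only the newly generated text is decoded.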
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
return response
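# Minimal standalone usage (a sketch; outside a ZeroGPU Space the @spaces.GPU
# decorator has no effect and inference runs wherever device_map placed the weights):
#     vt = VibeThinker()
#     print(vt.infer_text("What is 17 * 23?", temperature=0.6, max_tokens=512))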
# Initialize model
print("Initializing VibeThinker-1.5B...")
model = VibeThinker()
# Create Gradio interface
def generate_response(prompt, temperature, max_tokens, top_p):
if not prompt.strip():
return "Please enter a question."
try:
response = model.infer_text(
prompt=prompt,
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p
)
return response
except Exception as e:
return f"Error: {str(e)}"
# Gradio UI
with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
gr.Markdown("""
# 🧠 VibeThinker-1.5B: Reasoning Model
**Optimized for**: Competitive math problems and algorithm coding challenges
**Note**: This model works best with questions in English. It's specifically trained for
mathematical reasoning and competitive programming tasks.
### Example Prompts:
- "Solve: Find all solutions to x^3 - 3x^2 + 4 = 0"
- "Write a Python function to find the longest palindromic substring in O(n^2) time"
- "Prove that the sum of angles in a triangle equals 180 degrees"
[GitHub](https://github.com/WeiboAI/VibeThinker) | [HuggingFace Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
""")
with gr.Row():
with gr.Column(scale=1):
prompt_input = gr.Textbox(
label="Your Question",
placeholder="Enter your math problem or coding challenge here (in English)...",
lines=5
)
with gr.Accordion("Advanced Settings", open=False):
temperature_slider = gr.Slider(
minimum=0.1,
maximum=1.5,
value=0.6,
step=0.1,
label="Temperature (0.6 or 1.0 recommended)"
)
max_tokens_slider = gr.Slider(
minimum=512,
maximum=40960,
value=8192,
step=512,
label="Max Tokens (model supports up to 40,960)"
)
top_p_slider = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top P"
)
submit_btn = gr.Button("πŸš€ Generate Solution", variant="primary")
clear_btn = gr.Button("πŸ—‘οΈ Clear")
with gr.Column(scale=1):
output_text = gr.Textbox(
label="Model Response",
lines=20,
show_copy_button=True
)
# Example questions
gr.Examples(
examples=[
["Find the number of positive integers n ≀ 1000 such that n^2 + n + 41 is prime.", 0.6, 8192, 0.95],
["Write an efficient algorithm to solve the 0-1 knapsack problem using dynamic programming.", 0.6, 8192, 0.95],
["Prove that √2 is irrational using proof by contradiction.", 0.6, 8192, 0.95],
["A tank can be filled by pipe A in 3 hours and pipe B in 5 hours. If both pipes are opened together, how long will it take to fill the tank?", 0.6, 8192, 0.95],
],
inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
label="Example Problems"
)
# Event handlers
submit_btn.click(
fn=generate_response,
inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
outputs=output_text
)
clear_btn.click(
fn=lambda: ("", ""),
inputs=[],
outputs=[prompt_input, output_text]
)
gr.Markdown("""
---
### πŸ“Š Model Performance Highlights:
- **AIME24**: 80.3 (vs DeepSeek R1: 79.8)
- **AIME25**: 74.4 (vs DeepSeek R1: 70.0)
- **LiveCodeBench v6**: 51.1
- **Parameters**: Only 1.5B (400x smaller than DeepSeek R1!)
    **Training Cost**: $7,800 | **License**: MIT
""")
# Launch the app
if __name__ == "__main__":
demo.queue() # Enable queuing for better UX
demo.launch()