# VibeThinker / app.py: Gradio demo for WeiboAI/VibeThinker-1.5B
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
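# `spaces` (the Hugging Face Spaces SDK) provides the @spaces.GPU decorator used
# below to request GPU time on ZeroGPU hardware; outside a Space it has no effect.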
class VibeThinker:
def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
self.model_path = model_path
print("Loading model... This may take a minute.")
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path,
trust_remote_code=True
)
print(f"Model loaded successfully!")
print(f"Using device: {self.model.device}")
if torch.cuda.is_available():
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    @spaces.GPU  # Allocates a GPU when the function is called (for ZeroGPU Spaces)
def infer_text(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
"""
Generate response for a given prompt
Args:
prompt: The input question (preferably in English)
temperature: Controls randomness (0.6 or 1.0 recommended)
max_tokens: Maximum tokens to generate
top_p: Nucleus sampling parameter
"""
messages = [
{"role": "user", "content": prompt}
]
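        # Wrap the raw prompt in the model's chat template and append the
        # assistant-turn marker so generation starts at the model's reply.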
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
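        # Tokenize the templated prompt and move the tensors to the model's device.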
model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
generation_config = dict(
max_new_tokens=max_tokens,
do_sample=True,
temperature=temperature,
top_p=top_p,
            top_k=None  # None disables top-k filtering in transformers (set to -1 in vLLM/SGLang)
)
print(f"Generating response with temperature={temperature}, max_tokens={max_tokens}...")
generated_ids = self.model.generate(
**model_inputs,
generation_config=GenerationConfig(**generation_config)
)
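        # generate() returns prompt + completion; slice off the prompt tokens so
        # only the newly generated text is decoded.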
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
return response
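# Minimal standalone usage (a sketch; outside a ZeroGPU Space the @spaces.GPU
# decorator has no effect and inference runs wherever device_map placed the weights):
#     vt = VibeThinker()
#     print(vt.infer_text("What is 17 * 23?", temperature=0.6, max_tokens=512))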
# Initialize model
print("Initializing VibeThinker-1.5B...")
model = VibeThinker()
# Create Gradio interface
def generate_response(prompt, temperature, max_tokens, top_p):
if not prompt.strip():
return "Please enter a question."
try:
response = model.infer_text(
prompt=prompt,
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p
)
return response
except Exception as e:
return f"Error: {str(e)}"
# Gradio UI
with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
gr.Markdown("""
# 🧠 VibeThinker-1.5B: Reasoning Model
**Optimized for**: Competitive math problems and algorithm coding challenges
**Note**: This model works best with questions in English. It's specifically trained for
mathematical reasoning and competitive programming tasks.
### Example Prompts:
- "Solve: Find all solutions to x^3 - 3x^2 + 4 = 0"
- "Write a Python function to find the longest palindromic substring in O(n^2) time"
- "Prove that the sum of angles in a triangle equals 180 degrees"
[GitHub](https://github.com/WeiboAI/VibeThinker) | [HuggingFace Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
""")
with gr.Row():
with gr.Column(scale=1):
prompt_input = gr.Textbox(
label="Your Question",
placeholder="Enter your math problem or coding challenge here (in English)...",
lines=5
)
with gr.Accordion("Advanced Settings", open=False):
temperature_slider = gr.Slider(
minimum=0.1,
maximum=1.5,
value=0.6,
step=0.1,
label="Temperature (0.6 or 1.0 recommended)"
)
max_tokens_slider = gr.Slider(
minimum=512,
maximum=40960,
value=8192,
step=512,
label="Max Tokens (model supports up to 40,960)"
)
top_p_slider = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top P"
)
submit_btn = gr.Button("πŸš€ Generate Solution", variant="primary")
clear_btn = gr.Button("πŸ—‘οΈ Clear")
with gr.Column(scale=1):
output_text = gr.Textbox(
label="Model Response",
lines=20,
show_copy_button=True
)
# Example questions
gr.Examples(
examples=[
["Find the number of positive integers n ≀ 1000 such that n^2 + n + 41 is prime.", 0.6, 8192, 0.95],
["Write an efficient algorithm to solve the 0-1 knapsack problem using dynamic programming.", 0.6, 8192, 0.95],
["Prove that √2 is irrational using proof by contradiction.", 0.6, 8192, 0.95],
["A tank can be filled by pipe A in 3 hours and pipe B in 5 hours. If both pipes are opened together, how long will it take to fill the tank?", 0.6, 8192, 0.95],
],
inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
label="Example Problems"
)
# Event handlers
submit_btn.click(
fn=generate_response,
inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
outputs=output_text
)
clear_btn.click(
fn=lambda: ("", ""),
inputs=[],
outputs=[prompt_input, output_text]
)
gr.Markdown("""
---
### πŸ“Š Model Performance Highlights:
- **AIME24**: 80.3 (vs DeepSeek R1: 79.8)
- **AIME25**: 74.4 (vs DeepSeek R1: 70.0)
- **LiveCodeBench v6**: 51.1
- **Parameters**: Only 1.5B (400x smaller than DeepSeek R1!)
    **Training Cost**: $7,800 | **License**: MIT
""")
# Launch the app
if __name__ == "__main__":
demo.queue() # Enable queuing for better UX
demo.launch()