VladBoyko committed on
Commit e5b56c6 · verified · 1 Parent(s): 9f752eb

Update app.py

Updated to use vLLM and improved how the model's output is parsed

Files changed (1)
  1. app.py +314 -92
app.py CHANGED
@@ -1,114 +1,334 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+import re
+from vllm import LLM, SamplingParams
 import spaces
 
-class VibeThinker:
+class VibeThinkerVLLM:
     def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
         self.model_path = model_path
-        print("Loading model... This may take a minute.")
+        print("Loading model with vLLM... This may take a minute.")
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
+        self.model = LLM(
+            model=self.model_path,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            max_model_len=40960,  # Support full context length
             trust_remote_code=True
         )
 
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True
-        )
+        print(f"Model loaded successfully with vLLM!")
+
+    @spaces.GPU
+    def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
+        """Generate response with vLLM for faster inference"""
 
-        print(f"Model loaded successfully!")
-        print(f"Using device: {self.model.device}")
-        if torch.cuda.is_available():
-            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-
-    @spaces.GPU  # This decorator allocates GPU when function is called (for ZeroGPU spaces)
-    def infer_text(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
-        """
-        Generate response for a given prompt
-
-        Args:
-            prompt: The input question (preferably in English)
-            temperature: Controls randomness (0.6 or 1.0 recommended)
-            max_tokens: Maximum tokens to generate
-            top_p: Nucleus sampling parameter
-        """
         messages = [
             {"role": "user", "content": prompt}
         ]
 
-        text = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
-
-        generation_config = dict(
-            max_new_tokens=max_tokens,
-            do_sample=True,
+        sampling_params = SamplingParams(
             temperature=temperature,
+            max_tokens=max_tokens,
             top_p=top_p,
-            top_k=None  # Set to -1 in vLLM/SGLang
+            top_k=-1,  # Disable top_k sampling
         )
 
-        print(f"Generating response with temperature={temperature}, max_tokens={max_tokens}...")
+        print(f"Generating with vLLM (temp={temperature}, max_tokens={max_tokens})...")
 
-        generated_ids = self.model.generate(
-            **model_inputs,
-            generation_config=GenerationConfig(**generation_config)
-        )
-
-        generated_ids = [
-            output_ids[len(input_ids):]
-            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
+        outputs = self.model.chat(messages, sampling_params=sampling_params)
+        response = outputs[0].outputs[0].text
 
-        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        return response
+        return response
+
+
+def parse_model_output(text):
+    """
+    Parse model output into structured components:
+    - Thinking sections (within <think> tags)
+    - Regular text (chat messages)
+    - Code blocks (within ``` or <code> tags)
+    """
+
+    sections = []
+
+    # Split by <think> tags
+    think_pattern = r'<think>(.*?)</think>'
+    code_pattern = r'```(\w+)?\n(.*?)```'
+
+    # Extract thinking sections
+    think_matches = list(re.finditer(think_pattern, text, re.DOTALL))
+
+    # Track positions
+    last_pos = 0
+
+    for match in think_matches:
+        # Add text before thinking section
+        before_text = text[last_pos:match.start()].strip()
+        if before_text:
+            # Check for code blocks in this text
+            code_blocks = list(re.finditer(code_pattern, before_text, re.DOTALL))
+
+            if code_blocks:
+                # Process text with code blocks
+                text_pos = 0
+                for code_match in code_blocks:
+                    # Add text before code
+                    pre_code_text = before_text[text_pos:code_match.start()].strip()
+                    if pre_code_text:
+                        sections.append({
+                            'type': 'text',
+                            'content': pre_code_text
+                        })
+
+                    # Add code block
+                    language = code_match.group(1) or 'plaintext'
+                    code_content = code_match.group(2).strip()
+                    sections.append({
+                        'type': 'code',
+                        'language': language,
+                        'content': code_content
+                    })
+
+                    text_pos = code_match.end()
+
+                # Add remaining text after last code block
+                remaining_text = before_text[text_pos:].strip()
+                if remaining_text:
+                    sections.append({
+                        'type': 'text',
+                        'content': remaining_text
+                    })
+            else:
+                sections.append({
+                    'type': 'text',
+                    'content': before_text
+                })
+
+        # Add thinking section
+        think_content = match.group(1).strip()
+        sections.append({
+            'type': 'thinking',
+            'content': think_content
+        })
+
+        last_pos = match.end()
+
+    # Add remaining text after last thinking section
+    remaining = text[last_pos:].strip()
+    if remaining:
+        # Check for code blocks
+        code_blocks = list(re.finditer(code_pattern, remaining, re.DOTALL))
+
+        if code_blocks:
+            text_pos = 0
+            for code_match in code_blocks:
+                pre_code_text = remaining[text_pos:code_match.start()].strip()
+                if pre_code_text:
+                    sections.append({
+                        'type': 'text',
+                        'content': pre_code_text
+                    })
+
+                language = code_match.group(1) or 'plaintext'
+                code_content = code_match.group(2).strip()
+                sections.append({
+                    'type': 'code',
+                    'language': language,
+                    'content': code_content
+                })
+
+                text_pos = code_match.end()
+
+            remaining_text = remaining[text_pos:].strip()
+            if remaining_text:
+                sections.append({
+                    'type': 'text',
+                    'content': remaining_text
+                })
+        else:
+            sections.append({
+                'type': 'text',
+                'content': remaining
+            })
+
+    return sections
+
+
+def format_output_for_display(sections):
+    """
+    Format parsed sections into a rich HTML display with:
+    - Collapsible thinking sections
+    - Syntax-highlighted code blocks
+    - Clean text rendering
+    """
+
+    html_parts = []
+
+    for i, section in enumerate(sections):
+        if section['type'] == 'thinking':
+            # Collapsible thinking section
+            html_parts.append(f"""
+            <details class="thinking-section" style="margin: 15px 0; border: 2px solid #f39c12; border-radius: 8px; background-color: #fff9e6;">
+                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #d68910; user-select: none;">
+                    🤔 Thinking Process (Click to expand)
+                </summary>
+                <div style="padding: 15px; border-top: 1px solid #f39c12; background-color: #fffef7; white-space: pre-wrap; font-family: 'Courier New', monospace; font-size: 13px; color: #333; line-height: 1.6;">
+                    {section['content']}
+                </div>
+            </details>
+            """)
+
+        elif section['type'] == 'code':
+            # Code block with copy/download buttons
+            code_id = f"code-{i}"
+            html_parts.append(f"""
+            <details class="code-section" open style="margin: 15px 0; border: 2px solid #3498db; border-radius: 8px; background-color: #e8f4fd;">
+                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #2874a6; user-select: none;">
+                    💻 Code ({section['language']}) - Click to collapse
+                </summary>
+                <div style="position: relative; padding: 0;">
+                    <div style="position: absolute; top: 10px; right: 10px; z-index: 10;">
+                        <button onclick="copyCode('{code_id}')" style="padding: 6px 12px; margin-right: 5px; background-color: #3498db; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
+                            📋 Copy
+                        </button>
+                        <button onclick="downloadCode('{code_id}', '{section['language']}')" style="padding: 6px 12px; background-color: #27ae60; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
+                            ⬇️ Download
+                        </button>
+                    </div>
+                    <pre id="{code_id}" style="margin: 0; padding: 40px 15px 15px 15px; background-color: #f8f9fa; border-top: 1px solid #3498db; overflow-x: auto; font-family: 'Courier New', monospace; font-size: 13px; line-height: 1.5;"><code class="language-{section['language']}">{section['content']}</code></pre>
+                </div>
+            </details>
+            """)
+
+        else:  # text
+            # Regular text output
+            html_parts.append(f"""
+            <div class="text-section" style="margin: 15px 0; padding: 15px; border: 1px solid #bdc3c7; border-radius: 8px; background-color: #ffffff; white-space: pre-wrap; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; font-size: 14px; line-height: 1.8; color: #2c3e50;">
+                {section['content']}
+            </div>
+            """)
+
+    # Add JavaScript for copy and download functionality
+    js_code = """
+    <script>
+    function copyCode(elementId) {
+        const codeElement = document.getElementById(elementId);
+        const code = codeElement.textContent;
+        navigator.clipboard.writeText(code).then(() => {
+            alert('Code copied to clipboard!');
+        }).catch(err => {
+            console.error('Failed to copy:', err);
+        });
+    }
+
+    function downloadCode(elementId, language) {
+        const codeElement = document.getElementById(elementId);
+        const code = codeElement.textContent;
+
+        // Determine file extension
+        const extensions = {
+            'python': 'py',
+            'javascript': 'js',
+            'typescript': 'ts',
+            'html': 'html',
+            'css': 'css',
+            'java': 'java',
+            'cpp': 'cpp',
+            'c': 'c',
+            'ruby': 'rb',
+            'go': 'go',
+            'rust': 'rs',
+            'swift': 'swift',
+            'kotlin': 'kt',
+            'plaintext': 'txt'
+        };
+
+        const ext = extensions[language.toLowerCase()] || 'txt';
+        const filename = `code_snippet.${ext}`;
+
+        // Create blob and download
+        const blob = new Blob([code], { type: 'text/plain' });
+        const url = window.URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url;
+        a.download = filename;
+        document.body.appendChild(a);
+        a.click();
+        document.body.removeChild(a);
+        window.URL.revokeObjectURL(url);
+    }
+    </script>
+    """
+
+    return js_code + "\n".join(html_parts)
 
 
 # Initialize model
-print("Initializing VibeThinker-1.5B...")
-model = VibeThinker()
+print("Initializing VibeThinker-1.5B with vLLM...")
+model = VibeThinkerVLLM()
 
 # Create Gradio interface
 def generate_response(prompt, temperature, max_tokens, top_p):
     if not prompt.strip():
-        return "Please enter a question."
+        return "<p style='color: red;'>Please enter a question.</p>"
 
     try:
-        response = model.infer_text(
+        # Generate raw response
+        raw_response = model.infer_text(
             prompt=prompt,
             temperature=temperature,
             max_tokens=max_tokens,
             top_p=top_p
         )
-        return response
+
+        # Parse and format the response
+        sections = parse_model_output(raw_response)
+        formatted_html = format_output_for_display(sections)
+
+        return formatted_html
+
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"<p style='color: red;'><strong>Error:</strong> {str(e)}</p>"
 
 
+# Custom CSS for better styling
+custom_css = """
+.thinking-section summary:hover {
+    background-color: #fef5e7;
+}
+
+.code-section summary:hover {
+    background-color: #d6eaf8;
+}
+
+.text-section {
+    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+}
+
+details[open] summary {
+    border-bottom: 1px solid #ddd;
+    margin-bottom: 10px;
+}
+
+/* Syntax highlighting enhancements */
+code {
+    font-family: 'Fira Code', 'Courier New', monospace;
+}
+"""
+
 # Gradio UI
-with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
+with gr.Blocks(title="VibeThinker-1.5B Advanced", css=custom_css) as demo:
     gr.Markdown("""
-    # 🧠 VibeThinker-1.5B: Reasoning Model
+    # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
 
-    **Optimized for**: Competitive math problems and algorithm coding challenges
+    **Optimized with vLLM** for 10-20x faster inference! ⚡
 
-    **Note**: This model works best with questions in English. It's specifically trained for
-    mathematical reasoning and competitive programming tasks.
+    **Features**:
+    - 🤔 **Collapsible Thinking Sections**: See the model's reasoning process
+    - 💻 **Interactive Code Blocks**: Copy or download code snippets
+    - 📝 **Clean Text Display**: Easy-to-read formatted responses
 
-    ### Example Prompts:
-    - "Solve: Find all solutions to x^3 - 3x^2 + 4 = 0"
-    - "Write a Python function to find the longest palindromic substring in O(n^2) time"
-    - "Prove that the sum of angles in a triangle equals 180 degrees"
+    **Best for**: Competitive math problems and algorithm coding challenges
 
     [GitHub](https://github.com/WeiboAI/VibeThinker) | [HuggingFace Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
     """)
@@ -117,11 +337,11 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
     with gr.Column(scale=1):
         prompt_input = gr.Textbox(
             label="Your Question",
-            placeholder="Enter your math problem or coding challenge here (in English)...",
-            lines=5
+            placeholder="Ask a math problem or coding challenge (in English)...",
+            lines=6
         )
 
-        with gr.Accordion("Advanced Settings", open=False):
+        with gr.Accordion("⚙️ Advanced Settings", open=False):
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=1.5,
@@ -135,7 +355,7 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
                maximum=40960,
                value=8192,
                step=512,
-                label="Max Tokens (model supports up to 40,960)"
+                label="Max Tokens"
            )
 
            top_p_slider = gr.Slider(
@@ -146,53 +366,55 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
                label="Top P"
            )
 
-        submit_btn = gr.Button("🚀 Generate Solution", variant="primary")
-        clear_btn = gr.Button("🗑️ Clear")
+        submit_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
+        clear_btn = gr.Button("🗑️ Clear", size="sm")
 
     with gr.Column(scale=1):
-        output_text = gr.Textbox(
+        output_html = gr.HTML(
            label="Model Response",
-            lines=20,
-            show_copy_button=True
+            value="<p style='color: #7f8c8d; text-align: center; padding: 40px;'>Your response will appear here...</p>"
        )
 
    # Example questions
    gr.Examples(
        examples=[
-            ["Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 8192, 0.95],
-            ["Write an efficient algorithm to solve the 0-1 knapsack problem using dynamic programming.", 0.6, 8192, 0.95],
-            ["Prove that √2 is irrational using proof by contradiction.", 0.6, 8192, 0.95],
-            ["A tank can be filled by pipe A in 3 hours and pipe B in 5 hours. If both pipes are opened together, how long will it take to fill the tank?", 0.6, 8192, 0.95],
+            ["Make me a single page html application that takes a color and outputs a color theme based on that color", 0.6, 16384, 0.95],
+            ["Solve this AIME problem: Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 12288, 0.95],
+            ["Write a Python function to implement the Euclidean algorithm for finding GCD, then optimize it.", 0.6, 8192, 0.95],
+            ["Prove that the sum of the first n odd numbers equals n^2 using mathematical induction.", 0.6, 8192, 0.95],
        ],
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        label="Example Problems"
+        label="📚 Example Problems"
    )
 
    # Event handlers
    submit_btn.click(
        fn=generate_response,
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        outputs=output_text
+        outputs=output_html
    )
 
    clear_btn.click(
-        fn=lambda: ("", ""),
+        fn=lambda: ("", "<p style='color: #7f8c8d; text-align: center; padding: 40px;'>Your response will appear here...</p>"),
        inputs=[],
-        outputs=[prompt_input, output_text]
+        outputs=[prompt_input, output_html]
    )
 
    gr.Markdown("""
    ---
-    ### 📊 Model Performance Highlights:
-    - **AIME24**: 80.3 (vs DeepSeek R1: 79.8)
-    - **AIME25**: 74.4 (vs DeepSeek R1: 70.0)
-    - **LiveCodeBench v6**: 51.1
-    - **Parameters**: Only 1.5B (400x smaller than DeepSeek R1!)
+    ### 📊 Performance Comparison:
+
+    | Metric | VibeThinker-1.5B | DeepSeek R1 (671B) | Size Ratio |
+    |--------|------------------|--------------------|------------|
+    | AIME24 | **80.3** | 79.8 | **400× smaller** |
+    | AIME25 | **74.4** | 70.0 | **400× smaller** |
+    | HMMT25 | **50.4** | 41.7 | **400× smaller** |
+    | Training Cost | **$7,800** | $294,000+ | **40× cheaper** |
 
-    **Training Cost**: $7,800 USD | **License**: MIT
+    🚀 **Powered by vLLM** for ultra-fast inference on T4 GPUs
    """)
 
 # Launch the app
 if __name__ == "__main__":
-    demo.queue()  # Enable queuing for better UX
-    demo.launch()
+    demo.queue(max_size=20)
+    demo.launch(share=False)
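
The new `parse_model_output` helper can be smoke-tested without starting the Space. A minimal sketch, assuming the helper is copied into a standalone module (the `parsing` module name and the sample completion below are hypothetical; importing app.py directly would trigger the module-level model load):

# Minimal smoke test for parse_model_output (hypothetical setup: the helper is
# copied into a standalone parsing.py so importing it does not load the model).
from parsing import parse_model_output

# A made-up completion shaped like VibeThinker output: a <think> block,
# some prose, and a fenced code block.
sample = (
    "<think>Try small cases first.</think>\n"
    "The answer is n = 2.\n"
    "```python\nprint(2)\n```"
)

for section in parse_model_output(sample):
    # Expected section types, in order: 'thinking', 'text', 'code'
    print(section["type"], "->", section["content"])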