VladBoyko committed on
Commit bfb609d · verified · 1 Parent(s): a271ec8

Update app.py


switched from vLLM to Hugging Face Transformers
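For context when skimming the diff below: the commit replaces the vLLM offline engine with a plain transformers + torch pipeline. A minimal sketch of the two generation paths being swapped (model ID as in the diff; arguments trimmed, so treat this as an illustration rather than the exact app code):

# Before (removed): vLLM engine
# from vllm import LLM, SamplingParams
# llm = LLM(model="WeiboAI/VibeThinker-1.5B", dtype="float16", trust_remote_code=True)
# text = llm.generate([prompt], SamplingParams(temperature=0.6, max_tokens=16384))[0].outputs[0].text

# After (added): Hugging Face transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("WeiboAI/VibeThinker-1.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B", torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)
prompt = "Write a Python function that reverses a string."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=8192, do_sample=True, temperature=0.6)
text = tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)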

Files changed (1)
  1. app.py +101 -139
app.py CHANGED
@@ -1,106 +1,110 @@
  import gradio as gr
- import os
  import re
  import time
- from vllm import LLM, SamplingParams

- # Force XFormers backend for T4 compatibility
- os.environ['VLLM_ATTENTION_BACKEND'] = 'XFORMERS'
- os.environ['VLLM_USE_TRITON_FLASH_ATTN'] = '0'
-
- class VibeThinkerVLLM:
  def __init__(self):
  self.model = None
  self.load_model()

  def load_model(self):
- """Load VibeThinker model with vLLM (T4-compatible settings)"""
  try:
- self.model = LLM(
- model="WeiboAI/VibeThinker-1.5B",
- dtype="float16", # Use float16 instead of bfloat16 for T4
- gpu_memory_utilization=0.85,
- max_model_len=40960, # Full 40K context as per docs
- enforce_eager=True, # Disable CUDA graphs for T4
- disable_custom_all_reduce=True, # Avoid custom kernels
- enable_prefix_caching=False, # Disable for stability
- max_num_seqs=1, # Process one sequence at a time
  trust_remote_code=True
  )
- print("✅ vLLM model loaded successfully with T4-compatible settings (40K context)")
  except Exception as e:
  print(f"❌ Error loading model: {e}")
  raise

- def generate_response(self, prompt, temperature=0.6, max_tokens=16384, max_thinking_tokens=4096):
  """
- Generate response with thinking length control and loop detection

  Args:
  prompt: Input prompt
  temperature: Sampling temperature
- max_tokens: Total max tokens (thinking + output)
- max_thinking_tokens: Maximum tokens for reasoning phase
  """
- if not self.model:
  return "Model not loaded!", 0, 0, 0

  try:
  start_time = time.time()

- # Create sampling params with stop sequences to prevent loops
- sampling_params = SamplingParams(
- temperature=temperature,
- top_p=0.95,
- top_k=-1,
- max_tokens=max_tokens,
- # Stop sequences to prevent infinite loops
- stop=[
- "Wait, the problem says", # Common loop pattern
- "\n\n\n\n", # Multiple blank lines
- "###END###", # Custom stop token
- ],
- repetition_penalty=1.1, # Penalize repetition
- )
-
- # Format prompt clearly for competitive coding
  formatted_prompt = f"""<|im_start|>system
  You are a competitive programming expert. Provide clear, concise solutions to coding problems.

  Format your response as:
- 1. Brief analysis (2-3 sentences max)
  2. Solution approach
  3. Implementation code
  4. Test cases

- Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_end|>
  <|im_start|>user
  {prompt}<|im_end|>
  <|im_start|>assistant
  """
-
- # Generate with vLLM
- outputs = self.model.generate([formatted_prompt], sampling_params)

- generation_time = time.time() - start_time

- if outputs and len(outputs) > 0:
- output = outputs[0]
- generated_text = output.outputs[0].text
-
- # Check for loop patterns
- if self._detect_loop(generated_text):
- generated_text = self._truncate_loop(generated_text)
- generated_text += "\n\n⚠️ *[Loop detected and truncated]*"
-
- # Get token counts
- prompt_tokens = len(output.prompt_token_ids)
- completion_tokens = len(output.outputs[0].token_ids)
-
- return generated_text, prompt_tokens, completion_tokens, generation_time
  else:
- return "No output generated", 0, 0, 0
-
  except Exception as e:
  return f"Error during generation: {str(e)}", 0, 0, 0

@@ -112,6 +116,8 @@ Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_e

  # Check if same phrase repeats 3+ times
  for length in [10, 15, 20]:
  for i in range(len(words) - length * 3):
  phrase = ' '.join(words[i:i+length])
  rest = ' '.join(words[i+length:])
@@ -123,6 +129,8 @@ Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_e
  """Truncate text at the start of detected loop"""
  words = text.split()
  for length in [10, 15, 20]:
  for i in range(len(words) - length * 2):
  phrase = ' '.join(words[i:i+length])
  rest_start = i + length
@@ -135,12 +143,11 @@ def parse_model_output(text):
  """
  Parse model output to separate thinking and final answer
  ONLY extract code from the final answer section, not from thinking
- Returns: (thinking_content, answer_content, code_blocks)
  """
  loop_warning = ""
- if "[Loop detected and truncated]" in text:
  loop_warning = "\n\n⚠️ **Note**: Repetitive content was detected and removed"
- text = text.replace("⚠️ *[Loop detected and truncated]*", "")

  # Try to find explicit thinking delimiters
  thinking_patterns = [
@@ -159,7 +166,6 @@ def parse_model_output(text):
  break

  # If no explicit thinking tags, try to detect reasoning section
- # Look for a natural break like "Solution:" or "Here's the code:"
  if not thinking_content:
  split_markers = [
  r'(.*?)(?=\n\n(?:Solution|Here\'s|Implementation|Code|Final).*?:)',
@@ -170,7 +176,6 @@ def parse_model_output(text):
  match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
  if match:
  potential_thinking = match.group(1).strip()
- # Only treat as thinking if it's substantial (>100 chars) and contains reasoning keywords
  if len(potential_thinking) > 100:
  thinking_lower = potential_thinking.lower()
  if any(word in thinking_lower for word in ['step', 'approach', 'idea', 'first', 'we can', 'let\'s']):
@@ -178,11 +183,11 @@ def parse_model_output(text):
  answer_content = text[len(potential_thinking):].strip()
  break

- # NOW extract code blocks ONLY from answer_content (not from thinking)
  code_pattern = r'```(\w+)?\n(.*?)```'
  code_blocks = re.findall(code_pattern, answer_content, re.DOTALL)

- # Extract final answer (boxed or use answer_content as-is)
  answer_match = re.search(r'\\boxed\{([^}]+)\}', answer_content)
  if answer_match:
  final_answer = f"**Final Answer:** {answer_match.group(1)}"
@@ -194,23 +199,20 @@ def parse_model_output(text):
  return thinking_content, final_answer, code_blocks

  def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, generation_time):
- """
- Format output as styled HTML - thinking is plain text, code blocks are from final answer only
- """
  total_tokens = prompt_tokens + completion_tokens
  thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
  tokens_per_sec = completion_tokens / generation_time if generation_time > 0 else 0

- # Build thinking section HTML - PLAIN TEXT, NO CODE PARSING
  thinking_html = ""
  if thinking:
- # Escape any HTML in thinking to prevent rendering
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
  thinking_html = f"""
  <details style="background: #f8f9fa; border: 2px solid #e9ecef; border-radius: 12px; padding: 20px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
  <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 20px;">🧠</span>
- <span>Reasoning Process ({int(thinking_tokens_est):,} tokens)</span>
  <span style="margin-left: auto; font-size: 12px; color: #6c757d;">Click to expand/collapse</span>
  </summary>
  <div style="margin-top: 16px; padding-top: 16px; border-top: 1px solid #dee2e6; color: #212529; line-height: 1.7; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, Consolas, monospace; background: #ffffff; padding: 16px; border-radius: 8px;">
@@ -219,15 +221,13 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </details>
  """

- # Build code blocks HTML - ONLY from final answer
  code_html = ""
  if code_blocks:
  code_blocks_html = ""
  for idx, (lang, code) in enumerate(code_blocks):
  lang_display = lang if lang else "code"
  code_id = f"code_{idx}"
-
- # Create downloadable version
  code_clean = code.strip()

  code_blocks_html += f"""
@@ -264,16 +264,9 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  <script>
  function downloadCode(code, lang) {{
  const extensions = {{
- 'python': 'py',
- 'javascript': 'js',
- 'java': 'java',
- 'cpp': 'cpp',
- 'c': 'c',
- 'html': 'html',
- 'css': 'css',
- 'typescript': 'ts',
- 'rust': 'rs',
- 'go': 'go',
  }};
  const ext = extensions[lang.toLowerCase()] || 'txt';
  const filename = `solution.${{ext}}`;
@@ -294,7 +287,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  html = f"""
  <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; max-width: 100%; margin: 0 auto; background: #ffffff; color: #1a1a1a;">

- <!-- Token Stats -->
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin-bottom: 24px; color: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
  <h3 style="margin: 0 0 12px 0; font-size: 18px; font-weight: 600;">📊 Generation Stats</h3>
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 12px; font-size: 14px;">
@@ -304,7 +297,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Speed</div>
- <div style="font-size: 20px; font-weight: bold;">{tokens_per_sec:.0f} t/s</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Prompt</div>
@@ -316,7 +309,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Thinking</div>
- <div style="font-size: 20px; font-weight: bold;">{int(thinking_tokens_est):,}</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Total</div>
@@ -325,10 +318,9 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  </div>

- <!-- Thinking Section (Plain Text Only) -->
  {thinking_html}

- <!-- Answer Section -->
  <div style="background: #ffffff; border: 2px solid #28a745; border-radius: 12px; padding: 24px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(40,167,69,0.1);">
  <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 22px;">✅</span> Final Solution
@@ -338,7 +330,6 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  </div>

- <!-- Code Blocks (From Final Answer Only) -->
  {code_html}

  </div>
@@ -346,54 +337,43 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  return html

  # Initialize model
- print("🔄 Initializing VibeThinker with vLLM (T4-optimized, 40K context)...")
- vibe_model = VibeThinkerVLLM()

- def generate_solution(prompt, temperature=0.6, max_tokens=16384, max_thinking_tokens=4096, progress=gr.Progress()):
  """Generate and format solution with progress tracking"""
  if not prompt.strip():
  return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"

- progress(0, desc="🔄 Initializing generation...")

- progress(0.3, desc="🧠 Model is thinking...")
  response, prompt_tokens, completion_tokens, gen_time = vibe_model.generate_response(
  prompt,
  temperature=temperature,
- max_tokens=max_tokens,
  max_thinking_tokens=max_thinking_tokens
  )

  progress(0.8, desc="📝 Formatting output...")

- # Parse output - thinking stays as plain text, code only from answer
  thinking, answer, code_blocks = parse_model_output(response)
-
- # Format as HTML
  html_output = format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, gen_time)

  progress(1.0, desc="✅ Complete!")
-
  return html_output

  # Create Gradio interface
  with gr.Blocks(
- theme=gr.themes.Soft(
- primary_hue="indigo",
- secondary_hue="purple",
- ),
- css="""
- .gradio-container {
- max-width: 1400px !important;
- }
- """
  ) as demo:
  gr.Markdown("""
  # 🧠 VibeThinker-1.5B Competitive Coding Assistant

  **Optimized for**: Competitive programming (LeetCode, Codeforces, AtCoder) and algorithm challenges

- ⚡ **Powered by vLLM** (40K context) | 🎯 **Best for**: Python algorithmic problems with clear input/output specs

  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
  """)
@@ -408,48 +388,32 @@ with gr.Blocks(

  with gr.Accordion("⚙️ Advanced Settings", open=False):
  temperature_slider = gr.Slider(
- minimum=0.0,
- maximum=1.0,
- value=0.6,
- step=0.1,
  label="🌡️ Temperature (0.6 recommended)"
  )
  max_tokens_slider = gr.Slider(
- minimum=1024,
- maximum=40960,
- value=16384,
- step=1024,
- label="📝 Max Total Tokens (40K max)"
  )
  max_thinking_slider = gr.Slider(
- minimum=512,
- maximum=8192,
- value=3072,
- step=512,
- label="🧠 Max Thinking Tokens (Lower = faster, less verbose)"
  )

  gr.Markdown("""
  **Tips:**
- - Lower thinking tokens (1024-2048) for faster, more direct solutions
- - Higher thinking tokens (4096-8192) for complex problems requiring detailed reasoning
  - Temperature 0.6 balances creativity and accuracy
- - Loop detection is automatic - repetitive output will be truncated
- - Code blocks shown are from final solution only (not from reasoning process)
  """)

  generate_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
  clear_btn = gr.Button("🗑️ Clear", size="sm")
-
- gr.Markdown("""
- ---
- **Status**: Generation progress will appear above the output when running
- """)

  with gr.Column(scale=2):
  output_html = gr.HTML(label="Solution")

- # Button actions
  generate_btn.click(
  fn=generate_solution,
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],
@@ -458,11 +422,9 @@ with gr.Blocks(

  clear_btn.click(
  fn=lambda: ("", ""),
- inputs=None,
  outputs=[prompt_input, output_html]
  )

- # Example problems
  gr.Examples(
  examples=[
  ["Write a Python function to find the maximum sum of a contiguous subarray (Kadane's Algorithm). Include edge cases and test with array [-2,1,-3,4,-1,2,1,-5,4]"],
 
  import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import re
  import time

+ class VibeThinkerModel:
  def __init__(self):
  self.model = None
+ self.tokenizer = None
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
  self.load_model()

  def load_model(self):
+ """Load VibeThinker model with transformers"""
  try:
+ print("🔄 Loading VibeThinker-1.5B with transformers...")
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ "WeiboAI/VibeThinker-1.5B",
+ trust_remote_code=True
+ )
+
+ self.model = AutoModelForCausalLM.from_pretrained(
+ "WeiboAI/VibeThinker-1.5B",
+ torch_dtype=torch.float16,
+ device_map="auto",
  trust_remote_code=True
  )
+
+ print(f"✅ Model loaded successfully on {self.device}")
+ print(f"💾 Model memory: ~{self.model.get_memory_footprint() / 1e9:.2f} GB")
+
  except Exception as e:
  print(f"❌ Error loading model: {e}")
  raise

+ def generate_response(self, prompt, temperature=0.6, max_new_tokens=8192, max_thinking_tokens=4096):
  """
+ Generate response with thinking length control

  Args:
  prompt: Input prompt
  temperature: Sampling temperature
+ max_new_tokens: Maximum new tokens to generate
+ max_thinking_tokens: Hint for reasoning depth (used in prompt)
  """
+ if not self.model or not self.tokenizer:
  return "Model not loaded!", 0, 0, 0

  try:
  start_time = time.time()

+ # Format prompt for competitive coding
  formatted_prompt = f"""<|im_start|>system
  You are a competitive programming expert. Provide clear, concise solutions to coding problems.

  Format your response as:
+ 1. Brief analysis (2-3 sentences)
  2. Solution approach
  3. Implementation code
  4. Test cases

+ Keep reasoning under {max_thinking_tokens} tokens. Be direct and avoid repetition.<|im_end|>
  <|im_start|>user
  {prompt}<|im_end|>
  <|im_start|>assistant
  """

+ # Tokenize input
+ inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
+ prompt_length = inputs.input_ids.shape[1]

+ # Generate with appropriate parameters
+ with torch.no_grad():
+ outputs = self.model.generate(
+ **inputs,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=0.95,
+ top_k=50,
+ do_sample=True,
+ repetition_penalty=1.1,
+ pad_token_id=self.tokenizer.eos_token_id,
+ )
+
+ # Decode output
+ full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ # Extract only the assistant's response
+ if "<|im_start|>assistant" in full_output:
+ generated_text = full_output.split("<|im_start|>assistant")[-1].strip()
  else:
+ generated_text = full_output[len(formatted_prompt):].strip()
+
+ # Check for loops and truncate if needed
+ if self._detect_loop(generated_text):
+ generated_text = self._truncate_loop(generated_text)
+ generated_text += "\n\n⚠️ *[Repetitive content detected and truncated]*"
+
+ generation_time = time.time() - start_time
+
+ # Calculate token counts
+ completion_length = outputs.shape[1] - prompt_length
+
+ return generated_text, prompt_length, completion_length, generation_time
+
  except Exception as e:
  return f"Error during generation: {str(e)}", 0, 0, 0
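A note on the prompt handling added above: the new code builds the <|im_start|> chat format by hand and then splits the assistant turn back out of the decoded text. If the checkpoint ships a chat template (not verified here), a roughly equivalent and less fragile sketch would be:

# Hypothetical alternative to the hand-built f-string; assumes the tokenizer
# provides a chat template for this checkpoint.
messages = [
    {"role": "system", "content": "You are a competitive programming expert. ..."},
    {"role": "user", "content": prompt},
]
input_ids = self.tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(self.device)
output_ids = self.model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature)
# Decoding only the newly generated tokens avoids any string splitting on special markers
generated_text = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)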


  # Check if same phrase repeats 3+ times
  for length in [10, 15, 20]:
+ if len(words) < length * 3:
+ continue
  for i in range(len(words) - length * 3):
  phrase = ' '.join(words[i:i+length])
  rest = ' '.join(words[i+length:])

  """Truncate text at the start of detected loop"""
  words = text.split()
  for length in [10, 15, 20]:
+ if len(words) < length * 2:
+ continue
  for i in range(len(words) - length * 2):
  phrase = ' '.join(words[i:i+length])
  rest_start = i + length
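The two hunks above only add the short-input guards; the comparison that actually flags a repeat falls outside the visible context. A hypothetical completion of _detect_loop consistent with the lines shown (the counting threshold is an assumption, not taken from the diff):

def _detect_loop(self, text):
    # Assumed shape of the full check: a 10/15/20-word phrase that occurs
    # three or more times in total is treated as a loop.
    words = text.split()
    for length in [10, 15, 20]:
        if len(words) < length * 3:
            continue
        for i in range(len(words) - length * 3):
            phrase = ' '.join(words[i:i+length])
            rest = ' '.join(words[i+length:])
            if rest.count(phrase) >= 2:  # phrase appears at least twice more
                return True
    return False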
 
  """
  Parse model output to separate thinking and final answer
  ONLY extract code from the final answer section, not from thinking
  """
  loop_warning = ""
+ if "[Repetitive content detected and truncated]" in text:
  loop_warning = "\n\n⚠️ **Note**: Repetitive content was detected and removed"
+ text = text.replace("⚠️ *[Repetitive content detected and truncated]*", "")

  # Try to find explicit thinking delimiters
  thinking_patterns = [

  break

  # If no explicit thinking tags, try to detect reasoning section
  if not thinking_content:
  split_markers = [
  r'(.*?)(?=\n\n(?:Solution|Here\'s|Implementation|Code|Final).*?:)',

  match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
  if match:
  potential_thinking = match.group(1).strip()
  if len(potential_thinking) > 100:
  thinking_lower = potential_thinking.lower()
  if any(word in thinking_lower for word in ['step', 'approach', 'idea', 'first', 'we can', 'let\'s']):

  answer_content = text[len(potential_thinking):].strip()
  break

+ # Extract code blocks ONLY from answer_content
  code_pattern = r'```(\w+)?\n(.*?)```'
  code_blocks = re.findall(code_pattern, answer_content, re.DOTALL)

+ # Extract final answer
  answer_match = re.search(r'\\boxed\{([^}]+)\}', answer_content)
  if answer_match:
  final_answer = f"**Final Answer:** {answer_match.group(1)}"
 
  return thinking_content, final_answer, code_blocks

  def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, generation_time):
+ """Format output as styled HTML"""
  total_tokens = prompt_tokens + completion_tokens
  thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
  tokens_per_sec = completion_tokens / generation_time if generation_time > 0 else 0

+ # Build thinking section HTML - plain text only
  thinking_html = ""
  if thinking:
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
  thinking_html = f"""
  <details style="background: #f8f9fa; border: 2px solid #e9ecef; border-radius: 12px; padding: 20px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
  <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 20px;">🧠</span>
+ <span>Reasoning Process (~{int(thinking_tokens_est):,} tokens)</span>
  <span style="margin-left: auto; font-size: 12px; color: #6c757d;">Click to expand/collapse</span>
  </summary>
  <div style="margin-top: 16px; padding-top: 16px; border-top: 1px solid #dee2e6; color: #212529; line-height: 1.7; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, Consolas, monospace; background: #ffffff; padding: 16px; border-radius: 8px;">

  </details>
  """

+ # Build code blocks HTML
  code_html = ""
  if code_blocks:
  code_blocks_html = ""
  for idx, (lang, code) in enumerate(code_blocks):
  lang_display = lang if lang else "code"
  code_id = f"code_{idx}"
  code_clean = code.strip()

  code_blocks_html += f"""

  <script>
  function downloadCode(code, lang) {{
  const extensions = {{
+ 'python': 'py', 'javascript': 'js', 'java': 'java',
+ 'cpp': 'cpp', 'c': 'c', 'html': 'html', 'css': 'css',
+ 'typescript': 'ts', 'rust': 'rs', 'go': 'go',
  }};
  const ext = extensions[lang.toLowerCase()] || 'txt';
  const filename = `solution.${{ext}}`;

  html = f"""
  <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; max-width: 100%; margin: 0 auto; background: #ffffff; color: #1a1a1a;">

+ <!-- Stats -->
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin-bottom: 24px; color: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
  <h3 style="margin: 0 0 12px 0; font-size: 18px; font-weight: 600;">📊 Generation Stats</h3>
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 12px; font-size: 14px;">

  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Speed</div>
+ <div style="font-size: 20px; font-weight: bold;">{tokens_per_sec:.1f} t/s</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Prompt</div>

  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Thinking</div>
+ <div style="font-size: 20px; font-weight: bold;">~{int(thinking_tokens_est):,}</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Total</div>

  </div>
  </div>

  {thinking_html}

+ <!-- Answer -->
  <div style="background: #ffffff; border: 2px solid #28a745; border-radius: 12px; padding: 24px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(40,167,69,0.1);">
  <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 22px;">✅</span> Final Solution

  </div>
  </div>

  {code_html}

  </div>
 
  return html

  # Initialize model
+ print("🔄 Initializing VibeThinker-1.5B...")
+ vibe_model = VibeThinkerModel()

+ def generate_solution(prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096, progress=gr.Progress()):
  """Generate and format solution with progress tracking"""
  if not prompt.strip():
  return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"

+ progress(0, desc="🔄 Initializing...")
+ progress(0.2, desc="🧠 Generating solution...")

  response, prompt_tokens, completion_tokens, gen_time = vibe_model.generate_response(
  prompt,
  temperature=temperature,
+ max_new_tokens=max_tokens,
  max_thinking_tokens=max_thinking_tokens
  )

  progress(0.8, desc="📝 Formatting output...")

  thinking, answer, code_blocks = parse_model_output(response)
  html_output = format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, gen_time)

  progress(1.0, desc="✅ Complete!")
  return html_output

  # Create Gradio interface
  with gr.Blocks(
+ theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
+ css=".gradio-container { max-width: 1400px !important; }"
  ) as demo:
  gr.Markdown("""
  # 🧠 VibeThinker-1.5B Competitive Coding Assistant

  **Optimized for**: Competitive programming (LeetCode, Codeforces, AtCoder) and algorithm challenges

+ 🎯 **Best for**: Python algorithmic problems with clear input/output specifications

  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
  """)
 

  with gr.Accordion("⚙️ Advanced Settings", open=False):
  temperature_slider = gr.Slider(
+ minimum=0.0, maximum=1.0, value=0.6, step=0.1,
  label="🌡️ Temperature (0.6 recommended)"
  )
  max_tokens_slider = gr.Slider(
+ minimum=1024, maximum=16384, value=8192, step=1024,
+ label="📝 Max New Tokens"
  )
  max_thinking_slider = gr.Slider(
+ minimum=512, maximum=8192, value=3072, step=512,
+ label="🧠 Max Thinking Tokens (hint for prompt)"
  )

  gr.Markdown("""
  **Tips:**
+ - Lower thinking tokens (1024-2048) for faster, direct solutions
+ - Higher thinking tokens (4096-8192) for complex reasoning
  - Temperature 0.6 balances creativity and accuracy
+ - Automatic loop detection and truncation
  """)

  generate_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
  clear_btn = gr.Button("🗑️ Clear", size="sm")

  with gr.Column(scale=2):
  output_html = gr.HTML(label="Solution")

  generate_btn.click(
  fn=generate_solution,
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],

  clear_btn.click(
  fn=lambda: ("", ""),
  outputs=[prompt_input, output_html]
  )

  gr.Examples(
  examples=[
  ["Write a Python function to find the maximum sum of a contiguous subarray (Kadane's Algorithm). Include edge cases and test with array [-2,1,-3,4,-1,2,1,-5,4]"],