VladBoyko committed on
Commit bfb609d · verified · 1 Parent(s): a271ec8

Update app.py


switched from vLLM to Hugging Face Transformers
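For context when skimming the diff below: the commit replaces the vLLM offline engine with a plain transformers + torch pipeline. A minimal sketch of the two generation paths being swapped (model ID as in the diff; arguments trimmed, so treat this as an illustration rather than the exact app code):

# Before (removed): vLLM engine
# from vllm import LLM, SamplingParams
# llm = LLM(model="WeiboAI/VibeThinker-1.5B", dtype="float16", trust_remote_code=True)
# text = llm.generate([prompt], SamplingParams(temperature=0.6, max_tokens=16384))[0].outputs[0].text

# After (added): Hugging Face transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("WeiboAI/VibeThinker-1.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B", torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)
prompt = "Write a Python function that reverses a string."  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=8192, do_sample=True, temperature=0.6)
text = tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)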

Files changed (1)
  1. app.py +101 -139
app.py CHANGED
@@ -1,106 +1,110 @@
  import gradio as gr
- import os
  import re
  import time
- from vllm import LLM, SamplingParams

- # Force XFormers backend for T4 compatibility
- os.environ['VLLM_ATTENTION_BACKEND'] = 'XFORMERS'
- os.environ['VLLM_USE_TRITON_FLASH_ATTN'] = '0'
-
- class VibeThinkerVLLM:
  def __init__(self):
  self.model = None
  self.load_model()

  def load_model(self):
- """Load VibeThinker model with vLLM (T4-compatible settings)"""
  try:
- self.model = LLM(
- model="WeiboAI/VibeThinker-1.5B",
- dtype="float16", # Use float16 instead of bfloat16 for T4
- gpu_memory_utilization=0.85,
- max_model_len=40960, # Full 40K context as per docs
- enforce_eager=True, # Disable CUDA graphs for T4
- disable_custom_all_reduce=True, # Avoid custom kernels
- enable_prefix_caching=False, # Disable for stability
- max_num_seqs=1, # Process one sequence at a time
  trust_remote_code=True
  )
- print("✅ vLLM model loaded successfully with T4-compatible settings (40K context)")
  except Exception as e:
  print(f"❌ Error loading model: {e}")
  raise

- def generate_response(self, prompt, temperature=0.6, max_tokens=16384, max_thinking_tokens=4096):
  """
- Generate response with thinking length control and loop detection

  Args:
  prompt: Input prompt
  temperature: Sampling temperature
- max_tokens: Total max tokens (thinking + output)
- max_thinking_tokens: Maximum tokens for reasoning phase
  """
- if not self.model:
  return "Model not loaded!", 0, 0, 0

  try:
  start_time = time.time()

- # Create sampling params with stop sequences to prevent loops
- sampling_params = SamplingParams(
- temperature=temperature,
- top_p=0.95,
- top_k=-1,
- max_tokens=max_tokens,
- # Stop sequences to prevent infinite loops
- stop=[
- "Wait, the problem says", # Common loop pattern
- "\n\n\n\n", # Multiple blank lines
- "###END###", # Custom stop token
- ],
- repetition_penalty=1.1, # Penalize repetition
- )
-
- # Format prompt clearly for competitive coding
  formatted_prompt = f"""<|im_start|>system
  You are a competitive programming expert. Provide clear, concise solutions to coding problems.

  Format your response as:
- 1. Brief analysis (2-3 sentences max)
  2. Solution approach
  3. Implementation code
  4. Test cases

- Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_end|>
  <|im_start|>user
  {prompt}<|im_end|>
  <|im_start|>assistant
  """
-
- # Generate with vLLM
- outputs = self.model.generate([formatted_prompt], sampling_params)

- generation_time = time.time() - start_time

- if outputs and len(outputs) > 0:
- output = outputs[0]
- generated_text = output.outputs[0].text
-
- # Check for loop patterns
- if self._detect_loop(generated_text):
- generated_text = self._truncate_loop(generated_text)
- generated_text += "\n\n⚠️ *[Loop detected and truncated]*"
-
- # Get token counts
- prompt_tokens = len(output.prompt_token_ids)
- completion_tokens = len(output.outputs[0].token_ids)
-
- return generated_text, prompt_tokens, completion_tokens, generation_time
  else:
- return "No output generated", 0, 0, 0
-
  except Exception as e:
  return f"Error during generation: {str(e)}", 0, 0, 0

@@ -112,6 +116,8 @@ Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_e

  # Check if same phrase repeats 3+ times
  for length in [10, 15, 20]:
  for i in range(len(words) - length * 3):
  phrase = ' '.join(words[i:i+length])
  rest = ' '.join(words[i+length:])
@@ -123,6 +129,8 @@ Keep reasoning under {max_thinking_tokens} tokens. DO NOT repeat yourself.<|im_e
  """Truncate text at the start of detected loop"""
  words = text.split()
  for length in [10, 15, 20]:
  for i in range(len(words) - length * 2):
  phrase = ' '.join(words[i:i+length])
  rest_start = i + length
@@ -135,12 +143,11 @@ def parse_model_output(text):
  """
  Parse model output to separate thinking and final answer
  ONLY extract code from the final answer section, not from thinking
- Returns: (thinking_content, answer_content, code_blocks)
  """
  loop_warning = ""
- if "[Loop detected and truncated]" in text:
  loop_warning = "\n\n⚠️ **Note**: Repetitive content was detected and removed"
- text = text.replace("⚠️ *[Loop detected and truncated]*", "")

  # Try to find explicit thinking delimiters
  thinking_patterns = [
@@ -159,7 +166,6 @@ def parse_model_output(text):
  break

  # If no explicit thinking tags, try to detect reasoning section
- # Look for a natural break like "Solution:" or "Here's the code:"
  if not thinking_content:
  split_markers = [
  r'(.*?)(?=\n\n(?:Solution|Here\'s|Implementation|Code|Final).*?:)',
@@ -170,7 +176,6 @@ def parse_model_output(text):
  match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
  if match:
  potential_thinking = match.group(1).strip()
- # Only treat as thinking if it's substantial (>100 chars) and contains reasoning keywords
  if len(potential_thinking) > 100:
  thinking_lower = potential_thinking.lower()
  if any(word in thinking_lower for word in ['step', 'approach', 'idea', 'first', 'we can', 'let\'s']):
@@ -178,11 +183,11 @@ def parse_model_output(text):
  answer_content = text[len(potential_thinking):].strip()
  break

- # NOW extract code blocks ONLY from answer_content (not from thinking)
  code_pattern = r'```(\w+)?\n(.*?)```'
  code_blocks = re.findall(code_pattern, answer_content, re.DOTALL)

- # Extract final answer (boxed or use answer_content as-is)
  answer_match = re.search(r'\\boxed\{([^}]+)\}', answer_content)
  if answer_match:
  final_answer = f"**Final Answer:** {answer_match.group(1)}"
@@ -194,23 +199,20 @@ def parse_model_output(text):
  return thinking_content, final_answer, code_blocks

  def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, generation_time):
- """
- Format output as styled HTML - thinking is plain text, code blocks are from final answer only
- """
  total_tokens = prompt_tokens + completion_tokens
  thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
  tokens_per_sec = completion_tokens / generation_time if generation_time > 0 else 0

- # Build thinking section HTML - PLAIN TEXT, NO CODE PARSING
  thinking_html = ""
  if thinking:
- # Escape any HTML in thinking to prevent rendering
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
  thinking_html = f"""
  <details style="background: #f8f9fa; border: 2px solid #e9ecef; border-radius: 12px; padding: 20px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
  <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 20px;">🧠</span>
- <span>Reasoning Process ({int(thinking_tokens_est):,} tokens)</span>
  <span style="margin-left: auto; font-size: 12px; color: #6c757d;">Click to expand/collapse</span>
  </summary>
  <div style="margin-top: 16px; padding-top: 16px; border-top: 1px solid #dee2e6; color: #212529; line-height: 1.7; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, Consolas, monospace; background: #ffffff; padding: 16px; border-radius: 8px;">
@@ -219,15 +221,13 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </details>
  """

- # Build code blocks HTML - ONLY from final answer
  code_html = ""
  if code_blocks:
  code_blocks_html = ""
  for idx, (lang, code) in enumerate(code_blocks):
  lang_display = lang if lang else "code"
  code_id = f"code_{idx}"
-
- # Create downloadable version
  code_clean = code.strip()

  code_blocks_html += f"""
@@ -264,16 +264,9 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  <script>
  function downloadCode(code, lang) {{
  const extensions = {{
- 'python': 'py',
- 'javascript': 'js',
- 'java': 'java',
- 'cpp': 'cpp',
- 'c': 'c',
- 'html': 'html',
- 'css': 'css',
- 'typescript': 'ts',
- 'rust': 'rs',
- 'go': 'go',
  }};
  const ext = extensions[lang.toLowerCase()] || 'txt';
  const filename = `solution.${{ext}}`;
@@ -294,7 +287,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  html = f"""
  <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; max-width: 100%; margin: 0 auto; background: #ffffff; color: #1a1a1a;">

- <!-- Token Stats -->
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin-bottom: 24px; color: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
  <h3 style="margin: 0 0 12px 0; font-size: 18px; font-weight: 600;">📊 Generation Stats</h3>
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 12px; font-size: 14px;">
@@ -304,7 +297,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Speed</div>
- <div style="font-size: 20px; font-weight: bold;">{tokens_per_sec:.0f} t/s</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Prompt</div>
@@ -316,7 +309,7 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Thinking</div>
- <div style="font-size: 20px; font-weight: bold;">{int(thinking_tokens_est):,}</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Total</div>
@@ -325,10 +318,9 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  </div>

- <!-- Thinking Section (Plain Text Only) -->
  {thinking_html}

- <!-- Answer Section -->
  <div style="background: #ffffff; border: 2px solid #28a745; border-radius: 12px; padding: 24px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(40,167,69,0.1);">
  <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 22px;">✅</span> Final Solution
@@ -338,7 +330,6 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  </div>
  </div>

- <!-- Code Blocks (From Final Answer Only) -->
  {code_html}

  </div>
@@ -346,54 +337,43 @@ def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_
  return html

  # Initialize model
- print("🔄 Initializing VibeThinker with vLLM (T4-optimized, 40K context)...")
- vibe_model = VibeThinkerVLLM()

- def generate_solution(prompt, temperature=0.6, max_tokens=16384, max_thinking_tokens=4096, progress=gr.Progress()):
  """Generate and format solution with progress tracking"""
  if not prompt.strip():
  return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"

- progress(0, desc="🔄 Initializing generation...")

- progress(0.3, desc="🧠 Model is thinking...")
  response, prompt_tokens, completion_tokens, gen_time = vibe_model.generate_response(
  prompt,
  temperature=temperature,
- max_tokens=max_tokens,
  max_thinking_tokens=max_thinking_tokens
  )

  progress(0.8, desc="📝 Formatting output...")

- # Parse output - thinking stays as plain text, code only from answer
  thinking, answer, code_blocks = parse_model_output(response)
-
- # Format as HTML
  html_output = format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, gen_time)

  progress(1.0, desc="✅ Complete!")
-
  return html_output

  # Create Gradio interface
  with gr.Blocks(
- theme=gr.themes.Soft(
- primary_hue="indigo",
- secondary_hue="purple",
- ),
- css="""
- .gradio-container {
- max-width: 1400px !important;
- }
- """
  ) as demo:
  gr.Markdown("""
  # 🧠 VibeThinker-1.5B Competitive Coding Assistant

  **Optimized for**: Competitive programming (LeetCode, Codeforces, AtCoder) and algorithm challenges

- ⚡ **Powered by vLLM** (40K context) | 🎯 **Best for**: Python algorithmic problems with clear input/output specs

  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
  """)
@@ -408,48 +388,32 @@ with gr.Blocks(

  with gr.Accordion("⚙️ Advanced Settings", open=False):
  temperature_slider = gr.Slider(
- minimum=0.0,
- maximum=1.0,
- value=0.6,
- step=0.1,
  label="🌡️ Temperature (0.6 recommended)"
  )
  max_tokens_slider = gr.Slider(
- minimum=1024,
- maximum=40960,
- value=16384,
- step=1024,
- label="📝 Max Total Tokens (40K max)"
  )
  max_thinking_slider = gr.Slider(
- minimum=512,
- maximum=8192,
- value=3072,
- step=512,
- label="🧠 Max Thinking Tokens (Lower = faster, less verbose)"
  )

  gr.Markdown("""
  **Tips:**
- - Lower thinking tokens (1024-2048) for faster, more direct solutions
- - Higher thinking tokens (4096-8192) for complex problems requiring detailed reasoning
  - Temperature 0.6 balances creativity and accuracy
- - Loop detection is automatic - repetitive output will be truncated
- - Code blocks shown are from final solution only (not from reasoning process)
  """)

  generate_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
  clear_btn = gr.Button("🗑️ Clear", size="sm")
-
- gr.Markdown("""
- ---
- **Status**: Generation progress will appear above the output when running
- """)

  with gr.Column(scale=2):
  output_html = gr.HTML(label="Solution")

- # Button actions
  generate_btn.click(
  fn=generate_solution,
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],
@@ -458,11 +422,9 @@ with gr.Blocks(

  clear_btn.click(
  fn=lambda: ("", ""),
- inputs=None,
  outputs=[prompt_input, output_html]
  )

- # Example problems
  gr.Examples(
  examples=[
  ["Write a Python function to find the maximum sum of a contiguous subarray (Kadane's Algorithm). Include edge cases and test with array [-2,1,-3,4,-1,2,1,-5,4]"],
 
  import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import re
  import time

+ class VibeThinkerModel:
  def __init__(self):
  self.model = None
+ self.tokenizer = None
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
  self.load_model()

  def load_model(self):
+ """Load VibeThinker model with transformers"""
  try:
+ print("🔄 Loading VibeThinker-1.5B with transformers...")
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ "WeiboAI/VibeThinker-1.5B",
+ trust_remote_code=True
+ )
+
+ self.model = AutoModelForCausalLM.from_pretrained(
+ "WeiboAI/VibeThinker-1.5B",
+ torch_dtype=torch.float16,
+ device_map="auto",
  trust_remote_code=True
  )
+
+ print(f"✅ Model loaded successfully on {self.device}")
+ print(f"💾 Model memory: ~{self.model.get_memory_footprint() / 1e9:.2f} GB")
+
  except Exception as e:
  print(f"❌ Error loading model: {e}")
  raise

+ def generate_response(self, prompt, temperature=0.6, max_new_tokens=8192, max_thinking_tokens=4096):
  """
+ Generate response with thinking length control

  Args:
  prompt: Input prompt
  temperature: Sampling temperature
+ max_new_tokens: Maximum new tokens to generate
+ max_thinking_tokens: Hint for reasoning depth (used in prompt)
  """
+ if not self.model or not self.tokenizer:
  return "Model not loaded!", 0, 0, 0

  try:
  start_time = time.time()

+ # Format prompt for competitive coding
  formatted_prompt = f"""<|im_start|>system
  You are a competitive programming expert. Provide clear, concise solutions to coding problems.

  Format your response as:
+ 1. Brief analysis (2-3 sentences)
  2. Solution approach
  3. Implementation code
  4. Test cases

+ Keep reasoning under {max_thinking_tokens} tokens. Be direct and avoid repetition.<|im_end|>
  <|im_start|>user
  {prompt}<|im_end|>
  <|im_start|>assistant
  """

+ # Tokenize input
+ inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
+ prompt_length = inputs.input_ids.shape[1]

+ # Generate with appropriate parameters
+ with torch.no_grad():
+ outputs = self.model.generate(
+ **inputs,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=0.95,
+ top_k=50,
+ do_sample=True,
+ repetition_penalty=1.1,
+ pad_token_id=self.tokenizer.eos_token_id,
+ )
+
+ # Decode output
+ full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ # Extract only the assistant's response
+ if "<|im_start|>assistant" in full_output:
+ generated_text = full_output.split("<|im_start|>assistant")[-1].strip()
  else:
+ generated_text = full_output[len(formatted_prompt):].strip()
+
+ # Check for loops and truncate if needed
+ if self._detect_loop(generated_text):
+ generated_text = self._truncate_loop(generated_text)
+ generated_text += "\n\n⚠️ *[Repetitive content detected and truncated]*"
+
+ generation_time = time.time() - start_time
+
+ # Calculate token counts
+ completion_length = outputs.shape[1] - prompt_length
+
+ return generated_text, prompt_length, completion_length, generation_time
+
  except Exception as e:
  return f"Error during generation: {str(e)}", 0, 0, 0
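A note on the prompt handling added above: the new code builds the <|im_start|> chat format by hand and then splits the assistant turn back out of the decoded text. If the checkpoint ships a chat template (not verified here), a roughly equivalent and less fragile sketch would be:

# Hypothetical alternative to the hand-built f-string; assumes the tokenizer
# provides a chat template for this checkpoint.
messages = [
    {"role": "system", "content": "You are a competitive programming expert. ..."},
    {"role": "user", "content": prompt},
]
input_ids = self.tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(self.device)
output_ids = self.model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature)
# Decoding only the newly generated tokens avoids any string splitting on special markers
generated_text = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)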


  # Check if same phrase repeats 3+ times
  for length in [10, 15, 20]:
+ if len(words) < length * 3:
+ continue
  for i in range(len(words) - length * 3):
  phrase = ' '.join(words[i:i+length])
  rest = ' '.join(words[i+length:])

  """Truncate text at the start of detected loop"""
  words = text.split()
  for length in [10, 15, 20]:
+ if len(words) < length * 2:
+ continue
  for i in range(len(words) - length * 2):
  phrase = ' '.join(words[i:i+length])
  rest_start = i + length
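The two hunks above only add the short-input guards; the comparison that actually flags a repeat falls outside the visible context. A hypothetical completion of _detect_loop consistent with the lines shown (the counting threshold is an assumption, not taken from the diff):

def _detect_loop(self, text):
    # Assumed shape of the full check: a 10/15/20-word phrase that occurs
    # three or more times in total is treated as a loop.
    words = text.split()
    for length in [10, 15, 20]:
        if len(words) < length * 3:
            continue
        for i in range(len(words) - length * 3):
            phrase = ' '.join(words[i:i+length])
            rest = ' '.join(words[i+length:])
            if rest.count(phrase) >= 2:  # phrase appears at least twice more
                return True
    return False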
 
  """
  Parse model output to separate thinking and final answer
  ONLY extract code from the final answer section, not from thinking
  """
  loop_warning = ""
+ if "[Repetitive content detected and truncated]" in text:
  loop_warning = "\n\n⚠️ **Note**: Repetitive content was detected and removed"
+ text = text.replace("⚠️ *[Repetitive content detected and truncated]*", "")

  # Try to find explicit thinking delimiters
  thinking_patterns = [

  break

  # If no explicit thinking tags, try to detect reasoning section
  if not thinking_content:
  split_markers = [
  r'(.*?)(?=\n\n(?:Solution|Here\'s|Implementation|Code|Final).*?:)',

  match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
  if match:
  potential_thinking = match.group(1).strip()
  if len(potential_thinking) > 100:
  thinking_lower = potential_thinking.lower()
  if any(word in thinking_lower for word in ['step', 'approach', 'idea', 'first', 'we can', 'let\'s']):

  answer_content = text[len(potential_thinking):].strip()
  break

+ # Extract code blocks ONLY from answer_content
  code_pattern = r'```(\w+)?\n(.*?)```'
  code_blocks = re.findall(code_pattern, answer_content, re.DOTALL)

+ # Extract final answer
  answer_match = re.search(r'\\boxed\{([^}]+)\}', answer_content)
  if answer_match:
  final_answer = f"**Final Answer:** {answer_match.group(1)}"
 
  return thinking_content, final_answer, code_blocks

  def format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, generation_time):
+ """Format output as styled HTML"""
  total_tokens = prompt_tokens + completion_tokens
  thinking_tokens_est = len(thinking.split()) * 1.3 if thinking else 0
  tokens_per_sec = completion_tokens / generation_time if generation_time > 0 else 0

+ # Build thinking section HTML - plain text only
  thinking_html = ""
  if thinking:
  thinking_escaped = thinking.replace('<', '&lt;').replace('>', '&gt;')
  thinking_html = f"""
  <details style="background: #f8f9fa; border: 2px solid #e9ecef; border-radius: 12px; padding: 20px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(0,0,0,0.05);">
  <summary style="cursor: pointer; font-weight: 600; font-size: 16px; color: #495057; user-select: none; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 20px;">🧠</span>
+ <span>Reasoning Process (~{int(thinking_tokens_est):,} tokens)</span>
  <span style="margin-left: auto; font-size: 12px; color: #6c757d;">Click to expand/collapse</span>
  </summary>
  <div style="margin-top: 16px; padding-top: 16px; border-top: 1px solid #dee2e6; color: #212529; line-height: 1.7; white-space: pre-wrap; font-size: 14px; font-family: 'SF Mono', Monaco, Consolas, monospace; background: #ffffff; padding: 16px; border-radius: 8px;">

  </details>
  """

+ # Build code blocks HTML
  code_html = ""
  if code_blocks:
  code_blocks_html = ""
  for idx, (lang, code) in enumerate(code_blocks):
  lang_display = lang if lang else "code"
  code_id = f"code_{idx}"
  code_clean = code.strip()

  code_blocks_html += f"""

  <script>
  function downloadCode(code, lang) {{
  const extensions = {{
+ 'python': 'py', 'javascript': 'js', 'java': 'java',
+ 'cpp': 'cpp', 'c': 'c', 'html': 'html', 'css': 'css',
+ 'typescript': 'ts', 'rust': 'rs', 'go': 'go',
  }};
  const ext = extensions[lang.toLowerCase()] || 'txt';
  const filename = `solution.${{ext}}`;

  html = f"""
  <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; max-width: 100%; margin: 0 auto; background: #ffffff; color: #1a1a1a;">

+ <!-- Stats -->
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin-bottom: 24px; color: white; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
  <h3 style="margin: 0 0 12px 0; font-size: 18px; font-weight: 600;">📊 Generation Stats</h3>
  <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 12px; font-size: 14px;">

  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Speed</div>
+ <div style="font-size: 20px; font-weight: bold;">{tokens_per_sec:.1f} t/s</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Prompt</div>

  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Thinking</div>
+ <div style="font-size: 20px; font-weight: bold;">~{int(thinking_tokens_est):,}</div>
  </div>
  <div style="background: rgba(255,255,255,0.2); padding: 12px; border-radius: 8px;">
  <div style="opacity: 0.9; font-size: 12px; margin-bottom: 4px;">Total</div>

  </div>
  </div>

  {thinking_html}

+ <!-- Answer -->
  <div style="background: #ffffff; border: 2px solid #28a745; border-radius: 12px; padding: 24px; margin-bottom: 24px; box-shadow: 0 2px 4px rgba(40,167,69,0.1);">
  <h3 style="margin: 0 0 16px 0; color: #28a745; font-size: 18px; font-weight: 600; display: flex; align-items: center; gap: 8px;">
  <span style="font-size: 22px;">✅</span> Final Solution

  </div>
  </div>

  {code_html}

  </div>
 
  return html

  # Initialize model
+ print("🔄 Initializing VibeThinker-1.5B...")
+ vibe_model = VibeThinkerModel()

+ def generate_solution(prompt, temperature=0.6, max_tokens=8192, max_thinking_tokens=4096, progress=gr.Progress()):
  """Generate and format solution with progress tracking"""
  if not prompt.strip():
  return "<p style='color: #dc3545; font-size: 16px; padding: 20px;'>⚠️ Please enter a problem to solve.</p>"

+ progress(0, desc="🔄 Initializing...")
+ progress(0.2, desc="🧠 Generating solution...")

  response, prompt_tokens, completion_tokens, gen_time = vibe_model.generate_response(
  prompt,
  temperature=temperature,
+ max_new_tokens=max_tokens,
  max_thinking_tokens=max_thinking_tokens
  )

  progress(0.8, desc="📝 Formatting output...")

  thinking, answer, code_blocks = parse_model_output(response)
  html_output = format_output_html(thinking, answer, code_blocks, prompt_tokens, completion_tokens, gen_time)

  progress(1.0, desc="✅ Complete!")
  return html_output

  # Create Gradio interface
  with gr.Blocks(
+ theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
+ css=".gradio-container { max-width: 1400px !important; }"
  ) as demo:
  gr.Markdown("""
  # 🧠 VibeThinker-1.5B Competitive Coding Assistant

  **Optimized for**: Competitive programming (LeetCode, Codeforces, AtCoder) and algorithm challenges

+ 🎯 **Best for**: Python algorithmic problems with clear input/output specifications

  ⚠️ **Note**: This model is specialized for competitive programming, not general software development
  """)
 

  with gr.Accordion("⚙️ Advanced Settings", open=False):
  temperature_slider = gr.Slider(
+ minimum=0.0, maximum=1.0, value=0.6, step=0.1,
  label="🌡️ Temperature (0.6 recommended)"
  )
  max_tokens_slider = gr.Slider(
+ minimum=1024, maximum=16384, value=8192, step=1024,
+ label="📝 Max New Tokens"
  )
  max_thinking_slider = gr.Slider(
+ minimum=512, maximum=8192, value=3072, step=512,
+ label="🧠 Max Thinking Tokens (hint for prompt)"
  )

  gr.Markdown("""
  **Tips:**
+ - Lower thinking tokens (1024-2048) for faster, direct solutions
+ - Higher thinking tokens (4096-8192) for complex reasoning
  - Temperature 0.6 balances creativity and accuracy
+ - Automatic loop detection and truncation
  """)

  generate_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
  clear_btn = gr.Button("🗑️ Clear", size="sm")

  with gr.Column(scale=2):
  output_html = gr.HTML(label="Solution")

  generate_btn.click(
  fn=generate_solution,
  inputs=[prompt_input, temperature_slider, max_tokens_slider, max_thinking_slider],

  clear_btn.click(
  fn=lambda: ("", ""),
  outputs=[prompt_input, output_html]
  )

  gr.Examples(
  examples=[
  ["Write a Python function to find the maximum sum of a contiguous subarray (Kadane's Algorithm). Include edge cases and test with array [-2,1,-3,4,-1,2,1,-5,4]"],