VladBoyko committed on
Commit e5b56c6 · verified · 1 Parent(s): 9f752eb

Update app.py

Updated to use vLLM and improved how the model's output is parsed

Files changed (1)
  1. app.py +314 -92
app.py CHANGED
@@ -1,114 +1,334 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+import re
+from vllm import LLM, SamplingParams
 import spaces
 
-class VibeThinker:
+class VibeThinkerVLLM:
     def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
         self.model_path = model_path
-        print("Loading model... This may take a minute.")
+        print("Loading model with vLLM... This may take a minute.")
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
+        self.model = LLM(
+            model=self.model_path,
+            dtype="bfloat16",
+            gpu_memory_utilization=0.9,
+            max_model_len=40960,  # Support full context length
             trust_remote_code=True
         )
 
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.model_path,
-            trust_remote_code=True
-        )
+        print(f"Model loaded successfully with vLLM!")
+
+    @spaces.GPU
+    def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
+        """Generate response with vLLM for faster inference"""
 
-        print(f"Model loaded successfully!")
-        print(f"Using device: {self.model.device}")
-        if torch.cuda.is_available():
-            print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-
-    @spaces.GPU  # This decorator allocates GPU when function is called (for ZeroGPU spaces)
-    def infer_text(self, prompt, temperature=0.6, max_tokens=40960, top_p=0.95):
-        """
-        Generate response for a given prompt
-
-        Args:
-            prompt: The input question (preferably in English)
-            temperature: Controls randomness (0.6 or 1.0 recommended)
-            max_tokens: Maximum tokens to generate
-            top_p: Nucleus sampling parameter
-        """
         messages = [
             {"role": "user", "content": prompt}
         ]
 
-        text = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-
-        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
-
-        generation_config = dict(
-            max_new_tokens=max_tokens,
-            do_sample=True,
+        sampling_params = SamplingParams(
             temperature=temperature,
+            max_tokens=max_tokens,
             top_p=top_p,
-            top_k=None  # Set to -1 in vLLM/SGLang
+            top_k=-1,  # Disable top_k sampling
         )
 
-        print(f"Generating response with temperature={temperature}, max_tokens={max_tokens}...")
+        print(f"Generating with vLLM (temp={temperature}, max_tokens={max_tokens})...")
 
-        generated_ids = self.model.generate(
-            **model_inputs,
-            generation_config=GenerationConfig(**generation_config)
-        )
-
-        generated_ids = [
-            output_ids[len(input_ids):]
-            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
+        outputs = self.model.chat(messages, sampling_params=sampling_params)
+        response = outputs[0].outputs[0].text
 
-        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        return response
+        return response
+
+
+def parse_model_output(text):
+    """
+    Parse model output into structured components:
+    - Thinking sections (within <think> tags)
+    - Regular text (chat messages)
+    - Code blocks (within ``` or <code> tags)
+    """
+
+    sections = []
+
+    # Split by <think> tags
+    think_pattern = r'<think>(.*?)</think>'
+    code_pattern = r'```(\w+)?\n(.*?)```'
+
+    # Extract thinking sections
+    think_matches = list(re.finditer(think_pattern, text, re.DOTALL))
+
+    # Track positions
+    last_pos = 0
+
+    for match in think_matches:
+        # Add text before thinking section
+        before_text = text[last_pos:match.start()].strip()
+        if before_text:
+            # Check for code blocks in this text
+            code_blocks = list(re.finditer(code_pattern, before_text, re.DOTALL))
+
+            if code_blocks:
+                # Process text with code blocks
+                text_pos = 0
+                for code_match in code_blocks:
+                    # Add text before code
+                    pre_code_text = before_text[text_pos:code_match.start()].strip()
+                    if pre_code_text:
+                        sections.append({
+                            'type': 'text',
+                            'content': pre_code_text
+                        })
+
+                    # Add code block
+                    language = code_match.group(1) or 'plaintext'
+                    code_content = code_match.group(2).strip()
+                    sections.append({
+                        'type': 'code',
+                        'language': language,
+                        'content': code_content
+                    })
+
+                    text_pos = code_match.end()
+
+                # Add remaining text after last code block
+                remaining_text = before_text[text_pos:].strip()
+                if remaining_text:
+                    sections.append({
+                        'type': 'text',
+                        'content': remaining_text
+                    })
+            else:
+                sections.append({
+                    'type': 'text',
+                    'content': before_text
+                })
+
+        # Add thinking section
+        think_content = match.group(1).strip()
+        sections.append({
+            'type': 'thinking',
+            'content': think_content
+        })
+
+        last_pos = match.end()
+
+    # Add remaining text after last thinking section
+    remaining = text[last_pos:].strip()
+    if remaining:
+        # Check for code blocks
+        code_blocks = list(re.finditer(code_pattern, remaining, re.DOTALL))
+
+        if code_blocks:
+            text_pos = 0
+            for code_match in code_blocks:
+                pre_code_text = remaining[text_pos:code_match.start()].strip()
+                if pre_code_text:
+                    sections.append({
+                        'type': 'text',
+                        'content': pre_code_text
+                    })
+
+                language = code_match.group(1) or 'plaintext'
+                code_content = code_match.group(2).strip()
+                sections.append({
+                    'type': 'code',
+                    'language': language,
+                    'content': code_content
+                })
+
+                text_pos = code_match.end()
+
+            remaining_text = remaining[text_pos:].strip()
+            if remaining_text:
+                sections.append({
+                    'type': 'text',
+                    'content': remaining_text
+                })
+        else:
+            sections.append({
+                'type': 'text',
+                'content': remaining
+            })
+
+    return sections
+
+
+def format_output_for_display(sections):
+    """
+    Format parsed sections into a rich HTML display with:
+    - Collapsible thinking sections
+    - Syntax-highlighted code blocks
+    - Clean text rendering
+    """
+
+    html_parts = []
+
+    for i, section in enumerate(sections):
+        if section['type'] == 'thinking':
+            # Collapsible thinking section
+            html_parts.append(f"""
+            <details class="thinking-section" style="margin: 15px 0; border: 2px solid #f39c12; border-radius: 8px; background-color: #fff9e6;">
+                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #d68910; user-select: none;">
+                    🤔 Thinking Process (Click to expand)
+                </summary>
+                <div style="padding: 15px; border-top: 1px solid #f39c12; background-color: #fffef7; white-space: pre-wrap; font-family: 'Courier New', monospace; font-size: 13px; color: #333; line-height: 1.6;">
+                    {section['content']}
+                </div>
+            </details>
+            """)
+
+        elif section['type'] == 'code':
+            # Code block with copy/download buttons
+            code_id = f"code-{i}"
+            html_parts.append(f"""
+            <details class="code-section" open style="margin: 15px 0; border: 2px solid #3498db; border-radius: 8px; background-color: #e8f4fd;">
+                <summary style="padding: 12px; cursor: pointer; font-weight: bold; color: #2874a6; user-select: none;">
+                    💻 Code ({section['language']}) - Click to collapse
+                </summary>
+                <div style="position: relative; padding: 0;">
+                    <div style="position: absolute; top: 10px; right: 10px; z-index: 10;">
+                        <button onclick="copyCode('{code_id}')" style="padding: 6px 12px; margin-right: 5px; background-color: #3498db; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
+                            📋 Copy
+                        </button>
+                        <button onclick="downloadCode('{code_id}', '{section['language']}')" style="padding: 6px 12px; background-color: #27ae60; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 12px;">
+                            ⬇️ Download
+                        </button>
+                    </div>
+                    <pre id="{code_id}" style="margin: 0; padding: 40px 15px 15px 15px; background-color: #f8f9fa; border-top: 1px solid #3498db; overflow-x: auto; font-family: 'Courier New', monospace; font-size: 13px; line-height: 1.5;"><code class="language-{section['language']}">{section['content']}</code></pre>
+                </div>
+            </details>
+            """)
+
+        else:  # text
+            # Regular text output
+            html_parts.append(f"""
+            <div class="text-section" style="margin: 15px 0; padding: 15px; border: 1px solid #bdc3c7; border-radius: 8px; background-color: #ffffff; white-space: pre-wrap; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; font-size: 14px; line-height: 1.8; color: #2c3e50;">
+                {section['content']}
+            </div>
+            """)
+
+    # Add JavaScript for copy and download functionality
+    js_code = """
+    <script>
+    function copyCode(elementId) {
+        const codeElement = document.getElementById(elementId);
+        const code = codeElement.textContent;
+        navigator.clipboard.writeText(code).then(() => {
+            alert('Code copied to clipboard!');
+        }).catch(err => {
+            console.error('Failed to copy:', err);
+        });
+    }
+
+    function downloadCode(elementId, language) {
+        const codeElement = document.getElementById(elementId);
+        const code = codeElement.textContent;
+
+        // Determine file extension
+        const extensions = {
+            'python': 'py',
+            'javascript': 'js',
+            'typescript': 'ts',
+            'html': 'html',
+            'css': 'css',
+            'java': 'java',
+            'cpp': 'cpp',
+            'c': 'c',
+            'ruby': 'rb',
+            'go': 'go',
+            'rust': 'rs',
+            'swift': 'swift',
+            'kotlin': 'kt',
+            'plaintext': 'txt'
+        };
+
+        const ext = extensions[language.toLowerCase()] || 'txt';
+        const filename = `code_snippet.${ext}`;
+
+        // Create blob and download
+        const blob = new Blob([code], { type: 'text/plain' });
+        const url = window.URL.createObjectURL(blob);
+        const a = document.createElement('a');
+        a.href = url;
+        a.download = filename;
+        document.body.appendChild(a);
+        a.click();
+        document.body.removeChild(a);
+        window.URL.revokeObjectURL(url);
+    }
+    </script>
+    """
+
+    return js_code + "\n".join(html_parts)
 
 
 # Initialize model
-print("Initializing VibeThinker-1.5B...")
-model = VibeThinker()
+print("Initializing VibeThinker-1.5B with vLLM...")
+model = VibeThinkerVLLM()
 
 # Create Gradio interface
 def generate_response(prompt, temperature, max_tokens, top_p):
     if not prompt.strip():
-        return "Please enter a question."
+        return "<p style='color: red;'>Please enter a question.</p>"
 
     try:
-        response = model.infer_text(
+        # Generate raw response
+        raw_response = model.infer_text(
             prompt=prompt,
             temperature=temperature,
             max_tokens=max_tokens,
             top_p=top_p
         )
-        return response
+
+        # Parse and format the response
+        sections = parse_model_output(raw_response)
+        formatted_html = format_output_for_display(sections)
+
+        return formatted_html
+
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"<p style='color: red;'><strong>Error:</strong> {str(e)}</p>"
 
 
+# Custom CSS for better styling
+custom_css = """
+.thinking-section summary:hover {
+    background-color: #fef5e7;
+}
+
+.code-section summary:hover {
+    background-color: #d6eaf8;
+}
+
+.text-section {
+    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+}
+
+details[open] summary {
+    border-bottom: 1px solid #ddd;
+    margin-bottom: 10px;
+}
+
+/* Syntax highlighting enhancements */
+code {
+    font-family: 'Fira Code', 'Courier New', monospace;
+}
+"""
+
 # Gradio UI
-with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
+with gr.Blocks(title="VibeThinker-1.5B Advanced", css=custom_css) as demo:
     gr.Markdown("""
-    # 🧠 VibeThinker-1.5B: Reasoning Model
+    # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
 
-    **Optimized for**: Competitive math problems and algorithm coding challenges
+    **Optimized with vLLM** for 10-20x faster inference! ⚡
 
-    **Note**: This model works best with questions in English. It's specifically trained for
-    mathematical reasoning and competitive programming tasks.
+    **Features**:
+    - 🤔 **Collapsible Thinking Sections**: See the model's reasoning process
+    - 💻 **Interactive Code Blocks**: Copy or download code snippets
+    - 📝 **Clean Text Display**: Easy-to-read formatted responses
 
-    ### Example Prompts:
-    - "Solve: Find all solutions to x^3 - 3x^2 + 4 = 0"
-    - "Write a Python function to find the longest palindromic substring in O(n^2) time"
-    - "Prove that the sum of angles in a triangle equals 180 degrees"
+    **Best for**: Competitive math problems and algorithm coding challenges
 
     [GitHub](https://github.com/WeiboAI/VibeThinker) | [HuggingFace Model](https://huggingface.co/WeiboAI/VibeThinker-1.5B) | [Paper](https://huggingface.co/papers/2511.06221)
     """)
@@ -117,11 +337,11 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
     with gr.Column(scale=1):
         prompt_input = gr.Textbox(
             label="Your Question",
-            placeholder="Enter your math problem or coding challenge here (in English)...",
-            lines=5
+            placeholder="Ask a math problem or coding challenge (in English)...",
+            lines=6
         )
 
-        with gr.Accordion("Advanced Settings", open=False):
+        with gr.Accordion("⚙️ Advanced Settings", open=False):
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=1.5,
@@ -135,7 +355,7 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
                maximum=40960,
                value=8192,
                step=512,
-                label="Max Tokens (model supports up to 40,960)"
+                label="Max Tokens"
            )
 
            top_p_slider = gr.Slider(
@@ -146,53 +366,55 @@ with gr.Blocks(title="VibeThinker-1.5B Math & Code Reasoning") as demo:
                label="Top P"
            )
 
-        submit_btn = gr.Button("🚀 Generate Solution", variant="primary")
-        clear_btn = gr.Button("🗑️ Clear")
+        submit_btn = gr.Button("🚀 Generate Solution", variant="primary", size="lg")
+        clear_btn = gr.Button("🗑️ Clear", size="sm")
 
     with gr.Column(scale=1):
-        output_text = gr.Textbox(
+        output_html = gr.HTML(
            label="Model Response",
-            lines=20,
-            show_copy_button=True
+            value="<p style='color: #7f8c8d; text-align: center; padding: 40px;'>Your response will appear here...</p>"
        )
 
    # Example questions
    gr.Examples(
        examples=[
-            ["Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 8192, 0.95],
-            ["Write an efficient algorithm to solve the 0-1 knapsack problem using dynamic programming.", 0.6, 8192, 0.95],
-            ["Prove that √2 is irrational using proof by contradiction.", 0.6, 8192, 0.95],
-            ["A tank can be filled by pipe A in 3 hours and pipe B in 5 hours. If both pipes are opened together, how long will it take to fill the tank?", 0.6, 8192, 0.95],
+            ["Make me a single page html application that takes a color and outputs a color theme based on that color", 0.6, 16384, 0.95],
+            ["Solve this AIME problem: Find the number of positive integers n ≤ 1000 such that n^2 + n + 41 is prime.", 0.6, 12288, 0.95],
+            ["Write a Python function to implement the Euclidean algorithm for finding GCD, then optimize it.", 0.6, 8192, 0.95],
+            ["Prove that the sum of the first n odd numbers equals n^2 using mathematical induction.", 0.6, 8192, 0.95],
        ],
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        label="Example Problems"
+        label="📚 Example Problems"
    )
 
    # Event handlers
    submit_btn.click(
        fn=generate_response,
        inputs=[prompt_input, temperature_slider, max_tokens_slider, top_p_slider],
-        outputs=output_text
+        outputs=output_html
    )
 
    clear_btn.click(
-        fn=lambda: ("", ""),
+        fn=lambda: ("", "<p style='color: #7f8c8d; text-align: center; padding: 40px;'>Your response will appear here...</p>"),
        inputs=[],
-        outputs=[prompt_input, output_text]
+        outputs=[prompt_input, output_html]
    )
 
    gr.Markdown("""
    ---
-    ### 📊 Model Performance Highlights:
-    - **AIME24**: 80.3 (vs DeepSeek R1: 79.8)
-    - **AIME25**: 74.4 (vs DeepSeek R1: 70.0)
-    - **LiveCodeBench v6**: 51.1
-    - **Parameters**: Only 1.5B (400x smaller than DeepSeek R1!)
+    ### 📊 Performance Comparison:
+
+    | Metric | VibeThinker-1.5B | DeepSeek R1 (671B) | Size Ratio |
+    |--------|------------------|--------------------|------------|
+    | AIME24 | **80.3** | 79.8 | **400× smaller** |
+    | AIME25 | **74.4** | 70.0 | **400× smaller** |
+    | HMMT25 | **50.4** | 41.7 | **400× smaller** |
+    | Training Cost | **$7,800** | $294,000+ | **40× cheaper** |
 
-    **Training Cost**: $7,800 USD | **License**: MIT
+    🚀 **Powered by vLLM** for ultra-fast inference on T4 GPUs
    """)
 
 # Launch the app
 if __name__ == "__main__":
-    demo.queue()  # Enable queuing for better UX
-    demo.launch()
+    demo.queue(max_size=20)
+    demo.launch(share=False)
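
The new `parse_model_output` helper can be smoke-tested without starting the Space. A minimal sketch, assuming the helper is copied into a standalone module (the `parsing` module name and the sample completion below are hypothetical; importing app.py directly would trigger the module-level model load):

# Minimal smoke test for parse_model_output (hypothetical setup: the helper is
# copied into a standalone parsing.py so importing it does not load the model).
from parsing import parse_model_output

# A made-up completion shaped like VibeThinker output: a <think> block,
# some prose, and a fenced code block.
sample = (
    "<think>Try small cases first.</think>\n"
    "The answer is n = 2.\n"
    "```python\nprint(2)\n```"
)

for section in parse_model_output(sample):
    # Expected section types, in order: 'thinking', 'text', 'code'
    print(section["type"], "->", section["content"])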