import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread, Event
import re
import time
import html

# --- Configuration ---
MODEL_ID = "WeiboAI/VibeThinker-1.5B"


class VibeThinkerModel:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.stop_signal = Event()

    def load_model(self):
        if self.model is not None:
            return
        print(f"🔄 Loading {MODEL_ID}...")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        print("✅ Model loaded.")

    def stop_generation(self):
        # Signals the streaming loop below to stop consuming tokens. Note that
        # the background generate() call itself runs to completion unless a
        # StoppingCriteria is also wired in.
        self.stop_signal.set()

    def _detect_tail_loop(self, text, min_phrase_len=3, max_phrase_len=10, threshold=20):
        """
        Detects whether the generator has gotten stuck in a loop at the END of
        the text. Criterion: a phrase of 3-10 words repeated at least 20 times
        consecutively.
        """
        words = text.split()
        total_words = len(words)

        # We need at least (min_phrase_len * threshold) words to even check
        if total_words < min_phrase_len * threshold:
            return False

        # Only check the end of the string (optimization):
        # we look at the last (max_phrase_len * threshold) words
        check_window = max_phrase_len * threshold
        recent_words = words[-check_window:] if total_words > check_window else words

        for phrase_len in range(min_phrase_len, max_phrase_len + 1):
            # The candidate phrase is the very last `phrase_len` words
            candidate_phrase = recent_words[-phrase_len:]

            # Construct what the tail SHOULD look like if it is looping,
            # e.g. if the phrase is "and then", we expect "and then and then ..."
            # We check whether the tail matches (phrase * threshold).
            required_len = phrase_len * threshold
            if len(recent_words) < required_len:
                continue

            segment_to_check = recent_words[-required_len:]

            # Efficient check: does the segment consist ONLY of the candidate
            # phrase, repeated `threshold` times?
            expected_segment = candidate_phrase * threshold
            if segment_to_check == expected_segment:
                return True

        return False

    def generate_response_streaming(self, prompt, temperature=0.6, max_new_tokens=32000):
        if not self.model:
            self.load_model()

        self.stop_signal.clear()

        try:
            start_time = time.time()

            # Optimized prompt for VibeThinker
            messages = [
                {"role": "system", "content": "You are an expert algorithm engineer. "
                                              "Analyze the problem deeply, then provide a clean Python solution."},
                {"role": "user", "content": prompt},
            ]
            text_input = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            inputs = self.tokenizer(text_input, return_tensors="pt").to(self.device)

            streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

            generation_kwargs = dict(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=0.95,
                top_k=50,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                streamer=streamer,
            )

            # generate() blocks, so it runs in a worker thread while we
            # consume the streamer on this one
            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
            thread.start()

            generated_text = ""
            token_count = 0  # counter used to throttle the loop check

            for new_text in streamer:
                if self.stop_signal.is_set():
                    break

                generated_text += new_text
                token_count += 1

                # Check for loops every 10 chunks to save CPU
                if token_count % 10 == 0 and self._detect_tail_loop(generated_text):
                    self.stop_signal.set()  # stop the model
                    # Simple truncation notice for UI cleanliness
                    generated_text += "\n\n[⚠️ Generation stopped: infinite loop detected]"
                    break

                yield generated_text, {
                    "time": time.time() - start_time,
                    "tokens": len(self.tokenizer.encode(generated_text)),
                    "generating": True,
                }

            if not self.stop_signal.is_set():
                thread.join()

            yield generated_text, {
                "time": time.time() - start_time,
                "tokens": len(self.tokenizer.encode(generated_text)),
                "generating": False,
            }

        except Exception as e:
            yield f"Error: {str(e)}", None


vibe_model = VibeThinkerModel()
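
# Illustrative sanity check for the tail-loop heuristic (not part of the app;
# the function name and inputs are made up for demonstration):
def _demo_loop_detection():
    m = VibeThinkerModel()
    # A 3-word phrase repeated 40 times -> well past threshold, flagged
    assert m._detect_tail_loop("the answer is " * 40)
    # Too short to even qualify for the check -> not flagged
    assert not m._detect_tail_loop("a perfectly ordinary sentence")
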
class ModernUIParser:
    """Parses model output into a structured, modern UI."""

    def format_code(self, code, lang="python"):
        """Applies basic HTML syntax highlighting via regex passes."""
        code = html.escape(code)
        # NOTE: the original replacement markup was lost in this copy; the
        # inline-styled <span> wrappers (and their colors) below are a
        # reconstruction. Single-quoted attributes keep the inserted markup
        # from matching the later keyword/string passes.
        # Comments (the (?<!&) guard avoids matching the '#' inside HTML
        # entities such as &#x27; produced by html.escape)
        code = re.sub(r"(?<!&)(#.*?)(?=\n|$)", r"<span style='color: #6a737d'>\1</span>", code)
        # Keywords
        keywords = r'\b(def|class|return|if|else|elif|for|while|import|from|try|except|with|as|pass|None|True|False)\b'
        code = re.sub(keywords, r"<span style='color: #d73a49'>\1</span>", code)
        # Builtins/calls
        code = re.sub(r'\b(print|len|range|enumerate|zip|super|__init__)\b',
                      r"<span style='color: #005cc5'>\1</span>", code)
        # Strings: html.escape has already turned '"' into &quot; and "'"
        # into &#x27;, so we must match the escaped forms
        code = re.sub(r'(&quot;.*?&quot;)', r"<span style='color: #032f62'>\1</span>", code)
        code = re.sub(r'(&#x27;.*?&#x27;)', r"<span style='color: #032f62'>\1</span>", code)
        return code
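
    # Illustration of a single pass (markup reconstructed, colors arbitrary):
    #   format_code('x = len(s)  # size')
    #   -> "x = <span style='color: #005cc5'>len</span>(s)  "
    #      "<span style='color: #6a737d'># size</span>"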
    def parse_and_render(self, text, stats):
        # 1. Separate thinking from content.
        # Heuristic: content before the first code block or an explicit
        # "Solution" header is usually the model's thinking.
        thinking = ""
        solution = text

        # Find the split point: the earliest marker wins
        markers = ["```", "Here is the solution", "### Solution", "Implementation:"]
        first_marker_idx = len(text)
        for m in markers:
            idx = text.find(m)
            if idx != -1 and idx < first_marker_idx:
                first_marker_idx = idx

        # Only split if a marker was found and enough preamble exists (> 50 chars)
        if 50 < first_marker_idx < len(text):
            thinking = text[:first_marker_idx].strip()
            solution = text[first_marker_idx:].strip()

        # 2. Process solution text (Markdown-ish to HTML), code blocks first
        parts = re.split(r'(```\w*\n.*?```)', solution, flags=re.DOTALL)
        solution_html = ""
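
        # Because the fence pattern is a capturing group, re.split keeps the
        # matched blocks in the result; e.g. "intro\n```python\nx=1\n```\noutro"
        # becomes ["intro\n", "```python\nx=1\n```", "\noutro"], so prose and
        # code alternate and the loop below dispatches on the ``` prefix.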
        for part in parts:
            if part.startswith('```'):
                # Extract language and code
                match = re.match(r'```(\w*)\n(.*?)```', part, re.DOTALL)
                if match:
                    lang = match.group(1) or "text"
                    code_content = match.group(2)
                    highlighted = self.format_code(code_content, lang)
                    # NOTE: the original card template was lost in this copy;
                    # this is a minimal reconstruction with a language badge.
                    solution_html += f"""
<div class="code-block">
  <div class="code-lang">{lang}</div>
  <pre><code>{highlighted}</code></pre>
</div>"""
                else:
                    # Malformed fence: render it escaped, as plain text
                    solution_html += f"<pre>{html.escape(part)}</pre>"
            else:
                # Normal text processing
                clean_text = html.escape(part)
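                # Escaping first means model output cannot inject raw HTML;
                # the substitutions below reintroduce only whitelisted tags.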
                # Headers
                clean_text = re.sub(r'^### (.*?)$', r'<h3>\1</h3>', clean_text, flags=re.MULTILINE)
                # NOTE: the remainder of this function was truncated in this
                # copy; the <h3> replacement above and the minimal inline
                # formatting and return below are reconstructions (`stats` was
                # presumably consumed by the lost portion and is unused here).
                clean_text = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', clean_text)
                clean_text = re.sub(r'`([^`]+)`', r'<code>\1</code>', clean_text)
                solution_html += f"<p>{clean_text}</p>"

        return thinking, solution_html
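

# --- UI wiring ---
# NOTE: the original Gradio layout was truncated in this copy; only the
# subtitle text "Specialized 1.5B Model for Algorithms & Competitive Coding"
# survives. The Blocks sketch below is a minimal, assumed wiring of the
# pieces above; component names and layout are guesses.
parser = ModernUIParser()


def run(prompt, temperature):
    for text, stats in vibe_model.generate_response_streaming(prompt, temperature=temperature):
        if stats is None:  # error path yields (message, None)
            yield "", text
            continue
        thinking, solution_html = parser.parse_and_render(text, stats)
        yield thinking, solution_html


with gr.Blocks(title="VibeThinker-1.5B") as demo:
    gr.Markdown("# VibeThinker-1.5B\nSpecialized 1.5B Model for Algorithms & Competitive Coding")
    prompt_box = gr.Textbox(label="Problem statement", lines=6)
    temperature = gr.Slider(0.1, 1.2, value=0.6, step=0.05, label="Temperature")
    with gr.Row():
        go_btn = gr.Button("Generate", variant="primary")
        stop_btn = gr.Button("Stop")
    thinking_box = gr.Textbox(label="Reasoning", lines=10)
    solution_out = gr.HTML()
    go_btn.click(run, [prompt_box, temperature], [thinking_box, solution_out])
    stop_btn.click(lambda: vibe_model.stop_generation(), None, None)

if __name__ == "__main__":
    demo.launch()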