Keeby-smilyai committed
Commit 579190c · verified · 1 Parent(s): c842762

Update app.py

Files changed (1)
  1. app.py +152 -468
app.py CHANGED
@@ -11,10 +11,10 @@ import time
11
  # ============================================================================
12
  # 🎊 FESTIVE MODE TOGGLE 🎊
13
  # ============================================================================
14
- FESTIVE = True # Set to False for production-only mode
15
 
16
  # ============================================================================
17
- # Configuration & Model Loading (Architecture definitions included)
18
  # ============================================================================
19
 
20
  print("πŸš€ Loading Sam-large-2 Model...")
@@ -39,14 +39,11 @@ class RotaryEmbedding(keras.layers.Layer):
39
  super().build(input_shape)
40
 
41
  def _build_cache(self):
42
- """Build RoPE cache on first forward pass"""
43
  if not self.built_cache:
44
  inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
45
  t = tf.range(self.max_len, dtype=tf.float32)
46
  freqs = tf.einsum("i,j->ij", t, inv_freq)
47
  emb = tf.concat([freqs, freqs], axis=-1)
48
-
49
- # Store as constant tensors
50
  self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
51
  self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
52
  self.built_cache = True
@@ -57,16 +54,11 @@ class RotaryEmbedding(keras.layers.Layer):
57
 
58
  def call(self, q, k):
59
  self._build_cache()
60
-
61
  seq_len = tf.shape(q)[2]
62
  dtype = q.dtype
63
  cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
64
  sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
65
-
66
- q_rotated = (q * cos) + (self.rotate_half(q) * sin)
67
- k_rotated = (k * cos) + (self.rotate_half(k) * sin)
68
-
69
- return q_rotated, k_rotated
70
 
71
  def get_config(self):
72
  config = super().get_config()
@@ -108,65 +100,39 @@ class TransformerBlock(keras.layers.Layer):
108
 
109
  self.pre_attn_norm = RMSNorm()
110
  self.pre_ffn_norm = RMSNorm()
111
-
112
  self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
113
  self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
114
  self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
115
  self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
116
-
117
  self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
118
-
119
  self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
120
  self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
121
  self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
122
-
123
  self.dropout = keras.layers.Dropout(dropout)
124
 
125
  def call(self, x, training=None):
126
  B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
127
  dtype = x.dtype
128
-
129
- # Attention
130
  res = x
131
  y = self.pre_attn_norm(x)
132
-
133
  q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
134
  k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
135
  v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
136
-
137
  q, k = self.rope(q, k)
138
-
139
  scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
140
-
141
- mask = tf.where(
142
- tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
143
- tf.constant(-1e9, dtype=dtype),
144
- tf.constant(0.0, dtype=dtype)
145
- )
146
  scores += mask
147
  attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
148
-
149
  attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
150
  x = res + self.dropout(self.out_proj(attn), training=training)
151
-
152
- # FFN (SwiGLU)
153
  res = x
154
  y = self.pre_ffn_norm(x)
155
  ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
156
-
157
  return res + self.dropout(ffn, training=training)
158
 
159
  def get_config(self):
160
  config = super().get_config()
161
- config.update({
162
- "d_model": self.d_model,
163
- "n_heads": self.n_heads,
164
- "ff_dim": self.ff_dim,
165
- "dropout": self.dropout_rate,
166
- "max_len": self.max_len,
167
- "rope_theta": self.rope_theta,
168
- "layer_idx": self.layer_idx
169
- })
170
  return config
171
 
172
 
@@ -182,31 +148,19 @@ class SAM1Model(keras.Model):
182
  self.cfg = kwargs.get('cfg', kwargs)
183
 
184
  self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
185
-
186
  ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
187
- block_args = {
188
- 'd_model': self.cfg['d_model'],
189
- 'n_heads': self.cfg['n_heads'],
190
- 'ff_dim': ff_dim,
191
- 'dropout': self.cfg['dropout'],
192
- 'max_len': self.cfg['max_len'],
193
- 'rope_theta': self.cfg['rope_theta']
194
- }
195
-
196
  self.blocks = []
197
  for i in range(self.cfg['n_layers']):
198
  block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
199
  self.blocks.append(block)
200
-
201
  self.norm = RMSNorm(name="final_norm")
202
  self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
203
 
204
  def call(self, input_ids, training=None):
205
  x = self.embed(input_ids)
206
-
207
  for block in self.blocks:
208
  x = block(x, training=training)
209
-
210
  return self.lm_head(self.norm(x))
211
 
212
  def get_config(self):
@@ -216,10 +170,8 @@ class SAM1Model(keras.Model):
216
 
217
  # --- Model and Tokenizer Loading ---
218
 
219
- # Download model files
220
  config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
221
 
222
- # Try to download checkpoint weights first (more reliable)
223
  try:
224
  weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
225
  print("βœ… Found checkpoint weights (ckpt.weights.h5)")
@@ -231,14 +183,10 @@ except Exception as e:
231
  use_checkpoint = False
232
  except Exception as e_model:
233
  print(f"❌ Also failed to find model.keras: {e_model}")
234
- # Commenting out raise to allow the Gradio UI to load even if model fails
235
- # raise
236
 
237
- # Load config
238
  with open(config_path, 'r') as f:
239
  config = json.load(f)
240
 
241
- # Create tokenizer from scratch
242
  from transformers import AutoTokenizer
243
 
244
  hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
@@ -249,19 +197,14 @@ hf_tokenizer.save_pretrained("./temp_tokenizer")
249
  tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
250
 
251
  print(f"βœ… Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
252
-
253
  eos_token_id = config.get('eos_token_id', 50256)
254
 
255
- # ==============================================================================
256
- # Load Model - Priority: checkpoint weights > saved model
257
- # ==============================================================================
258
  print("\nπŸ”„ Loading model...")
259
 
260
  model = None
261
 
262
  if use_checkpoint:
263
  print("πŸ“¦ Building model from config and loading checkpoint weights...")
264
-
265
  model_config = {
266
  'vocab_size': config['vocab_size'],
267
  'd_model': config['hidden_size'],
@@ -272,53 +215,37 @@ if use_checkpoint:
272
  'dropout': 0.1,
273
  'rope_theta': config['rope_theta']
274
  }
275
-
276
  model = SAM1Model(config=model_config)
277
-
278
- # Dummy call to build the model graph
279
  dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
280
  _ = model(dummy_input, training=False)
281
-
282
  print(f"βœ… Model architecture built: {model.count_params():,} parameters")
283
-
284
  try:
285
  model.load_weights(weights_path)
286
  print("βœ… Checkpoint weights loaded successfully!")
287
  except Exception as e:
288
  print(f"❌ Failed to load checkpoint weights: {e}")
289
- # Continue with un-initialized model, which will likely fail on inference
290
  else:
291
  print("πŸ“¦ Loading full saved model...")
292
  try:
293
- # Custom objects needed for loading
294
- custom_objects = {
295
- 'SAM1Model': SAM1Model,
296
- 'TransformerBlock': TransformerBlock,
297
- 'RMSNorm': RMSNorm,
298
- 'RotaryEmbedding': RotaryEmbedding
299
- }
300
  model = keras.models.load_model(model_path, compile=False, custom_objects=custom_objects)
301
  print("βœ… Model loaded successfully")
302
  except Exception as e:
303
  print(f"❌ Failed to load model: {e}")
304
- # Commenting out raise to allow the Gradio UI to load even if model fails
305
- # raise
306
 
307
  if model:
308
  print(f"βœ… Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
309
 
310
- # Global stop flag
311
- stop_generation = False
312
-
313
  # ============================================================================
314
- # Generation Function with Streaming & Stop Button
315
  # ============================================================================
316
 
317
- # Dummy/Simulated generation logic for safety when running without full TF environment
318
- @tf.function(jit_compile=True)
319
- def generate_step(input_ids, max_len, temp, topk, topp, rep_pen):
320
- # This is a placeholder for the actual model call
321
- return tf.constant([50256], dtype=tf.int32), tf.constant(0.9, dtype=tf.float32)
 
322
 
323
  def generate_stream(
324
  prompt: str,
@@ -328,57 +255,88 @@ def generate_stream(
328
  top_p: float = 0.9,
329
  repetition_penalty: float = 1.1
330
  ):
331
- """Generate text with streaming output and stop support"""
332
  global stop_generation
333
  stop_generation = False
334
 
 
335
  prompt_ids = tokenizer.encode(prompt).ids
336
  input_ids = [i for i in prompt_ids if i != eos_token_id]
337
 
 
338
  generated_text = ""
339
  token_count = 0
340
- start_time = time.time()
341
 
342
- # Simple fixed token sequence for stable demonstration
343
- fixed_demo_tokens = [
344
- tokenizer.token_to_id("Hello"),
345
- tokenizer.token_to_id(" world"),
346
- tokenizer.token_to_id("."),
347
- tokenizer.token_to_id(" I"),
348
- tokenizer.token_to_id(" am"),
349
- tokenizer.token_to_id(" Sam"),
350
- tokenizer.token_to_id("-"),
351
- tokenizer.token_to_id("large"),
352
- tokenizer.token_to_id("-"),
353
- tokenizer.token_to_id("2")
354
- ]
355
 
356
- for i in range(max_tokens):
 
357
  if stop_generation:
 
358
  break
359
 
360
- # SIMULATION: Use fixed tokens
361
- if i < len(fixed_demo_tokens):
362
- next_token_id_val = fixed_demo_tokens[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  else:
364
- next_token_id_val = eos_token_id
 
365
 
366
- if next_token_id_val == eos_token_id or next_token_id_val == tokenizer.token_to_id("<|im_end|>") or next_token_id_val == tokenizer.token_to_id("<im end for model tun>"):
 
 
 
367
  break
 
 
 
368
 
369
- input_ids.append(next_token_id_val)
 
370
  token_count += 1
371
 
372
- try:
373
- generated_text = tokenizer.decode(input_ids[len(prompt_ids):], skip_special_tokens=False)
374
- except Exception:
375
- pass
376
 
377
- # Add a pause to simulate streaming speed
378
- time.sleep(0.02)
379
 
380
- yield generated_text
381
-
 
 
382
  elapsed = time.time() - start_time
383
  tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
384
 
@@ -392,19 +350,16 @@ def generate_stream(
392
  # ============================================================================
393
 
394
  def format_chat_prompt(message: str, history: list, reasoning_enabled: bool) -> str:
395
- """Format message history into chat prompt and prepend <think> if enabled (Model turn)"""
396
  prompt = ""
397
-
398
- # Add history
399
  for user_msg, assistant_msg in history:
400
  prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
401
  if assistant_msg:
402
  prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
403
 
404
- # Add current message
405
  prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
406
 
407
- # Add <think> tag if enabled (Model Turn)
408
  if reasoning_enabled:
409
  prompt += "<think>"
410
 
@@ -420,7 +375,6 @@ def chat_stream(
420
  repetition_penalty: float,
421
  reasoning_enabled: bool
422
  ):
423
- """Streaming chat response"""
424
  if not message.strip():
425
  yield history
426
  return
@@ -428,21 +382,14 @@ def chat_stream(
428
  prompt = format_chat_prompt(message, history, reasoning_enabled)
429
  partial_response = ""
430
 
431
- # SIMULATION: If reasoning is enabled, prepend a simulated thought
432
- if reasoning_enabled:
433
- simulated_thought = (
434
- "Deciding the response requires an introduction and answering the user's implicit query. "
435
- "I will start with a friendly greeting and state my identity."
436
- )
437
- # Prepend the thought to the prompt for the generator to pick up
438
- prompt = prompt.replace("<think>", f"<think>{simulated_thought}</think>")
439
-
440
  for generated in generate_stream(
441
  prompt, max_tokens, temperature, top_k, top_p, repetition_penalty
442
  ):
443
  partial_response = generated
444
 
445
- # Robust End-of-Turn Detection Logic
446
  stop_tags = ["<|im_end|>", "<im end for model tun>"]
447
  earliest_stop = len(partial_response)
448
  should_stop = False
@@ -455,295 +402,119 @@ def chat_stream(
455
  if should_stop:
456
  partial_response = partial_response[:earliest_stop]
457
 
458
- # Post-process reasoning tags for display (collapsible)
459
  if reasoning_enabled:
460
- # Look for the simulated thought or any generated thought
461
  if '<think>' in partial_response and '</think>' in partial_response:
462
  start_idx = partial_response.find('<think>')
463
  end_idx = partial_response.find('</think>')
464
  if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
465
  thought_content = partial_response[start_idx + len('<think>'):end_idx].strip()
 
 
 
 
466
  details_html = (
467
  f'<details class="reasoning-block">'
468
  f'<summary>Model Reasoning (Click to show/hide)</summary>'
469
- f'<p>{thought_content.replace("\\n", "<br>")}</p>'
470
  f'</details>'
471
  )
472
  partial_response = partial_response[:start_idx] + details_html + partial_response[end_idx + len('</think>'):]
473
  elif start_idx != -1 and end_idx == -1:
474
- # If </think> is missing (i.e., generation stopped mid-thought)
475
- partial_response = partial_response.replace('<think>', '')
476
 
477
- # Update history
478
  yield history + [[message, partial_response.strip()]]
479
 
480
  def stop_gen():
481
- """Stop generation callback"""
482
  global stop_generation
483
  stop_generation = True
484
  return None
485
 
486
  # ============================================================================
487
- # Gradio UI & CSS (Modal and Styling)
488
  # ============================================================================
489
 
490
  custom_css = """
491
- .gradio-container {
492
- max-width: 1200px !important;
493
- margin: auto !important;
494
- }
495
-
496
  .header {
497
- text-align: center;
498
- padding: 2rem;
499
- background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
500
- color: white;
501
- border-radius: 12px;
502
- margin-bottom: 2rem;
503
- box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
504
  animation: pulse 2s ease-in-out infinite;
505
  }
506
-
507
- @keyframes pulse {
508
- 0%, 100% { transform: scale(1); }
509
- 50% { transform: scale(1.02); }
510
- }
511
-
512
- .header h1 {
513
- font-size: 2.8rem;
514
- margin-bottom: 0.5rem;
515
- font-weight: 700;
516
- text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
517
- }
518
-
519
- .header p {
520
- font-size: 1.1rem;
521
- opacity: 0.95;
522
- }
523
-
524
- .celebration {
525
- font-size: 2rem;
526
- margin: 0.5rem;
527
- animation: bounce 1s ease infinite;
528
- }
529
-
530
- @keyframes bounce {
531
- 0%, 100% { transform: translateY(0); }
532
- 50% { transform: translateY(-10px); }
533
- }
534
-
535
  .twin-badge {
536
- display: inline-block;
537
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
538
- color: white;
539
- padding: 0.5rem 1rem;
540
- border-radius: 20px;
541
- font-weight: bold;
542
- margin: 0.5rem;
543
  box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
544
  }
545
-
546
- footer {
547
- text-align: center;
548
- padding: 2rem;
549
- color: #666;
550
- border-top: 1px solid #eee;
551
- margin-top: 2rem;
552
- }
553
-
554
- /* Reasoning Toggle */
555
- #reasoning-control-group {
556
- position: relative;
557
- display: flex;
558
- align-items: center;
559
- justify-content: center;
560
- margin-right: 10px;
561
- }
562
-
563
  #reasoning-toggle-btn {
564
- font-size: 1.5rem;
565
- border-radius: 50%;
566
- width: 40px;
567
- height: 40px;
568
- padding: 0;
569
- min-width: 0 !important;
570
- line-height: 1;
571
- background-color: #ffcc00;
572
- border: 2px solid #e6b800;
573
  }
574
-
575
- #reasoning-toggle-btn.off {
576
- background-color: #e0e0e0;
577
- border: 2px solid #ccc;
578
- }
579
-
580
  .new-tag-red {
581
- display: inline-block;
582
- background-color: #f5576c;
583
- color: white;
584
- font-size: 0.7em;
585
- font-weight: bold;
586
- padding: 2px 5px;
587
- border-radius: 4px;
588
- line-height: 1;
589
- position: absolute;
590
- top: -5px;
591
- right: -5px;
592
- z-index: 10;
593
- animation: blink 1s infinite;
594
  }
595
-
596
- @keyframes blink {
597
- 0%, 100% { opacity: 1; }
598
- 50% { opacity: 0.5; }
599
- }
600
-
601
- /* Reasoning block styling inside chatbot */
602
  .gradio-html details.reasoning-block {
603
- border: 1px solid #ddd;
604
- border-left: 5px solid #667eea;
605
- padding: 5px 10px;
606
- margin: 10px 0;
607
- border-radius: 4px;
608
- background-color: #f9f9ff;
609
  }
610
-
611
- .gradio-html details.reasoning-block summary {
612
- font-weight: bold;
613
- cursor: pointer;
614
- outline: none;
615
- color: #667eea;
616
- }
617
-
618
- .gradio-html details.reasoning-block p {
619
- margin-top: 5px;
620
- padding-left: 10px;
621
- border-left: 1px dashed #ccc;
622
- white-space: pre-wrap;
623
- }
624
-
625
- /* --- Modal Styling --- */
626
  .modal-overlay {
627
- position: fixed;
628
- top: 0;
629
- left: 0;
630
- right: 0;
631
- bottom: 0;
632
- background: rgba(0, 0, 0, 0.7);
633
- display: flex;
634
- justify-content: center;
635
- align-items: center;
636
- z-index: 1000;
637
  }
638
-
639
  .modal-content {
640
- background: white;
641
- padding: 30px;
642
- border-radius: 15px;
643
- width: 90%;
644
- max-width: 900px;
645
- box-shadow: 0 10px 50px rgba(0, 0, 0, 0.5);
646
- animation: slide-in 0.5s ease-out;
647
- }
648
-
649
- @keyframes slide-in {
650
- from { transform: translateY(-50px); opacity: 0; }
651
- to { transform: translateY(0); opacity: 1; }
652
- }
653
-
654
- .modal-content h2 {
655
- color: #764ba2;
656
- border-bottom: 2px solid #eee;
657
- padding-bottom: 10px;
658
- margin-top: 0;
659
- }
660
-
661
- .comparison-box {
662
- display: flex;
663
- gap: 20px;
664
- margin-top: 20px;
665
- }
666
-
667
- .comparison-mode {
668
- flex: 1;
669
- padding: 15px;
670
- border-radius: 10px;
671
- }
672
-
673
- .mode-reasoning {
674
- border: 2px solid #667eea;
675
- background-color: #f6f7ff;
676
- }
677
-
678
- .mode-direct {
679
- border: 2px solid #fcb69f;
680
- background-color: #fffaf5;
681
- }
682
-
683
- .comparison-mode h3 {
684
- margin-top: 0;
685
- font-size: 1.3rem;
686
- }
687
-
688
- .comparison-mode pre {
689
- background-color: #eef;
690
- padding: 10px;
691
- border-radius: 5px;
692
- overflow-x: auto;
693
  }
694
-
 
 
 
 
 
 
 
695
  .close-btn {
696
- margin-top: 20px;
697
- padding: 10px 20px;
698
- background-color: #764ba2;
699
- color: white;
700
- border: none;
701
- border-radius: 8px;
702
- cursor: pointer;
703
- font-size: 1rem;
704
- transition: background-color 0.3s;
705
- }
706
-
707
- .close-btn:hover {
708
- background-color: #5d3a84;
709
  }
 
710
  """
711
 
712
  festive_css = custom_css
713
  custom_css = festive_css
714
 
715
- # Build interface
716
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
717
  reasoning_enabled = gr.State(False)
718
  modal_shown = gr.State(False)
719
 
720
- # --- The Welcome Modal HTML Component ---
721
  welcome_modal_html = gr.HTML(
722
  """
723
  <div id="welcome-modal" class="modal-overlay" style="display:none;">
724
  <div class="modal-content">
725
  <h2>🧠 Welcome to Sam-large-2: Dual-Mode Reasoning Demo</h2>
726
  <p>Our latest model, **Sam-large-2**, features **Chain-of-Thought (CoT)** functionality. You can toggle this feature using the 💡 button next to the input field.</p>
727
- <p>Here is how the two modes affect the output:</p>
728
  <div class="comparison-box">
729
  <div class="comparison-mode mode-reasoning">
730
  <h3>💡 Reasoning Mode (ON)</h3>
731
- <p>The model performs a **CoT step** first. The internal thought process is contained within the <code>&lt;think>...&lt;/think></code> tags (which are shown in a collapsible box).</p>
732
- <pre>
733
- &lt;think>
734
- 1. Identify the user's request.
735
- 2. Formulate a plan...
736
- &lt;/think>
737
- [Collapsible Box]
738
- This is the final, reasoned answer.
739
- </pre>
740
  </div>
741
  <div class="comparison-mode mode-direct">
742
  <h3>⚪ Direct Mode (OFF)</h3>
743
- <p>The model generates the final answer immediately, maximizing speed but potentially reducing accuracy for complex tasks.</p>
744
- <pre>
745
- This is the final, direct answer.
746
- </pre>
747
  </div>
748
  </div>
749
  <button class="close-btn" onclick="document.getElementById('welcome-modal').style.display='none'">Got it! Start Chatting</button>
@@ -752,37 +523,20 @@ This is the final, direct answer.
752
  """
753
  )
754
 
755
- # Header
756
  if FESTIVE:
757
  gr.HTML("""
758
  <div class="header">
759
  <div class="celebration">πŸŽ‰ 🎊 ✨ 🎈 πŸŽ†</div>
760
  <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
761
- alt="Sam-large-2"
762
- style="max-width: 400px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);">
763
  <h1>🤖 Sam-large-2 Chat 🤖</h1>
764
  <p><strong>LATEST RELEASE!</strong> Our **BEST Reasoning Model** - Full Chain-of-Thought!</p>
765
  <div class="twin-badge">Reasoning Model</div>
766
- <p style="font-size: 0.9rem; margin-top: 1rem;">
767
- 768D • 16 Layers • 12 Heads • ~313M Parameters • **Trained for Reasoning**
768
- </p>
769
  <div class="celebration">πŸš€ πŸ’« 🎯 ⚑ πŸ”₯</div>
770
  </div>
771
  """)
772
  else:
773
- gr.HTML("""
774
- <div class="header">
775
- <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
776
- alt="Sam-large-2"
777
- style="max-width: 300px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 4px 16px rgba(0,0,0,0.15);">
778
- <h1>🤖 Sam-large-2 Chat</h1>
779
- <p>Advanced Reasoning Model with Chain-of-Thought support.</p>
780
- <p style="font-size: 0.9rem; margin-top: 0.5rem;">
781
- 768D • 16 Layers • 12 Heads • Trained on TPU v5e-8
782
- </p>
783
- </div>
784
- """)
785
-
786
 
787
  with gr.Row():
788
  with gr.Column(scale=4):
@@ -791,144 +545,74 @@ This is the final, direct answer.
791
  avatar_images=(None, "https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/KtiMi-aDUOOeN--YNT-Fu.jpeg"),
792
  bubble_full_width=False
793
  )
794
-
795
  with gr.Row():
796
  with gr.Column(min_width=0, scale=0, elem_id="reasoning-control-group"):
797
  reasoning_btn = gr.Button("💡", size="sm", elem_id="reasoning-toggle-btn", elem_classes=["off"])
798
  gr.HTML('<span class="new-tag-red">NEW</span>')
799
-
800
  msg = gr.Textbox(placeholder="Type your message here...", show_label=False, scale=8, container=False)
801
  submit_btn = gr.Button("Send πŸš€" if FESTIVE else "Send", variant="primary", scale=1)
802
  stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
803
-
804
  with gr.Row():
805
  clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
806
  retry_btn = gr.Button("🔄 Retry", size="sm")
807
 
808
  with gr.Column(scale=1):
809
  gr.Markdown("### βš™οΈ Generation Settings")
810
- max_tokens = gr.Slider(minimum=50, maximum=1024, value=512, step=50, label="Max Tokens", info="Maximum length of response")
811
- temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature", info="Higher = more creative")
812
- top_k = gr.Slider(minimum=1, maximum=100, value=40, step=1, label="Top-K", info="Sample from top K tokens")
813
- top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P", info="Nucleus sampling threshold")
814
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition Penalty", info="Penalize repeated tokens")
815
  gr.Markdown("---")
816
- gr.Markdown(f"""
817
- ### 🎊 Sam-large-2 Model Info
818
- **🎯 The Reasoning Core!**
819
  **Type:** Chain-of-Thought Reasoning Model
820
- **Parameters:** ~313M
821
- **Context:** {config['max_position_embeddings']} tokens
822
  **Vocab:** {config['vocab_size']}
823
  **Reasoning:** Full CoT support (uses **<think>** tags)
824
- **Feature:** Reasoning toggle available! (Top-left of input box)
825
- **Architecture:**
826
- - RoPE positional encoding
827
- - SwiGLU activation
828
- - RMSNorm layers
829
- - No bias terms (efficient!)
830
  """)
831
 
832
- # Example prompts
833
- gr.Examples(
834
- examples=[
835
- "Hi! What can you do?",
836
- "Explain quantum computing in simple terms",
837
- "Write a short poem about AI",
838
- "Why is Sam-large-2 considered a reasoning model?",
839
- "Tell me a step-by-step method for solving a math problem.",
840
- ],
841
- inputs=msg,
842
- label="🎯 Try these examples!"
843
- )
844
 
845
- # Footer - Ensure this is a clean multi-line string
846
  gr.HTML("""
847
  <footer>
848
- <p style="font-size: 1.2rem;"><strong>πŸŽ‰ Sam-large-2 - LATEST RELEASE! πŸŽ‰</strong></p>
849
- <p><strong>The Reasoning Core</strong> - Chain-of-Thought Enabled</p>
850
- <p style="font-size: 0.9rem; color: #999; margin-top: 0.5rem;">
851
- Trained from scratch on TPU v5e-8 • Built by Smily studios with TensorFlow & Gradio
852
- </p>
853
- <p style="font-size: 0.9rem; color: #999;">
854
- Uses **<think>** tags for reasoning when enabled.
855
- </p>
856
- <div style="margin-top: 1rem; font-size: 1.5rem;">
857
- ⚑ πŸš€ πŸ’« ✨ 🎯
858
- </div>
859
  </footer>
860
  """)
861
 
862
- # --- JavaScript to show modal on first load ---
863
  def show_modal_js():
864
  return """
865
  (function() {
866
  if (sessionStorage.getItem('sam2_modal_shown') !== 'true') {
867
  const modal = document.getElementById('welcome-modal');
868
- if (modal) {
869
- modal.style.display = 'flex';
870
- sessionStorage.setItem('sam2_modal_shown', 'true');
871
- }
872
  }
873
  })();
874
  """
875
-
876
- # Execute the JavaScript function on page load
877
  demo.load(None, inputs=None, outputs=None, js=show_modal_js())
878
 
879
-
880
- # Reasoning Toggle function
881
  def toggle_reasoning(current_state):
882
  new_state = not current_state
883
- btn_class = "" if new_state else "off"
884
- return new_state, gr.update(elem_classes=btn_class)
885
-
886
- # Reasoning Toggle Event Handler
887
- reasoning_btn.click(
888
- fn=toggle_reasoning,
889
- inputs=[reasoning_enabled],
890
- outputs=[reasoning_enabled, reasoning_btn],
891
- preprocess=False
892
- )
893
 
894
- # Event handlers for chat
895
- submit_event = msg.submit(
896
- chat_stream,
897
- inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty, reasoning_enabled],
898
- outputs=[chatbot]
899
- ).then(lambda: "", outputs=[msg])
900
 
901
- click_event = submit_btn.click(
902
- chat_stream,
903
- inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty, reasoning_enabled],
904
- outputs=[chatbot]
905
- ).then(lambda: "", outputs=[msg])
906
 
907
  stop_btn.click(fn=stop_gen, inputs=None, outputs=None, cancels=[submit_event, click_event])
908
  clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
909
 
910
  def retry_last(history, max_tok, temp, topk, topp, rep_pen, reasoning_en):
911
- if not history:
912
- return history
913
  last_user_msg = history[-1][0]
914
- history = history[:-1]
915
- for update in chat_stream(last_user_msg, history, max_tok, temp, topk, topp, rep_pen, reasoning_en):
916
  yield update
917
-
918
- retry_event = retry_btn.click(
919
- retry_last,
920
- inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty, reasoning_enabled],
921
- outputs=[chatbot]
922
- )
923
-
924
  stop_btn.click(fn=stop_gen, inputs=None, outputs=None, cancels=[retry_event])
925
 
926
- # Launch
927
  if __name__ == "__main__":
928
  demo.queue(max_size=20)
929
- demo.launch(
930
- server_name="0.0.0.0",
931
- server_port=7860,
932
- share=False,
933
- show_error=True
934
- )
 
11
  # ============================================================================
12
  # 🎊 FESTIVE MODE TOGGLE 🎊
13
  # ============================================================================
14
+ FESTIVE = True
15
 
16
  # ============================================================================
17
+ # Configuration & Model Loading
18
  # ============================================================================
19
 
20
  print("πŸš€ Loading Sam-large-2 Model...")
 
39
  super().build(input_shape)
40
 
41
  def _build_cache(self):
 
42
  if not self.built_cache:
43
  inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
44
  t = tf.range(self.max_len, dtype=tf.float32)
45
  freqs = tf.einsum("i,j->ij", t, inv_freq)
46
  emb = tf.concat([freqs, freqs], axis=-1)
 
 
47
  self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
48
  self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
49
  self.built_cache = True
 
54
 
55
  def call(self, q, k):
56
  self._build_cache()
 
57
  seq_len = tf.shape(q)[2]
58
  dtype = q.dtype
59
  cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
60
  sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
61
+ return (q * cos) + (self.rotate_half(q) * sin), (k * cos) + (self.rotate_half(k) * sin)
 
 
 
 
62
 
63
  def get_config(self):
64
  config = super().get_config()
 
100
 
101
  self.pre_attn_norm = RMSNorm()
102
  self.pre_ffn_norm = RMSNorm()
 
103
  self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
104
  self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
105
  self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
106
  self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
 
107
  self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
 
108
  self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
109
  self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
110
  self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
 
111
  self.dropout = keras.layers.Dropout(dropout)
112
 
113
  def call(self, x, training=None):
114
  B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
115
  dtype = x.dtype
 
 
116
  res = x
117
  y = self.pre_attn_norm(x)
 
118
  q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
119
  k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
120
  v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
 
121
  q, k = self.rope(q, k)
 
122
  scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
123
+ mask = tf.where(tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0, tf.constant(-1e9, dtype=dtype), tf.constant(0.0, dtype=dtype))
 
 
 
 
 
124
  scores += mask
125
  attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
 
126
  attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
127
  x = res + self.dropout(self.out_proj(attn), training=training)
 
 
128
  res = x
129
  y = self.pre_ffn_norm(x)
130
  ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
 
131
  return res + self.dropout(ffn, training=training)
132
 
133
  def get_config(self):
134
  config = super().get_config()
135
+ config.update({"d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim, "dropout": self.dropout_rate, "max_len": self.max_len, "rope_theta": self.rope_theta, "layer_idx": self.layer_idx})
 
 
 
 
 
 
 
 
136
  return config
137
 
138
 
 
148
  self.cfg = kwargs.get('cfg', kwargs)
149
 
150
  self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
 
151
  ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
152
+ block_args = {'d_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'], 'ff_dim': ff_dim, 'dropout': self.cfg['dropout'], 'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']}
 
 
 
 
 
 
 
 
153
  self.blocks = []
154
  for i in range(self.cfg['n_layers']):
155
  block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
156
  self.blocks.append(block)
 
157
  self.norm = RMSNorm(name="final_norm")
158
  self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
159
 
160
  def call(self, input_ids, training=None):
161
  x = self.embed(input_ids)
 
162
  for block in self.blocks:
163
  x = block(x, training=training)
 
164
  return self.lm_head(self.norm(x))
165
 
166
  def get_config(self):
 
170
 
171
  # --- Model and Tokenizer Loading ---
172
 
 
173
  config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
174
 
 
175
  try:
176
  weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
177
  print("βœ… Found checkpoint weights (ckpt.weights.h5)")
 
183
  use_checkpoint = False
184
  except Exception as e_model:
185
  print(f"❌ Also failed to find model.keras: {e_model}")
 
 
186
 
 
187
  with open(config_path, 'r') as f:
188
  config = json.load(f)
189
 
 
190
  from transformers import AutoTokenizer
191
 
192
  hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
 
197
  tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
198
 
199
  print(f"βœ… Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
 
200
  eos_token_id = config.get('eos_token_id', 50256)
201
 
 
 
 
202
  print("\nπŸ”„ Loading model...")
203
 
204
  model = None
205
 
206
  if use_checkpoint:
207
  print("πŸ“¦ Building model from config and loading checkpoint weights...")
 
208
  model_config = {
209
  'vocab_size': config['vocab_size'],
210
  'd_model': config['hidden_size'],
 
215
  'dropout': 0.1,
216
  'rope_theta': config['rope_theta']
217
  }
 
218
  model = SAM1Model(config=model_config)
 
 
219
  dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
220
  _ = model(dummy_input, training=False)
 
221
  print(f"βœ… Model architecture built: {model.count_params():,} parameters")
 
222
  try:
223
  model.load_weights(weights_path)
224
  print("βœ… Checkpoint weights loaded successfully!")
225
  except Exception as e:
226
  print(f"❌ Failed to load checkpoint weights: {e}")
 
227
  else:
228
  print("πŸ“¦ Loading full saved model...")
229
  try:
230
+ custom_objects = {'SAM1Model': SAM1Model, 'TransformerBlock': TransformerBlock, 'RMSNorm': RMSNorm, 'RotaryEmbedding': RotaryEmbedding}
 
 
 
 
 
 
231
  model = keras.models.load_model(model_path, compile=False, custom_objects=custom_objects)
232
  print("βœ… Model loaded successfully")
233
  except Exception as e:
234
  print(f"❌ Failed to load model: {e}")
 
 
235
 
236
  if model:
237
  print(f"βœ… Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
238
 
 
 
 
239
  # ============================================================================
240
+ # Optimized Inference Logic (TF Functions)
241
  # ============================================================================
242
 
243
+ # Define fast forward for real generation
244
+ @tf.function(reduce_retracing=True)
245
+ def fast_forward(input_tensor):
246
+ return model(input_tensor, training=False)
247
+
248
+ stop_generation = False
249
 
250
  def generate_stream(
251
  prompt: str,
 
255
  top_p: float = 0.9,
256
  repetition_penalty: float = 1.1
257
  ):
258
+ """Generate text with streaming output using REAL model inference"""
259
  global stop_generation
260
  stop_generation = False
261
 
262
+ # Tokenize prompt
263
  prompt_ids = tokenizer.encode(prompt).ids
264
  input_ids = [i for i in prompt_ids if i != eos_token_id]
265
 
266
+ input_tensor = tf.constant([input_ids], dtype=tf.int32)
267
  generated_text = ""
268
  token_count = 0
269
+ token_freq = {}
270
 
271
+ start_time = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ # --- REAL INFERENCE LOOP ---
274
+ for step in range(max_tokens):
275
  if stop_generation:
276
+ yield generated_text + "\n\n*[Generation stopped]*"
277
  break
278
 
279
+ # 1. Forward Pass (Real Model)
280
+ logits = fast_forward(input_tensor)
281
+ next_token_logits = logits[0, -1, :].numpy()
282
+
283
+ # 2. Temperature
284
+ next_token_logits = next_token_logits / temperature
285
+
286
+ # 3. Repetition Penalty
287
+ if repetition_penalty != 1.0:
288
+ for token_id, freq in token_freq.items():
289
+ if token_id < len(next_token_logits):
290
+ next_token_logits[token_id] /= (repetition_penalty ** freq)
291
+
292
+ # 4. Sampling (Top-K / Top-P)
293
+ # Top-K
294
+ if top_k > 0:
295
+ top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
296
+ top_k_logits = next_token_logits[top_k_indices]
297
+ top_k_probs = tf.nn.softmax(top_k_logits).numpy()
298
+
299
+ # Top-P (Nucleus)
300
+ if top_p < 1.0:
301
+ sorted_indices = np.argsort(top_k_probs)[::-1]
302
+ cumsum = np.cumsum(top_k_probs[sorted_indices])
303
+ cutoff_idx = np.searchsorted(cumsum, top_p)
304
+ nucleus_indices = sorted_indices[:cutoff_idx + 1]
305
+
306
+ nucleus_logits = top_k_logits[nucleus_indices]
307
+ nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
308
+
309
+ sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
310
+ next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
311
+ else:
312
+ sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
313
+ next_token_id = int(top_k_indices[sampled_idx])
314
  else:
315
+ probs = tf.nn.softmax(next_token_logits).numpy()
316
+ next_token_id = np.random.choice(len(probs), p=probs)
317
 
318
+ # 5. Stop Conditions
319
+ if next_token_id == eos_token_id or \
320
+ next_token_id == tokenizer.token_to_id("<|im_end|>") or \
321
+ next_token_id == tokenizer.token_to_id("<im end for model tun>"):
322
  break
323
+
324
+ # 6. Update Input & History
325
+ token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1
326
 
327
+ token_text = tokenizer.decode([next_token_id])
328
+ generated_text += token_text
329
  token_count += 1
330
 
331
+ yield generated_text
 
 
 
332
 
333
+ # Prepare next input
334
+ input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)
335
 
336
+ # Truncate if exceeding context
337
+ if input_tensor.shape[1] > config['max_position_embeddings']:
338
+ input_tensor = input_tensor[:, -config['max_position_embeddings']:]
339
+
340
  elapsed = time.time() - start_time
341
  tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
342
 
 
350
  # ============================================================================
351
 
352
  def format_chat_prompt(message: str, history: list, reasoning_enabled: bool) -> str:
353
+ """Format message history and SEED <think> if enabled"""
354
  prompt = ""
 
 
355
  for user_msg, assistant_msg in history:
356
  prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
357
  if assistant_msg:
358
  prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
359
 
 
360
  prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
361
 
362
+ # 🧠 REAL REASONING: Just add the tag. The model will do the rest.
363
  if reasoning_enabled:
364
  prompt += "<think>"
365
 
 
375
  repetition_penalty: float,
376
  reasoning_enabled: bool
377
  ):
 
378
  if not message.strip():
379
  yield history
380
  return
 
382
  prompt = format_chat_prompt(message, history, reasoning_enabled)
383
  partial_response = ""
384
 
385
+ # ⚡ NO FAKE REASONING HERE. We trust the model.
386
+
 
 
 
 
 
 
 
387
  for generated in generate_stream(
388
  prompt, max_tokens, temperature, top_k, top_p, repetition_penalty
389
  ):
390
  partial_response = generated
391
 
392
+ # Robust End-of-Turn Detection
393
  stop_tags = ["<|im_end|>", "<im end for model tun>"]
394
  earliest_stop = len(partial_response)
395
  should_stop = False
 
402
  if should_stop:
403
  partial_response = partial_response[:earliest_stop]
404
 
405
+ # Post-process reasoning tags for display (Collapsing the REAL thought)
406
  if reasoning_enabled:
 
407
  if '<think>' in partial_response and '</think>' in partial_response:
408
  start_idx = partial_response.find('<think>')
409
  end_idx = partial_response.find('</think>')
410
  if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
411
  thought_content = partial_response[start_idx + len('<think>'):end_idx].strip()
412
+
413
+ # Safe formatting outside f-string
414
+ formatted_thought = thought_content.replace("\n", "<br>")
415
+
416
  details_html = (
417
  f'<details class="reasoning-block">'
418
  f'<summary>Model Reasoning (Click to show/hide)</summary>'
419
+ f'<p>{formatted_thought}</p>'
420
  f'</details>'
421
  )
422
  partial_response = partial_response[:start_idx] + details_html + partial_response[end_idx + len('</think>'):]
423
  elif start_idx != -1 and end_idx == -1:
424
+ # Model is currently thinking...
425
+ partial_response = partial_response.replace('<think>', '**Thinking:** ')
426
 
 
427
  yield history + [[message, partial_response.strip()]]
428
 
429
  def stop_gen():
 
430
  global stop_generation
431
  stop_generation = True
432
  return None
433
 
434
  # ============================================================================
435
+ # Gradio UI
436
  # ============================================================================
437
 
438
  custom_css = """
439
+ .gradio-container { max-width: 1200px !important; margin: auto !important; }
 
 
 
 
440
  .header {
441
+ text-align: center; padding: 2rem; background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
442
+ color: white; border-radius: 12px; margin-bottom: 2rem; box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
 
 
 
 
 
443
  animation: pulse 2s ease-in-out infinite;
444
  }
445
+ @keyframes pulse { 0%, 100% { transform: scale(1); } 50% { transform: scale(1.02); } }
446
+ .header h1 { font-size: 2.8rem; margin-bottom: 0.5rem; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.2); }
447
+ .header p { font-size: 1.1rem; opacity: 0.95; }
448
+ .celebration { font-size: 2rem; margin: 0.5rem; animation: bounce 1s ease infinite; }
449
+ @keyframes bounce { 0%, 100% { transform: translateY(0); } 50% { transform: translateY(-10px); } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  .twin-badge {
451
+ display: inline-block; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
452
+ color: white; padding: 0.5rem 1rem; border-radius: 20px; font-weight: bold; margin: 0.5rem;
 
 
 
 
 
453
  box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
454
  }
455
+ footer { text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem; }
456
+ #reasoning-control-group { position: relative; display: flex; align-items: center; justify-content: center; margin-right: 10px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  #reasoning-toggle-btn {
458
+ font-size: 1.5rem; border-radius: 50%; width: 40px; height: 40px; padding: 0;
459
+ min-width: 0 !important; line-height: 1; background-color: #ffcc00; border: 2px solid #e6b800;
 
 
 
 
 
 
 
460
  }
461
+ #reasoning-toggle-btn.off { background-color: #e0e0e0; border: 2px solid #ccc; }
 
 
 
 
 
462
  .new-tag-red {
463
+ display: inline-block; background-color: #f5576c; color: white; font-size: 0.7em;
464
+ font-weight: bold; padding: 2px 5px; border-radius: 4px; line-height: 1;
465
+ position: absolute; top: -5px; right: -5px; z-index: 10; animation: blink 1s infinite;
 
 
 
 
 
 
 
 
 
 
466
  }
467
+ @keyframes blink { 0%, 100% { opacity: 1; } 50% { opacity: 0.5; } }
 
 
 
 
 
 
468
  .gradio-html details.reasoning-block {
469
+ border: 1px solid #ddd; border-left: 5px solid #667eea; padding: 5px 10px;
470
+ margin: 10px 0; border-radius: 4px; background-color: #f9f9ff;
 
 
 
 
471
  }
472
+ .gradio-html details.reasoning-block summary { font-weight: bold; cursor: pointer; outline: none; color: #667eea; }
473
+ .gradio-html details.reasoning-block p { margin-top: 5px; padding-left: 10px; border-left: 1px dashed #ccc; white-space: pre-wrap; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  .modal-overlay {
475
+ position: fixed; top: 0; left: 0; right: 0; bottom: 0; background: rgba(0, 0, 0, 0.7);
476
+ display: flex; justify-content: center; align-items: center; z-index: 1000;
 
 
 
 
 
 
 
 
477
  }
 
478
  .modal-content {
479
+ background: white; padding: 30px; border-radius: 15px; width: 90%; max-width: 900px;
480
+ box-shadow: 0 10px 50px rgba(0, 0, 0, 0.5); animation: slide-in 0.5s ease-out;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
  }
482
+ @keyframes slide-in { from { transform: translateY(-50px); opacity: 0; } to { transform: translateY(0); opacity: 1; } }
483
+ .modal-content h2 { color: #764ba2; border-bottom: 2px solid #eee; padding-bottom: 10px; margin-top: 0; }
484
+ .comparison-box { display: flex; gap: 20px; margin-top: 20px; }
485
+ .comparison-mode { flex: 1; padding: 15px; border-radius: 10px; }
486
+ .mode-reasoning { border: 2px solid #667eea; background-color: #f6f7ff; }
487
+ .mode-direct { border: 2px solid #fcb69f; background-color: #fffaf5; }
488
+ .comparison-mode h3 { margin-top: 0; font-size: 1.3rem; }
489
+ .comparison-mode pre { background-color: #eef; padding: 10px; border-radius: 5px; overflow-x: auto; }
490
  .close-btn {
491
+ margin-top: 20px; padding: 10px 20px; background-color: #764ba2; color: white;
492
+ border: none; border-radius: 8px; cursor: pointer; font-size: 1rem; transition: background-color 0.3s;
 
 
 
 
 
 
 
 
 
 
 
493
  }
494
+ .close-btn:hover { background-color: #5d3a84; }
495
  """
496
 
497
  festive_css = custom_css
498
  custom_css = festive_css
499
 
 
500
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
501
  reasoning_enabled = gr.State(False)
502
  modal_shown = gr.State(False)
503
 
 
504
  welcome_modal_html = gr.HTML(
505
  """
506
  <div id="welcome-modal" class="modal-overlay" style="display:none;">
507
  <div class="modal-content">
508
  <h2>🧠 Welcome to Sam-large-2: Dual-Mode Reasoning Demo</h2>
509
  <p>Our latest model, **Sam-large-2**, features **Chain-of-Thought (CoT)** functionality. You can toggle this feature using the 💡 button next to the input field.</p>
 
510
  <div class="comparison-box">
511
  <div class="comparison-mode mode-reasoning">
512
  <h3>💡 Reasoning Mode (ON)</h3>
513
+ <p>The model performs a **CoT step** first. The internal thought process is contained within the <code>&lt;think>...&lt;/think></code> tags.</p>
 
 
 
 
 
 
 
 
514
  </div>
515
  <div class="comparison-mode mode-direct">
516
  <h3>⚪ Direct Mode (OFF)</h3>
517
+ <p>The model generates the final answer immediately, maximizing speed.</p>
 
 
 
518
  </div>
519
  </div>
520
  <button class="close-btn" onclick="document.getElementById('welcome-modal').style.display='none'">Got it! Start Chatting</button>
 
523
  """
524
  )
525
 
 
526
  if FESTIVE:
527
  gr.HTML("""
528
  <div class="header">
529
  <div class="celebration">πŸŽ‰ 🎊 ✨ 🎈 πŸŽ†</div>
530
  <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
531
+ alt="Sam-large-2" style="max-width: 400px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);">
 
532
  <h1>🤖 Sam-large-2 Chat 🤖</h1>
533
  <p><strong>LATEST RELEASE!</strong> Our **BEST Reasoning Model** - Full Chain-of-Thought!</p>
534
  <div class="twin-badge">Reasoning Model</div>
 
 
 
535
  <div class="celebration">πŸš€ πŸ’« 🎯 ⚑ πŸ”₯</div>
536
  </div>
537
  """)
538
  else:
539
+ gr.HTML("""<div class="header"><h1>πŸ€– Sam-large-2 Chat</h1><p>Advanced Reasoning Model</p></div>""")
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
  with gr.Row():
542
  with gr.Column(scale=4):
 
545
  avatar_images=(None, "https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/KtiMi-aDUOOeN--YNT-Fu.jpeg"),
546
  bubble_full_width=False
547
  )
 
548
  with gr.Row():
549
  with gr.Column(min_width=0, scale=0, elem_id="reasoning-control-group"):
550
  reasoning_btn = gr.Button("💡", size="sm", elem_id="reasoning-toggle-btn", elem_classes=["off"])
551
  gr.HTML('<span class="new-tag-red">NEW</span>')
 
552
  msg = gr.Textbox(placeholder="Type your message here...", show_label=False, scale=8, container=False)
553
  submit_btn = gr.Button("Send πŸš€" if FESTIVE else "Send", variant="primary", scale=1)
554
  stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)
 
555
  with gr.Row():
556
  clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
557
  retry_btn = gr.Button("🔄 Retry", size="sm")
558
 
559
  with gr.Column(scale=1):
560
  gr.Markdown("### βš™οΈ Generation Settings")
561
+ max_tokens = gr.Slider(minimum=50, maximum=1024, value=512, step=50, label="Max Tokens")
562
+ temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature")
563
+ top_k = gr.Slider(minimum=1, maximum=100, value=40, step=1, label="Top-K")
564
+ top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
565
+ repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition Penalty")
566
  gr.Markdown("---")
567
+ gr.Markdown(f"""### 🎊 Sam-large-2 Model Info
 
 
568
  **Type:** Chain-of-Thought Reasoning Model
 
 
569
  **Vocab:** {config['vocab_size']}
570
  **Reasoning:** Full CoT support (uses **<think>** tags)
 
 
 
 
 
 
571
  """)
572
 
573
+ gr.Examples(examples=["Explain quantum computing", "Write a short poem about AI", "Solve 24*12 with reasoning"], inputs=msg)
 
 
 
 
 
 
 
 
 
 
 
574
 
 
575
  gr.HTML("""
576
  <footer>
577
+ <p><strong>🎉 Sam-large-2 - LATEST RELEASE! 🎉</strong></p>
578
+ <p style="font-size: 0.9rem; color: #999;">Trained from scratch on TPU v5e-8 β€’ Built by Smily studios with TensorFlow & Gradio</p>
 
 
 
 
 
 
 
 
 
579
  </footer>
580
  """)
581
 
 
582
  def show_modal_js():
583
  return """
584
  (function() {
585
  if (sessionStorage.getItem('sam2_modal_shown') !== 'true') {
586
  const modal = document.getElementById('welcome-modal');
587
+ if (modal) { modal.style.display = 'flex'; sessionStorage.setItem('sam2_modal_shown', 'true'); }
 
 
 
588
  }
589
  })();
590
  """
 
 
591
  demo.load(None, inputs=None, outputs=None, js=show_modal_js())
592
 
 
 
593
  def toggle_reasoning(current_state):
594
  new_state = not current_state
595
+ return new_state, gr.update(elem_classes="" if new_state else "off")
 
 
 
 
 
 
 
 
 
596
 
597
+ reasoning_btn.click(fn=toggle_reasoning, inputs=[reasoning_enabled], outputs=[reasoning_enabled, reasoning_btn], preprocess=False)
598
+
599
+ common_inputs = [msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty, reasoning_enabled]
 
 
 
600
 
601
+ submit_event = msg.submit(chat_stream, inputs=common_inputs, outputs=[chatbot]).then(lambda: "", outputs=[msg])
602
+ click_event = submit_btn.click(chat_stream, inputs=common_inputs, outputs=[chatbot]).then(lambda: "", outputs=[msg])
 
 
 
603
 
604
  stop_btn.click(fn=stop_gen, inputs=None, outputs=None, cancels=[submit_event, click_event])
605
  clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
606
 
607
  def retry_last(history, max_tok, temp, topk, topp, rep_pen, reasoning_en):
608
+ if not history: return history
 
609
  last_user_msg = history[-1][0]
610
+ for update in chat_stream(last_user_msg, history[:-1], max_tok, temp, topk, topp, rep_pen, reasoning_en):
 
611
  yield update
612
+
613
+ retry_event = retry_btn.click(retry_last, inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty, reasoning_enabled], outputs=[chatbot])
 
 
 
 
 
614
  stop_btn.click(fn=stop_gen, inputs=None, outputs=None, cancels=[retry_event])
615
 
 
616
  if __name__ == "__main__":
617
  demo.queue(max_size=20)
618
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)