Spaces:

Smilyai-labs
/

MixSam-exp-chat

Sleeping

App Files Files Community

Keeby-smilyai committed on Nov 8

Commit

0a071aa

verified ·

1 Parent(s): 8f29b30

Create app.py

Browse files

Files changed (1) hide show

app.py +611 -0

app.py ADDED Viewed

	@@ -0,0 +1,611 @@

+import gradio as gr
+import jax
+import jax.numpy as jnp
+from jax import random
+import flax.linen as nn
+from tokenizers import Tokenizer
+from safetensors.flax import load_file
+import json
+import os
+from typing import Any, Optional
+import numpy as np
+# ==============================================================================
+# MODEL ARCHITECTURE (from your training code)
+# ==============================================================================
+class RMSNorm(nn.Module):
+    epsilon: float = 1e-5
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x):
+        x = x.astype(jnp.float32)
+        scale = self.param('scale', nn.initializers.ones, (x.shape[-1],))
+        variance = jnp.mean(jnp.square(x), axis=-1, keepdims=True)
+        x = x * jax.lax.rsqrt(variance + self.epsilon) * scale
+        return x.astype(self.dtype)
+def precompute_yarn_freqs(dim: int, end: int, theta: float = 10000.0,
+                          scale: float = 1.0, alpha: float = 1.0,
+                          beta: float = 32.0, dtype=jnp.bfloat16):
+    freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2, dtype=jnp.float32) / dim))
+    if scale > 1.0:
+        def yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
+            return (dim * jnp.log(max_position_embeddings / (num_rotations * 2 * jnp.pi))) / (2 * jnp.log(base))
+        def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
+            low = jnp.floor(yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
+            high = jnp.ceil(yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
+            return jnp.maximum(low, 0).astype(jnp.int32), jnp.minimum(high, dim - 1).astype(jnp.int32)
+        def yarn_linear_ramp_mask(min_val, max_val, dim):
+            if min_val == max_val:
+                max_val += 0.001
+            linear_func = (jnp.arange(dim, dtype=jnp.float32) - min_val) / (max_val - min_val)
+            return jnp.clip(linear_func, 0, 1)
+        low, high = yarn_find_correction_range(beta, alpha, dim, theta, int(end * scale))
+        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2)
+        freqs = freqs / ((1 - inv_freq_mask) * (scale - 1) + 1)
+    t = jnp.arange(end, dtype=jnp.float32)
+    freqs = jnp.outer(t, freqs)
+    mscale = 1.0
+    if scale > 1.0:
+        mscale = 0.1 * 1.0 * jnp.log(scale) + 1.0
+    cos = jnp.cos(freqs) * mscale
+    sin = jnp.sin(freqs) * mscale
+    return jnp.concatenate([cos, sin], axis=-1).astype(dtype), mscale
+def apply_rotary_emb(xq, xk, freqs_cis, mscale=1.0):
+    def rotate_half(x):
+        x1, x2 = jnp.split(x, 2, axis=-1)
+        return jnp.concatenate([-x2, x1], axis=-1)
+    seq_len = xq.shape[2]
+    head_dim = xq.shape[3]
+    freqs = freqs_cis[:seq_len, :]
+    half_dim = head_dim // 2
+    cos = freqs[:, :half_dim]
+    sin = freqs[:, half_dim:]
+    cos = jnp.repeat(cos, 2, axis=-1)
+    sin = jnp.repeat(sin, 2, axis=-1)
+    cos = cos[None, None, :, :]
+    sin = sin[None, None, :, :]
+    xq_out = (xq * cos) + (rotate_half(xq) * sin)
+    xk_out = (xk * cos) + (rotate_half(xk) * sin)
+    return xq_out, xk_out
+class DepthwiseSeparableConv1D(nn.Module):
+    channels: int
+    kernel_size: int = 3
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x):
+        depthwise = nn.Conv(
+            features=self.channels,
+            kernel_size=(self.kernel_size,),
+            feature_group_count=self.channels,
+            padding='SAME',
+            use_bias=False,
+            dtype=self.dtype,
+            name='depthwise'
+        )(x)
+        pointwise = nn.Conv(
+            features=self.channels,
+            kernel_size=(1,),
+            use_bias=False,
+            dtype=self.dtype,
+            name='pointwise'
+        )(depthwise)
+        return pointwise
+class LocalContextCNN(nn.Module):
+    d_model: int
+    dropout: float
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, training: bool = False):
+        conv3 = DepthwiseSeparableConv1D(self.d_model, 3, self.dtype, name='conv3')(x)
+        conv5 = DepthwiseSeparableConv1D(self.d_model, 5, self.dtype, name='conv5')(x)
+        conv7 = DepthwiseSeparableConv1D(self.d_model, 7, self.dtype, name='conv7')(x)
+        gate = nn.Dense(self.d_model * 3, dtype=self.dtype, name='fusion_gate')(x)
+        gate = nn.sigmoid(gate)
+        g3, g5, g7 = jnp.split(gate, 3, axis=-1)
+        out = g3 * conv3 + g5 * conv5 + g7 * conv7
+        scale = self.param('layer_scale', nn.initializers.constant(1e-6), (self.d_model,))
+        out = out * scale
+        return nn.Dropout(self.dropout, deterministic=not training)(out)
+class MinGRUCell(nn.Module):
+    hidden_size: int
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, h):
+        z = nn.Dense(self.hidden_size, use_bias=True, dtype=self.dtype, name='gate')(x)
+        h_tilde = nn.Dense(self.hidden_size, use_bias=True, dtype=self.dtype, name='candidate')(x)
+        z = nn.sigmoid(z)
+        h_tilde = nn.tanh(h_tilde)
+        h_new = (1 - z) * h + z * h_tilde
+        return h_new
+class BidirectionalMinGRU(nn.Module):
+    hidden_size: int
+    dropout: float
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, training: bool = False):
+        batch_size, seq_len, d_model = x.shape
+        x_proj = nn.Dense(self.hidden_size, dtype=self.dtype, name='input_proj')(x)
+        class ScanRNNCell(nn.Module):
+            hidden_size: int
+            dtype: Any = jnp.bfloat16
+            @nn.compact
+            def __call__(self, h, x_t):
+                cell = MinGRUCell(self.hidden_size, dtype=self.dtype)
+                h_new = cell(x_t, h)
+                return h_new, h_new
+        ForwardScanner = nn.scan(
+            ScanRNNCell,
+            variable_broadcast='params',
+            split_rngs={'params': False},
+            in_axes=1,
+            out_axes=1
+        )
+        h0_forward = jnp.zeros((batch_size, self.hidden_size), dtype=self.dtype)
+        _, h_forward = ForwardScanner(
+            hidden_size=self.hidden_size,
+            dtype=self.dtype,
+            name='forward_cell'
+        )(h0_forward, x_proj)
+        BackwardScanner = nn.scan(
+            ScanRNNCell,
+            variable_broadcast='params',
+            split_rngs={'params': False},
+            in_axes=1,
+            out_axes=1
+        )
+        h0_backward = jnp.zeros((batch_size, self.hidden_size), dtype=self.dtype)
+        x_proj_reversed = jnp.flip(x_proj, axis=1)
+        _, h_backward = BackwardScanner(
+            hidden_size=self.hidden_size,
+            dtype=self.dtype,
+            name='backward_cell'
+        )(h0_backward, x_proj_reversed)
+        h_backward = jnp.flip(h_backward, axis=1)
+        h_bi = jnp.concatenate([h_forward, h_backward], axis=-1)
+        out = nn.Dense(d_model, dtype=self.dtype, name='output_proj')(h_bi)
+        scale = self.param('layer_scale', nn.initializers.constant(1e-6), (d_model,))
+        out = out * scale
+        return nn.Dropout(self.dropout, deterministic=not training)(out)
+class GroupedQueryAttention(nn.Module):
+    d_model: int
+    n_heads: int
+    n_kv_heads: int
+    dropout: float
+    freqs_cis: jnp.ndarray
+    yarn_mscale: float
+    alibi_bias: Optional[jnp.ndarray]
+    alibi_weight: float
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, mask, training: bool = False):
+        B, T, D = x.shape
+        head_dim = self.d_model // self.n_heads
+        n_rep = self.n_heads // self.n_kv_heads
+        q = nn.Dense(self.d_model, use_bias=False,
+                     kernel_init=nn.initializers.normal(stddev=0.02),
+                     dtype=self.dtype, name='q_proj')(x)
+        kv_dim = self.d_model * self.n_kv_heads // self.n_heads
+        k = nn.Dense(kv_dim, use_bias=False,
+                     kernel_init=nn.initializers.normal(stddev=0.02),
+                     dtype=self.dtype, name='k_proj')(x)
+        v = nn.Dense(kv_dim, use_bias=False,
+                     kernel_init=nn.initializers.normal(stddev=0.02),
+                     dtype=self.dtype, name='v_proj')(x)
+        q = q.reshape(B, T, self.n_heads, head_dim).transpose(0, 2, 1, 3)
+        k = k.reshape(B, T, self.n_kv_heads, head_dim).transpose(0, 2, 1, 3)
+        v = v.reshape(B, T, self.n_kv_heads, head_dim).transpose(0, 2, 1, 3)
+        k = jnp.repeat(k, n_rep, axis=1)
+        v = jnp.repeat(v, n_rep, axis=1)
+        q, k = apply_rotary_emb(q, k, self.freqs_cis, self.yarn_mscale)
+        scores = jnp.matmul(q, k.transpose(0, 1, 3, 2)) / jnp.sqrt(head_dim).astype(self.dtype)
+        if self.alibi_bias is not None:
+            scores = scores * (1 - self.alibi_weight)
+            alibi = self.alibi_bias[:, :, :T, :T]
+            scores = scores + (alibi * self.alibi_weight)
+        scores = scores + mask
+        attn_weights = nn.softmax(scores, axis=-1)
+        attn_weights = nn.Dropout(self.dropout, deterministic=not training)(attn_weights)
+        attn_out = jnp.matmul(attn_weights, v)
+        attn_out = attn_out.transpose(0, 2, 1, 3).reshape(B, T, D)
+        out = nn.Dense(self.d_model, use_bias=False,
+                      kernel_init=nn.initializers.normal(stddev=0.02),
+                      dtype=self.dtype, name='o_proj')(attn_out)
+        return nn.Dropout(self.dropout, deterministic=not training)(out)
+class SwiGLU(nn.Module):
+    d_model: int
+    ff_dim: int
+    dropout: float
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, training: bool = False):
+        gate = nn.Dense(self.ff_dim, use_bias=False,
+                       kernel_init=nn.initializers.normal(stddev=0.02),
+                       dtype=self.dtype, name='gate_proj')(x)
+        up = nn.Dense(self.ff_dim, use_bias=False,
+                     kernel_init=nn.initializers.normal(stddev=0.02),
+                     dtype=self.dtype, name='up_proj')(x)
+        hidden = nn.silu(gate) * up
+        out = nn.Dense(self.d_model, use_bias=False,
+                      kernel_init=nn.initializers.normal(stddev=0.02),
+                      dtype=self.dtype, name='down_proj')(hidden)
+        return nn.Dropout(self.dropout, deterministic=not training)(out)
+class HybridTransformerBlock(nn.Module):
+    d_model: int
+    n_heads: int
+    n_kv_heads: int
+    ff_dim: int
+    dropout: float
+    freqs_cis: jnp.ndarray
+    yarn_mscale: float
+    alibi_bias: Optional[jnp.ndarray]
+    alibi_weight: float
+    layer_idx: int
+    layer_drop_prob: float = 0.0
+    use_cnn: bool = True
+    use_rnn: bool = True
+    rnn_hidden: int = 512
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, x, mask, training: bool = False):
+        scale = 1.0
+        if self.use_rnn:
+            h_rnn = RMSNorm(dtype=self.dtype, name='rnn_norm')(x)
+            h_rnn = BidirectionalMinGRU(
+                self.rnn_hidden, self.dropout, dtype=self.dtype, name='bidirectional_rnn'
+            )(h_rnn, training)
+            x = x + h_rnn * scale
+        if self.use_cnn:
+            h_cnn = RMSNorm(dtype=self.dtype, name='cnn_norm')(x)
+            h_cnn = LocalContextCNN(
+                self.d_model, self.dropout, dtype=self.dtype, name='local_cnn'
+            )(h_cnn, training)
+            x = x + h_cnn * scale
+        h = RMSNorm(dtype=self.dtype, name='attn_norm')(x)
+        h = GroupedQueryAttention(
+            self.d_model, self.n_heads, self.n_kv_heads, self.dropout,
+            self.freqs_cis, self.yarn_mscale, self.alibi_bias,
+            self.alibi_weight, dtype=self.dtype, name='attn'
+        )(h, mask, training)
+        x = x + h * scale
+        h = RMSNorm(dtype=self.dtype, name='ffn_norm')(x)
+        h = SwiGLU(self.d_model, self.ff_dim, self.dropout,
+                   dtype=self.dtype, name='ffn')(h, training)
+        x = x + h * scale
+        return x
+class SAM1HybridModel(nn.Module):
+    vocab_size: int
+    d_model: int
+    n_layers: int
+    n_heads: int
+    n_kv_heads: int
+    ff_dim: int
+    max_len: int
+    dropout: float = 0.1
+    layer_drop_prob: float = 0.05
+    rope_theta: float = 10000.0
+    yarn_scale: float = 1.0
+    yarn_alpha: float = 1.0
+    yarn_beta: float = 32.0
+    use_alibi: bool = False
+    alibi_weight: float = 0.3
+    use_cnn: bool = True
+    use_rnn: bool = True
+    rnn_hidden: int = 384
+    dtype: Any = jnp.bfloat16
+    @nn.compact
+    def __call__(self, input_ids, training: bool = False):
+        head_dim = self.d_model // self.n_heads
+        freqs_cis, yarn_mscale = precompute_yarn_freqs(
+            head_dim, self.max_len, self.rope_theta,
+            self.yarn_scale, self.yarn_alpha, self.yarn_beta, self.dtype
+        )
+        alibi_bias = None
+        x = nn.Embed(self.vocab_size, self.d_model,
+                    embedding_init=nn.initializers.normal(stddev=0.02),
+                    dtype=self.dtype, name='embed_tokens')(input_ids)
+        seq_len = input_ids.shape[1]
+        mask = jnp.tril(jnp.ones((seq_len, seq_len)))
+        mask = jnp.where(mask == 0, -1e9, 0.0).astype(self.dtype)
+        for i in range(self.n_layers):
+            use_cnn_layer = self.use_cnn and (i % 3 == 0)
+            use_rnn_layer = self.use_rnn and (i % 4 == 0)
+            x = HybridTransformerBlock(
+                self.d_model, self.n_heads, self.n_kv_heads, self.ff_dim,
+                self.dropout, freqs_cis, yarn_mscale, alibi_bias,
+                self.alibi_weight, i, self.layer_drop_prob,
+                use_cnn_layer, use_rnn_layer, self.rnn_hidden,
+                dtype=self.dtype, name=f'layers_{i}'
+            )(x, mask, training)
+        x = RMSNorm(dtype=self.dtype, name='norm')(x)
+        logits = nn.Dense(self.vocab_size, use_bias=False,
+                         kernel_init=nn.initializers.normal(stddev=0.02),
+                         dtype=self.dtype, name='lm_head')(x)
+        return logits
+# ==============================================================================
+# MODEL LOADING & GENERATION
+# ==============================================================================
+class ModelWrapper:
+    def __init__(self, model_path: str):
+        print("🔧 Loading model...")
+        # Load config
+        with open(os.path.join(model_path, "config.json"), 'r') as f:
+            config = json.load(f)
+        self.vocab_size = config['vocab_size']
+        self.d_model = config['d_model']
+        self.n_layers = config['n_layers']
+        self.n_heads = config['n_heads']
+        self.n_kv_heads = config['n_kv_heads']
+        self.ff_dim = int(self.d_model * 2.5)
+        self.max_len = config['max_len']
+        self.use_cnn = config.get('use_cnn', True)
+        self.use_rnn = config.get('use_rnn', True)
+        self.rnn_hidden = config.get('rnn_hidden', 384)
+        # Load tokenizer
+        self.tokenizer = Tokenizer.from_file(os.path.join(model_path, "tokenizer.json"))
+        # Initialize model
+        self.model = SAM1HybridModel(
+            vocab_size=self.vocab_size,
+            d_model=self.d_model,
+            n_layers=self.n_layers,
+            n_heads=self.n_heads,
+            n_kv_heads=self.n_kv_heads,
+            ff_dim=self.ff_dim,
+            max_len=self.max_len,
+            use_cnn=self.use_cnn,
+            use_rnn=self.use_rnn,
+            rnn_hidden=self.rnn_hidden,
+            dtype=jnp.bfloat16
+        )
+        # Load weights
+        flat_params = load_file(os.path.join(model_path, "model.safetensors"))
+        # Unflatten parameters
+        def unflatten_dict(flat_dict, sep='.'):
+            result = {}
+            for key, value in flat_dict.items():
+                parts = key.split(sep)
+                d = result
+                for part in parts[:-1]:
+                    if part not in d:
+                        d[part] = {}
+                    d = d[part]
+                d[parts[-1]] = jnp.array(value)
+            return result
+        self.params = {'params': unflatten_dict(flat_params)}
+        print(f"✅ Model loaded: {self.d_model}d × {self.n_layers}L × {self.n_heads}H")
+    def generate(self, prompt: str, max_new_tokens: int = 200,
+                 temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
+        # Format prompt correctly (NO newline between User: and Sam:)
+        if not prompt.startswith("User:"):
+            prompt = f"User: {prompt} Sam:"
+        else:
+            if " Sam:" not in prompt:
+                prompt = prompt + " Sam:"
+        # Tokenize
+        encoding = self.tokenizer.encode(prompt)
+        input_ids = jnp.array(encoding.ids)[None, :]
+        if input_ids.shape[1] > self.max_len:
+            input_ids = input_ids[:, -self.max_len:]
+        rng = random.PRNGKey(42)
+        generated_ids = input_ids
+        # Generate tokens
+        for _ in range(max_new_tokens):
+            logits = self.model.apply(self.params, generated_ids, training=False)
+            next_logits = logits[0, -1, :] / temperature
+            # Top-k filtering
+            top_k_logits, top_k_indices = jax.lax.top_k(next_logits, top_k)
+            # Top-p (nucleus) filtering
+            sorted_logits = jnp.sort(top_k_logits)[::-1]
+            sorted_indices = jnp.argsort(top_k_logits)[::-1]
+            cumsum_probs = jnp.cumsum(nn.softmax(sorted_logits))
+            mask = cumsum_probs <= top_p
+            mask = jnp.concatenate([jnp.array([True]), mask[:-1]])
+            filtered_logits = jnp.where(mask, sorted_logits, -1e9)
+            # Sample
+            rng, sample_rng = random.split(rng)
+            next_token_idx = random.categorical(sample_rng, filtered_logits)
+            next_token = top_k_indices[sorted_indices[next_token_idx]][None, None]
+            generated_ids = jnp.concatenate([generated_ids, next_token], axis=1)
+            # Stop on EOS
+            if next_token[0, 0] == self.tokenizer.token_to_id("<|endoftext|>"):
+                break
+        generated_text = self.tokenizer.decode(generated_ids[0].tolist())
+        # Extract response after "Sam:"
+        if "Sam:" in generated_text:
+            response = generated_text.split("Sam:")[-1].strip()
+            # Clean up
+            if "<|endoftext|>" in response:
+                response = response.split("<|endoftext|>")[0].strip()
+            return response
+        else:
+            return generated_text
+# ==============================================================================
+# GRADIO INTERFACE
+# ==============================================================================
+# Load model
+# Runs at import time. ModelWrapper joins this string with "config.json",
+# "tokenizer.json" and "model.safetensors" via os.path.join and opens the
+# results, so it has to resolve to a directory on the local filesystem.
+model = ModelWrapper("Smilyai-labs/MixSam-exp")
+def chat_fn(message, history, temperature, top_k, top_p, max_tokens):
+    # Build conversation context
+    conversation = ""
+    for user_msg, bot_msg in history:
+        conversation += f"User: {user_msg} Sam: {bot_msg} "
+    # Add current message
+    conversation += f"User: {message} Sam:"
+    # Generate response
+    response = model.generate(
+        conversation,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p
+    )
+    return response
+# Create Gradio interface
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 SAM1 Hybrid Chat
+    ### Transformer + CNN + RNN Architecture
+    Chat with SAM1, a custom hybrid language model combining:
+    - 🔷 **Transformer** attention (GQA + YARN + RoPE)
+    - 🔶 **CNN** for local context (multi-scale convolutions)
+    - 🔵 **RNN** for sequential modeling (bidirectional MinGRU)
+    """)
+    chatbot = gr.Chatbot(height=500, show_copy_button=True)
+    with gr.Row():
+        msg = gr.Textbox(
+            placeholder="Type your message here...",
+            show_label=False,
+            scale=4
+        )
+        submit = gr.Button("Send", scale=1, variant="primary")
+    with gr.Accordion("⚙️ Generation Settings", open=False):
+        with gr.Row():
+            temperature = gr.Slider(0.1, 2.0, value=0.8, label="Temperature", step=0.1)
+            top_k = gr.Slider(1, 100, value=50, label="Top-K", step=1)
+        with gr.Row():
+            top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-P", step=0.05)
+            max_tokens = gr.Slider(50, 500, value=200, label="Max Tokens", step=10)
+    clear = gr.Button("🗑️ Clear Chat")
+    # Event handlers
+    msg.submit(
+        chat_fn,
+        inputs=[msg, chatbot, temperature, top_k, top_p, max_tokens],
+        outputs=chatbot
+    ).then(lambda: "", None, msg)
+    submit.click(
+        chat_fn,
+        inputs=[msg, chatbot, temperature, top_k, top_p, max_tokens],
+        outputs=chatbot
+    ).then(lambda: "", None, msg)
+    clear.click(lambda: None, None, chatbot, queue=False)
+    gr.Markdown("""
+    ---
+    **Model Details:**
+    - Architecture: SAM1 Hybrid (Custom)
+    - Parameters: ~600M
+    - Context Length: 1024 tokens
+    - Format: `User: {query} Sam: {response}` (no newlines)
+    **Tips:**
+    - Lower temperature (0.3-0.5) for focused responses
+    - Higher temperature (0.8-1.2) for creative responses
+    - Adjust top-k/top-p for response diversity
+    """)
+if __name__ == "__main__":
+    demo.queue().launch()