Keeby-smilyai committed
Commit d9e88a5 · verified · 1 Parent(s): 90b1095

Update app.py

Files changed (1)
  1. app.py +72 -51

app.py CHANGED
@@ -19,7 +19,6 @@ import json
  from tokenizers import Tokenizer
  import numpy as np
  import time
- from functools import lru_cache

  # Configure TF threading
  tf.config.threading.set_inter_op_parallelism_threads(NUM_CORES)
@@ -47,43 +46,40 @@ MODEL_REPO = "Smilyai-labs/Sam-large-2"
  CACHE_DIR = "./model_cache"

  # ============================================================================
- # Model Architecture - MUST MATCH CHECKPOINT STRUCTURE
+ # Model Architecture - MATCHES CHECKPOINT STRUCTURE
  # ============================================================================

  @keras.saving.register_keras_serializable()
  class RotaryEmbedding(keras.layers.Layer):
-     """RoPE with pre-computed cache (no trainable weights - compatible with checkpoint)."""
+     """RoPE with cache built during layer build phase."""

      def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
          super().__init__(**kwargs)
          self.dim = dim
          self.max_len = max_len
          self.theta = theta
-         self.built_cache = False
-         self.cos_cached = None
-         self.sin_cached = None

      def build(self, input_shape):
+         # Pre-compute RoPE cache as numpy arrays during build
+         inv_freq = 1.0 / (self.theta ** (np.arange(0, self.dim, 2, dtype=np.float32) / self.dim))
+         t = np.arange(self.max_len, dtype=np.float32)
+         freqs = np.outer(t, inv_freq)
+         emb = np.concatenate([freqs, freqs], axis=-1)
+
+         # Store as numpy arrays - will be converted to tensors in call()
+         self._cos_cached = np.cos(emb).astype(np.float32)
+         self._sin_cached = np.sin(emb).astype(np.float32)
+
          super().build(input_shape)

-     def _build_cache(self):
-         if not self.built_cache:
-             inv_freq = 1.0 / (self.theta ** (np.arange(0, self.dim, 2, dtype=np.float32) / self.dim))
-             t = np.arange(self.max_len, dtype=np.float32)
-             freqs = np.outer(t, inv_freq)
-             emb = np.concatenate([freqs, freqs], axis=-1)
-             self.cos_cached = tf.constant(np.cos(emb), dtype=tf.float32)
-             self.sin_cached = tf.constant(np.sin(emb), dtype=tf.float32)
-             self.built_cache = True
-
      def call(self, q, k, offset=0):
          """Apply rotary embeddings with position offset for KV-cache."""
-         self._build_cache()
          seq_len = tf.shape(q)[2]
          dtype = q.dtype

-         cos = tf.cast(self.cos_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
-         sin = tf.cast(self.sin_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
+         # Slice the pre-computed values
+         cos = tf.cast(self._cos_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]
+         sin = tf.cast(self._sin_cached[offset:offset + seq_len, :], dtype)[None, None, :, :]

          # Fused rotate_half
          x1_q, x2_q = tf.split(q, 2, axis=-1)
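For reference, a minimal NumPy sketch of the rotate-half RoPE math this hunk moves into build(): the cos/sin tables are precomputed once for max_len positions and then sliced at a position offset when a KV cache is in use. The dimensions and shapes below are illustrative, not values from app.py.

```python
import numpy as np

def build_rope_cache(dim, max_len, theta=10000.0):
    # Same cache construction as the layer: one frequency per pair of channels
    inv_freq = 1.0 / (theta ** (np.arange(0, dim, 2, dtype=np.float32) / dim))
    t = np.arange(max_len, dtype=np.float32)
    freqs = np.outer(t, inv_freq)                 # (max_len, dim/2)
    emb = np.concatenate([freqs, freqs], axis=-1) # (max_len, dim)
    return np.cos(emb), np.sin(emb)

def apply_rope(x, cos, sin, offset=0):
    # x: (batch, heads, seq, dim); slice the cache at the KV-cache offset
    seq = x.shape[2]
    c = cos[offset:offset + seq][None, None]
    s = sin[offset:offset + seq][None, None]
    x1, x2 = np.split(x, 2, axis=-1)
    rotated = np.concatenate([-x2, x1], axis=-1)  # rotate_half
    return x * c + rotated * s

cos, sin = build_rope_cache(dim=64, max_len=128)
q = np.random.randn(1, 4, 10, 64).astype(np.float32)
print(apply_rope(q, cos, sin, offset=5).shape)    # (1, 4, 10, 64)
```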
@@ -176,7 +172,10 @@ class TransformerBlock(keras.layers.Layer):
          v = tf.transpose(v, [0, 2, 1, 3])

          # Determine position offset for RoPE
-         past_len = tf.shape(past_kv[0])[2] if past_kv is not None else 0
+         if past_kv is not None:
+             past_len = tf.shape(past_kv[0])[2]
+         else:
+             past_len = 0

          # Apply RoPE with position offset
          q, k = self.rope(q, k, offset=past_len)
@@ -192,7 +191,7 @@
          full_len = tf.shape(k)[2]
          scores = tf.matmul(q, k, transpose_b=True) * self.scale

-         # Optimized causal mask
+         # Causal mask
          q_positions = tf.range(past_len, past_len + T)
          k_positions = tf.range(full_len)
          mask = tf.cast(q_positions[:, None] < k_positions[None, :], scores.dtype) * -1e9
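A small self-contained sketch of the offset-aware causal mask built above: with past_len cached tokens, the new queries start at position past_len, and each query may only attend to keys at or before its own position. The example sizes are made up.

```python
import numpy as np

past_len, T = 3, 2            # 3 cached tokens, 2 new query tokens
full_len = past_len + T       # keys cover cached + new tokens

q_positions = np.arange(past_len, past_len + T)  # positions of the new queries
k_positions = np.arange(full_len)                # positions of all keys
# A query must not attend to keys strictly in its future.
mask = (q_positions[:, None] < k_positions[None, :]).astype(np.float32) * -1e9
print(mask)  # first row masks only the last key; second row masks nothing
```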
@@ -288,10 +287,8 @@ class FastSampler:

      def sample(self, logits, temperature, top_k, top_p, token_freq, repetition_penalty):
          """Optimized sampling with vectorized operations."""
-         # Make a copy to avoid modifying original
          logits = logits.copy()

-         # Temperature scaling
          if temperature != 1.0:
              logits = logits / temperature
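FastSampler's full body is not part of this diff; as a rough illustration only, a plain-NumPy version of temperature, top-k and top-p sampling over a single logits vector might look like the following (the function and parameter names here are hypothetical, not the app's API).

```python
import numpy as np

def sample_token(logits, temperature=1.0, top_k=50, top_p=0.9, rng=None):
    rng = rng or np.random.default_rng()
    logits = logits.copy()
    if temperature != 1.0:
        logits = logits / temperature
    # Top-k: keep only the k largest logits
    if 0 < top_k < logits.shape[-1]:
        kth = np.sort(logits)[-top_k]
        logits[logits < kth] = -np.inf
    # Softmax
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # Top-p: smallest set of tokens whose cumulative mass reaches top_p
    order = np.argsort(-probs)
    cum = np.cumsum(probs[order])
    keep = order[:np.searchsorted(cum, top_p) + 1]
    p = probs[keep] / probs[keep].sum()
    return int(rng.choice(keep, p=p))

print(sample_token(np.random.randn(100).astype(np.float32)))
```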
@@ -415,43 +412,67 @@ if model:
      # Initialize fast sampler
      sampler = FastSampler(config['vocab_size'])

-     # Warm up with trace compilation
-     print("🔥 Warming up model and compiling traces...")
+     # Warm up the model (without tf.function first)
+     print("🔥 Warming up model...")
      warmup_input = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.int32)

-     # Warm up prefill
-     for _ in range(3):
+     # Initial warmup to build all internal caches
+     for _ in range(2):
          logits, past_kv = model(warmup_input, training=False, use_cache=True)

-     # Warm up decode step
+     # Warmup decode step
      single_token = tf.constant([[1]], dtype=tf.int32)
-     for _ in range(3):
+     for _ in range(2):
          logits, past_kv = model(single_token, training=False, past_kv=past_kv, use_cache=True)

-     print("✅ Model warmed up and traces compiled")
+     print("✅ Model warmed up")
+

      # ============================================================================
-     # Compiled Inference Functions
+     # Inference wrapper class for clean tf.function usage
      # ============================================================================

-     # Create tf.function wrapped inference for speed
-     @tf.function(reduce_retracing=True)
-     def model_prefill(input_ids):
-         """Compiled prefill function."""
-         return model(input_ids, training=False, use_cache=True)
-
-     @tf.function(reduce_retracing=True)
-     def model_decode(input_ids, past_kv):
-         """Compiled single-token decode function."""
-         return model(input_ids, training=False, past_kv=past_kv, use_cache=True)
-
-     # Additional warmup for compiled functions
-     print("🔥 Compiling tf.function traces...")
-     _ = model_prefill(warmup_input)
-     _ = model_decode(single_token, past_kv)
-     print("✅ Compiled functions ready")
+     class InferenceEngine:
+         """Wrapper for compiled inference functions."""
+
+         def __init__(self, model):
+             self.model = model
+             self._prefill_fn = None
+             self._decode_fn = None
+
+         def prefill(self, input_ids):
+             """Run prefill (first call builds trace)."""
+             if self._prefill_fn is None:
+                 # First call - run eagerly to ensure all caches are built
+                 return self.model(input_ids, training=False, use_cache=True)
+             return self._prefill_fn(input_ids)
+
+         def decode(self, input_ids, past_kv):
+             """Run single-token decode."""
+             return self.model(input_ids, training=False, past_kv=past_kv, use_cache=True)
+
+         def compile_traces(self):
+             """Compile tf.function traces after warmup."""
+             print("🔥 Compiling optimized traces...")
+
+             @tf.function(reduce_retracing=True)
+             def prefill_fn(input_ids):
+                 return self.model(input_ids, training=False, use_cache=True)
+
+             self._prefill_fn = prefill_fn
+
+             # Trace with sample inputs
+             sample_input = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.int32)
+             _ = self._prefill_fn(sample_input)
+
+             print("✅ Traces compiled")
+
+     # Create inference engine
+     engine = InferenceEngine(model)
+
+     # Compile traces after warmup
+     engine.compile_traces()


      # ============================================================================
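The hunk above replaces the module-level tf.function wrappers with an InferenceEngine that warms the model up eagerly before tracing. A toy sketch of that eager-warmup-then-trace pattern, using a stand-in Keras model rather than Sam-large-2:

```python
import tensorflow as tf

# Stand-in model; the real app uses the checkpointed transformer instead.
model = tf.keras.Sequential([tf.keras.layers.Embedding(100, 16),
                             tf.keras.layers.Dense(100)])

ids = tf.constant([[1, 2, 3]], dtype=tf.int32)

# 1) Eager warmup: builds the weights and any internal caches
_ = model(ids, training=False)

# 2) Only then compile a trace for the steady-state call
@tf.function(reduce_retracing=True)
def fast_forward(x):
    return model(x, training=False)

_ = fast_forward(ids)        # first call traces the graph
logits = fast_forward(ids)   # later calls reuse the compiled trace
print(logits.shape)          # (1, 3, 100)
```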
@@ -502,7 +523,7 @@ def generate_stream(
      input_tensor = tf.constant([input_ids], dtype=tf.int32)

      try:
-         logits, past_kv = model_prefill(input_tensor)
+         logits, past_kv = engine.prefill(input_tensor)
      except Exception as e:
          yield f"Error during prefill: {e}"
          return
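For orientation, a schematic of the prefill-then-decode streaming loop that generate_stream follows: the prompt is run once to seed the cache, then tokens are generated one at a time and the partial text is yielded after each step. The model call below is a stand-in, not the real engine from app.py.

```python
from typing import Iterator
import numpy as np

def fake_model_step(token_ids, past=None):
    # Stand-in for a model call: random logits over a 10-token vocab,
    # plus an opaque "cache" (here just a running token count).
    return np.random.randn(10), (past or 0) + len(token_ids)

def generate_stream_sketch(prompt_ids, max_new_tokens=5) -> Iterator[str]:
    logits, cache = fake_model_step(prompt_ids)             # prefill: whole prompt
    text = ""
    for _ in range(max_new_tokens):
        next_id = int(np.argmax(logits))                    # greedy for brevity
        text += f" tok{next_id}"
        yield text                                          # stream partial output
        logits, cache = fake_model_step([next_id], cache)   # decode: one new token

for partial in generate_stream_sketch([1, 2, 3]):
    print(partial)
```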
@@ -522,7 +543,7 @@
              yield generated_text + "\n\n*[Generation stopped]*"
              return

-         # Sample next token using optimized sampler
+         # Sample next token
          next_token_id = sampler.sample(
              next_token_logits, temperature, top_k, top_p, token_freq, repetition_penalty
          )
@@ -540,18 +561,18 @@ def generate_stream(
          token_count += 1
          yield generated_text

-         # === DECODE PHASE (single token, reuse cache) ===
+         # === DECODE PHASE ===
          next_input = tf.constant([[next_token_id]], dtype=tf.int32)

          try:
-             logits, past_kv = model_decode(next_input, past_kv)
+             logits, past_kv = engine.decode(next_input, past_kv)
          except Exception as e:
              yield generated_text + f"\n\n*[Error during generation: {e}]*"
              return

          next_token_logits = logits[0, -1, :].numpy()

-         # Truncate cache if too long (check less frequently)
+         # Truncate cache if too long
          if step % 100 == 99:
              current_len = past_kv[0][0].shape[2] if past_kv and past_kv[0] is not None else 0
              if current_len > max_context:
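A short sketch of the periodic cache-truncation check above; the cache layout (a list of (k, v) pairs shaped [batch, heads, seq, head_dim]) is inferred from past_kv[0][0].shape[2], and max_context here is an arbitrary example value.

```python
import tensorflow as tf

max_context = 8
past_kv = [(tf.zeros([1, 4, 12, 16]), tf.zeros([1, 4, 12, 16]))]

for step in range(300):
    # ... one decode step would append to past_kv here ...
    if step % 100 == 99:                       # only check every 100 steps
        current_len = past_kv[0][0].shape[2]
        if current_len > max_context:
            # Keep only the most recent max_context positions per layer
            past_kv = [(k[:, :, -max_context:, :], v[:, :, -max_context:, :])
                       for k, v in past_kv]

print(past_kv[0][0].shape)  # (1, 4, 8, 16)
```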
 