Spaces:

Smilyai-labs
/

Sam-Z-chat

Sleeping

Keeby-smilyai committed on Oct 23

Commit

8e2d20f

verified ·

1 Parent(s): ac2a3fe

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -33,38 +33,33 @@ class RotaryEmbedding(keras.layers.Layer):
         self.dim = dim
         self.max_len = max_len
         self.theta = theta
     def build(self, input_shape):
-        # FIXED: Compute in numpy first to avoid symbolic tensor issues
-        inv_freq = 1.0 / (self.theta ** (np.arange(0, self.dim, 2, dtype=np.float32) / self.dim))
-        t = np.arange(self.max_len, dtype=np.float32)
-        freqs = np.outer(t, inv_freq)
-        emb = np.concatenate([freqs, freqs], axis=-1)
-        # Create as non-trainable weights instead of tf.constant
-        self.cos_cached = self.add_weight(
-            name="cos_cached",
-            shape=(self.max_len, self.dim),
-            initializer=keras.initializers.Constant(np.cos(emb)),
-            trainable=False,
-            dtype=tf.float32
-        )
-        self.sin_cached = self.add_weight(
-            name="sin_cached",
-            shape=(self.max_len, self.dim),
-            initializer=keras.initializers.Constant(np.sin(emb)),
-            trainable=False,
-            dtype=tf.float32
-        )
         super().build(input_shape)
     def rotate_half(self, x):
         x1, x2 = tf.split(x, 2, axis=-1)
         return tf.concat([-x2, x1], axis=-1)
     def call(self, q, k):
         seq_len = tf.shape(q)[2]
         dtype = q.dtype
         cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]

         self.dim = dim
         self.max_len = max_len
         self.theta = theta
+        self.built_cache = False
     def build(self, input_shape):
+        # Use the ORIGINAL training code - compute cache on first call, not in build
         super().build(input_shape)
+    def _build_cache(self):
+        """Build RoPE cache on first forward pass"""
+        if not self.built_cache:
+            inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
+            t = tf.range(self.max_len, dtype=tf.float32)
+            freqs = tf.einsum("i,j->ij", t, inv_freq)
+            emb = tf.concat([freqs, freqs], axis=-1)
+            # Evaluate cos/sin with NumPy, then store the results as tf.constant tensors to avoid graph issues
+            self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
+            self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
+            self.built_cache = True
     def rotate_half(self, x):
         x1, x2 = tf.split(x, 2, axis=-1)
         return tf.concat([-x2, x1], axis=-1)
     def call(self, q, k):
+        # Build cache on first call (avoids build-time issues)
+        self._build_cache()
         seq_len = tf.shape(q)[2]
         dtype = q.dtype
         cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]