OzTianlu committed · verified
Commit 471fab3 · 1 Parent(s): 39fa0da

Upload 2 files

Files changed (1):
  1. MonoidForCausalLM.py +46 -18
MonoidForCausalLM.py CHANGED
@@ -97,6 +97,7 @@ except ImportError:
    return states, (log_acc, S)


+
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Config / 配置
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -452,6 +453,35 @@ class MonoidAttention(nn.Module):
        o = o.contiguous().view(B, 1, -1)
        return self.o_proj(o), new_state

+        # ═════════════════════════════════════════════════════════
+        # Inference prefill (use_cache=True, T>1): fused scan + readout
+        # 推理预填充 (use_cache=True, T>1): 融合扫描 + 读出
+        # ═════════════════════════════════════════════════════════
+        # Avoids materializing full [B,H,T,d,d] states tensor.
+        # Peak memory: O(H·d²) instead of O(T·H·d²).
+        # 避免实体化完整的 [B,H,T,d,d] 状态张量。
+        # 峰值内存: O(H·d²) 而非 O(T·H·d²)。
+        if use_cache:
+            S = self.h0.expand(B, -1, -1, -1).clone()  # [B,H,d,d]
+            log_acc = torch.zeros(B, H, 1, device=hidden_states.device, dtype=q.dtype)
+            o_parts = []
+            for t in range(T):
+                kv_t = torch.einsum('bhd, bhe -> bhde', k[:, :, t], v[:, :, t])
+                decay = torch.exp(log_alpha[:, :, t])  # [B,H,1]
+                while decay.dim() < S.dim():
+                    decay = decay.unsqueeze(-1)
+                S = S * decay + kv_t
+                o_parts.append(torch.einsum('bhd, bhde -> bhe', q[:, :, t], S))
+                log_acc = log_acc + log_alpha[:, :, t]
+
+            final_state = (log_acc, S)
+            if monoid_cache is not None:
+                monoid_cache.update(self.layer_idx, final_state)
+
+            o = torch.stack(o_parts, dim=2)  # [B,H,T,d]
+            o = o.transpose(1, 2).contiguous().view(B, T, -1)
+            return self.o_proj(o), final_state
+
        # ═════════════════════════════════════════════════════════
        # Training path (parallel scan): O(T) via prefix sum
        # 训练路径 (并行扫描): 通过前缀和 O(T)
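For readers tracking the change: the new prefill branch keeps a single [B,H,d,d] running state per head instead of going through the old parallel_scan_with_state path. Below is a minimal standalone sketch of the math it implements, checked against the discounted-prefix-sum closed form that the training path (parallel_scan plus the h0 term) evaluates; the shapes, the decay parameterization, and the helper names reference_prefill / reference_parallel are illustrative assumptions, not this repo's API.

import torch

def reference_prefill(q, k, v, log_alpha, h0):
    # Per-step recurrence (what the fused prefill loop does):
    #   S_t = exp(log_alpha_t) * S_{t-1} + k_t ⊗ v_t,  with S_0 = h0
    #   o_t = q_t · S_t
    # Assumed shapes: q, k, v [B,H,T,d]; log_alpha [B,H,T,1]; h0 [B,H,d,d].
    T = q.shape[2]
    S = h0.clone()
    outs = []
    for t in range(T):
        decay = torch.exp(log_alpha[:, :, t]).unsqueeze(-1)              # [B,H,1,1]
        S = S * decay + torch.einsum('bhd, bhe -> bhde', k[:, :, t], v[:, :, t])
        outs.append(torch.einsum('bhd, bhde -> bhe', q[:, :, t], S))
    return torch.stack(outs, dim=2)                                      # [B,H,T,d]

def reference_parallel(q, k, v, log_alpha, h0):
    # Closed form the training path evaluates (the repo's parallel_scan
    # presumably computes it more cleverly):
    #   S_t = sum_{s<=t} exp(cum_t - cum_s) k_s ⊗ v_s + exp(cum_t) h0
    cum = torch.cumsum(log_alpha.squeeze(-1), dim=2)                     # [B,H,T]
    w = torch.exp(cum)[..., None, None]                                  # [B,H,T,1,1]
    kv = torch.einsum('bhtd, bhte -> bhtde', k, v)
    states = torch.cumsum(kv / w, dim=2) * w + h0.unsqueeze(2) * w
    return torch.einsum('bhtd, bhtde -> bhte', q, states)

B, H, T, d = 2, 3, 5, 4
q, k, v = (torch.randn(B, H, T, d) for _ in range(3))
log_alpha = -torch.rand(B, H, T, 1)      # log of a decay in (0, 1]
h0 = torch.randn(B, H, d, d)
assert torch.allclose(reference_prefill(q, k, v, log_alpha, h0),
                      reference_parallel(q, k, v, log_alpha, h0), atol=1e-4)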
@@ -467,16 +497,9 @@
        # Batch outer product: kv_{t} = k_t ⊗ v_t for all t
        # 批量外积: kv_{t} = k_t ⊗ v_t, 对所有 t
        kv = torch.einsum('bhtd, bhte -> bhtde', k, v)  # [B,H,T,d,d]
-
-        if use_cache:
-            # Prefill with state extraction (for switching to RNN inference)
-            # 带状态提取的预填充 (用于切换到 RNN 推理)
-            states, final_state = parallel_scan_with_state(log_alpha, kv)
-        else:
-            # Pure training, no state needed
-            # 纯训练, 不需要状态
-            states = parallel_scan(log_alpha, kv)
-            final_state = None
+        states = parallel_scan(log_alpha, kv)
+        del kv  # free [B,H,T,d,d] early
+        final_state = None

        # ── Incorporate h0: make training consistent with inference ──
        # ── 融入 h0: 使训练与推理一致 ──
@@ -489,17 +512,12 @@
        cum_log_decay = torch.cumsum(log_alpha.squeeze(-1), dim=2)  # [B,H,T]
        cum_decay = torch.exp(cum_log_decay).unsqueeze(-1).unsqueeze(-1)  # [B,H,T,1,1]
        states = states + self.h0.unsqueeze(2) * cum_decay  # [B,H,T,d,d]
-
-        if use_cache:
-            # Update final_state to include h0 contribution
-            # 更新最终状态以包含 h0 的贡献
-            final_state = (final_state[0], states[:, :, -1])
-            if monoid_cache is not None:
-                monoid_cache.update(self.layer_idx, final_state)
+        del cum_decay

        # Readout: o_t = q_t · S_t for all t simultaneously
        # 读出: o_t = q_t · S_t, 对所有 t 同时计算
        o = torch.einsum('bhtd, bhtde -> bhte', q, states)
+        del states  # free [B,H,T,d,d]
        o = o.transpose(1, 2).contiguous().view(B, T, -1)
        return self.o_proj(o), final_state

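To put numbers on why the two del statements (and the separate prefill branch above) matter: the training path still materializes both kv and states at [B,H,T,d,d]. A back-of-the-envelope check with purely illustrative dimensions, not taken from this repo's config:

# Hypothetical sizes for illustration only
B, H, T, d = 8, 16, 2048, 64
full = B * H * T * d * d           # 1,073,741,824 elements per [B,H,T,d,d] tensor
print(full * 4 / 2**30)            # 4.0 GiB in fp32; kv and states are each this size
running = B * H * d * d            # 524,288 elements
print(running * 4 / 2**20)         # 2.0 MiB: the [B,H,d,d] state the fused prefill keeps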
 
@@ -576,6 +594,8 @@ class MonoidPreTrainedModel(PreTrainedModel):
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

+        if isinstance(module, MonoidAttention):
+            nn.init.constant_(module.decay_proj.bias, 4.0)

class MonoidModel(MonoidPreTrainedModel):
    """
@@ -609,7 +629,15 @@ class MonoidModel(MonoidPreTrainedModel):

        hidden_states = inputs_embeds
        for layer in self.layers:
-            hidden_states = layer(hidden_states, monoid_cache=monoid_cache, use_cache=use_cache)
+            if self.gradient_checkpointing and self.training and not use_cache:
+                hidden_states = self._gradient_checkpointing_func(
+                    layer.__call__,
+                    hidden_states,
+                    monoid_cache,
+                    use_cache,
+                )
+            else:
+                hidden_states = layer(hidden_states, monoid_cache=monoid_cache, use_cache=use_cache)

        hidden_states = self.norm(hidden_states)
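The new branch plugs into the standard Hugging Face checkpointing machinery: in recent transformers versions, self.gradient_checkpointing and self._gradient_checkpointing_func are set by PreTrainedModel.gradient_checkpointing_enable(), which also requires the class to declare supports_gradient_checkpointing = True (not visible in this diff). A usage sketch with a placeholder repo id:

from transformers import AutoModelForCausalLM

# Placeholder repo id; substitute the actual model repository.
model = AutoModelForCausalLM.from_pretrained("OzTianlu/Monoid", trust_remote_code=True)
model.gradient_checkpointing_enable()   # enables the checkpointed branch in MonoidModel
model.train()                           # training mode; keep use_cache=False so it is taken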
 
 