NoesisLab
/

Spartacus-1B-Instruct

Text Generation

linear-attention

Model card Files Files and versions

OzTianlu committed 4 days ago

Commit

39fa0da

·

verified ·

1 Parent(s): b0097d1

Upload MonoidForCausalLM.py

Files changed (1) hide show

MonoidForCausalLM.py +33 -1

MonoidForCausalLM.py CHANGED Viewed

@@ -62,7 +62,39 @@ from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin, Aut
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm
-from monoid_scan_cuda import parallel_scan, parallel_scan_with_state
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm
+try:
+    from monoid_scan_cuda import parallel_scan, parallel_scan_with_state
+except ImportError:
+    # Pure-PyTorch fallback (sequential scan) — works on CPU / MPS / any device.
+    # Slower than the fused CUDA kernel; evaluates the same recurrence (bit-for-bit equality with the kernel not verified).
+    def parallel_scan(log_alpha: Tensor, kv: Tensor) -> Tensor:
+        """Step-by-step evaluation of S_t = exp(log_α_t)·S_{t-1} + kv_t, starting from S = 0.
+
+        ``kv`` must be 5-D; its sizes are unpacked as (batch, heads, steps, rows, cols)
+        and the recurrence advances along dimension 2.  ``log_alpha`` is indexed the
+        same way (``log_alpha[:, :, t]``) and is presumably [B, H, T, 1] — TODO confirm
+        against callers.  The returned tensor has kv's shape, dtype and device, and
+        its slice ``[:, :, t]`` holds S_t.
+        """
+        batch, heads, steps, rows, cols = kv.shape
+        like_kv = {"device": kv.device, "dtype": kv.dtype}
+        out = torch.zeros(batch, heads, steps, rows, cols, **like_kv)
+        running = torch.zeros(batch, heads, rows, cols, **like_kv)
+        for step in range(steps):
+            gate = torch.exp(log_alpha[:, :, step])
+            # Append singleton axes until the gate has as many dims as the state,
+            # so the multiplication below broadcasts over the trailing axes.
+            for _ in range(running.dim() - gate.dim()):
+                gate = gate.unsqueeze(-1)
+            running = running * gate + kv[:, :, step]
+            out[:, :, step] = running
+        return out
+    def parallel_scan_with_state(log_alpha: Tensor, kv: Tensor):
+        """Run the sequential scan and also hand back the carry state after the last step.
+
+        Evaluates S_t = exp(log_α_t)·S_{t-1} + kv_t from S = 0 along dimension 2 of
+        ``kv`` (which must be 5-D: batch, heads, steps, rows, cols).  ``log_alpha`` is
+        indexed as ``log_alpha[:, :, t]`` and is presumably [B, H, T, 1] — TODO confirm
+        against callers.
+
+        Returns ``(states, (log_acc, S))`` where ``states[:, :, t]`` is S_t,
+        ``log_acc`` is the sum of ``log_alpha[:, :, t]`` over all steps (seeded as
+        zeros of shape [B, H, 1]), and ``S`` is the state after the final step.
+        """
+        batch, heads, steps, rows, cols = kv.shape
+        like_kv = {"device": kv.device, "dtype": kv.dtype}
+        out = torch.zeros(batch, heads, steps, rows, cols, **like_kv)
+        running = torch.zeros(batch, heads, rows, cols, **like_kv)
+        log_total = torch.zeros(
+            batch, heads, 1, device=log_alpha.device, dtype=log_alpha.dtype
+        )
+        for step in range(steps):
+            gate = torch.exp(log_alpha[:, :, step])
+            # Append singleton axes until the gate has as many dims as the state,
+            # so the multiplication below broadcasts over the trailing axes.
+            for _ in range(running.dim() - gate.dim()):
+                gate = gate.unsqueeze(-1)
+            running = running * gate + kv[:, :, step]
+            out[:, :, step] = running
+            # Accumulate the total log-decay applied across the whole sequence.
+            log_total = log_total + log_alpha[:, :, step]
+        return out, (log_total, running)
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━