microsoft
/

phi-1

@@ -8,7 +8,8 @@ from __future__ import annotations
 import math
 from dataclasses import dataclass, field
-from typing import Any, Dict, Optional, Tuple, Union
 import torch
 import torch.nn as nn
@@ -31,6 +32,15 @@ except:
     FusedDense = None
 @dataclass
 class InferenceParams:
     """Inference parameters passed to model to efficiently calculate
@@ -218,7 +228,10 @@ class RotaryEmbedding(nn.Module):
         return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
     def _update_cos_sin_cache(
-        self, seqlen: int, device: Optional[str] = None, dtype: Optional[torch.dtype] = None
     ) -> None:
         self._seq_len_cached = seqlen
@@ -261,14 +274,30 @@ class RotaryEmbedding(nn.Module):
         seq_start = seqlen_offset
         seq_end = seq_start + qkv.shape[1]
-        if self._cos_cached.device != qkv.device or self._cos_cached.dtype != qkv.dtype or (self.training and self._cos_cached.is_inference()):
             self._update_cos_sin_cache(self.max_position_embeddings, device=qkv.device, dtype=qkv.dtype)
         if kv is None:
-            return _apply_rotary_emb_qkv(qkv, self._cos_cached[seq_start:seq_end], self._sin_cached[seq_start:seq_end])
         else:
-            q = _apply_rotary_emb(qkv, self._cos_cached[seq_start:seq_end], self._sin_cached[seq_start:seq_end])
-            kv = _apply_rotary_emb_kv(kv, self._cos_cached[seq_start:seq_end], self._sin_cached[seq_start:seq_end])
             return q, kv
@@ -327,6 +356,7 @@ class SelfAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
     def forward(
         self,
         qkv: torch.FloatTensor,
@@ -337,9 +367,14 @@ class SelfAttention(nn.Module):
         batch_size, seqlen = qkv.shape[0], qkv.shape[1]
         q, k, v = qkv.unbind(dim=2)
         causal = self.causal if causal is None else causal
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
         if key_padding_mask is not None:
@@ -352,7 +387,7 @@ class SelfAttention(nn.Module):
             causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
             scores = scores + causal_mask.to(dtype=scores.dtype)
-        attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
         attention = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention, v)
@@ -380,6 +415,7 @@ class CrossAttention(nn.Module):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
     def forward(
         self,
         q: torch.FloatTensor,
@@ -395,9 +431,14 @@ class CrossAttention(nn.Module):
             kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
         k, v = kv.unbind(dim=2)
         causal = self.causal if causal is None else causal
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
         if key_padding_mask is not None:
@@ -418,7 +459,7 @@ class CrossAttention(nn.Module):
             scores = scores.masked_fill(causal_mask, -10000.0)
-        attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
         attention = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention, v)
@@ -507,7 +548,13 @@ class MHA(nn.Module):
             if rotary_cls is RotaryEmbedding:
                 rotary_kwargs["max_position_embeddings"] = config.n_positions
-            self.rotary_emb = rotary_cls(self.rotary_dim, base=rotary_base, scale_base=rotary_scale_base, device=device, **rotary_kwargs)
         # MLP
         self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
@@ -532,9 +579,15 @@ class MHA(nn.Module):
         if cross_attn_cls is None:
             cross_attn_cls = CrossAttention
-        self.inner_attn = attn_cls(causal=causal, softmax_scale=softmax_scale, attention_dropout=config.attn_pdrop)
         self.inner_cross_attn = cross_attn_cls(
-            causal=causal, softmax_scale=softmax_scale, attention_dropout=config.attn_pdrop
         )
         self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
@@ -603,7 +656,12 @@ class MHA(nn.Module):
             batch_size, seqlen_q = q.shape[0], q.shape[1]
             seqlen_k = kv.shape[1]
-            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = None, None, None, None
             if key_padding_mask is not None:
                 kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
@@ -644,7 +702,11 @@ class MHA(nn.Module):
         if self.checkpointing:
             return torch.utils.checkpoint.checkpoint(
-                self.inner_cross_attn, q, kv, key_padding_mask=key_padding_mask, causal=causal
             )
         return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)

 import math
 from dataclasses import dataclass, field
+from functools import wraps
+from typing import Any, Callable, Dict, Optional, Tuple, Union
 import torch
 import torch.nn as nn
     FusedDense = None
+def disable_autocast(func: Callable) -> Callable:  # Decorator: returns a wrapper that calls `func` with CUDA autocast turned off.
+    @wraps(func)  # copy `func`'s metadata (name, docstring) onto the wrapper
+    def wrapper(*args, **kwargs):  # accepts and forwards any positional/keyword arguments unchanged
+        with torch.cuda.amp.autocast(enabled=False):  # autocast is disabled only for the duration of this call
+            return func(*args, **kwargs)  # result of `func` is returned as-is
+    return wrapper  # the caller uses this in place of `func`
 @dataclass
 class InferenceParams:
     """Inference parameters passed to model to efficiently calculate
         return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
     def _update_cos_sin_cache(
+        self,
+        seqlen: int,
+        device: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
     ) -> None:
         self._seq_len_cached = seqlen
         seq_start = seqlen_offset
         seq_end = seq_start + qkv.shape[1]
+        if (
+            self._cos_cached.device != qkv.device
+            or self._cos_cached.dtype != qkv.dtype
+            or (self.training and self._cos_cached.is_inference())
+        ):
             self._update_cos_sin_cache(self.max_position_embeddings, device=qkv.device, dtype=qkv.dtype)
         if kv is None:
+            return _apply_rotary_emb_qkv(
+                qkv,
+                self._cos_cached[seq_start:seq_end],
+                self._sin_cached[seq_start:seq_end],
+            )
         else:
+            q = _apply_rotary_emb(
+                qkv,
+                self._cos_cached[seq_start:seq_end],
+                self._sin_cached[seq_start:seq_end],
+            )
+            kv = _apply_rotary_emb_kv(
+                kv,
+                self._cos_cached[seq_start:seq_end],
+                self._sin_cached[seq_start:seq_end],
+            )
             return q, kv
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
+    @disable_autocast
     def forward(
         self,
         qkv: torch.FloatTensor,
         batch_size, seqlen = qkv.shape[0], qkv.shape[1]
         q, k, v = qkv.unbind(dim=2)
+        q = q.to(torch.float32)
+        k = k.to(torch.float32)
         causal = self.causal if causal is None else causal
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
+        # using float16, which might lead to overflow
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
         if key_padding_mask is not None:
             causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
             scores = scores + causal_mask.to(dtype=scores.dtype)
+        attention = torch.softmax(scores, dim=-1).to(v.dtype)
         attention = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention, v)
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)
+    @disable_autocast
     def forward(
         self,
         q: torch.FloatTensor,
             kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
         k, v = kv.unbind(dim=2)
+        q = q.to(torch.float32)
+        k = k.to(torch.float32)
         causal = self.causal if causal is None else causal
         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
+        # using float16, which might lead to overflow
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
         if key_padding_mask is not None:
             scores = scores.masked_fill(causal_mask, -10000.0)
+        attention = torch.softmax(scores, dim=-1).to(v.dtype)
         attention = self.drop(attention)
         output = torch.einsum("bhts,bshd->bthd", attention, v)
             if rotary_cls is RotaryEmbedding:
                 rotary_kwargs["max_position_embeddings"] = config.n_positions
+            self.rotary_emb = rotary_cls(
+                self.rotary_dim,
+                base=rotary_base,
+                scale_base=rotary_scale_base,
+                device=device,
+                **rotary_kwargs,
+            )
         # MLP
         self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
         if cross_attn_cls is None:
             cross_attn_cls = CrossAttention
+        self.inner_attn = attn_cls(
+            causal=causal,
+            softmax_scale=softmax_scale,
+            attention_dropout=config.attn_pdrop,
+        )
         self.inner_cross_attn = cross_attn_cls(
+            causal=causal,
+            softmax_scale=softmax_scale,
+            attention_dropout=config.attn_pdrop,
         )
         self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
             batch_size, seqlen_q = q.shape[0], q.shape[1]
             seqlen_k = kv.shape[1]
+            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
+                None,
+                None,
+                None,
+                None,
+            )
             if key_padding_mask is not None:
                 kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
         if self.checkpointing:
             return torch.utils.checkpoint.checkpoint(
+                self.inner_cross_attn,
+                q,
+                kv,
+                key_padding_mask=key_padding_mask,
+                causal=causal,
             )
         return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)