# Copyright 2024 MIT Han Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import os
from typing import Optional, Tuple

import torch
from torch import nn
from torch.nn import functional as F

class LiteMLA(nn.Module):
    r"""Lightweight multi-scale linear attention."""

    # Value used to pad an extra row onto `v`; after the matmuls in
    # `attn_matmul`, that row holds the normalization denominator.
    PAD_VAL = 1

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        heads: Optional[int] = None,
        heads_ratio: float = 1.0,
        dim: int = 32,
        kernel_func="relu",
        scales: Optional[Tuple[int, ...]] = (5,),
        eps: float = 1e-15,
        use_bias: bool = False,
        norm=(None, "bn2d"),
        act=(None, None),
    ):
        heads = heads or int(out_dim // dim * heads_ratio)
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.heads = heads
        self.dim = dim
        self.scales = scales
        self.eps = eps

        # Multi-scale aggregation is disabled in this variant.
        self.aggreg = None
        scales = ()

        # The attention kernel is fixed to (non-inplace) ReLU.
        self.kernel_func = nn.ReLU(inplace=False)

        self.qkv = nn.Linear(in_dim, in_dim * 3, bias=use_bias)
        self.proj = nn.Linear(out_dim, out_dim)

    def attn_matmul(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        # Lightweight linear attention: apply the ReLU kernel to q and k, then
        # compute (v @ k) @ q so the cost scales linearly with the sequence length N.
        q = self.kernel_func(q)  # B, h, h_d, N
        k = self.kernel_func(k)  # B, h, N, h_d

        # Optionally upcast to fp32; helps avoid NaN losses in mixed precision.
        use_fp32_attention = getattr(self, "fp32_attention", False)
        if use_fp32_attention:
            q, k, v = q.float(), k.float(), v.float()

        # Pad v with an extra row of PAD_VAL (ones); the last row of the result
        # then accumulates the per-query normalization denominator.
        v = F.pad(v, (0, 0, 0, 1), mode="constant", value=LiteMLA.PAD_VAL)
        vk = torch.matmul(v, k)    # B, h, h_d + 1, h_d
        out = torch.matmul(vk, q)  # B, h, h_d + 1, N
        if out.dtype in [torch.float16, torch.bfloat16]:
            out = out.float()
        out = out[:, :, :-1] / (out[:, :, -1:] + self.eps)  # normalize and drop the pad row
        return out

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape

        # Project to q, k, v in one shot: B, N, 3C -> B, 3, C, N -> three B, C, N tensors.
        qkv = self.qkv(x).reshape(B, N, 3, C).permute(0, 2, 3, 1)
        q, k, v = qkv.unbind(1)
        dtype = q.dtype

        # Split channels into heads of size self.dim.
        q = q.reshape(B, C // self.dim, self.dim, N)                    # B, h, h_d, N
        k = k.reshape(B, C // self.dim, self.dim, N).transpose(-1, -2)  # B, h, N, h_d
        v = v.reshape(B, C // self.dim, self.dim, N)                    # B, h, h_d, N

        out = self.attn_matmul(q, k, v).to(dtype)

        out = out.view(B, C, N).permute(0, 2, 1)  # B, N, C
        out = self.proj(out)
        return out

    def module_str(self) -> str:
        _str = type(self).__name__ + "("
        eps = f"{self.eps:.1E}"
        _str += f"i={self.in_dim},o={self.out_dim},h={self.heads},d={self.dim},eps={eps}"
        return _str + ")"

    def __repr__(self):
        return f"EPS{self.eps}-" + super().__repr__()