configuration_super_linear.py
CHANGED

@@ -1,21 +1,14 @@
 from typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
+from configuration_super_linear_base import SuperLinearConfigBase
 
-from transformers import (
-    PretrainedConfig,
-    PreTrainedModel,
-    GenerationMixin,
-    AutoConfig,
-    AutoModelForCausalLM,
-)
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
 # 1) --------------------------------------------------------------------------
 # CONFIG
 # -----------------------------------------------------------------------------
 
 
-class SuperLinearConfig(PretrainedConfig):
+class SuperLinearConfig(SuperLinearConfigBase):
     """
     Configuration for the SuperLinear MoE time–series foundation model.
     Only *model_type* must be unique inside transformers; the rest mirrors
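The hunk above swaps the direct `PretrainedConfig` parent for a shared base class. A minimal sketch of what this buys, assuming `SuperLinearConfig` does not override `__init__` (only the class header is visible in this hunk): the subclass inherits the full parameter set from the base, so the config variants stay in sync automatically.

    # Minimal sketch, not part of the commit: the subclass now inherits every
    # default from SuperLinearConfigBase (assumes it does not redefine __init__).
    from configuration_super_linear_base import SuperLinearConfigBase
    from configuration_super_linear import SuperLinearConfig

    cfg = SuperLinearConfig()                      # defaults come from the base __init__
    assert isinstance(cfg, SuperLinearConfigBase)  # holds, which enables a shared config_class
    print(cfg.seq_len, cfg.moe_n_experts)          # 512 12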
configuration_super_linear_base.py
ADDED

@@ -0,0 +1,84 @@
+from typing import Optional, Tuple
+import torch, torch.nn as nn, torch.nn.functional as F
+
+from transformers import (
+    PretrainedConfig,
+    PreTrainedModel,
+    GenerationMixin,
+    AutoConfig,
+    AutoModelForCausalLM,
+)
+from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+
+# 1) --------------------------------------------------------------------------
+# CONFIG
+# -----------------------------------------------------------------------------
+
+
+class SuperLinearConfigBase(PretrainedConfig):
+    """
+    Configuration for the SuperLinear MoE time–series foundation model.
+    Only *model_type* must be unique inside transformers; the rest mirrors
+    the __init__ arguments of your original Config object.
+    """
+
+    model_type = "super_linear"
+
+    def __init__(
+        self,
+        seq_len=512,
+        pred_len=96,
+        inf_pred_len=96,
+        max_horizon=96,
+        moe_n_experts=12,
+        top_k_experts=5,
+        moe=1,
+        freq_experts='mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
+        auto_regressive=1,
+        d_model=128,
+        dropout=0.0,
+        fft_len=5000,
+        freeze_experts=1,
+        layer_type="RLinear",
+        linear_checkpoints_dir="checkpoints5",
+        linear_checkpoints_path="/cs/azencot_fsas/MoE/",
+        load_linear=0,
+        load_weights=0,
+        misc_moe=10,
+        mlp_gating=0,
+        moe_norm=1,
+        model_type="super_linear",
+        moe_temp=1,
+        noisy_gating_std=0.1,
+        noisy_gating_std_decay=1,
+        torch_dtype="float32",
+        transformers_version="4.40.1",
+        use_fft=1,
+        **kwargs,  # any extra CLI args
+    ):
+        self.seq_len = seq_len
+        self.moe = moe
+        self.pred_len = pred_len
+        self.inf_pred_len = inf_pred_len
+        self.max_horizon = max_horizon
+        self.auto_regressive = auto_regressive
+        self.moe_n_experts = moe_n_experts
+        self.top_k_experts = top_k_experts
+        self.freq_experts = freq_experts
+        self.freeze_experts = freeze_experts
+        self.layer_type = layer_type
+        self.linear_checkpoints_path = linear_checkpoints_path
+        self.linear_checkpoints_dir = linear_checkpoints_dir
+        self.load_linear = load_linear
+        self.load_weights = load_weights
+        self.misc_moe = misc_moe
+        self.noisy_gating_std = noisy_gating_std
+        self.noisy_gating_std_decay = noisy_gating_std_decay
+        self.d_model = d_model
+        self.mlp_gating = mlp_gating
+        self.moe_norm = moe_norm
+        self.moe_temp = moe_temp
+        self.use_fft = use_fft
+        self.fft_len = fft_len
+        self.dropout = dropout
+        super().__init__(**kwargs)
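The new base class holds the entire attribute set. Note that `model_type`, `torch_dtype`, and `transformers_version` are accepted as named `__init__` parameters but never assigned or forwarded, so they are effectively swallowed; the serialized `model_type` comes from the class attribute instead. A minimal round-trip sketch, assuming the flat import layout shown in the diff (the `./sl_cfg` path is a placeholder):

    # Round-trip sketch (assumed usage, not part of the commit); save_pretrained
    # and from_pretrained are inherited from PretrainedConfig.
    from configuration_super_linear_base import SuperLinearConfigBase

    cfg = SuperLinearConfigBase(seq_len=1024, top_k_experts=3)
    cfg.save_pretrained("./sl_cfg")        # writes config.json with model_type "super_linear"
    cfg2 = SuperLinearConfigBase.from_pretrained("./sl_cfg")
    assert cfg2.seq_len == 1024 and cfg2.top_k_experts == 3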
configuration_super_linear_fs.py
CHANGED

@@ -1,13 +1,7 @@
 from typing import Optional, Tuple
 import torch, torch.nn as nn, torch.nn.functional as F
 
-from transformers import (
-    PretrainedConfig,
-    PreTrainedModel,
-    GenerationMixin,
-    AutoConfig,
-    AutoModelForCausalLM,
-)
+from configuration_super_linear_base import SuperLinearConfigBase
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 
 # 1) --------------------------------------------------------------------------

@@ -15,7 +9,7 @@ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 # -----------------------------------------------------------------------------
 
 
-class SuperLinearConfigFS(PretrainedConfig):
+class SuperLinearConfigFS(SuperLinearConfigBase):
     """
     Configuration for the SuperLinear MoE time–series foundation model.
     Only *model_type* must be unique inside transformers; the rest mirrors
modeling_super_linear.py
CHANGED

@@ -6,6 +6,7 @@ from transformers import (PreTrainedModel,GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 from .configuration_super_linear import SuperLinearConfig
 from .configuration_super_linear_fs import SuperLinearConfigFS
+from .configuration_super_linear_base import SuperLinearConfigBase
 from typing import Tuple, Union
 
 

@@ -547,7 +548,7 @@ class superLinear(nn.Module):
 "-------------------------------------------------------------------------------------------------------------------"
 class SuperLinearForCausalLM(PreTrainedModel, GenerationMixin):
 
-    config_class = SuperLinearConfig
+    config_class = SuperLinearConfigBase
 
     def __init__(self, config: Union[SuperLinearConfig, SuperLinearConfigFS]):
         super().__init__(config)
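Pointing `config_class` at the shared base is what makes the `Union` type hint workable: both variants are now instances of `SuperLinearConfigBase`, so the same model class accepts either config, and `from_pretrained` can resolve a config through the base when none is passed explicitly. A hypothetical construction sketch; the `super_linear` package name is a placeholder matching the relative imports above, not something the commit defines:

    # Hypothetical sketch, not part of the commit: one model class, two config variants.
    from super_linear.modeling_super_linear import SuperLinearForCausalLM
    from super_linear.configuration_super_linear import SuperLinearConfig
    from super_linear.configuration_super_linear_fs import SuperLinearConfigFS

    model_a = SuperLinearForCausalLM(SuperLinearConfig())    # standard variant
    model_b = SuperLinearForCausalLM(SuperLinearConfigFS())  # "FS" variant, same modeling code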