lirannoc committed
Commit 7cccffc · verified · 1 Parent(s): ad2aa43

Update configuration_super_linear.py

Files changed (1)
  1. configuration_super_linear.py +5 -60
configuration_super_linear.py CHANGED
@@ -1,13 +1,6 @@
 from typing import Optional, Tuple
 
-from transformers import (
-    PretrainedConfig,
-    PreTrainedModel,
-    GenerationMixin,
-    AutoConfig,
-    AutoModelForCausalLM,
-)
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+from transformers import PretrainedConfig
 
 # 1) --------------------------------------------------------------------------
 # CONFIG
@@ -28,23 +21,16 @@ class SuperLinearConfig(PretrainedConfig):
         # Model architecture parameters
         train_seq_len=512,
         train_pred_len=96,
-        seq_len=512,
-        pred_len=96,
-        inf_pred_len=96,
-        max_horizon=96,
-        auto_regressive=1,
-
+
         # MoE parameters
-        moe_n_experts=4,
+        n_experts=4,
         top_k_experts=12,
         noisy_gating_std=0.1,
         moe_temp=1.0,
         moe_norm=False,
         layer_type='RLinear',
-        n_experts=4,
         comp_moe=12,
         freeze_experts=True,
-        moe=1,
 
         # FFT-based gating parameters
         use_fft=True,
@@ -53,46 +39,23 @@ class SuperLinearConfig(PretrainedConfig):
         # Expert configuration
         freq_experts='mean_naive_1/4_1/6_1/7_1/8_1/12_1/14_1/16_1/21_1/24_1/28_1/30_1/32_1/36_1/42_1/48_1/52_1/56_1/60_1/72_1/84_1/90_1/96_1/120_1/144_1/168_1/180_1/224_1/252_1/288_1/336_1/365_1/504_1/672_1/1008_1/1440_1/2016_1/3600',
 
-        # Model loading and saving
-        load_linear=True,
-        load_weights_full=True,
-        linear_freq_weights_path='./weights/linear_freq_weights/',
-        full_weights_path='./weights/full_weights/checkpoint.pth',
-
         # Training parameters
         resample_long_lookback=False,
 
-        # Legacy parameters for backward compatibility
-        linear_checkpoints_path='/cs/azencot_fsas/MoE/',
-        linear_checkpoints_dir="checkpoints5",
-        manual_moe=0,
-        misc_moe=1,
-        noisy_gating_std_decay=1,
-        ker_len=50,
-        con=0,
-        d_model=512,
-        mlp_gating=1,
-        dropout=0.0,
         **kwargs,
     ):
         # Model architecture parameters
        self.train_seq_len = train_seq_len
         self.train_pred_len = train_pred_len
-        self.seq_len = seq_len
-        self.pred_len = pred_len
-        self.inf_pred_len = inf_pred_len
-        self.max_horizon = max_horizon
-        self.auto_regressive = auto_regressive
 
         # MoE parameters
         self.moe = moe
-        self.moe_n_experts = moe_n_experts
+        self.n_experts = n_experts
         self.top_k_experts = top_k_experts
         self.noisy_gating_std = noisy_gating_std
         self.moe_temp = moe_temp
         self.moe_norm = moe_norm
         self.layer_type = layer_type
-        self.n_experts = n_experts
         self.comp_moe = comp_moe
         self.freeze_experts = freeze_experts
 
@@ -103,25 +66,7 @@ class SuperLinearConfig(PretrainedConfig):
         # Expert configuration
         self.freq_experts = freq_experts
 
-        # Model loading and saving
-        self.load_linear = load_linear
-        self.load_weights_full = load_weights_full
-        self.linear_freq_weights_path = linear_freq_weights_path
-        self.full_weights_path = full_weights_path
-
         # Training parameters
         self.resample_long_lookback = resample_long_lookback
 
-        # Legacy parameters for backward compatibility
-        self.linear_checkpoints_path = linear_checkpoints_path
-        self.linear_checkpoints_dir = linear_checkpoints_dir
-        self.manual_moe = manual_moe
-        self.misc_moe = misc_moe
-        self.noisy_gating_std_decay = noisy_gating_std_decay
-        self.ker_len = ker_len
-        self.con = con
-        self.d_model = d_model
-        self.mlp_gating = mlp_gating
-        self.dropout = dropout
-
-        super().__init__(**kwargs)
+        super().__init__(**kwargs)
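
For reference, a minimal sketch (not part of this commit) of how the slimmed-down config might be exercised through the standard PretrainedConfig save/load API. It assumes SuperLinearConfig is importable from configuration_super_linear.py and that its __init__ runs cleanly with the defaults visible in the diff above.

# Hedged sketch, not part of this commit: round-trip the updated config via the
# standard PretrainedConfig serialization API. Assumes SuperLinearConfig is
# importable from configuration_super_linear.py and that __init__ accepts the
# keyword defaults shown in the diff.
from configuration_super_linear import SuperLinearConfig

config = SuperLinearConfig(
    train_seq_len=512,     # lookback length used at training time
    train_pred_len=96,     # forecast horizon used at training time
    n_experts=4,           # only n_experts remains; moe_n_experts was dropped in this commit
    top_k_experts=12,
    layer_type="RLinear",
)

config.save_pretrained("./super_linear_config")    # writes config.json
reloaded = SuperLinearConfig.from_pretrained("./super_linear_config")
assert reloaded.n_experts == config.n_experts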