martinkorelic committed
Commit 13fd163 · verified · 1 Parent(s): 2701c6a

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/adapter_config.json +39 -0
  2. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/eval_results.json +4 -0
  3. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/training_configuration.json +38 -0
  4. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/training_logs.json +625 -0
  5. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/adapter_config.json +39 -0
  6. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/eval_results.json +4 -0
  7. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/training_configuration.json +38 -0
  8. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/training_logs.json +625 -0
  9. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/adapter_config.json +39 -0
  10. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/eval_results.json +4 -0
  11. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/training_configuration.json +38 -0
  12. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/training_logs.json +625 -0
  13. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/adapter_config.json +39 -0
  14. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/eval_results.json +4 -0
  15. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/training_configuration.json +38 -0
  16. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/training_logs.json +1273 -0
  17. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/adapter_config.json +39 -0
  18. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/eval_results.json +4 -0
  19. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/training_configuration.json +38 -0
  20. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/training_logs.json +2659 -0
  21. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/adapter_config.json +39 -0
  22. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/eval_results.json +4 -0
  23. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_configuration.json +38 -0
  24. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_logs.json +0 -0
  25. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/adapter_config.json +39 -0
  26. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/eval_results.json +4 -0
  27. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/training_configuration.json +38 -0
  28. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/training_logs.json +0 -0
  29. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/adapter_config.json +39 -0
  30. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/eval_results.json +4 -0
  31. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/training_configuration.json +38 -0
  32. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/training_logs.json +0 -0
  33. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/adapter_config.json +39 -0
  34. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/eval_results.json +4 -0
  35. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/training_configuration.json +38 -0
  36. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/training_logs.json +0 -0
  37. TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r8-a2/training_configuration.json +38 -0
  38. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/adapter_config.json +40 -0
  39. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/eval_results.json +4 -0
  40. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/training_configuration.json +38 -0
  41. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/training_logs.json +625 -0
  42. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/adapter_config.json +40 -0
  43. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/eval_results.json +4 -0
  44. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/training_configuration.json +38 -0
  45. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/training_logs.json +625 -0
  46. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/adapter_config.json +40 -0
  47. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/eval_results.json +4 -0
  48. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/training_configuration.json +38 -0
  49. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/training_logs.json +625 -0
  50. TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_e-r32-a2/adapter_config.json +40 -0
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "alpha": 4,
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+   "bias": "none",
+   "enabled_mlp": true,
+   "enabled_qkv": [
+     "q",
+     "k",
+     "v"
+   ],
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "mixture": false,
+   "modules_to_preserve_errors": null,
+   "modules_to_quantize": null,
+   "modules_to_save": null,
+   "onnx_export": false,
+   "optimization_level": 3,
+   "orthogonal_init": false,
+   "peft_type": "MARS",
+   "quant_n_bits": 4,
+   "r": 2,
+   "revision": null,
+   "seed": 42,
+   "shared_r": 2,
+   "target_modules": [
+     "o_proj",
+     "k_proj",
+     "down_proj",
+     "gate_proj",
+     "v_proj",
+     "up_proj",
+     "q_proj"
+   ],
+   "task_type": null
+ }
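
Across the runs in this commit the adapter configs differ only in `r`/`shared_r` and `alpha`, with `alpha` held at 2·r (hence the `-a2` suffix in the folder names). MARS is a custom `peft_type`, so the scaling convention below is an assumption borrowed from LoRA-style adapters, not a documented API; a minimal sketch for inspecting one config:

```python
# Hypothetical helper: read one adapter_config.json from this commit and
# derive the LoRA-style scaling factor alpha / r (an assumption for MARS).
import json

path = "TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/adapter_config.json"
with open(path) as f:
    cfg = json.load(f)

print(cfg["peft_type"], "r =", cfg["r"], "alpha =", cfg["alpha"])
print("assumed scaling alpha/r =", cfg["alpha"] / cfg["r"])  # 2.0 for every run here
print("adapted modules:", sorted(cfg["target_modules"]))     # all attention + MLP projections
```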
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "task": "arc_c",
+   "results": 0.5691126279863481
+ }
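
Every run folder carries a four-line `eval_results.json` like the one above, which makes rank comparisons straightforward; a sketch over the folder layout shown in this commit:

```python
# Sketch: aggregate the per-run eval_results.json files for side-by-side
# comparison (paths follow the folder layout in this commit).
import json
from pathlib import Path

for path in sorted(Path("TinyLlama_v1.1-mars-opt0-q4").glob("*/eval_results.json")):
    res = json.loads(path.read_text())
    print(f"{path.parent.name}: {res['task']} = {res['results']:.4f}")
# e.g. TinyLlama_v1.1-mars-arc_c-r2-a2: arc_c = 0.5691
```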
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "model_id": "TinyLlama/TinyLlama_v1.1",
+   "dataset": {
+     "name": "ARC_C",
+     "dataset_id": "allenai/ai2_arc",
+     "preprocess_id": "arc_train_deepeval"
+   },
+   "peft_config": {
+     "method": "mars",
+     "rank": 2,
+     "alpha": 4,
+     "dropout": 0.0,
+     "bias": "none",
+     "target_modules": [
+       "q_proj",
+       "k_proj",
+       "v_proj",
+       "o_proj",
+       "gate_proj",
+       "down_proj",
+       "up_proj"
+     ],
+     "trainable_parameter_count": 1307064
+   },
+   "training_config": {
+     "max_dataset_length": null,
+     "batch_size": 64,
+     "per_device_batch_size": 32,
+     "gradient_accumulation_steps": 2,
+     "learning_rate": 0.0003,
+     "num_epochs": 4,
+     "warmup_ratio": 0.1
+   },
+   "model_name": "TinyLlama_v1.1-mars-arc_c-r2-a2",
+   "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-arc_c-r2-a2",
+   "seed": 42,
+   "timestamp": "2025-08-29T18:23:17.489113"
+ }
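
The `training_config` fields are internally consistent: the effective batch size is `per_device_batch_size × gradient_accumulation_steps = 64`, and with the ARC-Challenge train split (about 1,119 examples, an assumption not stated in the config) the run works out to 68 optimizer steps with 7 warmup steps, which matches the training logs below.

```python
# Sanity-check of the training_config arithmetic. The ARC-Challenge train
# split size (1119 examples) is an assumption, not stated in the config.
import math

per_device, grad_accum, epochs = 32, 2, 4
effective_batch = per_device * grad_accum        # 64, matches "batch_size"
micro_batches = math.ceil(1119 / per_device)     # 35 per epoch
steps = (micro_batches // grad_accum) * epochs   # 68 optimizer steps total
warmup = math.ceil(0.1 * steps)                  # 7 steps at warmup_ratio 0.1
print(effective_batch, steps, warmup)            # 64 68 7
```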
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r2-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
+ [
+ {"step": 1, "epoch": 0.05714285714285714, "cpu_mem": 2.580422656, "gpu_mem": 1.065693696, "loss": 4.6127, "grad_norm": 381.2337341308594, "learning_rate": 4.285714285714285e-05},
+ {"step": 2, "epoch": 0.11428571428571428, "cpu_mem": 2.580619264, "gpu_mem": 1.076182016, "loss": 4.674, "grad_norm": 388.8489685058594, "learning_rate": 8.57142857142857e-05},
+ {"step": 3, "epoch": 0.17142857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.076212736, "loss": 1.8432, "grad_norm": 72.13705444335938, "learning_rate": 0.00012857142857142855},
+ {"step": 4, "epoch": 0.22857142857142856, "cpu_mem": 2.580619264, "gpu_mem": 1.076178944, "loss": 1.6594, "grad_norm": 50.97885513305664, "learning_rate": 0.0001714285714285714},
+ {"step": 5, "epoch": 0.2857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.076166656, "loss": 1.4504, "grad_norm": 25.90673065185547, "learning_rate": 0.00021428571428571427},
+ {"step": 6, "epoch": 0.34285714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.076229632, "loss": 1.4254, "grad_norm": 21.743545532226562, "learning_rate": 0.0002571428571428571},
+ {"step": 7, "epoch": 0.4, "cpu_mem": 2.580619264, "gpu_mem": 1.076235776, "loss": 1.4606, "grad_norm": 25.414306640625, "learning_rate": 0.0003},
+ {"step": 8, "epoch": 0.45714285714285713, "cpu_mem": 2.580619264, "gpu_mem": 1.076194304, "loss": 1.5571, "grad_norm": 35.37538146972656, "learning_rate": 0.00029980111348272456},
+ {"step": 9, "epoch": 0.5142857142857142, "cpu_mem": 2.580619264, "gpu_mem": 1.076189696, "loss": 1.3306, "grad_norm": 12.570324897766113, "learning_rate": 0.00029920498134218835},
+ {"step": 10, "epoch": 0.5714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.076178944, "loss": 1.6494, "grad_norm": 30.424644470214844, "learning_rate": 0.0002982131844136615},
+ {"step": 11, "epoch": 0.6285714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.076189696, "loss": 1.4528, "grad_norm": 14.515549659729004, "learning_rate": 0.0002968283527643036},
+ {"step": 12, "epoch": 0.6857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.076214272, "loss": 1.443, "grad_norm": 14.896547317504883, "learning_rate": 0.000295054158718698},
+ {"step": 13, "epoch": 0.7428571428571429, "cpu_mem": 2.580619264, "gpu_mem": 1.076214272, "loss": 1.3921, "grad_norm": 30.263879776000977, "learning_rate": 0.00029289530712050735},
+ {"step": 14, "epoch": 0.8, "cpu_mem": 2.580619264, "gpu_mem": 1.076162048, "loss": 1.7549, "grad_norm": 41.95735168457031, "learning_rate": 0.000290357522856074},
+ {"step": 15, "epoch": 0.8571428571428571, "cpu_mem": 2.580619264, "gpu_mem": 1.076237312, "loss": 1.5313, "grad_norm": 22.88416290283203, "learning_rate": 0.0002874475356730507},
+ {"step": 16, "epoch": 0.9142857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.076231168, "loss": 1.4738, "grad_norm": 30.19275665283203, "learning_rate": 0.0002841730623343193},
+ {"step": 17, "epoch": 0.9714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.076235776, "loss": 1.5348, "grad_norm": 30.131532669067383, "learning_rate": 0.00028054278615452326},
+ {"step": 18, "epoch": 1.0285714285714285, "cpu_mem": 2.580619264, "gpu_mem": 1.081441792, "loss": 2.0798, "grad_norm": 17.52899742126465, "learning_rate": 0.0002765663339734778},
+ {"step": 19, "epoch": 1.0857142857142856, "cpu_mem": 2.580619264, "gpu_mem": 1.081440256, "loss": 1.4488, "grad_norm": 20.202335357666016, "learning_rate": 0.00027225425062752165},
+ {"step": 20, "epoch": 1.1428571428571428, "cpu_mem": 2.580619264, "gpu_mem": 1.08141568, "loss": 1.2952, "grad_norm": 13.343443870544434, "learning_rate": 0.0002676179709865066},
+ {"step": 21, "epoch": 1.2, "cpu_mem": 2.580619264, "gpu_mem": 1.08142336, "loss": 1.4289, "grad_norm": 17.170682907104492, "learning_rate": 0.0002626697896305779},
+ {"step": 22, "epoch": 1.2571428571428571, "cpu_mem": 2.580619264, "gpu_mem": 1.081452544, "loss": 1.391, "grad_norm": 17.611751556396484, "learning_rate": 0.000257422828247159},
+ {"step": 23, "epoch": 1.3142857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.081481728, "loss": 1.3177, "grad_norm": 11.4840087890625, "learning_rate": 0.00025189100083459397},
+ {"step": 24, "epoch": 1.3714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.081424896, "loss": 1.5163, "grad_norm": 30.58349609375, "learning_rate": 0.0002460889768047263},
+ {"step": 25, "epoch": 1.4285714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.081494016, "loss": 1.3967, "grad_norm": 14.075834274291992, "learning_rate": 0.00024003214208225522},
+ {"step": 26, "epoch": 1.4857142857142858, "cpu_mem": 2.580619264, "gpu_mem": 1.081451008, "loss": 1.3574, "grad_norm": 14.51030158996582, "learning_rate": 0.00023373655830402968},
+ {"step": 27, "epoch": 1.5428571428571427, "cpu_mem": 2.580619264, "gpu_mem": 1.081409536, "loss": 1.3929, "grad_norm": 13.834196090698242, "learning_rate": 0.00022721892022647462},
+ {"step": 28, "epoch": 1.6, "cpu_mem": 2.580619264, "gpu_mem": 1.081455616, "loss": 1.6196, "grad_norm": 46.8470344543457, "learning_rate": 0.000220496511454098},
+ {"step": 29, "epoch": 1.657142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.081451008, "loss": 1.3476, "grad_norm": 7.685541152954102, "learning_rate": 0.0002135871586064791},
+ {"step": 30, "epoch": 1.7142857142857144, "cpu_mem": 2.580619264, "gpu_mem": 1.081440256, "loss": 1.3697, "grad_norm": 9.882088661193848, "learning_rate": 0.00020650918404527775},
+ {"step": 31, "epoch": 1.7714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.081470976, "loss": 1.331, "grad_norm": 13.1102294921875, "learning_rate": 0.00019928135728662522},
+ {"step": 32, "epoch": 1.8285714285714287, "cpu_mem": 2.580619264, "gpu_mem": 1.081480192, "loss": 1.3643, "grad_norm": 9.88525676727295, "learning_rate": 0.00019192284522774142},
+ {"step": 33, "epoch": 1.8857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.081460224, "loss": 1.3741, "grad_norm": 8.89490795135498, "learning_rate": 0.00018445316131976934},
+ {"step": 34, "epoch": 1.9428571428571428, "cpu_mem": 2.580619264, "gpu_mem": 1.08143872, "loss": 1.3514, "grad_norm": 11.753167152404785, "learning_rate": 0.00017689211382161034},
+ {"step": 35, "epoch": 2.0, "cpu_mem": 2.580619264, "gpu_mem": 1.081326592, "loss": 2.0743, "grad_norm": 16.071163177490234, "learning_rate": 0.00016925975327198266},
+ {"step": 36, "epoch": 2.057142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.076208128, "loss": 1.3093, "grad_norm": 8.304706573486328, "learning_rate": 0.00016157631931899697},
+ {"step": 37, "epoch": 2.1142857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.076217344, "loss": 1.4079, "grad_norm": 17.81264877319336, "learning_rate": 0.0001538621870482483},
+ {"step": 38, "epoch": 2.1714285714285713, "cpu_mem": 2.580619264, "gpu_mem": 1.07618816, "loss": 1.3492, "grad_norm": 16.122692108154297, "learning_rate": 0.00014613781295175172},
+ {"step": 39, "epoch": 2.2285714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.076206592, "loss": 1.3398, "grad_norm": 14.483545303344727, "learning_rate": 0.00013842368068100303},
+ {"step": 40, "epoch": 2.2857142857142856, "cpu_mem": 2.580619264, "gpu_mem": 1.076183552, "loss": 1.2991, "grad_norm": 7.728339195251465, "learning_rate": 0.00013074024672801731},
+ {"step": 41, "epoch": 2.342857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.076185088, "loss": 1.3073, "grad_norm": 8.475275039672852, "learning_rate": 0.00012310788617838966},
+ {"step": 42, "epoch": 2.4, "cpu_mem": 2.580619264, "gpu_mem": 1.076214272, "loss": 1.4702, "grad_norm": 66.37582397460938, "learning_rate": 0.00011554683868023067},
+ {"step": 43, "epoch": 2.4571428571428573, "cpu_mem": 2.580619264, "gpu_mem": 1.076229632, "loss": 1.2906, "grad_norm": 13.710612297058105, "learning_rate": 0.00010807715477225858},
+ {"step": 44, "epoch": 2.5142857142857142, "cpu_mem": 2.580619264, "gpu_mem": 1.076248064, "loss": 1.3712, "grad_norm": 212.30665588378906, "learning_rate": 0.00010071864271337478},
+ {"step": 45, "epoch": 2.571428571428571, "cpu_mem": 2.580619264, "gpu_mem": 1.076201984, "loss": 1.2661, "grad_norm": 96.19535827636719, "learning_rate": 9.34908159547222e-05},
+ {"step": 46, "epoch": 2.6285714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.07619584, "loss": 1.2963, "grad_norm": 16.374256134033203, "learning_rate": 8.641284139352091e-05},
+ {"step": 47, "epoch": 2.685714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.076189696, "loss": 1.293, "grad_norm": 18.95760154724121, "learning_rate": 7.950348854590204e-05},
+ {"step": 48, "epoch": 2.742857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.076194304, "loss": 1.211, "grad_norm": 10.877612113952637, "learning_rate": 7.278107977352543e-05},
+ {"step": 49, "epoch": 2.8, "cpu_mem": 2.580619264, "gpu_mem": 1.076185088, "loss": 1.2268, "grad_norm": 15.633013725280762, "learning_rate": 6.626344169597031e-05},
+ {"step": 50, "epoch": 2.857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.076166656, "loss": 1.1838, "grad_norm": 8.313477516174316, "learning_rate": 5.996785791774478e-05},
+ {"step": 51, "epoch": 2.914285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.076191232, "loss": 1.2146, "grad_norm": 11.858922004699707, "learning_rate": 5.391102319527373e-05},
+ {"step": 52, "epoch": 2.9714285714285715, "cpu_mem": 2.580619264, "gpu_mem": 1.07621888, "loss": 1.305, "grad_norm": 9.989842414855957, "learning_rate": 4.8108999165406026e-05},
+ {"step": 53, "epoch": 3.0285714285714285, "cpu_mem": 2.580619264, "gpu_mem": 1.081437184, "loss": 1.8208, "grad_norm": 18.719863891601562, "learning_rate": 4.257717175284103e-05},
+ {"step": 54, "epoch": 3.085714285714286, "cpu_mem": 2.580619264, "gpu_mem": 1.081406464, "loss": 1.2094, "grad_norm": 13.842973709106445, "learning_rate": 3.733021036942205e-05},
+ {"step": 55, "epoch": 3.142857142857143, "cpu_mem": 2.580619264, "gpu_mem": 1.081440256, "loss": 1.2424, "grad_norm": 14.39765739440918, "learning_rate": 3.238202901349345e-05},
+ {"step": 56, "epoch": 3.2, "cpu_mem": 2.580619264, "gpu_mem": 1.081513984, "loss": 1.151, "grad_norm": 12.968193054199219, "learning_rate": 2.774574937247831e-05},
+ {"step": 57, "epoch": 3.257142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.081457152, "loss": 1.1003, "grad_norm": 13.164222717285156, "learning_rate": 2.3433666026522153e-05},
+ {"step": 58, "epoch": 3.314285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.081451008, "loss": 1.0009, "grad_norm": 10.711393356323242, "learning_rate": 1.945721384547671e-05},
+ {"step": 59, "epoch": 3.3714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.081501696, "loss": 1.0536, "grad_norm": 13.167465209960938, "learning_rate": 1.5826937665680693e-05},
+ {"step": 60, "epoch": 3.4285714285714284, "cpu_mem": 2.580619264, "gpu_mem": 1.081427968, "loss": 1.1045, "grad_norm": 23.531604766845703, "learning_rate": 1.2552464326949302e-05},
+ {"step": 61, "epoch": 3.4857142857142858, "cpu_mem": 2.580619264, "gpu_mem": 1.081441792, "loss": 1.1359, "grad_norm": 17.696874618530273, "learning_rate": 9.64247714392597e-06},
+ {"step": 62, "epoch": 3.5428571428571427, "cpu_mem": 2.580619264, "gpu_mem": 1.081443328, "loss": 1.0889, "grad_norm": 15.676621437072754, "learning_rate": 7.104692879492624e-06},
+ {"step": 63, "epoch": 3.6, "cpu_mem": 2.580619264, "gpu_mem": 1.081432576, "loss": 1.178, "grad_norm": 18.50305938720703, "learning_rate": 4.945841281301943e-06},
+ {"step": 64, "epoch": 3.657142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.081449472, "loss": 1.1488, "grad_norm": 18.644163131713867, "learning_rate": 3.1716472356963286e-06},
+ {"step": 65, "epoch": 3.7142857142857144, "cpu_mem": 2.580619264, "gpu_mem": 1.081470976, "loss": 1.0864, "grad_norm": 16.862136840820312, "learning_rate": 1.7868155863384415e-06},
+ {"step": 66, "epoch": 3.7714285714285714, "cpu_mem": 2.580619264, "gpu_mem": 1.08146176, "loss": 1.012, "grad_norm": 18.95318031311035, "learning_rate": 7.950186578116413e-07},
+ {"step": 67, "epoch": 3.8285714285714287, "cpu_mem": 2.580619264, "gpu_mem": 1.081487872, "loss": 1.0928, "grad_norm": 18.284706115722656, "learning_rate": 1.988865172754206e-07},
+ {"step": 68, "epoch": 3.8857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.08143872, "loss": 1.1189, "grad_norm": 15.364293098449707, "learning_rate": 0.0},
+ {"step": 68, "epoch": 3.8857142857142857, "cpu_mem": 2.580619264, "gpu_mem": 1.08143872, "train_runtime": 374.0598, "train_samples_per_second": 11.966, "train_steps_per_second": 0.182, "total_flos": 0.0, "train_loss": 1.4615876972675323}
+ ]
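
The logged `learning_rate` trace pins the scheduler down: linear warmup to the configured 3e-4 over the first 7 optimizer steps, then cosine decay to zero at step 68. The cosine form is inferred from the numbers, not stated in any config; a sketch that reproduces the logged values:

```python
# Reconstruct the schedule implied by the logs: 7-step linear warmup into
# cosine decay over the remaining 61 steps (inferred, not documented).
import math

def lr(step, peak=3e-4, warmup=7, total=68):
    if step <= warmup:
        return peak * step / warmup
    progress = (step - warmup) / (total - warmup)
    return peak * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr(1))   # 4.2857e-05, matches step 1
print(lr(8))   # 2.9980e-04, matches step 8
print(lr(67))  # 1.9889e-07, matches step 67
```

The loss spikes at steps 18, 35, and 53 line up with the epoch boundaries in the `epoch` field.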
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "alpha": 64,
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+   "bias": "none",
+   "enabled_mlp": true,
+   "enabled_qkv": [
+     "q",
+     "k",
+     "v"
+   ],
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "mixture": false,
+   "modules_to_preserve_errors": null,
+   "modules_to_quantize": null,
+   "modules_to_save": null,
+   "onnx_export": false,
+   "optimization_level": 3,
+   "orthogonal_init": false,
+   "peft_type": "MARS",
+   "quant_n_bits": 4,
+   "r": 32,
+   "revision": null,
+   "seed": 42,
+   "shared_r": 32,
+   "target_modules": [
+     "o_proj",
+     "k_proj",
+     "down_proj",
+     "gate_proj",
+     "v_proj",
+     "up_proj",
+     "q_proj"
+   ],
+   "task_type": null
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "task": "arc_c",
+   "results": 0.5503412969283277
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "model_id": "TinyLlama/TinyLlama_v1.1",
+   "dataset": {
+     "name": "ARC_C",
+     "dataset_id": "allenai/ai2_arc",
+     "preprocess_id": "arc_train_deepeval"
+   },
+   "peft_config": {
+     "method": "mars",
+     "rank": 32,
+     "alpha": 64,
+     "dropout": 0.0,
+     "bias": "none",
+     "target_modules": [
+       "q_proj",
+       "k_proj",
+       "v_proj",
+       "o_proj",
+       "gate_proj",
+       "down_proj",
+       "up_proj"
+     ],
+     "trainable_parameter_count": 21018624
+   },
+   "training_config": {
+     "max_dataset_length": null,
+     "batch_size": 64,
+     "per_device_batch_size": 32,
+     "gradient_accumulation_steps": 2,
+     "learning_rate": 0.0003,
+     "num_epochs": 4,
+     "warmup_ratio": 0.1
+   },
+   "model_name": "TinyLlama_v1.1-mars-arc_c-r32-a2",
+   "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-arc_c-r32-a2",
+   "seed": 42,
+   "timestamp": "2025-08-30T08:36:40.140752"
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r32-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
+ [
+ {"step": 1, "epoch": 0.05714285714285714, "cpu_mem": 2.438529024, "gpu_mem": 1.153432064, "loss": 4.6127, "grad_norm": 83.32475280761719, "learning_rate": 4.285714285714285e-05},
+ {"step": 2, "epoch": 0.11428571428571428, "cpu_mem": 2.438725632, "gpu_mem": 1.321571328, "loss": 4.674, "grad_norm": 85.61141204833984, "learning_rate": 8.57142857142857e-05},
+ {"step": 3, "epoch": 0.17142857142857143, "cpu_mem": 2.43892224, "gpu_mem": 1.321602048, "loss": 2.0451, "grad_norm": 22.385805130004883, "learning_rate": 0.00012857142857142855},
+ {"step": 4, "epoch": 0.22857142857142856, "cpu_mem": 2.43892224, "gpu_mem": 1.321568256, "loss": 1.5792, "grad_norm": 8.16244125366211, "learning_rate": 0.0001714285714285714},
+ {"step": 5, "epoch": 0.2857142857142857, "cpu_mem": 2.43892224, "gpu_mem": 1.321555968, "loss": 1.4964, "grad_norm": 8.01338005065918, "learning_rate": 0.00021428571428571427},
+ {"step": 6, "epoch": 0.34285714285714286, "cpu_mem": 2.43892224, "gpu_mem": 1.321618944, "loss": 1.448, "grad_norm": 5.569636821746826, "learning_rate": 0.0002571428571428571},
+ {"step": 7, "epoch": 0.4, "cpu_mem": 2.43892224, "gpu_mem": 1.321625088, "loss": 1.4577, "grad_norm": 5.084090232849121, "learning_rate": 0.0003},
+ {"step": 8, "epoch": 0.45714285714285713, "cpu_mem": 2.43892224, "gpu_mem": 1.321583616, "loss": 1.4895, "grad_norm": 4.238208293914795, "learning_rate": 0.00029980111348272456},
+ {"step": 9, "epoch": 0.5142857142857142, "cpu_mem": 2.439118848, "gpu_mem": 1.321579008, "loss": 1.3082, "grad_norm": 1.7702213525772095, "learning_rate": 0.00029920498134218835},
+ {"step": 10, "epoch": 0.5714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.321568256, "loss": 1.5789, "grad_norm": 5.247867107391357, "learning_rate": 0.0002982131844136615},
+ {"step": 11, "epoch": 0.6285714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.321579008, "loss": 1.4814, "grad_norm": 3.4710798263549805, "learning_rate": 0.0002968283527643036},
+ {"step": 12, "epoch": 0.6857142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.321603584, "loss": 1.3923, "grad_norm": 1.7332303524017334, "learning_rate": 0.000295054158718698},
+ {"step": 13, "epoch": 0.7428571428571429, "cpu_mem": 2.439118848, "gpu_mem": 1.321603584, "loss": 1.3625, "grad_norm": 3.513798713684082, "learning_rate": 0.00029289530712050735},
+ {"step": 14, "epoch": 0.8, "cpu_mem": 2.439118848, "gpu_mem": 1.32155136, "loss": 1.653, "grad_norm": 4.981041431427002, "learning_rate": 0.000290357522856074},
+ {"step": 15, "epoch": 0.8571428571428571, "cpu_mem": 2.439118848, "gpu_mem": 1.321626624, "loss": 1.5649, "grad_norm": 3.986966133117676, "learning_rate": 0.0002874475356730507},
+ {"step": 16, "epoch": 0.9142857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.32162048, "loss": 1.418, "grad_norm": 2.171356678009033, "learning_rate": 0.0002841730623343193},
+ {"step": 17, "epoch": 0.9714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.321625088, "loss": 1.4328, "grad_norm": 2.804058790206909, "learning_rate": 0.00028054278615452326},
+ {"step": 18, "epoch": 1.0285714285714285, "cpu_mem": 2.439118848, "gpu_mem": 1.405656576, "loss": 2.0652, "grad_norm": 2.6565699577331543, "learning_rate": 0.0002765663339734778},
+ {"step": 19, "epoch": 1.0857142857142856, "cpu_mem": 2.439118848, "gpu_mem": 1.40565504, "loss": 1.3863, "grad_norm": 1.8464845418930054, "learning_rate": 0.00027225425062752165},
+ {"step": 20, "epoch": 1.1428571428571428, "cpu_mem": 2.439118848, "gpu_mem": 1.405630464, "loss": 1.3204, "grad_norm": 1.9601354598999023, "learning_rate": 0.0002676179709865066},
+ {"step": 21, "epoch": 1.2, "cpu_mem": 2.439118848, "gpu_mem": 1.405638144, "loss": 1.384, "grad_norm": 2.193528413772583, "learning_rate": 0.0002626697896305779},
+ {"step": 22, "epoch": 1.2571428571428571, "cpu_mem": 2.439118848, "gpu_mem": 1.405667328, "loss": 1.3968, "grad_norm": 3.0995216369628906, "learning_rate": 0.000257422828247159},
+ {"step": 23, "epoch": 1.3142857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.405696512, "loss": 1.3764, "grad_norm": 3.8095834255218506, "learning_rate": 0.00025189100083459397},
+ {"step": 24, "epoch": 1.3714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.40563968, "loss": 1.3585, "grad_norm": 2.3663456439971924, "learning_rate": 0.0002460889768047263},
+ {"step": 25, "epoch": 1.4285714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.4057088, "loss": 1.3727, "grad_norm": 2.6865601539611816, "learning_rate": 0.00024003214208225522},
+ {"step": 26, "epoch": 1.4857142857142858, "cpu_mem": 2.439118848, "gpu_mem": 1.405665792, "loss": 1.3964, "grad_norm": 2.392038583755493, "learning_rate": 0.00023373655830402968},
+ {"step": 27, "epoch": 1.5428571428571427, "cpu_mem": 2.439118848, "gpu_mem": 1.40562432, "loss": 1.4306, "grad_norm": 2.2583298683166504, "learning_rate": 0.00022721892022647462},
+ {"step": 28, "epoch": 1.6, "cpu_mem": 2.439118848, "gpu_mem": 1.4056704, "loss": 1.641, "grad_norm": 4.678727149963379, "learning_rate": 0.000220496511454098},
+ {"step": 29, "epoch": 1.657142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405665792, "loss": 1.3692, "grad_norm": 1.6019768714904785, "learning_rate": 0.0002135871586064791},
+ {"step": 30, "epoch": 1.7142857142857144, "cpu_mem": 2.439118848, "gpu_mem": 1.40565504, "loss": 1.3711, "grad_norm": 1.569014310836792, "learning_rate": 0.00020650918404527775},
+ {"step": 31, "epoch": 1.7714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.40568576, "loss": 1.346, "grad_norm": 1.4607075452804565, "learning_rate": 0.00019928135728662522},
+ {"step": 32, "epoch": 1.8285714285714287, "cpu_mem": 2.439118848, "gpu_mem": 1.405694976, "loss": 1.4061, "grad_norm": 2.075629949569702, "learning_rate": 0.00019192284522774142},
+ {"step": 33, "epoch": 1.8857142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405675008, "loss": 1.4244, "grad_norm": 1.9367634057998657, "learning_rate": 0.00018445316131976934},
+ {"step": 34, "epoch": 1.9428571428571428, "cpu_mem": 2.439118848, "gpu_mem": 1.405653504, "loss": 1.3788, "grad_norm": 1.2187618017196655, "learning_rate": 0.00017689211382161034},
+ {"step": 35, "epoch": 2.0, "cpu_mem": 2.439118848, "gpu_mem": 1.405541376, "loss": 2.0491, "grad_norm": 1.391384243965149, "learning_rate": 0.00016925975327198266},
+ {"step": 36, "epoch": 2.057142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.32159744, "loss": 1.3294, "grad_norm": 0.9405258297920227, "learning_rate": 0.00016157631931899697},
+ {"step": 37, "epoch": 2.1142857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.321606656, "loss": 1.4945, "grad_norm": 2.8952982425689697, "learning_rate": 0.0001538621870482483},
+ {"step": 38, "epoch": 2.1714285714285713, "cpu_mem": 2.439118848, "gpu_mem": 1.321577472, "loss": 1.4347, "grad_norm": 2.3792271614074707, "learning_rate": 0.00014613781295175172},
+ {"step": 39, "epoch": 2.2285714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.321595904, "loss": 1.4687, "grad_norm": 2.7014079093933105, "learning_rate": 0.00013842368068100303},
+ {"step": 40, "epoch": 2.2857142857142856, "cpu_mem": 2.439118848, "gpu_mem": 1.321572864, "loss": 1.3671, "grad_norm": 1.4931557178497314, "learning_rate": 0.00013074024672801731},
+ {"step": 41, "epoch": 2.342857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.3215744, "loss": 1.372, "grad_norm": 1.171431541442871, "learning_rate": 0.00012310788617838966},
+ {"step": 42, "epoch": 2.4, "cpu_mem": 2.439118848, "gpu_mem": 1.321603584, "loss": 1.3052, "grad_norm": 1.5233906507492065, "learning_rate": 0.00011554683868023067},
+ {"step": 43, "epoch": 2.4571428571428573, "cpu_mem": 2.439118848, "gpu_mem": 1.321618944, "loss": 1.3462, "grad_norm": 1.46059250831604, "learning_rate": 0.00010807715477225858},
+ {"step": 44, "epoch": 2.5142857142857142, "cpu_mem": 2.439118848, "gpu_mem": 1.321637376, "loss": 1.2995, "grad_norm": 1.066210150718689, "learning_rate": 0.00010071864271337478},
+ {"step": 45, "epoch": 2.571428571428571, "cpu_mem": 2.439118848, "gpu_mem": 1.321591296, "loss": 1.292, "grad_norm": 0.8745197057723999, "learning_rate": 9.34908159547222e-05},
+ {"step": 46, "epoch": 2.6285714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.321585152, "loss": 1.2877, "grad_norm": 1.2903093099594116, "learning_rate": 8.641284139352091e-05},
+ {"step": 47, "epoch": 2.685714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.321579008, "loss": 1.2495, "grad_norm": 0.9918281435966492, "learning_rate": 7.950348854590204e-05},
+ {"step": 48, "epoch": 2.742857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.321583616, "loss": 1.1968, "grad_norm": 0.9396869540214539, "learning_rate": 7.278107977352543e-05},
+ {"step": 49, "epoch": 2.8, "cpu_mem": 2.439118848, "gpu_mem": 1.3215744, "loss": 1.2387, "grad_norm": 0.9950159788131714, "learning_rate": 6.626344169597031e-05},
+ {"step": 50, "epoch": 2.857142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.321555968, "loss": 1.2554, "grad_norm": 1.2974423170089722, "learning_rate": 5.996785791774478e-05},
+ {"step": 51, "epoch": 2.914285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.321580544, "loss": 1.2675, "grad_norm": 1.546997308731079, "learning_rate": 5.391102319527373e-05},
+ {"step": 52, "epoch": 2.9714285714285715, "cpu_mem": 2.439118848, "gpu_mem": 1.321608192, "loss": 1.3211, "grad_norm": 1.2576476335525513, "learning_rate": 4.8108999165406026e-05},
+ {"step": 53, "epoch": 3.0285714285714285, "cpu_mem": 2.439118848, "gpu_mem": 1.405651968, "loss": 1.7943, "grad_norm": 1.8999438285827637, "learning_rate": 4.257717175284103e-05},
+ {"step": 54, "epoch": 3.085714285714286, "cpu_mem": 2.439118848, "gpu_mem": 1.405621248, "loss": 1.2363, "grad_norm": 1.7934603691101074, "learning_rate": 3.733021036942205e-05},
+ {"step": 55, "epoch": 3.142857142857143, "cpu_mem": 2.439118848, "gpu_mem": 1.40565504, "loss": 1.2356, "grad_norm": 1.768385887145996, "learning_rate": 3.238202901349345e-05},
+ {"step": 56, "epoch": 3.2, "cpu_mem": 2.439118848, "gpu_mem": 1.405728768, "loss": 1.2013, "grad_norm": 1.5940515995025635, "learning_rate": 2.774574937247831e-05},
+ {"step": 57, "epoch": 3.257142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405671936, "loss": 1.1569, "grad_norm": 1.4316647052764893, "learning_rate": 2.3433666026522153e-05},
+ {"step": 58, "epoch": 3.314285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.405665792, "loss": 1.0889, "grad_norm": 1.7646082639694214, "learning_rate": 1.945721384547671e-05},
+ {"step": 59, "epoch": 3.3714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.40571648, "loss": 1.1441, "grad_norm": 1.6605567932128906, "learning_rate": 1.5826937665680693e-05},
+ {"step": 60, "epoch": 3.4285714285714284, "cpu_mem": 2.439118848, "gpu_mem": 1.405642752, "loss": 1.194, "grad_norm": 2.2929937839508057, "learning_rate": 1.2552464326949302e-05},
+ {"step": 61, "epoch": 3.4857142857142858, "cpu_mem": 2.439118848, "gpu_mem": 1.405656576, "loss": 1.1754, "grad_norm": 2.0269861221313477, "learning_rate": 9.64247714392597e-06},
+ {"step": 62, "epoch": 3.5428571428571427, "cpu_mem": 2.439118848, "gpu_mem": 1.405658112, "loss": 1.1437, "grad_norm": 2.0639264583587646, "learning_rate": 7.104692879492624e-06},
+ {"step": 63, "epoch": 3.6, "cpu_mem": 2.439118848, "gpu_mem": 1.40564736, "loss": 1.181, "grad_norm": 1.9590049982070923, "learning_rate": 4.945841281301943e-06},
+ {"step": 64, "epoch": 3.657142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405664256, "loss": 1.14, "grad_norm": 2.042527198791504, "learning_rate": 3.1716472356963286e-06},
+ {"step": 65, "epoch": 3.7142857142857144, "cpu_mem": 2.439118848, "gpu_mem": 1.40568576, "loss": 1.1584, "grad_norm": 2.447296380996704, "learning_rate": 1.7868155863384415e-06},
+ {"step": 66, "epoch": 3.7714285714285714, "cpu_mem": 2.439118848, "gpu_mem": 1.405676544, "loss": 1.058, "grad_norm": 2.189685583114624, "learning_rate": 7.950186578116413e-07},
+ {"step": 67, "epoch": 3.8285714285714287, "cpu_mem": 2.439118848, "gpu_mem": 1.405702656, "loss": 1.1381, "grad_norm": 1.9674383401870728, "learning_rate": 1.988865172754206e-07},
+ {"step": 68, "epoch": 3.8857142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405653504, "loss": 1.1528, "grad_norm": 2.103919267654419, "learning_rate": 0.0},
+ {"step": 68, "epoch": 3.8857142857142857, "cpu_mem": 2.439118848, "gpu_mem": 1.405653504, "train_runtime": 378.0821, "train_samples_per_second": 11.839, "train_steps_per_second": 0.18, "total_flos": 0.0, "train_loss": 1.4734139126889847}
+ ]
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "alpha": 16,
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+   "bias": "none",
+   "enabled_mlp": true,
+   "enabled_qkv": [
+     "q",
+     "k",
+     "v"
+   ],
+   "fan_in_fan_out": false,
+   "inference_mode": false,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "mixture": false,
+   "modules_to_preserve_errors": null,
+   "modules_to_quantize": null,
+   "modules_to_save": null,
+   "onnx_export": false,
+   "optimization_level": 3,
+   "orthogonal_init": false,
+   "peft_type": "MARS",
+   "quant_n_bits": 4,
+   "r": 8,
+   "revision": null,
+   "seed": 42,
+   "shared_r": 8,
+   "target_modules": [
+     "o_proj",
+     "k_proj",
+     "down_proj",
+     "gate_proj",
+     "v_proj",
+     "up_proj",
+     "q_proj"
+   ],
+   "task_type": null
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "task": "arc_c",
+   "results": 0.5674061433447098
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "model_id": "TinyLlama/TinyLlama_v1.1",
+   "dataset": {
+     "name": "ARC_C",
+     "dataset_id": "allenai/ai2_arc",
+     "preprocess_id": "arc_train_deepeval"
+   },
+   "peft_config": {
+     "method": "mars",
+     "rank": 8,
+     "alpha": 16,
+     "dropout": 0.0,
+     "bias": "none",
+     "target_modules": [
+       "q_proj",
+       "k_proj",
+       "v_proj",
+       "o_proj",
+       "gate_proj",
+       "down_proj",
+       "up_proj"
+     ],
+     "trainable_parameter_count": 5233536
+   },
+   "training_config": {
+     "max_dataset_length": null,
+     "batch_size": 64,
+     "per_device_batch_size": 32,
+     "gradient_accumulation_steps": 2,
+     "learning_rate": 0.0003,
+     "num_epochs": 4,
+     "warmup_ratio": 0.1
+   },
+   "model_name": "TinyLlama_v1.1-mars-arc_c-r8-a2",
+   "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-arc_c-r8-a2",
+   "seed": 42,
+   "timestamp": "2025-08-30T01:30:22.564244"
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_c-r8-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
+ [
+ {"step": 1, "epoch": 0.05714285714285714, "cpu_mem": 2.590892032, "gpu_mem": 1.079811584, "loss": 4.6127, "grad_norm": 190.1083221435547, "learning_rate": 4.285714285714285e-05},
+ {"step": 2, "epoch": 0.11428571428571428, "cpu_mem": 2.59108864, "gpu_mem": 1.121681408, "loss": 4.674, "grad_norm": 195.60543823242188, "learning_rate": 8.57142857142857e-05},
+ {"step": 3, "epoch": 0.17142857142857143, "cpu_mem": 2.59108864, "gpu_mem": 1.121712128, "loss": 1.9423, "grad_norm": 42.55434036254883, "learning_rate": 0.00012857142857142855},
+ {"step": 4, "epoch": 0.22857142857142856, "cpu_mem": 2.591285248, "gpu_mem": 1.121678336, "loss": 1.6033, "grad_norm": 18.729658126831055, "learning_rate": 0.0001714285714285714},
+ {"step": 5, "epoch": 0.2857142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.121666048, "loss": 1.502, "grad_norm": 17.119783401489258, "learning_rate": 0.00021428571428571427},
+ {"step": 6, "epoch": 0.34285714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.121729024, "loss": 1.4153, "grad_norm": 8.702713012695312, "learning_rate": 0.0002571428571428571},
+ {"step": 7, "epoch": 0.4, "cpu_mem": 2.591285248, "gpu_mem": 1.121735168, "loss": 1.4484, "grad_norm": 9.63271713256836, "learning_rate": 0.0003},
+ {"step": 8, "epoch": 0.45714285714285713, "cpu_mem": 2.591285248, "gpu_mem": 1.121693696, "loss": 1.4964, "grad_norm": 8.020484924316406, "learning_rate": 0.00029980111348272456},
+ {"step": 9, "epoch": 0.5142857142857142, "cpu_mem": 2.591285248, "gpu_mem": 1.121689088, "loss": 1.3058, "grad_norm": 5.498310089111328, "learning_rate": 0.00029920498134218835},
+ {"step": 10, "epoch": 0.5714285714285714, "cpu_mem": 2.591285248, "gpu_mem": 1.121678336, "loss": 1.5191, "grad_norm": 12.341209411621094, "learning_rate": 0.0002982131844136615},
+ {"step": 11, "epoch": 0.6285714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.121689088, "loss": 1.4231, "grad_norm": 8.037294387817383, "learning_rate": 0.0002968283527643036},
+ {"step": 12, "epoch": 0.6857142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.121713664, "loss": 1.4298, "grad_norm": 5.25429630279541, "learning_rate": 0.000295054158718698},
+ {"step": 13, "epoch": 0.7428571428571429, "cpu_mem": 2.591285248, "gpu_mem": 1.121713664, "loss": 1.3615, "grad_norm": 7.831629753112793, "learning_rate": 0.00029289530712050735},
+ {"step": 14, "epoch": 0.8, "cpu_mem": 2.591285248, "gpu_mem": 1.12166144, "loss": 1.6705, "grad_norm": 10.625100135803223, "learning_rate": 0.000290357522856074},
+ {"step": 15, "epoch": 0.8571428571428571, "cpu_mem": 2.591285248, "gpu_mem": 1.121736704, "loss": 1.6489, "grad_norm": 9.55422306060791, "learning_rate": 0.0002874475356730507},
+ {"step": 16, "epoch": 0.9142857142857143, "cpu_mem": 2.591285248, "gpu_mem": 1.12173056, "loss": 1.4015, "grad_norm": 2.770230770111084, "learning_rate": 0.0002841730623343193},
+ {"step": 17, "epoch": 0.9714285714285714, "cpu_mem": 2.591285248, "gpu_mem": 1.121735168, "loss": 1.4158, "grad_norm": 4.04749870300293, "learning_rate": 0.00028054278615452326},
+ {"step": 18, "epoch": 1.0285714285714285, "cpu_mem": 2.591285248, "gpu_mem": 1.142631936, "loss": 2.0539, "grad_norm": 5.462876319885254, "learning_rate": 0.0002765663339734778},
+ {"step": 19, "epoch": 1.0857142857142856, "cpu_mem": 2.591285248, "gpu_mem": 1.1426304, "loss": 1.4073, "grad_norm": 4.310235023498535, "learning_rate": 0.00027225425062752165},
+ {"step": 20, "epoch": 1.1428571428571428, "cpu_mem": 2.591285248, "gpu_mem": 1.142605824, "loss": 1.3455, "grad_norm": 4.525611877441406, "learning_rate": 0.0002676179709865066},
+ {"step": 21, "epoch": 1.2, "cpu_mem": 2.591285248, "gpu_mem": 1.142613504, "loss": 1.4146, "grad_norm": 4.8981218338012695, "learning_rate": 0.0002626697896305779},
+ {"step": 22, "epoch": 1.2571428571428571, "cpu_mem": 2.591285248, "gpu_mem": 1.142642688, "loss": 1.3934, "grad_norm": 6.351171493530273, "learning_rate": 0.000257422828247159},
+ {"step": 23, "epoch": 1.3142857142857143, "cpu_mem": 2.591285248, "gpu_mem": 1.142671872, "loss": 1.3354, "grad_norm": 5.1450886726379395, "learning_rate": 0.00025189100083459397},
+ {"step": 24, "epoch": 1.3714285714285714, "cpu_mem": 2.591285248, "gpu_mem": 1.14261504, "loss": 1.394, "grad_norm": 6.1006340980529785, "learning_rate": 0.0002460889768047263},
+ {"step": 25, "epoch": 1.4285714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.14268416, "loss": 1.3449, "grad_norm": 4.99368143081665, "learning_rate": 0.00024003214208225522},
+ {"step": 26, "epoch": 1.4857142857142858, "cpu_mem": 2.591285248, "gpu_mem": 1.142641152, "loss": 1.3966, "grad_norm": 5.3322882652282715, "learning_rate": 0.00023373655830402968},
+ {"step": 27, "epoch": 1.5428571428571427, "cpu_mem": 2.591285248, "gpu_mem": 1.14259968, "loss": 1.4423, "grad_norm": 5.347881317138672, "learning_rate": 0.00022721892022647462},
+ {"step": 28, "epoch": 1.6, "cpu_mem": 2.591285248, "gpu_mem": 1.14264576, "loss": 1.5757, "grad_norm": 7.814599514007568, "learning_rate": 0.000220496511454098},
+ {"step": 29, "epoch": 1.657142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.142641152, "loss": 1.353, "grad_norm": 2.587792158126831, "learning_rate": 0.0002135871586064791},
+ {"step": 30, "epoch": 1.7142857142857144, "cpu_mem": 2.591285248, "gpu_mem": 1.1426304, "loss": 1.3714, "grad_norm": 3.4897141456604004, "learning_rate": 0.00020650918404527775},
+ {"step": 31, "epoch": 1.7714285714285714, "cpu_mem": 2.591285248, "gpu_mem": 1.14266112, "loss": 1.3522, "grad_norm": 2.2583186626434326, "learning_rate": 0.00019928135728662522},
+ {"step": 32, "epoch": 1.8285714285714287, "cpu_mem": 2.591285248, "gpu_mem": 1.142670336, "loss": 1.3582, "grad_norm": 2.4103989601135254, "learning_rate": 0.00019192284522774142},
+ {"step": 33, "epoch": 1.8857142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.142650368, "loss": 1.3951, "grad_norm": 3.2258849143981934, "learning_rate": 0.00018445316131976934},
+ {"step": 34, "epoch": 1.9428571428571428, "cpu_mem": 2.591285248, "gpu_mem": 1.142628864, "loss": 1.3668, "grad_norm": 2.021745204925537, "learning_rate": 0.00017689211382161034},
+ {"step": 35, "epoch": 2.0, "cpu_mem": 2.591285248, "gpu_mem": 1.142516736, "loss": 2.0591, "grad_norm": 2.1437292098999023, "learning_rate": 0.00016925975327198266},
+ {"step": 36, "epoch": 2.057142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.12170752, "loss": 1.3225, "grad_norm": 1.7468737363815308, "learning_rate": 0.00016157631931899697},
+ {"step": 37, "epoch": 2.1142857142857143, "cpu_mem": 2.591285248, "gpu_mem": 1.121716736, "loss": 1.4179, "grad_norm": 3.716902494430542, "learning_rate": 0.0001538621870482483},
+ {"step": 38, "epoch": 2.1714285714285713, "cpu_mem": 2.591285248, "gpu_mem": 1.121687552, "loss": 1.3573, "grad_norm": 2.5589308738708496, "learning_rate": 0.00014613781295175172},
+ {"step": 39, "epoch": 2.2285714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.121705984, "loss": 1.3358, "grad_norm": 2.8327057361602783, "learning_rate": 0.00013842368068100303},
+ {"step": 40, "epoch": 2.2857142857142856, "cpu_mem": 2.591285248, "gpu_mem": 1.121682944, "loss": 1.3182, "grad_norm": 2.0415289402008057, "learning_rate": 0.00013074024672801731},
+ {"step": 41, "epoch": 2.342857142857143, "cpu_mem": 2.591285248, "gpu_mem": 1.12168448, "loss": 1.3512, "grad_norm": 2.6037545204162598, "learning_rate": 0.00012310788617838966},
+ {"step": 42, "epoch": 2.4, "cpu_mem": 2.591285248, "gpu_mem": 1.121713664, "loss": 1.3009, "grad_norm": 3.407050132751465, "learning_rate": 0.00011554683868023067},
+ {"step": 43, "epoch": 2.4571428571428573, "cpu_mem": 2.591285248, "gpu_mem": 1.121729024, "loss": 1.375, "grad_norm": 4.440658092498779, "learning_rate": 0.00010807715477225858},
+ {"step": 44, "epoch": 2.5142857142857142, "cpu_mem": 2.591285248, "gpu_mem": 1.121747456, "loss": 1.2674, "grad_norm": 2.6665470600128174, "learning_rate": 0.00010071864271337478},
+ {"step": 45, "epoch": 2.571428571428571, "cpu_mem": 2.591285248, "gpu_mem": 1.121701376, "loss": 1.2365, "grad_norm": 3.8944313526153564, "learning_rate": 9.34908159547222e-05},
+ {"step": 46, "epoch": 2.6285714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.121695232, "loss": 1.2408, "grad_norm": 2.97174072265625, "learning_rate": 8.641284139352091e-05},
+ {"step": 47, "epoch": 2.685714285714286, "cpu_mem": 2.591285248, "gpu_mem": 1.121689088, "loss": 1.24, "grad_norm": 3.1578757762908936, "learning_rate": 7.950348854590204e-05},
+ {"step": 48, "epoch": 2.742857142857143, "cpu_mem": 2.591285248, "gpu_mem": 1.121693696, "loss": 1.1557, "grad_norm": 3.0455143451690674, "learning_rate": 7.278107977352543e-05},
+ {"step": 49, "epoch": 2.8, "cpu_mem": 2.591285248, "gpu_mem": 1.12168448, "loss": 1.2344, "grad_norm": 3.0066888332366943, "learning_rate": 6.626344169597031e-05},
+ {"step": 50, "epoch": 2.857142857142857, "cpu_mem": 2.591285248, "gpu_mem": 1.121666048, "loss": 1.2225, "grad_norm": 3.4496467113494873, "learning_rate": 5.996785791774478e-05},
+ {"step": 51, "epoch": 2.914285714285714, "cpu_mem": 2.591285248, "gpu_mem": 1.121690624, "loss": 1.2478, "grad_norm": 4.3415117263793945, "learning_rate": 5.391102319527373e-05},
+ {"step": 52, "epoch": 2.9714285714285715, "cpu_mem": 2.591285248, "gpu_mem": 1.121718272, "loss": 1.2959, "grad_norm": 3.4093315601348877, "learning_rate": 4.8108999165406026e-05},
+ {"step": 53, "epoch": 3.0285714285714285, "cpu_mem": 2.591285248, "gpu_mem": 1.142627328, "loss": 1.7767, "grad_norm": 5.994507312774658, "learning_rate": 4.257717175284103e-05},
+ {"step": 54, "epoch": 3.085714285714286,
482
+ "cpu_mem": 2.591285248,
483
+ "gpu_mem": 1.142596608,
484
+ "loss": 1.2086,
485
+ "grad_norm": 4.536657333374023,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 2.591285248,
492
+ "gpu_mem": 1.1426304,
493
+ "loss": 1.1557,
494
+ "grad_norm": 4.9032063484191895,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 2.591285248,
501
+ "gpu_mem": 1.142704128,
502
+ "loss": 1.1267,
503
+ "grad_norm": 4.643110275268555,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 2.591285248,
510
+ "gpu_mem": 1.142647296,
511
+ "loss": 1.159,
512
+ "grad_norm": 3.9950804710388184,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 2.591285248,
519
+ "gpu_mem": 1.142641152,
520
+ "loss": 1.035,
521
+ "grad_norm": 4.866928577423096,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 2.591285248,
528
+ "gpu_mem": 1.14269184,
529
+ "loss": 1.0495,
530
+ "grad_norm": 4.91159725189209,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 2.591285248,
537
+ "gpu_mem": 1.142618112,
538
+ "loss": 1.128,
539
+ "grad_norm": 6.490297794342041,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 2.591285248,
546
+ "gpu_mem": 1.142631936,
547
+ "loss": 1.0631,
548
+ "grad_norm": 5.946484088897705,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 2.591285248,
555
+ "gpu_mem": 1.142633472,
556
+ "loss": 1.122,
557
+ "grad_norm": 6.1243577003479,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 2.591285248,
564
+ "gpu_mem": 1.14262272,
565
+ "loss": 1.1157,
566
+ "grad_norm": 5.4304728507995605,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 2.591285248,
573
+ "gpu_mem": 1.142639616,
574
+ "loss": 1.0507,
575
+ "grad_norm": 5.880957126617432,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 2.591285248,
582
+ "gpu_mem": 1.14266112,
583
+ "loss": 1.0811,
584
+ "grad_norm": 6.808589458465576,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 2.591285248,
591
+ "gpu_mem": 1.142651904,
592
+ "loss": 0.9973,
593
+ "grad_norm": 6.240514278411865,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 2.591285248,
600
+ "gpu_mem": 1.142678016,
601
+ "loss": 1.0386,
602
+ "grad_norm": 5.9608283042907715,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 2.591285248,
609
+ "gpu_mem": 1.142628864,
610
+ "loss": 1.0538,
611
+ "grad_norm": 6.074984550476074,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 2.591285248,
618
+ "gpu_mem": 1.142628864,
619
+ "train_runtime": 374.9512,
620
+ "train_samples_per_second": 11.938,
621
+ "train_steps_per_second": 0.181,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.4457515811218935
624
+ }
625
+ ]
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "alpha": 64,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 4,
+ "r": 32,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 32,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "task_type": null
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "arc_e",
+ "results": 0.7121212121212122
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "ARC_E",
+ "dataset_id": "allenai/ai2_arc",
+ "preprocess_id": "arc_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 32,
+ "alpha": 64,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 21018624
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 4,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-arc_e-r32-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-arc_e-r32-a2",
+ "seed": 42,
+ "timestamp": "2025-08-30T07:55:18.577303"
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-arc_e-r32-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
+ [
+ {
+ "step": 1,
+ "epoch": 0.028169014084507043,
+ "cpu_mem": 2.556440576,
+ "gpu_mem": 1.143415296,
+ "loss": 4.6689,
+ "grad_norm": 80.02104187011719,
+ "learning_rate": 2.1428571428571425e-05
+ },
+ {
+ "step": 2,
+ "epoch": 0.056338028169014086,
+ "cpu_mem": 2.556637184,
+ "gpu_mem": 1.311625216,
+ "loss": 4.5086,
+ "grad_norm": 83.96923065185547,
+ "learning_rate": 4.285714285714285e-05
+ },
+ {
+ "step": 3,
+ "epoch": 0.08450704225352113,
+ "cpu_mem": 2.556833792,
+ "gpu_mem": 1.311603712,
+ "loss": 3.0059,
+ "grad_norm": 49.818973541259766,
+ "learning_rate": 6.428571428571427e-05
+ },
+ {
+ "step": 4,
+ "epoch": 0.11267605633802817,
+ "cpu_mem": 2.556833792,
+ "gpu_mem": 1.311582208,
+ "loss": 2.0779,
+ "grad_norm": 18.78441619873047,
+ "learning_rate": 8.57142857142857e-05
+ },
+ {
+ "step": 5,
+ "epoch": 0.14084507042253522,
+ "cpu_mem": 2.5570304,
+ "gpu_mem": 1.31162368,
+ "loss": 1.5397,
+ "grad_norm": 5.960499286651611,
+ "learning_rate": 0.00010714285714285714
+ },
+ {
+ "step": 6,
+ "epoch": 0.16901408450704225,
+ "cpu_mem": 2.557227008,
+ "gpu_mem": 1.311599104,
+ "loss": 1.4567,
+ "grad_norm": 8.546557426452637,
+ "learning_rate": 0.00012857142857142855
+ },
+ {
+ "step": 7,
+ "epoch": 0.19718309859154928,
+ "cpu_mem": 2.557227008,
+ "gpu_mem": 1.311622144,
+ "loss": 1.6186,
+ "grad_norm": 14.599489212036133,
+ "learning_rate": 0.00015
+ },
+ {
+ "step": 8,
+ "epoch": 0.22535211267605634,
+ "cpu_mem": 2.557227008,
+ "gpu_mem": 1.311580672,
+ "loss": 1.4127,
+ "grad_norm": 6.496516227722168,
+ "learning_rate": 0.0001714285714285714
+ },
+ {
+ "step": 9,
+ "epoch": 0.2535211267605634,
+ "cpu_mem": 2.557227008,
+ "gpu_mem": 1.311582208,
+ "loss": 1.4107,
+ "grad_norm": 4.811722278594971,
+ "learning_rate": 0.00019285714285714286
+ },
+ {
+ "step": 10,
+ "epoch": 0.28169014084507044,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.3115776,
+ "loss": 1.531,
+ "grad_norm": 7.236036777496338,
+ "learning_rate": 0.00021428571428571427
+ },
+ {
+ "step": 11,
+ "epoch": 0.30985915492957744,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311655936,
+ "loss": 1.3515,
+ "grad_norm": 3.067713499069214,
+ "learning_rate": 0.00023571428571428569
+ },
+ {
+ "step": 12,
+ "epoch": 0.3380281690140845,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311629824,
+ "loss": 1.4038,
+ "grad_norm": 3.7601993083953857,
+ "learning_rate": 0.0002571428571428571
+ },
+ {
+ "step": 13,
+ "epoch": 0.36619718309859156,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311580672,
+ "loss": 1.3711,
+ "grad_norm": 2.6803042888641357,
+ "learning_rate": 0.00027857142857142854
+ },
+ {
+ "step": 14,
+ "epoch": 0.39436619718309857,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311602176,
+ "loss": 1.3275,
+ "grad_norm": 1.6125613451004028,
+ "learning_rate": 0.0003
+ },
+ {
+ "step": 15,
+ "epoch": 0.4225352112676056,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311579136,
+ "loss": 1.3446,
+ "grad_norm": 1.7154477834701538,
+ "learning_rate": 0.0002999533773001224
+ },
+ {
+ "step": 16,
+ "epoch": 0.4507042253521127,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311583744,
+ "loss": 1.375,
+ "grad_norm": 1.3047140836715698,
+ "learning_rate": 0.0002998135381828383
+ },
+ {
+ "step": 17,
+ "epoch": 0.4788732394366197,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311620608,
+ "loss": 1.3274,
+ "grad_norm": 1.3025691509246826,
+ "learning_rate": 0.00029958056957717696
+ },
+ {
+ "step": 18,
+ "epoch": 0.5070422535211268,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31163136,
+ "loss": 1.4244,
+ "grad_norm": 3.0251400470733643,
+ "learning_rate": 0.0002992546163048102
+ },
+ {
+ "step": 19,
+ "epoch": 0.5352112676056338,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311574528,
+ "loss": 1.3444,
+ "grad_norm": 2.2817976474761963,
+ "learning_rate": 0.0002988358809900258
+ },
+ {
+ "step": 20,
+ "epoch": 0.5633802816901409,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311645184,
+ "loss": 1.4019,
+ "grad_norm": 2.1219711303710938,
+ "learning_rate": 0.0002983246239337692
+ },
+ {
+ "step": 21,
+ "epoch": 0.5915492957746479,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311643648,
+ "loss": 1.331,
+ "grad_norm": 1.734360694885254,
+ "learning_rate": 0.0002977211629518312
+ },
+ {
+ "step": 22,
+ "epoch": 0.6197183098591549,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31160064,
+ "loss": 1.3231,
+ "grad_norm": 1.4685757160186768,
+ "learning_rate": 0.00029702587317728153
+ },
+ {
+ "step": 23,
+ "epoch": 0.647887323943662,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311617536,
+ "loss": 1.3441,
+ "grad_norm": 1.3601280450820923,
+ "learning_rate": 0.0002962391868272735
+ },
+ {
+ "step": 24,
+ "epoch": 0.676056338028169,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311574528,
+ "loss": 1.3631,
+ "grad_norm": 1.5260316133499146,
+ "learning_rate": 0.00029536159293436166
+ },
+ {
+ "step": 25,
+ "epoch": 0.704225352112676,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311603712,
+ "loss": 1.3865,
+ "grad_norm": 1.4043406248092651,
+ "learning_rate": 0.00029439363704250176
+ },
+ {
+ "step": 26,
+ "epoch": 0.7323943661971831,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311583744,
+ "loss": 1.4606,
+ "grad_norm": 1.6535191535949707,
+ "learning_rate": 0.00029333592086792107
+ },
+ {
+ "step": 27,
+ "epoch": 0.7605633802816901,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311609856,
+ "loss": 1.3359,
+ "grad_norm": 0.8193792700767517,
+ "learning_rate": 0.0002921891019250697
+ },
+ {
+ "step": 28,
+ "epoch": 0.7887323943661971,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311609856,
+ "loss": 1.3871,
+ "grad_norm": 1.0423256158828735,
+ "learning_rate": 0.0002909538931178862
+ },
+ {
+ "step": 29,
+ "epoch": 0.8169014084507042,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311588352,
+ "loss": 1.2794,
+ "grad_norm": 1.2377030849456787,
+ "learning_rate": 0.00028963106229663063
+ },
+ {
+ "step": 30,
+ "epoch": 0.8450704225352113,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311579136,
+ "loss": 1.3537,
+ "grad_norm": 1.0612547397613525,
+ "learning_rate": 0.00028822143178056114
+ },
+ {
+ "step": 31,
+ "epoch": 0.8732394366197183,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311597568,
+ "loss": 1.3698,
+ "grad_norm": 0.8752574920654297,
+ "learning_rate": 0.00028672587784675096
+ },
+ {
+ "step": 32,
+ "epoch": 0.9014084507042254,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311620608,
+ "loss": 1.3323,
+ "grad_norm": 1.2664260864257812,
+ "learning_rate": 0.0002851453301853628
+ },
+ {
+ "step": 33,
+ "epoch": 0.9295774647887324,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311617536,
+ "loss": 1.375,
+ "grad_norm": 1.1751697063446045,
+ "learning_rate": 0.00028348077132172027
+ },
+ {
+ "step": 34,
+ "epoch": 0.9577464788732394,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311620608,
+ "loss": 1.4226,
+ "grad_norm": 1.6566500663757324,
+ "learning_rate": 0.0002817332360055343
+ },
+ {
+ "step": 35,
+ "epoch": 0.9859154929577465,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311602176,
+ "loss": 1.3242,
+ "grad_norm": 1.2318401336669922,
+ "learning_rate": 0.0002799038105676658
+ },
+ {
+ "step": 36,
+ "epoch": 1.0140845070422535,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395667456,
+ "loss": 2.0355,
+ "grad_norm": 2.13344407081604,
+ "learning_rate": 0.0002779936322448233
+ },
+ {
+ "step": 37,
+ "epoch": 1.0422535211267605,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395672064,
+ "loss": 1.3796,
+ "grad_norm": 1.7476377487182617,
+ "learning_rate": 0.0002760038884726157
+ },
+ {
+ "step": 38,
+ "epoch": 1.0704225352112675,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39565056,
+ "loss": 1.2405,
+ "grad_norm": 1.125243902206421,
+ "learning_rate": 0.00027393581614739923
+ },
+ {
+ "step": 39,
+ "epoch": 1.0985915492957747,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395639808,
+ "loss": 1.3792,
+ "grad_norm": 2.231471538543701,
+ "learning_rate": 0.0002717907008573785
+ },
+ {
+ "step": 40,
+ "epoch": 1.1267605633802817,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395702784,
+ "loss": 1.3537,
+ "grad_norm": 1.4337620735168457,
+ "learning_rate": 0.0002695698760834384
+ },
+ {
+ "step": 41,
+ "epoch": 1.1549295774647887,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395662848,
+ "loss": 1.3459,
+ "grad_norm": 1.9487059116363525,
+ "learning_rate": 0.00026727472237020447
+ },
+ {
+ "step": 42,
+ "epoch": 1.1830985915492958,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395705856,
+ "loss": 1.3556,
+ "grad_norm": 1.2340912818908691,
+ "learning_rate": 0.00026490666646784665
+ },
+ {
+ "step": 43,
+ "epoch": 1.2112676056338028,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395655168,
+ "loss": 1.3948,
+ "grad_norm": 1.187638521194458,
+ "learning_rate": 0.0002624671804451601
+ },
+ {
+ "step": 44,
+ "epoch": 1.2394366197183098,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39571968,
+ "loss": 1.3685,
+ "grad_norm": 1.508293628692627,
+ "learning_rate": 0.0002599577807744739
+ },
+ {
+ "step": 45,
+ "epoch": 1.267605633802817,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395687424,
+ "loss": 1.3882,
+ "grad_norm": 0.9527984261512756,
+ "learning_rate": 0.0002573800273889577
+ },
+ {
+ "step": 46,
+ "epoch": 1.295774647887324,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395692032,
+ "loss": 1.3874,
+ "grad_norm": 1.313702940940857,
+ "learning_rate": 0.0002547355227129109
+ },
+ {
+ "step": 47,
+ "epoch": 1.323943661971831,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395638272,
+ "loss": 1.4268,
+ "grad_norm": 2.3709449768066406,
+ "learning_rate": 0.00025202591066563786
+ },
+ {
+ "step": 48,
+ "epoch": 1.352112676056338,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395652096,
+ "loss": 1.3532,
+ "grad_norm": 1.4290915727615356,
+ "learning_rate": 0.0002492528756395289
+ },
+ {
+ "step": 49,
+ "epoch": 1.380281690140845,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395641344,
+ "loss": 1.3,
+ "grad_norm": 0.8550411462783813,
+ "learning_rate": 0.0002464181414529809
+ },
+ {
+ "step": 50,
+ "epoch": 1.408450704225352,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395655168,
+ "loss": 1.324,
+ "grad_norm": 1.0939958095550537,
+ "learning_rate": 0.00024352347027881003
+ },
+ {
+ "step": 51,
+ "epoch": 1.436619718309859,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395707392,
+ "loss": 1.3759,
+ "grad_norm": 1.5617879629135132,
+ "learning_rate": 0.0002405706615488216
+ },
+ {
+ "step": 52,
+ "epoch": 1.4647887323943662,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395655168,
+ "loss": 1.3453,
+ "grad_norm": 1.4145621061325073,
+ "learning_rate": 0.00023756155083521846
+ },
+ {
+ "step": 53,
+ "epoch": 1.4929577464788732,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395724288,
+ "loss": 1.3062,
+ "grad_norm": 0.8992661833763123,
+ "learning_rate": 0.00023449800870954326
+ },
+ {
+ "step": 54,
+ "epoch": 1.5211267605633803,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395692032,
+ "loss": 1.3001,
+ "grad_norm": 1.7878972291946411,
+ "learning_rate": 0.0002313819395798639
+ },
+ {
+ "step": 55,
+ "epoch": 1.5492957746478875,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395701248,
+ "loss": 1.4296,
+ "grad_norm": 2.7013356685638428,
+ "learning_rate": 0.0002282152805069247
+ },
+ {
+ "step": 56,
+ "epoch": 1.5774647887323945,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395676672,
+ "loss": 1.3501,
+ "grad_norm": 1.583625078201294,
+ "learning_rate": 0.000225
+ },
+ {
+ "step": 57,
+ "epoch": 1.6056338028169015,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395710464,
+ "loss": 1.2723,
+ "grad_norm": 0.6726813316345215,
+ "learning_rate": 0.00022173809679319772
+ },
+ {
+ "step": 58,
+ "epoch": 1.6338028169014085,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395692032,
+ "loss": 1.2848,
+ "grad_norm": 0.9176018834114075,
+ "learning_rate": 0.00021843159860297442
+ },
+ {
+ "step": 59,
+ "epoch": 1.6619718309859155,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395678208,
+ "loss": 1.3193,
+ "grad_norm": 1.1201019287109375,
+ "learning_rate": 0.00021508256086763368
+ },
+ {
+ "step": 60,
+ "epoch": 1.6901408450704225,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395716608,
+ "loss": 1.3011,
+ "grad_norm": 1.3688689470291138,
+ "learning_rate": 0.00021169306546959174
+ },
+ {
+ "step": 61,
+ "epoch": 1.7183098591549295,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395649024,
+ "loss": 1.2904,
+ "grad_norm": 1.38422429561615,
+ "learning_rate": 0.0002082652194412042
+ },
+ {
+ "step": 62,
+ "epoch": 1.7464788732394365,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39569664,
+ "loss": 1.3664,
+ "grad_norm": 1.3409178256988525,
+ "learning_rate": 0.00020480115365495926
+ },
+ {
+ "step": 63,
+ "epoch": 1.7746478873239435,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395645952,
+ "loss": 1.3458,
+ "grad_norm": 1.7384254932403564,
+ "learning_rate": 0.00020130302149885031
+ },
+ {
+ "step": 64,
+ "epoch": 1.8028169014084507,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395695104,
+ "loss": 1.3653,
+ "grad_norm": 1.7220115661621094,
+ "learning_rate": 0.00019777299753775265
+ },
+ {
+ "step": 65,
+ "epoch": 1.8309859154929577,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395693568,
+ "loss": 1.3209,
+ "grad_norm": 1.4237152338027954,
+ "learning_rate": 0.00019421327616163563
+ },
+ {
+ "step": 66,
+ "epoch": 1.8591549295774648,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395712,
+ "loss": 1.2984,
+ "grad_norm": 1.356772780418396,
+ "learning_rate": 0.00019062607022145078
+ },
+ {
+ "step": 67,
+ "epoch": 1.887323943661972,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395653632,
+ "loss": 1.3236,
+ "grad_norm": 1.1635974645614624,
+ "learning_rate": 0.00018701360965354402
+ },
+ {
+ "step": 68,
+ "epoch": 1.915492957746479,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39566592,
+ "loss": 1.348,
+ "grad_norm": 0.9813856482505798,
+ "learning_rate": 0.00018337814009344714
+ },
+ {
+ "step": 69,
+ "epoch": 1.943661971830986,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395690496,
+ "loss": 1.3142,
+ "grad_norm": 1.3948644399642944,
+ "learning_rate": 0.0001797219214799096
+ },
+ {
+ "step": 70,
+ "epoch": 1.971830985915493,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395667456,
+ "loss": 1.242,
+ "grad_norm": 1.0734961032867432,
+ "learning_rate": 0.00017604722665003956
+ },
+ {
+ "step": 71,
+ "epoch": 2.0,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395503104,
+ "loss": 1.922,
+ "grad_norm": 2.428542375564575,
+ "learning_rate": 0.00017235633992642615
+ },
+ {
+ "step": 72,
+ "epoch": 2.028169014084507,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311612928,
+ "loss": 1.2839,
+ "grad_norm": 1.276829481124878,
+ "learning_rate": 0.00016865155569712278
+ },
+ {
+ "step": 73,
+ "epoch": 2.056338028169014,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311576064,
+ "loss": 1.241,
+ "grad_norm": 1.3086708784103394,
+ "learning_rate": 0.0001649351769893725
+ },
+ {
+ "step": 74,
+ "epoch": 2.084507042253521,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311635968,
+ "loss": 1.2416,
+ "grad_norm": 1.5263572931289673,
+ "learning_rate": 0.00016120951403796364
+ },
+ {
+ "step": 75,
+ "epoch": 2.112676056338028,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311603712,
+ "loss": 1.2897,
+ "grad_norm": 2.197709560394287,
+ "learning_rate": 0.00015747688284910457
+ },
+ {
+ "step": 76,
+ "epoch": 2.140845070422535,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311614464,
+ "loss": 1.1855,
+ "grad_norm": 1.6668955087661743,
+ "learning_rate": 0.00015373960376071093
+ },
+ {
+ "step": 77,
+ "epoch": 2.169014084507042,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311651328,
+ "loss": 1.2452,
+ "grad_norm": 1.952316403388977,
+ "learning_rate": 0.00015
+ },
+ {
+ "step": 78,
+ "epoch": 2.1971830985915495,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311635968,
+ "loss": 1.2823,
+ "grad_norm": 2.414081573486328,
+ "learning_rate": 0.00014626039623928907
+ },
+ {
+ "step": 79,
+ "epoch": 2.2253521126760565,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311586816,
+ "loss": 1.1377,
+ "grad_norm": 2.1895029544830322,
+ "learning_rate": 0.0001425231171508954
+ },
+ {
+ "step": 80,
+ "epoch": 2.2535211267605635,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31163136,
+ "loss": 1.244,
+ "grad_norm": 1.7564090490341187,
+ "learning_rate": 0.00013879048596203636
+ },
+ {
+ "step": 81,
+ "epoch": 2.2816901408450705,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311617536,
+ "loss": 1.3114,
+ "grad_norm": 2.8045568466186523,
+ "learning_rate": 0.0001350648230106275
+ },
+ {
+ "step": 82,
+ "epoch": 2.3098591549295775,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31158528,
+ "loss": 1.283,
+ "grad_norm": 2.8252458572387695,
+ "learning_rate": 0.00013134844430287725
+ },
+ {
+ "step": 83,
+ "epoch": 2.3380281690140845,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311635968,
+ "loss": 1.2006,
+ "grad_norm": 2.767235517501831,
+ "learning_rate": 0.0001276436600735738
+ },
+ {
+ "step": 84,
+ "epoch": 2.3661971830985915,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311574528,
+ "loss": 1.2354,
+ "grad_norm": 2.006580352783203,
+ "learning_rate": 0.00012395277334996044
+ },
+ {
+ "step": 85,
+ "epoch": 2.3943661971830985,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311620608,
+ "loss": 1.2189,
+ "grad_norm": 2.486454486846924,
+ "learning_rate": 0.00012027807852009038
+ },
+ {
+ "step": 86,
+ "epoch": 2.4225352112676055,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311574528,
+ "loss": 1.1596,
+ "grad_norm": 1.7841145992279053,
+ "learning_rate": 0.00011662185990655284
+ },
+ {
+ "step": 87,
+ "epoch": 2.4507042253521125,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311605248,
+ "loss": 1.1458,
+ "grad_norm": 2.067601203918457,
+ "learning_rate": 0.00011298639034645593
+ },
+ {
+ "step": 88,
+ "epoch": 2.4788732394366195,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311580672,
+ "loss": 1.1451,
+ "grad_norm": 2.60972261428833,
+ "learning_rate": 0.00010937392977854923
+ },
+ {
+ "step": 89,
+ "epoch": 2.507042253521127,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311634432,
+ "loss": 1.0901,
+ "grad_norm": 1.8266022205352783,
+ "learning_rate": 0.00010578672383836435
+ },
+ {
+ "step": 90,
+ "epoch": 2.535211267605634,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311616,
+ "loss": 1.1689,
+ "grad_norm": 2.385910987854004,
+ "learning_rate": 0.00010222700246224735
+ },
+ {
+ "step": 91,
+ "epoch": 2.563380281690141,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311565312,
+ "loss": 1.1882,
+ "grad_norm": 2.8446080684661865,
+ "learning_rate": 9.869697850114969e-05
+ },
+ {
+ "step": 92,
+ "epoch": 2.591549295774648,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311589888,
+ "loss": 1.2837,
+ "grad_norm": 4.627291202545166,
+ "learning_rate": 9.519884634504074e-05
+ },
+ {
+ "step": 93,
+ "epoch": 2.619718309859155,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31159296,
+ "loss": 1.1516,
+ "grad_norm": 3.087660551071167,
+ "learning_rate": 9.17347805587958e-05
+ },
+ {
+ "step": 94,
+ "epoch": 2.647887323943662,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31158528,
+ "loss": 1.1214,
+ "grad_norm": 3.74629282951355,
+ "learning_rate": 8.830693453040829e-05
+ },
+ {
+ "step": 95,
+ "epoch": 2.676056338028169,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.31162368,
+ "loss": 1.1599,
+ "grad_norm": 3.486809730529785,
+ "learning_rate": 8.491743913236628e-05
+ },
+ {
+ "step": 96,
+ "epoch": 2.704225352112676,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311632896,
+ "loss": 1.0801,
+ "grad_norm": 3.3228325843811035,
+ "learning_rate": 8.156840139702554e-05
+ },
+ {
+ "step": 97,
+ "epoch": 2.732394366197183,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311576064,
+ "loss": 1.0717,
+ "grad_norm": 3.0872833728790283,
+ "learning_rate": 7.82619032068023e-05
+ },
+ {
+ "step": 98,
+ "epoch": 2.76056338028169,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311576064,
+ "loss": 1.1351,
+ "grad_norm": 2.749499797821045,
+ "learning_rate": 7.500000000000002e-05
+ },
+ {
+ "step": 99,
+ "epoch": 2.788732394366197,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311572992,
+ "loss": 1.1429,
+ "grad_norm": 3.283785343170166,
+ "learning_rate": 7.17847194930753e-05
+ },
+ {
+ "step": 100,
+ "epoch": 2.816901408450704,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311571456,
+ "loss": 1.1013,
+ "grad_norm": 3.4343783855438232,
+ "learning_rate": 6.86180604201361e-05
+ },
+ {
+ "step": 101,
+ "epoch": 2.845070422535211,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311614464,
+ "loss": 0.9924,
+ "grad_norm": 3.1077940464019775,
+ "learning_rate": 6.550199129045668e-05
+ },
+ {
+ "step": 102,
+ "epoch": 2.873239436619718,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311553024,
+ "loss": 1.196,
+ "grad_norm": 3.4048402309417725,
+ "learning_rate": 6.243844916478155e-05
+ },
+ {
+ "step": 103,
+ "epoch": 2.9014084507042255,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311602176,
+ "loss": 1.0887,
+ "grad_norm": 2.745129346847534,
+ "learning_rate": 5.9429338451178355e-05
+ },
+ {
+ "step": 104,
+ "epoch": 2.9295774647887325,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311665152,
+ "loss": 1.2042,
+ "grad_norm": 4.17969274520874,
+ "learning_rate": 5.6476529721189974e-05
+ },
+ {
+ "step": 105,
+ "epoch": 2.9577464788732395,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311617536,
+ "loss": 1.0371,
+ "grad_norm": 2.640275716781616,
+ "learning_rate": 5.358185854701909e-05
+ },
+ {
+ "step": 106,
+ "epoch": 2.9859154929577465,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.311599104,
+ "loss": 1.1826,
+ "grad_norm": 3.1509358882904053,
+ "learning_rate": 5.074712436047112e-05
+ },
+ {
+ "step": 107,
+ "epoch": 3.0140845070422535,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395690496,
+ "loss": 1.2879,
+ "grad_norm": 5.848189353942871,
+ "learning_rate": 4.7974089334362057e-05
+ },
+ {
+ "step": 108,
+ "epoch": 3.0422535211267605,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395672064,
+ "loss": 0.938,
+ "grad_norm": 2.7139761447906494,
+ "learning_rate": 4.526447728708908e-05
+ },
+ {
+ "step": 109,
+ "epoch": 3.0704225352112675,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395661312,
+ "loss": 0.8768,
+ "grad_norm": 3.0623884201049805,
+ "learning_rate": 4.261997261104223e-05
+ },
+ {
+ "step": 110,
+ "epoch": 3.0985915492957745,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395715072,
+ "loss": 0.8991,
+ "grad_norm": 3.3740732669830322,
+ "learning_rate": 4.004221922552608e-05
+ },
+ {
+ "step": 111,
+ "epoch": 3.1267605633802815,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395675136,
+ "loss": 0.9248,
+ "grad_norm": 3.0837693214416504,
+ "learning_rate": 3.753281955483985e-05
+ },
+ {
+ "step": 112,
+ "epoch": 3.1549295774647885,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395693568,
+ "loss": 0.8681,
+ "grad_norm": 2.892775535583496,
+ "learning_rate": 3.509333353215331e-05
+ },
+ {
+ "step": 113,
+ "epoch": 3.183098591549296,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395756544,
+ "loss": 0.769,
+ "grad_norm": 2.8671891689300537,
+ "learning_rate": 3.2725277629795526e-05
+ },
+ {
+ "step": 114,
+ "epoch": 3.211267605633803,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395684352,
+ "loss": 0.8145,
+ "grad_norm": 3.2121176719665527,
+ "learning_rate": 3.0430123916561672e-05
+ },
+ {
+ "step": 115,
+ "epoch": 3.23943661971831,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395678208,
+ "loss": 0.9297,
+ "grad_norm": 3.5213088989257812,
+ "learning_rate": 2.8209299142621522e-05
+ },
+ {
+ "step": 116,
+ "epoch": 3.267605633802817,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395693568,
+ "loss": 0.8152,
+ "grad_norm": 4.156257629394531,
+ "learning_rate": 2.6064183852600797e-05
+ },
+ {
+ "step": 117,
+ "epoch": 3.295774647887324,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395708928,
+ "loss": 0.7545,
+ "grad_norm": 3.5243821144104004,
+ "learning_rate": 2.3996111527384288e-05
+ },
+ {
+ "step": 118,
+ "epoch": 3.323943661971831,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395699712,
+ "loss": 0.8089,
+ "grad_norm": 4.4013495445251465,
+ "learning_rate": 2.2006367755176655e-05
+ },
+ {
+ "step": 119,
+ "epoch": 3.352112676056338,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395690496,
+ "loss": 0.7094,
+ "grad_norm": 3.5775132179260254,
+ "learning_rate": 2.009618943233419e-05
+ },
+ {
+ "step": 120,
+ "epoch": 3.380281690140845,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395708928,
+ "loss": 0.8269,
+ "grad_norm": 4.176782131195068,
+ "learning_rate": 1.82667639944657e-05
+ },
+ {
+ "step": 121,
+ "epoch": 3.408450704225352,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395707392,
+ "loss": 0.6735,
+ "grad_norm": 4.356370449066162,
+ "learning_rate": 1.6519228678279718e-05
+ },
+ {
+ "step": 122,
+ "epoch": 3.436619718309859,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395664384,
+ "loss": 0.5891,
+ "grad_norm": 3.9058308601379395,
+ "learning_rate": 1.4854669814637143e-05
+ },
+ {
+ "step": 123,
+ "epoch": 3.464788732394366,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39569664,
+ "loss": 0.674,
+ "grad_norm": 4.811568260192871,
+ "learning_rate": 1.3274122153249028e-05
+ },
+ {
+ "step": 124,
+ "epoch": 3.492957746478873,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39565056,
+ "loss": 0.696,
+ "grad_norm": 5.005099773406982,
+ "learning_rate": 1.1778568219438839e-05
+ },
+ {
+ "step": 125,
+ "epoch": 3.52112676056338,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395695104,
+ "loss": 0.6493,
+ "grad_norm": 4.956096649169922,
+ "learning_rate": 1.036893770336938e-05
+ },
+ {
+ "step": 126,
+ "epoch": 3.5492957746478875,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395645952,
+ "loss": 0.6994,
+ "grad_norm": 5.525113582611084,
+ "learning_rate": 9.046106882113751e-06
+ },
+ {
+ "step": 127,
+ "epoch": 3.5774647887323945,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39565824,
+ "loss": 0.6784,
+ "grad_norm": 4.813937187194824,
+ "learning_rate": 7.810898074930243e-06
+ },
+ {
+ "step": 128,
+ "epoch": 3.6056338028169015,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395682816,
+ "loss": 0.6895,
+ "grad_norm": 5.974727630615234,
+ "learning_rate": 6.664079132078881e-06
+ },
+ {
+ "step": 129,
+ "epoch": 3.6338028169014085,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395644416,
+ "loss": 0.6212,
+ "grad_norm": 5.571749210357666,
+ "learning_rate": 5.606362957498195e-06
+ },
+ {
+ "step": 130,
+ "epoch": 3.6619718309859155,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395647488,
+ "loss": 0.7407,
+ "grad_norm": 6.153623580932617,
+ "learning_rate": 4.638407065638322e-06
+ },
+ {
+ "step": 131,
+ "epoch": 3.6901408450704225,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395659776,
+ "loss": 0.6919,
+ "grad_norm": 7.186251163482666,
+ "learning_rate": 3.760813172726457e-06
+ },
+ {
+ "step": 132,
+ "epoch": 3.7183098591549295,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395624448,
+ "loss": 0.7357,
+ "grad_norm": 5.518382549285889,
+ "learning_rate": 2.9741268227184255e-06
+ },
+ {
+ "step": 133,
+ "epoch": 3.7464788732394365,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39566592,
+ "loss": 0.645,
+ "grad_norm": 4.9238481521606445,
+ "learning_rate": 2.2788370481687965e-06
+ },
+ {
+ "step": 134,
+ "epoch": 3.7746478873239435,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.39568128,
+ "loss": 0.7276,
+ "grad_norm": 5.81230354309082,
+ "learning_rate": 1.6753760662307215e-06
+ },
+ {
+ "step": 135,
+ "epoch": 3.802816901408451,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395645952,
+ "loss": 0.7939,
+ "grad_norm": 7.195316791534424,
+ "learning_rate": 1.1641190099741904e-06
+ },
+ {
+ "step": 136,
+ "epoch": 3.830985915492958,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395653632,
+ "loss": 0.6268,
+ "grad_norm": 5.663719177246094,
+ "learning_rate": 7.453836951897885e-07
+ },
+ {
+ "step": 137,
+ "epoch": 3.859154929577465,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395675136,
+ "loss": 0.6477,
+ "grad_norm": 6.333588600158691,
+ "learning_rate": 4.194304228229806e-07
+ },
+ {
+ "step": 138,
+ "epoch": 3.887323943661972,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395685888,
+ "loss": 0.7271,
+ "grad_norm": 7.243999004364014,
+ "learning_rate": 1.8646181716164831e-07
+ },
+ {
+ "step": 139,
+ "epoch": 3.915492957746479,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395678208,
+ "loss": 0.6908,
+ "grad_norm": 6.188854217529297,
+ "learning_rate": 4.662269987756317e-08
+ },
+ {
+ "step": 140,
+ "epoch": 3.943661971830986,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395712,
+ "loss": 0.6887,
+ "grad_norm": 5.151924133300781,
+ "learning_rate": 0.0
+ },
+ {
+ "step": 140,
+ "epoch": 3.943661971830986,
+ "cpu_mem": 2.557423616,
+ "gpu_mem": 1.395712,
+ "train_runtime": 679.7329,
+ "train_samples_per_second": 13.246,
+ "train_steps_per_second": 0.206,
+ "total_flos": 0.0,
+ "train_loss": 1.2421479872294834
+ }
+ ]
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "alpha": 4,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 4,
+ "r": 2,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 2,
+ "target_modules": [
+ "o_proj",
+ "k_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "task_type": null
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "boolq",
+ "results": 0.6785932721712539
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "BOOLQ",
+ "dataset_id": "google/boolq",
+ "preprocess_id": "boolq_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 2,
+ "alpha": 4,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 1307064
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 2,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-boolq-r2-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-boolq-r2-a2",
+ "seed": 42,
+ "timestamp": "2025-08-29T13:54:03.881222"
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-boolq-r2-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
+ [
+ {
+ "step": 1,
+ "epoch": 0.006779661016949152,
+ "cpu_mem": 1.692184576,
+ "gpu_mem": 1.039525888,
+ "loss": 8.874,
+ "grad_norm": 291.3755798339844,
+ "learning_rate": 9.999999999999999e-06
+ },
+ {
+ "step": 2,
+ "epoch": 0.013559322033898305,
+ "cpu_mem": 1.774481408,
+ "gpu_mem": 1.067201024,
+ "loss": 8.9593,
+ "grad_norm": 313.7054443359375,
+ "learning_rate": 1.9999999999999998e-05
+ },
+ {
+ "step": 3,
+ "epoch": 0.020338983050847456,
+ "cpu_mem": 1.775071232,
+ "gpu_mem": 1.067119616,
+ "loss": 7.8833,
+ "grad_norm": 301.32061767578125,
+ "learning_rate": 2.9999999999999997e-05
+ },
+ {
+ "step": 4,
+ "epoch": 0.02711864406779661,
+ "cpu_mem": 1.77625088,
+ "gpu_mem": 1.067119616,
+ "loss": 6.035,
+ "grad_norm": 295.67266845703125,
+ "learning_rate": 3.9999999999999996e-05
+ },
+ {
+ "step": 5,
+ "epoch": 0.03389830508474576,
+ "cpu_mem": 1.776840704,
+ "gpu_mem": 1.067055104,
+ "loss": 3.8316,
+ "grad_norm": 244.2612762451172,
+ "learning_rate": 4.9999999999999996e-05
+ },
+ {
+ "step": 6,
+ "epoch": 0.04067796610169491,
+ "cpu_mem": 1.77723392,
+ "gpu_mem": 1.067075072,
+ "loss": 2.2108,
+ "grad_norm": 140.66513061523438,
+ "learning_rate": 5.9999999999999995e-05
+ },
+ {
+ "step": 7,
+ "epoch": 0.04745762711864407,
+ "cpu_mem": 1.77821696,
+ "gpu_mem": 1.067127296,
+ "loss": 1.3536,
+ "grad_norm": 75.02676391601562,
+ "learning_rate": 7e-05
+ },
+ {
+ "step": 8,
+ "epoch": 0.05423728813559322,
+ "cpu_mem": 1.779003392,
+ "gpu_mem": 1.067213312,
+ "loss": 0.8221,
+ "grad_norm": 44.54599380493164,
+ "learning_rate": 7.999999999999999e-05
+ },
+ {
+ "step": 9,
+ "epoch": 0.061016949152542375,
+ "cpu_mem": 1.779396608,
+ "gpu_mem": 1.067121152,
+ "loss": 1.147,
+ "grad_norm": 125.10873413085938,
+ "learning_rate": 8.999999999999999e-05
+ },
+ {
+ "step": 10,
+ "epoch": 0.06779661016949153,
+ "cpu_mem": 1.78018304,
+ "gpu_mem": 1.067021312,
+ "loss": 0.7799,
+ "grad_norm": 66.12985229492188,
+ "learning_rate": 9.999999999999999e-05
+ },
+ {
+ "step": 11,
+ "epoch": 0.07457627118644068,
+ "cpu_mem": 1.780576256,
+ "gpu_mem": 1.06712576,
+ "loss": 1.2022,
+ "grad_norm": 143.1725311279297,
+ "learning_rate": 0.00010999999999999998
+ },
+ {
+ "step": 12,
+ "epoch": 0.08135593220338982,
+ "cpu_mem": 1.780969472,
+ "gpu_mem": 1.067497472,
+ "loss": 1.0554,
+ "grad_norm": 115.85326385498047,
+ "learning_rate": 0.00011999999999999999
+ },
+ {
+ "step": 13,
+ "epoch": 0.08813559322033898,
+ "cpu_mem": 1.781362688,
+ "gpu_mem": 1.067101184,
+ "loss": 0.6992,
+ "grad_norm": 21.397693634033203,
+ "learning_rate": 0.00013
+ },
+ {
+ "step": 14,
+ "epoch": 0.09491525423728814,
+ "cpu_mem": 1.781755904,
+ "gpu_mem": 1.067078144,
+ "loss": 0.6819,
+ "grad_norm": 14.809820175170898,
+ "learning_rate": 0.00014
+ },
+ {
+ "step": 15,
+ "epoch": 0.1016949152542373,
+ "cpu_mem": 1.781952512,
+ "gpu_mem": 1.067016704,
+ "loss": 0.7285,
+ "grad_norm": 27.61170196533203,
+ "learning_rate": 0.00015
+ },
+ {
+ "step": 16,
+ "epoch": 0.10847457627118644,
+ "cpu_mem": 1.782345728,
+ "gpu_mem": 1.067101184,
+ "loss": 0.7103,
+ "grad_norm": 21.049345016479492,
+ "learning_rate": 0.00015999999999999999
+ },
+ {
+ "step": 17,
+ "epoch": 0.1152542372881356,
+ "cpu_mem": 1.782738944,
+ "gpu_mem": 1.06714112,
+ "loss": 0.8616,
+ "grad_norm": 82.16301727294922,
+ "learning_rate": 0.00016999999999999999
+ },
+ {
+ "step": 18,
+ "epoch": 0.12203389830508475,
+ "cpu_mem": 1.782935552,
+ "gpu_mem": 1.067204096,
+ "loss": 0.7385,
+ "grad_norm": 39.1813850402832,
+ "learning_rate": 0.00017999999999999998
+ },
+ {
+ "step": 19,
+ "epoch": 0.1288135593220339,
+ "cpu_mem": 1.783328768,
+ "gpu_mem": 1.06704128,
+ "loss": 0.7023,
+ "grad_norm": 9.508649826049805,
+ "learning_rate": 0.00018999999999999998
+ },
+ {
+ "step": 20,
+ "epoch": 0.13559322033898305,
+ "cpu_mem": 1.7841152,
+ "gpu_mem": 1.067153408,
+ "loss": 0.681,
+ "grad_norm": 37.81815719604492,
+ "learning_rate": 0.00019999999999999998
+ },
+ {
+ "step": 21,
+ "epoch": 0.1423728813559322,
+ "cpu_mem": 1.784311808,
+ "gpu_mem": 1.067311616,
+ "loss": 0.6711,
+ "grad_norm": 31.313589096069336,
+ "learning_rate": 0.00020999999999999998
+ },
+ {
+ "step": 22,
+ "epoch": 0.14915254237288136,
+ "cpu_mem": 1.784393728,
+ "gpu_mem": 1.067204096,
+ "loss": 0.7823,
+ "grad_norm": 41.10369873046875,
+ "learning_rate": 0.00021999999999999995
+ },
+ {
+ "step": 23,
+ "epoch": 0.15593220338983052,
+ "cpu_mem": 1.784983552,
+ "gpu_mem": 1.067176448,
+ "loss": 0.7426,
+ "grad_norm": 43.635948181152344,
+ "learning_rate": 0.00023
+ },
+ {
+ "step": 24,
+ "epoch": 0.16271186440677965,
+ "cpu_mem": 1.78518016,
+ "gpu_mem": 1.06723328,
+ "loss": 0.6711,
+ "grad_norm": 38.893856048583984,
+ "learning_rate": 0.00023999999999999998
+ },
+ {
+ "step": 25,
+ "epoch": 0.1694915254237288,
+ "cpu_mem": 1.78536448,
+ "gpu_mem": 1.06701824,
+ "loss": 0.7459,
+ "grad_norm": 34.04146194458008,
+ "learning_rate": 0.00025
+ },
+ {
+ "step": 26,
+ "epoch": 0.17627118644067796,
+ "cpu_mem": 1.786150912,
+ "gpu_mem": 1.067073536,
+ "loss": 0.7823,
+ "grad_norm": 32.66416549682617,
+ "learning_rate": 0.00026
+ },
+ {
+ "step": 27,
+ "epoch": 0.18305084745762712,
+ "cpu_mem": 1.786343424,
+ "gpu_mem": 1.067365376,
+ "loss": 0.7429,
+ "grad_norm": 37.693138122558594,
+ "learning_rate": 0.00027
+ },
+ {
+ "step": 28,
+ "epoch": 0.18983050847457628,
+ "cpu_mem": 1.786376192,
+ "gpu_mem": 1.067044352,
+ "loss": 0.6878,
+ "grad_norm": 9.707056045532227,
+ "learning_rate": 0.00028
+ },
+ {
+ "step": 29,
+ "epoch": 0.19661016949152543,
+ "cpu_mem": 1.786769408,
+ "gpu_mem": 1.067108864,
+ "loss": 0.6581,
+ "grad_norm": 12.347441673278809,
+ "learning_rate": 0.00029
+ },
+ {
+ "step": 30,
+ "epoch": 0.2033898305084746,
+ "cpu_mem": 1.786966016,
+ "gpu_mem": 1.0671872,
+ "loss": 0.6763,
+ "grad_norm": 14.598573684692383,
+ "learning_rate": 0.0003
+ },
+ {
+ "step": 31,
+ "epoch": 0.21016949152542372,
+ "cpu_mem": 1.787162624,
+ "gpu_mem": 1.066990592,
+ "loss": 0.8017,
+ "grad_norm": 43.43600082397461,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.787359232,
285
+ "gpu_mem": 1.067104256,
286
+ "loss": 0.7734,
287
+ "grad_norm": 28.66684913635254,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.78755584,
294
+ "gpu_mem": 1.067342336,
295
+ "loss": 0.6755,
296
+ "grad_norm": 3.923459529876709,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.787752448,
303
+ "gpu_mem": 1.067044352,
304
+ "loss": 0.5818,
305
+ "grad_norm": 6.199655055999756,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.788342272,
312
+ "gpu_mem": 1.067254784,
313
+ "loss": 0.6999,
314
+ "grad_norm": 15.666038513183594,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.78853888,
321
+ "gpu_mem": 1.067205632,
322
+ "loss": 0.8204,
323
+ "grad_norm": 33.818546295166016,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.788735488,
330
+ "gpu_mem": 1.067016704,
331
+ "loss": 0.8295,
332
+ "grad_norm": 37.759639739990234,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.788932096,
339
+ "gpu_mem": 1.067264,
340
+ "loss": 0.7103,
341
+ "grad_norm": 16.049278259277344,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.791488,
348
+ "gpu_mem": 1.067643392,
349
+ "loss": 0.6793,
350
+ "grad_norm": 5.071969509124756,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.791488,
357
+ "gpu_mem": 1.067213312,
358
+ "loss": 0.7018,
359
+ "grad_norm": 16.534059524536133,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.791684608,
366
+ "gpu_mem": 1.06744064,
367
+ "loss": 0.6888,
368
+ "grad_norm": 5.772024154663086,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.791881216,
375
+ "gpu_mem": 1.067337728,
376
+ "loss": 0.7286,
377
+ "grad_norm": 24.6931095123291,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.792077824,
384
+ "gpu_mem": 1.067159552,
385
+ "loss": 0.6181,
386
+ "grad_norm": 10.050416946411133,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.792274432,
393
+ "gpu_mem": 1.0673024,
394
+ "loss": 0.6505,
395
+ "grad_norm": 15.0296630859375,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.79247104,
402
+ "gpu_mem": 1.067082752,
403
+ "loss": 0.6943,
404
+ "grad_norm": 10.401119232177734,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.792667648,
411
+ "gpu_mem": 1.06732544,
412
+ "loss": 0.7991,
413
+ "grad_norm": 26.349973678588867,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.792864256,
420
+ "gpu_mem": 1.06704896,
421
+ "loss": 0.6991,
422
+ "grad_norm": 18.533720016479492,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.792864256,
429
+ "gpu_mem": 1.06712576,
430
+ "loss": 0.5962,
431
+ "grad_norm": 3.064704656600952,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.793060864,
438
+ "gpu_mem": 1.067142656,
439
+ "loss": 1.081,
440
+ "grad_norm": 37.34111022949219,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.793257472,
447
+ "gpu_mem": 1.067081216,
448
+ "loss": 0.74,
449
+ "grad_norm": 21.937891006469727,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.793257472,
456
+ "gpu_mem": 1.067085824,
457
+ "loss": 0.635,
458
+ "grad_norm": 22.710704803466797,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.79345408,
465
+ "gpu_mem": 1.067165696,
466
+ "loss": 0.7188,
467
+ "grad_norm": 9.722867965698242,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.79345408,
474
+ "gpu_mem": 1.067188736,
475
+ "loss": 0.5927,
476
+ "grad_norm": 7.451268672943115,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.79345408,
483
+ "gpu_mem": 1.067116544,
484
+ "loss": 1.0226,
485
+ "grad_norm": 33.91159439086914,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.793650688,
492
+ "gpu_mem": 1.06738688,
493
+ "loss": 0.753,
494
+ "grad_norm": 21.46257972717285,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.793847296,
501
+ "gpu_mem": 1.067173376,
502
+ "loss": 0.712,
503
+ "grad_norm": 16.427001953125,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.793847296,
510
+ "gpu_mem": 1.067167232,
511
+ "loss": 0.9295,
512
+ "grad_norm": 48.56226348876953,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.794043904,
519
+ "gpu_mem": 1.067062784,
520
+ "loss": 1.1245,
521
+ "grad_norm": 55.51771545410156,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.794043904,
528
+ "gpu_mem": 1.06707968,
529
+ "loss": 0.7755,
530
+ "grad_norm": 22.440706253051758,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.794043904,
537
+ "gpu_mem": 1.067173376,
538
+ "loss": 0.5978,
539
+ "grad_norm": 5.783385276794434,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.794240512,
546
+ "gpu_mem": 1.067184128,
547
+ "loss": 0.7382,
548
+ "grad_norm": 24.120975494384766,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.79443712,
555
+ "gpu_mem": 1.06717184,
556
+ "loss": 1.046,
557
+ "grad_norm": 57.39268112182617,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.79443712,
564
+ "gpu_mem": 1.06716416,
565
+ "loss": 0.6748,
566
+ "grad_norm": 59.949466705322266,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.79443712,
573
+ "gpu_mem": 1.067093504,
574
+ "loss": 0.7373,
575
+ "grad_norm": 67.06160736083984,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.79443712,
582
+ "gpu_mem": 1.067138048,
583
+ "loss": 0.7317,
584
+ "grad_norm": 35.02265548706055,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.794633728,
591
+ "gpu_mem": 1.067331584,
592
+ "loss": 0.6341,
593
+ "grad_norm": 16.198490142822266,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.794633728,
600
+ "gpu_mem": 1.06704128,
601
+ "loss": 0.7393,
602
+ "grad_norm": 65.17927551269531,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.794830336,
609
+ "gpu_mem": 1.067009024,
610
+ "loss": 0.6964,
611
+ "grad_norm": 20.7398738861084,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.795026944,
618
+ "gpu_mem": 1.067075072,
619
+ "loss": 0.6231,
620
+ "grad_norm": 23.524229049682617,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.795026944,
627
+ "gpu_mem": 1.067068928,
628
+ "loss": 0.6883,
629
+ "grad_norm": 19.696847915649414,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.795026944,
636
+ "gpu_mem": 1.067297792,
637
+ "loss": 0.7668,
638
+ "grad_norm": 36.587337493896484,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.795223552,
645
+ "gpu_mem": 1.067290112,
646
+ "loss": 0.6592,
647
+ "grad_norm": 19.84144401550293,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.795223552,
654
+ "gpu_mem": 1.06725632,
655
+ "loss": 0.8185,
656
+ "grad_norm": 26.411832809448242,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.795223552,
663
+ "gpu_mem": 1.067116544,
664
+ "loss": 0.5878,
665
+ "grad_norm": 11.005779266357422,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.795223552,
672
+ "gpu_mem": 1.06704128,
673
+ "loss": 0.6086,
674
+ "grad_norm": 16.758214950561523,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.795223552,
681
+ "gpu_mem": 1.066981376,
682
+ "loss": 0.6045,
683
+ "grad_norm": 12.238960266113281,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.795223552,
690
+ "gpu_mem": 1.067055104,
691
+ "loss": 0.5746,
692
+ "grad_norm": 6.873617172241211,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.79542016,
699
+ "gpu_mem": 1.067107328,
700
+ "loss": 0.9858,
701
+ "grad_norm": 54.217674255371094,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.795612672,
708
+ "gpu_mem": 1.067239424,
709
+ "loss": 0.675,
710
+ "grad_norm": 15.84262466430664,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.795612672,
717
+ "gpu_mem": 1.067130368,
718
+ "loss": 0.6726,
719
+ "grad_norm": 18.819782257080078,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.795612672,
726
+ "gpu_mem": 1.06701056,
727
+ "loss": 0.6121,
728
+ "grad_norm": 12.81640911102295,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.795612672,
735
+ "gpu_mem": 1.06707968,
736
+ "loss": 0.8172,
737
+ "grad_norm": 42.44817352294922,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.795612672,
744
+ "gpu_mem": 1.06717952,
745
+ "loss": 0.7441,
746
+ "grad_norm": 27.580766677856445,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.79580928,
753
+ "gpu_mem": 1.067142656,
754
+ "loss": 0.656,
755
+ "grad_norm": 16.37428092956543,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.79580928,
762
+ "gpu_mem": 1.067174912,
763
+ "loss": 0.6365,
764
+ "grad_norm": 7.864536285400391,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.79580928,
771
+ "gpu_mem": 1.06712576,
772
+ "loss": 0.6734,
773
+ "grad_norm": 7.170403480529785,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.79580928,
780
+ "gpu_mem": 1.06713344,
781
+ "loss": 0.613,
782
+ "grad_norm": 7.2136759757995605,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.79580928,
789
+ "gpu_mem": 1.067277824,
790
+ "loss": 0.7224,
791
+ "grad_norm": 40.812339782714844,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.79580928,
798
+ "gpu_mem": 1.067059712,
799
+ "loss": 0.6609,
800
+ "grad_norm": 12.928619384765625,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.796005888,
807
+ "gpu_mem": 1.067113472,
808
+ "loss": 0.6629,
809
+ "grad_norm": 14.732848167419434,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.796005888,
816
+ "gpu_mem": 1.067081216,
817
+ "loss": 0.6226,
818
+ "grad_norm": 8.494933128356934,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.796005888,
825
+ "gpu_mem": 1.067162624,
826
+ "loss": 0.693,
827
+ "grad_norm": 9.185640335083008,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.796005888,
834
+ "gpu_mem": 1.066966016,
835
+ "loss": 0.8204,
836
+ "grad_norm": 30.638669967651367,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.796005888,
843
+ "gpu_mem": 1.06707968,
844
+ "loss": 0.9576,
845
+ "grad_norm": 56.41335678100586,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.796005888,
852
+ "gpu_mem": 1.067099648,
853
+ "loss": 0.6816,
854
+ "grad_norm": 17.54722785949707,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.796005888,
861
+ "gpu_mem": 1.067138048,
862
+ "loss": 0.8321,
863
+ "grad_norm": 61.89482498168945,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.796005888,
870
+ "gpu_mem": 1.067122688,
871
+ "loss": 0.9806,
872
+ "grad_norm": 64.5597915649414,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.796005888,
879
+ "gpu_mem": 1.067035136,
880
+ "loss": 0.6897,
881
+ "grad_norm": 16.576112747192383,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.796005888,
888
+ "gpu_mem": 1.066984448,
889
+ "loss": 0.6967,
890
+ "grad_norm": 21.121322631835938,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.796005888,
897
+ "gpu_mem": 1.067101184,
898
+ "loss": 0.6994,
899
+ "grad_norm": 21.069868087768555,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.796005888,
906
+ "gpu_mem": 1.06713344,
907
+ "loss": 1.2423,
908
+ "grad_norm": 117.06101989746094,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.796005888,
915
+ "gpu_mem": 1.067167232,
916
+ "loss": 0.8385,
917
+ "grad_norm": 38.71371841430664,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.796005888,
924
+ "gpu_mem": 1.06721792,
925
+ "loss": 0.6489,
926
+ "grad_norm": 5.8236260414123535,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.796005888,
933
+ "gpu_mem": 1.067122688,
934
+ "loss": 0.7339,
935
+ "grad_norm": 66.14034271240234,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.796005888,
942
+ "gpu_mem": 1.067224064,
943
+ "loss": 0.6994,
944
+ "grad_norm": 16.291786193847656,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.796005888,
951
+ "gpu_mem": 1.067174912,
952
+ "loss": 0.6202,
953
+ "grad_norm": 4.222020149230957,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.796202496,
960
+ "gpu_mem": 1.067062784,
961
+ "loss": 0.6355,
962
+ "grad_norm": 7.902516841888428,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.796202496,
969
+ "gpu_mem": 1.067247104,
970
+ "loss": 0.6792,
971
+ "grad_norm": 31.659059524536133,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.79613696,
978
+ "gpu_mem": 1.067101184,
979
+ "loss": 0.6705,
980
+ "grad_norm": 4.247161388397217,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.79613696,
987
+ "gpu_mem": 1.067104256,
988
+ "loss": 0.6483,
989
+ "grad_norm": 4.0431647300720215,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.79613696,
996
+ "gpu_mem": 1.067073536,
997
+ "loss": 0.6156,
998
+ "grad_norm": 4.198848247528076,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.79613696,
1005
+ "gpu_mem": 1.067119616,
1006
+ "loss": 0.6729,
1007
+ "grad_norm": 21.031217575073242,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.796333568,
1014
+ "gpu_mem": 1.0671104,
1015
+ "loss": 0.6631,
1016
+ "grad_norm": 6.252691268920898,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.796333568,
1023
+ "gpu_mem": 1.067091968,
1024
+ "loss": 0.6223,
1025
+ "grad_norm": 11.559258460998535,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.79625984,
1032
+ "gpu_mem": 1.067167232,
1033
+ "loss": 0.774,
1034
+ "grad_norm": 35.43349838256836,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.79625984,
1041
+ "gpu_mem": 1.06708736,
1042
+ "loss": 0.6297,
1043
+ "grad_norm": 10.36580753326416,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.79625984,
1050
+ "gpu_mem": 1.066978304,
1051
+ "loss": 0.6969,
1052
+ "grad_norm": 17.803829193115234,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.796456448,
1059
+ "gpu_mem": 1.067216384,
1060
+ "loss": 0.6875,
1061
+ "grad_norm": 14.91399097442627,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.796456448,
1068
+ "gpu_mem": 1.06738688,
1069
+ "loss": 0.6005,
1070
+ "grad_norm": 4.876289367675781,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.796653056,
1077
+ "gpu_mem": 1.067119616,
1078
+ "loss": 0.6439,
1079
+ "grad_norm": 11.509865760803223,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.796653056,
1086
+ "gpu_mem": 1.067147264,
1087
+ "loss": 0.6838,
1088
+ "grad_norm": 96.23526763916016,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.796653056,
1095
+ "gpu_mem": 1.067197952,
1096
+ "loss": 0.5861,
1097
+ "grad_norm": 16.30876350402832,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.796849664,
1104
+ "gpu_mem": 1.067007488,
1105
+ "loss": 0.6853,
1106
+ "grad_norm": 15.60392951965332,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.797046272,
1113
+ "gpu_mem": 1.067449856,
1114
+ "loss": 0.6229,
1115
+ "grad_norm": 10.207924842834473,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.797042176,
1122
+ "gpu_mem": 1.067176448,
1123
+ "loss": 0.5491,
1124
+ "grad_norm": 7.521026611328125,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.797042176,
1131
+ "gpu_mem": 1.067059712,
1132
+ "loss": 0.5683,
1133
+ "grad_norm": 7.79779052734375,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.797042176,
1140
+ "gpu_mem": 1.067499008,
1141
+ "loss": 0.6645,
1142
+ "grad_norm": 18.44670295715332,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.797238784,
1149
+ "gpu_mem": 1.067274752,
1150
+ "loss": 0.6625,
1151
+ "grad_norm": 28.01342010498047,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.797013504,
1158
+ "gpu_mem": 1.067314688,
1159
+ "loss": 0.7477,
1160
+ "grad_norm": 11.30091381072998,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.797013504,
1167
+ "gpu_mem": 1.067096576,
1168
+ "loss": 0.6573,
1169
+ "grad_norm": 9.1353120803833,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.797013504,
1176
+ "gpu_mem": 1.0672256,
1177
+ "loss": 0.6643,
1178
+ "grad_norm": 9.245180130004883,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.797013504,
1185
+ "gpu_mem": 1.067307008,
1186
+ "loss": 0.6864,
1187
+ "grad_norm": 20.19770050048828,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.797210112,
1194
+ "gpu_mem": 1.067090432,
1195
+ "loss": 0.661,
1196
+ "grad_norm": 230.81329345703125,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.797210112,
1203
+ "gpu_mem": 1.067224064,
1204
+ "loss": 0.578,
1205
+ "grad_norm": 14.924580574035645,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.797107712,
1212
+ "gpu_mem": 1.067247104,
1213
+ "loss": 0.6823,
1214
+ "grad_norm": 19.97910499572754,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.797107712,
1221
+ "gpu_mem": 1.067084288,
1222
+ "loss": 0.6087,
1223
+ "grad_norm": 9.597871780395508,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.797107712,
1230
+ "gpu_mem": 1.06696448,
1231
+ "loss": 0.6557,
1232
+ "grad_norm": 5.5722551345825195,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.796898816,
1239
+ "gpu_mem": 1.067145728,
1240
+ "loss": 0.655,
1241
+ "grad_norm": 8.457000732421875,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.79689472,
1248
+ "gpu_mem": 1.067044352,
1249
+ "loss": 0.655,
1250
+ "grad_norm": 6.6021409034729,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.797050368,
1257
+ "gpu_mem": 1.067096576,
1258
+ "loss": 0.636,
1259
+ "grad_norm": 4.045039176940918,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.796988928,
1266
+ "gpu_mem": 1.067128832,
1267
+ "loss": 0.6422,
1268
+ "grad_norm": 6.502439975738525,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.797185536,
1275
+ "gpu_mem": 1.067268608,
1276
+ "loss": 0.5553,
1277
+ "grad_norm": 7.805881023406982,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.797382144,
1284
+ "gpu_mem": 1.067251712,
1285
+ "loss": 0.6842,
1286
+ "grad_norm": 15.737360000610352,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.797382144,
1293
+ "gpu_mem": 1.067443712,
1294
+ "loss": 0.6948,
1295
+ "grad_norm": 17.525169372558594,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.797578752,
1302
+ "gpu_mem": 1.067154944,
1303
+ "loss": 0.5722,
1304
+ "grad_norm": 7.157924175262451,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.797578752,
1311
+ "gpu_mem": 1.067190272,
1312
+ "loss": 0.5911,
1313
+ "grad_norm": 7.200604438781738,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.797373952,
1320
+ "gpu_mem": 1.067088896,
1321
+ "loss": 0.6567,
1322
+ "grad_norm": 19.34654998779297,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.79757056,
1329
+ "gpu_mem": 1.072502272,
1330
+ "loss": 0.9631,
1331
+ "grad_norm": 10.374675750732422,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.79746816,
1338
+ "gpu_mem": 1.07243776,
1339
+ "loss": 0.6399,
1340
+ "grad_norm": 9.10370922088623,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.797419008,
1347
+ "gpu_mem": 1.072274944,
1348
+ "loss": 0.6068,
1349
+ "grad_norm": 8.218132972717285,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.797615616,
1356
+ "gpu_mem": 1.072347136,
1357
+ "loss": 0.7122,
1358
+ "grad_norm": 16.563411712646484,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.797615616,
1365
+ "gpu_mem": 1.072382464,
1366
+ "loss": 0.5872,
1367
+ "grad_norm": 7.684173107147217,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.797615616,
1374
+ "gpu_mem": 1.07240704,
1375
+ "loss": 0.7744,
1376
+ "grad_norm": 34.04887008666992,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.797615616,
1383
+ "gpu_mem": 1.07236864,
1384
+ "loss": 0.6853,
1385
+ "grad_norm": 25.140567779541016,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.797615616,
1392
+ "gpu_mem": 1.072589824,
1393
+ "loss": 0.6239,
1394
+ "grad_norm": 11.781804084777832,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.797615616,
1401
+ "gpu_mem": 1.072497664,
1402
+ "loss": 0.6295,
1403
+ "grad_norm": 8.846559524536133,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.797615616,
1410
+ "gpu_mem": 1.072403968,
1411
+ "loss": 0.7732,
1412
+ "grad_norm": 28.42833137512207,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.797615616,
1419
+ "gpu_mem": 1.07233024,
1420
+ "loss": 0.7523,
1421
+ "grad_norm": 22.629343032836914,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.797615616,
1428
+ "gpu_mem": 1.072678912,
1429
+ "loss": 0.5702,
1430
+ "grad_norm": 6.326220989227295,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.797582848,
1437
+ "gpu_mem": 1.072273408,
1438
+ "loss": 0.6681,
1439
+ "grad_norm": 9.057672500610352,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.797582848,
1446
+ "gpu_mem": 1.072219648,
1447
+ "loss": 0.5916,
1448
+ "grad_norm": 15.630982398986816,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.797545984,
1455
+ "gpu_mem": 1.072995328,
1456
+ "loss": 0.6475,
1457
+ "grad_norm": 9.621928215026855,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.797742592,
1464
+ "gpu_mem": 1.072471552,
1465
+ "loss": 0.6628,
1466
+ "grad_norm": 10.567188262939453,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.797742592,
1473
+ "gpu_mem": 1.072384,
1474
+ "loss": 0.6579,
1475
+ "grad_norm": 7.776843547821045,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.797742592,
1482
+ "gpu_mem": 1.072333312,
1483
+ "loss": 0.6304,
1484
+ "grad_norm": 6.893831253051758,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.797742592,
1491
+ "gpu_mem": 1.072428544,
1492
+ "loss": 0.653,
1493
+ "grad_norm": 17.217058181762695,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.797742592,
1500
+ "gpu_mem": 1.0723456,
1501
+ "loss": 0.6726,
1502
+ "grad_norm": 10.91408634185791,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.797742592,
1509
+ "gpu_mem": 1.072364032,
1510
+ "loss": 0.6239,
1511
+ "grad_norm": 6.429533958435059,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.797742592,
1518
+ "gpu_mem": 1.072451584,
1519
+ "loss": 0.588,
1520
+ "grad_norm": 6.233065605163574,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.797742592,
1527
+ "gpu_mem": 1.072336384,
1528
+ "loss": 0.612,
1529
+ "grad_norm": 11.369017601013184,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.797742592,
1536
+ "gpu_mem": 1.07239936,
1537
+ "loss": 0.6753,
1538
+ "grad_norm": 9.935527801513672,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.7979392,
1545
+ "gpu_mem": 1.0723072,
1546
+ "loss": 0.6529,
1547
+ "grad_norm": 6.226833820343018,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.7979392,
1554
+ "gpu_mem": 1.07260672,
1555
+ "loss": 0.6305,
1556
+ "grad_norm": 13.911097526550293,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.7979392,
1563
+ "gpu_mem": 1.07233024,
1564
+ "loss": 0.6629,
1565
+ "grad_norm": 10.934565544128418,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.7979392,
1572
+ "gpu_mem": 1.072296448,
1573
+ "loss": 0.6504,
1574
+ "grad_norm": 15.064614295959473,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.7979392,
1581
+ "gpu_mem": 1.072434688,
1582
+ "loss": 0.591,
1583
+ "grad_norm": 13.485958099365234,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.7979392,
1590
+ "gpu_mem": 1.072532992,
1591
+ "loss": 0.5742,
1592
+ "grad_norm": 6.3795294761657715,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.7979392,
1599
+ "gpu_mem": 1.072279552,
1600
+ "loss": 0.6229,
1601
+ "grad_norm": 10.318319320678711,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.7979392,
1608
+ "gpu_mem": 1.072379392,
1609
+ "loss": 0.5779,
1610
+ "grad_norm": 6.194882869720459,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.7979392,
1617
+ "gpu_mem": 1.072351744,
1618
+ "loss": 0.6831,
1619
+ "grad_norm": 13.943729400634766,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.7979392,
1626
+ "gpu_mem": 1.072288768,
1627
+ "loss": 0.607,
1628
+ "grad_norm": 17.86134910583496,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.7979392,
1635
+ "gpu_mem": 1.07250688,
1636
+ "loss": 0.6728,
1637
+ "grad_norm": 13.313254356384277,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.7979392,
1644
+ "gpu_mem": 1.072403968,
1645
+ "loss": 0.7095,
1646
+ "grad_norm": 16.932268142700195,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.7979392,
1653
+ "gpu_mem": 1.072351744,
1654
+ "loss": 0.6871,
1655
+ "grad_norm": 22.442790985107422,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.7979392,
1662
+ "gpu_mem": 1.07233024,
1663
+ "loss": 0.635,
1664
+ "grad_norm": 17.37511444091797,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.7979392,
1671
+ "gpu_mem": 1.072339456,
1672
+ "loss": 0.5487,
1673
+ "grad_norm": 6.469228267669678,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.7979392,
1680
+ "gpu_mem": 1.072271872,
1681
+ "loss": 0.6375,
1682
+ "grad_norm": 8.593610763549805,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.7979392,
1689
+ "gpu_mem": 1.072434688,
1690
+ "loss": 0.658,
1691
+ "grad_norm": 12.747215270996094,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.7979392,
1698
+ "gpu_mem": 1.072304128,
1699
+ "loss": 0.63,
1700
+ "grad_norm": 8.740394592285156,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.7979392,
1707
+ "gpu_mem": 1.072423936,
1708
+ "loss": 0.5315,
1709
+ "grad_norm": 5.967763900756836,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.7979392,
1716
+ "gpu_mem": 1.072242688,
1717
+ "loss": 0.5519,
1718
+ "grad_norm": 7.465148448944092,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.7979392,
1725
+ "gpu_mem": 1.072374784,
1726
+ "loss": 0.5694,
1727
+ "grad_norm": 17.18702507019043,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.7979392,
1734
+ "gpu_mem": 1.072348672,
1735
+ "loss": 0.606,
1736
+ "grad_norm": 7.86152982711792,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.7979392,
1743
+ "gpu_mem": 1.07231488,
1744
+ "loss": 0.6232,
1745
+ "grad_norm": 6.897183418273926,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.7979392,
1752
+ "gpu_mem": 1.072419328,
1753
+ "loss": 0.5539,
1754
+ "grad_norm": 8.286925315856934,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.7979392,
1761
+ "gpu_mem": 1.07241472,
1762
+ "loss": 0.5208,
1763
+ "grad_norm": 7.31152868270874,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.7979392,
1770
+ "gpu_mem": 1.072273408,
1771
+ "loss": 0.5487,
1772
+ "grad_norm": 8.306645393371582,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.7979392,
1779
+ "gpu_mem": 1.072465408,
1780
+ "loss": 0.5854,
1781
+ "grad_norm": 8.069097518920898,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.7979392,
1788
+ "gpu_mem": 1.072316416,
1789
+ "loss": 0.6259,
1790
+ "grad_norm": 16.867874145507812,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.7979392,
1797
+ "gpu_mem": 1.072419328,
1798
+ "loss": 0.55,
1799
+ "grad_norm": 8.23694896697998,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.7979392,
1806
+ "gpu_mem": 1.07262208,
1807
+ "loss": 0.5431,
1808
+ "grad_norm": 12.741939544677734,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.7979392,
1815
+ "gpu_mem": 1.072423936,
1816
+ "loss": 0.6053,
1817
+ "grad_norm": 9.624649047851562,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.7979392,
1824
+ "gpu_mem": 1.072310272,
1825
+ "loss": 0.4861,
1826
+ "grad_norm": 11.901386260986328,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.7979392,
1833
+ "gpu_mem": 1.072324096,
1834
+ "loss": 0.6024,
1835
+ "grad_norm": 7.298447132110596,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.7979392,
1842
+ "gpu_mem": 1.07236864,
1843
+ "loss": 0.6356,
1844
+ "grad_norm": 11.019448280334473,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.7979392,
1851
+ "gpu_mem": 1.072310272,
1852
+ "loss": 0.5197,
1853
+ "grad_norm": 6.436569690704346,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.7979392,
1860
+ "gpu_mem": 1.072543744,
1861
+ "loss": 0.5731,
1862
+ "grad_norm": 9.095263481140137,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.7979392,
1869
+ "gpu_mem": 1.072574464,
1870
+ "loss": 0.5065,
1871
+ "grad_norm": 11.13254165649414,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.7979392,
1878
+ "gpu_mem": 1.072500736,
1879
+ "loss": 0.6019,
1880
+ "grad_norm": 9.443418502807617,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.7979392,
1887
+ "gpu_mem": 1.072388608,
1888
+ "loss": 0.5776,
1889
+ "grad_norm": 6.862228870391846,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.7979392,
1896
+ "gpu_mem": 1.07233792,
1897
+ "loss": 0.547,
1898
+ "grad_norm": 8.054023742675781,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.7979392,
1905
+ "gpu_mem": 1.0723072,
1906
+ "loss": 0.5822,
1907
+ "grad_norm": 8.296786308288574,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.7979392,
1914
+ "gpu_mem": 1.07233024,
1915
+ "loss": 0.6606,
1916
+ "grad_norm": 8.03130054473877,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.7979392,
1923
+ "gpu_mem": 1.072413184,
1924
+ "loss": 0.607,
1925
+ "grad_norm": 7.151822090148926,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.7979392,
1932
+ "gpu_mem": 1.072340992,
1933
+ "loss": 0.5881,
1934
+ "grad_norm": 10.393117904663086,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.7979392,
1941
+ "gpu_mem": 1.07250688,
1942
+ "loss": 0.5719,
1943
+ "grad_norm": 8.044678688049316,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.7979392,
1950
+ "gpu_mem": 1.072348672,
1951
+ "loss": 0.579,
1952
+ "grad_norm": 8.284494400024414,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.7979392,
1959
+ "gpu_mem": 1.072325632,
1960
+ "loss": 0.5693,
1961
+ "grad_norm": 9.51559829711914,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.7979392,
1968
+ "gpu_mem": 1.072451584,
1969
+ "loss": 0.5657,
1970
+ "grad_norm": 9.279644966125488,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.7979392,
1977
+ "gpu_mem": 1.072485376,
1978
+ "loss": 0.5521,
1979
+ "grad_norm": 7.055351257324219,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.7979392,
1986
+ "gpu_mem": 1.07235328,
1987
+ "loss": 0.6077,
1988
+ "grad_norm": 8.214317321777344,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.7979392,
1995
+ "gpu_mem": 1.072489984,
1996
+ "loss": 0.5135,
1997
+ "grad_norm": 6.798290252685547,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.7979392,
2004
+ "gpu_mem": 1.072403968,
2005
+ "loss": 0.5714,
2006
+ "grad_norm": 8.6298189163208,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.7979392,
2013
+ "gpu_mem": 1.07236864,
2014
+ "loss": 0.6006,
2015
+ "grad_norm": 7.770857810974121,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.7979392,
2022
+ "gpu_mem": 1.072333312,
2023
+ "loss": 0.6025,
2024
+ "grad_norm": 9.466155052185059,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.7979392,
2031
+ "gpu_mem": 1.072482304,
2032
+ "loss": 0.4778,
2033
+ "grad_norm": 6.76349401473999,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.7979392,
2040
+ "gpu_mem": 1.072371712,
2041
+ "loss": 0.5687,
2042
+ "grad_norm": 8.24431324005127,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.7979392,
2049
+ "gpu_mem": 1.072316416,
2050
+ "loss": 0.6535,
2051
+ "grad_norm": 10.470850944519043,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.7979392,
2058
+ "gpu_mem": 1.072258048,
2059
+ "loss": 0.5761,
2060
+ "grad_norm": 8.238707542419434,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.7979392,
2067
+ "gpu_mem": 1.07231488,
2068
+ "loss": 0.603,
2069
+ "grad_norm": 8.463348388671875,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.7979392,
2076
+ "gpu_mem": 1.072592896,
2077
+ "loss": 0.5968,
2078
+ "grad_norm": 6.370932579040527,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.7979392,
2085
+ "gpu_mem": 1.072316416,
2086
+ "loss": 0.6799,
2087
+ "grad_norm": 10.808438301086426,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.7979392,
2094
+ "gpu_mem": 1.07262976,
2095
+ "loss": 0.5498,
2096
+ "grad_norm": 9.736083030700684,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.7979392,
2103
+ "gpu_mem": 1.072505344,
2104
+ "loss": 0.544,
2105
+ "grad_norm": 10.119654655456543,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.7979392,
2112
+ "gpu_mem": 1.07226112,
2113
+ "loss": 0.5947,
2114
+ "grad_norm": 18.05272674560547,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.7979392,
2121
+ "gpu_mem": 1.072321024,
2122
+ "loss": 0.5395,
2123
+ "grad_norm": 8.5020170211792,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.7979392,
2130
+ "gpu_mem": 1.072382464,
2131
+ "loss": 0.5431,
2132
+ "grad_norm": 11.324089050292969,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.7979392,
2139
+ "gpu_mem": 1.072384,
2140
+ "loss": 0.5979,
2141
+ "grad_norm": 9.914655685424805,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.7979392,
2148
+ "gpu_mem": 1.07263744,
2149
+ "loss": 0.6107,
2150
+ "grad_norm": 10.563992500305176,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.7979392,
2157
+ "gpu_mem": 1.072287232,
2158
+ "loss": 0.6307,
2159
+ "grad_norm": 9.804160118103027,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.7979392,
2166
+ "gpu_mem": 1.07258368,
2167
+ "loss": 0.5666,
2168
+ "grad_norm": 8.388216972351074,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.7979392,
2175
+ "gpu_mem": 1.07244544,
2176
+ "loss": 0.5815,
2177
+ "grad_norm": 9.455634117126465,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.7979392,
2184
+ "gpu_mem": 1.072297984,
2185
+ "loss": 0.7665,
2186
+ "grad_norm": 16.60271644592285,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.7979392,
2193
+ "gpu_mem": 1.07243776,
2194
+ "loss": 0.5376,
2195
+ "grad_norm": 7.040289878845215,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.7979392,
2202
+ "gpu_mem": 1.072316416,
2203
+ "loss": 0.588,
2204
+ "grad_norm": 9.260379791259766,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.7979392,
2211
+ "gpu_mem": 1.072410112,
2212
+ "loss": 0.5779,
2213
+ "grad_norm": 9.03203010559082,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.7979392,
2220
+ "gpu_mem": 1.072428544,
2221
+ "loss": 0.5881,
2222
+ "grad_norm": 8.537538528442383,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.7979392,
2229
+ "gpu_mem": 1.072380928,
2230
+ "loss": 0.5312,
2231
+ "grad_norm": 5.940595626831055,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.7979392,
2238
+ "gpu_mem": 1.072287232,
2239
+ "loss": 0.5151,
2240
+ "grad_norm": 9.854302406311035,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.7979392,
2247
+ "gpu_mem": 1.072379392,
2248
+ "loss": 0.5526,
2249
+ "grad_norm": 12.617406845092773,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.7979392,
2256
+ "gpu_mem": 1.07229184,
2257
+ "loss": 0.5842,
2258
+ "grad_norm": 7.597400665283203,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.7979392,
2265
+ "gpu_mem": 1.072334848,
2266
+ "loss": 0.5797,
2267
+ "grad_norm": 7.297758102416992,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.7979392,
2274
+ "gpu_mem": 1.072528384,
2275
+ "loss": 0.5631,
2276
+ "grad_norm": 8.775803565979004,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.7979392,
2283
+ "gpu_mem": 1.072324096,
2284
+ "loss": 0.6357,
2285
+ "grad_norm": 11.823816299438477,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.7979392,
2292
+ "gpu_mem": 1.072480768,
2293
+ "loss": 0.5455,
2294
+ "grad_norm": 9.827958106994629,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.797906432,
2301
+ "gpu_mem": 1.0723072,
2302
+ "loss": 0.5356,
2303
+ "grad_norm": 8.764373779296875,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.79810304,
2310
+ "gpu_mem": 1.072720384,
2311
+ "loss": 0.6121,
2312
+ "grad_norm": 15.294634819030762,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.79810304,
2319
+ "gpu_mem": 1.072379392,
2320
+ "loss": 0.5652,
2321
+ "grad_norm": 12.042337417602539,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.79810304,
2328
+ "gpu_mem": 1.072296448,
2329
+ "loss": 0.6021,
2330
+ "grad_norm": 8.471673011779785,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.79810304,
2337
+ "gpu_mem": 1.072413184,
2338
+ "loss": 0.4417,
2339
+ "grad_norm": 7.8211774826049805,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.79810304,
2346
+ "gpu_mem": 1.07236864,
2347
+ "loss": 0.4433,
2348
+ "grad_norm": 7.459161758422852,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.79810304,
2355
+ "gpu_mem": 1.072325632,
2356
+ "loss": 0.4973,
2357
+ "grad_norm": 8.854447364807129,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.79810304,
2364
+ "gpu_mem": 1.072362496,
2365
+ "loss": 0.5719,
2366
+ "grad_norm": 7.8189520835876465,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.79810304,
2373
+ "gpu_mem": 1.072450048,
2374
+ "loss": 0.5331,
2375
+ "grad_norm": 7.528275489807129,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.79810304,
2382
+ "gpu_mem": 1.07236864,
2383
+ "loss": 0.5941,
2384
+ "grad_norm": 10.641229629516602,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.79810304,
2391
+ "gpu_mem": 1.07258368,
2392
+ "loss": 0.591,
2393
+ "grad_norm": 8.315288543701172,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.79810304,
2400
+ "gpu_mem": 1.07237632,
2401
+ "loss": 0.4775,
2402
+ "grad_norm": 11.404972076416016,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.79810304,
2409
+ "gpu_mem": 1.072380928,
2410
+ "loss": 0.5234,
2411
+ "grad_norm": 10.700228691101074,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.79810304,
2418
+ "gpu_mem": 1.07239168,
2419
+ "loss": 0.5836,
2420
+ "grad_norm": 13.900726318359375,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.79810304,
2427
+ "gpu_mem": 1.07243008,
2428
+ "loss": 0.6022,
2429
+ "grad_norm": 10.767265319824219,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.799479296,
2436
+ "gpu_mem": 1.072482304,
2437
+ "loss": 0.5524,
2438
+ "grad_norm": 10.918460845947266,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.801641984,
2445
+ "gpu_mem": 1.072340992,
2446
+ "loss": 0.6307,
2447
+ "grad_norm": 11.49914836883545,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.801641984,
2454
+ "gpu_mem": 1.072221184,
2455
+ "loss": 0.5899,
2456
+ "grad_norm": 10.842668533325195,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.801641984,
2463
+ "gpu_mem": 1.072448512,
2464
+ "loss": 0.6161,
2465
+ "grad_norm": 10.53109359741211,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.801641984,
2472
+ "gpu_mem": 1.072692736,
2473
+ "loss": 0.5369,
2474
+ "grad_norm": 11.559465408325195,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.801641984,
2481
+ "gpu_mem": 1.07235328,
2482
+ "loss": 0.5555,
2483
+ "grad_norm": 7.54006290435791,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.801641984,
2490
+ "gpu_mem": 1.07229952,
2491
+ "loss": 0.6145,
2492
+ "grad_norm": 8.808974266052246,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.801641984,
2499
+ "gpu_mem": 1.072462336,
2500
+ "loss": 0.5865,
2501
+ "grad_norm": 8.585607528686523,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.802428416,
2508
+ "gpu_mem": 1.072402432,
2509
+ "loss": 0.5963,
2510
+ "grad_norm": 8.654258728027344,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.802428416,
2517
+ "gpu_mem": 1.072382464,
2518
+ "loss": 0.5161,
2519
+ "grad_norm": 6.762901306152344,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.802428416,
2526
+ "gpu_mem": 1.072317952,
2527
+ "loss": 0.6325,
2528
+ "grad_norm": 9.365017890930176,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.804529664,
2535
+ "gpu_mem": 1.072746496,
2536
+ "loss": 0.5636,
2537
+ "grad_norm": 11.185689926147461,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.80492288,
2544
+ "gpu_mem": 1.07245312,
2545
+ "loss": 0.6543,
2546
+ "grad_norm": 10.285652160644531,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.80492288,
2553
+ "gpu_mem": 1.072308736,
2554
+ "loss": 0.5674,
2555
+ "grad_norm": 8.522608757019043,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.80492288,
2562
+ "gpu_mem": 1.072362496,
2563
+ "loss": 0.6468,
2564
+ "grad_norm": 11.664175987243652,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.8049024,
2571
+ "gpu_mem": 1.072780288,
2572
+ "loss": 0.5655,
2573
+ "grad_norm": 10.680014610290527,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.805099008,
2580
+ "gpu_mem": 1.072549888,
2581
+ "loss": 0.5575,
2582
+ "grad_norm": 10.169900894165039,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.804886016,
2589
+ "gpu_mem": 1.072334848,
2590
+ "loss": 0.5905,
2591
+ "grad_norm": 9.122550010681152,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.805082624,
2598
+ "gpu_mem": 1.072419328,
2599
+ "loss": 0.5947,
2600
+ "grad_norm": 10.595670700073242,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.808031744,
2607
+ "gpu_mem": 1.072344064,
2608
+ "loss": 0.6375,
2609
+ "grad_norm": 9.97242259979248,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.808818176,
2616
+ "gpu_mem": 1.072379392,
2617
+ "loss": 0.6539,
2618
+ "grad_norm": 9.128472328186035,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.813532672,
2625
+ "gpu_mem": 1.072462336,
2626
+ "loss": 0.5639,
2627
+ "grad_norm": 8.636130332946777,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.815302144,
2634
+ "gpu_mem": 1.072379392,
2635
+ "loss": 0.6293,
2636
+ "grad_norm": 9.215025901794434,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.821396992,
2643
+ "gpu_mem": 1.072405504,
2644
+ "loss": 0.6499,
2645
+ "grad_norm": 9.986368179321289,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.821396992,
2652
+ "gpu_mem": 1.072405504,
2653
+ "train_runtime": 4463.1665,
2654
+ "train_samples_per_second": 4.224,
2655
+ "train_steps_per_second": 0.066,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.7742782291911897
2658
+ }
2659
+ ]
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 4,
25
+ "r": 8,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 8,
29
+ "target_modules": [
30
+ "o_proj",
31
+ "k_proj",
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "up_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null
39
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.813483369846644
4
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 5233536
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-hellaswag-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-hellaswag-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T01:39:19.750745"
38
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 4,
25
+ "r": 2,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 2,
29
+ "target_modules": [
30
+ "o_proj",
31
+ "k_proj",
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "up_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null
39
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "logiqa",
3
+ "results": 0.27101838463127453
4
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "LOGIQA",
5
+ "dataset_id": "data/logiqa_train",
6
+ "preprocess_id": "logiqa_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1307064
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 3,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-logiqa-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-logiqa-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-29T15:14:52.117394"
38
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r2-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 4,
25
+ "r": 8,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 8,
29
+ "target_modules": [
30
+ "o_proj",
31
+ "k_proj",
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "up_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null
39
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "logiqa",
3
+ "results": 0.40921297252633754
4
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "LOGIQA",
5
+ "dataset_id": "data/logiqa_train",
6
+ "preprocess_id": "logiqa_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 5233536
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 3,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-logiqa-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-logiqa-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-29T22:21:48.612520"
38
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-logiqa-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 4,
25
+ "r": 32,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 32,
29
+ "target_modules": [
30
+ "o_proj",
31
+ "k_proj",
32
+ "down_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "up_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null
39
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "winogrande",
3
+ "results": 0.516179952644041
4
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "WINOGRANDE",
5
+ "dataset_id": "allenai/winogrande",
6
+ "preprocess_id": "winogrande_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 21018624
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-winogrande-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-winogrande-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T08:10:27.000811"
38
+ }
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r32-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q4/TinyLlama_v1.1-mars-winogrande-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "WINOGRANDE",
5
+ "dataset_id": "allenai/winogrande",
6
+ "preprocess_id": "winogrande_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 5233536
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-winogrande-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q4-new/TinyLlama_v1.1-mars-winogrande-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T01:04:10.381088"
38
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 0,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 2,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 2,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "o_proj",
32
+ "q_proj",
33
+ "gate_proj",
34
+ "up_proj",
35
+ "v_proj",
36
+ "k_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": true
40
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.6322525597269625
4
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "qmars",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1307064
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-qmars-arc_c-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T08:46:32.040412"
38
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r2-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 3.020107776,
6
+ "gpu_mem": 1.058196992,
7
+ "loss": 4.8349,
8
+ "grad_norm": 360.26629638671875,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 3.025219584,
15
+ "gpu_mem": 1.068685312,
16
+ "loss": 5.1818,
17
+ "grad_norm": 364.10589599609375,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 3.025416192,
24
+ "gpu_mem": 1.068716032,
25
+ "loss": 1.9128,
26
+ "grad_norm": 82.8514175415039,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 3.0256128,
33
+ "gpu_mem": 1.06868224,
34
+ "loss": 1.5444,
35
+ "grad_norm": 44.257389068603516,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 3.0256128,
42
+ "gpu_mem": 1.068669952,
43
+ "loss": 1.4274,
44
+ "grad_norm": 38.661521911621094,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 3.0256128,
51
+ "gpu_mem": 1.068732928,
52
+ "loss": 1.4406,
53
+ "grad_norm": 26.17365837097168,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 3.0256128,
60
+ "gpu_mem": 1.068739072,
61
+ "loss": 1.522,
62
+ "grad_norm": 31.642549514770508,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 3.0256128,
69
+ "gpu_mem": 1.0686976,
70
+ "loss": 1.5263,
71
+ "grad_norm": 30.574464797973633,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 3.0256128,
78
+ "gpu_mem": 1.068692992,
79
+ "loss": 1.6581,
80
+ "grad_norm": 56.67964553833008,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 3.0256128,
87
+ "gpu_mem": 1.06868224,
88
+ "loss": 1.4143,
89
+ "grad_norm": 13.563467979431152,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 3.0256128,
96
+ "gpu_mem": 1.068692992,
97
+ "loss": 1.372,
98
+ "grad_norm": 8.824348449707031,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 3.0256128,
105
+ "gpu_mem": 1.068717568,
106
+ "loss": 1.4953,
107
+ "grad_norm": 19.334678649902344,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 3.0256128,
114
+ "gpu_mem": 1.068717568,
115
+ "loss": 1.4141,
116
+ "grad_norm": 17.438413619995117,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 3.0256128,
123
+ "gpu_mem": 1.068665344,
124
+ "loss": 1.4638,
125
+ "grad_norm": 11.587946891784668,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 3.0256128,
132
+ "gpu_mem": 1.068740608,
133
+ "loss": 1.4139,
134
+ "grad_norm": 7.782718181610107,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 3.0256128,
141
+ "gpu_mem": 1.068734464,
142
+ "loss": 1.4766,
143
+ "grad_norm": 12.382817268371582,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 3.0256128,
150
+ "gpu_mem": 1.068739072,
151
+ "loss": 1.4863,
152
+ "grad_norm": 10.532707214355469,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 3.0256128,
159
+ "gpu_mem": 1.073945088,
160
+ "loss": 2.0762,
161
+ "grad_norm": 13.852774620056152,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 3.0256128,
168
+ "gpu_mem": 1.073943552,
169
+ "loss": 1.39,
170
+ "grad_norm": 9.768491744995117,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 3.0256128,
177
+ "gpu_mem": 1.073918976,
178
+ "loss": 1.3337,
179
+ "grad_norm": 11.261113166809082,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 3.0256128,
186
+ "gpu_mem": 1.073926656,
187
+ "loss": 1.3633,
188
+ "grad_norm": 10.808806419372559,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 3.0256128,
195
+ "gpu_mem": 1.07395584,
196
+ "loss": 1.3556,
197
+ "grad_norm": 14.811458587646484,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 3.0256128,
204
+ "gpu_mem": 1.073985024,
205
+ "loss": 1.3555,
206
+ "grad_norm": 17.6370906829834,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 3.0256128,
213
+ "gpu_mem": 1.073928192,
214
+ "loss": 1.3234,
215
+ "grad_norm": 11.551438331604004,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 3.0256128,
222
+ "gpu_mem": 1.073997312,
223
+ "loss": 1.3358,
224
+ "grad_norm": 14.671205520629883,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 3.0256128,
231
+ "gpu_mem": 1.073954304,
232
+ "loss": 1.3709,
233
+ "grad_norm": 16.04226303100586,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 3.0256128,
240
+ "gpu_mem": 1.073912832,
241
+ "loss": 1.391,
242
+ "grad_norm": 14.387223243713379,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 3.0256128,
249
+ "gpu_mem": 1.073958912,
250
+ "loss": 1.6496,
251
+ "grad_norm": 26.753936767578125,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 3.0256128,
258
+ "gpu_mem": 1.073954304,
259
+ "loss": 1.3764,
260
+ "grad_norm": 10.248379707336426,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 3.0256128,
267
+ "gpu_mem": 1.073943552,
268
+ "loss": 1.3444,
269
+ "grad_norm": 8.105730056762695,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 3.0256128,
276
+ "gpu_mem": 1.073974272,
277
+ "loss": 1.3598,
278
+ "grad_norm": 9.523242950439453,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 3.0256128,
285
+ "gpu_mem": 1.073983488,
286
+ "loss": 1.3474,
287
+ "grad_norm": 10.56064224243164,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 3.0256128,
294
+ "gpu_mem": 1.07396352,
295
+ "loss": 1.3986,
296
+ "grad_norm": 15.865970611572266,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 3.0256128,
303
+ "gpu_mem": 1.073942016,
304
+ "loss": 1.4546,
305
+ "grad_norm": 21.07072639465332,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 3.0256128,
312
+ "gpu_mem": 1.073829888,
313
+ "loss": 2.0834,
314
+ "grad_norm": 17.78594398498535,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 3.0256128,
321
+ "gpu_mem": 1.068711424,
322
+ "loss": 1.2406,
323
+ "grad_norm": 12.084159851074219,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 3.0256128,
330
+ "gpu_mem": 1.06872064,
331
+ "loss": 1.5605,
332
+ "grad_norm": 59.46875762939453,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 3.0256128,
339
+ "gpu_mem": 1.068691456,
340
+ "loss": 1.3837,
341
+ "grad_norm": 37.64811706542969,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 3.0256128,
348
+ "gpu_mem": 1.068709888,
349
+ "loss": 1.3123,
350
+ "grad_norm": 21.93562889099121,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 3.0256128,
357
+ "gpu_mem": 1.068686848,
358
+ "loss": 1.3359,
359
+ "grad_norm": 17.83315658569336,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 3.0256128,
366
+ "gpu_mem": 1.068688384,
367
+ "loss": 1.2908,
368
+ "grad_norm": 12.607256889343262,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 3.0256128,
375
+ "gpu_mem": 1.068717568,
376
+ "loss": 1.3378,
377
+ "grad_norm": 29.468717575073242,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 3.0256128,
384
+ "gpu_mem": 1.068732928,
385
+ "loss": 1.2618,
386
+ "grad_norm": 13.891093254089355,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 3.0256128,
393
+ "gpu_mem": 1.06875136,
394
+ "loss": 1.2953,
395
+ "grad_norm": 20.18376922607422,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 3.0256128,
402
+ "gpu_mem": 1.06870528,
403
+ "loss": 1.2423,
404
+ "grad_norm": 12.552496910095215,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 3.0256128,
411
+ "gpu_mem": 1.068699136,
412
+ "loss": 1.197,
413
+ "grad_norm": 11.219213485717773,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 3.0256128,
420
+ "gpu_mem": 1.068692992,
421
+ "loss": 1.1792,
422
+ "grad_norm": 11.913239479064941,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 3.0256128,
429
+ "gpu_mem": 1.0686976,
430
+ "loss": 1.0754,
431
+ "grad_norm": 11.84579086303711,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 3.0256128,
438
+ "gpu_mem": 1.068688384,
439
+ "loss": 1.1704,
440
+ "grad_norm": 11.788681030273438,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 3.0256128,
447
+ "gpu_mem": 1.068669952,
448
+ "loss": 1.1673,
449
+ "grad_norm": 12.49934196472168,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 3.0256128,
456
+ "gpu_mem": 1.068694528,
457
+ "loss": 1.1689,
458
+ "grad_norm": 20.719552993774414,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 3.0256128,
465
+ "gpu_mem": 1.068722176,
466
+ "loss": 1.2007,
467
+ "grad_norm": 18.217304229736328,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 3.0256128,
474
+ "gpu_mem": 1.07394048,
475
+ "loss": 1.6113,
476
+ "grad_norm": 27.60318374633789,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 3.0256128,
483
+ "gpu_mem": 1.07390976,
484
+ "loss": 1.0235,
485
+ "grad_norm": 18.013338088989258,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 3.0256128,
492
+ "gpu_mem": 1.073943552,
493
+ "loss": 1.009,
494
+ "grad_norm": 22.722183227539062,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 3.0256128,
501
+ "gpu_mem": 1.07401728,
502
+ "loss": 1.0239,
503
+ "grad_norm": 21.968795776367188,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 3.0256128,
510
+ "gpu_mem": 1.073960448,
511
+ "loss": 0.8981,
512
+ "grad_norm": 19.004884719848633,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 3.0256128,
519
+ "gpu_mem": 1.073954304,
520
+ "loss": 0.8651,
521
+ "grad_norm": 24.159976959228516,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 3.0256128,
528
+ "gpu_mem": 1.074004992,
529
+ "loss": 0.8833,
530
+ "grad_norm": 21.838836669921875,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 3.0256128,
537
+ "gpu_mem": 1.073931264,
538
+ "loss": 0.9072,
539
+ "grad_norm": 27.530853271484375,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 3.0256128,
546
+ "gpu_mem": 1.073945088,
547
+ "loss": 0.9804,
548
+ "grad_norm": 28.941020965576172,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 3.0256128,
555
+ "gpu_mem": 1.073946624,
556
+ "loss": 0.9688,
557
+ "grad_norm": 28.280624389648438,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 3.0256128,
564
+ "gpu_mem": 1.073935872,
565
+ "loss": 1.0072,
566
+ "grad_norm": 28.708356857299805,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 3.0256128,
573
+ "gpu_mem": 1.073952768,
574
+ "loss": 0.9184,
575
+ "grad_norm": 29.564748764038086,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 3.0256128,
582
+ "gpu_mem": 1.073974272,
583
+ "loss": 0.8999,
584
+ "grad_norm": 42.21194839477539,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 3.0256128,
591
+ "gpu_mem": 1.073965056,
592
+ "loss": 0.8607,
593
+ "grad_norm": 30.265111923217773,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 3.0256128,
600
+ "gpu_mem": 1.073991168,
601
+ "loss": 0.8626,
602
+ "grad_norm": 28.273231506347656,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 3.0256128,
609
+ "gpu_mem": 1.073942016,
610
+ "loss": 0.8573,
611
+ "grad_norm": 29.98070526123047,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 3.0256128,
618
+ "gpu_mem": 1.073942016,
619
+ "train_runtime": 372.3545,
620
+ "train_samples_per_second": 12.021,
621
+ "train_steps_per_second": 0.183,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.4101540008011986
624
+ }
625
+ ]
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 0,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 32,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 32,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "o_proj",
32
+ "q_proj",
33
+ "gate_proj",
34
+ "up_proj",
35
+ "v_proj",
36
+ "k_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": true
40
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.6109215017064846
4
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "qmars",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 21018624
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-qmars-arc_c-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T21:50:02.458769"
38
+ }
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r32-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 2.860781568,
6
+ "gpu_mem": 1.137022464,
7
+ "loss": 4.8349,
8
+ "grad_norm": 75.46897888183594,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 2.865893376,
15
+ "gpu_mem": 1.305161728,
16
+ "loss": 5.1818,
17
+ "grad_norm": 76.17707824707031,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 2.866089984,
24
+ "gpu_mem": 1.305192448,
25
+ "loss": 2.218,
26
+ "grad_norm": 29.5435733795166,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 2.866089984,
33
+ "gpu_mem": 1.305158656,
34
+ "loss": 1.5022,
35
+ "grad_norm": 6.774408340454102,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 2.866089984,
42
+ "gpu_mem": 1.305146368,
43
+ "loss": 1.4632,
44
+ "grad_norm": 7.44521427154541,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 2.866089984,
51
+ "gpu_mem": 1.305209344,
52
+ "loss": 1.4715,
53
+ "grad_norm": 7.740568161010742,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 2.866089984,
60
+ "gpu_mem": 1.305215488,
61
+ "loss": 1.5742,
62
+ "grad_norm": 8.267945289611816,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 2.866089984,
69
+ "gpu_mem": 1.305174016,
70
+ "loss": 1.3934,
71
+ "grad_norm": 3.597435235977173,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 2.866089984,
78
+ "gpu_mem": 1.305169408,
79
+ "loss": 1.3452,
80
+ "grad_norm": 2.5027928352355957,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 2.866089984,
87
+ "gpu_mem": 1.305158656,
88
+ "loss": 1.4474,
89
+ "grad_norm": 3.489070415496826,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 2.866089984,
96
+ "gpu_mem": 1.305169408,
97
+ "loss": 1.3633,
98
+ "grad_norm": 1.7864291667938232,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 2.866089984,
105
+ "gpu_mem": 1.305193984,
106
+ "loss": 1.4912,
107
+ "grad_norm": 4.657872200012207,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 2.866089984,
114
+ "gpu_mem": 1.305193984,
115
+ "loss": 1.4818,
116
+ "grad_norm": 5.277657985687256,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 2.866089984,
123
+ "gpu_mem": 1.30514176,
124
+ "loss": 1.5029,
125
+ "grad_norm": 2.831676959991455,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 2.866089984,
132
+ "gpu_mem": 1.305217024,
133
+ "loss": 1.5351,
134
+ "grad_norm": 3.477987051010132,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 2.866089984,
141
+ "gpu_mem": 1.30521088,
142
+ "loss": 1.3817,
143
+ "grad_norm": 1.5456229448318481,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 2.866089984,
150
+ "gpu_mem": 1.305215488,
151
+ "loss": 1.4314,
152
+ "grad_norm": 2.3160977363586426,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 2.866089984,
159
+ "gpu_mem": 1.389246976,
160
+ "loss": 2.0422,
161
+ "grad_norm": 2.573564052581787,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 2.866089984,
168
+ "gpu_mem": 1.38924544,
169
+ "loss": 1.3818,
170
+ "grad_norm": 2.031684637069702,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 2.866089984,
177
+ "gpu_mem": 1.389220864,
178
+ "loss": 1.3827,
179
+ "grad_norm": 2.671165704727173,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 2.866089984,
186
+ "gpu_mem": 1.389228544,
187
+ "loss": 1.3347,
188
+ "grad_norm": 1.5832688808441162,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 2.866089984,
195
+ "gpu_mem": 1.389257728,
196
+ "loss": 1.3429,
197
+ "grad_norm": 2.4159576892852783,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 2.866089984,
204
+ "gpu_mem": 1.389286912,
205
+ "loss": 1.29,
206
+ "grad_norm": 1.643193006515503,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 2.866089984,
213
+ "gpu_mem": 1.38923008,
214
+ "loss": 1.4058,
215
+ "grad_norm": 2.3235573768615723,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 2.866089984,
222
+ "gpu_mem": 1.3892992,
223
+ "loss": 1.3284,
224
+ "grad_norm": 2.1304330825805664,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 2.866089984,
231
+ "gpu_mem": 1.389256192,
232
+ "loss": 1.3167,
233
+ "grad_norm": 1.504705786705017,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 2.866089984,
240
+ "gpu_mem": 1.38921472,
241
+ "loss": 1.3632,
242
+ "grad_norm": 2.2097530364990234,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 2.866089984,
249
+ "gpu_mem": 1.3892608,
250
+ "loss": 1.5934,
251
+ "grad_norm": 4.348984241485596,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 2.866089984,
258
+ "gpu_mem": 1.389256192,
259
+ "loss": 1.3625,
260
+ "grad_norm": 1.59405517578125,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 2.866089984,
267
+ "gpu_mem": 1.38924544,
268
+ "loss": 1.3605,
269
+ "grad_norm": 2.354477643966675,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 2.866089984,
276
+ "gpu_mem": 1.38927616,
277
+ "loss": 1.3167,
278
+ "grad_norm": 1.2073973417282104,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 2.866089984,
285
+ "gpu_mem": 1.389285376,
286
+ "loss": 1.3462,
287
+ "grad_norm": 1.477156639099121,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 2.866089984,
294
+ "gpu_mem": 1.389265408,
295
+ "loss": 1.4029,
296
+ "grad_norm": 2.4271631240844727,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 2.866089984,
303
+ "gpu_mem": 1.389243904,
304
+ "loss": 1.4064,
305
+ "grad_norm": 1.9992505311965942,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 2.866089984,
312
+ "gpu_mem": 1.389131776,
313
+ "loss": 2.0767,
314
+ "grad_norm": 2.3909573554992676,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 2.866089984,
321
+ "gpu_mem": 1.30518784,
322
+ "loss": 1.3289,
323
+ "grad_norm": 1.7182539701461792,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 2.866089984,
330
+ "gpu_mem": 1.305197056,
331
+ "loss": 1.3174,
332
+ "grad_norm": 1.4748281240463257,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 2.866089984,
339
+ "gpu_mem": 1.305167872,
340
+ "loss": 1.3186,
341
+ "grad_norm": 1.1838716268539429,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 2.866089984,
348
+ "gpu_mem": 1.305186304,
349
+ "loss": 1.3067,
350
+ "grad_norm": 1.746688723564148,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 2.866089984,
357
+ "gpu_mem": 1.305163264,
358
+ "loss": 1.316,
359
+ "grad_norm": 1.3366076946258545,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 2.866089984,
366
+ "gpu_mem": 1.3051648,
367
+ "loss": 1.3113,
368
+ "grad_norm": 2.173269510269165,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 2.866089984,
375
+ "gpu_mem": 1.305193984,
376
+ "loss": 1.287,
377
+ "grad_norm": 1.9705522060394287,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 2.866089984,
384
+ "gpu_mem": 1.305209344,
385
+ "loss": 1.2743,
386
+ "grad_norm": 1.440999984741211,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 2.866089984,
393
+ "gpu_mem": 1.305227776,
394
+ "loss": 1.2127,
395
+ "grad_norm": 1.4414082765579224,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 2.866089984,
402
+ "gpu_mem": 1.305181696,
403
+ "loss": 1.2363,
404
+ "grad_norm": 1.410117745399475,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 2.866089984,
411
+ "gpu_mem": 1.305175552,
412
+ "loss": 1.1618,
413
+ "grad_norm": 1.3951239585876465,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 2.866089984,
420
+ "gpu_mem": 1.305169408,
421
+ "loss": 1.1931,
422
+ "grad_norm": 1.4868206977844238,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 2.866089984,
429
+ "gpu_mem": 1.305174016,
430
+ "loss": 1.1658,
431
+ "grad_norm": 2.2503466606140137,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 2.866089984,
438
+ "gpu_mem": 1.3051648,
439
+ "loss": 1.1666,
440
+ "grad_norm": 2.3152573108673096,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 2.866089984,
447
+ "gpu_mem": 1.305146368,
448
+ "loss": 1.1869,
449
+ "grad_norm": 2.246035575866699,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 2.866089984,
456
+ "gpu_mem": 1.305170944,
457
+ "loss": 1.1132,
458
+ "grad_norm": 2.2531511783599854,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 2.866089984,
465
+ "gpu_mem": 1.305198592,
466
+ "loss": 1.2356,
467
+ "grad_norm": 2.720115900039673,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 2.866089984,
474
+ "gpu_mem": 1.389242368,
475
+ "loss": 1.7441,
476
+ "grad_norm": 3.720527172088623,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 2.866089984,
483
+ "gpu_mem": 1.389211648,
484
+ "loss": 1.0292,
485
+ "grad_norm": 2.4771134853363037,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 2.866089984,
492
+ "gpu_mem": 1.38924544,
493
+ "loss": 1.0343,
494
+ "grad_norm": 3.095989942550659,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 2.866089984,
501
+ "gpu_mem": 1.389319168,
502
+ "loss": 1.0445,
503
+ "grad_norm": 3.5430476665496826,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 2.866089984,
510
+ "gpu_mem": 1.389262336,
511
+ "loss": 0.9922,
512
+ "grad_norm": 2.634396553039551,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 2.866089984,
519
+ "gpu_mem": 1.389256192,
520
+ "loss": 0.9425,
521
+ "grad_norm": 3.697744131088257,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 2.866089984,
528
+ "gpu_mem": 1.38930688,
529
+ "loss": 0.8918,
530
+ "grad_norm": 3.4737136363983154,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 2.866089984,
537
+ "gpu_mem": 1.389233152,
538
+ "loss": 1.091,
539
+ "grad_norm": 4.901792049407959,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 2.866089984,
546
+ "gpu_mem": 1.389246976,
547
+ "loss": 1.0153,
548
+ "grad_norm": 3.8909311294555664,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 2.866089984,
555
+ "gpu_mem": 1.389248512,
556
+ "loss": 1.0389,
557
+ "grad_norm": 4.103649616241455,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 2.866089984,
564
+ "gpu_mem": 1.38923776,
565
+ "loss": 1.0792,
566
+ "grad_norm": 4.326652526855469,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 2.866089984,
573
+ "gpu_mem": 1.389254656,
574
+ "loss": 1.0662,
575
+ "grad_norm": 4.315281391143799,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 2.866089984,
582
+ "gpu_mem": 1.38927616,
583
+ "loss": 1.0428,
584
+ "grad_norm": 4.233317852020264,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 2.866089984,
591
+ "gpu_mem": 1.389266944,
592
+ "loss": 0.8833,
593
+ "grad_norm": 3.347691297531128,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 2.866089984,
600
+ "gpu_mem": 1.389293056,
601
+ "loss": 0.8826,
602
+ "grad_norm": 3.1047513484954834,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 2.866089984,
609
+ "gpu_mem": 1.389243904,
610
+ "loss": 1.0324,
611
+ "grad_norm": 3.6745681762695312,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 2.866089984,
618
+ "gpu_mem": 1.389243904,
619
+ "train_runtime": 384.6701,
620
+ "train_samples_per_second": 11.636,
621
+ "train_steps_per_second": 0.177,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.420820819104419
624
+ }
625
+ ]
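Each training_logs.json in this commit is a flat JSON array of per-step records (step, epoch, cpu_mem, gpu_mem, loss, grad_norm, learning_rate), followed by one trailing summary record carrying train_runtime, train_samples_per_second, train_steps_per_second, total_flos, and train_loss. A minimal sketch for summarizing one such run (the file path is illustrative, not part of this repo):

# Minimal sketch: summarize a per-step training log (path is hypothetical).
import json

with open("training_logs.json") as f:
    records = json.load(f)

steps = [r for r in records if "loss" in r]               # per-step entries
summary = next(r for r in records if "train_loss" in r)   # trailing summary

print("steps logged:", len(steps))
print("final step loss:", steps[-1]["loss"])
print("min step loss:", min(r["loss"] for r in steps))
print("mean train loss:", summary["train_loss"])
print("runtime (s):", summary["train_runtime"])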
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 16,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 0,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 8,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 8,
+ "target_modules": [
+ "down_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj"
+ ],
+ "task_type": null,
+ "use_bnb": true
+ }
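The adapter configs in this commit pair "r": 8 with "alpha": 16 and "r": 32 with "alpha": 64, so the -a2 suffix in the run names plausibly denotes alpha = 2·r. Assuming MARS follows the usual LoRA-style convention of scaling the adapter update by alpha / r (an assumption, not confirmed by these files), the scaling factor stays fixed across ranks:

# Sketch, assuming LoRA-style alpha/r update scaling for MARS adapters.
for r, alpha in [(8, 16), (32, 64)]:   # pairs taken from the configs in this commit
    print(f"r={r:>2} alpha={alpha:>2} scaling={alpha / r}")  # 2.0 in both runs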
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "arc_c",
+ "results": 0.6203071672354948
+ }
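Each run directory stores a single-metric eval_results.json like the one above. A small sketch for collecting these files into one table across the experiment tree (directory layout taken from the file paths in this commit; the root path is an assumption):

# Sketch: aggregate eval_results.json files across run directories.
import json
from pathlib import Path

rows = []
for p in Path(".").glob("*/*/eval_results.json"):
    res = json.loads(p.read_text())
    rows.append((p.parent.name, res["task"], res["results"]))

for name, task, acc in sorted(rows):
    print(f"{name:<45} {task:<10} {acc:.4f}")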
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "ARC_C",
+ "dataset_id": "allenai/ai2_arc",
+ "preprocess_id": "arc_train_deepeval"
+ },
+ "peft_config": {
+ "method": "qmars",
+ "rank": 8,
+ "alpha": 16,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 5233536
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 4,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-qmars-arc_c-r8-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T15:16:51.501968"
+ }
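A few quantities in the logs follow directly from this configuration: the effective batch size is per_device_batch_size × gradient_accumulation_steps = 32 × 2 = 64 (the "batch_size" field), and with the 68 optimizer steps seen in the logs, a warmup_ratio of 0.1 gives ceil(0.1 × 68) = 7 warmup steps, matching the learning rate reaching its 0.0003 peak at step 7. A sketch of the arithmetic:

# Sketch: derive effective batch size and warmup steps from the config.
import math

per_device_batch_size = 32
gradient_accumulation_steps = 2
total_steps = 68          # optimizer steps observed in training_logs.json
warmup_ratio = 0.1

effective_batch = per_device_batch_size * gradient_accumulation_steps
warmup_steps = math.ceil(total_steps * warmup_ratio)

print(effective_batch)  # 64, the "batch_size" field above
print(warmup_steps)     # 7; lr peaks at 0.0003 on step 7 in the logs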
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_c-r8-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
+ [
+ {
+ "step": 1,
+ "epoch": 0.05714285714285714,
+ "cpu_mem": 2.883121152,
+ "gpu_mem": 1.073887744,
+ "loss": 4.8349,
+ "grad_norm": 167.41305541992188,
+ "learning_rate": 4.285714285714285e-05
+ },
+ {
+ "step": 2,
+ "epoch": 0.11428571428571428,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115757568,
+ "loss": 5.1818,
+ "grad_norm": 169.5339813232422,
+ "learning_rate": 8.57142857142857e-05
+ },
+ {
+ "step": 3,
+ "epoch": 0.17142857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115788288,
+ "loss": 2.0398,
+ "grad_norm": 45.44222640991211,
+ "learning_rate": 0.00012857142857142855
+ },
+ {
+ "step": 4,
+ "epoch": 0.22857142857142856,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115754496,
+ "loss": 1.4888,
+ "grad_norm": 13.766551971435547,
+ "learning_rate": 0.0001714285714285714
+ },
+ {
+ "step": 5,
+ "epoch": 0.2857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115742208,
+ "loss": 1.4426,
+ "grad_norm": 19.619434356689453,
+ "learning_rate": 0.00021428571428571427
+ },
+ {
+ "step": 6,
+ "epoch": 0.34285714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115805184,
+ "loss": 1.4259,
+ "grad_norm": 12.592857360839844,
+ "learning_rate": 0.0002571428571428571
+ },
+ {
+ "step": 7,
+ "epoch": 0.4,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115811328,
+ "loss": 1.4734,
+ "grad_norm": 10.968993186950684,
+ "learning_rate": 0.0003
+ },
+ {
+ "step": 8,
+ "epoch": 0.45714285714285713,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115769856,
+ "loss": 1.4618,
+ "grad_norm": 9.009696006774902,
+ "learning_rate": 0.00029980111348272456
+ },
+ {
+ "step": 9,
+ "epoch": 0.5142857142857142,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115765248,
+ "loss": 1.3015,
+ "grad_norm": 4.542705535888672,
+ "learning_rate": 0.00029920498134218835
+ },
+ {
+ "step": 10,
+ "epoch": 0.5714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115754496,
+ "loss": 1.5272,
+ "grad_norm": 11.390000343322754,
+ "learning_rate": 0.0002982131844136615
+ },
+ {
+ "step": 11,
+ "epoch": 0.6285714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115765248,
+ "loss": 1.3546,
+ "grad_norm": 3.7129766941070557,
+ "learning_rate": 0.0002968283527643036
+ },
+ {
+ "step": 12,
+ "epoch": 0.6857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115789824,
+ "loss": 1.5265,
+ "grad_norm": 9.75503921508789,
+ "learning_rate": 0.000295054158718698
+ },
+ {
+ "step": 13,
+ "epoch": 0.7428571428571429,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115789824,
+ "loss": 1.3401,
+ "grad_norm": 9.924088478088379,
+ "learning_rate": 0.00029289530712050735
+ },
+ {
+ "step": 14,
+ "epoch": 0.8,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.1157376,
+ "loss": 1.6946,
+ "grad_norm": 11.323637008666992,
+ "learning_rate": 0.000290357522856074
+ },
+ {
+ "step": 15,
+ "epoch": 0.8571428571428571,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115812864,
+ "loss": 1.5725,
+ "grad_norm": 8.125307083129883,
+ "learning_rate": 0.0002874475356730507
+ },
+ {
+ "step": 16,
+ "epoch": 0.9142857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.11580672,
+ "loss": 1.4913,
+ "grad_norm": 7.795830726623535,
+ "learning_rate": 0.0002841730623343193
+ },
+ {
+ "step": 17,
+ "epoch": 0.9714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115811328,
+ "loss": 1.4569,
+ "grad_norm": 5.1771240234375,
+ "learning_rate": 0.00028054278615452326
+ },
+ {
+ "step": 18,
+ "epoch": 1.0285714285714285,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136708096,
+ "loss": 2.1211,
+ "grad_norm": 18.78937339782715,
+ "learning_rate": 0.0002765663339734778
+ },
+ {
+ "step": 19,
+ "epoch": 1.0857142857142856,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13670656,
+ "loss": 1.4114,
+ "grad_norm": 4.584958553314209,
+ "learning_rate": 0.00027225425062752165
+ },
+ {
+ "step": 20,
+ "epoch": 1.1428571428571428,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136681984,
+ "loss": 1.3739,
+ "grad_norm": 7.459542751312256,
+ "learning_rate": 0.0002676179709865066
+ },
+ {
+ "step": 21,
+ "epoch": 1.2,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136689664,
+ "loss": 1.3671,
+ "grad_norm": 3.8549575805664062,
+ "learning_rate": 0.0002626697896305779
+ },
+ {
+ "step": 22,
+ "epoch": 1.2571428571428571,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136718848,
+ "loss": 1.3602,
+ "grad_norm": 4.934852123260498,
+ "learning_rate": 0.000257422828247159
+ },
+ {
+ "step": 23,
+ "epoch": 1.3142857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136748032,
+ "loss": 1.3096,
+ "grad_norm": 2.996155261993408,
+ "learning_rate": 0.00025189100083459397
+ },
+ {
+ "step": 24,
+ "epoch": 1.3714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.1366912,
+ "loss": 1.3381,
+ "grad_norm": 3.403679370880127,
+ "learning_rate": 0.0002460889768047263
+ },
+ {
+ "step": 25,
+ "epoch": 1.4285714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13676032,
+ "loss": 1.3265,
+ "grad_norm": 3.279336452484131,
+ "learning_rate": 0.00024003214208225522
+ },
+ {
+ "step": 26,
+ "epoch": 1.4857142857142858,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136717312,
+ "loss": 1.3729,
+ "grad_norm": 3.639723539352417,
+ "learning_rate": 0.00023373655830402968
+ },
+ {
+ "step": 27,
+ "epoch": 1.5428571428571427,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13667584,
+ "loss": 1.3882,
+ "grad_norm": 4.425181865692139,
+ "learning_rate": 0.00022721892022647462
+ },
+ {
+ "step": 28,
+ "epoch": 1.6,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13672192,
+ "loss": 1.5116,
+ "grad_norm": 6.004555702209473,
+ "learning_rate": 0.000220496511454098
+ },
+ {
+ "step": 29,
+ "epoch": 1.657142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136717312,
+ "loss": 1.3472,
+ "grad_norm": 3.100022792816162,
+ "learning_rate": 0.0002135871586064791
+ },
+ {
+ "step": 30,
+ "epoch": 1.7142857142857144,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13670656,
+ "loss": 1.3379,
+ "grad_norm": 3.413043260574341,
+ "learning_rate": 0.00020650918404527775
+ },
+ {
+ "step": 31,
+ "epoch": 1.7714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13673728,
+ "loss": 1.3091,
+ "grad_norm": 1.8831239938735962,
+ "learning_rate": 0.00019928135728662522
+ },
+ {
+ "step": 32,
+ "epoch": 1.8285714285714287,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136746496,
+ "loss": 1.3402,
+ "grad_norm": 2.753385543823242,
+ "learning_rate": 0.00019192284522774142
+ },
+ {
+ "step": 33,
+ "epoch": 1.8857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136726528,
+ "loss": 1.3955,
+ "grad_norm": 3.8380284309387207,
+ "learning_rate": 0.00018445316131976934
+ },
+ {
+ "step": 34,
+ "epoch": 1.9428571428571428,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136705024,
+ "loss": 1.3676,
+ "grad_norm": 2.57751202583313,
+ "learning_rate": 0.00017689211382161034
+ },
+ {
+ "step": 35,
+ "epoch": 2.0,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136592896,
+ "loss": 2.1042,
+ "grad_norm": 4.824999809265137,
+ "learning_rate": 0.00016925975327198266
+ },
+ {
+ "step": 36,
+ "epoch": 2.057142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.11578368,
+ "loss": 1.2838,
+ "grad_norm": 3.206289052963257,
+ "learning_rate": 0.00016157631931899697
+ },
+ {
+ "step": 37,
+ "epoch": 2.1142857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115792896,
+ "loss": 1.3517,
+ "grad_norm": 3.9317543506622314,
+ "learning_rate": 0.0001538621870482483
+ },
+ {
+ "step": 38,
+ "epoch": 2.1714285714285713,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115763712,
+ "loss": 1.3077,
+ "grad_norm": 3.159288167953491,
+ "learning_rate": 0.00014613781295175172
+ },
+ {
+ "step": 39,
+ "epoch": 2.2285714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115782144,
+ "loss": 1.3157,
+ "grad_norm": 3.4918620586395264,
+ "learning_rate": 0.00013842368068100303
+ },
+ {
+ "step": 40,
+ "epoch": 2.2857142857142856,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115759104,
+ "loss": 1.3421,
+ "grad_norm": 3.825528383255005,
+ "learning_rate": 0.00013074024672801731
+ },
+ {
+ "step": 41,
+ "epoch": 2.342857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.11576064,
+ "loss": 1.3183,
+ "grad_norm": 2.9959378242492676,
+ "learning_rate": 0.00012310788617838966
+ },
+ {
+ "step": 42,
+ "epoch": 2.4,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115789824,
+ "loss": 1.24,
+ "grad_norm": 4.1695404052734375,
+ "learning_rate": 0.00011554683868023067
+ },
+ {
+ "step": 43,
+ "epoch": 2.4571428571428573,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115805184,
+ "loss": 1.2721,
+ "grad_norm": 4.863770484924316,
+ "learning_rate": 0.00010807715477225858
+ },
+ {
+ "step": 44,
+ "epoch": 2.5142857142857142,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115823616,
+ "loss": 1.2515,
+ "grad_norm": 3.4958484172821045,
+ "learning_rate": 0.00010071864271337478
+ },
+ {
+ "step": 45,
+ "epoch": 2.571428571428571,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115777536,
+ "loss": 1.2826,
+ "grad_norm": 3.9481709003448486,
+ "learning_rate": 9.34908159547222e-05
+ },
+ {
+ "step": 46,
+ "epoch": 2.6285714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115771392,
+ "loss": 1.2037,
+ "grad_norm": 3.5659868717193604,
+ "learning_rate": 8.641284139352091e-05
+ },
+ {
+ "step": 47,
+ "epoch": 2.685714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115765248,
+ "loss": 1.1832,
+ "grad_norm": 4.207839488983154,
+ "learning_rate": 7.950348854590204e-05
+ },
+ {
+ "step": 48,
+ "epoch": 2.742857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115769856,
+ "loss": 1.093,
+ "grad_norm": 3.661602735519409,
+ "learning_rate": 7.278107977352543e-05
+ },
+ {
+ "step": 49,
+ "epoch": 2.8,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.11576064,
+ "loss": 1.1906,
+ "grad_norm": 4.302423000335693,
+ "learning_rate": 6.626344169597031e-05
+ },
+ {
+ "step": 50,
+ "epoch": 2.857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115742208,
+ "loss": 1.1374,
+ "grad_norm": 4.260297775268555,
+ "learning_rate": 5.996785791774478e-05
+ },
+ {
+ "step": 51,
+ "epoch": 2.914285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115766784,
+ "loss": 1.2067,
+ "grad_norm": 6.081331253051758,
+ "learning_rate": 5.391102319527373e-05
+ },
+ {
+ "step": 52,
+ "epoch": 2.9714285714285715,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.115794432,
+ "loss": 1.1881,
+ "grad_norm": 6.451767444610596,
+ "learning_rate": 4.8108999165406026e-05
+ },
+ {
+ "step": 53,
+ "epoch": 3.0285714285714285,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136703488,
+ "loss": 1.7001,
+ "grad_norm": 9.44800090789795,
+ "learning_rate": 4.257717175284103e-05
+ },
+ {
+ "step": 54,
+ "epoch": 3.085714285714286,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136672768,
+ "loss": 1.0727,
+ "grad_norm": 5.197486400604248,
+ "learning_rate": 3.733021036942205e-05
+ },
+ {
+ "step": 55,
+ "epoch": 3.142857142857143,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13670656,
+ "loss": 1.0685,
+ "grad_norm": 5.644598007202148,
+ "learning_rate": 3.238202901349345e-05
+ },
+ {
+ "step": 56,
+ "epoch": 3.2,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136780288,
+ "loss": 1.0943,
+ "grad_norm": 6.423681259155273,
+ "learning_rate": 2.774574937247831e-05
+ },
+ {
+ "step": 57,
+ "epoch": 3.257142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136723456,
+ "loss": 0.9465,
+ "grad_norm": 5.702960968017578,
+ "learning_rate": 2.3433666026522153e-05
+ },
+ {
+ "step": 58,
+ "epoch": 3.314285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136717312,
+ "loss": 0.9811,
+ "grad_norm": 6.766078472137451,
+ "learning_rate": 1.945721384547671e-05
+ },
+ {
+ "step": 59,
+ "epoch": 3.3714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136768,
+ "loss": 0.9603,
+ "grad_norm": 6.710747241973877,
+ "learning_rate": 1.5826937665680693e-05
+ },
+ {
+ "step": 60,
+ "epoch": 3.4285714285714284,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136694272,
+ "loss": 1.0782,
+ "grad_norm": 7.888988018035889,
+ "learning_rate": 1.2552464326949302e-05
+ },
+ {
+ "step": 61,
+ "epoch": 3.4857142857142858,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136708096,
+ "loss": 0.9895,
+ "grad_norm": 6.987651348114014,
+ "learning_rate": 9.64247714392597e-06
+ },
+ {
+ "step": 62,
+ "epoch": 3.5428571428571427,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136709632,
+ "loss": 0.9953,
+ "grad_norm": 6.920251846313477,
+ "learning_rate": 7.104692879492624e-06
+ },
+ {
+ "step": 63,
+ "epoch": 3.6,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13669888,
+ "loss": 1.0593,
+ "grad_norm": 7.611911773681641,
+ "learning_rate": 4.945841281301943e-06
+ },
+ {
+ "step": 64,
+ "epoch": 3.657142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136715776,
+ "loss": 0.9854,
+ "grad_norm": 8.091558456420898,
+ "learning_rate": 3.1716472356963286e-06
+ },
+ {
+ "step": 65,
+ "epoch": 3.7142857142857144,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.13673728,
+ "loss": 1.0049,
+ "grad_norm": 8.712056159973145,
+ "learning_rate": 1.7868155863384415e-06
+ },
+ {
+ "step": 66,
+ "epoch": 3.7714285714285714,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136728064,
+ "loss": 0.9963,
+ "grad_norm": 7.288658618927002,
+ "learning_rate": 7.950186578116413e-07
+ },
+ {
+ "step": 67,
+ "epoch": 3.8285714285714287,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136754176,
+ "loss": 0.9171,
+ "grad_norm": 6.7615966796875,
+ "learning_rate": 1.988865172754206e-07
+ },
+ {
+ "step": 68,
+ "epoch": 3.8857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136705024,
+ "loss": 0.946,
+ "grad_norm": 8.281795501708984,
+ "learning_rate": 0.0
+ },
+ {
+ "step": 68,
+ "epoch": 3.8857142857142857,
+ "cpu_mem": 2.888429568,
+ "gpu_mem": 1.136705024,
+ "train_runtime": 372.291,
+ "train_samples_per_second": 12.023,
+ "train_steps_per_second": 0.183,
+ "total_flos": 0.0,
+ "train_loss": 1.4214746145641102
+ }
+ ]
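The logged learning rates are consistent with linear warmup for 7 steps followed by cosine decay to zero over the remaining 61 of the 68 steps (the cosine-with-warmup schedule used by Hugging Face Transformers; the warmup/total step counts are inferred from the logs, not stated in the files). A closed-form sketch that reproduces, e.g., the step-8 value 0.00029980111348272456:

# Sketch: reproduce the logged LR schedule (linear warmup + cosine decay).
import math

LR_MAX, WARMUP, TOTAL = 3e-4, 7, 68

def lr_at(step):
    if step < WARMUP:
        return LR_MAX * step / WARMUP
    progress = (step - WARMUP) / (TOTAL - WARMUP)
    return LR_MAX * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(8))   # ~0.0002998011134827..., matching step 8 above
print(lr_at(68))  # 0.0, matching the final step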
TinyLlama_v1.1-qmars/TinyLlama_v1.1-qmars-arc_e-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 64,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 0,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 32,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 32,
+ "target_modules": [
+ "down_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj",
+ "up_proj",
+ "v_proj",
+ "k_proj"
+ ],
+ "task_type": null,
+ "use_bnb": true
+ }
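"use_bnb": true together with "quant_n_bits": 8 indicates the TinyLlama base model was loaded through bitsandbytes in 8-bit before the MARS adapter was attached. MARS is a custom PEFT type, so the adapter-loading step is not shown here; a sketch of just the quantized base-model load with standard Transformers APIs (an assumption about how these configs were consumed):

# Sketch: load the 8-bit base model implied by use_bnb / quant_n_bits.
# The MARS adapter itself is a custom PEFT type and is not loaded here.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "TinyLlama/TinyLlama_v1.1"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)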