j05hr3d committed
Commit 6577443 · verified · 1 Parent(s): a8b232c

Model save

Files changed (3):
  1. README.md +12 -15
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +110 -155
README.md CHANGED
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.8500
+ - Loss: 0.7801
 
  ## Model description
 
@@ -47,25 +47,22 @@ The following hyperparameters were used during training:
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: linear
  - lr_scheduler_warmup_ratio: 0.03
- - num_epochs: 4
+ - num_epochs: 3
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:------:|:----:|:---------------:|
- | 1.0763 | 0.2996 | 20 | 0.9908 |
- | 0.8064 | 0.5993 | 40 | 0.9294 |
- | 1.0071 | 0.8989 | 60 | 0.9032 |
- | 0.805 | 1.1948 | 80 | 0.8865 |
- | 0.7293 | 1.4944 | 100 | 0.8719 |
- | 0.7675 | 1.7940 | 120 | 0.8570 |
- | 0.7367 | 2.0899 | 140 | 0.8649 |
- | 0.6303 | 2.3895 | 160 | 0.8570 |
- | 0.6213 | 2.6891 | 180 | 0.8549 |
- | 0.7035 | 2.9888 | 200 | 0.8500 |
- | 0.6131 | 3.2846 | 220 | 0.8656 |
- | 0.6333 | 3.5843 | 240 | 0.8650 |
- | 0.5252 | 3.8839 | 260 | 0.8719 |
+ | 1.0934 | 0.2909 | 20 | 0.9285 |
+ | 0.8825 | 0.5818 | 40 | 0.8725 |
+ | 0.8459 | 0.8727 | 60 | 0.8423 |
+ | 0.8573 | 1.16 | 80 | 0.8205 |
+ | 0.8109 | 1.4509 | 100 | 0.8079 |
+ | 0.7729 | 1.7418 | 120 | 0.7978 |
+ | 0.7089 | 2.0291 | 140 | 0.7842 |
+ | 0.7298 | 2.32 | 160 | 0.7870 |
+ | 0.6684 | 2.6109 | 180 | 0.7820 |
+ | 0.6122 | 2.9018 | 200 | 0.7801 |
 
 
  ### Framework versions
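The updated card, together with the `adapter_model.safetensors` change below, describes a PEFT adapter trained on top of Qwen/Qwen2.5-Coder-3B-Instruct. A minimal loading sketch with `transformers` and `peft`; the adapter repo id is assumed from `best_model_checkpoint` in `trainer_state.json` and may differ from the actual Hub repo name:

```python
# Sketch only: load the base model and apply this commit's adapter with peft.
# "j05hr3d/SFT-Qwen2.5-Coder-3B_v1" is assumed from best_model_checkpoint in
# trainer_state.json; replace it with the real Hub repo id or a local path.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
adapter_id = "j05hr3d/SFT-Qwen2.5-Coder-3B_v1"  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")

# PeftModel loads adapter_model.safetensors and wraps it around the base weights.
model = PeftModel.from_pretrained(base_model, adapter_id)
model.eval()
```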
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3577d469fc22bdcf933d2f00aa6484d30cfcf5c86d78ce05adc235ac36d79c57
+ oid sha256:62fea6b6416f6f3ac15bb70f240b30aae45e641d6354487f192e9574b4bae07c
  size 239536272
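`adapter_model.safetensors` is stored through Git LFS, so the diff only touches the pointer file: the `oid sha256:…` changes while the size stays 239536272 bytes. A small sketch, assuming the weight file has been downloaded locally, for checking a copy against the new pointer:

```python
# Verify a local copy of adapter_model.safetensors against the LFS pointer above.
# The oid and size come from the new pointer in this commit; the local path is a
# placeholder for wherever the file was downloaded.
import hashlib
from pathlib import Path

expected_oid = "62fea6b6416f6f3ac15bb70f240b30aae45e641d6354487f192e9574b4bae07c"
expected_size = 239536272
path = Path("adapter_model.safetensors")  # hypothetical local path

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert path.stat().st_size == expected_size, "size does not match the LFS pointer"
assert sha.hexdigest() == expected_oid, "sha256 does not match the LFS pointer"
print("adapter_model.safetensors matches the LFS pointer in this commit")
```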
trainer_state.json CHANGED
@@ -1,231 +1,186 @@
  {
  "best_global_step": 200,
- "best_metric": 0.8499857187271118,
+ "best_metric": 0.7801279425621033,
  "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-3B_v1/checkpoint-200",
- "epoch": 3.8838951310861423,
+ "epoch": 3.0,
  "eval_steps": 20,
- "global_step": 260,
+ "global_step": 207,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.299625468164794,
- "grad_norm": 0.5244920253753662,
- "learning_rate": 9.613899613899614e-05,
- "loss": 1.0763,
+ "epoch": 0.2909090909090909,
+ "grad_norm": 0.3590531647205353,
+ "learning_rate": 9.4e-05,
+ "loss": 1.0934,
  "step": 20
  },
  {
- "epoch": 0.299625468164794,
- "eval_loss": 0.9908040761947632,
- "eval_runtime": 13.9504,
- "eval_samples_per_second": 4.086,
- "eval_steps_per_second": 0.573,
+ "epoch": 0.2909090909090909,
+ "eval_loss": 0.9285057783126831,
+ "eval_runtime": 14.1933,
+ "eval_samples_per_second": 4.227,
+ "eval_steps_per_second": 0.564,
  "step": 20
  },
  {
- "epoch": 0.599250936329588,
- "grad_norm": 0.3149108290672302,
- "learning_rate": 8.841698841698842e-05,
- "loss": 0.8064,
+ "epoch": 0.5818181818181818,
+ "grad_norm": 0.9511428475379944,
+ "learning_rate": 8.4e-05,
+ "loss": 0.8825,
  "step": 40
  },
  {
- "epoch": 0.599250936329588,
- "eval_loss": 0.9294381141662598,
- "eval_runtime": 12.1002,
- "eval_samples_per_second": 4.711,
- "eval_steps_per_second": 0.661,
+ "epoch": 0.5818181818181818,
+ "eval_loss": 0.8724586367607117,
+ "eval_runtime": 11.8887,
+ "eval_samples_per_second": 5.047,
+ "eval_steps_per_second": 0.673,
  "step": 40
  },
  {
- "epoch": 0.898876404494382,
- "grad_norm": 0.6792584657669067,
- "learning_rate": 8.06949806949807e-05,
- "loss": 1.0071,
+ "epoch": 0.8727272727272727,
+ "grad_norm": 0.4212999939918518,
+ "learning_rate": 7.4e-05,
+ "loss": 0.8459,
  "step": 60
  },
  {
- "epoch": 0.898876404494382,
- "eval_loss": 0.9031661152839661,
- "eval_runtime": 12.0977,
- "eval_samples_per_second": 4.712,
- "eval_steps_per_second": 0.661,
+ "epoch": 0.8727272727272727,
+ "eval_loss": 0.8422630429267883,
+ "eval_runtime": 11.896,
+ "eval_samples_per_second": 5.044,
+ "eval_steps_per_second": 0.672,
  "step": 60
  },
  {
- "epoch": 1.1947565543071161,
- "grad_norm": 0.4398963451385498,
- "learning_rate": 7.297297297297297e-05,
- "loss": 0.805,
+ "epoch": 1.16,
+ "grad_norm": 0.4596804082393646,
+ "learning_rate": 6.400000000000001e-05,
+ "loss": 0.8573,
  "step": 80
  },
  {
- "epoch": 1.1947565543071161,
- "eval_loss": 0.886512815952301,
- "eval_runtime": 12.1101,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 1.16,
+ "eval_loss": 0.8204946517944336,
+ "eval_runtime": 11.9018,
+ "eval_samples_per_second": 5.041,
+ "eval_steps_per_second": 0.672,
  "step": 80
  },
  {
- "epoch": 1.49438202247191,
- "grad_norm": 0.4501686096191406,
- "learning_rate": 6.525096525096526e-05,
- "loss": 0.7293,
+ "epoch": 1.450909090909091,
+ "grad_norm": 0.5978978872299194,
+ "learning_rate": 5.4000000000000005e-05,
+ "loss": 0.8109,
  "step": 100
  },
  {
- "epoch": 1.49438202247191,
- "eval_loss": 0.8719378113746643,
- "eval_runtime": 12.1092,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 1.450909090909091,
+ "eval_loss": 0.807877779006958,
+ "eval_runtime": 11.8993,
+ "eval_samples_per_second": 5.042,
+ "eval_steps_per_second": 0.672,
  "step": 100
  },
  {
- "epoch": 1.7940074906367043,
- "grad_norm": 0.3679046630859375,
- "learning_rate": 5.752895752895753e-05,
- "loss": 0.7675,
+ "epoch": 1.7418181818181817,
+ "grad_norm": 0.6281698942184448,
+ "learning_rate": 4.4000000000000006e-05,
+ "loss": 0.7729,
  "step": 120
  },
  {
- "epoch": 1.7940074906367043,
- "eval_loss": 0.856951117515564,
- "eval_runtime": 12.1105,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 1.7418181818181817,
+ "eval_loss": 0.7977859377861023,
+ "eval_runtime": 11.9022,
+ "eval_samples_per_second": 5.041,
+ "eval_steps_per_second": 0.672,
  "step": 120
  },
  {
- "epoch": 2.0898876404494384,
- "grad_norm": 0.7033249139785767,
- "learning_rate": 4.980694980694981e-05,
- "loss": 0.7367,
+ "epoch": 2.0290909090909093,
+ "grad_norm": 0.6559261679649353,
+ "learning_rate": 3.4000000000000007e-05,
+ "loss": 0.7089,
  "step": 140
  },
  {
- "epoch": 2.0898876404494384,
- "eval_loss": 0.8648577928543091,
- "eval_runtime": 12.1063,
- "eval_samples_per_second": 4.708,
- "eval_steps_per_second": 0.661,
+ "epoch": 2.0290909090909093,
+ "eval_loss": 0.7841778993606567,
+ "eval_runtime": 11.9,
+ "eval_samples_per_second": 5.042,
+ "eval_steps_per_second": 0.672,
  "step": 140
  },
  {
- "epoch": 2.3895131086142323,
- "grad_norm": 0.4625875949859619,
- "learning_rate": 4.2084942084942086e-05,
- "loss": 0.6303,
+ "epoch": 2.32,
+ "grad_norm": 0.7929721474647522,
+ "learning_rate": 2.4e-05,
+ "loss": 0.7298,
  "step": 160
  },
  {
- "epoch": 2.3895131086142323,
- "eval_loss": 0.8570329546928406,
- "eval_runtime": 12.1098,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 2.32,
+ "eval_loss": 0.7870374917984009,
+ "eval_runtime": 11.902,
+ "eval_samples_per_second": 5.041,
+ "eval_steps_per_second": 0.672,
  "step": 160
  },
  {
- "epoch": 2.689138576779026,
- "grad_norm": 0.949469804763794,
- "learning_rate": 3.436293436293436e-05,
- "loss": 0.6213,
+ "epoch": 2.610909090909091,
+ "grad_norm": 0.48386672139167786,
+ "learning_rate": 1.4000000000000001e-05,
+ "loss": 0.6684,
  "step": 180
  },
  {
- "epoch": 2.689138576779026,
- "eval_loss": 0.8548977971076965,
- "eval_runtime": 12.1087,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 2.610909090909091,
+ "eval_loss": 0.7819858193397522,
+ "eval_runtime": 11.9025,
+ "eval_samples_per_second": 5.041,
+ "eval_steps_per_second": 0.672,
  "step": 180
  },
  {
- "epoch": 2.98876404494382,
- "grad_norm": 0.8807445168495178,
- "learning_rate": 2.6640926640926645e-05,
- "loss": 0.7035,
+ "epoch": 2.901818181818182,
+ "grad_norm": 0.3761616349220276,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.6122,
  "step": 200
  },
  {
- "epoch": 2.98876404494382,
- "eval_loss": 0.8499857187271118,
- "eval_runtime": 12.1085,
- "eval_samples_per_second": 4.707,
- "eval_steps_per_second": 0.661,
+ "epoch": 2.901818181818182,
+ "eval_loss": 0.7801279425621033,
+ "eval_runtime": 11.902,
+ "eval_samples_per_second": 5.041,
+ "eval_steps_per_second": 0.672,
  "step": 200
  },
  {
- "epoch": 3.284644194756554,
- "grad_norm": 0.573523223400116,
- "learning_rate": 1.891891891891892e-05,
- "loss": 0.6131,
- "step": 220
+ "epoch": 3.0,
+ "step": 207,
+ "total_flos": 3.147624935890944e+16,
+ "train_loss": 0.794930600889639,
+ "train_runtime": 952.7294,
+ "train_samples_per_second": 1.732,
+ "train_steps_per_second": 0.217
  },
  {
- "epoch": 3.284644194756554,
- "eval_loss": 0.8655520677566528,
- "eval_runtime": 12.1027,
- "eval_samples_per_second": 4.71,
- "eval_steps_per_second": 0.661,
- "step": 220
- },
- {
- "epoch": 3.5842696629213484,
- "grad_norm": 0.4450347125530243,
- "learning_rate": 1.1196911196911197e-05,
- "loss": 0.6333,
- "step": 240
- },
- {
- "epoch": 3.5842696629213484,
- "eval_loss": 0.8650490641593933,
- "eval_runtime": 12.1174,
- "eval_samples_per_second": 4.704,
- "eval_steps_per_second": 0.66,
- "step": 240
- },
- {
- "epoch": 3.8838951310861423,
- "grad_norm": 0.6607774496078491,
- "learning_rate": 3.474903474903475e-06,
- "loss": 0.5252,
- "step": 260
- },
- {
- "epoch": 3.8838951310861423,
- "eval_loss": 0.8718735575675964,
- "eval_runtime": 12.1037,
- "eval_samples_per_second": 4.709,
- "eval_steps_per_second": 0.661,
- "step": 260
- },
- {
- "epoch": 3.8838951310861423,
- "step": 260,
- "total_flos": 4.022488092617933e+16,
- "train_loss": 0.7426851749420166,
- "train_runtime": 1184.6635,
- "train_samples_per_second": 1.803,
- "train_steps_per_second": 0.226
- },
- {
- "epoch": 3.8838951310861423,
- "eval_loss": 0.8499857187271118,
- "eval_runtime": 12.1277,
- "eval_samples_per_second": 4.7,
- "eval_steps_per_second": 0.66,
- "step": 260
+ "epoch": 3.0,
+ "eval_loss": 0.7801279425621033,
+ "eval_runtime": 11.912,
+ "eval_samples_per_second": 5.037,
+ "eval_steps_per_second": 0.672,
+ "step": 207
  }
  ],
  "logging_steps": 20,
- "max_steps": 268,
+ "max_steps": 207,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 4,
+ "num_train_epochs": 3,
  "save_steps": 20,
  "stateful_callbacks": {
  "EarlyStoppingCallback": {
@@ -234,7 +189,7 @@
  "early_stopping_threshold": 0.0
  },
  "attributes": {
- "early_stopping_patience_counter": 3
+ "early_stopping_patience_counter": 0
  }
  },
  "TrainerControl": {
@@ -248,7 +203,7 @@
  "attributes": {}
  }
  },
- "total_flos": 4.022488092617933e+16,
+ "total_flos": 3.147624935890944e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null