indiejoseph commited on
Commit
4c21da6
·
verified ·
1 Parent(s): c472012

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +744 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +1175 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CTCTransformerModel"
4
+ ],
5
+ "blank_id": 0,
6
+ "hidden_size": 384,
7
+ "id2label": {
8
+ "0": "<pad>",
9
+ "1": "|",
10
+ "2": "AA0",
11
+ "3": "AA1",
12
+ "4": "AA2",
13
+ "5": "AE0",
14
+ "6": "AE1",
15
+ "7": "AE2",
16
+ "8": "AH0",
17
+ "9": "AH1",
18
+ "10": "AH2",
19
+ "11": "AO0",
20
+ "12": "AO1",
21
+ "13": "AO2",
22
+ "14": "AW0",
23
+ "15": "AW1",
24
+ "16": "AW2",
25
+ "17": "AY0",
26
+ "18": "AY1",
27
+ "19": "AY2",
28
+ "20": "B",
29
+ "21": "CH",
30
+ "22": "D",
31
+ "23": "DH",
32
+ "24": "EH0",
33
+ "25": "EH1",
34
+ "26": "EH2",
35
+ "27": "ER0",
36
+ "28": "ER1",
37
+ "29": "ER2",
38
+ "30": "EY0",
39
+ "31": "EY1",
40
+ "32": "EY2",
41
+ "33": "F",
42
+ "34": "G",
43
+ "35": "HH",
44
+ "36": "IH0",
45
+ "37": "IH1",
46
+ "38": "IH2",
47
+ "39": "IY0",
48
+ "40": "IY1",
49
+ "41": "IY2",
50
+ "42": "JH",
51
+ "43": "K",
52
+ "44": "L",
53
+ "45": "M",
54
+ "46": "N",
55
+ "47": "NG",
56
+ "48": "Ng",
57
+ "49": "OW0",
58
+ "50": "OW1",
59
+ "51": "OW2",
60
+ "52": "OY0",
61
+ "53": "OY1",
62
+ "54": "OY2",
63
+ "55": "P",
64
+ "56": "R",
65
+ "57": "S",
66
+ "58": "SH",
67
+ "59": "T",
68
+ "60": "TH",
69
+ "61": "UH0",
70
+ "62": "UH1",
71
+ "63": "UH2",
72
+ "64": "UW0",
73
+ "65": "UW1",
74
+ "66": "UW2",
75
+ "67": "V",
76
+ "68": "W",
77
+ "69": "Y",
78
+ "70": "Z",
79
+ "71": "ZH",
80
+ "72": "aa",
81
+ "73": "aa_1",
82
+ "74": "aa_2",
83
+ "75": "aa_3",
84
+ "76": "aa_4",
85
+ "77": "aa_5",
86
+ "78": "aa_6",
87
+ "79": "aai",
88
+ "80": "aai_1",
89
+ "81": "aai_2",
90
+ "82": "aai_3",
91
+ "83": "aai_4",
92
+ "84": "aai_5",
93
+ "85": "aai_6",
94
+ "86": "aak",
95
+ "87": "aak_1",
96
+ "88": "aak_2",
97
+ "89": "aak_3",
98
+ "90": "aak_6",
99
+ "91": "aam",
100
+ "92": "aam_1",
101
+ "93": "aam_2",
102
+ "94": "aam_3",
103
+ "95": "aam_4",
104
+ "96": "aam_5",
105
+ "97": "aam_6",
106
+ "98": "aan_1",
107
+ "99": "aan_2",
108
+ "100": "aan_3",
109
+ "101": "aan_4",
110
+ "102": "aan_5",
111
+ "103": "aan_6",
112
+ "104": "aang_1",
113
+ "105": "aang_2",
114
+ "106": "aang_3",
115
+ "107": "aang_4",
116
+ "108": "aang_5",
117
+ "109": "aang_6",
118
+ "110": "aap",
119
+ "111": "aap_2",
120
+ "112": "aap_3",
121
+ "113": "aap_6",
122
+ "114": "aat",
123
+ "115": "aat_1",
124
+ "116": "aat_2",
125
+ "117": "aat_3",
126
+ "118": "aat_6",
127
+ "119": "aau",
128
+ "120": "aau_1",
129
+ "121": "aau_2",
130
+ "122": "aau_3",
131
+ "123": "aau_4",
132
+ "124": "aau_5",
133
+ "125": "aau_6",
134
+ "126": "ai",
135
+ "127": "ai_1",
136
+ "128": "ai_2",
137
+ "129": "ai_3",
138
+ "130": "ai_4",
139
+ "131": "ai_5",
140
+ "132": "ai_6",
141
+ "133": "ak",
142
+ "134": "ak_1",
143
+ "135": "ak_6",
144
+ "136": "am",
145
+ "137": "am_1",
146
+ "138": "am_2",
147
+ "139": "am_3",
148
+ "140": "am_4",
149
+ "141": "am_6",
150
+ "142": "an_1",
151
+ "143": "an_2",
152
+ "144": "an_3",
153
+ "145": "an_4",
154
+ "146": "an_5",
155
+ "147": "an_6",
156
+ "148": "ang_1",
157
+ "149": "ang_2",
158
+ "150": "ang_3",
159
+ "151": "ang_4",
160
+ "152": "ang_6",
161
+ "153": "ap_1",
162
+ "154": "ap_2",
163
+ "155": "ap_6",
164
+ "156": "at_1",
165
+ "157": "at_2",
166
+ "158": "at_4",
167
+ "159": "at_6",
168
+ "160": "au",
169
+ "161": "au_1",
170
+ "162": "au_2",
171
+ "163": "au_3",
172
+ "164": "au_4",
173
+ "165": "au_5",
174
+ "166": "au_6",
175
+ "167": "b",
176
+ "168": "c",
177
+ "169": "d",
178
+ "170": "e_1",
179
+ "171": "e_2",
180
+ "172": "e_3",
181
+ "173": "e_4",
182
+ "174": "e_5",
183
+ "175": "e_6",
184
+ "176": "ei_1",
185
+ "177": "ei_2",
186
+ "178": "ei_3",
187
+ "179": "ei_4",
188
+ "180": "ei_5",
189
+ "181": "ei_6",
190
+ "182": "ek_1",
191
+ "183": "ek_3",
192
+ "184": "ek_6",
193
+ "185": "eng_1",
194
+ "186": "eng_2",
195
+ "187": "eng_3",
196
+ "188": "eng_4",
197
+ "189": "eng_5",
198
+ "190": "eng_6",
199
+ "191": "eoi_1",
200
+ "192": "eoi_2",
201
+ "193": "eoi_3",
202
+ "194": "eoi_4",
203
+ "195": "eoi_5",
204
+ "196": "eoi_6",
205
+ "197": "eon_1",
206
+ "198": "eon_2",
207
+ "199": "eon_3",
208
+ "200": "eon_4",
209
+ "201": "eon_5",
210
+ "202": "eon_6",
211
+ "203": "eot_1",
212
+ "204": "eot_2",
213
+ "205": "eot_6",
214
+ "206": "eu_6",
215
+ "207": "f",
216
+ "208": "g",
217
+ "209": "gw",
218
+ "210": "h",
219
+ "211": "i_1",
220
+ "212": "i_2",
221
+ "213": "i_3",
222
+ "214": "i_4",
223
+ "215": "i_5",
224
+ "216": "i_6",
225
+ "217": "ik_1",
226
+ "218": "ik_4",
227
+ "219": "ik_6",
228
+ "220": "im_1",
229
+ "221": "im_2",
230
+ "222": "im_3",
231
+ "223": "im_4",
232
+ "224": "im_5",
233
+ "225": "im_6",
234
+ "226": "in_1",
235
+ "227": "in_2",
236
+ "228": "in_3",
237
+ "229": "in_4",
238
+ "230": "in_5",
239
+ "231": "in_6",
240
+ "232": "ing_1",
241
+ "233": "ing_2",
242
+ "234": "ing_3",
243
+ "235": "ing_4",
244
+ "236": "ing_5",
245
+ "237": "ing_6",
246
+ "238": "ip_2",
247
+ "239": "ip_3",
248
+ "240": "ip_6",
249
+ "241": "it_1",
250
+ "242": "it_3",
251
+ "243": "it_6",
252
+ "244": "iu_1",
253
+ "245": "iu_2",
254
+ "246": "iu_3",
255
+ "247": "iu_4",
256
+ "248": "iu_5",
257
+ "249": "iu_6",
258
+ "250": "j",
259
+ "251": "k",
260
+ "252": "kw",
261
+ "253": "l",
262
+ "254": "m",
263
+ "255": "n",
264
+ "256": "ng",
265
+ "257": "o",
266
+ "258": "o_1",
267
+ "259": "o_2",
268
+ "260": "o_3",
269
+ "261": "o_4",
270
+ "262": "o_5",
271
+ "263": "o_6",
272
+ "264": "oe_1",
273
+ "265": "oe_2",
274
+ "266": "oe_3",
275
+ "267": "oek_2",
276
+ "268": "oek_3",
277
+ "269": "oek_4",
278
+ "270": "oek_6",
279
+ "271": "oeng_1",
280
+ "272": "oeng_2",
281
+ "273": "oeng_3",
282
+ "274": "oeng_4",
283
+ "275": "oeng_5",
284
+ "276": "oeng_6",
285
+ "277": "oi",
286
+ "278": "oi_1",
287
+ "279": "oi_2",
288
+ "280": "oi_3",
289
+ "281": "oi_4",
290
+ "282": "oi_5",
291
+ "283": "oi_6",
292
+ "284": "ok",
293
+ "285": "ok_1",
294
+ "286": "ok_2",
295
+ "287": "ok_3",
296
+ "288": "ok_6",
297
+ "289": "on",
298
+ "290": "on_1",
299
+ "291": "on_2",
300
+ "292": "on_3",
301
+ "293": "on_4",
302
+ "294": "on_6",
303
+ "295": "ong_1",
304
+ "296": "ong_2",
305
+ "297": "ong_3",
306
+ "298": "ong_4",
307
+ "299": "ong_5",
308
+ "300": "ong_6",
309
+ "301": "ot_3",
310
+ "302": "ou",
311
+ "303": "ou_1",
312
+ "304": "ou_2",
313
+ "305": "ou_3",
314
+ "306": "ou_4",
315
+ "307": "ou_5",
316
+ "308": "ou_6",
317
+ "309": "p",
318
+ "310": "s",
319
+ "311": "t",
320
+ "312": "u_1",
321
+ "313": "u_2",
322
+ "314": "u_3",
323
+ "315": "u_4",
324
+ "316": "u_5",
325
+ "317": "u_6",
326
+ "318": "ui_1",
327
+ "319": "ui_2",
328
+ "320": "ui_3",
329
+ "321": "ui_4",
330
+ "322": "ui_5",
331
+ "323": "ui_6",
332
+ "324": "uk",
333
+ "325": "uk_1",
334
+ "326": "uk_2",
335
+ "327": "uk_6",
336
+ "328": "un_1",
337
+ "329": "un_2",
338
+ "330": "un_3",
339
+ "331": "un_4",
340
+ "332": "un_5",
341
+ "333": "un_6",
342
+ "334": "ung",
343
+ "335": "ung_1",
344
+ "336": "ung_2",
345
+ "337": "ung_3",
346
+ "338": "ung_4",
347
+ "339": "ung_5",
348
+ "340": "ung_6",
349
+ "341": "ut_1",
350
+ "342": "ut_3",
351
+ "343": "ut_6",
352
+ "344": "w",
353
+ "345": "yu_1",
354
+ "346": "yu_2",
355
+ "347": "yu_3",
356
+ "348": "yu_4",
357
+ "349": "yu_5",
358
+ "350": "yu_6",
359
+ "351": "yun_1",
360
+ "352": "yun_2",
361
+ "353": "yun_3",
362
+ "354": "yun_4",
363
+ "355": "yun_5",
364
+ "356": "yun_6",
365
+ "357": "yut_1",
366
+ "358": "yut_2",
367
+ "359": "yut_3",
368
+ "360": "yut_4",
369
+ "361": "yut_6",
370
+ "362": "z"
371
+ },
372
+ "intermediate_size": 1024,
373
+ "label2id": {
374
+ "<pad>": 0,
375
+ "AA0": 2,
376
+ "AA1": 3,
377
+ "AA2": 4,
378
+ "AE0": 5,
379
+ "AE1": 6,
380
+ "AE2": 7,
381
+ "AH0": 8,
382
+ "AH1": 9,
383
+ "AH2": 10,
384
+ "AO0": 11,
385
+ "AO1": 12,
386
+ "AO2": 13,
387
+ "AW0": 14,
388
+ "AW1": 15,
389
+ "AW2": 16,
390
+ "AY0": 17,
391
+ "AY1": 18,
392
+ "AY2": 19,
393
+ "B": 20,
394
+ "CH": 21,
395
+ "D": 22,
396
+ "DH": 23,
397
+ "EH0": 24,
398
+ "EH1": 25,
399
+ "EH2": 26,
400
+ "ER0": 27,
401
+ "ER1": 28,
402
+ "ER2": 29,
403
+ "EY0": 30,
404
+ "EY1": 31,
405
+ "EY2": 32,
406
+ "F": 33,
407
+ "G": 34,
408
+ "HH": 35,
409
+ "IH0": 36,
410
+ "IH1": 37,
411
+ "IH2": 38,
412
+ "IY0": 39,
413
+ "IY1": 40,
414
+ "IY2": 41,
415
+ "JH": 42,
416
+ "K": 43,
417
+ "L": 44,
418
+ "M": 45,
419
+ "N": 46,
420
+ "NG": 47,
421
+ "Ng": 48,
422
+ "OW0": 49,
423
+ "OW1": 50,
424
+ "OW2": 51,
425
+ "OY0": 52,
426
+ "OY1": 53,
427
+ "OY2": 54,
428
+ "P": 55,
429
+ "R": 56,
430
+ "S": 57,
431
+ "SH": 58,
432
+ "T": 59,
433
+ "TH": 60,
434
+ "UH0": 61,
435
+ "UH1": 62,
436
+ "UH2": 63,
437
+ "UW0": 64,
438
+ "UW1": 65,
439
+ "UW2": 66,
440
+ "V": 67,
441
+ "W": 68,
442
+ "Y": 69,
443
+ "Z": 70,
444
+ "ZH": 71,
445
+ "aa": 72,
446
+ "aa_1": 73,
447
+ "aa_2": 74,
448
+ "aa_3": 75,
449
+ "aa_4": 76,
450
+ "aa_5": 77,
451
+ "aa_6": 78,
452
+ "aai": 79,
453
+ "aai_1": 80,
454
+ "aai_2": 81,
455
+ "aai_3": 82,
456
+ "aai_4": 83,
457
+ "aai_5": 84,
458
+ "aai_6": 85,
459
+ "aak": 86,
460
+ "aak_1": 87,
461
+ "aak_2": 88,
462
+ "aak_3": 89,
463
+ "aak_6": 90,
464
+ "aam": 91,
465
+ "aam_1": 92,
466
+ "aam_2": 93,
467
+ "aam_3": 94,
468
+ "aam_4": 95,
469
+ "aam_5": 96,
470
+ "aam_6": 97,
471
+ "aan_1": 98,
472
+ "aan_2": 99,
473
+ "aan_3": 100,
474
+ "aan_4": 101,
475
+ "aan_5": 102,
476
+ "aan_6": 103,
477
+ "aang_1": 104,
478
+ "aang_2": 105,
479
+ "aang_3": 106,
480
+ "aang_4": 107,
481
+ "aang_5": 108,
482
+ "aang_6": 109,
483
+ "aap": 110,
484
+ "aap_2": 111,
485
+ "aap_3": 112,
486
+ "aap_6": 113,
487
+ "aat": 114,
488
+ "aat_1": 115,
489
+ "aat_2": 116,
490
+ "aat_3": 117,
491
+ "aat_6": 118,
492
+ "aau": 119,
493
+ "aau_1": 120,
494
+ "aau_2": 121,
495
+ "aau_3": 122,
496
+ "aau_4": 123,
497
+ "aau_5": 124,
498
+ "aau_6": 125,
499
+ "ai": 126,
500
+ "ai_1": 127,
501
+ "ai_2": 128,
502
+ "ai_3": 129,
503
+ "ai_4": 130,
504
+ "ai_5": 131,
505
+ "ai_6": 132,
506
+ "ak": 133,
507
+ "ak_1": 134,
508
+ "ak_6": 135,
509
+ "am": 136,
510
+ "am_1": 137,
511
+ "am_2": 138,
512
+ "am_3": 139,
513
+ "am_4": 140,
514
+ "am_6": 141,
515
+ "an_1": 142,
516
+ "an_2": 143,
517
+ "an_3": 144,
518
+ "an_4": 145,
519
+ "an_5": 146,
520
+ "an_6": 147,
521
+ "ang_1": 148,
522
+ "ang_2": 149,
523
+ "ang_3": 150,
524
+ "ang_4": 151,
525
+ "ang_6": 152,
526
+ "ap_1": 153,
527
+ "ap_2": 154,
528
+ "ap_6": 155,
529
+ "at_1": 156,
530
+ "at_2": 157,
531
+ "at_4": 158,
532
+ "at_6": 159,
533
+ "au": 160,
534
+ "au_1": 161,
535
+ "au_2": 162,
536
+ "au_3": 163,
537
+ "au_4": 164,
538
+ "au_5": 165,
539
+ "au_6": 166,
540
+ "b": 167,
541
+ "c": 168,
542
+ "d": 169,
543
+ "e_1": 170,
544
+ "e_2": 171,
545
+ "e_3": 172,
546
+ "e_4": 173,
547
+ "e_5": 174,
548
+ "e_6": 175,
549
+ "ei_1": 176,
550
+ "ei_2": 177,
551
+ "ei_3": 178,
552
+ "ei_4": 179,
553
+ "ei_5": 180,
554
+ "ei_6": 181,
555
+ "ek_1": 182,
556
+ "ek_3": 183,
557
+ "ek_6": 184,
558
+ "eng_1": 185,
559
+ "eng_2": 186,
560
+ "eng_3": 187,
561
+ "eng_4": 188,
562
+ "eng_5": 189,
563
+ "eng_6": 190,
564
+ "eoi_1": 191,
565
+ "eoi_2": 192,
566
+ "eoi_3": 193,
567
+ "eoi_4": 194,
568
+ "eoi_5": 195,
569
+ "eoi_6": 196,
570
+ "eon_1": 197,
571
+ "eon_2": 198,
572
+ "eon_3": 199,
573
+ "eon_4": 200,
574
+ "eon_5": 201,
575
+ "eon_6": 202,
576
+ "eot_1": 203,
577
+ "eot_2": 204,
578
+ "eot_6": 205,
579
+ "eu_6": 206,
580
+ "f": 207,
581
+ "g": 208,
582
+ "gw": 209,
583
+ "h": 210,
584
+ "i_1": 211,
585
+ "i_2": 212,
586
+ "i_3": 213,
587
+ "i_4": 214,
588
+ "i_5": 215,
589
+ "i_6": 216,
590
+ "ik_1": 217,
591
+ "ik_4": 218,
592
+ "ik_6": 219,
593
+ "im_1": 220,
594
+ "im_2": 221,
595
+ "im_3": 222,
596
+ "im_4": 223,
597
+ "im_5": 224,
598
+ "im_6": 225,
599
+ "in_1": 226,
600
+ "in_2": 227,
601
+ "in_3": 228,
602
+ "in_4": 229,
603
+ "in_5": 230,
604
+ "in_6": 231,
605
+ "ing_1": 232,
606
+ "ing_2": 233,
607
+ "ing_3": 234,
608
+ "ing_4": 235,
609
+ "ing_5": 236,
610
+ "ing_6": 237,
611
+ "ip_2": 238,
612
+ "ip_3": 239,
613
+ "ip_6": 240,
614
+ "it_1": 241,
615
+ "it_3": 242,
616
+ "it_6": 243,
617
+ "iu_1": 244,
618
+ "iu_2": 245,
619
+ "iu_3": 246,
620
+ "iu_4": 247,
621
+ "iu_5": 248,
622
+ "iu_6": 249,
623
+ "j": 250,
624
+ "k": 251,
625
+ "kw": 252,
626
+ "l": 253,
627
+ "m": 254,
628
+ "n": 255,
629
+ "ng": 256,
630
+ "o": 257,
631
+ "o_1": 258,
632
+ "o_2": 259,
633
+ "o_3": 260,
634
+ "o_4": 261,
635
+ "o_5": 262,
636
+ "o_6": 263,
637
+ "oe_1": 264,
638
+ "oe_2": 265,
639
+ "oe_3": 266,
640
+ "oek_2": 267,
641
+ "oek_3": 268,
642
+ "oek_4": 269,
643
+ "oek_6": 270,
644
+ "oeng_1": 271,
645
+ "oeng_2": 272,
646
+ "oeng_3": 273,
647
+ "oeng_4": 274,
648
+ "oeng_5": 275,
649
+ "oeng_6": 276,
650
+ "oi": 277,
651
+ "oi_1": 278,
652
+ "oi_2": 279,
653
+ "oi_3": 280,
654
+ "oi_4": 281,
655
+ "oi_5": 282,
656
+ "oi_6": 283,
657
+ "ok": 284,
658
+ "ok_1": 285,
659
+ "ok_2": 286,
660
+ "ok_3": 287,
661
+ "ok_6": 288,
662
+ "on": 289,
663
+ "on_1": 290,
664
+ "on_2": 291,
665
+ "on_3": 292,
666
+ "on_4": 293,
667
+ "on_6": 294,
668
+ "ong_1": 295,
669
+ "ong_2": 296,
670
+ "ong_3": 297,
671
+ "ong_4": 298,
672
+ "ong_5": 299,
673
+ "ong_6": 300,
674
+ "ot_3": 301,
675
+ "ou": 302,
676
+ "ou_1": 303,
677
+ "ou_2": 304,
678
+ "ou_3": 305,
679
+ "ou_4": 306,
680
+ "ou_5": 307,
681
+ "ou_6": 308,
682
+ "p": 309,
683
+ "s": 310,
684
+ "t": 311,
685
+ "u_1": 312,
686
+ "u_2": 313,
687
+ "u_3": 314,
688
+ "u_4": 315,
689
+ "u_5": 316,
690
+ "u_6": 317,
691
+ "ui_1": 318,
692
+ "ui_2": 319,
693
+ "ui_3": 320,
694
+ "ui_4": 321,
695
+ "ui_5": 322,
696
+ "ui_6": 323,
697
+ "uk": 324,
698
+ "uk_1": 325,
699
+ "uk_2": 326,
700
+ "uk_6": 327,
701
+ "un_1": 328,
702
+ "un_2": 329,
703
+ "un_3": 330,
704
+ "un_4": 331,
705
+ "un_5": 332,
706
+ "un_6": 333,
707
+ "ung": 334,
708
+ "ung_1": 335,
709
+ "ung_2": 336,
710
+ "ung_3": 337,
711
+ "ung_4": 338,
712
+ "ung_5": 339,
713
+ "ung_6": 340,
714
+ "ut_1": 341,
715
+ "ut_3": 342,
716
+ "ut_6": 343,
717
+ "w": 344,
718
+ "yu_1": 345,
719
+ "yu_2": 346,
720
+ "yu_3": 347,
721
+ "yu_4": 348,
722
+ "yu_5": 349,
723
+ "yu_6": 350,
724
+ "yun_1": 351,
725
+ "yun_2": 352,
726
+ "yun_3": 353,
727
+ "yun_4": 354,
728
+ "yun_5": 355,
729
+ "yun_6": 356,
730
+ "yut_1": 357,
731
+ "yut_2": 358,
732
+ "yut_3": 359,
733
+ "yut_4": 360,
734
+ "yut_6": 361,
735
+ "z": 362,
736
+ "|": 1
737
+ },
738
+ "max_position_embeddings": 1024,
739
+ "num_attention_heads": 6,
740
+ "num_hidden_layers": 6,
741
+ "torch_dtype": "float32",
742
+ "transformers_version": "4.50.3",
743
+ "vocab_size": 6561
744
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47947574cf1d0161d5e51a17465e3c5d6500449f7c2d421d04b8f3c2d241a750
3
+ size 43783580
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:042e96fb4c58ed8717496a4ebd1ed1ae01d8a45911590edb9a789e74209b6148
3
+ size 87614778
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d613c42bc6269fa0dc958aa82cd8cd6dfce040534eaf9518e3193f2c7952dd0
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfd9c5c96ae49d06833c053e6af6f37491082e4ff8aad12c8e68cdd6d2e4aa79
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,1175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 14.0,
6
+ "eval_steps": 500,
7
+ "global_step": 14728,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09505703422053231,
14
+ "grad_norm": 0.9250678420066833,
15
+ "learning_rate": 3.95882818685669e-05,
16
+ "loss": 6.6663,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.19011406844106463,
21
+ "grad_norm": 1.117205262184143,
22
+ "learning_rate": 7.91765637371338e-05,
23
+ "loss": 4.145,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.28517110266159695,
28
+ "grad_norm": 1.3325448036193848,
29
+ "learning_rate": 0.00011876484560570071,
30
+ "loss": 3.8608,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.38022813688212925,
35
+ "grad_norm": 0.9613128900527954,
36
+ "learning_rate": 0.0001583531274742676,
37
+ "loss": 3.768,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4752851711026616,
42
+ "grad_norm": 1.1198444366455078,
43
+ "learning_rate": 0.00019794140934283454,
44
+ "loss": 3.7093,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5703422053231939,
49
+ "grad_norm": 1.0210366249084473,
50
+ "learning_rate": 0.00023752969121140142,
51
+ "loss": 3.6664,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6653992395437263,
56
+ "grad_norm": 0.9687440395355225,
57
+ "learning_rate": 0.00027711797307996834,
58
+ "loss": 3.5409,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.7604562737642585,
63
+ "grad_norm": 1.4981633424758911,
64
+ "learning_rate": 0.0003167062549485352,
65
+ "loss": 3.3992,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8555133079847909,
70
+ "grad_norm": 0.8627603650093079,
71
+ "learning_rate": 0.00035629453681710216,
72
+ "loss": 3.1727,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9505703422053232,
77
+ "grad_norm": 0.9925593733787537,
78
+ "learning_rate": 0.0003958828186856691,
79
+ "loss": 2.9724,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_loss": 2.6960370540618896,
85
+ "eval_runtime": 3.7345,
86
+ "eval_samples_per_second": 1897.152,
87
+ "eval_steps_per_second": 118.622,
88
+ "step": 1052
89
+ },
90
+ {
91
+ "epoch": 1.0456273764258555,
92
+ "grad_norm": 0.9267619848251343,
93
+ "learning_rate": 0.00043547110055423594,
94
+ "loss": 2.7022,
95
+ "step": 1100
96
+ },
97
+ {
98
+ "epoch": 1.1406844106463878,
99
+ "grad_norm": 0.7666485905647278,
100
+ "learning_rate": 0.00047505938242280285,
101
+ "loss": 2.5641,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.2357414448669202,
106
+ "grad_norm": 0.5969619154930115,
107
+ "learning_rate": 0.0004990645699549983,
108
+ "loss": 2.5058,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.3307984790874525,
113
+ "grad_norm": 0.7782655358314514,
114
+ "learning_rate": 0.0004965363806441826,
115
+ "loss": 2.4665,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.4258555133079849,
120
+ "grad_norm": 0.8928040266036987,
121
+ "learning_rate": 0.000494008191333367,
122
+ "loss": 2.4311,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.5209125475285172,
127
+ "grad_norm": 0.8687949180603027,
128
+ "learning_rate": 0.0004914800020225515,
129
+ "loss": 2.3964,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.6159695817490496,
134
+ "grad_norm": 0.6245518922805786,
135
+ "learning_rate": 0.0004889518127117359,
136
+ "loss": 2.374,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.7110266159695817,
141
+ "grad_norm": 0.6903976202011108,
142
+ "learning_rate": 0.0004864236234009203,
143
+ "loss": 2.3606,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.806083650190114,
148
+ "grad_norm": 0.8996257781982422,
149
+ "learning_rate": 0.00048389543409010466,
150
+ "loss": 2.3376,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.9011406844106464,
155
+ "grad_norm": 0.734466016292572,
156
+ "learning_rate": 0.0004813672447792891,
157
+ "loss": 2.3226,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.9961977186311786,
162
+ "grad_norm": 0.6836825013160706,
163
+ "learning_rate": 0.0004788390554684735,
164
+ "loss": 2.3108,
165
+ "step": 2100
166
+ },
167
+ {
168
+ "epoch": 2.0,
169
+ "eval_loss": 2.285733461380005,
170
+ "eval_runtime": 3.623,
171
+ "eval_samples_per_second": 1955.579,
172
+ "eval_steps_per_second": 122.275,
173
+ "step": 2104
174
+ },
175
+ {
176
+ "epoch": 2.091254752851711,
177
+ "grad_norm": 0.5974160432815552,
178
+ "learning_rate": 0.0004763108661576579,
179
+ "loss": 2.2585,
180
+ "step": 2200
181
+ },
182
+ {
183
+ "epoch": 2.1863117870722433,
184
+ "grad_norm": 0.788093626499176,
185
+ "learning_rate": 0.0004737826768468423,
186
+ "loss": 2.264,
187
+ "step": 2300
188
+ },
189
+ {
190
+ "epoch": 2.2813688212927756,
191
+ "grad_norm": 0.7451100945472717,
192
+ "learning_rate": 0.00047125448753602674,
193
+ "loss": 2.2504,
194
+ "step": 2400
195
+ },
196
+ {
197
+ "epoch": 2.376425855513308,
198
+ "grad_norm": 0.6724629998207092,
199
+ "learning_rate": 0.0004687262982252111,
200
+ "loss": 2.2358,
201
+ "step": 2500
202
+ },
203
+ {
204
+ "epoch": 2.4714828897338403,
205
+ "grad_norm": 0.6606141924858093,
206
+ "learning_rate": 0.00046619810891439554,
207
+ "loss": 2.2301,
208
+ "step": 2600
209
+ },
210
+ {
211
+ "epoch": 2.5665399239543727,
212
+ "grad_norm": 0.6599621772766113,
213
+ "learning_rate": 0.0004636699196035799,
214
+ "loss": 2.2268,
215
+ "step": 2700
216
+ },
217
+ {
218
+ "epoch": 2.661596958174905,
219
+ "grad_norm": 0.6633493304252625,
220
+ "learning_rate": 0.00046114173029276434,
221
+ "loss": 2.2247,
222
+ "step": 2800
223
+ },
224
+ {
225
+ "epoch": 2.7566539923954374,
226
+ "grad_norm": 0.6308265328407288,
227
+ "learning_rate": 0.00045861354098194877,
228
+ "loss": 2.2221,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 2.8517110266159698,
233
+ "grad_norm": 0.6383451223373413,
234
+ "learning_rate": 0.00045608535167113314,
235
+ "loss": 2.2274,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 2.9467680608365017,
240
+ "grad_norm": 0.61512291431427,
241
+ "learning_rate": 0.00045355716236031757,
242
+ "loss": 2.2067,
243
+ "step": 3100
244
+ },
245
+ {
246
+ "epoch": 3.0,
247
+ "eval_loss": 2.2008087635040283,
248
+ "eval_runtime": 3.5613,
249
+ "eval_samples_per_second": 1989.445,
250
+ "eval_steps_per_second": 124.393,
251
+ "step": 3156
252
+ },
253
+ {
254
+ "epoch": 3.041825095057034,
255
+ "grad_norm": 0.7461186647415161,
256
+ "learning_rate": 0.00045102897304950194,
257
+ "loss": 2.1882,
258
+ "step": 3200
259
+ },
260
+ {
261
+ "epoch": 3.1368821292775664,
262
+ "grad_norm": 0.6590662598609924,
263
+ "learning_rate": 0.00044850078373868637,
264
+ "loss": 2.1662,
265
+ "step": 3300
266
+ },
267
+ {
268
+ "epoch": 3.2319391634980987,
269
+ "grad_norm": 0.5832785964012146,
270
+ "learning_rate": 0.00044597259442787074,
271
+ "loss": 2.1603,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 3.326996197718631,
276
+ "grad_norm": 0.6356543898582458,
277
+ "learning_rate": 0.00044344440511705517,
278
+ "loss": 2.1601,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 3.4220532319391634,
283
+ "grad_norm": 0.7197031378746033,
284
+ "learning_rate": 0.0004409162158062396,
285
+ "loss": 2.1567,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 3.517110266159696,
290
+ "grad_norm": 0.5856086611747742,
291
+ "learning_rate": 0.00043838802649542397,
292
+ "loss": 2.1588,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 3.612167300380228,
297
+ "grad_norm": 0.6212655305862427,
298
+ "learning_rate": 0.00043585983718460834,
299
+ "loss": 2.1565,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 3.7072243346007605,
304
+ "grad_norm": 0.6765671968460083,
305
+ "learning_rate": 0.0004333316478737928,
306
+ "loss": 2.1667,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 3.802281368821293,
311
+ "grad_norm": 0.6720090508460999,
312
+ "learning_rate": 0.0004308034585629772,
313
+ "loss": 2.1675,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 3.897338403041825,
318
+ "grad_norm": 0.7150991559028625,
319
+ "learning_rate": 0.00042827526925216157,
320
+ "loss": 2.1474,
321
+ "step": 4100
322
+ },
323
+ {
324
+ "epoch": 3.9923954372623576,
325
+ "grad_norm": 0.5831249356269836,
326
+ "learning_rate": 0.00042574707994134605,
327
+ "loss": 2.1485,
328
+ "step": 4200
329
+ },
330
+ {
331
+ "epoch": 4.0,
332
+ "eval_loss": 2.15364408493042,
333
+ "eval_runtime": 3.644,
334
+ "eval_samples_per_second": 1944.292,
335
+ "eval_steps_per_second": 121.57,
336
+ "step": 4208
337
+ },
338
+ {
339
+ "epoch": 4.08745247148289,
340
+ "grad_norm": 0.6653150916099548,
341
+ "learning_rate": 0.0004232188906305304,
342
+ "loss": 2.0899,
343
+ "step": 4300
344
+ },
345
+ {
346
+ "epoch": 4.182509505703422,
347
+ "grad_norm": 0.7235066294670105,
348
+ "learning_rate": 0.0004206907013197148,
349
+ "loss": 2.0982,
350
+ "step": 4400
351
+ },
352
+ {
353
+ "epoch": 4.277566539923955,
354
+ "grad_norm": 0.7326545715332031,
355
+ "learning_rate": 0.0004181625120088992,
356
+ "loss": 2.1007,
357
+ "step": 4500
358
+ },
359
+ {
360
+ "epoch": 4.3726235741444865,
361
+ "grad_norm": 0.6236776113510132,
362
+ "learning_rate": 0.00041563432269808365,
363
+ "loss": 2.1031,
364
+ "step": 4600
365
+ },
366
+ {
367
+ "epoch": 4.467680608365019,
368
+ "grad_norm": 0.5669475197792053,
369
+ "learning_rate": 0.000413106133387268,
370
+ "loss": 2.1087,
371
+ "step": 4700
372
+ },
373
+ {
374
+ "epoch": 4.562737642585551,
375
+ "grad_norm": 0.5483006834983826,
376
+ "learning_rate": 0.00041057794407645245,
377
+ "loss": 2.1034,
378
+ "step": 4800
379
+ },
380
+ {
381
+ "epoch": 4.657794676806084,
382
+ "grad_norm": 0.5456926822662354,
383
+ "learning_rate": 0.0004080497547656369,
384
+ "loss": 2.1065,
385
+ "step": 4900
386
+ },
387
+ {
388
+ "epoch": 4.752851711026616,
389
+ "grad_norm": 0.9545803666114807,
390
+ "learning_rate": 0.00040552156545482125,
391
+ "loss": 2.1168,
392
+ "step": 5000
393
+ },
394
+ {
395
+ "epoch": 4.847908745247148,
396
+ "grad_norm": 0.5378767251968384,
397
+ "learning_rate": 0.0004029933761440057,
398
+ "loss": 2.1107,
399
+ "step": 5100
400
+ },
401
+ {
402
+ "epoch": 4.942965779467681,
403
+ "grad_norm": 0.629880964756012,
404
+ "learning_rate": 0.00040046518683319005,
405
+ "loss": 2.0983,
406
+ "step": 5200
407
+ },
408
+ {
409
+ "epoch": 5.0,
410
+ "eval_loss": 2.132718801498413,
411
+ "eval_runtime": 3.6373,
412
+ "eval_samples_per_second": 1947.857,
413
+ "eval_steps_per_second": 121.793,
414
+ "step": 5260
415
+ },
416
+ {
417
+ "epoch": 5.038022813688213,
418
+ "grad_norm": 0.5900342464447021,
419
+ "learning_rate": 0.0003979369975223745,
420
+ "loss": 2.0758,
421
+ "step": 5300
422
+ },
423
+ {
424
+ "epoch": 5.133079847908745,
425
+ "grad_norm": 0.6181082129478455,
426
+ "learning_rate": 0.0003954088082115589,
427
+ "loss": 2.041,
428
+ "step": 5400
429
+ },
430
+ {
431
+ "epoch": 5.228136882129277,
432
+ "grad_norm": 0.6756412386894226,
433
+ "learning_rate": 0.0003928806189007433,
434
+ "loss": 2.0548,
435
+ "step": 5500
436
+ },
437
+ {
438
+ "epoch": 5.32319391634981,
439
+ "grad_norm": 0.6649320125579834,
440
+ "learning_rate": 0.0003903524295899277,
441
+ "loss": 2.0438,
442
+ "step": 5600
443
+ },
444
+ {
445
+ "epoch": 5.418250950570342,
446
+ "grad_norm": 0.5628513693809509,
447
+ "learning_rate": 0.00038782424027911214,
448
+ "loss": 2.0485,
449
+ "step": 5700
450
+ },
451
+ {
452
+ "epoch": 5.513307984790875,
453
+ "grad_norm": 0.6923677921295166,
454
+ "learning_rate": 0.0003852960509682965,
455
+ "loss": 2.063,
456
+ "step": 5800
457
+ },
458
+ {
459
+ "epoch": 5.608365019011407,
460
+ "grad_norm": 0.6819363236427307,
461
+ "learning_rate": 0.0003827678616574809,
462
+ "loss": 2.0618,
463
+ "step": 5900
464
+ },
465
+ {
466
+ "epoch": 5.7034220532319395,
467
+ "grad_norm": 0.6446284055709839,
468
+ "learning_rate": 0.00038023967234666537,
469
+ "loss": 2.0674,
470
+ "step": 6000
471
+ },
472
+ {
473
+ "epoch": 5.798479087452471,
474
+ "grad_norm": 0.6319680213928223,
475
+ "learning_rate": 0.00037771148303584974,
476
+ "loss": 2.061,
477
+ "step": 6100
478
+ },
479
+ {
480
+ "epoch": 5.893536121673003,
481
+ "grad_norm": 0.6318814754486084,
482
+ "learning_rate": 0.0003751832937250341,
483
+ "loss": 2.0656,
484
+ "step": 6200
485
+ },
486
+ {
487
+ "epoch": 5.988593155893536,
488
+ "grad_norm": 0.6261875033378601,
489
+ "learning_rate": 0.0003726551044142186,
490
+ "loss": 2.0663,
491
+ "step": 6300
492
+ },
493
+ {
494
+ "epoch": 6.0,
495
+ "eval_loss": 2.1098814010620117,
496
+ "eval_runtime": 3.698,
497
+ "eval_samples_per_second": 1915.889,
498
+ "eval_steps_per_second": 119.794,
499
+ "step": 6312
500
+ },
501
+ {
502
+ "epoch": 6.083650190114068,
503
+ "grad_norm": 0.6620230674743652,
504
+ "learning_rate": 0.00037012691510340297,
505
+ "loss": 1.9996,
506
+ "step": 6400
507
+ },
508
+ {
509
+ "epoch": 6.178707224334601,
510
+ "grad_norm": 1.0794607400894165,
511
+ "learning_rate": 0.00036759872579258734,
512
+ "loss": 2.0018,
513
+ "step": 6500
514
+ },
515
+ {
516
+ "epoch": 6.273764258555133,
517
+ "grad_norm": 1.372861385345459,
518
+ "learning_rate": 0.00036507053648177177,
519
+ "loss": 2.0059,
520
+ "step": 6600
521
+ },
522
+ {
523
+ "epoch": 6.3688212927756656,
524
+ "grad_norm": 0.5926664471626282,
525
+ "learning_rate": 0.0003625423471709562,
526
+ "loss": 2.012,
527
+ "step": 6700
528
+ },
529
+ {
530
+ "epoch": 6.4638783269961975,
531
+ "grad_norm": 0.7855852246284485,
532
+ "learning_rate": 0.00036001415786014057,
533
+ "loss": 2.0128,
534
+ "step": 6800
535
+ },
536
+ {
537
+ "epoch": 6.55893536121673,
538
+ "grad_norm": 0.6684075593948364,
539
+ "learning_rate": 0.000357485968549325,
540
+ "loss": 2.0221,
541
+ "step": 6900
542
+ },
543
+ {
544
+ "epoch": 6.653992395437262,
545
+ "grad_norm": 0.628013014793396,
546
+ "learning_rate": 0.00035495777923850937,
547
+ "loss": 2.0159,
548
+ "step": 7000
549
+ },
550
+ {
551
+ "epoch": 6.749049429657795,
552
+ "grad_norm": 0.7943947911262512,
553
+ "learning_rate": 0.0003524295899276938,
554
+ "loss": 2.0223,
555
+ "step": 7100
556
+ },
557
+ {
558
+ "epoch": 6.844106463878327,
559
+ "grad_norm": 0.645799994468689,
560
+ "learning_rate": 0.0003499014006168782,
561
+ "loss": 2.0206,
562
+ "step": 7200
563
+ },
564
+ {
565
+ "epoch": 6.93916349809886,
566
+ "grad_norm": 0.6603648066520691,
567
+ "learning_rate": 0.0003473732113060626,
568
+ "loss": 2.0304,
569
+ "step": 7300
570
+ },
571
+ {
572
+ "epoch": 7.0,
573
+ "eval_loss": 2.099062919616699,
574
+ "eval_runtime": 3.631,
575
+ "eval_samples_per_second": 1951.251,
576
+ "eval_steps_per_second": 122.005,
577
+ "step": 7364
578
+ },
579
+ {
580
+ "epoch": 7.034220532319392,
581
+ "grad_norm": 0.6082973480224609,
582
+ "learning_rate": 0.000344845021995247,
583
+ "loss": 2.0039,
584
+ "step": 7400
585
+ },
586
+ {
587
+ "epoch": 7.129277566539924,
588
+ "grad_norm": 0.673995852470398,
589
+ "learning_rate": 0.0003423168326844314,
590
+ "loss": 1.9663,
591
+ "step": 7500
592
+ },
593
+ {
594
+ "epoch": 7.224334600760456,
595
+ "grad_norm": 0.675037682056427,
596
+ "learning_rate": 0.0003397886433736158,
597
+ "loss": 1.9696,
598
+ "step": 7600
599
+ },
600
+ {
601
+ "epoch": 7.319391634980988,
602
+ "grad_norm": 0.6488978266716003,
603
+ "learning_rate": 0.0003372604540628002,
604
+ "loss": 1.9701,
605
+ "step": 7700
606
+ },
607
+ {
608
+ "epoch": 7.414448669201521,
609
+ "grad_norm": 0.8255399465560913,
610
+ "learning_rate": 0.0003347322647519846,
611
+ "loss": 1.9654,
612
+ "step": 7800
613
+ },
614
+ {
615
+ "epoch": 7.509505703422053,
616
+ "grad_norm": 1.2661654949188232,
617
+ "learning_rate": 0.00033220407544116905,
618
+ "loss": 1.9736,
619
+ "step": 7900
620
+ },
621
+ {
622
+ "epoch": 7.604562737642586,
623
+ "grad_norm": 0.6545805335044861,
624
+ "learning_rate": 0.0003296758861303534,
625
+ "loss": 1.9783,
626
+ "step": 8000
627
+ },
628
+ {
629
+ "epoch": 7.699619771863118,
630
+ "grad_norm": 0.8890361189842224,
631
+ "learning_rate": 0.00032714769681953785,
632
+ "loss": 1.9807,
633
+ "step": 8100
634
+ },
635
+ {
636
+ "epoch": 7.79467680608365,
637
+ "grad_norm": 0.6547899842262268,
638
+ "learning_rate": 0.0003246195075087223,
639
+ "loss": 1.9723,
640
+ "step": 8200
641
+ },
642
+ {
643
+ "epoch": 7.889733840304182,
644
+ "grad_norm": 1.1239402294158936,
645
+ "learning_rate": 0.00032209131819790665,
646
+ "loss": 1.9734,
647
+ "step": 8300
648
+ },
649
+ {
650
+ "epoch": 7.984790874524715,
651
+ "grad_norm": 0.6624830961227417,
652
+ "learning_rate": 0.000319563128887091,
653
+ "loss": 1.9869,
654
+ "step": 8400
655
+ },
656
+ {
657
+ "epoch": 8.0,
658
+ "eval_loss": 2.1034328937530518,
659
+ "eval_runtime": 3.6013,
660
+ "eval_samples_per_second": 1967.337,
661
+ "eval_steps_per_second": 123.011,
662
+ "step": 8416
663
+ },
664
+ {
665
+ "epoch": 8.079847908745247,
666
+ "grad_norm": 0.6550971269607544,
667
+ "learning_rate": 0.0003170349395762755,
668
+ "loss": 1.9223,
669
+ "step": 8500
670
+ },
671
+ {
672
+ "epoch": 8.17490494296578,
673
+ "grad_norm": 0.660987138748169,
674
+ "learning_rate": 0.0003145067502654599,
675
+ "loss": 1.9245,
676
+ "step": 8600
677
+ },
678
+ {
679
+ "epoch": 8.269961977186313,
680
+ "grad_norm": 0.759884774684906,
681
+ "learning_rate": 0.00031197856095464425,
682
+ "loss": 1.9235,
683
+ "step": 8700
684
+ },
685
+ {
686
+ "epoch": 8.365019011406844,
687
+ "grad_norm": 0.9319919347763062,
688
+ "learning_rate": 0.00030945037164382874,
689
+ "loss": 1.9239,
690
+ "step": 8800
691
+ },
692
+ {
693
+ "epoch": 8.460076045627376,
694
+ "grad_norm": 0.6610597968101501,
695
+ "learning_rate": 0.0003069221823330131,
696
+ "loss": 1.928,
697
+ "step": 8900
698
+ },
699
+ {
700
+ "epoch": 8.55513307984791,
701
+ "grad_norm": 0.7076143622398376,
702
+ "learning_rate": 0.0003043939930221975,
703
+ "loss": 1.9289,
704
+ "step": 9000
705
+ },
706
+ {
707
+ "epoch": 8.65019011406844,
708
+ "grad_norm": 0.6368849873542786,
709
+ "learning_rate": 0.0003018658037113819,
710
+ "loss": 1.932,
711
+ "step": 9100
712
+ },
713
+ {
714
+ "epoch": 8.745247148288973,
715
+ "grad_norm": 0.7639185786247253,
716
+ "learning_rate": 0.00029933761440056634,
717
+ "loss": 1.9485,
718
+ "step": 9200
719
+ },
720
+ {
721
+ "epoch": 8.840304182509506,
722
+ "grad_norm": 1.0823330879211426,
723
+ "learning_rate": 0.0002968094250897507,
724
+ "loss": 1.9447,
725
+ "step": 9300
726
+ },
727
+ {
728
+ "epoch": 8.935361216730039,
729
+ "grad_norm": 0.8542035222053528,
730
+ "learning_rate": 0.00029428123577893514,
731
+ "loss": 1.942,
732
+ "step": 9400
733
+ },
734
+ {
735
+ "epoch": 9.0,
736
+ "eval_loss": 2.0947535037994385,
737
+ "eval_runtime": 3.6147,
738
+ "eval_samples_per_second": 1960.063,
739
+ "eval_steps_per_second": 122.556,
740
+ "step": 9468
741
+ },
742
+ {
743
+ "epoch": 9.03041825095057,
744
+ "grad_norm": 0.7601971626281738,
745
+ "learning_rate": 0.00029175304646811956,
746
+ "loss": 1.9243,
747
+ "step": 9500
748
+ },
749
+ {
750
+ "epoch": 9.125475285171103,
751
+ "grad_norm": 0.7461040019989014,
752
+ "learning_rate": 0.00028922485715730394,
753
+ "loss": 1.8704,
754
+ "step": 9600
755
+ },
756
+ {
757
+ "epoch": 9.220532319391635,
758
+ "grad_norm": 0.7719326019287109,
759
+ "learning_rate": 0.00028669666784648836,
760
+ "loss": 1.8832,
761
+ "step": 9700
762
+ },
763
+ {
764
+ "epoch": 9.315589353612168,
765
+ "grad_norm": 0.716136634349823,
766
+ "learning_rate": 0.00028416847853567274,
767
+ "loss": 1.8787,
768
+ "step": 9800
769
+ },
770
+ {
771
+ "epoch": 9.4106463878327,
772
+ "grad_norm": 0.6928532123565674,
773
+ "learning_rate": 0.00028164028922485717,
774
+ "loss": 1.8855,
775
+ "step": 9900
776
+ },
777
+ {
778
+ "epoch": 9.505703422053232,
779
+ "grad_norm": 0.7696681618690491,
780
+ "learning_rate": 0.0002791120999140416,
781
+ "loss": 1.8855,
782
+ "step": 10000
783
+ },
784
+ {
785
+ "epoch": 9.600760456273765,
786
+ "grad_norm": 0.8969391584396362,
787
+ "learning_rate": 0.00027658391060322597,
788
+ "loss": 1.9034,
789
+ "step": 10100
790
+ },
791
+ {
792
+ "epoch": 9.695817490494296,
793
+ "grad_norm": 0.8469530940055847,
794
+ "learning_rate": 0.00027405572129241034,
795
+ "loss": 1.8965,
796
+ "step": 10200
797
+ },
798
+ {
799
+ "epoch": 9.790874524714829,
800
+ "grad_norm": 0.7956866025924683,
801
+ "learning_rate": 0.0002715275319815948,
802
+ "loss": 1.9087,
803
+ "step": 10300
804
+ },
805
+ {
806
+ "epoch": 9.885931558935361,
807
+ "grad_norm": 0.8293343782424927,
808
+ "learning_rate": 0.0002689993426707792,
809
+ "loss": 1.9177,
810
+ "step": 10400
811
+ },
812
+ {
813
+ "epoch": 9.980988593155894,
814
+ "grad_norm": 0.7472631931304932,
815
+ "learning_rate": 0.00026647115335996357,
816
+ "loss": 1.9082,
817
+ "step": 10500
818
+ },
819
+ {
820
+ "epoch": 10.0,
821
+ "eval_loss": 2.097904920578003,
822
+ "eval_runtime": 3.5592,
823
+ "eval_samples_per_second": 1990.641,
824
+ "eval_steps_per_second": 124.468,
825
+ "step": 10520
826
+ },
827
+ {
828
+ "epoch": 10.076045627376425,
829
+ "grad_norm": 0.7787309288978577,
830
+ "learning_rate": 0.00026394296404914805,
831
+ "loss": 1.8393,
832
+ "step": 10600
833
+ },
834
+ {
835
+ "epoch": 10.171102661596958,
836
+ "grad_norm": 1.3328174352645874,
837
+ "learning_rate": 0.0002614147747383324,
838
+ "loss": 1.8283,
839
+ "step": 10700
840
+ },
841
+ {
842
+ "epoch": 10.26615969581749,
843
+ "grad_norm": 0.7740694284439087,
844
+ "learning_rate": 0.0002588865854275168,
845
+ "loss": 1.8422,
846
+ "step": 10800
847
+ },
848
+ {
849
+ "epoch": 10.361216730038024,
850
+ "grad_norm": 0.828940749168396,
851
+ "learning_rate": 0.0002563583961167012,
852
+ "loss": 1.8516,
853
+ "step": 10900
854
+ },
855
+ {
856
+ "epoch": 10.456273764258555,
857
+ "grad_norm": 0.751752495765686,
858
+ "learning_rate": 0.00025383020680588565,
859
+ "loss": 1.8624,
860
+ "step": 11000
861
+ },
862
+ {
863
+ "epoch": 10.551330798479087,
864
+ "grad_norm": 0.9940192103385925,
865
+ "learning_rate": 0.00025130201749507,
866
+ "loss": 1.8599,
867
+ "step": 11100
868
+ },
869
+ {
870
+ "epoch": 10.64638783269962,
871
+ "grad_norm": 0.8591569066047668,
872
+ "learning_rate": 0.00024877382818425445,
873
+ "loss": 1.8581,
874
+ "step": 11200
875
+ },
876
+ {
877
+ "epoch": 10.741444866920151,
878
+ "grad_norm": 0.7676281332969666,
879
+ "learning_rate": 0.0002462456388734388,
880
+ "loss": 1.8637,
881
+ "step": 11300
882
+ },
883
+ {
884
+ "epoch": 10.836501901140684,
885
+ "grad_norm": 0.7896871566772461,
886
+ "learning_rate": 0.00024371744956262325,
887
+ "loss": 1.8606,
888
+ "step": 11400
889
+ },
890
+ {
891
+ "epoch": 10.931558935361217,
892
+ "grad_norm": 0.8302274942398071,
893
+ "learning_rate": 0.00024118926025180765,
894
+ "loss": 1.8656,
895
+ "step": 11500
896
+ },
897
+ {
898
+ "epoch": 11.0,
899
+ "eval_loss": 2.0961618423461914,
900
+ "eval_runtime": 3.6362,
901
+ "eval_samples_per_second": 1948.473,
902
+ "eval_steps_per_second": 121.831,
903
+ "step": 11572
904
+ },
905
+ {
906
+ "epoch": 11.02661596958175,
907
+ "grad_norm": 0.8891871571540833,
908
+ "learning_rate": 0.00023866107094099208,
909
+ "loss": 1.8522,
910
+ "step": 11600
911
+ },
912
+ {
913
+ "epoch": 11.12167300380228,
914
+ "grad_norm": 0.7549653649330139,
915
+ "learning_rate": 0.00023613288163017645,
916
+ "loss": 1.7913,
917
+ "step": 11700
918
+ },
919
+ {
920
+ "epoch": 11.216730038022813,
921
+ "grad_norm": 0.8127674460411072,
922
+ "learning_rate": 0.00023360469231936088,
923
+ "loss": 1.8102,
924
+ "step": 11800
925
+ },
926
+ {
927
+ "epoch": 11.311787072243346,
928
+ "grad_norm": 0.841659426689148,
929
+ "learning_rate": 0.0002310765030085453,
930
+ "loss": 1.803,
931
+ "step": 11900
932
+ },
933
+ {
934
+ "epoch": 11.406844106463879,
935
+ "grad_norm": 0.8460645079612732,
936
+ "learning_rate": 0.00022854831369772968,
937
+ "loss": 1.8201,
938
+ "step": 12000
939
+ },
940
+ {
941
+ "epoch": 11.50190114068441,
942
+ "grad_norm": 0.7932580709457397,
943
+ "learning_rate": 0.0002260201243869141,
944
+ "loss": 1.811,
945
+ "step": 12100
946
+ },
947
+ {
948
+ "epoch": 11.596958174904943,
949
+ "grad_norm": 0.8419378399848938,
950
+ "learning_rate": 0.0002234919350760985,
951
+ "loss": 1.8145,
952
+ "step": 12200
953
+ },
954
+ {
955
+ "epoch": 11.692015209125476,
956
+ "grad_norm": 0.8346748352050781,
957
+ "learning_rate": 0.0002209637457652829,
958
+ "loss": 1.8328,
959
+ "step": 12300
960
+ },
961
+ {
962
+ "epoch": 11.787072243346007,
963
+ "grad_norm": 1.019510269165039,
964
+ "learning_rate": 0.0002184355564544673,
965
+ "loss": 1.8257,
966
+ "step": 12400
967
+ },
968
+ {
969
+ "epoch": 11.88212927756654,
970
+ "grad_norm": 0.8175719976425171,
971
+ "learning_rate": 0.00021590736714365173,
972
+ "loss": 1.8274,
973
+ "step": 12500
974
+ },
975
+ {
976
+ "epoch": 11.977186311787072,
977
+ "grad_norm": 0.7476153373718262,
978
+ "learning_rate": 0.00021337917783283614,
979
+ "loss": 1.8361,
980
+ "step": 12600
981
+ },
982
+ {
983
+ "epoch": 12.0,
984
+ "eval_loss": 2.1029505729675293,
985
+ "eval_runtime": 3.5932,
986
+ "eval_samples_per_second": 1971.782,
987
+ "eval_steps_per_second": 123.289,
988
+ "step": 12624
989
+ },
990
+ {
991
+ "epoch": 12.072243346007605,
992
+ "grad_norm": 0.8637651205062866,
993
+ "learning_rate": 0.00021085098852202054,
994
+ "loss": 1.7684,
995
+ "step": 12700
996
+ },
997
+ {
998
+ "epoch": 12.167300380228136,
999
+ "grad_norm": 0.80800461769104,
1000
+ "learning_rate": 0.00020832279921120496,
1001
+ "loss": 1.7703,
1002
+ "step": 12800
1003
+ },
1004
+ {
1005
+ "epoch": 12.262357414448669,
1006
+ "grad_norm": 1.0111021995544434,
1007
+ "learning_rate": 0.00020579460990038934,
1008
+ "loss": 1.7809,
1009
+ "step": 12900
1010
+ },
1011
+ {
1012
+ "epoch": 12.357414448669202,
1013
+ "grad_norm": 0.8477798700332642,
1014
+ "learning_rate": 0.00020326642058957376,
1015
+ "loss": 1.7795,
1016
+ "step": 13000
1017
+ },
1018
+ {
1019
+ "epoch": 12.452471482889734,
1020
+ "grad_norm": 0.8284028172492981,
1021
+ "learning_rate": 0.00020073823127875814,
1022
+ "loss": 1.7803,
1023
+ "step": 13100
1024
+ },
1025
+ {
1026
+ "epoch": 12.547528517110266,
1027
+ "grad_norm": 0.7752136588096619,
1028
+ "learning_rate": 0.00019821004196794256,
1029
+ "loss": 1.7836,
1030
+ "step": 13200
1031
+ },
1032
+ {
1033
+ "epoch": 12.642585551330798,
1034
+ "grad_norm": 0.8929184675216675,
1035
+ "learning_rate": 0.00019568185265712696,
1036
+ "loss": 1.7724,
1037
+ "step": 13300
1038
+ },
1039
+ {
1040
+ "epoch": 12.737642585551331,
1041
+ "grad_norm": 0.8475900888442993,
1042
+ "learning_rate": 0.00019315366334631136,
1043
+ "loss": 1.7891,
1044
+ "step": 13400
1045
+ },
1046
+ {
1047
+ "epoch": 12.832699619771864,
1048
+ "grad_norm": 0.9029939770698547,
1049
+ "learning_rate": 0.0001906254740354958,
1050
+ "loss": 1.7888,
1051
+ "step": 13500
1052
+ },
1053
+ {
1054
+ "epoch": 12.927756653992395,
1055
+ "grad_norm": 0.841206967830658,
1056
+ "learning_rate": 0.0001880972847246802,
1057
+ "loss": 1.8005,
1058
+ "step": 13600
1059
+ },
1060
+ {
1061
+ "epoch": 13.0,
1062
+ "eval_loss": 2.1176211833953857,
1063
+ "eval_runtime": 3.6226,
1064
+ "eval_samples_per_second": 1955.796,
1065
+ "eval_steps_per_second": 122.289,
1066
+ "step": 13676
1067
+ },
1068
+ {
1069
+ "epoch": 13.022813688212928,
1070
+ "grad_norm": 0.786509096622467,
1071
+ "learning_rate": 0.0001855690954138646,
1072
+ "loss": 1.7784,
1073
+ "step": 13700
1074
+ },
1075
+ {
1076
+ "epoch": 13.11787072243346,
1077
+ "grad_norm": 0.8644747734069824,
1078
+ "learning_rate": 0.000183040906103049,
1079
+ "loss": 1.7234,
1080
+ "step": 13800
1081
+ },
1082
+ {
1083
+ "epoch": 13.212927756653992,
1084
+ "grad_norm": 0.8760172128677368,
1085
+ "learning_rate": 0.00018051271679223342,
1086
+ "loss": 1.7308,
1087
+ "step": 13900
1088
+ },
1089
+ {
1090
+ "epoch": 13.307984790874524,
1091
+ "grad_norm": 0.7858941555023193,
1092
+ "learning_rate": 0.0001779845274814178,
1093
+ "loss": 1.7318,
1094
+ "step": 14000
1095
+ },
1096
+ {
1097
+ "epoch": 13.403041825095057,
1098
+ "grad_norm": 0.8771238327026367,
1099
+ "learning_rate": 0.00017545633817060222,
1100
+ "loss": 1.7473,
1101
+ "step": 14100
1102
+ },
1103
+ {
1104
+ "epoch": 13.49809885931559,
1105
+ "grad_norm": 0.8886803984642029,
1106
+ "learning_rate": 0.00017292814885978665,
1107
+ "loss": 1.7491,
1108
+ "step": 14200
1109
+ },
1110
+ {
1111
+ "epoch": 13.593155893536121,
1112
+ "grad_norm": 0.8704127669334412,
1113
+ "learning_rate": 0.00017039995954897102,
1114
+ "loss": 1.7548,
1115
+ "step": 14300
1116
+ },
1117
+ {
1118
+ "epoch": 13.688212927756654,
1119
+ "grad_norm": 1.2635705471038818,
1120
+ "learning_rate": 0.00016787177023815545,
1121
+ "loss": 1.7532,
1122
+ "step": 14400
1123
+ },
1124
+ {
1125
+ "epoch": 13.783269961977187,
1126
+ "grad_norm": 0.9218750596046448,
1127
+ "learning_rate": 0.00016534358092733985,
1128
+ "loss": 1.7531,
1129
+ "step": 14500
1130
+ },
1131
+ {
1132
+ "epoch": 13.87832699619772,
1133
+ "grad_norm": 0.9513919353485107,
1134
+ "learning_rate": 0.00016281539161652425,
1135
+ "loss": 1.7618,
1136
+ "step": 14600
1137
+ },
1138
+ {
1139
+ "epoch": 13.97338403041825,
1140
+ "grad_norm": 1.010962963104248,
1141
+ "learning_rate": 0.00016028720230570865,
1142
+ "loss": 1.7646,
1143
+ "step": 14700
1144
+ },
1145
+ {
1146
+ "epoch": 14.0,
1147
+ "eval_loss": 2.130631923675537,
1148
+ "eval_runtime": 3.6539,
1149
+ "eval_samples_per_second": 1938.998,
1150
+ "eval_steps_per_second": 121.239,
1151
+ "step": 14728
1152
+ }
1153
+ ],
1154
+ "logging_steps": 100,
1155
+ "max_steps": 21040,
1156
+ "num_input_tokens_seen": 0,
1157
+ "num_train_epochs": 20,
1158
+ "save_steps": 500,
1159
+ "stateful_callbacks": {
1160
+ "TrainerControl": {
1161
+ "args": {
1162
+ "should_epoch_stop": false,
1163
+ "should_evaluate": false,
1164
+ "should_log": false,
1165
+ "should_save": true,
1166
+ "should_training_stop": false
1167
+ },
1168
+ "attributes": {}
1169
+ }
1170
+ },
1171
+ "total_flos": 5.6215157665850184e+16,
1172
+ "train_batch_size": 128,
1173
+ "trial_name": null,
1174
+ "trial_params": null
1175
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c747c7cd25c286914dbb5bbec4723cdf421e34ea221f411b85b150d0b96bd4
3
+ size 5304