Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

config.json +744 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1175 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,744 @@

+{
+  "architectures": [
+    "CTCTransformerModel"
+  ],
+  "blank_id": 0,
+  "hidden_size": 384,
+  "id2label": {
+    "0": "<pad>",
+    "1": "|",
+    "2": "AA0",
+    "3": "AA1",
+    "4": "AA2",
+    "5": "AE0",
+    "6": "AE1",
+    "7": "AE2",
+    "8": "AH0",
+    "9": "AH1",
+    "10": "AH2",
+    "11": "AO0",
+    "12": "AO1",
+    "13": "AO2",
+    "14": "AW0",
+    "15": "AW1",
+    "16": "AW2",
+    "17": "AY0",
+    "18": "AY1",
+    "19": "AY2",
+    "20": "B",
+    "21": "CH",
+    "22": "D",
+    "23": "DH",
+    "24": "EH0",
+    "25": "EH1",
+    "26": "EH2",
+    "27": "ER0",
+    "28": "ER1",
+    "29": "ER2",
+    "30": "EY0",
+    "31": "EY1",
+    "32": "EY2",
+    "33": "F",
+    "34": "G",
+    "35": "HH",
+    "36": "IH0",
+    "37": "IH1",
+    "38": "IH2",
+    "39": "IY0",
+    "40": "IY1",
+    "41": "IY2",
+    "42": "JH",
+    "43": "K",
+    "44": "L",
+    "45": "M",
+    "46": "N",
+    "47": "NG",
+    "48": "Ng",
+    "49": "OW0",
+    "50": "OW1",
+    "51": "OW2",
+    "52": "OY0",
+    "53": "OY1",
+    "54": "OY2",
+    "55": "P",
+    "56": "R",
+    "57": "S",
+    "58": "SH",
+    "59": "T",
+    "60": "TH",
+    "61": "UH0",
+    "62": "UH1",
+    "63": "UH2",
+    "64": "UW0",
+    "65": "UW1",
+    "66": "UW2",
+    "67": "V",
+    "68": "W",
+    "69": "Y",
+    "70": "Z",
+    "71": "ZH",
+    "72": "aa",
+    "73": "aa_1",
+    "74": "aa_2",
+    "75": "aa_3",
+    "76": "aa_4",
+    "77": "aa_5",
+    "78": "aa_6",
+    "79": "aai",
+    "80": "aai_1",
+    "81": "aai_2",
+    "82": "aai_3",
+    "83": "aai_4",
+    "84": "aai_5",
+    "85": "aai_6",
+    "86": "aak",
+    "87": "aak_1",
+    "88": "aak_2",
+    "89": "aak_3",
+    "90": "aak_6",
+    "91": "aam",
+    "92": "aam_1",
+    "93": "aam_2",
+    "94": "aam_3",
+    "95": "aam_4",
+    "96": "aam_5",
+    "97": "aam_6",
+    "98": "aan_1",
+    "99": "aan_2",
+    "100": "aan_3",
+    "101": "aan_4",
+    "102": "aan_5",
+    "103": "aan_6",
+    "104": "aang_1",
+    "105": "aang_2",
+    "106": "aang_3",
+    "107": "aang_4",
+    "108": "aang_5",
+    "109": "aang_6",
+    "110": "aap",
+    "111": "aap_2",
+    "112": "aap_3",
+    "113": "aap_6",
+    "114": "aat",
+    "115": "aat_1",
+    "116": "aat_2",
+    "117": "aat_3",
+    "118": "aat_6",
+    "119": "aau",
+    "120": "aau_1",
+    "121": "aau_2",
+    "122": "aau_3",
+    "123": "aau_4",
+    "124": "aau_5",
+    "125": "aau_6",
+    "126": "ai",
+    "127": "ai_1",
+    "128": "ai_2",
+    "129": "ai_3",
+    "130": "ai_4",
+    "131": "ai_5",
+    "132": "ai_6",
+    "133": "ak",
+    "134": "ak_1",
+    "135": "ak_6",
+    "136": "am",
+    "137": "am_1",
+    "138": "am_2",
+    "139": "am_3",
+    "140": "am_4",
+    "141": "am_6",
+    "142": "an_1",
+    "143": "an_2",
+    "144": "an_3",
+    "145": "an_4",
+    "146": "an_5",
+    "147": "an_6",
+    "148": "ang_1",
+    "149": "ang_2",
+    "150": "ang_3",
+    "151": "ang_4",
+    "152": "ang_6",
+    "153": "ap_1",
+    "154": "ap_2",
+    "155": "ap_6",
+    "156": "at_1",
+    "157": "at_2",
+    "158": "at_4",
+    "159": "at_6",
+    "160": "au",
+    "161": "au_1",
+    "162": "au_2",
+    "163": "au_3",
+    "164": "au_4",
+    "165": "au_5",
+    "166": "au_6",
+    "167": "b",
+    "168": "c",
+    "169": "d",
+    "170": "e_1",
+    "171": "e_2",
+    "172": "e_3",
+    "173": "e_4",
+    "174": "e_5",
+    "175": "e_6",
+    "176": "ei_1",
+    "177": "ei_2",
+    "178": "ei_3",
+    "179": "ei_4",
+    "180": "ei_5",
+    "181": "ei_6",
+    "182": "ek_1",
+    "183": "ek_3",
+    "184": "ek_6",
+    "185": "eng_1",
+    "186": "eng_2",
+    "187": "eng_3",
+    "188": "eng_4",
+    "189": "eng_5",
+    "190": "eng_6",
+    "191": "eoi_1",
+    "192": "eoi_2",
+    "193": "eoi_3",
+    "194": "eoi_4",
+    "195": "eoi_5",
+    "196": "eoi_6",
+    "197": "eon_1",
+    "198": "eon_2",
+    "199": "eon_3",
+    "200": "eon_4",
+    "201": "eon_5",
+    "202": "eon_6",
+    "203": "eot_1",
+    "204": "eot_2",
+    "205": "eot_6",
+    "206": "eu_6",
+    "207": "f",
+    "208": "g",
+    "209": "gw",
+    "210": "h",
+    "211": "i_1",
+    "212": "i_2",
+    "213": "i_3",
+    "214": "i_4",
+    "215": "i_5",
+    "216": "i_6",
+    "217": "ik_1",
+    "218": "ik_4",
+    "219": "ik_6",
+    "220": "im_1",
+    "221": "im_2",
+    "222": "im_3",
+    "223": "im_4",
+    "224": "im_5",
+    "225": "im_6",
+    "226": "in_1",
+    "227": "in_2",
+    "228": "in_3",
+    "229": "in_4",
+    "230": "in_5",
+    "231": "in_6",
+    "232": "ing_1",
+    "233": "ing_2",
+    "234": "ing_3",
+    "235": "ing_4",
+    "236": "ing_5",
+    "237": "ing_6",
+    "238": "ip_2",
+    "239": "ip_3",
+    "240": "ip_6",
+    "241": "it_1",
+    "242": "it_3",
+    "243": "it_6",
+    "244": "iu_1",
+    "245": "iu_2",
+    "246": "iu_3",
+    "247": "iu_4",
+    "248": "iu_5",
+    "249": "iu_6",
+    "250": "j",
+    "251": "k",
+    "252": "kw",
+    "253": "l",
+    "254": "m",
+    "255": "n",
+    "256": "ng",
+    "257": "o",
+    "258": "o_1",
+    "259": "o_2",
+    "260": "o_3",
+    "261": "o_4",
+    "262": "o_5",
+    "263": "o_6",
+    "264": "oe_1",
+    "265": "oe_2",
+    "266": "oe_3",
+    "267": "oek_2",
+    "268": "oek_3",
+    "269": "oek_4",
+    "270": "oek_6",
+    "271": "oeng_1",
+    "272": "oeng_2",
+    "273": "oeng_3",
+    "274": "oeng_4",
+    "275": "oeng_5",
+    "276": "oeng_6",
+    "277": "oi",
+    "278": "oi_1",
+    "279": "oi_2",
+    "280": "oi_3",
+    "281": "oi_4",
+    "282": "oi_5",
+    "283": "oi_6",
+    "284": "ok",
+    "285": "ok_1",
+    "286": "ok_2",
+    "287": "ok_3",
+    "288": "ok_6",
+    "289": "on",
+    "290": "on_1",
+    "291": "on_2",
+    "292": "on_3",
+    "293": "on_4",
+    "294": "on_6",
+    "295": "ong_1",
+    "296": "ong_2",
+    "297": "ong_3",
+    "298": "ong_4",
+    "299": "ong_5",
+    "300": "ong_6",
+    "301": "ot_3",
+    "302": "ou",
+    "303": "ou_1",
+    "304": "ou_2",
+    "305": "ou_3",
+    "306": "ou_4",
+    "307": "ou_5",
+    "308": "ou_6",
+    "309": "p",
+    "310": "s",
+    "311": "t",
+    "312": "u_1",
+    "313": "u_2",
+    "314": "u_3",
+    "315": "u_4",
+    "316": "u_5",
+    "317": "u_6",
+    "318": "ui_1",
+    "319": "ui_2",
+    "320": "ui_3",
+    "321": "ui_4",
+    "322": "ui_5",
+    "323": "ui_6",
+    "324": "uk",
+    "325": "uk_1",
+    "326": "uk_2",
+    "327": "uk_6",
+    "328": "un_1",
+    "329": "un_2",
+    "330": "un_3",
+    "331": "un_4",
+    "332": "un_5",
+    "333": "un_6",
+    "334": "ung",
+    "335": "ung_1",
+    "336": "ung_2",
+    "337": "ung_3",
+    "338": "ung_4",
+    "339": "ung_5",
+    "340": "ung_6",
+    "341": "ut_1",
+    "342": "ut_3",
+    "343": "ut_6",
+    "344": "w",
+    "345": "yu_1",
+    "346": "yu_2",
+    "347": "yu_3",
+    "348": "yu_4",
+    "349": "yu_5",
+    "350": "yu_6",
+    "351": "yun_1",
+    "352": "yun_2",
+    "353": "yun_3",
+    "354": "yun_4",
+    "355": "yun_5",
+    "356": "yun_6",
+    "357": "yut_1",
+    "358": "yut_2",
+    "359": "yut_3",
+    "360": "yut_4",
+    "361": "yut_6",
+    "362": "z"
+  },
+  "intermediate_size": 1024,
+  "label2id": {
+    "<pad>": 0,
+    "AA0": 2,
+    "AA1": 3,
+    "AA2": 4,
+    "AE0": 5,
+    "AE1": 6,
+    "AE2": 7,
+    "AH0": 8,
+    "AH1": 9,
+    "AH2": 10,
+    "AO0": 11,
+    "AO1": 12,
+    "AO2": 13,
+    "AW0": 14,
+    "AW1": 15,
+    "AW2": 16,
+    "AY0": 17,
+    "AY1": 18,
+    "AY2": 19,
+    "B": 20,
+    "CH": 21,
+    "D": 22,
+    "DH": 23,
+    "EH0": 24,
+    "EH1": 25,
+    "EH2": 26,
+    "ER0": 27,
+    "ER1": 28,
+    "ER2": 29,
+    "EY0": 30,
+    "EY1": 31,
+    "EY2": 32,
+    "F": 33,
+    "G": 34,
+    "HH": 35,
+    "IH0": 36,
+    "IH1": 37,
+    "IH2": 38,
+    "IY0": 39,
+    "IY1": 40,
+    "IY2": 41,
+    "JH": 42,
+    "K": 43,
+    "L": 44,
+    "M": 45,
+    "N": 46,
+    "NG": 47,
+    "Ng": 48,
+    "OW0": 49,
+    "OW1": 50,
+    "OW2": 51,
+    "OY0": 52,
+    "OY1": 53,
+    "OY2": 54,
+    "P": 55,
+    "R": 56,
+    "S": 57,
+    "SH": 58,
+    "T": 59,
+    "TH": 60,
+    "UH0": 61,
+    "UH1": 62,
+    "UH2": 63,
+    "UW0": 64,
+    "UW1": 65,
+    "UW2": 66,
+    "V": 67,
+    "W": 68,
+    "Y": 69,
+    "Z": 70,
+    "ZH": 71,
+    "aa": 72,
+    "aa_1": 73,
+    "aa_2": 74,
+    "aa_3": 75,
+    "aa_4": 76,
+    "aa_5": 77,
+    "aa_6": 78,
+    "aai": 79,
+    "aai_1": 80,
+    "aai_2": 81,
+    "aai_3": 82,
+    "aai_4": 83,
+    "aai_5": 84,
+    "aai_6": 85,
+    "aak": 86,
+    "aak_1": 87,
+    "aak_2": 88,
+    "aak_3": 89,
+    "aak_6": 90,
+    "aam": 91,
+    "aam_1": 92,
+    "aam_2": 93,
+    "aam_3": 94,
+    "aam_4": 95,
+    "aam_5": 96,
+    "aam_6": 97,
+    "aan_1": 98,
+    "aan_2": 99,
+    "aan_3": 100,
+    "aan_4": 101,
+    "aan_5": 102,
+    "aan_6": 103,
+    "aang_1": 104,
+    "aang_2": 105,
+    "aang_3": 106,
+    "aang_4": 107,
+    "aang_5": 108,
+    "aang_6": 109,
+    "aap": 110,
+    "aap_2": 111,
+    "aap_3": 112,
+    "aap_6": 113,
+    "aat": 114,
+    "aat_1": 115,
+    "aat_2": 116,
+    "aat_3": 117,
+    "aat_6": 118,
+    "aau": 119,
+    "aau_1": 120,
+    "aau_2": 121,
+    "aau_3": 122,
+    "aau_4": 123,
+    "aau_5": 124,
+    "aau_6": 125,
+    "ai": 126,
+    "ai_1": 127,
+    "ai_2": 128,
+    "ai_3": 129,
+    "ai_4": 130,
+    "ai_5": 131,
+    "ai_6": 132,
+    "ak": 133,
+    "ak_1": 134,
+    "ak_6": 135,
+    "am": 136,
+    "am_1": 137,
+    "am_2": 138,
+    "am_3": 139,
+    "am_4": 140,
+    "am_6": 141,
+    "an_1": 142,
+    "an_2": 143,
+    "an_3": 144,
+    "an_4": 145,
+    "an_5": 146,
+    "an_6": 147,
+    "ang_1": 148,
+    "ang_2": 149,
+    "ang_3": 150,
+    "ang_4": 151,
+    "ang_6": 152,
+    "ap_1": 153,
+    "ap_2": 154,
+    "ap_6": 155,
+    "at_1": 156,
+    "at_2": 157,
+    "at_4": 158,
+    "at_6": 159,
+    "au": 160,
+    "au_1": 161,
+    "au_2": 162,
+    "au_3": 163,
+    "au_4": 164,
+    "au_5": 165,
+    "au_6": 166,
+    "b": 167,
+    "c": 168,
+    "d": 169,
+    "e_1": 170,
+    "e_2": 171,
+    "e_3": 172,
+    "e_4": 173,
+    "e_5": 174,
+    "e_6": 175,
+    "ei_1": 176,
+    "ei_2": 177,
+    "ei_3": 178,
+    "ei_4": 179,
+    "ei_5": 180,
+    "ei_6": 181,
+    "ek_1": 182,
+    "ek_3": 183,
+    "ek_6": 184,
+    "eng_1": 185,
+    "eng_2": 186,
+    "eng_3": 187,
+    "eng_4": 188,
+    "eng_5": 189,
+    "eng_6": 190,
+    "eoi_1": 191,
+    "eoi_2": 192,
+    "eoi_3": 193,
+    "eoi_4": 194,
+    "eoi_5": 195,
+    "eoi_6": 196,
+    "eon_1": 197,
+    "eon_2": 198,
+    "eon_3": 199,
+    "eon_4": 200,
+    "eon_5": 201,
+    "eon_6": 202,
+    "eot_1": 203,
+    "eot_2": 204,
+    "eot_6": 205,
+    "eu_6": 206,
+    "f": 207,
+    "g": 208,
+    "gw": 209,
+    "h": 210,
+    "i_1": 211,
+    "i_2": 212,
+    "i_3": 213,
+    "i_4": 214,
+    "i_5": 215,
+    "i_6": 216,
+    "ik_1": 217,
+    "ik_4": 218,
+    "ik_6": 219,
+    "im_1": 220,
+    "im_2": 221,
+    "im_3": 222,
+    "im_4": 223,
+    "im_5": 224,
+    "im_6": 225,
+    "in_1": 226,
+    "in_2": 227,
+    "in_3": 228,
+    "in_4": 229,
+    "in_5": 230,
+    "in_6": 231,
+    "ing_1": 232,
+    "ing_2": 233,
+    "ing_3": 234,
+    "ing_4": 235,
+    "ing_5": 236,
+    "ing_6": 237,
+    "ip_2": 238,
+    "ip_3": 239,
+    "ip_6": 240,
+    "it_1": 241,
+    "it_3": 242,
+    "it_6": 243,
+    "iu_1": 244,
+    "iu_2": 245,
+    "iu_3": 246,
+    "iu_4": 247,
+    "iu_5": 248,
+    "iu_6": 249,
+    "j": 250,
+    "k": 251,
+    "kw": 252,
+    "l": 253,
+    "m": 254,
+    "n": 255,
+    "ng": 256,
+    "o": 257,
+    "o_1": 258,
+    "o_2": 259,
+    "o_3": 260,
+    "o_4": 261,
+    "o_5": 262,
+    "o_6": 263,
+    "oe_1": 264,
+    "oe_2": 265,
+    "oe_3": 266,
+    "oek_2": 267,
+    "oek_3": 268,
+    "oek_4": 269,
+    "oek_6": 270,
+    "oeng_1": 271,
+    "oeng_2": 272,
+    "oeng_3": 273,
+    "oeng_4": 274,
+    "oeng_5": 275,
+    "oeng_6": 276,
+    "oi": 277,
+    "oi_1": 278,
+    "oi_2": 279,
+    "oi_3": 280,
+    "oi_4": 281,
+    "oi_5": 282,
+    "oi_6": 283,
+    "ok": 284,
+    "ok_1": 285,
+    "ok_2": 286,
+    "ok_3": 287,
+    "ok_6": 288,
+    "on": 289,
+    "on_1": 290,
+    "on_2": 291,
+    "on_3": 292,
+    "on_4": 293,
+    "on_6": 294,
+    "ong_1": 295,
+    "ong_2": 296,
+    "ong_3": 297,
+    "ong_4": 298,
+    "ong_5": 299,
+    "ong_6": 300,
+    "ot_3": 301,
+    "ou": 302,
+    "ou_1": 303,
+    "ou_2": 304,
+    "ou_3": 305,
+    "ou_4": 306,
+    "ou_5": 307,
+    "ou_6": 308,
+    "p": 309,
+    "s": 310,
+    "t": 311,
+    "u_1": 312,
+    "u_2": 313,
+    "u_3": 314,
+    "u_4": 315,
+    "u_5": 316,
+    "u_6": 317,
+    "ui_1": 318,
+    "ui_2": 319,
+    "ui_3": 320,
+    "ui_4": 321,
+    "ui_5": 322,
+    "ui_6": 323,
+    "uk": 324,
+    "uk_1": 325,
+    "uk_2": 326,
+    "uk_6": 327,
+    "un_1": 328,
+    "un_2": 329,
+    "un_3": 330,
+    "un_4": 331,
+    "un_5": 332,
+    "un_6": 333,
+    "ung": 334,
+    "ung_1": 335,
+    "ung_2": 336,
+    "ung_3": 337,
+    "ung_4": 338,
+    "ung_5": 339,
+    "ung_6": 340,
+    "ut_1": 341,
+    "ut_3": 342,
+    "ut_6": 343,
+    "w": 344,
+    "yu_1": 345,
+    "yu_2": 346,
+    "yu_3": 347,
+    "yu_4": 348,
+    "yu_5": 349,
+    "yu_6": 350,
+    "yun_1": 351,
+    "yun_2": 352,
+    "yun_3": 353,
+    "yun_4": 354,
+    "yun_5": 355,
+    "yun_6": 356,
+    "yut_1": 357,
+    "yut_2": 358,
+    "yut_3": 359,
+    "yut_4": 360,
+    "yut_6": 361,
+    "z": 362,
+    "|": 1
+  },
+  "max_position_embeddings": 1024,
+  "num_attention_heads": 6,
+  "num_hidden_layers": 6,
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "vocab_size": 6561
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47947574cf1d0161d5e51a17465e3c5d6500449f7c2d421d04b8f3c2d241a750
+size 43783580

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:042e96fb4c58ed8717496a4ebd1ed1ae01d8a45911590edb9a789e74209b6148
+size 87614778

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d613c42bc6269fa0dc958aa82cd8cd6dfce040534eaf9518e3193f2c7952dd0
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfd9c5c96ae49d06833c053e6af6f37491082e4ff8aad12c8e68cdd6d2e4aa79
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1175 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 14.0,
+  "eval_steps": 500,
+  "global_step": 14728,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09505703422053231,
+      "grad_norm": 0.9250678420066833,
+      "learning_rate": 3.95882818685669e-05,
+      "loss": 6.6663,
+      "step": 100
+    },
+    {
+      "epoch": 0.19011406844106463,
+      "grad_norm": 1.117205262184143,
+      "learning_rate": 7.91765637371338e-05,
+      "loss": 4.145,
+      "step": 200
+    },
+    {
+      "epoch": 0.28517110266159695,
+      "grad_norm": 1.3325448036193848,
+      "learning_rate": 0.00011876484560570071,
+      "loss": 3.8608,
+      "step": 300
+    },
+    {
+      "epoch": 0.38022813688212925,
+      "grad_norm": 0.9613128900527954,
+      "learning_rate": 0.0001583531274742676,
+      "loss": 3.768,
+      "step": 400
+    },
+    {
+      "epoch": 0.4752851711026616,
+      "grad_norm": 1.1198444366455078,
+      "learning_rate": 0.00019794140934283454,
+      "loss": 3.7093,
+      "step": 500
+    },
+    {
+      "epoch": 0.5703422053231939,
+      "grad_norm": 1.0210366249084473,
+      "learning_rate": 0.00023752969121140142,
+      "loss": 3.6664,
+      "step": 600
+    },
+    {
+      "epoch": 0.6653992395437263,
+      "grad_norm": 0.9687440395355225,
+      "learning_rate": 0.00027711797307996834,
+      "loss": 3.5409,
+      "step": 700
+    },
+    {
+      "epoch": 0.7604562737642585,
+      "grad_norm": 1.4981633424758911,
+      "learning_rate": 0.0003167062549485352,
+      "loss": 3.3992,
+      "step": 800
+    },
+    {
+      "epoch": 0.8555133079847909,
+      "grad_norm": 0.8627603650093079,
+      "learning_rate": 0.00035629453681710216,
+      "loss": 3.1727,
+      "step": 900
+    },
+    {
+      "epoch": 0.9505703422053232,
+      "grad_norm": 0.9925593733787537,
+      "learning_rate": 0.0003958828186856691,
+      "loss": 2.9724,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 2.6960370540618896,
+      "eval_runtime": 3.7345,
+      "eval_samples_per_second": 1897.152,
+      "eval_steps_per_second": 118.622,
+      "step": 1052
+    },
+    {
+      "epoch": 1.0456273764258555,
+      "grad_norm": 0.9267619848251343,
+      "learning_rate": 0.00043547110055423594,
+      "loss": 2.7022,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1406844106463878,
+      "grad_norm": 0.7666485905647278,
+      "learning_rate": 0.00047505938242280285,
+      "loss": 2.5641,
+      "step": 1200
+    },
+    {
+      "epoch": 1.2357414448669202,
+      "grad_norm": 0.5969619154930115,
+      "learning_rate": 0.0004990645699549983,
+      "loss": 2.5058,
+      "step": 1300
+    },
+    {
+      "epoch": 1.3307984790874525,
+      "grad_norm": 0.7782655358314514,
+      "learning_rate": 0.0004965363806441826,
+      "loss": 2.4665,
+      "step": 1400
+    },
+    {
+      "epoch": 1.4258555133079849,
+      "grad_norm": 0.8928040266036987,
+      "learning_rate": 0.000494008191333367,
+      "loss": 2.4311,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5209125475285172,
+      "grad_norm": 0.8687949180603027,
+      "learning_rate": 0.0004914800020225515,
+      "loss": 2.3964,
+      "step": 1600
+    },
+    {
+      "epoch": 1.6159695817490496,
+      "grad_norm": 0.6245518922805786,
+      "learning_rate": 0.0004889518127117359,
+      "loss": 2.374,
+      "step": 1700
+    },
+    {
+      "epoch": 1.7110266159695817,
+      "grad_norm": 0.6903976202011108,
+      "learning_rate": 0.0004864236234009203,
+      "loss": 2.3606,
+      "step": 1800
+    },
+    {
+      "epoch": 1.806083650190114,
+      "grad_norm": 0.8996257781982422,
+      "learning_rate": 0.00048389543409010466,
+      "loss": 2.3376,
+      "step": 1900
+    },
+    {
+      "epoch": 1.9011406844106464,
+      "grad_norm": 0.734466016292572,
+      "learning_rate": 0.0004813672447792891,
+      "loss": 2.3226,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9961977186311786,
+      "grad_norm": 0.6836825013160706,
+      "learning_rate": 0.0004788390554684735,
+      "loss": 2.3108,
+      "step": 2100
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.285733461380005,
+      "eval_runtime": 3.623,
+      "eval_samples_per_second": 1955.579,
+      "eval_steps_per_second": 122.275,
+      "step": 2104
+    },
+    {
+      "epoch": 2.091254752851711,
+      "grad_norm": 0.5974160432815552,
+      "learning_rate": 0.0004763108661576579,
+      "loss": 2.2585,
+      "step": 2200
+    },
+    {
+      "epoch": 2.1863117870722433,
+      "grad_norm": 0.788093626499176,
+      "learning_rate": 0.0004737826768468423,
+      "loss": 2.264,
+      "step": 2300
+    },
+    {
+      "epoch": 2.2813688212927756,
+      "grad_norm": 0.7451100945472717,
+      "learning_rate": 0.00047125448753602674,
+      "loss": 2.2504,
+      "step": 2400
+    },
+    {
+      "epoch": 2.376425855513308,
+      "grad_norm": 0.6724629998207092,
+      "learning_rate": 0.0004687262982252111,
+      "loss": 2.2358,
+      "step": 2500
+    },
+    {
+      "epoch": 2.4714828897338403,
+      "grad_norm": 0.6606141924858093,
+      "learning_rate": 0.00046619810891439554,
+      "loss": 2.2301,
+      "step": 2600
+    },
+    {
+      "epoch": 2.5665399239543727,
+      "grad_norm": 0.6599621772766113,
+      "learning_rate": 0.0004636699196035799,
+      "loss": 2.2268,
+      "step": 2700
+    },
+    {
+      "epoch": 2.661596958174905,
+      "grad_norm": 0.6633493304252625,
+      "learning_rate": 0.00046114173029276434,
+      "loss": 2.2247,
+      "step": 2800
+    },
+    {
+      "epoch": 2.7566539923954374,
+      "grad_norm": 0.6308265328407288,
+      "learning_rate": 0.00045861354098194877,
+      "loss": 2.2221,
+      "step": 2900
+    },
+    {
+      "epoch": 2.8517110266159698,
+      "grad_norm": 0.6383451223373413,
+      "learning_rate": 0.00045608535167113314,
+      "loss": 2.2274,
+      "step": 3000
+    },
+    {
+      "epoch": 2.9467680608365017,
+      "grad_norm": 0.61512291431427,
+      "learning_rate": 0.00045355716236031757,
+      "loss": 2.2067,
+      "step": 3100
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 2.2008087635040283,
+      "eval_runtime": 3.5613,
+      "eval_samples_per_second": 1989.445,
+      "eval_steps_per_second": 124.393,
+      "step": 3156
+    },
+    {
+      "epoch": 3.041825095057034,
+      "grad_norm": 0.7461186647415161,
+      "learning_rate": 0.00045102897304950194,
+      "loss": 2.1882,
+      "step": 3200
+    },
+    {
+      "epoch": 3.1368821292775664,
+      "grad_norm": 0.6590662598609924,
+      "learning_rate": 0.00044850078373868637,
+      "loss": 2.1662,
+      "step": 3300
+    },
+    {
+      "epoch": 3.2319391634980987,
+      "grad_norm": 0.5832785964012146,
+      "learning_rate": 0.00044597259442787074,
+      "loss": 2.1603,
+      "step": 3400
+    },
+    {
+      "epoch": 3.326996197718631,
+      "grad_norm": 0.6356543898582458,
+      "learning_rate": 0.00044344440511705517,
+      "loss": 2.1601,
+      "step": 3500
+    },
+    {
+      "epoch": 3.4220532319391634,
+      "grad_norm": 0.7197031378746033,
+      "learning_rate": 0.0004409162158062396,
+      "loss": 2.1567,
+      "step": 3600
+    },
+    {
+      "epoch": 3.517110266159696,
+      "grad_norm": 0.5856086611747742,
+      "learning_rate": 0.00043838802649542397,
+      "loss": 2.1588,
+      "step": 3700
+    },
+    {
+      "epoch": 3.612167300380228,
+      "grad_norm": 0.6212655305862427,
+      "learning_rate": 0.00043585983718460834,
+      "loss": 2.1565,
+      "step": 3800
+    },
+    {
+      "epoch": 3.7072243346007605,
+      "grad_norm": 0.6765671968460083,
+      "learning_rate": 0.0004333316478737928,
+      "loss": 2.1667,
+      "step": 3900
+    },
+    {
+      "epoch": 3.802281368821293,
+      "grad_norm": 0.6720090508460999,
+      "learning_rate": 0.0004308034585629772,
+      "loss": 2.1675,
+      "step": 4000
+    },
+    {
+      "epoch": 3.897338403041825,
+      "grad_norm": 0.7150991559028625,
+      "learning_rate": 0.00042827526925216157,
+      "loss": 2.1474,
+      "step": 4100
+    },
+    {
+      "epoch": 3.9923954372623576,
+      "grad_norm": 0.5831249356269836,
+      "learning_rate": 0.00042574707994134605,
+      "loss": 2.1485,
+      "step": 4200
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.15364408493042,
+      "eval_runtime": 3.644,
+      "eval_samples_per_second": 1944.292,
+      "eval_steps_per_second": 121.57,
+      "step": 4208
+    },
+    {
+      "epoch": 4.08745247148289,
+      "grad_norm": 0.6653150916099548,
+      "learning_rate": 0.0004232188906305304,
+      "loss": 2.0899,
+      "step": 4300
+    },
+    {
+      "epoch": 4.182509505703422,
+      "grad_norm": 0.7235066294670105,
+      "learning_rate": 0.0004206907013197148,
+      "loss": 2.0982,
+      "step": 4400
+    },
+    {
+      "epoch": 4.277566539923955,
+      "grad_norm": 0.7326545715332031,
+      "learning_rate": 0.0004181625120088992,
+      "loss": 2.1007,
+      "step": 4500
+    },
+    {
+      "epoch": 4.3726235741444865,
+      "grad_norm": 0.6236776113510132,
+      "learning_rate": 0.00041563432269808365,
+      "loss": 2.1031,
+      "step": 4600
+    },
+    {
+      "epoch": 4.467680608365019,
+      "grad_norm": 0.5669475197792053,
+      "learning_rate": 0.000413106133387268,
+      "loss": 2.1087,
+      "step": 4700
+    },
+    {
+      "epoch": 4.562737642585551,
+      "grad_norm": 0.5483006834983826,
+      "learning_rate": 0.00041057794407645245,
+      "loss": 2.1034,
+      "step": 4800
+    },
+    {
+      "epoch": 4.657794676806084,
+      "grad_norm": 0.5456926822662354,
+      "learning_rate": 0.0004080497547656369,
+      "loss": 2.1065,
+      "step": 4900
+    },
+    {
+      "epoch": 4.752851711026616,
+      "grad_norm": 0.9545803666114807,
+      "learning_rate": 0.00040552156545482125,
+      "loss": 2.1168,
+      "step": 5000
+    },
+    {
+      "epoch": 4.847908745247148,
+      "grad_norm": 0.5378767251968384,
+      "learning_rate": 0.0004029933761440057,
+      "loss": 2.1107,
+      "step": 5100
+    },
+    {
+      "epoch": 4.942965779467681,
+      "grad_norm": 0.629880964756012,
+      "learning_rate": 0.00040046518683319005,
+      "loss": 2.0983,
+      "step": 5200
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.132718801498413,
+      "eval_runtime": 3.6373,
+      "eval_samples_per_second": 1947.857,
+      "eval_steps_per_second": 121.793,
+      "step": 5260
+    },
+    {
+      "epoch": 5.038022813688213,
+      "grad_norm": 0.5900342464447021,
+      "learning_rate": 0.0003979369975223745,
+      "loss": 2.0758,
+      "step": 5300
+    },
+    {
+      "epoch": 5.133079847908745,
+      "grad_norm": 0.6181082129478455,
+      "learning_rate": 0.0003954088082115589,
+      "loss": 2.041,
+      "step": 5400
+    },
+    {
+      "epoch": 5.228136882129277,
+      "grad_norm": 0.6756412386894226,
+      "learning_rate": 0.0003928806189007433,
+      "loss": 2.0548,
+      "step": 5500
+    },
+    {
+      "epoch": 5.32319391634981,
+      "grad_norm": 0.6649320125579834,
+      "learning_rate": 0.0003903524295899277,
+      "loss": 2.0438,
+      "step": 5600
+    },
+    {
+      "epoch": 5.418250950570342,
+      "grad_norm": 0.5628513693809509,
+      "learning_rate": 0.00038782424027911214,
+      "loss": 2.0485,
+      "step": 5700
+    },
+    {
+      "epoch": 5.513307984790875,
+      "grad_norm": 0.6923677921295166,
+      "learning_rate": 0.0003852960509682965,
+      "loss": 2.063,
+      "step": 5800
+    },
+    {
+      "epoch": 5.608365019011407,
+      "grad_norm": 0.6819363236427307,
+      "learning_rate": 0.0003827678616574809,
+      "loss": 2.0618,
+      "step": 5900
+    },
+    {
+      "epoch": 5.7034220532319395,
+      "grad_norm": 0.6446284055709839,
+      "learning_rate": 0.00038023967234666537,
+      "loss": 2.0674,
+      "step": 6000
+    },
+    {
+      "epoch": 5.798479087452471,
+      "grad_norm": 0.6319680213928223,
+      "learning_rate": 0.00037771148303584974,
+      "loss": 2.061,
+      "step": 6100
+    },
+    {
+      "epoch": 5.893536121673003,
+      "grad_norm": 0.6318814754486084,
+      "learning_rate": 0.0003751832937250341,
+      "loss": 2.0656,
+      "step": 6200
+    },
+    {
+      "epoch": 5.988593155893536,
+      "grad_norm": 0.6261875033378601,
+      "learning_rate": 0.0003726551044142186,
+      "loss": 2.0663,
+      "step": 6300
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.1098814010620117,
+      "eval_runtime": 3.698,
+      "eval_samples_per_second": 1915.889,
+      "eval_steps_per_second": 119.794,
+      "step": 6312
+    },
+    {
+      "epoch": 6.083650190114068,
+      "grad_norm": 0.6620230674743652,
+      "learning_rate": 0.00037012691510340297,
+      "loss": 1.9996,
+      "step": 6400
+    },
+    {
+      "epoch": 6.178707224334601,
+      "grad_norm": 1.0794607400894165,
+      "learning_rate": 0.00036759872579258734,
+      "loss": 2.0018,
+      "step": 6500
+    },
+    {
+      "epoch": 6.273764258555133,
+      "grad_norm": 1.372861385345459,
+      "learning_rate": 0.00036507053648177177,
+      "loss": 2.0059,
+      "step": 6600
+    },
+    {
+      "epoch": 6.3688212927756656,
+      "grad_norm": 0.5926664471626282,
+      "learning_rate": 0.0003625423471709562,
+      "loss": 2.012,
+      "step": 6700
+    },
+    {
+      "epoch": 6.4638783269961975,
+      "grad_norm": 0.7855852246284485,
+      "learning_rate": 0.00036001415786014057,
+      "loss": 2.0128,
+      "step": 6800
+    },
+    {
+      "epoch": 6.55893536121673,
+      "grad_norm": 0.6684075593948364,
+      "learning_rate": 0.000357485968549325,
+      "loss": 2.0221,
+      "step": 6900
+    },
+    {
+      "epoch": 6.653992395437262,
+      "grad_norm": 0.628013014793396,
+      "learning_rate": 0.00035495777923850937,
+      "loss": 2.0159,
+      "step": 7000
+    },
+    {
+      "epoch": 6.749049429657795,
+      "grad_norm": 0.7943947911262512,
+      "learning_rate": 0.0003524295899276938,
+      "loss": 2.0223,
+      "step": 7100
+    },
+    {
+      "epoch": 6.844106463878327,
+      "grad_norm": 0.645799994468689,
+      "learning_rate": 0.0003499014006168782,
+      "loss": 2.0206,
+      "step": 7200
+    },
+    {
+      "epoch": 6.93916349809886,
+      "grad_norm": 0.6603648066520691,
+      "learning_rate": 0.0003473732113060626,
+      "loss": 2.0304,
+      "step": 7300
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.099062919616699,
+      "eval_runtime": 3.631,
+      "eval_samples_per_second": 1951.251,
+      "eval_steps_per_second": 122.005,
+      "step": 7364
+    },
+    {
+      "epoch": 7.034220532319392,
+      "grad_norm": 0.6082973480224609,
+      "learning_rate": 0.000344845021995247,
+      "loss": 2.0039,
+      "step": 7400
+    },
+    {
+      "epoch": 7.129277566539924,
+      "grad_norm": 0.673995852470398,
+      "learning_rate": 0.0003423168326844314,
+      "loss": 1.9663,
+      "step": 7500
+    },
+    {
+      "epoch": 7.224334600760456,
+      "grad_norm": 0.675037682056427,
+      "learning_rate": 0.0003397886433736158,
+      "loss": 1.9696,
+      "step": 7600
+    },
+    {
+      "epoch": 7.319391634980988,
+      "grad_norm": 0.6488978266716003,
+      "learning_rate": 0.0003372604540628002,
+      "loss": 1.9701,
+      "step": 7700
+    },
+    {
+      "epoch": 7.414448669201521,
+      "grad_norm": 0.8255399465560913,
+      "learning_rate": 0.0003347322647519846,
+      "loss": 1.9654,
+      "step": 7800
+    },
+    {
+      "epoch": 7.509505703422053,
+      "grad_norm": 1.2661654949188232,
+      "learning_rate": 0.00033220407544116905,
+      "loss": 1.9736,
+      "step": 7900
+    },
+    {
+      "epoch": 7.604562737642586,
+      "grad_norm": 0.6545805335044861,
+      "learning_rate": 0.0003296758861303534,
+      "loss": 1.9783,
+      "step": 8000
+    },
+    {
+      "epoch": 7.699619771863118,
+      "grad_norm": 0.8890361189842224,
+      "learning_rate": 0.00032714769681953785,
+      "loss": 1.9807,
+      "step": 8100
+    },
+    {
+      "epoch": 7.79467680608365,
+      "grad_norm": 0.6547899842262268,
+      "learning_rate": 0.0003246195075087223,
+      "loss": 1.9723,
+      "step": 8200
+    },
+    {
+      "epoch": 7.889733840304182,
+      "grad_norm": 1.1239402294158936,
+      "learning_rate": 0.00032209131819790665,
+      "loss": 1.9734,
+      "step": 8300
+    },
+    {
+      "epoch": 7.984790874524715,
+      "grad_norm": 0.6624830961227417,
+      "learning_rate": 0.000319563128887091,
+      "loss": 1.9869,
+      "step": 8400
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.1034328937530518,
+      "eval_runtime": 3.6013,
+      "eval_samples_per_second": 1967.337,
+      "eval_steps_per_second": 123.011,
+      "step": 8416
+    },
+    {
+      "epoch": 8.079847908745247,
+      "grad_norm": 0.6550971269607544,
+      "learning_rate": 0.0003170349395762755,
+      "loss": 1.9223,
+      "step": 8500
+    },
+    {
+      "epoch": 8.17490494296578,
+      "grad_norm": 0.660987138748169,
+      "learning_rate": 0.0003145067502654599,
+      "loss": 1.9245,
+      "step": 8600
+    },
+    {
+      "epoch": 8.269961977186313,
+      "grad_norm": 0.759884774684906,
+      "learning_rate": 0.00031197856095464425,
+      "loss": 1.9235,
+      "step": 8700
+    },
+    {
+      "epoch": 8.365019011406844,
+      "grad_norm": 0.9319919347763062,
+      "learning_rate": 0.00030945037164382874,
+      "loss": 1.9239,
+      "step": 8800
+    },
+    {
+      "epoch": 8.460076045627376,
+      "grad_norm": 0.6610597968101501,
+      "learning_rate": 0.0003069221823330131,
+      "loss": 1.928,
+      "step": 8900
+    },
+    {
+      "epoch": 8.55513307984791,
+      "grad_norm": 0.7076143622398376,
+      "learning_rate": 0.0003043939930221975,
+      "loss": 1.9289,
+      "step": 9000
+    },
+    {
+      "epoch": 8.65019011406844,
+      "grad_norm": 0.6368849873542786,
+      "learning_rate": 0.0003018658037113819,
+      "loss": 1.932,
+      "step": 9100
+    },
+    {
+      "epoch": 8.745247148288973,
+      "grad_norm": 0.7639185786247253,
+      "learning_rate": 0.00029933761440056634,
+      "loss": 1.9485,
+      "step": 9200
+    },
+    {
+      "epoch": 8.840304182509506,
+      "grad_norm": 1.0823330879211426,
+      "learning_rate": 0.0002968094250897507,
+      "loss": 1.9447,
+      "step": 9300
+    },
+    {
+      "epoch": 8.935361216730039,
+      "grad_norm": 0.8542035222053528,
+      "learning_rate": 0.00029428123577893514,
+      "loss": 1.942,
+      "step": 9400
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 2.0947535037994385,
+      "eval_runtime": 3.6147,
+      "eval_samples_per_second": 1960.063,
+      "eval_steps_per_second": 122.556,
+      "step": 9468
+    },
+    {
+      "epoch": 9.03041825095057,
+      "grad_norm": 0.7601971626281738,
+      "learning_rate": 0.00029175304646811956,
+      "loss": 1.9243,
+      "step": 9500
+    },
+    {
+      "epoch": 9.125475285171103,
+      "grad_norm": 0.7461040019989014,
+      "learning_rate": 0.00028922485715730394,
+      "loss": 1.8704,
+      "step": 9600
+    },
+    {
+      "epoch": 9.220532319391635,
+      "grad_norm": 0.7719326019287109,
+      "learning_rate": 0.00028669666784648836,
+      "loss": 1.8832,
+      "step": 9700
+    },
+    {
+      "epoch": 9.315589353612168,
+      "grad_norm": 0.716136634349823,
+      "learning_rate": 0.00028416847853567274,
+      "loss": 1.8787,
+      "step": 9800
+    },
+    {
+      "epoch": 9.4106463878327,
+      "grad_norm": 0.6928532123565674,
+      "learning_rate": 0.00028164028922485717,
+      "loss": 1.8855,
+      "step": 9900
+    },
+    {
+      "epoch": 9.505703422053232,
+      "grad_norm": 0.7696681618690491,
+      "learning_rate": 0.0002791120999140416,
+      "loss": 1.8855,
+      "step": 10000
+    },
+    {
+      "epoch": 9.600760456273765,
+      "grad_norm": 0.8969391584396362,
+      "learning_rate": 0.00027658391060322597,
+      "loss": 1.9034,
+      "step": 10100
+    },
+    {
+      "epoch": 9.695817490494296,
+      "grad_norm": 0.8469530940055847,
+      "learning_rate": 0.00027405572129241034,
+      "loss": 1.8965,
+      "step": 10200
+    },
+    {
+      "epoch": 9.790874524714829,
+      "grad_norm": 0.7956866025924683,
+      "learning_rate": 0.0002715275319815948,
+      "loss": 1.9087,
+      "step": 10300
+    },
+    {
+      "epoch": 9.885931558935361,
+      "grad_norm": 0.8293343782424927,
+      "learning_rate": 0.0002689993426707792,
+      "loss": 1.9177,
+      "step": 10400
+    },
+    {
+      "epoch": 9.980988593155894,
+      "grad_norm": 0.7472631931304932,
+      "learning_rate": 0.00026647115335996357,
+      "loss": 1.9082,
+      "step": 10500
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 2.097904920578003,
+      "eval_runtime": 3.5592,
+      "eval_samples_per_second": 1990.641,
+      "eval_steps_per_second": 124.468,
+      "step": 10520
+    },
+    {
+      "epoch": 10.076045627376425,
+      "grad_norm": 0.7787309288978577,
+      "learning_rate": 0.00026394296404914805,
+      "loss": 1.8393,
+      "step": 10600
+    },
+    {
+      "epoch": 10.171102661596958,
+      "grad_norm": 1.3328174352645874,
+      "learning_rate": 0.0002614147747383324,
+      "loss": 1.8283,
+      "step": 10700
+    },
+    {
+      "epoch": 10.26615969581749,
+      "grad_norm": 0.7740694284439087,
+      "learning_rate": 0.0002588865854275168,
+      "loss": 1.8422,
+      "step": 10800
+    },
+    {
+      "epoch": 10.361216730038024,
+      "grad_norm": 0.828940749168396,
+      "learning_rate": 0.0002563583961167012,
+      "loss": 1.8516,
+      "step": 10900
+    },
+    {
+      "epoch": 10.456273764258555,
+      "grad_norm": 0.751752495765686,
+      "learning_rate": 0.00025383020680588565,
+      "loss": 1.8624,
+      "step": 11000
+    },
+    {
+      "epoch": 10.551330798479087,
+      "grad_norm": 0.9940192103385925,
+      "learning_rate": 0.00025130201749507,
+      "loss": 1.8599,
+      "step": 11100
+    },
+    {
+      "epoch": 10.64638783269962,
+      "grad_norm": 0.8591569066047668,
+      "learning_rate": 0.00024877382818425445,
+      "loss": 1.8581,
+      "step": 11200
+    },
+    {
+      "epoch": 10.741444866920151,
+      "grad_norm": 0.7676281332969666,
+      "learning_rate": 0.0002462456388734388,
+      "loss": 1.8637,
+      "step": 11300
+    },
+    {
+      "epoch": 10.836501901140684,
+      "grad_norm": 0.7896871566772461,
+      "learning_rate": 0.00024371744956262325,
+      "loss": 1.8606,
+      "step": 11400
+    },
+    {
+      "epoch": 10.931558935361217,
+      "grad_norm": 0.8302274942398071,
+      "learning_rate": 0.00024118926025180765,
+      "loss": 1.8656,
+      "step": 11500
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 2.0961618423461914,
+      "eval_runtime": 3.6362,
+      "eval_samples_per_second": 1948.473,
+      "eval_steps_per_second": 121.831,
+      "step": 11572
+    },
+    {
+      "epoch": 11.02661596958175,
+      "grad_norm": 0.8891871571540833,
+      "learning_rate": 0.00023866107094099208,
+      "loss": 1.8522,
+      "step": 11600
+    },
+    {
+      "epoch": 11.12167300380228,
+      "grad_norm": 0.7549653649330139,
+      "learning_rate": 0.00023613288163017645,
+      "loss": 1.7913,
+      "step": 11700
+    },
+    {
+      "epoch": 11.216730038022813,
+      "grad_norm": 0.8127674460411072,
+      "learning_rate": 0.00023360469231936088,
+      "loss": 1.8102,
+      "step": 11800
+    },
+    {
+      "epoch": 11.311787072243346,
+      "grad_norm": 0.841659426689148,
+      "learning_rate": 0.0002310765030085453,
+      "loss": 1.803,
+      "step": 11900
+    },
+    {
+      "epoch": 11.406844106463879,
+      "grad_norm": 0.8460645079612732,
+      "learning_rate": 0.00022854831369772968,
+      "loss": 1.8201,
+      "step": 12000
+    },
+    {
+      "epoch": 11.50190114068441,
+      "grad_norm": 0.7932580709457397,
+      "learning_rate": 0.0002260201243869141,
+      "loss": 1.811,
+      "step": 12100
+    },
+    {
+      "epoch": 11.596958174904943,
+      "grad_norm": 0.8419378399848938,
+      "learning_rate": 0.0002234919350760985,
+      "loss": 1.8145,
+      "step": 12200
+    },
+    {
+      "epoch": 11.692015209125476,
+      "grad_norm": 0.8346748352050781,
+      "learning_rate": 0.0002209637457652829,
+      "loss": 1.8328,
+      "step": 12300
+    },
+    {
+      "epoch": 11.787072243346007,
+      "grad_norm": 1.019510269165039,
+      "learning_rate": 0.0002184355564544673,
+      "loss": 1.8257,
+      "step": 12400
+    },
+    {
+      "epoch": 11.88212927756654,
+      "grad_norm": 0.8175719976425171,
+      "learning_rate": 0.00021590736714365173,
+      "loss": 1.8274,
+      "step": 12500
+    },
+    {
+      "epoch": 11.977186311787072,
+      "grad_norm": 0.7476153373718262,
+      "learning_rate": 0.00021337917783283614,
+      "loss": 1.8361,
+      "step": 12600
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 2.1029505729675293,
+      "eval_runtime": 3.5932,
+      "eval_samples_per_second": 1971.782,
+      "eval_steps_per_second": 123.289,
+      "step": 12624
+    },
+    {
+      "epoch": 12.072243346007605,
+      "grad_norm": 0.8637651205062866,
+      "learning_rate": 0.00021085098852202054,
+      "loss": 1.7684,
+      "step": 12700
+    },
+    {
+      "epoch": 12.167300380228136,
+      "grad_norm": 0.80800461769104,
+      "learning_rate": 0.00020832279921120496,
+      "loss": 1.7703,
+      "step": 12800
+    },
+    {
+      "epoch": 12.262357414448669,
+      "grad_norm": 1.0111021995544434,
+      "learning_rate": 0.00020579460990038934,
+      "loss": 1.7809,
+      "step": 12900
+    },
+    {
+      "epoch": 12.357414448669202,
+      "grad_norm": 0.8477798700332642,
+      "learning_rate": 0.00020326642058957376,
+      "loss": 1.7795,
+      "step": 13000
+    },
+    {
+      "epoch": 12.452471482889734,
+      "grad_norm": 0.8284028172492981,
+      "learning_rate": 0.00020073823127875814,
+      "loss": 1.7803,
+      "step": 13100
+    },
+    {
+      "epoch": 12.547528517110266,
+      "grad_norm": 0.7752136588096619,
+      "learning_rate": 0.00019821004196794256,
+      "loss": 1.7836,
+      "step": 13200
+    },
+    {
+      "epoch": 12.642585551330798,
+      "grad_norm": 0.8929184675216675,
+      "learning_rate": 0.00019568185265712696,
+      "loss": 1.7724,
+      "step": 13300
+    },
+    {
+      "epoch": 12.737642585551331,
+      "grad_norm": 0.8475900888442993,
+      "learning_rate": 0.00019315366334631136,
+      "loss": 1.7891,
+      "step": 13400
+    },
+    {
+      "epoch": 12.832699619771864,
+      "grad_norm": 0.9029939770698547,
+      "learning_rate": 0.0001906254740354958,
+      "loss": 1.7888,
+      "step": 13500
+    },
+    {
+      "epoch": 12.927756653992395,
+      "grad_norm": 0.841206967830658,
+      "learning_rate": 0.0001880972847246802,
+      "loss": 1.8005,
+      "step": 13600
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 2.1176211833953857,
+      "eval_runtime": 3.6226,
+      "eval_samples_per_second": 1955.796,
+      "eval_steps_per_second": 122.289,
+      "step": 13676
+    },
+    {
+      "epoch": 13.022813688212928,
+      "grad_norm": 0.786509096622467,
+      "learning_rate": 0.0001855690954138646,
+      "loss": 1.7784,
+      "step": 13700
+    },
+    {
+      "epoch": 13.11787072243346,
+      "grad_norm": 0.8644747734069824,
+      "learning_rate": 0.000183040906103049,
+      "loss": 1.7234,
+      "step": 13800
+    },
+    {
+      "epoch": 13.212927756653992,
+      "grad_norm": 0.8760172128677368,
+      "learning_rate": 0.00018051271679223342,
+      "loss": 1.7308,
+      "step": 13900
+    },
+    {
+      "epoch": 13.307984790874524,
+      "grad_norm": 0.7858941555023193,
+      "learning_rate": 0.0001779845274814178,
+      "loss": 1.7318,
+      "step": 14000
+    },
+    {
+      "epoch": 13.403041825095057,
+      "grad_norm": 0.8771238327026367,
+      "learning_rate": 0.00017545633817060222,
+      "loss": 1.7473,
+      "step": 14100
+    },
+    {
+      "epoch": 13.49809885931559,
+      "grad_norm": 0.8886803984642029,
+      "learning_rate": 0.00017292814885978665,
+      "loss": 1.7491,
+      "step": 14200
+    },
+    {
+      "epoch": 13.593155893536121,
+      "grad_norm": 0.8704127669334412,
+      "learning_rate": 0.00017039995954897102,
+      "loss": 1.7548,
+      "step": 14300
+    },
+    {
+      "epoch": 13.688212927756654,
+      "grad_norm": 1.2635705471038818,
+      "learning_rate": 0.00016787177023815545,
+      "loss": 1.7532,
+      "step": 14400
+    },
+    {
+      "epoch": 13.783269961977187,
+      "grad_norm": 0.9218750596046448,
+      "learning_rate": 0.00016534358092733985,
+      "loss": 1.7531,
+      "step": 14500
+    },
+    {
+      "epoch": 13.87832699619772,
+      "grad_norm": 0.9513919353485107,
+      "learning_rate": 0.00016281539161652425,
+      "loss": 1.7618,
+      "step": 14600
+    },
+    {
+      "epoch": 13.97338403041825,
+      "grad_norm": 1.010962963104248,
+      "learning_rate": 0.00016028720230570865,
+      "loss": 1.7646,
+      "step": 14700
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 2.130631923675537,
+      "eval_runtime": 3.6539,
+      "eval_samples_per_second": 1938.998,
+      "eval_steps_per_second": 121.239,
+      "step": 14728
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 21040,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.6215157665850184e+16,
+  "train_batch_size": 128,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83c747c7cd25c286914dbb5bbec4723cdf421e34ea221f411b85b150d0b96bd4
+size 5304