diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3980 +1,628 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 4.712550607287449, "eval_steps": 100, - "global_step": 494, + "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0020242914979757085, - "grad_norm": 0.19618171453475952, - "learning_rate": 4.0000000000000003e-07, - "loss": 0.6744, - "mean_token_accuracy": 0.8156972527503967, + "epoch": 0.06477732793522267, + "grad_norm": 0.08484393358230591, + "learning_rate": 2.5e-08, + "loss": 0.693, + "mean_token_accuracy": 0.8232172057032585, "step": 1 }, { - "epoch": 0.004048582995951417, - "grad_norm": 0.20514528453350067, - "learning_rate": 8.000000000000001e-07, - "loss": 0.6586, - "mean_token_accuracy": 0.8400664329528809, + "epoch": 0.12955465587044535, + "grad_norm": 0.08104647696018219, + "learning_rate": 5e-08, + "loss": 0.6361, + "mean_token_accuracy": 0.8352038823068142, "step": 2 }, { - "epoch": 0.006072874493927126, - "grad_norm": 0.1573631763458252, - "learning_rate": 1.2000000000000002e-06, - "loss": 0.576, - "mean_token_accuracy": 0.8486753702163696, + "epoch": 0.19433198380566802, + "grad_norm": 0.08257265388965607, + "learning_rate": 7.5e-08, + "loss": 0.669, + "mean_token_accuracy": 0.8248961418867111, "step": 3 }, { - "epoch": 0.008097165991902834, - "grad_norm": 0.1505558341741562, - "learning_rate": 1.6000000000000001e-06, - "loss": 0.5729, - "mean_token_accuracy": 0.860769510269165, + "epoch": 0.2591093117408907, + "grad_norm": 0.08169891685247421, + "learning_rate": 1e-07, + "loss": 0.654, + "mean_token_accuracy": 0.8317753672599792, "step": 4 }, { - "epoch": 0.010121457489878543, - "grad_norm": 0.1700662225484848, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.6392, - "mean_token_accuracy": 0.8422141671180725, + "epoch": 0.32388663967611336, + "grad_norm": 0.07086937129497528, + "learning_rate": 1.25e-07, + "loss": 0.6029, + "mean_token_accuracy": 0.8417873457074165, "step": 5 }, { - "epoch": 0.012145748987854251, - "grad_norm": 0.18006764352321625, - "learning_rate": 2.4000000000000003e-06, - "loss": 0.6843, - "mean_token_accuracy": 0.8051391839981079, + "epoch": 0.38866396761133604, + "grad_norm": 0.08497548848390579, + "learning_rate": 1.5e-07, + "loss": 0.6561, + "mean_token_accuracy": 0.8321865610778332, "step": 6 }, { - "epoch": 0.01417004048582996, - "grad_norm": 0.14205701649188995, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.608, - "mean_token_accuracy": 0.8361465334892273, + "epoch": 0.4534412955465587, + "grad_norm": 0.07375866174697876, + "learning_rate": 1.75e-07, + "loss": 0.6029, + "mean_token_accuracy": 0.8472057469189167, "step": 7 }, { - "epoch": 0.016194331983805668, - "grad_norm": 0.14348368346691132, - "learning_rate": 3.2000000000000003e-06, - "loss": 0.6215, - "mean_token_accuracy": 0.8492392897605896, + "epoch": 0.5182186234817814, + "grad_norm": 0.08089514076709747, + "learning_rate": 2e-07, + "loss": 0.6429, + "mean_token_accuracy": 0.8303790092468262, "step": 8 }, { - "epoch": 0.018218623481781375, - "grad_norm": 0.18333880603313446, - "learning_rate": 3.6000000000000003e-06, - "loss": 0.6696, - "mean_token_accuracy": 0.8336831331253052, + "epoch": 0.582995951417004, + "grad_norm": 0.07564779371023178, + "learning_rate": 1.9989008914857112e-07, + "loss": 0.6453, + "mean_token_accuracy": 0.8341928347945213, "step": 9 }, { - "epoch": 0.020242914979757085, - "grad_norm": 0.13913995027542114, - "learning_rate": 4.000000000000001e-06, - "loss": 0.6407, - "mean_token_accuracy": 0.8342278003692627, + "epoch": 0.6477732793522267, + "grad_norm": 0.08072903752326965, + "learning_rate": 1.995605982021898e-07, + "loss": 0.6476, + "mean_token_accuracy": 0.8313158191740513, "step": 10 }, { - "epoch": 0.022267206477732792, - "grad_norm": 0.14520516991615295, - "learning_rate": 4.4e-06, - "loss": 0.6821, - "mean_token_accuracy": 0.829963207244873, + "epoch": 0.7125506072874493, + "grad_norm": 0.08074145019054413, + "learning_rate": 1.9901225145346506e-07, + "loss": 0.6519, + "mean_token_accuracy": 0.8337250761687756, "step": 11 }, { - "epoch": 0.024291497975708502, - "grad_norm": 0.15494030714035034, - "learning_rate": 4.800000000000001e-06, - "loss": 0.6534, - "mean_token_accuracy": 0.8419155478477478, + "epoch": 0.7773279352226721, + "grad_norm": 0.08061974495649338, + "learning_rate": 1.9824625428755758e-07, + "loss": 0.651, + "mean_token_accuracy": 0.8378880433738232, "step": 12 }, { - "epoch": 0.02631578947368421, - "grad_norm": 0.1409451961517334, - "learning_rate": 5.2e-06, - "loss": 0.5572, - "mean_token_accuracy": 0.8476144075393677, + "epoch": 0.8421052631578947, + "grad_norm": 0.0789395272731781, + "learning_rate": 1.9726429053248129e-07, + "loss": 0.6443, + "mean_token_accuracy": 0.8334936276078224, "step": 13 }, { - "epoch": 0.02834008097165992, - "grad_norm": 0.11936034262180328, - "learning_rate": 5.600000000000001e-06, - "loss": 0.5364, - "mean_token_accuracy": 0.8614328503608704, + "epoch": 0.9068825910931174, + "grad_norm": 0.07814885675907135, + "learning_rate": 1.96068518757684e-07, + "loss": 0.6565, + "mean_token_accuracy": 0.8271612599492073, "step": 14 }, { - "epoch": 0.030364372469635626, - "grad_norm": 0.15133579075336456, - "learning_rate": 6e-06, - "loss": 0.6552, - "mean_token_accuracy": 0.8225256204605103, + "epoch": 0.97165991902834, + "grad_norm": 0.07767179608345032, + "learning_rate": 1.946615675290434e-07, + "loss": 0.6348, + "mean_token_accuracy": 0.8349605351686478, "step": 15 }, { - "epoch": 0.032388663967611336, - "grad_norm": 0.18947570025920868, - "learning_rate": 6.4000000000000006e-06, - "loss": 0.6948, - "mean_token_accuracy": 0.8155401349067688, + "epoch": 1.0, + "grad_norm": 0.07767179608345032, + "learning_rate": 1.9304652963070865e-07, + "loss": 0.6711, + "mean_token_accuracy": 0.8187758156231472, "step": 16 }, { - "epoch": 0.03441295546558704, - "grad_norm": 0.12663717567920685, - "learning_rate": 6.800000000000001e-06, - "loss": 0.6184, - "mean_token_accuracy": 0.8341503143310547, + "epoch": 1.0647773279352226, + "grad_norm": 0.1275881677865982, + "learning_rate": 1.9122695526648966e-07, + "loss": 0.6142, + "mean_token_accuracy": 0.838625643402338, "step": 17 }, { - "epoch": 0.03643724696356275, - "grad_norm": 0.1532672792673111, - "learning_rate": 7.2000000000000005e-06, - "loss": 0.6429, - "mean_token_accuracy": 0.8517258167266846, + "epoch": 1.1295546558704452, + "grad_norm": 0.07696238905191422, + "learning_rate": 1.8920684425573862e-07, + "loss": 0.6289, + "mean_token_accuracy": 0.835904911160469, "step": 18 }, { - "epoch": 0.038461538461538464, - "grad_norm": 0.17713797092437744, - "learning_rate": 7.600000000000001e-06, - "loss": 0.6789, - "mean_token_accuracy": 0.8132250308990479, + "epoch": 1.194331983805668, + "grad_norm": 0.08134917914867401, + "learning_rate": 1.8699063724087903e-07, + "loss": 0.6511, + "mean_token_accuracy": 0.830647598952055, "step": 19 }, { - "epoch": 0.04048582995951417, - "grad_norm": 0.12441661208868027, - "learning_rate": 8.000000000000001e-06, - "loss": 0.6009, - "mean_token_accuracy": 0.8490859866142273, + "epoch": 1.2591093117408907, + "grad_norm": 0.08185581117868423, + "learning_rate": 1.8458320592590972e-07, + "loss": 0.6395, + "mean_token_accuracy": 0.8346012011170387, "step": 20 }, { - "epoch": 0.04251012145748988, - "grad_norm": 0.1268952190876007, - "learning_rate": 8.400000000000001e-06, - "loss": 0.6118, - "mean_token_accuracy": 0.8372530341148376, + "epoch": 1.3238866396761133, + "grad_norm": 0.07670310139656067, + "learning_rate": 1.8198984236734245e-07, + "loss": 0.5885, + "mean_token_accuracy": 0.8427848629653454, "step": 21 }, { - "epoch": 0.044534412955465584, - "grad_norm": 0.17228662967681885, - "learning_rate": 8.8e-06, - "loss": 0.6998, - "mean_token_accuracy": 0.8195767402648926, + "epoch": 1.3886639676113361, + "grad_norm": 0.0753348097205162, + "learning_rate": 1.792162473411129e-07, + "loss": 0.6299, + "mean_token_accuracy": 0.834791149944067, "step": 22 }, { - "epoch": 0.0465587044534413, - "grad_norm": 0.17604027688503265, - "learning_rate": 9.200000000000002e-06, - "loss": 0.6755, - "mean_token_accuracy": 0.8294089436531067, + "epoch": 1.4534412955465588, + "grad_norm": 0.07955452054738998, + "learning_rate": 1.7626851781103817e-07, + "loss": 0.6499, + "mean_token_accuracy": 0.8349028266966343, "step": 23 }, { - "epoch": 0.048582995951417005, - "grad_norm": 0.14454977214336395, - "learning_rate": 9.600000000000001e-06, - "loss": 0.7248, - "mean_token_accuracy": 0.8143091201782227, + "epoch": 1.5182186234817814, + "grad_norm": 0.07731788605451584, + "learning_rate": 1.731531335263669e-07, + "loss": 0.6252, + "mean_token_accuracy": 0.8352472670376301, "step": 24 }, { - "epoch": 0.05060728744939271, - "grad_norm": 0.17172342538833618, - "learning_rate": 1e-05, - "loss": 0.5982, - "mean_token_accuracy": 0.8516228795051575, + "epoch": 1.582995951417004, + "grad_norm": 0.07537000626325607, + "learning_rate": 1.6987694277788415e-07, + "loss": 0.6405, + "mean_token_accuracy": 0.8319784663617611, "step": 25 }, { - "epoch": 0.05263157894736842, - "grad_norm": 0.19159506261348724, - "learning_rate": 1.04e-05, - "loss": 0.7188, - "mean_token_accuracy": 0.8197706341743469, + "epoch": 1.6477732793522266, + "grad_norm": 0.08128884434700012, + "learning_rate": 1.6644714734388215e-07, + "loss": 0.6501, + "mean_token_accuracy": 0.8315212316811085, "step": 26 }, { - "epoch": 0.05465587044534413, - "grad_norm": 0.12568704783916473, - "learning_rate": 1.0800000000000002e-05, - "loss": 0.4287, - "mean_token_accuracy": 0.8862559199333191, + "epoch": 1.7125506072874495, + "grad_norm": 0.08110049366950989, + "learning_rate": 1.628712866590885e-07, + "loss": 0.6727, + "mean_token_accuracy": 0.8280021287500858, "step": 27 }, { - "epoch": 0.05668016194331984, - "grad_norm": 0.13636384904384613, - "learning_rate": 1.1200000000000001e-05, - "loss": 0.5182, - "mean_token_accuracy": 0.8564677834510803, + "epoch": 1.777327935222672, + "grad_norm": 0.08107449114322662, + "learning_rate": 1.5915722124135225e-07, + "loss": 0.6888, + "mean_token_accuracy": 0.8243995904922485, "step": 28 }, { - "epoch": 0.058704453441295545, - "grad_norm": 0.15473420917987823, - "learning_rate": 1.16e-05, - "loss": 0.6351, - "mean_token_accuracy": 0.8287112712860107, + "epoch": 1.8421052631578947, + "grad_norm": 0.08128422498703003, + "learning_rate": 1.5531311541251992e-07, + "loss": 0.6592, + "mean_token_accuracy": 0.8349018841981888, "step": 29 }, { - "epoch": 0.06072874493927125, - "grad_norm": 0.14979153871536255, - "learning_rate": 1.2e-05, - "loss": 0.6087, - "mean_token_accuracy": 0.8364104628562927, + "epoch": 1.9068825910931175, + "grad_norm": 0.08368204534053802, + "learning_rate": 1.5134741935148418e-07, + "loss": 0.6931, + "mean_token_accuracy": 0.8207659162580967, "step": 30 }, { - "epoch": 0.06275303643724696, - "grad_norm": 0.1627970039844513, - "learning_rate": 1.2400000000000002e-05, - "loss": 0.6458, - "mean_token_accuracy": 0.8359329700469971, + "epoch": 1.97165991902834, + "grad_norm": 0.08243861049413681, + "learning_rate": 1.4726885051885652e-07, + "loss": 0.677, + "mean_token_accuracy": 0.829660214483738, "step": 31 }, { - "epoch": 0.06477732793522267, - "grad_norm": 0.17766571044921875, - "learning_rate": 1.2800000000000001e-05, - "loss": 0.6847, - "mean_token_accuracy": 0.8317025303840637, + "epoch": 2.0, + "grad_norm": 0.08243861049413681, + "learning_rate": 1.4308637449409705e-07, + "loss": 0.6144, + "mean_token_accuracy": 0.8387703725269863, "step": 32 }, { - "epoch": 0.06680161943319839, - "grad_norm": 0.16728782653808594, - "learning_rate": 1.3200000000000002e-05, - "loss": 0.5836, - "mean_token_accuracy": 0.8538188934326172, + "epoch": 2.064777327935223, + "grad_norm": 0.1279604583978653, + "learning_rate": 1.3880918526722496e-07, + "loss": 0.6332, + "mean_token_accuracy": 0.8366866521537304, "step": 33 }, { - "epoch": 0.06882591093117409, - "grad_norm": 0.21175438165664673, - "learning_rate": 1.3600000000000002e-05, - "loss": 0.6896, - "mean_token_accuracy": 0.8219363689422607, + "epoch": 2.1295546558704452, + "grad_norm": 0.08055976778268814, + "learning_rate": 1.344466850284333e-07, + "loss": 0.6709, + "mean_token_accuracy": 0.8283472806215286, "step": 34 }, { - "epoch": 0.0708502024291498, - "grad_norm": 0.1283571571111679, - "learning_rate": 1.4e-05, - "loss": 0.5793, - "mean_token_accuracy": 0.8478589653968811, + "epoch": 2.194331983805668, + "grad_norm": 0.08526450395584106, + "learning_rate": 1.3000846350003406e-07, + "loss": 0.6429, + "mean_token_accuracy": 0.8317065984010696, "step": 35 }, { - "epoch": 0.0728744939271255, - "grad_norm": 0.15766221284866333, - "learning_rate": 1.4400000000000001e-05, - "loss": 0.5341, - "mean_token_accuracy": 0.8500925302505493, + "epoch": 2.2591093117408905, + "grad_norm": 0.07659583538770676, + "learning_rate": 1.2550427685616765e-07, + "loss": 0.6523, + "mean_token_accuracy": 0.8286443240940571, "step": 36 }, { - "epoch": 0.07489878542510121, - "grad_norm": 0.18162991106510162, - "learning_rate": 1.48e-05, - "loss": 0.7043, - "mean_token_accuracy": 0.8179640769958496, + "epoch": 2.3238866396761133, + "grad_norm": 0.07898923009634018, + "learning_rate": 1.2094402627661445e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.8340383470058441, "step": 37 }, { - "epoch": 0.07692307692307693, - "grad_norm": 0.1570591777563095, - "learning_rate": 1.5200000000000002e-05, - "loss": 0.597, - "mean_token_accuracy": 0.8484577536582947, + "epoch": 2.388663967611336, + "grad_norm": 0.07619204372167587, + "learning_rate": 1.1633773618185301e-07, + "loss": 0.6139, + "mean_token_accuracy": 0.8424343690276146, "step": 38 }, { - "epoch": 0.07894736842105263, - "grad_norm": 0.21057575941085815, - "learning_rate": 1.5600000000000003e-05, - "loss": 0.7738, - "mean_token_accuracy": 0.7961832284927368, + "epoch": 2.4534412955465585, + "grad_norm": 0.07402335107326508, + "learning_rate": 1.1169553219720827e-07, + "loss": 0.6798, + "mean_token_accuracy": 0.8285421878099442, "step": 39 }, { - "epoch": 0.08097165991902834, - "grad_norm": 0.21168996393680573, - "learning_rate": 1.6000000000000003e-05, - "loss": 0.7407, - "mean_token_accuracy": 0.808038055896759, + "epoch": 2.5182186234817814, + "grad_norm": 0.08590105175971985, + "learning_rate": 1.0702761889452929e-07, + "loss": 0.6632, + "mean_token_accuracy": 0.8267885185778141, "step": 40 }, { - "epoch": 0.08299595141700405, - "grad_norm": 0.13393346965312958, - "learning_rate": 1.64e-05, - "loss": 0.5588, - "mean_token_accuracy": 0.8508504033088684, + "epoch": 2.582995951417004, + "grad_norm": 0.08060938864946365, + "learning_rate": 1.0234425736032605e-07, + "loss": 0.6725, + "mean_token_accuracy": 0.8298077434301376, "step": 41 }, { - "epoch": 0.08502024291497975, - "grad_norm": 0.13981293141841888, - "learning_rate": 1.6800000000000002e-05, - "loss": 0.6503, - "mean_token_accuracy": 0.8287777304649353, + "epoch": 2.6477732793522266, + "grad_norm": 0.08330174535512924, + "learning_rate": 9.765574263967395e-08, + "loss": 0.6404, + "mean_token_accuracy": 0.8322437591850758, "step": 42 }, { - "epoch": 0.08704453441295547, - "grad_norm": 0.17055265605449677, - "learning_rate": 1.72e-05, - "loss": 0.7479, - "mean_token_accuracy": 0.7955465316772461, + "epoch": 2.7125506072874495, + "grad_norm": 0.0802123174071312, + "learning_rate": 9.297238110547073e-08, + "loss": 0.6333, + "mean_token_accuracy": 0.8358821533620358, "step": 43 }, { - "epoch": 0.08906882591093117, - "grad_norm": 0.1803690642118454, - "learning_rate": 1.76e-05, - "loss": 0.6342, - "mean_token_accuracy": 0.8441314697265625, + "epoch": 2.7773279352226723, + "grad_norm": 0.07823917269706726, + "learning_rate": 8.830446780279175e-08, + "loss": 0.633, + "mean_token_accuracy": 0.8377866670489311, "step": 44 }, { - "epoch": 0.09109311740890688, - "grad_norm": 0.15695004165172577, - "learning_rate": 1.8e-05, - "loss": 0.6291, - "mean_token_accuracy": 0.8388278484344482, + "epoch": 2.8421052631578947, + "grad_norm": 0.07679135352373123, + "learning_rate": 8.366226381814697e-08, + "loss": 0.6263, + "mean_token_accuracy": 0.8360908254981041, "step": 45 }, { - "epoch": 0.0931174089068826, - "grad_norm": 0.15088340640068054, - "learning_rate": 1.8400000000000003e-05, - "loss": 0.5862, - "mean_token_accuracy": 0.845678985118866, + "epoch": 2.9068825910931175, + "grad_norm": 0.07800301164388657, + "learning_rate": 7.905597372338558e-08, + "loss": 0.6464, + "mean_token_accuracy": 0.8296581134200096, "step": 46 }, { - "epoch": 0.0951417004048583, - "grad_norm": 0.19936536252498627, - "learning_rate": 1.88e-05, - "loss": 0.683, - "mean_token_accuracy": 0.8353626132011414, + "epoch": 2.97165991902834, + "grad_norm": 0.07923697680234909, + "learning_rate": 7.449572314383236e-08, + "loss": 0.6825, + "mean_token_accuracy": 0.8269040808081627, "step": 47 }, { - "epoch": 0.09716599190283401, - "grad_norm": 0.18100468814373016, - "learning_rate": 1.9200000000000003e-05, - "loss": 0.6099, - "mean_token_accuracy": 0.8441349864006042, + "epoch": 3.0, + "grad_norm": 0.096576027572155, + "learning_rate": 6.999153649996594e-08, + "loss": 0.6081, + "mean_token_accuracy": 0.8382441401481628, "step": 48 }, { - "epoch": 0.09919028340080972, - "grad_norm": 0.17626872658729553, - "learning_rate": 1.9600000000000002e-05, - "loss": 0.6067, - "mean_token_accuracy": 0.8398520350456238, + "epoch": 3.064777327935223, + "grad_norm": 0.10343178361654282, + "learning_rate": 6.555331497156671e-08, + "loss": 0.6464, + "mean_token_accuracy": 0.8325350135564804, "step": 49 }, { - "epoch": 0.10121457489878542, - "grad_norm": 0.16862498223781586, - "learning_rate": 2e-05, - "loss": 0.6604, - "mean_token_accuracy": 0.8260050415992737, + "epoch": 3.1295546558704452, + "grad_norm": 0.07666892558336258, + "learning_rate": 6.119081473277501e-08, + "loss": 0.6379, + "mean_token_accuracy": 0.8326343894004822, "step": 50 }, { - "epoch": 0.10323886639676114, - "grad_norm": 0.21076682209968567, - "learning_rate": 1.9999749676283775e-05, - "loss": 0.7307, - "mean_token_accuracy": 0.830464243888855, + "epoch": 3.194331983805668, + "grad_norm": 0.07846437394618988, + "learning_rate": 5.691362550590296e-08, + "loss": 0.6438, + "mean_token_accuracy": 0.8362045586109161, "step": 51 }, { - "epoch": 0.10526315789473684, - "grad_norm": 0.21457180380821228, - "learning_rate": 1.999899871766749e-05, - "loss": 0.7789, - "mean_token_accuracy": 0.8072487711906433, + "epoch": 3.2591093117408905, + "grad_norm": 0.07778104394674301, + "learning_rate": 5.2731149481143456e-08, + "loss": 0.6296, + "mean_token_accuracy": 0.8366052508354187, "step": 52 }, { - "epoch": 0.10728744939271255, - "grad_norm": 0.149997279047966, - "learning_rate": 1.9997747161747696e-05, - "loss": 0.5559, - "mean_token_accuracy": 0.8497348427772522, + "epoch": 3.3238866396761133, + "grad_norm": 0.07893098890781403, + "learning_rate": 4.8652580648515785e-08, + "loss": 0.6426, + "mean_token_accuracy": 0.83216942101717, "step": 53 }, { - "epoch": 0.10931174089068826, - "grad_norm": 0.1614534556865692, - "learning_rate": 1.999599507118322e-05, - "loss": 0.6496, - "mean_token_accuracy": 0.8317460417747498, + "epoch": 3.388663967611336, + "grad_norm": 0.08473635464906693, + "learning_rate": 4.4686884587480054e-08, + "loss": 0.6913, + "mean_token_accuracy": 0.8237782865762711, "step": 54 }, { - "epoch": 0.11133603238866396, - "grad_norm": 0.3064078390598297, - "learning_rate": 1.999374253369202e-05, - "loss": 0.7523, - "mean_token_accuracy": 0.8246474266052246, + "epoch": 3.4534412955465585, + "grad_norm": 0.08035323023796082, + "learning_rate": 4.084277875864776e-08, + "loss": 0.635, + "mean_token_accuracy": 0.8352065198123455, "step": 55 }, { - "epoch": 0.11336032388663968, - "grad_norm": 0.14961189031600952, - "learning_rate": 1.999098966204682e-05, - "loss": 0.5924, - "mean_token_accuracy": 0.8457792401313782, + "epoch": 3.5182186234817814, + "grad_norm": 0.0813593938946724, + "learning_rate": 3.712871334091153e-08, + "loss": 0.6778, + "mean_token_accuracy": 0.8271778710186481, "step": 56 }, { - "epoch": 0.11538461538461539, - "grad_norm": 0.26088255643844604, - "learning_rate": 1.9987736594069417e-05, - "loss": 0.7341, - "mean_token_accuracy": 0.8132271766662598, + "epoch": 3.582995951417004, + "grad_norm": 0.07975359261035919, + "learning_rate": 3.355285265611784e-08, + "loss": 0.632, + "mean_token_accuracy": 0.8359011709690094, "step": 57 }, { - "epoch": 0.11740890688259109, - "grad_norm": 0.20378543436527252, - "learning_rate": 1.9983983492623832e-05, - "loss": 0.6641, - "mean_token_accuracy": 0.8299359083175659, + "epoch": 3.6477732793522266, + "grad_norm": 0.07455204427242279, + "learning_rate": 3.0123057222115835e-08, + "loss": 0.6287, + "mean_token_accuracy": 0.8337721861898899, "step": 58 }, { - "epoch": 0.1194331983805668, - "grad_norm": 0.24202823638916016, - "learning_rate": 1.9979730545608128e-05, - "loss": 0.854, - "mean_token_accuracy": 0.7922971248626709, + "epoch": 3.7125506072874495, + "grad_norm": 0.07816082239151001, + "learning_rate": 2.6846866473633123e-08, + "loss": 0.6399, + "mean_token_accuracy": 0.835223838686943, "step": 59 }, { - "epoch": 0.1214574898785425, - "grad_norm": 0.17491643130779266, - "learning_rate": 1.9974977965945e-05, - "loss": 0.6383, - "mean_token_accuracy": 0.8353293538093567, + "epoch": 3.7773279352226723, + "grad_norm": 0.08226487785577774, + "learning_rate": 2.3731482188961815e-08, + "loss": 0.6772, + "mean_token_accuracy": 0.8268170021474361, "step": 60 }, { - "epoch": 0.12348178137651822, - "grad_norm": 0.192088782787323, - "learning_rate": 1.996972599157113e-05, - "loss": 0.677, - "mean_token_accuracy": 0.8339276909828186, + "epoch": 3.8421052631578947, + "grad_norm": 0.08300112187862396, + "learning_rate": 2.0783752658887067e-08, + "loss": 0.6465, + "mean_token_accuracy": 0.8339819870889187, "step": 61 }, { - "epoch": 0.12550607287449392, - "grad_norm": 0.2212095856666565, - "learning_rate": 1.9963974885425267e-05, - "loss": 0.663, - "mean_token_accuracy": 0.8254847526550293, + "epoch": 3.9068825910931175, + "grad_norm": 0.08047865331172943, + "learning_rate": 1.801015763265754e-08, + "loss": 0.6524, + "mean_token_accuracy": 0.8351892940700054, "step": 62 }, { - "epoch": 0.12753036437246965, - "grad_norm": 0.16962337493896484, - "learning_rate": 1.9957724935435065e-05, - "loss": 0.7721, - "mean_token_accuracy": 0.8089033365249634, + "epoch": 3.97165991902834, + "grad_norm": 0.07589241862297058, + "learning_rate": 1.5416794074090255e-08, + "loss": 0.598, + "mean_token_accuracy": 0.8409505113959312, "step": 63 }, { - "epoch": 0.12955465587044535, - "grad_norm": 0.20441047847270966, - "learning_rate": 1.995097645450266e-05, - "loss": 0.645, - "mean_token_accuracy": 0.8362248539924622, + "epoch": 4.0, + "grad_norm": 0.07589241862297058, + "learning_rate": 1.3009362759120978e-08, + "loss": 0.6815, + "mean_token_accuracy": 0.8250831876482282, "step": 64 }, { - "epoch": 0.13157894736842105, - "grad_norm": 0.20698894560337067, - "learning_rate": 1.994372978048903e-05, - "loss": 0.7018, - "mean_token_accuracy": 0.8064516186714172, + "epoch": 4.064777327935222, + "grad_norm": 0.12950177490711212, + "learning_rate": 1.079315574426135e-08, + "loss": 0.6328, + "mean_token_accuracy": 0.836986843496561, "step": 65 }, { - "epoch": 0.13360323886639677, - "grad_norm": 0.13746251165866852, - "learning_rate": 1.9935985276197033e-05, - "loss": 0.4596, - "mean_token_accuracy": 0.8803600072860718, + "epoch": 4.129554655870446, + "grad_norm": 0.07754851877689362, + "learning_rate": 8.773044733510337e-09, + "loss": 0.6614, + "mean_token_accuracy": 0.829079158604145, "step": 66 }, { - "epoch": 0.13562753036437247, - "grad_norm": 0.21085698902606964, - "learning_rate": 1.9927743329353295e-05, - "loss": 0.6831, - "mean_token_accuracy": 0.8211008906364441, + "epoch": 4.194331983805668, + "grad_norm": 0.07672492414712906, + "learning_rate": 6.953470369291348e-09, + "loss": 0.6493, + "mean_token_accuracy": 0.8342937044799328, "step": 67 }, { - "epoch": 0.13765182186234817, - "grad_norm": 0.20604638755321503, - "learning_rate": 1.9919004352588768e-05, - "loss": 0.701, - "mean_token_accuracy": 0.8291005492210388, + "epoch": 4.2591093117408905, + "grad_norm": 0.0766540914773941, + "learning_rate": 5.338432470956589e-09, + "loss": 0.6079, + "mean_token_accuracy": 0.8416761197149754, "step": 68 }, { - "epoch": 0.1396761133603239, - "grad_norm": 0.15121905505657196, - "learning_rate": 1.9909768783418086e-05, - "loss": 0.4735, - "mean_token_accuracy": 0.8771929740905762, + "epoch": 4.323886639676114, + "grad_norm": 0.07487937808036804, + "learning_rate": 3.931481242315993e-09, + "loss": 0.6135, + "mean_token_accuracy": 0.839394424110651, "step": 69 }, { - "epoch": 0.1417004048582996, - "grad_norm": 0.13172437250614166, - "learning_rate": 1.9900037084217637e-05, - "loss": 0.4863, - "mean_token_accuracy": 0.8677502870559692, + "epoch": 4.388663967611336, + "grad_norm": 0.07662676274776459, + "learning_rate": 2.7357094675186986e-09, + "loss": 0.6908, + "mean_token_accuracy": 0.8233718760311604, "step": 70 }, { - "epoch": 0.1437246963562753, - "grad_norm": 0.17617544531822205, - "learning_rate": 1.9889809742202454e-05, - "loss": 0.5514, - "mean_token_accuracy": 0.8573604226112366, + "epoch": 4.4534412955465585, + "grad_norm": 0.08624370396137238, + "learning_rate": 1.7537457124423893e-09, + "loss": 0.649, + "mean_token_accuracy": 0.8329134620726109, "step": 71 }, { - "epoch": 0.145748987854251, - "grad_norm": 0.19886170327663422, - "learning_rate": 1.9879087269401782e-05, - "loss": 0.6655, - "mean_token_accuracy": 0.8351132273674011, + "epoch": 4.518218623481781, + "grad_norm": 0.08158352971076965, + "learning_rate": 9.877485465349057e-10, + "loss": 0.646, + "mean_token_accuracy": 0.8340884000062943, "step": 72 }, { - "epoch": 0.14777327935222673, - "grad_norm": 0.205868661403656, - "learning_rate": 1.986787020263347e-05, - "loss": 0.695, - "mean_token_accuracy": 0.8311954736709595, + "epoch": 4.582995951417004, + "grad_norm": 0.08172011375427246, + "learning_rate": 4.394017978101905e-10, + "loss": 0.6344, + "mean_token_accuracy": 0.8312402181327343, "step": 73 }, { - "epoch": 0.14979757085020243, - "grad_norm": 0.1743887960910797, - "learning_rate": 1.9856159103477085e-05, - "loss": 0.6084, - "mean_token_accuracy": 0.8449342846870422, + "epoch": 4.647773279352227, + "grad_norm": 0.07922390103340149, + "learning_rate": 1.0991085142886269e-10, + "loss": 0.666, + "mean_token_accuracy": 0.8276109620928764, "step": 74 }, { - "epoch": 0.15182186234817813, - "grad_norm": 0.22704970836639404, - "learning_rate": 1.98439545582458e-05, - "loss": 0.6138, - "mean_token_accuracy": 0.8425281047821045, + "epoch": 4.712550607287449, + "grad_norm": 0.08142900466918945, + "learning_rate": 0.0, + "loss": 0.659, + "mean_token_accuracy": 0.8328934013843536, "step": 75 }, { - "epoch": 0.15384615384615385, - "grad_norm": 0.22189851105213165, - "learning_rate": 1.9831257177957045e-05, - "loss": 0.6893, - "mean_token_accuracy": 0.8370082378387451, - "step": 76 - }, - { - "epoch": 0.15587044534412955, - "grad_norm": 0.1812412440776825, - "learning_rate": 1.9818067598301894e-05, - "loss": 0.6025, - "mean_token_accuracy": 0.8505423665046692, - "step": 77 - }, - { - "epoch": 0.15789473684210525, - "grad_norm": 0.23385366797447205, - "learning_rate": 1.9804386479613268e-05, - "loss": 0.6333, - "mean_token_accuracy": 0.827102780342102, - "step": 78 - }, - { - "epoch": 0.15991902834008098, - "grad_norm": 0.2589596211910248, - "learning_rate": 1.9790214506832868e-05, - "loss": 0.6613, - "mean_token_accuracy": 0.8239401578903198, - "step": 79 - }, - { - "epoch": 0.16194331983805668, - "grad_norm": 0.24977940320968628, - "learning_rate": 1.9775552389476865e-05, - "loss": 0.6244, - "mean_token_accuracy": 0.8267026543617249, - "step": 80 - }, - { - "epoch": 0.16396761133603238, - "grad_norm": 0.22027316689491272, - "learning_rate": 1.97604008616004e-05, - "loss": 0.6324, - "mean_token_accuracy": 0.8370981812477112, - "step": 81 - }, - { - "epoch": 0.1659919028340081, - "grad_norm": 0.1776619702577591, - "learning_rate": 1.9744760681760832e-05, - "loss": 0.5265, - "mean_token_accuracy": 0.8645498156547546, - "step": 82 - }, - { - "epoch": 0.1680161943319838, - "grad_norm": 0.19093215465545654, - "learning_rate": 1.9728632632979746e-05, - "loss": 0.6313, - "mean_token_accuracy": 0.8432900309562683, - "step": 83 - }, - { - "epoch": 0.1700404858299595, - "grad_norm": 0.22855107486248016, - "learning_rate": 1.9712017522703764e-05, - "loss": 0.5718, - "mean_token_accuracy": 0.8341137170791626, - "step": 84 - }, - { - "epoch": 0.1720647773279352, - "grad_norm": 0.20743606984615326, - "learning_rate": 1.9694916182764113e-05, - "loss": 0.574, - "mean_token_accuracy": 0.8542357087135315, - "step": 85 - }, - { - "epoch": 0.17408906882591094, - "grad_norm": 0.32714810967445374, - "learning_rate": 1.967732946933499e-05, - "loss": 0.6813, - "mean_token_accuracy": 0.8177083134651184, - "step": 86 - }, - { - "epoch": 0.17611336032388664, - "grad_norm": 0.2417590320110321, - "learning_rate": 1.9659258262890683e-05, - "loss": 0.7423, - "mean_token_accuracy": 0.8204029202461243, - "step": 87 - }, - { - "epoch": 0.17813765182186234, - "grad_norm": 0.22506733238697052, - "learning_rate": 1.9640703468161508e-05, - "loss": 0.6379, - "mean_token_accuracy": 0.835094153881073, - "step": 88 - }, - { - "epoch": 0.18016194331983806, - "grad_norm": 0.2758257985115051, - "learning_rate": 1.9621666014088495e-05, - "loss": 0.6055, - "mean_token_accuracy": 0.8465368747711182, - "step": 89 - }, - { - "epoch": 0.18218623481781376, - "grad_norm": 0.26795482635498047, - "learning_rate": 1.9602146853776894e-05, - "loss": 0.6584, - "mean_token_accuracy": 0.8275015354156494, - "step": 90 - }, - { - "epoch": 0.18421052631578946, - "grad_norm": 0.22383807599544525, - "learning_rate": 1.9582146964448457e-05, - "loss": 0.5799, - "mean_token_accuracy": 0.842945396900177, - "step": 91 - }, - { - "epoch": 0.1862348178137652, - "grad_norm": 0.21481236815452576, - "learning_rate": 1.956166734739251e-05, - "loss": 0.5439, - "mean_token_accuracy": 0.8482651710510254, - "step": 92 - }, - { - "epoch": 0.1882591093117409, - "grad_norm": 0.19826941192150116, - "learning_rate": 1.954070902791582e-05, - "loss": 0.5595, - "mean_token_accuracy": 0.8426966071128845, - "step": 93 - }, - { - "epoch": 0.1902834008097166, - "grad_norm": 0.2748395800590515, - "learning_rate": 1.9519273055291266e-05, - "loss": 0.6142, - "mean_token_accuracy": 0.8315182328224182, - "step": 94 - }, - { - "epoch": 0.19230769230769232, - "grad_norm": 0.23335261642932892, - "learning_rate": 1.949736050270532e-05, - "loss": 0.5591, - "mean_token_accuracy": 0.8485158681869507, - "step": 95 - }, - { - "epoch": 0.19433198380566802, - "grad_norm": 0.26217031478881836, - "learning_rate": 1.9474972467204298e-05, - "loss": 0.6583, - "mean_token_accuracy": 0.8374301791191101, - "step": 96 - }, - { - "epoch": 0.19635627530364372, - "grad_norm": 0.2468375861644745, - "learning_rate": 1.945211006963945e-05, - "loss": 0.4778, - "mean_token_accuracy": 0.8702101111412048, - "step": 97 - }, - { - "epoch": 0.19838056680161945, - "grad_norm": 0.2425367534160614, - "learning_rate": 1.9428774454610845e-05, - "loss": 0.5753, - "mean_token_accuracy": 0.8493397235870361, - "step": 98 - }, - { - "epoch": 0.20040485829959515, - "grad_norm": 0.21901822090148926, - "learning_rate": 1.9404966790410047e-05, - "loss": 0.6074, - "mean_token_accuracy": 0.851259708404541, - "step": 99 - }, - { - "epoch": 0.20242914979757085, - "grad_norm": 0.17670230567455292, - "learning_rate": 1.938068826896166e-05, - "loss": 0.6112, - "mean_token_accuracy": 0.8372820615768433, - "step": 100 - }, - { - "epoch": 0.20445344129554655, - "grad_norm": 0.21904608607292175, - "learning_rate": 1.9355940105763622e-05, - "loss": 0.5904, - "mean_token_accuracy": 0.8459681272506714, - "step": 101 - }, - { - "epoch": 0.20647773279352227, - "grad_norm": 0.2911970317363739, - "learning_rate": 1.9330723539826373e-05, - "loss": 0.7404, - "mean_token_accuracy": 0.8156565427780151, - "step": 102 - }, - { - "epoch": 0.20850202429149797, - "grad_norm": 0.23587605357170105, - "learning_rate": 1.930503983361081e-05, - "loss": 0.5728, - "mean_token_accuracy": 0.8425158262252808, - "step": 103 - }, - { - "epoch": 0.21052631578947367, - "grad_norm": 0.22998309135437012, - "learning_rate": 1.9278890272965097e-05, - "loss": 0.6436, - "mean_token_accuracy": 0.836202085018158, - "step": 104 - }, - { - "epoch": 0.2125506072874494, - "grad_norm": 0.21080300211906433, - "learning_rate": 1.925227616706026e-05, - "loss": 0.5063, - "mean_token_accuracy": 0.868156909942627, - "step": 105 - }, - { - "epoch": 0.2145748987854251, - "grad_norm": 0.3365795910358429, - "learning_rate": 1.9225198848324687e-05, - "loss": 0.6685, - "mean_token_accuracy": 0.8274034857749939, - "step": 106 - }, - { - "epoch": 0.2165991902834008, - "grad_norm": 0.25566524267196655, - "learning_rate": 1.9197659672377388e-05, - "loss": 0.5392, - "mean_token_accuracy": 0.8387492299079895, - "step": 107 - }, - { - "epoch": 0.21862348178137653, - "grad_norm": 0.24027182161808014, - "learning_rate": 1.9169660017960135e-05, - "loss": 0.5435, - "mean_token_accuracy": 0.8624338507652283, - "step": 108 - }, - { - "epoch": 0.22064777327935223, - "grad_norm": 0.2666286826133728, - "learning_rate": 1.9141201286868435e-05, - "loss": 0.7356, - "mean_token_accuracy": 0.8124300241470337, - "step": 109 - }, - { - "epoch": 0.22267206477732793, - "grad_norm": 0.21396684646606445, - "learning_rate": 1.911228490388136e-05, - "loss": 0.4923, - "mean_token_accuracy": 0.8706277012825012, - "step": 110 - }, - { - "epoch": 0.22469635627530365, - "grad_norm": 0.23490038514137268, - "learning_rate": 1.908291231669019e-05, - "loss": 0.651, - "mean_token_accuracy": 0.8370558619499207, - "step": 111 - }, - { - "epoch": 0.22672064777327935, - "grad_norm": 0.2001212239265442, - "learning_rate": 1.905308499582597e-05, - "loss": 0.5137, - "mean_token_accuracy": 0.8653126955032349, - "step": 112 - }, - { - "epoch": 0.22874493927125505, - "grad_norm": 0.26428893208503723, - "learning_rate": 1.9022804434585854e-05, - "loss": 0.5852, - "mean_token_accuracy": 0.8668292760848999, - "step": 113 - }, - { - "epoch": 0.23076923076923078, - "grad_norm": 0.18154019117355347, - "learning_rate": 1.8992072148958368e-05, - "loss": 0.4395, - "mean_token_accuracy": 0.8839049935340881, - "step": 114 - }, - { - "epoch": 0.23279352226720648, - "grad_norm": 0.1800609678030014, - "learning_rate": 1.8960889677547506e-05, - "loss": 0.4731, - "mean_token_accuracy": 0.8780291080474854, - "step": 115 - }, - { - "epoch": 0.23481781376518218, - "grad_norm": 0.24386590719223022, - "learning_rate": 1.8929258581495688e-05, - "loss": 0.517, - "mean_token_accuracy": 0.8482933044433594, - "step": 116 - }, - { - "epoch": 0.23684210526315788, - "grad_norm": 0.2463385909795761, - "learning_rate": 1.8897180444405615e-05, - "loss": 0.6356, - "mean_token_accuracy": 0.8325533270835876, - "step": 117 - }, - { - "epoch": 0.2388663967611336, - "grad_norm": 0.2074287086725235, - "learning_rate": 1.8864656872260985e-05, - "loss": 0.5643, - "mean_token_accuracy": 0.8535178899765015, - "step": 118 - }, - { - "epoch": 0.2408906882591093, - "grad_norm": 0.2767307758331299, - "learning_rate": 1.8831689493346095e-05, - "loss": 0.5837, - "mean_token_accuracy": 0.8493902683258057, - "step": 119 - }, - { - "epoch": 0.242914979757085, - "grad_norm": 0.2797743082046509, - "learning_rate": 1.8798279958164295e-05, - "loss": 0.6587, - "mean_token_accuracy": 0.8430637121200562, - "step": 120 - }, - { - "epoch": 0.24493927125506074, - "grad_norm": 0.28018176555633545, - "learning_rate": 1.8764429939355394e-05, - "loss": 0.5805, - "mean_token_accuracy": 0.8473895788192749, - "step": 121 - }, - { - "epoch": 0.24696356275303644, - "grad_norm": 0.1992814540863037, - "learning_rate": 1.8730141131611882e-05, - "loss": 0.529, - "mean_token_accuracy": 0.8487154841423035, - "step": 122 - }, - { - "epoch": 0.24898785425101214, - "grad_norm": 0.3097265362739563, - "learning_rate": 1.8695415251594123e-05, - "loss": 0.6846, - "mean_token_accuracy": 0.8307022452354431, - "step": 123 - }, - { - "epoch": 0.25101214574898784, - "grad_norm": 0.31372424960136414, - "learning_rate": 1.866025403784439e-05, - "loss": 0.7371, - "mean_token_accuracy": 0.8107274770736694, - "step": 124 - }, - { - "epoch": 0.25303643724696356, - "grad_norm": 0.30829596519470215, - "learning_rate": 1.8624659250699807e-05, - "loss": 0.607, - "mean_token_accuracy": 0.859689474105835, - "step": 125 - }, - { - "epoch": 0.2550607287449393, - "grad_norm": 0.29508087038993835, - "learning_rate": 1.8588632672204264e-05, - "loss": 0.4883, - "mean_token_accuracy": 0.8674641251564026, - "step": 126 - }, - { - "epoch": 0.25708502024291496, - "grad_norm": 0.2224210500717163, - "learning_rate": 1.8552176106019156e-05, - "loss": 0.5458, - "mean_token_accuracy": 0.85467129945755, - "step": 127 - }, - { - "epoch": 0.2591093117408907, - "grad_norm": 0.2578910291194916, - "learning_rate": 1.8515291377333114e-05, - "loss": 0.5492, - "mean_token_accuracy": 0.8488425016403198, - "step": 128 - }, - { - "epoch": 0.2611336032388664, - "grad_norm": 0.2848791480064392, - "learning_rate": 1.847798033277061e-05, - "loss": 0.5902, - "mean_token_accuracy": 0.854683518409729, - "step": 129 - }, - { - "epoch": 0.2631578947368421, - "grad_norm": 0.2781189978122711, - "learning_rate": 1.8440244840299507e-05, - "loss": 0.6038, - "mean_token_accuracy": 0.8376891613006592, - "step": 130 - }, - { - "epoch": 0.2651821862348178, - "grad_norm": 0.23901361227035522, - "learning_rate": 1.8402086789137547e-05, - "loss": 0.6103, - "mean_token_accuracy": 0.8358832001686096, - "step": 131 - }, - { - "epoch": 0.26720647773279355, - "grad_norm": 0.23731403052806854, - "learning_rate": 1.8363508089657763e-05, - "loss": 0.5742, - "mean_token_accuracy": 0.8496110439300537, - "step": 132 - }, - { - "epoch": 0.2692307692307692, - "grad_norm": 0.36295345425605774, - "learning_rate": 1.8324510673292844e-05, - "loss": 0.6462, - "mean_token_accuracy": 0.8392658233642578, - "step": 133 - }, - { - "epoch": 0.27125506072874495, - "grad_norm": 0.23564256727695465, - "learning_rate": 1.8285096492438424e-05, - "loss": 0.5936, - "mean_token_accuracy": 0.8450990319252014, - "step": 134 - }, - { - "epoch": 0.2732793522267207, - "grad_norm": 0.21983715891838074, - "learning_rate": 1.8245267520355348e-05, - "loss": 0.5966, - "mean_token_accuracy": 0.8399209380149841, - "step": 135 - }, - { - "epoch": 0.27530364372469635, - "grad_norm": 0.2106492817401886, - "learning_rate": 1.8205025751070878e-05, - "loss": 0.5501, - "mean_token_accuracy": 0.8545739054679871, - "step": 136 - }, - { - "epoch": 0.2773279352226721, - "grad_norm": 0.21314162015914917, - "learning_rate": 1.8164373199278858e-05, - "loss": 0.5591, - "mean_token_accuracy": 0.8543897271156311, - "step": 137 - }, - { - "epoch": 0.2793522267206478, - "grad_norm": 0.21074095368385315, - "learning_rate": 1.812331190023886e-05, - "loss": 0.4741, - "mean_token_accuracy": 0.8711554408073425, - "step": 138 - }, - { - "epoch": 0.2813765182186235, - "grad_norm": 0.23884296417236328, - "learning_rate": 1.8081843909674277e-05, - "loss": 0.5084, - "mean_token_accuracy": 0.8729857802391052, - "step": 139 - }, - { - "epoch": 0.2834008097165992, - "grad_norm": 0.3194005787372589, - "learning_rate": 1.8039971303669407e-05, - "loss": 0.6815, - "mean_token_accuracy": 0.807113528251648, - "step": 140 - }, - { - "epoch": 0.2854251012145749, - "grad_norm": 0.2949461042881012, - "learning_rate": 1.799769617856552e-05, - "loss": 0.5174, - "mean_token_accuracy": 0.8580542206764221, - "step": 141 - }, - { - "epoch": 0.2874493927125506, - "grad_norm": 0.2602043151855469, - "learning_rate": 1.79550206508559e-05, - "loss": 0.6328, - "mean_token_accuracy": 0.8295890688896179, - "step": 142 - }, - { - "epoch": 0.2894736842105263, - "grad_norm": 0.27205556631088257, - "learning_rate": 1.7911946857079886e-05, - "loss": 0.5921, - "mean_token_accuracy": 0.8282608985900879, - "step": 143 - }, - { - "epoch": 0.291497975708502, - "grad_norm": 0.23467545211315155, - "learning_rate": 1.78684769537159e-05, - "loss": 0.554, - "mean_token_accuracy": 0.8527607321739197, - "step": 144 - }, - { - "epoch": 0.2935222672064777, - "grad_norm": 0.27535098791122437, - "learning_rate": 1.78246131170735e-05, - "loss": 0.4676, - "mean_token_accuracy": 0.871345043182373, - "step": 145 - }, - { - "epoch": 0.29554655870445345, - "grad_norm": 0.25174370408058167, - "learning_rate": 1.7780357543184396e-05, - "loss": 0.5593, - "mean_token_accuracy": 0.8446115255355835, - "step": 146 - }, - { - "epoch": 0.2975708502024291, - "grad_norm": 0.2598954439163208, - "learning_rate": 1.773571244769254e-05, - "loss": 0.5884, - "mean_token_accuracy": 0.8401237726211548, - "step": 147 - }, - { - "epoch": 0.29959514170040485, - "grad_norm": 0.40423285961151123, - "learning_rate": 1.769068006574317e-05, - "loss": 0.6519, - "mean_token_accuracy": 0.8430913090705872, - "step": 148 - }, - { - "epoch": 0.3016194331983806, - "grad_norm": 0.2684638500213623, - "learning_rate": 1.7645262651870926e-05, - "loss": 0.4679, - "mean_token_accuracy": 0.8538409471511841, - "step": 149 - }, - { - "epoch": 0.30364372469635625, - "grad_norm": 0.244306743144989, - "learning_rate": 1.7599462479886976e-05, - "loss": 0.4809, - "mean_token_accuracy": 0.8723497986793518, - "step": 150 - }, - { - "epoch": 0.305668016194332, - "grad_norm": 0.27933186292648315, - "learning_rate": 1.755328184276517e-05, - "loss": 0.6049, - "mean_token_accuracy": 0.8480749130249023, - "step": 151 - }, - { - "epoch": 0.3076923076923077, - "grad_norm": 0.2595280110836029, - "learning_rate": 1.7506723052527243e-05, - "loss": 0.5375, - "mean_token_accuracy": 0.8573270440101624, - "step": 152 - }, - { - "epoch": 0.3097165991902834, - "grad_norm": 0.36356401443481445, - "learning_rate": 1.7459788440127083e-05, - "loss": 0.6101, - "mean_token_accuracy": 0.8315719962120056, - "step": 153 - }, - { - "epoch": 0.3117408906882591, - "grad_norm": 0.28417396545410156, - "learning_rate": 1.7412480355334006e-05, - "loss": 0.6359, - "mean_token_accuracy": 0.8482972383499146, - "step": 154 - }, - { - "epoch": 0.31376518218623484, - "grad_norm": 0.3155061900615692, - "learning_rate": 1.7364801166615124e-05, - "loss": 0.6643, - "mean_token_accuracy": 0.8326530456542969, - "step": 155 - }, - { - "epoch": 0.3157894736842105, - "grad_norm": 0.21922272443771362, - "learning_rate": 1.7316753261016782e-05, - "loss": 0.5184, - "mean_token_accuracy": 0.8553861379623413, - "step": 156 - }, - { - "epoch": 0.31781376518218624, - "grad_norm": 0.3232700228691101, - "learning_rate": 1.7268339044045044e-05, - "loss": 0.5586, - "mean_token_accuracy": 0.8481873273849487, - "step": 157 - }, - { - "epoch": 0.31983805668016196, - "grad_norm": 0.2255302518606186, - "learning_rate": 1.7219560939545246e-05, - "loss": 0.5903, - "mean_token_accuracy": 0.8607842922210693, - "step": 158 - }, - { - "epoch": 0.32186234817813764, - "grad_norm": 0.17876094579696655, - "learning_rate": 1.7170421389580666e-05, - "loss": 0.4914, - "mean_token_accuracy": 0.8620192408561707, - "step": 159 - }, - { - "epoch": 0.32388663967611336, - "grad_norm": 0.22846977412700653, - "learning_rate": 1.712092285431026e-05, - "loss": 0.4533, - "mean_token_accuracy": 0.8775308728218079, - "step": 160 - }, - { - "epoch": 0.3259109311740891, - "grad_norm": 0.2580243647098541, - "learning_rate": 1.7071067811865477e-05, - "loss": 0.6124, - "mean_token_accuracy": 0.8417159914970398, - "step": 161 - }, - { - "epoch": 0.32793522267206476, - "grad_norm": 0.303861141204834, - "learning_rate": 1.702085875822623e-05, - "loss": 0.5966, - "mean_token_accuracy": 0.8357366919517517, - "step": 162 - }, - { - "epoch": 0.3299595141700405, - "grad_norm": 0.26614582538604736, - "learning_rate": 1.6970298207095887e-05, - "loss": 0.5795, - "mean_token_accuracy": 0.8428571224212646, - "step": 163 - }, - { - "epoch": 0.3319838056680162, - "grad_norm": 0.2822970151901245, - "learning_rate": 1.6919388689775463e-05, - "loss": 0.5743, - "mean_token_accuracy": 0.8430072069168091, - "step": 164 - }, - { - "epoch": 0.3340080971659919, - "grad_norm": 0.29884374141693115, - "learning_rate": 1.6868132755036875e-05, - "loss": 0.6735, - "mean_token_accuracy": 0.8227627873420715, - "step": 165 - }, - { - "epoch": 0.3360323886639676, - "grad_norm": 0.32726043462753296, - "learning_rate": 1.681653296899533e-05, - "loss": 0.6551, - "mean_token_accuracy": 0.8337730765342712, - "step": 166 - }, - { - "epoch": 0.33805668016194335, - "grad_norm": 0.16429030895233154, - "learning_rate": 1.676459191498087e-05, - "loss": 0.4281, - "mean_token_accuracy": 0.8818212151527405, - "step": 167 - }, - { - "epoch": 0.340080971659919, - "grad_norm": 0.2392423003911972, - "learning_rate": 1.6712312193409032e-05, - "loss": 0.5083, - "mean_token_accuracy": 0.8621307015419006, - "step": 168 - }, - { - "epoch": 0.34210526315789475, - "grad_norm": 0.20973503589630127, - "learning_rate": 1.6659696421650645e-05, - "loss": 0.454, - "mean_token_accuracy": 0.8688760995864868, - "step": 169 - }, - { - "epoch": 0.3441295546558704, - "grad_norm": 0.18245910108089447, - "learning_rate": 1.6606747233900816e-05, - "loss": 0.4816, - "mean_token_accuracy": 0.8563218116760254, - "step": 170 - }, - { - "epoch": 0.34615384615384615, - "grad_norm": 0.2780061662197113, - "learning_rate": 1.655346728104704e-05, - "loss": 0.5072, - "mean_token_accuracy": 0.8650442361831665, - "step": 171 - }, - { - "epoch": 0.3481781376518219, - "grad_norm": 0.319735050201416, - "learning_rate": 1.6499859230536468e-05, - "loss": 0.6437, - "mean_token_accuracy": 0.8298412561416626, - "step": 172 - }, - { - "epoch": 0.35020242914979755, - "grad_norm": 0.27722129225730896, - "learning_rate": 1.6445925766242392e-05, - "loss": 0.6204, - "mean_token_accuracy": 0.8339704871177673, - "step": 173 - }, - { - "epoch": 0.3522267206477733, - "grad_norm": 0.2680011987686157, - "learning_rate": 1.639166958832985e-05, - "loss": 0.5956, - "mean_token_accuracy": 0.8411633372306824, - "step": 174 - }, - { - "epoch": 0.354251012145749, - "grad_norm": 0.34449663758277893, - "learning_rate": 1.6337093413120463e-05, - "loss": 0.5733, - "mean_token_accuracy": 0.8423880338668823, - "step": 175 - }, - { - "epoch": 0.3562753036437247, - "grad_norm": 0.24189139902591705, - "learning_rate": 1.6282199972956425e-05, - "loss": 0.5352, - "mean_token_accuracy": 0.8527716398239136, - "step": 176 - }, - { - "epoch": 0.3582995951417004, - "grad_norm": 0.31007909774780273, - "learning_rate": 1.6226992016063726e-05, - "loss": 0.5431, - "mean_token_accuracy": 0.8473648428916931, - "step": 177 - }, - { - "epoch": 0.3603238866396761, - "grad_norm": 0.19545142352581024, - "learning_rate": 1.6171472306414554e-05, - "loss": 0.4527, - "mean_token_accuracy": 0.8697916865348816, - "step": 178 - }, - { - "epoch": 0.3623481781376518, - "grad_norm": 0.35753557085990906, - "learning_rate": 1.6115643623588915e-05, - "loss": 0.5145, - "mean_token_accuracy": 0.8581011295318604, - "step": 179 - }, - { - "epoch": 0.3643724696356275, - "grad_norm": 0.23397327959537506, - "learning_rate": 1.6059508762635482e-05, - "loss": 0.4717, - "mean_token_accuracy": 0.860744297504425, - "step": 180 - }, - { - "epoch": 0.36639676113360325, - "grad_norm": 0.29638534784317017, - "learning_rate": 1.6003070533931657e-05, - "loss": 0.4542, - "mean_token_accuracy": 0.8717948794364929, - "step": 181 - }, - { - "epoch": 0.3684210526315789, - "grad_norm": 0.23162566125392914, - "learning_rate": 1.594633176304287e-05, - "loss": 0.4617, - "mean_token_accuracy": 0.8729259371757507, - "step": 182 - }, - { - "epoch": 0.37044534412955465, - "grad_norm": 0.2598213851451874, - "learning_rate": 1.588929529058111e-05, - "loss": 0.5532, - "mean_token_accuracy": 0.8479591608047485, - "step": 183 - }, - { - "epoch": 0.3724696356275304, - "grad_norm": 0.3145219385623932, - "learning_rate": 1.5831963972062734e-05, - "loss": 0.5311, - "mean_token_accuracy": 0.8493150472640991, - "step": 184 - }, - { - "epoch": 0.37449392712550605, - "grad_norm": 0.26244890689849854, - "learning_rate": 1.5774340677765483e-05, - "loss": 0.545, - "mean_token_accuracy": 0.8599779605865479, - "step": 185 - }, - { - "epoch": 0.3765182186234818, - "grad_norm": 0.1403060108423233, - "learning_rate": 1.5716428292584788e-05, - "loss": 0.3864, - "mean_token_accuracy": 0.8956466317176819, - "step": 186 - }, - { - "epoch": 0.3785425101214575, - "grad_norm": 0.3069169223308563, - "learning_rate": 1.5658229715889345e-05, - "loss": 0.5963, - "mean_token_accuracy": 0.8490465879440308, - "step": 187 - }, - { - "epoch": 0.3805668016194332, - "grad_norm": 0.27172955870628357, - "learning_rate": 1.5599747861375957e-05, - "loss": 0.5844, - "mean_token_accuracy": 0.8412802219390869, - "step": 188 - }, - { - "epoch": 0.3825910931174089, - "grad_norm": 0.2943626046180725, - "learning_rate": 1.5540985656923648e-05, - "loss": 0.5273, - "mean_token_accuracy": 0.850491464138031, - "step": 189 - }, - { - "epoch": 0.38461538461538464, - "grad_norm": 0.30295711755752563, - "learning_rate": 1.54819460444471e-05, - "loss": 0.6003, - "mean_token_accuracy": 0.8302251100540161, - "step": 190 - }, - { - "epoch": 0.3866396761133603, - "grad_norm": 0.21159401535987854, - "learning_rate": 1.5422631979749354e-05, - "loss": 0.582, - "mean_token_accuracy": 0.8241075873374939, - "step": 191 - }, - { - "epoch": 0.38866396761133604, - "grad_norm": 0.2957298755645752, - "learning_rate": 1.5363046432373824e-05, - "loss": 0.5709, - "mean_token_accuracy": 0.8461936712265015, - "step": 192 - }, - { - "epoch": 0.39068825910931176, - "grad_norm": 0.3318915069103241, - "learning_rate": 1.5303192385455652e-05, - "loss": 0.5812, - "mean_token_accuracy": 0.8348472118377686, - "step": 193 - }, - { - "epoch": 0.39271255060728744, - "grad_norm": 0.18238049745559692, - "learning_rate": 1.5243072835572319e-05, - "loss": 0.3858, - "mean_token_accuracy": 0.8846918344497681, - "step": 194 - }, - { - "epoch": 0.39473684210526316, - "grad_norm": 0.22623980045318604, - "learning_rate": 1.5182690792593659e-05, - "loss": 0.4931, - "mean_token_accuracy": 0.8711094856262207, - "step": 195 - }, - { - "epoch": 0.3967611336032389, - "grad_norm": 0.30967429280281067, - "learning_rate": 1.5122049279531143e-05, - "loss": 0.4871, - "mean_token_accuracy": 0.8649900555610657, - "step": 196 - }, - { - "epoch": 0.39878542510121456, - "grad_norm": 0.21760259568691254, - "learning_rate": 1.5061151332386565e-05, - "loss": 0.4727, - "mean_token_accuracy": 0.8606798052787781, - "step": 197 - }, - { - "epoch": 0.4008097165991903, - "grad_norm": 0.26807570457458496, - "learning_rate": 1.5000000000000002e-05, - "loss": 0.459, - "mean_token_accuracy": 0.8632004261016846, - "step": 198 - }, - { - "epoch": 0.402834008097166, - "grad_norm": 0.26478174328804016, - "learning_rate": 1.4938598343897215e-05, - "loss": 0.5557, - "mean_token_accuracy": 0.8421052694320679, - "step": 199 - }, - { - "epoch": 0.4048582995951417, - "grad_norm": 0.29175126552581787, - "learning_rate": 1.4876949438136348e-05, - "loss": 0.5912, - "mean_token_accuracy": 0.8419395685195923, - "step": 200 - }, - { - "epoch": 0.4068825910931174, - "grad_norm": 0.22912077605724335, - "learning_rate": 1.4815056369154039e-05, - "loss": 0.525, - "mean_token_accuracy": 0.8536022901535034, - "step": 201 - }, - { - "epoch": 0.4089068825910931, - "grad_norm": 0.25623202323913574, - "learning_rate": 1.47529222356109e-05, - "loss": 0.5464, - "mean_token_accuracy": 0.8433228135108948, - "step": 202 - }, - { - "epoch": 0.4109311740890688, - "grad_norm": 0.2668095529079437, - "learning_rate": 1.4690550148236371e-05, - "loss": 0.5017, - "mean_token_accuracy": 0.8548858761787415, - "step": 203 - }, - { - "epoch": 0.41295546558704455, - "grad_norm": 0.27633169293403625, - "learning_rate": 1.4627943229672992e-05, - "loss": 0.5546, - "mean_token_accuracy": 0.861094057559967, - "step": 204 - }, - { - "epoch": 0.4149797570850202, - "grad_norm": 0.2418694645166397, - "learning_rate": 1.4565104614320065e-05, - "loss": 0.5139, - "mean_token_accuracy": 0.8555907607078552, - "step": 205 - }, - { - "epoch": 0.41700404858299595, - "grad_norm": 0.2270972579717636, - "learning_rate": 1.4502037448176734e-05, - "loss": 0.4913, - "mean_token_accuracy": 0.8657086491584778, - "step": 206 - }, - { - "epoch": 0.4190283400809717, - "grad_norm": 0.22995734214782715, - "learning_rate": 1.4438744888684481e-05, - "loss": 0.4249, - "mean_token_accuracy": 0.8714148998260498, - "step": 207 - }, - { - "epoch": 0.42105263157894735, - "grad_norm": 0.25857847929000854, - "learning_rate": 1.4375230104569044e-05, - "loss": 0.4941, - "mean_token_accuracy": 0.8693322539329529, - "step": 208 - }, - { - "epoch": 0.4230769230769231, - "grad_norm": 0.26808950304985046, - "learning_rate": 1.4311496275681785e-05, - "loss": 0.5223, - "mean_token_accuracy": 0.8595696687698364, - "step": 209 - }, - { - "epoch": 0.4251012145748988, - "grad_norm": 0.22069592773914337, - "learning_rate": 1.424754659284048e-05, - "loss": 0.4852, - "mean_token_accuracy": 0.8667218685150146, - "step": 210 - }, - { - "epoch": 0.4271255060728745, - "grad_norm": 0.23519064486026764, - "learning_rate": 1.418338425766958e-05, - "loss": 0.4135, - "mean_token_accuracy": 0.8849557638168335, - "step": 211 - }, - { - "epoch": 0.4291497975708502, - "grad_norm": 0.18871621787548065, - "learning_rate": 1.4119012482439929e-05, - "loss": 0.4434, - "mean_token_accuracy": 0.875, - "step": 212 - }, - { - "epoch": 0.4311740890688259, - "grad_norm": 0.28710120916366577, - "learning_rate": 1.4054434489907916e-05, - "loss": 0.536, - "mean_token_accuracy": 0.8565318584442139, - "step": 213 - }, - { - "epoch": 0.4331983805668016, - "grad_norm": 0.2687305510044098, - "learning_rate": 1.3989653513154165e-05, - "loss": 0.5182, - "mean_token_accuracy": 0.8477508425712585, - "step": 214 - }, - { - "epoch": 0.4352226720647773, - "grad_norm": 0.20654776692390442, - "learning_rate": 1.3924672795421638e-05, - "loss": 0.4205, - "mean_token_accuracy": 0.883825957775116, - "step": 215 - }, - { - "epoch": 0.43724696356275305, - "grad_norm": 0.2604771554470062, - "learning_rate": 1.3859495589953289e-05, - "loss": 0.5191, - "mean_token_accuracy": 0.8543003797531128, - "step": 216 - }, - { - "epoch": 0.4392712550607287, - "grad_norm": 0.23390381038188934, - "learning_rate": 1.3794125159829173e-05, - "loss": 0.5133, - "mean_token_accuracy": 0.8571428656578064, - "step": 217 - }, - { - "epoch": 0.44129554655870445, - "grad_norm": 0.23897536098957062, - "learning_rate": 1.3728564777803089e-05, - "loss": 0.4926, - "mean_token_accuracy": 0.8516854047775269, - "step": 218 - }, - { - "epoch": 0.4433198380566802, - "grad_norm": 0.21264900267124176, - "learning_rate": 1.3662817726138729e-05, - "loss": 0.5032, - "mean_token_accuracy": 0.8576989769935608, - "step": 219 - }, - { - "epoch": 0.44534412955465585, - "grad_norm": 0.24573111534118652, - "learning_rate": 1.359688729644536e-05, - "loss": 0.4246, - "mean_token_accuracy": 0.8809523582458496, - "step": 220 - }, - { - "epoch": 0.4473684210526316, - "grad_norm": 0.23440654575824738, - "learning_rate": 1.3530776789513009e-05, - "loss": 0.4817, - "mean_token_accuracy": 0.868558406829834, - "step": 221 - }, - { - "epoch": 0.4493927125506073, - "grad_norm": 0.2577238976955414, - "learning_rate": 1.3464489515147239e-05, - "loss": 0.4677, - "mean_token_accuracy": 0.8709677457809448, - "step": 222 - }, - { - "epoch": 0.451417004048583, - "grad_norm": 0.244438037276268, - "learning_rate": 1.3398028792003413e-05, - "loss": 0.4473, - "mean_token_accuracy": 0.8778778910636902, - "step": 223 - }, - { - "epoch": 0.4534412955465587, - "grad_norm": 0.3506868779659271, - "learning_rate": 1.3331397947420578e-05, - "loss": 0.5558, - "mean_token_accuracy": 0.8539267182350159, - "step": 224 - }, - { - "epoch": 0.45546558704453444, - "grad_norm": 0.3051239252090454, - "learning_rate": 1.3264600317254854e-05, - "loss": 0.5066, - "mean_token_accuracy": 0.8567582368850708, - "step": 225 - }, - { - "epoch": 0.4574898785425101, - "grad_norm": 0.3017394542694092, - "learning_rate": 1.3197639245712454e-05, - "loss": 0.596, - "mean_token_accuracy": 0.8487140536308289, - "step": 226 - }, - { - "epoch": 0.45951417004048584, - "grad_norm": 0.3093607723712921, - "learning_rate": 1.3130518085182224e-05, - "loss": 0.6099, - "mean_token_accuracy": 0.8373684287071228, - "step": 227 - }, - { - "epoch": 0.46153846153846156, - "grad_norm": 0.19963714480400085, - "learning_rate": 1.3063240196067837e-05, - "loss": 0.3668, - "mean_token_accuracy": 0.8941338062286377, - "step": 228 - }, - { - "epoch": 0.46356275303643724, - "grad_norm": 0.2264312505722046, - "learning_rate": 1.2995808946619533e-05, - "loss": 0.4301, - "mean_token_accuracy": 0.8826290965080261, - "step": 229 - }, - { - "epoch": 0.46558704453441296, - "grad_norm": 0.1361990123987198, - "learning_rate": 1.2928227712765504e-05, - "loss": 0.341, - "mean_token_accuracy": 0.896472156047821, - "step": 230 - }, - { - "epoch": 0.4676113360323887, - "grad_norm": 0.2794758975505829, - "learning_rate": 1.2860499877942876e-05, - "loss": 0.4997, - "mean_token_accuracy": 0.8587601184844971, - "step": 231 - }, - { - "epoch": 0.46963562753036436, - "grad_norm": 0.2864310145378113, - "learning_rate": 1.2792628832928302e-05, - "loss": 0.4895, - "mean_token_accuracy": 0.8697621822357178, - "step": 232 - }, - { - "epoch": 0.4716599190283401, - "grad_norm": 0.2873767614364624, - "learning_rate": 1.2724617975668229e-05, - "loss": 0.5106, - "mean_token_accuracy": 0.8528000116348267, - "step": 233 - }, - { - "epoch": 0.47368421052631576, - "grad_norm": 0.3079386055469513, - "learning_rate": 1.2656470711108763e-05, - "loss": 0.6188, - "mean_token_accuracy": 0.8250824809074402, - "step": 234 - }, - { - "epoch": 0.4757085020242915, - "grad_norm": 0.27556318044662476, - "learning_rate": 1.2588190451025209e-05, - "loss": 0.4288, - "mean_token_accuracy": 0.8745723962783813, - "step": 235 - }, - { - "epoch": 0.4777327935222672, - "grad_norm": 0.20111826062202454, - "learning_rate": 1.2519780613851254e-05, - "loss": 0.4395, - "mean_token_accuracy": 0.8747698068618774, - "step": 236 - }, - { - "epoch": 0.4797570850202429, - "grad_norm": 0.3482305407524109, - "learning_rate": 1.2451244624507831e-05, - "loss": 0.5573, - "mean_token_accuracy": 0.8556756973266602, - "step": 237 - }, - { - "epoch": 0.4817813765182186, - "grad_norm": 0.3096829056739807, - "learning_rate": 1.238258591423165e-05, - "loss": 0.5096, - "mean_token_accuracy": 0.8562538623809814, - "step": 238 - }, - { - "epoch": 0.48380566801619435, - "grad_norm": 0.26353561878204346, - "learning_rate": 1.2313807920403419e-05, - "loss": 0.421, - "mean_token_accuracy": 0.8860892653465271, - "step": 239 - }, - { - "epoch": 0.48582995951417, - "grad_norm": 0.26074233651161194, - "learning_rate": 1.2244914086375726e-05, - "loss": 0.4924, - "mean_token_accuracy": 0.8551951050758362, - "step": 240 - }, - { - "epoch": 0.48785425101214575, - "grad_norm": 0.2902613580226898, - "learning_rate": 1.2175907861300698e-05, - "loss": 0.5623, - "mean_token_accuracy": 0.8518710732460022, - "step": 241 - }, - { - "epoch": 0.4898785425101215, - "grad_norm": 0.31945541501045227, - "learning_rate": 1.2106792699957264e-05, - "loss": 0.5792, - "mean_token_accuracy": 0.8478597402572632, - "step": 242 - }, - { - "epoch": 0.49190283400809715, - "grad_norm": 0.24584610760211945, - "learning_rate": 1.2037572062578238e-05, - "loss": 0.4751, - "mean_token_accuracy": 0.8572905659675598, - "step": 243 - }, - { - "epoch": 0.4939271255060729, - "grad_norm": 0.28070396184921265, - "learning_rate": 1.1968249414677055e-05, - "loss": 0.5414, - "mean_token_accuracy": 0.8388301134109497, - "step": 244 - }, - { - "epoch": 0.4959514170040486, - "grad_norm": 0.3328787386417389, - "learning_rate": 1.1898828226874284e-05, - "loss": 0.5668, - "mean_token_accuracy": 0.8313993215560913, - "step": 245 - }, - { - "epoch": 0.4979757085020243, - "grad_norm": 0.2542007863521576, - "learning_rate": 1.1829311974723868e-05, - "loss": 0.5974, - "mean_token_accuracy": 0.8257485032081604, - "step": 246 - }, - { - "epoch": 0.5, - "grad_norm": 0.22894838452339172, - "learning_rate": 1.1759704138539121e-05, - "loss": 0.381, - "mean_token_accuracy": 0.8852607607841492, - "step": 247 - }, - { - "epoch": 0.5020242914979757, - "grad_norm": 0.37377893924713135, - "learning_rate": 1.1690008203218493e-05, - "loss": 0.519, - "mean_token_accuracy": 0.8542805314064026, - "step": 248 - }, - { - "epoch": 0.5040485829959515, - "grad_norm": 0.2569988965988159, - "learning_rate": 1.1620227658071088e-05, - "loss": 0.4751, - "mean_token_accuracy": 0.8624535202980042, - "step": 249 - }, - { - "epoch": 0.5060728744939271, - "grad_norm": 0.27775225043296814, - "learning_rate": 1.155036599664198e-05, - "loss": 0.4182, - "mean_token_accuracy": 0.8656914830207825, - "step": 250 - }, - { - "epoch": 0.5080971659919028, - "grad_norm": 0.25449249148368835, - "learning_rate": 1.1480426716537316e-05, - "loss": 0.524, - "mean_token_accuracy": 0.8571428656578064, - "step": 251 - }, - { - "epoch": 0.5101214574898786, - "grad_norm": 0.28567492961883545, - "learning_rate": 1.1410413319249193e-05, - "loss": 0.5068, - "mean_token_accuracy": 0.8494461178779602, - "step": 252 - }, - { - "epoch": 0.5121457489878543, - "grad_norm": 0.2155705690383911, - "learning_rate": 1.1340329309980379e-05, - "loss": 0.3154, - "mean_token_accuracy": 0.901196300983429, - "step": 253 - }, - { - "epoch": 0.5141700404858299, - "grad_norm": 0.2748875617980957, - "learning_rate": 1.1270178197468788e-05, - "loss": 0.4494, - "mean_token_accuracy": 0.8714776635169983, - "step": 254 - }, - { - "epoch": 0.5161943319838057, - "grad_norm": 0.3891545236110687, - "learning_rate": 1.119996349381187e-05, - "loss": 0.6305, - "mean_token_accuracy": 0.8360042572021484, - "step": 255 - }, - { - "epoch": 0.5182186234817814, - "grad_norm": 0.24013206362724304, - "learning_rate": 1.112968871429073e-05, - "loss": 0.4703, - "mean_token_accuracy": 0.8685857057571411, - "step": 256 - }, - { - "epoch": 0.520242914979757, - "grad_norm": 0.3602214753627777, - "learning_rate": 1.1059357377194161e-05, - "loss": 0.548, - "mean_token_accuracy": 0.847953200340271, - "step": 257 - }, - { - "epoch": 0.5222672064777328, - "grad_norm": 0.26108884811401367, - "learning_rate": 1.09889730036425e-05, - "loss": 0.4013, - "mean_token_accuracy": 0.8851963877677917, - "step": 258 - }, - { - "epoch": 0.5242914979757085, - "grad_norm": 0.28321635723114014, - "learning_rate": 1.0918539117411334e-05, - "loss": 0.4847, - "mean_token_accuracy": 0.8601503968238831, - "step": 259 - }, - { - "epoch": 0.5263157894736842, - "grad_norm": 0.30514952540397644, - "learning_rate": 1.0848059244755093e-05, - "loss": 0.5191, - "mean_token_accuracy": 0.8522663712501526, - "step": 260 - }, - { - "epoch": 0.52834008097166, - "grad_norm": 0.3193584978580475, - "learning_rate": 1.0777536914230509e-05, - "loss": 0.5076, - "mean_token_accuracy": 0.8504902124404907, - "step": 261 - }, - { - "epoch": 0.5303643724696356, - "grad_norm": 0.34578680992126465, - "learning_rate": 1.0706975656519946e-05, - "loss": 0.6462, - "mean_token_accuracy": 0.8127555251121521, - "step": 262 - }, - { - "epoch": 0.5323886639676113, - "grad_norm": 0.22260263562202454, - "learning_rate": 1.0636379004254665e-05, - "loss": 0.382, - "mean_token_accuracy": 0.8847006559371948, - "step": 263 - }, - { - "epoch": 0.5344129554655871, - "grad_norm": 0.3325004577636719, - "learning_rate": 1.0565750491837925e-05, - "loss": 0.5118, - "mean_token_accuracy": 0.8544698357582092, - "step": 264 - }, - { - "epoch": 0.5364372469635628, - "grad_norm": 0.21064940094947815, - "learning_rate": 1.049509365526807e-05, - "loss": 0.4833, - "mean_token_accuracy": 0.8545101881027222, - "step": 265 - }, - { - "epoch": 0.5384615384615384, - "grad_norm": 0.2569122612476349, - "learning_rate": 1.0424412031961485e-05, - "loss": 0.4462, - "mean_token_accuracy": 0.8711549639701843, - "step": 266 - }, - { - "epoch": 0.5404858299595142, - "grad_norm": 0.2512376010417938, - "learning_rate": 1.0353709160575488e-05, - "loss": 0.4553, - "mean_token_accuracy": 0.8676716685295105, - "step": 267 - }, - { - "epoch": 0.5425101214574899, - "grad_norm": 0.20290617644786835, - "learning_rate": 1.0282988580831183e-05, - "loss": 0.3591, - "mean_token_accuracy": 0.8925163149833679, - "step": 268 - }, - { - "epoch": 0.5445344129554656, - "grad_norm": 0.276850163936615, - "learning_rate": 1.0212253833336237e-05, - "loss": 0.5178, - "mean_token_accuracy": 0.8581696152687073, - "step": 269 - }, - { - "epoch": 0.5465587044534413, - "grad_norm": 0.27752986550331116, - "learning_rate": 1.0141508459407622e-05, - "loss": 0.499, - "mean_token_accuracy": 0.852477490901947, - "step": 270 - }, - { - "epoch": 0.548582995951417, - "grad_norm": 0.27458417415618896, - "learning_rate": 1.0070756000894321e-05, - "loss": 0.528, - "mean_token_accuracy": 0.8500254154205322, - "step": 271 - }, - { - "epoch": 0.5506072874493927, - "grad_norm": 0.25697359442710876, - "learning_rate": 1e-05, - "loss": 0.4372, - "mean_token_accuracy": 0.8874056935310364, - "step": 272 - }, - { - "epoch": 0.5526315789473685, - "grad_norm": 0.2661818563938141, - "learning_rate": 9.929243999105682e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8609970808029175, - "step": 273 - }, - { - "epoch": 0.5546558704453441, - "grad_norm": 0.20793814957141876, - "learning_rate": 9.858491540592383e-06, - "loss": 0.4136, - "mean_token_accuracy": 0.8824717402458191, - "step": 274 - }, - { - "epoch": 0.5566801619433198, - "grad_norm": 0.20243680477142334, - "learning_rate": 9.787746166663765e-06, - "loss": 0.4097, - "mean_token_accuracy": 0.8892011046409607, - "step": 275 - }, - { - "epoch": 0.5587044534412956, - "grad_norm": 0.25867629051208496, - "learning_rate": 9.71701141916882e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8806194067001343, - "step": 276 - }, - { - "epoch": 0.5607287449392713, - "grad_norm": 0.2549677789211273, - "learning_rate": 9.646290839424515e-06, - "loss": 0.3627, - "mean_token_accuracy": 0.896221399307251, - "step": 277 - }, - { - "epoch": 0.562753036437247, - "grad_norm": 0.27826279401779175, - "learning_rate": 9.57558796803852e-06, - "loss": 0.5258, - "mean_token_accuracy": 0.8454780578613281, - "step": 278 - }, - { - "epoch": 0.5647773279352226, - "grad_norm": 0.2794351875782013, - "learning_rate": 9.504906344731933e-06, - "loss": 0.5382, - "mean_token_accuracy": 0.8399999737739563, - "step": 279 - }, - { - "epoch": 0.5668016194331984, - "grad_norm": 0.2891370952129364, - "learning_rate": 9.434249508162076e-06, - "loss": 0.5144, - "mean_token_accuracy": 0.8665386438369751, - "step": 280 - }, - { - "epoch": 0.5688259109311741, - "grad_norm": 0.2668806314468384, - "learning_rate": 9.363620995745337e-06, - "loss": 0.476, - "mean_token_accuracy": 0.8648930788040161, - "step": 281 - }, - { - "epoch": 0.5708502024291497, - "grad_norm": 0.19786901772022247, - "learning_rate": 9.293024343480056e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8725023865699768, - "step": 282 - }, - { - "epoch": 0.5728744939271255, - "grad_norm": 0.2296639382839203, - "learning_rate": 9.222463085769495e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.867986798286438, - "step": 283 - }, - { - "epoch": 0.5748987854251012, - "grad_norm": 0.2851167619228363, - "learning_rate": 9.151940755244912e-06, - "loss": 0.4893, - "mean_token_accuracy": 0.8656179904937744, - "step": 284 - }, - { - "epoch": 0.5769230769230769, - "grad_norm": 0.28831976652145386, - "learning_rate": 9.081460882588668e-06, - "loss": 0.4983, - "mean_token_accuracy": 0.8594695329666138, - "step": 285 - }, - { - "epoch": 0.5789473684210527, - "grad_norm": 0.24200013279914856, - "learning_rate": 9.011026996357504e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8737270832061768, - "step": 286 - }, - { - "epoch": 0.5809716599190283, - "grad_norm": 0.2352951318025589, - "learning_rate": 8.94064262280584e-06, - "loss": 0.3563, - "mean_token_accuracy": 0.8963627219200134, - "step": 287 - }, - { - "epoch": 0.582995951417004, - "grad_norm": 0.3065440058708191, - "learning_rate": 8.870311285709274e-06, - "loss": 0.5417, - "mean_token_accuracy": 0.8533872365951538, - "step": 288 - }, - { - "epoch": 0.5850202429149798, - "grad_norm": 0.3517458140850067, - "learning_rate": 8.80003650618813e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.874932587146759, - "step": 289 - }, - { - "epoch": 0.5870445344129555, - "grad_norm": 0.2901054620742798, - "learning_rate": 8.729821802531213e-06, - "loss": 0.5214, - "mean_token_accuracy": 0.8495886921882629, - "step": 290 - }, - { - "epoch": 0.5890688259109311, - "grad_norm": 0.24167115986347198, - "learning_rate": 8.659670690019626e-06, - "loss": 0.4589, - "mean_token_accuracy": 0.8680351972579956, - "step": 291 - }, - { - "epoch": 0.5910931174089069, - "grad_norm": 0.2503730058670044, - "learning_rate": 8.58958668075081e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.8686291575431824, - "step": 292 - }, - { - "epoch": 0.5931174089068826, - "grad_norm": 0.2157752513885498, - "learning_rate": 8.519573283462688e-06, - "loss": 0.4448, - "mean_token_accuracy": 0.8718662858009338, - "step": 293 - }, - { - "epoch": 0.5951417004048583, - "grad_norm": 0.26292556524276733, - "learning_rate": 8.449634003358022e-06, - "loss": 0.4789, - "mean_token_accuracy": 0.8660488724708557, - "step": 294 - }, - { - "epoch": 0.597165991902834, - "grad_norm": 0.2639862895011902, - "learning_rate": 8.379772341928916e-06, - "loss": 0.4656, - "mean_token_accuracy": 0.871257483959198, - "step": 295 - }, - { - "epoch": 0.5991902834008097, - "grad_norm": 0.19293735921382904, - "learning_rate": 8.309991796781512e-06, - "loss": 0.394, - "mean_token_accuracy": 0.8828034996986389, - "step": 296 - }, - { - "epoch": 0.6012145748987854, - "grad_norm": 0.28275519609451294, - "learning_rate": 8.24029586146088e-06, - "loss": 0.5222, - "mean_token_accuracy": 0.8616324663162231, - "step": 297 - }, - { - "epoch": 0.6032388663967612, - "grad_norm": 0.31240344047546387, - "learning_rate": 8.170688025276134e-06, - "loss": 0.5318, - "mean_token_accuracy": 0.8440040946006775, - "step": 298 - }, - { - "epoch": 0.6052631578947368, - "grad_norm": 0.2823457419872284, - "learning_rate": 8.101171773125716e-06, - "loss": 0.497, - "mean_token_accuracy": 0.862606942653656, - "step": 299 - }, - { - "epoch": 0.6072874493927125, - "grad_norm": 0.2254837453365326, - "learning_rate": 8.031750585322948e-06, - "loss": 0.4309, - "mean_token_accuracy": 0.8666666746139526, - "step": 300 - }, - { - "epoch": 0.6093117408906883, - "grad_norm": 0.2284829467535019, - "learning_rate": 7.962427937421763e-06, - "loss": 0.3243, - "mean_token_accuracy": 0.9150023460388184, - "step": 301 - }, - { - "epoch": 0.611336032388664, - "grad_norm": 0.1687772572040558, - "learning_rate": 7.89320730004274e-06, - "loss": 0.3102, - "mean_token_accuracy": 0.9114139676094055, - "step": 302 - }, - { - "epoch": 0.6133603238866396, - "grad_norm": 0.2668203115463257, - "learning_rate": 7.824092138699307e-06, - "loss": 0.4753, - "mean_token_accuracy": 0.8627451062202454, - "step": 303 - }, - { - "epoch": 0.6153846153846154, - "grad_norm": 0.2429952621459961, - "learning_rate": 7.755085913624274e-06, - "loss": 0.5117, - "mean_token_accuracy": 0.8498145937919617, - "step": 304 - }, - { - "epoch": 0.6174089068825911, - "grad_norm": 0.2407144159078598, - "learning_rate": 7.686192079596586e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.8928571343421936, - "step": 305 - }, - { - "epoch": 0.6194331983805668, - "grad_norm": 0.27329036593437195, - "learning_rate": 7.617414085768352e-06, - "loss": 0.4611, - "mean_token_accuracy": 0.8771044015884399, - "step": 306 - }, - { - "epoch": 0.6214574898785425, - "grad_norm": 0.2208373248577118, - "learning_rate": 7.548755375492173e-06, - "loss": 0.418, - "mean_token_accuracy": 0.8668587803840637, - "step": 307 - }, - { - "epoch": 0.6234817813765182, - "grad_norm": 0.2131100744009018, - "learning_rate": 7.480219386148751e-06, - "loss": 0.409, - "mean_token_accuracy": 0.8864306807518005, - "step": 308 - }, - { - "epoch": 0.6255060728744939, - "grad_norm": 0.27837830781936646, - "learning_rate": 7.411809548974792e-06, - "loss": 0.4889, - "mean_token_accuracy": 0.8648374080657959, - "step": 309 - }, - { - "epoch": 0.6275303643724697, - "grad_norm": 0.2820056974887848, - "learning_rate": 7.343529288891239e-06, - "loss": 0.5201, - "mean_token_accuracy": 0.8595438003540039, - "step": 310 - }, - { - "epoch": 0.6295546558704453, - "grad_norm": 0.2284225970506668, - "learning_rate": 7.275382024331773e-06, - "loss": 0.3983, - "mean_token_accuracy": 0.8888888955116272, - "step": 311 - }, - { - "epoch": 0.631578947368421, - "grad_norm": 0.2649460434913635, - "learning_rate": 7.2073711670717e-06, - "loss": 0.5232, - "mean_token_accuracy": 0.8596938848495483, - "step": 312 - }, - { - "epoch": 0.6336032388663968, - "grad_norm": 0.4928232431411743, - "learning_rate": 7.13950012205713e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.8723404407501221, - "step": 313 - }, - { - "epoch": 0.6356275303643725, - "grad_norm": 0.26822856068611145, - "learning_rate": 7.071772287234497e-06, - "loss": 0.4883, - "mean_token_accuracy": 0.8623949289321899, - "step": 314 - }, - { - "epoch": 0.6376518218623481, - "grad_norm": 0.21105557680130005, - "learning_rate": 7.004191053380469e-06, - "loss": 0.4825, - "mean_token_accuracy": 0.8558024764060974, - "step": 315 - }, - { - "epoch": 0.6396761133603239, - "grad_norm": 0.29273661971092224, - "learning_rate": 6.936759803932167e-06, - "loss": 0.5179, - "mean_token_accuracy": 0.8536452651023865, - "step": 316 - }, - { - "epoch": 0.6417004048582996, - "grad_norm": 0.21913200616836548, - "learning_rate": 6.869481914817779e-06, - "loss": 0.4951, - "mean_token_accuracy": 0.8600091338157654, - "step": 317 - }, - { - "epoch": 0.6437246963562753, - "grad_norm": 0.26065701246261597, - "learning_rate": 6.802360754287548e-06, - "loss": 0.4621, - "mean_token_accuracy": 0.8590487241744995, - "step": 318 - }, - { - "epoch": 0.645748987854251, - "grad_norm": 0.2516014873981476, - "learning_rate": 6.735399682745145e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8689809441566467, - "step": 319 - }, - { - "epoch": 0.6477732793522267, - "grad_norm": 0.3060797154903412, - "learning_rate": 6.668602052579425e-06, - "loss": 0.5212, - "mean_token_accuracy": 0.8627451062202454, - "step": 320 - }, - { - "epoch": 0.6497975708502024, - "grad_norm": 0.307005375623703, - "learning_rate": 6.601971207996592e-06, - "loss": 0.5049, - "mean_token_accuracy": 0.8486055731773376, - "step": 321 - }, - { - "epoch": 0.6518218623481782, - "grad_norm": 0.2623290717601776, - "learning_rate": 6.535510484852767e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8855451345443726, - "step": 322 - }, - { - "epoch": 0.6538461538461539, - "grad_norm": 0.31143125891685486, - "learning_rate": 6.469223210486992e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8782935738563538, - "step": 323 - }, - { - "epoch": 0.6558704453441295, - "grad_norm": 0.2084709107875824, - "learning_rate": 6.403112703554643e-06, - "loss": 0.4153, - "mean_token_accuracy": 0.8768005967140198, - "step": 324 - }, - { - "epoch": 0.6578947368421053, - "grad_norm": 0.26807963848114014, - "learning_rate": 6.337182273861273e-06, - "loss": 0.483, - "mean_token_accuracy": 0.8573106527328491, - "step": 325 - }, - { - "epoch": 0.659919028340081, - "grad_norm": 0.2525019645690918, - "learning_rate": 6.2714352221969155e-06, - "loss": 0.381, - "mean_token_accuracy": 0.8902173638343811, - "step": 326 - }, - { - "epoch": 0.6619433198380567, - "grad_norm": 0.2668350040912628, - "learning_rate": 6.205874840170833e-06, - "loss": 0.4758, - "mean_token_accuracy": 0.8671524524688721, - "step": 327 - }, - { - "epoch": 0.6639676113360324, - "grad_norm": 0.30968189239501953, - "learning_rate": 6.140504410046712e-06, - "loss": 0.4616, - "mean_token_accuracy": 0.8822922110557556, - "step": 328 - }, - { - "epoch": 0.6659919028340081, - "grad_norm": 0.22733403742313385, - "learning_rate": 6.075327204578363e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8846761584281921, - "step": 329 - }, - { - "epoch": 0.6680161943319838, - "grad_norm": 0.3149683475494385, - "learning_rate": 6.010346486845837e-06, - "loss": 0.5278, - "mean_token_accuracy": 0.8606253266334534, - "step": 330 - }, - { - "epoch": 0.6700404858299596, - "grad_norm": 0.24868245422840118, - "learning_rate": 5.945565510092086e-06, - "loss": 0.3365, - "mean_token_accuracy": 0.8998779058456421, - "step": 331 - }, - { - "epoch": 0.6720647773279352, - "grad_norm": 0.36468973755836487, - "learning_rate": 5.880987517560075e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8575851321220398, - "step": 332 - }, - { - "epoch": 0.6740890688259109, - "grad_norm": 0.29678285121917725, - "learning_rate": 5.81661574233042e-06, - "loss": 0.4878, - "mean_token_accuracy": 0.8686288595199585, - "step": 333 - }, - { - "epoch": 0.6761133603238867, - "grad_norm": 0.23850063979625702, - "learning_rate": 5.752453407159521e-06, - "loss": 0.471, - "mean_token_accuracy": 0.8644067645072937, - "step": 334 - }, - { - "epoch": 0.6781376518218624, - "grad_norm": 0.33608052134513855, - "learning_rate": 5.688503724318217e-06, - "loss": 0.5153, - "mean_token_accuracy": 0.863723635673523, - "step": 335 - }, - { - "epoch": 0.680161943319838, - "grad_norm": 0.322111576795578, - "learning_rate": 5.6247698954309616e-06, - "loss": 0.5171, - "mean_token_accuracy": 0.8543990254402161, - "step": 336 - }, - { - "epoch": 0.6821862348178138, - "grad_norm": 0.2473558485507965, - "learning_rate": 5.561255111315525e-06, - "loss": 0.3823, - "mean_token_accuracy": 0.8928987383842468, - "step": 337 - }, - { - "epoch": 0.6842105263157895, - "grad_norm": 0.2070508897304535, - "learning_rate": 5.497962551823266e-06, - "loss": 0.4532, - "mean_token_accuracy": 0.8769230842590332, - "step": 338 - }, - { - "epoch": 0.6862348178137652, - "grad_norm": 0.27380725741386414, - "learning_rate": 5.434895385679937e-06, - "loss": 0.473, - "mean_token_accuracy": 0.8681750893592834, - "step": 339 - }, - { - "epoch": 0.6882591093117408, - "grad_norm": 0.26557326316833496, - "learning_rate": 5.3720567703270135e-06, - "loss": 0.4951, - "mean_token_accuracy": 0.8644970655441284, - "step": 340 - }, - { - "epoch": 0.6902834008097166, - "grad_norm": 0.27739718556404114, - "learning_rate": 5.3094498517636324e-06, - "loss": 0.4504, - "mean_token_accuracy": 0.8617021441459656, - "step": 341 - }, - { - "epoch": 0.6923076923076923, - "grad_norm": 0.25480371713638306, - "learning_rate": 5.247077764389099e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8725548386573792, - "step": 342 - }, - { - "epoch": 0.694331983805668, - "grad_norm": 0.23700827360153198, - "learning_rate": 5.18494363084596e-06, - "loss": 0.4703, - "mean_token_accuracy": 0.8650107979774475, - "step": 343 - }, - { - "epoch": 0.6963562753036437, - "grad_norm": 0.2832289934158325, - "learning_rate": 5.1230505618636575e-06, - "loss": 0.5179, - "mean_token_accuracy": 0.8526434302330017, - "step": 344 - }, - { - "epoch": 0.6983805668016194, - "grad_norm": 0.28387221693992615, - "learning_rate": 5.061401656102791e-06, - "loss": 0.4843, - "mean_token_accuracy": 0.8752436637878418, - "step": 345 - }, - { - "epoch": 0.7004048582995951, - "grad_norm": 0.2733575701713562, - "learning_rate": 5.000000000000003e-06, - "loss": 0.4283, - "mean_token_accuracy": 0.8881770372390747, - "step": 346 - }, - { - "epoch": 0.7024291497975709, - "grad_norm": 0.17921440303325653, - "learning_rate": 4.938848667613436e-06, - "loss": 0.374, - "mean_token_accuracy": 0.8849824070930481, - "step": 347 - }, - { - "epoch": 0.7044534412955465, - "grad_norm": 0.29820355772972107, - "learning_rate": 4.8779507204688595e-06, - "loss": 0.6646, - "mean_token_accuracy": 0.8174545168876648, - "step": 348 - }, - { - "epoch": 0.7064777327935222, - "grad_norm": 0.23080453276634216, - "learning_rate": 4.817309207406347e-06, - "loss": 0.3988, - "mean_token_accuracy": 0.8915223479270935, - "step": 349 - }, - { - "epoch": 0.708502024291498, - "grad_norm": 0.23018287122249603, - "learning_rate": 4.756927164427685e-06, - "loss": 0.3978, - "mean_token_accuracy": 0.890070915222168, - "step": 350 - }, - { - "epoch": 0.7105263157894737, - "grad_norm": 0.1862955093383789, - "learning_rate": 4.696807614544352e-06, - "loss": 0.3969, - "mean_token_accuracy": 0.8772727251052856, - "step": 351 - }, - { - "epoch": 0.7125506072874493, - "grad_norm": 0.3891066610813141, - "learning_rate": 4.636953567626176e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8715929985046387, - "step": 352 - }, - { - "epoch": 0.7145748987854251, - "grad_norm": 0.32266443967819214, - "learning_rate": 4.57736802025065e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8703703880310059, - "step": 353 - }, - { - "epoch": 0.7165991902834008, - "grad_norm": 0.19085142016410828, - "learning_rate": 4.518053955552903e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.8828560709953308, - "step": 354 - }, - { - "epoch": 0.7186234817813765, - "grad_norm": 0.2669807970523834, - "learning_rate": 4.459014343076356e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8822800517082214, - "step": 355 - }, - { - "epoch": 0.7206477732793523, - "grad_norm": 0.2982715964317322, - "learning_rate": 4.400252138624047e-06, - "loss": 0.391, - "mean_token_accuracy": 0.8809523582458496, - "step": 356 - }, - { - "epoch": 0.7226720647773279, - "grad_norm": 0.2584725618362427, - "learning_rate": 4.341770284110655e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.8733572363853455, - "step": 357 - }, - { - "epoch": 0.7246963562753036, - "grad_norm": 0.18682384490966797, - "learning_rate": 4.283571707415214e-06, - "loss": 0.4377, - "mean_token_accuracy": 0.8692899346351624, - "step": 358 - }, - { - "epoch": 0.7267206477732794, - "grad_norm": 0.2730652391910553, - "learning_rate": 4.2256593222345185e-06, - "loss": 0.3772, - "mean_token_accuracy": 0.8911022543907166, - "step": 359 - }, - { - "epoch": 0.728744939271255, - "grad_norm": 0.4370400905609131, - "learning_rate": 4.168036027937267e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.8653010725975037, - "step": 360 - }, - { - "epoch": 0.7307692307692307, - "grad_norm": 0.21713419258594513, - "learning_rate": 4.1107047094188946e-06, - "loss": 0.4043, - "mean_token_accuracy": 0.8931367993354797, - "step": 361 - }, - { - "epoch": 0.7327935222672065, - "grad_norm": 0.36545848846435547, - "learning_rate": 4.053668236957135e-06, - "loss": 0.4879, - "mean_token_accuracy": 0.8663522005081177, - "step": 362 - }, - { - "epoch": 0.7348178137651822, - "grad_norm": 0.2589530050754547, - "learning_rate": 3.996929466068344e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.8690228462219238, - "step": 363 - }, - { - "epoch": 0.7368421052631579, - "grad_norm": 0.24485215544700623, - "learning_rate": 3.940491237364519e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8754062652587891, - "step": 364 - }, - { - "epoch": 0.7388663967611336, - "grad_norm": 0.23230288922786713, - "learning_rate": 3.884356376411089e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8739306330680847, - "step": 365 - }, - { - "epoch": 0.7408906882591093, - "grad_norm": 0.31604430079460144, - "learning_rate": 3.828527693585451e-06, - "loss": 0.6245, - "mean_token_accuracy": 0.8259986639022827, - "step": 366 - }, - { - "epoch": 0.742914979757085, - "grad_norm": 0.268093466758728, - "learning_rate": 3.7730079839362755e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.8759820461273193, - "step": 367 - }, - { - "epoch": 0.7449392712550608, - "grad_norm": 0.2674328684806824, - "learning_rate": 3.7178000270435765e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8691232800483704, - "step": 368 - }, - { - "epoch": 0.7469635627530364, - "grad_norm": 0.2278822362422943, - "learning_rate": 3.662906586879542e-06, - "loss": 0.3952, - "mean_token_accuracy": 0.8885647654533386, - "step": 369 - }, - { - "epoch": 0.7489878542510121, - "grad_norm": 0.2409820705652237, - "learning_rate": 3.6083304116701535e-06, - "loss": 0.4588, - "mean_token_accuracy": 0.8738207817077637, - "step": 370 - }, - { - "epoch": 0.7510121457489879, - "grad_norm": 0.2809779644012451, - "learning_rate": 3.5540742337576083e-06, - "loss": 0.4546, - "mean_token_accuracy": 0.8706353902816772, - "step": 371 - }, - { - "epoch": 0.7530364372469636, - "grad_norm": 0.2390257716178894, - "learning_rate": 3.5001407694635326e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.889140248298645, - "step": 372 - }, - { - "epoch": 0.7550607287449392, - "grad_norm": 0.36595165729522705, - "learning_rate": 3.446532718952966e-06, - "loss": 0.5713, - "mean_token_accuracy": 0.8397144675254822, - "step": 373 - }, - { - "epoch": 0.757085020242915, - "grad_norm": 0.25939351320266724, - "learning_rate": 3.3932527660991877e-06, - "loss": 0.4312, - "mean_token_accuracy": 0.8785249590873718, - "step": 374 - }, - { - "epoch": 0.7591093117408907, - "grad_norm": 0.3776956796646118, - "learning_rate": 3.340303578349361e-06, - "loss": 0.5116, - "mean_token_accuracy": 0.8577494621276855, - "step": 375 - }, - { - "epoch": 0.7611336032388664, - "grad_norm": 0.24847571551799774, - "learning_rate": 3.2876878065909714e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8745308518409729, - "step": 376 - }, - { - "epoch": 0.7631578947368421, - "grad_norm": 0.19581596553325653, - "learning_rate": 3.2354080850191328e-06, - "loss": 0.3578, - "mean_token_accuracy": 0.8948983550071716, - "step": 377 - }, - { - "epoch": 0.7651821862348178, - "grad_norm": 0.2568861246109009, - "learning_rate": 3.1834670310046735e-06, - "loss": 0.4396, - "mean_token_accuracy": 0.8738559484481812, - "step": 378 - }, - { - "epoch": 0.7672064777327935, - "grad_norm": 0.2038806527853012, - "learning_rate": 3.1318672449631283e-06, - "loss": 0.3829, - "mean_token_accuracy": 0.8911939263343811, - "step": 379 - }, - { - "epoch": 0.7692307692307693, - "grad_norm": 0.3116026222705841, - "learning_rate": 3.0806113102245395e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8694765567779541, - "step": 380 - }, - { - "epoch": 0.771255060728745, - "grad_norm": 0.24429158866405487, - "learning_rate": 3.029701792904117e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8861004114151001, - "step": 381 - }, - { - "epoch": 0.7732793522267206, - "grad_norm": 0.2990184724330902, - "learning_rate": 2.979141241773775e-06, - "loss": 0.4215, - "mean_token_accuracy": 0.883500874042511, - "step": 382 - }, - { - "epoch": 0.7753036437246964, - "grad_norm": 0.3037009835243225, - "learning_rate": 2.9289321881345257e-06, - "loss": 0.3884, - "mean_token_accuracy": 0.8895916938781738, - "step": 383 - }, - { - "epoch": 0.7773279352226721, - "grad_norm": 0.27103233337402344, - "learning_rate": 2.879077145689746e-06, - "loss": 0.4003, - "mean_token_accuracy": 0.8842247724533081, - "step": 384 - }, - { - "epoch": 0.7793522267206477, - "grad_norm": 0.2830134928226471, - "learning_rate": 2.829578610419337e-06, - "loss": 0.524, - "mean_token_accuracy": 0.8542538285255432, - "step": 385 - }, - { - "epoch": 0.7813765182186235, - "grad_norm": 0.30209383368492126, - "learning_rate": 2.780439060454756e-06, - "loss": 0.5034, - "mean_token_accuracy": 0.8675914406776428, - "step": 386 - }, - { - "epoch": 0.7834008097165992, - "grad_norm": 0.22681166231632233, - "learning_rate": 2.7316609559549568e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8747357130050659, - "step": 387 - }, - { - "epoch": 0.7854251012145749, - "grad_norm": 0.22978028655052185, - "learning_rate": 2.683246738983217e-06, - "loss": 0.3554, - "mean_token_accuracy": 0.892988920211792, - "step": 388 - }, - { - "epoch": 0.7874493927125507, - "grad_norm": 0.278886616230011, - "learning_rate": 2.6351988333848787e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8729411959648132, - "step": 389 - }, - { - "epoch": 0.7894736842105263, - "grad_norm": 0.2988165318965912, - "learning_rate": 2.587519644666001e-06, - "loss": 0.4546, - "mean_token_accuracy": 0.8679119348526001, - "step": 390 - }, - { - "epoch": 0.791497975708502, - "grad_norm": 0.3190794885158539, - "learning_rate": 2.5402115598729182e-06, - "loss": 0.4893, - "mean_token_accuracy": 0.866288959980011, - "step": 391 - }, - { - "epoch": 0.7935222672064778, - "grad_norm": 0.30137163400650024, - "learning_rate": 2.493276947472756e-06, - "loss": 0.4787, - "mean_token_accuracy": 0.8605625629425049, - "step": 392 - }, - { - "epoch": 0.7955465587044535, - "grad_norm": 0.3619798719882965, - "learning_rate": 2.446718157234832e-06, - "loss": 0.5242, - "mean_token_accuracy": 0.8560822606086731, - "step": 393 - }, - { - "epoch": 0.7975708502024291, - "grad_norm": 0.3067499101161957, - "learning_rate": 2.4005375201130275e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.8781384229660034, - "step": 394 - }, - { - "epoch": 0.7995951417004049, - "grad_norm": 0.31653648614883423, - "learning_rate": 2.354737348129077e-06, - "loss": 0.5013, - "mean_token_accuracy": 0.8644986152648926, - "step": 395 - }, - { - "epoch": 0.8016194331983806, - "grad_norm": 0.3089480996131897, - "learning_rate": 2.3093199342568316e-06, - "loss": 0.4508, - "mean_token_accuracy": 0.8853985071182251, - "step": 396 - }, - { - "epoch": 0.8036437246963563, - "grad_norm": 0.2246381789445877, - "learning_rate": 2.2642875523074613e-06, - "loss": 0.3853, - "mean_token_accuracy": 0.8819203972816467, - "step": 397 - }, - { - "epoch": 0.805668016194332, - "grad_norm": 0.29770413041114807, - "learning_rate": 2.2196424568156073e-06, - "loss": 0.5376, - "mean_token_accuracy": 0.8378896713256836, - "step": 398 - }, - { - "epoch": 0.8076923076923077, - "grad_norm": 0.333595335483551, - "learning_rate": 2.1753868829265046e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8549618124961853, - "step": 399 - }, - { - "epoch": 0.8097165991902834, - "grad_norm": 0.2043197900056839, - "learning_rate": 2.1315230462840985e-06, - "loss": 0.3687, - "mean_token_accuracy": 0.8900883197784424, - "step": 400 - }, - { - "epoch": 0.8117408906882592, - "grad_norm": 0.2769086956977844, - "learning_rate": 2.0880531429201146e-06, - "loss": 0.4606, - "mean_token_accuracy": 0.8631333708763123, - "step": 401 - }, - { - "epoch": 0.8137651821862348, - "grad_norm": 0.22918100655078888, - "learning_rate": 2.0449793491441026e-06, - "loss": 0.4826, - "mean_token_accuracy": 0.8662207126617432, - "step": 402 - }, - { - "epoch": 0.8157894736842105, - "grad_norm": 0.24323149025440216, - "learning_rate": 2.0023038214344827e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8717339634895325, - "step": 403 - }, - { - "epoch": 0.8178137651821862, - "grad_norm": 0.2817937135696411, - "learning_rate": 1.960028696330596e-06, - "loss": 0.4194, - "mean_token_accuracy": 0.8925619721412659, - "step": 404 - }, - { - "epoch": 0.819838056680162, - "grad_norm": 0.28915348649024963, - "learning_rate": 1.9181560903257234e-06, - "loss": 0.5171, - "mean_token_accuracy": 0.8500000238418579, - "step": 405 - }, - { - "epoch": 0.8218623481781376, - "grad_norm": 0.17370840907096863, - "learning_rate": 1.8766880997611424e-06, - "loss": 0.3169, - "mean_token_accuracy": 0.9196190237998962, - "step": 406 - }, - { - "epoch": 0.8238866396761133, - "grad_norm": 0.3060840368270874, - "learning_rate": 1.8356268007211442e-06, - "loss": 0.4136, - "mean_token_accuracy": 0.8882314562797546, - "step": 407 - }, - { - "epoch": 0.8259109311740891, - "grad_norm": 0.19518621265888214, - "learning_rate": 1.7949742489291256e-06, - "loss": 0.3566, - "mean_token_accuracy": 0.8933706283569336, - "step": 408 - }, - { - "epoch": 0.8279352226720648, - "grad_norm": 0.2959750294685364, - "learning_rate": 1.7547324796446553e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8780388832092285, - "step": 409 - }, - { - "epoch": 0.8299595141700404, - "grad_norm": 0.24308601021766663, - "learning_rate": 1.7149035075615795e-06, - "loss": 0.3944, - "mean_token_accuracy": 0.8858375549316406, - "step": 410 - }, - { - "epoch": 0.8319838056680162, - "grad_norm": 0.22043316066265106, - "learning_rate": 1.6754893267071593e-06, - "loss": 0.3242, - "mean_token_accuracy": 0.9051490426063538, - "step": 411 - }, - { - "epoch": 0.8340080971659919, - "grad_norm": 0.33916962146759033, - "learning_rate": 1.6364919103422394e-06, - "loss": 0.5231, - "mean_token_accuracy": 0.8668769598007202, - "step": 412 - }, - { - "epoch": 0.8360323886639676, - "grad_norm": 0.3752172887325287, - "learning_rate": 1.5979132108624572e-06, - "loss": 0.5339, - "mean_token_accuracy": 0.8474466800689697, - "step": 413 - }, - { - "epoch": 0.8380566801619433, - "grad_norm": 0.2694341242313385, - "learning_rate": 1.5597551597004968e-06, - "loss": 0.4139, - "mean_token_accuracy": 0.8806228637695312, - "step": 414 - }, - { - "epoch": 0.840080971659919, - "grad_norm": 0.2853461503982544, - "learning_rate": 1.522019667229393e-06, - "loss": 0.4129, - "mean_token_accuracy": 0.8857493996620178, - "step": 415 - }, - { - "epoch": 0.8421052631578947, - "grad_norm": 0.2771718502044678, - "learning_rate": 1.4847086226668871e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8848413825035095, - "step": 416 - }, - { - "epoch": 0.8441295546558705, - "grad_norm": 0.28253501653671265, - "learning_rate": 1.4478238939808454e-06, - "loss": 0.4416, - "mean_token_accuracy": 0.8766881823539734, - "step": 417 - }, - { - "epoch": 0.8461538461538461, - "grad_norm": 0.2291014939546585, - "learning_rate": 1.4113673277957395e-06, - "loss": 0.4423, - "mean_token_accuracy": 0.8711934089660645, - "step": 418 - }, - { - "epoch": 0.8481781376518218, - "grad_norm": 0.2619485855102539, - "learning_rate": 1.3753407493001968e-06, - "loss": 0.374, - "mean_token_accuracy": 0.8914362788200378, - "step": 419 - }, - { - "epoch": 0.8502024291497976, - "grad_norm": 0.20695680379867554, - "learning_rate": 1.339745962155613e-06, - "loss": 0.3784, - "mean_token_accuracy": 0.8880422115325928, - "step": 420 - }, - { - "epoch": 0.8522267206477733, - "grad_norm": 0.24776305258274078, - "learning_rate": 1.3045847484058748e-06, - "loss": 0.3826, - "mean_token_accuracy": 0.889227032661438, - "step": 421 - }, - { - "epoch": 0.854251012145749, - "grad_norm": 0.2515615224838257, - "learning_rate": 1.2698588683881185e-06, - "loss": 0.4579, - "mean_token_accuracy": 0.8664596080780029, - "step": 422 - }, - { - "epoch": 0.8562753036437247, - "grad_norm": 0.301064133644104, - "learning_rate": 1.2355700606446119e-06, - "loss": 0.4299, - "mean_token_accuracy": 0.8724428415298462, - "step": 423 - }, - { - "epoch": 0.8582995951417004, - "grad_norm": 0.24376371502876282, - "learning_rate": 1.2017200418357077e-06, - "loss": 0.4607, - "mean_token_accuracy": 0.8685946464538574, - "step": 424 - }, - { - "epoch": 0.8603238866396761, - "grad_norm": 0.27435746788978577, - "learning_rate": 1.1683105066539068e-06, - "loss": 0.4675, - "mean_token_accuracy": 0.8655617237091064, - "step": 425 - }, - { - "epoch": 0.8623481781376519, - "grad_norm": 0.2541494071483612, - "learning_rate": 1.1353431277390125e-06, - "loss": 0.4656, - "mean_token_accuracy": 0.862048864364624, - "step": 426 - }, - { - "epoch": 0.8643724696356275, - "grad_norm": 0.27517831325531006, - "learning_rate": 1.1028195555943877e-06, - "loss": 0.4935, - "mean_token_accuracy": 0.8678010702133179, - "step": 427 - }, - { - "epoch": 0.8663967611336032, - "grad_norm": 0.22525008022785187, - "learning_rate": 1.0707414185043163e-06, - "loss": 0.3822, - "mean_token_accuracy": 0.8995452523231506, - "step": 428 - }, - { - "epoch": 0.868421052631579, - "grad_norm": 0.3820272386074066, - "learning_rate": 1.0391103224524957e-06, - "loss": 0.446, - "mean_token_accuracy": 0.8697771430015564, - "step": 429 - }, - { - "epoch": 0.8704453441295547, - "grad_norm": 0.164522185921669, - "learning_rate": 1.0079278510416313e-06, - "loss": 0.3243, - "mean_token_accuracy": 0.8974565863609314, - "step": 430 - }, - { - "epoch": 0.8724696356275303, - "grad_norm": 0.2887153923511505, - "learning_rate": 9.771955654141496e-07, - "loss": 0.457, - "mean_token_accuracy": 0.8749398589134216, - "step": 431 - }, - { - "epoch": 0.8744939271255061, - "grad_norm": 0.26364317536354065, - "learning_rate": 9.469150041740338e-07, - "loss": 0.4695, - "mean_token_accuracy": 0.8601856827735901, - "step": 432 - }, - { - "epoch": 0.8765182186234818, - "grad_norm": 0.23383867740631104, - "learning_rate": 9.170876833098119e-07, - "loss": 0.3645, - "mean_token_accuracy": 0.9048991203308105, - "step": 433 - }, - { - "epoch": 0.8785425101214575, - "grad_norm": 0.2173306941986084, - "learning_rate": 8.87715096118642e-07, - "loss": 0.458, - "mean_token_accuracy": 0.8632075190544128, - "step": 434 - }, - { - "epoch": 0.8805668016194332, - "grad_norm": 0.29172396659851074, - "learning_rate": 8.587987131315656e-07, - "loss": 0.4671, - "mean_token_accuracy": 0.8700696229934692, - "step": 435 - }, - { - "epoch": 0.8825910931174089, - "grad_norm": 0.23296132683753967, - "learning_rate": 8.303399820398672e-07, - "loss": 0.404, - "mean_token_accuracy": 0.8798627257347107, - "step": 436 - }, - { - "epoch": 0.8846153846153846, - "grad_norm": 0.25785768032073975, - "learning_rate": 8.023403276226127e-07, - "loss": 0.4228, - "mean_token_accuracy": 0.8814306855201721, - "step": 437 - }, - { - "epoch": 0.8866396761133604, - "grad_norm": 0.26605987548828125, - "learning_rate": 7.74801151675314e-07, - "loss": 0.4327, - "mean_token_accuracy": 0.8785097002983093, - "step": 438 - }, - { - "epoch": 0.888663967611336, - "grad_norm": 0.25588130950927734, - "learning_rate": 7.477238329397419e-07, - "loss": 0.4919, - "mean_token_accuracy": 0.8562639951705933, - "step": 439 - }, - { - "epoch": 0.8906882591093117, - "grad_norm": 0.2864658832550049, - "learning_rate": 7.211097270349065e-07, - "loss": 0.4974, - "mean_token_accuracy": 0.8614386916160583, - "step": 440 - }, - { - "epoch": 0.8927125506072875, - "grad_norm": 0.2550905644893646, - "learning_rate": 6.949601663891891e-07, - "loss": 0.4284, - "mean_token_accuracy": 0.8829268217086792, - "step": 441 - }, - { - "epoch": 0.8947368421052632, - "grad_norm": 0.19677165150642395, - "learning_rate": 6.692764601736268e-07, - "loss": 0.4149, - "mean_token_accuracy": 0.875516951084137, - "step": 442 - }, - { - "epoch": 0.8967611336032388, - "grad_norm": 0.3176872432231903, - "learning_rate": 6.440598942363796e-07, - "loss": 0.4427, - "mean_token_accuracy": 0.8617944717407227, - "step": 443 - }, - { - "epoch": 0.8987854251012146, - "grad_norm": 0.27307766675949097, - "learning_rate": 6.193117310383412e-07, - "loss": 0.4616, - "mean_token_accuracy": 0.8798370957374573, - "step": 444 - }, - { - "epoch": 0.9008097165991903, - "grad_norm": 0.28930559754371643, - "learning_rate": 5.950332095899547e-07, - "loss": 0.4958, - "mean_token_accuracy": 0.8664363622665405, - "step": 445 - }, - { - "epoch": 0.902834008097166, - "grad_norm": 0.24667935073375702, - "learning_rate": 5.71225545389158e-07, - "loss": 0.5217, - "mean_token_accuracy": 0.8562564849853516, - "step": 446 - }, - { - "epoch": 0.9048582995951417, - "grad_norm": 0.2966341972351074, - "learning_rate": 5.478899303605512e-07, - "loss": 0.5367, - "mean_token_accuracy": 0.8706467747688293, - "step": 447 - }, - { - "epoch": 0.9068825910931174, - "grad_norm": 0.21977975964546204, - "learning_rate": 5.250275327957033e-07, - "loss": 0.4222, - "mean_token_accuracy": 0.8754432797431946, - "step": 448 - }, - { - "epoch": 0.9089068825910931, - "grad_norm": 0.26447972655296326, - "learning_rate": 5.026394972946813e-07, - "loss": 0.4087, - "mean_token_accuracy": 0.8903620839118958, - "step": 449 - }, - { - "epoch": 0.9109311740890689, - "grad_norm": 0.2512384355068207, - "learning_rate": 4.807269447087348e-07, - "loss": 0.471, - "mean_token_accuracy": 0.8707289099693298, - "step": 450 - }, - { - "epoch": 0.9129554655870445, - "grad_norm": 0.2299729883670807, - "learning_rate": 4.592909720841843e-07, - "loss": 0.413, - "mean_token_accuracy": 0.8833592534065247, - "step": 451 - }, - { - "epoch": 0.9149797570850202, - "grad_norm": 0.2830863296985626, - "learning_rate": 4.3833265260749157e-07, - "loss": 0.4341, - "mean_token_accuracy": 0.896039605140686, - "step": 452 - }, - { - "epoch": 0.917004048582996, - "grad_norm": 0.2753170132637024, - "learning_rate": 4.178530355515409e-07, - "loss": 0.4285, - "mean_token_accuracy": 0.8766725063323975, - "step": 453 - }, - { - "epoch": 0.9190283400809717, - "grad_norm": 0.18439187109470367, - "learning_rate": 3.97853146223105e-07, - "loss": 0.3962, - "mean_token_accuracy": 0.8877434134483337, - "step": 454 - }, - { - "epoch": 0.9210526315789473, - "grad_norm": 0.3276051878929138, - "learning_rate": 3.783339859115065e-07, - "loss": 0.4733, - "mean_token_accuracy": 0.8768656849861145, - "step": 455 - }, - { - "epoch": 0.9230769230769231, - "grad_norm": 0.302237868309021, - "learning_rate": 3.5929653183849444e-07, - "loss": 0.4583, - "mean_token_accuracy": 0.8618030548095703, - "step": 456 - }, - { - "epoch": 0.9251012145748988, - "grad_norm": 0.2920514941215515, - "learning_rate": 3.4074173710931804e-07, - "loss": 0.4657, - "mean_token_accuracy": 0.8741217851638794, - "step": 457 - }, - { - "epoch": 0.9271255060728745, - "grad_norm": 0.14648732542991638, - "learning_rate": 3.226705306650113e-07, - "loss": 0.3451, - "mean_token_accuracy": 0.8986467719078064, - "step": 458 - }, - { - "epoch": 0.9291497975708503, - "grad_norm": 0.24604742228984833, - "learning_rate": 3.050838172358883e-07, - "loss": 0.3458, - "mean_token_accuracy": 0.9110772609710693, - "step": 459 - }, - { - "epoch": 0.9311740890688259, - "grad_norm": 0.24755385518074036, - "learning_rate": 2.879824772962381e-07, - "loss": 0.3859, - "mean_token_accuracy": 0.891435980796814, - "step": 460 - }, - { - "epoch": 0.9331983805668016, - "grad_norm": 0.2505938708782196, - "learning_rate": 2.7136736702025436e-07, - "loss": 0.4337, - "mean_token_accuracy": 0.8723084330558777, - "step": 461 - }, - { - "epoch": 0.9352226720647774, - "grad_norm": 0.2497698813676834, - "learning_rate": 2.552393182391677e-07, - "loss": 0.4469, - "mean_token_accuracy": 0.8665005564689636, - "step": 462 - }, - { - "epoch": 0.937246963562753, - "grad_norm": 0.3191662132740021, - "learning_rate": 2.395991383995999e-07, - "loss": 0.4424, - "mean_token_accuracy": 0.8791821599006653, - "step": 463 - }, - { - "epoch": 0.9392712550607287, - "grad_norm": 0.4106998145580292, - "learning_rate": 2.2444761052313857e-07, - "loss": 0.5896, - "mean_token_accuracy": 0.8302023410797119, - "step": 464 - }, - { - "epoch": 0.9412955465587044, - "grad_norm": 0.18561993539333344, - "learning_rate": 2.0978549316713615e-07, - "loss": 0.3504, - "mean_token_accuracy": 0.8924598097801208, - "step": 465 - }, - { - "epoch": 0.9433198380566802, - "grad_norm": 0.33718597888946533, - "learning_rate": 1.9561352038673264e-07, - "loss": 0.4549, - "mean_token_accuracy": 0.8738977313041687, - "step": 466 - }, - { - "epoch": 0.9453441295546559, - "grad_norm": 0.3026043474674225, - "learning_rate": 1.8193240169810943e-07, - "loss": 0.4668, - "mean_token_accuracy": 0.8618420958518982, - "step": 467 - }, - { - "epoch": 0.9473684210526315, - "grad_norm": 0.26501694321632385, - "learning_rate": 1.6874282204295765e-07, - "loss": 0.4307, - "mean_token_accuracy": 0.8807380795478821, - "step": 468 - }, - { - "epoch": 0.9493927125506073, - "grad_norm": 0.26896053552627563, - "learning_rate": 1.5604544175419901e-07, - "loss": 0.3877, - "mean_token_accuracy": 0.8920484185218811, - "step": 469 - }, - { - "epoch": 0.951417004048583, - "grad_norm": 0.2413828819990158, - "learning_rate": 1.4384089652291544e-07, - "loss": 0.3673, - "mean_token_accuracy": 0.8918406367301941, - "step": 470 - }, - { - "epoch": 0.9534412955465587, - "grad_norm": 0.24448032677173615, - "learning_rate": 1.3212979736653142e-07, - "loss": 0.3627, - "mean_token_accuracy": 0.8885869383811951, - "step": 471 - }, - { - "epoch": 0.9554655870445344, - "grad_norm": 0.29778435826301575, - "learning_rate": 1.209127305982205e-07, - "loss": 0.4382, - "mean_token_accuracy": 0.8771374225616455, - "step": 472 - }, - { - "epoch": 0.9574898785425101, - "grad_norm": 0.20374563336372375, - "learning_rate": 1.1019025779754666e-07, - "loss": 0.3854, - "mean_token_accuracy": 0.886918842792511, - "step": 473 - }, - { - "epoch": 0.9595141700404858, - "grad_norm": 0.33799806237220764, - "learning_rate": 9.996291578236228e-08, - "loss": 0.5051, - "mean_token_accuracy": 0.8672335147857666, - "step": 474 - }, - { - "epoch": 0.9615384615384616, - "grad_norm": 0.3109436333179474, - "learning_rate": 9.023121658191636e-08, - "loss": 0.4186, - "mean_token_accuracy": 0.8837453722953796, - "step": 475 - }, - { - "epoch": 0.9635627530364372, - "grad_norm": 0.27846312522888184, - "learning_rate": 8.099564741123167e-08, - "loss": 0.4549, - "mean_token_accuracy": 0.8804086446762085, - "step": 476 - }, - { - "epoch": 0.9655870445344129, - "grad_norm": 0.247327521443367, - "learning_rate": 7.225667064670761e-08, - "loss": 0.3735, - "mean_token_accuracy": 0.8910034894943237, - "step": 477 - }, - { - "epoch": 0.9676113360323887, - "grad_norm": 0.2059151977300644, - "learning_rate": 6.401472380297091e-08, - "loss": 0.3844, - "mean_token_accuracy": 0.8830559253692627, - "step": 478 - }, - { - "epoch": 0.9696356275303644, - "grad_norm": 0.26618608832359314, - "learning_rate": 5.6270219510975445e-08, - "loss": 0.4498, - "mean_token_accuracy": 0.8715971112251282, - "step": 479 - }, - { - "epoch": 0.97165991902834, - "grad_norm": 0.31504425406455994, - "learning_rate": 4.902354549733979e-08, - "loss": 0.4592, - "mean_token_accuracy": 0.8812423944473267, - "step": 480 - }, - { - "epoch": 0.9736842105263158, - "grad_norm": 0.24930904805660248, - "learning_rate": 4.227506456493835e-08, - "loss": 0.4226, - "mean_token_accuracy": 0.8748299479484558, - "step": 481 - }, - { - "epoch": 0.9757085020242915, - "grad_norm": 0.23372402787208557, - "learning_rate": 3.602511457473479e-08, - "loss": 0.3934, - "mean_token_accuracy": 0.8909899592399597, - "step": 482 - }, - { - "epoch": 0.9777327935222672, - "grad_norm": 0.25504592061042786, - "learning_rate": 3.027400842887218e-08, - "loss": 0.4117, - "mean_token_accuracy": 0.8802729249000549, - "step": 483 - }, - { - "epoch": 0.979757085020243, - "grad_norm": 0.2677006721496582, - "learning_rate": 2.5022034055003363e-08, - "loss": 0.3958, - "mean_token_accuracy": 0.8953003883361816, - "step": 484 - }, - { - "epoch": 0.9817813765182186, - "grad_norm": 0.32685738801956177, - "learning_rate": 2.0269454391874665e-08, - "loss": 0.4479, - "mean_token_accuracy": 0.8705402612686157, - "step": 485 - }, - { - "epoch": 0.9838056680161943, - "grad_norm": 0.267930269241333, - "learning_rate": 1.6016507376169776e-08, - "loss": 0.3837, - "mean_token_accuracy": 0.8849610686302185, - "step": 486 - }, - { - "epoch": 0.9858299595141701, - "grad_norm": 0.3171535134315491, - "learning_rate": 1.2263405930585947e-08, - "loss": 0.5383, - "mean_token_accuracy": 0.8503844141960144, - "step": 487 - }, - { - "epoch": 0.9878542510121457, - "grad_norm": 0.31351831555366516, - "learning_rate": 9.010337953185843e-09, - "loss": 0.4605, - "mean_token_accuracy": 0.8758657574653625, - "step": 488 - }, - { - "epoch": 0.9898785425101214, - "grad_norm": 0.2406328022480011, - "learning_rate": 6.257466307980631e-09, - "loss": 0.4072, - "mean_token_accuracy": 0.8733096122741699, - "step": 489 - }, - { - "epoch": 0.9919028340080972, - "grad_norm": 0.3346785604953766, - "learning_rate": 4.00492881678427e-09, - "loss": 0.4925, - "mean_token_accuracy": 0.8693274855613708, - "step": 490 - }, - { - "epoch": 0.9939271255060729, - "grad_norm": 0.26899194717407227, - "learning_rate": 2.2528382523057115e-09, - "loss": 0.4615, - "mean_token_accuracy": 0.8673709034919739, - "step": 491 - }, - { - "epoch": 0.9959514170040485, - "grad_norm": 0.19365371763706207, - "learning_rate": 1.0012823325111776e-09, - "loss": 0.3367, - "mean_token_accuracy": 0.8977272510528564, - "step": 492 - }, - { - "epoch": 0.9979757085020243, - "grad_norm": 0.2652401030063629, - "learning_rate": 2.503237162254646e-10, - "loss": 0.5036, - "mean_token_accuracy": 0.8553218841552734, - "step": 493 - }, - { - "epoch": 1.0, - "grad_norm": 0.3128429055213928, - "learning_rate": 0.0, - "loss": 0.485, - "mean_token_accuracy": 0.8615916967391968, - "step": 494 - }, - { - "epoch": 1.0, - "step": 494, - "total_flos": 1.3768541467574272e+16, - "train_loss": 0.5140652489444988, - "train_runtime": 284.5417, - "train_samples_per_second": 13.882, - "train_steps_per_second": 1.736 + "epoch": 4.712550607287449, + "step": 75, + "total_flos": 3.216438415536947e+16, + "train_loss": 0.6463175455729167, + "train_runtime": 630.4846, + "train_samples_per_second": 15.663, + "train_steps_per_second": 0.119 } ], "logging_steps": 1, - "max_steps": 494, + "max_steps": 75, "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 100, + "num_train_epochs": 5, + "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { @@ -3987,7 +635,7 @@ "attributes": {} } }, - "total_flos": 1.3768541467574272e+16, + "total_flos": 3.216438415536947e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null