diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,229754 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 32816, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021331058020477816, + "grad_norm": 2.9658357652362923, + "learning_rate": 4.875076173065205e-08, + "loss": 1.4591, + "step": 1 + }, + { + "epoch": 0.0004266211604095563, + "grad_norm": 2.920123384356365, + "learning_rate": 9.75015234613041e-08, + "loss": 1.4572, + "step": 2 + }, + { + "epoch": 0.0006399317406143345, + "grad_norm": 2.979663663437902, + "learning_rate": 1.4625228519195615e-07, + "loss": 1.532, + "step": 3 + }, + { + "epoch": 0.0008532423208191126, + "grad_norm": 2.998886225641143, + "learning_rate": 1.950030469226082e-07, + "loss": 1.4899, + "step": 4 + }, + { + "epoch": 0.0010665529010238908, + "grad_norm": 2.9333778306483422, + "learning_rate": 2.4375380865326023e-07, + "loss": 1.4759, + "step": 5 + }, + { + "epoch": 0.001279863481228669, + "grad_norm": 2.9827161754513476, + "learning_rate": 2.925045703839123e-07, + "loss": 1.5088, + "step": 6 + }, + { + "epoch": 0.0014931740614334472, + "grad_norm": 2.9765005024747206, + "learning_rate": 3.412553321145643e-07, + "loss": 1.4931, + "step": 7 + }, + { + "epoch": 0.0017064846416382253, + "grad_norm": 2.9507169800827797, + "learning_rate": 3.900060938452164e-07, + "loss": 1.4726, + "step": 8 + }, + { + "epoch": 0.0019197952218430034, + "grad_norm": 2.955841406533115, + "learning_rate": 4.387568555758684e-07, + "loss": 1.4914, + "step": 9 + }, + { + "epoch": 0.0021331058020477816, + "grad_norm": 2.9506449886028627, + "learning_rate": 4.875076173065205e-07, + "loss": 1.4876, + "step": 10 + }, + { + "epoch": 0.00234641638225256, + "grad_norm": 2.9036808805339516, + "learning_rate": 5.362583790371724e-07, + "loss": 1.4865, + "step": 11 + }, + { + "epoch": 0.002559726962457338, + "grad_norm": 2.9110918466050357, + "learning_rate": 5.850091407678246e-07, + "loss": 1.5298, + "step": 12 + }, + { + "epoch": 0.002773037542662116, + "grad_norm": 2.8864684834337004, + "learning_rate": 6.337599024984766e-07, + "loss": 1.4945, + "step": 13 + }, + { + "epoch": 0.0029863481228668944, + "grad_norm": 2.838611643037617, + "learning_rate": 6.825106642291286e-07, + "loss": 1.4693, + "step": 14 + }, + { + "epoch": 0.0031996587030716723, + "grad_norm": 2.6395929196737655, + "learning_rate": 7.312614259597807e-07, + "loss": 1.4863, + "step": 15 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": 2.6638788833558333, + "learning_rate": 7.800121876904328e-07, + "loss": 1.5154, + "step": 16 + }, + { + "epoch": 0.003626279863481229, + "grad_norm": 2.6097414359430515, + "learning_rate": 8.287629494210847e-07, + "loss": 1.469, + "step": 17 + }, + { + "epoch": 0.0038395904436860067, + "grad_norm": 2.547964488167061, + "learning_rate": 8.775137111517368e-07, + "loss": 1.4802, + "step": 18 + }, + { + "epoch": 0.0040529010238907846, + "grad_norm": 2.5739848245636696, + "learning_rate": 9.262644728823889e-07, + "loss": 1.4801, + "step": 19 + }, + { + "epoch": 0.004266211604095563, + "grad_norm": 2.006349382997552, + "learning_rate": 9.75015234613041e-07, + "loss": 1.4853, + "step": 20 + }, + { + "epoch": 0.004479522184300341, + "grad_norm": 1.9991568257205752, + "learning_rate": 1.023765996343693e-06, + "loss": 1.467, + "step": 21 + }, + { + "epoch": 0.00469283276450512, + "grad_norm": 1.945001678012569, + "learning_rate": 1.0725167580743448e-06, + "loss": 1.4602, + "step": 22 + }, + { + "epoch": 0.004906143344709898, + "grad_norm": 1.9540044419042715, + "learning_rate": 1.121267519804997e-06, + "loss": 1.4827, + "step": 23 + }, + { + "epoch": 0.005119453924914676, + "grad_norm": 1.89765888769476, + "learning_rate": 1.1700182815356492e-06, + "loss": 1.4586, + "step": 24 + }, + { + "epoch": 0.005332764505119454, + "grad_norm": 1.864967367385872, + "learning_rate": 1.2187690432663011e-06, + "loss": 1.4625, + "step": 25 + }, + { + "epoch": 0.005546075085324232, + "grad_norm": 1.8207689047514761, + "learning_rate": 1.2675198049969531e-06, + "loss": 1.4817, + "step": 26 + }, + { + "epoch": 0.00575938566552901, + "grad_norm": 1.2742986871843462, + "learning_rate": 1.316270566727605e-06, + "loss": 1.4712, + "step": 27 + }, + { + "epoch": 0.005972696245733789, + "grad_norm": 1.4165248961913217, + "learning_rate": 1.3650213284582572e-06, + "loss": 1.4058, + "step": 28 + }, + { + "epoch": 0.006186006825938567, + "grad_norm": 1.4895394403550966, + "learning_rate": 1.4137720901889094e-06, + "loss": 1.4373, + "step": 29 + }, + { + "epoch": 0.0063993174061433445, + "grad_norm": 1.4596174013728584, + "learning_rate": 1.4625228519195614e-06, + "loss": 1.4408, + "step": 30 + }, + { + "epoch": 0.006612627986348123, + "grad_norm": 1.436338411563705, + "learning_rate": 1.5112736136502135e-06, + "loss": 1.4477, + "step": 31 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 1.3181858885981355, + "learning_rate": 1.5600243753808655e-06, + "loss": 1.4568, + "step": 32 + }, + { + "epoch": 0.007039249146757679, + "grad_norm": 1.2090098884608917, + "learning_rate": 1.6087751371115177e-06, + "loss": 1.4397, + "step": 33 + }, + { + "epoch": 0.007252559726962458, + "grad_norm": 1.1203693518437228, + "learning_rate": 1.6575258988421694e-06, + "loss": 1.4457, + "step": 34 + }, + { + "epoch": 0.0074658703071672355, + "grad_norm": 1.0307297398810533, + "learning_rate": 1.7062766605728214e-06, + "loss": 1.4065, + "step": 35 + }, + { + "epoch": 0.007679180887372013, + "grad_norm": 1.2784442219657466, + "learning_rate": 1.7550274223034736e-06, + "loss": 1.4008, + "step": 36 + }, + { + "epoch": 0.007892491467576791, + "grad_norm": 1.2294618084530962, + "learning_rate": 1.8037781840341257e-06, + "loss": 1.4219, + "step": 37 + }, + { + "epoch": 0.008105802047781569, + "grad_norm": 1.0765156608759907, + "learning_rate": 1.8525289457647777e-06, + "loss": 1.3938, + "step": 38 + }, + { + "epoch": 0.008319112627986349, + "grad_norm": 0.9517818616775472, + "learning_rate": 1.9012797074954299e-06, + "loss": 1.3642, + "step": 39 + }, + { + "epoch": 0.008532423208191127, + "grad_norm": 0.9582369696193453, + "learning_rate": 1.950030469226082e-06, + "loss": 1.4345, + "step": 40 + }, + { + "epoch": 0.008745733788395904, + "grad_norm": 0.9637188160497729, + "learning_rate": 1.998781230956734e-06, + "loss": 1.4073, + "step": 41 + }, + { + "epoch": 0.008959044368600682, + "grad_norm": 0.9598605437821353, + "learning_rate": 2.047531992687386e-06, + "loss": 1.3708, + "step": 42 + }, + { + "epoch": 0.00917235494880546, + "grad_norm": 0.9266351132088498, + "learning_rate": 2.096282754418038e-06, + "loss": 1.3964, + "step": 43 + }, + { + "epoch": 0.00938566552901024, + "grad_norm": 0.8231288569820878, + "learning_rate": 2.1450335161486897e-06, + "loss": 1.3869, + "step": 44 + }, + { + "epoch": 0.009598976109215018, + "grad_norm": 0.7388105741099776, + "learning_rate": 2.193784277879342e-06, + "loss": 1.3789, + "step": 45 + }, + { + "epoch": 0.009812286689419795, + "grad_norm": 0.630713600308782, + "learning_rate": 2.242535039609994e-06, + "loss": 1.3757, + "step": 46 + }, + { + "epoch": 0.010025597269624573, + "grad_norm": 0.5364828524519228, + "learning_rate": 2.291285801340646e-06, + "loss": 1.3484, + "step": 47 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 0.5788625776396672, + "learning_rate": 2.3400365630712984e-06, + "loss": 1.3516, + "step": 48 + }, + { + "epoch": 0.010452218430034129, + "grad_norm": 0.7286242800852014, + "learning_rate": 2.38878732480195e-06, + "loss": 1.3476, + "step": 49 + }, + { + "epoch": 0.010665529010238909, + "grad_norm": 0.7745939846361763, + "learning_rate": 2.4375380865326023e-06, + "loss": 1.3383, + "step": 50 + }, + { + "epoch": 0.010878839590443687, + "grad_norm": 0.7385994204791534, + "learning_rate": 2.4862888482632545e-06, + "loss": 1.2921, + "step": 51 + }, + { + "epoch": 0.011092150170648464, + "grad_norm": 0.6770549317958093, + "learning_rate": 2.5350396099939062e-06, + "loss": 1.3341, + "step": 52 + }, + { + "epoch": 0.011305460750853242, + "grad_norm": 0.5539451154555429, + "learning_rate": 2.5837903717245584e-06, + "loss": 1.328, + "step": 53 + }, + { + "epoch": 0.01151877133105802, + "grad_norm": 0.4762019550415222, + "learning_rate": 2.63254113345521e-06, + "loss": 1.3264, + "step": 54 + }, + { + "epoch": 0.011732081911262798, + "grad_norm": 0.4420963573829151, + "learning_rate": 2.6812918951858623e-06, + "loss": 1.32, + "step": 55 + }, + { + "epoch": 0.011945392491467578, + "grad_norm": 0.527270925583055, + "learning_rate": 2.7300426569165145e-06, + "loss": 1.3125, + "step": 56 + }, + { + "epoch": 0.012158703071672355, + "grad_norm": 0.5682134699905531, + "learning_rate": 2.7787934186471667e-06, + "loss": 1.3457, + "step": 57 + }, + { + "epoch": 0.012372013651877133, + "grad_norm": 0.5614022869162921, + "learning_rate": 2.827544180377819e-06, + "loss": 1.3395, + "step": 58 + }, + { + "epoch": 0.012585324232081911, + "grad_norm": 0.5074152396583242, + "learning_rate": 2.8762949421084706e-06, + "loss": 1.3414, + "step": 59 + }, + { + "epoch": 0.012798634812286689, + "grad_norm": 0.46083364790154324, + "learning_rate": 2.9250457038391228e-06, + "loss": 1.319, + "step": 60 + }, + { + "epoch": 0.013011945392491467, + "grad_norm": 0.5256237057248203, + "learning_rate": 2.973796465569775e-06, + "loss": 1.3108, + "step": 61 + }, + { + "epoch": 0.013225255972696246, + "grad_norm": 0.4040368740229212, + "learning_rate": 3.022547227300427e-06, + "loss": 1.2754, + "step": 62 + }, + { + "epoch": 0.013438566552901024, + "grad_norm": 0.37143201901871914, + "learning_rate": 3.071297989031079e-06, + "loss": 1.3269, + "step": 63 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 0.3370923890403275, + "learning_rate": 3.120048750761731e-06, + "loss": 1.286, + "step": 64 + }, + { + "epoch": 0.01386518771331058, + "grad_norm": 0.3231712132986102, + "learning_rate": 3.168799512492383e-06, + "loss": 1.3033, + "step": 65 + }, + { + "epoch": 0.014078498293515358, + "grad_norm": 0.3728371559121103, + "learning_rate": 3.2175502742230354e-06, + "loss": 1.2988, + "step": 66 + }, + { + "epoch": 0.014291808873720136, + "grad_norm": 0.36814715716305835, + "learning_rate": 3.266301035953687e-06, + "loss": 1.3136, + "step": 67 + }, + { + "epoch": 0.014505119453924915, + "grad_norm": 0.37852069571822083, + "learning_rate": 3.315051797684339e-06, + "loss": 1.2764, + "step": 68 + }, + { + "epoch": 0.014718430034129693, + "grad_norm": 0.3137163338661027, + "learning_rate": 3.363802559414991e-06, + "loss": 1.3049, + "step": 69 + }, + { + "epoch": 0.014931740614334471, + "grad_norm": 0.27789112565694957, + "learning_rate": 3.4125533211456428e-06, + "loss": 1.26, + "step": 70 + }, + { + "epoch": 0.015145051194539249, + "grad_norm": 0.2956947441153168, + "learning_rate": 3.461304082876295e-06, + "loss": 1.2987, + "step": 71 + }, + { + "epoch": 0.015358361774744027, + "grad_norm": 0.30576931922481987, + "learning_rate": 3.510054844606947e-06, + "loss": 1.3092, + "step": 72 + }, + { + "epoch": 0.015571672354948805, + "grad_norm": 0.30432144577006126, + "learning_rate": 3.5588056063375993e-06, + "loss": 1.2765, + "step": 73 + }, + { + "epoch": 0.015784982935153583, + "grad_norm": 0.26979370707351163, + "learning_rate": 3.6075563680682515e-06, + "loss": 1.2807, + "step": 74 + }, + { + "epoch": 0.01599829351535836, + "grad_norm": 0.2731126300655708, + "learning_rate": 3.6563071297989032e-06, + "loss": 1.2854, + "step": 75 + }, + { + "epoch": 0.016211604095563138, + "grad_norm": 0.24902882101802692, + "learning_rate": 3.7050578915295554e-06, + "loss": 1.2871, + "step": 76 + }, + { + "epoch": 0.01642491467576792, + "grad_norm": 0.25742902585606964, + "learning_rate": 3.7538086532602076e-06, + "loss": 1.2899, + "step": 77 + }, + { + "epoch": 0.016638225255972697, + "grad_norm": 0.24145005965473001, + "learning_rate": 3.8025594149908597e-06, + "loss": 1.2547, + "step": 78 + }, + { + "epoch": 0.016851535836177475, + "grad_norm": 0.21933134224619025, + "learning_rate": 3.8513101767215115e-06, + "loss": 1.288, + "step": 79 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 0.2497521431633828, + "learning_rate": 3.900060938452164e-06, + "loss": 1.3026, + "step": 80 + }, + { + "epoch": 0.01727815699658703, + "grad_norm": 0.23750123342335402, + "learning_rate": 3.948811700182816e-06, + "loss": 1.2528, + "step": 81 + }, + { + "epoch": 0.01749146757679181, + "grad_norm": 0.22650920098237104, + "learning_rate": 3.997562461913468e-06, + "loss": 1.2391, + "step": 82 + }, + { + "epoch": 0.017704778156996587, + "grad_norm": 0.20445403241349117, + "learning_rate": 4.04631322364412e-06, + "loss": 1.2588, + "step": 83 + }, + { + "epoch": 0.017918088737201365, + "grad_norm": 0.21382925898394087, + "learning_rate": 4.095063985374772e-06, + "loss": 1.2687, + "step": 84 + }, + { + "epoch": 0.018131399317406142, + "grad_norm": 0.21151658508393267, + "learning_rate": 4.143814747105424e-06, + "loss": 1.2653, + "step": 85 + }, + { + "epoch": 0.01834470989761092, + "grad_norm": 0.22257710467276118, + "learning_rate": 4.192565508836076e-06, + "loss": 1.2728, + "step": 86 + }, + { + "epoch": 0.018558020477815698, + "grad_norm": 0.2061058319695634, + "learning_rate": 4.241316270566728e-06, + "loss": 1.2637, + "step": 87 + }, + { + "epoch": 0.01877133105802048, + "grad_norm": 0.21295816589186936, + "learning_rate": 4.290067032297379e-06, + "loss": 1.2467, + "step": 88 + }, + { + "epoch": 0.018984641638225257, + "grad_norm": 0.1975745952934038, + "learning_rate": 4.3388177940280315e-06, + "loss": 1.2409, + "step": 89 + }, + { + "epoch": 0.019197952218430035, + "grad_norm": 0.1790884822425659, + "learning_rate": 4.387568555758684e-06, + "loss": 1.2482, + "step": 90 + }, + { + "epoch": 0.019411262798634813, + "grad_norm": 0.20176036531657135, + "learning_rate": 4.436319317489336e-06, + "loss": 1.2729, + "step": 91 + }, + { + "epoch": 0.01962457337883959, + "grad_norm": 0.19939635763884384, + "learning_rate": 4.485070079219988e-06, + "loss": 1.2424, + "step": 92 + }, + { + "epoch": 0.01983788395904437, + "grad_norm": 0.20829155088584378, + "learning_rate": 4.53382084095064e-06, + "loss": 1.2618, + "step": 93 + }, + { + "epoch": 0.020051194539249147, + "grad_norm": 0.1921705604185282, + "learning_rate": 4.582571602681292e-06, + "loss": 1.2706, + "step": 94 + }, + { + "epoch": 0.020264505119453925, + "grad_norm": 0.17508258200718924, + "learning_rate": 4.6313223644119446e-06, + "loss": 1.2364, + "step": 95 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 0.20506045094299222, + "learning_rate": 4.680073126142597e-06, + "loss": 1.2562, + "step": 96 + }, + { + "epoch": 0.02069112627986348, + "grad_norm": 0.19432168430322394, + "learning_rate": 4.728823887873248e-06, + "loss": 1.2596, + "step": 97 + }, + { + "epoch": 0.020904436860068258, + "grad_norm": 0.1769111782357937, + "learning_rate": 4.7775746496039e-06, + "loss": 1.2136, + "step": 98 + }, + { + "epoch": 0.021117747440273036, + "grad_norm": 0.17712548595800484, + "learning_rate": 4.826325411334552e-06, + "loss": 1.2342, + "step": 99 + }, + { + "epoch": 0.021331058020477817, + "grad_norm": 0.17285890510089127, + "learning_rate": 4.875076173065205e-06, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.021544368600682595, + "grad_norm": 0.1653031345324932, + "learning_rate": 4.923826934795857e-06, + "loss": 1.273, + "step": 101 + }, + { + "epoch": 0.021757679180887373, + "grad_norm": 0.18288411201168664, + "learning_rate": 4.972577696526509e-06, + "loss": 1.2351, + "step": 102 + }, + { + "epoch": 0.02197098976109215, + "grad_norm": 0.16890897152766438, + "learning_rate": 5.021328458257161e-06, + "loss": 1.231, + "step": 103 + }, + { + "epoch": 0.02218430034129693, + "grad_norm": 0.16861615721698234, + "learning_rate": 5.0700792199878124e-06, + "loss": 1.1917, + "step": 104 + }, + { + "epoch": 0.022397610921501707, + "grad_norm": 0.1611137153425664, + "learning_rate": 5.118829981718465e-06, + "loss": 1.2391, + "step": 105 + }, + { + "epoch": 0.022610921501706484, + "grad_norm": 0.18195348338968495, + "learning_rate": 5.167580743449117e-06, + "loss": 1.2465, + "step": 106 + }, + { + "epoch": 0.022824232081911262, + "grad_norm": 0.1773041193070426, + "learning_rate": 5.216331505179769e-06, + "loss": 1.221, + "step": 107 + }, + { + "epoch": 0.02303754266211604, + "grad_norm": 0.18349418770405548, + "learning_rate": 5.26508226691042e-06, + "loss": 1.2537, + "step": 108 + }, + { + "epoch": 0.023250853242320818, + "grad_norm": 0.1736401882560067, + "learning_rate": 5.313833028641073e-06, + "loss": 1.2256, + "step": 109 + }, + { + "epoch": 0.023464163822525596, + "grad_norm": 0.18606524727264476, + "learning_rate": 5.362583790371725e-06, + "loss": 1.208, + "step": 110 + }, + { + "epoch": 0.023677474402730374, + "grad_norm": 0.18794532315860318, + "learning_rate": 5.411334552102378e-06, + "loss": 1.2551, + "step": 111 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 0.17865383692760983, + "learning_rate": 5.460085313833029e-06, + "loss": 1.2197, + "step": 112 + }, + { + "epoch": 0.024104095563139933, + "grad_norm": 0.18203614972507925, + "learning_rate": 5.508836075563682e-06, + "loss": 1.2469, + "step": 113 + }, + { + "epoch": 0.02431740614334471, + "grad_norm": 0.17067947626798285, + "learning_rate": 5.557586837294333e-06, + "loss": 1.2434, + "step": 114 + }, + { + "epoch": 0.02453071672354949, + "grad_norm": 0.16724879272027157, + "learning_rate": 5.6063375990249855e-06, + "loss": 1.2318, + "step": 115 + }, + { + "epoch": 0.024744027303754267, + "grad_norm": 0.1713351799538319, + "learning_rate": 5.655088360755638e-06, + "loss": 1.2222, + "step": 116 + }, + { + "epoch": 0.024957337883959044, + "grad_norm": 0.17790616321401265, + "learning_rate": 5.703839122486289e-06, + "loss": 1.2445, + "step": 117 + }, + { + "epoch": 0.025170648464163822, + "grad_norm": 0.1753974758487813, + "learning_rate": 5.752589884216941e-06, + "loss": 1.2365, + "step": 118 + }, + { + "epoch": 0.0253839590443686, + "grad_norm": 0.1805851908589912, + "learning_rate": 5.801340645947593e-06, + "loss": 1.2038, + "step": 119 + }, + { + "epoch": 0.025597269624573378, + "grad_norm": 0.19370293575713024, + "learning_rate": 5.8500914076782455e-06, + "loss": 1.2138, + "step": 120 + }, + { + "epoch": 0.025810580204778156, + "grad_norm": 0.20623717553184537, + "learning_rate": 5.898842169408897e-06, + "loss": 1.2792, + "step": 121 + }, + { + "epoch": 0.026023890784982934, + "grad_norm": 0.172306359792908, + "learning_rate": 5.94759293113955e-06, + "loss": 1.2019, + "step": 122 + }, + { + "epoch": 0.02623720136518771, + "grad_norm": 0.19982071246935154, + "learning_rate": 5.996343692870201e-06, + "loss": 1.2337, + "step": 123 + }, + { + "epoch": 0.026450511945392493, + "grad_norm": 0.1650468895246387, + "learning_rate": 6.045094454600854e-06, + "loss": 1.2202, + "step": 124 + }, + { + "epoch": 0.02666382252559727, + "grad_norm": 0.16647173734544551, + "learning_rate": 6.0938452163315055e-06, + "loss": 1.2158, + "step": 125 + }, + { + "epoch": 0.02687713310580205, + "grad_norm": 0.1698535688897183, + "learning_rate": 6.142595978062158e-06, + "loss": 1.1975, + "step": 126 + }, + { + "epoch": 0.027090443686006827, + "grad_norm": 0.18449144139756657, + "learning_rate": 6.19134673979281e-06, + "loss": 1.2219, + "step": 127 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 0.174529288214799, + "learning_rate": 6.240097501523462e-06, + "loss": 1.2484, + "step": 128 + }, + { + "epoch": 0.027517064846416382, + "grad_norm": 0.17541359527823078, + "learning_rate": 6.288848263254113e-06, + "loss": 1.2229, + "step": 129 + }, + { + "epoch": 0.02773037542662116, + "grad_norm": 0.18191537337109984, + "learning_rate": 6.337599024984766e-06, + "loss": 1.2182, + "step": 130 + }, + { + "epoch": 0.027943686006825938, + "grad_norm": 0.1916733791850958, + "learning_rate": 6.386349786715418e-06, + "loss": 1.1912, + "step": 131 + }, + { + "epoch": 0.028156996587030716, + "grad_norm": 0.17972562647723395, + "learning_rate": 6.435100548446071e-06, + "loss": 1.1901, + "step": 132 + }, + { + "epoch": 0.028370307167235494, + "grad_norm": 0.18299683104017472, + "learning_rate": 6.483851310176722e-06, + "loss": 1.1645, + "step": 133 + }, + { + "epoch": 0.02858361774744027, + "grad_norm": 0.1839123952077926, + "learning_rate": 6.532602071907374e-06, + "loss": 1.231, + "step": 134 + }, + { + "epoch": 0.028796928327645053, + "grad_norm": 0.1888756933383566, + "learning_rate": 6.581352833638026e-06, + "loss": 1.2263, + "step": 135 + }, + { + "epoch": 0.02901023890784983, + "grad_norm": 0.16932712350085324, + "learning_rate": 6.630103595368678e-06, + "loss": 1.2024, + "step": 136 + }, + { + "epoch": 0.02922354948805461, + "grad_norm": 0.17309110695537996, + "learning_rate": 6.67885435709933e-06, + "loss": 1.2175, + "step": 137 + }, + { + "epoch": 0.029436860068259386, + "grad_norm": 0.1736045918011362, + "learning_rate": 6.727605118829982e-06, + "loss": 1.2259, + "step": 138 + }, + { + "epoch": 0.029650170648464164, + "grad_norm": 0.1823492958425635, + "learning_rate": 6.776355880560634e-06, + "loss": 1.1943, + "step": 139 + }, + { + "epoch": 0.029863481228668942, + "grad_norm": 0.20170724922034258, + "learning_rate": 6.8251066422912856e-06, + "loss": 1.1944, + "step": 140 + }, + { + "epoch": 0.03007679180887372, + "grad_norm": 0.17681287769637766, + "learning_rate": 6.873857404021939e-06, + "loss": 1.2102, + "step": 141 + }, + { + "epoch": 0.030290102389078498, + "grad_norm": 0.18337680320542923, + "learning_rate": 6.92260816575259e-06, + "loss": 1.1864, + "step": 142 + }, + { + "epoch": 0.030503412969283276, + "grad_norm": 0.18534335661300477, + "learning_rate": 6.971358927483243e-06, + "loss": 1.2338, + "step": 143 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 0.1687866685147983, + "learning_rate": 7.020109689213894e-06, + "loss": 1.1988, + "step": 144 + }, + { + "epoch": 0.03093003412969283, + "grad_norm": 0.18651901872113566, + "learning_rate": 7.068860450944547e-06, + "loss": 1.1786, + "step": 145 + }, + { + "epoch": 0.03114334470989761, + "grad_norm": 0.16981577549972654, + "learning_rate": 7.117611212675199e-06, + "loss": 1.1759, + "step": 146 + }, + { + "epoch": 0.03135665529010239, + "grad_norm": 0.165745912421366, + "learning_rate": 7.166361974405851e-06, + "loss": 1.1815, + "step": 147 + }, + { + "epoch": 0.031569965870307165, + "grad_norm": 0.1791314296630991, + "learning_rate": 7.215112736136503e-06, + "loss": 1.1656, + "step": 148 + }, + { + "epoch": 0.031783276450511946, + "grad_norm": 0.16703663119694553, + "learning_rate": 7.263863497867155e-06, + "loss": 1.2264, + "step": 149 + }, + { + "epoch": 0.03199658703071672, + "grad_norm": 0.16974655248337875, + "learning_rate": 7.3126142595978065e-06, + "loss": 1.2138, + "step": 150 + }, + { + "epoch": 0.0322098976109215, + "grad_norm": 0.18828490208134854, + "learning_rate": 7.3613650213284595e-06, + "loss": 1.209, + "step": 151 + }, + { + "epoch": 0.032423208191126277, + "grad_norm": 0.17882102134359731, + "learning_rate": 7.410115783059111e-06, + "loss": 1.2022, + "step": 152 + }, + { + "epoch": 0.03263651877133106, + "grad_norm": 0.16196990462574706, + "learning_rate": 7.458866544789764e-06, + "loss": 1.1568, + "step": 153 + }, + { + "epoch": 0.03284982935153584, + "grad_norm": 0.18096901779778635, + "learning_rate": 7.507617306520415e-06, + "loss": 1.2192, + "step": 154 + }, + { + "epoch": 0.033063139931740614, + "grad_norm": 0.18457347460511853, + "learning_rate": 7.5563680682510665e-06, + "loss": 1.1821, + "step": 155 + }, + { + "epoch": 0.033276450511945395, + "grad_norm": 0.1952025002227092, + "learning_rate": 7.6051188299817195e-06, + "loss": 1.1793, + "step": 156 + }, + { + "epoch": 0.03348976109215017, + "grad_norm": 0.1909268197710842, + "learning_rate": 7.65386959171237e-06, + "loss": 1.1802, + "step": 157 + }, + { + "epoch": 0.03370307167235495, + "grad_norm": 0.187714385720673, + "learning_rate": 7.702620353443023e-06, + "loss": 1.2171, + "step": 158 + }, + { + "epoch": 0.033916382252559725, + "grad_norm": 0.2026959281317527, + "learning_rate": 7.751371115173674e-06, + "loss": 1.1519, + "step": 159 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 0.19068735409302653, + "learning_rate": 7.800121876904327e-06, + "loss": 1.2123, + "step": 160 + }, + { + "epoch": 0.03434300341296928, + "grad_norm": 0.19431570043773622, + "learning_rate": 7.848872638634979e-06, + "loss": 1.2014, + "step": 161 + }, + { + "epoch": 0.03455631399317406, + "grad_norm": 0.19701488848607637, + "learning_rate": 7.897623400365632e-06, + "loss": 1.2331, + "step": 162 + }, + { + "epoch": 0.034769624573378836, + "grad_norm": 0.1714491339531252, + "learning_rate": 7.946374162096283e-06, + "loss": 1.1897, + "step": 163 + }, + { + "epoch": 0.03498293515358362, + "grad_norm": 0.1906613308697754, + "learning_rate": 7.995124923826936e-06, + "loss": 1.2348, + "step": 164 + }, + { + "epoch": 0.0351962457337884, + "grad_norm": 0.1887460133356115, + "learning_rate": 8.043875685557587e-06, + "loss": 1.1875, + "step": 165 + }, + { + "epoch": 0.035409556313993173, + "grad_norm": 0.20898664367659955, + "learning_rate": 8.09262644728824e-06, + "loss": 1.1884, + "step": 166 + }, + { + "epoch": 0.035622866894197955, + "grad_norm": 0.22134672759569465, + "learning_rate": 8.141377209018892e-06, + "loss": 1.1809, + "step": 167 + }, + { + "epoch": 0.03583617747440273, + "grad_norm": 0.18658001599302834, + "learning_rate": 8.190127970749545e-06, + "loss": 1.2092, + "step": 168 + }, + { + "epoch": 0.03604948805460751, + "grad_norm": 0.16824922439295498, + "learning_rate": 8.238878732480196e-06, + "loss": 1.1796, + "step": 169 + }, + { + "epoch": 0.036262798634812285, + "grad_norm": 0.22004311358867595, + "learning_rate": 8.287629494210847e-06, + "loss": 1.1686, + "step": 170 + }, + { + "epoch": 0.036476109215017066, + "grad_norm": 0.17672319632010824, + "learning_rate": 8.3363802559415e-06, + "loss": 1.1633, + "step": 171 + }, + { + "epoch": 0.03668941979522184, + "grad_norm": 0.20732474907666856, + "learning_rate": 8.385131017672152e-06, + "loss": 1.1879, + "step": 172 + }, + { + "epoch": 0.03690273037542662, + "grad_norm": 0.20770620473533272, + "learning_rate": 8.433881779402803e-06, + "loss": 1.188, + "step": 173 + }, + { + "epoch": 0.037116040955631396, + "grad_norm": 0.1872958753946926, + "learning_rate": 8.482632541133456e-06, + "loss": 1.2155, + "step": 174 + }, + { + "epoch": 0.03732935153583618, + "grad_norm": 0.17977064636532453, + "learning_rate": 8.531383302864107e-06, + "loss": 1.185, + "step": 175 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 0.19783661192645727, + "learning_rate": 8.580134064594759e-06, + "loss": 1.1722, + "step": 176 + }, + { + "epoch": 0.03775597269624573, + "grad_norm": 0.19835737299570474, + "learning_rate": 8.628884826325412e-06, + "loss": 1.1864, + "step": 177 + }, + { + "epoch": 0.037969283276450515, + "grad_norm": 0.19672344103317527, + "learning_rate": 8.677635588056063e-06, + "loss": 1.1827, + "step": 178 + }, + { + "epoch": 0.03818259385665529, + "grad_norm": 0.17954576686416868, + "learning_rate": 8.726386349786716e-06, + "loss": 1.1596, + "step": 179 + }, + { + "epoch": 0.03839590443686007, + "grad_norm": 0.19485497568275068, + "learning_rate": 8.775137111517367e-06, + "loss": 1.1734, + "step": 180 + }, + { + "epoch": 0.038609215017064845, + "grad_norm": 0.19863493336411078, + "learning_rate": 8.82388787324802e-06, + "loss": 1.2383, + "step": 181 + }, + { + "epoch": 0.038822525597269626, + "grad_norm": 0.17778738894695473, + "learning_rate": 8.872638634978672e-06, + "loss": 1.2001, + "step": 182 + }, + { + "epoch": 0.0390358361774744, + "grad_norm": 0.2065764498038711, + "learning_rate": 8.921389396709325e-06, + "loss": 1.2031, + "step": 183 + }, + { + "epoch": 0.03924914675767918, + "grad_norm": 0.19997681014456226, + "learning_rate": 8.970140158439976e-06, + "loss": 1.1873, + "step": 184 + }, + { + "epoch": 0.039462457337883956, + "grad_norm": 0.21989006197322156, + "learning_rate": 9.018890920170629e-06, + "loss": 1.1839, + "step": 185 + }, + { + "epoch": 0.03967576791808874, + "grad_norm": 0.22888494191021616, + "learning_rate": 9.06764168190128e-06, + "loss": 1.1522, + "step": 186 + }, + { + "epoch": 0.03988907849829351, + "grad_norm": 0.20919619030651596, + "learning_rate": 9.116392443631933e-06, + "loss": 1.1765, + "step": 187 + }, + { + "epoch": 0.04010238907849829, + "grad_norm": 0.24917755793300847, + "learning_rate": 9.165143205362585e-06, + "loss": 1.2077, + "step": 188 + }, + { + "epoch": 0.040315699658703075, + "grad_norm": 0.20848960774608719, + "learning_rate": 9.213893967093238e-06, + "loss": 1.172, + "step": 189 + }, + { + "epoch": 0.04052901023890785, + "grad_norm": 0.22208156551332398, + "learning_rate": 9.262644728823889e-06, + "loss": 1.1709, + "step": 190 + }, + { + "epoch": 0.04074232081911263, + "grad_norm": 0.2157697677180084, + "learning_rate": 9.31139549055454e-06, + "loss": 1.1842, + "step": 191 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 0.2137542784253138, + "learning_rate": 9.360146252285193e-06, + "loss": 1.1968, + "step": 192 + }, + { + "epoch": 0.041168941979522186, + "grad_norm": 0.2319685462435285, + "learning_rate": 9.408897014015845e-06, + "loss": 1.1577, + "step": 193 + }, + { + "epoch": 0.04138225255972696, + "grad_norm": 0.24451927731554993, + "learning_rate": 9.457647775746496e-06, + "loss": 1.1855, + "step": 194 + }, + { + "epoch": 0.04159556313993174, + "grad_norm": 0.25653547320733266, + "learning_rate": 9.50639853747715e-06, + "loss": 1.1664, + "step": 195 + }, + { + "epoch": 0.041808873720136516, + "grad_norm": 0.19083650426871382, + "learning_rate": 9.5551492992078e-06, + "loss": 1.1557, + "step": 196 + }, + { + "epoch": 0.0420221843003413, + "grad_norm": 0.2571394505838406, + "learning_rate": 9.603900060938452e-06, + "loss": 1.1674, + "step": 197 + }, + { + "epoch": 0.04223549488054607, + "grad_norm": 0.20219241541225116, + "learning_rate": 9.652650822669105e-06, + "loss": 1.1553, + "step": 198 + }, + { + "epoch": 0.04244880546075085, + "grad_norm": 0.2416410948580995, + "learning_rate": 9.701401584399756e-06, + "loss": 1.1718, + "step": 199 + }, + { + "epoch": 0.042662116040955635, + "grad_norm": 0.18821636916584697, + "learning_rate": 9.75015234613041e-06, + "loss": 1.182, + "step": 200 + }, + { + "epoch": 0.04287542662116041, + "grad_norm": 0.22124945552567019, + "learning_rate": 9.79890310786106e-06, + "loss": 1.1744, + "step": 201 + }, + { + "epoch": 0.04308873720136519, + "grad_norm": 0.2513917494095293, + "learning_rate": 9.847653869591714e-06, + "loss": 1.2157, + "step": 202 + }, + { + "epoch": 0.043302047781569965, + "grad_norm": 0.25116368903411684, + "learning_rate": 9.896404631322365e-06, + "loss": 1.1824, + "step": 203 + }, + { + "epoch": 0.043515358361774746, + "grad_norm": 0.24929778386470663, + "learning_rate": 9.945155393053018e-06, + "loss": 1.1844, + "step": 204 + }, + { + "epoch": 0.04372866894197952, + "grad_norm": 0.253633081499088, + "learning_rate": 9.99390615478367e-06, + "loss": 1.1579, + "step": 205 + }, + { + "epoch": 0.0439419795221843, + "grad_norm": 0.23734322221880488, + "learning_rate": 1.0042656916514322e-05, + "loss": 1.1656, + "step": 206 + }, + { + "epoch": 0.044155290102389076, + "grad_norm": 0.21649494888745138, + "learning_rate": 1.0091407678244974e-05, + "loss": 1.1668, + "step": 207 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 0.23910533165981102, + "learning_rate": 1.0140158439975625e-05, + "loss": 1.218, + "step": 208 + }, + { + "epoch": 0.04458191126279863, + "grad_norm": 0.1929690746212809, + "learning_rate": 1.0188909201706278e-05, + "loss": 1.1734, + "step": 209 + }, + { + "epoch": 0.04479522184300341, + "grad_norm": 0.22073275404683235, + "learning_rate": 1.023765996343693e-05, + "loss": 1.1718, + "step": 210 + }, + { + "epoch": 0.045008532423208195, + "grad_norm": 0.1891206569989621, + "learning_rate": 1.0286410725167582e-05, + "loss": 1.1672, + "step": 211 + }, + { + "epoch": 0.04522184300341297, + "grad_norm": 0.1926946037272704, + "learning_rate": 1.0335161486898234e-05, + "loss": 1.1712, + "step": 212 + }, + { + "epoch": 0.04543515358361775, + "grad_norm": 0.19512996262042837, + "learning_rate": 1.0383912248628885e-05, + "loss": 1.1743, + "step": 213 + }, + { + "epoch": 0.045648464163822525, + "grad_norm": 0.23570913493694853, + "learning_rate": 1.0432663010359538e-05, + "loss": 1.1855, + "step": 214 + }, + { + "epoch": 0.045861774744027306, + "grad_norm": 0.20815486803314295, + "learning_rate": 1.048141377209019e-05, + "loss": 1.1603, + "step": 215 + }, + { + "epoch": 0.04607508532423208, + "grad_norm": 0.25280894159189715, + "learning_rate": 1.053016453382084e-05, + "loss": 1.1769, + "step": 216 + }, + { + "epoch": 0.04628839590443686, + "grad_norm": 0.22384849898509343, + "learning_rate": 1.0578915295551494e-05, + "loss": 1.1634, + "step": 217 + }, + { + "epoch": 0.046501706484641636, + "grad_norm": 0.19863857212773026, + "learning_rate": 1.0627666057282147e-05, + "loss": 1.1599, + "step": 218 + }, + { + "epoch": 0.04671501706484642, + "grad_norm": 0.24091385239188917, + "learning_rate": 1.0676416819012798e-05, + "loss": 1.2248, + "step": 219 + }, + { + "epoch": 0.04692832764505119, + "grad_norm": 0.20397758788669812, + "learning_rate": 1.072516758074345e-05, + "loss": 1.1386, + "step": 220 + }, + { + "epoch": 0.04714163822525597, + "grad_norm": 0.2181218744436116, + "learning_rate": 1.07739183424741e-05, + "loss": 1.1495, + "step": 221 + }, + { + "epoch": 0.04735494880546075, + "grad_norm": 0.20477556947920125, + "learning_rate": 1.0822669104204755e-05, + "loss": 1.1508, + "step": 222 + }, + { + "epoch": 0.04756825938566553, + "grad_norm": 0.2307827205954812, + "learning_rate": 1.0871419865935407e-05, + "loss": 1.1539, + "step": 223 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 0.18467700808602697, + "learning_rate": 1.0920170627666058e-05, + "loss": 1.1937, + "step": 224 + }, + { + "epoch": 0.047994880546075085, + "grad_norm": 0.24454179606953821, + "learning_rate": 1.096892138939671e-05, + "loss": 1.1314, + "step": 225 + }, + { + "epoch": 0.048208191126279866, + "grad_norm": 0.21151498470789853, + "learning_rate": 1.1017672151127364e-05, + "loss": 1.1532, + "step": 226 + }, + { + "epoch": 0.04842150170648464, + "grad_norm": 0.20376848034823805, + "learning_rate": 1.1066422912858015e-05, + "loss": 1.2028, + "step": 227 + }, + { + "epoch": 0.04863481228668942, + "grad_norm": 0.2238319630482885, + "learning_rate": 1.1115173674588667e-05, + "loss": 1.1627, + "step": 228 + }, + { + "epoch": 0.048848122866894196, + "grad_norm": 0.30047125037168104, + "learning_rate": 1.1163924436319318e-05, + "loss": 1.1652, + "step": 229 + }, + { + "epoch": 0.04906143344709898, + "grad_norm": 0.29025516897762543, + "learning_rate": 1.1212675198049971e-05, + "loss": 1.1647, + "step": 230 + }, + { + "epoch": 0.04927474402730375, + "grad_norm": 0.24300570414513706, + "learning_rate": 1.1261425959780622e-05, + "loss": 1.1695, + "step": 231 + }, + { + "epoch": 0.04948805460750853, + "grad_norm": 0.22369114476363902, + "learning_rate": 1.1310176721511275e-05, + "loss": 1.1454, + "step": 232 + }, + { + "epoch": 0.04970136518771331, + "grad_norm": 0.3049722219895098, + "learning_rate": 1.1358927483241927e-05, + "loss": 1.1704, + "step": 233 + }, + { + "epoch": 0.04991467576791809, + "grad_norm": 0.26422084786599237, + "learning_rate": 1.1407678244972578e-05, + "loss": 1.1753, + "step": 234 + }, + { + "epoch": 0.05012798634812287, + "grad_norm": 0.18318180052867775, + "learning_rate": 1.1456429006703231e-05, + "loss": 1.1825, + "step": 235 + }, + { + "epoch": 0.050341296928327645, + "grad_norm": 0.270378767865389, + "learning_rate": 1.1505179768433882e-05, + "loss": 1.1902, + "step": 236 + }, + { + "epoch": 0.050554607508532426, + "grad_norm": 0.2680656151005632, + "learning_rate": 1.1553930530164534e-05, + "loss": 1.1544, + "step": 237 + }, + { + "epoch": 0.0507679180887372, + "grad_norm": 0.18675936255826223, + "learning_rate": 1.1602681291895187e-05, + "loss": 1.1832, + "step": 238 + }, + { + "epoch": 0.05098122866894198, + "grad_norm": 0.2412180049176055, + "learning_rate": 1.165143205362584e-05, + "loss": 1.1651, + "step": 239 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 0.25407697100883625, + "learning_rate": 1.1700182815356491e-05, + "loss": 1.1613, + "step": 240 + }, + { + "epoch": 0.05140784982935154, + "grad_norm": 0.2305331151387242, + "learning_rate": 1.1748933577087142e-05, + "loss": 1.1882, + "step": 241 + }, + { + "epoch": 0.05162116040955631, + "grad_norm": 0.28564444245812437, + "learning_rate": 1.1797684338817794e-05, + "loss": 1.1577, + "step": 242 + }, + { + "epoch": 0.05183447098976109, + "grad_norm": 0.29047295432615916, + "learning_rate": 1.1846435100548448e-05, + "loss": 1.165, + "step": 243 + }, + { + "epoch": 0.05204778156996587, + "grad_norm": 0.2552162781788833, + "learning_rate": 1.18951858622791e-05, + "loss": 1.146, + "step": 244 + }, + { + "epoch": 0.05226109215017065, + "grad_norm": 0.2862055240193271, + "learning_rate": 1.1943936624009751e-05, + "loss": 1.1566, + "step": 245 + }, + { + "epoch": 0.05247440273037542, + "grad_norm": 0.3347688461693089, + "learning_rate": 1.1992687385740402e-05, + "loss": 1.123, + "step": 246 + }, + { + "epoch": 0.052687713310580205, + "grad_norm": 0.31622258037893874, + "learning_rate": 1.2041438147471055e-05, + "loss": 1.165, + "step": 247 + }, + { + "epoch": 0.052901023890784986, + "grad_norm": 0.2719541317001036, + "learning_rate": 1.2090188909201708e-05, + "loss": 1.1531, + "step": 248 + }, + { + "epoch": 0.05311433447098976, + "grad_norm": 0.28641292320907497, + "learning_rate": 1.213893967093236e-05, + "loss": 1.1583, + "step": 249 + }, + { + "epoch": 0.05332764505119454, + "grad_norm": 0.25509089875064045, + "learning_rate": 1.2187690432663011e-05, + "loss": 1.138, + "step": 250 + }, + { + "epoch": 0.053540955631399316, + "grad_norm": 0.2437886560216688, + "learning_rate": 1.2236441194393662e-05, + "loss": 1.1568, + "step": 251 + }, + { + "epoch": 0.0537542662116041, + "grad_norm": 0.20482913393641372, + "learning_rate": 1.2285191956124315e-05, + "loss": 1.1387, + "step": 252 + }, + { + "epoch": 0.05396757679180887, + "grad_norm": 0.23231293288530397, + "learning_rate": 1.2333942717854967e-05, + "loss": 1.1525, + "step": 253 + }, + { + "epoch": 0.05418088737201365, + "grad_norm": 0.2013187211455775, + "learning_rate": 1.238269347958562e-05, + "loss": 1.1853, + "step": 254 + }, + { + "epoch": 0.05439419795221843, + "grad_norm": 0.2453908396620326, + "learning_rate": 1.2431444241316271e-05, + "loss": 1.1247, + "step": 255 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 0.23712446012866475, + "learning_rate": 1.2480195003046924e-05, + "loss": 1.1477, + "step": 256 + }, + { + "epoch": 0.05482081911262798, + "grad_norm": 0.21405685805924712, + "learning_rate": 1.2528945764777575e-05, + "loss": 1.1522, + "step": 257 + }, + { + "epoch": 0.055034129692832764, + "grad_norm": 0.23275693834317324, + "learning_rate": 1.2577696526508227e-05, + "loss": 1.1767, + "step": 258 + }, + { + "epoch": 0.055247440273037546, + "grad_norm": 0.20466868865862736, + "learning_rate": 1.2626447288238878e-05, + "loss": 1.1599, + "step": 259 + }, + { + "epoch": 0.05546075085324232, + "grad_norm": 0.2024058415788373, + "learning_rate": 1.2675198049969533e-05, + "loss": 1.1427, + "step": 260 + }, + { + "epoch": 0.0556740614334471, + "grad_norm": 0.1961819956269941, + "learning_rate": 1.2723948811700184e-05, + "loss": 1.137, + "step": 261 + }, + { + "epoch": 0.055887372013651876, + "grad_norm": 0.24855407938838706, + "learning_rate": 1.2772699573430835e-05, + "loss": 1.1362, + "step": 262 + }, + { + "epoch": 0.05610068259385666, + "grad_norm": 0.27052694621780266, + "learning_rate": 1.2821450335161487e-05, + "loss": 1.1669, + "step": 263 + }, + { + "epoch": 0.05631399317406143, + "grad_norm": 0.26277565489399374, + "learning_rate": 1.2870201096892141e-05, + "loss": 1.1511, + "step": 264 + }, + { + "epoch": 0.05652730375426621, + "grad_norm": 0.2650706552119122, + "learning_rate": 1.2918951858622793e-05, + "loss": 1.1628, + "step": 265 + }, + { + "epoch": 0.05674061433447099, + "grad_norm": 0.28214528552420015, + "learning_rate": 1.2967702620353444e-05, + "loss": 1.1654, + "step": 266 + }, + { + "epoch": 0.05695392491467577, + "grad_norm": 0.2559619987543729, + "learning_rate": 1.3016453382084095e-05, + "loss": 1.1479, + "step": 267 + }, + { + "epoch": 0.05716723549488054, + "grad_norm": 0.32339817387851333, + "learning_rate": 1.3065204143814748e-05, + "loss": 1.1493, + "step": 268 + }, + { + "epoch": 0.057380546075085324, + "grad_norm": 0.2946436212467297, + "learning_rate": 1.3113954905545401e-05, + "loss": 1.1419, + "step": 269 + }, + { + "epoch": 0.057593856655290106, + "grad_norm": 0.2639197538533878, + "learning_rate": 1.3162705667276053e-05, + "loss": 1.1457, + "step": 270 + }, + { + "epoch": 0.05780716723549488, + "grad_norm": 0.2716651836989999, + "learning_rate": 1.3211456429006704e-05, + "loss": 1.1936, + "step": 271 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 0.2768389047730188, + "learning_rate": 1.3260207190737355e-05, + "loss": 1.123, + "step": 272 + }, + { + "epoch": 0.058233788395904436, + "grad_norm": 0.29038617228803504, + "learning_rate": 1.3308957952468008e-05, + "loss": 1.1159, + "step": 273 + }, + { + "epoch": 0.05844709897610922, + "grad_norm": 0.2179861001333094, + "learning_rate": 1.335770871419866e-05, + "loss": 1.1619, + "step": 274 + }, + { + "epoch": 0.05866040955631399, + "grad_norm": 0.27195531231438036, + "learning_rate": 1.3406459475929313e-05, + "loss": 1.1997, + "step": 275 + }, + { + "epoch": 0.05887372013651877, + "grad_norm": 0.25278019463429413, + "learning_rate": 1.3455210237659964e-05, + "loss": 1.1503, + "step": 276 + }, + { + "epoch": 0.05908703071672355, + "grad_norm": 0.20539794730442343, + "learning_rate": 1.3503960999390617e-05, + "loss": 1.1257, + "step": 277 + }, + { + "epoch": 0.05930034129692833, + "grad_norm": 0.22978296792546218, + "learning_rate": 1.3552711761121268e-05, + "loss": 1.1705, + "step": 278 + }, + { + "epoch": 0.0595136518771331, + "grad_norm": 0.25987079881726544, + "learning_rate": 1.360146252285192e-05, + "loss": 1.1599, + "step": 279 + }, + { + "epoch": 0.059726962457337884, + "grad_norm": 0.25256294009268715, + "learning_rate": 1.3650213284582571e-05, + "loss": 1.1345, + "step": 280 + }, + { + "epoch": 0.05994027303754266, + "grad_norm": 0.34196129816471144, + "learning_rate": 1.3698964046313226e-05, + "loss": 1.1304, + "step": 281 + }, + { + "epoch": 0.06015358361774744, + "grad_norm": 0.404311680703092, + "learning_rate": 1.3747714808043877e-05, + "loss": 1.1595, + "step": 282 + }, + { + "epoch": 0.06036689419795222, + "grad_norm": 0.36779848636541623, + "learning_rate": 1.3796465569774529e-05, + "loss": 1.1634, + "step": 283 + }, + { + "epoch": 0.060580204778156996, + "grad_norm": 0.3041041335204479, + "learning_rate": 1.384521633150518e-05, + "loss": 1.1431, + "step": 284 + }, + { + "epoch": 0.06079351535836178, + "grad_norm": 0.27474716824405265, + "learning_rate": 1.3893967093235835e-05, + "loss": 1.1464, + "step": 285 + }, + { + "epoch": 0.06100682593856655, + "grad_norm": 0.2938803796720488, + "learning_rate": 1.3942717854966486e-05, + "loss": 1.1298, + "step": 286 + }, + { + "epoch": 0.06122013651877133, + "grad_norm": 0.23868800087960468, + "learning_rate": 1.3991468616697137e-05, + "loss": 1.137, + "step": 287 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 0.2829466009466259, + "learning_rate": 1.4040219378427789e-05, + "loss": 1.1312, + "step": 288 + }, + { + "epoch": 0.06164675767918089, + "grad_norm": 0.27173295327235664, + "learning_rate": 1.4088970140158442e-05, + "loss": 1.132, + "step": 289 + }, + { + "epoch": 0.06186006825938566, + "grad_norm": 0.29583366143105055, + "learning_rate": 1.4137720901889095e-05, + "loss": 1.1136, + "step": 290 + }, + { + "epoch": 0.062073378839590444, + "grad_norm": 0.2560441384178619, + "learning_rate": 1.4186471663619746e-05, + "loss": 1.1768, + "step": 291 + }, + { + "epoch": 0.06228668941979522, + "grad_norm": 0.23383552145143682, + "learning_rate": 1.4235222425350397e-05, + "loss": 1.1445, + "step": 292 + }, + { + "epoch": 0.0625, + "grad_norm": 0.290240782173701, + "learning_rate": 1.4283973187081049e-05, + "loss": 1.1439, + "step": 293 + }, + { + "epoch": 0.06271331058020478, + "grad_norm": 0.3138537148475983, + "learning_rate": 1.4332723948811702e-05, + "loss": 1.1556, + "step": 294 + }, + { + "epoch": 0.06292662116040956, + "grad_norm": 0.28633915182541303, + "learning_rate": 1.4381474710542353e-05, + "loss": 1.1237, + "step": 295 + }, + { + "epoch": 0.06313993174061433, + "grad_norm": 0.30367085922544634, + "learning_rate": 1.4430225472273006e-05, + "loss": 1.149, + "step": 296 + }, + { + "epoch": 0.06335324232081911, + "grad_norm": 0.37858655880757863, + "learning_rate": 1.4478976234003657e-05, + "loss": 1.1793, + "step": 297 + }, + { + "epoch": 0.06356655290102389, + "grad_norm": 0.4096999169806745, + "learning_rate": 1.452772699573431e-05, + "loss": 1.1464, + "step": 298 + }, + { + "epoch": 0.06377986348122867, + "grad_norm": 0.3659270383293867, + "learning_rate": 1.4576477757464962e-05, + "loss": 1.1506, + "step": 299 + }, + { + "epoch": 0.06399317406143344, + "grad_norm": 0.3244264249038771, + "learning_rate": 1.4625228519195613e-05, + "loss": 1.1536, + "step": 300 + }, + { + "epoch": 0.06420648464163822, + "grad_norm": 0.27814718826008183, + "learning_rate": 1.4673979280926264e-05, + "loss": 1.1564, + "step": 301 + }, + { + "epoch": 0.064419795221843, + "grad_norm": 0.3224221040838471, + "learning_rate": 1.4722730042656919e-05, + "loss": 1.1615, + "step": 302 + }, + { + "epoch": 0.06463310580204779, + "grad_norm": 0.32049360304752433, + "learning_rate": 1.477148080438757e-05, + "loss": 1.121, + "step": 303 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 0.29417167108388087, + "learning_rate": 1.4820231566118222e-05, + "loss": 1.1219, + "step": 304 + }, + { + "epoch": 0.06505972696245733, + "grad_norm": 0.32422110245350516, + "learning_rate": 1.4868982327848873e-05, + "loss": 1.1825, + "step": 305 + }, + { + "epoch": 0.06527303754266212, + "grad_norm": 0.3222313955604191, + "learning_rate": 1.4917733089579528e-05, + "loss": 1.136, + "step": 306 + }, + { + "epoch": 0.0654863481228669, + "grad_norm": 0.27251448185144145, + "learning_rate": 1.4966483851310179e-05, + "loss": 1.1761, + "step": 307 + }, + { + "epoch": 0.06569965870307168, + "grad_norm": 0.24262085560086083, + "learning_rate": 1.501523461304083e-05, + "loss": 1.112, + "step": 308 + }, + { + "epoch": 0.06591296928327645, + "grad_norm": 0.2728204736695768, + "learning_rate": 1.5063985374771482e-05, + "loss": 1.1483, + "step": 309 + }, + { + "epoch": 0.06612627986348123, + "grad_norm": 0.28968469270544334, + "learning_rate": 1.5112736136502133e-05, + "loss": 1.1647, + "step": 310 + }, + { + "epoch": 0.06633959044368601, + "grad_norm": 0.3673318503128783, + "learning_rate": 1.5161486898232786e-05, + "loss": 1.1199, + "step": 311 + }, + { + "epoch": 0.06655290102389079, + "grad_norm": 0.4128001518721509, + "learning_rate": 1.5210237659963439e-05, + "loss": 1.1567, + "step": 312 + }, + { + "epoch": 0.06676621160409556, + "grad_norm": 0.3733399664536921, + "learning_rate": 1.525898842169409e-05, + "loss": 1.135, + "step": 313 + }, + { + "epoch": 0.06697952218430034, + "grad_norm": 0.3287663914565797, + "learning_rate": 1.530773918342474e-05, + "loss": 1.1319, + "step": 314 + }, + { + "epoch": 0.06719283276450512, + "grad_norm": 0.2803593730071665, + "learning_rate": 1.5356489945155396e-05, + "loss": 1.1081, + "step": 315 + }, + { + "epoch": 0.0674061433447099, + "grad_norm": 0.2772011833427616, + "learning_rate": 1.5405240706886046e-05, + "loss": 1.1727, + "step": 316 + }, + { + "epoch": 0.06761945392491468, + "grad_norm": 0.24166608145599816, + "learning_rate": 1.54539914686167e-05, + "loss": 1.1229, + "step": 317 + }, + { + "epoch": 0.06783276450511945, + "grad_norm": 0.26883823770886917, + "learning_rate": 1.550274223034735e-05, + "loss": 1.167, + "step": 318 + }, + { + "epoch": 0.06804607508532423, + "grad_norm": 0.3148547875196285, + "learning_rate": 1.5551492992078e-05, + "loss": 1.1219, + "step": 319 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 0.2831109949942752, + "learning_rate": 1.5600243753808655e-05, + "loss": 1.1189, + "step": 320 + }, + { + "epoch": 0.0684726962457338, + "grad_norm": 0.2599040608454248, + "learning_rate": 1.5648994515539308e-05, + "loss": 1.1256, + "step": 321 + }, + { + "epoch": 0.06868600682593856, + "grad_norm": 0.2622799899623819, + "learning_rate": 1.5697745277269957e-05, + "loss": 1.1141, + "step": 322 + }, + { + "epoch": 0.06889931740614334, + "grad_norm": 0.3045013820318074, + "learning_rate": 1.574649603900061e-05, + "loss": 1.1271, + "step": 323 + }, + { + "epoch": 0.06911262798634812, + "grad_norm": 0.2764495516065324, + "learning_rate": 1.5795246800731263e-05, + "loss": 1.1344, + "step": 324 + }, + { + "epoch": 0.0693259385665529, + "grad_norm": 0.2588399872120055, + "learning_rate": 1.5843997562461913e-05, + "loss": 1.1539, + "step": 325 + }, + { + "epoch": 0.06953924914675767, + "grad_norm": 0.27234342333344647, + "learning_rate": 1.5892748324192566e-05, + "loss": 1.1194, + "step": 326 + }, + { + "epoch": 0.06975255972696245, + "grad_norm": 0.3371492588719658, + "learning_rate": 1.594149908592322e-05, + "loss": 1.1405, + "step": 327 + }, + { + "epoch": 0.06996587030716724, + "grad_norm": 0.33437007144817843, + "learning_rate": 1.5990249847653872e-05, + "loss": 1.158, + "step": 328 + }, + { + "epoch": 0.07017918088737202, + "grad_norm": 0.3867190501782718, + "learning_rate": 1.603900060938452e-05, + "loss": 1.1469, + "step": 329 + }, + { + "epoch": 0.0703924914675768, + "grad_norm": 0.49449616160387627, + "learning_rate": 1.6087751371115175e-05, + "loss": 1.1433, + "step": 330 + }, + { + "epoch": 0.07060580204778157, + "grad_norm": 0.5263846772175986, + "learning_rate": 1.6136502132845824e-05, + "loss": 1.1277, + "step": 331 + }, + { + "epoch": 0.07081911262798635, + "grad_norm": 0.4383453744073459, + "learning_rate": 1.618525289457648e-05, + "loss": 1.1292, + "step": 332 + }, + { + "epoch": 0.07103242320819113, + "grad_norm": 0.3614199234591267, + "learning_rate": 1.623400365630713e-05, + "loss": 1.1433, + "step": 333 + }, + { + "epoch": 0.07124573378839591, + "grad_norm": 0.25596647908752845, + "learning_rate": 1.6282754418037783e-05, + "loss": 1.1259, + "step": 334 + }, + { + "epoch": 0.07145904436860068, + "grad_norm": 0.33730469591098505, + "learning_rate": 1.6331505179768433e-05, + "loss": 1.142, + "step": 335 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 0.47674002845437424, + "learning_rate": 1.638025594149909e-05, + "loss": 1.1023, + "step": 336 + }, + { + "epoch": 0.07188566552901024, + "grad_norm": 0.5623924078223965, + "learning_rate": 1.642900670322974e-05, + "loss": 1.1283, + "step": 337 + }, + { + "epoch": 0.07209897610921502, + "grad_norm": 0.44364906672533144, + "learning_rate": 1.6477757464960392e-05, + "loss": 1.1493, + "step": 338 + }, + { + "epoch": 0.07231228668941979, + "grad_norm": 0.2685552989738244, + "learning_rate": 1.6526508226691042e-05, + "loss": 1.1222, + "step": 339 + }, + { + "epoch": 0.07252559726962457, + "grad_norm": 0.3656584724329907, + "learning_rate": 1.6575258988421695e-05, + "loss": 1.1473, + "step": 340 + }, + { + "epoch": 0.07273890784982935, + "grad_norm": 0.43045104630760245, + "learning_rate": 1.6624009750152348e-05, + "loss": 1.1585, + "step": 341 + }, + { + "epoch": 0.07295221843003413, + "grad_norm": 0.37248562352652415, + "learning_rate": 1.6672760511883e-05, + "loss": 1.1488, + "step": 342 + }, + { + "epoch": 0.07316552901023891, + "grad_norm": 0.2627950657886523, + "learning_rate": 1.672151127361365e-05, + "loss": 1.1181, + "step": 343 + }, + { + "epoch": 0.07337883959044368, + "grad_norm": 0.2716621031357467, + "learning_rate": 1.6770262035344303e-05, + "loss": 1.0999, + "step": 344 + }, + { + "epoch": 0.07359215017064846, + "grad_norm": 0.3295288260905522, + "learning_rate": 1.6819012797074956e-05, + "loss": 1.1262, + "step": 345 + }, + { + "epoch": 0.07380546075085324, + "grad_norm": 0.328204157141318, + "learning_rate": 1.6867763558805606e-05, + "loss": 1.1367, + "step": 346 + }, + { + "epoch": 0.07401877133105803, + "grad_norm": 0.3657756937391081, + "learning_rate": 1.691651432053626e-05, + "loss": 1.1435, + "step": 347 + }, + { + "epoch": 0.07423208191126279, + "grad_norm": 0.3851684080959009, + "learning_rate": 1.6965265082266912e-05, + "loss": 1.142, + "step": 348 + }, + { + "epoch": 0.07444539249146757, + "grad_norm": 0.3578416045544469, + "learning_rate": 1.7014015843997565e-05, + "loss": 1.132, + "step": 349 + }, + { + "epoch": 0.07465870307167236, + "grad_norm": 0.27968021925002984, + "learning_rate": 1.7062766605728215e-05, + "loss": 1.097, + "step": 350 + }, + { + "epoch": 0.07487201365187714, + "grad_norm": 0.23803243704882757, + "learning_rate": 1.7111517367458868e-05, + "loss": 1.1051, + "step": 351 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 0.37091531183080606, + "learning_rate": 1.7160268129189517e-05, + "loss": 1.1563, + "step": 352 + }, + { + "epoch": 0.07529863481228669, + "grad_norm": 0.5410428920582847, + "learning_rate": 1.7209018890920174e-05, + "loss": 1.1174, + "step": 353 + }, + { + "epoch": 0.07551194539249147, + "grad_norm": 0.5752717746438437, + "learning_rate": 1.7257769652650823e-05, + "loss": 1.1317, + "step": 354 + }, + { + "epoch": 0.07572525597269625, + "grad_norm": 0.57212695069217, + "learning_rate": 1.7306520414381476e-05, + "loss": 1.1146, + "step": 355 + }, + { + "epoch": 0.07593856655290103, + "grad_norm": 0.5872095059037257, + "learning_rate": 1.7355271176112126e-05, + "loss": 1.1239, + "step": 356 + }, + { + "epoch": 0.0761518771331058, + "grad_norm": 0.5600263739556905, + "learning_rate": 1.7404021937842783e-05, + "loss": 1.1308, + "step": 357 + }, + { + "epoch": 0.07636518771331058, + "grad_norm": 0.44015657544915676, + "learning_rate": 1.7452772699573432e-05, + "loss": 1.1324, + "step": 358 + }, + { + "epoch": 0.07657849829351536, + "grad_norm": 0.3936508072881111, + "learning_rate": 1.7501523461304085e-05, + "loss": 1.112, + "step": 359 + }, + { + "epoch": 0.07679180887372014, + "grad_norm": 0.5061934019106973, + "learning_rate": 1.7550274223034735e-05, + "loss": 1.1681, + "step": 360 + }, + { + "epoch": 0.07700511945392491, + "grad_norm": 0.5825988173359957, + "learning_rate": 1.7599024984765388e-05, + "loss": 1.1159, + "step": 361 + }, + { + "epoch": 0.07721843003412969, + "grad_norm": 0.5797820602835184, + "learning_rate": 1.764777574649604e-05, + "loss": 1.1238, + "step": 362 + }, + { + "epoch": 0.07743174061433447, + "grad_norm": 0.44818621866782754, + "learning_rate": 1.7696526508226694e-05, + "loss": 1.1028, + "step": 363 + }, + { + "epoch": 0.07764505119453925, + "grad_norm": 0.4627503826041691, + "learning_rate": 1.7745277269957344e-05, + "loss": 1.1331, + "step": 364 + }, + { + "epoch": 0.07785836177474403, + "grad_norm": 0.4386132022395208, + "learning_rate": 1.7794028031687997e-05, + "loss": 1.0957, + "step": 365 + }, + { + "epoch": 0.0780716723549488, + "grad_norm": 0.5208505410331938, + "learning_rate": 1.784277879341865e-05, + "loss": 1.1191, + "step": 366 + }, + { + "epoch": 0.07828498293515358, + "grad_norm": 0.579090119554618, + "learning_rate": 1.78915295551493e-05, + "loss": 1.1453, + "step": 367 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 0.45082594239090124, + "learning_rate": 1.7940280316879952e-05, + "loss": 1.1357, + "step": 368 + }, + { + "epoch": 0.07871160409556315, + "grad_norm": 0.40689390005301834, + "learning_rate": 1.7989031078610605e-05, + "loss": 1.1563, + "step": 369 + }, + { + "epoch": 0.07892491467576791, + "grad_norm": 0.38833515190272927, + "learning_rate": 1.8037781840341258e-05, + "loss": 1.1088, + "step": 370 + }, + { + "epoch": 0.0791382252559727, + "grad_norm": 0.37227864959501494, + "learning_rate": 1.8086532602071908e-05, + "loss": 1.1404, + "step": 371 + }, + { + "epoch": 0.07935153583617748, + "grad_norm": 0.32268399189229263, + "learning_rate": 1.813528336380256e-05, + "loss": 1.102, + "step": 372 + }, + { + "epoch": 0.07956484641638226, + "grad_norm": 0.42131703604674947, + "learning_rate": 1.818403412553321e-05, + "loss": 1.1491, + "step": 373 + }, + { + "epoch": 0.07977815699658702, + "grad_norm": 0.6531226295990412, + "learning_rate": 1.8232784887263867e-05, + "loss": 1.1327, + "step": 374 + }, + { + "epoch": 0.0799914675767918, + "grad_norm": 0.7464372623614196, + "learning_rate": 1.8281535648994517e-05, + "loss": 1.1501, + "step": 375 + }, + { + "epoch": 0.08020477815699659, + "grad_norm": 0.6758043361013083, + "learning_rate": 1.833028641072517e-05, + "loss": 1.1536, + "step": 376 + }, + { + "epoch": 0.08041808873720137, + "grad_norm": 0.5897844623968519, + "learning_rate": 1.837903717245582e-05, + "loss": 1.1351, + "step": 377 + }, + { + "epoch": 0.08063139931740615, + "grad_norm": 0.3699761593927993, + "learning_rate": 1.8427787934186476e-05, + "loss": 1.1368, + "step": 378 + }, + { + "epoch": 0.08084470989761092, + "grad_norm": 0.43866296366715446, + "learning_rate": 1.8476538695917125e-05, + "loss": 1.1182, + "step": 379 + }, + { + "epoch": 0.0810580204778157, + "grad_norm": 0.5449052351204199, + "learning_rate": 1.8525289457647778e-05, + "loss": 1.1111, + "step": 380 + }, + { + "epoch": 0.08127133105802048, + "grad_norm": 0.4648821718729246, + "learning_rate": 1.8574040219378428e-05, + "loss": 1.1309, + "step": 381 + }, + { + "epoch": 0.08148464163822526, + "grad_norm": 0.36712111965670113, + "learning_rate": 1.862279098110908e-05, + "loss": 1.127, + "step": 382 + }, + { + "epoch": 0.08169795221843003, + "grad_norm": 0.42540497846510994, + "learning_rate": 1.8671541742839734e-05, + "loss": 1.117, + "step": 383 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 0.42837065198400764, + "learning_rate": 1.8720292504570387e-05, + "loss": 1.1132, + "step": 384 + }, + { + "epoch": 0.08212457337883959, + "grad_norm": 0.47456947214994816, + "learning_rate": 1.8769043266301037e-05, + "loss": 1.1272, + "step": 385 + }, + { + "epoch": 0.08233788395904437, + "grad_norm": 0.42570049439349467, + "learning_rate": 1.881779402803169e-05, + "loss": 1.1089, + "step": 386 + }, + { + "epoch": 0.08255119453924915, + "grad_norm": 0.3078969491255546, + "learning_rate": 1.8866544789762343e-05, + "loss": 1.1188, + "step": 387 + }, + { + "epoch": 0.08276450511945392, + "grad_norm": 0.3005229196952467, + "learning_rate": 1.8915295551492992e-05, + "loss": 1.1295, + "step": 388 + }, + { + "epoch": 0.0829778156996587, + "grad_norm": 0.3174873687033879, + "learning_rate": 1.8964046313223645e-05, + "loss": 1.1429, + "step": 389 + }, + { + "epoch": 0.08319112627986348, + "grad_norm": 0.2898421393255217, + "learning_rate": 1.90127970749543e-05, + "loss": 1.123, + "step": 390 + }, + { + "epoch": 0.08340443686006827, + "grad_norm": 0.2986694654974715, + "learning_rate": 1.906154783668495e-05, + "loss": 1.1269, + "step": 391 + }, + { + "epoch": 0.08361774744027303, + "grad_norm": 0.3081371608047492, + "learning_rate": 1.91102985984156e-05, + "loss": 1.1133, + "step": 392 + }, + { + "epoch": 0.08383105802047781, + "grad_norm": 0.2947010581402372, + "learning_rate": 1.9159049360146254e-05, + "loss": 1.0965, + "step": 393 + }, + { + "epoch": 0.0840443686006826, + "grad_norm": 0.32517211244940736, + "learning_rate": 1.9207800121876904e-05, + "loss": 1.1279, + "step": 394 + }, + { + "epoch": 0.08425767918088738, + "grad_norm": 0.3927048757193291, + "learning_rate": 1.925655088360756e-05, + "loss": 1.1389, + "step": 395 + }, + { + "epoch": 0.08447098976109214, + "grad_norm": 0.5651149223240242, + "learning_rate": 1.930530164533821e-05, + "loss": 1.1045, + "step": 396 + }, + { + "epoch": 0.08468430034129693, + "grad_norm": 0.598058560355937, + "learning_rate": 1.9354052407068863e-05, + "loss": 1.111, + "step": 397 + }, + { + "epoch": 0.0848976109215017, + "grad_norm": 0.4173840617892741, + "learning_rate": 1.9402803168799512e-05, + "loss": 1.1249, + "step": 398 + }, + { + "epoch": 0.08511092150170649, + "grad_norm": 0.34646903126932405, + "learning_rate": 1.945155393053017e-05, + "loss": 1.1222, + "step": 399 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 0.4252472694745222, + "learning_rate": 1.950030469226082e-05, + "loss": 1.0696, + "step": 400 + }, + { + "epoch": 0.08553754266211604, + "grad_norm": 0.3697616600981427, + "learning_rate": 1.954905545399147e-05, + "loss": 1.1254, + "step": 401 + }, + { + "epoch": 0.08575085324232082, + "grad_norm": 0.37691871667575194, + "learning_rate": 1.959780621572212e-05, + "loss": 1.169, + "step": 402 + }, + { + "epoch": 0.0859641638225256, + "grad_norm": 0.2616849965824557, + "learning_rate": 1.9646556977452774e-05, + "loss": 1.104, + "step": 403 + }, + { + "epoch": 0.08617747440273038, + "grad_norm": 0.31456559137151247, + "learning_rate": 1.9695307739183427e-05, + "loss": 1.1863, + "step": 404 + }, + { + "epoch": 0.08639078498293515, + "grad_norm": 0.36148061669440085, + "learning_rate": 1.974405850091408e-05, + "loss": 1.1063, + "step": 405 + }, + { + "epoch": 0.08660409556313993, + "grad_norm": 0.46905524858504305, + "learning_rate": 1.979280926264473e-05, + "loss": 1.1267, + "step": 406 + }, + { + "epoch": 0.08681740614334471, + "grad_norm": 0.5145105876044925, + "learning_rate": 1.9841560024375383e-05, + "loss": 1.1001, + "step": 407 + }, + { + "epoch": 0.08703071672354949, + "grad_norm": 0.43554674315131553, + "learning_rate": 1.9890310786106036e-05, + "loss": 1.1201, + "step": 408 + }, + { + "epoch": 0.08724402730375426, + "grad_norm": 0.38991745748325995, + "learning_rate": 1.9939061547836685e-05, + "loss": 1.1605, + "step": 409 + }, + { + "epoch": 0.08745733788395904, + "grad_norm": 0.5237143907107175, + "learning_rate": 1.998781230956734e-05, + "loss": 1.1228, + "step": 410 + }, + { + "epoch": 0.08767064846416382, + "grad_norm": 0.5276348240683983, + "learning_rate": 2.003656307129799e-05, + "loss": 1.1117, + "step": 411 + }, + { + "epoch": 0.0878839590443686, + "grad_norm": 0.45377245866486077, + "learning_rate": 2.0085313833028644e-05, + "loss": 1.1288, + "step": 412 + }, + { + "epoch": 0.08809726962457338, + "grad_norm": 0.4268050489753117, + "learning_rate": 2.0134064594759294e-05, + "loss": 1.0972, + "step": 413 + }, + { + "epoch": 0.08831058020477815, + "grad_norm": 0.47895694341919587, + "learning_rate": 2.0182815356489947e-05, + "loss": 1.103, + "step": 414 + }, + { + "epoch": 0.08852389078498293, + "grad_norm": 0.4316984280164501, + "learning_rate": 2.02315661182206e-05, + "loss": 1.1461, + "step": 415 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 0.4239647166730071, + "learning_rate": 2.028031687995125e-05, + "loss": 1.1013, + "step": 416 + }, + { + "epoch": 0.0889505119453925, + "grad_norm": 0.5122739099724171, + "learning_rate": 2.0329067641681903e-05, + "loss": 1.1396, + "step": 417 + }, + { + "epoch": 0.08916382252559726, + "grad_norm": 0.5188084875255707, + "learning_rate": 2.0377818403412556e-05, + "loss": 1.0879, + "step": 418 + }, + { + "epoch": 0.08937713310580205, + "grad_norm": 0.3932788634976767, + "learning_rate": 2.0426569165143205e-05, + "loss": 1.1114, + "step": 419 + }, + { + "epoch": 0.08959044368600683, + "grad_norm": 0.3312274801295201, + "learning_rate": 2.047531992687386e-05, + "loss": 1.1282, + "step": 420 + }, + { + "epoch": 0.08980375426621161, + "grad_norm": 0.2934339645737796, + "learning_rate": 2.0524070688604508e-05, + "loss": 1.1073, + "step": 421 + }, + { + "epoch": 0.09001706484641639, + "grad_norm": 0.3610983242733062, + "learning_rate": 2.0572821450335164e-05, + "loss": 1.1555, + "step": 422 + }, + { + "epoch": 0.09023037542662116, + "grad_norm": 0.4349541774538363, + "learning_rate": 2.0621572212065817e-05, + "loss": 1.1091, + "step": 423 + }, + { + "epoch": 0.09044368600682594, + "grad_norm": 0.4972534626290102, + "learning_rate": 2.0670322973796467e-05, + "loss": 1.0857, + "step": 424 + }, + { + "epoch": 0.09065699658703072, + "grad_norm": 0.37938931993902897, + "learning_rate": 2.071907373552712e-05, + "loss": 1.0894, + "step": 425 + }, + { + "epoch": 0.0908703071672355, + "grad_norm": 0.37626345431991826, + "learning_rate": 2.076782449725777e-05, + "loss": 1.1576, + "step": 426 + }, + { + "epoch": 0.09108361774744027, + "grad_norm": 0.4330099665584405, + "learning_rate": 2.0816575258988423e-05, + "loss": 1.1028, + "step": 427 + }, + { + "epoch": 0.09129692832764505, + "grad_norm": 0.4501386766769097, + "learning_rate": 2.0865326020719076e-05, + "loss": 1.1199, + "step": 428 + }, + { + "epoch": 0.09151023890784983, + "grad_norm": 0.3734784837630038, + "learning_rate": 2.0914076782449725e-05, + "loss": 1.1124, + "step": 429 + }, + { + "epoch": 0.09172354948805461, + "grad_norm": 0.33255760132499446, + "learning_rate": 2.096282754418038e-05, + "loss": 1.0746, + "step": 430 + }, + { + "epoch": 0.09193686006825938, + "grad_norm": 0.43324384974570024, + "learning_rate": 2.1011578305911035e-05, + "loss": 1.1085, + "step": 431 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 0.4860242999121032, + "learning_rate": 2.106032906764168e-05, + "loss": 1.1561, + "step": 432 + }, + { + "epoch": 0.09236348122866894, + "grad_norm": 0.464071974045162, + "learning_rate": 2.1109079829372338e-05, + "loss": 1.1111, + "step": 433 + }, + { + "epoch": 0.09257679180887372, + "grad_norm": 0.5346422915846143, + "learning_rate": 2.1157830591102987e-05, + "loss": 1.1311, + "step": 434 + }, + { + "epoch": 0.0927901023890785, + "grad_norm": 0.592718273409851, + "learning_rate": 2.120658135283364e-05, + "loss": 1.1027, + "step": 435 + }, + { + "epoch": 0.09300341296928327, + "grad_norm": 0.5661925237428692, + "learning_rate": 2.1255332114564293e-05, + "loss": 1.1007, + "step": 436 + }, + { + "epoch": 0.09321672354948805, + "grad_norm": 0.6787213923008931, + "learning_rate": 2.1304082876294943e-05, + "loss": 1.1251, + "step": 437 + }, + { + "epoch": 0.09343003412969283, + "grad_norm": 0.7810531840895104, + "learning_rate": 2.1352833638025596e-05, + "loss": 1.1309, + "step": 438 + }, + { + "epoch": 0.09364334470989762, + "grad_norm": 0.8023474874674217, + "learning_rate": 2.140158439975625e-05, + "loss": 1.1041, + "step": 439 + }, + { + "epoch": 0.09385665529010238, + "grad_norm": 0.7789080340158566, + "learning_rate": 2.14503351614869e-05, + "loss": 1.1596, + "step": 440 + }, + { + "epoch": 0.09406996587030717, + "grad_norm": 0.5678491747800755, + "learning_rate": 2.149908592321755e-05, + "loss": 1.1212, + "step": 441 + }, + { + "epoch": 0.09428327645051195, + "grad_norm": 0.5085333890033986, + "learning_rate": 2.15478366849482e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 0.09449658703071673, + "grad_norm": 0.46943825144352086, + "learning_rate": 2.1596587446678858e-05, + "loss": 1.1117, + "step": 443 + }, + { + "epoch": 0.0947098976109215, + "grad_norm": 0.530847420726543, + "learning_rate": 2.164533820840951e-05, + "loss": 1.1285, + "step": 444 + }, + { + "epoch": 0.09492320819112628, + "grad_norm": 0.541829009708531, + "learning_rate": 2.169408897014016e-05, + "loss": 1.1282, + "step": 445 + }, + { + "epoch": 0.09513651877133106, + "grad_norm": 0.3972690432580843, + "learning_rate": 2.1742839731870813e-05, + "loss": 1.1083, + "step": 446 + }, + { + "epoch": 0.09534982935153584, + "grad_norm": 0.43107449215970706, + "learning_rate": 2.1791590493601463e-05, + "loss": 1.1204, + "step": 447 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 0.4313768805232582, + "learning_rate": 2.1840341255332116e-05, + "loss": 1.0917, + "step": 448 + }, + { + "epoch": 0.09577645051194539, + "grad_norm": 0.24846006147607283, + "learning_rate": 2.188909201706277e-05, + "loss": 1.0787, + "step": 449 + }, + { + "epoch": 0.09598976109215017, + "grad_norm": 0.3802131292851314, + "learning_rate": 2.193784277879342e-05, + "loss": 1.1472, + "step": 450 + }, + { + "epoch": 0.09620307167235495, + "grad_norm": 0.4453276425400783, + "learning_rate": 2.198659354052407e-05, + "loss": 1.0833, + "step": 451 + }, + { + "epoch": 0.09641638225255973, + "grad_norm": 0.47348404475426376, + "learning_rate": 2.2035344302254728e-05, + "loss": 1.1054, + "step": 452 + }, + { + "epoch": 0.0966296928327645, + "grad_norm": 0.49543399723411774, + "learning_rate": 2.2084095063985374e-05, + "loss": 1.126, + "step": 453 + }, + { + "epoch": 0.09684300341296928, + "grad_norm": 0.4156928038722509, + "learning_rate": 2.213284582571603e-05, + "loss": 1.1402, + "step": 454 + }, + { + "epoch": 0.09705631399317406, + "grad_norm": 0.4006760220803069, + "learning_rate": 2.218159658744668e-05, + "loss": 1.0737, + "step": 455 + }, + { + "epoch": 0.09726962457337884, + "grad_norm": 0.44670801223883483, + "learning_rate": 2.2230347349177333e-05, + "loss": 1.086, + "step": 456 + }, + { + "epoch": 0.09748293515358361, + "grad_norm": 0.3086465865719076, + "learning_rate": 2.2279098110907986e-05, + "loss": 1.127, + "step": 457 + }, + { + "epoch": 0.09769624573378839, + "grad_norm": 0.3282816203237966, + "learning_rate": 2.2327848872638636e-05, + "loss": 1.1234, + "step": 458 + }, + { + "epoch": 0.09790955631399317, + "grad_norm": 0.3400559060413713, + "learning_rate": 2.237659963436929e-05, + "loss": 1.1635, + "step": 459 + }, + { + "epoch": 0.09812286689419795, + "grad_norm": 0.35047511604471265, + "learning_rate": 2.2425350396099942e-05, + "loss": 1.0911, + "step": 460 + }, + { + "epoch": 0.09833617747440274, + "grad_norm": 0.3696956343113795, + "learning_rate": 2.247410115783059e-05, + "loss": 1.1125, + "step": 461 + }, + { + "epoch": 0.0985494880546075, + "grad_norm": 0.34437890783151215, + "learning_rate": 2.2522851919561245e-05, + "loss": 1.1023, + "step": 462 + }, + { + "epoch": 0.09876279863481228, + "grad_norm": 0.34575164303350947, + "learning_rate": 2.2571602681291894e-05, + "loss": 1.1129, + "step": 463 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 0.41476276441554766, + "learning_rate": 2.262035344302255e-05, + "loss": 1.1136, + "step": 464 + }, + { + "epoch": 0.09918941979522185, + "grad_norm": 0.4488526624089836, + "learning_rate": 2.2669104204753204e-05, + "loss": 1.1197, + "step": 465 + }, + { + "epoch": 0.09940273037542662, + "grad_norm": 0.40831211875325063, + "learning_rate": 2.2717854966483853e-05, + "loss": 1.1183, + "step": 466 + }, + { + "epoch": 0.0996160409556314, + "grad_norm": 0.33295012299871457, + "learning_rate": 2.2766605728214506e-05, + "loss": 1.1052, + "step": 467 + }, + { + "epoch": 0.09982935153583618, + "grad_norm": 0.3964128030949336, + "learning_rate": 2.2815356489945156e-05, + "loss": 1.0595, + "step": 468 + }, + { + "epoch": 0.10004266211604096, + "grad_norm": 0.4675914504569668, + "learning_rate": 2.286410725167581e-05, + "loss": 1.1176, + "step": 469 + }, + { + "epoch": 0.10025597269624574, + "grad_norm": 0.4064013571922432, + "learning_rate": 2.2912858013406462e-05, + "loss": 1.1213, + "step": 470 + }, + { + "epoch": 0.10046928327645051, + "grad_norm": 0.3701819279184297, + "learning_rate": 2.296160877513711e-05, + "loss": 1.1099, + "step": 471 + }, + { + "epoch": 0.10068259385665529, + "grad_norm": 0.3415909905706806, + "learning_rate": 2.3010359536867765e-05, + "loss": 1.1401, + "step": 472 + }, + { + "epoch": 0.10089590443686007, + "grad_norm": 0.44410327302940344, + "learning_rate": 2.305911029859842e-05, + "loss": 1.1031, + "step": 473 + }, + { + "epoch": 0.10110921501706485, + "grad_norm": 0.6186631235211668, + "learning_rate": 2.3107861060329067e-05, + "loss": 1.1196, + "step": 474 + }, + { + "epoch": 0.10132252559726962, + "grad_norm": 0.8161164645493059, + "learning_rate": 2.3156611822059724e-05, + "loss": 1.111, + "step": 475 + }, + { + "epoch": 0.1015358361774744, + "grad_norm": 0.9942316544689418, + "learning_rate": 2.3205362583790373e-05, + "loss": 1.1018, + "step": 476 + }, + { + "epoch": 0.10174914675767918, + "grad_norm": 1.029183111844055, + "learning_rate": 2.3254113345521026e-05, + "loss": 1.0917, + "step": 477 + }, + { + "epoch": 0.10196245733788396, + "grad_norm": 0.8392528462768165, + "learning_rate": 2.330286410725168e-05, + "loss": 1.1188, + "step": 478 + }, + { + "epoch": 0.10217576791808873, + "grad_norm": 0.6181893494562333, + "learning_rate": 2.335161486898233e-05, + "loss": 1.1116, + "step": 479 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 0.4434981992987011, + "learning_rate": 2.3400365630712982e-05, + "loss": 1.112, + "step": 480 + }, + { + "epoch": 0.1026023890784983, + "grad_norm": 0.5240537113159602, + "learning_rate": 2.3449116392443635e-05, + "loss": 1.1054, + "step": 481 + }, + { + "epoch": 0.10281569965870307, + "grad_norm": 0.4640705995730811, + "learning_rate": 2.3497867154174285e-05, + "loss": 1.0809, + "step": 482 + }, + { + "epoch": 0.10302901023890786, + "grad_norm": 0.4674851077685055, + "learning_rate": 2.3546617915904938e-05, + "loss": 1.1097, + "step": 483 + }, + { + "epoch": 0.10324232081911262, + "grad_norm": 0.5064415158923862, + "learning_rate": 2.3595368677635587e-05, + "loss": 1.1099, + "step": 484 + }, + { + "epoch": 0.1034556313993174, + "grad_norm": 0.4290728661374739, + "learning_rate": 2.3644119439366244e-05, + "loss": 1.0665, + "step": 485 + }, + { + "epoch": 0.10366894197952219, + "grad_norm": 0.5384416770994964, + "learning_rate": 2.3692870201096897e-05, + "loss": 1.1212, + "step": 486 + }, + { + "epoch": 0.10388225255972697, + "grad_norm": 0.4629912028503603, + "learning_rate": 2.3741620962827546e-05, + "loss": 1.1235, + "step": 487 + }, + { + "epoch": 0.10409556313993173, + "grad_norm": 0.44423399144952597, + "learning_rate": 2.37903717245582e-05, + "loss": 1.1132, + "step": 488 + }, + { + "epoch": 0.10430887372013652, + "grad_norm": 0.46404404736606364, + "learning_rate": 2.383912248628885e-05, + "loss": 1.1348, + "step": 489 + }, + { + "epoch": 0.1045221843003413, + "grad_norm": 0.4888108294730988, + "learning_rate": 2.3887873248019502e-05, + "loss": 1.1078, + "step": 490 + }, + { + "epoch": 0.10473549488054608, + "grad_norm": 0.46335377870253824, + "learning_rate": 2.3936624009750155e-05, + "loss": 1.1452, + "step": 491 + }, + { + "epoch": 0.10494880546075085, + "grad_norm": 0.3888348917022155, + "learning_rate": 2.3985374771480805e-05, + "loss": 1.0853, + "step": 492 + }, + { + "epoch": 0.10516211604095563, + "grad_norm": 0.44777103192698053, + "learning_rate": 2.4034125533211458e-05, + "loss": 1.1005, + "step": 493 + }, + { + "epoch": 0.10537542662116041, + "grad_norm": 0.4255417364961353, + "learning_rate": 2.408287629494211e-05, + "loss": 1.1202, + "step": 494 + }, + { + "epoch": 0.10558873720136519, + "grad_norm": 0.4438742741138847, + "learning_rate": 2.413162705667276e-05, + "loss": 1.0944, + "step": 495 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 0.4605583388466923, + "learning_rate": 2.4180377818403417e-05, + "loss": 1.1318, + "step": 496 + }, + { + "epoch": 0.10601535836177474, + "grad_norm": 0.41318175310887956, + "learning_rate": 2.4229128580134066e-05, + "loss": 1.0869, + "step": 497 + }, + { + "epoch": 0.10622866894197952, + "grad_norm": 0.3984154741376282, + "learning_rate": 2.427787934186472e-05, + "loss": 1.1006, + "step": 498 + }, + { + "epoch": 0.1064419795221843, + "grad_norm": 0.3940246701073218, + "learning_rate": 2.4326630103595372e-05, + "loss": 1.1079, + "step": 499 + }, + { + "epoch": 0.10665529010238908, + "grad_norm": 0.31912814493412156, + "learning_rate": 2.4375380865326022e-05, + "loss": 1.134, + "step": 500 + }, + { + "epoch": 0.10686860068259385, + "grad_norm": 0.3784989153996933, + "learning_rate": 2.4424131627056675e-05, + "loss": 1.1102, + "step": 501 + }, + { + "epoch": 0.10708191126279863, + "grad_norm": 0.462706317111215, + "learning_rate": 2.4472882388787325e-05, + "loss": 1.1143, + "step": 502 + }, + { + "epoch": 0.10729522184300341, + "grad_norm": 0.5031684666108226, + "learning_rate": 2.4521633150517978e-05, + "loss": 1.0813, + "step": 503 + }, + { + "epoch": 0.1075085324232082, + "grad_norm": 0.4475785082659592, + "learning_rate": 2.457038391224863e-05, + "loss": 1.117, + "step": 504 + }, + { + "epoch": 0.10772184300341298, + "grad_norm": 0.4691685437809708, + "learning_rate": 2.461913467397928e-05, + "loss": 1.1103, + "step": 505 + }, + { + "epoch": 0.10793515358361774, + "grad_norm": 0.501490848790564, + "learning_rate": 2.4667885435709933e-05, + "loss": 1.0874, + "step": 506 + }, + { + "epoch": 0.10814846416382252, + "grad_norm": 0.5456507065249636, + "learning_rate": 2.471663619744059e-05, + "loss": 1.112, + "step": 507 + }, + { + "epoch": 0.1083617747440273, + "grad_norm": 0.6512851565030611, + "learning_rate": 2.476538695917124e-05, + "loss": 1.1144, + "step": 508 + }, + { + "epoch": 0.10857508532423209, + "grad_norm": 0.5920373866763805, + "learning_rate": 2.4814137720901892e-05, + "loss": 1.096, + "step": 509 + }, + { + "epoch": 0.10878839590443685, + "grad_norm": 0.657200630408084, + "learning_rate": 2.4862888482632542e-05, + "loss": 1.11, + "step": 510 + }, + { + "epoch": 0.10900170648464164, + "grad_norm": 0.7408747295307635, + "learning_rate": 2.4911639244363195e-05, + "loss": 1.0987, + "step": 511 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 0.8402533517981139, + "learning_rate": 2.4960390006093848e-05, + "loss": 1.1053, + "step": 512 + }, + { + "epoch": 0.1094283276450512, + "grad_norm": 0.9920461271117528, + "learning_rate": 2.5009140767824498e-05, + "loss": 1.0831, + "step": 513 + }, + { + "epoch": 0.10964163822525597, + "grad_norm": 1.2275347678207147, + "learning_rate": 2.505789152955515e-05, + "loss": 1.0832, + "step": 514 + }, + { + "epoch": 0.10985494880546075, + "grad_norm": 0.5553171456972693, + "learning_rate": 2.5106642291285804e-05, + "loss": 1.1246, + "step": 515 + }, + { + "epoch": 0.11006825938566553, + "grad_norm": 0.6053285204018722, + "learning_rate": 2.5155393053016453e-05, + "loss": 1.1133, + "step": 516 + }, + { + "epoch": 0.11028156996587031, + "grad_norm": 1.0963391874116453, + "learning_rate": 2.520414381474711e-05, + "loss": 1.0869, + "step": 517 + }, + { + "epoch": 0.11049488054607509, + "grad_norm": 0.985182227196785, + "learning_rate": 2.5252894576477756e-05, + "loss": 1.1193, + "step": 518 + }, + { + "epoch": 0.11070819112627986, + "grad_norm": 0.7445483009263677, + "learning_rate": 2.5301645338208413e-05, + "loss": 1.1195, + "step": 519 + }, + { + "epoch": 0.11092150170648464, + "grad_norm": 0.5256097803572766, + "learning_rate": 2.5350396099939066e-05, + "loss": 1.0752, + "step": 520 + }, + { + "epoch": 0.11113481228668942, + "grad_norm": 0.5650842893983393, + "learning_rate": 2.5399146861669715e-05, + "loss": 1.0724, + "step": 521 + }, + { + "epoch": 0.1113481228668942, + "grad_norm": 0.5560925882057328, + "learning_rate": 2.5447897623400368e-05, + "loss": 1.0811, + "step": 522 + }, + { + "epoch": 0.11156143344709897, + "grad_norm": 0.6092457198280364, + "learning_rate": 2.5496648385131018e-05, + "loss": 1.0955, + "step": 523 + }, + { + "epoch": 0.11177474402730375, + "grad_norm": 0.6033345824616568, + "learning_rate": 2.554539914686167e-05, + "loss": 1.1081, + "step": 524 + }, + { + "epoch": 0.11198805460750853, + "grad_norm": 0.47591082111199084, + "learning_rate": 2.5594149908592324e-05, + "loss": 1.0937, + "step": 525 + }, + { + "epoch": 0.11220136518771331, + "grad_norm": 0.4434722771054973, + "learning_rate": 2.5642900670322973e-05, + "loss": 1.1263, + "step": 526 + }, + { + "epoch": 0.11241467576791808, + "grad_norm": 0.5217230250491911, + "learning_rate": 2.5691651432053627e-05, + "loss": 1.1211, + "step": 527 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 0.4092053567491736, + "learning_rate": 2.5740402193784283e-05, + "loss": 1.1219, + "step": 528 + }, + { + "epoch": 0.11284129692832764, + "grad_norm": 0.44080075824400977, + "learning_rate": 2.5789152955514933e-05, + "loss": 1.1024, + "step": 529 + }, + { + "epoch": 0.11305460750853243, + "grad_norm": 0.4519179471929369, + "learning_rate": 2.5837903717245586e-05, + "loss": 1.0817, + "step": 530 + }, + { + "epoch": 0.11326791808873721, + "grad_norm": 0.5194829551302332, + "learning_rate": 2.5886654478976235e-05, + "loss": 1.1027, + "step": 531 + }, + { + "epoch": 0.11348122866894197, + "grad_norm": 0.47424147287839363, + "learning_rate": 2.5935405240706888e-05, + "loss": 1.1293, + "step": 532 + }, + { + "epoch": 0.11369453924914676, + "grad_norm": 0.5084072109647452, + "learning_rate": 2.598415600243754e-05, + "loss": 1.1033, + "step": 533 + }, + { + "epoch": 0.11390784982935154, + "grad_norm": 0.5492957075156463, + "learning_rate": 2.603290676416819e-05, + "loss": 1.0794, + "step": 534 + }, + { + "epoch": 0.11412116040955632, + "grad_norm": 0.6040581873959222, + "learning_rate": 2.6081657525898844e-05, + "loss": 1.0805, + "step": 535 + }, + { + "epoch": 0.11433447098976109, + "grad_norm": 0.6475417673980359, + "learning_rate": 2.6130408287629497e-05, + "loss": 1.0698, + "step": 536 + }, + { + "epoch": 0.11454778156996587, + "grad_norm": 0.5766564424387247, + "learning_rate": 2.6179159049360147e-05, + "loss": 1.1223, + "step": 537 + }, + { + "epoch": 0.11476109215017065, + "grad_norm": 0.3468277549068465, + "learning_rate": 2.6227909811090803e-05, + "loss": 1.1291, + "step": 538 + }, + { + "epoch": 0.11497440273037543, + "grad_norm": 0.527636255644222, + "learning_rate": 2.627666057282145e-05, + "loss": 1.081, + "step": 539 + }, + { + "epoch": 0.11518771331058021, + "grad_norm": 0.7380094642958064, + "learning_rate": 2.6325411334552106e-05, + "loss": 1.1112, + "step": 540 + }, + { + "epoch": 0.11540102389078498, + "grad_norm": 0.7896302160112497, + "learning_rate": 2.637416209628276e-05, + "loss": 1.1039, + "step": 541 + }, + { + "epoch": 0.11561433447098976, + "grad_norm": 0.684607723639423, + "learning_rate": 2.6422912858013408e-05, + "loss": 1.1331, + "step": 542 + }, + { + "epoch": 0.11582764505119454, + "grad_norm": 0.4138889747855594, + "learning_rate": 2.647166361974406e-05, + "loss": 1.1321, + "step": 543 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 0.349349330046237, + "learning_rate": 2.652041438147471e-05, + "loss": 1.1202, + "step": 544 + }, + { + "epoch": 0.11625426621160409, + "grad_norm": 0.5154588726545829, + "learning_rate": 2.6569165143205364e-05, + "loss": 1.0812, + "step": 545 + }, + { + "epoch": 0.11646757679180887, + "grad_norm": 0.5896535952880537, + "learning_rate": 2.6617915904936017e-05, + "loss": 1.0975, + "step": 546 + }, + { + "epoch": 0.11668088737201365, + "grad_norm": 0.5314828802565258, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.0805, + "step": 547 + }, + { + "epoch": 0.11689419795221843, + "grad_norm": 0.4421916050971987, + "learning_rate": 2.671541742839732e-05, + "loss": 1.0851, + "step": 548 + }, + { + "epoch": 0.1171075085324232, + "grad_norm": 0.4164431658738949, + "learning_rate": 2.6764168190127976e-05, + "loss": 1.0709, + "step": 549 + }, + { + "epoch": 0.11732081911262798, + "grad_norm": 0.37261463715934046, + "learning_rate": 2.6812918951858626e-05, + "loss": 1.102, + "step": 550 + }, + { + "epoch": 0.11753412969283276, + "grad_norm": 0.3832873687063351, + "learning_rate": 2.686166971358928e-05, + "loss": 1.1118, + "step": 551 + }, + { + "epoch": 0.11774744027303755, + "grad_norm": 0.42207136478649177, + "learning_rate": 2.6910420475319928e-05, + "loss": 1.1007, + "step": 552 + }, + { + "epoch": 0.11796075085324233, + "grad_norm": 0.5162581052869368, + "learning_rate": 2.695917123705058e-05, + "loss": 1.1169, + "step": 553 + }, + { + "epoch": 0.1181740614334471, + "grad_norm": 0.5319608989299144, + "learning_rate": 2.7007921998781234e-05, + "loss": 1.0679, + "step": 554 + }, + { + "epoch": 0.11838737201365188, + "grad_norm": 0.4826047376539658, + "learning_rate": 2.7056672760511884e-05, + "loss": 1.0964, + "step": 555 + }, + { + "epoch": 0.11860068259385666, + "grad_norm": 0.3327949277066272, + "learning_rate": 2.7105423522242537e-05, + "loss": 1.106, + "step": 556 + }, + { + "epoch": 0.11881399317406144, + "grad_norm": 0.2941278836193505, + "learning_rate": 2.715417428397319e-05, + "loss": 1.1172, + "step": 557 + }, + { + "epoch": 0.1190273037542662, + "grad_norm": 0.3251414599639306, + "learning_rate": 2.720292504570384e-05, + "loss": 1.0827, + "step": 558 + }, + { + "epoch": 0.11924061433447099, + "grad_norm": 0.3985650660754043, + "learning_rate": 2.7251675807434496e-05, + "loss": 1.1181, + "step": 559 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 0.4157638243014205, + "learning_rate": 2.7300426569165142e-05, + "loss": 1.0885, + "step": 560 + }, + { + "epoch": 0.11966723549488055, + "grad_norm": 0.4163389706276557, + "learning_rate": 2.73491773308958e-05, + "loss": 1.1376, + "step": 561 + }, + { + "epoch": 0.11988054607508532, + "grad_norm": 0.3471822338993976, + "learning_rate": 2.7397928092626452e-05, + "loss": 1.0686, + "step": 562 + }, + { + "epoch": 0.1200938566552901, + "grad_norm": 0.431920751117417, + "learning_rate": 2.74466788543571e-05, + "loss": 1.1133, + "step": 563 + }, + { + "epoch": 0.12030716723549488, + "grad_norm": 0.5110905669460136, + "learning_rate": 2.7495429616087754e-05, + "loss": 1.1297, + "step": 564 + }, + { + "epoch": 0.12052047781569966, + "grad_norm": 0.42733304597311006, + "learning_rate": 2.7544180377818404e-05, + "loss": 1.0781, + "step": 565 + }, + { + "epoch": 0.12073378839590444, + "grad_norm": 0.5933748461975104, + "learning_rate": 2.7592931139549057e-05, + "loss": 1.1361, + "step": 566 + }, + { + "epoch": 0.12094709897610921, + "grad_norm": 0.5796727894063759, + "learning_rate": 2.764168190127971e-05, + "loss": 1.0933, + "step": 567 + }, + { + "epoch": 0.12116040955631399, + "grad_norm": 0.5302501220115013, + "learning_rate": 2.769043266301036e-05, + "loss": 1.1154, + "step": 568 + }, + { + "epoch": 0.12137372013651877, + "grad_norm": 0.4696573660308258, + "learning_rate": 2.7739183424741013e-05, + "loss": 1.1039, + "step": 569 + }, + { + "epoch": 0.12158703071672355, + "grad_norm": 0.5075438170872163, + "learning_rate": 2.778793418647167e-05, + "loss": 1.0748, + "step": 570 + }, + { + "epoch": 0.12180034129692832, + "grad_norm": 0.5565238638692651, + "learning_rate": 2.783668494820232e-05, + "loss": 1.0999, + "step": 571 + }, + { + "epoch": 0.1220136518771331, + "grad_norm": 0.5651671973856022, + "learning_rate": 2.7885435709932972e-05, + "loss": 1.0772, + "step": 572 + }, + { + "epoch": 0.12222696245733788, + "grad_norm": 0.6144987129915103, + "learning_rate": 2.793418647166362e-05, + "loss": 1.1028, + "step": 573 + }, + { + "epoch": 0.12244027303754267, + "grad_norm": 0.5852288356400535, + "learning_rate": 2.7982937233394274e-05, + "loss": 1.0831, + "step": 574 + }, + { + "epoch": 0.12265358361774745, + "grad_norm": 0.4863230260725257, + "learning_rate": 2.8031687995124927e-05, + "loss": 1.0952, + "step": 575 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 0.444168074903368, + "learning_rate": 2.8080438756855577e-05, + "loss": 1.1263, + "step": 576 + }, + { + "epoch": 0.123080204778157, + "grad_norm": 0.37700043164842645, + "learning_rate": 2.812918951858623e-05, + "loss": 1.1197, + "step": 577 + }, + { + "epoch": 0.12329351535836178, + "grad_norm": 0.37979409588285346, + "learning_rate": 2.8177940280316883e-05, + "loss": 1.0932, + "step": 578 + }, + { + "epoch": 0.12350682593856656, + "grad_norm": 0.35292294815239733, + "learning_rate": 2.8226691042047533e-05, + "loss": 1.062, + "step": 579 + }, + { + "epoch": 0.12372013651877133, + "grad_norm": 0.30483089720057854, + "learning_rate": 2.827544180377819e-05, + "loss": 1.0894, + "step": 580 + }, + { + "epoch": 0.12393344709897611, + "grad_norm": 0.39656184397554123, + "learning_rate": 2.8324192565508835e-05, + "loss": 1.1066, + "step": 581 + }, + { + "epoch": 0.12414675767918089, + "grad_norm": 0.46092842300738496, + "learning_rate": 2.8372943327239492e-05, + "loss": 1.0851, + "step": 582 + }, + { + "epoch": 0.12436006825938567, + "grad_norm": 0.5278444758345902, + "learning_rate": 2.8421694088970145e-05, + "loss": 1.0878, + "step": 583 + }, + { + "epoch": 0.12457337883959044, + "grad_norm": 0.6447588012121451, + "learning_rate": 2.8470444850700794e-05, + "loss": 1.0785, + "step": 584 + }, + { + "epoch": 0.12478668941979522, + "grad_norm": 0.623209577697939, + "learning_rate": 2.8519195612431447e-05, + "loss": 1.0944, + "step": 585 + }, + { + "epoch": 0.125, + "grad_norm": 0.5095012628999064, + "learning_rate": 2.8567946374162097e-05, + "loss": 1.1107, + "step": 586 + }, + { + "epoch": 0.12521331058020477, + "grad_norm": 0.5121739323217577, + "learning_rate": 2.861669713589275e-05, + "loss": 1.1066, + "step": 587 + }, + { + "epoch": 0.12542662116040956, + "grad_norm": 0.584066959291587, + "learning_rate": 2.8665447897623403e-05, + "loss": 1.0843, + "step": 588 + }, + { + "epoch": 0.12563993174061433, + "grad_norm": 0.5680628052913761, + "learning_rate": 2.8714198659354053e-05, + "loss": 1.0823, + "step": 589 + }, + { + "epoch": 0.12585324232081913, + "grad_norm": 0.4615260365911247, + "learning_rate": 2.8762949421084706e-05, + "loss": 1.0723, + "step": 590 + }, + { + "epoch": 0.1260665529010239, + "grad_norm": 0.49635251037493244, + "learning_rate": 2.8811700182815362e-05, + "loss": 1.097, + "step": 591 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 0.4888175860218995, + "learning_rate": 2.8860450944546012e-05, + "loss": 1.0536, + "step": 592 + }, + { + "epoch": 0.12649317406143346, + "grad_norm": 0.5089185462872496, + "learning_rate": 2.8909201706276665e-05, + "loss": 1.0741, + "step": 593 + }, + { + "epoch": 0.12670648464163822, + "grad_norm": 0.5502193116810089, + "learning_rate": 2.8957952468007314e-05, + "loss": 1.0942, + "step": 594 + }, + { + "epoch": 0.126919795221843, + "grad_norm": 0.5280623461683024, + "learning_rate": 2.9006703229737968e-05, + "loss": 1.1184, + "step": 595 + }, + { + "epoch": 0.12713310580204779, + "grad_norm": 0.4596833350192435, + "learning_rate": 2.905545399146862e-05, + "loss": 1.0818, + "step": 596 + }, + { + "epoch": 0.12734641638225255, + "grad_norm": 0.42587871593685017, + "learning_rate": 2.910420475319927e-05, + "loss": 1.0792, + "step": 597 + }, + { + "epoch": 0.12755972696245735, + "grad_norm": 0.46895652102696994, + "learning_rate": 2.9152955514929923e-05, + "loss": 1.0667, + "step": 598 + }, + { + "epoch": 0.12777303754266212, + "grad_norm": 0.428309402015149, + "learning_rate": 2.9201706276660573e-05, + "loss": 1.1075, + "step": 599 + }, + { + "epoch": 0.12798634812286688, + "grad_norm": 0.46332460622440536, + "learning_rate": 2.9250457038391226e-05, + "loss": 1.0711, + "step": 600 + }, + { + "epoch": 0.12819965870307168, + "grad_norm": 0.5503728706035884, + "learning_rate": 2.929920780012188e-05, + "loss": 1.0737, + "step": 601 + }, + { + "epoch": 0.12841296928327645, + "grad_norm": 0.5011752050005094, + "learning_rate": 2.934795856185253e-05, + "loss": 1.0815, + "step": 602 + }, + { + "epoch": 0.12862627986348124, + "grad_norm": 0.4969163781587722, + "learning_rate": 2.9396709323583185e-05, + "loss": 1.1121, + "step": 603 + }, + { + "epoch": 0.128839590443686, + "grad_norm": 0.5364270979428908, + "learning_rate": 2.9445460085313838e-05, + "loss": 1.1091, + "step": 604 + }, + { + "epoch": 0.12905290102389078, + "grad_norm": 0.5837685620435199, + "learning_rate": 2.9494210847044488e-05, + "loss": 1.1152, + "step": 605 + }, + { + "epoch": 0.12926621160409557, + "grad_norm": 0.7045149811441571, + "learning_rate": 2.954296160877514e-05, + "loss": 1.1119, + "step": 606 + }, + { + "epoch": 0.12947952218430034, + "grad_norm": 0.7328777162299238, + "learning_rate": 2.959171237050579e-05, + "loss": 1.0867, + "step": 607 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 0.6652193532564457, + "learning_rate": 2.9640463132236443e-05, + "loss": 1.1036, + "step": 608 + }, + { + "epoch": 0.1299061433447099, + "grad_norm": 0.5642542411698578, + "learning_rate": 2.9689213893967096e-05, + "loss": 1.062, + "step": 609 + }, + { + "epoch": 0.13011945392491467, + "grad_norm": 0.40756457000087276, + "learning_rate": 2.9737964655697746e-05, + "loss": 1.1041, + "step": 610 + }, + { + "epoch": 0.13033276450511946, + "grad_norm": 0.3128103731373788, + "learning_rate": 2.97867154174284e-05, + "loss": 1.1109, + "step": 611 + }, + { + "epoch": 0.13054607508532423, + "grad_norm": 0.4285728711011058, + "learning_rate": 2.9835466179159055e-05, + "loss": 1.1355, + "step": 612 + }, + { + "epoch": 0.130759385665529, + "grad_norm": 0.522205707388807, + "learning_rate": 2.98842169408897e-05, + "loss": 1.0613, + "step": 613 + }, + { + "epoch": 0.1309726962457338, + "grad_norm": 0.5736398496533861, + "learning_rate": 2.9932967702620358e-05, + "loss": 1.118, + "step": 614 + }, + { + "epoch": 0.13118600682593856, + "grad_norm": 0.6048653026461186, + "learning_rate": 2.9981718464351008e-05, + "loss": 1.1038, + "step": 615 + }, + { + "epoch": 0.13139931740614336, + "grad_norm": 0.666553586097891, + "learning_rate": 3.003046922608166e-05, + "loss": 1.0913, + "step": 616 + }, + { + "epoch": 0.13161262798634812, + "grad_norm": 0.735963431700105, + "learning_rate": 3.0079219987812314e-05, + "loss": 1.1135, + "step": 617 + }, + { + "epoch": 0.1318259385665529, + "grad_norm": 0.7426127118336336, + "learning_rate": 3.0127970749542963e-05, + "loss": 1.0798, + "step": 618 + }, + { + "epoch": 0.1320392491467577, + "grad_norm": 0.7353451028906075, + "learning_rate": 3.0176721511273616e-05, + "loss": 1.0907, + "step": 619 + }, + { + "epoch": 0.13225255972696245, + "grad_norm": 0.748064957954645, + "learning_rate": 3.0225472273004266e-05, + "loss": 1.1126, + "step": 620 + }, + { + "epoch": 0.13246587030716722, + "grad_norm": 0.6533419693047269, + "learning_rate": 3.027422303473492e-05, + "loss": 1.0824, + "step": 621 + }, + { + "epoch": 0.13267918088737202, + "grad_norm": 0.5582640644141249, + "learning_rate": 3.0322973796465572e-05, + "loss": 1.083, + "step": 622 + }, + { + "epoch": 0.13289249146757678, + "grad_norm": 0.5935755295116466, + "learning_rate": 3.037172455819622e-05, + "loss": 1.0857, + "step": 623 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 0.4926267782716839, + "learning_rate": 3.0420475319926878e-05, + "loss": 1.1011, + "step": 624 + }, + { + "epoch": 0.13331911262798635, + "grad_norm": 0.36404344477205, + "learning_rate": 3.046922608165753e-05, + "loss": 1.068, + "step": 625 + }, + { + "epoch": 0.13353242320819111, + "grad_norm": 0.3448553192143145, + "learning_rate": 3.051797684338818e-05, + "loss": 1.117, + "step": 626 + }, + { + "epoch": 0.1337457337883959, + "grad_norm": 0.41547265446612053, + "learning_rate": 3.0566727605118834e-05, + "loss": 1.0936, + "step": 627 + }, + { + "epoch": 0.13395904436860068, + "grad_norm": 0.5060078530474861, + "learning_rate": 3.061547836684948e-05, + "loss": 1.1111, + "step": 628 + }, + { + "epoch": 0.13417235494880547, + "grad_norm": 0.5961018994167043, + "learning_rate": 3.066422912858014e-05, + "loss": 1.1104, + "step": 629 + }, + { + "epoch": 0.13438566552901024, + "grad_norm": 0.6870511027156619, + "learning_rate": 3.071297989031079e-05, + "loss": 1.0948, + "step": 630 + }, + { + "epoch": 0.134598976109215, + "grad_norm": 0.8172440048476897, + "learning_rate": 3.076173065204144e-05, + "loss": 1.0756, + "step": 631 + }, + { + "epoch": 0.1348122866894198, + "grad_norm": 0.8958232745140313, + "learning_rate": 3.081048141377209e-05, + "loss": 1.0777, + "step": 632 + }, + { + "epoch": 0.13502559726962457, + "grad_norm": 0.9352994298305213, + "learning_rate": 3.0859232175502745e-05, + "loss": 1.0707, + "step": 633 + }, + { + "epoch": 0.13523890784982937, + "grad_norm": 0.9228567257316848, + "learning_rate": 3.09079829372334e-05, + "loss": 1.0632, + "step": 634 + }, + { + "epoch": 0.13545221843003413, + "grad_norm": 0.7789142244920876, + "learning_rate": 3.095673369896405e-05, + "loss": 1.0677, + "step": 635 + }, + { + "epoch": 0.1356655290102389, + "grad_norm": 0.5745704523565366, + "learning_rate": 3.10054844606947e-05, + "loss": 1.1314, + "step": 636 + }, + { + "epoch": 0.1358788395904437, + "grad_norm": 0.49496837020038903, + "learning_rate": 3.105423522242535e-05, + "loss": 1.0767, + "step": 637 + }, + { + "epoch": 0.13609215017064846, + "grad_norm": 0.5453909383444313, + "learning_rate": 3.1102985984156e-05, + "loss": 1.0816, + "step": 638 + }, + { + "epoch": 0.13630546075085323, + "grad_norm": 0.5446405548463357, + "learning_rate": 3.1151736745886656e-05, + "loss": 1.0963, + "step": 639 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 0.431285122982047, + "learning_rate": 3.120048750761731e-05, + "loss": 1.0893, + "step": 640 + }, + { + "epoch": 0.1367320819112628, + "grad_norm": 0.3332325695261883, + "learning_rate": 3.124923826934796e-05, + "loss": 1.0845, + "step": 641 + }, + { + "epoch": 0.1369453924914676, + "grad_norm": 0.45352245475744496, + "learning_rate": 3.1297989031078615e-05, + "loss": 1.0851, + "step": 642 + }, + { + "epoch": 0.13715870307167236, + "grad_norm": 0.4995711545701483, + "learning_rate": 3.134673979280927e-05, + "loss": 1.11, + "step": 643 + }, + { + "epoch": 0.13737201365187712, + "grad_norm": 0.4276472837382676, + "learning_rate": 3.1395490554539915e-05, + "loss": 1.0795, + "step": 644 + }, + { + "epoch": 0.13758532423208192, + "grad_norm": 0.3837916025549892, + "learning_rate": 3.144424131627057e-05, + "loss": 1.0695, + "step": 645 + }, + { + "epoch": 0.13779863481228669, + "grad_norm": 0.33719732246871337, + "learning_rate": 3.149299207800122e-05, + "loss": 1.1083, + "step": 646 + }, + { + "epoch": 0.13801194539249148, + "grad_norm": 0.37671360012180927, + "learning_rate": 3.1541742839731874e-05, + "loss": 1.0958, + "step": 647 + }, + { + "epoch": 0.13822525597269625, + "grad_norm": 0.42165839325284116, + "learning_rate": 3.159049360146253e-05, + "loss": 1.0975, + "step": 648 + }, + { + "epoch": 0.13843856655290102, + "grad_norm": 0.4426745980025579, + "learning_rate": 3.163924436319317e-05, + "loss": 1.0662, + "step": 649 + }, + { + "epoch": 0.1386518771331058, + "grad_norm": 0.43671869184720097, + "learning_rate": 3.1687995124923826e-05, + "loss": 1.1233, + "step": 650 + }, + { + "epoch": 0.13886518771331058, + "grad_norm": 0.33639368853926366, + "learning_rate": 3.1736745886654486e-05, + "loss": 1.0678, + "step": 651 + }, + { + "epoch": 0.13907849829351535, + "grad_norm": 0.34373749284296595, + "learning_rate": 3.178549664838513e-05, + "loss": 1.0789, + "step": 652 + }, + { + "epoch": 0.13929180887372014, + "grad_norm": 0.36149712836733966, + "learning_rate": 3.1834247410115785e-05, + "loss": 1.0713, + "step": 653 + }, + { + "epoch": 0.1395051194539249, + "grad_norm": 0.452954603720398, + "learning_rate": 3.188299817184644e-05, + "loss": 1.1255, + "step": 654 + }, + { + "epoch": 0.1397184300341297, + "grad_norm": 0.6507244933793764, + "learning_rate": 3.193174893357709e-05, + "loss": 1.1168, + "step": 655 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 0.7704346526931665, + "learning_rate": 3.1980499695307744e-05, + "loss": 1.0707, + "step": 656 + }, + { + "epoch": 0.14014505119453924, + "grad_norm": 0.8787085436085401, + "learning_rate": 3.202925045703839e-05, + "loss": 1.076, + "step": 657 + }, + { + "epoch": 0.14035836177474403, + "grad_norm": 1.011984263095825, + "learning_rate": 3.207800121876904e-05, + "loss": 1.0974, + "step": 658 + }, + { + "epoch": 0.1405716723549488, + "grad_norm": 1.0953060083899258, + "learning_rate": 3.2126751980499696e-05, + "loss": 1.0746, + "step": 659 + }, + { + "epoch": 0.1407849829351536, + "grad_norm": 0.8557179511407021, + "learning_rate": 3.217550274223035e-05, + "loss": 1.1019, + "step": 660 + }, + { + "epoch": 0.14099829351535836, + "grad_norm": 0.5983472713449829, + "learning_rate": 3.2224253503961e-05, + "loss": 1.0628, + "step": 661 + }, + { + "epoch": 0.14121160409556313, + "grad_norm": 0.49590339829763314, + "learning_rate": 3.227300426569165e-05, + "loss": 1.0788, + "step": 662 + }, + { + "epoch": 0.14142491467576793, + "grad_norm": 0.6934225339900432, + "learning_rate": 3.232175502742231e-05, + "loss": 1.0862, + "step": 663 + }, + { + "epoch": 0.1416382252559727, + "grad_norm": 0.9562479121287689, + "learning_rate": 3.237050578915296e-05, + "loss": 1.0518, + "step": 664 + }, + { + "epoch": 0.14185153583617746, + "grad_norm": 1.009782150489015, + "learning_rate": 3.241925655088361e-05, + "loss": 1.0977, + "step": 665 + }, + { + "epoch": 0.14206484641638226, + "grad_norm": 0.8727214529537581, + "learning_rate": 3.246800731261426e-05, + "loss": 1.0932, + "step": 666 + }, + { + "epoch": 0.14227815699658702, + "grad_norm": 0.7069173578567359, + "learning_rate": 3.2516758074344914e-05, + "loss": 1.0928, + "step": 667 + }, + { + "epoch": 0.14249146757679182, + "grad_norm": 0.5514502531661958, + "learning_rate": 3.256550883607557e-05, + "loss": 1.0877, + "step": 668 + }, + { + "epoch": 0.1427047781569966, + "grad_norm": 0.5006364852774016, + "learning_rate": 3.261425959780622e-05, + "loss": 1.0763, + "step": 669 + }, + { + "epoch": 0.14291808873720135, + "grad_norm": 0.5846619367992926, + "learning_rate": 3.2663010359536866e-05, + "loss": 1.0587, + "step": 670 + }, + { + "epoch": 0.14313139931740615, + "grad_norm": 0.6261970032862956, + "learning_rate": 3.271176112126752e-05, + "loss": 1.0994, + "step": 671 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": 0.6157820763512434, + "learning_rate": 3.276051188299818e-05, + "loss": 1.0974, + "step": 672 + }, + { + "epoch": 0.1435580204778157, + "grad_norm": 0.48144909379440576, + "learning_rate": 3.2809262644728825e-05, + "loss": 1.092, + "step": 673 + }, + { + "epoch": 0.14377133105802048, + "grad_norm": 0.4376472899403462, + "learning_rate": 3.285801340645948e-05, + "loss": 1.1035, + "step": 674 + }, + { + "epoch": 0.14398464163822525, + "grad_norm": 0.4708941561580069, + "learning_rate": 3.290676416819013e-05, + "loss": 1.0948, + "step": 675 + }, + { + "epoch": 0.14419795221843004, + "grad_norm": 0.5455425570407003, + "learning_rate": 3.2955514929920784e-05, + "loss": 1.1102, + "step": 676 + }, + { + "epoch": 0.1444112627986348, + "grad_norm": 0.4658883883579199, + "learning_rate": 3.300426569165144e-05, + "loss": 1.0987, + "step": 677 + }, + { + "epoch": 0.14462457337883958, + "grad_norm": 0.5543611046406184, + "learning_rate": 3.3053016453382083e-05, + "loss": 1.058, + "step": 678 + }, + { + "epoch": 0.14483788395904437, + "grad_norm": 0.5559301649856287, + "learning_rate": 3.3101767215112736e-05, + "loss": 1.0887, + "step": 679 + }, + { + "epoch": 0.14505119453924914, + "grad_norm": 0.5879565709029223, + "learning_rate": 3.315051797684339e-05, + "loss": 1.0943, + "step": 680 + }, + { + "epoch": 0.14526450511945393, + "grad_norm": 0.6503864949555986, + "learning_rate": 3.319926873857404e-05, + "loss": 1.0813, + "step": 681 + }, + { + "epoch": 0.1454778156996587, + "grad_norm": 0.6225228578059359, + "learning_rate": 3.3248019500304696e-05, + "loss": 1.1032, + "step": 682 + }, + { + "epoch": 0.14569112627986347, + "grad_norm": 0.6224717924800379, + "learning_rate": 3.329677026203534e-05, + "loss": 1.0791, + "step": 683 + }, + { + "epoch": 0.14590443686006827, + "grad_norm": 0.5474521737351457, + "learning_rate": 3.3345521023766e-05, + "loss": 1.0895, + "step": 684 + }, + { + "epoch": 0.14611774744027303, + "grad_norm": 0.42488483230685703, + "learning_rate": 3.3394271785496655e-05, + "loss": 1.1028, + "step": 685 + }, + { + "epoch": 0.14633105802047783, + "grad_norm": 0.3805826466413442, + "learning_rate": 3.34430225472273e-05, + "loss": 1.1081, + "step": 686 + }, + { + "epoch": 0.1465443686006826, + "grad_norm": 0.41842862006729425, + "learning_rate": 3.3491773308957954e-05, + "loss": 1.0437, + "step": 687 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 0.46657340780093387, + "learning_rate": 3.354052407068861e-05, + "loss": 1.0816, + "step": 688 + }, + { + "epoch": 0.14697098976109216, + "grad_norm": 0.4395172150567953, + "learning_rate": 3.358927483241926e-05, + "loss": 1.0887, + "step": 689 + }, + { + "epoch": 0.14718430034129693, + "grad_norm": 0.36545713120099826, + "learning_rate": 3.363802559414991e-05, + "loss": 1.1118, + "step": 690 + }, + { + "epoch": 0.1473976109215017, + "grad_norm": 0.3639795335561904, + "learning_rate": 3.368677635588056e-05, + "loss": 1.0756, + "step": 691 + }, + { + "epoch": 0.1476109215017065, + "grad_norm": 0.4211339647816553, + "learning_rate": 3.373552711761121e-05, + "loss": 1.1086, + "step": 692 + }, + { + "epoch": 0.14782423208191126, + "grad_norm": 0.5115450902070497, + "learning_rate": 3.378427787934187e-05, + "loss": 1.0615, + "step": 693 + }, + { + "epoch": 0.14803754266211605, + "grad_norm": 0.5556851936404191, + "learning_rate": 3.383302864107252e-05, + "loss": 1.0919, + "step": 694 + }, + { + "epoch": 0.14825085324232082, + "grad_norm": 0.5996747379378153, + "learning_rate": 3.388177940280317e-05, + "loss": 1.0914, + "step": 695 + }, + { + "epoch": 0.14846416382252559, + "grad_norm": 0.5500104508550911, + "learning_rate": 3.3930530164533824e-05, + "loss": 1.0604, + "step": 696 + }, + { + "epoch": 0.14867747440273038, + "grad_norm": 0.48372163414143227, + "learning_rate": 3.397928092626448e-05, + "loss": 1.1061, + "step": 697 + }, + { + "epoch": 0.14889078498293515, + "grad_norm": 0.47088022461758317, + "learning_rate": 3.402803168799513e-05, + "loss": 1.0835, + "step": 698 + }, + { + "epoch": 0.14910409556313994, + "grad_norm": 0.5327440154728011, + "learning_rate": 3.4076782449725777e-05, + "loss": 1.0525, + "step": 699 + }, + { + "epoch": 0.1493174061433447, + "grad_norm": 0.6570117141418617, + "learning_rate": 3.412553321145643e-05, + "loss": 1.0983, + "step": 700 + }, + { + "epoch": 0.14953071672354948, + "grad_norm": 0.7740106456598679, + "learning_rate": 3.417428397318708e-05, + "loss": 1.0704, + "step": 701 + }, + { + "epoch": 0.14974402730375427, + "grad_norm": 0.8764978201110313, + "learning_rate": 3.4223034734917736e-05, + "loss": 1.0822, + "step": 702 + }, + { + "epoch": 0.14995733788395904, + "grad_norm": 0.8861934786915853, + "learning_rate": 3.427178549664839e-05, + "loss": 1.1105, + "step": 703 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 0.9888563421834652, + "learning_rate": 3.4320536258379035e-05, + "loss": 1.0816, + "step": 704 + }, + { + "epoch": 0.1503839590443686, + "grad_norm": 1.0199833366956486, + "learning_rate": 3.4369287020109695e-05, + "loss": 1.099, + "step": 705 + }, + { + "epoch": 0.15059726962457337, + "grad_norm": 0.8917026997448313, + "learning_rate": 3.441803778184035e-05, + "loss": 1.0666, + "step": 706 + }, + { + "epoch": 0.15081058020477817, + "grad_norm": 0.7982191142025228, + "learning_rate": 3.4466788543570994e-05, + "loss": 1.0878, + "step": 707 + }, + { + "epoch": 0.15102389078498293, + "grad_norm": 0.8139689553018693, + "learning_rate": 3.451553930530165e-05, + "loss": 1.0739, + "step": 708 + }, + { + "epoch": 0.1512372013651877, + "grad_norm": 0.6815457916006251, + "learning_rate": 3.45642900670323e-05, + "loss": 1.1038, + "step": 709 + }, + { + "epoch": 0.1514505119453925, + "grad_norm": 0.5258370716735588, + "learning_rate": 3.461304082876295e-05, + "loss": 1.0743, + "step": 710 + }, + { + "epoch": 0.15166382252559726, + "grad_norm": 0.5203977724827681, + "learning_rate": 3.4661791590493606e-05, + "loss": 1.1038, + "step": 711 + }, + { + "epoch": 0.15187713310580206, + "grad_norm": 0.6448334135233473, + "learning_rate": 3.471054235222425e-05, + "loss": 1.0875, + "step": 712 + }, + { + "epoch": 0.15209044368600683, + "grad_norm": 0.7289944276909184, + "learning_rate": 3.4759293113954905e-05, + "loss": 1.0674, + "step": 713 + }, + { + "epoch": 0.1523037542662116, + "grad_norm": 0.6403248577748036, + "learning_rate": 3.4808043875685565e-05, + "loss": 1.0661, + "step": 714 + }, + { + "epoch": 0.1525170648464164, + "grad_norm": 0.497509306548104, + "learning_rate": 3.485679463741621e-05, + "loss": 1.0801, + "step": 715 + }, + { + "epoch": 0.15273037542662116, + "grad_norm": 0.7123107227954263, + "learning_rate": 3.4905545399146864e-05, + "loss": 1.0975, + "step": 716 + }, + { + "epoch": 0.15294368600682595, + "grad_norm": 0.6786685624732779, + "learning_rate": 3.495429616087752e-05, + "loss": 1.1053, + "step": 717 + }, + { + "epoch": 0.15315699658703072, + "grad_norm": 0.5906405555759844, + "learning_rate": 3.500304692260817e-05, + "loss": 1.0679, + "step": 718 + }, + { + "epoch": 0.1533703071672355, + "grad_norm": 0.5354008594236005, + "learning_rate": 3.5051797684338823e-05, + "loss": 1.0818, + "step": 719 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 0.4911660850762362, + "learning_rate": 3.510054844606947e-05, + "loss": 1.0722, + "step": 720 + }, + { + "epoch": 0.15379692832764505, + "grad_norm": 0.43209681046828813, + "learning_rate": 3.514929920780012e-05, + "loss": 1.0892, + "step": 721 + }, + { + "epoch": 0.15401023890784982, + "grad_norm": 0.35219450535113395, + "learning_rate": 3.5198049969530776e-05, + "loss": 1.0776, + "step": 722 + }, + { + "epoch": 0.1542235494880546, + "grad_norm": 0.5040494304404304, + "learning_rate": 3.524680073126143e-05, + "loss": 1.0528, + "step": 723 + }, + { + "epoch": 0.15443686006825938, + "grad_norm": 0.6113018643081751, + "learning_rate": 3.529555149299208e-05, + "loss": 1.066, + "step": 724 + }, + { + "epoch": 0.15465017064846417, + "grad_norm": 0.5901934646000287, + "learning_rate": 3.534430225472273e-05, + "loss": 1.0829, + "step": 725 + }, + { + "epoch": 0.15486348122866894, + "grad_norm": 0.571269997580845, + "learning_rate": 3.539305301645339e-05, + "loss": 1.1275, + "step": 726 + }, + { + "epoch": 0.1550767918088737, + "grad_norm": 0.5942761101071677, + "learning_rate": 3.544180377818404e-05, + "loss": 1.0829, + "step": 727 + }, + { + "epoch": 0.1552901023890785, + "grad_norm": 0.5877360874500142, + "learning_rate": 3.549055453991469e-05, + "loss": 1.0758, + "step": 728 + }, + { + "epoch": 0.15550341296928327, + "grad_norm": 0.5804612183318224, + "learning_rate": 3.553930530164534e-05, + "loss": 1.0749, + "step": 729 + }, + { + "epoch": 0.15571672354948807, + "grad_norm": 0.6521541212370563, + "learning_rate": 3.558805606337599e-05, + "loss": 1.0712, + "step": 730 + }, + { + "epoch": 0.15593003412969283, + "grad_norm": 0.746707633117643, + "learning_rate": 3.5636806825106646e-05, + "loss": 1.0817, + "step": 731 + }, + { + "epoch": 0.1561433447098976, + "grad_norm": 0.8162977643509968, + "learning_rate": 3.56855575868373e-05, + "loss": 1.1118, + "step": 732 + }, + { + "epoch": 0.1563566552901024, + "grad_norm": 0.7953172183949454, + "learning_rate": 3.5734308348567945e-05, + "loss": 1.0786, + "step": 733 + }, + { + "epoch": 0.15656996587030717, + "grad_norm": 0.6396703673173548, + "learning_rate": 3.57830591102986e-05, + "loss": 1.0449, + "step": 734 + }, + { + "epoch": 0.15678327645051193, + "grad_norm": 0.43797187353790723, + "learning_rate": 3.583180987202926e-05, + "loss": 1.0534, + "step": 735 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 0.6586923754329126, + "learning_rate": 3.5880560633759904e-05, + "loss": 1.0515, + "step": 736 + }, + { + "epoch": 0.1572098976109215, + "grad_norm": 0.8193004067892995, + "learning_rate": 3.592931139549056e-05, + "loss": 1.067, + "step": 737 + }, + { + "epoch": 0.1574232081911263, + "grad_norm": 0.8477830192506561, + "learning_rate": 3.597806215722121e-05, + "loss": 1.0697, + "step": 738 + }, + { + "epoch": 0.15763651877133106, + "grad_norm": 0.720698483427188, + "learning_rate": 3.6026812918951863e-05, + "loss": 1.0939, + "step": 739 + }, + { + "epoch": 0.15784982935153583, + "grad_norm": 0.5715496272889603, + "learning_rate": 3.6075563680682516e-05, + "loss": 1.0976, + "step": 740 + }, + { + "epoch": 0.15806313993174062, + "grad_norm": 0.5093357338741221, + "learning_rate": 3.612431444241316e-05, + "loss": 1.0654, + "step": 741 + }, + { + "epoch": 0.1582764505119454, + "grad_norm": 0.5608555291732066, + "learning_rate": 3.6173065204143816e-05, + "loss": 1.0588, + "step": 742 + }, + { + "epoch": 0.15848976109215018, + "grad_norm": 0.704233240743737, + "learning_rate": 3.622181596587447e-05, + "loss": 1.0662, + "step": 743 + }, + { + "epoch": 0.15870307167235495, + "grad_norm": 0.6933421890719679, + "learning_rate": 3.627056672760512e-05, + "loss": 1.0712, + "step": 744 + }, + { + "epoch": 0.15891638225255972, + "grad_norm": 0.6545094738849405, + "learning_rate": 3.6319317489335775e-05, + "loss": 1.0738, + "step": 745 + }, + { + "epoch": 0.1591296928327645, + "grad_norm": 0.5943087961074439, + "learning_rate": 3.636806825106642e-05, + "loss": 1.0453, + "step": 746 + }, + { + "epoch": 0.15934300341296928, + "grad_norm": 0.4973951602875992, + "learning_rate": 3.641681901279708e-05, + "loss": 1.1022, + "step": 747 + }, + { + "epoch": 0.15955631399317405, + "grad_norm": 0.4053145678430064, + "learning_rate": 3.6465569774527734e-05, + "loss": 1.1075, + "step": 748 + }, + { + "epoch": 0.15976962457337884, + "grad_norm": 0.4533007653178857, + "learning_rate": 3.651432053625838e-05, + "loss": 1.1111, + "step": 749 + }, + { + "epoch": 0.1599829351535836, + "grad_norm": 0.4455159096525701, + "learning_rate": 3.656307129798903e-05, + "loss": 1.0759, + "step": 750 + }, + { + "epoch": 0.1601962457337884, + "grad_norm": 0.31477651356655145, + "learning_rate": 3.6611822059719686e-05, + "loss": 1.066, + "step": 751 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 0.47602239111321876, + "learning_rate": 3.666057282145034e-05, + "loss": 1.1306, + "step": 752 + }, + { + "epoch": 0.16062286689419794, + "grad_norm": 0.5217611476806718, + "learning_rate": 3.670932358318099e-05, + "loss": 1.091, + "step": 753 + }, + { + "epoch": 0.16083617747440274, + "grad_norm": 0.46415210008494073, + "learning_rate": 3.675807434491164e-05, + "loss": 1.0683, + "step": 754 + }, + { + "epoch": 0.1610494880546075, + "grad_norm": 0.4626005629617466, + "learning_rate": 3.680682510664229e-05, + "loss": 1.071, + "step": 755 + }, + { + "epoch": 0.1612627986348123, + "grad_norm": 0.5118047573753391, + "learning_rate": 3.685557586837295e-05, + "loss": 1.0694, + "step": 756 + }, + { + "epoch": 0.16147610921501707, + "grad_norm": 0.6376447299765144, + "learning_rate": 3.69043266301036e-05, + "loss": 1.1132, + "step": 757 + }, + { + "epoch": 0.16168941979522183, + "grad_norm": 0.6461440204299982, + "learning_rate": 3.695307739183425e-05, + "loss": 1.0466, + "step": 758 + }, + { + "epoch": 0.16190273037542663, + "grad_norm": 0.6558778221256095, + "learning_rate": 3.7001828153564904e-05, + "loss": 1.0679, + "step": 759 + }, + { + "epoch": 0.1621160409556314, + "grad_norm": 0.7049752317721177, + "learning_rate": 3.7050578915295557e-05, + "loss": 1.0613, + "step": 760 + }, + { + "epoch": 0.16232935153583616, + "grad_norm": 0.6505284654594408, + "learning_rate": 3.709932967702621e-05, + "loss": 1.091, + "step": 761 + }, + { + "epoch": 0.16254266211604096, + "grad_norm": 0.5952986124674025, + "learning_rate": 3.7148080438756856e-05, + "loss": 1.0651, + "step": 762 + }, + { + "epoch": 0.16275597269624573, + "grad_norm": 0.6127577269681912, + "learning_rate": 3.719683120048751e-05, + "loss": 1.1071, + "step": 763 + }, + { + "epoch": 0.16296928327645052, + "grad_norm": 0.582377814926791, + "learning_rate": 3.724558196221816e-05, + "loss": 1.0684, + "step": 764 + }, + { + "epoch": 0.1631825938566553, + "grad_norm": 0.5587226079832958, + "learning_rate": 3.7294332723948815e-05, + "loss": 1.1069, + "step": 765 + }, + { + "epoch": 0.16339590443686006, + "grad_norm": 0.5375903748462174, + "learning_rate": 3.734308348567947e-05, + "loss": 1.0902, + "step": 766 + }, + { + "epoch": 0.16360921501706485, + "grad_norm": 0.5420092510464712, + "learning_rate": 3.7391834247410114e-05, + "loss": 1.0921, + "step": 767 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 0.6065710826988953, + "learning_rate": 3.7440585009140774e-05, + "loss": 1.1037, + "step": 768 + }, + { + "epoch": 0.16403583617747441, + "grad_norm": 0.5148981644013875, + "learning_rate": 3.748933577087143e-05, + "loss": 1.0423, + "step": 769 + }, + { + "epoch": 0.16424914675767918, + "grad_norm": 0.6493549370228356, + "learning_rate": 3.753808653260207e-05, + "loss": 1.0855, + "step": 770 + }, + { + "epoch": 0.16446245733788395, + "grad_norm": 0.852380398582728, + "learning_rate": 3.7586837294332726e-05, + "loss": 1.0475, + "step": 771 + }, + { + "epoch": 0.16467576791808874, + "grad_norm": 0.8037440204399025, + "learning_rate": 3.763558805606338e-05, + "loss": 1.0782, + "step": 772 + }, + { + "epoch": 0.1648890784982935, + "grad_norm": 0.7071971320052141, + "learning_rate": 3.768433881779403e-05, + "loss": 1.0719, + "step": 773 + }, + { + "epoch": 0.1651023890784983, + "grad_norm": 0.7321795965565571, + "learning_rate": 3.7733089579524685e-05, + "loss": 1.0664, + "step": 774 + }, + { + "epoch": 0.16531569965870307, + "grad_norm": 0.7220410829354813, + "learning_rate": 3.778184034125533e-05, + "loss": 1.0753, + "step": 775 + }, + { + "epoch": 0.16552901023890784, + "grad_norm": 0.5163689712470992, + "learning_rate": 3.7830591102985985e-05, + "loss": 1.0873, + "step": 776 + }, + { + "epoch": 0.16574232081911264, + "grad_norm": 0.46682100383107195, + "learning_rate": 3.7879341864716644e-05, + "loss": 1.0207, + "step": 777 + }, + { + "epoch": 0.1659556313993174, + "grad_norm": 0.44913015507417636, + "learning_rate": 3.792809262644729e-05, + "loss": 1.0842, + "step": 778 + }, + { + "epoch": 0.16616894197952217, + "grad_norm": 0.4494154026917198, + "learning_rate": 3.7976843388177944e-05, + "loss": 1.098, + "step": 779 + }, + { + "epoch": 0.16638225255972697, + "grad_norm": 0.4516690714576684, + "learning_rate": 3.80255941499086e-05, + "loss": 1.0482, + "step": 780 + }, + { + "epoch": 0.16659556313993173, + "grad_norm": 0.5100691470641486, + "learning_rate": 3.807434491163925e-05, + "loss": 1.0763, + "step": 781 + }, + { + "epoch": 0.16680887372013653, + "grad_norm": 0.5162532369489226, + "learning_rate": 3.81230956733699e-05, + "loss": 1.0902, + "step": 782 + }, + { + "epoch": 0.1670221843003413, + "grad_norm": 0.48407732877488463, + "learning_rate": 3.817184643510055e-05, + "loss": 1.0578, + "step": 783 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 0.5686139988567218, + "learning_rate": 3.82205971968312e-05, + "loss": 1.0634, + "step": 784 + }, + { + "epoch": 0.16744880546075086, + "grad_norm": 0.6814825610585948, + "learning_rate": 3.8269347958561855e-05, + "loss": 1.0878, + "step": 785 + }, + { + "epoch": 0.16766211604095563, + "grad_norm": 0.8651792379303521, + "learning_rate": 3.831809872029251e-05, + "loss": 1.0743, + "step": 786 + }, + { + "epoch": 0.16787542662116042, + "grad_norm": 0.9210523213400074, + "learning_rate": 3.836684948202316e-05, + "loss": 1.0545, + "step": 787 + }, + { + "epoch": 0.1680887372013652, + "grad_norm": 0.8471410592018486, + "learning_rate": 3.841560024375381e-05, + "loss": 1.0797, + "step": 788 + }, + { + "epoch": 0.16830204778156996, + "grad_norm": 0.7076945507302368, + "learning_rate": 3.846435100548447e-05, + "loss": 1.0839, + "step": 789 + }, + { + "epoch": 0.16851535836177475, + "grad_norm": 0.47186788862146345, + "learning_rate": 3.851310176721512e-05, + "loss": 1.0647, + "step": 790 + }, + { + "epoch": 0.16872866894197952, + "grad_norm": 0.48441368356229025, + "learning_rate": 3.8561852528945766e-05, + "loss": 1.0863, + "step": 791 + }, + { + "epoch": 0.1689419795221843, + "grad_norm": 0.5129537993356702, + "learning_rate": 3.861060329067642e-05, + "loss": 1.0671, + "step": 792 + }, + { + "epoch": 0.16915529010238908, + "grad_norm": 0.7220309146016077, + "learning_rate": 3.865935405240707e-05, + "loss": 1.0737, + "step": 793 + }, + { + "epoch": 0.16936860068259385, + "grad_norm": 0.8649233807658457, + "learning_rate": 3.8708104814137725e-05, + "loss": 1.0938, + "step": 794 + }, + { + "epoch": 0.16958191126279865, + "grad_norm": 0.8603026939763951, + "learning_rate": 3.875685557586838e-05, + "loss": 1.123, + "step": 795 + }, + { + "epoch": 0.1697952218430034, + "grad_norm": 0.7314056373007055, + "learning_rate": 3.8805606337599025e-05, + "loss": 1.0664, + "step": 796 + }, + { + "epoch": 0.17000853242320818, + "grad_norm": 0.5362324113655372, + "learning_rate": 3.885435709932968e-05, + "loss": 1.0729, + "step": 797 + }, + { + "epoch": 0.17022184300341298, + "grad_norm": 0.4137311703771168, + "learning_rate": 3.890310786106034e-05, + "loss": 1.0628, + "step": 798 + }, + { + "epoch": 0.17043515358361774, + "grad_norm": 0.39895559407509157, + "learning_rate": 3.8951858622790984e-05, + "loss": 1.0857, + "step": 799 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 0.5582593704316904, + "learning_rate": 3.900060938452164e-05, + "loss": 1.0642, + "step": 800 + }, + { + "epoch": 0.1708617747440273, + "grad_norm": 0.7919432594424013, + "learning_rate": 3.904936014625229e-05, + "loss": 1.0651, + "step": 801 + }, + { + "epoch": 0.17107508532423207, + "grad_norm": 0.9721683310817844, + "learning_rate": 3.909811090798294e-05, + "loss": 1.0948, + "step": 802 + }, + { + "epoch": 0.17128839590443687, + "grad_norm": 1.0759338477986138, + "learning_rate": 3.9146861669713596e-05, + "loss": 1.0655, + "step": 803 + }, + { + "epoch": 0.17150170648464164, + "grad_norm": 0.698295536249421, + "learning_rate": 3.919561243144424e-05, + "loss": 1.0774, + "step": 804 + }, + { + "epoch": 0.1717150170648464, + "grad_norm": 0.42040065836645335, + "learning_rate": 3.9244363193174895e-05, + "loss": 1.0702, + "step": 805 + }, + { + "epoch": 0.1719283276450512, + "grad_norm": 0.5839875100085201, + "learning_rate": 3.929311395490555e-05, + "loss": 1.084, + "step": 806 + }, + { + "epoch": 0.17214163822525597, + "grad_norm": 0.899718313630807, + "learning_rate": 3.93418647166362e-05, + "loss": 1.0617, + "step": 807 + }, + { + "epoch": 0.17235494880546076, + "grad_norm": 1.0648191228892434, + "learning_rate": 3.9390615478366854e-05, + "loss": 1.0578, + "step": 808 + }, + { + "epoch": 0.17256825938566553, + "grad_norm": 0.8214472000273725, + "learning_rate": 3.94393662400975e-05, + "loss": 1.1064, + "step": 809 + }, + { + "epoch": 0.1727815699658703, + "grad_norm": 0.5422903223912993, + "learning_rate": 3.948811700182816e-05, + "loss": 1.0617, + "step": 810 + }, + { + "epoch": 0.1729948805460751, + "grad_norm": 0.5831092213363263, + "learning_rate": 3.953686776355881e-05, + "loss": 1.0265, + "step": 811 + }, + { + "epoch": 0.17320819112627986, + "grad_norm": 0.7642727326353689, + "learning_rate": 3.958561852528946e-05, + "loss": 1.0999, + "step": 812 + }, + { + "epoch": 0.17342150170648465, + "grad_norm": 0.8119140955620074, + "learning_rate": 3.963436928702011e-05, + "loss": 1.0662, + "step": 813 + }, + { + "epoch": 0.17363481228668942, + "grad_norm": 0.6192307959908305, + "learning_rate": 3.9683120048750765e-05, + "loss": 1.081, + "step": 814 + }, + { + "epoch": 0.1738481228668942, + "grad_norm": 0.40991231527171834, + "learning_rate": 3.973187081048142e-05, + "loss": 1.0927, + "step": 815 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 0.6081295157907491, + "learning_rate": 3.978062157221207e-05, + "loss": 1.0437, + "step": 816 + }, + { + "epoch": 0.17427474402730375, + "grad_norm": 0.7915677355833888, + "learning_rate": 3.982937233394272e-05, + "loss": 1.0782, + "step": 817 + }, + { + "epoch": 0.17448805460750852, + "grad_norm": 0.7237519144237891, + "learning_rate": 3.987812309567337e-05, + "loss": 1.0521, + "step": 818 + }, + { + "epoch": 0.17470136518771331, + "grad_norm": 0.5906766375428689, + "learning_rate": 3.9926873857404024e-05, + "loss": 1.0786, + "step": 819 + }, + { + "epoch": 0.17491467576791808, + "grad_norm": 0.5480427125165082, + "learning_rate": 3.997562461913468e-05, + "loss": 1.066, + "step": 820 + }, + { + "epoch": 0.17512798634812288, + "grad_norm": 0.5401371025932201, + "learning_rate": 4.002437538086532e-05, + "loss": 1.0911, + "step": 821 + }, + { + "epoch": 0.17534129692832764, + "grad_norm": 0.5343189550606542, + "learning_rate": 4.007312614259598e-05, + "loss": 1.049, + "step": 822 + }, + { + "epoch": 0.1755546075085324, + "grad_norm": 0.532238767856913, + "learning_rate": 4.0121876904326636e-05, + "loss": 1.0916, + "step": 823 + }, + { + "epoch": 0.1757679180887372, + "grad_norm": 0.5169963460831898, + "learning_rate": 4.017062766605729e-05, + "loss": 1.0816, + "step": 824 + }, + { + "epoch": 0.17598122866894197, + "grad_norm": 0.4808592331547615, + "learning_rate": 4.021937842778794e-05, + "loss": 1.0664, + "step": 825 + }, + { + "epoch": 0.17619453924914677, + "grad_norm": 0.5446787019909566, + "learning_rate": 4.026812918951859e-05, + "loss": 1.0615, + "step": 826 + }, + { + "epoch": 0.17640784982935154, + "grad_norm": 0.4699473942112424, + "learning_rate": 4.031687995124924e-05, + "loss": 1.0402, + "step": 827 + }, + { + "epoch": 0.1766211604095563, + "grad_norm": 0.40459249183641327, + "learning_rate": 4.0365630712979894e-05, + "loss": 1.0497, + "step": 828 + }, + { + "epoch": 0.1768344709897611, + "grad_norm": 0.4466318423387796, + "learning_rate": 4.041438147471055e-05, + "loss": 1.0863, + "step": 829 + }, + { + "epoch": 0.17704778156996587, + "grad_norm": 0.5651450881029338, + "learning_rate": 4.04631322364412e-05, + "loss": 1.0828, + "step": 830 + }, + { + "epoch": 0.17726109215017063, + "grad_norm": 0.6967590716986148, + "learning_rate": 4.0511882998171846e-05, + "loss": 1.0865, + "step": 831 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 0.8149559077496147, + "learning_rate": 4.05606337599025e-05, + "loss": 1.0916, + "step": 832 + }, + { + "epoch": 0.1776877133105802, + "grad_norm": 0.7470925331759264, + "learning_rate": 4.060938452163315e-05, + "loss": 1.0686, + "step": 833 + }, + { + "epoch": 0.177901023890785, + "grad_norm": 0.5743952990036156, + "learning_rate": 4.0658135283363805e-05, + "loss": 1.068, + "step": 834 + }, + { + "epoch": 0.17811433447098976, + "grad_norm": 0.49874351608860257, + "learning_rate": 4.070688604509446e-05, + "loss": 1.0448, + "step": 835 + }, + { + "epoch": 0.17832764505119453, + "grad_norm": 0.6617066077201995, + "learning_rate": 4.075563680682511e-05, + "loss": 1.0499, + "step": 836 + }, + { + "epoch": 0.17854095563139932, + "grad_norm": 0.5829638030054316, + "learning_rate": 4.080438756855576e-05, + "loss": 1.043, + "step": 837 + }, + { + "epoch": 0.1787542662116041, + "grad_norm": 0.48914386423718553, + "learning_rate": 4.085313833028641e-05, + "loss": 1.0845, + "step": 838 + }, + { + "epoch": 0.17896757679180889, + "grad_norm": 0.4769784067814574, + "learning_rate": 4.0901889092017064e-05, + "loss": 1.0675, + "step": 839 + }, + { + "epoch": 0.17918088737201365, + "grad_norm": 0.4468549309158688, + "learning_rate": 4.095063985374772e-05, + "loss": 1.0615, + "step": 840 + }, + { + "epoch": 0.17939419795221842, + "grad_norm": 0.5080122208601652, + "learning_rate": 4.099939061547838e-05, + "loss": 1.068, + "step": 841 + }, + { + "epoch": 0.17960750853242322, + "grad_norm": 0.5898736321413974, + "learning_rate": 4.1048141377209016e-05, + "loss": 1.0757, + "step": 842 + }, + { + "epoch": 0.17982081911262798, + "grad_norm": 0.6315918347917038, + "learning_rate": 4.109689213893967e-05, + "loss": 1.1221, + "step": 843 + }, + { + "epoch": 0.18003412969283278, + "grad_norm": 0.593351349581397, + "learning_rate": 4.114564290067033e-05, + "loss": 1.0659, + "step": 844 + }, + { + "epoch": 0.18024744027303755, + "grad_norm": 0.7315052475297371, + "learning_rate": 4.119439366240098e-05, + "loss": 1.0677, + "step": 845 + }, + { + "epoch": 0.1804607508532423, + "grad_norm": 0.8540431024638835, + "learning_rate": 4.1243144424131635e-05, + "loss": 1.0803, + "step": 846 + }, + { + "epoch": 0.1806740614334471, + "grad_norm": 0.8772272071566355, + "learning_rate": 4.129189518586228e-05, + "loss": 1.0752, + "step": 847 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 0.8618606985095055, + "learning_rate": 4.1340645947592934e-05, + "loss": 1.09, + "step": 848 + }, + { + "epoch": 0.18110068259385664, + "grad_norm": 0.9030669749242042, + "learning_rate": 4.138939670932359e-05, + "loss": 1.0632, + "step": 849 + }, + { + "epoch": 0.18131399317406144, + "grad_norm": 0.9123313965089002, + "learning_rate": 4.143814747105424e-05, + "loss": 1.0823, + "step": 850 + }, + { + "epoch": 0.1815273037542662, + "grad_norm": 0.8041063049140952, + "learning_rate": 4.148689823278489e-05, + "loss": 1.0711, + "step": 851 + }, + { + "epoch": 0.181740614334471, + "grad_norm": 0.6751792770539949, + "learning_rate": 4.153564899451554e-05, + "loss": 1.0607, + "step": 852 + }, + { + "epoch": 0.18195392491467577, + "grad_norm": 0.6342083839385113, + "learning_rate": 4.158439975624619e-05, + "loss": 1.0682, + "step": 853 + }, + { + "epoch": 0.18216723549488054, + "grad_norm": 0.7289370003613554, + "learning_rate": 4.1633150517976846e-05, + "loss": 1.0915, + "step": 854 + }, + { + "epoch": 0.18238054607508533, + "grad_norm": 0.7176318488534438, + "learning_rate": 4.16819012797075e-05, + "loss": 1.0765, + "step": 855 + }, + { + "epoch": 0.1825938566552901, + "grad_norm": 0.6007662171776146, + "learning_rate": 4.173065204143815e-05, + "loss": 1.0876, + "step": 856 + }, + { + "epoch": 0.1828071672354949, + "grad_norm": 0.4521458416443912, + "learning_rate": 4.1779402803168805e-05, + "loss": 1.065, + "step": 857 + }, + { + "epoch": 0.18302047781569966, + "grad_norm": 0.48054888860707146, + "learning_rate": 4.182815356489945e-05, + "loss": 1.0559, + "step": 858 + }, + { + "epoch": 0.18323378839590443, + "grad_norm": 0.8249935197559954, + "learning_rate": 4.1876904326630104e-05, + "loss": 1.0598, + "step": 859 + }, + { + "epoch": 0.18344709897610922, + "grad_norm": 0.9881450072294299, + "learning_rate": 4.192565508836076e-05, + "loss": 1.069, + "step": 860 + }, + { + "epoch": 0.183660409556314, + "grad_norm": 0.8595840370931584, + "learning_rate": 4.197440585009141e-05, + "loss": 1.068, + "step": 861 + }, + { + "epoch": 0.18387372013651876, + "grad_norm": 0.5822524693311415, + "learning_rate": 4.202315661182207e-05, + "loss": 1.0538, + "step": 862 + }, + { + "epoch": 0.18408703071672355, + "grad_norm": 0.4697625987691203, + "learning_rate": 4.207190737355271e-05, + "loss": 1.0819, + "step": 863 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 0.4256073818388479, + "learning_rate": 4.212065813528336e-05, + "loss": 1.0497, + "step": 864 + }, + { + "epoch": 0.18451365187713312, + "grad_norm": 0.4832480059953953, + "learning_rate": 4.216940889701402e-05, + "loss": 1.0776, + "step": 865 + }, + { + "epoch": 0.18472696245733788, + "grad_norm": 0.5751637514554114, + "learning_rate": 4.2218159658744675e-05, + "loss": 1.122, + "step": 866 + }, + { + "epoch": 0.18494027303754265, + "grad_norm": 0.5390625721293342, + "learning_rate": 4.226691042047533e-05, + "loss": 1.0812, + "step": 867 + }, + { + "epoch": 0.18515358361774745, + "grad_norm": 0.47911938181717717, + "learning_rate": 4.2315661182205974e-05, + "loss": 1.0434, + "step": 868 + }, + { + "epoch": 0.18536689419795221, + "grad_norm": 0.5557807712930758, + "learning_rate": 4.236441194393663e-05, + "loss": 1.0551, + "step": 869 + }, + { + "epoch": 0.185580204778157, + "grad_norm": 0.592837544370306, + "learning_rate": 4.241316270566728e-05, + "loss": 1.0749, + "step": 870 + }, + { + "epoch": 0.18579351535836178, + "grad_norm": 0.44916378974629917, + "learning_rate": 4.246191346739793e-05, + "loss": 1.0742, + "step": 871 + }, + { + "epoch": 0.18600682593856654, + "grad_norm": 0.47512256097345085, + "learning_rate": 4.2510664229128586e-05, + "loss": 1.0449, + "step": 872 + }, + { + "epoch": 0.18622013651877134, + "grad_norm": 0.6154441007895567, + "learning_rate": 4.255941499085923e-05, + "loss": 1.0727, + "step": 873 + }, + { + "epoch": 0.1864334470989761, + "grad_norm": 0.6227958413053061, + "learning_rate": 4.2608165752589886e-05, + "loss": 1.0421, + "step": 874 + }, + { + "epoch": 0.18664675767918087, + "grad_norm": 0.6704052687021391, + "learning_rate": 4.265691651432054e-05, + "loss": 1.0947, + "step": 875 + }, + { + "epoch": 0.18686006825938567, + "grad_norm": 0.7730795183487692, + "learning_rate": 4.270566727605119e-05, + "loss": 1.0904, + "step": 876 + }, + { + "epoch": 0.18707337883959044, + "grad_norm": 0.8829601252967686, + "learning_rate": 4.2754418037781845e-05, + "loss": 1.0737, + "step": 877 + }, + { + "epoch": 0.18728668941979523, + "grad_norm": 0.8240735033955189, + "learning_rate": 4.28031687995125e-05, + "loss": 1.0779, + "step": 878 + }, + { + "epoch": 0.1875, + "grad_norm": 0.8267546502633678, + "learning_rate": 4.2851919561243144e-05, + "loss": 1.1055, + "step": 879 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 0.7402520433833486, + "learning_rate": 4.29006703229738e-05, + "loss": 1.0397, + "step": 880 + }, + { + "epoch": 0.18792662116040956, + "grad_norm": 0.6000600916530276, + "learning_rate": 4.294942108470445e-05, + "loss": 1.0874, + "step": 881 + }, + { + "epoch": 0.18813993174061433, + "grad_norm": 0.48694502122782113, + "learning_rate": 4.29981718464351e-05, + "loss": 1.0544, + "step": 882 + }, + { + "epoch": 0.18835324232081913, + "grad_norm": 0.40275467801593196, + "learning_rate": 4.304692260816576e-05, + "loss": 1.0676, + "step": 883 + }, + { + "epoch": 0.1885665529010239, + "grad_norm": 0.45686297449749613, + "learning_rate": 4.30956733698964e-05, + "loss": 1.0889, + "step": 884 + }, + { + "epoch": 0.18877986348122866, + "grad_norm": 0.49502270496697304, + "learning_rate": 4.3144424131627055e-05, + "loss": 1.037, + "step": 885 + }, + { + "epoch": 0.18899317406143346, + "grad_norm": 0.38912550896791187, + "learning_rate": 4.3193174893357715e-05, + "loss": 1.041, + "step": 886 + }, + { + "epoch": 0.18920648464163822, + "grad_norm": 0.447672822160334, + "learning_rate": 4.324192565508837e-05, + "loss": 1.0573, + "step": 887 + }, + { + "epoch": 0.189419795221843, + "grad_norm": 0.4964182578881089, + "learning_rate": 4.329067641681902e-05, + "loss": 1.0917, + "step": 888 + }, + { + "epoch": 0.18963310580204779, + "grad_norm": 0.47709545891904337, + "learning_rate": 4.333942717854967e-05, + "loss": 1.0687, + "step": 889 + }, + { + "epoch": 0.18984641638225255, + "grad_norm": 0.40899696114905665, + "learning_rate": 4.338817794028032e-05, + "loss": 1.0742, + "step": 890 + }, + { + "epoch": 0.19005972696245735, + "grad_norm": 0.37523939847368143, + "learning_rate": 4.3436928702010973e-05, + "loss": 1.0592, + "step": 891 + }, + { + "epoch": 0.19027303754266212, + "grad_norm": 0.36531878397206263, + "learning_rate": 4.3485679463741626e-05, + "loss": 1.0497, + "step": 892 + }, + { + "epoch": 0.19048634812286688, + "grad_norm": 0.37972646902101453, + "learning_rate": 4.353443022547228e-05, + "loss": 1.0692, + "step": 893 + }, + { + "epoch": 0.19069965870307168, + "grad_norm": 0.39260054544777606, + "learning_rate": 4.3583180987202926e-05, + "loss": 1.0227, + "step": 894 + }, + { + "epoch": 0.19091296928327645, + "grad_norm": 0.5005597772271189, + "learning_rate": 4.363193174893358e-05, + "loss": 1.0322, + "step": 895 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 0.8413056291322106, + "learning_rate": 4.368068251066423e-05, + "loss": 1.0723, + "step": 896 + }, + { + "epoch": 0.191339590443686, + "grad_norm": 1.23822727936375, + "learning_rate": 4.3729433272394885e-05, + "loss": 1.0601, + "step": 897 + }, + { + "epoch": 0.19155290102389078, + "grad_norm": 0.7668693708927922, + "learning_rate": 4.377818403412554e-05, + "loss": 1.0948, + "step": 898 + }, + { + "epoch": 0.19176621160409557, + "grad_norm": 0.49612483039200267, + "learning_rate": 4.382693479585619e-05, + "loss": 1.0345, + "step": 899 + }, + { + "epoch": 0.19197952218430034, + "grad_norm": 0.4976849377701419, + "learning_rate": 4.387568555758684e-05, + "loss": 1.0593, + "step": 900 + }, + { + "epoch": 0.1921928327645051, + "grad_norm": 0.8173406634986213, + "learning_rate": 4.392443631931749e-05, + "loss": 1.0644, + "step": 901 + }, + { + "epoch": 0.1924061433447099, + "grad_norm": 1.1595647661359216, + "learning_rate": 4.397318708104814e-05, + "loss": 1.049, + "step": 902 + }, + { + "epoch": 0.19261945392491467, + "grad_norm": 0.885262838964625, + "learning_rate": 4.4021937842778796e-05, + "loss": 1.0526, + "step": 903 + }, + { + "epoch": 0.19283276450511946, + "grad_norm": 0.6107148421750689, + "learning_rate": 4.4070688604509456e-05, + "loss": 1.0657, + "step": 904 + }, + { + "epoch": 0.19304607508532423, + "grad_norm": 0.5128612354339199, + "learning_rate": 4.4119439366240095e-05, + "loss": 1.0541, + "step": 905 + }, + { + "epoch": 0.193259385665529, + "grad_norm": 0.5381860678715549, + "learning_rate": 4.416819012797075e-05, + "loss": 1.1111, + "step": 906 + }, + { + "epoch": 0.1934726962457338, + "grad_norm": 0.48558876152718317, + "learning_rate": 4.421694088970141e-05, + "loss": 1.0557, + "step": 907 + }, + { + "epoch": 0.19368600682593856, + "grad_norm": 0.4357923500276707, + "learning_rate": 4.426569165143206e-05, + "loss": 1.0814, + "step": 908 + }, + { + "epoch": 0.19389931740614336, + "grad_norm": 0.5978070070142062, + "learning_rate": 4.4314442413162714e-05, + "loss": 1.0534, + "step": 909 + }, + { + "epoch": 0.19411262798634812, + "grad_norm": 0.7924692167842007, + "learning_rate": 4.436319317489336e-05, + "loss": 1.0757, + "step": 910 + }, + { + "epoch": 0.1943259385665529, + "grad_norm": 1.0790114252805607, + "learning_rate": 4.4411943936624013e-05, + "loss": 1.0489, + "step": 911 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 1.1333065629979089, + "learning_rate": 4.4460694698354667e-05, + "loss": 1.0685, + "step": 912 + }, + { + "epoch": 0.19475255972696245, + "grad_norm": 0.7883856798859377, + "learning_rate": 4.450944546008532e-05, + "loss": 1.0693, + "step": 913 + }, + { + "epoch": 0.19496587030716722, + "grad_norm": 0.558613030360291, + "learning_rate": 4.455819622181597e-05, + "loss": 1.0687, + "step": 914 + }, + { + "epoch": 0.19517918088737202, + "grad_norm": 0.5390065627346893, + "learning_rate": 4.460694698354662e-05, + "loss": 1.0642, + "step": 915 + }, + { + "epoch": 0.19539249146757678, + "grad_norm": 0.5297330733478972, + "learning_rate": 4.465569774527727e-05, + "loss": 1.0532, + "step": 916 + }, + { + "epoch": 0.19560580204778158, + "grad_norm": 0.5604415565214139, + "learning_rate": 4.4704448507007925e-05, + "loss": 1.0231, + "step": 917 + }, + { + "epoch": 0.19581911262798635, + "grad_norm": 0.6903909519876111, + "learning_rate": 4.475319926873858e-05, + "loss": 1.0847, + "step": 918 + }, + { + "epoch": 0.19603242320819111, + "grad_norm": 0.7406981706871111, + "learning_rate": 4.480195003046923e-05, + "loss": 1.0469, + "step": 919 + }, + { + "epoch": 0.1962457337883959, + "grad_norm": 0.8024661362653956, + "learning_rate": 4.4850700792199884e-05, + "loss": 1.0581, + "step": 920 + }, + { + "epoch": 0.19645904436860068, + "grad_norm": 0.9617405895938714, + "learning_rate": 4.489945155393053e-05, + "loss": 1.0968, + "step": 921 + }, + { + "epoch": 0.19667235494880547, + "grad_norm": 1.1050540241362043, + "learning_rate": 4.494820231566118e-05, + "loss": 1.0665, + "step": 922 + }, + { + "epoch": 0.19688566552901024, + "grad_norm": 0.7472634205095369, + "learning_rate": 4.4996953077391836e-05, + "loss": 1.0902, + "step": 923 + }, + { + "epoch": 0.197098976109215, + "grad_norm": 0.46986809298047605, + "learning_rate": 4.504570383912249e-05, + "loss": 1.0686, + "step": 924 + }, + { + "epoch": 0.1973122866894198, + "grad_norm": 0.49811503083378317, + "learning_rate": 4.509445460085315e-05, + "loss": 1.056, + "step": 925 + }, + { + "epoch": 0.19752559726962457, + "grad_norm": 0.6072239672167835, + "learning_rate": 4.514320536258379e-05, + "loss": 1.0466, + "step": 926 + }, + { + "epoch": 0.19773890784982937, + "grad_norm": 0.7947753865108256, + "learning_rate": 4.519195612431444e-05, + "loss": 1.0415, + "step": 927 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 0.8047876634603726, + "learning_rate": 4.52407068860451e-05, + "loss": 1.0673, + "step": 928 + }, + { + "epoch": 0.1981655290102389, + "grad_norm": 0.7662698540439811, + "learning_rate": 4.5289457647775754e-05, + "loss": 1.0491, + "step": 929 + }, + { + "epoch": 0.1983788395904437, + "grad_norm": 0.7181060719050633, + "learning_rate": 4.533820840950641e-05, + "loss": 1.0913, + "step": 930 + }, + { + "epoch": 0.19859215017064846, + "grad_norm": 0.49651832342887176, + "learning_rate": 4.5386959171237054e-05, + "loss": 1.0775, + "step": 931 + }, + { + "epoch": 0.19880546075085323, + "grad_norm": 0.4868059983466154, + "learning_rate": 4.5435709932967707e-05, + "loss": 1.0856, + "step": 932 + }, + { + "epoch": 0.19901877133105803, + "grad_norm": 0.5530772630645586, + "learning_rate": 4.548446069469836e-05, + "loss": 1.1052, + "step": 933 + }, + { + "epoch": 0.1992320819112628, + "grad_norm": 0.6044280392520396, + "learning_rate": 4.553321145642901e-05, + "loss": 1.0908, + "step": 934 + }, + { + "epoch": 0.1994453924914676, + "grad_norm": 0.5754251329749809, + "learning_rate": 4.5581962218159666e-05, + "loss": 1.0554, + "step": 935 + }, + { + "epoch": 0.19965870307167236, + "grad_norm": 0.5343310849251703, + "learning_rate": 4.563071297989031e-05, + "loss": 1.0936, + "step": 936 + }, + { + "epoch": 0.19987201365187712, + "grad_norm": 0.5535206090728048, + "learning_rate": 4.5679463741620965e-05, + "loss": 1.0646, + "step": 937 + }, + { + "epoch": 0.20008532423208192, + "grad_norm": 0.49712688247653986, + "learning_rate": 4.572821450335162e-05, + "loss": 1.0269, + "step": 938 + }, + { + "epoch": 0.20029863481228669, + "grad_norm": 0.4774163929802181, + "learning_rate": 4.577696526508227e-05, + "loss": 1.0772, + "step": 939 + }, + { + "epoch": 0.20051194539249148, + "grad_norm": 0.4819401983452939, + "learning_rate": 4.5825716026812924e-05, + "loss": 1.0503, + "step": 940 + }, + { + "epoch": 0.20072525597269625, + "grad_norm": 0.6099356838221711, + "learning_rate": 4.587446678854358e-05, + "loss": 1.0625, + "step": 941 + }, + { + "epoch": 0.20093856655290102, + "grad_norm": 0.765611396867985, + "learning_rate": 4.592321755027422e-05, + "loss": 1.0327, + "step": 942 + }, + { + "epoch": 0.2011518771331058, + "grad_norm": 0.9422429407574135, + "learning_rate": 4.5971968312004876e-05, + "loss": 1.0489, + "step": 943 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 0.9725561102108843, + "learning_rate": 4.602071907373553e-05, + "loss": 1.0858, + "step": 944 + }, + { + "epoch": 0.20157849829351535, + "grad_norm": 0.8108036267844934, + "learning_rate": 4.606946983546618e-05, + "loss": 1.0547, + "step": 945 + }, + { + "epoch": 0.20179180887372014, + "grad_norm": 0.6090164037484078, + "learning_rate": 4.611822059719684e-05, + "loss": 1.0551, + "step": 946 + }, + { + "epoch": 0.2020051194539249, + "grad_norm": 0.5168620470566225, + "learning_rate": 4.616697135892748e-05, + "loss": 1.0657, + "step": 947 + }, + { + "epoch": 0.2022184300341297, + "grad_norm": 0.4037143537024899, + "learning_rate": 4.6215722120658135e-05, + "loss": 1.0256, + "step": 948 + }, + { + "epoch": 0.20243174061433447, + "grad_norm": 0.5790395439331943, + "learning_rate": 4.6264472882388794e-05, + "loss": 1.0696, + "step": 949 + }, + { + "epoch": 0.20264505119453924, + "grad_norm": 0.8029472879930136, + "learning_rate": 4.631322364411945e-05, + "loss": 1.065, + "step": 950 + }, + { + "epoch": 0.20285836177474403, + "grad_norm": 0.9181729934409629, + "learning_rate": 4.63619744058501e-05, + "loss": 1.0693, + "step": 951 + }, + { + "epoch": 0.2030716723549488, + "grad_norm": 0.8755872633785305, + "learning_rate": 4.641072516758075e-05, + "loss": 1.0827, + "step": 952 + }, + { + "epoch": 0.2032849829351536, + "grad_norm": 0.7759470432651319, + "learning_rate": 4.64594759293114e-05, + "loss": 1.0428, + "step": 953 + }, + { + "epoch": 0.20349829351535836, + "grad_norm": 0.7183255080816557, + "learning_rate": 4.650822669104205e-05, + "loss": 1.0738, + "step": 954 + }, + { + "epoch": 0.20371160409556313, + "grad_norm": 0.6350005216206918, + "learning_rate": 4.6556977452772706e-05, + "loss": 1.0745, + "step": 955 + }, + { + "epoch": 0.20392491467576793, + "grad_norm": 0.47318305048720605, + "learning_rate": 4.660572821450336e-05, + "loss": 1.1064, + "step": 956 + }, + { + "epoch": 0.2041382252559727, + "grad_norm": 0.3099611611747466, + "learning_rate": 4.6654478976234005e-05, + "loss": 1.0388, + "step": 957 + }, + { + "epoch": 0.20435153583617746, + "grad_norm": 0.4700976794407362, + "learning_rate": 4.670322973796466e-05, + "loss": 1.0781, + "step": 958 + }, + { + "epoch": 0.20456484641638226, + "grad_norm": 0.5789809670321887, + "learning_rate": 4.675198049969531e-05, + "loss": 1.0418, + "step": 959 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 0.604545151927564, + "learning_rate": 4.6800731261425964e-05, + "loss": 1.0485, + "step": 960 + }, + { + "epoch": 0.20499146757679182, + "grad_norm": 0.5823356773633664, + "learning_rate": 4.684948202315662e-05, + "loss": 1.0685, + "step": 961 + }, + { + "epoch": 0.2052047781569966, + "grad_norm": 0.4902103776390323, + "learning_rate": 4.689823278488727e-05, + "loss": 1.0508, + "step": 962 + }, + { + "epoch": 0.20541808873720135, + "grad_norm": 0.6403869519581531, + "learning_rate": 4.6946983546617916e-05, + "loss": 1.0588, + "step": 963 + }, + { + "epoch": 0.20563139931740615, + "grad_norm": 0.8814507599025008, + "learning_rate": 4.699573430834857e-05, + "loss": 1.0944, + "step": 964 + }, + { + "epoch": 0.20584470989761092, + "grad_norm": 0.9955608661841778, + "learning_rate": 4.704448507007922e-05, + "loss": 1.0427, + "step": 965 + }, + { + "epoch": 0.2060580204778157, + "grad_norm": 1.051591388683206, + "learning_rate": 4.7093235831809875e-05, + "loss": 1.0817, + "step": 966 + }, + { + "epoch": 0.20627133105802048, + "grad_norm": 0.7502904103730081, + "learning_rate": 4.7141986593540535e-05, + "loss": 1.0569, + "step": 967 + }, + { + "epoch": 0.20648464163822525, + "grad_norm": 0.46264819537506546, + "learning_rate": 4.7190737355271175e-05, + "loss": 1.0724, + "step": 968 + }, + { + "epoch": 0.20669795221843004, + "grad_norm": 0.494833737708367, + "learning_rate": 4.723948811700183e-05, + "loss": 1.0466, + "step": 969 + }, + { + "epoch": 0.2069112627986348, + "grad_norm": 0.6485552999813629, + "learning_rate": 4.728823887873249e-05, + "loss": 1.0959, + "step": 970 + }, + { + "epoch": 0.20712457337883958, + "grad_norm": 0.7244733451659313, + "learning_rate": 4.733698964046314e-05, + "loss": 1.0566, + "step": 971 + }, + { + "epoch": 0.20733788395904437, + "grad_norm": 0.6650311639735798, + "learning_rate": 4.7385740402193793e-05, + "loss": 1.0734, + "step": 972 + }, + { + "epoch": 0.20755119453924914, + "grad_norm": 0.5685460811555754, + "learning_rate": 4.743449116392444e-05, + "loss": 1.0761, + "step": 973 + }, + { + "epoch": 0.20776450511945393, + "grad_norm": 0.6722090555509854, + "learning_rate": 4.748324192565509e-05, + "loss": 1.0632, + "step": 974 + }, + { + "epoch": 0.2079778156996587, + "grad_norm": 0.6212228385142403, + "learning_rate": 4.7531992687385746e-05, + "loss": 1.0497, + "step": 975 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 0.5185012014670697, + "learning_rate": 4.75807434491164e-05, + "loss": 1.0403, + "step": 976 + }, + { + "epoch": 0.20840443686006827, + "grad_norm": 0.5455320125689828, + "learning_rate": 4.762949421084705e-05, + "loss": 1.0509, + "step": 977 + }, + { + "epoch": 0.20861774744027303, + "grad_norm": 0.5631909845334467, + "learning_rate": 4.76782449725777e-05, + "loss": 1.0772, + "step": 978 + }, + { + "epoch": 0.20883105802047783, + "grad_norm": 0.549706670471974, + "learning_rate": 4.772699573430835e-05, + "loss": 1.0651, + "step": 979 + }, + { + "epoch": 0.2090443686006826, + "grad_norm": 0.5118069764145919, + "learning_rate": 4.7775746496039004e-05, + "loss": 1.0537, + "step": 980 + }, + { + "epoch": 0.20925767918088736, + "grad_norm": 0.5336519064262736, + "learning_rate": 4.782449725776966e-05, + "loss": 1.057, + "step": 981 + }, + { + "epoch": 0.20947098976109216, + "grad_norm": 0.4818430117873423, + "learning_rate": 4.787324801950031e-05, + "loss": 1.0309, + "step": 982 + }, + { + "epoch": 0.20968430034129693, + "grad_norm": 0.46380021406147204, + "learning_rate": 4.792199878123096e-05, + "loss": 1.0972, + "step": 983 + }, + { + "epoch": 0.2098976109215017, + "grad_norm": 0.5660847734814146, + "learning_rate": 4.797074954296161e-05, + "loss": 1.0759, + "step": 984 + }, + { + "epoch": 0.2101109215017065, + "grad_norm": 0.534738092897025, + "learning_rate": 4.801950030469226e-05, + "loss": 1.0938, + "step": 985 + }, + { + "epoch": 0.21032423208191126, + "grad_norm": 0.4788935188702831, + "learning_rate": 4.8068251066422915e-05, + "loss": 1.0666, + "step": 986 + }, + { + "epoch": 0.21053754266211605, + "grad_norm": 0.47846941552269134, + "learning_rate": 4.811700182815357e-05, + "loss": 1.0629, + "step": 987 + }, + { + "epoch": 0.21075085324232082, + "grad_norm": 0.4666057142966185, + "learning_rate": 4.816575258988422e-05, + "loss": 1.0339, + "step": 988 + }, + { + "epoch": 0.21096416382252559, + "grad_norm": 0.43328813317052944, + "learning_rate": 4.821450335161487e-05, + "loss": 1.0628, + "step": 989 + }, + { + "epoch": 0.21117747440273038, + "grad_norm": 0.45865013464400944, + "learning_rate": 4.826325411334552e-05, + "loss": 1.0521, + "step": 990 + }, + { + "epoch": 0.21139078498293515, + "grad_norm": 0.4835110223413795, + "learning_rate": 4.831200487507618e-05, + "loss": 1.041, + "step": 991 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 0.48609756065433973, + "learning_rate": 4.8360755636806834e-05, + "loss": 1.0743, + "step": 992 + }, + { + "epoch": 0.2118174061433447, + "grad_norm": 0.5567262038310756, + "learning_rate": 4.8409506398537487e-05, + "loss": 1.0584, + "step": 993 + }, + { + "epoch": 0.21203071672354948, + "grad_norm": 0.5815592555234698, + "learning_rate": 4.845825716026813e-05, + "loss": 1.028, + "step": 994 + }, + { + "epoch": 0.21224402730375427, + "grad_norm": 0.6773748436717767, + "learning_rate": 4.8507007921998786e-05, + "loss": 1.0583, + "step": 995 + }, + { + "epoch": 0.21245733788395904, + "grad_norm": 0.8439933319437324, + "learning_rate": 4.855575868372944e-05, + "loss": 1.0309, + "step": 996 + }, + { + "epoch": 0.21267064846416384, + "grad_norm": 1.1033226190284988, + "learning_rate": 4.860450944546009e-05, + "loss": 1.0466, + "step": 997 + }, + { + "epoch": 0.2128839590443686, + "grad_norm": 1.07115348129386, + "learning_rate": 4.8653260207190745e-05, + "loss": 1.0523, + "step": 998 + }, + { + "epoch": 0.21309726962457337, + "grad_norm": 0.8927440474050443, + "learning_rate": 4.870201096892139e-05, + "loss": 1.0727, + "step": 999 + }, + { + "epoch": 0.21331058020477817, + "grad_norm": 0.8185245035911902, + "learning_rate": 4.8750761730652044e-05, + "loss": 1.0436, + "step": 1000 + }, + { + "epoch": 0.21352389078498293, + "grad_norm": 0.6122750209731327, + "learning_rate": 4.87995124923827e-05, + "loss": 1.0231, + "step": 1001 + }, + { + "epoch": 0.2137372013651877, + "grad_norm": 0.41414255690018986, + "learning_rate": 4.884826325411335e-05, + "loss": 1.0336, + "step": 1002 + }, + { + "epoch": 0.2139505119453925, + "grad_norm": 0.6016840612121785, + "learning_rate": 4.8897014015844e-05, + "loss": 1.0559, + "step": 1003 + }, + { + "epoch": 0.21416382252559726, + "grad_norm": 0.796669888352076, + "learning_rate": 4.894576477757465e-05, + "loss": 1.0383, + "step": 1004 + }, + { + "epoch": 0.21437713310580206, + "grad_norm": 0.800603375281776, + "learning_rate": 4.89945155393053e-05, + "loss": 1.0585, + "step": 1005 + }, + { + "epoch": 0.21459044368600683, + "grad_norm": 0.7954404465792347, + "learning_rate": 4.9043266301035956e-05, + "loss": 1.0409, + "step": 1006 + }, + { + "epoch": 0.2148037542662116, + "grad_norm": 0.795600094805881, + "learning_rate": 4.909201706276661e-05, + "loss": 1.0382, + "step": 1007 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 0.5163064590387043, + "learning_rate": 4.914076782449726e-05, + "loss": 1.0609, + "step": 1008 + }, + { + "epoch": 0.21523037542662116, + "grad_norm": 0.47067824181108525, + "learning_rate": 4.9189518586227915e-05, + "loss": 1.082, + "step": 1009 + }, + { + "epoch": 0.21544368600682595, + "grad_norm": 0.4893428697386786, + "learning_rate": 4.923826934795856e-05, + "loss": 1.0769, + "step": 1010 + }, + { + "epoch": 0.21565699658703072, + "grad_norm": 0.5247659365852165, + "learning_rate": 4.9287020109689214e-05, + "loss": 1.0572, + "step": 1011 + }, + { + "epoch": 0.2158703071672355, + "grad_norm": 0.4739193281878152, + "learning_rate": 4.933577087141987e-05, + "loss": 1.0717, + "step": 1012 + }, + { + "epoch": 0.21608361774744028, + "grad_norm": 0.6615140483377175, + "learning_rate": 4.938452163315053e-05, + "loss": 1.0616, + "step": 1013 + }, + { + "epoch": 0.21629692832764505, + "grad_norm": 0.8460935080442317, + "learning_rate": 4.943327239488118e-05, + "loss": 1.0436, + "step": 1014 + }, + { + "epoch": 0.21651023890784982, + "grad_norm": 0.8134754110613966, + "learning_rate": 4.9482023156611826e-05, + "loss": 1.0597, + "step": 1015 + }, + { + "epoch": 0.2167235494880546, + "grad_norm": 0.6418728608217931, + "learning_rate": 4.953077391834248e-05, + "loss": 1.0281, + "step": 1016 + }, + { + "epoch": 0.21693686006825938, + "grad_norm": 0.572987235523415, + "learning_rate": 4.957952468007313e-05, + "loss": 1.0953, + "step": 1017 + }, + { + "epoch": 0.21715017064846417, + "grad_norm": 0.5457279244060615, + "learning_rate": 4.9628275441803785e-05, + "loss": 1.0764, + "step": 1018 + }, + { + "epoch": 0.21736348122866894, + "grad_norm": 0.6113815513559963, + "learning_rate": 4.967702620353444e-05, + "loss": 1.0747, + "step": 1019 + }, + { + "epoch": 0.2175767918088737, + "grad_norm": 0.49491298804799017, + "learning_rate": 4.9725776965265084e-05, + "loss": 1.0705, + "step": 1020 + }, + { + "epoch": 0.2177901023890785, + "grad_norm": 0.591212868909889, + "learning_rate": 4.977452772699574e-05, + "loss": 1.0516, + "step": 1021 + }, + { + "epoch": 0.21800341296928327, + "grad_norm": 0.8167904239875813, + "learning_rate": 4.982327848872639e-05, + "loss": 1.0601, + "step": 1022 + }, + { + "epoch": 0.21821672354948807, + "grad_norm": 0.8154698858869693, + "learning_rate": 4.987202925045704e-05, + "loss": 1.0409, + "step": 1023 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 0.7349208764890164, + "learning_rate": 4.9920780012187696e-05, + "loss": 1.0621, + "step": 1024 + }, + { + "epoch": 0.2186433447098976, + "grad_norm": 0.7098148740681803, + "learning_rate": 4.996953077391834e-05, + "loss": 1.0538, + "step": 1025 + }, + { + "epoch": 0.2188566552901024, + "grad_norm": 0.7272055824085337, + "learning_rate": 5.0018281535648996e-05, + "loss": 1.0724, + "step": 1026 + }, + { + "epoch": 0.21906996587030717, + "grad_norm": 0.7679939477038356, + "learning_rate": 5.006703229737965e-05, + "loss": 1.056, + "step": 1027 + }, + { + "epoch": 0.21928327645051193, + "grad_norm": 0.7273321814575324, + "learning_rate": 5.01157830591103e-05, + "loss": 0.9656, + "step": 1028 + }, + { + "epoch": 0.21949658703071673, + "grad_norm": 0.7147127221873277, + "learning_rate": 5.0164533820840955e-05, + "loss": 1.0388, + "step": 1029 + }, + { + "epoch": 0.2197098976109215, + "grad_norm": 0.7123877808601359, + "learning_rate": 5.021328458257161e-05, + "loss": 1.0339, + "step": 1030 + }, + { + "epoch": 0.2199232081911263, + "grad_norm": 0.7849145068541908, + "learning_rate": 5.0262035344302254e-05, + "loss": 1.0689, + "step": 1031 + }, + { + "epoch": 0.22013651877133106, + "grad_norm": 0.9023172124620138, + "learning_rate": 5.031078610603291e-05, + "loss": 1.0409, + "step": 1032 + }, + { + "epoch": 0.22034982935153583, + "grad_norm": 1.0959074356929093, + "learning_rate": 5.035953686776356e-05, + "loss": 1.0489, + "step": 1033 + }, + { + "epoch": 0.22056313993174062, + "grad_norm": 1.0012760652062702, + "learning_rate": 5.040828762949422e-05, + "loss": 1.0605, + "step": 1034 + }, + { + "epoch": 0.2207764505119454, + "grad_norm": 0.885278500417742, + "learning_rate": 5.045703839122487e-05, + "loss": 1.0387, + "step": 1035 + }, + { + "epoch": 0.22098976109215018, + "grad_norm": 0.7817951307054585, + "learning_rate": 5.050578915295551e-05, + "loss": 1.0508, + "step": 1036 + }, + { + "epoch": 0.22120307167235495, + "grad_norm": 0.4753510460635615, + "learning_rate": 5.055453991468617e-05, + "loss": 1.0649, + "step": 1037 + }, + { + "epoch": 0.22141638225255972, + "grad_norm": 0.467213813770483, + "learning_rate": 5.0603290676416825e-05, + "loss": 1.0723, + "step": 1038 + }, + { + "epoch": 0.2216296928327645, + "grad_norm": 0.7152681158362237, + "learning_rate": 5.065204143814748e-05, + "loss": 1.0645, + "step": 1039 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 0.7359867712634943, + "learning_rate": 5.070079219987813e-05, + "loss": 1.0484, + "step": 1040 + }, + { + "epoch": 0.22205631399317405, + "grad_norm": 0.8387505822880682, + "learning_rate": 5.074954296160878e-05, + "loss": 1.0536, + "step": 1041 + }, + { + "epoch": 0.22226962457337884, + "grad_norm": 0.934535696234934, + "learning_rate": 5.079829372333943e-05, + "loss": 1.061, + "step": 1042 + }, + { + "epoch": 0.2224829351535836, + "grad_norm": 0.9889331302075479, + "learning_rate": 5.084704448507008e-05, + "loss": 1.0429, + "step": 1043 + }, + { + "epoch": 0.2226962457337884, + "grad_norm": 0.9147382931522542, + "learning_rate": 5.0895795246800736e-05, + "loss": 1.0426, + "step": 1044 + }, + { + "epoch": 0.22290955631399317, + "grad_norm": 0.7652336728100342, + "learning_rate": 5.094454600853139e-05, + "loss": 1.0745, + "step": 1045 + }, + { + "epoch": 0.22312286689419794, + "grad_norm": 0.5097468344986402, + "learning_rate": 5.0993296770262036e-05, + "loss": 1.0917, + "step": 1046 + }, + { + "epoch": 0.22333617747440274, + "grad_norm": 0.4911610869795114, + "learning_rate": 5.104204753199269e-05, + "loss": 1.0658, + "step": 1047 + }, + { + "epoch": 0.2235494880546075, + "grad_norm": 0.5125148342326558, + "learning_rate": 5.109079829372334e-05, + "loss": 1.0601, + "step": 1048 + }, + { + "epoch": 0.2237627986348123, + "grad_norm": 0.6410908458861451, + "learning_rate": 5.1139549055453995e-05, + "loss": 1.0788, + "step": 1049 + }, + { + "epoch": 0.22397610921501707, + "grad_norm": 0.8340334591840751, + "learning_rate": 5.118829981718465e-05, + "loss": 1.0512, + "step": 1050 + }, + { + "epoch": 0.22418941979522183, + "grad_norm": 0.907716400869883, + "learning_rate": 5.12370505789153e-05, + "loss": 1.0892, + "step": 1051 + }, + { + "epoch": 0.22440273037542663, + "grad_norm": 0.7370542537402005, + "learning_rate": 5.128580134064595e-05, + "loss": 1.0399, + "step": 1052 + }, + { + "epoch": 0.2246160409556314, + "grad_norm": 0.4662792039350034, + "learning_rate": 5.13345521023766e-05, + "loss": 1.0212, + "step": 1053 + }, + { + "epoch": 0.22482935153583616, + "grad_norm": 0.4601421250493139, + "learning_rate": 5.138330286410725e-05, + "loss": 1.0599, + "step": 1054 + }, + { + "epoch": 0.22504266211604096, + "grad_norm": 0.45645164998671633, + "learning_rate": 5.143205362583791e-05, + "loss": 1.0749, + "step": 1055 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 0.4383271849776051, + "learning_rate": 5.1480804387568566e-05, + "loss": 1.0548, + "step": 1056 + }, + { + "epoch": 0.22546928327645052, + "grad_norm": 0.4075135961473711, + "learning_rate": 5.1529555149299205e-05, + "loss": 1.0427, + "step": 1057 + }, + { + "epoch": 0.2256825938566553, + "grad_norm": 0.37031128584609574, + "learning_rate": 5.1578305911029865e-05, + "loss": 1.0418, + "step": 1058 + }, + { + "epoch": 0.22589590443686006, + "grad_norm": 0.4171105942767091, + "learning_rate": 5.162705667276052e-05, + "loss": 1.0476, + "step": 1059 + }, + { + "epoch": 0.22610921501706485, + "grad_norm": 0.5376063385969712, + "learning_rate": 5.167580743449117e-05, + "loss": 1.0339, + "step": 1060 + }, + { + "epoch": 0.22632252559726962, + "grad_norm": 0.6333817668058712, + "learning_rate": 5.1724558196221824e-05, + "loss": 1.036, + "step": 1061 + }, + { + "epoch": 0.22653583617747441, + "grad_norm": 0.47519559271430056, + "learning_rate": 5.177330895795247e-05, + "loss": 1.0707, + "step": 1062 + }, + { + "epoch": 0.22674914675767918, + "grad_norm": 0.4036167364685387, + "learning_rate": 5.1822059719683123e-05, + "loss": 1.0213, + "step": 1063 + }, + { + "epoch": 0.22696245733788395, + "grad_norm": 0.4389576820898569, + "learning_rate": 5.1870810481413776e-05, + "loss": 1.0758, + "step": 1064 + }, + { + "epoch": 0.22717576791808874, + "grad_norm": 0.5348411003592203, + "learning_rate": 5.191956124314443e-05, + "loss": 1.0452, + "step": 1065 + }, + { + "epoch": 0.2273890784982935, + "grad_norm": 0.5208435184341053, + "learning_rate": 5.196831200487508e-05, + "loss": 1.0233, + "step": 1066 + }, + { + "epoch": 0.2276023890784983, + "grad_norm": 0.45333161300525454, + "learning_rate": 5.201706276660573e-05, + "loss": 1.0707, + "step": 1067 + }, + { + "epoch": 0.22781569965870307, + "grad_norm": 0.48159703961254086, + "learning_rate": 5.206581352833638e-05, + "loss": 1.0371, + "step": 1068 + }, + { + "epoch": 0.22802901023890784, + "grad_norm": 0.5815448151965298, + "learning_rate": 5.2114564290067035e-05, + "loss": 1.0411, + "step": 1069 + }, + { + "epoch": 0.22824232081911264, + "grad_norm": 0.683812229526363, + "learning_rate": 5.216331505179769e-05, + "loss": 1.0256, + "step": 1070 + }, + { + "epoch": 0.2284556313993174, + "grad_norm": 0.8302500968300515, + "learning_rate": 5.221206581352834e-05, + "loss": 1.06, + "step": 1071 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 0.99243114605998, + "learning_rate": 5.2260816575258994e-05, + "loss": 1.0606, + "step": 1072 + }, + { + "epoch": 0.22888225255972697, + "grad_norm": 1.0919038309042377, + "learning_rate": 5.230956733698964e-05, + "loss": 1.0767, + "step": 1073 + }, + { + "epoch": 0.22909556313993173, + "grad_norm": 0.7879023664800889, + "learning_rate": 5.235831809872029e-05, + "loss": 1.0645, + "step": 1074 + }, + { + "epoch": 0.22930887372013653, + "grad_norm": 0.6114932971728788, + "learning_rate": 5.2407068860450946e-05, + "loss": 1.0472, + "step": 1075 + }, + { + "epoch": 0.2295221843003413, + "grad_norm": 0.6371554100056188, + "learning_rate": 5.2455819622181606e-05, + "loss": 1.0646, + "step": 1076 + }, + { + "epoch": 0.22973549488054607, + "grad_norm": 0.5556593282976192, + "learning_rate": 5.250457038391226e-05, + "loss": 1.0341, + "step": 1077 + }, + { + "epoch": 0.22994880546075086, + "grad_norm": 0.4977158865821939, + "learning_rate": 5.25533211456429e-05, + "loss": 1.0727, + "step": 1078 + }, + { + "epoch": 0.23016211604095563, + "grad_norm": 0.5367501724684788, + "learning_rate": 5.260207190737356e-05, + "loss": 1.0676, + "step": 1079 + }, + { + "epoch": 0.23037542662116042, + "grad_norm": 0.540510533793099, + "learning_rate": 5.265082266910421e-05, + "loss": 1.0968, + "step": 1080 + }, + { + "epoch": 0.2305887372013652, + "grad_norm": 0.6158948388426513, + "learning_rate": 5.2699573430834864e-05, + "loss": 1.0632, + "step": 1081 + }, + { + "epoch": 0.23080204778156996, + "grad_norm": 0.688311051893914, + "learning_rate": 5.274832419256552e-05, + "loss": 1.0421, + "step": 1082 + }, + { + "epoch": 0.23101535836177475, + "grad_norm": 0.6188972672971454, + "learning_rate": 5.2797074954296164e-05, + "loss": 1.0941, + "step": 1083 + }, + { + "epoch": 0.23122866894197952, + "grad_norm": 0.4428194909780055, + "learning_rate": 5.2845825716026817e-05, + "loss": 1.0805, + "step": 1084 + }, + { + "epoch": 0.2314419795221843, + "grad_norm": 0.37715272642619757, + "learning_rate": 5.289457647775747e-05, + "loss": 1.0539, + "step": 1085 + }, + { + "epoch": 0.23165529010238908, + "grad_norm": 0.6357792934920555, + "learning_rate": 5.294332723948812e-05, + "loss": 1.0373, + "step": 1086 + }, + { + "epoch": 0.23186860068259385, + "grad_norm": 0.724957506120238, + "learning_rate": 5.2992078001218776e-05, + "loss": 1.0474, + "step": 1087 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 0.7581397888233028, + "learning_rate": 5.304082876294942e-05, + "loss": 1.054, + "step": 1088 + }, + { + "epoch": 0.2322952218430034, + "grad_norm": 0.8970894571729343, + "learning_rate": 5.3089579524680075e-05, + "loss": 1.0251, + "step": 1089 + }, + { + "epoch": 0.23250853242320818, + "grad_norm": 1.0091321888225415, + "learning_rate": 5.313833028641073e-05, + "loss": 1.0676, + "step": 1090 + }, + { + "epoch": 0.23272184300341298, + "grad_norm": 0.9580866018586535, + "learning_rate": 5.318708104814138e-05, + "loss": 1.034, + "step": 1091 + }, + { + "epoch": 0.23293515358361774, + "grad_norm": 0.8362067243013929, + "learning_rate": 5.3235831809872034e-05, + "loss": 1.0593, + "step": 1092 + }, + { + "epoch": 0.23314846416382254, + "grad_norm": 0.6201097312760256, + "learning_rate": 5.328458257160269e-05, + "loss": 1.0655, + "step": 1093 + }, + { + "epoch": 0.2333617747440273, + "grad_norm": 0.5065577426732792, + "learning_rate": 5.333333333333333e-05, + "loss": 1.0422, + "step": 1094 + }, + { + "epoch": 0.23357508532423207, + "grad_norm": 0.598023837198703, + "learning_rate": 5.3382084095063986e-05, + "loss": 1.0539, + "step": 1095 + }, + { + "epoch": 0.23378839590443687, + "grad_norm": 0.6938454085042437, + "learning_rate": 5.343083485679464e-05, + "loss": 1.0497, + "step": 1096 + }, + { + "epoch": 0.23400170648464164, + "grad_norm": 0.7016508282587213, + "learning_rate": 5.34795856185253e-05, + "loss": 1.0772, + "step": 1097 + }, + { + "epoch": 0.2342150170648464, + "grad_norm": 0.6907178313435974, + "learning_rate": 5.352833638025595e-05, + "loss": 1.0541, + "step": 1098 + }, + { + "epoch": 0.2344283276450512, + "grad_norm": 0.7721162579401485, + "learning_rate": 5.357708714198659e-05, + "loss": 1.0933, + "step": 1099 + }, + { + "epoch": 0.23464163822525597, + "grad_norm": 0.7975465795896084, + "learning_rate": 5.362583790371725e-05, + "loss": 1.0308, + "step": 1100 + }, + { + "epoch": 0.23485494880546076, + "grad_norm": 0.8903597627361396, + "learning_rate": 5.3674588665447904e-05, + "loss": 1.0633, + "step": 1101 + }, + { + "epoch": 0.23506825938566553, + "grad_norm": 0.9196104328732793, + "learning_rate": 5.372333942717856e-05, + "loss": 1.0325, + "step": 1102 + }, + { + "epoch": 0.2352815699658703, + "grad_norm": 0.6990449390546701, + "learning_rate": 5.377209018890921e-05, + "loss": 1.0603, + "step": 1103 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 0.6675574952302671, + "learning_rate": 5.3820840950639857e-05, + "loss": 1.0671, + "step": 1104 + }, + { + "epoch": 0.23570819112627986, + "grad_norm": 0.5452452064398297, + "learning_rate": 5.386959171237051e-05, + "loss": 1.0931, + "step": 1105 + }, + { + "epoch": 0.23592150170648465, + "grad_norm": 0.4382836882791079, + "learning_rate": 5.391834247410116e-05, + "loss": 1.0438, + "step": 1106 + }, + { + "epoch": 0.23613481228668942, + "grad_norm": 0.5461907372470773, + "learning_rate": 5.3967093235831816e-05, + "loss": 1.0898, + "step": 1107 + }, + { + "epoch": 0.2363481228668942, + "grad_norm": 0.6208557465064812, + "learning_rate": 5.401584399756247e-05, + "loss": 1.0414, + "step": 1108 + }, + { + "epoch": 0.23656143344709898, + "grad_norm": 0.6342224090757516, + "learning_rate": 5.4064594759293115e-05, + "loss": 1.0482, + "step": 1109 + }, + { + "epoch": 0.23677474402730375, + "grad_norm": 0.5070795757296601, + "learning_rate": 5.411334552102377e-05, + "loss": 1.053, + "step": 1110 + }, + { + "epoch": 0.23698805460750852, + "grad_norm": 0.5252707956865701, + "learning_rate": 5.416209628275442e-05, + "loss": 1.0344, + "step": 1111 + }, + { + "epoch": 0.23720136518771331, + "grad_norm": 0.5691509012691018, + "learning_rate": 5.4210847044485074e-05, + "loss": 1.0609, + "step": 1112 + }, + { + "epoch": 0.23741467576791808, + "grad_norm": 0.522922783579417, + "learning_rate": 5.425959780621573e-05, + "loss": 1.0502, + "step": 1113 + }, + { + "epoch": 0.23762798634812288, + "grad_norm": 0.4984554773332391, + "learning_rate": 5.430834856794638e-05, + "loss": 1.0532, + "step": 1114 + }, + { + "epoch": 0.23784129692832764, + "grad_norm": 0.48884392271359844, + "learning_rate": 5.4357099329677026e-05, + "loss": 1.0403, + "step": 1115 + }, + { + "epoch": 0.2380546075085324, + "grad_norm": 0.4945648550667078, + "learning_rate": 5.440585009140768e-05, + "loss": 1.0489, + "step": 1116 + }, + { + "epoch": 0.2382679180887372, + "grad_norm": 0.41016310096722775, + "learning_rate": 5.445460085313833e-05, + "loss": 1.0542, + "step": 1117 + }, + { + "epoch": 0.23848122866894197, + "grad_norm": 0.4693077359319714, + "learning_rate": 5.450335161486899e-05, + "loss": 1.0392, + "step": 1118 + }, + { + "epoch": 0.23869453924914677, + "grad_norm": 0.46352656943455445, + "learning_rate": 5.4552102376599645e-05, + "loss": 1.0766, + "step": 1119 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 0.5318338924231967, + "learning_rate": 5.4600853138330285e-05, + "loss": 1.0827, + "step": 1120 + }, + { + "epoch": 0.2391211604095563, + "grad_norm": 0.48214133093994116, + "learning_rate": 5.4649603900060944e-05, + "loss": 1.0316, + "step": 1121 + }, + { + "epoch": 0.2393344709897611, + "grad_norm": 0.5403501903873846, + "learning_rate": 5.46983546617916e-05, + "loss": 1.0426, + "step": 1122 + }, + { + "epoch": 0.23954778156996587, + "grad_norm": 0.6534006571402129, + "learning_rate": 5.474710542352225e-05, + "loss": 1.0799, + "step": 1123 + }, + { + "epoch": 0.23976109215017063, + "grad_norm": 0.6987048347459573, + "learning_rate": 5.4795856185252903e-05, + "loss": 1.0651, + "step": 1124 + }, + { + "epoch": 0.23997440273037543, + "grad_norm": 0.7252746706190966, + "learning_rate": 5.484460694698355e-05, + "loss": 1.0454, + "step": 1125 + }, + { + "epoch": 0.2401877133105802, + "grad_norm": 0.6681746184487284, + "learning_rate": 5.48933577087142e-05, + "loss": 1.0323, + "step": 1126 + }, + { + "epoch": 0.240401023890785, + "grad_norm": 0.6760944926127886, + "learning_rate": 5.4942108470444856e-05, + "loss": 1.0699, + "step": 1127 + }, + { + "epoch": 0.24061433447098976, + "grad_norm": 0.750366938650997, + "learning_rate": 5.499085923217551e-05, + "loss": 1.055, + "step": 1128 + }, + { + "epoch": 0.24082764505119453, + "grad_norm": 0.9244360086914633, + "learning_rate": 5.503960999390616e-05, + "loss": 1.0553, + "step": 1129 + }, + { + "epoch": 0.24104095563139932, + "grad_norm": 1.1972839911635436, + "learning_rate": 5.508836075563681e-05, + "loss": 1.1069, + "step": 1130 + }, + { + "epoch": 0.2412542662116041, + "grad_norm": 0.9079922249411774, + "learning_rate": 5.513711151736746e-05, + "loss": 1.0382, + "step": 1131 + }, + { + "epoch": 0.24146757679180889, + "grad_norm": 0.7751839312436324, + "learning_rate": 5.5185862279098114e-05, + "loss": 1.0564, + "step": 1132 + }, + { + "epoch": 0.24168088737201365, + "grad_norm": 0.8161096861845998, + "learning_rate": 5.523461304082877e-05, + "loss": 1.0463, + "step": 1133 + }, + { + "epoch": 0.24189419795221842, + "grad_norm": 0.8751262699469916, + "learning_rate": 5.528336380255942e-05, + "loss": 1.0681, + "step": 1134 + }, + { + "epoch": 0.24210750853242322, + "grad_norm": 0.7396277126833674, + "learning_rate": 5.533211456429007e-05, + "loss": 1.0427, + "step": 1135 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 0.6097379794392853, + "learning_rate": 5.538086532602072e-05, + "loss": 1.0534, + "step": 1136 + }, + { + "epoch": 0.24253412969283278, + "grad_norm": 0.6494106193106673, + "learning_rate": 5.542961608775137e-05, + "loss": 1.0538, + "step": 1137 + }, + { + "epoch": 0.24274744027303755, + "grad_norm": 0.6225738376524532, + "learning_rate": 5.5478366849482025e-05, + "loss": 1.0391, + "step": 1138 + }, + { + "epoch": 0.2429607508532423, + "grad_norm": 0.4548433783477059, + "learning_rate": 5.5527117611212685e-05, + "loss": 1.0267, + "step": 1139 + }, + { + "epoch": 0.2431740614334471, + "grad_norm": 0.599331139660555, + "learning_rate": 5.557586837294334e-05, + "loss": 1.0474, + "step": 1140 + }, + { + "epoch": 0.24338737201365188, + "grad_norm": 0.6252510674440943, + "learning_rate": 5.562461913467398e-05, + "loss": 1.0339, + "step": 1141 + }, + { + "epoch": 0.24360068259385664, + "grad_norm": 0.5304768008995833, + "learning_rate": 5.567336989640464e-05, + "loss": 1.0502, + "step": 1142 + }, + { + "epoch": 0.24381399317406144, + "grad_norm": 0.5421185461875896, + "learning_rate": 5.572212065813529e-05, + "loss": 1.0469, + "step": 1143 + }, + { + "epoch": 0.2440273037542662, + "grad_norm": 0.5785110207883173, + "learning_rate": 5.5770871419865944e-05, + "loss": 1.0526, + "step": 1144 + }, + { + "epoch": 0.244240614334471, + "grad_norm": 0.6088840557827213, + "learning_rate": 5.5819622181596597e-05, + "loss": 1.0294, + "step": 1145 + }, + { + "epoch": 0.24445392491467577, + "grad_norm": 0.6873513524090311, + "learning_rate": 5.586837294332724e-05, + "loss": 1.0356, + "step": 1146 + }, + { + "epoch": 0.24466723549488054, + "grad_norm": 0.7844263184583319, + "learning_rate": 5.5917123705057896e-05, + "loss": 1.0617, + "step": 1147 + }, + { + "epoch": 0.24488054607508533, + "grad_norm": 0.710877585311789, + "learning_rate": 5.596587446678855e-05, + "loss": 1.0374, + "step": 1148 + }, + { + "epoch": 0.2450938566552901, + "grad_norm": 0.6021698973574892, + "learning_rate": 5.60146252285192e-05, + "loss": 1.03, + "step": 1149 + }, + { + "epoch": 0.2453071672354949, + "grad_norm": 0.6784372226576043, + "learning_rate": 5.6063375990249855e-05, + "loss": 1.0393, + "step": 1150 + }, + { + "epoch": 0.24552047781569966, + "grad_norm": 0.814442681265487, + "learning_rate": 5.61121267519805e-05, + "loss": 1.0601, + "step": 1151 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 0.9735783850528926, + "learning_rate": 5.6160877513711154e-05, + "loss": 1.072, + "step": 1152 + }, + { + "epoch": 0.24594709897610922, + "grad_norm": 1.1566144312612197, + "learning_rate": 5.620962827544181e-05, + "loss": 1.039, + "step": 1153 + }, + { + "epoch": 0.246160409556314, + "grad_norm": 0.7913814980839161, + "learning_rate": 5.625837903717246e-05, + "loss": 1.0585, + "step": 1154 + }, + { + "epoch": 0.24637372013651876, + "grad_norm": 0.5865936265737974, + "learning_rate": 5.630712979890311e-05, + "loss": 1.043, + "step": 1155 + }, + { + "epoch": 0.24658703071672355, + "grad_norm": 0.6164603471477024, + "learning_rate": 5.6355880560633766e-05, + "loss": 1.0481, + "step": 1156 + }, + { + "epoch": 0.24680034129692832, + "grad_norm": 0.6674286017102158, + "learning_rate": 5.640463132236441e-05, + "loss": 1.0488, + "step": 1157 + }, + { + "epoch": 0.24701365187713312, + "grad_norm": 0.706184022775059, + "learning_rate": 5.6453382084095065e-05, + "loss": 1.0253, + "step": 1158 + }, + { + "epoch": 0.24722696245733788, + "grad_norm": 0.8355259407119232, + "learning_rate": 5.650213284582572e-05, + "loss": 1.0926, + "step": 1159 + }, + { + "epoch": 0.24744027303754265, + "grad_norm": 0.9400697002731083, + "learning_rate": 5.655088360755638e-05, + "loss": 1.0543, + "step": 1160 + }, + { + "epoch": 0.24765358361774745, + "grad_norm": 0.9628248989904721, + "learning_rate": 5.659963436928703e-05, + "loss": 1.0617, + "step": 1161 + }, + { + "epoch": 0.24786689419795221, + "grad_norm": 1.0256137292677552, + "learning_rate": 5.664838513101767e-05, + "loss": 1.0228, + "step": 1162 + }, + { + "epoch": 0.248080204778157, + "grad_norm": 0.8593603673402128, + "learning_rate": 5.669713589274833e-05, + "loss": 1.058, + "step": 1163 + }, + { + "epoch": 0.24829351535836178, + "grad_norm": 0.6806021224311347, + "learning_rate": 5.6745886654478984e-05, + "loss": 1.0194, + "step": 1164 + }, + { + "epoch": 0.24850682593856654, + "grad_norm": 0.5594856085338639, + "learning_rate": 5.6794637416209637e-05, + "loss": 1.0388, + "step": 1165 + }, + { + "epoch": 0.24872013651877134, + "grad_norm": 0.5337856204868735, + "learning_rate": 5.684338817794029e-05, + "loss": 1.0578, + "step": 1166 + }, + { + "epoch": 0.2489334470989761, + "grad_norm": 0.5093522141854492, + "learning_rate": 5.6892138939670936e-05, + "loss": 1.0494, + "step": 1167 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 0.5358068572186543, + "learning_rate": 5.694088970140159e-05, + "loss": 1.0115, + "step": 1168 + }, + { + "epoch": 0.24936006825938567, + "grad_norm": 0.4857209039826245, + "learning_rate": 5.698964046313224e-05, + "loss": 1.0345, + "step": 1169 + }, + { + "epoch": 0.24957337883959044, + "grad_norm": 0.5373826837997457, + "learning_rate": 5.7038391224862895e-05, + "loss": 1.0417, + "step": 1170 + }, + { + "epoch": 0.24978668941979523, + "grad_norm": 0.6053333178189789, + "learning_rate": 5.708714198659355e-05, + "loss": 1.0547, + "step": 1171 + }, + { + "epoch": 0.25, + "grad_norm": 0.5952241694402046, + "learning_rate": 5.7135892748324194e-05, + "loss": 1.0788, + "step": 1172 + }, + { + "epoch": 0.2502133105802048, + "grad_norm": 0.5757376320823155, + "learning_rate": 5.718464351005485e-05, + "loss": 1.0379, + "step": 1173 + }, + { + "epoch": 0.25042662116040953, + "grad_norm": 0.5680266776620091, + "learning_rate": 5.72333942717855e-05, + "loss": 1.0512, + "step": 1174 + }, + { + "epoch": 0.25063993174061433, + "grad_norm": 0.5917116494651085, + "learning_rate": 5.728214503351615e-05, + "loss": 1.0744, + "step": 1175 + }, + { + "epoch": 0.2508532423208191, + "grad_norm": 0.6567108381401371, + "learning_rate": 5.7330895795246806e-05, + "loss": 1.0674, + "step": 1176 + }, + { + "epoch": 0.25106655290102387, + "grad_norm": 0.6899927298589319, + "learning_rate": 5.737964655697746e-05, + "loss": 1.0476, + "step": 1177 + }, + { + "epoch": 0.25127986348122866, + "grad_norm": 0.8424250055913315, + "learning_rate": 5.7428397318708106e-05, + "loss": 1.0753, + "step": 1178 + }, + { + "epoch": 0.25149317406143346, + "grad_norm": 0.9833090486076359, + "learning_rate": 5.747714808043876e-05, + "loss": 1.0318, + "step": 1179 + }, + { + "epoch": 0.25170648464163825, + "grad_norm": 1.4120977737861375, + "learning_rate": 5.752589884216941e-05, + "loss": 1.0593, + "step": 1180 + }, + { + "epoch": 0.251919795221843, + "grad_norm": 0.44170772447634543, + "learning_rate": 5.7574649603900065e-05, + "loss": 1.0288, + "step": 1181 + }, + { + "epoch": 0.2521331058020478, + "grad_norm": 1.1036618972300394, + "learning_rate": 5.7623400365630724e-05, + "loss": 1.0582, + "step": 1182 + }, + { + "epoch": 0.2523464163822526, + "grad_norm": 1.4735331689967526, + "learning_rate": 5.7672151127361364e-05, + "loss": 1.0637, + "step": 1183 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 0.4882377086917796, + "learning_rate": 5.7720901889092024e-05, + "loss": 1.0535, + "step": 1184 + }, + { + "epoch": 0.2527730375426621, + "grad_norm": 1.47740925870354, + "learning_rate": 5.776965265082268e-05, + "loss": 1.0583, + "step": 1185 + }, + { + "epoch": 0.2529863481228669, + "grad_norm": 0.6260549806662005, + "learning_rate": 5.781840341255333e-05, + "loss": 1.0817, + "step": 1186 + }, + { + "epoch": 0.25319965870307165, + "grad_norm": 0.9890551907332958, + "learning_rate": 5.786715417428398e-05, + "loss": 1.0779, + "step": 1187 + }, + { + "epoch": 0.25341296928327645, + "grad_norm": 1.0196331438808723, + "learning_rate": 5.791590493601463e-05, + "loss": 1.0551, + "step": 1188 + }, + { + "epoch": 0.25362627986348124, + "grad_norm": 0.9723287145769575, + "learning_rate": 5.796465569774528e-05, + "loss": 1.0281, + "step": 1189 + }, + { + "epoch": 0.253839590443686, + "grad_norm": 0.9521862214518658, + "learning_rate": 5.8013406459475935e-05, + "loss": 1.0505, + "step": 1190 + }, + { + "epoch": 0.2540529010238908, + "grad_norm": 0.7986654971607635, + "learning_rate": 5.806215722120659e-05, + "loss": 1.0525, + "step": 1191 + }, + { + "epoch": 0.25426621160409557, + "grad_norm": 0.703284651719775, + "learning_rate": 5.811090798293724e-05, + "loss": 1.0354, + "step": 1192 + }, + { + "epoch": 0.25447952218430037, + "grad_norm": 0.6578235112240771, + "learning_rate": 5.815965874466789e-05, + "loss": 1.0613, + "step": 1193 + }, + { + "epoch": 0.2546928327645051, + "grad_norm": 0.6669981439543424, + "learning_rate": 5.820840950639854e-05, + "loss": 1.046, + "step": 1194 + }, + { + "epoch": 0.2549061433447099, + "grad_norm": 0.7586188395765848, + "learning_rate": 5.825716026812919e-05, + "loss": 1.0112, + "step": 1195 + }, + { + "epoch": 0.2551194539249147, + "grad_norm": 0.5699001277761349, + "learning_rate": 5.8305911029859846e-05, + "loss": 1.0788, + "step": 1196 + }, + { + "epoch": 0.25533276450511944, + "grad_norm": 0.5301534745365786, + "learning_rate": 5.83546617915905e-05, + "loss": 1.0375, + "step": 1197 + }, + { + "epoch": 0.25554607508532423, + "grad_norm": 0.7360860777693183, + "learning_rate": 5.8403412553321146e-05, + "loss": 1.0347, + "step": 1198 + }, + { + "epoch": 0.255759385665529, + "grad_norm": 0.5431506754108519, + "learning_rate": 5.84521633150518e-05, + "loss": 1.0858, + "step": 1199 + }, + { + "epoch": 0.25597269624573377, + "grad_norm": 0.46844900755958857, + "learning_rate": 5.850091407678245e-05, + "loss": 1.0527, + "step": 1200 + }, + { + "epoch": 0.25618600682593856, + "grad_norm": 0.5099990900780584, + "learning_rate": 5.8549664838513105e-05, + "loss": 1.0553, + "step": 1201 + }, + { + "epoch": 0.25639931740614336, + "grad_norm": 0.43777288707689943, + "learning_rate": 5.859841560024376e-05, + "loss": 1.067, + "step": 1202 + }, + { + "epoch": 0.2566126279863481, + "grad_norm": 0.4742256054322187, + "learning_rate": 5.864716636197442e-05, + "loss": 1.0511, + "step": 1203 + }, + { + "epoch": 0.2568259385665529, + "grad_norm": 0.4241930468003853, + "learning_rate": 5.869591712370506e-05, + "loss": 1.0338, + "step": 1204 + }, + { + "epoch": 0.2570392491467577, + "grad_norm": 0.4737850446003086, + "learning_rate": 5.874466788543571e-05, + "loss": 1.0357, + "step": 1205 + }, + { + "epoch": 0.2572525597269625, + "grad_norm": 0.4972643142828798, + "learning_rate": 5.879341864716637e-05, + "loss": 1.0442, + "step": 1206 + }, + { + "epoch": 0.2574658703071672, + "grad_norm": 0.4166407751434157, + "learning_rate": 5.884216940889702e-05, + "loss": 1.0205, + "step": 1207 + }, + { + "epoch": 0.257679180887372, + "grad_norm": 0.4272468639649158, + "learning_rate": 5.8890920170627676e-05, + "loss": 1.0396, + "step": 1208 + }, + { + "epoch": 0.2578924914675768, + "grad_norm": 0.38225927683595523, + "learning_rate": 5.893967093235832e-05, + "loss": 1.0704, + "step": 1209 + }, + { + "epoch": 0.25810580204778155, + "grad_norm": 0.4669437113100295, + "learning_rate": 5.8988421694088975e-05, + "loss": 1.0523, + "step": 1210 + }, + { + "epoch": 0.25831911262798635, + "grad_norm": 0.48901945899959803, + "learning_rate": 5.903717245581963e-05, + "loss": 1.0322, + "step": 1211 + }, + { + "epoch": 0.25853242320819114, + "grad_norm": 0.5806337922046855, + "learning_rate": 5.908592321755028e-05, + "loss": 1.012, + "step": 1212 + }, + { + "epoch": 0.2587457337883959, + "grad_norm": 0.7048705104636575, + "learning_rate": 5.9134673979280934e-05, + "loss": 1.047, + "step": 1213 + }, + { + "epoch": 0.2589590443686007, + "grad_norm": 0.8046442865125303, + "learning_rate": 5.918342474101158e-05, + "loss": 1.0276, + "step": 1214 + }, + { + "epoch": 0.2591723549488055, + "grad_norm": 0.9826961805544577, + "learning_rate": 5.9232175502742233e-05, + "loss": 1.0267, + "step": 1215 + }, + { + "epoch": 0.2593856655290102, + "grad_norm": 1.2785722269353426, + "learning_rate": 5.9280926264472886e-05, + "loss": 1.0553, + "step": 1216 + }, + { + "epoch": 0.259598976109215, + "grad_norm": 0.627696771090669, + "learning_rate": 5.932967702620354e-05, + "loss": 1.0497, + "step": 1217 + }, + { + "epoch": 0.2598122866894198, + "grad_norm": 0.5893559660497751, + "learning_rate": 5.937842778793419e-05, + "loss": 1.0207, + "step": 1218 + }, + { + "epoch": 0.2600255972696246, + "grad_norm": 1.0267988687475078, + "learning_rate": 5.942717854966484e-05, + "loss": 1.0244, + "step": 1219 + }, + { + "epoch": 0.26023890784982934, + "grad_norm": 1.4916056054993332, + "learning_rate": 5.947592931139549e-05, + "loss": 1.0451, + "step": 1220 + }, + { + "epoch": 0.26045221843003413, + "grad_norm": 0.4784768486173323, + "learning_rate": 5.9524680073126145e-05, + "loss": 1.0524, + "step": 1221 + }, + { + "epoch": 0.2606655290102389, + "grad_norm": 1.3839424152979587, + "learning_rate": 5.95734308348568e-05, + "loss": 1.0686, + "step": 1222 + }, + { + "epoch": 0.26087883959044367, + "grad_norm": 0.7815371745132224, + "learning_rate": 5.962218159658745e-05, + "loss": 1.0375, + "step": 1223 + }, + { + "epoch": 0.26109215017064846, + "grad_norm": 0.9679319475559994, + "learning_rate": 5.967093235831811e-05, + "loss": 1.0355, + "step": 1224 + }, + { + "epoch": 0.26130546075085326, + "grad_norm": 1.115411454648815, + "learning_rate": 5.971968312004875e-05, + "loss": 1.0725, + "step": 1225 + }, + { + "epoch": 0.261518771331058, + "grad_norm": 0.6337801582085565, + "learning_rate": 5.97684338817794e-05, + "loss": 1.0306, + "step": 1226 + }, + { + "epoch": 0.2617320819112628, + "grad_norm": 0.8577540564208821, + "learning_rate": 5.981718464351006e-05, + "loss": 1.0171, + "step": 1227 + }, + { + "epoch": 0.2619453924914676, + "grad_norm": 0.9693718805630849, + "learning_rate": 5.9865935405240716e-05, + "loss": 1.0601, + "step": 1228 + }, + { + "epoch": 0.2621587030716723, + "grad_norm": 0.6695714145802127, + "learning_rate": 5.991468616697137e-05, + "loss": 1.0346, + "step": 1229 + }, + { + "epoch": 0.2623720136518771, + "grad_norm": 0.8200294055348524, + "learning_rate": 5.9963436928702015e-05, + "loss": 1.0334, + "step": 1230 + }, + { + "epoch": 0.2625853242320819, + "grad_norm": 0.951720148473004, + "learning_rate": 6.001218769043267e-05, + "loss": 1.0568, + "step": 1231 + }, + { + "epoch": 0.2627986348122867, + "grad_norm": 0.6868442023860093, + "learning_rate": 6.006093845216332e-05, + "loss": 1.0673, + "step": 1232 + }, + { + "epoch": 0.26301194539249145, + "grad_norm": 0.9352715395900931, + "learning_rate": 6.0109689213893974e-05, + "loss": 1.0581, + "step": 1233 + }, + { + "epoch": 0.26322525597269625, + "grad_norm": 0.8539257842575858, + "learning_rate": 6.015843997562463e-05, + "loss": 1.0438, + "step": 1234 + }, + { + "epoch": 0.26343856655290104, + "grad_norm": 0.5868044479310108, + "learning_rate": 6.0207190737355273e-05, + "loss": 1.0444, + "step": 1235 + }, + { + "epoch": 0.2636518771331058, + "grad_norm": 0.6502068742794157, + "learning_rate": 6.0255941499085926e-05, + "loss": 1.0333, + "step": 1236 + }, + { + "epoch": 0.2638651877133106, + "grad_norm": 0.5117105180799555, + "learning_rate": 6.030469226081658e-05, + "loss": 1.0332, + "step": 1237 + }, + { + "epoch": 0.2640784982935154, + "grad_norm": 0.665075871333597, + "learning_rate": 6.035344302254723e-05, + "loss": 1.0385, + "step": 1238 + }, + { + "epoch": 0.2642918088737201, + "grad_norm": 0.6037184292986011, + "learning_rate": 6.0402193784277886e-05, + "loss": 1.0553, + "step": 1239 + }, + { + "epoch": 0.2645051194539249, + "grad_norm": 0.6153613857663758, + "learning_rate": 6.045094454600853e-05, + "loss": 1.0433, + "step": 1240 + }, + { + "epoch": 0.2647184300341297, + "grad_norm": 0.6675626422956633, + "learning_rate": 6.0499695307739185e-05, + "loss": 1.0467, + "step": 1241 + }, + { + "epoch": 0.26493174061433444, + "grad_norm": 0.672173508292083, + "learning_rate": 6.054844606946984e-05, + "loss": 1.0665, + "step": 1242 + }, + { + "epoch": 0.26514505119453924, + "grad_norm": 0.7038973121239936, + "learning_rate": 6.059719683120049e-05, + "loss": 1.0441, + "step": 1243 + }, + { + "epoch": 0.26535836177474403, + "grad_norm": 0.6410480611645132, + "learning_rate": 6.0645947592931144e-05, + "loss": 1.0434, + "step": 1244 + }, + { + "epoch": 0.26557167235494883, + "grad_norm": 0.587814981741719, + "learning_rate": 6.0694698354661804e-05, + "loss": 1.0369, + "step": 1245 + }, + { + "epoch": 0.26578498293515357, + "grad_norm": 0.5589815149047108, + "learning_rate": 6.074344911639244e-05, + "loss": 1.0036, + "step": 1246 + }, + { + "epoch": 0.26599829351535836, + "grad_norm": 0.6064997728553089, + "learning_rate": 6.0792199878123096e-05, + "loss": 1.0573, + "step": 1247 + }, + { + "epoch": 0.26621160409556316, + "grad_norm": 0.5372203831080162, + "learning_rate": 6.0840950639853756e-05, + "loss": 1.055, + "step": 1248 + }, + { + "epoch": 0.2664249146757679, + "grad_norm": 0.5125434830284478, + "learning_rate": 6.088970140158441e-05, + "loss": 1.049, + "step": 1249 + }, + { + "epoch": 0.2666382252559727, + "grad_norm": 0.5389412634685228, + "learning_rate": 6.093845216331506e-05, + "loss": 1.0499, + "step": 1250 + }, + { + "epoch": 0.2668515358361775, + "grad_norm": 0.46746848855169504, + "learning_rate": 6.098720292504571e-05, + "loss": 1.038, + "step": 1251 + }, + { + "epoch": 0.26706484641638223, + "grad_norm": 0.5398221025431527, + "learning_rate": 6.103595368677636e-05, + "loss": 1.0386, + "step": 1252 + }, + { + "epoch": 0.267278156996587, + "grad_norm": 0.6138651076492879, + "learning_rate": 6.108470444850701e-05, + "loss": 1.0371, + "step": 1253 + }, + { + "epoch": 0.2674914675767918, + "grad_norm": 0.6695821411277055, + "learning_rate": 6.113345521023767e-05, + "loss": 1.0489, + "step": 1254 + }, + { + "epoch": 0.2677047781569966, + "grad_norm": 0.7008282548821197, + "learning_rate": 6.118220597196832e-05, + "loss": 1.0786, + "step": 1255 + }, + { + "epoch": 0.26791808873720135, + "grad_norm": 0.5951712448940383, + "learning_rate": 6.123095673369896e-05, + "loss": 1.0458, + "step": 1256 + }, + { + "epoch": 0.26813139931740615, + "grad_norm": 0.4842622128925028, + "learning_rate": 6.127970749542961e-05, + "loss": 1.0165, + "step": 1257 + }, + { + "epoch": 0.26834470989761094, + "grad_norm": 0.4339732458730256, + "learning_rate": 6.132845825716028e-05, + "loss": 1.016, + "step": 1258 + }, + { + "epoch": 0.2685580204778157, + "grad_norm": 0.5289769737904507, + "learning_rate": 6.137720901889093e-05, + "loss": 1.0617, + "step": 1259 + }, + { + "epoch": 0.2687713310580205, + "grad_norm": 0.5375714022901263, + "learning_rate": 6.142595978062159e-05, + "loss": 1.032, + "step": 1260 + }, + { + "epoch": 0.2689846416382253, + "grad_norm": 0.5136291935798024, + "learning_rate": 6.147471054235222e-05, + "loss": 1.0353, + "step": 1261 + }, + { + "epoch": 0.26919795221843, + "grad_norm": 0.47676606978255315, + "learning_rate": 6.152346130408288e-05, + "loss": 1.0392, + "step": 1262 + }, + { + "epoch": 0.2694112627986348, + "grad_norm": 0.48815423785745027, + "learning_rate": 6.157221206581353e-05, + "loss": 1.0559, + "step": 1263 + }, + { + "epoch": 0.2696245733788396, + "grad_norm": 0.4883828511306935, + "learning_rate": 6.162096282754418e-05, + "loss": 1.0428, + "step": 1264 + }, + { + "epoch": 0.26983788395904434, + "grad_norm": 0.4138996554324494, + "learning_rate": 6.166971358927484e-05, + "loss": 1.052, + "step": 1265 + }, + { + "epoch": 0.27005119453924914, + "grad_norm": 0.421532175930684, + "learning_rate": 6.171846435100549e-05, + "loss": 1.0191, + "step": 1266 + }, + { + "epoch": 0.27026450511945393, + "grad_norm": 0.40674769468100525, + "learning_rate": 6.176721511273614e-05, + "loss": 1.0581, + "step": 1267 + }, + { + "epoch": 0.27047781569965873, + "grad_norm": 0.4460316915603834, + "learning_rate": 6.18159658744668e-05, + "loss": 1.054, + "step": 1268 + }, + { + "epoch": 0.27069112627986347, + "grad_norm": 0.8111630194077004, + "learning_rate": 6.186471663619745e-05, + "loss": 1.0655, + "step": 1269 + }, + { + "epoch": 0.27090443686006827, + "grad_norm": 0.600850036269361, + "learning_rate": 6.19134673979281e-05, + "loss": 1.0286, + "step": 1270 + }, + { + "epoch": 0.27111774744027306, + "grad_norm": 0.7531023969566332, + "learning_rate": 6.196221815965876e-05, + "loss": 1.0316, + "step": 1271 + }, + { + "epoch": 0.2713310580204778, + "grad_norm": 0.9836937352672878, + "learning_rate": 6.20109689213894e-05, + "loss": 1.0317, + "step": 1272 + }, + { + "epoch": 0.2715443686006826, + "grad_norm": 1.1537496337568185, + "learning_rate": 6.205971968312005e-05, + "loss": 1.0524, + "step": 1273 + }, + { + "epoch": 0.2717576791808874, + "grad_norm": 0.8120581193350541, + "learning_rate": 6.21084704448507e-05, + "loss": 1.0663, + "step": 1274 + }, + { + "epoch": 0.27197098976109213, + "grad_norm": 0.6214843500974171, + "learning_rate": 6.215722120658135e-05, + "loss": 1.0336, + "step": 1275 + }, + { + "epoch": 0.2721843003412969, + "grad_norm": 0.6804095795212842, + "learning_rate": 6.2205971968312e-05, + "loss": 1.0446, + "step": 1276 + }, + { + "epoch": 0.2723976109215017, + "grad_norm": 0.7439519499792383, + "learning_rate": 6.225472273004266e-05, + "loss": 1.0407, + "step": 1277 + }, + { + "epoch": 0.27261092150170646, + "grad_norm": 0.7970302149706586, + "learning_rate": 6.230347349177331e-05, + "loss": 1.0412, + "step": 1278 + }, + { + "epoch": 0.27282423208191126, + "grad_norm": 0.9384998954165485, + "learning_rate": 6.235222425350397e-05, + "loss": 1.0518, + "step": 1279 + }, + { + "epoch": 0.27303754266211605, + "grad_norm": 1.1539862246119996, + "learning_rate": 6.240097501523462e-05, + "loss": 1.0581, + "step": 1280 + }, + { + "epoch": 0.27325085324232085, + "grad_norm": 0.8254736502862753, + "learning_rate": 6.244972577696527e-05, + "loss": 1.043, + "step": 1281 + }, + { + "epoch": 0.2734641638225256, + "grad_norm": 0.7079968098761233, + "learning_rate": 6.249847653869592e-05, + "loss": 1.0361, + "step": 1282 + }, + { + "epoch": 0.2736774744027304, + "grad_norm": 0.696690240144381, + "learning_rate": 6.254722730042658e-05, + "loss": 1.0453, + "step": 1283 + }, + { + "epoch": 0.2738907849829352, + "grad_norm": 0.7368032829538118, + "learning_rate": 6.259597806215723e-05, + "loss": 1.0636, + "step": 1284 + }, + { + "epoch": 0.2741040955631399, + "grad_norm": 0.7469738183687169, + "learning_rate": 6.264472882388788e-05, + "loss": 1.0282, + "step": 1285 + }, + { + "epoch": 0.2743174061433447, + "grad_norm": 0.776399300083127, + "learning_rate": 6.269347958561854e-05, + "loss": 1.0244, + "step": 1286 + }, + { + "epoch": 0.2745307167235495, + "grad_norm": 0.9177161971657437, + "learning_rate": 6.274223034734919e-05, + "loss": 1.0776, + "step": 1287 + }, + { + "epoch": 0.27474402730375425, + "grad_norm": 0.9352566167257084, + "learning_rate": 6.279098110907983e-05, + "loss": 1.0464, + "step": 1288 + }, + { + "epoch": 0.27495733788395904, + "grad_norm": 0.9176563855861891, + "learning_rate": 6.283973187081048e-05, + "loss": 1.0323, + "step": 1289 + }, + { + "epoch": 0.27517064846416384, + "grad_norm": 0.9532023891128935, + "learning_rate": 6.288848263254114e-05, + "loss": 1.0124, + "step": 1290 + }, + { + "epoch": 0.2753839590443686, + "grad_norm": 0.970615313940892, + "learning_rate": 6.293723339427179e-05, + "loss": 1.0466, + "step": 1291 + }, + { + "epoch": 0.27559726962457337, + "grad_norm": 0.8782726545228058, + "learning_rate": 6.298598415600244e-05, + "loss": 1.06, + "step": 1292 + }, + { + "epoch": 0.27581058020477817, + "grad_norm": 0.6439638622154874, + "learning_rate": 6.30347349177331e-05, + "loss": 1.0575, + "step": 1293 + }, + { + "epoch": 0.27602389078498296, + "grad_norm": 0.4511118364723981, + "learning_rate": 6.308348567946375e-05, + "loss": 1.0335, + "step": 1294 + }, + { + "epoch": 0.2762372013651877, + "grad_norm": 0.46453888187585174, + "learning_rate": 6.31322364411944e-05, + "loss": 1.017, + "step": 1295 + }, + { + "epoch": 0.2764505119453925, + "grad_norm": 0.596161647606558, + "learning_rate": 6.318098720292505e-05, + "loss": 1.0235, + "step": 1296 + }, + { + "epoch": 0.2766638225255973, + "grad_norm": 0.7341708708303987, + "learning_rate": 6.32297379646557e-05, + "loss": 1.029, + "step": 1297 + }, + { + "epoch": 0.27687713310580203, + "grad_norm": 0.769640975139376, + "learning_rate": 6.327848872638635e-05, + "loss": 1.0487, + "step": 1298 + }, + { + "epoch": 0.2770904436860068, + "grad_norm": 0.675623027507121, + "learning_rate": 6.3327239488117e-05, + "loss": 1.0304, + "step": 1299 + }, + { + "epoch": 0.2773037542662116, + "grad_norm": 0.5742843671388832, + "learning_rate": 6.337599024984765e-05, + "loss": 1.0455, + "step": 1300 + }, + { + "epoch": 0.27751706484641636, + "grad_norm": 0.47026768515237805, + "learning_rate": 6.342474101157832e-05, + "loss": 1.0419, + "step": 1301 + }, + { + "epoch": 0.27773037542662116, + "grad_norm": 0.5011977464385041, + "learning_rate": 6.347349177330897e-05, + "loss": 1.0693, + "step": 1302 + }, + { + "epoch": 0.27794368600682595, + "grad_norm": 0.5540033220598275, + "learning_rate": 6.352224253503961e-05, + "loss": 1.0284, + "step": 1303 + }, + { + "epoch": 0.2781569965870307, + "grad_norm": 0.49475930803314766, + "learning_rate": 6.357099329677026e-05, + "loss": 1.0017, + "step": 1304 + }, + { + "epoch": 0.2783703071672355, + "grad_norm": 0.5073932896203509, + "learning_rate": 6.361974405850092e-05, + "loss": 1.0376, + "step": 1305 + }, + { + "epoch": 0.2785836177474403, + "grad_norm": 0.5202795916993047, + "learning_rate": 6.366849482023157e-05, + "loss": 1.0775, + "step": 1306 + }, + { + "epoch": 0.2787969283276451, + "grad_norm": 0.5172295167890112, + "learning_rate": 6.371724558196222e-05, + "loss": 1.02, + "step": 1307 + }, + { + "epoch": 0.2790102389078498, + "grad_norm": 0.49595750964890684, + "learning_rate": 6.376599634369288e-05, + "loss": 1.0449, + "step": 1308 + }, + { + "epoch": 0.2792235494880546, + "grad_norm": 0.5600635411047357, + "learning_rate": 6.381474710542353e-05, + "loss": 1.0546, + "step": 1309 + }, + { + "epoch": 0.2794368600682594, + "grad_norm": 0.5975313127185705, + "learning_rate": 6.386349786715418e-05, + "loss": 1.0073, + "step": 1310 + }, + { + "epoch": 0.27965017064846415, + "grad_norm": 0.6600967559863131, + "learning_rate": 6.391224862888484e-05, + "loss": 1.0512, + "step": 1311 + }, + { + "epoch": 0.27986348122866894, + "grad_norm": 1.3781420233816837, + "learning_rate": 6.396099939061549e-05, + "loss": 1.0741, + "step": 1312 + }, + { + "epoch": 0.28007679180887374, + "grad_norm": 0.3463337189263882, + "learning_rate": 6.400975015234614e-05, + "loss": 1.0544, + "step": 1313 + }, + { + "epoch": 0.2802901023890785, + "grad_norm": 0.7613932005260794, + "learning_rate": 6.405850091407678e-05, + "loss": 1.0568, + "step": 1314 + }, + { + "epoch": 0.2805034129692833, + "grad_norm": 1.1992517291628455, + "learning_rate": 6.410725167580743e-05, + "loss": 1.0416, + "step": 1315 + }, + { + "epoch": 0.28071672354948807, + "grad_norm": 0.9476951120204923, + "learning_rate": 6.415600243753809e-05, + "loss": 1.0585, + "step": 1316 + }, + { + "epoch": 0.2809300341296928, + "grad_norm": 0.7491556439279401, + "learning_rate": 6.420475319926874e-05, + "loss": 1.0334, + "step": 1317 + }, + { + "epoch": 0.2811433447098976, + "grad_norm": 0.5513129334905749, + "learning_rate": 6.425350396099939e-05, + "loss": 1.0306, + "step": 1318 + }, + { + "epoch": 0.2813566552901024, + "grad_norm": 0.5429045597202701, + "learning_rate": 6.430225472273005e-05, + "loss": 1.0632, + "step": 1319 + }, + { + "epoch": 0.2815699658703072, + "grad_norm": 0.6866422844106291, + "learning_rate": 6.43510054844607e-05, + "loss": 1.0161, + "step": 1320 + }, + { + "epoch": 0.28178327645051193, + "grad_norm": 0.665844540681092, + "learning_rate": 6.439975624619135e-05, + "loss": 1.032, + "step": 1321 + }, + { + "epoch": 0.2819965870307167, + "grad_norm": 0.5547842494139446, + "learning_rate": 6.4448507007922e-05, + "loss": 1.0606, + "step": 1322 + }, + { + "epoch": 0.2822098976109215, + "grad_norm": 0.6147043263888013, + "learning_rate": 6.449725776965266e-05, + "loss": 1.0761, + "step": 1323 + }, + { + "epoch": 0.28242320819112626, + "grad_norm": 0.6580150047551335, + "learning_rate": 6.45460085313833e-05, + "loss": 1.0451, + "step": 1324 + }, + { + "epoch": 0.28263651877133106, + "grad_norm": 0.8654310340637665, + "learning_rate": 6.459475929311396e-05, + "loss": 1.0143, + "step": 1325 + }, + { + "epoch": 0.28284982935153585, + "grad_norm": 1.0711944790060959, + "learning_rate": 6.464351005484462e-05, + "loss": 1.0317, + "step": 1326 + }, + { + "epoch": 0.2830631399317406, + "grad_norm": 0.9965000196740763, + "learning_rate": 6.469226081657527e-05, + "loss": 1.0429, + "step": 1327 + }, + { + "epoch": 0.2832764505119454, + "grad_norm": 1.0929693972030197, + "learning_rate": 6.474101157830592e-05, + "loss": 1.0314, + "step": 1328 + }, + { + "epoch": 0.2834897610921502, + "grad_norm": 0.7757950154450399, + "learning_rate": 6.478976234003658e-05, + "loss": 1.0228, + "step": 1329 + }, + { + "epoch": 0.2837030716723549, + "grad_norm": 0.6719126099860675, + "learning_rate": 6.483851310176722e-05, + "loss": 1.0704, + "step": 1330 + }, + { + "epoch": 0.2839163822525597, + "grad_norm": 0.7305066881369775, + "learning_rate": 6.488726386349787e-05, + "loss": 1.0441, + "step": 1331 + }, + { + "epoch": 0.2841296928327645, + "grad_norm": 0.7844276279808442, + "learning_rate": 6.493601462522852e-05, + "loss": 1.0246, + "step": 1332 + }, + { + "epoch": 0.2843430034129693, + "grad_norm": 0.7248467679369401, + "learning_rate": 6.498476538695917e-05, + "loss": 1.039, + "step": 1333 + }, + { + "epoch": 0.28455631399317405, + "grad_norm": 0.8959540016320416, + "learning_rate": 6.503351614868983e-05, + "loss": 1.0373, + "step": 1334 + }, + { + "epoch": 0.28476962457337884, + "grad_norm": 1.0100016544004489, + "learning_rate": 6.508226691042048e-05, + "loss": 1.0549, + "step": 1335 + }, + { + "epoch": 0.28498293515358364, + "grad_norm": 0.9268306845246487, + "learning_rate": 6.513101767215113e-05, + "loss": 1.0203, + "step": 1336 + }, + { + "epoch": 0.2851962457337884, + "grad_norm": 0.8220593433188014, + "learning_rate": 6.517976843388179e-05, + "loss": 1.0309, + "step": 1337 + }, + { + "epoch": 0.2854095563139932, + "grad_norm": 0.6322568728924339, + "learning_rate": 6.522851919561244e-05, + "loss": 1.0289, + "step": 1338 + }, + { + "epoch": 0.28562286689419797, + "grad_norm": 0.44772150724563964, + "learning_rate": 6.527726995734309e-05, + "loss": 1.0632, + "step": 1339 + }, + { + "epoch": 0.2858361774744027, + "grad_norm": 0.35300902725328015, + "learning_rate": 6.532602071907373e-05, + "loss": 1.0283, + "step": 1340 + }, + { + "epoch": 0.2860494880546075, + "grad_norm": 0.43886100756705726, + "learning_rate": 6.537477148080439e-05, + "loss": 0.9984, + "step": 1341 + }, + { + "epoch": 0.2862627986348123, + "grad_norm": 0.6620728721915764, + "learning_rate": 6.542352224253504e-05, + "loss": 1.0049, + "step": 1342 + }, + { + "epoch": 0.28647610921501704, + "grad_norm": 0.9898844498269147, + "learning_rate": 6.54722730042657e-05, + "loss": 1.033, + "step": 1343 + }, + { + "epoch": 0.28668941979522183, + "grad_norm": 1.2898261024993287, + "learning_rate": 6.552102376599636e-05, + "loss": 1.0193, + "step": 1344 + }, + { + "epoch": 0.28690273037542663, + "grad_norm": 0.5013301871321155, + "learning_rate": 6.5569774527727e-05, + "loss": 1.0159, + "step": 1345 + }, + { + "epoch": 0.2871160409556314, + "grad_norm": 0.6023666887264528, + "learning_rate": 6.561852528945765e-05, + "loss": 1.0811, + "step": 1346 + }, + { + "epoch": 0.28732935153583616, + "grad_norm": 1.0305815581829012, + "learning_rate": 6.56672760511883e-05, + "loss": 1.0554, + "step": 1347 + }, + { + "epoch": 0.28754266211604096, + "grad_norm": 1.2514776691361853, + "learning_rate": 6.571602681291896e-05, + "loss": 0.994, + "step": 1348 + }, + { + "epoch": 0.28775597269624575, + "grad_norm": 0.5135289672994179, + "learning_rate": 6.576477757464961e-05, + "loss": 1.0628, + "step": 1349 + }, + { + "epoch": 0.2879692832764505, + "grad_norm": 0.6768152105926759, + "learning_rate": 6.581352833638026e-05, + "loss": 1.034, + "step": 1350 + }, + { + "epoch": 0.2881825938566553, + "grad_norm": 1.1364000550355007, + "learning_rate": 6.586227909811092e-05, + "loss": 1.0446, + "step": 1351 + }, + { + "epoch": 0.2883959044368601, + "grad_norm": 1.059972882867335, + "learning_rate": 6.591102985984157e-05, + "loss": 1.0409, + "step": 1352 + }, + { + "epoch": 0.2886092150170648, + "grad_norm": 0.8244693172461286, + "learning_rate": 6.595978062157222e-05, + "loss": 1.028, + "step": 1353 + }, + { + "epoch": 0.2888225255972696, + "grad_norm": 0.6667209443900916, + "learning_rate": 6.600853138330287e-05, + "loss": 1.0524, + "step": 1354 + }, + { + "epoch": 0.2890358361774744, + "grad_norm": 0.5251515298884984, + "learning_rate": 6.605728214503353e-05, + "loss": 1.0271, + "step": 1355 + }, + { + "epoch": 0.28924914675767915, + "grad_norm": 0.6255032773254369, + "learning_rate": 6.610603290676417e-05, + "loss": 1.0191, + "step": 1356 + }, + { + "epoch": 0.28946245733788395, + "grad_norm": 0.674362135301057, + "learning_rate": 6.615478366849482e-05, + "loss": 1.0456, + "step": 1357 + }, + { + "epoch": 0.28967576791808874, + "grad_norm": 0.6666258630446996, + "learning_rate": 6.620353443022547e-05, + "loss": 1.0197, + "step": 1358 + }, + { + "epoch": 0.28988907849829354, + "grad_norm": 0.6553983372789806, + "learning_rate": 6.625228519195613e-05, + "loss": 1.0569, + "step": 1359 + }, + { + "epoch": 0.2901023890784983, + "grad_norm": 0.6901180219319268, + "learning_rate": 6.630103595368678e-05, + "loss": 1.0671, + "step": 1360 + }, + { + "epoch": 0.2903156996587031, + "grad_norm": 0.7410376492979474, + "learning_rate": 6.634978671541743e-05, + "loss": 1.0643, + "step": 1361 + }, + { + "epoch": 0.29052901023890787, + "grad_norm": 0.748911641930415, + "learning_rate": 6.639853747714809e-05, + "loss": 1.0795, + "step": 1362 + }, + { + "epoch": 0.2907423208191126, + "grad_norm": 0.7763229094116589, + "learning_rate": 6.644728823887874e-05, + "loss": 1.0213, + "step": 1363 + }, + { + "epoch": 0.2909556313993174, + "grad_norm": 0.8329668047530597, + "learning_rate": 6.649603900060939e-05, + "loss": 1.0077, + "step": 1364 + }, + { + "epoch": 0.2911689419795222, + "grad_norm": 0.6551373443873982, + "learning_rate": 6.654478976234004e-05, + "loss": 1.064, + "step": 1365 + }, + { + "epoch": 0.29138225255972694, + "grad_norm": 0.375527093922395, + "learning_rate": 6.659354052407068e-05, + "loss": 1.0535, + "step": 1366 + }, + { + "epoch": 0.29159556313993173, + "grad_norm": 0.46907158912488445, + "learning_rate": 6.664229128580135e-05, + "loss": 1.0318, + "step": 1367 + }, + { + "epoch": 0.29180887372013653, + "grad_norm": 0.596164156231066, + "learning_rate": 6.6691042047532e-05, + "loss": 1.0266, + "step": 1368 + }, + { + "epoch": 0.29202218430034127, + "grad_norm": 0.6521495713046515, + "learning_rate": 6.673979280926266e-05, + "loss": 1.035, + "step": 1369 + }, + { + "epoch": 0.29223549488054607, + "grad_norm": 0.7611054863065692, + "learning_rate": 6.678854357099331e-05, + "loss": 1.0354, + "step": 1370 + }, + { + "epoch": 0.29244880546075086, + "grad_norm": 1.5634234183789026, + "learning_rate": 6.683729433272395e-05, + "loss": 1.0815, + "step": 1371 + }, + { + "epoch": 0.29266211604095566, + "grad_norm": 0.519062185986469, + "learning_rate": 6.68860450944546e-05, + "loss": 1.0383, + "step": 1372 + }, + { + "epoch": 0.2928754266211604, + "grad_norm": 1.285005452526197, + "learning_rate": 6.693479585618525e-05, + "loss": 1.0207, + "step": 1373 + }, + { + "epoch": 0.2930887372013652, + "grad_norm": 0.8769259602324998, + "learning_rate": 6.698354661791591e-05, + "loss": 1.0386, + "step": 1374 + }, + { + "epoch": 0.29330204778157, + "grad_norm": 0.6279867949035989, + "learning_rate": 6.703229737964656e-05, + "loss": 1.0558, + "step": 1375 + }, + { + "epoch": 0.2935153583617747, + "grad_norm": 0.8989557712066479, + "learning_rate": 6.708104814137721e-05, + "loss": 1.0604, + "step": 1376 + }, + { + "epoch": 0.2937286689419795, + "grad_norm": 1.3755413024044543, + "learning_rate": 6.712979890310787e-05, + "loss": 1.0536, + "step": 1377 + }, + { + "epoch": 0.2939419795221843, + "grad_norm": 1.0364795738778956, + "learning_rate": 6.717854966483852e-05, + "loss": 1.0549, + "step": 1378 + }, + { + "epoch": 0.29415529010238906, + "grad_norm": 1.0344188760839759, + "learning_rate": 6.722730042656917e-05, + "loss": 1.0355, + "step": 1379 + }, + { + "epoch": 0.29436860068259385, + "grad_norm": 1.316081177058334, + "learning_rate": 6.727605118829983e-05, + "loss": 1.0686, + "step": 1380 + }, + { + "epoch": 0.29458191126279865, + "grad_norm": 0.4777076080848344, + "learning_rate": 6.732480195003048e-05, + "loss": 1.0544, + "step": 1381 + }, + { + "epoch": 0.2947952218430034, + "grad_norm": 1.078623343915374, + "learning_rate": 6.737355271176112e-05, + "loss": 1.0493, + "step": 1382 + }, + { + "epoch": 0.2950085324232082, + "grad_norm": 1.114284499262646, + "learning_rate": 6.742230347349177e-05, + "loss": 1.0397, + "step": 1383 + }, + { + "epoch": 0.295221843003413, + "grad_norm": 0.6127857044524944, + "learning_rate": 6.747105423522242e-05, + "loss": 1.0541, + "step": 1384 + }, + { + "epoch": 0.29543515358361777, + "grad_norm": 0.7311757348879293, + "learning_rate": 6.751980499695309e-05, + "loss": 1.0444, + "step": 1385 + }, + { + "epoch": 0.2956484641638225, + "grad_norm": 0.7328049893132681, + "learning_rate": 6.756855575868374e-05, + "loss": 1.0687, + "step": 1386 + }, + { + "epoch": 0.2958617747440273, + "grad_norm": 0.6649968995853109, + "learning_rate": 6.761730652041438e-05, + "loss": 1.0806, + "step": 1387 + }, + { + "epoch": 0.2960750853242321, + "grad_norm": 0.735180262928212, + "learning_rate": 6.766605728214504e-05, + "loss": 1.0201, + "step": 1388 + }, + { + "epoch": 0.29628839590443684, + "grad_norm": 0.7308080952038458, + "learning_rate": 6.771480804387569e-05, + "loss": 1.0471, + "step": 1389 + }, + { + "epoch": 0.29650170648464164, + "grad_norm": 0.8975759783888005, + "learning_rate": 6.776355880560634e-05, + "loss": 1.0778, + "step": 1390 + }, + { + "epoch": 0.29671501706484643, + "grad_norm": 0.7324101686194606, + "learning_rate": 6.7812309567337e-05, + "loss": 1.0274, + "step": 1391 + }, + { + "epoch": 0.29692832764505117, + "grad_norm": 0.7696687675053225, + "learning_rate": 6.786106032906765e-05, + "loss": 1.0192, + "step": 1392 + }, + { + "epoch": 0.29714163822525597, + "grad_norm": 0.7036986913011856, + "learning_rate": 6.79098110907983e-05, + "loss": 1.0295, + "step": 1393 + }, + { + "epoch": 0.29735494880546076, + "grad_norm": 0.6410643990254566, + "learning_rate": 6.795856185252895e-05, + "loss": 1.0326, + "step": 1394 + }, + { + "epoch": 0.29756825938566556, + "grad_norm": 0.6493846808913335, + "learning_rate": 6.800731261425961e-05, + "loss": 1.034, + "step": 1395 + }, + { + "epoch": 0.2977815699658703, + "grad_norm": 0.506130865964612, + "learning_rate": 6.805606337599026e-05, + "loss": 1.0179, + "step": 1396 + }, + { + "epoch": 0.2979948805460751, + "grad_norm": 0.4951562424875798, + "learning_rate": 6.810481413772091e-05, + "loss": 1.0599, + "step": 1397 + }, + { + "epoch": 0.2982081911262799, + "grad_norm": 0.46687249186392143, + "learning_rate": 6.815356489945155e-05, + "loss": 1.0331, + "step": 1398 + }, + { + "epoch": 0.2984215017064846, + "grad_norm": 0.5638204028984396, + "learning_rate": 6.82023156611822e-05, + "loss": 1.0547, + "step": 1399 + }, + { + "epoch": 0.2986348122866894, + "grad_norm": 0.6975745855597791, + "learning_rate": 6.825106642291286e-05, + "loss": 1.0309, + "step": 1400 + }, + { + "epoch": 0.2988481228668942, + "grad_norm": 0.7785008414879905, + "learning_rate": 6.829981718464351e-05, + "loss": 1.0341, + "step": 1401 + }, + { + "epoch": 0.29906143344709896, + "grad_norm": 0.8033705322567881, + "learning_rate": 6.834856794637417e-05, + "loss": 1.0222, + "step": 1402 + }, + { + "epoch": 0.29927474402730375, + "grad_norm": 0.7241248061843436, + "learning_rate": 6.839731870810482e-05, + "loss": 1.0373, + "step": 1403 + }, + { + "epoch": 0.29948805460750855, + "grad_norm": 0.5339356453987664, + "learning_rate": 6.844606946983547e-05, + "loss": 1.0392, + "step": 1404 + }, + { + "epoch": 0.2997013651877133, + "grad_norm": 0.6243096337725146, + "learning_rate": 6.849482023156612e-05, + "loss": 1.0407, + "step": 1405 + }, + { + "epoch": 0.2999146757679181, + "grad_norm": 0.5992760127272379, + "learning_rate": 6.854357099329678e-05, + "loss": 1.0325, + "step": 1406 + }, + { + "epoch": 0.3001279863481229, + "grad_norm": 0.5621428159821218, + "learning_rate": 6.859232175502743e-05, + "loss": 1.0538, + "step": 1407 + }, + { + "epoch": 0.3003412969283277, + "grad_norm": 0.749280169321798, + "learning_rate": 6.864107251675807e-05, + "loss": 1.0609, + "step": 1408 + }, + { + "epoch": 0.3005546075085324, + "grad_norm": 0.8099893777862441, + "learning_rate": 6.868982327848874e-05, + "loss": 1.0358, + "step": 1409 + }, + { + "epoch": 0.3007679180887372, + "grad_norm": 0.999690438687684, + "learning_rate": 6.873857404021939e-05, + "loss": 1.0398, + "step": 1410 + }, + { + "epoch": 0.300981228668942, + "grad_norm": 1.2124370327784766, + "learning_rate": 6.878732480195004e-05, + "loss": 1.042, + "step": 1411 + }, + { + "epoch": 0.30119453924914674, + "grad_norm": 0.5296966691285583, + "learning_rate": 6.88360755636807e-05, + "loss": 1.0066, + "step": 1412 + }, + { + "epoch": 0.30140784982935154, + "grad_norm": 0.46544566289375433, + "learning_rate": 6.888482632541133e-05, + "loss": 1.074, + "step": 1413 + }, + { + "epoch": 0.30162116040955633, + "grad_norm": 0.9220558243207901, + "learning_rate": 6.893357708714199e-05, + "loss": 1.0372, + "step": 1414 + }, + { + "epoch": 0.3018344709897611, + "grad_norm": 1.138070819814446, + "learning_rate": 6.898232784887264e-05, + "loss": 1.0384, + "step": 1415 + }, + { + "epoch": 0.30204778156996587, + "grad_norm": 0.6948915210358274, + "learning_rate": 6.90310786106033e-05, + "loss": 1.0904, + "step": 1416 + }, + { + "epoch": 0.30226109215017066, + "grad_norm": 0.5097948866453883, + "learning_rate": 6.907982937233395e-05, + "loss": 1.018, + "step": 1417 + }, + { + "epoch": 0.3024744027303754, + "grad_norm": 0.67170028760169, + "learning_rate": 6.91285801340646e-05, + "loss": 1.0343, + "step": 1418 + }, + { + "epoch": 0.3026877133105802, + "grad_norm": 0.8073234152547543, + "learning_rate": 6.917733089579525e-05, + "loss": 1.0437, + "step": 1419 + }, + { + "epoch": 0.302901023890785, + "grad_norm": 0.925362969287005, + "learning_rate": 6.92260816575259e-05, + "loss": 1.0751, + "step": 1420 + }, + { + "epoch": 0.3031143344709898, + "grad_norm": 1.0424393413876163, + "learning_rate": 6.927483241925656e-05, + "loss": 1.0673, + "step": 1421 + }, + { + "epoch": 0.3033276450511945, + "grad_norm": 0.8482241202083396, + "learning_rate": 6.932358318098721e-05, + "loss": 1.0248, + "step": 1422 + }, + { + "epoch": 0.3035409556313993, + "grad_norm": 0.6915757382853395, + "learning_rate": 6.937233394271787e-05, + "loss": 1.0494, + "step": 1423 + }, + { + "epoch": 0.3037542662116041, + "grad_norm": 0.7321993881918418, + "learning_rate": 6.94210847044485e-05, + "loss": 1.0606, + "step": 1424 + }, + { + "epoch": 0.30396757679180886, + "grad_norm": 0.8053537966564596, + "learning_rate": 6.946983546617916e-05, + "loss": 1.0471, + "step": 1425 + }, + { + "epoch": 0.30418088737201365, + "grad_norm": 0.772114998030813, + "learning_rate": 6.951858622790981e-05, + "loss": 1.0464, + "step": 1426 + }, + { + "epoch": 0.30439419795221845, + "grad_norm": 0.6130354113279851, + "learning_rate": 6.956733698964048e-05, + "loss": 1.0338, + "step": 1427 + }, + { + "epoch": 0.3046075085324232, + "grad_norm": 0.5349246689564657, + "learning_rate": 6.961608775137113e-05, + "loss": 1.0328, + "step": 1428 + }, + { + "epoch": 0.304820819112628, + "grad_norm": 0.47709069172430757, + "learning_rate": 6.966483851310177e-05, + "loss": 1.0453, + "step": 1429 + }, + { + "epoch": 0.3050341296928328, + "grad_norm": 0.518376573039617, + "learning_rate": 6.971358927483242e-05, + "loss": 1.0389, + "step": 1430 + }, + { + "epoch": 0.3052474402730375, + "grad_norm": 0.47498943701080754, + "learning_rate": 6.976234003656308e-05, + "loss": 1.0322, + "step": 1431 + }, + { + "epoch": 0.3054607508532423, + "grad_norm": 0.40391766486780756, + "learning_rate": 6.981109079829373e-05, + "loss": 1.0462, + "step": 1432 + }, + { + "epoch": 0.3056740614334471, + "grad_norm": 0.4530465753363124, + "learning_rate": 6.985984156002438e-05, + "loss": 1.045, + "step": 1433 + }, + { + "epoch": 0.3058873720136519, + "grad_norm": 0.46075605083021015, + "learning_rate": 6.990859232175503e-05, + "loss": 1.0319, + "step": 1434 + }, + { + "epoch": 0.30610068259385664, + "grad_norm": 0.4127128330206883, + "learning_rate": 6.995734308348569e-05, + "loss": 1.0417, + "step": 1435 + }, + { + "epoch": 0.30631399317406144, + "grad_norm": 0.49875512453984433, + "learning_rate": 7.000609384521634e-05, + "loss": 1.0687, + "step": 1436 + }, + { + "epoch": 0.30652730375426623, + "grad_norm": 0.4376577923342832, + "learning_rate": 7.0054844606947e-05, + "loss": 1.0659, + "step": 1437 + }, + { + "epoch": 0.306740614334471, + "grad_norm": 0.36940429055724405, + "learning_rate": 7.010359536867765e-05, + "loss": 1.0503, + "step": 1438 + }, + { + "epoch": 0.30695392491467577, + "grad_norm": 0.411307179953468, + "learning_rate": 7.01523461304083e-05, + "loss": 1.0181, + "step": 1439 + }, + { + "epoch": 0.30716723549488056, + "grad_norm": 0.4106970639539348, + "learning_rate": 7.020109689213894e-05, + "loss": 1.0486, + "step": 1440 + }, + { + "epoch": 0.3073805460750853, + "grad_norm": 0.3931783518656895, + "learning_rate": 7.024984765386959e-05, + "loss": 1.0424, + "step": 1441 + }, + { + "epoch": 0.3075938566552901, + "grad_norm": 0.44163013247823707, + "learning_rate": 7.029859841560025e-05, + "loss": 1.0392, + "step": 1442 + }, + { + "epoch": 0.3078071672354949, + "grad_norm": 0.4458693196941231, + "learning_rate": 7.03473491773309e-05, + "loss": 1.0515, + "step": 1443 + }, + { + "epoch": 0.30802047781569963, + "grad_norm": 0.4604472452332526, + "learning_rate": 7.039609993906155e-05, + "loss": 1.0388, + "step": 1444 + }, + { + "epoch": 0.30823378839590443, + "grad_norm": 0.4767025695992418, + "learning_rate": 7.04448507007922e-05, + "loss": 1.0431, + "step": 1445 + }, + { + "epoch": 0.3084470989761092, + "grad_norm": 0.5575417218194121, + "learning_rate": 7.049360146252286e-05, + "loss": 1.0581, + "step": 1446 + }, + { + "epoch": 0.308660409556314, + "grad_norm": 0.6005991531394687, + "learning_rate": 7.054235222425351e-05, + "loss": 1.0242, + "step": 1447 + }, + { + "epoch": 0.30887372013651876, + "grad_norm": 0.6278435757346211, + "learning_rate": 7.059110298598416e-05, + "loss": 1.0377, + "step": 1448 + }, + { + "epoch": 0.30908703071672355, + "grad_norm": 0.740432430996603, + "learning_rate": 7.063985374771482e-05, + "loss": 1.0105, + "step": 1449 + }, + { + "epoch": 0.30930034129692835, + "grad_norm": 0.8936722948644586, + "learning_rate": 7.068860450944546e-05, + "loss": 1.0334, + "step": 1450 + }, + { + "epoch": 0.3095136518771331, + "grad_norm": 0.9971908590915286, + "learning_rate": 7.073735527117612e-05, + "loss": 1.0281, + "step": 1451 + }, + { + "epoch": 0.3097269624573379, + "grad_norm": 1.1244429757225223, + "learning_rate": 7.078610603290678e-05, + "loss": 1.0548, + "step": 1452 + }, + { + "epoch": 0.3099402730375427, + "grad_norm": 0.7373294435217881, + "learning_rate": 7.083485679463743e-05, + "loss": 1.0253, + "step": 1453 + }, + { + "epoch": 0.3101535836177474, + "grad_norm": 0.43831320856793354, + "learning_rate": 7.088360755636808e-05, + "loss": 1.0088, + "step": 1454 + }, + { + "epoch": 0.3103668941979522, + "grad_norm": 0.6893601831082988, + "learning_rate": 7.093235831809872e-05, + "loss": 1.0336, + "step": 1455 + }, + { + "epoch": 0.310580204778157, + "grad_norm": 0.8351646746817547, + "learning_rate": 7.098110907982937e-05, + "loss": 1.0682, + "step": 1456 + }, + { + "epoch": 0.31079351535836175, + "grad_norm": 0.701647342452367, + "learning_rate": 7.102985984156003e-05, + "loss": 1.0265, + "step": 1457 + }, + { + "epoch": 0.31100682593856654, + "grad_norm": 0.5093882497707838, + "learning_rate": 7.107861060329068e-05, + "loss": 1.027, + "step": 1458 + }, + { + "epoch": 0.31122013651877134, + "grad_norm": 0.43580044197567785, + "learning_rate": 7.112736136502133e-05, + "loss": 1.028, + "step": 1459 + }, + { + "epoch": 0.31143344709897613, + "grad_norm": 0.46508954246584244, + "learning_rate": 7.117611212675199e-05, + "loss": 1.0456, + "step": 1460 + }, + { + "epoch": 0.3116467576791809, + "grad_norm": 0.49247984197339456, + "learning_rate": 7.122486288848264e-05, + "loss": 1.0172, + "step": 1461 + }, + { + "epoch": 0.31186006825938567, + "grad_norm": 1.723894359990332, + "learning_rate": 7.127361365021329e-05, + "loss": 1.025, + "step": 1462 + }, + { + "epoch": 0.31207337883959047, + "grad_norm": 0.4673036018254151, + "learning_rate": 7.132236441194395e-05, + "loss": 1.0466, + "step": 1463 + }, + { + "epoch": 0.3122866894197952, + "grad_norm": 0.8850236907144928, + "learning_rate": 7.13711151736746e-05, + "loss": 1.022, + "step": 1464 + }, + { + "epoch": 0.3125, + "grad_norm": 0.9202009194441586, + "learning_rate": 7.141986593540525e-05, + "loss": 1.0457, + "step": 1465 + }, + { + "epoch": 0.3127133105802048, + "grad_norm": 0.9947366377184924, + "learning_rate": 7.146861669713589e-05, + "loss": 1.0584, + "step": 1466 + }, + { + "epoch": 0.31292662116040953, + "grad_norm": 1.159344123893893, + "learning_rate": 7.151736745886654e-05, + "loss": 1.0476, + "step": 1467 + }, + { + "epoch": 0.31313993174061433, + "grad_norm": 0.8096443498201393, + "learning_rate": 7.15661182205972e-05, + "loss": 1.0537, + "step": 1468 + }, + { + "epoch": 0.3133532423208191, + "grad_norm": 0.8027624554342971, + "learning_rate": 7.161486898232785e-05, + "loss": 1.0105, + "step": 1469 + }, + { + "epoch": 0.31356655290102387, + "grad_norm": 0.7620567102212153, + "learning_rate": 7.166361974405852e-05, + "loss": 1.0278, + "step": 1470 + }, + { + "epoch": 0.31377986348122866, + "grad_norm": 0.8126969169284063, + "learning_rate": 7.171237050578916e-05, + "loss": 0.9945, + "step": 1471 + }, + { + "epoch": 0.31399317406143346, + "grad_norm": 0.8062211845623755, + "learning_rate": 7.176112126751981e-05, + "loss": 1.0366, + "step": 1472 + }, + { + "epoch": 0.31420648464163825, + "grad_norm": 0.5952583228927807, + "learning_rate": 7.180987202925046e-05, + "loss": 1.0502, + "step": 1473 + }, + { + "epoch": 0.314419795221843, + "grad_norm": 0.6175697714344687, + "learning_rate": 7.185862279098111e-05, + "loss": 1.05, + "step": 1474 + }, + { + "epoch": 0.3146331058020478, + "grad_norm": 0.6394743623884015, + "learning_rate": 7.190737355271177e-05, + "loss": 1.0223, + "step": 1475 + }, + { + "epoch": 0.3148464163822526, + "grad_norm": 0.5931148012596758, + "learning_rate": 7.195612431444242e-05, + "loss": 1.0546, + "step": 1476 + }, + { + "epoch": 0.3150597269624573, + "grad_norm": 0.5638721700538937, + "learning_rate": 7.200487507617307e-05, + "loss": 1.022, + "step": 1477 + }, + { + "epoch": 0.3152730375426621, + "grad_norm": 0.37239102041316585, + "learning_rate": 7.205362583790373e-05, + "loss": 1.0038, + "step": 1478 + }, + { + "epoch": 0.3154863481228669, + "grad_norm": 0.4582267431806509, + "learning_rate": 7.210237659963438e-05, + "loss": 1.0162, + "step": 1479 + }, + { + "epoch": 0.31569965870307165, + "grad_norm": 0.5843761640181626, + "learning_rate": 7.215112736136503e-05, + "loss": 1.0581, + "step": 1480 + }, + { + "epoch": 0.31591296928327645, + "grad_norm": 0.568066497880411, + "learning_rate": 7.219987812309569e-05, + "loss": 1.0297, + "step": 1481 + }, + { + "epoch": 0.31612627986348124, + "grad_norm": 0.610791526278668, + "learning_rate": 7.224862888482633e-05, + "loss": 1.0381, + "step": 1482 + }, + { + "epoch": 0.316339590443686, + "grad_norm": 0.587434255834548, + "learning_rate": 7.229737964655698e-05, + "loss": 0.9969, + "step": 1483 + }, + { + "epoch": 0.3165529010238908, + "grad_norm": 0.5274707518854461, + "learning_rate": 7.234613040828763e-05, + "loss": 1.0891, + "step": 1484 + }, + { + "epoch": 0.31676621160409557, + "grad_norm": 0.5118509203014614, + "learning_rate": 7.239488117001828e-05, + "loss": 1.0557, + "step": 1485 + }, + { + "epoch": 0.31697952218430037, + "grad_norm": 0.4118308445428142, + "learning_rate": 7.244363193174894e-05, + "loss": 1.0296, + "step": 1486 + }, + { + "epoch": 0.3171928327645051, + "grad_norm": 0.43541847703222586, + "learning_rate": 7.249238269347959e-05, + "loss": 1.0253, + "step": 1487 + }, + { + "epoch": 0.3174061433447099, + "grad_norm": 0.5086081475007906, + "learning_rate": 7.254113345521024e-05, + "loss": 1.0428, + "step": 1488 + }, + { + "epoch": 0.3176194539249147, + "grad_norm": 0.7099101988819808, + "learning_rate": 7.25898842169409e-05, + "loss": 1.012, + "step": 1489 + }, + { + "epoch": 0.31783276450511944, + "grad_norm": 0.9070697359222807, + "learning_rate": 7.263863497867155e-05, + "loss": 1.0102, + "step": 1490 + }, + { + "epoch": 0.31804607508532423, + "grad_norm": 1.1740308474420347, + "learning_rate": 7.26873857404022e-05, + "loss": 0.9924, + "step": 1491 + }, + { + "epoch": 0.318259385665529, + "grad_norm": 0.952668194153907, + "learning_rate": 7.273613650213284e-05, + "loss": 1.0419, + "step": 1492 + }, + { + "epoch": 0.31847269624573377, + "grad_norm": 0.7863378567156561, + "learning_rate": 7.27848872638635e-05, + "loss": 1.0262, + "step": 1493 + }, + { + "epoch": 0.31868600682593856, + "grad_norm": 0.5755301263963731, + "learning_rate": 7.283363802559416e-05, + "loss": 1.0411, + "step": 1494 + }, + { + "epoch": 0.31889931740614336, + "grad_norm": 0.4408887698744087, + "learning_rate": 7.288238878732481e-05, + "loss": 1.022, + "step": 1495 + }, + { + "epoch": 0.3191126279863481, + "grad_norm": 0.41326834750458025, + "learning_rate": 7.293113954905547e-05, + "loss": 1.0213, + "step": 1496 + }, + { + "epoch": 0.3193259385665529, + "grad_norm": 0.4441704907144489, + "learning_rate": 7.297989031078611e-05, + "loss": 1.0391, + "step": 1497 + }, + { + "epoch": 0.3195392491467577, + "grad_norm": 0.5371024468203053, + "learning_rate": 7.302864107251676e-05, + "loss": 1.0159, + "step": 1498 + }, + { + "epoch": 0.3197525597269625, + "grad_norm": 0.5833636748351281, + "learning_rate": 7.307739183424741e-05, + "loss": 1.0504, + "step": 1499 + }, + { + "epoch": 0.3199658703071672, + "grad_norm": 0.6426064068762993, + "learning_rate": 7.312614259597807e-05, + "loss": 1.0324, + "step": 1500 + }, + { + "epoch": 0.320179180887372, + "grad_norm": 0.6958906743563039, + "learning_rate": 7.317489335770872e-05, + "loss": 1.0446, + "step": 1501 + }, + { + "epoch": 0.3203924914675768, + "grad_norm": 0.7353045590654693, + "learning_rate": 7.322364411943937e-05, + "loss": 1.0419, + "step": 1502 + }, + { + "epoch": 0.32060580204778155, + "grad_norm": 0.8142597381831455, + "learning_rate": 7.327239488117003e-05, + "loss": 1.0606, + "step": 1503 + }, + { + "epoch": 0.32081911262798635, + "grad_norm": 0.9370190633006552, + "learning_rate": 7.332114564290068e-05, + "loss": 1.0291, + "step": 1504 + }, + { + "epoch": 0.32103242320819114, + "grad_norm": 1.1719272083025403, + "learning_rate": 7.336989640463133e-05, + "loss": 1.039, + "step": 1505 + }, + { + "epoch": 0.3212457337883959, + "grad_norm": 0.7644034386901433, + "learning_rate": 7.341864716636198e-05, + "loss": 1.028, + "step": 1506 + }, + { + "epoch": 0.3214590443686007, + "grad_norm": 0.5853696459604228, + "learning_rate": 7.346739792809264e-05, + "loss": 1.0193, + "step": 1507 + }, + { + "epoch": 0.3216723549488055, + "grad_norm": 0.62649935913896, + "learning_rate": 7.351614868982328e-05, + "loss": 1.0548, + "step": 1508 + }, + { + "epoch": 0.3218856655290102, + "grad_norm": 0.7111108329861667, + "learning_rate": 7.356489945155393e-05, + "loss": 1.0411, + "step": 1509 + }, + { + "epoch": 0.322098976109215, + "grad_norm": 0.8506477668079246, + "learning_rate": 7.361365021328458e-05, + "loss": 1.0301, + "step": 1510 + }, + { + "epoch": 0.3223122866894198, + "grad_norm": 0.9309963392651661, + "learning_rate": 7.366240097501524e-05, + "loss": 1.0644, + "step": 1511 + }, + { + "epoch": 0.3225255972696246, + "grad_norm": 1.0629522601012447, + "learning_rate": 7.37111517367459e-05, + "loss": 1.0173, + "step": 1512 + }, + { + "epoch": 0.32273890784982934, + "grad_norm": 0.8618192936784371, + "learning_rate": 7.375990249847654e-05, + "loss": 1.0149, + "step": 1513 + }, + { + "epoch": 0.32295221843003413, + "grad_norm": 0.7043657671838657, + "learning_rate": 7.38086532602072e-05, + "loss": 1.0403, + "step": 1514 + }, + { + "epoch": 0.3231655290102389, + "grad_norm": 0.8215491292774668, + "learning_rate": 7.385740402193785e-05, + "loss": 1.0075, + "step": 1515 + }, + { + "epoch": 0.32337883959044367, + "grad_norm": 0.9218160858403165, + "learning_rate": 7.39061547836685e-05, + "loss": 0.9967, + "step": 1516 + }, + { + "epoch": 0.32359215017064846, + "grad_norm": 0.8558665051709303, + "learning_rate": 7.395490554539915e-05, + "loss": 1.044, + "step": 1517 + }, + { + "epoch": 0.32380546075085326, + "grad_norm": 0.7521878105530466, + "learning_rate": 7.400365630712981e-05, + "loss": 1.0153, + "step": 1518 + }, + { + "epoch": 0.324018771331058, + "grad_norm": 0.7720478245603992, + "learning_rate": 7.405240706886046e-05, + "loss": 1.0249, + "step": 1519 + }, + { + "epoch": 0.3242320819112628, + "grad_norm": 0.712429125982731, + "learning_rate": 7.410115783059111e-05, + "loss": 1.014, + "step": 1520 + }, + { + "epoch": 0.3244453924914676, + "grad_norm": 0.5935881104685745, + "learning_rate": 7.414990859232177e-05, + "loss": 1.0066, + "step": 1521 + }, + { + "epoch": 0.3246587030716723, + "grad_norm": 0.5842598453090031, + "learning_rate": 7.419865935405242e-05, + "loss": 1.0372, + "step": 1522 + }, + { + "epoch": 0.3248720136518771, + "grad_norm": 0.5979690674164679, + "learning_rate": 7.424741011578307e-05, + "loss": 1.0565, + "step": 1523 + }, + { + "epoch": 0.3250853242320819, + "grad_norm": 0.5543144400705153, + "learning_rate": 7.429616087751371e-05, + "loss": 1.0513, + "step": 1524 + }, + { + "epoch": 0.3252986348122867, + "grad_norm": 0.43843211435912816, + "learning_rate": 7.434491163924436e-05, + "loss": 1.0177, + "step": 1525 + }, + { + "epoch": 0.32551194539249145, + "grad_norm": 0.49877266388878544, + "learning_rate": 7.439366240097502e-05, + "loss": 1.0413, + "step": 1526 + }, + { + "epoch": 0.32572525597269625, + "grad_norm": 0.5383538361328503, + "learning_rate": 7.444241316270567e-05, + "loss": 1.0128, + "step": 1527 + }, + { + "epoch": 0.32593856655290104, + "grad_norm": 0.5326135143053079, + "learning_rate": 7.449116392443632e-05, + "loss": 1.0408, + "step": 1528 + }, + { + "epoch": 0.3261518771331058, + "grad_norm": 0.5038094942528476, + "learning_rate": 7.453991468616698e-05, + "loss": 1.0741, + "step": 1529 + }, + { + "epoch": 0.3263651877133106, + "grad_norm": 0.3830532197911247, + "learning_rate": 7.458866544789763e-05, + "loss": 1.0412, + "step": 1530 + }, + { + "epoch": 0.3265784982935154, + "grad_norm": 0.6114853685592379, + "learning_rate": 7.463741620962828e-05, + "loss": 1.0181, + "step": 1531 + }, + { + "epoch": 0.3267918088737201, + "grad_norm": 0.8255559529506483, + "learning_rate": 7.468616697135894e-05, + "loss": 1.0135, + "step": 1532 + }, + { + "epoch": 0.3270051194539249, + "grad_norm": 0.9160269480923019, + "learning_rate": 7.473491773308959e-05, + "loss": 1.0007, + "step": 1533 + }, + { + "epoch": 0.3272184300341297, + "grad_norm": 0.9967996748180051, + "learning_rate": 7.478366849482023e-05, + "loss": 1.0572, + "step": 1534 + }, + { + "epoch": 0.32743174061433444, + "grad_norm": 1.0353552169010347, + "learning_rate": 7.483241925655088e-05, + "loss": 1.0345, + "step": 1535 + }, + { + "epoch": 0.32764505119453924, + "grad_norm": 1.0132728560669697, + "learning_rate": 7.488117001828155e-05, + "loss": 1.033, + "step": 1536 + }, + { + "epoch": 0.32785836177474403, + "grad_norm": 1.0161854994279031, + "learning_rate": 7.49299207800122e-05, + "loss": 1.0361, + "step": 1537 + }, + { + "epoch": 0.32807167235494883, + "grad_norm": 0.9348921456309088, + "learning_rate": 7.497867154174285e-05, + "loss": 1.0296, + "step": 1538 + }, + { + "epoch": 0.32828498293515357, + "grad_norm": 0.9614829883221251, + "learning_rate": 7.50274223034735e-05, + "loss": 1.0603, + "step": 1539 + }, + { + "epoch": 0.32849829351535836, + "grad_norm": 1.1159873870411943, + "learning_rate": 7.507617306520415e-05, + "loss": 1.0468, + "step": 1540 + }, + { + "epoch": 0.32871160409556316, + "grad_norm": 0.7702299962849729, + "learning_rate": 7.51249238269348e-05, + "loss": 1.07, + "step": 1541 + }, + { + "epoch": 0.3289249146757679, + "grad_norm": 0.464340143692387, + "learning_rate": 7.517367458866545e-05, + "loss": 1.0453, + "step": 1542 + }, + { + "epoch": 0.3291382252559727, + "grad_norm": 0.5670365623115615, + "learning_rate": 7.52224253503961e-05, + "loss": 1.0435, + "step": 1543 + }, + { + "epoch": 0.3293515358361775, + "grad_norm": 0.7735181584094084, + "learning_rate": 7.527117611212676e-05, + "loss": 1.0722, + "step": 1544 + }, + { + "epoch": 0.32956484641638223, + "grad_norm": 0.8007927804308789, + "learning_rate": 7.531992687385741e-05, + "loss": 1.0363, + "step": 1545 + }, + { + "epoch": 0.329778156996587, + "grad_norm": 0.7098697676664012, + "learning_rate": 7.536867763558806e-05, + "loss": 1.0437, + "step": 1546 + }, + { + "epoch": 0.3299914675767918, + "grad_norm": 0.7356109102789883, + "learning_rate": 7.541742839731872e-05, + "loss": 1.0157, + "step": 1547 + }, + { + "epoch": 0.3302047781569966, + "grad_norm": 0.6462349507674193, + "learning_rate": 7.546617915904937e-05, + "loss": 1.0688, + "step": 1548 + }, + { + "epoch": 0.33041808873720135, + "grad_norm": 0.5539483211436083, + "learning_rate": 7.551492992078002e-05, + "loss": 1.0205, + "step": 1549 + }, + { + "epoch": 0.33063139931740615, + "grad_norm": 0.5735446152909967, + "learning_rate": 7.556368068251066e-05, + "loss": 1.0182, + "step": 1550 + }, + { + "epoch": 0.33084470989761094, + "grad_norm": 0.7951431064338743, + "learning_rate": 7.561243144424132e-05, + "loss": 1.0158, + "step": 1551 + }, + { + "epoch": 0.3310580204778157, + "grad_norm": 1.0121579548646011, + "learning_rate": 7.566118220597197e-05, + "loss": 1.0434, + "step": 1552 + }, + { + "epoch": 0.3312713310580205, + "grad_norm": 1.0892021157469773, + "learning_rate": 7.570993296770262e-05, + "loss": 1.0324, + "step": 1553 + }, + { + "epoch": 0.3314846416382253, + "grad_norm": 0.8515388502914333, + "learning_rate": 7.575868372943329e-05, + "loss": 1.0353, + "step": 1554 + }, + { + "epoch": 0.33169795221843, + "grad_norm": 0.6122903154001446, + "learning_rate": 7.580743449116393e-05, + "loss": 1.0386, + "step": 1555 + }, + { + "epoch": 0.3319112627986348, + "grad_norm": 0.524407684877061, + "learning_rate": 7.585618525289458e-05, + "loss": 1.0422, + "step": 1556 + }, + { + "epoch": 0.3321245733788396, + "grad_norm": 0.4913867249785508, + "learning_rate": 7.590493601462523e-05, + "loss": 1.023, + "step": 1557 + }, + { + "epoch": 0.33233788395904434, + "grad_norm": 0.5778291650112763, + "learning_rate": 7.595368677635589e-05, + "loss": 1.0263, + "step": 1558 + }, + { + "epoch": 0.33255119453924914, + "grad_norm": 0.7580013733654722, + "learning_rate": 7.600243753808654e-05, + "loss": 1.002, + "step": 1559 + }, + { + "epoch": 0.33276450511945393, + "grad_norm": 0.8111568555446307, + "learning_rate": 7.60511882998172e-05, + "loss": 1.03, + "step": 1560 + }, + { + "epoch": 0.33297781569965873, + "grad_norm": 0.8236406671454986, + "learning_rate": 7.609993906154785e-05, + "loss": 1.0285, + "step": 1561 + }, + { + "epoch": 0.33319112627986347, + "grad_norm": 1.0848294892536863, + "learning_rate": 7.61486898232785e-05, + "loss": 1.0401, + "step": 1562 + }, + { + "epoch": 0.33340443686006827, + "grad_norm": 1.0613121183151448, + "learning_rate": 7.619744058500915e-05, + "loss": 1.0205, + "step": 1563 + }, + { + "epoch": 0.33361774744027306, + "grad_norm": 0.6447264957780497, + "learning_rate": 7.62461913467398e-05, + "loss": 1.0343, + "step": 1564 + }, + { + "epoch": 0.3338310580204778, + "grad_norm": 0.45349536716938316, + "learning_rate": 7.629494210847044e-05, + "loss": 1.0435, + "step": 1565 + }, + { + "epoch": 0.3340443686006826, + "grad_norm": 0.45249451654525175, + "learning_rate": 7.63436928702011e-05, + "loss": 1.0339, + "step": 1566 + }, + { + "epoch": 0.3342576791808874, + "grad_norm": 0.4055543913761508, + "learning_rate": 7.639244363193175e-05, + "loss": 1.0477, + "step": 1567 + }, + { + "epoch": 0.33447098976109213, + "grad_norm": 0.3953728454339153, + "learning_rate": 7.64411943936624e-05, + "loss": 1.0568, + "step": 1568 + }, + { + "epoch": 0.3346843003412969, + "grad_norm": 0.5679052384738448, + "learning_rate": 7.648994515539306e-05, + "loss": 1.0595, + "step": 1569 + }, + { + "epoch": 0.3348976109215017, + "grad_norm": 0.5532771608386938, + "learning_rate": 7.653869591712371e-05, + "loss": 1.0356, + "step": 1570 + }, + { + "epoch": 0.33511092150170646, + "grad_norm": 0.5055988869414461, + "learning_rate": 7.658744667885436e-05, + "loss": 1.0234, + "step": 1571 + }, + { + "epoch": 0.33532423208191126, + "grad_norm": 0.6510119280428212, + "learning_rate": 7.663619744058502e-05, + "loss": 1.0359, + "step": 1572 + }, + { + "epoch": 0.33553754266211605, + "grad_norm": 0.6534823443613641, + "learning_rate": 7.668494820231567e-05, + "loss": 1.0138, + "step": 1573 + }, + { + "epoch": 0.33575085324232085, + "grad_norm": 0.7301070906819657, + "learning_rate": 7.673369896404632e-05, + "loss": 1.0083, + "step": 1574 + }, + { + "epoch": 0.3359641638225256, + "grad_norm": 0.8932933800700228, + "learning_rate": 7.678244972577697e-05, + "loss": 1.0324, + "step": 1575 + }, + { + "epoch": 0.3361774744027304, + "grad_norm": 1.0150198476553565, + "learning_rate": 7.683120048750761e-05, + "loss": 1.0616, + "step": 1576 + }, + { + "epoch": 0.3363907849829352, + "grad_norm": 1.1508510279547726, + "learning_rate": 7.687995124923827e-05, + "loss": 1.0474, + "step": 1577 + }, + { + "epoch": 0.3366040955631399, + "grad_norm": 0.7859180789470361, + "learning_rate": 7.692870201096893e-05, + "loss": 1.0325, + "step": 1578 + }, + { + "epoch": 0.3368174061433447, + "grad_norm": 0.6276541738035173, + "learning_rate": 7.697745277269959e-05, + "loss": 1.0413, + "step": 1579 + }, + { + "epoch": 0.3370307167235495, + "grad_norm": 0.6569805304360614, + "learning_rate": 7.702620353443024e-05, + "loss": 1.0246, + "step": 1580 + }, + { + "epoch": 0.33724402730375425, + "grad_norm": 0.6820354385280121, + "learning_rate": 7.707495429616088e-05, + "loss": 1.048, + "step": 1581 + }, + { + "epoch": 0.33745733788395904, + "grad_norm": 0.7682449688105084, + "learning_rate": 7.712370505789153e-05, + "loss": 1.0198, + "step": 1582 + }, + { + "epoch": 0.33767064846416384, + "grad_norm": 0.9043447492282712, + "learning_rate": 7.717245581962219e-05, + "loss": 1.0382, + "step": 1583 + }, + { + "epoch": 0.3378839590443686, + "grad_norm": 1.120032111558173, + "learning_rate": 7.722120658135284e-05, + "loss": 1.0205, + "step": 1584 + }, + { + "epoch": 0.33809726962457337, + "grad_norm": 0.7681942321401305, + "learning_rate": 7.726995734308349e-05, + "loss": 1.0384, + "step": 1585 + }, + { + "epoch": 0.33831058020477817, + "grad_norm": 0.689927963857581, + "learning_rate": 7.731870810481414e-05, + "loss": 1.0514, + "step": 1586 + }, + { + "epoch": 0.33852389078498296, + "grad_norm": 0.6276873090184809, + "learning_rate": 7.73674588665448e-05, + "loss": 1.0252, + "step": 1587 + }, + { + "epoch": 0.3387372013651877, + "grad_norm": 0.6740743235506534, + "learning_rate": 7.741620962827545e-05, + "loss": 1.0167, + "step": 1588 + }, + { + "epoch": 0.3389505119453925, + "grad_norm": 0.5811291264304586, + "learning_rate": 7.74649603900061e-05, + "loss": 1.0361, + "step": 1589 + }, + { + "epoch": 0.3391638225255973, + "grad_norm": 0.5888605350644204, + "learning_rate": 7.751371115173676e-05, + "loss": 1.0284, + "step": 1590 + }, + { + "epoch": 0.33937713310580203, + "grad_norm": 0.766174149259972, + "learning_rate": 7.756246191346741e-05, + "loss": 1.062, + "step": 1591 + }, + { + "epoch": 0.3395904436860068, + "grad_norm": 0.5954888480387905, + "learning_rate": 7.761121267519805e-05, + "loss": 1.0068, + "step": 1592 + }, + { + "epoch": 0.3398037542662116, + "grad_norm": 0.6631527568996218, + "learning_rate": 7.76599634369287e-05, + "loss": 1.0598, + "step": 1593 + }, + { + "epoch": 0.34001706484641636, + "grad_norm": 0.6458291571814803, + "learning_rate": 7.770871419865936e-05, + "loss": 1.0426, + "step": 1594 + }, + { + "epoch": 0.34023037542662116, + "grad_norm": 0.4170712771840706, + "learning_rate": 7.775746496039001e-05, + "loss": 0.9903, + "step": 1595 + }, + { + "epoch": 0.34044368600682595, + "grad_norm": 0.5315632286327943, + "learning_rate": 7.780621572212067e-05, + "loss": 1.0051, + "step": 1596 + }, + { + "epoch": 0.3406569965870307, + "grad_norm": 0.7947043915588453, + "learning_rate": 7.785496648385131e-05, + "loss": 1.0267, + "step": 1597 + }, + { + "epoch": 0.3408703071672355, + "grad_norm": 0.9247418459607237, + "learning_rate": 7.790371724558197e-05, + "loss": 1.0345, + "step": 1598 + }, + { + "epoch": 0.3410836177474403, + "grad_norm": 0.9863672376518341, + "learning_rate": 7.795246800731262e-05, + "loss": 1.0467, + "step": 1599 + }, + { + "epoch": 0.3412969283276451, + "grad_norm": 1.0355297802525145, + "learning_rate": 7.800121876904327e-05, + "loss": 1.0591, + "step": 1600 + }, + { + "epoch": 0.3415102389078498, + "grad_norm": 0.989583122851217, + "learning_rate": 7.804996953077393e-05, + "loss": 1.0324, + "step": 1601 + }, + { + "epoch": 0.3417235494880546, + "grad_norm": 0.9898985536739082, + "learning_rate": 7.809872029250458e-05, + "loss": 1.0307, + "step": 1602 + }, + { + "epoch": 0.3419368600682594, + "grad_norm": 0.9204900279039979, + "learning_rate": 7.814747105423523e-05, + "loss": 1.0211, + "step": 1603 + }, + { + "epoch": 0.34215017064846415, + "grad_norm": 0.875864605950398, + "learning_rate": 7.819622181596589e-05, + "loss": 1.0049, + "step": 1604 + }, + { + "epoch": 0.34236348122866894, + "grad_norm": 0.9590420511647038, + "learning_rate": 7.824497257769654e-05, + "loss": 1.027, + "step": 1605 + }, + { + "epoch": 0.34257679180887374, + "grad_norm": 1.014449231531124, + "learning_rate": 7.829372333942719e-05, + "loss": 1.0299, + "step": 1606 + }, + { + "epoch": 0.3427901023890785, + "grad_norm": 0.9752821064053867, + "learning_rate": 7.834247410115783e-05, + "loss": 1.0247, + "step": 1607 + }, + { + "epoch": 0.3430034129692833, + "grad_norm": 0.8586189685868137, + "learning_rate": 7.839122486288848e-05, + "loss": 0.9942, + "step": 1608 + }, + { + "epoch": 0.34321672354948807, + "grad_norm": 0.8163448304591143, + "learning_rate": 7.843997562461914e-05, + "loss": 1.0535, + "step": 1609 + }, + { + "epoch": 0.3434300341296928, + "grad_norm": 0.7800509040337726, + "learning_rate": 7.848872638634979e-05, + "loss": 1.0553, + "step": 1610 + }, + { + "epoch": 0.3436433447098976, + "grad_norm": 0.6121346701836885, + "learning_rate": 7.853747714808044e-05, + "loss": 1.0253, + "step": 1611 + }, + { + "epoch": 0.3438566552901024, + "grad_norm": 0.5648867183646822, + "learning_rate": 7.85862279098111e-05, + "loss": 0.9894, + "step": 1612 + }, + { + "epoch": 0.3440699658703072, + "grad_norm": 0.5546546828333101, + "learning_rate": 7.863497867154175e-05, + "loss": 1.0429, + "step": 1613 + }, + { + "epoch": 0.34428327645051193, + "grad_norm": 0.6073124315253996, + "learning_rate": 7.86837294332724e-05, + "loss": 1.0248, + "step": 1614 + }, + { + "epoch": 0.3444965870307167, + "grad_norm": 0.5937292107716117, + "learning_rate": 7.873248019500306e-05, + "loss": 1.0368, + "step": 1615 + }, + { + "epoch": 0.3447098976109215, + "grad_norm": 0.6023014618565141, + "learning_rate": 7.878123095673371e-05, + "loss": 1.0134, + "step": 1616 + }, + { + "epoch": 0.34492320819112626, + "grad_norm": 0.7503657649487274, + "learning_rate": 7.882998171846436e-05, + "loss": 1.0586, + "step": 1617 + }, + { + "epoch": 0.34513651877133106, + "grad_norm": 0.7806680455443268, + "learning_rate": 7.8878732480195e-05, + "loss": 1.0455, + "step": 1618 + }, + { + "epoch": 0.34534982935153585, + "grad_norm": 0.7143129941722636, + "learning_rate": 7.892748324192565e-05, + "loss": 1.0042, + "step": 1619 + }, + { + "epoch": 0.3455631399317406, + "grad_norm": 0.5675977609594827, + "learning_rate": 7.897623400365632e-05, + "loss": 1.0908, + "step": 1620 + }, + { + "epoch": 0.3457764505119454, + "grad_norm": 0.4031313218155185, + "learning_rate": 7.902498476538697e-05, + "loss": 1.0638, + "step": 1621 + }, + { + "epoch": 0.3459897610921502, + "grad_norm": 0.4676217178506133, + "learning_rate": 7.907373552711763e-05, + "loss": 1.0555, + "step": 1622 + }, + { + "epoch": 0.3462030716723549, + "grad_norm": 0.5260125355231037, + "learning_rate": 7.912248628884827e-05, + "loss": 1.0624, + "step": 1623 + }, + { + "epoch": 0.3464163822525597, + "grad_norm": 0.459572631176196, + "learning_rate": 7.917123705057892e-05, + "loss": 1.0244, + "step": 1624 + }, + { + "epoch": 0.3466296928327645, + "grad_norm": 0.34724865868467214, + "learning_rate": 7.921998781230957e-05, + "loss": 1.0467, + "step": 1625 + }, + { + "epoch": 0.3468430034129693, + "grad_norm": 0.46574284570902214, + "learning_rate": 7.926873857404022e-05, + "loss": 1.0411, + "step": 1626 + }, + { + "epoch": 0.34705631399317405, + "grad_norm": 0.616213903039594, + "learning_rate": 7.931748933577088e-05, + "loss": 1.0031, + "step": 1627 + }, + { + "epoch": 0.34726962457337884, + "grad_norm": 0.547976968729227, + "learning_rate": 7.936624009750153e-05, + "loss": 1.0247, + "step": 1628 + }, + { + "epoch": 0.34748293515358364, + "grad_norm": 0.44835310600504436, + "learning_rate": 7.941499085923218e-05, + "loss": 1.0244, + "step": 1629 + }, + { + "epoch": 0.3476962457337884, + "grad_norm": 0.5540504872404591, + "learning_rate": 7.946374162096284e-05, + "loss": 1.0073, + "step": 1630 + }, + { + "epoch": 0.3479095563139932, + "grad_norm": 0.6872250742005787, + "learning_rate": 7.951249238269349e-05, + "loss": 1.0036, + "step": 1631 + }, + { + "epoch": 0.34812286689419797, + "grad_norm": 0.8634559472830611, + "learning_rate": 7.956124314442414e-05, + "loss": 1.0313, + "step": 1632 + }, + { + "epoch": 0.3483361774744027, + "grad_norm": 0.9666854425773594, + "learning_rate": 7.96099939061548e-05, + "loss": 1.0406, + "step": 1633 + }, + { + "epoch": 0.3485494880546075, + "grad_norm": 0.9655204320035564, + "learning_rate": 7.965874466788544e-05, + "loss": 1.0204, + "step": 1634 + }, + { + "epoch": 0.3487627986348123, + "grad_norm": 1.0003465509607299, + "learning_rate": 7.970749542961609e-05, + "loss": 1.0299, + "step": 1635 + }, + { + "epoch": 0.34897610921501704, + "grad_norm": 1.0846378678334232, + "learning_rate": 7.975624619134674e-05, + "loss": 1.0669, + "step": 1636 + }, + { + "epoch": 0.34918941979522183, + "grad_norm": 1.0091334209962746, + "learning_rate": 7.98049969530774e-05, + "loss": 1.0025, + "step": 1637 + }, + { + "epoch": 0.34940273037542663, + "grad_norm": 0.9953067146078007, + "learning_rate": 7.985374771480805e-05, + "loss": 1.0372, + "step": 1638 + }, + { + "epoch": 0.3496160409556314, + "grad_norm": 1.0024604649213575, + "learning_rate": 7.99024984765387e-05, + "loss": 1.0524, + "step": 1639 + }, + { + "epoch": 0.34982935153583616, + "grad_norm": 0.7923229283208546, + "learning_rate": 7.995124923826935e-05, + "loss": 1.0101, + "step": 1640 + }, + { + "epoch": 0.35004266211604096, + "grad_norm": 0.5223005399874736, + "learning_rate": 8e-05, + "loss": 1.0333, + "step": 1641 + }, + { + "epoch": 0.35025597269624575, + "grad_norm": 0.5212056937504805, + "learning_rate": 8.004875076173065e-05, + "loss": 1.01, + "step": 1642 + }, + { + "epoch": 0.3504692832764505, + "grad_norm": 0.5915343533356198, + "learning_rate": 8.009750152346131e-05, + "loss": 1.0175, + "step": 1643 + }, + { + "epoch": 0.3506825938566553, + "grad_norm": 0.7005844981945955, + "learning_rate": 8.014625228519197e-05, + "loss": 1.0103, + "step": 1644 + }, + { + "epoch": 0.3508959044368601, + "grad_norm": 0.643662498285595, + "learning_rate": 8.019500304692262e-05, + "loss": 1.0484, + "step": 1645 + }, + { + "epoch": 0.3511092150170648, + "grad_norm": 0.45993377159438276, + "learning_rate": 8.024375380865327e-05, + "loss": 1.0112, + "step": 1646 + }, + { + "epoch": 0.3513225255972696, + "grad_norm": 2.0938289839071214, + "learning_rate": 8.029250457038391e-05, + "loss": 1.0713, + "step": 1647 + }, + { + "epoch": 0.3515358361774744, + "grad_norm": 0.5791522076471226, + "learning_rate": 8.034125533211458e-05, + "loss": 1.0373, + "step": 1648 + }, + { + "epoch": 0.35174914675767915, + "grad_norm": 1.110142088231322, + "learning_rate": 8.039000609384522e-05, + "loss": 1.0399, + "step": 1649 + }, + { + "epoch": 0.35196245733788395, + "grad_norm": 1.1105746019610638, + "learning_rate": 8.043875685557588e-05, + "loss": 1.0246, + "step": 1650 + }, + { + "epoch": 0.35217576791808874, + "grad_norm": 2.877403559876316, + "learning_rate": 8.048750761730652e-05, + "loss": 1.0325, + "step": 1651 + }, + { + "epoch": 0.35238907849829354, + "grad_norm": 1.3927058202081535, + "learning_rate": 8.053625837903718e-05, + "loss": 1.0369, + "step": 1652 + }, + { + "epoch": 0.3526023890784983, + "grad_norm": 1.288900936212764, + "learning_rate": 8.058500914076783e-05, + "loss": 1.0366, + "step": 1653 + }, + { + "epoch": 0.3528156996587031, + "grad_norm": 0.648439606707072, + "learning_rate": 8.063375990249848e-05, + "loss": 1.0462, + "step": 1654 + }, + { + "epoch": 0.35302901023890787, + "grad_norm": 1.034616965362602, + "learning_rate": 8.068251066422914e-05, + "loss": 1.0238, + "step": 1655 + }, + { + "epoch": 0.3532423208191126, + "grad_norm": 1.3902191721560733, + "learning_rate": 8.073126142595979e-05, + "loss": 1.0319, + "step": 1656 + }, + { + "epoch": 0.3534556313993174, + "grad_norm": 0.6643257696249503, + "learning_rate": 8.078001218769043e-05, + "loss": 1.0476, + "step": 1657 + }, + { + "epoch": 0.3536689419795222, + "grad_norm": 1.2403288665327543, + "learning_rate": 8.08287629494211e-05, + "loss": 1.0601, + "step": 1658 + }, + { + "epoch": 0.35388225255972694, + "grad_norm": 1.0952661616744235, + "learning_rate": 8.087751371115173e-05, + "loss": 1.0421, + "step": 1659 + }, + { + "epoch": 0.35409556313993173, + "grad_norm": 0.9224445609621512, + "learning_rate": 8.09262644728824e-05, + "loss": 1.055, + "step": 1660 + }, + { + "epoch": 0.35430887372013653, + "grad_norm": 0.7225987466098217, + "learning_rate": 8.097501523461304e-05, + "loss": 1.062, + "step": 1661 + }, + { + "epoch": 0.35452218430034127, + "grad_norm": 1.4240967615875315, + "learning_rate": 8.102376599634369e-05, + "loss": 1.0668, + "step": 1662 + }, + { + "epoch": 0.35473549488054607, + "grad_norm": 0.5712935818852444, + "learning_rate": 8.107251675807436e-05, + "loss": 1.0198, + "step": 1663 + }, + { + "epoch": 0.35494880546075086, + "grad_norm": 1.254656500977429, + "learning_rate": 8.1121267519805e-05, + "loss": 1.0506, + "step": 1664 + }, + { + "epoch": 0.35516211604095566, + "grad_norm": 0.6544200108711257, + "learning_rate": 8.117001828153567e-05, + "loss": 1.0373, + "step": 1665 + }, + { + "epoch": 0.3553754266211604, + "grad_norm": 0.9321237012494406, + "learning_rate": 8.12187690432663e-05, + "loss": 1.0366, + "step": 1666 + }, + { + "epoch": 0.3555887372013652, + "grad_norm": 0.904034318446588, + "learning_rate": 8.126751980499697e-05, + "loss": 1.0333, + "step": 1667 + }, + { + "epoch": 0.35580204778157, + "grad_norm": 0.6239809040246509, + "learning_rate": 8.131627056672761e-05, + "loss": 1.0335, + "step": 1668 + }, + { + "epoch": 0.3560153583617747, + "grad_norm": 0.7444669864383586, + "learning_rate": 8.136502132845826e-05, + "loss": 1.0512, + "step": 1669 + }, + { + "epoch": 0.3562286689419795, + "grad_norm": 0.6521565425414634, + "learning_rate": 8.141377209018892e-05, + "loss": 1.0189, + "step": 1670 + }, + { + "epoch": 0.3564419795221843, + "grad_norm": 0.4630770931780532, + "learning_rate": 8.146252285191957e-05, + "loss": 1.0272, + "step": 1671 + }, + { + "epoch": 0.35665529010238906, + "grad_norm": 0.8072125682834308, + "learning_rate": 8.151127361365022e-05, + "loss": 1.0689, + "step": 1672 + }, + { + "epoch": 0.35686860068259385, + "grad_norm": 0.8306874563362271, + "learning_rate": 8.156002437538088e-05, + "loss": 1.0474, + "step": 1673 + }, + { + "epoch": 0.35708191126279865, + "grad_norm": 0.7471394740066171, + "learning_rate": 8.160877513711152e-05, + "loss": 1.0171, + "step": 1674 + }, + { + "epoch": 0.3572952218430034, + "grad_norm": 0.7996264299090617, + "learning_rate": 8.165752589884218e-05, + "loss": 1.0444, + "step": 1675 + }, + { + "epoch": 0.3575085324232082, + "grad_norm": 0.715010548761344, + "learning_rate": 8.170627666057282e-05, + "loss": 0.9968, + "step": 1676 + }, + { + "epoch": 0.357721843003413, + "grad_norm": 0.7559341114767194, + "learning_rate": 8.175502742230349e-05, + "loss": 1.0016, + "step": 1677 + }, + { + "epoch": 0.35793515358361777, + "grad_norm": 0.5476615408017901, + "learning_rate": 8.180377818403413e-05, + "loss": 1.0416, + "step": 1678 + }, + { + "epoch": 0.3581484641638225, + "grad_norm": 0.5596386967232589, + "learning_rate": 8.185252894576478e-05, + "loss": 1.0216, + "step": 1679 + }, + { + "epoch": 0.3583617747440273, + "grad_norm": 0.6484870250769353, + "learning_rate": 8.190127970749543e-05, + "loss": 1.0186, + "step": 1680 + }, + { + "epoch": 0.3585750853242321, + "grad_norm": 0.5964525551057389, + "learning_rate": 8.195003046922609e-05, + "loss": 1.0336, + "step": 1681 + }, + { + "epoch": 0.35878839590443684, + "grad_norm": 0.5861004732525558, + "learning_rate": 8.199878123095675e-05, + "loss": 1.0302, + "step": 1682 + }, + { + "epoch": 0.35900170648464164, + "grad_norm": 0.5732086478082877, + "learning_rate": 8.204753199268739e-05, + "loss": 1.0351, + "step": 1683 + }, + { + "epoch": 0.35921501706484643, + "grad_norm": 0.4318772106115433, + "learning_rate": 8.209628275441803e-05, + "loss": 0.9906, + "step": 1684 + }, + { + "epoch": 0.35942832764505117, + "grad_norm": 0.4853056268777749, + "learning_rate": 8.21450335161487e-05, + "loss": 1.0085, + "step": 1685 + }, + { + "epoch": 0.35964163822525597, + "grad_norm": 0.5303111863741032, + "learning_rate": 8.219378427787934e-05, + "loss": 1.0189, + "step": 1686 + }, + { + "epoch": 0.35985494880546076, + "grad_norm": 0.5780510374824954, + "learning_rate": 8.224253503961e-05, + "loss": 1.0135, + "step": 1687 + }, + { + "epoch": 0.36006825938566556, + "grad_norm": 0.5609367135569184, + "learning_rate": 8.229128580134066e-05, + "loss": 1.0402, + "step": 1688 + }, + { + "epoch": 0.3602815699658703, + "grad_norm": 0.5063396435739078, + "learning_rate": 8.23400365630713e-05, + "loss": 1.017, + "step": 1689 + }, + { + "epoch": 0.3604948805460751, + "grad_norm": 0.6947549803625538, + "learning_rate": 8.238878732480196e-05, + "loss": 0.9979, + "step": 1690 + }, + { + "epoch": 0.3607081911262799, + "grad_norm": 0.6629719272049013, + "learning_rate": 8.24375380865326e-05, + "loss": 1.0361, + "step": 1691 + }, + { + "epoch": 0.3609215017064846, + "grad_norm": 0.6868484396757393, + "learning_rate": 8.248628884826327e-05, + "loss": 1.0294, + "step": 1692 + }, + { + "epoch": 0.3611348122866894, + "grad_norm": 0.7258482572001421, + "learning_rate": 8.253503960999391e-05, + "loss": 1.038, + "step": 1693 + }, + { + "epoch": 0.3613481228668942, + "grad_norm": 0.6741758733223192, + "learning_rate": 8.258379037172456e-05, + "loss": 1.0313, + "step": 1694 + }, + { + "epoch": 0.36156143344709896, + "grad_norm": 0.5944040406391511, + "learning_rate": 8.263254113345522e-05, + "loss": 1.0434, + "step": 1695 + }, + { + "epoch": 0.36177474402730375, + "grad_norm": 0.4695847335531975, + "learning_rate": 8.268129189518587e-05, + "loss": 1.059, + "step": 1696 + }, + { + "epoch": 0.36198805460750855, + "grad_norm": 0.3839384438155547, + "learning_rate": 8.273004265691652e-05, + "loss": 1.0245, + "step": 1697 + }, + { + "epoch": 0.3622013651877133, + "grad_norm": 0.38531924930750694, + "learning_rate": 8.277879341864717e-05, + "loss": 1.0221, + "step": 1698 + }, + { + "epoch": 0.3624146757679181, + "grad_norm": 0.407732234348823, + "learning_rate": 8.282754418037781e-05, + "loss": 1.0744, + "step": 1699 + }, + { + "epoch": 0.3626279863481229, + "grad_norm": 0.5007097862739133, + "learning_rate": 8.287629494210848e-05, + "loss": 1.0395, + "step": 1700 + }, + { + "epoch": 0.3628412969283277, + "grad_norm": 0.5229008686778952, + "learning_rate": 8.292504570383912e-05, + "loss": 1.0239, + "step": 1701 + }, + { + "epoch": 0.3630546075085324, + "grad_norm": 0.5909589000599823, + "learning_rate": 8.297379646556979e-05, + "loss": 1.0315, + "step": 1702 + }, + { + "epoch": 0.3632679180887372, + "grad_norm": 0.764677844079117, + "learning_rate": 8.302254722730043e-05, + "loss": 1.0127, + "step": 1703 + }, + { + "epoch": 0.363481228668942, + "grad_norm": 1.0981358007869204, + "learning_rate": 8.307129798903108e-05, + "loss": 1.022, + "step": 1704 + }, + { + "epoch": 0.36369453924914674, + "grad_norm": 1.156192214610539, + "learning_rate": 8.312004875076175e-05, + "loss": 1.0416, + "step": 1705 + }, + { + "epoch": 0.36390784982935154, + "grad_norm": 0.7879207271588937, + "learning_rate": 8.316879951249239e-05, + "loss": 1.039, + "step": 1706 + }, + { + "epoch": 0.36412116040955633, + "grad_norm": 0.570386383094708, + "learning_rate": 8.321755027422305e-05, + "loss": 1.0432, + "step": 1707 + }, + { + "epoch": 0.3643344709897611, + "grad_norm": 0.4858040145886447, + "learning_rate": 8.326630103595369e-05, + "loss": 1.0467, + "step": 1708 + }, + { + "epoch": 0.36454778156996587, + "grad_norm": 0.5592758911279768, + "learning_rate": 8.331505179768436e-05, + "loss": 1.0049, + "step": 1709 + }, + { + "epoch": 0.36476109215017066, + "grad_norm": 0.7080895193959106, + "learning_rate": 8.3363802559415e-05, + "loss": 1.0257, + "step": 1710 + }, + { + "epoch": 0.3649744027303754, + "grad_norm": 0.9138181886164903, + "learning_rate": 8.341255332114565e-05, + "loss": 1.0227, + "step": 1711 + }, + { + "epoch": 0.3651877133105802, + "grad_norm": 1.1342812634566208, + "learning_rate": 8.34613040828763e-05, + "loss": 1.0545, + "step": 1712 + }, + { + "epoch": 0.365401023890785, + "grad_norm": 0.86356749519849, + "learning_rate": 8.351005484460696e-05, + "loss": 1.0232, + "step": 1713 + }, + { + "epoch": 0.3656143344709898, + "grad_norm": 0.6915361534504274, + "learning_rate": 8.355880560633761e-05, + "loss": 1.0409, + "step": 1714 + }, + { + "epoch": 0.3658276450511945, + "grad_norm": 0.5407451548060029, + "learning_rate": 8.360755636806826e-05, + "loss": 1.0156, + "step": 1715 + }, + { + "epoch": 0.3660409556313993, + "grad_norm": 0.5201355548388948, + "learning_rate": 8.36563071297989e-05, + "loss": 1.0206, + "step": 1716 + }, + { + "epoch": 0.3662542662116041, + "grad_norm": 0.5205684749809918, + "learning_rate": 8.370505789152957e-05, + "loss": 1.0316, + "step": 1717 + }, + { + "epoch": 0.36646757679180886, + "grad_norm": 0.49053583298536396, + "learning_rate": 8.375380865326021e-05, + "loss": 1.0244, + "step": 1718 + }, + { + "epoch": 0.36668088737201365, + "grad_norm": 0.5551670628532298, + "learning_rate": 8.380255941499087e-05, + "loss": 1.0298, + "step": 1719 + }, + { + "epoch": 0.36689419795221845, + "grad_norm": 0.7122686596575447, + "learning_rate": 8.385131017672151e-05, + "loss": 1.018, + "step": 1720 + }, + { + "epoch": 0.3671075085324232, + "grad_norm": 0.8423915143369747, + "learning_rate": 8.390006093845217e-05, + "loss": 1.024, + "step": 1721 + }, + { + "epoch": 0.367320819112628, + "grad_norm": 0.8618778561581798, + "learning_rate": 8.394881170018282e-05, + "loss": 1.0328, + "step": 1722 + }, + { + "epoch": 0.3675341296928328, + "grad_norm": 0.8614170231304029, + "learning_rate": 8.399756246191347e-05, + "loss": 1.0649, + "step": 1723 + }, + { + "epoch": 0.3677474402730375, + "grad_norm": 0.7820140907273722, + "learning_rate": 8.404631322364414e-05, + "loss": 1.0271, + "step": 1724 + }, + { + "epoch": 0.3679607508532423, + "grad_norm": 0.7756862007981733, + "learning_rate": 8.409506398537478e-05, + "loss": 1.015, + "step": 1725 + }, + { + "epoch": 0.3681740614334471, + "grad_norm": 0.7466316894401431, + "learning_rate": 8.414381474710542e-05, + "loss": 0.9974, + "step": 1726 + }, + { + "epoch": 0.3683873720136519, + "grad_norm": 0.8386286878545364, + "learning_rate": 8.419256550883608e-05, + "loss": 1.0104, + "step": 1727 + }, + { + "epoch": 0.36860068259385664, + "grad_norm": 1.0106916611007493, + "learning_rate": 8.424131627056672e-05, + "loss": 1.053, + "step": 1728 + }, + { + "epoch": 0.36881399317406144, + "grad_norm": 1.1424240086725637, + "learning_rate": 8.429006703229739e-05, + "loss": 1.0284, + "step": 1729 + }, + { + "epoch": 0.36902730375426623, + "grad_norm": 0.7130183853568317, + "learning_rate": 8.433881779402804e-05, + "loss": 1.0438, + "step": 1730 + }, + { + "epoch": 0.369240614334471, + "grad_norm": 0.6155854323578681, + "learning_rate": 8.438756855575868e-05, + "loss": 1.0005, + "step": 1731 + }, + { + "epoch": 0.36945392491467577, + "grad_norm": 0.5974527330006525, + "learning_rate": 8.443631931748935e-05, + "loss": 1.0326, + "step": 1732 + }, + { + "epoch": 0.36966723549488056, + "grad_norm": 0.5174426703722935, + "learning_rate": 8.448507007921999e-05, + "loss": 1.0217, + "step": 1733 + }, + { + "epoch": 0.3698805460750853, + "grad_norm": 0.3975414120046563, + "learning_rate": 8.453382084095066e-05, + "loss": 1.0211, + "step": 1734 + }, + { + "epoch": 0.3700938566552901, + "grad_norm": 0.4221865004479814, + "learning_rate": 8.45825716026813e-05, + "loss": 1.0179, + "step": 1735 + }, + { + "epoch": 0.3703071672354949, + "grad_norm": 0.4897092072878396, + "learning_rate": 8.463132236441195e-05, + "loss": 1.0409, + "step": 1736 + }, + { + "epoch": 0.37052047781569963, + "grad_norm": 0.5621956016242252, + "learning_rate": 8.46800731261426e-05, + "loss": 1.0273, + "step": 1737 + }, + { + "epoch": 0.37073378839590443, + "grad_norm": 0.6048089867198996, + "learning_rate": 8.472882388787325e-05, + "loss": 1.0286, + "step": 1738 + }, + { + "epoch": 0.3709470989761092, + "grad_norm": 0.566077300609908, + "learning_rate": 8.477757464960391e-05, + "loss": 0.9953, + "step": 1739 + }, + { + "epoch": 0.371160409556314, + "grad_norm": 0.5501534800778128, + "learning_rate": 8.482632541133456e-05, + "loss": 1.032, + "step": 1740 + }, + { + "epoch": 0.37137372013651876, + "grad_norm": 0.5556524353787505, + "learning_rate": 8.48750761730652e-05, + "loss": 1.0087, + "step": 1741 + }, + { + "epoch": 0.37158703071672355, + "grad_norm": 0.5885309775405376, + "learning_rate": 8.492382693479587e-05, + "loss": 1.0335, + "step": 1742 + }, + { + "epoch": 0.37180034129692835, + "grad_norm": 0.6091403007519004, + "learning_rate": 8.49725776965265e-05, + "loss": 1.0322, + "step": 1743 + }, + { + "epoch": 0.3720136518771331, + "grad_norm": 0.6809057144899536, + "learning_rate": 8.502132845825717e-05, + "loss": 1.0444, + "step": 1744 + }, + { + "epoch": 0.3722269624573379, + "grad_norm": 0.6699338137408917, + "learning_rate": 8.507007921998781e-05, + "loss": 1.0151, + "step": 1745 + }, + { + "epoch": 0.3724402730375427, + "grad_norm": 0.6620527629303165, + "learning_rate": 8.511882998171847e-05, + "loss": 1.0011, + "step": 1746 + }, + { + "epoch": 0.3726535836177474, + "grad_norm": 0.704802594140097, + "learning_rate": 8.516758074344913e-05, + "loss": 1.0269, + "step": 1747 + }, + { + "epoch": 0.3728668941979522, + "grad_norm": 0.7402354555445164, + "learning_rate": 8.521633150517977e-05, + "loss": 1.0145, + "step": 1748 + }, + { + "epoch": 0.373080204778157, + "grad_norm": 0.773315982978999, + "learning_rate": 8.526508226691044e-05, + "loss": 1.0236, + "step": 1749 + }, + { + "epoch": 0.37329351535836175, + "grad_norm": 0.7839449409447343, + "learning_rate": 8.531383302864108e-05, + "loss": 1.0771, + "step": 1750 + }, + { + "epoch": 0.37350682593856654, + "grad_norm": 0.7435304548442465, + "learning_rate": 8.536258379037174e-05, + "loss": 1.0073, + "step": 1751 + }, + { + "epoch": 0.37372013651877134, + "grad_norm": 0.6569231572578151, + "learning_rate": 8.541133455210238e-05, + "loss": 1.0385, + "step": 1752 + }, + { + "epoch": 0.37393344709897613, + "grad_norm": 0.7047244035731084, + "learning_rate": 8.546008531383304e-05, + "loss": 1.0837, + "step": 1753 + }, + { + "epoch": 0.3741467576791809, + "grad_norm": 0.9009739927945779, + "learning_rate": 8.550883607556369e-05, + "loss": 1.0209, + "step": 1754 + }, + { + "epoch": 0.37436006825938567, + "grad_norm": 1.003754417640162, + "learning_rate": 8.555758683729434e-05, + "loss": 1.0278, + "step": 1755 + }, + { + "epoch": 0.37457337883959047, + "grad_norm": 0.9607047011927842, + "learning_rate": 8.5606337599025e-05, + "loss": 1.011, + "step": 1756 + }, + { + "epoch": 0.3747866894197952, + "grad_norm": 0.8651870432885636, + "learning_rate": 8.565508836075565e-05, + "loss": 1.0269, + "step": 1757 + }, + { + "epoch": 0.375, + "grad_norm": 0.8242488520190435, + "learning_rate": 8.570383912248629e-05, + "loss": 0.9982, + "step": 1758 + }, + { + "epoch": 0.3752133105802048, + "grad_norm": 0.9469134513764694, + "learning_rate": 8.575258988421695e-05, + "loss": 0.9965, + "step": 1759 + }, + { + "epoch": 0.37542662116040953, + "grad_norm": 1.0639109474177761, + "learning_rate": 8.58013406459476e-05, + "loss": 1.004, + "step": 1760 + }, + { + "epoch": 0.37563993174061433, + "grad_norm": 0.7324783096967942, + "learning_rate": 8.585009140767826e-05, + "loss": 1.0209, + "step": 1761 + }, + { + "epoch": 0.3758532423208191, + "grad_norm": 0.677049450064904, + "learning_rate": 8.58988421694089e-05, + "loss": 1.0022, + "step": 1762 + }, + { + "epoch": 0.37606655290102387, + "grad_norm": 0.6910978382033378, + "learning_rate": 8.594759293113955e-05, + "loss": 1.0359, + "step": 1763 + }, + { + "epoch": 0.37627986348122866, + "grad_norm": 0.596922904525674, + "learning_rate": 8.59963436928702e-05, + "loss": 1.0434, + "step": 1764 + }, + { + "epoch": 0.37649317406143346, + "grad_norm": 0.7335320306054965, + "learning_rate": 8.604509445460086e-05, + "loss": 1.0222, + "step": 1765 + }, + { + "epoch": 0.37670648464163825, + "grad_norm": 0.849822088970582, + "learning_rate": 8.609384521633153e-05, + "loss": 1.0272, + "step": 1766 + }, + { + "epoch": 0.376919795221843, + "grad_norm": 0.7580300478220793, + "learning_rate": 8.614259597806217e-05, + "loss": 1.0012, + "step": 1767 + }, + { + "epoch": 0.3771331058020478, + "grad_norm": 0.8668980645119678, + "learning_rate": 8.61913467397928e-05, + "loss": 1.0437, + "step": 1768 + }, + { + "epoch": 0.3773464163822526, + "grad_norm": 0.9440458808919624, + "learning_rate": 8.624009750152347e-05, + "loss": 1.0272, + "step": 1769 + }, + { + "epoch": 0.3775597269624573, + "grad_norm": 0.8604072106137697, + "learning_rate": 8.628884826325411e-05, + "loss": 1.0103, + "step": 1770 + }, + { + "epoch": 0.3777730375426621, + "grad_norm": 0.9509095400432259, + "learning_rate": 8.633759902498478e-05, + "loss": 1.0182, + "step": 1771 + }, + { + "epoch": 0.3779863481228669, + "grad_norm": 1.0490384812102114, + "learning_rate": 8.638634978671543e-05, + "loss": 1.0656, + "step": 1772 + }, + { + "epoch": 0.37819965870307165, + "grad_norm": 0.8074556493910889, + "learning_rate": 8.643510054844607e-05, + "loss": 1.0346, + "step": 1773 + }, + { + "epoch": 0.37841296928327645, + "grad_norm": 0.6876218688053511, + "learning_rate": 8.648385131017674e-05, + "loss": 1.0253, + "step": 1774 + }, + { + "epoch": 0.37862627986348124, + "grad_norm": 0.618433306252932, + "learning_rate": 8.653260207190738e-05, + "loss": 1.0293, + "step": 1775 + }, + { + "epoch": 0.378839590443686, + "grad_norm": 0.6038544768629212, + "learning_rate": 8.658135283363804e-05, + "loss": 1.0188, + "step": 1776 + }, + { + "epoch": 0.3790529010238908, + "grad_norm": 0.9978569392623212, + "learning_rate": 8.663010359536868e-05, + "loss": 1.011, + "step": 1777 + }, + { + "epoch": 0.37926621160409557, + "grad_norm": 1.1554754356882162, + "learning_rate": 8.667885435709933e-05, + "loss": 1.0746, + "step": 1778 + }, + { + "epoch": 0.37947952218430037, + "grad_norm": 0.7432422479005706, + "learning_rate": 8.672760511882999e-05, + "loss": 0.993, + "step": 1779 + }, + { + "epoch": 0.3796928327645051, + "grad_norm": 0.6052655201861705, + "learning_rate": 8.677635588056064e-05, + "loss": 1.0198, + "step": 1780 + }, + { + "epoch": 0.3799061433447099, + "grad_norm": 0.38480027359607527, + "learning_rate": 8.68251066422913e-05, + "loss": 0.9876, + "step": 1781 + }, + { + "epoch": 0.3801194539249147, + "grad_norm": 0.5981631034292618, + "learning_rate": 8.687385740402195e-05, + "loss": 1.0433, + "step": 1782 + }, + { + "epoch": 0.38033276450511944, + "grad_norm": 0.6399344562948528, + "learning_rate": 8.692260816575259e-05, + "loss": 1.0407, + "step": 1783 + }, + { + "epoch": 0.38054607508532423, + "grad_norm": 0.6627107026843019, + "learning_rate": 8.697135892748325e-05, + "loss": 0.998, + "step": 1784 + }, + { + "epoch": 0.380759385665529, + "grad_norm": 0.802663631079623, + "learning_rate": 8.702010968921389e-05, + "loss": 1.0176, + "step": 1785 + }, + { + "epoch": 0.38097269624573377, + "grad_norm": 0.8962515134531829, + "learning_rate": 8.706886045094456e-05, + "loss": 1.0327, + "step": 1786 + }, + { + "epoch": 0.38118600682593856, + "grad_norm": 1.0934073545390384, + "learning_rate": 8.71176112126752e-05, + "loss": 1.0273, + "step": 1787 + }, + { + "epoch": 0.38139931740614336, + "grad_norm": 0.8303734700401538, + "learning_rate": 8.716636197440585e-05, + "loss": 1.0396, + "step": 1788 + }, + { + "epoch": 0.3816126279863481, + "grad_norm": 0.6826737748265642, + "learning_rate": 8.721511273613652e-05, + "loss": 1.0301, + "step": 1789 + }, + { + "epoch": 0.3818259385665529, + "grad_norm": 0.6645032623544873, + "learning_rate": 8.726386349786716e-05, + "loss": 1.0182, + "step": 1790 + }, + { + "epoch": 0.3820392491467577, + "grad_norm": 0.687213507341998, + "learning_rate": 8.731261425959782e-05, + "loss": 1.0776, + "step": 1791 + }, + { + "epoch": 0.3822525597269625, + "grad_norm": 0.680751936692582, + "learning_rate": 8.736136502132846e-05, + "loss": 1.0377, + "step": 1792 + }, + { + "epoch": 0.3824658703071672, + "grad_norm": 0.9251318524394576, + "learning_rate": 8.741011578305913e-05, + "loss": 1.0275, + "step": 1793 + }, + { + "epoch": 0.382679180887372, + "grad_norm": 0.9579215731386345, + "learning_rate": 8.745886654478977e-05, + "loss": 1.0185, + "step": 1794 + }, + { + "epoch": 0.3828924914675768, + "grad_norm": 0.9353888265245534, + "learning_rate": 8.750761730652042e-05, + "loss": 1.0273, + "step": 1795 + }, + { + "epoch": 0.38310580204778155, + "grad_norm": 0.8305896358516688, + "learning_rate": 8.755636806825108e-05, + "loss": 1.025, + "step": 1796 + }, + { + "epoch": 0.38331911262798635, + "grad_norm": 0.7546203333543234, + "learning_rate": 8.760511882998173e-05, + "loss": 1.0491, + "step": 1797 + }, + { + "epoch": 0.38353242320819114, + "grad_norm": 0.6155121155824741, + "learning_rate": 8.765386959171238e-05, + "loss": 1.0335, + "step": 1798 + }, + { + "epoch": 0.3837457337883959, + "grad_norm": 0.5258810492078118, + "learning_rate": 8.770262035344303e-05, + "loss": 0.9975, + "step": 1799 + }, + { + "epoch": 0.3839590443686007, + "grad_norm": 0.5301244473884357, + "learning_rate": 8.775137111517367e-05, + "loss": 1.0206, + "step": 1800 + }, + { + "epoch": 0.3841723549488055, + "grad_norm": 0.568846188471971, + "learning_rate": 8.780012187690434e-05, + "loss": 1.0028, + "step": 1801 + }, + { + "epoch": 0.3843856655290102, + "grad_norm": 0.5291143542652912, + "learning_rate": 8.784887263863498e-05, + "loss": 1.0141, + "step": 1802 + }, + { + "epoch": 0.384598976109215, + "grad_norm": 0.4712362835748106, + "learning_rate": 8.789762340036565e-05, + "loss": 1.015, + "step": 1803 + }, + { + "epoch": 0.3848122866894198, + "grad_norm": 0.46067962746648927, + "learning_rate": 8.794637416209629e-05, + "loss": 1.0569, + "step": 1804 + }, + { + "epoch": 0.3850255972696246, + "grad_norm": 0.4566661245720999, + "learning_rate": 8.799512492382694e-05, + "loss": 1.0461, + "step": 1805 + }, + { + "epoch": 0.38523890784982934, + "grad_norm": 0.4251091658168504, + "learning_rate": 8.804387568555759e-05, + "loss": 1.0425, + "step": 1806 + }, + { + "epoch": 0.38545221843003413, + "grad_norm": 0.42838818919350563, + "learning_rate": 8.809262644728825e-05, + "loss": 0.9968, + "step": 1807 + }, + { + "epoch": 0.3856655290102389, + "grad_norm": 0.5036705890110657, + "learning_rate": 8.814137720901891e-05, + "loss": 1.0438, + "step": 1808 + }, + { + "epoch": 0.38587883959044367, + "grad_norm": 0.5171600057295026, + "learning_rate": 8.819012797074955e-05, + "loss": 1.0265, + "step": 1809 + }, + { + "epoch": 0.38609215017064846, + "grad_norm": 0.6380480525393155, + "learning_rate": 8.823887873248019e-05, + "loss": 1.0021, + "step": 1810 + }, + { + "epoch": 0.38630546075085326, + "grad_norm": 0.769933322573996, + "learning_rate": 8.828762949421086e-05, + "loss": 1.0114, + "step": 1811 + }, + { + "epoch": 0.386518771331058, + "grad_norm": 0.8602454791965066, + "learning_rate": 8.83363802559415e-05, + "loss": 1.0272, + "step": 1812 + }, + { + "epoch": 0.3867320819112628, + "grad_norm": 0.8405426943947985, + "learning_rate": 8.838513101767216e-05, + "loss": 1.0072, + "step": 1813 + }, + { + "epoch": 0.3869453924914676, + "grad_norm": 0.7253349720540038, + "learning_rate": 8.843388177940282e-05, + "loss": 1.0021, + "step": 1814 + }, + { + "epoch": 0.3871587030716723, + "grad_norm": 1.4271298765764358, + "learning_rate": 8.848263254113346e-05, + "loss": 1.0528, + "step": 1815 + }, + { + "epoch": 0.3873720136518771, + "grad_norm": 0.37248068604071066, + "learning_rate": 8.853138330286412e-05, + "loss": 1.0168, + "step": 1816 + }, + { + "epoch": 0.3875853242320819, + "grad_norm": 0.6755257243061312, + "learning_rate": 8.858013406459476e-05, + "loss": 1.033, + "step": 1817 + }, + { + "epoch": 0.3877986348122867, + "grad_norm": 1.18237241715248, + "learning_rate": 8.862888482632543e-05, + "loss": 1.0113, + "step": 1818 + }, + { + "epoch": 0.38801194539249145, + "grad_norm": 1.112027414346963, + "learning_rate": 8.867763558805607e-05, + "loss": 1.0332, + "step": 1819 + }, + { + "epoch": 0.38822525597269625, + "grad_norm": 0.6353926799597456, + "learning_rate": 8.872638634978672e-05, + "loss": 1.0022, + "step": 1820 + }, + { + "epoch": 0.38843856655290104, + "grad_norm": 0.5054078293164308, + "learning_rate": 8.877513711151737e-05, + "loss": 1.0111, + "step": 1821 + }, + { + "epoch": 0.3886518771331058, + "grad_norm": 0.6783507373711489, + "learning_rate": 8.882388787324803e-05, + "loss": 1.0305, + "step": 1822 + }, + { + "epoch": 0.3888651877133106, + "grad_norm": 0.8453987368020099, + "learning_rate": 8.887263863497868e-05, + "loss": 0.9986, + "step": 1823 + }, + { + "epoch": 0.3890784982935154, + "grad_norm": 0.9342038850567774, + "learning_rate": 8.892138939670933e-05, + "loss": 1.0284, + "step": 1824 + }, + { + "epoch": 0.3892918088737201, + "grad_norm": 0.9700441232018074, + "learning_rate": 8.897014015843997e-05, + "loss": 1.0381, + "step": 1825 + }, + { + "epoch": 0.3895051194539249, + "grad_norm": 1.1028522943884354, + "learning_rate": 8.901889092017064e-05, + "loss": 1.0071, + "step": 1826 + }, + { + "epoch": 0.3897184300341297, + "grad_norm": 0.8247787479008838, + "learning_rate": 8.906764168190128e-05, + "loss": 1.0165, + "step": 1827 + }, + { + "epoch": 0.38993174061433444, + "grad_norm": 0.7184291592832126, + "learning_rate": 8.911639244363195e-05, + "loss": 1.012, + "step": 1828 + }, + { + "epoch": 0.39014505119453924, + "grad_norm": 0.7736747674874641, + "learning_rate": 8.916514320536258e-05, + "loss": 1.0099, + "step": 1829 + }, + { + "epoch": 0.39035836177474403, + "grad_norm": 0.7863352535410935, + "learning_rate": 8.921389396709324e-05, + "loss": 1.013, + "step": 1830 + }, + { + "epoch": 0.39057167235494883, + "grad_norm": 0.7058450426082268, + "learning_rate": 8.926264472882389e-05, + "loss": 1.0156, + "step": 1831 + }, + { + "epoch": 0.39078498293515357, + "grad_norm": 0.7163346203535096, + "learning_rate": 8.931139549055454e-05, + "loss": 1.055, + "step": 1832 + }, + { + "epoch": 0.39099829351535836, + "grad_norm": 0.7528620948292998, + "learning_rate": 8.936014625228521e-05, + "loss": 1.0167, + "step": 1833 + }, + { + "epoch": 0.39121160409556316, + "grad_norm": 0.6610332459041368, + "learning_rate": 8.940889701401585e-05, + "loss": 0.9987, + "step": 1834 + }, + { + "epoch": 0.3914249146757679, + "grad_norm": 0.5645210433480881, + "learning_rate": 8.945764777574649e-05, + "loss": 1.0131, + "step": 1835 + }, + { + "epoch": 0.3916382252559727, + "grad_norm": 0.5618529382479173, + "learning_rate": 8.950639853747716e-05, + "loss": 1.055, + "step": 1836 + }, + { + "epoch": 0.3918515358361775, + "grad_norm": 0.6840282562183121, + "learning_rate": 8.955514929920781e-05, + "loss": 0.9894, + "step": 1837 + }, + { + "epoch": 0.39206484641638223, + "grad_norm": 0.8094460689565026, + "learning_rate": 8.960390006093846e-05, + "loss": 1.0154, + "step": 1838 + }, + { + "epoch": 0.392278156996587, + "grad_norm": 0.6187852505769237, + "learning_rate": 8.965265082266911e-05, + "loss": 1.0089, + "step": 1839 + }, + { + "epoch": 0.3924914675767918, + "grad_norm": 0.4634221147080393, + "learning_rate": 8.970140158439977e-05, + "loss": 0.9882, + "step": 1840 + }, + { + "epoch": 0.3927047781569966, + "grad_norm": 0.5510832795401975, + "learning_rate": 8.975015234613042e-05, + "loss": 1.0025, + "step": 1841 + }, + { + "epoch": 0.39291808873720135, + "grad_norm": 0.6317165285168981, + "learning_rate": 8.979890310786106e-05, + "loss": 1.0543, + "step": 1842 + }, + { + "epoch": 0.39313139931740615, + "grad_norm": 0.667054593046533, + "learning_rate": 8.984765386959173e-05, + "loss": 1.0072, + "step": 1843 + }, + { + "epoch": 0.39334470989761094, + "grad_norm": 0.6634391223633791, + "learning_rate": 8.989640463132237e-05, + "loss": 1.004, + "step": 1844 + }, + { + "epoch": 0.3935580204778157, + "grad_norm": 0.6855596822222466, + "learning_rate": 8.994515539305303e-05, + "loss": 1.04, + "step": 1845 + }, + { + "epoch": 0.3937713310580205, + "grad_norm": 0.8693806924258595, + "learning_rate": 8.999390615478367e-05, + "loss": 1.0404, + "step": 1846 + }, + { + "epoch": 0.3939846416382253, + "grad_norm": 0.9314136623240001, + "learning_rate": 9.004265691651433e-05, + "loss": 1.0165, + "step": 1847 + }, + { + "epoch": 0.39419795221843, + "grad_norm": 0.6328276998160294, + "learning_rate": 9.009140767824498e-05, + "loss": 1.0513, + "step": 1848 + }, + { + "epoch": 0.3944112627986348, + "grad_norm": 0.43792092457023174, + "learning_rate": 9.014015843997563e-05, + "loss": 1.0482, + "step": 1849 + }, + { + "epoch": 0.3946245733788396, + "grad_norm": 0.5819854889895227, + "learning_rate": 9.01889092017063e-05, + "loss": 1.0022, + "step": 1850 + }, + { + "epoch": 0.39483788395904434, + "grad_norm": 0.6588217150372767, + "learning_rate": 9.023765996343694e-05, + "loss": 1.0437, + "step": 1851 + }, + { + "epoch": 0.39505119453924914, + "grad_norm": 0.5088764591789919, + "learning_rate": 9.028641072516758e-05, + "loss": 1.0281, + "step": 1852 + }, + { + "epoch": 0.39526450511945393, + "grad_norm": 0.584172006717163, + "learning_rate": 9.033516148689824e-05, + "loss": 1.0528, + "step": 1853 + }, + { + "epoch": 0.39547781569965873, + "grad_norm": 0.6604773502609104, + "learning_rate": 9.038391224862888e-05, + "loss": 1.0651, + "step": 1854 + }, + { + "epoch": 0.39569112627986347, + "grad_norm": 0.7603670497727234, + "learning_rate": 9.043266301035955e-05, + "loss": 1.0131, + "step": 1855 + }, + { + "epoch": 0.39590443686006827, + "grad_norm": 0.841734929714837, + "learning_rate": 9.04814137720902e-05, + "loss": 1.0356, + "step": 1856 + }, + { + "epoch": 0.39611774744027306, + "grad_norm": 0.8452057548302572, + "learning_rate": 9.053016453382084e-05, + "loss": 1.0545, + "step": 1857 + }, + { + "epoch": 0.3963310580204778, + "grad_norm": 0.7316589466895111, + "learning_rate": 9.057891529555151e-05, + "loss": 1.0119, + "step": 1858 + }, + { + "epoch": 0.3965443686006826, + "grad_norm": 0.6498584529362799, + "learning_rate": 9.062766605728215e-05, + "loss": 1.0078, + "step": 1859 + }, + { + "epoch": 0.3967576791808874, + "grad_norm": 0.6498842647657338, + "learning_rate": 9.067641681901281e-05, + "loss": 1.0094, + "step": 1860 + }, + { + "epoch": 0.39697098976109213, + "grad_norm": 0.6746415692540366, + "learning_rate": 9.072516758074345e-05, + "loss": 1.0426, + "step": 1861 + }, + { + "epoch": 0.3971843003412969, + "grad_norm": 0.6832258004334105, + "learning_rate": 9.077391834247411e-05, + "loss": 1.0223, + "step": 1862 + }, + { + "epoch": 0.3973976109215017, + "grad_norm": 0.6774048339478018, + "learning_rate": 9.082266910420476e-05, + "loss": 1.0445, + "step": 1863 + }, + { + "epoch": 0.39761092150170646, + "grad_norm": 0.6269658397482009, + "learning_rate": 9.087141986593541e-05, + "loss": 1.0165, + "step": 1864 + }, + { + "epoch": 0.39782423208191126, + "grad_norm": 0.5152814518009374, + "learning_rate": 9.092017062766607e-05, + "loss": 1.0252, + "step": 1865 + }, + { + "epoch": 0.39803754266211605, + "grad_norm": 0.4659369075283232, + "learning_rate": 9.096892138939672e-05, + "loss": 1.023, + "step": 1866 + }, + { + "epoch": 0.39825085324232085, + "grad_norm": 0.4108189332515803, + "learning_rate": 9.101767215112736e-05, + "loss": 1.0543, + "step": 1867 + }, + { + "epoch": 0.3984641638225256, + "grad_norm": 0.3508937743200614, + "learning_rate": 9.106642291285803e-05, + "loss": 1.0116, + "step": 1868 + }, + { + "epoch": 0.3986774744027304, + "grad_norm": 0.43223099994745084, + "learning_rate": 9.111517367458866e-05, + "loss": 1.0056, + "step": 1869 + }, + { + "epoch": 0.3988907849829352, + "grad_norm": 0.5040052096190774, + "learning_rate": 9.116392443631933e-05, + "loss": 1.0403, + "step": 1870 + }, + { + "epoch": 0.3991040955631399, + "grad_norm": 0.5034031326310602, + "learning_rate": 9.121267519804997e-05, + "loss": 1.027, + "step": 1871 + }, + { + "epoch": 0.3993174061433447, + "grad_norm": 0.6767058059970703, + "learning_rate": 9.126142595978062e-05, + "loss": 1.0337, + "step": 1872 + }, + { + "epoch": 0.3995307167235495, + "grad_norm": 0.9300455512846222, + "learning_rate": 9.131017672151128e-05, + "loss": 1.0518, + "step": 1873 + }, + { + "epoch": 0.39974402730375425, + "grad_norm": 1.1083686729298685, + "learning_rate": 9.135892748324193e-05, + "loss": 1.0017, + "step": 1874 + }, + { + "epoch": 0.39995733788395904, + "grad_norm": 0.9560486560410572, + "learning_rate": 9.14076782449726e-05, + "loss": 1.0293, + "step": 1875 + }, + { + "epoch": 0.40017064846416384, + "grad_norm": 0.9890350921781247, + "learning_rate": 9.145642900670324e-05, + "loss": 1.0108, + "step": 1876 + }, + { + "epoch": 0.4003839590443686, + "grad_norm": 1.133601493837578, + "learning_rate": 9.150517976843388e-05, + "loss": 0.9994, + "step": 1877 + }, + { + "epoch": 0.40059726962457337, + "grad_norm": 0.8196728705648614, + "learning_rate": 9.155393053016454e-05, + "loss": 1.0103, + "step": 1878 + }, + { + "epoch": 0.40081058020477817, + "grad_norm": 0.6430635089607769, + "learning_rate": 9.160268129189518e-05, + "loss": 1.0083, + "step": 1879 + }, + { + "epoch": 0.40102389078498296, + "grad_norm": 0.5709071442605776, + "learning_rate": 9.165143205362585e-05, + "loss": 1.0442, + "step": 1880 + }, + { + "epoch": 0.4012372013651877, + "grad_norm": 0.794595018866944, + "learning_rate": 9.17001828153565e-05, + "loss": 1.013, + "step": 1881 + }, + { + "epoch": 0.4014505119453925, + "grad_norm": 1.2322157907460376, + "learning_rate": 9.174893357708715e-05, + "loss": 1.02, + "step": 1882 + }, + { + "epoch": 0.4016638225255973, + "grad_norm": 0.7318258058236162, + "learning_rate": 9.179768433881781e-05, + "loss": 1.003, + "step": 1883 + }, + { + "epoch": 0.40187713310580203, + "grad_norm": 0.5686182367378404, + "learning_rate": 9.184643510054845e-05, + "loss": 0.9987, + "step": 1884 + }, + { + "epoch": 0.4020904436860068, + "grad_norm": 0.6939438013969326, + "learning_rate": 9.189518586227911e-05, + "loss": 1.0081, + "step": 1885 + }, + { + "epoch": 0.4023037542662116, + "grad_norm": 0.8277807325106277, + "learning_rate": 9.194393662400975e-05, + "loss": 1.0092, + "step": 1886 + }, + { + "epoch": 0.40251706484641636, + "grad_norm": 0.8029668067290163, + "learning_rate": 9.199268738574042e-05, + "loss": 0.9928, + "step": 1887 + }, + { + "epoch": 0.40273037542662116, + "grad_norm": 0.8133835982397161, + "learning_rate": 9.204143814747106e-05, + "loss": 1.0309, + "step": 1888 + }, + { + "epoch": 0.40294368600682595, + "grad_norm": 1.009885582514812, + "learning_rate": 9.209018890920171e-05, + "loss": 1.0423, + "step": 1889 + }, + { + "epoch": 0.4031569965870307, + "grad_norm": 1.1892222365365244, + "learning_rate": 9.213893967093236e-05, + "loss": 1.0084, + "step": 1890 + }, + { + "epoch": 0.4033703071672355, + "grad_norm": 0.716950737589623, + "learning_rate": 9.218769043266302e-05, + "loss": 1.0371, + "step": 1891 + }, + { + "epoch": 0.4035836177474403, + "grad_norm": 0.6536486847882056, + "learning_rate": 9.223644119439368e-05, + "loss": 1.0248, + "step": 1892 + }, + { + "epoch": 0.4037969283276451, + "grad_norm": 0.693601640219995, + "learning_rate": 9.228519195612432e-05, + "loss": 1.0206, + "step": 1893 + }, + { + "epoch": 0.4040102389078498, + "grad_norm": 0.8959371081283022, + "learning_rate": 9.233394271785496e-05, + "loss": 1.0277, + "step": 1894 + }, + { + "epoch": 0.4042235494880546, + "grad_norm": 1.120799308674885, + "learning_rate": 9.238269347958563e-05, + "loss": 1.0292, + "step": 1895 + }, + { + "epoch": 0.4044368600682594, + "grad_norm": 0.7619007707722141, + "learning_rate": 9.243144424131627e-05, + "loss": 1.0391, + "step": 1896 + }, + { + "epoch": 0.40465017064846415, + "grad_norm": 0.791223220948381, + "learning_rate": 9.248019500304694e-05, + "loss": 1.0096, + "step": 1897 + }, + { + "epoch": 0.40486348122866894, + "grad_norm": 0.7966196442250397, + "learning_rate": 9.252894576477759e-05, + "loss": 1.0097, + "step": 1898 + }, + { + "epoch": 0.40507679180887374, + "grad_norm": 0.7741522868710121, + "learning_rate": 9.257769652650823e-05, + "loss": 1.0377, + "step": 1899 + }, + { + "epoch": 0.4052901023890785, + "grad_norm": 0.6580247145627298, + "learning_rate": 9.26264472882389e-05, + "loss": 1.0146, + "step": 1900 + }, + { + "epoch": 0.4055034129692833, + "grad_norm": 0.5630986129891725, + "learning_rate": 9.267519804996953e-05, + "loss": 1.0, + "step": 1901 + }, + { + "epoch": 0.40571672354948807, + "grad_norm": 0.6540941055558859, + "learning_rate": 9.27239488117002e-05, + "loss": 1.0137, + "step": 1902 + }, + { + "epoch": 0.4059300341296928, + "grad_norm": 0.8131374170903172, + "learning_rate": 9.277269957343084e-05, + "loss": 1.0311, + "step": 1903 + }, + { + "epoch": 0.4061433447098976, + "grad_norm": 0.7255987022389719, + "learning_rate": 9.28214503351615e-05, + "loss": 1.0298, + "step": 1904 + }, + { + "epoch": 0.4063566552901024, + "grad_norm": 0.5916687974417055, + "learning_rate": 9.287020109689215e-05, + "loss": 1.04, + "step": 1905 + }, + { + "epoch": 0.4065699658703072, + "grad_norm": 0.6410131029352047, + "learning_rate": 9.29189518586228e-05, + "loss": 1.0355, + "step": 1906 + }, + { + "epoch": 0.40678327645051193, + "grad_norm": 0.761091350912574, + "learning_rate": 9.296770262035345e-05, + "loss": 1.0093, + "step": 1907 + }, + { + "epoch": 0.4069965870307167, + "grad_norm": 0.7847934468787976, + "learning_rate": 9.30164533820841e-05, + "loss": 1.0475, + "step": 1908 + }, + { + "epoch": 0.4072098976109215, + "grad_norm": 0.7980895480011939, + "learning_rate": 9.306520414381474e-05, + "loss": 1.0099, + "step": 1909 + }, + { + "epoch": 0.40742320819112626, + "grad_norm": 0.7721077147014677, + "learning_rate": 9.311395490554541e-05, + "loss": 1.0114, + "step": 1910 + }, + { + "epoch": 0.40763651877133106, + "grad_norm": 0.7688415025625399, + "learning_rate": 9.316270566727605e-05, + "loss": 1.0138, + "step": 1911 + }, + { + "epoch": 0.40784982935153585, + "grad_norm": 0.6362989692924146, + "learning_rate": 9.321145642900672e-05, + "loss": 1.0214, + "step": 1912 + }, + { + "epoch": 0.4080631399317406, + "grad_norm": 0.5854343642100444, + "learning_rate": 9.326020719073736e-05, + "loss": 1.0219, + "step": 1913 + }, + { + "epoch": 0.4082764505119454, + "grad_norm": 0.7662268056577394, + "learning_rate": 9.330895795246801e-05, + "loss": 1.0234, + "step": 1914 + }, + { + "epoch": 0.4084897610921502, + "grad_norm": 0.7205909727649229, + "learning_rate": 9.335770871419866e-05, + "loss": 1.0091, + "step": 1915 + }, + { + "epoch": 0.4087030716723549, + "grad_norm": 0.5232213683921745, + "learning_rate": 9.340645947592932e-05, + "loss": 1.0226, + "step": 1916 + }, + { + "epoch": 0.4089163822525597, + "grad_norm": 0.49809134722409715, + "learning_rate": 9.345521023765998e-05, + "loss": 0.9902, + "step": 1917 + }, + { + "epoch": 0.4091296928327645, + "grad_norm": 0.46305689372629877, + "learning_rate": 9.350396099939062e-05, + "loss": 1.0426, + "step": 1918 + }, + { + "epoch": 0.4093430034129693, + "grad_norm": 0.5778823524093002, + "learning_rate": 9.355271176112126e-05, + "loss": 0.9907, + "step": 1919 + }, + { + "epoch": 0.40955631399317405, + "grad_norm": 0.715501010231112, + "learning_rate": 9.360146252285193e-05, + "loss": 1.0125, + "step": 1920 + }, + { + "epoch": 0.40976962457337884, + "grad_norm": 0.7273629288491986, + "learning_rate": 9.365021328458257e-05, + "loss": 1.0197, + "step": 1921 + }, + { + "epoch": 0.40998293515358364, + "grad_norm": 0.7868105291209185, + "learning_rate": 9.369896404631323e-05, + "loss": 1.036, + "step": 1922 + }, + { + "epoch": 0.4101962457337884, + "grad_norm": 1.0102680898669247, + "learning_rate": 9.374771480804389e-05, + "loss": 1.0289, + "step": 1923 + }, + { + "epoch": 0.4104095563139932, + "grad_norm": 1.4277219954485125, + "learning_rate": 9.379646556977454e-05, + "loss": 1.0271, + "step": 1924 + }, + { + "epoch": 0.41062286689419797, + "grad_norm": 0.43373941875111566, + "learning_rate": 9.384521633150519e-05, + "loss": 0.9966, + "step": 1925 + }, + { + "epoch": 0.4108361774744027, + "grad_norm": 1.2461826034476549, + "learning_rate": 9.389396709323583e-05, + "loss": 1.0538, + "step": 1926 + }, + { + "epoch": 0.4110494880546075, + "grad_norm": 1.4315443394979217, + "learning_rate": 9.39427178549665e-05, + "loss": 1.0418, + "step": 1927 + }, + { + "epoch": 0.4112627986348123, + "grad_norm": 0.4682298648593617, + "learning_rate": 9.399146861669714e-05, + "loss": 1.002, + "step": 1928 + }, + { + "epoch": 0.41147610921501704, + "grad_norm": 1.3973565144317373, + "learning_rate": 9.40402193784278e-05, + "loss": 1.0164, + "step": 1929 + }, + { + "epoch": 0.41168941979522183, + "grad_norm": 0.9127633374105131, + "learning_rate": 9.408897014015844e-05, + "loss": 1.0612, + "step": 1930 + }, + { + "epoch": 0.41190273037542663, + "grad_norm": 0.9071833795019943, + "learning_rate": 9.41377209018891e-05, + "loss": 1.0186, + "step": 1931 + }, + { + "epoch": 0.4121160409556314, + "grad_norm": 0.9306106547627835, + "learning_rate": 9.418647166361975e-05, + "loss": 1.0196, + "step": 1932 + }, + { + "epoch": 0.41232935153583616, + "grad_norm": 0.8797574117886025, + "learning_rate": 9.42352224253504e-05, + "loss": 1.0144, + "step": 1933 + }, + { + "epoch": 0.41254266211604096, + "grad_norm": 1.016569310386437, + "learning_rate": 9.428397318708107e-05, + "loss": 1.0117, + "step": 1934 + }, + { + "epoch": 0.41275597269624575, + "grad_norm": 1.041329064097399, + "learning_rate": 9.433272394881171e-05, + "loss": 1.0108, + "step": 1935 + }, + { + "epoch": 0.4129692832764505, + "grad_norm": 0.8312320635438919, + "learning_rate": 9.438147471054235e-05, + "loss": 1.0255, + "step": 1936 + }, + { + "epoch": 0.4131825938566553, + "grad_norm": 0.7645423984245281, + "learning_rate": 9.443022547227302e-05, + "loss": 1.0227, + "step": 1937 + }, + { + "epoch": 0.4133959044368601, + "grad_norm": 0.6576322379877438, + "learning_rate": 9.447897623400366e-05, + "loss": 1.0072, + "step": 1938 + }, + { + "epoch": 0.4136092150170648, + "grad_norm": 0.5989009281990509, + "learning_rate": 9.452772699573432e-05, + "loss": 0.9994, + "step": 1939 + }, + { + "epoch": 0.4138225255972696, + "grad_norm": 0.5229286906565241, + "learning_rate": 9.457647775746497e-05, + "loss": 1.0283, + "step": 1940 + }, + { + "epoch": 0.4140358361774744, + "grad_norm": 0.516823997703823, + "learning_rate": 9.462522851919561e-05, + "loss": 1.0084, + "step": 1941 + }, + { + "epoch": 0.41424914675767915, + "grad_norm": 0.6297086141840122, + "learning_rate": 9.467397928092628e-05, + "loss": 1.0327, + "step": 1942 + }, + { + "epoch": 0.41446245733788395, + "grad_norm": 0.5268764496812992, + "learning_rate": 9.472273004265692e-05, + "loss": 1.0239, + "step": 1943 + }, + { + "epoch": 0.41467576791808874, + "grad_norm": 0.6208879487007432, + "learning_rate": 9.477148080438759e-05, + "loss": 1.0132, + "step": 1944 + }, + { + "epoch": 0.41488907849829354, + "grad_norm": 0.6980094003739453, + "learning_rate": 9.482023156611823e-05, + "loss": 1.0094, + "step": 1945 + }, + { + "epoch": 0.4151023890784983, + "grad_norm": 0.5090815018677859, + "learning_rate": 9.486898232784888e-05, + "loss": 1.0045, + "step": 1946 + }, + { + "epoch": 0.4153156996587031, + "grad_norm": 0.4817765271888028, + "learning_rate": 9.491773308957953e-05, + "loss": 1.0174, + "step": 1947 + }, + { + "epoch": 0.41552901023890787, + "grad_norm": 0.522837873796482, + "learning_rate": 9.496648385131019e-05, + "loss": 1.0008, + "step": 1948 + }, + { + "epoch": 0.4157423208191126, + "grad_norm": 0.44283840874925595, + "learning_rate": 9.501523461304084e-05, + "loss": 1.0264, + "step": 1949 + }, + { + "epoch": 0.4159556313993174, + "grad_norm": 0.37018237751093924, + "learning_rate": 9.506398537477149e-05, + "loss": 1.0078, + "step": 1950 + }, + { + "epoch": 0.4161689419795222, + "grad_norm": 0.40382523180446644, + "learning_rate": 9.511273613650213e-05, + "loss": 1.0258, + "step": 1951 + }, + { + "epoch": 0.41638225255972694, + "grad_norm": 0.48464127953043556, + "learning_rate": 9.51614868982328e-05, + "loss": 1.0282, + "step": 1952 + }, + { + "epoch": 0.41659556313993173, + "grad_norm": 0.5173110548125863, + "learning_rate": 9.521023765996344e-05, + "loss": 1.0099, + "step": 1953 + }, + { + "epoch": 0.41680887372013653, + "grad_norm": 0.5967051836907503, + "learning_rate": 9.52589884216941e-05, + "loss": 1.0233, + "step": 1954 + }, + { + "epoch": 0.41702218430034127, + "grad_norm": 0.6551199440494527, + "learning_rate": 9.530773918342474e-05, + "loss": 1.0113, + "step": 1955 + }, + { + "epoch": 0.41723549488054607, + "grad_norm": 0.652680189618763, + "learning_rate": 9.53564899451554e-05, + "loss": 1.0225, + "step": 1956 + }, + { + "epoch": 0.41744880546075086, + "grad_norm": 0.6542380380167566, + "learning_rate": 9.540524070688605e-05, + "loss": 1.0343, + "step": 1957 + }, + { + "epoch": 0.41766211604095566, + "grad_norm": 0.5836458045468309, + "learning_rate": 9.54539914686167e-05, + "loss": 1.0645, + "step": 1958 + }, + { + "epoch": 0.4178754266211604, + "grad_norm": 0.4983483221311835, + "learning_rate": 9.550274223034737e-05, + "loss": 1.0356, + "step": 1959 + }, + { + "epoch": 0.4180887372013652, + "grad_norm": 0.4695714115857038, + "learning_rate": 9.555149299207801e-05, + "loss": 1.0193, + "step": 1960 + }, + { + "epoch": 0.41830204778157, + "grad_norm": 0.49064733488280304, + "learning_rate": 9.560024375380865e-05, + "loss": 1.0096, + "step": 1961 + }, + { + "epoch": 0.4185153583617747, + "grad_norm": 0.5493895462288283, + "learning_rate": 9.564899451553931e-05, + "loss": 1.0283, + "step": 1962 + }, + { + "epoch": 0.4187286689419795, + "grad_norm": 0.682834677073182, + "learning_rate": 9.569774527726995e-05, + "loss": 1.0167, + "step": 1963 + }, + { + "epoch": 0.4189419795221843, + "grad_norm": 0.8330106185621954, + "learning_rate": 9.574649603900062e-05, + "loss": 1.0021, + "step": 1964 + }, + { + "epoch": 0.41915529010238906, + "grad_norm": 0.97616480706483, + "learning_rate": 9.579524680073127e-05, + "loss": 1.0393, + "step": 1965 + }, + { + "epoch": 0.41936860068259385, + "grad_norm": 1.0676924196972657, + "learning_rate": 9.584399756246193e-05, + "loss": 0.9947, + "step": 1966 + }, + { + "epoch": 0.41958191126279865, + "grad_norm": 0.9374986694544658, + "learning_rate": 9.589274832419258e-05, + "loss": 0.984, + "step": 1967 + }, + { + "epoch": 0.4197952218430034, + "grad_norm": 0.828887306675161, + "learning_rate": 9.594149908592322e-05, + "loss": 1.032, + "step": 1968 + }, + { + "epoch": 0.4200085324232082, + "grad_norm": 0.659676367628572, + "learning_rate": 9.599024984765389e-05, + "loss": 1.021, + "step": 1969 + }, + { + "epoch": 0.420221843003413, + "grad_norm": 0.5836308300383598, + "learning_rate": 9.603900060938452e-05, + "loss": 1.0509, + "step": 1970 + }, + { + "epoch": 0.42043515358361777, + "grad_norm": 0.7395906456620489, + "learning_rate": 9.608775137111519e-05, + "loss": 1.0254, + "step": 1971 + }, + { + "epoch": 0.4206484641638225, + "grad_norm": 1.1500634945849366, + "learning_rate": 9.613650213284583e-05, + "loss": 1.0208, + "step": 1972 + }, + { + "epoch": 0.4208617747440273, + "grad_norm": 1.0617856133708439, + "learning_rate": 9.618525289457648e-05, + "loss": 0.9809, + "step": 1973 + }, + { + "epoch": 0.4210750853242321, + "grad_norm": 0.7065744580346013, + "learning_rate": 9.623400365630714e-05, + "loss": 1.0304, + "step": 1974 + }, + { + "epoch": 0.42128839590443684, + "grad_norm": 0.45139975246557856, + "learning_rate": 9.628275441803779e-05, + "loss": 1.0106, + "step": 1975 + }, + { + "epoch": 0.42150170648464164, + "grad_norm": 0.42813161476563294, + "learning_rate": 9.633150517976844e-05, + "loss": 1.0457, + "step": 1976 + }, + { + "epoch": 0.42171501706484643, + "grad_norm": 0.6155695903737228, + "learning_rate": 9.63802559414991e-05, + "loss": 1.0277, + "step": 1977 + }, + { + "epoch": 0.42192832764505117, + "grad_norm": 0.8119421150651439, + "learning_rate": 9.642900670322974e-05, + "loss": 1.0188, + "step": 1978 + }, + { + "epoch": 0.42214163822525597, + "grad_norm": 1.0204978339310686, + "learning_rate": 9.64777574649604e-05, + "loss": 0.9878, + "step": 1979 + }, + { + "epoch": 0.42235494880546076, + "grad_norm": 1.2109380701629164, + "learning_rate": 9.652650822669104e-05, + "loss": 1.0187, + "step": 1980 + }, + { + "epoch": 0.42256825938566556, + "grad_norm": 0.6868688551087927, + "learning_rate": 9.657525898842171e-05, + "loss": 1.0273, + "step": 1981 + }, + { + "epoch": 0.4227815699658703, + "grad_norm": 0.5224853992439626, + "learning_rate": 9.662400975015236e-05, + "loss": 1.0438, + "step": 1982 + }, + { + "epoch": 0.4229948805460751, + "grad_norm": 0.9185328191588443, + "learning_rate": 9.6672760511883e-05, + "loss": 1.0288, + "step": 1983 + }, + { + "epoch": 0.4232081911262799, + "grad_norm": 1.4005045562137928, + "learning_rate": 9.672151127361367e-05, + "loss": 1.0094, + "step": 1984 + }, + { + "epoch": 0.4234215017064846, + "grad_norm": 0.48148913600832177, + "learning_rate": 9.67702620353443e-05, + "loss": 1.0317, + "step": 1985 + }, + { + "epoch": 0.4236348122866894, + "grad_norm": 0.7853702731828675, + "learning_rate": 9.681901279707497e-05, + "loss": 1.0017, + "step": 1986 + }, + { + "epoch": 0.4238481228668942, + "grad_norm": 1.3737899266536269, + "learning_rate": 9.686776355880561e-05, + "loss": 1.002, + "step": 1987 + }, + { + "epoch": 0.42406143344709896, + "grad_norm": 0.6662377086786239, + "learning_rate": 9.691651432053627e-05, + "loss": 1.0158, + "step": 1988 + }, + { + "epoch": 0.42427474402730375, + "grad_norm": 0.7263114356203582, + "learning_rate": 9.696526508226692e-05, + "loss": 1.0307, + "step": 1989 + }, + { + "epoch": 0.42448805460750855, + "grad_norm": 1.2730067143190011, + "learning_rate": 9.701401584399757e-05, + "loss": 1.0395, + "step": 1990 + }, + { + "epoch": 0.4247013651877133, + "grad_norm": 0.7683572921690186, + "learning_rate": 9.706276660572822e-05, + "loss": 1.0247, + "step": 1991 + }, + { + "epoch": 0.4249146757679181, + "grad_norm": 0.9009091708631158, + "learning_rate": 9.711151736745888e-05, + "loss": 1.0269, + "step": 1992 + }, + { + "epoch": 0.4251279863481229, + "grad_norm": 1.042121872975418, + "learning_rate": 9.716026812918952e-05, + "loss": 0.9993, + "step": 1993 + }, + { + "epoch": 0.4253412969283277, + "grad_norm": 0.8024302530357343, + "learning_rate": 9.720901889092018e-05, + "loss": 1.0046, + "step": 1994 + }, + { + "epoch": 0.4255546075085324, + "grad_norm": 0.7729909894086057, + "learning_rate": 9.725776965265082e-05, + "loss": 0.9897, + "step": 1995 + }, + { + "epoch": 0.4257679180887372, + "grad_norm": 0.8793119058036046, + "learning_rate": 9.730652041438149e-05, + "loss": 1.0328, + "step": 1996 + }, + { + "epoch": 0.425981228668942, + "grad_norm": 0.736490929250099, + "learning_rate": 9.735527117611213e-05, + "loss": 1.0192, + "step": 1997 + }, + { + "epoch": 0.42619453924914674, + "grad_norm": 0.6400809126506714, + "learning_rate": 9.740402193784278e-05, + "loss": 1.0061, + "step": 1998 + }, + { + "epoch": 0.42640784982935154, + "grad_norm": 0.5824346828789303, + "learning_rate": 9.745277269957344e-05, + "loss": 1.0043, + "step": 1999 + }, + { + "epoch": 0.42662116040955633, + "grad_norm": 0.49616654314214526, + "learning_rate": 9.750152346130409e-05, + "loss": 1.0237, + "step": 2000 + }, + { + "epoch": 0.4268344709897611, + "grad_norm": 0.6053362372477596, + "learning_rate": 9.755027422303475e-05, + "loss": 1.0077, + "step": 2001 + }, + { + "epoch": 0.42704778156996587, + "grad_norm": 0.6668840179920098, + "learning_rate": 9.75990249847654e-05, + "loss": 1.0049, + "step": 2002 + }, + { + "epoch": 0.42726109215017066, + "grad_norm": 0.6839668555926645, + "learning_rate": 9.764777574649603e-05, + "loss": 1.0445, + "step": 2003 + }, + { + "epoch": 0.4274744027303754, + "grad_norm": 0.6885690881762679, + "learning_rate": 9.76965265082267e-05, + "loss": 1.0352, + "step": 2004 + }, + { + "epoch": 0.4276877133105802, + "grad_norm": 0.6011315414828192, + "learning_rate": 9.774527726995734e-05, + "loss": 1.0437, + "step": 2005 + }, + { + "epoch": 0.427901023890785, + "grad_norm": 0.7551217700402596, + "learning_rate": 9.7794028031688e-05, + "loss": 1.0153, + "step": 2006 + }, + { + "epoch": 0.4281143344709898, + "grad_norm": 0.7817852252875769, + "learning_rate": 9.784277879341866e-05, + "loss": 1.0191, + "step": 2007 + }, + { + "epoch": 0.4283276450511945, + "grad_norm": 0.6655380541839127, + "learning_rate": 9.78915295551493e-05, + "loss": 0.9862, + "step": 2008 + }, + { + "epoch": 0.4285409556313993, + "grad_norm": 0.6089869417787005, + "learning_rate": 9.794028031687997e-05, + "loss": 1.0145, + "step": 2009 + }, + { + "epoch": 0.4287542662116041, + "grad_norm": 0.5629942422499751, + "learning_rate": 9.79890310786106e-05, + "loss": 1.0524, + "step": 2010 + }, + { + "epoch": 0.42896757679180886, + "grad_norm": 0.5448078108914606, + "learning_rate": 9.803778184034127e-05, + "loss": 1.0175, + "step": 2011 + }, + { + "epoch": 0.42918088737201365, + "grad_norm": 0.5816739973040348, + "learning_rate": 9.808653260207191e-05, + "loss": 0.9995, + "step": 2012 + }, + { + "epoch": 0.42939419795221845, + "grad_norm": 0.6281769849206031, + "learning_rate": 9.813528336380258e-05, + "loss": 1.0171, + "step": 2013 + }, + { + "epoch": 0.4296075085324232, + "grad_norm": 0.7296560161517119, + "learning_rate": 9.818403412553322e-05, + "loss": 1.0152, + "step": 2014 + }, + { + "epoch": 0.429820819112628, + "grad_norm": 0.719916948753768, + "learning_rate": 9.823278488726387e-05, + "loss": 1.0418, + "step": 2015 + }, + { + "epoch": 0.4300341296928328, + "grad_norm": 0.7754032273368788, + "learning_rate": 9.828153564899452e-05, + "loss": 1.0565, + "step": 2016 + }, + { + "epoch": 0.4302474402730375, + "grad_norm": 0.7696242932489418, + "learning_rate": 9.833028641072518e-05, + "loss": 1.0207, + "step": 2017 + }, + { + "epoch": 0.4304607508532423, + "grad_norm": 0.7326525660701556, + "learning_rate": 9.837903717245583e-05, + "loss": 1.0245, + "step": 2018 + }, + { + "epoch": 0.4306740614334471, + "grad_norm": 0.7238743363102695, + "learning_rate": 9.842778793418648e-05, + "loss": 1.036, + "step": 2019 + }, + { + "epoch": 0.4308873720136519, + "grad_norm": 0.7673597355221832, + "learning_rate": 9.847653869591712e-05, + "loss": 1.0053, + "step": 2020 + }, + { + "epoch": 0.43110068259385664, + "grad_norm": 0.9492287717824249, + "learning_rate": 9.852528945764779e-05, + "loss": 0.9997, + "step": 2021 + }, + { + "epoch": 0.43131399317406144, + "grad_norm": 0.8217505747267403, + "learning_rate": 9.857404021937843e-05, + "loss": 1.0346, + "step": 2022 + }, + { + "epoch": 0.43152730375426623, + "grad_norm": 0.8981063295321833, + "learning_rate": 9.86227909811091e-05, + "loss": 1.0318, + "step": 2023 + }, + { + "epoch": 0.431740614334471, + "grad_norm": 0.9249814310264285, + "learning_rate": 9.867154174283973e-05, + "loss": 1.048, + "step": 2024 + }, + { + "epoch": 0.43195392491467577, + "grad_norm": 0.9298180374669365, + "learning_rate": 9.872029250457039e-05, + "loss": 0.9925, + "step": 2025 + }, + { + "epoch": 0.43216723549488056, + "grad_norm": 0.8900506838446521, + "learning_rate": 9.876904326630105e-05, + "loss": 0.9687, + "step": 2026 + }, + { + "epoch": 0.4323805460750853, + "grad_norm": 0.8593379219112424, + "learning_rate": 9.881779402803169e-05, + "loss": 1.0254, + "step": 2027 + }, + { + "epoch": 0.4325938566552901, + "grad_norm": 0.9462764519609992, + "learning_rate": 9.886654478976236e-05, + "loss": 1.0194, + "step": 2028 + }, + { + "epoch": 0.4328071672354949, + "grad_norm": 0.9852460611199867, + "learning_rate": 9.8915295551493e-05, + "loss": 1.0045, + "step": 2029 + }, + { + "epoch": 0.43302047781569963, + "grad_norm": 1.0920373197439677, + "learning_rate": 9.896404631322365e-05, + "loss": 1.0252, + "step": 2030 + }, + { + "epoch": 0.43323378839590443, + "grad_norm": 0.7816022721773037, + "learning_rate": 9.90127970749543e-05, + "loss": 1.0029, + "step": 2031 + }, + { + "epoch": 0.4334470989761092, + "grad_norm": 0.4733175857172978, + "learning_rate": 9.906154783668496e-05, + "loss": 1.008, + "step": 2032 + }, + { + "epoch": 0.433660409556314, + "grad_norm": 0.5611002224462267, + "learning_rate": 9.911029859841561e-05, + "loss": 1.0432, + "step": 2033 + }, + { + "epoch": 0.43387372013651876, + "grad_norm": 0.795775630167519, + "learning_rate": 9.915904936014626e-05, + "loss": 0.9776, + "step": 2034 + }, + { + "epoch": 0.43408703071672355, + "grad_norm": 0.9244668553928306, + "learning_rate": 9.92078001218769e-05, + "loss": 1.0126, + "step": 2035 + }, + { + "epoch": 0.43430034129692835, + "grad_norm": 0.8770090277961679, + "learning_rate": 9.925655088360757e-05, + "loss": 0.9916, + "step": 2036 + }, + { + "epoch": 0.4345136518771331, + "grad_norm": 0.8722667314314926, + "learning_rate": 9.930530164533821e-05, + "loss": 1.0154, + "step": 2037 + }, + { + "epoch": 0.4347269624573379, + "grad_norm": 0.8060593345731109, + "learning_rate": 9.935405240706888e-05, + "loss": 1.0096, + "step": 2038 + }, + { + "epoch": 0.4349402730375427, + "grad_norm": 0.6852077790554639, + "learning_rate": 9.940280316879952e-05, + "loss": 1.0528, + "step": 2039 + }, + { + "epoch": 0.4351535836177474, + "grad_norm": 0.5172803186414073, + "learning_rate": 9.945155393053017e-05, + "loss": 1.0257, + "step": 2040 + }, + { + "epoch": 0.4353668941979522, + "grad_norm": 0.41677982547140763, + "learning_rate": 9.950030469226082e-05, + "loss": 1.0003, + "step": 2041 + }, + { + "epoch": 0.435580204778157, + "grad_norm": 0.4054979152433958, + "learning_rate": 9.954905545399147e-05, + "loss": 1.0307, + "step": 2042 + }, + { + "epoch": 0.43579351535836175, + "grad_norm": 0.3429225899152693, + "learning_rate": 9.959780621572214e-05, + "loss": 1.0116, + "step": 2043 + }, + { + "epoch": 0.43600682593856654, + "grad_norm": 0.3831851245932244, + "learning_rate": 9.964655697745278e-05, + "loss": 1.0228, + "step": 2044 + }, + { + "epoch": 0.43622013651877134, + "grad_norm": 0.35466218290466767, + "learning_rate": 9.969530773918342e-05, + "loss": 0.9971, + "step": 2045 + }, + { + "epoch": 0.43643344709897613, + "grad_norm": 0.3847171753359935, + "learning_rate": 9.974405850091409e-05, + "loss": 1.0111, + "step": 2046 + }, + { + "epoch": 0.4366467576791809, + "grad_norm": 0.4259595277716597, + "learning_rate": 9.979280926264473e-05, + "loss": 1.0288, + "step": 2047 + }, + { + "epoch": 0.43686006825938567, + "grad_norm": 0.5537701143287517, + "learning_rate": 9.984156002437539e-05, + "loss": 0.9899, + "step": 2048 + }, + { + "epoch": 0.43707337883959047, + "grad_norm": 0.6227192827582377, + "learning_rate": 9.989031078610605e-05, + "loss": 0.9859, + "step": 2049 + }, + { + "epoch": 0.4372866894197952, + "grad_norm": 0.6656484816612129, + "learning_rate": 9.993906154783669e-05, + "loss": 1.0319, + "step": 2050 + }, + { + "epoch": 0.4375, + "grad_norm": 0.7255006438327907, + "learning_rate": 9.998781230956735e-05, + "loss": 1.0028, + "step": 2051 + }, + { + "epoch": 0.4377133105802048, + "grad_norm": 0.7596548541446398, + "learning_rate": 0.00010003656307129799, + "loss": 0.9869, + "step": 2052 + }, + { + "epoch": 0.43792662116040953, + "grad_norm": 0.659623679304774, + "learning_rate": 0.00010008531383302866, + "loss": 1.0568, + "step": 2053 + }, + { + "epoch": 0.43813993174061433, + "grad_norm": 0.5266834357396463, + "learning_rate": 0.0001001340645947593, + "loss": 1.0452, + "step": 2054 + }, + { + "epoch": 0.4383532423208191, + "grad_norm": 0.5304750105983187, + "learning_rate": 0.00010018281535648996, + "loss": 0.9959, + "step": 2055 + }, + { + "epoch": 0.43856655290102387, + "grad_norm": 0.5734408792496155, + "learning_rate": 0.0001002315661182206, + "loss": 1.0214, + "step": 2056 + }, + { + "epoch": 0.43877986348122866, + "grad_norm": 0.7795387578776872, + "learning_rate": 0.00010028031687995126, + "loss": 1.0125, + "step": 2057 + }, + { + "epoch": 0.43899317406143346, + "grad_norm": 1.0065976078761623, + "learning_rate": 0.00010032906764168191, + "loss": 1.0237, + "step": 2058 + }, + { + "epoch": 0.43920648464163825, + "grad_norm": 1.0419386880384165, + "learning_rate": 0.00010037781840341256, + "loss": 1.0197, + "step": 2059 + }, + { + "epoch": 0.439419795221843, + "grad_norm": 0.8173762911713182, + "learning_rate": 0.00010042656916514322, + "loss": 1.0359, + "step": 2060 + }, + { + "epoch": 0.4396331058020478, + "grad_norm": 0.6228804550007214, + "learning_rate": 0.00010047531992687387, + "loss": 0.9904, + "step": 2061 + }, + { + "epoch": 0.4398464163822526, + "grad_norm": 0.5351808599301162, + "learning_rate": 0.00010052407068860451, + "loss": 0.9962, + "step": 2062 + }, + { + "epoch": 0.4400597269624573, + "grad_norm": 0.6134162891714252, + "learning_rate": 0.00010057282145033517, + "loss": 1.0188, + "step": 2063 + }, + { + "epoch": 0.4402730375426621, + "grad_norm": 0.5343602263053431, + "learning_rate": 0.00010062157221206581, + "loss": 1.0149, + "step": 2064 + }, + { + "epoch": 0.4404863481228669, + "grad_norm": 0.4091247033752717, + "learning_rate": 0.00010067032297379648, + "loss": 1.0114, + "step": 2065 + }, + { + "epoch": 0.44069965870307165, + "grad_norm": 0.4151765740844952, + "learning_rate": 0.00010071907373552712, + "loss": 1.0416, + "step": 2066 + }, + { + "epoch": 0.44091296928327645, + "grad_norm": 0.450349212926554, + "learning_rate": 0.00010076782449725777, + "loss": 0.997, + "step": 2067 + }, + { + "epoch": 0.44112627986348124, + "grad_norm": 0.48288607219120167, + "learning_rate": 0.00010081657525898844, + "loss": 1.0392, + "step": 2068 + }, + { + "epoch": 0.441339590443686, + "grad_norm": 0.4642732965996954, + "learning_rate": 0.00010086532602071908, + "loss": 0.9886, + "step": 2069 + }, + { + "epoch": 0.4415529010238908, + "grad_norm": 0.459009143016274, + "learning_rate": 0.00010091407678244975, + "loss": 0.9884, + "step": 2070 + }, + { + "epoch": 0.44176621160409557, + "grad_norm": 0.6008955190641985, + "learning_rate": 0.00010096282754418038, + "loss": 0.9729, + "step": 2071 + }, + { + "epoch": 0.44197952218430037, + "grad_norm": 0.7949438945098442, + "learning_rate": 0.00010101157830591102, + "loss": 0.9928, + "step": 2072 + }, + { + "epoch": 0.4421928327645051, + "grad_norm": 0.956163692913499, + "learning_rate": 0.00010106032906764169, + "loss": 1.0181, + "step": 2073 + }, + { + "epoch": 0.4424061433447099, + "grad_norm": 1.1304720079908606, + "learning_rate": 0.00010110907982937234, + "loss": 1.0239, + "step": 2074 + }, + { + "epoch": 0.4426194539249147, + "grad_norm": 0.8316605819108629, + "learning_rate": 0.000101157830591103, + "loss": 1.0259, + "step": 2075 + }, + { + "epoch": 0.44283276450511944, + "grad_norm": 0.6829761827260576, + "learning_rate": 0.00010120658135283365, + "loss": 1.0609, + "step": 2076 + }, + { + "epoch": 0.44304607508532423, + "grad_norm": 0.5969858285979501, + "learning_rate": 0.00010125533211456429, + "loss": 1.0049, + "step": 2077 + }, + { + "epoch": 0.443259385665529, + "grad_norm": 0.5474562073891063, + "learning_rate": 0.00010130408287629496, + "loss": 1.0188, + "step": 2078 + }, + { + "epoch": 0.44347269624573377, + "grad_norm": 0.677976675613919, + "learning_rate": 0.0001013528336380256, + "loss": 0.9958, + "step": 2079 + }, + { + "epoch": 0.44368600682593856, + "grad_norm": 0.8793805869325027, + "learning_rate": 0.00010140158439975626, + "loss": 0.9984, + "step": 2080 + }, + { + "epoch": 0.44389931740614336, + "grad_norm": 0.9623247909029201, + "learning_rate": 0.0001014503351614869, + "loss": 1.0106, + "step": 2081 + }, + { + "epoch": 0.4441126279863481, + "grad_norm": 1.146718159426942, + "learning_rate": 0.00010149908592321755, + "loss": 1.0235, + "step": 2082 + }, + { + "epoch": 0.4443259385665529, + "grad_norm": 1.1070956809981276, + "learning_rate": 0.00010154783668494821, + "loss": 1.0262, + "step": 2083 + }, + { + "epoch": 0.4445392491467577, + "grad_norm": 1.098325402554301, + "learning_rate": 0.00010159658744667886, + "loss": 1.0173, + "step": 2084 + }, + { + "epoch": 0.4447525597269625, + "grad_norm": 0.7836381062374193, + "learning_rate": 0.00010164533820840953, + "loss": 1.0449, + "step": 2085 + }, + { + "epoch": 0.4449658703071672, + "grad_norm": 0.4464671065200356, + "learning_rate": 0.00010169408897014017, + "loss": 1.032, + "step": 2086 + }, + { + "epoch": 0.445179180887372, + "grad_norm": 0.7216407491793547, + "learning_rate": 0.0001017428397318708, + "loss": 1.0159, + "step": 2087 + }, + { + "epoch": 0.4453924914675768, + "grad_norm": 0.8796299527854269, + "learning_rate": 0.00010179159049360147, + "loss": 1.0181, + "step": 2088 + }, + { + "epoch": 0.44560580204778155, + "grad_norm": 0.9490302333231664, + "learning_rate": 0.00010184034125533211, + "loss": 1.003, + "step": 2089 + }, + { + "epoch": 0.44581911262798635, + "grad_norm": 0.9420181327985564, + "learning_rate": 0.00010188909201706278, + "loss": 0.9868, + "step": 2090 + }, + { + "epoch": 0.44603242320819114, + "grad_norm": 1.0255394144573184, + "learning_rate": 0.00010193784277879343, + "loss": 1.0208, + "step": 2091 + }, + { + "epoch": 0.4462457337883959, + "grad_norm": 0.9541712595272506, + "learning_rate": 0.00010198659354052407, + "loss": 1.0198, + "step": 2092 + }, + { + "epoch": 0.4464590443686007, + "grad_norm": 0.8889990743583598, + "learning_rate": 0.00010203534430225474, + "loss": 1.0036, + "step": 2093 + }, + { + "epoch": 0.4466723549488055, + "grad_norm": 0.8093359918877147, + "learning_rate": 0.00010208409506398538, + "loss": 1.0269, + "step": 2094 + }, + { + "epoch": 0.4468856655290102, + "grad_norm": 0.8676804268889493, + "learning_rate": 0.00010213284582571604, + "loss": 1.0025, + "step": 2095 + }, + { + "epoch": 0.447098976109215, + "grad_norm": 0.9042195426811775, + "learning_rate": 0.00010218159658744668, + "loss": 1.0089, + "step": 2096 + }, + { + "epoch": 0.4473122866894198, + "grad_norm": 0.8773239055725401, + "learning_rate": 0.00010223034734917735, + "loss": 0.999, + "step": 2097 + }, + { + "epoch": 0.4475255972696246, + "grad_norm": 0.8802393094544402, + "learning_rate": 0.00010227909811090799, + "loss": 1.0242, + "step": 2098 + }, + { + "epoch": 0.44773890784982934, + "grad_norm": 0.9138631582402136, + "learning_rate": 0.00010232784887263864, + "loss": 1.0565, + "step": 2099 + }, + { + "epoch": 0.44795221843003413, + "grad_norm": 0.9518844805936808, + "learning_rate": 0.0001023765996343693, + "loss": 1.0225, + "step": 2100 + }, + { + "epoch": 0.4481655290102389, + "grad_norm": 0.8887400122332322, + "learning_rate": 0.00010242535039609995, + "loss": 0.9766, + "step": 2101 + }, + { + "epoch": 0.44837883959044367, + "grad_norm": 0.782102988922902, + "learning_rate": 0.0001024741011578306, + "loss": 1.0435, + "step": 2102 + }, + { + "epoch": 0.44859215017064846, + "grad_norm": 0.7113195592651094, + "learning_rate": 0.00010252285191956125, + "loss": 1.0035, + "step": 2103 + }, + { + "epoch": 0.44880546075085326, + "grad_norm": 0.5694257006289424, + "learning_rate": 0.0001025716026812919, + "loss": 1.0062, + "step": 2104 + }, + { + "epoch": 0.449018771331058, + "grad_norm": 0.5239678315230168, + "learning_rate": 0.00010262035344302256, + "loss": 1.0224, + "step": 2105 + }, + { + "epoch": 0.4492320819112628, + "grad_norm": 0.4812953211626129, + "learning_rate": 0.0001026691042047532, + "loss": 1.0053, + "step": 2106 + }, + { + "epoch": 0.4494453924914676, + "grad_norm": 0.4235289481744513, + "learning_rate": 0.00010271785496648387, + "loss": 1.0481, + "step": 2107 + }, + { + "epoch": 0.4496587030716723, + "grad_norm": 0.3637386030514604, + "learning_rate": 0.0001027666057282145, + "loss": 0.9755, + "step": 2108 + }, + { + "epoch": 0.4498720136518771, + "grad_norm": 0.42324586463866487, + "learning_rate": 0.00010281535648994516, + "loss": 1.0177, + "step": 2109 + }, + { + "epoch": 0.4500853242320819, + "grad_norm": 0.508570170424452, + "learning_rate": 0.00010286410725167583, + "loss": 1.0571, + "step": 2110 + }, + { + "epoch": 0.4502986348122867, + "grad_norm": 0.5955111953572616, + "learning_rate": 0.00010291285801340647, + "loss": 0.9912, + "step": 2111 + }, + { + "epoch": 0.45051194539249145, + "grad_norm": 0.5946601511891616, + "learning_rate": 0.00010296160877513713, + "loss": 0.9532, + "step": 2112 + }, + { + "epoch": 0.45072525597269625, + "grad_norm": 0.5911972034266786, + "learning_rate": 0.00010301035953686777, + "loss": 0.9887, + "step": 2113 + }, + { + "epoch": 0.45093856655290104, + "grad_norm": 0.6601923521715681, + "learning_rate": 0.00010305911029859841, + "loss": 1.0659, + "step": 2114 + }, + { + "epoch": 0.4511518771331058, + "grad_norm": 0.7024020570916516, + "learning_rate": 0.00010310786106032908, + "loss": 1.022, + "step": 2115 + }, + { + "epoch": 0.4513651877133106, + "grad_norm": 0.7315938387091868, + "learning_rate": 0.00010315661182205973, + "loss": 1.0102, + "step": 2116 + }, + { + "epoch": 0.4515784982935154, + "grad_norm": 0.7728334116901555, + "learning_rate": 0.00010320536258379038, + "loss": 1.0354, + "step": 2117 + }, + { + "epoch": 0.4517918088737201, + "grad_norm": 0.8090931213281504, + "learning_rate": 0.00010325411334552104, + "loss": 0.9822, + "step": 2118 + }, + { + "epoch": 0.4520051194539249, + "grad_norm": 0.8354700347992265, + "learning_rate": 0.00010330286410725168, + "loss": 0.9986, + "step": 2119 + }, + { + "epoch": 0.4522184300341297, + "grad_norm": 0.8364818644025053, + "learning_rate": 0.00010335161486898234, + "loss": 1.0005, + "step": 2120 + }, + { + "epoch": 0.45243174061433444, + "grad_norm": 0.7794620862141317, + "learning_rate": 0.00010340036563071298, + "loss": 1.0069, + "step": 2121 + }, + { + "epoch": 0.45264505119453924, + "grad_norm": 0.7900130473230581, + "learning_rate": 0.00010344911639244365, + "loss": 1.0117, + "step": 2122 + }, + { + "epoch": 0.45285836177474403, + "grad_norm": 0.7581650267620442, + "learning_rate": 0.00010349786715417429, + "loss": 0.9853, + "step": 2123 + }, + { + "epoch": 0.45307167235494883, + "grad_norm": 0.7376814296837485, + "learning_rate": 0.00010354661791590494, + "loss": 1.041, + "step": 2124 + }, + { + "epoch": 0.45328498293515357, + "grad_norm": 0.6982745797630298, + "learning_rate": 0.0001035953686776356, + "loss": 1.05, + "step": 2125 + }, + { + "epoch": 0.45349829351535836, + "grad_norm": 0.6884802443516873, + "learning_rate": 0.00010364411943936625, + "loss": 0.9851, + "step": 2126 + }, + { + "epoch": 0.45371160409556316, + "grad_norm": 0.7050051712129464, + "learning_rate": 0.00010369287020109691, + "loss": 1.0363, + "step": 2127 + }, + { + "epoch": 0.4539249146757679, + "grad_norm": 0.7995678011384795, + "learning_rate": 0.00010374162096282755, + "loss": 0.9939, + "step": 2128 + }, + { + "epoch": 0.4541382252559727, + "grad_norm": 0.8205236792466956, + "learning_rate": 0.00010379037172455819, + "loss": 1.0313, + "step": 2129 + }, + { + "epoch": 0.4543515358361775, + "grad_norm": 0.9301522115975518, + "learning_rate": 0.00010383912248628886, + "loss": 1.008, + "step": 2130 + }, + { + "epoch": 0.45456484641638223, + "grad_norm": 0.9968288512752063, + "learning_rate": 0.0001038878732480195, + "loss": 0.9882, + "step": 2131 + }, + { + "epoch": 0.454778156996587, + "grad_norm": 0.9627183789572158, + "learning_rate": 0.00010393662400975017, + "loss": 1.0111, + "step": 2132 + }, + { + "epoch": 0.4549914675767918, + "grad_norm": 0.8890226503802261, + "learning_rate": 0.00010398537477148082, + "loss": 1.0051, + "step": 2133 + }, + { + "epoch": 0.4552047781569966, + "grad_norm": 0.823900407261687, + "learning_rate": 0.00010403412553321146, + "loss": 1.0023, + "step": 2134 + }, + { + "epoch": 0.45541808873720135, + "grad_norm": 0.8665914162062793, + "learning_rate": 0.00010408287629494212, + "loss": 1.0392, + "step": 2135 + }, + { + "epoch": 0.45563139931740615, + "grad_norm": 0.9273378316461006, + "learning_rate": 0.00010413162705667276, + "loss": 0.9893, + "step": 2136 + }, + { + "epoch": 0.45584470989761094, + "grad_norm": 0.8237584908564607, + "learning_rate": 0.00010418037781840343, + "loss": 0.9723, + "step": 2137 + }, + { + "epoch": 0.4560580204778157, + "grad_norm": 0.6466671573687943, + "learning_rate": 0.00010422912858013407, + "loss": 1.0162, + "step": 2138 + }, + { + "epoch": 0.4562713310580205, + "grad_norm": 0.5789553617072088, + "learning_rate": 0.00010427787934186474, + "loss": 0.9863, + "step": 2139 + }, + { + "epoch": 0.4564846416382253, + "grad_norm": 0.7286167086087649, + "learning_rate": 0.00010432663010359538, + "loss": 1.0161, + "step": 2140 + }, + { + "epoch": 0.45669795221843, + "grad_norm": 0.8181246711206565, + "learning_rate": 0.00010437538086532603, + "loss": 1.0264, + "step": 2141 + }, + { + "epoch": 0.4569112627986348, + "grad_norm": 0.9454318523845036, + "learning_rate": 0.00010442413162705668, + "loss": 1.0346, + "step": 2142 + }, + { + "epoch": 0.4571245733788396, + "grad_norm": 0.9215784221945671, + "learning_rate": 0.00010447288238878733, + "loss": 1.0074, + "step": 2143 + }, + { + "epoch": 0.45733788395904434, + "grad_norm": 0.7772059533582331, + "learning_rate": 0.00010452163315051799, + "loss": 1.0164, + "step": 2144 + }, + { + "epoch": 0.45755119453924914, + "grad_norm": 0.6350566528561687, + "learning_rate": 0.00010457038391224864, + "loss": 0.9952, + "step": 2145 + }, + { + "epoch": 0.45776450511945393, + "grad_norm": 0.43060594890067566, + "learning_rate": 0.00010461913467397928, + "loss": 1.0053, + "step": 2146 + }, + { + "epoch": 0.45797781569965873, + "grad_norm": 0.5363880360058959, + "learning_rate": 0.00010466788543570995, + "loss": 1.0114, + "step": 2147 + }, + { + "epoch": 0.45819112627986347, + "grad_norm": 0.6979176236444571, + "learning_rate": 0.00010471663619744059, + "loss": 1.019, + "step": 2148 + }, + { + "epoch": 0.45840443686006827, + "grad_norm": 0.6779456536240408, + "learning_rate": 0.00010476538695917125, + "loss": 1.0265, + "step": 2149 + }, + { + "epoch": 0.45861774744027306, + "grad_norm": 0.708323900057343, + "learning_rate": 0.00010481413772090189, + "loss": 0.9935, + "step": 2150 + }, + { + "epoch": 0.4588310580204778, + "grad_norm": 0.7129199236077229, + "learning_rate": 0.00010486288848263255, + "loss": 1.0367, + "step": 2151 + }, + { + "epoch": 0.4590443686006826, + "grad_norm": 0.8207092280446754, + "learning_rate": 0.00010491163924436321, + "loss": 0.9957, + "step": 2152 + }, + { + "epoch": 0.4592576791808874, + "grad_norm": 0.8107482691820125, + "learning_rate": 0.00010496039000609385, + "loss": 1.0088, + "step": 2153 + }, + { + "epoch": 0.45947098976109213, + "grad_norm": 0.7788234575392707, + "learning_rate": 0.00010500914076782452, + "loss": 0.9998, + "step": 2154 + }, + { + "epoch": 0.4596843003412969, + "grad_norm": 0.6961756376339153, + "learning_rate": 0.00010505789152955516, + "loss": 0.9733, + "step": 2155 + }, + { + "epoch": 0.4598976109215017, + "grad_norm": 0.4971543396467815, + "learning_rate": 0.0001051066422912858, + "loss": 1.0095, + "step": 2156 + }, + { + "epoch": 0.46011092150170646, + "grad_norm": 0.45136493292658375, + "learning_rate": 0.00010515539305301646, + "loss": 1.0345, + "step": 2157 + }, + { + "epoch": 0.46032423208191126, + "grad_norm": 0.5294606755979336, + "learning_rate": 0.00010520414381474712, + "loss": 0.9962, + "step": 2158 + }, + { + "epoch": 0.46053754266211605, + "grad_norm": 0.6600866798672765, + "learning_rate": 0.00010525289457647777, + "loss": 1.0354, + "step": 2159 + }, + { + "epoch": 0.46075085324232085, + "grad_norm": 0.8418130220461942, + "learning_rate": 0.00010530164533820842, + "loss": 0.9681, + "step": 2160 + }, + { + "epoch": 0.4609641638225256, + "grad_norm": 1.0414114738652023, + "learning_rate": 0.00010535039609993906, + "loss": 0.996, + "step": 2161 + }, + { + "epoch": 0.4611774744027304, + "grad_norm": 0.9430283603527447, + "learning_rate": 0.00010539914686166973, + "loss": 0.9995, + "step": 2162 + }, + { + "epoch": 0.4613907849829352, + "grad_norm": 0.8656855297676566, + "learning_rate": 0.00010544789762340037, + "loss": 1.0186, + "step": 2163 + }, + { + "epoch": 0.4616040955631399, + "grad_norm": 0.7827560769540118, + "learning_rate": 0.00010549664838513103, + "loss": 0.9919, + "step": 2164 + }, + { + "epoch": 0.4618174061433447, + "grad_norm": 0.6066142056753218, + "learning_rate": 0.00010554539914686167, + "loss": 1.0047, + "step": 2165 + }, + { + "epoch": 0.4620307167235495, + "grad_norm": 0.4671612125396391, + "learning_rate": 0.00010559414990859233, + "loss": 1.0245, + "step": 2166 + }, + { + "epoch": 0.46224402730375425, + "grad_norm": 0.44397698837137045, + "learning_rate": 0.00010564290067032298, + "loss": 1.0016, + "step": 2167 + }, + { + "epoch": 0.46245733788395904, + "grad_norm": 0.4563166722126029, + "learning_rate": 0.00010569165143205363, + "loss": 1.0218, + "step": 2168 + }, + { + "epoch": 0.46267064846416384, + "grad_norm": 0.4546658794313409, + "learning_rate": 0.00010574040219378429, + "loss": 0.9786, + "step": 2169 + }, + { + "epoch": 0.4628839590443686, + "grad_norm": 0.5686793288392524, + "learning_rate": 0.00010578915295551494, + "loss": 0.9941, + "step": 2170 + }, + { + "epoch": 0.46309726962457337, + "grad_norm": 0.616856044377035, + "learning_rate": 0.00010583790371724558, + "loss": 1.0195, + "step": 2171 + }, + { + "epoch": 0.46331058020477817, + "grad_norm": 0.5681963024093163, + "learning_rate": 0.00010588665447897625, + "loss": 1.0129, + "step": 2172 + }, + { + "epoch": 0.46352389078498296, + "grad_norm": 0.5249757367193257, + "learning_rate": 0.00010593540524070688, + "loss": 1.0096, + "step": 2173 + }, + { + "epoch": 0.4637372013651877, + "grad_norm": 0.511161256499988, + "learning_rate": 0.00010598415600243755, + "loss": 1.0215, + "step": 2174 + }, + { + "epoch": 0.4639505119453925, + "grad_norm": 0.5015645528845725, + "learning_rate": 0.0001060329067641682, + "loss": 0.9837, + "step": 2175 + }, + { + "epoch": 0.4641638225255973, + "grad_norm": 0.6000807398466953, + "learning_rate": 0.00010608165752589884, + "loss": 1.0158, + "step": 2176 + }, + { + "epoch": 0.46437713310580203, + "grad_norm": 0.6422031914549452, + "learning_rate": 0.00010613040828762951, + "loss": 0.9889, + "step": 2177 + }, + { + "epoch": 0.4645904436860068, + "grad_norm": 0.6885598386018346, + "learning_rate": 0.00010617915904936015, + "loss": 1.011, + "step": 2178 + }, + { + "epoch": 0.4648037542662116, + "grad_norm": 0.7854014911645498, + "learning_rate": 0.00010622790981109082, + "loss": 1.0243, + "step": 2179 + }, + { + "epoch": 0.46501706484641636, + "grad_norm": 0.9294864981804023, + "learning_rate": 0.00010627666057282146, + "loss": 1.0203, + "step": 2180 + }, + { + "epoch": 0.46523037542662116, + "grad_norm": 1.1220052059489243, + "learning_rate": 0.00010632541133455212, + "loss": 1.0532, + "step": 2181 + }, + { + "epoch": 0.46544368600682595, + "grad_norm": 0.840534869047363, + "learning_rate": 0.00010637416209628276, + "loss": 1.034, + "step": 2182 + }, + { + "epoch": 0.4656569965870307, + "grad_norm": 0.7210400022487292, + "learning_rate": 0.00010642291285801341, + "loss": 1.0123, + "step": 2183 + }, + { + "epoch": 0.4658703071672355, + "grad_norm": 0.733059229740774, + "learning_rate": 0.00010647166361974407, + "loss": 1.0439, + "step": 2184 + }, + { + "epoch": 0.4660836177474403, + "grad_norm": 0.7055679073850889, + "learning_rate": 0.00010652041438147472, + "loss": 1.0318, + "step": 2185 + }, + { + "epoch": 0.4662969283276451, + "grad_norm": 0.6990048764689109, + "learning_rate": 0.00010656916514320537, + "loss": 0.9757, + "step": 2186 + }, + { + "epoch": 0.4665102389078498, + "grad_norm": 0.6251308899480296, + "learning_rate": 0.00010661791590493603, + "loss": 1.0122, + "step": 2187 + }, + { + "epoch": 0.4667235494880546, + "grad_norm": 0.8050454149570739, + "learning_rate": 0.00010666666666666667, + "loss": 0.9898, + "step": 2188 + }, + { + "epoch": 0.4669368600682594, + "grad_norm": 1.097582212675433, + "learning_rate": 0.00010671541742839733, + "loss": 1.0153, + "step": 2189 + }, + { + "epoch": 0.46715017064846415, + "grad_norm": 0.965047500197038, + "learning_rate": 0.00010676416819012797, + "loss": 1.0175, + "step": 2190 + }, + { + "epoch": 0.46736348122866894, + "grad_norm": 0.854313380700971, + "learning_rate": 0.00010681291895185864, + "loss": 1.0027, + "step": 2191 + }, + { + "epoch": 0.46757679180887374, + "grad_norm": 0.7935681967158152, + "learning_rate": 0.00010686166971358928, + "loss": 1.0133, + "step": 2192 + }, + { + "epoch": 0.4677901023890785, + "grad_norm": 0.6756202355953042, + "learning_rate": 0.00010691042047531993, + "loss": 1.018, + "step": 2193 + }, + { + "epoch": 0.4680034129692833, + "grad_norm": 0.6438062510983144, + "learning_rate": 0.0001069591712370506, + "loss": 1.0027, + "step": 2194 + }, + { + "epoch": 0.46821672354948807, + "grad_norm": 0.756667561591685, + "learning_rate": 0.00010700792199878124, + "loss": 0.9504, + "step": 2195 + }, + { + "epoch": 0.4684300341296928, + "grad_norm": 0.9693594880142863, + "learning_rate": 0.0001070566727605119, + "loss": 1.0307, + "step": 2196 + }, + { + "epoch": 0.4686433447098976, + "grad_norm": 1.1005031584528449, + "learning_rate": 0.00010710542352224254, + "loss": 1.0145, + "step": 2197 + }, + { + "epoch": 0.4688566552901024, + "grad_norm": 0.731189934463501, + "learning_rate": 0.00010715417428397318, + "loss": 0.9597, + "step": 2198 + }, + { + "epoch": 0.4690699658703072, + "grad_norm": 0.6008937943729827, + "learning_rate": 0.00010720292504570385, + "loss": 1.0123, + "step": 2199 + }, + { + "epoch": 0.46928327645051193, + "grad_norm": 0.7213560468765394, + "learning_rate": 0.0001072516758074345, + "loss": 1.0297, + "step": 2200 + }, + { + "epoch": 0.4694965870307167, + "grad_norm": 0.8026241851270854, + "learning_rate": 0.00010730042656916516, + "loss": 1.0155, + "step": 2201 + }, + { + "epoch": 0.4697098976109215, + "grad_norm": 0.6529417736322948, + "learning_rate": 0.00010734917733089581, + "loss": 0.9998, + "step": 2202 + }, + { + "epoch": 0.46992320819112626, + "grad_norm": 0.4891208059470805, + "learning_rate": 0.00010739792809262645, + "loss": 1.0226, + "step": 2203 + }, + { + "epoch": 0.47013651877133106, + "grad_norm": 0.5152399217635297, + "learning_rate": 0.00010744667885435711, + "loss": 1.0044, + "step": 2204 + }, + { + "epoch": 0.47034982935153585, + "grad_norm": 0.7555287319222249, + "learning_rate": 0.00010749542961608775, + "loss": 1.0164, + "step": 2205 + }, + { + "epoch": 0.4705631399317406, + "grad_norm": 0.8391420913932668, + "learning_rate": 0.00010754418037781842, + "loss": 1.0167, + "step": 2206 + }, + { + "epoch": 0.4707764505119454, + "grad_norm": 0.6893994117378838, + "learning_rate": 0.00010759293113954906, + "loss": 1.0539, + "step": 2207 + }, + { + "epoch": 0.4709897610921502, + "grad_norm": 0.9575043814033276, + "learning_rate": 0.00010764168190127971, + "loss": 1.0383, + "step": 2208 + }, + { + "epoch": 0.4712030716723549, + "grad_norm": 1.1729794652251408, + "learning_rate": 0.00010769043266301037, + "loss": 0.9956, + "step": 2209 + }, + { + "epoch": 0.4714163822525597, + "grad_norm": 0.5857807655513912, + "learning_rate": 0.00010773918342474102, + "loss": 0.9679, + "step": 2210 + }, + { + "epoch": 0.4716296928327645, + "grad_norm": 0.8819969424182358, + "learning_rate": 0.00010778793418647167, + "loss": 0.9999, + "step": 2211 + }, + { + "epoch": 0.4718430034129693, + "grad_norm": 0.9841740506775121, + "learning_rate": 0.00010783668494820233, + "loss": 0.9978, + "step": 2212 + }, + { + "epoch": 0.47205631399317405, + "grad_norm": 0.6995159830738336, + "learning_rate": 0.00010788543570993296, + "loss": 1.0048, + "step": 2213 + }, + { + "epoch": 0.47226962457337884, + "grad_norm": 0.5262207343505135, + "learning_rate": 0.00010793418647166363, + "loss": 1.0111, + "step": 2214 + }, + { + "epoch": 0.47248293515358364, + "grad_norm": 0.5999648566154612, + "learning_rate": 0.00010798293723339427, + "loss": 0.9873, + "step": 2215 + }, + { + "epoch": 0.4726962457337884, + "grad_norm": 0.5083397827444951, + "learning_rate": 0.00010803168799512494, + "loss": 1.0136, + "step": 2216 + }, + { + "epoch": 0.4729095563139932, + "grad_norm": 0.4811396314256847, + "learning_rate": 0.00010808043875685558, + "loss": 1.0331, + "step": 2217 + }, + { + "epoch": 0.47312286689419797, + "grad_norm": 0.5276609171641542, + "learning_rate": 0.00010812918951858623, + "loss": 1.0178, + "step": 2218 + }, + { + "epoch": 0.4733361774744027, + "grad_norm": 0.497782162870865, + "learning_rate": 0.0001081779402803169, + "loss": 0.9985, + "step": 2219 + }, + { + "epoch": 0.4735494880546075, + "grad_norm": 0.5654254625002894, + "learning_rate": 0.00010822669104204754, + "loss": 1.0159, + "step": 2220 + }, + { + "epoch": 0.4737627986348123, + "grad_norm": 0.7559997015605322, + "learning_rate": 0.0001082754418037782, + "loss": 1.0077, + "step": 2221 + }, + { + "epoch": 0.47397610921501704, + "grad_norm": 0.924589432661335, + "learning_rate": 0.00010832419256550884, + "loss": 0.9832, + "step": 2222 + }, + { + "epoch": 0.47418941979522183, + "grad_norm": 1.1690823603925296, + "learning_rate": 0.0001083729433272395, + "loss": 0.9624, + "step": 2223 + }, + { + "epoch": 0.47440273037542663, + "grad_norm": 1.0081010460325617, + "learning_rate": 0.00010842169408897015, + "loss": 0.9925, + "step": 2224 + }, + { + "epoch": 0.4746160409556314, + "grad_norm": 0.9421099328074162, + "learning_rate": 0.0001084704448507008, + "loss": 1.0152, + "step": 2225 + }, + { + "epoch": 0.47482935153583616, + "grad_norm": 0.7372466455582771, + "learning_rate": 0.00010851919561243145, + "loss": 1.0162, + "step": 2226 + }, + { + "epoch": 0.47504266211604096, + "grad_norm": 0.4986577617262331, + "learning_rate": 0.00010856794637416211, + "loss": 1.022, + "step": 2227 + }, + { + "epoch": 0.47525597269624575, + "grad_norm": 0.49713931469799105, + "learning_rate": 0.00010861669713589276, + "loss": 1.0197, + "step": 2228 + }, + { + "epoch": 0.4754692832764505, + "grad_norm": 0.6092892085780486, + "learning_rate": 0.00010866544789762341, + "loss": 1.0302, + "step": 2229 + }, + { + "epoch": 0.4756825938566553, + "grad_norm": 0.6178311677543725, + "learning_rate": 0.00010871419865935405, + "loss": 1.0495, + "step": 2230 + }, + { + "epoch": 0.4758959044368601, + "grad_norm": 0.6580750257893653, + "learning_rate": 0.00010876294942108472, + "loss": 1.0072, + "step": 2231 + }, + { + "epoch": 0.4761092150170648, + "grad_norm": 0.7836876043979107, + "learning_rate": 0.00010881170018281536, + "loss": 0.98, + "step": 2232 + }, + { + "epoch": 0.4763225255972696, + "grad_norm": 0.97939085510857, + "learning_rate": 0.00010886045094454603, + "loss": 1.0071, + "step": 2233 + }, + { + "epoch": 0.4765358361774744, + "grad_norm": 1.2848431870887944, + "learning_rate": 0.00010890920170627666, + "loss": 1.0244, + "step": 2234 + }, + { + "epoch": 0.47674914675767915, + "grad_norm": 0.6458190352009759, + "learning_rate": 0.00010895795246800732, + "loss": 0.9904, + "step": 2235 + }, + { + "epoch": 0.47696245733788395, + "grad_norm": 0.48723589573728404, + "learning_rate": 0.00010900670322973798, + "loss": 1.0029, + "step": 2236 + }, + { + "epoch": 0.47717576791808874, + "grad_norm": 0.7559095970135482, + "learning_rate": 0.00010905545399146862, + "loss": 0.9832, + "step": 2237 + }, + { + "epoch": 0.47738907849829354, + "grad_norm": 1.2050178122234803, + "learning_rate": 0.00010910420475319929, + "loss": 1.0093, + "step": 2238 + }, + { + "epoch": 0.4776023890784983, + "grad_norm": 0.8742657995248349, + "learning_rate": 0.00010915295551492993, + "loss": 0.9649, + "step": 2239 + }, + { + "epoch": 0.4778156996587031, + "grad_norm": 0.7689520741941053, + "learning_rate": 0.00010920170627666057, + "loss": 0.988, + "step": 2240 + }, + { + "epoch": 0.47802901023890787, + "grad_norm": 0.7614746977127466, + "learning_rate": 0.00010925045703839124, + "loss": 1.0336, + "step": 2241 + }, + { + "epoch": 0.4782423208191126, + "grad_norm": 0.6426393136574865, + "learning_rate": 0.00010929920780012189, + "loss": 1.0453, + "step": 2242 + }, + { + "epoch": 0.4784556313993174, + "grad_norm": 0.5247633662517793, + "learning_rate": 0.00010934795856185254, + "loss": 1.0428, + "step": 2243 + }, + { + "epoch": 0.4786689419795222, + "grad_norm": 0.5203237708605373, + "learning_rate": 0.0001093967093235832, + "loss": 1.0323, + "step": 2244 + }, + { + "epoch": 0.47888225255972694, + "grad_norm": 0.5115996573923107, + "learning_rate": 0.00010944546008531383, + "loss": 1.0343, + "step": 2245 + }, + { + "epoch": 0.47909556313993173, + "grad_norm": 0.5428485680214573, + "learning_rate": 0.0001094942108470445, + "loss": 1.045, + "step": 2246 + }, + { + "epoch": 0.47930887372013653, + "grad_norm": 0.48698416359296487, + "learning_rate": 0.00010954296160877514, + "loss": 1.0106, + "step": 2247 + }, + { + "epoch": 0.47952218430034127, + "grad_norm": 0.5427547430872713, + "learning_rate": 0.00010959171237050581, + "loss": 1.0098, + "step": 2248 + }, + { + "epoch": 0.47973549488054607, + "grad_norm": 0.6714278605587877, + "learning_rate": 0.00010964046313223645, + "loss": 1.0302, + "step": 2249 + }, + { + "epoch": 0.47994880546075086, + "grad_norm": 0.5317196543798065, + "learning_rate": 0.0001096892138939671, + "loss": 0.9876, + "step": 2250 + }, + { + "epoch": 0.48016211604095566, + "grad_norm": 0.5279197937388636, + "learning_rate": 0.00010973796465569775, + "loss": 1.0306, + "step": 2251 + }, + { + "epoch": 0.4803754266211604, + "grad_norm": 0.6591145540591055, + "learning_rate": 0.0001097867154174284, + "loss": 1.0003, + "step": 2252 + }, + { + "epoch": 0.4805887372013652, + "grad_norm": 0.7009751697879685, + "learning_rate": 0.00010983546617915906, + "loss": 1.0005, + "step": 2253 + }, + { + "epoch": 0.48080204778157, + "grad_norm": 0.9090686131452106, + "learning_rate": 0.00010988421694088971, + "loss": 0.9685, + "step": 2254 + }, + { + "epoch": 0.4810153583617747, + "grad_norm": 0.9949316399118351, + "learning_rate": 0.00010993296770262035, + "loss": 1.0161, + "step": 2255 + }, + { + "epoch": 0.4812286689419795, + "grad_norm": 1.170865924000263, + "learning_rate": 0.00010998171846435102, + "loss": 1.0197, + "step": 2256 + }, + { + "epoch": 0.4814419795221843, + "grad_norm": 0.793118964092031, + "learning_rate": 0.00011003046922608166, + "loss": 1.0193, + "step": 2257 + }, + { + "epoch": 0.48165529010238906, + "grad_norm": 0.6303013663908319, + "learning_rate": 0.00011007921998781232, + "loss": 1.0106, + "step": 2258 + }, + { + "epoch": 0.48186860068259385, + "grad_norm": 0.6039610871247425, + "learning_rate": 0.00011012797074954296, + "loss": 0.9878, + "step": 2259 + }, + { + "epoch": 0.48208191126279865, + "grad_norm": 0.5281493961433891, + "learning_rate": 0.00011017672151127362, + "loss": 0.9833, + "step": 2260 + }, + { + "epoch": 0.4822952218430034, + "grad_norm": 0.4099964150593092, + "learning_rate": 0.00011022547227300428, + "loss": 0.9991, + "step": 2261 + }, + { + "epoch": 0.4825085324232082, + "grad_norm": 0.5517109322034272, + "learning_rate": 0.00011027422303473492, + "loss": 1.019, + "step": 2262 + }, + { + "epoch": 0.482721843003413, + "grad_norm": 0.6313880381742509, + "learning_rate": 0.00011032297379646559, + "loss": 1.0093, + "step": 2263 + }, + { + "epoch": 0.48293515358361777, + "grad_norm": 0.745207035225392, + "learning_rate": 0.00011037172455819623, + "loss": 1.0385, + "step": 2264 + }, + { + "epoch": 0.4831484641638225, + "grad_norm": 0.8878846896569298, + "learning_rate": 0.00011042047531992687, + "loss": 1.0513, + "step": 2265 + }, + { + "epoch": 0.4833617747440273, + "grad_norm": 0.8682869774976515, + "learning_rate": 0.00011046922608165753, + "loss": 1.0161, + "step": 2266 + }, + { + "epoch": 0.4835750853242321, + "grad_norm": 0.9057795894222803, + "learning_rate": 0.00011051797684338819, + "loss": 0.9915, + "step": 2267 + }, + { + "epoch": 0.48378839590443684, + "grad_norm": 1.026773247467624, + "learning_rate": 0.00011056672760511884, + "loss": 1.021, + "step": 2268 + }, + { + "epoch": 0.48400170648464164, + "grad_norm": 1.115871245789289, + "learning_rate": 0.00011061547836684949, + "loss": 1.0105, + "step": 2269 + }, + { + "epoch": 0.48421501706484643, + "grad_norm": 0.7918242688431302, + "learning_rate": 0.00011066422912858015, + "loss": 1.0076, + "step": 2270 + }, + { + "epoch": 0.48442832764505117, + "grad_norm": 0.5556140673916917, + "learning_rate": 0.0001107129798903108, + "loss": 1.013, + "step": 2271 + }, + { + "epoch": 0.48464163822525597, + "grad_norm": 0.47730848900725753, + "learning_rate": 0.00011076173065204144, + "loss": 1.0141, + "step": 2272 + }, + { + "epoch": 0.48485494880546076, + "grad_norm": 0.5048978345874613, + "learning_rate": 0.0001108104814137721, + "loss": 0.991, + "step": 2273 + }, + { + "epoch": 0.48506825938566556, + "grad_norm": 0.6366858772170214, + "learning_rate": 0.00011085923217550274, + "loss": 1.0211, + "step": 2274 + }, + { + "epoch": 0.4852815699658703, + "grad_norm": 0.6936800764935348, + "learning_rate": 0.00011090798293723341, + "loss": 1.0286, + "step": 2275 + }, + { + "epoch": 0.4854948805460751, + "grad_norm": 0.7316419199703837, + "learning_rate": 0.00011095673369896405, + "loss": 1.0259, + "step": 2276 + }, + { + "epoch": 0.4857081911262799, + "grad_norm": 0.9093562160676928, + "learning_rate": 0.0001110054844606947, + "loss": 0.9849, + "step": 2277 + }, + { + "epoch": 0.4859215017064846, + "grad_norm": 1.1727648248971954, + "learning_rate": 0.00011105423522242537, + "loss": 0.9881, + "step": 2278 + }, + { + "epoch": 0.4861348122866894, + "grad_norm": 0.7823908449529743, + "learning_rate": 0.00011110298598415601, + "loss": 0.9971, + "step": 2279 + }, + { + "epoch": 0.4863481228668942, + "grad_norm": 0.5970453278967918, + "learning_rate": 0.00011115173674588668, + "loss": 0.9934, + "step": 2280 + }, + { + "epoch": 0.48656143344709896, + "grad_norm": 0.6079736997841403, + "learning_rate": 0.00011120048750761732, + "loss": 1.0053, + "step": 2281 + }, + { + "epoch": 0.48677474402730375, + "grad_norm": 0.8521068488475366, + "learning_rate": 0.00011124923826934796, + "loss": 1.0231, + "step": 2282 + }, + { + "epoch": 0.48698805460750855, + "grad_norm": 1.1888249146673615, + "learning_rate": 0.00011129798903107862, + "loss": 1.0491, + "step": 2283 + }, + { + "epoch": 0.4872013651877133, + "grad_norm": 0.8531313236217871, + "learning_rate": 0.00011134673979280927, + "loss": 0.9913, + "step": 2284 + }, + { + "epoch": 0.4874146757679181, + "grad_norm": 0.6859250092960089, + "learning_rate": 0.00011139549055453993, + "loss": 0.9987, + "step": 2285 + }, + { + "epoch": 0.4876279863481229, + "grad_norm": 0.6439544210709569, + "learning_rate": 0.00011144424131627058, + "loss": 0.9859, + "step": 2286 + }, + { + "epoch": 0.4878412969283277, + "grad_norm": 0.6776867446079212, + "learning_rate": 0.00011149299207800122, + "loss": 1.0023, + "step": 2287 + }, + { + "epoch": 0.4880546075085324, + "grad_norm": 0.7522601853402499, + "learning_rate": 0.00011154174283973189, + "loss": 0.995, + "step": 2288 + }, + { + "epoch": 0.4882679180887372, + "grad_norm": 0.8003309701830679, + "learning_rate": 0.00011159049360146253, + "loss": 0.995, + "step": 2289 + }, + { + "epoch": 0.488481228668942, + "grad_norm": 0.919269035921836, + "learning_rate": 0.00011163924436319319, + "loss": 0.9847, + "step": 2290 + }, + { + "epoch": 0.48869453924914674, + "grad_norm": 1.109848055780718, + "learning_rate": 0.00011168799512492383, + "loss": 0.9675, + "step": 2291 + }, + { + "epoch": 0.48890784982935154, + "grad_norm": 0.8817341077982879, + "learning_rate": 0.00011173674588665449, + "loss": 1.0302, + "step": 2292 + }, + { + "epoch": 0.48912116040955633, + "grad_norm": 0.6133422667218018, + "learning_rate": 0.00011178549664838514, + "loss": 1.0385, + "step": 2293 + }, + { + "epoch": 0.4893344709897611, + "grad_norm": 0.5796901270722139, + "learning_rate": 0.00011183424741011579, + "loss": 1.0022, + "step": 2294 + }, + { + "epoch": 0.48954778156996587, + "grad_norm": 0.5761414184977179, + "learning_rate": 0.00011188299817184644, + "loss": 1.0167, + "step": 2295 + }, + { + "epoch": 0.48976109215017066, + "grad_norm": 0.5882847135353088, + "learning_rate": 0.0001119317489335771, + "loss": 0.9736, + "step": 2296 + }, + { + "epoch": 0.4899744027303754, + "grad_norm": 0.6110002913151472, + "learning_rate": 0.00011198049969530774, + "loss": 0.9722, + "step": 2297 + }, + { + "epoch": 0.4901877133105802, + "grad_norm": 0.5900546789626934, + "learning_rate": 0.0001120292504570384, + "loss": 1.0175, + "step": 2298 + }, + { + "epoch": 0.490401023890785, + "grad_norm": 0.536517653682912, + "learning_rate": 0.00011207800121876904, + "loss": 1.0068, + "step": 2299 + }, + { + "epoch": 0.4906143344709898, + "grad_norm": 0.5586690031381777, + "learning_rate": 0.00011212675198049971, + "loss": 0.9887, + "step": 2300 + }, + { + "epoch": 0.4908276450511945, + "grad_norm": 0.5560985262134996, + "learning_rate": 0.00011217550274223035, + "loss": 1.0247, + "step": 2301 + }, + { + "epoch": 0.4910409556313993, + "grad_norm": 0.6412696006479353, + "learning_rate": 0.000112224253503961, + "loss": 0.9996, + "step": 2302 + }, + { + "epoch": 0.4912542662116041, + "grad_norm": 0.6282800611388312, + "learning_rate": 0.00011227300426569167, + "loss": 1.0107, + "step": 2303 + }, + { + "epoch": 0.49146757679180886, + "grad_norm": 0.5891091067170586, + "learning_rate": 0.00011232175502742231, + "loss": 1.0319, + "step": 2304 + }, + { + "epoch": 0.49168088737201365, + "grad_norm": 0.6320560126679247, + "learning_rate": 0.00011237050578915297, + "loss": 1.0283, + "step": 2305 + }, + { + "epoch": 0.49189419795221845, + "grad_norm": 0.7228184191881631, + "learning_rate": 0.00011241925655088361, + "loss": 0.9955, + "step": 2306 + }, + { + "epoch": 0.4921075085324232, + "grad_norm": 0.8492535548615501, + "learning_rate": 0.00011246800731261425, + "loss": 1.0364, + "step": 2307 + }, + { + "epoch": 0.492320819112628, + "grad_norm": 0.9880043503416134, + "learning_rate": 0.00011251675807434492, + "loss": 0.9993, + "step": 2308 + }, + { + "epoch": 0.4925341296928328, + "grad_norm": 1.0008550626560935, + "learning_rate": 0.00011256550883607557, + "loss": 1.0043, + "step": 2309 + }, + { + "epoch": 0.4927474402730375, + "grad_norm": 0.9175743053204163, + "learning_rate": 0.00011261425959780623, + "loss": 0.978, + "step": 2310 + }, + { + "epoch": 0.4929607508532423, + "grad_norm": 0.8243709600988922, + "learning_rate": 0.00011266301035953688, + "loss": 1.0078, + "step": 2311 + }, + { + "epoch": 0.4931740614334471, + "grad_norm": 0.8095182155103939, + "learning_rate": 0.00011271176112126753, + "loss": 1.0292, + "step": 2312 + }, + { + "epoch": 0.4933873720136519, + "grad_norm": 0.833231011015502, + "learning_rate": 0.00011276051188299819, + "loss": 0.9724, + "step": 2313 + }, + { + "epoch": 0.49360068259385664, + "grad_norm": 0.8291985700398868, + "learning_rate": 0.00011280926264472882, + "loss": 1.0104, + "step": 2314 + }, + { + "epoch": 0.49381399317406144, + "grad_norm": 0.8791681953793065, + "learning_rate": 0.00011285801340645949, + "loss": 1.0183, + "step": 2315 + }, + { + "epoch": 0.49402730375426623, + "grad_norm": 0.8469805123338935, + "learning_rate": 0.00011290676416819013, + "loss": 0.9767, + "step": 2316 + }, + { + "epoch": 0.494240614334471, + "grad_norm": 0.7490326604927515, + "learning_rate": 0.0001129555149299208, + "loss": 0.999, + "step": 2317 + }, + { + "epoch": 0.49445392491467577, + "grad_norm": 0.7396017518542739, + "learning_rate": 0.00011300426569165144, + "loss": 1.0012, + "step": 2318 + }, + { + "epoch": 0.49466723549488056, + "grad_norm": 0.7489951872109304, + "learning_rate": 0.00011305301645338209, + "loss": 0.9788, + "step": 2319 + }, + { + "epoch": 0.4948805460750853, + "grad_norm": 0.842895001772742, + "learning_rate": 0.00011310176721511276, + "loss": 1.0059, + "step": 2320 + }, + { + "epoch": 0.4950938566552901, + "grad_norm": 0.9068232982818794, + "learning_rate": 0.0001131505179768434, + "loss": 1.0131, + "step": 2321 + }, + { + "epoch": 0.4953071672354949, + "grad_norm": 0.9164714666860754, + "learning_rate": 0.00011319926873857406, + "loss": 1.0156, + "step": 2322 + }, + { + "epoch": 0.49552047781569963, + "grad_norm": 0.844981028384367, + "learning_rate": 0.0001132480195003047, + "loss": 1.0003, + "step": 2323 + }, + { + "epoch": 0.49573378839590443, + "grad_norm": 0.7440675230667599, + "learning_rate": 0.00011329677026203534, + "loss": 1.0064, + "step": 2324 + }, + { + "epoch": 0.4959470989761092, + "grad_norm": 0.7746658865768958, + "learning_rate": 0.00011334552102376601, + "loss": 0.9871, + "step": 2325 + }, + { + "epoch": 0.496160409556314, + "grad_norm": 0.7333170875462479, + "learning_rate": 0.00011339427178549666, + "loss": 1.0386, + "step": 2326 + }, + { + "epoch": 0.49637372013651876, + "grad_norm": 0.681605292358498, + "learning_rate": 0.00011344302254722731, + "loss": 1.0105, + "step": 2327 + }, + { + "epoch": 0.49658703071672355, + "grad_norm": 0.6672069567428237, + "learning_rate": 0.00011349177330895797, + "loss": 1.0013, + "step": 2328 + }, + { + "epoch": 0.49680034129692835, + "grad_norm": 0.7207183504418768, + "learning_rate": 0.0001135405240706886, + "loss": 1.0289, + "step": 2329 + }, + { + "epoch": 0.4970136518771331, + "grad_norm": 0.721646568612733, + "learning_rate": 0.00011358927483241927, + "loss": 0.9946, + "step": 2330 + }, + { + "epoch": 0.4972269624573379, + "grad_norm": 0.7202427824723475, + "learning_rate": 0.00011363802559414991, + "loss": 1.0121, + "step": 2331 + }, + { + "epoch": 0.4974402730375427, + "grad_norm": 0.6314356868981433, + "learning_rate": 0.00011368677635588058, + "loss": 1.0276, + "step": 2332 + }, + { + "epoch": 0.4976535836177474, + "grad_norm": 0.5453046528344268, + "learning_rate": 0.00011373552711761122, + "loss": 1.0036, + "step": 2333 + }, + { + "epoch": 0.4978668941979522, + "grad_norm": 0.5426343156798723, + "learning_rate": 0.00011378427787934187, + "loss": 1.0261, + "step": 2334 + }, + { + "epoch": 0.498080204778157, + "grad_norm": 0.6384477475002761, + "learning_rate": 0.00011383302864107252, + "loss": 1.019, + "step": 2335 + }, + { + "epoch": 0.49829351535836175, + "grad_norm": 0.7140356153541008, + "learning_rate": 0.00011388177940280318, + "loss": 0.9902, + "step": 2336 + }, + { + "epoch": 0.49850682593856654, + "grad_norm": 0.745452100035981, + "learning_rate": 0.00011393053016453383, + "loss": 1.0261, + "step": 2337 + }, + { + "epoch": 0.49872013651877134, + "grad_norm": 0.8328731314901587, + "learning_rate": 0.00011397928092626448, + "loss": 0.9925, + "step": 2338 + }, + { + "epoch": 0.49893344709897613, + "grad_norm": 1.0291886845297142, + "learning_rate": 0.00011402803168799512, + "loss": 1.0349, + "step": 2339 + }, + { + "epoch": 0.4991467576791809, + "grad_norm": 1.0361823360569478, + "learning_rate": 0.00011407678244972579, + "loss": 0.999, + "step": 2340 + }, + { + "epoch": 0.49936006825938567, + "grad_norm": 0.7625745642131545, + "learning_rate": 0.00011412553321145643, + "loss": 1.0221, + "step": 2341 + }, + { + "epoch": 0.49957337883959047, + "grad_norm": 0.5283870586467585, + "learning_rate": 0.0001141742839731871, + "loss": 0.9994, + "step": 2342 + }, + { + "epoch": 0.4997866894197952, + "grad_norm": 0.46659167957522646, + "learning_rate": 0.00011422303473491774, + "loss": 0.9631, + "step": 2343 + }, + { + "epoch": 0.5, + "grad_norm": 0.5335834039359652, + "learning_rate": 0.00011427178549664839, + "loss": 1.0144, + "step": 2344 + }, + { + "epoch": 0.5002133105802048, + "grad_norm": 0.7873643458575851, + "learning_rate": 0.00011432053625837905, + "loss": 1.0158, + "step": 2345 + }, + { + "epoch": 0.5004266211604096, + "grad_norm": 1.1400609566071813, + "learning_rate": 0.0001143692870201097, + "loss": 0.9645, + "step": 2346 + }, + { + "epoch": 0.5006399317406144, + "grad_norm": 0.8732836217541452, + "learning_rate": 0.00011441803778184036, + "loss": 0.9878, + "step": 2347 + }, + { + "epoch": 0.5008532423208191, + "grad_norm": 0.6900710970850044, + "learning_rate": 0.000114466788543571, + "loss": 1.0156, + "step": 2348 + }, + { + "epoch": 0.5010665529010239, + "grad_norm": 0.6175541983068975, + "learning_rate": 0.00011451553930530164, + "loss": 1.0309, + "step": 2349 + }, + { + "epoch": 0.5012798634812287, + "grad_norm": 0.6641427252343999, + "learning_rate": 0.0001145642900670323, + "loss": 1.0041, + "step": 2350 + }, + { + "epoch": 0.5014931740614335, + "grad_norm": 0.5932871037419281, + "learning_rate": 0.00011461304082876296, + "loss": 0.992, + "step": 2351 + }, + { + "epoch": 0.5017064846416383, + "grad_norm": 0.5080399091524214, + "learning_rate": 0.00011466179159049361, + "loss": 0.9873, + "step": 2352 + }, + { + "epoch": 0.501919795221843, + "grad_norm": 0.44017262788326567, + "learning_rate": 0.00011471054235222427, + "loss": 1.0171, + "step": 2353 + }, + { + "epoch": 0.5021331058020477, + "grad_norm": 0.4201882702972557, + "learning_rate": 0.00011475929311395492, + "loss": 0.9814, + "step": 2354 + }, + { + "epoch": 0.5023464163822525, + "grad_norm": 0.5881181094634053, + "learning_rate": 0.00011480804387568557, + "loss": 0.999, + "step": 2355 + }, + { + "epoch": 0.5025597269624573, + "grad_norm": 0.7738229101905014, + "learning_rate": 0.00011485679463741621, + "loss": 0.9925, + "step": 2356 + }, + { + "epoch": 0.5027730375426621, + "grad_norm": 0.8208216565270725, + "learning_rate": 0.00011490554539914688, + "loss": 1.0317, + "step": 2357 + }, + { + "epoch": 0.5029863481228669, + "grad_norm": 0.805281137470238, + "learning_rate": 0.00011495429616087752, + "loss": 0.9662, + "step": 2358 + }, + { + "epoch": 0.5031996587030717, + "grad_norm": 0.9750942054084094, + "learning_rate": 0.00011500304692260818, + "loss": 1.0296, + "step": 2359 + }, + { + "epoch": 0.5034129692832765, + "grad_norm": 1.2943111976312678, + "learning_rate": 0.00011505179768433882, + "loss": 1.0002, + "step": 2360 + }, + { + "epoch": 0.5036262798634812, + "grad_norm": 0.68622399764193, + "learning_rate": 0.00011510054844606948, + "loss": 1.0046, + "step": 2361 + }, + { + "epoch": 0.503839590443686, + "grad_norm": 0.5048170380661378, + "learning_rate": 0.00011514929920780013, + "loss": 1.0412, + "step": 2362 + }, + { + "epoch": 0.5040529010238908, + "grad_norm": 0.84990471174686, + "learning_rate": 0.00011519804996953078, + "loss": 1.0142, + "step": 2363 + }, + { + "epoch": 0.5042662116040956, + "grad_norm": 1.381843351212909, + "learning_rate": 0.00011524680073126145, + "loss": 1.0222, + "step": 2364 + }, + { + "epoch": 0.5044795221843004, + "grad_norm": 0.7411091301545886, + "learning_rate": 0.00011529555149299209, + "loss": 1.0225, + "step": 2365 + }, + { + "epoch": 0.5046928327645052, + "grad_norm": 0.7779482285877266, + "learning_rate": 0.00011534430225472273, + "loss": 1.0461, + "step": 2366 + }, + { + "epoch": 0.5049061433447098, + "grad_norm": 0.9090420627417183, + "learning_rate": 0.0001153930530164534, + "loss": 1.0062, + "step": 2367 + }, + { + "epoch": 0.5051194539249146, + "grad_norm": 1.1823055037229213, + "learning_rate": 0.00011544180377818405, + "loss": 1.0083, + "step": 2368 + }, + { + "epoch": 0.5053327645051194, + "grad_norm": 0.7619222341645199, + "learning_rate": 0.0001154905545399147, + "loss": 0.9845, + "step": 2369 + }, + { + "epoch": 0.5055460750853242, + "grad_norm": 0.8532336397646485, + "learning_rate": 0.00011553930530164535, + "loss": 0.9842, + "step": 2370 + }, + { + "epoch": 0.505759385665529, + "grad_norm": 0.8565049140507015, + "learning_rate": 0.00011558805606337599, + "loss": 1.0299, + "step": 2371 + }, + { + "epoch": 0.5059726962457338, + "grad_norm": 0.6744969487939588, + "learning_rate": 0.00011563680682510666, + "loss": 1.0316, + "step": 2372 + }, + { + "epoch": 0.5061860068259386, + "grad_norm": 0.6236395647458305, + "learning_rate": 0.0001156855575868373, + "loss": 1.0044, + "step": 2373 + }, + { + "epoch": 0.5063993174061433, + "grad_norm": 0.54802817650853, + "learning_rate": 0.00011573430834856797, + "loss": 0.992, + "step": 2374 + }, + { + "epoch": 0.5066126279863481, + "grad_norm": 0.5515091530324122, + "learning_rate": 0.0001157830591102986, + "loss": 1.0094, + "step": 2375 + }, + { + "epoch": 0.5068259385665529, + "grad_norm": 0.6432254602785383, + "learning_rate": 0.00011583180987202926, + "loss": 1.0114, + "step": 2376 + }, + { + "epoch": 0.5070392491467577, + "grad_norm": 0.5653404918420211, + "learning_rate": 0.00011588056063375991, + "loss": 0.9888, + "step": 2377 + }, + { + "epoch": 0.5072525597269625, + "grad_norm": 0.5211279914402903, + "learning_rate": 0.00011592931139549056, + "loss": 1.0167, + "step": 2378 + }, + { + "epoch": 0.5074658703071673, + "grad_norm": 0.5496280467336007, + "learning_rate": 0.00011597806215722122, + "loss": 1.0007, + "step": 2379 + }, + { + "epoch": 0.507679180887372, + "grad_norm": 0.591992560078109, + "learning_rate": 0.00011602681291895187, + "loss": 1.0302, + "step": 2380 + }, + { + "epoch": 0.5078924914675768, + "grad_norm": 0.5841211308305919, + "learning_rate": 0.00011607556368068251, + "loss": 1.0208, + "step": 2381 + }, + { + "epoch": 0.5081058020477816, + "grad_norm": 0.48959486064979646, + "learning_rate": 0.00011612431444241318, + "loss": 0.9535, + "step": 2382 + }, + { + "epoch": 0.5083191126279863, + "grad_norm": 0.6308187842436941, + "learning_rate": 0.00011617306520414382, + "loss": 1.0206, + "step": 2383 + }, + { + "epoch": 0.5085324232081911, + "grad_norm": 0.6388523870507922, + "learning_rate": 0.00011622181596587448, + "loss": 1.0069, + "step": 2384 + }, + { + "epoch": 0.5087457337883959, + "grad_norm": 0.6775139960516711, + "learning_rate": 0.00011627056672760512, + "loss": 0.9942, + "step": 2385 + }, + { + "epoch": 0.5089590443686007, + "grad_norm": 0.6717855582195502, + "learning_rate": 0.00011631931748933577, + "loss": 0.9849, + "step": 2386 + }, + { + "epoch": 0.5091723549488054, + "grad_norm": 0.6615811671951989, + "learning_rate": 0.00011636806825106644, + "loss": 0.9842, + "step": 2387 + }, + { + "epoch": 0.5093856655290102, + "grad_norm": 0.7463357011003651, + "learning_rate": 0.00011641681901279708, + "loss": 0.9953, + "step": 2388 + }, + { + "epoch": 0.509598976109215, + "grad_norm": 0.6564763891950048, + "learning_rate": 0.00011646556977452775, + "loss": 0.9624, + "step": 2389 + }, + { + "epoch": 0.5098122866894198, + "grad_norm": 0.5811181439994723, + "learning_rate": 0.00011651432053625839, + "loss": 1.0581, + "step": 2390 + }, + { + "epoch": 0.5100255972696246, + "grad_norm": 0.7093026194707868, + "learning_rate": 0.00011656307129798903, + "loss": 1.0333, + "step": 2391 + }, + { + "epoch": 0.5102389078498294, + "grad_norm": 0.7379969655780975, + "learning_rate": 0.00011661182205971969, + "loss": 0.9882, + "step": 2392 + }, + { + "epoch": 0.5104522184300341, + "grad_norm": 0.7828821905093577, + "learning_rate": 0.00011666057282145035, + "loss": 1.0284, + "step": 2393 + }, + { + "epoch": 0.5106655290102389, + "grad_norm": 0.903823038917583, + "learning_rate": 0.000116709323583181, + "loss": 1.0292, + "step": 2394 + }, + { + "epoch": 0.5108788395904437, + "grad_norm": 0.9403168539610605, + "learning_rate": 0.00011675807434491165, + "loss": 0.9923, + "step": 2395 + }, + { + "epoch": 0.5110921501706485, + "grad_norm": 0.8112651755258764, + "learning_rate": 0.00011680682510664229, + "loss": 1.033, + "step": 2396 + }, + { + "epoch": 0.5113054607508533, + "grad_norm": 0.837530110748684, + "learning_rate": 0.00011685557586837296, + "loss": 1.0071, + "step": 2397 + }, + { + "epoch": 0.511518771331058, + "grad_norm": 0.9478884830106487, + "learning_rate": 0.0001169043266301036, + "loss": 1.0079, + "step": 2398 + }, + { + "epoch": 0.5117320819112628, + "grad_norm": 0.978643792726724, + "learning_rate": 0.00011695307739183426, + "loss": 0.9844, + "step": 2399 + }, + { + "epoch": 0.5119453924914675, + "grad_norm": 1.0266250221257485, + "learning_rate": 0.0001170018281535649, + "loss": 1.0044, + "step": 2400 + }, + { + "epoch": 0.5121587030716723, + "grad_norm": 0.9539963026899829, + "learning_rate": 0.00011705057891529557, + "loss": 0.9807, + "step": 2401 + }, + { + "epoch": 0.5123720136518771, + "grad_norm": 0.8878544535252786, + "learning_rate": 0.00011709932967702621, + "loss": 1.0209, + "step": 2402 + }, + { + "epoch": 0.5125853242320819, + "grad_norm": 0.7890787427835558, + "learning_rate": 0.00011714808043875686, + "loss": 1.0069, + "step": 2403 + }, + { + "epoch": 0.5127986348122867, + "grad_norm": 0.5677438618249412, + "learning_rate": 0.00011719683120048752, + "loss": 1.041, + "step": 2404 + }, + { + "epoch": 0.5130119453924915, + "grad_norm": 0.47556758669343446, + "learning_rate": 0.00011724558196221817, + "loss": 1.0103, + "step": 2405 + }, + { + "epoch": 0.5132252559726962, + "grad_norm": 0.5440119692000653, + "learning_rate": 0.00011729433272394883, + "loss": 1.0057, + "step": 2406 + }, + { + "epoch": 0.513438566552901, + "grad_norm": 2.6461746398740167, + "learning_rate": 0.00011734308348567947, + "loss": 1.0598, + "step": 2407 + }, + { + "epoch": 0.5136518771331058, + "grad_norm": 0.8055613352999785, + "learning_rate": 0.00011739183424741011, + "loss": 0.9916, + "step": 2408 + }, + { + "epoch": 0.5138651877133106, + "grad_norm": 1.277687707630685, + "learning_rate": 0.00011744058500914078, + "loss": 1.0155, + "step": 2409 + }, + { + "epoch": 0.5140784982935154, + "grad_norm": 0.4802814404120879, + "learning_rate": 0.00011748933577087142, + "loss": 0.9723, + "step": 2410 + }, + { + "epoch": 0.5142918088737202, + "grad_norm": 1.0534247178953544, + "learning_rate": 0.00011753808653260209, + "loss": 0.9912, + "step": 2411 + }, + { + "epoch": 0.514505119453925, + "grad_norm": 0.9733579873787921, + "learning_rate": 0.00011758683729433274, + "loss": 1.0027, + "step": 2412 + }, + { + "epoch": 0.5147184300341296, + "grad_norm": 0.6314088223176966, + "learning_rate": 0.00011763558805606338, + "loss": 1.0449, + "step": 2413 + }, + { + "epoch": 0.5149317406143344, + "grad_norm": 0.9434774950583885, + "learning_rate": 0.00011768433881779405, + "loss": 1.0251, + "step": 2414 + }, + { + "epoch": 0.5151450511945392, + "grad_norm": 0.8323336667478937, + "learning_rate": 0.00011773308957952469, + "loss": 1.0341, + "step": 2415 + }, + { + "epoch": 0.515358361774744, + "grad_norm": 0.6405452702184072, + "learning_rate": 0.00011778184034125535, + "loss": 1.0131, + "step": 2416 + }, + { + "epoch": 0.5155716723549488, + "grad_norm": 0.6719333744740256, + "learning_rate": 0.00011783059110298599, + "loss": 1.0169, + "step": 2417 + }, + { + "epoch": 0.5157849829351536, + "grad_norm": 0.7071954885986794, + "learning_rate": 0.00011787934186471664, + "loss": 1.004, + "step": 2418 + }, + { + "epoch": 0.5159982935153583, + "grad_norm": 0.8614243427055219, + "learning_rate": 0.0001179280926264473, + "loss": 1.0314, + "step": 2419 + }, + { + "epoch": 0.5162116040955631, + "grad_norm": 1.0767769066336856, + "learning_rate": 0.00011797684338817795, + "loss": 1.0206, + "step": 2420 + }, + { + "epoch": 0.5164249146757679, + "grad_norm": 0.9995655213593945, + "learning_rate": 0.0001180255941499086, + "loss": 0.9971, + "step": 2421 + }, + { + "epoch": 0.5166382252559727, + "grad_norm": 1.0770790516405673, + "learning_rate": 0.00011807434491163926, + "loss": 0.9934, + "step": 2422 + }, + { + "epoch": 0.5168515358361775, + "grad_norm": 1.0175540679084962, + "learning_rate": 0.0001181230956733699, + "loss": 0.9979, + "step": 2423 + }, + { + "epoch": 0.5170648464163823, + "grad_norm": 0.8579516078910966, + "learning_rate": 0.00011817184643510056, + "loss": 1.0138, + "step": 2424 + }, + { + "epoch": 0.5172781569965871, + "grad_norm": 0.694123869793816, + "learning_rate": 0.0001182205971968312, + "loss": 1.0351, + "step": 2425 + }, + { + "epoch": 0.5174914675767918, + "grad_norm": 0.5162015708185892, + "learning_rate": 0.00011826934795856187, + "loss": 1.009, + "step": 2426 + }, + { + "epoch": 0.5177047781569966, + "grad_norm": 0.4524724717570414, + "learning_rate": 0.00011831809872029251, + "loss": 1.0104, + "step": 2427 + }, + { + "epoch": 0.5179180887372014, + "grad_norm": 0.6808397108715439, + "learning_rate": 0.00011836684948202316, + "loss": 0.9967, + "step": 2428 + }, + { + "epoch": 0.5181313993174061, + "grad_norm": 0.9343166068867921, + "learning_rate": 0.00011841560024375383, + "loss": 1.042, + "step": 2429 + }, + { + "epoch": 0.518344709897611, + "grad_norm": 1.1346895274953708, + "learning_rate": 0.00011846435100548447, + "loss": 1.0135, + "step": 2430 + }, + { + "epoch": 0.5185580204778157, + "grad_norm": 0.8347114749106775, + "learning_rate": 0.00011851310176721513, + "loss": 1.013, + "step": 2431 + }, + { + "epoch": 0.5187713310580204, + "grad_norm": 0.6142914981810498, + "learning_rate": 0.00011856185252894577, + "loss": 1.0213, + "step": 2432 + }, + { + "epoch": 0.5189846416382252, + "grad_norm": 0.6018213957988883, + "learning_rate": 0.00011861060329067641, + "loss": 0.9918, + "step": 2433 + }, + { + "epoch": 0.51919795221843, + "grad_norm": 0.612512872050783, + "learning_rate": 0.00011865935405240708, + "loss": 1.0382, + "step": 2434 + }, + { + "epoch": 0.5194112627986348, + "grad_norm": 0.7066640899522243, + "learning_rate": 0.00011870810481413773, + "loss": 1.0256, + "step": 2435 + }, + { + "epoch": 0.5196245733788396, + "grad_norm": 0.7396016285207468, + "learning_rate": 0.00011875685557586838, + "loss": 0.9887, + "step": 2436 + }, + { + "epoch": 0.5198378839590444, + "grad_norm": 0.7646456457177313, + "learning_rate": 0.00011880560633759904, + "loss": 1.0198, + "step": 2437 + }, + { + "epoch": 0.5200511945392492, + "grad_norm": 0.7940118276916341, + "learning_rate": 0.00011885435709932968, + "loss": 1.0001, + "step": 2438 + }, + { + "epoch": 0.5202645051194539, + "grad_norm": 0.7802798390469149, + "learning_rate": 0.00011890310786106034, + "loss": 1.0103, + "step": 2439 + }, + { + "epoch": 0.5204778156996587, + "grad_norm": 0.685437136744317, + "learning_rate": 0.00011895185862279098, + "loss": 0.9666, + "step": 2440 + }, + { + "epoch": 0.5206911262798635, + "grad_norm": 0.5173569657350591, + "learning_rate": 0.00011900060938452165, + "loss": 1.0194, + "step": 2441 + }, + { + "epoch": 0.5209044368600683, + "grad_norm": 0.5217598239857686, + "learning_rate": 0.00011904936014625229, + "loss": 1.0042, + "step": 2442 + }, + { + "epoch": 0.5211177474402731, + "grad_norm": 0.575051938056623, + "learning_rate": 0.00011909811090798296, + "loss": 1.0044, + "step": 2443 + }, + { + "epoch": 0.5213310580204779, + "grad_norm": 0.5269751061494476, + "learning_rate": 0.0001191468616697136, + "loss": 1.0354, + "step": 2444 + }, + { + "epoch": 0.5215443686006825, + "grad_norm": 0.4973919309049569, + "learning_rate": 0.00011919561243144425, + "loss": 1.0041, + "step": 2445 + }, + { + "epoch": 0.5217576791808873, + "grad_norm": 0.5863224874264853, + "learning_rate": 0.0001192443631931749, + "loss": 0.9972, + "step": 2446 + }, + { + "epoch": 0.5219709897610921, + "grad_norm": 0.630298447471182, + "learning_rate": 0.00011929311395490555, + "loss": 1.0409, + "step": 2447 + }, + { + "epoch": 0.5221843003412969, + "grad_norm": 0.6186499332826402, + "learning_rate": 0.00011934186471663622, + "loss": 1.0443, + "step": 2448 + }, + { + "epoch": 0.5223976109215017, + "grad_norm": 0.5435560699752378, + "learning_rate": 0.00011939061547836686, + "loss": 0.988, + "step": 2449 + }, + { + "epoch": 0.5226109215017065, + "grad_norm": 0.4975978635221796, + "learning_rate": 0.0001194393662400975, + "loss": 1.0251, + "step": 2450 + }, + { + "epoch": 0.5228242320819113, + "grad_norm": 0.5543984606778384, + "learning_rate": 0.00011948811700182817, + "loss": 0.9945, + "step": 2451 + }, + { + "epoch": 0.523037542662116, + "grad_norm": 0.682206738967034, + "learning_rate": 0.0001195368677635588, + "loss": 1.0155, + "step": 2452 + }, + { + "epoch": 0.5232508532423208, + "grad_norm": 0.9642707595046226, + "learning_rate": 0.00011958561852528947, + "loss": 1.0257, + "step": 2453 + }, + { + "epoch": 0.5234641638225256, + "grad_norm": 1.2456020368760712, + "learning_rate": 0.00011963436928702013, + "loss": 1.0084, + "step": 2454 + }, + { + "epoch": 0.5236774744027304, + "grad_norm": 0.7576447409324196, + "learning_rate": 0.00011968312004875077, + "loss": 1.0111, + "step": 2455 + }, + { + "epoch": 0.5238907849829352, + "grad_norm": 0.6356379994030265, + "learning_rate": 0.00011973187081048143, + "loss": 0.9821, + "step": 2456 + }, + { + "epoch": 0.52410409556314, + "grad_norm": 0.6169392760837983, + "learning_rate": 0.00011978062157221207, + "loss": 0.9876, + "step": 2457 + }, + { + "epoch": 0.5243174061433447, + "grad_norm": 0.4971897448516707, + "learning_rate": 0.00011982937233394274, + "loss": 1.0125, + "step": 2458 + }, + { + "epoch": 0.5245307167235495, + "grad_norm": 0.502890761673097, + "learning_rate": 0.00011987812309567338, + "loss": 1.0135, + "step": 2459 + }, + { + "epoch": 0.5247440273037542, + "grad_norm": 0.5973543816617644, + "learning_rate": 0.00011992687385740403, + "loss": 1.0127, + "step": 2460 + }, + { + "epoch": 0.524957337883959, + "grad_norm": 0.6635726385791813, + "learning_rate": 0.00011997562461913468, + "loss": 1.0252, + "step": 2461 + }, + { + "epoch": 0.5251706484641638, + "grad_norm": 0.7573928641363747, + "learning_rate": 0.00012002437538086534, + "loss": 1.0287, + "step": 2462 + }, + { + "epoch": 0.5253839590443686, + "grad_norm": 0.8244174832798528, + "learning_rate": 0.00012007312614259599, + "loss": 0.9821, + "step": 2463 + }, + { + "epoch": 0.5255972696245734, + "grad_norm": 1.0074603147056476, + "learning_rate": 0.00012012187690432664, + "loss": 0.9993, + "step": 2464 + }, + { + "epoch": 0.5258105802047781, + "grad_norm": 1.3530313439894133, + "learning_rate": 0.00012017062766605728, + "loss": 1.0187, + "step": 2465 + }, + { + "epoch": 0.5260238907849829, + "grad_norm": 0.5500796551789119, + "learning_rate": 0.00012021937842778795, + "loss": 1.0056, + "step": 2466 + }, + { + "epoch": 0.5262372013651877, + "grad_norm": 0.7422923586172187, + "learning_rate": 0.00012026812918951859, + "loss": 1.0459, + "step": 2467 + }, + { + "epoch": 0.5264505119453925, + "grad_norm": 1.4609995335386323, + "learning_rate": 0.00012031687995124925, + "loss": 1.006, + "step": 2468 + }, + { + "epoch": 0.5266638225255973, + "grad_norm": 0.5298750440489782, + "learning_rate": 0.0001203656307129799, + "loss": 0.9989, + "step": 2469 + }, + { + "epoch": 0.5268771331058021, + "grad_norm": 1.1210812114357394, + "learning_rate": 0.00012041438147471055, + "loss": 1.0149, + "step": 2470 + }, + { + "epoch": 0.5270904436860068, + "grad_norm": 1.3549755406504769, + "learning_rate": 0.00012046313223644121, + "loss": 0.995, + "step": 2471 + }, + { + "epoch": 0.5273037542662116, + "grad_norm": 0.8021910516249856, + "learning_rate": 0.00012051188299817185, + "loss": 0.9788, + "step": 2472 + }, + { + "epoch": 0.5275170648464164, + "grad_norm": 1.2016615493935388, + "learning_rate": 0.00012056063375990252, + "loss": 1.0233, + "step": 2473 + }, + { + "epoch": 0.5277303754266212, + "grad_norm": 0.7505633951652448, + "learning_rate": 0.00012060938452163316, + "loss": 1.0531, + "step": 2474 + }, + { + "epoch": 0.527943686006826, + "grad_norm": 1.0914900997572652, + "learning_rate": 0.0001206581352833638, + "loss": 1.0204, + "step": 2475 + }, + { + "epoch": 0.5281569965870307, + "grad_norm": 0.8216344115691496, + "learning_rate": 0.00012070688604509447, + "loss": 0.9714, + "step": 2476 + }, + { + "epoch": 0.5283703071672355, + "grad_norm": 0.8407105010079386, + "learning_rate": 0.00012075563680682512, + "loss": 1.022, + "step": 2477 + }, + { + "epoch": 0.5285836177474402, + "grad_norm": 0.8132453574259426, + "learning_rate": 0.00012080438756855577, + "loss": 1.0147, + "step": 2478 + }, + { + "epoch": 0.528796928327645, + "grad_norm": 0.6478273642175693, + "learning_rate": 0.00012085313833028642, + "loss": 1.0318, + "step": 2479 + }, + { + "epoch": 0.5290102389078498, + "grad_norm": 0.6206429421841144, + "learning_rate": 0.00012090188909201706, + "loss": 0.9752, + "step": 2480 + }, + { + "epoch": 0.5292235494880546, + "grad_norm": 0.7319442801810583, + "learning_rate": 0.00012095063985374773, + "loss": 1.0108, + "step": 2481 + }, + { + "epoch": 0.5294368600682594, + "grad_norm": 0.8705496998889662, + "learning_rate": 0.00012099939061547837, + "loss": 0.996, + "step": 2482 + }, + { + "epoch": 0.5296501706484642, + "grad_norm": 1.128719553220549, + "learning_rate": 0.00012104814137720904, + "loss": 0.9763, + "step": 2483 + }, + { + "epoch": 0.5298634812286689, + "grad_norm": 0.8787872090264511, + "learning_rate": 0.00012109689213893968, + "loss": 0.984, + "step": 2484 + }, + { + "epoch": 0.5300767918088737, + "grad_norm": 0.6522550013895999, + "learning_rate": 0.00012114564290067034, + "loss": 1.0077, + "step": 2485 + }, + { + "epoch": 0.5302901023890785, + "grad_norm": 0.48075051731890384, + "learning_rate": 0.00012119439366240098, + "loss": 1.0003, + "step": 2486 + }, + { + "epoch": 0.5305034129692833, + "grad_norm": 2.475963232471856, + "learning_rate": 0.00012124314442413163, + "loss": 1.0422, + "step": 2487 + }, + { + "epoch": 0.5307167235494881, + "grad_norm": 0.7119679185399157, + "learning_rate": 0.00012129189518586229, + "loss": 0.9921, + "step": 2488 + }, + { + "epoch": 0.5309300341296929, + "grad_norm": 1.236403049441923, + "learning_rate": 0.00012134064594759294, + "loss": 1.054, + "step": 2489 + }, + { + "epoch": 0.5311433447098977, + "grad_norm": 0.6259821977843881, + "learning_rate": 0.00012138939670932361, + "loss": 1.0218, + "step": 2490 + }, + { + "epoch": 0.5313566552901023, + "grad_norm": 0.8375793310329945, + "learning_rate": 0.00012143814747105425, + "loss": 0.9913, + "step": 2491 + }, + { + "epoch": 0.5315699658703071, + "grad_norm": 0.9334615471677284, + "learning_rate": 0.00012148689823278489, + "loss": 1.0348, + "step": 2492 + }, + { + "epoch": 0.5317832764505119, + "grad_norm": 7.686638856626982, + "learning_rate": 0.00012153564899451555, + "loss": 0.9929, + "step": 2493 + }, + { + "epoch": 0.5319965870307167, + "grad_norm": 1.7735179298464705, + "learning_rate": 0.00012158439975624619, + "loss": 1.0072, + "step": 2494 + }, + { + "epoch": 0.5322098976109215, + "grad_norm": 20.96052162682905, + "learning_rate": 0.00012163315051797686, + "loss": 1.146, + "step": 2495 + }, + { + "epoch": 0.5324232081911263, + "grad_norm": 635.1521786805001, + "learning_rate": 0.00012168190127970751, + "loss": 4.476, + "step": 2496 + }, + { + "epoch": 0.5326365187713311, + "grad_norm": 67.19986605687195, + "learning_rate": 0.00012173065204143815, + "loss": 4.0, + "step": 2497 + }, + { + "epoch": 0.5328498293515358, + "grad_norm": 1566.44185519621, + "learning_rate": 0.00012177940280316882, + "loss": 61.2391, + "step": 2498 + }, + { + "epoch": 0.5330631399317406, + "grad_norm": 200.58896477935323, + "learning_rate": 0.00012182815356489946, + "loss": 23.3234, + "step": 2499 + }, + { + "epoch": 0.5332764505119454, + "grad_norm": 101.48156661863493, + "learning_rate": 0.00012187690432663012, + "loss": 20.1228, + "step": 2500 + }, + { + "epoch": 0.5334897610921502, + "grad_norm": 92.53550116425303, + "learning_rate": 0.00012192565508836076, + "loss": 18.0399, + "step": 2501 + }, + { + "epoch": 0.533703071672355, + "grad_norm": 84.56496101385594, + "learning_rate": 0.00012197440585009142, + "loss": 15.7392, + "step": 2502 + }, + { + "epoch": 0.5339163822525598, + "grad_norm": 79.7468234196724, + "learning_rate": 0.00012202315661182207, + "loss": 13.0681, + "step": 2503 + }, + { + "epoch": 0.5341296928327645, + "grad_norm": 53.518990143146, + "learning_rate": 0.00012207190737355272, + "loss": 10.2211, + "step": 2504 + }, + { + "epoch": 0.5343430034129693, + "grad_norm": 123.0549074203215, + "learning_rate": 0.0001221206581352834, + "loss": 13.8703, + "step": 2505 + }, + { + "epoch": 0.534556313993174, + "grad_norm": 79.62228736466163, + "learning_rate": 0.00012216940889701403, + "loss": 11.3262, + "step": 2506 + }, + { + "epoch": 0.5347696245733788, + "grad_norm": 67.04337708391058, + "learning_rate": 0.00012221815965874467, + "loss": 10.1782, + "step": 2507 + }, + { + "epoch": 0.5349829351535836, + "grad_norm": 62.30892604665798, + "learning_rate": 0.00012226691042047533, + "loss": 9.5646, + "step": 2508 + }, + { + "epoch": 0.5351962457337884, + "grad_norm": 57.70047839410062, + "learning_rate": 0.00012231566118220597, + "loss": 8.9616, + "step": 2509 + }, + { + "epoch": 0.5354095563139932, + "grad_norm": 45.54767696979588, + "learning_rate": 0.00012236441194393664, + "loss": 8.4399, + "step": 2510 + }, + { + "epoch": 0.5356228668941979, + "grad_norm": 22.375538165610436, + "learning_rate": 0.00012241316270566728, + "loss": 7.9139, + "step": 2511 + }, + { + "epoch": 0.5358361774744027, + "grad_norm": 21.866041267240906, + "learning_rate": 0.00012246191346739792, + "loss": 7.8609, + "step": 2512 + }, + { + "epoch": 0.5360494880546075, + "grad_norm": 16.04820115783179, + "learning_rate": 0.00012251066422912859, + "loss": 7.6049, + "step": 2513 + }, + { + "epoch": 0.5362627986348123, + "grad_norm": 14.658290825710123, + "learning_rate": 0.00012255941499085923, + "loss": 7.4217, + "step": 2514 + }, + { + "epoch": 0.5364761092150171, + "grad_norm": 10.293428144707702, + "learning_rate": 0.0001226081657525899, + "loss": 7.2593, + "step": 2515 + }, + { + "epoch": 0.5366894197952219, + "grad_norm": 10.481265562005593, + "learning_rate": 0.00012265691651432056, + "loss": 7.0615, + "step": 2516 + }, + { + "epoch": 0.5369027303754266, + "grad_norm": 11.602965290144205, + "learning_rate": 0.0001227056672760512, + "loss": 6.8653, + "step": 2517 + }, + { + "epoch": 0.5371160409556314, + "grad_norm": 8.7455571780955, + "learning_rate": 0.00012275441803778186, + "loss": 6.7995, + "step": 2518 + }, + { + "epoch": 0.5373293515358362, + "grad_norm": 7.759457967659353, + "learning_rate": 0.0001228031687995125, + "loss": 6.7291, + "step": 2519 + }, + { + "epoch": 0.537542662116041, + "grad_norm": 9.44120257491374, + "learning_rate": 0.00012285191956124317, + "loss": 6.6163, + "step": 2520 + }, + { + "epoch": 0.5377559726962458, + "grad_norm": 7.227454038061502, + "learning_rate": 0.0001229006703229738, + "loss": 6.6242, + "step": 2521 + }, + { + "epoch": 0.5379692832764505, + "grad_norm": 7.100964087652314, + "learning_rate": 0.00012294942108470445, + "loss": 6.49, + "step": 2522 + }, + { + "epoch": 0.5381825938566553, + "grad_norm": 6.5621886058990935, + "learning_rate": 0.00012299817184643512, + "loss": 6.3657, + "step": 2523 + }, + { + "epoch": 0.53839590443686, + "grad_norm": 5.177293772721369, + "learning_rate": 0.00012304692260816576, + "loss": 6.3336, + "step": 2524 + }, + { + "epoch": 0.5386092150170648, + "grad_norm": 4.519303571721748, + "learning_rate": 0.00012309567336989642, + "loss": 6.3626, + "step": 2525 + }, + { + "epoch": 0.5388225255972696, + "grad_norm": 4.186176202228657, + "learning_rate": 0.00012314442413162706, + "loss": 6.3121, + "step": 2526 + }, + { + "epoch": 0.5390358361774744, + "grad_norm": 4.672366986919028, + "learning_rate": 0.00012319317489335773, + "loss": 6.2718, + "step": 2527 + }, + { + "epoch": 0.5392491467576792, + "grad_norm": 5.43587464398596, + "learning_rate": 0.00012324192565508837, + "loss": 6.2245, + "step": 2528 + }, + { + "epoch": 0.539462457337884, + "grad_norm": 5.442593608157253, + "learning_rate": 0.000123290676416819, + "loss": 6.2221, + "step": 2529 + }, + { + "epoch": 0.5396757679180887, + "grad_norm": 4.7040355880521885, + "learning_rate": 0.00012333942717854967, + "loss": 6.1596, + "step": 2530 + }, + { + "epoch": 0.5398890784982935, + "grad_norm": 5.115918521361042, + "learning_rate": 0.0001233881779402803, + "loss": 6.1146, + "step": 2531 + }, + { + "epoch": 0.5401023890784983, + "grad_norm": 3.463700823948877, + "learning_rate": 0.00012343692870201098, + "loss": 6.0953, + "step": 2532 + }, + { + "epoch": 0.5403156996587031, + "grad_norm": 6.711080141740237, + "learning_rate": 0.00012348567946374162, + "loss": 6.1079, + "step": 2533 + }, + { + "epoch": 0.5405290102389079, + "grad_norm": 4.371747113380626, + "learning_rate": 0.00012353443022547229, + "loss": 6.0297, + "step": 2534 + }, + { + "epoch": 0.5407423208191127, + "grad_norm": 7.966949131040388, + "learning_rate": 0.00012358318098720295, + "loss": 6.0133, + "step": 2535 + }, + { + "epoch": 0.5409556313993175, + "grad_norm": 4.785938128854431, + "learning_rate": 0.0001236319317489336, + "loss": 5.8988, + "step": 2536 + }, + { + "epoch": 0.5411689419795221, + "grad_norm": 10.013298924730632, + "learning_rate": 0.00012368068251066426, + "loss": 6.0031, + "step": 2537 + }, + { + "epoch": 0.5413822525597269, + "grad_norm": 3.907020200286559, + "learning_rate": 0.0001237294332723949, + "loss": 5.9245, + "step": 2538 + }, + { + "epoch": 0.5415955631399317, + "grad_norm": 6.624555643898558, + "learning_rate": 0.00012377818403412554, + "loss": 5.856, + "step": 2539 + }, + { + "epoch": 0.5418088737201365, + "grad_norm": 4.5528247770177055, + "learning_rate": 0.0001238269347958562, + "loss": 5.9103, + "step": 2540 + }, + { + "epoch": 0.5420221843003413, + "grad_norm": 6.185872084833971, + "learning_rate": 0.00012387568555758684, + "loss": 5.7929, + "step": 2541 + }, + { + "epoch": 0.5422354948805461, + "grad_norm": 9.27795420496537, + "learning_rate": 0.0001239244363193175, + "loss": 5.822, + "step": 2542 + }, + { + "epoch": 0.5424488054607508, + "grad_norm": 4.746163043581523, + "learning_rate": 0.00012397318708104815, + "loss": 5.8171, + "step": 2543 + }, + { + "epoch": 0.5426621160409556, + "grad_norm": 7.085184594018184, + "learning_rate": 0.0001240219378427788, + "loss": 5.8565, + "step": 2544 + }, + { + "epoch": 0.5428754266211604, + "grad_norm": 6.0138357584928634, + "learning_rate": 0.00012407068860450946, + "loss": 5.7953, + "step": 2545 + }, + { + "epoch": 0.5430887372013652, + "grad_norm": 6.381732864566378, + "learning_rate": 0.0001241194393662401, + "loss": 5.7123, + "step": 2546 + }, + { + "epoch": 0.54330204778157, + "grad_norm": 5.085732650703209, + "learning_rate": 0.00012416819012797076, + "loss": 5.7362, + "step": 2547 + }, + { + "epoch": 0.5435153583617748, + "grad_norm": 6.086752812569313, + "learning_rate": 0.0001242169408897014, + "loss": 5.75, + "step": 2548 + }, + { + "epoch": 0.5437286689419796, + "grad_norm": 3.956712443869257, + "learning_rate": 0.00012426569165143207, + "loss": 5.7246, + "step": 2549 + }, + { + "epoch": 0.5439419795221843, + "grad_norm": 3.7811676877281677, + "learning_rate": 0.0001243144424131627, + "loss": 5.6362, + "step": 2550 + }, + { + "epoch": 0.544155290102389, + "grad_norm": 6.171192640079118, + "learning_rate": 0.00012436319317489337, + "loss": 5.6956, + "step": 2551 + }, + { + "epoch": 0.5443686006825939, + "grad_norm": 6.44429482817144, + "learning_rate": 0.000124411943936624, + "loss": 5.6732, + "step": 2552 + }, + { + "epoch": 0.5445819112627986, + "grad_norm": 4.310231391847143, + "learning_rate": 0.00012446069469835468, + "loss": 5.6207, + "step": 2553 + }, + { + "epoch": 0.5447952218430034, + "grad_norm": 5.648491520030647, + "learning_rate": 0.00012450944546008532, + "loss": 5.7024, + "step": 2554 + }, + { + "epoch": 0.5450085324232082, + "grad_norm": 5.009196007543663, + "learning_rate": 0.00012455819622181599, + "loss": 5.5981, + "step": 2555 + }, + { + "epoch": 0.5452218430034129, + "grad_norm": 8.389805062262056, + "learning_rate": 0.00012460694698354663, + "loss": 5.6309, + "step": 2556 + }, + { + "epoch": 0.5454351535836177, + "grad_norm": 5.529739883315418, + "learning_rate": 0.0001246556977452773, + "loss": 5.6266, + "step": 2557 + }, + { + "epoch": 0.5456484641638225, + "grad_norm": 4.46475923263434, + "learning_rate": 0.00012470444850700793, + "loss": 5.5768, + "step": 2558 + }, + { + "epoch": 0.5458617747440273, + "grad_norm": 4.611493757768206, + "learning_rate": 0.00012475319926873857, + "loss": 5.5118, + "step": 2559 + }, + { + "epoch": 0.5460750853242321, + "grad_norm": 3.918737228022115, + "learning_rate": 0.00012480195003046924, + "loss": 5.6088, + "step": 2560 + }, + { + "epoch": 0.5462883959044369, + "grad_norm": 4.259899143275343, + "learning_rate": 0.00012485070079219988, + "loss": 5.4479, + "step": 2561 + }, + { + "epoch": 0.5465017064846417, + "grad_norm": 5.912782958639605, + "learning_rate": 0.00012489945155393054, + "loss": 5.3795, + "step": 2562 + }, + { + "epoch": 0.5467150170648464, + "grad_norm": 8.342282198110052, + "learning_rate": 0.00012494820231566118, + "loss": 5.4254, + "step": 2563 + }, + { + "epoch": 0.5469283276450512, + "grad_norm": 4.76018532430801, + "learning_rate": 0.00012499695307739185, + "loss": 5.4549, + "step": 2564 + }, + { + "epoch": 0.547141638225256, + "grad_norm": 16.057288144732063, + "learning_rate": 0.0001250457038391225, + "loss": 5.6038, + "step": 2565 + }, + { + "epoch": 0.5473549488054608, + "grad_norm": 9.46731474525949, + "learning_rate": 0.00012509445460085316, + "loss": 5.4993, + "step": 2566 + }, + { + "epoch": 0.5475682593856656, + "grad_norm": 7.822984660299398, + "learning_rate": 0.0001251432053625838, + "loss": 5.3831, + "step": 2567 + }, + { + "epoch": 0.5477815699658704, + "grad_norm": 7.397997486689299, + "learning_rate": 0.00012519195612431446, + "loss": 5.4368, + "step": 2568 + }, + { + "epoch": 0.547994880546075, + "grad_norm": 6.979349317537846, + "learning_rate": 0.0001252407068860451, + "loss": 5.4003, + "step": 2569 + }, + { + "epoch": 0.5482081911262798, + "grad_norm": 4.719611606366249, + "learning_rate": 0.00012528945764777577, + "loss": 5.3966, + "step": 2570 + }, + { + "epoch": 0.5484215017064846, + "grad_norm": 5.864313920644723, + "learning_rate": 0.0001253382084095064, + "loss": 5.3424, + "step": 2571 + }, + { + "epoch": 0.5486348122866894, + "grad_norm": 5.0142776555247135, + "learning_rate": 0.00012538695917123707, + "loss": 5.3482, + "step": 2572 + }, + { + "epoch": 0.5488481228668942, + "grad_norm": 4.853112308180169, + "learning_rate": 0.0001254357099329677, + "loss": 5.3874, + "step": 2573 + }, + { + "epoch": 0.549061433447099, + "grad_norm": 4.419631546035831, + "learning_rate": 0.00012548446069469838, + "loss": 5.3105, + "step": 2574 + }, + { + "epoch": 0.5492747440273038, + "grad_norm": 4.399708359551792, + "learning_rate": 0.00012553321145642902, + "loss": 5.3374, + "step": 2575 + }, + { + "epoch": 0.5494880546075085, + "grad_norm": 5.281808058135433, + "learning_rate": 0.00012558196221815966, + "loss": 5.284, + "step": 2576 + }, + { + "epoch": 0.5497013651877133, + "grad_norm": 5.483255815854108, + "learning_rate": 0.00012563071297989033, + "loss": 5.2067, + "step": 2577 + }, + { + "epoch": 0.5499146757679181, + "grad_norm": 5.094099750744671, + "learning_rate": 0.00012567946374162096, + "loss": 5.1797, + "step": 2578 + }, + { + "epoch": 0.5501279863481229, + "grad_norm": 3.0153222370501265, + "learning_rate": 0.00012572821450335163, + "loss": 5.2081, + "step": 2579 + }, + { + "epoch": 0.5503412969283277, + "grad_norm": 5.343787037092409, + "learning_rate": 0.00012577696526508227, + "loss": 5.2702, + "step": 2580 + }, + { + "epoch": 0.5505546075085325, + "grad_norm": 3.8144083603336254, + "learning_rate": 0.0001258257160268129, + "loss": 5.1437, + "step": 2581 + }, + { + "epoch": 0.5507679180887372, + "grad_norm": 2.9314335667944524, + "learning_rate": 0.00012587446678854358, + "loss": 5.1148, + "step": 2582 + }, + { + "epoch": 0.550981228668942, + "grad_norm": 2.932784932740827, + "learning_rate": 0.00012592321755027424, + "loss": 5.0969, + "step": 2583 + }, + { + "epoch": 0.5511945392491467, + "grad_norm": 2.9949232746296954, + "learning_rate": 0.00012597196831200488, + "loss": 5.1382, + "step": 2584 + }, + { + "epoch": 0.5514078498293515, + "grad_norm": 2.5344264705134827, + "learning_rate": 0.00012602071907373555, + "loss": 5.1396, + "step": 2585 + }, + { + "epoch": 0.5516211604095563, + "grad_norm": 3.282930029692622, + "learning_rate": 0.0001260694698354662, + "loss": 5.1181, + "step": 2586 + }, + { + "epoch": 0.5518344709897611, + "grad_norm": 5.40688080143008, + "learning_rate": 0.00012611822059719686, + "loss": 5.1129, + "step": 2587 + }, + { + "epoch": 0.5520477815699659, + "grad_norm": 2.3899592973187533, + "learning_rate": 0.0001261669713589275, + "loss": 5.043, + "step": 2588 + }, + { + "epoch": 0.5522610921501706, + "grad_norm": 13.25226733965152, + "learning_rate": 0.00012621572212065816, + "loss": 5.3289, + "step": 2589 + }, + { + "epoch": 0.5524744027303754, + "grad_norm": 6.005787636627209, + "learning_rate": 0.0001262644728823888, + "loss": 5.032, + "step": 2590 + }, + { + "epoch": 0.5526877133105802, + "grad_norm": 5.720023099797669, + "learning_rate": 0.00012631322364411944, + "loss": 5.0703, + "step": 2591 + }, + { + "epoch": 0.552901023890785, + "grad_norm": 7.512072836582748, + "learning_rate": 0.0001263619744058501, + "loss": 5.1057, + "step": 2592 + }, + { + "epoch": 0.5531143344709898, + "grad_norm": 4.473457122004175, + "learning_rate": 0.00012641072516758075, + "loss": 5.1064, + "step": 2593 + }, + { + "epoch": 0.5533276450511946, + "grad_norm": 5.149429238230234, + "learning_rate": 0.0001264594759293114, + "loss": 4.9251, + "step": 2594 + }, + { + "epoch": 0.5535409556313993, + "grad_norm": 5.471579968522952, + "learning_rate": 0.00012650822669104205, + "loss": 5.0823, + "step": 2595 + }, + { + "epoch": 0.5537542662116041, + "grad_norm": 3.668583202906193, + "learning_rate": 0.0001265569774527727, + "loss": 4.9065, + "step": 2596 + }, + { + "epoch": 0.5539675767918089, + "grad_norm": 3.2615297013645406, + "learning_rate": 0.00012660572821450336, + "loss": 4.9424, + "step": 2597 + }, + { + "epoch": 0.5541808873720137, + "grad_norm": 3.2695212674244525, + "learning_rate": 0.000126654478976234, + "loss": 4.9127, + "step": 2598 + }, + { + "epoch": 0.5543941979522184, + "grad_norm": 3.217404533679879, + "learning_rate": 0.00012670322973796466, + "loss": 4.9502, + "step": 2599 + }, + { + "epoch": 0.5546075085324232, + "grad_norm": 2.680844606347315, + "learning_rate": 0.0001267519804996953, + "loss": 4.9639, + "step": 2600 + }, + { + "epoch": 0.554820819112628, + "grad_norm": 5.040883166308233, + "learning_rate": 0.00012680073126142597, + "loss": 4.8828, + "step": 2601 + }, + { + "epoch": 0.5550341296928327, + "grad_norm": 5.401253397357504, + "learning_rate": 0.00012684948202315664, + "loss": 4.9058, + "step": 2602 + }, + { + "epoch": 0.5552474402730375, + "grad_norm": 3.2256827735049023, + "learning_rate": 0.00012689823278488728, + "loss": 4.8157, + "step": 2603 + }, + { + "epoch": 0.5554607508532423, + "grad_norm": 6.225507994909579, + "learning_rate": 0.00012694698354661794, + "loss": 4.932, + "step": 2604 + }, + { + "epoch": 0.5556740614334471, + "grad_norm": 3.814064461929405, + "learning_rate": 0.00012699573430834858, + "loss": 4.8667, + "step": 2605 + }, + { + "epoch": 0.5558873720136519, + "grad_norm": 3.670184942178583, + "learning_rate": 0.00012704448507007922, + "loss": 4.862, + "step": 2606 + }, + { + "epoch": 0.5561006825938567, + "grad_norm": 4.085197686633542, + "learning_rate": 0.0001270932358318099, + "loss": 4.8077, + "step": 2607 + }, + { + "epoch": 0.5563139931740614, + "grad_norm": 4.002683353604945, + "learning_rate": 0.00012714198659354053, + "loss": 4.741, + "step": 2608 + }, + { + "epoch": 0.5565273037542662, + "grad_norm": 2.5900241891133673, + "learning_rate": 0.0001271907373552712, + "loss": 4.7451, + "step": 2609 + }, + { + "epoch": 0.556740614334471, + "grad_norm": 5.322383910811317, + "learning_rate": 0.00012723948811700183, + "loss": 4.7311, + "step": 2610 + }, + { + "epoch": 0.5569539249146758, + "grad_norm": 3.6016054606731256, + "learning_rate": 0.00012728823887873247, + "loss": 4.7695, + "step": 2611 + }, + { + "epoch": 0.5571672354948806, + "grad_norm": 4.2443256967758884, + "learning_rate": 0.00012733698964046314, + "loss": 4.6795, + "step": 2612 + }, + { + "epoch": 0.5573805460750854, + "grad_norm": 3.0977860259802354, + "learning_rate": 0.00012738574040219378, + "loss": 4.6988, + "step": 2613 + }, + { + "epoch": 0.5575938566552902, + "grad_norm": 3.233149482976536, + "learning_rate": 0.00012743449116392445, + "loss": 4.6548, + "step": 2614 + }, + { + "epoch": 0.5578071672354948, + "grad_norm": 2.5416988169600137, + "learning_rate": 0.00012748324192565509, + "loss": 4.6127, + "step": 2615 + }, + { + "epoch": 0.5580204778156996, + "grad_norm": 2.5316365103389034, + "learning_rate": 0.00012753199268738575, + "loss": 4.6276, + "step": 2616 + }, + { + "epoch": 0.5582337883959044, + "grad_norm": 2.7889645639367573, + "learning_rate": 0.0001275807434491164, + "loss": 4.6288, + "step": 2617 + }, + { + "epoch": 0.5584470989761092, + "grad_norm": 2.2394045880012072, + "learning_rate": 0.00012762949421084706, + "loss": 4.5797, + "step": 2618 + }, + { + "epoch": 0.558660409556314, + "grad_norm": 3.9489889941065703, + "learning_rate": 0.00012767824497257772, + "loss": 4.6195, + "step": 2619 + }, + { + "epoch": 0.5588737201365188, + "grad_norm": 4.39867315164555, + "learning_rate": 0.00012772699573430836, + "loss": 4.6537, + "step": 2620 + }, + { + "epoch": 0.5590870307167235, + "grad_norm": 3.164997765937845, + "learning_rate": 0.00012777574649603903, + "loss": 4.5207, + "step": 2621 + }, + { + "epoch": 0.5593003412969283, + "grad_norm": 3.2714895524592533, + "learning_rate": 0.00012782449725776967, + "loss": 4.5549, + "step": 2622 + }, + { + "epoch": 0.5595136518771331, + "grad_norm": 3.7411498508095105, + "learning_rate": 0.0001278732480195003, + "loss": 4.5201, + "step": 2623 + }, + { + "epoch": 0.5597269624573379, + "grad_norm": 5.659807631211137, + "learning_rate": 0.00012792199878123098, + "loss": 4.5652, + "step": 2624 + }, + { + "epoch": 0.5599402730375427, + "grad_norm": 2.8771987430203856, + "learning_rate": 0.00012797074954296162, + "loss": 4.5356, + "step": 2625 + }, + { + "epoch": 0.5601535836177475, + "grad_norm": 3.1111903722058187, + "learning_rate": 0.00012801950030469228, + "loss": 4.5344, + "step": 2626 + }, + { + "epoch": 0.5603668941979523, + "grad_norm": 4.882026075104041, + "learning_rate": 0.00012806825106642292, + "loss": 4.5348, + "step": 2627 + }, + { + "epoch": 0.560580204778157, + "grad_norm": 2.8038471772190148, + "learning_rate": 0.00012811700182815356, + "loss": 4.5117, + "step": 2628 + }, + { + "epoch": 0.5607935153583617, + "grad_norm": 5.195536162808856, + "learning_rate": 0.00012816575258988423, + "loss": 4.5896, + "step": 2629 + }, + { + "epoch": 0.5610068259385665, + "grad_norm": 3.2879181295014144, + "learning_rate": 0.00012821450335161487, + "loss": 4.4731, + "step": 2630 + }, + { + "epoch": 0.5612201365187713, + "grad_norm": 5.805817264329025, + "learning_rate": 0.00012826325411334553, + "loss": 4.4914, + "step": 2631 + }, + { + "epoch": 0.5614334470989761, + "grad_norm": 3.3995857219047894, + "learning_rate": 0.00012831200487507617, + "loss": 4.4919, + "step": 2632 + }, + { + "epoch": 0.5616467576791809, + "grad_norm": 3.9050036600622082, + "learning_rate": 0.00012836075563680684, + "loss": 4.5073, + "step": 2633 + }, + { + "epoch": 0.5618600682593856, + "grad_norm": 3.7861017874028535, + "learning_rate": 0.00012840950639853748, + "loss": 4.5245, + "step": 2634 + }, + { + "epoch": 0.5620733788395904, + "grad_norm": 3.4190250227031083, + "learning_rate": 0.00012845825716026815, + "loss": 4.3859, + "step": 2635 + }, + { + "epoch": 0.5622866894197952, + "grad_norm": 3.163081283233997, + "learning_rate": 0.00012850700792199879, + "loss": 4.4955, + "step": 2636 + }, + { + "epoch": 0.5625, + "grad_norm": 3.0223383229482192, + "learning_rate": 0.00012855575868372945, + "loss": 4.4206, + "step": 2637 + }, + { + "epoch": 0.5627133105802048, + "grad_norm": 2.471307511997737, + "learning_rate": 0.0001286045094454601, + "loss": 4.3489, + "step": 2638 + }, + { + "epoch": 0.5629266211604096, + "grad_norm": 4.128770819033029, + "learning_rate": 0.00012865326020719076, + "loss": 4.441, + "step": 2639 + }, + { + "epoch": 0.5631399317406144, + "grad_norm": 3.098316920423577, + "learning_rate": 0.0001287020109689214, + "loss": 4.419, + "step": 2640 + }, + { + "epoch": 0.5633532423208191, + "grad_norm": 3.0317005934380203, + "learning_rate": 0.00012875076173065206, + "loss": 4.3079, + "step": 2641 + }, + { + "epoch": 0.5635665529010239, + "grad_norm": 2.241701967021923, + "learning_rate": 0.0001287995124923827, + "loss": 4.4322, + "step": 2642 + }, + { + "epoch": 0.5637798634812287, + "grad_norm": 3.796311752902383, + "learning_rate": 0.00012884826325411334, + "loss": 4.33, + "step": 2643 + }, + { + "epoch": 0.5639931740614335, + "grad_norm": 3.3595585905197747, + "learning_rate": 0.000128897014015844, + "loss": 4.3539, + "step": 2644 + }, + { + "epoch": 0.5642064846416383, + "grad_norm": 3.283923477792092, + "learning_rate": 0.00012894576477757465, + "loss": 4.2528, + "step": 2645 + }, + { + "epoch": 0.564419795221843, + "grad_norm": 4.434479112680468, + "learning_rate": 0.00012899451553930532, + "loss": 4.3445, + "step": 2646 + }, + { + "epoch": 0.5646331058020477, + "grad_norm": 2.7717001946025457, + "learning_rate": 0.00012904326630103596, + "loss": 4.2144, + "step": 2647 + }, + { + "epoch": 0.5648464163822525, + "grad_norm": 4.257929445495832, + "learning_rate": 0.0001290920170627666, + "loss": 4.2523, + "step": 2648 + }, + { + "epoch": 0.5650597269624573, + "grad_norm": 3.714642985949812, + "learning_rate": 0.00012914076782449726, + "loss": 4.2211, + "step": 2649 + }, + { + "epoch": 0.5652730375426621, + "grad_norm": 1.9792564581801182, + "learning_rate": 0.00012918951858622793, + "loss": 4.1867, + "step": 2650 + }, + { + "epoch": 0.5654863481228669, + "grad_norm": 3.902383617839633, + "learning_rate": 0.00012923826934795857, + "loss": 4.2119, + "step": 2651 + }, + { + "epoch": 0.5656996587030717, + "grad_norm": 4.724288086957093, + "learning_rate": 0.00012928702010968923, + "loss": 4.245, + "step": 2652 + }, + { + "epoch": 0.5659129692832765, + "grad_norm": 2.937014808479551, + "learning_rate": 0.00012933577087141987, + "loss": 4.1372, + "step": 2653 + }, + { + "epoch": 0.5661262798634812, + "grad_norm": 8.099479380707221, + "learning_rate": 0.00012938452163315054, + "loss": 4.3231, + "step": 2654 + }, + { + "epoch": 0.566339590443686, + "grad_norm": 4.227091744110868, + "learning_rate": 0.00012943327239488118, + "loss": 4.1618, + "step": 2655 + }, + { + "epoch": 0.5665529010238908, + "grad_norm": 5.106026115879134, + "learning_rate": 0.00012948202315661185, + "loss": 4.2298, + "step": 2656 + }, + { + "epoch": 0.5667662116040956, + "grad_norm": 3.9363663340541697, + "learning_rate": 0.00012953077391834249, + "loss": 4.2171, + "step": 2657 + }, + { + "epoch": 0.5669795221843004, + "grad_norm": 4.373932681087029, + "learning_rate": 0.00012957952468007315, + "loss": 4.2558, + "step": 2658 + }, + { + "epoch": 0.5671928327645052, + "grad_norm": 5.043795099291689, + "learning_rate": 0.0001296282754418038, + "loss": 4.2071, + "step": 2659 + }, + { + "epoch": 0.5674061433447098, + "grad_norm": 4.320353108025662, + "learning_rate": 0.00012967702620353443, + "loss": 4.1225, + "step": 2660 + }, + { + "epoch": 0.5676194539249146, + "grad_norm": 3.4163268780073768, + "learning_rate": 0.0001297257769652651, + "loss": 4.0954, + "step": 2661 + }, + { + "epoch": 0.5678327645051194, + "grad_norm": 2.933291043968019, + "learning_rate": 0.00012977452772699574, + "loss": 4.0718, + "step": 2662 + }, + { + "epoch": 0.5680460750853242, + "grad_norm": 4.078980517631502, + "learning_rate": 0.0001298232784887264, + "loss": 4.0106, + "step": 2663 + }, + { + "epoch": 0.568259385665529, + "grad_norm": 3.110228086350965, + "learning_rate": 0.00012987202925045704, + "loss": 4.0327, + "step": 2664 + }, + { + "epoch": 0.5684726962457338, + "grad_norm": 2.479661486102455, + "learning_rate": 0.00012992078001218768, + "loss": 4.0195, + "step": 2665 + }, + { + "epoch": 0.5686860068259386, + "grad_norm": 3.3958160710382725, + "learning_rate": 0.00012996953077391835, + "loss": 4.0092, + "step": 2666 + }, + { + "epoch": 0.5688993174061433, + "grad_norm": 2.527541577653811, + "learning_rate": 0.00013001828153564902, + "loss": 4.0976, + "step": 2667 + }, + { + "epoch": 0.5691126279863481, + "grad_norm": 2.8709562688247336, + "learning_rate": 0.00013006703229737966, + "loss": 4.081, + "step": 2668 + }, + { + "epoch": 0.5693259385665529, + "grad_norm": 3.1011613177814623, + "learning_rate": 0.00013011578305911032, + "loss": 4.0383, + "step": 2669 + }, + { + "epoch": 0.5695392491467577, + "grad_norm": 2.859396060404123, + "learning_rate": 0.00013016453382084096, + "loss": 4.0324, + "step": 2670 + }, + { + "epoch": 0.5697525597269625, + "grad_norm": 2.7674927505156925, + "learning_rate": 0.00013021328458257163, + "loss": 4.0438, + "step": 2671 + }, + { + "epoch": 0.5699658703071673, + "grad_norm": 2.304850019576951, + "learning_rate": 0.00013026203534430227, + "loss": 3.9394, + "step": 2672 + }, + { + "epoch": 0.570179180887372, + "grad_norm": 1.9560754370898683, + "learning_rate": 0.00013031078610603293, + "loss": 3.9143, + "step": 2673 + }, + { + "epoch": 0.5703924914675768, + "grad_norm": 3.1092031423122686, + "learning_rate": 0.00013035953686776357, + "loss": 3.9167, + "step": 2674 + }, + { + "epoch": 0.5706058020477816, + "grad_norm": 2.459246292161218, + "learning_rate": 0.0001304082876294942, + "loss": 3.9171, + "step": 2675 + }, + { + "epoch": 0.5708191126279863, + "grad_norm": 2.6229857079434473, + "learning_rate": 0.00013045703839122488, + "loss": 3.9343, + "step": 2676 + }, + { + "epoch": 0.5710324232081911, + "grad_norm": 2.907983799154221, + "learning_rate": 0.00013050578915295552, + "loss": 3.88, + "step": 2677 + }, + { + "epoch": 0.5712457337883959, + "grad_norm": 2.9235985085724168, + "learning_rate": 0.00013055453991468619, + "loss": 3.9492, + "step": 2678 + }, + { + "epoch": 0.5714590443686007, + "grad_norm": 3.226235483020458, + "learning_rate": 0.00013060329067641682, + "loss": 3.9201, + "step": 2679 + }, + { + "epoch": 0.5716723549488054, + "grad_norm": 2.216970424801603, + "learning_rate": 0.00013065204143814746, + "loss": 3.8539, + "step": 2680 + }, + { + "epoch": 0.5718856655290102, + "grad_norm": 2.795706202466177, + "learning_rate": 0.00013070079219987813, + "loss": 3.8056, + "step": 2681 + }, + { + "epoch": 0.572098976109215, + "grad_norm": 2.60052142781147, + "learning_rate": 0.00013074954296160877, + "loss": 3.8913, + "step": 2682 + }, + { + "epoch": 0.5723122866894198, + "grad_norm": 2.888927002685698, + "learning_rate": 0.00013079829372333944, + "loss": 3.841, + "step": 2683 + }, + { + "epoch": 0.5725255972696246, + "grad_norm": 2.5291387174425077, + "learning_rate": 0.00013084704448507008, + "loss": 3.7675, + "step": 2684 + }, + { + "epoch": 0.5727389078498294, + "grad_norm": 2.484642627546108, + "learning_rate": 0.00013089579524680074, + "loss": 3.7288, + "step": 2685 + }, + { + "epoch": 0.5729522184300341, + "grad_norm": 2.877122786355159, + "learning_rate": 0.0001309445460085314, + "loss": 3.8002, + "step": 2686 + }, + { + "epoch": 0.5731655290102389, + "grad_norm": 1.9723664599334383, + "learning_rate": 0.00013099329677026205, + "loss": 3.7746, + "step": 2687 + }, + { + "epoch": 0.5733788395904437, + "grad_norm": 2.950926693449889, + "learning_rate": 0.00013104204753199272, + "loss": 3.7515, + "step": 2688 + }, + { + "epoch": 0.5735921501706485, + "grad_norm": 3.2014554195214875, + "learning_rate": 0.00013109079829372336, + "loss": 3.7046, + "step": 2689 + }, + { + "epoch": 0.5738054607508533, + "grad_norm": 2.253222923146668, + "learning_rate": 0.000131139549055454, + "loss": 3.7038, + "step": 2690 + }, + { + "epoch": 0.574018771331058, + "grad_norm": 2.447987064195161, + "learning_rate": 0.00013118829981718466, + "loss": 3.6542, + "step": 2691 + }, + { + "epoch": 0.5742320819112628, + "grad_norm": 3.2944200811670235, + "learning_rate": 0.0001312370505789153, + "loss": 3.7184, + "step": 2692 + }, + { + "epoch": 0.5744453924914675, + "grad_norm": 2.8020701292659793, + "learning_rate": 0.00013128580134064597, + "loss": 3.7013, + "step": 2693 + }, + { + "epoch": 0.5746587030716723, + "grad_norm": 2.886689819287596, + "learning_rate": 0.0001313345521023766, + "loss": 3.7023, + "step": 2694 + }, + { + "epoch": 0.5748720136518771, + "grad_norm": 2.93647217622373, + "learning_rate": 0.00013138330286410725, + "loss": 3.579, + "step": 2695 + }, + { + "epoch": 0.5750853242320819, + "grad_norm": 2.4631016353595756, + "learning_rate": 0.0001314320536258379, + "loss": 3.5876, + "step": 2696 + }, + { + "epoch": 0.5752986348122867, + "grad_norm": 3.5081727439916195, + "learning_rate": 0.00013148080438756855, + "loss": 3.6039, + "step": 2697 + }, + { + "epoch": 0.5755119453924915, + "grad_norm": 2.903519020111772, + "learning_rate": 0.00013152955514929922, + "loss": 3.4582, + "step": 2698 + }, + { + "epoch": 0.5757252559726962, + "grad_norm": 4.657555886045659, + "learning_rate": 0.00013157830591102986, + "loss": 3.4713, + "step": 2699 + }, + { + "epoch": 0.575938566552901, + "grad_norm": 2.8085979934198093, + "learning_rate": 0.00013162705667276052, + "loss": 3.3947, + "step": 2700 + }, + { + "epoch": 0.5761518771331058, + "grad_norm": 3.2428155399238827, + "learning_rate": 0.00013167580743449116, + "loss": 3.3934, + "step": 2701 + }, + { + "epoch": 0.5763651877133106, + "grad_norm": 5.448648936466839, + "learning_rate": 0.00013172455819622183, + "loss": 3.316, + "step": 2702 + }, + { + "epoch": 0.5765784982935154, + "grad_norm": 4.585661485818883, + "learning_rate": 0.0001317733089579525, + "loss": 3.2406, + "step": 2703 + }, + { + "epoch": 0.5767918088737202, + "grad_norm": 4.585807677655258, + "learning_rate": 0.00013182205971968314, + "loss": 3.0358, + "step": 2704 + }, + { + "epoch": 0.577005119453925, + "grad_norm": 6.668449616564426, + "learning_rate": 0.0001318708104814138, + "loss": 2.9093, + "step": 2705 + }, + { + "epoch": 0.5772184300341296, + "grad_norm": 6.964318353046114, + "learning_rate": 0.00013191956124314444, + "loss": 2.5667, + "step": 2706 + }, + { + "epoch": 0.5774317406143344, + "grad_norm": 8.180393394753102, + "learning_rate": 0.00013196831200487508, + "loss": 2.4532, + "step": 2707 + }, + { + "epoch": 0.5776450511945392, + "grad_norm": 9.876935570431565, + "learning_rate": 0.00013201706276660575, + "loss": 2.2265, + "step": 2708 + }, + { + "epoch": 0.577858361774744, + "grad_norm": 77.35165361369985, + "learning_rate": 0.0001320658135283364, + "loss": 5.1405, + "step": 2709 + }, + { + "epoch": 0.5780716723549488, + "grad_norm": 17.124843584548696, + "learning_rate": 0.00013211456429006705, + "loss": 2.6998, + "step": 2710 + }, + { + "epoch": 0.5782849829351536, + "grad_norm": 4.992222692017077, + "learning_rate": 0.0001321633150517977, + "loss": 2.2399, + "step": 2711 + }, + { + "epoch": 0.5784982935153583, + "grad_norm": 3.7401121121554035, + "learning_rate": 0.00013221206581352833, + "loss": 1.8344, + "step": 2712 + }, + { + "epoch": 0.5787116040955631, + "grad_norm": 33.980132156102854, + "learning_rate": 0.000132260816575259, + "loss": 1.7161, + "step": 2713 + }, + { + "epoch": 0.5789249146757679, + "grad_norm": 6.800694591406287, + "learning_rate": 0.00013230956733698964, + "loss": 1.7432, + "step": 2714 + }, + { + "epoch": 0.5791382252559727, + "grad_norm": 2.0701577443585584, + "learning_rate": 0.0001323583180987203, + "loss": 1.498, + "step": 2715 + }, + { + "epoch": 0.5793515358361775, + "grad_norm": 1.9162797387662214, + "learning_rate": 0.00013240706886045095, + "loss": 1.3967, + "step": 2716 + }, + { + "epoch": 0.5795648464163823, + "grad_norm": 1.4925200613372087, + "learning_rate": 0.0001324558196221816, + "loss": 1.2837, + "step": 2717 + }, + { + "epoch": 0.5797781569965871, + "grad_norm": 1.5901165652554283, + "learning_rate": 0.00013250457038391225, + "loss": 1.2578, + "step": 2718 + }, + { + "epoch": 0.5799914675767918, + "grad_norm": 1.8165048979663048, + "learning_rate": 0.00013255332114564292, + "loss": 1.2068, + "step": 2719 + }, + { + "epoch": 0.5802047781569966, + "grad_norm": 2.4616583343157936, + "learning_rate": 0.00013260207190737356, + "loss": 1.1905, + "step": 2720 + }, + { + "epoch": 0.5804180887372014, + "grad_norm": 1.5334360249031942, + "learning_rate": 0.00013265082266910422, + "loss": 1.2281, + "step": 2721 + }, + { + "epoch": 0.5806313993174061, + "grad_norm": 1.3418158903000474, + "learning_rate": 0.00013269957343083486, + "loss": 1.2014, + "step": 2722 + }, + { + "epoch": 0.580844709897611, + "grad_norm": 1.0299352028040833, + "learning_rate": 0.00013274832419256553, + "loss": 1.15, + "step": 2723 + }, + { + "epoch": 0.5810580204778157, + "grad_norm": 1.6608876375598165, + "learning_rate": 0.00013279707495429617, + "loss": 1.1751, + "step": 2724 + }, + { + "epoch": 0.5812713310580204, + "grad_norm": 0.8957709182523708, + "learning_rate": 0.00013284582571602684, + "loss": 1.1734, + "step": 2725 + }, + { + "epoch": 0.5814846416382252, + "grad_norm": 1.061985549946898, + "learning_rate": 0.00013289457647775748, + "loss": 1.1347, + "step": 2726 + }, + { + "epoch": 0.58169795221843, + "grad_norm": 1.2705198447617814, + "learning_rate": 0.00013294332723948812, + "loss": 1.1473, + "step": 2727 + }, + { + "epoch": 0.5819112627986348, + "grad_norm": 0.8725152752029627, + "learning_rate": 0.00013299207800121878, + "loss": 1.1202, + "step": 2728 + }, + { + "epoch": 0.5821245733788396, + "grad_norm": 1.0225332834934322, + "learning_rate": 0.00013304082876294942, + "loss": 1.1435, + "step": 2729 + }, + { + "epoch": 0.5823378839590444, + "grad_norm": 0.8699967626587042, + "learning_rate": 0.0001330895795246801, + "loss": 1.1132, + "step": 2730 + }, + { + "epoch": 0.5825511945392492, + "grad_norm": 0.9378040784669547, + "learning_rate": 0.00013313833028641073, + "loss": 1.0364, + "step": 2731 + }, + { + "epoch": 0.5827645051194539, + "grad_norm": 1.0404205820354853, + "learning_rate": 0.00013318708104814137, + "loss": 1.1313, + "step": 2732 + }, + { + "epoch": 0.5829778156996587, + "grad_norm": 1.0461320323222993, + "learning_rate": 0.00013323583180987203, + "loss": 1.1032, + "step": 2733 + }, + { + "epoch": 0.5831911262798635, + "grad_norm": 1.0636076514815953, + "learning_rate": 0.0001332845825716027, + "loss": 1.0597, + "step": 2734 + }, + { + "epoch": 0.5834044368600683, + "grad_norm": 0.8535466944027515, + "learning_rate": 0.00013333333333333334, + "loss": 1.0754, + "step": 2735 + }, + { + "epoch": 0.5836177474402731, + "grad_norm": 0.5914644954034185, + "learning_rate": 0.000133382084095064, + "loss": 1.0509, + "step": 2736 + }, + { + "epoch": 0.5838310580204779, + "grad_norm": 0.7085356421970461, + "learning_rate": 0.00013343083485679465, + "loss": 1.0657, + "step": 2737 + }, + { + "epoch": 0.5840443686006825, + "grad_norm": 1.0239221653266106, + "learning_rate": 0.0001334795856185253, + "loss": 1.0981, + "step": 2738 + }, + { + "epoch": 0.5842576791808873, + "grad_norm": 0.9470678012645457, + "learning_rate": 0.00013352833638025595, + "loss": 1.0564, + "step": 2739 + }, + { + "epoch": 0.5844709897610921, + "grad_norm": 0.4987415901519101, + "learning_rate": 0.00013357708714198662, + "loss": 1.0751, + "step": 2740 + }, + { + "epoch": 0.5846843003412969, + "grad_norm": 0.5174298824696374, + "learning_rate": 0.00013362583790371726, + "loss": 1.0396, + "step": 2741 + }, + { + "epoch": 0.5848976109215017, + "grad_norm": 0.7023180084945061, + "learning_rate": 0.0001336745886654479, + "loss": 1.0442, + "step": 2742 + }, + { + "epoch": 0.5851109215017065, + "grad_norm": 0.7471631057325429, + "learning_rate": 0.00013372333942717856, + "loss": 1.0176, + "step": 2743 + }, + { + "epoch": 0.5853242320819113, + "grad_norm": 0.5064298984010045, + "learning_rate": 0.0001337720901889092, + "loss": 1.0403, + "step": 2744 + }, + { + "epoch": 0.585537542662116, + "grad_norm": 0.4183568913784291, + "learning_rate": 0.00013382084095063987, + "loss": 1.0339, + "step": 2745 + }, + { + "epoch": 0.5857508532423208, + "grad_norm": 0.579007756093451, + "learning_rate": 0.0001338695917123705, + "loss": 1.0175, + "step": 2746 + }, + { + "epoch": 0.5859641638225256, + "grad_norm": 0.6602474058087108, + "learning_rate": 0.00013391834247410118, + "loss": 1.0658, + "step": 2747 + }, + { + "epoch": 0.5861774744027304, + "grad_norm": 0.644274403822723, + "learning_rate": 0.00013396709323583182, + "loss": 1.088, + "step": 2748 + }, + { + "epoch": 0.5863907849829352, + "grad_norm": 0.4410896699968849, + "learning_rate": 0.00013401584399756245, + "loss": 1.0229, + "step": 2749 + }, + { + "epoch": 0.58660409556314, + "grad_norm": 0.5037991943012318, + "learning_rate": 0.00013406459475929312, + "loss": 1.0092, + "step": 2750 + }, + { + "epoch": 0.5868174061433447, + "grad_norm": 0.5565412988589755, + "learning_rate": 0.00013411334552102376, + "loss": 1.0437, + "step": 2751 + }, + { + "epoch": 0.5870307167235495, + "grad_norm": 0.462755703416745, + "learning_rate": 0.00013416209628275443, + "loss": 1.0288, + "step": 2752 + }, + { + "epoch": 0.5872440273037542, + "grad_norm": 0.5116802543644282, + "learning_rate": 0.0001342108470444851, + "loss": 1.0217, + "step": 2753 + }, + { + "epoch": 0.587457337883959, + "grad_norm": 0.4673210113463809, + "learning_rate": 0.00013425959780621573, + "loss": 1.0054, + "step": 2754 + }, + { + "epoch": 0.5876706484641638, + "grad_norm": 0.4226921371701016, + "learning_rate": 0.0001343083485679464, + "loss": 1.0348, + "step": 2755 + }, + { + "epoch": 0.5878839590443686, + "grad_norm": 0.41969766753619253, + "learning_rate": 0.00013435709932967704, + "loss": 1.0126, + "step": 2756 + }, + { + "epoch": 0.5880972696245734, + "grad_norm": 0.49980145932778575, + "learning_rate": 0.0001344058500914077, + "loss": 1.022, + "step": 2757 + }, + { + "epoch": 0.5883105802047781, + "grad_norm": 0.4368205894685108, + "learning_rate": 0.00013445460085313835, + "loss": 0.9766, + "step": 2758 + }, + { + "epoch": 0.5885238907849829, + "grad_norm": 0.4757520592622311, + "learning_rate": 0.00013450335161486899, + "loss": 1.0113, + "step": 2759 + }, + { + "epoch": 0.5887372013651877, + "grad_norm": 0.6173690156117512, + "learning_rate": 0.00013455210237659965, + "loss": 1.0294, + "step": 2760 + }, + { + "epoch": 0.5889505119453925, + "grad_norm": 0.8939050789347421, + "learning_rate": 0.0001346008531383303, + "loss": 1.0476, + "step": 2761 + }, + { + "epoch": 0.5891638225255973, + "grad_norm": 1.2274386205215282, + "learning_rate": 0.00013464960390006096, + "loss": 1.0015, + "step": 2762 + }, + { + "epoch": 0.5893771331058021, + "grad_norm": 0.8572244075180029, + "learning_rate": 0.0001346983546617916, + "loss": 1.0264, + "step": 2763 + }, + { + "epoch": 0.5895904436860068, + "grad_norm": 0.6572729647711105, + "learning_rate": 0.00013474710542352224, + "loss": 1.0266, + "step": 2764 + }, + { + "epoch": 0.5898037542662116, + "grad_norm": 0.6356376129438024, + "learning_rate": 0.0001347958561852529, + "loss": 1.0379, + "step": 2765 + }, + { + "epoch": 0.5900170648464164, + "grad_norm": 0.6296505389417719, + "learning_rate": 0.00013484460694698354, + "loss": 1.0818, + "step": 2766 + }, + { + "epoch": 0.5902303754266212, + "grad_norm": 0.79554286077278, + "learning_rate": 0.0001348933577087142, + "loss": 1.0458, + "step": 2767 + }, + { + "epoch": 0.590443686006826, + "grad_norm": 0.9141871950084572, + "learning_rate": 0.00013494210847044485, + "loss": 1.038, + "step": 2768 + }, + { + "epoch": 0.5906569965870307, + "grad_norm": 0.7912285714131365, + "learning_rate": 0.00013499085923217552, + "loss": 0.9886, + "step": 2769 + }, + { + "epoch": 0.5908703071672355, + "grad_norm": 0.6964570783309065, + "learning_rate": 0.00013503960999390618, + "loss": 1.0397, + "step": 2770 + }, + { + "epoch": 0.5910836177474402, + "grad_norm": 0.8413156757486563, + "learning_rate": 0.00013508836075563682, + "loss": 1.0208, + "step": 2771 + }, + { + "epoch": 0.591296928327645, + "grad_norm": 1.0133411958639154, + "learning_rate": 0.0001351371115173675, + "loss": 1.0226, + "step": 2772 + }, + { + "epoch": 0.5915102389078498, + "grad_norm": 1.248767677856154, + "learning_rate": 0.00013518586227909813, + "loss": 1.0114, + "step": 2773 + }, + { + "epoch": 0.5917235494880546, + "grad_norm": 0.810304614824334, + "learning_rate": 0.00013523461304082877, + "loss": 1.0111, + "step": 2774 + }, + { + "epoch": 0.5919368600682594, + "grad_norm": 0.660707286936272, + "learning_rate": 0.00013528336380255943, + "loss": 1.032, + "step": 2775 + }, + { + "epoch": 0.5921501706484642, + "grad_norm": 0.6477626496151722, + "learning_rate": 0.00013533211456429007, + "loss": 1.0488, + "step": 2776 + }, + { + "epoch": 0.5923634812286689, + "grad_norm": 0.7935354318847259, + "learning_rate": 0.00013538086532602074, + "loss": 0.9941, + "step": 2777 + }, + { + "epoch": 0.5925767918088737, + "grad_norm": 1.077674850905276, + "learning_rate": 0.00013542961608775138, + "loss": 1.0007, + "step": 2778 + }, + { + "epoch": 0.5927901023890785, + "grad_norm": 16.848793759385167, + "learning_rate": 0.00013547836684948202, + "loss": 1.0269, + "step": 2779 + }, + { + "epoch": 0.5930034129692833, + "grad_norm": 2.9547596988656815, + "learning_rate": 0.00013552711761121268, + "loss": 1.0258, + "step": 2780 + }, + { + "epoch": 0.5932167235494881, + "grad_norm": 0.7952194123005243, + "learning_rate": 0.00013557586837294332, + "loss": 1.0202, + "step": 2781 + }, + { + "epoch": 0.5934300341296929, + "grad_norm": 3.1900818878168127, + "learning_rate": 0.000135624619134674, + "loss": 1.0643, + "step": 2782 + }, + { + "epoch": 0.5936433447098977, + "grad_norm": 3.0714570483423156, + "learning_rate": 0.00013567336989640463, + "loss": 1.1332, + "step": 2783 + }, + { + "epoch": 0.5938566552901023, + "grad_norm": 0.9079272612008876, + "learning_rate": 0.0001357221206581353, + "loss": 1.0056, + "step": 2784 + }, + { + "epoch": 0.5940699658703071, + "grad_norm": 2.064439624595396, + "learning_rate": 0.00013577087141986594, + "loss": 1.0272, + "step": 2785 + }, + { + "epoch": 0.5942832764505119, + "grad_norm": 1.6215352684899575, + "learning_rate": 0.0001358196221815966, + "loss": 1.0343, + "step": 2786 + }, + { + "epoch": 0.5944965870307167, + "grad_norm": 1.2634426810860508, + "learning_rate": 0.00013586837294332724, + "loss": 1.0123, + "step": 2787 + }, + { + "epoch": 0.5947098976109215, + "grad_norm": 1.2145389476981476, + "learning_rate": 0.0001359171237050579, + "loss": 1.0365, + "step": 2788 + }, + { + "epoch": 0.5949232081911263, + "grad_norm": 0.9922648175208506, + "learning_rate": 0.00013596587446678858, + "loss": 1.0053, + "step": 2789 + }, + { + "epoch": 0.5951365187713311, + "grad_norm": 0.7934950772833937, + "learning_rate": 0.00013601462522851922, + "loss": 1.0396, + "step": 2790 + }, + { + "epoch": 0.5953498293515358, + "grad_norm": 0.9063035272499173, + "learning_rate": 0.00013606337599024985, + "loss": 1.0078, + "step": 2791 + }, + { + "epoch": 0.5955631399317406, + "grad_norm": 0.8034353598325067, + "learning_rate": 0.00013611212675198052, + "loss": 1.0209, + "step": 2792 + }, + { + "epoch": 0.5957764505119454, + "grad_norm": 0.7607266671831935, + "learning_rate": 0.00013616087751371116, + "loss": 1.0887, + "step": 2793 + }, + { + "epoch": 0.5959897610921502, + "grad_norm": 0.6409963207584413, + "learning_rate": 0.00013620962827544183, + "loss": 0.9922, + "step": 2794 + }, + { + "epoch": 0.596203071672355, + "grad_norm": 0.6468048913234995, + "learning_rate": 0.00013625837903717247, + "loss": 1.0288, + "step": 2795 + }, + { + "epoch": 0.5964163822525598, + "grad_norm": 0.7091214303007067, + "learning_rate": 0.0001363071297989031, + "loss": 1.0227, + "step": 2796 + }, + { + "epoch": 0.5966296928327645, + "grad_norm": 0.5603287060533286, + "learning_rate": 0.00013635588056063377, + "loss": 1.066, + "step": 2797 + }, + { + "epoch": 0.5968430034129693, + "grad_norm": 0.5415996914691281, + "learning_rate": 0.0001364046313223644, + "loss": 1.0421, + "step": 2798 + }, + { + "epoch": 0.597056313993174, + "grad_norm": 0.46194629465162446, + "learning_rate": 0.00013645338208409508, + "loss": 1.0023, + "step": 2799 + }, + { + "epoch": 0.5972696245733788, + "grad_norm": 0.5226891832292438, + "learning_rate": 0.00013650213284582572, + "loss": 0.9841, + "step": 2800 + }, + { + "epoch": 0.5974829351535836, + "grad_norm": 0.4896086628455852, + "learning_rate": 0.00013655088360755638, + "loss": 1.0065, + "step": 2801 + }, + { + "epoch": 0.5976962457337884, + "grad_norm": 0.4270722830515163, + "learning_rate": 0.00013659963436928702, + "loss": 1.0279, + "step": 2802 + }, + { + "epoch": 0.5979095563139932, + "grad_norm": 0.4793599827766437, + "learning_rate": 0.0001366483851310177, + "loss": 1.0184, + "step": 2803 + }, + { + "epoch": 0.5981228668941979, + "grad_norm": 0.4793842634809573, + "learning_rate": 0.00013669713589274833, + "loss": 1.0176, + "step": 2804 + }, + { + "epoch": 0.5983361774744027, + "grad_norm": 0.4215133545406648, + "learning_rate": 0.000136745886654479, + "loss": 0.9777, + "step": 2805 + }, + { + "epoch": 0.5985494880546075, + "grad_norm": 0.5701341408667477, + "learning_rate": 0.00013679463741620964, + "loss": 1.0001, + "step": 2806 + }, + { + "epoch": 0.5987627986348123, + "grad_norm": 0.510667651628791, + "learning_rate": 0.0001368433881779403, + "loss": 1.0097, + "step": 2807 + }, + { + "epoch": 0.5989761092150171, + "grad_norm": 0.5325293439207038, + "learning_rate": 0.00013689213893967094, + "loss": 1.0363, + "step": 2808 + }, + { + "epoch": 0.5991894197952219, + "grad_norm": 0.528902002332053, + "learning_rate": 0.0001369408897014016, + "loss": 1.0459, + "step": 2809 + }, + { + "epoch": 0.5994027303754266, + "grad_norm": 0.5965286883224497, + "learning_rate": 0.00013698964046313225, + "loss": 0.9952, + "step": 2810 + }, + { + "epoch": 0.5996160409556314, + "grad_norm": 0.7702512551789179, + "learning_rate": 0.0001370383912248629, + "loss": 0.9823, + "step": 2811 + }, + { + "epoch": 0.5998293515358362, + "grad_norm": 0.8619632797140802, + "learning_rate": 0.00013708714198659355, + "loss": 1.0207, + "step": 2812 + }, + { + "epoch": 0.600042662116041, + "grad_norm": 1.047587365404359, + "learning_rate": 0.0001371358927483242, + "loss": 1.0181, + "step": 2813 + }, + { + "epoch": 0.6002559726962458, + "grad_norm": 1.1228986085574284, + "learning_rate": 0.00013718464351005486, + "loss": 1.0076, + "step": 2814 + }, + { + "epoch": 0.6004692832764505, + "grad_norm": 0.7581077500402149, + "learning_rate": 0.0001372333942717855, + "loss": 0.991, + "step": 2815 + }, + { + "epoch": 0.6006825938566553, + "grad_norm": 0.5320626560192082, + "learning_rate": 0.00013728214503351614, + "loss": 1.0062, + "step": 2816 + }, + { + "epoch": 0.60089590443686, + "grad_norm": 0.5590392875490544, + "learning_rate": 0.0001373308957952468, + "loss": 0.997, + "step": 2817 + }, + { + "epoch": 0.6011092150170648, + "grad_norm": 0.5878025737328677, + "learning_rate": 0.00013737964655697747, + "loss": 1.035, + "step": 2818 + }, + { + "epoch": 0.6013225255972696, + "grad_norm": 0.6259629627400118, + "learning_rate": 0.0001374283973187081, + "loss": 1.0105, + "step": 2819 + }, + { + "epoch": 0.6015358361774744, + "grad_norm": 0.6316416019603757, + "learning_rate": 0.00013747714808043878, + "loss": 1.0209, + "step": 2820 + }, + { + "epoch": 0.6017491467576792, + "grad_norm": 0.8399082597463101, + "learning_rate": 0.00013752589884216942, + "loss": 0.9802, + "step": 2821 + }, + { + "epoch": 0.601962457337884, + "grad_norm": 1.3030426541205258, + "learning_rate": 0.00013757464960390008, + "loss": 1.0495, + "step": 2822 + }, + { + "epoch": 0.6021757679180887, + "grad_norm": 0.6779059739933411, + "learning_rate": 0.00013762340036563072, + "loss": 1.0323, + "step": 2823 + }, + { + "epoch": 0.6023890784982935, + "grad_norm": 0.7577821915686979, + "learning_rate": 0.0001376721511273614, + "loss": 1.0103, + "step": 2824 + }, + { + "epoch": 0.6026023890784983, + "grad_norm": 0.6719106966626471, + "learning_rate": 0.00013772090188909203, + "loss": 0.9964, + "step": 2825 + }, + { + "epoch": 0.6028156996587031, + "grad_norm": 1.1275269197507742, + "learning_rate": 0.00013776965265082267, + "loss": 1.0298, + "step": 2826 + }, + { + "epoch": 0.6030290102389079, + "grad_norm": 1.1359413340808306, + "learning_rate": 0.00013781840341255334, + "loss": 1.0313, + "step": 2827 + }, + { + "epoch": 0.6032423208191127, + "grad_norm": 0.6751026720138619, + "learning_rate": 0.00013786715417428398, + "loss": 1.0201, + "step": 2828 + }, + { + "epoch": 0.6034556313993175, + "grad_norm": 0.3869533905537559, + "learning_rate": 0.00013791590493601464, + "loss": 1.0052, + "step": 2829 + }, + { + "epoch": 0.6036689419795221, + "grad_norm": 0.5431147187112688, + "learning_rate": 0.00013796465569774528, + "loss": 1.011, + "step": 2830 + }, + { + "epoch": 0.6038822525597269, + "grad_norm": 0.8234628243242718, + "learning_rate": 0.00013801340645947595, + "loss": 1.0027, + "step": 2831 + }, + { + "epoch": 0.6040955631399317, + "grad_norm": 1.1367459082139366, + "learning_rate": 0.0001380621572212066, + "loss": 1.0288, + "step": 2832 + }, + { + "epoch": 0.6043088737201365, + "grad_norm": 1.0024446185728904, + "learning_rate": 0.00013811090798293723, + "loss": 0.9982, + "step": 2833 + }, + { + "epoch": 0.6045221843003413, + "grad_norm": 0.8324825409586669, + "learning_rate": 0.0001381596587446679, + "loss": 1.0356, + "step": 2834 + }, + { + "epoch": 0.6047354948805461, + "grad_norm": 0.6710230644206658, + "learning_rate": 0.00013820840950639853, + "loss": 1.0229, + "step": 2835 + }, + { + "epoch": 0.6049488054607508, + "grad_norm": 0.6059421972528407, + "learning_rate": 0.0001382571602681292, + "loss": 1.0063, + "step": 2836 + }, + { + "epoch": 0.6051621160409556, + "grad_norm": 0.5326196101444112, + "learning_rate": 0.00013830591102985987, + "loss": 1.0053, + "step": 2837 + }, + { + "epoch": 0.6053754266211604, + "grad_norm": 0.4942413814056158, + "learning_rate": 0.0001383546617915905, + "loss": 1.0181, + "step": 2838 + }, + { + "epoch": 0.6055887372013652, + "grad_norm": 0.4670636840561295, + "learning_rate": 0.00013840341255332117, + "loss": 1.0199, + "step": 2839 + }, + { + "epoch": 0.60580204778157, + "grad_norm": 0.3471314657189795, + "learning_rate": 0.0001384521633150518, + "loss": 1.032, + "step": 2840 + }, + { + "epoch": 0.6060153583617748, + "grad_norm": 0.36075448222070033, + "learning_rate": 0.00013850091407678248, + "loss": 1.0129, + "step": 2841 + }, + { + "epoch": 0.6062286689419796, + "grad_norm": 0.4395212351430838, + "learning_rate": 0.00013854966483851312, + "loss": 0.9933, + "step": 2842 + }, + { + "epoch": 0.6064419795221843, + "grad_norm": 0.5336586637360955, + "learning_rate": 0.00013859841560024376, + "loss": 1.0316, + "step": 2843 + }, + { + "epoch": 0.606655290102389, + "grad_norm": 0.5648016983161197, + "learning_rate": 0.00013864716636197442, + "loss": 0.9879, + "step": 2844 + }, + { + "epoch": 0.6068686006825939, + "grad_norm": 0.691641541201791, + "learning_rate": 0.00013869591712370506, + "loss": 1.0077, + "step": 2845 + }, + { + "epoch": 0.6070819112627986, + "grad_norm": 1.0332045512304455, + "learning_rate": 0.00013874466788543573, + "loss": 1.0479, + "step": 2846 + }, + { + "epoch": 0.6072952218430034, + "grad_norm": 1.434450793391039, + "learning_rate": 0.00013879341864716637, + "loss": 1.0038, + "step": 2847 + }, + { + "epoch": 0.6075085324232082, + "grad_norm": 0.46457251609972494, + "learning_rate": 0.000138842169408897, + "loss": 0.9998, + "step": 2848 + }, + { + "epoch": 0.6077218430034129, + "grad_norm": 0.662552789726079, + "learning_rate": 0.00013889092017062768, + "loss": 1.0268, + "step": 2849 + }, + { + "epoch": 0.6079351535836177, + "grad_norm": 1.2736934709643097, + "learning_rate": 0.00013893967093235832, + "loss": 1.0247, + "step": 2850 + }, + { + "epoch": 0.6081484641638225, + "grad_norm": 1.0161278033619432, + "learning_rate": 0.00013898842169408898, + "loss": 1.0505, + "step": 2851 + }, + { + "epoch": 0.6083617747440273, + "grad_norm": 0.9300373066747746, + "learning_rate": 0.00013903717245581962, + "loss": 1.0269, + "step": 2852 + }, + { + "epoch": 0.6085750853242321, + "grad_norm": 0.968909905751062, + "learning_rate": 0.0001390859232175503, + "loss": 1.0327, + "step": 2853 + }, + { + "epoch": 0.6087883959044369, + "grad_norm": 0.9636171914713597, + "learning_rate": 0.00013913467397928095, + "loss": 0.9855, + "step": 2854 + }, + { + "epoch": 0.6090017064846417, + "grad_norm": 0.8429211926434531, + "learning_rate": 0.0001391834247410116, + "loss": 0.984, + "step": 2855 + }, + { + "epoch": 0.6092150170648464, + "grad_norm": 0.7706465264876003, + "learning_rate": 0.00013923217550274226, + "loss": 0.9948, + "step": 2856 + }, + { + "epoch": 0.6094283276450512, + "grad_norm": 0.9044154988604896, + "learning_rate": 0.0001392809262644729, + "loss": 1.0593, + "step": 2857 + }, + { + "epoch": 0.609641638225256, + "grad_norm": 0.9901305490299486, + "learning_rate": 0.00013932967702620354, + "loss": 1.0216, + "step": 2858 + }, + { + "epoch": 0.6098549488054608, + "grad_norm": 0.9345185923717969, + "learning_rate": 0.0001393784277879342, + "loss": 1.012, + "step": 2859 + }, + { + "epoch": 0.6100682593856656, + "grad_norm": 1.0508734530619444, + "learning_rate": 0.00013942717854966485, + "loss": 1.0142, + "step": 2860 + }, + { + "epoch": 0.6102815699658704, + "grad_norm": 0.9796276866815222, + "learning_rate": 0.0001394759293113955, + "loss": 1.0251, + "step": 2861 + }, + { + "epoch": 0.610494880546075, + "grad_norm": 0.759176344085803, + "learning_rate": 0.00013952468007312615, + "loss": 1.0141, + "step": 2862 + }, + { + "epoch": 0.6107081911262798, + "grad_norm": 0.6117082748615769, + "learning_rate": 0.0001395734308348568, + "loss": 0.9843, + "step": 2863 + }, + { + "epoch": 0.6109215017064846, + "grad_norm": 0.49304773678113145, + "learning_rate": 0.00013962218159658746, + "loss": 1.0155, + "step": 2864 + }, + { + "epoch": 0.6111348122866894, + "grad_norm": 0.5168535110137236, + "learning_rate": 0.0001396709323583181, + "loss": 1.0234, + "step": 2865 + }, + { + "epoch": 0.6113481228668942, + "grad_norm": 0.5248667050295881, + "learning_rate": 0.00013971968312004876, + "loss": 1.0088, + "step": 2866 + }, + { + "epoch": 0.611561433447099, + "grad_norm": 0.6981733984838099, + "learning_rate": 0.0001397684338817794, + "loss": 1.0301, + "step": 2867 + }, + { + "epoch": 0.6117747440273038, + "grad_norm": 0.9311943929331278, + "learning_rate": 0.00013981718464351007, + "loss": 1.0101, + "step": 2868 + }, + { + "epoch": 0.6119880546075085, + "grad_norm": 1.0915550359435462, + "learning_rate": 0.0001398659354052407, + "loss": 1.0318, + "step": 2869 + }, + { + "epoch": 0.6122013651877133, + "grad_norm": 0.9953059290833497, + "learning_rate": 0.00013991468616697138, + "loss": 1.0061, + "step": 2870 + }, + { + "epoch": 0.6124146757679181, + "grad_norm": 0.9863693632036418, + "learning_rate": 0.00013996343692870201, + "loss": 1.0064, + "step": 2871 + }, + { + "epoch": 0.6126279863481229, + "grad_norm": 0.8053494865817049, + "learning_rate": 0.00014001218769043268, + "loss": 0.9784, + "step": 2872 + }, + { + "epoch": 0.6128412969283277, + "grad_norm": 0.675449914024441, + "learning_rate": 0.00014006093845216335, + "loss": 1.0258, + "step": 2873 + }, + { + "epoch": 0.6130546075085325, + "grad_norm": 0.7222692045884466, + "learning_rate": 0.000140109689213894, + "loss": 1.011, + "step": 2874 + }, + { + "epoch": 0.6132679180887372, + "grad_norm": 0.7853598662075436, + "learning_rate": 0.00014015843997562463, + "loss": 1.0042, + "step": 2875 + }, + { + "epoch": 0.613481228668942, + "grad_norm": 0.717290663200113, + "learning_rate": 0.0001402071907373553, + "loss": 0.9857, + "step": 2876 + }, + { + "epoch": 0.6136945392491467, + "grad_norm": 0.8088974765709882, + "learning_rate": 0.00014025594149908593, + "loss": 0.9929, + "step": 2877 + }, + { + "epoch": 0.6139078498293515, + "grad_norm": 0.9190373644485728, + "learning_rate": 0.0001403046922608166, + "loss": 0.9861, + "step": 2878 + }, + { + "epoch": 0.6141211604095563, + "grad_norm": 0.73526167798717, + "learning_rate": 0.00014035344302254724, + "loss": 1.0321, + "step": 2879 + }, + { + "epoch": 0.6143344709897611, + "grad_norm": 0.754292312684906, + "learning_rate": 0.00014040219378427788, + "loss": 1.0236, + "step": 2880 + }, + { + "epoch": 0.6145477815699659, + "grad_norm": 0.9360491490418305, + "learning_rate": 0.00014045094454600855, + "loss": 1.0134, + "step": 2881 + }, + { + "epoch": 0.6147610921501706, + "grad_norm": 1.092045906622425, + "learning_rate": 0.00014049969530773918, + "loss": 1.0067, + "step": 2882 + }, + { + "epoch": 0.6149744027303754, + "grad_norm": 0.9124006234639299, + "learning_rate": 0.00014054844606946985, + "loss": 1.0417, + "step": 2883 + }, + { + "epoch": 0.6151877133105802, + "grad_norm": 0.687996449413743, + "learning_rate": 0.0001405971968312005, + "loss": 1.0092, + "step": 2884 + }, + { + "epoch": 0.615401023890785, + "grad_norm": 0.5299470908630067, + "learning_rate": 0.00014064594759293116, + "loss": 0.9849, + "step": 2885 + }, + { + "epoch": 0.6156143344709898, + "grad_norm": 0.5438819697844495, + "learning_rate": 0.0001406946983546618, + "loss": 1.0007, + "step": 2886 + }, + { + "epoch": 0.6158276450511946, + "grad_norm": 0.4587153346040395, + "learning_rate": 0.00014074344911639246, + "loss": 0.9945, + "step": 2887 + }, + { + "epoch": 0.6160409556313993, + "grad_norm": 0.4093180791695053, + "learning_rate": 0.0001407921998781231, + "loss": 0.9988, + "step": 2888 + }, + { + "epoch": 0.6162542662116041, + "grad_norm": 1.1748164652809856, + "learning_rate": 0.00014084095063985377, + "loss": 1.0278, + "step": 2889 + }, + { + "epoch": 0.6164675767918089, + "grad_norm": 0.3795150240205611, + "learning_rate": 0.0001408897014015844, + "loss": 1.0433, + "step": 2890 + }, + { + "epoch": 0.6166808873720137, + "grad_norm": 0.5472768766670848, + "learning_rate": 0.00014093845216331508, + "loss": 0.9927, + "step": 2891 + }, + { + "epoch": 0.6168941979522184, + "grad_norm": 0.58252429369972, + "learning_rate": 0.00014098720292504571, + "loss": 1.0155, + "step": 2892 + }, + { + "epoch": 0.6171075085324232, + "grad_norm": 0.5656750793005946, + "learning_rate": 0.00014103595368677638, + "loss": 1.0184, + "step": 2893 + }, + { + "epoch": 0.617320819112628, + "grad_norm": 0.582929925983406, + "learning_rate": 0.00014108470444850702, + "loss": 0.9948, + "step": 2894 + }, + { + "epoch": 0.6175341296928327, + "grad_norm": 0.5843295255305325, + "learning_rate": 0.00014113345521023766, + "loss": 1.0139, + "step": 2895 + }, + { + "epoch": 0.6177474402730375, + "grad_norm": 0.6470267646022411, + "learning_rate": 0.00014118220597196833, + "loss": 1.0416, + "step": 2896 + }, + { + "epoch": 0.6179607508532423, + "grad_norm": 0.8184907463457887, + "learning_rate": 0.00014123095673369897, + "loss": 0.9941, + "step": 2897 + }, + { + "epoch": 0.6181740614334471, + "grad_norm": 1.1509339870437452, + "learning_rate": 0.00014127970749542963, + "loss": 1.0378, + "step": 2898 + }, + { + "epoch": 0.6183873720136519, + "grad_norm": 1.0186491502313282, + "learning_rate": 0.00014132845825716027, + "loss": 1.0114, + "step": 2899 + }, + { + "epoch": 0.6186006825938567, + "grad_norm": 1.0303941091446747, + "learning_rate": 0.0001413772090188909, + "loss": 1.0144, + "step": 2900 + }, + { + "epoch": 0.6188139931740614, + "grad_norm": 1.0979793383670577, + "learning_rate": 0.00014142595978062158, + "loss": 1.0533, + "step": 2901 + }, + { + "epoch": 0.6190273037542662, + "grad_norm": 0.8288568388732258, + "learning_rate": 0.00014147471054235224, + "loss": 1.0332, + "step": 2902 + }, + { + "epoch": 0.619240614334471, + "grad_norm": 0.8048651214033394, + "learning_rate": 0.00014152346130408288, + "loss": 1.0151, + "step": 2903 + }, + { + "epoch": 0.6194539249146758, + "grad_norm": 0.8236836762448989, + "learning_rate": 0.00014157221206581355, + "loss": 1.0129, + "step": 2904 + }, + { + "epoch": 0.6196672354948806, + "grad_norm": 0.8459959577803525, + "learning_rate": 0.0001416209628275442, + "loss": 1.0399, + "step": 2905 + }, + { + "epoch": 0.6198805460750854, + "grad_norm": 0.8917099654361735, + "learning_rate": 0.00014166971358927486, + "loss": 1.0463, + "step": 2906 + }, + { + "epoch": 0.6200938566552902, + "grad_norm": 0.9987320463867724, + "learning_rate": 0.0001417184643510055, + "loss": 1.038, + "step": 2907 + }, + { + "epoch": 0.6203071672354948, + "grad_norm": 0.982140770940008, + "learning_rate": 0.00014176721511273616, + "loss": 0.996, + "step": 2908 + }, + { + "epoch": 0.6205204778156996, + "grad_norm": 0.8640337694085639, + "learning_rate": 0.0001418159658744668, + "loss": 1.0271, + "step": 2909 + }, + { + "epoch": 0.6207337883959044, + "grad_norm": 0.8523620237340979, + "learning_rate": 0.00014186471663619744, + "loss": 0.9859, + "step": 2910 + }, + { + "epoch": 0.6209470989761092, + "grad_norm": 0.6973579433473773, + "learning_rate": 0.0001419134673979281, + "loss": 1.0256, + "step": 2911 + }, + { + "epoch": 0.621160409556314, + "grad_norm": 0.5229853020626973, + "learning_rate": 0.00014196221815965875, + "loss": 1.011, + "step": 2912 + }, + { + "epoch": 0.6213737201365188, + "grad_norm": 0.4934914632147799, + "learning_rate": 0.00014201096892138941, + "loss": 1.0023, + "step": 2913 + }, + { + "epoch": 0.6215870307167235, + "grad_norm": 0.4863360182719745, + "learning_rate": 0.00014205971968312005, + "loss": 1.029, + "step": 2914 + }, + { + "epoch": 0.6218003412969283, + "grad_norm": 0.44399430050484284, + "learning_rate": 0.00014210847044485072, + "loss": 0.9747, + "step": 2915 + }, + { + "epoch": 0.6220136518771331, + "grad_norm": 0.5802918747960303, + "learning_rate": 0.00014215722120658136, + "loss": 1.0426, + "step": 2916 + }, + { + "epoch": 0.6222269624573379, + "grad_norm": 0.7289638951321425, + "learning_rate": 0.000142205971968312, + "loss": 0.9928, + "step": 2917 + }, + { + "epoch": 0.6224402730375427, + "grad_norm": 0.710693367838192, + "learning_rate": 0.00014225472273004267, + "loss": 1.0173, + "step": 2918 + }, + { + "epoch": 0.6226535836177475, + "grad_norm": 0.6920966807722998, + "learning_rate": 0.0001423034734917733, + "loss": 0.9903, + "step": 2919 + }, + { + "epoch": 0.6228668941979523, + "grad_norm": 0.794496358615043, + "learning_rate": 0.00014235222425350397, + "loss": 1.0435, + "step": 2920 + }, + { + "epoch": 0.623080204778157, + "grad_norm": 0.9961483580585744, + "learning_rate": 0.00014240097501523464, + "loss": 1.016, + "step": 2921 + }, + { + "epoch": 0.6232935153583617, + "grad_norm": 1.1561496778855114, + "learning_rate": 0.00014244972577696528, + "loss": 1.0085, + "step": 2922 + }, + { + "epoch": 0.6235068259385665, + "grad_norm": 0.6263012864131342, + "learning_rate": 0.00014249847653869594, + "loss": 0.9962, + "step": 2923 + }, + { + "epoch": 0.6237201365187713, + "grad_norm": 0.5118327626240896, + "learning_rate": 0.00014254722730042658, + "loss": 1.0214, + "step": 2924 + }, + { + "epoch": 0.6239334470989761, + "grad_norm": 0.7415704945133365, + "learning_rate": 0.00014259597806215725, + "loss": 1.0547, + "step": 2925 + }, + { + "epoch": 0.6241467576791809, + "grad_norm": 1.144305519219918, + "learning_rate": 0.0001426447288238879, + "loss": 1.0049, + "step": 2926 + }, + { + "epoch": 0.6243600682593856, + "grad_norm": 0.9988396774540612, + "learning_rate": 0.00014269347958561853, + "loss": 0.9963, + "step": 2927 + }, + { + "epoch": 0.6245733788395904, + "grad_norm": 0.7975987006744131, + "learning_rate": 0.0001427422303473492, + "loss": 1.0072, + "step": 2928 + }, + { + "epoch": 0.6247866894197952, + "grad_norm": 0.6099191745143723, + "learning_rate": 0.00014279098110907984, + "loss": 0.9703, + "step": 2929 + }, + { + "epoch": 0.625, + "grad_norm": 0.4906851106899999, + "learning_rate": 0.0001428397318708105, + "loss": 1.017, + "step": 2930 + }, + { + "epoch": 0.6252133105802048, + "grad_norm": 0.41560902420411133, + "learning_rate": 0.00014288848263254114, + "loss": 0.9721, + "step": 2931 + }, + { + "epoch": 0.6254266211604096, + "grad_norm": 0.44161355293159943, + "learning_rate": 0.00014293723339427178, + "loss": 0.9946, + "step": 2932 + }, + { + "epoch": 0.6256399317406144, + "grad_norm": 0.522373749476695, + "learning_rate": 0.00014298598415600245, + "loss": 1.0315, + "step": 2933 + }, + { + "epoch": 0.6258532423208191, + "grad_norm": 0.5158454738095984, + "learning_rate": 0.0001430347349177331, + "loss": 1.0236, + "step": 2934 + }, + { + "epoch": 0.6260665529010239, + "grad_norm": 0.5438968757831798, + "learning_rate": 0.00014308348567946375, + "loss": 1.0129, + "step": 2935 + }, + { + "epoch": 0.6262798634812287, + "grad_norm": 0.7110347886809376, + "learning_rate": 0.0001431322364411944, + "loss": 1.0241, + "step": 2936 + }, + { + "epoch": 0.6264931740614335, + "grad_norm": 0.8486712476856684, + "learning_rate": 0.00014318098720292506, + "loss": 1.0096, + "step": 2937 + }, + { + "epoch": 0.6267064846416383, + "grad_norm": 1.1451475973736662, + "learning_rate": 0.0001432297379646557, + "loss": 1.0141, + "step": 2938 + }, + { + "epoch": 0.626919795221843, + "grad_norm": 1.0676435186689184, + "learning_rate": 0.00014327848872638637, + "loss": 1.0369, + "step": 2939 + }, + { + "epoch": 0.6271331058020477, + "grad_norm": 0.8444043829130056, + "learning_rate": 0.00014332723948811703, + "loss": 1.0207, + "step": 2940 + }, + { + "epoch": 0.6273464163822525, + "grad_norm": 0.7430839736847525, + "learning_rate": 0.00014337599024984767, + "loss": 1.0252, + "step": 2941 + }, + { + "epoch": 0.6275597269624573, + "grad_norm": 0.6742503115084182, + "learning_rate": 0.0001434247410115783, + "loss": 1.0278, + "step": 2942 + }, + { + "epoch": 0.6277730375426621, + "grad_norm": 0.7986912907137239, + "learning_rate": 0.00014347349177330898, + "loss": 1.0077, + "step": 2943 + }, + { + "epoch": 0.6279863481228669, + "grad_norm": 0.9911981783485332, + "learning_rate": 0.00014352224253503962, + "loss": 1.0131, + "step": 2944 + }, + { + "epoch": 0.6281996587030717, + "grad_norm": 1.1606640116225884, + "learning_rate": 0.00014357099329677028, + "loss": 0.9578, + "step": 2945 + }, + { + "epoch": 0.6284129692832765, + "grad_norm": 0.8086736993112473, + "learning_rate": 0.00014361974405850092, + "loss": 1.0389, + "step": 2946 + }, + { + "epoch": 0.6286262798634812, + "grad_norm": 0.5778431417219454, + "learning_rate": 0.00014366849482023156, + "loss": 0.996, + "step": 2947 + }, + { + "epoch": 0.628839590443686, + "grad_norm": 0.6345916464850339, + "learning_rate": 0.00014371724558196223, + "loss": 0.9896, + "step": 2948 + }, + { + "epoch": 0.6290529010238908, + "grad_norm": 0.7135641574672371, + "learning_rate": 0.00014376599634369287, + "loss": 0.9891, + "step": 2949 + }, + { + "epoch": 0.6292662116040956, + "grad_norm": 0.6454767173065192, + "learning_rate": 0.00014381474710542354, + "loss": 0.9937, + "step": 2950 + }, + { + "epoch": 0.6294795221843004, + "grad_norm": 0.6903897673651721, + "learning_rate": 0.00014386349786715418, + "loss": 0.9867, + "step": 2951 + }, + { + "epoch": 0.6296928327645052, + "grad_norm": 0.731866312620397, + "learning_rate": 0.00014391224862888484, + "loss": 1.0201, + "step": 2952 + }, + { + "epoch": 0.6299061433447098, + "grad_norm": 0.7128903471042958, + "learning_rate": 0.00014396099939061548, + "loss": 1.0176, + "step": 2953 + }, + { + "epoch": 0.6301194539249146, + "grad_norm": 0.8217017254511444, + "learning_rate": 0.00014400975015234615, + "loss": 0.9966, + "step": 2954 + }, + { + "epoch": 0.6303327645051194, + "grad_norm": 1.2139634700578927, + "learning_rate": 0.0001440585009140768, + "loss": 1.0327, + "step": 2955 + }, + { + "epoch": 0.6305460750853242, + "grad_norm": 0.8968484581252176, + "learning_rate": 0.00014410725167580745, + "loss": 1.0122, + "step": 2956 + }, + { + "epoch": 0.630759385665529, + "grad_norm": 0.7400784295296498, + "learning_rate": 0.0001441560024375381, + "loss": 1.018, + "step": 2957 + }, + { + "epoch": 0.6309726962457338, + "grad_norm": 0.7134920689683613, + "learning_rate": 0.00014420475319926876, + "loss": 0.9865, + "step": 2958 + }, + { + "epoch": 0.6311860068259386, + "grad_norm": 0.5079317487626672, + "learning_rate": 0.0001442535039609994, + "loss": 1.0242, + "step": 2959 + }, + { + "epoch": 0.6313993174061433, + "grad_norm": 0.42770001001523833, + "learning_rate": 0.00014430225472273007, + "loss": 0.9647, + "step": 2960 + }, + { + "epoch": 0.6316126279863481, + "grad_norm": 0.6830638509296231, + "learning_rate": 0.0001443510054844607, + "loss": 0.9967, + "step": 2961 + }, + { + "epoch": 0.6318259385665529, + "grad_norm": 0.8149159390277789, + "learning_rate": 0.00014439975624619137, + "loss": 0.9993, + "step": 2962 + }, + { + "epoch": 0.6320392491467577, + "grad_norm": 0.8550040057464072, + "learning_rate": 0.000144448507007922, + "loss": 1.0255, + "step": 2963 + }, + { + "epoch": 0.6322525597269625, + "grad_norm": 0.8669815203789139, + "learning_rate": 0.00014449725776965265, + "loss": 0.9916, + "step": 2964 + }, + { + "epoch": 0.6324658703071673, + "grad_norm": 0.9841974523925037, + "learning_rate": 0.00014454600853138332, + "loss": 1.0022, + "step": 2965 + }, + { + "epoch": 0.632679180887372, + "grad_norm": 1.1573324454918692, + "learning_rate": 0.00014459475929311396, + "loss": 1.0457, + "step": 2966 + }, + { + "epoch": 0.6328924914675768, + "grad_norm": 0.8405082574405325, + "learning_rate": 0.00014464351005484462, + "loss": 1.001, + "step": 2967 + }, + { + "epoch": 0.6331058020477816, + "grad_norm": 0.6106540276590804, + "learning_rate": 0.00014469226081657526, + "loss": 0.9957, + "step": 2968 + }, + { + "epoch": 0.6333191126279863, + "grad_norm": 0.5208038747822594, + "learning_rate": 0.00014474101157830593, + "loss": 1.0223, + "step": 2969 + }, + { + "epoch": 0.6335324232081911, + "grad_norm": 0.47276981686936703, + "learning_rate": 0.00014478976234003657, + "loss": 1.0324, + "step": 2970 + }, + { + "epoch": 0.6337457337883959, + "grad_norm": 0.6051461212238561, + "learning_rate": 0.00014483851310176724, + "loss": 1.0031, + "step": 2971 + }, + { + "epoch": 0.6339590443686007, + "grad_norm": 0.9157519062930823, + "learning_rate": 0.00014488726386349788, + "loss": 1.0354, + "step": 2972 + }, + { + "epoch": 0.6341723549488054, + "grad_norm": 0.8595356059430281, + "learning_rate": 0.00014493601462522854, + "loss": 1.004, + "step": 2973 + }, + { + "epoch": 0.6343856655290102, + "grad_norm": 0.564158628662114, + "learning_rate": 0.00014498476538695918, + "loss": 1.0123, + "step": 2974 + }, + { + "epoch": 0.634598976109215, + "grad_norm": 0.5179212179846349, + "learning_rate": 0.00014503351614868985, + "loss": 0.9868, + "step": 2975 + }, + { + "epoch": 0.6348122866894198, + "grad_norm": 0.4966279126606111, + "learning_rate": 0.0001450822669104205, + "loss": 0.9979, + "step": 2976 + }, + { + "epoch": 0.6350255972696246, + "grad_norm": 0.5242732008115449, + "learning_rate": 0.00014513101767215115, + "loss": 0.9939, + "step": 2977 + }, + { + "epoch": 0.6352389078498294, + "grad_norm": 0.5989863890773339, + "learning_rate": 0.0001451797684338818, + "loss": 1.0012, + "step": 2978 + }, + { + "epoch": 0.6354522184300341, + "grad_norm": 0.678039215958168, + "learning_rate": 0.00014522851919561243, + "loss": 1.0009, + "step": 2979 + }, + { + "epoch": 0.6356655290102389, + "grad_norm": 0.59919729009966, + "learning_rate": 0.0001452772699573431, + "loss": 1.0105, + "step": 2980 + }, + { + "epoch": 0.6358788395904437, + "grad_norm": 0.4590519292929288, + "learning_rate": 0.00014532602071907374, + "loss": 1.0039, + "step": 2981 + }, + { + "epoch": 0.6360921501706485, + "grad_norm": 0.3997345988289697, + "learning_rate": 0.0001453747714808044, + "loss": 0.9843, + "step": 2982 + }, + { + "epoch": 0.6363054607508533, + "grad_norm": 0.5079113230768367, + "learning_rate": 0.00014542352224253504, + "loss": 1.0069, + "step": 2983 + }, + { + "epoch": 0.636518771331058, + "grad_norm": 0.7007387845495622, + "learning_rate": 0.00014547227300426568, + "loss": 0.9929, + "step": 2984 + }, + { + "epoch": 0.6367320819112628, + "grad_norm": 0.801996104092951, + "learning_rate": 0.00014552102376599635, + "loss": 1.0366, + "step": 2985 + }, + { + "epoch": 0.6369453924914675, + "grad_norm": 0.8515539812971832, + "learning_rate": 0.000145569774527727, + "loss": 0.9973, + "step": 2986 + }, + { + "epoch": 0.6371587030716723, + "grad_norm": 0.9269066797664497, + "learning_rate": 0.00014561852528945766, + "loss": 1.0069, + "step": 2987 + }, + { + "epoch": 0.6373720136518771, + "grad_norm": 1.1757928815063607, + "learning_rate": 0.00014566727605118832, + "loss": 0.9928, + "step": 2988 + }, + { + "epoch": 0.6375853242320819, + "grad_norm": 0.9047578408647668, + "learning_rate": 0.00014571602681291896, + "loss": 1.0038, + "step": 2989 + }, + { + "epoch": 0.6377986348122867, + "grad_norm": 0.7746546359640134, + "learning_rate": 0.00014576477757464963, + "loss": 1.0182, + "step": 2990 + }, + { + "epoch": 0.6380119453924915, + "grad_norm": 0.7586577712213509, + "learning_rate": 0.00014581352833638027, + "loss": 1.0304, + "step": 2991 + }, + { + "epoch": 0.6382252559726962, + "grad_norm": 0.8272599333601502, + "learning_rate": 0.00014586227909811094, + "loss": 1.0849, + "step": 2992 + }, + { + "epoch": 0.638438566552901, + "grad_norm": 0.8802616695446084, + "learning_rate": 0.00014591102985984157, + "loss": 0.9954, + "step": 2993 + }, + { + "epoch": 0.6386518771331058, + "grad_norm": 1.0268109658009792, + "learning_rate": 0.00014595978062157221, + "loss": 0.9863, + "step": 2994 + }, + { + "epoch": 0.6388651877133106, + "grad_norm": 1.0116477069000094, + "learning_rate": 0.00014600853138330288, + "loss": 1.0034, + "step": 2995 + }, + { + "epoch": 0.6390784982935154, + "grad_norm": 0.9328017222087805, + "learning_rate": 0.00014605728214503352, + "loss": 1.0308, + "step": 2996 + }, + { + "epoch": 0.6392918088737202, + "grad_norm": 0.9114487629260406, + "learning_rate": 0.0001461060329067642, + "loss": 1.0353, + "step": 2997 + }, + { + "epoch": 0.639505119453925, + "grad_norm": 0.83068754049097, + "learning_rate": 0.00014615478366849483, + "loss": 1.0329, + "step": 2998 + }, + { + "epoch": 0.6397184300341296, + "grad_norm": 0.7353825663450009, + "learning_rate": 0.00014620353443022547, + "loss": 1.0175, + "step": 2999 + }, + { + "epoch": 0.6399317406143344, + "grad_norm": 0.676476633294038, + "learning_rate": 0.00014625228519195613, + "loss": 1.0195, + "step": 3000 + }, + { + "epoch": 0.6401450511945392, + "grad_norm": 0.43267081649324735, + "learning_rate": 0.00014630103595368677, + "loss": 1.0075, + "step": 3001 + }, + { + "epoch": 0.640358361774744, + "grad_norm": 0.452287778896757, + "learning_rate": 0.00014634978671541744, + "loss": 1.0059, + "step": 3002 + }, + { + "epoch": 0.6405716723549488, + "grad_norm": 0.5312197783960948, + "learning_rate": 0.00014639853747714808, + "loss": 0.9787, + "step": 3003 + }, + { + "epoch": 0.6407849829351536, + "grad_norm": 0.5911451481858474, + "learning_rate": 0.00014644728823887874, + "loss": 0.9987, + "step": 3004 + }, + { + "epoch": 0.6409982935153583, + "grad_norm": 0.6546849216287166, + "learning_rate": 0.0001464960390006094, + "loss": 1.0172, + "step": 3005 + }, + { + "epoch": 0.6412116040955631, + "grad_norm": 0.5737947914409738, + "learning_rate": 0.00014654478976234005, + "loss": 1.023, + "step": 3006 + }, + { + "epoch": 0.6414249146757679, + "grad_norm": 0.5632016365631963, + "learning_rate": 0.00014659354052407072, + "loss": 1.012, + "step": 3007 + }, + { + "epoch": 0.6416382252559727, + "grad_norm": 0.6391274758217192, + "learning_rate": 0.00014664229128580136, + "loss": 1.0114, + "step": 3008 + }, + { + "epoch": 0.6418515358361775, + "grad_norm": 0.7930963432556777, + "learning_rate": 0.00014669104204753202, + "loss": 1.0255, + "step": 3009 + }, + { + "epoch": 0.6420648464163823, + "grad_norm": 0.9467087645862843, + "learning_rate": 0.00014673979280926266, + "loss": 1.0011, + "step": 3010 + }, + { + "epoch": 0.6422781569965871, + "grad_norm": 1.1270847410631437, + "learning_rate": 0.0001467885435709933, + "loss": 0.9977, + "step": 3011 + }, + { + "epoch": 0.6424914675767918, + "grad_norm": 0.8096680168876907, + "learning_rate": 0.00014683729433272397, + "loss": 1.0445, + "step": 3012 + }, + { + "epoch": 0.6427047781569966, + "grad_norm": 0.6959070841664905, + "learning_rate": 0.0001468860450944546, + "loss": 1.0212, + "step": 3013 + }, + { + "epoch": 0.6429180887372014, + "grad_norm": 0.506734665029881, + "learning_rate": 0.00014693479585618527, + "loss": 0.9695, + "step": 3014 + }, + { + "epoch": 0.6431313993174061, + "grad_norm": 0.47288482704369084, + "learning_rate": 0.00014698354661791591, + "loss": 1.0348, + "step": 3015 + }, + { + "epoch": 0.643344709897611, + "grad_norm": 0.4527949839247455, + "learning_rate": 0.00014703229737964655, + "loss": 1.0176, + "step": 3016 + }, + { + "epoch": 0.6435580204778157, + "grad_norm": 0.40223306566192985, + "learning_rate": 0.00014708104814137722, + "loss": 1.0209, + "step": 3017 + }, + { + "epoch": 0.6437713310580204, + "grad_norm": 0.5006082487482394, + "learning_rate": 0.00014712979890310786, + "loss": 1.0051, + "step": 3018 + }, + { + "epoch": 0.6439846416382252, + "grad_norm": 0.5759414157307297, + "learning_rate": 0.00014717854966483853, + "loss": 0.9979, + "step": 3019 + }, + { + "epoch": 0.64419795221843, + "grad_norm": 0.6626076659927802, + "learning_rate": 0.00014722730042656917, + "loss": 1.0, + "step": 3020 + }, + { + "epoch": 0.6444112627986348, + "grad_norm": 0.7336450774562889, + "learning_rate": 0.00014727605118829983, + "loss": 1.0147, + "step": 3021 + }, + { + "epoch": 0.6446245733788396, + "grad_norm": 0.953328656362633, + "learning_rate": 0.00014732480195003047, + "loss": 1.0408, + "step": 3022 + }, + { + "epoch": 0.6448378839590444, + "grad_norm": 1.2251027225905897, + "learning_rate": 0.00014737355271176114, + "loss": 1.0082, + "step": 3023 + }, + { + "epoch": 0.6450511945392492, + "grad_norm": 0.7296085323841025, + "learning_rate": 0.0001474223034734918, + "loss": 1.0195, + "step": 3024 + }, + { + "epoch": 0.6452645051194539, + "grad_norm": 0.5652074152661647, + "learning_rate": 0.00014747105423522244, + "loss": 1.0096, + "step": 3025 + }, + { + "epoch": 0.6454778156996587, + "grad_norm": 0.6665851656843329, + "learning_rate": 0.00014751980499695308, + "loss": 0.9659, + "step": 3026 + }, + { + "epoch": 0.6456911262798635, + "grad_norm": 0.6694282220205447, + "learning_rate": 0.00014756855575868375, + "loss": 0.9795, + "step": 3027 + }, + { + "epoch": 0.6459044368600683, + "grad_norm": 0.7550094913875608, + "learning_rate": 0.0001476173065204144, + "loss": 1.0215, + "step": 3028 + }, + { + "epoch": 0.6461177474402731, + "grad_norm": 0.8969035070360536, + "learning_rate": 0.00014766605728214506, + "loss": 1.0071, + "step": 3029 + }, + { + "epoch": 0.6463310580204779, + "grad_norm": 1.050072056615084, + "learning_rate": 0.0001477148080438757, + "loss": 0.9877, + "step": 3030 + }, + { + "epoch": 0.6465443686006825, + "grad_norm": 1.0930695194850946, + "learning_rate": 0.00014776355880560634, + "loss": 1.0399, + "step": 3031 + }, + { + "epoch": 0.6467576791808873, + "grad_norm": 0.8484401720942781, + "learning_rate": 0.000147812309567337, + "loss": 0.9917, + "step": 3032 + }, + { + "epoch": 0.6469709897610921, + "grad_norm": 0.7891468232366633, + "learning_rate": 0.00014786106032906764, + "loss": 1.0267, + "step": 3033 + }, + { + "epoch": 0.6471843003412969, + "grad_norm": 0.7253287413169974, + "learning_rate": 0.0001479098110907983, + "loss": 1.0256, + "step": 3034 + }, + { + "epoch": 0.6473976109215017, + "grad_norm": 0.643609655559948, + "learning_rate": 0.00014795856185252895, + "loss": 0.9977, + "step": 3035 + }, + { + "epoch": 0.6476109215017065, + "grad_norm": 0.6424628038252118, + "learning_rate": 0.00014800731261425961, + "loss": 1.0049, + "step": 3036 + }, + { + "epoch": 0.6478242320819113, + "grad_norm": 0.7672692125463094, + "learning_rate": 0.00014805606337599025, + "loss": 1.0131, + "step": 3037 + }, + { + "epoch": 0.648037542662116, + "grad_norm": 0.7222481197160096, + "learning_rate": 0.00014810481413772092, + "loss": 1.0291, + "step": 3038 + }, + { + "epoch": 0.6482508532423208, + "grad_norm": 0.5501257951247428, + "learning_rate": 0.00014815356489945156, + "loss": 0.9844, + "step": 3039 + }, + { + "epoch": 0.6484641638225256, + "grad_norm": 0.5111219887260539, + "learning_rate": 0.00014820231566118223, + "loss": 0.9841, + "step": 3040 + }, + { + "epoch": 0.6486774744027304, + "grad_norm": 0.553820499330589, + "learning_rate": 0.00014825106642291287, + "loss": 0.9806, + "step": 3041 + }, + { + "epoch": 0.6488907849829352, + "grad_norm": 0.5555061255831882, + "learning_rate": 0.00014829981718464353, + "loss": 1.0049, + "step": 3042 + }, + { + "epoch": 0.64910409556314, + "grad_norm": 0.5573919343834526, + "learning_rate": 0.00014834856794637417, + "loss": 0.9856, + "step": 3043 + }, + { + "epoch": 0.6493174061433447, + "grad_norm": 0.619330230129197, + "learning_rate": 0.00014839731870810484, + "loss": 0.9937, + "step": 3044 + }, + { + "epoch": 0.6495307167235495, + "grad_norm": 0.6651875889560701, + "learning_rate": 0.00014844606946983548, + "loss": 1.0038, + "step": 3045 + }, + { + "epoch": 0.6497440273037542, + "grad_norm": 0.8592803668402927, + "learning_rate": 0.00014849482023156614, + "loss": 0.9996, + "step": 3046 + }, + { + "epoch": 0.649957337883959, + "grad_norm": 1.211945613492869, + "learning_rate": 0.00014854357099329678, + "loss": 1.0047, + "step": 3047 + }, + { + "epoch": 0.6501706484641638, + "grad_norm": 0.7198822070687035, + "learning_rate": 0.00014859232175502742, + "loss": 1.0342, + "step": 3048 + }, + { + "epoch": 0.6503839590443686, + "grad_norm": 0.6893918713454308, + "learning_rate": 0.0001486410725167581, + "loss": 1.0043, + "step": 3049 + }, + { + "epoch": 0.6505972696245734, + "grad_norm": 0.754858226933105, + "learning_rate": 0.00014868982327848873, + "loss": 1.0241, + "step": 3050 + }, + { + "epoch": 0.6508105802047781, + "grad_norm": 0.6124617006441605, + "learning_rate": 0.0001487385740402194, + "loss": 0.9796, + "step": 3051 + }, + { + "epoch": 0.6510238907849829, + "grad_norm": 0.5001515250620437, + "learning_rate": 0.00014878732480195004, + "loss": 1.0392, + "step": 3052 + }, + { + "epoch": 0.6512372013651877, + "grad_norm": 0.5377671136958995, + "learning_rate": 0.0001488360755636807, + "loss": 1.0015, + "step": 3053 + }, + { + "epoch": 0.6514505119453925, + "grad_norm": 0.61861563718548, + "learning_rate": 0.00014888482632541134, + "loss": 0.968, + "step": 3054 + }, + { + "epoch": 0.6516638225255973, + "grad_norm": 0.712340828785949, + "learning_rate": 0.000148933577087142, + "loss": 0.9886, + "step": 3055 + }, + { + "epoch": 0.6518771331058021, + "grad_norm": 0.8768197231645722, + "learning_rate": 0.00014898232784887265, + "loss": 1.028, + "step": 3056 + }, + { + "epoch": 0.6520904436860068, + "grad_norm": 1.0614251439981792, + "learning_rate": 0.00014903107861060331, + "loss": 1.004, + "step": 3057 + }, + { + "epoch": 0.6523037542662116, + "grad_norm": 0.9918497806730283, + "learning_rate": 0.00014907982937233395, + "loss": 0.9898, + "step": 3058 + }, + { + "epoch": 0.6525170648464164, + "grad_norm": 1.0831324725424787, + "learning_rate": 0.00014912858013406462, + "loss": 0.9766, + "step": 3059 + }, + { + "epoch": 0.6527303754266212, + "grad_norm": 0.9940587861721281, + "learning_rate": 0.00014917733089579526, + "loss": 1.0241, + "step": 3060 + }, + { + "epoch": 0.652943686006826, + "grad_norm": 1.1508016549284474, + "learning_rate": 0.00014922608165752593, + "loss": 0.9891, + "step": 3061 + }, + { + "epoch": 0.6531569965870307, + "grad_norm": 0.9267715568976734, + "learning_rate": 0.00014927483241925657, + "loss": 1.0039, + "step": 3062 + }, + { + "epoch": 0.6533703071672355, + "grad_norm": 0.7773037773102166, + "learning_rate": 0.0001493235831809872, + "loss": 0.9891, + "step": 3063 + }, + { + "epoch": 0.6535836177474402, + "grad_norm": 0.53352556738301, + "learning_rate": 0.00014937233394271787, + "loss": 1.0074, + "step": 3064 + }, + { + "epoch": 0.653796928327645, + "grad_norm": 0.5449983726552493, + "learning_rate": 0.0001494210847044485, + "loss": 1.0089, + "step": 3065 + }, + { + "epoch": 0.6540102389078498, + "grad_norm": 0.4993642333905267, + "learning_rate": 0.00014946983546617918, + "loss": 1.0061, + "step": 3066 + }, + { + "epoch": 0.6542235494880546, + "grad_norm": 0.4200251702718383, + "learning_rate": 0.00014951858622790982, + "loss": 1.0056, + "step": 3067 + }, + { + "epoch": 0.6544368600682594, + "grad_norm": 0.5179127025802595, + "learning_rate": 0.00014956733698964046, + "loss": 1.0227, + "step": 3068 + }, + { + "epoch": 0.6546501706484642, + "grad_norm": 0.6104175581870325, + "learning_rate": 0.00014961608775137112, + "loss": 0.9979, + "step": 3069 + }, + { + "epoch": 0.6548634812286689, + "grad_norm": 0.7074588636158773, + "learning_rate": 0.00014966483851310176, + "loss": 0.9895, + "step": 3070 + }, + { + "epoch": 0.6550767918088737, + "grad_norm": 0.8014958186805718, + "learning_rate": 0.00014971358927483243, + "loss": 0.9745, + "step": 3071 + }, + { + "epoch": 0.6552901023890785, + "grad_norm": 0.8456937789267588, + "learning_rate": 0.0001497623400365631, + "loss": 0.9974, + "step": 3072 + }, + { + "epoch": 0.6555034129692833, + "grad_norm": 0.8173295057763613, + "learning_rate": 0.00014981109079829374, + "loss": 1.0311, + "step": 3073 + }, + { + "epoch": 0.6557167235494881, + "grad_norm": 0.8439389416593421, + "learning_rate": 0.0001498598415600244, + "loss": 1.0127, + "step": 3074 + }, + { + "epoch": 0.6559300341296929, + "grad_norm": 0.942569846312106, + "learning_rate": 0.00014990859232175504, + "loss": 1.0011, + "step": 3075 + }, + { + "epoch": 0.6561433447098977, + "grad_norm": 1.114623318216148, + "learning_rate": 0.0001499573430834857, + "loss": 1.0252, + "step": 3076 + }, + { + "epoch": 0.6563566552901023, + "grad_norm": 0.9022080341146808, + "learning_rate": 0.00015000609384521635, + "loss": 0.9558, + "step": 3077 + }, + { + "epoch": 0.6565699658703071, + "grad_norm": 0.7076982381768915, + "learning_rate": 0.000150054844606947, + "loss": 1.0098, + "step": 3078 + }, + { + "epoch": 0.6567832764505119, + "grad_norm": 0.5318955192119358, + "learning_rate": 0.00015010359536867765, + "loss": 0.9791, + "step": 3079 + }, + { + "epoch": 0.6569965870307167, + "grad_norm": 0.5251529165565744, + "learning_rate": 0.0001501523461304083, + "loss": 0.997, + "step": 3080 + }, + { + "epoch": 0.6572098976109215, + "grad_norm": 0.5705424758385198, + "learning_rate": 0.00015020109689213896, + "loss": 1.0166, + "step": 3081 + }, + { + "epoch": 0.6574232081911263, + "grad_norm": 0.4730810716407018, + "learning_rate": 0.0001502498476538696, + "loss": 1.0135, + "step": 3082 + }, + { + "epoch": 0.6576365187713311, + "grad_norm": 0.5722780103457167, + "learning_rate": 0.00015029859841560024, + "loss": 1.0293, + "step": 3083 + }, + { + "epoch": 0.6578498293515358, + "grad_norm": 0.6773054959124343, + "learning_rate": 0.0001503473491773309, + "loss": 1.0213, + "step": 3084 + }, + { + "epoch": 0.6580631399317406, + "grad_norm": 0.6319391990200284, + "learning_rate": 0.00015039609993906154, + "loss": 1.0119, + "step": 3085 + }, + { + "epoch": 0.6582764505119454, + "grad_norm": 0.7687475702325799, + "learning_rate": 0.0001504448507007922, + "loss": 0.9928, + "step": 3086 + }, + { + "epoch": 0.6584897610921502, + "grad_norm": 1.010368987494995, + "learning_rate": 0.00015049360146252285, + "loss": 1.0059, + "step": 3087 + }, + { + "epoch": 0.658703071672355, + "grad_norm": 1.1627958331803512, + "learning_rate": 0.00015054235222425352, + "loss": 0.9916, + "step": 3088 + }, + { + "epoch": 0.6589163822525598, + "grad_norm": 0.7372036145144707, + "learning_rate": 0.00015059110298598418, + "loss": 0.9737, + "step": 3089 + }, + { + "epoch": 0.6591296928327645, + "grad_norm": 0.6449419629763412, + "learning_rate": 0.00015063985374771482, + "loss": 1.0065, + "step": 3090 + }, + { + "epoch": 0.6593430034129693, + "grad_norm": 0.6375841331329649, + "learning_rate": 0.0001506886045094455, + "loss": 1.0123, + "step": 3091 + }, + { + "epoch": 0.659556313993174, + "grad_norm": 0.7815780475770493, + "learning_rate": 0.00015073735527117613, + "loss": 0.9758, + "step": 3092 + }, + { + "epoch": 0.6597696245733788, + "grad_norm": 1.1469157457548391, + "learning_rate": 0.0001507861060329068, + "loss": 0.9936, + "step": 3093 + }, + { + "epoch": 0.6599829351535836, + "grad_norm": 1.0819107955519343, + "learning_rate": 0.00015083485679463744, + "loss": 1.002, + "step": 3094 + }, + { + "epoch": 0.6601962457337884, + "grad_norm": 0.7802008099003481, + "learning_rate": 0.00015088360755636807, + "loss": 0.9958, + "step": 3095 + }, + { + "epoch": 0.6604095563139932, + "grad_norm": 0.5678718020016171, + "learning_rate": 0.00015093235831809874, + "loss": 0.9949, + "step": 3096 + }, + { + "epoch": 0.6606228668941979, + "grad_norm": 0.5342154886440005, + "learning_rate": 0.00015098110907982938, + "loss": 1.0308, + "step": 3097 + }, + { + "epoch": 0.6608361774744027, + "grad_norm": 0.5608214967807688, + "learning_rate": 0.00015102985984156005, + "loss": 1.004, + "step": 3098 + }, + { + "epoch": 0.6610494880546075, + "grad_norm": 0.5357153803868399, + "learning_rate": 0.0001510786106032907, + "loss": 0.9921, + "step": 3099 + }, + { + "epoch": 0.6612627986348123, + "grad_norm": 0.5943486610836453, + "learning_rate": 0.00015112736136502133, + "loss": 1.0139, + "step": 3100 + }, + { + "epoch": 0.6614761092150171, + "grad_norm": 0.7028253503819933, + "learning_rate": 0.000151176112126752, + "loss": 1.0183, + "step": 3101 + }, + { + "epoch": 0.6616894197952219, + "grad_norm": 0.7969359732009862, + "learning_rate": 0.00015122486288848263, + "loss": 1.0059, + "step": 3102 + }, + { + "epoch": 0.6619027303754266, + "grad_norm": 0.7667428464491803, + "learning_rate": 0.0001512736136502133, + "loss": 1.0111, + "step": 3103 + }, + { + "epoch": 0.6621160409556314, + "grad_norm": 0.6576874106168371, + "learning_rate": 0.00015132236441194394, + "loss": 0.9931, + "step": 3104 + }, + { + "epoch": 0.6623293515358362, + "grad_norm": 0.6793661265490344, + "learning_rate": 0.0001513711151736746, + "loss": 1.0132, + "step": 3105 + }, + { + "epoch": 0.662542662116041, + "grad_norm": 0.6935775328380502, + "learning_rate": 0.00015141986593540524, + "loss": 0.9774, + "step": 3106 + }, + { + "epoch": 0.6627559726962458, + "grad_norm": 0.6178842502014023, + "learning_rate": 0.0001514686166971359, + "loss": 0.9679, + "step": 3107 + }, + { + "epoch": 0.6629692832764505, + "grad_norm": 0.47815083672287834, + "learning_rate": 0.00015151736745886658, + "loss": 0.9701, + "step": 3108 + }, + { + "epoch": 0.6631825938566553, + "grad_norm": 0.47920179666646906, + "learning_rate": 0.00015156611822059722, + "loss": 1.0103, + "step": 3109 + }, + { + "epoch": 0.66339590443686, + "grad_norm": 0.6079939709554514, + "learning_rate": 0.00015161486898232786, + "loss": 0.9866, + "step": 3110 + }, + { + "epoch": 0.6636092150170648, + "grad_norm": 0.7227584775340747, + "learning_rate": 0.00015166361974405852, + "loss": 1.0057, + "step": 3111 + }, + { + "epoch": 0.6638225255972696, + "grad_norm": 0.9608849960537345, + "learning_rate": 0.00015171237050578916, + "loss": 1.0014, + "step": 3112 + }, + { + "epoch": 0.6640358361774744, + "grad_norm": 0.8947457701074644, + "learning_rate": 0.00015176112126751983, + "loss": 0.9986, + "step": 3113 + }, + { + "epoch": 0.6642491467576792, + "grad_norm": 0.7628224341784896, + "learning_rate": 0.00015180987202925047, + "loss": 0.9804, + "step": 3114 + }, + { + "epoch": 0.664462457337884, + "grad_norm": 0.6704042851654123, + "learning_rate": 0.0001518586227909811, + "loss": 1.0094, + "step": 3115 + }, + { + "epoch": 0.6646757679180887, + "grad_norm": 0.6132854918030802, + "learning_rate": 0.00015190737355271177, + "loss": 1.0077, + "step": 3116 + }, + { + "epoch": 0.6648890784982935, + "grad_norm": 0.5916948571680198, + "learning_rate": 0.00015195612431444241, + "loss": 0.9818, + "step": 3117 + }, + { + "epoch": 0.6651023890784983, + "grad_norm": 0.8691155384385736, + "learning_rate": 0.00015200487507617308, + "loss": 1.0118, + "step": 3118 + }, + { + "epoch": 0.6653156996587031, + "grad_norm": 1.0084806323277637, + "learning_rate": 0.00015205362583790372, + "loss": 1.0085, + "step": 3119 + }, + { + "epoch": 0.6655290102389079, + "grad_norm": 0.8850600122568162, + "learning_rate": 0.0001521023765996344, + "loss": 1.018, + "step": 3120 + }, + { + "epoch": 0.6657423208191127, + "grad_norm": 0.8042560392058685, + "learning_rate": 0.00015215112736136503, + "loss": 1.0353, + "step": 3121 + }, + { + "epoch": 0.6659556313993175, + "grad_norm": 0.8093583858482368, + "learning_rate": 0.0001521998781230957, + "loss": 1.0174, + "step": 3122 + }, + { + "epoch": 0.6661689419795221, + "grad_norm": 0.9027227978304314, + "learning_rate": 0.00015224862888482633, + "loss": 1.0002, + "step": 3123 + }, + { + "epoch": 0.6663822525597269, + "grad_norm": 1.1671333819838174, + "learning_rate": 0.000152297379646557, + "loss": 1.0208, + "step": 3124 + }, + { + "epoch": 0.6665955631399317, + "grad_norm": 0.8675561247091272, + "learning_rate": 0.00015234613040828764, + "loss": 1.0287, + "step": 3125 + }, + { + "epoch": 0.6668088737201365, + "grad_norm": 0.7357369128254575, + "learning_rate": 0.0001523948811700183, + "loss": 1.0315, + "step": 3126 + }, + { + "epoch": 0.6670221843003413, + "grad_norm": 0.5700306796099611, + "learning_rate": 0.00015244363193174894, + "loss": 0.9953, + "step": 3127 + }, + { + "epoch": 0.6672354948805461, + "grad_norm": 0.4673402137544409, + "learning_rate": 0.0001524923826934796, + "loss": 0.9869, + "step": 3128 + }, + { + "epoch": 0.6674488054607508, + "grad_norm": 0.47392270937786907, + "learning_rate": 0.00015254113345521025, + "loss": 0.9818, + "step": 3129 + }, + { + "epoch": 0.6676621160409556, + "grad_norm": 0.5045037449995946, + "learning_rate": 0.0001525898842169409, + "loss": 0.9776, + "step": 3130 + }, + { + "epoch": 0.6678754266211604, + "grad_norm": 0.5855198296886814, + "learning_rate": 0.00015263863497867156, + "loss": 0.993, + "step": 3131 + }, + { + "epoch": 0.6680887372013652, + "grad_norm": 0.6776918650850734, + "learning_rate": 0.0001526873857404022, + "loss": 1.0234, + "step": 3132 + }, + { + "epoch": 0.66830204778157, + "grad_norm": 0.8817403574480928, + "learning_rate": 0.00015273613650213286, + "loss": 1.0066, + "step": 3133 + }, + { + "epoch": 0.6685153583617748, + "grad_norm": 0.9632465183532638, + "learning_rate": 0.0001527848872638635, + "loss": 0.9997, + "step": 3134 + }, + { + "epoch": 0.6687286689419796, + "grad_norm": 1.0089504157546985, + "learning_rate": 0.00015283363802559417, + "loss": 0.9897, + "step": 3135 + }, + { + "epoch": 0.6689419795221843, + "grad_norm": 1.024496405052948, + "learning_rate": 0.0001528823887873248, + "loss": 1.0177, + "step": 3136 + }, + { + "epoch": 0.669155290102389, + "grad_norm": 0.9365898358612588, + "learning_rate": 0.00015293113954905545, + "loss": 1.0222, + "step": 3137 + }, + { + "epoch": 0.6693686006825939, + "grad_norm": 0.8399237052285464, + "learning_rate": 0.00015297989031078611, + "loss": 0.9812, + "step": 3138 + }, + { + "epoch": 0.6695819112627986, + "grad_norm": 0.8234094012763916, + "learning_rate": 0.00015302864107251678, + "loss": 1.0304, + "step": 3139 + }, + { + "epoch": 0.6697952218430034, + "grad_norm": 0.7904926556462444, + "learning_rate": 0.00015307739183424742, + "loss": 0.984, + "step": 3140 + }, + { + "epoch": 0.6700085324232082, + "grad_norm": 0.685407018258112, + "learning_rate": 0.00015312614259597809, + "loss": 1.009, + "step": 3141 + }, + { + "epoch": 0.6702218430034129, + "grad_norm": 0.5322843975212979, + "learning_rate": 0.00015317489335770873, + "loss": 1.0308, + "step": 3142 + }, + { + "epoch": 0.6704351535836177, + "grad_norm": 0.40271548963711995, + "learning_rate": 0.0001532236441194394, + "loss": 1.0123, + "step": 3143 + }, + { + "epoch": 0.6706484641638225, + "grad_norm": 0.33848764285497607, + "learning_rate": 0.00015327239488117003, + "loss": 0.9718, + "step": 3144 + }, + { + "epoch": 0.6708617747440273, + "grad_norm": 0.46798642206367214, + "learning_rate": 0.0001533211456429007, + "loss": 1.0056, + "step": 3145 + }, + { + "epoch": 0.6710750853242321, + "grad_norm": 0.5768771728749653, + "learning_rate": 0.00015336989640463134, + "loss": 0.9788, + "step": 3146 + }, + { + "epoch": 0.6712883959044369, + "grad_norm": 0.7163544871738943, + "learning_rate": 0.00015341864716636198, + "loss": 0.9934, + "step": 3147 + }, + { + "epoch": 0.6715017064846417, + "grad_norm": 0.9305729290315068, + "learning_rate": 0.00015346739792809264, + "loss": 1.0136, + "step": 3148 + }, + { + "epoch": 0.6717150170648464, + "grad_norm": 1.1179774366897264, + "learning_rate": 0.00015351614868982328, + "loss": 1.0009, + "step": 3149 + }, + { + "epoch": 0.6719283276450512, + "grad_norm": 0.8835528086622599, + "learning_rate": 0.00015356489945155395, + "loss": 0.9766, + "step": 3150 + }, + { + "epoch": 0.672141638225256, + "grad_norm": 0.7656132268440314, + "learning_rate": 0.0001536136502132846, + "loss": 0.9939, + "step": 3151 + }, + { + "epoch": 0.6723549488054608, + "grad_norm": 0.7229514242204209, + "learning_rate": 0.00015366240097501523, + "loss": 0.9951, + "step": 3152 + }, + { + "epoch": 0.6725682593856656, + "grad_norm": 0.6885328880651083, + "learning_rate": 0.0001537111517367459, + "loss": 0.9581, + "step": 3153 + }, + { + "epoch": 0.6727815699658704, + "grad_norm": 0.6766919915311225, + "learning_rate": 0.00015375990249847653, + "loss": 0.9851, + "step": 3154 + }, + { + "epoch": 0.672994880546075, + "grad_norm": 0.6124567033791302, + "learning_rate": 0.0001538086532602072, + "loss": 0.9728, + "step": 3155 + }, + { + "epoch": 0.6732081911262798, + "grad_norm": 0.6146874632809002, + "learning_rate": 0.00015385740402193787, + "loss": 1.0308, + "step": 3156 + }, + { + "epoch": 0.6734215017064846, + "grad_norm": 0.5833527536528769, + "learning_rate": 0.0001539061547836685, + "loss": 0.9959, + "step": 3157 + }, + { + "epoch": 0.6736348122866894, + "grad_norm": 0.5415837366603355, + "learning_rate": 0.00015395490554539917, + "loss": 1.0131, + "step": 3158 + }, + { + "epoch": 0.6738481228668942, + "grad_norm": 0.5452649756718133, + "learning_rate": 0.00015400365630712981, + "loss": 1.0006, + "step": 3159 + }, + { + "epoch": 0.674061433447099, + "grad_norm": 1.364404350068248, + "learning_rate": 0.00015405240706886048, + "loss": 1.0296, + "step": 3160 + }, + { + "epoch": 0.6742747440273038, + "grad_norm": 0.3989895446234423, + "learning_rate": 0.00015410115783059112, + "loss": 1.0102, + "step": 3161 + }, + { + "epoch": 0.6744880546075085, + "grad_norm": 0.7861882134670822, + "learning_rate": 0.00015414990859232176, + "loss": 0.9723, + "step": 3162 + }, + { + "epoch": 0.6747013651877133, + "grad_norm": 1.0514792258101122, + "learning_rate": 0.00015419865935405243, + "loss": 1.0161, + "step": 3163 + }, + { + "epoch": 0.6749146757679181, + "grad_norm": 1.0046627119372722, + "learning_rate": 0.00015424741011578307, + "loss": 0.9838, + "step": 3164 + }, + { + "epoch": 0.6751279863481229, + "grad_norm": 0.92933662953822, + "learning_rate": 0.00015429616087751373, + "loss": 0.974, + "step": 3165 + }, + { + "epoch": 0.6753412969283277, + "grad_norm": 0.9263560871159257, + "learning_rate": 0.00015434491163924437, + "loss": 1.0338, + "step": 3166 + }, + { + "epoch": 0.6755546075085325, + "grad_norm": 0.9657415633202622, + "learning_rate": 0.000154393662400975, + "loss": 0.9994, + "step": 3167 + }, + { + "epoch": 0.6757679180887372, + "grad_norm": 1.0080835138264062, + "learning_rate": 0.00015444241316270568, + "loss": 1.0012, + "step": 3168 + }, + { + "epoch": 0.675981228668942, + "grad_norm": 1.0818753594202541, + "learning_rate": 0.00015449116392443632, + "loss": 1.0131, + "step": 3169 + }, + { + "epoch": 0.6761945392491467, + "grad_norm": 1.0231556527184298, + "learning_rate": 0.00015453991468616698, + "loss": 1.0192, + "step": 3170 + }, + { + "epoch": 0.6764078498293515, + "grad_norm": 0.7732558144875956, + "learning_rate": 0.00015458866544789762, + "loss": 0.9839, + "step": 3171 + }, + { + "epoch": 0.6766211604095563, + "grad_norm": 0.8670412640148848, + "learning_rate": 0.0001546374162096283, + "loss": 1.0375, + "step": 3172 + }, + { + "epoch": 0.6768344709897611, + "grad_norm": 0.9595021013284822, + "learning_rate": 0.00015468616697135893, + "loss": 0.968, + "step": 3173 + }, + { + "epoch": 0.6770477815699659, + "grad_norm": 0.8829602023408317, + "learning_rate": 0.0001547349177330896, + "loss": 1.0322, + "step": 3174 + }, + { + "epoch": 0.6772610921501706, + "grad_norm": 0.8746326059111046, + "learning_rate": 0.00015478366849482026, + "loss": 1.01, + "step": 3175 + }, + { + "epoch": 0.6774744027303754, + "grad_norm": 0.9274417889444856, + "learning_rate": 0.0001548324192565509, + "loss": 1.0276, + "step": 3176 + }, + { + "epoch": 0.6776877133105802, + "grad_norm": 0.8968534461016812, + "learning_rate": 0.00015488117001828157, + "loss": 1.0101, + "step": 3177 + }, + { + "epoch": 0.677901023890785, + "grad_norm": 0.8025986969992973, + "learning_rate": 0.0001549299207800122, + "loss": 0.9855, + "step": 3178 + }, + { + "epoch": 0.6781143344709898, + "grad_norm": 0.7130526663137946, + "learning_rate": 0.00015497867154174285, + "loss": 0.9894, + "step": 3179 + }, + { + "epoch": 0.6783276450511946, + "grad_norm": 0.6236011303899351, + "learning_rate": 0.0001550274223034735, + "loss": 0.9708, + "step": 3180 + }, + { + "epoch": 0.6785409556313993, + "grad_norm": 0.6772124279660111, + "learning_rate": 0.00015507617306520415, + "loss": 1.0041, + "step": 3181 + }, + { + "epoch": 0.6787542662116041, + "grad_norm": 0.8371090416188882, + "learning_rate": 0.00015512492382693482, + "loss": 1.0245, + "step": 3182 + }, + { + "epoch": 0.6789675767918089, + "grad_norm": 1.1128520242562625, + "learning_rate": 0.00015517367458866546, + "loss": 1.0005, + "step": 3183 + }, + { + "epoch": 0.6791808873720137, + "grad_norm": 1.289500705044223, + "learning_rate": 0.0001552224253503961, + "loss": 1.0074, + "step": 3184 + }, + { + "epoch": 0.6793941979522184, + "grad_norm": 0.7213918011629713, + "learning_rate": 0.00015527117611212677, + "loss": 1.0393, + "step": 3185 + }, + { + "epoch": 0.6796075085324232, + "grad_norm": 0.5464892069455464, + "learning_rate": 0.0001553199268738574, + "loss": 1.0249, + "step": 3186 + }, + { + "epoch": 0.679820819112628, + "grad_norm": 0.7072508695721, + "learning_rate": 0.00015536867763558807, + "loss": 1.0063, + "step": 3187 + }, + { + "epoch": 0.6800341296928327, + "grad_norm": 0.9736096711034178, + "learning_rate": 0.0001554174283973187, + "loss": 1.004, + "step": 3188 + }, + { + "epoch": 0.6802474402730375, + "grad_norm": 1.2266473050030795, + "learning_rate": 0.00015546617915904938, + "loss": 0.9781, + "step": 3189 + }, + { + "epoch": 0.6804607508532423, + "grad_norm": 0.7636514400886739, + "learning_rate": 0.00015551492992078002, + "loss": 1.048, + "step": 3190 + }, + { + "epoch": 0.6806740614334471, + "grad_norm": 0.7744007954814401, + "learning_rate": 0.00015556368068251068, + "loss": 1.0186, + "step": 3191 + }, + { + "epoch": 0.6808873720136519, + "grad_norm": 0.6999270003119663, + "learning_rate": 0.00015561243144424135, + "loss": 0.9833, + "step": 3192 + }, + { + "epoch": 0.6811006825938567, + "grad_norm": 0.7582781931572821, + "learning_rate": 0.000155661182205972, + "loss": 1.0177, + "step": 3193 + }, + { + "epoch": 0.6813139931740614, + "grad_norm": 0.8830956281440429, + "learning_rate": 0.00015570993296770263, + "loss": 0.9826, + "step": 3194 + }, + { + "epoch": 0.6815273037542662, + "grad_norm": 0.9793127329426352, + "learning_rate": 0.0001557586837294333, + "loss": 0.9746, + "step": 3195 + }, + { + "epoch": 0.681740614334471, + "grad_norm": 1.110398757012261, + "learning_rate": 0.00015580743449116393, + "loss": 0.9954, + "step": 3196 + }, + { + "epoch": 0.6819539249146758, + "grad_norm": 0.8433020399658697, + "learning_rate": 0.0001558561852528946, + "loss": 1.0308, + "step": 3197 + }, + { + "epoch": 0.6821672354948806, + "grad_norm": 0.5874007993385315, + "learning_rate": 0.00015590493601462524, + "loss": 1.0347, + "step": 3198 + }, + { + "epoch": 0.6823805460750854, + "grad_norm": 0.41178271689010293, + "learning_rate": 0.00015595368677635588, + "loss": 1.0036, + "step": 3199 + }, + { + "epoch": 0.6825938566552902, + "grad_norm": 0.4006936783378365, + "learning_rate": 0.00015600243753808655, + "loss": 1.0077, + "step": 3200 + }, + { + "epoch": 0.6828071672354948, + "grad_norm": 0.547921679280663, + "learning_rate": 0.00015605118829981719, + "loss": 1.0146, + "step": 3201 + }, + { + "epoch": 0.6830204778156996, + "grad_norm": 0.518358584829232, + "learning_rate": 0.00015609993906154785, + "loss": 0.9938, + "step": 3202 + }, + { + "epoch": 0.6832337883959044, + "grad_norm": 0.5217241909930813, + "learning_rate": 0.0001561486898232785, + "loss": 0.9858, + "step": 3203 + }, + { + "epoch": 0.6834470989761092, + "grad_norm": 0.6514140193275159, + "learning_rate": 0.00015619744058500916, + "loss": 0.9719, + "step": 3204 + }, + { + "epoch": 0.683660409556314, + "grad_norm": 0.6853294870774257, + "learning_rate": 0.0001562461913467398, + "loss": 1.0066, + "step": 3205 + }, + { + "epoch": 0.6838737201365188, + "grad_norm": 0.7701469182468391, + "learning_rate": 0.00015629494210847046, + "loss": 0.9996, + "step": 3206 + }, + { + "epoch": 0.6840870307167235, + "grad_norm": 0.8744649047663984, + "learning_rate": 0.0001563436928702011, + "loss": 1.0102, + "step": 3207 + }, + { + "epoch": 0.6843003412969283, + "grad_norm": 0.8460392163890273, + "learning_rate": 0.00015639244363193177, + "loss": 0.9911, + "step": 3208 + }, + { + "epoch": 0.6845136518771331, + "grad_norm": 0.5768466448890113, + "learning_rate": 0.0001564411943936624, + "loss": 0.9737, + "step": 3209 + }, + { + "epoch": 0.6847269624573379, + "grad_norm": 0.3666332722655447, + "learning_rate": 0.00015648994515539308, + "loss": 1.0248, + "step": 3210 + }, + { + "epoch": 0.6849402730375427, + "grad_norm": 0.47432959910531824, + "learning_rate": 0.00015653869591712372, + "loss": 0.9859, + "step": 3211 + }, + { + "epoch": 0.6851535836177475, + "grad_norm": 0.45172387114999607, + "learning_rate": 0.00015658744667885438, + "loss": 0.9952, + "step": 3212 + }, + { + "epoch": 0.6853668941979523, + "grad_norm": 0.43103172789552957, + "learning_rate": 0.00015663619744058502, + "loss": 0.9984, + "step": 3213 + }, + { + "epoch": 0.685580204778157, + "grad_norm": 0.5082189985024436, + "learning_rate": 0.00015668494820231566, + "loss": 0.9811, + "step": 3214 + }, + { + "epoch": 0.6857935153583617, + "grad_norm": 0.5866162528390091, + "learning_rate": 0.00015673369896404633, + "loss": 1.0405, + "step": 3215 + }, + { + "epoch": 0.6860068259385665, + "grad_norm": 0.6385705303245955, + "learning_rate": 0.00015678244972577697, + "loss": 0.9907, + "step": 3216 + }, + { + "epoch": 0.6862201365187713, + "grad_norm": 1.0294380477984904, + "learning_rate": 0.00015683120048750763, + "loss": 0.997, + "step": 3217 + }, + { + "epoch": 0.6864334470989761, + "grad_norm": 0.515253836278396, + "learning_rate": 0.00015687995124923827, + "loss": 1.0379, + "step": 3218 + }, + { + "epoch": 0.6866467576791809, + "grad_norm": 0.6168534339929942, + "learning_rate": 0.00015692870201096894, + "loss": 1.0291, + "step": 3219 + }, + { + "epoch": 0.6868600682593856, + "grad_norm": 0.8087780674093716, + "learning_rate": 0.00015697745277269958, + "loss": 1.0124, + "step": 3220 + }, + { + "epoch": 0.6870733788395904, + "grad_norm": 0.9945552574418933, + "learning_rate": 0.00015702620353443022, + "loss": 0.9862, + "step": 3221 + }, + { + "epoch": 0.6872866894197952, + "grad_norm": 1.286558133819763, + "learning_rate": 0.00015707495429616089, + "loss": 0.9873, + "step": 3222 + }, + { + "epoch": 0.6875, + "grad_norm": 0.7290238756537387, + "learning_rate": 0.00015712370505789155, + "loss": 1.0462, + "step": 3223 + }, + { + "epoch": 0.6877133105802048, + "grad_norm": 0.6790154825859822, + "learning_rate": 0.0001571724558196222, + "loss": 1.023, + "step": 3224 + }, + { + "epoch": 0.6879266211604096, + "grad_norm": 0.8151199160216838, + "learning_rate": 0.00015722120658135286, + "loss": 0.9962, + "step": 3225 + }, + { + "epoch": 0.6881399317406144, + "grad_norm": 1.0116313269481745, + "learning_rate": 0.0001572699573430835, + "loss": 1.0211, + "step": 3226 + }, + { + "epoch": 0.6883532423208191, + "grad_norm": 1.1378502712259702, + "learning_rate": 0.00015731870810481416, + "loss": 1.015, + "step": 3227 + }, + { + "epoch": 0.6885665529010239, + "grad_norm": 0.8707599763288908, + "learning_rate": 0.0001573674588665448, + "loss": 0.9747, + "step": 3228 + }, + { + "epoch": 0.6887798634812287, + "grad_norm": 0.8755624335172311, + "learning_rate": 0.00015741620962827547, + "loss": 0.982, + "step": 3229 + }, + { + "epoch": 0.6889931740614335, + "grad_norm": 0.7944638222510858, + "learning_rate": 0.0001574649603900061, + "loss": 0.9897, + "step": 3230 + }, + { + "epoch": 0.6892064846416383, + "grad_norm": 0.6773518103256577, + "learning_rate": 0.00015751371115173675, + "loss": 0.9989, + "step": 3231 + }, + { + "epoch": 0.689419795221843, + "grad_norm": 0.5705300858146026, + "learning_rate": 0.00015756246191346742, + "loss": 1.0305, + "step": 3232 + }, + { + "epoch": 0.6896331058020477, + "grad_norm": 0.46003631869307327, + "learning_rate": 0.00015761121267519806, + "loss": 0.9983, + "step": 3233 + }, + { + "epoch": 0.6898464163822525, + "grad_norm": 0.49095359696832175, + "learning_rate": 0.00015765996343692872, + "loss": 0.9914, + "step": 3234 + }, + { + "epoch": 0.6900597269624573, + "grad_norm": 0.5936162166217183, + "learning_rate": 0.00015770871419865936, + "loss": 1.0096, + "step": 3235 + }, + { + "epoch": 0.6902730375426621, + "grad_norm": 0.5257829423541421, + "learning_rate": 0.00015775746496039, + "loss": 1.0135, + "step": 3236 + }, + { + "epoch": 0.6904863481228669, + "grad_norm": 0.4346854688004096, + "learning_rate": 0.00015780621572212067, + "loss": 1.0008, + "step": 3237 + }, + { + "epoch": 0.6906996587030717, + "grad_norm": 0.47287856230877395, + "learning_rate": 0.0001578549664838513, + "loss": 0.9995, + "step": 3238 + }, + { + "epoch": 0.6909129692832765, + "grad_norm": 0.6049510222968798, + "learning_rate": 0.00015790371724558197, + "loss": 1.0369, + "step": 3239 + }, + { + "epoch": 0.6911262798634812, + "grad_norm": 0.6933604824550947, + "learning_rate": 0.00015795246800731264, + "loss": 1.035, + "step": 3240 + }, + { + "epoch": 0.691339590443686, + "grad_norm": 0.7695921313606285, + "learning_rate": 0.00015800121876904328, + "loss": 1.003, + "step": 3241 + }, + { + "epoch": 0.6915529010238908, + "grad_norm": 0.9089360943778906, + "learning_rate": 0.00015804996953077395, + "loss": 1.0091, + "step": 3242 + }, + { + "epoch": 0.6917662116040956, + "grad_norm": 1.0803197679631116, + "learning_rate": 0.00015809872029250459, + "loss": 1.0189, + "step": 3243 + }, + { + "epoch": 0.6919795221843004, + "grad_norm": 1.0799388893486566, + "learning_rate": 0.00015814747105423525, + "loss": 1.0036, + "step": 3244 + }, + { + "epoch": 0.6921928327645052, + "grad_norm": 0.9882704115368522, + "learning_rate": 0.0001581962218159659, + "loss": 1.0051, + "step": 3245 + }, + { + "epoch": 0.6924061433447098, + "grad_norm": 0.9463570478537957, + "learning_rate": 0.00015824497257769653, + "loss": 1.0144, + "step": 3246 + }, + { + "epoch": 0.6926194539249146, + "grad_norm": 0.7936187198411591, + "learning_rate": 0.0001582937233394272, + "loss": 0.9702, + "step": 3247 + }, + { + "epoch": 0.6928327645051194, + "grad_norm": 0.6108203223679005, + "learning_rate": 0.00015834247410115784, + "loss": 1.0035, + "step": 3248 + }, + { + "epoch": 0.6930460750853242, + "grad_norm": 0.9277267381083824, + "learning_rate": 0.0001583912248628885, + "loss": 0.947, + "step": 3249 + }, + { + "epoch": 0.693259385665529, + "grad_norm": 0.5176488888466829, + "learning_rate": 0.00015843997562461914, + "loss": 0.9839, + "step": 3250 + }, + { + "epoch": 0.6934726962457338, + "grad_norm": 0.6795558566270629, + "learning_rate": 0.00015848872638634978, + "loss": 1.0118, + "step": 3251 + }, + { + "epoch": 0.6936860068259386, + "grad_norm": 0.8993718870914892, + "learning_rate": 0.00015853747714808045, + "loss": 0.9978, + "step": 3252 + }, + { + "epoch": 0.6938993174061433, + "grad_norm": 1.39582220779037, + "learning_rate": 0.0001585862279098111, + "loss": 1.0243, + "step": 3253 + }, + { + "epoch": 0.6941126279863481, + "grad_norm": 0.7415136898019922, + "learning_rate": 0.00015863497867154176, + "loss": 0.9805, + "step": 3254 + }, + { + "epoch": 0.6943259385665529, + "grad_norm": 0.8178874742210404, + "learning_rate": 0.0001586837294332724, + "loss": 1.006, + "step": 3255 + }, + { + "epoch": 0.6945392491467577, + "grad_norm": 1.0393121831534178, + "learning_rate": 0.00015873248019500306, + "loss": 1.004, + "step": 3256 + }, + { + "epoch": 0.6947525597269625, + "grad_norm": 0.8612825815469088, + "learning_rate": 0.0001587812309567337, + "loss": 1.0441, + "step": 3257 + }, + { + "epoch": 0.6949658703071673, + "grad_norm": 0.7484506782112976, + "learning_rate": 0.00015882998171846437, + "loss": 0.9941, + "step": 3258 + }, + { + "epoch": 0.695179180887372, + "grad_norm": 1.0200538889112118, + "learning_rate": 0.00015887873248019503, + "loss": 1.0381, + "step": 3259 + }, + { + "epoch": 0.6953924914675768, + "grad_norm": 0.9823231567753126, + "learning_rate": 0.00015892748324192567, + "loss": 1.0161, + "step": 3260 + }, + { + "epoch": 0.6956058020477816, + "grad_norm": 0.7842347874909189, + "learning_rate": 0.00015897623400365634, + "loss": 1.0155, + "step": 3261 + }, + { + "epoch": 0.6958191126279863, + "grad_norm": 0.8978888942624591, + "learning_rate": 0.00015902498476538698, + "loss": 1.0229, + "step": 3262 + }, + { + "epoch": 0.6960324232081911, + "grad_norm": 0.9279105131777658, + "learning_rate": 0.00015907373552711762, + "loss": 1.019, + "step": 3263 + }, + { + "epoch": 0.6962457337883959, + "grad_norm": 0.7962081404320575, + "learning_rate": 0.00015912248628884829, + "loss": 1.048, + "step": 3264 + }, + { + "epoch": 0.6964590443686007, + "grad_norm": 0.7804806533771612, + "learning_rate": 0.00015917123705057893, + "loss": 1.0375, + "step": 3265 + }, + { + "epoch": 0.6966723549488054, + "grad_norm": 1.280255392468046, + "learning_rate": 0.0001592199878123096, + "loss": 1.0602, + "step": 3266 + }, + { + "epoch": 0.6968856655290102, + "grad_norm": 0.45402234462167956, + "learning_rate": 0.00015926873857404023, + "loss": 0.9651, + "step": 3267 + }, + { + "epoch": 0.697098976109215, + "grad_norm": 1.0594311201691964, + "learning_rate": 0.00015931748933577087, + "loss": 1.0078, + "step": 3268 + }, + { + "epoch": 0.6973122866894198, + "grad_norm": 0.8109778804151426, + "learning_rate": 0.00015936624009750154, + "loss": 0.9953, + "step": 3269 + }, + { + "epoch": 0.6975255972696246, + "grad_norm": 4.730153133134193, + "learning_rate": 0.00015941499085923218, + "loss": 1.062, + "step": 3270 + }, + { + "epoch": 0.6977389078498294, + "grad_norm": 17.9857494464568, + "learning_rate": 0.00015946374162096284, + "loss": 1.1497, + "step": 3271 + }, + { + "epoch": 0.6979522184300341, + "grad_norm": 2.1834224885075613, + "learning_rate": 0.00015951249238269348, + "loss": 1.0971, + "step": 3272 + }, + { + "epoch": 0.6981655290102389, + "grad_norm": 0.9930738822922628, + "learning_rate": 0.00015956124314442415, + "loss": 1.0547, + "step": 3273 + }, + { + "epoch": 0.6983788395904437, + "grad_norm": 1.2870108518883787, + "learning_rate": 0.0001596099939061548, + "loss": 1.0606, + "step": 3274 + }, + { + "epoch": 0.6985921501706485, + "grad_norm": 0.9647227797743031, + "learning_rate": 0.00015965874466788546, + "loss": 1.0695, + "step": 3275 + }, + { + "epoch": 0.6988054607508533, + "grad_norm": 1.3156394045998265, + "learning_rate": 0.0001597074954296161, + "loss": 1.0595, + "step": 3276 + }, + { + "epoch": 0.699018771331058, + "grad_norm": 0.9655660376582569, + "learning_rate": 0.00015975624619134676, + "loss": 1.0291, + "step": 3277 + }, + { + "epoch": 0.6992320819112628, + "grad_norm": 1.1069806857193412, + "learning_rate": 0.0001598049969530774, + "loss": 1.0574, + "step": 3278 + }, + { + "epoch": 0.6994453924914675, + "grad_norm": 1.0814973596017985, + "learning_rate": 0.00015985374771480807, + "loss": 1.0519, + "step": 3279 + }, + { + "epoch": 0.6996587030716723, + "grad_norm": 1.0916216419176836, + "learning_rate": 0.0001599024984765387, + "loss": 1.0031, + "step": 3280 + }, + { + "epoch": 0.6998720136518771, + "grad_norm": 1.002738714146754, + "learning_rate": 0.00015995124923826937, + "loss": 1.055, + "step": 3281 + }, + { + "epoch": 0.7000853242320819, + "grad_norm": 0.8833155341723089, + "learning_rate": 0.00016, + "loss": 1.0612, + "step": 3282 + }, + { + "epoch": 0.7002986348122867, + "grad_norm": 0.8685262961695943, + "learning_rate": 0.00015999999954739933, + "loss": 1.0483, + "step": 3283 + }, + { + "epoch": 0.7005119453924915, + "grad_norm": 0.7448479522262953, + "learning_rate": 0.00015999999818959733, + "loss": 1.0146, + "step": 3284 + }, + { + "epoch": 0.7007252559726962, + "grad_norm": 0.6908826132627648, + "learning_rate": 0.00015999999592659399, + "loss": 1.0356, + "step": 3285 + }, + { + "epoch": 0.700938566552901, + "grad_norm": 0.7046510583487187, + "learning_rate": 0.00015999999275838935, + "loss": 1.0325, + "step": 3286 + }, + { + "epoch": 0.7011518771331058, + "grad_norm": 0.6533065887322003, + "learning_rate": 0.00015999998868498344, + "loss": 1.051, + "step": 3287 + }, + { + "epoch": 0.7013651877133106, + "grad_norm": 0.613942104097725, + "learning_rate": 0.0001599999837063763, + "loss": 1.0228, + "step": 3288 + }, + { + "epoch": 0.7015784982935154, + "grad_norm": 0.6035493774074413, + "learning_rate": 0.00015999997782256806, + "loss": 1.0104, + "step": 3289 + }, + { + "epoch": 0.7017918088737202, + "grad_norm": 1.1093865477605132, + "learning_rate": 0.00015999997103355868, + "loss": 1.0176, + "step": 3290 + }, + { + "epoch": 0.702005119453925, + "grad_norm": 0.6645782242410487, + "learning_rate": 0.00015999996333934829, + "loss": 1.0155, + "step": 3291 + }, + { + "epoch": 0.7022184300341296, + "grad_norm": 0.8765855557640639, + "learning_rate": 0.00015999995473993695, + "loss": 1.0281, + "step": 3292 + }, + { + "epoch": 0.7024317406143344, + "grad_norm": 1.0350593696575958, + "learning_rate": 0.0001599999452353248, + "loss": 1.0389, + "step": 3293 + }, + { + "epoch": 0.7026450511945392, + "grad_norm": 1.0177745061997994, + "learning_rate": 0.00015999993482551192, + "loss": 1.0562, + "step": 3294 + }, + { + "epoch": 0.702858361774744, + "grad_norm": 1.0872415355981422, + "learning_rate": 0.00015999992351049843, + "loss": 1.0499, + "step": 3295 + }, + { + "epoch": 0.7030716723549488, + "grad_norm": 0.6492152273918077, + "learning_rate": 0.00015999991129028447, + "loss": 1.033, + "step": 3296 + }, + { + "epoch": 0.7032849829351536, + "grad_norm": 0.9071594809392046, + "learning_rate": 0.00015999989816487014, + "loss": 1.0146, + "step": 3297 + }, + { + "epoch": 0.7034982935153583, + "grad_norm": 1.114641947110311, + "learning_rate": 0.00015999988413425563, + "loss": 0.995, + "step": 3298 + }, + { + "epoch": 0.7037116040955631, + "grad_norm": 0.6393494440176821, + "learning_rate": 0.00015999986919844108, + "loss": 1.0182, + "step": 3299 + }, + { + "epoch": 0.7039249146757679, + "grad_norm": 0.8146938566926339, + "learning_rate": 0.00015999985335742668, + "loss": 0.9855, + "step": 3300 + }, + { + "epoch": 0.7041382252559727, + "grad_norm": 0.9363555093762747, + "learning_rate": 0.0001599998366112126, + "loss": 1.0278, + "step": 3301 + }, + { + "epoch": 0.7043515358361775, + "grad_norm": 0.6288485859883108, + "learning_rate": 0.000159999818959799, + "loss": 0.9964, + "step": 3302 + }, + { + "epoch": 0.7045648464163823, + "grad_norm": 0.7955656967950205, + "learning_rate": 0.0001599998004031861, + "loss": 1.0123, + "step": 3303 + }, + { + "epoch": 0.7047781569965871, + "grad_norm": 0.7919867584704202, + "learning_rate": 0.00015999978094137412, + "loss": 1.0369, + "step": 3304 + }, + { + "epoch": 0.7049914675767918, + "grad_norm": 0.5581321533341278, + "learning_rate": 0.00015999976057436328, + "loss": 1.0074, + "step": 3305 + }, + { + "epoch": 0.7052047781569966, + "grad_norm": 0.6084210481896624, + "learning_rate": 0.0001599997393021538, + "loss": 1.0319, + "step": 3306 + }, + { + "epoch": 0.7054180887372014, + "grad_norm": 0.6396383811807113, + "learning_rate": 0.00015999971712474595, + "loss": 1.0008, + "step": 3307 + }, + { + "epoch": 0.7056313993174061, + "grad_norm": 0.511183642225909, + "learning_rate": 0.00015999969404213995, + "loss": 0.9869, + "step": 3308 + }, + { + "epoch": 0.705844709897611, + "grad_norm": 0.49805562921715746, + "learning_rate": 0.00015999967005433602, + "loss": 0.9987, + "step": 3309 + }, + { + "epoch": 0.7060580204778157, + "grad_norm": 0.5817700379201391, + "learning_rate": 0.0001599996451613345, + "loss": 1.0149, + "step": 3310 + }, + { + "epoch": 0.7062713310580204, + "grad_norm": 0.39271776058524954, + "learning_rate": 0.00015999961936313565, + "loss": 1.0055, + "step": 3311 + }, + { + "epoch": 0.7064846416382252, + "grad_norm": 0.5085811804301819, + "learning_rate": 0.00015999959265973978, + "loss": 1.0208, + "step": 3312 + }, + { + "epoch": 0.70669795221843, + "grad_norm": 0.4885117085903099, + "learning_rate": 0.00015999956505114715, + "loss": 1.0144, + "step": 3313 + }, + { + "epoch": 0.7069112627986348, + "grad_norm": 0.5046882054668519, + "learning_rate": 0.0001599995365373581, + "loss": 0.9944, + "step": 3314 + }, + { + "epoch": 0.7071245733788396, + "grad_norm": 0.5741735438009324, + "learning_rate": 0.00015999950711837295, + "loss": 1.0339, + "step": 3315 + }, + { + "epoch": 0.7073378839590444, + "grad_norm": 0.4836025380097417, + "learning_rate": 0.00015999947679419204, + "loss": 1.0314, + "step": 3316 + }, + { + "epoch": 0.7075511945392492, + "grad_norm": 0.3547402699927362, + "learning_rate": 0.0001599994455648157, + "loss": 1.0225, + "step": 3317 + }, + { + "epoch": 0.7077645051194539, + "grad_norm": 0.466395789434778, + "learning_rate": 0.0001599994134302443, + "loss": 0.9952, + "step": 3318 + }, + { + "epoch": 0.7079778156996587, + "grad_norm": 0.419771640925083, + "learning_rate": 0.00015999938039047817, + "loss": 0.9932, + "step": 3319 + }, + { + "epoch": 0.7081911262798635, + "grad_norm": 0.5245321736479999, + "learning_rate": 0.00015999934644551772, + "loss": 1.0027, + "step": 3320 + }, + { + "epoch": 0.7084044368600683, + "grad_norm": 0.6781922101676959, + "learning_rate": 0.00015999931159536333, + "loss": 0.9944, + "step": 3321 + }, + { + "epoch": 0.7086177474402731, + "grad_norm": 0.7599316147156112, + "learning_rate": 0.00015999927584001535, + "loss": 1.0092, + "step": 3322 + }, + { + "epoch": 0.7088310580204779, + "grad_norm": 0.8659581757691136, + "learning_rate": 0.00015999923917947424, + "loss": 1.0009, + "step": 3323 + }, + { + "epoch": 0.7090443686006825, + "grad_norm": 0.9198800022743481, + "learning_rate": 0.00015999920161374038, + "loss": 1.0339, + "step": 3324 + }, + { + "epoch": 0.7092576791808873, + "grad_norm": 0.908792404525595, + "learning_rate": 0.00015999916314281422, + "loss": 1.0216, + "step": 3325 + }, + { + "epoch": 0.7094709897610921, + "grad_norm": 0.910136836370089, + "learning_rate": 0.0001599991237666962, + "loss": 1.0092, + "step": 3326 + }, + { + "epoch": 0.7096843003412969, + "grad_norm": 0.8145767697411178, + "learning_rate": 0.0001599990834853867, + "loss": 0.9886, + "step": 3327 + }, + { + "epoch": 0.7098976109215017, + "grad_norm": 0.7777496816140406, + "learning_rate": 0.00015999904229888626, + "loss": 1.0193, + "step": 3328 + }, + { + "epoch": 0.7101109215017065, + "grad_norm": 0.7017328037858628, + "learning_rate": 0.00015999900020719532, + "loss": 1.0053, + "step": 3329 + }, + { + "epoch": 0.7103242320819113, + "grad_norm": 0.6990120861336322, + "learning_rate": 0.00015999895721031433, + "loss": 0.993, + "step": 3330 + }, + { + "epoch": 0.710537542662116, + "grad_norm": 0.6210892177451887, + "learning_rate": 0.00015999891330824377, + "loss": 0.9944, + "step": 3331 + }, + { + "epoch": 0.7107508532423208, + "grad_norm": 0.4635361428603447, + "learning_rate": 0.0001599988685009842, + "loss": 0.9857, + "step": 3332 + }, + { + "epoch": 0.7109641638225256, + "grad_norm": 0.3520526017486321, + "learning_rate": 0.00015999882278853607, + "loss": 1.0276, + "step": 3333 + }, + { + "epoch": 0.7111774744027304, + "grad_norm": 0.3231565213770237, + "learning_rate": 0.0001599987761708999, + "loss": 1.0478, + "step": 3334 + }, + { + "epoch": 0.7113907849829352, + "grad_norm": 0.31250266676404975, + "learning_rate": 0.00015999872864807626, + "loss": 1.0072, + "step": 3335 + }, + { + "epoch": 0.71160409556314, + "grad_norm": 0.349051894870923, + "learning_rate": 0.00015999868022006564, + "loss": 0.985, + "step": 3336 + }, + { + "epoch": 0.7118174061433447, + "grad_norm": 0.2813420530808103, + "learning_rate": 0.00015999863088686863, + "loss": 1.0037, + "step": 3337 + }, + { + "epoch": 0.7120307167235495, + "grad_norm": 0.32344661814161085, + "learning_rate": 0.00015999858064848576, + "loss": 0.9757, + "step": 3338 + }, + { + "epoch": 0.7122440273037542, + "grad_norm": 0.3564437289700512, + "learning_rate": 0.0001599985295049176, + "loss": 0.9898, + "step": 3339 + }, + { + "epoch": 0.712457337883959, + "grad_norm": 0.35805806789775074, + "learning_rate": 0.00015999847745616473, + "loss": 0.9797, + "step": 3340 + }, + { + "epoch": 0.7126706484641638, + "grad_norm": 0.471479953412218, + "learning_rate": 0.00015999842450222773, + "loss": 0.9849, + "step": 3341 + }, + { + "epoch": 0.7128839590443686, + "grad_norm": 0.5643467437113397, + "learning_rate": 0.00015999837064310722, + "loss": 1.0164, + "step": 3342 + }, + { + "epoch": 0.7130972696245734, + "grad_norm": 0.5381930291100641, + "learning_rate": 0.0001599983158788038, + "loss": 1.0158, + "step": 3343 + }, + { + "epoch": 0.7133105802047781, + "grad_norm": 0.39924859945462376, + "learning_rate": 0.0001599982602093181, + "loss": 1.007, + "step": 3344 + }, + { + "epoch": 0.7135238907849829, + "grad_norm": 0.53674514551133, + "learning_rate": 0.00015999820363465074, + "loss": 0.9934, + "step": 3345 + }, + { + "epoch": 0.7137372013651877, + "grad_norm": 0.7042254661423164, + "learning_rate": 0.00015999814615480235, + "loss": 1.0001, + "step": 3346 + }, + { + "epoch": 0.7139505119453925, + "grad_norm": 0.6576750572040884, + "learning_rate": 0.00015999808776977358, + "loss": 0.9804, + "step": 3347 + }, + { + "epoch": 0.7141638225255973, + "grad_norm": 0.6857611489912726, + "learning_rate": 0.00015999802847956515, + "loss": 0.9544, + "step": 3348 + }, + { + "epoch": 0.7143771331058021, + "grad_norm": 0.854719444441249, + "learning_rate": 0.00015999796828417763, + "loss": 1.0303, + "step": 3349 + }, + { + "epoch": 0.7145904436860068, + "grad_norm": 0.9664885972928555, + "learning_rate": 0.00015999790718361178, + "loss": 1.0071, + "step": 3350 + }, + { + "epoch": 0.7148037542662116, + "grad_norm": 0.9632450851920589, + "learning_rate": 0.00015999784517786825, + "loss": 0.9851, + "step": 3351 + }, + { + "epoch": 0.7150170648464164, + "grad_norm": 0.9561672585715802, + "learning_rate": 0.00015999778226694774, + "loss": 1.0335, + "step": 3352 + }, + { + "epoch": 0.7152303754266212, + "grad_norm": 0.898681735140565, + "learning_rate": 0.000159997718450851, + "loss": 1.0046, + "step": 3353 + }, + { + "epoch": 0.715443686006826, + "grad_norm": 0.7566955439857834, + "learning_rate": 0.00015999765372957876, + "loss": 1.0208, + "step": 3354 + }, + { + "epoch": 0.7156569965870307, + "grad_norm": 0.6074583192378078, + "learning_rate": 0.0001599975881031317, + "loss": 1.0252, + "step": 3355 + }, + { + "epoch": 0.7158703071672355, + "grad_norm": 0.4790049194850052, + "learning_rate": 0.00015999752157151056, + "loss": 0.9861, + "step": 3356 + }, + { + "epoch": 0.7160836177474402, + "grad_norm": 0.391214105657812, + "learning_rate": 0.00015999745413471613, + "loss": 1.0017, + "step": 3357 + }, + { + "epoch": 0.716296928327645, + "grad_norm": 0.3436613731767002, + "learning_rate": 0.00015999738579274917, + "loss": 0.9708, + "step": 3358 + }, + { + "epoch": 0.7165102389078498, + "grad_norm": 0.4487902293083796, + "learning_rate": 0.00015999731654561044, + "loss": 1.0481, + "step": 3359 + }, + { + "epoch": 0.7167235494880546, + "grad_norm": 0.5749619358876433, + "learning_rate": 0.00015999724639330072, + "loss": 1.0037, + "step": 3360 + }, + { + "epoch": 0.7169368600682594, + "grad_norm": 0.5512064062469216, + "learning_rate": 0.00015999717533582083, + "loss": 1.0127, + "step": 3361 + }, + { + "epoch": 0.7171501706484642, + "grad_norm": 0.5343718843684787, + "learning_rate": 0.00015999710337317156, + "loss": 0.999, + "step": 3362 + }, + { + "epoch": 0.7173634812286689, + "grad_norm": 0.6402571615364052, + "learning_rate": 0.0001599970305053537, + "loss": 1.0257, + "step": 3363 + }, + { + "epoch": 0.7175767918088737, + "grad_norm": 0.731860174067529, + "learning_rate": 0.00015999695673236808, + "loss": 1.0063, + "step": 3364 + }, + { + "epoch": 0.7177901023890785, + "grad_norm": 0.7643290786131617, + "learning_rate": 0.0001599968820542156, + "loss": 0.9908, + "step": 3365 + }, + { + "epoch": 0.7180034129692833, + "grad_norm": 0.7934559387142872, + "learning_rate": 0.000159996806470897, + "loss": 1.0115, + "step": 3366 + }, + { + "epoch": 0.7182167235494881, + "grad_norm": 0.8090377620611874, + "learning_rate": 0.0001599967299824132, + "loss": 1.0034, + "step": 3367 + }, + { + "epoch": 0.7184300341296929, + "grad_norm": 0.7548232165681725, + "learning_rate": 0.0001599966525887651, + "loss": 0.9837, + "step": 3368 + }, + { + "epoch": 0.7186433447098977, + "grad_norm": 0.7967534316406752, + "learning_rate": 0.0001599965742899535, + "loss": 1.0179, + "step": 3369 + }, + { + "epoch": 0.7188566552901023, + "grad_norm": 0.910425963874157, + "learning_rate": 0.0001599964950859793, + "loss": 0.9995, + "step": 3370 + }, + { + "epoch": 0.7190699658703071, + "grad_norm": 0.9738382060578198, + "learning_rate": 0.00015999641497684342, + "loss": 0.9837, + "step": 3371 + }, + { + "epoch": 0.7192832764505119, + "grad_norm": 0.9565715222745516, + "learning_rate": 0.00015999633396254675, + "loss": 1.0317, + "step": 3372 + }, + { + "epoch": 0.7194965870307167, + "grad_norm": 0.8545845420728695, + "learning_rate": 0.00015999625204309023, + "loss": 0.9824, + "step": 3373 + }, + { + "epoch": 0.7197098976109215, + "grad_norm": 0.6862389439611485, + "learning_rate": 0.00015999616921847477, + "loss": 1.0469, + "step": 3374 + }, + { + "epoch": 0.7199232081911263, + "grad_norm": 0.6766953499015054, + "learning_rate": 0.00015999608548870133, + "loss": 0.9979, + "step": 3375 + }, + { + "epoch": 0.7201365187713311, + "grad_norm": 0.71391737638168, + "learning_rate": 0.0001599960008537708, + "loss": 0.9956, + "step": 3376 + }, + { + "epoch": 0.7203498293515358, + "grad_norm": 0.6430330530926384, + "learning_rate": 0.0001599959153136842, + "loss": 1.0321, + "step": 3377 + }, + { + "epoch": 0.7205631399317406, + "grad_norm": 0.48611789553103263, + "learning_rate": 0.00015999582886844248, + "loss": 1.0066, + "step": 3378 + }, + { + "epoch": 0.7207764505119454, + "grad_norm": 0.363706512435602, + "learning_rate": 0.0001599957415180466, + "loss": 0.9809, + "step": 3379 + }, + { + "epoch": 0.7209897610921502, + "grad_norm": 0.35860987240640413, + "learning_rate": 0.00015999565326249755, + "loss": 1.0025, + "step": 3380 + }, + { + "epoch": 0.721203071672355, + "grad_norm": 0.3563870256655611, + "learning_rate": 0.00015999556410179635, + "loss": 0.9773, + "step": 3381 + }, + { + "epoch": 0.7214163822525598, + "grad_norm": 0.5799814351491488, + "learning_rate": 0.000159995474035944, + "loss": 0.9983, + "step": 3382 + }, + { + "epoch": 0.7216296928327645, + "grad_norm": 0.7682618076095563, + "learning_rate": 0.0001599953830649415, + "loss": 1.0159, + "step": 3383 + }, + { + "epoch": 0.7218430034129693, + "grad_norm": 0.9767127922928536, + "learning_rate": 0.0001599952911887899, + "loss": 0.9895, + "step": 3384 + }, + { + "epoch": 0.722056313993174, + "grad_norm": 1.1787738385610782, + "learning_rate": 0.00015999519840749024, + "loss": 1.0437, + "step": 3385 + }, + { + "epoch": 0.7222696245733788, + "grad_norm": 0.7898761870707913, + "learning_rate": 0.00015999510472104357, + "loss": 0.9838, + "step": 3386 + }, + { + "epoch": 0.7224829351535836, + "grad_norm": 0.5704681468578114, + "learning_rate": 0.00015999501012945095, + "loss": 1.0078, + "step": 3387 + }, + { + "epoch": 0.7226962457337884, + "grad_norm": 0.3898067725334251, + "learning_rate": 0.00015999491463271343, + "loss": 1.0344, + "step": 3388 + }, + { + "epoch": 0.7229095563139932, + "grad_norm": 0.38863746164139346, + "learning_rate": 0.0001599948182308321, + "loss": 0.9962, + "step": 3389 + }, + { + "epoch": 0.7231228668941979, + "grad_norm": 0.35665854436999533, + "learning_rate": 0.00015999472092380804, + "loss": 0.995, + "step": 3390 + }, + { + "epoch": 0.7233361774744027, + "grad_norm": 0.3630472377703767, + "learning_rate": 0.0001599946227116424, + "loss": 0.9767, + "step": 3391 + }, + { + "epoch": 0.7235494880546075, + "grad_norm": 0.37080603496639053, + "learning_rate": 0.00015999452359433624, + "loss": 0.9684, + "step": 3392 + }, + { + "epoch": 0.7237627986348123, + "grad_norm": 0.3279192880078481, + "learning_rate": 0.00015999442357189072, + "loss": 0.9659, + "step": 3393 + }, + { + "epoch": 0.7239761092150171, + "grad_norm": 0.3728739381559012, + "learning_rate": 0.00015999432264430691, + "loss": 0.9804, + "step": 3394 + }, + { + "epoch": 0.7241894197952219, + "grad_norm": 0.36164399651358176, + "learning_rate": 0.00015999422081158603, + "loss": 1.0059, + "step": 3395 + }, + { + "epoch": 0.7244027303754266, + "grad_norm": 0.3489356811825742, + "learning_rate": 0.0001599941180737292, + "loss": 0.9829, + "step": 3396 + }, + { + "epoch": 0.7246160409556314, + "grad_norm": 0.3415289645969737, + "learning_rate": 0.00015999401443073758, + "loss": 1.0177, + "step": 3397 + }, + { + "epoch": 0.7248293515358362, + "grad_norm": 0.31566332941063696, + "learning_rate": 0.00015999390988261233, + "loss": 1.0071, + "step": 3398 + }, + { + "epoch": 0.725042662116041, + "grad_norm": 0.3802073106076517, + "learning_rate": 0.00015999380442935466, + "loss": 0.9739, + "step": 3399 + }, + { + "epoch": 0.7252559726962458, + "grad_norm": 0.3886782724695605, + "learning_rate": 0.0001599936980709657, + "loss": 0.9903, + "step": 3400 + }, + { + "epoch": 0.7254692832764505, + "grad_norm": 0.4318363325447146, + "learning_rate": 0.00015999359080744673, + "loss": 0.9965, + "step": 3401 + }, + { + "epoch": 0.7256825938566553, + "grad_norm": 0.46555734635287194, + "learning_rate": 0.00015999348263879895, + "loss": 1.0115, + "step": 3402 + }, + { + "epoch": 0.72589590443686, + "grad_norm": 0.42882564785734356, + "learning_rate": 0.00015999337356502354, + "loss": 1.0483, + "step": 3403 + }, + { + "epoch": 0.7261092150170648, + "grad_norm": 0.5393152172575537, + "learning_rate": 0.00015999326358612177, + "loss": 0.9831, + "step": 3404 + }, + { + "epoch": 0.7263225255972696, + "grad_norm": 0.8021693956525378, + "learning_rate": 0.0001599931527020949, + "loss": 0.9754, + "step": 3405 + }, + { + "epoch": 0.7265358361774744, + "grad_norm": 1.0645933644552565, + "learning_rate": 0.00015999304091294413, + "loss": 0.9807, + "step": 3406 + }, + { + "epoch": 0.7267491467576792, + "grad_norm": 1.1390279695865635, + "learning_rate": 0.00015999292821867079, + "loss": 1.008, + "step": 3407 + }, + { + "epoch": 0.726962457337884, + "grad_norm": 0.8661989361617864, + "learning_rate": 0.0001599928146192761, + "loss": 1.008, + "step": 3408 + }, + { + "epoch": 0.7271757679180887, + "grad_norm": 0.6369419051196481, + "learning_rate": 0.00015999270011476138, + "loss": 0.9831, + "step": 3409 + }, + { + "epoch": 0.7273890784982935, + "grad_norm": 0.47237550450281557, + "learning_rate": 0.0001599925847051279, + "loss": 0.9878, + "step": 3410 + }, + { + "epoch": 0.7276023890784983, + "grad_norm": 0.4643339950081677, + "learning_rate": 0.000159992468390377, + "loss": 1.0226, + "step": 3411 + }, + { + "epoch": 0.7278156996587031, + "grad_norm": 0.5629849956738248, + "learning_rate": 0.00015999235117050996, + "loss": 0.9878, + "step": 3412 + }, + { + "epoch": 0.7280290102389079, + "grad_norm": 0.6541573254075457, + "learning_rate": 0.0001599922330455281, + "loss": 0.9807, + "step": 3413 + }, + { + "epoch": 0.7282423208191127, + "grad_norm": 0.5648110445306195, + "learning_rate": 0.00015999211401543278, + "loss": 1.0061, + "step": 3414 + }, + { + "epoch": 0.7284556313993175, + "grad_norm": 0.5571909007945581, + "learning_rate": 0.0001599919940802254, + "loss": 0.9997, + "step": 3415 + }, + { + "epoch": 0.7286689419795221, + "grad_norm": 0.61572833700157, + "learning_rate": 0.0001599918732399072, + "loss": 1.0147, + "step": 3416 + }, + { + "epoch": 0.7288822525597269, + "grad_norm": 0.6993748930889634, + "learning_rate": 0.00015999175149447963, + "loss": 1.027, + "step": 3417 + }, + { + "epoch": 0.7290955631399317, + "grad_norm": 0.6829786226239342, + "learning_rate": 0.00015999162884394405, + "loss": 0.9714, + "step": 3418 + }, + { + "epoch": 0.7293088737201365, + "grad_norm": 0.6591628743052478, + "learning_rate": 0.00015999150528830182, + "loss": 0.9727, + "step": 3419 + }, + { + "epoch": 0.7295221843003413, + "grad_norm": 0.6555037578894848, + "learning_rate": 0.0001599913808275544, + "loss": 0.9989, + "step": 3420 + }, + { + "epoch": 0.7297354948805461, + "grad_norm": 0.6522413244048646, + "learning_rate": 0.00015999125546170312, + "loss": 1.0276, + "step": 3421 + }, + { + "epoch": 0.7299488054607508, + "grad_norm": 0.6795553936042943, + "learning_rate": 0.00015999112919074947, + "loss": 1.0191, + "step": 3422 + }, + { + "epoch": 0.7301621160409556, + "grad_norm": 0.7168812405224563, + "learning_rate": 0.00015999100201469487, + "loss": 0.9945, + "step": 3423 + }, + { + "epoch": 0.7303754266211604, + "grad_norm": 0.762154278652201, + "learning_rate": 0.00015999087393354068, + "loss": 1.0186, + "step": 3424 + }, + { + "epoch": 0.7305887372013652, + "grad_norm": 0.7322601978037215, + "learning_rate": 0.00015999074494728844, + "loss": 0.983, + "step": 3425 + }, + { + "epoch": 0.73080204778157, + "grad_norm": 0.7240776718246815, + "learning_rate": 0.00015999061505593957, + "loss": 0.9951, + "step": 3426 + }, + { + "epoch": 0.7310153583617748, + "grad_norm": 0.6150776307816306, + "learning_rate": 0.00015999048425949553, + "loss": 0.9864, + "step": 3427 + }, + { + "epoch": 0.7312286689419796, + "grad_norm": 0.47551557855276894, + "learning_rate": 0.00015999035255795784, + "loss": 0.9475, + "step": 3428 + }, + { + "epoch": 0.7314419795221843, + "grad_norm": 0.5344738108009388, + "learning_rate": 0.00015999021995132795, + "loss": 0.9879, + "step": 3429 + }, + { + "epoch": 0.731655290102389, + "grad_norm": 0.5115095073490107, + "learning_rate": 0.0001599900864396074, + "loss": 1.0142, + "step": 3430 + }, + { + "epoch": 0.7318686006825939, + "grad_norm": 0.4336426313101084, + "learning_rate": 0.00015998995202279765, + "loss": 0.9684, + "step": 3431 + }, + { + "epoch": 0.7320819112627986, + "grad_norm": 0.4971379445588132, + "learning_rate": 0.00015998981670090027, + "loss": 1.0096, + "step": 3432 + }, + { + "epoch": 0.7322952218430034, + "grad_norm": 0.5462702206337773, + "learning_rate": 0.00015998968047391675, + "loss": 0.9846, + "step": 3433 + }, + { + "epoch": 0.7325085324232082, + "grad_norm": 0.651525616339615, + "learning_rate": 0.00015998954334184865, + "loss": 1.0138, + "step": 3434 + }, + { + "epoch": 0.7327218430034129, + "grad_norm": 0.7892315313776269, + "learning_rate": 0.0001599894053046975, + "loss": 0.9542, + "step": 3435 + }, + { + "epoch": 0.7329351535836177, + "grad_norm": 0.9347116584517328, + "learning_rate": 0.0001599892663624649, + "loss": 0.9722, + "step": 3436 + }, + { + "epoch": 0.7331484641638225, + "grad_norm": 1.0465594301506091, + "learning_rate": 0.00015998912651515242, + "loss": 0.9949, + "step": 3437 + }, + { + "epoch": 0.7333617747440273, + "grad_norm": 0.9422684867655926, + "learning_rate": 0.00015998898576276163, + "loss": 1.0095, + "step": 3438 + }, + { + "epoch": 0.7335750853242321, + "grad_norm": 0.9080426786635997, + "learning_rate": 0.0001599888441052941, + "loss": 0.9787, + "step": 3439 + }, + { + "epoch": 0.7337883959044369, + "grad_norm": 0.8950036362603112, + "learning_rate": 0.00015998870154275148, + "loss": 0.9791, + "step": 3440 + }, + { + "epoch": 0.7340017064846417, + "grad_norm": 0.8680423404017782, + "learning_rate": 0.00015998855807513533, + "loss": 0.9475, + "step": 3441 + }, + { + "epoch": 0.7342150170648464, + "grad_norm": 0.7336789778678523, + "learning_rate": 0.00015998841370244734, + "loss": 0.9796, + "step": 3442 + }, + { + "epoch": 0.7344283276450512, + "grad_norm": 0.5770359034379544, + "learning_rate": 0.00015998826842468907, + "loss": 0.9954, + "step": 3443 + }, + { + "epoch": 0.734641638225256, + "grad_norm": 0.39543560911710235, + "learning_rate": 0.00015998812224186218, + "loss": 1.0309, + "step": 3444 + }, + { + "epoch": 0.7348549488054608, + "grad_norm": 0.4551145422813606, + "learning_rate": 0.00015998797515396837, + "loss": 0.9758, + "step": 3445 + }, + { + "epoch": 0.7350682593856656, + "grad_norm": 0.46442304099583864, + "learning_rate": 0.00015998782716100928, + "loss": 0.9699, + "step": 3446 + }, + { + "epoch": 0.7352815699658704, + "grad_norm": 0.4061419843744514, + "learning_rate": 0.00015998767826298657, + "loss": 0.9976, + "step": 3447 + }, + { + "epoch": 0.735494880546075, + "grad_norm": 0.3882600641630494, + "learning_rate": 0.00015998752845990195, + "loss": 0.9724, + "step": 3448 + }, + { + "epoch": 0.7357081911262798, + "grad_norm": 0.4881232636928314, + "learning_rate": 0.00015998737775175706, + "loss": 0.9936, + "step": 3449 + }, + { + "epoch": 0.7359215017064846, + "grad_norm": 0.5478298480017262, + "learning_rate": 0.00015998722613855368, + "loss": 0.9945, + "step": 3450 + }, + { + "epoch": 0.7361348122866894, + "grad_norm": 0.4809847740777981, + "learning_rate": 0.00015998707362029348, + "loss": 1.0006, + "step": 3451 + }, + { + "epoch": 0.7363481228668942, + "grad_norm": 0.5004950786359424, + "learning_rate": 0.0001599869201969782, + "loss": 0.9805, + "step": 3452 + }, + { + "epoch": 0.736561433447099, + "grad_norm": 0.5482928375264662, + "learning_rate": 0.00015998676586860956, + "loss": 0.9869, + "step": 3453 + }, + { + "epoch": 0.7367747440273038, + "grad_norm": 0.5565867925438684, + "learning_rate": 0.00015998661063518934, + "loss": 0.9948, + "step": 3454 + }, + { + "epoch": 0.7369880546075085, + "grad_norm": 0.6665320765858641, + "learning_rate": 0.00015998645449671927, + "loss": 1.007, + "step": 3455 + }, + { + "epoch": 0.7372013651877133, + "grad_norm": 0.7444184024982126, + "learning_rate": 0.0001599862974532011, + "loss": 0.943, + "step": 3456 + }, + { + "epoch": 0.7374146757679181, + "grad_norm": 0.8362488680038006, + "learning_rate": 0.00015998613950463667, + "loss": 0.9828, + "step": 3457 + }, + { + "epoch": 0.7376279863481229, + "grad_norm": 0.9878775746019652, + "learning_rate": 0.0001599859806510277, + "loss": 0.9951, + "step": 3458 + }, + { + "epoch": 0.7378412969283277, + "grad_norm": 1.1543223950699708, + "learning_rate": 0.000159985820892376, + "loss": 0.9835, + "step": 3459 + }, + { + "epoch": 0.7380546075085325, + "grad_norm": 0.790304832309082, + "learning_rate": 0.0001599856602286834, + "loss": 0.9763, + "step": 3460 + }, + { + "epoch": 0.7382679180887372, + "grad_norm": 0.5054508754865654, + "learning_rate": 0.00015998549865995173, + "loss": 1.0066, + "step": 3461 + }, + { + "epoch": 0.738481228668942, + "grad_norm": 0.5716711874548196, + "learning_rate": 0.00015998533618618275, + "loss": 0.9818, + "step": 3462 + }, + { + "epoch": 0.7386945392491467, + "grad_norm": 0.8374420881849327, + "learning_rate": 0.0001599851728073784, + "loss": 0.985, + "step": 3463 + }, + { + "epoch": 0.7389078498293515, + "grad_norm": 1.0254742101518677, + "learning_rate": 0.00015998500852354046, + "loss": 1.0011, + "step": 3464 + }, + { + "epoch": 0.7391211604095563, + "grad_norm": 1.0575320485068314, + "learning_rate": 0.0001599848433346708, + "loss": 1.0008, + "step": 3465 + }, + { + "epoch": 0.7393344709897611, + "grad_norm": 0.8137646140751918, + "learning_rate": 0.00015998467724077128, + "loss": 0.9865, + "step": 3466 + }, + { + "epoch": 0.7395477815699659, + "grad_norm": 0.5674162947929196, + "learning_rate": 0.00015998451024184382, + "loss": 1.0231, + "step": 3467 + }, + { + "epoch": 0.7397610921501706, + "grad_norm": 0.4052247584314529, + "learning_rate": 0.00015998434233789025, + "loss": 1.0308, + "step": 3468 + }, + { + "epoch": 0.7399744027303754, + "grad_norm": 0.3925375500692851, + "learning_rate": 0.0001599841735289125, + "loss": 1.0093, + "step": 3469 + }, + { + "epoch": 0.7401877133105802, + "grad_norm": 0.5185415091957214, + "learning_rate": 0.0001599840038149125, + "loss": 0.9923, + "step": 3470 + }, + { + "epoch": 0.740401023890785, + "grad_norm": 0.5605923756324549, + "learning_rate": 0.00015998383319589213, + "loss": 0.9801, + "step": 3471 + }, + { + "epoch": 0.7406143344709898, + "grad_norm": 0.5428168740902807, + "learning_rate": 0.00015998366167185337, + "loss": 0.9959, + "step": 3472 + }, + { + "epoch": 0.7408276450511946, + "grad_norm": 0.5607622700554453, + "learning_rate": 0.00015998348924279812, + "loss": 0.991, + "step": 3473 + }, + { + "epoch": 0.7410409556313993, + "grad_norm": 0.6049266557136458, + "learning_rate": 0.00015998331590872834, + "loss": 0.9909, + "step": 3474 + }, + { + "epoch": 0.7412542662116041, + "grad_norm": 0.6112306292920446, + "learning_rate": 0.000159983141669646, + "loss": 1.0271, + "step": 3475 + }, + { + "epoch": 0.7414675767918089, + "grad_norm": 0.6294711441456753, + "learning_rate": 0.00015998296652555306, + "loss": 0.9669, + "step": 3476 + }, + { + "epoch": 0.7416808873720137, + "grad_norm": 0.49643498456325885, + "learning_rate": 0.00015998279047645152, + "loss": 0.9881, + "step": 3477 + }, + { + "epoch": 0.7418941979522184, + "grad_norm": 0.4535026973620342, + "learning_rate": 0.00015998261352234334, + "loss": 1.0126, + "step": 3478 + }, + { + "epoch": 0.7421075085324232, + "grad_norm": 0.33900892599566734, + "learning_rate": 0.0001599824356632306, + "loss": 0.9926, + "step": 3479 + }, + { + "epoch": 0.742320819112628, + "grad_norm": 0.5433809365884704, + "learning_rate": 0.00015998225689911518, + "loss": 0.9976, + "step": 3480 + }, + { + "epoch": 0.7425341296928327, + "grad_norm": 0.59391445346873, + "learning_rate": 0.00015998207722999922, + "loss": 1.0532, + "step": 3481 + }, + { + "epoch": 0.7427474402730375, + "grad_norm": 0.5205222186080796, + "learning_rate": 0.0001599818966558847, + "loss": 1.0212, + "step": 3482 + }, + { + "epoch": 0.7429607508532423, + "grad_norm": 0.6101726515687496, + "learning_rate": 0.0001599817151767737, + "loss": 0.9822, + "step": 3483 + }, + { + "epoch": 0.7431740614334471, + "grad_norm": 0.5864508196579061, + "learning_rate": 0.0001599815327926682, + "loss": 0.9858, + "step": 3484 + }, + { + "epoch": 0.7433873720136519, + "grad_norm": 0.4794353214894495, + "learning_rate": 0.00015998134950357033, + "loss": 0.9984, + "step": 3485 + }, + { + "epoch": 0.7436006825938567, + "grad_norm": 0.417359774942823, + "learning_rate": 0.00015998116530948215, + "loss": 0.9698, + "step": 3486 + }, + { + "epoch": 0.7438139931740614, + "grad_norm": 0.42422235681339066, + "learning_rate": 0.00015998098021040573, + "loss": 1.0033, + "step": 3487 + }, + { + "epoch": 0.7440273037542662, + "grad_norm": 0.361841875855479, + "learning_rate": 0.00015998079420634318, + "loss": 1.0246, + "step": 3488 + }, + { + "epoch": 0.744240614334471, + "grad_norm": 0.39019055825424037, + "learning_rate": 0.0001599806072972966, + "loss": 1.012, + "step": 3489 + }, + { + "epoch": 0.7444539249146758, + "grad_norm": 0.37020536630130135, + "learning_rate": 0.0001599804194832681, + "loss": 0.9626, + "step": 3490 + }, + { + "epoch": 0.7446672354948806, + "grad_norm": 0.3788845434441914, + "learning_rate": 0.0001599802307642598, + "loss": 0.9824, + "step": 3491 + }, + { + "epoch": 0.7448805460750854, + "grad_norm": 0.49026588064969423, + "learning_rate": 0.00015998004114027386, + "loss": 0.9862, + "step": 3492 + }, + { + "epoch": 0.7450938566552902, + "grad_norm": 0.5247492816332996, + "learning_rate": 0.0001599798506113124, + "loss": 1.0084, + "step": 3493 + }, + { + "epoch": 0.7453071672354948, + "grad_norm": 0.48301361637135787, + "learning_rate": 0.0001599796591773776, + "loss": 0.9633, + "step": 3494 + }, + { + "epoch": 0.7455204778156996, + "grad_norm": 0.5012913947051743, + "learning_rate": 0.00015997946683847161, + "loss": 0.9773, + "step": 3495 + }, + { + "epoch": 0.7457337883959044, + "grad_norm": 0.5297314224405101, + "learning_rate": 0.0001599792735945966, + "loss": 1.0012, + "step": 3496 + }, + { + "epoch": 0.7459470989761092, + "grad_norm": 0.4445668666820395, + "learning_rate": 0.00015997907944575475, + "loss": 1.0082, + "step": 3497 + }, + { + "epoch": 0.746160409556314, + "grad_norm": 0.40510165944751875, + "learning_rate": 0.00015997888439194828, + "loss": 1.0127, + "step": 3498 + }, + { + "epoch": 0.7463737201365188, + "grad_norm": 0.4748970667173028, + "learning_rate": 0.0001599786884331794, + "loss": 0.9866, + "step": 3499 + }, + { + "epoch": 0.7465870307167235, + "grad_norm": 0.500842198014533, + "learning_rate": 0.0001599784915694503, + "loss": 1.0133, + "step": 3500 + }, + { + "epoch": 0.7468003412969283, + "grad_norm": 0.45174105733774966, + "learning_rate": 0.00015997829380076322, + "loss": 0.9818, + "step": 3501 + }, + { + "epoch": 0.7470136518771331, + "grad_norm": 0.5845589803789638, + "learning_rate": 0.00015997809512712042, + "loss": 0.9952, + "step": 3502 + }, + { + "epoch": 0.7472269624573379, + "grad_norm": 0.6704522554929792, + "learning_rate": 0.00015997789554852412, + "loss": 1.0064, + "step": 3503 + }, + { + "epoch": 0.7474402730375427, + "grad_norm": 0.7443958853565591, + "learning_rate": 0.00015997769506497658, + "loss": 1.0241, + "step": 3504 + }, + { + "epoch": 0.7476535836177475, + "grad_norm": 0.8446605741805631, + "learning_rate": 0.00015997749367648007, + "loss": 1.0026, + "step": 3505 + }, + { + "epoch": 0.7478668941979523, + "grad_norm": 0.9975215528137519, + "learning_rate": 0.00015997729138303687, + "loss": 0.9972, + "step": 3506 + }, + { + "epoch": 0.748080204778157, + "grad_norm": 1.169095719189658, + "learning_rate": 0.0001599770881846493, + "loss": 0.9806, + "step": 3507 + }, + { + "epoch": 0.7482935153583617, + "grad_norm": 0.7447678688390565, + "learning_rate": 0.00015997688408131961, + "loss": 1.0104, + "step": 3508 + }, + { + "epoch": 0.7485068259385665, + "grad_norm": 0.5299950902150937, + "learning_rate": 0.00015997667907305014, + "loss": 0.9865, + "step": 3509 + }, + { + "epoch": 0.7487201365187713, + "grad_norm": 0.44091135463617703, + "learning_rate": 0.00015997647315984322, + "loss": 0.9902, + "step": 3510 + }, + { + "epoch": 0.7489334470989761, + "grad_norm": 0.447725193844857, + "learning_rate": 0.00015997626634170112, + "loss": 0.9732, + "step": 3511 + }, + { + "epoch": 0.7491467576791809, + "grad_norm": 0.5217158813256343, + "learning_rate": 0.00015997605861862624, + "loss": 0.9987, + "step": 3512 + }, + { + "epoch": 0.7493600682593856, + "grad_norm": 0.5707069677126705, + "learning_rate": 0.00015997584999062094, + "loss": 1.0322, + "step": 3513 + }, + { + "epoch": 0.7495733788395904, + "grad_norm": 0.6517908249344591, + "learning_rate": 0.0001599756404576875, + "loss": 0.9953, + "step": 3514 + }, + { + "epoch": 0.7497866894197952, + "grad_norm": 0.7384480801660079, + "learning_rate": 0.00015997543001982837, + "loss": 0.9878, + "step": 3515 + }, + { + "epoch": 0.75, + "grad_norm": 0.7196384139103551, + "learning_rate": 0.0001599752186770459, + "loss": 0.9432, + "step": 3516 + }, + { + "epoch": 0.7502133105802048, + "grad_norm": 0.630709225730263, + "learning_rate": 0.0001599750064293425, + "loss": 0.9896, + "step": 3517 + }, + { + "epoch": 0.7504266211604096, + "grad_norm": 0.5452982005050532, + "learning_rate": 0.00015997479327672055, + "loss": 0.9606, + "step": 3518 + }, + { + "epoch": 0.7506399317406144, + "grad_norm": 0.5666785921320612, + "learning_rate": 0.00015997457921918245, + "loss": 0.9937, + "step": 3519 + }, + { + "epoch": 0.7508532423208191, + "grad_norm": 0.6163304567829156, + "learning_rate": 0.00015997436425673066, + "loss": 1.0035, + "step": 3520 + }, + { + "epoch": 0.7510665529010239, + "grad_norm": 0.5691047295036686, + "learning_rate": 0.00015997414838936756, + "loss": 0.9819, + "step": 3521 + }, + { + "epoch": 0.7512798634812287, + "grad_norm": 0.43684019209640024, + "learning_rate": 0.0001599739316170957, + "loss": 1.0214, + "step": 3522 + }, + { + "epoch": 0.7514931740614335, + "grad_norm": 0.3922201970055015, + "learning_rate": 0.0001599737139399174, + "loss": 0.9968, + "step": 3523 + }, + { + "epoch": 0.7517064846416383, + "grad_norm": 0.4366914168811387, + "learning_rate": 0.00015997349535783518, + "loss": 0.9747, + "step": 3524 + }, + { + "epoch": 0.751919795221843, + "grad_norm": 0.5844775204747769, + "learning_rate": 0.00015997327587085155, + "loss": 0.9946, + "step": 3525 + }, + { + "epoch": 0.7521331058020477, + "grad_norm": 0.6692759982774754, + "learning_rate": 0.0001599730554789689, + "loss": 0.993, + "step": 3526 + }, + { + "epoch": 0.7523464163822525, + "grad_norm": 0.614636282569812, + "learning_rate": 0.00015997283418218983, + "loss": 0.9874, + "step": 3527 + }, + { + "epoch": 0.7525597269624573, + "grad_norm": 0.5407928210365206, + "learning_rate": 0.0001599726119805168, + "loss": 0.9694, + "step": 3528 + }, + { + "epoch": 0.7527730375426621, + "grad_norm": 0.6135440116978241, + "learning_rate": 0.00015997238887395228, + "loss": 0.9861, + "step": 3529 + }, + { + "epoch": 0.7529863481228669, + "grad_norm": 0.8236953914421425, + "learning_rate": 0.00015997216486249888, + "loss": 1.0131, + "step": 3530 + }, + { + "epoch": 0.7531996587030717, + "grad_norm": 1.0345165735876776, + "learning_rate": 0.00015997193994615906, + "loss": 0.973, + "step": 3531 + }, + { + "epoch": 0.7534129692832765, + "grad_norm": 1.196777885924149, + "learning_rate": 0.00015997171412493542, + "loss": 1.0195, + "step": 3532 + }, + { + "epoch": 0.7536262798634812, + "grad_norm": 0.7364146107855812, + "learning_rate": 0.00015997148739883046, + "loss": 0.9855, + "step": 3533 + }, + { + "epoch": 0.753839590443686, + "grad_norm": 0.6379599041426377, + "learning_rate": 0.0001599712597678468, + "loss": 0.9991, + "step": 3534 + }, + { + "epoch": 0.7540529010238908, + "grad_norm": 0.8114246422372623, + "learning_rate": 0.00015997103123198696, + "loss": 1.0024, + "step": 3535 + }, + { + "epoch": 0.7542662116040956, + "grad_norm": 1.008734321160477, + "learning_rate": 0.0001599708017912536, + "loss": 0.9836, + "step": 3536 + }, + { + "epoch": 0.7544795221843004, + "grad_norm": 1.17877963499892, + "learning_rate": 0.00015997057144564924, + "loss": 1.0219, + "step": 3537 + }, + { + "epoch": 0.7546928327645052, + "grad_norm": 0.9724156387331131, + "learning_rate": 0.00015997034019517653, + "loss": 0.9903, + "step": 3538 + }, + { + "epoch": 0.7549061433447098, + "grad_norm": 0.8968244310268274, + "learning_rate": 0.00015997010803983806, + "loss": 1.0236, + "step": 3539 + }, + { + "epoch": 0.7551194539249146, + "grad_norm": 0.7322847912003667, + "learning_rate": 0.0001599698749796365, + "loss": 1.0316, + "step": 3540 + }, + { + "epoch": 0.7553327645051194, + "grad_norm": 0.3963336117036846, + "learning_rate": 0.00015996964101457444, + "loss": 1.0, + "step": 3541 + }, + { + "epoch": 0.7555460750853242, + "grad_norm": 0.35644461671028344, + "learning_rate": 0.00015996940614465451, + "loss": 0.9873, + "step": 3542 + }, + { + "epoch": 0.755759385665529, + "grad_norm": 0.5127241535701131, + "learning_rate": 0.00015996917036987946, + "loss": 1.0156, + "step": 3543 + }, + { + "epoch": 0.7559726962457338, + "grad_norm": 0.6689431002864341, + "learning_rate": 0.0001599689336902519, + "loss": 1.0283, + "step": 3544 + }, + { + "epoch": 0.7561860068259386, + "grad_norm": 0.7629751592503922, + "learning_rate": 0.0001599686961057745, + "loss": 0.978, + "step": 3545 + }, + { + "epoch": 0.7563993174061433, + "grad_norm": 0.8702730623920446, + "learning_rate": 0.00015996845761644997, + "loss": 0.9965, + "step": 3546 + }, + { + "epoch": 0.7566126279863481, + "grad_norm": 0.913032302186929, + "learning_rate": 0.00015996821822228098, + "loss": 1.0241, + "step": 3547 + }, + { + "epoch": 0.7568259385665529, + "grad_norm": 0.8820062441689566, + "learning_rate": 0.00015996797792327023, + "loss": 0.9919, + "step": 3548 + }, + { + "epoch": 0.7570392491467577, + "grad_norm": 0.820927530330856, + "learning_rate": 0.0001599677367194205, + "loss": 1.0134, + "step": 3549 + }, + { + "epoch": 0.7572525597269625, + "grad_norm": 0.7205955753742633, + "learning_rate": 0.00015996749461073449, + "loss": 0.9993, + "step": 3550 + }, + { + "epoch": 0.7574658703071673, + "grad_norm": 0.5742166077783858, + "learning_rate": 0.0001599672515972149, + "loss": 0.9989, + "step": 3551 + }, + { + "epoch": 0.757679180887372, + "grad_norm": 0.5826889182438959, + "learning_rate": 0.00015996700767886455, + "loss": 1.0005, + "step": 3552 + }, + { + "epoch": 0.7578924914675768, + "grad_norm": 0.5263494875439036, + "learning_rate": 0.00015996676285568617, + "loss": 0.9729, + "step": 3553 + }, + { + "epoch": 0.7581058020477816, + "grad_norm": 0.3852383534156712, + "learning_rate": 0.0001599665171276825, + "loss": 0.9826, + "step": 3554 + }, + { + "epoch": 0.7583191126279863, + "grad_norm": 0.5041969963897647, + "learning_rate": 0.00015996627049485632, + "loss": 0.9896, + "step": 3555 + }, + { + "epoch": 0.7585324232081911, + "grad_norm": 0.5816138422012802, + "learning_rate": 0.00015996602295721048, + "loss": 0.9645, + "step": 3556 + }, + { + "epoch": 0.7587457337883959, + "grad_norm": 0.6350597559928131, + "learning_rate": 0.00015996577451474772, + "loss": 0.9865, + "step": 3557 + }, + { + "epoch": 0.7589590443686007, + "grad_norm": 0.46219107938102705, + "learning_rate": 0.00015996552516747088, + "loss": 0.9669, + "step": 3558 + }, + { + "epoch": 0.7591723549488054, + "grad_norm": 0.4009050078368423, + "learning_rate": 0.0001599652749153828, + "loss": 0.9907, + "step": 3559 + }, + { + "epoch": 0.7593856655290102, + "grad_norm": 0.4873978026362066, + "learning_rate": 0.00015996502375848628, + "loss": 0.9898, + "step": 3560 + }, + { + "epoch": 0.759598976109215, + "grad_norm": 1.408433416067991, + "learning_rate": 0.00015996477169678418, + "loss": 0.999, + "step": 3561 + }, + { + "epoch": 0.7598122866894198, + "grad_norm": 0.3969243264020471, + "learning_rate": 0.00015996451873027931, + "loss": 0.9864, + "step": 3562 + }, + { + "epoch": 0.7600255972696246, + "grad_norm": 0.5871004510408023, + "learning_rate": 0.0001599642648589746, + "loss": 1.0, + "step": 3563 + }, + { + "epoch": 0.7602389078498294, + "grad_norm": 0.5696770267971937, + "learning_rate": 0.00015996401008287287, + "loss": 1.0104, + "step": 3564 + }, + { + "epoch": 0.7604522184300341, + "grad_norm": 0.6497279935441218, + "learning_rate": 0.00015996375440197703, + "loss": 1.0095, + "step": 3565 + }, + { + "epoch": 0.7606655290102389, + "grad_norm": 0.8574901073565969, + "learning_rate": 0.00015996349781628995, + "loss": 1.0087, + "step": 3566 + }, + { + "epoch": 0.7608788395904437, + "grad_norm": 1.2302072994345967, + "learning_rate": 0.00015996324032581456, + "loss": 1.0389, + "step": 3567 + }, + { + "epoch": 0.7610921501706485, + "grad_norm": 0.7186249509073857, + "learning_rate": 0.00015996298193055375, + "loss": 1.0245, + "step": 3568 + }, + { + "epoch": 0.7613054607508533, + "grad_norm": 0.569992132261926, + "learning_rate": 0.00015996272263051048, + "loss": 0.9832, + "step": 3569 + }, + { + "epoch": 0.761518771331058, + "grad_norm": 0.6508434452177562, + "learning_rate": 0.00015996246242568765, + "loss": 1.0061, + "step": 3570 + }, + { + "epoch": 0.7617320819112628, + "grad_norm": 0.8583761561626148, + "learning_rate": 0.0001599622013160882, + "loss": 0.9798, + "step": 3571 + }, + { + "epoch": 0.7619453924914675, + "grad_norm": 1.0236095895372912, + "learning_rate": 0.0001599619393017151, + "loss": 0.9719, + "step": 3572 + }, + { + "epoch": 0.7621587030716723, + "grad_norm": 1.0428557148282893, + "learning_rate": 0.0001599616763825713, + "loss": 0.9783, + "step": 3573 + }, + { + "epoch": 0.7623720136518771, + "grad_norm": 0.8793796460080945, + "learning_rate": 0.0001599614125586598, + "loss": 0.9969, + "step": 3574 + }, + { + "epoch": 0.7625853242320819, + "grad_norm": 0.649572887613564, + "learning_rate": 0.00015996114782998355, + "loss": 0.9785, + "step": 3575 + }, + { + "epoch": 0.7627986348122867, + "grad_norm": 0.5272490176227781, + "learning_rate": 0.0001599608821965456, + "loss": 0.993, + "step": 3576 + }, + { + "epoch": 0.7630119453924915, + "grad_norm": 0.48177380215575033, + "learning_rate": 0.0001599606156583489, + "loss": 1.0192, + "step": 3577 + }, + { + "epoch": 0.7632252559726962, + "grad_norm": 0.6853353302134398, + "learning_rate": 0.00015996034821539652, + "loss": 1.0044, + "step": 3578 + }, + { + "epoch": 0.763438566552901, + "grad_norm": 1.14610025826189, + "learning_rate": 0.00015996007986769143, + "loss": 1.0354, + "step": 3579 + }, + { + "epoch": 0.7636518771331058, + "grad_norm": 0.5885725757916797, + "learning_rate": 0.0001599598106152367, + "loss": 0.9757, + "step": 3580 + }, + { + "epoch": 0.7638651877133106, + "grad_norm": 0.4627012106265978, + "learning_rate": 0.00015995954045803536, + "loss": 1.0233, + "step": 3581 + }, + { + "epoch": 0.7640784982935154, + "grad_norm": 0.47971116487370996, + "learning_rate": 0.00015995926939609047, + "loss": 1.0271, + "step": 3582 + }, + { + "epoch": 0.7642918088737202, + "grad_norm": 0.7471233486785792, + "learning_rate": 0.00015995899742940513, + "loss": 1.008, + "step": 3583 + }, + { + "epoch": 0.764505119453925, + "grad_norm": 0.9212750030828882, + "learning_rate": 0.00015995872455798235, + "loss": 0.9925, + "step": 3584 + }, + { + "epoch": 0.7647184300341296, + "grad_norm": 1.1897944234320585, + "learning_rate": 0.00015995845078182528, + "loss": 0.9482, + "step": 3585 + }, + { + "epoch": 0.7649317406143344, + "grad_norm": 0.7682878951051438, + "learning_rate": 0.000159958176100937, + "loss": 0.9963, + "step": 3586 + }, + { + "epoch": 0.7651450511945392, + "grad_norm": 0.5488198732857644, + "learning_rate": 0.00015995790051532058, + "loss": 0.9705, + "step": 3587 + }, + { + "epoch": 0.765358361774744, + "grad_norm": 0.45866935073526205, + "learning_rate": 0.00015995762402497922, + "loss": 1.0066, + "step": 3588 + }, + { + "epoch": 0.7655716723549488, + "grad_norm": 0.6350138168507485, + "learning_rate": 0.00015995734662991595, + "loss": 0.982, + "step": 3589 + }, + { + "epoch": 0.7657849829351536, + "grad_norm": 0.7103473583996214, + "learning_rate": 0.000159957068330134, + "loss": 0.9853, + "step": 3590 + }, + { + "epoch": 0.7659982935153583, + "grad_norm": 0.7922728907428837, + "learning_rate": 0.00015995678912563644, + "loss": 0.9951, + "step": 3591 + }, + { + "epoch": 0.7662116040955631, + "grad_norm": 0.980937345333103, + "learning_rate": 0.0001599565090164265, + "loss": 0.985, + "step": 3592 + }, + { + "epoch": 0.7664249146757679, + "grad_norm": 0.7091823574076888, + "learning_rate": 0.0001599562280025073, + "loss": 1.0081, + "step": 3593 + }, + { + "epoch": 0.7666382252559727, + "grad_norm": 0.5242301931660962, + "learning_rate": 0.00015995594608388204, + "loss": 0.9667, + "step": 3594 + }, + { + "epoch": 0.7668515358361775, + "grad_norm": 0.5135914893257494, + "learning_rate": 0.0001599556632605539, + "loss": 1.0364, + "step": 3595 + }, + { + "epoch": 0.7670648464163823, + "grad_norm": 0.4526570365604271, + "learning_rate": 0.0001599553795325261, + "loss": 1.0054, + "step": 3596 + }, + { + "epoch": 0.7672781569965871, + "grad_norm": 0.4861587993342933, + "learning_rate": 0.0001599550948998018, + "loss": 1.0031, + "step": 3597 + }, + { + "epoch": 0.7674914675767918, + "grad_norm": 0.4685164527854361, + "learning_rate": 0.00015995480936238426, + "loss": 1.0276, + "step": 3598 + }, + { + "epoch": 0.7677047781569966, + "grad_norm": 0.4680877032398802, + "learning_rate": 0.00015995452292027673, + "loss": 0.9516, + "step": 3599 + }, + { + "epoch": 0.7679180887372014, + "grad_norm": 0.6539174583126398, + "learning_rate": 0.00015995423557348245, + "loss": 1.0255, + "step": 3600 + }, + { + "epoch": 0.7681313993174061, + "grad_norm": 0.8205925996132236, + "learning_rate": 0.00015995394732200462, + "loss": 0.9877, + "step": 3601 + }, + { + "epoch": 0.768344709897611, + "grad_norm": 0.9646349679052144, + "learning_rate": 0.00015995365816584654, + "loss": 1.0379, + "step": 3602 + }, + { + "epoch": 0.7685580204778157, + "grad_norm": 1.0970345891789028, + "learning_rate": 0.00015995336810501146, + "loss": 0.9926, + "step": 3603 + }, + { + "epoch": 0.7687713310580204, + "grad_norm": 0.8085599765585836, + "learning_rate": 0.0001599530771395027, + "loss": 1.0154, + "step": 3604 + }, + { + "epoch": 0.7689846416382252, + "grad_norm": 0.6296232636934981, + "learning_rate": 0.0001599527852693235, + "loss": 1.0267, + "step": 3605 + }, + { + "epoch": 0.76919795221843, + "grad_norm": 0.6723407389510451, + "learning_rate": 0.00015995249249447718, + "loss": 0.9905, + "step": 3606 + }, + { + "epoch": 0.7694112627986348, + "grad_norm": 0.8037481044175868, + "learning_rate": 0.00015995219881496708, + "loss": 1.0006, + "step": 3607 + }, + { + "epoch": 0.7696245733788396, + "grad_norm": 0.7799285265785629, + "learning_rate": 0.00015995190423079652, + "loss": 1.0, + "step": 3608 + }, + { + "epoch": 0.7698378839590444, + "grad_norm": 0.5461824676533376, + "learning_rate": 0.0001599516087419688, + "loss": 0.9954, + "step": 3609 + }, + { + "epoch": 0.7700511945392492, + "grad_norm": 0.3775620503807146, + "learning_rate": 0.0001599513123484873, + "loss": 0.9911, + "step": 3610 + }, + { + "epoch": 0.7702645051194539, + "grad_norm": 0.5064914555333347, + "learning_rate": 0.00015995101505035533, + "loss": 0.9947, + "step": 3611 + }, + { + "epoch": 0.7704778156996587, + "grad_norm": 0.6918263462893103, + "learning_rate": 0.00015995071684757632, + "loss": 0.9981, + "step": 3612 + }, + { + "epoch": 0.7706911262798635, + "grad_norm": 1.4957324656359547, + "learning_rate": 0.00015995041774015357, + "loss": 1.0287, + "step": 3613 + }, + { + "epoch": 0.7709044368600683, + "grad_norm": 0.4302120326917958, + "learning_rate": 0.0001599501177280905, + "loss": 0.9925, + "step": 3614 + }, + { + "epoch": 0.7711177474402731, + "grad_norm": 1.1146087409529668, + "learning_rate": 0.0001599498168113905, + "loss": 1.0291, + "step": 3615 + }, + { + "epoch": 0.7713310580204779, + "grad_norm": 1.0922384321723322, + "learning_rate": 0.000159949514990057, + "loss": 1.0221, + "step": 3616 + }, + { + "epoch": 0.7715443686006825, + "grad_norm": 0.8342891065352146, + "learning_rate": 0.00015994921226409337, + "loss": 0.9926, + "step": 3617 + }, + { + "epoch": 0.7717576791808873, + "grad_norm": 0.8703440089923933, + "learning_rate": 0.00015994890863350308, + "loss": 1.0111, + "step": 3618 + }, + { + "epoch": 0.7719709897610921, + "grad_norm": 0.7986258463942043, + "learning_rate": 0.00015994860409828955, + "loss": 1.0118, + "step": 3619 + }, + { + "epoch": 0.7721843003412969, + "grad_norm": 0.696967675437126, + "learning_rate": 0.00015994829865845618, + "loss": 0.9952, + "step": 3620 + }, + { + "epoch": 0.7723976109215017, + "grad_norm": 0.5578495954012639, + "learning_rate": 0.0001599479923140065, + "loss": 1.0148, + "step": 3621 + }, + { + "epoch": 0.7726109215017065, + "grad_norm": 0.4816603268333951, + "learning_rate": 0.00015994768506494392, + "loss": 1.0078, + "step": 3622 + }, + { + "epoch": 0.7728242320819113, + "grad_norm": 0.5478919101282885, + "learning_rate": 0.00015994737691127196, + "loss": 0.9958, + "step": 3623 + }, + { + "epoch": 0.773037542662116, + "grad_norm": 0.6753257148244377, + "learning_rate": 0.00015994706785299408, + "loss": 0.9834, + "step": 3624 + }, + { + "epoch": 0.7732508532423208, + "grad_norm": 0.552795824326851, + "learning_rate": 0.00015994675789011376, + "loss": 1.0173, + "step": 3625 + }, + { + "epoch": 0.7734641638225256, + "grad_norm": 0.6028052523741788, + "learning_rate": 0.00015994644702263454, + "loss": 1.0085, + "step": 3626 + }, + { + "epoch": 0.7736774744027304, + "grad_norm": 0.6655680519736621, + "learning_rate": 0.00015994613525055997, + "loss": 1.0001, + "step": 3627 + }, + { + "epoch": 0.7738907849829352, + "grad_norm": 0.6224308886559543, + "learning_rate": 0.00015994582257389348, + "loss": 1.0155, + "step": 3628 + }, + { + "epoch": 0.77410409556314, + "grad_norm": 0.471346283578459, + "learning_rate": 0.0001599455089926387, + "loss": 0.9813, + "step": 3629 + }, + { + "epoch": 0.7743174061433447, + "grad_norm": 0.3975057323410497, + "learning_rate": 0.0001599451945067991, + "loss": 1.0056, + "step": 3630 + }, + { + "epoch": 0.7745307167235495, + "grad_norm": 0.423302183596002, + "learning_rate": 0.0001599448791163783, + "loss": 0.9914, + "step": 3631 + }, + { + "epoch": 0.7747440273037542, + "grad_norm": 0.4909146724725356, + "learning_rate": 0.00015994456282137986, + "loss": 0.9766, + "step": 3632 + }, + { + "epoch": 0.774957337883959, + "grad_norm": 0.6009689343291374, + "learning_rate": 0.00015994424562180734, + "loss": 0.9818, + "step": 3633 + }, + { + "epoch": 0.7751706484641638, + "grad_norm": 0.633683661527295, + "learning_rate": 0.00015994392751766435, + "loss": 0.9827, + "step": 3634 + }, + { + "epoch": 0.7753839590443686, + "grad_norm": 0.46980971759443957, + "learning_rate": 0.00015994360850895444, + "loss": 0.9827, + "step": 3635 + }, + { + "epoch": 0.7755972696245734, + "grad_norm": 0.46024490831935794, + "learning_rate": 0.0001599432885956813, + "loss": 0.9593, + "step": 3636 + }, + { + "epoch": 0.7758105802047781, + "grad_norm": 0.5235555367188387, + "learning_rate": 0.00015994296777784847, + "loss": 0.9984, + "step": 3637 + }, + { + "epoch": 0.7760238907849829, + "grad_norm": 0.5732651875636157, + "learning_rate": 0.0001599426460554596, + "loss": 1.0082, + "step": 3638 + }, + { + "epoch": 0.7762372013651877, + "grad_norm": 0.4565230795776931, + "learning_rate": 0.0001599423234285184, + "loss": 0.9644, + "step": 3639 + }, + { + "epoch": 0.7764505119453925, + "grad_norm": 0.4789100159516409, + "learning_rate": 0.00015994199989702844, + "loss": 0.9691, + "step": 3640 + }, + { + "epoch": 0.7766638225255973, + "grad_norm": 0.5542881485434911, + "learning_rate": 0.00015994167546099338, + "loss": 0.9844, + "step": 3641 + }, + { + "epoch": 0.7768771331058021, + "grad_norm": 0.6293390719781375, + "learning_rate": 0.00015994135012041696, + "loss": 0.9881, + "step": 3642 + }, + { + "epoch": 0.7770904436860068, + "grad_norm": 0.8385184826868528, + "learning_rate": 0.00015994102387530278, + "loss": 1.0209, + "step": 3643 + }, + { + "epoch": 0.7773037542662116, + "grad_norm": 0.9180961879361001, + "learning_rate": 0.0001599406967256546, + "loss": 1.0121, + "step": 3644 + }, + { + "epoch": 0.7775170648464164, + "grad_norm": 0.9778063562691572, + "learning_rate": 0.00015994036867147607, + "loss": 0.9753, + "step": 3645 + }, + { + "epoch": 0.7777303754266212, + "grad_norm": 1.0243565725187467, + "learning_rate": 0.00015994003971277096, + "loss": 1.0086, + "step": 3646 + }, + { + "epoch": 0.777943686006826, + "grad_norm": 1.000904241414696, + "learning_rate": 0.00015993970984954292, + "loss": 0.9942, + "step": 3647 + }, + { + "epoch": 0.7781569965870307, + "grad_norm": 0.9649446792026142, + "learning_rate": 0.00015993937908179574, + "loss": 0.9901, + "step": 3648 + }, + { + "epoch": 0.7783703071672355, + "grad_norm": 0.9344358810317112, + "learning_rate": 0.00015993904740953314, + "loss": 0.9893, + "step": 3649 + }, + { + "epoch": 0.7785836177474402, + "grad_norm": 0.8483859635378972, + "learning_rate": 0.00015993871483275884, + "loss": 0.9849, + "step": 3650 + }, + { + "epoch": 0.778796928327645, + "grad_norm": 0.6710051471334902, + "learning_rate": 0.00015993838135147667, + "loss": 0.975, + "step": 3651 + }, + { + "epoch": 0.7790102389078498, + "grad_norm": 0.6320985796498311, + "learning_rate": 0.00015993804696569036, + "loss": 1.041, + "step": 3652 + }, + { + "epoch": 0.7792235494880546, + "grad_norm": 0.40427787850334945, + "learning_rate": 0.00015993771167540372, + "loss": 1.0053, + "step": 3653 + }, + { + "epoch": 0.7794368600682594, + "grad_norm": 0.7554267271919572, + "learning_rate": 0.0001599373754806205, + "loss": 0.9768, + "step": 3654 + }, + { + "epoch": 0.7796501706484642, + "grad_norm": 0.631117362570729, + "learning_rate": 0.00015993703838134457, + "loss": 0.9851, + "step": 3655 + }, + { + "epoch": 0.7798634812286689, + "grad_norm": 0.5930238579590718, + "learning_rate": 0.00015993670037757967, + "loss": 1.0023, + "step": 3656 + }, + { + "epoch": 0.7800767918088737, + "grad_norm": 0.4901614341589808, + "learning_rate": 0.00015993636146932966, + "loss": 0.9916, + "step": 3657 + }, + { + "epoch": 0.7802901023890785, + "grad_norm": 0.554399181172103, + "learning_rate": 0.00015993602165659837, + "loss": 1.0036, + "step": 3658 + }, + { + "epoch": 0.7805034129692833, + "grad_norm": 0.6626666586529539, + "learning_rate": 0.0001599356809393897, + "loss": 0.9943, + "step": 3659 + }, + { + "epoch": 0.7807167235494881, + "grad_norm": 0.6569961714377681, + "learning_rate": 0.00015993533931770741, + "loss": 0.9795, + "step": 3660 + }, + { + "epoch": 0.7809300341296929, + "grad_norm": 0.4164725584196454, + "learning_rate": 0.00015993499679155541, + "loss": 0.9745, + "step": 3661 + }, + { + "epoch": 0.7811433447098977, + "grad_norm": 0.6266628847350099, + "learning_rate": 0.0001599346533609376, + "loss": 0.9979, + "step": 3662 + }, + { + "epoch": 0.7813566552901023, + "grad_norm": 0.7918298367101855, + "learning_rate": 0.00015993430902585783, + "loss": 1.0154, + "step": 3663 + }, + { + "epoch": 0.7815699658703071, + "grad_norm": 0.650683546872967, + "learning_rate": 0.00015993396378632003, + "loss": 1.0113, + "step": 3664 + }, + { + "epoch": 0.7817832764505119, + "grad_norm": 0.596033735786947, + "learning_rate": 0.00015993361764232806, + "loss": 0.9944, + "step": 3665 + }, + { + "epoch": 0.7819965870307167, + "grad_norm": 0.7267472424780153, + "learning_rate": 0.00015993327059388588, + "loss": 0.9882, + "step": 3666 + }, + { + "epoch": 0.7822098976109215, + "grad_norm": 0.7004264663130422, + "learning_rate": 0.00015993292264099738, + "loss": 1.0142, + "step": 3667 + }, + { + "epoch": 0.7824232081911263, + "grad_norm": 0.6834890043610184, + "learning_rate": 0.00015993257378366652, + "loss": 0.9947, + "step": 3668 + }, + { + "epoch": 0.7826365187713311, + "grad_norm": 0.6537973916719697, + "learning_rate": 0.00015993222402189725, + "loss": 0.9904, + "step": 3669 + }, + { + "epoch": 0.7828498293515358, + "grad_norm": 0.54595344575316, + "learning_rate": 0.00015993187335569355, + "loss": 1.0054, + "step": 3670 + }, + { + "epoch": 0.7830631399317406, + "grad_norm": 0.6438760770568717, + "learning_rate": 0.00015993152178505932, + "loss": 0.9932, + "step": 3671 + }, + { + "epoch": 0.7832764505119454, + "grad_norm": 0.625247918070549, + "learning_rate": 0.00015993116930999857, + "loss": 0.9867, + "step": 3672 + }, + { + "epoch": 0.7834897610921502, + "grad_norm": 0.5457101030253122, + "learning_rate": 0.00015993081593051529, + "loss": 1.0089, + "step": 3673 + }, + { + "epoch": 0.783703071672355, + "grad_norm": 0.516110876623927, + "learning_rate": 0.0001599304616466135, + "loss": 1.0039, + "step": 3674 + }, + { + "epoch": 0.7839163822525598, + "grad_norm": 0.5050322625181118, + "learning_rate": 0.00015993010645829722, + "loss": 1.0082, + "step": 3675 + }, + { + "epoch": 0.7841296928327645, + "grad_norm": 0.49658620244315516, + "learning_rate": 0.0001599297503655704, + "loss": 0.9888, + "step": 3676 + }, + { + "epoch": 0.7843430034129693, + "grad_norm": 0.5175186917306063, + "learning_rate": 0.00015992939336843714, + "loss": 1.0134, + "step": 3677 + }, + { + "epoch": 0.784556313993174, + "grad_norm": 0.5875166084348933, + "learning_rate": 0.00015992903546690142, + "loss": 0.9963, + "step": 3678 + }, + { + "epoch": 0.7847696245733788, + "grad_norm": 0.555635525620799, + "learning_rate": 0.00015992867666096732, + "loss": 1.0103, + "step": 3679 + }, + { + "epoch": 0.7849829351535836, + "grad_norm": 0.46961113009136163, + "learning_rate": 0.00015992831695063893, + "loss": 0.9909, + "step": 3680 + }, + { + "epoch": 0.7851962457337884, + "grad_norm": 0.547063389763823, + "learning_rate": 0.00015992795633592026, + "loss": 1.0352, + "step": 3681 + }, + { + "epoch": 0.7854095563139932, + "grad_norm": 0.5607661551500975, + "learning_rate": 0.00015992759481681546, + "loss": 0.9728, + "step": 3682 + }, + { + "epoch": 0.7856228668941979, + "grad_norm": 0.5322161174508245, + "learning_rate": 0.00015992723239332854, + "loss": 1.0051, + "step": 3683 + }, + { + "epoch": 0.7858361774744027, + "grad_norm": 0.5848142609947308, + "learning_rate": 0.00015992686906546365, + "loss": 0.99, + "step": 3684 + }, + { + "epoch": 0.7860494880546075, + "grad_norm": 0.5695068875039179, + "learning_rate": 0.0001599265048332249, + "loss": 0.9952, + "step": 3685 + }, + { + "epoch": 0.7862627986348123, + "grad_norm": 0.49662283720234707, + "learning_rate": 0.0001599261396966164, + "loss": 0.9869, + "step": 3686 + }, + { + "epoch": 0.7864761092150171, + "grad_norm": 0.6166306498770073, + "learning_rate": 0.0001599257736556423, + "loss": 0.9904, + "step": 3687 + }, + { + "epoch": 0.7866894197952219, + "grad_norm": 0.670112130700452, + "learning_rate": 0.00015992540671030668, + "loss": 1.0288, + "step": 3688 + }, + { + "epoch": 0.7869027303754266, + "grad_norm": 0.6663008507105858, + "learning_rate": 0.0001599250388606138, + "loss": 0.9845, + "step": 3689 + }, + { + "epoch": 0.7871160409556314, + "grad_norm": 0.6485379896589618, + "learning_rate": 0.0001599246701065677, + "loss": 1.0307, + "step": 3690 + }, + { + "epoch": 0.7873293515358362, + "grad_norm": 0.6108150091012612, + "learning_rate": 0.00015992430044817267, + "loss": 0.9961, + "step": 3691 + }, + { + "epoch": 0.787542662116041, + "grad_norm": 0.6113707985416993, + "learning_rate": 0.00015992392988543278, + "loss": 0.9804, + "step": 3692 + }, + { + "epoch": 0.7877559726962458, + "grad_norm": 0.5100304377559375, + "learning_rate": 0.00015992355841835232, + "loss": 0.9917, + "step": 3693 + }, + { + "epoch": 0.7879692832764505, + "grad_norm": 0.41033325003624066, + "learning_rate": 0.00015992318604693545, + "loss": 0.9711, + "step": 3694 + }, + { + "epoch": 0.7881825938566553, + "grad_norm": 0.4346858131786567, + "learning_rate": 0.00015992281277118638, + "loss": 0.9899, + "step": 3695 + }, + { + "epoch": 0.78839590443686, + "grad_norm": 0.49803253534003095, + "learning_rate": 0.00015992243859110934, + "loss": 1.0121, + "step": 3696 + }, + { + "epoch": 0.7886092150170648, + "grad_norm": 0.5199453848951904, + "learning_rate": 0.00015992206350670855, + "loss": 0.9787, + "step": 3697 + }, + { + "epoch": 0.7888225255972696, + "grad_norm": 0.48254214366903897, + "learning_rate": 0.00015992168751798828, + "loss": 1.0049, + "step": 3698 + }, + { + "epoch": 0.7890358361774744, + "grad_norm": 0.5090650249013261, + "learning_rate": 0.00015992131062495278, + "loss": 0.9937, + "step": 3699 + }, + { + "epoch": 0.7892491467576792, + "grad_norm": 0.5740797853826728, + "learning_rate": 0.00015992093282760628, + "loss": 0.9973, + "step": 3700 + }, + { + "epoch": 0.789462457337884, + "grad_norm": 0.6777138800321976, + "learning_rate": 0.0001599205541259531, + "loss": 0.9779, + "step": 3701 + }, + { + "epoch": 0.7896757679180887, + "grad_norm": 0.7497216013509572, + "learning_rate": 0.00015992017451999747, + "loss": 0.9528, + "step": 3702 + }, + { + "epoch": 0.7898890784982935, + "grad_norm": 0.7956113080407977, + "learning_rate": 0.00015991979400974376, + "loss": 0.993, + "step": 3703 + }, + { + "epoch": 0.7901023890784983, + "grad_norm": 0.8974112980635349, + "learning_rate": 0.00015991941259519625, + "loss": 1.0234, + "step": 3704 + }, + { + "epoch": 0.7903156996587031, + "grad_norm": 0.9334813483546837, + "learning_rate": 0.0001599190302763592, + "loss": 1.0119, + "step": 3705 + }, + { + "epoch": 0.7905290102389079, + "grad_norm": 0.8954253487601779, + "learning_rate": 0.000159918647053237, + "loss": 0.9895, + "step": 3706 + }, + { + "epoch": 0.7907423208191127, + "grad_norm": 0.8195502288678248, + "learning_rate": 0.00015991826292583396, + "loss": 1.0098, + "step": 3707 + }, + { + "epoch": 0.7909556313993175, + "grad_norm": 0.749353188108203, + "learning_rate": 0.00015991787789415446, + "loss": 0.9993, + "step": 3708 + }, + { + "epoch": 0.7911689419795221, + "grad_norm": 0.6443962590702009, + "learning_rate": 0.00015991749195820278, + "loss": 0.9886, + "step": 3709 + }, + { + "epoch": 0.7913822525597269, + "grad_norm": 0.5677333994684673, + "learning_rate": 0.00015991710511798336, + "loss": 1.0527, + "step": 3710 + }, + { + "epoch": 0.7915955631399317, + "grad_norm": 0.5667714628407172, + "learning_rate": 0.00015991671737350055, + "loss": 0.9798, + "step": 3711 + }, + { + "epoch": 0.7918088737201365, + "grad_norm": 0.5805440480879241, + "learning_rate": 0.00015991632872475872, + "loss": 0.9777, + "step": 3712 + }, + { + "epoch": 0.7920221843003413, + "grad_norm": 0.4480302635064237, + "learning_rate": 0.0001599159391717623, + "loss": 0.9887, + "step": 3713 + }, + { + "epoch": 0.7922354948805461, + "grad_norm": 0.395934802448821, + "learning_rate": 0.0001599155487145157, + "loss": 0.9921, + "step": 3714 + }, + { + "epoch": 0.7924488054607508, + "grad_norm": 0.4818016936445491, + "learning_rate": 0.0001599151573530233, + "loss": 1.0154, + "step": 3715 + }, + { + "epoch": 0.7926621160409556, + "grad_norm": 0.4466907172462746, + "learning_rate": 0.00015991476508728957, + "loss": 1.0009, + "step": 3716 + }, + { + "epoch": 0.7928754266211604, + "grad_norm": 0.5209165571308182, + "learning_rate": 0.0001599143719173189, + "loss": 0.9982, + "step": 3717 + }, + { + "epoch": 0.7930887372013652, + "grad_norm": 0.4440623112148406, + "learning_rate": 0.00015991397784311578, + "loss": 0.9676, + "step": 3718 + }, + { + "epoch": 0.79330204778157, + "grad_norm": 0.33460751620452384, + "learning_rate": 0.00015991358286468466, + "loss": 0.9776, + "step": 3719 + }, + { + "epoch": 0.7935153583617748, + "grad_norm": 0.4926071243788357, + "learning_rate": 0.00015991318698203, + "loss": 0.9953, + "step": 3720 + }, + { + "epoch": 0.7937286689419796, + "grad_norm": 0.47535820998914596, + "learning_rate": 0.0001599127901951563, + "loss": 0.986, + "step": 3721 + }, + { + "epoch": 0.7939419795221843, + "grad_norm": 0.45697946240166365, + "learning_rate": 0.00015991239250406802, + "loss": 0.9612, + "step": 3722 + }, + { + "epoch": 0.794155290102389, + "grad_norm": 0.47901205580572254, + "learning_rate": 0.0001599119939087697, + "loss": 1.0002, + "step": 3723 + }, + { + "epoch": 0.7943686006825939, + "grad_norm": 0.5231231609639118, + "learning_rate": 0.00015991159440926578, + "loss": 0.9845, + "step": 3724 + }, + { + "epoch": 0.7945819112627986, + "grad_norm": 0.5984030034725296, + "learning_rate": 0.00015991119400556085, + "loss": 0.9975, + "step": 3725 + }, + { + "epoch": 0.7947952218430034, + "grad_norm": 0.6268582680764383, + "learning_rate": 0.00015991079269765942, + "loss": 1.0009, + "step": 3726 + }, + { + "epoch": 0.7950085324232082, + "grad_norm": 0.6534231591222915, + "learning_rate": 0.00015991039048556603, + "loss": 0.9992, + "step": 3727 + }, + { + "epoch": 0.7952218430034129, + "grad_norm": 0.7828524979863074, + "learning_rate": 0.00015990998736928523, + "loss": 0.986, + "step": 3728 + }, + { + "epoch": 0.7954351535836177, + "grad_norm": 0.8838383300856973, + "learning_rate": 0.00015990958334882155, + "loss": 1.024, + "step": 3729 + }, + { + "epoch": 0.7956484641638225, + "grad_norm": 0.8493967279708973, + "learning_rate": 0.0001599091784241796, + "loss": 0.9841, + "step": 3730 + }, + { + "epoch": 0.7958617747440273, + "grad_norm": 0.8196632576993487, + "learning_rate": 0.000159908772595364, + "loss": 0.9791, + "step": 3731 + }, + { + "epoch": 0.7960750853242321, + "grad_norm": 0.7326166138656324, + "learning_rate": 0.00015990836586237924, + "loss": 0.9961, + "step": 3732 + }, + { + "epoch": 0.7962883959044369, + "grad_norm": 0.5719461626926717, + "learning_rate": 0.00015990795822523, + "loss": 0.9535, + "step": 3733 + }, + { + "epoch": 0.7965017064846417, + "grad_norm": 0.47293283729329955, + "learning_rate": 0.00015990754968392084, + "loss": 1.0035, + "step": 3734 + }, + { + "epoch": 0.7967150170648464, + "grad_norm": 0.44602181627463683, + "learning_rate": 0.00015990714023845644, + "loss": 0.9578, + "step": 3735 + }, + { + "epoch": 0.7969283276450512, + "grad_norm": 0.4642459479047911, + "learning_rate": 0.0001599067298888414, + "loss": 0.9759, + "step": 3736 + }, + { + "epoch": 0.797141638225256, + "grad_norm": 0.5548954268601541, + "learning_rate": 0.00015990631863508037, + "loss": 1.0342, + "step": 3737 + }, + { + "epoch": 0.7973549488054608, + "grad_norm": 0.5771358225587613, + "learning_rate": 0.00015990590647717797, + "loss": 0.992, + "step": 3738 + }, + { + "epoch": 0.7975682593856656, + "grad_norm": 0.47913092345772196, + "learning_rate": 0.00015990549341513891, + "loss": 1.0212, + "step": 3739 + }, + { + "epoch": 0.7977815699658704, + "grad_norm": 0.4496104004239881, + "learning_rate": 0.00015990507944896787, + "loss": 1.0062, + "step": 3740 + }, + { + "epoch": 0.797994880546075, + "grad_norm": 0.5586733985571647, + "learning_rate": 0.00015990466457866949, + "loss": 0.9771, + "step": 3741 + }, + { + "epoch": 0.7982081911262798, + "grad_norm": 0.6648316442198615, + "learning_rate": 0.0001599042488042485, + "loss": 0.9827, + "step": 3742 + }, + { + "epoch": 0.7984215017064846, + "grad_norm": 0.7190354532519676, + "learning_rate": 0.0001599038321257096, + "loss": 0.9911, + "step": 3743 + }, + { + "epoch": 0.7986348122866894, + "grad_norm": 0.8380556049251239, + "learning_rate": 0.00015990341454305746, + "loss": 0.9843, + "step": 3744 + }, + { + "epoch": 0.7988481228668942, + "grad_norm": 0.8706959618839, + "learning_rate": 0.00015990299605629687, + "loss": 0.9768, + "step": 3745 + }, + { + "epoch": 0.799061433447099, + "grad_norm": 0.8777028770724431, + "learning_rate": 0.00015990257666543252, + "loss": 0.9807, + "step": 3746 + }, + { + "epoch": 0.7992747440273038, + "grad_norm": 0.8055012321813189, + "learning_rate": 0.00015990215637046916, + "loss": 0.9815, + "step": 3747 + }, + { + "epoch": 0.7994880546075085, + "grad_norm": 0.7617460918729585, + "learning_rate": 0.0001599017351714116, + "loss": 1.0145, + "step": 3748 + }, + { + "epoch": 0.7997013651877133, + "grad_norm": 0.7999269157681813, + "learning_rate": 0.00015990131306826454, + "loss": 0.965, + "step": 3749 + }, + { + "epoch": 0.7999146757679181, + "grad_norm": 0.8028491705662667, + "learning_rate": 0.00015990089006103275, + "loss": 1.0045, + "step": 3750 + }, + { + "epoch": 0.8001279863481229, + "grad_norm": 0.6215862744141422, + "learning_rate": 0.00015990046614972107, + "loss": 1.0056, + "step": 3751 + }, + { + "epoch": 0.8003412969283277, + "grad_norm": 0.4120597623766579, + "learning_rate": 0.0001599000413343343, + "loss": 0.9648, + "step": 3752 + }, + { + "epoch": 0.8005546075085325, + "grad_norm": 0.31297278355611335, + "learning_rate": 0.00015989961561487719, + "loss": 1.0117, + "step": 3753 + }, + { + "epoch": 0.8007679180887372, + "grad_norm": 0.32832619728342005, + "learning_rate": 0.00015989918899135457, + "loss": 0.9926, + "step": 3754 + }, + { + "epoch": 0.800981228668942, + "grad_norm": 0.3357504245861952, + "learning_rate": 0.0001598987614637713, + "loss": 0.9647, + "step": 3755 + }, + { + "epoch": 0.8011945392491467, + "grad_norm": 0.3604130520833723, + "learning_rate": 0.00015989833303213218, + "loss": 0.984, + "step": 3756 + }, + { + "epoch": 0.8014078498293515, + "grad_norm": 0.36229297710106917, + "learning_rate": 0.0001598979036964421, + "loss": 1.0059, + "step": 3757 + }, + { + "epoch": 0.8016211604095563, + "grad_norm": 0.3697277434055495, + "learning_rate": 0.0001598974734567059, + "loss": 0.9678, + "step": 3758 + }, + { + "epoch": 0.8018344709897611, + "grad_norm": 0.40715097465168365, + "learning_rate": 0.00015989704231292843, + "loss": 0.9744, + "step": 3759 + }, + { + "epoch": 0.8020477815699659, + "grad_norm": 0.5498085865945012, + "learning_rate": 0.0001598966102651146, + "loss": 1.0093, + "step": 3760 + }, + { + "epoch": 0.8022610921501706, + "grad_norm": 0.6650945638641016, + "learning_rate": 0.00015989617731326927, + "loss": 0.9891, + "step": 3761 + }, + { + "epoch": 0.8024744027303754, + "grad_norm": 0.806269294168823, + "learning_rate": 0.00015989574345739736, + "loss": 1.0245, + "step": 3762 + }, + { + "epoch": 0.8026877133105802, + "grad_norm": 0.9115492492711579, + "learning_rate": 0.00015989530869750372, + "loss": 0.9619, + "step": 3763 + }, + { + "epoch": 0.802901023890785, + "grad_norm": 0.9894431085902428, + "learning_rate": 0.0001598948730335934, + "loss": 1.008, + "step": 3764 + }, + { + "epoch": 0.8031143344709898, + "grad_norm": 0.9519891656289712, + "learning_rate": 0.0001598944364656712, + "loss": 1.0274, + "step": 3765 + }, + { + "epoch": 0.8033276450511946, + "grad_norm": 0.8461632557665177, + "learning_rate": 0.0001598939989937421, + "loss": 0.9972, + "step": 3766 + }, + { + "epoch": 0.8035409556313993, + "grad_norm": 0.7257337607994594, + "learning_rate": 0.00015989356061781108, + "loss": 0.9966, + "step": 3767 + }, + { + "epoch": 0.8037542662116041, + "grad_norm": 0.6170361512464232, + "learning_rate": 0.00015989312133788306, + "loss": 1.0009, + "step": 3768 + }, + { + "epoch": 0.8039675767918089, + "grad_norm": 0.48647329424570507, + "learning_rate": 0.000159892681153963, + "loss": 1.0025, + "step": 3769 + }, + { + "epoch": 0.8041808873720137, + "grad_norm": 0.41920072562633226, + "learning_rate": 0.00015989224006605598, + "loss": 0.9921, + "step": 3770 + }, + { + "epoch": 0.8043941979522184, + "grad_norm": 0.4260076915251829, + "learning_rate": 0.00015989179807416687, + "loss": 0.9736, + "step": 3771 + }, + { + "epoch": 0.8046075085324232, + "grad_norm": 0.4588462786685524, + "learning_rate": 0.00015989135517830073, + "loss": 0.9887, + "step": 3772 + }, + { + "epoch": 0.804820819112628, + "grad_norm": 0.460542236924828, + "learning_rate": 0.00015989091137846257, + "loss": 0.9821, + "step": 3773 + }, + { + "epoch": 0.8050341296928327, + "grad_norm": 0.4968000876234243, + "learning_rate": 0.0001598904666746574, + "loss": 0.9685, + "step": 3774 + }, + { + "epoch": 0.8052474402730375, + "grad_norm": 0.5214931476312141, + "learning_rate": 0.00015989002106689027, + "loss": 0.9876, + "step": 3775 + }, + { + "epoch": 0.8054607508532423, + "grad_norm": 0.5141374506826787, + "learning_rate": 0.00015988957455516617, + "loss": 1.0164, + "step": 3776 + }, + { + "epoch": 0.8056740614334471, + "grad_norm": 0.551815492790319, + "learning_rate": 0.0001598891271394902, + "loss": 0.981, + "step": 3777 + }, + { + "epoch": 0.8058873720136519, + "grad_norm": 0.5834114663323232, + "learning_rate": 0.00015988867881986746, + "loss": 0.9925, + "step": 3778 + }, + { + "epoch": 0.8061006825938567, + "grad_norm": 0.6598856460164539, + "learning_rate": 0.00015988822959630295, + "loss": 0.9648, + "step": 3779 + }, + { + "epoch": 0.8063139931740614, + "grad_norm": 0.6136557645973374, + "learning_rate": 0.00015988777946880176, + "loss": 1.0121, + "step": 3780 + }, + { + "epoch": 0.8065273037542662, + "grad_norm": 0.569849424442123, + "learning_rate": 0.000159887328437369, + "loss": 0.9996, + "step": 3781 + }, + { + "epoch": 0.806740614334471, + "grad_norm": 0.49184884683590524, + "learning_rate": 0.0001598868765020098, + "loss": 0.9877, + "step": 3782 + }, + { + "epoch": 0.8069539249146758, + "grad_norm": 0.4281133441159912, + "learning_rate": 0.00015988642366272923, + "loss": 0.9867, + "step": 3783 + }, + { + "epoch": 0.8071672354948806, + "grad_norm": 0.41445114311871517, + "learning_rate": 0.00015988596991953243, + "loss": 1.0117, + "step": 3784 + }, + { + "epoch": 0.8073805460750854, + "grad_norm": 0.41746955722235046, + "learning_rate": 0.00015988551527242455, + "loss": 0.9749, + "step": 3785 + }, + { + "epoch": 0.8075938566552902, + "grad_norm": 0.37328745729973756, + "learning_rate": 0.00015988505972141072, + "loss": 0.9761, + "step": 3786 + }, + { + "epoch": 0.8078071672354948, + "grad_norm": 0.3317444638337059, + "learning_rate": 0.00015988460326649609, + "loss": 1.0247, + "step": 3787 + }, + { + "epoch": 0.8080204778156996, + "grad_norm": 0.36430054800348205, + "learning_rate": 0.00015988414590768584, + "loss": 0.9794, + "step": 3788 + }, + { + "epoch": 0.8082337883959044, + "grad_norm": 0.4573962564059702, + "learning_rate": 0.0001598836876449851, + "loss": 0.9966, + "step": 3789 + }, + { + "epoch": 0.8084470989761092, + "grad_norm": 0.4278085755402099, + "learning_rate": 0.00015988322847839916, + "loss": 1.0192, + "step": 3790 + }, + { + "epoch": 0.808660409556314, + "grad_norm": 0.3861776752745725, + "learning_rate": 0.0001598827684079331, + "loss": 1.0026, + "step": 3791 + }, + { + "epoch": 0.8088737201365188, + "grad_norm": 0.46621094282805087, + "learning_rate": 0.00015988230743359215, + "loss": 1.0139, + "step": 3792 + }, + { + "epoch": 0.8090870307167235, + "grad_norm": 0.6059182609150796, + "learning_rate": 0.00015988184555538158, + "loss": 1.0062, + "step": 3793 + }, + { + "epoch": 0.8093003412969283, + "grad_norm": 0.8332454074359927, + "learning_rate": 0.00015988138277330658, + "loss": 0.9715, + "step": 3794 + }, + { + "epoch": 0.8095136518771331, + "grad_norm": 0.9735990469728443, + "learning_rate": 0.0001598809190873724, + "loss": 0.9756, + "step": 3795 + }, + { + "epoch": 0.8097269624573379, + "grad_norm": 1.0684979847890146, + "learning_rate": 0.00015988045449758426, + "loss": 0.9977, + "step": 3796 + }, + { + "epoch": 0.8099402730375427, + "grad_norm": 0.8874548145256914, + "learning_rate": 0.00015987998900394746, + "loss": 0.9973, + "step": 3797 + }, + { + "epoch": 0.8101535836177475, + "grad_norm": 0.7232677484630596, + "learning_rate": 0.0001598795226064672, + "loss": 1.0045, + "step": 3798 + }, + { + "epoch": 0.8103668941979523, + "grad_norm": 0.562356956966079, + "learning_rate": 0.0001598790553051488, + "loss": 1.0047, + "step": 3799 + }, + { + "epoch": 0.810580204778157, + "grad_norm": 0.4181292672967865, + "learning_rate": 0.00015987858709999757, + "loss": 0.9898, + "step": 3800 + }, + { + "epoch": 0.8107935153583617, + "grad_norm": 0.3371477595691613, + "learning_rate": 0.00015987811799101877, + "loss": 1.0, + "step": 3801 + }, + { + "epoch": 0.8110068259385665, + "grad_norm": 0.2882563220529459, + "learning_rate": 0.00015987764797821774, + "loss": 0.9873, + "step": 3802 + }, + { + "epoch": 0.8112201365187713, + "grad_norm": 0.3439421906928409, + "learning_rate": 0.00015987717706159979, + "loss": 0.9941, + "step": 3803 + }, + { + "epoch": 0.8114334470989761, + "grad_norm": 0.4206551240174069, + "learning_rate": 0.0001598767052411702, + "loss": 1.0268, + "step": 3804 + }, + { + "epoch": 0.8116467576791809, + "grad_norm": 0.3982024126170602, + "learning_rate": 0.00015987623251693438, + "loss": 0.9917, + "step": 3805 + }, + { + "epoch": 0.8118600682593856, + "grad_norm": 0.39574843531691334, + "learning_rate": 0.0001598757588888976, + "loss": 0.9976, + "step": 3806 + }, + { + "epoch": 0.8120733788395904, + "grad_norm": 0.43316775567650395, + "learning_rate": 0.0001598752843570653, + "loss": 0.9528, + "step": 3807 + }, + { + "epoch": 0.8122866894197952, + "grad_norm": 0.4882559273998891, + "learning_rate": 0.0001598748089214428, + "loss": 0.9818, + "step": 3808 + }, + { + "epoch": 0.8125, + "grad_norm": 0.4092741126175385, + "learning_rate": 0.0001598743325820355, + "loss": 0.997, + "step": 3809 + }, + { + "epoch": 0.8127133105802048, + "grad_norm": 0.3965294618768619, + "learning_rate": 0.00015987385533884879, + "loss": 0.9956, + "step": 3810 + }, + { + "epoch": 0.8129266211604096, + "grad_norm": 0.36664224425248326, + "learning_rate": 0.00015987337719188807, + "loss": 0.9686, + "step": 3811 + }, + { + "epoch": 0.8131399317406144, + "grad_norm": 0.36766354421912856, + "learning_rate": 0.0001598728981411587, + "loss": 0.9951, + "step": 3812 + }, + { + "epoch": 0.8133532423208191, + "grad_norm": 0.49498678270365065, + "learning_rate": 0.0001598724181866662, + "loss": 0.9673, + "step": 3813 + }, + { + "epoch": 0.8135665529010239, + "grad_norm": 0.6736729030061996, + "learning_rate": 0.0001598719373284159, + "loss": 0.9709, + "step": 3814 + }, + { + "epoch": 0.8137798634812287, + "grad_norm": 0.8468091640152975, + "learning_rate": 0.0001598714555664133, + "loss": 0.9815, + "step": 3815 + }, + { + "epoch": 0.8139931740614335, + "grad_norm": 0.9670950519415792, + "learning_rate": 0.00015987097290066384, + "loss": 0.9824, + "step": 3816 + }, + { + "epoch": 0.8142064846416383, + "grad_norm": 1.068758710965246, + "learning_rate": 0.000159870489331173, + "loss": 0.982, + "step": 3817 + }, + { + "epoch": 0.814419795221843, + "grad_norm": 0.9471289582634834, + "learning_rate": 0.0001598700048579462, + "loss": 0.9834, + "step": 3818 + }, + { + "epoch": 0.8146331058020477, + "grad_norm": 0.9305090931476947, + "learning_rate": 0.00015986951948098897, + "loss": 1.0139, + "step": 3819 + }, + { + "epoch": 0.8148464163822525, + "grad_norm": 0.9737822828618626, + "learning_rate": 0.00015986903320030674, + "loss": 0.988, + "step": 3820 + }, + { + "epoch": 0.8150597269624573, + "grad_norm": 0.8422718202607374, + "learning_rate": 0.00015986854601590512, + "loss": 1.0067, + "step": 3821 + }, + { + "epoch": 0.8152730375426621, + "grad_norm": 0.5689213821028457, + "learning_rate": 0.00015986805792778954, + "loss": 0.9775, + "step": 3822 + }, + { + "epoch": 0.8154863481228669, + "grad_norm": 0.4249827647359952, + "learning_rate": 0.0001598675689359655, + "loss": 0.9922, + "step": 3823 + }, + { + "epoch": 0.8156996587030717, + "grad_norm": 0.5109905525058063, + "learning_rate": 0.00015986707904043862, + "loss": 0.9868, + "step": 3824 + }, + { + "epoch": 0.8159129692832765, + "grad_norm": 0.7012291365083468, + "learning_rate": 0.00015986658824121438, + "loss": 0.9902, + "step": 3825 + }, + { + "epoch": 0.8161262798634812, + "grad_norm": 0.7850494047043342, + "learning_rate": 0.00015986609653829837, + "loss": 0.9796, + "step": 3826 + }, + { + "epoch": 0.816339590443686, + "grad_norm": 0.6636779516855359, + "learning_rate": 0.0001598656039316961, + "loss": 0.9869, + "step": 3827 + }, + { + "epoch": 0.8165529010238908, + "grad_norm": 0.6009990036277945, + "learning_rate": 0.0001598651104214132, + "loss": 0.9922, + "step": 3828 + }, + { + "epoch": 0.8167662116040956, + "grad_norm": 0.5740263432422537, + "learning_rate": 0.00015986461600745524, + "loss": 0.9856, + "step": 3829 + }, + { + "epoch": 0.8169795221843004, + "grad_norm": 0.4689684604470879, + "learning_rate": 0.00015986412068982777, + "loss": 1.0057, + "step": 3830 + }, + { + "epoch": 0.8171928327645052, + "grad_norm": 0.3572673139904379, + "learning_rate": 0.00015986362446853645, + "loss": 0.991, + "step": 3831 + }, + { + "epoch": 0.8174061433447098, + "grad_norm": 0.4211131415381789, + "learning_rate": 0.0001598631273435869, + "loss": 1.0161, + "step": 3832 + }, + { + "epoch": 0.8176194539249146, + "grad_norm": 0.5254954870540117, + "learning_rate": 0.00015986262931498468, + "loss": 0.9555, + "step": 3833 + }, + { + "epoch": 0.8178327645051194, + "grad_norm": 0.6374647985932828, + "learning_rate": 0.0001598621303827355, + "loss": 1.034, + "step": 3834 + }, + { + "epoch": 0.8180460750853242, + "grad_norm": 0.6593940910513791, + "learning_rate": 0.00015986163054684495, + "loss": 0.9719, + "step": 3835 + }, + { + "epoch": 0.818259385665529, + "grad_norm": 0.5603012927162361, + "learning_rate": 0.00015986112980731872, + "loss": 0.9724, + "step": 3836 + }, + { + "epoch": 0.8184726962457338, + "grad_norm": 0.5184653345894014, + "learning_rate": 0.00015986062816416245, + "loss": 0.9622, + "step": 3837 + }, + { + "epoch": 0.8186860068259386, + "grad_norm": 0.469221525719868, + "learning_rate": 0.00015986012561738184, + "loss": 0.9872, + "step": 3838 + }, + { + "epoch": 0.8188993174061433, + "grad_norm": 0.5100379265514801, + "learning_rate": 0.00015985962216698255, + "loss": 0.9871, + "step": 3839 + }, + { + "epoch": 0.8191126279863481, + "grad_norm": 0.520200063566067, + "learning_rate": 0.0001598591178129703, + "loss": 0.9905, + "step": 3840 + }, + { + "epoch": 0.8193259385665529, + "grad_norm": 0.4340406929098328, + "learning_rate": 0.00015985861255535082, + "loss": 0.9646, + "step": 3841 + }, + { + "epoch": 0.8195392491467577, + "grad_norm": 0.35169519121877624, + "learning_rate": 0.00015985810639412975, + "loss": 1.0143, + "step": 3842 + }, + { + "epoch": 0.8197525597269625, + "grad_norm": 0.46388106711323945, + "learning_rate": 0.0001598575993293129, + "loss": 1.0043, + "step": 3843 + }, + { + "epoch": 0.8199658703071673, + "grad_norm": 0.562045676643684, + "learning_rate": 0.00015985709136090594, + "loss": 0.9762, + "step": 3844 + }, + { + "epoch": 0.820179180887372, + "grad_norm": 0.5753643831620687, + "learning_rate": 0.00015985658248891466, + "loss": 1.0074, + "step": 3845 + }, + { + "epoch": 0.8203924914675768, + "grad_norm": 0.5451453744780673, + "learning_rate": 0.0001598560727133448, + "loss": 0.9859, + "step": 3846 + }, + { + "epoch": 0.8206058020477816, + "grad_norm": 0.6881673945551606, + "learning_rate": 0.00015985556203420214, + "loss": 0.9755, + "step": 3847 + }, + { + "epoch": 0.8208191126279863, + "grad_norm": 0.8133590979815177, + "learning_rate": 0.00015985505045149246, + "loss": 0.98, + "step": 3848 + }, + { + "epoch": 0.8210324232081911, + "grad_norm": 0.805951758015885, + "learning_rate": 0.00015985453796522154, + "loss": 0.9752, + "step": 3849 + }, + { + "epoch": 0.8212457337883959, + "grad_norm": 0.7085342973742398, + "learning_rate": 0.00015985402457539518, + "loss": 0.9499, + "step": 3850 + }, + { + "epoch": 0.8214590443686007, + "grad_norm": 0.6186788229408133, + "learning_rate": 0.00015985351028201917, + "loss": 0.9838, + "step": 3851 + }, + { + "epoch": 0.8216723549488054, + "grad_norm": 0.503868960802325, + "learning_rate": 0.00015985299508509938, + "loss": 0.996, + "step": 3852 + }, + { + "epoch": 0.8218856655290102, + "grad_norm": 0.35683850538979495, + "learning_rate": 0.00015985247898464157, + "loss": 1.0063, + "step": 3853 + }, + { + "epoch": 0.822098976109215, + "grad_norm": 0.317095442066069, + "learning_rate": 0.00015985196198065163, + "loss": 0.9821, + "step": 3854 + }, + { + "epoch": 0.8223122866894198, + "grad_norm": 0.4390542997198238, + "learning_rate": 0.0001598514440731354, + "loss": 0.9841, + "step": 3855 + }, + { + "epoch": 0.8225255972696246, + "grad_norm": 0.4989226699612617, + "learning_rate": 0.00015985092526209875, + "loss": 0.9972, + "step": 3856 + }, + { + "epoch": 0.8227389078498294, + "grad_norm": 0.5491479055759603, + "learning_rate": 0.0001598504055475475, + "loss": 0.9694, + "step": 3857 + }, + { + "epoch": 0.8229522184300341, + "grad_norm": 0.6110730732211513, + "learning_rate": 0.00015984988492948762, + "loss": 0.9816, + "step": 3858 + }, + { + "epoch": 0.8231655290102389, + "grad_norm": 0.7008669615445173, + "learning_rate": 0.00015984936340792492, + "loss": 0.9974, + "step": 3859 + }, + { + "epoch": 0.8233788395904437, + "grad_norm": 0.729668583641213, + "learning_rate": 0.00015984884098286533, + "loss": 0.9739, + "step": 3860 + }, + { + "epoch": 0.8235921501706485, + "grad_norm": 0.7276795762630119, + "learning_rate": 0.00015984831765431478, + "loss": 0.9912, + "step": 3861 + }, + { + "epoch": 0.8238054607508533, + "grad_norm": 0.7066310168062486, + "learning_rate": 0.00015984779342227915, + "loss": 0.9608, + "step": 3862 + }, + { + "epoch": 0.824018771331058, + "grad_norm": 0.6326244626310615, + "learning_rate": 0.0001598472682867644, + "loss": 0.9923, + "step": 3863 + }, + { + "epoch": 0.8242320819112628, + "grad_norm": 0.5237479456630018, + "learning_rate": 0.00015984674224777648, + "loss": 0.9557, + "step": 3864 + }, + { + "epoch": 0.8244453924914675, + "grad_norm": 0.457722634854263, + "learning_rate": 0.00015984621530532132, + "loss": 1.0029, + "step": 3865 + }, + { + "epoch": 0.8246587030716723, + "grad_norm": 0.3755204785329466, + "learning_rate": 0.00015984568745940487, + "loss": 0.9839, + "step": 3866 + }, + { + "epoch": 0.8248720136518771, + "grad_norm": 0.3822071299608312, + "learning_rate": 0.00015984515871003316, + "loss": 0.9911, + "step": 3867 + }, + { + "epoch": 0.8250853242320819, + "grad_norm": 0.48611183752115583, + "learning_rate": 0.0001598446290572121, + "loss": 1.0162, + "step": 3868 + }, + { + "epoch": 0.8252986348122867, + "grad_norm": 0.5091381519787879, + "learning_rate": 0.0001598440985009477, + "loss": 0.9905, + "step": 3869 + }, + { + "epoch": 0.8255119453924915, + "grad_norm": 0.5101545193106001, + "learning_rate": 0.00015984356704124605, + "loss": 0.9732, + "step": 3870 + }, + { + "epoch": 0.8257252559726962, + "grad_norm": 0.5488837836103684, + "learning_rate": 0.00015984303467811306, + "loss": 1.0052, + "step": 3871 + }, + { + "epoch": 0.825938566552901, + "grad_norm": 0.7016752987163305, + "learning_rate": 0.0001598425014115548, + "loss": 0.9798, + "step": 3872 + }, + { + "epoch": 0.8261518771331058, + "grad_norm": 0.7234276053369708, + "learning_rate": 0.00015984196724157728, + "loss": 0.9886, + "step": 3873 + }, + { + "epoch": 0.8263651877133106, + "grad_norm": 0.646069106546168, + "learning_rate": 0.00015984143216818658, + "loss": 0.9797, + "step": 3874 + }, + { + "epoch": 0.8265784982935154, + "grad_norm": 0.48227287544634023, + "learning_rate": 0.00015984089619138872, + "loss": 0.9631, + "step": 3875 + }, + { + "epoch": 0.8267918088737202, + "grad_norm": 0.3241955482539958, + "learning_rate": 0.00015984035931118976, + "loss": 0.9699, + "step": 3876 + }, + { + "epoch": 0.827005119453925, + "grad_norm": 0.31690232627744186, + "learning_rate": 0.00015983982152759581, + "loss": 1.0023, + "step": 3877 + }, + { + "epoch": 0.8272184300341296, + "grad_norm": 0.37544465288194323, + "learning_rate": 0.00015983928284061294, + "loss": 0.9967, + "step": 3878 + }, + { + "epoch": 0.8274317406143344, + "grad_norm": 0.3435869445799187, + "learning_rate": 0.00015983874325024722, + "loss": 0.9697, + "step": 3879 + }, + { + "epoch": 0.8276450511945392, + "grad_norm": 0.40227661693224165, + "learning_rate": 0.0001598382027565048, + "loss": 1.0026, + "step": 3880 + }, + { + "epoch": 0.827858361774744, + "grad_norm": 0.46512023722901275, + "learning_rate": 0.00015983766135939175, + "loss": 0.9817, + "step": 3881 + }, + { + "epoch": 0.8280716723549488, + "grad_norm": 0.5267564665967838, + "learning_rate": 0.00015983711905891422, + "loss": 0.9907, + "step": 3882 + }, + { + "epoch": 0.8282849829351536, + "grad_norm": 0.5018715452829876, + "learning_rate": 0.0001598365758550784, + "loss": 0.9814, + "step": 3883 + }, + { + "epoch": 0.8284982935153583, + "grad_norm": 0.40648973769414454, + "learning_rate": 0.00015983603174789032, + "loss": 0.9992, + "step": 3884 + }, + { + "epoch": 0.8287116040955631, + "grad_norm": 0.4018027283092789, + "learning_rate": 0.0001598354867373562, + "loss": 1.0025, + "step": 3885 + }, + { + "epoch": 0.8289249146757679, + "grad_norm": 0.31640299163052654, + "learning_rate": 0.00015983494082348225, + "loss": 1.0047, + "step": 3886 + }, + { + "epoch": 0.8291382252559727, + "grad_norm": 0.33644107901493, + "learning_rate": 0.00015983439400627455, + "loss": 0.9751, + "step": 3887 + }, + { + "epoch": 0.8293515358361775, + "grad_norm": 0.44231029020725016, + "learning_rate": 0.0001598338462857394, + "loss": 1.0101, + "step": 3888 + }, + { + "epoch": 0.8295648464163823, + "grad_norm": 0.438772001566569, + "learning_rate": 0.00015983329766188287, + "loss": 0.9533, + "step": 3889 + }, + { + "epoch": 0.8297781569965871, + "grad_norm": 0.4783434171860556, + "learning_rate": 0.00015983274813471127, + "loss": 0.9772, + "step": 3890 + }, + { + "epoch": 0.8299914675767918, + "grad_norm": 0.604472582836157, + "learning_rate": 0.00015983219770423078, + "loss": 0.991, + "step": 3891 + }, + { + "epoch": 0.8302047781569966, + "grad_norm": 0.6864930004709366, + "learning_rate": 0.0001598316463704476, + "loss": 1.0086, + "step": 3892 + }, + { + "epoch": 0.8304180887372014, + "grad_norm": 0.7460532620323687, + "learning_rate": 0.00015983109413336802, + "loss": 0.9659, + "step": 3893 + }, + { + "epoch": 0.8306313993174061, + "grad_norm": 0.8822969700850659, + "learning_rate": 0.0001598305409929983, + "loss": 0.959, + "step": 3894 + }, + { + "epoch": 0.830844709897611, + "grad_norm": 1.0571297756733753, + "learning_rate": 0.00015982998694934463, + "loss": 1.0142, + "step": 3895 + }, + { + "epoch": 0.8310580204778157, + "grad_norm": 1.0069684020674898, + "learning_rate": 0.0001598294320024133, + "loss": 0.993, + "step": 3896 + }, + { + "epoch": 0.8312713310580204, + "grad_norm": 1.028107388115007, + "learning_rate": 0.00015982887615221064, + "loss": 0.9723, + "step": 3897 + }, + { + "epoch": 0.8314846416382252, + "grad_norm": 0.9237208102612797, + "learning_rate": 0.0001598283193987429, + "loss": 0.9716, + "step": 3898 + }, + { + "epoch": 0.83169795221843, + "grad_norm": 0.7407503831999336, + "learning_rate": 0.00015982776174201636, + "loss": 1.0007, + "step": 3899 + }, + { + "epoch": 0.8319112627986348, + "grad_norm": 0.5176706885892838, + "learning_rate": 0.00015982720318203738, + "loss": 0.9889, + "step": 3900 + }, + { + "epoch": 0.8321245733788396, + "grad_norm": 0.41688216736463163, + "learning_rate": 0.00015982664371881223, + "loss": 0.9882, + "step": 3901 + }, + { + "epoch": 0.8323378839590444, + "grad_norm": 0.4666851412102616, + "learning_rate": 0.00015982608335234728, + "loss": 1.0081, + "step": 3902 + }, + { + "epoch": 0.8325511945392492, + "grad_norm": 0.4324456816036896, + "learning_rate": 0.0001598255220826488, + "loss": 0.9808, + "step": 3903 + }, + { + "epoch": 0.8327645051194539, + "grad_norm": 0.4247549393247873, + "learning_rate": 0.00015982495990972328, + "loss": 1.0194, + "step": 3904 + }, + { + "epoch": 0.8329778156996587, + "grad_norm": 0.5150255310898313, + "learning_rate": 0.00015982439683357694, + "loss": 0.9839, + "step": 3905 + }, + { + "epoch": 0.8331911262798635, + "grad_norm": 0.5322322296540305, + "learning_rate": 0.00015982383285421625, + "loss": 0.9951, + "step": 3906 + }, + { + "epoch": 0.8334044368600683, + "grad_norm": 0.5813770887856266, + "learning_rate": 0.0001598232679716475, + "loss": 1.0056, + "step": 3907 + }, + { + "epoch": 0.8336177474402731, + "grad_norm": 0.5409817012227325, + "learning_rate": 0.00015982270218587715, + "loss": 0.988, + "step": 3908 + }, + { + "epoch": 0.8338310580204779, + "grad_norm": 0.4451351316626306, + "learning_rate": 0.0001598221354969116, + "loss": 0.9832, + "step": 3909 + }, + { + "epoch": 0.8340443686006825, + "grad_norm": 0.40633190627427945, + "learning_rate": 0.00015982156790475722, + "loss": 0.9885, + "step": 3910 + }, + { + "epoch": 0.8342576791808873, + "grad_norm": 0.3896304517586001, + "learning_rate": 0.00015982099940942048, + "loss": 0.9904, + "step": 3911 + }, + { + "epoch": 0.8344709897610921, + "grad_norm": 0.4108370709184398, + "learning_rate": 0.00015982043001090777, + "loss": 0.9898, + "step": 3912 + }, + { + "epoch": 0.8346843003412969, + "grad_norm": 0.4786090746767538, + "learning_rate": 0.00015981985970922556, + "loss": 1.0086, + "step": 3913 + }, + { + "epoch": 0.8348976109215017, + "grad_norm": 0.5546688974822878, + "learning_rate": 0.00015981928850438028, + "loss": 0.9825, + "step": 3914 + }, + { + "epoch": 0.8351109215017065, + "grad_norm": 0.6026897607129376, + "learning_rate": 0.00015981871639637844, + "loss": 0.9759, + "step": 3915 + }, + { + "epoch": 0.8353242320819113, + "grad_norm": 0.5577761610952847, + "learning_rate": 0.00015981814338522643, + "loss": 0.9814, + "step": 3916 + }, + { + "epoch": 0.835537542662116, + "grad_norm": 0.5143017826951195, + "learning_rate": 0.00015981756947093084, + "loss": 0.9686, + "step": 3917 + }, + { + "epoch": 0.8357508532423208, + "grad_norm": 0.6014405136273706, + "learning_rate": 0.0001598169946534981, + "loss": 1.0046, + "step": 3918 + }, + { + "epoch": 0.8359641638225256, + "grad_norm": 0.6797018563312681, + "learning_rate": 0.0001598164189329347, + "loss": 1.0314, + "step": 3919 + }, + { + "epoch": 0.8361774744027304, + "grad_norm": 0.7416628073004187, + "learning_rate": 0.0001598158423092472, + "loss": 1.0183, + "step": 3920 + }, + { + "epoch": 0.8363907849829352, + "grad_norm": 0.6137671268326629, + "learning_rate": 0.00015981526478244207, + "loss": 0.9792, + "step": 3921 + }, + { + "epoch": 0.83660409556314, + "grad_norm": 0.6811683642521837, + "learning_rate": 0.0001598146863525259, + "loss": 1.0202, + "step": 3922 + }, + { + "epoch": 0.8368174061433447, + "grad_norm": 0.6852915056677145, + "learning_rate": 0.0001598141070195052, + "loss": 0.9933, + "step": 3923 + }, + { + "epoch": 0.8370307167235495, + "grad_norm": 0.7236463100341023, + "learning_rate": 0.00015981352678338655, + "loss": 0.9959, + "step": 3924 + }, + { + "epoch": 0.8372440273037542, + "grad_norm": 0.8518331381132551, + "learning_rate": 0.00015981294564417647, + "loss": 0.9927, + "step": 3925 + }, + { + "epoch": 0.837457337883959, + "grad_norm": 0.9805212884424503, + "learning_rate": 0.0001598123636018816, + "loss": 0.978, + "step": 3926 + }, + { + "epoch": 0.8376706484641638, + "grad_norm": 1.1441529161442592, + "learning_rate": 0.0001598117806565085, + "loss": 0.9504, + "step": 3927 + }, + { + "epoch": 0.8378839590443686, + "grad_norm": 0.7329128985451169, + "learning_rate": 0.00015981119680806373, + "loss": 0.9889, + "step": 3928 + }, + { + "epoch": 0.8380972696245734, + "grad_norm": 0.4091388692838379, + "learning_rate": 0.00015981061205655396, + "loss": 0.9898, + "step": 3929 + }, + { + "epoch": 0.8383105802047781, + "grad_norm": 0.48771757898418183, + "learning_rate": 0.00015981002640198572, + "loss": 0.9951, + "step": 3930 + }, + { + "epoch": 0.8385238907849829, + "grad_norm": 0.6775907167746148, + "learning_rate": 0.00015980943984436572, + "loss": 1.0, + "step": 3931 + }, + { + "epoch": 0.8387372013651877, + "grad_norm": 0.9461693368863143, + "learning_rate": 0.00015980885238370057, + "loss": 1.0079, + "step": 3932 + }, + { + "epoch": 0.8389505119453925, + "grad_norm": 1.0489973432296853, + "learning_rate": 0.0001598082640199969, + "loss": 1.0026, + "step": 3933 + }, + { + "epoch": 0.8391638225255973, + "grad_norm": 0.8231462247372743, + "learning_rate": 0.0001598076747532614, + "loss": 0.9726, + "step": 3934 + }, + { + "epoch": 0.8393771331058021, + "grad_norm": 0.8572394340180224, + "learning_rate": 0.0001598070845835007, + "loss": 0.9651, + "step": 3935 + }, + { + "epoch": 0.8395904436860068, + "grad_norm": 0.7830254679951605, + "learning_rate": 0.00015980649351072151, + "loss": 0.9682, + "step": 3936 + }, + { + "epoch": 0.8398037542662116, + "grad_norm": 0.5056497548766185, + "learning_rate": 0.0001598059015349305, + "loss": 0.9756, + "step": 3937 + }, + { + "epoch": 0.8400170648464164, + "grad_norm": 0.32746745119416115, + "learning_rate": 0.00015980530865613438, + "loss": 0.9776, + "step": 3938 + }, + { + "epoch": 0.8402303754266212, + "grad_norm": 0.3144120565114504, + "learning_rate": 0.0001598047148743398, + "loss": 0.9598, + "step": 3939 + }, + { + "epoch": 0.840443686006826, + "grad_norm": 0.4278442352601117, + "learning_rate": 0.0001598041201895536, + "loss": 0.9772, + "step": 3940 + }, + { + "epoch": 0.8406569965870307, + "grad_norm": 0.5229587688994386, + "learning_rate": 0.0001598035246017824, + "loss": 0.9595, + "step": 3941 + }, + { + "epoch": 0.8408703071672355, + "grad_norm": 0.526814636858304, + "learning_rate": 0.00015980292811103297, + "loss": 0.9806, + "step": 3942 + }, + { + "epoch": 0.8410836177474402, + "grad_norm": 0.4513246121095353, + "learning_rate": 0.00015980233071731208, + "loss": 1.0069, + "step": 3943 + }, + { + "epoch": 0.841296928327645, + "grad_norm": 0.4913133389192613, + "learning_rate": 0.0001598017324206265, + "loss": 0.989, + "step": 3944 + }, + { + "epoch": 0.8415102389078498, + "grad_norm": 0.47128809298575264, + "learning_rate": 0.0001598011332209829, + "loss": 0.9925, + "step": 3945 + }, + { + "epoch": 0.8417235494880546, + "grad_norm": 0.3753486328526187, + "learning_rate": 0.0001598005331183882, + "loss": 0.9771, + "step": 3946 + }, + { + "epoch": 0.8419368600682594, + "grad_norm": 0.5168029170956271, + "learning_rate": 0.0001597999321128491, + "loss": 1.0114, + "step": 3947 + }, + { + "epoch": 0.8421501706484642, + "grad_norm": 0.608908160072427, + "learning_rate": 0.00015979933020437246, + "loss": 0.999, + "step": 3948 + }, + { + "epoch": 0.8423634812286689, + "grad_norm": 0.5470283127985014, + "learning_rate": 0.00015979872739296504, + "loss": 1.015, + "step": 3949 + }, + { + "epoch": 0.8425767918088737, + "grad_norm": 0.4406584707562035, + "learning_rate": 0.00015979812367863366, + "loss": 0.9267, + "step": 3950 + }, + { + "epoch": 0.8427901023890785, + "grad_norm": 0.36295226275312636, + "learning_rate": 0.0001597975190613852, + "loss": 0.9864, + "step": 3951 + }, + { + "epoch": 0.8430034129692833, + "grad_norm": 0.3188442120008751, + "learning_rate": 0.00015979691354122642, + "loss": 0.9737, + "step": 3952 + }, + { + "epoch": 0.8432167235494881, + "grad_norm": 0.40306848386156896, + "learning_rate": 0.00015979630711816423, + "loss": 0.965, + "step": 3953 + }, + { + "epoch": 0.8434300341296929, + "grad_norm": 0.35702719966887586, + "learning_rate": 0.00015979569979220553, + "loss": 0.9718, + "step": 3954 + }, + { + "epoch": 0.8436433447098977, + "grad_norm": 0.3666542556783375, + "learning_rate": 0.0001597950915633571, + "loss": 0.9842, + "step": 3955 + }, + { + "epoch": 0.8438566552901023, + "grad_norm": 0.41871373833304554, + "learning_rate": 0.0001597944824316259, + "loss": 0.9747, + "step": 3956 + }, + { + "epoch": 0.8440699658703071, + "grad_norm": 0.544749998022858, + "learning_rate": 0.00015979387239701875, + "loss": 1.0173, + "step": 3957 + }, + { + "epoch": 0.8442832764505119, + "grad_norm": 0.7361456726943945, + "learning_rate": 0.0001597932614595426, + "loss": 0.9973, + "step": 3958 + }, + { + "epoch": 0.8444965870307167, + "grad_norm": 0.9231065876664011, + "learning_rate": 0.0001597926496192044, + "loss": 1.0148, + "step": 3959 + }, + { + "epoch": 0.8447098976109215, + "grad_norm": 1.1878886057804583, + "learning_rate": 0.000159792036876011, + "loss": 0.9764, + "step": 3960 + }, + { + "epoch": 0.8449232081911263, + "grad_norm": 0.8015316847259255, + "learning_rate": 0.00015979142322996936, + "loss": 0.997, + "step": 3961 + }, + { + "epoch": 0.8451365187713311, + "grad_norm": 0.6307454888699493, + "learning_rate": 0.0001597908086810864, + "loss": 0.9923, + "step": 3962 + }, + { + "epoch": 0.8453498293515358, + "grad_norm": 0.43985405681009276, + "learning_rate": 0.00015979019322936912, + "loss": 0.994, + "step": 3963 + }, + { + "epoch": 0.8455631399317406, + "grad_norm": 0.44742955338178675, + "learning_rate": 0.00015978957687482446, + "loss": 0.9986, + "step": 3964 + }, + { + "epoch": 0.8457764505119454, + "grad_norm": 0.7482303249940675, + "learning_rate": 0.0001597889596174594, + "loss": 1.0228, + "step": 3965 + }, + { + "epoch": 0.8459897610921502, + "grad_norm": 1.0211794038280146, + "learning_rate": 0.00015978834145728092, + "loss": 0.9955, + "step": 3966 + }, + { + "epoch": 0.846203071672355, + "grad_norm": 1.1393283115138468, + "learning_rate": 0.00015978772239429601, + "loss": 0.9661, + "step": 3967 + }, + { + "epoch": 0.8464163822525598, + "grad_norm": 0.7529339212523721, + "learning_rate": 0.00015978710242851166, + "loss": 0.9838, + "step": 3968 + }, + { + "epoch": 0.8466296928327645, + "grad_norm": 0.46968075979906154, + "learning_rate": 0.00015978648155993492, + "loss": 1.0138, + "step": 3969 + }, + { + "epoch": 0.8468430034129693, + "grad_norm": 0.4364067854314814, + "learning_rate": 0.0001597858597885728, + "loss": 0.9661, + "step": 3970 + }, + { + "epoch": 0.847056313993174, + "grad_norm": 0.531122228371334, + "learning_rate": 0.00015978523711443235, + "loss": 0.9918, + "step": 3971 + }, + { + "epoch": 0.8472696245733788, + "grad_norm": 0.6402083814907782, + "learning_rate": 0.00015978461353752057, + "loss": 0.9874, + "step": 3972 + }, + { + "epoch": 0.8474829351535836, + "grad_norm": 0.7331631404798071, + "learning_rate": 0.00015978398905784456, + "loss": 0.9988, + "step": 3973 + }, + { + "epoch": 0.8476962457337884, + "grad_norm": 0.9062960752017253, + "learning_rate": 0.00015978336367541138, + "loss": 0.9568, + "step": 3974 + }, + { + "epoch": 0.8479095563139932, + "grad_norm": 1.0882201342584061, + "learning_rate": 0.0001597827373902281, + "loss": 0.9867, + "step": 3975 + }, + { + "epoch": 0.8481228668941979, + "grad_norm": 0.8175150692828472, + "learning_rate": 0.0001597821102023018, + "loss": 0.9742, + "step": 3976 + }, + { + "epoch": 0.8483361774744027, + "grad_norm": 0.5294491410834494, + "learning_rate": 0.00015978148211163957, + "loss": 0.987, + "step": 3977 + }, + { + "epoch": 0.8485494880546075, + "grad_norm": 0.42445267707999523, + "learning_rate": 0.00015978085311824854, + "loss": 0.9655, + "step": 3978 + }, + { + "epoch": 0.8487627986348123, + "grad_norm": 0.3950311239464752, + "learning_rate": 0.0001597802232221358, + "loss": 0.972, + "step": 3979 + }, + { + "epoch": 0.8489761092150171, + "grad_norm": 0.48732223939242564, + "learning_rate": 0.0001597795924233085, + "loss": 0.9738, + "step": 3980 + }, + { + "epoch": 0.8491894197952219, + "grad_norm": 0.5892492631991603, + "learning_rate": 0.00015977896072177377, + "loss": 0.9447, + "step": 3981 + }, + { + "epoch": 0.8494027303754266, + "grad_norm": 0.7197119621116864, + "learning_rate": 0.00015977832811753874, + "loss": 0.9777, + "step": 3982 + }, + { + "epoch": 0.8496160409556314, + "grad_norm": 0.7688171284222703, + "learning_rate": 0.00015977769461061062, + "loss": 0.9825, + "step": 3983 + }, + { + "epoch": 0.8498293515358362, + "grad_norm": 0.7271272788503282, + "learning_rate": 0.00015977706020099653, + "loss": 0.9814, + "step": 3984 + }, + { + "epoch": 0.850042662116041, + "grad_norm": 0.6141299178607713, + "learning_rate": 0.00015977642488870368, + "loss": 0.9948, + "step": 3985 + }, + { + "epoch": 0.8502559726962458, + "grad_norm": 0.48198181278872637, + "learning_rate": 0.0001597757886737392, + "loss": 0.9852, + "step": 3986 + }, + { + "epoch": 0.8504692832764505, + "grad_norm": 0.4079256185758377, + "learning_rate": 0.00015977515155611035, + "loss": 0.9634, + "step": 3987 + }, + { + "epoch": 0.8506825938566553, + "grad_norm": 0.4613477757925894, + "learning_rate": 0.00015977451353582433, + "loss": 0.9849, + "step": 3988 + }, + { + "epoch": 0.85089590443686, + "grad_norm": 0.4434158987349276, + "learning_rate": 0.00015977387461288831, + "loss": 0.9549, + "step": 3989 + }, + { + "epoch": 0.8511092150170648, + "grad_norm": 0.37093356453795645, + "learning_rate": 0.0001597732347873096, + "loss": 0.9697, + "step": 3990 + }, + { + "epoch": 0.8513225255972696, + "grad_norm": 0.4309807920942274, + "learning_rate": 0.00015977259405909537, + "loss": 0.9783, + "step": 3991 + }, + { + "epoch": 0.8515358361774744, + "grad_norm": 0.47329203514815854, + "learning_rate": 0.00015977195242825295, + "loss": 0.9531, + "step": 3992 + }, + { + "epoch": 0.8517491467576792, + "grad_norm": 0.41905392661267915, + "learning_rate": 0.00015977130989478948, + "loss": 0.9668, + "step": 3993 + }, + { + "epoch": 0.851962457337884, + "grad_norm": 0.37489145110890676, + "learning_rate": 0.0001597706664587123, + "loss": 0.9833, + "step": 3994 + }, + { + "epoch": 0.8521757679180887, + "grad_norm": 0.42490030995022054, + "learning_rate": 0.00015977002212002873, + "loss": 0.9566, + "step": 3995 + }, + { + "epoch": 0.8523890784982935, + "grad_norm": 0.47311059412763135, + "learning_rate": 0.000159769376878746, + "loss": 0.9666, + "step": 3996 + }, + { + "epoch": 0.8526023890784983, + "grad_norm": 0.5729151140533477, + "learning_rate": 0.0001597687307348714, + "loss": 0.9975, + "step": 3997 + }, + { + "epoch": 0.8528156996587031, + "grad_norm": 0.6175622178508576, + "learning_rate": 0.0001597680836884123, + "loss": 0.9763, + "step": 3998 + }, + { + "epoch": 0.8530290102389079, + "grad_norm": 0.5331704473323116, + "learning_rate": 0.00015976743573937598, + "loss": 0.9725, + "step": 3999 + }, + { + "epoch": 0.8532423208191127, + "grad_norm": 0.4259597995449173, + "learning_rate": 0.0001597667868877698, + "loss": 0.9872, + "step": 4000 + }, + { + "epoch": 0.8534556313993175, + "grad_norm": 0.4895059654125853, + "learning_rate": 0.00015976613713360107, + "loss": 0.9602, + "step": 4001 + }, + { + "epoch": 0.8536689419795221, + "grad_norm": 0.4869599731073017, + "learning_rate": 0.00015976548647687715, + "loss": 0.9709, + "step": 4002 + }, + { + "epoch": 0.8538822525597269, + "grad_norm": 0.38150788198784297, + "learning_rate": 0.0001597648349176054, + "loss": 0.9831, + "step": 4003 + }, + { + "epoch": 0.8540955631399317, + "grad_norm": 0.3876930626113617, + "learning_rate": 0.00015976418245579324, + "loss": 0.9891, + "step": 4004 + }, + { + "epoch": 0.8543088737201365, + "grad_norm": 0.38457650359284967, + "learning_rate": 0.00015976352909144798, + "loss": 0.9581, + "step": 4005 + }, + { + "epoch": 0.8545221843003413, + "grad_norm": 0.41293444878172536, + "learning_rate": 0.00015976287482457707, + "loss": 0.9652, + "step": 4006 + }, + { + "epoch": 0.8547354948805461, + "grad_norm": 0.3427714884845729, + "learning_rate": 0.0001597622196551879, + "loss": 0.9821, + "step": 4007 + }, + { + "epoch": 0.8549488054607508, + "grad_norm": 0.32474633920496665, + "learning_rate": 0.00015976156358328786, + "loss": 0.9727, + "step": 4008 + }, + { + "epoch": 0.8551621160409556, + "grad_norm": 0.40915159845785576, + "learning_rate": 0.00015976090660888435, + "loss": 0.9895, + "step": 4009 + }, + { + "epoch": 0.8553754266211604, + "grad_norm": 0.4164276198889476, + "learning_rate": 0.00015976024873198488, + "loss": 0.9979, + "step": 4010 + }, + { + "epoch": 0.8555887372013652, + "grad_norm": 0.4721354356620702, + "learning_rate": 0.00015975958995259682, + "loss": 0.9799, + "step": 4011 + }, + { + "epoch": 0.85580204778157, + "grad_norm": 0.4621755990325098, + "learning_rate": 0.0001597589302707277, + "loss": 0.9587, + "step": 4012 + }, + { + "epoch": 0.8560153583617748, + "grad_norm": 0.49651795096618784, + "learning_rate": 0.0001597582696863849, + "loss": 0.9613, + "step": 4013 + }, + { + "epoch": 0.8562286689419796, + "grad_norm": 0.5867932826055011, + "learning_rate": 0.00015975760819957597, + "loss": 0.9866, + "step": 4014 + }, + { + "epoch": 0.8564419795221843, + "grad_norm": 0.6727203000483272, + "learning_rate": 0.00015975694581030835, + "loss": 1.0186, + "step": 4015 + }, + { + "epoch": 0.856655290102389, + "grad_norm": 0.7773354632570608, + "learning_rate": 0.00015975628251858954, + "loss": 1.0028, + "step": 4016 + }, + { + "epoch": 0.8568686006825939, + "grad_norm": 0.9286494575111125, + "learning_rate": 0.0001597556183244271, + "loss": 0.9936, + "step": 4017 + }, + { + "epoch": 0.8570819112627986, + "grad_norm": 1.2200372442513634, + "learning_rate": 0.00015975495322782842, + "loss": 1.0175, + "step": 4018 + }, + { + "epoch": 0.8572952218430034, + "grad_norm": 0.7640210947339849, + "learning_rate": 0.00015975428722880114, + "loss": 1.0222, + "step": 4019 + }, + { + "epoch": 0.8575085324232082, + "grad_norm": 0.42010348081382864, + "learning_rate": 0.00015975362032735277, + "loss": 0.9937, + "step": 4020 + }, + { + "epoch": 0.8577218430034129, + "grad_norm": 0.4681969904974012, + "learning_rate": 0.0001597529525234908, + "loss": 0.9606, + "step": 4021 + }, + { + "epoch": 0.8579351535836177, + "grad_norm": 0.7758259632030988, + "learning_rate": 0.0001597522838172229, + "loss": 0.9708, + "step": 4022 + }, + { + "epoch": 0.8581484641638225, + "grad_norm": 0.9542601863573542, + "learning_rate": 0.00015975161420855654, + "loss": 1.0002, + "step": 4023 + }, + { + "epoch": 0.8583617747440273, + "grad_norm": 1.1147906399722547, + "learning_rate": 0.0001597509436974993, + "loss": 0.9753, + "step": 4024 + }, + { + "epoch": 0.8585750853242321, + "grad_norm": 0.8342153078506813, + "learning_rate": 0.0001597502722840588, + "loss": 0.9972, + "step": 4025 + }, + { + "epoch": 0.8587883959044369, + "grad_norm": 0.5734192619531889, + "learning_rate": 0.00015974959996824267, + "loss": 1.0144, + "step": 4026 + }, + { + "epoch": 0.8590017064846417, + "grad_norm": 0.31390757096498534, + "learning_rate": 0.00015974892675005842, + "loss": 0.9622, + "step": 4027 + }, + { + "epoch": 0.8592150170648464, + "grad_norm": 0.3764653389427353, + "learning_rate": 0.00015974825262951372, + "loss": 1.0084, + "step": 4028 + }, + { + "epoch": 0.8594283276450512, + "grad_norm": 0.6603303297207533, + "learning_rate": 0.00015974757760661622, + "loss": 0.997, + "step": 4029 + }, + { + "epoch": 0.859641638225256, + "grad_norm": 0.8814476453123703, + "learning_rate": 0.00015974690168137355, + "loss": 0.9889, + "step": 4030 + }, + { + "epoch": 0.8598549488054608, + "grad_norm": 0.8921600288026728, + "learning_rate": 0.00015974622485379332, + "loss": 1.0064, + "step": 4031 + }, + { + "epoch": 0.8600682593856656, + "grad_norm": 0.807900924525459, + "learning_rate": 0.00015974554712388324, + "loss": 0.9907, + "step": 4032 + }, + { + "epoch": 0.8602815699658704, + "grad_norm": 0.7580616331104815, + "learning_rate": 0.00015974486849165093, + "loss": 0.9826, + "step": 4033 + }, + { + "epoch": 0.860494880546075, + "grad_norm": 0.6830796522139524, + "learning_rate": 0.0001597441889571041, + "loss": 0.9819, + "step": 4034 + }, + { + "epoch": 0.8607081911262798, + "grad_norm": 0.5070693480844818, + "learning_rate": 0.00015974350852025046, + "loss": 0.9821, + "step": 4035 + }, + { + "epoch": 0.8609215017064846, + "grad_norm": 0.3881619520662482, + "learning_rate": 0.00015974282718109764, + "loss": 0.9684, + "step": 4036 + }, + { + "epoch": 0.8611348122866894, + "grad_norm": 0.4272694986125148, + "learning_rate": 0.00015974214493965342, + "loss": 1.0089, + "step": 4037 + }, + { + "epoch": 0.8613481228668942, + "grad_norm": 0.5357563218536512, + "learning_rate": 0.00015974146179592545, + "loss": 0.993, + "step": 4038 + }, + { + "epoch": 0.861561433447099, + "grad_norm": 0.6309802298174519, + "learning_rate": 0.00015974077774992153, + "loss": 1.0112, + "step": 4039 + }, + { + "epoch": 0.8617747440273038, + "grad_norm": 0.6980417344697993, + "learning_rate": 0.00015974009280164938, + "loss": 0.9925, + "step": 4040 + }, + { + "epoch": 0.8619880546075085, + "grad_norm": 0.7494944238491248, + "learning_rate": 0.0001597394069511167, + "loss": 1.0113, + "step": 4041 + }, + { + "epoch": 0.8622013651877133, + "grad_norm": 0.6901717887085188, + "learning_rate": 0.0001597387201983313, + "loss": 0.9865, + "step": 4042 + }, + { + "epoch": 0.8624146757679181, + "grad_norm": 0.637682352056928, + "learning_rate": 0.00015973803254330093, + "loss": 0.9888, + "step": 4043 + }, + { + "epoch": 0.8626279863481229, + "grad_norm": 0.6140789256719873, + "learning_rate": 0.00015973734398603342, + "loss": 0.9948, + "step": 4044 + }, + { + "epoch": 0.8628412969283277, + "grad_norm": 0.5667722543978789, + "learning_rate": 0.0001597366545265365, + "loss": 0.9788, + "step": 4045 + }, + { + "epoch": 0.8630546075085325, + "grad_norm": 0.45466426237419316, + "learning_rate": 0.000159735964164818, + "loss": 1.0067, + "step": 4046 + }, + { + "epoch": 0.8632679180887372, + "grad_norm": 0.4021416408171792, + "learning_rate": 0.00015973527290088571, + "loss": 0.9827, + "step": 4047 + }, + { + "epoch": 0.863481228668942, + "grad_norm": 0.4190960343458088, + "learning_rate": 0.00015973458073474745, + "loss": 0.9653, + "step": 4048 + }, + { + "epoch": 0.8636945392491467, + "grad_norm": 0.5242591592300084, + "learning_rate": 0.00015973388766641112, + "loss": 0.9697, + "step": 4049 + }, + { + "epoch": 0.8639078498293515, + "grad_norm": 0.6842305946096232, + "learning_rate": 0.00015973319369588446, + "loss": 0.9836, + "step": 4050 + }, + { + "epoch": 0.8641211604095563, + "grad_norm": 0.8045762440985249, + "learning_rate": 0.00015973249882317541, + "loss": 0.993, + "step": 4051 + }, + { + "epoch": 0.8643344709897611, + "grad_norm": 0.7164308012348609, + "learning_rate": 0.00015973180304829177, + "loss": 0.992, + "step": 4052 + }, + { + "epoch": 0.8645477815699659, + "grad_norm": 0.5512409337138827, + "learning_rate": 0.00015973110637124144, + "loss": 0.975, + "step": 4053 + }, + { + "epoch": 0.8647610921501706, + "grad_norm": 0.6583056573488314, + "learning_rate": 0.0001597304087920323, + "loss": 0.9887, + "step": 4054 + }, + { + "epoch": 0.8649744027303754, + "grad_norm": 0.7511578431019691, + "learning_rate": 0.00015972971031067226, + "loss": 0.9899, + "step": 4055 + }, + { + "epoch": 0.8651877133105802, + "grad_norm": 0.6167648342264183, + "learning_rate": 0.0001597290109271692, + "loss": 0.9743, + "step": 4056 + }, + { + "epoch": 0.865401023890785, + "grad_norm": 0.434811967642931, + "learning_rate": 0.00015972831064153104, + "loss": 1.0081, + "step": 4057 + }, + { + "epoch": 0.8656143344709898, + "grad_norm": 0.4957254477364596, + "learning_rate": 0.0001597276094537657, + "loss": 0.9843, + "step": 4058 + }, + { + "epoch": 0.8658276450511946, + "grad_norm": 0.4877252665169306, + "learning_rate": 0.0001597269073638811, + "loss": 0.9757, + "step": 4059 + }, + { + "epoch": 0.8660409556313993, + "grad_norm": 0.40902069501934396, + "learning_rate": 0.00015972620437188523, + "loss": 0.9653, + "step": 4060 + }, + { + "epoch": 0.8662542662116041, + "grad_norm": 0.34123472548966916, + "learning_rate": 0.000159725500477786, + "loss": 0.9579, + "step": 4061 + }, + { + "epoch": 0.8664675767918089, + "grad_norm": 0.3885211842093058, + "learning_rate": 0.0001597247956815914, + "loss": 1.0123, + "step": 4062 + }, + { + "epoch": 0.8666808873720137, + "grad_norm": 0.4643794715001354, + "learning_rate": 0.00015972408998330938, + "loss": 0.996, + "step": 4063 + }, + { + "epoch": 0.8668941979522184, + "grad_norm": 0.5103544245724426, + "learning_rate": 0.00015972338338294796, + "loss": 1.0179, + "step": 4064 + }, + { + "epoch": 0.8671075085324232, + "grad_norm": 0.5781903463442306, + "learning_rate": 0.0001597226758805151, + "loss": 0.9935, + "step": 4065 + }, + { + "epoch": 0.867320819112628, + "grad_norm": 0.5737981742266154, + "learning_rate": 0.00015972196747601882, + "loss": 0.974, + "step": 4066 + }, + { + "epoch": 0.8675341296928327, + "grad_norm": 0.5271702521131734, + "learning_rate": 0.00015972125816946715, + "loss": 0.9851, + "step": 4067 + }, + { + "epoch": 0.8677474402730375, + "grad_norm": 0.4919836526428891, + "learning_rate": 0.0001597205479608681, + "loss": 1.0077, + "step": 4068 + }, + { + "epoch": 0.8679607508532423, + "grad_norm": 0.48398379420297, + "learning_rate": 0.00015971983685022972, + "loss": 0.9714, + "step": 4069 + }, + { + "epoch": 0.8681740614334471, + "grad_norm": 0.5056490883016955, + "learning_rate": 0.00015971912483756003, + "loss": 0.9723, + "step": 4070 + }, + { + "epoch": 0.8683873720136519, + "grad_norm": 0.5257249920207523, + "learning_rate": 0.0001597184119228671, + "loss": 1.0154, + "step": 4071 + }, + { + "epoch": 0.8686006825938567, + "grad_norm": 0.501913947503635, + "learning_rate": 0.000159717698106159, + "loss": 0.9843, + "step": 4072 + }, + { + "epoch": 0.8688139931740614, + "grad_norm": 0.4779651450823987, + "learning_rate": 0.0001597169833874438, + "loss": 0.9412, + "step": 4073 + }, + { + "epoch": 0.8690273037542662, + "grad_norm": 0.45009061363634334, + "learning_rate": 0.00015971626776672962, + "loss": 1.0004, + "step": 4074 + }, + { + "epoch": 0.869240614334471, + "grad_norm": 0.3768093039196011, + "learning_rate": 0.00015971555124402452, + "loss": 0.9939, + "step": 4075 + }, + { + "epoch": 0.8694539249146758, + "grad_norm": 0.319374942099654, + "learning_rate": 0.0001597148338193366, + "loss": 0.9934, + "step": 4076 + }, + { + "epoch": 0.8696672354948806, + "grad_norm": 0.438547978639444, + "learning_rate": 0.000159714115492674, + "loss": 0.969, + "step": 4077 + }, + { + "epoch": 0.8698805460750854, + "grad_norm": 0.522051355007981, + "learning_rate": 0.00015971339626404485, + "loss": 0.9534, + "step": 4078 + }, + { + "epoch": 0.8700938566552902, + "grad_norm": 0.5091011302672718, + "learning_rate": 0.00015971267613345728, + "loss": 0.9925, + "step": 4079 + }, + { + "epoch": 0.8703071672354948, + "grad_norm": 0.5388310062644116, + "learning_rate": 0.00015971195510091943, + "loss": 0.9749, + "step": 4080 + }, + { + "epoch": 0.8705204778156996, + "grad_norm": 0.6649061179439354, + "learning_rate": 0.00015971123316643946, + "loss": 0.9921, + "step": 4081 + }, + { + "epoch": 0.8707337883959044, + "grad_norm": 0.7736370872778292, + "learning_rate": 0.00015971051033002555, + "loss": 0.9971, + "step": 4082 + }, + { + "epoch": 0.8709470989761092, + "grad_norm": 0.8110061736518275, + "learning_rate": 0.0001597097865916859, + "loss": 0.9983, + "step": 4083 + }, + { + "epoch": 0.871160409556314, + "grad_norm": 0.7966933983372656, + "learning_rate": 0.00015970906195142865, + "loss": 0.9968, + "step": 4084 + }, + { + "epoch": 0.8713737201365188, + "grad_norm": 1.207399893890641, + "learning_rate": 0.00015970833640926204, + "loss": 0.9714, + "step": 4085 + }, + { + "epoch": 0.8715870307167235, + "grad_norm": 0.3179110792981319, + "learning_rate": 0.00015970760996519422, + "loss": 0.9869, + "step": 4086 + }, + { + "epoch": 0.8718003412969283, + "grad_norm": 0.6960016680461067, + "learning_rate": 0.0001597068826192335, + "loss": 0.9786, + "step": 4087 + }, + { + "epoch": 0.8720136518771331, + "grad_norm": 1.2126529224075258, + "learning_rate": 0.00015970615437138806, + "loss": 1.027, + "step": 4088 + }, + { + "epoch": 0.8722269624573379, + "grad_norm": 0.81278578121933, + "learning_rate": 0.00015970542522166612, + "loss": 0.9714, + "step": 4089 + }, + { + "epoch": 0.8724402730375427, + "grad_norm": 0.39121213389204984, + "learning_rate": 0.00015970469517007595, + "loss": 0.9876, + "step": 4090 + }, + { + "epoch": 0.8726535836177475, + "grad_norm": 0.5796927318715054, + "learning_rate": 0.00015970396421662584, + "loss": 0.9892, + "step": 4091 + }, + { + "epoch": 0.8728668941979523, + "grad_norm": 0.7780327218102927, + "learning_rate": 0.00015970323236132403, + "loss": 0.9703, + "step": 4092 + }, + { + "epoch": 0.873080204778157, + "grad_norm": 0.9164216565597932, + "learning_rate": 0.00015970249960417878, + "loss": 1.0344, + "step": 4093 + }, + { + "epoch": 0.8732935153583617, + "grad_norm": 0.9475334981803589, + "learning_rate": 0.00015970176594519843, + "loss": 1.0078, + "step": 4094 + }, + { + "epoch": 0.8735068259385665, + "grad_norm": 0.813364294190245, + "learning_rate": 0.00015970103138439126, + "loss": 0.9726, + "step": 4095 + }, + { + "epoch": 0.8737201365187713, + "grad_norm": 0.7285352603133058, + "learning_rate": 0.00015970029592176556, + "loss": 0.9882, + "step": 4096 + }, + { + "epoch": 0.8739334470989761, + "grad_norm": 0.5839932107351735, + "learning_rate": 0.00015969955955732966, + "loss": 0.9801, + "step": 4097 + }, + { + "epoch": 0.8741467576791809, + "grad_norm": 0.4563452953944956, + "learning_rate": 0.00015969882229109194, + "loss": 0.9546, + "step": 4098 + }, + { + "epoch": 0.8743600682593856, + "grad_norm": 0.5039334235740468, + "learning_rate": 0.00015969808412306068, + "loss": 1.0227, + "step": 4099 + }, + { + "epoch": 0.8745733788395904, + "grad_norm": 0.6183682135337065, + "learning_rate": 0.00015969734505324426, + "loss": 0.9689, + "step": 4100 + }, + { + "epoch": 0.8747866894197952, + "grad_norm": 0.6472491717007995, + "learning_rate": 0.00015969660508165104, + "loss": 1.0042, + "step": 4101 + }, + { + "epoch": 0.875, + "grad_norm": 0.6286617441469725, + "learning_rate": 0.0001596958642082894, + "loss": 1.0004, + "step": 4102 + }, + { + "epoch": 0.8752133105802048, + "grad_norm": 0.7733797337169845, + "learning_rate": 0.00015969512243316772, + "loss": 0.9985, + "step": 4103 + }, + { + "epoch": 0.8754266211604096, + "grad_norm": 0.6031776121214975, + "learning_rate": 0.00015969437975629438, + "loss": 0.9833, + "step": 4104 + }, + { + "epoch": 0.8756399317406144, + "grad_norm": 0.449507674895992, + "learning_rate": 0.00015969363617767782, + "loss": 0.9488, + "step": 4105 + }, + { + "epoch": 0.8758532423208191, + "grad_norm": 0.5712683587919323, + "learning_rate": 0.00015969289169732637, + "loss": 0.9761, + "step": 4106 + }, + { + "epoch": 0.8760665529010239, + "grad_norm": 0.5013960976947146, + "learning_rate": 0.00015969214631524856, + "loss": 0.9884, + "step": 4107 + }, + { + "epoch": 0.8762798634812287, + "grad_norm": 0.48134927626715956, + "learning_rate": 0.00015969140003145276, + "loss": 0.9873, + "step": 4108 + }, + { + "epoch": 0.8764931740614335, + "grad_norm": 0.4313712425270677, + "learning_rate": 0.00015969065284594742, + "loss": 0.9765, + "step": 4109 + }, + { + "epoch": 0.8767064846416383, + "grad_norm": 0.41855446623504294, + "learning_rate": 0.000159689904758741, + "loss": 0.9801, + "step": 4110 + }, + { + "epoch": 0.876919795221843, + "grad_norm": 0.37938406202196107, + "learning_rate": 0.000159689155769842, + "loss": 0.9585, + "step": 4111 + }, + { + "epoch": 0.8771331058020477, + "grad_norm": 0.4513773039195946, + "learning_rate": 0.0001596884058792588, + "loss": 0.9729, + "step": 4112 + }, + { + "epoch": 0.8773464163822525, + "grad_norm": 0.4313933537379293, + "learning_rate": 0.000159687655087, + "loss": 0.9811, + "step": 4113 + }, + { + "epoch": 0.8775597269624573, + "grad_norm": 0.38415971013900113, + "learning_rate": 0.00015968690339307403, + "loss": 1.0042, + "step": 4114 + }, + { + "epoch": 0.8777730375426621, + "grad_norm": 0.39216829734634706, + "learning_rate": 0.0001596861507974894, + "loss": 1.0141, + "step": 4115 + }, + { + "epoch": 0.8779863481228669, + "grad_norm": 0.4313024507261242, + "learning_rate": 0.00015968539730025464, + "loss": 1.0028, + "step": 4116 + }, + { + "epoch": 0.8781996587030717, + "grad_norm": 0.5171211624689082, + "learning_rate": 0.00015968464290137826, + "loss": 0.9737, + "step": 4117 + }, + { + "epoch": 0.8784129692832765, + "grad_norm": 0.6676262377083769, + "learning_rate": 0.00015968388760086882, + "loss": 0.9856, + "step": 4118 + }, + { + "epoch": 0.8786262798634812, + "grad_norm": 0.7696894978550767, + "learning_rate": 0.00015968313139873484, + "loss": 0.9451, + "step": 4119 + }, + { + "epoch": 0.878839590443686, + "grad_norm": 0.7641324731819941, + "learning_rate": 0.0001596823742949849, + "loss": 1.0203, + "step": 4120 + }, + { + "epoch": 0.8790529010238908, + "grad_norm": 0.6580241082740969, + "learning_rate": 0.00015968161628962754, + "loss": 1.0071, + "step": 4121 + }, + { + "epoch": 0.8792662116040956, + "grad_norm": 0.6625067603094552, + "learning_rate": 0.00015968085738267136, + "loss": 0.9767, + "step": 4122 + }, + { + "epoch": 0.8794795221843004, + "grad_norm": 0.6352797869770975, + "learning_rate": 0.00015968009757412493, + "loss": 0.9773, + "step": 4123 + }, + { + "epoch": 0.8796928327645052, + "grad_norm": 0.43489388379828126, + "learning_rate": 0.00015967933686399688, + "loss": 0.9737, + "step": 4124 + }, + { + "epoch": 0.8799061433447098, + "grad_norm": 0.3409985278984479, + "learning_rate": 0.00015967857525229576, + "loss": 1.0043, + "step": 4125 + }, + { + "epoch": 0.8801194539249146, + "grad_norm": 0.5111802148720609, + "learning_rate": 0.00015967781273903023, + "loss": 0.9871, + "step": 4126 + }, + { + "epoch": 0.8803327645051194, + "grad_norm": 0.736529025495447, + "learning_rate": 0.0001596770493242089, + "loss": 1.0247, + "step": 4127 + }, + { + "epoch": 0.8805460750853242, + "grad_norm": 1.0575784676732112, + "learning_rate": 0.00015967628500784044, + "loss": 1.0094, + "step": 4128 + }, + { + "epoch": 0.880759385665529, + "grad_norm": 1.05382531491944, + "learning_rate": 0.00015967551978993347, + "loss": 0.9622, + "step": 4129 + }, + { + "epoch": 0.8809726962457338, + "grad_norm": 0.781493163550062, + "learning_rate": 0.00015967475367049665, + "loss": 0.9846, + "step": 4130 + }, + { + "epoch": 0.8811860068259386, + "grad_norm": 0.5970475035436419, + "learning_rate": 0.00015967398664953864, + "loss": 0.9807, + "step": 4131 + }, + { + "epoch": 0.8813993174061433, + "grad_norm": 0.5442761717662687, + "learning_rate": 0.00015967321872706816, + "loss": 0.9533, + "step": 4132 + }, + { + "epoch": 0.8816126279863481, + "grad_norm": 0.6536206645351065, + "learning_rate": 0.00015967244990309384, + "loss": 0.9452, + "step": 4133 + }, + { + "epoch": 0.8818259385665529, + "grad_norm": 0.7227865055249493, + "learning_rate": 0.00015967168017762444, + "loss": 0.9698, + "step": 4134 + }, + { + "epoch": 0.8820392491467577, + "grad_norm": 0.8496993479041313, + "learning_rate": 0.0001596709095506686, + "loss": 0.9815, + "step": 4135 + }, + { + "epoch": 0.8822525597269625, + "grad_norm": 0.9176447976096921, + "learning_rate": 0.0001596701380222351, + "loss": 0.9891, + "step": 4136 + }, + { + "epoch": 0.8824658703071673, + "grad_norm": 0.8029060334483853, + "learning_rate": 0.00015966936559233267, + "loss": 0.9802, + "step": 4137 + }, + { + "epoch": 0.882679180887372, + "grad_norm": 0.6615972129157812, + "learning_rate": 0.00015966859226096997, + "loss": 0.964, + "step": 4138 + }, + { + "epoch": 0.8828924914675768, + "grad_norm": 0.4440726294033041, + "learning_rate": 0.00015966781802815588, + "loss": 0.987, + "step": 4139 + }, + { + "epoch": 0.8831058020477816, + "grad_norm": 0.37886262900801093, + "learning_rate": 0.00015966704289389905, + "loss": 0.9456, + "step": 4140 + }, + { + "epoch": 0.8833191126279863, + "grad_norm": 0.6510733785240151, + "learning_rate": 0.00015966626685820826, + "loss": 0.9976, + "step": 4141 + }, + { + "epoch": 0.8835324232081911, + "grad_norm": 0.8572927736123462, + "learning_rate": 0.00015966548992109235, + "loss": 1.0227, + "step": 4142 + }, + { + "epoch": 0.8837457337883959, + "grad_norm": 0.9075541275628226, + "learning_rate": 0.0001596647120825601, + "loss": 0.9622, + "step": 4143 + }, + { + "epoch": 0.8839590443686007, + "grad_norm": 0.7127877113010731, + "learning_rate": 0.00015966393334262028, + "loss": 0.9981, + "step": 4144 + }, + { + "epoch": 0.8841723549488054, + "grad_norm": 0.4971266141156977, + "learning_rate": 0.0001596631537012817, + "loss": 0.9506, + "step": 4145 + }, + { + "epoch": 0.8843856655290102, + "grad_norm": 0.5304321354684441, + "learning_rate": 0.00015966237315855321, + "loss": 0.9634, + "step": 4146 + }, + { + "epoch": 0.884598976109215, + "grad_norm": 0.42189413354975114, + "learning_rate": 0.00015966159171444363, + "loss": 0.9833, + "step": 4147 + }, + { + "epoch": 0.8848122866894198, + "grad_norm": 0.35139844816059884, + "learning_rate": 0.00015966080936896177, + "loss": 0.9783, + "step": 4148 + }, + { + "epoch": 0.8850255972696246, + "grad_norm": 0.456728095154982, + "learning_rate": 0.00015966002612211654, + "loss": 1.0036, + "step": 4149 + }, + { + "epoch": 0.8852389078498294, + "grad_norm": 0.48575285652031946, + "learning_rate": 0.0001596592419739168, + "loss": 0.985, + "step": 4150 + }, + { + "epoch": 0.8854522184300341, + "grad_norm": 0.4021605350031266, + "learning_rate": 0.00015965845692437135, + "loss": 0.968, + "step": 4151 + }, + { + "epoch": 0.8856655290102389, + "grad_norm": 0.32356191481393154, + "learning_rate": 0.00015965767097348915, + "loss": 0.9786, + "step": 4152 + }, + { + "epoch": 0.8858788395904437, + "grad_norm": 0.37262671248909207, + "learning_rate": 0.00015965688412127905, + "loss": 0.9702, + "step": 4153 + }, + { + "epoch": 0.8860921501706485, + "grad_norm": 0.46132955148018917, + "learning_rate": 0.00015965609636774998, + "loss": 1.0061, + "step": 4154 + }, + { + "epoch": 0.8863054607508533, + "grad_norm": 0.4408148810688075, + "learning_rate": 0.00015965530771291082, + "loss": 0.9728, + "step": 4155 + }, + { + "epoch": 0.886518771331058, + "grad_norm": 0.38085049254058545, + "learning_rate": 0.00015965451815677051, + "loss": 0.9821, + "step": 4156 + }, + { + "epoch": 0.8867320819112628, + "grad_norm": 0.38773226548567574, + "learning_rate": 0.00015965372769933801, + "loss": 0.9762, + "step": 4157 + }, + { + "epoch": 0.8869453924914675, + "grad_norm": 0.42247944167586515, + "learning_rate": 0.0001596529363406222, + "loss": 1.0261, + "step": 4158 + }, + { + "epoch": 0.8871587030716723, + "grad_norm": 0.394259833834286, + "learning_rate": 0.00015965214408063212, + "loss": 1.0114, + "step": 4159 + }, + { + "epoch": 0.8873720136518771, + "grad_norm": 0.42523334210559743, + "learning_rate": 0.00015965135091937667, + "loss": 0.977, + "step": 4160 + }, + { + "epoch": 0.8875853242320819, + "grad_norm": 0.46669621421977375, + "learning_rate": 0.00015965055685686485, + "loss": 0.9703, + "step": 4161 + }, + { + "epoch": 0.8877986348122867, + "grad_norm": 0.49184649964091676, + "learning_rate": 0.00015964976189310564, + "loss": 0.9751, + "step": 4162 + }, + { + "epoch": 0.8880119453924915, + "grad_norm": 0.5649931246044866, + "learning_rate": 0.00015964896602810804, + "loss": 0.9843, + "step": 4163 + }, + { + "epoch": 0.8882252559726962, + "grad_norm": 0.5994083320578366, + "learning_rate": 0.00015964816926188104, + "loss": 0.9296, + "step": 4164 + }, + { + "epoch": 0.888438566552901, + "grad_norm": 0.6224866673377394, + "learning_rate": 0.00015964737159443367, + "loss": 0.9508, + "step": 4165 + }, + { + "epoch": 0.8886518771331058, + "grad_norm": 0.7342402745164736, + "learning_rate": 0.00015964657302577494, + "loss": 0.9904, + "step": 4166 + }, + { + "epoch": 0.8888651877133106, + "grad_norm": 0.8321056348866508, + "learning_rate": 0.0001596457735559139, + "loss": 0.9922, + "step": 4167 + }, + { + "epoch": 0.8890784982935154, + "grad_norm": 0.7877262996323384, + "learning_rate": 0.00015964497318485962, + "loss": 0.9891, + "step": 4168 + }, + { + "epoch": 0.8892918088737202, + "grad_norm": 0.794189623851461, + "learning_rate": 0.0001596441719126211, + "loss": 0.9731, + "step": 4169 + }, + { + "epoch": 0.889505119453925, + "grad_norm": 0.8312925737297495, + "learning_rate": 0.00015964336973920743, + "loss": 0.9565, + "step": 4170 + }, + { + "epoch": 0.8897184300341296, + "grad_norm": 0.8961281184096805, + "learning_rate": 0.00015964256666462772, + "loss": 0.9761, + "step": 4171 + }, + { + "epoch": 0.8899317406143344, + "grad_norm": 0.9058570816653261, + "learning_rate": 0.000159641762688891, + "loss": 0.9992, + "step": 4172 + }, + { + "epoch": 0.8901450511945392, + "grad_norm": 0.8279596518053601, + "learning_rate": 0.00015964095781200642, + "loss": 0.957, + "step": 4173 + }, + { + "epoch": 0.890358361774744, + "grad_norm": 0.6105960142594516, + "learning_rate": 0.00015964015203398304, + "loss": 0.9735, + "step": 4174 + }, + { + "epoch": 0.8905716723549488, + "grad_norm": 0.4065798497409888, + "learning_rate": 0.00015963934535483, + "loss": 0.9956, + "step": 4175 + }, + { + "epoch": 0.8907849829351536, + "grad_norm": 0.3670127998870047, + "learning_rate": 0.00015963853777455645, + "loss": 0.9745, + "step": 4176 + }, + { + "epoch": 0.8909982935153583, + "grad_norm": 0.40340018976604497, + "learning_rate": 0.0001596377292931715, + "loss": 0.9744, + "step": 4177 + }, + { + "epoch": 0.8912116040955631, + "grad_norm": 0.49959535270044614, + "learning_rate": 0.0001596369199106843, + "loss": 0.9663, + "step": 4178 + }, + { + "epoch": 0.8914249146757679, + "grad_norm": 0.5436525878330869, + "learning_rate": 0.000159636109627104, + "loss": 0.9761, + "step": 4179 + }, + { + "epoch": 0.8916382252559727, + "grad_norm": 0.6143432318470589, + "learning_rate": 0.00015963529844243977, + "loss": 0.9543, + "step": 4180 + }, + { + "epoch": 0.8918515358361775, + "grad_norm": 0.6206389179758415, + "learning_rate": 0.00015963448635670084, + "loss": 0.9621, + "step": 4181 + }, + { + "epoch": 0.8920648464163823, + "grad_norm": 0.7216338549814224, + "learning_rate": 0.0001596336733698963, + "loss": 0.9824, + "step": 4182 + }, + { + "epoch": 0.8922781569965871, + "grad_norm": 0.6387793914371666, + "learning_rate": 0.00015963285948203547, + "loss": 0.9953, + "step": 4183 + }, + { + "epoch": 0.8924914675767918, + "grad_norm": 0.5108582390880754, + "learning_rate": 0.00015963204469312747, + "loss": 0.9873, + "step": 4184 + }, + { + "epoch": 0.8927047781569966, + "grad_norm": 0.46105214320020466, + "learning_rate": 0.00015963122900318155, + "loss": 0.9675, + "step": 4185 + }, + { + "epoch": 0.8929180887372014, + "grad_norm": 0.44913605598536227, + "learning_rate": 0.0001596304124122069, + "loss": 1.0029, + "step": 4186 + }, + { + "epoch": 0.8931313993174061, + "grad_norm": 0.44178782770370817, + "learning_rate": 0.00015962959492021284, + "loss": 1.0004, + "step": 4187 + }, + { + "epoch": 0.893344709897611, + "grad_norm": 0.47852709878507327, + "learning_rate": 0.00015962877652720857, + "loss": 0.9857, + "step": 4188 + }, + { + "epoch": 0.8935580204778157, + "grad_norm": 0.5967412810737706, + "learning_rate": 0.00015962795723320331, + "loss": 0.9669, + "step": 4189 + }, + { + "epoch": 0.8937713310580204, + "grad_norm": 0.6259151663890228, + "learning_rate": 0.0001596271370382064, + "loss": 0.9862, + "step": 4190 + }, + { + "epoch": 0.8939846416382252, + "grad_norm": 0.6303654221320327, + "learning_rate": 0.0001596263159422271, + "loss": 0.9692, + "step": 4191 + }, + { + "epoch": 0.89419795221843, + "grad_norm": 0.6882286567548002, + "learning_rate": 0.0001596254939452747, + "loss": 0.9298, + "step": 4192 + }, + { + "epoch": 0.8944112627986348, + "grad_norm": 0.5563932635694553, + "learning_rate": 0.0001596246710473585, + "loss": 0.947, + "step": 4193 + }, + { + "epoch": 0.8946245733788396, + "grad_norm": 0.3997354695343865, + "learning_rate": 0.0001596238472484878, + "loss": 1.0019, + "step": 4194 + }, + { + "epoch": 0.8948378839590444, + "grad_norm": 0.33482901086857675, + "learning_rate": 0.0001596230225486719, + "loss": 1.0128, + "step": 4195 + }, + { + "epoch": 0.8950511945392492, + "grad_norm": 0.4666258239110231, + "learning_rate": 0.0001596221969479202, + "loss": 0.9868, + "step": 4196 + }, + { + "epoch": 0.8952645051194539, + "grad_norm": 0.46872058198372, + "learning_rate": 0.00015962137044624202, + "loss": 0.9816, + "step": 4197 + }, + { + "epoch": 0.8954778156996587, + "grad_norm": 0.3596705580945355, + "learning_rate": 0.00015962054304364664, + "loss": 0.9925, + "step": 4198 + }, + { + "epoch": 0.8956911262798635, + "grad_norm": 0.4303198263102454, + "learning_rate": 0.00015961971474014349, + "loss": 0.9632, + "step": 4199 + }, + { + "epoch": 0.8959044368600683, + "grad_norm": 0.4885816606480433, + "learning_rate": 0.00015961888553574195, + "loss": 0.9406, + "step": 4200 + }, + { + "epoch": 0.8961177474402731, + "grad_norm": 0.484584822661318, + "learning_rate": 0.00015961805543045137, + "loss": 1.0225, + "step": 4201 + }, + { + "epoch": 0.8963310580204779, + "grad_norm": 0.5449582478128187, + "learning_rate": 0.00015961722442428116, + "loss": 0.9592, + "step": 4202 + }, + { + "epoch": 0.8965443686006825, + "grad_norm": 0.6193986032744996, + "learning_rate": 0.0001596163925172407, + "loss": 0.9846, + "step": 4203 + }, + { + "epoch": 0.8967576791808873, + "grad_norm": 0.6790542046142115, + "learning_rate": 0.00015961555970933945, + "loss": 0.9843, + "step": 4204 + }, + { + "epoch": 0.8969709897610921, + "grad_norm": 0.7515971933719642, + "learning_rate": 0.00015961472600058678, + "loss": 0.9734, + "step": 4205 + }, + { + "epoch": 0.8971843003412969, + "grad_norm": 0.8757773339791136, + "learning_rate": 0.00015961389139099215, + "loss": 0.9501, + "step": 4206 + }, + { + "epoch": 0.8973976109215017, + "grad_norm": 1.005286051788261, + "learning_rate": 0.00015961305588056498, + "loss": 0.9848, + "step": 4207 + }, + { + "epoch": 0.8976109215017065, + "grad_norm": 0.9920741067022355, + "learning_rate": 0.00015961221946931477, + "loss": 0.9726, + "step": 4208 + }, + { + "epoch": 0.8978242320819113, + "grad_norm": 0.8503064499697961, + "learning_rate": 0.00015961138215725095, + "loss": 0.9663, + "step": 4209 + }, + { + "epoch": 0.898037542662116, + "grad_norm": 0.9178873869036704, + "learning_rate": 0.000159610543944383, + "loss": 0.9552, + "step": 4210 + }, + { + "epoch": 0.8982508532423208, + "grad_norm": 0.48069766390924173, + "learning_rate": 0.0001596097048307204, + "loss": 0.9847, + "step": 4211 + }, + { + "epoch": 0.8984641638225256, + "grad_norm": 0.4157120726539533, + "learning_rate": 0.00015960886481627268, + "loss": 0.9438, + "step": 4212 + }, + { + "epoch": 0.8986774744027304, + "grad_norm": 0.4725691858006927, + "learning_rate": 0.0001596080239010493, + "loss": 0.9661, + "step": 4213 + }, + { + "epoch": 0.8988907849829352, + "grad_norm": 0.6733568290336587, + "learning_rate": 0.00015960718208505976, + "loss": 0.9992, + "step": 4214 + }, + { + "epoch": 0.89910409556314, + "grad_norm": 0.8095146944028802, + "learning_rate": 0.00015960633936831366, + "loss": 0.9738, + "step": 4215 + }, + { + "epoch": 0.8993174061433447, + "grad_norm": 0.8892766575272476, + "learning_rate": 0.00015960549575082047, + "loss": 0.9856, + "step": 4216 + }, + { + "epoch": 0.8995307167235495, + "grad_norm": 0.9757792051346355, + "learning_rate": 0.00015960465123258976, + "loss": 0.9797, + "step": 4217 + }, + { + "epoch": 0.8997440273037542, + "grad_norm": 0.9140317300273813, + "learning_rate": 0.00015960380581363104, + "loss": 0.9574, + "step": 4218 + }, + { + "epoch": 0.899957337883959, + "grad_norm": 0.6361840832582877, + "learning_rate": 0.00015960295949395397, + "loss": 1.0149, + "step": 4219 + }, + { + "epoch": 0.9001706484641638, + "grad_norm": 0.47048289082115724, + "learning_rate": 0.00015960211227356808, + "loss": 0.9813, + "step": 4220 + }, + { + "epoch": 0.9003839590443686, + "grad_norm": 0.4618345122506045, + "learning_rate": 0.00015960126415248292, + "loss": 0.9833, + "step": 4221 + }, + { + "epoch": 0.9005972696245734, + "grad_norm": 0.4921835615230344, + "learning_rate": 0.00015960041513070813, + "loss": 0.9872, + "step": 4222 + }, + { + "epoch": 0.9008105802047781, + "grad_norm": 0.4546245493889677, + "learning_rate": 0.00015959956520825332, + "loss": 1.0101, + "step": 4223 + }, + { + "epoch": 0.9010238907849829, + "grad_norm": 0.4698672031369919, + "learning_rate": 0.00015959871438512807, + "loss": 0.9959, + "step": 4224 + }, + { + "epoch": 0.9012372013651877, + "grad_norm": 0.5203839029074974, + "learning_rate": 0.00015959786266134203, + "loss": 0.9646, + "step": 4225 + }, + { + "epoch": 0.9014505119453925, + "grad_norm": 0.49639520496486744, + "learning_rate": 0.00015959701003690486, + "loss": 0.9681, + "step": 4226 + }, + { + "epoch": 0.9016638225255973, + "grad_norm": 0.4443373889237779, + "learning_rate": 0.00015959615651182614, + "loss": 0.9947, + "step": 4227 + }, + { + "epoch": 0.9018771331058021, + "grad_norm": 0.3098173170170522, + "learning_rate": 0.0001595953020861156, + "loss": 0.9456, + "step": 4228 + }, + { + "epoch": 0.9020904436860068, + "grad_norm": 0.33608071745352763, + "learning_rate": 0.00015959444675978286, + "loss": 0.9966, + "step": 4229 + }, + { + "epoch": 0.9023037542662116, + "grad_norm": 0.3724662895452303, + "learning_rate": 0.00015959359053283764, + "loss": 0.9818, + "step": 4230 + }, + { + "epoch": 0.9025170648464164, + "grad_norm": 0.40293233683088325, + "learning_rate": 0.00015959273340528958, + "loss": 0.9639, + "step": 4231 + }, + { + "epoch": 0.9027303754266212, + "grad_norm": 0.4412345554755669, + "learning_rate": 0.00015959187537714842, + "loss": 0.971, + "step": 4232 + }, + { + "epoch": 0.902943686006826, + "grad_norm": 0.4674134307724434, + "learning_rate": 0.0001595910164484238, + "loss": 1.0035, + "step": 4233 + }, + { + "epoch": 0.9031569965870307, + "grad_norm": 0.5214949440445835, + "learning_rate": 0.00015959015661912558, + "loss": 1.0104, + "step": 4234 + }, + { + "epoch": 0.9033703071672355, + "grad_norm": 0.5578932049692743, + "learning_rate": 0.00015958929588926333, + "loss": 0.9762, + "step": 4235 + }, + { + "epoch": 0.9035836177474402, + "grad_norm": 0.5642553316414852, + "learning_rate": 0.00015958843425884688, + "loss": 0.974, + "step": 4236 + }, + { + "epoch": 0.903796928327645, + "grad_norm": 0.5458566228838719, + "learning_rate": 0.00015958757172788597, + "loss": 0.9755, + "step": 4237 + }, + { + "epoch": 0.9040102389078498, + "grad_norm": 0.5086069493942073, + "learning_rate": 0.00015958670829639034, + "loss": 1.0207, + "step": 4238 + }, + { + "epoch": 0.9042235494880546, + "grad_norm": 0.4336920315452677, + "learning_rate": 0.00015958584396436976, + "loss": 0.9955, + "step": 4239 + }, + { + "epoch": 0.9044368600682594, + "grad_norm": 0.41848880681310174, + "learning_rate": 0.00015958497873183402, + "loss": 0.9882, + "step": 4240 + }, + { + "epoch": 0.9046501706484642, + "grad_norm": 0.3662095040100004, + "learning_rate": 0.0001595841125987929, + "loss": 0.9941, + "step": 4241 + }, + { + "epoch": 0.9048634812286689, + "grad_norm": 0.36974627273600474, + "learning_rate": 0.00015958324556525623, + "loss": 0.9881, + "step": 4242 + }, + { + "epoch": 0.9050767918088737, + "grad_norm": 0.38645656971049264, + "learning_rate": 0.00015958237763123376, + "loss": 0.988, + "step": 4243 + }, + { + "epoch": 0.9052901023890785, + "grad_norm": 0.4696673682907714, + "learning_rate": 0.00015958150879673538, + "loss": 1.016, + "step": 4244 + }, + { + "epoch": 0.9055034129692833, + "grad_norm": 0.6750542813063088, + "learning_rate": 0.00015958063906177088, + "loss": 0.9858, + "step": 4245 + }, + { + "epoch": 0.9057167235494881, + "grad_norm": 0.8765666975031361, + "learning_rate": 0.0001595797684263501, + "loss": 0.9806, + "step": 4246 + }, + { + "epoch": 0.9059300341296929, + "grad_norm": 1.0067967129216422, + "learning_rate": 0.00015957889689048293, + "loss": 0.9853, + "step": 4247 + }, + { + "epoch": 0.9061433447098977, + "grad_norm": 1.1236643703979463, + "learning_rate": 0.00015957802445417917, + "loss": 0.9612, + "step": 4248 + }, + { + "epoch": 0.9063566552901023, + "grad_norm": 0.8163552438592109, + "learning_rate": 0.00015957715111744873, + "loss": 0.9844, + "step": 4249 + }, + { + "epoch": 0.9065699658703071, + "grad_norm": 0.5868927445261235, + "learning_rate": 0.00015957627688030151, + "loss": 0.9806, + "step": 4250 + }, + { + "epoch": 0.9067832764505119, + "grad_norm": 0.42716261814839357, + "learning_rate": 0.00015957540174274738, + "loss": 0.9738, + "step": 4251 + }, + { + "epoch": 0.9069965870307167, + "grad_norm": 0.49439444540537913, + "learning_rate": 0.0001595745257047962, + "loss": 0.9747, + "step": 4252 + }, + { + "epoch": 0.9072098976109215, + "grad_norm": 0.7051621830374541, + "learning_rate": 0.00015957364876645797, + "loss": 0.9904, + "step": 4253 + }, + { + "epoch": 0.9074232081911263, + "grad_norm": 0.893705993865881, + "learning_rate": 0.00015957277092774254, + "loss": 0.9809, + "step": 4254 + }, + { + "epoch": 0.9076365187713311, + "grad_norm": 0.9420901714726359, + "learning_rate": 0.00015957189218865987, + "loss": 1.0192, + "step": 4255 + }, + { + "epoch": 0.9078498293515358, + "grad_norm": 0.9052379217052723, + "learning_rate": 0.00015957101254921992, + "loss": 1.0065, + "step": 4256 + }, + { + "epoch": 0.9080631399317406, + "grad_norm": 0.8409994737314431, + "learning_rate": 0.0001595701320094326, + "loss": 0.9909, + "step": 4257 + }, + { + "epoch": 0.9082764505119454, + "grad_norm": 0.6870102458921257, + "learning_rate": 0.0001595692505693079, + "loss": 1.006, + "step": 4258 + }, + { + "epoch": 0.9084897610921502, + "grad_norm": 0.4515043366049288, + "learning_rate": 0.00015956836822885581, + "loss": 1.002, + "step": 4259 + }, + { + "epoch": 0.908703071672355, + "grad_norm": 0.32994737976532645, + "learning_rate": 0.00015956748498808626, + "loss": 0.9367, + "step": 4260 + }, + { + "epoch": 0.9089163822525598, + "grad_norm": 0.3633002989505795, + "learning_rate": 0.00015956660084700934, + "loss": 0.9629, + "step": 4261 + }, + { + "epoch": 0.9091296928327645, + "grad_norm": 0.4482566852921805, + "learning_rate": 0.00015956571580563495, + "loss": 0.9631, + "step": 4262 + }, + { + "epoch": 0.9093430034129693, + "grad_norm": 0.4190347942816336, + "learning_rate": 0.00015956482986397312, + "loss": 0.9712, + "step": 4263 + }, + { + "epoch": 0.909556313993174, + "grad_norm": 0.3825792578295931, + "learning_rate": 0.00015956394302203396, + "loss": 0.9848, + "step": 4264 + }, + { + "epoch": 0.9097696245733788, + "grad_norm": 0.36284487231363166, + "learning_rate": 0.00015956305527982742, + "loss": 0.9876, + "step": 4265 + }, + { + "epoch": 0.9099829351535836, + "grad_norm": 0.3742563818878921, + "learning_rate": 0.00015956216663736358, + "loss": 1.0007, + "step": 4266 + }, + { + "epoch": 0.9101962457337884, + "grad_norm": 0.3787821716639909, + "learning_rate": 0.00015956127709465247, + "loss": 0.958, + "step": 4267 + }, + { + "epoch": 0.9104095563139932, + "grad_norm": 0.40491251849137, + "learning_rate": 0.0001595603866517042, + "loss": 0.9806, + "step": 4268 + }, + { + "epoch": 0.9106228668941979, + "grad_norm": 0.46052217359041836, + "learning_rate": 0.0001595594953085288, + "loss": 1.0154, + "step": 4269 + }, + { + "epoch": 0.9108361774744027, + "grad_norm": 0.5621091351576939, + "learning_rate": 0.00015955860306513634, + "loss": 1.0074, + "step": 4270 + }, + { + "epoch": 0.9110494880546075, + "grad_norm": 0.6246238831155105, + "learning_rate": 0.00015955770992153698, + "loss": 0.9807, + "step": 4271 + }, + { + "epoch": 0.9112627986348123, + "grad_norm": 0.6235865497667301, + "learning_rate": 0.0001595568158777408, + "loss": 0.974, + "step": 4272 + }, + { + "epoch": 0.9114761092150171, + "grad_norm": 0.6352724256854992, + "learning_rate": 0.00015955592093375788, + "loss": 0.9598, + "step": 4273 + }, + { + "epoch": 0.9116894197952219, + "grad_norm": 0.6850916928196599, + "learning_rate": 0.00015955502508959842, + "loss": 0.9579, + "step": 4274 + }, + { + "epoch": 0.9119027303754266, + "grad_norm": 0.6876611629833812, + "learning_rate": 0.0001595541283452725, + "loss": 0.9585, + "step": 4275 + }, + { + "epoch": 0.9121160409556314, + "grad_norm": 0.6806754311584077, + "learning_rate": 0.00015955323070079025, + "loss": 0.9637, + "step": 4276 + }, + { + "epoch": 0.9123293515358362, + "grad_norm": 0.6519031829742175, + "learning_rate": 0.0001595523321561619, + "loss": 0.9999, + "step": 4277 + }, + { + "epoch": 0.912542662116041, + "grad_norm": 0.47416514021626655, + "learning_rate": 0.00015955143271139755, + "loss": 0.9933, + "step": 4278 + }, + { + "epoch": 0.9127559726962458, + "grad_norm": 0.3490507324109255, + "learning_rate": 0.00015955053236650738, + "loss": 0.9639, + "step": 4279 + }, + { + "epoch": 0.9129692832764505, + "grad_norm": 0.3399087457790829, + "learning_rate": 0.00015954963112150162, + "loss": 0.964, + "step": 4280 + }, + { + "epoch": 0.9131825938566553, + "grad_norm": 0.4436415465223694, + "learning_rate": 0.00015954872897639045, + "loss": 0.9537, + "step": 4281 + }, + { + "epoch": 0.91339590443686, + "grad_norm": 0.49227150770578687, + "learning_rate": 0.00015954782593118408, + "loss": 0.9846, + "step": 4282 + }, + { + "epoch": 0.9136092150170648, + "grad_norm": 0.43577543459189744, + "learning_rate": 0.0001595469219858927, + "loss": 0.9793, + "step": 4283 + }, + { + "epoch": 0.9138225255972696, + "grad_norm": 0.5037723162905454, + "learning_rate": 0.0001595460171405266, + "loss": 0.9748, + "step": 4284 + }, + { + "epoch": 0.9140358361774744, + "grad_norm": 0.5259479728833418, + "learning_rate": 0.00015954511139509592, + "loss": 0.9712, + "step": 4285 + }, + { + "epoch": 0.9142491467576792, + "grad_norm": 0.5442010825096165, + "learning_rate": 0.000159544204749611, + "loss": 1.0095, + "step": 4286 + }, + { + "epoch": 0.914462457337884, + "grad_norm": 0.6064045910257266, + "learning_rate": 0.00015954329720408204, + "loss": 0.9803, + "step": 4287 + }, + { + "epoch": 0.9146757679180887, + "grad_norm": 0.7288315513346115, + "learning_rate": 0.0001595423887585194, + "loss": 0.9885, + "step": 4288 + }, + { + "epoch": 0.9148890784982935, + "grad_norm": 0.7718528376420287, + "learning_rate": 0.00015954147941293323, + "loss": 0.9963, + "step": 4289 + }, + { + "epoch": 0.9151023890784983, + "grad_norm": 0.8142131388058869, + "learning_rate": 0.0001595405691673339, + "loss": 0.98, + "step": 4290 + }, + { + "epoch": 0.9153156996587031, + "grad_norm": 0.9105388813916877, + "learning_rate": 0.0001595396580217317, + "loss": 1.0044, + "step": 4291 + }, + { + "epoch": 0.9155290102389079, + "grad_norm": 0.9892210743249529, + "learning_rate": 0.00015953874597613693, + "loss": 0.9697, + "step": 4292 + }, + { + "epoch": 0.9157423208191127, + "grad_norm": 0.9921406234603195, + "learning_rate": 0.0001595378330305599, + "loss": 0.9822, + "step": 4293 + }, + { + "epoch": 0.9159556313993175, + "grad_norm": 0.9693403075213546, + "learning_rate": 0.00015953691918501095, + "loss": 1.0026, + "step": 4294 + }, + { + "epoch": 0.9161689419795221, + "grad_norm": 0.7924640759882633, + "learning_rate": 0.00015953600443950042, + "loss": 0.949, + "step": 4295 + }, + { + "epoch": 0.9163822525597269, + "grad_norm": 0.5161250829314047, + "learning_rate": 0.00015953508879403867, + "loss": 1.0133, + "step": 4296 + }, + { + "epoch": 0.9165955631399317, + "grad_norm": 0.45965657416685735, + "learning_rate": 0.00015953417224863606, + "loss": 0.9986, + "step": 4297 + }, + { + "epoch": 0.9168088737201365, + "grad_norm": 0.8013678725542331, + "learning_rate": 0.00015953325480330295, + "loss": 1.0038, + "step": 4298 + }, + { + "epoch": 0.9170221843003413, + "grad_norm": 0.8619522219264307, + "learning_rate": 0.00015953233645804972, + "loss": 0.9753, + "step": 4299 + }, + { + "epoch": 0.9172354948805461, + "grad_norm": 0.9030325623476732, + "learning_rate": 0.00015953141721288674, + "loss": 1.0178, + "step": 4300 + }, + { + "epoch": 0.9174488054607508, + "grad_norm": 0.7221010018579279, + "learning_rate": 0.0001595304970678245, + "loss": 0.9894, + "step": 4301 + }, + { + "epoch": 0.9176621160409556, + "grad_norm": 0.81385500972023, + "learning_rate": 0.00015952957602287328, + "loss": 0.9908, + "step": 4302 + }, + { + "epoch": 0.9178754266211604, + "grad_norm": 0.812869939367367, + "learning_rate": 0.0001595286540780436, + "loss": 1.0007, + "step": 4303 + }, + { + "epoch": 0.9180887372013652, + "grad_norm": 0.7561394573639133, + "learning_rate": 0.0001595277312333459, + "loss": 1.0014, + "step": 4304 + }, + { + "epoch": 0.91830204778157, + "grad_norm": 0.7464244705742282, + "learning_rate": 0.00015952680748879052, + "loss": 0.9842, + "step": 4305 + }, + { + "epoch": 0.9185153583617748, + "grad_norm": 0.6424386021504424, + "learning_rate": 0.00015952588284438802, + "loss": 0.9615, + "step": 4306 + }, + { + "epoch": 0.9187286689419796, + "grad_norm": 0.5021673719306112, + "learning_rate": 0.0001595249573001488, + "loss": 1.0013, + "step": 4307 + }, + { + "epoch": 0.9189419795221843, + "grad_norm": 0.49953701344394996, + "learning_rate": 0.00015952403085608334, + "loss": 0.9838, + "step": 4308 + }, + { + "epoch": 0.919155290102389, + "grad_norm": 0.4812344543100747, + "learning_rate": 0.00015952310351220213, + "loss": 0.9544, + "step": 4309 + }, + { + "epoch": 0.9193686006825939, + "grad_norm": 0.46542928469176204, + "learning_rate": 0.00015952217526851568, + "loss": 0.9545, + "step": 4310 + }, + { + "epoch": 0.9195819112627986, + "grad_norm": 0.4241142885382056, + "learning_rate": 0.00015952124612503448, + "loss": 1.0295, + "step": 4311 + }, + { + "epoch": 0.9197952218430034, + "grad_norm": 0.42575919862824185, + "learning_rate": 0.00015952031608176904, + "loss": 0.9732, + "step": 4312 + }, + { + "epoch": 0.9200085324232082, + "grad_norm": 0.4728709657918361, + "learning_rate": 0.00015951938513872988, + "loss": 0.9779, + "step": 4313 + }, + { + "epoch": 0.9202218430034129, + "grad_norm": 0.5109494119129883, + "learning_rate": 0.00015951845329592755, + "loss": 0.9756, + "step": 4314 + }, + { + "epoch": 0.9204351535836177, + "grad_norm": 0.5798594478005055, + "learning_rate": 0.00015951752055337257, + "loss": 0.9996, + "step": 4315 + }, + { + "epoch": 0.9206484641638225, + "grad_norm": 0.5977119434533165, + "learning_rate": 0.00015951658691107552, + "loss": 0.9778, + "step": 4316 + }, + { + "epoch": 0.9208617747440273, + "grad_norm": 0.5673922909221829, + "learning_rate": 0.00015951565236904696, + "loss": 0.9615, + "step": 4317 + }, + { + "epoch": 0.9210750853242321, + "grad_norm": 0.6959044817877864, + "learning_rate": 0.00015951471692729743, + "loss": 1.0071, + "step": 4318 + }, + { + "epoch": 0.9212883959044369, + "grad_norm": 0.8270407558557844, + "learning_rate": 0.00015951378058583757, + "loss": 0.9751, + "step": 4319 + }, + { + "epoch": 0.9215017064846417, + "grad_norm": 0.7873522375695964, + "learning_rate": 0.00015951284334467792, + "loss": 0.9331, + "step": 4320 + }, + { + "epoch": 0.9217150170648464, + "grad_norm": 0.6914255852965302, + "learning_rate": 0.00015951190520382911, + "loss": 0.9636, + "step": 4321 + }, + { + "epoch": 0.9219283276450512, + "grad_norm": 0.5827976113039143, + "learning_rate": 0.00015951096616330178, + "loss": 0.9732, + "step": 4322 + }, + { + "epoch": 0.922141638225256, + "grad_norm": 0.4133735928197901, + "learning_rate": 0.0001595100262231065, + "loss": 1.0032, + "step": 4323 + }, + { + "epoch": 0.9223549488054608, + "grad_norm": 0.40854112971933837, + "learning_rate": 0.00015950908538325397, + "loss": 0.9818, + "step": 4324 + }, + { + "epoch": 0.9225682593856656, + "grad_norm": 0.46686947996094974, + "learning_rate": 0.00015950814364375476, + "loss": 0.9669, + "step": 4325 + }, + { + "epoch": 0.9227815699658704, + "grad_norm": 0.4842017248399946, + "learning_rate": 0.0001595072010046196, + "loss": 0.9925, + "step": 4326 + }, + { + "epoch": 0.922994880546075, + "grad_norm": 0.5550589412465722, + "learning_rate": 0.0001595062574658591, + "loss": 0.99, + "step": 4327 + }, + { + "epoch": 0.9232081911262798, + "grad_norm": 0.6691776816135877, + "learning_rate": 0.000159505313027484, + "loss": 0.9682, + "step": 4328 + }, + { + "epoch": 0.9234215017064846, + "grad_norm": 0.8058489415683691, + "learning_rate": 0.0001595043676895049, + "loss": 0.9792, + "step": 4329 + }, + { + "epoch": 0.9236348122866894, + "grad_norm": 0.826195584070399, + "learning_rate": 0.00015950342145193256, + "loss": 0.9745, + "step": 4330 + }, + { + "epoch": 0.9238481228668942, + "grad_norm": 0.7860472091389327, + "learning_rate": 0.00015950247431477767, + "loss": 0.9569, + "step": 4331 + }, + { + "epoch": 0.924061433447099, + "grad_norm": 0.6481484447203272, + "learning_rate": 0.00015950152627805097, + "loss": 0.9819, + "step": 4332 + }, + { + "epoch": 0.9242747440273038, + "grad_norm": 0.5191549835250332, + "learning_rate": 0.00015950057734176313, + "loss": 0.9648, + "step": 4333 + }, + { + "epoch": 0.9244880546075085, + "grad_norm": 0.3448512796255316, + "learning_rate": 0.00015949962750592492, + "loss": 0.9647, + "step": 4334 + }, + { + "epoch": 0.9247013651877133, + "grad_norm": 0.4503663388531548, + "learning_rate": 0.0001594986767705471, + "loss": 0.9959, + "step": 4335 + }, + { + "epoch": 0.9249146757679181, + "grad_norm": 0.5825532495594139, + "learning_rate": 0.0001594977251356404, + "loss": 0.9689, + "step": 4336 + }, + { + "epoch": 0.9251279863481229, + "grad_norm": 0.658261467093005, + "learning_rate": 0.00015949677260121563, + "loss": 1.0082, + "step": 4337 + }, + { + "epoch": 0.9253412969283277, + "grad_norm": 0.6901701310917409, + "learning_rate": 0.00015949581916728352, + "loss": 0.9422, + "step": 4338 + }, + { + "epoch": 0.9255546075085325, + "grad_norm": 0.6019114345557636, + "learning_rate": 0.0001594948648338549, + "loss": 0.976, + "step": 4339 + }, + { + "epoch": 0.9257679180887372, + "grad_norm": 0.5563960008947482, + "learning_rate": 0.00015949390960094052, + "loss": 0.9941, + "step": 4340 + }, + { + "epoch": 0.925981228668942, + "grad_norm": 0.46422558224517363, + "learning_rate": 0.00015949295346855122, + "loss": 0.9902, + "step": 4341 + }, + { + "epoch": 0.9261945392491467, + "grad_norm": 0.42049944794731003, + "learning_rate": 0.00015949199643669785, + "loss": 0.9894, + "step": 4342 + }, + { + "epoch": 0.9264078498293515, + "grad_norm": 0.4248680406789056, + "learning_rate": 0.00015949103850539114, + "loss": 0.9617, + "step": 4343 + }, + { + "epoch": 0.9266211604095563, + "grad_norm": 0.43503697803400576, + "learning_rate": 0.00015949007967464205, + "loss": 0.9808, + "step": 4344 + }, + { + "epoch": 0.9268344709897611, + "grad_norm": 0.38379725246187113, + "learning_rate": 0.00015948911994446134, + "loss": 1.0241, + "step": 4345 + }, + { + "epoch": 0.9270477815699659, + "grad_norm": 0.42819633030481474, + "learning_rate": 0.0001594881593148599, + "loss": 0.9644, + "step": 4346 + }, + { + "epoch": 0.9272610921501706, + "grad_norm": 0.4617827107325041, + "learning_rate": 0.0001594871977858486, + "loss": 0.9575, + "step": 4347 + }, + { + "epoch": 0.9274744027303754, + "grad_norm": 0.5224772994932866, + "learning_rate": 0.00015948623535743837, + "loss": 0.9929, + "step": 4348 + }, + { + "epoch": 0.9276877133105802, + "grad_norm": 0.5349272679353352, + "learning_rate": 0.00015948527202963998, + "loss": 0.9989, + "step": 4349 + }, + { + "epoch": 0.927901023890785, + "grad_norm": 0.4892563145748345, + "learning_rate": 0.00015948430780246444, + "loss": 0.9645, + "step": 4350 + }, + { + "epoch": 0.9281143344709898, + "grad_norm": 0.4620831367383803, + "learning_rate": 0.0001594833426759226, + "loss": 0.9988, + "step": 4351 + }, + { + "epoch": 0.9283276450511946, + "grad_norm": 0.5322948422579429, + "learning_rate": 0.00015948237665002542, + "loss": 1.0038, + "step": 4352 + }, + { + "epoch": 0.9285409556313993, + "grad_norm": 0.5379267228607475, + "learning_rate": 0.00015948140972478378, + "loss": 0.975, + "step": 4353 + }, + { + "epoch": 0.9287542662116041, + "grad_norm": 0.4942889575805688, + "learning_rate": 0.00015948044190020868, + "loss": 0.9817, + "step": 4354 + }, + { + "epoch": 0.9289675767918089, + "grad_norm": 0.48619928165327964, + "learning_rate": 0.00015947947317631106, + "loss": 0.9627, + "step": 4355 + }, + { + "epoch": 0.9291808873720137, + "grad_norm": 0.478189490288788, + "learning_rate": 0.00015947850355310184, + "loss": 1.0, + "step": 4356 + }, + { + "epoch": 0.9293941979522184, + "grad_norm": 0.6903336518664315, + "learning_rate": 0.00015947753303059201, + "loss": 0.9764, + "step": 4357 + }, + { + "epoch": 0.9296075085324232, + "grad_norm": 0.5517810392953783, + "learning_rate": 0.00015947656160879257, + "loss": 0.9801, + "step": 4358 + }, + { + "epoch": 0.929820819112628, + "grad_norm": 0.5400402337326885, + "learning_rate": 0.00015947558928771448, + "loss": 0.9821, + "step": 4359 + }, + { + "epoch": 0.9300341296928327, + "grad_norm": 0.48823743855536317, + "learning_rate": 0.00015947461606736878, + "loss": 0.9827, + "step": 4360 + }, + { + "epoch": 0.9302474402730375, + "grad_norm": 0.4657879780106205, + "learning_rate": 0.00015947364194776644, + "loss": 0.9909, + "step": 4361 + }, + { + "epoch": 0.9304607508532423, + "grad_norm": 0.4276650132083599, + "learning_rate": 0.00015947266692891852, + "loss": 0.9324, + "step": 4362 + }, + { + "epoch": 0.9306740614334471, + "grad_norm": 0.4916797298727446, + "learning_rate": 0.00015947169101083604, + "loss": 0.9548, + "step": 4363 + }, + { + "epoch": 0.9308873720136519, + "grad_norm": 0.4704674403336712, + "learning_rate": 0.00015947071419353004, + "loss": 0.9623, + "step": 4364 + }, + { + "epoch": 0.9311006825938567, + "grad_norm": 0.47417136007619637, + "learning_rate": 0.00015946973647701157, + "loss": 0.9749, + "step": 4365 + }, + { + "epoch": 0.9313139931740614, + "grad_norm": 0.473113914355748, + "learning_rate": 0.0001594687578612917, + "loss": 0.985, + "step": 4366 + }, + { + "epoch": 0.9315273037542662, + "grad_norm": 0.44738392971427704, + "learning_rate": 0.00015946777834638148, + "loss": 0.9844, + "step": 4367 + }, + { + "epoch": 0.931740614334471, + "grad_norm": 0.4058609851249687, + "learning_rate": 0.00015946679793229203, + "loss": 0.9711, + "step": 4368 + }, + { + "epoch": 0.9319539249146758, + "grad_norm": 0.4206680156987823, + "learning_rate": 0.00015946581661903443, + "loss": 0.9871, + "step": 4369 + }, + { + "epoch": 0.9321672354948806, + "grad_norm": 0.539000214010355, + "learning_rate": 0.00015946483440661976, + "loss": 0.9784, + "step": 4370 + }, + { + "epoch": 0.9323805460750854, + "grad_norm": 0.5780907624973549, + "learning_rate": 0.00015946385129505915, + "loss": 0.9932, + "step": 4371 + }, + { + "epoch": 0.9325938566552902, + "grad_norm": 0.5257310373764683, + "learning_rate": 0.00015946286728436374, + "loss": 0.9868, + "step": 4372 + }, + { + "epoch": 0.9328071672354948, + "grad_norm": 0.481923989336714, + "learning_rate": 0.00015946188237454464, + "loss": 1.0075, + "step": 4373 + }, + { + "epoch": 0.9330204778156996, + "grad_norm": 0.46933151198546097, + "learning_rate": 0.000159460896565613, + "loss": 0.9452, + "step": 4374 + }, + { + "epoch": 0.9332337883959044, + "grad_norm": 0.623638704604704, + "learning_rate": 0.00015945990985758, + "loss": 1.0044, + "step": 4375 + }, + { + "epoch": 0.9334470989761092, + "grad_norm": 0.6792938962232767, + "learning_rate": 0.00015945892225045677, + "loss": 0.9731, + "step": 4376 + }, + { + "epoch": 0.933660409556314, + "grad_norm": 0.6313986339706618, + "learning_rate": 0.0001594579337442545, + "loss": 0.979, + "step": 4377 + }, + { + "epoch": 0.9338737201365188, + "grad_norm": 0.6904367455175933, + "learning_rate": 0.00015945694433898436, + "loss": 0.9837, + "step": 4378 + }, + { + "epoch": 0.9340870307167235, + "grad_norm": 0.7402390072009223, + "learning_rate": 0.00015945595403465756, + "loss": 0.9993, + "step": 4379 + }, + { + "epoch": 0.9343003412969283, + "grad_norm": 0.9184437047600182, + "learning_rate": 0.00015945496283128533, + "loss": 0.9722, + "step": 4380 + }, + { + "epoch": 0.9345136518771331, + "grad_norm": 1.1478989292911361, + "learning_rate": 0.00015945397072887884, + "loss": 0.9848, + "step": 4381 + }, + { + "epoch": 0.9347269624573379, + "grad_norm": 0.8155969586223011, + "learning_rate": 0.00015945297772744935, + "loss": 0.987, + "step": 4382 + }, + { + "epoch": 0.9349402730375427, + "grad_norm": 0.6549182295497186, + "learning_rate": 0.00015945198382700808, + "loss": 1.0005, + "step": 4383 + }, + { + "epoch": 0.9351535836177475, + "grad_norm": 0.8520182562000442, + "learning_rate": 0.00015945098902756625, + "loss": 0.9943, + "step": 4384 + }, + { + "epoch": 0.9353668941979523, + "grad_norm": 0.6547465079454736, + "learning_rate": 0.00015944999332913516, + "loss": 0.9796, + "step": 4385 + }, + { + "epoch": 0.935580204778157, + "grad_norm": 0.8417577455036033, + "learning_rate": 0.00015944899673172604, + "loss": 0.9886, + "step": 4386 + }, + { + "epoch": 0.9357935153583617, + "grad_norm": 0.9903633322261802, + "learning_rate": 0.0001594479992353502, + "loss": 0.9732, + "step": 4387 + }, + { + "epoch": 0.9360068259385665, + "grad_norm": 1.065789771841882, + "learning_rate": 0.00015944700084001892, + "loss": 0.9798, + "step": 4388 + }, + { + "epoch": 0.9362201365187713, + "grad_norm": 0.8356531609686008, + "learning_rate": 0.00015944600154574348, + "loss": 0.9686, + "step": 4389 + }, + { + "epoch": 0.9364334470989761, + "grad_norm": 0.8175062038297066, + "learning_rate": 0.00015944500135253522, + "loss": 0.976, + "step": 4390 + }, + { + "epoch": 0.9366467576791809, + "grad_norm": 0.7638296913498948, + "learning_rate": 0.00015944400026040543, + "loss": 0.9853, + "step": 4391 + }, + { + "epoch": 0.9368600682593856, + "grad_norm": 0.6913465663780739, + "learning_rate": 0.0001594429982693654, + "loss": 0.9753, + "step": 4392 + }, + { + "epoch": 0.9370733788395904, + "grad_norm": 0.6405916795304049, + "learning_rate": 0.00015944199537942653, + "loss": 0.9613, + "step": 4393 + }, + { + "epoch": 0.9372866894197952, + "grad_norm": 0.6939141216096077, + "learning_rate": 0.00015944099159060014, + "loss": 0.9977, + "step": 4394 + }, + { + "epoch": 0.9375, + "grad_norm": 0.8505758258396365, + "learning_rate": 0.0001594399869028976, + "loss": 0.9648, + "step": 4395 + }, + { + "epoch": 0.9377133105802048, + "grad_norm": 0.9946471953036137, + "learning_rate": 0.00015943898131633028, + "loss": 1.001, + "step": 4396 + }, + { + "epoch": 0.9379266211604096, + "grad_norm": 0.9655759219592414, + "learning_rate": 0.00015943797483090956, + "loss": 0.9846, + "step": 4397 + }, + { + "epoch": 0.9381399317406144, + "grad_norm": 0.8276582761913628, + "learning_rate": 0.0001594369674466468, + "loss": 1.0004, + "step": 4398 + }, + { + "epoch": 0.9383532423208191, + "grad_norm": 0.7113104477467607, + "learning_rate": 0.0001594359591635534, + "loss": 1.0312, + "step": 4399 + }, + { + "epoch": 0.9385665529010239, + "grad_norm": 0.6328579325034579, + "learning_rate": 0.00015943494998164082, + "loss": 0.9678, + "step": 4400 + }, + { + "epoch": 0.9387798634812287, + "grad_norm": 0.6058334359865953, + "learning_rate": 0.00015943393990092044, + "loss": 0.9555, + "step": 4401 + }, + { + "epoch": 0.9389931740614335, + "grad_norm": 0.6195256510868409, + "learning_rate": 0.00015943292892140368, + "loss": 1.0063, + "step": 4402 + }, + { + "epoch": 0.9392064846416383, + "grad_norm": 0.5347226788189134, + "learning_rate": 0.000159431917043102, + "loss": 1.005, + "step": 4403 + }, + { + "epoch": 0.939419795221843, + "grad_norm": 0.5085807601142843, + "learning_rate": 0.00015943090426602684, + "loss": 0.961, + "step": 4404 + }, + { + "epoch": 0.9396331058020477, + "grad_norm": 0.474796904278843, + "learning_rate": 0.00015942989059018967, + "loss": 0.9872, + "step": 4405 + }, + { + "epoch": 0.9398464163822525, + "grad_norm": 0.4954377384053715, + "learning_rate": 0.00015942887601560194, + "loss": 1.0032, + "step": 4406 + }, + { + "epoch": 0.9400597269624573, + "grad_norm": 0.5140365772163468, + "learning_rate": 0.00015942786054227515, + "loss": 0.989, + "step": 4407 + }, + { + "epoch": 0.9402730375426621, + "grad_norm": 0.49122786864779566, + "learning_rate": 0.00015942684417022076, + "loss": 0.9978, + "step": 4408 + }, + { + "epoch": 0.9404863481228669, + "grad_norm": 0.4651814254772113, + "learning_rate": 0.00015942582689945032, + "loss": 0.9901, + "step": 4409 + }, + { + "epoch": 0.9406996587030717, + "grad_norm": 0.4458723928377273, + "learning_rate": 0.00015942480872997532, + "loss": 0.9527, + "step": 4410 + }, + { + "epoch": 0.9409129692832765, + "grad_norm": 0.4903633975459651, + "learning_rate": 0.00015942378966180725, + "loss": 0.9688, + "step": 4411 + }, + { + "epoch": 0.9411262798634812, + "grad_norm": 0.4282193476391241, + "learning_rate": 0.00015942276969495766, + "loss": 0.9926, + "step": 4412 + }, + { + "epoch": 0.941339590443686, + "grad_norm": 0.41449490886396023, + "learning_rate": 0.0001594217488294381, + "loss": 0.9652, + "step": 4413 + }, + { + "epoch": 0.9415529010238908, + "grad_norm": 0.4667541082278772, + "learning_rate": 0.00015942072706526015, + "loss": 0.9892, + "step": 4414 + }, + { + "epoch": 0.9417662116040956, + "grad_norm": 0.44268243598353524, + "learning_rate": 0.0001594197044024353, + "loss": 0.9573, + "step": 4415 + }, + { + "epoch": 0.9419795221843004, + "grad_norm": 0.40802798389001516, + "learning_rate": 0.00015941868084097516, + "loss": 0.9831, + "step": 4416 + }, + { + "epoch": 0.9421928327645052, + "grad_norm": 0.4242059792233898, + "learning_rate": 0.0001594176563808913, + "loss": 0.9809, + "step": 4417 + }, + { + "epoch": 0.9424061433447098, + "grad_norm": 0.4141182768401314, + "learning_rate": 0.00015941663102219539, + "loss": 0.9907, + "step": 4418 + }, + { + "epoch": 0.9426194539249146, + "grad_norm": 0.34180152820549753, + "learning_rate": 0.00015941560476489888, + "loss": 0.9907, + "step": 4419 + }, + { + "epoch": 0.9428327645051194, + "grad_norm": 0.439007493529516, + "learning_rate": 0.00015941457760901352, + "loss": 0.9698, + "step": 4420 + }, + { + "epoch": 0.9430460750853242, + "grad_norm": 0.49757983914064174, + "learning_rate": 0.00015941354955455083, + "loss": 0.9534, + "step": 4421 + }, + { + "epoch": 0.943259385665529, + "grad_norm": 0.42289438508980387, + "learning_rate": 0.00015941252060152256, + "loss": 0.976, + "step": 4422 + }, + { + "epoch": 0.9434726962457338, + "grad_norm": 0.5120044969205001, + "learning_rate": 0.00015941149074994023, + "loss": 0.9813, + "step": 4423 + }, + { + "epoch": 0.9436860068259386, + "grad_norm": 0.6013654347711471, + "learning_rate": 0.00015941045999981555, + "loss": 0.9885, + "step": 4424 + }, + { + "epoch": 0.9438993174061433, + "grad_norm": 0.6129615195632006, + "learning_rate": 0.0001594094283511602, + "loss": 1.0118, + "step": 4425 + }, + { + "epoch": 0.9441126279863481, + "grad_norm": 0.4975171110935836, + "learning_rate": 0.00015940839580398582, + "loss": 0.9799, + "step": 4426 + }, + { + "epoch": 0.9443259385665529, + "grad_norm": 0.3607187945899603, + "learning_rate": 0.00015940736235830412, + "loss": 0.9461, + "step": 4427 + }, + { + "epoch": 0.9445392491467577, + "grad_norm": 0.3992195975606231, + "learning_rate": 0.00015940632801412678, + "loss": 0.9755, + "step": 4428 + }, + { + "epoch": 0.9447525597269625, + "grad_norm": 0.46118687228341126, + "learning_rate": 0.00015940529277146546, + "loss": 0.9637, + "step": 4429 + }, + { + "epoch": 0.9449658703071673, + "grad_norm": 0.46836485524660654, + "learning_rate": 0.00015940425663033198, + "loss": 0.9869, + "step": 4430 + }, + { + "epoch": 0.945179180887372, + "grad_norm": 0.39177415309109936, + "learning_rate": 0.00015940321959073795, + "loss": 0.9801, + "step": 4431 + }, + { + "epoch": 0.9453924914675768, + "grad_norm": 0.2887927119747625, + "learning_rate": 0.0001594021816526952, + "loss": 0.9532, + "step": 4432 + }, + { + "epoch": 0.9456058020477816, + "grad_norm": 0.378154374013537, + "learning_rate": 0.00015940114281621538, + "loss": 0.9985, + "step": 4433 + }, + { + "epoch": 0.9458191126279863, + "grad_norm": 0.46178566275659266, + "learning_rate": 0.0001594001030813103, + "loss": 0.949, + "step": 4434 + }, + { + "epoch": 0.9460324232081911, + "grad_norm": 0.4536227261183786, + "learning_rate": 0.00015939906244799172, + "loss": 0.9757, + "step": 4435 + }, + { + "epoch": 0.9462457337883959, + "grad_norm": 0.4485076870704947, + "learning_rate": 0.00015939802091627145, + "loss": 0.9714, + "step": 4436 + }, + { + "epoch": 0.9464590443686007, + "grad_norm": 0.5543961226871397, + "learning_rate": 0.00015939697848616123, + "loss": 0.9656, + "step": 4437 + }, + { + "epoch": 0.9466723549488054, + "grad_norm": 0.6271351696678328, + "learning_rate": 0.00015939593515767282, + "loss": 0.9889, + "step": 4438 + }, + { + "epoch": 0.9468856655290102, + "grad_norm": 0.66966586554657, + "learning_rate": 0.00015939489093081812, + "loss": 0.9811, + "step": 4439 + }, + { + "epoch": 0.947098976109215, + "grad_norm": 0.6553273603080093, + "learning_rate": 0.00015939384580560884, + "loss": 0.9782, + "step": 4440 + }, + { + "epoch": 0.9473122866894198, + "grad_norm": 0.6274497310028907, + "learning_rate": 0.00015939279978205691, + "loss": 0.9777, + "step": 4441 + }, + { + "epoch": 0.9475255972696246, + "grad_norm": 0.6643428634143859, + "learning_rate": 0.0001593917528601741, + "loss": 0.9743, + "step": 4442 + }, + { + "epoch": 0.9477389078498294, + "grad_norm": 0.6917921778036923, + "learning_rate": 0.00015939070503997224, + "loss": 1.0195, + "step": 4443 + }, + { + "epoch": 0.9479522184300341, + "grad_norm": 0.740814549456052, + "learning_rate": 0.00015938965632146326, + "loss": 0.9776, + "step": 4444 + }, + { + "epoch": 0.9481655290102389, + "grad_norm": 0.7433500653839606, + "learning_rate": 0.00015938860670465895, + "loss": 0.9999, + "step": 4445 + }, + { + "epoch": 0.9483788395904437, + "grad_norm": 0.6212523304049996, + "learning_rate": 0.00015938755618957124, + "loss": 0.9523, + "step": 4446 + }, + { + "epoch": 0.9485921501706485, + "grad_norm": 0.4720899946211255, + "learning_rate": 0.000159386504776212, + "loss": 0.9925, + "step": 4447 + }, + { + "epoch": 0.9488054607508533, + "grad_norm": 0.4193503456015506, + "learning_rate": 0.00015938545246459312, + "loss": 0.9862, + "step": 4448 + }, + { + "epoch": 0.949018771331058, + "grad_norm": 0.41005193220926284, + "learning_rate": 0.0001593843992547265, + "loss": 1.013, + "step": 4449 + }, + { + "epoch": 0.9492320819112628, + "grad_norm": 0.42522688083599647, + "learning_rate": 0.00015938334514662407, + "loss": 0.9999, + "step": 4450 + }, + { + "epoch": 0.9494453924914675, + "grad_norm": 0.4231541145321212, + "learning_rate": 0.00015938229014029776, + "loss": 0.964, + "step": 4451 + }, + { + "epoch": 0.9496587030716723, + "grad_norm": 0.4927548432903838, + "learning_rate": 0.0001593812342357595, + "loss": 0.9647, + "step": 4452 + }, + { + "epoch": 0.9498720136518771, + "grad_norm": 0.5399880860497791, + "learning_rate": 0.00015938017743302125, + "loss": 0.981, + "step": 4453 + }, + { + "epoch": 0.9500853242320819, + "grad_norm": 0.5338434270569427, + "learning_rate": 0.00015937911973209492, + "loss": 0.9949, + "step": 4454 + }, + { + "epoch": 0.9502986348122867, + "grad_norm": 0.6022835958328597, + "learning_rate": 0.00015937806113299255, + "loss": 0.9898, + "step": 4455 + }, + { + "epoch": 0.9505119453924915, + "grad_norm": 0.6707610603122329, + "learning_rate": 0.00015937700163572608, + "loss": 1.0012, + "step": 4456 + }, + { + "epoch": 0.9507252559726962, + "grad_norm": 0.5735053262713857, + "learning_rate": 0.00015937594124030747, + "loss": 0.9779, + "step": 4457 + }, + { + "epoch": 0.950938566552901, + "grad_norm": 0.6372178355392514, + "learning_rate": 0.0001593748799467488, + "loss": 0.9519, + "step": 4458 + }, + { + "epoch": 0.9511518771331058, + "grad_norm": 0.6731592378886987, + "learning_rate": 0.00015937381775506197, + "loss": 0.9943, + "step": 4459 + }, + { + "epoch": 0.9513651877133106, + "grad_norm": 0.6947485293777663, + "learning_rate": 0.0001593727546652591, + "loss": 0.9704, + "step": 4460 + }, + { + "epoch": 0.9515784982935154, + "grad_norm": 0.7007328309916158, + "learning_rate": 0.00015937169067735217, + "loss": 0.9525, + "step": 4461 + }, + { + "epoch": 0.9517918088737202, + "grad_norm": 0.6053281519290717, + "learning_rate": 0.0001593706257913532, + "loss": 1.0, + "step": 4462 + }, + { + "epoch": 0.952005119453925, + "grad_norm": 0.4318731837957087, + "learning_rate": 0.0001593695600072743, + "loss": 0.9723, + "step": 4463 + }, + { + "epoch": 0.9522184300341296, + "grad_norm": 0.37831832825325784, + "learning_rate": 0.00015936849332512748, + "loss": 0.9808, + "step": 4464 + }, + { + "epoch": 0.9524317406143344, + "grad_norm": 0.36754090573814296, + "learning_rate": 0.00015936742574492482, + "loss": 0.9863, + "step": 4465 + }, + { + "epoch": 0.9526450511945392, + "grad_norm": 0.3261476263988001, + "learning_rate": 0.00015936635726667838, + "loss": 0.9704, + "step": 4466 + }, + { + "epoch": 0.952858361774744, + "grad_norm": 0.37226830956852064, + "learning_rate": 0.0001593652878904003, + "loss": 0.9747, + "step": 4467 + }, + { + "epoch": 0.9530716723549488, + "grad_norm": 0.43302771575031396, + "learning_rate": 0.00015936421761610268, + "loss": 0.9867, + "step": 4468 + }, + { + "epoch": 0.9532849829351536, + "grad_norm": 0.569693201999225, + "learning_rate": 0.00015936314644379756, + "loss": 0.9884, + "step": 4469 + }, + { + "epoch": 0.9534982935153583, + "grad_norm": 0.6396050948577136, + "learning_rate": 0.0001593620743734971, + "loss": 0.9622, + "step": 4470 + }, + { + "epoch": 0.9537116040955631, + "grad_norm": 0.6179099749685354, + "learning_rate": 0.00015936100140521344, + "loss": 1.0117, + "step": 4471 + }, + { + "epoch": 0.9539249146757679, + "grad_norm": 0.5396855869150404, + "learning_rate": 0.00015935992753895876, + "loss": 1.021, + "step": 4472 + }, + { + "epoch": 0.9541382252559727, + "grad_norm": 0.4547415834566048, + "learning_rate": 0.00015935885277474512, + "loss": 0.994, + "step": 4473 + }, + { + "epoch": 0.9543515358361775, + "grad_norm": 0.39092588598434375, + "learning_rate": 0.00015935777711258473, + "loss": 0.9848, + "step": 4474 + }, + { + "epoch": 0.9545648464163823, + "grad_norm": 0.4034015471991292, + "learning_rate": 0.00015935670055248975, + "loss": 0.9631, + "step": 4475 + }, + { + "epoch": 0.9547781569965871, + "grad_norm": 0.36749202515019674, + "learning_rate": 0.0001593556230944724, + "loss": 0.9842, + "step": 4476 + }, + { + "epoch": 0.9549914675767918, + "grad_norm": 0.2881675976388795, + "learning_rate": 0.00015935454473854478, + "loss": 0.9857, + "step": 4477 + }, + { + "epoch": 0.9552047781569966, + "grad_norm": 0.3607749650918677, + "learning_rate": 0.0001593534654847192, + "loss": 0.9824, + "step": 4478 + }, + { + "epoch": 0.9554180887372014, + "grad_norm": 0.4800041165372719, + "learning_rate": 0.00015935238533300783, + "loss": 0.9937, + "step": 4479 + }, + { + "epoch": 0.9556313993174061, + "grad_norm": 0.5948700697262731, + "learning_rate": 0.00015935130428342285, + "loss": 0.9776, + "step": 4480 + }, + { + "epoch": 0.955844709897611, + "grad_norm": 0.6473853206923871, + "learning_rate": 0.00015935022233597656, + "loss": 0.9603, + "step": 4481 + }, + { + "epoch": 0.9560580204778157, + "grad_norm": 0.674158595659558, + "learning_rate": 0.00015934913949068117, + "loss": 1.0257, + "step": 4482 + }, + { + "epoch": 0.9562713310580204, + "grad_norm": 0.6582009872491892, + "learning_rate": 0.0001593480557475489, + "loss": 1.0011, + "step": 4483 + }, + { + "epoch": 0.9564846416382252, + "grad_norm": 0.6007838250730633, + "learning_rate": 0.00015934697110659205, + "loss": 0.9385, + "step": 4484 + }, + { + "epoch": 0.95669795221843, + "grad_norm": 0.5894299526046355, + "learning_rate": 0.00015934588556782293, + "loss": 0.9833, + "step": 4485 + }, + { + "epoch": 0.9569112627986348, + "grad_norm": 0.6283923290639288, + "learning_rate": 0.00015934479913125375, + "loss": 0.9918, + "step": 4486 + }, + { + "epoch": 0.9571245733788396, + "grad_norm": 0.6361135531161234, + "learning_rate": 0.0001593437117968968, + "loss": 1.0082, + "step": 4487 + }, + { + "epoch": 0.9573378839590444, + "grad_norm": 0.6338496883329527, + "learning_rate": 0.00015934262356476447, + "loss": 0.9703, + "step": 4488 + }, + { + "epoch": 0.9575511945392492, + "grad_norm": 0.6755060314196961, + "learning_rate": 0.000159341534434869, + "loss": 0.9884, + "step": 4489 + }, + { + "epoch": 0.9577645051194539, + "grad_norm": 0.6641119805169203, + "learning_rate": 0.00015934044440722272, + "loss": 1.0061, + "step": 4490 + }, + { + "epoch": 0.9579778156996587, + "grad_norm": 0.5867950691782385, + "learning_rate": 0.000159339353481838, + "loss": 0.9866, + "step": 4491 + }, + { + "epoch": 0.9581911262798635, + "grad_norm": 0.8232656824290754, + "learning_rate": 0.00015933826165872713, + "loss": 0.9593, + "step": 4492 + }, + { + "epoch": 0.9584044368600683, + "grad_norm": 1.0406464916941756, + "learning_rate": 0.00015933716893790252, + "loss": 0.9629, + "step": 4493 + }, + { + "epoch": 0.9586177474402731, + "grad_norm": 0.8831444753569885, + "learning_rate": 0.00015933607531937652, + "loss": 0.9786, + "step": 4494 + }, + { + "epoch": 0.9588310580204779, + "grad_norm": 0.6624653837618767, + "learning_rate": 0.00015933498080316147, + "loss": 1.0014, + "step": 4495 + }, + { + "epoch": 0.9590443686006825, + "grad_norm": 0.9240074475890284, + "learning_rate": 0.00015933388538926977, + "loss": 0.9727, + "step": 4496 + }, + { + "epoch": 0.9592576791808873, + "grad_norm": 0.5905737006489198, + "learning_rate": 0.00015933278907771385, + "loss": 0.985, + "step": 4497 + }, + { + "epoch": 0.9594709897610921, + "grad_norm": 0.5294795310506941, + "learning_rate": 0.00015933169186850606, + "loss": 1.0285, + "step": 4498 + }, + { + "epoch": 0.9596843003412969, + "grad_norm": 0.4845699145406752, + "learning_rate": 0.00015933059376165888, + "loss": 0.9697, + "step": 4499 + }, + { + "epoch": 0.9598976109215017, + "grad_norm": 0.4709463769350625, + "learning_rate": 0.00015932949475718468, + "loss": 0.9951, + "step": 4500 + }, + { + "epoch": 0.9601109215017065, + "grad_norm": 0.5463836756089726, + "learning_rate": 0.0001593283948550959, + "loss": 1.0024, + "step": 4501 + }, + { + "epoch": 0.9603242320819113, + "grad_norm": 0.4900935818900955, + "learning_rate": 0.000159327294055405, + "loss": 0.9742, + "step": 4502 + }, + { + "epoch": 0.960537542662116, + "grad_norm": 0.3535232337401767, + "learning_rate": 0.00015932619235812444, + "loss": 0.9787, + "step": 4503 + }, + { + "epoch": 0.9607508532423208, + "grad_norm": 0.44318470440911506, + "learning_rate": 0.00015932508976326668, + "loss": 0.9854, + "step": 4504 + }, + { + "epoch": 0.9609641638225256, + "grad_norm": 0.504800194037242, + "learning_rate": 0.0001593239862708442, + "loss": 0.9728, + "step": 4505 + }, + { + "epoch": 0.9611774744027304, + "grad_norm": 0.48359248212862094, + "learning_rate": 0.00015932288188086945, + "loss": 0.9839, + "step": 4506 + }, + { + "epoch": 0.9613907849829352, + "grad_norm": 0.5180758707249553, + "learning_rate": 0.000159321776593355, + "loss": 0.9701, + "step": 4507 + }, + { + "epoch": 0.96160409556314, + "grad_norm": 0.6390060259592, + "learning_rate": 0.0001593206704083133, + "loss": 1.0209, + "step": 4508 + }, + { + "epoch": 0.9618174061433447, + "grad_norm": 0.4688848824718933, + "learning_rate": 0.00015931956332575686, + "loss": 1.0041, + "step": 4509 + }, + { + "epoch": 0.9620307167235495, + "grad_norm": 0.6127457490881167, + "learning_rate": 0.00015931845534569825, + "loss": 0.9741, + "step": 4510 + }, + { + "epoch": 0.9622440273037542, + "grad_norm": 0.6252231265910339, + "learning_rate": 0.00015931734646814995, + "loss": 0.9903, + "step": 4511 + }, + { + "epoch": 0.962457337883959, + "grad_norm": 0.5797269673854863, + "learning_rate": 0.00015931623669312458, + "loss": 0.9566, + "step": 4512 + }, + { + "epoch": 0.9626706484641638, + "grad_norm": 0.6146794447077609, + "learning_rate": 0.00015931512602063464, + "loss": 0.9641, + "step": 4513 + }, + { + "epoch": 0.9628839590443686, + "grad_norm": 0.6047797874910509, + "learning_rate": 0.0001593140144506927, + "loss": 0.9819, + "step": 4514 + }, + { + "epoch": 0.9630972696245734, + "grad_norm": 0.5927630769445921, + "learning_rate": 0.0001593129019833114, + "loss": 0.9549, + "step": 4515 + }, + { + "epoch": 0.9633105802047781, + "grad_norm": 0.6867488698376555, + "learning_rate": 0.00015931178861850324, + "loss": 0.993, + "step": 4516 + }, + { + "epoch": 0.9635238907849829, + "grad_norm": 0.8068530062824603, + "learning_rate": 0.00015931067435628088, + "loss": 1.0074, + "step": 4517 + }, + { + "epoch": 0.9637372013651877, + "grad_norm": 0.9901025497775994, + "learning_rate": 0.0001593095591966569, + "loss": 1.0035, + "step": 4518 + }, + { + "epoch": 0.9639505119453925, + "grad_norm": 1.0796637728824654, + "learning_rate": 0.00015930844313964392, + "loss": 0.9634, + "step": 4519 + }, + { + "epoch": 0.9641638225255973, + "grad_norm": 0.6886294731369044, + "learning_rate": 0.00015930732618525456, + "loss": 0.9753, + "step": 4520 + }, + { + "epoch": 0.9643771331058021, + "grad_norm": 0.4629038389648484, + "learning_rate": 0.00015930620833350145, + "loss": 0.9682, + "step": 4521 + }, + { + "epoch": 0.9645904436860068, + "grad_norm": 0.39461486984519184, + "learning_rate": 0.0001593050895843973, + "loss": 0.985, + "step": 4522 + }, + { + "epoch": 0.9648037542662116, + "grad_norm": 0.4479396254469732, + "learning_rate": 0.0001593039699379547, + "loss": 0.9875, + "step": 4523 + }, + { + "epoch": 0.9650170648464164, + "grad_norm": 0.5357910363111721, + "learning_rate": 0.0001593028493941864, + "loss": 0.9649, + "step": 4524 + }, + { + "epoch": 0.9652303754266212, + "grad_norm": 0.6663427411582467, + "learning_rate": 0.00015930172795310498, + "loss": 1.018, + "step": 4525 + }, + { + "epoch": 0.965443686006826, + "grad_norm": 0.6762166202610564, + "learning_rate": 0.00015930060561472316, + "loss": 0.9623, + "step": 4526 + }, + { + "epoch": 0.9656569965870307, + "grad_norm": 0.6619678058324974, + "learning_rate": 0.0001592994823790537, + "loss": 0.9845, + "step": 4527 + }, + { + "epoch": 0.9658703071672355, + "grad_norm": 0.6600881183102466, + "learning_rate": 0.00015929835824610923, + "loss": 0.9925, + "step": 4528 + }, + { + "epoch": 0.9660836177474402, + "grad_norm": 0.6497781478542858, + "learning_rate": 0.0001592972332159025, + "loss": 0.9967, + "step": 4529 + }, + { + "epoch": 0.966296928327645, + "grad_norm": 0.650535301892443, + "learning_rate": 0.00015929610728844626, + "loss": 0.9537, + "step": 4530 + }, + { + "epoch": 0.9665102389078498, + "grad_norm": 0.556572129469436, + "learning_rate": 0.00015929498046375322, + "loss": 0.9661, + "step": 4531 + }, + { + "epoch": 0.9667235494880546, + "grad_norm": 0.48747707286547076, + "learning_rate": 0.00015929385274183615, + "loss": 0.9743, + "step": 4532 + }, + { + "epoch": 0.9669368600682594, + "grad_norm": 0.4117897076712269, + "learning_rate": 0.0001592927241227078, + "loss": 0.9738, + "step": 4533 + }, + { + "epoch": 0.9671501706484642, + "grad_norm": 0.4312233659070147, + "learning_rate": 0.00015929159460638094, + "loss": 0.9824, + "step": 4534 + }, + { + "epoch": 0.9673634812286689, + "grad_norm": 0.40606155428691565, + "learning_rate": 0.00015929046419286836, + "loss": 0.9787, + "step": 4535 + }, + { + "epoch": 0.9675767918088737, + "grad_norm": 0.4310766511724631, + "learning_rate": 0.00015928933288218282, + "loss": 0.9632, + "step": 4536 + }, + { + "epoch": 0.9677901023890785, + "grad_norm": 0.4741550770159318, + "learning_rate": 0.00015928820067433717, + "loss": 0.9996, + "step": 4537 + }, + { + "epoch": 0.9680034129692833, + "grad_norm": 0.5057132225666154, + "learning_rate": 0.0001592870675693442, + "loss": 1.0139, + "step": 4538 + }, + { + "epoch": 0.9682167235494881, + "grad_norm": 0.497463175006166, + "learning_rate": 0.00015928593356721673, + "loss": 1.0016, + "step": 4539 + }, + { + "epoch": 0.9684300341296929, + "grad_norm": 0.4525767849482682, + "learning_rate": 0.00015928479866796755, + "loss": 1.0069, + "step": 4540 + }, + { + "epoch": 0.9686433447098977, + "grad_norm": 0.4559926216924115, + "learning_rate": 0.00015928366287160957, + "loss": 0.9695, + "step": 4541 + }, + { + "epoch": 0.9688566552901023, + "grad_norm": 0.44405939180147896, + "learning_rate": 0.0001592825261781556, + "loss": 0.9955, + "step": 4542 + }, + { + "epoch": 0.9690699658703071, + "grad_norm": 0.3605631771510931, + "learning_rate": 0.00015928138858761854, + "loss": 0.9888, + "step": 4543 + }, + { + "epoch": 0.9692832764505119, + "grad_norm": 0.36228143620919634, + "learning_rate": 0.00015928025010001118, + "loss": 1.0248, + "step": 4544 + }, + { + "epoch": 0.9694965870307167, + "grad_norm": 0.39299405653519004, + "learning_rate": 0.00015927911071534652, + "loss": 0.9952, + "step": 4545 + }, + { + "epoch": 0.9697098976109215, + "grad_norm": 0.41866621103637036, + "learning_rate": 0.00015927797043363735, + "loss": 0.9479, + "step": 4546 + }, + { + "epoch": 0.9699232081911263, + "grad_norm": 0.44252352522649546, + "learning_rate": 0.00015927682925489663, + "loss": 0.9606, + "step": 4547 + }, + { + "epoch": 0.9701365187713311, + "grad_norm": 0.4082465039952664, + "learning_rate": 0.00015927568717913722, + "loss": 0.9402, + "step": 4548 + }, + { + "epoch": 0.9703498293515358, + "grad_norm": 0.3966839927221166, + "learning_rate": 0.0001592745442063721, + "loss": 0.9838, + "step": 4549 + }, + { + "epoch": 0.9705631399317406, + "grad_norm": 0.42299038442598247, + "learning_rate": 0.00015927340033661418, + "loss": 0.946, + "step": 4550 + }, + { + "epoch": 0.9707764505119454, + "grad_norm": 0.40001092177023034, + "learning_rate": 0.00015927225556987642, + "loss": 0.9722, + "step": 4551 + }, + { + "epoch": 0.9709897610921502, + "grad_norm": 0.3760049310895169, + "learning_rate": 0.00015927110990617173, + "loss": 0.9721, + "step": 4552 + }, + { + "epoch": 0.971203071672355, + "grad_norm": 0.3938441202465464, + "learning_rate": 0.00015926996334551308, + "loss": 0.9937, + "step": 4553 + }, + { + "epoch": 0.9714163822525598, + "grad_norm": 0.33610015190351344, + "learning_rate": 0.0001592688158879135, + "loss": 0.9729, + "step": 4554 + }, + { + "epoch": 0.9716296928327645, + "grad_norm": 0.2740479712585741, + "learning_rate": 0.00015926766753338593, + "loss": 0.9813, + "step": 4555 + }, + { + "epoch": 0.9718430034129693, + "grad_norm": 0.3190459955568631, + "learning_rate": 0.00015926651828194335, + "loss": 1.0032, + "step": 4556 + }, + { + "epoch": 0.972056313993174, + "grad_norm": 0.43017162318360025, + "learning_rate": 0.0001592653681335988, + "loss": 0.9602, + "step": 4557 + }, + { + "epoch": 0.9722696245733788, + "grad_norm": 0.6267291610809058, + "learning_rate": 0.00015926421708836527, + "loss": 1.0103, + "step": 4558 + }, + { + "epoch": 0.9724829351535836, + "grad_norm": 0.8276879940788746, + "learning_rate": 0.00015926306514625578, + "loss": 0.9661, + "step": 4559 + }, + { + "epoch": 0.9726962457337884, + "grad_norm": 1.0223604072527843, + "learning_rate": 0.00015926191230728335, + "loss": 0.9923, + "step": 4560 + }, + { + "epoch": 0.9729095563139932, + "grad_norm": 1.062312865898941, + "learning_rate": 0.00015926075857146112, + "loss": 1.0281, + "step": 4561 + }, + { + "epoch": 0.9731228668941979, + "grad_norm": 0.8212279747022566, + "learning_rate": 0.00015925960393880202, + "loss": 0.9753, + "step": 4562 + }, + { + "epoch": 0.9733361774744027, + "grad_norm": 0.6948712015429946, + "learning_rate": 0.00015925844840931918, + "loss": 0.9867, + "step": 4563 + }, + { + "epoch": 0.9735494880546075, + "grad_norm": 0.5839862079851905, + "learning_rate": 0.00015925729198302567, + "loss": 0.9598, + "step": 4564 + }, + { + "epoch": 0.9737627986348123, + "grad_norm": 0.4305689013499268, + "learning_rate": 0.00015925613465993455, + "loss": 0.9543, + "step": 4565 + }, + { + "epoch": 0.9739761092150171, + "grad_norm": 0.41681396370920226, + "learning_rate": 0.00015925497644005897, + "loss": 0.9628, + "step": 4566 + }, + { + "epoch": 0.9741894197952219, + "grad_norm": 0.48325553123860876, + "learning_rate": 0.00015925381732341195, + "loss": 0.9582, + "step": 4567 + }, + { + "epoch": 0.9744027303754266, + "grad_norm": 0.5350019132128814, + "learning_rate": 0.0001592526573100067, + "loss": 0.9768, + "step": 4568 + }, + { + "epoch": 0.9746160409556314, + "grad_norm": 0.5874215794767864, + "learning_rate": 0.0001592514963998563, + "loss": 0.9961, + "step": 4569 + }, + { + "epoch": 0.9748293515358362, + "grad_norm": 0.5989989635013091, + "learning_rate": 0.00015925033459297382, + "loss": 0.9389, + "step": 4570 + }, + { + "epoch": 0.975042662116041, + "grad_norm": 0.6325613402147285, + "learning_rate": 0.0001592491718893725, + "loss": 1.0013, + "step": 4571 + }, + { + "epoch": 0.9752559726962458, + "grad_norm": 0.6498351100160937, + "learning_rate": 0.0001592480082890655, + "loss": 0.9694, + "step": 4572 + }, + { + "epoch": 0.9754692832764505, + "grad_norm": 0.6473652011285378, + "learning_rate": 0.00015924684379206594, + "loss": 0.9969, + "step": 4573 + }, + { + "epoch": 0.9756825938566553, + "grad_norm": 0.5413693910549885, + "learning_rate": 0.000159245678398387, + "loss": 0.9714, + "step": 4574 + }, + { + "epoch": 0.97589590443686, + "grad_norm": 0.4931532360962717, + "learning_rate": 0.00015924451210804188, + "loss": 0.9833, + "step": 4575 + }, + { + "epoch": 0.9761092150170648, + "grad_norm": 0.45722294077089, + "learning_rate": 0.0001592433449210438, + "loss": 0.9748, + "step": 4576 + }, + { + "epoch": 0.9763225255972696, + "grad_norm": 0.40151017941430794, + "learning_rate": 0.0001592421768374059, + "loss": 0.9681, + "step": 4577 + }, + { + "epoch": 0.9765358361774744, + "grad_norm": 0.41129418604108603, + "learning_rate": 0.00015924100785714144, + "loss": 0.9635, + "step": 4578 + }, + { + "epoch": 0.9767491467576792, + "grad_norm": 0.538244387882154, + "learning_rate": 0.00015923983798026364, + "loss": 0.9623, + "step": 4579 + }, + { + "epoch": 0.976962457337884, + "grad_norm": 0.6421568243188247, + "learning_rate": 0.00015923866720678577, + "loss": 1.0114, + "step": 4580 + }, + { + "epoch": 0.9771757679180887, + "grad_norm": 0.7050567310047068, + "learning_rate": 0.000159237495536721, + "loss": 0.9253, + "step": 4581 + }, + { + "epoch": 0.9773890784982935, + "grad_norm": 0.6239573802218865, + "learning_rate": 0.0001592363229700827, + "loss": 0.9678, + "step": 4582 + }, + { + "epoch": 0.9776023890784983, + "grad_norm": 0.5179817910256019, + "learning_rate": 0.00015923514950688403, + "loss": 0.9788, + "step": 4583 + }, + { + "epoch": 0.9778156996587031, + "grad_norm": 0.4911781717805279, + "learning_rate": 0.0001592339751471383, + "loss": 0.9731, + "step": 4584 + }, + { + "epoch": 0.9780290102389079, + "grad_norm": 0.4251331758435665, + "learning_rate": 0.00015923279989085885, + "loss": 0.9406, + "step": 4585 + }, + { + "epoch": 0.9782423208191127, + "grad_norm": 0.3373164729162721, + "learning_rate": 0.00015923162373805892, + "loss": 0.9823, + "step": 4586 + }, + { + "epoch": 0.9784556313993175, + "grad_norm": 0.35814022558786107, + "learning_rate": 0.0001592304466887518, + "loss": 0.9827, + "step": 4587 + }, + { + "epoch": 0.9786689419795221, + "grad_norm": 0.3643970234724253, + "learning_rate": 0.00015922926874295087, + "loss": 0.9981, + "step": 4588 + }, + { + "epoch": 0.9788822525597269, + "grad_norm": 0.3511566602764474, + "learning_rate": 0.00015922808990066946, + "loss": 0.9868, + "step": 4589 + }, + { + "epoch": 0.9790955631399317, + "grad_norm": 0.35719393521452797, + "learning_rate": 0.00015922691016192084, + "loss": 0.9533, + "step": 4590 + }, + { + "epoch": 0.9793088737201365, + "grad_norm": 0.30213317004865364, + "learning_rate": 0.0001592257295267184, + "loss": 0.9704, + "step": 4591 + }, + { + "epoch": 0.9795221843003413, + "grad_norm": 0.4165386514859645, + "learning_rate": 0.00015922454799507552, + "loss": 0.98, + "step": 4592 + }, + { + "epoch": 0.9797354948805461, + "grad_norm": 0.4630191353164534, + "learning_rate": 0.00015922336556700554, + "loss": 1.0068, + "step": 4593 + }, + { + "epoch": 0.9799488054607508, + "grad_norm": 0.39383867363269875, + "learning_rate": 0.00015922218224252184, + "loss": 0.9533, + "step": 4594 + }, + { + "epoch": 0.9801621160409556, + "grad_norm": 0.40487927279539815, + "learning_rate": 0.00015922099802163784, + "loss": 1.0145, + "step": 4595 + }, + { + "epoch": 0.9803754266211604, + "grad_norm": 0.3811894254949082, + "learning_rate": 0.0001592198129043669, + "loss": 0.968, + "step": 4596 + }, + { + "epoch": 0.9805887372013652, + "grad_norm": 0.40747168115129234, + "learning_rate": 0.00015921862689072242, + "loss": 0.9706, + "step": 4597 + }, + { + "epoch": 0.98080204778157, + "grad_norm": 0.4206146172397865, + "learning_rate": 0.00015921743998071784, + "loss": 0.9704, + "step": 4598 + }, + { + "epoch": 0.9810153583617748, + "grad_norm": 0.4887957822007, + "learning_rate": 0.00015921625217436662, + "loss": 1.0088, + "step": 4599 + }, + { + "epoch": 0.9812286689419796, + "grad_norm": 0.5257179283816428, + "learning_rate": 0.0001592150634716822, + "loss": 0.9747, + "step": 4600 + }, + { + "epoch": 0.9814419795221843, + "grad_norm": 0.4791041209204971, + "learning_rate": 0.00015921387387267795, + "loss": 0.9728, + "step": 4601 + }, + { + "epoch": 0.981655290102389, + "grad_norm": 0.3908574134768045, + "learning_rate": 0.0001592126833773674, + "loss": 0.9529, + "step": 4602 + }, + { + "epoch": 0.9818686006825939, + "grad_norm": 0.4343783583183416, + "learning_rate": 0.00015921149198576404, + "loss": 0.9517, + "step": 4603 + }, + { + "epoch": 0.9820819112627986, + "grad_norm": 0.38696419969988016, + "learning_rate": 0.00015921029969788127, + "loss": 0.9383, + "step": 4604 + }, + { + "epoch": 0.9822952218430034, + "grad_norm": 0.3379853484386546, + "learning_rate": 0.00015920910651373265, + "loss": 0.9761, + "step": 4605 + }, + { + "epoch": 0.9825085324232082, + "grad_norm": 0.4696702509587119, + "learning_rate": 0.00015920791243333166, + "loss": 0.9599, + "step": 4606 + }, + { + "epoch": 0.9827218430034129, + "grad_norm": 0.5553240032497698, + "learning_rate": 0.0001592067174566918, + "loss": 0.9826, + "step": 4607 + }, + { + "epoch": 0.9829351535836177, + "grad_norm": 0.48629704164449683, + "learning_rate": 0.0001592055215838266, + "loss": 0.9709, + "step": 4608 + }, + { + "epoch": 0.9831484641638225, + "grad_norm": 0.3831668456155139, + "learning_rate": 0.0001592043248147496, + "loss": 0.9646, + "step": 4609 + }, + { + "epoch": 0.9833617747440273, + "grad_norm": 0.4432839887829975, + "learning_rate": 0.00015920312714947434, + "loss": 0.9558, + "step": 4610 + }, + { + "epoch": 0.9835750853242321, + "grad_norm": 0.5438045008388174, + "learning_rate": 0.00015920192858801433, + "loss": 0.9578, + "step": 4611 + }, + { + "epoch": 0.9837883959044369, + "grad_norm": 0.6005390387415968, + "learning_rate": 0.0001592007291303832, + "loss": 0.9451, + "step": 4612 + }, + { + "epoch": 0.9840017064846417, + "grad_norm": 0.5892215501316022, + "learning_rate": 0.00015919952877659446, + "loss": 0.9432, + "step": 4613 + }, + { + "epoch": 0.9842150170648464, + "grad_norm": 1.8804600445214348, + "learning_rate": 0.00015919832752666173, + "loss": 1.0305, + "step": 4614 + }, + { + "epoch": 0.9844283276450512, + "grad_norm": 0.5606109099424088, + "learning_rate": 0.0001591971253805986, + "loss": 0.977, + "step": 4615 + }, + { + "epoch": 0.984641638225256, + "grad_norm": 1.215435512861518, + "learning_rate": 0.00015919592233841866, + "loss": 0.9897, + "step": 4616 + }, + { + "epoch": 0.9848549488054608, + "grad_norm": 1.1728184411253335, + "learning_rate": 0.0001591947184001355, + "loss": 0.9917, + "step": 4617 + }, + { + "epoch": 0.9850682593856656, + "grad_norm": 0.9543057389287974, + "learning_rate": 0.00015919351356576282, + "loss": 0.9804, + "step": 4618 + }, + { + "epoch": 0.9852815699658704, + "grad_norm": 1.2504355895890797, + "learning_rate": 0.00015919230783531418, + "loss": 0.9683, + "step": 4619 + }, + { + "epoch": 0.985494880546075, + "grad_norm": 1.0002717100146126, + "learning_rate": 0.00015919110120880323, + "loss": 0.9992, + "step": 4620 + }, + { + "epoch": 0.9857081911262798, + "grad_norm": 1.4402628154576653, + "learning_rate": 0.00015918989368624367, + "loss": 0.9639, + "step": 4621 + }, + { + "epoch": 0.9859215017064846, + "grad_norm": 0.6836441053954525, + "learning_rate": 0.0001591886852676491, + "loss": 0.9658, + "step": 4622 + }, + { + "epoch": 0.9861348122866894, + "grad_norm": 1.4301373258944556, + "learning_rate": 0.00015918747595303323, + "loss": 1.0406, + "step": 4623 + }, + { + "epoch": 0.9863481228668942, + "grad_norm": 0.6207714426251292, + "learning_rate": 0.00015918626574240973, + "loss": 1.0011, + "step": 4624 + }, + { + "epoch": 0.986561433447099, + "grad_norm": 1.428260967767891, + "learning_rate": 0.00015918505463579234, + "loss": 0.9944, + "step": 4625 + }, + { + "epoch": 0.9867747440273038, + "grad_norm": 0.7155727028940363, + "learning_rate": 0.0001591838426331947, + "loss": 0.9739, + "step": 4626 + }, + { + "epoch": 0.9869880546075085, + "grad_norm": 1.2944400998592398, + "learning_rate": 0.00015918262973463053, + "loss": 0.991, + "step": 4627 + }, + { + "epoch": 0.9872013651877133, + "grad_norm": 0.7416330396024206, + "learning_rate": 0.0001591814159401136, + "loss": 0.9801, + "step": 4628 + }, + { + "epoch": 0.9874146757679181, + "grad_norm": 1.2814680626259352, + "learning_rate": 0.0001591802012496576, + "loss": 0.9843, + "step": 4629 + }, + { + "epoch": 0.9876279863481229, + "grad_norm": 0.7033048369136444, + "learning_rate": 0.0001591789856632763, + "loss": 0.9753, + "step": 4630 + }, + { + "epoch": 0.9878412969283277, + "grad_norm": 0.8353439017460231, + "learning_rate": 0.00015917776918098346, + "loss": 0.9911, + "step": 4631 + }, + { + "epoch": 0.9880546075085325, + "grad_norm": 0.6384981444865627, + "learning_rate": 0.0001591765518027928, + "loss": 0.9731, + "step": 4632 + }, + { + "epoch": 0.9882679180887372, + "grad_norm": 0.6553231257238628, + "learning_rate": 0.00015917533352871813, + "loss": 1.0093, + "step": 4633 + }, + { + "epoch": 0.988481228668942, + "grad_norm": 0.7353109995544551, + "learning_rate": 0.00015917411435877326, + "loss": 0.9781, + "step": 4634 + }, + { + "epoch": 0.9886945392491467, + "grad_norm": 0.719798451074493, + "learning_rate": 0.0001591728942929719, + "loss": 0.9569, + "step": 4635 + }, + { + "epoch": 0.9889078498293515, + "grad_norm": 0.5608986398808408, + "learning_rate": 0.00015917167333132795, + "loss": 0.9753, + "step": 4636 + }, + { + "epoch": 0.9891211604095563, + "grad_norm": 0.7434467792640334, + "learning_rate": 0.00015917045147385516, + "loss": 0.9876, + "step": 4637 + }, + { + "epoch": 0.9893344709897611, + "grad_norm": 0.6768981733018588, + "learning_rate": 0.00015916922872056744, + "loss": 0.9734, + "step": 4638 + }, + { + "epoch": 0.9895477815699659, + "grad_norm": 0.7337195066339999, + "learning_rate": 0.00015916800507147848, + "loss": 1.0131, + "step": 4639 + }, + { + "epoch": 0.9897610921501706, + "grad_norm": 0.7508167964002712, + "learning_rate": 0.00015916678052660225, + "loss": 0.9798, + "step": 4640 + }, + { + "epoch": 0.9899744027303754, + "grad_norm": 0.5697795970068986, + "learning_rate": 0.00015916555508595256, + "loss": 1.0044, + "step": 4641 + }, + { + "epoch": 0.9901877133105802, + "grad_norm": 1.869839395500576, + "learning_rate": 0.00015916432874954332, + "loss": 0.9967, + "step": 4642 + }, + { + "epoch": 0.990401023890785, + "grad_norm": 0.5903460993745533, + "learning_rate": 0.0001591631015173883, + "loss": 1.0005, + "step": 4643 + }, + { + "epoch": 0.9906143344709898, + "grad_norm": 1.1519747974953598, + "learning_rate": 0.00015916187338950154, + "loss": 0.9869, + "step": 4644 + }, + { + "epoch": 0.9908276450511946, + "grad_norm": 1.2577388250576, + "learning_rate": 0.0001591606443658968, + "loss": 0.9981, + "step": 4645 + }, + { + "epoch": 0.9910409556313993, + "grad_norm": 0.7560513256084582, + "learning_rate": 0.00015915941444658805, + "loss": 1.0252, + "step": 4646 + }, + { + "epoch": 0.9912542662116041, + "grad_norm": 0.7931279805290741, + "learning_rate": 0.00015915818363158917, + "loss": 0.9922, + "step": 4647 + }, + { + "epoch": 0.9914675767918089, + "grad_norm": 0.9824242169848394, + "learning_rate": 0.00015915695192091414, + "loss": 1.0018, + "step": 4648 + }, + { + "epoch": 0.9916808873720137, + "grad_norm": 1.1422200590852738, + "learning_rate": 0.00015915571931457686, + "loss": 0.9683, + "step": 4649 + }, + { + "epoch": 0.9918941979522184, + "grad_norm": 0.7432286544482586, + "learning_rate": 0.00015915448581259125, + "loss": 0.9711, + "step": 4650 + }, + { + "epoch": 0.9921075085324232, + "grad_norm": 0.6145713669251276, + "learning_rate": 0.00015915325141497135, + "loss": 0.9849, + "step": 4651 + }, + { + "epoch": 0.992320819112628, + "grad_norm": 0.722118563512428, + "learning_rate": 0.00015915201612173104, + "loss": 1.0017, + "step": 4652 + }, + { + "epoch": 0.9925341296928327, + "grad_norm": 0.8292620138702594, + "learning_rate": 0.00015915077993288435, + "loss": 0.9724, + "step": 4653 + }, + { + "epoch": 0.9927474402730375, + "grad_norm": 0.7521202968047118, + "learning_rate": 0.0001591495428484453, + "loss": 0.9555, + "step": 4654 + }, + { + "epoch": 0.9929607508532423, + "grad_norm": 0.7119047695098065, + "learning_rate": 0.0001591483048684278, + "loss": 0.9557, + "step": 4655 + }, + { + "epoch": 0.9931740614334471, + "grad_norm": 0.5843925392625342, + "learning_rate": 0.00015914706599284587, + "loss": 0.9629, + "step": 4656 + }, + { + "epoch": 0.9933873720136519, + "grad_norm": 0.42078808793404787, + "learning_rate": 0.0001591458262217136, + "loss": 0.9958, + "step": 4657 + }, + { + "epoch": 0.9936006825938567, + "grad_norm": 0.5625077188325768, + "learning_rate": 0.00015914458555504497, + "loss": 0.9939, + "step": 4658 + }, + { + "epoch": 0.9938139931740614, + "grad_norm": 0.5518807249160107, + "learning_rate": 0.00015914334399285402, + "loss": 1.0006, + "step": 4659 + }, + { + "epoch": 0.9940273037542662, + "grad_norm": 0.5957618264925125, + "learning_rate": 0.00015914210153515477, + "loss": 0.9646, + "step": 4660 + }, + { + "epoch": 0.994240614334471, + "grad_norm": 0.5654209973923057, + "learning_rate": 0.00015914085818196136, + "loss": 0.9699, + "step": 4661 + }, + { + "epoch": 0.9944539249146758, + "grad_norm": 0.4794152562014573, + "learning_rate": 0.0001591396139332878, + "loss": 0.9794, + "step": 4662 + }, + { + "epoch": 0.9946672354948806, + "grad_norm": 0.46710916441669664, + "learning_rate": 0.00015913836878914814, + "loss": 0.9953, + "step": 4663 + }, + { + "epoch": 0.9948805460750854, + "grad_norm": 0.4368138247507383, + "learning_rate": 0.00015913712274955654, + "loss": 1.0128, + "step": 4664 + }, + { + "epoch": 0.9950938566552902, + "grad_norm": 0.41527602099534294, + "learning_rate": 0.00015913587581452705, + "loss": 0.9533, + "step": 4665 + }, + { + "epoch": 0.9953071672354948, + "grad_norm": 0.4139004708627797, + "learning_rate": 0.00015913462798407377, + "loss": 0.9788, + "step": 4666 + }, + { + "epoch": 0.9955204778156996, + "grad_norm": 0.4260454843046068, + "learning_rate": 0.00015913337925821088, + "loss": 0.9657, + "step": 4667 + }, + { + "epoch": 0.9957337883959044, + "grad_norm": 0.4296398363022765, + "learning_rate": 0.0001591321296369525, + "loss": 0.9404, + "step": 4668 + }, + { + "epoch": 0.9959470989761092, + "grad_norm": 0.4579812913595996, + "learning_rate": 0.00015913087912031268, + "loss": 0.9649, + "step": 4669 + }, + { + "epoch": 0.996160409556314, + "grad_norm": 0.4129101658646057, + "learning_rate": 0.00015912962770830565, + "loss": 0.9913, + "step": 4670 + }, + { + "epoch": 0.9963737201365188, + "grad_norm": 0.38750613348620716, + "learning_rate": 0.00015912837540094557, + "loss": 0.9853, + "step": 4671 + }, + { + "epoch": 0.9965870307167235, + "grad_norm": 0.42331977802325943, + "learning_rate": 0.0001591271221982466, + "loss": 0.968, + "step": 4672 + }, + { + "epoch": 0.9968003412969283, + "grad_norm": 0.310316240708729, + "learning_rate": 0.00015912586810022287, + "loss": 0.9891, + "step": 4673 + }, + { + "epoch": 0.9970136518771331, + "grad_norm": 0.40105982938741114, + "learning_rate": 0.00015912461310688863, + "loss": 0.9423, + "step": 4674 + }, + { + "epoch": 0.9972269624573379, + "grad_norm": 0.4794235671041489, + "learning_rate": 0.00015912335721825807, + "loss": 0.9715, + "step": 4675 + }, + { + "epoch": 0.9974402730375427, + "grad_norm": 0.5281306440660717, + "learning_rate": 0.0001591221004343454, + "loss": 0.9568, + "step": 4676 + }, + { + "epoch": 0.9976535836177475, + "grad_norm": 0.5126120712632922, + "learning_rate": 0.00015912084275516482, + "loss": 0.9698, + "step": 4677 + }, + { + "epoch": 0.9978668941979523, + "grad_norm": 0.530512596981504, + "learning_rate": 0.00015911958418073056, + "loss": 0.978, + "step": 4678 + }, + { + "epoch": 0.998080204778157, + "grad_norm": 0.6727837444567547, + "learning_rate": 0.0001591183247110569, + "loss": 0.986, + "step": 4679 + }, + { + "epoch": 0.9982935153583617, + "grad_norm": 0.7300809205970737, + "learning_rate": 0.00015911706434615806, + "loss": 0.9628, + "step": 4680 + }, + { + "epoch": 0.9985068259385665, + "grad_norm": 0.6576065797711654, + "learning_rate": 0.00015911580308604827, + "loss": 0.9872, + "step": 4681 + }, + { + "epoch": 0.9987201365187713, + "grad_norm": 0.48881585263450994, + "learning_rate": 0.0001591145409307419, + "loss": 0.9699, + "step": 4682 + }, + { + "epoch": 0.9989334470989761, + "grad_norm": 0.3339203456909549, + "learning_rate": 0.00015911327788025313, + "loss": 0.9737, + "step": 4683 + }, + { + "epoch": 0.9991467576791809, + "grad_norm": 0.3723180585437666, + "learning_rate": 0.00015911201393459628, + "loss": 0.9651, + "step": 4684 + }, + { + "epoch": 0.9993600682593856, + "grad_norm": 0.44774805296711073, + "learning_rate": 0.00015911074909378569, + "loss": 0.9908, + "step": 4685 + }, + { + "epoch": 0.9995733788395904, + "grad_norm": 0.5300838655057885, + "learning_rate": 0.0001591094833578356, + "loss": 1.0006, + "step": 4686 + }, + { + "epoch": 0.9997866894197952, + "grad_norm": 0.6477463907846669, + "learning_rate": 0.0001591082167267604, + "loss": 0.9518, + "step": 4687 + }, + { + "epoch": 1.0, + "grad_norm": 0.6987969756014982, + "learning_rate": 0.00015910694920057443, + "loss": 0.942, + "step": 4688 + }, + { + "epoch": 1.0002133105802047, + "grad_norm": 0.752127750666474, + "learning_rate": 0.00015910568077929198, + "loss": 0.9919, + "step": 4689 + }, + { + "epoch": 1.0004266211604096, + "grad_norm": 0.7264534736815772, + "learning_rate": 0.0001591044114629274, + "loss": 0.9624, + "step": 4690 + }, + { + "epoch": 1.0006399317406143, + "grad_norm": 0.6120259252278892, + "learning_rate": 0.0001591031412514951, + "loss": 0.9772, + "step": 4691 + }, + { + "epoch": 1.0008532423208192, + "grad_norm": 0.4584156121638513, + "learning_rate": 0.00015910187014500938, + "loss": 0.9346, + "step": 4692 + }, + { + "epoch": 1.0010665529010239, + "grad_norm": 0.34974103333863427, + "learning_rate": 0.00015910059814348474, + "loss": 0.9224, + "step": 4693 + }, + { + "epoch": 1.0012798634812288, + "grad_norm": 0.33819693590744804, + "learning_rate": 0.00015909932524693545, + "loss": 0.9683, + "step": 4694 + }, + { + "epoch": 1.0014931740614335, + "grad_norm": 0.38631350092920524, + "learning_rate": 0.00015909805145537597, + "loss": 1.0007, + "step": 4695 + }, + { + "epoch": 1.0017064846416381, + "grad_norm": 0.465131277424044, + "learning_rate": 0.00015909677676882072, + "loss": 0.98, + "step": 4696 + }, + { + "epoch": 1.001919795221843, + "grad_norm": 0.5227084179677948, + "learning_rate": 0.00015909550118728408, + "loss": 0.9684, + "step": 4697 + }, + { + "epoch": 1.0021331058020477, + "grad_norm": 0.535585304320242, + "learning_rate": 0.00015909422471078056, + "loss": 0.9752, + "step": 4698 + }, + { + "epoch": 1.0023464163822526, + "grad_norm": 0.43755452157452135, + "learning_rate": 0.00015909294733932453, + "loss": 0.959, + "step": 4699 + }, + { + "epoch": 1.0025597269624573, + "grad_norm": 0.2935138391238246, + "learning_rate": 0.00015909166907293048, + "loss": 0.9725, + "step": 4700 + }, + { + "epoch": 1.0027730375426622, + "grad_norm": 0.324416173958152, + "learning_rate": 0.00015909038991161287, + "loss": 0.9403, + "step": 4701 + }, + { + "epoch": 1.002986348122867, + "grad_norm": 0.3183185084514073, + "learning_rate": 0.00015908910985538616, + "loss": 1.0113, + "step": 4702 + }, + { + "epoch": 1.0031996587030716, + "grad_norm": 0.36388723742933793, + "learning_rate": 0.00015908782890426484, + "loss": 0.9527, + "step": 4703 + }, + { + "epoch": 1.0034129692832765, + "grad_norm": 0.47130523777275957, + "learning_rate": 0.0001590865470582634, + "loss": 0.9811, + "step": 4704 + }, + { + "epoch": 1.0036262798634812, + "grad_norm": 0.5382887487936387, + "learning_rate": 0.00015908526431739637, + "loss": 0.9485, + "step": 4705 + }, + { + "epoch": 1.003839590443686, + "grad_norm": 0.7966732634147142, + "learning_rate": 0.00015908398068167822, + "loss": 0.9944, + "step": 4706 + }, + { + "epoch": 1.0040529010238908, + "grad_norm": 0.3687129639795602, + "learning_rate": 0.00015908269615112352, + "loss": 0.9813, + "step": 4707 + }, + { + "epoch": 1.0042662116040955, + "grad_norm": 0.4256293876490023, + "learning_rate": 0.00015908141072574675, + "loss": 0.9649, + "step": 4708 + }, + { + "epoch": 1.0044795221843004, + "grad_norm": 0.6009004114740949, + "learning_rate": 0.00015908012440556249, + "loss": 0.9794, + "step": 4709 + }, + { + "epoch": 1.004692832764505, + "grad_norm": 0.550247646856335, + "learning_rate": 0.00015907883719058533, + "loss": 0.9814, + "step": 4710 + }, + { + "epoch": 1.00490614334471, + "grad_norm": 0.33608739834651663, + "learning_rate": 0.00015907754908082978, + "loss": 0.9565, + "step": 4711 + }, + { + "epoch": 1.0051194539249146, + "grad_norm": 0.44259182015397563, + "learning_rate": 0.00015907626007631042, + "loss": 0.9896, + "step": 4712 + }, + { + "epoch": 1.0053327645051195, + "grad_norm": 0.5145253896901384, + "learning_rate": 0.00015907497017704185, + "loss": 0.9821, + "step": 4713 + }, + { + "epoch": 1.0055460750853242, + "grad_norm": 0.6099697348135645, + "learning_rate": 0.00015907367938303868, + "loss": 0.9823, + "step": 4714 + }, + { + "epoch": 1.005759385665529, + "grad_norm": 0.9037289797698447, + "learning_rate": 0.0001590723876943155, + "loss": 0.9836, + "step": 4715 + }, + { + "epoch": 1.0059726962457338, + "grad_norm": 1.0988352398014039, + "learning_rate": 0.00015907109511088688, + "loss": 0.9654, + "step": 4716 + }, + { + "epoch": 1.0061860068259385, + "grad_norm": 0.7345407362843368, + "learning_rate": 0.00015906980163276754, + "loss": 0.9717, + "step": 4717 + }, + { + "epoch": 1.0063993174061434, + "grad_norm": 0.5296974782337498, + "learning_rate": 0.00015906850725997204, + "loss": 0.9792, + "step": 4718 + }, + { + "epoch": 1.006612627986348, + "grad_norm": 0.3435542662181891, + "learning_rate": 0.00015906721199251505, + "loss": 0.9558, + "step": 4719 + }, + { + "epoch": 1.006825938566553, + "grad_norm": 0.3249877324647722, + "learning_rate": 0.00015906591583041125, + "loss": 0.9733, + "step": 4720 + }, + { + "epoch": 1.0070392491467577, + "grad_norm": 0.40967499215859904, + "learning_rate": 0.00015906461877367525, + "loss": 0.9559, + "step": 4721 + }, + { + "epoch": 1.0072525597269624, + "grad_norm": 0.5149549309546358, + "learning_rate": 0.00015906332082232176, + "loss": 1.0005, + "step": 4722 + }, + { + "epoch": 1.0074658703071673, + "grad_norm": 0.6494544773826312, + "learning_rate": 0.0001590620219763655, + "loss": 0.98, + "step": 4723 + }, + { + "epoch": 1.007679180887372, + "grad_norm": 0.7168243603506486, + "learning_rate": 0.0001590607222358211, + "loss": 0.9797, + "step": 4724 + }, + { + "epoch": 1.0078924914675769, + "grad_norm": 1.2652582384212216, + "learning_rate": 0.0001590594216007033, + "loss": 1.0303, + "step": 4725 + }, + { + "epoch": 1.0081058020477816, + "grad_norm": 0.4352048105145134, + "learning_rate": 0.00015905812007102683, + "loss": 0.9813, + "step": 4726 + }, + { + "epoch": 1.0083191126279865, + "grad_norm": 0.8056812862663473, + "learning_rate": 0.00015905681764680638, + "loss": 1.0293, + "step": 4727 + }, + { + "epoch": 1.0085324232081911, + "grad_norm": 1.2141776947926244, + "learning_rate": 0.00015905551432805672, + "loss": 0.9748, + "step": 4728 + }, + { + "epoch": 1.0087457337883958, + "grad_norm": 0.8548633118349699, + "learning_rate": 0.00015905421011479262, + "loss": 0.9901, + "step": 4729 + }, + { + "epoch": 1.0089590443686007, + "grad_norm": 0.7239315641463814, + "learning_rate": 0.00015905290500702873, + "loss": 0.9912, + "step": 4730 + }, + { + "epoch": 1.0091723549488054, + "grad_norm": 0.7325225316235451, + "learning_rate": 0.00015905159900477995, + "loss": 0.9945, + "step": 4731 + }, + { + "epoch": 1.0093856655290103, + "grad_norm": 0.5914198925644881, + "learning_rate": 0.00015905029210806096, + "loss": 0.9639, + "step": 4732 + }, + { + "epoch": 1.009598976109215, + "grad_norm": 0.4368968565355743, + "learning_rate": 0.0001590489843168866, + "loss": 0.9474, + "step": 4733 + }, + { + "epoch": 1.0098122866894197, + "grad_norm": 0.5134060751901208, + "learning_rate": 0.00015904767563127168, + "loss": 0.9551, + "step": 4734 + }, + { + "epoch": 1.0100255972696246, + "grad_norm": 0.5381580572657962, + "learning_rate": 0.00015904636605123095, + "loss": 0.9647, + "step": 4735 + }, + { + "epoch": 1.0102389078498293, + "grad_norm": 0.5013241329263304, + "learning_rate": 0.00015904505557677926, + "loss": 0.9979, + "step": 4736 + }, + { + "epoch": 1.0104522184300342, + "grad_norm": 0.5194186197457769, + "learning_rate": 0.00015904374420793143, + "loss": 0.9861, + "step": 4737 + }, + { + "epoch": 1.0106655290102389, + "grad_norm": 0.48164171189397925, + "learning_rate": 0.0001590424319447023, + "loss": 0.958, + "step": 4738 + }, + { + "epoch": 1.0108788395904438, + "grad_norm": 0.47388478020182395, + "learning_rate": 0.00015904111878710676, + "loss": 1.007, + "step": 4739 + }, + { + "epoch": 1.0110921501706485, + "grad_norm": 0.42592991679644304, + "learning_rate": 0.00015903980473515962, + "loss": 0.987, + "step": 4740 + }, + { + "epoch": 1.0113054607508531, + "grad_norm": 0.3967804759511995, + "learning_rate": 0.00015903848978887576, + "loss": 0.9619, + "step": 4741 + }, + { + "epoch": 1.011518771331058, + "grad_norm": 0.5034278352340316, + "learning_rate": 0.00015903717394827004, + "loss": 1.0047, + "step": 4742 + }, + { + "epoch": 1.0117320819112627, + "grad_norm": 0.5487830976937006, + "learning_rate": 0.00015903585721335734, + "loss": 0.9915, + "step": 4743 + }, + { + "epoch": 1.0119453924914676, + "grad_norm": 0.4035537046094826, + "learning_rate": 0.00015903453958415265, + "loss": 0.9617, + "step": 4744 + }, + { + "epoch": 1.0121587030716723, + "grad_norm": 0.34410081692544847, + "learning_rate": 0.00015903322106067077, + "loss": 0.9849, + "step": 4745 + }, + { + "epoch": 1.0123720136518772, + "grad_norm": 0.38194922244901297, + "learning_rate": 0.00015903190164292668, + "loss": 0.9588, + "step": 4746 + }, + { + "epoch": 1.012585324232082, + "grad_norm": 0.36049428088548713, + "learning_rate": 0.00015903058133093532, + "loss": 0.9784, + "step": 4747 + }, + { + "epoch": 1.0127986348122866, + "grad_norm": 0.2884720371096184, + "learning_rate": 0.00015902926012471158, + "loss": 0.9676, + "step": 4748 + }, + { + "epoch": 1.0130119453924915, + "grad_norm": 0.34086231516471754, + "learning_rate": 0.00015902793802427043, + "loss": 1.0092, + "step": 4749 + }, + { + "epoch": 1.0132252559726962, + "grad_norm": 0.43651631782067896, + "learning_rate": 0.0001590266150296268, + "loss": 0.9966, + "step": 4750 + }, + { + "epoch": 1.013438566552901, + "grad_norm": 0.4793558920864127, + "learning_rate": 0.00015902529114079575, + "loss": 0.9942, + "step": 4751 + }, + { + "epoch": 1.0136518771331058, + "grad_norm": 0.4511451480179726, + "learning_rate": 0.00015902396635779218, + "loss": 0.9536, + "step": 4752 + }, + { + "epoch": 1.0138651877133107, + "grad_norm": 0.3701845727109123, + "learning_rate": 0.0001590226406806311, + "loss": 0.9646, + "step": 4753 + }, + { + "epoch": 1.0140784982935154, + "grad_norm": 0.3594178332829537, + "learning_rate": 0.0001590213141093275, + "loss": 1.0005, + "step": 4754 + }, + { + "epoch": 1.01429180887372, + "grad_norm": 0.33951024423138043, + "learning_rate": 0.0001590199866438964, + "loss": 0.9742, + "step": 4755 + }, + { + "epoch": 1.014505119453925, + "grad_norm": 0.33111648361549095, + "learning_rate": 0.00015901865828435284, + "loss": 0.9566, + "step": 4756 + }, + { + "epoch": 1.0147184300341296, + "grad_norm": 0.33800593353983044, + "learning_rate": 0.00015901732903071181, + "loss": 0.9693, + "step": 4757 + }, + { + "epoch": 1.0149317406143346, + "grad_norm": 0.3468847001508492, + "learning_rate": 0.0001590159988829884, + "loss": 0.9699, + "step": 4758 + }, + { + "epoch": 1.0151450511945392, + "grad_norm": 0.41183393576573296, + "learning_rate": 0.00015901466784119765, + "loss": 0.9745, + "step": 4759 + }, + { + "epoch": 1.015358361774744, + "grad_norm": 0.4862208644474387, + "learning_rate": 0.00015901333590535456, + "loss": 0.968, + "step": 4760 + }, + { + "epoch": 1.0155716723549488, + "grad_norm": 0.5110018320161649, + "learning_rate": 0.00015901200307547424, + "loss": 0.9756, + "step": 4761 + }, + { + "epoch": 1.0157849829351535, + "grad_norm": 0.5475962882204874, + "learning_rate": 0.0001590106693515718, + "loss": 0.9514, + "step": 4762 + }, + { + "epoch": 1.0159982935153584, + "grad_norm": 0.650243443461606, + "learning_rate": 0.0001590093347336623, + "loss": 0.9665, + "step": 4763 + }, + { + "epoch": 1.016211604095563, + "grad_norm": 0.7828684590613684, + "learning_rate": 0.00015900799922176085, + "loss": 0.9747, + "step": 4764 + }, + { + "epoch": 1.016424914675768, + "grad_norm": 0.7710472615789411, + "learning_rate": 0.00015900666281588259, + "loss": 0.9724, + "step": 4765 + }, + { + "epoch": 1.0166382252559727, + "grad_norm": 0.611788120267984, + "learning_rate": 0.00015900532551604257, + "loss": 0.9555, + "step": 4766 + }, + { + "epoch": 1.0168515358361774, + "grad_norm": 0.4988762287309992, + "learning_rate": 0.00015900398732225597, + "loss": 0.966, + "step": 4767 + }, + { + "epoch": 1.0170648464163823, + "grad_norm": 0.43539921557953926, + "learning_rate": 0.00015900264823453793, + "loss": 0.9468, + "step": 4768 + }, + { + "epoch": 1.017278156996587, + "grad_norm": 1.2899246126064963, + "learning_rate": 0.0001590013082529036, + "loss": 1.0769, + "step": 4769 + }, + { + "epoch": 1.0174914675767919, + "grad_norm": 0.29087997191563747, + "learning_rate": 0.00015899996737736815, + "loss": 0.9755, + "step": 4770 + }, + { + "epoch": 1.0177047781569966, + "grad_norm": 0.4045945377847346, + "learning_rate": 0.00015899862560794672, + "loss": 0.9584, + "step": 4771 + }, + { + "epoch": 1.0179180887372015, + "grad_norm": 0.4639129165800072, + "learning_rate": 0.00015899728294465452, + "loss": 0.9568, + "step": 4772 + }, + { + "epoch": 1.0181313993174061, + "grad_norm": 0.6164609912564516, + "learning_rate": 0.00015899593938750674, + "loss": 0.9482, + "step": 4773 + }, + { + "epoch": 1.0183447098976108, + "grad_norm": 0.7428282512767531, + "learning_rate": 0.00015899459493651856, + "loss": 0.9812, + "step": 4774 + }, + { + "epoch": 1.0185580204778157, + "grad_norm": 0.8622822021917246, + "learning_rate": 0.00015899324959170523, + "loss": 0.9557, + "step": 4775 + }, + { + "epoch": 1.0187713310580204, + "grad_norm": 0.966986127034464, + "learning_rate": 0.0001589919033530819, + "loss": 0.997, + "step": 4776 + }, + { + "epoch": 1.0189846416382253, + "grad_norm": 1.00931404935043, + "learning_rate": 0.00015899055622066392, + "loss": 0.9755, + "step": 4777 + }, + { + "epoch": 1.01919795221843, + "grad_norm": 0.9371434748546419, + "learning_rate": 0.00015898920819446645, + "loss": 0.9534, + "step": 4778 + }, + { + "epoch": 1.019411262798635, + "grad_norm": 0.7781315204819279, + "learning_rate": 0.00015898785927450477, + "loss": 0.94, + "step": 4779 + }, + { + "epoch": 1.0196245733788396, + "grad_norm": 0.49533541036472234, + "learning_rate": 0.00015898650946079412, + "loss": 0.9849, + "step": 4780 + }, + { + "epoch": 1.0198378839590443, + "grad_norm": 0.3273774507790414, + "learning_rate": 0.00015898515875334976, + "loss": 0.9885, + "step": 4781 + }, + { + "epoch": 1.0200511945392492, + "grad_norm": 0.48714427026267193, + "learning_rate": 0.00015898380715218704, + "loss": 0.9815, + "step": 4782 + }, + { + "epoch": 1.0202645051194539, + "grad_norm": 0.606680479189878, + "learning_rate": 0.0001589824546573212, + "loss": 0.9701, + "step": 4783 + }, + { + "epoch": 1.0204778156996588, + "grad_norm": 0.7915338454542371, + "learning_rate": 0.00015898110126876756, + "loss": 0.9998, + "step": 4784 + }, + { + "epoch": 1.0206911262798635, + "grad_norm": 0.4610853353834515, + "learning_rate": 0.00015897974698654143, + "loss": 0.9824, + "step": 4785 + }, + { + "epoch": 1.0209044368600682, + "grad_norm": 0.46180547297116636, + "learning_rate": 0.00015897839181065813, + "loss": 0.9848, + "step": 4786 + }, + { + "epoch": 1.021117747440273, + "grad_norm": 0.5428106534938044, + "learning_rate": 0.00015897703574113301, + "loss": 0.9817, + "step": 4787 + }, + { + "epoch": 1.0213310580204777, + "grad_norm": 0.4123526982344178, + "learning_rate": 0.0001589756787779814, + "loss": 0.9472, + "step": 4788 + }, + { + "epoch": 1.0215443686006827, + "grad_norm": 0.3948661209108343, + "learning_rate": 0.00015897432092121862, + "loss": 0.9762, + "step": 4789 + }, + { + "epoch": 1.0217576791808873, + "grad_norm": 0.45863224074626113, + "learning_rate": 0.0001589729621708601, + "loss": 0.959, + "step": 4790 + }, + { + "epoch": 1.0219709897610922, + "grad_norm": 0.49470693006247, + "learning_rate": 0.0001589716025269212, + "loss": 0.9838, + "step": 4791 + }, + { + "epoch": 1.022184300341297, + "grad_norm": 0.5246163868659726, + "learning_rate": 0.00015897024198941728, + "loss": 0.9618, + "step": 4792 + }, + { + "epoch": 1.0223976109215016, + "grad_norm": 0.5571062506889563, + "learning_rate": 0.00015896888055836372, + "loss": 0.9487, + "step": 4793 + }, + { + "epoch": 1.0226109215017065, + "grad_norm": 0.6273625373644572, + "learning_rate": 0.00015896751823377597, + "loss": 0.9568, + "step": 4794 + }, + { + "epoch": 1.0228242320819112, + "grad_norm": 0.5237727127086593, + "learning_rate": 0.0001589661550156694, + "loss": 0.9778, + "step": 4795 + }, + { + "epoch": 1.023037542662116, + "grad_norm": 1.8982331386827054, + "learning_rate": 0.0001589647909040595, + "loss": 0.9576, + "step": 4796 + }, + { + "epoch": 1.0232508532423208, + "grad_norm": 0.4619045552385366, + "learning_rate": 0.00015896342589896164, + "loss": 0.968, + "step": 4797 + }, + { + "epoch": 1.0234641638225257, + "grad_norm": 0.9359452624985096, + "learning_rate": 0.00015896206000039128, + "loss": 0.9732, + "step": 4798 + }, + { + "epoch": 1.0236774744027304, + "grad_norm": 1.4194359975755886, + "learning_rate": 0.00015896069320836388, + "loss": 0.9823, + "step": 4799 + }, + { + "epoch": 1.023890784982935, + "grad_norm": 0.4987642030838176, + "learning_rate": 0.00015895932552289493, + "loss": 0.9916, + "step": 4800 + }, + { + "epoch": 1.02410409556314, + "grad_norm": 0.9536713950155331, + "learning_rate": 0.00015895795694399986, + "loss": 0.9998, + "step": 4801 + }, + { + "epoch": 1.0243174061433447, + "grad_norm": 1.6794676645764266, + "learning_rate": 0.00015895658747169418, + "loss": 1.0063, + "step": 4802 + }, + { + "epoch": 1.0245307167235496, + "grad_norm": 0.6305249398406138, + "learning_rate": 0.0001589552171059934, + "loss": 0.9393, + "step": 4803 + }, + { + "epoch": 1.0247440273037542, + "grad_norm": 1.777633371027517, + "learning_rate": 0.000158953845846913, + "loss": 0.9644, + "step": 4804 + }, + { + "epoch": 1.0249573378839592, + "grad_norm": 0.7419366274894815, + "learning_rate": 0.00015895247369446852, + "loss": 0.9998, + "step": 4805 + }, + { + "epoch": 1.0251706484641638, + "grad_norm": 1.7353485412219718, + "learning_rate": 0.00015895110064867546, + "loss": 0.9834, + "step": 4806 + }, + { + "epoch": 1.0253839590443685, + "grad_norm": 1.1478560301877254, + "learning_rate": 0.00015894972670954936, + "loss": 0.9921, + "step": 4807 + }, + { + "epoch": 1.0255972696245734, + "grad_norm": 1.4338562683320522, + "learning_rate": 0.0001589483518771058, + "loss": 0.9674, + "step": 4808 + }, + { + "epoch": 1.025810580204778, + "grad_norm": 1.2809608396205465, + "learning_rate": 0.0001589469761513603, + "loss": 0.9783, + "step": 4809 + }, + { + "epoch": 1.026023890784983, + "grad_norm": 1.014317419699132, + "learning_rate": 0.00015894559953232842, + "loss": 0.9619, + "step": 4810 + }, + { + "epoch": 1.0262372013651877, + "grad_norm": 1.5948791918734493, + "learning_rate": 0.0001589442220200258, + "loss": 0.9741, + "step": 4811 + }, + { + "epoch": 1.0264505119453924, + "grad_norm": 0.8235271934730573, + "learning_rate": 0.0001589428436144679, + "loss": 0.9975, + "step": 4812 + }, + { + "epoch": 1.0266638225255973, + "grad_norm": 1.5911375142347648, + "learning_rate": 0.00015894146431567045, + "loss": 0.995, + "step": 4813 + }, + { + "epoch": 1.026877133105802, + "grad_norm": 1.7148738335299287, + "learning_rate": 0.000158940084123649, + "loss": 1.0065, + "step": 4814 + }, + { + "epoch": 1.0270904436860069, + "grad_norm": 1.231686752029277, + "learning_rate": 0.0001589387030384192, + "loss": 0.9894, + "step": 4815 + }, + { + "epoch": 1.0273037542662116, + "grad_norm": 0.95454435555255, + "learning_rate": 0.0001589373210599966, + "loss": 0.9854, + "step": 4816 + }, + { + "epoch": 1.0275170648464165, + "grad_norm": 0.7481643919063256, + "learning_rate": 0.0001589359381883969, + "loss": 0.997, + "step": 4817 + }, + { + "epoch": 1.0277303754266212, + "grad_norm": 0.5928313007098278, + "learning_rate": 0.00015893455442363572, + "loss": 0.9821, + "step": 4818 + }, + { + "epoch": 1.0279436860068258, + "grad_norm": 0.6922371021056922, + "learning_rate": 0.00015893316976572873, + "loss": 0.9749, + "step": 4819 + }, + { + "epoch": 1.0281569965870307, + "grad_norm": 0.78538854722791, + "learning_rate": 0.0001589317842146916, + "loss": 1.0049, + "step": 4820 + }, + { + "epoch": 1.0283703071672354, + "grad_norm": 1.0666411875107618, + "learning_rate": 0.00015893039777054003, + "loss": 1.011, + "step": 4821 + }, + { + "epoch": 1.0285836177474403, + "grad_norm": 0.8389805087252772, + "learning_rate": 0.00015892901043328966, + "loss": 0.9635, + "step": 4822 + }, + { + "epoch": 1.028796928327645, + "grad_norm": 0.948087292664806, + "learning_rate": 0.0001589276222029562, + "loss": 0.9692, + "step": 4823 + }, + { + "epoch": 1.02901023890785, + "grad_norm": 1.1632426663966158, + "learning_rate": 0.00015892623307955537, + "loss": 1.0278, + "step": 4824 + }, + { + "epoch": 1.0292235494880546, + "grad_norm": 0.8231904161256646, + "learning_rate": 0.00015892484306310293, + "loss": 0.9873, + "step": 4825 + }, + { + "epoch": 1.0294368600682593, + "grad_norm": 0.91943015639401, + "learning_rate": 0.00015892345215361449, + "loss": 1.0042, + "step": 4826 + }, + { + "epoch": 1.0296501706484642, + "grad_norm": 0.8221067231740645, + "learning_rate": 0.0001589220603511059, + "loss": 0.9702, + "step": 4827 + }, + { + "epoch": 1.0298634812286689, + "grad_norm": 0.8324215448211607, + "learning_rate": 0.00015892066765559287, + "loss": 0.9848, + "step": 4828 + }, + { + "epoch": 1.0300767918088738, + "grad_norm": 0.786287302740933, + "learning_rate": 0.00015891927406709117, + "loss": 0.9579, + "step": 4829 + }, + { + "epoch": 1.0302901023890785, + "grad_norm": 0.6344886316062589, + "learning_rate": 0.00015891787958561654, + "loss": 0.969, + "step": 4830 + }, + { + "epoch": 1.0305034129692834, + "grad_norm": 0.7641086725354039, + "learning_rate": 0.00015891648421118476, + "loss": 0.9741, + "step": 4831 + }, + { + "epoch": 1.030716723549488, + "grad_norm": 0.6968056393167574, + "learning_rate": 0.00015891508794381166, + "loss": 0.9853, + "step": 4832 + }, + { + "epoch": 1.0309300341296928, + "grad_norm": 0.5097450775203703, + "learning_rate": 0.00015891369078351302, + "loss": 1.0147, + "step": 4833 + }, + { + "epoch": 1.0311433447098977, + "grad_norm": 0.6338645256806084, + "learning_rate": 0.0001589122927303046, + "loss": 0.9645, + "step": 4834 + }, + { + "epoch": 1.0313566552901023, + "grad_norm": 0.6030371940632545, + "learning_rate": 0.0001589108937842023, + "loss": 0.986, + "step": 4835 + }, + { + "epoch": 1.0315699658703072, + "grad_norm": 0.5032178764599734, + "learning_rate": 0.0001589094939452219, + "loss": 0.954, + "step": 4836 + }, + { + "epoch": 1.031783276450512, + "grad_norm": 0.5137836229701197, + "learning_rate": 0.00015890809321337924, + "loss": 0.968, + "step": 4837 + }, + { + "epoch": 1.0319965870307166, + "grad_norm": 0.40328191674053737, + "learning_rate": 0.00015890669158869018, + "loss": 0.9301, + "step": 4838 + }, + { + "epoch": 1.0322098976109215, + "grad_norm": 0.4224498117146192, + "learning_rate": 0.00015890528907117058, + "loss": 0.9763, + "step": 4839 + }, + { + "epoch": 1.0324232081911262, + "grad_norm": 0.5455833052371092, + "learning_rate": 0.00015890388566083632, + "loss": 0.9912, + "step": 4840 + }, + { + "epoch": 1.0326365187713311, + "grad_norm": 0.6134811054718498, + "learning_rate": 0.00015890248135770323, + "loss": 0.9793, + "step": 4841 + }, + { + "epoch": 1.0328498293515358, + "grad_norm": 0.44056016961292527, + "learning_rate": 0.00015890107616178725, + "loss": 0.9934, + "step": 4842 + }, + { + "epoch": 1.0330631399317407, + "grad_norm": 0.48333973617016357, + "learning_rate": 0.0001588996700731043, + "loss": 0.9884, + "step": 4843 + }, + { + "epoch": 1.0332764505119454, + "grad_norm": 3.8482719455976757, + "learning_rate": 0.0001588982630916702, + "loss": 1.0176, + "step": 4844 + }, + { + "epoch": 1.03348976109215, + "grad_norm": 0.9415325565396364, + "learning_rate": 0.00015889685521750096, + "loss": 1.0039, + "step": 4845 + }, + { + "epoch": 1.033703071672355, + "grad_norm": 1.4796612804301998, + "learning_rate": 0.00015889544645061247, + "loss": 0.991, + "step": 4846 + }, + { + "epoch": 1.0339163822525597, + "grad_norm": 0.5905162280473873, + "learning_rate": 0.00015889403679102067, + "loss": 1.0107, + "step": 4847 + }, + { + "epoch": 1.0341296928327646, + "grad_norm": 1.4934423835975428, + "learning_rate": 0.00015889262623874155, + "loss": 0.9793, + "step": 4848 + }, + { + "epoch": 1.0343430034129693, + "grad_norm": 0.5920439389455459, + "learning_rate": 0.000158891214793791, + "loss": 0.9956, + "step": 4849 + }, + { + "epoch": 1.0345563139931742, + "grad_norm": 1.4035699363141716, + "learning_rate": 0.00015888980245618505, + "loss": 1.0111, + "step": 4850 + }, + { + "epoch": 1.0347696245733788, + "grad_norm": 0.57164359944461, + "learning_rate": 0.00015888838922593966, + "loss": 0.989, + "step": 4851 + }, + { + "epoch": 1.0349829351535835, + "grad_norm": 0.9692579058685012, + "learning_rate": 0.0001588869751030708, + "loss": 0.9932, + "step": 4852 + }, + { + "epoch": 1.0351962457337884, + "grad_norm": 0.6378744507807075, + "learning_rate": 0.0001588855600875945, + "loss": 0.9987, + "step": 4853 + }, + { + "epoch": 1.0354095563139931, + "grad_norm": 0.8848364081886059, + "learning_rate": 0.00015888414417952675, + "loss": 0.9883, + "step": 4854 + }, + { + "epoch": 1.035622866894198, + "grad_norm": 0.6449941909216014, + "learning_rate": 0.00015888272737888358, + "loss": 0.9476, + "step": 4855 + }, + { + "epoch": 1.0358361774744027, + "grad_norm": 0.6009046870337293, + "learning_rate": 0.00015888130968568103, + "loss": 0.9627, + "step": 4856 + }, + { + "epoch": 1.0360494880546076, + "grad_norm": 1.1571110945364567, + "learning_rate": 0.00015887989109993514, + "loss": 1.0015, + "step": 4857 + }, + { + "epoch": 1.0362627986348123, + "grad_norm": 0.55346925318381, + "learning_rate": 0.00015887847162166196, + "loss": 0.9819, + "step": 4858 + }, + { + "epoch": 1.036476109215017, + "grad_norm": 0.5800797059387364, + "learning_rate": 0.00015887705125087754, + "loss": 0.9802, + "step": 4859 + }, + { + "epoch": 1.036689419795222, + "grad_norm": 0.5586693456642654, + "learning_rate": 0.00015887562998759795, + "loss": 0.9895, + "step": 4860 + }, + { + "epoch": 1.0369027303754266, + "grad_norm": 0.7333390516146725, + "learning_rate": 0.0001588742078318393, + "loss": 1.008, + "step": 4861 + }, + { + "epoch": 1.0371160409556315, + "grad_norm": 0.4990027306714865, + "learning_rate": 0.00015887278478361765, + "loss": 1.0062, + "step": 4862 + }, + { + "epoch": 1.0373293515358362, + "grad_norm": 0.5863553578764761, + "learning_rate": 0.0001588713608429491, + "loss": 0.9573, + "step": 4863 + }, + { + "epoch": 1.0375426621160408, + "grad_norm": 0.5674291788017417, + "learning_rate": 0.0001588699360098498, + "loss": 0.9695, + "step": 4864 + }, + { + "epoch": 1.0377559726962458, + "grad_norm": 0.577100209026433, + "learning_rate": 0.00015886851028433583, + "loss": 0.9893, + "step": 4865 + }, + { + "epoch": 1.0379692832764504, + "grad_norm": 0.6117883808880942, + "learning_rate": 0.00015886708366642335, + "loss": 0.9754, + "step": 4866 + }, + { + "epoch": 1.0381825938566553, + "grad_norm": 0.4348963016209579, + "learning_rate": 0.00015886565615612849, + "loss": 0.9928, + "step": 4867 + }, + { + "epoch": 1.03839590443686, + "grad_norm": 0.5383466268160605, + "learning_rate": 0.0001588642277534674, + "loss": 0.9704, + "step": 4868 + }, + { + "epoch": 1.038609215017065, + "grad_norm": 0.5672952408122328, + "learning_rate": 0.00015886279845845624, + "loss": 0.9807, + "step": 4869 + }, + { + "epoch": 1.0388225255972696, + "grad_norm": 0.5417881465514522, + "learning_rate": 0.00015886136827111122, + "loss": 1.0011, + "step": 4870 + }, + { + "epoch": 1.0390358361774743, + "grad_norm": 0.5641852990518929, + "learning_rate": 0.00015885993719144843, + "loss": 0.9914, + "step": 4871 + }, + { + "epoch": 1.0392491467576792, + "grad_norm": 0.5253086492147004, + "learning_rate": 0.00015885850521948417, + "loss": 0.9966, + "step": 4872 + }, + { + "epoch": 1.039462457337884, + "grad_norm": 0.44349050754909336, + "learning_rate": 0.0001588570723552346, + "loss": 0.9471, + "step": 4873 + }, + { + "epoch": 1.0396757679180888, + "grad_norm": 0.5723054937494997, + "learning_rate": 0.0001588556385987159, + "loss": 0.9756, + "step": 4874 + }, + { + "epoch": 1.0398890784982935, + "grad_norm": 0.5346599363260257, + "learning_rate": 0.00015885420394994436, + "loss": 0.9962, + "step": 4875 + }, + { + "epoch": 1.0401023890784984, + "grad_norm": 0.39180124767733654, + "learning_rate": 0.00015885276840893616, + "loss": 1.008, + "step": 4876 + }, + { + "epoch": 1.040315699658703, + "grad_norm": 0.5117462664246308, + "learning_rate": 0.00015885133197570756, + "loss": 0.9518, + "step": 4877 + }, + { + "epoch": 1.0405290102389078, + "grad_norm": 0.39994904054008884, + "learning_rate": 0.0001588498946502748, + "loss": 0.9732, + "step": 4878 + }, + { + "epoch": 1.0407423208191127, + "grad_norm": 0.3650589900058698, + "learning_rate": 0.00015884845643265416, + "loss": 0.9767, + "step": 4879 + }, + { + "epoch": 1.0409556313993173, + "grad_norm": 0.46545419887640227, + "learning_rate": 0.00015884701732286191, + "loss": 0.9642, + "step": 4880 + }, + { + "epoch": 1.0411689419795223, + "grad_norm": 0.3926930712931123, + "learning_rate": 0.00015884557732091434, + "loss": 0.9665, + "step": 4881 + }, + { + "epoch": 1.041382252559727, + "grad_norm": 0.34703241941694857, + "learning_rate": 0.00015884413642682773, + "loss": 1.0016, + "step": 4882 + }, + { + "epoch": 1.0415955631399318, + "grad_norm": 0.5120673475447985, + "learning_rate": 0.0001588426946406184, + "loss": 0.9595, + "step": 4883 + }, + { + "epoch": 1.0418088737201365, + "grad_norm": 0.4067297905789398, + "learning_rate": 0.00015884125196230267, + "loss": 0.9377, + "step": 4884 + }, + { + "epoch": 1.0420221843003412, + "grad_norm": 0.3226032194651676, + "learning_rate": 0.0001588398083918968, + "loss": 0.966, + "step": 4885 + }, + { + "epoch": 1.0422354948805461, + "grad_norm": 0.3968596356783599, + "learning_rate": 0.00015883836392941721, + "loss": 0.9721, + "step": 4886 + }, + { + "epoch": 1.0424488054607508, + "grad_norm": 0.3321633596837675, + "learning_rate": 0.00015883691857488018, + "loss": 0.9839, + "step": 4887 + }, + { + "epoch": 1.0426621160409557, + "grad_norm": 0.3060469753139435, + "learning_rate": 0.00015883547232830211, + "loss": 0.9591, + "step": 4888 + }, + { + "epoch": 1.0428754266211604, + "grad_norm": 0.3231219776797136, + "learning_rate": 0.00015883402518969934, + "loss": 0.9797, + "step": 4889 + }, + { + "epoch": 1.0430887372013653, + "grad_norm": 0.32785371708919403, + "learning_rate": 0.00015883257715908824, + "loss": 0.9864, + "step": 4890 + }, + { + "epoch": 1.04330204778157, + "grad_norm": 0.3400703923571941, + "learning_rate": 0.00015883112823648522, + "loss": 0.9995, + "step": 4891 + }, + { + "epoch": 1.0435153583617747, + "grad_norm": 0.28971217493872886, + "learning_rate": 0.00015882967842190663, + "loss": 0.9574, + "step": 4892 + }, + { + "epoch": 1.0437286689419796, + "grad_norm": 0.34296595874795244, + "learning_rate": 0.00015882822771536896, + "loss": 0.9698, + "step": 4893 + }, + { + "epoch": 1.0439419795221843, + "grad_norm": 0.3578027809672496, + "learning_rate": 0.00015882677611688852, + "loss": 0.9885, + "step": 4894 + }, + { + "epoch": 1.0441552901023892, + "grad_norm": 0.3636304267222772, + "learning_rate": 0.00015882532362648179, + "loss": 0.9789, + "step": 4895 + }, + { + "epoch": 1.0443686006825939, + "grad_norm": 0.35430165261468255, + "learning_rate": 0.0001588238702441652, + "loss": 0.9619, + "step": 4896 + }, + { + "epoch": 1.0445819112627985, + "grad_norm": 0.39423238060689964, + "learning_rate": 0.00015882241596995518, + "loss": 0.9576, + "step": 4897 + }, + { + "epoch": 1.0447952218430034, + "grad_norm": 0.40192120947608856, + "learning_rate": 0.00015882096080386822, + "loss": 0.9653, + "step": 4898 + }, + { + "epoch": 1.0450085324232081, + "grad_norm": 1.7408066104842317, + "learning_rate": 0.00015881950474592074, + "loss": 0.9885, + "step": 4899 + }, + { + "epoch": 1.045221843003413, + "grad_norm": 0.38228606287508066, + "learning_rate": 0.00015881804779612925, + "loss": 0.9769, + "step": 4900 + }, + { + "epoch": 1.0454351535836177, + "grad_norm": 0.5959939201393489, + "learning_rate": 0.00015881658995451023, + "loss": 0.9908, + "step": 4901 + }, + { + "epoch": 1.0456484641638226, + "grad_norm": 0.6995348942076303, + "learning_rate": 0.00015881513122108015, + "loss": 0.9602, + "step": 4902 + }, + { + "epoch": 1.0458617747440273, + "grad_norm": 0.8340750973856921, + "learning_rate": 0.00015881367159585556, + "loss": 0.9963, + "step": 4903 + }, + { + "epoch": 1.046075085324232, + "grad_norm": 0.91409660733822, + "learning_rate": 0.00015881221107885292, + "loss": 0.9811, + "step": 4904 + }, + { + "epoch": 1.046288395904437, + "grad_norm": 0.8623917355609697, + "learning_rate": 0.0001588107496700888, + "loss": 1.0001, + "step": 4905 + }, + { + "epoch": 1.0465017064846416, + "grad_norm": 0.7075238689510518, + "learning_rate": 0.00015880928736957973, + "loss": 0.9669, + "step": 4906 + }, + { + "epoch": 1.0467150170648465, + "grad_norm": 0.5645543036085218, + "learning_rate": 0.00015880782417734224, + "loss": 0.9728, + "step": 4907 + }, + { + "epoch": 1.0469283276450512, + "grad_norm": 0.4082585272748839, + "learning_rate": 0.0001588063600933929, + "loss": 0.9438, + "step": 4908 + }, + { + "epoch": 1.047141638225256, + "grad_norm": 0.5120987871614375, + "learning_rate": 0.00015880489511774828, + "loss": 1.01, + "step": 4909 + }, + { + "epoch": 1.0473549488054608, + "grad_norm": 0.6803056705184289, + "learning_rate": 0.00015880342925042492, + "loss": 0.9663, + "step": 4910 + }, + { + "epoch": 1.0475682593856654, + "grad_norm": 0.8046084529111661, + "learning_rate": 0.00015880196249143944, + "loss": 1.0061, + "step": 4911 + }, + { + "epoch": 1.0477815699658704, + "grad_norm": 0.7929156908189037, + "learning_rate": 0.00015880049484080845, + "loss": 0.9716, + "step": 4912 + }, + { + "epoch": 1.047994880546075, + "grad_norm": 0.6232731583499701, + "learning_rate": 0.0001587990262985485, + "loss": 0.9799, + "step": 4913 + }, + { + "epoch": 1.04820819112628, + "grad_norm": 0.36147370308627425, + "learning_rate": 0.00015879755686467627, + "loss": 0.9271, + "step": 4914 + }, + { + "epoch": 1.0484215017064846, + "grad_norm": 0.4048943499308923, + "learning_rate": 0.00015879608653920834, + "loss": 0.957, + "step": 4915 + }, + { + "epoch": 1.0486348122866893, + "grad_norm": 0.5945372535742157, + "learning_rate": 0.00015879461532216139, + "loss": 0.9955, + "step": 4916 + }, + { + "epoch": 1.0488481228668942, + "grad_norm": 0.5966346264843051, + "learning_rate": 0.000158793143213552, + "loss": 0.9435, + "step": 4917 + }, + { + "epoch": 1.049061433447099, + "grad_norm": 0.5372106203295935, + "learning_rate": 0.0001587916702133969, + "loss": 0.9915, + "step": 4918 + }, + { + "epoch": 1.0492747440273038, + "grad_norm": 0.4527400569812558, + "learning_rate": 0.00015879019632171273, + "loss": 0.9637, + "step": 4919 + }, + { + "epoch": 1.0494880546075085, + "grad_norm": 0.3739375056738441, + "learning_rate": 0.00015878872153851618, + "loss": 1.001, + "step": 4920 + }, + { + "epoch": 1.0497013651877134, + "grad_norm": 0.311210146614455, + "learning_rate": 0.00015878724586382387, + "loss": 0.9545, + "step": 4921 + }, + { + "epoch": 1.049914675767918, + "grad_norm": 0.36037063986654333, + "learning_rate": 0.0001587857692976526, + "loss": 0.9493, + "step": 4922 + }, + { + "epoch": 1.0501279863481228, + "grad_norm": 0.4879256074282357, + "learning_rate": 0.00015878429184001898, + "loss": 0.9521, + "step": 4923 + }, + { + "epoch": 1.0503412969283277, + "grad_norm": 0.544373904966999, + "learning_rate": 0.00015878281349093978, + "loss": 0.9771, + "step": 4924 + }, + { + "epoch": 1.0505546075085324, + "grad_norm": 0.4925712316553244, + "learning_rate": 0.00015878133425043176, + "loss": 0.9741, + "step": 4925 + }, + { + "epoch": 1.0507679180887373, + "grad_norm": 0.4611210003877605, + "learning_rate": 0.00015877985411851157, + "loss": 0.9509, + "step": 4926 + }, + { + "epoch": 1.050981228668942, + "grad_norm": 0.37653041103452484, + "learning_rate": 0.00015877837309519603, + "loss": 1.0013, + "step": 4927 + }, + { + "epoch": 1.0511945392491469, + "grad_norm": 0.3138327421865529, + "learning_rate": 0.00015877689118050186, + "loss": 0.9818, + "step": 4928 + }, + { + "epoch": 1.0514078498293515, + "grad_norm": 0.3183694631949951, + "learning_rate": 0.00015877540837444586, + "loss": 0.9679, + "step": 4929 + }, + { + "epoch": 1.0516211604095562, + "grad_norm": 0.34414644964854235, + "learning_rate": 0.0001587739246770448, + "loss": 0.9773, + "step": 4930 + }, + { + "epoch": 1.0518344709897611, + "grad_norm": 0.31053465537455516, + "learning_rate": 0.00015877244008831543, + "loss": 0.9524, + "step": 4931 + }, + { + "epoch": 1.0520477815699658, + "grad_norm": 0.30528466048639846, + "learning_rate": 0.00015877095460827456, + "loss": 0.9828, + "step": 4932 + }, + { + "epoch": 1.0522610921501707, + "grad_norm": 0.31846941515357263, + "learning_rate": 0.00015876946823693903, + "loss": 0.9744, + "step": 4933 + }, + { + "epoch": 1.0524744027303754, + "grad_norm": 0.29415019357593464, + "learning_rate": 0.00015876798097432568, + "loss": 0.9994, + "step": 4934 + }, + { + "epoch": 1.0526877133105803, + "grad_norm": 0.29906177325099464, + "learning_rate": 0.00015876649282045127, + "loss": 1.0015, + "step": 4935 + }, + { + "epoch": 1.052901023890785, + "grad_norm": 0.2791093205185652, + "learning_rate": 0.00015876500377533267, + "loss": 0.9465, + "step": 4936 + }, + { + "epoch": 1.0531143344709897, + "grad_norm": 0.29062596599358276, + "learning_rate": 0.0001587635138389867, + "loss": 0.9846, + "step": 4937 + }, + { + "epoch": 1.0533276450511946, + "grad_norm": 0.35190831045027077, + "learning_rate": 0.00015876202301143028, + "loss": 0.9592, + "step": 4938 + }, + { + "epoch": 1.0535409556313993, + "grad_norm": 0.3433656899462568, + "learning_rate": 0.00015876053129268024, + "loss": 0.9916, + "step": 4939 + }, + { + "epoch": 1.0537542662116042, + "grad_norm": 0.3758899465910502, + "learning_rate": 0.00015875903868275344, + "loss": 0.9622, + "step": 4940 + }, + { + "epoch": 1.0539675767918089, + "grad_norm": 0.49153672520049374, + "learning_rate": 0.00015875754518166683, + "loss": 0.9819, + "step": 4941 + }, + { + "epoch": 1.0541808873720138, + "grad_norm": 0.5020189940543425, + "learning_rate": 0.00015875605078943727, + "loss": 0.9675, + "step": 4942 + }, + { + "epoch": 1.0543941979522184, + "grad_norm": 0.4815570135212534, + "learning_rate": 0.00015875455550608164, + "loss": 0.9668, + "step": 4943 + }, + { + "epoch": 1.0546075085324231, + "grad_norm": 0.527874917165855, + "learning_rate": 0.00015875305933161688, + "loss": 0.9754, + "step": 4944 + }, + { + "epoch": 1.054820819112628, + "grad_norm": 0.48164903603282955, + "learning_rate": 0.00015875156226605996, + "loss": 0.9646, + "step": 4945 + }, + { + "epoch": 1.0550341296928327, + "grad_norm": 0.4313841194932857, + "learning_rate": 0.00015875006430942777, + "loss": 0.9917, + "step": 4946 + }, + { + "epoch": 1.0552474402730376, + "grad_norm": 0.4318086705605105, + "learning_rate": 0.0001587485654617373, + "loss": 0.9753, + "step": 4947 + }, + { + "epoch": 1.0554607508532423, + "grad_norm": 0.2752208810192915, + "learning_rate": 0.00015874706572300548, + "loss": 1.0078, + "step": 4948 + }, + { + "epoch": 1.055674061433447, + "grad_norm": 0.3069097837154426, + "learning_rate": 0.00015874556509324928, + "loss": 0.961, + "step": 4949 + }, + { + "epoch": 1.055887372013652, + "grad_norm": 0.41113152453218915, + "learning_rate": 0.00015874406357248568, + "loss": 0.9518, + "step": 4950 + }, + { + "epoch": 1.0561006825938566, + "grad_norm": 0.45578442801977903, + "learning_rate": 0.0001587425611607317, + "loss": 0.9829, + "step": 4951 + }, + { + "epoch": 1.0563139931740615, + "grad_norm": 0.3924267865989583, + "learning_rate": 0.0001587410578580043, + "loss": 0.9748, + "step": 4952 + }, + { + "epoch": 1.0565273037542662, + "grad_norm": 0.357341797482944, + "learning_rate": 0.0001587395536643205, + "loss": 0.9883, + "step": 4953 + }, + { + "epoch": 1.056740614334471, + "grad_norm": 0.3003219130753224, + "learning_rate": 0.00015873804857969735, + "loss": 0.9711, + "step": 4954 + }, + { + "epoch": 1.0569539249146758, + "grad_norm": 0.27593271035168615, + "learning_rate": 0.00015873654260415183, + "loss": 0.9682, + "step": 4955 + }, + { + "epoch": 1.0571672354948805, + "grad_norm": 0.3318381630772055, + "learning_rate": 0.00015873503573770104, + "loss": 0.9837, + "step": 4956 + }, + { + "epoch": 1.0573805460750854, + "grad_norm": 1.630161767037333, + "learning_rate": 0.00015873352798036194, + "loss": 0.9675, + "step": 4957 + }, + { + "epoch": 1.05759385665529, + "grad_norm": 0.3478301699829325, + "learning_rate": 0.00015873201933215168, + "loss": 0.9574, + "step": 4958 + }, + { + "epoch": 1.057807167235495, + "grad_norm": 0.6032488665094625, + "learning_rate": 0.0001587305097930873, + "loss": 0.9414, + "step": 4959 + }, + { + "epoch": 1.0580204778156996, + "grad_norm": 0.7075115080656905, + "learning_rate": 0.00015872899936318586, + "loss": 0.9782, + "step": 4960 + }, + { + "epoch": 1.0582337883959045, + "grad_norm": 0.677631755495181, + "learning_rate": 0.0001587274880424645, + "loss": 0.9902, + "step": 4961 + }, + { + "epoch": 1.0584470989761092, + "grad_norm": 0.5741330475327786, + "learning_rate": 0.00015872597583094027, + "loss": 1.011, + "step": 4962 + }, + { + "epoch": 1.058660409556314, + "grad_norm": 0.47888885052742375, + "learning_rate": 0.0001587244627286303, + "loss": 0.937, + "step": 4963 + }, + { + "epoch": 1.0588737201365188, + "grad_norm": 0.48684086457016384, + "learning_rate": 0.0001587229487355517, + "loss": 1.0029, + "step": 4964 + }, + { + "epoch": 1.0590870307167235, + "grad_norm": 0.5395247439664163, + "learning_rate": 0.00015872143385172163, + "loss": 0.9715, + "step": 4965 + }, + { + "epoch": 1.0593003412969284, + "grad_norm": 0.5394821537478565, + "learning_rate": 0.0001587199180771572, + "loss": 1.0097, + "step": 4966 + }, + { + "epoch": 1.059513651877133, + "grad_norm": 0.49961893896370846, + "learning_rate": 0.00015871840141187555, + "loss": 0.967, + "step": 4967 + }, + { + "epoch": 1.0597269624573378, + "grad_norm": 0.45425735559915564, + "learning_rate": 0.00015871688385589393, + "loss": 0.9988, + "step": 4968 + }, + { + "epoch": 1.0599402730375427, + "grad_norm": 0.5273551793833863, + "learning_rate": 0.00015871536540922938, + "loss": 0.9636, + "step": 4969 + }, + { + "epoch": 1.0601535836177474, + "grad_norm": 0.4596189576709201, + "learning_rate": 0.00015871384607189917, + "loss": 0.9759, + "step": 4970 + }, + { + "epoch": 1.0603668941979523, + "grad_norm": 0.37402179446902206, + "learning_rate": 0.00015871232584392046, + "loss": 0.9871, + "step": 4971 + }, + { + "epoch": 1.060580204778157, + "grad_norm": 0.4439437494683215, + "learning_rate": 0.00015871080472531045, + "loss": 0.9629, + "step": 4972 + }, + { + "epoch": 1.0607935153583619, + "grad_norm": 0.44578641534055546, + "learning_rate": 0.0001587092827160864, + "loss": 0.9638, + "step": 4973 + }, + { + "epoch": 1.0610068259385665, + "grad_norm": 0.4826878134203843, + "learning_rate": 0.00015870775981626547, + "loss": 0.951, + "step": 4974 + }, + { + "epoch": 1.0612201365187712, + "grad_norm": 0.5053293568674337, + "learning_rate": 0.00015870623602586492, + "loss": 1.0121, + "step": 4975 + }, + { + "epoch": 1.0614334470989761, + "grad_norm": 0.586187947175032, + "learning_rate": 0.00015870471134490197, + "loss": 0.995, + "step": 4976 + }, + { + "epoch": 1.0616467576791808, + "grad_norm": 0.5360408767355235, + "learning_rate": 0.0001587031857733939, + "loss": 0.9615, + "step": 4977 + }, + { + "epoch": 1.0618600682593857, + "grad_norm": 0.42969446235387737, + "learning_rate": 0.00015870165931135795, + "loss": 0.9676, + "step": 4978 + }, + { + "epoch": 1.0620733788395904, + "grad_norm": 0.38489982267204953, + "learning_rate": 0.0001587001319588114, + "loss": 0.9735, + "step": 4979 + }, + { + "epoch": 1.0622866894197953, + "grad_norm": 0.40922897604631736, + "learning_rate": 0.00015869860371577155, + "loss": 0.938, + "step": 4980 + }, + { + "epoch": 1.0625, + "grad_norm": 0.44449608398835994, + "learning_rate": 0.00015869707458225564, + "loss": 0.9746, + "step": 4981 + }, + { + "epoch": 1.0627133105802047, + "grad_norm": 0.5322418550713054, + "learning_rate": 0.00015869554455828105, + "loss": 1.0004, + "step": 4982 + }, + { + "epoch": 1.0629266211604096, + "grad_norm": 0.6889143250723561, + "learning_rate": 0.00015869401364386503, + "loss": 0.963, + "step": 4983 + }, + { + "epoch": 1.0631399317406143, + "grad_norm": 0.5025137354609706, + "learning_rate": 0.00015869248183902494, + "loss": 0.9766, + "step": 4984 + }, + { + "epoch": 1.0633532423208192, + "grad_norm": 0.5127886858810827, + "learning_rate": 0.00015869094914377805, + "loss": 0.975, + "step": 4985 + }, + { + "epoch": 1.0635665529010239, + "grad_norm": 0.4532190783916095, + "learning_rate": 0.0001586894155581418, + "loss": 0.9601, + "step": 4986 + }, + { + "epoch": 1.0637798634812288, + "grad_norm": 0.41465005054143594, + "learning_rate": 0.00015868788108213346, + "loss": 1.0001, + "step": 4987 + }, + { + "epoch": 1.0639931740614335, + "grad_norm": 0.2994421147615885, + "learning_rate": 0.00015868634571577042, + "loss": 1.0074, + "step": 4988 + }, + { + "epoch": 1.0642064846416381, + "grad_norm": 0.35994758118564907, + "learning_rate": 0.00015868480945907006, + "loss": 0.9861, + "step": 4989 + }, + { + "epoch": 1.064419795221843, + "grad_norm": 0.5008375643294769, + "learning_rate": 0.00015868327231204977, + "loss": 0.9791, + "step": 4990 + }, + { + "epoch": 1.0646331058020477, + "grad_norm": 0.46015085745746714, + "learning_rate": 0.0001586817342747269, + "loss": 0.9773, + "step": 4991 + }, + { + "epoch": 1.0648464163822526, + "grad_norm": 0.40700904506917895, + "learning_rate": 0.0001586801953471189, + "loss": 0.9945, + "step": 4992 + }, + { + "epoch": 1.0650597269624573, + "grad_norm": 0.39712581359040255, + "learning_rate": 0.00015867865552924315, + "loss": 1.0149, + "step": 4993 + }, + { + "epoch": 1.0652730375426622, + "grad_norm": 0.4342507926684983, + "learning_rate": 0.00015867711482111713, + "loss": 0.9703, + "step": 4994 + }, + { + "epoch": 1.065486348122867, + "grad_norm": 0.41974490607757825, + "learning_rate": 0.00015867557322275818, + "loss": 0.9861, + "step": 4995 + }, + { + "epoch": 1.0656996587030716, + "grad_norm": 0.321137569834343, + "learning_rate": 0.00015867403073418384, + "loss": 0.9972, + "step": 4996 + }, + { + "epoch": 1.0659129692832765, + "grad_norm": 0.35026193868789035, + "learning_rate": 0.0001586724873554115, + "loss": 0.9707, + "step": 4997 + }, + { + "epoch": 1.0661262798634812, + "grad_norm": 0.4098311920140812, + "learning_rate": 0.00015867094308645867, + "loss": 0.9652, + "step": 4998 + }, + { + "epoch": 1.066339590443686, + "grad_norm": 0.47325550831869856, + "learning_rate": 0.00015866939792734275, + "loss": 0.9783, + "step": 4999 + }, + { + "epoch": 1.0665529010238908, + "grad_norm": 0.4589993062211773, + "learning_rate": 0.00015866785187808132, + "loss": 0.963, + "step": 5000 + }, + { + "epoch": 1.0667662116040955, + "grad_norm": 0.5467847121400631, + "learning_rate": 0.0001586663049386918, + "loss": 1.0232, + "step": 5001 + }, + { + "epoch": 1.0669795221843004, + "grad_norm": 0.4607065475219987, + "learning_rate": 0.0001586647571091917, + "loss": 0.9649, + "step": 5002 + }, + { + "epoch": 1.067192832764505, + "grad_norm": 0.3839618200331465, + "learning_rate": 0.00015866320838959857, + "loss": 0.9683, + "step": 5003 + }, + { + "epoch": 1.06740614334471, + "grad_norm": 0.29330337346558283, + "learning_rate": 0.00015866165877992988, + "loss": 0.9947, + "step": 5004 + }, + { + "epoch": 1.0676194539249146, + "grad_norm": 0.3159869877639349, + "learning_rate": 0.00015866010828020322, + "loss": 0.9978, + "step": 5005 + }, + { + "epoch": 1.0678327645051195, + "grad_norm": 0.37667802884365675, + "learning_rate": 0.0001586585568904361, + "loss": 0.9463, + "step": 5006 + }, + { + "epoch": 1.0680460750853242, + "grad_norm": 0.3308920500547112, + "learning_rate": 0.00015865700461064613, + "loss": 0.9693, + "step": 5007 + }, + { + "epoch": 1.068259385665529, + "grad_norm": 0.31536506965154215, + "learning_rate": 0.0001586554514408508, + "loss": 0.9606, + "step": 5008 + }, + { + "epoch": 1.0684726962457338, + "grad_norm": 0.3721402360595859, + "learning_rate": 0.0001586538973810677, + "loss": 0.987, + "step": 5009 + }, + { + "epoch": 1.0686860068259385, + "grad_norm": 0.32017587970863454, + "learning_rate": 0.00015865234243131444, + "loss": 0.9815, + "step": 5010 + }, + { + "epoch": 1.0688993174061434, + "grad_norm": 0.3029000272942583, + "learning_rate": 0.00015865078659160861, + "loss": 0.9301, + "step": 5011 + }, + { + "epoch": 1.069112627986348, + "grad_norm": 0.36918258063834397, + "learning_rate": 0.00015864922986196778, + "loss": 1.0027, + "step": 5012 + }, + { + "epoch": 1.069325938566553, + "grad_norm": 0.340417193326197, + "learning_rate": 0.00015864767224240962, + "loss": 0.9736, + "step": 5013 + }, + { + "epoch": 1.0695392491467577, + "grad_norm": 0.40159186048277157, + "learning_rate": 0.00015864611373295174, + "loss": 0.9845, + "step": 5014 + }, + { + "epoch": 1.0697525597269624, + "grad_norm": 0.5020624554928419, + "learning_rate": 0.00015864455433361172, + "loss": 0.9496, + "step": 5015 + }, + { + "epoch": 1.0699658703071673, + "grad_norm": 0.5562933923424065, + "learning_rate": 0.00015864299404440727, + "loss": 0.9857, + "step": 5016 + }, + { + "epoch": 1.070179180887372, + "grad_norm": 0.6425619975530573, + "learning_rate": 0.00015864143286535603, + "loss": 1.012, + "step": 5017 + }, + { + "epoch": 1.0703924914675769, + "grad_norm": 0.7733138207404674, + "learning_rate": 0.00015863987079647561, + "loss": 0.9894, + "step": 5018 + }, + { + "epoch": 1.0706058020477816, + "grad_norm": 1.0161668168652547, + "learning_rate": 0.00015863830783778378, + "loss": 0.9825, + "step": 5019 + }, + { + "epoch": 1.0708191126279862, + "grad_norm": 1.0330256148311954, + "learning_rate": 0.00015863674398929812, + "loss": 0.9759, + "step": 5020 + }, + { + "epoch": 1.0710324232081911, + "grad_norm": 0.9365054119366679, + "learning_rate": 0.00015863517925103642, + "loss": 0.9781, + "step": 5021 + }, + { + "epoch": 1.0712457337883958, + "grad_norm": 0.8194076819498309, + "learning_rate": 0.00015863361362301634, + "loss": 0.9648, + "step": 5022 + }, + { + "epoch": 1.0714590443686007, + "grad_norm": 0.6505680326092382, + "learning_rate": 0.0001586320471052556, + "loss": 0.9855, + "step": 5023 + }, + { + "epoch": 1.0716723549488054, + "grad_norm": 0.4588450695411535, + "learning_rate": 0.0001586304796977719, + "loss": 0.9618, + "step": 5024 + }, + { + "epoch": 1.0718856655290103, + "grad_norm": 0.35260324237691715, + "learning_rate": 0.00015862891140058303, + "loss": 0.9824, + "step": 5025 + }, + { + "epoch": 1.072098976109215, + "grad_norm": 0.45778989476451376, + "learning_rate": 0.00015862734221370667, + "loss": 0.9661, + "step": 5026 + }, + { + "epoch": 1.0723122866894197, + "grad_norm": 0.6513090032811099, + "learning_rate": 0.00015862577213716063, + "loss": 0.9969, + "step": 5027 + }, + { + "epoch": 1.0725255972696246, + "grad_norm": 0.7788381553587813, + "learning_rate": 0.00015862420117096268, + "loss": 0.9854, + "step": 5028 + }, + { + "epoch": 1.0727389078498293, + "grad_norm": 0.829693520554591, + "learning_rate": 0.00015862262931513055, + "loss": 0.9693, + "step": 5029 + }, + { + "epoch": 1.0729522184300342, + "grad_norm": 0.9191142644898568, + "learning_rate": 0.00015862105656968203, + "loss": 0.9642, + "step": 5030 + }, + { + "epoch": 1.0731655290102389, + "grad_norm": 0.9382582350572722, + "learning_rate": 0.00015861948293463494, + "loss": 0.9925, + "step": 5031 + }, + { + "epoch": 1.0733788395904438, + "grad_norm": 0.8483418977499967, + "learning_rate": 0.0001586179084100071, + "loss": 0.9358, + "step": 5032 + }, + { + "epoch": 1.0735921501706485, + "grad_norm": 0.8709139755807628, + "learning_rate": 0.00015861633299581627, + "loss": 0.9636, + "step": 5033 + }, + { + "epoch": 1.0738054607508531, + "grad_norm": 0.945620569304958, + "learning_rate": 0.00015861475669208034, + "loss": 0.9616, + "step": 5034 + }, + { + "epoch": 1.074018771331058, + "grad_norm": 0.8666346423043413, + "learning_rate": 0.0001586131794988171, + "loss": 0.9904, + "step": 5035 + }, + { + "epoch": 1.0742320819112627, + "grad_norm": 0.6447209435601488, + "learning_rate": 0.00015861160141604442, + "loss": 0.9516, + "step": 5036 + }, + { + "epoch": 1.0744453924914676, + "grad_norm": 0.5138731935559803, + "learning_rate": 0.00015861002244378015, + "loss": 0.9729, + "step": 5037 + }, + { + "epoch": 1.0746587030716723, + "grad_norm": 0.37755245290145123, + "learning_rate": 0.00015860844258204214, + "loss": 0.9666, + "step": 5038 + }, + { + "epoch": 1.0748720136518772, + "grad_norm": 0.6010199689557867, + "learning_rate": 0.00015860686183084832, + "loss": 0.9457, + "step": 5039 + }, + { + "epoch": 1.075085324232082, + "grad_norm": 0.7077182259637261, + "learning_rate": 0.0001586052801902165, + "loss": 0.9656, + "step": 5040 + }, + { + "epoch": 1.0752986348122866, + "grad_norm": 0.4846827457327532, + "learning_rate": 0.0001586036976601646, + "loss": 0.9846, + "step": 5041 + }, + { + "epoch": 1.0755119453924915, + "grad_norm": 0.48301739313983877, + "learning_rate": 0.00015860211424071057, + "loss": 0.9928, + "step": 5042 + }, + { + "epoch": 1.0757252559726962, + "grad_norm": 0.5209443418607163, + "learning_rate": 0.0001586005299318723, + "loss": 0.9528, + "step": 5043 + }, + { + "epoch": 1.075938566552901, + "grad_norm": 0.3904623298872814, + "learning_rate": 0.00015859894473366765, + "loss": 1.0023, + "step": 5044 + }, + { + "epoch": 1.0761518771331058, + "grad_norm": 0.35818783595554526, + "learning_rate": 0.00015859735864611466, + "loss": 0.9756, + "step": 5045 + }, + { + "epoch": 1.0763651877133107, + "grad_norm": 0.36759535669273985, + "learning_rate": 0.00015859577166923126, + "loss": 0.988, + "step": 5046 + }, + { + "epoch": 1.0765784982935154, + "grad_norm": 0.45776847469322446, + "learning_rate": 0.00015859418380303537, + "loss": 0.9846, + "step": 5047 + }, + { + "epoch": 1.07679180887372, + "grad_norm": 0.5229260703625399, + "learning_rate": 0.00015859259504754495, + "loss": 0.9896, + "step": 5048 + }, + { + "epoch": 1.077005119453925, + "grad_norm": 0.554273941503222, + "learning_rate": 0.000158591005402778, + "loss": 0.9876, + "step": 5049 + }, + { + "epoch": 1.0772184300341296, + "grad_norm": 0.5395154334016427, + "learning_rate": 0.0001585894148687525, + "loss": 0.9779, + "step": 5050 + }, + { + "epoch": 1.0774317406143346, + "grad_norm": 0.5127638046902309, + "learning_rate": 0.00015858782344548646, + "loss": 0.9903, + "step": 5051 + }, + { + "epoch": 1.0776450511945392, + "grad_norm": 0.45155774248575836, + "learning_rate": 0.00015858623113299786, + "loss": 0.964, + "step": 5052 + }, + { + "epoch": 1.077858361774744, + "grad_norm": 0.433489474866753, + "learning_rate": 0.00015858463793130473, + "loss": 0.9896, + "step": 5053 + }, + { + "epoch": 1.0780716723549488, + "grad_norm": 0.31942847601600005, + "learning_rate": 0.00015858304384042511, + "loss": 0.9818, + "step": 5054 + }, + { + "epoch": 1.0782849829351535, + "grad_norm": 1.8140142487626039, + "learning_rate": 0.00015858144886037704, + "loss": 0.9926, + "step": 5055 + }, + { + "epoch": 1.0784982935153584, + "grad_norm": 0.42828573924405167, + "learning_rate": 0.00015857985299117856, + "loss": 0.9849, + "step": 5056 + }, + { + "epoch": 1.078711604095563, + "grad_norm": 0.7461190862312379, + "learning_rate": 0.0001585782562328477, + "loss": 0.9658, + "step": 5057 + }, + { + "epoch": 1.078924914675768, + "grad_norm": 0.6877270271173124, + "learning_rate": 0.00015857665858540253, + "loss": 0.9758, + "step": 5058 + }, + { + "epoch": 1.0791382252559727, + "grad_norm": 0.6742861204024195, + "learning_rate": 0.00015857506004886117, + "loss": 0.9908, + "step": 5059 + }, + { + "epoch": 1.0793515358361774, + "grad_norm": 0.7151088495229226, + "learning_rate": 0.0001585734606232417, + "loss": 0.9884, + "step": 5060 + }, + { + "epoch": 1.0795648464163823, + "grad_norm": 0.7189798768153108, + "learning_rate": 0.00015857186030856217, + "loss": 1.0138, + "step": 5061 + }, + { + "epoch": 1.079778156996587, + "grad_norm": 0.7292140859941045, + "learning_rate": 0.00015857025910484074, + "loss": 0.9827, + "step": 5062 + }, + { + "epoch": 1.0799914675767919, + "grad_norm": 0.6872077709825466, + "learning_rate": 0.0001585686570120955, + "loss": 1.0077, + "step": 5063 + }, + { + "epoch": 1.0802047781569966, + "grad_norm": 0.7066240474823532, + "learning_rate": 0.00015856705403034457, + "loss": 1.0261, + "step": 5064 + }, + { + "epoch": 1.0804180887372015, + "grad_norm": 0.5261695622031106, + "learning_rate": 0.00015856545015960612, + "loss": 0.9908, + "step": 5065 + }, + { + "epoch": 1.0806313993174061, + "grad_norm": 0.5218003997159609, + "learning_rate": 0.0001585638453998983, + "loss": 0.978, + "step": 5066 + }, + { + "epoch": 1.0808447098976108, + "grad_norm": 0.4247190484407767, + "learning_rate": 0.0001585622397512392, + "loss": 0.9912, + "step": 5067 + }, + { + "epoch": 1.0810580204778157, + "grad_norm": 0.3815810995333176, + "learning_rate": 0.00015856063321364704, + "loss": 0.9365, + "step": 5068 + }, + { + "epoch": 1.0812713310580204, + "grad_norm": 0.39822251316928525, + "learning_rate": 0.00015855902578714004, + "loss": 0.9733, + "step": 5069 + }, + { + "epoch": 1.0814846416382253, + "grad_norm": 0.4813377893804259, + "learning_rate": 0.00015855741747173633, + "loss": 0.98, + "step": 5070 + }, + { + "epoch": 1.08169795221843, + "grad_norm": 0.5033908691643907, + "learning_rate": 0.0001585558082674541, + "loss": 0.9774, + "step": 5071 + }, + { + "epoch": 1.0819112627986347, + "grad_norm": 0.501930560397822, + "learning_rate": 0.00015855419817431158, + "loss": 0.9959, + "step": 5072 + }, + { + "epoch": 1.0821245733788396, + "grad_norm": 0.5836052074964622, + "learning_rate": 0.000158552587192327, + "loss": 1.01, + "step": 5073 + }, + { + "epoch": 1.0823378839590443, + "grad_norm": 0.5309507126559092, + "learning_rate": 0.00015855097532151857, + "loss": 1.0015, + "step": 5074 + }, + { + "epoch": 1.0825511945392492, + "grad_norm": 0.4994833794012995, + "learning_rate": 0.0001585493625619045, + "loss": 0.9779, + "step": 5075 + }, + { + "epoch": 1.0827645051194539, + "grad_norm": 0.36300013152085114, + "learning_rate": 0.0001585477489135031, + "loss": 0.9851, + "step": 5076 + }, + { + "epoch": 1.0829778156996588, + "grad_norm": 0.3306821617422908, + "learning_rate": 0.00015854613437633262, + "loss": 0.9769, + "step": 5077 + }, + { + "epoch": 1.0831911262798635, + "grad_norm": 0.4350436188699504, + "learning_rate": 0.00015854451895041128, + "loss": 0.9799, + "step": 5078 + }, + { + "epoch": 1.0834044368600684, + "grad_norm": 0.3754182049463897, + "learning_rate": 0.00015854290263575739, + "loss": 0.9672, + "step": 5079 + }, + { + "epoch": 1.083617747440273, + "grad_norm": 0.3788957089270003, + "learning_rate": 0.00015854128543238926, + "loss": 0.9971, + "step": 5080 + }, + { + "epoch": 1.0838310580204777, + "grad_norm": 0.39469232898206186, + "learning_rate": 0.0001585396673403251, + "loss": 0.9569, + "step": 5081 + }, + { + "epoch": 1.0840443686006827, + "grad_norm": 0.41013087579608243, + "learning_rate": 0.00015853804835958338, + "loss": 0.9519, + "step": 5082 + }, + { + "epoch": 1.0842576791808873, + "grad_norm": 0.4024527539687724, + "learning_rate": 0.00015853642849018225, + "loss": 0.9851, + "step": 5083 + }, + { + "epoch": 1.0844709897610922, + "grad_norm": 0.32503005942804153, + "learning_rate": 0.00015853480773214012, + "loss": 0.9483, + "step": 5084 + }, + { + "epoch": 1.084684300341297, + "grad_norm": 0.3606581642618041, + "learning_rate": 0.00015853318608547532, + "loss": 0.9662, + "step": 5085 + }, + { + "epoch": 1.0848976109215016, + "grad_norm": 0.4034510255384294, + "learning_rate": 0.00015853156355020623, + "loss": 0.9833, + "step": 5086 + }, + { + "epoch": 1.0851109215017065, + "grad_norm": 0.5005419255427431, + "learning_rate": 0.00015852994012635117, + "loss": 0.985, + "step": 5087 + }, + { + "epoch": 1.0853242320819112, + "grad_norm": 0.4995333639019227, + "learning_rate": 0.00015852831581392849, + "loss": 0.989, + "step": 5088 + }, + { + "epoch": 1.085537542662116, + "grad_norm": 0.42417672379818805, + "learning_rate": 0.0001585266906129566, + "loss": 0.9367, + "step": 5089 + }, + { + "epoch": 1.0857508532423208, + "grad_norm": 0.30578853835857106, + "learning_rate": 0.0001585250645234539, + "loss": 1.0219, + "step": 5090 + }, + { + "epoch": 1.0859641638225257, + "grad_norm": 0.4674827854450124, + "learning_rate": 0.00015852343754543875, + "loss": 0.9695, + "step": 5091 + }, + { + "epoch": 1.0861774744027304, + "grad_norm": 0.4832713453674351, + "learning_rate": 0.0001585218096789296, + "loss": 0.9873, + "step": 5092 + }, + { + "epoch": 1.086390784982935, + "grad_norm": 0.34693765729700554, + "learning_rate": 0.00015852018092394485, + "loss": 0.9334, + "step": 5093 + }, + { + "epoch": 1.08660409556314, + "grad_norm": 0.3334371230206452, + "learning_rate": 0.00015851855128050293, + "loss": 0.9927, + "step": 5094 + }, + { + "epoch": 1.0868174061433447, + "grad_norm": 0.4405000058019614, + "learning_rate": 0.0001585169207486223, + "loss": 0.9606, + "step": 5095 + }, + { + "epoch": 1.0870307167235496, + "grad_norm": 0.4453154690544994, + "learning_rate": 0.0001585152893283214, + "loss": 0.9393, + "step": 5096 + }, + { + "epoch": 1.0872440273037542, + "grad_norm": 0.48249796544681095, + "learning_rate": 0.00015851365701961863, + "loss": 0.9495, + "step": 5097 + }, + { + "epoch": 1.0874573378839592, + "grad_norm": 0.515308922845146, + "learning_rate": 0.00015851202382253255, + "loss": 0.9648, + "step": 5098 + }, + { + "epoch": 1.0876706484641638, + "grad_norm": 0.41174437698119404, + "learning_rate": 0.00015851038973708158, + "loss": 0.9532, + "step": 5099 + }, + { + "epoch": 1.0878839590443685, + "grad_norm": 0.4230838998423623, + "learning_rate": 0.00015850875476328423, + "loss": 0.9414, + "step": 5100 + }, + { + "epoch": 1.0880972696245734, + "grad_norm": 0.40374748430053004, + "learning_rate": 0.000158507118901159, + "loss": 0.9976, + "step": 5101 + }, + { + "epoch": 1.088310580204778, + "grad_norm": 0.3243969795889086, + "learning_rate": 0.00015850548215072443, + "loss": 0.9606, + "step": 5102 + }, + { + "epoch": 1.088523890784983, + "grad_norm": 0.2691091221146735, + "learning_rate": 0.00015850384451199898, + "loss": 0.9631, + "step": 5103 + }, + { + "epoch": 1.0887372013651877, + "grad_norm": 0.4444782880508142, + "learning_rate": 0.0001585022059850012, + "loss": 0.991, + "step": 5104 + }, + { + "epoch": 1.0889505119453924, + "grad_norm": 0.3031613672395042, + "learning_rate": 0.00015850056656974967, + "loss": 0.9399, + "step": 5105 + }, + { + "epoch": 1.0891638225255973, + "grad_norm": 0.317164557392565, + "learning_rate": 0.00015849892626626288, + "loss": 0.9498, + "step": 5106 + }, + { + "epoch": 1.089377133105802, + "grad_norm": 0.31515352299132615, + "learning_rate": 0.00015849728507455943, + "loss": 0.9739, + "step": 5107 + }, + { + "epoch": 1.0895904436860069, + "grad_norm": 0.33498672840148974, + "learning_rate": 0.0001584956429946579, + "loss": 0.9562, + "step": 5108 + }, + { + "epoch": 1.0898037542662116, + "grad_norm": 0.33193050851119654, + "learning_rate": 0.00015849400002657682, + "loss": 0.9811, + "step": 5109 + }, + { + "epoch": 1.0900170648464165, + "grad_norm": 0.26632083332086215, + "learning_rate": 0.00015849235617033483, + "loss": 0.9586, + "step": 5110 + }, + { + "epoch": 1.0902303754266212, + "grad_norm": 0.32339047718116026, + "learning_rate": 0.0001584907114259505, + "loss": 0.9501, + "step": 5111 + }, + { + "epoch": 1.0904436860068258, + "grad_norm": 0.31881838332149615, + "learning_rate": 0.00015848906579344244, + "loss": 0.9856, + "step": 5112 + }, + { + "epoch": 1.0906569965870307, + "grad_norm": 0.3055166467233763, + "learning_rate": 0.00015848741927282928, + "loss": 0.9612, + "step": 5113 + }, + { + "epoch": 1.0908703071672354, + "grad_norm": 0.3541755232967964, + "learning_rate": 0.00015848577186412964, + "loss": 0.9822, + "step": 5114 + }, + { + "epoch": 1.0910836177474403, + "grad_norm": 0.4319788853135583, + "learning_rate": 0.0001584841235673622, + "loss": 0.977, + "step": 5115 + }, + { + "epoch": 1.091296928327645, + "grad_norm": 0.5321111260639664, + "learning_rate": 0.00015848247438254558, + "loss": 0.997, + "step": 5116 + }, + { + "epoch": 1.09151023890785, + "grad_norm": 0.6854314194699584, + "learning_rate": 0.00015848082430969845, + "loss": 0.9468, + "step": 5117 + }, + { + "epoch": 1.0917235494880546, + "grad_norm": 0.8371886512542835, + "learning_rate": 0.00015847917334883946, + "loss": 0.9861, + "step": 5118 + }, + { + "epoch": 1.0919368600682593, + "grad_norm": 0.9845112663213721, + "learning_rate": 0.00015847752149998728, + "loss": 0.9832, + "step": 5119 + }, + { + "epoch": 1.0921501706484642, + "grad_norm": 1.070313297550182, + "learning_rate": 0.00015847586876316066, + "loss": 0.9487, + "step": 5120 + }, + { + "epoch": 1.0923634812286689, + "grad_norm": 0.8869886138326157, + "learning_rate": 0.00015847421513837825, + "loss": 0.9607, + "step": 5121 + }, + { + "epoch": 1.0925767918088738, + "grad_norm": 0.6297370987077845, + "learning_rate": 0.00015847256062565877, + "loss": 0.9907, + "step": 5122 + }, + { + "epoch": 1.0927901023890785, + "grad_norm": 0.37655499164397027, + "learning_rate": 0.00015847090522502094, + "loss": 0.9831, + "step": 5123 + }, + { + "epoch": 1.0930034129692832, + "grad_norm": 0.342459531633411, + "learning_rate": 0.00015846924893648352, + "loss": 0.9288, + "step": 5124 + }, + { + "epoch": 1.093216723549488, + "grad_norm": 0.48340076805273136, + "learning_rate": 0.0001584675917600652, + "loss": 0.97, + "step": 5125 + }, + { + "epoch": 1.0934300341296928, + "grad_norm": 0.6306078009934662, + "learning_rate": 0.0001584659336957848, + "loss": 0.9519, + "step": 5126 + }, + { + "epoch": 1.0936433447098977, + "grad_norm": 0.7159421579338014, + "learning_rate": 0.000158464274743661, + "loss": 0.9765, + "step": 5127 + }, + { + "epoch": 1.0938566552901023, + "grad_norm": 0.8414438453284763, + "learning_rate": 0.00015846261490371264, + "loss": 0.9792, + "step": 5128 + }, + { + "epoch": 1.0940699658703072, + "grad_norm": 0.8182294718385574, + "learning_rate": 0.00015846095417595846, + "loss": 0.9693, + "step": 5129 + }, + { + "epoch": 1.094283276450512, + "grad_norm": 0.7655469032223545, + "learning_rate": 0.00015845929256041725, + "loss": 0.9893, + "step": 5130 + }, + { + "epoch": 1.0944965870307168, + "grad_norm": 0.6732696907037149, + "learning_rate": 0.00015845763005710786, + "loss": 0.9567, + "step": 5131 + }, + { + "epoch": 1.0947098976109215, + "grad_norm": 0.5435870750466913, + "learning_rate": 0.00015845596666604905, + "loss": 0.9603, + "step": 5132 + }, + { + "epoch": 1.0949232081911262, + "grad_norm": 0.3919741044421809, + "learning_rate": 0.00015845430238725964, + "loss": 0.9668, + "step": 5133 + }, + { + "epoch": 1.0951365187713311, + "grad_norm": 0.4008388283494979, + "learning_rate": 0.0001584526372207585, + "loss": 0.9662, + "step": 5134 + }, + { + "epoch": 1.0953498293515358, + "grad_norm": 0.537440514741269, + "learning_rate": 0.00015845097116656446, + "loss": 0.9821, + "step": 5135 + }, + { + "epoch": 1.0955631399317407, + "grad_norm": 0.47649447859044286, + "learning_rate": 0.0001584493042246963, + "loss": 0.958, + "step": 5136 + }, + { + "epoch": 1.0957764505119454, + "grad_norm": 0.5202878181871206, + "learning_rate": 0.000158447636395173, + "loss": 0.967, + "step": 5137 + }, + { + "epoch": 1.09598976109215, + "grad_norm": 0.6208178965635411, + "learning_rate": 0.00015844596767801338, + "loss": 0.9718, + "step": 5138 + }, + { + "epoch": 1.096203071672355, + "grad_norm": 0.5722243696700535, + "learning_rate": 0.0001584442980732363, + "loss": 0.9763, + "step": 5139 + }, + { + "epoch": 1.0964163822525597, + "grad_norm": 0.46201193446158845, + "learning_rate": 0.00015844262758086066, + "loss": 0.9824, + "step": 5140 + }, + { + "epoch": 1.0966296928327646, + "grad_norm": 0.40198771341001543, + "learning_rate": 0.00015844095620090538, + "loss": 0.9646, + "step": 5141 + }, + { + "epoch": 1.0968430034129693, + "grad_norm": 0.4318571816157608, + "learning_rate": 0.00015843928393338935, + "loss": 0.9714, + "step": 5142 + }, + { + "epoch": 1.0970563139931742, + "grad_norm": 0.3589875319283552, + "learning_rate": 0.00015843761077833152, + "loss": 0.9826, + "step": 5143 + }, + { + "epoch": 1.0972696245733788, + "grad_norm": 0.36147228905067946, + "learning_rate": 0.0001584359367357508, + "loss": 0.9557, + "step": 5144 + }, + { + "epoch": 1.0974829351535835, + "grad_norm": 0.4175489950331197, + "learning_rate": 0.00015843426180566612, + "loss": 0.9871, + "step": 5145 + }, + { + "epoch": 1.0976962457337884, + "grad_norm": 0.4212318659272673, + "learning_rate": 0.00015843258598809644, + "loss": 0.9658, + "step": 5146 + }, + { + "epoch": 1.0979095563139931, + "grad_norm": 0.5442518559779913, + "learning_rate": 0.00015843090928306078, + "loss": 0.9539, + "step": 5147 + }, + { + "epoch": 1.098122866894198, + "grad_norm": 0.5782339445862741, + "learning_rate": 0.00015842923169057802, + "loss": 0.9628, + "step": 5148 + }, + { + "epoch": 1.0983361774744027, + "grad_norm": 0.544955190616171, + "learning_rate": 0.0001584275532106672, + "loss": 0.9766, + "step": 5149 + }, + { + "epoch": 1.0985494880546076, + "grad_norm": 0.5785215195722165, + "learning_rate": 0.00015842587384334728, + "loss": 1.0083, + "step": 5150 + }, + { + "epoch": 1.0987627986348123, + "grad_norm": 0.6702186257651512, + "learning_rate": 0.0001584241935886373, + "loss": 0.9723, + "step": 5151 + }, + { + "epoch": 1.098976109215017, + "grad_norm": 0.7150921912235216, + "learning_rate": 0.00015842251244655627, + "loss": 0.9606, + "step": 5152 + }, + { + "epoch": 1.099189419795222, + "grad_norm": 0.6458159308409074, + "learning_rate": 0.00015842083041712316, + "loss": 0.9882, + "step": 5153 + }, + { + "epoch": 1.0994027303754266, + "grad_norm": 0.5420607428691251, + "learning_rate": 0.00015841914750035704, + "loss": 0.9883, + "step": 5154 + }, + { + "epoch": 1.0996160409556315, + "grad_norm": 0.3683713142844263, + "learning_rate": 0.00015841746369627693, + "loss": 0.9582, + "step": 5155 + }, + { + "epoch": 1.0998293515358362, + "grad_norm": 0.3416383437277355, + "learning_rate": 0.00015841577900490192, + "loss": 0.9535, + "step": 5156 + }, + { + "epoch": 1.1000426621160408, + "grad_norm": 0.38536945512159887, + "learning_rate": 0.00015841409342625103, + "loss": 0.9764, + "step": 5157 + }, + { + "epoch": 1.1002559726962458, + "grad_norm": 0.44602881070042516, + "learning_rate": 0.00015841240696034338, + "loss": 0.9449, + "step": 5158 + }, + { + "epoch": 1.1004692832764504, + "grad_norm": 0.4716063609707291, + "learning_rate": 0.00015841071960719804, + "loss": 0.9716, + "step": 5159 + }, + { + "epoch": 1.1006825938566553, + "grad_norm": 0.5271357231082135, + "learning_rate": 0.00015840903136683405, + "loss": 0.9493, + "step": 5160 + }, + { + "epoch": 1.10089590443686, + "grad_norm": 0.5373121487204002, + "learning_rate": 0.00015840734223927056, + "loss": 0.9817, + "step": 5161 + }, + { + "epoch": 1.101109215017065, + "grad_norm": 0.44694053325584127, + "learning_rate": 0.0001584056522245267, + "loss": 0.9693, + "step": 5162 + }, + { + "epoch": 1.1013225255972696, + "grad_norm": 0.5232632188189785, + "learning_rate": 0.00015840396132262154, + "loss": 0.992, + "step": 5163 + }, + { + "epoch": 1.1015358361774743, + "grad_norm": 0.4689368305509511, + "learning_rate": 0.00015840226953357423, + "loss": 0.9608, + "step": 5164 + }, + { + "epoch": 1.1017491467576792, + "grad_norm": 0.35146516646760095, + "learning_rate": 0.00015840057685740394, + "loss": 0.9615, + "step": 5165 + }, + { + "epoch": 1.101962457337884, + "grad_norm": 0.3251197628802595, + "learning_rate": 0.0001583988832941298, + "loss": 0.974, + "step": 5166 + }, + { + "epoch": 1.1021757679180888, + "grad_norm": 0.3022120003839412, + "learning_rate": 0.00015839718884377098, + "loss": 0.9981, + "step": 5167 + }, + { + "epoch": 1.1023890784982935, + "grad_norm": 0.3478180620031825, + "learning_rate": 0.00015839549350634666, + "loss": 0.9509, + "step": 5168 + }, + { + "epoch": 1.1026023890784984, + "grad_norm": 0.5978807177779047, + "learning_rate": 0.000158393797281876, + "loss": 0.9466, + "step": 5169 + }, + { + "epoch": 1.102815699658703, + "grad_norm": 0.35345513899383935, + "learning_rate": 0.0001583921001703782, + "loss": 0.9773, + "step": 5170 + }, + { + "epoch": 1.1030290102389078, + "grad_norm": 0.35656275694431394, + "learning_rate": 0.00015839040217187247, + "loss": 0.9599, + "step": 5171 + }, + { + "epoch": 1.1032423208191127, + "grad_norm": 0.5131016098752965, + "learning_rate": 0.00015838870328637803, + "loss": 0.9724, + "step": 5172 + }, + { + "epoch": 1.1034556313993173, + "grad_norm": 0.5330865101127377, + "learning_rate": 0.00015838700351391408, + "loss": 0.9432, + "step": 5173 + }, + { + "epoch": 1.1036689419795223, + "grad_norm": 0.522357261540406, + "learning_rate": 0.0001583853028544999, + "loss": 0.9492, + "step": 5174 + }, + { + "epoch": 1.103882252559727, + "grad_norm": 0.4611938432378947, + "learning_rate": 0.00015838360130815467, + "loss": 1.0142, + "step": 5175 + }, + { + "epoch": 1.1040955631399316, + "grad_norm": 0.37916268631689193, + "learning_rate": 0.00015838189887489766, + "loss": 0.9463, + "step": 5176 + }, + { + "epoch": 1.1043088737201365, + "grad_norm": 0.28256083880439664, + "learning_rate": 0.00015838019555474816, + "loss": 0.9551, + "step": 5177 + }, + { + "epoch": 1.1045221843003412, + "grad_norm": 0.3495225900241124, + "learning_rate": 0.00015837849134772545, + "loss": 0.981, + "step": 5178 + }, + { + "epoch": 1.1047354948805461, + "grad_norm": 0.4538883030527365, + "learning_rate": 0.00015837678625384876, + "loss": 0.9805, + "step": 5179 + }, + { + "epoch": 1.1049488054607508, + "grad_norm": 0.5246436697071563, + "learning_rate": 0.00015837508027313744, + "loss": 0.9595, + "step": 5180 + }, + { + "epoch": 1.1051621160409557, + "grad_norm": 0.6781466452282856, + "learning_rate": 0.00015837337340561075, + "loss": 0.9839, + "step": 5181 + }, + { + "epoch": 1.1053754266211604, + "grad_norm": 0.7248295009177392, + "learning_rate": 0.00015837166565128803, + "loss": 0.9898, + "step": 5182 + }, + { + "epoch": 1.1055887372013653, + "grad_norm": 0.8003933339087215, + "learning_rate": 0.00015836995701018857, + "loss": 0.9478, + "step": 5183 + }, + { + "epoch": 1.10580204778157, + "grad_norm": 0.7898441274875211, + "learning_rate": 0.0001583682474823318, + "loss": 0.9475, + "step": 5184 + }, + { + "epoch": 1.1060153583617747, + "grad_norm": 0.7840704387171915, + "learning_rate": 0.00015836653706773694, + "loss": 0.971, + "step": 5185 + }, + { + "epoch": 1.1062286689419796, + "grad_norm": 0.8045549003945489, + "learning_rate": 0.00015836482576642337, + "loss": 0.9781, + "step": 5186 + }, + { + "epoch": 1.1064419795221843, + "grad_norm": 0.7871681103879093, + "learning_rate": 0.00015836311357841053, + "loss": 0.953, + "step": 5187 + }, + { + "epoch": 1.1066552901023892, + "grad_norm": 0.6591790210306852, + "learning_rate": 0.0001583614005037177, + "loss": 0.9756, + "step": 5188 + }, + { + "epoch": 1.1068686006825939, + "grad_norm": 0.45857884076413363, + "learning_rate": 0.00015835968654236433, + "loss": 0.9761, + "step": 5189 + }, + { + "epoch": 1.1070819112627985, + "grad_norm": 0.32508090480949503, + "learning_rate": 0.0001583579716943698, + "loss": 0.9656, + "step": 5190 + }, + { + "epoch": 1.1072952218430034, + "grad_norm": 0.4518478905069013, + "learning_rate": 0.00015835625595975346, + "loss": 0.9305, + "step": 5191 + }, + { + "epoch": 1.1075085324232081, + "grad_norm": 0.616019962538181, + "learning_rate": 0.0001583545393385348, + "loss": 0.964, + "step": 5192 + }, + { + "epoch": 1.107721843003413, + "grad_norm": 0.7225023008582875, + "learning_rate": 0.00015835282183073322, + "loss": 0.9798, + "step": 5193 + }, + { + "epoch": 1.1079351535836177, + "grad_norm": 0.8009911171697938, + "learning_rate": 0.0001583511034363681, + "loss": 0.9431, + "step": 5194 + }, + { + "epoch": 1.1081484641638226, + "grad_norm": 0.7902094001046144, + "learning_rate": 0.00015834938415545897, + "loss": 0.9507, + "step": 5195 + }, + { + "epoch": 1.1083617747440273, + "grad_norm": 0.6934478240164178, + "learning_rate": 0.00015834766398802521, + "loss": 0.9571, + "step": 5196 + }, + { + "epoch": 1.108575085324232, + "grad_norm": 0.46470405504130013, + "learning_rate": 0.00015834594293408634, + "loss": 0.9643, + "step": 5197 + }, + { + "epoch": 1.108788395904437, + "grad_norm": 0.3145661312497474, + "learning_rate": 0.00015834422099366178, + "loss": 0.9492, + "step": 5198 + }, + { + "epoch": 1.1090017064846416, + "grad_norm": 0.5035167039980939, + "learning_rate": 0.00015834249816677106, + "loss": 0.9662, + "step": 5199 + }, + { + "epoch": 1.1092150170648465, + "grad_norm": 0.6802551186955211, + "learning_rate": 0.00015834077445343365, + "loss": 0.9259, + "step": 5200 + }, + { + "epoch": 1.1094283276450512, + "grad_norm": 0.729881893862927, + "learning_rate": 0.00015833904985366903, + "loss": 0.9615, + "step": 5201 + }, + { + "epoch": 1.109641638225256, + "grad_norm": 0.7462077666908739, + "learning_rate": 0.0001583373243674968, + "loss": 0.988, + "step": 5202 + }, + { + "epoch": 1.1098549488054608, + "grad_norm": 0.6698630589956109, + "learning_rate": 0.0001583355979949364, + "loss": 0.9567, + "step": 5203 + }, + { + "epoch": 1.1100682593856654, + "grad_norm": 0.4993490934592028, + "learning_rate": 0.0001583338707360074, + "loss": 0.9462, + "step": 5204 + }, + { + "epoch": 1.1102815699658704, + "grad_norm": 0.3770168788641509, + "learning_rate": 0.0001583321425907293, + "loss": 0.9718, + "step": 5205 + }, + { + "epoch": 1.110494880546075, + "grad_norm": 0.2948035913940668, + "learning_rate": 0.00015833041355912175, + "loss": 0.9911, + "step": 5206 + }, + { + "epoch": 1.11070819112628, + "grad_norm": 0.3524474836061004, + "learning_rate": 0.0001583286836412042, + "loss": 0.9756, + "step": 5207 + }, + { + "epoch": 1.1109215017064846, + "grad_norm": 0.4489963831117822, + "learning_rate": 0.0001583269528369963, + "loss": 0.989, + "step": 5208 + }, + { + "epoch": 1.1111348122866893, + "grad_norm": 0.5502705798520686, + "learning_rate": 0.00015832522114651762, + "loss": 0.9684, + "step": 5209 + }, + { + "epoch": 1.1113481228668942, + "grad_norm": 0.6309688083833744, + "learning_rate": 0.0001583234885697877, + "loss": 0.9619, + "step": 5210 + }, + { + "epoch": 1.111561433447099, + "grad_norm": 0.6049819699131304, + "learning_rate": 0.00015832175510682622, + "loss": 0.9667, + "step": 5211 + }, + { + "epoch": 1.1117747440273038, + "grad_norm": 0.5154948314986957, + "learning_rate": 0.00015832002075765277, + "loss": 0.9664, + "step": 5212 + }, + { + "epoch": 1.1119880546075085, + "grad_norm": 0.4568399065012902, + "learning_rate": 0.00015831828552228698, + "loss": 0.9519, + "step": 5213 + }, + { + "epoch": 1.1122013651877134, + "grad_norm": 0.3985379978529112, + "learning_rate": 0.00015831654940074845, + "loss": 0.98, + "step": 5214 + }, + { + "epoch": 1.112414675767918, + "grad_norm": 0.31776593260039304, + "learning_rate": 0.00015831481239305685, + "loss": 0.9508, + "step": 5215 + }, + { + "epoch": 1.1126279863481228, + "grad_norm": 0.3939095498263, + "learning_rate": 0.0001583130744992318, + "loss": 0.9631, + "step": 5216 + }, + { + "epoch": 1.1128412969283277, + "grad_norm": 0.41836483683559045, + "learning_rate": 0.00015831133571929306, + "loss": 0.9424, + "step": 5217 + }, + { + "epoch": 1.1130546075085324, + "grad_norm": 0.31776356704402003, + "learning_rate": 0.0001583095960532602, + "loss": 1.0113, + "step": 5218 + }, + { + "epoch": 1.1132679180887373, + "grad_norm": 0.37720262827162926, + "learning_rate": 0.00015830785550115297, + "loss": 0.9564, + "step": 5219 + }, + { + "epoch": 1.113481228668942, + "grad_norm": 0.3360779221186837, + "learning_rate": 0.000158306114062991, + "loss": 0.9982, + "step": 5220 + }, + { + "epoch": 1.1136945392491469, + "grad_norm": 0.4174955164533953, + "learning_rate": 0.00015830437173879405, + "loss": 0.9401, + "step": 5221 + }, + { + "epoch": 1.1139078498293515, + "grad_norm": 0.6001932648308631, + "learning_rate": 0.0001583026285285818, + "loss": 0.9793, + "step": 5222 + }, + { + "epoch": 1.1141211604095562, + "grad_norm": 0.6269531303724153, + "learning_rate": 0.00015830088443237402, + "loss": 0.9586, + "step": 5223 + }, + { + "epoch": 1.1143344709897611, + "grad_norm": 0.5112343967633494, + "learning_rate": 0.0001582991394501904, + "loss": 0.9727, + "step": 5224 + }, + { + "epoch": 1.1145477815699658, + "grad_norm": 0.4627876053611048, + "learning_rate": 0.0001582973935820507, + "loss": 0.9714, + "step": 5225 + }, + { + "epoch": 1.1147610921501707, + "grad_norm": 0.42803527122566265, + "learning_rate": 0.00015829564682797467, + "loss": 0.9333, + "step": 5226 + }, + { + "epoch": 1.1149744027303754, + "grad_norm": 0.38128272045696077, + "learning_rate": 0.00015829389918798207, + "loss": 0.9569, + "step": 5227 + }, + { + "epoch": 1.1151877133105803, + "grad_norm": 0.45222115389719936, + "learning_rate": 0.00015829215066209273, + "loss": 0.9209, + "step": 5228 + }, + { + "epoch": 1.115401023890785, + "grad_norm": 0.4278188368054265, + "learning_rate": 0.00015829040125032634, + "loss": 0.9762, + "step": 5229 + }, + { + "epoch": 1.1156143344709897, + "grad_norm": 0.4929505770327576, + "learning_rate": 0.00015828865095270276, + "loss": 1.0103, + "step": 5230 + }, + { + "epoch": 1.1158276450511946, + "grad_norm": 0.7029850560014774, + "learning_rate": 0.0001582868997692418, + "loss": 0.947, + "step": 5231 + }, + { + "epoch": 1.1160409556313993, + "grad_norm": 0.8068092840920597, + "learning_rate": 0.00015828514769996321, + "loss": 0.9753, + "step": 5232 + }, + { + "epoch": 1.1162542662116042, + "grad_norm": 0.8405809433990497, + "learning_rate": 0.00015828339474488687, + "loss": 0.9937, + "step": 5233 + }, + { + "epoch": 1.1164675767918089, + "grad_norm": 0.9226895516233227, + "learning_rate": 0.00015828164090403262, + "loss": 0.9957, + "step": 5234 + }, + { + "epoch": 1.1166808873720138, + "grad_norm": 0.9837783544443888, + "learning_rate": 0.00015827988617742028, + "loss": 0.9943, + "step": 5235 + }, + { + "epoch": 1.1168941979522184, + "grad_norm": 0.8903226256348216, + "learning_rate": 0.0001582781305650697, + "loss": 0.9912, + "step": 5236 + }, + { + "epoch": 1.1171075085324231, + "grad_norm": 0.742589990841128, + "learning_rate": 0.00015827637406700076, + "loss": 0.9674, + "step": 5237 + }, + { + "epoch": 1.117320819112628, + "grad_norm": 0.42460429188213544, + "learning_rate": 0.00015827461668323336, + "loss": 0.9878, + "step": 5238 + }, + { + "epoch": 1.1175341296928327, + "grad_norm": 0.2914909416086188, + "learning_rate": 0.0001582728584137873, + "loss": 0.9685, + "step": 5239 + }, + { + "epoch": 1.1177474402730376, + "grad_norm": 0.4480054243059658, + "learning_rate": 0.00015827109925868258, + "loss": 0.9681, + "step": 5240 + }, + { + "epoch": 1.1179607508532423, + "grad_norm": 0.6411342272878546, + "learning_rate": 0.00015826933921793903, + "loss": 0.9453, + "step": 5241 + }, + { + "epoch": 1.118174061433447, + "grad_norm": 0.837076154709747, + "learning_rate": 0.0001582675782915766, + "loss": 0.9788, + "step": 5242 + }, + { + "epoch": 1.118387372013652, + "grad_norm": 0.9175235946696744, + "learning_rate": 0.0001582658164796152, + "loss": 0.9465, + "step": 5243 + }, + { + "epoch": 1.1186006825938566, + "grad_norm": 0.8796856418277745, + "learning_rate": 0.00015826405378207477, + "loss": 0.9746, + "step": 5244 + }, + { + "epoch": 1.1188139931740615, + "grad_norm": 0.7433523564242196, + "learning_rate": 0.00015826229019897525, + "loss": 0.9759, + "step": 5245 + }, + { + "epoch": 1.1190273037542662, + "grad_norm": 0.5131128761654546, + "learning_rate": 0.00015826052573033662, + "loss": 0.9665, + "step": 5246 + }, + { + "epoch": 1.119240614334471, + "grad_norm": 0.2858318539317425, + "learning_rate": 0.00015825876037617878, + "loss": 0.9654, + "step": 5247 + }, + { + "epoch": 1.1194539249146758, + "grad_norm": 0.3277316086654533, + "learning_rate": 0.00015825699413652179, + "loss": 0.9706, + "step": 5248 + }, + { + "epoch": 1.1196672354948805, + "grad_norm": 0.4830424996604273, + "learning_rate": 0.0001582552270113856, + "loss": 0.9819, + "step": 5249 + }, + { + "epoch": 1.1198805460750854, + "grad_norm": 0.5217294286963114, + "learning_rate": 0.00015825345900079017, + "loss": 0.9702, + "step": 5250 + }, + { + "epoch": 1.12009385665529, + "grad_norm": 0.49146273964830295, + "learning_rate": 0.00015825169010475555, + "loss": 0.9633, + "step": 5251 + }, + { + "epoch": 1.120307167235495, + "grad_norm": 0.36160701507242066, + "learning_rate": 0.00015824992032330176, + "loss": 0.9367, + "step": 5252 + }, + { + "epoch": 1.1205204778156996, + "grad_norm": 0.28873821869829525, + "learning_rate": 0.00015824814965644877, + "loss": 0.9666, + "step": 5253 + }, + { + "epoch": 1.1207337883959045, + "grad_norm": 0.3093429537563535, + "learning_rate": 0.00015824637810421668, + "loss": 0.9734, + "step": 5254 + }, + { + "epoch": 1.1209470989761092, + "grad_norm": 0.3960562401184434, + "learning_rate": 0.0001582446056666255, + "loss": 0.9764, + "step": 5255 + }, + { + "epoch": 1.121160409556314, + "grad_norm": 0.49621011811980215, + "learning_rate": 0.00015824283234369528, + "loss": 0.9547, + "step": 5256 + }, + { + "epoch": 1.1213737201365188, + "grad_norm": 0.41178314138716626, + "learning_rate": 0.0001582410581354461, + "loss": 0.9786, + "step": 5257 + }, + { + "epoch": 1.1215870307167235, + "grad_norm": 0.32932948731082506, + "learning_rate": 0.000158239283041898, + "loss": 0.9812, + "step": 5258 + }, + { + "epoch": 1.1218003412969284, + "grad_norm": 0.3797546987121003, + "learning_rate": 0.00015823750706307115, + "loss": 0.9615, + "step": 5259 + }, + { + "epoch": 1.122013651877133, + "grad_norm": 0.3603792062700543, + "learning_rate": 0.00015823573019898557, + "loss": 0.9644, + "step": 5260 + }, + { + "epoch": 1.1222269624573378, + "grad_norm": 0.35599534188019855, + "learning_rate": 0.00015823395244966138, + "loss": 0.9409, + "step": 5261 + }, + { + "epoch": 1.1224402730375427, + "grad_norm": 0.3191731499320414, + "learning_rate": 0.00015823217381511872, + "loss": 0.9513, + "step": 5262 + }, + { + "epoch": 1.1226535836177474, + "grad_norm": 0.3083972899588669, + "learning_rate": 0.0001582303942953777, + "loss": 0.9893, + "step": 5263 + }, + { + "epoch": 1.1228668941979523, + "grad_norm": 0.2818069642495875, + "learning_rate": 0.0001582286138904584, + "loss": 0.9786, + "step": 5264 + }, + { + "epoch": 1.123080204778157, + "grad_norm": 0.330720161652369, + "learning_rate": 0.00015822683260038105, + "loss": 0.9742, + "step": 5265 + }, + { + "epoch": 1.1232935153583619, + "grad_norm": 0.4016402072245308, + "learning_rate": 0.0001582250504251658, + "loss": 0.9402, + "step": 5266 + }, + { + "epoch": 1.1235068259385665, + "grad_norm": 0.4546035082707565, + "learning_rate": 0.00015822326736483278, + "loss": 0.9901, + "step": 5267 + }, + { + "epoch": 1.1237201365187712, + "grad_norm": 0.44907469203987116, + "learning_rate": 0.00015822148341940214, + "loss": 0.9383, + "step": 5268 + }, + { + "epoch": 1.1239334470989761, + "grad_norm": 0.3926651858214823, + "learning_rate": 0.00015821969858889411, + "loss": 0.9686, + "step": 5269 + }, + { + "epoch": 1.1241467576791808, + "grad_norm": 0.4020065784760134, + "learning_rate": 0.0001582179128733289, + "loss": 0.9399, + "step": 5270 + }, + { + "epoch": 1.1243600682593857, + "grad_norm": 0.4160454624902317, + "learning_rate": 0.00015821612627272668, + "loss": 0.9654, + "step": 5271 + }, + { + "epoch": 1.1245733788395904, + "grad_norm": 0.39175255785952595, + "learning_rate": 0.0001582143387871077, + "loss": 0.98, + "step": 5272 + }, + { + "epoch": 1.1247866894197953, + "grad_norm": 0.35679006495476506, + "learning_rate": 0.0001582125504164921, + "loss": 0.9367, + "step": 5273 + }, + { + "epoch": 1.125, + "grad_norm": 0.3606888534917662, + "learning_rate": 0.0001582107611609002, + "loss": 0.9714, + "step": 5274 + }, + { + "epoch": 1.1252133105802047, + "grad_norm": 0.46446525683467305, + "learning_rate": 0.00015820897102035225, + "loss": 0.9797, + "step": 5275 + }, + { + "epoch": 1.1254266211604096, + "grad_norm": 0.5486896676855213, + "learning_rate": 0.00015820717999486845, + "loss": 0.9206, + "step": 5276 + }, + { + "epoch": 1.1256399317406143, + "grad_norm": 0.5724433670221923, + "learning_rate": 0.0001582053880844691, + "loss": 0.9419, + "step": 5277 + }, + { + "epoch": 1.1258532423208192, + "grad_norm": 0.5577569294050354, + "learning_rate": 0.00015820359528917447, + "loss": 0.9522, + "step": 5278 + }, + { + "epoch": 1.1260665529010239, + "grad_norm": 0.4870583194584129, + "learning_rate": 0.00015820180160900485, + "loss": 0.9337, + "step": 5279 + }, + { + "epoch": 1.1262798634812285, + "grad_norm": 0.41299715559440314, + "learning_rate": 0.00015820000704398052, + "loss": 0.9494, + "step": 5280 + }, + { + "epoch": 1.1264931740614335, + "grad_norm": 0.40293556399271246, + "learning_rate": 0.0001581982115941218, + "loss": 1.0041, + "step": 5281 + }, + { + "epoch": 1.1267064846416381, + "grad_norm": 0.51573369489272, + "learning_rate": 0.000158196415259449, + "loss": 0.9573, + "step": 5282 + }, + { + "epoch": 1.126919795221843, + "grad_norm": 0.7000021241154359, + "learning_rate": 0.00015819461803998246, + "loss": 0.9676, + "step": 5283 + }, + { + "epoch": 1.1271331058020477, + "grad_norm": 0.7147410260821585, + "learning_rate": 0.0001581928199357425, + "loss": 0.9425, + "step": 5284 + }, + { + "epoch": 1.1273464163822526, + "grad_norm": 0.5452118415163608, + "learning_rate": 0.00015819102094674943, + "loss": 0.9393, + "step": 5285 + }, + { + "epoch": 1.1275597269624573, + "grad_norm": 0.42768356094973875, + "learning_rate": 0.00015818922107302367, + "loss": 0.9831, + "step": 5286 + }, + { + "epoch": 1.1277730375426622, + "grad_norm": 0.3151807009901743, + "learning_rate": 0.00015818742031458555, + "loss": 0.975, + "step": 5287 + }, + { + "epoch": 1.127986348122867, + "grad_norm": 0.405319772620054, + "learning_rate": 0.00015818561867145546, + "loss": 0.9594, + "step": 5288 + }, + { + "epoch": 1.1281996587030716, + "grad_norm": 0.5246993133041175, + "learning_rate": 0.00015818381614365377, + "loss": 0.9439, + "step": 5289 + }, + { + "epoch": 1.1284129692832765, + "grad_norm": 0.5417902855408593, + "learning_rate": 0.0001581820127312009, + "loss": 0.9571, + "step": 5290 + }, + { + "epoch": 1.1286262798634812, + "grad_norm": 0.4952009160705999, + "learning_rate": 0.00015818020843411722, + "loss": 0.9757, + "step": 5291 + }, + { + "epoch": 1.128839590443686, + "grad_norm": 0.38627281239563255, + "learning_rate": 0.0001581784032524232, + "loss": 0.9569, + "step": 5292 + }, + { + "epoch": 1.1290529010238908, + "grad_norm": 0.3529925408607553, + "learning_rate": 0.00015817659718613921, + "loss": 0.9796, + "step": 5293 + }, + { + "epoch": 1.1292662116040955, + "grad_norm": 0.3701150005646464, + "learning_rate": 0.0001581747902352857, + "loss": 0.9416, + "step": 5294 + }, + { + "epoch": 1.1294795221843004, + "grad_norm": 0.33310093426151066, + "learning_rate": 0.00015817298239988314, + "loss": 0.9932, + "step": 5295 + }, + { + "epoch": 1.129692832764505, + "grad_norm": 0.41466835276287534, + "learning_rate": 0.00015817117367995196, + "loss": 0.9858, + "step": 5296 + }, + { + "epoch": 1.12990614334471, + "grad_norm": 0.426865035744716, + "learning_rate": 0.00015816936407551265, + "loss": 0.9662, + "step": 5297 + }, + { + "epoch": 1.1301194539249146, + "grad_norm": 0.3541455973011228, + "learning_rate": 0.00015816755358658566, + "loss": 0.9527, + "step": 5298 + }, + { + "epoch": 1.1303327645051195, + "grad_norm": 0.3404244547079378, + "learning_rate": 0.00015816574221319147, + "loss": 0.9611, + "step": 5299 + }, + { + "epoch": 1.1305460750853242, + "grad_norm": 0.43664563439018284, + "learning_rate": 0.0001581639299553506, + "loss": 0.9852, + "step": 5300 + }, + { + "epoch": 1.130759385665529, + "grad_norm": 0.3504163941810162, + "learning_rate": 0.0001581621168130836, + "loss": 0.9613, + "step": 5301 + }, + { + "epoch": 1.1309726962457338, + "grad_norm": 0.294992042120463, + "learning_rate": 0.00015816030278641087, + "loss": 0.9699, + "step": 5302 + }, + { + "epoch": 1.1311860068259385, + "grad_norm": 0.4046816965382651, + "learning_rate": 0.000158158487875353, + "loss": 0.978, + "step": 5303 + }, + { + "epoch": 1.1313993174061434, + "grad_norm": 0.7407977412447465, + "learning_rate": 0.00015815667207993056, + "loss": 0.9346, + "step": 5304 + }, + { + "epoch": 1.131612627986348, + "grad_norm": 0.6614598204906081, + "learning_rate": 0.00015815485540016406, + "loss": 0.9373, + "step": 5305 + }, + { + "epoch": 1.131825938566553, + "grad_norm": 0.8096400740325452, + "learning_rate": 0.00015815303783607402, + "loss": 0.9511, + "step": 5306 + }, + { + "epoch": 1.1320392491467577, + "grad_norm": 0.9284248726670196, + "learning_rate": 0.00015815121938768105, + "loss": 0.9514, + "step": 5307 + }, + { + "epoch": 1.1322525597269624, + "grad_norm": 0.9471434557347741, + "learning_rate": 0.00015814940005500576, + "loss": 0.9421, + "step": 5308 + }, + { + "epoch": 1.1324658703071673, + "grad_norm": 1.0043242620085557, + "learning_rate": 0.00015814757983806865, + "loss": 0.9729, + "step": 5309 + }, + { + "epoch": 1.132679180887372, + "grad_norm": 0.8785598199303005, + "learning_rate": 0.00015814575873689037, + "loss": 0.9323, + "step": 5310 + }, + { + "epoch": 1.1328924914675769, + "grad_norm": 0.7842971403475508, + "learning_rate": 0.00015814393675149153, + "loss": 0.9558, + "step": 5311 + }, + { + "epoch": 1.1331058020477816, + "grad_norm": 0.7584776360379419, + "learning_rate": 0.00015814211388189272, + "loss": 0.9912, + "step": 5312 + }, + { + "epoch": 1.1333191126279862, + "grad_norm": 0.8608020346476793, + "learning_rate": 0.00015814029012811457, + "loss": 0.9277, + "step": 5313 + }, + { + "epoch": 1.1335324232081911, + "grad_norm": 0.8324533513075609, + "learning_rate": 0.00015813846549017773, + "loss": 0.9865, + "step": 5314 + }, + { + "epoch": 1.1337457337883958, + "grad_norm": 0.5451573365668757, + "learning_rate": 0.00015813663996810286, + "loss": 0.9721, + "step": 5315 + }, + { + "epoch": 1.1339590443686007, + "grad_norm": 0.3783484479066225, + "learning_rate": 0.00015813481356191056, + "loss": 0.9581, + "step": 5316 + }, + { + "epoch": 1.1341723549488054, + "grad_norm": 0.5028759928490751, + "learning_rate": 0.00015813298627162157, + "loss": 0.9766, + "step": 5317 + }, + { + "epoch": 1.1343856655290103, + "grad_norm": 0.6104381009495764, + "learning_rate": 0.0001581311580972565, + "loss": 0.9495, + "step": 5318 + }, + { + "epoch": 1.134598976109215, + "grad_norm": 0.6019308772820059, + "learning_rate": 0.00015812932903883604, + "loss": 0.9638, + "step": 5319 + }, + { + "epoch": 1.13481228668942, + "grad_norm": 0.6437439647741989, + "learning_rate": 0.00015812749909638094, + "loss": 0.9671, + "step": 5320 + }, + { + "epoch": 1.1350255972696246, + "grad_norm": 0.6154008458616415, + "learning_rate": 0.00015812566826991186, + "loss": 0.9662, + "step": 5321 + }, + { + "epoch": 1.1352389078498293, + "grad_norm": 0.45819713892097375, + "learning_rate": 0.00015812383655944954, + "loss": 0.9533, + "step": 5322 + }, + { + "epoch": 1.1354522184300342, + "grad_norm": 0.31383235440845186, + "learning_rate": 0.0001581220039650147, + "loss": 0.9673, + "step": 5323 + }, + { + "epoch": 1.1356655290102389, + "grad_norm": 0.372441915995134, + "learning_rate": 0.00015812017048662803, + "loss": 0.9893, + "step": 5324 + }, + { + "epoch": 1.1358788395904438, + "grad_norm": 0.42377404539564323, + "learning_rate": 0.00015811833612431036, + "loss": 0.9601, + "step": 5325 + }, + { + "epoch": 1.1360921501706485, + "grad_norm": 0.39054494333727996, + "learning_rate": 0.0001581165008780824, + "loss": 0.976, + "step": 5326 + }, + { + "epoch": 1.1363054607508531, + "grad_norm": 0.33069656581029483, + "learning_rate": 0.0001581146647479649, + "loss": 0.9486, + "step": 5327 + }, + { + "epoch": 1.136518771331058, + "grad_norm": 0.291016076785268, + "learning_rate": 0.00015811282773397866, + "loss": 0.9657, + "step": 5328 + }, + { + "epoch": 1.1367320819112627, + "grad_norm": 0.3018310692658602, + "learning_rate": 0.00015811098983614446, + "loss": 0.9967, + "step": 5329 + }, + { + "epoch": 1.1369453924914676, + "grad_norm": 0.7312587551718224, + "learning_rate": 0.0001581091510544831, + "loss": 1.0214, + "step": 5330 + }, + { + "epoch": 1.1371587030716723, + "grad_norm": 0.433474650411033, + "learning_rate": 0.0001581073113890154, + "loss": 0.9833, + "step": 5331 + }, + { + "epoch": 1.137372013651877, + "grad_norm": 0.4337940641125684, + "learning_rate": 0.0001581054708397621, + "loss": 0.9779, + "step": 5332 + }, + { + "epoch": 1.137585324232082, + "grad_norm": 0.4193708847689592, + "learning_rate": 0.00015810362940674413, + "loss": 0.9711, + "step": 5333 + }, + { + "epoch": 1.1377986348122866, + "grad_norm": 0.40274068476714214, + "learning_rate": 0.00015810178708998228, + "loss": 0.967, + "step": 5334 + }, + { + "epoch": 1.1380119453924915, + "grad_norm": 0.4368434633470673, + "learning_rate": 0.0001580999438894974, + "loss": 1.0017, + "step": 5335 + }, + { + "epoch": 1.1382252559726962, + "grad_norm": 0.47293401293619897, + "learning_rate": 0.00015809809980531033, + "loss": 0.9488, + "step": 5336 + }, + { + "epoch": 1.138438566552901, + "grad_norm": 0.41787964205960054, + "learning_rate": 0.00015809625483744196, + "loss": 0.9889, + "step": 5337 + }, + { + "epoch": 1.1386518771331058, + "grad_norm": 0.38825529572359024, + "learning_rate": 0.00015809440898591312, + "loss": 0.9439, + "step": 5338 + }, + { + "epoch": 1.1388651877133107, + "grad_norm": 0.28960898911022986, + "learning_rate": 0.00015809256225074475, + "loss": 0.9684, + "step": 5339 + }, + { + "epoch": 1.1390784982935154, + "grad_norm": 0.3304665846332661, + "learning_rate": 0.00015809071463195772, + "loss": 0.9516, + "step": 5340 + }, + { + "epoch": 1.13929180887372, + "grad_norm": 0.3581655774044064, + "learning_rate": 0.00015808886612957295, + "loss": 0.9804, + "step": 5341 + }, + { + "epoch": 1.139505119453925, + "grad_norm": 0.4267593044383983, + "learning_rate": 0.00015808701674361133, + "loss": 0.9972, + "step": 5342 + }, + { + "epoch": 1.1397184300341296, + "grad_norm": 0.46203682482169345, + "learning_rate": 0.00015808516647409382, + "loss": 0.9415, + "step": 5343 + }, + { + "epoch": 1.1399317406143346, + "grad_norm": 0.5273225495174158, + "learning_rate": 0.00015808331532104133, + "loss": 0.9491, + "step": 5344 + }, + { + "epoch": 1.1401450511945392, + "grad_norm": 0.5902565880786902, + "learning_rate": 0.00015808146328447482, + "loss": 1.0081, + "step": 5345 + }, + { + "epoch": 1.140358361774744, + "grad_norm": 0.6177940601157165, + "learning_rate": 0.00015807961036441525, + "loss": 0.955, + "step": 5346 + }, + { + "epoch": 1.1405716723549488, + "grad_norm": 0.5313714077599038, + "learning_rate": 0.00015807775656088353, + "loss": 0.971, + "step": 5347 + }, + { + "epoch": 1.1407849829351535, + "grad_norm": 0.5497916234244773, + "learning_rate": 0.0001580759018739007, + "loss": 0.9851, + "step": 5348 + }, + { + "epoch": 1.1409982935153584, + "grad_norm": 0.5550897283580138, + "learning_rate": 0.00015807404630348776, + "loss": 0.968, + "step": 5349 + }, + { + "epoch": 1.141211604095563, + "grad_norm": 0.5177762378446295, + "learning_rate": 0.00015807218984966562, + "loss": 0.9258, + "step": 5350 + }, + { + "epoch": 1.141424914675768, + "grad_norm": 0.5352407509410171, + "learning_rate": 0.00015807033251245538, + "loss": 0.9654, + "step": 5351 + }, + { + "epoch": 1.1416382252559727, + "grad_norm": 0.5900738410885862, + "learning_rate": 0.000158068474291878, + "loss": 0.9418, + "step": 5352 + }, + { + "epoch": 1.1418515358361774, + "grad_norm": 0.6444517470569138, + "learning_rate": 0.00015806661518795453, + "loss": 0.9988, + "step": 5353 + }, + { + "epoch": 1.1420648464163823, + "grad_norm": 0.6949391046222121, + "learning_rate": 0.00015806475520070598, + "loss": 0.9864, + "step": 5354 + }, + { + "epoch": 1.142278156996587, + "grad_norm": 0.7258958763945565, + "learning_rate": 0.0001580628943301534, + "loss": 0.9821, + "step": 5355 + }, + { + "epoch": 1.1424914675767919, + "grad_norm": 0.6754394337567079, + "learning_rate": 0.00015806103257631792, + "loss": 0.9631, + "step": 5356 + }, + { + "epoch": 1.1427047781569966, + "grad_norm": 0.5231406187618408, + "learning_rate": 0.0001580591699392205, + "loss": 0.9778, + "step": 5357 + }, + { + "epoch": 1.1429180887372015, + "grad_norm": 0.3531004805677951, + "learning_rate": 0.00015805730641888223, + "loss": 0.9769, + "step": 5358 + }, + { + "epoch": 1.1431313993174061, + "grad_norm": 0.2847659050945902, + "learning_rate": 0.00015805544201532426, + "loss": 0.9665, + "step": 5359 + }, + { + "epoch": 1.1433447098976108, + "grad_norm": 0.30797371386446376, + "learning_rate": 0.00015805357672856767, + "loss": 0.9713, + "step": 5360 + }, + { + "epoch": 1.1435580204778157, + "grad_norm": 0.34575842071173857, + "learning_rate": 0.00015805171055863353, + "loss": 0.9645, + "step": 5361 + }, + { + "epoch": 1.1437713310580204, + "grad_norm": 0.44196625840761955, + "learning_rate": 0.00015804984350554292, + "loss": 0.9383, + "step": 5362 + }, + { + "epoch": 1.1439846416382253, + "grad_norm": 0.5012405441205576, + "learning_rate": 0.0001580479755693171, + "loss": 0.9866, + "step": 5363 + }, + { + "epoch": 1.14419795221843, + "grad_norm": 0.5734269858733552, + "learning_rate": 0.00015804610674997706, + "loss": 0.946, + "step": 5364 + }, + { + "epoch": 1.1444112627986347, + "grad_norm": 0.5274129944168414, + "learning_rate": 0.00015804423704754406, + "loss": 0.9764, + "step": 5365 + }, + { + "epoch": 1.1446245733788396, + "grad_norm": 0.3835402122392768, + "learning_rate": 0.00015804236646203916, + "loss": 0.9825, + "step": 5366 + }, + { + "epoch": 1.1448378839590443, + "grad_norm": 0.3706908323722634, + "learning_rate": 0.0001580404949934836, + "loss": 0.9849, + "step": 5367 + }, + { + "epoch": 1.1450511945392492, + "grad_norm": 0.33084140457756217, + "learning_rate": 0.0001580386226418985, + "loss": 0.9857, + "step": 5368 + }, + { + "epoch": 1.1452645051194539, + "grad_norm": 0.4117206644851172, + "learning_rate": 0.0001580367494073051, + "loss": 0.9757, + "step": 5369 + }, + { + "epoch": 1.1454778156996588, + "grad_norm": 0.4541597424601983, + "learning_rate": 0.00015803487528972458, + "loss": 0.9607, + "step": 5370 + }, + { + "epoch": 1.1456911262798635, + "grad_norm": 0.49446777230868366, + "learning_rate": 0.0001580330002891781, + "loss": 0.9485, + "step": 5371 + }, + { + "epoch": 1.1459044368600684, + "grad_norm": 0.5392370138777053, + "learning_rate": 0.00015803112440568692, + "loss": 0.9925, + "step": 5372 + }, + { + "epoch": 1.146117747440273, + "grad_norm": 0.3885160818894848, + "learning_rate": 0.00015802924763927225, + "loss": 0.9409, + "step": 5373 + }, + { + "epoch": 1.1463310580204777, + "grad_norm": 0.31771521025378097, + "learning_rate": 0.00015802736998995535, + "loss": 0.9772, + "step": 5374 + }, + { + "epoch": 1.1465443686006827, + "grad_norm": 0.359179326184579, + "learning_rate": 0.00015802549145775744, + "loss": 0.9577, + "step": 5375 + }, + { + "epoch": 1.1467576791808873, + "grad_norm": 0.3719068645001044, + "learning_rate": 0.00015802361204269975, + "loss": 0.9752, + "step": 5376 + }, + { + "epoch": 1.1469709897610922, + "grad_norm": 0.3888853914388359, + "learning_rate": 0.0001580217317448036, + "loss": 0.9535, + "step": 5377 + }, + { + "epoch": 1.147184300341297, + "grad_norm": 0.3227952727314045, + "learning_rate": 0.00015801985056409024, + "loss": 1.0073, + "step": 5378 + }, + { + "epoch": 1.1473976109215016, + "grad_norm": 0.2776527847188885, + "learning_rate": 0.00015801796850058096, + "loss": 0.9619, + "step": 5379 + }, + { + "epoch": 1.1476109215017065, + "grad_norm": 0.30764933072198747, + "learning_rate": 0.00015801608555429707, + "loss": 0.9568, + "step": 5380 + }, + { + "epoch": 1.1478242320819112, + "grad_norm": 0.4301339400237577, + "learning_rate": 0.00015801420172525984, + "loss": 0.9495, + "step": 5381 + }, + { + "epoch": 1.148037542662116, + "grad_norm": 0.5032592491118852, + "learning_rate": 0.00015801231701349062, + "loss": 0.9797, + "step": 5382 + }, + { + "epoch": 1.1482508532423208, + "grad_norm": 0.5334685857510992, + "learning_rate": 0.00015801043141901072, + "loss": 0.9593, + "step": 5383 + }, + { + "epoch": 1.1484641638225255, + "grad_norm": 0.62953096673349, + "learning_rate": 0.00015800854494184147, + "loss": 0.9653, + "step": 5384 + }, + { + "epoch": 1.1486774744027304, + "grad_norm": 0.7349278247049588, + "learning_rate": 0.0001580066575820042, + "loss": 0.97, + "step": 5385 + }, + { + "epoch": 1.148890784982935, + "grad_norm": 0.8415541237376369, + "learning_rate": 0.0001580047693395203, + "loss": 0.9436, + "step": 5386 + }, + { + "epoch": 1.14910409556314, + "grad_norm": 0.8440021487742324, + "learning_rate": 0.00015800288021441115, + "loss": 0.992, + "step": 5387 + }, + { + "epoch": 1.1493174061433447, + "grad_norm": 0.8383814596063842, + "learning_rate": 0.00015800099020669807, + "loss": 0.9557, + "step": 5388 + }, + { + "epoch": 1.1495307167235496, + "grad_norm": 0.76047411983696, + "learning_rate": 0.0001579990993164025, + "loss": 0.9537, + "step": 5389 + }, + { + "epoch": 1.1497440273037542, + "grad_norm": 0.6200108807170975, + "learning_rate": 0.0001579972075435458, + "loss": 0.9539, + "step": 5390 + }, + { + "epoch": 1.1499573378839592, + "grad_norm": 0.4390782821568752, + "learning_rate": 0.00015799531488814938, + "loss": 0.9497, + "step": 5391 + }, + { + "epoch": 1.1501706484641638, + "grad_norm": 0.3993041603796068, + "learning_rate": 0.00015799342135023464, + "loss": 0.9814, + "step": 5392 + }, + { + "epoch": 1.1503839590443685, + "grad_norm": 0.5680912831849242, + "learning_rate": 0.00015799152692982306, + "loss": 0.989, + "step": 5393 + }, + { + "epoch": 1.1505972696245734, + "grad_norm": 0.8405153242236955, + "learning_rate": 0.000157989631626936, + "loss": 0.9319, + "step": 5394 + }, + { + "epoch": 1.150810580204778, + "grad_norm": 0.986869045986263, + "learning_rate": 0.00015798773544159497, + "loss": 0.9524, + "step": 5395 + }, + { + "epoch": 1.151023890784983, + "grad_norm": 0.9493500709172327, + "learning_rate": 0.0001579858383738214, + "loss": 0.978, + "step": 5396 + }, + { + "epoch": 1.1512372013651877, + "grad_norm": 0.8286288077805288, + "learning_rate": 0.00015798394042363677, + "loss": 0.9623, + "step": 5397 + }, + { + "epoch": 1.1514505119453924, + "grad_norm": 0.6290148011066689, + "learning_rate": 0.00015798204159106252, + "loss": 0.9466, + "step": 5398 + }, + { + "epoch": 1.1516638225255973, + "grad_norm": 0.4476714213496059, + "learning_rate": 0.00015798014187612017, + "loss": 0.9494, + "step": 5399 + }, + { + "epoch": 1.151877133105802, + "grad_norm": 0.3714032163812325, + "learning_rate": 0.00015797824127883122, + "loss": 1.0071, + "step": 5400 + }, + { + "epoch": 1.1520904436860069, + "grad_norm": 0.3334076312490309, + "learning_rate": 0.00015797633979921713, + "loss": 0.9634, + "step": 5401 + }, + { + "epoch": 1.1523037542662116, + "grad_norm": 0.3483887221593694, + "learning_rate": 0.00015797443743729945, + "loss": 0.9334, + "step": 5402 + }, + { + "epoch": 1.1525170648464165, + "grad_norm": 0.40464993211846056, + "learning_rate": 0.0001579725341930997, + "loss": 0.9479, + "step": 5403 + }, + { + "epoch": 1.1527303754266212, + "grad_norm": 0.42529400226873226, + "learning_rate": 0.0001579706300666394, + "loss": 0.9703, + "step": 5404 + }, + { + "epoch": 1.152943686006826, + "grad_norm": 0.40223622024159666, + "learning_rate": 0.00015796872505794015, + "loss": 0.9518, + "step": 5405 + }, + { + "epoch": 1.1531569965870307, + "grad_norm": 0.44282000049668996, + "learning_rate": 0.00015796681916702344, + "loss": 0.9773, + "step": 5406 + }, + { + "epoch": 1.1533703071672354, + "grad_norm": 0.47378917893306666, + "learning_rate": 0.00015796491239391085, + "loss": 1.0015, + "step": 5407 + }, + { + "epoch": 1.1535836177474403, + "grad_norm": 0.42348690173217407, + "learning_rate": 0.000157963004738624, + "loss": 0.9588, + "step": 5408 + }, + { + "epoch": 1.153796928327645, + "grad_norm": 0.4299096650791415, + "learning_rate": 0.00015796109620118442, + "loss": 0.9618, + "step": 5409 + }, + { + "epoch": 1.15401023890785, + "grad_norm": 0.47449474577525197, + "learning_rate": 0.00015795918678161372, + "loss": 1.0063, + "step": 5410 + }, + { + "epoch": 1.1542235494880546, + "grad_norm": 0.5735242268004564, + "learning_rate": 0.0001579572764799335, + "loss": 0.9856, + "step": 5411 + }, + { + "epoch": 1.1544368600682593, + "grad_norm": 0.6635158511556557, + "learning_rate": 0.0001579553652961654, + "loss": 0.9579, + "step": 5412 + }, + { + "epoch": 1.1546501706484642, + "grad_norm": 0.71806226588203, + "learning_rate": 0.00015795345323033102, + "loss": 0.9858, + "step": 5413 + }, + { + "epoch": 1.1548634812286689, + "grad_norm": 0.6775054539775912, + "learning_rate": 0.000157951540282452, + "loss": 0.9604, + "step": 5414 + }, + { + "epoch": 1.1550767918088738, + "grad_norm": 0.5854097156781372, + "learning_rate": 0.00015794962645255002, + "loss": 0.9587, + "step": 5415 + }, + { + "epoch": 1.1552901023890785, + "grad_norm": 0.45326128656762454, + "learning_rate": 0.0001579477117406467, + "loss": 0.9807, + "step": 5416 + }, + { + "epoch": 1.1555034129692832, + "grad_norm": 0.3772002319957216, + "learning_rate": 0.00015794579614676366, + "loss": 0.9891, + "step": 5417 + }, + { + "epoch": 1.155716723549488, + "grad_norm": 0.5023765160711956, + "learning_rate": 0.00015794387967092268, + "loss": 0.9782, + "step": 5418 + }, + { + "epoch": 1.1559300341296928, + "grad_norm": 0.643650260290637, + "learning_rate": 0.00015794196231314539, + "loss": 0.9755, + "step": 5419 + }, + { + "epoch": 1.1561433447098977, + "grad_norm": 0.5536911453057723, + "learning_rate": 0.00015794004407345346, + "loss": 0.9553, + "step": 5420 + }, + { + "epoch": 1.1563566552901023, + "grad_norm": 0.38028267811121014, + "learning_rate": 0.0001579381249518686, + "loss": 0.9526, + "step": 5421 + }, + { + "epoch": 1.1565699658703072, + "grad_norm": 0.43012734920088913, + "learning_rate": 0.0001579362049484126, + "loss": 0.9397, + "step": 5422 + }, + { + "epoch": 1.156783276450512, + "grad_norm": 0.5269703040487869, + "learning_rate": 0.0001579342840631071, + "loss": 0.9749, + "step": 5423 + }, + { + "epoch": 1.1569965870307168, + "grad_norm": 0.46364277675704085, + "learning_rate": 0.00015793236229597388, + "loss": 1.0165, + "step": 5424 + }, + { + "epoch": 1.1572098976109215, + "grad_norm": 0.3311250874080929, + "learning_rate": 0.00015793043964703465, + "loss": 0.9707, + "step": 5425 + }, + { + "epoch": 1.1574232081911262, + "grad_norm": 0.3675805508395433, + "learning_rate": 0.0001579285161163112, + "loss": 0.9883, + "step": 5426 + }, + { + "epoch": 1.1576365187713311, + "grad_norm": 0.3835362159639177, + "learning_rate": 0.00015792659170382528, + "loss": 0.963, + "step": 5427 + }, + { + "epoch": 1.1578498293515358, + "grad_norm": 0.3881503824698635, + "learning_rate": 0.00015792466640959866, + "loss": 0.9524, + "step": 5428 + }, + { + "epoch": 1.1580631399317407, + "grad_norm": 0.3356191207128052, + "learning_rate": 0.00015792274023365314, + "loss": 0.983, + "step": 5429 + }, + { + "epoch": 1.1582764505119454, + "grad_norm": 0.32737599160754405, + "learning_rate": 0.00015792081317601047, + "loss": 0.997, + "step": 5430 + }, + { + "epoch": 1.15848976109215, + "grad_norm": 0.3652579135075575, + "learning_rate": 0.00015791888523669252, + "loss": 0.9559, + "step": 5431 + }, + { + "epoch": 1.158703071672355, + "grad_norm": 0.4123433173715855, + "learning_rate": 0.00015791695641572106, + "loss": 0.9658, + "step": 5432 + }, + { + "epoch": 1.1589163822525597, + "grad_norm": 0.47976232120364326, + "learning_rate": 0.00015791502671311792, + "loss": 0.9646, + "step": 5433 + }, + { + "epoch": 1.1591296928327646, + "grad_norm": 0.5622299030981607, + "learning_rate": 0.00015791309612890494, + "loss": 0.9511, + "step": 5434 + }, + { + "epoch": 1.1593430034129693, + "grad_norm": 0.5228105651429023, + "learning_rate": 0.000157911164663104, + "loss": 0.9616, + "step": 5435 + }, + { + "epoch": 1.159556313993174, + "grad_norm": 0.4162491202913591, + "learning_rate": 0.0001579092323157369, + "loss": 0.9921, + "step": 5436 + }, + { + "epoch": 1.1597696245733788, + "grad_norm": 0.39674467304706107, + "learning_rate": 0.00015790729908682553, + "loss": 0.9599, + "step": 5437 + }, + { + "epoch": 1.1599829351535835, + "grad_norm": 0.33988625444792464, + "learning_rate": 0.00015790536497639176, + "loss": 0.9305, + "step": 5438 + }, + { + "epoch": 1.1601962457337884, + "grad_norm": 0.2979308695564687, + "learning_rate": 0.00015790342998445747, + "loss": 1.0213, + "step": 5439 + }, + { + "epoch": 1.1604095563139931, + "grad_norm": 0.3413989814890407, + "learning_rate": 0.00015790149411104457, + "loss": 0.972, + "step": 5440 + }, + { + "epoch": 1.160622866894198, + "grad_norm": 0.3218516272175406, + "learning_rate": 0.00015789955735617494, + "loss": 0.945, + "step": 5441 + }, + { + "epoch": 1.1608361774744027, + "grad_norm": 0.3242451690147126, + "learning_rate": 0.0001578976197198705, + "loss": 0.9731, + "step": 5442 + }, + { + "epoch": 1.1610494880546076, + "grad_norm": 0.3949389965353848, + "learning_rate": 0.0001578956812021532, + "loss": 0.9648, + "step": 5443 + }, + { + "epoch": 1.1612627986348123, + "grad_norm": 0.4205746063147206, + "learning_rate": 0.00015789374180304495, + "loss": 0.9459, + "step": 5444 + }, + { + "epoch": 1.161476109215017, + "grad_norm": 0.38873503517147967, + "learning_rate": 0.0001578918015225677, + "loss": 0.9747, + "step": 5445 + }, + { + "epoch": 1.161689419795222, + "grad_norm": 0.47289514356657397, + "learning_rate": 0.0001578898603607434, + "loss": 0.9612, + "step": 5446 + }, + { + "epoch": 1.1619027303754266, + "grad_norm": 0.4935663781973145, + "learning_rate": 0.00015788791831759403, + "loss": 0.9604, + "step": 5447 + }, + { + "epoch": 1.1621160409556315, + "grad_norm": 0.5173737159896191, + "learning_rate": 0.00015788597539314155, + "loss": 0.9682, + "step": 5448 + }, + { + "epoch": 1.1623293515358362, + "grad_norm": 0.5644708198141062, + "learning_rate": 0.00015788403158740796, + "loss": 0.9884, + "step": 5449 + }, + { + "epoch": 1.1625426621160408, + "grad_norm": 0.6740687058630963, + "learning_rate": 0.00015788208690041523, + "loss": 0.972, + "step": 5450 + }, + { + "epoch": 1.1627559726962458, + "grad_norm": 0.7298489583983907, + "learning_rate": 0.00015788014133218538, + "loss": 0.9523, + "step": 5451 + }, + { + "epoch": 1.1629692832764504, + "grad_norm": 0.7061976986983545, + "learning_rate": 0.00015787819488274045, + "loss": 0.9685, + "step": 5452 + }, + { + "epoch": 1.1631825938566553, + "grad_norm": 0.6822866860351152, + "learning_rate": 0.00015787624755210237, + "loss": 0.9644, + "step": 5453 + }, + { + "epoch": 1.16339590443686, + "grad_norm": 0.6924463361832558, + "learning_rate": 0.0001578742993402933, + "loss": 0.9787, + "step": 5454 + }, + { + "epoch": 1.163609215017065, + "grad_norm": 0.5039662197598114, + "learning_rate": 0.00015787235024733519, + "loss": 0.9489, + "step": 5455 + }, + { + "epoch": 1.1638225255972696, + "grad_norm": 0.3385244646642284, + "learning_rate": 0.00015787040027325015, + "loss": 0.9748, + "step": 5456 + }, + { + "epoch": 1.1640358361774745, + "grad_norm": 0.36442492873801186, + "learning_rate": 0.00015786844941806017, + "loss": 0.9403, + "step": 5457 + }, + { + "epoch": 1.1642491467576792, + "grad_norm": 0.3778955030681829, + "learning_rate": 0.00015786649768178743, + "loss": 0.9864, + "step": 5458 + }, + { + "epoch": 1.164462457337884, + "grad_norm": 0.38516534565262367, + "learning_rate": 0.00015786454506445393, + "loss": 0.9261, + "step": 5459 + }, + { + "epoch": 1.1646757679180888, + "grad_norm": 0.4741701059063893, + "learning_rate": 0.0001578625915660818, + "loss": 0.9703, + "step": 5460 + }, + { + "epoch": 1.1648890784982935, + "grad_norm": 0.489029888211093, + "learning_rate": 0.00015786063718669315, + "loss": 0.9694, + "step": 5461 + }, + { + "epoch": 1.1651023890784984, + "grad_norm": 0.41343069832215257, + "learning_rate": 0.00015785868192631008, + "loss": 0.9589, + "step": 5462 + }, + { + "epoch": 1.165315699658703, + "grad_norm": 0.44088355835560505, + "learning_rate": 0.0001578567257849547, + "loss": 0.9646, + "step": 5463 + }, + { + "epoch": 1.1655290102389078, + "grad_norm": 0.4971287863768314, + "learning_rate": 0.00015785476876264916, + "loss": 0.9775, + "step": 5464 + }, + { + "epoch": 1.1657423208191127, + "grad_norm": 0.5231784017471259, + "learning_rate": 0.0001578528108594156, + "loss": 0.9811, + "step": 5465 + }, + { + "epoch": 1.1659556313993173, + "grad_norm": 0.466349508387615, + "learning_rate": 0.0001578508520752762, + "loss": 0.9523, + "step": 5466 + }, + { + "epoch": 1.1661689419795223, + "grad_norm": 0.4744146730338912, + "learning_rate": 0.00015784889241025307, + "loss": 0.9826, + "step": 5467 + }, + { + "epoch": 1.166382252559727, + "grad_norm": 0.5030910810896528, + "learning_rate": 0.00015784693186436842, + "loss": 0.9838, + "step": 5468 + }, + { + "epoch": 1.1665955631399316, + "grad_norm": 0.42966910308413603, + "learning_rate": 0.00015784497043764444, + "loss": 0.9726, + "step": 5469 + }, + { + "epoch": 1.1668088737201365, + "grad_norm": 0.40046793717242846, + "learning_rate": 0.0001578430081301033, + "loss": 0.9778, + "step": 5470 + }, + { + "epoch": 1.1670221843003412, + "grad_norm": 0.3903118632577463, + "learning_rate": 0.00015784104494176722, + "loss": 0.9523, + "step": 5471 + }, + { + "epoch": 1.1672354948805461, + "grad_norm": 0.30669696427561294, + "learning_rate": 0.0001578390808726584, + "loss": 0.9353, + "step": 5472 + }, + { + "epoch": 1.1674488054607508, + "grad_norm": 0.3709220451126586, + "learning_rate": 0.00015783711592279906, + "loss": 0.9777, + "step": 5473 + }, + { + "epoch": 1.1676621160409557, + "grad_norm": 0.4635613799548005, + "learning_rate": 0.00015783515009221147, + "loss": 0.9713, + "step": 5474 + }, + { + "epoch": 1.1678754266211604, + "grad_norm": 0.41402795395406594, + "learning_rate": 0.00015783318338091784, + "loss": 0.9604, + "step": 5475 + }, + { + "epoch": 1.1680887372013653, + "grad_norm": 0.37815734009440055, + "learning_rate": 0.00015783121578894041, + "loss": 0.9623, + "step": 5476 + }, + { + "epoch": 1.16830204778157, + "grad_norm": 0.37208643136717945, + "learning_rate": 0.00015782924731630148, + "loss": 0.976, + "step": 5477 + }, + { + "epoch": 1.1685153583617747, + "grad_norm": 0.49741314886453036, + "learning_rate": 0.0001578272779630233, + "loss": 0.9764, + "step": 5478 + }, + { + "epoch": 1.1687286689419796, + "grad_norm": 0.6788746198274744, + "learning_rate": 0.00015782530772912818, + "loss": 0.9698, + "step": 5479 + }, + { + "epoch": 1.1689419795221843, + "grad_norm": 0.6266502044559572, + "learning_rate": 0.00015782333661463837, + "loss": 0.9894, + "step": 5480 + }, + { + "epoch": 1.1691552901023892, + "grad_norm": 0.7778487278692364, + "learning_rate": 0.00015782136461957622, + "loss": 0.9857, + "step": 5481 + }, + { + "epoch": 1.1693686006825939, + "grad_norm": 0.5493643778420065, + "learning_rate": 0.000157819391743964, + "loss": 0.9543, + "step": 5482 + }, + { + "epoch": 1.1695819112627985, + "grad_norm": 0.5869067712310063, + "learning_rate": 0.00015781741798782405, + "loss": 0.9813, + "step": 5483 + }, + { + "epoch": 1.1697952218430034, + "grad_norm": 0.6417085134739564, + "learning_rate": 0.00015781544335117872, + "loss": 0.9905, + "step": 5484 + }, + { + "epoch": 1.1700085324232081, + "grad_norm": 0.5744786337031941, + "learning_rate": 0.00015781346783405034, + "loss": 0.9854, + "step": 5485 + }, + { + "epoch": 1.170221843003413, + "grad_norm": 0.4655008155587361, + "learning_rate": 0.00015781149143646127, + "loss": 0.9543, + "step": 5486 + }, + { + "epoch": 1.1704351535836177, + "grad_norm": 0.466425744678966, + "learning_rate": 0.00015780951415843384, + "loss": 0.9614, + "step": 5487 + }, + { + "epoch": 1.1706484641638226, + "grad_norm": 0.45273308865208706, + "learning_rate": 0.00015780753599999047, + "loss": 0.9718, + "step": 5488 + }, + { + "epoch": 1.1708617747440273, + "grad_norm": 0.4424944657946359, + "learning_rate": 0.0001578055569611535, + "loss": 0.9868, + "step": 5489 + }, + { + "epoch": 1.171075085324232, + "grad_norm": 0.4910616327891272, + "learning_rate": 0.00015780357704194537, + "loss": 0.9666, + "step": 5490 + }, + { + "epoch": 1.171288395904437, + "grad_norm": 0.5067292884435821, + "learning_rate": 0.00015780159624238847, + "loss": 0.9605, + "step": 5491 + }, + { + "epoch": 1.1715017064846416, + "grad_norm": 0.528156764877163, + "learning_rate": 0.00015779961456250516, + "loss": 0.9604, + "step": 5492 + }, + { + "epoch": 1.1717150170648465, + "grad_norm": 0.5818206538229618, + "learning_rate": 0.00015779763200231793, + "loss": 0.99, + "step": 5493 + }, + { + "epoch": 1.1719283276450512, + "grad_norm": 0.7044598657729635, + "learning_rate": 0.00015779564856184918, + "loss": 0.9757, + "step": 5494 + }, + { + "epoch": 1.172141638225256, + "grad_norm": 0.8249178247834227, + "learning_rate": 0.00015779366424112135, + "loss": 0.9698, + "step": 5495 + }, + { + "epoch": 1.1723549488054608, + "grad_norm": 0.8073770006938746, + "learning_rate": 0.0001577916790401569, + "loss": 0.9386, + "step": 5496 + }, + { + "epoch": 1.1725682593856654, + "grad_norm": 0.755218254773703, + "learning_rate": 0.0001577896929589783, + "loss": 0.9453, + "step": 5497 + }, + { + "epoch": 1.1727815699658704, + "grad_norm": 0.7476824885613218, + "learning_rate": 0.000157787705997608, + "loss": 0.984, + "step": 5498 + }, + { + "epoch": 1.172994880546075, + "grad_norm": 0.773910814239226, + "learning_rate": 0.00015778571815606852, + "loss": 0.9775, + "step": 5499 + }, + { + "epoch": 1.17320819112628, + "grad_norm": 0.6989969806474182, + "learning_rate": 0.0001577837294343823, + "loss": 0.9758, + "step": 5500 + }, + { + "epoch": 1.1734215017064846, + "grad_norm": 0.6213041243830575, + "learning_rate": 0.00015778173983257193, + "loss": 0.9798, + "step": 5501 + }, + { + "epoch": 1.1736348122866893, + "grad_norm": 0.4807865723434294, + "learning_rate": 0.00015777974935065982, + "loss": 0.9548, + "step": 5502 + }, + { + "epoch": 1.1738481228668942, + "grad_norm": 0.41524123996560175, + "learning_rate": 0.00015777775798866853, + "loss": 0.9643, + "step": 5503 + }, + { + "epoch": 1.174061433447099, + "grad_norm": 0.509699229942808, + "learning_rate": 0.00015777576574662063, + "loss": 0.9621, + "step": 5504 + }, + { + "epoch": 1.1742747440273038, + "grad_norm": 0.5296263991824977, + "learning_rate": 0.00015777377262453862, + "loss": 0.9806, + "step": 5505 + }, + { + "epoch": 1.1744880546075085, + "grad_norm": 0.4756931375169422, + "learning_rate": 0.00015777177862244505, + "loss": 0.9307, + "step": 5506 + }, + { + "epoch": 1.1747013651877134, + "grad_norm": 0.4065493295517318, + "learning_rate": 0.00015776978374036253, + "loss": 0.9577, + "step": 5507 + }, + { + "epoch": 1.174914675767918, + "grad_norm": 0.3735451081926029, + "learning_rate": 0.00015776778797831355, + "loss": 0.9587, + "step": 5508 + }, + { + "epoch": 1.175127986348123, + "grad_norm": 0.35147367714335487, + "learning_rate": 0.0001577657913363208, + "loss": 0.9872, + "step": 5509 + }, + { + "epoch": 1.1753412969283277, + "grad_norm": 0.35057316288246565, + "learning_rate": 0.00015776379381440679, + "loss": 0.9717, + "step": 5510 + }, + { + "epoch": 1.1755546075085324, + "grad_norm": 0.28218285225661194, + "learning_rate": 0.00015776179541259413, + "loss": 0.9426, + "step": 5511 + }, + { + "epoch": 1.1757679180887373, + "grad_norm": 0.31658917873374937, + "learning_rate": 0.00015775979613090547, + "loss": 0.9526, + "step": 5512 + }, + { + "epoch": 1.175981228668942, + "grad_norm": 0.3021960230671011, + "learning_rate": 0.00015775779596936338, + "loss": 0.964, + "step": 5513 + }, + { + "epoch": 1.1761945392491469, + "grad_norm": 0.26044048370601763, + "learning_rate": 0.00015775579492799052, + "loss": 0.9651, + "step": 5514 + }, + { + "epoch": 1.1764078498293515, + "grad_norm": 0.29405859036864757, + "learning_rate": 0.00015775379300680956, + "loss": 0.9801, + "step": 5515 + }, + { + "epoch": 1.1766211604095562, + "grad_norm": 0.26473139863661915, + "learning_rate": 0.00015775179020584312, + "loss": 1.0125, + "step": 5516 + }, + { + "epoch": 1.1768344709897611, + "grad_norm": 0.3452558293537349, + "learning_rate": 0.00015774978652511383, + "loss": 0.9775, + "step": 5517 + }, + { + "epoch": 1.1770477815699658, + "grad_norm": 0.31345981679831425, + "learning_rate": 0.00015774778196464445, + "loss": 0.9611, + "step": 5518 + }, + { + "epoch": 1.1772610921501707, + "grad_norm": 0.3120470481067052, + "learning_rate": 0.00015774577652445757, + "loss": 0.9822, + "step": 5519 + }, + { + "epoch": 1.1774744027303754, + "grad_norm": 0.36042258815103756, + "learning_rate": 0.00015774377020457596, + "loss": 0.9606, + "step": 5520 + }, + { + "epoch": 1.17768771331058, + "grad_norm": 0.3711309805102721, + "learning_rate": 0.00015774176300502223, + "loss": 1.0129, + "step": 5521 + }, + { + "epoch": 1.177901023890785, + "grad_norm": 0.37152286685856356, + "learning_rate": 0.0001577397549258192, + "loss": 0.9605, + "step": 5522 + }, + { + "epoch": 1.1781143344709897, + "grad_norm": 0.36049772664073554, + "learning_rate": 0.0001577377459669895, + "loss": 0.9634, + "step": 5523 + }, + { + "epoch": 1.1783276450511946, + "grad_norm": 0.4099082253907262, + "learning_rate": 0.00015773573612855592, + "loss": 0.9597, + "step": 5524 + }, + { + "epoch": 1.1785409556313993, + "grad_norm": 0.337196050958757, + "learning_rate": 0.00015773372541054114, + "loss": 0.9297, + "step": 5525 + }, + { + "epoch": 1.1787542662116042, + "grad_norm": 0.3227438444633641, + "learning_rate": 0.000157731713812968, + "loss": 0.9614, + "step": 5526 + }, + { + "epoch": 1.1789675767918089, + "grad_norm": 0.37978894572133937, + "learning_rate": 0.00015772970133585916, + "loss": 0.9744, + "step": 5527 + }, + { + "epoch": 1.1791808873720138, + "grad_norm": 0.3451759641707082, + "learning_rate": 0.00015772768797923747, + "loss": 0.9575, + "step": 5528 + }, + { + "epoch": 1.1793941979522184, + "grad_norm": 0.324087756865993, + "learning_rate": 0.00015772567374312566, + "loss": 0.9152, + "step": 5529 + }, + { + "epoch": 1.1796075085324231, + "grad_norm": 0.27731447871638565, + "learning_rate": 0.0001577236586275466, + "loss": 0.9548, + "step": 5530 + }, + { + "epoch": 1.179820819112628, + "grad_norm": 0.2641782007399189, + "learning_rate": 0.00015772164263252298, + "loss": 0.971, + "step": 5531 + }, + { + "epoch": 1.1800341296928327, + "grad_norm": 0.3540516003530215, + "learning_rate": 0.00015771962575807766, + "loss": 0.9804, + "step": 5532 + }, + { + "epoch": 1.1802474402730376, + "grad_norm": 0.4578353377446297, + "learning_rate": 0.00015771760800423353, + "loss": 0.9537, + "step": 5533 + }, + { + "epoch": 1.1804607508532423, + "grad_norm": 0.4423662962440806, + "learning_rate": 0.0001577155893710133, + "loss": 0.9609, + "step": 5534 + }, + { + "epoch": 1.180674061433447, + "grad_norm": 0.4166297069955799, + "learning_rate": 0.0001577135698584399, + "loss": 0.9762, + "step": 5535 + }, + { + "epoch": 1.180887372013652, + "grad_norm": 0.5187093608010646, + "learning_rate": 0.00015771154946653615, + "loss": 0.9734, + "step": 5536 + }, + { + "epoch": 1.1811006825938566, + "grad_norm": 0.6628275606802796, + "learning_rate": 0.0001577095281953249, + "loss": 0.9569, + "step": 5537 + }, + { + "epoch": 1.1813139931740615, + "grad_norm": 2.393216671983221, + "learning_rate": 0.000157707506044829, + "loss": 1.0128, + "step": 5538 + }, + { + "epoch": 1.1815273037542662, + "grad_norm": 0.928157505673991, + "learning_rate": 0.00015770548301507142, + "loss": 0.9701, + "step": 5539 + }, + { + "epoch": 1.181740614334471, + "grad_norm": 2.1826597402537478, + "learning_rate": 0.00015770345910607497, + "loss": 0.9954, + "step": 5540 + }, + { + "epoch": 1.1819539249146758, + "grad_norm": 1.3959534359317067, + "learning_rate": 0.00015770143431786254, + "loss": 0.99, + "step": 5541 + }, + { + "epoch": 1.1821672354948805, + "grad_norm": 2.5471508309899815, + "learning_rate": 0.0001576994086504571, + "loss": 0.9765, + "step": 5542 + }, + { + "epoch": 1.1823805460750854, + "grad_norm": 2.404825231872929, + "learning_rate": 0.00015769738210388157, + "loss": 1.0089, + "step": 5543 + }, + { + "epoch": 1.18259385665529, + "grad_norm": 1.2434032935922035, + "learning_rate": 0.00015769535467815882, + "loss": 0.966, + "step": 5544 + }, + { + "epoch": 1.182807167235495, + "grad_norm": 1.1297384517149252, + "learning_rate": 0.00015769332637331182, + "loss": 0.9871, + "step": 5545 + }, + { + "epoch": 1.1830204778156996, + "grad_norm": 1.293831089245662, + "learning_rate": 0.00015769129718936356, + "loss": 0.9993, + "step": 5546 + }, + { + "epoch": 1.1832337883959045, + "grad_norm": 0.9455590618044782, + "learning_rate": 0.00015768926712633695, + "loss": 0.9643, + "step": 5547 + }, + { + "epoch": 1.1834470989761092, + "grad_norm": 0.8693461345461689, + "learning_rate": 0.00015768723618425496, + "loss": 0.974, + "step": 5548 + }, + { + "epoch": 1.183660409556314, + "grad_norm": 0.7811228622676989, + "learning_rate": 0.0001576852043631406, + "loss": 0.9754, + "step": 5549 + }, + { + "epoch": 1.1838737201365188, + "grad_norm": 0.651193640506055, + "learning_rate": 0.00015768317166301685, + "loss": 0.9833, + "step": 5550 + }, + { + "epoch": 1.1840870307167235, + "grad_norm": 0.6616598928920194, + "learning_rate": 0.0001576811380839067, + "loss": 0.9697, + "step": 5551 + }, + { + "epoch": 1.1843003412969284, + "grad_norm": 0.7604729802053971, + "learning_rate": 0.00015767910362583315, + "loss": 0.9589, + "step": 5552 + }, + { + "epoch": 1.184513651877133, + "grad_norm": 0.7403787098008668, + "learning_rate": 0.0001576770682888193, + "loss": 0.9709, + "step": 5553 + }, + { + "epoch": 1.1847269624573378, + "grad_norm": 0.6989677335222803, + "learning_rate": 0.00015767503207288804, + "loss": 0.9692, + "step": 5554 + }, + { + "epoch": 1.1849402730375427, + "grad_norm": 0.5863933742802896, + "learning_rate": 0.0001576729949780625, + "loss": 0.9675, + "step": 5555 + }, + { + "epoch": 1.1851535836177474, + "grad_norm": 0.6398917365094995, + "learning_rate": 0.00015767095700436575, + "loss": 0.9721, + "step": 5556 + }, + { + "epoch": 1.1853668941979523, + "grad_norm": 0.7817483608252913, + "learning_rate": 0.0001576689181518208, + "loss": 0.9776, + "step": 5557 + }, + { + "epoch": 1.185580204778157, + "grad_norm": 0.7662095774806006, + "learning_rate": 0.0001576668784204507, + "loss": 0.9872, + "step": 5558 + }, + { + "epoch": 1.1857935153583619, + "grad_norm": 0.6907289543028006, + "learning_rate": 0.0001576648378102786, + "loss": 0.9998, + "step": 5559 + }, + { + "epoch": 1.1860068259385665, + "grad_norm": 0.6307557948847462, + "learning_rate": 0.00015766279632132755, + "loss": 0.9724, + "step": 5560 + }, + { + "epoch": 1.1862201365187715, + "grad_norm": 0.5316686251647622, + "learning_rate": 0.00015766075395362068, + "loss": 0.9696, + "step": 5561 + }, + { + "epoch": 1.1864334470989761, + "grad_norm": 0.5449446161752363, + "learning_rate": 0.00015765871070718101, + "loss": 0.9735, + "step": 5562 + }, + { + "epoch": 1.1866467576791808, + "grad_norm": 0.5489651621480088, + "learning_rate": 0.00015765666658203176, + "loss": 0.9905, + "step": 5563 + }, + { + "epoch": 1.1868600682593857, + "grad_norm": 0.4522726254505893, + "learning_rate": 0.00015765462157819602, + "loss": 0.9882, + "step": 5564 + }, + { + "epoch": 1.1870733788395904, + "grad_norm": 0.475290310371814, + "learning_rate": 0.0001576525756956969, + "loss": 0.9754, + "step": 5565 + }, + { + "epoch": 1.1872866894197953, + "grad_norm": 0.42595965372592975, + "learning_rate": 0.00015765052893455765, + "loss": 0.9608, + "step": 5566 + }, + { + "epoch": 1.1875, + "grad_norm": 0.4769934717203348, + "learning_rate": 0.0001576484812948013, + "loss": 0.9689, + "step": 5567 + }, + { + "epoch": 1.1877133105802047, + "grad_norm": 0.48712916616834745, + "learning_rate": 0.0001576464327764511, + "loss": 1.0042, + "step": 5568 + }, + { + "epoch": 1.1879266211604096, + "grad_norm": 0.395435329011127, + "learning_rate": 0.00015764438337953019, + "loss": 0.9526, + "step": 5569 + }, + { + "epoch": 1.1881399317406143, + "grad_norm": 0.39855252830687793, + "learning_rate": 0.0001576423331040618, + "loss": 0.9856, + "step": 5570 + }, + { + "epoch": 1.1883532423208192, + "grad_norm": 0.49617132710443573, + "learning_rate": 0.0001576402819500691, + "loss": 0.9322, + "step": 5571 + }, + { + "epoch": 1.1885665529010239, + "grad_norm": 0.6356368118557377, + "learning_rate": 0.00015763822991757528, + "loss": 0.9611, + "step": 5572 + }, + { + "epoch": 1.1887798634812285, + "grad_norm": 0.6791445954301588, + "learning_rate": 0.00015763617700660358, + "loss": 0.9623, + "step": 5573 + }, + { + "epoch": 1.1889931740614335, + "grad_norm": 0.5804958022389728, + "learning_rate": 0.0001576341232171773, + "loss": 0.9331, + "step": 5574 + }, + { + "epoch": 1.1892064846416381, + "grad_norm": 0.6558495689167564, + "learning_rate": 0.00015763206854931954, + "loss": 0.9749, + "step": 5575 + }, + { + "epoch": 1.189419795221843, + "grad_norm": 0.5243428756948637, + "learning_rate": 0.00015763001300305362, + "loss": 0.9385, + "step": 5576 + }, + { + "epoch": 1.1896331058020477, + "grad_norm": 0.38272668097975526, + "learning_rate": 0.0001576279565784028, + "loss": 0.9668, + "step": 5577 + }, + { + "epoch": 1.1898464163822526, + "grad_norm": 0.5184035843576543, + "learning_rate": 0.00015762589927539037, + "loss": 0.9552, + "step": 5578 + }, + { + "epoch": 1.1900597269624573, + "grad_norm": 0.46057438360321756, + "learning_rate": 0.0001576238410940396, + "loss": 0.9701, + "step": 5579 + }, + { + "epoch": 1.1902730375426622, + "grad_norm": 0.4768138951812307, + "learning_rate": 0.00015762178203437374, + "loss": 0.959, + "step": 5580 + }, + { + "epoch": 1.190486348122867, + "grad_norm": 0.5322329915465004, + "learning_rate": 0.0001576197220964161, + "loss": 0.9609, + "step": 5581 + }, + { + "epoch": 1.1906996587030716, + "grad_norm": 0.355664599141064, + "learning_rate": 0.00015761766128018998, + "loss": 0.9574, + "step": 5582 + }, + { + "epoch": 1.1909129692832765, + "grad_norm": 0.3228531960157951, + "learning_rate": 0.00015761559958571878, + "loss": 0.9457, + "step": 5583 + }, + { + "epoch": 1.1911262798634812, + "grad_norm": 0.4560466023484778, + "learning_rate": 0.00015761353701302573, + "loss": 0.9464, + "step": 5584 + }, + { + "epoch": 1.191339590443686, + "grad_norm": 0.5907632102313756, + "learning_rate": 0.0001576114735621342, + "loss": 0.961, + "step": 5585 + }, + { + "epoch": 1.1915529010238908, + "grad_norm": 0.6759424614556114, + "learning_rate": 0.00015760940923306753, + "loss": 0.9681, + "step": 5586 + }, + { + "epoch": 1.1917662116040955, + "grad_norm": 0.7062567467690684, + "learning_rate": 0.0001576073440258491, + "loss": 1.0138, + "step": 5587 + }, + { + "epoch": 1.1919795221843004, + "grad_norm": 0.6900674073845413, + "learning_rate": 0.00015760527794050227, + "loss": 0.9613, + "step": 5588 + }, + { + "epoch": 1.192192832764505, + "grad_norm": 0.5223573328911989, + "learning_rate": 0.00015760321097705042, + "loss": 0.9723, + "step": 5589 + }, + { + "epoch": 1.19240614334471, + "grad_norm": 0.3837745514657916, + "learning_rate": 0.00015760114313551692, + "loss": 0.9926, + "step": 5590 + }, + { + "epoch": 1.1926194539249146, + "grad_norm": 0.31113933688263307, + "learning_rate": 0.00015759907441592518, + "loss": 0.9629, + "step": 5591 + }, + { + "epoch": 1.1928327645051195, + "grad_norm": 0.34332490967260276, + "learning_rate": 0.0001575970048182986, + "loss": 0.9783, + "step": 5592 + }, + { + "epoch": 1.1930460750853242, + "grad_norm": 0.36544305125598137, + "learning_rate": 0.00015759493434266064, + "loss": 0.9695, + "step": 5593 + }, + { + "epoch": 1.193259385665529, + "grad_norm": 0.4938642752440894, + "learning_rate": 0.0001575928629890347, + "loss": 0.9515, + "step": 5594 + }, + { + "epoch": 1.1934726962457338, + "grad_norm": 0.5529334113357747, + "learning_rate": 0.00015759079075744416, + "loss": 0.9532, + "step": 5595 + }, + { + "epoch": 1.1936860068259385, + "grad_norm": 0.5084470239028736, + "learning_rate": 0.00015758871764791254, + "loss": 0.9494, + "step": 5596 + }, + { + "epoch": 1.1938993174061434, + "grad_norm": 0.403658043779691, + "learning_rate": 0.00015758664366046326, + "loss": 0.9359, + "step": 5597 + }, + { + "epoch": 1.194112627986348, + "grad_norm": 0.37386926236628915, + "learning_rate": 0.00015758456879511984, + "loss": 0.9506, + "step": 5598 + }, + { + "epoch": 1.194325938566553, + "grad_norm": 0.42273960521980514, + "learning_rate": 0.0001575824930519057, + "loss": 0.9578, + "step": 5599 + }, + { + "epoch": 1.1945392491467577, + "grad_norm": 0.473321027115109, + "learning_rate": 0.00015758041643084434, + "loss": 0.9719, + "step": 5600 + }, + { + "epoch": 1.1947525597269624, + "grad_norm": 0.4805399297089712, + "learning_rate": 0.00015757833893195928, + "loss": 0.9567, + "step": 5601 + }, + { + "epoch": 1.1949658703071673, + "grad_norm": 0.37271183444144695, + "learning_rate": 0.00015757626055527398, + "loss": 0.9946, + "step": 5602 + }, + { + "epoch": 1.195179180887372, + "grad_norm": 0.36686385659612397, + "learning_rate": 0.000157574181300812, + "loss": 0.9993, + "step": 5603 + }, + { + "epoch": 1.1953924914675769, + "grad_norm": 0.5163494757357365, + "learning_rate": 0.00015757210116859686, + "loss": 0.96, + "step": 5604 + }, + { + "epoch": 1.1956058020477816, + "grad_norm": 0.5579218501413896, + "learning_rate": 0.0001575700201586521, + "loss": 0.9683, + "step": 5605 + }, + { + "epoch": 1.1958191126279862, + "grad_norm": 0.4879937470447867, + "learning_rate": 0.00015756793827100124, + "loss": 0.9933, + "step": 5606 + }, + { + "epoch": 1.1960324232081911, + "grad_norm": 0.40639915266059906, + "learning_rate": 0.00015756585550566783, + "loss": 0.9272, + "step": 5607 + }, + { + "epoch": 1.1962457337883958, + "grad_norm": 0.3342338999787636, + "learning_rate": 0.0001575637718626755, + "loss": 0.9391, + "step": 5608 + }, + { + "epoch": 1.1964590443686007, + "grad_norm": 0.42282833532248226, + "learning_rate": 0.0001575616873420478, + "loss": 0.967, + "step": 5609 + }, + { + "epoch": 1.1966723549488054, + "grad_norm": 0.43874025665315947, + "learning_rate": 0.00015755960194380826, + "loss": 0.9575, + "step": 5610 + }, + { + "epoch": 1.1968856655290103, + "grad_norm": 0.39141602723801916, + "learning_rate": 0.00015755751566798052, + "loss": 0.9625, + "step": 5611 + }, + { + "epoch": 1.197098976109215, + "grad_norm": 0.27353377634687975, + "learning_rate": 0.0001575554285145882, + "loss": 0.9647, + "step": 5612 + }, + { + "epoch": 1.19731228668942, + "grad_norm": 0.32030821647825497, + "learning_rate": 0.00015755334048365492, + "loss": 0.9627, + "step": 5613 + }, + { + "epoch": 1.1975255972696246, + "grad_norm": 0.36539908223950435, + "learning_rate": 0.00015755125157520427, + "loss": 0.9472, + "step": 5614 + }, + { + "epoch": 1.1977389078498293, + "grad_norm": 0.5004008978843754, + "learning_rate": 0.0001575491617892599, + "loss": 0.9438, + "step": 5615 + }, + { + "epoch": 1.1979522184300342, + "grad_norm": 0.6099189202822852, + "learning_rate": 0.00015754707112584546, + "loss": 0.942, + "step": 5616 + }, + { + "epoch": 1.1981655290102389, + "grad_norm": 0.6132649583343107, + "learning_rate": 0.00015754497958498463, + "loss": 0.9456, + "step": 5617 + }, + { + "epoch": 1.1983788395904438, + "grad_norm": 0.6076818009957254, + "learning_rate": 0.00015754288716670104, + "loss": 0.9749, + "step": 5618 + }, + { + "epoch": 1.1985921501706485, + "grad_norm": 0.5747568064500422, + "learning_rate": 0.00015754079387101836, + "loss": 0.9462, + "step": 5619 + }, + { + "epoch": 1.1988054607508531, + "grad_norm": 0.4559872704327405, + "learning_rate": 0.0001575386996979603, + "loss": 0.9418, + "step": 5620 + }, + { + "epoch": 1.199018771331058, + "grad_norm": 0.3745406355999236, + "learning_rate": 0.00015753660464755057, + "loss": 0.9815, + "step": 5621 + }, + { + "epoch": 1.1992320819112627, + "grad_norm": 0.30130864099578486, + "learning_rate": 0.00015753450871981283, + "loss": 0.9625, + "step": 5622 + }, + { + "epoch": 1.1994453924914676, + "grad_norm": 0.7175415663366669, + "learning_rate": 0.00015753241191477082, + "loss": 0.9629, + "step": 5623 + }, + { + "epoch": 1.1996587030716723, + "grad_norm": 0.43118254730113487, + "learning_rate": 0.0001575303142324483, + "loss": 0.9455, + "step": 5624 + }, + { + "epoch": 1.199872013651877, + "grad_norm": 0.3403272202531047, + "learning_rate": 0.00015752821567286893, + "loss": 0.9712, + "step": 5625 + }, + { + "epoch": 1.200085324232082, + "grad_norm": 0.2562954210784436, + "learning_rate": 0.0001575261162360565, + "loss": 0.972, + "step": 5626 + }, + { + "epoch": 1.2002986348122866, + "grad_norm": 0.328476307948007, + "learning_rate": 0.00015752401592203477, + "loss": 0.9541, + "step": 5627 + }, + { + "epoch": 1.2005119453924915, + "grad_norm": 0.2818357616270791, + "learning_rate": 0.0001575219147308275, + "loss": 1.0076, + "step": 5628 + }, + { + "epoch": 1.2007252559726962, + "grad_norm": 0.28581687435910696, + "learning_rate": 0.00015751981266245844, + "loss": 0.9719, + "step": 5629 + }, + { + "epoch": 1.200938566552901, + "grad_norm": 0.32154166388661437, + "learning_rate": 0.00015751770971695142, + "loss": 0.9928, + "step": 5630 + }, + { + "epoch": 1.2011518771331058, + "grad_norm": 0.2830748289924721, + "learning_rate": 0.0001575156058943302, + "loss": 0.9476, + "step": 5631 + }, + { + "epoch": 1.2013651877133107, + "grad_norm": 0.34280582382572705, + "learning_rate": 0.0001575135011946186, + "loss": 0.9721, + "step": 5632 + }, + { + "epoch": 1.2015784982935154, + "grad_norm": 0.3475957267133547, + "learning_rate": 0.00015751139561784043, + "loss": 0.9356, + "step": 5633 + }, + { + "epoch": 1.20179180887372, + "grad_norm": 0.4022712978900531, + "learning_rate": 0.0001575092891640195, + "loss": 1.013, + "step": 5634 + }, + { + "epoch": 1.202005119453925, + "grad_norm": 0.45327182341476996, + "learning_rate": 0.00015750718183317968, + "loss": 0.9724, + "step": 5635 + }, + { + "epoch": 1.2022184300341296, + "grad_norm": 0.5246025955355834, + "learning_rate": 0.00015750507362534478, + "loss": 0.9525, + "step": 5636 + }, + { + "epoch": 1.2024317406143346, + "grad_norm": 0.5705530530848291, + "learning_rate": 0.00015750296454053868, + "loss": 0.988, + "step": 5637 + }, + { + "epoch": 1.2026450511945392, + "grad_norm": 0.5639060875873672, + "learning_rate": 0.00015750085457878524, + "loss": 0.9549, + "step": 5638 + }, + { + "epoch": 1.202858361774744, + "grad_norm": 0.4919627971933963, + "learning_rate": 0.00015749874374010832, + "loss": 0.9336, + "step": 5639 + }, + { + "epoch": 1.2030716723549488, + "grad_norm": 0.516386472727412, + "learning_rate": 0.0001574966320245318, + "loss": 0.9625, + "step": 5640 + }, + { + "epoch": 1.2032849829351535, + "grad_norm": 0.5174175966229044, + "learning_rate": 0.0001574945194320796, + "loss": 0.9642, + "step": 5641 + }, + { + "epoch": 1.2034982935153584, + "grad_norm": 0.4366875851244672, + "learning_rate": 0.00015749240596277556, + "loss": 0.9247, + "step": 5642 + }, + { + "epoch": 1.203711604095563, + "grad_norm": 0.33266358984856176, + "learning_rate": 0.0001574902916166437, + "loss": 0.9672, + "step": 5643 + }, + { + "epoch": 1.203924914675768, + "grad_norm": 0.3416859436706215, + "learning_rate": 0.00015748817639370786, + "loss": 0.9604, + "step": 5644 + }, + { + "epoch": 1.2041382252559727, + "grad_norm": 0.28592787456511226, + "learning_rate": 0.00015748606029399202, + "loss": 1.0041, + "step": 5645 + }, + { + "epoch": 1.2043515358361774, + "grad_norm": 0.626385347391755, + "learning_rate": 0.0001574839433175201, + "loss": 0.9592, + "step": 5646 + }, + { + "epoch": 1.2045648464163823, + "grad_norm": 0.31890123718864516, + "learning_rate": 0.00015748182546431605, + "loss": 0.9925, + "step": 5647 + }, + { + "epoch": 1.204778156996587, + "grad_norm": 0.3055692749906128, + "learning_rate": 0.00015747970673440386, + "loss": 0.983, + "step": 5648 + }, + { + "epoch": 1.2049914675767919, + "grad_norm": 0.2664659904229726, + "learning_rate": 0.00015747758712780744, + "loss": 0.938, + "step": 5649 + }, + { + "epoch": 1.2052047781569966, + "grad_norm": 0.2778725009220411, + "learning_rate": 0.00015747546664455085, + "loss": 0.9575, + "step": 5650 + }, + { + "epoch": 1.2054180887372015, + "grad_norm": 0.36452585168278884, + "learning_rate": 0.00015747334528465806, + "loss": 0.991, + "step": 5651 + }, + { + "epoch": 1.2056313993174061, + "grad_norm": 0.4017825418947813, + "learning_rate": 0.00015747122304815304, + "loss": 0.939, + "step": 5652 + }, + { + "epoch": 1.2058447098976108, + "grad_norm": 0.4076583694195988, + "learning_rate": 0.00015746909993505985, + "loss": 0.9705, + "step": 5653 + }, + { + "epoch": 1.2060580204778157, + "grad_norm": 0.4251283365746768, + "learning_rate": 0.0001574669759454025, + "loss": 0.9579, + "step": 5654 + }, + { + "epoch": 1.2062713310580204, + "grad_norm": 0.43530053763393706, + "learning_rate": 0.00015746485107920498, + "loss": 0.9436, + "step": 5655 + }, + { + "epoch": 1.2064846416382253, + "grad_norm": 0.4236321992019742, + "learning_rate": 0.00015746272533649137, + "loss": 1.0097, + "step": 5656 + }, + { + "epoch": 1.20669795221843, + "grad_norm": 0.42325758005150405, + "learning_rate": 0.00015746059871728576, + "loss": 0.9952, + "step": 5657 + }, + { + "epoch": 1.2069112627986347, + "grad_norm": 0.46094613285214, + "learning_rate": 0.00015745847122161213, + "loss": 0.9581, + "step": 5658 + }, + { + "epoch": 1.2071245733788396, + "grad_norm": 0.5151208235701623, + "learning_rate": 0.0001574563428494946, + "loss": 0.9497, + "step": 5659 + }, + { + "epoch": 1.2073378839590443, + "grad_norm": 0.6092400288246772, + "learning_rate": 0.00015745421360095728, + "loss": 0.9687, + "step": 5660 + }, + { + "epoch": 1.2075511945392492, + "grad_norm": 0.6501049140852393, + "learning_rate": 0.00015745208347602423, + "loss": 0.9431, + "step": 5661 + }, + { + "epoch": 1.2077645051194539, + "grad_norm": 0.6941849796553647, + "learning_rate": 0.00015744995247471953, + "loss": 0.9873, + "step": 5662 + }, + { + "epoch": 1.2079778156996588, + "grad_norm": 0.7896842684464767, + "learning_rate": 0.0001574478205970673, + "loss": 0.9481, + "step": 5663 + }, + { + "epoch": 1.2081911262798635, + "grad_norm": 0.8257446333241658, + "learning_rate": 0.0001574456878430917, + "loss": 0.9613, + "step": 5664 + }, + { + "epoch": 1.2084044368600684, + "grad_norm": 0.7592814080578224, + "learning_rate": 0.00015744355421281684, + "loss": 0.9699, + "step": 5665 + }, + { + "epoch": 1.208617747440273, + "grad_norm": 0.6626468848942099, + "learning_rate": 0.00015744141970626686, + "loss": 0.9816, + "step": 5666 + }, + { + "epoch": 1.2088310580204777, + "grad_norm": 0.5791188299561635, + "learning_rate": 0.00015743928432346592, + "loss": 0.9661, + "step": 5667 + }, + { + "epoch": 1.2090443686006827, + "grad_norm": 0.5062621581371235, + "learning_rate": 0.00015743714806443814, + "loss": 0.9458, + "step": 5668 + }, + { + "epoch": 1.2092576791808873, + "grad_norm": 0.4101987980836255, + "learning_rate": 0.00015743501092920777, + "loss": 0.9358, + "step": 5669 + }, + { + "epoch": 1.2094709897610922, + "grad_norm": 0.5024637940598218, + "learning_rate": 0.00015743287291779894, + "loss": 0.9235, + "step": 5670 + }, + { + "epoch": 1.209684300341297, + "grad_norm": 0.7021478086781298, + "learning_rate": 0.00015743073403023583, + "loss": 0.9505, + "step": 5671 + }, + { + "epoch": 1.2098976109215016, + "grad_norm": 0.7386614090146818, + "learning_rate": 0.00015742859426654267, + "loss": 0.9599, + "step": 5672 + }, + { + "epoch": 1.2101109215017065, + "grad_norm": 0.7355212454160488, + "learning_rate": 0.00015742645362674368, + "loss": 0.9663, + "step": 5673 + }, + { + "epoch": 1.2103242320819112, + "grad_norm": 0.8067106164494289, + "learning_rate": 0.00015742431211086304, + "loss": 0.9699, + "step": 5674 + }, + { + "epoch": 1.210537542662116, + "grad_norm": 0.8670064703767246, + "learning_rate": 0.000157422169718925, + "loss": 0.9495, + "step": 5675 + }, + { + "epoch": 1.2107508532423208, + "grad_norm": 0.8800146982156908, + "learning_rate": 0.00015742002645095383, + "loss": 0.9588, + "step": 5676 + }, + { + "epoch": 1.2109641638225255, + "grad_norm": 0.8028699676117153, + "learning_rate": 0.00015741788230697377, + "loss": 0.9174, + "step": 5677 + }, + { + "epoch": 1.2111774744027304, + "grad_norm": 0.5878162167459183, + "learning_rate": 0.00015741573728700906, + "loss": 0.9727, + "step": 5678 + }, + { + "epoch": 1.211390784982935, + "grad_norm": 0.5306644686438873, + "learning_rate": 0.00015741359139108396, + "loss": 0.9274, + "step": 5679 + }, + { + "epoch": 1.21160409556314, + "grad_norm": 0.44951027810256566, + "learning_rate": 0.0001574114446192228, + "loss": 0.9551, + "step": 5680 + }, + { + "epoch": 1.2118174061433447, + "grad_norm": 0.3413429081794741, + "learning_rate": 0.00015740929697144983, + "loss": 0.9552, + "step": 5681 + }, + { + "epoch": 1.2120307167235496, + "grad_norm": 0.3756447118084477, + "learning_rate": 0.00015740714844778936, + "loss": 0.9916, + "step": 5682 + }, + { + "epoch": 1.2122440273037542, + "grad_norm": 0.5357964126708326, + "learning_rate": 0.00015740499904826568, + "loss": 0.9653, + "step": 5683 + }, + { + "epoch": 1.2124573378839592, + "grad_norm": 0.4799140441508718, + "learning_rate": 0.00015740284877290315, + "loss": 0.9659, + "step": 5684 + }, + { + "epoch": 1.2126706484641638, + "grad_norm": 0.34126296629804653, + "learning_rate": 0.0001574006976217261, + "loss": 0.9655, + "step": 5685 + }, + { + "epoch": 1.2128839590443685, + "grad_norm": 0.43735374013347755, + "learning_rate": 0.00015739854559475885, + "loss": 0.9733, + "step": 5686 + }, + { + "epoch": 1.2130972696245734, + "grad_norm": 0.4983203776235241, + "learning_rate": 0.00015739639269202576, + "loss": 0.9393, + "step": 5687 + }, + { + "epoch": 1.213310580204778, + "grad_norm": 0.43337772291704313, + "learning_rate": 0.00015739423891355118, + "loss": 0.9625, + "step": 5688 + }, + { + "epoch": 1.213523890784983, + "grad_norm": 0.33719704420095536, + "learning_rate": 0.00015739208425935946, + "loss": 0.9646, + "step": 5689 + }, + { + "epoch": 1.2137372013651877, + "grad_norm": 0.4221131038469724, + "learning_rate": 0.00015738992872947505, + "loss": 0.9438, + "step": 5690 + }, + { + "epoch": 1.2139505119453924, + "grad_norm": 0.5008275408285009, + "learning_rate": 0.00015738777232392225, + "loss": 0.9816, + "step": 5691 + }, + { + "epoch": 1.2141638225255973, + "grad_norm": 0.4859861324338629, + "learning_rate": 0.00015738561504272553, + "loss": 0.9854, + "step": 5692 + }, + { + "epoch": 1.214377133105802, + "grad_norm": 0.38976246984412577, + "learning_rate": 0.00015738345688590927, + "loss": 0.9606, + "step": 5693 + }, + { + "epoch": 1.2145904436860069, + "grad_norm": 0.3452417750801383, + "learning_rate": 0.0001573812978534979, + "loss": 0.9844, + "step": 5694 + }, + { + "epoch": 1.2148037542662116, + "grad_norm": 0.42387671777206226, + "learning_rate": 0.00015737913794551582, + "loss": 0.9778, + "step": 5695 + }, + { + "epoch": 1.2150170648464165, + "grad_norm": 0.45166016916641166, + "learning_rate": 0.0001573769771619875, + "loss": 0.9827, + "step": 5696 + }, + { + "epoch": 1.2152303754266212, + "grad_norm": 0.3974761711095504, + "learning_rate": 0.00015737481550293742, + "loss": 0.9574, + "step": 5697 + }, + { + "epoch": 1.215443686006826, + "grad_norm": 0.3565024369835279, + "learning_rate": 0.00015737265296838997, + "loss": 0.9465, + "step": 5698 + }, + { + "epoch": 1.2156569965870307, + "grad_norm": 0.3925148520359083, + "learning_rate": 0.00015737048955836964, + "loss": 0.9404, + "step": 5699 + }, + { + "epoch": 1.2158703071672354, + "grad_norm": 0.36314104609295683, + "learning_rate": 0.00015736832527290091, + "loss": 0.959, + "step": 5700 + }, + { + "epoch": 1.2160836177474403, + "grad_norm": 0.3116410694202182, + "learning_rate": 0.00015736616011200833, + "loss": 0.9629, + "step": 5701 + }, + { + "epoch": 1.216296928327645, + "grad_norm": 0.30952330971258873, + "learning_rate": 0.0001573639940757163, + "loss": 0.9743, + "step": 5702 + }, + { + "epoch": 1.21651023890785, + "grad_norm": 0.399051557023417, + "learning_rate": 0.00015736182716404937, + "loss": 0.9535, + "step": 5703 + }, + { + "epoch": 1.2167235494880546, + "grad_norm": 0.5629147242281115, + "learning_rate": 0.0001573596593770321, + "loss": 1.0026, + "step": 5704 + }, + { + "epoch": 1.2169368600682593, + "grad_norm": 0.5322629647450546, + "learning_rate": 0.000157357490714689, + "loss": 0.9628, + "step": 5705 + }, + { + "epoch": 1.2171501706484642, + "grad_norm": 0.5857582683501175, + "learning_rate": 0.00015735532117704454, + "loss": 0.9617, + "step": 5706 + }, + { + "epoch": 1.2173634812286689, + "grad_norm": 0.6153833178543776, + "learning_rate": 0.00015735315076412335, + "loss": 0.9814, + "step": 5707 + }, + { + "epoch": 1.2175767918088738, + "grad_norm": 0.6710358803908832, + "learning_rate": 0.00015735097947594995, + "loss": 0.9624, + "step": 5708 + }, + { + "epoch": 1.2177901023890785, + "grad_norm": 0.6043078572616606, + "learning_rate": 0.00015734880731254892, + "loss": 0.9751, + "step": 5709 + }, + { + "epoch": 1.2180034129692832, + "grad_norm": 0.5058586983189237, + "learning_rate": 0.0001573466342739448, + "loss": 0.948, + "step": 5710 + }, + { + "epoch": 1.218216723549488, + "grad_norm": 0.4095816912590133, + "learning_rate": 0.00015734446036016225, + "loss": 0.9745, + "step": 5711 + }, + { + "epoch": 1.2184300341296928, + "grad_norm": 0.34868567821248714, + "learning_rate": 0.00015734228557122583, + "loss": 0.958, + "step": 5712 + }, + { + "epoch": 1.2186433447098977, + "grad_norm": 0.3665216317246911, + "learning_rate": 0.00015734010990716012, + "loss": 0.9705, + "step": 5713 + }, + { + "epoch": 1.2188566552901023, + "grad_norm": 0.37688096045321096, + "learning_rate": 0.0001573379333679898, + "loss": 1.0091, + "step": 5714 + }, + { + "epoch": 1.2190699658703072, + "grad_norm": 0.35501269606516306, + "learning_rate": 0.00015733575595373942, + "loss": 0.9752, + "step": 5715 + }, + { + "epoch": 1.219283276450512, + "grad_norm": 0.30181303167423795, + "learning_rate": 0.0001573335776644337, + "loss": 0.9812, + "step": 5716 + }, + { + "epoch": 1.2194965870307168, + "grad_norm": 0.331183210974346, + "learning_rate": 0.00015733139850009723, + "loss": 0.9768, + "step": 5717 + }, + { + "epoch": 1.2197098976109215, + "grad_norm": 0.36926537510975604, + "learning_rate": 0.0001573292184607547, + "loss": 0.9859, + "step": 5718 + }, + { + "epoch": 1.2199232081911262, + "grad_norm": 0.34683903265403626, + "learning_rate": 0.00015732703754643075, + "loss": 0.9666, + "step": 5719 + }, + { + "epoch": 1.2201365187713311, + "grad_norm": 0.3147285132493752, + "learning_rate": 0.0001573248557571501, + "loss": 0.9498, + "step": 5720 + }, + { + "epoch": 1.2203498293515358, + "grad_norm": 0.3683338534591394, + "learning_rate": 0.00015732267309293734, + "loss": 0.9661, + "step": 5721 + }, + { + "epoch": 1.2205631399317407, + "grad_norm": 0.45001081381613367, + "learning_rate": 0.0001573204895538173, + "loss": 0.9741, + "step": 5722 + }, + { + "epoch": 1.2207764505119454, + "grad_norm": 0.5103712317960144, + "learning_rate": 0.00015731830513981458, + "loss": 0.9788, + "step": 5723 + }, + { + "epoch": 1.22098976109215, + "grad_norm": 0.5200730121036132, + "learning_rate": 0.00015731611985095396, + "loss": 0.9716, + "step": 5724 + }, + { + "epoch": 1.221203071672355, + "grad_norm": 0.5443230895306436, + "learning_rate": 0.00015731393368726012, + "loss": 0.9535, + "step": 5725 + }, + { + "epoch": 1.2214163822525597, + "grad_norm": 0.5808801645935744, + "learning_rate": 0.00015731174664875783, + "loss": 0.945, + "step": 5726 + }, + { + "epoch": 1.2216296928327646, + "grad_norm": 0.699294216206442, + "learning_rate": 0.00015730955873547182, + "loss": 0.9702, + "step": 5727 + }, + { + "epoch": 1.2218430034129693, + "grad_norm": 0.7547027738082352, + "learning_rate": 0.0001573073699474269, + "loss": 0.9811, + "step": 5728 + }, + { + "epoch": 1.222056313993174, + "grad_norm": 0.7418123632362806, + "learning_rate": 0.00015730518028464772, + "loss": 0.9577, + "step": 5729 + }, + { + "epoch": 1.2222696245733788, + "grad_norm": 0.7394482935616207, + "learning_rate": 0.00015730298974715915, + "loss": 0.925, + "step": 5730 + }, + { + "epoch": 1.2224829351535835, + "grad_norm": 0.6036612152692452, + "learning_rate": 0.00015730079833498598, + "loss": 0.968, + "step": 5731 + }, + { + "epoch": 1.2226962457337884, + "grad_norm": 0.34176708964488134, + "learning_rate": 0.00015729860604815296, + "loss": 0.9522, + "step": 5732 + }, + { + "epoch": 1.2229095563139931, + "grad_norm": 0.3487146907925295, + "learning_rate": 0.00015729641288668488, + "loss": 0.9964, + "step": 5733 + }, + { + "epoch": 1.223122866894198, + "grad_norm": 0.3759728982067221, + "learning_rate": 0.0001572942188506066, + "loss": 0.9826, + "step": 5734 + }, + { + "epoch": 1.2233361774744027, + "grad_norm": 0.3893971641239459, + "learning_rate": 0.00015729202393994298, + "loss": 0.9492, + "step": 5735 + }, + { + "epoch": 1.2235494880546076, + "grad_norm": 0.4745701701560058, + "learning_rate": 0.00015728982815471876, + "loss": 0.9841, + "step": 5736 + }, + { + "epoch": 1.2237627986348123, + "grad_norm": 0.47492392480787754, + "learning_rate": 0.00015728763149495885, + "loss": 0.977, + "step": 5737 + }, + { + "epoch": 1.223976109215017, + "grad_norm": 0.4524961279140626, + "learning_rate": 0.00015728543396068809, + "loss": 0.9526, + "step": 5738 + }, + { + "epoch": 1.224189419795222, + "grad_norm": 0.9890357830205773, + "learning_rate": 0.00015728323555193135, + "loss": 0.9399, + "step": 5739 + }, + { + "epoch": 1.2244027303754266, + "grad_norm": 0.4028601346436683, + "learning_rate": 0.0001572810362687135, + "loss": 0.9385, + "step": 5740 + }, + { + "epoch": 1.2246160409556315, + "grad_norm": 0.6928421044894841, + "learning_rate": 0.0001572788361110594, + "loss": 0.9589, + "step": 5741 + }, + { + "epoch": 1.2248293515358362, + "grad_norm": 0.42935377325882174, + "learning_rate": 0.00015727663507899398, + "loss": 0.9437, + "step": 5742 + }, + { + "epoch": 1.2250426621160408, + "grad_norm": 0.38740694825993266, + "learning_rate": 0.00015727443317254213, + "loss": 0.9654, + "step": 5743 + }, + { + "epoch": 1.2252559726962458, + "grad_norm": 0.4214725572893034, + "learning_rate": 0.00015727223039172883, + "loss": 1.0145, + "step": 5744 + }, + { + "epoch": 1.2254692832764504, + "grad_norm": 0.46683016435842856, + "learning_rate": 0.00015727002673657887, + "loss": 0.9528, + "step": 5745 + }, + { + "epoch": 1.2256825938566553, + "grad_norm": 0.35580218556910653, + "learning_rate": 0.00015726782220711727, + "loss": 0.9675, + "step": 5746 + }, + { + "epoch": 1.22589590443686, + "grad_norm": 0.28668952328310143, + "learning_rate": 0.000157265616803369, + "loss": 0.9521, + "step": 5747 + }, + { + "epoch": 1.226109215017065, + "grad_norm": 0.3673849066599126, + "learning_rate": 0.00015726341052535895, + "loss": 0.972, + "step": 5748 + }, + { + "epoch": 1.2263225255972696, + "grad_norm": 0.46857749436083307, + "learning_rate": 0.0001572612033731121, + "loss": 0.967, + "step": 5749 + }, + { + "epoch": 1.2265358361774745, + "grad_norm": 0.5022948926339735, + "learning_rate": 0.00015725899534665345, + "loss": 0.9595, + "step": 5750 + }, + { + "epoch": 1.2267491467576792, + "grad_norm": 0.3683326669629901, + "learning_rate": 0.00015725678644600797, + "loss": 0.9787, + "step": 5751 + }, + { + "epoch": 1.226962457337884, + "grad_norm": 0.3561030783453824, + "learning_rate": 0.00015725457667120063, + "loss": 0.9802, + "step": 5752 + }, + { + "epoch": 1.2271757679180888, + "grad_norm": 0.3774918480982903, + "learning_rate": 0.00015725236602225648, + "loss": 0.9701, + "step": 5753 + }, + { + "epoch": 1.2273890784982935, + "grad_norm": 0.39961112115065095, + "learning_rate": 0.0001572501544992005, + "loss": 0.9539, + "step": 5754 + }, + { + "epoch": 1.2276023890784984, + "grad_norm": 0.43022880711887856, + "learning_rate": 0.00015724794210205772, + "loss": 0.968, + "step": 5755 + }, + { + "epoch": 1.227815699658703, + "grad_norm": 0.5057620078994428, + "learning_rate": 0.00015724572883085318, + "loss": 0.9666, + "step": 5756 + }, + { + "epoch": 1.2280290102389078, + "grad_norm": 0.520544077819084, + "learning_rate": 0.00015724351468561192, + "loss": 0.9534, + "step": 5757 + }, + { + "epoch": 1.2282423208191127, + "grad_norm": 0.5779495565686428, + "learning_rate": 0.00015724129966635897, + "loss": 0.9744, + "step": 5758 + }, + { + "epoch": 1.2284556313993173, + "grad_norm": 0.6671262106171088, + "learning_rate": 0.00015723908377311945, + "loss": 0.9736, + "step": 5759 + }, + { + "epoch": 1.2286689419795223, + "grad_norm": 0.7246571789298378, + "learning_rate": 0.0001572368670059184, + "loss": 0.9582, + "step": 5760 + }, + { + "epoch": 1.228882252559727, + "grad_norm": 0.8552915995573723, + "learning_rate": 0.00015723464936478085, + "loss": 0.9824, + "step": 5761 + }, + { + "epoch": 1.2290955631399316, + "grad_norm": 0.9772346355210337, + "learning_rate": 0.000157232430849732, + "loss": 0.9919, + "step": 5762 + }, + { + "epoch": 1.2293088737201365, + "grad_norm": 1.0089236865014415, + "learning_rate": 0.00015723021146079685, + "loss": 0.9972, + "step": 5763 + }, + { + "epoch": 1.2295221843003412, + "grad_norm": 0.8831124467157231, + "learning_rate": 0.0001572279911980006, + "loss": 0.9629, + "step": 5764 + }, + { + "epoch": 1.2297354948805461, + "grad_norm": 0.7268820365294555, + "learning_rate": 0.0001572257700613683, + "loss": 0.9427, + "step": 5765 + }, + { + "epoch": 1.2299488054607508, + "grad_norm": 0.6241069256281196, + "learning_rate": 0.00015722354805092513, + "loss": 0.9772, + "step": 5766 + }, + { + "epoch": 1.2301621160409557, + "grad_norm": 0.5509621512758421, + "learning_rate": 0.0001572213251666962, + "loss": 0.9725, + "step": 5767 + }, + { + "epoch": 1.2303754266211604, + "grad_norm": 0.4555620280603142, + "learning_rate": 0.0001572191014087067, + "loss": 0.9575, + "step": 5768 + }, + { + "epoch": 1.2305887372013653, + "grad_norm": 0.3469360343976241, + "learning_rate": 0.00015721687677698175, + "loss": 0.9664, + "step": 5769 + }, + { + "epoch": 1.23080204778157, + "grad_norm": 0.28471718763857046, + "learning_rate": 0.00015721465127154654, + "loss": 0.9585, + "step": 5770 + }, + { + "epoch": 1.2310153583617747, + "grad_norm": 0.4509050675653776, + "learning_rate": 0.00015721242489242625, + "loss": 0.9437, + "step": 5771 + }, + { + "epoch": 1.2312286689419796, + "grad_norm": 0.5343397015655815, + "learning_rate": 0.0001572101976396461, + "loss": 0.9488, + "step": 5772 + }, + { + "epoch": 1.2314419795221843, + "grad_norm": 0.4923860187131915, + "learning_rate": 0.00015720796951323125, + "loss": 0.984, + "step": 5773 + }, + { + "epoch": 1.2316552901023892, + "grad_norm": 0.44368860494278667, + "learning_rate": 0.0001572057405132069, + "loss": 0.9414, + "step": 5774 + }, + { + "epoch": 1.2318686006825939, + "grad_norm": 0.473149994563641, + "learning_rate": 0.00015720351063959833, + "loss": 0.9561, + "step": 5775 + }, + { + "epoch": 1.2320819112627985, + "grad_norm": 0.4581251308916715, + "learning_rate": 0.00015720127989243073, + "loss": 0.9823, + "step": 5776 + }, + { + "epoch": 1.2322952218430034, + "grad_norm": 0.3511264487043937, + "learning_rate": 0.00015719904827172934, + "loss": 0.9554, + "step": 5777 + }, + { + "epoch": 1.2325085324232081, + "grad_norm": 0.2909638145730771, + "learning_rate": 0.00015719681577751942, + "loss": 0.9809, + "step": 5778 + }, + { + "epoch": 1.232721843003413, + "grad_norm": 0.30007498995128623, + "learning_rate": 0.00015719458240982624, + "loss": 0.9781, + "step": 5779 + }, + { + "epoch": 1.2329351535836177, + "grad_norm": 0.3913772223202547, + "learning_rate": 0.00015719234816867508, + "loss": 0.9382, + "step": 5780 + }, + { + "epoch": 1.2331484641638226, + "grad_norm": 0.4291858060424047, + "learning_rate": 0.00015719011305409116, + "loss": 0.9692, + "step": 5781 + }, + { + "epoch": 1.2333617747440273, + "grad_norm": 0.4246448129058225, + "learning_rate": 0.00015718787706609985, + "loss": 0.9492, + "step": 5782 + }, + { + "epoch": 1.233575085324232, + "grad_norm": 0.4955207254678278, + "learning_rate": 0.0001571856402047264, + "loss": 0.987, + "step": 5783 + }, + { + "epoch": 1.233788395904437, + "grad_norm": 0.49111922953561865, + "learning_rate": 0.00015718340246999613, + "loss": 0.9495, + "step": 5784 + }, + { + "epoch": 1.2340017064846416, + "grad_norm": 0.42402119712680764, + "learning_rate": 0.00015718116386193435, + "loss": 0.9494, + "step": 5785 + }, + { + "epoch": 1.2342150170648465, + "grad_norm": 0.40578406172027864, + "learning_rate": 0.00015717892438056643, + "loss": 0.9707, + "step": 5786 + }, + { + "epoch": 1.2344283276450512, + "grad_norm": 0.3973199737972863, + "learning_rate": 0.00015717668402591767, + "loss": 0.9552, + "step": 5787 + }, + { + "epoch": 1.234641638225256, + "grad_norm": 0.32666841961102355, + "learning_rate": 0.00015717444279801342, + "loss": 1.0104, + "step": 5788 + }, + { + "epoch": 1.2348549488054608, + "grad_norm": 0.39765844907893194, + "learning_rate": 0.00015717220069687906, + "loss": 0.9605, + "step": 5789 + }, + { + "epoch": 1.2350682593856654, + "grad_norm": 0.42516306241787966, + "learning_rate": 0.00015716995772253997, + "loss": 0.9707, + "step": 5790 + }, + { + "epoch": 1.2352815699658704, + "grad_norm": 0.4256111669714082, + "learning_rate": 0.00015716771387502146, + "loss": 0.9586, + "step": 5791 + }, + { + "epoch": 1.235494880546075, + "grad_norm": 0.4736967778805133, + "learning_rate": 0.000157165469154349, + "loss": 0.9531, + "step": 5792 + }, + { + "epoch": 1.23570819112628, + "grad_norm": 0.4757752739654571, + "learning_rate": 0.00015716322356054796, + "loss": 0.9772, + "step": 5793 + }, + { + "epoch": 1.2359215017064846, + "grad_norm": 0.4864586949125675, + "learning_rate": 0.00015716097709364378, + "loss": 0.9658, + "step": 5794 + }, + { + "epoch": 1.2361348122866893, + "grad_norm": 0.48625889557061425, + "learning_rate": 0.00015715872975366178, + "loss": 0.9888, + "step": 5795 + }, + { + "epoch": 1.2363481228668942, + "grad_norm": 0.4393189310377853, + "learning_rate": 0.0001571564815406275, + "loss": 0.9782, + "step": 5796 + }, + { + "epoch": 1.236561433447099, + "grad_norm": 0.45772259320705916, + "learning_rate": 0.00015715423245456634, + "loss": 0.9599, + "step": 5797 + }, + { + "epoch": 1.2367747440273038, + "grad_norm": 0.46558891478399345, + "learning_rate": 0.00015715198249550372, + "loss": 0.9771, + "step": 5798 + }, + { + "epoch": 1.2369880546075085, + "grad_norm": 0.48083530298446153, + "learning_rate": 0.00015714973166346515, + "loss": 0.9584, + "step": 5799 + }, + { + "epoch": 1.2372013651877134, + "grad_norm": 0.37822042793169597, + "learning_rate": 0.00015714747995847604, + "loss": 0.9757, + "step": 5800 + }, + { + "epoch": 1.237414675767918, + "grad_norm": 0.37038673598363836, + "learning_rate": 0.0001571452273805619, + "loss": 0.9945, + "step": 5801 + }, + { + "epoch": 1.237627986348123, + "grad_norm": 0.41062285076524313, + "learning_rate": 0.0001571429739297482, + "loss": 0.9574, + "step": 5802 + }, + { + "epoch": 1.2378412969283277, + "grad_norm": 0.35926091432680923, + "learning_rate": 0.00015714071960606045, + "loss": 0.9888, + "step": 5803 + }, + { + "epoch": 1.2380546075085324, + "grad_norm": 0.3319298222028829, + "learning_rate": 0.0001571384644095242, + "loss": 0.9603, + "step": 5804 + }, + { + "epoch": 1.2382679180887373, + "grad_norm": 0.2962264841389751, + "learning_rate": 0.00015713620834016488, + "loss": 0.9473, + "step": 5805 + }, + { + "epoch": 1.238481228668942, + "grad_norm": 0.26143149045084246, + "learning_rate": 0.00015713395139800812, + "loss": 0.9725, + "step": 5806 + }, + { + "epoch": 1.2386945392491469, + "grad_norm": 0.3114622916030436, + "learning_rate": 0.0001571316935830794, + "loss": 0.9477, + "step": 5807 + }, + { + "epoch": 1.2389078498293515, + "grad_norm": 0.3079913555561656, + "learning_rate": 0.00015712943489540421, + "loss": 0.9568, + "step": 5808 + }, + { + "epoch": 1.2391211604095562, + "grad_norm": 0.3348309806745927, + "learning_rate": 0.0001571271753350082, + "loss": 0.9517, + "step": 5809 + }, + { + "epoch": 1.2393344709897611, + "grad_norm": 0.3682847997877009, + "learning_rate": 0.00015712491490191692, + "loss": 0.9817, + "step": 5810 + }, + { + "epoch": 1.2395477815699658, + "grad_norm": 0.4164496398970447, + "learning_rate": 0.0001571226535961559, + "loss": 0.9272, + "step": 5811 + }, + { + "epoch": 1.2397610921501707, + "grad_norm": 0.46513477846808027, + "learning_rate": 0.00015712039141775076, + "loss": 0.9767, + "step": 5812 + }, + { + "epoch": 1.2399744027303754, + "grad_norm": 0.4555652838890404, + "learning_rate": 0.00015711812836672711, + "loss": 0.9334, + "step": 5813 + }, + { + "epoch": 1.24018771331058, + "grad_norm": 0.46834656917057443, + "learning_rate": 0.00015711586444311053, + "loss": 0.975, + "step": 5814 + }, + { + "epoch": 1.240401023890785, + "grad_norm": 0.48768472742493485, + "learning_rate": 0.00015711359964692666, + "loss": 0.94, + "step": 5815 + }, + { + "epoch": 1.2406143344709897, + "grad_norm": 0.5541951312575312, + "learning_rate": 0.0001571113339782011, + "loss": 0.9655, + "step": 5816 + }, + { + "epoch": 1.2408276450511946, + "grad_norm": 0.5640210536178268, + "learning_rate": 0.00015710906743695953, + "loss": 0.9935, + "step": 5817 + }, + { + "epoch": 1.2410409556313993, + "grad_norm": 0.47805579128405434, + "learning_rate": 0.00015710680002322753, + "loss": 0.9709, + "step": 5818 + }, + { + "epoch": 1.2412542662116042, + "grad_norm": 0.5263475815891565, + "learning_rate": 0.00015710453173703082, + "loss": 0.9617, + "step": 5819 + }, + { + "epoch": 1.2414675767918089, + "grad_norm": 0.6876928651292958, + "learning_rate": 0.00015710226257839498, + "loss": 0.9721, + "step": 5820 + }, + { + "epoch": 1.2416808873720138, + "grad_norm": 0.7922746863144309, + "learning_rate": 0.0001570999925473458, + "loss": 0.9732, + "step": 5821 + }, + { + "epoch": 1.2418941979522184, + "grad_norm": 0.8956130263730744, + "learning_rate": 0.00015709772164390889, + "loss": 0.9546, + "step": 5822 + }, + { + "epoch": 1.2421075085324231, + "grad_norm": 1.0228134507632665, + "learning_rate": 0.00015709544986810994, + "loss": 0.9317, + "step": 5823 + }, + { + "epoch": 1.242320819112628, + "grad_norm": 0.9361416190122241, + "learning_rate": 0.00015709317721997472, + "loss": 0.9996, + "step": 5824 + }, + { + "epoch": 1.2425341296928327, + "grad_norm": 0.7475770347716141, + "learning_rate": 0.00015709090369952887, + "loss": 0.9555, + "step": 5825 + }, + { + "epoch": 1.2427474402730376, + "grad_norm": 0.5722434392327896, + "learning_rate": 0.00015708862930679816, + "loss": 0.9636, + "step": 5826 + }, + { + "epoch": 1.2429607508532423, + "grad_norm": 0.4432575164895864, + "learning_rate": 0.00015708635404180831, + "loss": 0.9897, + "step": 5827 + }, + { + "epoch": 1.243174061433447, + "grad_norm": 0.3959422737939551, + "learning_rate": 0.00015708407790458506, + "loss": 0.9667, + "step": 5828 + }, + { + "epoch": 1.243387372013652, + "grad_norm": 0.3259978900483786, + "learning_rate": 0.0001570818008951542, + "loss": 0.9361, + "step": 5829 + }, + { + "epoch": 1.2436006825938566, + "grad_norm": 0.3556426291468244, + "learning_rate": 0.00015707952301354143, + "loss": 0.9597, + "step": 5830 + }, + { + "epoch": 1.2438139931740615, + "grad_norm": 0.4537470349682481, + "learning_rate": 0.00015707724425977258, + "loss": 0.9555, + "step": 5831 + }, + { + "epoch": 1.2440273037542662, + "grad_norm": 0.5439094865070888, + "learning_rate": 0.00015707496463387342, + "loss": 0.9522, + "step": 5832 + }, + { + "epoch": 1.244240614334471, + "grad_norm": 0.5499566524652747, + "learning_rate": 0.00015707268413586974, + "loss": 0.9494, + "step": 5833 + }, + { + "epoch": 1.2444539249146758, + "grad_norm": 0.4989284377487081, + "learning_rate": 0.00015707040276578731, + "loss": 0.9552, + "step": 5834 + }, + { + "epoch": 1.2446672354948805, + "grad_norm": 0.4171212075235294, + "learning_rate": 0.00015706812052365202, + "loss": 0.9692, + "step": 5835 + }, + { + "epoch": 1.2448805460750854, + "grad_norm": 0.3019885778702089, + "learning_rate": 0.0001570658374094896, + "loss": 0.9944, + "step": 5836 + }, + { + "epoch": 1.24509385665529, + "grad_norm": 0.3271809974182871, + "learning_rate": 0.00015706355342332598, + "loss": 0.9418, + "step": 5837 + }, + { + "epoch": 1.245307167235495, + "grad_norm": 0.391688430273468, + "learning_rate": 0.00015706126856518694, + "loss": 0.9764, + "step": 5838 + }, + { + "epoch": 1.2455204778156996, + "grad_norm": 0.31166141182990903, + "learning_rate": 0.00015705898283509837, + "loss": 0.9542, + "step": 5839 + }, + { + "epoch": 1.2457337883959045, + "grad_norm": 0.3186436439199802, + "learning_rate": 0.00015705669623308609, + "loss": 0.9498, + "step": 5840 + }, + { + "epoch": 1.2459470989761092, + "grad_norm": 0.4115026703579652, + "learning_rate": 0.00015705440875917598, + "loss": 0.9804, + "step": 5841 + }, + { + "epoch": 1.246160409556314, + "grad_norm": 0.4068177149213456, + "learning_rate": 0.00015705212041339396, + "loss": 0.9517, + "step": 5842 + }, + { + "epoch": 1.2463737201365188, + "grad_norm": 0.348188485313263, + "learning_rate": 0.0001570498311957659, + "loss": 0.9722, + "step": 5843 + }, + { + "epoch": 1.2465870307167235, + "grad_norm": 0.3853542650297716, + "learning_rate": 0.00015704754110631771, + "loss": 0.962, + "step": 5844 + }, + { + "epoch": 1.2468003412969284, + "grad_norm": 0.41461692023570546, + "learning_rate": 0.00015704525014507528, + "loss": 0.9848, + "step": 5845 + }, + { + "epoch": 1.247013651877133, + "grad_norm": 0.43144994013365856, + "learning_rate": 0.00015704295831206456, + "loss": 0.9744, + "step": 5846 + }, + { + "epoch": 1.2472269624573378, + "grad_norm": 2.236340992629753, + "learning_rate": 0.00015704066560731148, + "loss": 0.9866, + "step": 5847 + }, + { + "epoch": 1.2474402730375427, + "grad_norm": 0.5201711961155804, + "learning_rate": 0.00015703837203084197, + "loss": 0.9664, + "step": 5848 + }, + { + "epoch": 1.2476535836177474, + "grad_norm": 0.9285441813092329, + "learning_rate": 0.00015703607758268197, + "loss": 0.9995, + "step": 5849 + }, + { + "epoch": 1.2478668941979523, + "grad_norm": 1.0028340172828114, + "learning_rate": 0.00015703378226285746, + "loss": 0.9771, + "step": 5850 + }, + { + "epoch": 1.248080204778157, + "grad_norm": 0.8375206064022932, + "learning_rate": 0.00015703148607139442, + "loss": 0.9817, + "step": 5851 + }, + { + "epoch": 1.2482935153583619, + "grad_norm": 0.776057300293247, + "learning_rate": 0.0001570291890083188, + "loss": 0.9574, + "step": 5852 + }, + { + "epoch": 1.2485068259385665, + "grad_norm": 0.9689911709906615, + "learning_rate": 0.00015702689107365662, + "loss": 0.9731, + "step": 5853 + }, + { + "epoch": 1.2487201365187715, + "grad_norm": 1.0357126229562534, + "learning_rate": 0.00015702459226743387, + "loss": 0.9804, + "step": 5854 + }, + { + "epoch": 1.2489334470989761, + "grad_norm": 0.8436409387599328, + "learning_rate": 0.00015702229258967657, + "loss": 0.9536, + "step": 5855 + }, + { + "epoch": 1.2491467576791808, + "grad_norm": 0.7491799199490696, + "learning_rate": 0.00015701999204041075, + "loss": 0.9842, + "step": 5856 + }, + { + "epoch": 1.2493600682593857, + "grad_norm": 0.6457881102886085, + "learning_rate": 0.0001570176906196624, + "loss": 0.9253, + "step": 5857 + }, + { + "epoch": 1.2495733788395904, + "grad_norm": 0.5520876593521202, + "learning_rate": 0.00015701538832745758, + "loss": 0.9898, + "step": 5858 + }, + { + "epoch": 1.2497866894197953, + "grad_norm": 0.4022510004505905, + "learning_rate": 0.00015701308516382235, + "loss": 0.9643, + "step": 5859 + }, + { + "epoch": 1.25, + "grad_norm": 0.5094444077038525, + "learning_rate": 0.00015701078112878277, + "loss": 0.9551, + "step": 5860 + }, + { + "epoch": 1.2502133105802047, + "grad_norm": 0.7770551038920243, + "learning_rate": 0.0001570084762223649, + "loss": 1.0104, + "step": 5861 + }, + { + "epoch": 1.2504266211604096, + "grad_norm": 0.707630047329327, + "learning_rate": 0.00015700617044459483, + "loss": 0.9479, + "step": 5862 + }, + { + "epoch": 1.2506399317406143, + "grad_norm": 0.5374288775127601, + "learning_rate": 0.00015700386379549866, + "loss": 0.9392, + "step": 5863 + }, + { + "epoch": 1.2508532423208192, + "grad_norm": 0.5777057838772164, + "learning_rate": 0.00015700155627510246, + "loss": 0.9654, + "step": 5864 + }, + { + "epoch": 1.2510665529010239, + "grad_norm": 0.44316337753214413, + "learning_rate": 0.00015699924788343237, + "loss": 0.9944, + "step": 5865 + }, + { + "epoch": 1.2512798634812285, + "grad_norm": 0.3317960371281904, + "learning_rate": 0.0001569969386205145, + "loss": 0.969, + "step": 5866 + }, + { + "epoch": 1.2514931740614335, + "grad_norm": 0.3964714961885683, + "learning_rate": 0.00015699462848637496, + "loss": 0.945, + "step": 5867 + }, + { + "epoch": 1.2517064846416384, + "grad_norm": 0.40810205916653086, + "learning_rate": 0.0001569923174810399, + "loss": 0.9613, + "step": 5868 + }, + { + "epoch": 1.251919795221843, + "grad_norm": 0.40576421768509574, + "learning_rate": 0.00015699000560453547, + "loss": 0.9674, + "step": 5869 + }, + { + "epoch": 1.2521331058020477, + "grad_norm": 0.4634515159433809, + "learning_rate": 0.00015698769285688784, + "loss": 0.9771, + "step": 5870 + }, + { + "epoch": 1.2523464163822526, + "grad_norm": 0.5435951478663942, + "learning_rate": 0.00015698537923812317, + "loss": 0.9836, + "step": 5871 + }, + { + "epoch": 1.2525597269624573, + "grad_norm": 0.4812380101858213, + "learning_rate": 0.00015698306474826762, + "loss": 0.943, + "step": 5872 + }, + { + "epoch": 1.2527730375426622, + "grad_norm": 0.3428698840810537, + "learning_rate": 0.00015698074938734743, + "loss": 0.9597, + "step": 5873 + }, + { + "epoch": 1.252986348122867, + "grad_norm": 0.34905268684678015, + "learning_rate": 0.00015697843315538875, + "loss": 0.9819, + "step": 5874 + }, + { + "epoch": 1.2531996587030716, + "grad_norm": 0.3089883895820842, + "learning_rate": 0.0001569761160524178, + "loss": 0.9714, + "step": 5875 + }, + { + "epoch": 1.2534129692832765, + "grad_norm": 0.390342120268537, + "learning_rate": 0.0001569737980784608, + "loss": 0.9619, + "step": 5876 + }, + { + "epoch": 1.2536262798634812, + "grad_norm": 0.3517929119010163, + "learning_rate": 0.000156971479233544, + "loss": 0.9692, + "step": 5877 + }, + { + "epoch": 1.253839590443686, + "grad_norm": 0.3367242220493354, + "learning_rate": 0.0001569691595176936, + "loss": 0.9963, + "step": 5878 + }, + { + "epoch": 1.2540529010238908, + "grad_norm": 0.3270284661840316, + "learning_rate": 0.00015696683893093587, + "loss": 0.9567, + "step": 5879 + }, + { + "epoch": 1.2542662116040955, + "grad_norm": 0.27090216359064323, + "learning_rate": 0.00015696451747329707, + "loss": 0.9632, + "step": 5880 + }, + { + "epoch": 1.2544795221843004, + "grad_norm": 0.34813632427988234, + "learning_rate": 0.00015696219514480344, + "loss": 0.987, + "step": 5881 + }, + { + "epoch": 1.254692832764505, + "grad_norm": 0.3849471096033415, + "learning_rate": 0.0001569598719454813, + "loss": 0.9729, + "step": 5882 + }, + { + "epoch": 1.25490614334471, + "grad_norm": 0.3984234958673389, + "learning_rate": 0.00015695754787535692, + "loss": 0.9816, + "step": 5883 + }, + { + "epoch": 1.2551194539249146, + "grad_norm": 0.4924669617296283, + "learning_rate": 0.00015695522293445656, + "loss": 0.9774, + "step": 5884 + }, + { + "epoch": 1.2553327645051193, + "grad_norm": 0.4858975167422153, + "learning_rate": 0.00015695289712280657, + "loss": 0.9412, + "step": 5885 + }, + { + "epoch": 1.2555460750853242, + "grad_norm": 0.30577409787976984, + "learning_rate": 0.00015695057044043327, + "loss": 0.9803, + "step": 5886 + }, + { + "epoch": 1.2557593856655291, + "grad_norm": 0.3399091812515579, + "learning_rate": 0.00015694824288736296, + "loss": 0.9578, + "step": 5887 + }, + { + "epoch": 1.2559726962457338, + "grad_norm": 0.381836046957318, + "learning_rate": 0.00015694591446362198, + "loss": 0.9617, + "step": 5888 + }, + { + "epoch": 1.2561860068259385, + "grad_norm": 0.37447852185181957, + "learning_rate": 0.00015694358516923668, + "loss": 0.9756, + "step": 5889 + }, + { + "epoch": 1.2563993174061434, + "grad_norm": 0.360394845444867, + "learning_rate": 0.00015694125500423346, + "loss": 0.9865, + "step": 5890 + }, + { + "epoch": 1.256612627986348, + "grad_norm": 0.37930594974735404, + "learning_rate": 0.0001569389239686386, + "loss": 0.9723, + "step": 5891 + }, + { + "epoch": 1.256825938566553, + "grad_norm": 0.4177859127602329, + "learning_rate": 0.00015693659206247857, + "loss": 0.9766, + "step": 5892 + }, + { + "epoch": 1.2570392491467577, + "grad_norm": 1.3729622360526348, + "learning_rate": 0.00015693425928577965, + "loss": 0.9611, + "step": 5893 + }, + { + "epoch": 1.2572525597269624, + "grad_norm": 0.28547100913389634, + "learning_rate": 0.0001569319256385683, + "loss": 0.9403, + "step": 5894 + }, + { + "epoch": 1.2574658703071673, + "grad_norm": 0.5007661022207127, + "learning_rate": 0.00015692959112087096, + "loss": 0.9608, + "step": 5895 + }, + { + "epoch": 1.257679180887372, + "grad_norm": 0.6574275527312214, + "learning_rate": 0.00015692725573271396, + "loss": 0.9617, + "step": 5896 + }, + { + "epoch": 1.2578924914675769, + "grad_norm": 0.8933569822533566, + "learning_rate": 0.00015692491947412378, + "loss": 0.9966, + "step": 5897 + }, + { + "epoch": 1.2581058020477816, + "grad_norm": 1.069680335750692, + "learning_rate": 0.00015692258234512686, + "loss": 1.0052, + "step": 5898 + }, + { + "epoch": 1.2583191126279862, + "grad_norm": 1.0306809253523805, + "learning_rate": 0.00015692024434574962, + "loss": 0.9776, + "step": 5899 + }, + { + "epoch": 1.2585324232081911, + "grad_norm": 0.9409949611104786, + "learning_rate": 0.00015691790547601853, + "loss": 0.9692, + "step": 5900 + }, + { + "epoch": 1.2587457337883958, + "grad_norm": 0.7037977795627742, + "learning_rate": 0.00015691556573596002, + "loss": 0.9616, + "step": 5901 + }, + { + "epoch": 1.2589590443686007, + "grad_norm": 0.5048105805843988, + "learning_rate": 0.0001569132251256006, + "loss": 0.9464, + "step": 5902 + }, + { + "epoch": 1.2591723549488054, + "grad_norm": 0.5109881969387425, + "learning_rate": 0.00015691088364496673, + "loss": 0.9572, + "step": 5903 + }, + { + "epoch": 1.25938566552901, + "grad_norm": 0.6603404273676398, + "learning_rate": 0.00015690854129408495, + "loss": 0.9835, + "step": 5904 + }, + { + "epoch": 1.259598976109215, + "grad_norm": 0.6475012543065739, + "learning_rate": 0.00015690619807298171, + "loss": 0.9986, + "step": 5905 + }, + { + "epoch": 1.25981228668942, + "grad_norm": 0.4791945048058989, + "learning_rate": 0.00015690385398168353, + "loss": 1.0056, + "step": 5906 + }, + { + "epoch": 1.2600255972696246, + "grad_norm": 0.45871920770486696, + "learning_rate": 0.00015690150902021696, + "loss": 0.9564, + "step": 5907 + }, + { + "epoch": 1.2602389078498293, + "grad_norm": 0.4007837096115712, + "learning_rate": 0.00015689916318860854, + "loss": 0.9923, + "step": 5908 + }, + { + "epoch": 1.2604522184300342, + "grad_norm": 0.4090010371684944, + "learning_rate": 0.00015689681648688479, + "loss": 0.9444, + "step": 5909 + }, + { + "epoch": 1.2606655290102389, + "grad_norm": 0.5840708861116535, + "learning_rate": 0.00015689446891507228, + "loss": 0.9961, + "step": 5910 + }, + { + "epoch": 1.2608788395904438, + "grad_norm": 0.7032360455809654, + "learning_rate": 0.00015689212047319755, + "loss": 0.9705, + "step": 5911 + }, + { + "epoch": 1.2610921501706485, + "grad_norm": 0.5904429061427671, + "learning_rate": 0.00015688977116128715, + "loss": 0.9708, + "step": 5912 + }, + { + "epoch": 1.2613054607508531, + "grad_norm": 0.42315303696278866, + "learning_rate": 0.00015688742097936772, + "loss": 0.9967, + "step": 5913 + }, + { + "epoch": 1.261518771331058, + "grad_norm": 0.5335019166873503, + "learning_rate": 0.00015688506992746584, + "loss": 0.9755, + "step": 5914 + }, + { + "epoch": 1.2617320819112627, + "grad_norm": 0.5064221862285264, + "learning_rate": 0.0001568827180056081, + "loss": 0.966, + "step": 5915 + }, + { + "epoch": 1.2619453924914676, + "grad_norm": 0.4003127683299175, + "learning_rate": 0.0001568803652138211, + "loss": 0.9735, + "step": 5916 + }, + { + "epoch": 1.2621587030716723, + "grad_norm": 0.4973344787201604, + "learning_rate": 0.00015687801155213148, + "loss": 0.9626, + "step": 5917 + }, + { + "epoch": 1.262372013651877, + "grad_norm": 0.5075541305917608, + "learning_rate": 0.00015687565702056586, + "loss": 0.9672, + "step": 5918 + }, + { + "epoch": 1.262585324232082, + "grad_norm": 0.44638573805316645, + "learning_rate": 0.0001568733016191509, + "loss": 0.9591, + "step": 5919 + }, + { + "epoch": 1.2627986348122868, + "grad_norm": 0.39842069512053757, + "learning_rate": 0.0001568709453479132, + "loss": 0.9671, + "step": 5920 + }, + { + "epoch": 1.2630119453924915, + "grad_norm": 0.4694638477775009, + "learning_rate": 0.00015686858820687953, + "loss": 0.9429, + "step": 5921 + }, + { + "epoch": 1.2632252559726962, + "grad_norm": 0.47195614708274, + "learning_rate": 0.00015686623019607645, + "loss": 0.954, + "step": 5922 + }, + { + "epoch": 1.263438566552901, + "grad_norm": 0.4642388128711152, + "learning_rate": 0.0001568638713155307, + "loss": 0.9564, + "step": 5923 + }, + { + "epoch": 1.2636518771331058, + "grad_norm": 0.5078610111641348, + "learning_rate": 0.0001568615115652689, + "loss": 0.9536, + "step": 5924 + }, + { + "epoch": 1.2638651877133107, + "grad_norm": 0.5391695631039242, + "learning_rate": 0.00015685915094531787, + "loss": 0.9904, + "step": 5925 + }, + { + "epoch": 1.2640784982935154, + "grad_norm": 0.45620489847030293, + "learning_rate": 0.00015685678945570424, + "loss": 0.9484, + "step": 5926 + }, + { + "epoch": 1.26429180887372, + "grad_norm": 0.3855743186784703, + "learning_rate": 0.00015685442709645472, + "loss": 0.9222, + "step": 5927 + }, + { + "epoch": 1.264505119453925, + "grad_norm": 0.5030731870358943, + "learning_rate": 0.00015685206386759608, + "loss": 0.9496, + "step": 5928 + }, + { + "epoch": 1.2647184300341296, + "grad_norm": 0.5262735915777378, + "learning_rate": 0.00015684969976915503, + "loss": 0.9636, + "step": 5929 + }, + { + "epoch": 1.2649317406143346, + "grad_norm": 0.4952021026434885, + "learning_rate": 0.00015684733480115837, + "loss": 0.9467, + "step": 5930 + }, + { + "epoch": 1.2651450511945392, + "grad_norm": 0.40582140278088796, + "learning_rate": 0.0001568449689636328, + "loss": 0.9729, + "step": 5931 + }, + { + "epoch": 1.265358361774744, + "grad_norm": 0.36369535555003896, + "learning_rate": 0.0001568426022566051, + "loss": 0.9732, + "step": 5932 + }, + { + "epoch": 1.2655716723549488, + "grad_norm": 0.3125589287699405, + "learning_rate": 0.0001568402346801021, + "loss": 0.9502, + "step": 5933 + }, + { + "epoch": 1.2657849829351535, + "grad_norm": 0.2973807289964098, + "learning_rate": 0.0001568378662341505, + "loss": 1.0013, + "step": 5934 + }, + { + "epoch": 1.2659982935153584, + "grad_norm": 0.3099252301140392, + "learning_rate": 0.00015683549691877718, + "loss": 0.9853, + "step": 5935 + }, + { + "epoch": 1.266211604095563, + "grad_norm": 0.37348163827943676, + "learning_rate": 0.00015683312673400892, + "loss": 0.9523, + "step": 5936 + }, + { + "epoch": 1.2664249146757678, + "grad_norm": 0.3208261936341965, + "learning_rate": 0.00015683075567987255, + "loss": 0.9775, + "step": 5937 + }, + { + "epoch": 1.2666382252559727, + "grad_norm": 0.23369985357136797, + "learning_rate": 0.00015682838375639485, + "loss": 0.9664, + "step": 5938 + }, + { + "epoch": 1.2668515358361776, + "grad_norm": 0.317240513803349, + "learning_rate": 0.00015682601096360274, + "loss": 0.9777, + "step": 5939 + }, + { + "epoch": 1.2670648464163823, + "grad_norm": 0.3859195572799795, + "learning_rate": 0.00015682363730152302, + "loss": 0.9768, + "step": 5940 + }, + { + "epoch": 1.267278156996587, + "grad_norm": 0.35763771359257834, + "learning_rate": 0.0001568212627701825, + "loss": 0.9805, + "step": 5941 + }, + { + "epoch": 1.2674914675767919, + "grad_norm": 0.38146424532153395, + "learning_rate": 0.00015681888736960816, + "loss": 0.9721, + "step": 5942 + }, + { + "epoch": 1.2677047781569966, + "grad_norm": 0.32583952251578957, + "learning_rate": 0.0001568165110998268, + "loss": 0.9701, + "step": 5943 + }, + { + "epoch": 1.2679180887372015, + "grad_norm": 0.30800298382924524, + "learning_rate": 0.00015681413396086532, + "loss": 0.9905, + "step": 5944 + }, + { + "epoch": 1.2681313993174061, + "grad_norm": 0.31098869110113253, + "learning_rate": 0.00015681175595275062, + "loss": 0.9748, + "step": 5945 + }, + { + "epoch": 1.2683447098976108, + "grad_norm": 0.3457870479050849, + "learning_rate": 0.0001568093770755096, + "loss": 0.9666, + "step": 5946 + }, + { + "epoch": 1.2685580204778157, + "grad_norm": 0.39321342710183205, + "learning_rate": 0.00015680699732916918, + "loss": 0.9757, + "step": 5947 + }, + { + "epoch": 1.2687713310580204, + "grad_norm": 0.5146120276682814, + "learning_rate": 0.00015680461671375634, + "loss": 0.9495, + "step": 5948 + }, + { + "epoch": 1.2689846416382253, + "grad_norm": 0.5160891648303013, + "learning_rate": 0.00015680223522929792, + "loss": 0.9493, + "step": 5949 + }, + { + "epoch": 1.26919795221843, + "grad_norm": 0.49365586500792524, + "learning_rate": 0.00015679985287582094, + "loss": 0.9432, + "step": 5950 + }, + { + "epoch": 1.2694112627986347, + "grad_norm": 0.5097953115277022, + "learning_rate": 0.00015679746965335234, + "loss": 0.9403, + "step": 5951 + }, + { + "epoch": 1.2696245733788396, + "grad_norm": 0.506491280642898, + "learning_rate": 0.00015679508556191906, + "loss": 0.9686, + "step": 5952 + }, + { + "epoch": 1.2698378839590443, + "grad_norm": 0.4405169845748023, + "learning_rate": 0.0001567927006015481, + "loss": 0.9679, + "step": 5953 + }, + { + "epoch": 1.2700511945392492, + "grad_norm": 0.36776654196902653, + "learning_rate": 0.00015679031477226646, + "loss": 0.9552, + "step": 5954 + }, + { + "epoch": 1.2702645051194539, + "grad_norm": 0.3015365267572394, + "learning_rate": 0.0001567879280741011, + "loss": 0.9367, + "step": 5955 + }, + { + "epoch": 1.2704778156996588, + "grad_norm": 0.327617255800641, + "learning_rate": 0.00015678554050707903, + "loss": 0.9458, + "step": 5956 + }, + { + "epoch": 1.2706911262798635, + "grad_norm": 0.28984958911402303, + "learning_rate": 0.0001567831520712273, + "loss": 0.9729, + "step": 5957 + }, + { + "epoch": 1.2709044368600684, + "grad_norm": 0.3147900371725545, + "learning_rate": 0.00015678076276657289, + "loss": 0.9732, + "step": 5958 + }, + { + "epoch": 1.271117747440273, + "grad_norm": 0.3750816005068885, + "learning_rate": 0.00015677837259314288, + "loss": 0.9534, + "step": 5959 + }, + { + "epoch": 1.2713310580204777, + "grad_norm": 0.42618912247634794, + "learning_rate": 0.00015677598155096426, + "loss": 0.9753, + "step": 5960 + }, + { + "epoch": 1.2715443686006827, + "grad_norm": 0.4800831234458124, + "learning_rate": 0.00015677358964006418, + "loss": 0.9676, + "step": 5961 + }, + { + "epoch": 1.2717576791808873, + "grad_norm": 0.42278299374503703, + "learning_rate": 0.0001567711968604696, + "loss": 0.951, + "step": 5962 + }, + { + "epoch": 1.2719709897610922, + "grad_norm": 0.33372939184083206, + "learning_rate": 0.00015676880321220763, + "loss": 0.9846, + "step": 5963 + }, + { + "epoch": 1.272184300341297, + "grad_norm": 0.33772644208145075, + "learning_rate": 0.00015676640869530538, + "loss": 0.9506, + "step": 5964 + }, + { + "epoch": 1.2723976109215016, + "grad_norm": 0.32485557378567775, + "learning_rate": 0.00015676401330978992, + "loss": 0.9398, + "step": 5965 + }, + { + "epoch": 1.2726109215017065, + "grad_norm": 2.001825137020637, + "learning_rate": 0.00015676161705568834, + "loss": 0.9886, + "step": 5966 + }, + { + "epoch": 1.2728242320819112, + "grad_norm": 0.396177621893483, + "learning_rate": 0.00015675921993302776, + "loss": 0.9706, + "step": 5967 + }, + { + "epoch": 1.273037542662116, + "grad_norm": 0.9002166855412904, + "learning_rate": 0.00015675682194183537, + "loss": 0.9478, + "step": 5968 + }, + { + "epoch": 1.2732508532423208, + "grad_norm": 0.7805033770234538, + "learning_rate": 0.0001567544230821382, + "loss": 0.9244, + "step": 5969 + }, + { + "epoch": 1.2734641638225255, + "grad_norm": 0.5904575589547058, + "learning_rate": 0.00015675202335396348, + "loss": 0.9749, + "step": 5970 + }, + { + "epoch": 1.2736774744027304, + "grad_norm": 0.5720165368119237, + "learning_rate": 0.00015674962275733832, + "loss": 0.9621, + "step": 5971 + }, + { + "epoch": 1.2738907849829353, + "grad_norm": 0.7619304303780943, + "learning_rate": 0.00015674722129228985, + "loss": 0.9553, + "step": 5972 + }, + { + "epoch": 1.27410409556314, + "grad_norm": 0.7414512574373899, + "learning_rate": 0.0001567448189588453, + "loss": 0.9508, + "step": 5973 + }, + { + "epoch": 1.2743174061433447, + "grad_norm": 0.6327584551538222, + "learning_rate": 0.00015674241575703183, + "loss": 0.9914, + "step": 5974 + }, + { + "epoch": 1.2745307167235496, + "grad_norm": 0.7166302329330542, + "learning_rate": 0.00015674001168687664, + "loss": 0.9593, + "step": 5975 + }, + { + "epoch": 1.2747440273037542, + "grad_norm": 0.585663858404763, + "learning_rate": 0.00015673760674840694, + "loss": 0.9813, + "step": 5976 + }, + { + "epoch": 1.2749573378839592, + "grad_norm": 0.5772479449816095, + "learning_rate": 0.0001567352009416499, + "loss": 0.99, + "step": 5977 + }, + { + "epoch": 1.2751706484641638, + "grad_norm": 0.5204215850806035, + "learning_rate": 0.0001567327942666328, + "loss": 0.934, + "step": 5978 + }, + { + "epoch": 1.2753839590443685, + "grad_norm": 0.5353789872977539, + "learning_rate": 0.00015673038672338283, + "loss": 0.976, + "step": 5979 + }, + { + "epoch": 1.2755972696245734, + "grad_norm": 0.5697090985463067, + "learning_rate": 0.00015672797831192724, + "loss": 0.9514, + "step": 5980 + }, + { + "epoch": 1.275810580204778, + "grad_norm": 0.5456139092189303, + "learning_rate": 0.0001567255690322933, + "loss": 0.979, + "step": 5981 + }, + { + "epoch": 1.276023890784983, + "grad_norm": 0.5997716859667168, + "learning_rate": 0.00015672315888450822, + "loss": 0.9576, + "step": 5982 + }, + { + "epoch": 1.2762372013651877, + "grad_norm": 0.7041694105418742, + "learning_rate": 0.00015672074786859936, + "loss": 0.953, + "step": 5983 + }, + { + "epoch": 1.2764505119453924, + "grad_norm": 0.7540284967481378, + "learning_rate": 0.0001567183359845939, + "loss": 0.9539, + "step": 5984 + }, + { + "epoch": 1.2766638225255973, + "grad_norm": 0.719262989206326, + "learning_rate": 0.0001567159232325192, + "loss": 0.9841, + "step": 5985 + }, + { + "epoch": 1.276877133105802, + "grad_norm": 0.7581896572163294, + "learning_rate": 0.00015671350961240252, + "loss": 0.9804, + "step": 5986 + }, + { + "epoch": 1.2770904436860069, + "grad_norm": 0.8444008152898204, + "learning_rate": 0.0001567110951242712, + "loss": 0.9692, + "step": 5987 + }, + { + "epoch": 1.2773037542662116, + "grad_norm": 0.8112388943122276, + "learning_rate": 0.00015670867976815258, + "loss": 0.9625, + "step": 5988 + }, + { + "epoch": 1.2775170648464163, + "grad_norm": 0.7278603252135433, + "learning_rate": 0.0001567062635440739, + "loss": 0.9571, + "step": 5989 + }, + { + "epoch": 1.2777303754266212, + "grad_norm": 0.558572845952281, + "learning_rate": 0.0001567038464520626, + "loss": 0.9187, + "step": 5990 + }, + { + "epoch": 1.277943686006826, + "grad_norm": 0.4981314441315806, + "learning_rate": 0.00015670142849214598, + "loss": 0.9579, + "step": 5991 + }, + { + "epoch": 1.2781569965870307, + "grad_norm": 0.4398791142585623, + "learning_rate": 0.00015669900966435138, + "loss": 0.9527, + "step": 5992 + }, + { + "epoch": 1.2783703071672354, + "grad_norm": 0.3844293102655885, + "learning_rate": 0.00015669658996870622, + "loss": 0.9748, + "step": 5993 + }, + { + "epoch": 1.2785836177474403, + "grad_norm": 0.4564408904864016, + "learning_rate": 0.00015669416940523786, + "loss": 0.9499, + "step": 5994 + }, + { + "epoch": 1.278796928327645, + "grad_norm": 0.4564812309857781, + "learning_rate": 0.00015669174797397367, + "loss": 0.9762, + "step": 5995 + }, + { + "epoch": 1.27901023890785, + "grad_norm": 0.38623722835779584, + "learning_rate": 0.00015668932567494107, + "loss": 0.9687, + "step": 5996 + }, + { + "epoch": 1.2792235494880546, + "grad_norm": 0.3857914937748039, + "learning_rate": 0.00015668690250816744, + "loss": 0.9777, + "step": 5997 + }, + { + "epoch": 1.2794368600682593, + "grad_norm": 0.34860082745584986, + "learning_rate": 0.00015668447847368022, + "loss": 0.9608, + "step": 5998 + }, + { + "epoch": 1.2796501706484642, + "grad_norm": 0.33240844798849306, + "learning_rate": 0.00015668205357150688, + "loss": 0.9773, + "step": 5999 + }, + { + "epoch": 1.2798634812286689, + "grad_norm": 0.34252679388877066, + "learning_rate": 0.00015667962780167475, + "loss": 0.9839, + "step": 6000 + }, + { + "epoch": 1.2800767918088738, + "grad_norm": 0.35371897310639305, + "learning_rate": 0.0001566772011642114, + "loss": 0.9828, + "step": 6001 + }, + { + "epoch": 1.2802901023890785, + "grad_norm": 0.36864027903215274, + "learning_rate": 0.0001566747736591442, + "loss": 0.9709, + "step": 6002 + }, + { + "epoch": 1.2805034129692832, + "grad_norm": 0.40246813954161226, + "learning_rate": 0.00015667234528650065, + "loss": 0.9313, + "step": 6003 + }, + { + "epoch": 1.280716723549488, + "grad_norm": 0.3654921130160774, + "learning_rate": 0.00015666991604630824, + "loss": 0.9525, + "step": 6004 + }, + { + "epoch": 1.2809300341296928, + "grad_norm": 0.30194627525037293, + "learning_rate": 0.0001566674859385944, + "loss": 0.9654, + "step": 6005 + }, + { + "epoch": 1.2811433447098977, + "grad_norm": 0.31423095988136374, + "learning_rate": 0.0001566650549633867, + "loss": 0.9595, + "step": 6006 + }, + { + "epoch": 1.2813566552901023, + "grad_norm": 0.39195008104103396, + "learning_rate": 0.0001566626231207126, + "loss": 1.0031, + "step": 6007 + }, + { + "epoch": 1.2815699658703072, + "grad_norm": 0.43196898906774517, + "learning_rate": 0.00015666019041059965, + "loss": 0.9429, + "step": 6008 + }, + { + "epoch": 1.281783276450512, + "grad_norm": 0.45298315031825387, + "learning_rate": 0.00015665775683307534, + "loss": 0.9478, + "step": 6009 + }, + { + "epoch": 1.2819965870307168, + "grad_norm": 0.4523016893548649, + "learning_rate": 0.00015665532238816722, + "loss": 0.9568, + "step": 6010 + }, + { + "epoch": 1.2822098976109215, + "grad_norm": 0.44936448142402896, + "learning_rate": 0.00015665288707590282, + "loss": 0.9556, + "step": 6011 + }, + { + "epoch": 1.2824232081911262, + "grad_norm": 0.4308157684879668, + "learning_rate": 0.00015665045089630975, + "loss": 0.9457, + "step": 6012 + }, + { + "epoch": 1.2826365187713311, + "grad_norm": 0.48177300612002744, + "learning_rate": 0.00015664801384941552, + "loss": 0.983, + "step": 6013 + }, + { + "epoch": 1.2828498293515358, + "grad_norm": 0.5542905156467837, + "learning_rate": 0.00015664557593524775, + "loss": 0.9881, + "step": 6014 + }, + { + "epoch": 1.2830631399317407, + "grad_norm": 0.6504965026574042, + "learning_rate": 0.00015664313715383396, + "loss": 0.969, + "step": 6015 + }, + { + "epoch": 1.2832764505119454, + "grad_norm": 0.7296746257555229, + "learning_rate": 0.00015664069750520178, + "loss": 0.9667, + "step": 6016 + }, + { + "epoch": 1.28348976109215, + "grad_norm": 0.7558463591589164, + "learning_rate": 0.00015663825698937885, + "loss": 0.9509, + "step": 6017 + }, + { + "epoch": 1.283703071672355, + "grad_norm": 0.6361575844282713, + "learning_rate": 0.00015663581560639275, + "loss": 0.9887, + "step": 6018 + }, + { + "epoch": 1.2839163822525597, + "grad_norm": 0.5142672312792511, + "learning_rate": 0.0001566333733562711, + "loss": 0.9331, + "step": 6019 + }, + { + "epoch": 1.2841296928327646, + "grad_norm": 0.4429228880498726, + "learning_rate": 0.00015663093023904155, + "loss": 0.9573, + "step": 6020 + }, + { + "epoch": 1.2843430034129693, + "grad_norm": 0.36606701646459644, + "learning_rate": 0.00015662848625473173, + "loss": 0.9466, + "step": 6021 + }, + { + "epoch": 1.284556313993174, + "grad_norm": 0.3396758153820167, + "learning_rate": 0.00015662604140336927, + "loss": 0.9667, + "step": 6022 + }, + { + "epoch": 1.2847696245733788, + "grad_norm": 0.3717442140709851, + "learning_rate": 0.0001566235956849819, + "loss": 0.9508, + "step": 6023 + }, + { + "epoch": 1.2849829351535837, + "grad_norm": 0.6031288942154484, + "learning_rate": 0.00015662114909959724, + "loss": 0.9984, + "step": 6024 + }, + { + "epoch": 1.2851962457337884, + "grad_norm": 0.7871758296621878, + "learning_rate": 0.000156618701647243, + "loss": 0.9407, + "step": 6025 + }, + { + "epoch": 1.2854095563139931, + "grad_norm": 0.9139691418014827, + "learning_rate": 0.00015661625332794686, + "loss": 0.9688, + "step": 6026 + }, + { + "epoch": 1.285622866894198, + "grad_norm": 0.998106280970728, + "learning_rate": 0.00015661380414173652, + "loss": 0.972, + "step": 6027 + }, + { + "epoch": 1.2858361774744027, + "grad_norm": 1.1239604360066, + "learning_rate": 0.00015661135408863967, + "loss": 0.9732, + "step": 6028 + }, + { + "epoch": 1.2860494880546076, + "grad_norm": 0.7191204509863887, + "learning_rate": 0.0001566089031686841, + "loss": 0.9523, + "step": 6029 + }, + { + "epoch": 1.2862627986348123, + "grad_norm": 0.3263759138778739, + "learning_rate": 0.0001566064513818975, + "loss": 0.9781, + "step": 6030 + }, + { + "epoch": 1.286476109215017, + "grad_norm": 0.5752965496028001, + "learning_rate": 0.0001566039987283076, + "loss": 0.9482, + "step": 6031 + }, + { + "epoch": 1.286689419795222, + "grad_norm": 0.9602708953650184, + "learning_rate": 0.00015660154520794216, + "loss": 0.9657, + "step": 6032 + }, + { + "epoch": 1.2869027303754266, + "grad_norm": 1.1344130538762023, + "learning_rate": 0.00015659909082082896, + "loss": 0.9618, + "step": 6033 + }, + { + "epoch": 1.2871160409556315, + "grad_norm": 0.6040522288782724, + "learning_rate": 0.00015659663556699574, + "loss": 0.951, + "step": 6034 + }, + { + "epoch": 1.2873293515358362, + "grad_norm": 0.4538520137136033, + "learning_rate": 0.00015659417944647032, + "loss": 0.9606, + "step": 6035 + }, + { + "epoch": 1.2875426621160408, + "grad_norm": 0.5972906866160289, + "learning_rate": 0.00015659172245928048, + "loss": 0.9354, + "step": 6036 + }, + { + "epoch": 1.2877559726962458, + "grad_norm": 0.7980651647242633, + "learning_rate": 0.000156589264605454, + "loss": 0.9637, + "step": 6037 + }, + { + "epoch": 1.2879692832764504, + "grad_norm": 0.9337603593487396, + "learning_rate": 0.0001565868058850187, + "loss": 0.9712, + "step": 6038 + }, + { + "epoch": 1.2881825938566553, + "grad_norm": 0.877707291473384, + "learning_rate": 0.0001565843462980024, + "loss": 0.9525, + "step": 6039 + }, + { + "epoch": 1.28839590443686, + "grad_norm": 0.6521307505966746, + "learning_rate": 0.00015658188584443295, + "loss": 0.9902, + "step": 6040 + }, + { + "epoch": 1.2886092150170647, + "grad_norm": 0.42823550977956604, + "learning_rate": 0.00015657942452433816, + "loss": 0.9708, + "step": 6041 + }, + { + "epoch": 1.2888225255972696, + "grad_norm": 0.34468264159076134, + "learning_rate": 0.00015657696233774588, + "loss": 0.959, + "step": 6042 + }, + { + "epoch": 1.2890358361774745, + "grad_norm": 0.42142740359810993, + "learning_rate": 0.000156574499284684, + "loss": 0.9354, + "step": 6043 + }, + { + "epoch": 1.2892491467576792, + "grad_norm": 0.5292703728189497, + "learning_rate": 0.00015657203536518038, + "loss": 0.972, + "step": 6044 + }, + { + "epoch": 1.289462457337884, + "grad_norm": 0.51813400205738, + "learning_rate": 0.00015656957057926289, + "loss": 0.981, + "step": 6045 + }, + { + "epoch": 1.2896757679180888, + "grad_norm": 0.4616671939765588, + "learning_rate": 0.0001565671049269594, + "loss": 0.9817, + "step": 6046 + }, + { + "epoch": 1.2898890784982935, + "grad_norm": 0.43895331912497887, + "learning_rate": 0.00015656463840829784, + "loss": 0.9662, + "step": 6047 + }, + { + "epoch": 1.2901023890784984, + "grad_norm": 0.43221672251196847, + "learning_rate": 0.00015656217102330612, + "loss": 0.9204, + "step": 6048 + }, + { + "epoch": 1.290315699658703, + "grad_norm": 0.40948832063216145, + "learning_rate": 0.00015655970277201215, + "loss": 0.9594, + "step": 6049 + }, + { + "epoch": 1.2905290102389078, + "grad_norm": 0.32251440950388327, + "learning_rate": 0.00015655723365444384, + "loss": 0.9247, + "step": 6050 + }, + { + "epoch": 1.2907423208191127, + "grad_norm": 0.3377176816681803, + "learning_rate": 0.00015655476367062914, + "loss": 0.9411, + "step": 6051 + }, + { + "epoch": 1.2909556313993173, + "grad_norm": 0.454218807349998, + "learning_rate": 0.00015655229282059602, + "loss": 0.9706, + "step": 6052 + }, + { + "epoch": 1.2911689419795223, + "grad_norm": 0.5892322017916317, + "learning_rate": 0.0001565498211043724, + "loss": 0.995, + "step": 6053 + }, + { + "epoch": 1.291382252559727, + "grad_norm": 0.6022068479881197, + "learning_rate": 0.00015654734852198627, + "loss": 0.9442, + "step": 6054 + }, + { + "epoch": 1.2915955631399316, + "grad_norm": 0.4570232013948761, + "learning_rate": 0.0001565448750734656, + "loss": 0.9763, + "step": 6055 + }, + { + "epoch": 1.2918088737201365, + "grad_norm": 0.3076619202104443, + "learning_rate": 0.0001565424007588384, + "loss": 0.9796, + "step": 6056 + }, + { + "epoch": 1.2920221843003412, + "grad_norm": 0.2943260046020229, + "learning_rate": 0.00015653992557813264, + "loss": 1.0009, + "step": 6057 + }, + { + "epoch": 1.2922354948805461, + "grad_norm": 0.3595755787582961, + "learning_rate": 0.00015653744953137634, + "loss": 0.9364, + "step": 6058 + }, + { + "epoch": 1.2924488054607508, + "grad_norm": 0.4290302020245316, + "learning_rate": 0.00015653497261859748, + "loss": 0.9834, + "step": 6059 + }, + { + "epoch": 1.2926621160409557, + "grad_norm": 0.4552499131774306, + "learning_rate": 0.00015653249483982414, + "loss": 0.957, + "step": 6060 + }, + { + "epoch": 1.2928754266211604, + "grad_norm": 0.3893277805526411, + "learning_rate": 0.00015653001619508436, + "loss": 0.968, + "step": 6061 + }, + { + "epoch": 1.2930887372013653, + "grad_norm": 0.345401913156239, + "learning_rate": 0.00015652753668440612, + "loss": 0.9643, + "step": 6062 + }, + { + "epoch": 1.29330204778157, + "grad_norm": 0.40887617189632264, + "learning_rate": 0.00015652505630781754, + "loss": 0.9495, + "step": 6063 + }, + { + "epoch": 1.2935153583617747, + "grad_norm": 0.3662318959531934, + "learning_rate": 0.00015652257506534664, + "loss": 0.9834, + "step": 6064 + }, + { + "epoch": 1.2937286689419796, + "grad_norm": 0.33817660413644923, + "learning_rate": 0.00015652009295702156, + "loss": 0.9646, + "step": 6065 + }, + { + "epoch": 1.2939419795221843, + "grad_norm": 0.37436177222163386, + "learning_rate": 0.0001565176099828703, + "loss": 0.9378, + "step": 6066 + }, + { + "epoch": 1.2941552901023892, + "grad_norm": 0.406775588998285, + "learning_rate": 0.000156515126142921, + "loss": 0.9786, + "step": 6067 + }, + { + "epoch": 1.2943686006825939, + "grad_norm": 0.46321851189451524, + "learning_rate": 0.00015651264143720173, + "loss": 0.9609, + "step": 6068 + }, + { + "epoch": 1.2945819112627985, + "grad_norm": 0.39008752714233047, + "learning_rate": 0.0001565101558657407, + "loss": 0.9823, + "step": 6069 + }, + { + "epoch": 1.2947952218430034, + "grad_norm": 0.2888753964530381, + "learning_rate": 0.00015650766942856592, + "loss": 0.9519, + "step": 6070 + }, + { + "epoch": 1.2950085324232081, + "grad_norm": 0.38463003053807565, + "learning_rate": 0.0001565051821257056, + "loss": 0.959, + "step": 6071 + }, + { + "epoch": 1.295221843003413, + "grad_norm": 0.390345366428164, + "learning_rate": 0.00015650269395718784, + "loss": 0.9707, + "step": 6072 + }, + { + "epoch": 1.2954351535836177, + "grad_norm": 0.2820722393994069, + "learning_rate": 0.00015650020492304084, + "loss": 0.9458, + "step": 6073 + }, + { + "epoch": 1.2956484641638224, + "grad_norm": 0.3452830568817978, + "learning_rate": 0.0001564977150232927, + "loss": 0.9483, + "step": 6074 + }, + { + "epoch": 1.2958617747440273, + "grad_norm": 0.3518330465802901, + "learning_rate": 0.00015649522425797166, + "loss": 0.9188, + "step": 6075 + }, + { + "epoch": 1.2960750853242322, + "grad_norm": 0.32215949501674485, + "learning_rate": 0.00015649273262710584, + "loss": 0.9765, + "step": 6076 + }, + { + "epoch": 1.296288395904437, + "grad_norm": 0.48025700875594174, + "learning_rate": 0.00015649024013072348, + "loss": 0.9677, + "step": 6077 + }, + { + "epoch": 1.2965017064846416, + "grad_norm": 0.5961866321146346, + "learning_rate": 0.00015648774676885278, + "loss": 0.9793, + "step": 6078 + }, + { + "epoch": 1.2967150170648465, + "grad_norm": 0.5919033054810975, + "learning_rate": 0.00015648525254152192, + "loss": 0.9521, + "step": 6079 + }, + { + "epoch": 1.2969283276450512, + "grad_norm": 0.7461501364123168, + "learning_rate": 0.00015648275744875915, + "loss": 0.9357, + "step": 6080 + }, + { + "epoch": 1.297141638225256, + "grad_norm": 0.8156907988524282, + "learning_rate": 0.00015648026149059268, + "loss": 0.9859, + "step": 6081 + }, + { + "epoch": 1.2973549488054608, + "grad_norm": 0.6046888102694288, + "learning_rate": 0.00015647776466705076, + "loss": 0.9491, + "step": 6082 + }, + { + "epoch": 1.2975682593856654, + "grad_norm": 0.43578631306288557, + "learning_rate": 0.00015647526697816166, + "loss": 0.9387, + "step": 6083 + }, + { + "epoch": 1.2977815699658704, + "grad_norm": 0.448026091581575, + "learning_rate": 0.00015647276842395365, + "loss": 0.9592, + "step": 6084 + }, + { + "epoch": 1.297994880546075, + "grad_norm": 0.41946520345800886, + "learning_rate": 0.00015647026900445494, + "loss": 0.9723, + "step": 6085 + }, + { + "epoch": 1.29820819112628, + "grad_norm": 0.3951142668665709, + "learning_rate": 0.00015646776871969386, + "loss": 0.9718, + "step": 6086 + }, + { + "epoch": 1.2984215017064846, + "grad_norm": 0.5123717199576885, + "learning_rate": 0.0001564652675696987, + "loss": 0.9358, + "step": 6087 + }, + { + "epoch": 1.2986348122866893, + "grad_norm": 0.5118415374413717, + "learning_rate": 0.00015646276555449776, + "loss": 0.9653, + "step": 6088 + }, + { + "epoch": 1.2988481228668942, + "grad_norm": 0.4511839448464573, + "learning_rate": 0.0001564602626741193, + "loss": 0.9733, + "step": 6089 + }, + { + "epoch": 1.299061433447099, + "grad_norm": 0.46952814463672204, + "learning_rate": 0.00015645775892859172, + "loss": 0.9918, + "step": 6090 + }, + { + "epoch": 1.2992747440273038, + "grad_norm": 0.43407274747364677, + "learning_rate": 0.0001564552543179433, + "loss": 0.9879, + "step": 6091 + }, + { + "epoch": 1.2994880546075085, + "grad_norm": 0.23951107094108495, + "learning_rate": 0.0001564527488422024, + "loss": 0.9731, + "step": 6092 + }, + { + "epoch": 1.2997013651877132, + "grad_norm": 0.4632024934492076, + "learning_rate": 0.00015645024250139736, + "loss": 0.9637, + "step": 6093 + }, + { + "epoch": 1.299914675767918, + "grad_norm": 0.5012776821791325, + "learning_rate": 0.00015644773529555653, + "loss": 0.9603, + "step": 6094 + }, + { + "epoch": 1.300127986348123, + "grad_norm": 0.47582100136719635, + "learning_rate": 0.0001564452272247083, + "loss": 0.9616, + "step": 6095 + }, + { + "epoch": 1.3003412969283277, + "grad_norm": 0.49458569767292654, + "learning_rate": 0.00015644271828888103, + "loss": 0.937, + "step": 6096 + }, + { + "epoch": 1.3005546075085324, + "grad_norm": 0.4260058960498475, + "learning_rate": 0.00015644020848810312, + "loss": 0.9602, + "step": 6097 + }, + { + "epoch": 1.3007679180887373, + "grad_norm": 0.35779296954801354, + "learning_rate": 0.00015643769782240296, + "loss": 0.9763, + "step": 6098 + }, + { + "epoch": 1.300981228668942, + "grad_norm": 0.3803677383095146, + "learning_rate": 0.00015643518629180898, + "loss": 0.9607, + "step": 6099 + }, + { + "epoch": 1.3011945392491469, + "grad_norm": 0.3483838875795586, + "learning_rate": 0.00015643267389634955, + "loss": 0.9548, + "step": 6100 + }, + { + "epoch": 1.3014078498293515, + "grad_norm": 0.31052944232788604, + "learning_rate": 0.00015643016063605314, + "loss": 0.9609, + "step": 6101 + }, + { + "epoch": 1.3016211604095562, + "grad_norm": 0.37890899086033464, + "learning_rate": 0.0001564276465109482, + "loss": 0.9504, + "step": 6102 + }, + { + "epoch": 1.3018344709897611, + "grad_norm": 0.49179914701663, + "learning_rate": 0.00015642513152106313, + "loss": 0.9303, + "step": 6103 + }, + { + "epoch": 1.3020477815699658, + "grad_norm": 0.6495296879112132, + "learning_rate": 0.00015642261566642642, + "loss": 0.9771, + "step": 6104 + }, + { + "epoch": 1.3022610921501707, + "grad_norm": 0.6898446096703846, + "learning_rate": 0.00015642009894706652, + "loss": 0.9409, + "step": 6105 + }, + { + "epoch": 1.3024744027303754, + "grad_norm": 0.7293905396244452, + "learning_rate": 0.00015641758136301193, + "loss": 0.9584, + "step": 6106 + }, + { + "epoch": 1.30268771331058, + "grad_norm": 0.7107407496769508, + "learning_rate": 0.0001564150629142911, + "loss": 0.9476, + "step": 6107 + }, + { + "epoch": 1.302901023890785, + "grad_norm": 0.6203632333063099, + "learning_rate": 0.00015641254360093258, + "loss": 0.9893, + "step": 6108 + }, + { + "epoch": 1.30311433447099, + "grad_norm": 0.5551493110316543, + "learning_rate": 0.00015641002342296482, + "loss": 0.971, + "step": 6109 + }, + { + "epoch": 1.3033276450511946, + "grad_norm": 0.5505401010620541, + "learning_rate": 0.00015640750238041638, + "loss": 0.9735, + "step": 6110 + }, + { + "epoch": 1.3035409556313993, + "grad_norm": 0.39578207869319154, + "learning_rate": 0.00015640498047331573, + "loss": 0.9577, + "step": 6111 + }, + { + "epoch": 1.3037542662116042, + "grad_norm": 0.2907332503001976, + "learning_rate": 0.00015640245770169147, + "loss": 0.9406, + "step": 6112 + }, + { + "epoch": 1.3039675767918089, + "grad_norm": 0.3249619583573108, + "learning_rate": 0.0001563999340655721, + "loss": 0.9574, + "step": 6113 + }, + { + "epoch": 1.3041808873720138, + "grad_norm": 0.346011909139572, + "learning_rate": 0.00015639740956498621, + "loss": 0.965, + "step": 6114 + }, + { + "epoch": 1.3043941979522184, + "grad_norm": 0.3875518689456871, + "learning_rate": 0.00015639488419996235, + "loss": 0.9285, + "step": 6115 + }, + { + "epoch": 1.3046075085324231, + "grad_norm": 0.48960543342696816, + "learning_rate": 0.00015639235797052905, + "loss": 0.9811, + "step": 6116 + }, + { + "epoch": 1.304820819112628, + "grad_norm": 0.5942491307431828, + "learning_rate": 0.00015638983087671497, + "loss": 0.9789, + "step": 6117 + }, + { + "epoch": 1.3050341296928327, + "grad_norm": 0.6351806998071693, + "learning_rate": 0.00015638730291854866, + "loss": 0.9777, + "step": 6118 + }, + { + "epoch": 1.3052474402730376, + "grad_norm": 0.5917868708876071, + "learning_rate": 0.00015638477409605874, + "loss": 0.9791, + "step": 6119 + }, + { + "epoch": 1.3054607508532423, + "grad_norm": 0.45983773724717625, + "learning_rate": 0.0001563822444092738, + "loss": 0.9149, + "step": 6120 + }, + { + "epoch": 1.305674061433447, + "grad_norm": 0.3602121413675008, + "learning_rate": 0.00015637971385822248, + "loss": 0.9754, + "step": 6121 + }, + { + "epoch": 1.305887372013652, + "grad_norm": 0.29161492044880977, + "learning_rate": 0.00015637718244293342, + "loss": 0.9775, + "step": 6122 + }, + { + "epoch": 1.3061006825938566, + "grad_norm": 0.3634531678408645, + "learning_rate": 0.00015637465016343527, + "loss": 0.9589, + "step": 6123 + }, + { + "epoch": 1.3063139931740615, + "grad_norm": 0.4095959593150331, + "learning_rate": 0.00015637211701975668, + "loss": 0.9744, + "step": 6124 + }, + { + "epoch": 1.3065273037542662, + "grad_norm": 0.40409738504422904, + "learning_rate": 0.00015636958301192626, + "loss": 0.9306, + "step": 6125 + }, + { + "epoch": 1.3067406143344709, + "grad_norm": 0.6520260070041244, + "learning_rate": 0.00015636704813997275, + "loss": 0.9674, + "step": 6126 + }, + { + "epoch": 1.3069539249146758, + "grad_norm": 0.4949201013176546, + "learning_rate": 0.00015636451240392477, + "loss": 0.9354, + "step": 6127 + }, + { + "epoch": 1.3071672354948807, + "grad_norm": 0.49274088628863205, + "learning_rate": 0.0001563619758038111, + "loss": 0.9707, + "step": 6128 + }, + { + "epoch": 1.3073805460750854, + "grad_norm": 0.42499240835886437, + "learning_rate": 0.00015635943833966034, + "loss": 0.9556, + "step": 6129 + }, + { + "epoch": 1.30759385665529, + "grad_norm": 0.3677526715194333, + "learning_rate": 0.0001563569000115013, + "loss": 0.985, + "step": 6130 + }, + { + "epoch": 1.307807167235495, + "grad_norm": 0.39441958347563066, + "learning_rate": 0.0001563543608193626, + "loss": 0.9593, + "step": 6131 + }, + { + "epoch": 1.3080204778156996, + "grad_norm": 0.393744391659762, + "learning_rate": 0.00015635182076327304, + "loss": 0.9803, + "step": 6132 + }, + { + "epoch": 1.3082337883959045, + "grad_norm": 0.5457088256778012, + "learning_rate": 0.00015634927984326137, + "loss": 0.96, + "step": 6133 + }, + { + "epoch": 1.3084470989761092, + "grad_norm": 0.5455151237811582, + "learning_rate": 0.0001563467380593563, + "loss": 0.9423, + "step": 6134 + }, + { + "epoch": 1.308660409556314, + "grad_norm": 0.44780063903988676, + "learning_rate": 0.0001563441954115866, + "loss": 0.9821, + "step": 6135 + }, + { + "epoch": 1.3088737201365188, + "grad_norm": 0.48775471291978406, + "learning_rate": 0.00015634165189998104, + "loss": 0.9976, + "step": 6136 + }, + { + "epoch": 1.3090870307167235, + "grad_norm": 0.5260251039148689, + "learning_rate": 0.00015633910752456843, + "loss": 0.9495, + "step": 6137 + }, + { + "epoch": 1.3093003412969284, + "grad_norm": 0.680470346139667, + "learning_rate": 0.0001563365622853775, + "loss": 0.9808, + "step": 6138 + }, + { + "epoch": 1.309513651877133, + "grad_norm": 0.6971349452969732, + "learning_rate": 0.00015633401618243712, + "loss": 0.9703, + "step": 6139 + }, + { + "epoch": 1.3097269624573378, + "grad_norm": 0.652155580508358, + "learning_rate": 0.00015633146921577602, + "loss": 0.9794, + "step": 6140 + }, + { + "epoch": 1.3099402730375427, + "grad_norm": 0.5501838153752391, + "learning_rate": 0.00015632892138542308, + "loss": 0.9923, + "step": 6141 + }, + { + "epoch": 1.3101535836177474, + "grad_norm": 0.5176006053825913, + "learning_rate": 0.00015632637269140712, + "loss": 0.9344, + "step": 6142 + }, + { + "epoch": 1.3103668941979523, + "grad_norm": 0.4631967621838635, + "learning_rate": 0.00015632382313375697, + "loss": 0.9584, + "step": 6143 + }, + { + "epoch": 1.310580204778157, + "grad_norm": 0.4118175706750553, + "learning_rate": 0.00015632127271250146, + "loss": 1.0071, + "step": 6144 + }, + { + "epoch": 1.3107935153583616, + "grad_norm": 0.6294652423419698, + "learning_rate": 0.0001563187214276695, + "loss": 0.9664, + "step": 6145 + }, + { + "epoch": 1.3110068259385665, + "grad_norm": 0.6803231261407817, + "learning_rate": 0.0001563161692792899, + "loss": 0.959, + "step": 6146 + }, + { + "epoch": 1.3112201365187715, + "grad_norm": 0.6955824029344249, + "learning_rate": 0.00015631361626739155, + "loss": 0.9974, + "step": 6147 + }, + { + "epoch": 1.3114334470989761, + "grad_norm": 0.7586864799901999, + "learning_rate": 0.00015631106239200336, + "loss": 0.9683, + "step": 6148 + }, + { + "epoch": 1.3116467576791808, + "grad_norm": 0.7779754951856394, + "learning_rate": 0.0001563085076531542, + "loss": 0.9568, + "step": 6149 + }, + { + "epoch": 1.3118600682593857, + "grad_norm": 0.742652422989719, + "learning_rate": 0.000156305952050873, + "loss": 0.9836, + "step": 6150 + }, + { + "epoch": 1.3120733788395904, + "grad_norm": 0.6886948130691589, + "learning_rate": 0.00015630339558518868, + "loss": 0.9446, + "step": 6151 + }, + { + "epoch": 1.3122866894197953, + "grad_norm": 0.49327544621461966, + "learning_rate": 0.00015630083825613018, + "loss": 0.9408, + "step": 6152 + }, + { + "epoch": 1.3125, + "grad_norm": 0.37445289783547764, + "learning_rate": 0.00015629828006372636, + "loss": 0.9893, + "step": 6153 + }, + { + "epoch": 1.3127133105802047, + "grad_norm": 0.3615479271129651, + "learning_rate": 0.00015629572100800624, + "loss": 0.9785, + "step": 6154 + }, + { + "epoch": 1.3129266211604096, + "grad_norm": 0.47780593593190457, + "learning_rate": 0.00015629316108899875, + "loss": 0.9816, + "step": 6155 + }, + { + "epoch": 1.3131399317406143, + "grad_norm": 0.5935044230308593, + "learning_rate": 0.00015629060030673288, + "loss": 0.9651, + "step": 6156 + }, + { + "epoch": 1.3133532423208192, + "grad_norm": 0.6483789152707213, + "learning_rate": 0.00015628803866123757, + "loss": 0.9584, + "step": 6157 + }, + { + "epoch": 1.3135665529010239, + "grad_norm": 0.6252587911567191, + "learning_rate": 0.00015628547615254183, + "loss": 0.9851, + "step": 6158 + }, + { + "epoch": 1.3137798634812285, + "grad_norm": 0.4896733060737531, + "learning_rate": 0.00015628291278067463, + "loss": 0.9407, + "step": 6159 + }, + { + "epoch": 1.3139931740614335, + "grad_norm": 0.3749622529299815, + "learning_rate": 0.00015628034854566497, + "loss": 0.9263, + "step": 6160 + }, + { + "epoch": 1.3142064846416384, + "grad_norm": 0.26705068628055684, + "learning_rate": 0.00015627778344754192, + "loss": 0.9738, + "step": 6161 + }, + { + "epoch": 1.314419795221843, + "grad_norm": 0.2870324522766437, + "learning_rate": 0.00015627521748633443, + "loss": 0.9895, + "step": 6162 + }, + { + "epoch": 1.3146331058020477, + "grad_norm": 0.44540853921704004, + "learning_rate": 0.0001562726506620716, + "loss": 0.9772, + "step": 6163 + }, + { + "epoch": 1.3148464163822526, + "grad_norm": 0.529006661416085, + "learning_rate": 0.00015627008297478245, + "loss": 0.9363, + "step": 6164 + }, + { + "epoch": 1.3150597269624573, + "grad_norm": 0.6198146527308284, + "learning_rate": 0.000156267514424496, + "loss": 0.9674, + "step": 6165 + }, + { + "epoch": 1.3152730375426622, + "grad_norm": 0.5216832413529038, + "learning_rate": 0.00015626494501124136, + "loss": 0.9795, + "step": 6166 + }, + { + "epoch": 1.315486348122867, + "grad_norm": 0.41703240769097116, + "learning_rate": 0.00015626237473504757, + "loss": 0.9737, + "step": 6167 + }, + { + "epoch": 1.3156996587030716, + "grad_norm": 0.2998807793142318, + "learning_rate": 0.00015625980359594376, + "loss": 0.9293, + "step": 6168 + }, + { + "epoch": 1.3159129692832765, + "grad_norm": 0.3385489714723307, + "learning_rate": 0.00015625723159395897, + "loss": 0.9596, + "step": 6169 + }, + { + "epoch": 1.3161262798634812, + "grad_norm": 0.39160570935120254, + "learning_rate": 0.00015625465872912231, + "loss": 0.9506, + "step": 6170 + }, + { + "epoch": 1.316339590443686, + "grad_norm": 0.3729108996665403, + "learning_rate": 0.0001562520850014629, + "loss": 0.9773, + "step": 6171 + }, + { + "epoch": 1.3165529010238908, + "grad_norm": 0.30721024599573515, + "learning_rate": 0.00015624951041100989, + "loss": 0.9593, + "step": 6172 + }, + { + "epoch": 1.3167662116040955, + "grad_norm": 0.3688097900734802, + "learning_rate": 0.0001562469349577924, + "loss": 0.9795, + "step": 6173 + }, + { + "epoch": 1.3169795221843004, + "grad_norm": 0.3987187742544757, + "learning_rate": 0.0001562443586418395, + "loss": 0.984, + "step": 6174 + }, + { + "epoch": 1.317192832764505, + "grad_norm": 0.3096001267666377, + "learning_rate": 0.00015624178146318043, + "loss": 0.9616, + "step": 6175 + }, + { + "epoch": 1.31740614334471, + "grad_norm": 0.3076227459297573, + "learning_rate": 0.00015623920342184436, + "loss": 0.9448, + "step": 6176 + }, + { + "epoch": 1.3176194539249146, + "grad_norm": 0.4258551644746513, + "learning_rate": 0.00015623662451786036, + "loss": 0.985, + "step": 6177 + }, + { + "epoch": 1.3178327645051193, + "grad_norm": 0.45808306057378045, + "learning_rate": 0.0001562340447512577, + "loss": 0.9697, + "step": 6178 + }, + { + "epoch": 1.3180460750853242, + "grad_norm": 0.38909505654685816, + "learning_rate": 0.00015623146412206554, + "loss": 0.94, + "step": 6179 + }, + { + "epoch": 1.3182593856655291, + "grad_norm": 0.34684569228744533, + "learning_rate": 0.00015622888263031308, + "loss": 0.9843, + "step": 6180 + }, + { + "epoch": 1.3184726962457338, + "grad_norm": 0.3759924327636347, + "learning_rate": 0.00015622630027602952, + "loss": 0.9774, + "step": 6181 + }, + { + "epoch": 1.3186860068259385, + "grad_norm": 0.377228311464931, + "learning_rate": 0.0001562237170592441, + "loss": 0.9331, + "step": 6182 + }, + { + "epoch": 1.3188993174061434, + "grad_norm": 0.40263774651501544, + "learning_rate": 0.00015622113297998603, + "loss": 0.9625, + "step": 6183 + }, + { + "epoch": 1.319112627986348, + "grad_norm": 0.43497768871030196, + "learning_rate": 0.00015621854803828458, + "loss": 0.9702, + "step": 6184 + }, + { + "epoch": 1.319325938566553, + "grad_norm": 0.4656176301365763, + "learning_rate": 0.00015621596223416897, + "loss": 0.9778, + "step": 6185 + }, + { + "epoch": 1.3195392491467577, + "grad_norm": 0.4766468721110239, + "learning_rate": 0.00015621337556766845, + "loss": 0.9706, + "step": 6186 + }, + { + "epoch": 1.3197525597269624, + "grad_norm": 0.46049797649126667, + "learning_rate": 0.00015621078803881232, + "loss": 0.9947, + "step": 6187 + }, + { + "epoch": 1.3199658703071673, + "grad_norm": 0.42909610317115154, + "learning_rate": 0.00015620819964762986, + "loss": 0.9586, + "step": 6188 + }, + { + "epoch": 1.320179180887372, + "grad_norm": 0.4060570096684126, + "learning_rate": 0.0001562056103941503, + "loss": 0.9581, + "step": 6189 + }, + { + "epoch": 1.3203924914675769, + "grad_norm": 0.4271271429992679, + "learning_rate": 0.00015620302027840298, + "loss": 0.9997, + "step": 6190 + }, + { + "epoch": 1.3206058020477816, + "grad_norm": 0.4831308433766614, + "learning_rate": 0.0001562004293004172, + "loss": 0.9614, + "step": 6191 + }, + { + "epoch": 1.3208191126279862, + "grad_norm": 0.5496769340796854, + "learning_rate": 0.0001561978374602223, + "loss": 0.9747, + "step": 6192 + }, + { + "epoch": 1.3210324232081911, + "grad_norm": 0.6442599729897857, + "learning_rate": 0.0001561952447578476, + "loss": 0.9784, + "step": 6193 + }, + { + "epoch": 1.3212457337883958, + "grad_norm": 0.62906254611194, + "learning_rate": 0.0001561926511933224, + "loss": 0.929, + "step": 6194 + }, + { + "epoch": 1.3214590443686007, + "grad_norm": 0.6280583817439493, + "learning_rate": 0.00015619005676667608, + "loss": 0.9663, + "step": 6195 + }, + { + "epoch": 1.3216723549488054, + "grad_norm": 0.951339732711457, + "learning_rate": 0.000156187461477938, + "loss": 0.9706, + "step": 6196 + }, + { + "epoch": 1.32188566552901, + "grad_norm": 0.5847947394378993, + "learning_rate": 0.00015618486532713747, + "loss": 0.9591, + "step": 6197 + }, + { + "epoch": 1.322098976109215, + "grad_norm": 0.5255268652285309, + "learning_rate": 0.00015618226831430395, + "loss": 0.957, + "step": 6198 + }, + { + "epoch": 1.32231228668942, + "grad_norm": 0.6930185027664347, + "learning_rate": 0.00015617967043946674, + "loss": 0.9605, + "step": 6199 + }, + { + "epoch": 1.3225255972696246, + "grad_norm": 0.42825051327902647, + "learning_rate": 0.0001561770717026553, + "loss": 0.9556, + "step": 6200 + }, + { + "epoch": 1.3227389078498293, + "grad_norm": 0.433892180470271, + "learning_rate": 0.00015617447210389902, + "loss": 0.9582, + "step": 6201 + }, + { + "epoch": 1.3229522184300342, + "grad_norm": 0.5469045174728403, + "learning_rate": 0.0001561718716432273, + "loss": 0.922, + "step": 6202 + }, + { + "epoch": 1.3231655290102389, + "grad_norm": 0.7149569415540747, + "learning_rate": 0.00015616927032066958, + "loss": 0.9542, + "step": 6203 + }, + { + "epoch": 1.3233788395904438, + "grad_norm": 0.575485287277824, + "learning_rate": 0.0001561666681362553, + "loss": 0.9534, + "step": 6204 + }, + { + "epoch": 1.3235921501706485, + "grad_norm": 0.5363742325487525, + "learning_rate": 0.00015616406509001384, + "loss": 0.9988, + "step": 6205 + }, + { + "epoch": 1.3238054607508531, + "grad_norm": 0.5708436756337961, + "learning_rate": 0.00015616146118197475, + "loss": 0.9645, + "step": 6206 + }, + { + "epoch": 1.324018771331058, + "grad_norm": 0.6266189185992729, + "learning_rate": 0.0001561588564121674, + "loss": 0.975, + "step": 6207 + }, + { + "epoch": 1.3242320819112627, + "grad_norm": 0.5783548770848532, + "learning_rate": 0.00015615625078062135, + "loss": 0.9631, + "step": 6208 + }, + { + "epoch": 1.3244453924914676, + "grad_norm": 0.461667762108796, + "learning_rate": 0.00015615364428736602, + "loss": 0.9693, + "step": 6209 + }, + { + "epoch": 1.3246587030716723, + "grad_norm": 0.47517424349439785, + "learning_rate": 0.00015615103693243095, + "loss": 0.9631, + "step": 6210 + }, + { + "epoch": 1.324872013651877, + "grad_norm": 0.4523771132920012, + "learning_rate": 0.0001561484287158456, + "loss": 0.9566, + "step": 6211 + }, + { + "epoch": 1.325085324232082, + "grad_norm": 0.43324434666629474, + "learning_rate": 0.00015614581963763947, + "loss": 0.9668, + "step": 6212 + }, + { + "epoch": 1.3252986348122868, + "grad_norm": 0.41472360234079264, + "learning_rate": 0.00015614320969784215, + "loss": 0.9293, + "step": 6213 + }, + { + "epoch": 1.3255119453924915, + "grad_norm": 0.3465064633672683, + "learning_rate": 0.0001561405988964831, + "loss": 0.9631, + "step": 6214 + }, + { + "epoch": 1.3257252559726962, + "grad_norm": 0.3195560926957221, + "learning_rate": 0.0001561379872335919, + "loss": 0.9414, + "step": 6215 + }, + { + "epoch": 1.325938566552901, + "grad_norm": 0.40511594973805154, + "learning_rate": 0.0001561353747091981, + "loss": 0.9512, + "step": 6216 + }, + { + "epoch": 1.3261518771331058, + "grad_norm": 0.511647264328382, + "learning_rate": 0.00015613276132333127, + "loss": 0.9776, + "step": 6217 + }, + { + "epoch": 1.3263651877133107, + "grad_norm": 0.49084226006665715, + "learning_rate": 0.00015613014707602094, + "loss": 0.9481, + "step": 6218 + }, + { + "epoch": 1.3265784982935154, + "grad_norm": 0.40641436487319565, + "learning_rate": 0.00015612753196729673, + "loss": 0.9513, + "step": 6219 + }, + { + "epoch": 1.32679180887372, + "grad_norm": 0.36907456710281, + "learning_rate": 0.00015612491599718823, + "loss": 0.9587, + "step": 6220 + }, + { + "epoch": 1.327005119453925, + "grad_norm": 0.3581995544077375, + "learning_rate": 0.000156122299165725, + "loss": 0.9419, + "step": 6221 + }, + { + "epoch": 1.3272184300341296, + "grad_norm": 0.3616507646787153, + "learning_rate": 0.00015611968147293668, + "loss": 0.954, + "step": 6222 + }, + { + "epoch": 1.3274317406143346, + "grad_norm": 0.34693930897702085, + "learning_rate": 0.00015611706291885288, + "loss": 0.9686, + "step": 6223 + }, + { + "epoch": 1.3276450511945392, + "grad_norm": 0.32803849222416026, + "learning_rate": 0.00015611444350350324, + "loss": 0.9932, + "step": 6224 + }, + { + "epoch": 1.327858361774744, + "grad_norm": 0.26321981766445945, + "learning_rate": 0.0001561118232269174, + "loss": 0.9606, + "step": 6225 + }, + { + "epoch": 1.3280716723549488, + "grad_norm": 0.3160279730007017, + "learning_rate": 0.00015610920208912498, + "loss": 0.9629, + "step": 6226 + }, + { + "epoch": 1.3282849829351535, + "grad_norm": 0.4368882520450433, + "learning_rate": 0.00015610658009015567, + "loss": 0.9889, + "step": 6227 + }, + { + "epoch": 1.3284982935153584, + "grad_norm": 0.4518947437630603, + "learning_rate": 0.00015610395723003912, + "loss": 0.9374, + "step": 6228 + }, + { + "epoch": 1.328711604095563, + "grad_norm": 0.4036825384601867, + "learning_rate": 0.000156101333508805, + "loss": 0.9626, + "step": 6229 + }, + { + "epoch": 1.3289249146757678, + "grad_norm": 0.3867266899324074, + "learning_rate": 0.00015609870892648305, + "loss": 0.9502, + "step": 6230 + }, + { + "epoch": 1.3291382252559727, + "grad_norm": 0.4037247099650175, + "learning_rate": 0.0001560960834831029, + "loss": 0.9525, + "step": 6231 + }, + { + "epoch": 1.3293515358361776, + "grad_norm": 0.4205059269969061, + "learning_rate": 0.0001560934571786943, + "loss": 0.9599, + "step": 6232 + }, + { + "epoch": 1.3295648464163823, + "grad_norm": 0.4041523798333893, + "learning_rate": 0.00015609083001328692, + "loss": 0.9839, + "step": 6233 + }, + { + "epoch": 1.329778156996587, + "grad_norm": 0.38727084332627393, + "learning_rate": 0.00015608820198691057, + "loss": 0.9732, + "step": 6234 + }, + { + "epoch": 1.3299914675767919, + "grad_norm": 0.4631630921235303, + "learning_rate": 0.0001560855730995949, + "loss": 0.9578, + "step": 6235 + }, + { + "epoch": 1.3302047781569966, + "grad_norm": 0.5293595679760115, + "learning_rate": 0.0001560829433513697, + "loss": 0.9691, + "step": 6236 + }, + { + "epoch": 1.3304180887372015, + "grad_norm": 0.4597357869004747, + "learning_rate": 0.0001560803127422647, + "loss": 0.9768, + "step": 6237 + }, + { + "epoch": 1.3306313993174061, + "grad_norm": 0.4743625426784657, + "learning_rate": 0.00015607768127230971, + "loss": 0.9884, + "step": 6238 + }, + { + "epoch": 1.3308447098976108, + "grad_norm": 0.4934119433917797, + "learning_rate": 0.00015607504894153446, + "loss": 0.9571, + "step": 6239 + }, + { + "epoch": 1.3310580204778157, + "grad_norm": 0.4129448596362092, + "learning_rate": 0.00015607241574996876, + "loss": 0.9635, + "step": 6240 + }, + { + "epoch": 1.3312713310580204, + "grad_norm": 0.36408608623678734, + "learning_rate": 0.0001560697816976424, + "loss": 1.0008, + "step": 6241 + }, + { + "epoch": 1.3314846416382253, + "grad_norm": 0.3072934851760362, + "learning_rate": 0.0001560671467845852, + "loss": 0.9831, + "step": 6242 + }, + { + "epoch": 1.33169795221843, + "grad_norm": 0.3101923310422754, + "learning_rate": 0.0001560645110108269, + "loss": 0.9488, + "step": 6243 + }, + { + "epoch": 1.3319112627986347, + "grad_norm": 0.3501703669363611, + "learning_rate": 0.00015606187437639739, + "loss": 0.9743, + "step": 6244 + }, + { + "epoch": 1.3321245733788396, + "grad_norm": 0.32063797277201284, + "learning_rate": 0.00015605923688132652, + "loss": 0.9523, + "step": 6245 + }, + { + "epoch": 1.3323378839590443, + "grad_norm": 0.33147593307213935, + "learning_rate": 0.0001560565985256441, + "loss": 0.9477, + "step": 6246 + }, + { + "epoch": 1.3325511945392492, + "grad_norm": 0.3489143969115092, + "learning_rate": 0.00015605395930937997, + "loss": 0.958, + "step": 6247 + }, + { + "epoch": 1.3327645051194539, + "grad_norm": 0.3508542910454327, + "learning_rate": 0.00015605131923256404, + "loss": 0.9633, + "step": 6248 + }, + { + "epoch": 1.3329778156996588, + "grad_norm": 0.42246557532423684, + "learning_rate": 0.00015604867829522613, + "loss": 0.9661, + "step": 6249 + }, + { + "epoch": 1.3331911262798635, + "grad_norm": 0.44761631289233983, + "learning_rate": 0.00015604603649739613, + "loss": 0.9494, + "step": 6250 + }, + { + "epoch": 1.3334044368600684, + "grad_norm": 0.4830695162050874, + "learning_rate": 0.00015604339383910399, + "loss": 1.013, + "step": 6251 + }, + { + "epoch": 1.333617747440273, + "grad_norm": 0.5199084315549051, + "learning_rate": 0.00015604075032037953, + "loss": 0.9501, + "step": 6252 + }, + { + "epoch": 1.3338310580204777, + "grad_norm": 0.5464496521162018, + "learning_rate": 0.00015603810594125273, + "loss": 0.9739, + "step": 6253 + }, + { + "epoch": 1.3340443686006827, + "grad_norm": 0.5354983436055767, + "learning_rate": 0.00015603546070175345, + "loss": 0.9669, + "step": 6254 + }, + { + "epoch": 1.3342576791808873, + "grad_norm": 0.5108386114613112, + "learning_rate": 0.00015603281460191168, + "loss": 0.9484, + "step": 6255 + }, + { + "epoch": 1.3344709897610922, + "grad_norm": 0.4468188594152561, + "learning_rate": 0.00015603016764175734, + "loss": 0.9323, + "step": 6256 + }, + { + "epoch": 1.334684300341297, + "grad_norm": 0.46529342417059827, + "learning_rate": 0.00015602751982132036, + "loss": 0.9443, + "step": 6257 + }, + { + "epoch": 1.3348976109215016, + "grad_norm": 0.5617572185023767, + "learning_rate": 0.0001560248711406307, + "loss": 0.9559, + "step": 6258 + }, + { + "epoch": 1.3351109215017065, + "grad_norm": 0.6470958418342032, + "learning_rate": 0.00015602222159971838, + "loss": 0.9712, + "step": 6259 + }, + { + "epoch": 1.3353242320819112, + "grad_norm": 0.6811032827870457, + "learning_rate": 0.00015601957119861334, + "loss": 0.9303, + "step": 6260 + }, + { + "epoch": 1.335537542662116, + "grad_norm": 0.6260893331533756, + "learning_rate": 0.00015601691993734554, + "loss": 0.9198, + "step": 6261 + }, + { + "epoch": 1.3357508532423208, + "grad_norm": 0.5807007528749718, + "learning_rate": 0.00015601426781594504, + "loss": 0.9386, + "step": 6262 + }, + { + "epoch": 1.3359641638225255, + "grad_norm": 0.6462050692294868, + "learning_rate": 0.0001560116148344418, + "loss": 0.9477, + "step": 6263 + }, + { + "epoch": 1.3361774744027304, + "grad_norm": 0.6175932425529891, + "learning_rate": 0.0001560089609928659, + "loss": 0.9536, + "step": 6264 + }, + { + "epoch": 1.3363907849829353, + "grad_norm": 0.5598380024345974, + "learning_rate": 0.00015600630629124729, + "loss": 0.9907, + "step": 6265 + }, + { + "epoch": 1.33660409556314, + "grad_norm": 0.5224568380611179, + "learning_rate": 0.00015600365072961607, + "loss": 0.9529, + "step": 6266 + }, + { + "epoch": 1.3368174061433447, + "grad_norm": 0.47920644474594287, + "learning_rate": 0.00015600099430800222, + "loss": 0.9749, + "step": 6267 + }, + { + "epoch": 1.3370307167235496, + "grad_norm": 0.3355167918464669, + "learning_rate": 0.00015599833702643589, + "loss": 0.9611, + "step": 6268 + }, + { + "epoch": 1.3372440273037542, + "grad_norm": 0.36527877476789694, + "learning_rate": 0.00015599567888494706, + "loss": 0.9515, + "step": 6269 + }, + { + "epoch": 1.3374573378839592, + "grad_norm": 0.4255998524419605, + "learning_rate": 0.00015599301988356587, + "loss": 0.9617, + "step": 6270 + }, + { + "epoch": 1.3376706484641638, + "grad_norm": 0.44640083992906426, + "learning_rate": 0.00015599036002232237, + "loss": 0.9752, + "step": 6271 + }, + { + "epoch": 1.3378839590443685, + "grad_norm": 0.48892156663784003, + "learning_rate": 0.0001559876993012467, + "loss": 0.9642, + "step": 6272 + }, + { + "epoch": 1.3380972696245734, + "grad_norm": 0.592895594105519, + "learning_rate": 0.0001559850377203689, + "loss": 0.9726, + "step": 6273 + }, + { + "epoch": 1.338310580204778, + "grad_norm": 0.6921180159764535, + "learning_rate": 0.00015598237527971912, + "loss": 0.9607, + "step": 6274 + }, + { + "epoch": 1.338523890784983, + "grad_norm": 0.69622597133928, + "learning_rate": 0.0001559797119793275, + "loss": 0.9519, + "step": 6275 + }, + { + "epoch": 1.3387372013651877, + "grad_norm": 0.6761304710337035, + "learning_rate": 0.00015597704781922415, + "loss": 0.9437, + "step": 6276 + }, + { + "epoch": 1.3389505119453924, + "grad_norm": 0.5967348660445937, + "learning_rate": 0.00015597438279943925, + "loss": 0.9402, + "step": 6277 + }, + { + "epoch": 1.3391638225255973, + "grad_norm": 0.8607591077803785, + "learning_rate": 0.00015597171692000288, + "loss": 0.9881, + "step": 6278 + }, + { + "epoch": 1.339377133105802, + "grad_norm": 0.6800301864550908, + "learning_rate": 0.0001559690501809453, + "loss": 0.9737, + "step": 6279 + }, + { + "epoch": 1.3395904436860069, + "grad_norm": 0.5738683594774353, + "learning_rate": 0.00015596638258229663, + "loss": 0.9528, + "step": 6280 + }, + { + "epoch": 1.3398037542662116, + "grad_norm": 0.39803522972295674, + "learning_rate": 0.00015596371412408705, + "loss": 0.9545, + "step": 6281 + }, + { + "epoch": 1.3400170648464163, + "grad_norm": 0.3044878376958907, + "learning_rate": 0.0001559610448063468, + "loss": 0.9497, + "step": 6282 + }, + { + "epoch": 1.3402303754266212, + "grad_norm": 6.788690201896584, + "learning_rate": 0.00015595837462910602, + "loss": 1.0135, + "step": 6283 + }, + { + "epoch": 1.340443686006826, + "grad_norm": 0.6181256429431999, + "learning_rate": 0.00015595570359239495, + "loss": 0.9499, + "step": 6284 + }, + { + "epoch": 1.3406569965870307, + "grad_norm": 1.0499102471754351, + "learning_rate": 0.00015595303169624382, + "loss": 0.972, + "step": 6285 + }, + { + "epoch": 1.3408703071672354, + "grad_norm": 1.3004609251715824, + "learning_rate": 0.00015595035894068286, + "loss": 0.9884, + "step": 6286 + }, + { + "epoch": 1.3410836177474403, + "grad_norm": 0.7082534587440057, + "learning_rate": 0.00015594768532574234, + "loss": 0.9632, + "step": 6287 + }, + { + "epoch": 1.341296928327645, + "grad_norm": 0.8202525546297217, + "learning_rate": 0.00015594501085145244, + "loss": 0.9848, + "step": 6288 + }, + { + "epoch": 1.34151023890785, + "grad_norm": 0.7480259129450406, + "learning_rate": 0.0001559423355178435, + "loss": 0.9567, + "step": 6289 + }, + { + "epoch": 1.3417235494880546, + "grad_norm": 0.9644037642570641, + "learning_rate": 0.00015593965932494575, + "loss": 0.9877, + "step": 6290 + }, + { + "epoch": 1.3419368600682593, + "grad_norm": 1.0017224926046486, + "learning_rate": 0.00015593698227278944, + "loss": 0.9488, + "step": 6291 + }, + { + "epoch": 1.3421501706484642, + "grad_norm": 0.7279523599189488, + "learning_rate": 0.00015593430436140496, + "loss": 0.9994, + "step": 6292 + }, + { + "epoch": 1.3423634812286689, + "grad_norm": 0.9754409725067792, + "learning_rate": 0.0001559316255908225, + "loss": 0.9383, + "step": 6293 + }, + { + "epoch": 1.3425767918088738, + "grad_norm": 0.9290469243246349, + "learning_rate": 0.00015592894596107244, + "loss": 0.977, + "step": 6294 + }, + { + "epoch": 1.3427901023890785, + "grad_norm": 0.5430736915312829, + "learning_rate": 0.00015592626547218506, + "loss": 0.9593, + "step": 6295 + }, + { + "epoch": 1.3430034129692832, + "grad_norm": 0.6996305439452997, + "learning_rate": 0.00015592358412419073, + "loss": 0.977, + "step": 6296 + }, + { + "epoch": 1.343216723549488, + "grad_norm": 0.6364264480297672, + "learning_rate": 0.00015592090191711975, + "loss": 0.9589, + "step": 6297 + }, + { + "epoch": 1.3434300341296928, + "grad_norm": 0.6559636545105689, + "learning_rate": 0.00015591821885100249, + "loss": 0.9467, + "step": 6298 + }, + { + "epoch": 1.3436433447098977, + "grad_norm": 0.6584764442988775, + "learning_rate": 0.00015591553492586935, + "loss": 0.9749, + "step": 6299 + }, + { + "epoch": 1.3438566552901023, + "grad_norm": 0.48205485323073055, + "learning_rate": 0.0001559128501417506, + "loss": 0.9812, + "step": 6300 + }, + { + "epoch": 1.3440699658703072, + "grad_norm": 0.3992119188856467, + "learning_rate": 0.00015591016449867671, + "loss": 0.9602, + "step": 6301 + }, + { + "epoch": 1.344283276450512, + "grad_norm": 1.5100899525064688, + "learning_rate": 0.00015590747799667802, + "loss": 0.9819, + "step": 6302 + }, + { + "epoch": 1.3444965870307168, + "grad_norm": 0.4205638165528524, + "learning_rate": 0.00015590479063578493, + "loss": 0.947, + "step": 6303 + }, + { + "epoch": 1.3447098976109215, + "grad_norm": 0.6773123927784857, + "learning_rate": 0.00015590210241602786, + "loss": 0.9674, + "step": 6304 + }, + { + "epoch": 1.3449232081911262, + "grad_norm": 0.8247398943430352, + "learning_rate": 0.00015589941333743723, + "loss": 0.9951, + "step": 6305 + }, + { + "epoch": 1.3451365187713311, + "grad_norm": 0.5294407033560021, + "learning_rate": 0.00015589672340004347, + "loss": 0.9798, + "step": 6306 + }, + { + "epoch": 1.3453498293515358, + "grad_norm": 0.3888352243620997, + "learning_rate": 0.000155894032603877, + "loss": 0.9444, + "step": 6307 + }, + { + "epoch": 1.3455631399317407, + "grad_norm": 0.5751676593868988, + "learning_rate": 0.00015589134094896827, + "loss": 0.9627, + "step": 6308 + }, + { + "epoch": 1.3457764505119454, + "grad_norm": 0.6112779368532602, + "learning_rate": 0.00015588864843534773, + "loss": 0.9948, + "step": 6309 + }, + { + "epoch": 1.34598976109215, + "grad_norm": 0.4599683506025471, + "learning_rate": 0.00015588595506304588, + "loss": 0.9579, + "step": 6310 + }, + { + "epoch": 1.346203071672355, + "grad_norm": 0.380493699252725, + "learning_rate": 0.00015588326083209315, + "loss": 0.9167, + "step": 6311 + }, + { + "epoch": 1.3464163822525597, + "grad_norm": 0.5065425125647203, + "learning_rate": 0.00015588056574252006, + "loss": 0.9685, + "step": 6312 + }, + { + "epoch": 1.3466296928327646, + "grad_norm": 0.44062489888629747, + "learning_rate": 0.0001558778697943571, + "loss": 0.9562, + "step": 6313 + }, + { + "epoch": 1.3468430034129693, + "grad_norm": 0.3601383936494582, + "learning_rate": 0.00015587517298763475, + "loss": 0.9515, + "step": 6314 + }, + { + "epoch": 1.347056313993174, + "grad_norm": 0.39764690552943316, + "learning_rate": 0.00015587247532238355, + "loss": 0.9387, + "step": 6315 + }, + { + "epoch": 1.3472696245733788, + "grad_norm": 0.4847724176047638, + "learning_rate": 0.000155869776798634, + "loss": 0.9646, + "step": 6316 + }, + { + "epoch": 1.3474829351535837, + "grad_norm": 0.42690430013967406, + "learning_rate": 0.0001558670774164167, + "loss": 0.9483, + "step": 6317 + }, + { + "epoch": 1.3476962457337884, + "grad_norm": 0.3721389016096809, + "learning_rate": 0.00015586437717576208, + "loss": 0.9906, + "step": 6318 + }, + { + "epoch": 1.3479095563139931, + "grad_norm": 0.3279676986960256, + "learning_rate": 0.00015586167607670078, + "loss": 0.9335, + "step": 6319 + }, + { + "epoch": 1.348122866894198, + "grad_norm": 0.40102829176099564, + "learning_rate": 0.00015585897411926335, + "loss": 0.9808, + "step": 6320 + }, + { + "epoch": 1.3483361774744027, + "grad_norm": 0.4721217131957471, + "learning_rate": 0.00015585627130348034, + "loss": 0.9616, + "step": 6321 + }, + { + "epoch": 1.3485494880546076, + "grad_norm": 0.37507388094387617, + "learning_rate": 0.00015585356762938237, + "loss": 0.9498, + "step": 6322 + }, + { + "epoch": 1.3487627986348123, + "grad_norm": 0.3353167177080682, + "learning_rate": 0.00015585086309699998, + "loss": 1.0003, + "step": 6323 + }, + { + "epoch": 1.348976109215017, + "grad_norm": 0.423460604023067, + "learning_rate": 0.0001558481577063638, + "loss": 0.9753, + "step": 6324 + }, + { + "epoch": 1.349189419795222, + "grad_norm": 0.44721510942880266, + "learning_rate": 0.00015584545145750443, + "loss": 0.9573, + "step": 6325 + }, + { + "epoch": 1.3494027303754266, + "grad_norm": 0.33722306452210976, + "learning_rate": 0.00015584274435045253, + "loss": 0.9826, + "step": 6326 + }, + { + "epoch": 1.3496160409556315, + "grad_norm": 0.34972251416269523, + "learning_rate": 0.00015584003638523868, + "loss": 0.9717, + "step": 6327 + }, + { + "epoch": 1.3498293515358362, + "grad_norm": 0.3477403948578407, + "learning_rate": 0.00015583732756189355, + "loss": 0.9454, + "step": 6328 + }, + { + "epoch": 1.3500426621160408, + "grad_norm": 0.27543479974647855, + "learning_rate": 0.00015583461788044777, + "loss": 0.9797, + "step": 6329 + }, + { + "epoch": 1.3502559726962458, + "grad_norm": 0.3250586789345742, + "learning_rate": 0.00015583190734093204, + "loss": 0.9835, + "step": 6330 + }, + { + "epoch": 1.3504692832764504, + "grad_norm": 0.3125045483077466, + "learning_rate": 0.00015582919594337696, + "loss": 0.9517, + "step": 6331 + }, + { + "epoch": 1.3506825938566553, + "grad_norm": 0.29464260698645756, + "learning_rate": 0.0001558264836878133, + "loss": 0.9726, + "step": 6332 + }, + { + "epoch": 1.35089590443686, + "grad_norm": 0.38545657355044705, + "learning_rate": 0.00015582377057427168, + "loss": 0.9564, + "step": 6333 + }, + { + "epoch": 1.3511092150170647, + "grad_norm": 0.34308757264281226, + "learning_rate": 0.00015582105660278282, + "loss": 0.9771, + "step": 6334 + }, + { + "epoch": 1.3513225255972696, + "grad_norm": 0.2877779614829573, + "learning_rate": 0.00015581834177337745, + "loss": 0.9739, + "step": 6335 + }, + { + "epoch": 1.3515358361774745, + "grad_norm": 0.2622898393218065, + "learning_rate": 0.00015581562608608625, + "loss": 0.9414, + "step": 6336 + }, + { + "epoch": 1.3517491467576792, + "grad_norm": 0.2813423136494447, + "learning_rate": 0.00015581290954093995, + "loss": 0.975, + "step": 6337 + }, + { + "epoch": 1.351962457337884, + "grad_norm": 0.3528113198711744, + "learning_rate": 0.00015581019213796934, + "loss": 0.957, + "step": 6338 + }, + { + "epoch": 1.3521757679180888, + "grad_norm": 0.4611857356683716, + "learning_rate": 0.00015580747387720508, + "loss": 0.9575, + "step": 6339 + }, + { + "epoch": 1.3523890784982935, + "grad_norm": 0.5791394876310029, + "learning_rate": 0.000155804754758678, + "loss": 0.9514, + "step": 6340 + }, + { + "epoch": 1.3526023890784984, + "grad_norm": 0.642095299165531, + "learning_rate": 0.00015580203478241889, + "loss": 0.9426, + "step": 6341 + }, + { + "epoch": 1.352815699658703, + "grad_norm": 0.7378060218853524, + "learning_rate": 0.00015579931394845843, + "loss": 0.9723, + "step": 6342 + }, + { + "epoch": 1.3530290102389078, + "grad_norm": 0.781139789263139, + "learning_rate": 0.00015579659225682747, + "loss": 0.9902, + "step": 6343 + }, + { + "epoch": 1.3532423208191127, + "grad_norm": 0.7893757532979306, + "learning_rate": 0.0001557938697075568, + "loss": 0.9759, + "step": 6344 + }, + { + "epoch": 1.3534556313993173, + "grad_norm": 0.7294117097958612, + "learning_rate": 0.0001557911463006772, + "loss": 0.9587, + "step": 6345 + }, + { + "epoch": 1.3536689419795223, + "grad_norm": 0.657827483757941, + "learning_rate": 0.00015578842203621952, + "loss": 0.9716, + "step": 6346 + }, + { + "epoch": 1.353882252559727, + "grad_norm": 0.5716529409466261, + "learning_rate": 0.00015578569691421457, + "loss": 0.9482, + "step": 6347 + }, + { + "epoch": 1.3540955631399316, + "grad_norm": 0.4267665425242532, + "learning_rate": 0.0001557829709346932, + "loss": 0.953, + "step": 6348 + }, + { + "epoch": 1.3543088737201365, + "grad_norm": 0.31538128740669125, + "learning_rate": 0.0001557802440976862, + "loss": 0.9705, + "step": 6349 + }, + { + "epoch": 1.3545221843003412, + "grad_norm": 0.40518870077969726, + "learning_rate": 0.0001557775164032245, + "loss": 0.9786, + "step": 6350 + }, + { + "epoch": 1.3547354948805461, + "grad_norm": 0.4478394358502304, + "learning_rate": 0.0001557747878513389, + "loss": 0.9557, + "step": 6351 + }, + { + "epoch": 1.3549488054607508, + "grad_norm": 0.4877704359958933, + "learning_rate": 0.0001557720584420603, + "loss": 0.9719, + "step": 6352 + }, + { + "epoch": 1.3551621160409557, + "grad_norm": 0.573324201587579, + "learning_rate": 0.0001557693281754196, + "loss": 0.9823, + "step": 6353 + }, + { + "epoch": 1.3553754266211604, + "grad_norm": 1.4509068236079552, + "learning_rate": 0.0001557665970514477, + "loss": 1.0653, + "step": 6354 + }, + { + "epoch": 1.3555887372013653, + "grad_norm": 0.3245053975364517, + "learning_rate": 0.00015576386507017544, + "loss": 0.9675, + "step": 6355 + }, + { + "epoch": 1.35580204778157, + "grad_norm": 1.6601309142736345, + "learning_rate": 0.0001557611322316338, + "loss": 1.0539, + "step": 6356 + }, + { + "epoch": 1.3560153583617747, + "grad_norm": 0.37356275207910683, + "learning_rate": 0.00015575839853585366, + "loss": 0.9668, + "step": 6357 + }, + { + "epoch": 1.3562286689419796, + "grad_norm": 0.7400237789787032, + "learning_rate": 0.000155755663982866, + "loss": 0.9566, + "step": 6358 + }, + { + "epoch": 1.3564419795221843, + "grad_norm": 0.8311122418167151, + "learning_rate": 0.00015575292857270166, + "loss": 0.9849, + "step": 6359 + }, + { + "epoch": 1.3566552901023892, + "grad_norm": 0.8114811508611549, + "learning_rate": 0.0001557501923053917, + "loss": 0.9545, + "step": 6360 + }, + { + "epoch": 1.3568686006825939, + "grad_norm": 0.7177513854830272, + "learning_rate": 0.00015574745518096703, + "loss": 0.9652, + "step": 6361 + }, + { + "epoch": 1.3570819112627985, + "grad_norm": 0.5596864438297489, + "learning_rate": 0.00015574471719945865, + "loss": 0.9993, + "step": 6362 + }, + { + "epoch": 1.3572952218430034, + "grad_norm": 0.5790638790077388, + "learning_rate": 0.0001557419783608975, + "loss": 0.9653, + "step": 6363 + }, + { + "epoch": 1.3575085324232081, + "grad_norm": 0.49977257895927973, + "learning_rate": 0.0001557392386653146, + "loss": 0.9782, + "step": 6364 + }, + { + "epoch": 1.357721843003413, + "grad_norm": 0.31800040825209314, + "learning_rate": 0.00015573649811274093, + "loss": 0.9654, + "step": 6365 + }, + { + "epoch": 1.3579351535836177, + "grad_norm": 0.5033379089284727, + "learning_rate": 0.00015573375670320752, + "loss": 0.9842, + "step": 6366 + }, + { + "epoch": 1.3581484641638224, + "grad_norm": 0.675220069289413, + "learning_rate": 0.0001557310144367454, + "loss": 0.9604, + "step": 6367 + }, + { + "epoch": 1.3583617747440273, + "grad_norm": 0.8172913939445786, + "learning_rate": 0.00015572827131338555, + "loss": 0.9409, + "step": 6368 + }, + { + "epoch": 1.3585750853242322, + "grad_norm": 0.7286704965172387, + "learning_rate": 0.00015572552733315902, + "loss": 0.9481, + "step": 6369 + }, + { + "epoch": 1.358788395904437, + "grad_norm": 0.6112962147655192, + "learning_rate": 0.0001557227824960969, + "loss": 0.958, + "step": 6370 + }, + { + "epoch": 1.3590017064846416, + "grad_norm": 0.49458689413977497, + "learning_rate": 0.00015572003680223022, + "loss": 0.9775, + "step": 6371 + }, + { + "epoch": 1.3592150170648465, + "grad_norm": 0.28872514944978, + "learning_rate": 0.00015571729025159006, + "loss": 0.9382, + "step": 6372 + }, + { + "epoch": 1.3594283276450512, + "grad_norm": 0.29391782970898317, + "learning_rate": 0.00015571454284420746, + "loss": 0.9808, + "step": 6373 + }, + { + "epoch": 1.359641638225256, + "grad_norm": 0.4226473977400952, + "learning_rate": 0.00015571179458011357, + "loss": 0.94, + "step": 6374 + }, + { + "epoch": 1.3598549488054608, + "grad_norm": 0.49043133731418653, + "learning_rate": 0.00015570904545933941, + "loss": 0.9549, + "step": 6375 + }, + { + "epoch": 1.3600682593856654, + "grad_norm": 0.6144954299338621, + "learning_rate": 0.00015570629548191617, + "loss": 0.9692, + "step": 6376 + }, + { + "epoch": 1.3602815699658704, + "grad_norm": 0.9503836697535717, + "learning_rate": 0.0001557035446478749, + "loss": 0.9939, + "step": 6377 + }, + { + "epoch": 1.360494880546075, + "grad_norm": 0.533388525429393, + "learning_rate": 0.00015570079295724673, + "loss": 0.9864, + "step": 6378 + }, + { + "epoch": 1.36070819112628, + "grad_norm": 0.4791431050031226, + "learning_rate": 0.00015569804041006284, + "loss": 0.9481, + "step": 6379 + }, + { + "epoch": 1.3609215017064846, + "grad_norm": 0.3648871753790706, + "learning_rate": 0.00015569528700635436, + "loss": 0.9665, + "step": 6380 + }, + { + "epoch": 1.3611348122866893, + "grad_norm": 0.33330468634234317, + "learning_rate": 0.0001556925327461524, + "loss": 0.9848, + "step": 6381 + }, + { + "epoch": 1.3613481228668942, + "grad_norm": 0.416735616084394, + "learning_rate": 0.00015568977762948818, + "loss": 0.9773, + "step": 6382 + }, + { + "epoch": 1.361561433447099, + "grad_norm": 0.44503303574379366, + "learning_rate": 0.00015568702165639285, + "loss": 0.9315, + "step": 6383 + }, + { + "epoch": 1.3617747440273038, + "grad_norm": 0.5023065658357073, + "learning_rate": 0.0001556842648268976, + "loss": 0.9368, + "step": 6384 + }, + { + "epoch": 1.3619880546075085, + "grad_norm": 0.5695056772715761, + "learning_rate": 0.0001556815071410336, + "loss": 0.9779, + "step": 6385 + }, + { + "epoch": 1.3622013651877132, + "grad_norm": 0.512370698274818, + "learning_rate": 0.00015567874859883212, + "loss": 0.9552, + "step": 6386 + }, + { + "epoch": 1.362414675767918, + "grad_norm": 0.31173259980226437, + "learning_rate": 0.00015567598920032428, + "loss": 0.9635, + "step": 6387 + }, + { + "epoch": 1.362627986348123, + "grad_norm": 0.3008080443989394, + "learning_rate": 0.00015567322894554137, + "loss": 0.9439, + "step": 6388 + }, + { + "epoch": 1.3628412969283277, + "grad_norm": 0.4005021019313716, + "learning_rate": 0.0001556704678345146, + "loss": 0.9884, + "step": 6389 + }, + { + "epoch": 1.3630546075085324, + "grad_norm": 0.416174793290578, + "learning_rate": 0.0001556677058672752, + "loss": 0.9201, + "step": 6390 + }, + { + "epoch": 1.3632679180887373, + "grad_norm": 0.5974038988630546, + "learning_rate": 0.00015566494304385445, + "loss": 0.9437, + "step": 6391 + }, + { + "epoch": 1.363481228668942, + "grad_norm": 0.8208914136438803, + "learning_rate": 0.0001556621793642836, + "loss": 0.9734, + "step": 6392 + }, + { + "epoch": 1.3636945392491469, + "grad_norm": 0.9133488040343551, + "learning_rate": 0.00015565941482859391, + "loss": 0.9472, + "step": 6393 + }, + { + "epoch": 1.3639078498293515, + "grad_norm": 0.7955966447150948, + "learning_rate": 0.00015565664943681666, + "loss": 0.9672, + "step": 6394 + }, + { + "epoch": 1.3641211604095562, + "grad_norm": 0.6135301637710466, + "learning_rate": 0.00015565388318898316, + "loss": 0.9659, + "step": 6395 + }, + { + "epoch": 1.3643344709897611, + "grad_norm": 0.48413861020821214, + "learning_rate": 0.00015565111608512473, + "loss": 0.9612, + "step": 6396 + }, + { + "epoch": 1.3645477815699658, + "grad_norm": 0.35258067621940814, + "learning_rate": 0.0001556483481252726, + "loss": 0.9515, + "step": 6397 + }, + { + "epoch": 1.3647610921501707, + "grad_norm": 0.25548644423800465, + "learning_rate": 0.00015564557930945814, + "loss": 0.9631, + "step": 6398 + }, + { + "epoch": 1.3649744027303754, + "grad_norm": 0.2953612294860877, + "learning_rate": 0.00015564280963771272, + "loss": 0.9643, + "step": 6399 + }, + { + "epoch": 1.36518771331058, + "grad_norm": 0.36701390243093496, + "learning_rate": 0.00015564003911006758, + "loss": 0.9451, + "step": 6400 + }, + { + "epoch": 1.365401023890785, + "grad_norm": 0.40995219095853763, + "learning_rate": 0.0001556372677265542, + "loss": 0.9591, + "step": 6401 + }, + { + "epoch": 1.36561433447099, + "grad_norm": 0.34595418060756783, + "learning_rate": 0.0001556344954872038, + "loss": 0.995, + "step": 6402 + }, + { + "epoch": 1.3658276450511946, + "grad_norm": 0.29784088745447834, + "learning_rate": 0.00015563172239204783, + "loss": 0.9695, + "step": 6403 + }, + { + "epoch": 1.3660409556313993, + "grad_norm": 0.304984097480694, + "learning_rate": 0.00015562894844111766, + "loss": 0.9698, + "step": 6404 + }, + { + "epoch": 1.3662542662116042, + "grad_norm": 0.31047843179948853, + "learning_rate": 0.00015562617363444465, + "loss": 0.9439, + "step": 6405 + }, + { + "epoch": 1.3664675767918089, + "grad_norm": 0.2940683062806434, + "learning_rate": 0.00015562339797206022, + "loss": 0.9816, + "step": 6406 + }, + { + "epoch": 1.3666808873720138, + "grad_norm": 0.27468706949026844, + "learning_rate": 0.00015562062145399577, + "loss": 0.9528, + "step": 6407 + }, + { + "epoch": 1.3668941979522184, + "grad_norm": 0.33517595352563406, + "learning_rate": 0.0001556178440802827, + "loss": 0.943, + "step": 6408 + }, + { + "epoch": 1.3671075085324231, + "grad_norm": 0.45175557111230696, + "learning_rate": 0.00015561506585095246, + "loss": 0.9655, + "step": 6409 + }, + { + "epoch": 1.367320819112628, + "grad_norm": 0.4569938610593706, + "learning_rate": 0.0001556122867660365, + "loss": 0.9611, + "step": 6410 + }, + { + "epoch": 1.3675341296928327, + "grad_norm": 0.6432411399055804, + "learning_rate": 0.00015560950682556624, + "loss": 0.9936, + "step": 6411 + }, + { + "epoch": 1.3677474402730376, + "grad_norm": 0.7925140125215628, + "learning_rate": 0.00015560672602957313, + "loss": 0.9581, + "step": 6412 + }, + { + "epoch": 1.3679607508532423, + "grad_norm": 0.689607745511175, + "learning_rate": 0.00015560394437808862, + "loss": 0.9289, + "step": 6413 + }, + { + "epoch": 1.368174061433447, + "grad_norm": 0.48502422361678305, + "learning_rate": 0.00015560116187114423, + "loss": 0.972, + "step": 6414 + }, + { + "epoch": 1.368387372013652, + "grad_norm": 0.34935867403820453, + "learning_rate": 0.00015559837850877143, + "loss": 1.0118, + "step": 6415 + }, + { + "epoch": 1.3686006825938566, + "grad_norm": 0.3459711525935038, + "learning_rate": 0.0001555955942910017, + "loss": 0.9634, + "step": 6416 + }, + { + "epoch": 1.3688139931740615, + "grad_norm": 0.3607338168207903, + "learning_rate": 0.00015559280921786654, + "loss": 0.941, + "step": 6417 + }, + { + "epoch": 1.3690273037542662, + "grad_norm": 0.3603809067996682, + "learning_rate": 0.00015559002328939748, + "loss": 0.9376, + "step": 6418 + }, + { + "epoch": 1.3692406143344709, + "grad_norm": 0.40300174255731946, + "learning_rate": 0.00015558723650562604, + "loss": 0.949, + "step": 6419 + }, + { + "epoch": 1.3694539249146758, + "grad_norm": 0.3627812233588504, + "learning_rate": 0.00015558444886658372, + "loss": 0.9439, + "step": 6420 + }, + { + "epoch": 1.3696672354948807, + "grad_norm": 0.3325814258891945, + "learning_rate": 0.00015558166037230213, + "loss": 0.9461, + "step": 6421 + }, + { + "epoch": 1.3698805460750854, + "grad_norm": 0.3614335305605956, + "learning_rate": 0.00015557887102281276, + "loss": 0.9818, + "step": 6422 + }, + { + "epoch": 1.37009385665529, + "grad_norm": 0.336921561819439, + "learning_rate": 0.0001555760808181472, + "loss": 0.9288, + "step": 6423 + }, + { + "epoch": 1.370307167235495, + "grad_norm": 0.3467706226356366, + "learning_rate": 0.000155573289758337, + "loss": 0.9389, + "step": 6424 + }, + { + "epoch": 1.3705204778156996, + "grad_norm": 0.4534464899127375, + "learning_rate": 0.00015557049784341374, + "loss": 0.9685, + "step": 6425 + }, + { + "epoch": 1.3707337883959045, + "grad_norm": 0.5061611692679836, + "learning_rate": 0.00015556770507340906, + "loss": 0.9721, + "step": 6426 + }, + { + "epoch": 1.3709470989761092, + "grad_norm": 0.5270304579707018, + "learning_rate": 0.00015556491144835452, + "loss": 0.9763, + "step": 6427 + }, + { + "epoch": 1.371160409556314, + "grad_norm": 0.5779549920999079, + "learning_rate": 0.00015556211696828173, + "loss": 0.9553, + "step": 6428 + }, + { + "epoch": 1.3713737201365188, + "grad_norm": 0.620824240938492, + "learning_rate": 0.00015555932163322233, + "loss": 0.9687, + "step": 6429 + }, + { + "epoch": 1.3715870307167235, + "grad_norm": 0.5789275191535391, + "learning_rate": 0.0001555565254432079, + "loss": 0.9268, + "step": 6430 + }, + { + "epoch": 1.3718003412969284, + "grad_norm": 0.4098678731747999, + "learning_rate": 0.00015555372839827015, + "loss": 0.9707, + "step": 6431 + }, + { + "epoch": 1.372013651877133, + "grad_norm": 0.3232549091941681, + "learning_rate": 0.00015555093049844068, + "loss": 0.9796, + "step": 6432 + }, + { + "epoch": 1.3722269624573378, + "grad_norm": 0.4469209299801284, + "learning_rate": 0.00015554813174375117, + "loss": 0.9798, + "step": 6433 + }, + { + "epoch": 1.3724402730375427, + "grad_norm": 0.5617223830652331, + "learning_rate": 0.00015554533213423325, + "loss": 0.9591, + "step": 6434 + }, + { + "epoch": 1.3726535836177474, + "grad_norm": 0.4902572278948841, + "learning_rate": 0.00015554253166991864, + "loss": 0.9538, + "step": 6435 + }, + { + "epoch": 1.3728668941979523, + "grad_norm": 0.36061396225857406, + "learning_rate": 0.00015553973035083903, + "loss": 0.9927, + "step": 6436 + }, + { + "epoch": 1.373080204778157, + "grad_norm": 0.40807709598125447, + "learning_rate": 0.00015553692817702608, + "loss": 0.9637, + "step": 6437 + }, + { + "epoch": 1.3732935153583616, + "grad_norm": 0.4011852238105082, + "learning_rate": 0.00015553412514851151, + "loss": 0.9806, + "step": 6438 + }, + { + "epoch": 1.3735068259385665, + "grad_norm": 0.34740967087656704, + "learning_rate": 0.00015553132126532707, + "loss": 0.9503, + "step": 6439 + }, + { + "epoch": 1.3737201365187715, + "grad_norm": 0.4645148807743324, + "learning_rate": 0.00015552851652750446, + "loss": 0.9797, + "step": 6440 + }, + { + "epoch": 1.3739334470989761, + "grad_norm": 0.5710973898104378, + "learning_rate": 0.0001555257109350754, + "loss": 0.9389, + "step": 6441 + }, + { + "epoch": 1.3741467576791808, + "grad_norm": 0.5865704435041014, + "learning_rate": 0.00015552290448807163, + "loss": 0.944, + "step": 6442 + }, + { + "epoch": 1.3743600682593857, + "grad_norm": 0.5192078608888324, + "learning_rate": 0.00015552009718652497, + "loss": 0.9737, + "step": 6443 + }, + { + "epoch": 1.3745733788395904, + "grad_norm": 0.4559812170875783, + "learning_rate": 0.0001555172890304671, + "loss": 0.9846, + "step": 6444 + }, + { + "epoch": 1.3747866894197953, + "grad_norm": 0.4034343318663824, + "learning_rate": 0.00015551448001992985, + "loss": 0.9184, + "step": 6445 + }, + { + "epoch": 1.375, + "grad_norm": 0.3699782582796682, + "learning_rate": 0.00015551167015494501, + "loss": 0.929, + "step": 6446 + }, + { + "epoch": 1.3752133105802047, + "grad_norm": 0.356940266217629, + "learning_rate": 0.00015550885943554433, + "loss": 0.9684, + "step": 6447 + }, + { + "epoch": 1.3754266211604096, + "grad_norm": 0.3564431394297663, + "learning_rate": 0.00015550604786175964, + "loss": 0.9538, + "step": 6448 + }, + { + "epoch": 1.3756399317406143, + "grad_norm": 0.43415620327388715, + "learning_rate": 0.00015550323543362274, + "loss": 0.9716, + "step": 6449 + }, + { + "epoch": 1.3758532423208192, + "grad_norm": 0.48016490712018506, + "learning_rate": 0.00015550042215116546, + "loss": 0.9783, + "step": 6450 + }, + { + "epoch": 1.3760665529010239, + "grad_norm": 0.450636580876992, + "learning_rate": 0.00015549760801441965, + "loss": 0.9544, + "step": 6451 + }, + { + "epoch": 1.3762798634812285, + "grad_norm": 0.4151575339243447, + "learning_rate": 0.00015549479302341714, + "loss": 0.9407, + "step": 6452 + }, + { + "epoch": 1.3764931740614335, + "grad_norm": 0.37465934186433897, + "learning_rate": 0.00015549197717818974, + "loss": 0.9207, + "step": 6453 + }, + { + "epoch": 1.3767064846416384, + "grad_norm": 0.35920017138248617, + "learning_rate": 0.00015548916047876938, + "loss": 0.9678, + "step": 6454 + }, + { + "epoch": 1.376919795221843, + "grad_norm": 0.3880851233005002, + "learning_rate": 0.0001554863429251879, + "loss": 0.9861, + "step": 6455 + }, + { + "epoch": 1.3771331058020477, + "grad_norm": 0.31938192634666873, + "learning_rate": 0.00015548352451747718, + "loss": 0.9691, + "step": 6456 + }, + { + "epoch": 1.3773464163822526, + "grad_norm": 0.44380277404729374, + "learning_rate": 0.0001554807052556691, + "loss": 0.9615, + "step": 6457 + }, + { + "epoch": 1.3775597269624573, + "grad_norm": 0.5974744576851794, + "learning_rate": 0.00015547788513979555, + "loss": 0.9396, + "step": 6458 + }, + { + "epoch": 1.3777730375426622, + "grad_norm": 0.6641223570379287, + "learning_rate": 0.0001554750641698885, + "loss": 0.9353, + "step": 6459 + }, + { + "epoch": 1.377986348122867, + "grad_norm": 0.620131167090144, + "learning_rate": 0.00015547224234597982, + "loss": 0.9201, + "step": 6460 + }, + { + "epoch": 1.3781996587030716, + "grad_norm": 0.6018488997689747, + "learning_rate": 0.00015546941966810142, + "loss": 0.958, + "step": 6461 + }, + { + "epoch": 1.3784129692832765, + "grad_norm": 0.4813044851577809, + "learning_rate": 0.0001554665961362853, + "loss": 0.9435, + "step": 6462 + }, + { + "epoch": 1.3786262798634812, + "grad_norm": 0.3935674524364242, + "learning_rate": 0.00015546377175056336, + "loss": 0.9384, + "step": 6463 + }, + { + "epoch": 1.378839590443686, + "grad_norm": 0.3509338991486953, + "learning_rate": 0.00015546094651096756, + "loss": 0.9443, + "step": 6464 + }, + { + "epoch": 1.3790529010238908, + "grad_norm": 0.3690664609292482, + "learning_rate": 0.00015545812041752993, + "loss": 0.9734, + "step": 6465 + }, + { + "epoch": 1.3792662116040955, + "grad_norm": 0.36815626118616573, + "learning_rate": 0.00015545529347028234, + "loss": 0.9529, + "step": 6466 + }, + { + "epoch": 1.3794795221843004, + "grad_norm": 0.3605096542338266, + "learning_rate": 0.00015545246566925686, + "loss": 0.9644, + "step": 6467 + }, + { + "epoch": 1.379692832764505, + "grad_norm": 0.3944764613941518, + "learning_rate": 0.00015544963701448548, + "loss": 0.982, + "step": 6468 + }, + { + "epoch": 1.37990614334471, + "grad_norm": 0.4343154330279379, + "learning_rate": 0.00015544680750600017, + "loss": 0.9918, + "step": 6469 + }, + { + "epoch": 1.3801194539249146, + "grad_norm": 0.40366978236144196, + "learning_rate": 0.00015544397714383295, + "loss": 1.0131, + "step": 6470 + }, + { + "epoch": 1.3803327645051193, + "grad_norm": 0.43668252001564384, + "learning_rate": 0.00015544114592801588, + "loss": 0.9703, + "step": 6471 + }, + { + "epoch": 1.3805460750853242, + "grad_norm": 0.47107397849721055, + "learning_rate": 0.00015543831385858096, + "loss": 0.948, + "step": 6472 + }, + { + "epoch": 1.3807593856655291, + "grad_norm": 0.3122584638899191, + "learning_rate": 0.00015543548093556028, + "loss": 0.933, + "step": 6473 + }, + { + "epoch": 1.3809726962457338, + "grad_norm": 0.36855782341297366, + "learning_rate": 0.00015543264715898585, + "loss": 0.9657, + "step": 6474 + }, + { + "epoch": 1.3811860068259385, + "grad_norm": 0.4132388591466027, + "learning_rate": 0.00015542981252888972, + "loss": 0.9614, + "step": 6475 + }, + { + "epoch": 1.3813993174061434, + "grad_norm": 0.35213092446238226, + "learning_rate": 0.00015542697704530402, + "loss": 0.9031, + "step": 6476 + }, + { + "epoch": 1.381612627986348, + "grad_norm": 0.35211820736766175, + "learning_rate": 0.0001554241407082608, + "loss": 0.966, + "step": 6477 + }, + { + "epoch": 1.381825938566553, + "grad_norm": 0.3185320450084787, + "learning_rate": 0.00015542130351779216, + "loss": 0.9524, + "step": 6478 + }, + { + "epoch": 1.3820392491467577, + "grad_norm": 0.3069275542784571, + "learning_rate": 0.0001554184654739302, + "loss": 0.9694, + "step": 6479 + }, + { + "epoch": 1.3822525597269624, + "grad_norm": 0.28785866967992363, + "learning_rate": 0.00015541562657670705, + "loss": 0.9764, + "step": 6480 + }, + { + "epoch": 1.3824658703071673, + "grad_norm": 0.26315959527131155, + "learning_rate": 0.00015541278682615478, + "loss": 0.9756, + "step": 6481 + }, + { + "epoch": 1.382679180887372, + "grad_norm": 0.28675600010194213, + "learning_rate": 0.0001554099462223056, + "loss": 0.9775, + "step": 6482 + }, + { + "epoch": 1.3828924914675769, + "grad_norm": 0.3131051831989581, + "learning_rate": 0.0001554071047651916, + "loss": 0.9648, + "step": 6483 + }, + { + "epoch": 1.3831058020477816, + "grad_norm": 0.2816288050896034, + "learning_rate": 0.0001554042624548449, + "loss": 0.9765, + "step": 6484 + }, + { + "epoch": 1.3833191126279862, + "grad_norm": 0.416695105752652, + "learning_rate": 0.00015540141929129775, + "loss": 1.0012, + "step": 6485 + }, + { + "epoch": 1.3835324232081911, + "grad_norm": 0.42198660651768355, + "learning_rate": 0.00015539857527458226, + "loss": 0.9352, + "step": 6486 + }, + { + "epoch": 1.3837457337883958, + "grad_norm": 0.5313293696258093, + "learning_rate": 0.00015539573040473063, + "loss": 0.969, + "step": 6487 + }, + { + "epoch": 1.3839590443686007, + "grad_norm": 0.5783835971842405, + "learning_rate": 0.00015539288468177501, + "loss": 0.9615, + "step": 6488 + }, + { + "epoch": 1.3841723549488054, + "grad_norm": 0.5998021771739698, + "learning_rate": 0.00015539003810574766, + "loss": 0.9884, + "step": 6489 + }, + { + "epoch": 1.38438566552901, + "grad_norm": 0.5884671172059496, + "learning_rate": 0.00015538719067668077, + "loss": 0.9422, + "step": 6490 + }, + { + "epoch": 1.384598976109215, + "grad_norm": 0.6206066571950148, + "learning_rate": 0.00015538434239460654, + "loss": 0.9717, + "step": 6491 + }, + { + "epoch": 1.38481228668942, + "grad_norm": 0.6382974850527716, + "learning_rate": 0.0001553814932595572, + "loss": 0.9961, + "step": 6492 + }, + { + "epoch": 1.3850255972696246, + "grad_norm": 0.6028688871585397, + "learning_rate": 0.000155378643271565, + "loss": 0.9468, + "step": 6493 + }, + { + "epoch": 1.3852389078498293, + "grad_norm": 0.5228541314288853, + "learning_rate": 0.00015537579243066215, + "loss": 0.9697, + "step": 6494 + }, + { + "epoch": 1.3854522184300342, + "grad_norm": 0.3602055754252019, + "learning_rate": 0.000155372940736881, + "loss": 0.9841, + "step": 6495 + }, + { + "epoch": 1.3856655290102389, + "grad_norm": 0.24014157331016592, + "learning_rate": 0.0001553700881902537, + "loss": 0.926, + "step": 6496 + }, + { + "epoch": 1.3858788395904438, + "grad_norm": 0.3030513434955523, + "learning_rate": 0.00015536723479081264, + "loss": 0.963, + "step": 6497 + }, + { + "epoch": 1.3860921501706485, + "grad_norm": 0.39644988147371996, + "learning_rate": 0.00015536438053859002, + "loss": 0.9642, + "step": 6498 + }, + { + "epoch": 1.3863054607508531, + "grad_norm": 0.45971487315439585, + "learning_rate": 0.00015536152543361816, + "loss": 0.9639, + "step": 6499 + }, + { + "epoch": 1.386518771331058, + "grad_norm": 0.4473161320502103, + "learning_rate": 0.0001553586694759294, + "loss": 0.9764, + "step": 6500 + }, + { + "epoch": 1.3867320819112627, + "grad_norm": 0.4132641237310153, + "learning_rate": 0.000155355812665556, + "loss": 0.9427, + "step": 6501 + }, + { + "epoch": 1.3869453924914676, + "grad_norm": 0.4490196797451401, + "learning_rate": 0.00015535295500253033, + "loss": 0.9667, + "step": 6502 + }, + { + "epoch": 1.3871587030716723, + "grad_norm": 0.48412646914096313, + "learning_rate": 0.0001553500964868847, + "loss": 0.9695, + "step": 6503 + }, + { + "epoch": 1.387372013651877, + "grad_norm": 0.46051584663874867, + "learning_rate": 0.00015534723711865144, + "loss": 0.9803, + "step": 6504 + }, + { + "epoch": 1.387585324232082, + "grad_norm": 0.39155819134031966, + "learning_rate": 0.00015534437689786296, + "loss": 1.0023, + "step": 6505 + }, + { + "epoch": 1.3877986348122868, + "grad_norm": 0.2721152413602337, + "learning_rate": 0.00015534151582455156, + "loss": 0.923, + "step": 6506 + }, + { + "epoch": 1.3880119453924915, + "grad_norm": 0.2852610294548479, + "learning_rate": 0.00015533865389874966, + "loss": 0.9774, + "step": 6507 + }, + { + "epoch": 1.3882252559726962, + "grad_norm": 0.37980747650367386, + "learning_rate": 0.00015533579112048964, + "loss": 0.9535, + "step": 6508 + }, + { + "epoch": 1.388438566552901, + "grad_norm": 0.4603198728006296, + "learning_rate": 0.00015533292748980384, + "loss": 0.9336, + "step": 6509 + }, + { + "epoch": 1.3886518771331058, + "grad_norm": 0.5067902368851687, + "learning_rate": 0.00015533006300672472, + "loss": 0.9389, + "step": 6510 + }, + { + "epoch": 1.3888651877133107, + "grad_norm": 0.48223880485594545, + "learning_rate": 0.00015532719767128467, + "loss": 0.9656, + "step": 6511 + }, + { + "epoch": 1.3890784982935154, + "grad_norm": 0.4389350654591348, + "learning_rate": 0.0001553243314835161, + "loss": 0.9713, + "step": 6512 + }, + { + "epoch": 1.38929180887372, + "grad_norm": 0.3585150429298366, + "learning_rate": 0.00015532146444345146, + "loss": 0.9662, + "step": 6513 + }, + { + "epoch": 1.389505119453925, + "grad_norm": 0.26387919923731346, + "learning_rate": 0.0001553185965511232, + "loss": 0.9895, + "step": 6514 + }, + { + "epoch": 1.3897184300341296, + "grad_norm": 0.26222868108649416, + "learning_rate": 0.0001553157278065637, + "loss": 0.9537, + "step": 6515 + }, + { + "epoch": 1.3899317406143346, + "grad_norm": 0.3462053195846716, + "learning_rate": 0.0001553128582098055, + "loss": 0.9344, + "step": 6516 + }, + { + "epoch": 1.3901450511945392, + "grad_norm": 0.42484301112175477, + "learning_rate": 0.00015530998776088105, + "loss": 0.9573, + "step": 6517 + }, + { + "epoch": 1.390358361774744, + "grad_norm": 0.5170151604538646, + "learning_rate": 0.00015530711645982283, + "loss": 0.9782, + "step": 6518 + }, + { + "epoch": 1.3905716723549488, + "grad_norm": 0.4808188813690307, + "learning_rate": 0.0001553042443066633, + "loss": 0.9362, + "step": 6519 + }, + { + "epoch": 1.3907849829351535, + "grad_norm": 0.381035091494901, + "learning_rate": 0.000155301371301435, + "loss": 0.9713, + "step": 6520 + }, + { + "epoch": 1.3909982935153584, + "grad_norm": 0.3424947805776915, + "learning_rate": 0.0001552984974441704, + "loss": 0.9699, + "step": 6521 + }, + { + "epoch": 1.391211604095563, + "grad_norm": 0.36751163928548813, + "learning_rate": 0.00015529562273490204, + "loss": 0.9631, + "step": 6522 + }, + { + "epoch": 1.3914249146757678, + "grad_norm": 0.39724681159058683, + "learning_rate": 0.00015529274717366245, + "loss": 0.9541, + "step": 6523 + }, + { + "epoch": 1.3916382252559727, + "grad_norm": 0.3788998183971069, + "learning_rate": 0.00015528987076048417, + "loss": 0.9547, + "step": 6524 + }, + { + "epoch": 1.3918515358361776, + "grad_norm": 0.35590724980641697, + "learning_rate": 0.00015528699349539972, + "loss": 0.9709, + "step": 6525 + }, + { + "epoch": 1.3920648464163823, + "grad_norm": 0.4254957134315343, + "learning_rate": 0.00015528411537844166, + "loss": 0.9532, + "step": 6526 + }, + { + "epoch": 1.392278156996587, + "grad_norm": 0.4525596708850749, + "learning_rate": 0.00015528123640964257, + "loss": 0.9673, + "step": 6527 + }, + { + "epoch": 1.3924914675767919, + "grad_norm": 0.42565952323144646, + "learning_rate": 0.00015527835658903503, + "loss": 0.9578, + "step": 6528 + }, + { + "epoch": 1.3927047781569966, + "grad_norm": 0.3258973549618417, + "learning_rate": 0.00015527547591665162, + "loss": 0.9626, + "step": 6529 + }, + { + "epoch": 1.3929180887372015, + "grad_norm": 0.2703605012722583, + "learning_rate": 0.00015527259439252495, + "loss": 0.9405, + "step": 6530 + }, + { + "epoch": 1.3931313993174061, + "grad_norm": 0.3755369884153899, + "learning_rate": 0.0001552697120166876, + "loss": 0.9686, + "step": 6531 + }, + { + "epoch": 1.3933447098976108, + "grad_norm": 0.3841959291587727, + "learning_rate": 0.00015526682878917218, + "loss": 0.9074, + "step": 6532 + }, + { + "epoch": 1.3935580204778157, + "grad_norm": 0.25385289489222324, + "learning_rate": 0.00015526394471001133, + "loss": 0.9722, + "step": 6533 + }, + { + "epoch": 1.3937713310580204, + "grad_norm": 0.3037996652720571, + "learning_rate": 0.00015526105977923767, + "loss": 0.9648, + "step": 6534 + }, + { + "epoch": 1.3939846416382253, + "grad_norm": 0.31474160844282023, + "learning_rate": 0.00015525817399688387, + "loss": 0.9865, + "step": 6535 + }, + { + "epoch": 1.39419795221843, + "grad_norm": 0.28793679425382107, + "learning_rate": 0.00015525528736298256, + "loss": 0.9717, + "step": 6536 + }, + { + "epoch": 1.3944112627986347, + "grad_norm": 0.32436852055355425, + "learning_rate": 0.00015525239987756643, + "loss": 0.978, + "step": 6537 + }, + { + "epoch": 1.3946245733788396, + "grad_norm": 0.3741256821546366, + "learning_rate": 0.0001552495115406681, + "loss": 0.9514, + "step": 6538 + }, + { + "epoch": 1.3948378839590443, + "grad_norm": 0.46740802093970923, + "learning_rate": 0.00015524662235232028, + "loss": 0.987, + "step": 6539 + }, + { + "epoch": 1.3950511945392492, + "grad_norm": 0.6217779284100379, + "learning_rate": 0.00015524373231255568, + "loss": 0.9399, + "step": 6540 + }, + { + "epoch": 1.3952645051194539, + "grad_norm": 0.7048496176188187, + "learning_rate": 0.00015524084142140698, + "loss": 0.9561, + "step": 6541 + }, + { + "epoch": 1.3954778156996588, + "grad_norm": 0.6982944048379359, + "learning_rate": 0.00015523794967890688, + "loss": 0.9455, + "step": 6542 + }, + { + "epoch": 1.3956911262798635, + "grad_norm": 0.7393045693595587, + "learning_rate": 0.0001552350570850881, + "loss": 0.9807, + "step": 6543 + }, + { + "epoch": 1.3959044368600684, + "grad_norm": 0.7375495344940719, + "learning_rate": 0.00015523216363998345, + "loss": 0.9581, + "step": 6544 + }, + { + "epoch": 1.396117747440273, + "grad_norm": 0.6597326277233392, + "learning_rate": 0.00015522926934362553, + "loss": 0.9533, + "step": 6545 + }, + { + "epoch": 1.3963310580204777, + "grad_norm": 0.6060802913181769, + "learning_rate": 0.0001552263741960472, + "loss": 0.9589, + "step": 6546 + }, + { + "epoch": 1.3965443686006827, + "grad_norm": 0.5084191288614385, + "learning_rate": 0.00015522347819728119, + "loss": 0.9592, + "step": 6547 + }, + { + "epoch": 1.3967576791808873, + "grad_norm": 1.6105911836609694, + "learning_rate": 0.00015522058134736022, + "loss": 0.9955, + "step": 6548 + }, + { + "epoch": 1.3969709897610922, + "grad_norm": 0.38519962282197995, + "learning_rate": 0.00015521768364631713, + "loss": 0.9745, + "step": 6549 + }, + { + "epoch": 1.397184300341297, + "grad_norm": 0.8349952645028484, + "learning_rate": 0.00015521478509418469, + "loss": 0.9532, + "step": 6550 + }, + { + "epoch": 1.3973976109215016, + "grad_norm": 1.2009745622868115, + "learning_rate": 0.00015521188569099568, + "loss": 0.9732, + "step": 6551 + }, + { + "epoch": 1.3976109215017065, + "grad_norm": 0.6188348945410327, + "learning_rate": 0.00015520898543678292, + "loss": 0.9406, + "step": 6552 + }, + { + "epoch": 1.3978242320819112, + "grad_norm": 0.36987384202939705, + "learning_rate": 0.00015520608433157923, + "loss": 0.9739, + "step": 6553 + }, + { + "epoch": 1.398037542662116, + "grad_norm": 0.6371492010669366, + "learning_rate": 0.00015520318237541742, + "loss": 0.9343, + "step": 6554 + }, + { + "epoch": 1.3982508532423208, + "grad_norm": 0.8798568797434525, + "learning_rate": 0.00015520027956833035, + "loss": 0.9721, + "step": 6555 + }, + { + "epoch": 1.3984641638225255, + "grad_norm": 1.0188290778555114, + "learning_rate": 0.00015519737591035085, + "loss": 0.9229, + "step": 6556 + }, + { + "epoch": 1.3986774744027304, + "grad_norm": 0.8934092686675302, + "learning_rate": 0.00015519447140151176, + "loss": 0.9454, + "step": 6557 + }, + { + "epoch": 1.3988907849829353, + "grad_norm": 0.7376499359804908, + "learning_rate": 0.00015519156604184596, + "loss": 0.9534, + "step": 6558 + }, + { + "epoch": 1.39910409556314, + "grad_norm": 0.5960388459827406, + "learning_rate": 0.00015518865983138637, + "loss": 0.962, + "step": 6559 + }, + { + "epoch": 1.3993174061433447, + "grad_norm": 0.5164064397062158, + "learning_rate": 0.0001551857527701658, + "loss": 0.9465, + "step": 6560 + }, + { + "epoch": 1.3995307167235496, + "grad_norm": 0.4858381169387845, + "learning_rate": 0.00015518284485821715, + "loss": 0.928, + "step": 6561 + }, + { + "epoch": 1.3997440273037542, + "grad_norm": 0.6787316590501268, + "learning_rate": 0.00015517993609557337, + "loss": 1.0275, + "step": 6562 + }, + { + "epoch": 1.3999573378839592, + "grad_norm": 0.8773073191465843, + "learning_rate": 0.00015517702648226736, + "loss": 0.9545, + "step": 6563 + }, + { + "epoch": 1.4001706484641638, + "grad_norm": 0.8851570559400986, + "learning_rate": 0.00015517411601833201, + "loss": 0.9586, + "step": 6564 + }, + { + "epoch": 1.4003839590443685, + "grad_norm": 0.7849432915581612, + "learning_rate": 0.00015517120470380028, + "loss": 0.9507, + "step": 6565 + }, + { + "epoch": 1.4005972696245734, + "grad_norm": 0.5994930968949008, + "learning_rate": 0.00015516829253870508, + "loss": 0.9651, + "step": 6566 + }, + { + "epoch": 1.400810580204778, + "grad_norm": 0.46397084163133445, + "learning_rate": 0.00015516537952307944, + "loss": 0.9301, + "step": 6567 + }, + { + "epoch": 1.401023890784983, + "grad_norm": 0.42916767035314596, + "learning_rate": 0.00015516246565695622, + "loss": 0.9706, + "step": 6568 + }, + { + "epoch": 1.4012372013651877, + "grad_norm": 0.48898992935909547, + "learning_rate": 0.00015515955094036845, + "loss": 0.931, + "step": 6569 + }, + { + "epoch": 1.4014505119453924, + "grad_norm": 0.4341892657661568, + "learning_rate": 0.0001551566353733491, + "loss": 0.9328, + "step": 6570 + }, + { + "epoch": 1.4016638225255973, + "grad_norm": 0.4644801451863394, + "learning_rate": 0.00015515371895593116, + "loss": 0.9689, + "step": 6571 + }, + { + "epoch": 1.401877133105802, + "grad_norm": 0.5483028963589545, + "learning_rate": 0.00015515080168814765, + "loss": 0.9091, + "step": 6572 + }, + { + "epoch": 1.4020904436860069, + "grad_norm": 0.48668245336006166, + "learning_rate": 0.0001551478835700315, + "loss": 0.9583, + "step": 6573 + }, + { + "epoch": 1.4023037542662116, + "grad_norm": 0.4795932388415624, + "learning_rate": 0.0001551449646016158, + "loss": 0.9766, + "step": 6574 + }, + { + "epoch": 1.4025170648464163, + "grad_norm": 0.43952956056080017, + "learning_rate": 0.0001551420447829336, + "loss": 0.938, + "step": 6575 + }, + { + "epoch": 1.4027303754266212, + "grad_norm": 0.3976896428722188, + "learning_rate": 0.00015513912411401786, + "loss": 0.9658, + "step": 6576 + }, + { + "epoch": 1.402943686006826, + "grad_norm": 0.42247818120595093, + "learning_rate": 0.00015513620259490168, + "loss": 0.9426, + "step": 6577 + }, + { + "epoch": 1.4031569965870307, + "grad_norm": 0.4711309775665964, + "learning_rate": 0.00015513328022561808, + "loss": 0.938, + "step": 6578 + }, + { + "epoch": 1.4033703071672354, + "grad_norm": 0.3809009326824482, + "learning_rate": 0.0001551303570062002, + "loss": 0.9555, + "step": 6579 + }, + { + "epoch": 1.4035836177474403, + "grad_norm": 0.3559867622548515, + "learning_rate": 0.00015512743293668102, + "loss": 0.9421, + "step": 6580 + }, + { + "epoch": 1.403796928327645, + "grad_norm": 0.3998846311413065, + "learning_rate": 0.0001551245080170937, + "loss": 0.9568, + "step": 6581 + }, + { + "epoch": 1.40401023890785, + "grad_norm": 0.3573590041204341, + "learning_rate": 0.0001551215822474713, + "loss": 0.9663, + "step": 6582 + }, + { + "epoch": 1.4042235494880546, + "grad_norm": 0.31125752110373034, + "learning_rate": 0.00015511865562784692, + "loss": 0.9467, + "step": 6583 + }, + { + "epoch": 1.4044368600682593, + "grad_norm": 0.3861278629957349, + "learning_rate": 0.0001551157281582537, + "loss": 0.9608, + "step": 6584 + }, + { + "epoch": 1.4046501706484642, + "grad_norm": 0.39751469509750326, + "learning_rate": 0.00015511279983872477, + "loss": 0.9456, + "step": 6585 + }, + { + "epoch": 1.4048634812286689, + "grad_norm": 0.388133667046397, + "learning_rate": 0.0001551098706692932, + "loss": 0.9698, + "step": 6586 + }, + { + "epoch": 1.4050767918088738, + "grad_norm": 0.30547308373248294, + "learning_rate": 0.0001551069406499922, + "loss": 0.9362, + "step": 6587 + }, + { + "epoch": 1.4052901023890785, + "grad_norm": 0.3169760880391873, + "learning_rate": 0.00015510400978085493, + "loss": 0.9691, + "step": 6588 + }, + { + "epoch": 1.4055034129692832, + "grad_norm": 0.38048792664730047, + "learning_rate": 0.00015510107806191454, + "loss": 0.9401, + "step": 6589 + }, + { + "epoch": 1.405716723549488, + "grad_norm": 0.39087597497243104, + "learning_rate": 0.00015509814549320415, + "loss": 0.9512, + "step": 6590 + }, + { + "epoch": 1.4059300341296928, + "grad_norm": 0.4012362120500607, + "learning_rate": 0.00015509521207475698, + "loss": 0.9631, + "step": 6591 + }, + { + "epoch": 1.4061433447098977, + "grad_norm": 0.31921062232012304, + "learning_rate": 0.00015509227780660624, + "loss": 0.9729, + "step": 6592 + }, + { + "epoch": 1.4063566552901023, + "grad_norm": 0.2613164441916381, + "learning_rate": 0.0001550893426887851, + "loss": 0.9383, + "step": 6593 + }, + { + "epoch": 1.4065699658703072, + "grad_norm": 0.3265287673298241, + "learning_rate": 0.00015508640672132678, + "loss": 0.9611, + "step": 6594 + }, + { + "epoch": 1.406783276450512, + "grad_norm": 0.3325200904882048, + "learning_rate": 0.0001550834699042645, + "loss": 0.9655, + "step": 6595 + }, + { + "epoch": 1.4069965870307168, + "grad_norm": 0.35652960165932024, + "learning_rate": 0.00015508053223763153, + "loss": 0.9456, + "step": 6596 + }, + { + "epoch": 1.4072098976109215, + "grad_norm": 0.3569510442839101, + "learning_rate": 0.00015507759372146107, + "loss": 0.9409, + "step": 6597 + }, + { + "epoch": 1.4074232081911262, + "grad_norm": 0.4183604234806881, + "learning_rate": 0.00015507465435578633, + "loss": 0.9362, + "step": 6598 + }, + { + "epoch": 1.4076365187713311, + "grad_norm": 0.47852457138968585, + "learning_rate": 0.00015507171414064065, + "loss": 0.9742, + "step": 6599 + }, + { + "epoch": 1.4078498293515358, + "grad_norm": 0.49574966877458826, + "learning_rate": 0.00015506877307605726, + "loss": 0.9456, + "step": 6600 + }, + { + "epoch": 1.4080631399317407, + "grad_norm": 0.4980643416319912, + "learning_rate": 0.00015506583116206942, + "loss": 0.9608, + "step": 6601 + }, + { + "epoch": 1.4082764505119454, + "grad_norm": 0.46403597520166123, + "learning_rate": 0.00015506288839871045, + "loss": 0.9472, + "step": 6602 + }, + { + "epoch": 1.40848976109215, + "grad_norm": 0.3464742279737656, + "learning_rate": 0.00015505994478601365, + "loss": 0.9304, + "step": 6603 + }, + { + "epoch": 1.408703071672355, + "grad_norm": 0.3664342350633346, + "learning_rate": 0.00015505700032401228, + "loss": 0.9867, + "step": 6604 + }, + { + "epoch": 1.4089163822525597, + "grad_norm": 0.4365355671078843, + "learning_rate": 0.0001550540550127397, + "loss": 0.9394, + "step": 6605 + }, + { + "epoch": 1.4091296928327646, + "grad_norm": 0.47595100744716773, + "learning_rate": 0.00015505110885222925, + "loss": 0.9567, + "step": 6606 + }, + { + "epoch": 1.4093430034129693, + "grad_norm": 0.35018550025188155, + "learning_rate": 0.0001550481618425142, + "loss": 0.9429, + "step": 6607 + }, + { + "epoch": 1.409556313993174, + "grad_norm": 0.2835545376541165, + "learning_rate": 0.00015504521398362797, + "loss": 0.9792, + "step": 6608 + }, + { + "epoch": 1.4097696245733788, + "grad_norm": 0.5021661307283888, + "learning_rate": 0.00015504226527560388, + "loss": 0.9652, + "step": 6609 + }, + { + "epoch": 1.4099829351535837, + "grad_norm": 0.41457389597450284, + "learning_rate": 0.00015503931571847528, + "loss": 0.9391, + "step": 6610 + }, + { + "epoch": 1.4101962457337884, + "grad_norm": 0.3642975483348048, + "learning_rate": 0.00015503636531227557, + "loss": 0.9698, + "step": 6611 + }, + { + "epoch": 1.4104095563139931, + "grad_norm": 0.4014819687203631, + "learning_rate": 0.00015503341405703812, + "loss": 0.9638, + "step": 6612 + }, + { + "epoch": 1.410622866894198, + "grad_norm": 0.40684456330400626, + "learning_rate": 0.00015503046195279634, + "loss": 0.9683, + "step": 6613 + }, + { + "epoch": 1.4108361774744027, + "grad_norm": 0.379451555744056, + "learning_rate": 0.00015502750899958358, + "loss": 0.9681, + "step": 6614 + }, + { + "epoch": 1.4110494880546076, + "grad_norm": 0.4088273690789481, + "learning_rate": 0.00015502455519743333, + "loss": 0.9437, + "step": 6615 + }, + { + "epoch": 1.4112627986348123, + "grad_norm": 0.5140419001086556, + "learning_rate": 0.000155021600546379, + "loss": 0.9622, + "step": 6616 + }, + { + "epoch": 1.411476109215017, + "grad_norm": 0.5434994555785401, + "learning_rate": 0.00015501864504645395, + "loss": 0.9363, + "step": 6617 + }, + { + "epoch": 1.411689419795222, + "grad_norm": 0.5415186508755552, + "learning_rate": 0.0001550156886976917, + "loss": 0.9855, + "step": 6618 + }, + { + "epoch": 1.4119027303754266, + "grad_norm": 0.6189034600761614, + "learning_rate": 0.00015501273150012564, + "loss": 0.9784, + "step": 6619 + }, + { + "epoch": 1.4121160409556315, + "grad_norm": 0.6917422157729044, + "learning_rate": 0.0001550097734537893, + "loss": 0.9521, + "step": 6620 + }, + { + "epoch": 1.4123293515358362, + "grad_norm": 0.667174026565121, + "learning_rate": 0.0001550068145587161, + "loss": 0.9615, + "step": 6621 + }, + { + "epoch": 1.4125426621160408, + "grad_norm": 0.5600553189949807, + "learning_rate": 0.0001550038548149395, + "loss": 0.9586, + "step": 6622 + }, + { + "epoch": 1.4127559726962458, + "grad_norm": 0.5662603792632649, + "learning_rate": 0.00015500089422249304, + "loss": 0.9386, + "step": 6623 + }, + { + "epoch": 1.4129692832764504, + "grad_norm": 0.5034286180569365, + "learning_rate": 0.00015499793278141022, + "loss": 0.961, + "step": 6624 + }, + { + "epoch": 1.4131825938566553, + "grad_norm": 0.4379041146892824, + "learning_rate": 0.00015499497049172451, + "loss": 0.9203, + "step": 6625 + }, + { + "epoch": 1.41339590443686, + "grad_norm": 0.4191055200612759, + "learning_rate": 0.00015499200735346943, + "loss": 0.9431, + "step": 6626 + }, + { + "epoch": 1.4136092150170647, + "grad_norm": 0.39712953132467343, + "learning_rate": 0.00015498904336667856, + "loss": 0.9574, + "step": 6627 + }, + { + "epoch": 1.4138225255972696, + "grad_norm": 0.3096330967198892, + "learning_rate": 0.00015498607853138538, + "loss": 0.9688, + "step": 6628 + }, + { + "epoch": 1.4140358361774745, + "grad_norm": 0.2796961946932849, + "learning_rate": 0.00015498311284762347, + "loss": 0.939, + "step": 6629 + }, + { + "epoch": 1.4142491467576792, + "grad_norm": 0.3179646361605157, + "learning_rate": 0.0001549801463154264, + "loss": 0.9935, + "step": 6630 + }, + { + "epoch": 1.414462457337884, + "grad_norm": 0.375435713301252, + "learning_rate": 0.00015497717893482767, + "loss": 0.9482, + "step": 6631 + }, + { + "epoch": 1.4146757679180888, + "grad_norm": 0.37176200082582533, + "learning_rate": 0.00015497421070586093, + "loss": 0.9497, + "step": 6632 + }, + { + "epoch": 1.4148890784982935, + "grad_norm": 0.37813960484119136, + "learning_rate": 0.00015497124162855972, + "loss": 0.9808, + "step": 6633 + }, + { + "epoch": 1.4151023890784984, + "grad_norm": 0.4534301651471531, + "learning_rate": 0.00015496827170295764, + "loss": 0.9535, + "step": 6634 + }, + { + "epoch": 1.415315699658703, + "grad_norm": 0.5578196676877675, + "learning_rate": 0.0001549653009290883, + "loss": 0.9493, + "step": 6635 + }, + { + "epoch": 1.4155290102389078, + "grad_norm": 0.4433657299001026, + "learning_rate": 0.00015496232930698538, + "loss": 0.9468, + "step": 6636 + }, + { + "epoch": 1.4157423208191127, + "grad_norm": 0.4135955970745844, + "learning_rate": 0.00015495935683668239, + "loss": 0.9603, + "step": 6637 + }, + { + "epoch": 1.4159556313993173, + "grad_norm": 0.512962733265387, + "learning_rate": 0.00015495638351821302, + "loss": 0.9206, + "step": 6638 + }, + { + "epoch": 1.4161689419795223, + "grad_norm": 0.5390224079580908, + "learning_rate": 0.0001549534093516109, + "loss": 0.9427, + "step": 6639 + }, + { + "epoch": 1.416382252559727, + "grad_norm": 0.5591063740691042, + "learning_rate": 0.00015495043433690973, + "loss": 0.9951, + "step": 6640 + }, + { + "epoch": 1.4165955631399316, + "grad_norm": 0.5645707448762901, + "learning_rate": 0.0001549474584741431, + "loss": 0.9914, + "step": 6641 + }, + { + "epoch": 1.4168088737201365, + "grad_norm": 0.5370095883220174, + "learning_rate": 0.00015494448176334475, + "loss": 0.9325, + "step": 6642 + }, + { + "epoch": 1.4170221843003412, + "grad_norm": 0.49030409004136216, + "learning_rate": 0.00015494150420454828, + "loss": 0.9441, + "step": 6643 + }, + { + "epoch": 1.4172354948805461, + "grad_norm": 0.5031214286342993, + "learning_rate": 0.00015493852579778747, + "loss": 0.9479, + "step": 6644 + }, + { + "epoch": 1.4174488054607508, + "grad_norm": 0.45933269139026384, + "learning_rate": 0.00015493554654309597, + "loss": 0.9783, + "step": 6645 + }, + { + "epoch": 1.4176621160409557, + "grad_norm": 0.46421182179334763, + "learning_rate": 0.0001549325664405075, + "loss": 0.9699, + "step": 6646 + }, + { + "epoch": 1.4178754266211604, + "grad_norm": 0.44833386535312114, + "learning_rate": 0.00015492958549005576, + "loss": 0.9642, + "step": 6647 + }, + { + "epoch": 1.4180887372013653, + "grad_norm": 0.3731218241170011, + "learning_rate": 0.00015492660369177452, + "loss": 0.9641, + "step": 6648 + }, + { + "epoch": 1.41830204778157, + "grad_norm": 0.6072718158671699, + "learning_rate": 0.00015492362104569751, + "loss": 0.9593, + "step": 6649 + }, + { + "epoch": 1.4185153583617747, + "grad_norm": 0.33226083610390467, + "learning_rate": 0.00015492063755185846, + "loss": 0.92, + "step": 6650 + }, + { + "epoch": 1.4187286689419796, + "grad_norm": 0.33592775028359, + "learning_rate": 0.00015491765321029114, + "loss": 0.9495, + "step": 6651 + }, + { + "epoch": 1.4189419795221843, + "grad_norm": 0.27692590335701317, + "learning_rate": 0.00015491466802102928, + "loss": 0.9664, + "step": 6652 + }, + { + "epoch": 1.4191552901023892, + "grad_norm": 0.3047331839757817, + "learning_rate": 0.00015491168198410673, + "loss": 1.002, + "step": 6653 + }, + { + "epoch": 1.4193686006825939, + "grad_norm": 0.31404627427156656, + "learning_rate": 0.00015490869509955725, + "loss": 0.9382, + "step": 6654 + }, + { + "epoch": 1.4195819112627985, + "grad_norm": 0.31647755483313106, + "learning_rate": 0.00015490570736741458, + "loss": 0.9552, + "step": 6655 + }, + { + "epoch": 1.4197952218430034, + "grad_norm": 0.3071931401964481, + "learning_rate": 0.00015490271878771257, + "loss": 0.9717, + "step": 6656 + }, + { + "epoch": 1.4200085324232081, + "grad_norm": 0.27157153929277583, + "learning_rate": 0.00015489972936048508, + "loss": 0.9564, + "step": 6657 + }, + { + "epoch": 1.420221843003413, + "grad_norm": 0.25993440781608074, + "learning_rate": 0.00015489673908576586, + "loss": 0.9637, + "step": 6658 + }, + { + "epoch": 1.4204351535836177, + "grad_norm": 0.32899533129498654, + "learning_rate": 0.0001548937479635888, + "loss": 0.9603, + "step": 6659 + }, + { + "epoch": 1.4206484641638224, + "grad_norm": 0.36199871744188283, + "learning_rate": 0.0001548907559939877, + "loss": 0.9275, + "step": 6660 + }, + { + "epoch": 1.4208617747440273, + "grad_norm": 0.4210917395146236, + "learning_rate": 0.00015488776317699644, + "loss": 0.952, + "step": 6661 + }, + { + "epoch": 1.4210750853242322, + "grad_norm": 0.4765773411778018, + "learning_rate": 0.00015488476951264888, + "loss": 0.9359, + "step": 6662 + }, + { + "epoch": 1.421288395904437, + "grad_norm": 0.5256738908835791, + "learning_rate": 0.00015488177500097892, + "loss": 0.9566, + "step": 6663 + }, + { + "epoch": 1.4215017064846416, + "grad_norm": 0.5627150889259208, + "learning_rate": 0.00015487877964202038, + "loss": 0.9388, + "step": 6664 + }, + { + "epoch": 1.4217150170648465, + "grad_norm": 0.5689072959851251, + "learning_rate": 0.0001548757834358072, + "loss": 0.9128, + "step": 6665 + }, + { + "epoch": 1.4219283276450512, + "grad_norm": 0.6327473234003971, + "learning_rate": 0.0001548727863823733, + "loss": 0.9265, + "step": 6666 + }, + { + "epoch": 1.422141638225256, + "grad_norm": 0.7877708745248946, + "learning_rate": 0.00015486978848175252, + "loss": 0.9377, + "step": 6667 + }, + { + "epoch": 1.4223549488054608, + "grad_norm": 1.1259312967571873, + "learning_rate": 0.00015486678973397885, + "loss": 0.9613, + "step": 6668 + }, + { + "epoch": 1.4225682593856654, + "grad_norm": 0.5732538520307058, + "learning_rate": 0.0001548637901390862, + "loss": 0.9641, + "step": 6669 + }, + { + "epoch": 1.4227815699658704, + "grad_norm": 0.5195390723615729, + "learning_rate": 0.00015486078969710848, + "loss": 0.9524, + "step": 6670 + }, + { + "epoch": 1.422994880546075, + "grad_norm": 0.4200011091687078, + "learning_rate": 0.00015485778840807968, + "loss": 0.9803, + "step": 6671 + }, + { + "epoch": 1.42320819112628, + "grad_norm": 0.34034278547189384, + "learning_rate": 0.0001548547862720337, + "loss": 0.9831, + "step": 6672 + }, + { + "epoch": 1.4234215017064846, + "grad_norm": 0.46419737959322976, + "learning_rate": 0.00015485178328900463, + "loss": 0.987, + "step": 6673 + }, + { + "epoch": 1.4236348122866893, + "grad_norm": 0.5352433873918623, + "learning_rate": 0.00015484877945902634, + "loss": 0.9722, + "step": 6674 + }, + { + "epoch": 1.4238481228668942, + "grad_norm": 0.5743347981170213, + "learning_rate": 0.00015484577478213285, + "loss": 0.9462, + "step": 6675 + }, + { + "epoch": 1.424061433447099, + "grad_norm": 0.5839216518154456, + "learning_rate": 0.00015484276925835816, + "loss": 0.9291, + "step": 6676 + }, + { + "epoch": 1.4242747440273038, + "grad_norm": 0.5751946746366979, + "learning_rate": 0.00015483976288773626, + "loss": 0.9584, + "step": 6677 + }, + { + "epoch": 1.4244880546075085, + "grad_norm": 0.46055122114936325, + "learning_rate": 0.00015483675567030123, + "loss": 0.9415, + "step": 6678 + }, + { + "epoch": 1.4247013651877132, + "grad_norm": 0.4205112672148765, + "learning_rate": 0.000154833747606087, + "loss": 0.9428, + "step": 6679 + }, + { + "epoch": 1.424914675767918, + "grad_norm": 0.49589407655642953, + "learning_rate": 0.0001548307386951277, + "loss": 0.9689, + "step": 6680 + }, + { + "epoch": 1.425127986348123, + "grad_norm": 0.4481788071409988, + "learning_rate": 0.0001548277289374573, + "loss": 0.9898, + "step": 6681 + }, + { + "epoch": 1.4253412969283277, + "grad_norm": 0.9132746506869075, + "learning_rate": 0.0001548247183331099, + "loss": 0.9514, + "step": 6682 + }, + { + "epoch": 1.4255546075085324, + "grad_norm": 0.3889435390970732, + "learning_rate": 0.00015482170688211957, + "loss": 0.987, + "step": 6683 + }, + { + "epoch": 1.4257679180887373, + "grad_norm": 0.6632972651949515, + "learning_rate": 0.00015481869458452033, + "loss": 0.9664, + "step": 6684 + }, + { + "epoch": 1.425981228668942, + "grad_norm": 0.7591147424007607, + "learning_rate": 0.00015481568144034633, + "loss": 0.9489, + "step": 6685 + }, + { + "epoch": 1.4261945392491469, + "grad_norm": 0.7741091337424013, + "learning_rate": 0.00015481266744963164, + "loss": 0.9785, + "step": 6686 + }, + { + "epoch": 1.4264078498293515, + "grad_norm": 0.7126947845839727, + "learning_rate": 0.00015480965261241035, + "loss": 0.9704, + "step": 6687 + }, + { + "epoch": 1.4266211604095562, + "grad_norm": 0.5473528368275695, + "learning_rate": 0.0001548066369287166, + "loss": 0.9782, + "step": 6688 + }, + { + "epoch": 1.4268344709897611, + "grad_norm": 0.5544375813186068, + "learning_rate": 0.00015480362039858446, + "loss": 0.9524, + "step": 6689 + }, + { + "epoch": 1.4270477815699658, + "grad_norm": 0.47868295610842604, + "learning_rate": 0.00015480060302204812, + "loss": 0.981, + "step": 6690 + }, + { + "epoch": 1.4272610921501707, + "grad_norm": 0.520163348464398, + "learning_rate": 0.00015479758479914168, + "loss": 0.9614, + "step": 6691 + }, + { + "epoch": 1.4274744027303754, + "grad_norm": 0.5064097061035326, + "learning_rate": 0.00015479456572989932, + "loss": 0.9618, + "step": 6692 + }, + { + "epoch": 1.42768771331058, + "grad_norm": 0.5244520381266513, + "learning_rate": 0.0001547915458143552, + "loss": 0.9851, + "step": 6693 + }, + { + "epoch": 1.427901023890785, + "grad_norm": 0.548092280674724, + "learning_rate": 0.00015478852505254349, + "loss": 0.9806, + "step": 6694 + }, + { + "epoch": 1.42811433447099, + "grad_norm": 0.5043376481214387, + "learning_rate": 0.00015478550344449833, + "loss": 0.9735, + "step": 6695 + }, + { + "epoch": 1.4283276450511946, + "grad_norm": 0.4887513659852757, + "learning_rate": 0.00015478248099025398, + "loss": 0.952, + "step": 6696 + }, + { + "epoch": 1.4285409556313993, + "grad_norm": 0.5025052269031147, + "learning_rate": 0.00015477945768984456, + "loss": 0.9955, + "step": 6697 + }, + { + "epoch": 1.4287542662116042, + "grad_norm": 0.44256232956184116, + "learning_rate": 0.00015477643354330434, + "loss": 0.9597, + "step": 6698 + }, + { + "epoch": 1.4289675767918089, + "grad_norm": 0.460220006906432, + "learning_rate": 0.0001547734085506675, + "loss": 0.9649, + "step": 6699 + }, + { + "epoch": 1.4291808873720138, + "grad_norm": 0.4772430933514082, + "learning_rate": 0.0001547703827119683, + "loss": 0.9785, + "step": 6700 + }, + { + "epoch": 1.4293941979522184, + "grad_norm": 0.4691955227251931, + "learning_rate": 0.00015476735602724096, + "loss": 0.9726, + "step": 6701 + }, + { + "epoch": 1.4296075085324231, + "grad_norm": 0.308047651083402, + "learning_rate": 0.00015476432849651972, + "loss": 0.9352, + "step": 6702 + }, + { + "epoch": 1.429820819112628, + "grad_norm": 0.3667652380585238, + "learning_rate": 0.00015476130011983887, + "loss": 0.9665, + "step": 6703 + }, + { + "epoch": 1.4300341296928327, + "grad_norm": 0.36662598559408005, + "learning_rate": 0.0001547582708972326, + "loss": 0.9691, + "step": 6704 + }, + { + "epoch": 1.4302474402730376, + "grad_norm": 0.29902941125235066, + "learning_rate": 0.0001547552408287353, + "loss": 0.9048, + "step": 6705 + }, + { + "epoch": 1.4304607508532423, + "grad_norm": 0.3234687610091824, + "learning_rate": 0.00015475220991438115, + "loss": 0.9565, + "step": 6706 + }, + { + "epoch": 1.430674061433447, + "grad_norm": 0.3838571857022832, + "learning_rate": 0.00015474917815420453, + "loss": 0.9612, + "step": 6707 + }, + { + "epoch": 1.430887372013652, + "grad_norm": 0.3700863128646268, + "learning_rate": 0.00015474614554823968, + "loss": 0.9383, + "step": 6708 + }, + { + "epoch": 1.4311006825938566, + "grad_norm": 0.37409005553101266, + "learning_rate": 0.00015474311209652091, + "loss": 0.9792, + "step": 6709 + }, + { + "epoch": 1.4313139931740615, + "grad_norm": 0.3883458015840242, + "learning_rate": 0.00015474007779908258, + "loss": 0.9662, + "step": 6710 + }, + { + "epoch": 1.4315273037542662, + "grad_norm": 0.42520817984707987, + "learning_rate": 0.00015473704265595904, + "loss": 0.9592, + "step": 6711 + }, + { + "epoch": 1.4317406143344709, + "grad_norm": 0.36925906423421123, + "learning_rate": 0.00015473400666718459, + "loss": 0.9682, + "step": 6712 + }, + { + "epoch": 1.4319539249146758, + "grad_norm": 0.3127125721113721, + "learning_rate": 0.00015473096983279359, + "loss": 0.9934, + "step": 6713 + }, + { + "epoch": 1.4321672354948807, + "grad_norm": 0.3318276811246487, + "learning_rate": 0.0001547279321528204, + "loss": 0.9237, + "step": 6714 + }, + { + "epoch": 1.4323805460750854, + "grad_norm": 0.27512173426688724, + "learning_rate": 0.00015472489362729943, + "loss": 0.9851, + "step": 6715 + }, + { + "epoch": 1.43259385665529, + "grad_norm": 0.2551745355642493, + "learning_rate": 0.000154721854256265, + "loss": 0.9742, + "step": 6716 + }, + { + "epoch": 1.432807167235495, + "grad_norm": 0.2792006257137576, + "learning_rate": 0.00015471881403975155, + "loss": 0.9575, + "step": 6717 + }, + { + "epoch": 1.4330204778156996, + "grad_norm": 0.3175453726992306, + "learning_rate": 0.00015471577297779347, + "loss": 0.9784, + "step": 6718 + }, + { + "epoch": 1.4332337883959045, + "grad_norm": 0.3819072698748441, + "learning_rate": 0.00015471273107042514, + "loss": 0.9819, + "step": 6719 + }, + { + "epoch": 1.4334470989761092, + "grad_norm": 0.5086948482014252, + "learning_rate": 0.000154709688317681, + "loss": 0.9442, + "step": 6720 + }, + { + "epoch": 1.433660409556314, + "grad_norm": 0.5980253836199914, + "learning_rate": 0.0001547066447195955, + "loss": 0.9426, + "step": 6721 + }, + { + "epoch": 1.4338737201365188, + "grad_norm": 0.7440560608299127, + "learning_rate": 0.00015470360027620308, + "loss": 0.9472, + "step": 6722 + }, + { + "epoch": 1.4340870307167235, + "grad_norm": 0.806680652426383, + "learning_rate": 0.00015470055498753812, + "loss": 0.9591, + "step": 6723 + }, + { + "epoch": 1.4343003412969284, + "grad_norm": 0.8500598875058001, + "learning_rate": 0.00015469750885363514, + "loss": 0.9786, + "step": 6724 + }, + { + "epoch": 1.434513651877133, + "grad_norm": 0.8996601133552825, + "learning_rate": 0.0001546944618745286, + "loss": 0.9629, + "step": 6725 + }, + { + "epoch": 1.4347269624573378, + "grad_norm": 0.877568186814219, + "learning_rate": 0.00015469141405025295, + "loss": 0.9705, + "step": 6726 + }, + { + "epoch": 1.4349402730375427, + "grad_norm": 0.7728393523081976, + "learning_rate": 0.0001546883653808427, + "loss": 0.9628, + "step": 6727 + }, + { + "epoch": 1.4351535836177474, + "grad_norm": 0.5728696750989574, + "learning_rate": 0.00015468531586633236, + "loss": 0.9288, + "step": 6728 + }, + { + "epoch": 1.4353668941979523, + "grad_norm": 0.35447768414255537, + "learning_rate": 0.0001546822655067564, + "loss": 0.9393, + "step": 6729 + }, + { + "epoch": 1.435580204778157, + "grad_norm": 0.3464603880845687, + "learning_rate": 0.00015467921430214934, + "loss": 0.9737, + "step": 6730 + }, + { + "epoch": 1.4357935153583616, + "grad_norm": 0.412508930590679, + "learning_rate": 0.0001546761622525457, + "loss": 0.9776, + "step": 6731 + }, + { + "epoch": 1.4360068259385665, + "grad_norm": 0.5638220070933386, + "learning_rate": 0.00015467310935798008, + "loss": 0.9907, + "step": 6732 + }, + { + "epoch": 1.4362201365187715, + "grad_norm": 0.73868894490279, + "learning_rate": 0.00015467005561848693, + "loss": 0.9517, + "step": 6733 + }, + { + "epoch": 1.4364334470989761, + "grad_norm": 0.8418524535937764, + "learning_rate": 0.00015466700103410085, + "loss": 0.9651, + "step": 6734 + }, + { + "epoch": 1.4366467576791808, + "grad_norm": 0.9082914055934813, + "learning_rate": 0.00015466394560485641, + "loss": 0.9341, + "step": 6735 + }, + { + "epoch": 1.4368600682593857, + "grad_norm": 0.7974983859329515, + "learning_rate": 0.00015466088933078817, + "loss": 0.9639, + "step": 6736 + }, + { + "epoch": 1.4370733788395904, + "grad_norm": 0.5309600235026138, + "learning_rate": 0.0001546578322119307, + "loss": 0.9461, + "step": 6737 + }, + { + "epoch": 1.4372866894197953, + "grad_norm": 0.28766845896056575, + "learning_rate": 0.00015465477424831862, + "loss": 0.9879, + "step": 6738 + }, + { + "epoch": 1.4375, + "grad_norm": 0.41919633295481734, + "learning_rate": 0.0001546517154399865, + "loss": 0.9655, + "step": 6739 + }, + { + "epoch": 1.4377133105802047, + "grad_norm": 0.6600781328103185, + "learning_rate": 0.00015464865578696896, + "loss": 0.9472, + "step": 6740 + }, + { + "epoch": 1.4379266211604096, + "grad_norm": 0.851887657497449, + "learning_rate": 0.00015464559528930062, + "loss": 0.9702, + "step": 6741 + }, + { + "epoch": 1.4381399317406143, + "grad_norm": 0.8890749886211567, + "learning_rate": 0.00015464253394701613, + "loss": 0.9875, + "step": 6742 + }, + { + "epoch": 1.4383532423208192, + "grad_norm": 0.8440087059900245, + "learning_rate": 0.0001546394717601501, + "loss": 1.0097, + "step": 6743 + }, + { + "epoch": 1.4385665529010239, + "grad_norm": 0.7468434562579415, + "learning_rate": 0.0001546364087287372, + "loss": 0.9962, + "step": 6744 + }, + { + "epoch": 1.4387798634812285, + "grad_norm": 0.5638349532754785, + "learning_rate": 0.0001546333448528121, + "loss": 0.9579, + "step": 6745 + }, + { + "epoch": 1.4389931740614335, + "grad_norm": 0.35088899559294545, + "learning_rate": 0.0001546302801324094, + "loss": 0.9696, + "step": 6746 + }, + { + "epoch": 1.4392064846416384, + "grad_norm": 0.27668317979288, + "learning_rate": 0.00015462721456756384, + "loss": 0.9617, + "step": 6747 + }, + { + "epoch": 1.439419795221843, + "grad_norm": 0.3767394017197273, + "learning_rate": 0.00015462414815831013, + "loss": 0.9637, + "step": 6748 + }, + { + "epoch": 1.4396331058020477, + "grad_norm": 0.4625447087870889, + "learning_rate": 0.00015462108090468288, + "loss": 0.936, + "step": 6749 + }, + { + "epoch": 1.4398464163822526, + "grad_norm": 0.505819344218925, + "learning_rate": 0.00015461801280671687, + "loss": 0.9581, + "step": 6750 + }, + { + "epoch": 1.4400597269624573, + "grad_norm": 0.4187971536550685, + "learning_rate": 0.00015461494386444677, + "loss": 0.9393, + "step": 6751 + }, + { + "epoch": 1.4402730375426622, + "grad_norm": 0.27938295493273246, + "learning_rate": 0.00015461187407790736, + "loss": 0.9665, + "step": 6752 + }, + { + "epoch": 1.440486348122867, + "grad_norm": 0.2697422456504427, + "learning_rate": 0.00015460880344713332, + "loss": 0.9682, + "step": 6753 + }, + { + "epoch": 1.4406996587030716, + "grad_norm": 0.2641401139328669, + "learning_rate": 0.0001546057319721594, + "loss": 0.9132, + "step": 6754 + }, + { + "epoch": 1.4409129692832765, + "grad_norm": 0.3269657953836801, + "learning_rate": 0.0001546026596530204, + "loss": 0.9383, + "step": 6755 + }, + { + "epoch": 1.4411262798634812, + "grad_norm": 0.3406193985420186, + "learning_rate": 0.00015459958648975105, + "loss": 0.9588, + "step": 6756 + }, + { + "epoch": 1.441339590443686, + "grad_norm": 0.3812837308284488, + "learning_rate": 0.0001545965124823861, + "loss": 0.9279, + "step": 6757 + }, + { + "epoch": 1.4415529010238908, + "grad_norm": 0.3928897247759739, + "learning_rate": 0.00015459343763096035, + "loss": 0.9537, + "step": 6758 + }, + { + "epoch": 1.4417662116040955, + "grad_norm": 0.31714558906303586, + "learning_rate": 0.00015459036193550863, + "loss": 0.9538, + "step": 6759 + }, + { + "epoch": 1.4419795221843004, + "grad_norm": 0.3954513526934187, + "learning_rate": 0.0001545872853960657, + "loss": 0.9596, + "step": 6760 + }, + { + "epoch": 1.442192832764505, + "grad_norm": 0.3651685750999649, + "learning_rate": 0.00015458420801266635, + "loss": 0.9451, + "step": 6761 + }, + { + "epoch": 1.44240614334471, + "grad_norm": 0.3315080013096485, + "learning_rate": 0.00015458112978534546, + "loss": 0.9374, + "step": 6762 + }, + { + "epoch": 1.4426194539249146, + "grad_norm": 0.325753500132423, + "learning_rate": 0.00015457805071413783, + "loss": 0.9438, + "step": 6763 + }, + { + "epoch": 1.4428327645051193, + "grad_norm": 0.30012687040058883, + "learning_rate": 0.0001545749707990783, + "loss": 0.9542, + "step": 6764 + }, + { + "epoch": 1.4430460750853242, + "grad_norm": 0.2903315173352558, + "learning_rate": 0.00015457189004020172, + "loss": 0.9669, + "step": 6765 + }, + { + "epoch": 1.4432593856655291, + "grad_norm": 0.33249758105841914, + "learning_rate": 0.00015456880843754293, + "loss": 0.9661, + "step": 6766 + }, + { + "epoch": 1.4434726962457338, + "grad_norm": 0.3536064335398188, + "learning_rate": 0.00015456572599113684, + "loss": 0.9521, + "step": 6767 + }, + { + "epoch": 1.4436860068259385, + "grad_norm": 0.3305390124851728, + "learning_rate": 0.00015456264270101832, + "loss": 0.9445, + "step": 6768 + }, + { + "epoch": 1.4438993174061434, + "grad_norm": 0.5423454890557863, + "learning_rate": 0.00015455955856722223, + "loss": 0.9644, + "step": 6769 + }, + { + "epoch": 1.444112627986348, + "grad_norm": 0.7132958670215843, + "learning_rate": 0.00015455647358978347, + "loss": 0.9716, + "step": 6770 + }, + { + "epoch": 1.444325938566553, + "grad_norm": 0.6988387922350962, + "learning_rate": 0.00015455338776873697, + "loss": 0.9828, + "step": 6771 + }, + { + "epoch": 1.4445392491467577, + "grad_norm": 0.5732890931323235, + "learning_rate": 0.00015455030110411765, + "loss": 0.9466, + "step": 6772 + }, + { + "epoch": 1.4447525597269624, + "grad_norm": 0.4462733394749458, + "learning_rate": 0.00015454721359596036, + "loss": 0.9563, + "step": 6773 + }, + { + "epoch": 1.4449658703071673, + "grad_norm": 0.4233352678889059, + "learning_rate": 0.00015454412524430015, + "loss": 0.9371, + "step": 6774 + }, + { + "epoch": 1.445179180887372, + "grad_norm": 0.4257335793213341, + "learning_rate": 0.0001545410360491719, + "loss": 0.9203, + "step": 6775 + }, + { + "epoch": 1.4453924914675769, + "grad_norm": 0.3858192186022422, + "learning_rate": 0.00015453794601061056, + "loss": 0.9339, + "step": 6776 + }, + { + "epoch": 1.4456058020477816, + "grad_norm": 0.36349888741619707, + "learning_rate": 0.0001545348551286511, + "loss": 0.9333, + "step": 6777 + }, + { + "epoch": 1.4458191126279862, + "grad_norm": 0.31195085050587784, + "learning_rate": 0.00015453176340332852, + "loss": 0.9726, + "step": 6778 + }, + { + "epoch": 1.4460324232081911, + "grad_norm": 0.25679783637519854, + "learning_rate": 0.0001545286708346778, + "loss": 0.9414, + "step": 6779 + }, + { + "epoch": 1.4462457337883958, + "grad_norm": 0.2816111053099199, + "learning_rate": 0.00015452557742273387, + "loss": 0.9562, + "step": 6780 + }, + { + "epoch": 1.4464590443686007, + "grad_norm": 0.28086396877794073, + "learning_rate": 0.00015452248316753182, + "loss": 0.9395, + "step": 6781 + }, + { + "epoch": 1.4466723549488054, + "grad_norm": 0.3377424137647319, + "learning_rate": 0.0001545193880691066, + "loss": 0.9625, + "step": 6782 + }, + { + "epoch": 1.44688566552901, + "grad_norm": 0.42328991475650535, + "learning_rate": 0.00015451629212749323, + "loss": 0.951, + "step": 6783 + }, + { + "epoch": 1.447098976109215, + "grad_norm": 0.4490040742347499, + "learning_rate": 0.00015451319534272678, + "loss": 0.9339, + "step": 6784 + }, + { + "epoch": 1.44731228668942, + "grad_norm": 0.398468938604009, + "learning_rate": 0.00015451009771484228, + "loss": 0.9399, + "step": 6785 + }, + { + "epoch": 1.4475255972696246, + "grad_norm": 0.3528176690657921, + "learning_rate": 0.00015450699924387477, + "loss": 0.9973, + "step": 6786 + }, + { + "epoch": 1.4477389078498293, + "grad_norm": 0.29025341097980323, + "learning_rate": 0.0001545038999298593, + "loss": 0.9857, + "step": 6787 + }, + { + "epoch": 1.4479522184300342, + "grad_norm": 0.3392243274078072, + "learning_rate": 0.00015450079977283093, + "loss": 0.9743, + "step": 6788 + }, + { + "epoch": 1.4481655290102389, + "grad_norm": 0.3105430966056206, + "learning_rate": 0.00015449769877282478, + "loss": 0.9488, + "step": 6789 + }, + { + "epoch": 1.4483788395904438, + "grad_norm": 0.3571619233816855, + "learning_rate": 0.00015449459692987592, + "loss": 0.9564, + "step": 6790 + }, + { + "epoch": 1.4485921501706485, + "grad_norm": 0.4514099410938694, + "learning_rate": 0.00015449149424401947, + "loss": 0.9765, + "step": 6791 + }, + { + "epoch": 1.4488054607508531, + "grad_norm": 0.5286215382236098, + "learning_rate": 0.00015448839071529046, + "loss": 0.9471, + "step": 6792 + }, + { + "epoch": 1.449018771331058, + "grad_norm": 0.4804478352595618, + "learning_rate": 0.00015448528634372406, + "loss": 0.973, + "step": 6793 + }, + { + "epoch": 1.4492320819112627, + "grad_norm": 0.35140303197092343, + "learning_rate": 0.0001544821811293554, + "loss": 0.9312, + "step": 6794 + }, + { + "epoch": 1.4494453924914676, + "grad_norm": 0.3202052876370257, + "learning_rate": 0.00015447907507221964, + "loss": 0.9445, + "step": 6795 + }, + { + "epoch": 1.4496587030716723, + "grad_norm": 0.36003191698506676, + "learning_rate": 0.00015447596817235186, + "loss": 0.9297, + "step": 6796 + }, + { + "epoch": 1.449872013651877, + "grad_norm": 0.3508135209901603, + "learning_rate": 0.00015447286042978727, + "loss": 0.9658, + "step": 6797 + }, + { + "epoch": 1.450085324232082, + "grad_norm": 0.37861764502104495, + "learning_rate": 0.000154469751844561, + "loss": 0.9627, + "step": 6798 + }, + { + "epoch": 1.4502986348122868, + "grad_norm": 0.29620012202487117, + "learning_rate": 0.00015446664241670823, + "loss": 0.9397, + "step": 6799 + }, + { + "epoch": 1.4505119453924915, + "grad_norm": 0.2636691781458879, + "learning_rate": 0.00015446353214626415, + "loss": 0.9345, + "step": 6800 + }, + { + "epoch": 1.4507252559726962, + "grad_norm": 0.2809084430382671, + "learning_rate": 0.00015446042103326393, + "loss": 0.9607, + "step": 6801 + }, + { + "epoch": 1.450938566552901, + "grad_norm": 0.2825960472171017, + "learning_rate": 0.0001544573090777428, + "loss": 0.9469, + "step": 6802 + }, + { + "epoch": 1.4511518771331058, + "grad_norm": 0.3089409227514035, + "learning_rate": 0.000154454196279736, + "loss": 0.9683, + "step": 6803 + }, + { + "epoch": 1.4513651877133107, + "grad_norm": 0.28801859354883097, + "learning_rate": 0.0001544510826392787, + "loss": 0.9495, + "step": 6804 + }, + { + "epoch": 1.4515784982935154, + "grad_norm": 0.27621361267094, + "learning_rate": 0.0001544479681564061, + "loss": 0.9728, + "step": 6805 + }, + { + "epoch": 1.45179180887372, + "grad_norm": 0.2993526007463025, + "learning_rate": 0.00015444485283115357, + "loss": 0.9531, + "step": 6806 + }, + { + "epoch": 1.452005119453925, + "grad_norm": 0.2968140693042938, + "learning_rate": 0.00015444173666355625, + "loss": 0.9449, + "step": 6807 + }, + { + "epoch": 1.4522184300341296, + "grad_norm": 0.2815766827524658, + "learning_rate": 0.0001544386196536494, + "loss": 0.9496, + "step": 6808 + }, + { + "epoch": 1.4524317406143346, + "grad_norm": 0.38450796325445663, + "learning_rate": 0.00015443550180146835, + "loss": 0.9857, + "step": 6809 + }, + { + "epoch": 1.4526450511945392, + "grad_norm": 0.38382677648922725, + "learning_rate": 0.00015443238310704834, + "loss": 0.9785, + "step": 6810 + }, + { + "epoch": 1.452858361774744, + "grad_norm": 0.4507566811998042, + "learning_rate": 0.00015442926357042464, + "loss": 0.9674, + "step": 6811 + }, + { + "epoch": 1.4530716723549488, + "grad_norm": 0.5005285276937138, + "learning_rate": 0.00015442614319163258, + "loss": 0.9405, + "step": 6812 + }, + { + "epoch": 1.4532849829351535, + "grad_norm": 0.4640734935776035, + "learning_rate": 0.00015442302197070748, + "loss": 0.9456, + "step": 6813 + }, + { + "epoch": 1.4534982935153584, + "grad_norm": 0.3466170598414701, + "learning_rate": 0.0001544198999076846, + "loss": 0.9585, + "step": 6814 + }, + { + "epoch": 1.453711604095563, + "grad_norm": 0.30200426066701663, + "learning_rate": 0.00015441677700259933, + "loss": 0.9549, + "step": 6815 + }, + { + "epoch": 1.4539249146757678, + "grad_norm": 0.4268915626048306, + "learning_rate": 0.00015441365325548695, + "loss": 0.9522, + "step": 6816 + }, + { + "epoch": 1.4541382252559727, + "grad_norm": 0.49721348155750467, + "learning_rate": 0.00015441052866638284, + "loss": 0.9532, + "step": 6817 + }, + { + "epoch": 1.4543515358361776, + "grad_norm": 0.4621263112072325, + "learning_rate": 0.00015440740323532234, + "loss": 0.9631, + "step": 6818 + }, + { + "epoch": 1.4545648464163823, + "grad_norm": 0.3829444191683763, + "learning_rate": 0.00015440427696234086, + "loss": 0.9663, + "step": 6819 + }, + { + "epoch": 1.454778156996587, + "grad_norm": 0.3477141706243673, + "learning_rate": 0.00015440114984747368, + "loss": 0.9491, + "step": 6820 + }, + { + "epoch": 1.4549914675767919, + "grad_norm": 0.3568411134313297, + "learning_rate": 0.00015439802189075628, + "loss": 0.9853, + "step": 6821 + }, + { + "epoch": 1.4552047781569966, + "grad_norm": 0.4958258948870453, + "learning_rate": 0.000154394893092224, + "loss": 1.0045, + "step": 6822 + }, + { + "epoch": 1.4554180887372015, + "grad_norm": 0.6007909397129919, + "learning_rate": 0.00015439176345191224, + "loss": 0.9493, + "step": 6823 + }, + { + "epoch": 1.4556313993174061, + "grad_norm": 0.6461676168802043, + "learning_rate": 0.00015438863296985645, + "loss": 0.9907, + "step": 6824 + }, + { + "epoch": 1.4558447098976108, + "grad_norm": 0.7066406386374905, + "learning_rate": 0.000154385501646092, + "loss": 0.9652, + "step": 6825 + }, + { + "epoch": 1.4560580204778157, + "grad_norm": 0.7065856109755734, + "learning_rate": 0.00015438236948065435, + "loss": 0.9328, + "step": 6826 + }, + { + "epoch": 1.4562713310580204, + "grad_norm": 0.6390470250297023, + "learning_rate": 0.00015437923647357895, + "loss": 0.9795, + "step": 6827 + }, + { + "epoch": 1.4564846416382253, + "grad_norm": 0.6148043773165793, + "learning_rate": 0.00015437610262490124, + "loss": 0.9583, + "step": 6828 + }, + { + "epoch": 1.45669795221843, + "grad_norm": 0.5909203478550922, + "learning_rate": 0.00015437296793465667, + "loss": 0.9356, + "step": 6829 + }, + { + "epoch": 1.4569112627986347, + "grad_norm": 0.531466527235444, + "learning_rate": 0.00015436983240288074, + "loss": 0.9609, + "step": 6830 + }, + { + "epoch": 1.4571245733788396, + "grad_norm": 0.4927120714759739, + "learning_rate": 0.0001543666960296089, + "loss": 0.9454, + "step": 6831 + }, + { + "epoch": 1.4573378839590443, + "grad_norm": 0.44344808070224384, + "learning_rate": 0.0001543635588148766, + "loss": 0.9508, + "step": 6832 + }, + { + "epoch": 1.4575511945392492, + "grad_norm": 0.4668082632678627, + "learning_rate": 0.0001543604207587194, + "loss": 0.982, + "step": 6833 + }, + { + "epoch": 1.4577645051194539, + "grad_norm": 0.39051408611812216, + "learning_rate": 0.00015435728186117282, + "loss": 0.935, + "step": 6834 + }, + { + "epoch": 1.4579778156996588, + "grad_norm": 0.34974325116414473, + "learning_rate": 0.0001543541421222723, + "loss": 0.9895, + "step": 6835 + }, + { + "epoch": 1.4581911262798635, + "grad_norm": 0.3444778289706982, + "learning_rate": 0.00015435100154205346, + "loss": 0.9353, + "step": 6836 + }, + { + "epoch": 1.4584044368600684, + "grad_norm": 0.37040207065249126, + "learning_rate": 0.00015434786012055176, + "loss": 0.9846, + "step": 6837 + }, + { + "epoch": 1.458617747440273, + "grad_norm": 0.32443226974692135, + "learning_rate": 0.00015434471785780277, + "loss": 0.9858, + "step": 6838 + }, + { + "epoch": 1.4588310580204777, + "grad_norm": 0.33934135477714444, + "learning_rate": 0.00015434157475384205, + "loss": 0.9413, + "step": 6839 + }, + { + "epoch": 1.4590443686006827, + "grad_norm": 0.39412382381576894, + "learning_rate": 0.00015433843080870516, + "loss": 0.9743, + "step": 6840 + }, + { + "epoch": 1.4592576791808873, + "grad_norm": 0.443328993263849, + "learning_rate": 0.00015433528602242769, + "loss": 0.9446, + "step": 6841 + }, + { + "epoch": 1.4594709897610922, + "grad_norm": 0.5070941033207311, + "learning_rate": 0.00015433214039504517, + "loss": 0.9607, + "step": 6842 + }, + { + "epoch": 1.459684300341297, + "grad_norm": 0.4627170546836525, + "learning_rate": 0.00015432899392659325, + "loss": 0.8954, + "step": 6843 + }, + { + "epoch": 1.4598976109215016, + "grad_norm": 0.40408391388796927, + "learning_rate": 0.00015432584661710753, + "loss": 0.948, + "step": 6844 + }, + { + "epoch": 1.4601109215017065, + "grad_norm": 0.442641393872871, + "learning_rate": 0.0001543226984666236, + "loss": 0.9484, + "step": 6845 + }, + { + "epoch": 1.4603242320819112, + "grad_norm": 0.5002442399189617, + "learning_rate": 0.00015431954947517707, + "loss": 0.9519, + "step": 6846 + }, + { + "epoch": 1.460537542662116, + "grad_norm": 0.5323332337300986, + "learning_rate": 0.0001543163996428036, + "loss": 0.9563, + "step": 6847 + }, + { + "epoch": 1.4607508532423208, + "grad_norm": 0.5514023911021664, + "learning_rate": 0.00015431324896953883, + "loss": 0.9685, + "step": 6848 + }, + { + "epoch": 1.4609641638225255, + "grad_norm": 0.5666402780344767, + "learning_rate": 0.00015431009745541836, + "loss": 0.9518, + "step": 6849 + }, + { + "epoch": 1.4611774744027304, + "grad_norm": 0.6277269427463649, + "learning_rate": 0.00015430694510047794, + "loss": 1.004, + "step": 6850 + }, + { + "epoch": 1.4613907849829353, + "grad_norm": 0.7217525681131822, + "learning_rate": 0.00015430379190475315, + "loss": 0.9559, + "step": 6851 + }, + { + "epoch": 1.46160409556314, + "grad_norm": 0.8404413915634902, + "learning_rate": 0.00015430063786827972, + "loss": 0.9686, + "step": 6852 + }, + { + "epoch": 1.4618174061433447, + "grad_norm": 0.8961818237523403, + "learning_rate": 0.00015429748299109332, + "loss": 0.9311, + "step": 6853 + }, + { + "epoch": 1.4620307167235496, + "grad_norm": 0.8219382747411402, + "learning_rate": 0.00015429432727322965, + "loss": 0.9603, + "step": 6854 + }, + { + "epoch": 1.4622440273037542, + "grad_norm": 0.7025967025349596, + "learning_rate": 0.00015429117071472442, + "loss": 0.9678, + "step": 6855 + }, + { + "epoch": 1.4624573378839592, + "grad_norm": 0.4289863512433769, + "learning_rate": 0.00015428801331561333, + "loss": 0.9598, + "step": 6856 + }, + { + "epoch": 1.4626706484641638, + "grad_norm": 0.29892697292049886, + "learning_rate": 0.00015428485507593213, + "loss": 0.9485, + "step": 6857 + }, + { + "epoch": 1.4628839590443685, + "grad_norm": 0.5075221074107638, + "learning_rate": 0.00015428169599571655, + "loss": 0.9488, + "step": 6858 + }, + { + "epoch": 1.4630972696245734, + "grad_norm": 0.6740393874777794, + "learning_rate": 0.0001542785360750023, + "loss": 0.9131, + "step": 6859 + }, + { + "epoch": 1.463310580204778, + "grad_norm": 0.7616125916240247, + "learning_rate": 0.0001542753753138252, + "loss": 0.9498, + "step": 6860 + }, + { + "epoch": 1.463523890784983, + "grad_norm": 0.6731540453059632, + "learning_rate": 0.00015427221371222096, + "loss": 0.9524, + "step": 6861 + }, + { + "epoch": 1.4637372013651877, + "grad_norm": 0.5334027658752303, + "learning_rate": 0.00015426905127022538, + "loss": 0.9535, + "step": 6862 + }, + { + "epoch": 1.4639505119453924, + "grad_norm": 0.3166988313242403, + "learning_rate": 0.00015426588798787423, + "loss": 0.9733, + "step": 6863 + }, + { + "epoch": 1.4641638225255973, + "grad_norm": 0.3169505500534911, + "learning_rate": 0.00015426272386520333, + "loss": 0.9403, + "step": 6864 + }, + { + "epoch": 1.464377133105802, + "grad_norm": 0.4498107495501305, + "learning_rate": 0.00015425955890224842, + "loss": 0.9785, + "step": 6865 + }, + { + "epoch": 1.4645904436860069, + "grad_norm": 0.574788215924788, + "learning_rate": 0.0001542563930990454, + "loss": 0.9978, + "step": 6866 + }, + { + "epoch": 1.4648037542662116, + "grad_norm": 0.608358352434178, + "learning_rate": 0.00015425322645563002, + "loss": 0.9628, + "step": 6867 + }, + { + "epoch": 1.4650170648464163, + "grad_norm": 0.7418409853347613, + "learning_rate": 0.00015425005897203812, + "loss": 0.9837, + "step": 6868 + }, + { + "epoch": 1.4652303754266212, + "grad_norm": 0.8786022908564528, + "learning_rate": 0.00015424689064830557, + "loss": 0.9587, + "step": 6869 + }, + { + "epoch": 1.465443686006826, + "grad_norm": 0.8546507884900142, + "learning_rate": 0.0001542437214844682, + "loss": 0.9416, + "step": 6870 + }, + { + "epoch": 1.4656569965870307, + "grad_norm": 0.7466527851238393, + "learning_rate": 0.0001542405514805619, + "loss": 0.9389, + "step": 6871 + }, + { + "epoch": 1.4658703071672354, + "grad_norm": 0.46530136648464726, + "learning_rate": 0.00015423738063662248, + "loss": 0.9654, + "step": 6872 + }, + { + "epoch": 1.4660836177474403, + "grad_norm": 0.32083833953541957, + "learning_rate": 0.00015423420895268583, + "loss": 0.9313, + "step": 6873 + }, + { + "epoch": 1.466296928327645, + "grad_norm": 0.2973735795508233, + "learning_rate": 0.0001542310364287879, + "loss": 0.9349, + "step": 6874 + }, + { + "epoch": 1.46651023890785, + "grad_norm": 0.40008146497345876, + "learning_rate": 0.00015422786306496452, + "loss": 0.9709, + "step": 6875 + }, + { + "epoch": 1.4667235494880546, + "grad_norm": 0.4763881659736462, + "learning_rate": 0.0001542246888612516, + "loss": 0.9429, + "step": 6876 + }, + { + "epoch": 1.4669368600682593, + "grad_norm": 0.4551558611488623, + "learning_rate": 0.00015422151381768514, + "loss": 0.968, + "step": 6877 + }, + { + "epoch": 1.4671501706484642, + "grad_norm": 0.44153260190830945, + "learning_rate": 0.00015421833793430095, + "loss": 0.9415, + "step": 6878 + }, + { + "epoch": 1.4673634812286689, + "grad_norm": 0.31920708085298055, + "learning_rate": 0.00015421516121113504, + "loss": 0.9574, + "step": 6879 + }, + { + "epoch": 1.4675767918088738, + "grad_norm": 0.29503280571260715, + "learning_rate": 0.00015421198364822333, + "loss": 0.9477, + "step": 6880 + }, + { + "epoch": 1.4677901023890785, + "grad_norm": 0.5090917952706432, + "learning_rate": 0.00015420880524560178, + "loss": 0.9716, + "step": 6881 + }, + { + "epoch": 1.4680034129692832, + "grad_norm": 0.4144200655468391, + "learning_rate": 0.0001542056260033063, + "loss": 0.9615, + "step": 6882 + }, + { + "epoch": 1.468216723549488, + "grad_norm": 0.386986370001941, + "learning_rate": 0.00015420244592137297, + "loss": 0.9801, + "step": 6883 + }, + { + "epoch": 1.4684300341296928, + "grad_norm": 0.34297119265824283, + "learning_rate": 0.0001541992649998377, + "loss": 0.9653, + "step": 6884 + }, + { + "epoch": 1.4686433447098977, + "grad_norm": 0.34722843054048685, + "learning_rate": 0.0001541960832387365, + "loss": 0.9733, + "step": 6885 + }, + { + "epoch": 1.4688566552901023, + "grad_norm": 0.31886965880691653, + "learning_rate": 0.00015419290063810537, + "loss": 0.9361, + "step": 6886 + }, + { + "epoch": 1.4690699658703072, + "grad_norm": 0.3360916747275883, + "learning_rate": 0.00015418971719798034, + "loss": 0.9512, + "step": 6887 + }, + { + "epoch": 1.469283276450512, + "grad_norm": 0.36094642762108003, + "learning_rate": 0.0001541865329183974, + "loss": 0.9425, + "step": 6888 + }, + { + "epoch": 1.4694965870307168, + "grad_norm": 0.4299823669547221, + "learning_rate": 0.00015418334779939257, + "loss": 0.9611, + "step": 6889 + }, + { + "epoch": 1.4697098976109215, + "grad_norm": 0.3990294097747253, + "learning_rate": 0.00015418016184100194, + "loss": 0.9562, + "step": 6890 + }, + { + "epoch": 1.4699232081911262, + "grad_norm": 0.28687372760528423, + "learning_rate": 0.0001541769750432615, + "loss": 0.9346, + "step": 6891 + }, + { + "epoch": 1.4701365187713311, + "grad_norm": 0.2466112865766769, + "learning_rate": 0.0001541737874062074, + "loss": 0.967, + "step": 6892 + }, + { + "epoch": 1.4703498293515358, + "grad_norm": 0.3345302709158401, + "learning_rate": 0.00015417059892987558, + "loss": 0.9324, + "step": 6893 + }, + { + "epoch": 1.4705631399317407, + "grad_norm": 0.3292664746091279, + "learning_rate": 0.00015416740961430222, + "loss": 0.9564, + "step": 6894 + }, + { + "epoch": 1.4707764505119454, + "grad_norm": 0.3071528595186168, + "learning_rate": 0.00015416421945952337, + "loss": 0.9438, + "step": 6895 + }, + { + "epoch": 1.47098976109215, + "grad_norm": 0.3401012135602524, + "learning_rate": 0.00015416102846557513, + "loss": 0.9775, + "step": 6896 + }, + { + "epoch": 1.471203071672355, + "grad_norm": 0.3375364707588659, + "learning_rate": 0.00015415783663249357, + "loss": 0.9332, + "step": 6897 + }, + { + "epoch": 1.4714163822525597, + "grad_norm": 0.34132662740169845, + "learning_rate": 0.0001541546439603149, + "loss": 0.9492, + "step": 6898 + }, + { + "epoch": 1.4716296928327646, + "grad_norm": 0.3359559137735338, + "learning_rate": 0.00015415145044907517, + "loss": 0.9844, + "step": 6899 + }, + { + "epoch": 1.4718430034129693, + "grad_norm": 0.34371699047923204, + "learning_rate": 0.0001541482560988105, + "loss": 0.9666, + "step": 6900 + }, + { + "epoch": 1.472056313993174, + "grad_norm": 0.31903471420309826, + "learning_rate": 0.00015414506090955708, + "loss": 0.9644, + "step": 6901 + }, + { + "epoch": 1.4722696245733788, + "grad_norm": 0.3299226827879498, + "learning_rate": 0.00015414186488135105, + "loss": 0.9483, + "step": 6902 + }, + { + "epoch": 1.4724829351535837, + "grad_norm": 0.31232938650471304, + "learning_rate": 0.00015413866801422858, + "loss": 0.9526, + "step": 6903 + }, + { + "epoch": 1.4726962457337884, + "grad_norm": 0.3373855806119939, + "learning_rate": 0.00015413547030822584, + "loss": 0.9154, + "step": 6904 + }, + { + "epoch": 1.4729095563139931, + "grad_norm": 0.2998750472876009, + "learning_rate": 0.000154132271763379, + "loss": 0.9912, + "step": 6905 + }, + { + "epoch": 1.473122866894198, + "grad_norm": 0.3274348830893779, + "learning_rate": 0.00015412907237972423, + "loss": 0.9604, + "step": 6906 + }, + { + "epoch": 1.4733361774744027, + "grad_norm": 0.3540263230409592, + "learning_rate": 0.0001541258721572978, + "loss": 0.9466, + "step": 6907 + }, + { + "epoch": 1.4735494880546076, + "grad_norm": 0.3674348958850488, + "learning_rate": 0.00015412267109613586, + "loss": 0.9544, + "step": 6908 + }, + { + "epoch": 1.4737627986348123, + "grad_norm": 0.5297968768286734, + "learning_rate": 0.00015411946919627467, + "loss": 0.9497, + "step": 6909 + }, + { + "epoch": 1.473976109215017, + "grad_norm": 0.6014653647116446, + "learning_rate": 0.00015411626645775038, + "loss": 0.955, + "step": 6910 + }, + { + "epoch": 1.474189419795222, + "grad_norm": 0.7191892773913042, + "learning_rate": 0.00015411306288059935, + "loss": 0.9578, + "step": 6911 + }, + { + "epoch": 1.4744027303754266, + "grad_norm": 0.7346788780110233, + "learning_rate": 0.00015410985846485772, + "loss": 0.966, + "step": 6912 + }, + { + "epoch": 1.4746160409556315, + "grad_norm": 0.722328091479239, + "learning_rate": 0.00015410665321056183, + "loss": 0.9607, + "step": 6913 + }, + { + "epoch": 1.4748293515358362, + "grad_norm": 0.6798219752516302, + "learning_rate": 0.0001541034471177479, + "loss": 0.955, + "step": 6914 + }, + { + "epoch": 1.4750426621160408, + "grad_norm": 0.617289009419161, + "learning_rate": 0.00015410024018645225, + "loss": 0.9631, + "step": 6915 + }, + { + "epoch": 1.4752559726962458, + "grad_norm": 0.7010692870543564, + "learning_rate": 0.00015409703241671112, + "loss": 0.9506, + "step": 6916 + }, + { + "epoch": 1.4754692832764504, + "grad_norm": 0.3681532272595551, + "learning_rate": 0.0001540938238085608, + "loss": 0.9583, + "step": 6917 + }, + { + "epoch": 1.4756825938566553, + "grad_norm": 0.369939803787328, + "learning_rate": 0.00015409061436203763, + "loss": 0.9781, + "step": 6918 + }, + { + "epoch": 1.47589590443686, + "grad_norm": 0.4869600870027355, + "learning_rate": 0.0001540874040771779, + "loss": 1.0051, + "step": 6919 + }, + { + "epoch": 1.4761092150170647, + "grad_norm": 0.6780693280933193, + "learning_rate": 0.00015408419295401797, + "loss": 0.9504, + "step": 6920 + }, + { + "epoch": 1.4763225255972696, + "grad_norm": 0.7747215138047692, + "learning_rate": 0.00015408098099259414, + "loss": 0.9994, + "step": 6921 + }, + { + "epoch": 1.4765358361774745, + "grad_norm": 0.7310833839767382, + "learning_rate": 0.00015407776819294274, + "loss": 0.9661, + "step": 6922 + }, + { + "epoch": 1.4767491467576792, + "grad_norm": 0.6785654911095699, + "learning_rate": 0.0001540745545551002, + "loss": 0.9703, + "step": 6923 + }, + { + "epoch": 1.476962457337884, + "grad_norm": 0.6853972797809299, + "learning_rate": 0.00015407134007910277, + "loss": 0.9715, + "step": 6924 + }, + { + "epoch": 1.4771757679180888, + "grad_norm": 0.6623724357070583, + "learning_rate": 0.0001540681247649869, + "loss": 0.9315, + "step": 6925 + }, + { + "epoch": 1.4773890784982935, + "grad_norm": 0.637053938182748, + "learning_rate": 0.000154064908612789, + "loss": 0.9485, + "step": 6926 + }, + { + "epoch": 1.4776023890784984, + "grad_norm": 0.6039547503889104, + "learning_rate": 0.00015406169162254535, + "loss": 0.9965, + "step": 6927 + }, + { + "epoch": 1.477815699658703, + "grad_norm": 0.41191627750575616, + "learning_rate": 0.00015405847379429245, + "loss": 0.9484, + "step": 6928 + }, + { + "epoch": 1.4780290102389078, + "grad_norm": 0.31941080640412356, + "learning_rate": 0.00015405525512806668, + "loss": 0.9662, + "step": 6929 + }, + { + "epoch": 1.4782423208191127, + "grad_norm": 0.3419278689390197, + "learning_rate": 0.00015405203562390442, + "loss": 0.9354, + "step": 6930 + }, + { + "epoch": 1.4784556313993173, + "grad_norm": 0.36254815365109816, + "learning_rate": 0.00015404881528184215, + "loss": 0.9754, + "step": 6931 + }, + { + "epoch": 1.4786689419795223, + "grad_norm": 0.3854759339856225, + "learning_rate": 0.00015404559410191627, + "loss": 0.9952, + "step": 6932 + }, + { + "epoch": 1.478882252559727, + "grad_norm": 0.4022066063104142, + "learning_rate": 0.00015404237208416327, + "loss": 0.9816, + "step": 6933 + }, + { + "epoch": 1.4790955631399316, + "grad_norm": 0.4211928237949773, + "learning_rate": 0.00015403914922861958, + "loss": 0.9653, + "step": 6934 + }, + { + "epoch": 1.4793088737201365, + "grad_norm": 0.48951613876279304, + "learning_rate": 0.00015403592553532165, + "loss": 0.9607, + "step": 6935 + }, + { + "epoch": 1.4795221843003412, + "grad_norm": 0.5568874294246355, + "learning_rate": 0.000154032701004306, + "loss": 0.9473, + "step": 6936 + }, + { + "epoch": 1.4797354948805461, + "grad_norm": 0.5996065434589255, + "learning_rate": 0.00015402947563560908, + "loss": 0.991, + "step": 6937 + }, + { + "epoch": 1.4799488054607508, + "grad_norm": 0.6830757990630155, + "learning_rate": 0.00015402624942926742, + "loss": 0.9783, + "step": 6938 + }, + { + "epoch": 1.4801621160409557, + "grad_norm": 0.8116396690239828, + "learning_rate": 0.0001540230223853175, + "loss": 0.9727, + "step": 6939 + }, + { + "epoch": 1.4803754266211604, + "grad_norm": 0.8812838808750587, + "learning_rate": 0.00015401979450379582, + "loss": 0.9774, + "step": 6940 + }, + { + "epoch": 1.4805887372013653, + "grad_norm": 0.8322808403357596, + "learning_rate": 0.00015401656578473892, + "loss": 0.9685, + "step": 6941 + }, + { + "epoch": 1.48080204778157, + "grad_norm": 0.7191957114316481, + "learning_rate": 0.00015401333622818333, + "loss": 0.9722, + "step": 6942 + }, + { + "epoch": 1.4810153583617747, + "grad_norm": 0.5218717183315031, + "learning_rate": 0.00015401010583416562, + "loss": 0.984, + "step": 6943 + }, + { + "epoch": 1.4812286689419796, + "grad_norm": 0.33976313294629024, + "learning_rate": 0.0001540068746027223, + "loss": 0.9672, + "step": 6944 + }, + { + "epoch": 1.4814419795221843, + "grad_norm": 0.26139377349802345, + "learning_rate": 0.00015400364253388994, + "loss": 0.9572, + "step": 6945 + }, + { + "epoch": 1.4816552901023892, + "grad_norm": 0.4102208263548035, + "learning_rate": 0.00015400040962770513, + "loss": 1.0102, + "step": 6946 + }, + { + "epoch": 1.4818686006825939, + "grad_norm": 0.4521023141244513, + "learning_rate": 0.00015399717588420445, + "loss": 0.9881, + "step": 6947 + }, + { + "epoch": 1.4820819112627985, + "grad_norm": 0.47792683579892914, + "learning_rate": 0.00015399394130342448, + "loss": 0.9171, + "step": 6948 + }, + { + "epoch": 1.4822952218430034, + "grad_norm": 0.5202144390416612, + "learning_rate": 0.00015399070588540181, + "loss": 0.9622, + "step": 6949 + }, + { + "epoch": 1.4825085324232081, + "grad_norm": 0.4978548988071263, + "learning_rate": 0.00015398746963017306, + "loss": 0.9551, + "step": 6950 + }, + { + "epoch": 1.482721843003413, + "grad_norm": 0.33672174143387135, + "learning_rate": 0.00015398423253777484, + "loss": 0.9657, + "step": 6951 + }, + { + "epoch": 1.4829351535836177, + "grad_norm": 0.3610554610647904, + "learning_rate": 0.0001539809946082438, + "loss": 0.9647, + "step": 6952 + }, + { + "epoch": 1.4831484641638224, + "grad_norm": 0.4633939567747305, + "learning_rate": 0.00015397775584161656, + "loss": 0.9887, + "step": 6953 + }, + { + "epoch": 1.4833617747440273, + "grad_norm": 0.4742018348422418, + "learning_rate": 0.00015397451623792974, + "loss": 0.9868, + "step": 6954 + }, + { + "epoch": 1.4835750853242322, + "grad_norm": 0.4753513424238271, + "learning_rate": 0.00015397127579722006, + "loss": 0.9582, + "step": 6955 + }, + { + "epoch": 1.483788395904437, + "grad_norm": 0.4468170526376825, + "learning_rate": 0.00015396803451952413, + "loss": 0.9714, + "step": 6956 + }, + { + "epoch": 1.4840017064846416, + "grad_norm": 0.42738121206678376, + "learning_rate": 0.00015396479240487864, + "loss": 0.9453, + "step": 6957 + }, + { + "epoch": 1.4842150170648465, + "grad_norm": 0.30251247133362347, + "learning_rate": 0.0001539615494533203, + "loss": 0.9553, + "step": 6958 + }, + { + "epoch": 1.4844283276450512, + "grad_norm": 0.3236804918992939, + "learning_rate": 0.00015395830566488576, + "loss": 0.9568, + "step": 6959 + }, + { + "epoch": 1.484641638225256, + "grad_norm": 0.438501500405819, + "learning_rate": 0.00015395506103961176, + "loss": 0.9644, + "step": 6960 + }, + { + "epoch": 1.4848549488054608, + "grad_norm": 0.6233414312284247, + "learning_rate": 0.000153951815577535, + "loss": 0.9754, + "step": 6961 + }, + { + "epoch": 1.4850682593856654, + "grad_norm": 0.8301805491286427, + "learning_rate": 0.00015394856927869222, + "loss": 1.0006, + "step": 6962 + }, + { + "epoch": 1.4852815699658704, + "grad_norm": 0.8393287283840628, + "learning_rate": 0.00015394532214312013, + "loss": 0.9513, + "step": 6963 + }, + { + "epoch": 1.485494880546075, + "grad_norm": 0.6743331618809745, + "learning_rate": 0.00015394207417085545, + "loss": 0.9368, + "step": 6964 + }, + { + "epoch": 1.48570819112628, + "grad_norm": 0.4435494952245379, + "learning_rate": 0.00015393882536193497, + "loss": 0.9534, + "step": 6965 + }, + { + "epoch": 1.4859215017064846, + "grad_norm": 0.26579841229513596, + "learning_rate": 0.00015393557571639546, + "loss": 0.9428, + "step": 6966 + }, + { + "epoch": 1.4861348122866893, + "grad_norm": 0.36961235624882127, + "learning_rate": 0.00015393232523427364, + "loss": 0.9662, + "step": 6967 + }, + { + "epoch": 1.4863481228668942, + "grad_norm": 0.8371681021238797, + "learning_rate": 0.00015392907391560632, + "loss": 0.9348, + "step": 6968 + }, + { + "epoch": 1.486561433447099, + "grad_norm": 0.5679447910548199, + "learning_rate": 0.0001539258217604303, + "loss": 0.9517, + "step": 6969 + }, + { + "epoch": 1.4867747440273038, + "grad_norm": 3.36328191208487, + "learning_rate": 0.00015392256876878234, + "loss": 1.0827, + "step": 6970 + }, + { + "epoch": 1.4869880546075085, + "grad_norm": 1.3661796487038607, + "learning_rate": 0.00015391931494069928, + "loss": 0.9835, + "step": 6971 + }, + { + "epoch": 1.4872013651877132, + "grad_norm": 2.5225720442421027, + "learning_rate": 0.00015391606027621792, + "loss": 1.0395, + "step": 6972 + }, + { + "epoch": 1.487414675767918, + "grad_norm": 1.5708979846599773, + "learning_rate": 0.0001539128047753751, + "loss": 1.0522, + "step": 6973 + }, + { + "epoch": 1.487627986348123, + "grad_norm": 1.7149515026264173, + "learning_rate": 0.00015390954843820767, + "loss": 1.0022, + "step": 6974 + }, + { + "epoch": 1.4878412969283277, + "grad_norm": 0.7859279553810355, + "learning_rate": 0.00015390629126475246, + "loss": 0.96, + "step": 6975 + }, + { + "epoch": 1.4880546075085324, + "grad_norm": 2.064304336878749, + "learning_rate": 0.0001539030332550463, + "loss": 1.0157, + "step": 6976 + }, + { + "epoch": 1.4882679180887373, + "grad_norm": 0.96880490369489, + "learning_rate": 0.00015389977440912608, + "loss": 0.9904, + "step": 6977 + }, + { + "epoch": 1.488481228668942, + "grad_norm": 2.785978233779877, + "learning_rate": 0.00015389651472702868, + "loss": 1.0051, + "step": 6978 + }, + { + "epoch": 1.4886945392491469, + "grad_norm": 2.1822706418667437, + "learning_rate": 0.00015389325420879095, + "loss": 1.0397, + "step": 6979 + }, + { + "epoch": 1.4889078498293515, + "grad_norm": 1.3550746634387973, + "learning_rate": 0.00015388999285444982, + "loss": 1.0251, + "step": 6980 + }, + { + "epoch": 1.4891211604095562, + "grad_norm": 0.9700862996699435, + "learning_rate": 0.00015388673066404218, + "loss": 1.0055, + "step": 6981 + }, + { + "epoch": 1.4893344709897611, + "grad_norm": 2.232595402801356, + "learning_rate": 0.00015388346763760494, + "loss": 1.0266, + "step": 6982 + }, + { + "epoch": 1.4895477815699658, + "grad_norm": 1.3727918252119842, + "learning_rate": 0.000153880203775175, + "loss": 0.9898, + "step": 6983 + }, + { + "epoch": 1.4897610921501707, + "grad_norm": 1.7962539866121676, + "learning_rate": 0.00015387693907678934, + "loss": 1.032, + "step": 6984 + }, + { + "epoch": 1.4899744027303754, + "grad_norm": 1.3187936564766005, + "learning_rate": 0.00015387367354248484, + "loss": 0.9945, + "step": 6985 + }, + { + "epoch": 1.49018771331058, + "grad_norm": 1.7904076244389102, + "learning_rate": 0.0001538704071722985, + "loss": 0.9764, + "step": 6986 + }, + { + "epoch": 1.490401023890785, + "grad_norm": 1.4615649886255042, + "learning_rate": 0.00015386713996626725, + "loss": 1.022, + "step": 6987 + }, + { + "epoch": 1.49061433447099, + "grad_norm": 1.5916260422317887, + "learning_rate": 0.0001538638719244281, + "loss": 1.0279, + "step": 6988 + }, + { + "epoch": 1.4908276450511946, + "grad_norm": 1.3968224327538885, + "learning_rate": 0.000153860603046818, + "loss": 1.0233, + "step": 6989 + }, + { + "epoch": 1.4910409556313993, + "grad_norm": 1.3894136334173717, + "learning_rate": 0.0001538573333334739, + "loss": 0.9547, + "step": 6990 + }, + { + "epoch": 1.4912542662116042, + "grad_norm": 1.1020713341940471, + "learning_rate": 0.00015385406278443283, + "loss": 1.0126, + "step": 6991 + }, + { + "epoch": 1.4914675767918089, + "grad_norm": 1.3707961716029315, + "learning_rate": 0.0001538507913997318, + "loss": 0.9966, + "step": 6992 + }, + { + "epoch": 1.4916808873720138, + "grad_norm": 1.0107932463166869, + "learning_rate": 0.00015384751917940783, + "loss": 1.0114, + "step": 6993 + }, + { + "epoch": 1.4918941979522184, + "grad_norm": 1.7113052811184462, + "learning_rate": 0.00015384424612349795, + "loss": 0.9548, + "step": 6994 + }, + { + "epoch": 1.4921075085324231, + "grad_norm": 1.484414234007292, + "learning_rate": 0.00015384097223203918, + "loss": 0.994, + "step": 6995 + }, + { + "epoch": 1.492320819112628, + "grad_norm": 1.1438755042592326, + "learning_rate": 0.00015383769750506855, + "loss": 0.9636, + "step": 6996 + }, + { + "epoch": 1.4925341296928327, + "grad_norm": 1.0501084292330571, + "learning_rate": 0.00015383442194262314, + "loss": 1.0195, + "step": 6997 + }, + { + "epoch": 1.4927474402730376, + "grad_norm": 1.1365146110790383, + "learning_rate": 0.00015383114554474002, + "loss": 0.9474, + "step": 6998 + }, + { + "epoch": 1.4929607508532423, + "grad_norm": 0.7406971760163036, + "learning_rate": 0.00015382786831145623, + "loss": 0.9467, + "step": 6999 + }, + { + "epoch": 1.493174061433447, + "grad_norm": 1.0058531135454194, + "learning_rate": 0.00015382459024280886, + "loss": 0.9749, + "step": 7000 + }, + { + "epoch": 1.493387372013652, + "grad_norm": 0.5098105634008622, + "learning_rate": 0.00015382131133883503, + "loss": 1.0021, + "step": 7001 + }, + { + "epoch": 1.4936006825938566, + "grad_norm": 0.912640749056202, + "learning_rate": 0.00015381803159957182, + "loss": 0.9904, + "step": 7002 + }, + { + "epoch": 1.4938139931740615, + "grad_norm": 0.5199684767312246, + "learning_rate": 0.00015381475102505633, + "loss": 0.9827, + "step": 7003 + }, + { + "epoch": 1.4940273037542662, + "grad_norm": 0.6338580910185903, + "learning_rate": 0.0001538114696153257, + "loss": 0.9616, + "step": 7004 + }, + { + "epoch": 1.4942406143344709, + "grad_norm": 0.6036427776177758, + "learning_rate": 0.00015380818737041702, + "loss": 0.9752, + "step": 7005 + }, + { + "epoch": 1.4944539249146758, + "grad_norm": 0.4209979296121997, + "learning_rate": 0.00015380490429036747, + "loss": 0.977, + "step": 7006 + }, + { + "epoch": 1.4946672354948807, + "grad_norm": 0.5279599120753817, + "learning_rate": 0.0001538016203752142, + "loss": 0.9854, + "step": 7007 + }, + { + "epoch": 1.4948805460750854, + "grad_norm": 0.49604859568939536, + "learning_rate": 0.00015379833562499437, + "loss": 1.0031, + "step": 7008 + }, + { + "epoch": 1.49509385665529, + "grad_norm": 0.441161778869305, + "learning_rate": 0.0001537950500397451, + "loss": 0.9396, + "step": 7009 + }, + { + "epoch": 1.495307167235495, + "grad_norm": 0.4976815374743849, + "learning_rate": 0.0001537917636195036, + "loss": 0.9526, + "step": 7010 + }, + { + "epoch": 1.4955204778156996, + "grad_norm": 0.3604709188458323, + "learning_rate": 0.00015378847636430708, + "loss": 0.9649, + "step": 7011 + }, + { + "epoch": 1.4957337883959045, + "grad_norm": 0.4570136499961363, + "learning_rate": 0.00015378518827419268, + "loss": 0.9566, + "step": 7012 + }, + { + "epoch": 1.4959470989761092, + "grad_norm": 0.37591255718154265, + "learning_rate": 0.00015378189934919765, + "loss": 0.9511, + "step": 7013 + }, + { + "epoch": 1.496160409556314, + "grad_norm": 0.3413878820777131, + "learning_rate": 0.0001537786095893592, + "loss": 0.9554, + "step": 7014 + }, + { + "epoch": 1.4963737201365188, + "grad_norm": 0.3896954121368391, + "learning_rate": 0.0001537753189947145, + "loss": 0.9121, + "step": 7015 + }, + { + "epoch": 1.4965870307167235, + "grad_norm": 0.3308108708907277, + "learning_rate": 0.00015377202756530086, + "loss": 0.961, + "step": 7016 + }, + { + "epoch": 1.4968003412969284, + "grad_norm": 0.36078888107129453, + "learning_rate": 0.00015376873530115548, + "loss": 0.9623, + "step": 7017 + }, + { + "epoch": 1.497013651877133, + "grad_norm": 0.2926973782818125, + "learning_rate": 0.0001537654422023156, + "loss": 0.9659, + "step": 7018 + }, + { + "epoch": 1.4972269624573378, + "grad_norm": 0.36643542974866794, + "learning_rate": 0.00015376214826881855, + "loss": 0.957, + "step": 7019 + }, + { + "epoch": 1.4974402730375427, + "grad_norm": 0.31020058295721975, + "learning_rate": 0.0001537588535007015, + "loss": 0.9738, + "step": 7020 + }, + { + "epoch": 1.4976535836177474, + "grad_norm": 0.33758008632104775, + "learning_rate": 0.0001537555578980018, + "loss": 0.9397, + "step": 7021 + }, + { + "epoch": 1.4978668941979523, + "grad_norm": 0.3026551294817916, + "learning_rate": 0.00015375226146075672, + "loss": 0.944, + "step": 7022 + }, + { + "epoch": 1.498080204778157, + "grad_norm": 0.33662923857848565, + "learning_rate": 0.00015374896418900356, + "loss": 0.9505, + "step": 7023 + }, + { + "epoch": 1.4982935153583616, + "grad_norm": 0.36339457430935396, + "learning_rate": 0.00015374566608277964, + "loss": 0.9917, + "step": 7024 + }, + { + "epoch": 1.4985068259385665, + "grad_norm": 0.3824893653798276, + "learning_rate": 0.00015374236714212224, + "loss": 0.9548, + "step": 7025 + }, + { + "epoch": 1.4987201365187715, + "grad_norm": 0.2516285478789394, + "learning_rate": 0.00015373906736706875, + "loss": 0.9695, + "step": 7026 + }, + { + "epoch": 1.4989334470989761, + "grad_norm": 0.32867510069973715, + "learning_rate": 0.00015373576675765645, + "loss": 0.9535, + "step": 7027 + }, + { + "epoch": 1.4991467576791808, + "grad_norm": 0.28452636063008613, + "learning_rate": 0.00015373246531392274, + "loss": 0.9846, + "step": 7028 + }, + { + "epoch": 1.4993600682593857, + "grad_norm": 0.2509824161161889, + "learning_rate": 0.00015372916303590492, + "loss": 0.9216, + "step": 7029 + }, + { + "epoch": 1.4995733788395904, + "grad_norm": 0.31016601863962623, + "learning_rate": 0.0001537258599236404, + "loss": 0.9419, + "step": 7030 + }, + { + "epoch": 1.4997866894197953, + "grad_norm": 0.2934057162853453, + "learning_rate": 0.0001537225559771665, + "loss": 0.9335, + "step": 7031 + }, + { + "epoch": 1.5, + "grad_norm": 0.28708758558659797, + "learning_rate": 0.00015371925119652064, + "loss": 0.9622, + "step": 7032 + }, + { + "epoch": 1.5002133105802047, + "grad_norm": 0.23332236390135358, + "learning_rate": 0.00015371594558174023, + "loss": 0.9547, + "step": 7033 + }, + { + "epoch": 1.5004266211604096, + "grad_norm": 0.23317132247973102, + "learning_rate": 0.00015371263913286265, + "loss": 0.9543, + "step": 7034 + }, + { + "epoch": 1.5006399317406145, + "grad_norm": 0.24701162525182377, + "learning_rate": 0.0001537093318499253, + "loss": 0.939, + "step": 7035 + }, + { + "epoch": 1.5008532423208192, + "grad_norm": 0.3005838462660247, + "learning_rate": 0.0001537060237329657, + "loss": 0.9517, + "step": 7036 + }, + { + "epoch": 1.5010665529010239, + "grad_norm": 0.24814721397181813, + "learning_rate": 0.00015370271478202112, + "loss": 0.9734, + "step": 7037 + }, + { + "epoch": 1.5012798634812285, + "grad_norm": 0.3069768041455903, + "learning_rate": 0.0001536994049971291, + "loss": 0.9665, + "step": 7038 + }, + { + "epoch": 1.5014931740614335, + "grad_norm": 0.37699993301226314, + "learning_rate": 0.00015369609437832708, + "loss": 0.9324, + "step": 7039 + }, + { + "epoch": 1.5017064846416384, + "grad_norm": 0.3391381160388799, + "learning_rate": 0.00015369278292565254, + "loss": 0.9714, + "step": 7040 + }, + { + "epoch": 1.501919795221843, + "grad_norm": 0.30982270172741605, + "learning_rate": 0.00015368947063914288, + "loss": 0.9754, + "step": 7041 + }, + { + "epoch": 1.5021331058020477, + "grad_norm": 0.36245783352725514, + "learning_rate": 0.00015368615751883564, + "loss": 0.9537, + "step": 7042 + }, + { + "epoch": 1.5023464163822524, + "grad_norm": 0.41877645793928137, + "learning_rate": 0.00015368284356476826, + "loss": 0.9419, + "step": 7043 + }, + { + "epoch": 1.5025597269624573, + "grad_norm": 0.3283579562023075, + "learning_rate": 0.0001536795287769783, + "loss": 0.9763, + "step": 7044 + }, + { + "epoch": 1.5027730375426622, + "grad_norm": 0.3150513931655896, + "learning_rate": 0.0001536762131555032, + "loss": 0.9448, + "step": 7045 + }, + { + "epoch": 1.502986348122867, + "grad_norm": 0.32030662400224674, + "learning_rate": 0.00015367289670038055, + "loss": 0.9637, + "step": 7046 + }, + { + "epoch": 1.5031996587030716, + "grad_norm": 0.23701847942164522, + "learning_rate": 0.00015366957941164783, + "loss": 0.9686, + "step": 7047 + }, + { + "epoch": 1.5034129692832765, + "grad_norm": 0.23442365077538893, + "learning_rate": 0.00015366626128934254, + "loss": 0.9619, + "step": 7048 + }, + { + "epoch": 1.5036262798634812, + "grad_norm": 0.30012922999620273, + "learning_rate": 0.0001536629423335023, + "loss": 0.9592, + "step": 7049 + }, + { + "epoch": 1.503839590443686, + "grad_norm": 0.30926583884582476, + "learning_rate": 0.0001536596225441646, + "loss": 0.9707, + "step": 7050 + }, + { + "epoch": 1.5040529010238908, + "grad_norm": 0.20832158870834644, + "learning_rate": 0.00015365630192136707, + "loss": 0.9287, + "step": 7051 + }, + { + "epoch": 1.5042662116040955, + "grad_norm": 0.32277408138472863, + "learning_rate": 0.00015365298046514722, + "loss": 0.9378, + "step": 7052 + }, + { + "epoch": 1.5044795221843004, + "grad_norm": 0.394593578716634, + "learning_rate": 0.00015364965817554266, + "loss": 0.9401, + "step": 7053 + }, + { + "epoch": 1.5046928327645053, + "grad_norm": 0.3589950190565204, + "learning_rate": 0.00015364633505259096, + "loss": 0.9737, + "step": 7054 + }, + { + "epoch": 1.50490614334471, + "grad_norm": 0.3418169619492201, + "learning_rate": 0.00015364301109632977, + "loss": 0.9598, + "step": 7055 + }, + { + "epoch": 1.5051194539249146, + "grad_norm": 0.4085501857535431, + "learning_rate": 0.00015363968630679667, + "loss": 0.9929, + "step": 7056 + }, + { + "epoch": 1.5053327645051193, + "grad_norm": 0.46763323088117853, + "learning_rate": 0.00015363636068402927, + "loss": 0.9651, + "step": 7057 + }, + { + "epoch": 1.5055460750853242, + "grad_norm": 0.40942655262914, + "learning_rate": 0.00015363303422806523, + "loss": 0.9495, + "step": 7058 + }, + { + "epoch": 1.5057593856655291, + "grad_norm": 0.38287775116026834, + "learning_rate": 0.00015362970693894213, + "loss": 0.9656, + "step": 7059 + }, + { + "epoch": 1.5059726962457338, + "grad_norm": 0.25308116141514597, + "learning_rate": 0.00015362637881669768, + "loss": 0.9768, + "step": 7060 + }, + { + "epoch": 1.5061860068259385, + "grad_norm": 0.23968156668645227, + "learning_rate": 0.00015362304986136954, + "loss": 0.9522, + "step": 7061 + }, + { + "epoch": 1.5063993174061432, + "grad_norm": 0.330164606590979, + "learning_rate": 0.0001536197200729953, + "loss": 0.9639, + "step": 7062 + }, + { + "epoch": 1.506612627986348, + "grad_norm": 0.362222788455821, + "learning_rate": 0.00015361638945161272, + "loss": 0.9489, + "step": 7063 + }, + { + "epoch": 1.506825938566553, + "grad_norm": 0.421712042163696, + "learning_rate": 0.00015361305799725946, + "loss": 0.9471, + "step": 7064 + }, + { + "epoch": 1.5070392491467577, + "grad_norm": 0.4293538969919025, + "learning_rate": 0.0001536097257099732, + "loss": 0.969, + "step": 7065 + }, + { + "epoch": 1.5072525597269624, + "grad_norm": 0.4545776694617653, + "learning_rate": 0.00015360639258979166, + "loss": 0.9387, + "step": 7066 + }, + { + "epoch": 1.5074658703071673, + "grad_norm": 0.4338245308651984, + "learning_rate": 0.00015360305863675253, + "loss": 0.9534, + "step": 7067 + }, + { + "epoch": 1.507679180887372, + "grad_norm": 0.37118312866830905, + "learning_rate": 0.0001535997238508936, + "loss": 0.9747, + "step": 7068 + }, + { + "epoch": 1.5078924914675769, + "grad_norm": 0.3409012187719256, + "learning_rate": 0.0001535963882322525, + "loss": 0.9317, + "step": 7069 + }, + { + "epoch": 1.5081058020477816, + "grad_norm": 0.300066104759138, + "learning_rate": 0.00015359305178086705, + "loss": 0.9721, + "step": 7070 + }, + { + "epoch": 1.5083191126279862, + "grad_norm": 0.23275852789033155, + "learning_rate": 0.00015358971449677501, + "loss": 0.9351, + "step": 7071 + }, + { + "epoch": 1.5085324232081911, + "grad_norm": 0.29839633429888746, + "learning_rate": 0.00015358637638001409, + "loss": 0.9656, + "step": 7072 + }, + { + "epoch": 1.508745733788396, + "grad_norm": 0.304983994933061, + "learning_rate": 0.00015358303743062207, + "loss": 0.9575, + "step": 7073 + }, + { + "epoch": 1.5089590443686007, + "grad_norm": 0.2179624539233489, + "learning_rate": 0.00015357969764863676, + "loss": 0.9416, + "step": 7074 + }, + { + "epoch": 1.5091723549488054, + "grad_norm": 0.29749859821832686, + "learning_rate": 0.00015357635703409594, + "loss": 0.97, + "step": 7075 + }, + { + "epoch": 1.50938566552901, + "grad_norm": 0.3031690789079127, + "learning_rate": 0.00015357301558703739, + "loss": 0.9489, + "step": 7076 + }, + { + "epoch": 1.509598976109215, + "grad_norm": 0.3258245386288855, + "learning_rate": 0.00015356967330749894, + "loss": 0.9927, + "step": 7077 + }, + { + "epoch": 1.50981228668942, + "grad_norm": 0.3878313860598902, + "learning_rate": 0.00015356633019551836, + "loss": 0.939, + "step": 7078 + }, + { + "epoch": 1.5100255972696246, + "grad_norm": 0.4490962920759096, + "learning_rate": 0.00015356298625113357, + "loss": 0.9473, + "step": 7079 + }, + { + "epoch": 1.5102389078498293, + "grad_norm": 0.5171806639084047, + "learning_rate": 0.00015355964147438235, + "loss": 0.9613, + "step": 7080 + }, + { + "epoch": 1.510452218430034, + "grad_norm": 0.5467245313370471, + "learning_rate": 0.00015355629586530253, + "loss": 0.9778, + "step": 7081 + }, + { + "epoch": 1.5106655290102389, + "grad_norm": 0.5310151843696717, + "learning_rate": 0.000153552949423932, + "loss": 0.9728, + "step": 7082 + }, + { + "epoch": 1.5108788395904438, + "grad_norm": 4.271492658756738, + "learning_rate": 0.0001535496021503086, + "loss": 1.012, + "step": 7083 + }, + { + "epoch": 1.5110921501706485, + "grad_norm": 0.9430026699312942, + "learning_rate": 0.00015354625404447018, + "loss": 0.9549, + "step": 7084 + }, + { + "epoch": 1.5113054607508531, + "grad_norm": 1.9681076536642337, + "learning_rate": 0.00015354290510645468, + "loss": 0.9615, + "step": 7085 + }, + { + "epoch": 1.511518771331058, + "grad_norm": 1.001384624517734, + "learning_rate": 0.0001535395553363, + "loss": 0.9735, + "step": 7086 + }, + { + "epoch": 1.511732081911263, + "grad_norm": 2.779044325192068, + "learning_rate": 0.000153536204734044, + "loss": 0.9568, + "step": 7087 + }, + { + "epoch": 1.5119453924914676, + "grad_norm": 2.727864692287403, + "learning_rate": 0.0001535328532997246, + "loss": 0.9951, + "step": 7088 + }, + { + "epoch": 1.5121587030716723, + "grad_norm": 0.9067748695477149, + "learning_rate": 0.00015352950103337973, + "loss": 0.9668, + "step": 7089 + }, + { + "epoch": 1.512372013651877, + "grad_norm": 1.6303476497711846, + "learning_rate": 0.00015352614793504732, + "loss": 0.9208, + "step": 7090 + }, + { + "epoch": 1.512585324232082, + "grad_norm": 1.3989140684712105, + "learning_rate": 0.0001535227940047653, + "loss": 0.9992, + "step": 7091 + }, + { + "epoch": 1.5127986348122868, + "grad_norm": 0.856387093650241, + "learning_rate": 0.00015351943924257164, + "loss": 0.9779, + "step": 7092 + }, + { + "epoch": 1.5130119453924915, + "grad_norm": 1.2400301656723318, + "learning_rate": 0.0001535160836485043, + "loss": 0.9804, + "step": 7093 + }, + { + "epoch": 1.5132252559726962, + "grad_norm": 0.7165474427071369, + "learning_rate": 0.00015351272722260122, + "loss": 0.9632, + "step": 7094 + }, + { + "epoch": 1.5134385665529009, + "grad_norm": 0.8230248924277715, + "learning_rate": 0.0001535093699649004, + "loss": 1.0003, + "step": 7095 + }, + { + "epoch": 1.5136518771331058, + "grad_norm": 0.9502888969279168, + "learning_rate": 0.00015350601187543982, + "loss": 0.9822, + "step": 7096 + }, + { + "epoch": 1.5138651877133107, + "grad_norm": 0.9763853373329472, + "learning_rate": 0.00015350265295425747, + "loss": 0.9783, + "step": 7097 + }, + { + "epoch": 1.5140784982935154, + "grad_norm": 0.8570509947719007, + "learning_rate": 0.00015349929320139136, + "loss": 0.9801, + "step": 7098 + }, + { + "epoch": 1.51429180887372, + "grad_norm": 0.5672360331943253, + "learning_rate": 0.00015349593261687955, + "loss": 0.9462, + "step": 7099 + }, + { + "epoch": 1.514505119453925, + "grad_norm": 0.6423928332026487, + "learning_rate": 0.00015349257120076, + "loss": 0.9607, + "step": 7100 + }, + { + "epoch": 1.5147184300341296, + "grad_norm": 0.7862922312093149, + "learning_rate": 0.00015348920895307078, + "loss": 0.9662, + "step": 7101 + }, + { + "epoch": 1.5149317406143346, + "grad_norm": 0.7407990563540919, + "learning_rate": 0.0001534858458738499, + "loss": 0.9686, + "step": 7102 + }, + { + "epoch": 1.5151450511945392, + "grad_norm": 0.435493250522347, + "learning_rate": 0.00015348248196313548, + "loss": 0.9479, + "step": 7103 + }, + { + "epoch": 1.515358361774744, + "grad_norm": 0.5462863040478161, + "learning_rate": 0.00015347911722096553, + "loss": 0.9606, + "step": 7104 + }, + { + "epoch": 1.5155716723549488, + "grad_norm": 0.6295852519032574, + "learning_rate": 0.0001534757516473781, + "loss": 0.9565, + "step": 7105 + }, + { + "epoch": 1.5157849829351537, + "grad_norm": 0.6939874952809805, + "learning_rate": 0.00015347238524241133, + "loss": 0.9759, + "step": 7106 + }, + { + "epoch": 1.5159982935153584, + "grad_norm": 0.5741214315574399, + "learning_rate": 0.00015346901800610327, + "loss": 0.9835, + "step": 7107 + }, + { + "epoch": 1.516211604095563, + "grad_norm": 0.32730656809063124, + "learning_rate": 0.00015346564993849203, + "loss": 0.9523, + "step": 7108 + }, + { + "epoch": 1.5164249146757678, + "grad_norm": 0.5213431417340123, + "learning_rate": 0.00015346228103961578, + "loss": 0.9607, + "step": 7109 + }, + { + "epoch": 1.5166382252559727, + "grad_norm": 0.5496156264366749, + "learning_rate": 0.00015345891130951253, + "loss": 0.9566, + "step": 7110 + }, + { + "epoch": 1.5168515358361776, + "grad_norm": 0.42305188155229945, + "learning_rate": 0.0001534555407482205, + "loss": 0.969, + "step": 7111 + }, + { + "epoch": 1.5170648464163823, + "grad_norm": 0.34515223571992815, + "learning_rate": 0.00015345216935577774, + "loss": 0.9565, + "step": 7112 + }, + { + "epoch": 1.517278156996587, + "grad_norm": 0.3661731703541534, + "learning_rate": 0.00015344879713222247, + "loss": 0.9633, + "step": 7113 + }, + { + "epoch": 1.5174914675767917, + "grad_norm": 0.3513568493731036, + "learning_rate": 0.00015344542407759283, + "loss": 0.9359, + "step": 7114 + }, + { + "epoch": 1.5177047781569966, + "grad_norm": 0.4818144757204254, + "learning_rate": 0.000153442050191927, + "loss": 0.9519, + "step": 7115 + }, + { + "epoch": 1.5179180887372015, + "grad_norm": 0.489846730184081, + "learning_rate": 0.00015343867547526312, + "loss": 0.9402, + "step": 7116 + }, + { + "epoch": 1.5181313993174061, + "grad_norm": 0.283129918781487, + "learning_rate": 0.0001534352999276394, + "loss": 0.97, + "step": 7117 + }, + { + "epoch": 1.5183447098976108, + "grad_norm": 0.3506107016919257, + "learning_rate": 0.000153431923549094, + "loss": 0.9432, + "step": 7118 + }, + { + "epoch": 1.5185580204778157, + "grad_norm": 0.44114657358779363, + "learning_rate": 0.00015342854633966518, + "loss": 0.9528, + "step": 7119 + }, + { + "epoch": 1.5187713310580204, + "grad_norm": 0.37572259135805847, + "learning_rate": 0.00015342516829939108, + "loss": 1.0034, + "step": 7120 + }, + { + "epoch": 1.5189846416382253, + "grad_norm": 0.3530206950331341, + "learning_rate": 0.00015342178942831001, + "loss": 0.9535, + "step": 7121 + }, + { + "epoch": 1.51919795221843, + "grad_norm": 0.3610937272397207, + "learning_rate": 0.00015341840972646017, + "loss": 0.9606, + "step": 7122 + }, + { + "epoch": 1.5194112627986347, + "grad_norm": 0.3241567326158434, + "learning_rate": 0.00015341502919387974, + "loss": 0.9602, + "step": 7123 + }, + { + "epoch": 1.5196245733788396, + "grad_norm": 0.4154942100294157, + "learning_rate": 0.00015341164783060706, + "loss": 0.946, + "step": 7124 + }, + { + "epoch": 1.5198378839590445, + "grad_norm": 0.3425986163724353, + "learning_rate": 0.00015340826563668033, + "loss": 0.9621, + "step": 7125 + }, + { + "epoch": 1.5200511945392492, + "grad_norm": 0.31046445933309647, + "learning_rate": 0.00015340488261213783, + "loss": 0.9619, + "step": 7126 + }, + { + "epoch": 1.5202645051194539, + "grad_norm": 0.33028091806378906, + "learning_rate": 0.00015340149875701787, + "loss": 0.9551, + "step": 7127 + }, + { + "epoch": 1.5204778156996586, + "grad_norm": 0.2850161114204274, + "learning_rate": 0.0001533981140713587, + "loss": 0.9387, + "step": 7128 + }, + { + "epoch": 1.5206911262798635, + "grad_norm": 0.22755106780628462, + "learning_rate": 0.00015339472855519862, + "loss": 0.9436, + "step": 7129 + }, + { + "epoch": 1.5209044368600684, + "grad_norm": 0.24878517674321107, + "learning_rate": 0.000153391342208576, + "loss": 0.9443, + "step": 7130 + }, + { + "epoch": 1.521117747440273, + "grad_norm": 0.3147918023248902, + "learning_rate": 0.00015338795503152907, + "loss": 0.9291, + "step": 7131 + }, + { + "epoch": 1.5213310580204777, + "grad_norm": 0.28832180863162077, + "learning_rate": 0.0001533845670240962, + "loss": 0.9585, + "step": 7132 + }, + { + "epoch": 1.5215443686006824, + "grad_norm": 0.23690285763577543, + "learning_rate": 0.00015338117818631576, + "loss": 0.9799, + "step": 7133 + }, + { + "epoch": 1.5217576791808873, + "grad_norm": 0.32401224408415075, + "learning_rate": 0.000153377788518226, + "loss": 0.92, + "step": 7134 + }, + { + "epoch": 1.5219709897610922, + "grad_norm": 0.32544786486679145, + "learning_rate": 0.00015337439801986534, + "loss": 0.9908, + "step": 7135 + }, + { + "epoch": 1.522184300341297, + "grad_norm": 0.33562537865523717, + "learning_rate": 0.00015337100669127215, + "loss": 0.9866, + "step": 7136 + }, + { + "epoch": 1.5223976109215016, + "grad_norm": 0.3629173511527926, + "learning_rate": 0.00015336761453248478, + "loss": 0.9508, + "step": 7137 + }, + { + "epoch": 1.5226109215017065, + "grad_norm": 0.3044912458296432, + "learning_rate": 0.0001533642215435416, + "loss": 0.9477, + "step": 7138 + }, + { + "epoch": 1.5228242320819114, + "grad_norm": 0.25769970402108666, + "learning_rate": 0.00015336082772448106, + "loss": 0.9471, + "step": 7139 + }, + { + "epoch": 1.523037542662116, + "grad_norm": 0.3528825472370911, + "learning_rate": 0.00015335743307534147, + "loss": 0.9547, + "step": 7140 + }, + { + "epoch": 1.5232508532423208, + "grad_norm": 0.46783545410548694, + "learning_rate": 0.00015335403759616131, + "loss": 0.9597, + "step": 7141 + }, + { + "epoch": 1.5234641638225255, + "grad_norm": 0.43795821731155293, + "learning_rate": 0.00015335064128697902, + "loss": 0.9672, + "step": 7142 + }, + { + "epoch": 1.5236774744027304, + "grad_norm": 0.35430113306284644, + "learning_rate": 0.00015334724414783294, + "loss": 0.9637, + "step": 7143 + }, + { + "epoch": 1.5238907849829353, + "grad_norm": 0.2667441571595605, + "learning_rate": 0.0001533438461787616, + "loss": 0.958, + "step": 7144 + }, + { + "epoch": 1.52410409556314, + "grad_norm": 0.1993464645547916, + "learning_rate": 0.00015334044737980336, + "loss": 0.9659, + "step": 7145 + }, + { + "epoch": 1.5243174061433447, + "grad_norm": 0.23308486070097548, + "learning_rate": 0.00015333704775099673, + "loss": 0.965, + "step": 7146 + }, + { + "epoch": 1.5245307167235493, + "grad_norm": 0.29390970532597127, + "learning_rate": 0.0001533336472923802, + "loss": 0.9682, + "step": 7147 + }, + { + "epoch": 1.5247440273037542, + "grad_norm": 0.28282252981575773, + "learning_rate": 0.0001533302460039922, + "loss": 0.9346, + "step": 7148 + }, + { + "epoch": 1.5249573378839592, + "grad_norm": 0.2823448113572184, + "learning_rate": 0.00015332684388587125, + "loss": 0.9417, + "step": 7149 + }, + { + "epoch": 1.5251706484641638, + "grad_norm": 0.24525380973953065, + "learning_rate": 0.0001533234409380558, + "loss": 0.9493, + "step": 7150 + }, + { + "epoch": 1.5253839590443685, + "grad_norm": 0.23528313245079438, + "learning_rate": 0.0001533200371605844, + "loss": 0.9613, + "step": 7151 + }, + { + "epoch": 1.5255972696245734, + "grad_norm": 0.225838677049171, + "learning_rate": 0.00015331663255349555, + "loss": 0.9409, + "step": 7152 + }, + { + "epoch": 1.525810580204778, + "grad_norm": 0.27796297631982697, + "learning_rate": 0.00015331322711682776, + "loss": 1.003, + "step": 7153 + }, + { + "epoch": 1.526023890784983, + "grad_norm": 0.2621872780175749, + "learning_rate": 0.00015330982085061956, + "loss": 0.9774, + "step": 7154 + }, + { + "epoch": 1.5262372013651877, + "grad_norm": 0.2502388605039238, + "learning_rate": 0.00015330641375490954, + "loss": 0.9793, + "step": 7155 + }, + { + "epoch": 1.5264505119453924, + "grad_norm": 0.2731782350480242, + "learning_rate": 0.0001533030058297362, + "loss": 0.915, + "step": 7156 + }, + { + "epoch": 1.5266638225255973, + "grad_norm": 0.24565553393426884, + "learning_rate": 0.00015329959707513808, + "loss": 0.973, + "step": 7157 + }, + { + "epoch": 1.5268771331058022, + "grad_norm": 0.2634717518960714, + "learning_rate": 0.00015329618749115385, + "loss": 0.9461, + "step": 7158 + }, + { + "epoch": 1.5270904436860069, + "grad_norm": 0.26160482406806285, + "learning_rate": 0.00015329277707782198, + "loss": 0.9747, + "step": 7159 + }, + { + "epoch": 1.5273037542662116, + "grad_norm": 0.25919349497632005, + "learning_rate": 0.00015328936583518113, + "loss": 0.9351, + "step": 7160 + }, + { + "epoch": 1.5275170648464163, + "grad_norm": 0.257175961921966, + "learning_rate": 0.00015328595376326985, + "loss": 0.9562, + "step": 7161 + }, + { + "epoch": 1.5277303754266212, + "grad_norm": 0.24131473997375802, + "learning_rate": 0.0001532825408621268, + "loss": 0.9575, + "step": 7162 + }, + { + "epoch": 1.527943686006826, + "grad_norm": 0.25191795219272495, + "learning_rate": 0.00015327912713179054, + "loss": 0.9482, + "step": 7163 + }, + { + "epoch": 1.5281569965870307, + "grad_norm": 0.6839381496509329, + "learning_rate": 0.00015327571257229972, + "loss": 0.9299, + "step": 7164 + }, + { + "epoch": 1.5283703071672354, + "grad_norm": 0.3393673843196955, + "learning_rate": 0.000153272297183693, + "loss": 0.9498, + "step": 7165 + }, + { + "epoch": 1.5285836177474401, + "grad_norm": 0.3138370186783031, + "learning_rate": 0.000153268880966009, + "loss": 0.9534, + "step": 7166 + }, + { + "epoch": 1.528796928327645, + "grad_norm": 0.31417218907990807, + "learning_rate": 0.00015326546391928637, + "loss": 0.9632, + "step": 7167 + }, + { + "epoch": 1.52901023890785, + "grad_norm": 0.2673217810846289, + "learning_rate": 0.00015326204604356378, + "loss": 0.9745, + "step": 7168 + }, + { + "epoch": 1.5292235494880546, + "grad_norm": 0.2838954520060669, + "learning_rate": 0.00015325862733887993, + "loss": 0.9784, + "step": 7169 + }, + { + "epoch": 1.5294368600682593, + "grad_norm": 0.30607722822304295, + "learning_rate": 0.00015325520780527346, + "loss": 0.948, + "step": 7170 + }, + { + "epoch": 1.5296501706484642, + "grad_norm": 0.24522031818344014, + "learning_rate": 0.0001532517874427831, + "loss": 0.9259, + "step": 7171 + }, + { + "epoch": 1.5298634812286689, + "grad_norm": 0.2568873364441853, + "learning_rate": 0.0001532483662514475, + "loss": 0.9499, + "step": 7172 + }, + { + "epoch": 1.5300767918088738, + "grad_norm": 0.25078246165836127, + "learning_rate": 0.00015324494423130542, + "loss": 0.9697, + "step": 7173 + }, + { + "epoch": 1.5302901023890785, + "grad_norm": 0.24419028648436, + "learning_rate": 0.0001532415213823956, + "loss": 0.9362, + "step": 7174 + }, + { + "epoch": 1.5305034129692832, + "grad_norm": 0.2959466213845874, + "learning_rate": 0.00015323809770475668, + "loss": 0.9621, + "step": 7175 + }, + { + "epoch": 1.530716723549488, + "grad_norm": 0.28206983747376313, + "learning_rate": 0.00015323467319842745, + "loss": 0.9569, + "step": 7176 + }, + { + "epoch": 1.530930034129693, + "grad_norm": 0.2767892168993216, + "learning_rate": 0.0001532312478634467, + "loss": 0.9605, + "step": 7177 + }, + { + "epoch": 1.5311433447098977, + "grad_norm": 0.3003224467597115, + "learning_rate": 0.00015322782169985315, + "loss": 0.9385, + "step": 7178 + }, + { + "epoch": 1.5313566552901023, + "grad_norm": 0.30488640022886654, + "learning_rate": 0.00015322439470768554, + "loss": 0.9605, + "step": 7179 + }, + { + "epoch": 1.531569965870307, + "grad_norm": 0.23042722417205236, + "learning_rate": 0.00015322096688698267, + "loss": 0.9462, + "step": 7180 + }, + { + "epoch": 1.531783276450512, + "grad_norm": 0.24450567070133633, + "learning_rate": 0.00015321753823778334, + "loss": 0.9558, + "step": 7181 + }, + { + "epoch": 1.5319965870307168, + "grad_norm": 0.31484972551345725, + "learning_rate": 0.00015321410876012635, + "loss": 0.9662, + "step": 7182 + }, + { + "epoch": 1.5322098976109215, + "grad_norm": 0.3194547716773714, + "learning_rate": 0.0001532106784540505, + "loss": 0.9329, + "step": 7183 + }, + { + "epoch": 1.5324232081911262, + "grad_norm": 0.27271433157795627, + "learning_rate": 0.00015320724731959453, + "loss": 0.9589, + "step": 7184 + }, + { + "epoch": 1.5326365187713311, + "grad_norm": 0.28337943645690294, + "learning_rate": 0.00015320381535679738, + "loss": 0.9582, + "step": 7185 + }, + { + "epoch": 1.5328498293515358, + "grad_norm": 0.3321457609519731, + "learning_rate": 0.00015320038256569777, + "loss": 0.9662, + "step": 7186 + }, + { + "epoch": 1.5330631399317407, + "grad_norm": 0.40087671278619785, + "learning_rate": 0.00015319694894633465, + "loss": 0.9669, + "step": 7187 + }, + { + "epoch": 1.5332764505119454, + "grad_norm": 0.4174565396040206, + "learning_rate": 0.0001531935144987468, + "loss": 0.9866, + "step": 7188 + }, + { + "epoch": 1.53348976109215, + "grad_norm": 0.4325875088822285, + "learning_rate": 0.00015319007922297311, + "loss": 0.9681, + "step": 7189 + }, + { + "epoch": 1.533703071672355, + "grad_norm": 0.4281452739824632, + "learning_rate": 0.00015318664311905246, + "loss": 0.9654, + "step": 7190 + }, + { + "epoch": 1.5339163822525599, + "grad_norm": 0.37458494478323084, + "learning_rate": 0.00015318320618702369, + "loss": 0.9474, + "step": 7191 + }, + { + "epoch": 1.5341296928327646, + "grad_norm": 0.3262277614561809, + "learning_rate": 0.00015317976842692572, + "loss": 0.9924, + "step": 7192 + }, + { + "epoch": 1.5343430034129693, + "grad_norm": 0.2905701165239828, + "learning_rate": 0.00015317632983879743, + "loss": 0.9654, + "step": 7193 + }, + { + "epoch": 1.534556313993174, + "grad_norm": 0.29578044566473016, + "learning_rate": 0.00015317289042267776, + "loss": 0.9627, + "step": 7194 + }, + { + "epoch": 1.5347696245733788, + "grad_norm": 0.2808392434954988, + "learning_rate": 0.00015316945017860556, + "loss": 0.9403, + "step": 7195 + }, + { + "epoch": 1.5349829351535837, + "grad_norm": 0.26449647536535714, + "learning_rate": 0.00015316600910661983, + "loss": 0.9418, + "step": 7196 + }, + { + "epoch": 1.5351962457337884, + "grad_norm": 0.2798920783433535, + "learning_rate": 0.0001531625672067595, + "loss": 0.9517, + "step": 7197 + }, + { + "epoch": 1.5354095563139931, + "grad_norm": 0.2936506646611506, + "learning_rate": 0.00015315912447906344, + "loss": 0.93, + "step": 7198 + }, + { + "epoch": 1.5356228668941978, + "grad_norm": 0.2336845900899433, + "learning_rate": 0.00015315568092357068, + "loss": 0.953, + "step": 7199 + }, + { + "epoch": 1.5358361774744027, + "grad_norm": 0.2575690646414943, + "learning_rate": 0.00015315223654032014, + "loss": 0.9416, + "step": 7200 + }, + { + "epoch": 1.5360494880546076, + "grad_norm": 0.2819073213490071, + "learning_rate": 0.00015314879132935085, + "loss": 0.9782, + "step": 7201 + }, + { + "epoch": 1.5362627986348123, + "grad_norm": 0.3554204270488896, + "learning_rate": 0.00015314534529070175, + "loss": 0.9626, + "step": 7202 + }, + { + "epoch": 1.536476109215017, + "grad_norm": 0.3090449289637748, + "learning_rate": 0.00015314189842441183, + "loss": 0.9607, + "step": 7203 + }, + { + "epoch": 1.536689419795222, + "grad_norm": 0.29546421503740256, + "learning_rate": 0.00015313845073052008, + "loss": 0.957, + "step": 7204 + }, + { + "epoch": 1.5369027303754266, + "grad_norm": 0.3165014164803992, + "learning_rate": 0.00015313500220906555, + "loss": 0.9761, + "step": 7205 + }, + { + "epoch": 1.5371160409556315, + "grad_norm": 0.2867718298964109, + "learning_rate": 0.00015313155286008723, + "loss": 0.9762, + "step": 7206 + }, + { + "epoch": 1.5373293515358362, + "grad_norm": 0.3145238095362084, + "learning_rate": 0.00015312810268362418, + "loss": 0.9633, + "step": 7207 + }, + { + "epoch": 1.5375426621160408, + "grad_norm": 0.2569271661050092, + "learning_rate": 0.0001531246516797154, + "loss": 0.9755, + "step": 7208 + }, + { + "epoch": 1.5377559726962458, + "grad_norm": 0.24282340985585926, + "learning_rate": 0.00015312119984839997, + "loss": 0.9458, + "step": 7209 + }, + { + "epoch": 1.5379692832764507, + "grad_norm": 0.24252248720903508, + "learning_rate": 0.0001531177471897169, + "loss": 0.9503, + "step": 7210 + }, + { + "epoch": 1.5381825938566553, + "grad_norm": 0.2592071740026942, + "learning_rate": 0.00015311429370370533, + "loss": 0.9603, + "step": 7211 + }, + { + "epoch": 1.53839590443686, + "grad_norm": 0.3211838890360693, + "learning_rate": 0.00015311083939040427, + "loss": 0.9648, + "step": 7212 + }, + { + "epoch": 1.5386092150170647, + "grad_norm": 0.32475663474093003, + "learning_rate": 0.00015310738424985284, + "loss": 0.9611, + "step": 7213 + }, + { + "epoch": 1.5388225255972696, + "grad_norm": 0.3053986714699775, + "learning_rate": 0.0001531039282820901, + "loss": 0.9516, + "step": 7214 + }, + { + "epoch": 1.5390358361774745, + "grad_norm": 0.36394477858497426, + "learning_rate": 0.00015310047148715523, + "loss": 0.9617, + "step": 7215 + }, + { + "epoch": 1.5392491467576792, + "grad_norm": 0.45236079909993854, + "learning_rate": 0.0001530970138650873, + "loss": 0.9704, + "step": 7216 + }, + { + "epoch": 1.539462457337884, + "grad_norm": 0.4925521544769793, + "learning_rate": 0.0001530935554159254, + "loss": 0.966, + "step": 7217 + }, + { + "epoch": 1.5396757679180886, + "grad_norm": 0.5270403412092491, + "learning_rate": 0.0001530900961397087, + "loss": 0.9934, + "step": 7218 + }, + { + "epoch": 1.5398890784982935, + "grad_norm": 0.5479652880295988, + "learning_rate": 0.0001530866360364763, + "loss": 0.9361, + "step": 7219 + }, + { + "epoch": 1.5401023890784984, + "grad_norm": 0.41141470373308353, + "learning_rate": 0.00015308317510626744, + "loss": 0.9706, + "step": 7220 + }, + { + "epoch": 1.540315699658703, + "grad_norm": 0.31419595057980304, + "learning_rate": 0.0001530797133491212, + "loss": 0.9761, + "step": 7221 + }, + { + "epoch": 1.5405290102389078, + "grad_norm": 0.3630025341980356, + "learning_rate": 0.0001530762507650768, + "loss": 0.9578, + "step": 7222 + }, + { + "epoch": 1.5407423208191127, + "grad_norm": 0.31506555100779016, + "learning_rate": 0.00015307278735417334, + "loss": 0.9164, + "step": 7223 + }, + { + "epoch": 1.5409556313993176, + "grad_norm": 0.40805678219890235, + "learning_rate": 0.00015306932311645008, + "loss": 0.9627, + "step": 7224 + }, + { + "epoch": 1.5411689419795223, + "grad_norm": 0.5050247597547906, + "learning_rate": 0.00015306585805194623, + "loss": 0.947, + "step": 7225 + }, + { + "epoch": 1.541382252559727, + "grad_norm": 0.4402258232859558, + "learning_rate": 0.00015306239216070092, + "loss": 0.943, + "step": 7226 + }, + { + "epoch": 1.5415955631399316, + "grad_norm": 0.382791356333959, + "learning_rate": 0.00015305892544275346, + "loss": 0.9562, + "step": 7227 + }, + { + "epoch": 1.5418088737201365, + "grad_norm": 0.3964281951774004, + "learning_rate": 0.00015305545789814302, + "loss": 0.9491, + "step": 7228 + }, + { + "epoch": 1.5420221843003414, + "grad_norm": 0.41187522113852587, + "learning_rate": 0.0001530519895269088, + "loss": 0.9889, + "step": 7229 + }, + { + "epoch": 1.5422354948805461, + "grad_norm": 0.37235173437623725, + "learning_rate": 0.00015304852032909015, + "loss": 0.972, + "step": 7230 + }, + { + "epoch": 1.5424488054607508, + "grad_norm": 0.2923589613138279, + "learning_rate": 0.00015304505030472621, + "loss": 0.9636, + "step": 7231 + }, + { + "epoch": 1.5426621160409555, + "grad_norm": 0.3015870810344341, + "learning_rate": 0.00015304157945385634, + "loss": 0.9807, + "step": 7232 + }, + { + "epoch": 1.5428754266211604, + "grad_norm": 0.5586362365160016, + "learning_rate": 0.00015303810777651976, + "loss": 0.9794, + "step": 7233 + }, + { + "epoch": 1.5430887372013653, + "grad_norm": 0.7172946355495082, + "learning_rate": 0.00015303463527275573, + "loss": 0.9564, + "step": 7234 + }, + { + "epoch": 1.54330204778157, + "grad_norm": 0.7187588474788259, + "learning_rate": 0.00015303116194260362, + "loss": 0.9603, + "step": 7235 + }, + { + "epoch": 1.5435153583617747, + "grad_norm": 0.6492228793086259, + "learning_rate": 0.00015302768778610267, + "loss": 0.9393, + "step": 7236 + }, + { + "epoch": 1.5437286689419796, + "grad_norm": 0.5803459707107027, + "learning_rate": 0.0001530242128032922, + "loss": 0.9594, + "step": 7237 + }, + { + "epoch": 1.5439419795221843, + "grad_norm": 0.45161409315827167, + "learning_rate": 0.0001530207369942115, + "loss": 0.9629, + "step": 7238 + }, + { + "epoch": 1.5441552901023892, + "grad_norm": 0.3472088102898338, + "learning_rate": 0.0001530172603589, + "loss": 0.9527, + "step": 7239 + }, + { + "epoch": 1.5443686006825939, + "grad_norm": 0.4223378565832005, + "learning_rate": 0.00015301378289739692, + "loss": 0.9394, + "step": 7240 + }, + { + "epoch": 1.5445819112627985, + "grad_norm": 0.4760612834649654, + "learning_rate": 0.00015301030460974167, + "loss": 0.9389, + "step": 7241 + }, + { + "epoch": 1.5447952218430034, + "grad_norm": 0.5221511094310715, + "learning_rate": 0.0001530068254959736, + "loss": 1.0018, + "step": 7242 + }, + { + "epoch": 1.5450085324232083, + "grad_norm": 0.5697278894450061, + "learning_rate": 0.00015300334555613207, + "loss": 0.967, + "step": 7243 + }, + { + "epoch": 1.545221843003413, + "grad_norm": 0.5291744054096076, + "learning_rate": 0.00015299986479025646, + "loss": 0.9493, + "step": 7244 + }, + { + "epoch": 1.5454351535836177, + "grad_norm": 0.38049651487426717, + "learning_rate": 0.00015299638319838616, + "loss": 0.9851, + "step": 7245 + }, + { + "epoch": 1.5456484641638224, + "grad_norm": 0.3862830694550811, + "learning_rate": 0.00015299290078056054, + "loss": 0.9619, + "step": 7246 + }, + { + "epoch": 1.5458617747440273, + "grad_norm": 0.37072004805929887, + "learning_rate": 0.00015298941753681903, + "loss": 0.9183, + "step": 7247 + }, + { + "epoch": 1.5460750853242322, + "grad_norm": 0.2487261438813532, + "learning_rate": 0.00015298593346720104, + "loss": 0.9546, + "step": 7248 + }, + { + "epoch": 1.546288395904437, + "grad_norm": 0.2678849301397766, + "learning_rate": 0.00015298244857174596, + "loss": 0.96, + "step": 7249 + }, + { + "epoch": 1.5465017064846416, + "grad_norm": 0.3669464547863332, + "learning_rate": 0.00015297896285049328, + "loss": 0.9548, + "step": 7250 + }, + { + "epoch": 1.5467150170648463, + "grad_norm": 0.4656468792928061, + "learning_rate": 0.00015297547630348238, + "loss": 0.9923, + "step": 7251 + }, + { + "epoch": 1.5469283276450512, + "grad_norm": 0.38791951454184775, + "learning_rate": 0.00015297198893075273, + "loss": 0.9693, + "step": 7252 + }, + { + "epoch": 1.547141638225256, + "grad_norm": 0.37118610871639773, + "learning_rate": 0.00015296850073234382, + "loss": 0.9724, + "step": 7253 + }, + { + "epoch": 1.5473549488054608, + "grad_norm": 0.32444475918698756, + "learning_rate": 0.00015296501170829508, + "loss": 0.9737, + "step": 7254 + }, + { + "epoch": 1.5475682593856654, + "grad_norm": 0.2843506542164707, + "learning_rate": 0.00015296152185864605, + "loss": 0.967, + "step": 7255 + }, + { + "epoch": 1.5477815699658704, + "grad_norm": 0.28145789350737394, + "learning_rate": 0.00015295803118343612, + "loss": 0.9742, + "step": 7256 + }, + { + "epoch": 1.547994880546075, + "grad_norm": 0.29930458102719104, + "learning_rate": 0.00015295453968270487, + "loss": 0.9716, + "step": 7257 + }, + { + "epoch": 1.54820819112628, + "grad_norm": 0.2730534575201116, + "learning_rate": 0.00015295104735649177, + "loss": 0.9571, + "step": 7258 + }, + { + "epoch": 1.5484215017064846, + "grad_norm": 0.24100754342047526, + "learning_rate": 0.00015294755420483633, + "loss": 0.9843, + "step": 7259 + }, + { + "epoch": 1.5486348122866893, + "grad_norm": 0.29103854545412033, + "learning_rate": 0.00015294406022777808, + "loss": 0.9302, + "step": 7260 + }, + { + "epoch": 1.5488481228668942, + "grad_norm": 0.3008189555906062, + "learning_rate": 0.00015294056542535658, + "loss": 0.9643, + "step": 7261 + }, + { + "epoch": 1.5490614334470991, + "grad_norm": 0.2451769029019281, + "learning_rate": 0.00015293706979761136, + "loss": 0.9728, + "step": 7262 + }, + { + "epoch": 1.5492747440273038, + "grad_norm": 0.2687221261891965, + "learning_rate": 0.00015293357334458197, + "loss": 0.9334, + "step": 7263 + }, + { + "epoch": 1.5494880546075085, + "grad_norm": 0.332862038021682, + "learning_rate": 0.00015293007606630797, + "loss": 0.9624, + "step": 7264 + }, + { + "epoch": 1.5497013651877132, + "grad_norm": 0.47829504225694885, + "learning_rate": 0.0001529265779628289, + "loss": 0.9375, + "step": 7265 + }, + { + "epoch": 1.549914675767918, + "grad_norm": 0.4927694559336537, + "learning_rate": 0.0001529230790341844, + "loss": 0.9658, + "step": 7266 + }, + { + "epoch": 1.550127986348123, + "grad_norm": 0.613530126351456, + "learning_rate": 0.00015291957928041403, + "loss": 0.9572, + "step": 7267 + }, + { + "epoch": 1.5503412969283277, + "grad_norm": 0.6693837922419186, + "learning_rate": 0.0001529160787015574, + "loss": 0.9593, + "step": 7268 + }, + { + "epoch": 1.5505546075085324, + "grad_norm": 0.6756955120931741, + "learning_rate": 0.00015291257729765413, + "loss": 0.976, + "step": 7269 + }, + { + "epoch": 1.550767918088737, + "grad_norm": 0.653711357643179, + "learning_rate": 0.00015290907506874377, + "loss": 0.9511, + "step": 7270 + }, + { + "epoch": 1.550981228668942, + "grad_norm": 0.5813830852994141, + "learning_rate": 0.00015290557201486606, + "loss": 0.9746, + "step": 7271 + }, + { + "epoch": 1.5511945392491469, + "grad_norm": 0.4151811993140932, + "learning_rate": 0.00015290206813606052, + "loss": 0.9828, + "step": 7272 + }, + { + "epoch": 1.5514078498293515, + "grad_norm": 0.29279802671428834, + "learning_rate": 0.0001528985634323669, + "loss": 0.9804, + "step": 7273 + }, + { + "epoch": 1.5516211604095562, + "grad_norm": 0.3115052437769712, + "learning_rate": 0.00015289505790382478, + "loss": 0.9381, + "step": 7274 + }, + { + "epoch": 1.5518344709897611, + "grad_norm": 0.4422268881332657, + "learning_rate": 0.00015289155155047388, + "loss": 0.9631, + "step": 7275 + }, + { + "epoch": 1.552047781569966, + "grad_norm": 0.5161056796275337, + "learning_rate": 0.0001528880443723538, + "loss": 0.9478, + "step": 7276 + }, + { + "epoch": 1.5522610921501707, + "grad_norm": 0.5946645448841542, + "learning_rate": 0.0001528845363695043, + "loss": 0.9785, + "step": 7277 + }, + { + "epoch": 1.5524744027303754, + "grad_norm": 0.6400090680404106, + "learning_rate": 0.00015288102754196505, + "loss": 0.9445, + "step": 7278 + }, + { + "epoch": 1.55268771331058, + "grad_norm": 0.5333356494579681, + "learning_rate": 0.00015287751788977574, + "loss": 0.9464, + "step": 7279 + }, + { + "epoch": 1.552901023890785, + "grad_norm": 0.35430070471927705, + "learning_rate": 0.0001528740074129761, + "loss": 0.9475, + "step": 7280 + }, + { + "epoch": 1.55311433447099, + "grad_norm": 0.27890109086911263, + "learning_rate": 0.00015287049611160584, + "loss": 0.9833, + "step": 7281 + }, + { + "epoch": 1.5533276450511946, + "grad_norm": 0.2829509870394348, + "learning_rate": 0.00015286698398570467, + "loss": 0.9739, + "step": 7282 + }, + { + "epoch": 1.5535409556313993, + "grad_norm": 0.32036758233132756, + "learning_rate": 0.00015286347103531236, + "loss": 0.9076, + "step": 7283 + }, + { + "epoch": 1.553754266211604, + "grad_norm": 0.35724918728011434, + "learning_rate": 0.00015285995726046868, + "loss": 0.9492, + "step": 7284 + }, + { + "epoch": 1.5539675767918089, + "grad_norm": 0.4446248193460032, + "learning_rate": 0.0001528564426612133, + "loss": 0.9261, + "step": 7285 + }, + { + "epoch": 1.5541808873720138, + "grad_norm": 0.4954826314969989, + "learning_rate": 0.0001528529272375861, + "loss": 0.9727, + "step": 7286 + }, + { + "epoch": 1.5543941979522184, + "grad_norm": 0.4435989332022568, + "learning_rate": 0.00015284941098962678, + "loss": 0.9528, + "step": 7287 + }, + { + "epoch": 1.5546075085324231, + "grad_norm": 0.3440661025839218, + "learning_rate": 0.00015284589391737516, + "loss": 0.9508, + "step": 7288 + }, + { + "epoch": 1.554820819112628, + "grad_norm": 0.3004788519205526, + "learning_rate": 0.000152842376020871, + "loss": 0.9559, + "step": 7289 + }, + { + "epoch": 1.5550341296928327, + "grad_norm": 0.2909667451187176, + "learning_rate": 0.00015283885730015416, + "loss": 0.9458, + "step": 7290 + }, + { + "epoch": 1.5552474402730376, + "grad_norm": 0.2978408260290572, + "learning_rate": 0.0001528353377552644, + "loss": 0.9571, + "step": 7291 + }, + { + "epoch": 1.5554607508532423, + "grad_norm": 0.28391049147296404, + "learning_rate": 0.0001528318173862416, + "loss": 0.958, + "step": 7292 + }, + { + "epoch": 1.555674061433447, + "grad_norm": 0.29159364237669855, + "learning_rate": 0.00015282829619312554, + "loss": 0.9182, + "step": 7293 + }, + { + "epoch": 1.555887372013652, + "grad_norm": 0.28156002920605416, + "learning_rate": 0.00015282477417595606, + "loss": 0.957, + "step": 7294 + }, + { + "epoch": 1.5561006825938568, + "grad_norm": 0.2935123234182851, + "learning_rate": 0.00015282125133477307, + "loss": 0.9933, + "step": 7295 + }, + { + "epoch": 1.5563139931740615, + "grad_norm": 0.2622901946939813, + "learning_rate": 0.0001528177276696164, + "loss": 0.9516, + "step": 7296 + }, + { + "epoch": 1.5565273037542662, + "grad_norm": 0.27250240422886707, + "learning_rate": 0.00015281420318052588, + "loss": 0.9562, + "step": 7297 + }, + { + "epoch": 1.5567406143344709, + "grad_norm": 0.2871174746717561, + "learning_rate": 0.00015281067786754145, + "loss": 0.9927, + "step": 7298 + }, + { + "epoch": 1.5569539249146758, + "grad_norm": 0.2797801917979107, + "learning_rate": 0.00015280715173070296, + "loss": 0.9832, + "step": 7299 + }, + { + "epoch": 1.5571672354948807, + "grad_norm": 0.3182044812323974, + "learning_rate": 0.00015280362477005033, + "loss": 0.9589, + "step": 7300 + }, + { + "epoch": 1.5573805460750854, + "grad_norm": 0.33872401593549933, + "learning_rate": 0.00015280009698562347, + "loss": 0.9372, + "step": 7301 + }, + { + "epoch": 1.55759385665529, + "grad_norm": 0.31866225755685934, + "learning_rate": 0.00015279656837746227, + "loss": 0.972, + "step": 7302 + }, + { + "epoch": 1.5578071672354947, + "grad_norm": 0.24980364317014103, + "learning_rate": 0.00015279303894560672, + "loss": 0.9392, + "step": 7303 + }, + { + "epoch": 1.5580204778156996, + "grad_norm": 0.23142242287789017, + "learning_rate": 0.00015278950869009668, + "loss": 0.9794, + "step": 7304 + }, + { + "epoch": 1.5582337883959045, + "grad_norm": 0.24810060290687308, + "learning_rate": 0.0001527859776109721, + "loss": 0.9389, + "step": 7305 + }, + { + "epoch": 1.5584470989761092, + "grad_norm": 0.20679744953241969, + "learning_rate": 0.000152782445708273, + "loss": 0.9348, + "step": 7306 + }, + { + "epoch": 1.558660409556314, + "grad_norm": 0.29602226772660073, + "learning_rate": 0.0001527789129820393, + "loss": 0.9645, + "step": 7307 + }, + { + "epoch": 1.5588737201365188, + "grad_norm": 0.38498943608488345, + "learning_rate": 0.00015277537943231094, + "loss": 0.9587, + "step": 7308 + }, + { + "epoch": 1.5590870307167235, + "grad_norm": 0.39296223946750447, + "learning_rate": 0.00015277184505912798, + "loss": 0.9222, + "step": 7309 + }, + { + "epoch": 1.5593003412969284, + "grad_norm": 0.3554394281150259, + "learning_rate": 0.00015276830986253034, + "loss": 1.0019, + "step": 7310 + }, + { + "epoch": 1.559513651877133, + "grad_norm": 0.3227857846027047, + "learning_rate": 0.00015276477384255807, + "loss": 0.9479, + "step": 7311 + }, + { + "epoch": 1.5597269624573378, + "grad_norm": 0.3273919684801196, + "learning_rate": 0.00015276123699925113, + "loss": 0.9464, + "step": 7312 + }, + { + "epoch": 1.5599402730375427, + "grad_norm": 0.30059939887705833, + "learning_rate": 0.0001527576993326496, + "loss": 0.9397, + "step": 7313 + }, + { + "epoch": 1.5601535836177476, + "grad_norm": 0.34812057869408536, + "learning_rate": 0.00015275416084279347, + "loss": 0.9154, + "step": 7314 + }, + { + "epoch": 1.5603668941979523, + "grad_norm": 0.378220463783238, + "learning_rate": 0.00015275062152972276, + "loss": 0.9515, + "step": 7315 + }, + { + "epoch": 1.560580204778157, + "grad_norm": 0.45228657182695253, + "learning_rate": 0.00015274708139347758, + "loss": 0.9409, + "step": 7316 + }, + { + "epoch": 1.5607935153583616, + "grad_norm": 0.516253175077645, + "learning_rate": 0.00015274354043409794, + "loss": 0.9801, + "step": 7317 + }, + { + "epoch": 1.5610068259385665, + "grad_norm": 0.5006630663883693, + "learning_rate": 0.0001527399986516239, + "loss": 0.9061, + "step": 7318 + }, + { + "epoch": 1.5612201365187715, + "grad_norm": 0.41957927812939794, + "learning_rate": 0.00015273645604609558, + "loss": 0.9374, + "step": 7319 + }, + { + "epoch": 1.5614334470989761, + "grad_norm": 0.30397605031038816, + "learning_rate": 0.00015273291261755302, + "loss": 0.9307, + "step": 7320 + }, + { + "epoch": 1.5616467576791808, + "grad_norm": 0.23567225325829574, + "learning_rate": 0.00015272936836603632, + "loss": 0.9368, + "step": 7321 + }, + { + "epoch": 1.5618600682593855, + "grad_norm": 0.3117990824894179, + "learning_rate": 0.0001527258232915856, + "loss": 0.9801, + "step": 7322 + }, + { + "epoch": 1.5620733788395904, + "grad_norm": 0.37332975588133094, + "learning_rate": 0.00015272227739424098, + "loss": 1.0181, + "step": 7323 + }, + { + "epoch": 1.5622866894197953, + "grad_norm": 0.4213185335638088, + "learning_rate": 0.00015271873067404255, + "loss": 0.9781, + "step": 7324 + }, + { + "epoch": 1.5625, + "grad_norm": 0.5774537371815658, + "learning_rate": 0.00015271518313103049, + "loss": 0.955, + "step": 7325 + }, + { + "epoch": 1.5627133105802047, + "grad_norm": 0.6942256778294479, + "learning_rate": 0.00015271163476524485, + "loss": 0.9723, + "step": 7326 + }, + { + "epoch": 1.5629266211604096, + "grad_norm": 0.6041404298672174, + "learning_rate": 0.0001527080855767259, + "loss": 0.92, + "step": 7327 + }, + { + "epoch": 1.5631399317406145, + "grad_norm": 0.38600951147454526, + "learning_rate": 0.0001527045355655137, + "loss": 0.9674, + "step": 7328 + }, + { + "epoch": 1.5633532423208192, + "grad_norm": 0.3311973115691042, + "learning_rate": 0.0001527009847316485, + "loss": 0.9579, + "step": 7329 + }, + { + "epoch": 1.5635665529010239, + "grad_norm": 0.3473796288086355, + "learning_rate": 0.0001526974330751704, + "loss": 0.9715, + "step": 7330 + }, + { + "epoch": 1.5637798634812285, + "grad_norm": 0.35363511085605726, + "learning_rate": 0.00015269388059611965, + "loss": 0.9722, + "step": 7331 + }, + { + "epoch": 1.5639931740614335, + "grad_norm": 0.3994830678739855, + "learning_rate": 0.0001526903272945364, + "loss": 0.9577, + "step": 7332 + }, + { + "epoch": 1.5642064846416384, + "grad_norm": 0.5124947414444424, + "learning_rate": 0.00015268677317046086, + "loss": 0.9843, + "step": 7333 + }, + { + "epoch": 1.564419795221843, + "grad_norm": 0.5915884039517896, + "learning_rate": 0.00015268321822393326, + "loss": 0.9726, + "step": 7334 + }, + { + "epoch": 1.5646331058020477, + "grad_norm": 0.5538641252862094, + "learning_rate": 0.00015267966245499385, + "loss": 0.9362, + "step": 7335 + }, + { + "epoch": 1.5648464163822524, + "grad_norm": 0.4229134450285546, + "learning_rate": 0.00015267610586368284, + "loss": 0.9333, + "step": 7336 + }, + { + "epoch": 1.5650597269624573, + "grad_norm": 0.32657911675864654, + "learning_rate": 0.00015267254845004042, + "loss": 0.9301, + "step": 7337 + }, + { + "epoch": 1.5652730375426622, + "grad_norm": 0.2719390940002777, + "learning_rate": 0.00015266899021410693, + "loss": 0.948, + "step": 7338 + }, + { + "epoch": 1.565486348122867, + "grad_norm": 0.27754311873601206, + "learning_rate": 0.00015266543115592259, + "loss": 0.9038, + "step": 7339 + }, + { + "epoch": 1.5656996587030716, + "grad_norm": 0.30318330672938354, + "learning_rate": 0.00015266187127552767, + "loss": 0.974, + "step": 7340 + }, + { + "epoch": 1.5659129692832765, + "grad_norm": 0.3660073687077367, + "learning_rate": 0.00015265831057296246, + "loss": 0.9337, + "step": 7341 + }, + { + "epoch": 1.5661262798634812, + "grad_norm": 0.4696625854153182, + "learning_rate": 0.00015265474904826724, + "loss": 0.983, + "step": 7342 + }, + { + "epoch": 1.566339590443686, + "grad_norm": 0.5513785978028033, + "learning_rate": 0.0001526511867014823, + "loss": 0.974, + "step": 7343 + }, + { + "epoch": 1.5665529010238908, + "grad_norm": 0.5690419225276041, + "learning_rate": 0.00015264762353264796, + "loss": 0.9476, + "step": 7344 + }, + { + "epoch": 1.5667662116040955, + "grad_norm": 0.5675007861554004, + "learning_rate": 0.00015264405954180458, + "loss": 0.9779, + "step": 7345 + }, + { + "epoch": 1.5669795221843004, + "grad_norm": 0.5208367676215171, + "learning_rate": 0.0001526404947289924, + "loss": 0.9615, + "step": 7346 + }, + { + "epoch": 1.5671928327645053, + "grad_norm": 0.44955946392383966, + "learning_rate": 0.00015263692909425183, + "loss": 0.9499, + "step": 7347 + }, + { + "epoch": 1.56740614334471, + "grad_norm": 0.3615139915146935, + "learning_rate": 0.00015263336263762316, + "loss": 0.942, + "step": 7348 + }, + { + "epoch": 1.5676194539249146, + "grad_norm": 0.3156060155740434, + "learning_rate": 0.00015262979535914678, + "loss": 0.9659, + "step": 7349 + }, + { + "epoch": 1.5678327645051193, + "grad_norm": 0.26960854292815706, + "learning_rate": 0.00015262622725886304, + "loss": 0.93, + "step": 7350 + }, + { + "epoch": 1.5680460750853242, + "grad_norm": 0.2971460262587907, + "learning_rate": 0.0001526226583368123, + "loss": 0.9324, + "step": 7351 + }, + { + "epoch": 1.5682593856655291, + "grad_norm": 0.4019054889336616, + "learning_rate": 0.000152619088593035, + "loss": 0.9573, + "step": 7352 + }, + { + "epoch": 1.5684726962457338, + "grad_norm": 0.535533797735334, + "learning_rate": 0.00015261551802757148, + "loss": 0.9464, + "step": 7353 + }, + { + "epoch": 1.5686860068259385, + "grad_norm": 0.593563145831169, + "learning_rate": 0.00015261194664046212, + "loss": 0.9755, + "step": 7354 + }, + { + "epoch": 1.5688993174061432, + "grad_norm": 0.5988672254125831, + "learning_rate": 0.00015260837443174742, + "loss": 0.9355, + "step": 7355 + }, + { + "epoch": 1.569112627986348, + "grad_norm": 0.5649640027093944, + "learning_rate": 0.0001526048014014677, + "loss": 0.9716, + "step": 7356 + }, + { + "epoch": 1.569325938566553, + "grad_norm": 0.5057774883311142, + "learning_rate": 0.00015260122754966345, + "loss": 0.9788, + "step": 7357 + }, + { + "epoch": 1.5695392491467577, + "grad_norm": 0.4267975389681246, + "learning_rate": 0.00015259765287637506, + "loss": 0.9247, + "step": 7358 + }, + { + "epoch": 1.5697525597269624, + "grad_norm": 0.3530088610915355, + "learning_rate": 0.00015259407738164304, + "loss": 0.9601, + "step": 7359 + }, + { + "epoch": 1.5699658703071673, + "grad_norm": 0.34806417561003217, + "learning_rate": 0.0001525905010655078, + "loss": 0.9915, + "step": 7360 + }, + { + "epoch": 1.570179180887372, + "grad_norm": 0.4489550536931533, + "learning_rate": 0.00015258692392800978, + "loss": 0.9747, + "step": 7361 + }, + { + "epoch": 1.5703924914675769, + "grad_norm": 0.5114555597756325, + "learning_rate": 0.00015258334596918955, + "loss": 0.9506, + "step": 7362 + }, + { + "epoch": 1.5706058020477816, + "grad_norm": 0.577517521280557, + "learning_rate": 0.00015257976718908753, + "loss": 0.9123, + "step": 7363 + }, + { + "epoch": 1.5708191126279862, + "grad_norm": 0.6755082638996839, + "learning_rate": 0.00015257618758774422, + "loss": 0.9536, + "step": 7364 + }, + { + "epoch": 1.5710324232081911, + "grad_norm": 0.63719545210725, + "learning_rate": 0.0001525726071652001, + "loss": 0.9606, + "step": 7365 + }, + { + "epoch": 1.571245733788396, + "grad_norm": 0.5583233900856543, + "learning_rate": 0.00015256902592149574, + "loss": 0.9309, + "step": 7366 + }, + { + "epoch": 1.5714590443686007, + "grad_norm": 0.45339897983060334, + "learning_rate": 0.00015256544385667165, + "loss": 0.921, + "step": 7367 + }, + { + "epoch": 1.5716723549488054, + "grad_norm": 0.37244123986382144, + "learning_rate": 0.0001525618609707683, + "loss": 0.9746, + "step": 7368 + }, + { + "epoch": 1.57188566552901, + "grad_norm": 0.3996218663764291, + "learning_rate": 0.00015255827726382628, + "loss": 0.9785, + "step": 7369 + }, + { + "epoch": 1.572098976109215, + "grad_norm": 0.38615698891304717, + "learning_rate": 0.00015255469273588613, + "loss": 0.9608, + "step": 7370 + }, + { + "epoch": 1.57231228668942, + "grad_norm": 0.2992535776835535, + "learning_rate": 0.00015255110738698845, + "loss": 0.9699, + "step": 7371 + }, + { + "epoch": 1.5725255972696246, + "grad_norm": 0.39094172020487583, + "learning_rate": 0.00015254752121717373, + "loss": 0.9315, + "step": 7372 + }, + { + "epoch": 1.5727389078498293, + "grad_norm": 0.49703345581024877, + "learning_rate": 0.00015254393422648257, + "loss": 0.9553, + "step": 7373 + }, + { + "epoch": 1.572952218430034, + "grad_norm": 0.5019643328845266, + "learning_rate": 0.00015254034641495564, + "loss": 0.9348, + "step": 7374 + }, + { + "epoch": 1.5731655290102389, + "grad_norm": 0.46344582365034864, + "learning_rate": 0.0001525367577826334, + "loss": 0.9464, + "step": 7375 + }, + { + "epoch": 1.5733788395904438, + "grad_norm": 0.4229612414379578, + "learning_rate": 0.00015253316832955656, + "loss": 0.9696, + "step": 7376 + }, + { + "epoch": 1.5735921501706485, + "grad_norm": 0.37728297995244764, + "learning_rate": 0.0001525295780557657, + "loss": 0.9818, + "step": 7377 + }, + { + "epoch": 1.5738054607508531, + "grad_norm": 0.34932576496588486, + "learning_rate": 0.00015252598696130143, + "loss": 0.9349, + "step": 7378 + }, + { + "epoch": 1.574018771331058, + "grad_norm": 0.33746499256395734, + "learning_rate": 0.00015252239504620442, + "loss": 0.9631, + "step": 7379 + }, + { + "epoch": 1.574232081911263, + "grad_norm": 0.3613380591245148, + "learning_rate": 0.00015251880231051528, + "loss": 0.9277, + "step": 7380 + }, + { + "epoch": 1.5744453924914676, + "grad_norm": 0.37011753594492375, + "learning_rate": 0.00015251520875427468, + "loss": 0.99, + "step": 7381 + }, + { + "epoch": 1.5746587030716723, + "grad_norm": 0.40592894000318896, + "learning_rate": 0.00015251161437752325, + "loss": 0.9572, + "step": 7382 + }, + { + "epoch": 1.574872013651877, + "grad_norm": 0.4283232876556319, + "learning_rate": 0.00015250801918030173, + "loss": 0.9768, + "step": 7383 + }, + { + "epoch": 1.575085324232082, + "grad_norm": 0.5117203349121245, + "learning_rate": 0.0001525044231626507, + "loss": 0.9589, + "step": 7384 + }, + { + "epoch": 1.5752986348122868, + "grad_norm": 0.5466711262398284, + "learning_rate": 0.00015250082632461092, + "loss": 0.9369, + "step": 7385 + }, + { + "epoch": 1.5755119453924915, + "grad_norm": 0.4717319419611402, + "learning_rate": 0.00015249722866622308, + "loss": 0.9355, + "step": 7386 + }, + { + "epoch": 1.5757252559726962, + "grad_norm": 0.32895716440216577, + "learning_rate": 0.00015249363018752786, + "loss": 0.9945, + "step": 7387 + }, + { + "epoch": 1.5759385665529009, + "grad_norm": 0.3022048122223626, + "learning_rate": 0.000152490030888566, + "loss": 0.9476, + "step": 7388 + }, + { + "epoch": 1.5761518771331058, + "grad_norm": 0.41736373960482953, + "learning_rate": 0.00015248643076937825, + "loss": 0.9776, + "step": 7389 + }, + { + "epoch": 1.5763651877133107, + "grad_norm": 0.39798477598177706, + "learning_rate": 0.0001524828298300053, + "loss": 0.9611, + "step": 7390 + }, + { + "epoch": 1.5765784982935154, + "grad_norm": 0.3487286117824743, + "learning_rate": 0.00015247922807048788, + "loss": 0.9325, + "step": 7391 + }, + { + "epoch": 1.57679180887372, + "grad_norm": 0.3916207454332207, + "learning_rate": 0.00015247562549086682, + "loss": 0.9666, + "step": 7392 + }, + { + "epoch": 1.577005119453925, + "grad_norm": 0.3999634968499589, + "learning_rate": 0.00015247202209118282, + "loss": 0.9306, + "step": 7393 + }, + { + "epoch": 1.5772184300341296, + "grad_norm": 0.37674186245130026, + "learning_rate": 0.00015246841787147668, + "loss": 0.9569, + "step": 7394 + }, + { + "epoch": 1.5774317406143346, + "grad_norm": 0.40501903420365293, + "learning_rate": 0.00015246481283178918, + "loss": 0.9505, + "step": 7395 + }, + { + "epoch": 1.5776450511945392, + "grad_norm": 0.3918273603683562, + "learning_rate": 0.00015246120697216107, + "loss": 0.9475, + "step": 7396 + }, + { + "epoch": 1.577858361774744, + "grad_norm": 0.331847937969728, + "learning_rate": 0.0001524576002926332, + "loss": 0.9627, + "step": 7397 + }, + { + "epoch": 1.5780716723549488, + "grad_norm": 0.3189160927524678, + "learning_rate": 0.0001524539927932464, + "loss": 0.9695, + "step": 7398 + }, + { + "epoch": 1.5782849829351537, + "grad_norm": 0.3837669301356153, + "learning_rate": 0.00015245038447404138, + "loss": 0.9476, + "step": 7399 + }, + { + "epoch": 1.5784982935153584, + "grad_norm": 0.38688468207470095, + "learning_rate": 0.00015244677533505908, + "loss": 0.9729, + "step": 7400 + }, + { + "epoch": 1.578711604095563, + "grad_norm": 0.40665753899111007, + "learning_rate": 0.00015244316537634033, + "loss": 0.9516, + "step": 7401 + }, + { + "epoch": 1.5789249146757678, + "grad_norm": 0.4992238879657005, + "learning_rate": 0.0001524395545979259, + "loss": 0.9359, + "step": 7402 + }, + { + "epoch": 1.5791382252559727, + "grad_norm": 0.6106058586265116, + "learning_rate": 0.00015243594299985672, + "loss": 0.9544, + "step": 7403 + }, + { + "epoch": 1.5793515358361776, + "grad_norm": 0.7179446117111815, + "learning_rate": 0.0001524323305821736, + "loss": 0.9634, + "step": 7404 + }, + { + "epoch": 1.5795648464163823, + "grad_norm": 0.7590633344026467, + "learning_rate": 0.00015242871734491743, + "loss": 0.9563, + "step": 7405 + }, + { + "epoch": 1.579778156996587, + "grad_norm": 0.6373808168477996, + "learning_rate": 0.00015242510328812914, + "loss": 0.9663, + "step": 7406 + }, + { + "epoch": 1.5799914675767917, + "grad_norm": 0.46730477796352754, + "learning_rate": 0.0001524214884118496, + "loss": 0.9378, + "step": 7407 + }, + { + "epoch": 1.5802047781569966, + "grad_norm": 0.3396561000923038, + "learning_rate": 0.00015241787271611965, + "loss": 0.9966, + "step": 7408 + }, + { + "epoch": 1.5804180887372015, + "grad_norm": 0.24556243736993402, + "learning_rate": 0.0001524142562009803, + "loss": 0.9523, + "step": 7409 + }, + { + "epoch": 1.5806313993174061, + "grad_norm": 0.3220354884848231, + "learning_rate": 0.00015241063886647238, + "loss": 0.9544, + "step": 7410 + }, + { + "epoch": 1.5808447098976108, + "grad_norm": 0.42980991935258595, + "learning_rate": 0.0001524070207126369, + "loss": 0.9417, + "step": 7411 + }, + { + "epoch": 1.5810580204778157, + "grad_norm": 0.5333556484620023, + "learning_rate": 0.00015240340173951474, + "loss": 0.9354, + "step": 7412 + }, + { + "epoch": 1.5812713310580204, + "grad_norm": 0.5959150258488206, + "learning_rate": 0.0001523997819471469, + "loss": 0.9798, + "step": 7413 + }, + { + "epoch": 1.5814846416382253, + "grad_norm": 0.6201180478527238, + "learning_rate": 0.00015239616133557428, + "loss": 0.9846, + "step": 7414 + }, + { + "epoch": 1.58169795221843, + "grad_norm": 0.6074207084680123, + "learning_rate": 0.00015239253990483788, + "loss": 0.9288, + "step": 7415 + }, + { + "epoch": 1.5819112627986347, + "grad_norm": 0.5208044376627337, + "learning_rate": 0.0001523889176549787, + "loss": 0.9464, + "step": 7416 + }, + { + "epoch": 1.5821245733788396, + "grad_norm": 0.4574168102905286, + "learning_rate": 0.00015238529458603766, + "loss": 0.9424, + "step": 7417 + }, + { + "epoch": 1.5823378839590445, + "grad_norm": 0.40266737199763514, + "learning_rate": 0.0001523816706980558, + "loss": 0.9429, + "step": 7418 + }, + { + "epoch": 1.5825511945392492, + "grad_norm": 0.3121262923029344, + "learning_rate": 0.00015237804599107415, + "loss": 0.9291, + "step": 7419 + }, + { + "epoch": 1.5827645051194539, + "grad_norm": 0.30830871491139233, + "learning_rate": 0.00015237442046513367, + "loss": 0.9467, + "step": 7420 + }, + { + "epoch": 1.5829778156996586, + "grad_norm": 0.2756696201628278, + "learning_rate": 0.00015237079412027543, + "loss": 0.9154, + "step": 7421 + }, + { + "epoch": 1.5831911262798635, + "grad_norm": 0.25238066530196374, + "learning_rate": 0.0001523671669565404, + "loss": 0.9534, + "step": 7422 + }, + { + "epoch": 1.5834044368600684, + "grad_norm": 0.35492871006869214, + "learning_rate": 0.00015236353897396968, + "loss": 0.9407, + "step": 7423 + }, + { + "epoch": 1.583617747440273, + "grad_norm": 0.38447797535247613, + "learning_rate": 0.0001523599101726043, + "loss": 0.9662, + "step": 7424 + }, + { + "epoch": 1.5838310580204777, + "grad_norm": 0.2691358177169355, + "learning_rate": 0.0001523562805524853, + "loss": 0.9329, + "step": 7425 + }, + { + "epoch": 1.5840443686006824, + "grad_norm": 0.33304547532558443, + "learning_rate": 0.00015235265011365377, + "loss": 0.9497, + "step": 7426 + }, + { + "epoch": 1.5842576791808873, + "grad_norm": 0.34007795405628016, + "learning_rate": 0.0001523490188561508, + "loss": 0.942, + "step": 7427 + }, + { + "epoch": 1.5844709897610922, + "grad_norm": 0.34552613293916584, + "learning_rate": 0.00015234538678001746, + "loss": 0.9673, + "step": 7428 + }, + { + "epoch": 1.584684300341297, + "grad_norm": 0.30807737175309957, + "learning_rate": 0.00015234175388529486, + "loss": 0.9854, + "step": 7429 + }, + { + "epoch": 1.5848976109215016, + "grad_norm": 0.3374838867974864, + "learning_rate": 0.0001523381201720241, + "loss": 0.928, + "step": 7430 + }, + { + "epoch": 1.5851109215017065, + "grad_norm": 0.3876645093071866, + "learning_rate": 0.0001523344856402463, + "loss": 0.9572, + "step": 7431 + }, + { + "epoch": 1.5853242320819114, + "grad_norm": 0.3300473485669853, + "learning_rate": 0.00015233085029000255, + "loss": 0.9319, + "step": 7432 + }, + { + "epoch": 1.585537542662116, + "grad_norm": 0.2716696026054633, + "learning_rate": 0.00015232721412133403, + "loss": 0.9327, + "step": 7433 + }, + { + "epoch": 1.5857508532423208, + "grad_norm": 0.21110641397838906, + "learning_rate": 0.00015232357713428185, + "loss": 0.9136, + "step": 7434 + }, + { + "epoch": 1.5859641638225255, + "grad_norm": 0.2785324109988843, + "learning_rate": 0.0001523199393288872, + "loss": 0.9852, + "step": 7435 + }, + { + "epoch": 1.5861774744027304, + "grad_norm": 0.29376626221257085, + "learning_rate": 0.0001523163007051912, + "loss": 0.9653, + "step": 7436 + }, + { + "epoch": 1.5863907849829353, + "grad_norm": 0.2698646236819046, + "learning_rate": 0.00015231266126323505, + "loss": 0.9699, + "step": 7437 + }, + { + "epoch": 1.58660409556314, + "grad_norm": 0.30877674261469756, + "learning_rate": 0.00015230902100305994, + "loss": 0.9897, + "step": 7438 + }, + { + "epoch": 1.5868174061433447, + "grad_norm": 0.33379139282451326, + "learning_rate": 0.00015230537992470702, + "loss": 0.9663, + "step": 7439 + }, + { + "epoch": 1.5870307167235493, + "grad_norm": 0.3882992491386068, + "learning_rate": 0.0001523017380282175, + "loss": 0.942, + "step": 7440 + }, + { + "epoch": 1.5872440273037542, + "grad_norm": 0.4449618669068021, + "learning_rate": 0.00015229809531363262, + "loss": 0.9322, + "step": 7441 + }, + { + "epoch": 1.5874573378839592, + "grad_norm": 0.4424974195984052, + "learning_rate": 0.00015229445178099356, + "loss": 0.9367, + "step": 7442 + }, + { + "epoch": 1.5876706484641638, + "grad_norm": 0.3649591370342498, + "learning_rate": 0.00015229080743034158, + "loss": 0.9699, + "step": 7443 + }, + { + "epoch": 1.5878839590443685, + "grad_norm": 0.26922256869818856, + "learning_rate": 0.0001522871622617179, + "loss": 0.9493, + "step": 7444 + }, + { + "epoch": 1.5880972696245734, + "grad_norm": 0.27982970951125724, + "learning_rate": 0.00015228351627516375, + "loss": 0.9483, + "step": 7445 + }, + { + "epoch": 1.588310580204778, + "grad_norm": 0.3518328563695515, + "learning_rate": 0.0001522798694707204, + "loss": 0.9486, + "step": 7446 + }, + { + "epoch": 1.588523890784983, + "grad_norm": 0.461600956625692, + "learning_rate": 0.0001522762218484291, + "loss": 0.947, + "step": 7447 + }, + { + "epoch": 1.5887372013651877, + "grad_norm": 0.5332904967773082, + "learning_rate": 0.00015227257340833113, + "loss": 0.9244, + "step": 7448 + }, + { + "epoch": 1.5889505119453924, + "grad_norm": 0.5517122418053233, + "learning_rate": 0.00015226892415046782, + "loss": 0.9532, + "step": 7449 + }, + { + "epoch": 1.5891638225255973, + "grad_norm": 0.4467507511770518, + "learning_rate": 0.00015226527407488036, + "loss": 0.9479, + "step": 7450 + }, + { + "epoch": 1.5893771331058022, + "grad_norm": 0.3396223072545123, + "learning_rate": 0.00015226162318161013, + "loss": 0.9489, + "step": 7451 + }, + { + "epoch": 1.5895904436860069, + "grad_norm": 0.3221510495505429, + "learning_rate": 0.00015225797147069844, + "loss": 0.9169, + "step": 7452 + }, + { + "epoch": 1.5898037542662116, + "grad_norm": 0.2999843792214919, + "learning_rate": 0.00015225431894218657, + "loss": 0.9469, + "step": 7453 + }, + { + "epoch": 1.5900170648464163, + "grad_norm": 0.3682394496953864, + "learning_rate": 0.0001522506655961159, + "loss": 0.9378, + "step": 7454 + }, + { + "epoch": 1.5902303754266212, + "grad_norm": 0.476471224039652, + "learning_rate": 0.00015224701143252766, + "loss": 0.972, + "step": 7455 + }, + { + "epoch": 1.590443686006826, + "grad_norm": 0.49735972005431117, + "learning_rate": 0.00015224335645146333, + "loss": 0.9545, + "step": 7456 + }, + { + "epoch": 1.5906569965870307, + "grad_norm": 0.42435721962425765, + "learning_rate": 0.00015223970065296417, + "loss": 0.9247, + "step": 7457 + }, + { + "epoch": 1.5908703071672354, + "grad_norm": 0.4831608491001175, + "learning_rate": 0.0001522360440370716, + "loss": 0.9736, + "step": 7458 + }, + { + "epoch": 1.5910836177474401, + "grad_norm": 0.5185222490879743, + "learning_rate": 0.000152232386603827, + "loss": 0.9733, + "step": 7459 + }, + { + "epoch": 1.591296928327645, + "grad_norm": 0.49114140842828924, + "learning_rate": 0.0001522287283532717, + "loss": 0.9394, + "step": 7460 + }, + { + "epoch": 1.59151023890785, + "grad_norm": 0.4885723474754413, + "learning_rate": 0.00015222506928544712, + "loss": 0.9525, + "step": 7461 + }, + { + "epoch": 1.5917235494880546, + "grad_norm": 0.41782389359432637, + "learning_rate": 0.00015222140940039468, + "loss": 0.9144, + "step": 7462 + }, + { + "epoch": 1.5919368600682593, + "grad_norm": 0.3281003175949948, + "learning_rate": 0.00015221774869815578, + "loss": 0.9864, + "step": 7463 + }, + { + "epoch": 1.5921501706484642, + "grad_norm": 0.3867281093733129, + "learning_rate": 0.00015221408717877184, + "loss": 0.9536, + "step": 7464 + }, + { + "epoch": 1.5923634812286689, + "grad_norm": 0.3656276219689345, + "learning_rate": 0.00015221042484228427, + "loss": 0.9496, + "step": 7465 + }, + { + "epoch": 1.5925767918088738, + "grad_norm": 0.3055828403136901, + "learning_rate": 0.00015220676168873455, + "loss": 0.946, + "step": 7466 + }, + { + "epoch": 1.5927901023890785, + "grad_norm": 0.3680507077222808, + "learning_rate": 0.0001522030977181641, + "loss": 0.9669, + "step": 7467 + }, + { + "epoch": 1.5930034129692832, + "grad_norm": 0.42092106928914425, + "learning_rate": 0.0001521994329306144, + "loss": 0.917, + "step": 7468 + }, + { + "epoch": 1.593216723549488, + "grad_norm": 0.5629736834081657, + "learning_rate": 0.0001521957673261269, + "loss": 0.9282, + "step": 7469 + }, + { + "epoch": 1.593430034129693, + "grad_norm": 0.6301207753771478, + "learning_rate": 0.00015219210090474303, + "loss": 0.9963, + "step": 7470 + }, + { + "epoch": 1.5936433447098977, + "grad_norm": 0.5997058885219111, + "learning_rate": 0.00015218843366650436, + "loss": 0.9407, + "step": 7471 + }, + { + "epoch": 1.5938566552901023, + "grad_norm": 0.5372237329338628, + "learning_rate": 0.00015218476561145236, + "loss": 0.9712, + "step": 7472 + }, + { + "epoch": 1.594069965870307, + "grad_norm": 0.45970269104895983, + "learning_rate": 0.0001521810967396285, + "loss": 0.9581, + "step": 7473 + }, + { + "epoch": 1.594283276450512, + "grad_norm": 0.3252455324896661, + "learning_rate": 0.0001521774270510743, + "loss": 0.9397, + "step": 7474 + }, + { + "epoch": 1.5944965870307168, + "grad_norm": 0.27806818546862316, + "learning_rate": 0.00015217375654583135, + "loss": 0.9578, + "step": 7475 + }, + { + "epoch": 1.5947098976109215, + "grad_norm": 0.35918754066554753, + "learning_rate": 0.0001521700852239411, + "loss": 0.9631, + "step": 7476 + }, + { + "epoch": 1.5949232081911262, + "grad_norm": 0.38796097495374565, + "learning_rate": 0.00015216641308544513, + "loss": 0.9537, + "step": 7477 + }, + { + "epoch": 1.5951365187713311, + "grad_norm": 0.36336610984201984, + "learning_rate": 0.00015216274013038497, + "loss": 0.9717, + "step": 7478 + }, + { + "epoch": 1.5953498293515358, + "grad_norm": 0.34970408356349486, + "learning_rate": 0.0001521590663588022, + "loss": 0.9524, + "step": 7479 + }, + { + "epoch": 1.5955631399317407, + "grad_norm": 0.31702237441649156, + "learning_rate": 0.0001521553917707384, + "loss": 0.9666, + "step": 7480 + }, + { + "epoch": 1.5957764505119454, + "grad_norm": 0.38438298347137545, + "learning_rate": 0.0001521517163662351, + "loss": 0.9913, + "step": 7481 + }, + { + "epoch": 1.59598976109215, + "grad_norm": 0.3694106185325403, + "learning_rate": 0.00015214804014533392, + "loss": 0.9459, + "step": 7482 + }, + { + "epoch": 1.596203071672355, + "grad_norm": 0.38168365048501257, + "learning_rate": 0.00015214436310807646, + "loss": 0.9222, + "step": 7483 + }, + { + "epoch": 1.5964163822525599, + "grad_norm": 0.4009522197667447, + "learning_rate": 0.0001521406852545043, + "loss": 0.9419, + "step": 7484 + }, + { + "epoch": 1.5966296928327646, + "grad_norm": 0.3928630320373928, + "learning_rate": 0.0001521370065846591, + "loss": 0.9748, + "step": 7485 + }, + { + "epoch": 1.5968430034129693, + "grad_norm": 0.36831748952524007, + "learning_rate": 0.00015213332709858245, + "loss": 0.9405, + "step": 7486 + }, + { + "epoch": 1.597056313993174, + "grad_norm": 0.36822783042463747, + "learning_rate": 0.00015212964679631598, + "loss": 0.9519, + "step": 7487 + }, + { + "epoch": 1.5972696245733788, + "grad_norm": 0.30412856840053437, + "learning_rate": 0.00015212596567790136, + "loss": 0.9394, + "step": 7488 + }, + { + "epoch": 1.5974829351535837, + "grad_norm": 0.3013589048921606, + "learning_rate": 0.00015212228374338022, + "loss": 0.9409, + "step": 7489 + }, + { + "epoch": 1.5976962457337884, + "grad_norm": 0.237205821161698, + "learning_rate": 0.0001521186009927942, + "loss": 0.9595, + "step": 7490 + }, + { + "epoch": 1.5979095563139931, + "grad_norm": 0.22104577097971836, + "learning_rate": 0.00015211491742618503, + "loss": 0.9276, + "step": 7491 + }, + { + "epoch": 1.5981228668941978, + "grad_norm": 0.25708110901996367, + "learning_rate": 0.00015211123304359434, + "loss": 0.9464, + "step": 7492 + }, + { + "epoch": 1.5983361774744027, + "grad_norm": 0.2426810462107846, + "learning_rate": 0.00015210754784506384, + "loss": 0.9593, + "step": 7493 + }, + { + "epoch": 1.5985494880546076, + "grad_norm": 0.2545247522622294, + "learning_rate": 0.0001521038618306352, + "loss": 0.9617, + "step": 7494 + }, + { + "epoch": 1.5987627986348123, + "grad_norm": 0.3098465345022166, + "learning_rate": 0.0001521001750003502, + "loss": 0.9424, + "step": 7495 + }, + { + "epoch": 1.598976109215017, + "grad_norm": 0.33134005394464616, + "learning_rate": 0.00015209648735425046, + "loss": 0.9277, + "step": 7496 + }, + { + "epoch": 1.599189419795222, + "grad_norm": 0.3351605487618286, + "learning_rate": 0.00015209279889237778, + "loss": 0.9332, + "step": 7497 + }, + { + "epoch": 1.5994027303754266, + "grad_norm": 0.3276542935895935, + "learning_rate": 0.00015208910961477388, + "loss": 0.9702, + "step": 7498 + }, + { + "epoch": 1.5996160409556315, + "grad_norm": 0.32308006125087335, + "learning_rate": 0.00015208541952148045, + "loss": 0.9305, + "step": 7499 + }, + { + "epoch": 1.5998293515358362, + "grad_norm": 0.32693424346402217, + "learning_rate": 0.00015208172861253932, + "loss": 1.005, + "step": 7500 + }, + { + "epoch": 1.6000426621160408, + "grad_norm": 0.32910150902967966, + "learning_rate": 0.00015207803688799222, + "loss": 0.9772, + "step": 7501 + }, + { + "epoch": 1.6002559726962458, + "grad_norm": 0.29052981266905104, + "learning_rate": 0.0001520743443478809, + "loss": 0.9768, + "step": 7502 + }, + { + "epoch": 1.6004692832764507, + "grad_norm": 0.2935541499540017, + "learning_rate": 0.00015207065099224714, + "loss": 0.9452, + "step": 7503 + }, + { + "epoch": 1.6006825938566553, + "grad_norm": 0.35241662594622286, + "learning_rate": 0.00015206695682113279, + "loss": 0.9576, + "step": 7504 + }, + { + "epoch": 1.60089590443686, + "grad_norm": 0.3203595577407072, + "learning_rate": 0.00015206326183457962, + "loss": 0.9518, + "step": 7505 + }, + { + "epoch": 1.6011092150170647, + "grad_norm": 0.3618905899806468, + "learning_rate": 0.0001520595660326294, + "loss": 0.9373, + "step": 7506 + }, + { + "epoch": 1.6013225255972696, + "grad_norm": 0.4554928732730621, + "learning_rate": 0.000152055869415324, + "loss": 0.9297, + "step": 7507 + }, + { + "epoch": 1.6015358361774745, + "grad_norm": 0.5031322679147622, + "learning_rate": 0.00015205217198270522, + "loss": 0.9162, + "step": 7508 + }, + { + "epoch": 1.6017491467576792, + "grad_norm": 0.5442363694955175, + "learning_rate": 0.0001520484737348149, + "loss": 0.9523, + "step": 7509 + }, + { + "epoch": 1.601962457337884, + "grad_norm": 0.567623980401491, + "learning_rate": 0.00015204477467169487, + "loss": 0.9543, + "step": 7510 + }, + { + "epoch": 1.6021757679180886, + "grad_norm": 0.6143958071108528, + "learning_rate": 0.000152041074793387, + "loss": 0.9591, + "step": 7511 + }, + { + "epoch": 1.6023890784982935, + "grad_norm": 0.6886657976859665, + "learning_rate": 0.00015203737409993318, + "loss": 0.9448, + "step": 7512 + }, + { + "epoch": 1.6026023890784984, + "grad_norm": 0.7355103966751738, + "learning_rate": 0.00015203367259137527, + "loss": 0.934, + "step": 7513 + }, + { + "epoch": 1.602815699658703, + "grad_norm": 0.7759631424526335, + "learning_rate": 0.00015202997026775511, + "loss": 0.9637, + "step": 7514 + }, + { + "epoch": 1.6030290102389078, + "grad_norm": 0.789567404894382, + "learning_rate": 0.00015202626712911466, + "loss": 0.9509, + "step": 7515 + }, + { + "epoch": 1.6032423208191127, + "grad_norm": 0.7384788215539213, + "learning_rate": 0.00015202256317549577, + "loss": 0.9279, + "step": 7516 + }, + { + "epoch": 1.6034556313993176, + "grad_norm": 0.6607830917958756, + "learning_rate": 0.00015201885840694039, + "loss": 0.9693, + "step": 7517 + }, + { + "epoch": 1.6036689419795223, + "grad_norm": 0.5122496088067887, + "learning_rate": 0.00015201515282349035, + "loss": 0.9386, + "step": 7518 + }, + { + "epoch": 1.603882252559727, + "grad_norm": 0.36961091828540155, + "learning_rate": 0.00015201144642518772, + "loss": 0.9902, + "step": 7519 + }, + { + "epoch": 1.6040955631399316, + "grad_norm": 0.3195814887090514, + "learning_rate": 0.0001520077392120743, + "loss": 0.9474, + "step": 7520 + }, + { + "epoch": 1.6043088737201365, + "grad_norm": 0.378009601044579, + "learning_rate": 0.00015200403118419215, + "loss": 0.9543, + "step": 7521 + }, + { + "epoch": 1.6045221843003414, + "grad_norm": 0.5224493349162674, + "learning_rate": 0.00015200032234158315, + "loss": 0.9414, + "step": 7522 + }, + { + "epoch": 1.6047354948805461, + "grad_norm": 0.5759173187704335, + "learning_rate": 0.00015199661268428926, + "loss": 0.9466, + "step": 7523 + }, + { + "epoch": 1.6049488054607508, + "grad_norm": 0.5961890106658198, + "learning_rate": 0.00015199290221235256, + "loss": 0.9297, + "step": 7524 + }, + { + "epoch": 1.6051621160409555, + "grad_norm": 0.5800380797340362, + "learning_rate": 0.00015198919092581492, + "loss": 0.9426, + "step": 7525 + }, + { + "epoch": 1.6053754266211604, + "grad_norm": 0.5166960537152254, + "learning_rate": 0.00015198547882471836, + "loss": 0.9739, + "step": 7526 + }, + { + "epoch": 1.6055887372013653, + "grad_norm": 0.5602775901819195, + "learning_rate": 0.00015198176590910492, + "loss": 0.9503, + "step": 7527 + }, + { + "epoch": 1.60580204778157, + "grad_norm": 0.4856261817346577, + "learning_rate": 0.00015197805217901657, + "loss": 0.9579, + "step": 7528 + }, + { + "epoch": 1.6060153583617747, + "grad_norm": 0.5099255968847269, + "learning_rate": 0.00015197433763449537, + "loss": 0.9247, + "step": 7529 + }, + { + "epoch": 1.6062286689419796, + "grad_norm": 0.5277210460427727, + "learning_rate": 0.00015197062227558333, + "loss": 0.9406, + "step": 7530 + }, + { + "epoch": 1.6064419795221843, + "grad_norm": 0.4264799477797426, + "learning_rate": 0.00015196690610232247, + "loss": 0.9747, + "step": 7531 + }, + { + "epoch": 1.6066552901023892, + "grad_norm": 0.3341303280266882, + "learning_rate": 0.00015196318911475486, + "loss": 0.9521, + "step": 7532 + }, + { + "epoch": 1.6068686006825939, + "grad_norm": 0.275936296633199, + "learning_rate": 0.00015195947131292254, + "loss": 0.9543, + "step": 7533 + }, + { + "epoch": 1.6070819112627985, + "grad_norm": 0.323921149420915, + "learning_rate": 0.00015195575269686762, + "loss": 0.9838, + "step": 7534 + }, + { + "epoch": 1.6072952218430034, + "grad_norm": 0.3672515798635133, + "learning_rate": 0.00015195203326663214, + "loss": 0.9162, + "step": 7535 + }, + { + "epoch": 1.6075085324232083, + "grad_norm": 0.4177589380349318, + "learning_rate": 0.0001519483130222582, + "loss": 0.9626, + "step": 7536 + }, + { + "epoch": 1.607721843003413, + "grad_norm": 0.24881104788169228, + "learning_rate": 0.00015194459196378788, + "loss": 0.9458, + "step": 7537 + }, + { + "epoch": 1.6079351535836177, + "grad_norm": 0.37884636925013626, + "learning_rate": 0.0001519408700912633, + "loss": 0.9601, + "step": 7538 + }, + { + "epoch": 1.6081484641638224, + "grad_norm": 0.473083944448906, + "learning_rate": 0.00015193714740472655, + "loss": 0.9629, + "step": 7539 + }, + { + "epoch": 1.6083617747440273, + "grad_norm": 0.45977810875243397, + "learning_rate": 0.00015193342390421976, + "loss": 0.9232, + "step": 7540 + }, + { + "epoch": 1.6085750853242322, + "grad_norm": 0.346368676084719, + "learning_rate": 0.0001519296995897851, + "loss": 0.98, + "step": 7541 + }, + { + "epoch": 1.608788395904437, + "grad_norm": 0.39796707567261813, + "learning_rate": 0.00015192597446146467, + "loss": 0.9647, + "step": 7542 + }, + { + "epoch": 1.6090017064846416, + "grad_norm": 0.4540635894583214, + "learning_rate": 0.0001519222485193006, + "loss": 0.9641, + "step": 7543 + }, + { + "epoch": 1.6092150170648463, + "grad_norm": 0.3146010243904282, + "learning_rate": 0.0001519185217633351, + "loss": 0.9661, + "step": 7544 + }, + { + "epoch": 1.6094283276450512, + "grad_norm": 0.37241798634489487, + "learning_rate": 0.00015191479419361033, + "loss": 0.9682, + "step": 7545 + }, + { + "epoch": 1.609641638225256, + "grad_norm": 0.4322417749388071, + "learning_rate": 0.00015191106581016842, + "loss": 0.9488, + "step": 7546 + }, + { + "epoch": 1.6098549488054608, + "grad_norm": 0.4464976598673962, + "learning_rate": 0.00015190733661305162, + "loss": 0.956, + "step": 7547 + }, + { + "epoch": 1.6100682593856654, + "grad_norm": 0.4435000025572136, + "learning_rate": 0.00015190360660230208, + "loss": 0.9663, + "step": 7548 + }, + { + "epoch": 1.6102815699658704, + "grad_norm": 0.4661605972972312, + "learning_rate": 0.00015189987577796205, + "loss": 0.9451, + "step": 7549 + }, + { + "epoch": 1.610494880546075, + "grad_norm": 0.5406167875999632, + "learning_rate": 0.00015189614414007368, + "loss": 0.9713, + "step": 7550 + }, + { + "epoch": 1.61070819112628, + "grad_norm": 0.6378991373694601, + "learning_rate": 0.00015189241168867926, + "loss": 0.9639, + "step": 7551 + }, + { + "epoch": 1.6109215017064846, + "grad_norm": 0.6229126244775716, + "learning_rate": 0.00015188867842382095, + "loss": 0.943, + "step": 7552 + }, + { + "epoch": 1.6111348122866893, + "grad_norm": 0.5066178723137338, + "learning_rate": 0.00015188494434554105, + "loss": 0.9443, + "step": 7553 + }, + { + "epoch": 1.6113481228668942, + "grad_norm": 0.34359806931405135, + "learning_rate": 0.00015188120945388182, + "loss": 0.942, + "step": 7554 + }, + { + "epoch": 1.6115614334470991, + "grad_norm": 0.29370377025234917, + "learning_rate": 0.00015187747374888547, + "loss": 0.9458, + "step": 7555 + }, + { + "epoch": 1.6117747440273038, + "grad_norm": 0.37904130632897504, + "learning_rate": 0.00015187373723059432, + "loss": 0.9505, + "step": 7556 + }, + { + "epoch": 1.6119880546075085, + "grad_norm": 0.43430482841421814, + "learning_rate": 0.00015186999989905063, + "loss": 0.9464, + "step": 7557 + }, + { + "epoch": 1.6122013651877132, + "grad_norm": 0.4254213796758122, + "learning_rate": 0.00015186626175429664, + "loss": 0.9618, + "step": 7558 + }, + { + "epoch": 1.612414675767918, + "grad_norm": 0.4568537966825928, + "learning_rate": 0.00015186252279637474, + "loss": 0.9597, + "step": 7559 + }, + { + "epoch": 1.612627986348123, + "grad_norm": 0.4636539758386563, + "learning_rate": 0.00015185878302532716, + "loss": 0.9275, + "step": 7560 + }, + { + "epoch": 1.6128412969283277, + "grad_norm": 0.39967190537490904, + "learning_rate": 0.0001518550424411962, + "loss": 0.9411, + "step": 7561 + }, + { + "epoch": 1.6130546075085324, + "grad_norm": 0.3941687029519857, + "learning_rate": 0.0001518513010440243, + "loss": 0.9979, + "step": 7562 + }, + { + "epoch": 1.613267918088737, + "grad_norm": 0.41564724146594245, + "learning_rate": 0.0001518475588338537, + "loss": 0.9375, + "step": 7563 + }, + { + "epoch": 1.613481228668942, + "grad_norm": 0.3441457304350214, + "learning_rate": 0.00015184381581072673, + "loss": 0.9584, + "step": 7564 + }, + { + "epoch": 1.6136945392491469, + "grad_norm": 0.2827138974485591, + "learning_rate": 0.0001518400719746858, + "loss": 0.9663, + "step": 7565 + }, + { + "epoch": 1.6139078498293515, + "grad_norm": 0.32165422351692885, + "learning_rate": 0.00015183632732577323, + "loss": 0.9345, + "step": 7566 + }, + { + "epoch": 1.6141211604095562, + "grad_norm": 0.3507368293039254, + "learning_rate": 0.00015183258186403142, + "loss": 0.944, + "step": 7567 + }, + { + "epoch": 1.6143344709897611, + "grad_norm": 0.347802421083644, + "learning_rate": 0.00015182883558950274, + "loss": 0.9419, + "step": 7568 + }, + { + "epoch": 1.614547781569966, + "grad_norm": 0.3667617380183567, + "learning_rate": 0.0001518250885022296, + "loss": 0.9465, + "step": 7569 + }, + { + "epoch": 1.6147610921501707, + "grad_norm": 0.37953945464057476, + "learning_rate": 0.00015182134060225434, + "loss": 0.9439, + "step": 7570 + }, + { + "epoch": 1.6149744027303754, + "grad_norm": 0.43922088841147855, + "learning_rate": 0.00015181759188961943, + "loss": 0.9417, + "step": 7571 + }, + { + "epoch": 1.61518771331058, + "grad_norm": 0.4844723675534103, + "learning_rate": 0.00015181384236436724, + "loss": 0.9478, + "step": 7572 + }, + { + "epoch": 1.615401023890785, + "grad_norm": 0.5266968691635908, + "learning_rate": 0.00015181009202654022, + "loss": 0.9738, + "step": 7573 + }, + { + "epoch": 1.61561433447099, + "grad_norm": 0.5240585088333659, + "learning_rate": 0.0001518063408761808, + "loss": 0.9656, + "step": 7574 + }, + { + "epoch": 1.6158276450511946, + "grad_norm": 0.5548119271691319, + "learning_rate": 0.0001518025889133314, + "loss": 0.9486, + "step": 7575 + }, + { + "epoch": 1.6160409556313993, + "grad_norm": 0.6471711540278127, + "learning_rate": 0.00015179883613803457, + "loss": 0.9699, + "step": 7576 + }, + { + "epoch": 1.616254266211604, + "grad_norm": 0.7182826667013518, + "learning_rate": 0.00015179508255033262, + "loss": 0.9732, + "step": 7577 + }, + { + "epoch": 1.6164675767918089, + "grad_norm": 0.6796220046697216, + "learning_rate": 0.00015179132815026816, + "loss": 0.937, + "step": 7578 + }, + { + "epoch": 1.6166808873720138, + "grad_norm": 0.6556950965369127, + "learning_rate": 0.0001517875729378836, + "loss": 0.9387, + "step": 7579 + }, + { + "epoch": 1.6168941979522184, + "grad_norm": 0.5550933839403885, + "learning_rate": 0.00015178381691322143, + "loss": 0.9712, + "step": 7580 + }, + { + "epoch": 1.6171075085324231, + "grad_norm": 0.41722196387233823, + "learning_rate": 0.00015178006007632416, + "loss": 0.9336, + "step": 7581 + }, + { + "epoch": 1.617320819112628, + "grad_norm": 0.35413412043724507, + "learning_rate": 0.00015177630242723433, + "loss": 0.9267, + "step": 7582 + }, + { + "epoch": 1.6175341296928327, + "grad_norm": 0.32536925513031767, + "learning_rate": 0.00015177254396599442, + "loss": 0.9196, + "step": 7583 + }, + { + "epoch": 1.6177474402730376, + "grad_norm": 0.36447539400048407, + "learning_rate": 0.00015176878469264698, + "loss": 0.9367, + "step": 7584 + }, + { + "epoch": 1.6179607508532423, + "grad_norm": 0.4392626829692707, + "learning_rate": 0.0001517650246072345, + "loss": 0.9893, + "step": 7585 + }, + { + "epoch": 1.618174061433447, + "grad_norm": 0.4945529219559142, + "learning_rate": 0.0001517612637097996, + "loss": 0.9341, + "step": 7586 + }, + { + "epoch": 1.618387372013652, + "grad_norm": 0.43762051633909665, + "learning_rate": 0.00015175750200038476, + "loss": 0.9378, + "step": 7587 + }, + { + "epoch": 1.6186006825938568, + "grad_norm": 0.42677798644726195, + "learning_rate": 0.00015175373947903258, + "loss": 0.9053, + "step": 7588 + }, + { + "epoch": 1.6188139931740615, + "grad_norm": 0.39509213818019084, + "learning_rate": 0.00015174997614578564, + "loss": 0.9455, + "step": 7589 + }, + { + "epoch": 1.6190273037542662, + "grad_norm": 0.339627940959623, + "learning_rate": 0.0001517462120006865, + "loss": 0.9634, + "step": 7590 + }, + { + "epoch": 1.6192406143344709, + "grad_norm": 0.36988401117765224, + "learning_rate": 0.0001517424470437778, + "loss": 0.9699, + "step": 7591 + }, + { + "epoch": 1.6194539249146758, + "grad_norm": 0.4055927670819957, + "learning_rate": 0.0001517386812751021, + "loss": 0.9574, + "step": 7592 + }, + { + "epoch": 1.6196672354948807, + "grad_norm": 0.35829401751320783, + "learning_rate": 0.00015173491469470197, + "loss": 0.9341, + "step": 7593 + }, + { + "epoch": 1.6198805460750854, + "grad_norm": 0.2727117857060081, + "learning_rate": 0.0001517311473026201, + "loss": 0.9593, + "step": 7594 + }, + { + "epoch": 1.62009385665529, + "grad_norm": 0.2983924590991007, + "learning_rate": 0.00015172737909889913, + "loss": 0.9402, + "step": 7595 + }, + { + "epoch": 1.6203071672354947, + "grad_norm": 0.32020089701440485, + "learning_rate": 0.0001517236100835816, + "loss": 0.9423, + "step": 7596 + }, + { + "epoch": 1.6205204778156996, + "grad_norm": 0.36563173627373663, + "learning_rate": 0.00015171984025671024, + "loss": 0.9795, + "step": 7597 + }, + { + "epoch": 1.6207337883959045, + "grad_norm": 0.37742689845903227, + "learning_rate": 0.00015171606961832769, + "loss": 0.9601, + "step": 7598 + }, + { + "epoch": 1.6209470989761092, + "grad_norm": 0.3408763161552726, + "learning_rate": 0.0001517122981684766, + "loss": 0.9495, + "step": 7599 + }, + { + "epoch": 1.621160409556314, + "grad_norm": 0.3198079712172852, + "learning_rate": 0.00015170852590719964, + "loss": 0.9327, + "step": 7600 + }, + { + "epoch": 1.6213737201365188, + "grad_norm": 0.3882023404092928, + "learning_rate": 0.0001517047528345395, + "loss": 0.9555, + "step": 7601 + }, + { + "epoch": 1.6215870307167235, + "grad_norm": 0.3562241657301124, + "learning_rate": 0.0001517009789505389, + "loss": 0.9421, + "step": 7602 + }, + { + "epoch": 1.6218003412969284, + "grad_norm": 0.3097949870590243, + "learning_rate": 0.0001516972042552405, + "loss": 0.9249, + "step": 7603 + }, + { + "epoch": 1.622013651877133, + "grad_norm": 0.34000207260972126, + "learning_rate": 0.00015169342874868702, + "loss": 0.9698, + "step": 7604 + }, + { + "epoch": 1.6222269624573378, + "grad_norm": 0.2609195883759181, + "learning_rate": 0.0001516896524309212, + "loss": 0.9594, + "step": 7605 + }, + { + "epoch": 1.6224402730375427, + "grad_norm": 0.27818745073499884, + "learning_rate": 0.00015168587530198574, + "loss": 0.9861, + "step": 7606 + }, + { + "epoch": 1.6226535836177476, + "grad_norm": 0.3328672142785384, + "learning_rate": 0.00015168209736192344, + "loss": 0.9619, + "step": 7607 + }, + { + "epoch": 1.6228668941979523, + "grad_norm": 0.31575546738252086, + "learning_rate": 0.00015167831861077696, + "loss": 0.9296, + "step": 7608 + }, + { + "epoch": 1.623080204778157, + "grad_norm": 0.39779500582819005, + "learning_rate": 0.0001516745390485891, + "loss": 0.9477, + "step": 7609 + }, + { + "epoch": 1.6232935153583616, + "grad_norm": 0.46867188455954134, + "learning_rate": 0.00015167075867540263, + "loss": 0.9422, + "step": 7610 + }, + { + "epoch": 1.6235068259385665, + "grad_norm": 0.4535322017240263, + "learning_rate": 0.00015166697749126033, + "loss": 0.9279, + "step": 7611 + }, + { + "epoch": 1.6237201365187715, + "grad_norm": 0.4198420918137666, + "learning_rate": 0.000151663195496205, + "loss": 0.9539, + "step": 7612 + }, + { + "epoch": 1.6239334470989761, + "grad_norm": 0.3652426697429005, + "learning_rate": 0.00015165941269027936, + "loss": 0.9366, + "step": 7613 + }, + { + "epoch": 1.6241467576791808, + "grad_norm": 0.37843807526085105, + "learning_rate": 0.00015165562907352629, + "loss": 0.9808, + "step": 7614 + }, + { + "epoch": 1.6243600682593855, + "grad_norm": 0.4990846776413334, + "learning_rate": 0.00015165184464598853, + "loss": 0.9696, + "step": 7615 + }, + { + "epoch": 1.6245733788395904, + "grad_norm": 0.5627725254319909, + "learning_rate": 0.00015164805940770897, + "loss": 0.9454, + "step": 7616 + }, + { + "epoch": 1.6247866894197953, + "grad_norm": 0.4827828690382922, + "learning_rate": 0.00015164427335873044, + "loss": 0.9095, + "step": 7617 + }, + { + "epoch": 1.625, + "grad_norm": 0.4662068019558229, + "learning_rate": 0.00015164048649909573, + "loss": 0.9523, + "step": 7618 + }, + { + "epoch": 1.6252133105802047, + "grad_norm": 0.5417717880980021, + "learning_rate": 0.0001516366988288477, + "loss": 0.9245, + "step": 7619 + }, + { + "epoch": 1.6254266211604096, + "grad_norm": 0.659409040347796, + "learning_rate": 0.0001516329103480292, + "loss": 0.9565, + "step": 7620 + }, + { + "epoch": 1.6256399317406145, + "grad_norm": 0.7912252858005138, + "learning_rate": 0.00015162912105668317, + "loss": 0.9636, + "step": 7621 + }, + { + "epoch": 1.6258532423208192, + "grad_norm": 0.8969506222615037, + "learning_rate": 0.00015162533095485238, + "loss": 0.9803, + "step": 7622 + }, + { + "epoch": 1.6260665529010239, + "grad_norm": 0.8537238589674374, + "learning_rate": 0.0001516215400425798, + "loss": 0.938, + "step": 7623 + }, + { + "epoch": 1.6262798634812285, + "grad_norm": 0.7602477937299377, + "learning_rate": 0.00015161774831990826, + "loss": 0.9086, + "step": 7624 + }, + { + "epoch": 1.6264931740614335, + "grad_norm": 0.6059074105413623, + "learning_rate": 0.00015161395578688073, + "loss": 0.973, + "step": 7625 + }, + { + "epoch": 1.6267064846416384, + "grad_norm": 0.44061232031090325, + "learning_rate": 0.00015161016244354006, + "loss": 0.9429, + "step": 7626 + }, + { + "epoch": 1.626919795221843, + "grad_norm": 0.3570285590783023, + "learning_rate": 0.0001516063682899292, + "loss": 0.9462, + "step": 7627 + }, + { + "epoch": 1.6271331058020477, + "grad_norm": 0.4482877316563627, + "learning_rate": 0.0001516025733260911, + "loss": 0.9344, + "step": 7628 + }, + { + "epoch": 1.6273464163822524, + "grad_norm": 0.5804243034141325, + "learning_rate": 0.00015159877755206862, + "loss": 0.9219, + "step": 7629 + }, + { + "epoch": 1.6275597269624573, + "grad_norm": 0.6635348033727405, + "learning_rate": 0.00015159498096790485, + "loss": 0.9574, + "step": 7630 + }, + { + "epoch": 1.6277730375426622, + "grad_norm": 0.7643808804568466, + "learning_rate": 0.0001515911835736426, + "loss": 0.931, + "step": 7631 + }, + { + "epoch": 1.627986348122867, + "grad_norm": 0.6547428047052376, + "learning_rate": 0.00015158738536932492, + "loss": 0.9813, + "step": 7632 + }, + { + "epoch": 1.6281996587030716, + "grad_norm": 0.44666357330651213, + "learning_rate": 0.00015158358635499478, + "loss": 0.9708, + "step": 7633 + }, + { + "epoch": 1.6284129692832765, + "grad_norm": 0.4136431185462996, + "learning_rate": 0.00015157978653069514, + "loss": 0.9658, + "step": 7634 + }, + { + "epoch": 1.6286262798634812, + "grad_norm": 0.4102551213520321, + "learning_rate": 0.00015157598589646902, + "loss": 0.9844, + "step": 7635 + }, + { + "epoch": 1.628839590443686, + "grad_norm": 0.40753671204206066, + "learning_rate": 0.00015157218445235944, + "loss": 0.9346, + "step": 7636 + }, + { + "epoch": 1.6290529010238908, + "grad_norm": 0.527183961495904, + "learning_rate": 0.00015156838219840938, + "loss": 0.9844, + "step": 7637 + }, + { + "epoch": 1.6292662116040955, + "grad_norm": 0.5897271901725448, + "learning_rate": 0.00015156457913466186, + "loss": 0.9697, + "step": 7638 + }, + { + "epoch": 1.6294795221843004, + "grad_norm": 0.5565159863013637, + "learning_rate": 0.00015156077526115992, + "loss": 0.9446, + "step": 7639 + }, + { + "epoch": 1.6296928327645053, + "grad_norm": 0.49117670343931336, + "learning_rate": 0.00015155697057794658, + "loss": 0.9238, + "step": 7640 + }, + { + "epoch": 1.62990614334471, + "grad_norm": 0.35097730925072695, + "learning_rate": 0.00015155316508506497, + "loss": 0.9464, + "step": 7641 + }, + { + "epoch": 1.6301194539249146, + "grad_norm": 0.284869248825282, + "learning_rate": 0.00015154935878255806, + "loss": 0.9297, + "step": 7642 + }, + { + "epoch": 1.6303327645051193, + "grad_norm": 0.36896684279120223, + "learning_rate": 0.00015154555167046896, + "loss": 0.9485, + "step": 7643 + }, + { + "epoch": 1.6305460750853242, + "grad_norm": 0.43507459891872097, + "learning_rate": 0.00015154174374884072, + "loss": 0.955, + "step": 7644 + }, + { + "epoch": 1.6307593856655291, + "grad_norm": 0.4118911046027399, + "learning_rate": 0.00015153793501771647, + "loss": 0.9927, + "step": 7645 + }, + { + "epoch": 1.6309726962457338, + "grad_norm": 0.4752758503894171, + "learning_rate": 0.00015153412547713927, + "loss": 0.9579, + "step": 7646 + }, + { + "epoch": 1.6311860068259385, + "grad_norm": 0.4933177858001897, + "learning_rate": 0.00015153031512715225, + "loss": 0.9386, + "step": 7647 + }, + { + "epoch": 1.6313993174061432, + "grad_norm": 0.45385052484576305, + "learning_rate": 0.0001515265039677985, + "loss": 0.9618, + "step": 7648 + }, + { + "epoch": 1.631612627986348, + "grad_norm": 0.4937245818195802, + "learning_rate": 0.00015152269199912116, + "loss": 0.973, + "step": 7649 + }, + { + "epoch": 1.631825938566553, + "grad_norm": 0.43859767118301934, + "learning_rate": 0.00015151887922116336, + "loss": 0.9855, + "step": 7650 + }, + { + "epoch": 1.6320392491467577, + "grad_norm": 0.3216401342950097, + "learning_rate": 0.00015151506563396822, + "loss": 0.9495, + "step": 7651 + }, + { + "epoch": 1.6322525597269624, + "grad_norm": 0.27329471152886486, + "learning_rate": 0.00015151125123757895, + "loss": 0.9536, + "step": 7652 + }, + { + "epoch": 1.6324658703071673, + "grad_norm": 0.272553934571055, + "learning_rate": 0.00015150743603203863, + "loss": 0.9474, + "step": 7653 + }, + { + "epoch": 1.632679180887372, + "grad_norm": 0.3220797502371639, + "learning_rate": 0.0001515036200173905, + "loss": 0.9519, + "step": 7654 + }, + { + "epoch": 1.6328924914675769, + "grad_norm": 0.3463526875547677, + "learning_rate": 0.00015149980319367767, + "loss": 0.9613, + "step": 7655 + }, + { + "epoch": 1.6331058020477816, + "grad_norm": 0.3762686972928929, + "learning_rate": 0.00015149598556094337, + "loss": 0.9315, + "step": 7656 + }, + { + "epoch": 1.6333191126279862, + "grad_norm": 0.3733827480715863, + "learning_rate": 0.0001514921671192308, + "loss": 0.9365, + "step": 7657 + }, + { + "epoch": 1.6335324232081911, + "grad_norm": 0.41677813737889735, + "learning_rate": 0.00015148834786858317, + "loss": 0.9299, + "step": 7658 + }, + { + "epoch": 1.633745733788396, + "grad_norm": 0.3890138045286504, + "learning_rate": 0.00015148452780904367, + "loss": 0.9657, + "step": 7659 + }, + { + "epoch": 1.6339590443686007, + "grad_norm": 0.3008999402593405, + "learning_rate": 0.00015148070694065554, + "loss": 0.9476, + "step": 7660 + }, + { + "epoch": 1.6341723549488054, + "grad_norm": 0.29158655692801383, + "learning_rate": 0.00015147688526346198, + "loss": 0.9683, + "step": 7661 + }, + { + "epoch": 1.63438566552901, + "grad_norm": 0.4410577313930724, + "learning_rate": 0.00015147306277750632, + "loss": 0.9569, + "step": 7662 + }, + { + "epoch": 1.634598976109215, + "grad_norm": 0.3717867786036358, + "learning_rate": 0.0001514692394828317, + "loss": 0.9513, + "step": 7663 + }, + { + "epoch": 1.63481228668942, + "grad_norm": 0.32799040643197325, + "learning_rate": 0.00015146541537948145, + "loss": 0.9621, + "step": 7664 + }, + { + "epoch": 1.6350255972696246, + "grad_norm": 0.39185563241698834, + "learning_rate": 0.00015146159046749882, + "loss": 0.9471, + "step": 7665 + }, + { + "epoch": 1.6352389078498293, + "grad_norm": 0.32575503353753416, + "learning_rate": 0.0001514577647469271, + "loss": 0.9318, + "step": 7666 + }, + { + "epoch": 1.635452218430034, + "grad_norm": 0.310814623790747, + "learning_rate": 0.00015145393821780958, + "loss": 0.9457, + "step": 7667 + }, + { + "epoch": 1.6356655290102389, + "grad_norm": 0.42579379606211576, + "learning_rate": 0.00015145011088018952, + "loss": 0.9502, + "step": 7668 + }, + { + "epoch": 1.6358788395904438, + "grad_norm": 0.47056036507024274, + "learning_rate": 0.00015144628273411028, + "loss": 0.9924, + "step": 7669 + }, + { + "epoch": 1.6360921501706485, + "grad_norm": 0.37387335017769047, + "learning_rate": 0.00015144245377961513, + "loss": 0.9284, + "step": 7670 + }, + { + "epoch": 1.6363054607508531, + "grad_norm": 0.33407952447375544, + "learning_rate": 0.00015143862401674742, + "loss": 0.9706, + "step": 7671 + }, + { + "epoch": 1.636518771331058, + "grad_norm": 0.3851849070808924, + "learning_rate": 0.0001514347934455505, + "loss": 0.9489, + "step": 7672 + }, + { + "epoch": 1.636732081911263, + "grad_norm": 0.3496600041004928, + "learning_rate": 0.00015143096206606766, + "loss": 0.934, + "step": 7673 + }, + { + "epoch": 1.6369453924914676, + "grad_norm": 0.44308537323359853, + "learning_rate": 0.00015142712987834228, + "loss": 0.9378, + "step": 7674 + }, + { + "epoch": 1.6371587030716723, + "grad_norm": 0.47684369957751044, + "learning_rate": 0.00015142329688241771, + "loss": 0.9425, + "step": 7675 + }, + { + "epoch": 1.637372013651877, + "grad_norm": 0.4535437365645771, + "learning_rate": 0.00015141946307833736, + "loss": 0.9406, + "step": 7676 + }, + { + "epoch": 1.637585324232082, + "grad_norm": 0.5493113662962169, + "learning_rate": 0.0001514156284661446, + "loss": 0.9412, + "step": 7677 + }, + { + "epoch": 1.6377986348122868, + "grad_norm": 0.6634489759049595, + "learning_rate": 0.0001514117930458828, + "loss": 0.969, + "step": 7678 + }, + { + "epoch": 1.6380119453924915, + "grad_norm": 0.5147891158647349, + "learning_rate": 0.00015140795681759534, + "loss": 0.9571, + "step": 7679 + }, + { + "epoch": 1.6382252559726962, + "grad_norm": 0.4773646001829979, + "learning_rate": 0.00015140411978132565, + "loss": 0.9727, + "step": 7680 + }, + { + "epoch": 1.6384385665529009, + "grad_norm": 0.6004335834702526, + "learning_rate": 0.00015140028193711715, + "loss": 0.953, + "step": 7681 + }, + { + "epoch": 1.6386518771331058, + "grad_norm": 0.6315758330634821, + "learning_rate": 0.00015139644328501323, + "loss": 0.9011, + "step": 7682 + }, + { + "epoch": 1.6388651877133107, + "grad_norm": 0.7504028506216733, + "learning_rate": 0.0001513926038250574, + "loss": 0.9715, + "step": 7683 + }, + { + "epoch": 1.6390784982935154, + "grad_norm": 0.650548996833834, + "learning_rate": 0.00015138876355729302, + "loss": 0.9302, + "step": 7684 + }, + { + "epoch": 1.63929180887372, + "grad_norm": 0.5936602075065242, + "learning_rate": 0.0001513849224817636, + "loss": 0.967, + "step": 7685 + }, + { + "epoch": 1.639505119453925, + "grad_norm": 0.4151032186929437, + "learning_rate": 0.0001513810805985126, + "loss": 0.934, + "step": 7686 + }, + { + "epoch": 1.6397184300341296, + "grad_norm": 0.3354620209063886, + "learning_rate": 0.00015137723790758346, + "loss": 0.9766, + "step": 7687 + }, + { + "epoch": 1.6399317406143346, + "grad_norm": 0.38814098514773093, + "learning_rate": 0.00015137339440901963, + "loss": 0.986, + "step": 7688 + }, + { + "epoch": 1.6401450511945392, + "grad_norm": 0.408391987506492, + "learning_rate": 0.00015136955010286468, + "loss": 0.9401, + "step": 7689 + }, + { + "epoch": 1.640358361774744, + "grad_norm": 0.39033817217391337, + "learning_rate": 0.00015136570498916208, + "loss": 0.9485, + "step": 7690 + }, + { + "epoch": 1.6405716723549488, + "grad_norm": 0.36676386180566634, + "learning_rate": 0.0001513618590679553, + "loss": 0.9813, + "step": 7691 + }, + { + "epoch": 1.6407849829351537, + "grad_norm": 0.26102078843622567, + "learning_rate": 0.00015135801233928789, + "loss": 0.9474, + "step": 7692 + }, + { + "epoch": 1.6409982935153584, + "grad_norm": 0.30682679810509983, + "learning_rate": 0.0001513541648032034, + "loss": 0.9803, + "step": 7693 + }, + { + "epoch": 1.641211604095563, + "grad_norm": 0.4049147794862557, + "learning_rate": 0.0001513503164597453, + "loss": 0.9469, + "step": 7694 + }, + { + "epoch": 1.6414249146757678, + "grad_norm": 0.43314808443695346, + "learning_rate": 0.0001513464673089572, + "loss": 0.9653, + "step": 7695 + }, + { + "epoch": 1.6416382252559727, + "grad_norm": 0.449976325413557, + "learning_rate": 0.00015134261735088257, + "loss": 0.9276, + "step": 7696 + }, + { + "epoch": 1.6418515358361776, + "grad_norm": 0.576656577867117, + "learning_rate": 0.00015133876658556507, + "loss": 0.9391, + "step": 7697 + }, + { + "epoch": 1.6420648464163823, + "grad_norm": 0.5447959779863165, + "learning_rate": 0.00015133491501304823, + "loss": 0.9535, + "step": 7698 + }, + { + "epoch": 1.642278156996587, + "grad_norm": 0.37728076959347373, + "learning_rate": 0.00015133106263337558, + "loss": 0.9864, + "step": 7699 + }, + { + "epoch": 1.6424914675767917, + "grad_norm": 0.30782659970753906, + "learning_rate": 0.0001513272094465908, + "loss": 0.9459, + "step": 7700 + }, + { + "epoch": 1.6427047781569966, + "grad_norm": 0.40531318029555946, + "learning_rate": 0.00015132335545273742, + "loss": 0.9569, + "step": 7701 + }, + { + "epoch": 1.6429180887372015, + "grad_norm": 0.5110804271254731, + "learning_rate": 0.0001513195006518591, + "loss": 0.9809, + "step": 7702 + }, + { + "epoch": 1.6431313993174061, + "grad_norm": 0.5779614297857747, + "learning_rate": 0.0001513156450439994, + "loss": 0.9651, + "step": 7703 + }, + { + "epoch": 1.6433447098976108, + "grad_norm": 0.6639732659514486, + "learning_rate": 0.00015131178862920198, + "loss": 0.9332, + "step": 7704 + }, + { + "epoch": 1.6435580204778157, + "grad_norm": 0.746416902313826, + "learning_rate": 0.0001513079314075105, + "loss": 0.9768, + "step": 7705 + }, + { + "epoch": 1.6437713310580204, + "grad_norm": 0.7962388444802663, + "learning_rate": 0.00015130407337896856, + "loss": 0.9681, + "step": 7706 + }, + { + "epoch": 1.6439846416382253, + "grad_norm": 0.7137145680538197, + "learning_rate": 0.00015130021454361984, + "loss": 0.9478, + "step": 7707 + }, + { + "epoch": 1.64419795221843, + "grad_norm": 0.5594408372849039, + "learning_rate": 0.00015129635490150799, + "loss": 0.9613, + "step": 7708 + }, + { + "epoch": 1.6444112627986347, + "grad_norm": 0.37443936112903087, + "learning_rate": 0.0001512924944526767, + "loss": 0.9432, + "step": 7709 + }, + { + "epoch": 1.6446245733788396, + "grad_norm": 0.24716317509041982, + "learning_rate": 0.0001512886331971696, + "loss": 0.9607, + "step": 7710 + }, + { + "epoch": 1.6448378839590445, + "grad_norm": 0.2975407282298941, + "learning_rate": 0.00015128477113503043, + "loss": 0.9433, + "step": 7711 + }, + { + "epoch": 1.6450511945392492, + "grad_norm": 0.35468203032461587, + "learning_rate": 0.00015128090826630288, + "loss": 0.9611, + "step": 7712 + }, + { + "epoch": 1.6452645051194539, + "grad_norm": 0.47143887166635934, + "learning_rate": 0.00015127704459103066, + "loss": 0.9698, + "step": 7713 + }, + { + "epoch": 1.6454778156996586, + "grad_norm": 0.5833911669068316, + "learning_rate": 0.00015127318010925745, + "loss": 0.965, + "step": 7714 + }, + { + "epoch": 1.6456911262798635, + "grad_norm": 0.5876981177711726, + "learning_rate": 0.00015126931482102704, + "loss": 0.9364, + "step": 7715 + }, + { + "epoch": 1.6459044368600684, + "grad_norm": 0.4836134085886605, + "learning_rate": 0.00015126544872638313, + "loss": 0.949, + "step": 7716 + }, + { + "epoch": 1.646117747440273, + "grad_norm": 0.3782842304291791, + "learning_rate": 0.00015126158182536946, + "loss": 0.9362, + "step": 7717 + }, + { + "epoch": 1.6463310580204777, + "grad_norm": 0.2763940088993554, + "learning_rate": 0.0001512577141180298, + "loss": 0.9478, + "step": 7718 + }, + { + "epoch": 1.6465443686006824, + "grad_norm": 0.26881214419857474, + "learning_rate": 0.0001512538456044079, + "loss": 0.9469, + "step": 7719 + }, + { + "epoch": 1.6467576791808873, + "grad_norm": 0.3591570552304109, + "learning_rate": 0.00015124997628454754, + "loss": 0.9341, + "step": 7720 + }, + { + "epoch": 1.6469709897610922, + "grad_norm": 0.41133511382417615, + "learning_rate": 0.00015124610615849251, + "loss": 0.9412, + "step": 7721 + }, + { + "epoch": 1.647184300341297, + "grad_norm": 0.473374565273724, + "learning_rate": 0.00015124223522628658, + "loss": 0.9342, + "step": 7722 + }, + { + "epoch": 1.6473976109215016, + "grad_norm": 0.4741373669799684, + "learning_rate": 0.00015123836348797354, + "loss": 0.9583, + "step": 7723 + }, + { + "epoch": 1.6476109215017065, + "grad_norm": 0.43281434685026904, + "learning_rate": 0.00015123449094359728, + "loss": 0.9789, + "step": 7724 + }, + { + "epoch": 1.6478242320819114, + "grad_norm": 0.29664630920572405, + "learning_rate": 0.00015123061759320151, + "loss": 0.974, + "step": 7725 + }, + { + "epoch": 1.648037542662116, + "grad_norm": 0.2769808564058561, + "learning_rate": 0.0001512267434368301, + "loss": 0.954, + "step": 7726 + }, + { + "epoch": 1.6482508532423208, + "grad_norm": 0.263449596456388, + "learning_rate": 0.00015122286847452692, + "loss": 0.9386, + "step": 7727 + }, + { + "epoch": 1.6484641638225255, + "grad_norm": 0.3016931596550287, + "learning_rate": 0.00015121899270633576, + "loss": 0.9683, + "step": 7728 + }, + { + "epoch": 1.6486774744027304, + "grad_norm": 0.3632843063411119, + "learning_rate": 0.0001512151161323005, + "loss": 0.9786, + "step": 7729 + }, + { + "epoch": 1.6488907849829353, + "grad_norm": 0.3994702787447405, + "learning_rate": 0.000151211238752465, + "loss": 0.9622, + "step": 7730 + }, + { + "epoch": 1.64910409556314, + "grad_norm": 0.5042003974810257, + "learning_rate": 0.00015120736056687317, + "loss": 0.9773, + "step": 7731 + }, + { + "epoch": 1.6493174061433447, + "grad_norm": 0.4893294201909299, + "learning_rate": 0.00015120348157556885, + "loss": 0.9371, + "step": 7732 + }, + { + "epoch": 1.6495307167235493, + "grad_norm": 0.4018595642117538, + "learning_rate": 0.0001511996017785959, + "loss": 0.9675, + "step": 7733 + }, + { + "epoch": 1.6497440273037542, + "grad_norm": 0.3847329803508636, + "learning_rate": 0.00015119572117599828, + "loss": 0.9697, + "step": 7734 + }, + { + "epoch": 1.6499573378839592, + "grad_norm": 0.40468645719491636, + "learning_rate": 0.00015119183976781988, + "loss": 0.9584, + "step": 7735 + }, + { + "epoch": 1.6501706484641638, + "grad_norm": 0.4986668058255649, + "learning_rate": 0.0001511879575541046, + "loss": 0.9428, + "step": 7736 + }, + { + "epoch": 1.6503839590443685, + "grad_norm": 0.3760466851710466, + "learning_rate": 0.0001511840745348964, + "loss": 0.994, + "step": 7737 + }, + { + "epoch": 1.6505972696245734, + "grad_norm": 0.3287639753864557, + "learning_rate": 0.0001511801907102392, + "loss": 0.946, + "step": 7738 + }, + { + "epoch": 1.650810580204778, + "grad_norm": 0.2409245093933023, + "learning_rate": 0.00015117630608017696, + "loss": 0.9564, + "step": 7739 + }, + { + "epoch": 1.651023890784983, + "grad_norm": 0.276091779919234, + "learning_rate": 0.00015117242064475358, + "loss": 0.9843, + "step": 7740 + }, + { + "epoch": 1.6512372013651877, + "grad_norm": 0.3204615207231442, + "learning_rate": 0.0001511685344040131, + "loss": 0.9675, + "step": 7741 + }, + { + "epoch": 1.6514505119453924, + "grad_norm": 0.33569399651114395, + "learning_rate": 0.00015116464735799942, + "loss": 0.9727, + "step": 7742 + }, + { + "epoch": 1.6516638225255973, + "grad_norm": 0.33337263441098086, + "learning_rate": 0.0001511607595067566, + "loss": 0.9874, + "step": 7743 + }, + { + "epoch": 1.6518771331058022, + "grad_norm": 0.41262184940943536, + "learning_rate": 0.00015115687085032854, + "loss": 0.9305, + "step": 7744 + }, + { + "epoch": 1.6520904436860069, + "grad_norm": 0.450986485237287, + "learning_rate": 0.00015115298138875934, + "loss": 0.9393, + "step": 7745 + }, + { + "epoch": 1.6523037542662116, + "grad_norm": 0.5358851653550857, + "learning_rate": 0.00015114909112209292, + "loss": 0.972, + "step": 7746 + }, + { + "epoch": 1.6525170648464163, + "grad_norm": 0.582115885725295, + "learning_rate": 0.00015114520005037337, + "loss": 0.9378, + "step": 7747 + }, + { + "epoch": 1.6527303754266212, + "grad_norm": 0.5511875754860863, + "learning_rate": 0.00015114130817364465, + "loss": 0.974, + "step": 7748 + }, + { + "epoch": 1.652943686006826, + "grad_norm": 0.4711710934318647, + "learning_rate": 0.00015113741549195085, + "loss": 0.9396, + "step": 7749 + }, + { + "epoch": 1.6531569965870307, + "grad_norm": 0.4485550228166465, + "learning_rate": 0.000151133522005336, + "loss": 0.9558, + "step": 7750 + }, + { + "epoch": 1.6533703071672354, + "grad_norm": 0.41000630910999686, + "learning_rate": 0.00015112962771384416, + "loss": 0.9408, + "step": 7751 + }, + { + "epoch": 1.6535836177474401, + "grad_norm": 0.44459437571643773, + "learning_rate": 0.00015112573261751936, + "loss": 0.9491, + "step": 7752 + }, + { + "epoch": 1.653796928327645, + "grad_norm": 0.49589401099291286, + "learning_rate": 0.0001511218367164057, + "loss": 0.9432, + "step": 7753 + }, + { + "epoch": 1.65401023890785, + "grad_norm": 0.5715030242335207, + "learning_rate": 0.00015111794001054728, + "loss": 0.9432, + "step": 7754 + }, + { + "epoch": 1.6542235494880546, + "grad_norm": 0.579369050287127, + "learning_rate": 0.00015111404249998817, + "loss": 0.9469, + "step": 7755 + }, + { + "epoch": 1.6544368600682593, + "grad_norm": 0.5359186732804089, + "learning_rate": 0.00015111014418477245, + "loss": 0.9661, + "step": 7756 + }, + { + "epoch": 1.6546501706484642, + "grad_norm": 0.5076837724417738, + "learning_rate": 0.0001511062450649443, + "loss": 0.9757, + "step": 7757 + }, + { + "epoch": 1.6548634812286689, + "grad_norm": 0.5250393700903925, + "learning_rate": 0.00015110234514054775, + "loss": 0.9576, + "step": 7758 + }, + { + "epoch": 1.6550767918088738, + "grad_norm": 0.44565435009204946, + "learning_rate": 0.00015109844441162697, + "loss": 0.8977, + "step": 7759 + }, + { + "epoch": 1.6552901023890785, + "grad_norm": 0.3735292161894826, + "learning_rate": 0.00015109454287822612, + "loss": 0.9417, + "step": 7760 + }, + { + "epoch": 1.6555034129692832, + "grad_norm": 0.3512615476332735, + "learning_rate": 0.0001510906405403893, + "loss": 0.9764, + "step": 7761 + }, + { + "epoch": 1.655716723549488, + "grad_norm": 0.3350435501782983, + "learning_rate": 0.00015108673739816072, + "loss": 0.9411, + "step": 7762 + }, + { + "epoch": 1.655930034129693, + "grad_norm": 0.3770840712283811, + "learning_rate": 0.00015108283345158444, + "loss": 0.9158, + "step": 7763 + }, + { + "epoch": 1.6561433447098977, + "grad_norm": 0.34661096602538993, + "learning_rate": 0.00015107892870070476, + "loss": 0.9322, + "step": 7764 + }, + { + "epoch": 1.6563566552901023, + "grad_norm": 0.378007012338401, + "learning_rate": 0.0001510750231455658, + "loss": 0.9778, + "step": 7765 + }, + { + "epoch": 1.656569965870307, + "grad_norm": 0.3507206799652332, + "learning_rate": 0.00015107111678621177, + "loss": 0.9356, + "step": 7766 + }, + { + "epoch": 1.656783276450512, + "grad_norm": 0.36697089021944496, + "learning_rate": 0.00015106720962268684, + "loss": 0.9641, + "step": 7767 + }, + { + "epoch": 1.6569965870307168, + "grad_norm": 0.3462102271733866, + "learning_rate": 0.00015106330165503523, + "loss": 0.9605, + "step": 7768 + }, + { + "epoch": 1.6572098976109215, + "grad_norm": 0.3502274566217151, + "learning_rate": 0.0001510593928833012, + "loss": 0.9101, + "step": 7769 + }, + { + "epoch": 1.6574232081911262, + "grad_norm": 0.4186615094569159, + "learning_rate": 0.0001510554833075289, + "loss": 0.9936, + "step": 7770 + }, + { + "epoch": 1.6576365187713311, + "grad_norm": 0.389670232496229, + "learning_rate": 0.00015105157292776263, + "loss": 0.9565, + "step": 7771 + }, + { + "epoch": 1.6578498293515358, + "grad_norm": 0.3037653387463099, + "learning_rate": 0.0001510476617440466, + "loss": 0.9397, + "step": 7772 + }, + { + "epoch": 1.6580631399317407, + "grad_norm": 0.2811039373540693, + "learning_rate": 0.00015104374975642511, + "loss": 0.9444, + "step": 7773 + }, + { + "epoch": 1.6582764505119454, + "grad_norm": 0.3155132622522423, + "learning_rate": 0.0001510398369649424, + "loss": 0.9461, + "step": 7774 + }, + { + "epoch": 1.65848976109215, + "grad_norm": 0.3581581343474515, + "learning_rate": 0.00015103592336964273, + "loss": 0.972, + "step": 7775 + }, + { + "epoch": 1.658703071672355, + "grad_norm": 0.3927782022464357, + "learning_rate": 0.00015103200897057038, + "loss": 0.971, + "step": 7776 + }, + { + "epoch": 1.6589163822525599, + "grad_norm": 0.43191097007123774, + "learning_rate": 0.00015102809376776966, + "loss": 0.9337, + "step": 7777 + }, + { + "epoch": 1.6591296928327646, + "grad_norm": 0.4734243856985165, + "learning_rate": 0.00015102417776128487, + "loss": 0.9721, + "step": 7778 + }, + { + "epoch": 1.6593430034129693, + "grad_norm": 0.4965174244999446, + "learning_rate": 0.0001510202609511603, + "loss": 0.9376, + "step": 7779 + }, + { + "epoch": 1.659556313993174, + "grad_norm": 0.5588367292390752, + "learning_rate": 0.0001510163433374403, + "loss": 0.9733, + "step": 7780 + }, + { + "epoch": 1.6597696245733788, + "grad_norm": 0.5907523476239613, + "learning_rate": 0.00015101242492016918, + "loss": 0.9595, + "step": 7781 + }, + { + "epoch": 1.6599829351535837, + "grad_norm": 0.5978277122796307, + "learning_rate": 0.00015100850569939127, + "loss": 0.928, + "step": 7782 + }, + { + "epoch": 1.6601962457337884, + "grad_norm": 0.565207832821434, + "learning_rate": 0.00015100458567515092, + "loss": 0.9496, + "step": 7783 + }, + { + "epoch": 1.6604095563139931, + "grad_norm": 0.511652524328105, + "learning_rate": 0.0001510006648474925, + "loss": 0.9859, + "step": 7784 + }, + { + "epoch": 1.6606228668941978, + "grad_norm": 0.46027789436894884, + "learning_rate": 0.00015099674321646036, + "loss": 0.9454, + "step": 7785 + }, + { + "epoch": 1.6608361774744027, + "grad_norm": 0.4024603249844509, + "learning_rate": 0.00015099282078209886, + "loss": 0.9711, + "step": 7786 + }, + { + "epoch": 1.6610494880546076, + "grad_norm": 0.3106362479735831, + "learning_rate": 0.00015098889754445243, + "loss": 0.9531, + "step": 7787 + }, + { + "epoch": 1.6612627986348123, + "grad_norm": 0.378491649052409, + "learning_rate": 0.0001509849735035654, + "loss": 0.9684, + "step": 7788 + }, + { + "epoch": 1.661476109215017, + "grad_norm": 0.39943954411930593, + "learning_rate": 0.00015098104865948222, + "loss": 0.936, + "step": 7789 + }, + { + "epoch": 1.661689419795222, + "grad_norm": 0.3111494621495606, + "learning_rate": 0.00015097712301224728, + "loss": 0.9904, + "step": 7790 + }, + { + "epoch": 1.6619027303754266, + "grad_norm": 0.2678153814890066, + "learning_rate": 0.00015097319656190498, + "loss": 0.9355, + "step": 7791 + }, + { + "epoch": 1.6621160409556315, + "grad_norm": 0.38541321464400496, + "learning_rate": 0.0001509692693084998, + "loss": 0.9616, + "step": 7792 + }, + { + "epoch": 1.6623293515358362, + "grad_norm": 0.3844508211943267, + "learning_rate": 0.00015096534125207613, + "loss": 0.9346, + "step": 7793 + }, + { + "epoch": 1.6625426621160408, + "grad_norm": 0.31520887087865507, + "learning_rate": 0.00015096141239267843, + "loss": 0.9719, + "step": 7794 + }, + { + "epoch": 1.6627559726962458, + "grad_norm": 0.31654009997221544, + "learning_rate": 0.00015095748273035115, + "loss": 0.9875, + "step": 7795 + }, + { + "epoch": 1.6629692832764507, + "grad_norm": 0.2983338603859666, + "learning_rate": 0.00015095355226513873, + "loss": 0.9576, + "step": 7796 + }, + { + "epoch": 1.6631825938566553, + "grad_norm": 0.3301013237481699, + "learning_rate": 0.00015094962099708573, + "loss": 0.9379, + "step": 7797 + }, + { + "epoch": 1.66339590443686, + "grad_norm": 0.33274863429720253, + "learning_rate": 0.00015094568892623655, + "loss": 0.9586, + "step": 7798 + }, + { + "epoch": 1.6636092150170647, + "grad_norm": 0.2709268903177063, + "learning_rate": 0.00015094175605263568, + "loss": 0.9177, + "step": 7799 + }, + { + "epoch": 1.6638225255972696, + "grad_norm": 0.23845037720398893, + "learning_rate": 0.00015093782237632768, + "loss": 0.9214, + "step": 7800 + }, + { + "epoch": 1.6640358361774745, + "grad_norm": 0.3279275328507266, + "learning_rate": 0.00015093388789735702, + "loss": 0.9569, + "step": 7801 + }, + { + "epoch": 1.6642491467576792, + "grad_norm": 0.41359817656901304, + "learning_rate": 0.00015092995261576823, + "loss": 0.9752, + "step": 7802 + }, + { + "epoch": 1.664462457337884, + "grad_norm": 0.5238081116949503, + "learning_rate": 0.0001509260165316058, + "loss": 0.9847, + "step": 7803 + }, + { + "epoch": 1.6646757679180886, + "grad_norm": 0.716162888219315, + "learning_rate": 0.00015092207964491434, + "loss": 0.9331, + "step": 7804 + }, + { + "epoch": 1.6648890784982935, + "grad_norm": 0.8803331448727449, + "learning_rate": 0.00015091814195573834, + "loss": 0.9267, + "step": 7805 + }, + { + "epoch": 1.6651023890784984, + "grad_norm": 0.8594236149964585, + "learning_rate": 0.00015091420346412235, + "loss": 0.9403, + "step": 7806 + }, + { + "epoch": 1.665315699658703, + "grad_norm": 0.887767161062565, + "learning_rate": 0.00015091026417011097, + "loss": 0.954, + "step": 7807 + }, + { + "epoch": 1.6655290102389078, + "grad_norm": 0.7910405721222992, + "learning_rate": 0.0001509063240737488, + "loss": 0.9358, + "step": 7808 + }, + { + "epoch": 1.6657423208191127, + "grad_norm": 0.5953851599749216, + "learning_rate": 0.00015090238317508033, + "loss": 0.9486, + "step": 7809 + }, + { + "epoch": 1.6659556313993176, + "grad_norm": 0.38143781178831515, + "learning_rate": 0.00015089844147415024, + "loss": 0.9469, + "step": 7810 + }, + { + "epoch": 1.6661689419795223, + "grad_norm": 0.27613224574360384, + "learning_rate": 0.00015089449897100305, + "loss": 0.9926, + "step": 7811 + }, + { + "epoch": 1.666382252559727, + "grad_norm": 0.2951201789816793, + "learning_rate": 0.00015089055566568343, + "loss": 0.9383, + "step": 7812 + }, + { + "epoch": 1.6665955631399316, + "grad_norm": 0.47182186765812634, + "learning_rate": 0.000150886611558236, + "loss": 0.9665, + "step": 7813 + }, + { + "epoch": 1.6668088737201365, + "grad_norm": 0.59404295471609, + "learning_rate": 0.0001508826666487053, + "loss": 0.916, + "step": 7814 + }, + { + "epoch": 1.6670221843003414, + "grad_norm": 0.5736098473436342, + "learning_rate": 0.00015087872093713612, + "loss": 0.9545, + "step": 7815 + }, + { + "epoch": 1.6672354948805461, + "grad_norm": 0.49878832229995806, + "learning_rate": 0.000150874774423573, + "loss": 0.9365, + "step": 7816 + }, + { + "epoch": 1.6674488054607508, + "grad_norm": 0.4809879396001061, + "learning_rate": 0.0001508708271080606, + "loss": 0.9676, + "step": 7817 + }, + { + "epoch": 1.6676621160409555, + "grad_norm": 0.504035299286146, + "learning_rate": 0.00015086687899064358, + "loss": 0.9537, + "step": 7818 + }, + { + "epoch": 1.6678754266211604, + "grad_norm": 0.3556357986646363, + "learning_rate": 0.00015086293007136666, + "loss": 0.9334, + "step": 7819 + }, + { + "epoch": 1.6680887372013653, + "grad_norm": 0.35098878493148034, + "learning_rate": 0.00015085898035027448, + "loss": 0.9255, + "step": 7820 + }, + { + "epoch": 1.66830204778157, + "grad_norm": 0.35852875675577683, + "learning_rate": 0.00015085502982741173, + "loss": 0.921, + "step": 7821 + }, + { + "epoch": 1.6685153583617747, + "grad_norm": 0.3408704394014859, + "learning_rate": 0.00015085107850282316, + "loss": 0.9497, + "step": 7822 + }, + { + "epoch": 1.6687286689419796, + "grad_norm": 0.45821244393502036, + "learning_rate": 0.00015084712637655344, + "loss": 0.9333, + "step": 7823 + }, + { + "epoch": 1.6689419795221843, + "grad_norm": 0.3964878235768899, + "learning_rate": 0.00015084317344864727, + "loss": 0.9335, + "step": 7824 + }, + { + "epoch": 1.6691552901023892, + "grad_norm": 0.24598213090295928, + "learning_rate": 0.00015083921971914942, + "loss": 0.9572, + "step": 7825 + }, + { + "epoch": 1.6693686006825939, + "grad_norm": 0.32163425589796996, + "learning_rate": 0.0001508352651881046, + "loss": 0.9636, + "step": 7826 + }, + { + "epoch": 1.6695819112627985, + "grad_norm": 0.31084348841358334, + "learning_rate": 0.00015083130985555756, + "loss": 0.9353, + "step": 7827 + }, + { + "epoch": 1.6697952218430034, + "grad_norm": 0.2908053023601909, + "learning_rate": 0.00015082735372155308, + "loss": 0.937, + "step": 7828 + }, + { + "epoch": 1.6700085324232083, + "grad_norm": 0.4281666512545163, + "learning_rate": 0.00015082339678613586, + "loss": 0.9722, + "step": 7829 + }, + { + "epoch": 1.670221843003413, + "grad_norm": 0.4647956239944411, + "learning_rate": 0.00015081943904935076, + "loss": 0.9795, + "step": 7830 + }, + { + "epoch": 1.6704351535836177, + "grad_norm": 0.5271664753261338, + "learning_rate": 0.0001508154805112425, + "loss": 0.9513, + "step": 7831 + }, + { + "epoch": 1.6706484641638224, + "grad_norm": 0.5148351247024716, + "learning_rate": 0.00015081152117185588, + "loss": 0.931, + "step": 7832 + }, + { + "epoch": 1.6708617747440273, + "grad_norm": 0.35505655533198, + "learning_rate": 0.00015080756103123572, + "loss": 0.93, + "step": 7833 + }, + { + "epoch": 1.6710750853242322, + "grad_norm": 0.2490229584829943, + "learning_rate": 0.0001508036000894268, + "loss": 0.9506, + "step": 7834 + }, + { + "epoch": 1.671288395904437, + "grad_norm": 0.34712849881260144, + "learning_rate": 0.00015079963834647395, + "loss": 0.9344, + "step": 7835 + }, + { + "epoch": 1.6715017064846416, + "grad_norm": 0.4466618800128417, + "learning_rate": 0.000150795675802422, + "loss": 0.9487, + "step": 7836 + }, + { + "epoch": 1.6717150170648463, + "grad_norm": 0.528569693129937, + "learning_rate": 0.0001507917124573158, + "loss": 0.9606, + "step": 7837 + }, + { + "epoch": 1.6719283276450512, + "grad_norm": 0.6712934868409915, + "learning_rate": 0.0001507877483112002, + "loss": 0.9707, + "step": 7838 + }, + { + "epoch": 1.672141638225256, + "grad_norm": 0.8132007669400417, + "learning_rate": 0.00015078378336412, + "loss": 0.9466, + "step": 7839 + }, + { + "epoch": 1.6723549488054608, + "grad_norm": 0.8272678699537774, + "learning_rate": 0.00015077981761612013, + "loss": 0.9567, + "step": 7840 + }, + { + "epoch": 1.6725682593856654, + "grad_norm": 0.7249090044694868, + "learning_rate": 0.00015077585106724542, + "loss": 0.9661, + "step": 7841 + }, + { + "epoch": 1.6727815699658704, + "grad_norm": 0.5549392773489907, + "learning_rate": 0.00015077188371754075, + "loss": 0.9305, + "step": 7842 + }, + { + "epoch": 1.672994880546075, + "grad_norm": 0.3679271863686851, + "learning_rate": 0.00015076791556705105, + "loss": 0.9518, + "step": 7843 + }, + { + "epoch": 1.67320819112628, + "grad_norm": 0.308335265989871, + "learning_rate": 0.00015076394661582118, + "loss": 0.935, + "step": 7844 + }, + { + "epoch": 1.6734215017064846, + "grad_norm": 0.34764399568742593, + "learning_rate": 0.00015075997686389607, + "loss": 0.9604, + "step": 7845 + }, + { + "epoch": 1.6736348122866893, + "grad_norm": 0.4858306136138085, + "learning_rate": 0.00015075600631132063, + "loss": 0.9408, + "step": 7846 + }, + { + "epoch": 1.6738481228668942, + "grad_norm": 0.5105480081916998, + "learning_rate": 0.0001507520349581398, + "loss": 0.9836, + "step": 7847 + }, + { + "epoch": 1.6740614334470991, + "grad_norm": 0.4754752259140558, + "learning_rate": 0.00015074806280439848, + "loss": 0.9553, + "step": 7848 + }, + { + "epoch": 1.6742747440273038, + "grad_norm": 0.3536198177520672, + "learning_rate": 0.00015074408985014167, + "loss": 0.9335, + "step": 7849 + }, + { + "epoch": 1.6744880546075085, + "grad_norm": 0.2615232173855098, + "learning_rate": 0.00015074011609541427, + "loss": 0.9728, + "step": 7850 + }, + { + "epoch": 1.6747013651877132, + "grad_norm": 0.3015575934250603, + "learning_rate": 0.00015073614154026128, + "loss": 0.9176, + "step": 7851 + }, + { + "epoch": 1.674914675767918, + "grad_norm": 0.33412302670722444, + "learning_rate": 0.00015073216618472765, + "loss": 0.9463, + "step": 7852 + }, + { + "epoch": 1.675127986348123, + "grad_norm": 0.3504151471673032, + "learning_rate": 0.00015072819002885838, + "loss": 0.9488, + "step": 7853 + }, + { + "epoch": 1.6753412969283277, + "grad_norm": 0.4193579527658922, + "learning_rate": 0.00015072421307269843, + "loss": 0.9568, + "step": 7854 + }, + { + "epoch": 1.6755546075085324, + "grad_norm": 0.4943886640067645, + "learning_rate": 0.00015072023531629284, + "loss": 0.9367, + "step": 7855 + }, + { + "epoch": 1.675767918088737, + "grad_norm": 0.462104358346051, + "learning_rate": 0.0001507162567596866, + "loss": 0.9192, + "step": 7856 + }, + { + "epoch": 1.675981228668942, + "grad_norm": 0.4535142062882519, + "learning_rate": 0.0001507122774029247, + "loss": 0.9394, + "step": 7857 + }, + { + "epoch": 1.6761945392491469, + "grad_norm": 0.469157521698188, + "learning_rate": 0.0001507082972460522, + "loss": 0.9746, + "step": 7858 + }, + { + "epoch": 1.6764078498293515, + "grad_norm": 0.49440554837476075, + "learning_rate": 0.00015070431628911413, + "loss": 0.9273, + "step": 7859 + }, + { + "epoch": 1.6766211604095562, + "grad_norm": 0.48955745464283396, + "learning_rate": 0.00015070033453215556, + "loss": 0.9292, + "step": 7860 + }, + { + "epoch": 1.6768344709897611, + "grad_norm": 0.4147786254451718, + "learning_rate": 0.00015069635197522148, + "loss": 0.9606, + "step": 7861 + }, + { + "epoch": 1.677047781569966, + "grad_norm": 0.31863556997886544, + "learning_rate": 0.000150692368618357, + "loss": 0.9327, + "step": 7862 + }, + { + "epoch": 1.6772610921501707, + "grad_norm": 0.25008757071126736, + "learning_rate": 0.00015068838446160717, + "loss": 0.9275, + "step": 7863 + }, + { + "epoch": 1.6774744027303754, + "grad_norm": 0.2743342143121903, + "learning_rate": 0.0001506843995050171, + "loss": 0.9687, + "step": 7864 + }, + { + "epoch": 1.67768771331058, + "grad_norm": 0.3860774674085208, + "learning_rate": 0.00015068041374863186, + "loss": 0.9116, + "step": 7865 + }, + { + "epoch": 1.677901023890785, + "grad_norm": 0.4899982268671588, + "learning_rate": 0.00015067642719249652, + "loss": 0.9596, + "step": 7866 + }, + { + "epoch": 1.67811433447099, + "grad_norm": 0.5280873444509426, + "learning_rate": 0.00015067243983665624, + "loss": 0.9514, + "step": 7867 + }, + { + "epoch": 1.6783276450511946, + "grad_norm": 0.4840946139077758, + "learning_rate": 0.00015066845168115612, + "loss": 0.9676, + "step": 7868 + }, + { + "epoch": 1.6785409556313993, + "grad_norm": 0.37961467549511024, + "learning_rate": 0.00015066446272604127, + "loss": 0.9651, + "step": 7869 + }, + { + "epoch": 1.678754266211604, + "grad_norm": 0.3021089680237717, + "learning_rate": 0.00015066047297135685, + "loss": 0.9738, + "step": 7870 + }, + { + "epoch": 1.6789675767918089, + "grad_norm": 0.2743186912646017, + "learning_rate": 0.000150656482417148, + "loss": 0.9303, + "step": 7871 + }, + { + "epoch": 1.6791808873720138, + "grad_norm": 0.24463784692571017, + "learning_rate": 0.0001506524910634598, + "loss": 0.9548, + "step": 7872 + }, + { + "epoch": 1.6793941979522184, + "grad_norm": 0.31805119042335656, + "learning_rate": 0.00015064849891033755, + "loss": 0.9286, + "step": 7873 + }, + { + "epoch": 1.6796075085324231, + "grad_norm": 0.3601517474358085, + "learning_rate": 0.0001506445059578263, + "loss": 0.9671, + "step": 7874 + }, + { + "epoch": 1.679820819112628, + "grad_norm": 0.3938667891004059, + "learning_rate": 0.00015064051220597128, + "loss": 0.9344, + "step": 7875 + }, + { + "epoch": 1.6800341296928327, + "grad_norm": 0.3796923097939457, + "learning_rate": 0.0001506365176548177, + "loss": 0.9739, + "step": 7876 + }, + { + "epoch": 1.6802474402730376, + "grad_norm": 0.31392214439411764, + "learning_rate": 0.0001506325223044107, + "loss": 0.9764, + "step": 7877 + }, + { + "epoch": 1.6804607508532423, + "grad_norm": 0.29909696977313804, + "learning_rate": 0.00015062852615479555, + "loss": 0.9611, + "step": 7878 + }, + { + "epoch": 1.680674061433447, + "grad_norm": 0.333335185622173, + "learning_rate": 0.00015062452920601744, + "loss": 0.9433, + "step": 7879 + }, + { + "epoch": 1.680887372013652, + "grad_norm": 0.33730327923243536, + "learning_rate": 0.0001506205314581216, + "loss": 0.9355, + "step": 7880 + }, + { + "epoch": 1.6811006825938568, + "grad_norm": 0.3224414398614593, + "learning_rate": 0.00015061653291115323, + "loss": 0.9507, + "step": 7881 + }, + { + "epoch": 1.6813139931740615, + "grad_norm": 0.2930436884939475, + "learning_rate": 0.0001506125335651576, + "loss": 0.949, + "step": 7882 + }, + { + "epoch": 1.6815273037542662, + "grad_norm": 0.34898926690207166, + "learning_rate": 0.00015060853342018, + "loss": 0.9572, + "step": 7883 + }, + { + "epoch": 1.6817406143344709, + "grad_norm": 0.3355272861454895, + "learning_rate": 0.00015060453247626562, + "loss": 0.939, + "step": 7884 + }, + { + "epoch": 1.6819539249146758, + "grad_norm": 0.2759057389051484, + "learning_rate": 0.0001506005307334598, + "loss": 0.9109, + "step": 7885 + }, + { + "epoch": 1.6821672354948807, + "grad_norm": 0.3220086941230455, + "learning_rate": 0.00015059652819180778, + "loss": 0.9841, + "step": 7886 + }, + { + "epoch": 1.6823805460750854, + "grad_norm": 0.360011207460076, + "learning_rate": 0.00015059252485135485, + "loss": 0.95, + "step": 7887 + }, + { + "epoch": 1.68259385665529, + "grad_norm": 0.3015937059788902, + "learning_rate": 0.0001505885207121463, + "loss": 0.9141, + "step": 7888 + }, + { + "epoch": 1.6828071672354947, + "grad_norm": 0.27591079361153925, + "learning_rate": 0.00015058451577422747, + "loss": 0.9573, + "step": 7889 + }, + { + "epoch": 1.6830204778156996, + "grad_norm": 0.3191541939476875, + "learning_rate": 0.00015058051003764367, + "loss": 0.9471, + "step": 7890 + }, + { + "epoch": 1.6832337883959045, + "grad_norm": 0.3364481500563284, + "learning_rate": 0.00015057650350244018, + "loss": 0.9386, + "step": 7891 + }, + { + "epoch": 1.6834470989761092, + "grad_norm": 0.31729338616940467, + "learning_rate": 0.00015057249616866238, + "loss": 0.9563, + "step": 7892 + }, + { + "epoch": 1.683660409556314, + "grad_norm": 0.2845093371460987, + "learning_rate": 0.0001505684880363556, + "loss": 0.9609, + "step": 7893 + }, + { + "epoch": 1.6838737201365188, + "grad_norm": 1.1254048666774819, + "learning_rate": 0.0001505644791055652, + "loss": 0.945, + "step": 7894 + }, + { + "epoch": 1.6840870307167235, + "grad_norm": 0.29942748127108026, + "learning_rate": 0.0001505604693763365, + "loss": 0.9856, + "step": 7895 + }, + { + "epoch": 1.6843003412969284, + "grad_norm": 0.3598568832735755, + "learning_rate": 0.00015055645884871492, + "loss": 0.947, + "step": 7896 + }, + { + "epoch": 1.684513651877133, + "grad_norm": 0.3274219651075977, + "learning_rate": 0.00015055244752274584, + "loss": 0.9645, + "step": 7897 + }, + { + "epoch": 1.6847269624573378, + "grad_norm": 0.3181036454411078, + "learning_rate": 0.0001505484353984746, + "loss": 0.9808, + "step": 7898 + }, + { + "epoch": 1.6849402730375427, + "grad_norm": 0.3427313770029097, + "learning_rate": 0.00015054442247594663, + "loss": 0.9669, + "step": 7899 + }, + { + "epoch": 1.6851535836177476, + "grad_norm": 0.39420689434201045, + "learning_rate": 0.00015054040875520734, + "loss": 0.9706, + "step": 7900 + }, + { + "epoch": 1.6853668941979523, + "grad_norm": 0.3855264905470181, + "learning_rate": 0.00015053639423630216, + "loss": 0.9681, + "step": 7901 + }, + { + "epoch": 1.685580204778157, + "grad_norm": 0.36871107870214004, + "learning_rate": 0.00015053237891927648, + "loss": 0.951, + "step": 7902 + }, + { + "epoch": 1.6857935153583616, + "grad_norm": 0.4051792453948067, + "learning_rate": 0.00015052836280417573, + "loss": 0.9689, + "step": 7903 + }, + { + "epoch": 1.6860068259385665, + "grad_norm": 0.5085276752301857, + "learning_rate": 0.00015052434589104538, + "loss": 0.9319, + "step": 7904 + }, + { + "epoch": 1.6862201365187715, + "grad_norm": 0.5992822413995794, + "learning_rate": 0.00015052032817993086, + "loss": 0.9849, + "step": 7905 + }, + { + "epoch": 1.6864334470989761, + "grad_norm": 0.7226732968404824, + "learning_rate": 0.00015051630967087765, + "loss": 0.9882, + "step": 7906 + }, + { + "epoch": 1.6866467576791808, + "grad_norm": 0.7966297150934409, + "learning_rate": 0.00015051229036393122, + "loss": 0.9901, + "step": 7907 + }, + { + "epoch": 1.6868600682593855, + "grad_norm": 0.8262573591884728, + "learning_rate": 0.00015050827025913703, + "loss": 0.9646, + "step": 7908 + }, + { + "epoch": 1.6870733788395904, + "grad_norm": 0.8078779740399349, + "learning_rate": 0.00015050424935654055, + "loss": 0.9864, + "step": 7909 + }, + { + "epoch": 1.6872866894197953, + "grad_norm": 0.7710044357613307, + "learning_rate": 0.00015050022765618733, + "loss": 0.9548, + "step": 7910 + }, + { + "epoch": 1.6875, + "grad_norm": 0.6465459096685462, + "learning_rate": 0.00015049620515812285, + "loss": 0.9619, + "step": 7911 + }, + { + "epoch": 1.6877133105802047, + "grad_norm": 0.5058945273189099, + "learning_rate": 0.00015049218186239264, + "loss": 0.9433, + "step": 7912 + }, + { + "epoch": 1.6879266211604096, + "grad_norm": 0.4190004049206725, + "learning_rate": 0.0001504881577690422, + "loss": 0.9611, + "step": 7913 + }, + { + "epoch": 1.6881399317406145, + "grad_norm": 0.4294032284116429, + "learning_rate": 0.00015048413287811703, + "loss": 0.9398, + "step": 7914 + }, + { + "epoch": 1.6883532423208192, + "grad_norm": 0.38556773842546815, + "learning_rate": 0.00015048010718966273, + "loss": 0.9882, + "step": 7915 + }, + { + "epoch": 1.6885665529010239, + "grad_norm": 0.36414982534596824, + "learning_rate": 0.00015047608070372485, + "loss": 0.9671, + "step": 7916 + }, + { + "epoch": 1.6887798634812285, + "grad_norm": 0.3454212070524081, + "learning_rate": 0.0001504720534203489, + "loss": 0.9561, + "step": 7917 + }, + { + "epoch": 1.6889931740614335, + "grad_norm": 0.2801695930569139, + "learning_rate": 0.00015046802533958051, + "loss": 0.9593, + "step": 7918 + }, + { + "epoch": 1.6892064846416384, + "grad_norm": 0.3165998672234257, + "learning_rate": 0.0001504639964614652, + "loss": 0.9655, + "step": 7919 + }, + { + "epoch": 1.689419795221843, + "grad_norm": 0.4007327952695342, + "learning_rate": 0.0001504599667860486, + "loss": 0.9687, + "step": 7920 + }, + { + "epoch": 1.6896331058020477, + "grad_norm": 0.4084321076295594, + "learning_rate": 0.0001504559363133763, + "loss": 0.9464, + "step": 7921 + }, + { + "epoch": 1.6898464163822524, + "grad_norm": 0.4421279257712267, + "learning_rate": 0.00015045190504349388, + "loss": 0.976, + "step": 7922 + }, + { + "epoch": 1.6900597269624573, + "grad_norm": 0.3795106362524627, + "learning_rate": 0.00015044787297644697, + "loss": 0.9389, + "step": 7923 + }, + { + "epoch": 1.6902730375426622, + "grad_norm": 0.33779024653816186, + "learning_rate": 0.0001504438401122812, + "loss": 0.97, + "step": 7924 + }, + { + "epoch": 1.690486348122867, + "grad_norm": 0.33102790743394606, + "learning_rate": 0.00015043980645104222, + "loss": 0.9725, + "step": 7925 + }, + { + "epoch": 1.6906996587030716, + "grad_norm": 0.3890862726610118, + "learning_rate": 0.0001504357719927756, + "loss": 0.9628, + "step": 7926 + }, + { + "epoch": 1.6909129692832765, + "grad_norm": 0.3143890940585629, + "learning_rate": 0.00015043173673752705, + "loss": 0.9265, + "step": 7927 + }, + { + "epoch": 1.6911262798634812, + "grad_norm": 0.24546752955387074, + "learning_rate": 0.00015042770068534223, + "loss": 0.9625, + "step": 7928 + }, + { + "epoch": 1.691339590443686, + "grad_norm": 0.42869363401590777, + "learning_rate": 0.00015042366383626675, + "loss": 0.8866, + "step": 7929 + }, + { + "epoch": 1.6915529010238908, + "grad_norm": 0.5738001856007302, + "learning_rate": 0.00015041962619034637, + "loss": 0.943, + "step": 7930 + }, + { + "epoch": 1.6917662116040955, + "grad_norm": 0.4749832442612451, + "learning_rate": 0.00015041558774762673, + "loss": 0.9782, + "step": 7931 + }, + { + "epoch": 1.6919795221843004, + "grad_norm": 0.3334399008706469, + "learning_rate": 0.00015041154850815354, + "loss": 0.9448, + "step": 7932 + }, + { + "epoch": 1.6921928327645053, + "grad_norm": 0.27617735655591263, + "learning_rate": 0.00015040750847197245, + "loss": 0.9381, + "step": 7933 + }, + { + "epoch": 1.69240614334471, + "grad_norm": 0.3005720622316531, + "learning_rate": 0.00015040346763912926, + "loss": 0.9472, + "step": 7934 + }, + { + "epoch": 1.6926194539249146, + "grad_norm": 0.34319671275211877, + "learning_rate": 0.00015039942600966963, + "loss": 0.9835, + "step": 7935 + }, + { + "epoch": 1.6928327645051193, + "grad_norm": 0.3734448557353872, + "learning_rate": 0.00015039538358363932, + "loss": 0.9884, + "step": 7936 + }, + { + "epoch": 1.6930460750853242, + "grad_norm": 0.3808647091136955, + "learning_rate": 0.00015039134036108406, + "loss": 0.9715, + "step": 7937 + }, + { + "epoch": 1.6932593856655291, + "grad_norm": 0.28894251965415085, + "learning_rate": 0.0001503872963420496, + "loss": 0.9508, + "step": 7938 + }, + { + "epoch": 1.6934726962457338, + "grad_norm": 0.3198873335017952, + "learning_rate": 0.00015038325152658168, + "loss": 0.9487, + "step": 7939 + }, + { + "epoch": 1.6936860068259385, + "grad_norm": 0.4405630812520188, + "learning_rate": 0.0001503792059147261, + "loss": 0.9622, + "step": 7940 + }, + { + "epoch": 1.6938993174061432, + "grad_norm": 0.45399035814497, + "learning_rate": 0.00015037515950652863, + "loss": 0.9451, + "step": 7941 + }, + { + "epoch": 1.694112627986348, + "grad_norm": 0.362503083557583, + "learning_rate": 0.00015037111230203502, + "loss": 0.9405, + "step": 7942 + }, + { + "epoch": 1.694325938566553, + "grad_norm": 0.3603557554000113, + "learning_rate": 0.00015036706430129112, + "loss": 0.9362, + "step": 7943 + }, + { + "epoch": 1.6945392491467577, + "grad_norm": 0.4567482058007261, + "learning_rate": 0.0001503630155043427, + "loss": 0.9389, + "step": 7944 + }, + { + "epoch": 1.6947525597269624, + "grad_norm": 0.49019512232021717, + "learning_rate": 0.00015035896591123553, + "loss": 0.9254, + "step": 7945 + }, + { + "epoch": 1.6949658703071673, + "grad_norm": 0.46190248748803414, + "learning_rate": 0.00015035491552201554, + "loss": 0.9585, + "step": 7946 + }, + { + "epoch": 1.695179180887372, + "grad_norm": 0.485796270684929, + "learning_rate": 0.00015035086433672846, + "loss": 0.9715, + "step": 7947 + }, + { + "epoch": 1.6953924914675769, + "grad_norm": 0.5218533391803535, + "learning_rate": 0.00015034681235542017, + "loss": 0.933, + "step": 7948 + }, + { + "epoch": 1.6956058020477816, + "grad_norm": 0.4860903347508498, + "learning_rate": 0.00015034275957813656, + "loss": 0.9716, + "step": 7949 + }, + { + "epoch": 1.6958191126279862, + "grad_norm": 0.48056488826559046, + "learning_rate": 0.0001503387060049234, + "loss": 0.9468, + "step": 7950 + }, + { + "epoch": 1.6960324232081911, + "grad_norm": 0.4374657667906347, + "learning_rate": 0.00015033465163582658, + "loss": 0.9504, + "step": 7951 + }, + { + "epoch": 1.696245733788396, + "grad_norm": 0.4289342792635917, + "learning_rate": 0.00015033059647089203, + "loss": 0.9567, + "step": 7952 + }, + { + "epoch": 1.6964590443686007, + "grad_norm": 0.3782543274614441, + "learning_rate": 0.0001503265405101656, + "loss": 0.942, + "step": 7953 + }, + { + "epoch": 1.6966723549488054, + "grad_norm": 0.3593067708306635, + "learning_rate": 0.00015032248375369315, + "loss": 0.93, + "step": 7954 + }, + { + "epoch": 1.69688566552901, + "grad_norm": 0.3583741072518774, + "learning_rate": 0.00015031842620152064, + "loss": 0.9523, + "step": 7955 + }, + { + "epoch": 1.697098976109215, + "grad_norm": 0.3344861130133938, + "learning_rate": 0.00015031436785369396, + "loss": 0.9463, + "step": 7956 + }, + { + "epoch": 1.69731228668942, + "grad_norm": 0.3321223377907217, + "learning_rate": 0.000150310308710259, + "loss": 0.9452, + "step": 7957 + }, + { + "epoch": 1.6975255972696246, + "grad_norm": 0.3265264357021094, + "learning_rate": 0.0001503062487712617, + "loss": 0.969, + "step": 7958 + }, + { + "epoch": 1.6977389078498293, + "grad_norm": 0.43835567506746537, + "learning_rate": 0.00015030218803674804, + "loss": 0.9609, + "step": 7959 + }, + { + "epoch": 1.697952218430034, + "grad_norm": 0.524108510905157, + "learning_rate": 0.00015029812650676396, + "loss": 0.9643, + "step": 7960 + }, + { + "epoch": 1.6981655290102389, + "grad_norm": 0.6607029522964025, + "learning_rate": 0.00015029406418135536, + "loss": 0.9219, + "step": 7961 + }, + { + "epoch": 1.6983788395904438, + "grad_norm": 0.7133889404620374, + "learning_rate": 0.00015029000106056824, + "loss": 0.9898, + "step": 7962 + }, + { + "epoch": 1.6985921501706485, + "grad_norm": 0.6741721786487789, + "learning_rate": 0.0001502859371444486, + "loss": 0.9731, + "step": 7963 + }, + { + "epoch": 1.6988054607508531, + "grad_norm": 0.7040818388752367, + "learning_rate": 0.0001502818724330424, + "loss": 0.9511, + "step": 7964 + }, + { + "epoch": 1.699018771331058, + "grad_norm": 0.7119844952620143, + "learning_rate": 0.00015027780692639561, + "loss": 0.9305, + "step": 7965 + }, + { + "epoch": 1.699232081911263, + "grad_norm": 0.7432847289361715, + "learning_rate": 0.00015027374062455428, + "loss": 0.9681, + "step": 7966 + }, + { + "epoch": 1.6994453924914676, + "grad_norm": 0.7386141402123241, + "learning_rate": 0.00015026967352756436, + "loss": 0.9514, + "step": 7967 + }, + { + "epoch": 1.6996587030716723, + "grad_norm": 0.605724205101935, + "learning_rate": 0.00015026560563547193, + "loss": 0.9572, + "step": 7968 + }, + { + "epoch": 1.699872013651877, + "grad_norm": 0.5135013626311956, + "learning_rate": 0.00015026153694832297, + "loss": 0.9521, + "step": 7969 + }, + { + "epoch": 1.700085324232082, + "grad_norm": 0.4332330840944184, + "learning_rate": 0.00015025746746616354, + "loss": 0.9343, + "step": 7970 + }, + { + "epoch": 1.7002986348122868, + "grad_norm": 0.4045381877438394, + "learning_rate": 0.0001502533971890397, + "loss": 0.9513, + "step": 7971 + }, + { + "epoch": 1.7005119453924915, + "grad_norm": 0.5177702247446725, + "learning_rate": 0.00015024932611699748, + "loss": 0.9385, + "step": 7972 + }, + { + "epoch": 1.7007252559726962, + "grad_norm": 0.5640718661469086, + "learning_rate": 0.00015024525425008292, + "loss": 0.9488, + "step": 7973 + }, + { + "epoch": 1.7009385665529009, + "grad_norm": 0.5378400972656807, + "learning_rate": 0.00015024118158834217, + "loss": 0.9721, + "step": 7974 + }, + { + "epoch": 1.7011518771331058, + "grad_norm": 0.4547322622078015, + "learning_rate": 0.0001502371081318213, + "loss": 0.9458, + "step": 7975 + }, + { + "epoch": 1.7013651877133107, + "grad_norm": 0.38119235789951356, + "learning_rate": 0.0001502330338805663, + "loss": 0.9719, + "step": 7976 + }, + { + "epoch": 1.7015784982935154, + "grad_norm": 0.34810499693193475, + "learning_rate": 0.00015022895883462338, + "loss": 0.9649, + "step": 7977 + }, + { + "epoch": 1.70179180887372, + "grad_norm": 0.3750302800611656, + "learning_rate": 0.00015022488299403858, + "loss": 0.9639, + "step": 7978 + }, + { + "epoch": 1.702005119453925, + "grad_norm": 0.360476717509828, + "learning_rate": 0.00015022080635885808, + "loss": 0.9441, + "step": 7979 + }, + { + "epoch": 1.7022184300341296, + "grad_norm": 0.309278612854402, + "learning_rate": 0.00015021672892912794, + "loss": 0.9245, + "step": 7980 + }, + { + "epoch": 1.7024317406143346, + "grad_norm": 0.31004393877627306, + "learning_rate": 0.00015021265070489436, + "loss": 0.9221, + "step": 7981 + }, + { + "epoch": 1.7026450511945392, + "grad_norm": 0.3917246812522616, + "learning_rate": 0.00015020857168620346, + "loss": 0.9285, + "step": 7982 + }, + { + "epoch": 1.702858361774744, + "grad_norm": 0.5217062374768623, + "learning_rate": 0.00015020449187310136, + "loss": 0.9739, + "step": 7983 + }, + { + "epoch": 1.7030716723549488, + "grad_norm": 0.5672548719217088, + "learning_rate": 0.00015020041126563428, + "loss": 0.9487, + "step": 7984 + }, + { + "epoch": 1.7032849829351537, + "grad_norm": 0.5306525205584143, + "learning_rate": 0.00015019632986384835, + "loss": 0.9577, + "step": 7985 + }, + { + "epoch": 1.7034982935153584, + "grad_norm": 0.5743160729255825, + "learning_rate": 0.00015019224766778976, + "loss": 0.9382, + "step": 7986 + }, + { + "epoch": 1.703711604095563, + "grad_norm": 0.5746044501570796, + "learning_rate": 0.00015018816467750473, + "loss": 0.9121, + "step": 7987 + }, + { + "epoch": 1.7039249146757678, + "grad_norm": 0.45945897351259146, + "learning_rate": 0.00015018408089303943, + "loss": 0.9879, + "step": 7988 + }, + { + "epoch": 1.7041382252559727, + "grad_norm": 0.323337118751079, + "learning_rate": 0.00015017999631444006, + "loss": 0.9372, + "step": 7989 + }, + { + "epoch": 1.7043515358361776, + "grad_norm": 0.3068566273720973, + "learning_rate": 0.0001501759109417529, + "loss": 0.933, + "step": 7990 + }, + { + "epoch": 1.7045648464163823, + "grad_norm": 0.298670330153295, + "learning_rate": 0.00015017182477502407, + "loss": 0.9947, + "step": 7991 + }, + { + "epoch": 1.704778156996587, + "grad_norm": 0.3609143055324168, + "learning_rate": 0.00015016773781429988, + "loss": 0.972, + "step": 7992 + }, + { + "epoch": 1.7049914675767917, + "grad_norm": 0.39016769431072995, + "learning_rate": 0.00015016365005962655, + "loss": 0.937, + "step": 7993 + }, + { + "epoch": 1.7052047781569966, + "grad_norm": 0.37458727335556596, + "learning_rate": 0.00015015956151105035, + "loss": 0.9511, + "step": 7994 + }, + { + "epoch": 1.7054180887372015, + "grad_norm": 0.35166048895018953, + "learning_rate": 0.00015015547216861753, + "loss": 0.9044, + "step": 7995 + }, + { + "epoch": 1.7056313993174061, + "grad_norm": 0.33348896547041623, + "learning_rate": 0.00015015138203237438, + "loss": 0.9492, + "step": 7996 + }, + { + "epoch": 1.7058447098976108, + "grad_norm": 0.33908123072372, + "learning_rate": 0.00015014729110236714, + "loss": 0.9515, + "step": 7997 + }, + { + "epoch": 1.7060580204778157, + "grad_norm": 0.35885600131799883, + "learning_rate": 0.00015014319937864214, + "loss": 0.9428, + "step": 7998 + }, + { + "epoch": 1.7062713310580204, + "grad_norm": 0.3647934829643452, + "learning_rate": 0.0001501391068612456, + "loss": 0.9981, + "step": 7999 + }, + { + "epoch": 1.7064846416382253, + "grad_norm": 0.25605415698378464, + "learning_rate": 0.00015013501355022396, + "loss": 0.959, + "step": 8000 + }, + { + "epoch": 1.70669795221843, + "grad_norm": 0.29188916883202504, + "learning_rate": 0.00015013091944562343, + "loss": 0.9526, + "step": 8001 + }, + { + "epoch": 1.7069112627986347, + "grad_norm": 0.365658202735114, + "learning_rate": 0.0001501268245474904, + "loss": 0.9477, + "step": 8002 + }, + { + "epoch": 1.7071245733788396, + "grad_norm": 0.33259280765593036, + "learning_rate": 0.0001501227288558711, + "loss": 0.9135, + "step": 8003 + }, + { + "epoch": 1.7073378839590445, + "grad_norm": 0.3045164193331709, + "learning_rate": 0.000150118632370812, + "loss": 0.9616, + "step": 8004 + }, + { + "epoch": 1.7075511945392492, + "grad_norm": 0.29016350471656865, + "learning_rate": 0.00015011453509235938, + "loss": 0.9379, + "step": 8005 + }, + { + "epoch": 1.7077645051194539, + "grad_norm": 0.3263710183280261, + "learning_rate": 0.00015011043702055962, + "loss": 0.9507, + "step": 8006 + }, + { + "epoch": 1.7079778156996586, + "grad_norm": 0.3441160410253801, + "learning_rate": 0.00015010633815545907, + "loss": 0.969, + "step": 8007 + }, + { + "epoch": 1.7081911262798635, + "grad_norm": 0.37144558447332365, + "learning_rate": 0.00015010223849710414, + "loss": 0.9403, + "step": 8008 + }, + { + "epoch": 1.7084044368600684, + "grad_norm": 0.31303040423655737, + "learning_rate": 0.00015009813804554118, + "loss": 0.9484, + "step": 8009 + }, + { + "epoch": 1.708617747440273, + "grad_norm": 0.2629303824820177, + "learning_rate": 0.00015009403680081663, + "loss": 0.9508, + "step": 8010 + }, + { + "epoch": 1.7088310580204777, + "grad_norm": 0.3227911666552158, + "learning_rate": 0.00015008993476297688, + "loss": 0.9448, + "step": 8011 + }, + { + "epoch": 1.7090443686006824, + "grad_norm": 0.3915682493751455, + "learning_rate": 0.00015008583193206832, + "loss": 0.9208, + "step": 8012 + }, + { + "epoch": 1.7092576791808873, + "grad_norm": 0.41254211794045714, + "learning_rate": 0.00015008172830813742, + "loss": 0.9599, + "step": 8013 + }, + { + "epoch": 1.7094709897610922, + "grad_norm": 0.32731843942318045, + "learning_rate": 0.00015007762389123057, + "loss": 0.9741, + "step": 8014 + }, + { + "epoch": 1.709684300341297, + "grad_norm": 0.25598703236099624, + "learning_rate": 0.00015007351868139422, + "loss": 0.9603, + "step": 8015 + }, + { + "epoch": 1.7098976109215016, + "grad_norm": 0.2929096123545848, + "learning_rate": 0.00015006941267867483, + "loss": 0.9523, + "step": 8016 + }, + { + "epoch": 1.7101109215017065, + "grad_norm": 0.3618744166318485, + "learning_rate": 0.00015006530588311886, + "loss": 0.9403, + "step": 8017 + }, + { + "epoch": 1.7103242320819114, + "grad_norm": 0.32877725288793813, + "learning_rate": 0.0001500611982947728, + "loss": 0.9163, + "step": 8018 + }, + { + "epoch": 1.710537542662116, + "grad_norm": 0.35731931718205107, + "learning_rate": 0.00015005708991368307, + "loss": 0.9544, + "step": 8019 + }, + { + "epoch": 1.7107508532423208, + "grad_norm": 0.4876016292655206, + "learning_rate": 0.0001500529807398962, + "loss": 0.9535, + "step": 8020 + }, + { + "epoch": 1.7109641638225255, + "grad_norm": 0.48076371608923546, + "learning_rate": 0.00015004887077345867, + "loss": 0.9305, + "step": 8021 + }, + { + "epoch": 1.7111774744027304, + "grad_norm": 0.41775188527673757, + "learning_rate": 0.000150044760014417, + "loss": 0.9905, + "step": 8022 + }, + { + "epoch": 1.7113907849829353, + "grad_norm": 0.48741461012883386, + "learning_rate": 0.00015004064846281764, + "loss": 0.9436, + "step": 8023 + }, + { + "epoch": 1.71160409556314, + "grad_norm": 0.5684618331954103, + "learning_rate": 0.00015003653611870722, + "loss": 0.9615, + "step": 8024 + }, + { + "epoch": 1.7118174061433447, + "grad_norm": 0.667669977143632, + "learning_rate": 0.0001500324229821322, + "loss": 0.9419, + "step": 8025 + }, + { + "epoch": 1.7120307167235493, + "grad_norm": 0.7057926904244843, + "learning_rate": 0.00015002830905313914, + "loss": 0.9807, + "step": 8026 + }, + { + "epoch": 1.7122440273037542, + "grad_norm": 0.6733922444497114, + "learning_rate": 0.0001500241943317746, + "loss": 0.9241, + "step": 8027 + }, + { + "epoch": 1.7124573378839592, + "grad_norm": 0.5711772911996427, + "learning_rate": 0.00015002007881808512, + "loss": 0.9329, + "step": 8028 + }, + { + "epoch": 1.7126706484641638, + "grad_norm": 0.5194043763059176, + "learning_rate": 0.00015001596251211724, + "loss": 0.9397, + "step": 8029 + }, + { + "epoch": 1.7128839590443685, + "grad_norm": 0.4395865437959001, + "learning_rate": 0.0001500118454139176, + "loss": 0.9425, + "step": 8030 + }, + { + "epoch": 1.7130972696245734, + "grad_norm": 0.32532422929408344, + "learning_rate": 0.00015000772752353273, + "loss": 0.9355, + "step": 8031 + }, + { + "epoch": 1.713310580204778, + "grad_norm": 0.26932141222160283, + "learning_rate": 0.00015000360884100925, + "loss": 0.929, + "step": 8032 + }, + { + "epoch": 1.713523890784983, + "grad_norm": 0.3577650872403138, + "learning_rate": 0.00014999948936639376, + "loss": 0.9558, + "step": 8033 + }, + { + "epoch": 1.7137372013651877, + "grad_norm": 0.4572949080372195, + "learning_rate": 0.0001499953690997329, + "loss": 0.9631, + "step": 8034 + }, + { + "epoch": 1.7139505119453924, + "grad_norm": 0.4779939692842427, + "learning_rate": 0.0001499912480410732, + "loss": 0.9849, + "step": 8035 + }, + { + "epoch": 1.7141638225255973, + "grad_norm": 0.49266653251390397, + "learning_rate": 0.0001499871261904614, + "loss": 0.9297, + "step": 8036 + }, + { + "epoch": 1.7143771331058022, + "grad_norm": 0.509549410899615, + "learning_rate": 0.00014998300354794407, + "loss": 0.9659, + "step": 8037 + }, + { + "epoch": 1.7145904436860069, + "grad_norm": 0.4441658445296247, + "learning_rate": 0.00014997888011356787, + "loss": 0.9558, + "step": 8038 + }, + { + "epoch": 1.7148037542662116, + "grad_norm": 0.41103527093933506, + "learning_rate": 0.00014997475588737947, + "loss": 0.9833, + "step": 8039 + }, + { + "epoch": 1.7150170648464163, + "grad_norm": 0.44398995052949997, + "learning_rate": 0.00014997063086942555, + "loss": 0.9591, + "step": 8040 + }, + { + "epoch": 1.7152303754266212, + "grad_norm": 0.3816577444801993, + "learning_rate": 0.00014996650505975275, + "loss": 0.9319, + "step": 8041 + }, + { + "epoch": 1.715443686006826, + "grad_norm": 0.27757519526563673, + "learning_rate": 0.00014996237845840777, + "loss": 0.9296, + "step": 8042 + }, + { + "epoch": 1.7156569965870307, + "grad_norm": 0.43644044788459074, + "learning_rate": 0.0001499582510654373, + "loss": 0.9262, + "step": 8043 + }, + { + "epoch": 1.7158703071672354, + "grad_norm": 0.4933002290979707, + "learning_rate": 0.00014995412288088805, + "loss": 0.9655, + "step": 8044 + }, + { + "epoch": 1.7160836177474401, + "grad_norm": 0.5219950218560745, + "learning_rate": 0.0001499499939048067, + "loss": 0.9358, + "step": 8045 + }, + { + "epoch": 1.716296928327645, + "grad_norm": 0.5128747949608885, + "learning_rate": 0.00014994586413724002, + "loss": 0.9833, + "step": 8046 + }, + { + "epoch": 1.71651023890785, + "grad_norm": 0.47944493890310214, + "learning_rate": 0.00014994173357823468, + "loss": 0.9375, + "step": 8047 + }, + { + "epoch": 1.7167235494880546, + "grad_norm": 0.4640179705268159, + "learning_rate": 0.00014993760222783747, + "loss": 0.9553, + "step": 8048 + }, + { + "epoch": 1.7169368600682593, + "grad_norm": 0.4296493367269235, + "learning_rate": 0.00014993347008609512, + "loss": 0.9338, + "step": 8049 + }, + { + "epoch": 1.7171501706484642, + "grad_norm": 0.3421912992910744, + "learning_rate": 0.00014992933715305437, + "loss": 0.9584, + "step": 8050 + }, + { + "epoch": 1.7173634812286689, + "grad_norm": 0.3327368851980429, + "learning_rate": 0.00014992520342876198, + "loss": 0.979, + "step": 8051 + }, + { + "epoch": 1.7175767918088738, + "grad_norm": 0.43785019467498426, + "learning_rate": 0.00014992106891326476, + "loss": 0.9461, + "step": 8052 + }, + { + "epoch": 1.7177901023890785, + "grad_norm": 0.41722091226789937, + "learning_rate": 0.00014991693360660947, + "loss": 0.9904, + "step": 8053 + }, + { + "epoch": 1.7180034129692832, + "grad_norm": 0.4729421814354434, + "learning_rate": 0.0001499127975088429, + "loss": 0.9579, + "step": 8054 + }, + { + "epoch": 1.718216723549488, + "grad_norm": 0.5510990733524318, + "learning_rate": 0.00014990866062001182, + "loss": 0.9562, + "step": 8055 + }, + { + "epoch": 1.718430034129693, + "grad_norm": 0.6166912296404787, + "learning_rate": 0.0001499045229401631, + "loss": 0.9402, + "step": 8056 + }, + { + "epoch": 1.7186433447098977, + "grad_norm": 0.7695696681619449, + "learning_rate": 0.0001499003844693435, + "loss": 0.9367, + "step": 8057 + }, + { + "epoch": 1.7188566552901023, + "grad_norm": 0.7433204572453944, + "learning_rate": 0.00014989624520759991, + "loss": 0.9681, + "step": 8058 + }, + { + "epoch": 1.719069965870307, + "grad_norm": 0.4954834822218536, + "learning_rate": 0.0001498921051549791, + "loss": 0.9532, + "step": 8059 + }, + { + "epoch": 1.719283276450512, + "grad_norm": 0.3824612611101993, + "learning_rate": 0.00014988796431152798, + "loss": 0.9636, + "step": 8060 + }, + { + "epoch": 1.7194965870307168, + "grad_norm": 0.3758017620485332, + "learning_rate": 0.00014988382267729333, + "loss": 0.9393, + "step": 8061 + }, + { + "epoch": 1.7197098976109215, + "grad_norm": 0.4898321759632931, + "learning_rate": 0.00014987968025232206, + "loss": 0.967, + "step": 8062 + }, + { + "epoch": 1.7199232081911262, + "grad_norm": 0.6110882548663928, + "learning_rate": 0.00014987553703666104, + "loss": 0.9583, + "step": 8063 + }, + { + "epoch": 1.7201365187713311, + "grad_norm": 0.6308888122931829, + "learning_rate": 0.00014987139303035716, + "loss": 0.9382, + "step": 8064 + }, + { + "epoch": 1.7203498293515358, + "grad_norm": 0.637791679723776, + "learning_rate": 0.00014986724823345726, + "loss": 0.9446, + "step": 8065 + }, + { + "epoch": 1.7205631399317407, + "grad_norm": 0.5374343782229458, + "learning_rate": 0.00014986310264600828, + "loss": 0.9424, + "step": 8066 + }, + { + "epoch": 1.7207764505119454, + "grad_norm": 0.4030834088059878, + "learning_rate": 0.00014985895626805712, + "loss": 0.9523, + "step": 8067 + }, + { + "epoch": 1.72098976109215, + "grad_norm": 0.3388944029831343, + "learning_rate": 0.00014985480909965071, + "loss": 0.9764, + "step": 8068 + }, + { + "epoch": 1.721203071672355, + "grad_norm": 0.47181806097183143, + "learning_rate": 0.00014985066114083596, + "loss": 0.9748, + "step": 8069 + }, + { + "epoch": 1.7214163822525599, + "grad_norm": 0.6980688045935466, + "learning_rate": 0.00014984651239165978, + "loss": 0.9308, + "step": 8070 + }, + { + "epoch": 1.7216296928327646, + "grad_norm": 0.7655825542749453, + "learning_rate": 0.00014984236285216915, + "loss": 0.9468, + "step": 8071 + }, + { + "epoch": 1.7218430034129693, + "grad_norm": 0.7535706229229671, + "learning_rate": 0.000149838212522411, + "loss": 0.9702, + "step": 8072 + }, + { + "epoch": 1.722056313993174, + "grad_norm": 0.6704117400410201, + "learning_rate": 0.0001498340614024323, + "loss": 0.9438, + "step": 8073 + }, + { + "epoch": 1.7222696245733788, + "grad_norm": 0.5150838462331109, + "learning_rate": 0.00014982990949228005, + "loss": 0.9202, + "step": 8074 + }, + { + "epoch": 1.7224829351535837, + "grad_norm": 0.32669994373576594, + "learning_rate": 0.0001498257567920012, + "loss": 0.9535, + "step": 8075 + }, + { + "epoch": 1.7226962457337884, + "grad_norm": 0.3359899876364486, + "learning_rate": 0.00014982160330164268, + "loss": 0.9188, + "step": 8076 + }, + { + "epoch": 1.7229095563139931, + "grad_norm": 0.40609404017153217, + "learning_rate": 0.00014981744902125158, + "loss": 0.9503, + "step": 8077 + }, + { + "epoch": 1.7231228668941978, + "grad_norm": 0.5012910370588651, + "learning_rate": 0.00014981329395087487, + "loss": 0.9446, + "step": 8078 + }, + { + "epoch": 1.7233361774744027, + "grad_norm": 0.5504731408093401, + "learning_rate": 0.00014980913809055956, + "loss": 0.9386, + "step": 8079 + }, + { + "epoch": 1.7235494880546076, + "grad_norm": 0.4816823549201463, + "learning_rate": 0.0001498049814403527, + "loss": 0.952, + "step": 8080 + }, + { + "epoch": 1.7237627986348123, + "grad_norm": 0.37545886835494197, + "learning_rate": 0.0001498008240003013, + "loss": 0.9665, + "step": 8081 + }, + { + "epoch": 1.723976109215017, + "grad_norm": 0.3178499864162317, + "learning_rate": 0.00014979666577045239, + "loss": 0.9806, + "step": 8082 + }, + { + "epoch": 1.724189419795222, + "grad_norm": 0.3248602290041371, + "learning_rate": 0.00014979250675085302, + "loss": 0.9131, + "step": 8083 + }, + { + "epoch": 1.7244027303754266, + "grad_norm": 0.29491980814220975, + "learning_rate": 0.00014978834694155026, + "loss": 0.9441, + "step": 8084 + }, + { + "epoch": 1.7246160409556315, + "grad_norm": 0.4258334115597638, + "learning_rate": 0.00014978418634259118, + "loss": 0.942, + "step": 8085 + }, + { + "epoch": 1.7248293515358362, + "grad_norm": 0.49842277818903674, + "learning_rate": 0.0001497800249540229, + "loss": 0.93, + "step": 8086 + }, + { + "epoch": 1.7250426621160408, + "grad_norm": 0.42930077346791656, + "learning_rate": 0.00014977586277589245, + "loss": 0.9491, + "step": 8087 + }, + { + "epoch": 1.7252559726962458, + "grad_norm": 0.43018954697338335, + "learning_rate": 0.0001497716998082469, + "loss": 0.9636, + "step": 8088 + }, + { + "epoch": 1.7254692832764507, + "grad_norm": 0.34429275161753503, + "learning_rate": 0.00014976753605113342, + "loss": 0.9497, + "step": 8089 + }, + { + "epoch": 1.7256825938566553, + "grad_norm": 0.29205290880573104, + "learning_rate": 0.0001497633715045991, + "loss": 0.9585, + "step": 8090 + }, + { + "epoch": 1.72589590443686, + "grad_norm": 0.3682912409862475, + "learning_rate": 0.00014975920616869105, + "loss": 0.9396, + "step": 8091 + }, + { + "epoch": 1.7261092150170647, + "grad_norm": 0.4315729140911017, + "learning_rate": 0.0001497550400434564, + "loss": 0.9719, + "step": 8092 + }, + { + "epoch": 1.7263225255972696, + "grad_norm": 0.5061052680185039, + "learning_rate": 0.0001497508731289423, + "loss": 0.9586, + "step": 8093 + }, + { + "epoch": 1.7265358361774745, + "grad_norm": 0.5593740737163054, + "learning_rate": 0.0001497467054251959, + "loss": 0.946, + "step": 8094 + }, + { + "epoch": 1.7267491467576792, + "grad_norm": 0.5927679154444666, + "learning_rate": 0.00014974253693226437, + "loss": 0.9395, + "step": 8095 + }, + { + "epoch": 1.726962457337884, + "grad_norm": 0.5931590982488826, + "learning_rate": 0.00014973836765019486, + "loss": 0.9471, + "step": 8096 + }, + { + "epoch": 1.7271757679180886, + "grad_norm": 0.5328648210990398, + "learning_rate": 0.00014973419757903454, + "loss": 0.9637, + "step": 8097 + }, + { + "epoch": 1.7273890784982935, + "grad_norm": 0.4342702938819881, + "learning_rate": 0.0001497300267188306, + "loss": 0.9357, + "step": 8098 + }, + { + "epoch": 1.7276023890784984, + "grad_norm": 0.3268873597146484, + "learning_rate": 0.00014972585506963022, + "loss": 0.9698, + "step": 8099 + }, + { + "epoch": 1.727815699658703, + "grad_norm": 0.23608457039272746, + "learning_rate": 0.00014972168263148065, + "loss": 0.9471, + "step": 8100 + }, + { + "epoch": 1.7280290102389078, + "grad_norm": 0.3197556891701778, + "learning_rate": 0.00014971750940442904, + "loss": 0.9329, + "step": 8101 + }, + { + "epoch": 1.7282423208191127, + "grad_norm": 0.46565448569059253, + "learning_rate": 0.00014971333538852264, + "loss": 1.0089, + "step": 8102 + }, + { + "epoch": 1.7284556313993176, + "grad_norm": 0.5275161888274915, + "learning_rate": 0.00014970916058380871, + "loss": 0.9458, + "step": 8103 + }, + { + "epoch": 1.7286689419795223, + "grad_norm": 0.5345458145053035, + "learning_rate": 0.00014970498499033443, + "loss": 0.9608, + "step": 8104 + }, + { + "epoch": 1.728882252559727, + "grad_norm": 0.4845221580361863, + "learning_rate": 0.00014970080860814705, + "loss": 1.0002, + "step": 8105 + }, + { + "epoch": 1.7290955631399316, + "grad_norm": 0.38484673636409134, + "learning_rate": 0.0001496966314372939, + "loss": 0.9671, + "step": 8106 + }, + { + "epoch": 1.7293088737201365, + "grad_norm": 0.2631313837988746, + "learning_rate": 0.00014969245347782215, + "loss": 0.9511, + "step": 8107 + }, + { + "epoch": 1.7295221843003414, + "grad_norm": 0.27439157153165594, + "learning_rate": 0.00014968827472977916, + "loss": 0.9897, + "step": 8108 + }, + { + "epoch": 1.7297354948805461, + "grad_norm": 0.343563814989995, + "learning_rate": 0.00014968409519321212, + "loss": 0.9252, + "step": 8109 + }, + { + "epoch": 1.7299488054607508, + "grad_norm": 0.39967803765284343, + "learning_rate": 0.0001496799148681684, + "loss": 0.9213, + "step": 8110 + }, + { + "epoch": 1.7301621160409555, + "grad_norm": 0.4042618497789103, + "learning_rate": 0.00014967573375469526, + "loss": 0.9439, + "step": 8111 + }, + { + "epoch": 1.7303754266211604, + "grad_norm": 0.3959453490041173, + "learning_rate": 0.00014967155185284002, + "loss": 0.9564, + "step": 8112 + }, + { + "epoch": 1.7305887372013653, + "grad_norm": 0.34850404618598063, + "learning_rate": 0.00014966736916265, + "loss": 0.9491, + "step": 8113 + }, + { + "epoch": 1.73080204778157, + "grad_norm": 0.30590592346598794, + "learning_rate": 0.00014966318568417254, + "loss": 0.9104, + "step": 8114 + }, + { + "epoch": 1.7310153583617747, + "grad_norm": 0.30452901345164735, + "learning_rate": 0.00014965900141745495, + "loss": 0.959, + "step": 8115 + }, + { + "epoch": 1.7312286689419796, + "grad_norm": 0.3394471221364035, + "learning_rate": 0.00014965481636254458, + "loss": 0.9391, + "step": 8116 + }, + { + "epoch": 1.7314419795221843, + "grad_norm": 0.3847996391725129, + "learning_rate": 0.0001496506305194888, + "loss": 0.9156, + "step": 8117 + }, + { + "epoch": 1.7316552901023892, + "grad_norm": 0.4066387186823739, + "learning_rate": 0.00014964644388833497, + "loss": 0.9355, + "step": 8118 + }, + { + "epoch": 1.7318686006825939, + "grad_norm": 0.34974199731635713, + "learning_rate": 0.00014964225646913045, + "loss": 0.9439, + "step": 8119 + }, + { + "epoch": 1.7320819112627985, + "grad_norm": 0.2990612689733389, + "learning_rate": 0.00014963806826192262, + "loss": 0.9306, + "step": 8120 + }, + { + "epoch": 1.7322952218430034, + "grad_norm": 0.3036309339689824, + "learning_rate": 0.00014963387926675888, + "loss": 0.9458, + "step": 8121 + }, + { + "epoch": 1.7325085324232083, + "grad_norm": 0.45744756296057465, + "learning_rate": 0.00014962968948368664, + "loss": 0.9536, + "step": 8122 + }, + { + "epoch": 1.732721843003413, + "grad_norm": 0.4799570381701124, + "learning_rate": 0.00014962549891275328, + "loss": 0.9531, + "step": 8123 + }, + { + "epoch": 1.7329351535836177, + "grad_norm": 0.35794445144872444, + "learning_rate": 0.00014962130755400623, + "loss": 0.9536, + "step": 8124 + }, + { + "epoch": 1.7331484641638224, + "grad_norm": 0.31513028744420146, + "learning_rate": 0.00014961711540749292, + "loss": 0.9482, + "step": 8125 + }, + { + "epoch": 1.7333617747440273, + "grad_norm": 0.3232645050657678, + "learning_rate": 0.00014961292247326076, + "loss": 0.9274, + "step": 8126 + }, + { + "epoch": 1.7335750853242322, + "grad_norm": 0.3509955194050475, + "learning_rate": 0.00014960872875135721, + "loss": 0.9697, + "step": 8127 + }, + { + "epoch": 1.733788395904437, + "grad_norm": 0.3192927876154917, + "learning_rate": 0.00014960453424182974, + "loss": 0.9114, + "step": 8128 + }, + { + "epoch": 1.7340017064846416, + "grad_norm": 0.3478407509655562, + "learning_rate": 0.00014960033894472578, + "loss": 0.9456, + "step": 8129 + }, + { + "epoch": 1.7342150170648463, + "grad_norm": 0.3756499300227437, + "learning_rate": 0.00014959614286009284, + "loss": 0.9323, + "step": 8130 + }, + { + "epoch": 1.7344283276450512, + "grad_norm": 0.4622833225942717, + "learning_rate": 0.00014959194598797835, + "loss": 0.9526, + "step": 8131 + }, + { + "epoch": 1.734641638225256, + "grad_norm": 0.46676613017468405, + "learning_rate": 0.00014958774832842984, + "loss": 0.9542, + "step": 8132 + }, + { + "epoch": 1.7348549488054608, + "grad_norm": 0.6882656199001521, + "learning_rate": 0.00014958354988149476, + "loss": 0.9547, + "step": 8133 + }, + { + "epoch": 1.7350682593856654, + "grad_norm": 0.3403817526239338, + "learning_rate": 0.00014957935064722065, + "loss": 0.9454, + "step": 8134 + }, + { + "epoch": 1.7352815699658704, + "grad_norm": 0.32823748440909983, + "learning_rate": 0.00014957515062565503, + "loss": 0.9722, + "step": 8135 + }, + { + "epoch": 1.735494880546075, + "grad_norm": 0.31053442361878997, + "learning_rate": 0.00014957094981684542, + "loss": 0.9653, + "step": 8136 + }, + { + "epoch": 1.73570819112628, + "grad_norm": 0.2929141129291569, + "learning_rate": 0.00014956674822083933, + "loss": 0.984, + "step": 8137 + }, + { + "epoch": 1.7359215017064846, + "grad_norm": 0.31738071889628816, + "learning_rate": 0.0001495625458376843, + "loss": 0.944, + "step": 8138 + }, + { + "epoch": 1.7361348122866893, + "grad_norm": 0.322888133126369, + "learning_rate": 0.00014955834266742793, + "loss": 0.9613, + "step": 8139 + }, + { + "epoch": 1.7363481228668942, + "grad_norm": 0.34416834331898244, + "learning_rate": 0.00014955413871011768, + "loss": 0.9677, + "step": 8140 + }, + { + "epoch": 1.7365614334470991, + "grad_norm": 0.3804536638251879, + "learning_rate": 0.00014954993396580122, + "loss": 1.0072, + "step": 8141 + }, + { + "epoch": 1.7367747440273038, + "grad_norm": 0.4474581674885773, + "learning_rate": 0.0001495457284345261, + "loss": 0.9731, + "step": 8142 + }, + { + "epoch": 1.7369880546075085, + "grad_norm": 0.3838612913392415, + "learning_rate": 0.00014954152211633988, + "loss": 0.9586, + "step": 8143 + }, + { + "epoch": 1.7372013651877132, + "grad_norm": 0.3408448130273199, + "learning_rate": 0.00014953731501129015, + "loss": 0.9803, + "step": 8144 + }, + { + "epoch": 1.737414675767918, + "grad_norm": 0.36522925308418935, + "learning_rate": 0.00014953310711942456, + "loss": 0.9467, + "step": 8145 + }, + { + "epoch": 1.737627986348123, + "grad_norm": 0.3624607947338418, + "learning_rate": 0.00014952889844079069, + "loss": 0.9495, + "step": 8146 + }, + { + "epoch": 1.7378412969283277, + "grad_norm": 0.3062459879489462, + "learning_rate": 0.00014952468897543616, + "loss": 0.929, + "step": 8147 + }, + { + "epoch": 1.7380546075085324, + "grad_norm": 0.3127591945587523, + "learning_rate": 0.0001495204787234086, + "loss": 0.9369, + "step": 8148 + }, + { + "epoch": 1.738267918088737, + "grad_norm": 0.30197672110789964, + "learning_rate": 0.00014951626768475564, + "loss": 0.9219, + "step": 8149 + }, + { + "epoch": 1.738481228668942, + "grad_norm": 0.2896502629275267, + "learning_rate": 0.00014951205585952496, + "loss": 0.9674, + "step": 8150 + }, + { + "epoch": 1.7386945392491469, + "grad_norm": 0.36296845388087934, + "learning_rate": 0.00014950784324776418, + "loss": 0.9483, + "step": 8151 + }, + { + "epoch": 1.7389078498293515, + "grad_norm": 0.4100598598894928, + "learning_rate": 0.000149503629849521, + "loss": 0.9279, + "step": 8152 + }, + { + "epoch": 1.7391211604095562, + "grad_norm": 0.3999269519394275, + "learning_rate": 0.00014949941566484308, + "loss": 0.9806, + "step": 8153 + }, + { + "epoch": 1.7393344709897611, + "grad_norm": 0.3884921380036611, + "learning_rate": 0.00014949520069377812, + "loss": 0.9485, + "step": 8154 + }, + { + "epoch": 1.739547781569966, + "grad_norm": 0.4382667752737595, + "learning_rate": 0.00014949098493637376, + "loss": 0.9612, + "step": 8155 + }, + { + "epoch": 1.7397610921501707, + "grad_norm": 0.4818158104583386, + "learning_rate": 0.00014948676839267774, + "loss": 0.9517, + "step": 8156 + }, + { + "epoch": 1.7399744027303754, + "grad_norm": 0.4291198287370661, + "learning_rate": 0.00014948255106273778, + "loss": 0.9528, + "step": 8157 + }, + { + "epoch": 1.74018771331058, + "grad_norm": 0.41148065978478937, + "learning_rate": 0.0001494783329466016, + "loss": 0.9764, + "step": 8158 + }, + { + "epoch": 1.740401023890785, + "grad_norm": 0.35373603951640603, + "learning_rate": 0.0001494741140443169, + "loss": 0.9665, + "step": 8159 + }, + { + "epoch": 1.74061433447099, + "grad_norm": 0.26095502125413245, + "learning_rate": 0.00014946989435593144, + "loss": 0.9274, + "step": 8160 + }, + { + "epoch": 1.7408276450511946, + "grad_norm": 0.26068300275539913, + "learning_rate": 0.00014946567388149295, + "loss": 0.988, + "step": 8161 + }, + { + "epoch": 1.7410409556313993, + "grad_norm": 0.2957711187549106, + "learning_rate": 0.00014946145262104919, + "loss": 0.9342, + "step": 8162 + }, + { + "epoch": 1.741254266211604, + "grad_norm": 0.31267761028463126, + "learning_rate": 0.00014945723057464792, + "loss": 0.971, + "step": 8163 + }, + { + "epoch": 1.7414675767918089, + "grad_norm": 0.34510799629835487, + "learning_rate": 0.0001494530077423369, + "loss": 0.9305, + "step": 8164 + }, + { + "epoch": 1.7416808873720138, + "grad_norm": 0.34035230530528643, + "learning_rate": 0.00014944878412416395, + "loss": 0.9342, + "step": 8165 + }, + { + "epoch": 1.7418941979522184, + "grad_norm": 0.2737428606867495, + "learning_rate": 0.00014944455972017685, + "loss": 0.9606, + "step": 8166 + }, + { + "epoch": 1.7421075085324231, + "grad_norm": 0.301798654600026, + "learning_rate": 0.0001494403345304234, + "loss": 0.9499, + "step": 8167 + }, + { + "epoch": 1.742320819112628, + "grad_norm": 0.36504832310216584, + "learning_rate": 0.00014943610855495135, + "loss": 0.9526, + "step": 8168 + }, + { + "epoch": 1.7425341296928327, + "grad_norm": 0.37898762178567624, + "learning_rate": 0.0001494318817938086, + "loss": 0.9345, + "step": 8169 + }, + { + "epoch": 1.7427474402730376, + "grad_norm": 0.3351987503595245, + "learning_rate": 0.00014942765424704293, + "loss": 0.9652, + "step": 8170 + }, + { + "epoch": 1.7429607508532423, + "grad_norm": 0.32020517508173424, + "learning_rate": 0.00014942342591470217, + "loss": 0.9673, + "step": 8171 + }, + { + "epoch": 1.743174061433447, + "grad_norm": 0.2729642319725799, + "learning_rate": 0.0001494191967968342, + "loss": 0.9577, + "step": 8172 + }, + { + "epoch": 1.743387372013652, + "grad_norm": 0.35847936090848814, + "learning_rate": 0.00014941496689348683, + "loss": 0.9519, + "step": 8173 + }, + { + "epoch": 1.7436006825938568, + "grad_norm": 0.34677891029735164, + "learning_rate": 0.00014941073620470798, + "loss": 0.9391, + "step": 8174 + }, + { + "epoch": 1.7438139931740615, + "grad_norm": 0.23833792654695116, + "learning_rate": 0.00014940650473054543, + "loss": 0.9321, + "step": 8175 + }, + { + "epoch": 1.7440273037542662, + "grad_norm": 0.35753675853677974, + "learning_rate": 0.00014940227247104714, + "loss": 0.9489, + "step": 8176 + }, + { + "epoch": 1.7442406143344709, + "grad_norm": 0.3511579343817144, + "learning_rate": 0.000149398039426261, + "loss": 0.9336, + "step": 8177 + }, + { + "epoch": 1.7444539249146758, + "grad_norm": 0.35859409371993783, + "learning_rate": 0.00014939380559623482, + "loss": 0.9674, + "step": 8178 + }, + { + "epoch": 1.7446672354948807, + "grad_norm": 0.4053949525416786, + "learning_rate": 0.00014938957098101658, + "loss": 0.9411, + "step": 8179 + }, + { + "epoch": 1.7448805460750854, + "grad_norm": 0.4535247411701344, + "learning_rate": 0.0001493853355806542, + "loss": 0.9774, + "step": 8180 + }, + { + "epoch": 1.74509385665529, + "grad_norm": 0.5135622350998973, + "learning_rate": 0.00014938109939519558, + "loss": 0.9408, + "step": 8181 + }, + { + "epoch": 1.7453071672354947, + "grad_norm": 0.5895106183125237, + "learning_rate": 0.0001493768624246886, + "loss": 0.9304, + "step": 8182 + }, + { + "epoch": 1.7455204778156996, + "grad_norm": 0.66998199988267, + "learning_rate": 0.00014937262466918134, + "loss": 0.9681, + "step": 8183 + }, + { + "epoch": 1.7457337883959045, + "grad_norm": 0.6620623255983166, + "learning_rate": 0.0001493683861287216, + "loss": 0.9801, + "step": 8184 + }, + { + "epoch": 1.7459470989761092, + "grad_norm": 0.6423849986637827, + "learning_rate": 0.00014936414680335742, + "loss": 0.9497, + "step": 8185 + }, + { + "epoch": 1.746160409556314, + "grad_norm": 0.6562110237350025, + "learning_rate": 0.00014935990669313675, + "loss": 0.9672, + "step": 8186 + }, + { + "epoch": 1.7463737201365188, + "grad_norm": 0.5853546470697837, + "learning_rate": 0.0001493556657981076, + "loss": 0.9447, + "step": 8187 + }, + { + "epoch": 1.7465870307167235, + "grad_norm": 0.37182256802496866, + "learning_rate": 0.00014935142411831787, + "loss": 0.9917, + "step": 8188 + }, + { + "epoch": 1.7468003412969284, + "grad_norm": 0.30948253411381077, + "learning_rate": 0.00014934718165381564, + "loss": 0.9358, + "step": 8189 + }, + { + "epoch": 1.747013651877133, + "grad_norm": 0.4113868340383091, + "learning_rate": 0.00014934293840464886, + "loss": 0.9285, + "step": 8190 + }, + { + "epoch": 1.7472269624573378, + "grad_norm": 0.5365723760775993, + "learning_rate": 0.00014933869437086563, + "loss": 0.9401, + "step": 8191 + }, + { + "epoch": 1.7474402730375427, + "grad_norm": 0.6087003910689728, + "learning_rate": 0.00014933444955251383, + "loss": 0.9411, + "step": 8192 + }, + { + "epoch": 1.7476535836177476, + "grad_norm": 0.5558649274939322, + "learning_rate": 0.0001493302039496416, + "loss": 0.9484, + "step": 8193 + }, + { + "epoch": 1.7478668941979523, + "grad_norm": 0.5336049755546807, + "learning_rate": 0.00014932595756229696, + "loss": 0.9827, + "step": 8194 + }, + { + "epoch": 1.748080204778157, + "grad_norm": 0.468216691775042, + "learning_rate": 0.00014932171039052793, + "loss": 0.923, + "step": 8195 + }, + { + "epoch": 1.7482935153583616, + "grad_norm": 0.4263682338606067, + "learning_rate": 0.00014931746243438258, + "loss": 0.9648, + "step": 8196 + }, + { + "epoch": 1.7485068259385665, + "grad_norm": 0.35659710717619075, + "learning_rate": 0.000149313213693909, + "loss": 0.9303, + "step": 8197 + }, + { + "epoch": 1.7487201365187715, + "grad_norm": 0.35568990616704316, + "learning_rate": 0.0001493089641691552, + "loss": 0.9121, + "step": 8198 + }, + { + "epoch": 1.7489334470989761, + "grad_norm": 0.3691660607726375, + "learning_rate": 0.00014930471386016933, + "loss": 0.9618, + "step": 8199 + }, + { + "epoch": 1.7491467576791808, + "grad_norm": 0.2861376286682906, + "learning_rate": 0.00014930046276699944, + "loss": 0.9617, + "step": 8200 + }, + { + "epoch": 1.7493600682593855, + "grad_norm": 0.33484910974427234, + "learning_rate": 0.00014929621088969366, + "loss": 0.9502, + "step": 8201 + }, + { + "epoch": 1.7495733788395904, + "grad_norm": 0.35697708546166906, + "learning_rate": 0.00014929195822830008, + "loss": 0.8896, + "step": 8202 + }, + { + "epoch": 1.7497866894197953, + "grad_norm": 0.3700074624960775, + "learning_rate": 0.00014928770478286684, + "loss": 0.9882, + "step": 8203 + }, + { + "epoch": 1.75, + "grad_norm": 0.4010517522904761, + "learning_rate": 0.00014928345055344208, + "loss": 0.954, + "step": 8204 + }, + { + "epoch": 1.7502133105802047, + "grad_norm": 0.4151435405075875, + "learning_rate": 0.00014927919554007387, + "loss": 0.9402, + "step": 8205 + }, + { + "epoch": 1.7504266211604096, + "grad_norm": 0.49516656682209237, + "learning_rate": 0.0001492749397428104, + "loss": 0.9774, + "step": 8206 + }, + { + "epoch": 1.7506399317406145, + "grad_norm": 0.49844695301004216, + "learning_rate": 0.00014927068316169983, + "loss": 0.9084, + "step": 8207 + }, + { + "epoch": 1.7508532423208192, + "grad_norm": 0.4097282112024411, + "learning_rate": 0.0001492664257967903, + "loss": 0.9326, + "step": 8208 + }, + { + "epoch": 1.7510665529010239, + "grad_norm": 0.34090989819499334, + "learning_rate": 0.00014926216764813005, + "loss": 0.9439, + "step": 8209 + }, + { + "epoch": 1.7512798634812285, + "grad_norm": 0.35970167740429515, + "learning_rate": 0.00014925790871576713, + "loss": 0.9437, + "step": 8210 + }, + { + "epoch": 1.7514931740614335, + "grad_norm": 0.3518935936942967, + "learning_rate": 0.00014925364899974985, + "loss": 0.9167, + "step": 8211 + }, + { + "epoch": 1.7517064846416384, + "grad_norm": 0.268609602101845, + "learning_rate": 0.0001492493885001264, + "loss": 0.9592, + "step": 8212 + }, + { + "epoch": 1.751919795221843, + "grad_norm": 0.27180378062633154, + "learning_rate": 0.00014924512721694488, + "loss": 0.934, + "step": 8213 + }, + { + "epoch": 1.7521331058020477, + "grad_norm": 0.3463771990659614, + "learning_rate": 0.00014924086515025363, + "loss": 0.9436, + "step": 8214 + }, + { + "epoch": 1.7523464163822524, + "grad_norm": 1.393774177344777, + "learning_rate": 0.00014923660230010084, + "loss": 0.9527, + "step": 8215 + }, + { + "epoch": 1.7525597269624573, + "grad_norm": 0.2890700151405987, + "learning_rate": 0.0001492323386665347, + "loss": 0.9437, + "step": 8216 + }, + { + "epoch": 1.7527730375426622, + "grad_norm": 0.45187614965821465, + "learning_rate": 0.00014922807424960348, + "loss": 0.9262, + "step": 8217 + }, + { + "epoch": 1.752986348122867, + "grad_norm": 0.40570846783127207, + "learning_rate": 0.00014922380904935546, + "loss": 0.931, + "step": 8218 + }, + { + "epoch": 1.7531996587030716, + "grad_norm": 0.42456376509149935, + "learning_rate": 0.00014921954306583885, + "loss": 0.9685, + "step": 8219 + }, + { + "epoch": 1.7534129692832765, + "grad_norm": 0.49978595344242904, + "learning_rate": 0.00014921527629910197, + "loss": 0.9372, + "step": 8220 + }, + { + "epoch": 1.7536262798634812, + "grad_norm": 0.5829472447070525, + "learning_rate": 0.00014921100874919305, + "loss": 0.9603, + "step": 8221 + }, + { + "epoch": 1.753839590443686, + "grad_norm": 0.6769589981607782, + "learning_rate": 0.0001492067404161604, + "loss": 0.9234, + "step": 8222 + }, + { + "epoch": 1.7540529010238908, + "grad_norm": 0.7559150717928872, + "learning_rate": 0.00014920247130005234, + "loss": 0.9471, + "step": 8223 + }, + { + "epoch": 1.7542662116040955, + "grad_norm": 0.8382847181129867, + "learning_rate": 0.00014919820140091712, + "loss": 0.9654, + "step": 8224 + }, + { + "epoch": 1.7544795221843004, + "grad_norm": 0.8443445576231172, + "learning_rate": 0.00014919393071880313, + "loss": 0.938, + "step": 8225 + }, + { + "epoch": 1.7546928327645053, + "grad_norm": 2.6772244971893717, + "learning_rate": 0.00014918965925375864, + "loss": 1.0202, + "step": 8226 + }, + { + "epoch": 1.75490614334471, + "grad_norm": 1.1944950444633122, + "learning_rate": 0.000149185387005832, + "loss": 0.9272, + "step": 8227 + }, + { + "epoch": 1.7551194539249146, + "grad_norm": 1.8618261136571015, + "learning_rate": 0.0001491811139750715, + "loss": 0.9447, + "step": 8228 + }, + { + "epoch": 1.7553327645051193, + "grad_norm": 1.1571856932011266, + "learning_rate": 0.00014917684016152558, + "loss": 0.9651, + "step": 8229 + }, + { + "epoch": 1.7555460750853242, + "grad_norm": 2.7863342632072565, + "learning_rate": 0.00014917256556524252, + "loss": 0.9637, + "step": 8230 + }, + { + "epoch": 1.7557593856655291, + "grad_norm": 2.8601755219875042, + "learning_rate": 0.00014916829018627072, + "loss": 0.9929, + "step": 8231 + }, + { + "epoch": 1.7559726962457338, + "grad_norm": 0.7950122956252962, + "learning_rate": 0.00014916401402465857, + "loss": 0.9536, + "step": 8232 + }, + { + "epoch": 1.7561860068259385, + "grad_norm": 1.6075097824715883, + "learning_rate": 0.00014915973708045442, + "loss": 0.9847, + "step": 8233 + }, + { + "epoch": 1.7563993174061432, + "grad_norm": 0.8245314303998136, + "learning_rate": 0.0001491554593537067, + "loss": 0.9576, + "step": 8234 + }, + { + "epoch": 1.756612627986348, + "grad_norm": 1.6859122714592207, + "learning_rate": 0.0001491511808444638, + "loss": 0.9384, + "step": 8235 + }, + { + "epoch": 1.756825938566553, + "grad_norm": 1.2250933510340287, + "learning_rate": 0.00014914690155277407, + "loss": 0.9843, + "step": 8236 + }, + { + "epoch": 1.7570392491467577, + "grad_norm": 1.2510825154423688, + "learning_rate": 0.00014914262147868603, + "loss": 0.9623, + "step": 8237 + }, + { + "epoch": 1.7572525597269624, + "grad_norm": 1.1031882598703566, + "learning_rate": 0.00014913834062224807, + "loss": 1.0148, + "step": 8238 + }, + { + "epoch": 1.7574658703071673, + "grad_norm": 1.082517513089969, + "learning_rate": 0.0001491340589835086, + "loss": 0.9668, + "step": 8239 + }, + { + "epoch": 1.757679180887372, + "grad_norm": 0.9590526601919109, + "learning_rate": 0.0001491297765625161, + "loss": 0.9823, + "step": 8240 + }, + { + "epoch": 1.7578924914675769, + "grad_norm": 1.0278898741686577, + "learning_rate": 0.000149125493359319, + "loss": 0.9759, + "step": 8241 + }, + { + "epoch": 1.7581058020477816, + "grad_norm": 0.9403581228913216, + "learning_rate": 0.0001491212093739658, + "loss": 0.9642, + "step": 8242 + }, + { + "epoch": 1.7583191126279862, + "grad_norm": 0.5700187631328547, + "learning_rate": 0.00014911692460650497, + "loss": 0.9808, + "step": 8243 + }, + { + "epoch": 1.7585324232081911, + "grad_norm": 0.6032740390880816, + "learning_rate": 0.00014911263905698495, + "loss": 0.9513, + "step": 8244 + }, + { + "epoch": 1.758745733788396, + "grad_norm": 0.49758897292647947, + "learning_rate": 0.00014910835272545427, + "loss": 0.9439, + "step": 8245 + }, + { + "epoch": 1.7589590443686007, + "grad_norm": 0.5313398536574696, + "learning_rate": 0.0001491040656119614, + "loss": 0.9504, + "step": 8246 + }, + { + "epoch": 1.7591723549488054, + "grad_norm": 0.42940152641336554, + "learning_rate": 0.00014909977771655487, + "loss": 0.9385, + "step": 8247 + }, + { + "epoch": 1.75938566552901, + "grad_norm": 0.4938926613238998, + "learning_rate": 0.00014909548903928318, + "loss": 0.9678, + "step": 8248 + }, + { + "epoch": 1.759598976109215, + "grad_norm": 0.5035756944313692, + "learning_rate": 0.00014909119958019489, + "loss": 0.949, + "step": 8249 + }, + { + "epoch": 1.75981228668942, + "grad_norm": 0.5173787475862387, + "learning_rate": 0.00014908690933933852, + "loss": 0.9734, + "step": 8250 + }, + { + "epoch": 1.7600255972696246, + "grad_norm": 0.5469990980513709, + "learning_rate": 0.0001490826183167626, + "loss": 0.9502, + "step": 8251 + }, + { + "epoch": 1.7602389078498293, + "grad_norm": 0.37590967087335764, + "learning_rate": 0.00014907832651251567, + "loss": 0.9193, + "step": 8252 + }, + { + "epoch": 1.760452218430034, + "grad_norm": 0.4127202374927419, + "learning_rate": 0.00014907403392664634, + "loss": 0.9345, + "step": 8253 + }, + { + "epoch": 1.7606655290102389, + "grad_norm": 0.4285414236609501, + "learning_rate": 0.00014906974055920317, + "loss": 0.9565, + "step": 8254 + }, + { + "epoch": 1.7608788395904438, + "grad_norm": 0.42031690903001784, + "learning_rate": 0.00014906544641023472, + "loss": 0.9489, + "step": 8255 + }, + { + "epoch": 1.7610921501706485, + "grad_norm": 0.34025940162582513, + "learning_rate": 0.00014906115147978954, + "loss": 0.9381, + "step": 8256 + }, + { + "epoch": 1.7613054607508531, + "grad_norm": 0.4532750023234691, + "learning_rate": 0.0001490568557679163, + "loss": 0.9515, + "step": 8257 + }, + { + "epoch": 1.761518771331058, + "grad_norm": 0.39551473385693914, + "learning_rate": 0.00014905255927466358, + "loss": 0.9439, + "step": 8258 + }, + { + "epoch": 1.761732081911263, + "grad_norm": 0.31025347314407686, + "learning_rate": 0.00014904826200007998, + "loss": 0.9426, + "step": 8259 + }, + { + "epoch": 1.7619453924914676, + "grad_norm": 0.4318956668622796, + "learning_rate": 0.00014904396394421413, + "loss": 0.9506, + "step": 8260 + }, + { + "epoch": 1.7621587030716723, + "grad_norm": 0.3567891902340958, + "learning_rate": 0.00014903966510711468, + "loss": 0.97, + "step": 8261 + }, + { + "epoch": 1.762372013651877, + "grad_norm": 0.2979101040824267, + "learning_rate": 0.00014903536548883023, + "loss": 0.9482, + "step": 8262 + }, + { + "epoch": 1.762585324232082, + "grad_norm": 0.36624272596779134, + "learning_rate": 0.0001490310650894095, + "loss": 0.9423, + "step": 8263 + }, + { + "epoch": 1.7627986348122868, + "grad_norm": 0.2863171711796853, + "learning_rate": 0.00014902676390890108, + "loss": 0.9665, + "step": 8264 + }, + { + "epoch": 1.7630119453924915, + "grad_norm": 0.28291274677291794, + "learning_rate": 0.00014902246194735367, + "loss": 0.9226, + "step": 8265 + }, + { + "epoch": 1.7632252559726962, + "grad_norm": 0.36135895858791967, + "learning_rate": 0.00014901815920481594, + "loss": 0.9429, + "step": 8266 + }, + { + "epoch": 1.7634385665529009, + "grad_norm": 0.36747436571112163, + "learning_rate": 0.00014901385568133656, + "loss": 0.9305, + "step": 8267 + }, + { + "epoch": 1.7636518771331058, + "grad_norm": 0.38349273947246454, + "learning_rate": 0.00014900955137696426, + "loss": 0.9598, + "step": 8268 + }, + { + "epoch": 1.7638651877133107, + "grad_norm": 0.29801712545188497, + "learning_rate": 0.00014900524629174773, + "loss": 0.966, + "step": 8269 + }, + { + "epoch": 1.7640784982935154, + "grad_norm": 0.30961873615392843, + "learning_rate": 0.00014900094042573567, + "loss": 0.9524, + "step": 8270 + }, + { + "epoch": 1.76429180887372, + "grad_norm": 0.40223067569231047, + "learning_rate": 0.00014899663377897682, + "loss": 0.9601, + "step": 8271 + }, + { + "epoch": 1.764505119453925, + "grad_norm": 0.37947614625330184, + "learning_rate": 0.00014899232635151992, + "loss": 0.9526, + "step": 8272 + }, + { + "epoch": 1.7647184300341296, + "grad_norm": 0.27668154734318096, + "learning_rate": 0.00014898801814341363, + "loss": 0.9403, + "step": 8273 + }, + { + "epoch": 1.7649317406143346, + "grad_norm": 0.22606191451564042, + "learning_rate": 0.0001489837091547068, + "loss": 0.9645, + "step": 8274 + }, + { + "epoch": 1.7651450511945392, + "grad_norm": 0.25543468741280484, + "learning_rate": 0.0001489793993854481, + "loss": 0.9136, + "step": 8275 + }, + { + "epoch": 1.765358361774744, + "grad_norm": 0.2597509397048967, + "learning_rate": 0.00014897508883568637, + "loss": 0.9717, + "step": 8276 + }, + { + "epoch": 1.7655716723549488, + "grad_norm": 0.26159069337069574, + "learning_rate": 0.00014897077750547033, + "loss": 0.9046, + "step": 8277 + }, + { + "epoch": 1.7657849829351537, + "grad_norm": 0.2574780800604768, + "learning_rate": 0.0001489664653948488, + "loss": 0.9267, + "step": 8278 + }, + { + "epoch": 1.7659982935153584, + "grad_norm": 0.24267652725504177, + "learning_rate": 0.00014896215250387055, + "loss": 0.9611, + "step": 8279 + }, + { + "epoch": 1.766211604095563, + "grad_norm": 0.3332912090380533, + "learning_rate": 0.00014895783883258436, + "loss": 0.915, + "step": 8280 + }, + { + "epoch": 1.7664249146757678, + "grad_norm": 0.4424504492448091, + "learning_rate": 0.0001489535243810391, + "loss": 0.9617, + "step": 8281 + }, + { + "epoch": 1.7666382252559727, + "grad_norm": 0.44033134378950956, + "learning_rate": 0.00014894920914928353, + "loss": 0.9601, + "step": 8282 + }, + { + "epoch": 1.7668515358361776, + "grad_norm": 0.271212452644377, + "learning_rate": 0.0001489448931373665, + "loss": 0.9539, + "step": 8283 + }, + { + "epoch": 1.7670648464163823, + "grad_norm": 0.24578705564755893, + "learning_rate": 0.00014894057634533685, + "loss": 0.9567, + "step": 8284 + }, + { + "epoch": 1.767278156996587, + "grad_norm": 0.29163522893873894, + "learning_rate": 0.0001489362587732434, + "loss": 0.9577, + "step": 8285 + }, + { + "epoch": 1.7674914675767917, + "grad_norm": 0.27241395815937036, + "learning_rate": 0.00014893194042113505, + "loss": 0.9296, + "step": 8286 + }, + { + "epoch": 1.7677047781569966, + "grad_norm": 0.29805340044778506, + "learning_rate": 0.00014892762128906062, + "loss": 0.9645, + "step": 8287 + }, + { + "epoch": 1.7679180887372015, + "grad_norm": 0.2561403813820701, + "learning_rate": 0.000148923301377069, + "loss": 0.9549, + "step": 8288 + }, + { + "epoch": 1.7681313993174061, + "grad_norm": 0.24028764614566048, + "learning_rate": 0.00014891898068520906, + "loss": 0.9375, + "step": 8289 + }, + { + "epoch": 1.7683447098976108, + "grad_norm": 0.2436788355313223, + "learning_rate": 0.00014891465921352973, + "loss": 0.9435, + "step": 8290 + }, + { + "epoch": 1.7685580204778157, + "grad_norm": 0.2835926855570173, + "learning_rate": 0.0001489103369620798, + "loss": 0.9221, + "step": 8291 + }, + { + "epoch": 1.7687713310580204, + "grad_norm": 0.27396035099916627, + "learning_rate": 0.0001489060139309083, + "loss": 0.9803, + "step": 8292 + }, + { + "epoch": 1.7689846416382253, + "grad_norm": 0.3298221075764088, + "learning_rate": 0.0001489016901200641, + "loss": 0.9514, + "step": 8293 + }, + { + "epoch": 1.76919795221843, + "grad_norm": 0.279396942691434, + "learning_rate": 0.0001488973655295961, + "loss": 0.9459, + "step": 8294 + }, + { + "epoch": 1.7694112627986347, + "grad_norm": 0.3015021143125397, + "learning_rate": 0.00014889304015955324, + "loss": 0.9377, + "step": 8295 + }, + { + "epoch": 1.7696245733788396, + "grad_norm": 0.2901185330580383, + "learning_rate": 0.0001488887140099845, + "loss": 0.9475, + "step": 8296 + }, + { + "epoch": 1.7698378839590445, + "grad_norm": 0.2551742830739145, + "learning_rate": 0.00014888438708093876, + "loss": 0.9323, + "step": 8297 + }, + { + "epoch": 1.7700511945392492, + "grad_norm": 0.28962645255605524, + "learning_rate": 0.00014888005937246506, + "loss": 0.9279, + "step": 8298 + }, + { + "epoch": 1.7702645051194539, + "grad_norm": 0.337729435744747, + "learning_rate": 0.00014887573088461233, + "loss": 0.9348, + "step": 8299 + }, + { + "epoch": 1.7704778156996586, + "grad_norm": 0.36726310914780386, + "learning_rate": 0.00014887140161742954, + "loss": 0.961, + "step": 8300 + }, + { + "epoch": 1.7706911262798635, + "grad_norm": 0.3633333392340565, + "learning_rate": 0.00014886707157096566, + "loss": 0.9706, + "step": 8301 + }, + { + "epoch": 1.7709044368600684, + "grad_norm": 0.3333815819243738, + "learning_rate": 0.00014886274074526971, + "loss": 0.9527, + "step": 8302 + }, + { + "epoch": 1.771117747440273, + "grad_norm": 0.27708295603644517, + "learning_rate": 0.0001488584091403907, + "loss": 0.9606, + "step": 8303 + }, + { + "epoch": 1.7713310580204777, + "grad_norm": 0.26782801176702564, + "learning_rate": 0.00014885407675637764, + "loss": 0.93, + "step": 8304 + }, + { + "epoch": 1.7715443686006824, + "grad_norm": 0.26413776802559014, + "learning_rate": 0.00014884974359327953, + "loss": 0.9352, + "step": 8305 + }, + { + "epoch": 1.7717576791808873, + "grad_norm": 0.29937654106977274, + "learning_rate": 0.00014884540965114543, + "loss": 0.9844, + "step": 8306 + }, + { + "epoch": 1.7719709897610922, + "grad_norm": 0.37363964942930533, + "learning_rate": 0.00014884107493002438, + "loss": 0.9173, + "step": 8307 + }, + { + "epoch": 1.772184300341297, + "grad_norm": 0.36441416992503994, + "learning_rate": 0.00014883673942996535, + "loss": 0.927, + "step": 8308 + }, + { + "epoch": 1.7723976109215016, + "grad_norm": 0.29094483996189047, + "learning_rate": 0.00014883240315101748, + "loss": 0.9304, + "step": 8309 + }, + { + "epoch": 1.7726109215017065, + "grad_norm": 0.27949346808847897, + "learning_rate": 0.00014882806609322984, + "loss": 0.9703, + "step": 8310 + }, + { + "epoch": 1.7728242320819114, + "grad_norm": 0.27342129903749535, + "learning_rate": 0.00014882372825665146, + "loss": 0.9422, + "step": 8311 + }, + { + "epoch": 1.773037542662116, + "grad_norm": 0.2555745838315043, + "learning_rate": 0.00014881938964133142, + "loss": 0.9666, + "step": 8312 + }, + { + "epoch": 1.7732508532423208, + "grad_norm": 0.2366825342065623, + "learning_rate": 0.00014881505024731884, + "loss": 0.9424, + "step": 8313 + }, + { + "epoch": 1.7734641638225255, + "grad_norm": 0.24870722419740598, + "learning_rate": 0.00014881071007466279, + "loss": 0.9539, + "step": 8314 + }, + { + "epoch": 1.7736774744027304, + "grad_norm": 0.2704396286271577, + "learning_rate": 0.00014880636912341242, + "loss": 0.9822, + "step": 8315 + }, + { + "epoch": 1.7738907849829353, + "grad_norm": 0.3124534616905643, + "learning_rate": 0.00014880202739361682, + "loss": 0.9574, + "step": 8316 + }, + { + "epoch": 1.77410409556314, + "grad_norm": 0.3326149940790477, + "learning_rate": 0.00014879768488532512, + "loss": 0.914, + "step": 8317 + }, + { + "epoch": 1.7743174061433447, + "grad_norm": 0.33728989851230956, + "learning_rate": 0.00014879334159858648, + "loss": 0.9455, + "step": 8318 + }, + { + "epoch": 1.7745307167235493, + "grad_norm": 0.33792037945135756, + "learning_rate": 0.00014878899753344998, + "loss": 0.9295, + "step": 8319 + }, + { + "epoch": 1.7747440273037542, + "grad_norm": 0.30963714904673256, + "learning_rate": 0.00014878465268996485, + "loss": 0.9239, + "step": 8320 + }, + { + "epoch": 1.7749573378839592, + "grad_norm": 0.2738543038603879, + "learning_rate": 0.0001487803070681802, + "loss": 0.9855, + "step": 8321 + }, + { + "epoch": 1.7751706484641638, + "grad_norm": 0.27502704782979714, + "learning_rate": 0.00014877596066814524, + "loss": 0.9869, + "step": 8322 + }, + { + "epoch": 1.7753839590443685, + "grad_norm": 0.23391354863847816, + "learning_rate": 0.00014877161348990912, + "loss": 0.9421, + "step": 8323 + }, + { + "epoch": 1.7755972696245734, + "grad_norm": 0.22833410452110203, + "learning_rate": 0.00014876726553352104, + "loss": 0.9608, + "step": 8324 + }, + { + "epoch": 1.775810580204778, + "grad_norm": 0.2059743581265258, + "learning_rate": 0.00014876291679903018, + "loss": 0.9523, + "step": 8325 + }, + { + "epoch": 1.776023890784983, + "grad_norm": 0.2266958990322202, + "learning_rate": 0.00014875856728648577, + "loss": 0.9168, + "step": 8326 + }, + { + "epoch": 1.7762372013651877, + "grad_norm": 0.2384489637972661, + "learning_rate": 0.000148754216995937, + "loss": 0.9357, + "step": 8327 + }, + { + "epoch": 1.7764505119453924, + "grad_norm": 0.21383306921728906, + "learning_rate": 0.0001487498659274331, + "loss": 0.9298, + "step": 8328 + }, + { + "epoch": 1.7766638225255973, + "grad_norm": 0.3147074566310021, + "learning_rate": 0.00014874551408102337, + "loss": 0.969, + "step": 8329 + }, + { + "epoch": 1.7768771331058022, + "grad_norm": 0.2652331677783074, + "learning_rate": 0.00014874116145675693, + "loss": 0.9533, + "step": 8330 + }, + { + "epoch": 1.7770904436860069, + "grad_norm": 0.2629862120155795, + "learning_rate": 0.0001487368080546831, + "loss": 0.9551, + "step": 8331 + }, + { + "epoch": 1.7773037542662116, + "grad_norm": 0.2564681871922879, + "learning_rate": 0.00014873245387485114, + "loss": 0.9419, + "step": 8332 + }, + { + "epoch": 1.7775170648464163, + "grad_norm": 0.26247999419603485, + "learning_rate": 0.0001487280989173103, + "loss": 0.9629, + "step": 8333 + }, + { + "epoch": 1.7777303754266212, + "grad_norm": 0.3401668546315791, + "learning_rate": 0.0001487237431821099, + "loss": 0.942, + "step": 8334 + }, + { + "epoch": 1.777943686006826, + "grad_norm": 0.33458192876375503, + "learning_rate": 0.00014871938666929916, + "loss": 0.9611, + "step": 8335 + }, + { + "epoch": 1.7781569965870307, + "grad_norm": 0.26161199518289013, + "learning_rate": 0.0001487150293789274, + "loss": 0.9563, + "step": 8336 + }, + { + "epoch": 1.7783703071672354, + "grad_norm": 0.4239252573484993, + "learning_rate": 0.00014871067131104395, + "loss": 0.9402, + "step": 8337 + }, + { + "epoch": 1.7785836177474401, + "grad_norm": 0.4965103571747821, + "learning_rate": 0.0001487063124656981, + "loss": 0.9528, + "step": 8338 + }, + { + "epoch": 1.778796928327645, + "grad_norm": 0.3844093641232316, + "learning_rate": 0.00014870195284293915, + "loss": 0.9074, + "step": 8339 + }, + { + "epoch": 1.77901023890785, + "grad_norm": 0.24800770062357405, + "learning_rate": 0.00014869759244281645, + "loss": 0.9527, + "step": 8340 + }, + { + "epoch": 1.7792235494880546, + "grad_norm": 0.3353210891723502, + "learning_rate": 0.00014869323126537933, + "loss": 0.9047, + "step": 8341 + }, + { + "epoch": 1.7794368600682593, + "grad_norm": 0.42992976219182893, + "learning_rate": 0.00014868886931067714, + "loss": 0.9704, + "step": 8342 + }, + { + "epoch": 1.7796501706484642, + "grad_norm": 0.4209830622225581, + "learning_rate": 0.00014868450657875928, + "loss": 0.9416, + "step": 8343 + }, + { + "epoch": 1.7798634812286689, + "grad_norm": 0.36500886545282596, + "learning_rate": 0.00014868014306967502, + "loss": 0.9741, + "step": 8344 + }, + { + "epoch": 1.7800767918088738, + "grad_norm": 0.4203789090832576, + "learning_rate": 0.0001486757787834738, + "loss": 0.9662, + "step": 8345 + }, + { + "epoch": 1.7802901023890785, + "grad_norm": 0.5471748827937185, + "learning_rate": 0.000148671413720205, + "loss": 0.9556, + "step": 8346 + }, + { + "epoch": 1.7805034129692832, + "grad_norm": 0.6336604280892696, + "learning_rate": 0.00014866704787991803, + "loss": 0.9751, + "step": 8347 + }, + { + "epoch": 1.780716723549488, + "grad_norm": 0.6086974250237578, + "learning_rate": 0.00014866268126266218, + "loss": 0.9542, + "step": 8348 + }, + { + "epoch": 1.780930034129693, + "grad_norm": 0.5538777358554441, + "learning_rate": 0.00014865831386848697, + "loss": 0.9105, + "step": 8349 + }, + { + "epoch": 1.7811433447098977, + "grad_norm": 0.4788837087148879, + "learning_rate": 0.0001486539456974418, + "loss": 0.9489, + "step": 8350 + }, + { + "epoch": 1.7813566552901023, + "grad_norm": 0.40879945860147926, + "learning_rate": 0.0001486495767495761, + "loss": 0.962, + "step": 8351 + }, + { + "epoch": 1.781569965870307, + "grad_norm": 0.4038538182123742, + "learning_rate": 0.00014864520702493925, + "loss": 0.9702, + "step": 8352 + }, + { + "epoch": 1.781783276450512, + "grad_norm": 0.4314207937177963, + "learning_rate": 0.00014864083652358072, + "loss": 0.9656, + "step": 8353 + }, + { + "epoch": 1.7819965870307168, + "grad_norm": 0.36809602118737045, + "learning_rate": 0.00014863646524555, + "loss": 0.9395, + "step": 8354 + }, + { + "epoch": 1.7822098976109215, + "grad_norm": 0.3975004517417102, + "learning_rate": 0.00014863209319089648, + "loss": 0.9503, + "step": 8355 + }, + { + "epoch": 1.7824232081911262, + "grad_norm": 0.5395664600981561, + "learning_rate": 0.0001486277203596697, + "loss": 0.9492, + "step": 8356 + }, + { + "epoch": 1.7826365187713311, + "grad_norm": 0.49343177952800227, + "learning_rate": 0.00014862334675191913, + "loss": 0.9343, + "step": 8357 + }, + { + "epoch": 1.7828498293515358, + "grad_norm": 0.33308094445265135, + "learning_rate": 0.0001486189723676942, + "loss": 0.9369, + "step": 8358 + }, + { + "epoch": 1.7830631399317407, + "grad_norm": 0.370102118198214, + "learning_rate": 0.00014861459720704446, + "loss": 0.9715, + "step": 8359 + }, + { + "epoch": 1.7832764505119454, + "grad_norm": 0.36684805500488765, + "learning_rate": 0.0001486102212700194, + "loss": 0.9666, + "step": 8360 + }, + { + "epoch": 1.78348976109215, + "grad_norm": 0.25974813349308046, + "learning_rate": 0.00014860584455666853, + "loss": 0.9208, + "step": 8361 + }, + { + "epoch": 1.783703071672355, + "grad_norm": 0.35553950067888107, + "learning_rate": 0.00014860146706704137, + "loss": 0.9601, + "step": 8362 + }, + { + "epoch": 1.7839163822525599, + "grad_norm": 0.5057570004109261, + "learning_rate": 0.00014859708880118747, + "loss": 0.9722, + "step": 8363 + }, + { + "epoch": 1.7841296928327646, + "grad_norm": 0.4860221111529296, + "learning_rate": 0.00014859270975915633, + "loss": 0.9455, + "step": 8364 + }, + { + "epoch": 1.7843430034129693, + "grad_norm": 0.4267472297970275, + "learning_rate": 0.00014858832994099755, + "loss": 0.9646, + "step": 8365 + }, + { + "epoch": 1.784556313993174, + "grad_norm": 0.46110309691337437, + "learning_rate": 0.00014858394934676066, + "loss": 0.9682, + "step": 8366 + }, + { + "epoch": 1.7847696245733788, + "grad_norm": 0.3910238452230916, + "learning_rate": 0.00014857956797649524, + "loss": 0.9541, + "step": 8367 + }, + { + "epoch": 1.7849829351535837, + "grad_norm": 0.36297989395367763, + "learning_rate": 0.00014857518583025084, + "loss": 0.9195, + "step": 8368 + }, + { + "epoch": 1.7851962457337884, + "grad_norm": 0.40943405932215143, + "learning_rate": 0.00014857080290807704, + "loss": 0.9616, + "step": 8369 + }, + { + "epoch": 1.7854095563139931, + "grad_norm": 0.27522123431874446, + "learning_rate": 0.00014856641921002348, + "loss": 0.9424, + "step": 8370 + }, + { + "epoch": 1.7856228668941978, + "grad_norm": 0.3700653097343831, + "learning_rate": 0.00014856203473613972, + "loss": 0.9538, + "step": 8371 + }, + { + "epoch": 1.7858361774744027, + "grad_norm": 0.48152303166421906, + "learning_rate": 0.00014855764948647538, + "loss": 0.9578, + "step": 8372 + }, + { + "epoch": 1.7860494880546076, + "grad_norm": 0.4772904820564267, + "learning_rate": 0.0001485532634610801, + "loss": 0.941, + "step": 8373 + }, + { + "epoch": 1.7862627986348123, + "grad_norm": 0.4659157960768083, + "learning_rate": 0.00014854887666000346, + "loss": 0.9119, + "step": 8374 + }, + { + "epoch": 1.786476109215017, + "grad_norm": 0.44270975838535787, + "learning_rate": 0.00014854448908329518, + "loss": 0.978, + "step": 8375 + }, + { + "epoch": 1.786689419795222, + "grad_norm": 0.42981281799910115, + "learning_rate": 0.0001485401007310048, + "loss": 0.9388, + "step": 8376 + }, + { + "epoch": 1.7869027303754266, + "grad_norm": 0.355367797769053, + "learning_rate": 0.00014853571160318205, + "loss": 0.9548, + "step": 8377 + }, + { + "epoch": 1.7871160409556315, + "grad_norm": 0.22897612748117646, + "learning_rate": 0.00014853132169987657, + "loss": 0.9373, + "step": 8378 + }, + { + "epoch": 1.7873293515358362, + "grad_norm": 0.30684782938482535, + "learning_rate": 0.000148526931021138, + "loss": 0.9216, + "step": 8379 + }, + { + "epoch": 1.7875426621160408, + "grad_norm": 0.40921660228263823, + "learning_rate": 0.00014852253956701612, + "loss": 0.9623, + "step": 8380 + }, + { + "epoch": 1.7877559726962458, + "grad_norm": 0.40086280336857655, + "learning_rate": 0.00014851814733756052, + "loss": 0.9468, + "step": 8381 + }, + { + "epoch": 1.7879692832764507, + "grad_norm": 0.42167272870009875, + "learning_rate": 0.00014851375433282092, + "loss": 0.9377, + "step": 8382 + }, + { + "epoch": 1.7881825938566553, + "grad_norm": 0.4454720491397837, + "learning_rate": 0.00014850936055284703, + "loss": 0.9553, + "step": 8383 + }, + { + "epoch": 1.78839590443686, + "grad_norm": 0.38649988029520055, + "learning_rate": 0.0001485049659976886, + "loss": 0.9728, + "step": 8384 + }, + { + "epoch": 1.7886092150170647, + "grad_norm": 0.38056627519205527, + "learning_rate": 0.00014850057066739531, + "loss": 0.9388, + "step": 8385 + }, + { + "epoch": 1.7888225255972696, + "grad_norm": 0.31488278877878695, + "learning_rate": 0.00014849617456201694, + "loss": 0.9303, + "step": 8386 + }, + { + "epoch": 1.7890358361774745, + "grad_norm": 0.2982597888006618, + "learning_rate": 0.00014849177768160314, + "loss": 0.9664, + "step": 8387 + }, + { + "epoch": 1.7892491467576792, + "grad_norm": 0.4477484142904187, + "learning_rate": 0.00014848738002620377, + "loss": 0.936, + "step": 8388 + }, + { + "epoch": 1.789462457337884, + "grad_norm": 0.541098662526087, + "learning_rate": 0.00014848298159586854, + "loss": 0.9578, + "step": 8389 + }, + { + "epoch": 1.7896757679180886, + "grad_norm": 0.622139615796002, + "learning_rate": 0.00014847858239064722, + "loss": 0.9651, + "step": 8390 + }, + { + "epoch": 1.7898890784982935, + "grad_norm": 0.6860626131999991, + "learning_rate": 0.0001484741824105896, + "loss": 0.8971, + "step": 8391 + }, + { + "epoch": 1.7901023890784984, + "grad_norm": 0.646990873417121, + "learning_rate": 0.00014846978165574542, + "loss": 0.9182, + "step": 8392 + }, + { + "epoch": 1.790315699658703, + "grad_norm": 0.5881355278735972, + "learning_rate": 0.0001484653801261645, + "loss": 0.9609, + "step": 8393 + }, + { + "epoch": 1.7905290102389078, + "grad_norm": 0.4150921435283091, + "learning_rate": 0.00014846097782189672, + "loss": 0.9781, + "step": 8394 + }, + { + "epoch": 1.7907423208191127, + "grad_norm": 0.28967407415783464, + "learning_rate": 0.00014845657474299177, + "loss": 0.9468, + "step": 8395 + }, + { + "epoch": 1.7909556313993176, + "grad_norm": 0.31529294111174244, + "learning_rate": 0.00014845217088949953, + "loss": 0.9501, + "step": 8396 + }, + { + "epoch": 1.7911689419795223, + "grad_norm": 0.32432514632725573, + "learning_rate": 0.00014844776626146984, + "loss": 0.9724, + "step": 8397 + }, + { + "epoch": 1.791382252559727, + "grad_norm": 0.322938919312533, + "learning_rate": 0.0001484433608589525, + "loss": 0.9614, + "step": 8398 + }, + { + "epoch": 1.7915955631399316, + "grad_norm": 0.34900685526139663, + "learning_rate": 0.00014843895468199741, + "loss": 0.9342, + "step": 8399 + }, + { + "epoch": 1.7918088737201365, + "grad_norm": 0.38921295922605986, + "learning_rate": 0.0001484345477306544, + "loss": 0.9535, + "step": 8400 + }, + { + "epoch": 1.7920221843003414, + "grad_norm": 0.35489454150870947, + "learning_rate": 0.0001484301400049733, + "loss": 0.9856, + "step": 8401 + }, + { + "epoch": 1.7922354948805461, + "grad_norm": 0.2956128298209865, + "learning_rate": 0.00014842573150500405, + "loss": 0.9654, + "step": 8402 + }, + { + "epoch": 1.7924488054607508, + "grad_norm": 0.2704840850996802, + "learning_rate": 0.00014842132223079647, + "loss": 0.9059, + "step": 8403 + }, + { + "epoch": 1.7926621160409555, + "grad_norm": 0.2888711792457139, + "learning_rate": 0.00014841691218240048, + "loss": 0.9867, + "step": 8404 + }, + { + "epoch": 1.7928754266211604, + "grad_norm": 0.30182181502759037, + "learning_rate": 0.000148412501359866, + "loss": 0.9621, + "step": 8405 + }, + { + "epoch": 1.7930887372013653, + "grad_norm": 0.3017483914782573, + "learning_rate": 0.00014840808976324287, + "loss": 0.9393, + "step": 8406 + }, + { + "epoch": 1.79330204778157, + "grad_norm": 0.38251328643304694, + "learning_rate": 0.0001484036773925811, + "loss": 0.9545, + "step": 8407 + }, + { + "epoch": 1.7935153583617747, + "grad_norm": 0.4660865227939068, + "learning_rate": 0.00014839926424793054, + "loss": 1.0156, + "step": 8408 + }, + { + "epoch": 1.7937286689419796, + "grad_norm": 0.5044751958193907, + "learning_rate": 0.0001483948503293412, + "loss": 0.9708, + "step": 8409 + }, + { + "epoch": 1.7939419795221843, + "grad_norm": 0.4363111985055266, + "learning_rate": 0.00014839043563686294, + "loss": 0.978, + "step": 8410 + }, + { + "epoch": 1.7941552901023892, + "grad_norm": 0.4344955673531615, + "learning_rate": 0.00014838602017054573, + "loss": 0.9183, + "step": 8411 + }, + { + "epoch": 1.7943686006825939, + "grad_norm": 0.481009867626633, + "learning_rate": 0.00014838160393043957, + "loss": 0.9519, + "step": 8412 + }, + { + "epoch": 1.7945819112627985, + "grad_norm": 0.38073628662578074, + "learning_rate": 0.00014837718691659445, + "loss": 0.9577, + "step": 8413 + }, + { + "epoch": 1.7947952218430034, + "grad_norm": 0.2675629718205059, + "learning_rate": 0.0001483727691290603, + "loss": 0.9378, + "step": 8414 + }, + { + "epoch": 1.7950085324232083, + "grad_norm": 0.2810953467624289, + "learning_rate": 0.00014836835056788707, + "loss": 0.9395, + "step": 8415 + }, + { + "epoch": 1.795221843003413, + "grad_norm": 0.288951326621554, + "learning_rate": 0.00014836393123312484, + "loss": 0.9532, + "step": 8416 + }, + { + "epoch": 1.7954351535836177, + "grad_norm": 0.32885889450139694, + "learning_rate": 0.00014835951112482357, + "loss": 0.9623, + "step": 8417 + }, + { + "epoch": 1.7956484641638224, + "grad_norm": 0.4682216791105898, + "learning_rate": 0.00014835509024303326, + "loss": 0.9467, + "step": 8418 + }, + { + "epoch": 1.7958617747440273, + "grad_norm": 0.6603924228102706, + "learning_rate": 0.000148350668587804, + "loss": 0.9521, + "step": 8419 + }, + { + "epoch": 1.7960750853242322, + "grad_norm": 0.8219672015231426, + "learning_rate": 0.00014834624615918572, + "loss": 0.9629, + "step": 8420 + }, + { + "epoch": 1.796288395904437, + "grad_norm": 0.8026119558811122, + "learning_rate": 0.00014834182295722856, + "loss": 0.9324, + "step": 8421 + }, + { + "epoch": 1.7965017064846416, + "grad_norm": 0.639470874704964, + "learning_rate": 0.0001483373989819825, + "loss": 0.9824, + "step": 8422 + }, + { + "epoch": 1.7967150170648463, + "grad_norm": 0.46618679741449653, + "learning_rate": 0.00014833297423349763, + "loss": 0.9264, + "step": 8423 + }, + { + "epoch": 1.7969283276450512, + "grad_norm": 0.34137696150787245, + "learning_rate": 0.000148328548711824, + "loss": 0.9578, + "step": 8424 + }, + { + "epoch": 1.797141638225256, + "grad_norm": 0.41171841139294885, + "learning_rate": 0.00014832412241701172, + "loss": 0.9651, + "step": 8425 + }, + { + "epoch": 1.7973549488054608, + "grad_norm": 0.5432600386718027, + "learning_rate": 0.0001483196953491108, + "loss": 0.9576, + "step": 8426 + }, + { + "epoch": 1.7975682593856654, + "grad_norm": 0.6734355751497999, + "learning_rate": 0.00014831526750817142, + "loss": 0.9304, + "step": 8427 + }, + { + "epoch": 1.7977815699658704, + "grad_norm": 0.708603874678737, + "learning_rate": 0.00014831083889424362, + "loss": 0.9385, + "step": 8428 + }, + { + "epoch": 1.797994880546075, + "grad_norm": 0.6075398479885524, + "learning_rate": 0.00014830640950737754, + "loss": 0.957, + "step": 8429 + }, + { + "epoch": 1.79820819112628, + "grad_norm": 0.37510436908532246, + "learning_rate": 0.00014830197934762325, + "loss": 0.9179, + "step": 8430 + }, + { + "epoch": 1.7984215017064846, + "grad_norm": 0.25639837551983446, + "learning_rate": 0.00014829754841503095, + "loss": 0.9566, + "step": 8431 + }, + { + "epoch": 1.7986348122866893, + "grad_norm": 0.3500594507994639, + "learning_rate": 0.00014829311670965073, + "loss": 0.9621, + "step": 8432 + }, + { + "epoch": 1.7988481228668942, + "grad_norm": 0.359003529948244, + "learning_rate": 0.00014828868423153276, + "loss": 0.965, + "step": 8433 + }, + { + "epoch": 1.7990614334470991, + "grad_norm": 0.36671441703488783, + "learning_rate": 0.00014828425098072716, + "loss": 0.9923, + "step": 8434 + }, + { + "epoch": 1.7992747440273038, + "grad_norm": 0.3691183102400423, + "learning_rate": 0.00014827981695728412, + "loss": 0.9644, + "step": 8435 + }, + { + "epoch": 1.7994880546075085, + "grad_norm": 0.3756364211268035, + "learning_rate": 0.0001482753821612538, + "loss": 0.9484, + "step": 8436 + }, + { + "epoch": 1.7997013651877132, + "grad_norm": 0.30656922304284595, + "learning_rate": 0.00014827094659268638, + "loss": 0.9326, + "step": 8437 + }, + { + "epoch": 1.799914675767918, + "grad_norm": 0.2518735429729289, + "learning_rate": 0.00014826651025163208, + "loss": 0.9611, + "step": 8438 + }, + { + "epoch": 1.800127986348123, + "grad_norm": 0.20675409959745597, + "learning_rate": 0.00014826207313814102, + "loss": 0.9495, + "step": 8439 + }, + { + "epoch": 1.8003412969283277, + "grad_norm": 0.29211605664195417, + "learning_rate": 0.0001482576352522635, + "loss": 0.9417, + "step": 8440 + }, + { + "epoch": 1.8005546075085324, + "grad_norm": 0.35621090918054377, + "learning_rate": 0.00014825319659404965, + "loss": 0.9445, + "step": 8441 + }, + { + "epoch": 1.800767918088737, + "grad_norm": 0.34556621232096496, + "learning_rate": 0.00014824875716354976, + "loss": 0.9711, + "step": 8442 + }, + { + "epoch": 1.800981228668942, + "grad_norm": 0.29186433745535256, + "learning_rate": 0.00014824431696081403, + "loss": 0.9591, + "step": 8443 + }, + { + "epoch": 1.8011945392491469, + "grad_norm": 0.28361580208020803, + "learning_rate": 0.0001482398759858927, + "loss": 0.9581, + "step": 8444 + }, + { + "epoch": 1.8014078498293515, + "grad_norm": 0.2897108371023245, + "learning_rate": 0.00014823543423883603, + "loss": 0.9789, + "step": 8445 + }, + { + "epoch": 1.8016211604095562, + "grad_norm": 0.36169949986650884, + "learning_rate": 0.00014823099171969425, + "loss": 0.9613, + "step": 8446 + }, + { + "epoch": 1.8018344709897611, + "grad_norm": 0.3353000696709937, + "learning_rate": 0.0001482265484285177, + "loss": 0.9533, + "step": 8447 + }, + { + "epoch": 1.802047781569966, + "grad_norm": 0.2749029297721329, + "learning_rate": 0.00014822210436535657, + "loss": 0.9413, + "step": 8448 + }, + { + "epoch": 1.8022610921501707, + "grad_norm": 0.23104896077689682, + "learning_rate": 0.00014821765953026118, + "loss": 0.9543, + "step": 8449 + }, + { + "epoch": 1.8024744027303754, + "grad_norm": 0.2683055985512421, + "learning_rate": 0.00014821321392328185, + "loss": 0.9654, + "step": 8450 + }, + { + "epoch": 1.80268771331058, + "grad_norm": 0.2713831081954724, + "learning_rate": 0.00014820876754446883, + "loss": 0.924, + "step": 8451 + }, + { + "epoch": 1.802901023890785, + "grad_norm": 0.29187649270044513, + "learning_rate": 0.00014820432039387246, + "loss": 0.9311, + "step": 8452 + }, + { + "epoch": 1.80311433447099, + "grad_norm": 0.3269562176972052, + "learning_rate": 0.00014819987247154307, + "loss": 0.9293, + "step": 8453 + }, + { + "epoch": 1.8033276450511946, + "grad_norm": 0.31976156492107344, + "learning_rate": 0.00014819542377753097, + "loss": 0.9683, + "step": 8454 + }, + { + "epoch": 1.8035409556313993, + "grad_norm": 0.2757955884137963, + "learning_rate": 0.00014819097431188652, + "loss": 0.9581, + "step": 8455 + }, + { + "epoch": 1.803754266211604, + "grad_norm": 0.37335659514941644, + "learning_rate": 0.00014818652407466002, + "loss": 0.9439, + "step": 8456 + }, + { + "epoch": 1.8039675767918089, + "grad_norm": 0.3781556313123288, + "learning_rate": 0.00014818207306590187, + "loss": 0.9312, + "step": 8457 + }, + { + "epoch": 1.8041808873720138, + "grad_norm": 0.24875751679723174, + "learning_rate": 0.00014817762128566242, + "loss": 0.9292, + "step": 8458 + }, + { + "epoch": 1.8043941979522184, + "grad_norm": 0.27315633196748657, + "learning_rate": 0.00014817316873399204, + "loss": 0.9578, + "step": 8459 + }, + { + "epoch": 1.8046075085324231, + "grad_norm": 0.31236303826665496, + "learning_rate": 0.0001481687154109411, + "loss": 0.9201, + "step": 8460 + }, + { + "epoch": 1.804820819112628, + "grad_norm": 0.27408739159843204, + "learning_rate": 0.00014816426131656, + "loss": 0.9935, + "step": 8461 + }, + { + "epoch": 1.8050341296928327, + "grad_norm": 0.27197066857286645, + "learning_rate": 0.00014815980645089912, + "loss": 0.9545, + "step": 8462 + }, + { + "epoch": 1.8052474402730376, + "grad_norm": 0.2666397436792411, + "learning_rate": 0.00014815535081400889, + "loss": 0.9543, + "step": 8463 + }, + { + "epoch": 1.8054607508532423, + "grad_norm": 0.3444175466246315, + "learning_rate": 0.00014815089440593974, + "loss": 0.9199, + "step": 8464 + }, + { + "epoch": 1.805674061433447, + "grad_norm": 0.32266405071841137, + "learning_rate": 0.00014814643722674208, + "loss": 0.8969, + "step": 8465 + }, + { + "epoch": 1.805887372013652, + "grad_norm": 0.395375554321788, + "learning_rate": 0.0001481419792764663, + "loss": 0.9414, + "step": 8466 + }, + { + "epoch": 1.8061006825938568, + "grad_norm": 0.590538834467079, + "learning_rate": 0.0001481375205551629, + "loss": 0.9259, + "step": 8467 + }, + { + "epoch": 1.8063139931740615, + "grad_norm": 0.6910565290082822, + "learning_rate": 0.00014813306106288232, + "loss": 0.9442, + "step": 8468 + }, + { + "epoch": 1.8065273037542662, + "grad_norm": 0.6875658596300921, + "learning_rate": 0.00014812860079967499, + "loss": 0.9437, + "step": 8469 + }, + { + "epoch": 1.8067406143344709, + "grad_norm": 0.5863417369447137, + "learning_rate": 0.0001481241397655914, + "loss": 0.9315, + "step": 8470 + }, + { + "epoch": 1.8069539249146758, + "grad_norm": 0.427772036669678, + "learning_rate": 0.00014811967796068203, + "loss": 0.9474, + "step": 8471 + }, + { + "epoch": 1.8071672354948807, + "grad_norm": 0.2681396350468327, + "learning_rate": 0.00014811521538499734, + "loss": 0.9634, + "step": 8472 + }, + { + "epoch": 1.8073805460750854, + "grad_norm": 0.34011852371902346, + "learning_rate": 0.00014811075203858786, + "loss": 0.9153, + "step": 8473 + }, + { + "epoch": 1.80759385665529, + "grad_norm": 0.3923798307103269, + "learning_rate": 0.00014810628792150406, + "loss": 0.9468, + "step": 8474 + }, + { + "epoch": 1.8078071672354947, + "grad_norm": 0.4115895070410966, + "learning_rate": 0.00014810182303379648, + "loss": 0.9712, + "step": 8475 + }, + { + "epoch": 1.8080204778156996, + "grad_norm": 0.5237235553902356, + "learning_rate": 0.00014809735737551563, + "loss": 0.9414, + "step": 8476 + }, + { + "epoch": 1.8082337883959045, + "grad_norm": 0.5395247954287291, + "learning_rate": 0.00014809289094671203, + "loss": 0.9924, + "step": 8477 + }, + { + "epoch": 1.8084470989761092, + "grad_norm": 0.54212066925529, + "learning_rate": 0.0001480884237474362, + "loss": 0.9331, + "step": 8478 + }, + { + "epoch": 1.808660409556314, + "grad_norm": 0.4375678979221658, + "learning_rate": 0.00014808395577773874, + "loss": 0.9545, + "step": 8479 + }, + { + "epoch": 1.8088737201365188, + "grad_norm": 0.32449402774845987, + "learning_rate": 0.00014807948703767017, + "loss": 0.9307, + "step": 8480 + }, + { + "epoch": 1.8090870307167235, + "grad_norm": 0.3540302190869652, + "learning_rate": 0.00014807501752728104, + "loss": 0.9476, + "step": 8481 + }, + { + "epoch": 1.8093003412969284, + "grad_norm": 0.39307473764299883, + "learning_rate": 0.00014807054724662194, + "loss": 0.9386, + "step": 8482 + }, + { + "epoch": 1.809513651877133, + "grad_norm": 0.44520499223647836, + "learning_rate": 0.0001480660761957435, + "loss": 0.938, + "step": 8483 + }, + { + "epoch": 1.8097269624573378, + "grad_norm": 0.37785689590161947, + "learning_rate": 0.00014806160437469624, + "loss": 0.9473, + "step": 8484 + }, + { + "epoch": 1.8099402730375427, + "grad_norm": 0.3822841484283064, + "learning_rate": 0.00014805713178353075, + "loss": 0.9317, + "step": 8485 + }, + { + "epoch": 1.8101535836177476, + "grad_norm": 0.49334293138972357, + "learning_rate": 0.0001480526584222977, + "loss": 0.9211, + "step": 8486 + }, + { + "epoch": 1.8103668941979523, + "grad_norm": 0.5684478254388987, + "learning_rate": 0.0001480481842910477, + "loss": 0.9271, + "step": 8487 + }, + { + "epoch": 1.810580204778157, + "grad_norm": 0.4346678579318798, + "learning_rate": 0.0001480437093898313, + "loss": 0.9614, + "step": 8488 + }, + { + "epoch": 1.8107935153583616, + "grad_norm": 0.34968269802926344, + "learning_rate": 0.0001480392337186992, + "loss": 0.9699, + "step": 8489 + }, + { + "epoch": 1.8110068259385665, + "grad_norm": 0.3837707657105286, + "learning_rate": 0.00014803475727770204, + "loss": 0.9516, + "step": 8490 + }, + { + "epoch": 1.8112201365187715, + "grad_norm": 0.30484327114816634, + "learning_rate": 0.00014803028006689045, + "loss": 0.9451, + "step": 8491 + }, + { + "epoch": 1.8114334470989761, + "grad_norm": 0.24086938931645901, + "learning_rate": 0.0001480258020863151, + "loss": 0.9871, + "step": 8492 + }, + { + "epoch": 1.8116467576791808, + "grad_norm": 0.3753558847863838, + "learning_rate": 0.00014802132333602662, + "loss": 0.957, + "step": 8493 + }, + { + "epoch": 1.8118600682593855, + "grad_norm": 0.404804525428544, + "learning_rate": 0.00014801684381607577, + "loss": 0.9528, + "step": 8494 + }, + { + "epoch": 1.8120733788395904, + "grad_norm": 0.49764217552988715, + "learning_rate": 0.00014801236352651316, + "loss": 0.9524, + "step": 8495 + }, + { + "epoch": 1.8122866894197953, + "grad_norm": 0.5877298018749828, + "learning_rate": 0.00014800788246738952, + "loss": 0.9572, + "step": 8496 + }, + { + "epoch": 1.8125, + "grad_norm": 0.5693394770951543, + "learning_rate": 0.00014800340063875553, + "loss": 0.9228, + "step": 8497 + }, + { + "epoch": 1.8127133105802047, + "grad_norm": 0.5263862676467633, + "learning_rate": 0.00014799891804066195, + "loss": 0.9617, + "step": 8498 + }, + { + "epoch": 1.8129266211604096, + "grad_norm": 0.4257442335088898, + "learning_rate": 0.00014799443467315945, + "loss": 0.9257, + "step": 8499 + }, + { + "epoch": 1.8131399317406145, + "grad_norm": 0.381789269507205, + "learning_rate": 0.0001479899505362988, + "loss": 0.9409, + "step": 8500 + }, + { + "epoch": 1.8133532423208192, + "grad_norm": 0.3528714601639787, + "learning_rate": 0.00014798546563013067, + "loss": 0.9253, + "step": 8501 + }, + { + "epoch": 1.8135665529010239, + "grad_norm": 0.26122517197539585, + "learning_rate": 0.0001479809799547059, + "loss": 0.9363, + "step": 8502 + }, + { + "epoch": 1.8137798634812285, + "grad_norm": 0.30692888296297605, + "learning_rate": 0.00014797649351007517, + "loss": 0.9889, + "step": 8503 + }, + { + "epoch": 1.8139931740614335, + "grad_norm": 0.4045166824750134, + "learning_rate": 0.0001479720062962893, + "loss": 0.9584, + "step": 8504 + }, + { + "epoch": 1.8142064846416384, + "grad_norm": 0.421826648608782, + "learning_rate": 0.00014796751831339902, + "loss": 0.9566, + "step": 8505 + }, + { + "epoch": 1.814419795221843, + "grad_norm": 0.43455280750860714, + "learning_rate": 0.00014796302956145512, + "loss": 0.965, + "step": 8506 + }, + { + "epoch": 1.8146331058020477, + "grad_norm": 0.4209919422241601, + "learning_rate": 0.0001479585400405084, + "loss": 0.9438, + "step": 8507 + }, + { + "epoch": 1.8148464163822524, + "grad_norm": 0.4482017517849466, + "learning_rate": 0.00014795404975060964, + "loss": 0.9582, + "step": 8508 + }, + { + "epoch": 1.8150597269624573, + "grad_norm": 0.3918244949213513, + "learning_rate": 0.00014794955869180967, + "loss": 0.932, + "step": 8509 + }, + { + "epoch": 1.8152730375426622, + "grad_norm": 0.28971511342862266, + "learning_rate": 0.00014794506686415933, + "loss": 0.966, + "step": 8510 + }, + { + "epoch": 1.815486348122867, + "grad_norm": 0.258014911518132, + "learning_rate": 0.0001479405742677094, + "loss": 0.9721, + "step": 8511 + }, + { + "epoch": 1.8156996587030716, + "grad_norm": 0.2931977014025339, + "learning_rate": 0.00014793608090251073, + "loss": 0.9602, + "step": 8512 + }, + { + "epoch": 1.8159129692832765, + "grad_norm": 0.3002643865674293, + "learning_rate": 0.00014793158676861416, + "loss": 0.9522, + "step": 8513 + }, + { + "epoch": 1.8161262798634812, + "grad_norm": 0.2681990316474975, + "learning_rate": 0.00014792709186607053, + "loss": 0.9343, + "step": 8514 + }, + { + "epoch": 1.816339590443686, + "grad_norm": 0.25899970056834254, + "learning_rate": 0.00014792259619493073, + "loss": 0.9556, + "step": 8515 + }, + { + "epoch": 1.8165529010238908, + "grad_norm": 0.2642277757894329, + "learning_rate": 0.0001479180997552456, + "loss": 0.9414, + "step": 8516 + }, + { + "epoch": 1.8167662116040955, + "grad_norm": 0.2495446849458782, + "learning_rate": 0.00014791360254706604, + "loss": 0.9305, + "step": 8517 + }, + { + "epoch": 1.8169795221843004, + "grad_norm": 0.2325849091765148, + "learning_rate": 0.00014790910457044293, + "loss": 0.9301, + "step": 8518 + }, + { + "epoch": 1.8171928327645053, + "grad_norm": 0.28348108265636507, + "learning_rate": 0.00014790460582542715, + "loss": 0.961, + "step": 8519 + }, + { + "epoch": 1.81740614334471, + "grad_norm": 0.26650691417018063, + "learning_rate": 0.0001479001063120696, + "loss": 0.9159, + "step": 8520 + }, + { + "epoch": 1.8176194539249146, + "grad_norm": 0.3420457018742598, + "learning_rate": 0.00014789560603042123, + "loss": 0.9561, + "step": 8521 + }, + { + "epoch": 1.8178327645051193, + "grad_norm": 0.33195922275385426, + "learning_rate": 0.0001478911049805329, + "loss": 0.9231, + "step": 8522 + }, + { + "epoch": 1.8180460750853242, + "grad_norm": 0.32470546091378516, + "learning_rate": 0.00014788660316245562, + "loss": 0.9463, + "step": 8523 + }, + { + "epoch": 1.8182593856655291, + "grad_norm": 0.34681212083281143, + "learning_rate": 0.00014788210057624027, + "loss": 0.9591, + "step": 8524 + }, + { + "epoch": 1.8184726962457338, + "grad_norm": 0.3892476148729343, + "learning_rate": 0.0001478775972219378, + "loss": 0.961, + "step": 8525 + }, + { + "epoch": 1.8186860068259385, + "grad_norm": 0.3319760370874512, + "learning_rate": 0.00014787309309959918, + "loss": 0.9643, + "step": 8526 + }, + { + "epoch": 1.8188993174061432, + "grad_norm": 0.26550828517150593, + "learning_rate": 0.00014786858820927537, + "loss": 0.9406, + "step": 8527 + }, + { + "epoch": 1.819112627986348, + "grad_norm": 0.2459641151701346, + "learning_rate": 0.00014786408255101734, + "loss": 0.9311, + "step": 8528 + }, + { + "epoch": 1.819325938566553, + "grad_norm": 0.23258647997000462, + "learning_rate": 0.00014785957612487607, + "loss": 0.9644, + "step": 8529 + }, + { + "epoch": 1.8195392491467577, + "grad_norm": 0.31550522589703744, + "learning_rate": 0.00014785506893090256, + "loss": 0.953, + "step": 8530 + }, + { + "epoch": 1.8197525597269624, + "grad_norm": 0.4151325981686032, + "learning_rate": 0.0001478505609691478, + "loss": 0.9411, + "step": 8531 + }, + { + "epoch": 1.8199658703071673, + "grad_norm": 0.45622351142932943, + "learning_rate": 0.0001478460522396628, + "loss": 0.9584, + "step": 8532 + }, + { + "epoch": 1.820179180887372, + "grad_norm": 0.5751195484918181, + "learning_rate": 0.00014784154274249858, + "loss": 0.9643, + "step": 8533 + }, + { + "epoch": 1.8203924914675769, + "grad_norm": 0.70183691229799, + "learning_rate": 0.00014783703247770618, + "loss": 0.9467, + "step": 8534 + }, + { + "epoch": 1.8206058020477816, + "grad_norm": 0.7511866849598174, + "learning_rate": 0.0001478325214453366, + "loss": 0.9604, + "step": 8535 + }, + { + "epoch": 1.8208191126279862, + "grad_norm": 0.7709145019300908, + "learning_rate": 0.00014782800964544088, + "loss": 0.9565, + "step": 8536 + }, + { + "epoch": 1.8210324232081911, + "grad_norm": 0.8111327102865241, + "learning_rate": 0.00014782349707807014, + "loss": 0.9451, + "step": 8537 + }, + { + "epoch": 1.821245733788396, + "grad_norm": 0.7794826755639106, + "learning_rate": 0.00014781898374327534, + "loss": 0.9615, + "step": 8538 + }, + { + "epoch": 1.8214590443686007, + "grad_norm": 0.6361582013636645, + "learning_rate": 0.00014781446964110762, + "loss": 0.9621, + "step": 8539 + }, + { + "epoch": 1.8216723549488054, + "grad_norm": 0.4824741870347695, + "learning_rate": 0.00014780995477161804, + "loss": 0.9357, + "step": 8540 + }, + { + "epoch": 1.82188566552901, + "grad_norm": 0.31128993205212474, + "learning_rate": 0.00014780543913485767, + "loss": 0.9598, + "step": 8541 + }, + { + "epoch": 1.822098976109215, + "grad_norm": 0.3841031720141458, + "learning_rate": 0.00014780092273087761, + "loss": 0.9711, + "step": 8542 + }, + { + "epoch": 1.82231228668942, + "grad_norm": 0.5116538372595268, + "learning_rate": 0.000147796405559729, + "loss": 0.9236, + "step": 8543 + }, + { + "epoch": 1.8225255972696246, + "grad_norm": 0.6983558794669468, + "learning_rate": 0.00014779188762146288, + "loss": 0.9158, + "step": 8544 + }, + { + "epoch": 1.8227389078498293, + "grad_norm": 0.7354344456760543, + "learning_rate": 0.00014778736891613045, + "loss": 0.9379, + "step": 8545 + }, + { + "epoch": 1.822952218430034, + "grad_norm": 0.5706529897203856, + "learning_rate": 0.00014778284944378279, + "loss": 0.987, + "step": 8546 + }, + { + "epoch": 1.8231655290102389, + "grad_norm": 0.45922691835552526, + "learning_rate": 0.00014777832920447102, + "loss": 0.9442, + "step": 8547 + }, + { + "epoch": 1.8233788395904438, + "grad_norm": 0.37915902297412535, + "learning_rate": 0.00014777380819824633, + "loss": 0.942, + "step": 8548 + }, + { + "epoch": 1.8235921501706485, + "grad_norm": 0.24853034789553632, + "learning_rate": 0.0001477692864251599, + "loss": 0.9842, + "step": 8549 + }, + { + "epoch": 1.8238054607508531, + "grad_norm": 0.27597962317622055, + "learning_rate": 0.00014776476388526281, + "loss": 0.9416, + "step": 8550 + }, + { + "epoch": 1.824018771331058, + "grad_norm": 0.34884287002131187, + "learning_rate": 0.00014776024057860628, + "loss": 0.9216, + "step": 8551 + }, + { + "epoch": 1.824232081911263, + "grad_norm": 0.36650533044149386, + "learning_rate": 0.0001477557165052415, + "loss": 0.9645, + "step": 8552 + }, + { + "epoch": 1.8244453924914676, + "grad_norm": 1.210536284217599, + "learning_rate": 0.00014775119166521964, + "loss": 0.9311, + "step": 8553 + }, + { + "epoch": 1.8246587030716723, + "grad_norm": 0.25476792881694577, + "learning_rate": 0.00014774666605859188, + "loss": 0.9611, + "step": 8554 + }, + { + "epoch": 1.824872013651877, + "grad_norm": 0.43143807384260835, + "learning_rate": 0.0001477421396854095, + "loss": 0.9394, + "step": 8555 + }, + { + "epoch": 1.825085324232082, + "grad_norm": 0.4468162637346623, + "learning_rate": 0.00014773761254572367, + "loss": 0.953, + "step": 8556 + }, + { + "epoch": 1.8252986348122868, + "grad_norm": 0.5580621099406757, + "learning_rate": 0.00014773308463958559, + "loss": 0.9477, + "step": 8557 + }, + { + "epoch": 1.8255119453924915, + "grad_norm": 0.6036605083564448, + "learning_rate": 0.00014772855596704654, + "loss": 0.9545, + "step": 8558 + }, + { + "epoch": 1.8257252559726962, + "grad_norm": 0.8666439214861882, + "learning_rate": 0.00014772402652815772, + "loss": 0.9717, + "step": 8559 + }, + { + "epoch": 1.8259385665529009, + "grad_norm": 1.1463738553496612, + "learning_rate": 0.0001477194963229704, + "loss": 0.9467, + "step": 8560 + }, + { + "epoch": 1.8261518771331058, + "grad_norm": 0.7960228582337213, + "learning_rate": 0.00014771496535153587, + "loss": 0.9517, + "step": 8561 + }, + { + "epoch": 1.8263651877133107, + "grad_norm": 0.785191423639414, + "learning_rate": 0.00014771043361390534, + "loss": 0.9358, + "step": 8562 + }, + { + "epoch": 1.8265784982935154, + "grad_norm": 0.7983591411868363, + "learning_rate": 0.00014770590111013013, + "loss": 0.9552, + "step": 8563 + }, + { + "epoch": 1.82679180887372, + "grad_norm": 0.650567922710526, + "learning_rate": 0.00014770136784026148, + "loss": 0.9904, + "step": 8564 + }, + { + "epoch": 1.827005119453925, + "grad_norm": 0.7653058213448798, + "learning_rate": 0.00014769683380435074, + "loss": 0.9627, + "step": 8565 + }, + { + "epoch": 1.8272184300341296, + "grad_norm": 1.1013519844661372, + "learning_rate": 0.00014769229900244918, + "loss": 0.9779, + "step": 8566 + }, + { + "epoch": 1.8274317406143346, + "grad_norm": 1.0675697152823076, + "learning_rate": 0.00014768776343460812, + "loss": 0.9409, + "step": 8567 + }, + { + "epoch": 1.8276450511945392, + "grad_norm": 0.7126991516982324, + "learning_rate": 0.0001476832271008789, + "loss": 0.9383, + "step": 8568 + }, + { + "epoch": 1.827858361774744, + "grad_norm": 0.5964441362981601, + "learning_rate": 0.00014767869000131278, + "loss": 0.9478, + "step": 8569 + }, + { + "epoch": 1.8280716723549488, + "grad_norm": 1.02826094816066, + "learning_rate": 0.00014767415213596117, + "loss": 0.9126, + "step": 8570 + }, + { + "epoch": 1.8282849829351537, + "grad_norm": 1.0735111113781663, + "learning_rate": 0.00014766961350487539, + "loss": 0.9253, + "step": 8571 + }, + { + "epoch": 1.8284982935153584, + "grad_norm": 0.624837504174127, + "learning_rate": 0.0001476650741081068, + "loss": 0.9287, + "step": 8572 + }, + { + "epoch": 1.828711604095563, + "grad_norm": 0.6260803060974162, + "learning_rate": 0.00014766053394570675, + "loss": 0.9445, + "step": 8573 + }, + { + "epoch": 1.8289249146757678, + "grad_norm": 0.699441424530379, + "learning_rate": 0.00014765599301772662, + "loss": 0.9475, + "step": 8574 + }, + { + "epoch": 1.8291382252559727, + "grad_norm": 0.6700893875234125, + "learning_rate": 0.0001476514513242178, + "loss": 0.9615, + "step": 8575 + }, + { + "epoch": 1.8293515358361776, + "grad_norm": 0.6401832920778816, + "learning_rate": 0.00014764690886523167, + "loss": 0.9816, + "step": 8576 + }, + { + "epoch": 1.8295648464163823, + "grad_norm": 0.5306871221169716, + "learning_rate": 0.0001476423656408196, + "loss": 0.9626, + "step": 8577 + }, + { + "epoch": 1.829778156996587, + "grad_norm": 0.502052958736876, + "learning_rate": 0.00014763782165103304, + "loss": 0.9656, + "step": 8578 + }, + { + "epoch": 1.8299914675767917, + "grad_norm": 0.47904525615738697, + "learning_rate": 0.00014763327689592341, + "loss": 0.9629, + "step": 8579 + }, + { + "epoch": 1.8302047781569966, + "grad_norm": 0.5485810922706608, + "learning_rate": 0.0001476287313755421, + "loss": 0.9641, + "step": 8580 + }, + { + "epoch": 1.8304180887372015, + "grad_norm": 0.5988996682832054, + "learning_rate": 0.00014762418508994056, + "loss": 0.9403, + "step": 8581 + }, + { + "epoch": 1.8306313993174061, + "grad_norm": 0.54664504917122, + "learning_rate": 0.00014761963803917023, + "loss": 0.9896, + "step": 8582 + }, + { + "epoch": 1.8308447098976108, + "grad_norm": 0.3208182507312175, + "learning_rate": 0.00014761509022328257, + "loss": 0.9369, + "step": 8583 + }, + { + "epoch": 1.8310580204778157, + "grad_norm": 0.31242922760835, + "learning_rate": 0.000147610541642329, + "loss": 0.9489, + "step": 8584 + }, + { + "epoch": 1.8312713310580204, + "grad_norm": 0.3271277453798923, + "learning_rate": 0.00014760599229636103, + "loss": 0.9491, + "step": 8585 + }, + { + "epoch": 1.8314846416382253, + "grad_norm": 0.3065977302073105, + "learning_rate": 0.0001476014421854301, + "loss": 0.9461, + "step": 8586 + }, + { + "epoch": 1.83169795221843, + "grad_norm": 0.30344678665687, + "learning_rate": 0.00014759689130958774, + "loss": 0.9414, + "step": 8587 + }, + { + "epoch": 1.8319112627986347, + "grad_norm": 0.26887295159451996, + "learning_rate": 0.0001475923396688854, + "loss": 0.9324, + "step": 8588 + }, + { + "epoch": 1.8321245733788396, + "grad_norm": 0.2748007821691032, + "learning_rate": 0.00014758778726337463, + "loss": 0.9289, + "step": 8589 + }, + { + "epoch": 1.8323378839590445, + "grad_norm": 0.2444991227498364, + "learning_rate": 0.0001475832340931069, + "loss": 0.9982, + "step": 8590 + }, + { + "epoch": 1.8325511945392492, + "grad_norm": 0.2659999223140863, + "learning_rate": 0.0001475786801581337, + "loss": 0.9334, + "step": 8591 + }, + { + "epoch": 1.8327645051194539, + "grad_norm": 0.2691049453436269, + "learning_rate": 0.00014757412545850663, + "loss": 0.961, + "step": 8592 + }, + { + "epoch": 1.8329778156996586, + "grad_norm": 0.2681422266136683, + "learning_rate": 0.00014756956999427718, + "loss": 0.9682, + "step": 8593 + }, + { + "epoch": 1.8331911262798635, + "grad_norm": 0.2810857752820678, + "learning_rate": 0.00014756501376549694, + "loss": 0.9362, + "step": 8594 + }, + { + "epoch": 1.8334044368600684, + "grad_norm": 0.2785830799014095, + "learning_rate": 0.00014756045677221738, + "loss": 0.932, + "step": 8595 + }, + { + "epoch": 1.833617747440273, + "grad_norm": 0.3163743444624691, + "learning_rate": 0.00014755589901449016, + "loss": 0.9398, + "step": 8596 + }, + { + "epoch": 1.8338310580204777, + "grad_norm": 0.2904239857617599, + "learning_rate": 0.0001475513404923668, + "loss": 0.9669, + "step": 8597 + }, + { + "epoch": 1.8340443686006824, + "grad_norm": 0.2753879016759947, + "learning_rate": 0.00014754678120589888, + "loss": 0.9498, + "step": 8598 + }, + { + "epoch": 1.8342576791808873, + "grad_norm": 0.286385720726345, + "learning_rate": 0.000147542221155138, + "loss": 0.936, + "step": 8599 + }, + { + "epoch": 1.8344709897610922, + "grad_norm": 0.2811968467694794, + "learning_rate": 0.00014753766034013576, + "loss": 0.9721, + "step": 8600 + }, + { + "epoch": 1.834684300341297, + "grad_norm": 0.24491254727854098, + "learning_rate": 0.00014753309876094372, + "loss": 0.9758, + "step": 8601 + }, + { + "epoch": 1.8348976109215016, + "grad_norm": 0.270139381671404, + "learning_rate": 0.00014752853641761355, + "loss": 0.9405, + "step": 8602 + }, + { + "epoch": 1.8351109215017065, + "grad_norm": 0.2678332919642154, + "learning_rate": 0.00014752397331019686, + "loss": 0.9407, + "step": 8603 + }, + { + "epoch": 1.8353242320819114, + "grad_norm": 0.31676372267988095, + "learning_rate": 0.00014751940943874529, + "loss": 0.9609, + "step": 8604 + }, + { + "epoch": 1.835537542662116, + "grad_norm": 0.3130811444740605, + "learning_rate": 0.00014751484480331043, + "loss": 0.9566, + "step": 8605 + }, + { + "epoch": 1.8357508532423208, + "grad_norm": 0.292181083264083, + "learning_rate": 0.000147510279403944, + "loss": 0.9663, + "step": 8606 + }, + { + "epoch": 1.8359641638225255, + "grad_norm": 0.29922587304043236, + "learning_rate": 0.0001475057132406976, + "loss": 0.9373, + "step": 8607 + }, + { + "epoch": 1.8361774744027304, + "grad_norm": 0.26751624019487674, + "learning_rate": 0.00014750114631362292, + "loss": 0.944, + "step": 8608 + }, + { + "epoch": 1.8363907849829353, + "grad_norm": 0.25336260060893534, + "learning_rate": 0.00014749657862277166, + "loss": 0.9524, + "step": 8609 + }, + { + "epoch": 1.83660409556314, + "grad_norm": 0.2802677569932735, + "learning_rate": 0.00014749201016819546, + "loss": 0.9848, + "step": 8610 + }, + { + "epoch": 1.8368174061433447, + "grad_norm": 0.3532707104030786, + "learning_rate": 0.000147487440949946, + "loss": 0.9537, + "step": 8611 + }, + { + "epoch": 1.8370307167235493, + "grad_norm": 0.36856159722850124, + "learning_rate": 0.00014748287096807505, + "loss": 0.9508, + "step": 8612 + }, + { + "epoch": 1.8372440273037542, + "grad_norm": 0.2998234892490258, + "learning_rate": 0.00014747830022263428, + "loss": 0.9297, + "step": 8613 + }, + { + "epoch": 1.8374573378839592, + "grad_norm": 0.26652396319448957, + "learning_rate": 0.0001474737287136754, + "loss": 0.9494, + "step": 8614 + }, + { + "epoch": 1.8376706484641638, + "grad_norm": 0.2557057962269324, + "learning_rate": 0.00014746915644125013, + "loss": 0.9445, + "step": 8615 + }, + { + "epoch": 1.8378839590443685, + "grad_norm": 0.26817697098555265, + "learning_rate": 0.00014746458340541025, + "loss": 0.9317, + "step": 8616 + }, + { + "epoch": 1.8380972696245734, + "grad_norm": 0.26832531117871333, + "learning_rate": 0.00014746000960620746, + "loss": 0.9542, + "step": 8617 + }, + { + "epoch": 1.838310580204778, + "grad_norm": 0.2882282595905064, + "learning_rate": 0.00014745543504369354, + "loss": 0.9703, + "step": 8618 + }, + { + "epoch": 1.838523890784983, + "grad_norm": 0.2952647561971466, + "learning_rate": 0.0001474508597179202, + "loss": 0.9422, + "step": 8619 + }, + { + "epoch": 1.8387372013651877, + "grad_norm": 0.28496094222943896, + "learning_rate": 0.00014744628362893926, + "loss": 0.9719, + "step": 8620 + }, + { + "epoch": 1.8389505119453924, + "grad_norm": 0.26388458510289153, + "learning_rate": 0.0001474417067768025, + "loss": 0.9517, + "step": 8621 + }, + { + "epoch": 1.8391638225255973, + "grad_norm": 0.28036439769181404, + "learning_rate": 0.0001474371291615617, + "loss": 0.9637, + "step": 8622 + }, + { + "epoch": 1.8393771331058022, + "grad_norm": 0.2761407338445416, + "learning_rate": 0.00014743255078326862, + "loss": 0.9466, + "step": 8623 + }, + { + "epoch": 1.8395904436860069, + "grad_norm": 0.3084481663980237, + "learning_rate": 0.00014742797164197513, + "loss": 0.9699, + "step": 8624 + }, + { + "epoch": 1.8398037542662116, + "grad_norm": 0.31500433601484695, + "learning_rate": 0.00014742339173773298, + "loss": 0.9748, + "step": 8625 + }, + { + "epoch": 1.8400170648464163, + "grad_norm": 0.27258528977864066, + "learning_rate": 0.00014741881107059403, + "loss": 0.9498, + "step": 8626 + }, + { + "epoch": 1.8402303754266212, + "grad_norm": 0.2902802640466977, + "learning_rate": 0.0001474142296406101, + "loss": 0.9626, + "step": 8627 + }, + { + "epoch": 1.840443686006826, + "grad_norm": 0.37653502565463387, + "learning_rate": 0.00014740964744783304, + "loss": 0.9555, + "step": 8628 + }, + { + "epoch": 1.8406569965870307, + "grad_norm": 0.2591927485155319, + "learning_rate": 0.00014740506449231465, + "loss": 0.9289, + "step": 8629 + }, + { + "epoch": 1.8408703071672354, + "grad_norm": 0.2686927705535513, + "learning_rate": 0.00014740048077410684, + "loss": 0.9337, + "step": 8630 + }, + { + "epoch": 1.8410836177474401, + "grad_norm": 0.34680684941467965, + "learning_rate": 0.00014739589629326146, + "loss": 0.9688, + "step": 8631 + }, + { + "epoch": 1.841296928327645, + "grad_norm": 0.30091776542802634, + "learning_rate": 0.00014739131104983034, + "loss": 0.9585, + "step": 8632 + }, + { + "epoch": 1.84151023890785, + "grad_norm": 0.34147399701705733, + "learning_rate": 0.00014738672504386547, + "loss": 0.9711, + "step": 8633 + }, + { + "epoch": 1.8417235494880546, + "grad_norm": 0.327009184363625, + "learning_rate": 0.00014738213827541864, + "loss": 0.9606, + "step": 8634 + }, + { + "epoch": 1.8419368600682593, + "grad_norm": 0.2505668114693641, + "learning_rate": 0.0001473775507445418, + "loss": 0.9219, + "step": 8635 + }, + { + "epoch": 1.8421501706484642, + "grad_norm": 0.26298559718439596, + "learning_rate": 0.0001473729624512868, + "loss": 0.9328, + "step": 8636 + }, + { + "epoch": 1.8423634812286689, + "grad_norm": 0.3454461319940742, + "learning_rate": 0.00014736837339570562, + "loss": 0.9174, + "step": 8637 + }, + { + "epoch": 1.8425767918088738, + "grad_norm": 0.34705187813078586, + "learning_rate": 0.00014736378357785017, + "loss": 0.9618, + "step": 8638 + }, + { + "epoch": 1.8427901023890785, + "grad_norm": 0.29263621975681525, + "learning_rate": 0.00014735919299777237, + "loss": 0.9533, + "step": 8639 + }, + { + "epoch": 1.8430034129692832, + "grad_norm": 0.28749439361106993, + "learning_rate": 0.00014735460165552416, + "loss": 0.9077, + "step": 8640 + }, + { + "epoch": 1.843216723549488, + "grad_norm": 0.23983796859823292, + "learning_rate": 0.00014735000955115748, + "loss": 0.9682, + "step": 8641 + }, + { + "epoch": 1.843430034129693, + "grad_norm": 0.23716811434068283, + "learning_rate": 0.00014734541668472436, + "loss": 0.9608, + "step": 8642 + }, + { + "epoch": 1.8436433447098977, + "grad_norm": 0.23103496741722687, + "learning_rate": 0.00014734082305627668, + "loss": 0.9245, + "step": 8643 + }, + { + "epoch": 1.8438566552901023, + "grad_norm": 0.2839945592757074, + "learning_rate": 0.0001473362286658665, + "loss": 0.9408, + "step": 8644 + }, + { + "epoch": 1.844069965870307, + "grad_norm": 0.574859728799475, + "learning_rate": 0.00014733163351354572, + "loss": 0.9599, + "step": 8645 + }, + { + "epoch": 1.844283276450512, + "grad_norm": 0.3123716319471012, + "learning_rate": 0.00014732703759936635, + "loss": 0.9702, + "step": 8646 + }, + { + "epoch": 1.8444965870307168, + "grad_norm": 0.2721965819017665, + "learning_rate": 0.00014732244092338046, + "loss": 0.9531, + "step": 8647 + }, + { + "epoch": 1.8447098976109215, + "grad_norm": 0.2810266571588126, + "learning_rate": 0.00014731784348564, + "loss": 0.9379, + "step": 8648 + }, + { + "epoch": 1.8449232081911262, + "grad_norm": 0.3359251009011097, + "learning_rate": 0.000147313245286197, + "loss": 0.933, + "step": 8649 + }, + { + "epoch": 1.8451365187713311, + "grad_norm": 0.3670858753097863, + "learning_rate": 0.0001473086463251035, + "loss": 0.9595, + "step": 8650 + }, + { + "epoch": 1.8453498293515358, + "grad_norm": 0.4569763973548173, + "learning_rate": 0.00014730404660241158, + "loss": 0.9311, + "step": 8651 + }, + { + "epoch": 1.8455631399317407, + "grad_norm": 0.4945452849099186, + "learning_rate": 0.0001472994461181732, + "loss": 0.9747, + "step": 8652 + }, + { + "epoch": 1.8457764505119454, + "grad_norm": 0.5332084382280233, + "learning_rate": 0.00014729484487244044, + "loss": 0.9404, + "step": 8653 + }, + { + "epoch": 1.84598976109215, + "grad_norm": 0.6097489301778339, + "learning_rate": 0.00014729024286526542, + "loss": 0.953, + "step": 8654 + }, + { + "epoch": 1.846203071672355, + "grad_norm": 0.6955919511370868, + "learning_rate": 0.00014728564009670013, + "loss": 0.9278, + "step": 8655 + }, + { + "epoch": 1.8464163822525599, + "grad_norm": 0.6362914284886586, + "learning_rate": 0.00014728103656679674, + "loss": 0.9574, + "step": 8656 + }, + { + "epoch": 1.8466296928327646, + "grad_norm": 0.45748152532205544, + "learning_rate": 0.00014727643227560727, + "loss": 0.9802, + "step": 8657 + }, + { + "epoch": 1.8468430034129693, + "grad_norm": 0.2449263053636451, + "learning_rate": 0.0001472718272231838, + "loss": 0.9536, + "step": 8658 + }, + { + "epoch": 1.847056313993174, + "grad_norm": 0.30962590369765663, + "learning_rate": 0.00014726722140957852, + "loss": 0.9389, + "step": 8659 + }, + { + "epoch": 1.8472696245733788, + "grad_norm": 0.40360921164144203, + "learning_rate": 0.0001472626148348435, + "loss": 0.961, + "step": 8660 + }, + { + "epoch": 1.8474829351535837, + "grad_norm": 0.44936224844272765, + "learning_rate": 0.00014725800749903082, + "loss": 0.9249, + "step": 8661 + }, + { + "epoch": 1.8476962457337884, + "grad_norm": 0.5234947953158693, + "learning_rate": 0.00014725339940219268, + "loss": 0.957, + "step": 8662 + }, + { + "epoch": 1.8479095563139931, + "grad_norm": 0.5562839909669781, + "learning_rate": 0.00014724879054438117, + "loss": 0.9511, + "step": 8663 + }, + { + "epoch": 1.8481228668941978, + "grad_norm": 0.537091973480931, + "learning_rate": 0.0001472441809256485, + "loss": 0.9581, + "step": 8664 + }, + { + "epoch": 1.8483361774744027, + "grad_norm": 0.46986126883401386, + "learning_rate": 0.00014723957054604676, + "loss": 0.9659, + "step": 8665 + }, + { + "epoch": 1.8485494880546076, + "grad_norm": 0.3750202957657139, + "learning_rate": 0.00014723495940562818, + "loss": 0.9568, + "step": 8666 + }, + { + "epoch": 1.8487627986348123, + "grad_norm": 0.27206033495431153, + "learning_rate": 0.00014723034750444488, + "loss": 0.947, + "step": 8667 + }, + { + "epoch": 1.848976109215017, + "grad_norm": 0.2761394586183396, + "learning_rate": 0.00014722573484254908, + "loss": 0.9159, + "step": 8668 + }, + { + "epoch": 1.849189419795222, + "grad_norm": 0.3967066061663519, + "learning_rate": 0.00014722112141999293, + "loss": 0.9246, + "step": 8669 + }, + { + "epoch": 1.8494027303754266, + "grad_norm": 0.5232002539379021, + "learning_rate": 0.00014721650723682872, + "loss": 0.9493, + "step": 8670 + }, + { + "epoch": 1.8496160409556315, + "grad_norm": 0.5228049592163442, + "learning_rate": 0.00014721189229310855, + "loss": 0.9327, + "step": 8671 + }, + { + "epoch": 1.8498293515358362, + "grad_norm": 0.3987747529610989, + "learning_rate": 0.0001472072765888847, + "loss": 0.9746, + "step": 8672 + }, + { + "epoch": 1.8500426621160408, + "grad_norm": 0.6303076924311373, + "learning_rate": 0.0001472026601242094, + "loss": 0.9483, + "step": 8673 + }, + { + "epoch": 1.8502559726962458, + "grad_norm": 0.30425922410802886, + "learning_rate": 0.00014719804289913484, + "loss": 0.9397, + "step": 8674 + }, + { + "epoch": 1.8504692832764507, + "grad_norm": 0.3223959984703239, + "learning_rate": 0.00014719342491371332, + "loss": 0.9617, + "step": 8675 + }, + { + "epoch": 1.8506825938566553, + "grad_norm": 0.3821072868657933, + "learning_rate": 0.00014718880616799705, + "loss": 0.945, + "step": 8676 + }, + { + "epoch": 1.85089590443686, + "grad_norm": 0.41843546157220163, + "learning_rate": 0.00014718418666203833, + "loss": 0.9386, + "step": 8677 + }, + { + "epoch": 1.8511092150170647, + "grad_norm": 0.4134564891799732, + "learning_rate": 0.00014717956639588939, + "loss": 0.9423, + "step": 8678 + }, + { + "epoch": 1.8513225255972696, + "grad_norm": 0.3634051232328153, + "learning_rate": 0.00014717494536960253, + "loss": 0.9893, + "step": 8679 + }, + { + "epoch": 1.8515358361774745, + "grad_norm": 0.28679338178341324, + "learning_rate": 0.00014717032358323005, + "loss": 0.947, + "step": 8680 + }, + { + "epoch": 1.8517491467576792, + "grad_norm": 0.25963049847808073, + "learning_rate": 0.0001471657010368242, + "loss": 0.9158, + "step": 8681 + }, + { + "epoch": 1.851962457337884, + "grad_norm": 0.22806659162091975, + "learning_rate": 0.00014716107773043733, + "loss": 0.9664, + "step": 8682 + }, + { + "epoch": 1.8521757679180886, + "grad_norm": 0.3360955217865968, + "learning_rate": 0.00014715645366412174, + "loss": 0.9611, + "step": 8683 + }, + { + "epoch": 1.8523890784982935, + "grad_norm": 0.3893081287833389, + "learning_rate": 0.0001471518288379297, + "loss": 0.9577, + "step": 8684 + }, + { + "epoch": 1.8526023890784984, + "grad_norm": 0.37007018792789836, + "learning_rate": 0.00014714720325191365, + "loss": 0.9741, + "step": 8685 + }, + { + "epoch": 1.852815699658703, + "grad_norm": 0.36082591590604546, + "learning_rate": 0.00014714257690612585, + "loss": 0.9745, + "step": 8686 + }, + { + "epoch": 1.8530290102389078, + "grad_norm": 0.36253939504047616, + "learning_rate": 0.00014713794980061865, + "loss": 0.9369, + "step": 8687 + }, + { + "epoch": 1.8532423208191127, + "grad_norm": 0.3800412168306422, + "learning_rate": 0.00014713332193544442, + "loss": 0.94, + "step": 8688 + }, + { + "epoch": 1.8534556313993176, + "grad_norm": 0.2907870464179193, + "learning_rate": 0.00014712869331065553, + "loss": 0.9194, + "step": 8689 + }, + { + "epoch": 1.8536689419795223, + "grad_norm": 0.2810495304162698, + "learning_rate": 0.00014712406392630435, + "loss": 0.9463, + "step": 8690 + }, + { + "epoch": 1.853882252559727, + "grad_norm": 0.3336069707917003, + "learning_rate": 0.00014711943378244323, + "loss": 0.9487, + "step": 8691 + }, + { + "epoch": 1.8540955631399316, + "grad_norm": 0.4251123362863547, + "learning_rate": 0.00014711480287912462, + "loss": 0.9731, + "step": 8692 + }, + { + "epoch": 1.8543088737201365, + "grad_norm": 0.4349745458758703, + "learning_rate": 0.00014711017121640089, + "loss": 0.936, + "step": 8693 + }, + { + "epoch": 1.8545221843003414, + "grad_norm": 0.44577858835373474, + "learning_rate": 0.0001471055387943244, + "loss": 0.9509, + "step": 8694 + }, + { + "epoch": 1.8547354948805461, + "grad_norm": 0.3806230429646489, + "learning_rate": 0.00014710090561294764, + "loss": 0.9654, + "step": 8695 + }, + { + "epoch": 1.8549488054607508, + "grad_norm": 0.32551454406451014, + "learning_rate": 0.00014709627167232303, + "loss": 0.9466, + "step": 8696 + }, + { + "epoch": 1.8551621160409555, + "grad_norm": 0.3694786273871787, + "learning_rate": 0.00014709163697250295, + "loss": 0.9561, + "step": 8697 + }, + { + "epoch": 1.8553754266211604, + "grad_norm": 0.31296806090607304, + "learning_rate": 0.00014708700151353985, + "loss": 0.9385, + "step": 8698 + }, + { + "epoch": 1.8555887372013653, + "grad_norm": 0.27275684294815095, + "learning_rate": 0.00014708236529548623, + "loss": 0.9694, + "step": 8699 + }, + { + "epoch": 1.85580204778157, + "grad_norm": 0.26853485485287965, + "learning_rate": 0.00014707772831839451, + "loss": 0.9544, + "step": 8700 + }, + { + "epoch": 1.8560153583617747, + "grad_norm": 0.34924917791945154, + "learning_rate": 0.00014707309058231715, + "loss": 0.9501, + "step": 8701 + }, + { + "epoch": 1.8562286689419796, + "grad_norm": 0.4084513331798661, + "learning_rate": 0.00014706845208730665, + "loss": 0.9362, + "step": 8702 + }, + { + "epoch": 1.8564419795221843, + "grad_norm": 0.4320537908924193, + "learning_rate": 0.0001470638128334155, + "loss": 0.9418, + "step": 8703 + }, + { + "epoch": 1.8566552901023892, + "grad_norm": 0.47575794576625763, + "learning_rate": 0.00014705917282069618, + "loss": 0.9441, + "step": 8704 + }, + { + "epoch": 1.8568686006825939, + "grad_norm": 0.5409199578251357, + "learning_rate": 0.00014705453204920117, + "loss": 0.9183, + "step": 8705 + }, + { + "epoch": 1.8570819112627985, + "grad_norm": 0.47183333959801954, + "learning_rate": 0.00014704989051898303, + "loss": 0.9577, + "step": 8706 + }, + { + "epoch": 1.8572952218430034, + "grad_norm": 0.34071800447537254, + "learning_rate": 0.00014704524823009422, + "loss": 0.9395, + "step": 8707 + }, + { + "epoch": 1.8575085324232083, + "grad_norm": 0.2867929661600526, + "learning_rate": 0.0001470406051825873, + "loss": 0.9831, + "step": 8708 + }, + { + "epoch": 1.857721843003413, + "grad_norm": 0.32304839468451024, + "learning_rate": 0.00014703596137651483, + "loss": 0.9839, + "step": 8709 + }, + { + "epoch": 1.8579351535836177, + "grad_norm": 0.3065043457069851, + "learning_rate": 0.00014703131681192933, + "loss": 0.9629, + "step": 8710 + }, + { + "epoch": 1.8581484641638224, + "grad_norm": 0.3251747665240783, + "learning_rate": 0.00014702667148888335, + "loss": 0.9516, + "step": 8711 + }, + { + "epoch": 1.8583617747440273, + "grad_norm": 0.39965075108164844, + "learning_rate": 0.00014702202540742943, + "loss": 0.9805, + "step": 8712 + }, + { + "epoch": 1.8585750853242322, + "grad_norm": 0.5359688104194182, + "learning_rate": 0.0001470173785676202, + "loss": 0.9916, + "step": 8713 + }, + { + "epoch": 1.858788395904437, + "grad_norm": 0.5935498949331777, + "learning_rate": 0.00014701273096950817, + "loss": 0.9303, + "step": 8714 + }, + { + "epoch": 1.8590017064846416, + "grad_norm": 0.5393557952804411, + "learning_rate": 0.000147008082613146, + "loss": 0.9891, + "step": 8715 + }, + { + "epoch": 1.8592150170648463, + "grad_norm": 0.49110170538211717, + "learning_rate": 0.0001470034334985862, + "loss": 0.9127, + "step": 8716 + }, + { + "epoch": 1.8594283276450512, + "grad_norm": 0.39576694168045506, + "learning_rate": 0.00014699878362588146, + "loss": 0.9271, + "step": 8717 + }, + { + "epoch": 1.859641638225256, + "grad_norm": 0.3377477359390118, + "learning_rate": 0.00014699413299508433, + "loss": 0.9238, + "step": 8718 + }, + { + "epoch": 1.8598549488054608, + "grad_norm": 0.3531861016560255, + "learning_rate": 0.00014698948160624746, + "loss": 0.941, + "step": 8719 + }, + { + "epoch": 1.8600682593856654, + "grad_norm": 0.331355613545619, + "learning_rate": 0.0001469848294594235, + "loss": 0.9576, + "step": 8720 + }, + { + "epoch": 1.8602815699658704, + "grad_norm": 0.28571231449850004, + "learning_rate": 0.00014698017655466507, + "loss": 0.9505, + "step": 8721 + }, + { + "epoch": 1.860494880546075, + "grad_norm": 0.291982720320417, + "learning_rate": 0.0001469755228920248, + "loss": 0.9842, + "step": 8722 + }, + { + "epoch": 1.86070819112628, + "grad_norm": 0.35195287434952455, + "learning_rate": 0.00014697086847155537, + "loss": 0.9873, + "step": 8723 + }, + { + "epoch": 1.8609215017064846, + "grad_norm": 0.3743858895306479, + "learning_rate": 0.00014696621329330942, + "loss": 0.9565, + "step": 8724 + }, + { + "epoch": 1.8611348122866893, + "grad_norm": 0.3727659834118674, + "learning_rate": 0.00014696155735733967, + "loss": 0.9762, + "step": 8725 + }, + { + "epoch": 1.8613481228668942, + "grad_norm": 0.3207446112480543, + "learning_rate": 0.00014695690066369874, + "loss": 0.9331, + "step": 8726 + }, + { + "epoch": 1.8615614334470991, + "grad_norm": 0.32984907614518055, + "learning_rate": 0.0001469522432124394, + "loss": 0.9597, + "step": 8727 + }, + { + "epoch": 1.8617747440273038, + "grad_norm": 0.31929139812758844, + "learning_rate": 0.00014694758500361424, + "loss": 0.9405, + "step": 8728 + }, + { + "epoch": 1.8619880546075085, + "grad_norm": 0.31667143794557456, + "learning_rate": 0.00014694292603727608, + "loss": 0.9276, + "step": 8729 + }, + { + "epoch": 1.8622013651877132, + "grad_norm": 0.3952818294278445, + "learning_rate": 0.00014693826631347756, + "loss": 0.9466, + "step": 8730 + }, + { + "epoch": 1.862414675767918, + "grad_norm": 0.43860044884645083, + "learning_rate": 0.00014693360583227145, + "loss": 0.9565, + "step": 8731 + }, + { + "epoch": 1.862627986348123, + "grad_norm": 0.44541491153967155, + "learning_rate": 0.00014692894459371043, + "loss": 0.9349, + "step": 8732 + }, + { + "epoch": 1.8628412969283277, + "grad_norm": 0.493376668066765, + "learning_rate": 0.00014692428259784732, + "loss": 0.9238, + "step": 8733 + }, + { + "epoch": 1.8630546075085324, + "grad_norm": 0.5234824282524502, + "learning_rate": 0.00014691961984473478, + "loss": 0.9242, + "step": 8734 + }, + { + "epoch": 1.863267918088737, + "grad_norm": 0.560310822982917, + "learning_rate": 0.00014691495633442565, + "loss": 0.9511, + "step": 8735 + }, + { + "epoch": 1.863481228668942, + "grad_norm": 0.5758875843208501, + "learning_rate": 0.00014691029206697267, + "loss": 0.9423, + "step": 8736 + }, + { + "epoch": 1.8636945392491469, + "grad_norm": 0.5307219680792847, + "learning_rate": 0.00014690562704242858, + "loss": 0.9714, + "step": 8737 + }, + { + "epoch": 1.8639078498293515, + "grad_norm": 0.4665646473547189, + "learning_rate": 0.0001469009612608462, + "loss": 0.9231, + "step": 8738 + }, + { + "epoch": 1.8641211604095562, + "grad_norm": 0.4306287960452614, + "learning_rate": 0.00014689629472227833, + "loss": 0.9677, + "step": 8739 + }, + { + "epoch": 1.8643344709897611, + "grad_norm": 0.41239410193709447, + "learning_rate": 0.00014689162742677778, + "loss": 0.9368, + "step": 8740 + }, + { + "epoch": 1.864547781569966, + "grad_norm": 0.39909617967013783, + "learning_rate": 0.0001468869593743973, + "loss": 0.9674, + "step": 8741 + }, + { + "epoch": 1.8647610921501707, + "grad_norm": 0.265102417098454, + "learning_rate": 0.00014688229056518978, + "loss": 0.9614, + "step": 8742 + }, + { + "epoch": 1.8649744027303754, + "grad_norm": 0.31204977649229976, + "learning_rate": 0.00014687762099920799, + "loss": 0.9585, + "step": 8743 + }, + { + "epoch": 1.86518771331058, + "grad_norm": 0.2816333427101589, + "learning_rate": 0.00014687295067650482, + "loss": 0.9379, + "step": 8744 + }, + { + "epoch": 1.865401023890785, + "grad_norm": 0.2597852875011284, + "learning_rate": 0.00014686827959713307, + "loss": 0.936, + "step": 8745 + }, + { + "epoch": 1.86561433447099, + "grad_norm": 0.27530434626155154, + "learning_rate": 0.00014686360776114562, + "loss": 0.9621, + "step": 8746 + }, + { + "epoch": 1.8658276450511946, + "grad_norm": 0.39059107077524763, + "learning_rate": 0.0001468589351685953, + "loss": 0.9569, + "step": 8747 + }, + { + "epoch": 1.8660409556313993, + "grad_norm": 0.31520500108635047, + "learning_rate": 0.00014685426181953505, + "loss": 0.9347, + "step": 8748 + }, + { + "epoch": 1.866254266211604, + "grad_norm": 1.2049108880944355, + "learning_rate": 0.00014684958771401767, + "loss": 0.987, + "step": 8749 + }, + { + "epoch": 1.8664675767918089, + "grad_norm": 0.5031565264860838, + "learning_rate": 0.00014684491285209608, + "loss": 0.9521, + "step": 8750 + }, + { + "epoch": 1.8666808873720138, + "grad_norm": 0.4473900203161986, + "learning_rate": 0.00014684023723382319, + "loss": 0.9418, + "step": 8751 + }, + { + "epoch": 1.8668941979522184, + "grad_norm": 0.5287252412724555, + "learning_rate": 0.00014683556085925186, + "loss": 0.9439, + "step": 8752 + }, + { + "epoch": 1.8671075085324231, + "grad_norm": 0.607566331423285, + "learning_rate": 0.00014683088372843508, + "loss": 0.9586, + "step": 8753 + }, + { + "epoch": 1.867320819112628, + "grad_norm": 0.9596557507760795, + "learning_rate": 0.0001468262058414257, + "loss": 0.9307, + "step": 8754 + }, + { + "epoch": 1.8675341296928327, + "grad_norm": 1.2628015398178434, + "learning_rate": 0.0001468215271982767, + "loss": 0.9803, + "step": 8755 + }, + { + "epoch": 1.8677474402730376, + "grad_norm": 0.6618807938874056, + "learning_rate": 0.00014681684779904096, + "loss": 0.9698, + "step": 8756 + }, + { + "epoch": 1.8679607508532423, + "grad_norm": 0.7574204048586847, + "learning_rate": 0.0001468121676437715, + "loss": 0.9423, + "step": 8757 + }, + { + "epoch": 1.868174061433447, + "grad_norm": 0.8526660231244941, + "learning_rate": 0.00014680748673252122, + "loss": 0.9527, + "step": 8758 + }, + { + "epoch": 1.868387372013652, + "grad_norm": 0.6699601493409373, + "learning_rate": 0.0001468028050653431, + "loss": 0.9374, + "step": 8759 + }, + { + "epoch": 1.8686006825938568, + "grad_norm": 0.5760043957298464, + "learning_rate": 0.00014679812264229014, + "loss": 0.9521, + "step": 8760 + }, + { + "epoch": 1.8688139931740615, + "grad_norm": 0.7622102602196583, + "learning_rate": 0.00014679343946341527, + "loss": 0.9723, + "step": 8761 + }, + { + "epoch": 1.8690273037542662, + "grad_norm": 0.647465727810195, + "learning_rate": 0.00014678875552877154, + "loss": 0.9769, + "step": 8762 + }, + { + "epoch": 1.8692406143344709, + "grad_norm": 0.4177955169358309, + "learning_rate": 0.0001467840708384119, + "loss": 0.9531, + "step": 8763 + }, + { + "epoch": 1.8694539249146758, + "grad_norm": 0.771816497852884, + "learning_rate": 0.00014677938539238935, + "loss": 0.975, + "step": 8764 + }, + { + "epoch": 1.8696672354948807, + "grad_norm": 0.5323999797209334, + "learning_rate": 0.00014677469919075696, + "loss": 0.9352, + "step": 8765 + }, + { + "epoch": 1.8698805460750854, + "grad_norm": 0.4779179635565438, + "learning_rate": 0.00014677001223356773, + "loss": 0.9828, + "step": 8766 + }, + { + "epoch": 1.87009385665529, + "grad_norm": 0.5565687599678912, + "learning_rate": 0.00014676532452087467, + "loss": 0.9687, + "step": 8767 + }, + { + "epoch": 1.8703071672354947, + "grad_norm": 0.4066790508135344, + "learning_rate": 0.00014676063605273087, + "loss": 0.9541, + "step": 8768 + }, + { + "epoch": 1.8705204778156996, + "grad_norm": 0.4748884824254988, + "learning_rate": 0.0001467559468291893, + "loss": 0.9342, + "step": 8769 + }, + { + "epoch": 1.8707337883959045, + "grad_norm": 0.4448209752518219, + "learning_rate": 0.0001467512568503031, + "loss": 0.95, + "step": 8770 + }, + { + "epoch": 1.8709470989761092, + "grad_norm": 0.45593000028845687, + "learning_rate": 0.0001467465661161253, + "loss": 0.9597, + "step": 8771 + }, + { + "epoch": 1.871160409556314, + "grad_norm": 0.4168833727502431, + "learning_rate": 0.000146741874626709, + "loss": 0.952, + "step": 8772 + }, + { + "epoch": 1.8713737201365188, + "grad_norm": 0.4286067085025568, + "learning_rate": 0.00014673718238210727, + "loss": 0.9137, + "step": 8773 + }, + { + "epoch": 1.8715870307167235, + "grad_norm": 0.39118763345844654, + "learning_rate": 0.00014673248938237317, + "loss": 0.9218, + "step": 8774 + }, + { + "epoch": 1.8718003412969284, + "grad_norm": 0.5087926409091743, + "learning_rate": 0.00014672779562755983, + "loss": 0.9577, + "step": 8775 + }, + { + "epoch": 1.872013651877133, + "grad_norm": 0.4153383180794442, + "learning_rate": 0.00014672310111772037, + "loss": 0.9549, + "step": 8776 + }, + { + "epoch": 1.8722269624573378, + "grad_norm": 0.3862826320296973, + "learning_rate": 0.0001467184058529079, + "loss": 0.9618, + "step": 8777 + }, + { + "epoch": 1.8724402730375427, + "grad_norm": 0.42339652574323267, + "learning_rate": 0.00014671370983317556, + "loss": 0.9746, + "step": 8778 + }, + { + "epoch": 1.8726535836177476, + "grad_norm": 0.3846242464080369, + "learning_rate": 0.00014670901305857646, + "loss": 0.9536, + "step": 8779 + }, + { + "epoch": 1.8728668941979523, + "grad_norm": 0.4267849094829895, + "learning_rate": 0.00014670431552916375, + "loss": 0.9765, + "step": 8780 + }, + { + "epoch": 1.873080204778157, + "grad_norm": 0.41762451295722786, + "learning_rate": 0.00014669961724499058, + "loss": 0.9243, + "step": 8781 + }, + { + "epoch": 1.8732935153583616, + "grad_norm": 0.5548018531343919, + "learning_rate": 0.00014669491820611013, + "loss": 0.9593, + "step": 8782 + }, + { + "epoch": 1.8735068259385665, + "grad_norm": 0.5191755369143769, + "learning_rate": 0.00014669021841257553, + "loss": 0.9444, + "step": 8783 + }, + { + "epoch": 1.8737201365187715, + "grad_norm": 0.3183597064042732, + "learning_rate": 0.00014668551786444005, + "loss": 0.9502, + "step": 8784 + }, + { + "epoch": 1.8739334470989761, + "grad_norm": 0.36284574182300106, + "learning_rate": 0.00014668081656175677, + "loss": 0.9603, + "step": 8785 + }, + { + "epoch": 1.8741467576791808, + "grad_norm": 0.4762150081459189, + "learning_rate": 0.00014667611450457893, + "loss": 0.9362, + "step": 8786 + }, + { + "epoch": 1.8743600682593855, + "grad_norm": 0.5356572971012149, + "learning_rate": 0.00014667141169295974, + "loss": 0.9504, + "step": 8787 + }, + { + "epoch": 1.8745733788395904, + "grad_norm": 0.56055545473834, + "learning_rate": 0.00014666670812695242, + "loss": 0.9399, + "step": 8788 + }, + { + "epoch": 1.8747866894197953, + "grad_norm": 0.47545645912784884, + "learning_rate": 0.00014666200380661015, + "loss": 0.9406, + "step": 8789 + }, + { + "epoch": 1.875, + "grad_norm": 0.3362357552167028, + "learning_rate": 0.00014665729873198619, + "loss": 0.9551, + "step": 8790 + }, + { + "epoch": 1.8752133105802047, + "grad_norm": 0.2651373516024096, + "learning_rate": 0.00014665259290313378, + "loss": 0.9318, + "step": 8791 + }, + { + "epoch": 1.8754266211604096, + "grad_norm": 0.40704384519524833, + "learning_rate": 0.00014664788632010617, + "loss": 0.9458, + "step": 8792 + }, + { + "epoch": 1.8756399317406145, + "grad_norm": 0.5229530645573895, + "learning_rate": 0.0001466431789829566, + "loss": 0.9891, + "step": 8793 + }, + { + "epoch": 1.8758532423208192, + "grad_norm": 0.5553868342321732, + "learning_rate": 0.00014663847089173832, + "loss": 0.963, + "step": 8794 + }, + { + "epoch": 1.8760665529010239, + "grad_norm": 0.5259560889560151, + "learning_rate": 0.00014663376204650464, + "loss": 0.9487, + "step": 8795 + }, + { + "epoch": 1.8762798634812285, + "grad_norm": 0.36391418057171654, + "learning_rate": 0.00014662905244730882, + "loss": 0.9495, + "step": 8796 + }, + { + "epoch": 1.8764931740614335, + "grad_norm": 0.30498514161722207, + "learning_rate": 0.00014662434209420416, + "loss": 0.986, + "step": 8797 + }, + { + "epoch": 1.8767064846416384, + "grad_norm": 0.3123380614730679, + "learning_rate": 0.00014661963098724391, + "loss": 0.9191, + "step": 8798 + }, + { + "epoch": 1.876919795221843, + "grad_norm": 0.3556688448277576, + "learning_rate": 0.00014661491912648146, + "loss": 0.9463, + "step": 8799 + }, + { + "epoch": 1.8771331058020477, + "grad_norm": 0.40548217026110284, + "learning_rate": 0.00014661020651197005, + "loss": 0.9322, + "step": 8800 + }, + { + "epoch": 1.8773464163822524, + "grad_norm": 0.4715858783155408, + "learning_rate": 0.00014660549314376306, + "loss": 0.9712, + "step": 8801 + }, + { + "epoch": 1.8775597269624573, + "grad_norm": 0.5457838584043042, + "learning_rate": 0.00014660077902191378, + "loss": 0.9692, + "step": 8802 + }, + { + "epoch": 1.8777730375426622, + "grad_norm": 0.48100820639796754, + "learning_rate": 0.00014659606414647554, + "loss": 0.9449, + "step": 8803 + }, + { + "epoch": 1.877986348122867, + "grad_norm": 0.2796631646398615, + "learning_rate": 0.00014659134851750176, + "loss": 0.9419, + "step": 8804 + }, + { + "epoch": 1.8781996587030716, + "grad_norm": 0.28193795475950667, + "learning_rate": 0.00014658663213504573, + "loss": 0.948, + "step": 8805 + }, + { + "epoch": 1.8784129692832765, + "grad_norm": 0.3680453608387445, + "learning_rate": 0.00014658191499916082, + "loss": 0.9515, + "step": 8806 + }, + { + "epoch": 1.8786262798634812, + "grad_norm": 0.325972271718442, + "learning_rate": 0.00014657719710990045, + "loss": 0.9461, + "step": 8807 + }, + { + "epoch": 1.878839590443686, + "grad_norm": 0.22622595875415313, + "learning_rate": 0.00014657247846731796, + "loss": 0.9474, + "step": 8808 + }, + { + "epoch": 1.8790529010238908, + "grad_norm": 0.2660458946938751, + "learning_rate": 0.00014656775907146674, + "loss": 0.9455, + "step": 8809 + }, + { + "epoch": 1.8792662116040955, + "grad_norm": 0.34100082973272666, + "learning_rate": 0.00014656303892240024, + "loss": 0.9589, + "step": 8810 + }, + { + "epoch": 1.8794795221843004, + "grad_norm": 0.36846082076072784, + "learning_rate": 0.00014655831802017183, + "loss": 0.9528, + "step": 8811 + }, + { + "epoch": 1.8796928327645053, + "grad_norm": 0.29873282680773205, + "learning_rate": 0.0001465535963648349, + "loss": 0.9627, + "step": 8812 + }, + { + "epoch": 1.87990614334471, + "grad_norm": 0.2598511566791555, + "learning_rate": 0.00014654887395644295, + "loss": 0.9123, + "step": 8813 + }, + { + "epoch": 1.8801194539249146, + "grad_norm": 0.3380236890545683, + "learning_rate": 0.00014654415079504935, + "loss": 0.9331, + "step": 8814 + }, + { + "epoch": 1.8803327645051193, + "grad_norm": 0.4196779299958642, + "learning_rate": 0.00014653942688070757, + "loss": 0.9739, + "step": 8815 + }, + { + "epoch": 1.8805460750853242, + "grad_norm": 0.400062141550742, + "learning_rate": 0.00014653470221347105, + "loss": 0.9573, + "step": 8816 + }, + { + "epoch": 1.8807593856655291, + "grad_norm": 0.2771087183181428, + "learning_rate": 0.00014652997679339327, + "loss": 0.9664, + "step": 8817 + }, + { + "epoch": 1.8809726962457338, + "grad_norm": 0.21695858391248393, + "learning_rate": 0.00014652525062052764, + "loss": 0.8985, + "step": 8818 + }, + { + "epoch": 1.8811860068259385, + "grad_norm": 0.29176263616840936, + "learning_rate": 0.00014652052369492773, + "loss": 0.9547, + "step": 8819 + }, + { + "epoch": 1.8813993174061432, + "grad_norm": 0.29227208136714994, + "learning_rate": 0.00014651579601664694, + "loss": 0.9438, + "step": 8820 + }, + { + "epoch": 1.881612627986348, + "grad_norm": 0.28689048112334775, + "learning_rate": 0.00014651106758573884, + "loss": 0.9679, + "step": 8821 + }, + { + "epoch": 1.881825938566553, + "grad_norm": 0.3140065441307157, + "learning_rate": 0.00014650633840225684, + "loss": 0.9163, + "step": 8822 + }, + { + "epoch": 1.8820392491467577, + "grad_norm": 0.3499952464976039, + "learning_rate": 0.00014650160846625454, + "loss": 0.948, + "step": 8823 + }, + { + "epoch": 1.8822525597269624, + "grad_norm": 0.31585017348881683, + "learning_rate": 0.00014649687777778544, + "loss": 0.9875, + "step": 8824 + }, + { + "epoch": 1.8824658703071673, + "grad_norm": 0.25206709005759015, + "learning_rate": 0.000146492146336903, + "loss": 0.9397, + "step": 8825 + }, + { + "epoch": 1.882679180887372, + "grad_norm": 0.2408163429636456, + "learning_rate": 0.00014648741414366082, + "loss": 0.9544, + "step": 8826 + }, + { + "epoch": 1.8828924914675769, + "grad_norm": 0.2114285578912918, + "learning_rate": 0.00014648268119811246, + "loss": 0.9558, + "step": 8827 + }, + { + "epoch": 1.8831058020477816, + "grad_norm": 0.21169002503598597, + "learning_rate": 0.0001464779475003114, + "loss": 0.9384, + "step": 8828 + }, + { + "epoch": 1.8833191126279862, + "grad_norm": 0.20750700550959042, + "learning_rate": 0.00014647321305031127, + "loss": 0.9625, + "step": 8829 + }, + { + "epoch": 1.8835324232081911, + "grad_norm": 0.23047407214655552, + "learning_rate": 0.00014646847784816563, + "loss": 0.9214, + "step": 8830 + }, + { + "epoch": 1.883745733788396, + "grad_norm": 0.23937019306595755, + "learning_rate": 0.00014646374189392802, + "loss": 0.9436, + "step": 8831 + }, + { + "epoch": 1.8839590443686007, + "grad_norm": 0.26742494720072063, + "learning_rate": 0.00014645900518765205, + "loss": 0.9708, + "step": 8832 + }, + { + "epoch": 1.8841723549488054, + "grad_norm": 0.28585309156647654, + "learning_rate": 0.00014645426772939133, + "loss": 0.972, + "step": 8833 + }, + { + "epoch": 1.88438566552901, + "grad_norm": 0.31655885429223135, + "learning_rate": 0.00014644952951919945, + "loss": 0.9138, + "step": 8834 + }, + { + "epoch": 1.884598976109215, + "grad_norm": 0.30612097682764167, + "learning_rate": 0.00014644479055713004, + "loss": 0.9301, + "step": 8835 + }, + { + "epoch": 1.88481228668942, + "grad_norm": 0.28614902526286184, + "learning_rate": 0.0001464400508432367, + "loss": 0.9134, + "step": 8836 + }, + { + "epoch": 1.8850255972696246, + "grad_norm": 0.3450595768262155, + "learning_rate": 0.00014643531037757305, + "loss": 0.9554, + "step": 8837 + }, + { + "epoch": 1.8852389078498293, + "grad_norm": 0.3464090145015517, + "learning_rate": 0.00014643056916019276, + "loss": 0.9172, + "step": 8838 + }, + { + "epoch": 1.885452218430034, + "grad_norm": 0.33707451460043003, + "learning_rate": 0.00014642582719114945, + "loss": 0.9573, + "step": 8839 + }, + { + "epoch": 1.8856655290102389, + "grad_norm": 0.3106911168644158, + "learning_rate": 0.0001464210844704968, + "loss": 0.9443, + "step": 8840 + }, + { + "epoch": 1.8858788395904438, + "grad_norm": 0.3061174491324635, + "learning_rate": 0.00014641634099828846, + "loss": 0.9541, + "step": 8841 + }, + { + "epoch": 1.8860921501706485, + "grad_norm": 0.2888860296548679, + "learning_rate": 0.00014641159677457812, + "loss": 0.9578, + "step": 8842 + }, + { + "epoch": 1.8863054607508531, + "grad_norm": 0.23784327248668055, + "learning_rate": 0.00014640685179941943, + "loss": 0.9377, + "step": 8843 + }, + { + "epoch": 1.886518771331058, + "grad_norm": 0.23952776594199893, + "learning_rate": 0.0001464021060728661, + "loss": 0.9507, + "step": 8844 + }, + { + "epoch": 1.886732081911263, + "grad_norm": 0.2589953313333531, + "learning_rate": 0.00014639735959497182, + "loss": 0.9542, + "step": 8845 + }, + { + "epoch": 1.8869453924914676, + "grad_norm": 0.2705974591139171, + "learning_rate": 0.00014639261236579028, + "loss": 0.9208, + "step": 8846 + }, + { + "epoch": 1.8871587030716723, + "grad_norm": 0.3058407307668856, + "learning_rate": 0.00014638786438537523, + "loss": 0.9439, + "step": 8847 + }, + { + "epoch": 1.887372013651877, + "grad_norm": 0.9695054857438078, + "learning_rate": 0.0001463831156537804, + "loss": 0.9431, + "step": 8848 + }, + { + "epoch": 1.887585324232082, + "grad_norm": 0.3734284751487409, + "learning_rate": 0.00014637836617105947, + "loss": 0.9482, + "step": 8849 + }, + { + "epoch": 1.8877986348122868, + "grad_norm": 0.4140195381176119, + "learning_rate": 0.0001463736159372662, + "loss": 0.9665, + "step": 8850 + }, + { + "epoch": 1.8880119453924915, + "grad_norm": 0.43657827210380734, + "learning_rate": 0.0001463688649524544, + "loss": 0.9317, + "step": 8851 + }, + { + "epoch": 1.8882252559726962, + "grad_norm": 0.5466621494615878, + "learning_rate": 0.00014636411321667774, + "loss": 0.9543, + "step": 8852 + }, + { + "epoch": 1.8884385665529009, + "grad_norm": 0.6431262358464075, + "learning_rate": 0.00014635936072999003, + "loss": 0.9679, + "step": 8853 + }, + { + "epoch": 1.8886518771331058, + "grad_norm": 0.7723094966668214, + "learning_rate": 0.00014635460749244504, + "loss": 0.9575, + "step": 8854 + }, + { + "epoch": 1.8888651877133107, + "grad_norm": 0.9245934837795478, + "learning_rate": 0.00014634985350409655, + "loss": 0.9768, + "step": 8855 + }, + { + "epoch": 1.8890784982935154, + "grad_norm": 0.9871447818191554, + "learning_rate": 0.00014634509876499836, + "loss": 0.9328, + "step": 8856 + }, + { + "epoch": 1.88929180887372, + "grad_norm": 0.9478466279035765, + "learning_rate": 0.00014634034327520423, + "loss": 0.9492, + "step": 8857 + }, + { + "epoch": 1.889505119453925, + "grad_norm": 0.8373159130334142, + "learning_rate": 0.00014633558703476803, + "loss": 0.9619, + "step": 8858 + }, + { + "epoch": 1.8897184300341296, + "grad_norm": 0.6981296146249559, + "learning_rate": 0.00014633083004374353, + "loss": 0.9354, + "step": 8859 + }, + { + "epoch": 1.8899317406143346, + "grad_norm": 0.48648759075696013, + "learning_rate": 0.0001463260723021846, + "loss": 0.9632, + "step": 8860 + }, + { + "epoch": 1.8901450511945392, + "grad_norm": 0.26871394964033346, + "learning_rate": 0.000146321313810145, + "loss": 0.9401, + "step": 8861 + }, + { + "epoch": 1.890358361774744, + "grad_norm": 0.2980867841359854, + "learning_rate": 0.00014631655456767863, + "loss": 0.955, + "step": 8862 + }, + { + "epoch": 1.8905716723549488, + "grad_norm": 0.5333669732510876, + "learning_rate": 0.00014631179457483937, + "loss": 0.9524, + "step": 8863 + }, + { + "epoch": 1.8907849829351537, + "grad_norm": 0.6329520176459849, + "learning_rate": 0.000146307033831681, + "loss": 0.9214, + "step": 8864 + }, + { + "epoch": 1.8909982935153584, + "grad_norm": 0.5790124150536139, + "learning_rate": 0.00014630227233825742, + "loss": 0.9889, + "step": 8865 + }, + { + "epoch": 1.891211604095563, + "grad_norm": 0.49651859825819994, + "learning_rate": 0.00014629751009462256, + "loss": 0.9822, + "step": 8866 + }, + { + "epoch": 1.8914249146757678, + "grad_norm": 0.4000614510583874, + "learning_rate": 0.00014629274710083022, + "loss": 0.9521, + "step": 8867 + }, + { + "epoch": 1.8916382252559727, + "grad_norm": 0.27429312963845837, + "learning_rate": 0.00014628798335693434, + "loss": 0.9217, + "step": 8868 + }, + { + "epoch": 1.8918515358361776, + "grad_norm": 0.31501231554265485, + "learning_rate": 0.00014628321886298883, + "loss": 0.9537, + "step": 8869 + }, + { + "epoch": 1.8920648464163823, + "grad_norm": 0.2941824694214898, + "learning_rate": 0.00014627845361904756, + "loss": 0.9151, + "step": 8870 + }, + { + "epoch": 1.892278156996587, + "grad_norm": 0.3644722007143224, + "learning_rate": 0.00014627368762516448, + "loss": 0.899, + "step": 8871 + }, + { + "epoch": 1.8924914675767917, + "grad_norm": 0.37608019603662896, + "learning_rate": 0.0001462689208813935, + "loss": 0.9447, + "step": 8872 + }, + { + "epoch": 1.8927047781569966, + "grad_norm": 0.3576525727963435, + "learning_rate": 0.0001462641533877886, + "loss": 0.9378, + "step": 8873 + }, + { + "epoch": 1.8929180887372015, + "grad_norm": 0.4222755489636188, + "learning_rate": 0.00014625938514440366, + "loss": 0.9422, + "step": 8874 + }, + { + "epoch": 1.8931313993174061, + "grad_norm": 0.3453136790433028, + "learning_rate": 0.00014625461615129265, + "loss": 0.9701, + "step": 8875 + }, + { + "epoch": 1.8933447098976108, + "grad_norm": 0.23423836155249467, + "learning_rate": 0.00014624984640850957, + "loss": 0.957, + "step": 8876 + }, + { + "epoch": 1.8935580204778157, + "grad_norm": 0.32711111060205783, + "learning_rate": 0.00014624507591610837, + "loss": 0.9675, + "step": 8877 + }, + { + "epoch": 1.8937713310580204, + "grad_norm": 0.3879648207069148, + "learning_rate": 0.000146240304674143, + "loss": 0.9558, + "step": 8878 + }, + { + "epoch": 1.8939846416382253, + "grad_norm": 0.46444564780064646, + "learning_rate": 0.0001462355326826675, + "loss": 0.9811, + "step": 8879 + }, + { + "epoch": 1.89419795221843, + "grad_norm": 0.4130904121705932, + "learning_rate": 0.00014623075994173577, + "loss": 0.9558, + "step": 8880 + }, + { + "epoch": 1.8944112627986347, + "grad_norm": 0.45288437229174966, + "learning_rate": 0.00014622598645140194, + "loss": 0.9521, + "step": 8881 + }, + { + "epoch": 1.8946245733788396, + "grad_norm": 0.39241504065744937, + "learning_rate": 0.0001462212122117199, + "loss": 0.9656, + "step": 8882 + }, + { + "epoch": 1.8948378839590445, + "grad_norm": 0.3182525257805151, + "learning_rate": 0.00014621643722274378, + "loss": 0.9266, + "step": 8883 + }, + { + "epoch": 1.8950511945392492, + "grad_norm": 0.3188687968163888, + "learning_rate": 0.00014621166148452757, + "loss": 0.9336, + "step": 8884 + }, + { + "epoch": 1.8952645051194539, + "grad_norm": 0.373733863321064, + "learning_rate": 0.00014620688499712527, + "loss": 0.9444, + "step": 8885 + }, + { + "epoch": 1.8954778156996586, + "grad_norm": 0.356511883818089, + "learning_rate": 0.00014620210776059096, + "loss": 0.9536, + "step": 8886 + }, + { + "epoch": 1.8956911262798635, + "grad_norm": 0.346950727514184, + "learning_rate": 0.00014619732977497868, + "loss": 0.9255, + "step": 8887 + }, + { + "epoch": 1.8959044368600684, + "grad_norm": 0.2882464848649854, + "learning_rate": 0.0001461925510403425, + "loss": 0.9698, + "step": 8888 + }, + { + "epoch": 1.896117747440273, + "grad_norm": 0.29061745913604287, + "learning_rate": 0.00014618777155673648, + "loss": 0.9554, + "step": 8889 + }, + { + "epoch": 1.8963310580204777, + "grad_norm": 0.2974868181061248, + "learning_rate": 0.00014618299132421474, + "loss": 0.9198, + "step": 8890 + }, + { + "epoch": 1.8965443686006824, + "grad_norm": 0.31958427754354973, + "learning_rate": 0.00014617821034283132, + "loss": 0.9594, + "step": 8891 + }, + { + "epoch": 1.8967576791808873, + "grad_norm": 0.3457114179773154, + "learning_rate": 0.00014617342861264035, + "loss": 0.9347, + "step": 8892 + }, + { + "epoch": 1.8969709897610922, + "grad_norm": 0.34948873043913536, + "learning_rate": 0.0001461686461336959, + "loss": 0.9742, + "step": 8893 + }, + { + "epoch": 1.897184300341297, + "grad_norm": 0.34379567109576487, + "learning_rate": 0.00014616386290605213, + "loss": 0.9678, + "step": 8894 + }, + { + "epoch": 1.8973976109215016, + "grad_norm": 0.3211984255950611, + "learning_rate": 0.00014615907892976312, + "loss": 0.9509, + "step": 8895 + }, + { + "epoch": 1.8976109215017065, + "grad_norm": 0.3395836562377396, + "learning_rate": 0.00014615429420488304, + "loss": 0.9505, + "step": 8896 + }, + { + "epoch": 1.8978242320819114, + "grad_norm": 0.3019591957128409, + "learning_rate": 0.00014614950873146598, + "loss": 0.9421, + "step": 8897 + }, + { + "epoch": 1.898037542662116, + "grad_norm": 0.25801875190645385, + "learning_rate": 0.00014614472250956613, + "loss": 0.941, + "step": 8898 + }, + { + "epoch": 1.8982508532423208, + "grad_norm": 0.27948548746142576, + "learning_rate": 0.00014613993553923765, + "loss": 0.9259, + "step": 8899 + }, + { + "epoch": 1.8984641638225255, + "grad_norm": 0.25359176714484133, + "learning_rate": 0.00014613514782053466, + "loss": 0.9539, + "step": 8900 + }, + { + "epoch": 1.8986774744027304, + "grad_norm": 0.2386593026669181, + "learning_rate": 0.00014613035935351137, + "loss": 0.9413, + "step": 8901 + }, + { + "epoch": 1.8988907849829353, + "grad_norm": 0.27014224619744176, + "learning_rate": 0.00014612557013822194, + "loss": 0.9609, + "step": 8902 + }, + { + "epoch": 1.89910409556314, + "grad_norm": 0.3057347337519827, + "learning_rate": 0.0001461207801747206, + "loss": 0.9964, + "step": 8903 + }, + { + "epoch": 1.8993174061433447, + "grad_norm": 0.33016187533214064, + "learning_rate": 0.00014611598946306153, + "loss": 0.9282, + "step": 8904 + }, + { + "epoch": 1.8995307167235493, + "grad_norm": 0.31575826709374305, + "learning_rate": 0.00014611119800329888, + "loss": 0.9379, + "step": 8905 + }, + { + "epoch": 1.8997440273037542, + "grad_norm": 0.34384668677651126, + "learning_rate": 0.00014610640579548693, + "loss": 0.9607, + "step": 8906 + }, + { + "epoch": 1.8999573378839592, + "grad_norm": 0.3466347981532484, + "learning_rate": 0.00014610161283967993, + "loss": 0.9777, + "step": 8907 + }, + { + "epoch": 1.9001706484641638, + "grad_norm": 0.2888857136930086, + "learning_rate": 0.000146096819135932, + "loss": 0.9614, + "step": 8908 + }, + { + "epoch": 1.9003839590443685, + "grad_norm": 0.22784850042124036, + "learning_rate": 0.00014609202468429748, + "loss": 0.9679, + "step": 8909 + }, + { + "epoch": 1.9005972696245734, + "grad_norm": 0.28752138643416036, + "learning_rate": 0.0001460872294848306, + "loss": 0.9411, + "step": 8910 + }, + { + "epoch": 1.900810580204778, + "grad_norm": 0.2529103201102539, + "learning_rate": 0.00014608243353758557, + "loss": 0.9362, + "step": 8911 + }, + { + "epoch": 1.901023890784983, + "grad_norm": 0.25399745619582026, + "learning_rate": 0.00014607763684261675, + "loss": 0.9457, + "step": 8912 + }, + { + "epoch": 1.9012372013651877, + "grad_norm": 0.3157644368194135, + "learning_rate": 0.0001460728393999783, + "loss": 0.9682, + "step": 8913 + }, + { + "epoch": 1.9014505119453924, + "grad_norm": 0.29544969608566835, + "learning_rate": 0.0001460680412097246, + "loss": 0.9352, + "step": 8914 + }, + { + "epoch": 1.9016638225255973, + "grad_norm": 0.25484077617759576, + "learning_rate": 0.0001460632422719099, + "loss": 0.9191, + "step": 8915 + }, + { + "epoch": 1.9018771331058022, + "grad_norm": 0.27050641544403137, + "learning_rate": 0.0001460584425865885, + "loss": 0.9728, + "step": 8916 + }, + { + "epoch": 1.9020904436860069, + "grad_norm": 0.24360562635922775, + "learning_rate": 0.00014605364215381468, + "loss": 0.9328, + "step": 8917 + }, + { + "epoch": 1.9023037542662116, + "grad_norm": 0.23780905842260117, + "learning_rate": 0.00014604884097364282, + "loss": 0.9485, + "step": 8918 + }, + { + "epoch": 1.9025170648464163, + "grad_norm": 0.2654963034889477, + "learning_rate": 0.00014604403904612718, + "loss": 0.938, + "step": 8919 + }, + { + "epoch": 1.9027303754266212, + "grad_norm": 0.27426612173908194, + "learning_rate": 0.00014603923637132216, + "loss": 0.919, + "step": 8920 + }, + { + "epoch": 1.902943686006826, + "grad_norm": 0.31880976054519106, + "learning_rate": 0.00014603443294928205, + "loss": 0.9192, + "step": 8921 + }, + { + "epoch": 1.9031569965870307, + "grad_norm": 0.34154003129690114, + "learning_rate": 0.00014602962878006127, + "loss": 0.9559, + "step": 8922 + }, + { + "epoch": 1.9033703071672354, + "grad_norm": 0.3450260220687852, + "learning_rate": 0.00014602482386371406, + "loss": 0.982, + "step": 8923 + }, + { + "epoch": 1.9035836177474401, + "grad_norm": 0.3403353770782334, + "learning_rate": 0.0001460200182002949, + "loss": 0.9777, + "step": 8924 + }, + { + "epoch": 1.903796928327645, + "grad_norm": 0.42324789940898055, + "learning_rate": 0.0001460152117898581, + "loss": 0.9467, + "step": 8925 + }, + { + "epoch": 1.90401023890785, + "grad_norm": 0.3937416620909984, + "learning_rate": 0.0001460104046324581, + "loss": 0.9741, + "step": 8926 + }, + { + "epoch": 1.9042235494880546, + "grad_norm": 0.2773243671974857, + "learning_rate": 0.00014600559672814923, + "loss": 0.9398, + "step": 8927 + }, + { + "epoch": 1.9044368600682593, + "grad_norm": 0.23985074509722454, + "learning_rate": 0.00014600078807698596, + "loss": 0.9129, + "step": 8928 + }, + { + "epoch": 1.9046501706484642, + "grad_norm": 0.2913962388791683, + "learning_rate": 0.00014599597867902262, + "loss": 0.9265, + "step": 8929 + }, + { + "epoch": 1.9048634812286689, + "grad_norm": 0.2861744977764837, + "learning_rate": 0.0001459911685343137, + "loss": 0.958, + "step": 8930 + }, + { + "epoch": 1.9050767918088738, + "grad_norm": 0.3755386883547934, + "learning_rate": 0.00014598635764291357, + "loss": 0.9113, + "step": 8931 + }, + { + "epoch": 1.9052901023890785, + "grad_norm": 0.4513744558616362, + "learning_rate": 0.00014598154600487673, + "loss": 0.9516, + "step": 8932 + }, + { + "epoch": 1.9055034129692832, + "grad_norm": 0.5182611810472663, + "learning_rate": 0.00014597673362025756, + "loss": 0.9688, + "step": 8933 + }, + { + "epoch": 1.905716723549488, + "grad_norm": 0.5700273477385739, + "learning_rate": 0.00014597192048911053, + "loss": 0.9553, + "step": 8934 + }, + { + "epoch": 1.905930034129693, + "grad_norm": 0.5472968216691737, + "learning_rate": 0.0001459671066114901, + "loss": 0.9435, + "step": 8935 + }, + { + "epoch": 1.9061433447098977, + "grad_norm": 0.4481999839261931, + "learning_rate": 0.00014596229198745082, + "loss": 0.9782, + "step": 8936 + }, + { + "epoch": 1.9063566552901023, + "grad_norm": 0.3491971806535862, + "learning_rate": 0.00014595747661704704, + "loss": 0.9448, + "step": 8937 + }, + { + "epoch": 1.906569965870307, + "grad_norm": 0.27133632992242185, + "learning_rate": 0.00014595266050033333, + "loss": 0.9494, + "step": 8938 + }, + { + "epoch": 1.906783276450512, + "grad_norm": 0.28472535464460186, + "learning_rate": 0.00014594784363736413, + "loss": 0.9878, + "step": 8939 + }, + { + "epoch": 1.9069965870307168, + "grad_norm": 0.2492095231232638, + "learning_rate": 0.00014594302602819397, + "loss": 0.9736, + "step": 8940 + }, + { + "epoch": 1.9072098976109215, + "grad_norm": 0.22130296970595945, + "learning_rate": 0.00014593820767287737, + "loss": 0.9444, + "step": 8941 + }, + { + "epoch": 1.9074232081911262, + "grad_norm": 0.24489899909722113, + "learning_rate": 0.00014593338857146887, + "loss": 0.935, + "step": 8942 + }, + { + "epoch": 1.9076365187713311, + "grad_norm": 0.25651804303479475, + "learning_rate": 0.00014592856872402293, + "loss": 0.9513, + "step": 8943 + }, + { + "epoch": 1.9078498293515358, + "grad_norm": 0.25610788828841596, + "learning_rate": 0.00014592374813059413, + "loss": 0.9393, + "step": 8944 + }, + { + "epoch": 1.9080631399317407, + "grad_norm": 0.2965631488065867, + "learning_rate": 0.00014591892679123705, + "loss": 0.918, + "step": 8945 + }, + { + "epoch": 1.9082764505119454, + "grad_norm": 0.2858989188605909, + "learning_rate": 0.00014591410470600615, + "loss": 0.9237, + "step": 8946 + }, + { + "epoch": 1.90848976109215, + "grad_norm": 0.3306288834703908, + "learning_rate": 0.00014590928187495608, + "loss": 0.9828, + "step": 8947 + }, + { + "epoch": 1.908703071672355, + "grad_norm": 0.2987486629080688, + "learning_rate": 0.00014590445829814138, + "loss": 0.956, + "step": 8948 + }, + { + "epoch": 1.9089163822525599, + "grad_norm": 0.29068508131853665, + "learning_rate": 0.0001458996339756166, + "loss": 0.9623, + "step": 8949 + }, + { + "epoch": 1.9091296928327646, + "grad_norm": 0.4734826518203138, + "learning_rate": 0.00014589480890743638, + "loss": 0.9369, + "step": 8950 + }, + { + "epoch": 1.9093430034129693, + "grad_norm": 0.55615380505737, + "learning_rate": 0.00014588998309365527, + "loss": 0.9591, + "step": 8951 + }, + { + "epoch": 1.909556313993174, + "grad_norm": 0.5618683405074236, + "learning_rate": 0.0001458851565343279, + "loss": 0.9315, + "step": 8952 + }, + { + "epoch": 1.9097696245733788, + "grad_norm": 0.5197802640447307, + "learning_rate": 0.00014588032922950887, + "loss": 0.9288, + "step": 8953 + }, + { + "epoch": 1.9099829351535837, + "grad_norm": 0.4290618770830521, + "learning_rate": 0.0001458755011792528, + "loss": 0.9082, + "step": 8954 + }, + { + "epoch": 1.9101962457337884, + "grad_norm": 0.37458711631444447, + "learning_rate": 0.00014587067238361437, + "loss": 0.9542, + "step": 8955 + }, + { + "epoch": 1.9104095563139931, + "grad_norm": 0.33588273428473836, + "learning_rate": 0.00014586584284264812, + "loss": 0.9761, + "step": 8956 + }, + { + "epoch": 1.9106228668941978, + "grad_norm": 0.29178258469789586, + "learning_rate": 0.00014586101255640877, + "loss": 0.931, + "step": 8957 + }, + { + "epoch": 1.9108361774744027, + "grad_norm": 0.32746843314354873, + "learning_rate": 0.00014585618152495095, + "loss": 0.9448, + "step": 8958 + }, + { + "epoch": 1.9110494880546076, + "grad_norm": 0.2742458774451931, + "learning_rate": 0.00014585134974832935, + "loss": 0.9534, + "step": 8959 + }, + { + "epoch": 1.9112627986348123, + "grad_norm": 0.22697928635135575, + "learning_rate": 0.0001458465172265986, + "loss": 0.9486, + "step": 8960 + }, + { + "epoch": 1.911476109215017, + "grad_norm": 0.31097921984057203, + "learning_rate": 0.00014584168395981342, + "loss": 0.9211, + "step": 8961 + }, + { + "epoch": 1.911689419795222, + "grad_norm": 0.3039207615964365, + "learning_rate": 0.00014583684994802845, + "loss": 0.9954, + "step": 8962 + }, + { + "epoch": 1.9119027303754266, + "grad_norm": 0.30186004066170147, + "learning_rate": 0.00014583201519129842, + "loss": 0.9631, + "step": 8963 + }, + { + "epoch": 1.9121160409556315, + "grad_norm": 0.2703798320178259, + "learning_rate": 0.00014582717968967805, + "loss": 0.9422, + "step": 8964 + }, + { + "epoch": 1.9123293515358362, + "grad_norm": 0.32992932730576185, + "learning_rate": 0.00014582234344322204, + "loss": 0.9761, + "step": 8965 + }, + { + "epoch": 1.9125426621160408, + "grad_norm": 0.3592715436533689, + "learning_rate": 0.00014581750645198508, + "loss": 0.9378, + "step": 8966 + }, + { + "epoch": 1.9127559726962458, + "grad_norm": 0.33115516205728074, + "learning_rate": 0.00014581266871602195, + "loss": 0.9533, + "step": 8967 + }, + { + "epoch": 1.9129692832764507, + "grad_norm": 0.36239389335645095, + "learning_rate": 0.00014580783023538732, + "loss": 0.9422, + "step": 8968 + }, + { + "epoch": 1.9131825938566553, + "grad_norm": 0.4497945451079948, + "learning_rate": 0.00014580299101013604, + "loss": 0.939, + "step": 8969 + }, + { + "epoch": 1.91339590443686, + "grad_norm": 0.49481665671600844, + "learning_rate": 0.0001457981510403228, + "loss": 0.9542, + "step": 8970 + }, + { + "epoch": 1.9136092150170647, + "grad_norm": 0.5086074708693633, + "learning_rate": 0.00014579331032600235, + "loss": 0.9614, + "step": 8971 + }, + { + "epoch": 1.9138225255972696, + "grad_norm": 0.5068184867251683, + "learning_rate": 0.00014578846886722951, + "loss": 0.9692, + "step": 8972 + }, + { + "epoch": 1.9140358361774745, + "grad_norm": 0.45574725304683944, + "learning_rate": 0.00014578362666405902, + "loss": 0.9648, + "step": 8973 + }, + { + "epoch": 1.9142491467576792, + "grad_norm": 0.3151299642727836, + "learning_rate": 0.0001457787837165457, + "loss": 0.9614, + "step": 8974 + }, + { + "epoch": 1.914462457337884, + "grad_norm": 0.4186460000127335, + "learning_rate": 0.00014577394002474433, + "loss": 0.9606, + "step": 8975 + }, + { + "epoch": 1.9146757679180886, + "grad_norm": 0.25532923349069586, + "learning_rate": 0.00014576909558870974, + "loss": 0.9313, + "step": 8976 + }, + { + "epoch": 1.9148890784982935, + "grad_norm": 0.29361927616639505, + "learning_rate": 0.0001457642504084967, + "loss": 0.9585, + "step": 8977 + }, + { + "epoch": 1.9151023890784984, + "grad_norm": 0.2812513384740519, + "learning_rate": 0.00014575940448416006, + "loss": 0.9157, + "step": 8978 + }, + { + "epoch": 1.915315699658703, + "grad_norm": 0.30323551409045213, + "learning_rate": 0.0001457545578157547, + "loss": 0.9386, + "step": 8979 + }, + { + "epoch": 1.9155290102389078, + "grad_norm": 0.28199328301954857, + "learning_rate": 0.00014574971040333534, + "loss": 0.9423, + "step": 8980 + }, + { + "epoch": 1.9157423208191127, + "grad_norm": 0.26226564184880286, + "learning_rate": 0.00014574486224695695, + "loss": 0.9831, + "step": 8981 + }, + { + "epoch": 1.9159556313993176, + "grad_norm": 0.3168178797026497, + "learning_rate": 0.00014574001334667435, + "loss": 0.9463, + "step": 8982 + }, + { + "epoch": 1.9161689419795223, + "grad_norm": 0.34186434251453357, + "learning_rate": 0.00014573516370254233, + "loss": 0.9853, + "step": 8983 + }, + { + "epoch": 1.916382252559727, + "grad_norm": 0.8689597489423602, + "learning_rate": 0.00014573031331461587, + "loss": 0.9407, + "step": 8984 + }, + { + "epoch": 1.9165955631399316, + "grad_norm": 0.30857719075149237, + "learning_rate": 0.00014572546218294982, + "loss": 0.9313, + "step": 8985 + }, + { + "epoch": 1.9168088737201365, + "grad_norm": 0.3558200520859963, + "learning_rate": 0.00014572061030759903, + "loss": 0.9688, + "step": 8986 + }, + { + "epoch": 1.9170221843003414, + "grad_norm": 0.3785778924154432, + "learning_rate": 0.00014571575768861845, + "loss": 0.9466, + "step": 8987 + }, + { + "epoch": 1.9172354948805461, + "grad_norm": 0.45942761834650814, + "learning_rate": 0.00014571090432606297, + "loss": 0.9851, + "step": 8988 + }, + { + "epoch": 1.9174488054607508, + "grad_norm": 0.5952340001240647, + "learning_rate": 0.0001457060502199875, + "loss": 0.956, + "step": 8989 + }, + { + "epoch": 1.9176621160409555, + "grad_norm": 0.7385781144790309, + "learning_rate": 0.00014570119537044693, + "loss": 0.97, + "step": 8990 + }, + { + "epoch": 1.9178754266211604, + "grad_norm": 0.8271652238561977, + "learning_rate": 0.00014569633977749627, + "loss": 0.9684, + "step": 8991 + }, + { + "epoch": 1.9180887372013653, + "grad_norm": 0.9216439225576328, + "learning_rate": 0.0001456914834411904, + "loss": 0.9636, + "step": 8992 + }, + { + "epoch": 1.91830204778157, + "grad_norm": 1.0676800839306877, + "learning_rate": 0.00014568662636158433, + "loss": 0.9617, + "step": 8993 + }, + { + "epoch": 1.9185153583617747, + "grad_norm": 0.9409178463687525, + "learning_rate": 0.00014568176853873291, + "loss": 0.9279, + "step": 8994 + }, + { + "epoch": 1.9187286689419796, + "grad_norm": 0.6790145963738639, + "learning_rate": 0.0001456769099726912, + "loss": 0.9635, + "step": 8995 + }, + { + "epoch": 1.9189419795221843, + "grad_norm": 0.4159556052443961, + "learning_rate": 0.0001456720506635142, + "loss": 0.9609, + "step": 8996 + }, + { + "epoch": 1.9191552901023892, + "grad_norm": 0.336168560416569, + "learning_rate": 0.0001456671906112568, + "loss": 0.9631, + "step": 8997 + }, + { + "epoch": 1.9193686006825939, + "grad_norm": 0.4891944206694917, + "learning_rate": 0.00014566232981597403, + "loss": 0.9513, + "step": 8998 + }, + { + "epoch": 1.9195819112627985, + "grad_norm": 0.681342267291843, + "learning_rate": 0.0001456574682777209, + "loss": 0.947, + "step": 8999 + }, + { + "epoch": 1.9197952218430034, + "grad_norm": 0.7827632585287961, + "learning_rate": 0.0001456526059965524, + "loss": 0.9404, + "step": 9000 + }, + { + "epoch": 1.9200085324232083, + "grad_norm": 0.7962512115561643, + "learning_rate": 0.00014564774297252356, + "loss": 0.9941, + "step": 9001 + }, + { + "epoch": 1.920221843003413, + "grad_norm": 0.6219115138184066, + "learning_rate": 0.00014564287920568943, + "loss": 0.9352, + "step": 9002 + }, + { + "epoch": 1.9204351535836177, + "grad_norm": 0.4070558189327167, + "learning_rate": 0.000145638014696105, + "loss": 0.9771, + "step": 9003 + }, + { + "epoch": 1.9206484641638224, + "grad_norm": 0.32830728732348546, + "learning_rate": 0.00014563314944382534, + "loss": 0.9517, + "step": 9004 + }, + { + "epoch": 1.9208617747440273, + "grad_norm": 0.4674740793619706, + "learning_rate": 0.00014562828344890548, + "loss": 0.9178, + "step": 9005 + }, + { + "epoch": 1.9210750853242322, + "grad_norm": 0.4951335298519131, + "learning_rate": 0.0001456234167114005, + "loss": 0.9345, + "step": 9006 + }, + { + "epoch": 1.921288395904437, + "grad_norm": 0.4220305601740885, + "learning_rate": 0.00014561854923136544, + "loss": 0.955, + "step": 9007 + }, + { + "epoch": 1.9215017064846416, + "grad_norm": 0.36709827988981886, + "learning_rate": 0.0001456136810088554, + "loss": 0.9546, + "step": 9008 + }, + { + "epoch": 1.9217150170648463, + "grad_norm": 0.3090349099667185, + "learning_rate": 0.00014560881204392546, + "loss": 0.9381, + "step": 9009 + }, + { + "epoch": 1.9219283276450512, + "grad_norm": 0.2623664410930039, + "learning_rate": 0.0001456039423366307, + "loss": 0.9422, + "step": 9010 + }, + { + "epoch": 1.922141638225256, + "grad_norm": 0.3987786264794626, + "learning_rate": 0.00014559907188702623, + "loss": 0.9698, + "step": 9011 + }, + { + "epoch": 1.9223549488054608, + "grad_norm": 0.4563262374809081, + "learning_rate": 0.00014559420069516714, + "loss": 0.9343, + "step": 9012 + }, + { + "epoch": 1.9225682593856654, + "grad_norm": 0.3059333172457586, + "learning_rate": 0.0001455893287611086, + "loss": 0.943, + "step": 9013 + }, + { + "epoch": 1.9227815699658704, + "grad_norm": 0.24840291481405496, + "learning_rate": 0.0001455844560849057, + "loss": 0.9391, + "step": 9014 + }, + { + "epoch": 1.922994880546075, + "grad_norm": 0.36548593755336833, + "learning_rate": 0.00014557958266661354, + "loss": 0.9339, + "step": 9015 + }, + { + "epoch": 1.92320819112628, + "grad_norm": 0.41350314660502485, + "learning_rate": 0.00014557470850628734, + "loss": 0.9641, + "step": 9016 + }, + { + "epoch": 1.9234215017064846, + "grad_norm": 0.3894117147749751, + "learning_rate": 0.0001455698336039822, + "loss": 0.9588, + "step": 9017 + }, + { + "epoch": 1.9236348122866893, + "grad_norm": 0.3580379452055938, + "learning_rate": 0.00014556495795975326, + "loss": 0.9729, + "step": 9018 + }, + { + "epoch": 1.9238481228668942, + "grad_norm": 0.3625528861864626, + "learning_rate": 0.00014556008157365574, + "loss": 0.956, + "step": 9019 + }, + { + "epoch": 1.9240614334470991, + "grad_norm": 0.37473956740178027, + "learning_rate": 0.00014555520444574482, + "loss": 0.9481, + "step": 9020 + }, + { + "epoch": 1.9242747440273038, + "grad_norm": 0.40969811252773475, + "learning_rate": 0.0001455503265760756, + "loss": 0.9423, + "step": 9021 + }, + { + "epoch": 1.9244880546075085, + "grad_norm": 0.4664766609983003, + "learning_rate": 0.00014554544796470335, + "loss": 0.949, + "step": 9022 + }, + { + "epoch": 1.9247013651877132, + "grad_norm": 0.45824333737561096, + "learning_rate": 0.0001455405686116833, + "loss": 0.9169, + "step": 9023 + }, + { + "epoch": 1.924914675767918, + "grad_norm": 0.38246838475828193, + "learning_rate": 0.00014553568851707055, + "loss": 0.9595, + "step": 9024 + }, + { + "epoch": 1.925127986348123, + "grad_norm": 0.34554613852707916, + "learning_rate": 0.00014553080768092043, + "loss": 0.9574, + "step": 9025 + }, + { + "epoch": 1.9253412969283277, + "grad_norm": 0.30001239760988335, + "learning_rate": 0.0001455259261032881, + "loss": 0.9536, + "step": 9026 + }, + { + "epoch": 1.9255546075085324, + "grad_norm": 0.26212537568763633, + "learning_rate": 0.00014552104378422884, + "loss": 0.9664, + "step": 9027 + }, + { + "epoch": 1.925767918088737, + "grad_norm": 0.2541655831636141, + "learning_rate": 0.00014551616072379784, + "loss": 0.9594, + "step": 9028 + }, + { + "epoch": 1.925981228668942, + "grad_norm": 0.2597977948905391, + "learning_rate": 0.0001455112769220504, + "loss": 0.962, + "step": 9029 + }, + { + "epoch": 1.9261945392491469, + "grad_norm": 0.26745893295187334, + "learning_rate": 0.00014550639237904172, + "loss": 0.9384, + "step": 9030 + }, + { + "epoch": 1.9264078498293515, + "grad_norm": 0.2615903702272602, + "learning_rate": 0.00014550150709482715, + "loss": 0.9272, + "step": 9031 + }, + { + "epoch": 1.9266211604095562, + "grad_norm": 0.21929002933919464, + "learning_rate": 0.00014549662106946192, + "loss": 0.9304, + "step": 9032 + }, + { + "epoch": 1.9268344709897611, + "grad_norm": 0.2920168431715556, + "learning_rate": 0.00014549173430300128, + "loss": 0.896, + "step": 9033 + }, + { + "epoch": 1.927047781569966, + "grad_norm": 0.357834411697117, + "learning_rate": 0.00014548684679550062, + "loss": 0.9504, + "step": 9034 + }, + { + "epoch": 1.9272610921501707, + "grad_norm": 0.36572521198586755, + "learning_rate": 0.00014548195854701516, + "loss": 0.9853, + "step": 9035 + }, + { + "epoch": 1.9274744027303754, + "grad_norm": 0.36909322057704247, + "learning_rate": 0.00014547706955760022, + "loss": 0.9679, + "step": 9036 + }, + { + "epoch": 1.92768771331058, + "grad_norm": 0.4257416166631525, + "learning_rate": 0.00014547217982731117, + "loss": 0.9577, + "step": 9037 + }, + { + "epoch": 1.927901023890785, + "grad_norm": 0.4234148589744146, + "learning_rate": 0.00014546728935620326, + "loss": 0.9527, + "step": 9038 + }, + { + "epoch": 1.92811433447099, + "grad_norm": 0.31077785276402226, + "learning_rate": 0.0001454623981443319, + "loss": 0.972, + "step": 9039 + }, + { + "epoch": 1.9283276450511946, + "grad_norm": 0.8465182524969075, + "learning_rate": 0.00014545750619175243, + "loss": 0.9606, + "step": 9040 + }, + { + "epoch": 1.9285409556313993, + "grad_norm": 0.2742097610265911, + "learning_rate": 0.00014545261349852012, + "loss": 0.9372, + "step": 9041 + }, + { + "epoch": 1.928754266211604, + "grad_norm": 0.35889131380621375, + "learning_rate": 0.00014544772006469043, + "loss": 1.0028, + "step": 9042 + }, + { + "epoch": 1.9289675767918089, + "grad_norm": 0.5233466459878098, + "learning_rate": 0.0001454428258903187, + "loss": 0.9594, + "step": 9043 + }, + { + "epoch": 1.9291808873720138, + "grad_norm": 0.4854352652416334, + "learning_rate": 0.00014543793097546024, + "loss": 0.9417, + "step": 9044 + }, + { + "epoch": 1.9293941979522184, + "grad_norm": 0.4228564114067626, + "learning_rate": 0.00014543303532017055, + "loss": 0.9276, + "step": 9045 + }, + { + "epoch": 1.9296075085324231, + "grad_norm": 0.4328977345924618, + "learning_rate": 0.0001454281389245049, + "loss": 0.9548, + "step": 9046 + }, + { + "epoch": 1.929820819112628, + "grad_norm": 0.45429207934453103, + "learning_rate": 0.00014542324178851878, + "loss": 0.9216, + "step": 9047 + }, + { + "epoch": 1.9300341296928327, + "grad_norm": 0.43861749079900597, + "learning_rate": 0.0001454183439122676, + "loss": 0.9293, + "step": 9048 + }, + { + "epoch": 1.9302474402730376, + "grad_norm": 0.7105380365199923, + "learning_rate": 0.00014541344529580676, + "loss": 0.9617, + "step": 9049 + }, + { + "epoch": 1.9304607508532423, + "grad_norm": 0.8201330634877075, + "learning_rate": 0.0001454085459391917, + "loss": 0.9619, + "step": 9050 + }, + { + "epoch": 1.930674061433447, + "grad_norm": 0.979368189392295, + "learning_rate": 0.0001454036458424778, + "loss": 0.9379, + "step": 9051 + }, + { + "epoch": 1.930887372013652, + "grad_norm": 0.9251807289241345, + "learning_rate": 0.00014539874500572056, + "loss": 0.9954, + "step": 9052 + }, + { + "epoch": 1.9311006825938568, + "grad_norm": 0.7783128129239937, + "learning_rate": 0.00014539384342897543, + "loss": 0.9203, + "step": 9053 + }, + { + "epoch": 1.9313139931740615, + "grad_norm": 0.5929242469742141, + "learning_rate": 0.00014538894111229786, + "loss": 0.9731, + "step": 9054 + }, + { + "epoch": 1.9315273037542662, + "grad_norm": 0.5959899047724941, + "learning_rate": 0.00014538403805574332, + "loss": 0.9281, + "step": 9055 + }, + { + "epoch": 1.9317406143344709, + "grad_norm": 0.720730005423974, + "learning_rate": 0.00014537913425936727, + "loss": 0.9636, + "step": 9056 + }, + { + "epoch": 1.9319539249146758, + "grad_norm": 0.7678762471018024, + "learning_rate": 0.00014537422972322523, + "loss": 0.9667, + "step": 9057 + }, + { + "epoch": 1.9321672354948807, + "grad_norm": 0.569117890192637, + "learning_rate": 0.00014536932444737268, + "loss": 0.98, + "step": 9058 + }, + { + "epoch": 1.9323805460750854, + "grad_norm": 0.45267737035909833, + "learning_rate": 0.00014536441843186512, + "loss": 0.9641, + "step": 9059 + }, + { + "epoch": 1.93259385665529, + "grad_norm": 0.441882902254126, + "learning_rate": 0.00014535951167675808, + "loss": 0.9614, + "step": 9060 + }, + { + "epoch": 1.9328071672354947, + "grad_norm": 0.46669299859395846, + "learning_rate": 0.00014535460418210706, + "loss": 0.9617, + "step": 9061 + }, + { + "epoch": 1.9330204778156996, + "grad_norm": 0.5118167487459402, + "learning_rate": 0.0001453496959479676, + "loss": 0.9536, + "step": 9062 + }, + { + "epoch": 1.9332337883959045, + "grad_norm": 0.414128987565466, + "learning_rate": 0.00014534478697439523, + "loss": 0.924, + "step": 9063 + }, + { + "epoch": 1.9334470989761092, + "grad_norm": 0.3855749469999984, + "learning_rate": 0.00014533987726144548, + "loss": 0.9618, + "step": 9064 + }, + { + "epoch": 1.933660409556314, + "grad_norm": 0.4246322010415785, + "learning_rate": 0.00014533496680917393, + "loss": 0.9377, + "step": 9065 + }, + { + "epoch": 1.9338737201365188, + "grad_norm": 0.4450768097496202, + "learning_rate": 0.00014533005561763614, + "loss": 0.9326, + "step": 9066 + }, + { + "epoch": 1.9340870307167235, + "grad_norm": 0.5189324763858839, + "learning_rate": 0.00014532514368688766, + "loss": 0.9716, + "step": 9067 + }, + { + "epoch": 1.9343003412969284, + "grad_norm": 0.4521008764905304, + "learning_rate": 0.00014532023101698408, + "loss": 0.9365, + "step": 9068 + }, + { + "epoch": 1.934513651877133, + "grad_norm": 0.36722647229670946, + "learning_rate": 0.000145315317607981, + "loss": 0.9419, + "step": 9069 + }, + { + "epoch": 1.9347269624573378, + "grad_norm": 0.28936548033998044, + "learning_rate": 0.000145310403459934, + "loss": 0.9363, + "step": 9070 + }, + { + "epoch": 1.9349402730375427, + "grad_norm": 0.39293082662299333, + "learning_rate": 0.00014530548857289867, + "loss": 0.941, + "step": 9071 + }, + { + "epoch": 1.9351535836177476, + "grad_norm": 0.46326474916219573, + "learning_rate": 0.00014530057294693066, + "loss": 0.9557, + "step": 9072 + }, + { + "epoch": 1.9353668941979523, + "grad_norm": 0.5133136096770409, + "learning_rate": 0.00014529565658208556, + "loss": 0.9132, + "step": 9073 + }, + { + "epoch": 1.935580204778157, + "grad_norm": 0.449221642603789, + "learning_rate": 0.000145290739478419, + "loss": 0.9181, + "step": 9074 + }, + { + "epoch": 1.9357935153583616, + "grad_norm": 0.3616608906996644, + "learning_rate": 0.00014528582163598664, + "loss": 0.9433, + "step": 9075 + }, + { + "epoch": 1.9360068259385665, + "grad_norm": 0.2912842249580637, + "learning_rate": 0.00014528090305484408, + "loss": 0.9425, + "step": 9076 + }, + { + "epoch": 1.9362201365187715, + "grad_norm": 0.27382594252542064, + "learning_rate": 0.00014527598373504703, + "loss": 0.9468, + "step": 9077 + }, + { + "epoch": 1.9364334470989761, + "grad_norm": 0.28253951627106105, + "learning_rate": 0.00014527106367665115, + "loss": 0.9449, + "step": 9078 + }, + { + "epoch": 1.9366467576791808, + "grad_norm": 0.3095361609335597, + "learning_rate": 0.00014526614287971204, + "loss": 0.9651, + "step": 9079 + }, + { + "epoch": 1.9368600682593855, + "grad_norm": 0.342595873343641, + "learning_rate": 0.00014526122134428545, + "loss": 0.9721, + "step": 9080 + }, + { + "epoch": 1.9370733788395904, + "grad_norm": 0.2782441145326023, + "learning_rate": 0.00014525629907042703, + "loss": 0.9575, + "step": 9081 + }, + { + "epoch": 1.9372866894197953, + "grad_norm": 0.22777117372671773, + "learning_rate": 0.0001452513760581925, + "loss": 0.9518, + "step": 9082 + }, + { + "epoch": 1.9375, + "grad_norm": 0.25165577825729746, + "learning_rate": 0.00014524645230763755, + "loss": 0.9393, + "step": 9083 + }, + { + "epoch": 1.9377133105802047, + "grad_norm": 0.2627967191120741, + "learning_rate": 0.00014524152781881787, + "loss": 0.9425, + "step": 9084 + }, + { + "epoch": 1.9379266211604096, + "grad_norm": 0.2384347283898155, + "learning_rate": 0.00014523660259178921, + "loss": 0.9336, + "step": 9085 + }, + { + "epoch": 1.9381399317406145, + "grad_norm": 0.23925594489882773, + "learning_rate": 0.00014523167662660734, + "loss": 0.9457, + "step": 9086 + }, + { + "epoch": 1.9383532423208192, + "grad_norm": 0.25587006845618565, + "learning_rate": 0.00014522674992332793, + "loss": 0.9747, + "step": 9087 + }, + { + "epoch": 1.9385665529010239, + "grad_norm": 0.253323604863062, + "learning_rate": 0.00014522182248200672, + "loss": 0.9099, + "step": 9088 + }, + { + "epoch": 1.9387798634812285, + "grad_norm": 0.33707472493723867, + "learning_rate": 0.00014521689430269954, + "loss": 0.9779, + "step": 9089 + }, + { + "epoch": 1.9389931740614335, + "grad_norm": 0.4039700970274495, + "learning_rate": 0.00014521196538546204, + "loss": 0.9682, + "step": 9090 + }, + { + "epoch": 1.9392064846416384, + "grad_norm": 0.4579158318947872, + "learning_rate": 0.0001452070357303501, + "loss": 0.9221, + "step": 9091 + }, + { + "epoch": 1.939419795221843, + "grad_norm": 0.45948863641232013, + "learning_rate": 0.00014520210533741946, + "loss": 0.9247, + "step": 9092 + }, + { + "epoch": 1.9396331058020477, + "grad_norm": 0.42638175398690215, + "learning_rate": 0.00014519717420672588, + "loss": 0.9395, + "step": 9093 + }, + { + "epoch": 1.9398464163822524, + "grad_norm": 0.28625523279589316, + "learning_rate": 0.00014519224233832518, + "loss": 0.9386, + "step": 9094 + }, + { + "epoch": 1.9400597269624573, + "grad_norm": 0.2588854697044863, + "learning_rate": 0.00014518730973227316, + "loss": 0.9388, + "step": 9095 + }, + { + "epoch": 1.9402730375426622, + "grad_norm": 0.38179000756768733, + "learning_rate": 0.00014518237638862564, + "loss": 0.9696, + "step": 9096 + }, + { + "epoch": 1.940486348122867, + "grad_norm": 0.398510347284627, + "learning_rate": 0.00014517744230743842, + "loss": 0.9859, + "step": 9097 + }, + { + "epoch": 1.9406996587030716, + "grad_norm": 0.42990088657039066, + "learning_rate": 0.00014517250748876737, + "loss": 0.955, + "step": 9098 + }, + { + "epoch": 1.9409129692832765, + "grad_norm": 0.5055199305313333, + "learning_rate": 0.00014516757193266827, + "loss": 0.9575, + "step": 9099 + }, + { + "epoch": 1.9411262798634812, + "grad_norm": 0.47405011693942517, + "learning_rate": 0.00014516263563919702, + "loss": 0.9474, + "step": 9100 + }, + { + "epoch": 1.941339590443686, + "grad_norm": 0.3180809162285644, + "learning_rate": 0.00014515769860840946, + "loss": 0.9493, + "step": 9101 + }, + { + "epoch": 1.9415529010238908, + "grad_norm": 0.33010743142991245, + "learning_rate": 0.00014515276084036143, + "loss": 0.9524, + "step": 9102 + }, + { + "epoch": 1.9417662116040955, + "grad_norm": 0.6708482932896397, + "learning_rate": 0.00014514782233510882, + "loss": 0.9364, + "step": 9103 + }, + { + "epoch": 1.9419795221843004, + "grad_norm": 0.2798096792185111, + "learning_rate": 0.0001451428830927075, + "loss": 0.9241, + "step": 9104 + }, + { + "epoch": 1.9421928327645053, + "grad_norm": 0.27646443396756704, + "learning_rate": 0.00014513794311321336, + "loss": 0.9392, + "step": 9105 + }, + { + "epoch": 1.94240614334471, + "grad_norm": 0.301643605246165, + "learning_rate": 0.0001451330023966823, + "loss": 0.9536, + "step": 9106 + }, + { + "epoch": 1.9426194539249146, + "grad_norm": 0.28178248896185637, + "learning_rate": 0.00014512806094317023, + "loss": 0.9414, + "step": 9107 + }, + { + "epoch": 1.9428327645051193, + "grad_norm": 0.2612743829742066, + "learning_rate": 0.00014512311875273306, + "loss": 0.9471, + "step": 9108 + }, + { + "epoch": 1.9430460750853242, + "grad_norm": 0.293564305594741, + "learning_rate": 0.0001451181758254267, + "loss": 0.9437, + "step": 9109 + }, + { + "epoch": 1.9432593856655291, + "grad_norm": 0.2713861456817509, + "learning_rate": 0.00014511323216130707, + "loss": 0.9632, + "step": 9110 + }, + { + "epoch": 1.9434726962457338, + "grad_norm": 0.30076177307599844, + "learning_rate": 0.00014510828776043012, + "loss": 0.9274, + "step": 9111 + }, + { + "epoch": 1.9436860068259385, + "grad_norm": 0.33234202441890237, + "learning_rate": 0.00014510334262285183, + "loss": 0.9427, + "step": 9112 + }, + { + "epoch": 1.9438993174061432, + "grad_norm": 0.29615177207919713, + "learning_rate": 0.0001450983967486281, + "loss": 0.9787, + "step": 9113 + }, + { + "epoch": 1.944112627986348, + "grad_norm": 0.2446892291628947, + "learning_rate": 0.00014509345013781494, + "loss": 0.9222, + "step": 9114 + }, + { + "epoch": 1.944325938566553, + "grad_norm": 0.2867799532168627, + "learning_rate": 0.00014508850279046825, + "loss": 0.9325, + "step": 9115 + }, + { + "epoch": 1.9445392491467577, + "grad_norm": 0.26918355259211385, + "learning_rate": 0.00014508355470664407, + "loss": 0.9282, + "step": 9116 + }, + { + "epoch": 1.9447525597269624, + "grad_norm": 0.22767034203187247, + "learning_rate": 0.0001450786058863984, + "loss": 0.974, + "step": 9117 + }, + { + "epoch": 1.9449658703071673, + "grad_norm": 0.24095450065679355, + "learning_rate": 0.0001450736563297872, + "loss": 0.9206, + "step": 9118 + }, + { + "epoch": 1.945179180887372, + "grad_norm": 0.2581867984448115, + "learning_rate": 0.0001450687060368665, + "loss": 0.9458, + "step": 9119 + }, + { + "epoch": 1.9453924914675769, + "grad_norm": 0.21514467297744014, + "learning_rate": 0.00014506375500769226, + "loss": 0.9547, + "step": 9120 + }, + { + "epoch": 1.9456058020477816, + "grad_norm": 0.20912133381658485, + "learning_rate": 0.00014505880324232055, + "loss": 0.9499, + "step": 9121 + }, + { + "epoch": 1.9458191126279862, + "grad_norm": 0.25650877286059076, + "learning_rate": 0.00014505385074080739, + "loss": 0.944, + "step": 9122 + }, + { + "epoch": 1.9460324232081911, + "grad_norm": 0.23241255227960644, + "learning_rate": 0.00014504889750320882, + "loss": 0.9511, + "step": 9123 + }, + { + "epoch": 1.946245733788396, + "grad_norm": 0.2448145535895959, + "learning_rate": 0.00014504394352958088, + "loss": 0.9471, + "step": 9124 + }, + { + "epoch": 1.9464590443686007, + "grad_norm": 0.2478369644432543, + "learning_rate": 0.00014503898881997962, + "loss": 0.9678, + "step": 9125 + }, + { + "epoch": 1.9466723549488054, + "grad_norm": 0.23803340671348266, + "learning_rate": 0.00014503403337446112, + "loss": 0.9237, + "step": 9126 + }, + { + "epoch": 1.94688566552901, + "grad_norm": 0.24221837037540692, + "learning_rate": 0.0001450290771930814, + "loss": 0.9526, + "step": 9127 + }, + { + "epoch": 1.947098976109215, + "grad_norm": 0.2640260585994145, + "learning_rate": 0.00014502412027589664, + "loss": 0.9884, + "step": 9128 + }, + { + "epoch": 1.94731228668942, + "grad_norm": 0.31025115270163584, + "learning_rate": 0.00014501916262296281, + "loss": 0.9745, + "step": 9129 + }, + { + "epoch": 1.9475255972696246, + "grad_norm": 0.3365375493330231, + "learning_rate": 0.0001450142042343361, + "loss": 0.9377, + "step": 9130 + }, + { + "epoch": 1.9477389078498293, + "grad_norm": 0.3397907765040605, + "learning_rate": 0.00014500924511007255, + "loss": 0.9399, + "step": 9131 + }, + { + "epoch": 1.947952218430034, + "grad_norm": 0.3965255126041522, + "learning_rate": 0.00014500428525022831, + "loss": 0.9451, + "step": 9132 + }, + { + "epoch": 1.9481655290102389, + "grad_norm": 0.5276572282227507, + "learning_rate": 0.00014499932465485949, + "loss": 0.9396, + "step": 9133 + }, + { + "epoch": 1.9483788395904438, + "grad_norm": 0.5691725757327849, + "learning_rate": 0.0001449943633240222, + "loss": 0.9672, + "step": 9134 + }, + { + "epoch": 1.9485921501706485, + "grad_norm": 0.6165231905786132, + "learning_rate": 0.0001449894012577726, + "loss": 0.9582, + "step": 9135 + }, + { + "epoch": 1.9488054607508531, + "grad_norm": 0.6464587110671356, + "learning_rate": 0.00014498443845616685, + "loss": 0.9848, + "step": 9136 + }, + { + "epoch": 1.949018771331058, + "grad_norm": 0.576987971258143, + "learning_rate": 0.00014497947491926109, + "loss": 0.9679, + "step": 9137 + }, + { + "epoch": 1.949232081911263, + "grad_norm": 0.5319506552956559, + "learning_rate": 0.00014497451064711148, + "loss": 0.962, + "step": 9138 + }, + { + "epoch": 1.9494453924914676, + "grad_norm": 0.49756155716812067, + "learning_rate": 0.00014496954563977417, + "loss": 0.949, + "step": 9139 + }, + { + "epoch": 1.9496587030716723, + "grad_norm": 0.44727180249582854, + "learning_rate": 0.00014496457989730534, + "loss": 0.9432, + "step": 9140 + }, + { + "epoch": 1.949872013651877, + "grad_norm": 0.41430136612703694, + "learning_rate": 0.00014495961341976123, + "loss": 0.9768, + "step": 9141 + }, + { + "epoch": 1.950085324232082, + "grad_norm": 0.3261364569460755, + "learning_rate": 0.00014495464620719798, + "loss": 0.9569, + "step": 9142 + }, + { + "epoch": 1.9502986348122868, + "grad_norm": 0.24068567641479446, + "learning_rate": 0.00014494967825967185, + "loss": 0.9768, + "step": 9143 + }, + { + "epoch": 1.9505119453924915, + "grad_norm": 0.3712214396407801, + "learning_rate": 0.000144944709577239, + "loss": 0.909, + "step": 9144 + }, + { + "epoch": 1.9507252559726962, + "grad_norm": 0.4564735648999777, + "learning_rate": 0.00014493974015995564, + "loss": 0.9549, + "step": 9145 + }, + { + "epoch": 1.9509385665529009, + "grad_norm": 0.47866034397220814, + "learning_rate": 0.00014493477000787806, + "loss": 0.9595, + "step": 9146 + }, + { + "epoch": 1.9511518771331058, + "grad_norm": 0.45617027895513734, + "learning_rate": 0.00014492979912106245, + "loss": 0.9097, + "step": 9147 + }, + { + "epoch": 1.9513651877133107, + "grad_norm": 0.41788692755068424, + "learning_rate": 0.00014492482749956507, + "loss": 0.9603, + "step": 9148 + }, + { + "epoch": 1.9515784982935154, + "grad_norm": 0.29752037276890875, + "learning_rate": 0.0001449198551434422, + "loss": 0.9881, + "step": 9149 + }, + { + "epoch": 1.95179180887372, + "grad_norm": 0.2390323515514818, + "learning_rate": 0.00014491488205275007, + "loss": 0.9398, + "step": 9150 + }, + { + "epoch": 1.952005119453925, + "grad_norm": 0.2577516311213443, + "learning_rate": 0.0001449099082275449, + "loss": 0.9141, + "step": 9151 + }, + { + "epoch": 1.9522184300341296, + "grad_norm": 0.3656291899688176, + "learning_rate": 0.00014490493366788309, + "loss": 0.9455, + "step": 9152 + }, + { + "epoch": 1.9524317406143346, + "grad_norm": 0.4852581105708706, + "learning_rate": 0.00014489995837382085, + "loss": 0.9754, + "step": 9153 + }, + { + "epoch": 1.9526450511945392, + "grad_norm": 0.46213190823782335, + "learning_rate": 0.00014489498234541447, + "loss": 0.9733, + "step": 9154 + }, + { + "epoch": 1.952858361774744, + "grad_norm": 0.4015799420915409, + "learning_rate": 0.0001448900055827203, + "loss": 0.9329, + "step": 9155 + }, + { + "epoch": 1.9530716723549488, + "grad_norm": 0.35004380832771753, + "learning_rate": 0.00014488502808579463, + "loss": 0.9513, + "step": 9156 + }, + { + "epoch": 1.9532849829351537, + "grad_norm": 0.2832523776963615, + "learning_rate": 0.00014488004985469376, + "loss": 0.9635, + "step": 9157 + }, + { + "epoch": 1.9534982935153584, + "grad_norm": 0.2762023528767306, + "learning_rate": 0.00014487507088947402, + "loss": 0.9908, + "step": 9158 + }, + { + "epoch": 1.953711604095563, + "grad_norm": 0.3715972297164714, + "learning_rate": 0.00014487009119019178, + "loss": 0.9327, + "step": 9159 + }, + { + "epoch": 1.9539249146757678, + "grad_norm": 0.4313117387104658, + "learning_rate": 0.00014486511075690334, + "loss": 0.9369, + "step": 9160 + }, + { + "epoch": 1.9541382252559727, + "grad_norm": 0.43774507325684336, + "learning_rate": 0.00014486012958966514, + "loss": 0.9373, + "step": 9161 + }, + { + "epoch": 1.9543515358361776, + "grad_norm": 0.48177505647462004, + "learning_rate": 0.00014485514768853344, + "loss": 0.9418, + "step": 9162 + }, + { + "epoch": 1.9545648464163823, + "grad_norm": 0.4530990739626662, + "learning_rate": 0.00014485016505356464, + "loss": 0.9519, + "step": 9163 + }, + { + "epoch": 1.954778156996587, + "grad_norm": 0.41293089758937285, + "learning_rate": 0.00014484518168481517, + "loss": 0.9546, + "step": 9164 + }, + { + "epoch": 1.9549914675767917, + "grad_norm": 0.4094794978280889, + "learning_rate": 0.00014484019758234133, + "loss": 0.9219, + "step": 9165 + }, + { + "epoch": 1.9552047781569966, + "grad_norm": 0.4821568597087545, + "learning_rate": 0.00014483521274619962, + "loss": 0.9625, + "step": 9166 + }, + { + "epoch": 1.9554180887372015, + "grad_norm": 0.31460399051747745, + "learning_rate": 0.00014483022717644634, + "loss": 0.9504, + "step": 9167 + }, + { + "epoch": 1.9556313993174061, + "grad_norm": 0.28624311919834233, + "learning_rate": 0.000144825240873138, + "loss": 0.9572, + "step": 9168 + }, + { + "epoch": 1.9558447098976108, + "grad_norm": 0.49824343472931504, + "learning_rate": 0.00014482025383633093, + "loss": 0.9463, + "step": 9169 + }, + { + "epoch": 1.9560580204778157, + "grad_norm": 0.5499835020772015, + "learning_rate": 0.00014481526606608162, + "loss": 0.9373, + "step": 9170 + }, + { + "epoch": 1.9562713310580204, + "grad_norm": 0.51463617631541, + "learning_rate": 0.00014481027756244646, + "loss": 0.9335, + "step": 9171 + }, + { + "epoch": 1.9564846416382253, + "grad_norm": 0.46330710285932764, + "learning_rate": 0.00014480528832548192, + "loss": 0.936, + "step": 9172 + }, + { + "epoch": 1.95669795221843, + "grad_norm": 0.44165636360072563, + "learning_rate": 0.00014480029835524445, + "loss": 0.9234, + "step": 9173 + }, + { + "epoch": 1.9569112627986347, + "grad_norm": 0.4350812522286309, + "learning_rate": 0.00014479530765179054, + "loss": 0.9166, + "step": 9174 + }, + { + "epoch": 1.9571245733788396, + "grad_norm": 0.3632940769337217, + "learning_rate": 0.00014479031621517663, + "loss": 0.9175, + "step": 9175 + }, + { + "epoch": 1.9573378839590445, + "grad_norm": 0.30173953176203006, + "learning_rate": 0.00014478532404545917, + "loss": 0.9242, + "step": 9176 + }, + { + "epoch": 1.9575511945392492, + "grad_norm": 0.22860025170075343, + "learning_rate": 0.00014478033114269473, + "loss": 0.9179, + "step": 9177 + }, + { + "epoch": 1.9577645051194539, + "grad_norm": 0.27206252303290074, + "learning_rate": 0.00014477533750693972, + "loss": 0.9506, + "step": 9178 + }, + { + "epoch": 1.9579778156996586, + "grad_norm": 0.4063699668290801, + "learning_rate": 0.00014477034313825065, + "loss": 0.9232, + "step": 9179 + }, + { + "epoch": 1.9581911262798635, + "grad_norm": 0.49371867067998204, + "learning_rate": 0.00014476534803668412, + "loss": 0.9591, + "step": 9180 + }, + { + "epoch": 1.9584044368600684, + "grad_norm": 0.4854684638149939, + "learning_rate": 0.00014476035220229656, + "loss": 0.9618, + "step": 9181 + }, + { + "epoch": 1.958617747440273, + "grad_norm": 0.44150201972415254, + "learning_rate": 0.00014475535563514454, + "loss": 0.9654, + "step": 9182 + }, + { + "epoch": 1.9588310580204777, + "grad_norm": 0.41187658545477485, + "learning_rate": 0.00014475035833528454, + "loss": 0.9202, + "step": 9183 + }, + { + "epoch": 1.9590443686006824, + "grad_norm": 0.3267653883878923, + "learning_rate": 0.00014474536030277318, + "loss": 0.9182, + "step": 9184 + }, + { + "epoch": 1.9592576791808873, + "grad_norm": 0.2567320078354551, + "learning_rate": 0.00014474036153766696, + "loss": 0.9636, + "step": 9185 + }, + { + "epoch": 1.9594709897610922, + "grad_norm": 0.26713971448996765, + "learning_rate": 0.00014473536204002248, + "loss": 0.9351, + "step": 9186 + }, + { + "epoch": 1.959684300341297, + "grad_norm": 0.25165113686323065, + "learning_rate": 0.00014473036180989628, + "loss": 0.9288, + "step": 9187 + }, + { + "epoch": 1.9598976109215016, + "grad_norm": 0.29141075766222013, + "learning_rate": 0.00014472536084734493, + "loss": 0.9734, + "step": 9188 + }, + { + "epoch": 1.9601109215017065, + "grad_norm": 0.3603400924901394, + "learning_rate": 0.00014472035915242504, + "loss": 0.9386, + "step": 9189 + }, + { + "epoch": 1.9603242320819114, + "grad_norm": 0.3167814389379607, + "learning_rate": 0.00014471535672519323, + "loss": 0.9204, + "step": 9190 + }, + { + "epoch": 1.960537542662116, + "grad_norm": 0.3975648914025256, + "learning_rate": 0.00014471035356570605, + "loss": 0.9682, + "step": 9191 + }, + { + "epoch": 1.9607508532423208, + "grad_norm": 0.3959270559718586, + "learning_rate": 0.00014470534967402012, + "loss": 0.9367, + "step": 9192 + }, + { + "epoch": 1.9609641638225255, + "grad_norm": 0.2882052219846432, + "learning_rate": 0.00014470034505019208, + "loss": 0.9508, + "step": 9193 + }, + { + "epoch": 1.9611774744027304, + "grad_norm": 0.320735833382675, + "learning_rate": 0.00014469533969427856, + "loss": 0.9324, + "step": 9194 + }, + { + "epoch": 1.9613907849829353, + "grad_norm": 0.32495484600300545, + "learning_rate": 0.00014469033360633616, + "loss": 0.9392, + "step": 9195 + }, + { + "epoch": 1.96160409556314, + "grad_norm": 0.3018765529638998, + "learning_rate": 0.00014468532678642157, + "loss": 0.9384, + "step": 9196 + }, + { + "epoch": 1.9618174061433447, + "grad_norm": 0.2774935374793477, + "learning_rate": 0.0001446803192345914, + "loss": 0.9639, + "step": 9197 + }, + { + "epoch": 1.9620307167235493, + "grad_norm": 0.3412016820080037, + "learning_rate": 0.00014467531095090235, + "loss": 0.9423, + "step": 9198 + }, + { + "epoch": 1.9622440273037542, + "grad_norm": 0.39183160661863875, + "learning_rate": 0.00014467030193541104, + "loss": 0.9451, + "step": 9199 + }, + { + "epoch": 1.9624573378839592, + "grad_norm": 0.3077543106515557, + "learning_rate": 0.0001446652921881742, + "loss": 0.9532, + "step": 9200 + }, + { + "epoch": 1.9626706484641638, + "grad_norm": 0.3002736811422381, + "learning_rate": 0.00014466028170924847, + "loss": 0.9348, + "step": 9201 + }, + { + "epoch": 1.9628839590443685, + "grad_norm": 0.519990539957318, + "learning_rate": 0.00014465527049869058, + "loss": 0.9512, + "step": 9202 + }, + { + "epoch": 1.9630972696245734, + "grad_norm": 0.2870008193470215, + "learning_rate": 0.00014465025855655723, + "loss": 0.9626, + "step": 9203 + }, + { + "epoch": 1.963310580204778, + "grad_norm": 0.2532905189685037, + "learning_rate": 0.00014464524588290507, + "loss": 0.9365, + "step": 9204 + }, + { + "epoch": 1.963523890784983, + "grad_norm": 0.30798762521289724, + "learning_rate": 0.00014464023247779092, + "loss": 0.973, + "step": 9205 + }, + { + "epoch": 1.9637372013651877, + "grad_norm": 0.28690154834604903, + "learning_rate": 0.00014463521834127141, + "loss": 0.95, + "step": 9206 + }, + { + "epoch": 1.9639505119453924, + "grad_norm": 0.25051290651015157, + "learning_rate": 0.00014463020347340333, + "loss": 0.9345, + "step": 9207 + }, + { + "epoch": 1.9641638225255973, + "grad_norm": 0.33985393474407827, + "learning_rate": 0.00014462518787424346, + "loss": 0.9526, + "step": 9208 + }, + { + "epoch": 1.9643771331058022, + "grad_norm": 0.3401684978589181, + "learning_rate": 0.00014462017154384844, + "loss": 0.9483, + "step": 9209 + }, + { + "epoch": 1.9645904436860069, + "grad_norm": 0.3267723065591654, + "learning_rate": 0.0001446151544822751, + "loss": 0.9062, + "step": 9210 + }, + { + "epoch": 1.9648037542662116, + "grad_norm": 0.28568895030645025, + "learning_rate": 0.0001446101366895802, + "loss": 0.9363, + "step": 9211 + }, + { + "epoch": 1.9650170648464163, + "grad_norm": 0.2979034780322229, + "learning_rate": 0.00014460511816582054, + "loss": 0.9421, + "step": 9212 + }, + { + "epoch": 1.9652303754266212, + "grad_norm": 0.30186891222693213, + "learning_rate": 0.00014460009891105287, + "loss": 0.9569, + "step": 9213 + }, + { + "epoch": 1.965443686006826, + "grad_norm": 0.2906917339813443, + "learning_rate": 0.000144595078925334, + "loss": 0.9081, + "step": 9214 + }, + { + "epoch": 1.9656569965870307, + "grad_norm": 0.27393553811621485, + "learning_rate": 0.00014459005820872072, + "loss": 0.9444, + "step": 9215 + }, + { + "epoch": 1.9658703071672354, + "grad_norm": 0.2720691933679221, + "learning_rate": 0.00014458503676126984, + "loss": 0.9439, + "step": 9216 + }, + { + "epoch": 1.9660836177474401, + "grad_norm": 0.36329744123204466, + "learning_rate": 0.00014458001458303817, + "loss": 0.938, + "step": 9217 + }, + { + "epoch": 1.966296928327645, + "grad_norm": 0.4134252610243178, + "learning_rate": 0.00014457499167408257, + "loss": 0.9359, + "step": 9218 + }, + { + "epoch": 1.96651023890785, + "grad_norm": 0.37172665846185016, + "learning_rate": 0.00014456996803445987, + "loss": 0.9317, + "step": 9219 + }, + { + "epoch": 1.9667235494880546, + "grad_norm": 0.3952856214006018, + "learning_rate": 0.00014456494366422686, + "loss": 0.9591, + "step": 9220 + }, + { + "epoch": 1.9669368600682593, + "grad_norm": 0.5024256424459677, + "learning_rate": 0.00014455991856344042, + "loss": 0.9259, + "step": 9221 + }, + { + "epoch": 1.9671501706484642, + "grad_norm": 0.34434880559721104, + "learning_rate": 0.00014455489273215746, + "loss": 0.9897, + "step": 9222 + }, + { + "epoch": 1.9673634812286689, + "grad_norm": 0.32443067890514266, + "learning_rate": 0.00014454986617043478, + "loss": 0.9521, + "step": 9223 + }, + { + "epoch": 1.9675767918088738, + "grad_norm": 0.30192639737401444, + "learning_rate": 0.00014454483887832927, + "loss": 0.967, + "step": 9224 + }, + { + "epoch": 1.9677901023890785, + "grad_norm": 0.40941707299517355, + "learning_rate": 0.0001445398108558978, + "loss": 0.9366, + "step": 9225 + }, + { + "epoch": 1.9680034129692832, + "grad_norm": 0.5091079009865345, + "learning_rate": 0.00014453478210319735, + "loss": 0.9185, + "step": 9226 + }, + { + "epoch": 1.968216723549488, + "grad_norm": 0.5912731481136583, + "learning_rate": 0.0001445297526202847, + "loss": 0.9438, + "step": 9227 + }, + { + "epoch": 1.968430034129693, + "grad_norm": 0.8823004642618213, + "learning_rate": 0.00014452472240721685, + "loss": 0.9151, + "step": 9228 + }, + { + "epoch": 1.9686433447098977, + "grad_norm": 1.054190545443553, + "learning_rate": 0.00014451969146405067, + "loss": 0.9528, + "step": 9229 + }, + { + "epoch": 1.9688566552901023, + "grad_norm": 0.9088250959485528, + "learning_rate": 0.0001445146597908431, + "loss": 0.9559, + "step": 9230 + }, + { + "epoch": 1.969069965870307, + "grad_norm": 0.7222636990397269, + "learning_rate": 0.00014450962738765105, + "loss": 0.9317, + "step": 9231 + }, + { + "epoch": 1.969283276450512, + "grad_norm": 0.48378737322661136, + "learning_rate": 0.00014450459425453149, + "loss": 0.9619, + "step": 9232 + }, + { + "epoch": 1.9694965870307168, + "grad_norm": 0.3273781107733687, + "learning_rate": 0.00014449956039154136, + "loss": 0.9434, + "step": 9233 + }, + { + "epoch": 1.9697098976109215, + "grad_norm": 0.4459103280038329, + "learning_rate": 0.00014449452579873761, + "loss": 0.9347, + "step": 9234 + }, + { + "epoch": 1.9699232081911262, + "grad_norm": 0.45106449950005484, + "learning_rate": 0.0001444894904761772, + "loss": 0.9276, + "step": 9235 + }, + { + "epoch": 1.9701365187713311, + "grad_norm": 0.5180385830094687, + "learning_rate": 0.00014448445442391718, + "loss": 0.952, + "step": 9236 + }, + { + "epoch": 1.9703498293515358, + "grad_norm": 0.5805645630897621, + "learning_rate": 0.00014447941764201443, + "loss": 0.9456, + "step": 9237 + }, + { + "epoch": 1.9705631399317407, + "grad_norm": 0.5091472036242358, + "learning_rate": 0.00014447438013052597, + "loss": 0.9841, + "step": 9238 + }, + { + "epoch": 1.9707764505119454, + "grad_norm": 0.3541407525743451, + "learning_rate": 0.00014446934188950885, + "loss": 0.9436, + "step": 9239 + }, + { + "epoch": 1.97098976109215, + "grad_norm": 0.4252545453689635, + "learning_rate": 0.00014446430291902005, + "loss": 0.9323, + "step": 9240 + }, + { + "epoch": 1.971203071672355, + "grad_norm": 0.4193328111627544, + "learning_rate": 0.00014445926321911655, + "loss": 0.9575, + "step": 9241 + }, + { + "epoch": 1.9714163822525599, + "grad_norm": 0.2814628051477235, + "learning_rate": 0.0001444542227898554, + "loss": 0.936, + "step": 9242 + }, + { + "epoch": 1.9716296928327646, + "grad_norm": 0.3650361809152761, + "learning_rate": 0.00014444918163129366, + "loss": 0.9212, + "step": 9243 + }, + { + "epoch": 1.9718430034129693, + "grad_norm": 0.4121596460436324, + "learning_rate": 0.00014444413974348832, + "loss": 0.9554, + "step": 9244 + }, + { + "epoch": 1.972056313993174, + "grad_norm": 0.32217786263752796, + "learning_rate": 0.00014443909712649647, + "loss": 0.931, + "step": 9245 + }, + { + "epoch": 1.9722696245733788, + "grad_norm": 0.2863623047430184, + "learning_rate": 0.00014443405378037515, + "loss": 0.951, + "step": 9246 + }, + { + "epoch": 1.9724829351535837, + "grad_norm": 0.2660741481805442, + "learning_rate": 0.00014442900970518144, + "loss": 0.9586, + "step": 9247 + }, + { + "epoch": 1.9726962457337884, + "grad_norm": 0.25754040541622253, + "learning_rate": 0.00014442396490097238, + "loss": 0.9437, + "step": 9248 + }, + { + "epoch": 1.9729095563139931, + "grad_norm": 0.3486755524344042, + "learning_rate": 0.00014441891936780508, + "loss": 0.9268, + "step": 9249 + }, + { + "epoch": 1.9731228668941978, + "grad_norm": 0.4218711889981856, + "learning_rate": 0.00014441387310573664, + "loss": 0.9455, + "step": 9250 + }, + { + "epoch": 1.9733361774744027, + "grad_norm": 0.4419005996789816, + "learning_rate": 0.00014440882611482412, + "loss": 0.924, + "step": 9251 + }, + { + "epoch": 1.9735494880546076, + "grad_norm": 0.41793443803166, + "learning_rate": 0.00014440377839512465, + "loss": 0.9721, + "step": 9252 + }, + { + "epoch": 1.9737627986348123, + "grad_norm": 0.4368771264603761, + "learning_rate": 0.00014439872994669536, + "loss": 0.9854, + "step": 9253 + }, + { + "epoch": 1.973976109215017, + "grad_norm": 0.4029802407057381, + "learning_rate": 0.00014439368076959335, + "loss": 0.9067, + "step": 9254 + }, + { + "epoch": 1.974189419795222, + "grad_norm": 0.35499272900939544, + "learning_rate": 0.00014438863086387575, + "loss": 0.9599, + "step": 9255 + }, + { + "epoch": 1.9744027303754266, + "grad_norm": 0.26855274550647895, + "learning_rate": 0.0001443835802295997, + "loss": 0.9397, + "step": 9256 + }, + { + "epoch": 1.9746160409556315, + "grad_norm": 0.30030528177956267, + "learning_rate": 0.0001443785288668224, + "loss": 0.9222, + "step": 9257 + }, + { + "epoch": 1.9748293515358362, + "grad_norm": 0.4395247647426922, + "learning_rate": 0.00014437347677560093, + "loss": 0.95, + "step": 9258 + }, + { + "epoch": 1.9750426621160408, + "grad_norm": 0.4058678387499671, + "learning_rate": 0.0001443684239559925, + "loss": 0.9386, + "step": 9259 + }, + { + "epoch": 1.9752559726962458, + "grad_norm": 0.3430741919151726, + "learning_rate": 0.00014436337040805425, + "loss": 0.9794, + "step": 9260 + }, + { + "epoch": 1.9754692832764507, + "grad_norm": 0.36872668564580335, + "learning_rate": 0.00014435831613184342, + "loss": 0.9263, + "step": 9261 + }, + { + "epoch": 1.9756825938566553, + "grad_norm": 0.3741922456256922, + "learning_rate": 0.0001443532611274171, + "loss": 0.9266, + "step": 9262 + }, + { + "epoch": 1.97589590443686, + "grad_norm": 0.3702363544784142, + "learning_rate": 0.0001443482053948326, + "loss": 0.9713, + "step": 9263 + }, + { + "epoch": 1.9761092150170647, + "grad_norm": 0.36784678285982436, + "learning_rate": 0.00014434314893414706, + "loss": 0.9672, + "step": 9264 + }, + { + "epoch": 1.9763225255972696, + "grad_norm": 0.26468254636320154, + "learning_rate": 0.0001443380917454177, + "loss": 0.9111, + "step": 9265 + }, + { + "epoch": 1.9765358361774745, + "grad_norm": 0.24455857401587955, + "learning_rate": 0.00014433303382870176, + "loss": 0.9514, + "step": 9266 + }, + { + "epoch": 1.9767491467576792, + "grad_norm": 0.2610945702558467, + "learning_rate": 0.00014432797518405645, + "loss": 0.8989, + "step": 9267 + }, + { + "epoch": 1.976962457337884, + "grad_norm": 0.3233292915281001, + "learning_rate": 0.00014432291581153902, + "loss": 0.9599, + "step": 9268 + }, + { + "epoch": 1.9771757679180886, + "grad_norm": 0.41191958671814266, + "learning_rate": 0.0001443178557112067, + "loss": 0.9652, + "step": 9269 + }, + { + "epoch": 1.9773890784982935, + "grad_norm": 0.43001179672004763, + "learning_rate": 0.0001443127948831168, + "loss": 0.9307, + "step": 9270 + }, + { + "epoch": 1.9776023890784984, + "grad_norm": 0.4720088778228061, + "learning_rate": 0.00014430773332732653, + "loss": 0.9341, + "step": 9271 + }, + { + "epoch": 1.977815699658703, + "grad_norm": 0.3829926994741862, + "learning_rate": 0.00014430267104389316, + "loss": 0.9519, + "step": 9272 + }, + { + "epoch": 1.9780290102389078, + "grad_norm": 0.2856828554459695, + "learning_rate": 0.00014429760803287401, + "loss": 0.9205, + "step": 9273 + }, + { + "epoch": 1.9782423208191127, + "grad_norm": 0.3160545848389836, + "learning_rate": 0.00014429254429432632, + "loss": 0.9476, + "step": 9274 + }, + { + "epoch": 1.9784556313993176, + "grad_norm": 0.2918190362979895, + "learning_rate": 0.00014428747982830743, + "loss": 0.9566, + "step": 9275 + }, + { + "epoch": 1.9786689419795223, + "grad_norm": 0.2587007569926521, + "learning_rate": 0.0001442824146348746, + "loss": 0.9564, + "step": 9276 + }, + { + "epoch": 1.978882252559727, + "grad_norm": 0.3281201609839211, + "learning_rate": 0.0001442773487140852, + "loss": 0.9422, + "step": 9277 + }, + { + "epoch": 1.9790955631399316, + "grad_norm": 0.3653766629568342, + "learning_rate": 0.00014427228206599649, + "loss": 0.9542, + "step": 9278 + }, + { + "epoch": 1.9793088737201365, + "grad_norm": 0.4399186254274377, + "learning_rate": 0.00014426721469066583, + "loss": 0.9601, + "step": 9279 + }, + { + "epoch": 1.9795221843003414, + "grad_norm": 0.4927564144625944, + "learning_rate": 0.00014426214658815056, + "loss": 0.944, + "step": 9280 + }, + { + "epoch": 1.9797354948805461, + "grad_norm": 0.47670471667208913, + "learning_rate": 0.000144257077758508, + "loss": 0.9297, + "step": 9281 + }, + { + "epoch": 1.9799488054607508, + "grad_norm": 0.4375555383467361, + "learning_rate": 0.00014425200820179557, + "loss": 0.9047, + "step": 9282 + }, + { + "epoch": 1.9801621160409555, + "grad_norm": 0.33812453482884314, + "learning_rate": 0.00014424693791807055, + "loss": 0.942, + "step": 9283 + }, + { + "epoch": 1.9803754266211604, + "grad_norm": 0.3149831705757544, + "learning_rate": 0.00014424186690739036, + "loss": 0.9246, + "step": 9284 + }, + { + "epoch": 1.9805887372013653, + "grad_norm": 0.36510444461049035, + "learning_rate": 0.00014423679516981235, + "loss": 0.9947, + "step": 9285 + }, + { + "epoch": 1.98080204778157, + "grad_norm": 0.38898388927778266, + "learning_rate": 0.00014423172270539396, + "loss": 0.9273, + "step": 9286 + }, + { + "epoch": 1.9810153583617747, + "grad_norm": 0.38655492598391444, + "learning_rate": 0.00014422664951419252, + "loss": 0.9693, + "step": 9287 + }, + { + "epoch": 1.9812286689419796, + "grad_norm": 0.43502915007264664, + "learning_rate": 0.00014422157559626546, + "loss": 0.9502, + "step": 9288 + }, + { + "epoch": 1.9814419795221843, + "grad_norm": 0.3984624557333933, + "learning_rate": 0.0001442165009516702, + "loss": 0.9234, + "step": 9289 + }, + { + "epoch": 1.9816552901023892, + "grad_norm": 0.42670407671089816, + "learning_rate": 0.00014421142558046414, + "loss": 0.9388, + "step": 9290 + }, + { + "epoch": 1.9818686006825939, + "grad_norm": 0.3899197354855618, + "learning_rate": 0.00014420634948270473, + "loss": 0.9571, + "step": 9291 + }, + { + "epoch": 1.9820819112627985, + "grad_norm": 0.3551922746818422, + "learning_rate": 0.0001442012726584494, + "loss": 0.9482, + "step": 9292 + }, + { + "epoch": 1.9822952218430034, + "grad_norm": 0.3364372823363188, + "learning_rate": 0.00014419619510775558, + "loss": 0.9606, + "step": 9293 + }, + { + "epoch": 1.9825085324232083, + "grad_norm": 0.2838403999436151, + "learning_rate": 0.0001441911168306807, + "loss": 0.923, + "step": 9294 + }, + { + "epoch": 1.982721843003413, + "grad_norm": 0.24293722785168095, + "learning_rate": 0.0001441860378272823, + "loss": 0.906, + "step": 9295 + }, + { + "epoch": 1.9829351535836177, + "grad_norm": 0.245885648575, + "learning_rate": 0.0001441809580976178, + "loss": 0.9212, + "step": 9296 + }, + { + "epoch": 1.9831484641638224, + "grad_norm": 0.24873268721981334, + "learning_rate": 0.00014417587764174467, + "loss": 0.9293, + "step": 9297 + }, + { + "epoch": 1.9833617747440273, + "grad_norm": 0.26995032177224887, + "learning_rate": 0.00014417079645972044, + "loss": 0.9305, + "step": 9298 + }, + { + "epoch": 1.9835750853242322, + "grad_norm": 0.2524104098379702, + "learning_rate": 0.00014416571455160248, + "loss": 0.9624, + "step": 9299 + }, + { + "epoch": 1.983788395904437, + "grad_norm": 0.2904949963213432, + "learning_rate": 0.00014416063191744844, + "loss": 0.9606, + "step": 9300 + }, + { + "epoch": 1.9840017064846416, + "grad_norm": 0.2783288192540694, + "learning_rate": 0.0001441555485573158, + "loss": 0.9639, + "step": 9301 + }, + { + "epoch": 1.9842150170648463, + "grad_norm": 0.2958481173231643, + "learning_rate": 0.000144150464471262, + "loss": 0.9308, + "step": 9302 + }, + { + "epoch": 1.9844283276450512, + "grad_norm": 0.308863464677603, + "learning_rate": 0.0001441453796593446, + "loss": 0.9659, + "step": 9303 + }, + { + "epoch": 1.984641638225256, + "grad_norm": 0.2924163095414211, + "learning_rate": 0.0001441402941216212, + "loss": 0.9326, + "step": 9304 + }, + { + "epoch": 1.9848549488054608, + "grad_norm": 0.25713122839862096, + "learning_rate": 0.0001441352078581493, + "loss": 0.9717, + "step": 9305 + }, + { + "epoch": 1.9850682593856654, + "grad_norm": 0.3038650361716289, + "learning_rate": 0.00014413012086898641, + "loss": 0.9814, + "step": 9306 + }, + { + "epoch": 1.9852815699658704, + "grad_norm": 0.40207179877602345, + "learning_rate": 0.00014412503315419013, + "loss": 0.9646, + "step": 9307 + }, + { + "epoch": 1.985494880546075, + "grad_norm": 0.5037254584226146, + "learning_rate": 0.000144119944713818, + "loss": 0.9436, + "step": 9308 + }, + { + "epoch": 1.98570819112628, + "grad_norm": 0.588069666345415, + "learning_rate": 0.00014411485554792765, + "loss": 0.935, + "step": 9309 + }, + { + "epoch": 1.9859215017064846, + "grad_norm": 0.6656062860349988, + "learning_rate": 0.00014410976565657662, + "loss": 0.9746, + "step": 9310 + }, + { + "epoch": 1.9861348122866893, + "grad_norm": 0.7214442570543279, + "learning_rate": 0.0001441046750398225, + "loss": 0.9488, + "step": 9311 + }, + { + "epoch": 1.9863481228668942, + "grad_norm": 0.6915937903180299, + "learning_rate": 0.00014409958369772293, + "loss": 0.9654, + "step": 9312 + }, + { + "epoch": 1.9865614334470991, + "grad_norm": 0.5430521432179248, + "learning_rate": 0.00014409449163033547, + "loss": 0.9042, + "step": 9313 + }, + { + "epoch": 1.9867747440273038, + "grad_norm": 0.3472171683949897, + "learning_rate": 0.00014408939883771776, + "loss": 0.9147, + "step": 9314 + }, + { + "epoch": 1.9869880546075085, + "grad_norm": 0.24400789161212522, + "learning_rate": 0.00014408430531992743, + "loss": 0.9454, + "step": 9315 + }, + { + "epoch": 1.9872013651877132, + "grad_norm": 0.42735250490397925, + "learning_rate": 0.00014407921107702213, + "loss": 0.9426, + "step": 9316 + }, + { + "epoch": 1.987414675767918, + "grad_norm": 0.5260258954496522, + "learning_rate": 0.00014407411610905946, + "loss": 0.9015, + "step": 9317 + }, + { + "epoch": 1.987627986348123, + "grad_norm": 0.5381066604990161, + "learning_rate": 0.0001440690204160971, + "loss": 0.9507, + "step": 9318 + }, + { + "epoch": 1.9878412969283277, + "grad_norm": 0.5385289586810409, + "learning_rate": 0.00014406392399819268, + "loss": 0.9483, + "step": 9319 + }, + { + "epoch": 1.9880546075085324, + "grad_norm": 0.5781626028107313, + "learning_rate": 0.00014405882685540388, + "loss": 0.9623, + "step": 9320 + }, + { + "epoch": 1.988267918088737, + "grad_norm": 0.49619098922013255, + "learning_rate": 0.00014405372898778837, + "loss": 0.9578, + "step": 9321 + }, + { + "epoch": 1.988481228668942, + "grad_norm": 0.351416381914116, + "learning_rate": 0.00014404863039540385, + "loss": 0.9594, + "step": 9322 + }, + { + "epoch": 1.9886945392491469, + "grad_norm": 0.2643919604580417, + "learning_rate": 0.00014404353107830802, + "loss": 0.949, + "step": 9323 + }, + { + "epoch": 1.9889078498293515, + "grad_norm": 0.23343528635591018, + "learning_rate": 0.00014403843103655853, + "loss": 0.926, + "step": 9324 + }, + { + "epoch": 1.9891211604095562, + "grad_norm": 0.27122797787271674, + "learning_rate": 0.00014403333027021312, + "loss": 0.9058, + "step": 9325 + }, + { + "epoch": 1.9893344709897611, + "grad_norm": 0.2943497080586718, + "learning_rate": 0.0001440282287793295, + "loss": 0.9314, + "step": 9326 + }, + { + "epoch": 1.989547781569966, + "grad_norm": 0.28750166326967475, + "learning_rate": 0.0001440231265639654, + "loss": 0.9674, + "step": 9327 + }, + { + "epoch": 1.9897610921501707, + "grad_norm": 0.29991375274694326, + "learning_rate": 0.00014401802362417856, + "loss": 0.9771, + "step": 9328 + }, + { + "epoch": 1.9899744027303754, + "grad_norm": 0.3116382318202547, + "learning_rate": 0.0001440129199600267, + "loss": 0.9393, + "step": 9329 + }, + { + "epoch": 1.99018771331058, + "grad_norm": 0.2678585662462422, + "learning_rate": 0.00014400781557156756, + "loss": 0.9557, + "step": 9330 + }, + { + "epoch": 1.990401023890785, + "grad_norm": 0.22209160599116917, + "learning_rate": 0.0001440027104588589, + "loss": 0.9444, + "step": 9331 + }, + { + "epoch": 1.99061433447099, + "grad_norm": 0.239241444122219, + "learning_rate": 0.00014399760462195852, + "loss": 0.9573, + "step": 9332 + }, + { + "epoch": 1.9908276450511946, + "grad_norm": 0.23576420673074333, + "learning_rate": 0.0001439924980609242, + "loss": 0.9584, + "step": 9333 + }, + { + "epoch": 1.9910409556313993, + "grad_norm": 0.21243170147892054, + "learning_rate": 0.00014398739077581363, + "loss": 0.9474, + "step": 9334 + }, + { + "epoch": 1.991254266211604, + "grad_norm": 0.3115542895592033, + "learning_rate": 0.00014398228276668468, + "loss": 0.9683, + "step": 9335 + }, + { + "epoch": 1.9914675767918089, + "grad_norm": 0.29616347865227677, + "learning_rate": 0.0001439771740335951, + "loss": 0.9469, + "step": 9336 + }, + { + "epoch": 1.9916808873720138, + "grad_norm": 0.20989362180718812, + "learning_rate": 0.00014397206457660272, + "loss": 0.9448, + "step": 9337 + }, + { + "epoch": 1.9918941979522184, + "grad_norm": 0.2781128866818214, + "learning_rate": 0.00014396695439576536, + "loss": 0.9663, + "step": 9338 + }, + { + "epoch": 1.9921075085324231, + "grad_norm": 0.3554035361907015, + "learning_rate": 0.00014396184349114088, + "loss": 0.9334, + "step": 9339 + }, + { + "epoch": 1.992320819112628, + "grad_norm": 0.27660432271994523, + "learning_rate": 0.00014395673186278702, + "loss": 0.9555, + "step": 9340 + }, + { + "epoch": 1.9925341296928327, + "grad_norm": 0.2290437957461972, + "learning_rate": 0.0001439516195107617, + "loss": 0.9237, + "step": 9341 + }, + { + "epoch": 1.9927474402730376, + "grad_norm": 0.24362016054222388, + "learning_rate": 0.0001439465064351227, + "loss": 0.9146, + "step": 9342 + }, + { + "epoch": 1.9929607508532423, + "grad_norm": 0.2558525892408812, + "learning_rate": 0.0001439413926359279, + "loss": 0.9529, + "step": 9343 + }, + { + "epoch": 1.993174061433447, + "grad_norm": 0.2675530410580743, + "learning_rate": 0.00014393627811323516, + "loss": 0.9748, + "step": 9344 + }, + { + "epoch": 1.993387372013652, + "grad_norm": 0.24541593352878077, + "learning_rate": 0.00014393116286710237, + "loss": 0.9556, + "step": 9345 + }, + { + "epoch": 1.9936006825938568, + "grad_norm": 0.245771765237378, + "learning_rate": 0.00014392604689758743, + "loss": 0.9354, + "step": 9346 + }, + { + "epoch": 1.9938139931740615, + "grad_norm": 0.2062931147192723, + "learning_rate": 0.00014392093020474815, + "loss": 0.9257, + "step": 9347 + }, + { + "epoch": 1.9940273037542662, + "grad_norm": 0.2484737804528796, + "learning_rate": 0.00014391581278864248, + "loss": 0.95, + "step": 9348 + }, + { + "epoch": 1.9942406143344709, + "grad_norm": 0.27003416946011544, + "learning_rate": 0.00014391069464932832, + "loss": 0.9521, + "step": 9349 + }, + { + "epoch": 1.9944539249146758, + "grad_norm": 0.2729521532066148, + "learning_rate": 0.00014390557578686355, + "loss": 0.9385, + "step": 9350 + }, + { + "epoch": 1.9946672354948807, + "grad_norm": 0.274358440326392, + "learning_rate": 0.00014390045620130614, + "loss": 0.9433, + "step": 9351 + }, + { + "epoch": 1.9948805460750854, + "grad_norm": 0.28400287627471776, + "learning_rate": 0.00014389533589271398, + "loss": 0.9244, + "step": 9352 + }, + { + "epoch": 1.99509385665529, + "grad_norm": 0.35104975927050946, + "learning_rate": 0.00014389021486114505, + "loss": 0.9317, + "step": 9353 + }, + { + "epoch": 1.9953071672354947, + "grad_norm": 0.48786810333440783, + "learning_rate": 0.00014388509310665724, + "loss": 0.9109, + "step": 9354 + }, + { + "epoch": 1.9955204778156996, + "grad_norm": 0.5894122992892071, + "learning_rate": 0.00014387997062930853, + "loss": 0.9558, + "step": 9355 + }, + { + "epoch": 1.9957337883959045, + "grad_norm": 0.6421130137160073, + "learning_rate": 0.0001438748474291569, + "loss": 0.9473, + "step": 9356 + }, + { + "epoch": 1.9959470989761092, + "grad_norm": 0.602759144592708, + "learning_rate": 0.00014386972350626025, + "loss": 0.9188, + "step": 9357 + }, + { + "epoch": 1.996160409556314, + "grad_norm": 0.5068783158168539, + "learning_rate": 0.00014386459886067663, + "loss": 0.9495, + "step": 9358 + }, + { + "epoch": 1.9963737201365188, + "grad_norm": 0.39465801465880135, + "learning_rate": 0.00014385947349246402, + "loss": 0.9074, + "step": 9359 + }, + { + "epoch": 1.9965870307167235, + "grad_norm": 0.3395822675462725, + "learning_rate": 0.00014385434740168038, + "loss": 0.9516, + "step": 9360 + }, + { + "epoch": 1.9968003412969284, + "grad_norm": 0.24797696292900945, + "learning_rate": 0.00014384922058838373, + "loss": 0.9248, + "step": 9361 + }, + { + "epoch": 1.997013651877133, + "grad_norm": 0.31059563167457405, + "learning_rate": 0.00014384409305263202, + "loss": 0.9574, + "step": 9362 + }, + { + "epoch": 1.9972269624573378, + "grad_norm": 0.41253364863213937, + "learning_rate": 0.00014383896479448337, + "loss": 0.937, + "step": 9363 + }, + { + "epoch": 1.9974402730375427, + "grad_norm": 0.4272686644270285, + "learning_rate": 0.00014383383581399576, + "loss": 0.9501, + "step": 9364 + }, + { + "epoch": 1.9976535836177476, + "grad_norm": 0.4752813516001071, + "learning_rate": 0.00014382870611122725, + "loss": 0.9693, + "step": 9365 + }, + { + "epoch": 1.9978668941979523, + "grad_norm": 0.49485184388684184, + "learning_rate": 0.0001438235756862358, + "loss": 0.9265, + "step": 9366 + }, + { + "epoch": 1.998080204778157, + "grad_norm": 0.46450273051245455, + "learning_rate": 0.00014381844453907955, + "loss": 0.9269, + "step": 9367 + }, + { + "epoch": 1.9982935153583616, + "grad_norm": 0.34708653285742136, + "learning_rate": 0.0001438133126698165, + "loss": 0.9712, + "step": 9368 + }, + { + "epoch": 1.9985068259385665, + "grad_norm": 0.282308211997342, + "learning_rate": 0.00014380818007850478, + "loss": 0.953, + "step": 9369 + }, + { + "epoch": 1.9987201365187715, + "grad_norm": 0.2657530715205286, + "learning_rate": 0.0001438030467652024, + "loss": 0.9386, + "step": 9370 + }, + { + "epoch": 1.9989334470989761, + "grad_norm": 0.29012570578578845, + "learning_rate": 0.00014379791272996749, + "loss": 0.9417, + "step": 9371 + }, + { + "epoch": 1.9991467576791808, + "grad_norm": 0.31939223090734803, + "learning_rate": 0.00014379277797285813, + "loss": 0.9755, + "step": 9372 + }, + { + "epoch": 1.9993600682593855, + "grad_norm": 0.3266771956129687, + "learning_rate": 0.0001437876424939324, + "loss": 0.9419, + "step": 9373 + }, + { + "epoch": 1.9995733788395904, + "grad_norm": 0.40830292535015333, + "learning_rate": 0.00014378250629324844, + "loss": 0.953, + "step": 9374 + }, + { + "epoch": 1.9997866894197953, + "grad_norm": 0.4471027431519684, + "learning_rate": 0.00014377736937086434, + "loss": 0.9592, + "step": 9375 + }, + { + "epoch": 2.0, + "grad_norm": 0.4407282011841662, + "learning_rate": 0.0001437722317268382, + "loss": 0.9547, + "step": 9376 + }, + { + "epoch": 2.0002133105802047, + "grad_norm": 0.4294243540806939, + "learning_rate": 0.00014376709336122822, + "loss": 0.9404, + "step": 9377 + }, + { + "epoch": 2.0004266211604094, + "grad_norm": 0.4585609665069548, + "learning_rate": 0.0001437619542740925, + "loss": 0.9299, + "step": 9378 + }, + { + "epoch": 2.0006399317406145, + "grad_norm": 0.4559914653010462, + "learning_rate": 0.00014375681446548923, + "loss": 0.9466, + "step": 9379 + }, + { + "epoch": 2.000853242320819, + "grad_norm": 0.3604981344970652, + "learning_rate": 0.00014375167393547648, + "loss": 0.9329, + "step": 9380 + }, + { + "epoch": 2.001066552901024, + "grad_norm": 0.334326003799698, + "learning_rate": 0.0001437465326841125, + "loss": 0.9307, + "step": 9381 + }, + { + "epoch": 2.0012798634812285, + "grad_norm": 0.7541701080088434, + "learning_rate": 0.0001437413907114554, + "loss": 0.9205, + "step": 9382 + }, + { + "epoch": 2.0014931740614332, + "grad_norm": 0.3136066562337202, + "learning_rate": 0.0001437362480175634, + "loss": 0.9424, + "step": 9383 + }, + { + "epoch": 2.0017064846416384, + "grad_norm": 0.39807973135892566, + "learning_rate": 0.00014373110460249469, + "loss": 0.9287, + "step": 9384 + }, + { + "epoch": 2.001919795221843, + "grad_norm": 0.37089679636139616, + "learning_rate": 0.00014372596046630744, + "loss": 0.95, + "step": 9385 + }, + { + "epoch": 2.0021331058020477, + "grad_norm": 0.3138487113638542, + "learning_rate": 0.0001437208156090599, + "loss": 0.9165, + "step": 9386 + }, + { + "epoch": 2.0023464163822524, + "grad_norm": 0.3018283763904325, + "learning_rate": 0.00014371567003081026, + "loss": 0.9313, + "step": 9387 + }, + { + "epoch": 2.0025597269624575, + "grad_norm": 0.3644411588884058, + "learning_rate": 0.00014371052373161671, + "loss": 0.9485, + "step": 9388 + }, + { + "epoch": 2.0027730375426622, + "grad_norm": 0.39057950349651693, + "learning_rate": 0.00014370537671153755, + "loss": 0.9207, + "step": 9389 + }, + { + "epoch": 2.002986348122867, + "grad_norm": 0.3106449576277412, + "learning_rate": 0.00014370022897063097, + "loss": 0.958, + "step": 9390 + }, + { + "epoch": 2.0031996587030716, + "grad_norm": 0.2635871541223121, + "learning_rate": 0.00014369508050895524, + "loss": 0.9406, + "step": 9391 + }, + { + "epoch": 2.0034129692832763, + "grad_norm": 0.26942699434058764, + "learning_rate": 0.00014368993132656859, + "loss": 0.9269, + "step": 9392 + }, + { + "epoch": 2.0036262798634814, + "grad_norm": 0.24004159595777363, + "learning_rate": 0.00014368478142352928, + "loss": 0.9165, + "step": 9393 + }, + { + "epoch": 2.003839590443686, + "grad_norm": 0.33127823404571566, + "learning_rate": 0.0001436796307998956, + "loss": 0.9081, + "step": 9394 + }, + { + "epoch": 2.0040529010238908, + "grad_norm": 0.43104879669659046, + "learning_rate": 0.00014367447945572585, + "loss": 0.955, + "step": 9395 + }, + { + "epoch": 2.0042662116040955, + "grad_norm": 0.48396586696619703, + "learning_rate": 0.0001436693273910783, + "loss": 0.9331, + "step": 9396 + }, + { + "epoch": 2.0044795221843, + "grad_norm": 0.47363956964546494, + "learning_rate": 0.0001436641746060112, + "loss": 0.9342, + "step": 9397 + }, + { + "epoch": 2.0046928327645053, + "grad_norm": 0.48814705714066364, + "learning_rate": 0.0001436590211005829, + "loss": 0.9557, + "step": 9398 + }, + { + "epoch": 2.00490614334471, + "grad_norm": 0.41470064798798667, + "learning_rate": 0.00014365386687485173, + "loss": 0.9418, + "step": 9399 + }, + { + "epoch": 2.0051194539249146, + "grad_norm": 0.5108207728147242, + "learning_rate": 0.00014364871192887598, + "loss": 0.9528, + "step": 9400 + }, + { + "epoch": 2.0053327645051193, + "grad_norm": 0.5937434015570247, + "learning_rate": 0.00014364355626271397, + "loss": 0.9219, + "step": 9401 + }, + { + "epoch": 2.0055460750853245, + "grad_norm": 0.6460330590186595, + "learning_rate": 0.00014363839987642404, + "loss": 0.9589, + "step": 9402 + }, + { + "epoch": 2.005759385665529, + "grad_norm": 0.7013287435927795, + "learning_rate": 0.0001436332427700646, + "loss": 0.9698, + "step": 9403 + }, + { + "epoch": 2.005972696245734, + "grad_norm": 0.7218117508635415, + "learning_rate": 0.0001436280849436939, + "loss": 0.9206, + "step": 9404 + }, + { + "epoch": 2.0061860068259385, + "grad_norm": 0.741498066273056, + "learning_rate": 0.00014362292639737037, + "loss": 0.9171, + "step": 9405 + }, + { + "epoch": 2.006399317406143, + "grad_norm": 0.6832769728772838, + "learning_rate": 0.00014361776713115235, + "loss": 0.9326, + "step": 9406 + }, + { + "epoch": 2.0066126279863483, + "grad_norm": 0.4809876269400973, + "learning_rate": 0.00014361260714509821, + "loss": 0.9364, + "step": 9407 + }, + { + "epoch": 2.006825938566553, + "grad_norm": 0.26648860828604964, + "learning_rate": 0.00014360744643926636, + "loss": 0.9237, + "step": 9408 + }, + { + "epoch": 2.0070392491467577, + "grad_norm": 0.3841458837945765, + "learning_rate": 0.00014360228501371518, + "loss": 0.9184, + "step": 9409 + }, + { + "epoch": 2.0072525597269624, + "grad_norm": 0.5549411423289125, + "learning_rate": 0.00014359712286850308, + "loss": 0.9317, + "step": 9410 + }, + { + "epoch": 2.007465870307167, + "grad_norm": 0.5943734624499425, + "learning_rate": 0.00014359196000368848, + "loss": 0.9601, + "step": 9411 + }, + { + "epoch": 2.007679180887372, + "grad_norm": 0.5219729128650145, + "learning_rate": 0.00014358679641932978, + "loss": 0.9138, + "step": 9412 + }, + { + "epoch": 2.007892491467577, + "grad_norm": 0.47309944306245866, + "learning_rate": 0.00014358163211548538, + "loss": 0.9319, + "step": 9413 + }, + { + "epoch": 2.0081058020477816, + "grad_norm": 0.3972335004834826, + "learning_rate": 0.00014357646709221378, + "loss": 0.9192, + "step": 9414 + }, + { + "epoch": 2.0083191126279862, + "grad_norm": 0.29121172324504674, + "learning_rate": 0.00014357130134957336, + "loss": 0.9366, + "step": 9415 + }, + { + "epoch": 2.008532423208191, + "grad_norm": 0.3201437094904741, + "learning_rate": 0.0001435661348876226, + "loss": 0.8948, + "step": 9416 + }, + { + "epoch": 2.008745733788396, + "grad_norm": 0.4258959788191219, + "learning_rate": 0.00014356096770641998, + "loss": 0.9118, + "step": 9417 + }, + { + "epoch": 2.0089590443686007, + "grad_norm": 0.5147057384117028, + "learning_rate": 0.0001435557998060239, + "loss": 0.933, + "step": 9418 + }, + { + "epoch": 2.0091723549488054, + "grad_norm": 0.4315002863846085, + "learning_rate": 0.00014355063118649288, + "loss": 0.9522, + "step": 9419 + }, + { + "epoch": 2.00938566552901, + "grad_norm": 0.2641179440198599, + "learning_rate": 0.00014354546184788545, + "loss": 0.9158, + "step": 9420 + }, + { + "epoch": 2.0095989761092152, + "grad_norm": 0.33494970405916696, + "learning_rate": 0.00014354029179026, + "loss": 0.9365, + "step": 9421 + }, + { + "epoch": 2.00981228668942, + "grad_norm": 0.32912934314623105, + "learning_rate": 0.00014353512101367507, + "loss": 0.9365, + "step": 9422 + }, + { + "epoch": 2.0100255972696246, + "grad_norm": 0.3247083103166259, + "learning_rate": 0.00014352994951818921, + "loss": 0.9386, + "step": 9423 + }, + { + "epoch": 2.0102389078498293, + "grad_norm": 1.2767028958332027, + "learning_rate": 0.00014352477730386092, + "loss": 0.9417, + "step": 9424 + }, + { + "epoch": 2.010452218430034, + "grad_norm": 0.26766017302545664, + "learning_rate": 0.00014351960437074867, + "loss": 0.9679, + "step": 9425 + }, + { + "epoch": 2.010665529010239, + "grad_norm": 0.3652065501938564, + "learning_rate": 0.00014351443071891103, + "loss": 0.9462, + "step": 9426 + }, + { + "epoch": 2.010878839590444, + "grad_norm": 0.4325671486137798, + "learning_rate": 0.00014350925634840653, + "loss": 0.9234, + "step": 9427 + }, + { + "epoch": 2.0110921501706485, + "grad_norm": 0.5489498239449806, + "learning_rate": 0.00014350408125929378, + "loss": 0.922, + "step": 9428 + }, + { + "epoch": 2.011305460750853, + "grad_norm": 1.0282019919816632, + "learning_rate": 0.00014349890545163122, + "loss": 1.0021, + "step": 9429 + }, + { + "epoch": 2.011518771331058, + "grad_norm": 0.5174703419316878, + "learning_rate": 0.0001434937289254775, + "loss": 0.9232, + "step": 9430 + }, + { + "epoch": 2.011732081911263, + "grad_norm": 0.431549245734597, + "learning_rate": 0.0001434885516808912, + "loss": 0.9436, + "step": 9431 + }, + { + "epoch": 2.0119453924914676, + "grad_norm": 0.3663552443918245, + "learning_rate": 0.00014348337371793084, + "loss": 0.9495, + "step": 9432 + }, + { + "epoch": 2.0121587030716723, + "grad_norm": 0.38726378483330864, + "learning_rate": 0.00014347819503665504, + "loss": 0.9241, + "step": 9433 + }, + { + "epoch": 2.012372013651877, + "grad_norm": 0.39464043259191667, + "learning_rate": 0.00014347301563712238, + "loss": 0.9575, + "step": 9434 + }, + { + "epoch": 2.0125853242320817, + "grad_norm": 0.40482546449484763, + "learning_rate": 0.0001434678355193915, + "loss": 0.9601, + "step": 9435 + }, + { + "epoch": 2.012798634812287, + "grad_norm": 0.4174872456036496, + "learning_rate": 0.00014346265468352098, + "loss": 0.9604, + "step": 9436 + }, + { + "epoch": 2.0130119453924915, + "grad_norm": 0.5380552339384069, + "learning_rate": 0.00014345747312956945, + "loss": 0.9223, + "step": 9437 + }, + { + "epoch": 2.013225255972696, + "grad_norm": 0.6358264111980968, + "learning_rate": 0.00014345229085759557, + "loss": 0.9321, + "step": 9438 + }, + { + "epoch": 2.013438566552901, + "grad_norm": 0.6860182599012349, + "learning_rate": 0.00014344710786765797, + "loss": 0.9411, + "step": 9439 + }, + { + "epoch": 2.013651877133106, + "grad_norm": 0.7789223535085157, + "learning_rate": 0.00014344192415981523, + "loss": 0.922, + "step": 9440 + }, + { + "epoch": 2.0138651877133107, + "grad_norm": 0.7381486068476596, + "learning_rate": 0.00014343673973412607, + "loss": 0.9582, + "step": 9441 + }, + { + "epoch": 2.0140784982935154, + "grad_norm": 0.633346392640999, + "learning_rate": 0.00014343155459064914, + "loss": 0.956, + "step": 9442 + }, + { + "epoch": 2.01429180887372, + "grad_norm": 0.5226452378513301, + "learning_rate": 0.00014342636872944308, + "loss": 0.9546, + "step": 9443 + }, + { + "epoch": 2.0145051194539247, + "grad_norm": 0.46759714933390706, + "learning_rate": 0.0001434211821505666, + "loss": 0.9501, + "step": 9444 + }, + { + "epoch": 2.01471843003413, + "grad_norm": 0.4445870212088351, + "learning_rate": 0.00014341599485407839, + "loss": 0.9519, + "step": 9445 + }, + { + "epoch": 2.0149317406143346, + "grad_norm": 0.389278736941088, + "learning_rate": 0.00014341080684003712, + "loss": 0.9741, + "step": 9446 + }, + { + "epoch": 2.0151450511945392, + "grad_norm": 0.36751644219630425, + "learning_rate": 0.00014340561810850152, + "loss": 0.9534, + "step": 9447 + }, + { + "epoch": 2.015358361774744, + "grad_norm": 1.024585564944131, + "learning_rate": 0.00014340042865953025, + "loss": 0.988, + "step": 9448 + }, + { + "epoch": 2.0155716723549486, + "grad_norm": 0.3024213632579131, + "learning_rate": 0.0001433952384931821, + "loss": 0.9777, + "step": 9449 + }, + { + "epoch": 2.0157849829351537, + "grad_norm": 0.34161434662992846, + "learning_rate": 0.00014339004760951574, + "loss": 0.9038, + "step": 9450 + }, + { + "epoch": 2.0159982935153584, + "grad_norm": 0.32992020809502287, + "learning_rate": 0.00014338485600858992, + "loss": 0.9328, + "step": 9451 + }, + { + "epoch": 2.016211604095563, + "grad_norm": 0.3060240583135632, + "learning_rate": 0.0001433796636904634, + "loss": 0.9443, + "step": 9452 + }, + { + "epoch": 2.016424914675768, + "grad_norm": 0.29436296326933764, + "learning_rate": 0.00014337447065519493, + "loss": 0.9545, + "step": 9453 + }, + { + "epoch": 2.016638225255973, + "grad_norm": 0.3243468418254125, + "learning_rate": 0.00014336927690284325, + "loss": 0.966, + "step": 9454 + }, + { + "epoch": 2.0168515358361776, + "grad_norm": 0.34037020148202907, + "learning_rate": 0.00014336408243346713, + "loss": 0.9369, + "step": 9455 + }, + { + "epoch": 2.0170648464163823, + "grad_norm": 0.3520099017594629, + "learning_rate": 0.00014335888724712538, + "loss": 0.9149, + "step": 9456 + }, + { + "epoch": 2.017278156996587, + "grad_norm": 0.3264345149161542, + "learning_rate": 0.0001433536913438767, + "loss": 0.918, + "step": 9457 + }, + { + "epoch": 2.0174914675767917, + "grad_norm": 0.3534256741543651, + "learning_rate": 0.00014334849472377997, + "loss": 0.9597, + "step": 9458 + }, + { + "epoch": 2.017704778156997, + "grad_norm": 0.3767385071714251, + "learning_rate": 0.00014334329738689396, + "loss": 0.9443, + "step": 9459 + }, + { + "epoch": 2.0179180887372015, + "grad_norm": 0.39315990538526663, + "learning_rate": 0.00014333809933327749, + "loss": 0.9217, + "step": 9460 + }, + { + "epoch": 2.018131399317406, + "grad_norm": 0.4322196952851767, + "learning_rate": 0.0001433329005629893, + "loss": 0.9356, + "step": 9461 + }, + { + "epoch": 2.018344709897611, + "grad_norm": 0.2773524763792089, + "learning_rate": 0.0001433277010760883, + "loss": 0.9167, + "step": 9462 + }, + { + "epoch": 2.0185580204778155, + "grad_norm": 0.3479851857017282, + "learning_rate": 0.00014332250087263333, + "loss": 0.9448, + "step": 9463 + }, + { + "epoch": 2.0187713310580206, + "grad_norm": 0.3210653276126966, + "learning_rate": 0.00014331729995268317, + "loss": 0.9258, + "step": 9464 + }, + { + "epoch": 2.0189846416382253, + "grad_norm": 0.3159752696150682, + "learning_rate": 0.0001433120983162967, + "loss": 0.933, + "step": 9465 + }, + { + "epoch": 2.01919795221843, + "grad_norm": 0.2786451513434736, + "learning_rate": 0.00014330689596353278, + "loss": 0.9588, + "step": 9466 + }, + { + "epoch": 2.0194112627986347, + "grad_norm": 0.3519120208249946, + "learning_rate": 0.00014330169289445026, + "loss": 0.9432, + "step": 9467 + }, + { + "epoch": 2.0196245733788394, + "grad_norm": 0.4409487715028318, + "learning_rate": 0.00014329648910910804, + "loss": 0.9113, + "step": 9468 + }, + { + "epoch": 2.0198378839590445, + "grad_norm": 0.36008466055063176, + "learning_rate": 0.00014329128460756495, + "loss": 0.9312, + "step": 9469 + }, + { + "epoch": 2.020051194539249, + "grad_norm": 0.38357942788938054, + "learning_rate": 0.00014328607938987993, + "loss": 0.9238, + "step": 9470 + }, + { + "epoch": 2.020264505119454, + "grad_norm": 0.5429090765230311, + "learning_rate": 0.00014328087345611185, + "loss": 0.924, + "step": 9471 + }, + { + "epoch": 2.0204778156996586, + "grad_norm": 1.0941680675856589, + "learning_rate": 0.0001432756668063196, + "loss": 0.942, + "step": 9472 + }, + { + "epoch": 2.0206911262798637, + "grad_norm": 0.34501286917469576, + "learning_rate": 0.00014327045944056218, + "loss": 0.9012, + "step": 9473 + }, + { + "epoch": 2.0209044368600684, + "grad_norm": 0.42068676701623725, + "learning_rate": 0.00014326525135889838, + "loss": 0.9321, + "step": 9474 + }, + { + "epoch": 2.021117747440273, + "grad_norm": 0.5797164087415017, + "learning_rate": 0.00014326004256138726, + "loss": 0.9395, + "step": 9475 + }, + { + "epoch": 2.0213310580204777, + "grad_norm": 0.7278595883853477, + "learning_rate": 0.00014325483304808765, + "loss": 0.9408, + "step": 9476 + }, + { + "epoch": 2.0215443686006824, + "grad_norm": 0.897165966779387, + "learning_rate": 0.00014324962281905853, + "loss": 0.9495, + "step": 9477 + }, + { + "epoch": 2.0217576791808876, + "grad_norm": 0.9345390838285159, + "learning_rate": 0.0001432444118743589, + "loss": 0.9383, + "step": 9478 + }, + { + "epoch": 2.0219709897610922, + "grad_norm": 0.9595630198870577, + "learning_rate": 0.00014323920021404765, + "loss": 0.9112, + "step": 9479 + }, + { + "epoch": 2.022184300341297, + "grad_norm": 1.0752779525476377, + "learning_rate": 0.0001432339878381838, + "loss": 0.9882, + "step": 9480 + }, + { + "epoch": 2.0223976109215016, + "grad_norm": 0.7256142482367999, + "learning_rate": 0.0001432287747468263, + "loss": 1.0069, + "step": 9481 + }, + { + "epoch": 2.0226109215017063, + "grad_norm": 1.0756960093629893, + "learning_rate": 0.00014322356094003422, + "loss": 0.923, + "step": 9482 + }, + { + "epoch": 2.0228242320819114, + "grad_norm": 1.1587858149143198, + "learning_rate": 0.00014321834641786642, + "loss": 0.9555, + "step": 9483 + }, + { + "epoch": 2.023037542662116, + "grad_norm": 0.6965622306809521, + "learning_rate": 0.00014321313118038199, + "loss": 0.9529, + "step": 9484 + }, + { + "epoch": 2.023250853242321, + "grad_norm": 0.8768002524875226, + "learning_rate": 0.0001432079152276399, + "loss": 0.9497, + "step": 9485 + }, + { + "epoch": 2.0234641638225255, + "grad_norm": 1.0959749939657424, + "learning_rate": 0.00014320269855969917, + "loss": 0.953, + "step": 9486 + }, + { + "epoch": 2.02367747440273, + "grad_norm": 0.8384725105073408, + "learning_rate": 0.00014319748117661889, + "loss": 0.947, + "step": 9487 + }, + { + "epoch": 2.0238907849829353, + "grad_norm": 1.1365426361836233, + "learning_rate": 0.000143192263078458, + "loss": 0.9589, + "step": 9488 + }, + { + "epoch": 2.02410409556314, + "grad_norm": 0.5065377536136533, + "learning_rate": 0.0001431870442652756, + "loss": 0.9444, + "step": 9489 + }, + { + "epoch": 2.0243174061433447, + "grad_norm": 0.7806633224733557, + "learning_rate": 0.00014318182473713079, + "loss": 0.9323, + "step": 9490 + }, + { + "epoch": 2.0245307167235493, + "grad_norm": 0.9740729268407661, + "learning_rate": 0.00014317660449408252, + "loss": 0.9332, + "step": 9491 + }, + { + "epoch": 2.0247440273037545, + "grad_norm": 0.8308918357660754, + "learning_rate": 0.0001431713835361899, + "loss": 0.9565, + "step": 9492 + }, + { + "epoch": 2.024957337883959, + "grad_norm": 0.7036625417691269, + "learning_rate": 0.00014316616186351203, + "loss": 0.9565, + "step": 9493 + }, + { + "epoch": 2.025170648464164, + "grad_norm": 0.51424758067789, + "learning_rate": 0.00014316093947610797, + "loss": 0.9422, + "step": 9494 + }, + { + "epoch": 2.0253839590443685, + "grad_norm": 0.6388442200106447, + "learning_rate": 0.0001431557163740368, + "loss": 0.9502, + "step": 9495 + }, + { + "epoch": 2.025597269624573, + "grad_norm": 0.6059991775182277, + "learning_rate": 0.00014315049255735768, + "loss": 0.9485, + "step": 9496 + }, + { + "epoch": 2.0258105802047783, + "grad_norm": 0.6094083805821044, + "learning_rate": 0.00014314526802612965, + "loss": 0.9552, + "step": 9497 + }, + { + "epoch": 2.026023890784983, + "grad_norm": 0.6502199159564234, + "learning_rate": 0.00014314004278041185, + "loss": 0.9883, + "step": 9498 + }, + { + "epoch": 2.0262372013651877, + "grad_norm": 0.4695385351502634, + "learning_rate": 0.0001431348168202634, + "loss": 0.9416, + "step": 9499 + }, + { + "epoch": 2.0264505119453924, + "grad_norm": 0.3739702035728074, + "learning_rate": 0.00014312959014574345, + "loss": 0.9488, + "step": 9500 + }, + { + "epoch": 2.026663822525597, + "grad_norm": 0.4748741895381898, + "learning_rate": 0.0001431243627569111, + "loss": 0.9245, + "step": 9501 + }, + { + "epoch": 2.026877133105802, + "grad_norm": 0.44737052060540033, + "learning_rate": 0.00014311913465382553, + "loss": 0.9258, + "step": 9502 + }, + { + "epoch": 2.027090443686007, + "grad_norm": 0.49121789738358773, + "learning_rate": 0.0001431139058365459, + "loss": 0.9553, + "step": 9503 + }, + { + "epoch": 2.0273037542662116, + "grad_norm": 0.44630589232031376, + "learning_rate": 0.0001431086763051314, + "loss": 0.9352, + "step": 9504 + }, + { + "epoch": 2.0275170648464163, + "grad_norm": 0.4029452531104011, + "learning_rate": 0.0001431034460596411, + "loss": 0.9343, + "step": 9505 + }, + { + "epoch": 2.0277303754266214, + "grad_norm": 0.2926626736070247, + "learning_rate": 0.0001430982151001343, + "loss": 0.9415, + "step": 9506 + }, + { + "epoch": 2.027943686006826, + "grad_norm": 0.32210368697986336, + "learning_rate": 0.0001430929834266701, + "loss": 0.9323, + "step": 9507 + }, + { + "epoch": 2.0281569965870307, + "grad_norm": 0.32086952198547347, + "learning_rate": 0.00014308775103930775, + "loss": 0.9333, + "step": 9508 + }, + { + "epoch": 2.0283703071672354, + "grad_norm": 0.25341214063290163, + "learning_rate": 0.0001430825179381064, + "loss": 0.8832, + "step": 9509 + }, + { + "epoch": 2.02858361774744, + "grad_norm": 0.34104972089118873, + "learning_rate": 0.00014307728412312534, + "loss": 0.9591, + "step": 9510 + }, + { + "epoch": 2.0287969283276452, + "grad_norm": 0.25693670312255634, + "learning_rate": 0.00014307204959442373, + "loss": 0.9283, + "step": 9511 + }, + { + "epoch": 2.02901023890785, + "grad_norm": 0.27726130819382244, + "learning_rate": 0.00014306681435206084, + "loss": 0.9338, + "step": 9512 + }, + { + "epoch": 2.0292235494880546, + "grad_norm": 0.27333752089602964, + "learning_rate": 0.00014306157839609587, + "loss": 0.9369, + "step": 9513 + }, + { + "epoch": 2.0294368600682593, + "grad_norm": 0.27279885637550527, + "learning_rate": 0.00014305634172658807, + "loss": 0.9139, + "step": 9514 + }, + { + "epoch": 2.029650170648464, + "grad_norm": 0.27661196918648273, + "learning_rate": 0.0001430511043435967, + "loss": 0.9589, + "step": 9515 + }, + { + "epoch": 2.029863481228669, + "grad_norm": 0.2269699652717018, + "learning_rate": 0.00014304586624718104, + "loss": 0.9502, + "step": 9516 + }, + { + "epoch": 2.030076791808874, + "grad_norm": 0.23188741581562725, + "learning_rate": 0.00014304062743740033, + "loss": 0.9299, + "step": 9517 + }, + { + "epoch": 2.0302901023890785, + "grad_norm": 0.22516899782252028, + "learning_rate": 0.00014303538791431386, + "loss": 0.9303, + "step": 9518 + }, + { + "epoch": 2.030503412969283, + "grad_norm": 0.26990006820999246, + "learning_rate": 0.00014303014767798095, + "loss": 0.9275, + "step": 9519 + }, + { + "epoch": 2.030716723549488, + "grad_norm": 0.23337345685884128, + "learning_rate": 0.00014302490672846082, + "loss": 0.9462, + "step": 9520 + }, + { + "epoch": 2.030930034129693, + "grad_norm": 0.19710986997443164, + "learning_rate": 0.0001430196650658128, + "loss": 0.9468, + "step": 9521 + }, + { + "epoch": 2.0311433447098977, + "grad_norm": 0.2186502802225287, + "learning_rate": 0.00014301442269009623, + "loss": 0.9101, + "step": 9522 + }, + { + "epoch": 2.0313566552901023, + "grad_norm": 0.2130043871554788, + "learning_rate": 0.00014300917960137044, + "loss": 0.931, + "step": 9523 + }, + { + "epoch": 2.031569965870307, + "grad_norm": 0.20820001220105766, + "learning_rate": 0.00014300393579969468, + "loss": 0.9332, + "step": 9524 + }, + { + "epoch": 2.031783276450512, + "grad_norm": 0.25434348581515187, + "learning_rate": 0.00014299869128512834, + "loss": 0.9263, + "step": 9525 + }, + { + "epoch": 2.031996587030717, + "grad_norm": 0.2622393847465953, + "learning_rate": 0.00014299344605773074, + "loss": 0.9241, + "step": 9526 + }, + { + "epoch": 2.0322098976109215, + "grad_norm": 0.2558319188379974, + "learning_rate": 0.00014298820011756126, + "loss": 0.9085, + "step": 9527 + }, + { + "epoch": 2.032423208191126, + "grad_norm": 0.20017575825817466, + "learning_rate": 0.00014298295346467922, + "loss": 0.8875, + "step": 9528 + }, + { + "epoch": 2.032636518771331, + "grad_norm": 0.27136451219474017, + "learning_rate": 0.000142977706099144, + "loss": 0.8977, + "step": 9529 + }, + { + "epoch": 2.032849829351536, + "grad_norm": 0.2934110596862786, + "learning_rate": 0.00014297245802101497, + "loss": 0.9221, + "step": 9530 + }, + { + "epoch": 2.0330631399317407, + "grad_norm": 0.2199478495721723, + "learning_rate": 0.00014296720923035155, + "loss": 0.9258, + "step": 9531 + }, + { + "epoch": 2.0332764505119454, + "grad_norm": 0.253178917375438, + "learning_rate": 0.00014296195972721305, + "loss": 0.9313, + "step": 9532 + }, + { + "epoch": 2.03348976109215, + "grad_norm": 0.2417106640535433, + "learning_rate": 0.00014295670951165897, + "loss": 0.9607, + "step": 9533 + }, + { + "epoch": 2.0337030716723548, + "grad_norm": 0.2506392615940946, + "learning_rate": 0.00014295145858374863, + "loss": 0.9414, + "step": 9534 + }, + { + "epoch": 2.03391638225256, + "grad_norm": 0.27010834207128637, + "learning_rate": 0.00014294620694354148, + "loss": 0.9245, + "step": 9535 + }, + { + "epoch": 2.0341296928327646, + "grad_norm": 0.2304355152741321, + "learning_rate": 0.00014294095459109696, + "loss": 0.9382, + "step": 9536 + }, + { + "epoch": 2.0343430034129693, + "grad_norm": 0.2463133729037904, + "learning_rate": 0.00014293570152647446, + "loss": 0.9643, + "step": 9537 + }, + { + "epoch": 2.034556313993174, + "grad_norm": 0.32903782687019095, + "learning_rate": 0.00014293044774973344, + "loss": 0.9552, + "step": 9538 + }, + { + "epoch": 2.0347696245733786, + "grad_norm": 0.33647603276813126, + "learning_rate": 0.00014292519326093334, + "loss": 0.926, + "step": 9539 + }, + { + "epoch": 2.0349829351535837, + "grad_norm": 0.3408230802984014, + "learning_rate": 0.00014291993806013364, + "loss": 0.9288, + "step": 9540 + }, + { + "epoch": 2.0351962457337884, + "grad_norm": 0.29744755159683295, + "learning_rate": 0.00014291468214739375, + "loss": 0.9273, + "step": 9541 + }, + { + "epoch": 2.035409556313993, + "grad_norm": 0.25195620552849485, + "learning_rate": 0.0001429094255227732, + "loss": 0.9597, + "step": 9542 + }, + { + "epoch": 2.035622866894198, + "grad_norm": 0.25872657638092156, + "learning_rate": 0.00014290416818633144, + "loss": 0.9096, + "step": 9543 + }, + { + "epoch": 2.035836177474403, + "grad_norm": 0.3581112194970803, + "learning_rate": 0.00014289891013812795, + "loss": 0.9165, + "step": 9544 + }, + { + "epoch": 2.0360494880546076, + "grad_norm": 0.43050927523780097, + "learning_rate": 0.00014289365137822225, + "loss": 0.9729, + "step": 9545 + }, + { + "epoch": 2.0362627986348123, + "grad_norm": 0.411289486929911, + "learning_rate": 0.00014288839190667378, + "loss": 0.9267, + "step": 9546 + }, + { + "epoch": 2.036476109215017, + "grad_norm": 0.3475374877275577, + "learning_rate": 0.00014288313172354216, + "loss": 0.943, + "step": 9547 + }, + { + "epoch": 2.0366894197952217, + "grad_norm": 0.21443584691739215, + "learning_rate": 0.0001428778708288868, + "loss": 0.9584, + "step": 9548 + }, + { + "epoch": 2.036902730375427, + "grad_norm": 0.2810788738647646, + "learning_rate": 0.0001428726092227673, + "loss": 0.9134, + "step": 9549 + }, + { + "epoch": 2.0371160409556315, + "grad_norm": 0.2705745800010507, + "learning_rate": 0.00014286734690524316, + "loss": 0.9425, + "step": 9550 + }, + { + "epoch": 2.037329351535836, + "grad_norm": 0.28473965410530216, + "learning_rate": 0.00014286208387637395, + "loss": 0.9683, + "step": 9551 + }, + { + "epoch": 2.037542662116041, + "grad_norm": 0.4103394052512772, + "learning_rate": 0.00014285682013621918, + "loss": 0.9802, + "step": 9552 + }, + { + "epoch": 2.0377559726962455, + "grad_norm": 0.34777719824048214, + "learning_rate": 0.00014285155568483844, + "loss": 0.9578, + "step": 9553 + }, + { + "epoch": 2.0379692832764507, + "grad_norm": 0.24745451974797064, + "learning_rate": 0.0001428462905222913, + "loss": 0.9045, + "step": 9554 + }, + { + "epoch": 2.0381825938566553, + "grad_norm": 0.3134942631112156, + "learning_rate": 0.0001428410246486373, + "loss": 0.9304, + "step": 9555 + }, + { + "epoch": 2.03839590443686, + "grad_norm": 0.3376837961192977, + "learning_rate": 0.00014283575806393607, + "loss": 0.9477, + "step": 9556 + }, + { + "epoch": 2.0386092150170647, + "grad_norm": 0.24886836211565527, + "learning_rate": 0.00014283049076824718, + "loss": 0.9406, + "step": 9557 + }, + { + "epoch": 2.03882252559727, + "grad_norm": 0.2406110624046966, + "learning_rate": 0.00014282522276163023, + "loss": 0.9523, + "step": 9558 + }, + { + "epoch": 2.0390358361774745, + "grad_norm": 0.28265007644553547, + "learning_rate": 0.00014281995404414482, + "loss": 0.9314, + "step": 9559 + }, + { + "epoch": 2.039249146757679, + "grad_norm": 0.24035284771954174, + "learning_rate": 0.0001428146846158506, + "loss": 0.9354, + "step": 9560 + }, + { + "epoch": 2.039462457337884, + "grad_norm": 0.22285873009080592, + "learning_rate": 0.0001428094144768071, + "loss": 0.9309, + "step": 9561 + }, + { + "epoch": 2.0396757679180886, + "grad_norm": 0.2143561679610959, + "learning_rate": 0.00014280414362707408, + "loss": 0.9692, + "step": 9562 + }, + { + "epoch": 2.0398890784982937, + "grad_norm": 0.21570745211172535, + "learning_rate": 0.0001427988720667111, + "loss": 0.909, + "step": 9563 + }, + { + "epoch": 2.0401023890784984, + "grad_norm": 0.2558737175035757, + "learning_rate": 0.00014279359979577782, + "loss": 0.9289, + "step": 9564 + }, + { + "epoch": 2.040315699658703, + "grad_norm": 0.2481708153950685, + "learning_rate": 0.00014278832681433395, + "loss": 0.956, + "step": 9565 + }, + { + "epoch": 2.0405290102389078, + "grad_norm": 0.22995350615997476, + "learning_rate": 0.00014278305312243904, + "loss": 0.9367, + "step": 9566 + }, + { + "epoch": 2.0407423208191124, + "grad_norm": 0.2469648369330329, + "learning_rate": 0.00014277777872015286, + "loss": 0.9202, + "step": 9567 + }, + { + "epoch": 2.0409556313993176, + "grad_norm": 0.24675265888291006, + "learning_rate": 0.00014277250360753506, + "loss": 0.9384, + "step": 9568 + }, + { + "epoch": 2.0411689419795223, + "grad_norm": 0.22143753168578062, + "learning_rate": 0.0001427672277846453, + "loss": 0.9301, + "step": 9569 + }, + { + "epoch": 2.041382252559727, + "grad_norm": 0.21956881006012496, + "learning_rate": 0.0001427619512515433, + "loss": 0.9386, + "step": 9570 + }, + { + "epoch": 2.0415955631399316, + "grad_norm": 0.23845972540938937, + "learning_rate": 0.0001427566740082888, + "loss": 0.9367, + "step": 9571 + }, + { + "epoch": 2.0418088737201363, + "grad_norm": 0.22149655921332556, + "learning_rate": 0.0001427513960549415, + "loss": 0.9594, + "step": 9572 + }, + { + "epoch": 2.0420221843003414, + "grad_norm": 0.20774484355029046, + "learning_rate": 0.00014274611739156104, + "loss": 0.9198, + "step": 9573 + }, + { + "epoch": 2.042235494880546, + "grad_norm": 0.24363550380475776, + "learning_rate": 0.00014274083801820721, + "loss": 0.916, + "step": 9574 + }, + { + "epoch": 2.042448805460751, + "grad_norm": 0.2199217693485953, + "learning_rate": 0.00014273555793493977, + "loss": 0.9523, + "step": 9575 + }, + { + "epoch": 2.0426621160409555, + "grad_norm": 0.23943447492290637, + "learning_rate": 0.00014273027714181843, + "loss": 0.9435, + "step": 9576 + }, + { + "epoch": 2.0428754266211606, + "grad_norm": 0.27088751507797093, + "learning_rate": 0.00014272499563890294, + "loss": 0.9373, + "step": 9577 + }, + { + "epoch": 2.0430887372013653, + "grad_norm": 0.2675202595486665, + "learning_rate": 0.00014271971342625308, + "loss": 0.9605, + "step": 9578 + }, + { + "epoch": 2.04330204778157, + "grad_norm": 0.3045424795756712, + "learning_rate": 0.0001427144305039286, + "loss": 0.9687, + "step": 9579 + }, + { + "epoch": 2.0435153583617747, + "grad_norm": 0.3743441108663971, + "learning_rate": 0.0001427091468719893, + "loss": 0.9522, + "step": 9580 + }, + { + "epoch": 2.0437286689419794, + "grad_norm": 0.42748423694954346, + "learning_rate": 0.0001427038625304949, + "loss": 0.9685, + "step": 9581 + }, + { + "epoch": 2.0439419795221845, + "grad_norm": 0.4096850033729156, + "learning_rate": 0.0001426985774795053, + "loss": 0.8956, + "step": 9582 + }, + { + "epoch": 2.044155290102389, + "grad_norm": 0.3707605880762592, + "learning_rate": 0.00014269329171908023, + "loss": 0.9345, + "step": 9583 + }, + { + "epoch": 2.044368600682594, + "grad_norm": 0.2728908845018406, + "learning_rate": 0.00014268800524927947, + "loss": 0.9365, + "step": 9584 + }, + { + "epoch": 2.0445819112627985, + "grad_norm": 0.2420044713806146, + "learning_rate": 0.00014268271807016291, + "loss": 0.9351, + "step": 9585 + }, + { + "epoch": 2.044795221843003, + "grad_norm": 0.27176023487480694, + "learning_rate": 0.00014267743018179035, + "loss": 0.9664, + "step": 9586 + }, + { + "epoch": 2.0450085324232083, + "grad_norm": 0.2664031014830022, + "learning_rate": 0.00014267214158422158, + "loss": 0.9213, + "step": 9587 + }, + { + "epoch": 2.045221843003413, + "grad_norm": 0.2834136444068629, + "learning_rate": 0.0001426668522775165, + "loss": 0.9289, + "step": 9588 + }, + { + "epoch": 2.0454351535836177, + "grad_norm": 0.30029862321787865, + "learning_rate": 0.00014266156226173493, + "loss": 0.9224, + "step": 9589 + }, + { + "epoch": 2.0456484641638224, + "grad_norm": 0.36637340275029856, + "learning_rate": 0.00014265627153693672, + "loss": 0.9479, + "step": 9590 + }, + { + "epoch": 2.0458617747440275, + "grad_norm": 0.29674255050961473, + "learning_rate": 0.00014265098010318174, + "loss": 0.9084, + "step": 9591 + }, + { + "epoch": 2.046075085324232, + "grad_norm": 0.2177550602361357, + "learning_rate": 0.00014264568796052988, + "loss": 0.966, + "step": 9592 + }, + { + "epoch": 2.046288395904437, + "grad_norm": 0.2508510230188444, + "learning_rate": 0.000142640395109041, + "loss": 0.9137, + "step": 9593 + }, + { + "epoch": 2.0465017064846416, + "grad_norm": 0.26711343408031896, + "learning_rate": 0.00014263510154877498, + "loss": 0.9211, + "step": 9594 + }, + { + "epoch": 2.0467150170648463, + "grad_norm": 0.20464194694412402, + "learning_rate": 0.00014262980727979175, + "loss": 0.946, + "step": 9595 + }, + { + "epoch": 2.0469283276450514, + "grad_norm": 0.22742145944644043, + "learning_rate": 0.00014262451230215122, + "loss": 0.9418, + "step": 9596 + }, + { + "epoch": 2.047141638225256, + "grad_norm": 0.2698152962527987, + "learning_rate": 0.00014261921661591325, + "loss": 0.929, + "step": 9597 + }, + { + "epoch": 2.0473549488054608, + "grad_norm": 0.35361400236198076, + "learning_rate": 0.0001426139202211378, + "loss": 0.9201, + "step": 9598 + }, + { + "epoch": 2.0475682593856654, + "grad_norm": 0.39487897680116185, + "learning_rate": 0.00014260862311788478, + "loss": 0.9564, + "step": 9599 + }, + { + "epoch": 2.04778156996587, + "grad_norm": 0.42112378182816745, + "learning_rate": 0.00014260332530621414, + "loss": 0.9271, + "step": 9600 + }, + { + "epoch": 2.0479948805460753, + "grad_norm": 0.3846616901544652, + "learning_rate": 0.0001425980267861858, + "loss": 0.8938, + "step": 9601 + }, + { + "epoch": 2.04820819112628, + "grad_norm": 0.33202643977946733, + "learning_rate": 0.00014259272755785978, + "loss": 0.9273, + "step": 9602 + }, + { + "epoch": 2.0484215017064846, + "grad_norm": 0.24127245499628813, + "learning_rate": 0.00014258742762129597, + "loss": 0.9534, + "step": 9603 + }, + { + "epoch": 2.0486348122866893, + "grad_norm": 0.17960684404741234, + "learning_rate": 0.00014258212697655436, + "loss": 0.9422, + "step": 9604 + }, + { + "epoch": 2.048848122866894, + "grad_norm": 0.24971673622462376, + "learning_rate": 0.00014257682562369492, + "loss": 0.939, + "step": 9605 + }, + { + "epoch": 2.049061433447099, + "grad_norm": 0.3229379031579044, + "learning_rate": 0.00014257152356277766, + "loss": 0.9487, + "step": 9606 + }, + { + "epoch": 2.049274744027304, + "grad_norm": 0.3747601442730901, + "learning_rate": 0.00014256622079386258, + "loss": 0.9265, + "step": 9607 + }, + { + "epoch": 2.0494880546075085, + "grad_norm": 0.3110013553609146, + "learning_rate": 0.00014256091731700964, + "loss": 0.8969, + "step": 9608 + }, + { + "epoch": 2.049701365187713, + "grad_norm": 0.19879016913163264, + "learning_rate": 0.00014255561313227886, + "loss": 0.943, + "step": 9609 + }, + { + "epoch": 2.0499146757679183, + "grad_norm": 0.21146019841743469, + "learning_rate": 0.00014255030823973029, + "loss": 0.9602, + "step": 9610 + }, + { + "epoch": 2.050127986348123, + "grad_norm": 0.22815559286386097, + "learning_rate": 0.0001425450026394239, + "loss": 0.9283, + "step": 9611 + }, + { + "epoch": 2.0503412969283277, + "grad_norm": 0.22370928081189118, + "learning_rate": 0.00014253969633141976, + "loss": 0.9318, + "step": 9612 + }, + { + "epoch": 2.0505546075085324, + "grad_norm": 0.21846859925150341, + "learning_rate": 0.0001425343893157779, + "loss": 0.9679, + "step": 9613 + }, + { + "epoch": 2.050767918088737, + "grad_norm": 0.23856873067421605, + "learning_rate": 0.00014252908159255838, + "loss": 0.8918, + "step": 9614 + }, + { + "epoch": 2.050981228668942, + "grad_norm": 0.24029149082117515, + "learning_rate": 0.00014252377316182127, + "loss": 0.8987, + "step": 9615 + }, + { + "epoch": 2.051194539249147, + "grad_norm": 0.24770131481749072, + "learning_rate": 0.0001425184640236266, + "loss": 0.9559, + "step": 9616 + }, + { + "epoch": 2.0514078498293515, + "grad_norm": 0.2590969376978894, + "learning_rate": 0.00014251315417803444, + "loss": 0.9317, + "step": 9617 + }, + { + "epoch": 2.051621160409556, + "grad_norm": 0.20223485902693886, + "learning_rate": 0.0001425078436251049, + "loss": 0.9336, + "step": 9618 + }, + { + "epoch": 2.051834470989761, + "grad_norm": 0.25783076674725053, + "learning_rate": 0.00014250253236489807, + "loss": 0.9322, + "step": 9619 + }, + { + "epoch": 2.052047781569966, + "grad_norm": 0.3592952139247049, + "learning_rate": 0.00014249722039747404, + "loss": 0.9248, + "step": 9620 + }, + { + "epoch": 2.0522610921501707, + "grad_norm": 0.40161860237593716, + "learning_rate": 0.0001424919077228929, + "loss": 0.9441, + "step": 9621 + }, + { + "epoch": 2.0524744027303754, + "grad_norm": 0.3699712521623843, + "learning_rate": 0.00014248659434121477, + "loss": 0.9401, + "step": 9622 + }, + { + "epoch": 2.05268771331058, + "grad_norm": 0.37792971460590247, + "learning_rate": 0.00014248128025249978, + "loss": 0.9146, + "step": 9623 + }, + { + "epoch": 2.0529010238907848, + "grad_norm": 0.35037652864980523, + "learning_rate": 0.00014247596545680804, + "loss": 0.9345, + "step": 9624 + }, + { + "epoch": 2.05311433447099, + "grad_norm": 0.2843259297700072, + "learning_rate": 0.0001424706499541997, + "loss": 0.9415, + "step": 9625 + }, + { + "epoch": 2.0533276450511946, + "grad_norm": 0.23510965238755832, + "learning_rate": 0.0001424653337447349, + "loss": 0.962, + "step": 9626 + }, + { + "epoch": 2.0535409556313993, + "grad_norm": 0.23856901155878374, + "learning_rate": 0.00014246001682847382, + "loss": 0.8968, + "step": 9627 + }, + { + "epoch": 2.053754266211604, + "grad_norm": 0.26125890171777394, + "learning_rate": 0.0001424546992054766, + "loss": 0.8995, + "step": 9628 + }, + { + "epoch": 2.053967576791809, + "grad_norm": 0.34029115032285456, + "learning_rate": 0.0001424493808758034, + "loss": 0.9162, + "step": 9629 + }, + { + "epoch": 2.0541808873720138, + "grad_norm": 0.3824492894152553, + "learning_rate": 0.0001424440618395144, + "loss": 0.9558, + "step": 9630 + }, + { + "epoch": 2.0543941979522184, + "grad_norm": 0.3588636663000172, + "learning_rate": 0.00014243874209666978, + "loss": 0.9547, + "step": 9631 + }, + { + "epoch": 2.054607508532423, + "grad_norm": 0.33463487739787506, + "learning_rate": 0.00014243342164732976, + "loss": 0.9414, + "step": 9632 + }, + { + "epoch": 2.054820819112628, + "grad_norm": 0.356415885002122, + "learning_rate": 0.00014242810049155453, + "loss": 0.9146, + "step": 9633 + }, + { + "epoch": 2.055034129692833, + "grad_norm": 0.3433902596697656, + "learning_rate": 0.00014242277862940426, + "loss": 0.9092, + "step": 9634 + }, + { + "epoch": 2.0552474402730376, + "grad_norm": 0.2847877043528052, + "learning_rate": 0.00014241745606093922, + "loss": 0.922, + "step": 9635 + }, + { + "epoch": 2.0554607508532423, + "grad_norm": 0.2707855123889664, + "learning_rate": 0.00014241213278621965, + "loss": 0.9488, + "step": 9636 + }, + { + "epoch": 2.055674061433447, + "grad_norm": 0.2804297927001561, + "learning_rate": 0.0001424068088053057, + "loss": 0.9392, + "step": 9637 + }, + { + "epoch": 2.0558873720136517, + "grad_norm": 0.2937289483853975, + "learning_rate": 0.00014240148411825767, + "loss": 0.9302, + "step": 9638 + }, + { + "epoch": 2.056100682593857, + "grad_norm": 0.35601763741760595, + "learning_rate": 0.0001423961587251358, + "loss": 0.9491, + "step": 9639 + }, + { + "epoch": 2.0563139931740615, + "grad_norm": 0.34291444234042484, + "learning_rate": 0.00014239083262600034, + "loss": 0.9507, + "step": 9640 + }, + { + "epoch": 2.056527303754266, + "grad_norm": 0.3520631082248333, + "learning_rate": 0.00014238550582091158, + "loss": 0.953, + "step": 9641 + }, + { + "epoch": 2.056740614334471, + "grad_norm": 0.38221902875672564, + "learning_rate": 0.00014238017830992973, + "loss": 0.9403, + "step": 9642 + }, + { + "epoch": 2.0569539249146755, + "grad_norm": 0.48298372360426683, + "learning_rate": 0.00014237485009311514, + "loss": 0.9638, + "step": 9643 + }, + { + "epoch": 2.0571672354948807, + "grad_norm": 0.44294080229675775, + "learning_rate": 0.0001423695211705281, + "loss": 0.9267, + "step": 9644 + }, + { + "epoch": 2.0573805460750854, + "grad_norm": 0.3206145244976488, + "learning_rate": 0.00014236419154222885, + "loss": 0.9372, + "step": 9645 + }, + { + "epoch": 2.05759385665529, + "grad_norm": 0.28366622587892126, + "learning_rate": 0.00014235886120827771, + "loss": 0.9376, + "step": 9646 + }, + { + "epoch": 2.0578071672354947, + "grad_norm": 0.44167813144485746, + "learning_rate": 0.00014235353016873501, + "loss": 0.9455, + "step": 9647 + }, + { + "epoch": 2.0580204778157, + "grad_norm": 0.4546398039077855, + "learning_rate": 0.0001423481984236611, + "loss": 0.946, + "step": 9648 + }, + { + "epoch": 2.0582337883959045, + "grad_norm": 0.28950338274345355, + "learning_rate": 0.00014234286597311624, + "loss": 0.9281, + "step": 9649 + }, + { + "epoch": 2.0584470989761092, + "grad_norm": 0.24371913445686552, + "learning_rate": 0.00014233753281716084, + "loss": 0.9299, + "step": 9650 + }, + { + "epoch": 2.058660409556314, + "grad_norm": 0.3698639940457225, + "learning_rate": 0.0001423321989558552, + "loss": 0.9569, + "step": 9651 + }, + { + "epoch": 2.0588737201365186, + "grad_norm": 0.359153175445602, + "learning_rate": 0.00014232686438925965, + "loss": 0.9769, + "step": 9652 + }, + { + "epoch": 2.0590870307167237, + "grad_norm": 0.2605717352719941, + "learning_rate": 0.00014232152911743457, + "loss": 0.9468, + "step": 9653 + }, + { + "epoch": 2.0593003412969284, + "grad_norm": 0.21159510479751725, + "learning_rate": 0.00014231619314044038, + "loss": 0.9356, + "step": 9654 + }, + { + "epoch": 2.059513651877133, + "grad_norm": 0.27115543414200377, + "learning_rate": 0.00014231085645833742, + "loss": 0.9066, + "step": 9655 + }, + { + "epoch": 2.0597269624573378, + "grad_norm": 0.27289769678687437, + "learning_rate": 0.00014230551907118603, + "loss": 0.945, + "step": 9656 + }, + { + "epoch": 2.0599402730375425, + "grad_norm": 0.2137271891863884, + "learning_rate": 0.00014230018097904665, + "loss": 0.9258, + "step": 9657 + }, + { + "epoch": 2.0601535836177476, + "grad_norm": 0.24535571656472327, + "learning_rate": 0.00014229484218197973, + "loss": 0.9446, + "step": 9658 + }, + { + "epoch": 2.0603668941979523, + "grad_norm": 0.2818433995768038, + "learning_rate": 0.00014228950268004557, + "loss": 0.9225, + "step": 9659 + }, + { + "epoch": 2.060580204778157, + "grad_norm": 0.3031224327592948, + "learning_rate": 0.00014228416247330465, + "loss": 0.9397, + "step": 9660 + }, + { + "epoch": 2.0607935153583616, + "grad_norm": 0.42136987568852585, + "learning_rate": 0.0001422788215618174, + "loss": 0.9516, + "step": 9661 + }, + { + "epoch": 2.0610068259385668, + "grad_norm": 0.3780591653158354, + "learning_rate": 0.00014227347994564423, + "loss": 0.9373, + "step": 9662 + }, + { + "epoch": 2.0612201365187715, + "grad_norm": 0.2754409346056673, + "learning_rate": 0.00014226813762484558, + "loss": 0.927, + "step": 9663 + }, + { + "epoch": 2.061433447098976, + "grad_norm": 0.2747839004731318, + "learning_rate": 0.0001422627945994819, + "loss": 0.9627, + "step": 9664 + }, + { + "epoch": 2.061646757679181, + "grad_norm": 0.28768898142206306, + "learning_rate": 0.00014225745086961366, + "loss": 0.9356, + "step": 9665 + }, + { + "epoch": 2.0618600682593855, + "grad_norm": 0.2975832080084711, + "learning_rate": 0.0001422521064353013, + "loss": 0.9346, + "step": 9666 + }, + { + "epoch": 2.0620733788395906, + "grad_norm": 0.2753870439709101, + "learning_rate": 0.00014224676129660535, + "loss": 0.9452, + "step": 9667 + }, + { + "epoch": 2.0622866894197953, + "grad_norm": 0.3201354519305622, + "learning_rate": 0.0001422414154535862, + "loss": 0.9482, + "step": 9668 + }, + { + "epoch": 2.0625, + "grad_norm": 0.30891905909267486, + "learning_rate": 0.00014223606890630442, + "loss": 0.9304, + "step": 9669 + }, + { + "epoch": 2.0627133105802047, + "grad_norm": 0.32514526556074747, + "learning_rate": 0.00014223072165482045, + "loss": 0.9518, + "step": 9670 + }, + { + "epoch": 2.0629266211604094, + "grad_norm": 0.3151195200370262, + "learning_rate": 0.00014222537369919483, + "loss": 0.9416, + "step": 9671 + }, + { + "epoch": 2.0631399317406145, + "grad_norm": 0.35830884614541864, + "learning_rate": 0.00014222002503948806, + "loss": 0.9638, + "step": 9672 + }, + { + "epoch": 2.063353242320819, + "grad_norm": 0.30181669172308107, + "learning_rate": 0.00014221467567576067, + "loss": 0.9151, + "step": 9673 + }, + { + "epoch": 2.063566552901024, + "grad_norm": 0.26883237859297376, + "learning_rate": 0.00014220932560807318, + "loss": 0.9329, + "step": 9674 + }, + { + "epoch": 2.0637798634812285, + "grad_norm": 0.3294311457458461, + "learning_rate": 0.00014220397483648612, + "loss": 0.9252, + "step": 9675 + }, + { + "epoch": 2.0639931740614332, + "grad_norm": 0.43497614363045556, + "learning_rate": 0.00014219862336106002, + "loss": 0.9503, + "step": 9676 + }, + { + "epoch": 2.0642064846416384, + "grad_norm": 0.48536578027193555, + "learning_rate": 0.00014219327118185547, + "loss": 0.9248, + "step": 9677 + }, + { + "epoch": 2.064419795221843, + "grad_norm": 0.5177545102375241, + "learning_rate": 0.000142187918298933, + "loss": 0.9373, + "step": 9678 + }, + { + "epoch": 2.0646331058020477, + "grad_norm": 0.5372797367919823, + "learning_rate": 0.0001421825647123532, + "loss": 0.9193, + "step": 9679 + }, + { + "epoch": 2.0648464163822524, + "grad_norm": 0.5457841934984595, + "learning_rate": 0.00014217721042217662, + "loss": 0.9411, + "step": 9680 + }, + { + "epoch": 2.0650597269624575, + "grad_norm": 0.5341639983056236, + "learning_rate": 0.00014217185542846384, + "loss": 0.911, + "step": 9681 + }, + { + "epoch": 2.0652730375426622, + "grad_norm": 0.43804412390768477, + "learning_rate": 0.0001421664997312755, + "loss": 0.9638, + "step": 9682 + }, + { + "epoch": 2.065486348122867, + "grad_norm": 0.28507509828909483, + "learning_rate": 0.0001421611433306722, + "loss": 0.9413, + "step": 9683 + }, + { + "epoch": 2.0656996587030716, + "grad_norm": 0.2584080056961705, + "learning_rate": 0.00014215578622671444, + "loss": 0.9382, + "step": 9684 + }, + { + "epoch": 2.0659129692832763, + "grad_norm": 0.3834125665365836, + "learning_rate": 0.00014215042841946298, + "loss": 0.916, + "step": 9685 + }, + { + "epoch": 2.0661262798634814, + "grad_norm": 0.4057675239671583, + "learning_rate": 0.00014214506990897833, + "loss": 0.948, + "step": 9686 + }, + { + "epoch": 2.066339590443686, + "grad_norm": 0.46028734468522114, + "learning_rate": 0.0001421397106953212, + "loss": 0.939, + "step": 9687 + }, + { + "epoch": 2.0665529010238908, + "grad_norm": 0.45184151480118495, + "learning_rate": 0.00014213435077855218, + "loss": 0.9258, + "step": 9688 + }, + { + "epoch": 2.0667662116040955, + "grad_norm": 0.4572494381497937, + "learning_rate": 0.00014212899015873196, + "loss": 0.9308, + "step": 9689 + }, + { + "epoch": 2.0669795221843, + "grad_norm": 0.3921494667739551, + "learning_rate": 0.00014212362883592115, + "loss": 0.9476, + "step": 9690 + }, + { + "epoch": 2.0671928327645053, + "grad_norm": 0.32361496636639164, + "learning_rate": 0.00014211826681018045, + "loss": 0.9132, + "step": 9691 + }, + { + "epoch": 2.06740614334471, + "grad_norm": 0.30190381963451246, + "learning_rate": 0.0001421129040815705, + "loss": 0.9088, + "step": 9692 + }, + { + "epoch": 2.0676194539249146, + "grad_norm": 0.2922656502731123, + "learning_rate": 0.00014210754065015202, + "loss": 0.9234, + "step": 9693 + }, + { + "epoch": 2.0678327645051193, + "grad_norm": 0.24612198102970764, + "learning_rate": 0.00014210217651598565, + "loss": 0.9318, + "step": 9694 + }, + { + "epoch": 2.0680460750853245, + "grad_norm": 0.2284790257645392, + "learning_rate": 0.0001420968116791321, + "loss": 0.8999, + "step": 9695 + }, + { + "epoch": 2.068259385665529, + "grad_norm": 0.22228001033176087, + "learning_rate": 0.00014209144613965208, + "loss": 0.9119, + "step": 9696 + }, + { + "epoch": 2.068472696245734, + "grad_norm": 0.24840249763458178, + "learning_rate": 0.00014208607989760633, + "loss": 0.9611, + "step": 9697 + }, + { + "epoch": 2.0686860068259385, + "grad_norm": 0.25964725248538645, + "learning_rate": 0.00014208071295305553, + "loss": 0.9479, + "step": 9698 + }, + { + "epoch": 2.068899317406143, + "grad_norm": 0.3196386374612046, + "learning_rate": 0.00014207534530606044, + "loss": 0.9282, + "step": 9699 + }, + { + "epoch": 2.0691126279863483, + "grad_norm": 0.31673386587903635, + "learning_rate": 0.00014206997695668174, + "loss": 0.9368, + "step": 9700 + }, + { + "epoch": 2.069325938566553, + "grad_norm": 0.2999846434741222, + "learning_rate": 0.00014206460790498022, + "loss": 0.9447, + "step": 9701 + }, + { + "epoch": 2.0695392491467577, + "grad_norm": 0.3016478510426756, + "learning_rate": 0.0001420592381510166, + "loss": 0.9538, + "step": 9702 + }, + { + "epoch": 2.0697525597269624, + "grad_norm": 0.26695235851955906, + "learning_rate": 0.00014205386769485168, + "loss": 0.9416, + "step": 9703 + }, + { + "epoch": 2.069965870307167, + "grad_norm": 0.2376136265804276, + "learning_rate": 0.0001420484965365462, + "loss": 0.9284, + "step": 9704 + }, + { + "epoch": 2.070179180887372, + "grad_norm": 0.19416235795852732, + "learning_rate": 0.00014204312467616095, + "loss": 0.9119, + "step": 9705 + }, + { + "epoch": 2.070392491467577, + "grad_norm": 0.21368534808794035, + "learning_rate": 0.0001420377521137567, + "loss": 0.9209, + "step": 9706 + }, + { + "epoch": 2.0706058020477816, + "grad_norm": 0.21494102328372988, + "learning_rate": 0.0001420323788493942, + "loss": 0.9115, + "step": 9707 + }, + { + "epoch": 2.0708191126279862, + "grad_norm": 0.21235268496455503, + "learning_rate": 0.00014202700488313432, + "loss": 0.956, + "step": 9708 + }, + { + "epoch": 2.071032423208191, + "grad_norm": 0.281381016000285, + "learning_rate": 0.00014202163021503783, + "loss": 0.9412, + "step": 9709 + }, + { + "epoch": 2.071245733788396, + "grad_norm": 0.34668798187124433, + "learning_rate": 0.00014201625484516555, + "loss": 0.964, + "step": 9710 + }, + { + "epoch": 2.0714590443686007, + "grad_norm": 0.36891388490089777, + "learning_rate": 0.00014201087877357832, + "loss": 0.9511, + "step": 9711 + }, + { + "epoch": 2.0716723549488054, + "grad_norm": 0.3863464120197712, + "learning_rate": 0.00014200550200033695, + "loss": 0.897, + "step": 9712 + }, + { + "epoch": 2.07188566552901, + "grad_norm": 0.3627460568255708, + "learning_rate": 0.00014200012452550222, + "loss": 0.944, + "step": 9713 + }, + { + "epoch": 2.0720989761092152, + "grad_norm": 0.33832169476407736, + "learning_rate": 0.0001419947463491351, + "loss": 0.9426, + "step": 9714 + }, + { + "epoch": 2.07231228668942, + "grad_norm": 0.24941290228925497, + "learning_rate": 0.0001419893674712964, + "loss": 0.9287, + "step": 9715 + }, + { + "epoch": 2.0725255972696246, + "grad_norm": 0.25799018432306003, + "learning_rate": 0.00014198398789204693, + "loss": 0.9532, + "step": 9716 + }, + { + "epoch": 2.0727389078498293, + "grad_norm": 0.3662598357769698, + "learning_rate": 0.0001419786076114476, + "loss": 0.9208, + "step": 9717 + }, + { + "epoch": 2.072952218430034, + "grad_norm": 0.4149999931668966, + "learning_rate": 0.00014197322662955927, + "loss": 0.927, + "step": 9718 + }, + { + "epoch": 2.073165529010239, + "grad_norm": 0.574571509047576, + "learning_rate": 0.00014196784494644287, + "loss": 0.937, + "step": 9719 + }, + { + "epoch": 2.073378839590444, + "grad_norm": 0.47912930999410513, + "learning_rate": 0.00014196246256215924, + "loss": 0.9237, + "step": 9720 + }, + { + "epoch": 2.0735921501706485, + "grad_norm": 0.5058116938416243, + "learning_rate": 0.0001419570794767693, + "loss": 0.935, + "step": 9721 + }, + { + "epoch": 2.073805460750853, + "grad_norm": 0.5378182521772401, + "learning_rate": 0.00014195169569033398, + "loss": 0.931, + "step": 9722 + }, + { + "epoch": 2.074018771331058, + "grad_norm": 0.5178473978285452, + "learning_rate": 0.00014194631120291416, + "loss": 0.9657, + "step": 9723 + }, + { + "epoch": 2.074232081911263, + "grad_norm": 0.4703903988323167, + "learning_rate": 0.0001419409260145708, + "loss": 0.9242, + "step": 9724 + }, + { + "epoch": 2.0744453924914676, + "grad_norm": 0.4363124586039034, + "learning_rate": 0.00014193554012536483, + "loss": 0.9382, + "step": 9725 + }, + { + "epoch": 2.0746587030716723, + "grad_norm": 1.4807367694572302, + "learning_rate": 0.0001419301535353572, + "loss": 0.955, + "step": 9726 + }, + { + "epoch": 2.074872013651877, + "grad_norm": 0.4537448805313456, + "learning_rate": 0.00014192476624460878, + "loss": 0.936, + "step": 9727 + }, + { + "epoch": 2.0750853242320817, + "grad_norm": 4.110682111905516, + "learning_rate": 0.0001419193782531806, + "loss": 0.9605, + "step": 9728 + }, + { + "epoch": 2.075298634812287, + "grad_norm": 1.4805370737399515, + "learning_rate": 0.00014191398956113366, + "loss": 0.9641, + "step": 9729 + }, + { + "epoch": 2.0755119453924915, + "grad_norm": 2.066391372643397, + "learning_rate": 0.00014190860016852886, + "loss": 0.9544, + "step": 9730 + }, + { + "epoch": 2.075725255972696, + "grad_norm": 1.4842113787320594, + "learning_rate": 0.00014190321007542721, + "loss": 0.9566, + "step": 9731 + }, + { + "epoch": 2.075938566552901, + "grad_norm": 1.9935260299236754, + "learning_rate": 0.00014189781928188969, + "loss": 0.987, + "step": 9732 + }, + { + "epoch": 2.076151877133106, + "grad_norm": 1.3640578629750977, + "learning_rate": 0.0001418924277879773, + "loss": 1.0024, + "step": 9733 + }, + { + "epoch": 2.0763651877133107, + "grad_norm": 2.3947001296475436, + "learning_rate": 0.00014188703559375108, + "loss": 0.9791, + "step": 9734 + }, + { + "epoch": 2.0765784982935154, + "grad_norm": 1.831551603230319, + "learning_rate": 0.00014188164269927198, + "loss": 0.9614, + "step": 9735 + }, + { + "epoch": 2.07679180887372, + "grad_norm": 2.1096267745778583, + "learning_rate": 0.00014187624910460105, + "loss": 0.9521, + "step": 9736 + }, + { + "epoch": 2.0770051194539247, + "grad_norm": 1.4811997643661103, + "learning_rate": 0.00014187085480979936, + "loss": 0.9788, + "step": 9737 + }, + { + "epoch": 2.07721843003413, + "grad_norm": 2.232681845775795, + "learning_rate": 0.00014186545981492786, + "loss": 1.0136, + "step": 9738 + }, + { + "epoch": 2.0774317406143346, + "grad_norm": 1.5927954224859282, + "learning_rate": 0.00014186006412004768, + "loss": 0.9792, + "step": 9739 + }, + { + "epoch": 2.0776450511945392, + "grad_norm": 2.1345348053138182, + "learning_rate": 0.0001418546677252198, + "loss": 0.9678, + "step": 9740 + }, + { + "epoch": 2.077858361774744, + "grad_norm": 1.8935992823326881, + "learning_rate": 0.00014184927063050533, + "loss": 0.9629, + "step": 9741 + }, + { + "epoch": 2.0780716723549486, + "grad_norm": 1.5583234024706054, + "learning_rate": 0.00014184387283596535, + "loss": 0.9655, + "step": 9742 + }, + { + "epoch": 2.0782849829351537, + "grad_norm": 1.2915187573858455, + "learning_rate": 0.00014183847434166085, + "loss": 0.9774, + "step": 9743 + }, + { + "epoch": 2.0784982935153584, + "grad_norm": 1.7348841202280005, + "learning_rate": 0.000141833075147653, + "loss": 0.9493, + "step": 9744 + }, + { + "epoch": 2.078711604095563, + "grad_norm": 1.281854872505105, + "learning_rate": 0.00014182767525400288, + "loss": 0.9612, + "step": 9745 + }, + { + "epoch": 2.078924914675768, + "grad_norm": 1.797891861374156, + "learning_rate": 0.00014182227466077156, + "loss": 0.965, + "step": 9746 + }, + { + "epoch": 2.0791382252559725, + "grad_norm": 1.439603920373384, + "learning_rate": 0.00014181687336802014, + "loss": 0.9637, + "step": 9747 + }, + { + "epoch": 2.0793515358361776, + "grad_norm": 1.5417867790036233, + "learning_rate": 0.0001418114713758098, + "loss": 1.0057, + "step": 9748 + }, + { + "epoch": 2.0795648464163823, + "grad_norm": 1.4310368036154546, + "learning_rate": 0.0001418060686842016, + "loss": 0.9911, + "step": 9749 + }, + { + "epoch": 2.079778156996587, + "grad_norm": 1.3479643563837405, + "learning_rate": 0.0001418006652932567, + "loss": 0.9477, + "step": 9750 + }, + { + "epoch": 2.0799914675767917, + "grad_norm": 1.171569293136542, + "learning_rate": 0.00014179526120303623, + "loss": 0.9245, + "step": 9751 + }, + { + "epoch": 2.080204778156997, + "grad_norm": 1.0199586674068224, + "learning_rate": 0.00014178985641360132, + "loss": 0.9515, + "step": 9752 + }, + { + "epoch": 2.0804180887372015, + "grad_norm": 1.0114888385725433, + "learning_rate": 0.00014178445092501317, + "loss": 0.9542, + "step": 9753 + }, + { + "epoch": 2.080631399317406, + "grad_norm": 0.8668104526181755, + "learning_rate": 0.00014177904473733292, + "loss": 0.9458, + "step": 9754 + }, + { + "epoch": 2.080844709897611, + "grad_norm": 0.44891770351901933, + "learning_rate": 0.00014177363785062175, + "loss": 0.9417, + "step": 9755 + }, + { + "epoch": 2.0810580204778155, + "grad_norm": 0.6597030404773696, + "learning_rate": 0.0001417682302649408, + "loss": 0.9789, + "step": 9756 + }, + { + "epoch": 2.0812713310580206, + "grad_norm": 0.7299570016831035, + "learning_rate": 0.0001417628219803513, + "loss": 0.9458, + "step": 9757 + }, + { + "epoch": 2.0814846416382253, + "grad_norm": 0.794619759447464, + "learning_rate": 0.00014175741299691442, + "loss": 0.9405, + "step": 9758 + }, + { + "epoch": 2.08169795221843, + "grad_norm": 0.4471386490326463, + "learning_rate": 0.0001417520033146914, + "loss": 0.9303, + "step": 9759 + }, + { + "epoch": 2.0819112627986347, + "grad_norm": 0.5706216183666755, + "learning_rate": 0.00014174659293374339, + "loss": 0.9597, + "step": 9760 + }, + { + "epoch": 2.0821245733788394, + "grad_norm": 0.7329032512403661, + "learning_rate": 0.00014174118185413166, + "loss": 0.9293, + "step": 9761 + }, + { + "epoch": 2.0823378839590445, + "grad_norm": 0.4816532183156828, + "learning_rate": 0.0001417357700759174, + "loss": 0.9215, + "step": 9762 + }, + { + "epoch": 2.082551194539249, + "grad_norm": 0.4505160131932541, + "learning_rate": 0.0001417303575991619, + "loss": 0.9474, + "step": 9763 + }, + { + "epoch": 2.082764505119454, + "grad_norm": 0.5630251496381126, + "learning_rate": 0.00014172494442392635, + "loss": 0.9851, + "step": 9764 + }, + { + "epoch": 2.0829778156996586, + "grad_norm": 0.4506887086152403, + "learning_rate": 0.000141719530550272, + "loss": 0.954, + "step": 9765 + }, + { + "epoch": 2.0831911262798637, + "grad_norm": 0.3977068034980499, + "learning_rate": 0.00014171411597826012, + "loss": 0.9616, + "step": 9766 + }, + { + "epoch": 2.0834044368600684, + "grad_norm": 0.35018980230982516, + "learning_rate": 0.00014170870070795202, + "loss": 0.9116, + "step": 9767 + }, + { + "epoch": 2.083617747440273, + "grad_norm": 0.44762719514213245, + "learning_rate": 0.00014170328473940889, + "loss": 0.9521, + "step": 9768 + }, + { + "epoch": 2.0838310580204777, + "grad_norm": 0.417504292045215, + "learning_rate": 0.0001416978680726921, + "loss": 0.9295, + "step": 9769 + }, + { + "epoch": 2.0840443686006824, + "grad_norm": 0.4244492869649108, + "learning_rate": 0.00014169245070786283, + "loss": 0.9178, + "step": 9770 + }, + { + "epoch": 2.0842576791808876, + "grad_norm": 0.36505564753370917, + "learning_rate": 0.0001416870326449825, + "loss": 0.9636, + "step": 9771 + }, + { + "epoch": 2.0844709897610922, + "grad_norm": 0.28551422576171903, + "learning_rate": 0.00014168161388411233, + "loss": 0.9462, + "step": 9772 + }, + { + "epoch": 2.084684300341297, + "grad_norm": 0.4020675750017012, + "learning_rate": 0.0001416761944253137, + "loss": 0.9679, + "step": 9773 + }, + { + "epoch": 2.0848976109215016, + "grad_norm": 0.40357585793827633, + "learning_rate": 0.00014167077426864783, + "loss": 0.9484, + "step": 9774 + }, + { + "epoch": 2.0851109215017063, + "grad_norm": 0.26116526330018264, + "learning_rate": 0.00014166535341417615, + "loss": 0.9383, + "step": 9775 + }, + { + "epoch": 2.0853242320819114, + "grad_norm": 0.37792995469846813, + "learning_rate": 0.00014165993186195994, + "loss": 0.9258, + "step": 9776 + }, + { + "epoch": 2.085537542662116, + "grad_norm": 0.3404445158201872, + "learning_rate": 0.0001416545096120606, + "loss": 0.9326, + "step": 9777 + }, + { + "epoch": 2.085750853242321, + "grad_norm": 0.26125718916929763, + "learning_rate": 0.0001416490866645394, + "loss": 0.966, + "step": 9778 + }, + { + "epoch": 2.0859641638225255, + "grad_norm": 0.32082265422982303, + "learning_rate": 0.00014164366301945778, + "loss": 0.9254, + "step": 9779 + }, + { + "epoch": 2.0861774744027306, + "grad_norm": 0.278363442297472, + "learning_rate": 0.00014163823867687706, + "loss": 0.9605, + "step": 9780 + }, + { + "epoch": 2.0863907849829353, + "grad_norm": 0.27577565931081477, + "learning_rate": 0.00014163281363685867, + "loss": 0.9192, + "step": 9781 + }, + { + "epoch": 2.08660409556314, + "grad_norm": 0.285875769232359, + "learning_rate": 0.0001416273878994639, + "loss": 0.9611, + "step": 9782 + }, + { + "epoch": 2.0868174061433447, + "grad_norm": 0.2897584478457591, + "learning_rate": 0.00014162196146475423, + "loss": 0.9447, + "step": 9783 + }, + { + "epoch": 2.0870307167235493, + "grad_norm": 0.3166228000188706, + "learning_rate": 0.00014161653433279105, + "loss": 0.9444, + "step": 9784 + }, + { + "epoch": 2.0872440273037545, + "grad_norm": 0.35362683206917456, + "learning_rate": 0.0001416111065036357, + "loss": 0.9303, + "step": 9785 + }, + { + "epoch": 2.087457337883959, + "grad_norm": 0.3466935390552141, + "learning_rate": 0.0001416056779773497, + "loss": 0.9476, + "step": 9786 + }, + { + "epoch": 2.087670648464164, + "grad_norm": 0.2723999197242164, + "learning_rate": 0.00014160024875399434, + "loss": 0.9635, + "step": 9787 + }, + { + "epoch": 2.0878839590443685, + "grad_norm": 0.31310219937975015, + "learning_rate": 0.00014159481883363118, + "loss": 0.9556, + "step": 9788 + }, + { + "epoch": 2.088097269624573, + "grad_norm": 0.2585002633400437, + "learning_rate": 0.0001415893882163216, + "loss": 0.9401, + "step": 9789 + }, + { + "epoch": 2.0883105802047783, + "grad_norm": 0.20215471403260052, + "learning_rate": 0.00014158395690212706, + "loss": 0.9418, + "step": 9790 + }, + { + "epoch": 2.088523890784983, + "grad_norm": 0.2953469424905213, + "learning_rate": 0.00014157852489110902, + "loss": 0.9509, + "step": 9791 + }, + { + "epoch": 2.0887372013651877, + "grad_norm": 0.2405211467543639, + "learning_rate": 0.00014157309218332893, + "loss": 0.9445, + "step": 9792 + }, + { + "epoch": 2.0889505119453924, + "grad_norm": 0.2347840444261291, + "learning_rate": 0.00014156765877884825, + "loss": 0.9324, + "step": 9793 + }, + { + "epoch": 2.089163822525597, + "grad_norm": 0.27025930517600655, + "learning_rate": 0.0001415622246777285, + "loss": 0.9496, + "step": 9794 + }, + { + "epoch": 2.089377133105802, + "grad_norm": 0.25403079416892843, + "learning_rate": 0.0001415567898800311, + "loss": 0.949, + "step": 9795 + }, + { + "epoch": 2.089590443686007, + "grad_norm": 0.2157696025565952, + "learning_rate": 0.00014155135438581763, + "loss": 0.9283, + "step": 9796 + }, + { + "epoch": 2.0898037542662116, + "grad_norm": 0.2810784190012911, + "learning_rate": 0.00014154591819514953, + "loss": 0.9355, + "step": 9797 + }, + { + "epoch": 2.0900170648464163, + "grad_norm": 0.2526722349572625, + "learning_rate": 0.00014154048130808836, + "loss": 0.9456, + "step": 9798 + }, + { + "epoch": 2.0902303754266214, + "grad_norm": 0.28542751677568085, + "learning_rate": 0.00014153504372469556, + "loss": 0.9407, + "step": 9799 + }, + { + "epoch": 2.090443686006826, + "grad_norm": 0.24725699620924124, + "learning_rate": 0.00014152960544503275, + "loss": 0.9293, + "step": 9800 + }, + { + "epoch": 2.0906569965870307, + "grad_norm": 0.24657050022724397, + "learning_rate": 0.0001415241664691614, + "loss": 0.9462, + "step": 9801 + }, + { + "epoch": 2.0908703071672354, + "grad_norm": 0.3180192554252706, + "learning_rate": 0.00014151872679714307, + "loss": 0.9492, + "step": 9802 + }, + { + "epoch": 2.09108361774744, + "grad_norm": 0.3467834354612258, + "learning_rate": 0.00014151328642903933, + "loss": 0.9517, + "step": 9803 + }, + { + "epoch": 2.0912969283276452, + "grad_norm": 0.2749149318856448, + "learning_rate": 0.0001415078453649117, + "loss": 0.9294, + "step": 9804 + }, + { + "epoch": 2.09151023890785, + "grad_norm": 0.26313373364283976, + "learning_rate": 0.00014150240360482174, + "loss": 0.9612, + "step": 9805 + }, + { + "epoch": 2.0917235494880546, + "grad_norm": 0.2720145313723805, + "learning_rate": 0.00014149696114883109, + "loss": 0.9, + "step": 9806 + }, + { + "epoch": 2.0919368600682593, + "grad_norm": 0.23011602097529718, + "learning_rate": 0.00014149151799700127, + "loss": 0.9556, + "step": 9807 + }, + { + "epoch": 2.092150170648464, + "grad_norm": 0.24868966554659602, + "learning_rate": 0.0001414860741493939, + "loss": 0.9451, + "step": 9808 + }, + { + "epoch": 2.092363481228669, + "grad_norm": 0.2753132685497924, + "learning_rate": 0.00014148062960607057, + "loss": 0.9189, + "step": 9809 + }, + { + "epoch": 2.092576791808874, + "grad_norm": 0.20992519768725107, + "learning_rate": 0.00014147518436709286, + "loss": 0.9262, + "step": 9810 + }, + { + "epoch": 2.0927901023890785, + "grad_norm": 0.26612938881376924, + "learning_rate": 0.00014146973843252243, + "loss": 0.9192, + "step": 9811 + }, + { + "epoch": 2.093003412969283, + "grad_norm": 0.3219173825774187, + "learning_rate": 0.00014146429180242088, + "loss": 0.9301, + "step": 9812 + }, + { + "epoch": 2.093216723549488, + "grad_norm": 0.20292150499528475, + "learning_rate": 0.00014145884447684982, + "loss": 0.9198, + "step": 9813 + }, + { + "epoch": 2.093430034129693, + "grad_norm": 0.26658884081004314, + "learning_rate": 0.0001414533964558709, + "loss": 0.9246, + "step": 9814 + }, + { + "epoch": 2.0936433447098977, + "grad_norm": 0.25797019688185463, + "learning_rate": 0.00014144794773954575, + "loss": 0.9292, + "step": 9815 + }, + { + "epoch": 2.0938566552901023, + "grad_norm": 0.23149442772420709, + "learning_rate": 0.00014144249832793605, + "loss": 0.9493, + "step": 9816 + }, + { + "epoch": 2.094069965870307, + "grad_norm": 0.2028423948899666, + "learning_rate": 0.00014143704822110344, + "loss": 0.9369, + "step": 9817 + }, + { + "epoch": 2.094283276450512, + "grad_norm": 0.25734823902410786, + "learning_rate": 0.00014143159741910962, + "loss": 0.9444, + "step": 9818 + }, + { + "epoch": 2.094496587030717, + "grad_norm": 0.21295273566931164, + "learning_rate": 0.00014142614592201622, + "loss": 0.9553, + "step": 9819 + }, + { + "epoch": 2.0947098976109215, + "grad_norm": 0.20708573903811647, + "learning_rate": 0.00014142069372988496, + "loss": 0.9369, + "step": 9820 + }, + { + "epoch": 2.094923208191126, + "grad_norm": 0.22695105249469721, + "learning_rate": 0.00014141524084277754, + "loss": 0.9273, + "step": 9821 + }, + { + "epoch": 2.095136518771331, + "grad_norm": 0.21339676034962707, + "learning_rate": 0.0001414097872607556, + "loss": 0.9257, + "step": 9822 + }, + { + "epoch": 2.095349829351536, + "grad_norm": 0.21814284608593879, + "learning_rate": 0.0001414043329838809, + "loss": 0.9692, + "step": 9823 + }, + { + "epoch": 2.0955631399317407, + "grad_norm": 0.2579767900456119, + "learning_rate": 0.00014139887801221514, + "loss": 0.9374, + "step": 9824 + }, + { + "epoch": 2.0957764505119454, + "grad_norm": 0.31280449728588405, + "learning_rate": 0.00014139342234582, + "loss": 0.9334, + "step": 9825 + }, + { + "epoch": 2.09598976109215, + "grad_norm": 0.301837980444761, + "learning_rate": 0.0001413879659847573, + "loss": 0.9488, + "step": 9826 + }, + { + "epoch": 2.0962030716723548, + "grad_norm": 0.24474079996532952, + "learning_rate": 0.00014138250892908871, + "loss": 0.9418, + "step": 9827 + }, + { + "epoch": 2.09641638225256, + "grad_norm": 0.24569265594364859, + "learning_rate": 0.00014137705117887604, + "loss": 0.9174, + "step": 9828 + }, + { + "epoch": 2.0966296928327646, + "grad_norm": 0.3213125594688348, + "learning_rate": 0.00014137159273418096, + "loss": 0.9299, + "step": 9829 + }, + { + "epoch": 2.0968430034129693, + "grad_norm": 0.3583928039113125, + "learning_rate": 0.0001413661335950653, + "loss": 0.9202, + "step": 9830 + }, + { + "epoch": 2.097056313993174, + "grad_norm": 0.2274158147974283, + "learning_rate": 0.00014136067376159078, + "loss": 0.9309, + "step": 9831 + }, + { + "epoch": 2.0972696245733786, + "grad_norm": 0.25076965448349314, + "learning_rate": 0.00014135521323381924, + "loss": 0.93, + "step": 9832 + }, + { + "epoch": 2.0974829351535837, + "grad_norm": 0.22909771853732788, + "learning_rate": 0.0001413497520118124, + "loss": 0.935, + "step": 9833 + }, + { + "epoch": 2.0976962457337884, + "grad_norm": 0.2555408420366579, + "learning_rate": 0.00014134429009563212, + "loss": 0.9344, + "step": 9834 + }, + { + "epoch": 2.097909556313993, + "grad_norm": 0.2475160541476583, + "learning_rate": 0.00014133882748534012, + "loss": 0.9378, + "step": 9835 + }, + { + "epoch": 2.098122866894198, + "grad_norm": 0.22869025486250608, + "learning_rate": 0.0001413333641809983, + "loss": 0.9378, + "step": 9836 + }, + { + "epoch": 2.098336177474403, + "grad_norm": 0.23395330644718854, + "learning_rate": 0.00014132790018266838, + "loss": 0.9243, + "step": 9837 + }, + { + "epoch": 2.0985494880546076, + "grad_norm": 0.2308699089796248, + "learning_rate": 0.00014132243549041227, + "loss": 0.9356, + "step": 9838 + }, + { + "epoch": 2.0987627986348123, + "grad_norm": 0.25300867590209625, + "learning_rate": 0.00014131697010429177, + "loss": 0.929, + "step": 9839 + }, + { + "epoch": 2.098976109215017, + "grad_norm": 0.28134533641247916, + "learning_rate": 0.0001413115040243687, + "loss": 0.9122, + "step": 9840 + }, + { + "epoch": 2.0991894197952217, + "grad_norm": 0.25983150569029045, + "learning_rate": 0.00014130603725070497, + "loss": 0.9303, + "step": 9841 + }, + { + "epoch": 2.099402730375427, + "grad_norm": 0.2733965400499019, + "learning_rate": 0.00014130056978336238, + "loss": 0.8847, + "step": 9842 + }, + { + "epoch": 2.0996160409556315, + "grad_norm": 0.3115953282371359, + "learning_rate": 0.00014129510162240282, + "loss": 0.9199, + "step": 9843 + }, + { + "epoch": 2.099829351535836, + "grad_norm": 0.2492572372570868, + "learning_rate": 0.00014128963276788812, + "loss": 0.9377, + "step": 9844 + }, + { + "epoch": 2.100042662116041, + "grad_norm": 0.2748859157231768, + "learning_rate": 0.00014128416321988022, + "loss": 0.9232, + "step": 9845 + }, + { + "epoch": 2.1002559726962455, + "grad_norm": 0.32083355299988287, + "learning_rate": 0.00014127869297844099, + "loss": 0.8945, + "step": 9846 + }, + { + "epoch": 2.1004692832764507, + "grad_norm": 0.27234896741826714, + "learning_rate": 0.0001412732220436323, + "loss": 0.9287, + "step": 9847 + }, + { + "epoch": 2.1006825938566553, + "grad_norm": 0.24837552138847077, + "learning_rate": 0.0001412677504155161, + "loss": 0.9243, + "step": 9848 + }, + { + "epoch": 2.10089590443686, + "grad_norm": 0.23310603228795687, + "learning_rate": 0.00014126227809415422, + "loss": 0.9254, + "step": 9849 + }, + { + "epoch": 2.1011092150170647, + "grad_norm": 0.2844898436474494, + "learning_rate": 0.0001412568050796087, + "loss": 0.9606, + "step": 9850 + }, + { + "epoch": 2.1013225255972694, + "grad_norm": 0.28552520711948237, + "learning_rate": 0.00014125133137194134, + "loss": 0.9348, + "step": 9851 + }, + { + "epoch": 2.1015358361774745, + "grad_norm": 0.3031872240121021, + "learning_rate": 0.00014124585697121417, + "loss": 0.9057, + "step": 9852 + }, + { + "epoch": 2.101749146757679, + "grad_norm": 0.30222662731343775, + "learning_rate": 0.0001412403818774891, + "loss": 0.9454, + "step": 9853 + }, + { + "epoch": 2.101962457337884, + "grad_norm": 0.2506723176822289, + "learning_rate": 0.00014123490609082806, + "loss": 0.9383, + "step": 9854 + }, + { + "epoch": 2.1021757679180886, + "grad_norm": 0.2547357718086856, + "learning_rate": 0.00014122942961129303, + "loss": 0.9085, + "step": 9855 + }, + { + "epoch": 2.1023890784982937, + "grad_norm": 0.2956468143607919, + "learning_rate": 0.000141223952438946, + "loss": 0.9824, + "step": 9856 + }, + { + "epoch": 2.1026023890784984, + "grad_norm": 0.2596933701082517, + "learning_rate": 0.0001412184745738489, + "loss": 0.9358, + "step": 9857 + }, + { + "epoch": 2.102815699658703, + "grad_norm": 0.21093911427785472, + "learning_rate": 0.00014121299601606372, + "loss": 0.9627, + "step": 9858 + }, + { + "epoch": 2.1030290102389078, + "grad_norm": 0.2410011151973554, + "learning_rate": 0.00014120751676565245, + "loss": 0.938, + "step": 9859 + }, + { + "epoch": 2.1032423208191124, + "grad_norm": 0.2751773170051107, + "learning_rate": 0.00014120203682267714, + "loss": 0.9551, + "step": 9860 + }, + { + "epoch": 2.1034556313993176, + "grad_norm": 0.21561008410912877, + "learning_rate": 0.0001411965561871997, + "loss": 0.9418, + "step": 9861 + }, + { + "epoch": 2.1036689419795223, + "grad_norm": 0.24868131226809417, + "learning_rate": 0.00014119107485928228, + "loss": 0.9405, + "step": 9862 + }, + { + "epoch": 2.103882252559727, + "grad_norm": 0.2334247200196386, + "learning_rate": 0.00014118559283898675, + "loss": 0.898, + "step": 9863 + }, + { + "epoch": 2.1040955631399316, + "grad_norm": 0.22616053223328036, + "learning_rate": 0.00014118011012637522, + "loss": 0.9163, + "step": 9864 + }, + { + "epoch": 2.1043088737201363, + "grad_norm": 0.2513690628355408, + "learning_rate": 0.00014117462672150972, + "loss": 0.9292, + "step": 9865 + }, + { + "epoch": 2.1045221843003414, + "grad_norm": 0.2570351066619934, + "learning_rate": 0.00014116914262445228, + "loss": 0.9301, + "step": 9866 + }, + { + "epoch": 2.104735494880546, + "grad_norm": 0.27841894563704556, + "learning_rate": 0.00014116365783526497, + "loss": 0.9384, + "step": 9867 + }, + { + "epoch": 2.104948805460751, + "grad_norm": 0.25693781656234616, + "learning_rate": 0.00014115817235400987, + "loss": 0.9378, + "step": 9868 + }, + { + "epoch": 2.1051621160409555, + "grad_norm": 0.23468172454486136, + "learning_rate": 0.000141152686180749, + "loss": 0.933, + "step": 9869 + }, + { + "epoch": 2.1053754266211606, + "grad_norm": 0.2536358994475856, + "learning_rate": 0.00014114719931554445, + "loss": 0.911, + "step": 9870 + }, + { + "epoch": 2.1055887372013653, + "grad_norm": 0.22783629212311615, + "learning_rate": 0.0001411417117584583, + "loss": 0.9543, + "step": 9871 + }, + { + "epoch": 2.10580204778157, + "grad_norm": 0.22731566799616992, + "learning_rate": 0.0001411362235095527, + "loss": 0.918, + "step": 9872 + }, + { + "epoch": 2.1060153583617747, + "grad_norm": 0.26442915304671805, + "learning_rate": 0.00014113073456888966, + "loss": 0.9465, + "step": 9873 + }, + { + "epoch": 2.1062286689419794, + "grad_norm": 0.341528945643225, + "learning_rate": 0.00014112524493653135, + "loss": 0.9301, + "step": 9874 + }, + { + "epoch": 2.1064419795221845, + "grad_norm": 0.38996860079043993, + "learning_rate": 0.00014111975461253984, + "loss": 0.9083, + "step": 9875 + }, + { + "epoch": 2.106655290102389, + "grad_norm": 0.36523835714116815, + "learning_rate": 0.0001411142635969773, + "loss": 0.961, + "step": 9876 + }, + { + "epoch": 2.106868600682594, + "grad_norm": 0.32396240736392223, + "learning_rate": 0.00014110877188990584, + "loss": 0.9378, + "step": 9877 + }, + { + "epoch": 2.1070819112627985, + "grad_norm": 0.2661758954383537, + "learning_rate": 0.0001411032794913876, + "loss": 0.9251, + "step": 9878 + }, + { + "epoch": 2.107295221843003, + "grad_norm": 0.23805839099092155, + "learning_rate": 0.0001410977864014847, + "loss": 0.9439, + "step": 9879 + }, + { + "epoch": 2.1075085324232083, + "grad_norm": 0.29555142023656994, + "learning_rate": 0.00014109229262025935, + "loss": 0.9317, + "step": 9880 + }, + { + "epoch": 2.107721843003413, + "grad_norm": 0.4352865589983362, + "learning_rate": 0.00014108679814777366, + "loss": 0.9407, + "step": 9881 + }, + { + "epoch": 2.1079351535836177, + "grad_norm": 0.505821467966192, + "learning_rate": 0.00014108130298408982, + "loss": 0.923, + "step": 9882 + }, + { + "epoch": 2.1081484641638224, + "grad_norm": 0.48894962088320104, + "learning_rate": 0.00014107580712927, + "loss": 0.9296, + "step": 9883 + }, + { + "epoch": 2.1083617747440275, + "grad_norm": 0.5037106300807559, + "learning_rate": 0.00014107031058337646, + "loss": 0.958, + "step": 9884 + }, + { + "epoch": 2.108575085324232, + "grad_norm": 0.45703406667875196, + "learning_rate": 0.00014106481334647127, + "loss": 0.9465, + "step": 9885 + }, + { + "epoch": 2.108788395904437, + "grad_norm": 0.329929906499561, + "learning_rate": 0.0001410593154186167, + "loss": 0.94, + "step": 9886 + }, + { + "epoch": 2.1090017064846416, + "grad_norm": 0.24294380345915384, + "learning_rate": 0.00014105381679987495, + "loss": 0.9554, + "step": 9887 + }, + { + "epoch": 2.1092150170648463, + "grad_norm": 0.4458958047483025, + "learning_rate": 0.00014104831749030822, + "loss": 0.9638, + "step": 9888 + }, + { + "epoch": 2.1094283276450514, + "grad_norm": 0.4997216893845919, + "learning_rate": 0.00014104281748997878, + "loss": 0.9187, + "step": 9889 + }, + { + "epoch": 2.109641638225256, + "grad_norm": 0.46360028457127356, + "learning_rate": 0.00014103731679894883, + "loss": 0.9508, + "step": 9890 + }, + { + "epoch": 2.1098549488054608, + "grad_norm": 0.46005162343168166, + "learning_rate": 0.0001410318154172806, + "loss": 0.9079, + "step": 9891 + }, + { + "epoch": 2.1100682593856654, + "grad_norm": 0.3358164226114013, + "learning_rate": 0.00014102631334503635, + "loss": 0.9212, + "step": 9892 + }, + { + "epoch": 2.11028156996587, + "grad_norm": 0.2892633642282836, + "learning_rate": 0.00014102081058227836, + "loss": 0.915, + "step": 9893 + }, + { + "epoch": 2.1104948805460753, + "grad_norm": 0.288061094244233, + "learning_rate": 0.00014101530712906884, + "loss": 0.9679, + "step": 9894 + }, + { + "epoch": 2.11070819112628, + "grad_norm": 0.27037675119454146, + "learning_rate": 0.00014100980298547013, + "loss": 0.9427, + "step": 9895 + }, + { + "epoch": 2.1109215017064846, + "grad_norm": 0.31241096027696297, + "learning_rate": 0.00014100429815154444, + "loss": 0.9405, + "step": 9896 + }, + { + "epoch": 2.1111348122866893, + "grad_norm": 0.3356113068292399, + "learning_rate": 0.0001409987926273541, + "loss": 0.9265, + "step": 9897 + }, + { + "epoch": 2.111348122866894, + "grad_norm": 0.31189771261066773, + "learning_rate": 0.00014099328641296142, + "loss": 0.9236, + "step": 9898 + }, + { + "epoch": 2.111561433447099, + "grad_norm": 0.21932902525889308, + "learning_rate": 0.00014098777950842864, + "loss": 0.9404, + "step": 9899 + }, + { + "epoch": 2.111774744027304, + "grad_norm": 0.26783551766324565, + "learning_rate": 0.00014098227191381812, + "loss": 0.9393, + "step": 9900 + }, + { + "epoch": 2.1119880546075085, + "grad_norm": 0.3378829138424952, + "learning_rate": 0.00014097676362919217, + "loss": 0.986, + "step": 9901 + }, + { + "epoch": 2.112201365187713, + "grad_norm": 0.3390092522287043, + "learning_rate": 0.00014097125465461311, + "loss": 0.9619, + "step": 9902 + }, + { + "epoch": 2.1124146757679183, + "grad_norm": 0.31409893279901197, + "learning_rate": 0.0001409657449901433, + "loss": 0.9281, + "step": 9903 + }, + { + "epoch": 2.112627986348123, + "grad_norm": 0.3171594069989043, + "learning_rate": 0.00014096023463584505, + "loss": 0.9633, + "step": 9904 + }, + { + "epoch": 2.1128412969283277, + "grad_norm": 0.27767497938433167, + "learning_rate": 0.00014095472359178072, + "loss": 0.9212, + "step": 9905 + }, + { + "epoch": 2.1130546075085324, + "grad_norm": 0.21167516812405376, + "learning_rate": 0.00014094921185801265, + "loss": 0.934, + "step": 9906 + }, + { + "epoch": 2.113267918088737, + "grad_norm": 0.2674455184328215, + "learning_rate": 0.00014094369943460323, + "loss": 0.9462, + "step": 9907 + }, + { + "epoch": 2.113481228668942, + "grad_norm": 0.27818105518446046, + "learning_rate": 0.00014093818632161483, + "loss": 0.9559, + "step": 9908 + }, + { + "epoch": 2.113694539249147, + "grad_norm": 0.3122871334712098, + "learning_rate": 0.00014093267251910984, + "loss": 0.961, + "step": 9909 + }, + { + "epoch": 2.1139078498293515, + "grad_norm": 0.3175036835089229, + "learning_rate": 0.00014092715802715064, + "loss": 0.907, + "step": 9910 + }, + { + "epoch": 2.114121160409556, + "grad_norm": 0.38852323593106575, + "learning_rate": 0.00014092164284579958, + "loss": 0.9346, + "step": 9911 + }, + { + "epoch": 2.114334470989761, + "grad_norm": 0.4013776820336142, + "learning_rate": 0.00014091612697511913, + "loss": 0.9226, + "step": 9912 + }, + { + "epoch": 2.114547781569966, + "grad_norm": 0.40882518258022366, + "learning_rate": 0.0001409106104151717, + "loss": 0.9275, + "step": 9913 + }, + { + "epoch": 2.1147610921501707, + "grad_norm": 0.3517004945628782, + "learning_rate": 0.00014090509316601967, + "loss": 0.9477, + "step": 9914 + }, + { + "epoch": 2.1149744027303754, + "grad_norm": 0.3745849172438652, + "learning_rate": 0.00014089957522772547, + "loss": 0.9816, + "step": 9915 + }, + { + "epoch": 2.11518771331058, + "grad_norm": 0.34231577443316386, + "learning_rate": 0.0001408940566003516, + "loss": 0.9624, + "step": 9916 + }, + { + "epoch": 2.1154010238907848, + "grad_norm": 0.3104453890880471, + "learning_rate": 0.0001408885372839604, + "loss": 0.9694, + "step": 9917 + }, + { + "epoch": 2.11561433447099, + "grad_norm": 0.29270409485139887, + "learning_rate": 0.0001408830172786144, + "loss": 0.9738, + "step": 9918 + }, + { + "epoch": 2.1158276450511946, + "grad_norm": 0.300146325172041, + "learning_rate": 0.00014087749658437602, + "loss": 0.936, + "step": 9919 + }, + { + "epoch": 2.1160409556313993, + "grad_norm": 0.3238236669196727, + "learning_rate": 0.00014087197520130777, + "loss": 0.9084, + "step": 9920 + }, + { + "epoch": 2.116254266211604, + "grad_norm": 0.36876625118756073, + "learning_rate": 0.00014086645312947208, + "loss": 0.9178, + "step": 9921 + }, + { + "epoch": 2.116467576791809, + "grad_norm": 0.34089410456997904, + "learning_rate": 0.00014086093036893148, + "loss": 0.9494, + "step": 9922 + }, + { + "epoch": 2.1166808873720138, + "grad_norm": 0.28954255048671923, + "learning_rate": 0.0001408554069197484, + "loss": 0.9084, + "step": 9923 + }, + { + "epoch": 2.1168941979522184, + "grad_norm": 0.24781608444403616, + "learning_rate": 0.00014084988278198534, + "loss": 0.9498, + "step": 9924 + }, + { + "epoch": 2.117107508532423, + "grad_norm": 0.3266086040286622, + "learning_rate": 0.00014084435795570487, + "loss": 0.9417, + "step": 9925 + }, + { + "epoch": 2.117320819112628, + "grad_norm": 0.3298043179072115, + "learning_rate": 0.00014083883244096946, + "loss": 0.9591, + "step": 9926 + }, + { + "epoch": 2.117534129692833, + "grad_norm": 0.3544082310825667, + "learning_rate": 0.00014083330623784166, + "loss": 0.9436, + "step": 9927 + }, + { + "epoch": 2.1177474402730376, + "grad_norm": 0.4244929750753784, + "learning_rate": 0.00014082777934638395, + "loss": 0.9436, + "step": 9928 + }, + { + "epoch": 2.1179607508532423, + "grad_norm": 0.43697103745871596, + "learning_rate": 0.00014082225176665887, + "loss": 0.9544, + "step": 9929 + }, + { + "epoch": 2.118174061433447, + "grad_norm": 0.41958116106086607, + "learning_rate": 0.00014081672349872901, + "loss": 0.9459, + "step": 9930 + }, + { + "epoch": 2.1183873720136517, + "grad_norm": 0.33266734684502614, + "learning_rate": 0.00014081119454265693, + "loss": 0.9456, + "step": 9931 + }, + { + "epoch": 2.118600682593857, + "grad_norm": 0.2334796226259575, + "learning_rate": 0.00014080566489850513, + "loss": 0.9602, + "step": 9932 + }, + { + "epoch": 2.1188139931740615, + "grad_norm": 0.2540172766522661, + "learning_rate": 0.0001408001345663362, + "loss": 0.9584, + "step": 9933 + }, + { + "epoch": 2.119027303754266, + "grad_norm": 0.3347253302195731, + "learning_rate": 0.00014079460354621276, + "loss": 0.9255, + "step": 9934 + }, + { + "epoch": 2.119240614334471, + "grad_norm": 0.33556300598156064, + "learning_rate": 0.00014078907183819733, + "loss": 0.9752, + "step": 9935 + }, + { + "epoch": 2.1194539249146755, + "grad_norm": 0.342867586471945, + "learning_rate": 0.00014078353944235253, + "loss": 0.907, + "step": 9936 + }, + { + "epoch": 2.1196672354948807, + "grad_norm": 0.34246891006875313, + "learning_rate": 0.00014077800635874097, + "loss": 0.9466, + "step": 9937 + }, + { + "epoch": 2.1198805460750854, + "grad_norm": 0.2574368351690644, + "learning_rate": 0.00014077247258742525, + "loss": 0.9254, + "step": 9938 + }, + { + "epoch": 2.12009385665529, + "grad_norm": 0.20141515887342373, + "learning_rate": 0.00014076693812846796, + "loss": 0.9329, + "step": 9939 + }, + { + "epoch": 2.1203071672354947, + "grad_norm": 0.26640788529731124, + "learning_rate": 0.00014076140298193178, + "loss": 0.9209, + "step": 9940 + }, + { + "epoch": 2.1205204778157, + "grad_norm": 0.28834429612654094, + "learning_rate": 0.00014075586714787925, + "loss": 0.924, + "step": 9941 + }, + { + "epoch": 2.1207337883959045, + "grad_norm": 0.2259394000487848, + "learning_rate": 0.0001407503306263731, + "loss": 0.9093, + "step": 9942 + }, + { + "epoch": 2.1209470989761092, + "grad_norm": 0.2897104023875135, + "learning_rate": 0.00014074479341747595, + "loss": 0.9482, + "step": 9943 + }, + { + "epoch": 2.121160409556314, + "grad_norm": 0.22134689476665295, + "learning_rate": 0.00014073925552125042, + "loss": 0.942, + "step": 9944 + }, + { + "epoch": 2.1213737201365186, + "grad_norm": 0.23255897041704518, + "learning_rate": 0.00014073371693775917, + "loss": 0.9363, + "step": 9945 + }, + { + "epoch": 2.1215870307167237, + "grad_norm": 0.24955178998556252, + "learning_rate": 0.0001407281776670649, + "loss": 0.9318, + "step": 9946 + }, + { + "epoch": 2.1218003412969284, + "grad_norm": 0.23093501222743437, + "learning_rate": 0.0001407226377092303, + "loss": 0.9281, + "step": 9947 + }, + { + "epoch": 2.122013651877133, + "grad_norm": 0.2154300608120038, + "learning_rate": 0.000140717097064318, + "loss": 0.9278, + "step": 9948 + }, + { + "epoch": 2.1222269624573378, + "grad_norm": 0.25457252528634633, + "learning_rate": 0.00014071155573239079, + "loss": 0.9269, + "step": 9949 + }, + { + "epoch": 2.1224402730375425, + "grad_norm": 0.20113251632265966, + "learning_rate": 0.00014070601371351125, + "loss": 0.9211, + "step": 9950 + }, + { + "epoch": 2.1226535836177476, + "grad_norm": 0.2744621435331522, + "learning_rate": 0.00014070047100774215, + "loss": 0.9238, + "step": 9951 + }, + { + "epoch": 2.1228668941979523, + "grad_norm": 0.26809912889975324, + "learning_rate": 0.00014069492761514623, + "loss": 0.9234, + "step": 9952 + }, + { + "epoch": 2.123080204778157, + "grad_norm": 0.22193997830472614, + "learning_rate": 0.00014068938353578613, + "loss": 0.9447, + "step": 9953 + }, + { + "epoch": 2.1232935153583616, + "grad_norm": 0.2487736782431418, + "learning_rate": 0.0001406838387697247, + "loss": 0.9179, + "step": 9954 + }, + { + "epoch": 2.1235068259385668, + "grad_norm": 0.2497640745198754, + "learning_rate": 0.00014067829331702457, + "loss": 0.9328, + "step": 9955 + }, + { + "epoch": 2.1237201365187715, + "grad_norm": 0.26873170057040335, + "learning_rate": 0.00014067274717774852, + "loss": 0.9341, + "step": 9956 + }, + { + "epoch": 2.123933447098976, + "grad_norm": 0.2608108603041064, + "learning_rate": 0.00014066720035195933, + "loss": 0.9021, + "step": 9957 + }, + { + "epoch": 2.124146757679181, + "grad_norm": 0.2666802190790921, + "learning_rate": 0.00014066165283971977, + "loss": 0.9454, + "step": 9958 + }, + { + "epoch": 2.1243600682593855, + "grad_norm": 0.2678221383940594, + "learning_rate": 0.00014065610464109257, + "loss": 0.9477, + "step": 9959 + }, + { + "epoch": 2.1245733788395906, + "grad_norm": 0.2822767080919715, + "learning_rate": 0.00014065055575614052, + "loss": 0.9522, + "step": 9960 + }, + { + "epoch": 2.1247866894197953, + "grad_norm": 0.29640799897042486, + "learning_rate": 0.0001406450061849264, + "loss": 0.9108, + "step": 9961 + }, + { + "epoch": 2.125, + "grad_norm": 0.2746287608485908, + "learning_rate": 0.00014063945592751305, + "loss": 0.9233, + "step": 9962 + }, + { + "epoch": 2.1252133105802047, + "grad_norm": 0.22626155231120315, + "learning_rate": 0.0001406339049839632, + "loss": 0.9443, + "step": 9963 + }, + { + "epoch": 2.1254266211604094, + "grad_norm": 0.21938340781866594, + "learning_rate": 0.00014062835335433972, + "loss": 0.9418, + "step": 9964 + }, + { + "epoch": 2.1256399317406145, + "grad_norm": 0.2142379686983595, + "learning_rate": 0.0001406228010387054, + "loss": 0.9147, + "step": 9965 + }, + { + "epoch": 2.125853242320819, + "grad_norm": 0.24438023705357317, + "learning_rate": 0.00014061724803712307, + "loss": 0.9405, + "step": 9966 + }, + { + "epoch": 2.126066552901024, + "grad_norm": 0.2119996539034949, + "learning_rate": 0.00014061169434965553, + "loss": 0.9308, + "step": 9967 + }, + { + "epoch": 2.1262798634812285, + "grad_norm": 0.24319534975109383, + "learning_rate": 0.00014060613997636566, + "loss": 0.9089, + "step": 9968 + }, + { + "epoch": 2.1264931740614337, + "grad_norm": 0.25273634292981634, + "learning_rate": 0.00014060058491731633, + "loss": 0.9664, + "step": 9969 + }, + { + "epoch": 2.1267064846416384, + "grad_norm": 0.24922655954987954, + "learning_rate": 0.00014059502917257032, + "loss": 0.9588, + "step": 9970 + }, + { + "epoch": 2.126919795221843, + "grad_norm": 0.27518631005474703, + "learning_rate": 0.00014058947274219055, + "loss": 0.9306, + "step": 9971 + }, + { + "epoch": 2.1271331058020477, + "grad_norm": 0.27888854456574014, + "learning_rate": 0.0001405839156262399, + "loss": 0.9381, + "step": 9972 + }, + { + "epoch": 2.1273464163822524, + "grad_norm": 0.2580107541422084, + "learning_rate": 0.0001405783578247812, + "loss": 0.9097, + "step": 9973 + }, + { + "epoch": 2.1275597269624575, + "grad_norm": 0.28792708952699964, + "learning_rate": 0.00014057279933787738, + "loss": 0.9196, + "step": 9974 + }, + { + "epoch": 2.1277730375426622, + "grad_norm": 0.2769928600310481, + "learning_rate": 0.0001405672401655913, + "loss": 0.9409, + "step": 9975 + }, + { + "epoch": 2.127986348122867, + "grad_norm": 0.2818691514632741, + "learning_rate": 0.00014056168030798587, + "loss": 0.9422, + "step": 9976 + }, + { + "epoch": 2.1281996587030716, + "grad_norm": 0.29892563171320785, + "learning_rate": 0.00014055611976512405, + "loss": 0.9569, + "step": 9977 + }, + { + "epoch": 2.1284129692832763, + "grad_norm": 0.23843816626194625, + "learning_rate": 0.00014055055853706867, + "loss": 0.9647, + "step": 9978 + }, + { + "epoch": 2.1286262798634814, + "grad_norm": 0.20051890759403368, + "learning_rate": 0.00014054499662388272, + "loss": 0.8938, + "step": 9979 + }, + { + "epoch": 2.128839590443686, + "grad_norm": 0.24919566375225202, + "learning_rate": 0.00014053943402562912, + "loss": 0.9639, + "step": 9980 + }, + { + "epoch": 2.1290529010238908, + "grad_norm": 0.24417646455681793, + "learning_rate": 0.00014053387074237082, + "loss": 0.9277, + "step": 9981 + }, + { + "epoch": 2.1292662116040955, + "grad_norm": 0.22159492613916262, + "learning_rate": 0.00014052830677417072, + "loss": 0.9495, + "step": 9982 + }, + { + "epoch": 2.1294795221843, + "grad_norm": 0.23203212258183245, + "learning_rate": 0.00014052274212109181, + "loss": 0.9142, + "step": 9983 + }, + { + "epoch": 2.1296928327645053, + "grad_norm": 0.23836582396952313, + "learning_rate": 0.0001405171767831971, + "loss": 0.9246, + "step": 9984 + }, + { + "epoch": 2.12990614334471, + "grad_norm": 0.2076989911265266, + "learning_rate": 0.00014051161076054946, + "loss": 0.9455, + "step": 9985 + }, + { + "epoch": 2.1301194539249146, + "grad_norm": 0.23434082714956733, + "learning_rate": 0.00014050604405321196, + "loss": 0.9463, + "step": 9986 + }, + { + "epoch": 2.1303327645051193, + "grad_norm": 0.23339870587495912, + "learning_rate": 0.00014050047666124755, + "loss": 0.9181, + "step": 9987 + }, + { + "epoch": 2.1305460750853245, + "grad_norm": 0.23507265668420013, + "learning_rate": 0.00014049490858471922, + "loss": 0.9432, + "step": 9988 + }, + { + "epoch": 2.130759385665529, + "grad_norm": 0.2652644052410818, + "learning_rate": 0.00014048933982368998, + "loss": 0.9578, + "step": 9989 + }, + { + "epoch": 2.130972696245734, + "grad_norm": 0.2718819355628995, + "learning_rate": 0.00014048377037822286, + "loss": 0.9702, + "step": 9990 + }, + { + "epoch": 2.1311860068259385, + "grad_norm": 0.38207351680498275, + "learning_rate": 0.00014047820024838087, + "loss": 0.9364, + "step": 9991 + }, + { + "epoch": 2.131399317406143, + "grad_norm": 0.459082300499466, + "learning_rate": 0.00014047262943422698, + "loss": 0.9023, + "step": 9992 + }, + { + "epoch": 2.1316126279863483, + "grad_norm": 0.44718752257638467, + "learning_rate": 0.00014046705793582432, + "loss": 0.9137, + "step": 9993 + }, + { + "epoch": 2.131825938566553, + "grad_norm": 0.42690103520414097, + "learning_rate": 0.00014046148575323585, + "loss": 0.9164, + "step": 9994 + }, + { + "epoch": 2.1320392491467577, + "grad_norm": 0.3479298714136753, + "learning_rate": 0.00014045591288652467, + "loss": 0.8905, + "step": 9995 + }, + { + "epoch": 2.1322525597269624, + "grad_norm": 0.2290665981326149, + "learning_rate": 0.00014045033933575383, + "loss": 0.9277, + "step": 9996 + }, + { + "epoch": 2.132465870307167, + "grad_norm": 0.2296722153302959, + "learning_rate": 0.00014044476510098635, + "loss": 0.9456, + "step": 9997 + }, + { + "epoch": 2.132679180887372, + "grad_norm": 0.27244895548520276, + "learning_rate": 0.00014043919018228538, + "loss": 0.9304, + "step": 9998 + }, + { + "epoch": 2.132892491467577, + "grad_norm": 0.3715721126614381, + "learning_rate": 0.00014043361457971392, + "loss": 0.948, + "step": 9999 + }, + { + "epoch": 2.1331058020477816, + "grad_norm": 0.4543216157441006, + "learning_rate": 0.0001404280382933351, + "loss": 0.9367, + "step": 10000 + }, + { + "epoch": 2.1333191126279862, + "grad_norm": 0.45913806741003615, + "learning_rate": 0.00014042246132321202, + "loss": 0.9492, + "step": 10001 + }, + { + "epoch": 2.133532423208191, + "grad_norm": 0.3757215020791551, + "learning_rate": 0.00014041688366940775, + "loss": 0.8989, + "step": 10002 + }, + { + "epoch": 2.133745733788396, + "grad_norm": 0.29227412543145526, + "learning_rate": 0.00014041130533198545, + "loss": 0.9651, + "step": 10003 + }, + { + "epoch": 2.1339590443686007, + "grad_norm": 0.31151645626131774, + "learning_rate": 0.0001404057263110082, + "loss": 0.9494, + "step": 10004 + }, + { + "epoch": 2.1341723549488054, + "grad_norm": 0.30718215853789765, + "learning_rate": 0.00014040014660653913, + "loss": 0.9264, + "step": 10005 + }, + { + "epoch": 2.13438566552901, + "grad_norm": 0.329195839109097, + "learning_rate": 0.0001403945662186414, + "loss": 0.8879, + "step": 10006 + }, + { + "epoch": 2.1345989761092152, + "grad_norm": 0.3037040664731422, + "learning_rate": 0.00014038898514737817, + "loss": 0.9303, + "step": 10007 + }, + { + "epoch": 2.13481228668942, + "grad_norm": 0.29931012743648866, + "learning_rate": 0.0001403834033928125, + "loss": 0.942, + "step": 10008 + }, + { + "epoch": 2.1350255972696246, + "grad_norm": 0.4205204571024811, + "learning_rate": 0.00014037782095500763, + "loss": 0.9348, + "step": 10009 + }, + { + "epoch": 2.1352389078498293, + "grad_norm": 0.4761413025043348, + "learning_rate": 0.00014037223783402671, + "loss": 0.9549, + "step": 10010 + }, + { + "epoch": 2.135452218430034, + "grad_norm": 0.3877390048836435, + "learning_rate": 0.00014036665402993292, + "loss": 0.9004, + "step": 10011 + }, + { + "epoch": 2.135665529010239, + "grad_norm": 0.3163437736883823, + "learning_rate": 0.0001403610695427894, + "loss": 0.9419, + "step": 10012 + }, + { + "epoch": 2.135878839590444, + "grad_norm": 0.28579155513474624, + "learning_rate": 0.00014035548437265935, + "loss": 0.9351, + "step": 10013 + }, + { + "epoch": 2.1360921501706485, + "grad_norm": 0.3080858844436437, + "learning_rate": 0.000140349898519606, + "loss": 0.9346, + "step": 10014 + }, + { + "epoch": 2.136305460750853, + "grad_norm": 0.27676743396123094, + "learning_rate": 0.00014034431198369253, + "loss": 0.9487, + "step": 10015 + }, + { + "epoch": 2.136518771331058, + "grad_norm": 0.3432798430953738, + "learning_rate": 0.00014033872476498216, + "loss": 0.9036, + "step": 10016 + }, + { + "epoch": 2.136732081911263, + "grad_norm": 0.41244754707016196, + "learning_rate": 0.00014033313686353808, + "loss": 0.9284, + "step": 10017 + }, + { + "epoch": 2.1369453924914676, + "grad_norm": 0.37711291844906913, + "learning_rate": 0.00014032754827942356, + "loss": 0.9317, + "step": 10018 + }, + { + "epoch": 2.1371587030716723, + "grad_norm": 0.31127602025588896, + "learning_rate": 0.0001403219590127018, + "loss": 0.9392, + "step": 10019 + }, + { + "epoch": 2.137372013651877, + "grad_norm": 0.32883397760107763, + "learning_rate": 0.0001403163690634361, + "loss": 0.9322, + "step": 10020 + }, + { + "epoch": 2.1375853242320817, + "grad_norm": 0.2525415965319467, + "learning_rate": 0.00014031077843168962, + "loss": 0.9521, + "step": 10021 + }, + { + "epoch": 2.137798634812287, + "grad_norm": 0.2503970129326179, + "learning_rate": 0.00014030518711752568, + "loss": 0.9478, + "step": 10022 + }, + { + "epoch": 2.1380119453924915, + "grad_norm": 0.24392409257136516, + "learning_rate": 0.00014029959512100755, + "loss": 0.9358, + "step": 10023 + }, + { + "epoch": 2.138225255972696, + "grad_norm": 0.23067382492467117, + "learning_rate": 0.00014029400244219847, + "loss": 0.9281, + "step": 10024 + }, + { + "epoch": 2.138438566552901, + "grad_norm": 0.29521642652150765, + "learning_rate": 0.00014028840908116174, + "loss": 0.9218, + "step": 10025 + }, + { + "epoch": 2.138651877133106, + "grad_norm": 0.2825964764924307, + "learning_rate": 0.00014028281503796066, + "loss": 0.9415, + "step": 10026 + }, + { + "epoch": 2.1388651877133107, + "grad_norm": 0.23739620152125318, + "learning_rate": 0.0001402772203126585, + "loss": 0.9087, + "step": 10027 + }, + { + "epoch": 2.1390784982935154, + "grad_norm": 0.22848499423447854, + "learning_rate": 0.0001402716249053186, + "loss": 0.9261, + "step": 10028 + }, + { + "epoch": 2.13929180887372, + "grad_norm": 0.24332812500182152, + "learning_rate": 0.00014026602881600422, + "loss": 0.9494, + "step": 10029 + }, + { + "epoch": 2.1395051194539247, + "grad_norm": 0.22929043673280342, + "learning_rate": 0.00014026043204477877, + "loss": 0.9579, + "step": 10030 + }, + { + "epoch": 2.13971843003413, + "grad_norm": 0.23517988564593967, + "learning_rate": 0.00014025483459170545, + "loss": 0.9307, + "step": 10031 + }, + { + "epoch": 2.1399317406143346, + "grad_norm": 0.31761720998133, + "learning_rate": 0.0001402492364568477, + "loss": 0.9701, + "step": 10032 + }, + { + "epoch": 2.1401450511945392, + "grad_norm": 0.4113244717754337, + "learning_rate": 0.0001402436376402688, + "loss": 0.934, + "step": 10033 + }, + { + "epoch": 2.140358361774744, + "grad_norm": 0.4904730916752199, + "learning_rate": 0.00014023803814203214, + "loss": 0.944, + "step": 10034 + }, + { + "epoch": 2.1405716723549486, + "grad_norm": 0.5126249333208674, + "learning_rate": 0.00014023243796220107, + "loss": 0.9362, + "step": 10035 + }, + { + "epoch": 2.1407849829351537, + "grad_norm": 0.515140432971491, + "learning_rate": 0.00014022683710083895, + "loss": 0.9305, + "step": 10036 + }, + { + "epoch": 2.1409982935153584, + "grad_norm": 0.4910092323813251, + "learning_rate": 0.00014022123555800916, + "loss": 0.9738, + "step": 10037 + }, + { + "epoch": 2.141211604095563, + "grad_norm": 0.4318889692785904, + "learning_rate": 0.00014021563333377506, + "loss": 0.9416, + "step": 10038 + }, + { + "epoch": 2.141424914675768, + "grad_norm": 0.3202073036669081, + "learning_rate": 0.00014021003042820007, + "loss": 0.9648, + "step": 10039 + }, + { + "epoch": 2.1416382252559725, + "grad_norm": 0.24850432549612606, + "learning_rate": 0.0001402044268413476, + "loss": 0.9194, + "step": 10040 + }, + { + "epoch": 2.1418515358361776, + "grad_norm": 0.2025568735408779, + "learning_rate": 0.00014019882257328096, + "loss": 0.9657, + "step": 10041 + }, + { + "epoch": 2.1420648464163823, + "grad_norm": 0.29070023267742556, + "learning_rate": 0.00014019321762406367, + "loss": 0.9183, + "step": 10042 + }, + { + "epoch": 2.142278156996587, + "grad_norm": 0.34312169886878396, + "learning_rate": 0.00014018761199375911, + "loss": 0.9304, + "step": 10043 + }, + { + "epoch": 2.1424914675767917, + "grad_norm": 0.3606892165265845, + "learning_rate": 0.00014018200568243072, + "loss": 0.9545, + "step": 10044 + }, + { + "epoch": 2.142704778156997, + "grad_norm": 0.3799755383617487, + "learning_rate": 0.00014017639869014188, + "loss": 0.9371, + "step": 10045 + }, + { + "epoch": 2.1429180887372015, + "grad_norm": 0.3612170374827153, + "learning_rate": 0.00014017079101695613, + "loss": 0.9487, + "step": 10046 + }, + { + "epoch": 2.143131399317406, + "grad_norm": 0.3708157697637494, + "learning_rate": 0.00014016518266293682, + "loss": 0.9657, + "step": 10047 + }, + { + "epoch": 2.143344709897611, + "grad_norm": 0.32653770583583697, + "learning_rate": 0.00014015957362814748, + "loss": 0.9471, + "step": 10048 + }, + { + "epoch": 2.1435580204778155, + "grad_norm": 0.2640235738866227, + "learning_rate": 0.00014015396391265156, + "loss": 0.9346, + "step": 10049 + }, + { + "epoch": 2.1437713310580206, + "grad_norm": 0.24895869044497834, + "learning_rate": 0.00014014835351651252, + "loss": 0.9239, + "step": 10050 + }, + { + "epoch": 2.1439846416382253, + "grad_norm": 0.24793062065329016, + "learning_rate": 0.00014014274243979384, + "loss": 0.932, + "step": 10051 + }, + { + "epoch": 2.14419795221843, + "grad_norm": 0.22380839813452327, + "learning_rate": 0.00014013713068255902, + "loss": 0.9311, + "step": 10052 + }, + { + "epoch": 2.1444112627986347, + "grad_norm": 0.2528035345672608, + "learning_rate": 0.00014013151824487157, + "loss": 0.9479, + "step": 10053 + }, + { + "epoch": 2.1446245733788394, + "grad_norm": 0.22980102429888913, + "learning_rate": 0.00014012590512679497, + "loss": 0.9247, + "step": 10054 + }, + { + "epoch": 2.1448378839590445, + "grad_norm": 0.26108987088259883, + "learning_rate": 0.00014012029132839273, + "loss": 0.9258, + "step": 10055 + }, + { + "epoch": 2.145051194539249, + "grad_norm": 0.24942815229257997, + "learning_rate": 0.0001401146768497284, + "loss": 0.968, + "step": 10056 + }, + { + "epoch": 2.145264505119454, + "grad_norm": 0.2069186529503997, + "learning_rate": 0.00014010906169086547, + "loss": 0.9105, + "step": 10057 + }, + { + "epoch": 2.1454778156996586, + "grad_norm": 0.21688497836567416, + "learning_rate": 0.0001401034458518675, + "loss": 0.9479, + "step": 10058 + }, + { + "epoch": 2.1456911262798632, + "grad_norm": 0.2456681350577818, + "learning_rate": 0.00014009782933279804, + "loss": 0.9393, + "step": 10059 + }, + { + "epoch": 2.1459044368600684, + "grad_norm": 0.2668187267486142, + "learning_rate": 0.00014009221213372064, + "loss": 0.9185, + "step": 10060 + }, + { + "epoch": 2.146117747440273, + "grad_norm": 0.3544819211284908, + "learning_rate": 0.00014008659425469885, + "loss": 0.97, + "step": 10061 + }, + { + "epoch": 2.1463310580204777, + "grad_norm": 0.36010182578210165, + "learning_rate": 0.00014008097569579618, + "loss": 0.8984, + "step": 10062 + }, + { + "epoch": 2.1465443686006824, + "grad_norm": 0.3739520571514855, + "learning_rate": 0.00014007535645707634, + "loss": 0.9503, + "step": 10063 + }, + { + "epoch": 2.1467576791808876, + "grad_norm": 0.2864442769009886, + "learning_rate": 0.00014006973653860278, + "loss": 0.92, + "step": 10064 + }, + { + "epoch": 2.1469709897610922, + "grad_norm": 0.2939648287196535, + "learning_rate": 0.00014006411594043915, + "loss": 0.9356, + "step": 10065 + }, + { + "epoch": 2.147184300341297, + "grad_norm": 0.3999832361686439, + "learning_rate": 0.00014005849466264904, + "loss": 0.9121, + "step": 10066 + }, + { + "epoch": 2.1473976109215016, + "grad_norm": 0.4020333211368907, + "learning_rate": 0.00014005287270529604, + "loss": 0.9018, + "step": 10067 + }, + { + "epoch": 2.1476109215017063, + "grad_norm": 0.38984389563583005, + "learning_rate": 0.00014004725006844379, + "loss": 0.9238, + "step": 10068 + }, + { + "epoch": 2.1478242320819114, + "grad_norm": 0.3514248995113033, + "learning_rate": 0.0001400416267521559, + "loss": 0.9408, + "step": 10069 + }, + { + "epoch": 2.148037542662116, + "grad_norm": 0.30696863213129166, + "learning_rate": 0.000140036002756496, + "loss": 0.9317, + "step": 10070 + }, + { + "epoch": 2.148250853242321, + "grad_norm": 0.263945818962811, + "learning_rate": 0.0001400303780815277, + "loss": 0.9117, + "step": 10071 + }, + { + "epoch": 2.1484641638225255, + "grad_norm": 0.24463468280391973, + "learning_rate": 0.00014002475272731468, + "loss": 0.9271, + "step": 10072 + }, + { + "epoch": 2.1486774744027306, + "grad_norm": 0.2316797233840103, + "learning_rate": 0.00014001912669392056, + "loss": 0.9601, + "step": 10073 + }, + { + "epoch": 2.1488907849829353, + "grad_norm": 0.21847078955150118, + "learning_rate": 0.000140013499981409, + "loss": 0.9283, + "step": 10074 + }, + { + "epoch": 2.14910409556314, + "grad_norm": 0.2520504067870295, + "learning_rate": 0.00014000787258984371, + "loss": 0.9266, + "step": 10075 + }, + { + "epoch": 2.1493174061433447, + "grad_norm": 0.24905428487040326, + "learning_rate": 0.0001400022445192883, + "loss": 0.9271, + "step": 10076 + }, + { + "epoch": 2.1495307167235493, + "grad_norm": 0.25198674808679883, + "learning_rate": 0.00013999661576980652, + "loss": 0.9427, + "step": 10077 + }, + { + "epoch": 2.1497440273037545, + "grad_norm": 0.21657410992856826, + "learning_rate": 0.00013999098634146198, + "loss": 0.9464, + "step": 10078 + }, + { + "epoch": 2.149957337883959, + "grad_norm": 0.2556759098755232, + "learning_rate": 0.00013998535623431846, + "loss": 0.9191, + "step": 10079 + }, + { + "epoch": 2.150170648464164, + "grad_norm": 0.31412683444911077, + "learning_rate": 0.0001399797254484396, + "loss": 0.8909, + "step": 10080 + }, + { + "epoch": 2.1503839590443685, + "grad_norm": 0.3317153265193369, + "learning_rate": 0.00013997409398388915, + "loss": 0.9433, + "step": 10081 + }, + { + "epoch": 2.150597269624573, + "grad_norm": 0.34407035766138944, + "learning_rate": 0.0001399684618407308, + "loss": 0.9194, + "step": 10082 + }, + { + "epoch": 2.1508105802047783, + "grad_norm": 0.3743217602374493, + "learning_rate": 0.00013996282901902832, + "loss": 0.9395, + "step": 10083 + }, + { + "epoch": 2.151023890784983, + "grad_norm": 0.4384737021353618, + "learning_rate": 0.00013995719551884543, + "loss": 0.8863, + "step": 10084 + }, + { + "epoch": 2.1512372013651877, + "grad_norm": 0.4430528554741814, + "learning_rate": 0.0001399515613402458, + "loss": 0.9126, + "step": 10085 + }, + { + "epoch": 2.1514505119453924, + "grad_norm": 0.421294204657606, + "learning_rate": 0.0001399459264832933, + "loss": 0.9258, + "step": 10086 + }, + { + "epoch": 2.151663822525597, + "grad_norm": 0.31089101059119506, + "learning_rate": 0.00013994029094805164, + "loss": 0.9123, + "step": 10087 + }, + { + "epoch": 2.151877133105802, + "grad_norm": 0.30105940784480045, + "learning_rate": 0.00013993465473458456, + "loss": 0.9321, + "step": 10088 + }, + { + "epoch": 2.152090443686007, + "grad_norm": 0.34126336863243295, + "learning_rate": 0.00013992901784295585, + "loss": 0.9412, + "step": 10089 + }, + { + "epoch": 2.1523037542662116, + "grad_norm": 0.3211945863262654, + "learning_rate": 0.00013992338027322932, + "loss": 0.9124, + "step": 10090 + }, + { + "epoch": 2.1525170648464163, + "grad_norm": 0.3010012568578368, + "learning_rate": 0.0001399177420254687, + "loss": 0.9281, + "step": 10091 + }, + { + "epoch": 2.1527303754266214, + "grad_norm": 0.38956599274289183, + "learning_rate": 0.00013991210309973782, + "loss": 0.9277, + "step": 10092 + }, + { + "epoch": 2.152943686006826, + "grad_norm": 0.44803184269238705, + "learning_rate": 0.00013990646349610052, + "loss": 0.9327, + "step": 10093 + }, + { + "epoch": 2.1531569965870307, + "grad_norm": 0.4045070629951243, + "learning_rate": 0.00013990082321462056, + "loss": 0.93, + "step": 10094 + }, + { + "epoch": 2.1533703071672354, + "grad_norm": 0.36385312431251254, + "learning_rate": 0.00013989518225536178, + "loss": 0.9428, + "step": 10095 + }, + { + "epoch": 2.15358361774744, + "grad_norm": 0.3869426689467086, + "learning_rate": 0.000139889540618388, + "loss": 0.9329, + "step": 10096 + }, + { + "epoch": 2.1537969283276452, + "grad_norm": 0.40827717846627143, + "learning_rate": 0.00013988389830376307, + "loss": 0.9471, + "step": 10097 + }, + { + "epoch": 2.15401023890785, + "grad_norm": 0.3883105506910038, + "learning_rate": 0.0001398782553115508, + "loss": 0.9498, + "step": 10098 + }, + { + "epoch": 2.1542235494880546, + "grad_norm": 0.39182941309640895, + "learning_rate": 0.00013987261164181508, + "loss": 0.9246, + "step": 10099 + }, + { + "epoch": 2.1544368600682593, + "grad_norm": 0.35449820354040734, + "learning_rate": 0.00013986696729461977, + "loss": 0.9434, + "step": 10100 + }, + { + "epoch": 2.154650170648464, + "grad_norm": 0.30535670697434947, + "learning_rate": 0.00013986132227002872, + "loss": 0.9458, + "step": 10101 + }, + { + "epoch": 2.154863481228669, + "grad_norm": 0.258864498190236, + "learning_rate": 0.00013985567656810577, + "loss": 0.9434, + "step": 10102 + }, + { + "epoch": 2.155076791808874, + "grad_norm": 0.20646506980392887, + "learning_rate": 0.00013985003018891483, + "loss": 0.9059, + "step": 10103 + }, + { + "epoch": 2.1552901023890785, + "grad_norm": 0.2552010014804012, + "learning_rate": 0.00013984438313251982, + "loss": 0.92, + "step": 10104 + }, + { + "epoch": 2.155503412969283, + "grad_norm": 0.30920090971372066, + "learning_rate": 0.00013983873539898459, + "loss": 0.9315, + "step": 10105 + }, + { + "epoch": 2.155716723549488, + "grad_norm": 0.3453622560986373, + "learning_rate": 0.00013983308698837306, + "loss": 0.9167, + "step": 10106 + }, + { + "epoch": 2.155930034129693, + "grad_norm": 0.39110150831764784, + "learning_rate": 0.00013982743790074916, + "loss": 0.9164, + "step": 10107 + }, + { + "epoch": 2.1561433447098977, + "grad_norm": 0.40999416247888326, + "learning_rate": 0.0001398217881361768, + "loss": 0.9248, + "step": 10108 + }, + { + "epoch": 2.1563566552901023, + "grad_norm": 0.29978773624060717, + "learning_rate": 0.0001398161376947199, + "loss": 0.9586, + "step": 10109 + }, + { + "epoch": 2.156569965870307, + "grad_norm": 0.2765036356834616, + "learning_rate": 0.00013981048657644234, + "loss": 0.9436, + "step": 10110 + }, + { + "epoch": 2.156783276450512, + "grad_norm": 0.26132744393531726, + "learning_rate": 0.0001398048347814082, + "loss": 0.9481, + "step": 10111 + }, + { + "epoch": 2.156996587030717, + "grad_norm": 0.283749873766784, + "learning_rate": 0.0001397991823096813, + "loss": 0.917, + "step": 10112 + }, + { + "epoch": 2.1572098976109215, + "grad_norm": 0.35473893535437423, + "learning_rate": 0.00013979352916132563, + "loss": 0.958, + "step": 10113 + }, + { + "epoch": 2.157423208191126, + "grad_norm": 0.37294280344150016, + "learning_rate": 0.00013978787533640522, + "loss": 0.9539, + "step": 10114 + }, + { + "epoch": 2.157636518771331, + "grad_norm": 0.47809426748492445, + "learning_rate": 0.00013978222083498395, + "loss": 0.9425, + "step": 10115 + }, + { + "epoch": 2.157849829351536, + "grad_norm": 0.4828474962927421, + "learning_rate": 0.00013977656565712591, + "loss": 0.9392, + "step": 10116 + }, + { + "epoch": 2.1580631399317407, + "grad_norm": 0.40538634244009253, + "learning_rate": 0.00013977090980289497, + "loss": 0.9276, + "step": 10117 + }, + { + "epoch": 2.1582764505119454, + "grad_norm": 0.3000499798200665, + "learning_rate": 0.00013976525327235518, + "loss": 0.9388, + "step": 10118 + }, + { + "epoch": 2.15848976109215, + "grad_norm": 0.2969841750876601, + "learning_rate": 0.00013975959606557056, + "loss": 0.9702, + "step": 10119 + }, + { + "epoch": 2.1587030716723548, + "grad_norm": 0.32553504604634437, + "learning_rate": 0.00013975393818260512, + "loss": 0.9496, + "step": 10120 + }, + { + "epoch": 2.15891638225256, + "grad_norm": 0.39669166922941446, + "learning_rate": 0.00013974827962352283, + "loss": 0.9214, + "step": 10121 + }, + { + "epoch": 2.1591296928327646, + "grad_norm": 0.4283581093132599, + "learning_rate": 0.0001397426203883878, + "loss": 0.9028, + "step": 10122 + }, + { + "epoch": 2.1593430034129693, + "grad_norm": 0.38649220380263416, + "learning_rate": 0.00013973696047726397, + "loss": 0.9064, + "step": 10123 + }, + { + "epoch": 2.159556313993174, + "grad_norm": 0.35084881821677777, + "learning_rate": 0.00013973129989021542, + "loss": 0.9179, + "step": 10124 + }, + { + "epoch": 2.1597696245733786, + "grad_norm": 0.3144844358982764, + "learning_rate": 0.00013972563862730622, + "loss": 0.9101, + "step": 10125 + }, + { + "epoch": 2.1599829351535837, + "grad_norm": 0.2547607091908404, + "learning_rate": 0.00013971997668860045, + "loss": 0.9437, + "step": 10126 + }, + { + "epoch": 2.1601962457337884, + "grad_norm": 0.2948378980937493, + "learning_rate": 0.0001397143140741621, + "loss": 0.9426, + "step": 10127 + }, + { + "epoch": 2.160409556313993, + "grad_norm": 0.4201407090978129, + "learning_rate": 0.00013970865078405528, + "loss": 0.9087, + "step": 10128 + }, + { + "epoch": 2.160622866894198, + "grad_norm": 0.42396026788486124, + "learning_rate": 0.0001397029868183441, + "loss": 0.9247, + "step": 10129 + }, + { + "epoch": 2.160836177474403, + "grad_norm": 0.32406090316566116, + "learning_rate": 0.0001396973221770926, + "loss": 0.9453, + "step": 10130 + }, + { + "epoch": 2.1610494880546076, + "grad_norm": 0.35290688092137606, + "learning_rate": 0.0001396916568603649, + "loss": 0.9319, + "step": 10131 + }, + { + "epoch": 2.1612627986348123, + "grad_norm": 0.4221234980470039, + "learning_rate": 0.0001396859908682251, + "loss": 0.9168, + "step": 10132 + }, + { + "epoch": 2.161476109215017, + "grad_norm": 0.3332747503155598, + "learning_rate": 0.00013968032420073732, + "loss": 0.9337, + "step": 10133 + }, + { + "epoch": 2.1616894197952217, + "grad_norm": 0.3700190924166926, + "learning_rate": 0.00013967465685796565, + "loss": 0.954, + "step": 10134 + }, + { + "epoch": 2.161902730375427, + "grad_norm": 0.45766298826600377, + "learning_rate": 0.00013966898883997425, + "loss": 0.9313, + "step": 10135 + }, + { + "epoch": 2.1621160409556315, + "grad_norm": 0.43463343045900316, + "learning_rate": 0.0001396633201468272, + "loss": 0.9507, + "step": 10136 + }, + { + "epoch": 2.162329351535836, + "grad_norm": 0.30299002031403033, + "learning_rate": 0.00013965765077858874, + "loss": 0.9186, + "step": 10137 + }, + { + "epoch": 2.162542662116041, + "grad_norm": 0.29343434449143696, + "learning_rate": 0.0001396519807353229, + "loss": 0.9497, + "step": 10138 + }, + { + "epoch": 2.1627559726962455, + "grad_norm": 0.3918338942313656, + "learning_rate": 0.00013964631001709394, + "loss": 0.9342, + "step": 10139 + }, + { + "epoch": 2.1629692832764507, + "grad_norm": 0.32846157473254006, + "learning_rate": 0.00013964063862396595, + "loss": 0.8796, + "step": 10140 + }, + { + "epoch": 2.1631825938566553, + "grad_norm": 0.2650268391442825, + "learning_rate": 0.00013963496655600315, + "loss": 0.9445, + "step": 10141 + }, + { + "epoch": 2.16339590443686, + "grad_norm": 0.3142404488015214, + "learning_rate": 0.00013962929381326967, + "loss": 0.937, + "step": 10142 + }, + { + "epoch": 2.1636092150170647, + "grad_norm": 0.26861347939670044, + "learning_rate": 0.00013962362039582975, + "loss": 0.911, + "step": 10143 + }, + { + "epoch": 2.1638225255972694, + "grad_norm": 0.22220399111861147, + "learning_rate": 0.00013961794630374755, + "loss": 0.9003, + "step": 10144 + }, + { + "epoch": 2.1640358361774745, + "grad_norm": 0.28306796416677926, + "learning_rate": 0.0001396122715370873, + "loss": 0.9157, + "step": 10145 + }, + { + "epoch": 2.164249146757679, + "grad_norm": 0.3150207285504696, + "learning_rate": 0.00013960659609591318, + "loss": 0.9191, + "step": 10146 + }, + { + "epoch": 2.164462457337884, + "grad_norm": 0.26028602442590787, + "learning_rate": 0.00013960091998028944, + "loss": 0.9687, + "step": 10147 + }, + { + "epoch": 2.1646757679180886, + "grad_norm": 0.22510582654515585, + "learning_rate": 0.00013959524319028029, + "loss": 0.9425, + "step": 10148 + }, + { + "epoch": 2.1648890784982937, + "grad_norm": 0.3008657574653398, + "learning_rate": 0.00013958956572594994, + "loss": 0.9252, + "step": 10149 + }, + { + "epoch": 2.1651023890784984, + "grad_norm": 0.29965377989367764, + "learning_rate": 0.00013958388758736267, + "loss": 0.8989, + "step": 10150 + }, + { + "epoch": 2.165315699658703, + "grad_norm": 0.23556977708289786, + "learning_rate": 0.0001395782087745827, + "loss": 0.9344, + "step": 10151 + }, + { + "epoch": 2.1655290102389078, + "grad_norm": 0.26448988009608204, + "learning_rate": 0.0001395725292876743, + "loss": 0.9434, + "step": 10152 + }, + { + "epoch": 2.1657423208191124, + "grad_norm": 0.30632592021379074, + "learning_rate": 0.00013956684912670172, + "loss": 0.9394, + "step": 10153 + }, + { + "epoch": 2.1659556313993176, + "grad_norm": 0.3001901214772597, + "learning_rate": 0.00013956116829172925, + "loss": 0.9175, + "step": 10154 + }, + { + "epoch": 2.1661689419795223, + "grad_norm": 0.31837109767950744, + "learning_rate": 0.00013955548678282114, + "loss": 0.9373, + "step": 10155 + }, + { + "epoch": 2.166382252559727, + "grad_norm": 0.31468563313402764, + "learning_rate": 0.00013954980460004175, + "loss": 0.9473, + "step": 10156 + }, + { + "epoch": 2.1665955631399316, + "grad_norm": 0.27592884248383687, + "learning_rate": 0.00013954412174345526, + "loss": 0.9313, + "step": 10157 + }, + { + "epoch": 2.1668088737201368, + "grad_norm": 0.36122510548212355, + "learning_rate": 0.00013953843821312606, + "loss": 0.9121, + "step": 10158 + }, + { + "epoch": 2.1670221843003414, + "grad_norm": 0.28829311385925876, + "learning_rate": 0.00013953275400911843, + "loss": 0.946, + "step": 10159 + }, + { + "epoch": 2.167235494880546, + "grad_norm": 0.22623508411088647, + "learning_rate": 0.00013952706913149666, + "loss": 0.9334, + "step": 10160 + }, + { + "epoch": 2.167448805460751, + "grad_norm": 0.2893349426526034, + "learning_rate": 0.00013952138358032512, + "loss": 0.9027, + "step": 10161 + }, + { + "epoch": 2.1676621160409555, + "grad_norm": 0.3247920741877621, + "learning_rate": 0.00013951569735566813, + "loss": 0.9396, + "step": 10162 + }, + { + "epoch": 2.1678754266211606, + "grad_norm": 0.3834491807014042, + "learning_rate": 0.00013951001045759, + "loss": 0.9351, + "step": 10163 + }, + { + "epoch": 2.1680887372013653, + "grad_norm": 0.45382964415793203, + "learning_rate": 0.00013950432288615513, + "loss": 0.9311, + "step": 10164 + }, + { + "epoch": 2.16830204778157, + "grad_norm": 0.51679285998421, + "learning_rate": 0.00013949863464142784, + "loss": 0.9581, + "step": 10165 + }, + { + "epoch": 2.1685153583617747, + "grad_norm": 0.4892697401813219, + "learning_rate": 0.00013949294572347247, + "loss": 0.9371, + "step": 10166 + }, + { + "epoch": 2.1687286689419794, + "grad_norm": 0.4323758708747697, + "learning_rate": 0.00013948725613235346, + "loss": 0.9318, + "step": 10167 + }, + { + "epoch": 2.1689419795221845, + "grad_norm": 0.44067107615690576, + "learning_rate": 0.00013948156586813512, + "loss": 0.9257, + "step": 10168 + }, + { + "epoch": 2.169155290102389, + "grad_norm": 0.42331838389233345, + "learning_rate": 0.00013947587493088186, + "loss": 0.9646, + "step": 10169 + }, + { + "epoch": 2.169368600682594, + "grad_norm": 0.3661380416987927, + "learning_rate": 0.0001394701833206581, + "loss": 0.9221, + "step": 10170 + }, + { + "epoch": 2.1695819112627985, + "grad_norm": 0.590474003730095, + "learning_rate": 0.0001394644910375282, + "loss": 0.9399, + "step": 10171 + }, + { + "epoch": 2.169795221843003, + "grad_norm": 0.259529306637993, + "learning_rate": 0.00013945879808155656, + "loss": 0.9481, + "step": 10172 + }, + { + "epoch": 2.1700085324232083, + "grad_norm": 0.27960426958278883, + "learning_rate": 0.00013945310445280764, + "loss": 0.928, + "step": 10173 + }, + { + "epoch": 2.170221843003413, + "grad_norm": 0.3170408117012166, + "learning_rate": 0.00013944741015134586, + "loss": 0.9073, + "step": 10174 + }, + { + "epoch": 2.1704351535836177, + "grad_norm": 0.30627925850451465, + "learning_rate": 0.00013944171517723562, + "loss": 0.9514, + "step": 10175 + }, + { + "epoch": 2.1706484641638224, + "grad_norm": 0.3768246496855341, + "learning_rate": 0.00013943601953054138, + "loss": 0.9623, + "step": 10176 + }, + { + "epoch": 2.1708617747440275, + "grad_norm": 0.49111458133607044, + "learning_rate": 0.00013943032321132756, + "loss": 0.927, + "step": 10177 + }, + { + "epoch": 2.171075085324232, + "grad_norm": 0.564153621374185, + "learning_rate": 0.00013942462621965865, + "loss": 0.9524, + "step": 10178 + }, + { + "epoch": 2.171288395904437, + "grad_norm": 0.6907330312255273, + "learning_rate": 0.00013941892855559907, + "loss": 0.9353, + "step": 10179 + }, + { + "epoch": 2.1715017064846416, + "grad_norm": 0.8327764746731303, + "learning_rate": 0.00013941323021921336, + "loss": 0.9448, + "step": 10180 + }, + { + "epoch": 2.1717150170648463, + "grad_norm": 0.8239075283994952, + "learning_rate": 0.00013940753121056593, + "loss": 0.9608, + "step": 10181 + }, + { + "epoch": 2.1719283276450514, + "grad_norm": 0.7291256570103862, + "learning_rate": 0.00013940183152972126, + "loss": 0.9693, + "step": 10182 + }, + { + "epoch": 2.172141638225256, + "grad_norm": 0.6250414778207002, + "learning_rate": 0.00013939613117674388, + "loss": 0.9084, + "step": 10183 + }, + { + "epoch": 2.1723549488054608, + "grad_norm": 0.45594275886591773, + "learning_rate": 0.00013939043015169828, + "loss": 0.9258, + "step": 10184 + }, + { + "epoch": 2.1725682593856654, + "grad_norm": 0.2860799146651125, + "learning_rate": 0.00013938472845464897, + "loss": 0.9673, + "step": 10185 + }, + { + "epoch": 2.17278156996587, + "grad_norm": 0.332327873281239, + "learning_rate": 0.00013937902608566045, + "loss": 0.9292, + "step": 10186 + }, + { + "epoch": 2.1729948805460753, + "grad_norm": 2.1704608818336415, + "learning_rate": 0.00013937332304479725, + "loss": 1.0075, + "step": 10187 + }, + { + "epoch": 2.17320819112628, + "grad_norm": 0.41548012861877287, + "learning_rate": 0.00013936761933212394, + "loss": 0.9334, + "step": 10188 + }, + { + "epoch": 2.1734215017064846, + "grad_norm": 0.7902185028869012, + "learning_rate": 0.00013936191494770496, + "loss": 0.9586, + "step": 10189 + }, + { + "epoch": 2.1736348122866893, + "grad_norm": 2.368957950729762, + "learning_rate": 0.00013935620989160495, + "loss": 0.9737, + "step": 10190 + }, + { + "epoch": 2.173848122866894, + "grad_norm": 1.1043356744934036, + "learning_rate": 0.00013935050416388843, + "loss": 0.9439, + "step": 10191 + }, + { + "epoch": 2.174061433447099, + "grad_norm": 2.1840710622972073, + "learning_rate": 0.00013934479776461992, + "loss": 0.9325, + "step": 10192 + }, + { + "epoch": 2.174274744027304, + "grad_norm": 1.687984426281165, + "learning_rate": 0.00013933909069386406, + "loss": 0.9486, + "step": 10193 + }, + { + "epoch": 2.1744880546075085, + "grad_norm": 1.9791471819769195, + "learning_rate": 0.00013933338295168537, + "loss": 0.9772, + "step": 10194 + }, + { + "epoch": 2.174701365187713, + "grad_norm": 1.8479829256045701, + "learning_rate": 0.00013932767453814847, + "loss": 0.9636, + "step": 10195 + }, + { + "epoch": 2.1749146757679183, + "grad_norm": 1.4675105651635132, + "learning_rate": 0.00013932196545331793, + "loss": 0.9543, + "step": 10196 + }, + { + "epoch": 2.175127986348123, + "grad_norm": 1.1612415670790688, + "learning_rate": 0.00013931625569725835, + "loss": 0.9683, + "step": 10197 + }, + { + "epoch": 2.1753412969283277, + "grad_norm": 1.7054136146701366, + "learning_rate": 0.00013931054527003433, + "loss": 0.934, + "step": 10198 + }, + { + "epoch": 2.1755546075085324, + "grad_norm": 1.2284818950251835, + "learning_rate": 0.00013930483417171053, + "loss": 0.9398, + "step": 10199 + }, + { + "epoch": 2.175767918088737, + "grad_norm": 1.9135788736140131, + "learning_rate": 0.00013929912240235147, + "loss": 0.9692, + "step": 10200 + }, + { + "epoch": 2.175981228668942, + "grad_norm": 1.6571914246176287, + "learning_rate": 0.0001392934099620219, + "loss": 0.9576, + "step": 10201 + }, + { + "epoch": 2.176194539249147, + "grad_norm": 1.3689261795196204, + "learning_rate": 0.00013928769685078637, + "loss": 0.9699, + "step": 10202 + }, + { + "epoch": 2.1764078498293515, + "grad_norm": 1.207235898830161, + "learning_rate": 0.00013928198306870958, + "loss": 0.9566, + "step": 10203 + }, + { + "epoch": 2.176621160409556, + "grad_norm": 1.4148706540964828, + "learning_rate": 0.00013927626861585612, + "loss": 0.9588, + "step": 10204 + }, + { + "epoch": 2.176834470989761, + "grad_norm": 0.9365039722125723, + "learning_rate": 0.0001392705534922907, + "loss": 0.9263, + "step": 10205 + }, + { + "epoch": 2.177047781569966, + "grad_norm": 1.3196563598429152, + "learning_rate": 0.00013926483769807798, + "loss": 0.93, + "step": 10206 + }, + { + "epoch": 2.1772610921501707, + "grad_norm": 0.9484003439933156, + "learning_rate": 0.0001392591212332826, + "loss": 0.9646, + "step": 10207 + }, + { + "epoch": 2.1774744027303754, + "grad_norm": 1.7535089264928607, + "learning_rate": 0.0001392534040979693, + "loss": 0.9493, + "step": 10208 + }, + { + "epoch": 2.17768771331058, + "grad_norm": 1.4568977261763616, + "learning_rate": 0.00013924768629220268, + "loss": 0.9309, + "step": 10209 + }, + { + "epoch": 2.1779010238907848, + "grad_norm": 1.0656380103231375, + "learning_rate": 0.00013924196781604754, + "loss": 0.9424, + "step": 10210 + }, + { + "epoch": 2.17811433447099, + "grad_norm": 1.0527792082527128, + "learning_rate": 0.00013923624866956855, + "loss": 0.9461, + "step": 10211 + }, + { + "epoch": 2.1783276450511946, + "grad_norm": 1.1504062956444618, + "learning_rate": 0.00013923052885283035, + "loss": 0.9485, + "step": 10212 + }, + { + "epoch": 2.1785409556313993, + "grad_norm": 0.7433016989367661, + "learning_rate": 0.0001392248083658978, + "loss": 0.9615, + "step": 10213 + }, + { + "epoch": 2.178754266211604, + "grad_norm": 1.019041662119122, + "learning_rate": 0.00013921908720883547, + "loss": 0.959, + "step": 10214 + }, + { + "epoch": 2.178967576791809, + "grad_norm": 0.542510827336488, + "learning_rate": 0.00013921336538170822, + "loss": 0.9667, + "step": 10215 + }, + { + "epoch": 2.1791808873720138, + "grad_norm": 0.9049329828351562, + "learning_rate": 0.00013920764288458072, + "loss": 0.9506, + "step": 10216 + }, + { + "epoch": 2.1793941979522184, + "grad_norm": 0.46669210849838816, + "learning_rate": 0.00013920191971751776, + "loss": 0.9284, + "step": 10217 + }, + { + "epoch": 2.179607508532423, + "grad_norm": 0.7392464991613553, + "learning_rate": 0.00013919619588058406, + "loss": 0.9183, + "step": 10218 + }, + { + "epoch": 2.179820819112628, + "grad_norm": 0.503690351368287, + "learning_rate": 0.00013919047137384444, + "loss": 0.9397, + "step": 10219 + }, + { + "epoch": 2.180034129692833, + "grad_norm": 1.1408739307384683, + "learning_rate": 0.00013918474619736363, + "loss": 0.9571, + "step": 10220 + }, + { + "epoch": 2.1802474402730376, + "grad_norm": 0.39660323884065835, + "learning_rate": 0.00013917902035120641, + "loss": 0.9276, + "step": 10221 + }, + { + "epoch": 2.1804607508532423, + "grad_norm": 0.5268514578817098, + "learning_rate": 0.0001391732938354376, + "loss": 0.9405, + "step": 10222 + }, + { + "epoch": 2.180674061433447, + "grad_norm": 0.4276798739764956, + "learning_rate": 0.00013916756665012194, + "loss": 0.9632, + "step": 10223 + }, + { + "epoch": 2.1808873720136517, + "grad_norm": 0.5419991463183441, + "learning_rate": 0.0001391618387953243, + "loss": 0.9321, + "step": 10224 + }, + { + "epoch": 2.181100682593857, + "grad_norm": 0.4670259584225453, + "learning_rate": 0.00013915611027110947, + "loss": 0.9291, + "step": 10225 + }, + { + "epoch": 2.1813139931740615, + "grad_norm": 0.45024376959936324, + "learning_rate": 0.0001391503810775422, + "loss": 0.9205, + "step": 10226 + }, + { + "epoch": 2.181527303754266, + "grad_norm": 0.3854927032129168, + "learning_rate": 0.00013914465121468743, + "loss": 0.955, + "step": 10227 + }, + { + "epoch": 2.181740614334471, + "grad_norm": 0.45693300935170633, + "learning_rate": 0.00013913892068260993, + "loss": 0.973, + "step": 10228 + }, + { + "epoch": 2.1819539249146755, + "grad_norm": 0.32532011939391653, + "learning_rate": 0.00013913318948137452, + "loss": 0.9194, + "step": 10229 + }, + { + "epoch": 2.1821672354948807, + "grad_norm": 0.43379988797614966, + "learning_rate": 0.0001391274576110461, + "loss": 0.9345, + "step": 10230 + }, + { + "epoch": 2.1823805460750854, + "grad_norm": 0.35023647328572866, + "learning_rate": 0.00013912172507168948, + "loss": 0.9456, + "step": 10231 + }, + { + "epoch": 2.18259385665529, + "grad_norm": 0.33921000756090974, + "learning_rate": 0.00013911599186336955, + "loss": 0.9428, + "step": 10232 + }, + { + "epoch": 2.1828071672354947, + "grad_norm": 0.34920610288802834, + "learning_rate": 0.0001391102579861512, + "loss": 0.9268, + "step": 10233 + }, + { + "epoch": 2.1830204778157, + "grad_norm": 0.30590201001729106, + "learning_rate": 0.0001391045234400993, + "loss": 0.9514, + "step": 10234 + }, + { + "epoch": 2.1832337883959045, + "grad_norm": 0.2805736246663486, + "learning_rate": 0.00013909878822527866, + "loss": 0.9142, + "step": 10235 + }, + { + "epoch": 2.1834470989761092, + "grad_norm": 0.2858982370027067, + "learning_rate": 0.0001390930523417543, + "loss": 0.9456, + "step": 10236 + }, + { + "epoch": 2.183660409556314, + "grad_norm": 0.2730182839556688, + "learning_rate": 0.00013908731578959104, + "loss": 0.9204, + "step": 10237 + }, + { + "epoch": 2.1838737201365186, + "grad_norm": 0.25392349032155914, + "learning_rate": 0.0001390815785688538, + "loss": 0.9117, + "step": 10238 + }, + { + "epoch": 2.1840870307167237, + "grad_norm": 0.2857441807282372, + "learning_rate": 0.0001390758406796075, + "loss": 0.9679, + "step": 10239 + }, + { + "epoch": 2.1843003412969284, + "grad_norm": 0.26453815271000075, + "learning_rate": 0.00013907010212191708, + "loss": 0.9429, + "step": 10240 + }, + { + "epoch": 2.184513651877133, + "grad_norm": 0.2573446267614692, + "learning_rate": 0.00013906436289584745, + "loss": 0.9301, + "step": 10241 + }, + { + "epoch": 2.1847269624573378, + "grad_norm": 0.25352012667054236, + "learning_rate": 0.00013905862300146355, + "loss": 0.9391, + "step": 10242 + }, + { + "epoch": 2.1849402730375425, + "grad_norm": 0.2679842244609645, + "learning_rate": 0.00013905288243883036, + "loss": 0.9513, + "step": 10243 + }, + { + "epoch": 2.1851535836177476, + "grad_norm": 0.26958932955141757, + "learning_rate": 0.0001390471412080128, + "loss": 0.9381, + "step": 10244 + }, + { + "epoch": 2.1853668941979523, + "grad_norm": 0.2776493860054092, + "learning_rate": 0.00013904139930907586, + "loss": 0.9529, + "step": 10245 + }, + { + "epoch": 2.185580204778157, + "grad_norm": 0.2241904914957653, + "learning_rate": 0.0001390356567420845, + "loss": 0.9215, + "step": 10246 + }, + { + "epoch": 2.1857935153583616, + "grad_norm": 0.2314543041791966, + "learning_rate": 0.00013902991350710366, + "loss": 0.9055, + "step": 10247 + }, + { + "epoch": 2.1860068259385663, + "grad_norm": 0.2252520804305227, + "learning_rate": 0.00013902416960419835, + "loss": 0.9311, + "step": 10248 + }, + { + "epoch": 2.1862201365187715, + "grad_norm": 0.21659677743783448, + "learning_rate": 0.0001390184250334336, + "loss": 0.924, + "step": 10249 + }, + { + "epoch": 2.186433447098976, + "grad_norm": 0.2447037416360564, + "learning_rate": 0.00013901267979487438, + "loss": 0.9797, + "step": 10250 + }, + { + "epoch": 2.186646757679181, + "grad_norm": 0.22961324209409736, + "learning_rate": 0.0001390069338885857, + "loss": 0.9503, + "step": 10251 + }, + { + "epoch": 2.1868600682593855, + "grad_norm": 0.19782133065322208, + "learning_rate": 0.00013900118731463253, + "loss": 0.9339, + "step": 10252 + }, + { + "epoch": 2.1870733788395906, + "grad_norm": 0.2085121968032582, + "learning_rate": 0.00013899544007307997, + "loss": 0.9291, + "step": 10253 + }, + { + "epoch": 2.1872866894197953, + "grad_norm": 0.236337495686057, + "learning_rate": 0.00013898969216399296, + "loss": 0.9335, + "step": 10254 + }, + { + "epoch": 2.1875, + "grad_norm": 0.20380971407535528, + "learning_rate": 0.00013898394358743664, + "loss": 0.9171, + "step": 10255 + }, + { + "epoch": 2.1877133105802047, + "grad_norm": 0.22946907851189705, + "learning_rate": 0.00013897819434347601, + "loss": 0.9485, + "step": 10256 + }, + { + "epoch": 2.1879266211604094, + "grad_norm": 0.26578238311886293, + "learning_rate": 0.0001389724444321761, + "loss": 0.9346, + "step": 10257 + }, + { + "epoch": 2.1881399317406145, + "grad_norm": 0.2584785873365933, + "learning_rate": 0.00013896669385360199, + "loss": 0.9342, + "step": 10258 + }, + { + "epoch": 2.188353242320819, + "grad_norm": 0.2259072569130377, + "learning_rate": 0.00013896094260781874, + "loss": 0.9396, + "step": 10259 + }, + { + "epoch": 2.188566552901024, + "grad_norm": 0.24896005590841408, + "learning_rate": 0.00013895519069489145, + "loss": 0.9223, + "step": 10260 + }, + { + "epoch": 2.1887798634812285, + "grad_norm": 0.3102428038920648, + "learning_rate": 0.0001389494381148852, + "loss": 0.9129, + "step": 10261 + }, + { + "epoch": 2.1889931740614337, + "grad_norm": 0.3134256296451767, + "learning_rate": 0.00013894368486786502, + "loss": 0.9449, + "step": 10262 + }, + { + "epoch": 2.1892064846416384, + "grad_norm": 0.32971083299464377, + "learning_rate": 0.00013893793095389609, + "loss": 0.9682, + "step": 10263 + }, + { + "epoch": 2.189419795221843, + "grad_norm": 0.28745635036637923, + "learning_rate": 0.00013893217637304348, + "loss": 0.9489, + "step": 10264 + }, + { + "epoch": 2.1896331058020477, + "grad_norm": 0.23335376456560736, + "learning_rate": 0.0001389264211253723, + "loss": 0.9363, + "step": 10265 + }, + { + "epoch": 2.1898464163822524, + "grad_norm": 0.3666538864141154, + "learning_rate": 0.00013892066521094764, + "loss": 0.9498, + "step": 10266 + }, + { + "epoch": 2.1900597269624575, + "grad_norm": 0.5354196981774914, + "learning_rate": 0.0001389149086298347, + "loss": 0.9394, + "step": 10267 + }, + { + "epoch": 2.1902730375426622, + "grad_norm": 0.610037132610792, + "learning_rate": 0.00013890915138209856, + "loss": 0.945, + "step": 10268 + }, + { + "epoch": 2.190486348122867, + "grad_norm": 0.49140779606704726, + "learning_rate": 0.00013890339346780438, + "loss": 0.9271, + "step": 10269 + }, + { + "epoch": 2.1906996587030716, + "grad_norm": 0.25977306142102247, + "learning_rate": 0.00013889763488701733, + "loss": 0.9533, + "step": 10270 + }, + { + "epoch": 2.1909129692832763, + "grad_norm": 0.2659345423278898, + "learning_rate": 0.00013889187563980254, + "loss": 0.9728, + "step": 10271 + }, + { + "epoch": 2.1911262798634814, + "grad_norm": 0.41116387960672374, + "learning_rate": 0.0001388861157262252, + "loss": 0.9377, + "step": 10272 + }, + { + "epoch": 2.191339590443686, + "grad_norm": 0.45693162328045017, + "learning_rate": 0.00013888035514635042, + "loss": 0.9151, + "step": 10273 + }, + { + "epoch": 2.1915529010238908, + "grad_norm": 0.3711689576439687, + "learning_rate": 0.00013887459390024348, + "loss": 0.9655, + "step": 10274 + }, + { + "epoch": 2.1917662116040955, + "grad_norm": 0.2504854117640819, + "learning_rate": 0.00013886883198796954, + "loss": 0.9277, + "step": 10275 + }, + { + "epoch": 2.1919795221843, + "grad_norm": 0.259601391985545, + "learning_rate": 0.00013886306940959374, + "loss": 0.9317, + "step": 10276 + }, + { + "epoch": 2.1921928327645053, + "grad_norm": 0.4092256404391142, + "learning_rate": 0.0001388573061651813, + "loss": 0.9222, + "step": 10277 + }, + { + "epoch": 2.19240614334471, + "grad_norm": 0.4261009754113055, + "learning_rate": 0.0001388515422547975, + "loss": 0.9288, + "step": 10278 + }, + { + "epoch": 2.1926194539249146, + "grad_norm": 0.36511988483152935, + "learning_rate": 0.00013884577767850746, + "loss": 0.9052, + "step": 10279 + }, + { + "epoch": 2.1928327645051193, + "grad_norm": 0.35925981053703304, + "learning_rate": 0.0001388400124363765, + "loss": 0.9437, + "step": 10280 + }, + { + "epoch": 2.1930460750853245, + "grad_norm": 0.24526052528263015, + "learning_rate": 0.00013883424652846976, + "loss": 0.9041, + "step": 10281 + }, + { + "epoch": 2.193259385665529, + "grad_norm": 0.22382909013951183, + "learning_rate": 0.00013882847995485257, + "loss": 0.9361, + "step": 10282 + }, + { + "epoch": 2.193472696245734, + "grad_norm": 0.3072674337673504, + "learning_rate": 0.00013882271271559014, + "loss": 0.956, + "step": 10283 + }, + { + "epoch": 2.1936860068259385, + "grad_norm": 0.752487431130169, + "learning_rate": 0.00013881694481074773, + "loss": 0.9796, + "step": 10284 + }, + { + "epoch": 2.193899317406143, + "grad_norm": 0.25962991740767744, + "learning_rate": 0.00013881117624039055, + "loss": 0.9522, + "step": 10285 + }, + { + "epoch": 2.1941126279863483, + "grad_norm": 0.22212496554964573, + "learning_rate": 0.000138805407004584, + "loss": 0.946, + "step": 10286 + }, + { + "epoch": 2.194325938566553, + "grad_norm": 0.21756806618566138, + "learning_rate": 0.0001387996371033932, + "loss": 0.9178, + "step": 10287 + }, + { + "epoch": 2.1945392491467577, + "grad_norm": 0.23054207632791393, + "learning_rate": 0.00013879386653688357, + "loss": 0.9826, + "step": 10288 + }, + { + "epoch": 2.1947525597269624, + "grad_norm": 0.22447454550095414, + "learning_rate": 0.00013878809530512034, + "loss": 0.9387, + "step": 10289 + }, + { + "epoch": 2.194965870307167, + "grad_norm": 0.30463922369639523, + "learning_rate": 0.00013878232340816882, + "loss": 0.9239, + "step": 10290 + }, + { + "epoch": 2.195179180887372, + "grad_norm": 0.3236120992649717, + "learning_rate": 0.0001387765508460943, + "loss": 0.9246, + "step": 10291 + }, + { + "epoch": 2.195392491467577, + "grad_norm": 0.23176511951244083, + "learning_rate": 0.00013877077761896214, + "loss": 0.9606, + "step": 10292 + }, + { + "epoch": 2.1956058020477816, + "grad_norm": 0.27419835076020593, + "learning_rate": 0.00013876500372683766, + "loss": 0.9394, + "step": 10293 + }, + { + "epoch": 2.1958191126279862, + "grad_norm": 0.2787859013183091, + "learning_rate": 0.00013875922916978614, + "loss": 0.9435, + "step": 10294 + }, + { + "epoch": 2.196032423208191, + "grad_norm": 0.2048013931571496, + "learning_rate": 0.00013875345394787298, + "loss": 0.9195, + "step": 10295 + }, + { + "epoch": 2.196245733788396, + "grad_norm": 0.22239504526599088, + "learning_rate": 0.0001387476780611635, + "loss": 0.9377, + "step": 10296 + }, + { + "epoch": 2.1964590443686007, + "grad_norm": 0.2276639411076831, + "learning_rate": 0.00013874190150972305, + "loss": 0.9322, + "step": 10297 + }, + { + "epoch": 2.1966723549488054, + "grad_norm": 0.19622909334575203, + "learning_rate": 0.00013873612429361696, + "loss": 0.9369, + "step": 10298 + }, + { + "epoch": 2.19688566552901, + "grad_norm": 0.2136085995332739, + "learning_rate": 0.0001387303464129107, + "loss": 0.9459, + "step": 10299 + }, + { + "epoch": 2.1970989761092152, + "grad_norm": 0.20279983883935285, + "learning_rate": 0.00013872456786766954, + "loss": 0.9114, + "step": 10300 + }, + { + "epoch": 2.19731228668942, + "grad_norm": 0.21371472368258124, + "learning_rate": 0.00013871878865795892, + "loss": 0.9312, + "step": 10301 + }, + { + "epoch": 2.1975255972696246, + "grad_norm": 0.23351452342707915, + "learning_rate": 0.00013871300878384423, + "loss": 0.9364, + "step": 10302 + }, + { + "epoch": 2.1977389078498293, + "grad_norm": 0.20448278149864146, + "learning_rate": 0.0001387072282453908, + "loss": 0.9264, + "step": 10303 + }, + { + "epoch": 2.197952218430034, + "grad_norm": 0.20492965297863883, + "learning_rate": 0.00013870144704266417, + "loss": 0.931, + "step": 10304 + }, + { + "epoch": 2.198165529010239, + "grad_norm": 0.22266776016681383, + "learning_rate": 0.00013869566517572964, + "loss": 0.9269, + "step": 10305 + }, + { + "epoch": 2.198378839590444, + "grad_norm": 0.23729139255169143, + "learning_rate": 0.00013868988264465268, + "loss": 0.954, + "step": 10306 + }, + { + "epoch": 2.1985921501706485, + "grad_norm": 0.2724347986388164, + "learning_rate": 0.00013868409944949871, + "loss": 0.9274, + "step": 10307 + }, + { + "epoch": 2.198805460750853, + "grad_norm": 0.22404374549150086, + "learning_rate": 0.00013867831559033318, + "loss": 0.953, + "step": 10308 + }, + { + "epoch": 2.199018771331058, + "grad_norm": 0.36005886992560876, + "learning_rate": 0.0001386725310672215, + "loss": 0.9622, + "step": 10309 + }, + { + "epoch": 2.199232081911263, + "grad_norm": 0.24203849862905152, + "learning_rate": 0.00013866674588022915, + "loss": 0.9173, + "step": 10310 + }, + { + "epoch": 2.1994453924914676, + "grad_norm": 0.23364042572999832, + "learning_rate": 0.00013866096002942158, + "loss": 0.9508, + "step": 10311 + }, + { + "epoch": 2.1996587030716723, + "grad_norm": 0.25794528147591267, + "learning_rate": 0.00013865517351486427, + "loss": 0.9033, + "step": 10312 + }, + { + "epoch": 2.199872013651877, + "grad_norm": 0.2500466602670931, + "learning_rate": 0.00013864938633662268, + "loss": 0.9504, + "step": 10313 + }, + { + "epoch": 2.2000853242320817, + "grad_norm": 1.5028445000832567, + "learning_rate": 0.0001386435984947623, + "loss": 0.9805, + "step": 10314 + }, + { + "epoch": 2.200298634812287, + "grad_norm": 0.3321927814919987, + "learning_rate": 0.00013863780998934862, + "loss": 0.9439, + "step": 10315 + }, + { + "epoch": 2.2005119453924915, + "grad_norm": 2.7368736184133198, + "learning_rate": 0.00013863202082044712, + "loss": 0.9309, + "step": 10316 + }, + { + "epoch": 2.200725255972696, + "grad_norm": 0.850680891241909, + "learning_rate": 0.00013862623098812333, + "loss": 0.9617, + "step": 10317 + }, + { + "epoch": 2.200938566552901, + "grad_norm": 55.3072503760056, + "learning_rate": 0.00013862044049244276, + "loss": 9.226, + "step": 10318 + }, + { + "epoch": 2.201151877133106, + "grad_norm": 17.145251433317522, + "learning_rate": 0.0001386146493334709, + "loss": 7.4286, + "step": 10319 + }, + { + "epoch": 2.2013651877133107, + "grad_norm": 21.040254986219296, + "learning_rate": 0.00013860885751127333, + "loss": 7.3918, + "step": 10320 + }, + { + "epoch": 2.2015784982935154, + "grad_norm": 141.4612942157552, + "learning_rate": 0.0001386030650259155, + "loss": 9.9425, + "step": 10321 + }, + { + "epoch": 2.20179180887372, + "grad_norm": 10.128477595355674, + "learning_rate": 0.00013859727187746302, + "loss": 7.3481, + "step": 10322 + }, + { + "epoch": 2.2020051194539247, + "grad_norm": 21.051683225369864, + "learning_rate": 0.00013859147806598147, + "loss": 6.8902, + "step": 10323 + }, + { + "epoch": 2.20221843003413, + "grad_norm": 26.282863055373117, + "learning_rate": 0.0001385856835915363, + "loss": 7.2905, + "step": 10324 + }, + { + "epoch": 2.2024317406143346, + "grad_norm": 12.345291080731151, + "learning_rate": 0.00013857988845419314, + "loss": 6.822, + "step": 10325 + }, + { + "epoch": 2.2026450511945392, + "grad_norm": 18.162851520988156, + "learning_rate": 0.0001385740926540176, + "loss": 7.087, + "step": 10326 + }, + { + "epoch": 2.202858361774744, + "grad_norm": 6.609273603394784, + "learning_rate": 0.00013856829619107518, + "loss": 6.4475, + "step": 10327 + }, + { + "epoch": 2.2030716723549486, + "grad_norm": 7.112285620156087, + "learning_rate": 0.00013856249906543154, + "loss": 6.4704, + "step": 10328 + }, + { + "epoch": 2.2032849829351537, + "grad_norm": 7.280529997880768, + "learning_rate": 0.0001385567012771522, + "loss": 6.36, + "step": 10329 + }, + { + "epoch": 2.2034982935153584, + "grad_norm": 7.156076278756118, + "learning_rate": 0.0001385509028263028, + "loss": 6.3168, + "step": 10330 + }, + { + "epoch": 2.203711604095563, + "grad_norm": 7.437316159794366, + "learning_rate": 0.00013854510371294896, + "loss": 6.2326, + "step": 10331 + }, + { + "epoch": 2.203924914675768, + "grad_norm": 7.033185116122771, + "learning_rate": 0.0001385393039371563, + "loss": 6.2972, + "step": 10332 + }, + { + "epoch": 2.2041382252559725, + "grad_norm": 5.274159022778832, + "learning_rate": 0.00013853350349899042, + "loss": 6.2515, + "step": 10333 + }, + { + "epoch": 2.2043515358361776, + "grad_norm": 5.291895796217867, + "learning_rate": 0.00013852770239851698, + "loss": 6.259, + "step": 10334 + }, + { + "epoch": 2.2045648464163823, + "grad_norm": 5.106239135637667, + "learning_rate": 0.00013852190063580157, + "loss": 6.1671, + "step": 10335 + }, + { + "epoch": 2.204778156996587, + "grad_norm": 5.7191729156813045, + "learning_rate": 0.0001385160982109099, + "loss": 6.1751, + "step": 10336 + }, + { + "epoch": 2.2049914675767917, + "grad_norm": 3.4002774655400936, + "learning_rate": 0.00013851029512390757, + "loss": 6.2051, + "step": 10337 + }, + { + "epoch": 2.205204778156997, + "grad_norm": 4.329568165398044, + "learning_rate": 0.00013850449137486028, + "loss": 6.0896, + "step": 10338 + }, + { + "epoch": 2.2054180887372015, + "grad_norm": 8.941193149846564, + "learning_rate": 0.0001384986869638337, + "loss": 6.084, + "step": 10339 + }, + { + "epoch": 2.205631399317406, + "grad_norm": 3.578598587498784, + "learning_rate": 0.00013849288189089347, + "loss": 6.0717, + "step": 10340 + }, + { + "epoch": 2.205844709897611, + "grad_norm": 10.519674957066721, + "learning_rate": 0.0001384870761561053, + "loss": 6.2246, + "step": 10341 + }, + { + "epoch": 2.2060580204778155, + "grad_norm": 3.511300947836011, + "learning_rate": 0.0001384812697595349, + "loss": 5.9532, + "step": 10342 + }, + { + "epoch": 2.2062713310580206, + "grad_norm": 17.930837223061307, + "learning_rate": 0.0001384754627012479, + "loss": 6.4974, + "step": 10343 + }, + { + "epoch": 2.2064846416382253, + "grad_norm": 12.980790533801395, + "learning_rate": 0.0001384696549813101, + "loss": 6.1468, + "step": 10344 + }, + { + "epoch": 2.20669795221843, + "grad_norm": 11.31182093445073, + "learning_rate": 0.00013846384659978715, + "loss": 5.9655, + "step": 10345 + }, + { + "epoch": 2.2069112627986347, + "grad_norm": 9.044552330667122, + "learning_rate": 0.00013845803755674482, + "loss": 6.0116, + "step": 10346 + }, + { + "epoch": 2.2071245733788394, + "grad_norm": 9.956920208116111, + "learning_rate": 0.00013845222785224877, + "loss": 6.0208, + "step": 10347 + }, + { + "epoch": 2.2073378839590445, + "grad_norm": 6.514574375280477, + "learning_rate": 0.0001384464174863648, + "loss": 5.8438, + "step": 10348 + }, + { + "epoch": 2.207551194539249, + "grad_norm": 6.688183632979221, + "learning_rate": 0.00013844060645915864, + "loss": 5.7611, + "step": 10349 + }, + { + "epoch": 2.207764505119454, + "grad_norm": 7.911997691636548, + "learning_rate": 0.00013843479477069606, + "loss": 5.7781, + "step": 10350 + }, + { + "epoch": 2.2079778156996586, + "grad_norm": 4.199745091143006, + "learning_rate": 0.00013842898242104275, + "loss": 5.6849, + "step": 10351 + }, + { + "epoch": 2.2081911262798632, + "grad_norm": 5.7020645318478, + "learning_rate": 0.00013842316941026453, + "loss": 5.6886, + "step": 10352 + }, + { + "epoch": 2.2084044368600684, + "grad_norm": 4.816817823174871, + "learning_rate": 0.00013841735573842716, + "loss": 5.6064, + "step": 10353 + }, + { + "epoch": 2.208617747440273, + "grad_norm": 6.246213508188818, + "learning_rate": 0.00013841154140559648, + "loss": 5.582, + "step": 10354 + }, + { + "epoch": 2.2088310580204777, + "grad_norm": 2.8638338753781687, + "learning_rate": 0.00013840572641183815, + "loss": 5.4488, + "step": 10355 + }, + { + "epoch": 2.2090443686006824, + "grad_norm": 3.3839899874448065, + "learning_rate": 0.00013839991075721808, + "loss": 5.4523, + "step": 10356 + }, + { + "epoch": 2.2092576791808876, + "grad_norm": 2.4981056080758, + "learning_rate": 0.00013839409444180202, + "loss": 5.3214, + "step": 10357 + }, + { + "epoch": 2.2094709897610922, + "grad_norm": 2.798526145499038, + "learning_rate": 0.00013838827746565582, + "loss": 5.2867, + "step": 10358 + }, + { + "epoch": 2.209684300341297, + "grad_norm": 2.8783477436335634, + "learning_rate": 0.00013838245982884527, + "loss": 5.2813, + "step": 10359 + }, + { + "epoch": 2.2098976109215016, + "grad_norm": 2.790317196844064, + "learning_rate": 0.00013837664153143621, + "loss": 5.3053, + "step": 10360 + }, + { + "epoch": 2.2101109215017063, + "grad_norm": 4.995253873237447, + "learning_rate": 0.00013837082257349447, + "loss": 5.2107, + "step": 10361 + }, + { + "epoch": 2.2103242320819114, + "grad_norm": 3.3655652717810263, + "learning_rate": 0.0001383650029550859, + "loss": 5.1325, + "step": 10362 + }, + { + "epoch": 2.210537542662116, + "grad_norm": 3.84034207440998, + "learning_rate": 0.00013835918267627632, + "loss": 5.1285, + "step": 10363 + }, + { + "epoch": 2.210750853242321, + "grad_norm": 4.618286127906377, + "learning_rate": 0.00013835336173713162, + "loss": 5.0796, + "step": 10364 + }, + { + "epoch": 2.2109641638225255, + "grad_norm": 3.1154204102037513, + "learning_rate": 0.00013834754013771766, + "loss": 5.0063, + "step": 10365 + }, + { + "epoch": 2.2111774744027306, + "grad_norm": 3.3778439741042567, + "learning_rate": 0.00013834171787810025, + "loss": 4.9629, + "step": 10366 + }, + { + "epoch": 2.2113907849829353, + "grad_norm": 2.365057464747147, + "learning_rate": 0.00013833589495834537, + "loss": 4.9151, + "step": 10367 + }, + { + "epoch": 2.21160409556314, + "grad_norm": 3.9416114275037204, + "learning_rate": 0.00013833007137851883, + "loss": 4.9138, + "step": 10368 + }, + { + "epoch": 2.2118174061433447, + "grad_norm": 2.945802118028816, + "learning_rate": 0.0001383242471386866, + "loss": 4.8631, + "step": 10369 + }, + { + "epoch": 2.2120307167235493, + "grad_norm": 2.734401780368918, + "learning_rate": 0.0001383184222389145, + "loss": 4.898, + "step": 10370 + }, + { + "epoch": 2.2122440273037545, + "grad_norm": 3.555640672350859, + "learning_rate": 0.00013831259667926844, + "loss": 4.8151, + "step": 10371 + }, + { + "epoch": 2.212457337883959, + "grad_norm": 4.531410184246767, + "learning_rate": 0.00013830677045981442, + "loss": 4.8323, + "step": 10372 + }, + { + "epoch": 2.212670648464164, + "grad_norm": 4.456823894208863, + "learning_rate": 0.0001383009435806183, + "loss": 4.7798, + "step": 10373 + }, + { + "epoch": 2.2128839590443685, + "grad_norm": 3.0380039034070463, + "learning_rate": 0.00013829511604174599, + "loss": 4.6693, + "step": 10374 + }, + { + "epoch": 2.213097269624573, + "grad_norm": 3.8686451414716734, + "learning_rate": 0.00013828928784326348, + "loss": 4.6752, + "step": 10375 + }, + { + "epoch": 2.2133105802047783, + "grad_norm": 3.2386430150399006, + "learning_rate": 0.0001382834589852367, + "loss": 4.5936, + "step": 10376 + }, + { + "epoch": 2.213523890784983, + "grad_norm": 4.058380612822805, + "learning_rate": 0.0001382776294677316, + "loss": 4.6092, + "step": 10377 + }, + { + "epoch": 2.2137372013651877, + "grad_norm": 3.704230109093299, + "learning_rate": 0.00013827179929081415, + "loss": 4.5546, + "step": 10378 + }, + { + "epoch": 2.2139505119453924, + "grad_norm": 2.64241957889815, + "learning_rate": 0.0001382659684545503, + "loss": 4.3854, + "step": 10379 + }, + { + "epoch": 2.214163822525597, + "grad_norm": 4.939273746712777, + "learning_rate": 0.00013826013695900606, + "loss": 4.4956, + "step": 10380 + }, + { + "epoch": 2.214377133105802, + "grad_norm": 2.884802804916408, + "learning_rate": 0.00013825430480424736, + "loss": 4.3355, + "step": 10381 + }, + { + "epoch": 2.214590443686007, + "grad_norm": 5.9717101542607915, + "learning_rate": 0.00013824847199034025, + "loss": 4.2589, + "step": 10382 + }, + { + "epoch": 2.2148037542662116, + "grad_norm": 8.281403530653566, + "learning_rate": 0.00013824263851735066, + "loss": 4.3983, + "step": 10383 + }, + { + "epoch": 2.2150170648464163, + "grad_norm": 8.420007866437881, + "learning_rate": 0.00013823680438534468, + "loss": 3.852, + "step": 10384 + }, + { + "epoch": 2.2152303754266214, + "grad_norm": 12.276325603247692, + "learning_rate": 0.00013823096959438827, + "loss": 5.1275, + "step": 10385 + }, + { + "epoch": 2.215443686006826, + "grad_norm": 11.098344530817764, + "learning_rate": 0.00013822513414454742, + "loss": 4.2677, + "step": 10386 + }, + { + "epoch": 2.2156569965870307, + "grad_norm": 4.235747098444404, + "learning_rate": 0.00013821929803588823, + "loss": 3.9299, + "step": 10387 + }, + { + "epoch": 2.2158703071672354, + "grad_norm": 3.662587834265976, + "learning_rate": 0.0001382134612684767, + "loss": 3.6038, + "step": 10388 + }, + { + "epoch": 2.21608361774744, + "grad_norm": 5.856496302927765, + "learning_rate": 0.00013820762384237888, + "loss": 3.1343, + "step": 10389 + }, + { + "epoch": 2.2162969283276452, + "grad_norm": 6.1511986472927385, + "learning_rate": 0.0001382017857576608, + "loss": 2.3819, + "step": 10390 + }, + { + "epoch": 2.21651023890785, + "grad_norm": 51.73251199143169, + "learning_rate": 0.00013819594701438857, + "loss": 2.1611, + "step": 10391 + }, + { + "epoch": 2.2167235494880546, + "grad_norm": 9.180412683459659, + "learning_rate": 0.00013819010761262818, + "loss": 1.719, + "step": 10392 + }, + { + "epoch": 2.2169368600682593, + "grad_norm": 10.197177290332307, + "learning_rate": 0.0001381842675524458, + "loss": 1.7158, + "step": 10393 + }, + { + "epoch": 2.217150170648464, + "grad_norm": 6.279757869394776, + "learning_rate": 0.0001381784268339074, + "loss": 1.5681, + "step": 10394 + }, + { + "epoch": 2.217363481228669, + "grad_norm": 2.0303545235763423, + "learning_rate": 0.00013817258545707913, + "loss": 1.33, + "step": 10395 + }, + { + "epoch": 2.217576791808874, + "grad_norm": 1.5064282482567322, + "learning_rate": 0.00013816674342202707, + "loss": 1.2104, + "step": 10396 + }, + { + "epoch": 2.2177901023890785, + "grad_norm": 1.6635061791241421, + "learning_rate": 0.00013816090072881734, + "loss": 1.1954, + "step": 10397 + }, + { + "epoch": 2.218003412969283, + "grad_norm": 1.6367902764154192, + "learning_rate": 0.00013815505737751604, + "loss": 1.1572, + "step": 10398 + }, + { + "epoch": 2.218216723549488, + "grad_norm": 1.3724266715750455, + "learning_rate": 0.0001381492133681893, + "loss": 1.1084, + "step": 10399 + }, + { + "epoch": 2.218430034129693, + "grad_norm": 1.3692785525925162, + "learning_rate": 0.00013814336870090325, + "loss": 1.127, + "step": 10400 + }, + { + "epoch": 2.2186433447098977, + "grad_norm": 1.0153892660104844, + "learning_rate": 0.00013813752337572398, + "loss": 1.0646, + "step": 10401 + }, + { + "epoch": 2.2188566552901023, + "grad_norm": 1.286397937634378, + "learning_rate": 0.00013813167739271763, + "loss": 1.0787, + "step": 10402 + }, + { + "epoch": 2.219069965870307, + "grad_norm": 1.3208749003171054, + "learning_rate": 0.0001381258307519504, + "loss": 1.0738, + "step": 10403 + }, + { + "epoch": 2.219283276450512, + "grad_norm": 0.8818325193135819, + "learning_rate": 0.0001381199834534884, + "loss": 1.0501, + "step": 10404 + }, + { + "epoch": 2.219496587030717, + "grad_norm": 1.0595143998327459, + "learning_rate": 0.00013811413549739783, + "loss": 0.9885, + "step": 10405 + }, + { + "epoch": 2.2197098976109215, + "grad_norm": 0.89002534502173, + "learning_rate": 0.00013810828688374485, + "loss": 1.0175, + "step": 10406 + }, + { + "epoch": 2.219923208191126, + "grad_norm": 0.5982611694785704, + "learning_rate": 0.0001381024376125956, + "loss": 1.0528, + "step": 10407 + }, + { + "epoch": 2.220136518771331, + "grad_norm": 0.8172078862662, + "learning_rate": 0.0001380965876840163, + "loss": 1.0042, + "step": 10408 + }, + { + "epoch": 2.220349829351536, + "grad_norm": 0.7066258699051173, + "learning_rate": 0.00013809073709807314, + "loss": 1.0145, + "step": 10409 + }, + { + "epoch": 2.2205631399317407, + "grad_norm": 0.7118972765765705, + "learning_rate": 0.0001380848858548323, + "loss": 0.9816, + "step": 10410 + }, + { + "epoch": 2.2207764505119454, + "grad_norm": 0.9286807255022905, + "learning_rate": 0.00013807903395436, + "loss": 1.0085, + "step": 10411 + }, + { + "epoch": 2.22098976109215, + "grad_norm": 0.7430635519592702, + "learning_rate": 0.0001380731813967225, + "loss": 0.9717, + "step": 10412 + }, + { + "epoch": 2.2212030716723548, + "grad_norm": 0.876593229073535, + "learning_rate": 0.00013806732818198592, + "loss": 1.0085, + "step": 10413 + }, + { + "epoch": 2.22141638225256, + "grad_norm": 1.1218341624093253, + "learning_rate": 0.00013806147431021654, + "loss": 0.9792, + "step": 10414 + }, + { + "epoch": 2.2216296928327646, + "grad_norm": 0.9332071450223237, + "learning_rate": 0.00013805561978148063, + "loss": 1.0002, + "step": 10415 + }, + { + "epoch": 2.2218430034129693, + "grad_norm": 1.0549542537608712, + "learning_rate": 0.0001380497645958444, + "loss": 0.9833, + "step": 10416 + }, + { + "epoch": 2.222056313993174, + "grad_norm": 0.949793975036213, + "learning_rate": 0.0001380439087533741, + "loss": 1.0057, + "step": 10417 + }, + { + "epoch": 2.2222696245733786, + "grad_norm": 0.8706928110894145, + "learning_rate": 0.00013803805225413602, + "loss": 0.9686, + "step": 10418 + }, + { + "epoch": 2.2224829351535837, + "grad_norm": 0.5960018321502419, + "learning_rate": 0.0001380321950981964, + "loss": 1.0008, + "step": 10419 + }, + { + "epoch": 2.2226962457337884, + "grad_norm": 0.4688475183927693, + "learning_rate": 0.00013802633728562147, + "loss": 0.9946, + "step": 10420 + }, + { + "epoch": 2.222909556313993, + "grad_norm": 0.5179009245802928, + "learning_rate": 0.0001380204788164776, + "loss": 0.9813, + "step": 10421 + }, + { + "epoch": 2.223122866894198, + "grad_norm": 0.49382387231768354, + "learning_rate": 0.000138014619690831, + "loss": 0.9861, + "step": 10422 + }, + { + "epoch": 2.223336177474403, + "grad_norm": 0.6587326538708304, + "learning_rate": 0.00013800875990874805, + "loss": 0.9583, + "step": 10423 + }, + { + "epoch": 2.2235494880546076, + "grad_norm": 0.6687829899019201, + "learning_rate": 0.00013800289947029498, + "loss": 0.9622, + "step": 10424 + }, + { + "epoch": 2.2237627986348123, + "grad_norm": 0.6238403248633095, + "learning_rate": 0.00013799703837553813, + "loss": 0.9775, + "step": 10425 + }, + { + "epoch": 2.223976109215017, + "grad_norm": 0.5373516694188671, + "learning_rate": 0.0001379911766245438, + "loss": 0.9835, + "step": 10426 + }, + { + "epoch": 2.2241894197952217, + "grad_norm": 0.4072690955900103, + "learning_rate": 0.00013798531421737835, + "loss": 1.006, + "step": 10427 + }, + { + "epoch": 2.224402730375427, + "grad_norm": 0.33184232923330254, + "learning_rate": 0.0001379794511541081, + "loss": 0.934, + "step": 10428 + }, + { + "epoch": 2.2246160409556315, + "grad_norm": 0.3313212686827112, + "learning_rate": 0.00013797358743479937, + "loss": 0.9711, + "step": 10429 + }, + { + "epoch": 2.224829351535836, + "grad_norm": 0.4678188498988446, + "learning_rate": 0.00013796772305951853, + "loss": 0.9842, + "step": 10430 + }, + { + "epoch": 2.225042662116041, + "grad_norm": 0.5522467920636588, + "learning_rate": 0.00013796185802833195, + "loss": 0.9993, + "step": 10431 + }, + { + "epoch": 2.2252559726962455, + "grad_norm": 0.53938279856031, + "learning_rate": 0.00013795599234130593, + "loss": 0.9432, + "step": 10432 + }, + { + "epoch": 2.2254692832764507, + "grad_norm": 0.3347978548476574, + "learning_rate": 0.0001379501259985069, + "loss": 0.9532, + "step": 10433 + }, + { + "epoch": 2.2256825938566553, + "grad_norm": 0.28759280092210476, + "learning_rate": 0.00013794425900000122, + "loss": 0.9927, + "step": 10434 + }, + { + "epoch": 2.22589590443686, + "grad_norm": 0.3081542020972563, + "learning_rate": 0.00013793839134585526, + "loss": 0.9232, + "step": 10435 + }, + { + "epoch": 2.2261092150170647, + "grad_norm": 0.34027010192529084, + "learning_rate": 0.00013793252303613546, + "loss": 0.9752, + "step": 10436 + }, + { + "epoch": 2.2263225255972694, + "grad_norm": 0.3434527460820945, + "learning_rate": 0.00013792665407090816, + "loss": 0.9372, + "step": 10437 + }, + { + "epoch": 2.2265358361774745, + "grad_norm": 0.2917478303463175, + "learning_rate": 0.0001379207844502398, + "loss": 0.9609, + "step": 10438 + }, + { + "epoch": 2.226749146757679, + "grad_norm": 0.20304131624481447, + "learning_rate": 0.00013791491417419679, + "loss": 0.9481, + "step": 10439 + }, + { + "epoch": 2.226962457337884, + "grad_norm": 0.28001648979206456, + "learning_rate": 0.00013790904324284555, + "loss": 0.9698, + "step": 10440 + }, + { + "epoch": 2.2271757679180886, + "grad_norm": 0.2721235666425416, + "learning_rate": 0.00013790317165625254, + "loss": 0.9301, + "step": 10441 + }, + { + "epoch": 2.2273890784982937, + "grad_norm": 0.25211254489751267, + "learning_rate": 0.00013789729941448414, + "loss": 0.9435, + "step": 10442 + }, + { + "epoch": 2.2276023890784984, + "grad_norm": 0.22047574427784883, + "learning_rate": 0.0001378914265176068, + "loss": 0.9356, + "step": 10443 + }, + { + "epoch": 2.227815699658703, + "grad_norm": 0.22762094800361582, + "learning_rate": 0.00013788555296568705, + "loss": 0.9749, + "step": 10444 + }, + { + "epoch": 2.2280290102389078, + "grad_norm": 0.334818233878772, + "learning_rate": 0.00013787967875879126, + "loss": 0.9311, + "step": 10445 + }, + { + "epoch": 2.2282423208191124, + "grad_norm": 0.44642367950887385, + "learning_rate": 0.00013787380389698595, + "loss": 0.98, + "step": 10446 + }, + { + "epoch": 2.2284556313993176, + "grad_norm": 0.4826603006012667, + "learning_rate": 0.00013786792838033754, + "loss": 0.9404, + "step": 10447 + }, + { + "epoch": 2.2286689419795223, + "grad_norm": 0.4374805806766539, + "learning_rate": 0.00013786205220891258, + "loss": 0.9253, + "step": 10448 + }, + { + "epoch": 2.228882252559727, + "grad_norm": 0.31633048347633763, + "learning_rate": 0.00013785617538277753, + "loss": 0.9223, + "step": 10449 + }, + { + "epoch": 2.2290955631399316, + "grad_norm": 0.23893204602676466, + "learning_rate": 0.00013785029790199888, + "loss": 0.9434, + "step": 10450 + }, + { + "epoch": 2.2293088737201368, + "grad_norm": 0.3085941679424995, + "learning_rate": 0.00013784441976664313, + "loss": 0.9653, + "step": 10451 + }, + { + "epoch": 2.2295221843003414, + "grad_norm": 0.4222167694403176, + "learning_rate": 0.0001378385409767768, + "loss": 0.9216, + "step": 10452 + }, + { + "epoch": 2.229735494880546, + "grad_norm": 0.42675627050210363, + "learning_rate": 0.0001378326615324664, + "loss": 0.9566, + "step": 10453 + }, + { + "epoch": 2.229948805460751, + "grad_norm": 0.32765057556503063, + "learning_rate": 0.00013782678143377843, + "loss": 0.9528, + "step": 10454 + }, + { + "epoch": 2.2301621160409555, + "grad_norm": 0.2542919244419527, + "learning_rate": 0.00013782090068077953, + "loss": 0.9554, + "step": 10455 + }, + { + "epoch": 2.2303754266211606, + "grad_norm": 0.227827727245486, + "learning_rate": 0.00013781501927353612, + "loss": 0.9275, + "step": 10456 + }, + { + "epoch": 2.2305887372013653, + "grad_norm": 0.3124313450404004, + "learning_rate": 0.0001378091372121148, + "loss": 0.9566, + "step": 10457 + }, + { + "epoch": 2.23080204778157, + "grad_norm": 0.36505029734823374, + "learning_rate": 0.00013780325449658213, + "loss": 0.9524, + "step": 10458 + }, + { + "epoch": 2.2310153583617747, + "grad_norm": 0.33375216139656955, + "learning_rate": 0.0001377973711270046, + "loss": 0.9426, + "step": 10459 + }, + { + "epoch": 2.2312286689419794, + "grad_norm": 0.28668459797656876, + "learning_rate": 0.00013779148710344892, + "loss": 0.9324, + "step": 10460 + }, + { + "epoch": 2.2314419795221845, + "grad_norm": 0.24621896386632097, + "learning_rate": 0.00013778560242598157, + "loss": 0.9219, + "step": 10461 + }, + { + "epoch": 2.231655290102389, + "grad_norm": 0.23517759303288008, + "learning_rate": 0.00013777971709466916, + "loss": 0.9251, + "step": 10462 + }, + { + "epoch": 2.231868600682594, + "grad_norm": 0.30176890269200085, + "learning_rate": 0.00013777383110957826, + "loss": 0.9435, + "step": 10463 + }, + { + "epoch": 2.2320819112627985, + "grad_norm": 0.27075334545252516, + "learning_rate": 0.0001377679444707755, + "loss": 0.9287, + "step": 10464 + }, + { + "epoch": 2.232295221843003, + "grad_norm": 0.25465954783241035, + "learning_rate": 0.0001377620571783275, + "loss": 0.9243, + "step": 10465 + }, + { + "epoch": 2.2325085324232083, + "grad_norm": 0.23060768316345032, + "learning_rate": 0.00013775616923230083, + "loss": 0.9389, + "step": 10466 + }, + { + "epoch": 2.232721843003413, + "grad_norm": 0.2143718600875238, + "learning_rate": 0.0001377502806327621, + "loss": 0.9439, + "step": 10467 + }, + { + "epoch": 2.2329351535836177, + "grad_norm": 0.2614146714926309, + "learning_rate": 0.000137744391379778, + "loss": 0.9309, + "step": 10468 + }, + { + "epoch": 2.2331484641638224, + "grad_norm": 0.35254627346407436, + "learning_rate": 0.00013773850147341516, + "loss": 0.9919, + "step": 10469 + }, + { + "epoch": 2.2333617747440275, + "grad_norm": 0.2910194853535409, + "learning_rate": 0.0001377326109137402, + "loss": 0.9362, + "step": 10470 + }, + { + "epoch": 2.233575085324232, + "grad_norm": 0.2651346096802439, + "learning_rate": 0.00013772671970081976, + "loss": 0.9332, + "step": 10471 + }, + { + "epoch": 2.233788395904437, + "grad_norm": 0.2731511389779741, + "learning_rate": 0.00013772082783472054, + "loss": 0.965, + "step": 10472 + }, + { + "epoch": 2.2340017064846416, + "grad_norm": 0.21879882327285843, + "learning_rate": 0.00013771493531550917, + "loss": 0.9358, + "step": 10473 + }, + { + "epoch": 2.2342150170648463, + "grad_norm": 0.23177143960511135, + "learning_rate": 0.0001377090421432523, + "loss": 0.9183, + "step": 10474 + }, + { + "epoch": 2.2344283276450514, + "grad_norm": 0.2626017802662122, + "learning_rate": 0.0001377031483180167, + "loss": 0.9327, + "step": 10475 + }, + { + "epoch": 2.234641638225256, + "grad_norm": 0.2358141207559124, + "learning_rate": 0.00013769725383986899, + "loss": 0.8985, + "step": 10476 + }, + { + "epoch": 2.2348549488054608, + "grad_norm": 0.2167283641679961, + "learning_rate": 0.0001376913587088759, + "loss": 0.9716, + "step": 10477 + }, + { + "epoch": 2.2350682593856654, + "grad_norm": 0.22449017438831068, + "learning_rate": 0.0001376854629251041, + "loss": 0.962, + "step": 10478 + }, + { + "epoch": 2.23528156996587, + "grad_norm": 0.2313938948499786, + "learning_rate": 0.0001376795664886203, + "loss": 0.9722, + "step": 10479 + }, + { + "epoch": 2.2354948805460753, + "grad_norm": 0.2644423232457302, + "learning_rate": 0.00013767366939949124, + "loss": 0.9239, + "step": 10480 + }, + { + "epoch": 2.23570819112628, + "grad_norm": 0.2893850115934598, + "learning_rate": 0.00013766777165778363, + "loss": 0.9465, + "step": 10481 + }, + { + "epoch": 2.2359215017064846, + "grad_norm": 0.2672188125636812, + "learning_rate": 0.00013766187326356424, + "loss": 0.9153, + "step": 10482 + }, + { + "epoch": 2.2361348122866893, + "grad_norm": 0.23500718343061175, + "learning_rate": 0.0001376559742168998, + "loss": 0.8872, + "step": 10483 + }, + { + "epoch": 2.236348122866894, + "grad_norm": 0.23751868134387372, + "learning_rate": 0.00013765007451785698, + "loss": 0.9662, + "step": 10484 + }, + { + "epoch": 2.236561433447099, + "grad_norm": 0.2587424173217526, + "learning_rate": 0.00013764417416650264, + "loss": 0.9204, + "step": 10485 + }, + { + "epoch": 2.236774744027304, + "grad_norm": 0.27051813086069926, + "learning_rate": 0.0001376382731629035, + "loss": 0.9296, + "step": 10486 + }, + { + "epoch": 2.2369880546075085, + "grad_norm": 0.36277497923142177, + "learning_rate": 0.0001376323715071263, + "loss": 0.9545, + "step": 10487 + }, + { + "epoch": 2.237201365187713, + "grad_norm": 0.38884904679111515, + "learning_rate": 0.0001376264691992379, + "loss": 0.9494, + "step": 10488 + }, + { + "epoch": 2.2374146757679183, + "grad_norm": 0.29317258146003405, + "learning_rate": 0.00013762056623930498, + "loss": 0.9192, + "step": 10489 + }, + { + "epoch": 2.237627986348123, + "grad_norm": 0.24606718213829282, + "learning_rate": 0.0001376146626273944, + "loss": 0.9421, + "step": 10490 + }, + { + "epoch": 2.2378412969283277, + "grad_norm": 0.2903298847472209, + "learning_rate": 0.00013760875836357298, + "loss": 0.9377, + "step": 10491 + }, + { + "epoch": 2.2380546075085324, + "grad_norm": 0.26433919557341706, + "learning_rate": 0.00013760285344790744, + "loss": 0.9235, + "step": 10492 + }, + { + "epoch": 2.238267918088737, + "grad_norm": 0.21309905509571403, + "learning_rate": 0.00013759694788046466, + "loss": 0.9198, + "step": 10493 + }, + { + "epoch": 2.238481228668942, + "grad_norm": 0.2510929745612251, + "learning_rate": 0.00013759104166131145, + "loss": 0.9642, + "step": 10494 + }, + { + "epoch": 2.238694539249147, + "grad_norm": 0.23330236998545037, + "learning_rate": 0.00013758513479051462, + "loss": 0.9345, + "step": 10495 + }, + { + "epoch": 2.2389078498293515, + "grad_norm": 0.2344127976821293, + "learning_rate": 0.00013757922726814105, + "loss": 0.9502, + "step": 10496 + }, + { + "epoch": 2.239121160409556, + "grad_norm": 0.2763815482771584, + "learning_rate": 0.00013757331909425755, + "loss": 0.9473, + "step": 10497 + }, + { + "epoch": 2.239334470989761, + "grad_norm": 0.2941296708830554, + "learning_rate": 0.00013756741026893098, + "loss": 0.9657, + "step": 10498 + }, + { + "epoch": 2.239547781569966, + "grad_norm": 0.3253532983285817, + "learning_rate": 0.00013756150079222815, + "loss": 0.9335, + "step": 10499 + }, + { + "epoch": 2.2397610921501707, + "grad_norm": 0.27060425078511763, + "learning_rate": 0.000137555590664216, + "loss": 0.9361, + "step": 10500 + }, + { + "epoch": 2.2399744027303754, + "grad_norm": 0.2330956433316633, + "learning_rate": 0.0001375496798849614, + "loss": 0.9545, + "step": 10501 + }, + { + "epoch": 2.24018771331058, + "grad_norm": 0.29419925720118817, + "learning_rate": 0.00013754376845453116, + "loss": 0.9368, + "step": 10502 + }, + { + "epoch": 2.2404010238907848, + "grad_norm": 0.2684827847005782, + "learning_rate": 0.00013753785637299226, + "loss": 0.9655, + "step": 10503 + }, + { + "epoch": 2.24061433447099, + "grad_norm": 0.23401816600004735, + "learning_rate": 0.00013753194364041148, + "loss": 0.9032, + "step": 10504 + }, + { + "epoch": 2.2408276450511946, + "grad_norm": 0.1979849036386812, + "learning_rate": 0.00013752603025685584, + "loss": 0.9576, + "step": 10505 + }, + { + "epoch": 2.2410409556313993, + "grad_norm": 0.2102990877916717, + "learning_rate": 0.0001375201162223922, + "loss": 0.924, + "step": 10506 + }, + { + "epoch": 2.241254266211604, + "grad_norm": 0.23869424333767747, + "learning_rate": 0.00013751420153708743, + "loss": 0.9262, + "step": 10507 + }, + { + "epoch": 2.241467576791809, + "grad_norm": 0.23512679830469932, + "learning_rate": 0.00013750828620100856, + "loss": 0.9261, + "step": 10508 + }, + { + "epoch": 2.2416808873720138, + "grad_norm": 0.228906463298409, + "learning_rate": 0.00013750237021422244, + "loss": 0.9098, + "step": 10509 + }, + { + "epoch": 2.2418941979522184, + "grad_norm": 0.19425144491907392, + "learning_rate": 0.00013749645357679605, + "loss": 0.9147, + "step": 10510 + }, + { + "epoch": 2.242107508532423, + "grad_norm": 0.2237520055144997, + "learning_rate": 0.00013749053628879632, + "loss": 0.9496, + "step": 10511 + }, + { + "epoch": 2.242320819112628, + "grad_norm": 0.2938061095282236, + "learning_rate": 0.00013748461835029016, + "loss": 0.9256, + "step": 10512 + }, + { + "epoch": 2.242534129692833, + "grad_norm": 0.2914407816562334, + "learning_rate": 0.00013747869976134461, + "loss": 0.9635, + "step": 10513 + }, + { + "epoch": 2.2427474402730376, + "grad_norm": 0.20778127242333694, + "learning_rate": 0.0001374727805220266, + "loss": 0.9133, + "step": 10514 + }, + { + "epoch": 2.2429607508532423, + "grad_norm": 0.24314237807076872, + "learning_rate": 0.00013746686063240311, + "loss": 0.9144, + "step": 10515 + }, + { + "epoch": 2.243174061433447, + "grad_norm": 0.2580440920317952, + "learning_rate": 0.00013746094009254114, + "loss": 0.9462, + "step": 10516 + }, + { + "epoch": 2.2433873720136517, + "grad_norm": 0.28316710559833536, + "learning_rate": 0.00013745501890250766, + "loss": 0.9481, + "step": 10517 + }, + { + "epoch": 2.243600682593857, + "grad_norm": 0.2550072667429319, + "learning_rate": 0.00013744909706236966, + "loss": 0.9337, + "step": 10518 + }, + { + "epoch": 2.2438139931740615, + "grad_norm": 0.2226016446957963, + "learning_rate": 0.0001374431745721942, + "loss": 0.9577, + "step": 10519 + }, + { + "epoch": 2.244027303754266, + "grad_norm": 0.2536857400994203, + "learning_rate": 0.00013743725143204822, + "loss": 0.9735, + "step": 10520 + }, + { + "epoch": 2.244240614334471, + "grad_norm": 0.2306215712186999, + "learning_rate": 0.00013743132764199874, + "loss": 0.9371, + "step": 10521 + }, + { + "epoch": 2.2444539249146755, + "grad_norm": 0.24814552648319113, + "learning_rate": 0.00013742540320211289, + "loss": 0.9415, + "step": 10522 + }, + { + "epoch": 2.2446672354948807, + "grad_norm": 0.2936858121015475, + "learning_rate": 0.0001374194781124576, + "loss": 0.9745, + "step": 10523 + }, + { + "epoch": 2.2448805460750854, + "grad_norm": 0.27146283017704775, + "learning_rate": 0.00013741355237309997, + "loss": 0.9418, + "step": 10524 + }, + { + "epoch": 2.24509385665529, + "grad_norm": 0.29025258289513245, + "learning_rate": 0.000137407625984107, + "loss": 0.9405, + "step": 10525 + }, + { + "epoch": 2.2453071672354947, + "grad_norm": 0.2281532870462107, + "learning_rate": 0.0001374016989455458, + "loss": 0.9237, + "step": 10526 + }, + { + "epoch": 2.2455204778157, + "grad_norm": 0.30581359788831114, + "learning_rate": 0.00013739577125748342, + "loss": 0.9454, + "step": 10527 + }, + { + "epoch": 2.2457337883959045, + "grad_norm": 0.348366701640234, + "learning_rate": 0.00013738984291998687, + "loss": 0.9467, + "step": 10528 + }, + { + "epoch": 2.2459470989761092, + "grad_norm": 0.28269730747302535, + "learning_rate": 0.00013738391393312334, + "loss": 0.9405, + "step": 10529 + }, + { + "epoch": 2.246160409556314, + "grad_norm": 0.3219888777689753, + "learning_rate": 0.00013737798429695983, + "loss": 0.9751, + "step": 10530 + }, + { + "epoch": 2.2463737201365186, + "grad_norm": 0.3437050851563045, + "learning_rate": 0.00013737205401156348, + "loss": 0.928, + "step": 10531 + }, + { + "epoch": 2.2465870307167237, + "grad_norm": 0.2616278458245781, + "learning_rate": 0.00013736612307700135, + "loss": 0.9338, + "step": 10532 + }, + { + "epoch": 2.2468003412969284, + "grad_norm": 0.2886967614975594, + "learning_rate": 0.0001373601914933406, + "loss": 0.9287, + "step": 10533 + }, + { + "epoch": 2.247013651877133, + "grad_norm": 0.26512920052484923, + "learning_rate": 0.00013735425926064828, + "loss": 0.9574, + "step": 10534 + }, + { + "epoch": 2.2472269624573378, + "grad_norm": 0.311280161569033, + "learning_rate": 0.0001373483263789916, + "loss": 0.9493, + "step": 10535 + }, + { + "epoch": 2.2474402730375425, + "grad_norm": 0.34950510629140946, + "learning_rate": 0.0001373423928484376, + "loss": 0.9358, + "step": 10536 + }, + { + "epoch": 2.2476535836177476, + "grad_norm": 0.22510805284635307, + "learning_rate": 0.0001373364586690535, + "loss": 0.9551, + "step": 10537 + }, + { + "epoch": 2.2478668941979523, + "grad_norm": 0.2517658449213096, + "learning_rate": 0.00013733052384090638, + "loss": 0.9344, + "step": 10538 + }, + { + "epoch": 2.248080204778157, + "grad_norm": 0.2558685100184055, + "learning_rate": 0.00013732458836406342, + "loss": 0.8979, + "step": 10539 + }, + { + "epoch": 2.2482935153583616, + "grad_norm": 0.2641072624488704, + "learning_rate": 0.0001373186522385918, + "loss": 0.9702, + "step": 10540 + }, + { + "epoch": 2.2485068259385663, + "grad_norm": 0.2496460792191546, + "learning_rate": 0.0001373127154645586, + "loss": 0.9777, + "step": 10541 + }, + { + "epoch": 2.2487201365187715, + "grad_norm": 0.22791785735155268, + "learning_rate": 0.00013730677804203113, + "loss": 0.9598, + "step": 10542 + }, + { + "epoch": 2.248933447098976, + "grad_norm": 0.25759690386472794, + "learning_rate": 0.0001373008399710765, + "loss": 0.9508, + "step": 10543 + }, + { + "epoch": 2.249146757679181, + "grad_norm": 0.22644909901067237, + "learning_rate": 0.00013729490125176188, + "loss": 0.9479, + "step": 10544 + }, + { + "epoch": 2.2493600682593855, + "grad_norm": 0.22096678734437716, + "learning_rate": 0.0001372889618841545, + "loss": 0.9425, + "step": 10545 + }, + { + "epoch": 2.2495733788395906, + "grad_norm": 0.2696588638535088, + "learning_rate": 0.00013728302186832153, + "loss": 0.943, + "step": 10546 + }, + { + "epoch": 2.2497866894197953, + "grad_norm": 0.22908273268019974, + "learning_rate": 0.0001372770812043302, + "loss": 0.9369, + "step": 10547 + }, + { + "epoch": 2.25, + "grad_norm": 0.2072228383558016, + "learning_rate": 0.00013727113989224778, + "loss": 0.9672, + "step": 10548 + }, + { + "epoch": 2.2502133105802047, + "grad_norm": 0.2750791974298684, + "learning_rate": 0.00013726519793214137, + "loss": 0.9249, + "step": 10549 + }, + { + "epoch": 2.2504266211604094, + "grad_norm": 0.35842538319119915, + "learning_rate": 0.00013725925532407838, + "loss": 0.931, + "step": 10550 + }, + { + "epoch": 2.2506399317406145, + "grad_norm": 0.3763925923007728, + "learning_rate": 0.0001372533120681259, + "loss": 0.937, + "step": 10551 + }, + { + "epoch": 2.250853242320819, + "grad_norm": 0.36445850693329357, + "learning_rate": 0.00013724736816435123, + "loss": 0.968, + "step": 10552 + }, + { + "epoch": 2.251066552901024, + "grad_norm": 0.27814450458446877, + "learning_rate": 0.00013724142361282163, + "loss": 0.9386, + "step": 10553 + }, + { + "epoch": 2.2512798634812285, + "grad_norm": 0.19865372798887782, + "learning_rate": 0.00013723547841360434, + "loss": 0.9139, + "step": 10554 + }, + { + "epoch": 2.2514931740614337, + "grad_norm": 0.29594140141913183, + "learning_rate": 0.0001372295325667667, + "loss": 0.9093, + "step": 10555 + }, + { + "epoch": 2.2517064846416384, + "grad_norm": 0.35896708594885196, + "learning_rate": 0.0001372235860723759, + "loss": 0.9628, + "step": 10556 + }, + { + "epoch": 2.251919795221843, + "grad_norm": 0.37034266780023783, + "learning_rate": 0.00013721763893049927, + "loss": 0.9139, + "step": 10557 + }, + { + "epoch": 2.2521331058020477, + "grad_norm": 0.34908527780397036, + "learning_rate": 0.0001372116911412041, + "loss": 0.9346, + "step": 10558 + }, + { + "epoch": 2.2523464163822524, + "grad_norm": 0.350690673746287, + "learning_rate": 0.00013720574270455766, + "loss": 0.9377, + "step": 10559 + }, + { + "epoch": 2.252559726962457, + "grad_norm": 0.2775229431559436, + "learning_rate": 0.0001371997936206273, + "loss": 0.9497, + "step": 10560 + }, + { + "epoch": 2.2527730375426622, + "grad_norm": 0.22548675808295995, + "learning_rate": 0.0001371938438894803, + "loss": 0.9451, + "step": 10561 + }, + { + "epoch": 2.252986348122867, + "grad_norm": 0.31370654825976124, + "learning_rate": 0.00013718789351118397, + "loss": 0.9331, + "step": 10562 + }, + { + "epoch": 2.2531996587030716, + "grad_norm": 0.3973049408056545, + "learning_rate": 0.0001371819424858057, + "loss": 0.9511, + "step": 10563 + }, + { + "epoch": 2.2534129692832763, + "grad_norm": 0.4936141429145291, + "learning_rate": 0.00013717599081341277, + "loss": 0.9503, + "step": 10564 + }, + { + "epoch": 2.2536262798634814, + "grad_norm": 0.4773533973672381, + "learning_rate": 0.00013717003849407256, + "loss": 0.9545, + "step": 10565 + }, + { + "epoch": 2.253839590443686, + "grad_norm": 0.4394383319298672, + "learning_rate": 0.0001371640855278524, + "loss": 0.934, + "step": 10566 + }, + { + "epoch": 2.2540529010238908, + "grad_norm": 0.3301125774475269, + "learning_rate": 0.00013715813191481963, + "loss": 0.9514, + "step": 10567 + }, + { + "epoch": 2.2542662116040955, + "grad_norm": 0.23087531139752698, + "learning_rate": 0.00013715217765504167, + "loss": 0.9149, + "step": 10568 + }, + { + "epoch": 2.2544795221843, + "grad_norm": 0.26153438001984963, + "learning_rate": 0.00013714622274858584, + "loss": 0.9519, + "step": 10569 + }, + { + "epoch": 2.2546928327645053, + "grad_norm": 0.3231269576646795, + "learning_rate": 0.00013714026719551954, + "loss": 0.94, + "step": 10570 + }, + { + "epoch": 2.25490614334471, + "grad_norm": 0.29623147141700856, + "learning_rate": 0.00013713431099591017, + "loss": 0.942, + "step": 10571 + }, + { + "epoch": 2.2551194539249146, + "grad_norm": 0.25631079969998954, + "learning_rate": 0.00013712835414982509, + "loss": 0.9327, + "step": 10572 + }, + { + "epoch": 2.2553327645051193, + "grad_norm": 0.2445684707694193, + "learning_rate": 0.00013712239665733173, + "loss": 0.9374, + "step": 10573 + }, + { + "epoch": 2.2555460750853245, + "grad_norm": 0.2051178088522578, + "learning_rate": 0.00013711643851849748, + "loss": 0.9386, + "step": 10574 + }, + { + "epoch": 2.255759385665529, + "grad_norm": 0.2604837082090416, + "learning_rate": 0.00013711047973338978, + "loss": 0.95, + "step": 10575 + }, + { + "epoch": 2.255972696245734, + "grad_norm": 0.22595138916756885, + "learning_rate": 0.00013710452030207603, + "loss": 0.9234, + "step": 10576 + }, + { + "epoch": 2.2561860068259385, + "grad_norm": 0.22747300625777883, + "learning_rate": 0.00013709856022462368, + "loss": 0.9029, + "step": 10577 + }, + { + "epoch": 2.256399317406143, + "grad_norm": 0.22941467841615606, + "learning_rate": 0.00013709259950110016, + "loss": 0.9538, + "step": 10578 + }, + { + "epoch": 2.256612627986348, + "grad_norm": 0.2694104729783501, + "learning_rate": 0.00013708663813157291, + "loss": 0.9425, + "step": 10579 + }, + { + "epoch": 2.256825938566553, + "grad_norm": 0.24756154370355948, + "learning_rate": 0.0001370806761161094, + "loss": 0.917, + "step": 10580 + }, + { + "epoch": 2.2570392491467577, + "grad_norm": 0.2624365768642969, + "learning_rate": 0.00013707471345477708, + "loss": 0.9371, + "step": 10581 + }, + { + "epoch": 2.2572525597269624, + "grad_norm": 0.22971719738344995, + "learning_rate": 0.0001370687501476434, + "loss": 0.9315, + "step": 10582 + }, + { + "epoch": 2.257465870307167, + "grad_norm": 0.27765731931644355, + "learning_rate": 0.0001370627861947759, + "loss": 0.9251, + "step": 10583 + }, + { + "epoch": 2.257679180887372, + "grad_norm": 0.22632984077212362, + "learning_rate": 0.00013705682159624192, + "loss": 0.9423, + "step": 10584 + }, + { + "epoch": 2.257892491467577, + "grad_norm": 0.26209627792475226, + "learning_rate": 0.00013705085635210912, + "loss": 0.9324, + "step": 10585 + }, + { + "epoch": 2.2581058020477816, + "grad_norm": 0.27035674054492026, + "learning_rate": 0.00013704489046244492, + "loss": 0.9187, + "step": 10586 + }, + { + "epoch": 2.2583191126279862, + "grad_norm": 0.27974198082991764, + "learning_rate": 0.00013703892392731679, + "loss": 0.9233, + "step": 10587 + }, + { + "epoch": 2.258532423208191, + "grad_norm": 0.2696942852645897, + "learning_rate": 0.0001370329567467923, + "loss": 0.9457, + "step": 10588 + }, + { + "epoch": 2.258745733788396, + "grad_norm": 0.25916301036980455, + "learning_rate": 0.00013702698892093895, + "loss": 0.9192, + "step": 10589 + }, + { + "epoch": 2.2589590443686007, + "grad_norm": 0.36424963848141245, + "learning_rate": 0.00013702102044982424, + "loss": 0.9552, + "step": 10590 + }, + { + "epoch": 2.2591723549488054, + "grad_norm": 0.4382594352510701, + "learning_rate": 0.00013701505133351574, + "loss": 0.9419, + "step": 10591 + }, + { + "epoch": 2.25938566552901, + "grad_norm": 0.43664099131093564, + "learning_rate": 0.000137009081572081, + "loss": 0.9497, + "step": 10592 + }, + { + "epoch": 2.2595989761092152, + "grad_norm": 0.43217829357101184, + "learning_rate": 0.0001370031111655875, + "loss": 0.9387, + "step": 10593 + }, + { + "epoch": 2.25981228668942, + "grad_norm": 0.36473283787867705, + "learning_rate": 0.00013699714011410283, + "loss": 0.9147, + "step": 10594 + }, + { + "epoch": 2.2600255972696246, + "grad_norm": 0.9571926174015618, + "learning_rate": 0.0001369911684176946, + "loss": 0.9227, + "step": 10595 + }, + { + "epoch": 2.2602389078498293, + "grad_norm": 0.2803625074711804, + "learning_rate": 0.00013698519607643032, + "loss": 0.9464, + "step": 10596 + }, + { + "epoch": 2.260452218430034, + "grad_norm": 0.2874443730266821, + "learning_rate": 0.0001369792230903776, + "loss": 0.957, + "step": 10597 + }, + { + "epoch": 2.260665529010239, + "grad_norm": 0.32988764645516794, + "learning_rate": 0.000136973249459604, + "loss": 0.9273, + "step": 10598 + }, + { + "epoch": 2.260878839590444, + "grad_norm": 0.48279281812413594, + "learning_rate": 0.0001369672751841771, + "loss": 0.9425, + "step": 10599 + }, + { + "epoch": 2.2610921501706485, + "grad_norm": 0.49230456048125615, + "learning_rate": 0.00013696130026416456, + "loss": 0.9561, + "step": 10600 + }, + { + "epoch": 2.261305460750853, + "grad_norm": 0.4607443615716203, + "learning_rate": 0.00013695532469963393, + "loss": 0.9178, + "step": 10601 + }, + { + "epoch": 2.261518771331058, + "grad_norm": 0.43898786634241566, + "learning_rate": 0.00013694934849065287, + "loss": 0.9412, + "step": 10602 + }, + { + "epoch": 2.261732081911263, + "grad_norm": 0.3296028287011319, + "learning_rate": 0.00013694337163728897, + "loss": 0.9253, + "step": 10603 + }, + { + "epoch": 2.2619453924914676, + "grad_norm": 0.25353494902366075, + "learning_rate": 0.00013693739413960985, + "loss": 0.9535, + "step": 10604 + }, + { + "epoch": 2.2621587030716723, + "grad_norm": 0.24618015648712077, + "learning_rate": 0.00013693141599768315, + "loss": 0.9346, + "step": 10605 + }, + { + "epoch": 2.262372013651877, + "grad_norm": 0.25641240211117244, + "learning_rate": 0.00013692543721157652, + "loss": 0.9444, + "step": 10606 + }, + { + "epoch": 2.2625853242320817, + "grad_norm": 0.28242827535750326, + "learning_rate": 0.00013691945778135763, + "loss": 0.929, + "step": 10607 + }, + { + "epoch": 2.262798634812287, + "grad_norm": 0.33439575066038557, + "learning_rate": 0.00013691347770709408, + "loss": 0.9472, + "step": 10608 + }, + { + "epoch": 2.2630119453924915, + "grad_norm": 0.268478878934118, + "learning_rate": 0.0001369074969888536, + "loss": 0.9402, + "step": 10609 + }, + { + "epoch": 2.263225255972696, + "grad_norm": 0.2483687363953068, + "learning_rate": 0.00013690151562670383, + "loss": 0.9064, + "step": 10610 + }, + { + "epoch": 2.263438566552901, + "grad_norm": 0.3242291642085813, + "learning_rate": 0.00013689553362071247, + "loss": 0.9469, + "step": 10611 + }, + { + "epoch": 2.263651877133106, + "grad_norm": 0.3027404524724451, + "learning_rate": 0.00013688955097094718, + "loss": 0.9231, + "step": 10612 + }, + { + "epoch": 2.2638651877133107, + "grad_norm": 0.2619368980384511, + "learning_rate": 0.00013688356767747566, + "loss": 0.917, + "step": 10613 + }, + { + "epoch": 2.2640784982935154, + "grad_norm": 0.3091185375090334, + "learning_rate": 0.00013687758374036563, + "loss": 0.9208, + "step": 10614 + }, + { + "epoch": 2.26429180887372, + "grad_norm": 0.2705262766706105, + "learning_rate": 0.00013687159915968476, + "loss": 0.9276, + "step": 10615 + }, + { + "epoch": 2.2645051194539247, + "grad_norm": 0.2987396075574232, + "learning_rate": 0.00013686561393550078, + "loss": 0.9727, + "step": 10616 + }, + { + "epoch": 2.26471843003413, + "grad_norm": 0.202221123243543, + "learning_rate": 0.00013685962806788145, + "loss": 0.9545, + "step": 10617 + }, + { + "epoch": 2.2649317406143346, + "grad_norm": 0.2880193191370079, + "learning_rate": 0.00013685364155689445, + "loss": 0.9645, + "step": 10618 + }, + { + "epoch": 2.2651450511945392, + "grad_norm": 0.33680185390888107, + "learning_rate": 0.00013684765440260757, + "loss": 0.9433, + "step": 10619 + }, + { + "epoch": 2.265358361774744, + "grad_norm": 0.3436576634097219, + "learning_rate": 0.00013684166660508847, + "loss": 0.9533, + "step": 10620 + }, + { + "epoch": 2.265571672354949, + "grad_norm": 0.34188425270842493, + "learning_rate": 0.00013683567816440503, + "loss": 0.9223, + "step": 10621 + }, + { + "epoch": 2.2657849829351537, + "grad_norm": 0.28427939675926966, + "learning_rate": 0.00013682968908062488, + "loss": 0.9521, + "step": 10622 + }, + { + "epoch": 2.2659982935153584, + "grad_norm": 0.21665783151299595, + "learning_rate": 0.00013682369935381585, + "loss": 0.957, + "step": 10623 + }, + { + "epoch": 2.266211604095563, + "grad_norm": 0.2340505624324537, + "learning_rate": 0.00013681770898404572, + "loss": 0.9205, + "step": 10624 + }, + { + "epoch": 2.266424914675768, + "grad_norm": 0.2753677278256045, + "learning_rate": 0.00013681171797138225, + "loss": 0.9337, + "step": 10625 + }, + { + "epoch": 2.2666382252559725, + "grad_norm": 0.29907515687826336, + "learning_rate": 0.00013680572631589324, + "loss": 0.936, + "step": 10626 + }, + { + "epoch": 2.2668515358361776, + "grad_norm": 0.28273016397247325, + "learning_rate": 0.00013679973401764647, + "loss": 0.9115, + "step": 10627 + }, + { + "epoch": 2.2670648464163823, + "grad_norm": 0.18485637809280783, + "learning_rate": 0.00013679374107670978, + "loss": 0.9481, + "step": 10628 + }, + { + "epoch": 2.267278156996587, + "grad_norm": 0.21315940731336608, + "learning_rate": 0.00013678774749315093, + "loss": 0.933, + "step": 10629 + }, + { + "epoch": 2.2674914675767917, + "grad_norm": 0.21527898610184376, + "learning_rate": 0.00013678175326703777, + "loss": 0.9248, + "step": 10630 + }, + { + "epoch": 2.267704778156997, + "grad_norm": 0.26105641470336355, + "learning_rate": 0.00013677575839843813, + "loss": 0.9354, + "step": 10631 + }, + { + "epoch": 2.2679180887372015, + "grad_norm": 0.30709319767552884, + "learning_rate": 0.0001367697628874198, + "loss": 0.9191, + "step": 10632 + }, + { + "epoch": 2.268131399317406, + "grad_norm": 0.3328342939638057, + "learning_rate": 0.00013676376673405069, + "loss": 0.9057, + "step": 10633 + }, + { + "epoch": 2.268344709897611, + "grad_norm": 0.25028853466824896, + "learning_rate": 0.00013675776993839854, + "loss": 0.9701, + "step": 10634 + }, + { + "epoch": 2.2685580204778155, + "grad_norm": 0.1956640168288384, + "learning_rate": 0.00013675177250053132, + "loss": 0.9403, + "step": 10635 + }, + { + "epoch": 2.2687713310580206, + "grad_norm": 0.2261744407511824, + "learning_rate": 0.00013674577442051683, + "loss": 0.9582, + "step": 10636 + }, + { + "epoch": 2.2689846416382253, + "grad_norm": 0.3593956921279201, + "learning_rate": 0.00013673977569842295, + "loss": 0.9582, + "step": 10637 + }, + { + "epoch": 2.26919795221843, + "grad_norm": 0.5209991162657289, + "learning_rate": 0.00013673377633431752, + "loss": 0.99, + "step": 10638 + }, + { + "epoch": 2.2694112627986347, + "grad_norm": 0.6220874971650486, + "learning_rate": 0.0001367277763282685, + "loss": 0.9454, + "step": 10639 + }, + { + "epoch": 2.26962457337884, + "grad_norm": 0.6470307623594838, + "learning_rate": 0.00013672177568034374, + "loss": 0.9185, + "step": 10640 + }, + { + "epoch": 2.2698378839590445, + "grad_norm": 0.5774709626549444, + "learning_rate": 0.00013671577439061108, + "loss": 0.9529, + "step": 10641 + }, + { + "epoch": 2.270051194539249, + "grad_norm": 0.4316219424677833, + "learning_rate": 0.00013670977245913853, + "loss": 0.975, + "step": 10642 + }, + { + "epoch": 2.270264505119454, + "grad_norm": 0.1952375262146373, + "learning_rate": 0.00013670376988599394, + "loss": 0.9212, + "step": 10643 + }, + { + "epoch": 2.2704778156996586, + "grad_norm": 0.30252475226588765, + "learning_rate": 0.0001366977666712452, + "loss": 0.909, + "step": 10644 + }, + { + "epoch": 2.2706911262798632, + "grad_norm": 0.36321444363620103, + "learning_rate": 0.00013669176281496032, + "loss": 0.9314, + "step": 10645 + }, + { + "epoch": 2.2709044368600684, + "grad_norm": 0.4523679048916527, + "learning_rate": 0.0001366857583172072, + "loss": 0.963, + "step": 10646 + }, + { + "epoch": 2.271117747440273, + "grad_norm": 0.5362096921486754, + "learning_rate": 0.00013667975317805374, + "loss": 0.9131, + "step": 10647 + }, + { + "epoch": 2.2713310580204777, + "grad_norm": 0.4404110127051892, + "learning_rate": 0.00013667374739756793, + "loss": 0.9176, + "step": 10648 + }, + { + "epoch": 2.2715443686006824, + "grad_norm": 0.26111081298896444, + "learning_rate": 0.00013666774097581772, + "loss": 0.9539, + "step": 10649 + }, + { + "epoch": 2.2717576791808876, + "grad_norm": 0.24527496411411645, + "learning_rate": 0.00013666173391287107, + "loss": 0.9067, + "step": 10650 + }, + { + "epoch": 2.2719709897610922, + "grad_norm": 0.2182924433979158, + "learning_rate": 0.00013665572620879594, + "loss": 0.9316, + "step": 10651 + }, + { + "epoch": 2.272184300341297, + "grad_norm": 0.27896807264185836, + "learning_rate": 0.0001366497178636603, + "loss": 0.9072, + "step": 10652 + }, + { + "epoch": 2.2723976109215016, + "grad_norm": 0.35715094935502223, + "learning_rate": 0.00013664370887753215, + "loss": 0.924, + "step": 10653 + }, + { + "epoch": 2.2726109215017063, + "grad_norm": 0.4222664854242708, + "learning_rate": 0.0001366376992504795, + "loss": 0.9284, + "step": 10654 + }, + { + "epoch": 2.2728242320819114, + "grad_norm": 0.34922253206089177, + "learning_rate": 0.00013663168898257034, + "loss": 0.9785, + "step": 10655 + }, + { + "epoch": 2.273037542662116, + "grad_norm": 0.31408203840975035, + "learning_rate": 0.00013662567807387266, + "loss": 0.9477, + "step": 10656 + }, + { + "epoch": 2.273250853242321, + "grad_norm": 0.2359964694553628, + "learning_rate": 0.00013661966652445449, + "loss": 0.9246, + "step": 10657 + }, + { + "epoch": 2.2734641638225255, + "grad_norm": 0.23140164230528512, + "learning_rate": 0.00013661365433438382, + "loss": 0.9211, + "step": 10658 + }, + { + "epoch": 2.2736774744027306, + "grad_norm": 0.1983937090555653, + "learning_rate": 0.00013660764150372868, + "loss": 0.9064, + "step": 10659 + }, + { + "epoch": 2.2738907849829353, + "grad_norm": 0.2171734221226046, + "learning_rate": 0.00013660162803255717, + "loss": 0.9567, + "step": 10660 + }, + { + "epoch": 2.27410409556314, + "grad_norm": 0.26221643457819366, + "learning_rate": 0.00013659561392093727, + "loss": 0.9181, + "step": 10661 + }, + { + "epoch": 2.2743174061433447, + "grad_norm": 0.27601200759550165, + "learning_rate": 0.00013658959916893702, + "loss": 0.9502, + "step": 10662 + }, + { + "epoch": 2.2745307167235493, + "grad_norm": 0.24683753939117808, + "learning_rate": 0.00013658358377662452, + "loss": 0.9642, + "step": 10663 + }, + { + "epoch": 2.274744027303754, + "grad_norm": 0.22820390959246656, + "learning_rate": 0.00013657756774406783, + "loss": 0.9428, + "step": 10664 + }, + { + "epoch": 2.274957337883959, + "grad_norm": 0.23546436848206545, + "learning_rate": 0.00013657155107133499, + "loss": 0.93, + "step": 10665 + }, + { + "epoch": 2.275170648464164, + "grad_norm": 0.25512255771577474, + "learning_rate": 0.00013656553375849414, + "loss": 0.9167, + "step": 10666 + }, + { + "epoch": 2.2753839590443685, + "grad_norm": 0.22181828094253098, + "learning_rate": 0.00013655951580561327, + "loss": 0.9437, + "step": 10667 + }, + { + "epoch": 2.275597269624573, + "grad_norm": 0.2561552063828873, + "learning_rate": 0.00013655349721276055, + "loss": 0.965, + "step": 10668 + }, + { + "epoch": 2.2758105802047783, + "grad_norm": 0.24512166279141337, + "learning_rate": 0.00013654747798000407, + "loss": 0.9531, + "step": 10669 + }, + { + "epoch": 2.276023890784983, + "grad_norm": 0.28931276343793977, + "learning_rate": 0.00013654145810741193, + "loss": 0.9304, + "step": 10670 + }, + { + "epoch": 2.2762372013651877, + "grad_norm": 0.3370086739778851, + "learning_rate": 0.0001365354375950522, + "loss": 0.903, + "step": 10671 + }, + { + "epoch": 2.2764505119453924, + "grad_norm": 0.2521931669064549, + "learning_rate": 0.00013652941644299308, + "loss": 0.9309, + "step": 10672 + }, + { + "epoch": 2.276663822525597, + "grad_norm": 0.3217481046327529, + "learning_rate": 0.00013652339465130265, + "loss": 0.9104, + "step": 10673 + }, + { + "epoch": 2.276877133105802, + "grad_norm": 0.3609563467126352, + "learning_rate": 0.00013651737222004906, + "loss": 0.9501, + "step": 10674 + }, + { + "epoch": 2.277090443686007, + "grad_norm": 0.2706163101118267, + "learning_rate": 0.00013651134914930045, + "loss": 0.9289, + "step": 10675 + }, + { + "epoch": 2.2773037542662116, + "grad_norm": 0.24038068869978965, + "learning_rate": 0.00013650532543912496, + "loss": 0.9812, + "step": 10676 + }, + { + "epoch": 2.2775170648464163, + "grad_norm": 0.27731611643076465, + "learning_rate": 0.00013649930108959078, + "loss": 0.9518, + "step": 10677 + }, + { + "epoch": 2.2777303754266214, + "grad_norm": 0.2841036035169354, + "learning_rate": 0.00013649327610076607, + "loss": 0.9222, + "step": 10678 + }, + { + "epoch": 2.277943686006826, + "grad_norm": 0.317082514666791, + "learning_rate": 0.00013648725047271898, + "loss": 1.0001, + "step": 10679 + }, + { + "epoch": 2.2781569965870307, + "grad_norm": 0.3367691081012723, + "learning_rate": 0.0001364812242055177, + "loss": 0.9246, + "step": 10680 + }, + { + "epoch": 2.2783703071672354, + "grad_norm": 0.29740064532421795, + "learning_rate": 0.00013647519729923042, + "loss": 0.9219, + "step": 10681 + }, + { + "epoch": 2.27858361774744, + "grad_norm": 0.2726899306655726, + "learning_rate": 0.0001364691697539253, + "loss": 0.9388, + "step": 10682 + }, + { + "epoch": 2.2787969283276452, + "grad_norm": 0.24977949179045572, + "learning_rate": 0.0001364631415696706, + "loss": 0.9272, + "step": 10683 + }, + { + "epoch": 2.27901023890785, + "grad_norm": 0.2553878586535113, + "learning_rate": 0.0001364571127465345, + "loss": 0.905, + "step": 10684 + }, + { + "epoch": 2.2792235494880546, + "grad_norm": 0.2587337996287429, + "learning_rate": 0.00013645108328458525, + "loss": 0.9619, + "step": 10685 + }, + { + "epoch": 2.2794368600682593, + "grad_norm": 0.30200724937057055, + "learning_rate": 0.000136445053183891, + "loss": 0.9149, + "step": 10686 + }, + { + "epoch": 2.279650170648464, + "grad_norm": 0.29404799843013657, + "learning_rate": 0.00013643902244452006, + "loss": 0.9128, + "step": 10687 + }, + { + "epoch": 2.279863481228669, + "grad_norm": 0.23220261359838298, + "learning_rate": 0.0001364329910665406, + "loss": 0.8986, + "step": 10688 + }, + { + "epoch": 2.280076791808874, + "grad_norm": 0.25052612205757024, + "learning_rate": 0.0001364269590500209, + "loss": 0.9469, + "step": 10689 + }, + { + "epoch": 2.2802901023890785, + "grad_norm": 0.2879368755191781, + "learning_rate": 0.00013642092639502924, + "loss": 0.9698, + "step": 10690 + }, + { + "epoch": 2.280503412969283, + "grad_norm": 0.41065689827617174, + "learning_rate": 0.0001364148931016338, + "loss": 0.9428, + "step": 10691 + }, + { + "epoch": 2.280716723549488, + "grad_norm": 0.460307952427483, + "learning_rate": 0.00013640885916990295, + "loss": 0.9217, + "step": 10692 + }, + { + "epoch": 2.280930034129693, + "grad_norm": 0.35809492518851954, + "learning_rate": 0.0001364028245999049, + "loss": 0.9289, + "step": 10693 + }, + { + "epoch": 2.2811433447098977, + "grad_norm": 0.2187031446860699, + "learning_rate": 0.0001363967893917079, + "loss": 0.9469, + "step": 10694 + }, + { + "epoch": 2.2813566552901023, + "grad_norm": 0.71816316157417, + "learning_rate": 0.0001363907535453803, + "loss": 0.9539, + "step": 10695 + }, + { + "epoch": 2.281569965870307, + "grad_norm": 0.3681336612921638, + "learning_rate": 0.0001363847170609904, + "loss": 0.946, + "step": 10696 + }, + { + "epoch": 2.281783276450512, + "grad_norm": 0.4471494344695872, + "learning_rate": 0.00013637867993860644, + "loss": 0.9419, + "step": 10697 + }, + { + "epoch": 2.281996587030717, + "grad_norm": 0.40504318246652543, + "learning_rate": 0.0001363726421782968, + "loss": 0.9581, + "step": 10698 + }, + { + "epoch": 2.2822098976109215, + "grad_norm": 0.34466118246433286, + "learning_rate": 0.00013636660378012975, + "loss": 0.9264, + "step": 10699 + }, + { + "epoch": 2.282423208191126, + "grad_norm": 0.2731038528427288, + "learning_rate": 0.00013636056474417363, + "loss": 0.943, + "step": 10700 + }, + { + "epoch": 2.282636518771331, + "grad_norm": 0.28104035024797763, + "learning_rate": 0.00013635452507049676, + "loss": 0.9459, + "step": 10701 + }, + { + "epoch": 2.282849829351536, + "grad_norm": 0.34380468164518857, + "learning_rate": 0.00013634848475916752, + "loss": 0.9622, + "step": 10702 + }, + { + "epoch": 2.2830631399317407, + "grad_norm": 0.3990338291774907, + "learning_rate": 0.0001363424438102542, + "loss": 0.9494, + "step": 10703 + }, + { + "epoch": 2.2832764505119454, + "grad_norm": 0.42019746888204246, + "learning_rate": 0.0001363364022238252, + "loss": 0.9631, + "step": 10704 + }, + { + "epoch": 2.28348976109215, + "grad_norm": 0.39438788867641994, + "learning_rate": 0.00013633035999994884, + "loss": 0.9375, + "step": 10705 + }, + { + "epoch": 2.2837030716723548, + "grad_norm": 0.3755537690024828, + "learning_rate": 0.00013632431713869354, + "loss": 0.924, + "step": 10706 + }, + { + "epoch": 2.28391638225256, + "grad_norm": 0.8463781960423159, + "learning_rate": 0.0001363182736401276, + "loss": 0.9717, + "step": 10707 + }, + { + "epoch": 2.2841296928327646, + "grad_norm": 0.2917865630670839, + "learning_rate": 0.0001363122295043195, + "loss": 0.9348, + "step": 10708 + }, + { + "epoch": 2.2843430034129693, + "grad_norm": 0.2514821459246412, + "learning_rate": 0.00013630618473133753, + "loss": 0.9402, + "step": 10709 + }, + { + "epoch": 2.284556313993174, + "grad_norm": 0.32737502128606377, + "learning_rate": 0.00013630013932125015, + "loss": 0.9476, + "step": 10710 + }, + { + "epoch": 2.2847696245733786, + "grad_norm": 0.31711216481874804, + "learning_rate": 0.00013629409327412575, + "loss": 0.9493, + "step": 10711 + }, + { + "epoch": 2.2849829351535837, + "grad_norm": 0.2346875192735703, + "learning_rate": 0.00013628804659003273, + "loss": 0.923, + "step": 10712 + }, + { + "epoch": 2.2851962457337884, + "grad_norm": 0.280737728005559, + "learning_rate": 0.00013628199926903954, + "loss": 0.9336, + "step": 10713 + }, + { + "epoch": 2.285409556313993, + "grad_norm": 0.372652693530235, + "learning_rate": 0.00013627595131121456, + "loss": 0.9521, + "step": 10714 + }, + { + "epoch": 2.285622866894198, + "grad_norm": 0.44495767464936664, + "learning_rate": 0.00013626990271662626, + "loss": 0.8987, + "step": 10715 + }, + { + "epoch": 2.285836177474403, + "grad_norm": 0.5151199413810497, + "learning_rate": 0.00013626385348534305, + "loss": 0.9216, + "step": 10716 + }, + { + "epoch": 2.2860494880546076, + "grad_norm": 0.44976807817458353, + "learning_rate": 0.00013625780361743337, + "loss": 0.9246, + "step": 10717 + }, + { + "epoch": 2.2862627986348123, + "grad_norm": 0.2766131355607638, + "learning_rate": 0.00013625175311296573, + "loss": 0.9099, + "step": 10718 + }, + { + "epoch": 2.286476109215017, + "grad_norm": 0.22445695358986703, + "learning_rate": 0.00013624570197200855, + "loss": 0.9328, + "step": 10719 + }, + { + "epoch": 2.2866894197952217, + "grad_norm": 0.4065419682361045, + "learning_rate": 0.0001362396501946303, + "loss": 0.952, + "step": 10720 + }, + { + "epoch": 2.286902730375427, + "grad_norm": 0.5573145976221978, + "learning_rate": 0.00013623359778089947, + "loss": 0.9338, + "step": 10721 + }, + { + "epoch": 2.2871160409556315, + "grad_norm": 0.6323244756766607, + "learning_rate": 0.00013622754473088453, + "loss": 0.9311, + "step": 10722 + }, + { + "epoch": 2.287329351535836, + "grad_norm": 0.5147142066304446, + "learning_rate": 0.00013622149104465398, + "loss": 0.9149, + "step": 10723 + }, + { + "epoch": 2.287542662116041, + "grad_norm": 0.3294869937869465, + "learning_rate": 0.00013621543672227629, + "loss": 0.9601, + "step": 10724 + }, + { + "epoch": 2.287755972696246, + "grad_norm": 0.2618439291045757, + "learning_rate": 0.00013620938176382003, + "loss": 0.9474, + "step": 10725 + }, + { + "epoch": 2.2879692832764507, + "grad_norm": 0.223639129379761, + "learning_rate": 0.00013620332616935362, + "loss": 0.9697, + "step": 10726 + }, + { + "epoch": 2.2881825938566553, + "grad_norm": 0.2646959132317392, + "learning_rate": 0.00013619726993894566, + "loss": 0.9402, + "step": 10727 + }, + { + "epoch": 2.28839590443686, + "grad_norm": 0.3376855916769594, + "learning_rate": 0.00013619121307266465, + "loss": 0.9307, + "step": 10728 + }, + { + "epoch": 2.2886092150170647, + "grad_norm": 0.3339458637421252, + "learning_rate": 0.0001361851555705791, + "loss": 0.9003, + "step": 10729 + }, + { + "epoch": 2.2888225255972694, + "grad_norm": 0.3324951526351189, + "learning_rate": 0.0001361790974327576, + "loss": 0.9701, + "step": 10730 + }, + { + "epoch": 2.2890358361774745, + "grad_norm": 0.2672478608032434, + "learning_rate": 0.00013617303865926864, + "loss": 0.9682, + "step": 10731 + }, + { + "epoch": 2.289249146757679, + "grad_norm": 0.2097838218827353, + "learning_rate": 0.00013616697925018082, + "loss": 0.9574, + "step": 10732 + }, + { + "epoch": 2.289462457337884, + "grad_norm": 0.28911174874123824, + "learning_rate": 0.00013616091920556266, + "loss": 0.9545, + "step": 10733 + }, + { + "epoch": 2.2896757679180886, + "grad_norm": 0.332362606903746, + "learning_rate": 0.00013615485852548276, + "loss": 0.9387, + "step": 10734 + }, + { + "epoch": 2.2898890784982937, + "grad_norm": 0.29686809273500797, + "learning_rate": 0.0001361487972100097, + "loss": 0.946, + "step": 10735 + }, + { + "epoch": 2.2901023890784984, + "grad_norm": 0.23266653985943142, + "learning_rate": 0.00013614273525921206, + "loss": 0.9457, + "step": 10736 + }, + { + "epoch": 2.290315699658703, + "grad_norm": 0.21742241167993748, + "learning_rate": 0.00013613667267315844, + "loss": 0.9605, + "step": 10737 + }, + { + "epoch": 2.2905290102389078, + "grad_norm": 0.2500519073339389, + "learning_rate": 0.00013613060945191739, + "loss": 0.9416, + "step": 10738 + }, + { + "epoch": 2.2907423208191124, + "grad_norm": 0.24245134314065334, + "learning_rate": 0.00013612454559555758, + "loss": 0.9283, + "step": 10739 + }, + { + "epoch": 2.2909556313993176, + "grad_norm": 0.38172711259070036, + "learning_rate": 0.00013611848110414756, + "loss": 0.9555, + "step": 10740 + }, + { + "epoch": 2.2911689419795223, + "grad_norm": 0.4578558376298644, + "learning_rate": 0.000136112415977756, + "loss": 0.9032, + "step": 10741 + }, + { + "epoch": 2.291382252559727, + "grad_norm": 0.39421921739510196, + "learning_rate": 0.00013610635021645153, + "loss": 0.9857, + "step": 10742 + }, + { + "epoch": 2.2915955631399316, + "grad_norm": 0.33192405252614093, + "learning_rate": 0.00013610028382030274, + "loss": 0.9352, + "step": 10743 + }, + { + "epoch": 2.2918088737201368, + "grad_norm": 0.2806061110102085, + "learning_rate": 0.0001360942167893783, + "loss": 0.9421, + "step": 10744 + }, + { + "epoch": 2.2920221843003414, + "grad_norm": 0.22395308603051348, + "learning_rate": 0.00013608814912374685, + "loss": 0.9325, + "step": 10745 + }, + { + "epoch": 2.292235494880546, + "grad_norm": 0.24143916107394892, + "learning_rate": 0.00013608208082347705, + "loss": 0.9416, + "step": 10746 + }, + { + "epoch": 2.292448805460751, + "grad_norm": 0.24748982483371812, + "learning_rate": 0.00013607601188863757, + "loss": 0.9519, + "step": 10747 + }, + { + "epoch": 2.2926621160409555, + "grad_norm": 0.3189403673107354, + "learning_rate": 0.00013606994231929706, + "loss": 0.9062, + "step": 10748 + }, + { + "epoch": 2.29287542662116, + "grad_norm": 0.3204819440143764, + "learning_rate": 0.0001360638721155242, + "loss": 0.939, + "step": 10749 + }, + { + "epoch": 2.2930887372013653, + "grad_norm": 0.2746722265816581, + "learning_rate": 0.0001360578012773877, + "loss": 0.9299, + "step": 10750 + }, + { + "epoch": 2.29330204778157, + "grad_norm": 0.2238558694126338, + "learning_rate": 0.00013605172980495626, + "loss": 0.9009, + "step": 10751 + }, + { + "epoch": 2.2935153583617747, + "grad_norm": 0.2709891104740865, + "learning_rate": 0.00013604565769829852, + "loss": 0.9621, + "step": 10752 + }, + { + "epoch": 2.2937286689419794, + "grad_norm": 0.34373800779148256, + "learning_rate": 0.00013603958495748324, + "loss": 0.9505, + "step": 10753 + }, + { + "epoch": 2.2939419795221845, + "grad_norm": 0.40619820475484186, + "learning_rate": 0.0001360335115825791, + "loss": 0.9349, + "step": 10754 + }, + { + "epoch": 2.294155290102389, + "grad_norm": 0.6896191353833816, + "learning_rate": 0.00013602743757365484, + "loss": 0.9709, + "step": 10755 + }, + { + "epoch": 2.294368600682594, + "grad_norm": 0.4051249401821519, + "learning_rate": 0.00013602136293077916, + "loss": 0.8941, + "step": 10756 + }, + { + "epoch": 2.2945819112627985, + "grad_norm": 0.26682095608829426, + "learning_rate": 0.00013601528765402085, + "loss": 0.9428, + "step": 10757 + }, + { + "epoch": 2.294795221843003, + "grad_norm": 0.28005493436767914, + "learning_rate": 0.0001360092117434486, + "loss": 0.9346, + "step": 10758 + }, + { + "epoch": 2.2950085324232083, + "grad_norm": 0.316646708202716, + "learning_rate": 0.0001360031351991312, + "loss": 0.9215, + "step": 10759 + }, + { + "epoch": 2.295221843003413, + "grad_norm": 0.35753075336259205, + "learning_rate": 0.00013599705802113735, + "loss": 0.9304, + "step": 10760 + }, + { + "epoch": 2.2954351535836177, + "grad_norm": 0.46708772392342957, + "learning_rate": 0.00013599098020953586, + "loss": 0.9106, + "step": 10761 + }, + { + "epoch": 2.2956484641638224, + "grad_norm": 0.4820592505289586, + "learning_rate": 0.0001359849017643955, + "loss": 0.9478, + "step": 10762 + }, + { + "epoch": 2.2958617747440275, + "grad_norm": 0.39562222764365756, + "learning_rate": 0.000135978822685785, + "loss": 0.9367, + "step": 10763 + }, + { + "epoch": 2.296075085324232, + "grad_norm": 0.2767028155638003, + "learning_rate": 0.00013597274297377324, + "loss": 0.9376, + "step": 10764 + }, + { + "epoch": 2.296288395904437, + "grad_norm": 0.2308367901904963, + "learning_rate": 0.00013596666262842888, + "loss": 0.9614, + "step": 10765 + }, + { + "epoch": 2.2965017064846416, + "grad_norm": 0.20114906323324583, + "learning_rate": 0.00013596058164982083, + "loss": 0.9296, + "step": 10766 + }, + { + "epoch": 2.2967150170648463, + "grad_norm": 0.25309430805815947, + "learning_rate": 0.00013595450003801783, + "loss": 0.9406, + "step": 10767 + }, + { + "epoch": 2.296928327645051, + "grad_norm": 0.3026031898461191, + "learning_rate": 0.00013594841779308877, + "loss": 0.9491, + "step": 10768 + }, + { + "epoch": 2.297141638225256, + "grad_norm": 0.43086355134236276, + "learning_rate": 0.00013594233491510239, + "loss": 0.9564, + "step": 10769 + }, + { + "epoch": 2.2973549488054608, + "grad_norm": 0.434860672907543, + "learning_rate": 0.00013593625140412754, + "loss": 0.9389, + "step": 10770 + }, + { + "epoch": 2.2975682593856654, + "grad_norm": 0.3433447293882791, + "learning_rate": 0.0001359301672602331, + "loss": 0.9174, + "step": 10771 + }, + { + "epoch": 2.29778156996587, + "grad_norm": 0.25208953656700356, + "learning_rate": 0.00013592408248348786, + "loss": 0.9505, + "step": 10772 + }, + { + "epoch": 2.2979948805460753, + "grad_norm": 0.21933861870162655, + "learning_rate": 0.0001359179970739607, + "loss": 0.945, + "step": 10773 + }, + { + "epoch": 2.29820819112628, + "grad_norm": 0.41194536176162844, + "learning_rate": 0.00013591191103172042, + "loss": 0.9871, + "step": 10774 + }, + { + "epoch": 2.2984215017064846, + "grad_norm": 0.2743746335617684, + "learning_rate": 0.00013590582435683593, + "loss": 0.9309, + "step": 10775 + }, + { + "epoch": 2.2986348122866893, + "grad_norm": 0.3159630970564043, + "learning_rate": 0.00013589973704937613, + "loss": 0.9363, + "step": 10776 + }, + { + "epoch": 2.298848122866894, + "grad_norm": 0.25932580933737087, + "learning_rate": 0.00013589364910940985, + "loss": 0.9243, + "step": 10777 + }, + { + "epoch": 2.299061433447099, + "grad_norm": 0.21596726263557064, + "learning_rate": 0.000135887560537006, + "loss": 0.9335, + "step": 10778 + }, + { + "epoch": 2.299274744027304, + "grad_norm": 0.4194495664279133, + "learning_rate": 0.00013588147133223346, + "loss": 0.9055, + "step": 10779 + }, + { + "epoch": 2.2994880546075085, + "grad_norm": 0.3008555098602416, + "learning_rate": 0.0001358753814951611, + "loss": 0.9417, + "step": 10780 + }, + { + "epoch": 2.299701365187713, + "grad_norm": 0.2562045236211279, + "learning_rate": 0.00013586929102585792, + "loss": 0.9102, + "step": 10781 + }, + { + "epoch": 2.2999146757679183, + "grad_norm": 0.3190395398470003, + "learning_rate": 0.00013586319992439272, + "loss": 0.9408, + "step": 10782 + }, + { + "epoch": 2.300127986348123, + "grad_norm": 0.2863737248805113, + "learning_rate": 0.0001358571081908345, + "loss": 0.9549, + "step": 10783 + }, + { + "epoch": 2.3003412969283277, + "grad_norm": 0.27724387017439805, + "learning_rate": 0.00013585101582525214, + "loss": 0.9286, + "step": 10784 + }, + { + "epoch": 2.3005546075085324, + "grad_norm": 0.2740491394839062, + "learning_rate": 0.00013584492282771462, + "loss": 0.971, + "step": 10785 + }, + { + "epoch": 2.300767918088737, + "grad_norm": 0.24842883147099287, + "learning_rate": 0.00013583882919829084, + "loss": 0.9435, + "step": 10786 + }, + { + "epoch": 2.300981228668942, + "grad_norm": 0.2787779242751302, + "learning_rate": 0.00013583273493704979, + "loss": 0.9479, + "step": 10787 + }, + { + "epoch": 2.301194539249147, + "grad_norm": 0.2527447370660577, + "learning_rate": 0.0001358266400440604, + "loss": 0.967, + "step": 10788 + }, + { + "epoch": 2.3014078498293515, + "grad_norm": 0.24882243721585803, + "learning_rate": 0.0001358205445193916, + "loss": 0.932, + "step": 10789 + }, + { + "epoch": 2.301621160409556, + "grad_norm": 0.2536072879082457, + "learning_rate": 0.00013581444836311242, + "loss": 0.949, + "step": 10790 + }, + { + "epoch": 2.301834470989761, + "grad_norm": 0.26539908250373007, + "learning_rate": 0.00013580835157529184, + "loss": 0.9412, + "step": 10791 + }, + { + "epoch": 2.302047781569966, + "grad_norm": 0.2790185240009239, + "learning_rate": 0.00013580225415599882, + "loss": 0.9285, + "step": 10792 + }, + { + "epoch": 2.3022610921501707, + "grad_norm": 0.2989163662582259, + "learning_rate": 0.00013579615610530234, + "loss": 0.9067, + "step": 10793 + }, + { + "epoch": 2.3024744027303754, + "grad_norm": 0.22038121049137746, + "learning_rate": 0.0001357900574232714, + "loss": 0.9332, + "step": 10794 + }, + { + "epoch": 2.30268771331058, + "grad_norm": 0.2591946154800659, + "learning_rate": 0.00013578395810997506, + "loss": 0.9572, + "step": 10795 + }, + { + "epoch": 2.3029010238907848, + "grad_norm": 0.22137889052282117, + "learning_rate": 0.0001357778581654823, + "loss": 0.9315, + "step": 10796 + }, + { + "epoch": 2.30311433447099, + "grad_norm": 0.23418635912782357, + "learning_rate": 0.00013577175758986213, + "loss": 0.9248, + "step": 10797 + }, + { + "epoch": 2.3033276450511946, + "grad_norm": 0.22084838346165872, + "learning_rate": 0.00013576565638318355, + "loss": 0.9043, + "step": 10798 + }, + { + "epoch": 2.3035409556313993, + "grad_norm": 0.2713390601487926, + "learning_rate": 0.00013575955454551565, + "loss": 0.9654, + "step": 10799 + }, + { + "epoch": 2.303754266211604, + "grad_norm": 0.33319472494237085, + "learning_rate": 0.00013575345207692747, + "loss": 0.9801, + "step": 10800 + }, + { + "epoch": 2.303967576791809, + "grad_norm": 0.4439975336699038, + "learning_rate": 0.000135747348977488, + "loss": 0.9714, + "step": 10801 + }, + { + "epoch": 2.3041808873720138, + "grad_norm": 0.5164146325413195, + "learning_rate": 0.00013574124524726636, + "loss": 0.9341, + "step": 10802 + }, + { + "epoch": 2.3043941979522184, + "grad_norm": 0.497066421689921, + "learning_rate": 0.00013573514088633163, + "loss": 0.9626, + "step": 10803 + }, + { + "epoch": 2.304607508532423, + "grad_norm": 0.49575291156746226, + "learning_rate": 0.0001357290358947528, + "loss": 0.929, + "step": 10804 + }, + { + "epoch": 2.304820819112628, + "grad_norm": 0.42505498461555713, + "learning_rate": 0.00013572293027259898, + "loss": 0.897, + "step": 10805 + }, + { + "epoch": 2.305034129692833, + "grad_norm": 0.27356344833298607, + "learning_rate": 0.0001357168240199393, + "loss": 0.9484, + "step": 10806 + }, + { + "epoch": 2.3052474402730376, + "grad_norm": 0.2311934926311712, + "learning_rate": 0.00013571071713684282, + "loss": 0.9316, + "step": 10807 + }, + { + "epoch": 2.3054607508532423, + "grad_norm": 0.36310215985486594, + "learning_rate": 0.00013570460962337858, + "loss": 0.924, + "step": 10808 + }, + { + "epoch": 2.305674061433447, + "grad_norm": 0.4815104529356905, + "learning_rate": 0.00013569850147961582, + "loss": 0.9484, + "step": 10809 + }, + { + "epoch": 2.305887372013652, + "grad_norm": 0.5239036685408924, + "learning_rate": 0.00013569239270562355, + "loss": 0.9034, + "step": 10810 + }, + { + "epoch": 2.306100682593857, + "grad_norm": 0.42080576772703815, + "learning_rate": 0.00013568628330147092, + "loss": 0.9537, + "step": 10811 + }, + { + "epoch": 2.3063139931740615, + "grad_norm": 0.2702478277685062, + "learning_rate": 0.00013568017326722704, + "loss": 0.9303, + "step": 10812 + }, + { + "epoch": 2.306527303754266, + "grad_norm": 0.21679935964809907, + "learning_rate": 0.0001356740626029611, + "loss": 0.9846, + "step": 10813 + }, + { + "epoch": 2.306740614334471, + "grad_norm": 0.29636700743505096, + "learning_rate": 0.0001356679513087422, + "loss": 0.97, + "step": 10814 + }, + { + "epoch": 2.3069539249146755, + "grad_norm": 0.4240218681499342, + "learning_rate": 0.00013566183938463946, + "loss": 0.9268, + "step": 10815 + }, + { + "epoch": 2.3071672354948807, + "grad_norm": 0.4511156457205635, + "learning_rate": 0.0001356557268307221, + "loss": 0.9181, + "step": 10816 + }, + { + "epoch": 2.3073805460750854, + "grad_norm": 0.41510730495708814, + "learning_rate": 0.00013564961364705925, + "loss": 0.916, + "step": 10817 + }, + { + "epoch": 2.30759385665529, + "grad_norm": 0.3953821448619048, + "learning_rate": 0.00013564349983372006, + "loss": 0.9317, + "step": 10818 + }, + { + "epoch": 2.3078071672354947, + "grad_norm": 0.3155598102826075, + "learning_rate": 0.00013563738539077374, + "loss": 0.9258, + "step": 10819 + }, + { + "epoch": 2.3080204778157, + "grad_norm": 0.20844567014599774, + "learning_rate": 0.0001356312703182895, + "loss": 0.9453, + "step": 10820 + }, + { + "epoch": 2.3082337883959045, + "grad_norm": 0.22499129111247054, + "learning_rate": 0.0001356251546163365, + "loss": 0.9402, + "step": 10821 + }, + { + "epoch": 2.3084470989761092, + "grad_norm": 0.2634887199820551, + "learning_rate": 0.00013561903828498392, + "loss": 0.9084, + "step": 10822 + }, + { + "epoch": 2.308660409556314, + "grad_norm": 0.2139300723632584, + "learning_rate": 0.000135612921324301, + "loss": 0.9753, + "step": 10823 + }, + { + "epoch": 2.3088737201365186, + "grad_norm": 0.2098184965594132, + "learning_rate": 0.00013560680373435693, + "loss": 0.9326, + "step": 10824 + }, + { + "epoch": 2.3090870307167237, + "grad_norm": 0.17910671535996706, + "learning_rate": 0.00013560068551522094, + "loss": 0.9095, + "step": 10825 + }, + { + "epoch": 2.3093003412969284, + "grad_norm": 0.17821162707472923, + "learning_rate": 0.00013559456666696224, + "loss": 0.9448, + "step": 10826 + }, + { + "epoch": 2.309513651877133, + "grad_norm": 0.1885194761034086, + "learning_rate": 0.0001355884471896501, + "loss": 0.9223, + "step": 10827 + }, + { + "epoch": 2.3097269624573378, + "grad_norm": 0.2570736590940854, + "learning_rate": 0.00013558232708335376, + "loss": 0.9708, + "step": 10828 + }, + { + "epoch": 2.309940273037543, + "grad_norm": 0.2613565674708997, + "learning_rate": 0.00013557620634814242, + "loss": 0.9201, + "step": 10829 + }, + { + "epoch": 2.3101535836177476, + "grad_norm": 0.2561916411425998, + "learning_rate": 0.0001355700849840854, + "loss": 0.9166, + "step": 10830 + }, + { + "epoch": 2.3103668941979523, + "grad_norm": 0.1829961770006171, + "learning_rate": 0.00013556396299125193, + "loss": 0.9062, + "step": 10831 + }, + { + "epoch": 2.310580204778157, + "grad_norm": 0.20267252957130175, + "learning_rate": 0.00013555784036971128, + "loss": 0.9436, + "step": 10832 + }, + { + "epoch": 2.3107935153583616, + "grad_norm": 0.24085232960344627, + "learning_rate": 0.00013555171711953273, + "loss": 0.9243, + "step": 10833 + }, + { + "epoch": 2.3110068259385663, + "grad_norm": 0.24322429966317716, + "learning_rate": 0.00013554559324078553, + "loss": 0.9483, + "step": 10834 + }, + { + "epoch": 2.3112201365187715, + "grad_norm": 0.1917414793592333, + "learning_rate": 0.00013553946873353908, + "loss": 0.9454, + "step": 10835 + }, + { + "epoch": 2.311433447098976, + "grad_norm": 0.2357311370739099, + "learning_rate": 0.00013553334359786256, + "loss": 0.9525, + "step": 10836 + }, + { + "epoch": 2.311646757679181, + "grad_norm": 0.2501622964872226, + "learning_rate": 0.00013552721783382534, + "loss": 0.9267, + "step": 10837 + }, + { + "epoch": 2.3118600682593855, + "grad_norm": 0.2654639851621868, + "learning_rate": 0.00013552109144149673, + "loss": 0.9585, + "step": 10838 + }, + { + "epoch": 2.3120733788395906, + "grad_norm": 0.2834730493503062, + "learning_rate": 0.000135514964420946, + "loss": 0.9392, + "step": 10839 + }, + { + "epoch": 2.3122866894197953, + "grad_norm": 0.3152339256547176, + "learning_rate": 0.00013550883677224254, + "loss": 0.9476, + "step": 10840 + }, + { + "epoch": 2.3125, + "grad_norm": 0.31535267053478244, + "learning_rate": 0.00013550270849545563, + "loss": 0.9239, + "step": 10841 + }, + { + "epoch": 2.3127133105802047, + "grad_norm": 0.2739728393726887, + "learning_rate": 0.0001354965795906547, + "loss": 0.9169, + "step": 10842 + }, + { + "epoch": 2.3129266211604094, + "grad_norm": 0.23981513879820168, + "learning_rate": 0.00013549045005790898, + "loss": 0.9316, + "step": 10843 + }, + { + "epoch": 2.3131399317406145, + "grad_norm": 0.244386738213034, + "learning_rate": 0.0001354843198972879, + "loss": 0.9384, + "step": 10844 + }, + { + "epoch": 2.313353242320819, + "grad_norm": 0.26694343342246357, + "learning_rate": 0.0001354781891088608, + "loss": 0.9326, + "step": 10845 + }, + { + "epoch": 2.313566552901024, + "grad_norm": 0.23725690215945944, + "learning_rate": 0.00013547205769269706, + "loss": 0.939, + "step": 10846 + }, + { + "epoch": 2.3137798634812285, + "grad_norm": 0.2807572293485074, + "learning_rate": 0.00013546592564886607, + "loss": 0.9243, + "step": 10847 + }, + { + "epoch": 2.3139931740614337, + "grad_norm": 0.2792983808114995, + "learning_rate": 0.0001354597929774372, + "loss": 0.9234, + "step": 10848 + }, + { + "epoch": 2.3142064846416384, + "grad_norm": 0.2480037644953833, + "learning_rate": 0.00013545365967847982, + "loss": 0.9356, + "step": 10849 + }, + { + "epoch": 2.314419795221843, + "grad_norm": 0.2685585316073883, + "learning_rate": 0.00013544752575206335, + "loss": 0.9325, + "step": 10850 + }, + { + "epoch": 2.3146331058020477, + "grad_norm": 0.27436757578463034, + "learning_rate": 0.0001354413911982572, + "loss": 0.9199, + "step": 10851 + }, + { + "epoch": 2.3148464163822524, + "grad_norm": 0.2875995831954657, + "learning_rate": 0.00013543525601713077, + "loss": 0.9577, + "step": 10852 + }, + { + "epoch": 2.315059726962457, + "grad_norm": 0.28045733466381684, + "learning_rate": 0.0001354291202087535, + "loss": 0.9397, + "step": 10853 + }, + { + "epoch": 2.3152730375426622, + "grad_norm": 0.26598587235595117, + "learning_rate": 0.0001354229837731948, + "loss": 0.9364, + "step": 10854 + }, + { + "epoch": 2.315486348122867, + "grad_norm": 0.21092566920539418, + "learning_rate": 0.0001354168467105241, + "loss": 0.9515, + "step": 10855 + }, + { + "epoch": 2.3156996587030716, + "grad_norm": 0.19892726669706598, + "learning_rate": 0.00013541070902081085, + "loss": 0.9536, + "step": 10856 + }, + { + "epoch": 2.3159129692832763, + "grad_norm": 0.2167103469021877, + "learning_rate": 0.0001354045707041245, + "loss": 0.9257, + "step": 10857 + }, + { + "epoch": 2.3161262798634814, + "grad_norm": 0.2198804805001975, + "learning_rate": 0.00013539843176053448, + "loss": 0.9299, + "step": 10858 + }, + { + "epoch": 2.316339590443686, + "grad_norm": 0.26517640416727134, + "learning_rate": 0.0001353922921901103, + "loss": 0.967, + "step": 10859 + }, + { + "epoch": 2.3165529010238908, + "grad_norm": 0.248674032962805, + "learning_rate": 0.00013538615199292138, + "loss": 0.9159, + "step": 10860 + }, + { + "epoch": 2.3167662116040955, + "grad_norm": 0.2309521656236075, + "learning_rate": 0.00013538001116903723, + "loss": 0.9316, + "step": 10861 + }, + { + "epoch": 2.3169795221843, + "grad_norm": 0.25187051324587373, + "learning_rate": 0.0001353738697185273, + "loss": 0.9597, + "step": 10862 + }, + { + "epoch": 2.3171928327645053, + "grad_norm": 0.27410095144376395, + "learning_rate": 0.00013536772764146114, + "loss": 0.9073, + "step": 10863 + }, + { + "epoch": 2.31740614334471, + "grad_norm": 0.23131166387876276, + "learning_rate": 0.0001353615849379082, + "loss": 0.9678, + "step": 10864 + }, + { + "epoch": 2.3176194539249146, + "grad_norm": 0.20467005244721445, + "learning_rate": 0.000135355441607938, + "loss": 0.9309, + "step": 10865 + }, + { + "epoch": 2.3178327645051193, + "grad_norm": 0.6317975200709492, + "learning_rate": 0.00013534929765162003, + "loss": 0.9207, + "step": 10866 + }, + { + "epoch": 2.3180460750853245, + "grad_norm": 0.22106565290843752, + "learning_rate": 0.0001353431530690238, + "loss": 0.9354, + "step": 10867 + }, + { + "epoch": 2.318259385665529, + "grad_norm": 0.22891085899242147, + "learning_rate": 0.0001353370078602189, + "loss": 0.9326, + "step": 10868 + }, + { + "epoch": 2.318472696245734, + "grad_norm": 0.2445628530920273, + "learning_rate": 0.00013533086202527483, + "loss": 0.9598, + "step": 10869 + }, + { + "epoch": 2.3186860068259385, + "grad_norm": 0.21826484429186424, + "learning_rate": 0.00013532471556426112, + "loss": 0.9502, + "step": 10870 + }, + { + "epoch": 2.318899317406143, + "grad_norm": 0.2370856214001303, + "learning_rate": 0.0001353185684772473, + "loss": 0.9329, + "step": 10871 + }, + { + "epoch": 2.319112627986348, + "grad_norm": 0.24524239628370445, + "learning_rate": 0.00013531242076430297, + "loss": 0.935, + "step": 10872 + }, + { + "epoch": 2.319325938566553, + "grad_norm": 0.2689593110120229, + "learning_rate": 0.00013530627242549768, + "loss": 0.9442, + "step": 10873 + }, + { + "epoch": 2.3195392491467577, + "grad_norm": 0.3037315525793781, + "learning_rate": 0.00013530012346090096, + "loss": 0.9295, + "step": 10874 + }, + { + "epoch": 2.3197525597269624, + "grad_norm": 0.312485069700222, + "learning_rate": 0.0001352939738705824, + "loss": 0.9468, + "step": 10875 + }, + { + "epoch": 2.319965870307167, + "grad_norm": 0.321056428954102, + "learning_rate": 0.00013528782365461163, + "loss": 0.9337, + "step": 10876 + }, + { + "epoch": 2.320179180887372, + "grad_norm": 0.3162948013908949, + "learning_rate": 0.0001352816728130582, + "loss": 0.9577, + "step": 10877 + }, + { + "epoch": 2.320392491467577, + "grad_norm": 0.2572247740033306, + "learning_rate": 0.0001352755213459917, + "loss": 0.9472, + "step": 10878 + }, + { + "epoch": 2.3206058020477816, + "grad_norm": 0.2756223487731754, + "learning_rate": 0.00013526936925348175, + "loss": 0.9273, + "step": 10879 + }, + { + "epoch": 2.3208191126279862, + "grad_norm": 0.2666399863755373, + "learning_rate": 0.00013526321653559796, + "loss": 0.9469, + "step": 10880 + }, + { + "epoch": 2.321032423208191, + "grad_norm": 0.19220401434951584, + "learning_rate": 0.00013525706319240995, + "loss": 0.9404, + "step": 10881 + }, + { + "epoch": 2.321245733788396, + "grad_norm": 0.22598483344004047, + "learning_rate": 0.00013525090922398734, + "loss": 0.9158, + "step": 10882 + }, + { + "epoch": 2.3214590443686007, + "grad_norm": 0.30475215132301664, + "learning_rate": 0.00013524475463039975, + "loss": 0.9202, + "step": 10883 + }, + { + "epoch": 2.3216723549488054, + "grad_norm": 0.2163112362060952, + "learning_rate": 0.00013523859941171683, + "loss": 0.9358, + "step": 10884 + }, + { + "epoch": 2.32188566552901, + "grad_norm": 0.22935602596128185, + "learning_rate": 0.00013523244356800825, + "loss": 0.9444, + "step": 10885 + }, + { + "epoch": 2.3220989761092152, + "grad_norm": 0.2874261006277325, + "learning_rate": 0.00013522628709934363, + "loss": 0.9193, + "step": 10886 + }, + { + "epoch": 2.32231228668942, + "grad_norm": 0.20776699004407195, + "learning_rate": 0.00013522013000579264, + "loss": 0.9275, + "step": 10887 + }, + { + "epoch": 2.3225255972696246, + "grad_norm": 0.240839192155864, + "learning_rate": 0.00013521397228742496, + "loss": 0.9454, + "step": 10888 + }, + { + "epoch": 2.3227389078498293, + "grad_norm": 0.2612779008054862, + "learning_rate": 0.00013520781394431026, + "loss": 0.9347, + "step": 10889 + }, + { + "epoch": 2.322952218430034, + "grad_norm": 0.27326773037758806, + "learning_rate": 0.0001352016549765182, + "loss": 0.9455, + "step": 10890 + }, + { + "epoch": 2.323165529010239, + "grad_norm": 0.3004613589625285, + "learning_rate": 0.0001351954953841185, + "loss": 0.9202, + "step": 10891 + }, + { + "epoch": 2.323378839590444, + "grad_norm": 0.26499480507860673, + "learning_rate": 0.00013518933516718082, + "loss": 0.9439, + "step": 10892 + }, + { + "epoch": 2.3235921501706485, + "grad_norm": 0.2785758852604333, + "learning_rate": 0.00013518317432577488, + "loss": 0.9999, + "step": 10893 + }, + { + "epoch": 2.323805460750853, + "grad_norm": 0.3000098720240082, + "learning_rate": 0.00013517701285997044, + "loss": 0.9646, + "step": 10894 + }, + { + "epoch": 2.324018771331058, + "grad_norm": 0.28963962050787784, + "learning_rate": 0.00013517085076983712, + "loss": 0.9294, + "step": 10895 + }, + { + "epoch": 2.324232081911263, + "grad_norm": 0.2652053542839378, + "learning_rate": 0.00013516468805544474, + "loss": 0.972, + "step": 10896 + }, + { + "epoch": 2.3244453924914676, + "grad_norm": 0.27637946518172224, + "learning_rate": 0.00013515852471686295, + "loss": 0.9377, + "step": 10897 + }, + { + "epoch": 2.3246587030716723, + "grad_norm": 0.28401949312017677, + "learning_rate": 0.00013515236075416155, + "loss": 0.9318, + "step": 10898 + }, + { + "epoch": 2.324872013651877, + "grad_norm": 0.3038401482595404, + "learning_rate": 0.00013514619616741025, + "loss": 0.9492, + "step": 10899 + }, + { + "epoch": 2.3250853242320817, + "grad_norm": 0.307957205910618, + "learning_rate": 0.0001351400309566788, + "loss": 0.9398, + "step": 10900 + }, + { + "epoch": 2.325298634812287, + "grad_norm": 0.39175821792124416, + "learning_rate": 0.000135133865122037, + "loss": 0.9467, + "step": 10901 + }, + { + "epoch": 2.3255119453924915, + "grad_norm": 0.46525384368507083, + "learning_rate": 0.00013512769866355458, + "loss": 0.9577, + "step": 10902 + }, + { + "epoch": 2.325725255972696, + "grad_norm": 0.3649887884754523, + "learning_rate": 0.0001351215315813013, + "loss": 0.928, + "step": 10903 + }, + { + "epoch": 2.325938566552901, + "grad_norm": 0.2663349981925354, + "learning_rate": 0.00013511536387534698, + "loss": 0.9501, + "step": 10904 + }, + { + "epoch": 2.326151877133106, + "grad_norm": 0.2848932991186994, + "learning_rate": 0.0001351091955457614, + "loss": 0.9248, + "step": 10905 + }, + { + "epoch": 2.3263651877133107, + "grad_norm": 0.28089930603211377, + "learning_rate": 0.0001351030265926143, + "loss": 0.9194, + "step": 10906 + }, + { + "epoch": 2.3265784982935154, + "grad_norm": 0.3982175080321818, + "learning_rate": 0.0001350968570159756, + "loss": 0.9072, + "step": 10907 + }, + { + "epoch": 2.32679180887372, + "grad_norm": 0.4271290939573652, + "learning_rate": 0.000135090686815915, + "loss": 0.9315, + "step": 10908 + }, + { + "epoch": 2.3270051194539247, + "grad_norm": 0.3308126140677552, + "learning_rate": 0.00013508451599250233, + "loss": 0.9542, + "step": 10909 + }, + { + "epoch": 2.32721843003413, + "grad_norm": 0.26935639462612504, + "learning_rate": 0.00013507834454580748, + "loss": 0.9479, + "step": 10910 + }, + { + "epoch": 2.3274317406143346, + "grad_norm": 0.23345604915384888, + "learning_rate": 0.00013507217247590018, + "loss": 0.9424, + "step": 10911 + }, + { + "epoch": 2.3276450511945392, + "grad_norm": 0.2605277145682045, + "learning_rate": 0.00013506599978285036, + "loss": 0.9433, + "step": 10912 + }, + { + "epoch": 2.327858361774744, + "grad_norm": 0.3018424295741254, + "learning_rate": 0.00013505982646672783, + "loss": 0.9476, + "step": 10913 + }, + { + "epoch": 2.328071672354949, + "grad_norm": 0.3205731876219646, + "learning_rate": 0.0001350536525276024, + "loss": 0.9798, + "step": 10914 + }, + { + "epoch": 2.3282849829351537, + "grad_norm": 0.2860825608651541, + "learning_rate": 0.000135047477965544, + "loss": 0.9847, + "step": 10915 + }, + { + "epoch": 2.3284982935153584, + "grad_norm": 0.22575096405397646, + "learning_rate": 0.00013504130278062246, + "loss": 0.9316, + "step": 10916 + }, + { + "epoch": 2.328711604095563, + "grad_norm": 0.2496310174883814, + "learning_rate": 0.00013503512697290765, + "loss": 0.9416, + "step": 10917 + }, + { + "epoch": 2.328924914675768, + "grad_norm": 0.31306263708050996, + "learning_rate": 0.00013502895054246942, + "loss": 0.9635, + "step": 10918 + }, + { + "epoch": 2.3291382252559725, + "grad_norm": 0.24679240026453622, + "learning_rate": 0.00013502277348937774, + "loss": 0.8775, + "step": 10919 + }, + { + "epoch": 2.3293515358361776, + "grad_norm": 0.25195322386081015, + "learning_rate": 0.00013501659581370242, + "loss": 0.9196, + "step": 10920 + }, + { + "epoch": 2.3295648464163823, + "grad_norm": 0.24586332786498472, + "learning_rate": 0.00013501041751551342, + "loss": 0.9385, + "step": 10921 + }, + { + "epoch": 2.329778156996587, + "grad_norm": 0.20268983600134582, + "learning_rate": 0.0001350042385948806, + "loss": 0.9174, + "step": 10922 + }, + { + "epoch": 2.3299914675767917, + "grad_norm": 0.2390013735073647, + "learning_rate": 0.0001349980590518739, + "loss": 0.9471, + "step": 10923 + }, + { + "epoch": 2.330204778156997, + "grad_norm": 0.2669361947272292, + "learning_rate": 0.0001349918788865632, + "loss": 0.9168, + "step": 10924 + }, + { + "epoch": 2.3304180887372015, + "grad_norm": 0.25699057845043355, + "learning_rate": 0.00013498569809901854, + "loss": 0.9231, + "step": 10925 + }, + { + "epoch": 2.330631399317406, + "grad_norm": 0.24555735815121085, + "learning_rate": 0.00013497951668930974, + "loss": 0.9596, + "step": 10926 + }, + { + "epoch": 2.330844709897611, + "grad_norm": 0.211283908984482, + "learning_rate": 0.00013497333465750677, + "loss": 0.9467, + "step": 10927 + }, + { + "epoch": 2.3310580204778155, + "grad_norm": 0.24958959399953537, + "learning_rate": 0.0001349671520036796, + "loss": 0.9638, + "step": 10928 + }, + { + "epoch": 2.3312713310580206, + "grad_norm": 0.2699719789866978, + "learning_rate": 0.00013496096872789821, + "loss": 0.9585, + "step": 10929 + }, + { + "epoch": 2.3314846416382253, + "grad_norm": 0.2781137750873096, + "learning_rate": 0.0001349547848302325, + "loss": 0.938, + "step": 10930 + }, + { + "epoch": 2.33169795221843, + "grad_norm": 0.26199312613814957, + "learning_rate": 0.0001349486003107525, + "loss": 0.9678, + "step": 10931 + }, + { + "epoch": 2.3319112627986347, + "grad_norm": 0.25931143476407376, + "learning_rate": 0.00013494241516952816, + "loss": 0.9054, + "step": 10932 + }, + { + "epoch": 2.33212457337884, + "grad_norm": 0.26300652273103775, + "learning_rate": 0.00013493622940662946, + "loss": 0.9277, + "step": 10933 + }, + { + "epoch": 2.3323378839590445, + "grad_norm": 0.25720798475937395, + "learning_rate": 0.0001349300430221264, + "loss": 0.9315, + "step": 10934 + }, + { + "epoch": 2.332551194539249, + "grad_norm": 0.33241956202749773, + "learning_rate": 0.00013492385601608897, + "loss": 0.9314, + "step": 10935 + }, + { + "epoch": 2.332764505119454, + "grad_norm": 0.31908635617275877, + "learning_rate": 0.0001349176683885872, + "loss": 0.9246, + "step": 10936 + }, + { + "epoch": 2.3329778156996586, + "grad_norm": 0.2897273730825015, + "learning_rate": 0.0001349114801396911, + "loss": 0.9611, + "step": 10937 + }, + { + "epoch": 2.3331911262798632, + "grad_norm": 0.38066737684619967, + "learning_rate": 0.00013490529126947064, + "loss": 0.9775, + "step": 10938 + }, + { + "epoch": 2.3334044368600684, + "grad_norm": 0.3190534796028598, + "learning_rate": 0.00013489910177799593, + "loss": 0.9032, + "step": 10939 + }, + { + "epoch": 2.333617747440273, + "grad_norm": 0.24000530947189055, + "learning_rate": 0.00013489291166533693, + "loss": 0.9415, + "step": 10940 + }, + { + "epoch": 2.3338310580204777, + "grad_norm": 0.2620697988844929, + "learning_rate": 0.0001348867209315637, + "loss": 0.9409, + "step": 10941 + }, + { + "epoch": 2.3340443686006824, + "grad_norm": 0.279976566964298, + "learning_rate": 0.00013488052957674634, + "loss": 0.9444, + "step": 10942 + }, + { + "epoch": 2.3342576791808876, + "grad_norm": 0.41131973122035426, + "learning_rate": 0.00013487433760095484, + "loss": 0.9224, + "step": 10943 + }, + { + "epoch": 2.3344709897610922, + "grad_norm": 0.29631777477702864, + "learning_rate": 0.0001348681450042593, + "loss": 0.9394, + "step": 10944 + }, + { + "epoch": 2.334684300341297, + "grad_norm": 0.2784388200688336, + "learning_rate": 0.00013486195178672976, + "loss": 0.9669, + "step": 10945 + }, + { + "epoch": 2.3348976109215016, + "grad_norm": 0.2682354135949821, + "learning_rate": 0.00013485575794843632, + "loss": 0.9664, + "step": 10946 + }, + { + "epoch": 2.3351109215017063, + "grad_norm": 0.2098845826048199, + "learning_rate": 0.00013484956348944904, + "loss": 0.9342, + "step": 10947 + }, + { + "epoch": 2.3353242320819114, + "grad_norm": 0.2888280775782921, + "learning_rate": 0.00013484336840983806, + "loss": 0.9265, + "step": 10948 + }, + { + "epoch": 2.335537542662116, + "grad_norm": 0.3702754202051372, + "learning_rate": 0.00013483717270967342, + "loss": 0.9261, + "step": 10949 + }, + { + "epoch": 2.335750853242321, + "grad_norm": 0.42407709286094286, + "learning_rate": 0.00013483097638902525, + "loss": 0.9135, + "step": 10950 + }, + { + "epoch": 2.3359641638225255, + "grad_norm": 0.43508016621521006, + "learning_rate": 0.00013482477944796366, + "loss": 0.9075, + "step": 10951 + }, + { + "epoch": 2.3361774744027306, + "grad_norm": 0.4034905231299928, + "learning_rate": 0.00013481858188655878, + "loss": 0.9884, + "step": 10952 + }, + { + "epoch": 2.3363907849829353, + "grad_norm": 0.39332208374186056, + "learning_rate": 0.00013481238370488072, + "loss": 0.9691, + "step": 10953 + }, + { + "epoch": 2.33660409556314, + "grad_norm": 0.47516570822359444, + "learning_rate": 0.0001348061849029996, + "loss": 0.9636, + "step": 10954 + }, + { + "epoch": 2.3368174061433447, + "grad_norm": 0.4716798705364711, + "learning_rate": 0.00013479998548098562, + "loss": 0.9085, + "step": 10955 + }, + { + "epoch": 2.3370307167235493, + "grad_norm": 0.4087624829830376, + "learning_rate": 0.00013479378543890884, + "loss": 0.9234, + "step": 10956 + }, + { + "epoch": 2.337244027303754, + "grad_norm": 0.37922391559561874, + "learning_rate": 0.00013478758477683948, + "loss": 0.9616, + "step": 10957 + }, + { + "epoch": 2.337457337883959, + "grad_norm": 0.2472988811434004, + "learning_rate": 0.00013478138349484766, + "loss": 0.9529, + "step": 10958 + }, + { + "epoch": 2.337670648464164, + "grad_norm": 0.24150735907392853, + "learning_rate": 0.00013477518159300356, + "loss": 0.968, + "step": 10959 + }, + { + "epoch": 2.3378839590443685, + "grad_norm": 0.24413604535328245, + "learning_rate": 0.00013476897907137737, + "loss": 0.9253, + "step": 10960 + }, + { + "epoch": 2.338097269624573, + "grad_norm": 0.2243150882010751, + "learning_rate": 0.00013476277593003926, + "loss": 0.9182, + "step": 10961 + }, + { + "epoch": 2.3383105802047783, + "grad_norm": 0.25768498343712026, + "learning_rate": 0.0001347565721690594, + "loss": 0.9323, + "step": 10962 + }, + { + "epoch": 2.338523890784983, + "grad_norm": 0.28167154418852297, + "learning_rate": 0.00013475036778850806, + "loss": 0.9129, + "step": 10963 + }, + { + "epoch": 2.3387372013651877, + "grad_norm": 0.3224687326974584, + "learning_rate": 0.00013474416278845533, + "loss": 0.934, + "step": 10964 + }, + { + "epoch": 2.3389505119453924, + "grad_norm": 0.3677186527896531, + "learning_rate": 0.0001347379571689715, + "loss": 0.9367, + "step": 10965 + }, + { + "epoch": 2.339163822525597, + "grad_norm": 0.3656972957443775, + "learning_rate": 0.00013473175093012675, + "loss": 0.9345, + "step": 10966 + }, + { + "epoch": 2.339377133105802, + "grad_norm": 0.400162016681015, + "learning_rate": 0.00013472554407199135, + "loss": 0.9347, + "step": 10967 + }, + { + "epoch": 2.339590443686007, + "grad_norm": 0.39247717703376045, + "learning_rate": 0.00013471933659463546, + "loss": 0.9314, + "step": 10968 + }, + { + "epoch": 2.3398037542662116, + "grad_norm": 0.3322067916857394, + "learning_rate": 0.00013471312849812936, + "loss": 0.946, + "step": 10969 + }, + { + "epoch": 2.3400170648464163, + "grad_norm": 1.699068471232742, + "learning_rate": 0.00013470691978254332, + "loss": 0.9811, + "step": 10970 + }, + { + "epoch": 2.3402303754266214, + "grad_norm": 0.341784433149376, + "learning_rate": 0.00013470071044794754, + "loss": 0.9696, + "step": 10971 + }, + { + "epoch": 2.340443686006826, + "grad_norm": 0.7621645141946876, + "learning_rate": 0.00013469450049441233, + "loss": 0.9435, + "step": 10972 + }, + { + "epoch": 2.3406569965870307, + "grad_norm": 0.7874643609948485, + "learning_rate": 0.0001346882899220079, + "loss": 0.9632, + "step": 10973 + }, + { + "epoch": 2.3408703071672354, + "grad_norm": 0.8853421163190195, + "learning_rate": 0.00013468207873080455, + "loss": 0.9302, + "step": 10974 + }, + { + "epoch": 2.34108361774744, + "grad_norm": 0.8563440129582892, + "learning_rate": 0.00013467586692087258, + "loss": 0.8983, + "step": 10975 + }, + { + "epoch": 2.3412969283276452, + "grad_norm": 0.6908220343669645, + "learning_rate": 0.00013466965449228225, + "loss": 0.9364, + "step": 10976 + }, + { + "epoch": 2.34151023890785, + "grad_norm": 0.4122842366811158, + "learning_rate": 0.00013466344144510384, + "loss": 0.9328, + "step": 10977 + }, + { + "epoch": 2.3417235494880546, + "grad_norm": 0.29477795093650083, + "learning_rate": 0.00013465722777940766, + "loss": 0.9761, + "step": 10978 + }, + { + "epoch": 2.3419368600682593, + "grad_norm": 0.4232864213553069, + "learning_rate": 0.00013465101349526405, + "loss": 0.9248, + "step": 10979 + }, + { + "epoch": 2.342150170648464, + "grad_norm": 0.45774711432259085, + "learning_rate": 0.0001346447985927433, + "loss": 0.9298, + "step": 10980 + }, + { + "epoch": 2.342363481228669, + "grad_norm": 0.3506772765626503, + "learning_rate": 0.00013463858307191572, + "loss": 0.9298, + "step": 10981 + }, + { + "epoch": 2.342576791808874, + "grad_norm": 0.32052957605423604, + "learning_rate": 0.0001346323669328517, + "loss": 0.953, + "step": 10982 + }, + { + "epoch": 2.3427901023890785, + "grad_norm": 0.22837352201177089, + "learning_rate": 0.0001346261501756215, + "loss": 0.9653, + "step": 10983 + }, + { + "epoch": 2.343003412969283, + "grad_norm": 0.2984929976967407, + "learning_rate": 0.0001346199328002955, + "loss": 0.98, + "step": 10984 + }, + { + "epoch": 2.343216723549488, + "grad_norm": 0.4324794249816942, + "learning_rate": 0.000134613714806944, + "loss": 0.9522, + "step": 10985 + }, + { + "epoch": 2.343430034129693, + "grad_norm": 0.4806763830930128, + "learning_rate": 0.00013460749619563744, + "loss": 0.9277, + "step": 10986 + }, + { + "epoch": 2.3436433447098977, + "grad_norm": 0.4843669510429825, + "learning_rate": 0.00013460127696644612, + "loss": 0.9384, + "step": 10987 + }, + { + "epoch": 2.3438566552901023, + "grad_norm": 0.3176815622354466, + "learning_rate": 0.0001345950571194405, + "loss": 0.9363, + "step": 10988 + }, + { + "epoch": 2.344069965870307, + "grad_norm": 0.29123902044377464, + "learning_rate": 0.00013458883665469083, + "loss": 0.9422, + "step": 10989 + }, + { + "epoch": 2.344283276450512, + "grad_norm": 0.3678067580124689, + "learning_rate": 0.00013458261557226754, + "loss": 0.9268, + "step": 10990 + }, + { + "epoch": 2.344496587030717, + "grad_norm": 0.41019407147805886, + "learning_rate": 0.00013457639387224108, + "loss": 0.9302, + "step": 10991 + }, + { + "epoch": 2.3447098976109215, + "grad_norm": 0.8150491171464074, + "learning_rate": 0.00013457017155468176, + "loss": 0.9613, + "step": 10992 + }, + { + "epoch": 2.344923208191126, + "grad_norm": 0.4728651152128792, + "learning_rate": 0.00013456394861966004, + "loss": 0.9553, + "step": 10993 + }, + { + "epoch": 2.345136518771331, + "grad_norm": 0.38431181991563435, + "learning_rate": 0.00013455772506724636, + "loss": 0.9592, + "step": 10994 + }, + { + "epoch": 2.345349829351536, + "grad_norm": 0.3142818571251095, + "learning_rate": 0.00013455150089751108, + "loss": 0.9231, + "step": 10995 + }, + { + "epoch": 2.3455631399317407, + "grad_norm": 0.41299367450117425, + "learning_rate": 0.00013454527611052463, + "loss": 0.9403, + "step": 10996 + }, + { + "epoch": 2.3457764505119454, + "grad_norm": 0.45092442702238095, + "learning_rate": 0.0001345390507063575, + "loss": 0.9515, + "step": 10997 + }, + { + "epoch": 2.34598976109215, + "grad_norm": 0.300534580429928, + "learning_rate": 0.00013453282468508008, + "loss": 0.9558, + "step": 10998 + }, + { + "epoch": 2.3462030716723548, + "grad_norm": 0.2540009854216513, + "learning_rate": 0.00013452659804676285, + "loss": 0.9494, + "step": 10999 + }, + { + "epoch": 2.34641638225256, + "grad_norm": 0.3993666875373059, + "learning_rate": 0.0001345203707914762, + "loss": 0.9721, + "step": 11000 + }, + { + "epoch": 2.3466296928327646, + "grad_norm": 0.3986590581251006, + "learning_rate": 0.00013451414291929068, + "loss": 0.9356, + "step": 11001 + }, + { + "epoch": 2.3468430034129693, + "grad_norm": 0.30127190024996614, + "learning_rate": 0.0001345079144302767, + "loss": 0.9519, + "step": 11002 + }, + { + "epoch": 2.347056313993174, + "grad_norm": 0.30467305917934573, + "learning_rate": 0.00013450168532450475, + "loss": 0.9561, + "step": 11003 + }, + { + "epoch": 2.3472696245733786, + "grad_norm": 0.3879187595434649, + "learning_rate": 0.0001344954556020453, + "loss": 0.934, + "step": 11004 + }, + { + "epoch": 2.3474829351535837, + "grad_norm": 0.3035708982682916, + "learning_rate": 0.00013448922526296887, + "loss": 0.9569, + "step": 11005 + }, + { + "epoch": 2.3476962457337884, + "grad_norm": 0.2865057951292694, + "learning_rate": 0.00013448299430734594, + "loss": 0.9467, + "step": 11006 + }, + { + "epoch": 2.347909556313993, + "grad_norm": 0.39636018663425066, + "learning_rate": 0.000134476762735247, + "loss": 0.9286, + "step": 11007 + }, + { + "epoch": 2.348122866894198, + "grad_norm": 0.337324256524826, + "learning_rate": 0.00013447053054674258, + "loss": 0.9287, + "step": 11008 + }, + { + "epoch": 2.348336177474403, + "grad_norm": 0.3489727410884749, + "learning_rate": 0.0001344642977419032, + "loss": 0.9096, + "step": 11009 + }, + { + "epoch": 2.3485494880546076, + "grad_norm": 0.39003562538208175, + "learning_rate": 0.00013445806432079937, + "loss": 0.9416, + "step": 11010 + }, + { + "epoch": 2.3487627986348123, + "grad_norm": 0.2939862034516257, + "learning_rate": 0.00013445183028350162, + "loss": 0.9583, + "step": 11011 + }, + { + "epoch": 2.348976109215017, + "grad_norm": 0.28119219962921543, + "learning_rate": 0.0001344455956300805, + "loss": 0.932, + "step": 11012 + }, + { + "epoch": 2.3491894197952217, + "grad_norm": 0.37746432503684874, + "learning_rate": 0.00013443936036060657, + "loss": 0.953, + "step": 11013 + }, + { + "epoch": 2.349402730375427, + "grad_norm": 0.3956650939553001, + "learning_rate": 0.00013443312447515034, + "loss": 0.9569, + "step": 11014 + }, + { + "epoch": 2.3496160409556315, + "grad_norm": 0.3700287248928023, + "learning_rate": 0.00013442688797378238, + "loss": 0.9538, + "step": 11015 + }, + { + "epoch": 2.349829351535836, + "grad_norm": 0.3762928058140788, + "learning_rate": 0.00013442065085657328, + "loss": 0.9371, + "step": 11016 + }, + { + "epoch": 2.350042662116041, + "grad_norm": 0.3169565042751116, + "learning_rate": 0.0001344144131235936, + "loss": 0.9452, + "step": 11017 + }, + { + "epoch": 2.350255972696246, + "grad_norm": 0.23569015115694888, + "learning_rate": 0.00013440817477491392, + "loss": 0.9346, + "step": 11018 + }, + { + "epoch": 2.3504692832764507, + "grad_norm": 0.2950409482832615, + "learning_rate": 0.00013440193581060482, + "loss": 0.9306, + "step": 11019 + }, + { + "epoch": 2.3506825938566553, + "grad_norm": 0.4214373481659332, + "learning_rate": 0.00013439569623073693, + "loss": 0.932, + "step": 11020 + }, + { + "epoch": 2.35089590443686, + "grad_norm": 0.4739215270121279, + "learning_rate": 0.0001343894560353808, + "loss": 0.9476, + "step": 11021 + }, + { + "epoch": 2.3511092150170647, + "grad_norm": 0.48013190968465663, + "learning_rate": 0.00013438321522460706, + "loss": 0.9421, + "step": 11022 + }, + { + "epoch": 2.3513225255972694, + "grad_norm": 0.4816334138431169, + "learning_rate": 0.0001343769737984863, + "loss": 0.9712, + "step": 11023 + }, + { + "epoch": 2.3515358361774745, + "grad_norm": 0.5004531492466179, + "learning_rate": 0.0001343707317570892, + "loss": 0.9589, + "step": 11024 + }, + { + "epoch": 2.351749146757679, + "grad_norm": 0.44911642344252867, + "learning_rate": 0.00013436448910048637, + "loss": 0.9388, + "step": 11025 + }, + { + "epoch": 2.351962457337884, + "grad_norm": 0.2768845205531358, + "learning_rate": 0.00013435824582874837, + "loss": 0.9352, + "step": 11026 + }, + { + "epoch": 2.3521757679180886, + "grad_norm": 0.3303959518272496, + "learning_rate": 0.00013435200194194597, + "loss": 0.9394, + "step": 11027 + }, + { + "epoch": 2.3523890784982937, + "grad_norm": 0.30368091606650693, + "learning_rate": 0.00013434575744014973, + "loss": 0.9391, + "step": 11028 + }, + { + "epoch": 2.3526023890784984, + "grad_norm": 0.32701023958047454, + "learning_rate": 0.00013433951232343032, + "loss": 0.9473, + "step": 11029 + }, + { + "epoch": 2.352815699658703, + "grad_norm": 0.33106212719608985, + "learning_rate": 0.00013433326659185841, + "loss": 0.9539, + "step": 11030 + }, + { + "epoch": 2.3530290102389078, + "grad_norm": 0.3914939198472626, + "learning_rate": 0.0001343270202455047, + "loss": 0.9522, + "step": 11031 + }, + { + "epoch": 2.3532423208191124, + "grad_norm": 0.4059374182332931, + "learning_rate": 0.00013432077328443985, + "loss": 0.9352, + "step": 11032 + }, + { + "epoch": 2.3534556313993176, + "grad_norm": 0.3070727962159388, + "learning_rate": 0.0001343145257087345, + "loss": 0.9057, + "step": 11033 + }, + { + "epoch": 2.3536689419795223, + "grad_norm": 0.2517149323465817, + "learning_rate": 0.00013430827751845941, + "loss": 0.9482, + "step": 11034 + }, + { + "epoch": 2.353882252559727, + "grad_norm": 0.354630679759099, + "learning_rate": 0.0001343020287136852, + "loss": 0.9408, + "step": 11035 + }, + { + "epoch": 2.3540955631399316, + "grad_norm": 0.39391862029477503, + "learning_rate": 0.00013429577929448266, + "loss": 0.9337, + "step": 11036 + }, + { + "epoch": 2.3543088737201368, + "grad_norm": 0.4788871646487071, + "learning_rate": 0.00013428952926092244, + "loss": 0.9595, + "step": 11037 + }, + { + "epoch": 2.3545221843003414, + "grad_norm": 0.5547274148798841, + "learning_rate": 0.0001342832786130753, + "loss": 0.919, + "step": 11038 + }, + { + "epoch": 2.354735494880546, + "grad_norm": 0.5492865247497537, + "learning_rate": 0.00013427702735101197, + "loss": 0.9413, + "step": 11039 + }, + { + "epoch": 2.354948805460751, + "grad_norm": 0.41190269711198924, + "learning_rate": 0.00013427077547480313, + "loss": 0.9356, + "step": 11040 + }, + { + "epoch": 2.3551621160409555, + "grad_norm": 0.3240112024377987, + "learning_rate": 0.00013426452298451958, + "loss": 0.9146, + "step": 11041 + }, + { + "epoch": 2.35537542662116, + "grad_norm": 0.3427702488846476, + "learning_rate": 0.00013425826988023201, + "loss": 0.9645, + "step": 11042 + }, + { + "epoch": 2.3555887372013653, + "grad_norm": 0.36399365104437326, + "learning_rate": 0.0001342520161620112, + "loss": 0.9242, + "step": 11043 + }, + { + "epoch": 2.35580204778157, + "grad_norm": 0.44549544818573766, + "learning_rate": 0.00013424576182992796, + "loss": 0.9829, + "step": 11044 + }, + { + "epoch": 2.3560153583617747, + "grad_norm": 0.4901783875379721, + "learning_rate": 0.00013423950688405296, + "loss": 0.9119, + "step": 11045 + }, + { + "epoch": 2.3562286689419794, + "grad_norm": 0.41851624600742066, + "learning_rate": 0.00013423325132445706, + "loss": 0.9179, + "step": 11046 + }, + { + "epoch": 2.3564419795221845, + "grad_norm": 0.24089096819478029, + "learning_rate": 0.00013422699515121101, + "loss": 0.9629, + "step": 11047 + }, + { + "epoch": 2.356655290102389, + "grad_norm": 0.27476316774132703, + "learning_rate": 0.00013422073836438558, + "loss": 0.9332, + "step": 11048 + }, + { + "epoch": 2.356868600682594, + "grad_norm": 0.34501164790637634, + "learning_rate": 0.00013421448096405158, + "loss": 0.9296, + "step": 11049 + }, + { + "epoch": 2.3570819112627985, + "grad_norm": 0.39869196599901835, + "learning_rate": 0.00013420822295027982, + "loss": 0.9543, + "step": 11050 + }, + { + "epoch": 2.357295221843003, + "grad_norm": 0.3783941822809167, + "learning_rate": 0.00013420196432314114, + "loss": 0.9319, + "step": 11051 + }, + { + "epoch": 2.3575085324232083, + "grad_norm": 0.2887947111736907, + "learning_rate": 0.0001341957050827063, + "loss": 0.9411, + "step": 11052 + }, + { + "epoch": 2.357721843003413, + "grad_norm": 0.21487878733841567, + "learning_rate": 0.00013418944522904612, + "loss": 0.9245, + "step": 11053 + }, + { + "epoch": 2.3579351535836177, + "grad_norm": 0.25725233654461704, + "learning_rate": 0.00013418318476223146, + "loss": 0.941, + "step": 11054 + }, + { + "epoch": 2.3581484641638224, + "grad_norm": 0.39216186678801324, + "learning_rate": 0.0001341769236823332, + "loss": 0.9872, + "step": 11055 + }, + { + "epoch": 2.3583617747440275, + "grad_norm": 0.4854426502984515, + "learning_rate": 0.0001341706619894221, + "loss": 0.9188, + "step": 11056 + }, + { + "epoch": 2.358575085324232, + "grad_norm": 0.5119067797460269, + "learning_rate": 0.00013416439968356905, + "loss": 0.9518, + "step": 11057 + }, + { + "epoch": 2.358788395904437, + "grad_norm": 0.43938606373070266, + "learning_rate": 0.00013415813676484492, + "loss": 0.9246, + "step": 11058 + }, + { + "epoch": 2.3590017064846416, + "grad_norm": 0.32882848215156657, + "learning_rate": 0.00013415187323332053, + "loss": 0.9234, + "step": 11059 + }, + { + "epoch": 2.3592150170648463, + "grad_norm": 0.22229430762658864, + "learning_rate": 0.00013414560908906682, + "loss": 0.9782, + "step": 11060 + }, + { + "epoch": 2.359428327645051, + "grad_norm": 0.23933050580089088, + "learning_rate": 0.0001341393443321546, + "loss": 0.946, + "step": 11061 + }, + { + "epoch": 2.359641638225256, + "grad_norm": 0.349850738686144, + "learning_rate": 0.0001341330789626548, + "loss": 0.923, + "step": 11062 + }, + { + "epoch": 2.3598549488054608, + "grad_norm": 0.4419355632922277, + "learning_rate": 0.00013412681298063833, + "loss": 0.9664, + "step": 11063 + }, + { + "epoch": 2.3600682593856654, + "grad_norm": 0.4323166592160888, + "learning_rate": 0.00013412054638617605, + "loss": 0.9458, + "step": 11064 + }, + { + "epoch": 2.36028156996587, + "grad_norm": 0.3293525043336529, + "learning_rate": 0.00013411427917933886, + "loss": 0.9322, + "step": 11065 + }, + { + "epoch": 2.3604948805460753, + "grad_norm": 0.23666192865617403, + "learning_rate": 0.0001341080113601977, + "loss": 0.927, + "step": 11066 + }, + { + "epoch": 2.36070819112628, + "grad_norm": 0.29486910785426995, + "learning_rate": 0.00013410174292882346, + "loss": 0.9313, + "step": 11067 + }, + { + "epoch": 2.3609215017064846, + "grad_norm": 0.4748360614456371, + "learning_rate": 0.0001340954738852871, + "loss": 0.9385, + "step": 11068 + }, + { + "epoch": 2.3611348122866893, + "grad_norm": 0.5842048420645288, + "learning_rate": 0.00013408920422965957, + "loss": 0.9162, + "step": 11069 + }, + { + "epoch": 2.361348122866894, + "grad_norm": 0.5384999910486129, + "learning_rate": 0.00013408293396201175, + "loss": 0.9324, + "step": 11070 + }, + { + "epoch": 2.361561433447099, + "grad_norm": 0.37804870050992206, + "learning_rate": 0.00013407666308241464, + "loss": 0.9326, + "step": 11071 + }, + { + "epoch": 2.361774744027304, + "grad_norm": 0.2147794044859339, + "learning_rate": 0.0001340703915909392, + "loss": 0.9251, + "step": 11072 + }, + { + "epoch": 2.3619880546075085, + "grad_norm": 0.34043997996063696, + "learning_rate": 0.00013406411948765632, + "loss": 0.9377, + "step": 11073 + }, + { + "epoch": 2.362201365187713, + "grad_norm": 0.4299557160718578, + "learning_rate": 0.00013405784677263704, + "loss": 0.938, + "step": 11074 + }, + { + "epoch": 2.3624146757679183, + "grad_norm": 0.42151248218444437, + "learning_rate": 0.00013405157344595234, + "loss": 0.8918, + "step": 11075 + }, + { + "epoch": 2.362627986348123, + "grad_norm": 0.3292989142136631, + "learning_rate": 0.00013404529950767312, + "loss": 0.9161, + "step": 11076 + }, + { + "epoch": 2.3628412969283277, + "grad_norm": 0.2799344285552388, + "learning_rate": 0.00013403902495787048, + "loss": 0.9322, + "step": 11077 + }, + { + "epoch": 2.3630546075085324, + "grad_norm": 0.2778134414817763, + "learning_rate": 0.00013403274979661536, + "loss": 0.9427, + "step": 11078 + }, + { + "epoch": 2.363267918088737, + "grad_norm": 0.3223999904327946, + "learning_rate": 0.00013402647402397874, + "loss": 0.9436, + "step": 11079 + }, + { + "epoch": 2.363481228668942, + "grad_norm": 0.4117219977093282, + "learning_rate": 0.00013402019764003167, + "loss": 0.9086, + "step": 11080 + }, + { + "epoch": 2.363694539249147, + "grad_norm": 0.417181234579862, + "learning_rate": 0.00013401392064484516, + "loss": 0.9514, + "step": 11081 + }, + { + "epoch": 2.3639078498293515, + "grad_norm": 0.37923963860598, + "learning_rate": 0.0001340076430384902, + "loss": 0.9475, + "step": 11082 + }, + { + "epoch": 2.364121160409556, + "grad_norm": 0.2771770317405016, + "learning_rate": 0.0001340013648210379, + "loss": 0.9539, + "step": 11083 + }, + { + "epoch": 2.364334470989761, + "grad_norm": 0.22228075095667885, + "learning_rate": 0.00013399508599255922, + "loss": 0.9063, + "step": 11084 + }, + { + "epoch": 2.364547781569966, + "grad_norm": 0.345161298908388, + "learning_rate": 0.00013398880655312524, + "loss": 0.9624, + "step": 11085 + }, + { + "epoch": 2.3647610921501707, + "grad_norm": 0.42611424206067555, + "learning_rate": 0.000133982526502807, + "loss": 0.927, + "step": 11086 + }, + { + "epoch": 2.3649744027303754, + "grad_norm": 0.4085210228574319, + "learning_rate": 0.00013397624584167555, + "loss": 0.926, + "step": 11087 + }, + { + "epoch": 2.36518771331058, + "grad_norm": 0.33288711599363013, + "learning_rate": 0.000133969964569802, + "loss": 0.9266, + "step": 11088 + }, + { + "epoch": 2.3654010238907848, + "grad_norm": 0.28013003431567035, + "learning_rate": 0.00013396368268725738, + "loss": 0.924, + "step": 11089 + }, + { + "epoch": 2.36561433447099, + "grad_norm": 0.2486142531500591, + "learning_rate": 0.0001339574001941128, + "loss": 0.9226, + "step": 11090 + }, + { + "epoch": 2.3658276450511946, + "grad_norm": 0.2760046640959017, + "learning_rate": 0.0001339511170904393, + "loss": 0.9189, + "step": 11091 + }, + { + "epoch": 2.3660409556313993, + "grad_norm": 0.3038350962636494, + "learning_rate": 0.00013394483337630803, + "loss": 0.9468, + "step": 11092 + }, + { + "epoch": 2.366254266211604, + "grad_norm": 0.3424383286890011, + "learning_rate": 0.00013393854905179, + "loss": 0.9595, + "step": 11093 + }, + { + "epoch": 2.366467576791809, + "grad_norm": 0.40614965819318316, + "learning_rate": 0.0001339322641169565, + "loss": 0.9338, + "step": 11094 + }, + { + "epoch": 2.3666808873720138, + "grad_norm": 0.3587578994736453, + "learning_rate": 0.0001339259785718784, + "loss": 0.9514, + "step": 11095 + }, + { + "epoch": 2.3668941979522184, + "grad_norm": 0.31892850433354614, + "learning_rate": 0.00013391969241662702, + "loss": 0.9219, + "step": 11096 + }, + { + "epoch": 2.367107508532423, + "grad_norm": 0.2523479555325438, + "learning_rate": 0.00013391340565127339, + "loss": 0.923, + "step": 11097 + }, + { + "epoch": 2.367320819112628, + "grad_norm": 0.26052498177221034, + "learning_rate": 0.00013390711827588868, + "loss": 0.9185, + "step": 11098 + }, + { + "epoch": 2.367534129692833, + "grad_norm": 0.3122528206842609, + "learning_rate": 0.000133900830290544, + "loss": 0.9231, + "step": 11099 + }, + { + "epoch": 2.3677474402730376, + "grad_norm": 0.4173553191283232, + "learning_rate": 0.00013389454169531053, + "loss": 0.9723, + "step": 11100 + }, + { + "epoch": 2.3679607508532423, + "grad_norm": 0.44582794444929097, + "learning_rate": 0.00013388825249025942, + "loss": 0.9756, + "step": 11101 + }, + { + "epoch": 2.368174061433447, + "grad_norm": 0.3647286032531625, + "learning_rate": 0.00013388196267546184, + "loss": 0.9331, + "step": 11102 + }, + { + "epoch": 2.368387372013652, + "grad_norm": 0.29190680981301526, + "learning_rate": 0.00013387567225098889, + "loss": 0.8969, + "step": 11103 + }, + { + "epoch": 2.368600682593857, + "grad_norm": 0.19936882627019498, + "learning_rate": 0.00013386938121691185, + "loss": 0.9588, + "step": 11104 + }, + { + "epoch": 2.3688139931740615, + "grad_norm": 0.24297529103376217, + "learning_rate": 0.00013386308957330184, + "loss": 0.9623, + "step": 11105 + }, + { + "epoch": 2.369027303754266, + "grad_norm": 0.2813930247626221, + "learning_rate": 0.00013385679732023006, + "loss": 0.9366, + "step": 11106 + }, + { + "epoch": 2.369240614334471, + "grad_norm": 0.3061454299391796, + "learning_rate": 0.00013385050445776774, + "loss": 0.9287, + "step": 11107 + }, + { + "epoch": 2.3694539249146755, + "grad_norm": 0.33194586141633425, + "learning_rate": 0.00013384421098598602, + "loss": 0.938, + "step": 11108 + }, + { + "epoch": 2.3696672354948807, + "grad_norm": 0.26300948675242164, + "learning_rate": 0.0001338379169049562, + "loss": 0.9465, + "step": 11109 + }, + { + "epoch": 2.3698805460750854, + "grad_norm": 0.2033417714275696, + "learning_rate": 0.00013383162221474937, + "loss": 0.9513, + "step": 11110 + }, + { + "epoch": 2.37009385665529, + "grad_norm": 0.210314586615214, + "learning_rate": 0.00013382532691543687, + "loss": 0.9202, + "step": 11111 + }, + { + "epoch": 2.3703071672354947, + "grad_norm": 0.23224185514595058, + "learning_rate": 0.0001338190310070899, + "loss": 0.9404, + "step": 11112 + }, + { + "epoch": 2.3705204778157, + "grad_norm": 0.194016210888998, + "learning_rate": 0.00013381273448977968, + "loss": 0.9327, + "step": 11113 + }, + { + "epoch": 2.3707337883959045, + "grad_norm": 0.23348228163680915, + "learning_rate": 0.00013380643736357744, + "loss": 0.9303, + "step": 11114 + }, + { + "epoch": 2.3709470989761092, + "grad_norm": 0.23679600415736304, + "learning_rate": 0.0001338001396285545, + "loss": 0.932, + "step": 11115 + }, + { + "epoch": 2.371160409556314, + "grad_norm": 0.26733039184572066, + "learning_rate": 0.00013379384128478204, + "loss": 0.9556, + "step": 11116 + }, + { + "epoch": 2.3713737201365186, + "grad_norm": 0.2766848265998495, + "learning_rate": 0.0001337875423323314, + "loss": 0.9421, + "step": 11117 + }, + { + "epoch": 2.3715870307167237, + "grad_norm": 0.3094071148187078, + "learning_rate": 0.00013378124277127379, + "loss": 0.9436, + "step": 11118 + }, + { + "epoch": 2.3718003412969284, + "grad_norm": 0.2931003445732501, + "learning_rate": 0.00013377494260168052, + "loss": 0.954, + "step": 11119 + }, + { + "epoch": 2.372013651877133, + "grad_norm": 0.21417737050152427, + "learning_rate": 0.00013376864182362285, + "loss": 0.9187, + "step": 11120 + }, + { + "epoch": 2.3722269624573378, + "grad_norm": 0.2943183327266275, + "learning_rate": 0.00013376234043717208, + "loss": 0.9183, + "step": 11121 + }, + { + "epoch": 2.372440273037543, + "grad_norm": 0.32361192809124534, + "learning_rate": 0.00013375603844239956, + "loss": 0.9438, + "step": 11122 + }, + { + "epoch": 2.3726535836177476, + "grad_norm": 0.28232327105849525, + "learning_rate": 0.00013374973583937656, + "loss": 0.9547, + "step": 11123 + }, + { + "epoch": 2.3728668941979523, + "grad_norm": 0.30130565408655635, + "learning_rate": 0.00013374343262817438, + "loss": 0.9191, + "step": 11124 + }, + { + "epoch": 2.373080204778157, + "grad_norm": 0.2690950835284358, + "learning_rate": 0.00013373712880886436, + "loss": 0.9233, + "step": 11125 + }, + { + "epoch": 2.3732935153583616, + "grad_norm": 0.2633463316149383, + "learning_rate": 0.0001337308243815178, + "loss": 0.9494, + "step": 11126 + }, + { + "epoch": 2.3735068259385663, + "grad_norm": 0.2349831327830048, + "learning_rate": 0.00013372451934620612, + "loss": 0.9576, + "step": 11127 + }, + { + "epoch": 2.3737201365187715, + "grad_norm": 0.22578095259373981, + "learning_rate": 0.00013371821370300056, + "loss": 0.946, + "step": 11128 + }, + { + "epoch": 2.373933447098976, + "grad_norm": 0.2583663375017575, + "learning_rate": 0.0001337119074519725, + "loss": 0.9434, + "step": 11129 + }, + { + "epoch": 2.374146757679181, + "grad_norm": 0.38470783914794077, + "learning_rate": 0.0001337056005931933, + "loss": 0.9599, + "step": 11130 + }, + { + "epoch": 2.3743600682593855, + "grad_norm": 0.5026928249649947, + "learning_rate": 0.00013369929312673435, + "loss": 0.943, + "step": 11131 + }, + { + "epoch": 2.3745733788395906, + "grad_norm": 0.517686959918875, + "learning_rate": 0.000133692985052667, + "loss": 0.9752, + "step": 11132 + }, + { + "epoch": 2.3747866894197953, + "grad_norm": 0.4804012262877228, + "learning_rate": 0.0001336866763710626, + "loss": 0.9317, + "step": 11133 + }, + { + "epoch": 2.375, + "grad_norm": 0.37520581870315256, + "learning_rate": 0.00013368036708199255, + "loss": 0.9141, + "step": 11134 + }, + { + "epoch": 2.3752133105802047, + "grad_norm": 0.3148343397060824, + "learning_rate": 0.00013367405718552825, + "loss": 0.9442, + "step": 11135 + }, + { + "epoch": 2.3754266211604094, + "grad_norm": 0.2415363178939215, + "learning_rate": 0.00013366774668174112, + "loss": 0.9795, + "step": 11136 + }, + { + "epoch": 2.3756399317406145, + "grad_norm": 0.21772354960680892, + "learning_rate": 0.0001336614355707025, + "loss": 0.9519, + "step": 11137 + }, + { + "epoch": 2.375853242320819, + "grad_norm": 0.25645121197473314, + "learning_rate": 0.00013365512385248382, + "loss": 0.9389, + "step": 11138 + }, + { + "epoch": 2.376066552901024, + "grad_norm": 0.3062132302427617, + "learning_rate": 0.00013364881152715656, + "loss": 0.9385, + "step": 11139 + }, + { + "epoch": 2.3762798634812285, + "grad_norm": 0.35798069066917737, + "learning_rate": 0.00013364249859479206, + "loss": 0.9254, + "step": 11140 + }, + { + "epoch": 2.3764931740614337, + "grad_norm": 0.31350973026877105, + "learning_rate": 0.00013363618505546178, + "loss": 0.9629, + "step": 11141 + }, + { + "epoch": 2.3767064846416384, + "grad_norm": 0.28026035925250453, + "learning_rate": 0.0001336298709092372, + "loss": 0.9266, + "step": 11142 + }, + { + "epoch": 2.376919795221843, + "grad_norm": 0.27653719366886703, + "learning_rate": 0.00013362355615618968, + "loss": 0.9438, + "step": 11143 + }, + { + "epoch": 2.3771331058020477, + "grad_norm": 0.2511236139566313, + "learning_rate": 0.00013361724079639077, + "loss": 0.9373, + "step": 11144 + }, + { + "epoch": 2.3773464163822524, + "grad_norm": 0.2899585933154601, + "learning_rate": 0.00013361092482991183, + "loss": 0.915, + "step": 11145 + }, + { + "epoch": 2.377559726962457, + "grad_norm": 0.28504579060771285, + "learning_rate": 0.00013360460825682441, + "loss": 0.9535, + "step": 11146 + }, + { + "epoch": 2.3777730375426622, + "grad_norm": 0.2786971856389917, + "learning_rate": 0.00013359829107719993, + "loss": 0.9493, + "step": 11147 + }, + { + "epoch": 2.377986348122867, + "grad_norm": 0.23994617851876535, + "learning_rate": 0.00013359197329110989, + "loss": 0.9435, + "step": 11148 + }, + { + "epoch": 2.3781996587030716, + "grad_norm": 0.2231829906615487, + "learning_rate": 0.00013358565489862576, + "loss": 0.917, + "step": 11149 + }, + { + "epoch": 2.3784129692832763, + "grad_norm": 0.22541597738765196, + "learning_rate": 0.00013357933589981907, + "loss": 0.9122, + "step": 11150 + }, + { + "epoch": 2.3786262798634814, + "grad_norm": 0.25636309514279415, + "learning_rate": 0.00013357301629476128, + "loss": 0.9216, + "step": 11151 + }, + { + "epoch": 2.378839590443686, + "grad_norm": 0.24136067065767372, + "learning_rate": 0.0001335666960835239, + "loss": 0.9253, + "step": 11152 + }, + { + "epoch": 2.3790529010238908, + "grad_norm": 0.23634528702175162, + "learning_rate": 0.00013356037526617844, + "loss": 0.9411, + "step": 11153 + }, + { + "epoch": 2.3792662116040955, + "grad_norm": 0.26500521888988016, + "learning_rate": 0.00013355405384279646, + "loss": 0.9472, + "step": 11154 + }, + { + "epoch": 2.3794795221843, + "grad_norm": 0.25503658364061627, + "learning_rate": 0.00013354773181344947, + "loss": 0.9393, + "step": 11155 + }, + { + "epoch": 2.3796928327645053, + "grad_norm": 0.2265342136893113, + "learning_rate": 0.00013354140917820896, + "loss": 0.934, + "step": 11156 + }, + { + "epoch": 2.37990614334471, + "grad_norm": 0.3034032939217368, + "learning_rate": 0.00013353508593714653, + "loss": 0.9632, + "step": 11157 + }, + { + "epoch": 2.3801194539249146, + "grad_norm": 0.32345769642723926, + "learning_rate": 0.0001335287620903337, + "loss": 0.9018, + "step": 11158 + }, + { + "epoch": 2.3803327645051193, + "grad_norm": 0.3176110557417298, + "learning_rate": 0.00013352243763784202, + "loss": 0.9221, + "step": 11159 + }, + { + "epoch": 2.3805460750853245, + "grad_norm": 0.29494407366339037, + "learning_rate": 0.00013351611257974305, + "loss": 0.9433, + "step": 11160 + }, + { + "epoch": 2.380759385665529, + "grad_norm": 0.33510207955450927, + "learning_rate": 0.0001335097869161084, + "loss": 0.955, + "step": 11161 + }, + { + "epoch": 2.380972696245734, + "grad_norm": 0.41264410142497815, + "learning_rate": 0.00013350346064700956, + "loss": 0.9544, + "step": 11162 + }, + { + "epoch": 2.3811860068259385, + "grad_norm": 0.3478605097984293, + "learning_rate": 0.00013349713377251818, + "loss": 0.9075, + "step": 11163 + }, + { + "epoch": 2.381399317406143, + "grad_norm": 0.28045396468793754, + "learning_rate": 0.00013349080629270583, + "loss": 0.9533, + "step": 11164 + }, + { + "epoch": 2.381612627986348, + "grad_norm": 0.3479825137877352, + "learning_rate": 0.00013348447820764412, + "loss": 0.9221, + "step": 11165 + }, + { + "epoch": 2.381825938566553, + "grad_norm": 0.37287074227920386, + "learning_rate": 0.00013347814951740464, + "loss": 0.9272, + "step": 11166 + }, + { + "epoch": 2.3820392491467577, + "grad_norm": 0.26093753768332545, + "learning_rate": 0.000133471820222059, + "loss": 0.9241, + "step": 11167 + }, + { + "epoch": 2.3822525597269624, + "grad_norm": 0.27002101330537237, + "learning_rate": 0.0001334654903216788, + "loss": 0.9173, + "step": 11168 + }, + { + "epoch": 2.382465870307167, + "grad_norm": 0.32912973559502884, + "learning_rate": 0.00013345915981633567, + "loss": 0.9343, + "step": 11169 + }, + { + "epoch": 2.382679180887372, + "grad_norm": 0.2809522865971033, + "learning_rate": 0.0001334528287061013, + "loss": 0.9546, + "step": 11170 + }, + { + "epoch": 2.382892491467577, + "grad_norm": 0.2804897829050724, + "learning_rate": 0.00013344649699104723, + "loss": 0.9297, + "step": 11171 + }, + { + "epoch": 2.3831058020477816, + "grad_norm": 0.3985404693107356, + "learning_rate": 0.00013344016467124517, + "loss": 0.9471, + "step": 11172 + }, + { + "epoch": 2.3833191126279862, + "grad_norm": 0.40044926184907126, + "learning_rate": 0.00013343383174676672, + "loss": 0.9376, + "step": 11173 + }, + { + "epoch": 2.383532423208191, + "grad_norm": 0.3582901824355295, + "learning_rate": 0.00013342749821768356, + "loss": 0.9422, + "step": 11174 + }, + { + "epoch": 2.383745733788396, + "grad_norm": 0.4661888084311254, + "learning_rate": 0.0001334211640840674, + "loss": 0.9404, + "step": 11175 + }, + { + "epoch": 2.3839590443686007, + "grad_norm": 0.47809800824889553, + "learning_rate": 0.00013341482934598983, + "loss": 0.938, + "step": 11176 + }, + { + "epoch": 2.3841723549488054, + "grad_norm": 0.3472543535880335, + "learning_rate": 0.00013340849400352259, + "loss": 0.922, + "step": 11177 + }, + { + "epoch": 2.38438566552901, + "grad_norm": 0.30290657794231446, + "learning_rate": 0.00013340215805673734, + "loss": 0.924, + "step": 11178 + }, + { + "epoch": 2.3845989761092152, + "grad_norm": 0.33329160837341587, + "learning_rate": 0.00013339582150570577, + "loss": 0.9222, + "step": 11179 + }, + { + "epoch": 2.38481228668942, + "grad_norm": 0.30576205933406064, + "learning_rate": 0.00013338948435049957, + "loss": 0.9511, + "step": 11180 + }, + { + "epoch": 2.3850255972696246, + "grad_norm": 0.22691202729333218, + "learning_rate": 0.00013338314659119046, + "loss": 0.9143, + "step": 11181 + }, + { + "epoch": 2.3852389078498293, + "grad_norm": 0.2855899392403864, + "learning_rate": 0.00013337680822785017, + "loss": 0.9332, + "step": 11182 + }, + { + "epoch": 2.385452218430034, + "grad_norm": 0.32069309354910475, + "learning_rate": 0.0001333704692605504, + "loss": 0.9391, + "step": 11183 + }, + { + "epoch": 2.385665529010239, + "grad_norm": 0.2964302546546362, + "learning_rate": 0.00013336412968936284, + "loss": 0.9283, + "step": 11184 + }, + { + "epoch": 2.385878839590444, + "grad_norm": 0.24134540243242916, + "learning_rate": 0.0001333577895143593, + "loss": 0.9503, + "step": 11185 + }, + { + "epoch": 2.3860921501706485, + "grad_norm": 0.2510971361385392, + "learning_rate": 0.00013335144873561142, + "loss": 0.9144, + "step": 11186 + }, + { + "epoch": 2.386305460750853, + "grad_norm": 0.26878713307525676, + "learning_rate": 0.000133345107353191, + "loss": 0.9522, + "step": 11187 + }, + { + "epoch": 2.386518771331058, + "grad_norm": 0.23770838076197306, + "learning_rate": 0.00013333876536716983, + "loss": 0.9188, + "step": 11188 + }, + { + "epoch": 2.386732081911263, + "grad_norm": 0.2322602777589685, + "learning_rate": 0.00013333242277761964, + "loss": 0.9053, + "step": 11189 + }, + { + "epoch": 2.3869453924914676, + "grad_norm": 0.3005665365812286, + "learning_rate": 0.00013332607958461217, + "loss": 0.9458, + "step": 11190 + }, + { + "epoch": 2.3871587030716723, + "grad_norm": 0.24344021053060477, + "learning_rate": 0.00013331973578821921, + "loss": 0.917, + "step": 11191 + }, + { + "epoch": 2.387372013651877, + "grad_norm": 0.2225873319317204, + "learning_rate": 0.00013331339138851253, + "loss": 0.9293, + "step": 11192 + }, + { + "epoch": 2.3875853242320817, + "grad_norm": 0.2679281098293131, + "learning_rate": 0.00013330704638556394, + "loss": 0.9606, + "step": 11193 + }, + { + "epoch": 2.387798634812287, + "grad_norm": 0.30362300765501155, + "learning_rate": 0.00013330070077944523, + "loss": 0.944, + "step": 11194 + }, + { + "epoch": 2.3880119453924915, + "grad_norm": 0.26091483161576656, + "learning_rate": 0.00013329435457022821, + "loss": 0.9122, + "step": 11195 + }, + { + "epoch": 2.388225255972696, + "grad_norm": 0.24316132602328397, + "learning_rate": 0.00013328800775798465, + "loss": 0.9425, + "step": 11196 + }, + { + "epoch": 2.388438566552901, + "grad_norm": 0.2534193200795089, + "learning_rate": 0.00013328166034278638, + "loss": 0.9263, + "step": 11197 + }, + { + "epoch": 2.388651877133106, + "grad_norm": 0.21415940235056216, + "learning_rate": 0.00013327531232470524, + "loss": 0.9195, + "step": 11198 + }, + { + "epoch": 2.3888651877133107, + "grad_norm": 0.22854527404447866, + "learning_rate": 0.00013326896370381304, + "loss": 0.9717, + "step": 11199 + }, + { + "epoch": 2.3890784982935154, + "grad_norm": 0.2358463112354905, + "learning_rate": 0.00013326261448018162, + "loss": 0.9358, + "step": 11200 + }, + { + "epoch": 2.38929180887372, + "grad_norm": 0.23718150864665255, + "learning_rate": 0.00013325626465388285, + "loss": 0.946, + "step": 11201 + }, + { + "epoch": 2.3895051194539247, + "grad_norm": 0.24306321856327978, + "learning_rate": 0.00013324991422498852, + "loss": 0.9512, + "step": 11202 + }, + { + "epoch": 2.38971843003413, + "grad_norm": 0.25315846026983807, + "learning_rate": 0.0001332435631935705, + "loss": 0.9374, + "step": 11203 + }, + { + "epoch": 2.3899317406143346, + "grad_norm": 0.27964528621758106, + "learning_rate": 0.0001332372115597007, + "loss": 0.9198, + "step": 11204 + }, + { + "epoch": 2.3901450511945392, + "grad_norm": 0.2965611921313549, + "learning_rate": 0.0001332308593234509, + "loss": 0.9393, + "step": 11205 + }, + { + "epoch": 2.390358361774744, + "grad_norm": 0.23254441514487115, + "learning_rate": 0.0001332245064848931, + "loss": 0.9376, + "step": 11206 + }, + { + "epoch": 2.390571672354949, + "grad_norm": 0.2313774155228611, + "learning_rate": 0.00013321815304409912, + "loss": 0.9132, + "step": 11207 + }, + { + "epoch": 2.3907849829351537, + "grad_norm": 0.2892338403284523, + "learning_rate": 0.0001332117990011408, + "loss": 0.9341, + "step": 11208 + }, + { + "epoch": 2.3909982935153584, + "grad_norm": 0.30370461193817866, + "learning_rate": 0.00013320544435609005, + "loss": 0.9353, + "step": 11209 + }, + { + "epoch": 2.391211604095563, + "grad_norm": 0.2857201691585772, + "learning_rate": 0.00013319908910901885, + "loss": 0.9247, + "step": 11210 + }, + { + "epoch": 2.391424914675768, + "grad_norm": 0.29640530108182644, + "learning_rate": 0.00013319273325999907, + "loss": 0.935, + "step": 11211 + }, + { + "epoch": 2.3916382252559725, + "grad_norm": 0.3074156389164987, + "learning_rate": 0.0001331863768091026, + "loss": 0.9535, + "step": 11212 + }, + { + "epoch": 2.3918515358361776, + "grad_norm": 0.312124563406852, + "learning_rate": 0.00013318001975640137, + "loss": 0.9622, + "step": 11213 + }, + { + "epoch": 2.3920648464163823, + "grad_norm": 0.32706491371615554, + "learning_rate": 0.00013317366210196735, + "loss": 0.9268, + "step": 11214 + }, + { + "epoch": 2.392278156996587, + "grad_norm": 0.3829322448118066, + "learning_rate": 0.00013316730384587243, + "loss": 0.9206, + "step": 11215 + }, + { + "epoch": 2.3924914675767917, + "grad_norm": 0.3771949368226909, + "learning_rate": 0.00013316094498818856, + "loss": 0.9112, + "step": 11216 + }, + { + "epoch": 2.392704778156997, + "grad_norm": 0.3179219779394122, + "learning_rate": 0.00013315458552898773, + "loss": 0.9616, + "step": 11217 + }, + { + "epoch": 2.3929180887372015, + "grad_norm": 0.21415287284638435, + "learning_rate": 0.00013314822546834187, + "loss": 0.9272, + "step": 11218 + }, + { + "epoch": 2.393131399317406, + "grad_norm": 0.2917625699613345, + "learning_rate": 0.0001331418648063229, + "loss": 0.9412, + "step": 11219 + }, + { + "epoch": 2.393344709897611, + "grad_norm": 0.47204322815345046, + "learning_rate": 0.00013313550354300289, + "loss": 0.9491, + "step": 11220 + }, + { + "epoch": 2.3935580204778155, + "grad_norm": 0.4768846349605133, + "learning_rate": 0.00013312914167845375, + "loss": 0.9445, + "step": 11221 + }, + { + "epoch": 2.3937713310580206, + "grad_norm": 0.3726482115975751, + "learning_rate": 0.00013312277921274747, + "loss": 0.9663, + "step": 11222 + }, + { + "epoch": 2.3939846416382253, + "grad_norm": 0.32342298067301684, + "learning_rate": 0.00013311641614595606, + "loss": 0.9164, + "step": 11223 + }, + { + "epoch": 2.39419795221843, + "grad_norm": 0.280122856940582, + "learning_rate": 0.0001331100524781515, + "loss": 0.9143, + "step": 11224 + }, + { + "epoch": 2.3944112627986347, + "grad_norm": 0.2994579224312558, + "learning_rate": 0.00013310368820940578, + "loss": 0.944, + "step": 11225 + }, + { + "epoch": 2.39462457337884, + "grad_norm": 0.419591101970876, + "learning_rate": 0.00013309732333979097, + "loss": 0.9078, + "step": 11226 + }, + { + "epoch": 2.3948378839590445, + "grad_norm": 0.584532681542959, + "learning_rate": 0.00013309095786937904, + "loss": 0.9605, + "step": 11227 + }, + { + "epoch": 2.395051194539249, + "grad_norm": 0.5559299806571695, + "learning_rate": 0.00013308459179824203, + "loss": 0.9624, + "step": 11228 + }, + { + "epoch": 2.395264505119454, + "grad_norm": 0.42723550703617674, + "learning_rate": 0.00013307822512645197, + "loss": 0.9414, + "step": 11229 + }, + { + "epoch": 2.3954778156996586, + "grad_norm": 0.2678576670202512, + "learning_rate": 0.00013307185785408092, + "loss": 0.9294, + "step": 11230 + }, + { + "epoch": 2.3956911262798632, + "grad_norm": 0.2742844691582079, + "learning_rate": 0.00013306548998120088, + "loss": 0.9413, + "step": 11231 + }, + { + "epoch": 2.3959044368600684, + "grad_norm": 0.3434676418465217, + "learning_rate": 0.00013305912150788393, + "loss": 0.9417, + "step": 11232 + }, + { + "epoch": 2.396117747440273, + "grad_norm": 0.2622916832093061, + "learning_rate": 0.00013305275243420214, + "loss": 0.9261, + "step": 11233 + }, + { + "epoch": 2.3963310580204777, + "grad_norm": 0.311658045194545, + "learning_rate": 0.00013304638276022754, + "loss": 0.9572, + "step": 11234 + }, + { + "epoch": 2.3965443686006824, + "grad_norm": 0.29214157122699397, + "learning_rate": 0.00013304001248603227, + "loss": 0.8996, + "step": 11235 + }, + { + "epoch": 2.3967576791808876, + "grad_norm": 0.23836517668164323, + "learning_rate": 0.0001330336416116883, + "loss": 0.9407, + "step": 11236 + }, + { + "epoch": 2.3969709897610922, + "grad_norm": 0.2504306564711289, + "learning_rate": 0.00013302727013726783, + "loss": 0.9158, + "step": 11237 + }, + { + "epoch": 2.397184300341297, + "grad_norm": 0.23216094994069922, + "learning_rate": 0.00013302089806284292, + "loss": 0.9518, + "step": 11238 + }, + { + "epoch": 2.3973976109215016, + "grad_norm": 0.2598931753454819, + "learning_rate": 0.00013301452538848563, + "loss": 0.9135, + "step": 11239 + }, + { + "epoch": 2.3976109215017063, + "grad_norm": 0.26297804288184917, + "learning_rate": 0.0001330081521142681, + "loss": 0.9039, + "step": 11240 + }, + { + "epoch": 2.3978242320819114, + "grad_norm": 0.22139112320727786, + "learning_rate": 0.00013300177824026243, + "loss": 0.9194, + "step": 11241 + }, + { + "epoch": 2.398037542662116, + "grad_norm": 0.24860609925750912, + "learning_rate": 0.00013299540376654077, + "loss": 0.949, + "step": 11242 + }, + { + "epoch": 2.398250853242321, + "grad_norm": 0.3333218934501789, + "learning_rate": 0.0001329890286931752, + "loss": 0.9214, + "step": 11243 + }, + { + "epoch": 2.3984641638225255, + "grad_norm": 0.43005320687686455, + "learning_rate": 0.00013298265302023787, + "loss": 0.933, + "step": 11244 + }, + { + "epoch": 2.3986774744027306, + "grad_norm": 0.4312031597794177, + "learning_rate": 0.00013297627674780095, + "loss": 0.9526, + "step": 11245 + }, + { + "epoch": 2.3988907849829353, + "grad_norm": 0.36933659867646706, + "learning_rate": 0.00013296989987593658, + "loss": 0.924, + "step": 11246 + }, + { + "epoch": 2.39910409556314, + "grad_norm": 0.3600931311907898, + "learning_rate": 0.0001329635224047169, + "loss": 0.9645, + "step": 11247 + }, + { + "epoch": 2.3993174061433447, + "grad_norm": 0.33909274190958394, + "learning_rate": 0.00013295714433421402, + "loss": 0.9499, + "step": 11248 + }, + { + "epoch": 2.3995307167235493, + "grad_norm": 0.2501995632172883, + "learning_rate": 0.00013295076566450023, + "loss": 0.9149, + "step": 11249 + }, + { + "epoch": 2.399744027303754, + "grad_norm": 0.2214362159257734, + "learning_rate": 0.0001329443863956476, + "loss": 0.9149, + "step": 11250 + }, + { + "epoch": 2.399957337883959, + "grad_norm": 0.25070813790794994, + "learning_rate": 0.00013293800652772834, + "loss": 0.9492, + "step": 11251 + }, + { + "epoch": 2.400170648464164, + "grad_norm": 0.2669734335014146, + "learning_rate": 0.00013293162606081469, + "loss": 0.9358, + "step": 11252 + }, + { + "epoch": 2.4003839590443685, + "grad_norm": 0.266957238130316, + "learning_rate": 0.00013292524499497875, + "loss": 0.9729, + "step": 11253 + }, + { + "epoch": 2.400597269624573, + "grad_norm": 0.3109232925846943, + "learning_rate": 0.0001329188633302928, + "loss": 0.9303, + "step": 11254 + }, + { + "epoch": 2.4008105802047783, + "grad_norm": 0.29769606625643097, + "learning_rate": 0.000132912481066829, + "loss": 0.9458, + "step": 11255 + }, + { + "epoch": 2.401023890784983, + "grad_norm": 0.3626189700730717, + "learning_rate": 0.00013290609820465962, + "loss": 0.9642, + "step": 11256 + }, + { + "epoch": 2.4012372013651877, + "grad_norm": 0.3749967844131269, + "learning_rate": 0.00013289971474385684, + "loss": 0.9083, + "step": 11257 + }, + { + "epoch": 2.4014505119453924, + "grad_norm": 0.27653346620778557, + "learning_rate": 0.0001328933306844929, + "loss": 0.976, + "step": 11258 + }, + { + "epoch": 2.401663822525597, + "grad_norm": 0.27608846607811444, + "learning_rate": 0.00013288694602664, + "loss": 0.923, + "step": 11259 + }, + { + "epoch": 2.401877133105802, + "grad_norm": 0.240157900332257, + "learning_rate": 0.00013288056077037045, + "loss": 0.9415, + "step": 11260 + }, + { + "epoch": 2.402090443686007, + "grad_norm": 0.2509789656084985, + "learning_rate": 0.00013287417491575644, + "loss": 0.924, + "step": 11261 + }, + { + "epoch": 2.4023037542662116, + "grad_norm": 0.3581094462976731, + "learning_rate": 0.00013286778846287027, + "loss": 0.9473, + "step": 11262 + }, + { + "epoch": 2.4025170648464163, + "grad_norm": 0.4802021700814653, + "learning_rate": 0.0001328614014117842, + "loss": 0.9098, + "step": 11263 + }, + { + "epoch": 2.4027303754266214, + "grad_norm": 0.44865937316349763, + "learning_rate": 0.00013285501376257046, + "loss": 0.9106, + "step": 11264 + }, + { + "epoch": 2.402943686006826, + "grad_norm": 0.3110059488684771, + "learning_rate": 0.00013284862551530134, + "loss": 0.9288, + "step": 11265 + }, + { + "epoch": 2.4031569965870307, + "grad_norm": 0.19320311457315367, + "learning_rate": 0.00013284223667004913, + "loss": 0.8927, + "step": 11266 + }, + { + "epoch": 2.4033703071672354, + "grad_norm": 0.26647023507124634, + "learning_rate": 0.00013283584722688618, + "loss": 0.961, + "step": 11267 + }, + { + "epoch": 2.40358361774744, + "grad_norm": 0.43567377547431424, + "learning_rate": 0.00013282945718588466, + "loss": 0.9661, + "step": 11268 + }, + { + "epoch": 2.4037969283276452, + "grad_norm": 0.5854080750362562, + "learning_rate": 0.00013282306654711698, + "loss": 0.9393, + "step": 11269 + }, + { + "epoch": 2.40401023890785, + "grad_norm": 0.613992259734463, + "learning_rate": 0.00013281667531065542, + "loss": 0.9638, + "step": 11270 + }, + { + "epoch": 2.4042235494880546, + "grad_norm": 0.5324626317911959, + "learning_rate": 0.00013281028347657226, + "loss": 0.9437, + "step": 11271 + }, + { + "epoch": 2.4044368600682593, + "grad_norm": 0.39877075614756663, + "learning_rate": 0.00013280389104493987, + "loss": 0.9678, + "step": 11272 + }, + { + "epoch": 2.404650170648464, + "grad_norm": 0.26104816544779835, + "learning_rate": 0.00013279749801583057, + "loss": 0.9333, + "step": 11273 + }, + { + "epoch": 2.404863481228669, + "grad_norm": 0.32078244492064495, + "learning_rate": 0.00013279110438931669, + "loss": 0.9457, + "step": 11274 + }, + { + "epoch": 2.405076791808874, + "grad_norm": 0.33993421762370235, + "learning_rate": 0.0001327847101654706, + "loss": 0.9347, + "step": 11275 + }, + { + "epoch": 2.4052901023890785, + "grad_norm": 0.3648500247549439, + "learning_rate": 0.00013277831534436456, + "loss": 0.9371, + "step": 11276 + }, + { + "epoch": 2.405503412969283, + "grad_norm": 0.3977790178057105, + "learning_rate": 0.00013277191992607106, + "loss": 0.9332, + "step": 11277 + }, + { + "epoch": 2.405716723549488, + "grad_norm": 0.35563897726688326, + "learning_rate": 0.00013276552391066238, + "loss": 0.9342, + "step": 11278 + }, + { + "epoch": 2.405930034129693, + "grad_norm": 0.31370914808302386, + "learning_rate": 0.00013275912729821088, + "loss": 0.9527, + "step": 11279 + }, + { + "epoch": 2.4061433447098977, + "grad_norm": 0.3103724876133648, + "learning_rate": 0.000132752730088789, + "loss": 0.9265, + "step": 11280 + }, + { + "epoch": 2.4063566552901023, + "grad_norm": 0.25221111085799797, + "learning_rate": 0.00013274633228246906, + "loss": 0.9468, + "step": 11281 + }, + { + "epoch": 2.406569965870307, + "grad_norm": 0.22953023822283547, + "learning_rate": 0.0001327399338793235, + "loss": 0.9263, + "step": 11282 + }, + { + "epoch": 2.406783276450512, + "grad_norm": 0.27502756965812236, + "learning_rate": 0.00013273353487942475, + "loss": 0.931, + "step": 11283 + }, + { + "epoch": 2.406996587030717, + "grad_norm": 0.2969294408987353, + "learning_rate": 0.0001327271352828451, + "loss": 0.9287, + "step": 11284 + }, + { + "epoch": 2.4072098976109215, + "grad_norm": 0.2734363739599515, + "learning_rate": 0.00013272073508965705, + "loss": 0.9417, + "step": 11285 + }, + { + "epoch": 2.407423208191126, + "grad_norm": 0.27957770022903095, + "learning_rate": 0.00013271433429993298, + "loss": 0.931, + "step": 11286 + }, + { + "epoch": 2.407636518771331, + "grad_norm": 0.3320482469414199, + "learning_rate": 0.00013270793291374536, + "loss": 0.9534, + "step": 11287 + }, + { + "epoch": 2.407849829351536, + "grad_norm": 0.30758556299301104, + "learning_rate": 0.00013270153093116656, + "loss": 0.9529, + "step": 11288 + }, + { + "epoch": 2.4080631399317407, + "grad_norm": 0.2505044581923475, + "learning_rate": 0.00013269512835226908, + "loss": 0.9278, + "step": 11289 + }, + { + "epoch": 2.4082764505119454, + "grad_norm": 0.3047022119701658, + "learning_rate": 0.00013268872517712533, + "loss": 0.94, + "step": 11290 + }, + { + "epoch": 2.40848976109215, + "grad_norm": 0.3247594499214052, + "learning_rate": 0.00013268232140580777, + "loss": 0.9276, + "step": 11291 + }, + { + "epoch": 2.4087030716723548, + "grad_norm": 0.33934204549470165, + "learning_rate": 0.00013267591703838884, + "loss": 0.934, + "step": 11292 + }, + { + "epoch": 2.40891638225256, + "grad_norm": 0.3165737999260793, + "learning_rate": 0.00013266951207494104, + "loss": 0.9165, + "step": 11293 + }, + { + "epoch": 2.4091296928327646, + "grad_norm": 0.31507442665483176, + "learning_rate": 0.00013266310651553683, + "loss": 0.9454, + "step": 11294 + }, + { + "epoch": 2.4093430034129693, + "grad_norm": 0.3105060978644584, + "learning_rate": 0.00013265670036024865, + "loss": 0.9336, + "step": 11295 + }, + { + "epoch": 2.409556313993174, + "grad_norm": 0.2503463551951722, + "learning_rate": 0.00013265029360914906, + "loss": 0.8975, + "step": 11296 + }, + { + "epoch": 2.4097696245733786, + "grad_norm": 0.2563617880724446, + "learning_rate": 0.0001326438862623105, + "loss": 0.9091, + "step": 11297 + }, + { + "epoch": 2.4099829351535837, + "grad_norm": 0.3723502681240693, + "learning_rate": 0.00013263747831980546, + "loss": 0.9297, + "step": 11298 + }, + { + "epoch": 2.4101962457337884, + "grad_norm": 0.43883435046435304, + "learning_rate": 0.00013263106978170648, + "loss": 0.9346, + "step": 11299 + }, + { + "epoch": 2.410409556313993, + "grad_norm": 0.4016842459247074, + "learning_rate": 0.00013262466064808606, + "loss": 0.9395, + "step": 11300 + }, + { + "epoch": 2.410622866894198, + "grad_norm": 0.3199730551223988, + "learning_rate": 0.00013261825091901675, + "loss": 0.9133, + "step": 11301 + }, + { + "epoch": 2.410836177474403, + "grad_norm": 0.2916133862194651, + "learning_rate": 0.000132611840594571, + "loss": 0.9457, + "step": 11302 + }, + { + "epoch": 2.4110494880546076, + "grad_norm": 0.2788920243271082, + "learning_rate": 0.0001326054296748214, + "loss": 0.9022, + "step": 11303 + }, + { + "epoch": 2.4112627986348123, + "grad_norm": 0.23320319835853448, + "learning_rate": 0.00013259901815984048, + "loss": 0.9304, + "step": 11304 + }, + { + "epoch": 2.411476109215017, + "grad_norm": 0.2789610081456397, + "learning_rate": 0.00013259260604970083, + "loss": 0.9364, + "step": 11305 + }, + { + "epoch": 2.4116894197952217, + "grad_norm": 0.32499306627831276, + "learning_rate": 0.0001325861933444749, + "loss": 0.9585, + "step": 11306 + }, + { + "epoch": 2.411902730375427, + "grad_norm": 0.3283109127074923, + "learning_rate": 0.00013257978004423533, + "loss": 0.9144, + "step": 11307 + }, + { + "epoch": 2.4121160409556315, + "grad_norm": 0.32995609273009885, + "learning_rate": 0.00013257336614905466, + "loss": 0.9099, + "step": 11308 + }, + { + "epoch": 2.412329351535836, + "grad_norm": 0.28307665027618756, + "learning_rate": 0.00013256695165900546, + "loss": 0.9954, + "step": 11309 + }, + { + "epoch": 2.412542662116041, + "grad_norm": 0.1982392424744579, + "learning_rate": 0.00013256053657416034, + "loss": 0.933, + "step": 11310 + }, + { + "epoch": 2.412755972696246, + "grad_norm": 0.24501907235650155, + "learning_rate": 0.00013255412089459181, + "loss": 0.9656, + "step": 11311 + }, + { + "epoch": 2.4129692832764507, + "grad_norm": 0.2600468965980667, + "learning_rate": 0.00013254770462037258, + "loss": 0.9336, + "step": 11312 + }, + { + "epoch": 2.4131825938566553, + "grad_norm": 0.3084561022613489, + "learning_rate": 0.0001325412877515752, + "loss": 0.9205, + "step": 11313 + }, + { + "epoch": 2.41339590443686, + "grad_norm": 0.33051553317634397, + "learning_rate": 0.0001325348702882722, + "loss": 0.9365, + "step": 11314 + }, + { + "epoch": 2.4136092150170647, + "grad_norm": 0.26429565812688033, + "learning_rate": 0.0001325284522305363, + "loss": 0.9769, + "step": 11315 + }, + { + "epoch": 2.4138225255972694, + "grad_norm": 0.20876655233511698, + "learning_rate": 0.00013252203357844007, + "loss": 0.9589, + "step": 11316 + }, + { + "epoch": 2.4140358361774745, + "grad_norm": 0.2525593112548504, + "learning_rate": 0.00013251561433205613, + "loss": 0.9004, + "step": 11317 + }, + { + "epoch": 2.414249146757679, + "grad_norm": 0.22283662441591376, + "learning_rate": 0.00013250919449145717, + "loss": 0.9131, + "step": 11318 + }, + { + "epoch": 2.414462457337884, + "grad_norm": 0.2740503469789605, + "learning_rate": 0.00013250277405671575, + "loss": 0.9472, + "step": 11319 + }, + { + "epoch": 2.4146757679180886, + "grad_norm": 0.3155430987121741, + "learning_rate": 0.0001324963530279046, + "loss": 0.9247, + "step": 11320 + }, + { + "epoch": 2.4148890784982937, + "grad_norm": 0.48048731805680867, + "learning_rate": 0.0001324899314050963, + "loss": 0.9514, + "step": 11321 + }, + { + "epoch": 2.4151023890784984, + "grad_norm": 0.34016047457797216, + "learning_rate": 0.00013248350918836356, + "loss": 0.9717, + "step": 11322 + }, + { + "epoch": 2.415315699658703, + "grad_norm": 0.2753290029728327, + "learning_rate": 0.00013247708637777902, + "loss": 0.9402, + "step": 11323 + }, + { + "epoch": 2.4155290102389078, + "grad_norm": 0.21211319131607867, + "learning_rate": 0.00013247066297341538, + "loss": 0.9172, + "step": 11324 + }, + { + "epoch": 2.4157423208191124, + "grad_norm": 0.22625728502492803, + "learning_rate": 0.0001324642389753453, + "loss": 0.9624, + "step": 11325 + }, + { + "epoch": 2.4159556313993176, + "grad_norm": 0.26896439663977995, + "learning_rate": 0.00013245781438364147, + "loss": 0.9336, + "step": 11326 + }, + { + "epoch": 2.4161689419795223, + "grad_norm": 0.23280087071515992, + "learning_rate": 0.00013245138919837658, + "loss": 0.9634, + "step": 11327 + }, + { + "epoch": 2.416382252559727, + "grad_norm": 0.32169998690933105, + "learning_rate": 0.00013244496341962333, + "loss": 0.9139, + "step": 11328 + }, + { + "epoch": 2.4165955631399316, + "grad_norm": 0.28063002971268125, + "learning_rate": 0.00013243853704745446, + "loss": 0.9399, + "step": 11329 + }, + { + "epoch": 2.4168088737201368, + "grad_norm": 0.2521801640306104, + "learning_rate": 0.00013243211008194267, + "loss": 0.9161, + "step": 11330 + }, + { + "epoch": 2.4170221843003414, + "grad_norm": 0.23639158114608305, + "learning_rate": 0.00013242568252316064, + "loss": 0.9303, + "step": 11331 + }, + { + "epoch": 2.417235494880546, + "grad_norm": 0.26706747340878206, + "learning_rate": 0.00013241925437118115, + "loss": 0.9328, + "step": 11332 + }, + { + "epoch": 2.417448805460751, + "grad_norm": 0.27557513467626776, + "learning_rate": 0.0001324128256260769, + "loss": 0.9244, + "step": 11333 + }, + { + "epoch": 2.4176621160409555, + "grad_norm": 0.3329861979920141, + "learning_rate": 0.00013240639628792067, + "loss": 0.9158, + "step": 11334 + }, + { + "epoch": 2.41787542662116, + "grad_norm": 0.3171885979813756, + "learning_rate": 0.00013239996635678518, + "loss": 0.9191, + "step": 11335 + }, + { + "epoch": 2.4180887372013653, + "grad_norm": 0.2508715457877031, + "learning_rate": 0.00013239353583274316, + "loss": 0.955, + "step": 11336 + }, + { + "epoch": 2.41830204778157, + "grad_norm": 0.2963644439310284, + "learning_rate": 0.00013238710471586744, + "loss": 0.9316, + "step": 11337 + }, + { + "epoch": 2.4185153583617747, + "grad_norm": 0.30398213924190914, + "learning_rate": 0.00013238067300623074, + "loss": 0.9321, + "step": 11338 + }, + { + "epoch": 2.4187286689419794, + "grad_norm": 0.24971192830190797, + "learning_rate": 0.00013237424070390583, + "loss": 0.9428, + "step": 11339 + }, + { + "epoch": 2.4189419795221845, + "grad_norm": 0.20327858274374422, + "learning_rate": 0.0001323678078089655, + "loss": 0.9679, + "step": 11340 + }, + { + "epoch": 2.419155290102389, + "grad_norm": 0.3063385677333044, + "learning_rate": 0.00013236137432148256, + "loss": 0.9408, + "step": 11341 + }, + { + "epoch": 2.419368600682594, + "grad_norm": 0.3898364403662856, + "learning_rate": 0.0001323549402415298, + "loss": 0.9558, + "step": 11342 + }, + { + "epoch": 2.4195819112627985, + "grad_norm": 0.4030328382341865, + "learning_rate": 0.00013234850556917999, + "loss": 0.9287, + "step": 11343 + }, + { + "epoch": 2.419795221843003, + "grad_norm": 0.37871043934086634, + "learning_rate": 0.00013234207030450595, + "loss": 0.9195, + "step": 11344 + }, + { + "epoch": 2.4200085324232083, + "grad_norm": 0.31619684197268527, + "learning_rate": 0.00013233563444758052, + "loss": 0.946, + "step": 11345 + }, + { + "epoch": 2.420221843003413, + "grad_norm": 0.3669507144111962, + "learning_rate": 0.00013232919799847651, + "loss": 0.9251, + "step": 11346 + }, + { + "epoch": 2.4204351535836177, + "grad_norm": 0.3649507538733672, + "learning_rate": 0.00013232276095726672, + "loss": 0.9222, + "step": 11347 + }, + { + "epoch": 2.4206484641638224, + "grad_norm": 0.3020165403737393, + "learning_rate": 0.00013231632332402402, + "loss": 0.925, + "step": 11348 + }, + { + "epoch": 2.4208617747440275, + "grad_norm": 0.23680073248129935, + "learning_rate": 0.00013230988509882124, + "loss": 0.8987, + "step": 11349 + }, + { + "epoch": 2.421075085324232, + "grad_norm": 0.19435372585253474, + "learning_rate": 0.00013230344628173125, + "loss": 0.9353, + "step": 11350 + }, + { + "epoch": 2.421288395904437, + "grad_norm": 0.26408658884084657, + "learning_rate": 0.00013229700687282688, + "loss": 0.9496, + "step": 11351 + }, + { + "epoch": 2.4215017064846416, + "grad_norm": 0.3639872110827884, + "learning_rate": 0.000132290566872181, + "loss": 0.9385, + "step": 11352 + }, + { + "epoch": 2.4217150170648463, + "grad_norm": 0.48928137273530725, + "learning_rate": 0.00013228412627986648, + "loss": 0.9583, + "step": 11353 + }, + { + "epoch": 2.421928327645051, + "grad_norm": 0.5776051055917025, + "learning_rate": 0.00013227768509595618, + "loss": 0.9383, + "step": 11354 + }, + { + "epoch": 2.422141638225256, + "grad_norm": 0.575827079899732, + "learning_rate": 0.00013227124332052299, + "loss": 0.8999, + "step": 11355 + }, + { + "epoch": 2.4223549488054608, + "grad_norm": 0.5247742447790797, + "learning_rate": 0.0001322648009536398, + "loss": 0.9473, + "step": 11356 + }, + { + "epoch": 2.4225682593856654, + "grad_norm": 0.4200182554944106, + "learning_rate": 0.00013225835799537952, + "loss": 0.9206, + "step": 11357 + }, + { + "epoch": 2.42278156996587, + "grad_norm": 0.2866699308551683, + "learning_rate": 0.00013225191444581503, + "loss": 0.9378, + "step": 11358 + }, + { + "epoch": 2.4229948805460753, + "grad_norm": 0.2267451929975676, + "learning_rate": 0.00013224547030501926, + "loss": 0.946, + "step": 11359 + }, + { + "epoch": 2.42320819112628, + "grad_norm": 0.3292414297567662, + "learning_rate": 0.0001322390255730651, + "loss": 0.9512, + "step": 11360 + }, + { + "epoch": 2.4234215017064846, + "grad_norm": 0.5295328057338762, + "learning_rate": 0.0001322325802500255, + "loss": 0.9439, + "step": 11361 + }, + { + "epoch": 2.4236348122866893, + "grad_norm": 0.4387433875574272, + "learning_rate": 0.00013222613433597338, + "loss": 0.9543, + "step": 11362 + }, + { + "epoch": 2.423848122866894, + "grad_norm": 0.4336132555517162, + "learning_rate": 0.00013221968783098167, + "loss": 0.8995, + "step": 11363 + }, + { + "epoch": 2.424061433447099, + "grad_norm": 0.32872320584049525, + "learning_rate": 0.0001322132407351233, + "loss": 0.9411, + "step": 11364 + }, + { + "epoch": 2.424274744027304, + "grad_norm": 0.24676472839484379, + "learning_rate": 0.0001322067930484712, + "loss": 0.9424, + "step": 11365 + }, + { + "epoch": 2.4244880546075085, + "grad_norm": 0.44654748119010657, + "learning_rate": 0.00013220034477109843, + "loss": 0.9193, + "step": 11366 + }, + { + "epoch": 2.424701365187713, + "grad_norm": 0.5257598638214892, + "learning_rate": 0.00013219389590307787, + "loss": 0.9597, + "step": 11367 + }, + { + "epoch": 2.4249146757679183, + "grad_norm": 0.625388836617257, + "learning_rate": 0.00013218744644448246, + "loss": 0.9284, + "step": 11368 + }, + { + "epoch": 2.425127986348123, + "grad_norm": 0.6050143479879274, + "learning_rate": 0.00013218099639538522, + "loss": 0.9492, + "step": 11369 + }, + { + "epoch": 2.4253412969283277, + "grad_norm": 0.7229428726038953, + "learning_rate": 0.00013217454575585916, + "loss": 0.937, + "step": 11370 + }, + { + "epoch": 2.4255546075085324, + "grad_norm": 0.7487222157694507, + "learning_rate": 0.0001321680945259772, + "loss": 0.9305, + "step": 11371 + }, + { + "epoch": 2.425767918088737, + "grad_norm": 0.6784978204343675, + "learning_rate": 0.00013216164270581242, + "loss": 0.9098, + "step": 11372 + }, + { + "epoch": 2.425981228668942, + "grad_norm": 0.5131602290362407, + "learning_rate": 0.00013215519029543773, + "loss": 0.9403, + "step": 11373 + }, + { + "epoch": 2.426194539249147, + "grad_norm": 0.3339271196679585, + "learning_rate": 0.0001321487372949262, + "loss": 0.9742, + "step": 11374 + }, + { + "epoch": 2.4264078498293515, + "grad_norm": 0.30884920260129645, + "learning_rate": 0.00013214228370435086, + "loss": 0.9109, + "step": 11375 + }, + { + "epoch": 2.426621160409556, + "grad_norm": 0.4202645104725769, + "learning_rate": 0.00013213582952378464, + "loss": 0.9475, + "step": 11376 + }, + { + "epoch": 2.426834470989761, + "grad_norm": 0.5726218790957239, + "learning_rate": 0.0001321293747533007, + "loss": 0.98, + "step": 11377 + }, + { + "epoch": 2.427047781569966, + "grad_norm": 0.6158101029809522, + "learning_rate": 0.00013212291939297197, + "loss": 0.9578, + "step": 11378 + }, + { + "epoch": 2.4272610921501707, + "grad_norm": 0.531595530998556, + "learning_rate": 0.00013211646344287156, + "loss": 0.9316, + "step": 11379 + }, + { + "epoch": 2.4274744027303754, + "grad_norm": 0.3495451465188691, + "learning_rate": 0.00013211000690307244, + "loss": 0.9551, + "step": 11380 + }, + { + "epoch": 2.42768771331058, + "grad_norm": 0.24565504635805585, + "learning_rate": 0.00013210354977364773, + "loss": 0.9468, + "step": 11381 + }, + { + "epoch": 2.4279010238907848, + "grad_norm": 0.3414781846813458, + "learning_rate": 0.00013209709205467052, + "loss": 0.94, + "step": 11382 + }, + { + "epoch": 2.42811433447099, + "grad_norm": 0.39951239311037756, + "learning_rate": 0.00013209063374621378, + "loss": 0.9419, + "step": 11383 + }, + { + "epoch": 2.4283276450511946, + "grad_norm": 0.42558861426865846, + "learning_rate": 0.00013208417484835067, + "loss": 0.9641, + "step": 11384 + }, + { + "epoch": 2.4285409556313993, + "grad_norm": 0.482607075685328, + "learning_rate": 0.00013207771536115424, + "loss": 0.9429, + "step": 11385 + }, + { + "epoch": 2.428754266211604, + "grad_norm": 0.4874432603394434, + "learning_rate": 0.0001320712552846976, + "loss": 0.9231, + "step": 11386 + }, + { + "epoch": 2.428967576791809, + "grad_norm": 0.30940531724269593, + "learning_rate": 0.0001320647946190538, + "loss": 0.889, + "step": 11387 + }, + { + "epoch": 2.4291808873720138, + "grad_norm": 0.24192084144657264, + "learning_rate": 0.000132058333364296, + "loss": 0.9463, + "step": 11388 + }, + { + "epoch": 2.4293941979522184, + "grad_norm": 0.36463726162475746, + "learning_rate": 0.0001320518715204973, + "loss": 0.9329, + "step": 11389 + }, + { + "epoch": 2.429607508532423, + "grad_norm": 0.5915989096033479, + "learning_rate": 0.00013204540908773077, + "loss": 0.9507, + "step": 11390 + }, + { + "epoch": 2.429820819112628, + "grad_norm": 0.3970316342619711, + "learning_rate": 0.00013203894606606957, + "loss": 0.8984, + "step": 11391 + }, + { + "epoch": 2.430034129692833, + "grad_norm": 2.678937327788661, + "learning_rate": 0.00013203248245558682, + "loss": 1.0458, + "step": 11392 + }, + { + "epoch": 2.4302474402730376, + "grad_norm": 0.5614229993753411, + "learning_rate": 0.00013202601825635561, + "loss": 0.9468, + "step": 11393 + }, + { + "epoch": 2.4304607508532423, + "grad_norm": 1.0852911181952434, + "learning_rate": 0.0001320195534684492, + "loss": 0.9521, + "step": 11394 + }, + { + "epoch": 2.430674061433447, + "grad_norm": 1.0339715505407463, + "learning_rate": 0.00013201308809194061, + "loss": 0.9318, + "step": 11395 + }, + { + "epoch": 2.430887372013652, + "grad_norm": 0.8322729202304818, + "learning_rate": 0.0001320066221269031, + "loss": 0.9174, + "step": 11396 + }, + { + "epoch": 2.431100682593857, + "grad_norm": 0.4626085075190884, + "learning_rate": 0.00013200015557340977, + "loss": 0.9685, + "step": 11397 + }, + { + "epoch": 2.4313139931740615, + "grad_norm": 0.6795494337696995, + "learning_rate": 0.00013199368843153382, + "loss": 0.9361, + "step": 11398 + }, + { + "epoch": 2.431527303754266, + "grad_norm": 0.9576164010023319, + "learning_rate": 0.00013198722070134836, + "loss": 0.9607, + "step": 11399 + }, + { + "epoch": 2.431740614334471, + "grad_norm": 0.8987751914288874, + "learning_rate": 0.00013198075238292665, + "loss": 0.9525, + "step": 11400 + }, + { + "epoch": 2.4319539249146755, + "grad_norm": 0.5598208746787892, + "learning_rate": 0.00013197428347634185, + "loss": 0.9441, + "step": 11401 + }, + { + "epoch": 2.4321672354948807, + "grad_norm": 0.320379546877561, + "learning_rate": 0.00013196781398166718, + "loss": 0.9626, + "step": 11402 + }, + { + "epoch": 2.4323805460750854, + "grad_norm": 0.4235457392718648, + "learning_rate": 0.0001319613438989758, + "loss": 0.9398, + "step": 11403 + }, + { + "epoch": 2.43259385665529, + "grad_norm": 0.5381829008765638, + "learning_rate": 0.00013195487322834095, + "loss": 0.9597, + "step": 11404 + }, + { + "epoch": 2.4328071672354947, + "grad_norm": 0.5733810118248039, + "learning_rate": 0.00013194840196983584, + "loss": 0.9353, + "step": 11405 + }, + { + "epoch": 2.4330204778157, + "grad_norm": 0.4254520543751027, + "learning_rate": 0.00013194193012353367, + "loss": 0.9802, + "step": 11406 + }, + { + "epoch": 2.4332337883959045, + "grad_norm": 0.31765207628132647, + "learning_rate": 0.0001319354576895077, + "loss": 0.9247, + "step": 11407 + }, + { + "epoch": 2.4334470989761092, + "grad_norm": 0.37584842608394375, + "learning_rate": 0.00013192898466783115, + "loss": 0.9546, + "step": 11408 + }, + { + "epoch": 2.433660409556314, + "grad_norm": 0.5151316720649133, + "learning_rate": 0.00013192251105857727, + "loss": 0.9345, + "step": 11409 + }, + { + "epoch": 2.4338737201365186, + "grad_norm": 0.3971093240737256, + "learning_rate": 0.00013191603686181928, + "loss": 0.9311, + "step": 11410 + }, + { + "epoch": 2.4340870307167237, + "grad_norm": 0.28809511964267376, + "learning_rate": 0.0001319095620776305, + "loss": 0.9415, + "step": 11411 + }, + { + "epoch": 2.4343003412969284, + "grad_norm": 0.3933905312813735, + "learning_rate": 0.0001319030867060841, + "loss": 0.9007, + "step": 11412 + }, + { + "epoch": 2.434513651877133, + "grad_norm": 0.3645193735494844, + "learning_rate": 0.00013189661074725347, + "loss": 0.9602, + "step": 11413 + }, + { + "epoch": 2.4347269624573378, + "grad_norm": 0.3648149773809283, + "learning_rate": 0.00013189013420121176, + "loss": 0.9283, + "step": 11414 + }, + { + "epoch": 2.434940273037543, + "grad_norm": 0.2657184733856385, + "learning_rate": 0.00013188365706803232, + "loss": 0.9369, + "step": 11415 + }, + { + "epoch": 2.4351535836177476, + "grad_norm": 0.23673298990002675, + "learning_rate": 0.00013187717934778844, + "loss": 0.9179, + "step": 11416 + }, + { + "epoch": 2.4353668941979523, + "grad_norm": 0.2719203341354306, + "learning_rate": 0.00013187070104055338, + "loss": 0.9467, + "step": 11417 + }, + { + "epoch": 2.435580204778157, + "grad_norm": 0.36104731957336267, + "learning_rate": 0.00013186422214640051, + "loss": 0.9649, + "step": 11418 + }, + { + "epoch": 2.4357935153583616, + "grad_norm": 0.32442724215554347, + "learning_rate": 0.00013185774266540306, + "loss": 0.943, + "step": 11419 + }, + { + "epoch": 2.4360068259385663, + "grad_norm": 0.2136283520391038, + "learning_rate": 0.00013185126259763436, + "loss": 0.9373, + "step": 11420 + }, + { + "epoch": 2.4362201365187715, + "grad_norm": 0.2893783486639582, + "learning_rate": 0.0001318447819431678, + "loss": 0.9189, + "step": 11421 + }, + { + "epoch": 2.436433447098976, + "grad_norm": 0.3725717779435494, + "learning_rate": 0.00013183830070207664, + "loss": 0.9179, + "step": 11422 + }, + { + "epoch": 2.436646757679181, + "grad_norm": 0.3211980111192907, + "learning_rate": 0.00013183181887443422, + "loss": 0.9657, + "step": 11423 + }, + { + "epoch": 2.4368600682593855, + "grad_norm": 0.26252728257775515, + "learning_rate": 0.00013182533646031392, + "loss": 0.932, + "step": 11424 + }, + { + "epoch": 2.4370733788395906, + "grad_norm": 0.2585926312350122, + "learning_rate": 0.00013181885345978908, + "loss": 0.9394, + "step": 11425 + }, + { + "epoch": 2.4372866894197953, + "grad_norm": 0.2344232954875828, + "learning_rate": 0.00013181236987293299, + "loss": 0.9222, + "step": 11426 + }, + { + "epoch": 2.4375, + "grad_norm": 0.29040407927965733, + "learning_rate": 0.00013180588569981907, + "loss": 0.9416, + "step": 11427 + }, + { + "epoch": 2.4377133105802047, + "grad_norm": 0.39426114153126485, + "learning_rate": 0.00013179940094052074, + "loss": 0.9655, + "step": 11428 + }, + { + "epoch": 2.4379266211604094, + "grad_norm": 0.3546200613965189, + "learning_rate": 0.00013179291559511124, + "loss": 0.9539, + "step": 11429 + }, + { + "epoch": 2.4381399317406145, + "grad_norm": 0.4351016078490988, + "learning_rate": 0.0001317864296636641, + "loss": 0.9759, + "step": 11430 + }, + { + "epoch": 2.438353242320819, + "grad_norm": 0.21618306735641796, + "learning_rate": 0.0001317799431462526, + "loss": 0.9437, + "step": 11431 + }, + { + "epoch": 2.438566552901024, + "grad_norm": 0.245222420046864, + "learning_rate": 0.0001317734560429502, + "loss": 0.9329, + "step": 11432 + }, + { + "epoch": 2.4387798634812285, + "grad_norm": 0.3038705790934869, + "learning_rate": 0.00013176696835383024, + "loss": 0.9456, + "step": 11433 + }, + { + "epoch": 2.4389931740614337, + "grad_norm": 0.26651974428673586, + "learning_rate": 0.00013176048007896616, + "loss": 0.9419, + "step": 11434 + }, + { + "epoch": 2.4392064846416384, + "grad_norm": 0.31144791478513184, + "learning_rate": 0.0001317539912184314, + "loss": 0.9448, + "step": 11435 + }, + { + "epoch": 2.439419795221843, + "grad_norm": 0.2805462384643346, + "learning_rate": 0.00013174750177229937, + "loss": 0.9483, + "step": 11436 + }, + { + "epoch": 2.4396331058020477, + "grad_norm": 0.20450464506594743, + "learning_rate": 0.00013174101174064345, + "loss": 0.9635, + "step": 11437 + }, + { + "epoch": 2.4398464163822524, + "grad_norm": 0.3258657801946832, + "learning_rate": 0.00013173452112353711, + "loss": 0.9834, + "step": 11438 + }, + { + "epoch": 2.440059726962457, + "grad_norm": 0.28974677390570197, + "learning_rate": 0.00013172802992105382, + "loss": 0.9114, + "step": 11439 + }, + { + "epoch": 2.4402730375426622, + "grad_norm": 0.22284528285945132, + "learning_rate": 0.000131721538133267, + "loss": 0.962, + "step": 11440 + }, + { + "epoch": 2.440486348122867, + "grad_norm": 0.22431905694080584, + "learning_rate": 0.00013171504576025013, + "loss": 0.9275, + "step": 11441 + }, + { + "epoch": 2.4406996587030716, + "grad_norm": 0.24218319437208968, + "learning_rate": 0.0001317085528020766, + "loss": 0.9708, + "step": 11442 + }, + { + "epoch": 2.4409129692832763, + "grad_norm": 0.2017419518775522, + "learning_rate": 0.00013170205925881994, + "loss": 0.9238, + "step": 11443 + }, + { + "epoch": 2.4411262798634814, + "grad_norm": 0.2521321919439793, + "learning_rate": 0.00013169556513055363, + "loss": 0.9644, + "step": 11444 + }, + { + "epoch": 2.441339590443686, + "grad_norm": 0.38380521125769546, + "learning_rate": 0.0001316890704173511, + "loss": 0.9404, + "step": 11445 + }, + { + "epoch": 2.4415529010238908, + "grad_norm": 0.35615493152790734, + "learning_rate": 0.00013168257511928587, + "loss": 0.9341, + "step": 11446 + }, + { + "epoch": 2.4417662116040955, + "grad_norm": 0.2994662505877289, + "learning_rate": 0.00013167607923643147, + "loss": 0.9708, + "step": 11447 + }, + { + "epoch": 2.4419795221843, + "grad_norm": 0.30264806760500457, + "learning_rate": 0.00013166958276886133, + "loss": 0.9411, + "step": 11448 + }, + { + "epoch": 2.4421928327645053, + "grad_norm": 0.26926317190943705, + "learning_rate": 0.000131663085716649, + "loss": 0.9502, + "step": 11449 + }, + { + "epoch": 2.44240614334471, + "grad_norm": 0.2557297222065552, + "learning_rate": 0.000131656588079868, + "loss": 0.951, + "step": 11450 + }, + { + "epoch": 2.4426194539249146, + "grad_norm": 0.253765327425416, + "learning_rate": 0.00013165008985859181, + "loss": 0.9537, + "step": 11451 + }, + { + "epoch": 2.4428327645051193, + "grad_norm": 0.23504000422561958, + "learning_rate": 0.00013164359105289402, + "loss": 0.9451, + "step": 11452 + }, + { + "epoch": 2.4430460750853245, + "grad_norm": 0.24736689018660502, + "learning_rate": 0.0001316370916628481, + "loss": 0.9427, + "step": 11453 + }, + { + "epoch": 2.443259385665529, + "grad_norm": 0.27563987787715377, + "learning_rate": 0.00013163059168852764, + "loss": 0.9273, + "step": 11454 + }, + { + "epoch": 2.443472696245734, + "grad_norm": 0.2741503300359164, + "learning_rate": 0.00013162409113000616, + "loss": 0.9285, + "step": 11455 + }, + { + "epoch": 2.4436860068259385, + "grad_norm": 0.2065017739565031, + "learning_rate": 0.00013161758998735717, + "loss": 0.9169, + "step": 11456 + }, + { + "epoch": 2.443899317406143, + "grad_norm": 0.29362467128933384, + "learning_rate": 0.00013161108826065436, + "loss": 0.953, + "step": 11457 + }, + { + "epoch": 2.444112627986348, + "grad_norm": 0.3709131353018722, + "learning_rate": 0.00013160458594997117, + "loss": 0.9538, + "step": 11458 + }, + { + "epoch": 2.444325938566553, + "grad_norm": 0.2781431163930394, + "learning_rate": 0.00013159808305538125, + "loss": 0.9475, + "step": 11459 + }, + { + "epoch": 2.4445392491467577, + "grad_norm": 0.22962567884710847, + "learning_rate": 0.00013159157957695812, + "loss": 0.9337, + "step": 11460 + }, + { + "epoch": 2.4447525597269624, + "grad_norm": 0.2901164855339364, + "learning_rate": 0.00013158507551477543, + "loss": 0.932, + "step": 11461 + }, + { + "epoch": 2.444965870307167, + "grad_norm": 0.2821896963970629, + "learning_rate": 0.00013157857086890673, + "loss": 0.9556, + "step": 11462 + }, + { + "epoch": 2.445179180887372, + "grad_norm": 0.24857695119698486, + "learning_rate": 0.00013157206563942564, + "loss": 0.9215, + "step": 11463 + }, + { + "epoch": 2.445392491467577, + "grad_norm": 0.27571964142924005, + "learning_rate": 0.00013156555982640576, + "loss": 0.9177, + "step": 11464 + }, + { + "epoch": 2.4456058020477816, + "grad_norm": 0.24906250273698619, + "learning_rate": 0.00013155905342992068, + "loss": 0.966, + "step": 11465 + }, + { + "epoch": 2.4458191126279862, + "grad_norm": 0.2589056872063703, + "learning_rate": 0.00013155254645004405, + "loss": 0.9353, + "step": 11466 + }, + { + "epoch": 2.446032423208191, + "grad_norm": 0.22514456105179015, + "learning_rate": 0.00013154603888684951, + "loss": 0.9257, + "step": 11467 + }, + { + "epoch": 2.446245733788396, + "grad_norm": 0.276450262315565, + "learning_rate": 0.00013153953074041067, + "loss": 0.9205, + "step": 11468 + }, + { + "epoch": 2.4464590443686007, + "grad_norm": 0.26449242696061, + "learning_rate": 0.00013153302201080117, + "loss": 0.9427, + "step": 11469 + }, + { + "epoch": 2.4466723549488054, + "grad_norm": 0.25711440119447926, + "learning_rate": 0.00013152651269809465, + "loss": 0.9556, + "step": 11470 + }, + { + "epoch": 2.44688566552901, + "grad_norm": 0.22851825317601046, + "learning_rate": 0.00013152000280236477, + "loss": 0.9282, + "step": 11471 + }, + { + "epoch": 2.4470989761092152, + "grad_norm": 0.22001459794989586, + "learning_rate": 0.00013151349232368522, + "loss": 0.9339, + "step": 11472 + }, + { + "epoch": 2.44731228668942, + "grad_norm": 0.29881198228543404, + "learning_rate": 0.0001315069812621296, + "loss": 0.9705, + "step": 11473 + }, + { + "epoch": 2.4475255972696246, + "grad_norm": 0.31270331734608947, + "learning_rate": 0.00013150046961777164, + "loss": 0.9442, + "step": 11474 + }, + { + "epoch": 2.4477389078498293, + "grad_norm": 0.34522037052861304, + "learning_rate": 0.000131493957390685, + "loss": 0.9477, + "step": 11475 + }, + { + "epoch": 2.447952218430034, + "grad_norm": 0.25175533746908374, + "learning_rate": 0.00013148744458094336, + "loss": 0.914, + "step": 11476 + }, + { + "epoch": 2.448165529010239, + "grad_norm": 0.2402624884422217, + "learning_rate": 0.00013148093118862042, + "loss": 0.9048, + "step": 11477 + }, + { + "epoch": 2.448378839590444, + "grad_norm": 0.2709672996224809, + "learning_rate": 0.00013147441721378988, + "loss": 0.9022, + "step": 11478 + }, + { + "epoch": 2.4485921501706485, + "grad_norm": 0.23333655896481287, + "learning_rate": 0.00013146790265652545, + "loss": 0.9445, + "step": 11479 + }, + { + "epoch": 2.448805460750853, + "grad_norm": 0.20497618249834892, + "learning_rate": 0.00013146138751690082, + "loss": 0.9513, + "step": 11480 + }, + { + "epoch": 2.449018771331058, + "grad_norm": 0.25590086841522824, + "learning_rate": 0.00013145487179498975, + "loss": 0.924, + "step": 11481 + }, + { + "epoch": 2.449232081911263, + "grad_norm": 0.27749935079348353, + "learning_rate": 0.0001314483554908659, + "loss": 0.9197, + "step": 11482 + }, + { + "epoch": 2.4494453924914676, + "grad_norm": 0.2496396508152097, + "learning_rate": 0.00013144183860460307, + "loss": 0.9372, + "step": 11483 + }, + { + "epoch": 2.4496587030716723, + "grad_norm": 0.26741099877580066, + "learning_rate": 0.000131435321136275, + "loss": 0.9735, + "step": 11484 + }, + { + "epoch": 2.449872013651877, + "grad_norm": 0.22217663709407875, + "learning_rate": 0.00013142880308595535, + "loss": 0.9414, + "step": 11485 + }, + { + "epoch": 2.4500853242320817, + "grad_norm": 0.29240934879249775, + "learning_rate": 0.00013142228445371798, + "loss": 0.9479, + "step": 11486 + }, + { + "epoch": 2.450298634812287, + "grad_norm": 0.2608234546117919, + "learning_rate": 0.00013141576523963654, + "loss": 0.954, + "step": 11487 + }, + { + "epoch": 2.4505119453924915, + "grad_norm": 0.22160622101718225, + "learning_rate": 0.00013140924544378487, + "loss": 0.9349, + "step": 11488 + }, + { + "epoch": 2.450725255972696, + "grad_norm": 0.2458612047566633, + "learning_rate": 0.00013140272506623673, + "loss": 0.9501, + "step": 11489 + }, + { + "epoch": 2.450938566552901, + "grad_norm": 0.23570779269767417, + "learning_rate": 0.0001313962041070659, + "loss": 0.9544, + "step": 11490 + }, + { + "epoch": 2.451151877133106, + "grad_norm": 0.18801235685472945, + "learning_rate": 0.00013138968256634615, + "loss": 0.9195, + "step": 11491 + }, + { + "epoch": 2.4513651877133107, + "grad_norm": 0.2592514220341204, + "learning_rate": 0.0001313831604441513, + "loss": 0.9348, + "step": 11492 + }, + { + "epoch": 2.4515784982935154, + "grad_norm": 0.18382468927140785, + "learning_rate": 0.00013137663774055508, + "loss": 0.9241, + "step": 11493 + }, + { + "epoch": 2.45179180887372, + "grad_norm": 0.20153635703961756, + "learning_rate": 0.00013137011445563137, + "loss": 0.9332, + "step": 11494 + }, + { + "epoch": 2.4520051194539247, + "grad_norm": 0.23136547589445028, + "learning_rate": 0.00013136359058945392, + "loss": 0.9181, + "step": 11495 + }, + { + "epoch": 2.45221843003413, + "grad_norm": 0.22650235021935494, + "learning_rate": 0.0001313570661420966, + "loss": 0.951, + "step": 11496 + }, + { + "epoch": 2.4524317406143346, + "grad_norm": 0.178668446824543, + "learning_rate": 0.00013135054111363322, + "loss": 0.903, + "step": 11497 + }, + { + "epoch": 2.4526450511945392, + "grad_norm": 0.23038871786084936, + "learning_rate": 0.00013134401550413761, + "loss": 0.9606, + "step": 11498 + }, + { + "epoch": 2.452858361774744, + "grad_norm": 0.22664745529351651, + "learning_rate": 0.00013133748931368357, + "loss": 0.9287, + "step": 11499 + }, + { + "epoch": 2.453071672354949, + "grad_norm": 0.210601207045242, + "learning_rate": 0.000131330962542345, + "loss": 0.9327, + "step": 11500 + }, + { + "epoch": 2.4532849829351537, + "grad_norm": 0.25241275183053385, + "learning_rate": 0.00013132443519019572, + "loss": 0.9348, + "step": 11501 + }, + { + "epoch": 2.4534982935153584, + "grad_norm": 0.2500932033123078, + "learning_rate": 0.0001313179072573096, + "loss": 0.9603, + "step": 11502 + }, + { + "epoch": 2.453711604095563, + "grad_norm": 0.25833208148505443, + "learning_rate": 0.00013131137874376048, + "loss": 0.9166, + "step": 11503 + }, + { + "epoch": 2.453924914675768, + "grad_norm": 0.2612370036214827, + "learning_rate": 0.00013130484964962225, + "loss": 0.9271, + "step": 11504 + }, + { + "epoch": 2.4541382252559725, + "grad_norm": 0.23308763203570967, + "learning_rate": 0.0001312983199749688, + "loss": 0.9163, + "step": 11505 + }, + { + "epoch": 2.4543515358361776, + "grad_norm": 0.2693458657664416, + "learning_rate": 0.00013129178971987398, + "loss": 0.908, + "step": 11506 + }, + { + "epoch": 2.4545648464163823, + "grad_norm": 0.4295094412623849, + "learning_rate": 0.00013128525888441168, + "loss": 0.9428, + "step": 11507 + }, + { + "epoch": 2.454778156996587, + "grad_norm": 0.402779856412796, + "learning_rate": 0.00013127872746865584, + "loss": 0.9465, + "step": 11508 + }, + { + "epoch": 2.4549914675767917, + "grad_norm": 0.21887780330784473, + "learning_rate": 0.00013127219547268035, + "loss": 0.9186, + "step": 11509 + }, + { + "epoch": 2.455204778156997, + "grad_norm": 0.3014094683232345, + "learning_rate": 0.00013126566289655908, + "loss": 0.9359, + "step": 11510 + }, + { + "epoch": 2.4554180887372015, + "grad_norm": 0.3800494528611895, + "learning_rate": 0.000131259129740366, + "loss": 0.9556, + "step": 11511 + }, + { + "epoch": 2.455631399317406, + "grad_norm": 0.3190458331855992, + "learning_rate": 0.00013125259600417494, + "loss": 0.9207, + "step": 11512 + }, + { + "epoch": 2.455844709897611, + "grad_norm": 0.2774489511469751, + "learning_rate": 0.00013124606168805997, + "loss": 0.9479, + "step": 11513 + }, + { + "epoch": 2.4560580204778155, + "grad_norm": 0.30624075814824897, + "learning_rate": 0.0001312395267920949, + "loss": 0.9321, + "step": 11514 + }, + { + "epoch": 2.4562713310580206, + "grad_norm": 0.35417716048966263, + "learning_rate": 0.00013123299131635375, + "loss": 0.9406, + "step": 11515 + }, + { + "epoch": 2.4564846416382253, + "grad_norm": 0.33591058710479044, + "learning_rate": 0.00013122645526091043, + "loss": 0.9455, + "step": 11516 + }, + { + "epoch": 2.45669795221843, + "grad_norm": 0.2762785324761308, + "learning_rate": 0.00013121991862583888, + "loss": 0.9747, + "step": 11517 + }, + { + "epoch": 2.4569112627986347, + "grad_norm": 0.29935978827586535, + "learning_rate": 0.00013121338141121314, + "loss": 0.9153, + "step": 11518 + }, + { + "epoch": 2.45712457337884, + "grad_norm": 0.4123379767177517, + "learning_rate": 0.00013120684361710708, + "loss": 0.9568, + "step": 11519 + }, + { + "epoch": 2.4573378839590445, + "grad_norm": 0.3581566886112926, + "learning_rate": 0.00013120030524359477, + "loss": 0.9386, + "step": 11520 + }, + { + "epoch": 2.457551194539249, + "grad_norm": 0.2446256639597451, + "learning_rate": 0.0001311937662907501, + "loss": 0.9333, + "step": 11521 + }, + { + "epoch": 2.457764505119454, + "grad_norm": 0.30988130618333426, + "learning_rate": 0.00013118722675864715, + "loss": 0.9389, + "step": 11522 + }, + { + "epoch": 2.4579778156996586, + "grad_norm": 0.3438720929542791, + "learning_rate": 0.00013118068664735985, + "loss": 0.9469, + "step": 11523 + }, + { + "epoch": 2.4581911262798632, + "grad_norm": 0.33856262256879777, + "learning_rate": 0.0001311741459569622, + "loss": 0.9259, + "step": 11524 + }, + { + "epoch": 2.4584044368600684, + "grad_norm": 0.28663315368915687, + "learning_rate": 0.00013116760468752825, + "loss": 0.9329, + "step": 11525 + }, + { + "epoch": 2.458617747440273, + "grad_norm": 0.23258833067783724, + "learning_rate": 0.00013116106283913198, + "loss": 0.9146, + "step": 11526 + }, + { + "epoch": 2.4588310580204777, + "grad_norm": 0.21877160540503687, + "learning_rate": 0.00013115452041184743, + "loss": 0.9319, + "step": 11527 + }, + { + "epoch": 2.4590443686006824, + "grad_norm": 0.22489054654941906, + "learning_rate": 0.00013114797740574862, + "loss": 0.9144, + "step": 11528 + }, + { + "epoch": 2.4592576791808876, + "grad_norm": 0.24005030542527137, + "learning_rate": 0.00013114143382090957, + "loss": 0.9449, + "step": 11529 + }, + { + "epoch": 2.4594709897610922, + "grad_norm": 0.9457751694801101, + "learning_rate": 0.00013113488965740437, + "loss": 0.9746, + "step": 11530 + }, + { + "epoch": 2.459684300341297, + "grad_norm": 0.3874813719458462, + "learning_rate": 0.000131128344915307, + "loss": 0.9441, + "step": 11531 + }, + { + "epoch": 2.4598976109215016, + "grad_norm": 0.2926376571655928, + "learning_rate": 0.00013112179959469156, + "loss": 0.9448, + "step": 11532 + }, + { + "epoch": 2.4601109215017063, + "grad_norm": 0.2294476680614021, + "learning_rate": 0.0001311152536956321, + "loss": 0.9085, + "step": 11533 + }, + { + "epoch": 2.4603242320819114, + "grad_norm": 0.24985035181533288, + "learning_rate": 0.00013110870721820268, + "loss": 0.9592, + "step": 11534 + }, + { + "epoch": 2.460537542662116, + "grad_norm": 0.23979666526229504, + "learning_rate": 0.00013110216016247738, + "loss": 0.9299, + "step": 11535 + }, + { + "epoch": 2.460750853242321, + "grad_norm": 0.26376629960066844, + "learning_rate": 0.00013109561252853028, + "loss": 0.9271, + "step": 11536 + }, + { + "epoch": 2.4609641638225255, + "grad_norm": 0.31400086779905206, + "learning_rate": 0.00013108906431643548, + "loss": 0.9651, + "step": 11537 + }, + { + "epoch": 2.4611774744027306, + "grad_norm": 0.39104073135030615, + "learning_rate": 0.00013108251552626702, + "loss": 0.9316, + "step": 11538 + }, + { + "epoch": 2.4613907849829353, + "grad_norm": 0.42852311296933066, + "learning_rate": 0.00013107596615809908, + "loss": 0.9324, + "step": 11539 + }, + { + "epoch": 2.46160409556314, + "grad_norm": 0.44865413446653757, + "learning_rate": 0.0001310694162120057, + "loss": 0.94, + "step": 11540 + }, + { + "epoch": 2.4618174061433447, + "grad_norm": 0.4389305569078906, + "learning_rate": 0.00013106286568806102, + "loss": 0.9133, + "step": 11541 + }, + { + "epoch": 2.4620307167235493, + "grad_norm": 0.3574032246662935, + "learning_rate": 0.00013105631458633916, + "loss": 0.9419, + "step": 11542 + }, + { + "epoch": 2.462244027303754, + "grad_norm": 0.2288980250433151, + "learning_rate": 0.00013104976290691424, + "loss": 0.9038, + "step": 11543 + }, + { + "epoch": 2.462457337883959, + "grad_norm": 0.24419633145475156, + "learning_rate": 0.0001310432106498604, + "loss": 0.9252, + "step": 11544 + }, + { + "epoch": 2.462670648464164, + "grad_norm": 0.40356962758322007, + "learning_rate": 0.00013103665781525174, + "loss": 0.9188, + "step": 11545 + }, + { + "epoch": 2.4628839590443685, + "grad_norm": 0.4826676480546799, + "learning_rate": 0.00013103010440316246, + "loss": 0.9435, + "step": 11546 + }, + { + "epoch": 2.463097269624573, + "grad_norm": 0.43373858662900083, + "learning_rate": 0.00013102355041366668, + "loss": 0.9334, + "step": 11547 + }, + { + "epoch": 2.4633105802047783, + "grad_norm": 0.31382936784006127, + "learning_rate": 0.0001310169958468386, + "loss": 0.9275, + "step": 11548 + }, + { + "epoch": 2.463523890784983, + "grad_norm": 0.24258502278327765, + "learning_rate": 0.00013101044070275235, + "loss": 0.9517, + "step": 11549 + }, + { + "epoch": 2.4637372013651877, + "grad_norm": 0.2899222274844624, + "learning_rate": 0.00013100388498148206, + "loss": 0.9613, + "step": 11550 + }, + { + "epoch": 2.4639505119453924, + "grad_norm": 0.3340310653323296, + "learning_rate": 0.000130997328683102, + "loss": 0.9398, + "step": 11551 + }, + { + "epoch": 2.464163822525597, + "grad_norm": 0.41732035217566854, + "learning_rate": 0.0001309907718076863, + "loss": 0.9176, + "step": 11552 + }, + { + "epoch": 2.464377133105802, + "grad_norm": 0.520929515814589, + "learning_rate": 0.00013098421435530914, + "loss": 0.9433, + "step": 11553 + }, + { + "epoch": 2.464590443686007, + "grad_norm": 0.5139316217359106, + "learning_rate": 0.00013097765632604476, + "loss": 0.9422, + "step": 11554 + }, + { + "epoch": 2.4648037542662116, + "grad_norm": 0.37517614720986053, + "learning_rate": 0.00013097109771996733, + "loss": 0.9278, + "step": 11555 + }, + { + "epoch": 2.4650170648464163, + "grad_norm": 0.2798619103470225, + "learning_rate": 0.00013096453853715106, + "loss": 0.9316, + "step": 11556 + }, + { + "epoch": 2.4652303754266214, + "grad_norm": 0.26538397586677404, + "learning_rate": 0.0001309579787776702, + "loss": 0.9213, + "step": 11557 + }, + { + "epoch": 2.465443686006826, + "grad_norm": 0.31984557069822095, + "learning_rate": 0.00013095141844159894, + "loss": 0.9469, + "step": 11558 + }, + { + "epoch": 2.4656569965870307, + "grad_norm": 0.45449479833347917, + "learning_rate": 0.00013094485752901156, + "loss": 0.9689, + "step": 11559 + }, + { + "epoch": 2.4658703071672354, + "grad_norm": 0.5432945986542531, + "learning_rate": 0.0001309382960399822, + "loss": 0.9898, + "step": 11560 + }, + { + "epoch": 2.46608361774744, + "grad_norm": 0.5510069712002313, + "learning_rate": 0.0001309317339745852, + "loss": 0.9185, + "step": 11561 + }, + { + "epoch": 2.4662969283276452, + "grad_norm": 0.42384219021200226, + "learning_rate": 0.00013092517133289476, + "loss": 0.9262, + "step": 11562 + }, + { + "epoch": 2.46651023890785, + "grad_norm": 0.2731030226789124, + "learning_rate": 0.00013091860811498515, + "loss": 0.9248, + "step": 11563 + }, + { + "epoch": 2.4667235494880546, + "grad_norm": 0.2286483272899946, + "learning_rate": 0.00013091204432093064, + "loss": 0.896, + "step": 11564 + }, + { + "epoch": 2.4669368600682593, + "grad_norm": 0.27445779566537304, + "learning_rate": 0.0001309054799508055, + "loss": 0.9502, + "step": 11565 + }, + { + "epoch": 2.467150170648464, + "grad_norm": 0.29519944287784866, + "learning_rate": 0.00013089891500468398, + "loss": 0.9638, + "step": 11566 + }, + { + "epoch": 2.467363481228669, + "grad_norm": 0.3229881175398991, + "learning_rate": 0.00013089234948264038, + "loss": 0.9297, + "step": 11567 + }, + { + "epoch": 2.467576791808874, + "grad_norm": 0.3540516919474489, + "learning_rate": 0.00013088578338474902, + "loss": 0.9243, + "step": 11568 + }, + { + "epoch": 2.4677901023890785, + "grad_norm": 0.24209733016279675, + "learning_rate": 0.00013087921671108411, + "loss": 0.9515, + "step": 11569 + }, + { + "epoch": 2.468003412969283, + "grad_norm": 0.20861798820246405, + "learning_rate": 0.00013087264946172006, + "loss": 0.9605, + "step": 11570 + }, + { + "epoch": 2.468216723549488, + "grad_norm": 0.2715975557915065, + "learning_rate": 0.0001308660816367311, + "loss": 0.9304, + "step": 11571 + }, + { + "epoch": 2.468430034129693, + "grad_norm": 0.3085956242394667, + "learning_rate": 0.00013085951323619157, + "loss": 0.9289, + "step": 11572 + }, + { + "epoch": 2.4686433447098977, + "grad_norm": 0.2857498109744808, + "learning_rate": 0.00013085294426017581, + "loss": 0.9161, + "step": 11573 + }, + { + "epoch": 2.4688566552901023, + "grad_norm": 0.30148423063797214, + "learning_rate": 0.0001308463747087581, + "loss": 0.9007, + "step": 11574 + }, + { + "epoch": 2.469069965870307, + "grad_norm": 0.27777698837298, + "learning_rate": 0.00013083980458201286, + "loss": 0.9169, + "step": 11575 + }, + { + "epoch": 2.469283276450512, + "grad_norm": 0.23483114942116723, + "learning_rate": 0.00013083323388001433, + "loss": 0.9479, + "step": 11576 + }, + { + "epoch": 2.469496587030717, + "grad_norm": 0.2632780941616446, + "learning_rate": 0.0001308266626028369, + "loss": 0.9354, + "step": 11577 + }, + { + "epoch": 2.4697098976109215, + "grad_norm": 0.32259616996022866, + "learning_rate": 0.00013082009075055494, + "loss": 0.9293, + "step": 11578 + }, + { + "epoch": 2.469923208191126, + "grad_norm": 0.335382679377945, + "learning_rate": 0.0001308135183232428, + "loss": 0.9275, + "step": 11579 + }, + { + "epoch": 2.470136518771331, + "grad_norm": 0.35189125958940587, + "learning_rate": 0.00013080694532097484, + "loss": 0.9285, + "step": 11580 + }, + { + "epoch": 2.470349829351536, + "grad_norm": 0.295912986936801, + "learning_rate": 0.00013080037174382545, + "loss": 0.9103, + "step": 11581 + }, + { + "epoch": 2.4705631399317407, + "grad_norm": 0.22645346258941493, + "learning_rate": 0.000130793797591869, + "loss": 0.9394, + "step": 11582 + }, + { + "epoch": 2.4707764505119454, + "grad_norm": 0.2448425635580258, + "learning_rate": 0.00013078722286517986, + "loss": 0.9337, + "step": 11583 + }, + { + "epoch": 2.47098976109215, + "grad_norm": 0.25793480939242386, + "learning_rate": 0.00013078064756383246, + "loss": 0.9216, + "step": 11584 + }, + { + "epoch": 2.4712030716723548, + "grad_norm": 0.31636097639888333, + "learning_rate": 0.00013077407168790117, + "loss": 0.9183, + "step": 11585 + }, + { + "epoch": 2.47141638225256, + "grad_norm": 0.3036024035912175, + "learning_rate": 0.0001307674952374604, + "loss": 0.9635, + "step": 11586 + }, + { + "epoch": 2.4716296928327646, + "grad_norm": 0.3411162280223505, + "learning_rate": 0.00013076091821258455, + "loss": 0.9003, + "step": 11587 + }, + { + "epoch": 2.4718430034129693, + "grad_norm": 0.3929504783980641, + "learning_rate": 0.0001307543406133481, + "loss": 0.928, + "step": 11588 + }, + { + "epoch": 2.472056313993174, + "grad_norm": 0.28848927825078097, + "learning_rate": 0.0001307477624398254, + "loss": 0.9191, + "step": 11589 + }, + { + "epoch": 2.4722696245733786, + "grad_norm": 0.2857023377131507, + "learning_rate": 0.00013074118369209094, + "loss": 0.9684, + "step": 11590 + }, + { + "epoch": 2.4724829351535837, + "grad_norm": 0.3092741102773993, + "learning_rate": 0.0001307346043702191, + "loss": 0.923, + "step": 11591 + }, + { + "epoch": 2.4726962457337884, + "grad_norm": 0.20023308674336734, + "learning_rate": 0.00013072802447428437, + "loss": 0.9189, + "step": 11592 + }, + { + "epoch": 2.472909556313993, + "grad_norm": 0.28703202027235986, + "learning_rate": 0.0001307214440043612, + "loss": 0.9304, + "step": 11593 + }, + { + "epoch": 2.473122866894198, + "grad_norm": 0.29445633016211425, + "learning_rate": 0.00013071486296052404, + "loss": 0.9373, + "step": 11594 + }, + { + "epoch": 2.473336177474403, + "grad_norm": 0.2966962253810437, + "learning_rate": 0.00013070828134284732, + "loss": 0.9294, + "step": 11595 + }, + { + "epoch": 2.4735494880546076, + "grad_norm": 0.29197580495644565, + "learning_rate": 0.00013070169915140557, + "loss": 0.9161, + "step": 11596 + }, + { + "epoch": 2.4737627986348123, + "grad_norm": 0.2006402521197124, + "learning_rate": 0.00013069511638627327, + "loss": 0.9157, + "step": 11597 + }, + { + "epoch": 2.473976109215017, + "grad_norm": 0.21695736612992086, + "learning_rate": 0.00013068853304752483, + "loss": 0.9558, + "step": 11598 + }, + { + "epoch": 2.4741894197952217, + "grad_norm": 0.2366338306304267, + "learning_rate": 0.0001306819491352348, + "loss": 0.9493, + "step": 11599 + }, + { + "epoch": 2.474402730375427, + "grad_norm": 0.21417812661399732, + "learning_rate": 0.00013067536464947765, + "loss": 0.9321, + "step": 11600 + }, + { + "epoch": 2.4746160409556315, + "grad_norm": 0.20771869274155375, + "learning_rate": 0.0001306687795903279, + "loss": 0.9531, + "step": 11601 + }, + { + "epoch": 2.474829351535836, + "grad_norm": 0.23289776683640148, + "learning_rate": 0.00013066219395786008, + "loss": 0.9327, + "step": 11602 + }, + { + "epoch": 2.475042662116041, + "grad_norm": 0.253118108537606, + "learning_rate": 0.00013065560775214868, + "loss": 0.9268, + "step": 11603 + }, + { + "epoch": 2.475255972696246, + "grad_norm": 0.22299687923788644, + "learning_rate": 0.00013064902097326822, + "loss": 0.9197, + "step": 11604 + }, + { + "epoch": 2.4754692832764507, + "grad_norm": 0.21749807089576112, + "learning_rate": 0.00013064243362129324, + "loss": 0.9329, + "step": 11605 + }, + { + "epoch": 2.4756825938566553, + "grad_norm": 0.2236732198780656, + "learning_rate": 0.00013063584569629827, + "loss": 0.9324, + "step": 11606 + }, + { + "epoch": 2.47589590443686, + "grad_norm": 0.2526795539822781, + "learning_rate": 0.00013062925719835783, + "loss": 0.9459, + "step": 11607 + }, + { + "epoch": 2.4761092150170647, + "grad_norm": 0.24538363298898508, + "learning_rate": 0.0001306226681275465, + "loss": 0.92, + "step": 11608 + }, + { + "epoch": 2.4763225255972694, + "grad_norm": 0.26575552361251625, + "learning_rate": 0.00013061607848393884, + "loss": 0.9507, + "step": 11609 + }, + { + "epoch": 2.4765358361774745, + "grad_norm": 0.23542753100235114, + "learning_rate": 0.0001306094882676094, + "loss": 0.9623, + "step": 11610 + }, + { + "epoch": 2.476749146757679, + "grad_norm": 0.20273272716934307, + "learning_rate": 0.00013060289747863276, + "loss": 0.9303, + "step": 11611 + }, + { + "epoch": 2.476962457337884, + "grad_norm": 0.27632300678462324, + "learning_rate": 0.00013059630611708347, + "loss": 0.9551, + "step": 11612 + }, + { + "epoch": 2.4771757679180886, + "grad_norm": 0.31485432355383614, + "learning_rate": 0.0001305897141830361, + "loss": 0.9337, + "step": 11613 + }, + { + "epoch": 2.4773890784982937, + "grad_norm": 0.30022999411417084, + "learning_rate": 0.00013058312167656531, + "loss": 0.9817, + "step": 11614 + }, + { + "epoch": 2.4776023890784984, + "grad_norm": 0.31280806428364993, + "learning_rate": 0.00013057652859774566, + "loss": 0.9665, + "step": 11615 + }, + { + "epoch": 2.477815699658703, + "grad_norm": 0.2811066316372995, + "learning_rate": 0.00013056993494665168, + "loss": 0.9054, + "step": 11616 + }, + { + "epoch": 2.4780290102389078, + "grad_norm": 0.2673394904969862, + "learning_rate": 0.00013056334072335805, + "loss": 0.9374, + "step": 11617 + }, + { + "epoch": 2.4782423208191124, + "grad_norm": 0.23361689005822261, + "learning_rate": 0.00013055674592793939, + "loss": 0.9409, + "step": 11618 + }, + { + "epoch": 2.4784556313993176, + "grad_norm": 0.23451686355239323, + "learning_rate": 0.00013055015056047032, + "loss": 0.9583, + "step": 11619 + }, + { + "epoch": 2.4786689419795223, + "grad_norm": 0.24141573570611455, + "learning_rate": 0.0001305435546210254, + "loss": 0.9271, + "step": 11620 + }, + { + "epoch": 2.478882252559727, + "grad_norm": 0.2806806195915419, + "learning_rate": 0.00013053695810967932, + "loss": 0.928, + "step": 11621 + }, + { + "epoch": 2.4790955631399316, + "grad_norm": 0.26533505704968324, + "learning_rate": 0.00013053036102650674, + "loss": 0.9384, + "step": 11622 + }, + { + "epoch": 2.4793088737201368, + "grad_norm": 0.2974677661865847, + "learning_rate": 0.00013052376337158224, + "loss": 0.9441, + "step": 11623 + }, + { + "epoch": 2.4795221843003414, + "grad_norm": 0.3197700639948536, + "learning_rate": 0.0001305171651449805, + "loss": 0.9413, + "step": 11624 + }, + { + "epoch": 2.479735494880546, + "grad_norm": 0.26044366104088346, + "learning_rate": 0.00013051056634677622, + "loss": 0.9148, + "step": 11625 + }, + { + "epoch": 2.479948805460751, + "grad_norm": 0.29543694988473146, + "learning_rate": 0.000130503966977044, + "loss": 0.9215, + "step": 11626 + }, + { + "epoch": 2.4801621160409555, + "grad_norm": 0.31085718723654193, + "learning_rate": 0.00013049736703585858, + "loss": 0.9083, + "step": 11627 + }, + { + "epoch": 2.48037542662116, + "grad_norm": 0.2673330623121967, + "learning_rate": 0.00013049076652329458, + "loss": 0.9155, + "step": 11628 + }, + { + "epoch": 2.4805887372013653, + "grad_norm": 0.33673986856300264, + "learning_rate": 0.00013048416543942672, + "loss": 0.9116, + "step": 11629 + }, + { + "epoch": 2.48080204778157, + "grad_norm": 0.31382900635704636, + "learning_rate": 0.00013047756378432965, + "loss": 0.9327, + "step": 11630 + }, + { + "epoch": 2.4810153583617747, + "grad_norm": 0.22226604663548638, + "learning_rate": 0.00013047096155807813, + "loss": 0.9457, + "step": 11631 + }, + { + "epoch": 2.4812286689419794, + "grad_norm": 0.25220149546207904, + "learning_rate": 0.00013046435876074683, + "loss": 0.9649, + "step": 11632 + }, + { + "epoch": 2.4814419795221845, + "grad_norm": 0.2871539340910407, + "learning_rate": 0.00013045775539241044, + "loss": 0.9557, + "step": 11633 + }, + { + "epoch": 2.481655290102389, + "grad_norm": 0.26843181678513994, + "learning_rate": 0.00013045115145314372, + "loss": 0.9279, + "step": 11634 + }, + { + "epoch": 2.481868600682594, + "grad_norm": 0.5097407861568674, + "learning_rate": 0.00013044454694302135, + "loss": 0.9037, + "step": 11635 + }, + { + "epoch": 2.4820819112627985, + "grad_norm": 0.4308917226303862, + "learning_rate": 0.0001304379418621181, + "loss": 0.945, + "step": 11636 + }, + { + "epoch": 2.482295221843003, + "grad_norm": 0.27537494023554754, + "learning_rate": 0.00013043133621050868, + "loss": 0.9381, + "step": 11637 + }, + { + "epoch": 2.4825085324232083, + "grad_norm": 0.3027508060666453, + "learning_rate": 0.00013042472998826786, + "loss": 0.9883, + "step": 11638 + }, + { + "epoch": 2.482721843003413, + "grad_norm": 0.35871491957031504, + "learning_rate": 0.00013041812319547036, + "loss": 0.9617, + "step": 11639 + }, + { + "epoch": 2.4829351535836177, + "grad_norm": 0.39583578849349965, + "learning_rate": 0.00013041151583219094, + "loss": 0.9583, + "step": 11640 + }, + { + "epoch": 2.4831484641638224, + "grad_norm": 0.37797608347821715, + "learning_rate": 0.00013040490789850435, + "loss": 0.9237, + "step": 11641 + }, + { + "epoch": 2.4833617747440275, + "grad_norm": 0.3776006398364892, + "learning_rate": 0.00013039829939448543, + "loss": 0.9213, + "step": 11642 + }, + { + "epoch": 2.483575085324232, + "grad_norm": 0.368793969952614, + "learning_rate": 0.00013039169032020887, + "loss": 0.9482, + "step": 11643 + }, + { + "epoch": 2.483788395904437, + "grad_norm": 0.5877827034760027, + "learning_rate": 0.0001303850806757495, + "loss": 0.9252, + "step": 11644 + }, + { + "epoch": 2.4840017064846416, + "grad_norm": 0.23593068565928382, + "learning_rate": 0.0001303784704611821, + "loss": 0.9345, + "step": 11645 + }, + { + "epoch": 2.4842150170648463, + "grad_norm": 0.3378144369175435, + "learning_rate": 0.00013037185967658144, + "loss": 0.9349, + "step": 11646 + }, + { + "epoch": 2.484428327645051, + "grad_norm": 0.4637038885511424, + "learning_rate": 0.00013036524832202236, + "loss": 0.9741, + "step": 11647 + }, + { + "epoch": 2.484641638225256, + "grad_norm": 0.5978684909626507, + "learning_rate": 0.00013035863639757964, + "loss": 0.9475, + "step": 11648 + }, + { + "epoch": 2.4848549488054608, + "grad_norm": 0.6268260543563698, + "learning_rate": 0.0001303520239033281, + "loss": 0.9368, + "step": 11649 + }, + { + "epoch": 2.4850682593856654, + "grad_norm": 0.5654497047144571, + "learning_rate": 0.00013034541083934258, + "loss": 0.9029, + "step": 11650 + }, + { + "epoch": 2.48528156996587, + "grad_norm": 0.40037855971555053, + "learning_rate": 0.00013033879720569786, + "loss": 0.9383, + "step": 11651 + }, + { + "epoch": 2.4854948805460753, + "grad_norm": 0.2629537293359297, + "learning_rate": 0.0001303321830024688, + "loss": 0.9471, + "step": 11652 + }, + { + "epoch": 2.48570819112628, + "grad_norm": 0.3050475797810258, + "learning_rate": 0.0001303255682297303, + "loss": 0.9582, + "step": 11653 + }, + { + "epoch": 2.4859215017064846, + "grad_norm": 0.3812436112013426, + "learning_rate": 0.0001303189528875571, + "loss": 0.9397, + "step": 11654 + }, + { + "epoch": 2.4861348122866893, + "grad_norm": 0.5218191722109966, + "learning_rate": 0.00013031233697602412, + "loss": 0.9416, + "step": 11655 + }, + { + "epoch": 2.486348122866894, + "grad_norm": 0.5920451387655321, + "learning_rate": 0.00013030572049520616, + "loss": 0.9266, + "step": 11656 + }, + { + "epoch": 2.486561433447099, + "grad_norm": 0.5167793179632465, + "learning_rate": 0.00013029910344517818, + "loss": 0.9392, + "step": 11657 + }, + { + "epoch": 2.486774744027304, + "grad_norm": 0.3711856577103557, + "learning_rate": 0.00013029248582601495, + "loss": 0.9405, + "step": 11658 + }, + { + "epoch": 2.4869880546075085, + "grad_norm": 0.24115511617037041, + "learning_rate": 0.00013028586763779142, + "loss": 0.9341, + "step": 11659 + }, + { + "epoch": 2.487201365187713, + "grad_norm": 0.30356760948187844, + "learning_rate": 0.00013027924888058247, + "loss": 0.9454, + "step": 11660 + }, + { + "epoch": 2.4874146757679183, + "grad_norm": 0.41496151075383625, + "learning_rate": 0.00013027262955446297, + "loss": 0.9597, + "step": 11661 + }, + { + "epoch": 2.487627986348123, + "grad_norm": 0.4533249917898956, + "learning_rate": 0.0001302660096595078, + "loss": 0.9567, + "step": 11662 + }, + { + "epoch": 2.4878412969283277, + "grad_norm": 0.4857067748696157, + "learning_rate": 0.0001302593891957919, + "loss": 0.9379, + "step": 11663 + }, + { + "epoch": 2.4880546075085324, + "grad_norm": 0.44590335467476533, + "learning_rate": 0.00013025276816339018, + "loss": 0.9374, + "step": 11664 + }, + { + "epoch": 2.488267918088737, + "grad_norm": 0.3254383993549261, + "learning_rate": 0.00013024614656237748, + "loss": 0.9198, + "step": 11665 + }, + { + "epoch": 2.488481228668942, + "grad_norm": 0.22685452202104353, + "learning_rate": 0.00013023952439282884, + "loss": 0.9592, + "step": 11666 + }, + { + "epoch": 2.488694539249147, + "grad_norm": 0.2767198641421013, + "learning_rate": 0.00013023290165481912, + "loss": 0.9355, + "step": 11667 + }, + { + "epoch": 2.4889078498293515, + "grad_norm": 0.33231233690888684, + "learning_rate": 0.0001302262783484233, + "loss": 0.9272, + "step": 11668 + }, + { + "epoch": 2.489121160409556, + "grad_norm": 0.277302885188121, + "learning_rate": 0.00013021965447371624, + "loss": 0.9294, + "step": 11669 + }, + { + "epoch": 2.489334470989761, + "grad_norm": 0.2459451484390282, + "learning_rate": 0.00013021303003077302, + "loss": 0.9486, + "step": 11670 + }, + { + "epoch": 2.489547781569966, + "grad_norm": 0.24117287665528403, + "learning_rate": 0.00013020640501966847, + "loss": 0.9352, + "step": 11671 + }, + { + "epoch": 2.4897610921501707, + "grad_norm": 0.2448494625031088, + "learning_rate": 0.0001301997794404776, + "loss": 0.929, + "step": 11672 + }, + { + "epoch": 2.4899744027303754, + "grad_norm": 0.2473695303374882, + "learning_rate": 0.00013019315329327542, + "loss": 0.9459, + "step": 11673 + }, + { + "epoch": 2.49018771331058, + "grad_norm": 0.2823870155088725, + "learning_rate": 0.00013018652657813682, + "loss": 0.9393, + "step": 11674 + }, + { + "epoch": 2.4904010238907848, + "grad_norm": 0.3446551133999056, + "learning_rate": 0.00013017989929513685, + "loss": 0.9374, + "step": 11675 + }, + { + "epoch": 2.49061433447099, + "grad_norm": 0.267977620914417, + "learning_rate": 0.00013017327144435048, + "loss": 0.9189, + "step": 11676 + }, + { + "epoch": 2.4908276450511946, + "grad_norm": 0.26557886097101513, + "learning_rate": 0.0001301666430258527, + "loss": 0.947, + "step": 11677 + }, + { + "epoch": 2.4910409556313993, + "grad_norm": 0.2864051684865851, + "learning_rate": 0.0001301600140397185, + "loss": 0.9305, + "step": 11678 + }, + { + "epoch": 2.491254266211604, + "grad_norm": 0.3077168061615188, + "learning_rate": 0.0001301533844860229, + "loss": 0.9473, + "step": 11679 + }, + { + "epoch": 2.491467576791809, + "grad_norm": 0.43205699401473036, + "learning_rate": 0.00013014675436484092, + "loss": 0.9515, + "step": 11680 + }, + { + "epoch": 2.4916808873720138, + "grad_norm": 0.353940373741658, + "learning_rate": 0.00013014012367624757, + "loss": 0.9403, + "step": 11681 + }, + { + "epoch": 2.4918941979522184, + "grad_norm": 0.3013482255564326, + "learning_rate": 0.00013013349242031787, + "loss": 0.9538, + "step": 11682 + }, + { + "epoch": 2.492107508532423, + "grad_norm": 0.28301645634011735, + "learning_rate": 0.00013012686059712684, + "loss": 0.922, + "step": 11683 + }, + { + "epoch": 2.492320819112628, + "grad_norm": 0.284408110332136, + "learning_rate": 0.00013012022820674954, + "loss": 0.9412, + "step": 11684 + }, + { + "epoch": 2.492534129692833, + "grad_norm": 0.24649807872597387, + "learning_rate": 0.00013011359524926105, + "loss": 0.9454, + "step": 11685 + }, + { + "epoch": 2.4927474402730376, + "grad_norm": 0.3344543357508651, + "learning_rate": 0.00013010696172473636, + "loss": 0.9398, + "step": 11686 + }, + { + "epoch": 2.4929607508532423, + "grad_norm": 0.3963225048823308, + "learning_rate": 0.00013010032763325057, + "loss": 0.926, + "step": 11687 + }, + { + "epoch": 2.493174061433447, + "grad_norm": 0.4525659341997725, + "learning_rate": 0.00013009369297487872, + "loss": 0.9423, + "step": 11688 + }, + { + "epoch": 2.493387372013652, + "grad_norm": 0.43630447161999525, + "learning_rate": 0.00013008705774969592, + "loss": 0.9176, + "step": 11689 + }, + { + "epoch": 2.493600682593857, + "grad_norm": 0.34176959671841944, + "learning_rate": 0.0001300804219577772, + "loss": 0.9424, + "step": 11690 + }, + { + "epoch": 2.4938139931740615, + "grad_norm": 0.2799419272920821, + "learning_rate": 0.00013007378559919765, + "loss": 0.9343, + "step": 11691 + }, + { + "epoch": 2.494027303754266, + "grad_norm": 0.2552006931399463, + "learning_rate": 0.00013006714867403238, + "loss": 0.9277, + "step": 11692 + }, + { + "epoch": 2.494240614334471, + "grad_norm": 0.2713152102298248, + "learning_rate": 0.0001300605111823565, + "loss": 0.9154, + "step": 11693 + }, + { + "epoch": 2.4944539249146755, + "grad_norm": 0.2591454862093983, + "learning_rate": 0.0001300538731242451, + "loss": 0.9427, + "step": 11694 + }, + { + "epoch": 2.4946672354948807, + "grad_norm": 0.2857590802497498, + "learning_rate": 0.00013004723449977322, + "loss": 0.9074, + "step": 11695 + }, + { + "epoch": 2.4948805460750854, + "grad_norm": 0.30408856210246993, + "learning_rate": 0.00013004059530901608, + "loss": 0.948, + "step": 11696 + }, + { + "epoch": 2.49509385665529, + "grad_norm": 0.2195526430657416, + "learning_rate": 0.00013003395555204878, + "loss": 0.9432, + "step": 11697 + }, + { + "epoch": 2.4953071672354947, + "grad_norm": 0.2558160467647228, + "learning_rate": 0.00013002731522894641, + "loss": 0.9277, + "step": 11698 + }, + { + "epoch": 2.4955204778157, + "grad_norm": 0.24778074493887406, + "learning_rate": 0.00013002067433978413, + "loss": 0.9255, + "step": 11699 + }, + { + "epoch": 2.4957337883959045, + "grad_norm": 0.24974412034060894, + "learning_rate": 0.00013001403288463706, + "loss": 0.9379, + "step": 11700 + }, + { + "epoch": 2.4959470989761092, + "grad_norm": 0.26021226620732796, + "learning_rate": 0.00013000739086358042, + "loss": 0.9262, + "step": 11701 + }, + { + "epoch": 2.496160409556314, + "grad_norm": 0.2781879398794476, + "learning_rate": 0.00013000074827668927, + "loss": 0.9398, + "step": 11702 + }, + { + "epoch": 2.4963737201365186, + "grad_norm": 0.2808630887924615, + "learning_rate": 0.00012999410512403882, + "loss": 0.9553, + "step": 11703 + }, + { + "epoch": 2.4965870307167237, + "grad_norm": 0.2483666275588006, + "learning_rate": 0.00012998746140570423, + "loss": 0.9616, + "step": 11704 + }, + { + "epoch": 2.4968003412969284, + "grad_norm": 0.29085461981513866, + "learning_rate": 0.0001299808171217607, + "loss": 0.9194, + "step": 11705 + }, + { + "epoch": 2.497013651877133, + "grad_norm": 0.2860427855032712, + "learning_rate": 0.00012997417227228334, + "loss": 0.9303, + "step": 11706 + }, + { + "epoch": 2.4972269624573378, + "grad_norm": 0.3264671751535591, + "learning_rate": 0.0001299675268573474, + "loss": 0.9377, + "step": 11707 + }, + { + "epoch": 2.497440273037543, + "grad_norm": 0.31955935296077165, + "learning_rate": 0.00012996088087702804, + "loss": 0.9134, + "step": 11708 + }, + { + "epoch": 2.4976535836177476, + "grad_norm": 0.2864581156925107, + "learning_rate": 0.00012995423433140047, + "loss": 0.9187, + "step": 11709 + }, + { + "epoch": 2.4978668941979523, + "grad_norm": 0.32737762138181586, + "learning_rate": 0.00012994758722053993, + "loss": 0.9421, + "step": 11710 + }, + { + "epoch": 2.498080204778157, + "grad_norm": 0.303527178773326, + "learning_rate": 0.00012994093954452158, + "loss": 0.9254, + "step": 11711 + }, + { + "epoch": 2.4982935153583616, + "grad_norm": 0.3004220671303733, + "learning_rate": 0.00012993429130342066, + "loss": 0.9325, + "step": 11712 + }, + { + "epoch": 2.4985068259385663, + "grad_norm": 0.26228876975791743, + "learning_rate": 0.00012992764249731238, + "loss": 0.9314, + "step": 11713 + }, + { + "epoch": 2.4987201365187715, + "grad_norm": 0.2009663068559911, + "learning_rate": 0.000129920993126272, + "loss": 0.9278, + "step": 11714 + }, + { + "epoch": 2.498933447098976, + "grad_norm": 0.2574954672140854, + "learning_rate": 0.00012991434319037474, + "loss": 0.9216, + "step": 11715 + }, + { + "epoch": 2.499146757679181, + "grad_norm": 0.2793953869055069, + "learning_rate": 0.00012990769268969587, + "loss": 0.9308, + "step": 11716 + }, + { + "epoch": 2.4993600682593855, + "grad_norm": 0.2997444529188103, + "learning_rate": 0.0001299010416243106, + "loss": 0.9253, + "step": 11717 + }, + { + "epoch": 2.4995733788395906, + "grad_norm": 0.25429507152944686, + "learning_rate": 0.00012989438999429418, + "loss": 0.9365, + "step": 11718 + }, + { + "epoch": 2.4997866894197953, + "grad_norm": 0.25122845528816656, + "learning_rate": 0.00012988773779972192, + "loss": 0.8942, + "step": 11719 + }, + { + "epoch": 2.5, + "grad_norm": 0.3403210254938434, + "learning_rate": 0.0001298810850406691, + "loss": 0.9458, + "step": 11720 + }, + { + "epoch": 2.5002133105802047, + "grad_norm": 0.3220957635388021, + "learning_rate": 0.00012987443171721093, + "loss": 0.9425, + "step": 11721 + }, + { + "epoch": 2.5004266211604094, + "grad_norm": 0.45854109032788004, + "learning_rate": 0.00012986777782942273, + "loss": 0.9188, + "step": 11722 + }, + { + "epoch": 2.5006399317406145, + "grad_norm": 0.562802031921708, + "learning_rate": 0.00012986112337737976, + "loss": 0.9245, + "step": 11723 + }, + { + "epoch": 2.500853242320819, + "grad_norm": 0.5431678408795356, + "learning_rate": 0.00012985446836115736, + "loss": 0.9217, + "step": 11724 + }, + { + "epoch": 2.501066552901024, + "grad_norm": 0.4420852685333133, + "learning_rate": 0.00012984781278083083, + "loss": 0.9129, + "step": 11725 + }, + { + "epoch": 2.5012798634812285, + "grad_norm": 0.35560528798166097, + "learning_rate": 0.00012984115663647544, + "loss": 0.951, + "step": 11726 + }, + { + "epoch": 2.5014931740614337, + "grad_norm": 0.25964277422188053, + "learning_rate": 0.00012983449992816652, + "loss": 0.9463, + "step": 11727 + }, + { + "epoch": 2.5017064846416384, + "grad_norm": 0.24136840735242926, + "learning_rate": 0.00012982784265597945, + "loss": 0.8856, + "step": 11728 + }, + { + "epoch": 2.501919795221843, + "grad_norm": 0.3588865613096803, + "learning_rate": 0.00012982118481998943, + "loss": 0.9294, + "step": 11729 + }, + { + "epoch": 2.5021331058020477, + "grad_norm": 0.4614211190367957, + "learning_rate": 0.0001298145264202719, + "loss": 0.9374, + "step": 11730 + }, + { + "epoch": 2.5023464163822524, + "grad_norm": 0.46926626869157667, + "learning_rate": 0.00012980786745690214, + "loss": 0.9513, + "step": 11731 + }, + { + "epoch": 2.502559726962457, + "grad_norm": 0.40043360497112457, + "learning_rate": 0.00012980120792995556, + "loss": 0.925, + "step": 11732 + }, + { + "epoch": 2.5027730375426622, + "grad_norm": 0.3185837547926908, + "learning_rate": 0.00012979454783950745, + "loss": 0.9267, + "step": 11733 + }, + { + "epoch": 2.502986348122867, + "grad_norm": 0.26813071583777565, + "learning_rate": 0.0001297878871856332, + "loss": 0.9328, + "step": 11734 + }, + { + "epoch": 2.5031996587030716, + "grad_norm": 0.2623038256542604, + "learning_rate": 0.00012978122596840817, + "loss": 0.95, + "step": 11735 + }, + { + "epoch": 2.5034129692832767, + "grad_norm": 0.372269478424003, + "learning_rate": 0.00012977456418790772, + "loss": 0.9305, + "step": 11736 + }, + { + "epoch": 2.5036262798634814, + "grad_norm": 0.4450561097743955, + "learning_rate": 0.00012976790184420726, + "loss": 0.8925, + "step": 11737 + }, + { + "epoch": 2.503839590443686, + "grad_norm": 0.3862661465834608, + "learning_rate": 0.00012976123893738213, + "loss": 0.8975, + "step": 11738 + }, + { + "epoch": 2.5040529010238908, + "grad_norm": 0.4013160576879059, + "learning_rate": 0.00012975457546750776, + "loss": 0.9485, + "step": 11739 + }, + { + "epoch": 2.5042662116040955, + "grad_norm": 0.3696226336424469, + "learning_rate": 0.00012974791143465953, + "loss": 0.9252, + "step": 11740 + }, + { + "epoch": 2.5044795221843, + "grad_norm": 0.2616209737583105, + "learning_rate": 0.00012974124683891284, + "loss": 1.0024, + "step": 11741 + }, + { + "epoch": 2.5046928327645053, + "grad_norm": 0.28074745203311063, + "learning_rate": 0.0001297345816803431, + "loss": 0.9394, + "step": 11742 + }, + { + "epoch": 2.50490614334471, + "grad_norm": 0.319244707039136, + "learning_rate": 0.00012972791595902575, + "loss": 0.937, + "step": 11743 + }, + { + "epoch": 2.5051194539249146, + "grad_norm": 0.26345150587075644, + "learning_rate": 0.0001297212496750362, + "loss": 0.9358, + "step": 11744 + }, + { + "epoch": 2.5053327645051193, + "grad_norm": 0.2938908325175185, + "learning_rate": 0.00012971458282844985, + "loss": 0.9873, + "step": 11745 + }, + { + "epoch": 2.5055460750853245, + "grad_norm": 0.32742876537236154, + "learning_rate": 0.00012970791541934217, + "loss": 0.948, + "step": 11746 + }, + { + "epoch": 2.505759385665529, + "grad_norm": 0.2865286302342467, + "learning_rate": 0.00012970124744778857, + "loss": 0.9206, + "step": 11747 + }, + { + "epoch": 2.505972696245734, + "grad_norm": 0.2597914328889434, + "learning_rate": 0.00012969457891386457, + "loss": 0.9295, + "step": 11748 + }, + { + "epoch": 2.5061860068259385, + "grad_norm": 0.24727992738812188, + "learning_rate": 0.0001296879098176455, + "loss": 0.9274, + "step": 11749 + }, + { + "epoch": 2.506399317406143, + "grad_norm": 0.22220836602849245, + "learning_rate": 0.00012968124015920695, + "loss": 0.9344, + "step": 11750 + }, + { + "epoch": 2.506612627986348, + "grad_norm": 0.21296427671980356, + "learning_rate": 0.00012967456993862432, + "loss": 0.9392, + "step": 11751 + }, + { + "epoch": 2.506825938566553, + "grad_norm": 0.21172955827260276, + "learning_rate": 0.00012966789915597307, + "loss": 0.9405, + "step": 11752 + }, + { + "epoch": 2.5070392491467577, + "grad_norm": 0.23448493831660075, + "learning_rate": 0.00012966122781132877, + "loss": 0.9287, + "step": 11753 + }, + { + "epoch": 2.5072525597269624, + "grad_norm": 0.21621894189081461, + "learning_rate": 0.0001296545559047668, + "loss": 0.94, + "step": 11754 + }, + { + "epoch": 2.5074658703071675, + "grad_norm": 0.2978187584005699, + "learning_rate": 0.00012964788343636268, + "loss": 0.9456, + "step": 11755 + }, + { + "epoch": 2.507679180887372, + "grad_norm": 0.3086984720958158, + "learning_rate": 0.00012964121040619193, + "loss": 0.9262, + "step": 11756 + }, + { + "epoch": 2.507892491467577, + "grad_norm": 0.29377013150244535, + "learning_rate": 0.00012963453681433007, + "loss": 0.9477, + "step": 11757 + }, + { + "epoch": 2.5081058020477816, + "grad_norm": 0.2707510943765743, + "learning_rate": 0.00012962786266085258, + "loss": 0.949, + "step": 11758 + }, + { + "epoch": 2.5083191126279862, + "grad_norm": 0.2729684623294394, + "learning_rate": 0.00012962118794583496, + "loss": 0.906, + "step": 11759 + }, + { + "epoch": 2.508532423208191, + "grad_norm": 0.2928498208190971, + "learning_rate": 0.00012961451266935283, + "loss": 0.9227, + "step": 11760 + }, + { + "epoch": 2.508745733788396, + "grad_norm": 0.30877197697315156, + "learning_rate": 0.0001296078368314816, + "loss": 0.9252, + "step": 11761 + }, + { + "epoch": 2.5089590443686007, + "grad_norm": 0.3994375882415709, + "learning_rate": 0.00012960116043229686, + "loss": 0.947, + "step": 11762 + }, + { + "epoch": 2.5091723549488054, + "grad_norm": 0.31702681207845024, + "learning_rate": 0.00012959448347187418, + "loss": 0.9252, + "step": 11763 + }, + { + "epoch": 2.50938566552901, + "grad_norm": 0.2346374663755104, + "learning_rate": 0.0001295878059502891, + "loss": 0.9626, + "step": 11764 + }, + { + "epoch": 2.5095989761092152, + "grad_norm": 0.25380014916652344, + "learning_rate": 0.00012958112786761713, + "loss": 0.9462, + "step": 11765 + }, + { + "epoch": 2.50981228668942, + "grad_norm": 0.25697697469703673, + "learning_rate": 0.00012957444922393388, + "loss": 0.9172, + "step": 11766 + }, + { + "epoch": 2.5100255972696246, + "grad_norm": 0.3088267578671693, + "learning_rate": 0.0001295677700193149, + "loss": 0.9237, + "step": 11767 + }, + { + "epoch": 2.5102389078498293, + "grad_norm": 0.40771949614894953, + "learning_rate": 0.00012956109025383578, + "loss": 0.9549, + "step": 11768 + }, + { + "epoch": 2.510452218430034, + "grad_norm": 0.5005843972326206, + "learning_rate": 0.00012955440992757204, + "loss": 0.9548, + "step": 11769 + }, + { + "epoch": 2.5106655290102387, + "grad_norm": 0.47394521867030664, + "learning_rate": 0.00012954772904059939, + "loss": 0.936, + "step": 11770 + }, + { + "epoch": 2.510878839590444, + "grad_norm": 0.7116806030082784, + "learning_rate": 0.00012954104759299332, + "loss": 0.9587, + "step": 11771 + }, + { + "epoch": 2.5110921501706485, + "grad_norm": 0.27355398546321086, + "learning_rate": 0.00012953436558482947, + "loss": 0.9157, + "step": 11772 + }, + { + "epoch": 2.511305460750853, + "grad_norm": 0.23703820487784613, + "learning_rate": 0.0001295276830161834, + "loss": 0.958, + "step": 11773 + }, + { + "epoch": 2.5115187713310583, + "grad_norm": 0.2706690691821656, + "learning_rate": 0.00012952099988713082, + "loss": 0.925, + "step": 11774 + }, + { + "epoch": 2.511732081911263, + "grad_norm": 0.36708121235471586, + "learning_rate": 0.00012951431619774725, + "loss": 0.9299, + "step": 11775 + }, + { + "epoch": 2.5119453924914676, + "grad_norm": 0.43115726357311956, + "learning_rate": 0.00012950763194810838, + "loss": 0.9448, + "step": 11776 + }, + { + "epoch": 2.5121587030716723, + "grad_norm": 0.3270255382572303, + "learning_rate": 0.00012950094713828984, + "loss": 0.9156, + "step": 11777 + }, + { + "epoch": 2.512372013651877, + "grad_norm": 0.31971170598832854, + "learning_rate": 0.00012949426176836723, + "loss": 0.9384, + "step": 11778 + }, + { + "epoch": 2.5125853242320817, + "grad_norm": 0.3732367460665823, + "learning_rate": 0.0001294875758384162, + "loss": 0.9663, + "step": 11779 + }, + { + "epoch": 2.512798634812287, + "grad_norm": 1.2528057431991275, + "learning_rate": 0.00012948088934851244, + "loss": 0.9835, + "step": 11780 + }, + { + "epoch": 2.5130119453924915, + "grad_norm": 0.2959262470270707, + "learning_rate": 0.00012947420229873156, + "loss": 0.9748, + "step": 11781 + }, + { + "epoch": 2.513225255972696, + "grad_norm": 0.3952909555797023, + "learning_rate": 0.00012946751468914926, + "loss": 0.953, + "step": 11782 + }, + { + "epoch": 2.513438566552901, + "grad_norm": 0.4077107539982061, + "learning_rate": 0.00012946082651984123, + "loss": 0.9658, + "step": 11783 + }, + { + "epoch": 2.513651877133106, + "grad_norm": 0.3146602156812882, + "learning_rate": 0.00012945413779088307, + "loss": 0.9328, + "step": 11784 + }, + { + "epoch": 2.5138651877133107, + "grad_norm": 0.3431699963476126, + "learning_rate": 0.00012944744850235056, + "loss": 0.9568, + "step": 11785 + }, + { + "epoch": 2.5140784982935154, + "grad_norm": 0.3675823385886179, + "learning_rate": 0.00012944075865431928, + "loss": 0.9569, + "step": 11786 + }, + { + "epoch": 2.51429180887372, + "grad_norm": 0.27711644179862877, + "learning_rate": 0.00012943406824686502, + "loss": 0.9319, + "step": 11787 + }, + { + "epoch": 2.5145051194539247, + "grad_norm": 0.3606087290507577, + "learning_rate": 0.00012942737728006344, + "loss": 0.9846, + "step": 11788 + }, + { + "epoch": 2.5147184300341294, + "grad_norm": 0.38717098116375437, + "learning_rate": 0.00012942068575399023, + "loss": 0.9289, + "step": 11789 + }, + { + "epoch": 2.5149317406143346, + "grad_norm": 0.3472585344540951, + "learning_rate": 0.00012941399366872116, + "loss": 0.9325, + "step": 11790 + }, + { + "epoch": 2.5151450511945392, + "grad_norm": 0.3750010572911918, + "learning_rate": 0.00012940730102433193, + "loss": 0.9705, + "step": 11791 + }, + { + "epoch": 2.515358361774744, + "grad_norm": 0.3192815881018217, + "learning_rate": 0.00012940060782089824, + "loss": 0.9472, + "step": 11792 + }, + { + "epoch": 2.515571672354949, + "grad_norm": 0.3145790171734665, + "learning_rate": 0.00012939391405849586, + "loss": 0.9334, + "step": 11793 + }, + { + "epoch": 2.5157849829351537, + "grad_norm": 0.29028108602708624, + "learning_rate": 0.00012938721973720048, + "loss": 0.9323, + "step": 11794 + }, + { + "epoch": 2.5159982935153584, + "grad_norm": 0.26469804822457843, + "learning_rate": 0.0001293805248570879, + "loss": 0.932, + "step": 11795 + }, + { + "epoch": 2.516211604095563, + "grad_norm": 0.23893749758573765, + "learning_rate": 0.00012937382941823384, + "loss": 0.9682, + "step": 11796 + }, + { + "epoch": 2.516424914675768, + "grad_norm": 0.3018205879295098, + "learning_rate": 0.0001293671334207141, + "loss": 0.9483, + "step": 11797 + }, + { + "epoch": 2.5166382252559725, + "grad_norm": 0.27676871249838286, + "learning_rate": 0.00012936043686460436, + "loss": 0.9809, + "step": 11798 + }, + { + "epoch": 2.5168515358361776, + "grad_norm": 0.22760593949391236, + "learning_rate": 0.00012935373974998048, + "loss": 0.9122, + "step": 11799 + }, + { + "epoch": 2.5170648464163823, + "grad_norm": 0.26198378936513084, + "learning_rate": 0.0001293470420769182, + "loss": 0.9355, + "step": 11800 + }, + { + "epoch": 2.517278156996587, + "grad_norm": 0.24362465011950857, + "learning_rate": 0.0001293403438454933, + "loss": 0.9488, + "step": 11801 + }, + { + "epoch": 2.5174914675767917, + "grad_norm": 0.22649432509286826, + "learning_rate": 0.00012933364505578158, + "loss": 0.982, + "step": 11802 + }, + { + "epoch": 2.517704778156997, + "grad_norm": 0.27777083308838696, + "learning_rate": 0.00012932694570785884, + "loss": 0.9817, + "step": 11803 + }, + { + "epoch": 2.5179180887372015, + "grad_norm": 0.2856549761312194, + "learning_rate": 0.0001293202458018009, + "loss": 0.9489, + "step": 11804 + }, + { + "epoch": 2.518131399317406, + "grad_norm": 0.2533798458558085, + "learning_rate": 0.00012931354533768353, + "loss": 0.9537, + "step": 11805 + }, + { + "epoch": 2.518344709897611, + "grad_norm": 0.24319755041599395, + "learning_rate": 0.00012930684431558258, + "loss": 0.9685, + "step": 11806 + }, + { + "epoch": 2.5185580204778155, + "grad_norm": 0.23910013785748613, + "learning_rate": 0.00012930014273557385, + "loss": 0.93, + "step": 11807 + }, + { + "epoch": 2.51877133105802, + "grad_norm": 0.31555923385824186, + "learning_rate": 0.00012929344059773315, + "loss": 0.9604, + "step": 11808 + }, + { + "epoch": 2.5189846416382253, + "grad_norm": 0.34303621456251426, + "learning_rate": 0.00012928673790213636, + "loss": 0.9459, + "step": 11809 + }, + { + "epoch": 2.51919795221843, + "grad_norm": 0.3241380234246244, + "learning_rate": 0.0001292800346488593, + "loss": 0.9254, + "step": 11810 + }, + { + "epoch": 2.5194112627986347, + "grad_norm": 0.3397402938949176, + "learning_rate": 0.00012927333083797784, + "loss": 0.937, + "step": 11811 + }, + { + "epoch": 2.51962457337884, + "grad_norm": 0.2852599915933548, + "learning_rate": 0.0001292666264695678, + "loss": 0.9128, + "step": 11812 + }, + { + "epoch": 2.5198378839590445, + "grad_norm": 0.26608384799793894, + "learning_rate": 0.000129259921543705, + "loss": 0.9248, + "step": 11813 + }, + { + "epoch": 2.520051194539249, + "grad_norm": 0.2618262782600106, + "learning_rate": 0.00012925321606046543, + "loss": 0.9483, + "step": 11814 + }, + { + "epoch": 2.520264505119454, + "grad_norm": 0.24581821872740453, + "learning_rate": 0.00012924651001992488, + "loss": 0.9347, + "step": 11815 + }, + { + "epoch": 2.5204778156996586, + "grad_norm": 0.24889766803838909, + "learning_rate": 0.00012923980342215922, + "loss": 0.9826, + "step": 11816 + }, + { + "epoch": 2.5206911262798632, + "grad_norm": 0.2917236883649356, + "learning_rate": 0.00012923309626724437, + "loss": 0.9651, + "step": 11817 + }, + { + "epoch": 2.5209044368600684, + "grad_norm": 0.355740826656596, + "learning_rate": 0.0001292263885552562, + "loss": 0.9423, + "step": 11818 + }, + { + "epoch": 2.521117747440273, + "grad_norm": 0.38576514416344054, + "learning_rate": 0.00012921968028627062, + "loss": 0.9316, + "step": 11819 + }, + { + "epoch": 2.5213310580204777, + "grad_norm": 0.3149700637278327, + "learning_rate": 0.00012921297146036354, + "loss": 0.9405, + "step": 11820 + }, + { + "epoch": 2.5215443686006824, + "grad_norm": 0.23254786430252358, + "learning_rate": 0.00012920626207761084, + "loss": 0.9658, + "step": 11821 + }, + { + "epoch": 2.5217576791808876, + "grad_norm": 0.3159448500033789, + "learning_rate": 0.00012919955213808846, + "loss": 0.9388, + "step": 11822 + }, + { + "epoch": 2.5219709897610922, + "grad_norm": 0.3484865927902253, + "learning_rate": 0.00012919284164187233, + "loss": 0.951, + "step": 11823 + }, + { + "epoch": 2.522184300341297, + "grad_norm": 0.4194399438461566, + "learning_rate": 0.00012918613058903835, + "loss": 0.9448, + "step": 11824 + }, + { + "epoch": 2.5223976109215016, + "grad_norm": 0.445875538681887, + "learning_rate": 0.0001291794189796625, + "loss": 0.9304, + "step": 11825 + }, + { + "epoch": 2.5226109215017063, + "grad_norm": 0.3829087478947788, + "learning_rate": 0.00012917270681382068, + "loss": 0.9412, + "step": 11826 + }, + { + "epoch": 2.5228242320819114, + "grad_norm": 0.31578768211451874, + "learning_rate": 0.0001291659940915889, + "loss": 0.9291, + "step": 11827 + }, + { + "epoch": 2.523037542662116, + "grad_norm": 0.26479760986325385, + "learning_rate": 0.000129159280813043, + "loss": 0.9357, + "step": 11828 + }, + { + "epoch": 2.523250853242321, + "grad_norm": 0.2886412207892155, + "learning_rate": 0.00012915256697825905, + "loss": 0.9141, + "step": 11829 + }, + { + "epoch": 2.5234641638225255, + "grad_norm": 0.3044884955898092, + "learning_rate": 0.00012914585258731298, + "loss": 0.9682, + "step": 11830 + }, + { + "epoch": 2.5236774744027306, + "grad_norm": 0.26162630906729256, + "learning_rate": 0.00012913913764028075, + "loss": 0.9653, + "step": 11831 + }, + { + "epoch": 2.5238907849829353, + "grad_norm": 0.3530357299341937, + "learning_rate": 0.00012913242213723834, + "loss": 0.9638, + "step": 11832 + }, + { + "epoch": 2.52410409556314, + "grad_norm": 0.34765685649920613, + "learning_rate": 0.00012912570607826176, + "loss": 0.9226, + "step": 11833 + }, + { + "epoch": 2.5243174061433447, + "grad_norm": 0.25644041031884496, + "learning_rate": 0.000129118989463427, + "loss": 0.9191, + "step": 11834 + }, + { + "epoch": 2.5245307167235493, + "grad_norm": 0.23263699148296427, + "learning_rate": 0.00012911227229281004, + "loss": 0.9139, + "step": 11835 + }, + { + "epoch": 2.524744027303754, + "grad_norm": 0.2654767468663005, + "learning_rate": 0.00012910555456648688, + "loss": 0.9285, + "step": 11836 + }, + { + "epoch": 2.524957337883959, + "grad_norm": 0.24387553935356535, + "learning_rate": 0.00012909883628453356, + "loss": 0.9016, + "step": 11837 + }, + { + "epoch": 2.525170648464164, + "grad_norm": 0.25182094671740235, + "learning_rate": 0.00012909211744702607, + "loss": 0.9344, + "step": 11838 + }, + { + "epoch": 2.5253839590443685, + "grad_norm": 0.25663594853089183, + "learning_rate": 0.00012908539805404044, + "loss": 0.9513, + "step": 11839 + }, + { + "epoch": 2.5255972696245736, + "grad_norm": 0.2931691331032213, + "learning_rate": 0.0001290786781056527, + "loss": 0.9363, + "step": 11840 + }, + { + "epoch": 2.5258105802047783, + "grad_norm": 0.30355853070693933, + "learning_rate": 0.00012907195760193893, + "loss": 0.9668, + "step": 11841 + }, + { + "epoch": 2.526023890784983, + "grad_norm": 0.3755022234104581, + "learning_rate": 0.0001290652365429751, + "loss": 0.9818, + "step": 11842 + }, + { + "epoch": 2.5262372013651877, + "grad_norm": 0.3721354862087407, + "learning_rate": 0.00012905851492883735, + "loss": 0.9235, + "step": 11843 + }, + { + "epoch": 2.5264505119453924, + "grad_norm": 0.3585151656336189, + "learning_rate": 0.0001290517927596016, + "loss": 0.946, + "step": 11844 + }, + { + "epoch": 2.526663822525597, + "grad_norm": 0.38030754082845813, + "learning_rate": 0.00012904507003534403, + "loss": 0.9464, + "step": 11845 + }, + { + "epoch": 2.526877133105802, + "grad_norm": 0.3734846413834143, + "learning_rate": 0.00012903834675614067, + "loss": 0.957, + "step": 11846 + }, + { + "epoch": 2.527090443686007, + "grad_norm": 0.28167847400945983, + "learning_rate": 0.00012903162292206757, + "loss": 0.9434, + "step": 11847 + }, + { + "epoch": 2.5273037542662116, + "grad_norm": 0.22428103974162744, + "learning_rate": 0.00012902489853320085, + "loss": 0.9528, + "step": 11848 + }, + { + "epoch": 2.5275170648464163, + "grad_norm": 0.2929061688656991, + "learning_rate": 0.0001290181735896166, + "loss": 0.886, + "step": 11849 + }, + { + "epoch": 2.5277303754266214, + "grad_norm": 0.26441676873392006, + "learning_rate": 0.00012901144809139084, + "loss": 0.9558, + "step": 11850 + }, + { + "epoch": 2.527943686006826, + "grad_norm": 0.29608286345454027, + "learning_rate": 0.00012900472203859975, + "loss": 0.9337, + "step": 11851 + }, + { + "epoch": 2.5281569965870307, + "grad_norm": 0.27919512892132975, + "learning_rate": 0.0001289979954313194, + "loss": 0.9256, + "step": 11852 + }, + { + "epoch": 2.5283703071672354, + "grad_norm": 0.2260324307958154, + "learning_rate": 0.0001289912682696259, + "loss": 0.9376, + "step": 11853 + }, + { + "epoch": 2.52858361774744, + "grad_norm": 0.2500162471427124, + "learning_rate": 0.00012898454055359539, + "loss": 0.9227, + "step": 11854 + }, + { + "epoch": 2.528796928327645, + "grad_norm": 0.2903976137371143, + "learning_rate": 0.000128977812283304, + "loss": 0.9534, + "step": 11855 + }, + { + "epoch": 2.52901023890785, + "grad_norm": 0.3121791890311102, + "learning_rate": 0.0001289710834588278, + "loss": 0.9328, + "step": 11856 + }, + { + "epoch": 2.5292235494880546, + "grad_norm": 0.3262327966688357, + "learning_rate": 0.00012896435408024296, + "loss": 0.9555, + "step": 11857 + }, + { + "epoch": 2.5294368600682593, + "grad_norm": 0.360674400398844, + "learning_rate": 0.00012895762414762566, + "loss": 0.9186, + "step": 11858 + }, + { + "epoch": 2.5296501706484644, + "grad_norm": 0.30859514313937303, + "learning_rate": 0.00012895089366105202, + "loss": 0.9228, + "step": 11859 + }, + { + "epoch": 2.529863481228669, + "grad_norm": 0.2251224194859416, + "learning_rate": 0.00012894416262059817, + "loss": 0.9245, + "step": 11860 + }, + { + "epoch": 2.530076791808874, + "grad_norm": 0.2971162850639062, + "learning_rate": 0.00012893743102634033, + "loss": 0.9243, + "step": 11861 + }, + { + "epoch": 2.5302901023890785, + "grad_norm": 0.4270150211051266, + "learning_rate": 0.00012893069887835463, + "loss": 0.9393, + "step": 11862 + }, + { + "epoch": 2.530503412969283, + "grad_norm": 0.45364874490715595, + "learning_rate": 0.00012892396617671722, + "loss": 0.957, + "step": 11863 + }, + { + "epoch": 2.530716723549488, + "grad_norm": 0.35478372693040255, + "learning_rate": 0.0001289172329215043, + "loss": 0.9212, + "step": 11864 + }, + { + "epoch": 2.530930034129693, + "grad_norm": 0.35362909243276286, + "learning_rate": 0.00012891049911279212, + "loss": 0.9673, + "step": 11865 + }, + { + "epoch": 2.5311433447098977, + "grad_norm": 0.3396404330201334, + "learning_rate": 0.0001289037647506568, + "loss": 0.9327, + "step": 11866 + }, + { + "epoch": 2.5313566552901023, + "grad_norm": 0.24935431545603287, + "learning_rate": 0.00012889702983517456, + "loss": 0.9493, + "step": 11867 + }, + { + "epoch": 2.531569965870307, + "grad_norm": 0.21285922035493052, + "learning_rate": 0.00012889029436642159, + "loss": 0.92, + "step": 11868 + }, + { + "epoch": 2.531783276450512, + "grad_norm": 0.34300168577544166, + "learning_rate": 0.0001288835583444741, + "loss": 0.937, + "step": 11869 + }, + { + "epoch": 2.531996587030717, + "grad_norm": 0.3369783287000769, + "learning_rate": 0.00012887682176940837, + "loss": 0.9519, + "step": 11870 + }, + { + "epoch": 2.5322098976109215, + "grad_norm": 0.23837553757271154, + "learning_rate": 0.00012887008464130056, + "loss": 0.9354, + "step": 11871 + }, + { + "epoch": 2.532423208191126, + "grad_norm": 0.29491397554262816, + "learning_rate": 0.0001288633469602269, + "loss": 0.9308, + "step": 11872 + }, + { + "epoch": 2.532636518771331, + "grad_norm": 0.3801627503934371, + "learning_rate": 0.00012885660872626368, + "loss": 0.9329, + "step": 11873 + }, + { + "epoch": 2.5328498293515356, + "grad_norm": 0.312633231138147, + "learning_rate": 0.0001288498699394871, + "loss": 0.9203, + "step": 11874 + }, + { + "epoch": 2.5330631399317407, + "grad_norm": 0.3058793176948332, + "learning_rate": 0.0001288431305999734, + "loss": 0.9561, + "step": 11875 + }, + { + "epoch": 2.5332764505119454, + "grad_norm": 0.34786220426033193, + "learning_rate": 0.00012883639070779886, + "loss": 0.9408, + "step": 11876 + }, + { + "epoch": 2.53348976109215, + "grad_norm": 0.31096497661939604, + "learning_rate": 0.00012882965026303977, + "loss": 0.9405, + "step": 11877 + }, + { + "epoch": 2.533703071672355, + "grad_norm": 0.6014018828425908, + "learning_rate": 0.00012882290926577236, + "loss": 0.9621, + "step": 11878 + }, + { + "epoch": 2.53391638225256, + "grad_norm": 0.2641394331490311, + "learning_rate": 0.00012881616771607288, + "loss": 0.9571, + "step": 11879 + }, + { + "epoch": 2.5341296928327646, + "grad_norm": 0.26674650923453386, + "learning_rate": 0.00012880942561401766, + "loss": 0.9239, + "step": 11880 + }, + { + "epoch": 2.5343430034129693, + "grad_norm": 0.290682094363843, + "learning_rate": 0.00012880268295968297, + "loss": 0.9507, + "step": 11881 + }, + { + "epoch": 2.534556313993174, + "grad_norm": 0.26917195089206425, + "learning_rate": 0.00012879593975314508, + "loss": 0.9173, + "step": 11882 + }, + { + "epoch": 2.5347696245733786, + "grad_norm": 0.3544555143144964, + "learning_rate": 0.00012878919599448033, + "loss": 0.9625, + "step": 11883 + }, + { + "epoch": 2.5349829351535837, + "grad_norm": 0.4284419389669162, + "learning_rate": 0.00012878245168376502, + "loss": 0.9451, + "step": 11884 + }, + { + "epoch": 2.5351962457337884, + "grad_norm": 0.391765295537645, + "learning_rate": 0.00012877570682107544, + "loss": 0.9217, + "step": 11885 + }, + { + "epoch": 2.535409556313993, + "grad_norm": 0.333374914970993, + "learning_rate": 0.0001287689614064879, + "loss": 0.9309, + "step": 11886 + }, + { + "epoch": 2.535622866894198, + "grad_norm": 0.23351095472219574, + "learning_rate": 0.00012876221544007874, + "loss": 0.9529, + "step": 11887 + }, + { + "epoch": 2.535836177474403, + "grad_norm": 0.3369451547856852, + "learning_rate": 0.00012875546892192433, + "loss": 0.9266, + "step": 11888 + }, + { + "epoch": 2.5360494880546076, + "grad_norm": 0.44089118785181547, + "learning_rate": 0.00012874872185210091, + "loss": 0.9296, + "step": 11889 + }, + { + "epoch": 2.5362627986348123, + "grad_norm": 0.32974434978811906, + "learning_rate": 0.00012874197423068493, + "loss": 0.921, + "step": 11890 + }, + { + "epoch": 2.536476109215017, + "grad_norm": 0.2886172338997996, + "learning_rate": 0.0001287352260577527, + "loss": 0.932, + "step": 11891 + }, + { + "epoch": 2.5366894197952217, + "grad_norm": 0.33481044450161984, + "learning_rate": 0.00012872847733338052, + "loss": 0.9038, + "step": 11892 + }, + { + "epoch": 2.5369027303754264, + "grad_norm": 0.3843705240853804, + "learning_rate": 0.00012872172805764481, + "loss": 0.9375, + "step": 11893 + }, + { + "epoch": 2.5371160409556315, + "grad_norm": 0.32925387632080894, + "learning_rate": 0.00012871497823062198, + "loss": 0.9104, + "step": 11894 + }, + { + "epoch": 2.537329351535836, + "grad_norm": 0.28182190291294285, + "learning_rate": 0.0001287082278523883, + "loss": 0.9613, + "step": 11895 + }, + { + "epoch": 2.537542662116041, + "grad_norm": 0.2695641309406954, + "learning_rate": 0.0001287014769230202, + "loss": 0.9334, + "step": 11896 + }, + { + "epoch": 2.537755972696246, + "grad_norm": 0.2996351059070023, + "learning_rate": 0.0001286947254425941, + "loss": 0.9644, + "step": 11897 + }, + { + "epoch": 2.5379692832764507, + "grad_norm": 0.29104274855646906, + "learning_rate": 0.00012868797341118634, + "loss": 0.9315, + "step": 11898 + }, + { + "epoch": 2.5381825938566553, + "grad_norm": 0.23769078569846108, + "learning_rate": 0.00012868122082887335, + "loss": 0.9482, + "step": 11899 + }, + { + "epoch": 2.53839590443686, + "grad_norm": 0.2825676275987105, + "learning_rate": 0.00012867446769573154, + "loss": 0.9255, + "step": 11900 + }, + { + "epoch": 2.5386092150170647, + "grad_norm": 0.36900515991648664, + "learning_rate": 0.00012866771401183726, + "loss": 0.9427, + "step": 11901 + }, + { + "epoch": 2.5388225255972694, + "grad_norm": 0.4499272707617353, + "learning_rate": 0.00012866095977726703, + "loss": 0.9454, + "step": 11902 + }, + { + "epoch": 2.5390358361774745, + "grad_norm": 0.4465788725865524, + "learning_rate": 0.0001286542049920972, + "loss": 0.9174, + "step": 11903 + }, + { + "epoch": 2.539249146757679, + "grad_norm": 0.39899301019005773, + "learning_rate": 0.00012864744965640425, + "loss": 0.9324, + "step": 11904 + }, + { + "epoch": 2.539462457337884, + "grad_norm": 0.3646524198814769, + "learning_rate": 0.00012864069377026453, + "loss": 0.9057, + "step": 11905 + }, + { + "epoch": 2.5396757679180886, + "grad_norm": 0.47744079111405824, + "learning_rate": 0.0001286339373337546, + "loss": 0.9634, + "step": 11906 + }, + { + "epoch": 2.5398890784982937, + "grad_norm": 0.4026625718643134, + "learning_rate": 0.00012862718034695083, + "loss": 0.9334, + "step": 11907 + }, + { + "epoch": 2.5401023890784984, + "grad_norm": 0.7020850640227755, + "learning_rate": 0.0001286204228099297, + "loss": 0.9322, + "step": 11908 + }, + { + "epoch": 2.540315699658703, + "grad_norm": 0.8316891369069618, + "learning_rate": 0.00012861366472276766, + "loss": 0.9241, + "step": 11909 + }, + { + "epoch": 2.5405290102389078, + "grad_norm": 0.6860922970525989, + "learning_rate": 0.00012860690608554117, + "loss": 0.9284, + "step": 11910 + }, + { + "epoch": 2.5407423208191124, + "grad_norm": 0.5346250982258445, + "learning_rate": 0.00012860014689832676, + "loss": 0.929, + "step": 11911 + }, + { + "epoch": 2.5409556313993176, + "grad_norm": 0.4156666766052098, + "learning_rate": 0.00012859338716120084, + "loss": 0.961, + "step": 11912 + }, + { + "epoch": 2.5411689419795223, + "grad_norm": 0.2553183325737704, + "learning_rate": 0.00012858662687423994, + "loss": 0.9323, + "step": 11913 + }, + { + "epoch": 2.541382252559727, + "grad_norm": 0.2822452768568878, + "learning_rate": 0.00012857986603752054, + "loss": 0.9041, + "step": 11914 + }, + { + "epoch": 2.5415955631399316, + "grad_norm": 0.3257608072783993, + "learning_rate": 0.00012857310465111911, + "loss": 0.9392, + "step": 11915 + }, + { + "epoch": 2.5418088737201368, + "grad_norm": 0.3231415319314085, + "learning_rate": 0.00012856634271511222, + "loss": 0.954, + "step": 11916 + }, + { + "epoch": 2.5420221843003414, + "grad_norm": 0.3793201473979316, + "learning_rate": 0.00012855958022957634, + "loss": 0.9189, + "step": 11917 + }, + { + "epoch": 2.542235494880546, + "grad_norm": 0.311236357611308, + "learning_rate": 0.000128552817194588, + "loss": 0.9199, + "step": 11918 + }, + { + "epoch": 2.542448805460751, + "grad_norm": 0.2739292610376794, + "learning_rate": 0.0001285460536102237, + "loss": 0.9596, + "step": 11919 + }, + { + "epoch": 2.5426621160409555, + "grad_norm": 0.2922578211553025, + "learning_rate": 0.00012853928947656, + "loss": 0.9549, + "step": 11920 + }, + { + "epoch": 2.54287542662116, + "grad_norm": 0.20942136344684795, + "learning_rate": 0.00012853252479367338, + "loss": 0.9609, + "step": 11921 + }, + { + "epoch": 2.5430887372013653, + "grad_norm": 0.29807306425509805, + "learning_rate": 0.0001285257595616405, + "loss": 0.939, + "step": 11922 + }, + { + "epoch": 2.54330204778157, + "grad_norm": 0.2128984956670255, + "learning_rate": 0.00012851899378053778, + "loss": 0.942, + "step": 11923 + }, + { + "epoch": 2.5435153583617747, + "grad_norm": 0.21649693430470202, + "learning_rate": 0.00012851222745044184, + "loss": 0.9173, + "step": 11924 + }, + { + "epoch": 2.54372866894198, + "grad_norm": 0.23424409062000423, + "learning_rate": 0.00012850546057142922, + "loss": 0.9642, + "step": 11925 + }, + { + "epoch": 2.5439419795221845, + "grad_norm": 0.22266806607205708, + "learning_rate": 0.00012849869314357653, + "loss": 0.9193, + "step": 11926 + }, + { + "epoch": 2.544155290102389, + "grad_norm": 0.2593694849641467, + "learning_rate": 0.0001284919251669603, + "loss": 0.944, + "step": 11927 + }, + { + "epoch": 2.544368600682594, + "grad_norm": 0.2183093521012716, + "learning_rate": 0.0001284851566416571, + "loss": 0.9354, + "step": 11928 + }, + { + "epoch": 2.5445819112627985, + "grad_norm": 0.22594715031453483, + "learning_rate": 0.00012847838756774356, + "loss": 0.9424, + "step": 11929 + }, + { + "epoch": 2.544795221843003, + "grad_norm": 0.29870985339043965, + "learning_rate": 0.00012847161794529623, + "loss": 0.9421, + "step": 11930 + }, + { + "epoch": 2.5450085324232083, + "grad_norm": 0.3478842629902631, + "learning_rate": 0.00012846484777439174, + "loss": 0.9122, + "step": 11931 + }, + { + "epoch": 2.545221843003413, + "grad_norm": 0.35332107836511, + "learning_rate": 0.00012845807705510668, + "loss": 0.9255, + "step": 11932 + }, + { + "epoch": 2.5454351535836177, + "grad_norm": 0.2930974867585264, + "learning_rate": 0.00012845130578751767, + "loss": 0.9146, + "step": 11933 + }, + { + "epoch": 2.5456484641638224, + "grad_norm": 0.25653712121440486, + "learning_rate": 0.0001284445339717013, + "loss": 0.9117, + "step": 11934 + }, + { + "epoch": 2.5458617747440275, + "grad_norm": 0.24419930571369872, + "learning_rate": 0.00012843776160773425, + "loss": 0.9369, + "step": 11935 + }, + { + "epoch": 2.546075085324232, + "grad_norm": 0.3022475031627696, + "learning_rate": 0.00012843098869569307, + "loss": 0.9429, + "step": 11936 + }, + { + "epoch": 2.546288395904437, + "grad_norm": 0.3470800248962145, + "learning_rate": 0.00012842421523565447, + "loss": 0.9312, + "step": 11937 + }, + { + "epoch": 2.5465017064846416, + "grad_norm": 0.4024023725011713, + "learning_rate": 0.00012841744122769503, + "loss": 0.9431, + "step": 11938 + }, + { + "epoch": 2.5467150170648463, + "grad_norm": 0.43569112496041923, + "learning_rate": 0.00012841066667189146, + "loss": 0.9454, + "step": 11939 + }, + { + "epoch": 2.546928327645051, + "grad_norm": 0.33394518394691086, + "learning_rate": 0.00012840389156832036, + "loss": 0.9518, + "step": 11940 + }, + { + "epoch": 2.547141638225256, + "grad_norm": 0.30537238121006105, + "learning_rate": 0.00012839711591705843, + "loss": 0.9324, + "step": 11941 + }, + { + "epoch": 2.5473549488054608, + "grad_norm": 0.2660848480992014, + "learning_rate": 0.0001283903397181823, + "loss": 0.9371, + "step": 11942 + }, + { + "epoch": 2.5475682593856654, + "grad_norm": 0.2251406220711983, + "learning_rate": 0.00012838356297176866, + "loss": 0.9055, + "step": 11943 + }, + { + "epoch": 2.5477815699658706, + "grad_norm": 0.27400513992808556, + "learning_rate": 0.00012837678567789422, + "loss": 0.9552, + "step": 11944 + }, + { + "epoch": 2.5479948805460753, + "grad_norm": 0.31478571493323204, + "learning_rate": 0.0001283700078366356, + "loss": 0.935, + "step": 11945 + }, + { + "epoch": 2.54820819112628, + "grad_norm": 0.3700000629123138, + "learning_rate": 0.00012836322944806956, + "loss": 0.9548, + "step": 11946 + }, + { + "epoch": 2.5484215017064846, + "grad_norm": 0.35168432330591737, + "learning_rate": 0.00012835645051227277, + "loss": 0.8862, + "step": 11947 + }, + { + "epoch": 2.5486348122866893, + "grad_norm": 0.24610007436479714, + "learning_rate": 0.00012834967102932188, + "loss": 0.9413, + "step": 11948 + }, + { + "epoch": 2.548848122866894, + "grad_norm": 0.2519157979165694, + "learning_rate": 0.0001283428909992937, + "loss": 0.9191, + "step": 11949 + }, + { + "epoch": 2.549061433447099, + "grad_norm": 0.21195633347688367, + "learning_rate": 0.00012833611042226486, + "loss": 0.9452, + "step": 11950 + }, + { + "epoch": 2.549274744027304, + "grad_norm": 0.2581385061860938, + "learning_rate": 0.00012832932929831214, + "loss": 0.9083, + "step": 11951 + }, + { + "epoch": 2.5494880546075085, + "grad_norm": 0.2740997254844911, + "learning_rate": 0.00012832254762751223, + "loss": 0.9124, + "step": 11952 + }, + { + "epoch": 2.549701365187713, + "grad_norm": 0.2873351579307884, + "learning_rate": 0.00012831576540994188, + "loss": 0.9459, + "step": 11953 + }, + { + "epoch": 2.5499146757679183, + "grad_norm": 0.3412474240265174, + "learning_rate": 0.00012830898264567787, + "loss": 0.9227, + "step": 11954 + }, + { + "epoch": 2.550127986348123, + "grad_norm": 0.3461882646580046, + "learning_rate": 0.00012830219933479688, + "loss": 0.9165, + "step": 11955 + }, + { + "epoch": 2.5503412969283277, + "grad_norm": 0.25317406889082855, + "learning_rate": 0.00012829541547737568, + "loss": 0.9289, + "step": 11956 + }, + { + "epoch": 2.5505546075085324, + "grad_norm": 0.25085553749078365, + "learning_rate": 0.00012828863107349107, + "loss": 0.9343, + "step": 11957 + }, + { + "epoch": 2.550767918088737, + "grad_norm": 0.28376073223320897, + "learning_rate": 0.00012828184612321977, + "loss": 0.9144, + "step": 11958 + }, + { + "epoch": 2.5509812286689417, + "grad_norm": 0.258832306244612, + "learning_rate": 0.00012827506062663854, + "loss": 0.94, + "step": 11959 + }, + { + "epoch": 2.551194539249147, + "grad_norm": 0.25087113375516007, + "learning_rate": 0.00012826827458382422, + "loss": 0.9373, + "step": 11960 + }, + { + "epoch": 2.5514078498293515, + "grad_norm": 0.2460908241314227, + "learning_rate": 0.00012826148799485358, + "loss": 0.9561, + "step": 11961 + }, + { + "epoch": 2.551621160409556, + "grad_norm": 0.28917639944390117, + "learning_rate": 0.0001282547008598034, + "loss": 0.9875, + "step": 11962 + }, + { + "epoch": 2.5518344709897613, + "grad_norm": 0.29887534808753274, + "learning_rate": 0.0001282479131787504, + "loss": 0.9302, + "step": 11963 + }, + { + "epoch": 2.552047781569966, + "grad_norm": 0.26395203024316305, + "learning_rate": 0.0001282411249517715, + "loss": 0.9332, + "step": 11964 + }, + { + "epoch": 2.5522610921501707, + "grad_norm": 0.29849320701819093, + "learning_rate": 0.00012823433617894345, + "loss": 0.9235, + "step": 11965 + }, + { + "epoch": 2.5524744027303754, + "grad_norm": 0.26187704689685726, + "learning_rate": 0.00012822754686034305, + "loss": 0.9361, + "step": 11966 + }, + { + "epoch": 2.55268771331058, + "grad_norm": 0.2927776751798185, + "learning_rate": 0.0001282207569960472, + "loss": 0.951, + "step": 11967 + }, + { + "epoch": 2.5529010238907848, + "grad_norm": 0.3019933426731275, + "learning_rate": 0.00012821396658613265, + "loss": 0.9262, + "step": 11968 + }, + { + "epoch": 2.55311433447099, + "grad_norm": 0.27697773306864487, + "learning_rate": 0.00012820717563067624, + "loss": 0.9624, + "step": 11969 + }, + { + "epoch": 2.5533276450511946, + "grad_norm": 0.30559564465859707, + "learning_rate": 0.00012820038412975483, + "loss": 0.9531, + "step": 11970 + }, + { + "epoch": 2.5535409556313993, + "grad_norm": 0.3086512233951445, + "learning_rate": 0.00012819359208344526, + "loss": 0.9796, + "step": 11971 + }, + { + "epoch": 2.553754266211604, + "grad_norm": 0.31087743497887355, + "learning_rate": 0.0001281867994918244, + "loss": 0.8977, + "step": 11972 + }, + { + "epoch": 2.553967576791809, + "grad_norm": 0.26194685112626936, + "learning_rate": 0.00012818000635496908, + "loss": 0.934, + "step": 11973 + }, + { + "epoch": 2.5541808873720138, + "grad_norm": 0.1898659111534164, + "learning_rate": 0.00012817321267295618, + "loss": 0.9301, + "step": 11974 + }, + { + "epoch": 2.5543941979522184, + "grad_norm": 0.2676769340245204, + "learning_rate": 0.00012816641844586257, + "loss": 0.9649, + "step": 11975 + }, + { + "epoch": 2.554607508532423, + "grad_norm": 0.2744630225177973, + "learning_rate": 0.00012815962367376514, + "loss": 0.9392, + "step": 11976 + }, + { + "epoch": 2.554820819112628, + "grad_norm": 0.2365169658386226, + "learning_rate": 0.00012815282835674076, + "loss": 0.9376, + "step": 11977 + }, + { + "epoch": 2.5550341296928325, + "grad_norm": 0.19221443543339745, + "learning_rate": 0.00012814603249486632, + "loss": 0.9483, + "step": 11978 + }, + { + "epoch": 2.5552474402730376, + "grad_norm": 0.20351207678122285, + "learning_rate": 0.00012813923608821868, + "loss": 0.9394, + "step": 11979 + }, + { + "epoch": 2.5554607508532423, + "grad_norm": 0.251264533762048, + "learning_rate": 0.0001281324391368748, + "loss": 0.9642, + "step": 11980 + }, + { + "epoch": 2.555674061433447, + "grad_norm": 0.18136057093944294, + "learning_rate": 0.00012812564164091153, + "loss": 0.9395, + "step": 11981 + }, + { + "epoch": 2.555887372013652, + "grad_norm": 0.2379957034621912, + "learning_rate": 0.00012811884360040585, + "loss": 0.9606, + "step": 11982 + }, + { + "epoch": 2.556100682593857, + "grad_norm": 0.24196381517603172, + "learning_rate": 0.00012811204501543465, + "loss": 0.9051, + "step": 11983 + }, + { + "epoch": 2.5563139931740615, + "grad_norm": 0.21576243880510995, + "learning_rate": 0.00012810524588607484, + "loss": 0.9154, + "step": 11984 + }, + { + "epoch": 2.556527303754266, + "grad_norm": 0.2887514988310803, + "learning_rate": 0.00012809844621240334, + "loss": 0.8985, + "step": 11985 + }, + { + "epoch": 2.556740614334471, + "grad_norm": 0.26437465532435, + "learning_rate": 0.00012809164599449714, + "loss": 0.9257, + "step": 11986 + }, + { + "epoch": 2.5569539249146755, + "grad_norm": 0.20194952221103962, + "learning_rate": 0.00012808484523243313, + "loss": 0.9238, + "step": 11987 + }, + { + "epoch": 2.5571672354948807, + "grad_norm": 0.31867020994881856, + "learning_rate": 0.0001280780439262883, + "loss": 0.9683, + "step": 11988 + }, + { + "epoch": 2.5573805460750854, + "grad_norm": 0.3322272186073745, + "learning_rate": 0.00012807124207613963, + "loss": 0.9319, + "step": 11989 + }, + { + "epoch": 2.55759385665529, + "grad_norm": 0.2023200435268141, + "learning_rate": 0.000128064439682064, + "loss": 0.9306, + "step": 11990 + }, + { + "epoch": 2.5578071672354947, + "grad_norm": 0.2320590640251548, + "learning_rate": 0.00012805763674413845, + "loss": 0.9131, + "step": 11991 + }, + { + "epoch": 2.5580204778157, + "grad_norm": 0.2946449697199629, + "learning_rate": 0.00012805083326243992, + "loss": 0.9359, + "step": 11992 + }, + { + "epoch": 2.5582337883959045, + "grad_norm": 0.32205165945870584, + "learning_rate": 0.0001280440292370454, + "loss": 0.9276, + "step": 11993 + }, + { + "epoch": 2.5584470989761092, + "grad_norm": 0.3174780810453014, + "learning_rate": 0.0001280372246680319, + "loss": 0.9219, + "step": 11994 + }, + { + "epoch": 2.558660409556314, + "grad_norm": 0.24875771124941728, + "learning_rate": 0.00012803041955547639, + "loss": 0.9305, + "step": 11995 + }, + { + "epoch": 2.5588737201365186, + "grad_norm": 0.3165174165949951, + "learning_rate": 0.00012802361389945585, + "loss": 0.9309, + "step": 11996 + }, + { + "epoch": 2.5590870307167233, + "grad_norm": 0.32472728405377055, + "learning_rate": 0.00012801680770004734, + "loss": 0.9396, + "step": 11997 + }, + { + "epoch": 2.5593003412969284, + "grad_norm": 0.21742338185876225, + "learning_rate": 0.00012801000095732782, + "loss": 0.9308, + "step": 11998 + }, + { + "epoch": 2.559513651877133, + "grad_norm": 0.20965304352878342, + "learning_rate": 0.00012800319367137434, + "loss": 0.915, + "step": 11999 + }, + { + "epoch": 2.5597269624573378, + "grad_norm": 0.19454305741630817, + "learning_rate": 0.00012799638584226394, + "loss": 0.937, + "step": 12000 + }, + { + "epoch": 2.559940273037543, + "grad_norm": 0.21760403532297806, + "learning_rate": 0.00012798957747007361, + "loss": 0.9362, + "step": 12001 + }, + { + "epoch": 2.5601535836177476, + "grad_norm": 0.22184718737753154, + "learning_rate": 0.0001279827685548804, + "loss": 0.9213, + "step": 12002 + }, + { + "epoch": 2.5603668941979523, + "grad_norm": 0.2234172267042494, + "learning_rate": 0.00012797595909676135, + "loss": 0.9325, + "step": 12003 + }, + { + "epoch": 2.560580204778157, + "grad_norm": 0.27155106918876554, + "learning_rate": 0.0001279691490957935, + "loss": 0.9247, + "step": 12004 + }, + { + "epoch": 2.5607935153583616, + "grad_norm": 0.30750029396802825, + "learning_rate": 0.00012796233855205395, + "loss": 0.9298, + "step": 12005 + }, + { + "epoch": 2.5610068259385663, + "grad_norm": 0.2967535436297218, + "learning_rate": 0.00012795552746561976, + "loss": 0.9337, + "step": 12006 + }, + { + "epoch": 2.5612201365187715, + "grad_norm": 0.27527926317094376, + "learning_rate": 0.00012794871583656793, + "loss": 0.9227, + "step": 12007 + }, + { + "epoch": 2.561433447098976, + "grad_norm": 0.24718193297005878, + "learning_rate": 0.00012794190366497557, + "loss": 0.9338, + "step": 12008 + }, + { + "epoch": 2.561646757679181, + "grad_norm": 0.3929070672982401, + "learning_rate": 0.00012793509095091978, + "loss": 0.9703, + "step": 12009 + }, + { + "epoch": 2.5618600682593855, + "grad_norm": 0.31244240140189966, + "learning_rate": 0.00012792827769447763, + "loss": 0.9441, + "step": 12010 + }, + { + "epoch": 2.5620733788395906, + "grad_norm": 0.2835970270400938, + "learning_rate": 0.0001279214638957262, + "loss": 0.9361, + "step": 12011 + }, + { + "epoch": 2.5622866894197953, + "grad_norm": 0.22282879504269817, + "learning_rate": 0.00012791464955474262, + "loss": 0.9482, + "step": 12012 + }, + { + "epoch": 2.5625, + "grad_norm": 0.27960733694637613, + "learning_rate": 0.00012790783467160396, + "loss": 0.9297, + "step": 12013 + }, + { + "epoch": 2.5627133105802047, + "grad_norm": 0.3560769712984437, + "learning_rate": 0.00012790101924638734, + "loss": 0.9389, + "step": 12014 + }, + { + "epoch": 2.5629266211604094, + "grad_norm": 0.37290644629576214, + "learning_rate": 0.0001278942032791699, + "loss": 0.9364, + "step": 12015 + }, + { + "epoch": 2.5631399317406145, + "grad_norm": 0.41974007750290976, + "learning_rate": 0.00012788738677002871, + "loss": 0.9379, + "step": 12016 + }, + { + "epoch": 2.563353242320819, + "grad_norm": 0.4852187895679998, + "learning_rate": 0.00012788056971904096, + "loss": 0.9571, + "step": 12017 + }, + { + "epoch": 2.563566552901024, + "grad_norm": 0.4989168111912024, + "learning_rate": 0.00012787375212628375, + "loss": 0.9532, + "step": 12018 + }, + { + "epoch": 2.5637798634812285, + "grad_norm": 0.38710943703737694, + "learning_rate": 0.00012786693399183426, + "loss": 0.9049, + "step": 12019 + }, + { + "epoch": 2.5639931740614337, + "grad_norm": 0.26696163462496725, + "learning_rate": 0.00012786011531576953, + "loss": 0.9298, + "step": 12020 + }, + { + "epoch": 2.5642064846416384, + "grad_norm": 0.22834989928855431, + "learning_rate": 0.00012785329609816686, + "loss": 0.945, + "step": 12021 + }, + { + "epoch": 2.564419795221843, + "grad_norm": 0.23920660868963045, + "learning_rate": 0.00012784647633910334, + "loss": 0.9511, + "step": 12022 + }, + { + "epoch": 2.5646331058020477, + "grad_norm": 0.4112104127458903, + "learning_rate": 0.0001278396560386561, + "loss": 0.9395, + "step": 12023 + }, + { + "epoch": 2.5648464163822524, + "grad_norm": 0.5945988064251904, + "learning_rate": 0.00012783283519690237, + "loss": 0.9275, + "step": 12024 + }, + { + "epoch": 2.565059726962457, + "grad_norm": 0.739015161117847, + "learning_rate": 0.0001278260138139193, + "loss": 0.9271, + "step": 12025 + }, + { + "epoch": 2.5652730375426622, + "grad_norm": 0.8499513974507432, + "learning_rate": 0.00012781919188978406, + "loss": 0.9694, + "step": 12026 + }, + { + "epoch": 2.565486348122867, + "grad_norm": 0.877852164360631, + "learning_rate": 0.00012781236942457387, + "loss": 0.9116, + "step": 12027 + }, + { + "epoch": 2.5656996587030716, + "grad_norm": 0.7671659938712015, + "learning_rate": 0.00012780554641836592, + "loss": 0.9787, + "step": 12028 + }, + { + "epoch": 2.5659129692832767, + "grad_norm": 0.5244640330433948, + "learning_rate": 0.0001277987228712374, + "loss": 0.9293, + "step": 12029 + }, + { + "epoch": 2.5661262798634814, + "grad_norm": 0.26023323150052624, + "learning_rate": 0.00012779189878326557, + "loss": 0.932, + "step": 12030 + }, + { + "epoch": 2.566339590443686, + "grad_norm": 0.33216690046486547, + "learning_rate": 0.00012778507415452754, + "loss": 0.9549, + "step": 12031 + }, + { + "epoch": 2.5665529010238908, + "grad_norm": 0.5377591694665663, + "learning_rate": 0.00012777824898510063, + "loss": 0.9291, + "step": 12032 + }, + { + "epoch": 2.5667662116040955, + "grad_norm": 0.5770217280755703, + "learning_rate": 0.00012777142327506202, + "loss": 0.9249, + "step": 12033 + }, + { + "epoch": 2.5669795221843, + "grad_norm": 0.5062520652007839, + "learning_rate": 0.00012776459702448895, + "loss": 0.9183, + "step": 12034 + }, + { + "epoch": 2.5671928327645053, + "grad_norm": 0.3891686475870023, + "learning_rate": 0.00012775777023345865, + "loss": 0.9624, + "step": 12035 + }, + { + "epoch": 2.56740614334471, + "grad_norm": 0.2669134140401812, + "learning_rate": 0.0001277509429020484, + "loss": 0.9428, + "step": 12036 + }, + { + "epoch": 2.5676194539249146, + "grad_norm": 0.2720883415619151, + "learning_rate": 0.0001277441150303354, + "loss": 0.9342, + "step": 12037 + }, + { + "epoch": 2.5678327645051193, + "grad_norm": 0.36995474107870713, + "learning_rate": 0.00012773728661839696, + "loss": 0.9502, + "step": 12038 + }, + { + "epoch": 2.5680460750853245, + "grad_norm": 0.43436406842252895, + "learning_rate": 0.00012773045766631034, + "loss": 0.9235, + "step": 12039 + }, + { + "epoch": 2.568259385665529, + "grad_norm": 0.6937228090262763, + "learning_rate": 0.00012772362817415275, + "loss": 0.9301, + "step": 12040 + }, + { + "epoch": 2.568472696245734, + "grad_norm": 0.3993548005024261, + "learning_rate": 0.0001277167981420015, + "loss": 0.9686, + "step": 12041 + }, + { + "epoch": 2.5686860068259385, + "grad_norm": 0.31245929606150114, + "learning_rate": 0.0001277099675699339, + "loss": 0.936, + "step": 12042 + }, + { + "epoch": 2.568899317406143, + "grad_norm": 0.3035837407800267, + "learning_rate": 0.0001277031364580272, + "loss": 0.947, + "step": 12043 + }, + { + "epoch": 2.569112627986348, + "grad_norm": 0.39813302997656363, + "learning_rate": 0.0001276963048063587, + "loss": 0.9428, + "step": 12044 + }, + { + "epoch": 2.569325938566553, + "grad_norm": 0.5244320600971067, + "learning_rate": 0.00012768947261500573, + "loss": 0.9427, + "step": 12045 + }, + { + "epoch": 2.5695392491467577, + "grad_norm": 0.6373343462306434, + "learning_rate": 0.00012768263988404557, + "loss": 0.9549, + "step": 12046 + }, + { + "epoch": 2.5697525597269624, + "grad_norm": 1.0774575290388213, + "learning_rate": 0.00012767580661355554, + "loss": 0.9443, + "step": 12047 + }, + { + "epoch": 2.5699658703071675, + "grad_norm": 0.4183418517182278, + "learning_rate": 0.00012766897280361294, + "loss": 0.9095, + "step": 12048 + }, + { + "epoch": 2.570179180887372, + "grad_norm": 0.3199751757650658, + "learning_rate": 0.00012766213845429513, + "loss": 0.9506, + "step": 12049 + }, + { + "epoch": 2.570392491467577, + "grad_norm": 0.3341374853878957, + "learning_rate": 0.0001276553035656794, + "loss": 0.9402, + "step": 12050 + }, + { + "epoch": 2.5706058020477816, + "grad_norm": 0.4227126204325916, + "learning_rate": 0.00012764846813784314, + "loss": 0.9448, + "step": 12051 + }, + { + "epoch": 2.5708191126279862, + "grad_norm": 0.52226719375462, + "learning_rate": 0.00012764163217086365, + "loss": 0.9386, + "step": 12052 + }, + { + "epoch": 2.571032423208191, + "grad_norm": 0.5079426703925981, + "learning_rate": 0.00012763479566481827, + "loss": 0.9403, + "step": 12053 + }, + { + "epoch": 2.571245733788396, + "grad_norm": 0.3878500280382179, + "learning_rate": 0.0001276279586197844, + "loss": 0.9513, + "step": 12054 + }, + { + "epoch": 2.5714590443686007, + "grad_norm": 0.29670270683764993, + "learning_rate": 0.00012762112103583935, + "loss": 0.9747, + "step": 12055 + }, + { + "epoch": 2.5716723549488054, + "grad_norm": 0.30450885709426473, + "learning_rate": 0.00012761428291306053, + "loss": 0.9308, + "step": 12056 + }, + { + "epoch": 2.57188566552901, + "grad_norm": 0.4466263106130224, + "learning_rate": 0.0001276074442515253, + "loss": 0.9263, + "step": 12057 + }, + { + "epoch": 2.5720989761092152, + "grad_norm": 0.572128792454979, + "learning_rate": 0.00012760060505131104, + "loss": 0.9703, + "step": 12058 + }, + { + "epoch": 2.57231228668942, + "grad_norm": 0.5246411470462776, + "learning_rate": 0.00012759376531249514, + "loss": 0.9636, + "step": 12059 + }, + { + "epoch": 2.5725255972696246, + "grad_norm": 0.43034202638629965, + "learning_rate": 0.00012758692503515498, + "loss": 0.9117, + "step": 12060 + }, + { + "epoch": 2.5727389078498293, + "grad_norm": 0.3552696748022018, + "learning_rate": 0.00012758008421936797, + "loss": 0.9342, + "step": 12061 + }, + { + "epoch": 2.572952218430034, + "grad_norm": 0.2544590132065596, + "learning_rate": 0.00012757324286521147, + "loss": 0.9567, + "step": 12062 + }, + { + "epoch": 2.5731655290102387, + "grad_norm": 0.2888275510865394, + "learning_rate": 0.00012756640097276299, + "loss": 0.9462, + "step": 12063 + }, + { + "epoch": 2.573378839590444, + "grad_norm": 0.4278949734312794, + "learning_rate": 0.00012755955854209982, + "loss": 0.9338, + "step": 12064 + }, + { + "epoch": 2.5735921501706485, + "grad_norm": 0.46098286612535044, + "learning_rate": 0.00012755271557329947, + "loss": 0.9413, + "step": 12065 + }, + { + "epoch": 2.573805460750853, + "grad_norm": 0.41487311885276523, + "learning_rate": 0.00012754587206643934, + "loss": 0.9403, + "step": 12066 + }, + { + "epoch": 2.5740187713310583, + "grad_norm": 0.41958775094853523, + "learning_rate": 0.00012753902802159687, + "loss": 0.916, + "step": 12067 + }, + { + "epoch": 2.574232081911263, + "grad_norm": 0.37593116021512835, + "learning_rate": 0.0001275321834388495, + "loss": 0.9611, + "step": 12068 + }, + { + "epoch": 2.5744453924914676, + "grad_norm": 0.21016085999940506, + "learning_rate": 0.00012752533831827468, + "loss": 0.924, + "step": 12069 + }, + { + "epoch": 2.5746587030716723, + "grad_norm": 0.3131480936681733, + "learning_rate": 0.00012751849265994986, + "loss": 0.9392, + "step": 12070 + }, + { + "epoch": 2.574872013651877, + "grad_norm": 0.3948675614293749, + "learning_rate": 0.00012751164646395246, + "loss": 0.9556, + "step": 12071 + }, + { + "epoch": 2.5750853242320817, + "grad_norm": 0.387554929263813, + "learning_rate": 0.00012750479973036002, + "loss": 0.9351, + "step": 12072 + }, + { + "epoch": 2.575298634812287, + "grad_norm": 0.33784009221275696, + "learning_rate": 0.00012749795245924997, + "loss": 0.934, + "step": 12073 + }, + { + "epoch": 2.5755119453924915, + "grad_norm": 0.26190894957130484, + "learning_rate": 0.0001274911046506998, + "loss": 0.9315, + "step": 12074 + }, + { + "epoch": 2.575725255972696, + "grad_norm": 0.2619732506015473, + "learning_rate": 0.00012748425630478697, + "loss": 0.94, + "step": 12075 + }, + { + "epoch": 2.575938566552901, + "grad_norm": 0.2343846424107695, + "learning_rate": 0.00012747740742158895, + "loss": 0.9343, + "step": 12076 + }, + { + "epoch": 2.576151877133106, + "grad_norm": 0.21594996338083883, + "learning_rate": 0.0001274705580011833, + "loss": 0.9319, + "step": 12077 + }, + { + "epoch": 2.5763651877133107, + "grad_norm": 0.5338419068724888, + "learning_rate": 0.0001274637080436475, + "loss": 0.9381, + "step": 12078 + }, + { + "epoch": 2.5765784982935154, + "grad_norm": 0.24226649935114664, + "learning_rate": 0.000127456857549059, + "loss": 0.9386, + "step": 12079 + }, + { + "epoch": 2.57679180887372, + "grad_norm": 0.19761335890048828, + "learning_rate": 0.00012745000651749537, + "loss": 0.9353, + "step": 12080 + }, + { + "epoch": 2.5770051194539247, + "grad_norm": 0.1803908302376732, + "learning_rate": 0.00012744315494903413, + "loss": 0.9255, + "step": 12081 + }, + { + "epoch": 2.5772184300341294, + "grad_norm": 0.23020274635653393, + "learning_rate": 0.0001274363028437528, + "loss": 0.9373, + "step": 12082 + }, + { + "epoch": 2.5774317406143346, + "grad_norm": 0.2178609089623293, + "learning_rate": 0.0001274294502017289, + "loss": 0.9342, + "step": 12083 + }, + { + "epoch": 2.5776450511945392, + "grad_norm": 0.23903966816499253, + "learning_rate": 0.00012742259702303997, + "loss": 0.9384, + "step": 12084 + }, + { + "epoch": 2.577858361774744, + "grad_norm": 0.242659519520129, + "learning_rate": 0.00012741574330776356, + "loss": 0.9175, + "step": 12085 + }, + { + "epoch": 2.578071672354949, + "grad_norm": 0.2250281479175079, + "learning_rate": 0.00012740888905597722, + "loss": 0.9093, + "step": 12086 + }, + { + "epoch": 2.5782849829351537, + "grad_norm": 0.23100530165089825, + "learning_rate": 0.0001274020342677585, + "loss": 0.9371, + "step": 12087 + }, + { + "epoch": 2.5784982935153584, + "grad_norm": 0.21863262815776946, + "learning_rate": 0.00012739517894318496, + "loss": 0.9252, + "step": 12088 + }, + { + "epoch": 2.578711604095563, + "grad_norm": 0.2179544945313644, + "learning_rate": 0.00012738832308233417, + "loss": 0.9528, + "step": 12089 + }, + { + "epoch": 2.578924914675768, + "grad_norm": 0.2836356530825667, + "learning_rate": 0.00012738146668528374, + "loss": 0.9535, + "step": 12090 + }, + { + "epoch": 2.5791382252559725, + "grad_norm": 0.31618899668891587, + "learning_rate": 0.0001273746097521112, + "loss": 0.9703, + "step": 12091 + }, + { + "epoch": 2.5793515358361776, + "grad_norm": 0.2633205497972086, + "learning_rate": 0.0001273677522828941, + "loss": 0.9247, + "step": 12092 + }, + { + "epoch": 2.5795648464163823, + "grad_norm": 0.28527243651721734, + "learning_rate": 0.00012736089427771015, + "loss": 0.9296, + "step": 12093 + }, + { + "epoch": 2.579778156996587, + "grad_norm": 0.23745242990544324, + "learning_rate": 0.00012735403573663686, + "loss": 0.9265, + "step": 12094 + }, + { + "epoch": 2.5799914675767917, + "grad_norm": 0.2520876561494398, + "learning_rate": 0.00012734717665975186, + "loss": 0.9248, + "step": 12095 + }, + { + "epoch": 2.580204778156997, + "grad_norm": 0.33056301807611316, + "learning_rate": 0.00012734031704713278, + "loss": 0.965, + "step": 12096 + }, + { + "epoch": 2.5804180887372015, + "grad_norm": 0.28904511555156726, + "learning_rate": 0.00012733345689885722, + "loss": 0.925, + "step": 12097 + }, + { + "epoch": 2.580631399317406, + "grad_norm": 0.22952069928186306, + "learning_rate": 0.00012732659621500275, + "loss": 0.9141, + "step": 12098 + }, + { + "epoch": 2.580844709897611, + "grad_norm": 1.17909208521731, + "learning_rate": 0.0001273197349956471, + "loss": 0.9388, + "step": 12099 + }, + { + "epoch": 2.5810580204778155, + "grad_norm": 0.2537819301714779, + "learning_rate": 0.0001273128732408678, + "loss": 0.9551, + "step": 12100 + }, + { + "epoch": 2.58127133105802, + "grad_norm": 0.29654619492666695, + "learning_rate": 0.0001273060109507426, + "loss": 0.9218, + "step": 12101 + }, + { + "epoch": 2.5814846416382253, + "grad_norm": 0.31476035914880374, + "learning_rate": 0.00012729914812534905, + "loss": 0.9387, + "step": 12102 + }, + { + "epoch": 2.58169795221843, + "grad_norm": 0.2937917319853517, + "learning_rate": 0.00012729228476476487, + "loss": 0.9191, + "step": 12103 + }, + { + "epoch": 2.5819112627986347, + "grad_norm": 0.2666403210855775, + "learning_rate": 0.0001272854208690677, + "loss": 0.9322, + "step": 12104 + }, + { + "epoch": 2.58212457337884, + "grad_norm": 0.3160316872228524, + "learning_rate": 0.00012727855643833514, + "loss": 0.9256, + "step": 12105 + }, + { + "epoch": 2.5823378839590445, + "grad_norm": 0.32420862361968095, + "learning_rate": 0.000127271691472645, + "loss": 0.916, + "step": 12106 + }, + { + "epoch": 2.582551194539249, + "grad_norm": 0.3237411508370144, + "learning_rate": 0.0001272648259720748, + "loss": 0.9158, + "step": 12107 + }, + { + "epoch": 2.582764505119454, + "grad_norm": 0.3272041557149514, + "learning_rate": 0.00012725795993670234, + "loss": 0.9408, + "step": 12108 + }, + { + "epoch": 2.5829778156996586, + "grad_norm": 0.36095279403553915, + "learning_rate": 0.00012725109336660526, + "loss": 0.9588, + "step": 12109 + }, + { + "epoch": 2.5831911262798632, + "grad_norm": 0.5037937320185734, + "learning_rate": 0.00012724422626186126, + "loss": 0.9685, + "step": 12110 + }, + { + "epoch": 2.5834044368600684, + "grad_norm": 0.5209686316550902, + "learning_rate": 0.00012723735862254804, + "loss": 0.9665, + "step": 12111 + }, + { + "epoch": 2.583617747440273, + "grad_norm": 0.4508186237759665, + "learning_rate": 0.00012723049044874334, + "loss": 0.9561, + "step": 12112 + }, + { + "epoch": 2.5838310580204777, + "grad_norm": 0.3868909765246779, + "learning_rate": 0.00012722362174052483, + "loss": 0.9574, + "step": 12113 + }, + { + "epoch": 2.5840443686006824, + "grad_norm": 0.27953036745763093, + "learning_rate": 0.00012721675249797025, + "loss": 0.9379, + "step": 12114 + }, + { + "epoch": 2.5842576791808876, + "grad_norm": 0.26992337702219565, + "learning_rate": 0.0001272098827211573, + "loss": 0.9659, + "step": 12115 + }, + { + "epoch": 2.5844709897610922, + "grad_norm": 0.32837180367057656, + "learning_rate": 0.00012720301241016374, + "loss": 0.922, + "step": 12116 + }, + { + "epoch": 2.584684300341297, + "grad_norm": 0.432402508882918, + "learning_rate": 0.0001271961415650673, + "loss": 0.9511, + "step": 12117 + }, + { + "epoch": 2.5848976109215016, + "grad_norm": 0.48393712175065806, + "learning_rate": 0.00012718927018594574, + "loss": 0.9468, + "step": 12118 + }, + { + "epoch": 2.5851109215017063, + "grad_norm": 0.47857064886706124, + "learning_rate": 0.0001271823982728768, + "loss": 0.9363, + "step": 12119 + }, + { + "epoch": 2.5853242320819114, + "grad_norm": 0.4214956004823223, + "learning_rate": 0.00012717552582593823, + "loss": 0.9408, + "step": 12120 + }, + { + "epoch": 2.585537542662116, + "grad_norm": 0.370318135128816, + "learning_rate": 0.00012716865284520776, + "loss": 0.9604, + "step": 12121 + }, + { + "epoch": 2.585750853242321, + "grad_norm": 0.27111729633250625, + "learning_rate": 0.0001271617793307632, + "loss": 0.9295, + "step": 12122 + }, + { + "epoch": 2.5859641638225255, + "grad_norm": 0.24611304514224286, + "learning_rate": 0.00012715490528268235, + "loss": 0.9384, + "step": 12123 + }, + { + "epoch": 2.5861774744027306, + "grad_norm": 0.3005679669729497, + "learning_rate": 0.00012714803070104291, + "loss": 0.9475, + "step": 12124 + }, + { + "epoch": 2.5863907849829353, + "grad_norm": 0.2830185662468408, + "learning_rate": 0.00012714115558592275, + "loss": 0.9363, + "step": 12125 + }, + { + "epoch": 2.58660409556314, + "grad_norm": 0.3160899215170792, + "learning_rate": 0.00012713427993739956, + "loss": 0.9234, + "step": 12126 + }, + { + "epoch": 2.5868174061433447, + "grad_norm": 0.34901084757961964, + "learning_rate": 0.00012712740375555125, + "loss": 0.8939, + "step": 12127 + }, + { + "epoch": 2.5870307167235493, + "grad_norm": 0.3077750051474591, + "learning_rate": 0.00012712052704045556, + "loss": 0.9364, + "step": 12128 + }, + { + "epoch": 2.587244027303754, + "grad_norm": 0.2646642985278082, + "learning_rate": 0.0001271136497921903, + "loss": 0.9266, + "step": 12129 + }, + { + "epoch": 2.587457337883959, + "grad_norm": 0.2189854638718327, + "learning_rate": 0.00012710677201083334, + "loss": 0.9654, + "step": 12130 + }, + { + "epoch": 2.587670648464164, + "grad_norm": 0.23018979589950905, + "learning_rate": 0.0001270998936964624, + "loss": 0.9125, + "step": 12131 + }, + { + "epoch": 2.5878839590443685, + "grad_norm": 0.2548505996967933, + "learning_rate": 0.00012709301484915543, + "loss": 0.9355, + "step": 12132 + }, + { + "epoch": 2.5880972696245736, + "grad_norm": 0.24085033276858653, + "learning_rate": 0.0001270861354689902, + "loss": 0.9427, + "step": 12133 + }, + { + "epoch": 2.5883105802047783, + "grad_norm": 0.2842090346110503, + "learning_rate": 0.00012707925555604453, + "loss": 0.9263, + "step": 12134 + }, + { + "epoch": 2.588523890784983, + "grad_norm": 0.35017315141319066, + "learning_rate": 0.0001270723751103963, + "loss": 0.938, + "step": 12135 + }, + { + "epoch": 2.5887372013651877, + "grad_norm": 0.3093288952133002, + "learning_rate": 0.00012706549413212334, + "loss": 0.9405, + "step": 12136 + }, + { + "epoch": 2.5889505119453924, + "grad_norm": 0.2928775815395271, + "learning_rate": 0.00012705861262130353, + "loss": 0.9259, + "step": 12137 + }, + { + "epoch": 2.589163822525597, + "grad_norm": 0.23164935437509074, + "learning_rate": 0.00012705173057801474, + "loss": 0.9144, + "step": 12138 + }, + { + "epoch": 2.589377133105802, + "grad_norm": 0.249235828460065, + "learning_rate": 0.00012704484800233481, + "loss": 0.9347, + "step": 12139 + }, + { + "epoch": 2.589590443686007, + "grad_norm": 0.24318272595635923, + "learning_rate": 0.00012703796489434166, + "loss": 0.9835, + "step": 12140 + }, + { + "epoch": 2.5898037542662116, + "grad_norm": 0.2198014442114396, + "learning_rate": 0.00012703108125411314, + "loss": 0.9586, + "step": 12141 + }, + { + "epoch": 2.5900170648464163, + "grad_norm": 0.24314875642549133, + "learning_rate": 0.0001270241970817271, + "loss": 0.9084, + "step": 12142 + }, + { + "epoch": 2.5902303754266214, + "grad_norm": 0.2733642650460865, + "learning_rate": 0.0001270173123772615, + "loss": 0.9599, + "step": 12143 + }, + { + "epoch": 2.590443686006826, + "grad_norm": 0.23725543456993795, + "learning_rate": 0.00012701042714079424, + "loss": 0.9225, + "step": 12144 + }, + { + "epoch": 2.5906569965870307, + "grad_norm": 0.24959134696102728, + "learning_rate": 0.00012700354137240322, + "loss": 0.9301, + "step": 12145 + }, + { + "epoch": 2.5908703071672354, + "grad_norm": 0.23015419988040003, + "learning_rate": 0.0001269966550721663, + "loss": 0.9697, + "step": 12146 + }, + { + "epoch": 2.59108361774744, + "grad_norm": 0.2125091665690906, + "learning_rate": 0.00012698976824016147, + "loss": 0.9551, + "step": 12147 + }, + { + "epoch": 2.591296928327645, + "grad_norm": 0.25423105186237915, + "learning_rate": 0.00012698288087646662, + "loss": 0.9305, + "step": 12148 + }, + { + "epoch": 2.59151023890785, + "grad_norm": 0.27163067633325294, + "learning_rate": 0.00012697599298115963, + "loss": 0.9499, + "step": 12149 + }, + { + "epoch": 2.5917235494880546, + "grad_norm": 0.24713187720764113, + "learning_rate": 0.00012696910455431851, + "loss": 0.9723, + "step": 12150 + }, + { + "epoch": 2.5919368600682593, + "grad_norm": 0.2008250443393222, + "learning_rate": 0.00012696221559602123, + "loss": 0.9604, + "step": 12151 + }, + { + "epoch": 2.5921501706484644, + "grad_norm": 0.26110982575296593, + "learning_rate": 0.00012695532610634567, + "loss": 0.9587, + "step": 12152 + }, + { + "epoch": 2.592363481228669, + "grad_norm": 0.33913110211025466, + "learning_rate": 0.0001269484360853698, + "loss": 0.9226, + "step": 12153 + }, + { + "epoch": 2.592576791808874, + "grad_norm": 0.4140630766316419, + "learning_rate": 0.00012694154553317155, + "loss": 0.9023, + "step": 12154 + }, + { + "epoch": 2.5927901023890785, + "grad_norm": 0.3959015098532108, + "learning_rate": 0.00012693465444982895, + "loss": 0.9713, + "step": 12155 + }, + { + "epoch": 2.593003412969283, + "grad_norm": 0.29198159578592514, + "learning_rate": 0.00012692776283541995, + "loss": 0.9502, + "step": 12156 + }, + { + "epoch": 2.593216723549488, + "grad_norm": 0.30552595279095385, + "learning_rate": 0.00012692087069002253, + "loss": 0.9357, + "step": 12157 + }, + { + "epoch": 2.593430034129693, + "grad_norm": 0.2498198203064818, + "learning_rate": 0.00012691397801371467, + "loss": 0.9368, + "step": 12158 + }, + { + "epoch": 2.5936433447098977, + "grad_norm": 0.21046623260820102, + "learning_rate": 0.00012690708480657436, + "loss": 0.9569, + "step": 12159 + }, + { + "epoch": 2.5938566552901023, + "grad_norm": 0.2451679771386085, + "learning_rate": 0.0001269001910686796, + "loss": 0.9295, + "step": 12160 + }, + { + "epoch": 2.594069965870307, + "grad_norm": 0.23718372146757427, + "learning_rate": 0.00012689329680010838, + "loss": 0.9205, + "step": 12161 + }, + { + "epoch": 2.594283276450512, + "grad_norm": 0.2450777344276544, + "learning_rate": 0.0001268864020009387, + "loss": 0.91, + "step": 12162 + }, + { + "epoch": 2.594496587030717, + "grad_norm": 0.24400679588170454, + "learning_rate": 0.0001268795066712486, + "loss": 0.916, + "step": 12163 + }, + { + "epoch": 2.5947098976109215, + "grad_norm": 0.24006408392950654, + "learning_rate": 0.0001268726108111161, + "loss": 0.9259, + "step": 12164 + }, + { + "epoch": 2.594923208191126, + "grad_norm": 0.19080100048996526, + "learning_rate": 0.00012686571442061922, + "loss": 0.9815, + "step": 12165 + }, + { + "epoch": 2.595136518771331, + "grad_norm": 0.23200437705160623, + "learning_rate": 0.000126858817499836, + "loss": 0.9724, + "step": 12166 + }, + { + "epoch": 2.5953498293515356, + "grad_norm": 0.3255016190111606, + "learning_rate": 0.00012685192004884447, + "loss": 0.9532, + "step": 12167 + }, + { + "epoch": 2.5955631399317407, + "grad_norm": 0.36223534101392013, + "learning_rate": 0.00012684502206772267, + "loss": 0.9482, + "step": 12168 + }, + { + "epoch": 2.5957764505119454, + "grad_norm": 0.28889023739143177, + "learning_rate": 0.00012683812355654867, + "loss": 0.9241, + "step": 12169 + }, + { + "epoch": 2.59598976109215, + "grad_norm": 0.23990318511551323, + "learning_rate": 0.0001268312245154005, + "loss": 0.9393, + "step": 12170 + }, + { + "epoch": 2.596203071672355, + "grad_norm": 0.24291217768831427, + "learning_rate": 0.00012682432494435624, + "loss": 0.9091, + "step": 12171 + }, + { + "epoch": 2.59641638225256, + "grad_norm": 0.2748707568733729, + "learning_rate": 0.00012681742484349398, + "loss": 0.9656, + "step": 12172 + }, + { + "epoch": 2.5966296928327646, + "grad_norm": 0.2724566801641232, + "learning_rate": 0.00012681052421289175, + "loss": 0.9321, + "step": 12173 + }, + { + "epoch": 2.5968430034129693, + "grad_norm": 0.22503088685821807, + "learning_rate": 0.00012680362305262762, + "loss": 0.925, + "step": 12174 + }, + { + "epoch": 2.597056313993174, + "grad_norm": 0.24860754635327784, + "learning_rate": 0.00012679672136277974, + "loss": 0.9417, + "step": 12175 + }, + { + "epoch": 2.5972696245733786, + "grad_norm": 0.272257600836977, + "learning_rate": 0.00012678981914342617, + "loss": 0.9416, + "step": 12176 + }, + { + "epoch": 2.5974829351535837, + "grad_norm": 0.25765706349727907, + "learning_rate": 0.00012678291639464502, + "loss": 0.9427, + "step": 12177 + }, + { + "epoch": 2.5976962457337884, + "grad_norm": 0.2270586195557772, + "learning_rate": 0.00012677601311651435, + "loss": 0.9232, + "step": 12178 + }, + { + "epoch": 2.597909556313993, + "grad_norm": 0.2691002913097082, + "learning_rate": 0.00012676910930911234, + "loss": 0.9363, + "step": 12179 + }, + { + "epoch": 2.598122866894198, + "grad_norm": 0.2422459805255476, + "learning_rate": 0.00012676220497251706, + "loss": 0.9439, + "step": 12180 + }, + { + "epoch": 2.598336177474403, + "grad_norm": 0.23477925671620598, + "learning_rate": 0.00012675530010680664, + "loss": 0.9422, + "step": 12181 + }, + { + "epoch": 2.5985494880546076, + "grad_norm": 0.27025950855497466, + "learning_rate": 0.00012674839471205922, + "loss": 0.9167, + "step": 12182 + }, + { + "epoch": 2.5987627986348123, + "grad_norm": 0.24560078776783542, + "learning_rate": 0.0001267414887883529, + "loss": 0.958, + "step": 12183 + }, + { + "epoch": 2.598976109215017, + "grad_norm": 0.22131988682295767, + "learning_rate": 0.00012673458233576585, + "loss": 0.9275, + "step": 12184 + }, + { + "epoch": 2.5991894197952217, + "grad_norm": 0.30675845242931093, + "learning_rate": 0.00012672767535437627, + "loss": 0.9141, + "step": 12185 + }, + { + "epoch": 2.5994027303754264, + "grad_norm": 0.3142452312131657, + "learning_rate": 0.00012672076784426222, + "loss": 0.9276, + "step": 12186 + }, + { + "epoch": 2.5996160409556315, + "grad_norm": 0.30379983924481846, + "learning_rate": 0.0001267138598055019, + "loss": 0.9412, + "step": 12187 + }, + { + "epoch": 2.599829351535836, + "grad_norm": 0.30082071970942886, + "learning_rate": 0.00012670695123817347, + "loss": 0.9419, + "step": 12188 + }, + { + "epoch": 2.600042662116041, + "grad_norm": 0.30426799840552815, + "learning_rate": 0.00012670004214235508, + "loss": 0.9476, + "step": 12189 + }, + { + "epoch": 2.600255972696246, + "grad_norm": 0.32586959575120467, + "learning_rate": 0.000126693132518125, + "loss": 0.9176, + "step": 12190 + }, + { + "epoch": 2.6004692832764507, + "grad_norm": 0.31226365514746307, + "learning_rate": 0.00012668622236556125, + "loss": 0.9459, + "step": 12191 + }, + { + "epoch": 2.6006825938566553, + "grad_norm": 0.27417366731668047, + "learning_rate": 0.00012667931168474215, + "loss": 0.9445, + "step": 12192 + }, + { + "epoch": 2.60089590443686, + "grad_norm": 0.30136897510864546, + "learning_rate": 0.00012667240047574584, + "loss": 0.9258, + "step": 12193 + }, + { + "epoch": 2.6011092150170647, + "grad_norm": 0.330491065460254, + "learning_rate": 0.00012666548873865055, + "loss": 0.9548, + "step": 12194 + }, + { + "epoch": 2.6013225255972694, + "grad_norm": 0.3092513409013132, + "learning_rate": 0.00012665857647353447, + "loss": 0.9453, + "step": 12195 + }, + { + "epoch": 2.6015358361774745, + "grad_norm": 0.2682857191938687, + "learning_rate": 0.00012665166368047582, + "loss": 0.9771, + "step": 12196 + }, + { + "epoch": 2.601749146757679, + "grad_norm": 0.19519862120414558, + "learning_rate": 0.0001266447503595528, + "loss": 0.9328, + "step": 12197 + }, + { + "epoch": 2.601962457337884, + "grad_norm": 0.22494308378056718, + "learning_rate": 0.00012663783651084364, + "loss": 0.9623, + "step": 12198 + }, + { + "epoch": 2.6021757679180886, + "grad_norm": 0.2521450770837934, + "learning_rate": 0.00012663092213442654, + "loss": 0.9292, + "step": 12199 + }, + { + "epoch": 2.6023890784982937, + "grad_norm": 0.23546802905744116, + "learning_rate": 0.00012662400723037983, + "loss": 0.9341, + "step": 12200 + }, + { + "epoch": 2.6026023890784984, + "grad_norm": 0.19542195613796978, + "learning_rate": 0.00012661709179878167, + "loss": 0.9533, + "step": 12201 + }, + { + "epoch": 2.602815699658703, + "grad_norm": 0.26879317429734584, + "learning_rate": 0.00012661017583971034, + "loss": 0.9552, + "step": 12202 + }, + { + "epoch": 2.6030290102389078, + "grad_norm": 0.28655502724718007, + "learning_rate": 0.00012660325935324407, + "loss": 0.9525, + "step": 12203 + }, + { + "epoch": 2.6032423208191124, + "grad_norm": 0.26811267375772396, + "learning_rate": 0.00012659634233946115, + "loss": 0.9357, + "step": 12204 + }, + { + "epoch": 2.6034556313993176, + "grad_norm": 0.2509440310203088, + "learning_rate": 0.00012658942479843984, + "loss": 0.9256, + "step": 12205 + }, + { + "epoch": 2.6036689419795223, + "grad_norm": 0.3018141152195394, + "learning_rate": 0.00012658250673025837, + "loss": 0.9333, + "step": 12206 + }, + { + "epoch": 2.603882252559727, + "grad_norm": 0.21757747039843345, + "learning_rate": 0.0001265755881349951, + "loss": 0.8994, + "step": 12207 + }, + { + "epoch": 2.6040955631399316, + "grad_norm": 0.22817809724254723, + "learning_rate": 0.00012656866901272824, + "loss": 0.9208, + "step": 12208 + }, + { + "epoch": 2.6043088737201368, + "grad_norm": 0.2812398538413764, + "learning_rate": 0.00012656174936353612, + "loss": 0.922, + "step": 12209 + }, + { + "epoch": 2.6045221843003414, + "grad_norm": 0.2593025514119487, + "learning_rate": 0.000126554829187497, + "loss": 0.9138, + "step": 12210 + }, + { + "epoch": 2.604735494880546, + "grad_norm": 0.25274855676367186, + "learning_rate": 0.00012654790848468922, + "loss": 0.921, + "step": 12211 + }, + { + "epoch": 2.604948805460751, + "grad_norm": 0.2479836779654699, + "learning_rate": 0.00012654098725519106, + "loss": 0.9228, + "step": 12212 + }, + { + "epoch": 2.6051621160409555, + "grad_norm": 0.2253516698742696, + "learning_rate": 0.00012653406549908088, + "loss": 0.9508, + "step": 12213 + }, + { + "epoch": 2.60537542662116, + "grad_norm": 0.27690403561488985, + "learning_rate": 0.00012652714321643692, + "loss": 0.9245, + "step": 12214 + }, + { + "epoch": 2.6055887372013653, + "grad_norm": 0.321046399166865, + "learning_rate": 0.0001265202204073376, + "loss": 0.9316, + "step": 12215 + }, + { + "epoch": 2.60580204778157, + "grad_norm": 0.24692657525497846, + "learning_rate": 0.00012651329707186117, + "loss": 0.9421, + "step": 12216 + }, + { + "epoch": 2.6060153583617747, + "grad_norm": 0.23090988606186602, + "learning_rate": 0.00012650637321008602, + "loss": 0.9687, + "step": 12217 + }, + { + "epoch": 2.60622866894198, + "grad_norm": 0.3155961678770159, + "learning_rate": 0.00012649944882209046, + "loss": 0.9297, + "step": 12218 + }, + { + "epoch": 2.6064419795221845, + "grad_norm": 0.31976283083025975, + "learning_rate": 0.00012649252390795288, + "loss": 0.9081, + "step": 12219 + }, + { + "epoch": 2.606655290102389, + "grad_norm": 0.3288852864463462, + "learning_rate": 0.00012648559846775159, + "loss": 0.9254, + "step": 12220 + }, + { + "epoch": 2.606868600682594, + "grad_norm": 0.32800855720294486, + "learning_rate": 0.00012647867250156495, + "loss": 0.9316, + "step": 12221 + }, + { + "epoch": 2.6070819112627985, + "grad_norm": 0.3378563466753007, + "learning_rate": 0.00012647174600947136, + "loss": 0.9299, + "step": 12222 + }, + { + "epoch": 2.607295221843003, + "grad_norm": 0.33571663839814403, + "learning_rate": 0.0001264648189915492, + "loss": 0.9417, + "step": 12223 + }, + { + "epoch": 2.6075085324232083, + "grad_norm": 0.3512905228520603, + "learning_rate": 0.00012645789144787682, + "loss": 0.9342, + "step": 12224 + }, + { + "epoch": 2.607721843003413, + "grad_norm": 0.36429412046855514, + "learning_rate": 0.00012645096337853264, + "loss": 0.9413, + "step": 12225 + }, + { + "epoch": 2.6079351535836177, + "grad_norm": 0.31691857747455887, + "learning_rate": 0.000126444034783595, + "loss": 0.9141, + "step": 12226 + }, + { + "epoch": 2.6081484641638224, + "grad_norm": 0.24845516377977814, + "learning_rate": 0.00012643710566314235, + "loss": 0.9062, + "step": 12227 + }, + { + "epoch": 2.6083617747440275, + "grad_norm": 0.19643568224406135, + "learning_rate": 0.00012643017601725304, + "loss": 0.937, + "step": 12228 + }, + { + "epoch": 2.608575085324232, + "grad_norm": 0.2385287233464934, + "learning_rate": 0.00012642324584600552, + "loss": 0.9578, + "step": 12229 + }, + { + "epoch": 2.608788395904437, + "grad_norm": 0.2753269258990309, + "learning_rate": 0.0001264163151494782, + "loss": 0.9115, + "step": 12230 + }, + { + "epoch": 2.6090017064846416, + "grad_norm": 0.3385839935938968, + "learning_rate": 0.00012640938392774947, + "loss": 0.9471, + "step": 12231 + }, + { + "epoch": 2.6092150170648463, + "grad_norm": 0.40837823524275624, + "learning_rate": 0.0001264024521808978, + "loss": 0.9556, + "step": 12232 + }, + { + "epoch": 2.609428327645051, + "grad_norm": 0.35398085534086554, + "learning_rate": 0.0001263955199090016, + "loss": 0.9368, + "step": 12233 + }, + { + "epoch": 2.609641638225256, + "grad_norm": 0.27233071368912815, + "learning_rate": 0.00012638858711213932, + "loss": 0.9541, + "step": 12234 + }, + { + "epoch": 2.6098549488054608, + "grad_norm": 0.3015102602131437, + "learning_rate": 0.0001263816537903894, + "loss": 0.9466, + "step": 12235 + }, + { + "epoch": 2.6100682593856654, + "grad_norm": 0.24165059909146147, + "learning_rate": 0.00012637471994383025, + "loss": 0.9431, + "step": 12236 + }, + { + "epoch": 2.6102815699658706, + "grad_norm": 2.013090147151624, + "learning_rate": 0.0001263677855725404, + "loss": 1.0096, + "step": 12237 + }, + { + "epoch": 2.6104948805460753, + "grad_norm": 0.2918085729257427, + "learning_rate": 0.00012636085067659826, + "loss": 0.9156, + "step": 12238 + }, + { + "epoch": 2.61070819112628, + "grad_norm": 0.45853379086007445, + "learning_rate": 0.00012635391525608232, + "loss": 0.9405, + "step": 12239 + }, + { + "epoch": 2.6109215017064846, + "grad_norm": 0.4016238360886092, + "learning_rate": 0.00012634697931107104, + "loss": 0.9259, + "step": 12240 + }, + { + "epoch": 2.6111348122866893, + "grad_norm": 0.34559663976967214, + "learning_rate": 0.00012634004284164293, + "loss": 0.9444, + "step": 12241 + }, + { + "epoch": 2.611348122866894, + "grad_norm": 0.35616679839035525, + "learning_rate": 0.00012633310584787646, + "loss": 0.9315, + "step": 12242 + }, + { + "epoch": 2.611561433447099, + "grad_norm": 0.45302426280723745, + "learning_rate": 0.0001263261683298501, + "loss": 0.9432, + "step": 12243 + }, + { + "epoch": 2.611774744027304, + "grad_norm": 0.5045743293471396, + "learning_rate": 0.00012631923028764234, + "loss": 0.919, + "step": 12244 + }, + { + "epoch": 2.6119880546075085, + "grad_norm": 0.37281148576789674, + "learning_rate": 0.00012631229172133175, + "loss": 0.9528, + "step": 12245 + }, + { + "epoch": 2.612201365187713, + "grad_norm": 0.329579849387992, + "learning_rate": 0.00012630535263099678, + "loss": 0.9421, + "step": 12246 + }, + { + "epoch": 2.6124146757679183, + "grad_norm": 0.4132261625647889, + "learning_rate": 0.000126298413016716, + "loss": 0.9125, + "step": 12247 + }, + { + "epoch": 2.612627986348123, + "grad_norm": 0.3567538172764673, + "learning_rate": 0.00012629147287856787, + "loss": 0.9488, + "step": 12248 + }, + { + "epoch": 2.6128412969283277, + "grad_norm": 0.3613780125260928, + "learning_rate": 0.00012628453221663095, + "loss": 0.9267, + "step": 12249 + }, + { + "epoch": 2.6130546075085324, + "grad_norm": 0.4339418345868916, + "learning_rate": 0.00012627759103098375, + "loss": 0.9321, + "step": 12250 + }, + { + "epoch": 2.613267918088737, + "grad_norm": 0.5100158368522394, + "learning_rate": 0.00012627064932170485, + "loss": 0.9166, + "step": 12251 + }, + { + "epoch": 2.6134812286689417, + "grad_norm": 0.45240741981436433, + "learning_rate": 0.00012626370708887277, + "loss": 0.9602, + "step": 12252 + }, + { + "epoch": 2.613694539249147, + "grad_norm": 0.3960168728561588, + "learning_rate": 0.00012625676433256606, + "loss": 0.9372, + "step": 12253 + }, + { + "epoch": 2.6139078498293515, + "grad_norm": 0.4421458340331991, + "learning_rate": 0.0001262498210528633, + "loss": 0.9308, + "step": 12254 + }, + { + "epoch": 2.614121160409556, + "grad_norm": 0.40191530525608427, + "learning_rate": 0.00012624287724984304, + "loss": 0.9406, + "step": 12255 + }, + { + "epoch": 2.6143344709897613, + "grad_norm": 0.34115272528083296, + "learning_rate": 0.00012623593292358383, + "loss": 0.9771, + "step": 12256 + }, + { + "epoch": 2.614547781569966, + "grad_norm": 0.3244104079852555, + "learning_rate": 0.00012622898807416427, + "loss": 0.9421, + "step": 12257 + }, + { + "epoch": 2.6147610921501707, + "grad_norm": 0.3487845131100901, + "learning_rate": 0.0001262220427016629, + "loss": 0.9772, + "step": 12258 + }, + { + "epoch": 2.6149744027303754, + "grad_norm": 0.40858944045672635, + "learning_rate": 0.00012621509680615838, + "loss": 0.9304, + "step": 12259 + }, + { + "epoch": 2.61518771331058, + "grad_norm": 0.36634040201515655, + "learning_rate": 0.00012620815038772926, + "loss": 0.9286, + "step": 12260 + }, + { + "epoch": 2.6154010238907848, + "grad_norm": 0.32828930454637734, + "learning_rate": 0.00012620120344645412, + "loss": 0.925, + "step": 12261 + }, + { + "epoch": 2.61561433447099, + "grad_norm": 0.2671193773756538, + "learning_rate": 0.00012619425598241158, + "loss": 0.9067, + "step": 12262 + }, + { + "epoch": 2.6158276450511946, + "grad_norm": 3.725425421420253, + "learning_rate": 0.00012618730799568027, + "loss": 0.9717, + "step": 12263 + }, + { + "epoch": 2.6160409556313993, + "grad_norm": 0.4197835640677974, + "learning_rate": 0.00012618035948633882, + "loss": 0.93, + "step": 12264 + }, + { + "epoch": 2.616254266211604, + "grad_norm": 0.5082788224629248, + "learning_rate": 0.0001261734104544658, + "loss": 0.9543, + "step": 12265 + }, + { + "epoch": 2.616467576791809, + "grad_norm": 0.43450243186666754, + "learning_rate": 0.00012616646090013987, + "loss": 0.9031, + "step": 12266 + }, + { + "epoch": 2.6166808873720138, + "grad_norm": 0.3890053197110837, + "learning_rate": 0.00012615951082343966, + "loss": 0.9419, + "step": 12267 + }, + { + "epoch": 2.6168941979522184, + "grad_norm": 0.4039640695720074, + "learning_rate": 0.00012615256022444377, + "loss": 0.9511, + "step": 12268 + }, + { + "epoch": 2.617107508532423, + "grad_norm": 0.4000624036127029, + "learning_rate": 0.00012614560910323094, + "loss": 0.9274, + "step": 12269 + }, + { + "epoch": 2.617320819112628, + "grad_norm": 0.3934150857463931, + "learning_rate": 0.00012613865745987973, + "loss": 0.95, + "step": 12270 + }, + { + "epoch": 2.6175341296928325, + "grad_norm": 0.45229352612514706, + "learning_rate": 0.00012613170529446885, + "loss": 0.9327, + "step": 12271 + }, + { + "epoch": 2.6177474402730376, + "grad_norm": 0.4075002677060599, + "learning_rate": 0.00012612475260707697, + "loss": 0.9116, + "step": 12272 + }, + { + "epoch": 2.6179607508532423, + "grad_norm": 0.41308902792970137, + "learning_rate": 0.0001261177993977827, + "loss": 0.9176, + "step": 12273 + }, + { + "epoch": 2.618174061433447, + "grad_norm": 0.45416683624534915, + "learning_rate": 0.00012611084566666478, + "loss": 0.8991, + "step": 12274 + }, + { + "epoch": 2.618387372013652, + "grad_norm": 0.43912996175925306, + "learning_rate": 0.00012610389141380187, + "loss": 0.9333, + "step": 12275 + }, + { + "epoch": 2.618600682593857, + "grad_norm": 0.3513414330184812, + "learning_rate": 0.00012609693663927262, + "loss": 0.9124, + "step": 12276 + }, + { + "epoch": 2.6188139931740615, + "grad_norm": 0.552978007984519, + "learning_rate": 0.0001260899813431558, + "loss": 0.9799, + "step": 12277 + }, + { + "epoch": 2.619027303754266, + "grad_norm": 0.6044914422076312, + "learning_rate": 0.00012608302552553002, + "loss": 0.9589, + "step": 12278 + }, + { + "epoch": 2.619240614334471, + "grad_norm": 0.39828524261446574, + "learning_rate": 0.00012607606918647404, + "loss": 0.9037, + "step": 12279 + }, + { + "epoch": 2.6194539249146755, + "grad_norm": 0.39748501061831554, + "learning_rate": 0.00012606911232606658, + "loss": 0.9449, + "step": 12280 + }, + { + "epoch": 2.6196672354948807, + "grad_norm": 0.5252958271484536, + "learning_rate": 0.00012606215494438634, + "loss": 0.9598, + "step": 12281 + }, + { + "epoch": 2.6198805460750854, + "grad_norm": 0.4430871346942114, + "learning_rate": 0.00012605519704151204, + "loss": 0.9168, + "step": 12282 + }, + { + "epoch": 2.62009385665529, + "grad_norm": 0.332984834350766, + "learning_rate": 0.0001260482386175224, + "loss": 0.9412, + "step": 12283 + }, + { + "epoch": 2.6203071672354947, + "grad_norm": 0.40575593699158613, + "learning_rate": 0.00012604127967249618, + "loss": 0.9494, + "step": 12284 + }, + { + "epoch": 2.6205204778157, + "grad_norm": 0.31371015064109947, + "learning_rate": 0.0001260343202065121, + "loss": 0.8967, + "step": 12285 + }, + { + "epoch": 2.6207337883959045, + "grad_norm": 0.2848843941179191, + "learning_rate": 0.00012602736021964891, + "loss": 0.9486, + "step": 12286 + }, + { + "epoch": 2.6209470989761092, + "grad_norm": 0.33737901671049264, + "learning_rate": 0.00012602039971198537, + "loss": 0.9433, + "step": 12287 + }, + { + "epoch": 2.621160409556314, + "grad_norm": 0.3353819664383567, + "learning_rate": 0.00012601343868360023, + "loss": 0.9458, + "step": 12288 + }, + { + "epoch": 2.6213737201365186, + "grad_norm": 0.41043652878448866, + "learning_rate": 0.00012600647713457226, + "loss": 0.9445, + "step": 12289 + }, + { + "epoch": 2.6215870307167233, + "grad_norm": 0.3521047763038561, + "learning_rate": 0.0001259995150649802, + "loss": 0.9216, + "step": 12290 + }, + { + "epoch": 2.6218003412969284, + "grad_norm": 0.2423143659980595, + "learning_rate": 0.00012599255247490289, + "loss": 0.9209, + "step": 12291 + }, + { + "epoch": 2.622013651877133, + "grad_norm": 0.3261268277622108, + "learning_rate": 0.00012598558936441906, + "loss": 0.933, + "step": 12292 + }, + { + "epoch": 2.6222269624573378, + "grad_norm": 0.3926714091371437, + "learning_rate": 0.0001259786257336075, + "loss": 0.9714, + "step": 12293 + }, + { + "epoch": 2.622440273037543, + "grad_norm": 0.30211035230727007, + "learning_rate": 0.00012597166158254705, + "loss": 0.9151, + "step": 12294 + }, + { + "epoch": 2.6226535836177476, + "grad_norm": 0.3124755460052665, + "learning_rate": 0.00012596469691131645, + "loss": 0.9215, + "step": 12295 + }, + { + "epoch": 2.6228668941979523, + "grad_norm": 0.3500879276729579, + "learning_rate": 0.00012595773171999455, + "loss": 0.9189, + "step": 12296 + }, + { + "epoch": 2.623080204778157, + "grad_norm": 0.2510244700835439, + "learning_rate": 0.00012595076600866012, + "loss": 0.9286, + "step": 12297 + }, + { + "epoch": 2.6232935153583616, + "grad_norm": 0.25748242976615343, + "learning_rate": 0.00012594379977739202, + "loss": 0.9173, + "step": 12298 + }, + { + "epoch": 2.6235068259385663, + "grad_norm": 0.2803658122727754, + "learning_rate": 0.00012593683302626905, + "loss": 0.9341, + "step": 12299 + }, + { + "epoch": 2.6237201365187715, + "grad_norm": 0.22486413400299068, + "learning_rate": 0.00012592986575537003, + "loss": 0.9871, + "step": 12300 + }, + { + "epoch": 2.623933447098976, + "grad_norm": 0.3208306640710386, + "learning_rate": 0.0001259228979647738, + "loss": 0.9284, + "step": 12301 + }, + { + "epoch": 2.624146757679181, + "grad_norm": 0.29178219000228023, + "learning_rate": 0.00012591592965455922, + "loss": 0.9289, + "step": 12302 + }, + { + "epoch": 2.6243600682593855, + "grad_norm": 0.250979334786302, + "learning_rate": 0.00012590896082480512, + "loss": 0.9448, + "step": 12303 + }, + { + "epoch": 2.6245733788395906, + "grad_norm": 0.30459777398627536, + "learning_rate": 0.00012590199147559036, + "loss": 0.9413, + "step": 12304 + }, + { + "epoch": 2.6247866894197953, + "grad_norm": 0.21162753043860372, + "learning_rate": 0.0001258950216069938, + "loss": 0.9193, + "step": 12305 + }, + { + "epoch": 2.625, + "grad_norm": 0.25765029060373673, + "learning_rate": 0.00012588805121909429, + "loss": 0.9669, + "step": 12306 + }, + { + "epoch": 2.6252133105802047, + "grad_norm": 0.2944349173543292, + "learning_rate": 0.00012588108031197072, + "loss": 0.9213, + "step": 12307 + }, + { + "epoch": 2.6254266211604094, + "grad_norm": 0.28088770286145026, + "learning_rate": 0.00012587410888570194, + "loss": 0.9252, + "step": 12308 + }, + { + "epoch": 2.6256399317406145, + "grad_norm": 0.27726432963208153, + "learning_rate": 0.00012586713694036684, + "loss": 0.9505, + "step": 12309 + }, + { + "epoch": 2.625853242320819, + "grad_norm": 0.22109742155839252, + "learning_rate": 0.00012586016447604435, + "loss": 0.9565, + "step": 12310 + }, + { + "epoch": 2.626066552901024, + "grad_norm": 0.27334364760762325, + "learning_rate": 0.00012585319149281327, + "loss": 0.9426, + "step": 12311 + }, + { + "epoch": 2.6262798634812285, + "grad_norm": 0.25825254967034167, + "learning_rate": 0.00012584621799075258, + "loss": 0.942, + "step": 12312 + }, + { + "epoch": 2.6264931740614337, + "grad_norm": 0.270637129325381, + "learning_rate": 0.0001258392439699412, + "loss": 0.9399, + "step": 12313 + }, + { + "epoch": 2.6267064846416384, + "grad_norm": 0.3431063939782949, + "learning_rate": 0.000125832269430458, + "loss": 0.9727, + "step": 12314 + }, + { + "epoch": 2.626919795221843, + "grad_norm": 0.25737969124428983, + "learning_rate": 0.00012582529437238184, + "loss": 0.9671, + "step": 12315 + }, + { + "epoch": 2.6271331058020477, + "grad_norm": 0.3074290948717283, + "learning_rate": 0.00012581831879579175, + "loss": 0.9324, + "step": 12316 + }, + { + "epoch": 2.6273464163822524, + "grad_norm": 0.3448535247248076, + "learning_rate": 0.0001258113427007666, + "loss": 0.9518, + "step": 12317 + }, + { + "epoch": 2.627559726962457, + "grad_norm": 0.262450320539937, + "learning_rate": 0.00012580436608738534, + "loss": 0.9462, + "step": 12318 + }, + { + "epoch": 2.6277730375426622, + "grad_norm": 0.26883631563676197, + "learning_rate": 0.0001257973889557269, + "loss": 0.9216, + "step": 12319 + }, + { + "epoch": 2.627986348122867, + "grad_norm": 0.32103768942633415, + "learning_rate": 0.00012579041130587022, + "loss": 0.9298, + "step": 12320 + }, + { + "epoch": 2.6281996587030716, + "grad_norm": 0.25359333195925504, + "learning_rate": 0.00012578343313789428, + "loss": 0.9156, + "step": 12321 + }, + { + "epoch": 2.6284129692832767, + "grad_norm": 0.23674050065989, + "learning_rate": 0.00012577645445187806, + "loss": 0.9457, + "step": 12322 + }, + { + "epoch": 2.6286262798634814, + "grad_norm": 0.28019313988831035, + "learning_rate": 0.00012576947524790044, + "loss": 0.95, + "step": 12323 + }, + { + "epoch": 2.628839590443686, + "grad_norm": 0.3156150921498893, + "learning_rate": 0.00012576249552604047, + "loss": 0.9418, + "step": 12324 + }, + { + "epoch": 2.6290529010238908, + "grad_norm": 0.3255899204022272, + "learning_rate": 0.0001257555152863771, + "loss": 0.9196, + "step": 12325 + }, + { + "epoch": 2.6292662116040955, + "grad_norm": 0.404470336280633, + "learning_rate": 0.00012574853452898926, + "loss": 0.968, + "step": 12326 + }, + { + "epoch": 2.6294795221843, + "grad_norm": 0.43625192500405685, + "learning_rate": 0.00012574155325395598, + "loss": 0.9527, + "step": 12327 + }, + { + "epoch": 2.6296928327645053, + "grad_norm": 0.3934938442663803, + "learning_rate": 0.00012573457146135628, + "loss": 0.9453, + "step": 12328 + }, + { + "epoch": 2.62990614334471, + "grad_norm": 0.301983828280539, + "learning_rate": 0.00012572758915126915, + "loss": 0.9398, + "step": 12329 + }, + { + "epoch": 2.6301194539249146, + "grad_norm": 0.2540439470376791, + "learning_rate": 0.00012572060632377356, + "loss": 0.9398, + "step": 12330 + }, + { + "epoch": 2.6303327645051193, + "grad_norm": 0.23527137164273462, + "learning_rate": 0.00012571362297894855, + "loss": 0.9094, + "step": 12331 + }, + { + "epoch": 2.6305460750853245, + "grad_norm": 0.1888267689502102, + "learning_rate": 0.00012570663911687312, + "loss": 0.9294, + "step": 12332 + }, + { + "epoch": 2.630759385665529, + "grad_norm": 0.2573713124076544, + "learning_rate": 0.0001256996547376263, + "loss": 0.9177, + "step": 12333 + }, + { + "epoch": 2.630972696245734, + "grad_norm": 0.2978051311355939, + "learning_rate": 0.00012569266984128712, + "loss": 0.9389, + "step": 12334 + }, + { + "epoch": 2.6311860068259385, + "grad_norm": 0.32608814478800635, + "learning_rate": 0.0001256856844279346, + "loss": 0.9214, + "step": 12335 + }, + { + "epoch": 2.631399317406143, + "grad_norm": 0.3648094594032076, + "learning_rate": 0.00012567869849764785, + "loss": 0.9363, + "step": 12336 + }, + { + "epoch": 2.631612627986348, + "grad_norm": 0.3390500204636862, + "learning_rate": 0.0001256717120505058, + "loss": 0.8984, + "step": 12337 + }, + { + "epoch": 2.631825938566553, + "grad_norm": 0.28495513168930403, + "learning_rate": 0.00012566472508658755, + "loss": 0.9499, + "step": 12338 + }, + { + "epoch": 2.6320392491467577, + "grad_norm": 0.2594016199592134, + "learning_rate": 0.0001256577376059722, + "loss": 0.9246, + "step": 12339 + }, + { + "epoch": 2.6322525597269624, + "grad_norm": 0.2195262192928744, + "learning_rate": 0.00012565074960873876, + "loss": 0.9233, + "step": 12340 + }, + { + "epoch": 2.6324658703071675, + "grad_norm": 0.19469274288962654, + "learning_rate": 0.00012564376109496636, + "loss": 0.9303, + "step": 12341 + }, + { + "epoch": 2.632679180887372, + "grad_norm": 0.21118042035842305, + "learning_rate": 0.000125636772064734, + "loss": 0.9193, + "step": 12342 + }, + { + "epoch": 2.632892491467577, + "grad_norm": 0.20365476822748987, + "learning_rate": 0.00012562978251812082, + "loss": 0.8944, + "step": 12343 + }, + { + "epoch": 2.6331058020477816, + "grad_norm": 0.22317260924784388, + "learning_rate": 0.00012562279245520582, + "loss": 0.9799, + "step": 12344 + }, + { + "epoch": 2.6333191126279862, + "grad_norm": 0.22501672359443312, + "learning_rate": 0.00012561580187606824, + "loss": 0.9326, + "step": 12345 + }, + { + "epoch": 2.633532423208191, + "grad_norm": 0.21438320414343198, + "learning_rate": 0.00012560881078078703, + "loss": 0.9472, + "step": 12346 + }, + { + "epoch": 2.633745733788396, + "grad_norm": 0.23298899790017438, + "learning_rate": 0.0001256018191694414, + "loss": 0.921, + "step": 12347 + }, + { + "epoch": 2.6339590443686007, + "grad_norm": 0.2221991882475289, + "learning_rate": 0.0001255948270421104, + "loss": 0.9742, + "step": 12348 + }, + { + "epoch": 2.6341723549488054, + "grad_norm": 0.237715020354412, + "learning_rate": 0.00012558783439887316, + "loss": 0.946, + "step": 12349 + }, + { + "epoch": 2.63438566552901, + "grad_norm": 0.29467160687784866, + "learning_rate": 0.00012558084123980882, + "loss": 0.9213, + "step": 12350 + }, + { + "epoch": 2.6345989761092152, + "grad_norm": 0.36895974396405956, + "learning_rate": 0.00012557384756499648, + "loss": 0.9748, + "step": 12351 + }, + { + "epoch": 2.63481228668942, + "grad_norm": 0.37682543812589564, + "learning_rate": 0.00012556685337451534, + "loss": 0.9354, + "step": 12352 + }, + { + "epoch": 2.6350255972696246, + "grad_norm": 0.36004035278808766, + "learning_rate": 0.00012555985866844442, + "loss": 0.9405, + "step": 12353 + }, + { + "epoch": 2.6352389078498293, + "grad_norm": 0.32213790188701474, + "learning_rate": 0.00012555286344686297, + "loss": 0.9228, + "step": 12354 + }, + { + "epoch": 2.635452218430034, + "grad_norm": 0.21813478364630648, + "learning_rate": 0.0001255458677098501, + "loss": 0.9032, + "step": 12355 + }, + { + "epoch": 2.6356655290102387, + "grad_norm": 0.20097359215258231, + "learning_rate": 0.00012553887145748494, + "loss": 0.9512, + "step": 12356 + }, + { + "epoch": 2.635878839590444, + "grad_norm": 0.24912352347769406, + "learning_rate": 0.00012553187468984672, + "loss": 0.9339, + "step": 12357 + }, + { + "epoch": 2.6360921501706485, + "grad_norm": 0.3292382818779507, + "learning_rate": 0.00012552487740701457, + "loss": 0.9497, + "step": 12358 + }, + { + "epoch": 2.636305460750853, + "grad_norm": 0.3716814641634042, + "learning_rate": 0.00012551787960906767, + "loss": 0.9341, + "step": 12359 + }, + { + "epoch": 2.6365187713310583, + "grad_norm": 0.38172998973866296, + "learning_rate": 0.00012551088129608516, + "loss": 0.9564, + "step": 12360 + }, + { + "epoch": 2.636732081911263, + "grad_norm": 0.35141839472078396, + "learning_rate": 0.00012550388246814628, + "loss": 0.9087, + "step": 12361 + }, + { + "epoch": 2.6369453924914676, + "grad_norm": 0.2701242143308738, + "learning_rate": 0.00012549688312533022, + "loss": 0.9463, + "step": 12362 + }, + { + "epoch": 2.6371587030716723, + "grad_norm": 0.24142197921875025, + "learning_rate": 0.00012548988326771616, + "loss": 0.9254, + "step": 12363 + }, + { + "epoch": 2.637372013651877, + "grad_norm": 0.26240801643118555, + "learning_rate": 0.0001254828828953833, + "loss": 0.9121, + "step": 12364 + }, + { + "epoch": 2.6375853242320817, + "grad_norm": 0.2579079076001558, + "learning_rate": 0.00012547588200841083, + "loss": 0.9167, + "step": 12365 + }, + { + "epoch": 2.637798634812287, + "grad_norm": 0.23139522358807574, + "learning_rate": 0.00012546888060687803, + "loss": 0.9332, + "step": 12366 + }, + { + "epoch": 2.6380119453924915, + "grad_norm": 0.2407718503815088, + "learning_rate": 0.00012546187869086407, + "loss": 0.9195, + "step": 12367 + }, + { + "epoch": 2.638225255972696, + "grad_norm": 0.2646349524747919, + "learning_rate": 0.00012545487626044818, + "loss": 0.9335, + "step": 12368 + }, + { + "epoch": 2.638438566552901, + "grad_norm": 0.26686910558518123, + "learning_rate": 0.0001254478733157096, + "loss": 0.963, + "step": 12369 + }, + { + "epoch": 2.638651877133106, + "grad_norm": 0.25755880238931605, + "learning_rate": 0.00012544086985672757, + "loss": 0.9415, + "step": 12370 + }, + { + "epoch": 2.6388651877133107, + "grad_norm": 0.2637930431777884, + "learning_rate": 0.00012543386588358132, + "loss": 0.9742, + "step": 12371 + }, + { + "epoch": 2.6390784982935154, + "grad_norm": 0.2712621211198487, + "learning_rate": 0.00012542686139635012, + "loss": 0.9234, + "step": 12372 + }, + { + "epoch": 2.63929180887372, + "grad_norm": 0.21002850533349104, + "learning_rate": 0.00012541985639511326, + "loss": 0.9475, + "step": 12373 + }, + { + "epoch": 2.6395051194539247, + "grad_norm": 0.20679585582997814, + "learning_rate": 0.00012541285087994992, + "loss": 0.9245, + "step": 12374 + }, + { + "epoch": 2.6397184300341294, + "grad_norm": 0.22093099949330447, + "learning_rate": 0.0001254058448509394, + "loss": 0.9491, + "step": 12375 + }, + { + "epoch": 2.6399317406143346, + "grad_norm": 0.23711460409952886, + "learning_rate": 0.00012539883830816102, + "loss": 0.9503, + "step": 12376 + }, + { + "epoch": 2.6401450511945392, + "grad_norm": 0.2877468119521193, + "learning_rate": 0.000125391831251694, + "loss": 0.9266, + "step": 12377 + }, + { + "epoch": 2.640358361774744, + "grad_norm": 0.3043722989683867, + "learning_rate": 0.00012538482368161763, + "loss": 0.926, + "step": 12378 + }, + { + "epoch": 2.640571672354949, + "grad_norm": 0.2821939664278274, + "learning_rate": 0.00012537781559801125, + "loss": 0.909, + "step": 12379 + }, + { + "epoch": 2.6407849829351537, + "grad_norm": 0.2701431390365949, + "learning_rate": 0.00012537080700095413, + "loss": 0.9731, + "step": 12380 + }, + { + "epoch": 2.6409982935153584, + "grad_norm": 0.28508004295952294, + "learning_rate": 0.0001253637978905255, + "loss": 0.9061, + "step": 12381 + }, + { + "epoch": 2.641211604095563, + "grad_norm": 0.24745459900469283, + "learning_rate": 0.0001253567882668048, + "loss": 0.9345, + "step": 12382 + }, + { + "epoch": 2.641424914675768, + "grad_norm": 0.2309347868313843, + "learning_rate": 0.00012534977812987124, + "loss": 0.9697, + "step": 12383 + }, + { + "epoch": 2.6416382252559725, + "grad_norm": 0.24202824697435638, + "learning_rate": 0.00012534276747980418, + "loss": 0.9659, + "step": 12384 + }, + { + "epoch": 2.6418515358361776, + "grad_norm": 0.2573932988928378, + "learning_rate": 0.00012533575631668297, + "loss": 0.953, + "step": 12385 + }, + { + "epoch": 2.6420648464163823, + "grad_norm": 0.2762162681072106, + "learning_rate": 0.00012532874464058692, + "loss": 0.9191, + "step": 12386 + }, + { + "epoch": 2.642278156996587, + "grad_norm": 0.3010867674828086, + "learning_rate": 0.00012532173245159532, + "loss": 0.9171, + "step": 12387 + }, + { + "epoch": 2.6424914675767917, + "grad_norm": 0.29495552919783047, + "learning_rate": 0.0001253147197497876, + "loss": 0.9376, + "step": 12388 + }, + { + "epoch": 2.642704778156997, + "grad_norm": 0.27969129413225935, + "learning_rate": 0.000125307706535243, + "loss": 0.9116, + "step": 12389 + }, + { + "epoch": 2.6429180887372015, + "grad_norm": 0.23107014156568528, + "learning_rate": 0.000125300692808041, + "loss": 0.9097, + "step": 12390 + }, + { + "epoch": 2.643131399317406, + "grad_norm": 0.25388492367005167, + "learning_rate": 0.00012529367856826086, + "loss": 0.915, + "step": 12391 + }, + { + "epoch": 2.643344709897611, + "grad_norm": 0.2455361440872667, + "learning_rate": 0.000125286663815982, + "loss": 0.9193, + "step": 12392 + }, + { + "epoch": 2.6435580204778155, + "grad_norm": 0.2604291215627497, + "learning_rate": 0.00012527964855128378, + "loss": 0.9177, + "step": 12393 + }, + { + "epoch": 2.64377133105802, + "grad_norm": 0.2702661135558253, + "learning_rate": 0.00012527263277424557, + "loss": 0.9262, + "step": 12394 + }, + { + "epoch": 2.6439846416382253, + "grad_norm": 0.23389569765923407, + "learning_rate": 0.00012526561648494678, + "loss": 0.9295, + "step": 12395 + }, + { + "epoch": 2.64419795221843, + "grad_norm": 0.20215562622395344, + "learning_rate": 0.00012525859968346676, + "loss": 0.9188, + "step": 12396 + }, + { + "epoch": 2.6444112627986347, + "grad_norm": 0.21167986323047774, + "learning_rate": 0.00012525158236988493, + "loss": 0.9339, + "step": 12397 + }, + { + "epoch": 2.64462457337884, + "grad_norm": 0.2322723179229144, + "learning_rate": 0.00012524456454428068, + "loss": 0.9118, + "step": 12398 + }, + { + "epoch": 2.6448378839590445, + "grad_norm": 0.20101670548600387, + "learning_rate": 0.00012523754620673343, + "loss": 0.9251, + "step": 12399 + }, + { + "epoch": 2.645051194539249, + "grad_norm": 0.2788276998317997, + "learning_rate": 0.00012523052735732254, + "loss": 0.9411, + "step": 12400 + }, + { + "epoch": 2.645264505119454, + "grad_norm": 0.22010390608986738, + "learning_rate": 0.00012522350799612752, + "loss": 0.9154, + "step": 12401 + }, + { + "epoch": 2.6454778156996586, + "grad_norm": 0.25584464226333764, + "learning_rate": 0.00012521648812322773, + "loss": 0.9165, + "step": 12402 + }, + { + "epoch": 2.6456911262798632, + "grad_norm": 0.2916485559003302, + "learning_rate": 0.00012520946773870263, + "loss": 0.939, + "step": 12403 + }, + { + "epoch": 2.6459044368600684, + "grad_norm": 0.31187044005066783, + "learning_rate": 0.00012520244684263162, + "loss": 0.8875, + "step": 12404 + }, + { + "epoch": 2.646117747440273, + "grad_norm": 0.36862385605607845, + "learning_rate": 0.00012519542543509417, + "loss": 0.9212, + "step": 12405 + }, + { + "epoch": 2.6463310580204777, + "grad_norm": 0.41170480493403755, + "learning_rate": 0.00012518840351616973, + "loss": 0.9463, + "step": 12406 + }, + { + "epoch": 2.6465443686006824, + "grad_norm": 0.40475662723061634, + "learning_rate": 0.00012518138108593774, + "loss": 0.9046, + "step": 12407 + }, + { + "epoch": 2.6467576791808876, + "grad_norm": 0.3769439001976804, + "learning_rate": 0.00012517435814447765, + "loss": 0.9418, + "step": 12408 + }, + { + "epoch": 2.6469709897610922, + "grad_norm": 0.34545178537372934, + "learning_rate": 0.00012516733469186898, + "loss": 0.9081, + "step": 12409 + }, + { + "epoch": 2.647184300341297, + "grad_norm": 0.2541762929192867, + "learning_rate": 0.00012516031072819112, + "loss": 0.9051, + "step": 12410 + }, + { + "epoch": 2.6473976109215016, + "grad_norm": 0.21370533192580432, + "learning_rate": 0.00012515328625352357, + "loss": 0.9288, + "step": 12411 + }, + { + "epoch": 2.6476109215017063, + "grad_norm": 0.34346216520453876, + "learning_rate": 0.00012514626126794586, + "loss": 0.9258, + "step": 12412 + }, + { + "epoch": 2.6478242320819114, + "grad_norm": 0.49315962993466056, + "learning_rate": 0.00012513923577153743, + "loss": 0.9064, + "step": 12413 + }, + { + "epoch": 2.648037542662116, + "grad_norm": 0.615054785441482, + "learning_rate": 0.0001251322097643778, + "loss": 0.9459, + "step": 12414 + }, + { + "epoch": 2.648250853242321, + "grad_norm": 0.6702864797486673, + "learning_rate": 0.00012512518324654646, + "loss": 0.9301, + "step": 12415 + }, + { + "epoch": 2.6484641638225255, + "grad_norm": 0.6117120870996128, + "learning_rate": 0.0001251181562181229, + "loss": 0.922, + "step": 12416 + }, + { + "epoch": 2.6486774744027306, + "grad_norm": 0.4954329832399747, + "learning_rate": 0.00012511112867918663, + "loss": 0.9633, + "step": 12417 + }, + { + "epoch": 2.6488907849829353, + "grad_norm": 0.9610737537731208, + "learning_rate": 0.0001251041006298172, + "loss": 0.9502, + "step": 12418 + }, + { + "epoch": 2.64910409556314, + "grad_norm": 0.30684431776428994, + "learning_rate": 0.0001250970720700941, + "loss": 0.959, + "step": 12419 + }, + { + "epoch": 2.6493174061433447, + "grad_norm": 0.2106543644779694, + "learning_rate": 0.0001250900430000969, + "loss": 0.9148, + "step": 12420 + }, + { + "epoch": 2.6495307167235493, + "grad_norm": 0.2873209689925407, + "learning_rate": 0.0001250830134199051, + "loss": 0.9692, + "step": 12421 + }, + { + "epoch": 2.649744027303754, + "grad_norm": 0.3823803682440244, + "learning_rate": 0.0001250759833295982, + "loss": 0.9383, + "step": 12422 + }, + { + "epoch": 2.649957337883959, + "grad_norm": 0.4614569026717563, + "learning_rate": 0.00012506895272925583, + "loss": 0.9282, + "step": 12423 + }, + { + "epoch": 2.650170648464164, + "grad_norm": 0.5543253403977197, + "learning_rate": 0.0001250619216189575, + "loss": 0.9052, + "step": 12424 + }, + { + "epoch": 2.6503839590443685, + "grad_norm": 0.6355043008747605, + "learning_rate": 0.00012505488999878275, + "loss": 0.9709, + "step": 12425 + }, + { + "epoch": 2.6505972696245736, + "grad_norm": 0.6589516529222288, + "learning_rate": 0.00012504785786881116, + "loss": 0.8987, + "step": 12426 + }, + { + "epoch": 2.6508105802047783, + "grad_norm": 0.5713280899723442, + "learning_rate": 0.00012504082522912232, + "loss": 0.9479, + "step": 12427 + }, + { + "epoch": 2.651023890784983, + "grad_norm": 0.3903031681298544, + "learning_rate": 0.00012503379207979578, + "loss": 0.9414, + "step": 12428 + }, + { + "epoch": 2.6512372013651877, + "grad_norm": 0.299745005639193, + "learning_rate": 0.00012502675842091112, + "loss": 0.9644, + "step": 12429 + }, + { + "epoch": 2.6514505119453924, + "grad_norm": 0.22811556233754762, + "learning_rate": 0.00012501972425254794, + "loss": 0.9537, + "step": 12430 + }, + { + "epoch": 2.651663822525597, + "grad_norm": 0.2348888813612583, + "learning_rate": 0.0001250126895747858, + "loss": 0.9243, + "step": 12431 + }, + { + "epoch": 2.651877133105802, + "grad_norm": 0.26785972116114076, + "learning_rate": 0.00012500565438770434, + "loss": 0.9541, + "step": 12432 + }, + { + "epoch": 2.652090443686007, + "grad_norm": 0.3060105103277771, + "learning_rate": 0.00012499861869138314, + "loss": 0.9262, + "step": 12433 + }, + { + "epoch": 2.6523037542662116, + "grad_norm": 0.2582525009904707, + "learning_rate": 0.00012499158248590182, + "loss": 0.9707, + "step": 12434 + }, + { + "epoch": 2.6525170648464163, + "grad_norm": 0.2409062289778554, + "learning_rate": 0.00012498454577133998, + "loss": 0.9444, + "step": 12435 + }, + { + "epoch": 2.6527303754266214, + "grad_norm": 0.2752550595803167, + "learning_rate": 0.00012497750854777726, + "loss": 0.9555, + "step": 12436 + }, + { + "epoch": 2.652943686006826, + "grad_norm": 0.3093620980310853, + "learning_rate": 0.00012497047081529324, + "loss": 0.971, + "step": 12437 + }, + { + "epoch": 2.6531569965870307, + "grad_norm": 0.290165581740937, + "learning_rate": 0.00012496343257396763, + "loss": 0.9135, + "step": 12438 + }, + { + "epoch": 2.6533703071672354, + "grad_norm": 0.29289022824081196, + "learning_rate": 0.00012495639382388, + "loss": 0.9465, + "step": 12439 + }, + { + "epoch": 2.65358361774744, + "grad_norm": 0.2984516793655465, + "learning_rate": 0.00012494935456511, + "loss": 0.935, + "step": 12440 + }, + { + "epoch": 2.653796928327645, + "grad_norm": 0.28692881856646946, + "learning_rate": 0.0001249423147977373, + "loss": 0.942, + "step": 12441 + }, + { + "epoch": 2.65401023890785, + "grad_norm": 0.30199346016376516, + "learning_rate": 0.00012493527452184163, + "loss": 0.9558, + "step": 12442 + }, + { + "epoch": 2.6542235494880546, + "grad_norm": 0.3016268544902186, + "learning_rate": 0.00012492823373750248, + "loss": 0.9178, + "step": 12443 + }, + { + "epoch": 2.6544368600682593, + "grad_norm": 0.28357090068895874, + "learning_rate": 0.00012492119244479966, + "loss": 0.9281, + "step": 12444 + }, + { + "epoch": 2.6546501706484644, + "grad_norm": 0.19991407497988498, + "learning_rate": 0.00012491415064381278, + "loss": 0.9115, + "step": 12445 + }, + { + "epoch": 2.654863481228669, + "grad_norm": 0.2689215269671157, + "learning_rate": 0.0001249071083346215, + "loss": 0.9081, + "step": 12446 + }, + { + "epoch": 2.655076791808874, + "grad_norm": 0.3295012736520861, + "learning_rate": 0.00012490006551730556, + "loss": 0.936, + "step": 12447 + }, + { + "epoch": 2.6552901023890785, + "grad_norm": 0.2582136341721106, + "learning_rate": 0.00012489302219194462, + "loss": 0.9272, + "step": 12448 + }, + { + "epoch": 2.655503412969283, + "grad_norm": 0.23503635952411459, + "learning_rate": 0.0001248859783586184, + "loss": 0.9365, + "step": 12449 + }, + { + "epoch": 2.655716723549488, + "grad_norm": 0.23083922812972835, + "learning_rate": 0.00012487893401740653, + "loss": 0.9282, + "step": 12450 + }, + { + "epoch": 2.655930034129693, + "grad_norm": 0.25657773811996826, + "learning_rate": 0.00012487188916838882, + "loss": 0.9421, + "step": 12451 + }, + { + "epoch": 2.6561433447098977, + "grad_norm": 0.2830704404132555, + "learning_rate": 0.00012486484381164488, + "loss": 0.9102, + "step": 12452 + }, + { + "epoch": 2.6563566552901023, + "grad_norm": 0.25360840191539263, + "learning_rate": 0.00012485779794725452, + "loss": 0.9368, + "step": 12453 + }, + { + "epoch": 2.656569965870307, + "grad_norm": 0.2941489553875743, + "learning_rate": 0.00012485075157529738, + "loss": 0.9458, + "step": 12454 + }, + { + "epoch": 2.656783276450512, + "grad_norm": 0.3670650216980979, + "learning_rate": 0.0001248437046958533, + "loss": 0.9403, + "step": 12455 + }, + { + "epoch": 2.656996587030717, + "grad_norm": 0.3798487023353847, + "learning_rate": 0.0001248366573090019, + "loss": 0.9124, + "step": 12456 + }, + { + "epoch": 2.6572098976109215, + "grad_norm": 0.3812997965441862, + "learning_rate": 0.00012482960941482293, + "loss": 0.9539, + "step": 12457 + }, + { + "epoch": 2.657423208191126, + "grad_norm": 0.27361283447531537, + "learning_rate": 0.00012482256101339624, + "loss": 0.9375, + "step": 12458 + }, + { + "epoch": 2.657636518771331, + "grad_norm": 0.20606917704748168, + "learning_rate": 0.0001248155121048015, + "loss": 0.8938, + "step": 12459 + }, + { + "epoch": 2.6578498293515356, + "grad_norm": 0.2324821295577272, + "learning_rate": 0.00012480846268911848, + "loss": 0.9672, + "step": 12460 + }, + { + "epoch": 2.6580631399317407, + "grad_norm": 0.20777277965914673, + "learning_rate": 0.00012480141276642695, + "loss": 0.9183, + "step": 12461 + }, + { + "epoch": 2.6582764505119454, + "grad_norm": 0.23168480780145623, + "learning_rate": 0.0001247943623368067, + "loss": 0.9169, + "step": 12462 + }, + { + "epoch": 2.65848976109215, + "grad_norm": 0.25605270792773366, + "learning_rate": 0.00012478731140033746, + "loss": 0.9274, + "step": 12463 + }, + { + "epoch": 2.658703071672355, + "grad_norm": 0.2541776968505324, + "learning_rate": 0.00012478025995709907, + "loss": 0.9321, + "step": 12464 + }, + { + "epoch": 2.65891638225256, + "grad_norm": 0.284291618061105, + "learning_rate": 0.00012477320800717125, + "loss": 0.9122, + "step": 12465 + }, + { + "epoch": 2.6591296928327646, + "grad_norm": 0.3224255757047545, + "learning_rate": 0.00012476615555063385, + "loss": 0.9325, + "step": 12466 + }, + { + "epoch": 2.6593430034129693, + "grad_norm": 0.32450359202405255, + "learning_rate": 0.00012475910258756665, + "loss": 0.9488, + "step": 12467 + }, + { + "epoch": 2.659556313993174, + "grad_norm": 0.29093082825156275, + "learning_rate": 0.00012475204911804942, + "loss": 0.9276, + "step": 12468 + }, + { + "epoch": 2.6597696245733786, + "grad_norm": 0.32372751618780704, + "learning_rate": 0.00012474499514216202, + "loss": 0.9428, + "step": 12469 + }, + { + "epoch": 2.6599829351535837, + "grad_norm": 0.3249939737382578, + "learning_rate": 0.00012473794065998427, + "loss": 0.9169, + "step": 12470 + }, + { + "epoch": 2.6601962457337884, + "grad_norm": 0.23443309116334943, + "learning_rate": 0.00012473088567159594, + "loss": 0.9268, + "step": 12471 + }, + { + "epoch": 2.660409556313993, + "grad_norm": 0.2775595184073488, + "learning_rate": 0.00012472383017707688, + "loss": 0.9095, + "step": 12472 + }, + { + "epoch": 2.660622866894198, + "grad_norm": 0.308148182355227, + "learning_rate": 0.00012471677417650698, + "loss": 0.9418, + "step": 12473 + }, + { + "epoch": 2.660836177474403, + "grad_norm": 0.2847907250106912, + "learning_rate": 0.00012470971766996596, + "loss": 0.9399, + "step": 12474 + }, + { + "epoch": 2.6610494880546076, + "grad_norm": 0.27315219123820605, + "learning_rate": 0.00012470266065753378, + "loss": 0.9667, + "step": 12475 + }, + { + "epoch": 2.6612627986348123, + "grad_norm": 0.26889767365808015, + "learning_rate": 0.00012469560313929025, + "loss": 0.9363, + "step": 12476 + }, + { + "epoch": 2.661476109215017, + "grad_norm": 0.20947753703137045, + "learning_rate": 0.00012468854511531516, + "loss": 0.9359, + "step": 12477 + }, + { + "epoch": 2.6616894197952217, + "grad_norm": 0.22029918813393998, + "learning_rate": 0.00012468148658568848, + "loss": 0.9041, + "step": 12478 + }, + { + "epoch": 2.6619027303754264, + "grad_norm": 0.2901763474402604, + "learning_rate": 0.00012467442755049, + "loss": 0.9166, + "step": 12479 + }, + { + "epoch": 2.6621160409556315, + "grad_norm": 0.26081572534176567, + "learning_rate": 0.00012466736800979965, + "loss": 0.9036, + "step": 12480 + }, + { + "epoch": 2.662329351535836, + "grad_norm": 0.1920504864718953, + "learning_rate": 0.00012466030796369726, + "loss": 0.9646, + "step": 12481 + }, + { + "epoch": 2.662542662116041, + "grad_norm": 0.2575118620333784, + "learning_rate": 0.00012465324741226273, + "loss": 0.9281, + "step": 12482 + }, + { + "epoch": 2.662755972696246, + "grad_norm": 0.27399100402411275, + "learning_rate": 0.00012464618635557598, + "loss": 0.943, + "step": 12483 + }, + { + "epoch": 2.6629692832764507, + "grad_norm": 0.28041784881091125, + "learning_rate": 0.00012463912479371687, + "loss": 0.9288, + "step": 12484 + }, + { + "epoch": 2.6631825938566553, + "grad_norm": 0.8131413924655891, + "learning_rate": 0.0001246320627267653, + "loss": 0.9808, + "step": 12485 + }, + { + "epoch": 2.66339590443686, + "grad_norm": 0.2953765671004832, + "learning_rate": 0.00012462500015480117, + "loss": 0.9094, + "step": 12486 + }, + { + "epoch": 2.6636092150170647, + "grad_norm": 0.3639691000571155, + "learning_rate": 0.00012461793707790447, + "loss": 0.9266, + "step": 12487 + }, + { + "epoch": 2.6638225255972694, + "grad_norm": 0.4052373842959333, + "learning_rate": 0.00012461087349615503, + "loss": 0.9191, + "step": 12488 + }, + { + "epoch": 2.6640358361774745, + "grad_norm": 0.38219643650565316, + "learning_rate": 0.0001246038094096328, + "loss": 0.9595, + "step": 12489 + }, + { + "epoch": 2.664249146757679, + "grad_norm": 0.31144673445181004, + "learning_rate": 0.00012459674481841773, + "loss": 0.9355, + "step": 12490 + }, + { + "epoch": 2.664462457337884, + "grad_norm": 0.23699609004110966, + "learning_rate": 0.00012458967972258972, + "loss": 0.9305, + "step": 12491 + }, + { + "epoch": 2.6646757679180886, + "grad_norm": 0.3280267477238189, + "learning_rate": 0.00012458261412222875, + "loss": 0.933, + "step": 12492 + }, + { + "epoch": 2.6648890784982937, + "grad_norm": 0.38137509278599585, + "learning_rate": 0.00012457554801741477, + "loss": 0.9296, + "step": 12493 + }, + { + "epoch": 2.6651023890784984, + "grad_norm": 0.47284460712593435, + "learning_rate": 0.00012456848140822768, + "loss": 0.9273, + "step": 12494 + }, + { + "epoch": 2.665315699658703, + "grad_norm": 0.5294732907875913, + "learning_rate": 0.0001245614142947475, + "loss": 0.9306, + "step": 12495 + }, + { + "epoch": 2.6655290102389078, + "grad_norm": 0.5316760132966623, + "learning_rate": 0.00012455434667705417, + "loss": 0.9482, + "step": 12496 + }, + { + "epoch": 2.6657423208191124, + "grad_norm": 0.47309443380973343, + "learning_rate": 0.00012454727855522763, + "loss": 0.9267, + "step": 12497 + }, + { + "epoch": 2.6659556313993176, + "grad_norm": 0.42397098971826175, + "learning_rate": 0.00012454020992934788, + "loss": 0.9673, + "step": 12498 + }, + { + "epoch": 2.6661689419795223, + "grad_norm": 0.4615699823972759, + "learning_rate": 0.00012453314079949494, + "loss": 0.949, + "step": 12499 + }, + { + "epoch": 2.666382252559727, + "grad_norm": 0.43895102367294253, + "learning_rate": 0.00012452607116574872, + "loss": 0.9535, + "step": 12500 + }, + { + "epoch": 2.6665955631399316, + "grad_norm": 0.3842258378462554, + "learning_rate": 0.0001245190010281893, + "loss": 0.9353, + "step": 12501 + }, + { + "epoch": 2.6668088737201368, + "grad_norm": 0.20763804661667928, + "learning_rate": 0.0001245119303868966, + "loss": 0.9359, + "step": 12502 + }, + { + "epoch": 2.6670221843003414, + "grad_norm": 0.25225096560909205, + "learning_rate": 0.00012450485924195067, + "loss": 0.9563, + "step": 12503 + }, + { + "epoch": 2.667235494880546, + "grad_norm": 0.2735535119759498, + "learning_rate": 0.00012449778759343152, + "loss": 0.9168, + "step": 12504 + }, + { + "epoch": 2.667448805460751, + "grad_norm": 0.28768554817539543, + "learning_rate": 0.00012449071544141916, + "loss": 0.9182, + "step": 12505 + }, + { + "epoch": 2.6676621160409555, + "grad_norm": 0.2798360133678195, + "learning_rate": 0.0001244836427859936, + "loss": 0.9558, + "step": 12506 + }, + { + "epoch": 2.66787542662116, + "grad_norm": 0.3587775919104731, + "learning_rate": 0.00012447656962723486, + "loss": 0.9377, + "step": 12507 + }, + { + "epoch": 2.6680887372013653, + "grad_norm": 0.24823437023008427, + "learning_rate": 0.00012446949596522298, + "loss": 0.9434, + "step": 12508 + }, + { + "epoch": 2.66830204778157, + "grad_norm": 0.28160389519447543, + "learning_rate": 0.00012446242180003806, + "loss": 0.9111, + "step": 12509 + }, + { + "epoch": 2.6685153583617747, + "grad_norm": 0.2196773950610746, + "learning_rate": 0.00012445534713176004, + "loss": 0.9117, + "step": 12510 + }, + { + "epoch": 2.66872866894198, + "grad_norm": 0.2324075488236247, + "learning_rate": 0.00012444827196046902, + "loss": 0.9373, + "step": 12511 + }, + { + "epoch": 2.6689419795221845, + "grad_norm": 0.24553757472076645, + "learning_rate": 0.00012444119628624508, + "loss": 0.9223, + "step": 12512 + }, + { + "epoch": 2.669155290102389, + "grad_norm": 0.22339593799732652, + "learning_rate": 0.0001244341201091682, + "loss": 0.9382, + "step": 12513 + }, + { + "epoch": 2.669368600682594, + "grad_norm": 0.23834916165658893, + "learning_rate": 0.00012442704342931855, + "loss": 0.9306, + "step": 12514 + }, + { + "epoch": 2.6695819112627985, + "grad_norm": 0.22581045265611718, + "learning_rate": 0.00012441996624677616, + "loss": 0.9035, + "step": 12515 + }, + { + "epoch": 2.669795221843003, + "grad_norm": 0.2531041106849521, + "learning_rate": 0.0001244128885616211, + "loss": 0.9501, + "step": 12516 + }, + { + "epoch": 2.6700085324232083, + "grad_norm": 0.2683870078248021, + "learning_rate": 0.00012440581037393347, + "loss": 0.9459, + "step": 12517 + }, + { + "epoch": 2.670221843003413, + "grad_norm": 0.274373567686739, + "learning_rate": 0.00012439873168379334, + "loss": 0.9339, + "step": 12518 + }, + { + "epoch": 2.6704351535836177, + "grad_norm": 0.209461758633561, + "learning_rate": 0.0001243916524912808, + "loss": 0.9454, + "step": 12519 + }, + { + "epoch": 2.6706484641638224, + "grad_norm": 0.21075744651481698, + "learning_rate": 0.00012438457279647597, + "loss": 0.9216, + "step": 12520 + }, + { + "epoch": 2.6708617747440275, + "grad_norm": 0.28373204708768895, + "learning_rate": 0.00012437749259945895, + "loss": 0.9545, + "step": 12521 + }, + { + "epoch": 2.671075085324232, + "grad_norm": 0.2613468106691294, + "learning_rate": 0.00012437041190030987, + "loss": 0.9272, + "step": 12522 + }, + { + "epoch": 2.671288395904437, + "grad_norm": 0.22307715183591534, + "learning_rate": 0.0001243633306991088, + "loss": 0.9495, + "step": 12523 + }, + { + "epoch": 2.6715017064846416, + "grad_norm": 0.29383059194079275, + "learning_rate": 0.00012435624899593593, + "loss": 0.9533, + "step": 12524 + }, + { + "epoch": 2.6717150170648463, + "grad_norm": 0.269566888316837, + "learning_rate": 0.00012434916679087134, + "loss": 0.9114, + "step": 12525 + }, + { + "epoch": 2.671928327645051, + "grad_norm": 0.21155074119571077, + "learning_rate": 0.00012434208408399522, + "loss": 0.9618, + "step": 12526 + }, + { + "epoch": 2.672141638225256, + "grad_norm": 0.29309267983472487, + "learning_rate": 0.00012433500087538762, + "loss": 0.937, + "step": 12527 + }, + { + "epoch": 2.6723549488054608, + "grad_norm": 0.3625313202948242, + "learning_rate": 0.00012432791716512876, + "loss": 0.9566, + "step": 12528 + }, + { + "epoch": 2.6725682593856654, + "grad_norm": 0.3652830971563855, + "learning_rate": 0.00012432083295329876, + "loss": 0.9224, + "step": 12529 + }, + { + "epoch": 2.6727815699658706, + "grad_norm": 0.3626289530253015, + "learning_rate": 0.0001243137482399778, + "loss": 0.9237, + "step": 12530 + }, + { + "epoch": 2.6729948805460753, + "grad_norm": 0.37996560562293447, + "learning_rate": 0.00012430666302524603, + "loss": 0.9283, + "step": 12531 + }, + { + "epoch": 2.67320819112628, + "grad_norm": 2.106356801397531, + "learning_rate": 0.00012429957730918364, + "loss": 0.9791, + "step": 12532 + }, + { + "epoch": 2.6734215017064846, + "grad_norm": 0.4548428172488938, + "learning_rate": 0.00012429249109187078, + "loss": 0.9238, + "step": 12533 + }, + { + "epoch": 2.6736348122866893, + "grad_norm": 0.7855419134040186, + "learning_rate": 0.00012428540437338763, + "loss": 1.0032, + "step": 12534 + }, + { + "epoch": 2.673848122866894, + "grad_norm": 0.8652429801756347, + "learning_rate": 0.00012427831715381438, + "loss": 0.941, + "step": 12535 + }, + { + "epoch": 2.674061433447099, + "grad_norm": 0.7775523528796641, + "learning_rate": 0.00012427122943323125, + "loss": 0.9659, + "step": 12536 + }, + { + "epoch": 2.674274744027304, + "grad_norm": 0.8666586982461384, + "learning_rate": 0.00012426414121171839, + "loss": 0.9392, + "step": 12537 + }, + { + "epoch": 2.6744880546075085, + "grad_norm": 0.8459656133381989, + "learning_rate": 0.00012425705248935606, + "loss": 0.9567, + "step": 12538 + }, + { + "epoch": 2.674701365187713, + "grad_norm": 0.6047831135746778, + "learning_rate": 0.00012424996326622438, + "loss": 0.9773, + "step": 12539 + }, + { + "epoch": 2.6749146757679183, + "grad_norm": 0.37042283453469826, + "learning_rate": 0.00012424287354240366, + "loss": 0.9358, + "step": 12540 + }, + { + "epoch": 2.675127986348123, + "grad_norm": 0.3448824747169818, + "learning_rate": 0.0001242357833179741, + "loss": 0.9461, + "step": 12541 + }, + { + "epoch": 2.6753412969283277, + "grad_norm": 0.38290676890588127, + "learning_rate": 0.00012422869259301586, + "loss": 0.9303, + "step": 12542 + }, + { + "epoch": 2.6755546075085324, + "grad_norm": 0.4115321757150421, + "learning_rate": 0.00012422160136760926, + "loss": 0.9074, + "step": 12543 + }, + { + "epoch": 2.675767918088737, + "grad_norm": 0.4031616511608863, + "learning_rate": 0.00012421450964183447, + "loss": 0.9148, + "step": 12544 + }, + { + "epoch": 2.6759812286689417, + "grad_norm": 0.3369507497703231, + "learning_rate": 0.00012420741741577177, + "loss": 0.9645, + "step": 12545 + }, + { + "epoch": 2.676194539249147, + "grad_norm": 0.330105116655238, + "learning_rate": 0.0001242003246895014, + "loss": 0.9227, + "step": 12546 + }, + { + "epoch": 2.6764078498293515, + "grad_norm": 0.30904057137611013, + "learning_rate": 0.0001241932314631036, + "loss": 0.9227, + "step": 12547 + }, + { + "epoch": 2.676621160409556, + "grad_norm": 0.3648581286763253, + "learning_rate": 0.00012418613773665867, + "loss": 0.9264, + "step": 12548 + }, + { + "epoch": 2.6768344709897613, + "grad_norm": 0.38614608187080524, + "learning_rate": 0.00012417904351024682, + "loss": 0.9442, + "step": 12549 + }, + { + "epoch": 2.677047781569966, + "grad_norm": 0.4429144331872122, + "learning_rate": 0.00012417194878394837, + "loss": 0.9248, + "step": 12550 + }, + { + "epoch": 2.6772610921501707, + "grad_norm": 0.4086380737072059, + "learning_rate": 0.00012416485355784359, + "loss": 0.935, + "step": 12551 + }, + { + "epoch": 2.6774744027303754, + "grad_norm": 0.29366720419325104, + "learning_rate": 0.0001241577578320127, + "loss": 0.9082, + "step": 12552 + }, + { + "epoch": 2.67768771331058, + "grad_norm": 0.25043792027220163, + "learning_rate": 0.00012415066160653606, + "loss": 0.9141, + "step": 12553 + }, + { + "epoch": 2.6779010238907848, + "grad_norm": 0.30603673069151144, + "learning_rate": 0.00012414356488149396, + "loss": 0.9538, + "step": 12554 + }, + { + "epoch": 2.67811433447099, + "grad_norm": 0.4896081213804489, + "learning_rate": 0.00012413646765696665, + "loss": 0.9591, + "step": 12555 + }, + { + "epoch": 2.6783276450511946, + "grad_norm": 0.5411613198375418, + "learning_rate": 0.00012412936993303449, + "loss": 0.9519, + "step": 12556 + }, + { + "epoch": 2.6785409556313993, + "grad_norm": 0.47825222419354985, + "learning_rate": 0.00012412227170977775, + "loss": 0.9695, + "step": 12557 + }, + { + "epoch": 2.678754266211604, + "grad_norm": 0.3138117614743966, + "learning_rate": 0.0001241151729872768, + "loss": 0.9126, + "step": 12558 + }, + { + "epoch": 2.678967576791809, + "grad_norm": 0.28163884082405355, + "learning_rate": 0.0001241080737656119, + "loss": 0.9257, + "step": 12559 + }, + { + "epoch": 2.6791808873720138, + "grad_norm": 0.38466076987431264, + "learning_rate": 0.00012410097404486338, + "loss": 0.9406, + "step": 12560 + }, + { + "epoch": 2.6793941979522184, + "grad_norm": 0.5461813480368197, + "learning_rate": 0.00012409387382511164, + "loss": 0.9566, + "step": 12561 + }, + { + "epoch": 2.679607508532423, + "grad_norm": 0.5189188823541895, + "learning_rate": 0.00012408677310643695, + "loss": 0.9408, + "step": 12562 + }, + { + "epoch": 2.679820819112628, + "grad_norm": 0.4331841765349828, + "learning_rate": 0.00012407967188891967, + "loss": 0.9114, + "step": 12563 + }, + { + "epoch": 2.6800341296928325, + "grad_norm": 0.33154750878213574, + "learning_rate": 0.0001240725701726402, + "loss": 0.938, + "step": 12564 + }, + { + "epoch": 2.6802474402730376, + "grad_norm": 0.2882686520931828, + "learning_rate": 0.00012406546795767882, + "loss": 0.9437, + "step": 12565 + }, + { + "epoch": 2.6804607508532423, + "grad_norm": 0.4293523645273809, + "learning_rate": 0.00012405836524411595, + "loss": 0.9241, + "step": 12566 + }, + { + "epoch": 2.680674061433447, + "grad_norm": 0.5370906705907003, + "learning_rate": 0.00012405126203203192, + "loss": 0.9412, + "step": 12567 + }, + { + "epoch": 2.680887372013652, + "grad_norm": 0.5921734888353216, + "learning_rate": 0.00012404415832150713, + "loss": 0.9209, + "step": 12568 + }, + { + "epoch": 2.681100682593857, + "grad_norm": 0.46176448997322594, + "learning_rate": 0.00012403705411262193, + "loss": 0.9344, + "step": 12569 + }, + { + "epoch": 2.6813139931740615, + "grad_norm": 0.29956952636521855, + "learning_rate": 0.00012402994940545676, + "loss": 0.8956, + "step": 12570 + }, + { + "epoch": 2.681527303754266, + "grad_norm": 0.2972209090274809, + "learning_rate": 0.00012402284420009194, + "loss": 0.9706, + "step": 12571 + }, + { + "epoch": 2.681740614334471, + "grad_norm": 0.5200623567368335, + "learning_rate": 0.0001240157384966079, + "loss": 0.9461, + "step": 12572 + }, + { + "epoch": 2.6819539249146755, + "grad_norm": 0.6241587374289813, + "learning_rate": 0.00012400863229508507, + "loss": 0.9202, + "step": 12573 + }, + { + "epoch": 2.6821672354948807, + "grad_norm": 0.4716611064470054, + "learning_rate": 0.0001240015255956038, + "loss": 0.9429, + "step": 12574 + }, + { + "epoch": 2.6823805460750854, + "grad_norm": 0.36817830495442827, + "learning_rate": 0.0001239944183982445, + "loss": 0.9309, + "step": 12575 + }, + { + "epoch": 2.68259385665529, + "grad_norm": 0.22057703591251854, + "learning_rate": 0.00012398731070308768, + "loss": 0.89, + "step": 12576 + }, + { + "epoch": 2.6828071672354947, + "grad_norm": 0.3421422625175048, + "learning_rate": 0.00012398020251021367, + "loss": 0.912, + "step": 12577 + }, + { + "epoch": 2.6830204778157, + "grad_norm": 0.4698162930897882, + "learning_rate": 0.0001239730938197029, + "loss": 0.9245, + "step": 12578 + }, + { + "epoch": 2.6832337883959045, + "grad_norm": 0.5063536184836669, + "learning_rate": 0.00012396598463163585, + "loss": 0.9735, + "step": 12579 + }, + { + "epoch": 2.6834470989761092, + "grad_norm": 0.4720948591075797, + "learning_rate": 0.00012395887494609295, + "loss": 0.9634, + "step": 12580 + }, + { + "epoch": 2.683660409556314, + "grad_norm": 0.3242326057157271, + "learning_rate": 0.00012395176476315462, + "loss": 0.9404, + "step": 12581 + }, + { + "epoch": 2.6838737201365186, + "grad_norm": 0.2619112130692305, + "learning_rate": 0.00012394465408290138, + "loss": 0.9315, + "step": 12582 + }, + { + "epoch": 2.6840870307167233, + "grad_norm": 0.26742432895189944, + "learning_rate": 0.00012393754290541362, + "loss": 0.9246, + "step": 12583 + }, + { + "epoch": 2.6843003412969284, + "grad_norm": 0.25201611493945275, + "learning_rate": 0.0001239304312307718, + "loss": 0.9317, + "step": 12584 + }, + { + "epoch": 2.684513651877133, + "grad_norm": 0.29231855240584637, + "learning_rate": 0.00012392331905905642, + "loss": 0.9165, + "step": 12585 + }, + { + "epoch": 2.6847269624573378, + "grad_norm": 0.3105853360719323, + "learning_rate": 0.00012391620639034797, + "loss": 0.9265, + "step": 12586 + }, + { + "epoch": 2.684940273037543, + "grad_norm": 0.2929066795554802, + "learning_rate": 0.0001239090932247269, + "loss": 0.939, + "step": 12587 + }, + { + "epoch": 2.6851535836177476, + "grad_norm": 0.20071828311229295, + "learning_rate": 0.00012390197956227368, + "loss": 0.9108, + "step": 12588 + }, + { + "epoch": 2.6853668941979523, + "grad_norm": 0.2521520398526327, + "learning_rate": 0.00012389486540306886, + "loss": 0.9532, + "step": 12589 + }, + { + "epoch": 2.685580204778157, + "grad_norm": 0.31211866399746635, + "learning_rate": 0.00012388775074719286, + "loss": 0.9644, + "step": 12590 + }, + { + "epoch": 2.6857935153583616, + "grad_norm": 0.34080979434364655, + "learning_rate": 0.00012388063559472625, + "loss": 0.9266, + "step": 12591 + }, + { + "epoch": 2.6860068259385663, + "grad_norm": 0.38070269190788825, + "learning_rate": 0.0001238735199457495, + "loss": 0.9452, + "step": 12592 + }, + { + "epoch": 2.6862201365187715, + "grad_norm": 0.3294243261138317, + "learning_rate": 0.00012386640380034313, + "loss": 0.9279, + "step": 12593 + }, + { + "epoch": 2.686433447098976, + "grad_norm": 0.19865145776202392, + "learning_rate": 0.00012385928715858766, + "loss": 0.9147, + "step": 12594 + }, + { + "epoch": 2.686646757679181, + "grad_norm": 0.29316610562837014, + "learning_rate": 0.00012385217002056363, + "loss": 0.9241, + "step": 12595 + }, + { + "epoch": 2.6868600682593855, + "grad_norm": 0.34180316806256283, + "learning_rate": 0.00012384505238635155, + "loss": 0.9174, + "step": 12596 + }, + { + "epoch": 2.6870733788395906, + "grad_norm": 0.25682466933462844, + "learning_rate": 0.00012383793425603195, + "loss": 0.9523, + "step": 12597 + }, + { + "epoch": 2.6872866894197953, + "grad_norm": 0.22092450190609128, + "learning_rate": 0.00012383081562968539, + "loss": 0.9375, + "step": 12598 + }, + { + "epoch": 2.6875, + "grad_norm": 0.3009692711684715, + "learning_rate": 0.00012382369650739243, + "loss": 0.9232, + "step": 12599 + }, + { + "epoch": 2.6877133105802047, + "grad_norm": 0.34942397359210314, + "learning_rate": 0.00012381657688923363, + "loss": 0.9825, + "step": 12600 + }, + { + "epoch": 2.6879266211604094, + "grad_norm": 0.32992191438254204, + "learning_rate": 0.00012380945677528947, + "loss": 0.9607, + "step": 12601 + }, + { + "epoch": 2.6881399317406145, + "grad_norm": 0.2479307932470113, + "learning_rate": 0.0001238023361656406, + "loss": 0.9423, + "step": 12602 + }, + { + "epoch": 2.688353242320819, + "grad_norm": 0.27155900574668873, + "learning_rate": 0.00012379521506036758, + "loss": 0.9397, + "step": 12603 + }, + { + "epoch": 2.688566552901024, + "grad_norm": 0.28622298111833405, + "learning_rate": 0.00012378809345955094, + "loss": 0.93, + "step": 12604 + }, + { + "epoch": 2.6887798634812285, + "grad_norm": 0.27419119845428597, + "learning_rate": 0.0001237809713632713, + "loss": 0.932, + "step": 12605 + }, + { + "epoch": 2.6889931740614337, + "grad_norm": 0.3313747852008829, + "learning_rate": 0.0001237738487716092, + "loss": 0.9114, + "step": 12606 + }, + { + "epoch": 2.6892064846416384, + "grad_norm": 0.3468679945795946, + "learning_rate": 0.0001237667256846453, + "loss": 0.9249, + "step": 12607 + }, + { + "epoch": 2.689419795221843, + "grad_norm": 0.29305056999509943, + "learning_rate": 0.00012375960210246013, + "loss": 0.9616, + "step": 12608 + }, + { + "epoch": 2.6896331058020477, + "grad_norm": 0.20710092758699505, + "learning_rate": 0.00012375247802513435, + "loss": 0.9148, + "step": 12609 + }, + { + "epoch": 2.6898464163822524, + "grad_norm": 0.29343233682926717, + "learning_rate": 0.00012374535345274854, + "loss": 0.9309, + "step": 12610 + }, + { + "epoch": 2.690059726962457, + "grad_norm": 0.4243758541269849, + "learning_rate": 0.00012373822838538333, + "loss": 0.8933, + "step": 12611 + }, + { + "epoch": 2.6902730375426622, + "grad_norm": 0.3930468415388558, + "learning_rate": 0.00012373110282311932, + "loss": 0.9378, + "step": 12612 + }, + { + "epoch": 2.690486348122867, + "grad_norm": 0.45979279621792707, + "learning_rate": 0.00012372397676603712, + "loss": 0.9177, + "step": 12613 + }, + { + "epoch": 2.6906996587030716, + "grad_norm": 0.4181396802082708, + "learning_rate": 0.00012371685021421742, + "loss": 0.9517, + "step": 12614 + }, + { + "epoch": 2.6909129692832767, + "grad_norm": 0.2573619132369792, + "learning_rate": 0.00012370972316774084, + "loss": 0.9223, + "step": 12615 + }, + { + "epoch": 2.6911262798634814, + "grad_norm": 0.26989009662516394, + "learning_rate": 0.00012370259562668797, + "loss": 0.9255, + "step": 12616 + }, + { + "epoch": 2.691339590443686, + "grad_norm": 0.256875037335415, + "learning_rate": 0.00012369546759113953, + "loss": 0.9194, + "step": 12617 + }, + { + "epoch": 2.6915529010238908, + "grad_norm": 0.23755345576692105, + "learning_rate": 0.0001236883390611761, + "loss": 0.923, + "step": 12618 + }, + { + "epoch": 2.6917662116040955, + "grad_norm": 0.2608701678637848, + "learning_rate": 0.0001236812100368784, + "loss": 0.9576, + "step": 12619 + }, + { + "epoch": 2.6919795221843, + "grad_norm": 0.26414281118592875, + "learning_rate": 0.00012367408051832708, + "loss": 0.9357, + "step": 12620 + }, + { + "epoch": 2.6921928327645053, + "grad_norm": 0.3021556514346727, + "learning_rate": 0.00012366695050560278, + "loss": 0.9265, + "step": 12621 + }, + { + "epoch": 2.69240614334471, + "grad_norm": 0.33205705234572874, + "learning_rate": 0.00012365981999878624, + "loss": 0.9232, + "step": 12622 + }, + { + "epoch": 2.6926194539249146, + "grad_norm": 0.34478001148522597, + "learning_rate": 0.00012365268899795808, + "loss": 0.9279, + "step": 12623 + }, + { + "epoch": 2.6928327645051193, + "grad_norm": 0.2393159932031754, + "learning_rate": 0.000123645557503199, + "loss": 0.9763, + "step": 12624 + }, + { + "epoch": 2.6930460750853245, + "grad_norm": 0.25936236864901757, + "learning_rate": 0.0001236384255145897, + "loss": 0.9646, + "step": 12625 + }, + { + "epoch": 2.693259385665529, + "grad_norm": 0.3672794776481313, + "learning_rate": 0.0001236312930322109, + "loss": 0.9395, + "step": 12626 + }, + { + "epoch": 2.693472696245734, + "grad_norm": 0.25697903760879354, + "learning_rate": 0.0001236241600561433, + "loss": 0.9401, + "step": 12627 + }, + { + "epoch": 2.6936860068259385, + "grad_norm": 0.31541747072861914, + "learning_rate": 0.00012361702658646756, + "loss": 1.0054, + "step": 12628 + }, + { + "epoch": 2.693899317406143, + "grad_norm": 0.3894792535483602, + "learning_rate": 0.00012360989262326445, + "loss": 0.914, + "step": 12629 + }, + { + "epoch": 2.694112627986348, + "grad_norm": 0.269509491484773, + "learning_rate": 0.00012360275816661464, + "loss": 0.9218, + "step": 12630 + }, + { + "epoch": 2.694325938566553, + "grad_norm": 0.22541485010733767, + "learning_rate": 0.0001235956232165989, + "loss": 0.9497, + "step": 12631 + }, + { + "epoch": 2.6945392491467577, + "grad_norm": 0.27535688992622964, + "learning_rate": 0.00012358848777329799, + "loss": 0.9182, + "step": 12632 + }, + { + "epoch": 2.6947525597269624, + "grad_norm": 0.2759705785875719, + "learning_rate": 0.00012358135183679258, + "loss": 0.9321, + "step": 12633 + }, + { + "epoch": 2.6949658703071675, + "grad_norm": 0.34506246861012496, + "learning_rate": 0.00012357421540716345, + "loss": 0.9105, + "step": 12634 + }, + { + "epoch": 2.695179180887372, + "grad_norm": 0.3813955731766987, + "learning_rate": 0.00012356707848449133, + "loss": 0.9179, + "step": 12635 + }, + { + "epoch": 2.695392491467577, + "grad_norm": 0.32377576659539664, + "learning_rate": 0.00012355994106885698, + "loss": 0.9326, + "step": 12636 + }, + { + "epoch": 2.6956058020477816, + "grad_norm": 0.21322798648564512, + "learning_rate": 0.00012355280316034116, + "loss": 0.9371, + "step": 12637 + }, + { + "epoch": 2.6958191126279862, + "grad_norm": 0.24845412209255804, + "learning_rate": 0.00012354566475902468, + "loss": 0.9246, + "step": 12638 + }, + { + "epoch": 2.696032423208191, + "grad_norm": 0.3062358910976523, + "learning_rate": 0.00012353852586498821, + "loss": 0.9529, + "step": 12639 + }, + { + "epoch": 2.696245733788396, + "grad_norm": 0.3222221450422786, + "learning_rate": 0.00012353138647831262, + "loss": 0.9113, + "step": 12640 + }, + { + "epoch": 2.6964590443686007, + "grad_norm": 0.2524491191792362, + "learning_rate": 0.00012352424659907866, + "loss": 0.9436, + "step": 12641 + }, + { + "epoch": 2.6966723549488054, + "grad_norm": 0.27759140305797864, + "learning_rate": 0.00012351710622736712, + "loss": 0.9678, + "step": 12642 + }, + { + "epoch": 2.69688566552901, + "grad_norm": 0.26977318207011747, + "learning_rate": 0.0001235099653632588, + "loss": 0.9551, + "step": 12643 + }, + { + "epoch": 2.6970989761092152, + "grad_norm": 0.2118264391249746, + "learning_rate": 0.00012350282400683447, + "loss": 0.8888, + "step": 12644 + }, + { + "epoch": 2.69731228668942, + "grad_norm": 0.2619604694919015, + "learning_rate": 0.00012349568215817497, + "loss": 0.9507, + "step": 12645 + }, + { + "epoch": 2.6975255972696246, + "grad_norm": 0.24900076724525408, + "learning_rate": 0.00012348853981736108, + "loss": 0.9537, + "step": 12646 + }, + { + "epoch": 2.6977389078498293, + "grad_norm": 0.32433483246259176, + "learning_rate": 0.00012348139698447363, + "loss": 0.941, + "step": 12647 + }, + { + "epoch": 2.697952218430034, + "grad_norm": 0.32267905969292165, + "learning_rate": 0.00012347425365959346, + "loss": 0.9574, + "step": 12648 + }, + { + "epoch": 2.6981655290102387, + "grad_norm": 0.27180970589043785, + "learning_rate": 0.00012346710984280137, + "loss": 0.9224, + "step": 12649 + }, + { + "epoch": 2.698378839590444, + "grad_norm": 0.2856832517781729, + "learning_rate": 0.00012345996553417822, + "loss": 0.9335, + "step": 12650 + }, + { + "epoch": 2.6985921501706485, + "grad_norm": 0.2940730872110539, + "learning_rate": 0.00012345282073380478, + "loss": 0.9251, + "step": 12651 + }, + { + "epoch": 2.698805460750853, + "grad_norm": 0.35625158948136504, + "learning_rate": 0.00012344567544176195, + "loss": 0.9188, + "step": 12652 + }, + { + "epoch": 2.6990187713310583, + "grad_norm": 0.34992784037756897, + "learning_rate": 0.0001234385296581306, + "loss": 0.9246, + "step": 12653 + }, + { + "epoch": 2.699232081911263, + "grad_norm": 0.313586415738356, + "learning_rate": 0.00012343138338299155, + "loss": 0.9391, + "step": 12654 + }, + { + "epoch": 2.6994453924914676, + "grad_norm": 0.27672816666723205, + "learning_rate": 0.0001234242366164257, + "loss": 0.9814, + "step": 12655 + }, + { + "epoch": 2.6996587030716723, + "grad_norm": 0.23885957605630845, + "learning_rate": 0.00012341708935851382, + "loss": 0.9143, + "step": 12656 + }, + { + "epoch": 2.699872013651877, + "grad_norm": 0.266178427067107, + "learning_rate": 0.00012340994160933686, + "loss": 0.9413, + "step": 12657 + }, + { + "epoch": 2.7000853242320817, + "grad_norm": 0.39309646500582834, + "learning_rate": 0.00012340279336897568, + "loss": 0.9257, + "step": 12658 + }, + { + "epoch": 2.700298634812287, + "grad_norm": 0.45228662812920395, + "learning_rate": 0.00012339564463751117, + "loss": 0.9184, + "step": 12659 + }, + { + "epoch": 2.7005119453924915, + "grad_norm": 0.38970744300701105, + "learning_rate": 0.00012338849541502424, + "loss": 0.9168, + "step": 12660 + }, + { + "epoch": 2.700725255972696, + "grad_norm": 0.31372722535007813, + "learning_rate": 0.00012338134570159573, + "loss": 0.9333, + "step": 12661 + }, + { + "epoch": 2.700938566552901, + "grad_norm": 0.26130862841129987, + "learning_rate": 0.00012337419549730656, + "loss": 0.9157, + "step": 12662 + }, + { + "epoch": 2.701151877133106, + "grad_norm": 0.2388164555822006, + "learning_rate": 0.00012336704480223764, + "loss": 0.9389, + "step": 12663 + }, + { + "epoch": 2.7013651877133107, + "grad_norm": 0.2989513922308376, + "learning_rate": 0.00012335989361646988, + "loss": 0.9309, + "step": 12664 + }, + { + "epoch": 2.7015784982935154, + "grad_norm": 0.38693730931298354, + "learning_rate": 0.00012335274194008418, + "loss": 0.9437, + "step": 12665 + }, + { + "epoch": 2.70179180887372, + "grad_norm": 0.41346065484078515, + "learning_rate": 0.0001233455897731615, + "loss": 0.9635, + "step": 12666 + }, + { + "epoch": 2.7020051194539247, + "grad_norm": 0.4102420005523976, + "learning_rate": 0.00012333843711578271, + "loss": 0.9526, + "step": 12667 + }, + { + "epoch": 2.7022184300341294, + "grad_norm": 0.29944321715610994, + "learning_rate": 0.00012333128396802882, + "loss": 0.9277, + "step": 12668 + }, + { + "epoch": 2.7024317406143346, + "grad_norm": 0.26922688888381247, + "learning_rate": 0.0001233241303299807, + "loss": 0.9322, + "step": 12669 + }, + { + "epoch": 2.7026450511945392, + "grad_norm": 0.268605689541363, + "learning_rate": 0.0001233169762017193, + "loss": 0.9002, + "step": 12670 + }, + { + "epoch": 2.702858361774744, + "grad_norm": 0.23730672254076382, + "learning_rate": 0.00012330982158332562, + "loss": 0.9409, + "step": 12671 + }, + { + "epoch": 2.703071672354949, + "grad_norm": 0.27362369684590226, + "learning_rate": 0.00012330266647488057, + "loss": 0.9167, + "step": 12672 + }, + { + "epoch": 2.7032849829351537, + "grad_norm": 0.2782997789694878, + "learning_rate": 0.00012329551087646512, + "loss": 0.9007, + "step": 12673 + }, + { + "epoch": 2.7034982935153584, + "grad_norm": 0.2647495804176606, + "learning_rate": 0.0001232883547881602, + "loss": 0.9179, + "step": 12674 + }, + { + "epoch": 2.703711604095563, + "grad_norm": 0.23793653480600002, + "learning_rate": 0.00012328119821004686, + "loss": 0.9557, + "step": 12675 + }, + { + "epoch": 2.703924914675768, + "grad_norm": 0.32101231258904434, + "learning_rate": 0.00012327404114220603, + "loss": 0.9575, + "step": 12676 + }, + { + "epoch": 2.7041382252559725, + "grad_norm": 0.3299598539280023, + "learning_rate": 0.00012326688358471868, + "loss": 0.9437, + "step": 12677 + }, + { + "epoch": 2.7043515358361776, + "grad_norm": 0.2427628493061256, + "learning_rate": 0.00012325972553766583, + "loss": 0.9135, + "step": 12678 + }, + { + "epoch": 2.7045648464163823, + "grad_norm": 0.21513176776081452, + "learning_rate": 0.00012325256700112844, + "loss": 0.9225, + "step": 12679 + }, + { + "epoch": 2.704778156996587, + "grad_norm": 0.2776598021921937, + "learning_rate": 0.00012324540797518756, + "loss": 0.9292, + "step": 12680 + }, + { + "epoch": 2.7049914675767917, + "grad_norm": 0.23845355498842186, + "learning_rate": 0.00012323824845992412, + "loss": 0.9225, + "step": 12681 + }, + { + "epoch": 2.705204778156997, + "grad_norm": 0.29341628572915074, + "learning_rate": 0.0001232310884554192, + "loss": 0.9405, + "step": 12682 + }, + { + "epoch": 2.7054180887372015, + "grad_norm": 0.39096703981719805, + "learning_rate": 0.00012322392796175377, + "loss": 0.9758, + "step": 12683 + }, + { + "epoch": 2.705631399317406, + "grad_norm": 0.42996353375089547, + "learning_rate": 0.00012321676697900886, + "loss": 0.9188, + "step": 12684 + }, + { + "epoch": 2.705844709897611, + "grad_norm": 0.3872041094672212, + "learning_rate": 0.00012320960550726552, + "loss": 0.9143, + "step": 12685 + }, + { + "epoch": 2.7060580204778155, + "grad_norm": 0.2712413834930502, + "learning_rate": 0.00012320244354660476, + "loss": 0.94, + "step": 12686 + }, + { + "epoch": 2.70627133105802, + "grad_norm": 0.2341712145264408, + "learning_rate": 0.00012319528109710763, + "loss": 0.9218, + "step": 12687 + }, + { + "epoch": 2.7064846416382253, + "grad_norm": 0.31448689411187997, + "learning_rate": 0.0001231881181588552, + "loss": 0.9273, + "step": 12688 + }, + { + "epoch": 2.70669795221843, + "grad_norm": 0.31833072544730917, + "learning_rate": 0.00012318095473192847, + "loss": 0.9296, + "step": 12689 + }, + { + "epoch": 2.7069112627986347, + "grad_norm": 0.32188421047688576, + "learning_rate": 0.00012317379081640845, + "loss": 0.9288, + "step": 12690 + }, + { + "epoch": 2.70712457337884, + "grad_norm": 0.32166456458663495, + "learning_rate": 0.0001231666264123763, + "loss": 0.9287, + "step": 12691 + }, + { + "epoch": 2.7073378839590445, + "grad_norm": 0.22325294883471108, + "learning_rate": 0.00012315946151991307, + "loss": 0.9362, + "step": 12692 + }, + { + "epoch": 2.707551194539249, + "grad_norm": 0.2593399141488772, + "learning_rate": 0.00012315229613909978, + "loss": 0.9331, + "step": 12693 + }, + { + "epoch": 2.707764505119454, + "grad_norm": 0.33805452082322235, + "learning_rate": 0.00012314513027001755, + "loss": 0.9419, + "step": 12694 + }, + { + "epoch": 2.7079778156996586, + "grad_norm": 0.36378636637314693, + "learning_rate": 0.00012313796391274747, + "loss": 0.9095, + "step": 12695 + }, + { + "epoch": 2.7081911262798632, + "grad_norm": 0.323211559697971, + "learning_rate": 0.00012313079706737054, + "loss": 0.945, + "step": 12696 + }, + { + "epoch": 2.7084044368600684, + "grad_norm": 0.2884693345813773, + "learning_rate": 0.00012312362973396795, + "loss": 0.9218, + "step": 12697 + }, + { + "epoch": 2.708617747440273, + "grad_norm": 0.27667513254390763, + "learning_rate": 0.00012311646191262075, + "loss": 0.9141, + "step": 12698 + }, + { + "epoch": 2.7088310580204777, + "grad_norm": 0.2651865013656333, + "learning_rate": 0.0001231092936034101, + "loss": 0.947, + "step": 12699 + }, + { + "epoch": 2.7090443686006824, + "grad_norm": 0.25370147941290205, + "learning_rate": 0.00012310212480641703, + "loss": 0.9675, + "step": 12700 + }, + { + "epoch": 2.7092576791808876, + "grad_norm": 0.21974461813159357, + "learning_rate": 0.0001230949555217227, + "loss": 0.9308, + "step": 12701 + }, + { + "epoch": 2.7094709897610922, + "grad_norm": 0.23364244447643026, + "learning_rate": 0.00012308778574940822, + "loss": 0.9202, + "step": 12702 + }, + { + "epoch": 2.709684300341297, + "grad_norm": 0.3097301663319093, + "learning_rate": 0.00012308061548955472, + "loss": 0.9239, + "step": 12703 + }, + { + "epoch": 2.7098976109215016, + "grad_norm": 0.31622955770660344, + "learning_rate": 0.00012307344474224335, + "loss": 0.9524, + "step": 12704 + }, + { + "epoch": 2.7101109215017063, + "grad_norm": 0.35572403593783236, + "learning_rate": 0.00012306627350755522, + "loss": 0.8998, + "step": 12705 + }, + { + "epoch": 2.7103242320819114, + "grad_norm": 0.36985064176487936, + "learning_rate": 0.00012305910178557147, + "loss": 0.9405, + "step": 12706 + }, + { + "epoch": 2.710537542662116, + "grad_norm": 0.364921809811312, + "learning_rate": 0.00012305192957637326, + "loss": 0.9122, + "step": 12707 + }, + { + "epoch": 2.710750853242321, + "grad_norm": 0.30729691871249415, + "learning_rate": 0.0001230447568800417, + "loss": 0.9535, + "step": 12708 + }, + { + "epoch": 2.7109641638225255, + "grad_norm": 0.2535921864870576, + "learning_rate": 0.00012303758369665806, + "loss": 0.886, + "step": 12709 + }, + { + "epoch": 2.7111774744027306, + "grad_norm": 0.2542542497965697, + "learning_rate": 0.00012303041002630343, + "loss": 0.945, + "step": 12710 + }, + { + "epoch": 2.7113907849829353, + "grad_norm": 0.27137924473264374, + "learning_rate": 0.00012302323586905898, + "loss": 0.8868, + "step": 12711 + }, + { + "epoch": 2.71160409556314, + "grad_norm": 0.32204869271201836, + "learning_rate": 0.00012301606122500588, + "loss": 0.9192, + "step": 12712 + }, + { + "epoch": 2.7118174061433447, + "grad_norm": 0.30721385842538346, + "learning_rate": 0.0001230088860942253, + "loss": 0.9256, + "step": 12713 + }, + { + "epoch": 2.7120307167235493, + "grad_norm": 0.277337704069945, + "learning_rate": 0.00012300171047679848, + "loss": 0.916, + "step": 12714 + }, + { + "epoch": 2.712244027303754, + "grad_norm": 0.2952638429117217, + "learning_rate": 0.00012299453437280658, + "loss": 0.937, + "step": 12715 + }, + { + "epoch": 2.712457337883959, + "grad_norm": 0.26874271113743226, + "learning_rate": 0.0001229873577823308, + "loss": 0.9282, + "step": 12716 + }, + { + "epoch": 2.712670648464164, + "grad_norm": 0.21633088095521966, + "learning_rate": 0.00012298018070545238, + "loss": 0.9196, + "step": 12717 + }, + { + "epoch": 2.7128839590443685, + "grad_norm": 0.3120063938212246, + "learning_rate": 0.00012297300314225245, + "loss": 0.8806, + "step": 12718 + }, + { + "epoch": 2.7130972696245736, + "grad_norm": 0.4200821544388356, + "learning_rate": 0.00012296582509281224, + "loss": 0.9493, + "step": 12719 + }, + { + "epoch": 2.7133105802047783, + "grad_norm": 0.4269275512282098, + "learning_rate": 0.00012295864655721304, + "loss": 0.9348, + "step": 12720 + }, + { + "epoch": 2.713523890784983, + "grad_norm": 0.34076589232746446, + "learning_rate": 0.00012295146753553603, + "loss": 0.947, + "step": 12721 + }, + { + "epoch": 2.7137372013651877, + "grad_norm": 0.3269909907943309, + "learning_rate": 0.00012294428802786243, + "loss": 0.9363, + "step": 12722 + }, + { + "epoch": 2.7139505119453924, + "grad_norm": 0.28904057314748777, + "learning_rate": 0.0001229371080342735, + "loss": 0.9095, + "step": 12723 + }, + { + "epoch": 2.714163822525597, + "grad_norm": 0.2573301209253192, + "learning_rate": 0.00012292992755485045, + "loss": 0.9236, + "step": 12724 + }, + { + "epoch": 2.714377133105802, + "grad_norm": 0.21823160193290492, + "learning_rate": 0.00012292274658967455, + "loss": 0.9674, + "step": 12725 + }, + { + "epoch": 2.714590443686007, + "grad_norm": 0.24361647811795079, + "learning_rate": 0.00012291556513882706, + "loss": 0.9145, + "step": 12726 + }, + { + "epoch": 2.7148037542662116, + "grad_norm": 0.31628177299929544, + "learning_rate": 0.00012290838320238925, + "loss": 0.9347, + "step": 12727 + }, + { + "epoch": 2.7150170648464163, + "grad_norm": 0.3250391195724657, + "learning_rate": 0.00012290120078044234, + "loss": 0.9511, + "step": 12728 + }, + { + "epoch": 2.7152303754266214, + "grad_norm": 0.29939764133024627, + "learning_rate": 0.00012289401787306763, + "loss": 0.8936, + "step": 12729 + }, + { + "epoch": 2.715443686006826, + "grad_norm": 0.2856749256926714, + "learning_rate": 0.00012288683448034634, + "loss": 0.9313, + "step": 12730 + }, + { + "epoch": 2.7156569965870307, + "grad_norm": 0.19892694049995044, + "learning_rate": 0.00012287965060235986, + "loss": 0.9303, + "step": 12731 + }, + { + "epoch": 2.7158703071672354, + "grad_norm": 0.23897309007838022, + "learning_rate": 0.00012287246623918937, + "loss": 0.9301, + "step": 12732 + }, + { + "epoch": 2.71608361774744, + "grad_norm": 0.27015931297551987, + "learning_rate": 0.00012286528139091623, + "loss": 0.9608, + "step": 12733 + }, + { + "epoch": 2.716296928327645, + "grad_norm": 0.27356490939387834, + "learning_rate": 0.0001228580960576217, + "loss": 0.9058, + "step": 12734 + }, + { + "epoch": 2.71651023890785, + "grad_norm": 0.32471435764941636, + "learning_rate": 0.0001228509102393871, + "loss": 0.9496, + "step": 12735 + }, + { + "epoch": 2.7167235494880546, + "grad_norm": 0.32206586702168605, + "learning_rate": 0.0001228437239362937, + "loss": 0.9204, + "step": 12736 + }, + { + "epoch": 2.7169368600682593, + "grad_norm": 0.3135617396379646, + "learning_rate": 0.00012283653714842288, + "loss": 0.9104, + "step": 12737 + }, + { + "epoch": 2.7171501706484644, + "grad_norm": 0.26517951822117514, + "learning_rate": 0.00012282934987585593, + "loss": 0.9344, + "step": 12738 + }, + { + "epoch": 2.717363481228669, + "grad_norm": 0.21353102896339274, + "learning_rate": 0.00012282216211867414, + "loss": 0.9189, + "step": 12739 + }, + { + "epoch": 2.717576791808874, + "grad_norm": 0.24210938297504792, + "learning_rate": 0.00012281497387695885, + "loss": 0.9474, + "step": 12740 + }, + { + "epoch": 2.7177901023890785, + "grad_norm": 0.26862881689873064, + "learning_rate": 0.00012280778515079142, + "loss": 0.9088, + "step": 12741 + }, + { + "epoch": 2.718003412969283, + "grad_norm": 0.2632290702359985, + "learning_rate": 0.0001228005959402532, + "loss": 0.949, + "step": 12742 + }, + { + "epoch": 2.718216723549488, + "grad_norm": 0.2818465501465344, + "learning_rate": 0.00012279340624542548, + "loss": 0.9362, + "step": 12743 + }, + { + "epoch": 2.718430034129693, + "grad_norm": 0.2558724413069019, + "learning_rate": 0.00012278621606638971, + "loss": 0.9295, + "step": 12744 + }, + { + "epoch": 2.7186433447098977, + "grad_norm": 0.2894781027892403, + "learning_rate": 0.00012277902540322715, + "loss": 0.9344, + "step": 12745 + }, + { + "epoch": 2.7188566552901023, + "grad_norm": 0.25468212445519933, + "learning_rate": 0.0001227718342560192, + "loss": 0.9, + "step": 12746 + }, + { + "epoch": 2.719069965870307, + "grad_norm": 0.23977283869658117, + "learning_rate": 0.00012276464262484725, + "loss": 0.9293, + "step": 12747 + }, + { + "epoch": 2.719283276450512, + "grad_norm": 0.6514520104291445, + "learning_rate": 0.0001227574505097926, + "loss": 0.9492, + "step": 12748 + }, + { + "epoch": 2.719496587030717, + "grad_norm": 0.2375632139863381, + "learning_rate": 0.0001227502579109367, + "loss": 0.9507, + "step": 12749 + }, + { + "epoch": 2.7197098976109215, + "grad_norm": 0.24630409771526776, + "learning_rate": 0.00012274306482836094, + "loss": 0.904, + "step": 12750 + }, + { + "epoch": 2.719923208191126, + "grad_norm": 0.2477735152268175, + "learning_rate": 0.00012273587126214664, + "loss": 0.9503, + "step": 12751 + }, + { + "epoch": 2.720136518771331, + "grad_norm": 0.2714694299046938, + "learning_rate": 0.00012272867721237524, + "loss": 0.9502, + "step": 12752 + }, + { + "epoch": 2.7203498293515356, + "grad_norm": 0.25490064118297173, + "learning_rate": 0.00012272148267912817, + "loss": 0.9389, + "step": 12753 + }, + { + "epoch": 2.7205631399317407, + "grad_norm": 0.2884607307790552, + "learning_rate": 0.00012271428766248677, + "loss": 0.9288, + "step": 12754 + }, + { + "epoch": 2.7207764505119454, + "grad_norm": 0.2588245288147638, + "learning_rate": 0.00012270709216253252, + "loss": 0.9248, + "step": 12755 + }, + { + "epoch": 2.72098976109215, + "grad_norm": 0.2564730951215622, + "learning_rate": 0.00012269989617934678, + "loss": 0.9295, + "step": 12756 + }, + { + "epoch": 2.721203071672355, + "grad_norm": 0.24586204592781116, + "learning_rate": 0.000122692699713011, + "loss": 0.9338, + "step": 12757 + }, + { + "epoch": 2.72141638225256, + "grad_norm": 0.2283003557472351, + "learning_rate": 0.0001226855027636066, + "loss": 0.911, + "step": 12758 + }, + { + "epoch": 2.7216296928327646, + "grad_norm": 0.2919168973770687, + "learning_rate": 0.00012267830533121504, + "loss": 0.9145, + "step": 12759 + }, + { + "epoch": 2.7218430034129693, + "grad_norm": 0.9969728017116564, + "learning_rate": 0.00012267110741591772, + "loss": 0.9783, + "step": 12760 + }, + { + "epoch": 2.722056313993174, + "grad_norm": 0.26380463924308517, + "learning_rate": 0.00012266390901779614, + "loss": 0.9143, + "step": 12761 + }, + { + "epoch": 2.7222696245733786, + "grad_norm": 0.2668405122123261, + "learning_rate": 0.00012265671013693167, + "loss": 0.9424, + "step": 12762 + }, + { + "epoch": 2.7224829351535837, + "grad_norm": 0.2264053245734714, + "learning_rate": 0.00012264951077340581, + "loss": 0.9479, + "step": 12763 + }, + { + "epoch": 2.7226962457337884, + "grad_norm": 0.27312655254208945, + "learning_rate": 0.00012264231092730003, + "loss": 0.9281, + "step": 12764 + }, + { + "epoch": 2.722909556313993, + "grad_norm": 0.3163706844212003, + "learning_rate": 0.0001226351105986958, + "loss": 0.9389, + "step": 12765 + }, + { + "epoch": 2.723122866894198, + "grad_norm": 0.3731751188630976, + "learning_rate": 0.00012262790978767455, + "loss": 0.9534, + "step": 12766 + }, + { + "epoch": 2.723336177474403, + "grad_norm": 0.43950980820466073, + "learning_rate": 0.00012262070849431778, + "loss": 0.9469, + "step": 12767 + }, + { + "epoch": 2.7235494880546076, + "grad_norm": 0.46872275765257304, + "learning_rate": 0.000122613506718707, + "loss": 0.933, + "step": 12768 + }, + { + "epoch": 2.7237627986348123, + "grad_norm": 0.4930097570888423, + "learning_rate": 0.00012260630446092366, + "loss": 0.9227, + "step": 12769 + }, + { + "epoch": 2.723976109215017, + "grad_norm": 0.47225718815563256, + "learning_rate": 0.00012259910172104924, + "loss": 0.9417, + "step": 12770 + }, + { + "epoch": 2.7241894197952217, + "grad_norm": 0.3760493973472348, + "learning_rate": 0.00012259189849916533, + "loss": 0.9701, + "step": 12771 + }, + { + "epoch": 2.7244027303754264, + "grad_norm": 0.293591555863282, + "learning_rate": 0.00012258469479535332, + "loss": 0.9399, + "step": 12772 + }, + { + "epoch": 2.7246160409556315, + "grad_norm": 0.27475165061728163, + "learning_rate": 0.0001225774906096948, + "loss": 0.9331, + "step": 12773 + }, + { + "epoch": 2.724829351535836, + "grad_norm": 0.21859033796004318, + "learning_rate": 0.00012257028594227123, + "loss": 0.9399, + "step": 12774 + }, + { + "epoch": 2.725042662116041, + "grad_norm": 0.21600868762469624, + "learning_rate": 0.0001225630807931642, + "loss": 0.9173, + "step": 12775 + }, + { + "epoch": 2.725255972696246, + "grad_norm": 0.3062014342666734, + "learning_rate": 0.00012255587516245517, + "loss": 0.921, + "step": 12776 + }, + { + "epoch": 2.7254692832764507, + "grad_norm": 0.38059731844047867, + "learning_rate": 0.0001225486690502257, + "loss": 0.9409, + "step": 12777 + }, + { + "epoch": 2.7256825938566553, + "grad_norm": 0.3962828003054639, + "learning_rate": 0.00012254146245655733, + "loss": 0.9457, + "step": 12778 + }, + { + "epoch": 2.72589590443686, + "grad_norm": 0.37240318731192296, + "learning_rate": 0.0001225342553815316, + "loss": 0.9498, + "step": 12779 + }, + { + "epoch": 2.7261092150170647, + "grad_norm": 0.3119158676264658, + "learning_rate": 0.00012252704782523, + "loss": 0.925, + "step": 12780 + }, + { + "epoch": 2.7263225255972694, + "grad_norm": 0.23813436671466057, + "learning_rate": 0.00012251983978773422, + "loss": 0.8958, + "step": 12781 + }, + { + "epoch": 2.7265358361774745, + "grad_norm": 0.25451135999084495, + "learning_rate": 0.00012251263126912567, + "loss": 0.9223, + "step": 12782 + }, + { + "epoch": 2.726749146757679, + "grad_norm": 0.3591190293867561, + "learning_rate": 0.00012250542226948605, + "loss": 0.9203, + "step": 12783 + }, + { + "epoch": 2.726962457337884, + "grad_norm": 0.51437832889424, + "learning_rate": 0.00012249821278889682, + "loss": 0.9552, + "step": 12784 + }, + { + "epoch": 2.7271757679180886, + "grad_norm": 0.5164038884305514, + "learning_rate": 0.0001224910028274396, + "loss": 0.9116, + "step": 12785 + }, + { + "epoch": 2.7273890784982937, + "grad_norm": 0.4184305115307011, + "learning_rate": 0.00012248379238519595, + "loss": 0.9503, + "step": 12786 + }, + { + "epoch": 2.7276023890784984, + "grad_norm": 0.34750588852662007, + "learning_rate": 0.00012247658146224752, + "loss": 0.9403, + "step": 12787 + }, + { + "epoch": 2.727815699658703, + "grad_norm": 0.2990620891959387, + "learning_rate": 0.0001224693700586758, + "loss": 0.9415, + "step": 12788 + }, + { + "epoch": 2.7280290102389078, + "grad_norm": 0.24501442684451166, + "learning_rate": 0.0001224621581745625, + "loss": 0.9102, + "step": 12789 + }, + { + "epoch": 2.7282423208191124, + "grad_norm": 0.24216321901379217, + "learning_rate": 0.00012245494580998914, + "loss": 0.9463, + "step": 12790 + }, + { + "epoch": 2.7284556313993176, + "grad_norm": 0.24065536062693335, + "learning_rate": 0.00012244773296503738, + "loss": 0.9348, + "step": 12791 + }, + { + "epoch": 2.7286689419795223, + "grad_norm": 0.32346477798183154, + "learning_rate": 0.00012244051963978875, + "loss": 0.9968, + "step": 12792 + }, + { + "epoch": 2.728882252559727, + "grad_norm": 0.4582462157238087, + "learning_rate": 0.00012243330583432496, + "loss": 0.9369, + "step": 12793 + }, + { + "epoch": 2.7290955631399316, + "grad_norm": 0.517068735206653, + "learning_rate": 0.0001224260915487276, + "loss": 0.9065, + "step": 12794 + }, + { + "epoch": 2.7293088737201368, + "grad_norm": 0.46258592518352454, + "learning_rate": 0.00012241887678307833, + "loss": 0.9377, + "step": 12795 + }, + { + "epoch": 2.7295221843003414, + "grad_norm": 0.42193655206864444, + "learning_rate": 0.00012241166153745871, + "loss": 0.9626, + "step": 12796 + }, + { + "epoch": 2.729735494880546, + "grad_norm": 0.359236183629542, + "learning_rate": 0.00012240444581195043, + "loss": 0.9285, + "step": 12797 + }, + { + "epoch": 2.729948805460751, + "grad_norm": 0.3068687413215008, + "learning_rate": 0.00012239722960663514, + "loss": 0.9363, + "step": 12798 + }, + { + "epoch": 2.7301621160409555, + "grad_norm": 0.22955237679400833, + "learning_rate": 0.0001223900129215945, + "loss": 0.9648, + "step": 12799 + }, + { + "epoch": 2.73037542662116, + "grad_norm": 0.3015385726449481, + "learning_rate": 0.00012238279575691014, + "loss": 0.9156, + "step": 12800 + }, + { + "epoch": 2.7305887372013653, + "grad_norm": 0.40210038022314515, + "learning_rate": 0.00012237557811266376, + "loss": 0.8987, + "step": 12801 + }, + { + "epoch": 2.73080204778157, + "grad_norm": 0.3786372857310891, + "learning_rate": 0.00012236835998893694, + "loss": 0.9472, + "step": 12802 + }, + { + "epoch": 2.7310153583617747, + "grad_norm": 0.2984171935445512, + "learning_rate": 0.00012236114138581145, + "loss": 0.9178, + "step": 12803 + }, + { + "epoch": 2.73122866894198, + "grad_norm": 0.2566660850342974, + "learning_rate": 0.00012235392230336892, + "loss": 0.9236, + "step": 12804 + }, + { + "epoch": 2.7314419795221845, + "grad_norm": 0.2453122953790256, + "learning_rate": 0.00012234670274169108, + "loss": 0.9352, + "step": 12805 + }, + { + "epoch": 2.731655290102389, + "grad_norm": 0.2798736407603035, + "learning_rate": 0.00012233948270085957, + "loss": 0.9112, + "step": 12806 + }, + { + "epoch": 2.731868600682594, + "grad_norm": 0.34997982601158106, + "learning_rate": 0.0001223322621809561, + "loss": 0.9229, + "step": 12807 + }, + { + "epoch": 2.7320819112627985, + "grad_norm": 0.4491753745430164, + "learning_rate": 0.00012232504118206235, + "loss": 0.9269, + "step": 12808 + }, + { + "epoch": 2.732295221843003, + "grad_norm": 0.5163561852276383, + "learning_rate": 0.00012231781970426008, + "loss": 0.9509, + "step": 12809 + }, + { + "epoch": 2.7325085324232083, + "grad_norm": 0.5129238033997519, + "learning_rate": 0.00012231059774763095, + "loss": 0.9415, + "step": 12810 + }, + { + "epoch": 2.732721843003413, + "grad_norm": 0.3347025919727354, + "learning_rate": 0.00012230337531225668, + "loss": 0.9175, + "step": 12811 + }, + { + "epoch": 2.7329351535836177, + "grad_norm": 0.21116192393767733, + "learning_rate": 0.00012229615239821903, + "loss": 0.9517, + "step": 12812 + }, + { + "epoch": 2.7331484641638224, + "grad_norm": 0.3288768676260929, + "learning_rate": 0.0001222889290055997, + "loss": 0.9428, + "step": 12813 + }, + { + "epoch": 2.7333617747440275, + "grad_norm": 0.45603898712811514, + "learning_rate": 0.00012228170513448042, + "loss": 0.9307, + "step": 12814 + }, + { + "epoch": 2.733575085324232, + "grad_norm": 0.4983386212108462, + "learning_rate": 0.00012227448078494293, + "loss": 0.9175, + "step": 12815 + }, + { + "epoch": 2.733788395904437, + "grad_norm": 0.44669923084946417, + "learning_rate": 0.000122267255957069, + "loss": 0.9299, + "step": 12816 + }, + { + "epoch": 2.7340017064846416, + "grad_norm": 0.3257740138584386, + "learning_rate": 0.00012226003065094035, + "loss": 0.9458, + "step": 12817 + }, + { + "epoch": 2.7342150170648463, + "grad_norm": 0.25936804505876404, + "learning_rate": 0.00012225280486663874, + "loss": 0.914, + "step": 12818 + }, + { + "epoch": 2.734428327645051, + "grad_norm": 0.27374172913870476, + "learning_rate": 0.00012224557860424594, + "loss": 0.925, + "step": 12819 + }, + { + "epoch": 2.734641638225256, + "grad_norm": 0.5742445657375199, + "learning_rate": 0.0001222383518638437, + "loss": 0.9417, + "step": 12820 + }, + { + "epoch": 2.7348549488054608, + "grad_norm": 0.40326894927505763, + "learning_rate": 0.0001222311246455138, + "loss": 0.9317, + "step": 12821 + }, + { + "epoch": 2.7350682593856654, + "grad_norm": 0.38587636641262474, + "learning_rate": 0.000122223896949338, + "loss": 0.9406, + "step": 12822 + }, + { + "epoch": 2.7352815699658706, + "grad_norm": 0.2688085816682296, + "learning_rate": 0.00012221666877539807, + "loss": 0.9145, + "step": 12823 + }, + { + "epoch": 2.7354948805460753, + "grad_norm": 0.2049223318724149, + "learning_rate": 0.00012220944012377588, + "loss": 0.9005, + "step": 12824 + }, + { + "epoch": 2.73570819112628, + "grad_norm": 0.21091131262173204, + "learning_rate": 0.00012220221099455315, + "loss": 0.9312, + "step": 12825 + }, + { + "epoch": 2.7359215017064846, + "grad_norm": 0.27804410869322976, + "learning_rate": 0.00012219498138781168, + "loss": 0.9602, + "step": 12826 + }, + { + "epoch": 2.7361348122866893, + "grad_norm": 0.284483226894122, + "learning_rate": 0.00012218775130363332, + "loss": 0.9598, + "step": 12827 + }, + { + "epoch": 2.736348122866894, + "grad_norm": 0.2774329592744093, + "learning_rate": 0.0001221805207420998, + "loss": 0.887, + "step": 12828 + }, + { + "epoch": 2.736561433447099, + "grad_norm": 0.27006250436187673, + "learning_rate": 0.00012217328970329298, + "loss": 0.9499, + "step": 12829 + }, + { + "epoch": 2.736774744027304, + "grad_norm": 0.2210392819061437, + "learning_rate": 0.0001221660581872947, + "loss": 0.9212, + "step": 12830 + }, + { + "epoch": 2.7369880546075085, + "grad_norm": 0.24729034602816186, + "learning_rate": 0.00012215882619418677, + "loss": 0.9248, + "step": 12831 + }, + { + "epoch": 2.737201365187713, + "grad_norm": 0.2830846384067292, + "learning_rate": 0.00012215159372405097, + "loss": 0.9593, + "step": 12832 + }, + { + "epoch": 2.7374146757679183, + "grad_norm": 0.3056266941396003, + "learning_rate": 0.0001221443607769692, + "loss": 0.958, + "step": 12833 + }, + { + "epoch": 2.737627986348123, + "grad_norm": 0.30971243795194764, + "learning_rate": 0.0001221371273530233, + "loss": 0.9584, + "step": 12834 + }, + { + "epoch": 2.7378412969283277, + "grad_norm": 0.327871262559182, + "learning_rate": 0.00012212989345229507, + "loss": 0.917, + "step": 12835 + }, + { + "epoch": 2.7380546075085324, + "grad_norm": 0.33583347212279174, + "learning_rate": 0.0001221226590748664, + "loss": 0.9271, + "step": 12836 + }, + { + "epoch": 2.738267918088737, + "grad_norm": 0.2978949364825937, + "learning_rate": 0.00012211542422081915, + "loss": 0.9677, + "step": 12837 + }, + { + "epoch": 2.7384812286689417, + "grad_norm": 0.24562335707955732, + "learning_rate": 0.00012210818889023514, + "loss": 0.921, + "step": 12838 + }, + { + "epoch": 2.738694539249147, + "grad_norm": 0.25443074613107125, + "learning_rate": 0.00012210095308319628, + "loss": 0.9201, + "step": 12839 + }, + { + "epoch": 2.7389078498293515, + "grad_norm": 0.27652083715946446, + "learning_rate": 0.00012209371679978442, + "loss": 0.9379, + "step": 12840 + }, + { + "epoch": 2.739121160409556, + "grad_norm": 0.33993517525495753, + "learning_rate": 0.00012208648004008145, + "loss": 0.916, + "step": 12841 + }, + { + "epoch": 2.7393344709897613, + "grad_norm": 0.28039551071829655, + "learning_rate": 0.00012207924280416922, + "loss": 0.9334, + "step": 12842 + }, + { + "epoch": 2.739547781569966, + "grad_norm": 0.31113362273500295, + "learning_rate": 0.00012207200509212969, + "loss": 0.9172, + "step": 12843 + }, + { + "epoch": 2.7397610921501707, + "grad_norm": 0.29890134847325983, + "learning_rate": 0.00012206476690404469, + "loss": 0.9264, + "step": 12844 + }, + { + "epoch": 2.7399744027303754, + "grad_norm": 0.2922587591769452, + "learning_rate": 0.00012205752823999617, + "loss": 0.9583, + "step": 12845 + }, + { + "epoch": 2.74018771331058, + "grad_norm": 0.24647063374872719, + "learning_rate": 0.00012205028910006602, + "loss": 0.9287, + "step": 12846 + }, + { + "epoch": 2.7404010238907848, + "grad_norm": 0.3310669701193845, + "learning_rate": 0.00012204304948433609, + "loss": 0.9219, + "step": 12847 + }, + { + "epoch": 2.74061433447099, + "grad_norm": 0.3136941183009202, + "learning_rate": 0.00012203580939288838, + "loss": 0.957, + "step": 12848 + }, + { + "epoch": 2.7408276450511946, + "grad_norm": 0.23752938511086416, + "learning_rate": 0.00012202856882580478, + "loss": 0.9513, + "step": 12849 + }, + { + "epoch": 2.7410409556313993, + "grad_norm": 0.253504997959459, + "learning_rate": 0.00012202132778316722, + "loss": 0.9171, + "step": 12850 + }, + { + "epoch": 2.741254266211604, + "grad_norm": 0.2676220424349556, + "learning_rate": 0.00012201408626505761, + "loss": 0.9376, + "step": 12851 + }, + { + "epoch": 2.741467576791809, + "grad_norm": 0.24802746117235275, + "learning_rate": 0.00012200684427155794, + "loss": 0.9322, + "step": 12852 + }, + { + "epoch": 2.7416808873720138, + "grad_norm": 0.2249790358162096, + "learning_rate": 0.00012199960180275007, + "loss": 0.9241, + "step": 12853 + }, + { + "epoch": 2.7418941979522184, + "grad_norm": 0.2669642076398941, + "learning_rate": 0.00012199235885871602, + "loss": 0.9126, + "step": 12854 + }, + { + "epoch": 2.742107508532423, + "grad_norm": 0.23495430977328685, + "learning_rate": 0.00012198511543953775, + "loss": 0.953, + "step": 12855 + }, + { + "epoch": 2.742320819112628, + "grad_norm": 0.20029042095444882, + "learning_rate": 0.00012197787154529718, + "loss": 0.9276, + "step": 12856 + }, + { + "epoch": 2.7425341296928325, + "grad_norm": 0.20556112383105093, + "learning_rate": 0.00012197062717607626, + "loss": 0.9344, + "step": 12857 + }, + { + "epoch": 2.7427474402730376, + "grad_norm": 0.232939090450695, + "learning_rate": 0.00012196338233195703, + "loss": 0.9268, + "step": 12858 + }, + { + "epoch": 2.7429607508532423, + "grad_norm": 0.2500660776638597, + "learning_rate": 0.0001219561370130214, + "loss": 0.9362, + "step": 12859 + }, + { + "epoch": 2.743174061433447, + "grad_norm": 0.24461342754942159, + "learning_rate": 0.00012194889121935139, + "loss": 0.9542, + "step": 12860 + }, + { + "epoch": 2.743387372013652, + "grad_norm": 0.27485532082289377, + "learning_rate": 0.00012194164495102893, + "loss": 0.9343, + "step": 12861 + }, + { + "epoch": 2.743600682593857, + "grad_norm": 0.2861709277289756, + "learning_rate": 0.0001219343982081361, + "loss": 0.9414, + "step": 12862 + }, + { + "epoch": 2.7438139931740615, + "grad_norm": 0.28427916675370896, + "learning_rate": 0.00012192715099075482, + "loss": 0.9438, + "step": 12863 + }, + { + "epoch": 2.744027303754266, + "grad_norm": 0.2457103289885725, + "learning_rate": 0.00012191990329896713, + "loss": 0.9575, + "step": 12864 + }, + { + "epoch": 2.744240614334471, + "grad_norm": 0.2568066771352878, + "learning_rate": 0.00012191265513285502, + "loss": 0.921, + "step": 12865 + }, + { + "epoch": 2.7444539249146755, + "grad_norm": 0.23392861495691308, + "learning_rate": 0.00012190540649250053, + "loss": 0.9638, + "step": 12866 + }, + { + "epoch": 2.7446672354948807, + "grad_norm": 0.25890967220032557, + "learning_rate": 0.00012189815737798566, + "loss": 0.9587, + "step": 12867 + }, + { + "epoch": 2.7448805460750854, + "grad_norm": 0.2550097435917421, + "learning_rate": 0.00012189090778939243, + "loss": 0.944, + "step": 12868 + }, + { + "epoch": 2.74509385665529, + "grad_norm": 0.20564977194491113, + "learning_rate": 0.00012188365772680286, + "loss": 0.9474, + "step": 12869 + }, + { + "epoch": 2.7453071672354947, + "grad_norm": 0.2583378713922196, + "learning_rate": 0.000121876407190299, + "loss": 0.9572, + "step": 12870 + }, + { + "epoch": 2.7455204778157, + "grad_norm": 0.3173417793777285, + "learning_rate": 0.00012186915617996289, + "loss": 0.9614, + "step": 12871 + }, + { + "epoch": 2.7457337883959045, + "grad_norm": 0.33034565426129675, + "learning_rate": 0.00012186190469587658, + "loss": 0.9598, + "step": 12872 + }, + { + "epoch": 2.7459470989761092, + "grad_norm": 0.3561412368555804, + "learning_rate": 0.00012185465273812211, + "loss": 0.9574, + "step": 12873 + }, + { + "epoch": 2.746160409556314, + "grad_norm": 0.3364136017753337, + "learning_rate": 0.00012184740030678155, + "loss": 0.935, + "step": 12874 + }, + { + "epoch": 2.7463737201365186, + "grad_norm": 0.23534482868008405, + "learning_rate": 0.00012184014740193693, + "loss": 0.9339, + "step": 12875 + }, + { + "epoch": 2.7465870307167233, + "grad_norm": 0.2239201892020668, + "learning_rate": 0.00012183289402367037, + "loss": 0.9062, + "step": 12876 + }, + { + "epoch": 2.7468003412969284, + "grad_norm": 0.2602345688453397, + "learning_rate": 0.0001218256401720639, + "loss": 0.9427, + "step": 12877 + }, + { + "epoch": 2.747013651877133, + "grad_norm": 0.30853528418143533, + "learning_rate": 0.0001218183858471996, + "loss": 0.9423, + "step": 12878 + }, + { + "epoch": 2.7472269624573378, + "grad_norm": 0.3083309865456717, + "learning_rate": 0.00012181113104915956, + "loss": 0.9202, + "step": 12879 + }, + { + "epoch": 2.747440273037543, + "grad_norm": 0.33603677245243485, + "learning_rate": 0.00012180387577802589, + "loss": 0.9398, + "step": 12880 + }, + { + "epoch": 2.7476535836177476, + "grad_norm": 0.32762361200060025, + "learning_rate": 0.00012179662003388061, + "loss": 0.9419, + "step": 12881 + }, + { + "epoch": 2.7478668941979523, + "grad_norm": 0.2160741467367968, + "learning_rate": 0.00012178936381680593, + "loss": 0.9173, + "step": 12882 + }, + { + "epoch": 2.748080204778157, + "grad_norm": 0.24208674448380438, + "learning_rate": 0.00012178210712688385, + "loss": 0.9316, + "step": 12883 + }, + { + "epoch": 2.7482935153583616, + "grad_norm": 0.3061843820212581, + "learning_rate": 0.00012177484996419656, + "loss": 0.8992, + "step": 12884 + }, + { + "epoch": 2.7485068259385663, + "grad_norm": 0.3119800789368704, + "learning_rate": 0.00012176759232882613, + "loss": 0.9714, + "step": 12885 + }, + { + "epoch": 2.7487201365187715, + "grad_norm": 0.2889401668870467, + "learning_rate": 0.00012176033422085469, + "loss": 0.9452, + "step": 12886 + }, + { + "epoch": 2.748933447098976, + "grad_norm": 0.2870397403407458, + "learning_rate": 0.00012175307564036434, + "loss": 0.9202, + "step": 12887 + }, + { + "epoch": 2.749146757679181, + "grad_norm": 0.2743992841974185, + "learning_rate": 0.00012174581658743727, + "loss": 0.9329, + "step": 12888 + }, + { + "epoch": 2.7493600682593855, + "grad_norm": 0.2977354549479306, + "learning_rate": 0.00012173855706215558, + "loss": 0.931, + "step": 12889 + }, + { + "epoch": 2.7495733788395906, + "grad_norm": 0.2884350310627876, + "learning_rate": 0.00012173129706460142, + "loss": 0.9106, + "step": 12890 + }, + { + "epoch": 2.7497866894197953, + "grad_norm": 0.2934081185011151, + "learning_rate": 0.0001217240365948569, + "loss": 0.9455, + "step": 12891 + }, + { + "epoch": 2.75, + "grad_norm": 0.25813319826098746, + "learning_rate": 0.00012171677565300424, + "loss": 0.9315, + "step": 12892 + }, + { + "epoch": 2.7502133105802047, + "grad_norm": 0.2920398773050183, + "learning_rate": 0.00012170951423912552, + "loss": 0.9409, + "step": 12893 + }, + { + "epoch": 2.7504266211604094, + "grad_norm": 0.338436069419342, + "learning_rate": 0.00012170225235330298, + "loss": 0.9205, + "step": 12894 + }, + { + "epoch": 2.7506399317406145, + "grad_norm": 0.2774474485804998, + "learning_rate": 0.00012169498999561874, + "loss": 0.943, + "step": 12895 + }, + { + "epoch": 2.750853242320819, + "grad_norm": 0.24571140927568846, + "learning_rate": 0.00012168772716615501, + "loss": 0.9503, + "step": 12896 + }, + { + "epoch": 2.751066552901024, + "grad_norm": 0.30000756399837747, + "learning_rate": 0.00012168046386499392, + "loss": 0.9705, + "step": 12897 + }, + { + "epoch": 2.7512798634812285, + "grad_norm": 0.2611911996771203, + "learning_rate": 0.00012167320009221768, + "loss": 0.9244, + "step": 12898 + }, + { + "epoch": 2.7514931740614337, + "grad_norm": 0.23690849456997062, + "learning_rate": 0.00012166593584790847, + "loss": 0.9605, + "step": 12899 + }, + { + "epoch": 2.7517064846416384, + "grad_norm": 0.30125266125831923, + "learning_rate": 0.00012165867113214852, + "loss": 0.9533, + "step": 12900 + }, + { + "epoch": 2.751919795221843, + "grad_norm": 0.367182471766305, + "learning_rate": 0.00012165140594502, + "loss": 0.908, + "step": 12901 + }, + { + "epoch": 2.7521331058020477, + "grad_norm": 0.3982635656377915, + "learning_rate": 0.00012164414028660512, + "loss": 0.9421, + "step": 12902 + }, + { + "epoch": 2.7523464163822524, + "grad_norm": 0.3880785619296574, + "learning_rate": 0.00012163687415698608, + "loss": 0.9454, + "step": 12903 + }, + { + "epoch": 2.752559726962457, + "grad_norm": 0.3587797237693631, + "learning_rate": 0.00012162960755624511, + "loss": 0.9473, + "step": 12904 + }, + { + "epoch": 2.7527730375426622, + "grad_norm": 0.37501717787074107, + "learning_rate": 0.00012162234048446444, + "loss": 0.9407, + "step": 12905 + }, + { + "epoch": 2.752986348122867, + "grad_norm": 0.31236716830594186, + "learning_rate": 0.0001216150729417263, + "loss": 0.9561, + "step": 12906 + }, + { + "epoch": 2.7531996587030716, + "grad_norm": 0.3171039343527394, + "learning_rate": 0.00012160780492811288, + "loss": 0.9428, + "step": 12907 + }, + { + "epoch": 2.7534129692832767, + "grad_norm": 0.3566655245400639, + "learning_rate": 0.00012160053644370649, + "loss": 0.9373, + "step": 12908 + }, + { + "epoch": 2.7536262798634814, + "grad_norm": 0.2762347755332744, + "learning_rate": 0.00012159326748858927, + "loss": 0.9307, + "step": 12909 + }, + { + "epoch": 2.753839590443686, + "grad_norm": 0.22965687765253986, + "learning_rate": 0.00012158599806284358, + "loss": 0.9515, + "step": 12910 + }, + { + "epoch": 2.7540529010238908, + "grad_norm": 0.289141483324181, + "learning_rate": 0.0001215787281665516, + "loss": 0.9316, + "step": 12911 + }, + { + "epoch": 2.7542662116040955, + "grad_norm": 0.27268477530703256, + "learning_rate": 0.00012157145779979562, + "loss": 0.9311, + "step": 12912 + }, + { + "epoch": 2.7544795221843, + "grad_norm": 0.3057650938665593, + "learning_rate": 0.0001215641869626579, + "loss": 0.9346, + "step": 12913 + }, + { + "epoch": 2.7546928327645053, + "grad_norm": 0.40203426206034815, + "learning_rate": 0.0001215569156552207, + "loss": 0.9175, + "step": 12914 + }, + { + "epoch": 2.75490614334471, + "grad_norm": 0.4503861300100935, + "learning_rate": 0.00012154964387756629, + "loss": 0.9391, + "step": 12915 + }, + { + "epoch": 2.7551194539249146, + "grad_norm": 0.44629960952669884, + "learning_rate": 0.00012154237162977697, + "loss": 0.9654, + "step": 12916 + }, + { + "epoch": 2.7553327645051193, + "grad_norm": 0.39279555437978153, + "learning_rate": 0.00012153509891193502, + "loss": 0.9306, + "step": 12917 + }, + { + "epoch": 2.7555460750853245, + "grad_norm": 0.2794202047803284, + "learning_rate": 0.00012152782572412272, + "loss": 0.9053, + "step": 12918 + }, + { + "epoch": 2.755759385665529, + "grad_norm": 0.2532431800200608, + "learning_rate": 0.00012152055206642235, + "loss": 0.9647, + "step": 12919 + }, + { + "epoch": 2.755972696245734, + "grad_norm": 0.35700422008707017, + "learning_rate": 0.0001215132779389163, + "loss": 0.9582, + "step": 12920 + }, + { + "epoch": 2.7561860068259385, + "grad_norm": 0.3433888227186584, + "learning_rate": 0.00012150600334168673, + "loss": 0.9217, + "step": 12921 + }, + { + "epoch": 2.756399317406143, + "grad_norm": 0.37063347791902074, + "learning_rate": 0.0001214987282748161, + "loss": 0.9344, + "step": 12922 + }, + { + "epoch": 2.756612627986348, + "grad_norm": 0.37093568548041, + "learning_rate": 0.00012149145273838664, + "loss": 0.9101, + "step": 12923 + }, + { + "epoch": 2.756825938566553, + "grad_norm": 0.3394985383393684, + "learning_rate": 0.0001214841767324807, + "loss": 0.9409, + "step": 12924 + }, + { + "epoch": 2.7570392491467577, + "grad_norm": 0.2990892159335158, + "learning_rate": 0.00012147690025718057, + "loss": 0.9296, + "step": 12925 + }, + { + "epoch": 2.7572525597269624, + "grad_norm": 0.26992284517366166, + "learning_rate": 0.00012146962331256865, + "loss": 0.9243, + "step": 12926 + }, + { + "epoch": 2.7574658703071675, + "grad_norm": 0.2543386882565954, + "learning_rate": 0.00012146234589872723, + "loss": 0.9035, + "step": 12927 + }, + { + "epoch": 2.757679180887372, + "grad_norm": 0.24949091675697124, + "learning_rate": 0.00012145506801573867, + "loss": 0.9573, + "step": 12928 + }, + { + "epoch": 2.757892491467577, + "grad_norm": 0.2384229992135583, + "learning_rate": 0.00012144778966368533, + "loss": 0.9502, + "step": 12929 + }, + { + "epoch": 2.7581058020477816, + "grad_norm": 0.31207459960397105, + "learning_rate": 0.00012144051084264956, + "loss": 0.9351, + "step": 12930 + }, + { + "epoch": 2.7583191126279862, + "grad_norm": 0.265507171517601, + "learning_rate": 0.00012143323155271369, + "loss": 0.9225, + "step": 12931 + }, + { + "epoch": 2.758532423208191, + "grad_norm": 0.2599432937465115, + "learning_rate": 0.00012142595179396012, + "loss": 0.9215, + "step": 12932 + }, + { + "epoch": 2.758745733788396, + "grad_norm": 0.2788322730392907, + "learning_rate": 0.00012141867156647117, + "loss": 0.9106, + "step": 12933 + }, + { + "epoch": 2.7589590443686007, + "grad_norm": 0.23122632136170237, + "learning_rate": 0.00012141139087032931, + "loss": 0.9396, + "step": 12934 + }, + { + "epoch": 2.7591723549488054, + "grad_norm": 0.2418714926269476, + "learning_rate": 0.00012140410970561684, + "loss": 0.9183, + "step": 12935 + }, + { + "epoch": 2.75938566552901, + "grad_norm": 0.31782178801070754, + "learning_rate": 0.0001213968280724162, + "loss": 0.9527, + "step": 12936 + }, + { + "epoch": 2.7595989761092152, + "grad_norm": 0.34816591184465345, + "learning_rate": 0.0001213895459708097, + "loss": 0.9373, + "step": 12937 + }, + { + "epoch": 2.75981228668942, + "grad_norm": 0.3794603154461389, + "learning_rate": 0.00012138226340087983, + "loss": 0.9408, + "step": 12938 + }, + { + "epoch": 2.7600255972696246, + "grad_norm": 0.4649329693304015, + "learning_rate": 0.00012137498036270893, + "loss": 0.9225, + "step": 12939 + }, + { + "epoch": 2.7602389078498293, + "grad_norm": 0.3045260641041818, + "learning_rate": 0.00012136769685637949, + "loss": 0.9238, + "step": 12940 + }, + { + "epoch": 2.760452218430034, + "grad_norm": 0.18321893595194744, + "learning_rate": 0.0001213604128819738, + "loss": 0.9113, + "step": 12941 + }, + { + "epoch": 2.7606655290102387, + "grad_norm": 0.262171378582204, + "learning_rate": 0.00012135312843957437, + "loss": 0.9429, + "step": 12942 + }, + { + "epoch": 2.760878839590444, + "grad_norm": 0.33388627539005755, + "learning_rate": 0.00012134584352926358, + "loss": 0.928, + "step": 12943 + }, + { + "epoch": 2.7610921501706485, + "grad_norm": 0.3091541290167756, + "learning_rate": 0.00012133855815112388, + "loss": 0.922, + "step": 12944 + }, + { + "epoch": 2.761305460750853, + "grad_norm": 0.3140043178574118, + "learning_rate": 0.0001213312723052377, + "loss": 0.9626, + "step": 12945 + }, + { + "epoch": 2.7615187713310583, + "grad_norm": 0.34056772683325287, + "learning_rate": 0.0001213239859916875, + "loss": 0.9242, + "step": 12946 + }, + { + "epoch": 2.761732081911263, + "grad_norm": 0.30285995792275977, + "learning_rate": 0.00012131669921055568, + "loss": 0.9256, + "step": 12947 + }, + { + "epoch": 2.7619453924914676, + "grad_norm": 0.22989384747131017, + "learning_rate": 0.00012130941196192471, + "loss": 0.9122, + "step": 12948 + }, + { + "epoch": 2.7621587030716723, + "grad_norm": 0.199981717790852, + "learning_rate": 0.00012130212424587705, + "loss": 0.8931, + "step": 12949 + }, + { + "epoch": 2.762372013651877, + "grad_norm": 0.2239129492678835, + "learning_rate": 0.00012129483606249518, + "loss": 0.9252, + "step": 12950 + }, + { + "epoch": 2.7625853242320817, + "grad_norm": 0.23751624670072088, + "learning_rate": 0.00012128754741186155, + "loss": 0.9543, + "step": 12951 + }, + { + "epoch": 2.762798634812287, + "grad_norm": 0.2950793139200207, + "learning_rate": 0.00012128025829405862, + "loss": 0.9079, + "step": 12952 + }, + { + "epoch": 2.7630119453924915, + "grad_norm": 0.6243270415386116, + "learning_rate": 0.00012127296870916884, + "loss": 0.9724, + "step": 12953 + }, + { + "epoch": 2.763225255972696, + "grad_norm": 0.37853764221994063, + "learning_rate": 0.00012126567865727474, + "loss": 0.9338, + "step": 12954 + }, + { + "epoch": 2.763438566552901, + "grad_norm": 0.26481477337107, + "learning_rate": 0.00012125838813845879, + "loss": 0.9501, + "step": 12955 + }, + { + "epoch": 2.763651877133106, + "grad_norm": 0.24485246741878505, + "learning_rate": 0.0001212510971528035, + "loss": 0.9274, + "step": 12956 + }, + { + "epoch": 2.7638651877133107, + "grad_norm": 0.273582436796495, + "learning_rate": 0.00012124380570039135, + "loss": 0.9297, + "step": 12957 + }, + { + "epoch": 2.7640784982935154, + "grad_norm": 0.31286350642121025, + "learning_rate": 0.00012123651378130484, + "loss": 0.9531, + "step": 12958 + }, + { + "epoch": 2.76429180887372, + "grad_norm": 0.26797710834935085, + "learning_rate": 0.00012122922139562648, + "loss": 0.9468, + "step": 12959 + }, + { + "epoch": 2.7645051194539247, + "grad_norm": 0.2247622494322688, + "learning_rate": 0.0001212219285434388, + "loss": 0.9546, + "step": 12960 + }, + { + "epoch": 2.7647184300341294, + "grad_norm": 0.24735777598500194, + "learning_rate": 0.00012121463522482429, + "loss": 0.9046, + "step": 12961 + }, + { + "epoch": 2.7649317406143346, + "grad_norm": 0.22782478650560153, + "learning_rate": 0.00012120734143986551, + "loss": 0.9509, + "step": 12962 + }, + { + "epoch": 2.7651450511945392, + "grad_norm": 0.23975758195449254, + "learning_rate": 0.00012120004718864494, + "loss": 0.9542, + "step": 12963 + }, + { + "epoch": 2.765358361774744, + "grad_norm": 0.680866877978961, + "learning_rate": 0.00012119275247124516, + "loss": 0.9324, + "step": 12964 + }, + { + "epoch": 2.765571672354949, + "grad_norm": 0.27479896384211366, + "learning_rate": 0.00012118545728774867, + "loss": 0.9017, + "step": 12965 + }, + { + "epoch": 2.7657849829351537, + "grad_norm": 0.2800789291194833, + "learning_rate": 0.00012117816163823807, + "loss": 0.9437, + "step": 12966 + }, + { + "epoch": 2.7659982935153584, + "grad_norm": 0.3478905049800836, + "learning_rate": 0.00012117086552279586, + "loss": 0.9269, + "step": 12967 + }, + { + "epoch": 2.766211604095563, + "grad_norm": 0.4005351678007142, + "learning_rate": 0.00012116356894150463, + "loss": 0.9409, + "step": 12968 + }, + { + "epoch": 2.766424914675768, + "grad_norm": 0.32029087072823836, + "learning_rate": 0.0001211562718944469, + "loss": 0.9467, + "step": 12969 + }, + { + "epoch": 2.7666382252559725, + "grad_norm": 0.2696352375066277, + "learning_rate": 0.00012114897438170527, + "loss": 0.9489, + "step": 12970 + }, + { + "epoch": 2.7668515358361776, + "grad_norm": 0.2616452145701389, + "learning_rate": 0.00012114167640336231, + "loss": 0.941, + "step": 12971 + }, + { + "epoch": 2.7670648464163823, + "grad_norm": 0.2338774320729255, + "learning_rate": 0.0001211343779595006, + "loss": 0.9295, + "step": 12972 + }, + { + "epoch": 2.767278156996587, + "grad_norm": 0.28145147804644893, + "learning_rate": 0.0001211270790502027, + "loss": 0.9577, + "step": 12973 + }, + { + "epoch": 2.7674914675767917, + "grad_norm": 0.2583443237179572, + "learning_rate": 0.00012111977967555122, + "loss": 0.9492, + "step": 12974 + }, + { + "epoch": 2.767704778156997, + "grad_norm": 0.259542994668701, + "learning_rate": 0.0001211124798356287, + "loss": 0.9609, + "step": 12975 + }, + { + "epoch": 2.7679180887372015, + "grad_norm": 0.2651843524638206, + "learning_rate": 0.00012110517953051783, + "loss": 0.9347, + "step": 12976 + }, + { + "epoch": 2.768131399317406, + "grad_norm": 0.2507262755515569, + "learning_rate": 0.00012109787876030112, + "loss": 0.9096, + "step": 12977 + }, + { + "epoch": 2.768344709897611, + "grad_norm": 0.23683726526112434, + "learning_rate": 0.00012109057752506126, + "loss": 0.955, + "step": 12978 + }, + { + "epoch": 2.7685580204778155, + "grad_norm": 0.25592976253267796, + "learning_rate": 0.00012108327582488079, + "loss": 0.9234, + "step": 12979 + }, + { + "epoch": 2.76877133105802, + "grad_norm": 0.30763892223912465, + "learning_rate": 0.00012107597365984238, + "loss": 0.9395, + "step": 12980 + }, + { + "epoch": 2.7689846416382253, + "grad_norm": 0.2686920305612989, + "learning_rate": 0.00012106867103002864, + "loss": 0.9272, + "step": 12981 + }, + { + "epoch": 2.76919795221843, + "grad_norm": 0.2624287121142633, + "learning_rate": 0.00012106136793552218, + "loss": 0.9586, + "step": 12982 + }, + { + "epoch": 2.7694112627986347, + "grad_norm": 0.3637545822929782, + "learning_rate": 0.00012105406437640565, + "loss": 0.9272, + "step": 12983 + }, + { + "epoch": 2.76962457337884, + "grad_norm": 0.3274714925023499, + "learning_rate": 0.00012104676035276172, + "loss": 0.9206, + "step": 12984 + }, + { + "epoch": 2.7698378839590445, + "grad_norm": 0.39929105471240567, + "learning_rate": 0.000121039455864673, + "loss": 0.9522, + "step": 12985 + }, + { + "epoch": 2.770051194539249, + "grad_norm": 0.44148067369810645, + "learning_rate": 0.00012103215091222213, + "loss": 0.9602, + "step": 12986 + }, + { + "epoch": 2.770264505119454, + "grad_norm": 0.38126003447047985, + "learning_rate": 0.00012102484549549179, + "loss": 0.9324, + "step": 12987 + }, + { + "epoch": 2.7704778156996586, + "grad_norm": 0.336862667861697, + "learning_rate": 0.00012101753961456464, + "loss": 0.9582, + "step": 12988 + }, + { + "epoch": 2.7706911262798632, + "grad_norm": 0.34096142100737753, + "learning_rate": 0.00012101023326952331, + "loss": 0.9486, + "step": 12989 + }, + { + "epoch": 2.7709044368600684, + "grad_norm": 0.31827094189954985, + "learning_rate": 0.00012100292646045054, + "loss": 0.9267, + "step": 12990 + }, + { + "epoch": 2.771117747440273, + "grad_norm": 0.26765080181961587, + "learning_rate": 0.00012099561918742894, + "loss": 0.9419, + "step": 12991 + }, + { + "epoch": 2.7713310580204777, + "grad_norm": 0.31493033312896584, + "learning_rate": 0.00012098831145054125, + "loss": 0.9678, + "step": 12992 + }, + { + "epoch": 2.7715443686006824, + "grad_norm": 0.2827699967132814, + "learning_rate": 0.0001209810032498701, + "loss": 0.938, + "step": 12993 + }, + { + "epoch": 2.7717576791808876, + "grad_norm": 0.26222218818281884, + "learning_rate": 0.0001209736945854982, + "loss": 0.9411, + "step": 12994 + }, + { + "epoch": 2.7719709897610922, + "grad_norm": 0.37526021250775365, + "learning_rate": 0.00012096638545750826, + "loss": 0.9199, + "step": 12995 + }, + { + "epoch": 2.772184300341297, + "grad_norm": 0.33688588009257214, + "learning_rate": 0.00012095907586598302, + "loss": 0.929, + "step": 12996 + }, + { + "epoch": 2.7723976109215016, + "grad_norm": 0.3654017395440574, + "learning_rate": 0.00012095176581100512, + "loss": 0.9539, + "step": 12997 + }, + { + "epoch": 2.7726109215017063, + "grad_norm": 0.39157328849479195, + "learning_rate": 0.00012094445529265728, + "loss": 0.9158, + "step": 12998 + }, + { + "epoch": 2.7728242320819114, + "grad_norm": 0.2996629202744626, + "learning_rate": 0.00012093714431102225, + "loss": 0.9257, + "step": 12999 + }, + { + "epoch": 2.773037542662116, + "grad_norm": 0.29263730023550516, + "learning_rate": 0.00012092983286618275, + "loss": 0.9669, + "step": 13000 + }, + { + "epoch": 2.773250853242321, + "grad_norm": 0.3004965619952409, + "learning_rate": 0.00012092252095822151, + "loss": 0.9114, + "step": 13001 + }, + { + "epoch": 2.7734641638225255, + "grad_norm": 0.20860018790611426, + "learning_rate": 0.00012091520858722128, + "loss": 0.9388, + "step": 13002 + }, + { + "epoch": 2.7736774744027306, + "grad_norm": 0.23472618481021595, + "learning_rate": 0.00012090789575326473, + "loss": 0.943, + "step": 13003 + }, + { + "epoch": 2.7738907849829353, + "grad_norm": 0.27281329644408114, + "learning_rate": 0.00012090058245643467, + "loss": 0.9419, + "step": 13004 + }, + { + "epoch": 2.77410409556314, + "grad_norm": 0.35602163872709125, + "learning_rate": 0.00012089326869681382, + "loss": 0.9339, + "step": 13005 + }, + { + "epoch": 2.7743174061433447, + "grad_norm": 0.4455863153914104, + "learning_rate": 0.00012088595447448497, + "loss": 0.9294, + "step": 13006 + }, + { + "epoch": 2.7745307167235493, + "grad_norm": 0.5052918010901858, + "learning_rate": 0.00012087863978953086, + "loss": 0.9455, + "step": 13007 + }, + { + "epoch": 2.774744027303754, + "grad_norm": 0.4993487208198421, + "learning_rate": 0.00012087132464203422, + "loss": 0.9491, + "step": 13008 + }, + { + "epoch": 2.774957337883959, + "grad_norm": 0.48621578272355587, + "learning_rate": 0.00012086400903207786, + "loss": 0.9409, + "step": 13009 + }, + { + "epoch": 2.775170648464164, + "grad_norm": 0.4547011068747766, + "learning_rate": 0.0001208566929597446, + "loss": 0.9428, + "step": 13010 + }, + { + "epoch": 2.7753839590443685, + "grad_norm": 0.358621865574841, + "learning_rate": 0.0001208493764251171, + "loss": 0.9241, + "step": 13011 + }, + { + "epoch": 2.7755972696245736, + "grad_norm": 0.25516912639301076, + "learning_rate": 0.00012084205942827826, + "loss": 0.9317, + "step": 13012 + }, + { + "epoch": 2.7758105802047783, + "grad_norm": 0.29726381482717273, + "learning_rate": 0.00012083474196931082, + "loss": 0.9292, + "step": 13013 + }, + { + "epoch": 2.776023890784983, + "grad_norm": 0.3934334651226233, + "learning_rate": 0.00012082742404829762, + "loss": 0.9336, + "step": 13014 + }, + { + "epoch": 2.7762372013651877, + "grad_norm": 0.3808493774191487, + "learning_rate": 0.00012082010566532138, + "loss": 0.9298, + "step": 13015 + }, + { + "epoch": 2.7764505119453924, + "grad_norm": 0.31663199939527503, + "learning_rate": 0.00012081278682046499, + "loss": 0.9036, + "step": 13016 + }, + { + "epoch": 2.776663822525597, + "grad_norm": 0.24964178805445955, + "learning_rate": 0.00012080546751381121, + "loss": 0.9401, + "step": 13017 + }, + { + "epoch": 2.776877133105802, + "grad_norm": 0.2542154962743044, + "learning_rate": 0.0001207981477454429, + "loss": 0.9632, + "step": 13018 + }, + { + "epoch": 2.777090443686007, + "grad_norm": 0.2977591000594051, + "learning_rate": 0.00012079082751544285, + "loss": 0.9378, + "step": 13019 + }, + { + "epoch": 2.7773037542662116, + "grad_norm": 0.3752925373283652, + "learning_rate": 0.00012078350682389393, + "loss": 0.92, + "step": 13020 + }, + { + "epoch": 2.7775170648464163, + "grad_norm": 0.4481859717749597, + "learning_rate": 0.0001207761856708789, + "loss": 0.9452, + "step": 13021 + }, + { + "epoch": 2.7777303754266214, + "grad_norm": 0.4307561443964621, + "learning_rate": 0.00012076886405648065, + "loss": 0.9073, + "step": 13022 + }, + { + "epoch": 2.777943686006826, + "grad_norm": 0.3578714254840082, + "learning_rate": 0.00012076154198078201, + "loss": 0.9251, + "step": 13023 + }, + { + "epoch": 2.7781569965870307, + "grad_norm": 0.2851537240169882, + "learning_rate": 0.00012075421944386589, + "loss": 0.9297, + "step": 13024 + }, + { + "epoch": 2.7783703071672354, + "grad_norm": 0.27499167567363214, + "learning_rate": 0.00012074689644581505, + "loss": 0.9317, + "step": 13025 + }, + { + "epoch": 2.77858361774744, + "grad_norm": 0.21849004072268136, + "learning_rate": 0.0001207395729867124, + "loss": 0.8926, + "step": 13026 + }, + { + "epoch": 2.778796928327645, + "grad_norm": 0.22519430056163092, + "learning_rate": 0.00012073224906664077, + "loss": 0.9242, + "step": 13027 + }, + { + "epoch": 2.77901023890785, + "grad_norm": 0.3075948088326556, + "learning_rate": 0.00012072492468568309, + "loss": 0.9188, + "step": 13028 + }, + { + "epoch": 2.7792235494880546, + "grad_norm": 0.38758778012819606, + "learning_rate": 0.0001207175998439222, + "loss": 0.94, + "step": 13029 + }, + { + "epoch": 2.7794368600682593, + "grad_norm": 0.4236561799334747, + "learning_rate": 0.00012071027454144096, + "loss": 0.9402, + "step": 13030 + }, + { + "epoch": 2.7796501706484644, + "grad_norm": 0.431132535188254, + "learning_rate": 0.0001207029487783223, + "loss": 0.9493, + "step": 13031 + }, + { + "epoch": 2.779863481228669, + "grad_norm": 0.3195653659190364, + "learning_rate": 0.0001206956225546491, + "loss": 0.9265, + "step": 13032 + }, + { + "epoch": 2.780076791808874, + "grad_norm": 0.29842735920243096, + "learning_rate": 0.00012068829587050421, + "loss": 0.971, + "step": 13033 + }, + { + "epoch": 2.7802901023890785, + "grad_norm": 0.27010631884771985, + "learning_rate": 0.00012068096872597059, + "loss": 0.9251, + "step": 13034 + }, + { + "epoch": 2.780503412969283, + "grad_norm": 0.2596543948804709, + "learning_rate": 0.00012067364112113112, + "loss": 0.9367, + "step": 13035 + }, + { + "epoch": 2.780716723549488, + "grad_norm": 0.3038095854081252, + "learning_rate": 0.00012066631305606874, + "loss": 0.9189, + "step": 13036 + }, + { + "epoch": 2.780930034129693, + "grad_norm": 0.4284363915858806, + "learning_rate": 0.0001206589845308663, + "loss": 0.9427, + "step": 13037 + }, + { + "epoch": 2.7811433447098977, + "grad_norm": 0.4572179717287114, + "learning_rate": 0.00012065165554560679, + "loss": 0.9193, + "step": 13038 + }, + { + "epoch": 2.7813566552901023, + "grad_norm": 0.42465550287888654, + "learning_rate": 0.00012064432610037312, + "loss": 0.9257, + "step": 13039 + }, + { + "epoch": 2.781569965870307, + "grad_norm": 0.3312050014722816, + "learning_rate": 0.00012063699619524822, + "loss": 0.8948, + "step": 13040 + }, + { + "epoch": 2.781783276450512, + "grad_norm": 0.24682381904245462, + "learning_rate": 0.00012062966583031501, + "loss": 0.9108, + "step": 13041 + }, + { + "epoch": 2.781996587030717, + "grad_norm": 0.20950939895772855, + "learning_rate": 0.00012062233500565645, + "loss": 0.9327, + "step": 13042 + }, + { + "epoch": 2.7822098976109215, + "grad_norm": 0.33238867086167295, + "learning_rate": 0.00012061500372135548, + "loss": 0.9396, + "step": 13043 + }, + { + "epoch": 2.782423208191126, + "grad_norm": 0.39493023281458034, + "learning_rate": 0.00012060767197749507, + "loss": 0.9412, + "step": 13044 + }, + { + "epoch": 2.782636518771331, + "grad_norm": 0.38041722152848995, + "learning_rate": 0.00012060033977415818, + "loss": 0.924, + "step": 13045 + }, + { + "epoch": 2.7828498293515356, + "grad_norm": 0.3609131467087814, + "learning_rate": 0.00012059300711142773, + "loss": 0.9492, + "step": 13046 + }, + { + "epoch": 2.7830631399317407, + "grad_norm": 0.2746724094251323, + "learning_rate": 0.00012058567398938675, + "loss": 0.9268, + "step": 13047 + }, + { + "epoch": 2.7832764505119454, + "grad_norm": 0.2837768355363602, + "learning_rate": 0.00012057834040811817, + "loss": 0.9166, + "step": 13048 + }, + { + "epoch": 2.78348976109215, + "grad_norm": 0.22746820630091213, + "learning_rate": 0.00012057100636770501, + "loss": 0.8939, + "step": 13049 + }, + { + "epoch": 2.783703071672355, + "grad_norm": 0.31714210673637555, + "learning_rate": 0.0001205636718682302, + "loss": 0.9572, + "step": 13050 + }, + { + "epoch": 2.78391638225256, + "grad_norm": 0.4431327175988377, + "learning_rate": 0.00012055633690977676, + "loss": 0.9432, + "step": 13051 + }, + { + "epoch": 2.7841296928327646, + "grad_norm": 0.454944002540411, + "learning_rate": 0.0001205490014924277, + "loss": 0.9304, + "step": 13052 + }, + { + "epoch": 2.7843430034129693, + "grad_norm": 0.4126559003826698, + "learning_rate": 0.00012054166561626603, + "loss": 0.9314, + "step": 13053 + }, + { + "epoch": 2.784556313993174, + "grad_norm": 0.36049647690633496, + "learning_rate": 0.00012053432928137469, + "loss": 0.9476, + "step": 13054 + }, + { + "epoch": 2.7847696245733786, + "grad_norm": 0.3040713831630573, + "learning_rate": 0.00012052699248783677, + "loss": 0.929, + "step": 13055 + }, + { + "epoch": 2.7849829351535837, + "grad_norm": 0.18242191397746307, + "learning_rate": 0.00012051965523573522, + "loss": 0.8995, + "step": 13056 + }, + { + "epoch": 2.7851962457337884, + "grad_norm": 0.253177748748441, + "learning_rate": 0.00012051231752515309, + "loss": 0.946, + "step": 13057 + }, + { + "epoch": 2.785409556313993, + "grad_norm": 0.3271404993914826, + "learning_rate": 0.00012050497935617341, + "loss": 0.9662, + "step": 13058 + }, + { + "epoch": 2.785622866894198, + "grad_norm": 0.32838160222007357, + "learning_rate": 0.00012049764072887922, + "loss": 0.9306, + "step": 13059 + }, + { + "epoch": 2.785836177474403, + "grad_norm": 0.3214869216626377, + "learning_rate": 0.00012049030164335352, + "loss": 0.9672, + "step": 13060 + }, + { + "epoch": 2.7860494880546076, + "grad_norm": 0.3505649768345462, + "learning_rate": 0.00012048296209967937, + "loss": 0.9583, + "step": 13061 + }, + { + "epoch": 2.7862627986348123, + "grad_norm": 0.3098065834815824, + "learning_rate": 0.00012047562209793987, + "loss": 0.9267, + "step": 13062 + }, + { + "epoch": 2.786476109215017, + "grad_norm": 0.1845321979338137, + "learning_rate": 0.00012046828163821801, + "loss": 0.8934, + "step": 13063 + }, + { + "epoch": 2.7866894197952217, + "grad_norm": 0.26940914161404883, + "learning_rate": 0.00012046094072059682, + "loss": 0.9071, + "step": 13064 + }, + { + "epoch": 2.7869027303754264, + "grad_norm": 0.28720628358027694, + "learning_rate": 0.00012045359934515945, + "loss": 0.9405, + "step": 13065 + }, + { + "epoch": 2.7871160409556315, + "grad_norm": 0.2930219408398774, + "learning_rate": 0.00012044625751198891, + "loss": 0.8857, + "step": 13066 + }, + { + "epoch": 2.787329351535836, + "grad_norm": 0.28598200867902235, + "learning_rate": 0.00012043891522116828, + "loss": 0.9448, + "step": 13067 + }, + { + "epoch": 2.787542662116041, + "grad_norm": 0.31750683251974376, + "learning_rate": 0.00012043157247278065, + "loss": 0.9692, + "step": 13068 + }, + { + "epoch": 2.787755972696246, + "grad_norm": 0.29663918135311024, + "learning_rate": 0.0001204242292669091, + "loss": 0.9099, + "step": 13069 + }, + { + "epoch": 2.7879692832764507, + "grad_norm": 0.25179519404548745, + "learning_rate": 0.00012041688560363672, + "loss": 0.9184, + "step": 13070 + }, + { + "epoch": 2.7881825938566553, + "grad_norm": 0.2772109740260809, + "learning_rate": 0.00012040954148304657, + "loss": 0.9183, + "step": 13071 + }, + { + "epoch": 2.78839590443686, + "grad_norm": 0.24498278189614323, + "learning_rate": 0.00012040219690522183, + "loss": 0.9124, + "step": 13072 + }, + { + "epoch": 2.7886092150170647, + "grad_norm": 0.20837607195296934, + "learning_rate": 0.0001203948518702455, + "loss": 0.8983, + "step": 13073 + }, + { + "epoch": 2.7888225255972694, + "grad_norm": 0.26998112066924823, + "learning_rate": 0.00012038750637820078, + "loss": 0.9236, + "step": 13074 + }, + { + "epoch": 2.7890358361774745, + "grad_norm": 0.2863581749528059, + "learning_rate": 0.00012038016042917073, + "loss": 0.9416, + "step": 13075 + }, + { + "epoch": 2.789249146757679, + "grad_norm": 0.32773644204097935, + "learning_rate": 0.00012037281402323847, + "loss": 0.9183, + "step": 13076 + }, + { + "epoch": 2.789462457337884, + "grad_norm": 0.3523493864763251, + "learning_rate": 0.00012036546716048716, + "loss": 0.9633, + "step": 13077 + }, + { + "epoch": 2.7896757679180886, + "grad_norm": 0.36109831630807093, + "learning_rate": 0.0001203581198409999, + "loss": 0.9475, + "step": 13078 + }, + { + "epoch": 2.7898890784982937, + "grad_norm": 0.3332326087175948, + "learning_rate": 0.00012035077206485984, + "loss": 0.9274, + "step": 13079 + }, + { + "epoch": 2.7901023890784984, + "grad_norm": 0.2852724183972135, + "learning_rate": 0.00012034342383215013, + "loss": 0.9191, + "step": 13080 + }, + { + "epoch": 2.790315699658703, + "grad_norm": 0.20725708960135955, + "learning_rate": 0.00012033607514295388, + "loss": 0.9172, + "step": 13081 + }, + { + "epoch": 2.7905290102389078, + "grad_norm": 0.2127430159110438, + "learning_rate": 0.00012032872599735427, + "loss": 0.9541, + "step": 13082 + }, + { + "epoch": 2.7907423208191124, + "grad_norm": 0.2572961082863008, + "learning_rate": 0.00012032137639543444, + "loss": 0.9108, + "step": 13083 + }, + { + "epoch": 2.7909556313993176, + "grad_norm": 0.24771889739079736, + "learning_rate": 0.00012031402633727757, + "loss": 0.9686, + "step": 13084 + }, + { + "epoch": 2.7911689419795223, + "grad_norm": 0.24231394236532805, + "learning_rate": 0.0001203066758229668, + "loss": 0.9277, + "step": 13085 + }, + { + "epoch": 2.791382252559727, + "grad_norm": 0.24601037980173826, + "learning_rate": 0.00012029932485258532, + "loss": 0.9342, + "step": 13086 + }, + { + "epoch": 2.7915955631399316, + "grad_norm": 0.20988891651343974, + "learning_rate": 0.00012029197342621632, + "loss": 0.9123, + "step": 13087 + }, + { + "epoch": 2.7918088737201368, + "grad_norm": 0.19275700637948673, + "learning_rate": 0.00012028462154394293, + "loss": 0.9288, + "step": 13088 + }, + { + "epoch": 2.7920221843003414, + "grad_norm": 0.21969337631956556, + "learning_rate": 0.0001202772692058484, + "loss": 0.9051, + "step": 13089 + }, + { + "epoch": 2.792235494880546, + "grad_norm": 0.2095367181543857, + "learning_rate": 0.00012026991641201588, + "loss": 0.96, + "step": 13090 + }, + { + "epoch": 2.792448805460751, + "grad_norm": 0.22220557567277885, + "learning_rate": 0.00012026256316252858, + "loss": 0.9333, + "step": 13091 + }, + { + "epoch": 2.7926621160409555, + "grad_norm": 0.30568521708716173, + "learning_rate": 0.00012025520945746969, + "loss": 0.9559, + "step": 13092 + }, + { + "epoch": 2.79287542662116, + "grad_norm": 0.36463185721960617, + "learning_rate": 0.00012024785529692243, + "loss": 0.9563, + "step": 13093 + }, + { + "epoch": 2.7930887372013653, + "grad_norm": 0.3152778548724783, + "learning_rate": 0.00012024050068097002, + "loss": 0.9456, + "step": 13094 + }, + { + "epoch": 2.79330204778157, + "grad_norm": 0.2167572152859524, + "learning_rate": 0.00012023314560969565, + "loss": 0.9039, + "step": 13095 + }, + { + "epoch": 2.7935153583617747, + "grad_norm": 0.2138037100951881, + "learning_rate": 0.00012022579008318258, + "loss": 0.952, + "step": 13096 + }, + { + "epoch": 2.79372866894198, + "grad_norm": 0.2957783662251471, + "learning_rate": 0.00012021843410151402, + "loss": 0.9278, + "step": 13097 + }, + { + "epoch": 2.7939419795221845, + "grad_norm": 0.4093487534425196, + "learning_rate": 0.00012021107766477319, + "loss": 0.9353, + "step": 13098 + }, + { + "epoch": 2.794155290102389, + "grad_norm": 0.5731774974743634, + "learning_rate": 0.00012020372077304334, + "loss": 0.9205, + "step": 13099 + }, + { + "epoch": 2.794368600682594, + "grad_norm": 0.6502338621981225, + "learning_rate": 0.0001201963634264077, + "loss": 0.9517, + "step": 13100 + }, + { + "epoch": 2.7945819112627985, + "grad_norm": 0.50028423290212, + "learning_rate": 0.00012018900562494957, + "loss": 0.9457, + "step": 13101 + }, + { + "epoch": 2.794795221843003, + "grad_norm": 0.3875212811794572, + "learning_rate": 0.00012018164736875214, + "loss": 0.9143, + "step": 13102 + }, + { + "epoch": 2.7950085324232083, + "grad_norm": 0.28210354118804765, + "learning_rate": 0.00012017428865789872, + "loss": 0.9343, + "step": 13103 + }, + { + "epoch": 2.795221843003413, + "grad_norm": 0.20779047793124023, + "learning_rate": 0.00012016692949247249, + "loss": 0.9381, + "step": 13104 + }, + { + "epoch": 2.7954351535836177, + "grad_norm": 0.3508979612194182, + "learning_rate": 0.00012015956987255682, + "loss": 0.9672, + "step": 13105 + }, + { + "epoch": 2.7956484641638224, + "grad_norm": 0.43394791632297863, + "learning_rate": 0.00012015220979823493, + "loss": 0.9209, + "step": 13106 + }, + { + "epoch": 2.7958617747440275, + "grad_norm": 0.5412280978719742, + "learning_rate": 0.00012014484926959011, + "loss": 0.9423, + "step": 13107 + }, + { + "epoch": 2.796075085324232, + "grad_norm": 0.540035860648384, + "learning_rate": 0.00012013748828670566, + "loss": 0.9274, + "step": 13108 + }, + { + "epoch": 2.796288395904437, + "grad_norm": 0.400847375654452, + "learning_rate": 0.00012013012684966486, + "loss": 0.9417, + "step": 13109 + }, + { + "epoch": 2.7965017064846416, + "grad_norm": 0.22826730971631115, + "learning_rate": 0.00012012276495855098, + "loss": 0.9307, + "step": 13110 + }, + { + "epoch": 2.7967150170648463, + "grad_norm": 0.22867940988745955, + "learning_rate": 0.00012011540261344735, + "loss": 0.9513, + "step": 13111 + }, + { + "epoch": 2.796928327645051, + "grad_norm": 0.3761463356757943, + "learning_rate": 0.00012010803981443725, + "loss": 0.9191, + "step": 13112 + }, + { + "epoch": 2.797141638225256, + "grad_norm": 0.5279183789651379, + "learning_rate": 0.00012010067656160403, + "loss": 0.9124, + "step": 13113 + }, + { + "epoch": 2.7973549488054608, + "grad_norm": 0.5302591603884521, + "learning_rate": 0.00012009331285503095, + "loss": 0.9188, + "step": 13114 + }, + { + "epoch": 2.7975682593856654, + "grad_norm": 0.4616625550618506, + "learning_rate": 0.00012008594869480139, + "loss": 0.9509, + "step": 13115 + }, + { + "epoch": 2.7977815699658706, + "grad_norm": 0.39618753363687814, + "learning_rate": 0.00012007858408099861, + "loss": 0.922, + "step": 13116 + }, + { + "epoch": 2.7979948805460753, + "grad_norm": 0.19771058885765094, + "learning_rate": 0.000120071219013706, + "loss": 0.9284, + "step": 13117 + }, + { + "epoch": 2.79820819112628, + "grad_norm": 0.3069213495376969, + "learning_rate": 0.00012006385349300687, + "loss": 0.921, + "step": 13118 + }, + { + "epoch": 2.7984215017064846, + "grad_norm": 0.3647610451208905, + "learning_rate": 0.0001200564875189846, + "loss": 0.957, + "step": 13119 + }, + { + "epoch": 2.7986348122866893, + "grad_norm": 0.2748248314369991, + "learning_rate": 0.00012004912109172243, + "loss": 0.9057, + "step": 13120 + }, + { + "epoch": 2.798848122866894, + "grad_norm": 0.26614882429308434, + "learning_rate": 0.00012004175421130383, + "loss": 0.9168, + "step": 13121 + }, + { + "epoch": 2.799061433447099, + "grad_norm": 0.2646940110705246, + "learning_rate": 0.0001200343868778121, + "loss": 0.9294, + "step": 13122 + }, + { + "epoch": 2.799274744027304, + "grad_norm": 0.22997349603735645, + "learning_rate": 0.0001200270190913306, + "loss": 0.9248, + "step": 13123 + }, + { + "epoch": 2.7994880546075085, + "grad_norm": 0.26800266950873886, + "learning_rate": 0.00012001965085194271, + "loss": 0.9422, + "step": 13124 + }, + { + "epoch": 2.799701365187713, + "grad_norm": 0.3165296663825907, + "learning_rate": 0.00012001228215973179, + "loss": 0.9196, + "step": 13125 + }, + { + "epoch": 2.7999146757679183, + "grad_norm": 0.30230085820474, + "learning_rate": 0.00012000491301478122, + "loss": 0.9089, + "step": 13126 + }, + { + "epoch": 2.800127986348123, + "grad_norm": 0.27293676791130506, + "learning_rate": 0.00011999754341717443, + "loss": 0.9017, + "step": 13127 + }, + { + "epoch": 2.8003412969283277, + "grad_norm": 0.2640167714760462, + "learning_rate": 0.00011999017336699472, + "loss": 0.9308, + "step": 13128 + }, + { + "epoch": 2.8005546075085324, + "grad_norm": 0.24473409463402185, + "learning_rate": 0.00011998280286432552, + "loss": 0.9238, + "step": 13129 + }, + { + "epoch": 2.800767918088737, + "grad_norm": 0.26559873306911036, + "learning_rate": 0.00011997543190925027, + "loss": 0.9305, + "step": 13130 + }, + { + "epoch": 2.8009812286689417, + "grad_norm": 0.3032834140841371, + "learning_rate": 0.00011996806050185231, + "loss": 0.9406, + "step": 13131 + }, + { + "epoch": 2.801194539249147, + "grad_norm": 0.29915676077479764, + "learning_rate": 0.00011996068864221507, + "loss": 0.9246, + "step": 13132 + }, + { + "epoch": 2.8014078498293515, + "grad_norm": 0.3038456981945021, + "learning_rate": 0.00011995331633042198, + "loss": 0.9513, + "step": 13133 + }, + { + "epoch": 2.801621160409556, + "grad_norm": 0.32910968974987376, + "learning_rate": 0.00011994594356655642, + "loss": 0.929, + "step": 13134 + }, + { + "epoch": 2.8018344709897613, + "grad_norm": 0.2742414460646546, + "learning_rate": 0.00011993857035070186, + "loss": 0.9554, + "step": 13135 + }, + { + "epoch": 2.802047781569966, + "grad_norm": 0.3081174944202124, + "learning_rate": 0.00011993119668294172, + "loss": 0.9444, + "step": 13136 + }, + { + "epoch": 2.8022610921501707, + "grad_norm": 0.36995563570361634, + "learning_rate": 0.0001199238225633594, + "loss": 0.937, + "step": 13137 + }, + { + "epoch": 2.8024744027303754, + "grad_norm": 0.37880599435906515, + "learning_rate": 0.00011991644799203834, + "loss": 0.9234, + "step": 13138 + }, + { + "epoch": 2.80268771331058, + "grad_norm": 0.23930522524812686, + "learning_rate": 0.00011990907296906202, + "loss": 0.9458, + "step": 13139 + }, + { + "epoch": 2.8029010238907848, + "grad_norm": 0.24394897947471494, + "learning_rate": 0.00011990169749451384, + "loss": 0.8938, + "step": 13140 + }, + { + "epoch": 2.80311433447099, + "grad_norm": 0.26783975202754046, + "learning_rate": 0.00011989432156847732, + "loss": 0.9461, + "step": 13141 + }, + { + "epoch": 2.8033276450511946, + "grad_norm": 0.26207128323272716, + "learning_rate": 0.00011988694519103584, + "loss": 0.9167, + "step": 13142 + }, + { + "epoch": 2.8035409556313993, + "grad_norm": 0.2628648374489098, + "learning_rate": 0.00011987956836227294, + "loss": 0.9288, + "step": 13143 + }, + { + "epoch": 2.803754266211604, + "grad_norm": 0.25728452377676075, + "learning_rate": 0.00011987219108227204, + "loss": 0.8806, + "step": 13144 + }, + { + "epoch": 2.803967576791809, + "grad_norm": 0.2598682225178689, + "learning_rate": 0.00011986481335111663, + "loss": 0.9413, + "step": 13145 + }, + { + "epoch": 2.8041808873720138, + "grad_norm": 0.24219588698393785, + "learning_rate": 0.00011985743516889017, + "loss": 0.9325, + "step": 13146 + }, + { + "epoch": 2.8043941979522184, + "grad_norm": 0.21496616673720928, + "learning_rate": 0.00011985005653567619, + "loss": 0.9598, + "step": 13147 + }, + { + "epoch": 2.804607508532423, + "grad_norm": 0.20060721657204583, + "learning_rate": 0.00011984267745155813, + "loss": 0.9442, + "step": 13148 + }, + { + "epoch": 2.804820819112628, + "grad_norm": 0.2726084025955164, + "learning_rate": 0.00011983529791661952, + "loss": 0.9293, + "step": 13149 + }, + { + "epoch": 2.8050341296928325, + "grad_norm": 0.2848543671806925, + "learning_rate": 0.00011982791793094381, + "loss": 0.9024, + "step": 13150 + }, + { + "epoch": 2.8052474402730376, + "grad_norm": 0.28656762418860554, + "learning_rate": 0.00011982053749461457, + "loss": 0.9271, + "step": 13151 + }, + { + "epoch": 2.8054607508532423, + "grad_norm": 0.30819093083325316, + "learning_rate": 0.00011981315660771528, + "loss": 0.9298, + "step": 13152 + }, + { + "epoch": 2.805674061433447, + "grad_norm": 0.2614467053791682, + "learning_rate": 0.00011980577527032944, + "loss": 0.9192, + "step": 13153 + }, + { + "epoch": 2.805887372013652, + "grad_norm": 0.2717041234023186, + "learning_rate": 0.00011979839348254058, + "loss": 0.9903, + "step": 13154 + }, + { + "epoch": 2.806100682593857, + "grad_norm": 0.35789655773510537, + "learning_rate": 0.00011979101124443224, + "loss": 0.903, + "step": 13155 + }, + { + "epoch": 2.8063139931740615, + "grad_norm": 0.2625209571840374, + "learning_rate": 0.00011978362855608793, + "loss": 0.8999, + "step": 13156 + }, + { + "epoch": 2.806527303754266, + "grad_norm": 0.21182728034918286, + "learning_rate": 0.00011977624541759121, + "loss": 0.9266, + "step": 13157 + }, + { + "epoch": 2.806740614334471, + "grad_norm": 0.2825728706896091, + "learning_rate": 0.00011976886182902558, + "loss": 0.9315, + "step": 13158 + }, + { + "epoch": 2.8069539249146755, + "grad_norm": 0.27431450577159905, + "learning_rate": 0.00011976147779047463, + "loss": 0.9319, + "step": 13159 + }, + { + "epoch": 2.8071672354948807, + "grad_norm": 0.3125564641186117, + "learning_rate": 0.00011975409330202188, + "loss": 0.9271, + "step": 13160 + }, + { + "epoch": 2.8073805460750854, + "grad_norm": 0.25960619666612267, + "learning_rate": 0.00011974670836375089, + "loss": 0.9396, + "step": 13161 + }, + { + "epoch": 2.80759385665529, + "grad_norm": 0.24122005348071726, + "learning_rate": 0.00011973932297574522, + "loss": 0.9186, + "step": 13162 + }, + { + "epoch": 2.8078071672354947, + "grad_norm": 0.2613491135804719, + "learning_rate": 0.00011973193713808847, + "loss": 0.9079, + "step": 13163 + }, + { + "epoch": 2.8080204778157, + "grad_norm": 0.1694067936704578, + "learning_rate": 0.00011972455085086417, + "loss": 0.8934, + "step": 13164 + }, + { + "epoch": 2.8082337883959045, + "grad_norm": 0.2300869313543057, + "learning_rate": 0.00011971716411415592, + "loss": 0.9119, + "step": 13165 + }, + { + "epoch": 2.8084470989761092, + "grad_norm": 0.21145538512160558, + "learning_rate": 0.00011970977692804726, + "loss": 0.9638, + "step": 13166 + }, + { + "epoch": 2.808660409556314, + "grad_norm": 0.28260536368182226, + "learning_rate": 0.00011970238929262183, + "loss": 0.9749, + "step": 13167 + }, + { + "epoch": 2.8088737201365186, + "grad_norm": 0.35107558638023223, + "learning_rate": 0.00011969500120796318, + "loss": 0.9659, + "step": 13168 + }, + { + "epoch": 2.8090870307167233, + "grad_norm": 0.2516384885319517, + "learning_rate": 0.00011968761267415496, + "loss": 0.9283, + "step": 13169 + }, + { + "epoch": 2.8093003412969284, + "grad_norm": 0.28626094414191344, + "learning_rate": 0.00011968022369128069, + "loss": 0.9457, + "step": 13170 + }, + { + "epoch": 2.809513651877133, + "grad_norm": 0.25444775082513227, + "learning_rate": 0.00011967283425942404, + "loss": 0.9292, + "step": 13171 + }, + { + "epoch": 2.8097269624573378, + "grad_norm": 0.20882588895816917, + "learning_rate": 0.00011966544437866859, + "loss": 0.9335, + "step": 13172 + }, + { + "epoch": 2.809940273037543, + "grad_norm": 0.2384032654071181, + "learning_rate": 0.00011965805404909799, + "loss": 0.951, + "step": 13173 + }, + { + "epoch": 2.8101535836177476, + "grad_norm": 0.262803389284208, + "learning_rate": 0.00011965066327079582, + "loss": 0.9471, + "step": 13174 + }, + { + "epoch": 2.8103668941979523, + "grad_norm": 0.327705724177096, + "learning_rate": 0.00011964327204384576, + "loss": 0.937, + "step": 13175 + }, + { + "epoch": 2.810580204778157, + "grad_norm": 0.2947526040793901, + "learning_rate": 0.00011963588036833138, + "loss": 0.9477, + "step": 13176 + }, + { + "epoch": 2.8107935153583616, + "grad_norm": 0.22375051942710203, + "learning_rate": 0.00011962848824433639, + "loss": 0.93, + "step": 13177 + }, + { + "epoch": 2.8110068259385663, + "grad_norm": 0.20752721510958624, + "learning_rate": 0.00011962109567194432, + "loss": 0.9279, + "step": 13178 + }, + { + "epoch": 2.8112201365187715, + "grad_norm": 0.21438235860571175, + "learning_rate": 0.00011961370265123895, + "loss": 0.94, + "step": 13179 + }, + { + "epoch": 2.811433447098976, + "grad_norm": 0.22927050765204196, + "learning_rate": 0.00011960630918230383, + "loss": 0.925, + "step": 13180 + }, + { + "epoch": 2.811646757679181, + "grad_norm": 0.23967465967781568, + "learning_rate": 0.00011959891526522268, + "loss": 0.8933, + "step": 13181 + }, + { + "epoch": 2.8118600682593855, + "grad_norm": 0.22999239816599182, + "learning_rate": 0.00011959152090007913, + "loss": 0.9347, + "step": 13182 + }, + { + "epoch": 2.8120733788395906, + "grad_norm": 0.2324872356678991, + "learning_rate": 0.00011958412608695685, + "loss": 0.8891, + "step": 13183 + }, + { + "epoch": 2.8122866894197953, + "grad_norm": 0.3717168725913832, + "learning_rate": 0.0001195767308259395, + "loss": 0.9483, + "step": 13184 + }, + { + "epoch": 2.8125, + "grad_norm": 0.2690333509474061, + "learning_rate": 0.0001195693351171108, + "loss": 0.9466, + "step": 13185 + }, + { + "epoch": 2.8127133105802047, + "grad_norm": 0.3572758923668195, + "learning_rate": 0.00011956193896055441, + "loss": 0.9102, + "step": 13186 + }, + { + "epoch": 2.8129266211604094, + "grad_norm": 0.444292000372954, + "learning_rate": 0.00011955454235635402, + "loss": 0.9737, + "step": 13187 + }, + { + "epoch": 2.8131399317406145, + "grad_norm": 0.47601633923661973, + "learning_rate": 0.00011954714530459328, + "loss": 0.9468, + "step": 13188 + }, + { + "epoch": 2.813353242320819, + "grad_norm": 0.41397194646489116, + "learning_rate": 0.00011953974780535596, + "loss": 0.9524, + "step": 13189 + }, + { + "epoch": 2.813566552901024, + "grad_norm": 0.30785140643952996, + "learning_rate": 0.00011953234985872571, + "loss": 0.9323, + "step": 13190 + }, + { + "epoch": 2.8137798634812285, + "grad_norm": 0.2854848934311222, + "learning_rate": 0.00011952495146478629, + "loss": 0.9224, + "step": 13191 + }, + { + "epoch": 2.8139931740614337, + "grad_norm": 0.2518034418293954, + "learning_rate": 0.00011951755262362133, + "loss": 0.9467, + "step": 13192 + }, + { + "epoch": 2.8142064846416384, + "grad_norm": 0.3048408979457343, + "learning_rate": 0.00011951015333531463, + "loss": 0.9362, + "step": 13193 + }, + { + "epoch": 2.814419795221843, + "grad_norm": 0.3759563178509169, + "learning_rate": 0.00011950275359994987, + "loss": 0.9169, + "step": 13194 + }, + { + "epoch": 2.8146331058020477, + "grad_norm": 0.3316734789681974, + "learning_rate": 0.00011949535341761078, + "loss": 0.9199, + "step": 13195 + }, + { + "epoch": 2.8148464163822524, + "grad_norm": 0.30328231672178374, + "learning_rate": 0.00011948795278838111, + "loss": 0.9473, + "step": 13196 + }, + { + "epoch": 2.815059726962457, + "grad_norm": 0.2965880850966745, + "learning_rate": 0.00011948055171234462, + "loss": 0.9197, + "step": 13197 + }, + { + "epoch": 2.8152730375426622, + "grad_norm": 0.26374588435947177, + "learning_rate": 0.00011947315018958497, + "loss": 0.9532, + "step": 13198 + }, + { + "epoch": 2.815486348122867, + "grad_norm": 0.29915821085371236, + "learning_rate": 0.000119465748220186, + "loss": 0.9053, + "step": 13199 + }, + { + "epoch": 2.8156996587030716, + "grad_norm": 0.33917468213551033, + "learning_rate": 0.0001194583458042314, + "loss": 0.933, + "step": 13200 + }, + { + "epoch": 2.8159129692832767, + "grad_norm": 0.3007478135444634, + "learning_rate": 0.00011945094294180498, + "loss": 0.9394, + "step": 13201 + }, + { + "epoch": 2.8161262798634814, + "grad_norm": 0.24388235512023562, + "learning_rate": 0.00011944353963299045, + "loss": 0.9096, + "step": 13202 + }, + { + "epoch": 2.816339590443686, + "grad_norm": 0.22460332846999378, + "learning_rate": 0.00011943613587787163, + "loss": 0.943, + "step": 13203 + }, + { + "epoch": 2.8165529010238908, + "grad_norm": 0.2500114907186946, + "learning_rate": 0.00011942873167653226, + "loss": 0.9325, + "step": 13204 + }, + { + "epoch": 2.8167662116040955, + "grad_norm": 0.26520098678160825, + "learning_rate": 0.00011942132702905613, + "loss": 0.9472, + "step": 13205 + }, + { + "epoch": 2.8169795221843, + "grad_norm": 0.2720033266269855, + "learning_rate": 0.00011941392193552699, + "loss": 0.9155, + "step": 13206 + }, + { + "epoch": 2.8171928327645053, + "grad_norm": 0.2477190710468138, + "learning_rate": 0.0001194065163960287, + "loss": 0.9409, + "step": 13207 + }, + { + "epoch": 2.81740614334471, + "grad_norm": 0.18433852293376152, + "learning_rate": 0.00011939911041064501, + "loss": 0.8967, + "step": 13208 + }, + { + "epoch": 2.8176194539249146, + "grad_norm": 0.21941807720011394, + "learning_rate": 0.00011939170397945971, + "loss": 0.9429, + "step": 13209 + }, + { + "epoch": 2.8178327645051193, + "grad_norm": 0.2687134920823798, + "learning_rate": 0.00011938429710255662, + "loss": 0.9418, + "step": 13210 + }, + { + "epoch": 2.8180460750853245, + "grad_norm": 0.27713403522676133, + "learning_rate": 0.00011937688978001955, + "loss": 0.9401, + "step": 13211 + }, + { + "epoch": 2.818259385665529, + "grad_norm": 0.32315111156281906, + "learning_rate": 0.0001193694820119323, + "loss": 0.9419, + "step": 13212 + }, + { + "epoch": 2.818472696245734, + "grad_norm": 0.4272558417717473, + "learning_rate": 0.00011936207379837872, + "loss": 0.9148, + "step": 13213 + }, + { + "epoch": 2.8186860068259385, + "grad_norm": 0.39127013064271887, + "learning_rate": 0.00011935466513944258, + "loss": 0.9488, + "step": 13214 + }, + { + "epoch": 2.818899317406143, + "grad_norm": 0.31218174000634136, + "learning_rate": 0.00011934725603520777, + "loss": 0.9431, + "step": 13215 + }, + { + "epoch": 2.819112627986348, + "grad_norm": 0.2875160833405537, + "learning_rate": 0.00011933984648575808, + "loss": 0.9221, + "step": 13216 + }, + { + "epoch": 2.819325938566553, + "grad_norm": 0.2537778912551882, + "learning_rate": 0.00011933243649117739, + "loss": 0.9312, + "step": 13217 + }, + { + "epoch": 2.8195392491467577, + "grad_norm": 0.21944716676089568, + "learning_rate": 0.0001193250260515495, + "loss": 0.942, + "step": 13218 + }, + { + "epoch": 2.8197525597269624, + "grad_norm": 0.33582600017833136, + "learning_rate": 0.00011931761516695828, + "loss": 0.9572, + "step": 13219 + }, + { + "epoch": 2.8199658703071675, + "grad_norm": 0.34345200468956477, + "learning_rate": 0.00011931020383748759, + "loss": 0.9072, + "step": 13220 + }, + { + "epoch": 2.820179180887372, + "grad_norm": 0.23232748351722166, + "learning_rate": 0.00011930279206322128, + "loss": 0.9226, + "step": 13221 + }, + { + "epoch": 2.820392491467577, + "grad_norm": 0.2776211961690262, + "learning_rate": 0.00011929537984424319, + "loss": 0.9447, + "step": 13222 + }, + { + "epoch": 2.8206058020477816, + "grad_norm": 0.31456101030540357, + "learning_rate": 0.00011928796718063724, + "loss": 0.9357, + "step": 13223 + }, + { + "epoch": 2.8208191126279862, + "grad_norm": 0.28322343307470266, + "learning_rate": 0.00011928055407248727, + "loss": 0.9421, + "step": 13224 + }, + { + "epoch": 2.821032423208191, + "grad_norm": 0.32436647061720025, + "learning_rate": 0.00011927314051987721, + "loss": 0.9326, + "step": 13225 + }, + { + "epoch": 2.821245733788396, + "grad_norm": 0.2844026390231972, + "learning_rate": 0.00011926572652289086, + "loss": 0.8926, + "step": 13226 + }, + { + "epoch": 2.8214590443686007, + "grad_norm": 0.26929733320793386, + "learning_rate": 0.00011925831208161218, + "loss": 0.915, + "step": 13227 + }, + { + "epoch": 2.8216723549488054, + "grad_norm": 0.3557549943091226, + "learning_rate": 0.000119250897196125, + "loss": 0.9088, + "step": 13228 + }, + { + "epoch": 2.82188566552901, + "grad_norm": 0.3305358216762406, + "learning_rate": 0.00011924348186651331, + "loss": 0.9172, + "step": 13229 + }, + { + "epoch": 2.8220989761092152, + "grad_norm": 0.28720965987396696, + "learning_rate": 0.0001192360660928609, + "loss": 0.9069, + "step": 13230 + }, + { + "epoch": 2.82231228668942, + "grad_norm": 0.27752405461459906, + "learning_rate": 0.00011922864987525182, + "loss": 0.9395, + "step": 13231 + }, + { + "epoch": 2.8225255972696246, + "grad_norm": 0.3188327724054863, + "learning_rate": 0.00011922123321376983, + "loss": 0.964, + "step": 13232 + }, + { + "epoch": 2.8227389078498293, + "grad_norm": 0.35372410929126985, + "learning_rate": 0.00011921381610849898, + "loss": 0.9062, + "step": 13233 + }, + { + "epoch": 2.822952218430034, + "grad_norm": 0.38924658034901916, + "learning_rate": 0.00011920639855952311, + "loss": 0.9123, + "step": 13234 + }, + { + "epoch": 2.8231655290102387, + "grad_norm": 0.34380197485113495, + "learning_rate": 0.0001191989805669262, + "loss": 0.9313, + "step": 13235 + }, + { + "epoch": 2.823378839590444, + "grad_norm": 0.3154273962492481, + "learning_rate": 0.00011919156213079217, + "loss": 0.9263, + "step": 13236 + }, + { + "epoch": 2.8235921501706485, + "grad_norm": 0.24958031009259143, + "learning_rate": 0.00011918414325120495, + "loss": 0.9263, + "step": 13237 + }, + { + "epoch": 2.823805460750853, + "grad_norm": 0.286884366027497, + "learning_rate": 0.00011917672392824847, + "loss": 0.9612, + "step": 13238 + }, + { + "epoch": 2.8240187713310583, + "grad_norm": 0.40153326921997035, + "learning_rate": 0.00011916930416200672, + "loss": 0.9685, + "step": 13239 + }, + { + "epoch": 2.824232081911263, + "grad_norm": 0.6221273367099692, + "learning_rate": 0.00011916188395256362, + "loss": 0.9306, + "step": 13240 + }, + { + "epoch": 2.8244453924914676, + "grad_norm": 0.6860581569866537, + "learning_rate": 0.00011915446330000316, + "loss": 0.9418, + "step": 13241 + }, + { + "epoch": 2.8246587030716723, + "grad_norm": 0.5273804665905784, + "learning_rate": 0.0001191470422044093, + "loss": 0.9221, + "step": 13242 + }, + { + "epoch": 2.824872013651877, + "grad_norm": 0.3025235121153707, + "learning_rate": 0.00011913962066586596, + "loss": 0.8961, + "step": 13243 + }, + { + "epoch": 2.8250853242320817, + "grad_norm": 0.284253912456801, + "learning_rate": 0.00011913219868445718, + "loss": 0.9209, + "step": 13244 + }, + { + "epoch": 2.825298634812287, + "grad_norm": 0.263440999489158, + "learning_rate": 0.00011912477626026692, + "loss": 0.9341, + "step": 13245 + }, + { + "epoch": 2.8255119453924915, + "grad_norm": 0.3405825032370681, + "learning_rate": 0.00011911735339337914, + "loss": 0.9158, + "step": 13246 + }, + { + "epoch": 2.825725255972696, + "grad_norm": 0.34655337607144854, + "learning_rate": 0.00011910993008387785, + "loss": 0.9201, + "step": 13247 + }, + { + "epoch": 2.825938566552901, + "grad_norm": 0.3518813504688298, + "learning_rate": 0.00011910250633184705, + "loss": 0.9499, + "step": 13248 + }, + { + "epoch": 2.826151877133106, + "grad_norm": 0.3344063381781992, + "learning_rate": 0.00011909508213737074, + "loss": 0.9359, + "step": 13249 + }, + { + "epoch": 2.8263651877133107, + "grad_norm": 0.2435059691291281, + "learning_rate": 0.00011908765750053291, + "loss": 0.9463, + "step": 13250 + }, + { + "epoch": 2.8265784982935154, + "grad_norm": 0.24322414969046316, + "learning_rate": 0.00011908023242141758, + "loss": 0.9533, + "step": 13251 + }, + { + "epoch": 2.82679180887372, + "grad_norm": 0.3419193001003238, + "learning_rate": 0.00011907280690010874, + "loss": 0.9399, + "step": 13252 + }, + { + "epoch": 2.8270051194539247, + "grad_norm": 0.39354814515577424, + "learning_rate": 0.00011906538093669046, + "loss": 0.9376, + "step": 13253 + }, + { + "epoch": 2.8272184300341294, + "grad_norm": 0.3028986300911718, + "learning_rate": 0.00011905795453124674, + "loss": 0.9426, + "step": 13254 + }, + { + "epoch": 2.8274317406143346, + "grad_norm": 0.2601024375730263, + "learning_rate": 0.0001190505276838616, + "loss": 0.8935, + "step": 13255 + }, + { + "epoch": 2.8276450511945392, + "grad_norm": 0.2572192553018925, + "learning_rate": 0.0001190431003946191, + "loss": 0.9106, + "step": 13256 + }, + { + "epoch": 2.827858361774744, + "grad_norm": 0.30540466196656113, + "learning_rate": 0.00011903567266360323, + "loss": 0.9192, + "step": 13257 + }, + { + "epoch": 2.828071672354949, + "grad_norm": 0.34101349922730817, + "learning_rate": 0.00011902824449089807, + "loss": 0.9407, + "step": 13258 + }, + { + "epoch": 2.8282849829351537, + "grad_norm": 0.270856673549415, + "learning_rate": 0.00011902081587658768, + "loss": 0.9137, + "step": 13259 + }, + { + "epoch": 2.8284982935153584, + "grad_norm": 0.24952474157892562, + "learning_rate": 0.00011901338682075612, + "loss": 0.9253, + "step": 13260 + }, + { + "epoch": 2.828711604095563, + "grad_norm": 0.23008364361327963, + "learning_rate": 0.00011900595732348741, + "loss": 0.9302, + "step": 13261 + }, + { + "epoch": 2.828924914675768, + "grad_norm": 0.20319913232215264, + "learning_rate": 0.00011899852738486562, + "loss": 0.9144, + "step": 13262 + }, + { + "epoch": 2.8291382252559725, + "grad_norm": 0.20409189582852733, + "learning_rate": 0.00011899109700497487, + "loss": 0.9307, + "step": 13263 + }, + { + "epoch": 2.8293515358361776, + "grad_norm": 0.21941099441870837, + "learning_rate": 0.00011898366618389919, + "loss": 0.9114, + "step": 13264 + }, + { + "epoch": 2.8295648464163823, + "grad_norm": 0.2563841426264731, + "learning_rate": 0.00011897623492172265, + "loss": 0.9649, + "step": 13265 + }, + { + "epoch": 2.829778156996587, + "grad_norm": 0.2049492394779054, + "learning_rate": 0.00011896880321852937, + "loss": 0.9319, + "step": 13266 + }, + { + "epoch": 2.8299914675767917, + "grad_norm": 0.2092818407420398, + "learning_rate": 0.00011896137107440345, + "loss": 0.9403, + "step": 13267 + }, + { + "epoch": 2.830204778156997, + "grad_norm": 0.26043008186680616, + "learning_rate": 0.00011895393848942892, + "loss": 0.9224, + "step": 13268 + }, + { + "epoch": 2.8304180887372015, + "grad_norm": 0.22563873646642232, + "learning_rate": 0.00011894650546368996, + "loss": 0.9352, + "step": 13269 + }, + { + "epoch": 2.830631399317406, + "grad_norm": 0.21504459790167602, + "learning_rate": 0.00011893907199727063, + "loss": 0.9178, + "step": 13270 + }, + { + "epoch": 2.830844709897611, + "grad_norm": 0.2247885057536863, + "learning_rate": 0.00011893163809025501, + "loss": 0.9253, + "step": 13271 + }, + { + "epoch": 2.8310580204778155, + "grad_norm": 0.26907314973151886, + "learning_rate": 0.0001189242037427273, + "loss": 0.931, + "step": 13272 + }, + { + "epoch": 2.83127133105802, + "grad_norm": 0.3167687698159706, + "learning_rate": 0.00011891676895477153, + "loss": 0.9025, + "step": 13273 + }, + { + "epoch": 2.8314846416382253, + "grad_norm": 0.2609897870281867, + "learning_rate": 0.00011890933372647187, + "loss": 0.9135, + "step": 13274 + }, + { + "epoch": 2.83169795221843, + "grad_norm": 0.23471254484278647, + "learning_rate": 0.00011890189805791245, + "loss": 0.9491, + "step": 13275 + }, + { + "epoch": 2.8319112627986347, + "grad_norm": 0.26347874919174497, + "learning_rate": 0.00011889446194917743, + "loss": 0.9579, + "step": 13276 + }, + { + "epoch": 2.83212457337884, + "grad_norm": 0.25190869256075843, + "learning_rate": 0.00011888702540035086, + "loss": 0.9533, + "step": 13277 + }, + { + "epoch": 2.8323378839590445, + "grad_norm": 0.23435432759065275, + "learning_rate": 0.00011887958841151698, + "loss": 0.9477, + "step": 13278 + }, + { + "epoch": 2.832551194539249, + "grad_norm": 0.24913501343961875, + "learning_rate": 0.00011887215098275993, + "loss": 0.9561, + "step": 13279 + }, + { + "epoch": 2.832764505119454, + "grad_norm": 0.22582909912862403, + "learning_rate": 0.00011886471311416378, + "loss": 0.9445, + "step": 13280 + }, + { + "epoch": 2.8329778156996586, + "grad_norm": 0.20614117294886405, + "learning_rate": 0.00011885727480581278, + "loss": 0.9258, + "step": 13281 + }, + { + "epoch": 2.8331911262798632, + "grad_norm": 0.30842579906256146, + "learning_rate": 0.00011884983605779106, + "loss": 0.9425, + "step": 13282 + }, + { + "epoch": 2.8334044368600684, + "grad_norm": 0.3567825265026944, + "learning_rate": 0.00011884239687018277, + "loss": 0.9337, + "step": 13283 + }, + { + "epoch": 2.833617747440273, + "grad_norm": 0.31632908426864165, + "learning_rate": 0.00011883495724307213, + "loss": 0.9237, + "step": 13284 + }, + { + "epoch": 2.8338310580204777, + "grad_norm": 0.29643543968606384, + "learning_rate": 0.00011882751717654328, + "loss": 0.9486, + "step": 13285 + }, + { + "epoch": 2.8340443686006824, + "grad_norm": 0.23022071970782, + "learning_rate": 0.00011882007667068043, + "loss": 0.925, + "step": 13286 + }, + { + "epoch": 2.8342576791808876, + "grad_norm": 0.25353305957464606, + "learning_rate": 0.00011881263572556777, + "loss": 0.925, + "step": 13287 + }, + { + "epoch": 2.8344709897610922, + "grad_norm": 0.26287014196404807, + "learning_rate": 0.00011880519434128947, + "loss": 0.8785, + "step": 13288 + }, + { + "epoch": 2.834684300341297, + "grad_norm": 0.26929428691776025, + "learning_rate": 0.00011879775251792973, + "loss": 0.9247, + "step": 13289 + }, + { + "epoch": 2.8348976109215016, + "grad_norm": 0.286656905487531, + "learning_rate": 0.0001187903102555728, + "loss": 0.9029, + "step": 13290 + }, + { + "epoch": 2.8351109215017063, + "grad_norm": 0.32799745301039585, + "learning_rate": 0.00011878286755430284, + "loss": 0.8933, + "step": 13291 + }, + { + "epoch": 2.8353242320819114, + "grad_norm": 0.3605884543288265, + "learning_rate": 0.00011877542441420407, + "loss": 0.9057, + "step": 13292 + }, + { + "epoch": 2.835537542662116, + "grad_norm": 0.2996457219057551, + "learning_rate": 0.00011876798083536072, + "loss": 0.9205, + "step": 13293 + }, + { + "epoch": 2.835750853242321, + "grad_norm": 0.20769627394293175, + "learning_rate": 0.00011876053681785703, + "loss": 0.915, + "step": 13294 + }, + { + "epoch": 2.8359641638225255, + "grad_norm": 0.20458140173797557, + "learning_rate": 0.00011875309236177722, + "loss": 0.921, + "step": 13295 + }, + { + "epoch": 2.8361774744027306, + "grad_norm": 0.23586306108780858, + "learning_rate": 0.0001187456474672055, + "loss": 0.9375, + "step": 13296 + }, + { + "epoch": 2.8363907849829353, + "grad_norm": 0.6845414193761923, + "learning_rate": 0.00011873820213422615, + "loss": 0.9478, + "step": 13297 + }, + { + "epoch": 2.83660409556314, + "grad_norm": 0.21797047349756096, + "learning_rate": 0.00011873075636292337, + "loss": 0.9534, + "step": 13298 + }, + { + "epoch": 2.8368174061433447, + "grad_norm": 0.2581866899905397, + "learning_rate": 0.00011872331015338143, + "loss": 0.94, + "step": 13299 + }, + { + "epoch": 2.8370307167235493, + "grad_norm": 0.25489510769415863, + "learning_rate": 0.00011871586350568459, + "loss": 0.9296, + "step": 13300 + }, + { + "epoch": 2.837244027303754, + "grad_norm": 0.20739417100055055, + "learning_rate": 0.0001187084164199171, + "loss": 0.9403, + "step": 13301 + }, + { + "epoch": 2.837457337883959, + "grad_norm": 0.23688673829710707, + "learning_rate": 0.00011870096889616324, + "loss": 0.9243, + "step": 13302 + }, + { + "epoch": 2.837670648464164, + "grad_norm": 0.2587204407643066, + "learning_rate": 0.00011869352093450727, + "loss": 0.9252, + "step": 13303 + }, + { + "epoch": 2.8378839590443685, + "grad_norm": 0.22030217775473612, + "learning_rate": 0.00011868607253503345, + "loss": 0.9231, + "step": 13304 + }, + { + "epoch": 2.8380972696245736, + "grad_norm": 0.26982563182414737, + "learning_rate": 0.00011867862369782604, + "loss": 0.9623, + "step": 13305 + }, + { + "epoch": 2.8383105802047783, + "grad_norm": 0.25610230803505835, + "learning_rate": 0.00011867117442296938, + "loss": 0.9368, + "step": 13306 + }, + { + "epoch": 2.838523890784983, + "grad_norm": 0.2569157032909183, + "learning_rate": 0.00011866372471054773, + "loss": 0.9466, + "step": 13307 + }, + { + "epoch": 2.8387372013651877, + "grad_norm": 0.30317059576963645, + "learning_rate": 0.00011865627456064537, + "loss": 0.9448, + "step": 13308 + }, + { + "epoch": 2.8389505119453924, + "grad_norm": 0.33164864110016545, + "learning_rate": 0.00011864882397334666, + "loss": 0.9189, + "step": 13309 + }, + { + "epoch": 2.839163822525597, + "grad_norm": 0.30040297500664365, + "learning_rate": 0.00011864137294873582, + "loss": 0.8874, + "step": 13310 + }, + { + "epoch": 2.839377133105802, + "grad_norm": 0.22764333852403573, + "learning_rate": 0.00011863392148689719, + "loss": 0.9417, + "step": 13311 + }, + { + "epoch": 2.839590443686007, + "grad_norm": 0.22191602458770404, + "learning_rate": 0.00011862646958791509, + "loss": 0.938, + "step": 13312 + }, + { + "epoch": 2.8398037542662116, + "grad_norm": 0.2187065031363483, + "learning_rate": 0.00011861901725187385, + "loss": 0.9188, + "step": 13313 + }, + { + "epoch": 2.8400170648464163, + "grad_norm": 0.2432951438621846, + "learning_rate": 0.00011861156447885777, + "loss": 0.9327, + "step": 13314 + }, + { + "epoch": 2.8402303754266214, + "grad_norm": 0.2878747521487744, + "learning_rate": 0.0001186041112689512, + "loss": 0.9285, + "step": 13315 + }, + { + "epoch": 2.840443686006826, + "grad_norm": 0.3583107181855176, + "learning_rate": 0.00011859665762223846, + "loss": 0.9426, + "step": 13316 + }, + { + "epoch": 2.8406569965870307, + "grad_norm": 0.32562577952751404, + "learning_rate": 0.00011858920353880387, + "loss": 0.9177, + "step": 13317 + }, + { + "epoch": 2.8408703071672354, + "grad_norm": 0.28934031101549795, + "learning_rate": 0.00011858174901873181, + "loss": 0.8947, + "step": 13318 + }, + { + "epoch": 2.84108361774744, + "grad_norm": 0.2563969920252112, + "learning_rate": 0.00011857429406210661, + "loss": 0.9052, + "step": 13319 + }, + { + "epoch": 2.841296928327645, + "grad_norm": 0.21190856145191808, + "learning_rate": 0.00011856683866901263, + "loss": 0.9667, + "step": 13320 + }, + { + "epoch": 2.84151023890785, + "grad_norm": 0.2670456852197853, + "learning_rate": 0.0001185593828395342, + "loss": 0.9506, + "step": 13321 + }, + { + "epoch": 2.8417235494880546, + "grad_norm": 0.2591633488844704, + "learning_rate": 0.00011855192657375572, + "loss": 0.9307, + "step": 13322 + }, + { + "epoch": 2.8419368600682593, + "grad_norm": 0.2545064666506085, + "learning_rate": 0.00011854446987176154, + "loss": 0.9458, + "step": 13323 + }, + { + "epoch": 2.8421501706484644, + "grad_norm": 0.26838518502450864, + "learning_rate": 0.00011853701273363604, + "loss": 0.9361, + "step": 13324 + }, + { + "epoch": 2.842363481228669, + "grad_norm": 0.26772933312232716, + "learning_rate": 0.00011852955515946357, + "loss": 0.9084, + "step": 13325 + }, + { + "epoch": 2.842576791808874, + "grad_norm": 0.2641412768683482, + "learning_rate": 0.00011852209714932857, + "loss": 0.9396, + "step": 13326 + }, + { + "epoch": 2.8427901023890785, + "grad_norm": 0.33886284004689565, + "learning_rate": 0.00011851463870331537, + "loss": 0.9321, + "step": 13327 + }, + { + "epoch": 2.843003412969283, + "grad_norm": 0.305062498380066, + "learning_rate": 0.00011850717982150839, + "loss": 0.9231, + "step": 13328 + }, + { + "epoch": 2.843216723549488, + "grad_norm": 0.24191081462398517, + "learning_rate": 0.00011849972050399203, + "loss": 0.914, + "step": 13329 + }, + { + "epoch": 2.843430034129693, + "grad_norm": 0.2166897956672446, + "learning_rate": 0.00011849226075085068, + "loss": 0.9535, + "step": 13330 + }, + { + "epoch": 2.8436433447098977, + "grad_norm": 0.20724362315782716, + "learning_rate": 0.00011848480056216876, + "loss": 0.8923, + "step": 13331 + }, + { + "epoch": 2.8438566552901023, + "grad_norm": 0.2520745956182106, + "learning_rate": 0.00011847733993803066, + "loss": 0.9118, + "step": 13332 + }, + { + "epoch": 2.844069965870307, + "grad_norm": 0.22453847750424039, + "learning_rate": 0.00011846987887852082, + "loss": 0.9419, + "step": 13333 + }, + { + "epoch": 2.844283276450512, + "grad_norm": 0.20444418039571946, + "learning_rate": 0.00011846241738372364, + "loss": 0.9435, + "step": 13334 + }, + { + "epoch": 2.844496587030717, + "grad_norm": 0.2953130786816165, + "learning_rate": 0.00011845495545372356, + "loss": 0.925, + "step": 13335 + }, + { + "epoch": 2.8447098976109215, + "grad_norm": 0.3173510119419737, + "learning_rate": 0.00011844749308860502, + "loss": 0.9092, + "step": 13336 + }, + { + "epoch": 2.844923208191126, + "grad_norm": 0.2005991448469497, + "learning_rate": 0.00011844003028845248, + "loss": 0.9227, + "step": 13337 + }, + { + "epoch": 2.845136518771331, + "grad_norm": 0.2754174150148113, + "learning_rate": 0.0001184325670533503, + "loss": 0.9272, + "step": 13338 + }, + { + "epoch": 2.8453498293515356, + "grad_norm": 0.32440894424512373, + "learning_rate": 0.00011842510338338301, + "loss": 0.9295, + "step": 13339 + }, + { + "epoch": 2.8455631399317407, + "grad_norm": 0.23024733643835088, + "learning_rate": 0.00011841763927863501, + "loss": 0.8947, + "step": 13340 + }, + { + "epoch": 2.8457764505119454, + "grad_norm": 0.2314538709998843, + "learning_rate": 0.00011841017473919078, + "loss": 0.9485, + "step": 13341 + }, + { + "epoch": 2.84598976109215, + "grad_norm": 0.28904437033728536, + "learning_rate": 0.00011840270976513479, + "loss": 0.9389, + "step": 13342 + }, + { + "epoch": 2.846203071672355, + "grad_norm": 0.2337667218736669, + "learning_rate": 0.0001183952443565515, + "loss": 0.9167, + "step": 13343 + }, + { + "epoch": 2.84641638225256, + "grad_norm": 0.2335277043349995, + "learning_rate": 0.00011838777851352535, + "loss": 0.9107, + "step": 13344 + }, + { + "epoch": 2.8466296928327646, + "grad_norm": 0.27224336631281926, + "learning_rate": 0.00011838031223614083, + "loss": 0.9269, + "step": 13345 + }, + { + "epoch": 2.8468430034129693, + "grad_norm": 0.26398354087602605, + "learning_rate": 0.00011837284552448247, + "loss": 0.9549, + "step": 13346 + }, + { + "epoch": 2.847056313993174, + "grad_norm": 0.2732062652987478, + "learning_rate": 0.00011836537837863468, + "loss": 0.9145, + "step": 13347 + }, + { + "epoch": 2.8472696245733786, + "grad_norm": 0.23795414289327027, + "learning_rate": 0.00011835791079868202, + "loss": 0.9641, + "step": 13348 + }, + { + "epoch": 2.8474829351535837, + "grad_norm": 0.23516135675956612, + "learning_rate": 0.00011835044278470894, + "loss": 0.9469, + "step": 13349 + }, + { + "epoch": 2.8476962457337884, + "grad_norm": 0.24591971261867798, + "learning_rate": 0.00011834297433679995, + "loss": 0.9264, + "step": 13350 + }, + { + "epoch": 2.847909556313993, + "grad_norm": 0.2336997847593394, + "learning_rate": 0.00011833550545503957, + "loss": 0.9369, + "step": 13351 + }, + { + "epoch": 2.848122866894198, + "grad_norm": 0.2530632308148044, + "learning_rate": 0.00011832803613951229, + "loss": 0.93, + "step": 13352 + }, + { + "epoch": 2.848336177474403, + "grad_norm": 0.3135096405030307, + "learning_rate": 0.00011832056639030264, + "loss": 0.9246, + "step": 13353 + }, + { + "epoch": 2.8485494880546076, + "grad_norm": 0.37276777481971696, + "learning_rate": 0.00011831309620749515, + "loss": 0.9469, + "step": 13354 + }, + { + "epoch": 2.8487627986348123, + "grad_norm": 0.22156788903479194, + "learning_rate": 0.00011830562559117431, + "loss": 0.9819, + "step": 13355 + }, + { + "epoch": 2.848976109215017, + "grad_norm": 0.23547778018541385, + "learning_rate": 0.0001182981545414247, + "loss": 0.9408, + "step": 13356 + }, + { + "epoch": 2.8491894197952217, + "grad_norm": 0.30358003459889976, + "learning_rate": 0.0001182906830583308, + "loss": 0.9571, + "step": 13357 + }, + { + "epoch": 2.8494027303754264, + "grad_norm": 0.28629275870439835, + "learning_rate": 0.00011828321114197719, + "loss": 0.9388, + "step": 13358 + }, + { + "epoch": 2.8496160409556315, + "grad_norm": 0.29051186423495573, + "learning_rate": 0.0001182757387924484, + "loss": 0.9608, + "step": 13359 + }, + { + "epoch": 2.849829351535836, + "grad_norm": 0.3106209199329166, + "learning_rate": 0.00011826826600982899, + "loss": 0.9089, + "step": 13360 + }, + { + "epoch": 2.850042662116041, + "grad_norm": 0.27875262692059977, + "learning_rate": 0.00011826079279420349, + "loss": 0.8998, + "step": 13361 + }, + { + "epoch": 2.850255972696246, + "grad_norm": 0.2466775566533514, + "learning_rate": 0.0001182533191456565, + "loss": 0.9377, + "step": 13362 + }, + { + "epoch": 2.8504692832764507, + "grad_norm": 0.23187855678420868, + "learning_rate": 0.00011824584506427251, + "loss": 0.9603, + "step": 13363 + }, + { + "epoch": 2.8506825938566553, + "grad_norm": 0.2950477549488408, + "learning_rate": 0.00011823837055013619, + "loss": 0.9284, + "step": 13364 + }, + { + "epoch": 2.85089590443686, + "grad_norm": 0.2686979871905753, + "learning_rate": 0.00011823089560333207, + "loss": 0.9461, + "step": 13365 + }, + { + "epoch": 2.8511092150170647, + "grad_norm": 0.24375804450721797, + "learning_rate": 0.00011822342022394468, + "loss": 0.9191, + "step": 13366 + }, + { + "epoch": 2.8513225255972694, + "grad_norm": 0.23506408674842388, + "learning_rate": 0.00011821594441205867, + "loss": 0.9209, + "step": 13367 + }, + { + "epoch": 2.8515358361774745, + "grad_norm": 0.2651135212049767, + "learning_rate": 0.00011820846816775859, + "loss": 0.902, + "step": 13368 + }, + { + "epoch": 2.851749146757679, + "grad_norm": 0.22646329065181067, + "learning_rate": 0.00011820099149112907, + "loss": 0.9665, + "step": 13369 + }, + { + "epoch": 2.851962457337884, + "grad_norm": 0.2745815902655917, + "learning_rate": 0.0001181935143822547, + "loss": 0.9298, + "step": 13370 + }, + { + "epoch": 2.8521757679180886, + "grad_norm": 0.29837560935181057, + "learning_rate": 0.00011818603684122004, + "loss": 0.9552, + "step": 13371 + }, + { + "epoch": 2.8523890784982937, + "grad_norm": 0.3439394345684575, + "learning_rate": 0.00011817855886810976, + "loss": 0.9565, + "step": 13372 + }, + { + "epoch": 2.8526023890784984, + "grad_norm": 0.38257021968835647, + "learning_rate": 0.00011817108046300843, + "loss": 0.8983, + "step": 13373 + }, + { + "epoch": 2.852815699658703, + "grad_norm": 0.32034347454475126, + "learning_rate": 0.00011816360162600067, + "loss": 0.9433, + "step": 13374 + }, + { + "epoch": 2.8530290102389078, + "grad_norm": 0.2927650870185858, + "learning_rate": 0.00011815612235717112, + "loss": 0.8996, + "step": 13375 + }, + { + "epoch": 2.8532423208191124, + "grad_norm": 0.28371982307583593, + "learning_rate": 0.00011814864265660444, + "loss": 0.9387, + "step": 13376 + }, + { + "epoch": 2.8534556313993176, + "grad_norm": 0.25008065576991084, + "learning_rate": 0.00011814116252438519, + "loss": 0.9048, + "step": 13377 + }, + { + "epoch": 2.8536689419795223, + "grad_norm": 0.24883649010028538, + "learning_rate": 0.00011813368196059806, + "loss": 0.9427, + "step": 13378 + }, + { + "epoch": 2.853882252559727, + "grad_norm": 0.2964219497236967, + "learning_rate": 0.00011812620096532766, + "loss": 0.954, + "step": 13379 + }, + { + "epoch": 2.8540955631399316, + "grad_norm": 0.29814324664641095, + "learning_rate": 0.00011811871953865867, + "loss": 0.9123, + "step": 13380 + }, + { + "epoch": 2.8543088737201368, + "grad_norm": 0.2543527429675963, + "learning_rate": 0.00011811123768067571, + "loss": 0.9705, + "step": 13381 + }, + { + "epoch": 2.8545221843003414, + "grad_norm": 0.21546822460823298, + "learning_rate": 0.00011810375539146351, + "loss": 0.9511, + "step": 13382 + }, + { + "epoch": 2.854735494880546, + "grad_norm": 0.19676001494895384, + "learning_rate": 0.0001180962726711066, + "loss": 0.9484, + "step": 13383 + }, + { + "epoch": 2.854948805460751, + "grad_norm": 0.1964071768112569, + "learning_rate": 0.00011808878951968978, + "loss": 0.9068, + "step": 13384 + }, + { + "epoch": 2.8551621160409555, + "grad_norm": 0.19705039178867362, + "learning_rate": 0.00011808130593729761, + "loss": 0.9373, + "step": 13385 + }, + { + "epoch": 2.85537542662116, + "grad_norm": 0.2123454686520581, + "learning_rate": 0.00011807382192401488, + "loss": 0.9127, + "step": 13386 + }, + { + "epoch": 2.8555887372013653, + "grad_norm": 0.2303972067583941, + "learning_rate": 0.0001180663374799262, + "loss": 0.95, + "step": 13387 + }, + { + "epoch": 2.85580204778157, + "grad_norm": 0.2481420154690813, + "learning_rate": 0.00011805885260511626, + "loss": 0.9224, + "step": 13388 + }, + { + "epoch": 2.8560153583617747, + "grad_norm": 0.2595517113065548, + "learning_rate": 0.00011805136729966976, + "loss": 0.9263, + "step": 13389 + }, + { + "epoch": 2.85622866894198, + "grad_norm": 0.24405039307757076, + "learning_rate": 0.0001180438815636714, + "loss": 0.9313, + "step": 13390 + }, + { + "epoch": 2.8564419795221845, + "grad_norm": 0.8954070956821335, + "learning_rate": 0.00011803639539720588, + "loss": 0.9553, + "step": 13391 + }, + { + "epoch": 2.856655290102389, + "grad_norm": 0.22526731763823746, + "learning_rate": 0.0001180289088003579, + "loss": 0.9349, + "step": 13392 + }, + { + "epoch": 2.856868600682594, + "grad_norm": 0.26679606974739606, + "learning_rate": 0.0001180214217732122, + "loss": 0.9353, + "step": 13393 + }, + { + "epoch": 2.8570819112627985, + "grad_norm": 0.2583680810604271, + "learning_rate": 0.00011801393431585347, + "loss": 0.9507, + "step": 13394 + }, + { + "epoch": 2.857295221843003, + "grad_norm": 0.27467332424115687, + "learning_rate": 0.00011800644642836642, + "loss": 0.9052, + "step": 13395 + }, + { + "epoch": 2.8575085324232083, + "grad_norm": 0.2713205905282199, + "learning_rate": 0.0001179989581108358, + "loss": 0.9291, + "step": 13396 + }, + { + "epoch": 2.857721843003413, + "grad_norm": 0.22719037972325723, + "learning_rate": 0.0001179914693633463, + "loss": 0.9288, + "step": 13397 + }, + { + "epoch": 2.8579351535836177, + "grad_norm": 0.24520890967733364, + "learning_rate": 0.00011798398018598275, + "loss": 0.9282, + "step": 13398 + }, + { + "epoch": 2.8581484641638224, + "grad_norm": 0.26780239628072056, + "learning_rate": 0.00011797649057882978, + "loss": 0.9404, + "step": 13399 + }, + { + "epoch": 2.8583617747440275, + "grad_norm": 0.24406364401579944, + "learning_rate": 0.00011796900054197217, + "loss": 0.882, + "step": 13400 + }, + { + "epoch": 2.858575085324232, + "grad_norm": 0.3393127845460157, + "learning_rate": 0.00011796151007549469, + "loss": 0.9651, + "step": 13401 + }, + { + "epoch": 2.858788395904437, + "grad_norm": 0.3707941824246416, + "learning_rate": 0.00011795401917948209, + "loss": 0.9066, + "step": 13402 + }, + { + "epoch": 2.8590017064846416, + "grad_norm": 0.30918805314620434, + "learning_rate": 0.00011794652785401912, + "loss": 0.941, + "step": 13403 + }, + { + "epoch": 2.8592150170648463, + "grad_norm": 0.36621023541091635, + "learning_rate": 0.00011793903609919057, + "loss": 0.9568, + "step": 13404 + }, + { + "epoch": 2.859428327645051, + "grad_norm": 0.34469498746098953, + "learning_rate": 0.00011793154391508117, + "loss": 0.9016, + "step": 13405 + }, + { + "epoch": 2.859641638225256, + "grad_norm": 0.3613061921913035, + "learning_rate": 0.00011792405130177571, + "loss": 0.9232, + "step": 13406 + }, + { + "epoch": 2.8598549488054608, + "grad_norm": 0.3669318236008168, + "learning_rate": 0.00011791655825935897, + "loss": 0.9226, + "step": 13407 + }, + { + "epoch": 2.8600682593856654, + "grad_norm": 0.27892132202941045, + "learning_rate": 0.00011790906478791575, + "loss": 0.8927, + "step": 13408 + }, + { + "epoch": 2.8602815699658706, + "grad_norm": 0.28413129995587477, + "learning_rate": 0.0001179015708875308, + "loss": 0.9675, + "step": 13409 + }, + { + "epoch": 2.8604948805460753, + "grad_norm": 0.28923494973777575, + "learning_rate": 0.00011789407655828898, + "loss": 0.9038, + "step": 13410 + }, + { + "epoch": 2.86070819112628, + "grad_norm": 0.3194784979589676, + "learning_rate": 0.00011788658180027502, + "loss": 0.9374, + "step": 13411 + }, + { + "epoch": 2.8609215017064846, + "grad_norm": 0.4633792072659377, + "learning_rate": 0.00011787908661357375, + "loss": 0.9491, + "step": 13412 + }, + { + "epoch": 2.8611348122866893, + "grad_norm": 0.5037395404457972, + "learning_rate": 0.00011787159099826996, + "loss": 0.8964, + "step": 13413 + }, + { + "epoch": 2.861348122866894, + "grad_norm": 0.46228997124503235, + "learning_rate": 0.00011786409495444851, + "loss": 0.9508, + "step": 13414 + }, + { + "epoch": 2.861561433447099, + "grad_norm": 0.35790195706383376, + "learning_rate": 0.00011785659848219419, + "loss": 0.9146, + "step": 13415 + }, + { + "epoch": 2.861774744027304, + "grad_norm": 0.2380844761991701, + "learning_rate": 0.00011784910158159181, + "loss": 0.9071, + "step": 13416 + }, + { + "epoch": 2.8619880546075085, + "grad_norm": 0.2003685219055282, + "learning_rate": 0.0001178416042527262, + "loss": 0.9473, + "step": 13417 + }, + { + "epoch": 2.862201365187713, + "grad_norm": 0.27077443253729633, + "learning_rate": 0.00011783410649568221, + "loss": 0.9736, + "step": 13418 + }, + { + "epoch": 2.8624146757679183, + "grad_norm": 0.385160329180406, + "learning_rate": 0.00011782660831054466, + "loss": 0.9706, + "step": 13419 + }, + { + "epoch": 2.862627986348123, + "grad_norm": 0.4471339554032797, + "learning_rate": 0.00011781910969739841, + "loss": 0.9388, + "step": 13420 + }, + { + "epoch": 2.8628412969283277, + "grad_norm": 0.4970665755548874, + "learning_rate": 0.0001178116106563283, + "loss": 0.913, + "step": 13421 + }, + { + "epoch": 2.8630546075085324, + "grad_norm": 0.4712949494154962, + "learning_rate": 0.00011780411118741917, + "loss": 0.9327, + "step": 13422 + }, + { + "epoch": 2.863267918088737, + "grad_norm": 0.3896107247215437, + "learning_rate": 0.0001177966112907559, + "loss": 0.8878, + "step": 13423 + }, + { + "epoch": 2.8634812286689417, + "grad_norm": 0.2646994640152393, + "learning_rate": 0.00011778911096642333, + "loss": 0.9129, + "step": 13424 + }, + { + "epoch": 2.863694539249147, + "grad_norm": 0.27063308721904433, + "learning_rate": 0.00011778161021450631, + "loss": 0.9423, + "step": 13425 + }, + { + "epoch": 2.8639078498293515, + "grad_norm": 0.3395992562017098, + "learning_rate": 0.00011777410903508978, + "loss": 0.9249, + "step": 13426 + }, + { + "epoch": 2.864121160409556, + "grad_norm": 0.31591903172145475, + "learning_rate": 0.00011776660742825855, + "loss": 0.9174, + "step": 13427 + }, + { + "epoch": 2.8643344709897613, + "grad_norm": 0.31000385843183, + "learning_rate": 0.00011775910539409751, + "loss": 0.9421, + "step": 13428 + }, + { + "epoch": 2.864547781569966, + "grad_norm": 0.2867756352660216, + "learning_rate": 0.00011775160293269157, + "loss": 0.924, + "step": 13429 + }, + { + "epoch": 2.8647610921501707, + "grad_norm": 0.2640121125402454, + "learning_rate": 0.00011774410004412562, + "loss": 0.9326, + "step": 13430 + }, + { + "epoch": 2.8649744027303754, + "grad_norm": 0.2592085782806892, + "learning_rate": 0.00011773659672848451, + "loss": 0.9393, + "step": 13431 + }, + { + "epoch": 2.86518771331058, + "grad_norm": 0.23415181469597338, + "learning_rate": 0.00011772909298585321, + "loss": 0.9227, + "step": 13432 + }, + { + "epoch": 2.8654010238907848, + "grad_norm": 0.2687585004711943, + "learning_rate": 0.00011772158881631657, + "loss": 0.9329, + "step": 13433 + }, + { + "epoch": 2.86561433447099, + "grad_norm": 0.2375096428274205, + "learning_rate": 0.00011771408421995952, + "loss": 0.9229, + "step": 13434 + }, + { + "epoch": 2.8658276450511946, + "grad_norm": 0.25089292574252314, + "learning_rate": 0.00011770657919686696, + "loss": 0.9427, + "step": 13435 + }, + { + "epoch": 2.8660409556313993, + "grad_norm": 0.2810099205569036, + "learning_rate": 0.00011769907374712383, + "loss": 0.9062, + "step": 13436 + }, + { + "epoch": 2.866254266211604, + "grad_norm": 0.29548146099649597, + "learning_rate": 0.00011769156787081504, + "loss": 0.9046, + "step": 13437 + }, + { + "epoch": 2.866467576791809, + "grad_norm": 0.28578898836729166, + "learning_rate": 0.00011768406156802556, + "loss": 0.9215, + "step": 13438 + }, + { + "epoch": 2.8666808873720138, + "grad_norm": 0.3214965934285964, + "learning_rate": 0.00011767655483884025, + "loss": 0.9563, + "step": 13439 + }, + { + "epoch": 2.8668941979522184, + "grad_norm": 0.21179554256032468, + "learning_rate": 0.0001176690476833441, + "loss": 0.9198, + "step": 13440 + }, + { + "epoch": 2.867107508532423, + "grad_norm": 0.23586507756272387, + "learning_rate": 0.00011766154010162205, + "loss": 0.9241, + "step": 13441 + }, + { + "epoch": 2.867320819112628, + "grad_norm": 0.3133130647565721, + "learning_rate": 0.00011765403209375903, + "loss": 0.9405, + "step": 13442 + }, + { + "epoch": 2.8675341296928325, + "grad_norm": 0.27122998954422334, + "learning_rate": 0.00011764652365984004, + "loss": 0.9152, + "step": 13443 + }, + { + "epoch": 2.8677474402730376, + "grad_norm": 0.26861829960485295, + "learning_rate": 0.00011763901479994996, + "loss": 0.937, + "step": 13444 + }, + { + "epoch": 2.8679607508532423, + "grad_norm": 0.2802139408489461, + "learning_rate": 0.0001176315055141738, + "loss": 0.9357, + "step": 13445 + }, + { + "epoch": 2.868174061433447, + "grad_norm": 0.22234446437917707, + "learning_rate": 0.00011762399580259652, + "loss": 0.9376, + "step": 13446 + }, + { + "epoch": 2.868387372013652, + "grad_norm": 0.28348427689880223, + "learning_rate": 0.00011761648566530309, + "loss": 0.9114, + "step": 13447 + }, + { + "epoch": 2.868600682593857, + "grad_norm": 0.3793482322641889, + "learning_rate": 0.00011760897510237851, + "loss": 0.9383, + "step": 13448 + }, + { + "epoch": 2.8688139931740615, + "grad_norm": 0.45191540588832746, + "learning_rate": 0.00011760146411390776, + "loss": 0.9078, + "step": 13449 + }, + { + "epoch": 2.869027303754266, + "grad_norm": 0.5358586647111442, + "learning_rate": 0.00011759395269997576, + "loss": 0.9525, + "step": 13450 + }, + { + "epoch": 2.869240614334471, + "grad_norm": 0.5571535153081325, + "learning_rate": 0.00011758644086066759, + "loss": 0.9296, + "step": 13451 + }, + { + "epoch": 2.8694539249146755, + "grad_norm": 0.4540609638336841, + "learning_rate": 0.00011757892859606821, + "loss": 0.9361, + "step": 13452 + }, + { + "epoch": 2.8696672354948807, + "grad_norm": 0.299295813204834, + "learning_rate": 0.00011757141590626258, + "loss": 0.9087, + "step": 13453 + }, + { + "epoch": 2.8698805460750854, + "grad_norm": 0.2463902600139876, + "learning_rate": 0.00011756390279133581, + "loss": 0.9389, + "step": 13454 + }, + { + "epoch": 2.87009385665529, + "grad_norm": 0.25655136594250416, + "learning_rate": 0.0001175563892513728, + "loss": 0.915, + "step": 13455 + }, + { + "epoch": 2.8703071672354947, + "grad_norm": 0.30600634563366796, + "learning_rate": 0.00011754887528645864, + "loss": 0.9317, + "step": 13456 + }, + { + "epoch": 2.8705204778157, + "grad_norm": 0.43308032779087974, + "learning_rate": 0.00011754136089667832, + "loss": 0.9557, + "step": 13457 + }, + { + "epoch": 2.8707337883959045, + "grad_norm": 0.44171958245333814, + "learning_rate": 0.00011753384608211687, + "loss": 0.9172, + "step": 13458 + }, + { + "epoch": 2.8709470989761092, + "grad_norm": 0.3722855165442718, + "learning_rate": 0.00011752633084285929, + "loss": 0.8903, + "step": 13459 + }, + { + "epoch": 2.871160409556314, + "grad_norm": 0.2556544065248829, + "learning_rate": 0.0001175188151789907, + "loss": 0.9236, + "step": 13460 + }, + { + "epoch": 2.8713737201365186, + "grad_norm": 0.20896627237442297, + "learning_rate": 0.00011751129909059605, + "loss": 0.9185, + "step": 13461 + }, + { + "epoch": 2.8715870307167233, + "grad_norm": 0.3048807998800038, + "learning_rate": 0.00011750378257776042, + "loss": 0.9385, + "step": 13462 + }, + { + "epoch": 2.8718003412969284, + "grad_norm": 0.41145836482126064, + "learning_rate": 0.00011749626564056887, + "loss": 0.9343, + "step": 13463 + }, + { + "epoch": 2.872013651877133, + "grad_norm": 0.4442094275662675, + "learning_rate": 0.00011748874827910643, + "loss": 0.9466, + "step": 13464 + }, + { + "epoch": 2.8722269624573378, + "grad_norm": 0.3645907504718241, + "learning_rate": 0.00011748123049345819, + "loss": 0.928, + "step": 13465 + }, + { + "epoch": 2.872440273037543, + "grad_norm": 0.2616842480086141, + "learning_rate": 0.0001174737122837092, + "loss": 0.9472, + "step": 13466 + }, + { + "epoch": 2.8726535836177476, + "grad_norm": 0.26390025693375774, + "learning_rate": 0.00011746619364994449, + "loss": 0.9113, + "step": 13467 + }, + { + "epoch": 2.8728668941979523, + "grad_norm": 0.33665740135599614, + "learning_rate": 0.00011745867459224921, + "loss": 0.9385, + "step": 13468 + }, + { + "epoch": 2.873080204778157, + "grad_norm": 0.4529704299148064, + "learning_rate": 0.00011745115511070836, + "loss": 0.9293, + "step": 13469 + }, + { + "epoch": 2.8732935153583616, + "grad_norm": 0.42178124358908115, + "learning_rate": 0.00011744363520540708, + "loss": 0.915, + "step": 13470 + }, + { + "epoch": 2.8735068259385663, + "grad_norm": 0.4045017873259507, + "learning_rate": 0.00011743611487643045, + "loss": 0.9253, + "step": 13471 + }, + { + "epoch": 2.8737201365187715, + "grad_norm": 0.3344392244931169, + "learning_rate": 0.00011742859412386355, + "loss": 0.9334, + "step": 13472 + }, + { + "epoch": 2.873933447098976, + "grad_norm": 0.24354520047580475, + "learning_rate": 0.00011742107294779146, + "loss": 0.9491, + "step": 13473 + }, + { + "epoch": 2.874146757679181, + "grad_norm": 0.29038840206775773, + "learning_rate": 0.0001174135513482993, + "loss": 0.9396, + "step": 13474 + }, + { + "epoch": 2.8743600682593855, + "grad_norm": 0.4060304253854594, + "learning_rate": 0.00011740602932547217, + "loss": 0.9526, + "step": 13475 + }, + { + "epoch": 2.8745733788395906, + "grad_norm": 0.5092999141908604, + "learning_rate": 0.0001173985068793952, + "loss": 0.9489, + "step": 13476 + }, + { + "epoch": 2.8747866894197953, + "grad_norm": 0.46693299193422005, + "learning_rate": 0.0001173909840101535, + "loss": 0.9528, + "step": 13477 + }, + { + "epoch": 2.875, + "grad_norm": 0.33359256967798684, + "learning_rate": 0.00011738346071783221, + "loss": 0.9176, + "step": 13478 + }, + { + "epoch": 2.8752133105802047, + "grad_norm": 0.2734107750213474, + "learning_rate": 0.00011737593700251638, + "loss": 0.9222, + "step": 13479 + }, + { + "epoch": 2.8754266211604094, + "grad_norm": 0.3194841738212767, + "learning_rate": 0.00011736841286429124, + "loss": 0.9286, + "step": 13480 + }, + { + "epoch": 2.8756399317406145, + "grad_norm": 0.4051399625974802, + "learning_rate": 0.00011736088830324184, + "loss": 0.9565, + "step": 13481 + }, + { + "epoch": 2.875853242320819, + "grad_norm": 0.479808502370951, + "learning_rate": 0.00011735336331945338, + "loss": 0.9299, + "step": 13482 + }, + { + "epoch": 2.876066552901024, + "grad_norm": 0.5822138305757513, + "learning_rate": 0.000117345837913011, + "loss": 0.9441, + "step": 13483 + }, + { + "epoch": 2.8762798634812285, + "grad_norm": 0.4947712614552893, + "learning_rate": 0.00011733831208399983, + "loss": 0.9501, + "step": 13484 + }, + { + "epoch": 2.8764931740614337, + "grad_norm": 0.3159019747602263, + "learning_rate": 0.000117330785832505, + "loss": 0.9289, + "step": 13485 + }, + { + "epoch": 2.8767064846416384, + "grad_norm": 0.21069554498561757, + "learning_rate": 0.00011732325915861174, + "loss": 0.9327, + "step": 13486 + }, + { + "epoch": 2.876919795221843, + "grad_norm": 0.33222554087343936, + "learning_rate": 0.00011731573206240515, + "loss": 0.9269, + "step": 13487 + }, + { + "epoch": 2.8771331058020477, + "grad_norm": 0.41809131320670695, + "learning_rate": 0.00011730820454397043, + "loss": 0.9183, + "step": 13488 + }, + { + "epoch": 2.8773464163822524, + "grad_norm": 0.4445705634005292, + "learning_rate": 0.00011730067660339276, + "loss": 0.9219, + "step": 13489 + }, + { + "epoch": 2.877559726962457, + "grad_norm": 0.41404007733000747, + "learning_rate": 0.0001172931482407573, + "loss": 0.9433, + "step": 13490 + }, + { + "epoch": 2.8777730375426622, + "grad_norm": 0.3829071367143789, + "learning_rate": 0.00011728561945614923, + "loss": 0.9345, + "step": 13491 + }, + { + "epoch": 2.877986348122867, + "grad_norm": 0.3702672859519106, + "learning_rate": 0.00011727809024965378, + "loss": 0.9267, + "step": 13492 + }, + { + "epoch": 2.8781996587030716, + "grad_norm": 0.2992096922420552, + "learning_rate": 0.00011727056062135607, + "loss": 0.8928, + "step": 13493 + }, + { + "epoch": 2.8784129692832767, + "grad_norm": 0.2629339351047802, + "learning_rate": 0.00011726303057134136, + "loss": 0.9443, + "step": 13494 + }, + { + "epoch": 2.8786262798634814, + "grad_norm": 0.35510453905986494, + "learning_rate": 0.00011725550009969484, + "loss": 0.95, + "step": 13495 + }, + { + "epoch": 2.878839590443686, + "grad_norm": 0.37343990526723553, + "learning_rate": 0.0001172479692065017, + "loss": 0.916, + "step": 13496 + }, + { + "epoch": 2.8790529010238908, + "grad_norm": 0.3285466053098546, + "learning_rate": 0.00011724043789184715, + "loss": 0.9098, + "step": 13497 + }, + { + "epoch": 2.8792662116040955, + "grad_norm": 0.29790153654502755, + "learning_rate": 0.00011723290615581644, + "loss": 0.9261, + "step": 13498 + }, + { + "epoch": 2.8794795221843, + "grad_norm": 0.27107794158216536, + "learning_rate": 0.00011722537399849476, + "loss": 0.9447, + "step": 13499 + }, + { + "epoch": 2.8796928327645053, + "grad_norm": 0.22782312689322526, + "learning_rate": 0.00011721784141996735, + "loss": 0.9198, + "step": 13500 + }, + { + "epoch": 2.87990614334471, + "grad_norm": 0.6311702455009341, + "learning_rate": 0.00011721030842031945, + "loss": 0.9223, + "step": 13501 + }, + { + "epoch": 2.8801194539249146, + "grad_norm": 0.3016043219217847, + "learning_rate": 0.00011720277499963626, + "loss": 0.9196, + "step": 13502 + }, + { + "epoch": 2.8803327645051193, + "grad_norm": 0.38481971905220563, + "learning_rate": 0.00011719524115800305, + "loss": 0.9261, + "step": 13503 + }, + { + "epoch": 2.8805460750853245, + "grad_norm": 0.3914157697563112, + "learning_rate": 0.00011718770689550508, + "loss": 0.9408, + "step": 13504 + }, + { + "epoch": 2.880759385665529, + "grad_norm": 0.3382354371520661, + "learning_rate": 0.00011718017221222756, + "loss": 0.9304, + "step": 13505 + }, + { + "epoch": 2.880972696245734, + "grad_norm": 0.3247832350686015, + "learning_rate": 0.00011717263710825578, + "loss": 0.9382, + "step": 13506 + }, + { + "epoch": 2.8811860068259385, + "grad_norm": 0.3034618544838752, + "learning_rate": 0.00011716510158367498, + "loss": 0.9501, + "step": 13507 + }, + { + "epoch": 2.881399317406143, + "grad_norm": 0.2678407456131954, + "learning_rate": 0.00011715756563857042, + "loss": 0.951, + "step": 13508 + }, + { + "epoch": 2.881612627986348, + "grad_norm": 0.27985389002630495, + "learning_rate": 0.00011715002927302739, + "loss": 0.9275, + "step": 13509 + }, + { + "epoch": 2.881825938566553, + "grad_norm": 0.34659458481146116, + "learning_rate": 0.00011714249248713115, + "loss": 0.9559, + "step": 13510 + }, + { + "epoch": 2.8820392491467577, + "grad_norm": 0.5117725776022457, + "learning_rate": 0.000117134955280967, + "loss": 0.9576, + "step": 13511 + }, + { + "epoch": 2.8822525597269624, + "grad_norm": 0.5356031888606516, + "learning_rate": 0.00011712741765462018, + "loss": 0.9489, + "step": 13512 + }, + { + "epoch": 2.8824658703071675, + "grad_norm": 0.47155253522087665, + "learning_rate": 0.00011711987960817602, + "loss": 0.9706, + "step": 13513 + }, + { + "epoch": 2.882679180887372, + "grad_norm": 0.3675971367056232, + "learning_rate": 0.0001171123411417198, + "loss": 0.9232, + "step": 13514 + }, + { + "epoch": 2.882892491467577, + "grad_norm": 0.2512668673477446, + "learning_rate": 0.0001171048022553368, + "loss": 0.8979, + "step": 13515 + }, + { + "epoch": 2.8831058020477816, + "grad_norm": 0.3061002195026749, + "learning_rate": 0.00011709726294911236, + "loss": 0.9273, + "step": 13516 + }, + { + "epoch": 2.8833191126279862, + "grad_norm": 0.36847051427863886, + "learning_rate": 0.00011708972322313178, + "loss": 0.9273, + "step": 13517 + }, + { + "epoch": 2.883532423208191, + "grad_norm": 0.3346126265197009, + "learning_rate": 0.00011708218307748031, + "loss": 0.948, + "step": 13518 + }, + { + "epoch": 2.883745733788396, + "grad_norm": 0.34613717483376777, + "learning_rate": 0.00011707464251224334, + "loss": 0.9274, + "step": 13519 + }, + { + "epoch": 2.8839590443686007, + "grad_norm": 0.4401543069627964, + "learning_rate": 0.00011706710152750618, + "loss": 0.9342, + "step": 13520 + }, + { + "epoch": 2.8841723549488054, + "grad_norm": 0.4342213100997724, + "learning_rate": 0.0001170595601233541, + "loss": 0.9074, + "step": 13521 + }, + { + "epoch": 2.88438566552901, + "grad_norm": 0.4056567999957475, + "learning_rate": 0.00011705201829987253, + "loss": 0.9323, + "step": 13522 + }, + { + "epoch": 2.8845989761092152, + "grad_norm": 0.32186056288690174, + "learning_rate": 0.00011704447605714672, + "loss": 0.9716, + "step": 13523 + }, + { + "epoch": 2.88481228668942, + "grad_norm": 0.20162793629113548, + "learning_rate": 0.00011703693339526204, + "loss": 0.9153, + "step": 13524 + }, + { + "epoch": 2.8850255972696246, + "grad_norm": 0.29817737107672043, + "learning_rate": 0.00011702939031430383, + "loss": 0.9225, + "step": 13525 + }, + { + "epoch": 2.8852389078498293, + "grad_norm": 0.3244728990907365, + "learning_rate": 0.00011702184681435746, + "loss": 0.9056, + "step": 13526 + }, + { + "epoch": 2.885452218430034, + "grad_norm": 0.291675729263041, + "learning_rate": 0.00011701430289550828, + "loss": 0.9379, + "step": 13527 + }, + { + "epoch": 2.8856655290102387, + "grad_norm": 0.26596001433921806, + "learning_rate": 0.00011700675855784159, + "loss": 0.9178, + "step": 13528 + }, + { + "epoch": 2.885878839590444, + "grad_norm": 0.888612092945503, + "learning_rate": 0.00011699921380144285, + "loss": 0.9409, + "step": 13529 + }, + { + "epoch": 2.8860921501706485, + "grad_norm": 0.2510804120090297, + "learning_rate": 0.00011699166862639736, + "loss": 0.9121, + "step": 13530 + }, + { + "epoch": 2.886305460750853, + "grad_norm": 0.25528956081925813, + "learning_rate": 0.00011698412303279054, + "loss": 0.9082, + "step": 13531 + }, + { + "epoch": 2.8865187713310583, + "grad_norm": 0.24562669136010046, + "learning_rate": 0.00011697657702070773, + "loss": 0.9283, + "step": 13532 + }, + { + "epoch": 2.886732081911263, + "grad_norm": 0.25566645430124724, + "learning_rate": 0.00011696903059023435, + "loss": 0.9098, + "step": 13533 + }, + { + "epoch": 2.8869453924914676, + "grad_norm": 0.29807692584897993, + "learning_rate": 0.00011696148374145573, + "loss": 0.9452, + "step": 13534 + }, + { + "epoch": 2.8871587030716723, + "grad_norm": 0.30813213268498996, + "learning_rate": 0.00011695393647445735, + "loss": 0.9496, + "step": 13535 + }, + { + "epoch": 2.887372013651877, + "grad_norm": 0.33988724640450657, + "learning_rate": 0.00011694638878932451, + "loss": 0.9356, + "step": 13536 + }, + { + "epoch": 2.8875853242320817, + "grad_norm": 0.3600416703409985, + "learning_rate": 0.00011693884068614267, + "loss": 0.9488, + "step": 13537 + }, + { + "epoch": 2.887798634812287, + "grad_norm": 0.36362892388636325, + "learning_rate": 0.00011693129216499724, + "loss": 0.9617, + "step": 13538 + }, + { + "epoch": 2.8880119453924915, + "grad_norm": 0.3314735230157191, + "learning_rate": 0.00011692374322597362, + "loss": 0.925, + "step": 13539 + }, + { + "epoch": 2.888225255972696, + "grad_norm": 0.3221409666828554, + "learning_rate": 0.00011691619386915721, + "loss": 0.892, + "step": 13540 + }, + { + "epoch": 2.888438566552901, + "grad_norm": 0.27608679288187915, + "learning_rate": 0.00011690864409463345, + "loss": 0.9097, + "step": 13541 + }, + { + "epoch": 2.888651877133106, + "grad_norm": 0.2629354959719027, + "learning_rate": 0.00011690109390248777, + "loss": 0.9419, + "step": 13542 + }, + { + "epoch": 2.8888651877133107, + "grad_norm": 0.38042916943877353, + "learning_rate": 0.00011689354329280557, + "loss": 0.9288, + "step": 13543 + }, + { + "epoch": 2.8890784982935154, + "grad_norm": 0.43547263149691157, + "learning_rate": 0.00011688599226567233, + "loss": 0.9284, + "step": 13544 + }, + { + "epoch": 2.88929180887372, + "grad_norm": 0.5728666788631386, + "learning_rate": 0.00011687844082117347, + "loss": 0.9838, + "step": 13545 + }, + { + "epoch": 2.8895051194539247, + "grad_norm": 0.4008123731095371, + "learning_rate": 0.0001168708889593944, + "loss": 0.929, + "step": 13546 + }, + { + "epoch": 2.8897184300341294, + "grad_norm": 0.33472245842882614, + "learning_rate": 0.00011686333668042063, + "loss": 0.9645, + "step": 13547 + }, + { + "epoch": 2.8899317406143346, + "grad_norm": 0.23743806551120492, + "learning_rate": 0.00011685578398433756, + "loss": 0.9413, + "step": 13548 + }, + { + "epoch": 2.8901450511945392, + "grad_norm": 0.21995088949493158, + "learning_rate": 0.00011684823087123072, + "loss": 0.9365, + "step": 13549 + }, + { + "epoch": 2.890358361774744, + "grad_norm": 0.34109346654770856, + "learning_rate": 0.00011684067734118549, + "loss": 0.9544, + "step": 13550 + }, + { + "epoch": 2.890571672354949, + "grad_norm": 0.4409197606360907, + "learning_rate": 0.0001168331233942874, + "loss": 0.93, + "step": 13551 + }, + { + "epoch": 2.8907849829351537, + "grad_norm": 0.44499308948257077, + "learning_rate": 0.00011682556903062186, + "loss": 0.9436, + "step": 13552 + }, + { + "epoch": 2.8909982935153584, + "grad_norm": 0.44470994458388713, + "learning_rate": 0.00011681801425027441, + "loss": 0.94, + "step": 13553 + }, + { + "epoch": 2.891211604095563, + "grad_norm": 0.48523405240000733, + "learning_rate": 0.00011681045905333052, + "loss": 0.9634, + "step": 13554 + }, + { + "epoch": 2.891424914675768, + "grad_norm": 0.4979756789217077, + "learning_rate": 0.00011680290343987566, + "loss": 0.9181, + "step": 13555 + }, + { + "epoch": 2.8916382252559725, + "grad_norm": 0.35652544388064245, + "learning_rate": 0.00011679534740999533, + "loss": 0.9448, + "step": 13556 + }, + { + "epoch": 2.8918515358361776, + "grad_norm": 0.23549666655331739, + "learning_rate": 0.000116787790963775, + "loss": 0.9213, + "step": 13557 + }, + { + "epoch": 2.8920648464163823, + "grad_norm": 0.3680066137022789, + "learning_rate": 0.00011678023410130022, + "loss": 0.9415, + "step": 13558 + }, + { + "epoch": 2.892278156996587, + "grad_norm": 0.3947950070452182, + "learning_rate": 0.00011677267682265649, + "loss": 0.9277, + "step": 13559 + }, + { + "epoch": 2.8924914675767917, + "grad_norm": 0.42703725269391807, + "learning_rate": 0.00011676511912792927, + "loss": 0.9371, + "step": 13560 + }, + { + "epoch": 2.892704778156997, + "grad_norm": 0.38161647491639955, + "learning_rate": 0.00011675756101720414, + "loss": 0.9601, + "step": 13561 + }, + { + "epoch": 2.8929180887372015, + "grad_norm": 0.31252070652043795, + "learning_rate": 0.00011675000249056656, + "loss": 0.9503, + "step": 13562 + }, + { + "epoch": 2.893131399317406, + "grad_norm": 0.19609453781322345, + "learning_rate": 0.00011674244354810211, + "loss": 0.9475, + "step": 13563 + }, + { + "epoch": 2.893344709897611, + "grad_norm": 0.28344134246837427, + "learning_rate": 0.00011673488418989625, + "loss": 0.9151, + "step": 13564 + }, + { + "epoch": 2.8935580204778155, + "grad_norm": 0.2919008003548263, + "learning_rate": 0.0001167273244160346, + "loss": 0.9375, + "step": 13565 + }, + { + "epoch": 2.89377133105802, + "grad_norm": 0.24324289086096676, + "learning_rate": 0.00011671976422660265, + "loss": 0.9666, + "step": 13566 + }, + { + "epoch": 2.8939846416382253, + "grad_norm": 0.22285956527014147, + "learning_rate": 0.00011671220362168595, + "loss": 0.9402, + "step": 13567 + }, + { + "epoch": 2.89419795221843, + "grad_norm": 0.2381106316308344, + "learning_rate": 0.00011670464260137002, + "loss": 0.9552, + "step": 13568 + }, + { + "epoch": 2.8944112627986347, + "grad_norm": 0.2472079585924234, + "learning_rate": 0.00011669708116574047, + "loss": 0.9566, + "step": 13569 + }, + { + "epoch": 2.89462457337884, + "grad_norm": 0.23479193215355545, + "learning_rate": 0.00011668951931488281, + "loss": 0.9281, + "step": 13570 + }, + { + "epoch": 2.8948378839590445, + "grad_norm": 0.23407860336143435, + "learning_rate": 0.00011668195704888264, + "loss": 0.942, + "step": 13571 + }, + { + "epoch": 2.895051194539249, + "grad_norm": 0.2339346713158335, + "learning_rate": 0.0001166743943678255, + "loss": 0.9395, + "step": 13572 + }, + { + "epoch": 2.895264505119454, + "grad_norm": 0.28157567432740566, + "learning_rate": 0.00011666683127179697, + "loss": 0.97, + "step": 13573 + }, + { + "epoch": 2.8954778156996586, + "grad_norm": 0.3600797428121584, + "learning_rate": 0.00011665926776088263, + "loss": 0.9496, + "step": 13574 + }, + { + "epoch": 2.8956911262798632, + "grad_norm": 0.4623476520893883, + "learning_rate": 0.00011665170383516806, + "loss": 0.913, + "step": 13575 + }, + { + "epoch": 2.8959044368600684, + "grad_norm": 0.25587347223872847, + "learning_rate": 0.00011664413949473883, + "loss": 0.9219, + "step": 13576 + }, + { + "epoch": 2.896117747440273, + "grad_norm": 0.24225200490647525, + "learning_rate": 0.0001166365747396806, + "loss": 0.9152, + "step": 13577 + }, + { + "epoch": 2.8963310580204777, + "grad_norm": 0.3350924451033696, + "learning_rate": 0.00011662900957007886, + "loss": 0.9264, + "step": 13578 + }, + { + "epoch": 2.8965443686006824, + "grad_norm": 0.39817219465698045, + "learning_rate": 0.00011662144398601928, + "loss": 0.9489, + "step": 13579 + }, + { + "epoch": 2.8967576791808876, + "grad_norm": 0.39478029862677627, + "learning_rate": 0.00011661387798758741, + "loss": 0.9138, + "step": 13580 + }, + { + "epoch": 2.8969709897610922, + "grad_norm": 0.3631935424376553, + "learning_rate": 0.00011660631157486892, + "loss": 0.9303, + "step": 13581 + }, + { + "epoch": 2.897184300341297, + "grad_norm": 0.2680737438287313, + "learning_rate": 0.00011659874474794941, + "loss": 0.9446, + "step": 13582 + }, + { + "epoch": 2.8973976109215016, + "grad_norm": 0.2452881665057814, + "learning_rate": 0.00011659117750691448, + "loss": 0.9255, + "step": 13583 + }, + { + "epoch": 2.8976109215017063, + "grad_norm": 0.29959066377076127, + "learning_rate": 0.00011658360985184974, + "loss": 0.9258, + "step": 13584 + }, + { + "epoch": 2.8978242320819114, + "grad_norm": 0.32770515374986314, + "learning_rate": 0.00011657604178284086, + "loss": 0.9408, + "step": 13585 + }, + { + "epoch": 2.898037542662116, + "grad_norm": 0.33893407437187545, + "learning_rate": 0.00011656847329997345, + "loss": 0.9425, + "step": 13586 + }, + { + "epoch": 2.898250853242321, + "grad_norm": 0.2841294033582971, + "learning_rate": 0.00011656090440333316, + "loss": 0.9203, + "step": 13587 + }, + { + "epoch": 2.8984641638225255, + "grad_norm": 0.23986140688219054, + "learning_rate": 0.00011655333509300558, + "loss": 0.922, + "step": 13588 + }, + { + "epoch": 2.8986774744027306, + "grad_norm": 0.25784282713589485, + "learning_rate": 0.00011654576536907647, + "loss": 0.9117, + "step": 13589 + }, + { + "epoch": 2.8988907849829353, + "grad_norm": 0.29347121527707765, + "learning_rate": 0.00011653819523163136, + "loss": 0.943, + "step": 13590 + }, + { + "epoch": 2.89910409556314, + "grad_norm": 0.2350848028524549, + "learning_rate": 0.00011653062468075597, + "loss": 0.9472, + "step": 13591 + }, + { + "epoch": 2.8993174061433447, + "grad_norm": 0.21942645168484926, + "learning_rate": 0.00011652305371653592, + "loss": 0.955, + "step": 13592 + }, + { + "epoch": 2.8995307167235493, + "grad_norm": 0.25154142078232344, + "learning_rate": 0.00011651548233905694, + "loss": 0.9501, + "step": 13593 + }, + { + "epoch": 2.899744027303754, + "grad_norm": 0.25437607887453345, + "learning_rate": 0.00011650791054840465, + "loss": 0.9544, + "step": 13594 + }, + { + "epoch": 2.899957337883959, + "grad_norm": 0.2579050557110865, + "learning_rate": 0.00011650033834466474, + "loss": 0.921, + "step": 13595 + }, + { + "epoch": 2.900170648464164, + "grad_norm": 0.24191239546014404, + "learning_rate": 0.00011649276572792288, + "loss": 0.8895, + "step": 13596 + }, + { + "epoch": 2.9003839590443685, + "grad_norm": 0.17045637113487633, + "learning_rate": 0.00011648519269826477, + "loss": 0.9347, + "step": 13597 + }, + { + "epoch": 2.9005972696245736, + "grad_norm": 0.2648798142445341, + "learning_rate": 0.00011647761925577607, + "loss": 0.9231, + "step": 13598 + }, + { + "epoch": 2.9008105802047783, + "grad_norm": 0.3423266875396581, + "learning_rate": 0.00011647004540054253, + "loss": 0.9234, + "step": 13599 + }, + { + "epoch": 2.901023890784983, + "grad_norm": 0.33638906829125115, + "learning_rate": 0.00011646247113264979, + "loss": 0.9532, + "step": 13600 + }, + { + "epoch": 2.9012372013651877, + "grad_norm": 0.29112465350401207, + "learning_rate": 0.0001164548964521836, + "loss": 0.9479, + "step": 13601 + }, + { + "epoch": 2.9014505119453924, + "grad_norm": 0.22277573632541514, + "learning_rate": 0.00011644732135922961, + "loss": 0.9411, + "step": 13602 + }, + { + "epoch": 2.901663822525597, + "grad_norm": 0.263644313813829, + "learning_rate": 0.00011643974585387358, + "loss": 0.9445, + "step": 13603 + }, + { + "epoch": 2.901877133105802, + "grad_norm": 0.2593015192244883, + "learning_rate": 0.00011643216993620119, + "loss": 0.9506, + "step": 13604 + }, + { + "epoch": 2.902090443686007, + "grad_norm": 0.3395396998810787, + "learning_rate": 0.00011642459360629823, + "loss": 0.9226, + "step": 13605 + }, + { + "epoch": 2.9023037542662116, + "grad_norm": 0.3866809175522395, + "learning_rate": 0.00011641701686425035, + "loss": 0.9846, + "step": 13606 + }, + { + "epoch": 2.9025170648464163, + "grad_norm": 0.35825788457051483, + "learning_rate": 0.00011640943971014333, + "loss": 0.932, + "step": 13607 + }, + { + "epoch": 2.9027303754266214, + "grad_norm": 0.22766656409460395, + "learning_rate": 0.00011640186214406286, + "loss": 0.8979, + "step": 13608 + }, + { + "epoch": 2.902943686006826, + "grad_norm": 0.23529814674329494, + "learning_rate": 0.00011639428416609473, + "loss": 0.934, + "step": 13609 + }, + { + "epoch": 2.9031569965870307, + "grad_norm": 0.2555031786751325, + "learning_rate": 0.00011638670577632465, + "loss": 0.9308, + "step": 13610 + }, + { + "epoch": 2.9033703071672354, + "grad_norm": 0.32225625060701985, + "learning_rate": 0.0001163791269748384, + "loss": 0.9508, + "step": 13611 + }, + { + "epoch": 2.90358361774744, + "grad_norm": 0.3550518558723405, + "learning_rate": 0.0001163715477617217, + "loss": 0.9379, + "step": 13612 + }, + { + "epoch": 2.903796928327645, + "grad_norm": 0.3458685361166704, + "learning_rate": 0.00011636396813706033, + "loss": 0.9308, + "step": 13613 + }, + { + "epoch": 2.90401023890785, + "grad_norm": 0.3566240657326961, + "learning_rate": 0.00011635638810094004, + "loss": 0.9075, + "step": 13614 + }, + { + "epoch": 2.9042235494880546, + "grad_norm": 0.31176092523293614, + "learning_rate": 0.00011634880765344662, + "loss": 0.9514, + "step": 13615 + }, + { + "epoch": 2.9044368600682593, + "grad_norm": 0.21366255505370055, + "learning_rate": 0.00011634122679466581, + "loss": 0.905, + "step": 13616 + }, + { + "epoch": 2.9046501706484644, + "grad_norm": 0.20400693983681564, + "learning_rate": 0.00011633364552468344, + "loss": 0.9335, + "step": 13617 + }, + { + "epoch": 2.904863481228669, + "grad_norm": 0.23465388911481633, + "learning_rate": 0.00011632606384358526, + "loss": 0.9794, + "step": 13618 + }, + { + "epoch": 2.905076791808874, + "grad_norm": 0.32340239324000686, + "learning_rate": 0.00011631848175145702, + "loss": 0.9363, + "step": 13619 + }, + { + "epoch": 2.9052901023890785, + "grad_norm": 0.3896332189186431, + "learning_rate": 0.00011631089924838457, + "loss": 0.9385, + "step": 13620 + }, + { + "epoch": 2.905503412969283, + "grad_norm": 0.32855986710848645, + "learning_rate": 0.00011630331633445368, + "loss": 0.9454, + "step": 13621 + }, + { + "epoch": 2.905716723549488, + "grad_norm": 0.2822510200247139, + "learning_rate": 0.00011629573300975017, + "loss": 0.9394, + "step": 13622 + }, + { + "epoch": 2.905930034129693, + "grad_norm": 0.24218158269510462, + "learning_rate": 0.00011628814927435982, + "loss": 0.9165, + "step": 13623 + }, + { + "epoch": 2.9061433447098977, + "grad_norm": 0.19772051375171085, + "learning_rate": 0.00011628056512836844, + "loss": 0.9157, + "step": 13624 + }, + { + "epoch": 2.9063566552901023, + "grad_norm": 0.28487445772071357, + "learning_rate": 0.00011627298057186187, + "loss": 0.944, + "step": 13625 + }, + { + "epoch": 2.906569965870307, + "grad_norm": 0.33066781486900726, + "learning_rate": 0.00011626539560492589, + "loss": 0.9065, + "step": 13626 + }, + { + "epoch": 2.906783276450512, + "grad_norm": 0.3368018868477701, + "learning_rate": 0.00011625781022764639, + "loss": 0.9444, + "step": 13627 + }, + { + "epoch": 2.906996587030717, + "grad_norm": 0.31285359755915065, + "learning_rate": 0.00011625022444010914, + "loss": 0.8796, + "step": 13628 + }, + { + "epoch": 2.9072098976109215, + "grad_norm": 0.23508677543996298, + "learning_rate": 0.00011624263824239997, + "loss": 0.9022, + "step": 13629 + }, + { + "epoch": 2.907423208191126, + "grad_norm": 0.18103066072430818, + "learning_rate": 0.00011623505163460476, + "loss": 0.918, + "step": 13630 + }, + { + "epoch": 2.907636518771331, + "grad_norm": 0.30647998008667476, + "learning_rate": 0.00011622746461680932, + "loss": 0.9534, + "step": 13631 + }, + { + "epoch": 2.9078498293515356, + "grad_norm": 0.4087421773436042, + "learning_rate": 0.00011621987718909951, + "loss": 0.9483, + "step": 13632 + }, + { + "epoch": 2.9080631399317407, + "grad_norm": 0.4240938370019489, + "learning_rate": 0.00011621228935156119, + "loss": 0.9489, + "step": 13633 + }, + { + "epoch": 2.9082764505119454, + "grad_norm": 0.337877485571267, + "learning_rate": 0.00011620470110428021, + "loss": 0.9402, + "step": 13634 + }, + { + "epoch": 2.90848976109215, + "grad_norm": 0.23581490725954524, + "learning_rate": 0.00011619711244734241, + "loss": 0.9456, + "step": 13635 + }, + { + "epoch": 2.908703071672355, + "grad_norm": 0.22586976107214538, + "learning_rate": 0.00011618952338083369, + "loss": 0.9136, + "step": 13636 + }, + { + "epoch": 2.90891638225256, + "grad_norm": 0.3104769352725105, + "learning_rate": 0.00011618193390483989, + "loss": 0.901, + "step": 13637 + }, + { + "epoch": 2.9091296928327646, + "grad_norm": 0.36476453242557116, + "learning_rate": 0.0001161743440194469, + "loss": 0.9235, + "step": 13638 + }, + { + "epoch": 2.9093430034129693, + "grad_norm": 0.3344673371341442, + "learning_rate": 0.00011616675372474062, + "loss": 0.9166, + "step": 13639 + }, + { + "epoch": 2.909556313993174, + "grad_norm": 0.2524843440223838, + "learning_rate": 0.00011615916302080689, + "loss": 0.9242, + "step": 13640 + }, + { + "epoch": 2.9097696245733786, + "grad_norm": 0.2045055757526484, + "learning_rate": 0.00011615157190773164, + "loss": 0.9764, + "step": 13641 + }, + { + "epoch": 2.9099829351535837, + "grad_norm": 0.3422483945530377, + "learning_rate": 0.00011614398038560073, + "loss": 0.9241, + "step": 13642 + }, + { + "epoch": 2.9101962457337884, + "grad_norm": 0.3456845021887669, + "learning_rate": 0.00011613638845450008, + "loss": 0.9373, + "step": 13643 + }, + { + "epoch": 2.910409556313993, + "grad_norm": 0.4049828509336558, + "learning_rate": 0.00011612879611451559, + "loss": 0.9459, + "step": 13644 + }, + { + "epoch": 2.910622866894198, + "grad_norm": 0.45609270866998286, + "learning_rate": 0.00011612120336573317, + "loss": 0.9307, + "step": 13645 + }, + { + "epoch": 2.910836177474403, + "grad_norm": 0.3201074994127459, + "learning_rate": 0.0001161136102082387, + "loss": 0.9575, + "step": 13646 + }, + { + "epoch": 2.9110494880546076, + "grad_norm": 0.2495447753386086, + "learning_rate": 0.00011610601664211815, + "loss": 0.9389, + "step": 13647 + }, + { + "epoch": 2.9112627986348123, + "grad_norm": 0.37216359688689055, + "learning_rate": 0.0001160984226674574, + "loss": 0.9491, + "step": 13648 + }, + { + "epoch": 2.911476109215017, + "grad_norm": 0.3064194955741817, + "learning_rate": 0.00011609082828434241, + "loss": 0.978, + "step": 13649 + }, + { + "epoch": 2.9116894197952217, + "grad_norm": 0.2960145075424941, + "learning_rate": 0.00011608323349285906, + "loss": 0.939, + "step": 13650 + }, + { + "epoch": 2.9119027303754264, + "grad_norm": 0.3381103985887285, + "learning_rate": 0.00011607563829309336, + "loss": 0.9017, + "step": 13651 + }, + { + "epoch": 2.9121160409556315, + "grad_norm": 0.3272412180334075, + "learning_rate": 0.00011606804268513115, + "loss": 0.9373, + "step": 13652 + }, + { + "epoch": 2.912329351535836, + "grad_norm": 0.2771685576479035, + "learning_rate": 0.00011606044666905845, + "loss": 0.9403, + "step": 13653 + }, + { + "epoch": 2.912542662116041, + "grad_norm": 0.2514782561675693, + "learning_rate": 0.00011605285024496121, + "loss": 0.917, + "step": 13654 + }, + { + "epoch": 2.912755972696246, + "grad_norm": 0.19197324232543805, + "learning_rate": 0.00011604525341292536, + "loss": 0.9576, + "step": 13655 + }, + { + "epoch": 2.9129692832764507, + "grad_norm": 0.2082297772809326, + "learning_rate": 0.00011603765617303686, + "loss": 0.9532, + "step": 13656 + }, + { + "epoch": 2.9131825938566553, + "grad_norm": 0.29397459209269117, + "learning_rate": 0.00011603005852538167, + "loss": 0.9327, + "step": 13657 + }, + { + "epoch": 2.91339590443686, + "grad_norm": 0.27668735967586056, + "learning_rate": 0.00011602246047004574, + "loss": 0.9322, + "step": 13658 + }, + { + "epoch": 2.9136092150170647, + "grad_norm": 0.22413918298763813, + "learning_rate": 0.0001160148620071151, + "loss": 0.9478, + "step": 13659 + }, + { + "epoch": 2.9138225255972694, + "grad_norm": 0.2375467945725204, + "learning_rate": 0.00011600726313667566, + "loss": 0.9237, + "step": 13660 + }, + { + "epoch": 2.9140358361774745, + "grad_norm": 0.22107553964732107, + "learning_rate": 0.00011599966385881346, + "loss": 0.906, + "step": 13661 + }, + { + "epoch": 2.914249146757679, + "grad_norm": 0.19507916243662352, + "learning_rate": 0.00011599206417361443, + "loss": 0.9092, + "step": 13662 + }, + { + "epoch": 2.914462457337884, + "grad_norm": 0.25893963613474635, + "learning_rate": 0.00011598446408116462, + "loss": 0.9281, + "step": 13663 + }, + { + "epoch": 2.9146757679180886, + "grad_norm": 0.23562162495609099, + "learning_rate": 0.00011597686358154995, + "loss": 0.9362, + "step": 13664 + }, + { + "epoch": 2.9148890784982937, + "grad_norm": 0.24267713851575773, + "learning_rate": 0.0001159692626748565, + "loss": 0.9393, + "step": 13665 + }, + { + "epoch": 2.9151023890784984, + "grad_norm": 0.24216641046017953, + "learning_rate": 0.00011596166136117021, + "loss": 0.9193, + "step": 13666 + }, + { + "epoch": 2.915315699658703, + "grad_norm": 0.2421716795721862, + "learning_rate": 0.00011595405964057715, + "loss": 0.9396, + "step": 13667 + }, + { + "epoch": 2.9155290102389078, + "grad_norm": 0.2517218647170413, + "learning_rate": 0.00011594645751316329, + "loss": 0.9419, + "step": 13668 + }, + { + "epoch": 2.9157423208191124, + "grad_norm": 0.2801151712632539, + "learning_rate": 0.00011593885497901466, + "loss": 0.9522, + "step": 13669 + }, + { + "epoch": 2.9159556313993176, + "grad_norm": 0.2618110886852661, + "learning_rate": 0.00011593125203821725, + "loss": 0.9181, + "step": 13670 + }, + { + "epoch": 2.9161689419795223, + "grad_norm": 0.22774706336457662, + "learning_rate": 0.00011592364869085714, + "loss": 0.8994, + "step": 13671 + }, + { + "epoch": 2.916382252559727, + "grad_norm": 0.23612849769557478, + "learning_rate": 0.00011591604493702034, + "loss": 0.9153, + "step": 13672 + }, + { + "epoch": 2.9165955631399316, + "grad_norm": 0.2692249105871852, + "learning_rate": 0.00011590844077679292, + "loss": 0.9476, + "step": 13673 + }, + { + "epoch": 2.9168088737201368, + "grad_norm": 0.29309612620256875, + "learning_rate": 0.00011590083621026085, + "loss": 0.9335, + "step": 13674 + }, + { + "epoch": 2.9170221843003414, + "grad_norm": 0.3174547719956507, + "learning_rate": 0.00011589323123751023, + "loss": 0.942, + "step": 13675 + }, + { + "epoch": 2.917235494880546, + "grad_norm": 0.3365485183046752, + "learning_rate": 0.00011588562585862707, + "loss": 0.9078, + "step": 13676 + }, + { + "epoch": 2.917448805460751, + "grad_norm": 0.35662918877391553, + "learning_rate": 0.00011587802007369747, + "loss": 0.9176, + "step": 13677 + }, + { + "epoch": 2.9176621160409555, + "grad_norm": 0.36852415737253835, + "learning_rate": 0.00011587041388280749, + "loss": 0.9065, + "step": 13678 + }, + { + "epoch": 2.91787542662116, + "grad_norm": 0.32790978246313546, + "learning_rate": 0.00011586280728604315, + "loss": 0.9494, + "step": 13679 + }, + { + "epoch": 2.9180887372013653, + "grad_norm": 0.32777887547907203, + "learning_rate": 0.00011585520028349052, + "loss": 0.9661, + "step": 13680 + }, + { + "epoch": 2.91830204778157, + "grad_norm": 0.23058798970060485, + "learning_rate": 0.00011584759287523573, + "loss": 0.9254, + "step": 13681 + }, + { + "epoch": 2.9185153583617747, + "grad_norm": 0.26525030337330974, + "learning_rate": 0.00011583998506136481, + "loss": 0.9175, + "step": 13682 + }, + { + "epoch": 2.91872866894198, + "grad_norm": 0.399553235598159, + "learning_rate": 0.00011583237684196387, + "loss": 0.9455, + "step": 13683 + }, + { + "epoch": 2.9189419795221845, + "grad_norm": 0.4332890890264414, + "learning_rate": 0.00011582476821711897, + "loss": 0.9229, + "step": 13684 + }, + { + "epoch": 2.919155290102389, + "grad_norm": 0.3364117512660659, + "learning_rate": 0.00011581715918691622, + "loss": 0.9466, + "step": 13685 + }, + { + "epoch": 2.919368600682594, + "grad_norm": 0.352641410429774, + "learning_rate": 0.00011580954975144169, + "loss": 0.9073, + "step": 13686 + }, + { + "epoch": 2.9195819112627985, + "grad_norm": 0.4008480314030729, + "learning_rate": 0.00011580193991078153, + "loss": 0.8937, + "step": 13687 + }, + { + "epoch": 2.919795221843003, + "grad_norm": 0.3479956079861212, + "learning_rate": 0.0001157943296650218, + "loss": 0.9438, + "step": 13688 + }, + { + "epoch": 2.9200085324232083, + "grad_norm": 0.29246348911349224, + "learning_rate": 0.00011578671901424865, + "loss": 0.9659, + "step": 13689 + }, + { + "epoch": 2.920221843003413, + "grad_norm": 0.21377935467443618, + "learning_rate": 0.00011577910795854817, + "loss": 0.9329, + "step": 13690 + }, + { + "epoch": 2.9204351535836177, + "grad_norm": 0.2803165104794183, + "learning_rate": 0.00011577149649800647, + "loss": 0.9405, + "step": 13691 + }, + { + "epoch": 2.9206484641638224, + "grad_norm": 0.27961500126422945, + "learning_rate": 0.00011576388463270967, + "loss": 0.9177, + "step": 13692 + }, + { + "epoch": 2.9208617747440275, + "grad_norm": 0.24314434573681007, + "learning_rate": 0.00011575627236274394, + "loss": 0.9503, + "step": 13693 + }, + { + "epoch": 2.921075085324232, + "grad_norm": 0.209916762334677, + "learning_rate": 0.00011574865968819536, + "loss": 0.949, + "step": 13694 + }, + { + "epoch": 2.921288395904437, + "grad_norm": 0.2432583606254937, + "learning_rate": 0.00011574104660915012, + "loss": 0.8917, + "step": 13695 + }, + { + "epoch": 2.9215017064846416, + "grad_norm": 0.22541099148181715, + "learning_rate": 0.00011573343312569433, + "loss": 0.9252, + "step": 13696 + }, + { + "epoch": 2.9217150170648463, + "grad_norm": 0.19965967995002368, + "learning_rate": 0.00011572581923791415, + "loss": 0.9158, + "step": 13697 + }, + { + "epoch": 2.921928327645051, + "grad_norm": 0.20916825290326044, + "learning_rate": 0.00011571820494589571, + "loss": 0.9626, + "step": 13698 + }, + { + "epoch": 2.922141638225256, + "grad_norm": 0.19941151266414076, + "learning_rate": 0.00011571059024972517, + "loss": 0.904, + "step": 13699 + }, + { + "epoch": 2.9223549488054608, + "grad_norm": 0.2303333157643036, + "learning_rate": 0.00011570297514948872, + "loss": 0.9116, + "step": 13700 + }, + { + "epoch": 2.9225682593856654, + "grad_norm": 0.23712408224199732, + "learning_rate": 0.00011569535964527252, + "loss": 0.9574, + "step": 13701 + }, + { + "epoch": 2.9227815699658706, + "grad_norm": 0.23713090287007157, + "learning_rate": 0.00011568774373716271, + "loss": 0.9349, + "step": 13702 + }, + { + "epoch": 2.9229948805460753, + "grad_norm": 0.2634673893234884, + "learning_rate": 0.00011568012742524548, + "loss": 0.9285, + "step": 13703 + }, + { + "epoch": 2.92320819112628, + "grad_norm": 0.2565186612850016, + "learning_rate": 0.00011567251070960698, + "loss": 0.9402, + "step": 13704 + }, + { + "epoch": 2.9234215017064846, + "grad_norm": 0.23891775354371594, + "learning_rate": 0.00011566489359033345, + "loss": 0.9433, + "step": 13705 + }, + { + "epoch": 2.9236348122866893, + "grad_norm": 0.24331387269229876, + "learning_rate": 0.00011565727606751105, + "loss": 0.9468, + "step": 13706 + }, + { + "epoch": 2.923848122866894, + "grad_norm": 0.22455948104522064, + "learning_rate": 0.00011564965814122597, + "loss": 0.9522, + "step": 13707 + }, + { + "epoch": 2.924061433447099, + "grad_norm": 0.22507545174782287, + "learning_rate": 0.00011564203981156439, + "loss": 0.9217, + "step": 13708 + }, + { + "epoch": 2.924274744027304, + "grad_norm": 0.2663430530051543, + "learning_rate": 0.00011563442107861255, + "loss": 0.9396, + "step": 13709 + }, + { + "epoch": 2.9244880546075085, + "grad_norm": 0.22243110585488096, + "learning_rate": 0.00011562680194245662, + "loss": 0.9317, + "step": 13710 + }, + { + "epoch": 2.924701365187713, + "grad_norm": 0.2746741882099048, + "learning_rate": 0.00011561918240318285, + "loss": 0.9511, + "step": 13711 + }, + { + "epoch": 2.9249146757679183, + "grad_norm": 0.3332445996751909, + "learning_rate": 0.00011561156246087742, + "loss": 0.9376, + "step": 13712 + }, + { + "epoch": 2.925127986348123, + "grad_norm": 0.27522452475562137, + "learning_rate": 0.00011560394211562657, + "loss": 0.934, + "step": 13713 + }, + { + "epoch": 2.9253412969283277, + "grad_norm": 0.23909680364319918, + "learning_rate": 0.0001155963213675165, + "loss": 0.9385, + "step": 13714 + }, + { + "epoch": 2.9255546075085324, + "grad_norm": 0.2410668128579261, + "learning_rate": 0.00011558870021663346, + "loss": 0.928, + "step": 13715 + }, + { + "epoch": 2.925767918088737, + "grad_norm": 0.233510616148324, + "learning_rate": 0.00011558107866306368, + "loss": 0.9198, + "step": 13716 + }, + { + "epoch": 2.9259812286689417, + "grad_norm": 0.23651819789937356, + "learning_rate": 0.0001155734567068934, + "loss": 0.9452, + "step": 13717 + }, + { + "epoch": 2.926194539249147, + "grad_norm": 0.19745896882939207, + "learning_rate": 0.00011556583434820886, + "loss": 0.9428, + "step": 13718 + }, + { + "epoch": 2.9264078498293515, + "grad_norm": 0.23475637891018622, + "learning_rate": 0.00011555821158709631, + "loss": 0.9068, + "step": 13719 + }, + { + "epoch": 2.926621160409556, + "grad_norm": 0.23617797118764483, + "learning_rate": 0.00011555058842364198, + "loss": 0.9212, + "step": 13720 + }, + { + "epoch": 2.9268344709897613, + "grad_norm": 0.23629218558838103, + "learning_rate": 0.00011554296485793216, + "loss": 0.9513, + "step": 13721 + }, + { + "epoch": 2.927047781569966, + "grad_norm": 0.33770080018912996, + "learning_rate": 0.00011553534089005308, + "loss": 0.9572, + "step": 13722 + }, + { + "epoch": 2.9272610921501707, + "grad_norm": 0.3325925410362335, + "learning_rate": 0.00011552771652009103, + "loss": 0.8963, + "step": 13723 + }, + { + "epoch": 2.9274744027303754, + "grad_norm": 0.22896633609362407, + "learning_rate": 0.00011552009174813228, + "loss": 0.906, + "step": 13724 + }, + { + "epoch": 2.92768771331058, + "grad_norm": 0.2499285328670804, + "learning_rate": 0.00011551246657426309, + "loss": 0.913, + "step": 13725 + }, + { + "epoch": 2.9279010238907848, + "grad_norm": 0.2438946539535402, + "learning_rate": 0.00011550484099856972, + "loss": 0.9258, + "step": 13726 + }, + { + "epoch": 2.92811433447099, + "grad_norm": 0.2521141895462455, + "learning_rate": 0.0001154972150211385, + "loss": 0.9214, + "step": 13727 + }, + { + "epoch": 2.9283276450511946, + "grad_norm": 0.2639563763315297, + "learning_rate": 0.00011548958864205569, + "loss": 0.9011, + "step": 13728 + }, + { + "epoch": 2.9285409556313993, + "grad_norm": 0.37260684641452924, + "learning_rate": 0.00011548196186140758, + "loss": 0.9336, + "step": 13729 + }, + { + "epoch": 2.928754266211604, + "grad_norm": 0.4223885690973815, + "learning_rate": 0.00011547433467928049, + "loss": 0.9412, + "step": 13730 + }, + { + "epoch": 2.928967576791809, + "grad_norm": 0.29339217386012995, + "learning_rate": 0.00011546670709576071, + "loss": 0.9504, + "step": 13731 + }, + { + "epoch": 2.9291808873720138, + "grad_norm": 0.21542096018646895, + "learning_rate": 0.0001154590791109345, + "loss": 0.9017, + "step": 13732 + }, + { + "epoch": 2.9293941979522184, + "grad_norm": 0.30578723273393893, + "learning_rate": 0.00011545145072488827, + "loss": 0.9235, + "step": 13733 + }, + { + "epoch": 2.929607508532423, + "grad_norm": 0.2672442531184887, + "learning_rate": 0.00011544382193770823, + "loss": 0.9203, + "step": 13734 + }, + { + "epoch": 2.929820819112628, + "grad_norm": 0.2631918867029324, + "learning_rate": 0.00011543619274948078, + "loss": 0.9159, + "step": 13735 + }, + { + "epoch": 2.9300341296928325, + "grad_norm": 0.34022382010011504, + "learning_rate": 0.0001154285631602922, + "loss": 0.9336, + "step": 13736 + }, + { + "epoch": 2.9302474402730376, + "grad_norm": 0.25915517094020324, + "learning_rate": 0.00011542093317022885, + "loss": 0.9294, + "step": 13737 + }, + { + "epoch": 2.9304607508532423, + "grad_norm": 0.24588614348313206, + "learning_rate": 0.00011541330277937701, + "loss": 0.9307, + "step": 13738 + }, + { + "epoch": 2.930674061433447, + "grad_norm": 0.3552587428364485, + "learning_rate": 0.00011540567198782308, + "loss": 0.9266, + "step": 13739 + }, + { + "epoch": 2.930887372013652, + "grad_norm": 0.2638233967230041, + "learning_rate": 0.00011539804079565336, + "loss": 0.9064, + "step": 13740 + }, + { + "epoch": 2.931100682593857, + "grad_norm": 0.1943679039303691, + "learning_rate": 0.00011539040920295423, + "loss": 0.905, + "step": 13741 + }, + { + "epoch": 2.9313139931740615, + "grad_norm": 0.831522443137884, + "learning_rate": 0.00011538277720981204, + "loss": 0.9775, + "step": 13742 + }, + { + "epoch": 2.931527303754266, + "grad_norm": 0.3029488084374442, + "learning_rate": 0.0001153751448163131, + "loss": 0.9454, + "step": 13743 + }, + { + "epoch": 2.931740614334471, + "grad_norm": 0.38339096222226005, + "learning_rate": 0.0001153675120225438, + "loss": 0.9357, + "step": 13744 + }, + { + "epoch": 2.9319539249146755, + "grad_norm": 0.3195208682483449, + "learning_rate": 0.00011535987882859051, + "loss": 0.9466, + "step": 13745 + }, + { + "epoch": 2.9321672354948807, + "grad_norm": 0.4147686307838474, + "learning_rate": 0.00011535224523453961, + "loss": 0.9216, + "step": 13746 + }, + { + "epoch": 2.9323805460750854, + "grad_norm": 0.48337459676174643, + "learning_rate": 0.00011534461124047746, + "loss": 0.9346, + "step": 13747 + }, + { + "epoch": 2.93259385665529, + "grad_norm": 0.3882187709307774, + "learning_rate": 0.00011533697684649042, + "loss": 0.9363, + "step": 13748 + }, + { + "epoch": 2.9328071672354947, + "grad_norm": 0.3581565624314609, + "learning_rate": 0.00011532934205266491, + "loss": 0.9758, + "step": 13749 + }, + { + "epoch": 2.9330204778157, + "grad_norm": 0.4719941814232676, + "learning_rate": 0.00011532170685908728, + "loss": 0.9509, + "step": 13750 + }, + { + "epoch": 2.9332337883959045, + "grad_norm": 0.4651163965233725, + "learning_rate": 0.00011531407126584399, + "loss": 0.9408, + "step": 13751 + }, + { + "epoch": 2.9334470989761092, + "grad_norm": 0.4925657245477248, + "learning_rate": 0.00011530643527302134, + "loss": 0.9434, + "step": 13752 + }, + { + "epoch": 2.933660409556314, + "grad_norm": 0.47515121301347435, + "learning_rate": 0.00011529879888070579, + "loss": 0.9635, + "step": 13753 + }, + { + "epoch": 2.9338737201365186, + "grad_norm": 0.5065095610211501, + "learning_rate": 0.00011529116208898377, + "loss": 0.9544, + "step": 13754 + }, + { + "epoch": 2.9340870307167233, + "grad_norm": 0.42121993377426, + "learning_rate": 0.00011528352489794164, + "loss": 0.9621, + "step": 13755 + }, + { + "epoch": 2.9343003412969284, + "grad_norm": 0.37799911329041413, + "learning_rate": 0.00011527588730766582, + "loss": 0.9481, + "step": 13756 + }, + { + "epoch": 2.934513651877133, + "grad_norm": 0.41797200496231157, + "learning_rate": 0.00011526824931824274, + "loss": 0.959, + "step": 13757 + }, + { + "epoch": 2.9347269624573378, + "grad_norm": 0.40955646145142777, + "learning_rate": 0.00011526061092975884, + "loss": 0.9482, + "step": 13758 + }, + { + "epoch": 2.934940273037543, + "grad_norm": 0.3855444212180302, + "learning_rate": 0.00011525297214230051, + "loss": 0.9411, + "step": 13759 + }, + { + "epoch": 2.9351535836177476, + "grad_norm": 0.3297031387150543, + "learning_rate": 0.00011524533295595423, + "loss": 0.946, + "step": 13760 + }, + { + "epoch": 2.9353668941979523, + "grad_norm": 0.29953766772865625, + "learning_rate": 0.00011523769337080642, + "loss": 0.9195, + "step": 13761 + }, + { + "epoch": 2.935580204778157, + "grad_norm": 0.3409144797856654, + "learning_rate": 0.00011523005338694351, + "loss": 0.9168, + "step": 13762 + }, + { + "epoch": 2.9357935153583616, + "grad_norm": 0.2959866966565202, + "learning_rate": 0.00011522241300445194, + "loss": 0.9407, + "step": 13763 + }, + { + "epoch": 2.9360068259385663, + "grad_norm": 0.2511843067823058, + "learning_rate": 0.0001152147722234182, + "loss": 0.9331, + "step": 13764 + }, + { + "epoch": 2.9362201365187715, + "grad_norm": 0.246596983080971, + "learning_rate": 0.00011520713104392869, + "loss": 0.9366, + "step": 13765 + }, + { + "epoch": 2.936433447098976, + "grad_norm": 0.20249084290587316, + "learning_rate": 0.00011519948946606992, + "loss": 0.9343, + "step": 13766 + }, + { + "epoch": 2.936646757679181, + "grad_norm": 0.28419642449731636, + "learning_rate": 0.00011519184748992832, + "loss": 0.9445, + "step": 13767 + }, + { + "epoch": 2.9368600682593855, + "grad_norm": 0.31417590489129943, + "learning_rate": 0.0001151842051155904, + "loss": 0.9202, + "step": 13768 + }, + { + "epoch": 2.9370733788395906, + "grad_norm": 0.28538937286779215, + "learning_rate": 0.00011517656234314256, + "loss": 0.9692, + "step": 13769 + }, + { + "epoch": 2.9372866894197953, + "grad_norm": 0.2890692126002642, + "learning_rate": 0.00011516891917267137, + "loss": 0.9547, + "step": 13770 + }, + { + "epoch": 2.9375, + "grad_norm": 0.2607843006725816, + "learning_rate": 0.00011516127560426324, + "loss": 0.9317, + "step": 13771 + }, + { + "epoch": 2.9377133105802047, + "grad_norm": 0.2256316157729018, + "learning_rate": 0.00011515363163800472, + "loss": 0.9165, + "step": 13772 + }, + { + "epoch": 2.9379266211604094, + "grad_norm": 0.2351657767062647, + "learning_rate": 0.00011514598727398224, + "loss": 0.9269, + "step": 13773 + }, + { + "epoch": 2.9381399317406145, + "grad_norm": 0.28051240431754426, + "learning_rate": 0.00011513834251228232, + "loss": 0.9432, + "step": 13774 + }, + { + "epoch": 2.938353242320819, + "grad_norm": 0.25657336932872565, + "learning_rate": 0.00011513069735299149, + "loss": 0.9601, + "step": 13775 + }, + { + "epoch": 2.938566552901024, + "grad_norm": 0.2430393645965164, + "learning_rate": 0.00011512305179619621, + "loss": 0.9191, + "step": 13776 + }, + { + "epoch": 2.9387798634812285, + "grad_norm": 0.2484544281680006, + "learning_rate": 0.00011511540584198298, + "loss": 0.9225, + "step": 13777 + }, + { + "epoch": 2.9389931740614337, + "grad_norm": 0.25409422775301965, + "learning_rate": 0.00011510775949043839, + "loss": 0.9346, + "step": 13778 + }, + { + "epoch": 2.9392064846416384, + "grad_norm": 0.2303935772031452, + "learning_rate": 0.0001151001127416489, + "loss": 0.9316, + "step": 13779 + }, + { + "epoch": 2.939419795221843, + "grad_norm": 0.16507285398102914, + "learning_rate": 0.00011509246559570102, + "loss": 0.8998, + "step": 13780 + }, + { + "epoch": 2.9396331058020477, + "grad_norm": 0.22958227036378984, + "learning_rate": 0.00011508481805268132, + "loss": 0.9356, + "step": 13781 + }, + { + "epoch": 2.9398464163822524, + "grad_norm": 0.23662148664967472, + "learning_rate": 0.00011507717011267632, + "loss": 0.962, + "step": 13782 + }, + { + "epoch": 2.940059726962457, + "grad_norm": 0.22781744887643562, + "learning_rate": 0.00011506952177577252, + "loss": 0.9437, + "step": 13783 + }, + { + "epoch": 2.9402730375426622, + "grad_norm": 0.21726615468854044, + "learning_rate": 0.00011506187304205654, + "loss": 0.9187, + "step": 13784 + }, + { + "epoch": 2.940486348122867, + "grad_norm": 0.29252669174810964, + "learning_rate": 0.00011505422391161482, + "loss": 0.9294, + "step": 13785 + }, + { + "epoch": 2.9406996587030716, + "grad_norm": 0.2753995655396277, + "learning_rate": 0.000115046574384534, + "loss": 0.9139, + "step": 13786 + }, + { + "epoch": 2.9409129692832767, + "grad_norm": 0.31792667022871846, + "learning_rate": 0.0001150389244609006, + "loss": 0.925, + "step": 13787 + }, + { + "epoch": 2.9411262798634814, + "grad_norm": 0.3699911743556066, + "learning_rate": 0.00011503127414080118, + "loss": 0.929, + "step": 13788 + }, + { + "epoch": 2.941339590443686, + "grad_norm": 0.4428140686584852, + "learning_rate": 0.00011502362342432228, + "loss": 0.9845, + "step": 13789 + }, + { + "epoch": 2.9415529010238908, + "grad_norm": 0.5270013999692277, + "learning_rate": 0.00011501597231155054, + "loss": 0.9096, + "step": 13790 + }, + { + "epoch": 2.9417662116040955, + "grad_norm": 0.4511581327658084, + "learning_rate": 0.00011500832080257245, + "loss": 0.9219, + "step": 13791 + }, + { + "epoch": 2.9419795221843, + "grad_norm": 0.34916377404203347, + "learning_rate": 0.00011500066889747463, + "loss": 0.9164, + "step": 13792 + }, + { + "epoch": 2.9421928327645053, + "grad_norm": 0.31000518926758985, + "learning_rate": 0.00011499301659634365, + "loss": 0.9361, + "step": 13793 + }, + { + "epoch": 2.94240614334471, + "grad_norm": 0.2770801459510103, + "learning_rate": 0.0001149853638992661, + "loss": 0.9302, + "step": 13794 + }, + { + "epoch": 2.9426194539249146, + "grad_norm": 0.34813159138160826, + "learning_rate": 0.00011497771080632856, + "loss": 0.957, + "step": 13795 + }, + { + "epoch": 2.9428327645051193, + "grad_norm": 0.4363575589201204, + "learning_rate": 0.00011497005731761767, + "loss": 0.9054, + "step": 13796 + }, + { + "epoch": 2.9430460750853245, + "grad_norm": 0.43901298822162876, + "learning_rate": 0.00011496240343321994, + "loss": 0.9719, + "step": 13797 + }, + { + "epoch": 2.943259385665529, + "grad_norm": 0.3262078229462909, + "learning_rate": 0.00011495474915322207, + "loss": 0.9557, + "step": 13798 + }, + { + "epoch": 2.943472696245734, + "grad_norm": 0.21746472226935767, + "learning_rate": 0.00011494709447771059, + "loss": 0.9199, + "step": 13799 + }, + { + "epoch": 2.9436860068259385, + "grad_norm": 0.25353008562753054, + "learning_rate": 0.00011493943940677216, + "loss": 0.8958, + "step": 13800 + }, + { + "epoch": 2.943899317406143, + "grad_norm": 0.3491462645032764, + "learning_rate": 0.0001149317839404934, + "loss": 0.9094, + "step": 13801 + }, + { + "epoch": 2.944112627986348, + "grad_norm": 0.4094922485289287, + "learning_rate": 0.00011492412807896092, + "loss": 0.9348, + "step": 13802 + }, + { + "epoch": 2.944325938566553, + "grad_norm": 0.39947135037625553, + "learning_rate": 0.00011491647182226132, + "loss": 0.9011, + "step": 13803 + }, + { + "epoch": 2.9445392491467577, + "grad_norm": 0.4343550374204956, + "learning_rate": 0.00011490881517048128, + "loss": 0.9314, + "step": 13804 + }, + { + "epoch": 2.9447525597269624, + "grad_norm": 0.325044717008982, + "learning_rate": 0.00011490115812370738, + "loss": 0.9232, + "step": 13805 + }, + { + "epoch": 2.9449658703071675, + "grad_norm": 0.2396214247050316, + "learning_rate": 0.0001148935006820263, + "loss": 0.9551, + "step": 13806 + }, + { + "epoch": 2.945179180887372, + "grad_norm": 0.3014122492249548, + "learning_rate": 0.00011488584284552469, + "loss": 0.9496, + "step": 13807 + }, + { + "epoch": 2.945392491467577, + "grad_norm": 0.27852366337113743, + "learning_rate": 0.00011487818461428918, + "loss": 0.9443, + "step": 13808 + }, + { + "epoch": 2.9456058020477816, + "grad_norm": 0.315349626037173, + "learning_rate": 0.00011487052598840638, + "loss": 0.9308, + "step": 13809 + }, + { + "epoch": 2.9458191126279862, + "grad_norm": 0.3702764626454072, + "learning_rate": 0.00011486286696796305, + "loss": 0.9501, + "step": 13810 + }, + { + "epoch": 2.946032423208191, + "grad_norm": 0.39312283646139556, + "learning_rate": 0.00011485520755304575, + "loss": 0.9601, + "step": 13811 + }, + { + "epoch": 2.946245733788396, + "grad_norm": 0.29697567561814897, + "learning_rate": 0.00011484754774374124, + "loss": 0.9187, + "step": 13812 + }, + { + "epoch": 2.9464590443686007, + "grad_norm": 0.24334042100589956, + "learning_rate": 0.0001148398875401361, + "loss": 0.9212, + "step": 13813 + }, + { + "epoch": 2.9466723549488054, + "grad_norm": 0.3844722536210872, + "learning_rate": 0.00011483222694231706, + "loss": 0.9295, + "step": 13814 + }, + { + "epoch": 2.94688566552901, + "grad_norm": 0.3663640461121502, + "learning_rate": 0.00011482456595037078, + "loss": 0.9363, + "step": 13815 + }, + { + "epoch": 2.9470989761092152, + "grad_norm": 0.2406507930758667, + "learning_rate": 0.00011481690456438395, + "loss": 0.9672, + "step": 13816 + }, + { + "epoch": 2.94731228668942, + "grad_norm": 0.29010105292408545, + "learning_rate": 0.00011480924278444324, + "loss": 0.9013, + "step": 13817 + }, + { + "epoch": 2.9475255972696246, + "grad_norm": 0.296295923297253, + "learning_rate": 0.00011480158061063542, + "loss": 0.9274, + "step": 13818 + }, + { + "epoch": 2.9477389078498293, + "grad_norm": 0.23404005503317535, + "learning_rate": 0.00011479391804304708, + "loss": 0.9352, + "step": 13819 + }, + { + "epoch": 2.947952218430034, + "grad_norm": 0.27210350717168974, + "learning_rate": 0.000114786255081765, + "loss": 0.9079, + "step": 13820 + }, + { + "epoch": 2.9481655290102387, + "grad_norm": 0.30889176972550536, + "learning_rate": 0.00011477859172687584, + "loss": 0.92, + "step": 13821 + }, + { + "epoch": 2.948378839590444, + "grad_norm": 0.3144153867382593, + "learning_rate": 0.00011477092797846633, + "loss": 0.945, + "step": 13822 + }, + { + "epoch": 2.9485921501706485, + "grad_norm": 0.2383994907146895, + "learning_rate": 0.00011476326383662317, + "loss": 0.9311, + "step": 13823 + }, + { + "epoch": 2.948805460750853, + "grad_norm": 0.209368081619534, + "learning_rate": 0.00011475559930143315, + "loss": 0.9303, + "step": 13824 + }, + { + "epoch": 2.9490187713310583, + "grad_norm": 0.26070130440217987, + "learning_rate": 0.0001147479343729829, + "loss": 0.9571, + "step": 13825 + }, + { + "epoch": 2.949232081911263, + "grad_norm": 0.22551069740492302, + "learning_rate": 0.0001147402690513592, + "loss": 0.8966, + "step": 13826 + }, + { + "epoch": 2.9494453924914676, + "grad_norm": 0.28257774915866646, + "learning_rate": 0.00011473260333664875, + "loss": 0.9262, + "step": 13827 + }, + { + "epoch": 2.9496587030716723, + "grad_norm": 0.3825139636217905, + "learning_rate": 0.0001147249372289383, + "loss": 0.9348, + "step": 13828 + }, + { + "epoch": 2.949872013651877, + "grad_norm": 0.3303465888303536, + "learning_rate": 0.00011471727072831464, + "loss": 0.9359, + "step": 13829 + }, + { + "epoch": 2.9500853242320817, + "grad_norm": 0.275031980090762, + "learning_rate": 0.00011470960383486446, + "loss": 0.9373, + "step": 13830 + }, + { + "epoch": 2.950298634812287, + "grad_norm": 0.1888729222128696, + "learning_rate": 0.00011470193654867454, + "loss": 0.9178, + "step": 13831 + }, + { + "epoch": 2.9505119453924915, + "grad_norm": 0.3832942724293713, + "learning_rate": 0.00011469426886983161, + "loss": 0.9738, + "step": 13832 + }, + { + "epoch": 2.950725255972696, + "grad_norm": 0.519020261010683, + "learning_rate": 0.00011468660079842244, + "loss": 0.9698, + "step": 13833 + }, + { + "epoch": 2.950938566552901, + "grad_norm": 0.4685167898879297, + "learning_rate": 0.00011467893233453382, + "loss": 0.9301, + "step": 13834 + }, + { + "epoch": 2.951151877133106, + "grad_norm": 0.366878876812467, + "learning_rate": 0.0001146712634782525, + "loss": 0.9158, + "step": 13835 + }, + { + "epoch": 2.9513651877133107, + "grad_norm": 0.3200539586681658, + "learning_rate": 0.00011466359422966523, + "loss": 0.9157, + "step": 13836 + }, + { + "epoch": 2.9515784982935154, + "grad_norm": 0.2564246836829976, + "learning_rate": 0.00011465592458885881, + "loss": 0.9142, + "step": 13837 + }, + { + "epoch": 2.95179180887372, + "grad_norm": 0.24541611435626548, + "learning_rate": 0.00011464825455592002, + "loss": 0.9782, + "step": 13838 + }, + { + "epoch": 2.9520051194539247, + "grad_norm": 0.43571145565318015, + "learning_rate": 0.00011464058413093564, + "loss": 0.9548, + "step": 13839 + }, + { + "epoch": 2.9522184300341294, + "grad_norm": 0.5537238405033001, + "learning_rate": 0.0001146329133139925, + "loss": 0.9444, + "step": 13840 + }, + { + "epoch": 2.9524317406143346, + "grad_norm": 0.5462149324709674, + "learning_rate": 0.00011462524210517732, + "loss": 0.9287, + "step": 13841 + }, + { + "epoch": 2.9526450511945392, + "grad_norm": 0.5278034277560618, + "learning_rate": 0.00011461757050457697, + "loss": 0.9151, + "step": 13842 + }, + { + "epoch": 2.952858361774744, + "grad_norm": 0.40965343036518864, + "learning_rate": 0.00011460989851227822, + "loss": 0.937, + "step": 13843 + }, + { + "epoch": 2.953071672354949, + "grad_norm": 0.27770343078483667, + "learning_rate": 0.0001146022261283679, + "loss": 0.923, + "step": 13844 + }, + { + "epoch": 2.9532849829351537, + "grad_norm": 0.35709122461892945, + "learning_rate": 0.00011459455335293279, + "loss": 0.9304, + "step": 13845 + }, + { + "epoch": 2.9534982935153584, + "grad_norm": 0.5284039600808472, + "learning_rate": 0.00011458688018605976, + "loss": 0.9108, + "step": 13846 + }, + { + "epoch": 2.953711604095563, + "grad_norm": 0.6120753853214596, + "learning_rate": 0.00011457920662783554, + "loss": 0.9337, + "step": 13847 + }, + { + "epoch": 2.953924914675768, + "grad_norm": 0.5866542103154226, + "learning_rate": 0.00011457153267834707, + "loss": 0.9656, + "step": 13848 + }, + { + "epoch": 2.9541382252559725, + "grad_norm": 0.4543535582740719, + "learning_rate": 0.0001145638583376811, + "loss": 0.9621, + "step": 13849 + }, + { + "epoch": 2.9543515358361776, + "grad_norm": 0.20567618740453497, + "learning_rate": 0.00011455618360592451, + "loss": 0.9526, + "step": 13850 + }, + { + "epoch": 2.9545648464163823, + "grad_norm": 0.4050570819579969, + "learning_rate": 0.00011454850848316409, + "loss": 0.8993, + "step": 13851 + }, + { + "epoch": 2.954778156996587, + "grad_norm": 0.5551614568689841, + "learning_rate": 0.00011454083296948676, + "loss": 0.913, + "step": 13852 + }, + { + "epoch": 2.9549914675767917, + "grad_norm": 0.367368397027651, + "learning_rate": 0.00011453315706497928, + "loss": 0.9307, + "step": 13853 + }, + { + "epoch": 2.955204778156997, + "grad_norm": 0.3555385928112746, + "learning_rate": 0.00011452548076972857, + "loss": 0.924, + "step": 13854 + }, + { + "epoch": 2.9554180887372015, + "grad_norm": 0.2882340549323102, + "learning_rate": 0.00011451780408382145, + "loss": 0.9078, + "step": 13855 + }, + { + "epoch": 2.955631399317406, + "grad_norm": 0.29992926393090835, + "learning_rate": 0.00011451012700734481, + "loss": 0.9261, + "step": 13856 + }, + { + "epoch": 2.955844709897611, + "grad_norm": 0.33923085943204306, + "learning_rate": 0.0001145024495403855, + "loss": 0.915, + "step": 13857 + }, + { + "epoch": 2.9560580204778155, + "grad_norm": 0.3146803048469418, + "learning_rate": 0.0001144947716830304, + "loss": 0.9254, + "step": 13858 + }, + { + "epoch": 2.95627133105802, + "grad_norm": 0.2663337062405577, + "learning_rate": 0.00011448709343536635, + "loss": 0.9322, + "step": 13859 + }, + { + "epoch": 2.9564846416382253, + "grad_norm": 0.23274430440260727, + "learning_rate": 0.00011447941479748029, + "loss": 0.8723, + "step": 13860 + }, + { + "epoch": 2.95669795221843, + "grad_norm": 0.2576913687556665, + "learning_rate": 0.00011447173576945904, + "loss": 0.9125, + "step": 13861 + }, + { + "epoch": 2.9569112627986347, + "grad_norm": 0.3259448462306418, + "learning_rate": 0.00011446405635138956, + "loss": 0.961, + "step": 13862 + }, + { + "epoch": 2.95712457337884, + "grad_norm": 0.2522621193684587, + "learning_rate": 0.00011445637654335868, + "loss": 0.9315, + "step": 13863 + }, + { + "epoch": 2.9573378839590445, + "grad_norm": 0.19517825684316137, + "learning_rate": 0.00011444869634545332, + "loss": 0.9304, + "step": 13864 + }, + { + "epoch": 2.957551194539249, + "grad_norm": 0.23928117228716983, + "learning_rate": 0.00011444101575776037, + "loss": 0.9231, + "step": 13865 + }, + { + "epoch": 2.957764505119454, + "grad_norm": 0.2578643931911805, + "learning_rate": 0.00011443333478036677, + "loss": 0.9266, + "step": 13866 + }, + { + "epoch": 2.9579778156996586, + "grad_norm": 0.22838984478485239, + "learning_rate": 0.00011442565341335938, + "loss": 0.9545, + "step": 13867 + }, + { + "epoch": 2.9581911262798632, + "grad_norm": 0.23201814146409705, + "learning_rate": 0.00011441797165682519, + "loss": 0.9173, + "step": 13868 + }, + { + "epoch": 2.9584044368600684, + "grad_norm": 0.19358660519865156, + "learning_rate": 0.00011441028951085103, + "loss": 0.8828, + "step": 13869 + }, + { + "epoch": 2.958617747440273, + "grad_norm": 0.20511043393236386, + "learning_rate": 0.00011440260697552389, + "loss": 0.9333, + "step": 13870 + }, + { + "epoch": 2.9588310580204777, + "grad_norm": 0.26419925735107747, + "learning_rate": 0.00011439492405093064, + "loss": 0.9402, + "step": 13871 + }, + { + "epoch": 2.9590443686006824, + "grad_norm": 0.33823375927046456, + "learning_rate": 0.00011438724073715828, + "loss": 0.9477, + "step": 13872 + }, + { + "epoch": 2.9592576791808876, + "grad_norm": 0.35965891343340983, + "learning_rate": 0.00011437955703429367, + "loss": 0.9464, + "step": 13873 + }, + { + "epoch": 2.9594709897610922, + "grad_norm": 0.32325113672362205, + "learning_rate": 0.00011437187294242387, + "loss": 0.9301, + "step": 13874 + }, + { + "epoch": 2.959684300341297, + "grad_norm": 0.3085960544777941, + "learning_rate": 0.00011436418846163568, + "loss": 0.9234, + "step": 13875 + }, + { + "epoch": 2.9598976109215016, + "grad_norm": 0.24358464873855712, + "learning_rate": 0.00011435650359201614, + "loss": 0.9251, + "step": 13876 + }, + { + "epoch": 2.9601109215017063, + "grad_norm": 0.2429040989760595, + "learning_rate": 0.00011434881833365218, + "loss": 0.9177, + "step": 13877 + }, + { + "epoch": 2.9603242320819114, + "grad_norm": 0.2523488288225437, + "learning_rate": 0.00011434113268663076, + "loss": 0.9225, + "step": 13878 + }, + { + "epoch": 2.960537542662116, + "grad_norm": 0.26897619406251033, + "learning_rate": 0.00011433344665103886, + "loss": 0.9192, + "step": 13879 + }, + { + "epoch": 2.960750853242321, + "grad_norm": 0.2886210371414747, + "learning_rate": 0.00011432576022696344, + "loss": 0.9264, + "step": 13880 + }, + { + "epoch": 2.9609641638225255, + "grad_norm": 0.20224449036082556, + "learning_rate": 0.00011431807341449144, + "loss": 0.9416, + "step": 13881 + }, + { + "epoch": 2.9611774744027306, + "grad_norm": 0.27891507254215053, + "learning_rate": 0.0001143103862137099, + "loss": 0.9209, + "step": 13882 + }, + { + "epoch": 2.9613907849829353, + "grad_norm": 0.32986340373692663, + "learning_rate": 0.00011430269862470571, + "loss": 0.9082, + "step": 13883 + }, + { + "epoch": 2.96160409556314, + "grad_norm": 0.30378298054141345, + "learning_rate": 0.00011429501064756596, + "loss": 0.9311, + "step": 13884 + }, + { + "epoch": 2.9618174061433447, + "grad_norm": 0.30248810825110617, + "learning_rate": 0.00011428732228237757, + "loss": 0.9261, + "step": 13885 + }, + { + "epoch": 2.9620307167235493, + "grad_norm": 0.27265048520066576, + "learning_rate": 0.00011427963352922755, + "loss": 0.9497, + "step": 13886 + }, + { + "epoch": 2.962244027303754, + "grad_norm": 0.26908860864956824, + "learning_rate": 0.00011427194438820288, + "loss": 0.9075, + "step": 13887 + }, + { + "epoch": 2.962457337883959, + "grad_norm": 0.2662896267694494, + "learning_rate": 0.0001142642548593906, + "loss": 0.984, + "step": 13888 + }, + { + "epoch": 2.962670648464164, + "grad_norm": 0.274592842342906, + "learning_rate": 0.00011425656494287769, + "loss": 0.9479, + "step": 13889 + }, + { + "epoch": 2.9628839590443685, + "grad_norm": 0.26613056455212214, + "learning_rate": 0.0001142488746387512, + "loss": 0.9414, + "step": 13890 + }, + { + "epoch": 2.9630972696245736, + "grad_norm": 0.2785577744165971, + "learning_rate": 0.00011424118394709809, + "loss": 0.9196, + "step": 13891 + }, + { + "epoch": 2.9633105802047783, + "grad_norm": 0.2755246721155929, + "learning_rate": 0.00011423349286800543, + "loss": 0.9273, + "step": 13892 + }, + { + "epoch": 2.963523890784983, + "grad_norm": 0.20480375425044295, + "learning_rate": 0.0001142258014015602, + "loss": 0.9452, + "step": 13893 + }, + { + "epoch": 2.9637372013651877, + "grad_norm": 0.2710149910959034, + "learning_rate": 0.00011421810954784947, + "loss": 0.9336, + "step": 13894 + }, + { + "epoch": 2.9639505119453924, + "grad_norm": 0.3562231511902574, + "learning_rate": 0.00011421041730696022, + "loss": 0.9306, + "step": 13895 + }, + { + "epoch": 2.964163822525597, + "grad_norm": 0.31442207384496396, + "learning_rate": 0.00011420272467897956, + "loss": 0.9353, + "step": 13896 + }, + { + "epoch": 2.964377133105802, + "grad_norm": 0.2204720524175939, + "learning_rate": 0.00011419503166399449, + "loss": 0.9023, + "step": 13897 + }, + { + "epoch": 2.964590443686007, + "grad_norm": 0.2998103603851017, + "learning_rate": 0.00011418733826209207, + "loss": 0.9403, + "step": 13898 + }, + { + "epoch": 2.9648037542662116, + "grad_norm": 0.27059576792139717, + "learning_rate": 0.0001141796444733593, + "loss": 0.9491, + "step": 13899 + }, + { + "epoch": 2.9650170648464163, + "grad_norm": 0.18969817566952843, + "learning_rate": 0.00011417195029788333, + "loss": 0.9398, + "step": 13900 + }, + { + "epoch": 2.9652303754266214, + "grad_norm": 0.3109367852800344, + "learning_rate": 0.00011416425573575112, + "loss": 0.9456, + "step": 13901 + }, + { + "epoch": 2.965443686006826, + "grad_norm": 0.38068142817559547, + "learning_rate": 0.00011415656078704981, + "loss": 0.9223, + "step": 13902 + }, + { + "epoch": 2.9656569965870307, + "grad_norm": 0.34843299926817944, + "learning_rate": 0.00011414886545186643, + "loss": 0.9109, + "step": 13903 + }, + { + "epoch": 2.9658703071672354, + "grad_norm": 0.2940638275985456, + "learning_rate": 0.00011414116973028807, + "loss": 0.9301, + "step": 13904 + }, + { + "epoch": 2.96608361774744, + "grad_norm": 0.31658509726715073, + "learning_rate": 0.00011413347362240178, + "loss": 0.9337, + "step": 13905 + }, + { + "epoch": 2.966296928327645, + "grad_norm": 0.4266035526126678, + "learning_rate": 0.0001141257771282947, + "loss": 0.9357, + "step": 13906 + }, + { + "epoch": 2.96651023890785, + "grad_norm": 0.35811120086425563, + "learning_rate": 0.00011411808024805386, + "loss": 0.9318, + "step": 13907 + }, + { + "epoch": 2.9667235494880546, + "grad_norm": 0.3426769454781474, + "learning_rate": 0.00011411038298176635, + "loss": 0.9516, + "step": 13908 + }, + { + "epoch": 2.9669368600682593, + "grad_norm": 0.38225618916808624, + "learning_rate": 0.00011410268532951929, + "loss": 0.9497, + "step": 13909 + }, + { + "epoch": 2.9671501706484644, + "grad_norm": 0.34233820564239226, + "learning_rate": 0.00011409498729139981, + "loss": 0.9423, + "step": 13910 + }, + { + "epoch": 2.967363481228669, + "grad_norm": 0.29616411170140444, + "learning_rate": 0.00011408728886749493, + "loss": 0.9279, + "step": 13911 + }, + { + "epoch": 2.967576791808874, + "grad_norm": 0.317172924831545, + "learning_rate": 0.00011407959005789182, + "loss": 0.9085, + "step": 13912 + }, + { + "epoch": 2.9677901023890785, + "grad_norm": 0.2371544073757972, + "learning_rate": 0.00011407189086267757, + "loss": 0.9311, + "step": 13913 + }, + { + "epoch": 2.968003412969283, + "grad_norm": 0.24405836878516735, + "learning_rate": 0.0001140641912819393, + "loss": 0.9142, + "step": 13914 + }, + { + "epoch": 2.968216723549488, + "grad_norm": 0.38790147798320207, + "learning_rate": 0.00011405649131576415, + "loss": 0.9662, + "step": 13915 + }, + { + "epoch": 2.968430034129693, + "grad_norm": 0.4192818437999711, + "learning_rate": 0.00011404879096423922, + "loss": 0.9425, + "step": 13916 + }, + { + "epoch": 2.9686433447098977, + "grad_norm": 0.26943910552577266, + "learning_rate": 0.00011404109022745161, + "loss": 0.9207, + "step": 13917 + }, + { + "epoch": 2.9688566552901023, + "grad_norm": 0.20605869727064513, + "learning_rate": 0.00011403338910548855, + "loss": 0.9335, + "step": 13918 + }, + { + "epoch": 2.969069965870307, + "grad_norm": 0.3286621621875262, + "learning_rate": 0.0001140256875984371, + "loss": 0.9611, + "step": 13919 + }, + { + "epoch": 2.969283276450512, + "grad_norm": 0.37015505473518623, + "learning_rate": 0.00011401798570638441, + "loss": 0.9443, + "step": 13920 + }, + { + "epoch": 2.969496587030717, + "grad_norm": 0.35819367425113313, + "learning_rate": 0.00011401028342941765, + "loss": 0.928, + "step": 13921 + }, + { + "epoch": 2.9697098976109215, + "grad_norm": 0.3469654582551231, + "learning_rate": 0.00011400258076762396, + "loss": 0.9279, + "step": 13922 + }, + { + "epoch": 2.969923208191126, + "grad_norm": 0.25042795125608214, + "learning_rate": 0.00011399487772109049, + "loss": 0.9379, + "step": 13923 + }, + { + "epoch": 2.970136518771331, + "grad_norm": 0.21997399991600036, + "learning_rate": 0.0001139871742899044, + "loss": 0.9362, + "step": 13924 + }, + { + "epoch": 2.9703498293515356, + "grad_norm": 0.28420099895569717, + "learning_rate": 0.00011397947047415287, + "loss": 0.9523, + "step": 13925 + }, + { + "epoch": 2.9705631399317407, + "grad_norm": 0.28459616807308485, + "learning_rate": 0.00011397176627392308, + "loss": 0.9344, + "step": 13926 + }, + { + "epoch": 2.9707764505119454, + "grad_norm": 0.28179350838362216, + "learning_rate": 0.00011396406168930215, + "loss": 0.9031, + "step": 13927 + }, + { + "epoch": 2.97098976109215, + "grad_norm": 0.32121308442188484, + "learning_rate": 0.00011395635672037732, + "loss": 0.9498, + "step": 13928 + }, + { + "epoch": 2.971203071672355, + "grad_norm": 0.3347464942843125, + "learning_rate": 0.00011394865136723572, + "loss": 0.9083, + "step": 13929 + }, + { + "epoch": 2.97141638225256, + "grad_norm": 0.27947761001643423, + "learning_rate": 0.00011394094562996456, + "loss": 0.938, + "step": 13930 + }, + { + "epoch": 2.9716296928327646, + "grad_norm": 0.25637461723866956, + "learning_rate": 0.00011393323950865104, + "loss": 0.9542, + "step": 13931 + }, + { + "epoch": 2.9718430034129693, + "grad_norm": 0.19736573777661492, + "learning_rate": 0.00011392553300338235, + "loss": 0.9295, + "step": 13932 + }, + { + "epoch": 2.972056313993174, + "grad_norm": 0.2306312138616108, + "learning_rate": 0.00011391782611424566, + "loss": 0.9394, + "step": 13933 + }, + { + "epoch": 2.9722696245733786, + "grad_norm": 0.26197121202279583, + "learning_rate": 0.00011391011884132822, + "loss": 0.9288, + "step": 13934 + }, + { + "epoch": 2.9724829351535837, + "grad_norm": 0.29694994911476286, + "learning_rate": 0.00011390241118471722, + "loss": 0.9333, + "step": 13935 + }, + { + "epoch": 2.9726962457337884, + "grad_norm": 0.36468667636626695, + "learning_rate": 0.00011389470314449985, + "loss": 0.9136, + "step": 13936 + }, + { + "epoch": 2.972909556313993, + "grad_norm": 0.3296650086523096, + "learning_rate": 0.00011388699472076333, + "loss": 0.9, + "step": 13937 + }, + { + "epoch": 2.973122866894198, + "grad_norm": 0.26007808879521255, + "learning_rate": 0.00011387928591359492, + "loss": 0.9366, + "step": 13938 + }, + { + "epoch": 2.973336177474403, + "grad_norm": 0.2651396966580025, + "learning_rate": 0.0001138715767230818, + "loss": 0.9185, + "step": 13939 + }, + { + "epoch": 2.9735494880546076, + "grad_norm": 0.22221883359660993, + "learning_rate": 0.00011386386714931125, + "loss": 0.9462, + "step": 13940 + }, + { + "epoch": 2.9737627986348123, + "grad_norm": 0.20479571747136552, + "learning_rate": 0.00011385615719237045, + "loss": 0.955, + "step": 13941 + }, + { + "epoch": 2.973976109215017, + "grad_norm": 0.28982067074945, + "learning_rate": 0.00011384844685234665, + "loss": 0.9163, + "step": 13942 + }, + { + "epoch": 2.9741894197952217, + "grad_norm": 0.268069890352163, + "learning_rate": 0.00011384073612932714, + "loss": 0.9436, + "step": 13943 + }, + { + "epoch": 2.9744027303754264, + "grad_norm": 0.2626345834616733, + "learning_rate": 0.00011383302502339912, + "loss": 0.9286, + "step": 13944 + }, + { + "epoch": 2.9746160409556315, + "grad_norm": 0.29575655358092146, + "learning_rate": 0.00011382531353464982, + "loss": 0.9575, + "step": 13945 + }, + { + "epoch": 2.974829351535836, + "grad_norm": 0.2887657843148884, + "learning_rate": 0.00011381760166316658, + "loss": 0.916, + "step": 13946 + }, + { + "epoch": 2.975042662116041, + "grad_norm": 0.22224752203914025, + "learning_rate": 0.00011380988940903657, + "loss": 0.9182, + "step": 13947 + }, + { + "epoch": 2.975255972696246, + "grad_norm": 0.2287864811394127, + "learning_rate": 0.0001138021767723471, + "loss": 0.9287, + "step": 13948 + }, + { + "epoch": 2.9754692832764507, + "grad_norm": 0.1881230161153401, + "learning_rate": 0.00011379446375318545, + "loss": 0.9362, + "step": 13949 + }, + { + "epoch": 2.9756825938566553, + "grad_norm": 0.22975681467610595, + "learning_rate": 0.00011378675035163887, + "loss": 0.9172, + "step": 13950 + }, + { + "epoch": 2.97589590443686, + "grad_norm": 0.19506684256123605, + "learning_rate": 0.0001137790365677946, + "loss": 0.9714, + "step": 13951 + }, + { + "epoch": 2.9761092150170647, + "grad_norm": 0.19906448001524196, + "learning_rate": 0.00011377132240174001, + "loss": 0.8874, + "step": 13952 + }, + { + "epoch": 2.9763225255972694, + "grad_norm": 0.22600434812321873, + "learning_rate": 0.00011376360785356233, + "loss": 0.9159, + "step": 13953 + }, + { + "epoch": 2.9765358361774745, + "grad_norm": 0.29937675086253596, + "learning_rate": 0.00011375589292334884, + "loss": 0.922, + "step": 13954 + }, + { + "epoch": 2.976749146757679, + "grad_norm": 0.36295784502761774, + "learning_rate": 0.00011374817761118688, + "loss": 0.9339, + "step": 13955 + }, + { + "epoch": 2.976962457337884, + "grad_norm": 0.23902803654129168, + "learning_rate": 0.00011374046191716372, + "loss": 0.9208, + "step": 13956 + }, + { + "epoch": 2.9771757679180886, + "grad_norm": 0.27042727025282826, + "learning_rate": 0.00011373274584136664, + "loss": 0.9487, + "step": 13957 + }, + { + "epoch": 2.9773890784982937, + "grad_norm": 0.3045325572499878, + "learning_rate": 0.00011372502938388298, + "loss": 0.9387, + "step": 13958 + }, + { + "epoch": 2.9776023890784984, + "grad_norm": 0.23839894299715778, + "learning_rate": 0.00011371731254480007, + "loss": 0.9112, + "step": 13959 + }, + { + "epoch": 2.977815699658703, + "grad_norm": 0.2954980177062153, + "learning_rate": 0.00011370959532420518, + "loss": 0.8964, + "step": 13960 + }, + { + "epoch": 2.9780290102389078, + "grad_norm": 0.2406878316145122, + "learning_rate": 0.00011370187772218565, + "loss": 0.9423, + "step": 13961 + }, + { + "epoch": 2.9782423208191124, + "grad_norm": 0.22842003356080756, + "learning_rate": 0.00011369415973882883, + "loss": 0.9307, + "step": 13962 + }, + { + "epoch": 2.9784556313993176, + "grad_norm": 0.3065652147625892, + "learning_rate": 0.00011368644137422202, + "loss": 0.9259, + "step": 13963 + }, + { + "epoch": 2.9786689419795223, + "grad_norm": 0.27736608971747234, + "learning_rate": 0.00011367872262845254, + "loss": 0.9137, + "step": 13964 + }, + { + "epoch": 2.978882252559727, + "grad_norm": 0.3303495002815161, + "learning_rate": 0.00011367100350160775, + "loss": 0.9168, + "step": 13965 + }, + { + "epoch": 2.9790955631399316, + "grad_norm": 0.3367353990064867, + "learning_rate": 0.00011366328399377499, + "loss": 0.9394, + "step": 13966 + }, + { + "epoch": 2.9793088737201368, + "grad_norm": 0.3019184939677262, + "learning_rate": 0.00011365556410504162, + "loss": 0.941, + "step": 13967 + }, + { + "epoch": 2.9795221843003414, + "grad_norm": 0.26582859579959384, + "learning_rate": 0.00011364784383549497, + "loss": 0.9156, + "step": 13968 + }, + { + "epoch": 2.979735494880546, + "grad_norm": 0.25184256714930703, + "learning_rate": 0.0001136401231852224, + "loss": 0.9339, + "step": 13969 + }, + { + "epoch": 2.979948805460751, + "grad_norm": 0.2377196048981016, + "learning_rate": 0.00011363240215431126, + "loss": 0.9044, + "step": 13970 + }, + { + "epoch": 2.9801621160409555, + "grad_norm": 0.2559084546223416, + "learning_rate": 0.00011362468074284894, + "loss": 0.9152, + "step": 13971 + }, + { + "epoch": 2.98037542662116, + "grad_norm": 0.2635493843416872, + "learning_rate": 0.00011361695895092276, + "loss": 0.9475, + "step": 13972 + }, + { + "epoch": 2.9805887372013653, + "grad_norm": 0.2888242427344509, + "learning_rate": 0.00011360923677862016, + "loss": 0.9335, + "step": 13973 + }, + { + "epoch": 2.98080204778157, + "grad_norm": 0.2676276165725279, + "learning_rate": 0.00011360151422602848, + "loss": 0.9362, + "step": 13974 + }, + { + "epoch": 2.9810153583617747, + "grad_norm": 0.28808382001310623, + "learning_rate": 0.00011359379129323509, + "loss": 0.9492, + "step": 13975 + }, + { + "epoch": 2.98122866894198, + "grad_norm": 0.29912649054798607, + "learning_rate": 0.00011358606798032737, + "loss": 0.9377, + "step": 13976 + }, + { + "epoch": 2.9814419795221845, + "grad_norm": 0.32418913594411436, + "learning_rate": 0.00011357834428739275, + "loss": 0.9192, + "step": 13977 + }, + { + "epoch": 2.981655290102389, + "grad_norm": 0.29860587431024127, + "learning_rate": 0.0001135706202145186, + "loss": 0.9073, + "step": 13978 + }, + { + "epoch": 2.981868600682594, + "grad_norm": 0.2588347650597032, + "learning_rate": 0.0001135628957617923, + "loss": 0.9007, + "step": 13979 + }, + { + "epoch": 2.9820819112627985, + "grad_norm": 0.21279402017032056, + "learning_rate": 0.00011355517092930129, + "loss": 0.9246, + "step": 13980 + }, + { + "epoch": 2.982295221843003, + "grad_norm": 0.19438238555557902, + "learning_rate": 0.00011354744571713295, + "loss": 0.9009, + "step": 13981 + }, + { + "epoch": 2.9825085324232083, + "grad_norm": 0.23817597160944579, + "learning_rate": 0.00011353972012537468, + "loss": 0.9625, + "step": 13982 + }, + { + "epoch": 2.982721843003413, + "grad_norm": 0.20680205346053654, + "learning_rate": 0.00011353199415411394, + "loss": 0.9064, + "step": 13983 + }, + { + "epoch": 2.9829351535836177, + "grad_norm": 0.25852183789269473, + "learning_rate": 0.0001135242678034381, + "loss": 0.9054, + "step": 13984 + }, + { + "epoch": 2.9831484641638224, + "grad_norm": 0.2549477595476371, + "learning_rate": 0.00011351654107343461, + "loss": 0.9341, + "step": 13985 + }, + { + "epoch": 2.9833617747440275, + "grad_norm": 0.19513664843029083, + "learning_rate": 0.0001135088139641909, + "loss": 0.8827, + "step": 13986 + }, + { + "epoch": 2.983575085324232, + "grad_norm": 0.19444872909525726, + "learning_rate": 0.0001135010864757944, + "loss": 0.9103, + "step": 13987 + }, + { + "epoch": 2.983788395904437, + "grad_norm": 0.2051562051162505, + "learning_rate": 0.00011349335860833253, + "loss": 0.9307, + "step": 13988 + }, + { + "epoch": 2.9840017064846416, + "grad_norm": 0.20231714205660803, + "learning_rate": 0.00011348563036189274, + "loss": 0.8749, + "step": 13989 + }, + { + "epoch": 2.9842150170648463, + "grad_norm": 0.18680195324595494, + "learning_rate": 0.0001134779017365625, + "loss": 0.9297, + "step": 13990 + }, + { + "epoch": 2.984428327645051, + "grad_norm": 0.23139223345131485, + "learning_rate": 0.00011347017273242924, + "loss": 0.9153, + "step": 13991 + }, + { + "epoch": 2.984641638225256, + "grad_norm": 0.23034152161030752, + "learning_rate": 0.0001134624433495804, + "loss": 0.9095, + "step": 13992 + }, + { + "epoch": 2.9848549488054608, + "grad_norm": 0.21811015554697935, + "learning_rate": 0.00011345471358810346, + "loss": 0.9133, + "step": 13993 + }, + { + "epoch": 2.9850682593856654, + "grad_norm": 0.24858642535060294, + "learning_rate": 0.00011344698344808585, + "loss": 0.9643, + "step": 13994 + }, + { + "epoch": 2.9852815699658706, + "grad_norm": 0.216351727071861, + "learning_rate": 0.00011343925292961507, + "loss": 0.9371, + "step": 13995 + }, + { + "epoch": 2.9854948805460753, + "grad_norm": 0.23491483996987803, + "learning_rate": 0.0001134315220327786, + "loss": 0.9237, + "step": 13996 + }, + { + "epoch": 2.98570819112628, + "grad_norm": 0.2658129815875132, + "learning_rate": 0.00011342379075766389, + "loss": 0.9456, + "step": 13997 + }, + { + "epoch": 2.9859215017064846, + "grad_norm": 0.2022959692673982, + "learning_rate": 0.00011341605910435841, + "loss": 0.9207, + "step": 13998 + }, + { + "epoch": 2.9861348122866893, + "grad_norm": 0.22588064377101513, + "learning_rate": 0.00011340832707294965, + "loss": 0.8982, + "step": 13999 + }, + { + "epoch": 2.986348122866894, + "grad_norm": 0.23569360103025266, + "learning_rate": 0.00011340059466352513, + "loss": 0.9557, + "step": 14000 + }, + { + "epoch": 2.986561433447099, + "grad_norm": 0.20758545125224304, + "learning_rate": 0.00011339286187617231, + "loss": 0.9567, + "step": 14001 + }, + { + "epoch": 2.986774744027304, + "grad_norm": 0.2861779340695661, + "learning_rate": 0.0001133851287109787, + "loss": 0.9366, + "step": 14002 + }, + { + "epoch": 2.9869880546075085, + "grad_norm": 0.32890045247442606, + "learning_rate": 0.00011337739516803181, + "loss": 0.9228, + "step": 14003 + }, + { + "epoch": 2.987201365187713, + "grad_norm": 0.2862738670189429, + "learning_rate": 0.0001133696612474191, + "loss": 0.9155, + "step": 14004 + }, + { + "epoch": 2.9874146757679183, + "grad_norm": 0.27450491666077254, + "learning_rate": 0.00011336192694922814, + "loss": 0.9567, + "step": 14005 + }, + { + "epoch": 2.987627986348123, + "grad_norm": 0.2534785893036072, + "learning_rate": 0.0001133541922735464, + "loss": 0.9408, + "step": 14006 + }, + { + "epoch": 2.9878412969283277, + "grad_norm": 0.2909524275054697, + "learning_rate": 0.00011334645722046142, + "loss": 0.9415, + "step": 14007 + }, + { + "epoch": 2.9880546075085324, + "grad_norm": 0.27268810851878156, + "learning_rate": 0.00011333872179006072, + "loss": 0.9226, + "step": 14008 + }, + { + "epoch": 2.988267918088737, + "grad_norm": 0.1972157873937781, + "learning_rate": 0.00011333098598243184, + "loss": 0.8961, + "step": 14009 + }, + { + "epoch": 2.9884812286689417, + "grad_norm": 0.22752717818555637, + "learning_rate": 0.00011332324979766225, + "loss": 0.8982, + "step": 14010 + }, + { + "epoch": 2.988694539249147, + "grad_norm": 0.18513517038214589, + "learning_rate": 0.00011331551323583955, + "loss": 0.8763, + "step": 14011 + }, + { + "epoch": 2.9889078498293515, + "grad_norm": 0.20905334886160817, + "learning_rate": 0.00011330777629705124, + "loss": 0.9397, + "step": 14012 + }, + { + "epoch": 2.989121160409556, + "grad_norm": 0.21405529891907038, + "learning_rate": 0.00011330003898138491, + "loss": 0.9373, + "step": 14013 + }, + { + "epoch": 2.9893344709897613, + "grad_norm": 0.18548296086247779, + "learning_rate": 0.00011329230128892808, + "loss": 0.902, + "step": 14014 + }, + { + "epoch": 2.989547781569966, + "grad_norm": 0.19368985470910582, + "learning_rate": 0.00011328456321976828, + "loss": 0.9521, + "step": 14015 + }, + { + "epoch": 2.9897610921501707, + "grad_norm": 0.23071620567495235, + "learning_rate": 0.00011327682477399308, + "loss": 0.9325, + "step": 14016 + }, + { + "epoch": 2.9899744027303754, + "grad_norm": 0.22720975471687213, + "learning_rate": 0.00011326908595169006, + "loss": 0.9164, + "step": 14017 + }, + { + "epoch": 2.99018771331058, + "grad_norm": 0.17309860715979897, + "learning_rate": 0.00011326134675294676, + "loss": 0.9184, + "step": 14018 + }, + { + "epoch": 2.9904010238907848, + "grad_norm": 0.18719557872491024, + "learning_rate": 0.0001132536071778508, + "loss": 0.9128, + "step": 14019 + }, + { + "epoch": 2.99061433447099, + "grad_norm": 0.19821111008185882, + "learning_rate": 0.0001132458672264897, + "loss": 0.9249, + "step": 14020 + }, + { + "epoch": 2.9908276450511946, + "grad_norm": 0.20250205858145037, + "learning_rate": 0.00011323812689895104, + "loss": 0.9514, + "step": 14021 + }, + { + "epoch": 2.9910409556313993, + "grad_norm": 0.2472823709096212, + "learning_rate": 0.0001132303861953224, + "loss": 0.9344, + "step": 14022 + }, + { + "epoch": 2.991254266211604, + "grad_norm": 0.2661545354010929, + "learning_rate": 0.00011322264511569142, + "loss": 0.9281, + "step": 14023 + }, + { + "epoch": 2.991467576791809, + "grad_norm": 0.3085373627751084, + "learning_rate": 0.00011321490366014563, + "loss": 0.925, + "step": 14024 + }, + { + "epoch": 2.9916808873720138, + "grad_norm": 0.41384923241648525, + "learning_rate": 0.00011320716182877265, + "loss": 0.9478, + "step": 14025 + }, + { + "epoch": 2.9918941979522184, + "grad_norm": 0.320358772339283, + "learning_rate": 0.00011319941962166006, + "loss": 0.9443, + "step": 14026 + }, + { + "epoch": 2.992107508532423, + "grad_norm": 0.27009255479309996, + "learning_rate": 0.00011319167703889551, + "loss": 0.9137, + "step": 14027 + }, + { + "epoch": 2.992320819112628, + "grad_norm": 0.29326463080061943, + "learning_rate": 0.00011318393408056655, + "loss": 0.9155, + "step": 14028 + }, + { + "epoch": 2.9925341296928325, + "grad_norm": 0.23472303046541876, + "learning_rate": 0.00011317619074676082, + "loss": 0.9109, + "step": 14029 + }, + { + "epoch": 2.9927474402730376, + "grad_norm": 0.36851014503296853, + "learning_rate": 0.00011316844703756595, + "loss": 0.9114, + "step": 14030 + }, + { + "epoch": 2.9929607508532423, + "grad_norm": 0.2981177593319385, + "learning_rate": 0.00011316070295306955, + "loss": 0.9251, + "step": 14031 + }, + { + "epoch": 2.993174061433447, + "grad_norm": 0.2562262662653704, + "learning_rate": 0.0001131529584933592, + "loss": 0.9128, + "step": 14032 + }, + { + "epoch": 2.993387372013652, + "grad_norm": 0.21608513740187069, + "learning_rate": 0.00011314521365852261, + "loss": 0.9089, + "step": 14033 + }, + { + "epoch": 2.993600682593857, + "grad_norm": 0.2364801283143955, + "learning_rate": 0.00011313746844864732, + "loss": 0.9169, + "step": 14034 + }, + { + "epoch": 2.9938139931740615, + "grad_norm": 0.26997872047886995, + "learning_rate": 0.00011312972286382107, + "loss": 0.9427, + "step": 14035 + }, + { + "epoch": 2.994027303754266, + "grad_norm": 0.5296394545063435, + "learning_rate": 0.00011312197690413142, + "loss": 0.9266, + "step": 14036 + }, + { + "epoch": 2.994240614334471, + "grad_norm": 0.37539291214895715, + "learning_rate": 0.00011311423056966606, + "loss": 0.9426, + "step": 14037 + }, + { + "epoch": 2.9944539249146755, + "grad_norm": 0.33727411934647145, + "learning_rate": 0.0001131064838605126, + "loss": 0.9184, + "step": 14038 + }, + { + "epoch": 2.9946672354948807, + "grad_norm": 0.2818797028127441, + "learning_rate": 0.00011309873677675874, + "loss": 0.9352, + "step": 14039 + }, + { + "epoch": 2.9948805460750854, + "grad_norm": 0.24975570533053706, + "learning_rate": 0.0001130909893184921, + "loss": 0.9252, + "step": 14040 + }, + { + "epoch": 2.99509385665529, + "grad_norm": 0.313590694939672, + "learning_rate": 0.00011308324148580037, + "loss": 0.9319, + "step": 14041 + }, + { + "epoch": 2.9953071672354947, + "grad_norm": 0.25054538960536615, + "learning_rate": 0.0001130754932787712, + "loss": 0.9461, + "step": 14042 + }, + { + "epoch": 2.9955204778157, + "grad_norm": 0.2067091592955271, + "learning_rate": 0.00011306774469749228, + "loss": 0.9241, + "step": 14043 + }, + { + "epoch": 2.9957337883959045, + "grad_norm": 0.28929920442781043, + "learning_rate": 0.00011305999574205125, + "loss": 0.931, + "step": 14044 + }, + { + "epoch": 2.9959470989761092, + "grad_norm": 0.19467795804651458, + "learning_rate": 0.00011305224641253583, + "loss": 0.929, + "step": 14045 + }, + { + "epoch": 2.996160409556314, + "grad_norm": 0.21004588346927988, + "learning_rate": 0.00011304449670903367, + "loss": 0.9564, + "step": 14046 + }, + { + "epoch": 2.9963737201365186, + "grad_norm": 0.259265608677809, + "learning_rate": 0.0001130367466316325, + "loss": 0.9529, + "step": 14047 + }, + { + "epoch": 2.9965870307167233, + "grad_norm": 0.2123500182876619, + "learning_rate": 0.00011302899618041998, + "loss": 0.92, + "step": 14048 + }, + { + "epoch": 2.9968003412969284, + "grad_norm": 0.21373575002164769, + "learning_rate": 0.00011302124535548382, + "loss": 0.9367, + "step": 14049 + }, + { + "epoch": 2.997013651877133, + "grad_norm": 0.20563553867839704, + "learning_rate": 0.00011301349415691168, + "loss": 0.8974, + "step": 14050 + }, + { + "epoch": 2.9972269624573378, + "grad_norm": 0.22431595974563767, + "learning_rate": 0.00011300574258479131, + "loss": 0.9279, + "step": 14051 + }, + { + "epoch": 2.997440273037543, + "grad_norm": 0.20366033300000141, + "learning_rate": 0.0001129979906392104, + "loss": 0.9115, + "step": 14052 + }, + { + "epoch": 2.9976535836177476, + "grad_norm": 0.19637582223455716, + "learning_rate": 0.0001129902383202567, + "loss": 0.9205, + "step": 14053 + }, + { + "epoch": 2.9978668941979523, + "grad_norm": 0.1960024193422375, + "learning_rate": 0.0001129824856280179, + "loss": 0.9313, + "step": 14054 + }, + { + "epoch": 2.998080204778157, + "grad_norm": 0.21180888871546025, + "learning_rate": 0.0001129747325625817, + "loss": 0.9164, + "step": 14055 + }, + { + "epoch": 2.9982935153583616, + "grad_norm": 0.27852629357774705, + "learning_rate": 0.00011296697912403583, + "loss": 0.9011, + "step": 14056 + }, + { + "epoch": 2.9985068259385663, + "grad_norm": 0.27914188426954095, + "learning_rate": 0.00011295922531246806, + "loss": 0.934, + "step": 14057 + }, + { + "epoch": 2.9987201365187715, + "grad_norm": 0.2478746607853725, + "learning_rate": 0.0001129514711279661, + "loss": 0.9193, + "step": 14058 + }, + { + "epoch": 2.998933447098976, + "grad_norm": 0.20604443179874343, + "learning_rate": 0.0001129437165706177, + "loss": 0.9035, + "step": 14059 + }, + { + "epoch": 2.999146757679181, + "grad_norm": 0.2319557432591981, + "learning_rate": 0.00011293596164051058, + "loss": 0.9771, + "step": 14060 + }, + { + "epoch": 2.9993600682593855, + "grad_norm": 0.2862127392523367, + "learning_rate": 0.00011292820633773249, + "loss": 0.9134, + "step": 14061 + }, + { + "epoch": 2.9995733788395906, + "grad_norm": 0.35813131827556466, + "learning_rate": 0.0001129204506623712, + "loss": 0.9289, + "step": 14062 + }, + { + "epoch": 2.9997866894197953, + "grad_norm": 0.4169966622248479, + "learning_rate": 0.00011291269461451447, + "loss": 0.9312, + "step": 14063 + }, + { + "epoch": 3.0, + "grad_norm": 0.41532463655621826, + "learning_rate": 0.00011290493819425003, + "loss": 0.9377, + "step": 14064 + }, + { + "epoch": 3.0002133105802047, + "grad_norm": 0.31213571320299516, + "learning_rate": 0.00011289718140166567, + "loss": 0.9129, + "step": 14065 + }, + { + "epoch": 3.0004266211604094, + "grad_norm": 0.23702172617490302, + "learning_rate": 0.00011288942423684914, + "loss": 0.9112, + "step": 14066 + }, + { + "epoch": 3.0006399317406145, + "grad_norm": 0.24951306376204282, + "learning_rate": 0.00011288166669988823, + "loss": 0.8981, + "step": 14067 + }, + { + "epoch": 3.000853242320819, + "grad_norm": 0.3014842942692487, + "learning_rate": 0.0001128739087908707, + "loss": 0.931, + "step": 14068 + }, + { + "epoch": 3.001066552901024, + "grad_norm": 0.37642877432851773, + "learning_rate": 0.00011286615050988435, + "loss": 0.9175, + "step": 14069 + }, + { + "epoch": 3.0012798634812285, + "grad_norm": 0.33223095250941725, + "learning_rate": 0.00011285839185701694, + "loss": 0.9086, + "step": 14070 + }, + { + "epoch": 3.0014931740614332, + "grad_norm": 0.2524806086887875, + "learning_rate": 0.0001128506328323563, + "loss": 0.9203, + "step": 14071 + }, + { + "epoch": 3.0017064846416384, + "grad_norm": 0.22931340324426808, + "learning_rate": 0.00011284287343599016, + "loss": 0.9082, + "step": 14072 + }, + { + "epoch": 3.001919795221843, + "grad_norm": 0.23112914429279194, + "learning_rate": 0.00011283511366800638, + "loss": 0.9611, + "step": 14073 + }, + { + "epoch": 3.0021331058020477, + "grad_norm": 0.23330502549015156, + "learning_rate": 0.00011282735352849273, + "loss": 0.9017, + "step": 14074 + }, + { + "epoch": 3.0023464163822524, + "grad_norm": 0.30419237500659735, + "learning_rate": 0.00011281959301753705, + "loss": 0.9011, + "step": 14075 + }, + { + "epoch": 3.0025597269624575, + "grad_norm": 0.2570779435605605, + "learning_rate": 0.00011281183213522707, + "loss": 0.8883, + "step": 14076 + }, + { + "epoch": 3.0027730375426622, + "grad_norm": 0.23836060764506323, + "learning_rate": 0.0001128040708816507, + "loss": 0.9221, + "step": 14077 + }, + { + "epoch": 3.002986348122867, + "grad_norm": 0.23914883118142327, + "learning_rate": 0.00011279630925689568, + "loss": 0.9565, + "step": 14078 + }, + { + "epoch": 3.0031996587030716, + "grad_norm": 0.22706217806190182, + "learning_rate": 0.00011278854726104989, + "loss": 0.9101, + "step": 14079 + }, + { + "epoch": 3.0034129692832763, + "grad_norm": 0.21814551457461823, + "learning_rate": 0.00011278078489420113, + "loss": 0.9295, + "step": 14080 + }, + { + "epoch": 3.0036262798634814, + "grad_norm": 0.2650922994808264, + "learning_rate": 0.00011277302215643726, + "loss": 0.9222, + "step": 14081 + }, + { + "epoch": 3.003839590443686, + "grad_norm": 0.2537452872448505, + "learning_rate": 0.00011276525904784606, + "loss": 0.9442, + "step": 14082 + }, + { + "epoch": 3.0040529010238908, + "grad_norm": 0.21606551007161443, + "learning_rate": 0.00011275749556851543, + "loss": 0.9494, + "step": 14083 + }, + { + "epoch": 3.0042662116040955, + "grad_norm": 0.287904989502361, + "learning_rate": 0.00011274973171853316, + "loss": 0.9257, + "step": 14084 + }, + { + "epoch": 3.0044795221843, + "grad_norm": 0.3193684151737393, + "learning_rate": 0.00011274196749798714, + "loss": 0.9272, + "step": 14085 + }, + { + "epoch": 3.0046928327645053, + "grad_norm": 0.26260391031320685, + "learning_rate": 0.00011273420290696522, + "loss": 0.9021, + "step": 14086 + }, + { + "epoch": 3.00490614334471, + "grad_norm": 0.2777644346375117, + "learning_rate": 0.00011272643794555523, + "loss": 0.9168, + "step": 14087 + }, + { + "epoch": 3.0051194539249146, + "grad_norm": 0.3202269880236777, + "learning_rate": 0.00011271867261384505, + "loss": 0.9272, + "step": 14088 + }, + { + "epoch": 3.0053327645051193, + "grad_norm": 0.32744915236509325, + "learning_rate": 0.00011271090691192254, + "loss": 0.9177, + "step": 14089 + }, + { + "epoch": 3.0055460750853245, + "grad_norm": 0.19771826551997898, + "learning_rate": 0.00011270314083987555, + "loss": 0.9334, + "step": 14090 + }, + { + "epoch": 3.005759385665529, + "grad_norm": 0.23742687783423594, + "learning_rate": 0.00011269537439779196, + "loss": 0.8999, + "step": 14091 + }, + { + "epoch": 3.005972696245734, + "grad_norm": 0.2503160310097579, + "learning_rate": 0.0001126876075857597, + "loss": 0.9086, + "step": 14092 + }, + { + "epoch": 3.0061860068259385, + "grad_norm": 0.31349560067227433, + "learning_rate": 0.00011267984040386661, + "loss": 0.9162, + "step": 14093 + }, + { + "epoch": 3.006399317406143, + "grad_norm": 0.3301197809499645, + "learning_rate": 0.00011267207285220054, + "loss": 0.9096, + "step": 14094 + }, + { + "epoch": 3.0066126279863483, + "grad_norm": 0.29981961970283916, + "learning_rate": 0.00011266430493084944, + "loss": 0.9196, + "step": 14095 + }, + { + "epoch": 3.006825938566553, + "grad_norm": 0.28298570729620476, + "learning_rate": 0.00011265653663990116, + "loss": 0.9493, + "step": 14096 + }, + { + "epoch": 3.0070392491467577, + "grad_norm": 0.23909902853977466, + "learning_rate": 0.00011264876797944365, + "loss": 0.9194, + "step": 14097 + }, + { + "epoch": 3.0072525597269624, + "grad_norm": 0.20467677712413374, + "learning_rate": 0.00011264099894956476, + "loss": 0.9105, + "step": 14098 + }, + { + "epoch": 3.007465870307167, + "grad_norm": 0.19726950307789, + "learning_rate": 0.00011263322955035243, + "loss": 0.9112, + "step": 14099 + }, + { + "epoch": 3.007679180887372, + "grad_norm": 0.2222897195244877, + "learning_rate": 0.00011262545978189454, + "loss": 0.9131, + "step": 14100 + }, + { + "epoch": 3.007892491467577, + "grad_norm": 0.22222536459856826, + "learning_rate": 0.00011261768964427904, + "loss": 0.9089, + "step": 14101 + }, + { + "epoch": 3.0081058020477816, + "grad_norm": 0.19482418008935598, + "learning_rate": 0.00011260991913759381, + "loss": 0.899, + "step": 14102 + }, + { + "epoch": 3.0083191126279862, + "grad_norm": 0.23039669755107073, + "learning_rate": 0.00011260214826192684, + "loss": 0.9131, + "step": 14103 + }, + { + "epoch": 3.008532423208191, + "grad_norm": 0.22557371627183542, + "learning_rate": 0.00011259437701736599, + "loss": 0.9099, + "step": 14104 + }, + { + "epoch": 3.008745733788396, + "grad_norm": 0.21196167675139094, + "learning_rate": 0.0001125866054039992, + "loss": 0.914, + "step": 14105 + }, + { + "epoch": 3.0089590443686007, + "grad_norm": 0.23175131944887867, + "learning_rate": 0.00011257883342191444, + "loss": 0.893, + "step": 14106 + }, + { + "epoch": 3.0091723549488054, + "grad_norm": 0.2663885319428798, + "learning_rate": 0.00011257106107119961, + "loss": 0.9242, + "step": 14107 + }, + { + "epoch": 3.00938566552901, + "grad_norm": 0.3166553448024948, + "learning_rate": 0.0001125632883519427, + "loss": 0.9335, + "step": 14108 + }, + { + "epoch": 3.0095989761092152, + "grad_norm": 0.33588591981110144, + "learning_rate": 0.00011255551526423164, + "loss": 0.9001, + "step": 14109 + }, + { + "epoch": 3.00981228668942, + "grad_norm": 0.27626687370469927, + "learning_rate": 0.00011254774180815436, + "loss": 0.9017, + "step": 14110 + }, + { + "epoch": 3.0100255972696246, + "grad_norm": 0.18474267890230045, + "learning_rate": 0.00011253996798379884, + "loss": 0.9357, + "step": 14111 + }, + { + "epoch": 3.0102389078498293, + "grad_norm": 0.23810288917971742, + "learning_rate": 0.00011253219379125302, + "loss": 0.9072, + "step": 14112 + }, + { + "epoch": 3.010452218430034, + "grad_norm": 0.2510846620441082, + "learning_rate": 0.0001125244192306049, + "loss": 0.9477, + "step": 14113 + }, + { + "epoch": 3.010665529010239, + "grad_norm": 0.29631984912540504, + "learning_rate": 0.00011251664430194244, + "loss": 0.9074, + "step": 14114 + }, + { + "epoch": 3.010878839590444, + "grad_norm": 0.28944102009629763, + "learning_rate": 0.00011250886900535359, + "loss": 0.9237, + "step": 14115 + }, + { + "epoch": 3.0110921501706485, + "grad_norm": 0.284080337847475, + "learning_rate": 0.00011250109334092632, + "loss": 0.9044, + "step": 14116 + }, + { + "epoch": 3.011305460750853, + "grad_norm": 0.34498174193895437, + "learning_rate": 0.00011249331730874865, + "loss": 0.9355, + "step": 14117 + }, + { + "epoch": 3.011518771331058, + "grad_norm": 0.26113125569802936, + "learning_rate": 0.00011248554090890856, + "loss": 0.8746, + "step": 14118 + }, + { + "epoch": 3.011732081911263, + "grad_norm": 0.26837311033710315, + "learning_rate": 0.000112477764141494, + "loss": 0.9634, + "step": 14119 + }, + { + "epoch": 3.0119453924914676, + "grad_norm": 0.25227719161539647, + "learning_rate": 0.00011246998700659302, + "loss": 0.8979, + "step": 14120 + }, + { + "epoch": 3.0121587030716723, + "grad_norm": 0.25424428930064114, + "learning_rate": 0.00011246220950429359, + "loss": 0.9166, + "step": 14121 + }, + { + "epoch": 3.012372013651877, + "grad_norm": 0.2382234704461939, + "learning_rate": 0.00011245443163468369, + "loss": 0.8949, + "step": 14122 + }, + { + "epoch": 3.0125853242320817, + "grad_norm": 0.19409429612283155, + "learning_rate": 0.00011244665339785137, + "loss": 0.9143, + "step": 14123 + }, + { + "epoch": 3.012798634812287, + "grad_norm": 0.223601454669974, + "learning_rate": 0.0001124388747938846, + "loss": 0.8708, + "step": 14124 + }, + { + "epoch": 3.0130119453924915, + "grad_norm": 0.1993178262527722, + "learning_rate": 0.00011243109582287143, + "loss": 0.9363, + "step": 14125 + }, + { + "epoch": 3.013225255972696, + "grad_norm": 0.22747120449896097, + "learning_rate": 0.00011242331648489988, + "loss": 0.8747, + "step": 14126 + }, + { + "epoch": 3.013438566552901, + "grad_norm": 0.2603249718396535, + "learning_rate": 0.00011241553678005794, + "loss": 0.938, + "step": 14127 + }, + { + "epoch": 3.013651877133106, + "grad_norm": 0.21456401511451678, + "learning_rate": 0.00011240775670843365, + "loss": 0.9041, + "step": 14128 + }, + { + "epoch": 3.0138651877133107, + "grad_norm": 0.2269387649440317, + "learning_rate": 0.00011239997627011506, + "loss": 0.8984, + "step": 14129 + }, + { + "epoch": 3.0140784982935154, + "grad_norm": 0.2518735266138465, + "learning_rate": 0.00011239219546519018, + "loss": 0.8899, + "step": 14130 + }, + { + "epoch": 3.01429180887372, + "grad_norm": 0.20143107090062257, + "learning_rate": 0.0001123844142937471, + "loss": 0.9025, + "step": 14131 + }, + { + "epoch": 3.0145051194539247, + "grad_norm": 0.31584748572878224, + "learning_rate": 0.00011237663275587379, + "loss": 0.9256, + "step": 14132 + }, + { + "epoch": 3.01471843003413, + "grad_norm": 0.3088656979269443, + "learning_rate": 0.00011236885085165835, + "loss": 0.9046, + "step": 14133 + }, + { + "epoch": 3.0149317406143346, + "grad_norm": 0.2778309938826599, + "learning_rate": 0.00011236106858118881, + "loss": 0.9181, + "step": 14134 + }, + { + "epoch": 3.0151450511945392, + "grad_norm": 0.34822571137407604, + "learning_rate": 0.00011235328594455325, + "loss": 0.9387, + "step": 14135 + }, + { + "epoch": 3.015358361774744, + "grad_norm": 0.33627377705110195, + "learning_rate": 0.00011234550294183969, + "loss": 0.9372, + "step": 14136 + }, + { + "epoch": 3.0155716723549486, + "grad_norm": 0.3268339066305602, + "learning_rate": 0.00011233771957313626, + "loss": 0.8923, + "step": 14137 + }, + { + "epoch": 3.0157849829351537, + "grad_norm": 0.3139793620891975, + "learning_rate": 0.00011232993583853095, + "loss": 0.9522, + "step": 14138 + }, + { + "epoch": 3.0159982935153584, + "grad_norm": 0.23967819266893167, + "learning_rate": 0.00011232215173811189, + "loss": 0.9168, + "step": 14139 + }, + { + "epoch": 3.016211604095563, + "grad_norm": 0.23261650807476492, + "learning_rate": 0.00011231436727196713, + "loss": 0.9453, + "step": 14140 + }, + { + "epoch": 3.016424914675768, + "grad_norm": 0.2171603987618689, + "learning_rate": 0.00011230658244018475, + "loss": 0.9344, + "step": 14141 + }, + { + "epoch": 3.016638225255973, + "grad_norm": 0.32720587010997687, + "learning_rate": 0.00011229879724285287, + "loss": 0.9183, + "step": 14142 + }, + { + "epoch": 3.0168515358361776, + "grad_norm": 0.40585855829637385, + "learning_rate": 0.00011229101168005956, + "loss": 0.9154, + "step": 14143 + }, + { + "epoch": 3.0170648464163823, + "grad_norm": 0.4107101533535735, + "learning_rate": 0.0001122832257518929, + "loss": 0.9449, + "step": 14144 + }, + { + "epoch": 3.017278156996587, + "grad_norm": 0.31571330378165785, + "learning_rate": 0.000112275439458441, + "loss": 0.9462, + "step": 14145 + }, + { + "epoch": 3.0174914675767917, + "grad_norm": 0.2544699551650217, + "learning_rate": 0.00011226765279979195, + "loss": 0.9326, + "step": 14146 + }, + { + "epoch": 3.017704778156997, + "grad_norm": 0.24894389899015923, + "learning_rate": 0.00011225986577603387, + "loss": 0.9174, + "step": 14147 + }, + { + "epoch": 3.0179180887372015, + "grad_norm": 0.22185737704982722, + "learning_rate": 0.00011225207838725488, + "loss": 0.9519, + "step": 14148 + }, + { + "epoch": 3.018131399317406, + "grad_norm": 0.24407818904505807, + "learning_rate": 0.00011224429063354306, + "loss": 0.9213, + "step": 14149 + }, + { + "epoch": 3.018344709897611, + "grad_norm": 0.22538152039646273, + "learning_rate": 0.00011223650251498655, + "loss": 0.9022, + "step": 14150 + }, + { + "epoch": 3.0185580204778155, + "grad_norm": 0.1990218217981597, + "learning_rate": 0.00011222871403167349, + "loss": 0.885, + "step": 14151 + }, + { + "epoch": 3.0187713310580206, + "grad_norm": 0.21631273270678353, + "learning_rate": 0.00011222092518369198, + "loss": 0.9199, + "step": 14152 + }, + { + "epoch": 3.0189846416382253, + "grad_norm": 0.20174623203862818, + "learning_rate": 0.00011221313597113014, + "loss": 0.8857, + "step": 14153 + }, + { + "epoch": 3.01919795221843, + "grad_norm": 0.22212109452814371, + "learning_rate": 0.00011220534639407617, + "loss": 0.9218, + "step": 14154 + }, + { + "epoch": 3.0194112627986347, + "grad_norm": 0.24078397536218174, + "learning_rate": 0.00011219755645261814, + "loss": 0.9242, + "step": 14155 + }, + { + "epoch": 3.0196245733788394, + "grad_norm": 0.20057536086605576, + "learning_rate": 0.00011218976614684419, + "loss": 0.912, + "step": 14156 + }, + { + "epoch": 3.0198378839590445, + "grad_norm": 0.25255153405019765, + "learning_rate": 0.00011218197547684252, + "loss": 0.9177, + "step": 14157 + }, + { + "epoch": 3.020051194539249, + "grad_norm": 0.23036531825734746, + "learning_rate": 0.00011217418444270124, + "loss": 0.9585, + "step": 14158 + }, + { + "epoch": 3.020264505119454, + "grad_norm": 0.2209305508263844, + "learning_rate": 0.00011216639304450856, + "loss": 0.9144, + "step": 14159 + }, + { + "epoch": 3.0204778156996586, + "grad_norm": 0.2617112635884293, + "learning_rate": 0.00011215860128235257, + "loss": 0.9095, + "step": 14160 + }, + { + "epoch": 3.0206911262798637, + "grad_norm": 0.24554065145808107, + "learning_rate": 0.0001121508091563215, + "loss": 0.9233, + "step": 14161 + }, + { + "epoch": 3.0209044368600684, + "grad_norm": 0.25511069594457697, + "learning_rate": 0.00011214301666650344, + "loss": 0.9031, + "step": 14162 + }, + { + "epoch": 3.021117747440273, + "grad_norm": 0.30999387156641994, + "learning_rate": 0.00011213522381298663, + "loss": 0.8909, + "step": 14163 + }, + { + "epoch": 3.0213310580204777, + "grad_norm": 0.25637275045743174, + "learning_rate": 0.00011212743059585919, + "loss": 0.9405, + "step": 14164 + }, + { + "epoch": 3.0215443686006824, + "grad_norm": 0.2091922855795325, + "learning_rate": 0.00011211963701520937, + "loss": 0.9347, + "step": 14165 + }, + { + "epoch": 3.0217576791808876, + "grad_norm": 0.29148349648523336, + "learning_rate": 0.0001121118430711253, + "loss": 0.899, + "step": 14166 + }, + { + "epoch": 3.0219709897610922, + "grad_norm": 0.23250017666529926, + "learning_rate": 0.0001121040487636952, + "loss": 0.9482, + "step": 14167 + }, + { + "epoch": 3.022184300341297, + "grad_norm": 0.2330535460540911, + "learning_rate": 0.00011209625409300723, + "loss": 0.9156, + "step": 14168 + }, + { + "epoch": 3.0223976109215016, + "grad_norm": 0.2092380919817914, + "learning_rate": 0.00011208845905914962, + "loss": 0.8926, + "step": 14169 + }, + { + "epoch": 3.0226109215017063, + "grad_norm": 0.2626970437217587, + "learning_rate": 0.00011208066366221055, + "loss": 0.9094, + "step": 14170 + }, + { + "epoch": 3.0228242320819114, + "grad_norm": 0.22725439688710677, + "learning_rate": 0.00011207286790227822, + "loss": 0.9079, + "step": 14171 + }, + { + "epoch": 3.023037542662116, + "grad_norm": 0.2487242304700758, + "learning_rate": 0.00011206507177944088, + "loss": 0.9162, + "step": 14172 + }, + { + "epoch": 3.023250853242321, + "grad_norm": 0.19678804479152365, + "learning_rate": 0.00011205727529378667, + "loss": 0.9117, + "step": 14173 + }, + { + "epoch": 3.0234641638225255, + "grad_norm": 0.2447650272163576, + "learning_rate": 0.00011204947844540387, + "loss": 0.9199, + "step": 14174 + }, + { + "epoch": 3.02367747440273, + "grad_norm": 0.3139139830177321, + "learning_rate": 0.00011204168123438071, + "loss": 0.8865, + "step": 14175 + }, + { + "epoch": 3.0238907849829353, + "grad_norm": 0.33362098586541944, + "learning_rate": 0.00011203388366080537, + "loss": 0.9207, + "step": 14176 + }, + { + "epoch": 3.02410409556314, + "grad_norm": 0.3192015423787364, + "learning_rate": 0.00011202608572476607, + "loss": 0.9213, + "step": 14177 + }, + { + "epoch": 3.0243174061433447, + "grad_norm": 0.3136047663744056, + "learning_rate": 0.00011201828742635111, + "loss": 0.8953, + "step": 14178 + }, + { + "epoch": 3.0245307167235493, + "grad_norm": 0.3088363150975604, + "learning_rate": 0.00011201048876564866, + "loss": 0.9286, + "step": 14179 + }, + { + "epoch": 3.0247440273037545, + "grad_norm": 0.26795018879650817, + "learning_rate": 0.00011200268974274701, + "loss": 0.9576, + "step": 14180 + }, + { + "epoch": 3.024957337883959, + "grad_norm": 0.1923771150792446, + "learning_rate": 0.00011199489035773438, + "loss": 0.921, + "step": 14181 + }, + { + "epoch": 3.025170648464164, + "grad_norm": 0.2893572812005957, + "learning_rate": 0.00011198709061069904, + "loss": 0.9252, + "step": 14182 + }, + { + "epoch": 3.0253839590443685, + "grad_norm": 0.32914501340224717, + "learning_rate": 0.0001119792905017292, + "loss": 0.8768, + "step": 14183 + }, + { + "epoch": 3.025597269624573, + "grad_norm": 0.2985120906458398, + "learning_rate": 0.00011197149003091318, + "loss": 0.9317, + "step": 14184 + }, + { + "epoch": 3.0258105802047783, + "grad_norm": 0.30796797123154473, + "learning_rate": 0.00011196368919833921, + "loss": 0.9084, + "step": 14185 + }, + { + "epoch": 3.026023890784983, + "grad_norm": 0.3462138231387973, + "learning_rate": 0.00011195588800409553, + "loss": 0.9092, + "step": 14186 + }, + { + "epoch": 3.0262372013651877, + "grad_norm": 0.3729595972720295, + "learning_rate": 0.00011194808644827048, + "loss": 0.9302, + "step": 14187 + }, + { + "epoch": 3.0264505119453924, + "grad_norm": 0.337744104251167, + "learning_rate": 0.00011194028453095227, + "loss": 0.897, + "step": 14188 + }, + { + "epoch": 3.026663822525597, + "grad_norm": 0.30058602149650887, + "learning_rate": 0.00011193248225222921, + "loss": 0.896, + "step": 14189 + }, + { + "epoch": 3.026877133105802, + "grad_norm": 0.3227098892019893, + "learning_rate": 0.00011192467961218958, + "loss": 0.8822, + "step": 14190 + }, + { + "epoch": 3.027090443686007, + "grad_norm": 0.318868250340003, + "learning_rate": 0.00011191687661092167, + "loss": 0.9197, + "step": 14191 + }, + { + "epoch": 3.0273037542662116, + "grad_norm": 0.3032960915082147, + "learning_rate": 0.00011190907324851374, + "loss": 0.9194, + "step": 14192 + }, + { + "epoch": 3.0275170648464163, + "grad_norm": 0.25507317739957525, + "learning_rate": 0.00011190126952505413, + "loss": 0.9301, + "step": 14193 + }, + { + "epoch": 3.0277303754266214, + "grad_norm": 0.6805249098529635, + "learning_rate": 0.00011189346544063111, + "loss": 0.9217, + "step": 14194 + }, + { + "epoch": 3.027943686006826, + "grad_norm": 0.21341793007214552, + "learning_rate": 0.00011188566099533298, + "loss": 0.9143, + "step": 14195 + }, + { + "epoch": 3.0281569965870307, + "grad_norm": 0.21370331864760053, + "learning_rate": 0.00011187785618924807, + "loss": 0.9045, + "step": 14196 + }, + { + "epoch": 3.0283703071672354, + "grad_norm": 0.3043789694236154, + "learning_rate": 0.00011187005102246471, + "loss": 0.9362, + "step": 14197 + }, + { + "epoch": 3.02858361774744, + "grad_norm": 0.3158974355791167, + "learning_rate": 0.00011186224549507115, + "loss": 0.9514, + "step": 14198 + }, + { + "epoch": 3.0287969283276452, + "grad_norm": 0.25280023871921625, + "learning_rate": 0.00011185443960715575, + "loss": 0.9283, + "step": 14199 + }, + { + "epoch": 3.02901023890785, + "grad_norm": 0.2780185788637334, + "learning_rate": 0.00011184663335880685, + "loss": 0.9278, + "step": 14200 + }, + { + "epoch": 3.0292235494880546, + "grad_norm": 0.3850719422651617, + "learning_rate": 0.00011183882675011273, + "loss": 0.9386, + "step": 14201 + }, + { + "epoch": 3.0294368600682593, + "grad_norm": 0.30641133076719823, + "learning_rate": 0.00011183101978116178, + "loss": 0.9015, + "step": 14202 + }, + { + "epoch": 3.029650170648464, + "grad_norm": 0.3127495105088835, + "learning_rate": 0.00011182321245204229, + "loss": 0.9407, + "step": 14203 + }, + { + "epoch": 3.029863481228669, + "grad_norm": 0.3335884265381771, + "learning_rate": 0.00011181540476284263, + "loss": 0.9466, + "step": 14204 + }, + { + "epoch": 3.030076791808874, + "grad_norm": 0.2688578104997089, + "learning_rate": 0.0001118075967136511, + "loss": 0.9122, + "step": 14205 + }, + { + "epoch": 3.0302901023890785, + "grad_norm": 0.2929173500063964, + "learning_rate": 0.00011179978830455609, + "loss": 0.9233, + "step": 14206 + }, + { + "epoch": 3.030503412969283, + "grad_norm": 0.28517643484711036, + "learning_rate": 0.00011179197953564594, + "loss": 0.9174, + "step": 14207 + }, + { + "epoch": 3.030716723549488, + "grad_norm": 0.318568884049982, + "learning_rate": 0.000111784170407009, + "loss": 0.9293, + "step": 14208 + }, + { + "epoch": 3.030930034129693, + "grad_norm": 0.41884681608901053, + "learning_rate": 0.00011177636091873368, + "loss": 0.9098, + "step": 14209 + }, + { + "epoch": 3.0311433447098977, + "grad_norm": 0.5249443254151273, + "learning_rate": 0.00011176855107090827, + "loss": 0.9338, + "step": 14210 + }, + { + "epoch": 3.0313566552901023, + "grad_norm": 0.6279424264590826, + "learning_rate": 0.00011176074086362114, + "loss": 0.9373, + "step": 14211 + }, + { + "epoch": 3.031569965870307, + "grad_norm": 0.6593096309326785, + "learning_rate": 0.00011175293029696076, + "loss": 0.9602, + "step": 14212 + }, + { + "epoch": 3.031783276450512, + "grad_norm": 0.6396415715824958, + "learning_rate": 0.00011174511937101537, + "loss": 0.9325, + "step": 14213 + }, + { + "epoch": 3.031996587030717, + "grad_norm": 0.6070816761435013, + "learning_rate": 0.00011173730808587347, + "loss": 0.884, + "step": 14214 + }, + { + "epoch": 3.0322098976109215, + "grad_norm": 0.3308410071712688, + "learning_rate": 0.00011172949644162338, + "loss": 0.9303, + "step": 14215 + }, + { + "epoch": 3.032423208191126, + "grad_norm": 0.24109376109339362, + "learning_rate": 0.00011172168443835351, + "loss": 0.9183, + "step": 14216 + }, + { + "epoch": 3.032636518771331, + "grad_norm": 0.46848656794820276, + "learning_rate": 0.00011171387207615222, + "loss": 0.9204, + "step": 14217 + }, + { + "epoch": 3.032849829351536, + "grad_norm": 0.5518695413519762, + "learning_rate": 0.00011170605935510796, + "loss": 0.9398, + "step": 14218 + }, + { + "epoch": 3.0330631399317407, + "grad_norm": 0.5430008635516331, + "learning_rate": 0.0001116982462753091, + "loss": 0.9466, + "step": 14219 + }, + { + "epoch": 3.0332764505119454, + "grad_norm": 0.535212531235563, + "learning_rate": 0.00011169043283684405, + "loss": 0.9105, + "step": 14220 + }, + { + "epoch": 3.03348976109215, + "grad_norm": 0.36166358968659545, + "learning_rate": 0.00011168261903980122, + "loss": 0.894, + "step": 14221 + }, + { + "epoch": 3.0337030716723548, + "grad_norm": 0.21693407615595786, + "learning_rate": 0.00011167480488426903, + "loss": 0.9028, + "step": 14222 + }, + { + "epoch": 3.03391638225256, + "grad_norm": 0.36338359718753954, + "learning_rate": 0.00011166699037033587, + "loss": 0.8928, + "step": 14223 + }, + { + "epoch": 3.0341296928327646, + "grad_norm": 0.41439510711301236, + "learning_rate": 0.0001116591754980902, + "loss": 0.9374, + "step": 14224 + }, + { + "epoch": 3.0343430034129693, + "grad_norm": 0.3363782818945796, + "learning_rate": 0.00011165136026762042, + "loss": 0.884, + "step": 14225 + }, + { + "epoch": 3.034556313993174, + "grad_norm": 0.2609004228864899, + "learning_rate": 0.00011164354467901497, + "loss": 0.8971, + "step": 14226 + }, + { + "epoch": 3.0347696245733786, + "grad_norm": 0.26432800446368293, + "learning_rate": 0.00011163572873236228, + "loss": 0.9418, + "step": 14227 + }, + { + "epoch": 3.0349829351535837, + "grad_norm": 0.24283793145244847, + "learning_rate": 0.00011162791242775078, + "loss": 0.9206, + "step": 14228 + }, + { + "epoch": 3.0351962457337884, + "grad_norm": 0.25962812756133663, + "learning_rate": 0.00011162009576526893, + "loss": 0.8872, + "step": 14229 + }, + { + "epoch": 3.035409556313993, + "grad_norm": 0.24614815952394647, + "learning_rate": 0.00011161227874500513, + "loss": 0.9193, + "step": 14230 + }, + { + "epoch": 3.035622866894198, + "grad_norm": 0.24683405053103022, + "learning_rate": 0.00011160446136704789, + "loss": 0.9332, + "step": 14231 + }, + { + "epoch": 3.035836177474403, + "grad_norm": 0.34344536039784745, + "learning_rate": 0.00011159664363148565, + "loss": 0.9254, + "step": 14232 + }, + { + "epoch": 3.0360494880546076, + "grad_norm": 0.37584447008951105, + "learning_rate": 0.00011158882553840684, + "loss": 0.9192, + "step": 14233 + }, + { + "epoch": 3.0362627986348123, + "grad_norm": 0.28927416756502233, + "learning_rate": 0.00011158100708789996, + "loss": 0.9242, + "step": 14234 + }, + { + "epoch": 3.036476109215017, + "grad_norm": 0.29411089980751437, + "learning_rate": 0.00011157318828005341, + "loss": 0.893, + "step": 14235 + }, + { + "epoch": 3.0366894197952217, + "grad_norm": 0.33200248613599653, + "learning_rate": 0.00011156536911495574, + "loss": 0.9152, + "step": 14236 + }, + { + "epoch": 3.036902730375427, + "grad_norm": 0.22107200639981492, + "learning_rate": 0.00011155754959269537, + "loss": 0.877, + "step": 14237 + }, + { + "epoch": 3.0371160409556315, + "grad_norm": 0.27216787082700794, + "learning_rate": 0.00011154972971336082, + "loss": 0.9394, + "step": 14238 + }, + { + "epoch": 3.037329351535836, + "grad_norm": 0.3049156845881607, + "learning_rate": 0.00011154190947704053, + "loss": 0.9237, + "step": 14239 + }, + { + "epoch": 3.037542662116041, + "grad_norm": 0.23034693457884625, + "learning_rate": 0.000111534088883823, + "loss": 0.8866, + "step": 14240 + }, + { + "epoch": 3.0377559726962455, + "grad_norm": 0.26897149222887523, + "learning_rate": 0.00011152626793379672, + "loss": 0.918, + "step": 14241 + }, + { + "epoch": 3.0379692832764507, + "grad_norm": 0.25046115455567153, + "learning_rate": 0.00011151844662705019, + "loss": 0.9159, + "step": 14242 + }, + { + "epoch": 3.0381825938566553, + "grad_norm": 0.22351123012261517, + "learning_rate": 0.00011151062496367193, + "loss": 0.9145, + "step": 14243 + }, + { + "epoch": 3.03839590443686, + "grad_norm": 0.2029994809136833, + "learning_rate": 0.00011150280294375039, + "loss": 0.9156, + "step": 14244 + }, + { + "epoch": 3.0386092150170647, + "grad_norm": 0.21154948302981333, + "learning_rate": 0.00011149498056737411, + "loss": 0.8837, + "step": 14245 + }, + { + "epoch": 3.03882252559727, + "grad_norm": 0.2370275494114809, + "learning_rate": 0.00011148715783463161, + "loss": 0.896, + "step": 14246 + }, + { + "epoch": 3.0390358361774745, + "grad_norm": 0.26466712825449673, + "learning_rate": 0.00011147933474561136, + "loss": 0.9926, + "step": 14247 + }, + { + "epoch": 3.039249146757679, + "grad_norm": 0.24752361533463302, + "learning_rate": 0.00011147151130040194, + "loss": 0.9141, + "step": 14248 + }, + { + "epoch": 3.039462457337884, + "grad_norm": 0.21646416247119252, + "learning_rate": 0.00011146368749909184, + "loss": 0.929, + "step": 14249 + }, + { + "epoch": 3.0396757679180886, + "grad_norm": 0.2563755006559458, + "learning_rate": 0.00011145586334176957, + "loss": 0.909, + "step": 14250 + }, + { + "epoch": 3.0398890784982937, + "grad_norm": 0.26752734216228224, + "learning_rate": 0.00011144803882852368, + "loss": 0.9395, + "step": 14251 + }, + { + "epoch": 3.0401023890784984, + "grad_norm": 0.22872292281038592, + "learning_rate": 0.00011144021395944271, + "loss": 0.9158, + "step": 14252 + }, + { + "epoch": 3.040315699658703, + "grad_norm": 0.2609151378955834, + "learning_rate": 0.00011143238873461518, + "loss": 0.9099, + "step": 14253 + }, + { + "epoch": 3.0405290102389078, + "grad_norm": 0.3452668844316494, + "learning_rate": 0.00011142456315412965, + "loss": 0.8962, + "step": 14254 + }, + { + "epoch": 3.0407423208191124, + "grad_norm": 0.25579071744459003, + "learning_rate": 0.00011141673721807465, + "loss": 0.9202, + "step": 14255 + }, + { + "epoch": 3.0409556313993176, + "grad_norm": 0.28360883747517474, + "learning_rate": 0.00011140891092653875, + "loss": 0.9392, + "step": 14256 + }, + { + "epoch": 3.0411689419795223, + "grad_norm": 0.26583014801198573, + "learning_rate": 0.00011140108427961048, + "loss": 0.9026, + "step": 14257 + }, + { + "epoch": 3.041382252559727, + "grad_norm": 0.24190722897592798, + "learning_rate": 0.00011139325727737842, + "loss": 0.8991, + "step": 14258 + }, + { + "epoch": 3.0415955631399316, + "grad_norm": 0.28995995959402165, + "learning_rate": 0.00011138542991993114, + "loss": 0.8961, + "step": 14259 + }, + { + "epoch": 3.0418088737201363, + "grad_norm": 0.3995205585078077, + "learning_rate": 0.00011137760220735718, + "loss": 0.9029, + "step": 14260 + }, + { + "epoch": 3.0420221843003414, + "grad_norm": 0.3761874694845022, + "learning_rate": 0.00011136977413974513, + "loss": 0.923, + "step": 14261 + }, + { + "epoch": 3.042235494880546, + "grad_norm": 0.29958395018583817, + "learning_rate": 0.00011136194571718355, + "loss": 0.9215, + "step": 14262 + }, + { + "epoch": 3.042448805460751, + "grad_norm": 0.3209507274493531, + "learning_rate": 0.00011135411693976102, + "loss": 0.9142, + "step": 14263 + }, + { + "epoch": 3.0426621160409555, + "grad_norm": 0.2521479402064755, + "learning_rate": 0.00011134628780756615, + "loss": 0.9198, + "step": 14264 + }, + { + "epoch": 3.0428754266211606, + "grad_norm": 0.18625426948975948, + "learning_rate": 0.0001113384583206875, + "loss": 0.9057, + "step": 14265 + }, + { + "epoch": 3.0430887372013653, + "grad_norm": 0.2709287553374154, + "learning_rate": 0.00011133062847921366, + "loss": 0.9311, + "step": 14266 + }, + { + "epoch": 3.04330204778157, + "grad_norm": 0.2569797883044355, + "learning_rate": 0.00011132279828323322, + "loss": 0.942, + "step": 14267 + }, + { + "epoch": 3.0435153583617747, + "grad_norm": 0.22535105678029227, + "learning_rate": 0.00011131496773283481, + "loss": 0.9235, + "step": 14268 + }, + { + "epoch": 3.0437286689419794, + "grad_norm": 0.27889325121089764, + "learning_rate": 0.00011130713682810701, + "loss": 0.9148, + "step": 14269 + }, + { + "epoch": 3.0439419795221845, + "grad_norm": 0.29428960774445684, + "learning_rate": 0.00011129930556913843, + "loss": 0.916, + "step": 14270 + }, + { + "epoch": 3.044155290102389, + "grad_norm": 0.3013690669877336, + "learning_rate": 0.00011129147395601769, + "loss": 0.9239, + "step": 14271 + }, + { + "epoch": 3.044368600682594, + "grad_norm": 0.25982756780163707, + "learning_rate": 0.00011128364198883339, + "loss": 0.9217, + "step": 14272 + }, + { + "epoch": 3.0445819112627985, + "grad_norm": 0.3203238605512216, + "learning_rate": 0.00011127580966767414, + "loss": 0.9286, + "step": 14273 + }, + { + "epoch": 3.044795221843003, + "grad_norm": 0.30009174766320984, + "learning_rate": 0.00011126797699262858, + "loss": 0.9405, + "step": 14274 + }, + { + "epoch": 3.0450085324232083, + "grad_norm": 0.18734765052862645, + "learning_rate": 0.00011126014396378534, + "loss": 0.9274, + "step": 14275 + }, + { + "epoch": 3.045221843003413, + "grad_norm": 0.24874922136513394, + "learning_rate": 0.00011125231058123305, + "loss": 0.9273, + "step": 14276 + }, + { + "epoch": 3.0454351535836177, + "grad_norm": 0.3247286188840552, + "learning_rate": 0.00011124447684506035, + "loss": 0.9162, + "step": 14277 + }, + { + "epoch": 3.0456484641638224, + "grad_norm": 0.3131737763214921, + "learning_rate": 0.00011123664275535586, + "loss": 0.9331, + "step": 14278 + }, + { + "epoch": 3.0458617747440275, + "grad_norm": 0.30858259436619184, + "learning_rate": 0.00011122880831220821, + "loss": 0.899, + "step": 14279 + }, + { + "epoch": 3.046075085324232, + "grad_norm": 0.30522753070372755, + "learning_rate": 0.00011122097351570609, + "loss": 0.9531, + "step": 14280 + }, + { + "epoch": 3.046288395904437, + "grad_norm": 0.25373805006368205, + "learning_rate": 0.00011121313836593811, + "loss": 0.9128, + "step": 14281 + }, + { + "epoch": 3.0465017064846416, + "grad_norm": 0.24734792105420192, + "learning_rate": 0.00011120530286299299, + "loss": 0.9367, + "step": 14282 + }, + { + "epoch": 3.0467150170648463, + "grad_norm": 0.20204433076870443, + "learning_rate": 0.0001111974670069593, + "loss": 0.8944, + "step": 14283 + }, + { + "epoch": 3.0469283276450514, + "grad_norm": 0.20237618788382813, + "learning_rate": 0.00011118963079792576, + "loss": 0.9131, + "step": 14284 + }, + { + "epoch": 3.047141638225256, + "grad_norm": 0.2296535650746004, + "learning_rate": 0.00011118179423598101, + "loss": 0.8985, + "step": 14285 + }, + { + "epoch": 3.0473549488054608, + "grad_norm": 0.20866403856621626, + "learning_rate": 0.00011117395732121376, + "loss": 0.9177, + "step": 14286 + }, + { + "epoch": 3.0475682593856654, + "grad_norm": 0.22085720482868534, + "learning_rate": 0.00011116612005371263, + "loss": 0.9404, + "step": 14287 + }, + { + "epoch": 3.04778156996587, + "grad_norm": 0.20487487647184857, + "learning_rate": 0.00011115828243356636, + "loss": 0.9032, + "step": 14288 + }, + { + "epoch": 3.0479948805460753, + "grad_norm": 0.2498254151234445, + "learning_rate": 0.00011115044446086357, + "loss": 0.9088, + "step": 14289 + }, + { + "epoch": 3.04820819112628, + "grad_norm": 0.1768004780996099, + "learning_rate": 0.00011114260613569302, + "loss": 0.9139, + "step": 14290 + }, + { + "epoch": 3.0484215017064846, + "grad_norm": 0.22971273712176946, + "learning_rate": 0.00011113476745814328, + "loss": 0.9384, + "step": 14291 + }, + { + "epoch": 3.0486348122866893, + "grad_norm": 0.2606554029982431, + "learning_rate": 0.0001111269284283032, + "loss": 0.9048, + "step": 14292 + }, + { + "epoch": 3.048848122866894, + "grad_norm": 0.2242313451997666, + "learning_rate": 0.00011111908904626136, + "loss": 0.9664, + "step": 14293 + }, + { + "epoch": 3.049061433447099, + "grad_norm": 0.20969296679026866, + "learning_rate": 0.00011111124931210652, + "loss": 0.9477, + "step": 14294 + }, + { + "epoch": 3.049274744027304, + "grad_norm": 0.18647734369577904, + "learning_rate": 0.00011110340922592737, + "loss": 0.9245, + "step": 14295 + }, + { + "epoch": 3.0494880546075085, + "grad_norm": 0.2194295208206654, + "learning_rate": 0.00011109556878781262, + "loss": 0.9262, + "step": 14296 + }, + { + "epoch": 3.049701365187713, + "grad_norm": 0.20831742223555705, + "learning_rate": 0.00011108772799785098, + "loss": 0.9102, + "step": 14297 + }, + { + "epoch": 3.0499146757679183, + "grad_norm": 0.19430161619282324, + "learning_rate": 0.00011107988685613116, + "loss": 0.946, + "step": 14298 + }, + { + "epoch": 3.050127986348123, + "grad_norm": 0.1787742237500484, + "learning_rate": 0.00011107204536274193, + "loss": 0.9018, + "step": 14299 + }, + { + "epoch": 3.0503412969283277, + "grad_norm": 0.23628665371356597, + "learning_rate": 0.00011106420351777199, + "loss": 0.9066, + "step": 14300 + }, + { + "epoch": 3.0505546075085324, + "grad_norm": 0.2673474730227208, + "learning_rate": 0.00011105636132131003, + "loss": 0.9084, + "step": 14301 + }, + { + "epoch": 3.050767918088737, + "grad_norm": 0.25771305117283666, + "learning_rate": 0.00011104851877344483, + "loss": 0.9052, + "step": 14302 + }, + { + "epoch": 3.050981228668942, + "grad_norm": 0.22640976604245414, + "learning_rate": 0.0001110406758742651, + "loss": 0.9116, + "step": 14303 + }, + { + "epoch": 3.051194539249147, + "grad_norm": 0.25188371990787706, + "learning_rate": 0.00011103283262385963, + "loss": 0.8945, + "step": 14304 + }, + { + "epoch": 3.0514078498293515, + "grad_norm": 0.19239159216362808, + "learning_rate": 0.00011102498902231713, + "loss": 0.9041, + "step": 14305 + }, + { + "epoch": 3.051621160409556, + "grad_norm": 0.22107082584095808, + "learning_rate": 0.00011101714506972637, + "loss": 0.9124, + "step": 14306 + }, + { + "epoch": 3.051834470989761, + "grad_norm": 0.22310115752324922, + "learning_rate": 0.00011100930076617608, + "loss": 0.907, + "step": 14307 + }, + { + "epoch": 3.052047781569966, + "grad_norm": 0.1996166479507708, + "learning_rate": 0.00011100145611175502, + "loss": 0.9214, + "step": 14308 + }, + { + "epoch": 3.0522610921501707, + "grad_norm": 0.21930401242637146, + "learning_rate": 0.00011099361110655196, + "loss": 0.98, + "step": 14309 + }, + { + "epoch": 3.0524744027303754, + "grad_norm": 0.21375108990018143, + "learning_rate": 0.00011098576575065571, + "loss": 0.9194, + "step": 14310 + }, + { + "epoch": 3.05268771331058, + "grad_norm": 0.2191069169324803, + "learning_rate": 0.00011097792004415495, + "loss": 0.9089, + "step": 14311 + }, + { + "epoch": 3.0529010238907848, + "grad_norm": 0.17065889132763168, + "learning_rate": 0.00011097007398713854, + "loss": 0.9119, + "step": 14312 + }, + { + "epoch": 3.05311433447099, + "grad_norm": 0.1808067848586753, + "learning_rate": 0.00011096222757969519, + "loss": 0.8949, + "step": 14313 + }, + { + "epoch": 3.0533276450511946, + "grad_norm": 0.23404242383220922, + "learning_rate": 0.00011095438082191373, + "loss": 0.9312, + "step": 14314 + }, + { + "epoch": 3.0535409556313993, + "grad_norm": 0.260967093793636, + "learning_rate": 0.00011094653371388291, + "loss": 0.9256, + "step": 14315 + }, + { + "epoch": 3.053754266211604, + "grad_norm": 0.26217077165760744, + "learning_rate": 0.00011093868625569158, + "loss": 0.885, + "step": 14316 + }, + { + "epoch": 3.053967576791809, + "grad_norm": 0.2699788822812219, + "learning_rate": 0.00011093083844742847, + "loss": 0.9412, + "step": 14317 + }, + { + "epoch": 3.0541808873720138, + "grad_norm": 0.22666889497631357, + "learning_rate": 0.00011092299028918242, + "loss": 0.9086, + "step": 14318 + }, + { + "epoch": 3.0543941979522184, + "grad_norm": 0.18615327424135136, + "learning_rate": 0.00011091514178104219, + "loss": 0.9295, + "step": 14319 + }, + { + "epoch": 3.054607508532423, + "grad_norm": 0.27171435926169873, + "learning_rate": 0.00011090729292309661, + "loss": 0.9212, + "step": 14320 + }, + { + "epoch": 3.054820819112628, + "grad_norm": 0.3325441107263124, + "learning_rate": 0.00011089944371543452, + "loss": 0.933, + "step": 14321 + }, + { + "epoch": 3.055034129692833, + "grad_norm": 0.41450807746121654, + "learning_rate": 0.00011089159415814469, + "loss": 0.9176, + "step": 14322 + }, + { + "epoch": 3.0552474402730376, + "grad_norm": 0.520715226886973, + "learning_rate": 0.00011088374425131595, + "loss": 0.8987, + "step": 14323 + }, + { + "epoch": 3.0554607508532423, + "grad_norm": 0.5958505392741523, + "learning_rate": 0.00011087589399503712, + "loss": 0.9198, + "step": 14324 + }, + { + "epoch": 3.055674061433447, + "grad_norm": 0.5984601323894867, + "learning_rate": 0.00011086804338939702, + "loss": 0.9134, + "step": 14325 + }, + { + "epoch": 3.0558873720136517, + "grad_norm": 0.4606606122183332, + "learning_rate": 0.00011086019243448452, + "loss": 0.9171, + "step": 14326 + }, + { + "epoch": 3.056100682593857, + "grad_norm": 0.252675468276738, + "learning_rate": 0.00011085234113038841, + "loss": 0.9139, + "step": 14327 + }, + { + "epoch": 3.0563139931740615, + "grad_norm": 0.26014754178591665, + "learning_rate": 0.00011084448947719754, + "loss": 0.9031, + "step": 14328 + }, + { + "epoch": 3.056527303754266, + "grad_norm": 0.4558808353754653, + "learning_rate": 0.00011083663747500075, + "loss": 0.9468, + "step": 14329 + }, + { + "epoch": 3.056740614334471, + "grad_norm": 0.6829229385564519, + "learning_rate": 0.0001108287851238869, + "loss": 0.9328, + "step": 14330 + }, + { + "epoch": 3.0569539249146755, + "grad_norm": 0.5530299880802644, + "learning_rate": 0.00011082093242394479, + "loss": 0.9265, + "step": 14331 + }, + { + "epoch": 3.0571672354948807, + "grad_norm": 0.554224418681937, + "learning_rate": 0.00011081307937526336, + "loss": 0.9373, + "step": 14332 + }, + { + "epoch": 3.0573805460750854, + "grad_norm": 0.5595056140087433, + "learning_rate": 0.00011080522597793139, + "loss": 0.9377, + "step": 14333 + }, + { + "epoch": 3.05759385665529, + "grad_norm": 0.43719579722940716, + "learning_rate": 0.0001107973722320378, + "loss": 0.948, + "step": 14334 + }, + { + "epoch": 3.0578071672354947, + "grad_norm": 0.27688532734124816, + "learning_rate": 0.00011078951813767138, + "loss": 0.9576, + "step": 14335 + }, + { + "epoch": 3.0580204778157, + "grad_norm": 0.26078328087843405, + "learning_rate": 0.00011078166369492107, + "loss": 0.9301, + "step": 14336 + }, + { + "epoch": 3.0582337883959045, + "grad_norm": 0.40478639181754356, + "learning_rate": 0.0001107738089038757, + "loss": 0.9172, + "step": 14337 + }, + { + "epoch": 3.0584470989761092, + "grad_norm": 0.42854530945515396, + "learning_rate": 0.0001107659537646242, + "loss": 0.9055, + "step": 14338 + }, + { + "epoch": 3.058660409556314, + "grad_norm": 0.35645631569531677, + "learning_rate": 0.00011075809827725538, + "loss": 0.9326, + "step": 14339 + }, + { + "epoch": 3.0588737201365186, + "grad_norm": 0.3049701128420281, + "learning_rate": 0.0001107502424418582, + "loss": 0.9071, + "step": 14340 + }, + { + "epoch": 3.0590870307167237, + "grad_norm": 0.2396379052613858, + "learning_rate": 0.00011074238625852147, + "loss": 0.9262, + "step": 14341 + }, + { + "epoch": 3.0593003412969284, + "grad_norm": 0.26645886190973567, + "learning_rate": 0.00011073452972733414, + "loss": 0.9354, + "step": 14342 + }, + { + "epoch": 3.059513651877133, + "grad_norm": 0.3242555831055084, + "learning_rate": 0.00011072667284838506, + "loss": 0.8981, + "step": 14343 + }, + { + "epoch": 3.0597269624573378, + "grad_norm": 0.39130523374021137, + "learning_rate": 0.00011071881562176322, + "loss": 0.9186, + "step": 14344 + }, + { + "epoch": 3.0599402730375425, + "grad_norm": 0.34916526468844233, + "learning_rate": 0.00011071095804755741, + "loss": 0.908, + "step": 14345 + }, + { + "epoch": 3.0601535836177476, + "grad_norm": 0.2555542716932886, + "learning_rate": 0.00011070310012585662, + "loss": 0.92, + "step": 14346 + }, + { + "epoch": 3.0603668941979523, + "grad_norm": 0.27917429885713707, + "learning_rate": 0.00011069524185674973, + "loss": 0.8768, + "step": 14347 + }, + { + "epoch": 3.060580204778157, + "grad_norm": 0.3170482834139693, + "learning_rate": 0.00011068738324032567, + "loss": 0.8952, + "step": 14348 + }, + { + "epoch": 3.0607935153583616, + "grad_norm": 0.35874034992204473, + "learning_rate": 0.00011067952427667333, + "loss": 0.928, + "step": 14349 + }, + { + "epoch": 3.0610068259385668, + "grad_norm": 0.4049709272067847, + "learning_rate": 0.00011067166496588169, + "loss": 0.9124, + "step": 14350 + }, + { + "epoch": 3.0612201365187715, + "grad_norm": 0.33039348209292213, + "learning_rate": 0.00011066380530803962, + "loss": 0.9138, + "step": 14351 + }, + { + "epoch": 3.061433447098976, + "grad_norm": 0.1863437780884271, + "learning_rate": 0.0001106559453032361, + "loss": 0.9156, + "step": 14352 + }, + { + "epoch": 3.061646757679181, + "grad_norm": 0.28983871330717453, + "learning_rate": 0.00011064808495156, + "loss": 0.8959, + "step": 14353 + }, + { + "epoch": 3.0618600682593855, + "grad_norm": 0.38793179560841917, + "learning_rate": 0.00011064022425310033, + "loss": 0.9202, + "step": 14354 + }, + { + "epoch": 3.0620733788395906, + "grad_norm": 0.4236082056881992, + "learning_rate": 0.00011063236320794603, + "loss": 0.9128, + "step": 14355 + }, + { + "epoch": 3.0622866894197953, + "grad_norm": 0.3835907281678288, + "learning_rate": 0.00011062450181618601, + "loss": 0.8878, + "step": 14356 + }, + { + "epoch": 3.0625, + "grad_norm": 0.251395148450373, + "learning_rate": 0.00011061664007790921, + "loss": 0.9088, + "step": 14357 + }, + { + "epoch": 3.0627133105802047, + "grad_norm": 0.3140079750847553, + "learning_rate": 0.00011060877799320463, + "loss": 0.9214, + "step": 14358 + }, + { + "epoch": 3.0629266211604094, + "grad_norm": 0.49114678145230584, + "learning_rate": 0.0001106009155621612, + "loss": 0.9376, + "step": 14359 + }, + { + "epoch": 3.0631399317406145, + "grad_norm": 0.5118822296166238, + "learning_rate": 0.00011059305278486792, + "loss": 0.9194, + "step": 14360 + }, + { + "epoch": 3.063353242320819, + "grad_norm": 0.4326565469766259, + "learning_rate": 0.00011058518966141373, + "loss": 0.8883, + "step": 14361 + }, + { + "epoch": 3.063566552901024, + "grad_norm": 0.29726920834964743, + "learning_rate": 0.0001105773261918876, + "loss": 0.902, + "step": 14362 + }, + { + "epoch": 3.0637798634812285, + "grad_norm": 0.25063922265131416, + "learning_rate": 0.0001105694623763785, + "loss": 0.9292, + "step": 14363 + }, + { + "epoch": 3.0639931740614332, + "grad_norm": 0.2694106003708438, + "learning_rate": 0.00011056159821497543, + "loss": 0.9086, + "step": 14364 + }, + { + "epoch": 3.0642064846416384, + "grad_norm": 0.3658838428112316, + "learning_rate": 0.00011055373370776734, + "loss": 0.9224, + "step": 14365 + }, + { + "epoch": 3.064419795221843, + "grad_norm": 0.36799519855226964, + "learning_rate": 0.00011054586885484329, + "loss": 0.8998, + "step": 14366 + }, + { + "epoch": 3.0646331058020477, + "grad_norm": 0.33703370313476033, + "learning_rate": 0.00011053800365629218, + "loss": 0.8947, + "step": 14367 + }, + { + "epoch": 3.0648464163822524, + "grad_norm": 0.307252870235402, + "learning_rate": 0.00011053013811220304, + "loss": 0.9348, + "step": 14368 + }, + { + "epoch": 3.0650597269624575, + "grad_norm": 0.26053419764630154, + "learning_rate": 0.00011052227222266488, + "loss": 0.9333, + "step": 14369 + }, + { + "epoch": 3.0652730375426622, + "grad_norm": 0.1925892018266642, + "learning_rate": 0.00011051440598776671, + "loss": 0.9159, + "step": 14370 + }, + { + "epoch": 3.065486348122867, + "grad_norm": 0.29020596643087115, + "learning_rate": 0.0001105065394075975, + "loss": 0.9503, + "step": 14371 + }, + { + "epoch": 3.0656996587030716, + "grad_norm": 0.32840903145198275, + "learning_rate": 0.00011049867248224633, + "loss": 0.9393, + "step": 14372 + }, + { + "epoch": 3.0659129692832763, + "grad_norm": 0.2554556939901555, + "learning_rate": 0.00011049080521180212, + "loss": 0.8849, + "step": 14373 + }, + { + "epoch": 3.0661262798634814, + "grad_norm": 0.23433097712932247, + "learning_rate": 0.00011048293759635397, + "loss": 0.9768, + "step": 14374 + }, + { + "epoch": 3.066339590443686, + "grad_norm": 0.35811655694991446, + "learning_rate": 0.00011047506963599082, + "loss": 0.9209, + "step": 14375 + }, + { + "epoch": 3.0665529010238908, + "grad_norm": 0.3292376260497908, + "learning_rate": 0.0001104672013308018, + "loss": 0.9108, + "step": 14376 + }, + { + "epoch": 3.0667662116040955, + "grad_norm": 0.20488948217373848, + "learning_rate": 0.00011045933268087584, + "loss": 0.9108, + "step": 14377 + }, + { + "epoch": 3.0669795221843, + "grad_norm": 0.2642442781396092, + "learning_rate": 0.00011045146368630207, + "loss": 0.9509, + "step": 14378 + }, + { + "epoch": 3.0671928327645053, + "grad_norm": 0.3689004767358209, + "learning_rate": 0.00011044359434716942, + "loss": 0.9306, + "step": 14379 + }, + { + "epoch": 3.06740614334471, + "grad_norm": 0.38478442836884363, + "learning_rate": 0.00011043572466356702, + "loss": 0.8877, + "step": 14380 + }, + { + "epoch": 3.0676194539249146, + "grad_norm": 0.22126513013743987, + "learning_rate": 0.00011042785463558385, + "loss": 0.9377, + "step": 14381 + }, + { + "epoch": 3.0678327645051193, + "grad_norm": 0.32510018439518384, + "learning_rate": 0.00011041998426330904, + "loss": 0.9024, + "step": 14382 + }, + { + "epoch": 3.0680460750853245, + "grad_norm": 0.3100924480020656, + "learning_rate": 0.00011041211354683159, + "loss": 0.9361, + "step": 14383 + }, + { + "epoch": 3.068259385665529, + "grad_norm": 0.22335707484553102, + "learning_rate": 0.00011040424248624055, + "loss": 0.9264, + "step": 14384 + }, + { + "epoch": 3.068472696245734, + "grad_norm": 0.233659646797202, + "learning_rate": 0.00011039637108162497, + "loss": 0.972, + "step": 14385 + }, + { + "epoch": 3.0686860068259385, + "grad_norm": 0.24287485500555556, + "learning_rate": 0.00011038849933307398, + "loss": 0.9316, + "step": 14386 + }, + { + "epoch": 3.068899317406143, + "grad_norm": 0.4879525294120165, + "learning_rate": 0.00011038062724067658, + "loss": 0.9409, + "step": 14387 + }, + { + "epoch": 3.0691126279863483, + "grad_norm": 0.18241313615415797, + "learning_rate": 0.00011037275480452188, + "loss": 0.9153, + "step": 14388 + }, + { + "epoch": 3.069325938566553, + "grad_norm": 0.24826848374343938, + "learning_rate": 0.00011036488202469898, + "loss": 0.9322, + "step": 14389 + }, + { + "epoch": 3.0695392491467577, + "grad_norm": 0.28511815996688583, + "learning_rate": 0.0001103570089012969, + "loss": 0.9255, + "step": 14390 + }, + { + "epoch": 3.0697525597269624, + "grad_norm": 0.22752258877352705, + "learning_rate": 0.00011034913543440475, + "loss": 0.9219, + "step": 14391 + }, + { + "epoch": 3.069965870307167, + "grad_norm": 0.22557626337619144, + "learning_rate": 0.00011034126162411164, + "loss": 0.9018, + "step": 14392 + }, + { + "epoch": 3.070179180887372, + "grad_norm": 0.25337115160209456, + "learning_rate": 0.00011033338747050661, + "loss": 0.9199, + "step": 14393 + }, + { + "epoch": 3.070392491467577, + "grad_norm": 0.25161861159910576, + "learning_rate": 0.00011032551297367882, + "loss": 0.9124, + "step": 14394 + }, + { + "epoch": 3.0706058020477816, + "grad_norm": 0.244987492582416, + "learning_rate": 0.00011031763813371733, + "loss": 0.9109, + "step": 14395 + }, + { + "epoch": 3.0708191126279862, + "grad_norm": 0.2579747953452988, + "learning_rate": 0.00011030976295071127, + "loss": 0.9285, + "step": 14396 + }, + { + "epoch": 3.071032423208191, + "grad_norm": 0.256020810548122, + "learning_rate": 0.0001103018874247497, + "loss": 0.9331, + "step": 14397 + }, + { + "epoch": 3.071245733788396, + "grad_norm": 0.283668007087908, + "learning_rate": 0.0001102940115559218, + "loss": 0.9111, + "step": 14398 + }, + { + "epoch": 3.0714590443686007, + "grad_norm": 0.27493218283306325, + "learning_rate": 0.00011028613534431663, + "loss": 0.9408, + "step": 14399 + }, + { + "epoch": 3.0716723549488054, + "grad_norm": 0.21036802247005595, + "learning_rate": 0.00011027825879002334, + "loss": 0.8999, + "step": 14400 + }, + { + "epoch": 3.07188566552901, + "grad_norm": 0.2060878888853648, + "learning_rate": 0.00011027038189313104, + "loss": 0.894, + "step": 14401 + }, + { + "epoch": 3.0720989761092152, + "grad_norm": 0.21817492480311032, + "learning_rate": 0.00011026250465372885, + "loss": 0.9324, + "step": 14402 + }, + { + "epoch": 3.07231228668942, + "grad_norm": 0.18724222047152395, + "learning_rate": 0.00011025462707190591, + "loss": 0.9259, + "step": 14403 + }, + { + "epoch": 3.0725255972696246, + "grad_norm": 0.2173197797417186, + "learning_rate": 0.00011024674914775138, + "loss": 0.9201, + "step": 14404 + }, + { + "epoch": 3.0727389078498293, + "grad_norm": 0.22898168051334958, + "learning_rate": 0.00011023887088135436, + "loss": 0.9303, + "step": 14405 + }, + { + "epoch": 3.072952218430034, + "grad_norm": 0.2011376340698683, + "learning_rate": 0.00011023099227280398, + "loss": 0.9285, + "step": 14406 + }, + { + "epoch": 3.073165529010239, + "grad_norm": 0.19795395073069752, + "learning_rate": 0.00011022311332218945, + "loss": 0.9019, + "step": 14407 + }, + { + "epoch": 3.073378839590444, + "grad_norm": 0.23048181097882142, + "learning_rate": 0.00011021523402959989, + "loss": 0.9198, + "step": 14408 + }, + { + "epoch": 3.0735921501706485, + "grad_norm": 0.30227356915723075, + "learning_rate": 0.0001102073543951244, + "loss": 0.9247, + "step": 14409 + }, + { + "epoch": 3.073805460750853, + "grad_norm": 0.36309628053509746, + "learning_rate": 0.00011019947441885223, + "loss": 0.9144, + "step": 14410 + }, + { + "epoch": 3.074018771331058, + "grad_norm": 0.3703573850268183, + "learning_rate": 0.00011019159410087249, + "loss": 0.8937, + "step": 14411 + }, + { + "epoch": 3.074232081911263, + "grad_norm": 0.3482515764325422, + "learning_rate": 0.00011018371344127435, + "loss": 0.8986, + "step": 14412 + }, + { + "epoch": 3.0744453924914676, + "grad_norm": 0.3200629837348808, + "learning_rate": 0.00011017583244014699, + "loss": 0.9097, + "step": 14413 + }, + { + "epoch": 3.0746587030716723, + "grad_norm": 0.261447801846783, + "learning_rate": 0.00011016795109757959, + "loss": 0.9049, + "step": 14414 + }, + { + "epoch": 3.074872013651877, + "grad_norm": 0.2180375394217375, + "learning_rate": 0.00011016006941366128, + "loss": 0.9148, + "step": 14415 + }, + { + "epoch": 3.0750853242320817, + "grad_norm": 0.2264485010762273, + "learning_rate": 0.00011015218738848132, + "loss": 0.9108, + "step": 14416 + }, + { + "epoch": 3.075298634812287, + "grad_norm": 0.3648274117363372, + "learning_rate": 0.00011014430502212883, + "loss": 0.9384, + "step": 14417 + }, + { + "epoch": 3.0755119453924915, + "grad_norm": 0.39498609154845543, + "learning_rate": 0.000110136422314693, + "loss": 0.9424, + "step": 14418 + }, + { + "epoch": 3.075725255972696, + "grad_norm": 0.3728946821701374, + "learning_rate": 0.00011012853926626307, + "loss": 0.9377, + "step": 14419 + }, + { + "epoch": 3.075938566552901, + "grad_norm": 0.2881596539279631, + "learning_rate": 0.00011012065587692822, + "loss": 0.9462, + "step": 14420 + }, + { + "epoch": 3.076151877133106, + "grad_norm": 0.19504351496277195, + "learning_rate": 0.00011011277214677762, + "loss": 0.9147, + "step": 14421 + }, + { + "epoch": 3.0763651877133107, + "grad_norm": 0.2503096553608186, + "learning_rate": 0.00011010488807590051, + "loss": 0.932, + "step": 14422 + }, + { + "epoch": 3.0765784982935154, + "grad_norm": 0.3518335375694855, + "learning_rate": 0.00011009700366438608, + "loss": 0.8921, + "step": 14423 + }, + { + "epoch": 3.07679180887372, + "grad_norm": 0.4307955956227786, + "learning_rate": 0.00011008911891232355, + "loss": 0.9254, + "step": 14424 + }, + { + "epoch": 3.0770051194539247, + "grad_norm": 0.3884580448481864, + "learning_rate": 0.00011008123381980213, + "loss": 0.9356, + "step": 14425 + }, + { + "epoch": 3.07721843003413, + "grad_norm": 0.23205924271480308, + "learning_rate": 0.00011007334838691105, + "loss": 0.867, + "step": 14426 + }, + { + "epoch": 3.0774317406143346, + "grad_norm": 0.21851320015467596, + "learning_rate": 0.00011006546261373952, + "loss": 0.8963, + "step": 14427 + }, + { + "epoch": 3.0776450511945392, + "grad_norm": 0.24615855740106682, + "learning_rate": 0.0001100575765003768, + "loss": 0.8796, + "step": 14428 + }, + { + "epoch": 3.077858361774744, + "grad_norm": 0.23144953341568325, + "learning_rate": 0.00011004969004691206, + "loss": 0.8984, + "step": 14429 + }, + { + "epoch": 3.0780716723549486, + "grad_norm": 0.22877320938729656, + "learning_rate": 0.00011004180325343458, + "loss": 0.9138, + "step": 14430 + }, + { + "epoch": 3.0782849829351537, + "grad_norm": 0.19922870889276068, + "learning_rate": 0.0001100339161200336, + "loss": 0.8928, + "step": 14431 + }, + { + "epoch": 3.0784982935153584, + "grad_norm": 0.1909711618229296, + "learning_rate": 0.00011002602864679836, + "loss": 0.9268, + "step": 14432 + }, + { + "epoch": 3.078711604095563, + "grad_norm": 0.8081977445405052, + "learning_rate": 0.00011001814083381811, + "loss": 0.9652, + "step": 14433 + }, + { + "epoch": 3.078924914675768, + "grad_norm": 0.2844642677867351, + "learning_rate": 0.00011001025268118205, + "loss": 0.9337, + "step": 14434 + }, + { + "epoch": 3.0791382252559725, + "grad_norm": 0.3334231860025832, + "learning_rate": 0.00011000236418897949, + "loss": 0.9028, + "step": 14435 + }, + { + "epoch": 3.0793515358361776, + "grad_norm": 0.4032338036003613, + "learning_rate": 0.00010999447535729968, + "loss": 0.9311, + "step": 14436 + }, + { + "epoch": 3.0795648464163823, + "grad_norm": 0.45314602940532306, + "learning_rate": 0.00010998658618623189, + "loss": 0.8996, + "step": 14437 + }, + { + "epoch": 3.079778156996587, + "grad_norm": 0.438596510748165, + "learning_rate": 0.00010997869667586538, + "loss": 0.9241, + "step": 14438 + }, + { + "epoch": 3.0799914675767917, + "grad_norm": 0.41642291800390385, + "learning_rate": 0.00010997080682628939, + "loss": 0.9127, + "step": 14439 + }, + { + "epoch": 3.080204778156997, + "grad_norm": 0.46405249905518037, + "learning_rate": 0.00010996291663759321, + "loss": 0.9121, + "step": 14440 + }, + { + "epoch": 3.0804180887372015, + "grad_norm": 0.411177188156526, + "learning_rate": 0.00010995502610986615, + "loss": 0.9257, + "step": 14441 + }, + { + "epoch": 3.080631399317406, + "grad_norm": 0.27962852310922887, + "learning_rate": 0.00010994713524319746, + "loss": 0.9287, + "step": 14442 + }, + { + "epoch": 3.080844709897611, + "grad_norm": 0.2845382001857501, + "learning_rate": 0.00010993924403767641, + "loss": 0.9274, + "step": 14443 + }, + { + "epoch": 3.0810580204778155, + "grad_norm": 0.45284164148719885, + "learning_rate": 0.00010993135249339233, + "loss": 0.9595, + "step": 14444 + }, + { + "epoch": 3.0812713310580206, + "grad_norm": 0.4896860640957607, + "learning_rate": 0.0001099234606104345, + "loss": 0.9084, + "step": 14445 + }, + { + "epoch": 3.0814846416382253, + "grad_norm": 0.367908890424525, + "learning_rate": 0.00010991556838889218, + "loss": 0.8826, + "step": 14446 + }, + { + "epoch": 3.08169795221843, + "grad_norm": 0.32032138134020405, + "learning_rate": 0.00010990767582885473, + "loss": 0.9236, + "step": 14447 + }, + { + "epoch": 3.0819112627986347, + "grad_norm": 0.3507459610395805, + "learning_rate": 0.00010989978293041139, + "loss": 0.9223, + "step": 14448 + }, + { + "epoch": 3.0821245733788394, + "grad_norm": 0.3496340708708527, + "learning_rate": 0.00010989188969365155, + "loss": 0.9338, + "step": 14449 + }, + { + "epoch": 3.0823378839590445, + "grad_norm": 0.26120419560694286, + "learning_rate": 0.00010988399611866446, + "loss": 0.8957, + "step": 14450 + }, + { + "epoch": 3.082551194539249, + "grad_norm": 0.35623203599016257, + "learning_rate": 0.00010987610220553944, + "loss": 0.9208, + "step": 14451 + }, + { + "epoch": 3.082764505119454, + "grad_norm": 0.3543906381715164, + "learning_rate": 0.00010986820795436582, + "loss": 0.9199, + "step": 14452 + }, + { + "epoch": 3.0829778156996586, + "grad_norm": 0.21270892774586575, + "learning_rate": 0.00010986031336523293, + "loss": 0.9236, + "step": 14453 + }, + { + "epoch": 3.0831911262798637, + "grad_norm": 0.3041282323508259, + "learning_rate": 0.00010985241843823007, + "loss": 0.9272, + "step": 14454 + }, + { + "epoch": 3.0834044368600684, + "grad_norm": 0.2974944499902165, + "learning_rate": 0.00010984452317344662, + "loss": 0.9468, + "step": 14455 + }, + { + "epoch": 3.083617747440273, + "grad_norm": 0.3061395812914622, + "learning_rate": 0.0001098366275709719, + "loss": 0.8872, + "step": 14456 + }, + { + "epoch": 3.0838310580204777, + "grad_norm": 0.3628884880700033, + "learning_rate": 0.00010982873163089521, + "loss": 0.9501, + "step": 14457 + }, + { + "epoch": 3.0840443686006824, + "grad_norm": 0.32079149447075544, + "learning_rate": 0.00010982083535330593, + "loss": 0.9373, + "step": 14458 + }, + { + "epoch": 3.0842576791808876, + "grad_norm": 0.20652880160766451, + "learning_rate": 0.0001098129387382934, + "loss": 0.9081, + "step": 14459 + }, + { + "epoch": 3.0844709897610922, + "grad_norm": 0.22971392338308466, + "learning_rate": 0.00010980504178594693, + "loss": 0.9191, + "step": 14460 + }, + { + "epoch": 3.084684300341297, + "grad_norm": 0.2762039884584651, + "learning_rate": 0.00010979714449635598, + "loss": 0.9141, + "step": 14461 + }, + { + "epoch": 3.0848976109215016, + "grad_norm": 0.20839083667930766, + "learning_rate": 0.00010978924686960979, + "loss": 0.948, + "step": 14462 + }, + { + "epoch": 3.0851109215017063, + "grad_norm": 0.27062583722533723, + "learning_rate": 0.00010978134890579779, + "loss": 0.9088, + "step": 14463 + }, + { + "epoch": 3.0853242320819114, + "grad_norm": 0.24673560047604842, + "learning_rate": 0.00010977345060500931, + "loss": 0.8998, + "step": 14464 + }, + { + "epoch": 3.085537542662116, + "grad_norm": 0.2250237409486631, + "learning_rate": 0.00010976555196733375, + "loss": 0.9114, + "step": 14465 + }, + { + "epoch": 3.085750853242321, + "grad_norm": 0.24343469689606934, + "learning_rate": 0.00010975765299286045, + "loss": 0.9373, + "step": 14466 + }, + { + "epoch": 3.0859641638225255, + "grad_norm": 0.20393237400195635, + "learning_rate": 0.00010974975368167883, + "loss": 0.9165, + "step": 14467 + }, + { + "epoch": 3.0861774744027306, + "grad_norm": 0.24154189142388402, + "learning_rate": 0.00010974185403387824, + "loss": 0.9234, + "step": 14468 + }, + { + "epoch": 3.0863907849829353, + "grad_norm": 0.3262475719360999, + "learning_rate": 0.00010973395404954806, + "loss": 0.9261, + "step": 14469 + }, + { + "epoch": 3.08660409556314, + "grad_norm": 0.2702260321989876, + "learning_rate": 0.0001097260537287777, + "loss": 0.9193, + "step": 14470 + }, + { + "epoch": 3.0868174061433447, + "grad_norm": 0.2506541639599041, + "learning_rate": 0.00010971815307165653, + "loss": 0.9245, + "step": 14471 + }, + { + "epoch": 3.0870307167235493, + "grad_norm": 0.3021259553079465, + "learning_rate": 0.00010971025207827398, + "loss": 0.9431, + "step": 14472 + }, + { + "epoch": 3.0872440273037545, + "grad_norm": 0.28170710188833126, + "learning_rate": 0.00010970235074871942, + "loss": 0.9115, + "step": 14473 + }, + { + "epoch": 3.087457337883959, + "grad_norm": 0.2448474054639014, + "learning_rate": 0.00010969444908308224, + "loss": 0.9379, + "step": 14474 + }, + { + "epoch": 3.087670648464164, + "grad_norm": 0.22476364984973438, + "learning_rate": 0.00010968654708145188, + "loss": 0.9392, + "step": 14475 + }, + { + "epoch": 3.0878839590443685, + "grad_norm": 0.21631484731192493, + "learning_rate": 0.00010967864474391775, + "loss": 0.9179, + "step": 14476 + }, + { + "epoch": 3.088097269624573, + "grad_norm": 0.23789636326731553, + "learning_rate": 0.00010967074207056926, + "loss": 0.914, + "step": 14477 + }, + { + "epoch": 3.0883105802047783, + "grad_norm": 0.232711463075209, + "learning_rate": 0.00010966283906149581, + "loss": 0.9481, + "step": 14478 + }, + { + "epoch": 3.088523890784983, + "grad_norm": 0.2182718138802797, + "learning_rate": 0.00010965493571678685, + "loss": 0.9499, + "step": 14479 + }, + { + "epoch": 3.0887372013651877, + "grad_norm": 0.1954696790232568, + "learning_rate": 0.00010964703203653178, + "loss": 0.9098, + "step": 14480 + }, + { + "epoch": 3.0889505119453924, + "grad_norm": 0.23476692603540897, + "learning_rate": 0.00010963912802082006, + "loss": 0.9315, + "step": 14481 + }, + { + "epoch": 3.089163822525597, + "grad_norm": 0.2146648453383994, + "learning_rate": 0.00010963122366974109, + "loss": 0.9384, + "step": 14482 + }, + { + "epoch": 3.089377133105802, + "grad_norm": 0.21031059034559893, + "learning_rate": 0.00010962331898338435, + "loss": 0.8865, + "step": 14483 + }, + { + "epoch": 3.089590443686007, + "grad_norm": 0.2715543554936318, + "learning_rate": 0.00010961541396183923, + "loss": 0.8923, + "step": 14484 + }, + { + "epoch": 3.0898037542662116, + "grad_norm": 0.33058829793083966, + "learning_rate": 0.00010960750860519523, + "loss": 0.908, + "step": 14485 + }, + { + "epoch": 3.0900170648464163, + "grad_norm": 0.2940559352804855, + "learning_rate": 0.00010959960291354174, + "loss": 0.8948, + "step": 14486 + }, + { + "epoch": 3.0902303754266214, + "grad_norm": 0.2651667312650918, + "learning_rate": 0.00010959169688696826, + "loss": 0.9215, + "step": 14487 + }, + { + "epoch": 3.090443686006826, + "grad_norm": 0.1976549835818966, + "learning_rate": 0.00010958379052556424, + "loss": 0.9187, + "step": 14488 + }, + { + "epoch": 3.0906569965870307, + "grad_norm": 0.18133347899046667, + "learning_rate": 0.00010957588382941915, + "loss": 0.912, + "step": 14489 + }, + { + "epoch": 3.0908703071672354, + "grad_norm": 0.2098513776466674, + "learning_rate": 0.00010956797679862238, + "loss": 0.9327, + "step": 14490 + }, + { + "epoch": 3.09108361774744, + "grad_norm": 0.21906755447948523, + "learning_rate": 0.00010956006943326352, + "loss": 0.9189, + "step": 14491 + }, + { + "epoch": 3.0912969283276452, + "grad_norm": 0.22570607448902755, + "learning_rate": 0.00010955216173343192, + "loss": 0.9073, + "step": 14492 + }, + { + "epoch": 3.09151023890785, + "grad_norm": 0.23857325182420752, + "learning_rate": 0.00010954425369921715, + "loss": 0.8967, + "step": 14493 + }, + { + "epoch": 3.0917235494880546, + "grad_norm": 0.2754022672518916, + "learning_rate": 0.00010953634533070863, + "loss": 0.8784, + "step": 14494 + }, + { + "epoch": 3.0919368600682593, + "grad_norm": 0.2545783165361586, + "learning_rate": 0.0001095284366279959, + "loss": 0.9369, + "step": 14495 + }, + { + "epoch": 3.092150170648464, + "grad_norm": 0.2616947463771434, + "learning_rate": 0.00010952052759116837, + "loss": 0.9027, + "step": 14496 + }, + { + "epoch": 3.092363481228669, + "grad_norm": 0.23520202719117525, + "learning_rate": 0.0001095126182203156, + "loss": 0.9137, + "step": 14497 + }, + { + "epoch": 3.092576791808874, + "grad_norm": 0.23751234632266763, + "learning_rate": 0.00010950470851552704, + "loss": 0.9115, + "step": 14498 + }, + { + "epoch": 3.0927901023890785, + "grad_norm": 0.2181343804951993, + "learning_rate": 0.00010949679847689223, + "loss": 0.8981, + "step": 14499 + }, + { + "epoch": 3.093003412969283, + "grad_norm": 0.2117182101787509, + "learning_rate": 0.00010948888810450063, + "loss": 0.9238, + "step": 14500 + }, + { + "epoch": 3.093216723549488, + "grad_norm": 0.2073185154528051, + "learning_rate": 0.00010948097739844178, + "loss": 0.8904, + "step": 14501 + }, + { + "epoch": 3.093430034129693, + "grad_norm": 0.21641920867986364, + "learning_rate": 0.00010947306635880517, + "loss": 0.954, + "step": 14502 + }, + { + "epoch": 3.0936433447098977, + "grad_norm": 0.2155784885120158, + "learning_rate": 0.00010946515498568032, + "loss": 0.9375, + "step": 14503 + }, + { + "epoch": 3.0938566552901023, + "grad_norm": 0.21080364377496472, + "learning_rate": 0.00010945724327915673, + "loss": 0.9164, + "step": 14504 + }, + { + "epoch": 3.094069965870307, + "grad_norm": 0.2104645658166398, + "learning_rate": 0.00010944933123932395, + "loss": 0.9079, + "step": 14505 + }, + { + "epoch": 3.094283276450512, + "grad_norm": 0.209826593489626, + "learning_rate": 0.0001094414188662715, + "loss": 0.9375, + "step": 14506 + }, + { + "epoch": 3.094496587030717, + "grad_norm": 0.25251698722458715, + "learning_rate": 0.00010943350616008889, + "loss": 0.9384, + "step": 14507 + }, + { + "epoch": 3.0947098976109215, + "grad_norm": 0.23845085985194409, + "learning_rate": 0.00010942559312086567, + "loss": 0.9119, + "step": 14508 + }, + { + "epoch": 3.094923208191126, + "grad_norm": 0.20779586284504636, + "learning_rate": 0.00010941767974869136, + "loss": 0.9159, + "step": 14509 + }, + { + "epoch": 3.095136518771331, + "grad_norm": 0.25633920245011943, + "learning_rate": 0.00010940976604365549, + "loss": 0.8992, + "step": 14510 + }, + { + "epoch": 3.095349829351536, + "grad_norm": 0.23691161892052814, + "learning_rate": 0.00010940185200584765, + "loss": 0.9425, + "step": 14511 + }, + { + "epoch": 3.0955631399317407, + "grad_norm": 0.3359444035768998, + "learning_rate": 0.00010939393763535736, + "loss": 0.8933, + "step": 14512 + }, + { + "epoch": 3.0957764505119454, + "grad_norm": 0.2665662929306529, + "learning_rate": 0.00010938602293227418, + "loss": 0.8825, + "step": 14513 + }, + { + "epoch": 3.09598976109215, + "grad_norm": 0.2574626937801475, + "learning_rate": 0.00010937810789668762, + "loss": 0.9111, + "step": 14514 + }, + { + "epoch": 3.0962030716723548, + "grad_norm": 0.25339964330353176, + "learning_rate": 0.0001093701925286873, + "loss": 0.9375, + "step": 14515 + }, + { + "epoch": 3.09641638225256, + "grad_norm": 0.21878785706759887, + "learning_rate": 0.00010936227682836274, + "loss": 0.9325, + "step": 14516 + }, + { + "epoch": 3.0966296928327646, + "grad_norm": 0.27916743487294954, + "learning_rate": 0.00010935436079580356, + "loss": 0.9222, + "step": 14517 + }, + { + "epoch": 3.0968430034129693, + "grad_norm": 0.35658480565611544, + "learning_rate": 0.00010934644443109927, + "loss": 0.9298, + "step": 14518 + }, + { + "epoch": 3.097056313993174, + "grad_norm": 0.2841545244146011, + "learning_rate": 0.00010933852773433947, + "loss": 0.8965, + "step": 14519 + }, + { + "epoch": 3.0972696245733786, + "grad_norm": 0.3095651935105379, + "learning_rate": 0.00010933061070561374, + "loss": 0.907, + "step": 14520 + }, + { + "epoch": 3.0974829351535837, + "grad_norm": 0.24913635511600626, + "learning_rate": 0.00010932269334501165, + "loss": 0.9005, + "step": 14521 + }, + { + "epoch": 3.0976962457337884, + "grad_norm": 0.2179677725425343, + "learning_rate": 0.00010931477565262278, + "loss": 0.9141, + "step": 14522 + }, + { + "epoch": 3.097909556313993, + "grad_norm": 0.29576262634545386, + "learning_rate": 0.00010930685762853675, + "loss": 0.9308, + "step": 14523 + }, + { + "epoch": 3.098122866894198, + "grad_norm": 0.3645720230771954, + "learning_rate": 0.00010929893927284312, + "loss": 0.9109, + "step": 14524 + }, + { + "epoch": 3.098336177474403, + "grad_norm": 0.31869700430312264, + "learning_rate": 0.00010929102058563151, + "loss": 0.9106, + "step": 14525 + }, + { + "epoch": 3.0985494880546076, + "grad_norm": 0.3183918185675771, + "learning_rate": 0.0001092831015669915, + "loss": 0.9304, + "step": 14526 + }, + { + "epoch": 3.0987627986348123, + "grad_norm": 0.48549072734580095, + "learning_rate": 0.0001092751822170127, + "loss": 0.9376, + "step": 14527 + }, + { + "epoch": 3.098976109215017, + "grad_norm": 0.41248941028445113, + "learning_rate": 0.00010926726253578475, + "loss": 0.885, + "step": 14528 + }, + { + "epoch": 3.0991894197952217, + "grad_norm": 0.26419132408096435, + "learning_rate": 0.00010925934252339721, + "loss": 0.9096, + "step": 14529 + }, + { + "epoch": 3.099402730375427, + "grad_norm": 0.3875934875937178, + "learning_rate": 0.00010925142217993972, + "loss": 0.9072, + "step": 14530 + }, + { + "epoch": 3.0996160409556315, + "grad_norm": 0.38073260666956077, + "learning_rate": 0.00010924350150550189, + "loss": 0.9205, + "step": 14531 + }, + { + "epoch": 3.099829351535836, + "grad_norm": 0.35747977470545544, + "learning_rate": 0.00010923558050017336, + "loss": 0.9113, + "step": 14532 + }, + { + "epoch": 3.100042662116041, + "grad_norm": 0.3940140918582688, + "learning_rate": 0.00010922765916404372, + "loss": 0.8924, + "step": 14533 + }, + { + "epoch": 3.1002559726962455, + "grad_norm": 0.387951211211889, + "learning_rate": 0.00010921973749720264, + "loss": 0.9333, + "step": 14534 + }, + { + "epoch": 3.1004692832764507, + "grad_norm": 0.3750099638893742, + "learning_rate": 0.00010921181549973975, + "loss": 0.9406, + "step": 14535 + }, + { + "epoch": 3.1006825938566553, + "grad_norm": 0.38724620351619216, + "learning_rate": 0.00010920389317174466, + "loss": 0.9314, + "step": 14536 + }, + { + "epoch": 3.10089590443686, + "grad_norm": 0.3486441782981055, + "learning_rate": 0.00010919597051330704, + "loss": 0.9262, + "step": 14537 + }, + { + "epoch": 3.1011092150170647, + "grad_norm": 0.30663789618779225, + "learning_rate": 0.00010918804752451651, + "loss": 0.9234, + "step": 14538 + }, + { + "epoch": 3.1013225255972694, + "grad_norm": 0.24627819063045955, + "learning_rate": 0.00010918012420546274, + "loss": 0.9487, + "step": 14539 + }, + { + "epoch": 3.1015358361774745, + "grad_norm": 0.21385331649746286, + "learning_rate": 0.00010917220055623539, + "loss": 0.9086, + "step": 14540 + }, + { + "epoch": 3.101749146757679, + "grad_norm": 0.2561903869308734, + "learning_rate": 0.00010916427657692407, + "loss": 0.9018, + "step": 14541 + }, + { + "epoch": 3.101962457337884, + "grad_norm": 0.25247131866328376, + "learning_rate": 0.00010915635226761847, + "loss": 0.9263, + "step": 14542 + }, + { + "epoch": 3.1021757679180886, + "grad_norm": 0.3107191905558907, + "learning_rate": 0.00010914842762840828, + "loss": 0.9234, + "step": 14543 + }, + { + "epoch": 3.1023890784982937, + "grad_norm": 0.3083048608489409, + "learning_rate": 0.00010914050265938311, + "loss": 0.9367, + "step": 14544 + }, + { + "epoch": 3.1026023890784984, + "grad_norm": 0.23709371463447937, + "learning_rate": 0.00010913257736063272, + "loss": 0.9447, + "step": 14545 + }, + { + "epoch": 3.102815699658703, + "grad_norm": 0.23608942703879457, + "learning_rate": 0.00010912465173224667, + "loss": 0.9114, + "step": 14546 + }, + { + "epoch": 3.1030290102389078, + "grad_norm": 0.29244939821236676, + "learning_rate": 0.00010911672577431471, + "loss": 0.9418, + "step": 14547 + }, + { + "epoch": 3.1032423208191124, + "grad_norm": 0.27608320938018094, + "learning_rate": 0.0001091087994869265, + "loss": 0.9144, + "step": 14548 + }, + { + "epoch": 3.1034556313993176, + "grad_norm": 0.2362918805087767, + "learning_rate": 0.00010910087287017174, + "loss": 0.9353, + "step": 14549 + }, + { + "epoch": 3.1036689419795223, + "grad_norm": 0.2349337550901224, + "learning_rate": 0.00010909294592414011, + "loss": 0.8838, + "step": 14550 + }, + { + "epoch": 3.103882252559727, + "grad_norm": 0.26974020393392195, + "learning_rate": 0.00010908501864892135, + "loss": 0.8984, + "step": 14551 + }, + { + "epoch": 3.1040955631399316, + "grad_norm": 0.21931958726446132, + "learning_rate": 0.00010907709104460507, + "loss": 0.9016, + "step": 14552 + }, + { + "epoch": 3.1043088737201363, + "grad_norm": 0.22340701891897183, + "learning_rate": 0.00010906916311128103, + "loss": 0.9432, + "step": 14553 + }, + { + "epoch": 3.1045221843003414, + "grad_norm": 0.29546726030253445, + "learning_rate": 0.00010906123484903892, + "loss": 0.9014, + "step": 14554 + }, + { + "epoch": 3.104735494880546, + "grad_norm": 0.2621598331874885, + "learning_rate": 0.00010905330625796844, + "loss": 0.9111, + "step": 14555 + }, + { + "epoch": 3.104948805460751, + "grad_norm": 0.2504079363078303, + "learning_rate": 0.00010904537733815933, + "loss": 0.8994, + "step": 14556 + }, + { + "epoch": 3.1051621160409555, + "grad_norm": 0.2730165400759127, + "learning_rate": 0.00010903744808970127, + "loss": 0.9152, + "step": 14557 + }, + { + "epoch": 3.1053754266211606, + "grad_norm": 0.25488305334983324, + "learning_rate": 0.00010902951851268399, + "loss": 0.9192, + "step": 14558 + }, + { + "epoch": 3.1055887372013653, + "grad_norm": 0.27472785418263695, + "learning_rate": 0.00010902158860719724, + "loss": 0.9076, + "step": 14559 + }, + { + "epoch": 3.10580204778157, + "grad_norm": 0.26716513348436877, + "learning_rate": 0.0001090136583733307, + "loss": 0.9289, + "step": 14560 + }, + { + "epoch": 3.1060153583617747, + "grad_norm": 0.21442675554235105, + "learning_rate": 0.00010900572781117415, + "loss": 0.9354, + "step": 14561 + }, + { + "epoch": 3.1062286689419794, + "grad_norm": 0.2000011641280503, + "learning_rate": 0.0001089977969208173, + "loss": 0.9504, + "step": 14562 + }, + { + "epoch": 3.1064419795221845, + "grad_norm": 0.2309815380809141, + "learning_rate": 0.0001089898657023499, + "loss": 0.9634, + "step": 14563 + }, + { + "epoch": 3.106655290102389, + "grad_norm": 0.21060333208301016, + "learning_rate": 0.00010898193415586165, + "loss": 0.9474, + "step": 14564 + }, + { + "epoch": 3.106868600682594, + "grad_norm": 0.2056918891170931, + "learning_rate": 0.00010897400228144236, + "loss": 0.9038, + "step": 14565 + }, + { + "epoch": 3.1070819112627985, + "grad_norm": 0.20621793725040105, + "learning_rate": 0.00010896607007918172, + "loss": 0.9057, + "step": 14566 + }, + { + "epoch": 3.107295221843003, + "grad_norm": 0.21287132386190974, + "learning_rate": 0.00010895813754916952, + "loss": 0.9508, + "step": 14567 + }, + { + "epoch": 3.1075085324232083, + "grad_norm": 0.22680895793840958, + "learning_rate": 0.0001089502046914955, + "loss": 0.908, + "step": 14568 + }, + { + "epoch": 3.107721843003413, + "grad_norm": 0.24877364277437236, + "learning_rate": 0.00010894227150624945, + "loss": 0.9293, + "step": 14569 + }, + { + "epoch": 3.1079351535836177, + "grad_norm": 0.1882529316646171, + "learning_rate": 0.0001089343379935211, + "loss": 0.9419, + "step": 14570 + }, + { + "epoch": 3.1081484641638224, + "grad_norm": 0.22750073542287857, + "learning_rate": 0.00010892640415340023, + "loss": 0.9048, + "step": 14571 + }, + { + "epoch": 3.1083617747440275, + "grad_norm": 0.2310211094980242, + "learning_rate": 0.00010891846998597659, + "loss": 0.9406, + "step": 14572 + }, + { + "epoch": 3.108575085324232, + "grad_norm": 0.20920017749225897, + "learning_rate": 0.00010891053549134, + "loss": 0.9098, + "step": 14573 + }, + { + "epoch": 3.108788395904437, + "grad_norm": 0.31949797931643087, + "learning_rate": 0.00010890260066958022, + "loss": 0.9079, + "step": 14574 + }, + { + "epoch": 3.1090017064846416, + "grad_norm": 0.22068113489697352, + "learning_rate": 0.000108894665520787, + "loss": 0.9187, + "step": 14575 + }, + { + "epoch": 3.1092150170648463, + "grad_norm": 0.25285318572832133, + "learning_rate": 0.00010888673004505017, + "loss": 0.9267, + "step": 14576 + }, + { + "epoch": 3.1094283276450514, + "grad_norm": 0.34123636820091285, + "learning_rate": 0.00010887879424245949, + "loss": 0.9092, + "step": 14577 + }, + { + "epoch": 3.109641638225256, + "grad_norm": 0.24137712926079383, + "learning_rate": 0.00010887085811310478, + "loss": 0.9301, + "step": 14578 + }, + { + "epoch": 3.1098549488054608, + "grad_norm": 0.2750217238736411, + "learning_rate": 0.00010886292165707584, + "loss": 0.9189, + "step": 14579 + }, + { + "epoch": 3.1100682593856654, + "grad_norm": 0.23850126680541958, + "learning_rate": 0.00010885498487446243, + "loss": 0.9171, + "step": 14580 + }, + { + "epoch": 3.11028156996587, + "grad_norm": 0.2040751593351791, + "learning_rate": 0.0001088470477653544, + "loss": 0.9286, + "step": 14581 + }, + { + "epoch": 3.1104948805460753, + "grad_norm": 0.26707804028320514, + "learning_rate": 0.00010883911032984153, + "loss": 0.897, + "step": 14582 + }, + { + "epoch": 3.11070819112628, + "grad_norm": 0.2388785717206839, + "learning_rate": 0.00010883117256801363, + "loss": 0.9383, + "step": 14583 + }, + { + "epoch": 3.1109215017064846, + "grad_norm": 0.24485257627105186, + "learning_rate": 0.00010882323447996055, + "loss": 0.9326, + "step": 14584 + }, + { + "epoch": 3.1111348122866893, + "grad_norm": 0.21908712163357702, + "learning_rate": 0.00010881529606577207, + "loss": 0.8836, + "step": 14585 + }, + { + "epoch": 3.111348122866894, + "grad_norm": 0.19327139665679385, + "learning_rate": 0.00010880735732553803, + "loss": 0.9297, + "step": 14586 + }, + { + "epoch": 3.111561433447099, + "grad_norm": 0.20491895001792387, + "learning_rate": 0.00010879941825934827, + "loss": 0.9349, + "step": 14587 + }, + { + "epoch": 3.111774744027304, + "grad_norm": 0.1815419305838057, + "learning_rate": 0.00010879147886729259, + "loss": 0.9118, + "step": 14588 + }, + { + "epoch": 3.1119880546075085, + "grad_norm": 0.232262573089888, + "learning_rate": 0.00010878353914946087, + "loss": 0.9519, + "step": 14589 + }, + { + "epoch": 3.112201365187713, + "grad_norm": 0.3637251433601946, + "learning_rate": 0.0001087755991059429, + "loss": 0.9549, + "step": 14590 + }, + { + "epoch": 3.1124146757679183, + "grad_norm": 0.22825673729447113, + "learning_rate": 0.00010876765873682855, + "loss": 0.9004, + "step": 14591 + }, + { + "epoch": 3.112627986348123, + "grad_norm": 0.26242193356776294, + "learning_rate": 0.00010875971804220765, + "loss": 0.9256, + "step": 14592 + }, + { + "epoch": 3.1128412969283277, + "grad_norm": 0.29124789231998105, + "learning_rate": 0.00010875177702217007, + "loss": 0.9304, + "step": 14593 + }, + { + "epoch": 3.1130546075085324, + "grad_norm": 0.26190330702971365, + "learning_rate": 0.00010874383567680563, + "loss": 0.9496, + "step": 14594 + }, + { + "epoch": 3.113267918088737, + "grad_norm": 0.21134497262764393, + "learning_rate": 0.00010873589400620423, + "loss": 0.9084, + "step": 14595 + }, + { + "epoch": 3.113481228668942, + "grad_norm": 0.2143183278239585, + "learning_rate": 0.00010872795201045569, + "loss": 0.9196, + "step": 14596 + }, + { + "epoch": 3.113694539249147, + "grad_norm": 0.28699734613071387, + "learning_rate": 0.0001087200096896499, + "loss": 0.9386, + "step": 14597 + }, + { + "epoch": 3.1139078498293515, + "grad_norm": 0.2754210428905315, + "learning_rate": 0.00010871206704387667, + "loss": 0.9112, + "step": 14598 + }, + { + "epoch": 3.114121160409556, + "grad_norm": 0.21372311282672946, + "learning_rate": 0.00010870412407322597, + "loss": 0.9611, + "step": 14599 + }, + { + "epoch": 3.114334470989761, + "grad_norm": 0.2543088662170384, + "learning_rate": 0.00010869618077778761, + "loss": 0.9235, + "step": 14600 + }, + { + "epoch": 3.114547781569966, + "grad_norm": 0.2976827016221363, + "learning_rate": 0.00010868823715765148, + "loss": 0.9356, + "step": 14601 + }, + { + "epoch": 3.1147610921501707, + "grad_norm": 0.28272361844814903, + "learning_rate": 0.00010868029321290748, + "loss": 0.9242, + "step": 14602 + }, + { + "epoch": 3.1149744027303754, + "grad_norm": 0.30464598573293117, + "learning_rate": 0.00010867234894364548, + "loss": 0.895, + "step": 14603 + }, + { + "epoch": 3.11518771331058, + "grad_norm": 0.4139685700424013, + "learning_rate": 0.00010866440434995532, + "loss": 0.9267, + "step": 14604 + }, + { + "epoch": 3.1154010238907848, + "grad_norm": 0.3889651081869925, + "learning_rate": 0.000108656459431927, + "loss": 0.9255, + "step": 14605 + }, + { + "epoch": 3.11561433447099, + "grad_norm": 0.2566646441462732, + "learning_rate": 0.00010864851418965032, + "loss": 0.8915, + "step": 14606 + }, + { + "epoch": 3.1158276450511946, + "grad_norm": 0.27795236858469957, + "learning_rate": 0.00010864056862321526, + "loss": 0.9317, + "step": 14607 + }, + { + "epoch": 3.1160409556313993, + "grad_norm": 0.3496447566283653, + "learning_rate": 0.00010863262273271166, + "loss": 0.948, + "step": 14608 + }, + { + "epoch": 3.116254266211604, + "grad_norm": 0.36709830566069096, + "learning_rate": 0.00010862467651822945, + "loss": 0.9493, + "step": 14609 + }, + { + "epoch": 3.116467576791809, + "grad_norm": 0.43475064312705247, + "learning_rate": 0.00010861672997985853, + "loss": 0.9096, + "step": 14610 + }, + { + "epoch": 3.1166808873720138, + "grad_norm": 0.46434757785453823, + "learning_rate": 0.00010860878311768886, + "loss": 0.9377, + "step": 14611 + }, + { + "epoch": 3.1168941979522184, + "grad_norm": 0.39664787833613996, + "learning_rate": 0.00010860083593181031, + "loss": 0.906, + "step": 14612 + }, + { + "epoch": 3.117107508532423, + "grad_norm": 0.2703997846413822, + "learning_rate": 0.00010859288842231284, + "loss": 0.9423, + "step": 14613 + }, + { + "epoch": 3.117320819112628, + "grad_norm": 0.20669944232836612, + "learning_rate": 0.00010858494058928632, + "loss": 0.9223, + "step": 14614 + }, + { + "epoch": 3.117534129692833, + "grad_norm": 0.21819930265397688, + "learning_rate": 0.00010857699243282075, + "loss": 0.9064, + "step": 14615 + }, + { + "epoch": 3.1177474402730376, + "grad_norm": 0.23194114145178357, + "learning_rate": 0.000108569043953006, + "loss": 0.9347, + "step": 14616 + }, + { + "epoch": 3.1179607508532423, + "grad_norm": 0.8544979096056887, + "learning_rate": 0.00010856109514993207, + "loss": 0.9684, + "step": 14617 + }, + { + "epoch": 3.118174061433447, + "grad_norm": 0.2640782252756507, + "learning_rate": 0.00010855314602368886, + "loss": 0.9035, + "step": 14618 + }, + { + "epoch": 3.1183873720136517, + "grad_norm": 0.30365507794527374, + "learning_rate": 0.00010854519657436632, + "loss": 0.9298, + "step": 14619 + }, + { + "epoch": 3.118600682593857, + "grad_norm": 0.2984706536380036, + "learning_rate": 0.00010853724680205439, + "loss": 0.9694, + "step": 14620 + }, + { + "epoch": 3.1188139931740615, + "grad_norm": 0.32348817856682366, + "learning_rate": 0.00010852929670684305, + "loss": 0.946, + "step": 14621 + }, + { + "epoch": 3.119027303754266, + "grad_norm": 0.374390184731896, + "learning_rate": 0.0001085213462888222, + "loss": 0.9267, + "step": 14622 + }, + { + "epoch": 3.119240614334471, + "grad_norm": 0.27248822822014146, + "learning_rate": 0.00010851339554808187, + "loss": 0.9259, + "step": 14623 + }, + { + "epoch": 3.1194539249146755, + "grad_norm": 0.25609358389543496, + "learning_rate": 0.000108505444484712, + "loss": 0.9324, + "step": 14624 + }, + { + "epoch": 3.1196672354948807, + "grad_norm": 0.22253713311412823, + "learning_rate": 0.00010849749309880253, + "loss": 0.9248, + "step": 14625 + }, + { + "epoch": 3.1198805460750854, + "grad_norm": 0.27058458660176976, + "learning_rate": 0.00010848954139044342, + "loss": 0.9596, + "step": 14626 + }, + { + "epoch": 3.12009385665529, + "grad_norm": 0.25564504392259757, + "learning_rate": 0.00010848158935972471, + "loss": 0.9147, + "step": 14627 + }, + { + "epoch": 3.1203071672354947, + "grad_norm": 0.2611470261725161, + "learning_rate": 0.00010847363700673632, + "loss": 0.9329, + "step": 14628 + }, + { + "epoch": 3.1205204778157, + "grad_norm": 0.2821046288414029, + "learning_rate": 0.00010846568433156825, + "loss": 0.9491, + "step": 14629 + }, + { + "epoch": 3.1207337883959045, + "grad_norm": 0.23492632483583045, + "learning_rate": 0.0001084577313343105, + "loss": 0.9242, + "step": 14630 + }, + { + "epoch": 3.1209470989761092, + "grad_norm": 0.27761023503122223, + "learning_rate": 0.00010844977801505302, + "loss": 0.8994, + "step": 14631 + }, + { + "epoch": 3.121160409556314, + "grad_norm": 0.3044842380463498, + "learning_rate": 0.00010844182437388582, + "loss": 0.936, + "step": 14632 + }, + { + "epoch": 3.1213737201365186, + "grad_norm": 0.3125912758774358, + "learning_rate": 0.00010843387041089891, + "loss": 0.979, + "step": 14633 + }, + { + "epoch": 3.1215870307167237, + "grad_norm": 0.384646222284776, + "learning_rate": 0.00010842591612618226, + "loss": 0.9395, + "step": 14634 + }, + { + "epoch": 3.1218003412969284, + "grad_norm": 0.35879379332100436, + "learning_rate": 0.00010841796151982592, + "loss": 0.9103, + "step": 14635 + }, + { + "epoch": 3.122013651877133, + "grad_norm": 0.2634826510770936, + "learning_rate": 0.00010841000659191984, + "loss": 0.9118, + "step": 14636 + }, + { + "epoch": 3.1222269624573378, + "grad_norm": 0.2526277441736101, + "learning_rate": 0.0001084020513425541, + "loss": 0.9274, + "step": 14637 + }, + { + "epoch": 3.1224402730375425, + "grad_norm": 0.2750700302499352, + "learning_rate": 0.00010839409577181862, + "loss": 0.9379, + "step": 14638 + }, + { + "epoch": 3.1226535836177476, + "grad_norm": 0.2342154808624432, + "learning_rate": 0.0001083861398798035, + "loss": 0.9544, + "step": 14639 + }, + { + "epoch": 3.1228668941979523, + "grad_norm": 0.2753336407117428, + "learning_rate": 0.00010837818366659873, + "loss": 0.9298, + "step": 14640 + }, + { + "epoch": 3.123080204778157, + "grad_norm": 0.25247670024960356, + "learning_rate": 0.00010837022713229432, + "loss": 0.904, + "step": 14641 + }, + { + "epoch": 3.1232935153583616, + "grad_norm": 0.228561456918269, + "learning_rate": 0.00010836227027698032, + "loss": 0.9558, + "step": 14642 + }, + { + "epoch": 3.1235068259385668, + "grad_norm": 0.27302650571209475, + "learning_rate": 0.00010835431310074676, + "loss": 0.9379, + "step": 14643 + }, + { + "epoch": 3.1237201365187715, + "grad_norm": 0.3134679665450112, + "learning_rate": 0.00010834635560368365, + "loss": 0.9257, + "step": 14644 + }, + { + "epoch": 3.123933447098976, + "grad_norm": 0.2814922597578706, + "learning_rate": 0.00010833839778588108, + "loss": 0.9519, + "step": 14645 + }, + { + "epoch": 3.124146757679181, + "grad_norm": 0.28426828706866014, + "learning_rate": 0.00010833043964742903, + "loss": 0.9085, + "step": 14646 + }, + { + "epoch": 3.1243600682593855, + "grad_norm": 0.3212439792380033, + "learning_rate": 0.00010832248118841759, + "loss": 0.951, + "step": 14647 + }, + { + "epoch": 3.1245733788395906, + "grad_norm": 0.27822699880580565, + "learning_rate": 0.00010831452240893681, + "loss": 0.9127, + "step": 14648 + }, + { + "epoch": 3.1247866894197953, + "grad_norm": 0.31031426428923325, + "learning_rate": 0.00010830656330907673, + "loss": 0.945, + "step": 14649 + }, + { + "epoch": 3.125, + "grad_norm": 0.2990715972034267, + "learning_rate": 0.00010829860388892739, + "loss": 0.9166, + "step": 14650 + }, + { + "epoch": 3.1252133105802047, + "grad_norm": 0.26003823373696444, + "learning_rate": 0.0001082906441485789, + "loss": 0.9405, + "step": 14651 + }, + { + "epoch": 3.1254266211604094, + "grad_norm": 0.23114430431482869, + "learning_rate": 0.00010828268408812129, + "loss": 0.904, + "step": 14652 + }, + { + "epoch": 3.1256399317406145, + "grad_norm": 0.23622466932139247, + "learning_rate": 0.00010827472370764462, + "loss": 0.9174, + "step": 14653 + }, + { + "epoch": 3.125853242320819, + "grad_norm": 0.3302003680917954, + "learning_rate": 0.00010826676300723897, + "loss": 0.9198, + "step": 14654 + }, + { + "epoch": 3.126066552901024, + "grad_norm": 0.4181052202781613, + "learning_rate": 0.00010825880198699445, + "loss": 0.9246, + "step": 14655 + }, + { + "epoch": 3.1262798634812285, + "grad_norm": 0.36446282838065824, + "learning_rate": 0.00010825084064700108, + "loss": 0.9352, + "step": 14656 + }, + { + "epoch": 3.1264931740614337, + "grad_norm": 0.2804653922970194, + "learning_rate": 0.00010824287898734898, + "loss": 0.9088, + "step": 14657 + }, + { + "epoch": 3.1267064846416384, + "grad_norm": 0.18052430231818178, + "learning_rate": 0.00010823491700812827, + "loss": 0.8732, + "step": 14658 + }, + { + "epoch": 3.126919795221843, + "grad_norm": 0.22166814587702377, + "learning_rate": 0.00010822695470942893, + "loss": 0.9288, + "step": 14659 + }, + { + "epoch": 3.1271331058020477, + "grad_norm": 0.6915558550395619, + "learning_rate": 0.00010821899209134117, + "loss": 0.9706, + "step": 14660 + }, + { + "epoch": 3.1273464163822524, + "grad_norm": 0.2514152418113479, + "learning_rate": 0.00010821102915395505, + "loss": 0.9378, + "step": 14661 + }, + { + "epoch": 3.1275597269624575, + "grad_norm": 0.2903133449595618, + "learning_rate": 0.0001082030658973606, + "loss": 0.9181, + "step": 14662 + }, + { + "epoch": 3.1277730375426622, + "grad_norm": 0.22061126583228793, + "learning_rate": 0.00010819510232164802, + "loss": 0.9282, + "step": 14663 + }, + { + "epoch": 3.127986348122867, + "grad_norm": 0.23430910711977135, + "learning_rate": 0.00010818713842690741, + "loss": 0.9142, + "step": 14664 + }, + { + "epoch": 3.1281996587030716, + "grad_norm": 0.35428168092983836, + "learning_rate": 0.00010817917421322883, + "loss": 0.9194, + "step": 14665 + }, + { + "epoch": 3.1284129692832763, + "grad_norm": 0.33047621420572953, + "learning_rate": 0.00010817120968070243, + "loss": 0.9331, + "step": 14666 + }, + { + "epoch": 3.1286262798634814, + "grad_norm": 0.26784151177580223, + "learning_rate": 0.0001081632448294183, + "loss": 0.9551, + "step": 14667 + }, + { + "epoch": 3.128839590443686, + "grad_norm": 0.2771904556638033, + "learning_rate": 0.0001081552796594666, + "loss": 0.9097, + "step": 14668 + }, + { + "epoch": 3.1290529010238908, + "grad_norm": 0.32333226458577335, + "learning_rate": 0.00010814731417093741, + "loss": 0.9189, + "step": 14669 + }, + { + "epoch": 3.1292662116040955, + "grad_norm": 0.3121304980188247, + "learning_rate": 0.00010813934836392091, + "loss": 0.9281, + "step": 14670 + }, + { + "epoch": 3.1294795221843, + "grad_norm": 0.27427418964047945, + "learning_rate": 0.00010813138223850719, + "loss": 0.9047, + "step": 14671 + }, + { + "epoch": 3.1296928327645053, + "grad_norm": 0.2740809452447961, + "learning_rate": 0.00010812341579478641, + "loss": 0.9539, + "step": 14672 + }, + { + "epoch": 3.12990614334471, + "grad_norm": 0.28085744658938194, + "learning_rate": 0.00010811544903284875, + "loss": 0.9152, + "step": 14673 + }, + { + "epoch": 3.1301194539249146, + "grad_norm": 0.23729113470399496, + "learning_rate": 0.00010810748195278427, + "loss": 0.9002, + "step": 14674 + }, + { + "epoch": 3.1303327645051193, + "grad_norm": 0.22447476251761067, + "learning_rate": 0.00010809951455468316, + "loss": 0.9129, + "step": 14675 + }, + { + "epoch": 3.1305460750853245, + "grad_norm": 0.26947942508815603, + "learning_rate": 0.00010809154683863558, + "loss": 0.9231, + "step": 14676 + }, + { + "epoch": 3.130759385665529, + "grad_norm": 0.22998067545967107, + "learning_rate": 0.00010808357880473166, + "loss": 0.9052, + "step": 14677 + }, + { + "epoch": 3.130972696245734, + "grad_norm": 0.2071043267424506, + "learning_rate": 0.00010807561045306159, + "loss": 0.9505, + "step": 14678 + }, + { + "epoch": 3.1311860068259385, + "grad_norm": 0.2398569662100701, + "learning_rate": 0.00010806764178371552, + "loss": 0.8982, + "step": 14679 + }, + { + "epoch": 3.131399317406143, + "grad_norm": 0.22605308462433332, + "learning_rate": 0.00010805967279678362, + "loss": 0.912, + "step": 14680 + }, + { + "epoch": 3.1316126279863483, + "grad_norm": 0.21678609824614828, + "learning_rate": 0.00010805170349235602, + "loss": 0.9493, + "step": 14681 + }, + { + "epoch": 3.131825938566553, + "grad_norm": 0.2410278934699433, + "learning_rate": 0.00010804373387052295, + "loss": 0.9379, + "step": 14682 + }, + { + "epoch": 3.1320392491467577, + "grad_norm": 0.23881783334952217, + "learning_rate": 0.00010803576393137452, + "loss": 0.931, + "step": 14683 + }, + { + "epoch": 3.1322525597269624, + "grad_norm": 0.23799526040599583, + "learning_rate": 0.00010802779367500098, + "loss": 0.9223, + "step": 14684 + }, + { + "epoch": 3.132465870307167, + "grad_norm": 0.3257942100947492, + "learning_rate": 0.0001080198231014925, + "loss": 0.9386, + "step": 14685 + }, + { + "epoch": 3.132679180887372, + "grad_norm": 0.23854979755636108, + "learning_rate": 0.00010801185221093923, + "loss": 0.9246, + "step": 14686 + }, + { + "epoch": 3.132892491467577, + "grad_norm": 0.23266389086728165, + "learning_rate": 0.00010800388100343136, + "loss": 0.9124, + "step": 14687 + }, + { + "epoch": 3.1331058020477816, + "grad_norm": 0.28036102617384495, + "learning_rate": 0.00010799590947905913, + "loss": 0.9623, + "step": 14688 + }, + { + "epoch": 3.1333191126279862, + "grad_norm": 0.2640902560251398, + "learning_rate": 0.00010798793763791271, + "loss": 0.9434, + "step": 14689 + }, + { + "epoch": 3.133532423208191, + "grad_norm": 0.2280467109489191, + "learning_rate": 0.00010797996548008229, + "loss": 0.9264, + "step": 14690 + }, + { + "epoch": 3.133745733788396, + "grad_norm": 0.2883284999742751, + "learning_rate": 0.00010797199300565812, + "loss": 0.9437, + "step": 14691 + }, + { + "epoch": 3.1339590443686007, + "grad_norm": 0.30742445085291203, + "learning_rate": 0.00010796402021473036, + "loss": 0.9662, + "step": 14692 + }, + { + "epoch": 3.1341723549488054, + "grad_norm": 0.2082247040542214, + "learning_rate": 0.00010795604710738923, + "loss": 0.9354, + "step": 14693 + }, + { + "epoch": 3.13438566552901, + "grad_norm": 0.24905759356968835, + "learning_rate": 0.00010794807368372498, + "loss": 0.9335, + "step": 14694 + }, + { + "epoch": 3.1345989761092152, + "grad_norm": 0.270635732791619, + "learning_rate": 0.00010794009994382779, + "loss": 0.9304, + "step": 14695 + }, + { + "epoch": 3.13481228668942, + "grad_norm": 0.2193247649633673, + "learning_rate": 0.00010793212588778793, + "loss": 0.9417, + "step": 14696 + }, + { + "epoch": 3.1350255972696246, + "grad_norm": 0.2711398199852847, + "learning_rate": 0.00010792415151569556, + "loss": 0.9114, + "step": 14697 + }, + { + "epoch": 3.1352389078498293, + "grad_norm": 0.25702191680791703, + "learning_rate": 0.00010791617682764095, + "loss": 0.9258, + "step": 14698 + }, + { + "epoch": 3.135452218430034, + "grad_norm": 0.21941104674787348, + "learning_rate": 0.00010790820182371434, + "loss": 0.9207, + "step": 14699 + }, + { + "epoch": 3.135665529010239, + "grad_norm": 0.218667523811943, + "learning_rate": 0.00010790022650400595, + "loss": 0.9167, + "step": 14700 + }, + { + "epoch": 3.135878839590444, + "grad_norm": 0.22586224129293203, + "learning_rate": 0.00010789225086860603, + "loss": 0.9045, + "step": 14701 + }, + { + "epoch": 3.1360921501706485, + "grad_norm": 0.2036841413311221, + "learning_rate": 0.00010788427491760485, + "loss": 0.95, + "step": 14702 + }, + { + "epoch": 3.136305460750853, + "grad_norm": 0.24223126843109818, + "learning_rate": 0.0001078762986510926, + "loss": 0.9339, + "step": 14703 + }, + { + "epoch": 3.136518771331058, + "grad_norm": 0.32427605056778996, + "learning_rate": 0.00010786832206915955, + "loss": 0.912, + "step": 14704 + }, + { + "epoch": 3.136732081911263, + "grad_norm": 0.327650917370763, + "learning_rate": 0.00010786034517189598, + "loss": 0.8865, + "step": 14705 + }, + { + "epoch": 3.1369453924914676, + "grad_norm": 0.3225361134218292, + "learning_rate": 0.00010785236795939213, + "loss": 0.9083, + "step": 14706 + }, + { + "epoch": 3.1371587030716723, + "grad_norm": 0.32615100922049395, + "learning_rate": 0.00010784439043173828, + "loss": 0.9288, + "step": 14707 + }, + { + "epoch": 3.137372013651877, + "grad_norm": 0.42189839693924197, + "learning_rate": 0.0001078364125890247, + "loss": 0.9019, + "step": 14708 + }, + { + "epoch": 3.1375853242320817, + "grad_norm": 0.19815781984859168, + "learning_rate": 0.0001078284344313416, + "loss": 0.888, + "step": 14709 + }, + { + "epoch": 3.137798634812287, + "grad_norm": 0.1967846902933148, + "learning_rate": 0.00010782045595877933, + "loss": 0.9107, + "step": 14710 + }, + { + "epoch": 3.1380119453924915, + "grad_norm": 0.24470885857871175, + "learning_rate": 0.00010781247717142812, + "loss": 0.9138, + "step": 14711 + }, + { + "epoch": 3.138225255972696, + "grad_norm": 0.29389238610979335, + "learning_rate": 0.00010780449806937826, + "loss": 0.9247, + "step": 14712 + }, + { + "epoch": 3.138438566552901, + "grad_norm": 0.336213149376406, + "learning_rate": 0.00010779651865272005, + "loss": 0.9336, + "step": 14713 + }, + { + "epoch": 3.138651877133106, + "grad_norm": 0.37610002860046426, + "learning_rate": 0.00010778853892154378, + "loss": 0.9105, + "step": 14714 + }, + { + "epoch": 3.1388651877133107, + "grad_norm": 0.3594293250702902, + "learning_rate": 0.0001077805588759397, + "loss": 0.8978, + "step": 14715 + }, + { + "epoch": 3.1390784982935154, + "grad_norm": 0.26187458507951555, + "learning_rate": 0.00010777257851599812, + "loss": 0.9094, + "step": 14716 + }, + { + "epoch": 3.13929180887372, + "grad_norm": 0.19409495126589257, + "learning_rate": 0.00010776459784180934, + "loss": 0.8797, + "step": 14717 + }, + { + "epoch": 3.1395051194539247, + "grad_norm": 0.30904828986843985, + "learning_rate": 0.0001077566168534637, + "loss": 0.9003, + "step": 14718 + }, + { + "epoch": 3.13971843003413, + "grad_norm": 0.3259745648820673, + "learning_rate": 0.00010774863555105147, + "loss": 0.9381, + "step": 14719 + }, + { + "epoch": 3.1399317406143346, + "grad_norm": 0.32253798097496017, + "learning_rate": 0.00010774065393466295, + "loss": 0.9017, + "step": 14720 + }, + { + "epoch": 3.1401450511945392, + "grad_norm": 0.3048712479385634, + "learning_rate": 0.00010773267200438846, + "loss": 0.8895, + "step": 14721 + }, + { + "epoch": 3.140358361774744, + "grad_norm": 0.29749824152969195, + "learning_rate": 0.00010772468976031832, + "loss": 0.8856, + "step": 14722 + }, + { + "epoch": 3.1405716723549486, + "grad_norm": 0.19348123930383382, + "learning_rate": 0.00010771670720254283, + "loss": 0.9414, + "step": 14723 + }, + { + "epoch": 3.1407849829351537, + "grad_norm": 0.21401551556569262, + "learning_rate": 0.0001077087243311524, + "loss": 0.9243, + "step": 14724 + }, + { + "epoch": 3.1409982935153584, + "grad_norm": 0.2437803661603309, + "learning_rate": 0.00010770074114623723, + "loss": 0.9067, + "step": 14725 + }, + { + "epoch": 3.141211604095563, + "grad_norm": 0.2923395935308066, + "learning_rate": 0.00010769275764788772, + "loss": 0.9136, + "step": 14726 + }, + { + "epoch": 3.141424914675768, + "grad_norm": 0.26035311466477845, + "learning_rate": 0.00010768477383619419, + "loss": 0.9069, + "step": 14727 + }, + { + "epoch": 3.1416382252559725, + "grad_norm": 0.19683621490855918, + "learning_rate": 0.00010767678971124697, + "loss": 0.9076, + "step": 14728 + }, + { + "epoch": 3.1418515358361776, + "grad_norm": 0.1986784550036493, + "learning_rate": 0.00010766880527313643, + "loss": 0.9066, + "step": 14729 + }, + { + "epoch": 3.1420648464163823, + "grad_norm": 0.21254443835768277, + "learning_rate": 0.0001076608205219529, + "loss": 0.9228, + "step": 14730 + }, + { + "epoch": 3.142278156996587, + "grad_norm": 0.20714138124092563, + "learning_rate": 0.0001076528354577867, + "loss": 0.9127, + "step": 14731 + }, + { + "epoch": 3.1424914675767917, + "grad_norm": 0.22440350712376778, + "learning_rate": 0.0001076448500807282, + "loss": 0.9033, + "step": 14732 + }, + { + "epoch": 3.142704778156997, + "grad_norm": 0.26157228616443057, + "learning_rate": 0.00010763686439086776, + "loss": 0.9465, + "step": 14733 + }, + { + "epoch": 3.1429180887372015, + "grad_norm": 0.2466021091947577, + "learning_rate": 0.00010762887838829574, + "loss": 0.9151, + "step": 14734 + }, + { + "epoch": 3.143131399317406, + "grad_norm": 0.20300037736087476, + "learning_rate": 0.0001076208920731025, + "loss": 0.9117, + "step": 14735 + }, + { + "epoch": 3.143344709897611, + "grad_norm": 0.2315505600307496, + "learning_rate": 0.00010761290544537841, + "loss": 0.9622, + "step": 14736 + }, + { + "epoch": 3.1435580204778155, + "grad_norm": 0.22177475015823564, + "learning_rate": 0.00010760491850521379, + "loss": 0.9295, + "step": 14737 + }, + { + "epoch": 3.1437713310580206, + "grad_norm": 0.25608400356892824, + "learning_rate": 0.00010759693125269908, + "loss": 0.9137, + "step": 14738 + }, + { + "epoch": 3.1439846416382253, + "grad_norm": 0.22143569022542747, + "learning_rate": 0.00010758894368792463, + "loss": 0.9068, + "step": 14739 + }, + { + "epoch": 3.14419795221843, + "grad_norm": 0.1953032348696345, + "learning_rate": 0.0001075809558109808, + "loss": 0.9321, + "step": 14740 + }, + { + "epoch": 3.1444112627986347, + "grad_norm": 0.22213041934987057, + "learning_rate": 0.00010757296762195802, + "loss": 0.9113, + "step": 14741 + }, + { + "epoch": 3.1446245733788394, + "grad_norm": 0.2594040282640771, + "learning_rate": 0.00010756497912094664, + "loss": 0.9175, + "step": 14742 + }, + { + "epoch": 3.1448378839590445, + "grad_norm": 0.3107173189254536, + "learning_rate": 0.00010755699030803703, + "loss": 0.9654, + "step": 14743 + }, + { + "epoch": 3.145051194539249, + "grad_norm": 0.3692432076536011, + "learning_rate": 0.00010754900118331965, + "loss": 0.9151, + "step": 14744 + }, + { + "epoch": 3.145264505119454, + "grad_norm": 0.3429158077895141, + "learning_rate": 0.00010754101174688483, + "loss": 0.9462, + "step": 14745 + }, + { + "epoch": 3.1454778156996586, + "grad_norm": 0.2735037342502125, + "learning_rate": 0.00010753302199882301, + "loss": 0.9079, + "step": 14746 + }, + { + "epoch": 3.1456911262798632, + "grad_norm": 0.2923139410335219, + "learning_rate": 0.00010752503193922458, + "loss": 0.9125, + "step": 14747 + }, + { + "epoch": 3.1459044368600684, + "grad_norm": 0.2121845906380147, + "learning_rate": 0.00010751704156817996, + "loss": 0.9062, + "step": 14748 + }, + { + "epoch": 3.146117747440273, + "grad_norm": 0.21246153294359482, + "learning_rate": 0.00010750905088577955, + "loss": 0.924, + "step": 14749 + }, + { + "epoch": 3.1463310580204777, + "grad_norm": 0.2124475591141928, + "learning_rate": 0.00010750105989211374, + "loss": 0.9076, + "step": 14750 + }, + { + "epoch": 3.1465443686006824, + "grad_norm": 0.25266578667020223, + "learning_rate": 0.00010749306858727301, + "loss": 0.9173, + "step": 14751 + }, + { + "epoch": 3.1467576791808876, + "grad_norm": 0.2568963602219427, + "learning_rate": 0.00010748507697134774, + "loss": 0.908, + "step": 14752 + }, + { + "epoch": 3.1469709897610922, + "grad_norm": 0.21060481666540304, + "learning_rate": 0.00010747708504442834, + "loss": 0.9042, + "step": 14753 + }, + { + "epoch": 3.147184300341297, + "grad_norm": 0.22959049649538985, + "learning_rate": 0.0001074690928066053, + "loss": 0.9304, + "step": 14754 + }, + { + "epoch": 3.1473976109215016, + "grad_norm": 0.2572748491704212, + "learning_rate": 0.00010746110025796897, + "loss": 0.9139, + "step": 14755 + }, + { + "epoch": 3.1476109215017063, + "grad_norm": 0.2565175091888708, + "learning_rate": 0.00010745310739860984, + "loss": 0.9139, + "step": 14756 + }, + { + "epoch": 3.1478242320819114, + "grad_norm": 0.26482755537337827, + "learning_rate": 0.00010744511422861834, + "loss": 0.9184, + "step": 14757 + }, + { + "epoch": 3.148037542662116, + "grad_norm": 0.25137573619424025, + "learning_rate": 0.00010743712074808493, + "loss": 0.9121, + "step": 14758 + }, + { + "epoch": 3.148250853242321, + "grad_norm": 0.23229423197070395, + "learning_rate": 0.00010742912695710002, + "loss": 0.9028, + "step": 14759 + }, + { + "epoch": 3.1484641638225255, + "grad_norm": 0.21491616564631214, + "learning_rate": 0.00010742113285575408, + "loss": 0.8971, + "step": 14760 + }, + { + "epoch": 3.1486774744027306, + "grad_norm": 0.2781353726656909, + "learning_rate": 0.00010741313844413755, + "loss": 0.9451, + "step": 14761 + }, + { + "epoch": 3.1488907849829353, + "grad_norm": 0.30401955318284934, + "learning_rate": 0.0001074051437223409, + "loss": 0.9346, + "step": 14762 + }, + { + "epoch": 3.14910409556314, + "grad_norm": 0.1963089673467784, + "learning_rate": 0.00010739714869045462, + "loss": 0.9341, + "step": 14763 + }, + { + "epoch": 3.1493174061433447, + "grad_norm": 0.2761941628657323, + "learning_rate": 0.00010738915334856911, + "loss": 0.9247, + "step": 14764 + }, + { + "epoch": 3.1495307167235493, + "grad_norm": 0.2970234805431454, + "learning_rate": 0.00010738115769677488, + "loss": 0.9282, + "step": 14765 + }, + { + "epoch": 3.1497440273037545, + "grad_norm": 0.2292998263479585, + "learning_rate": 0.00010737316173516239, + "loss": 0.9295, + "step": 14766 + }, + { + "epoch": 3.149957337883959, + "grad_norm": 0.2091824074672519, + "learning_rate": 0.0001073651654638221, + "loss": 0.9251, + "step": 14767 + }, + { + "epoch": 3.150170648464164, + "grad_norm": 0.25716249893132526, + "learning_rate": 0.00010735716888284451, + "loss": 0.9407, + "step": 14768 + }, + { + "epoch": 3.1503839590443685, + "grad_norm": 0.264584847971527, + "learning_rate": 0.0001073491719923201, + "loss": 0.9206, + "step": 14769 + }, + { + "epoch": 3.150597269624573, + "grad_norm": 0.2047246194265266, + "learning_rate": 0.00010734117479233935, + "loss": 0.9374, + "step": 14770 + }, + { + "epoch": 3.1508105802047783, + "grad_norm": 0.20959813222988957, + "learning_rate": 0.00010733317728299274, + "loss": 0.883, + "step": 14771 + }, + { + "epoch": 3.151023890784983, + "grad_norm": 0.23788523730796488, + "learning_rate": 0.00010732517946437078, + "loss": 0.9214, + "step": 14772 + }, + { + "epoch": 3.1512372013651877, + "grad_norm": 0.32258051788777653, + "learning_rate": 0.00010731718133656393, + "loss": 0.9226, + "step": 14773 + }, + { + "epoch": 3.1514505119453924, + "grad_norm": 0.34046334640417425, + "learning_rate": 0.00010730918289966275, + "loss": 0.9096, + "step": 14774 + }, + { + "epoch": 3.151663822525597, + "grad_norm": 0.30915256059400675, + "learning_rate": 0.00010730118415375768, + "loss": 0.9046, + "step": 14775 + }, + { + "epoch": 3.151877133105802, + "grad_norm": 0.2487362907473921, + "learning_rate": 0.00010729318509893926, + "loss": 0.9011, + "step": 14776 + }, + { + "epoch": 3.152090443686007, + "grad_norm": 0.2755398320764424, + "learning_rate": 0.00010728518573529798, + "loss": 0.9351, + "step": 14777 + }, + { + "epoch": 3.1523037542662116, + "grad_norm": 0.2708713704131886, + "learning_rate": 0.00010727718606292438, + "loss": 0.9241, + "step": 14778 + }, + { + "epoch": 3.1525170648464163, + "grad_norm": 0.38493959799633337, + "learning_rate": 0.00010726918608190895, + "loss": 0.9186, + "step": 14779 + }, + { + "epoch": 3.1527303754266214, + "grad_norm": 0.4230280305864975, + "learning_rate": 0.00010726118579234226, + "loss": 0.9177, + "step": 14780 + }, + { + "epoch": 3.152943686006826, + "grad_norm": 0.37235739320226346, + "learning_rate": 0.00010725318519431474, + "loss": 0.9175, + "step": 14781 + }, + { + "epoch": 3.1531569965870307, + "grad_norm": 0.31858793568936744, + "learning_rate": 0.00010724518428791699, + "loss": 0.9268, + "step": 14782 + }, + { + "epoch": 3.1533703071672354, + "grad_norm": 0.2739324974653224, + "learning_rate": 0.00010723718307323951, + "loss": 0.9144, + "step": 14783 + }, + { + "epoch": 3.15358361774744, + "grad_norm": 0.25592245761500226, + "learning_rate": 0.00010722918155037287, + "loss": 0.9357, + "step": 14784 + }, + { + "epoch": 3.1537969283276452, + "grad_norm": 0.2332015607754184, + "learning_rate": 0.00010722117971940756, + "loss": 0.8857, + "step": 14785 + }, + { + "epoch": 3.15401023890785, + "grad_norm": 0.3355289951782421, + "learning_rate": 0.00010721317758043418, + "loss": 0.93, + "step": 14786 + }, + { + "epoch": 3.1542235494880546, + "grad_norm": 0.39661241085518106, + "learning_rate": 0.00010720517513354321, + "loss": 0.9427, + "step": 14787 + }, + { + "epoch": 3.1544368600682593, + "grad_norm": 0.3676064462718342, + "learning_rate": 0.00010719717237882523, + "loss": 0.8994, + "step": 14788 + }, + { + "epoch": 3.154650170648464, + "grad_norm": 0.32185483288254707, + "learning_rate": 0.00010718916931637077, + "loss": 0.9085, + "step": 14789 + }, + { + "epoch": 3.154863481228669, + "grad_norm": 0.3169768129570953, + "learning_rate": 0.00010718116594627043, + "loss": 0.8898, + "step": 14790 + }, + { + "epoch": 3.155076791808874, + "grad_norm": 0.20573775743138964, + "learning_rate": 0.00010717316226861474, + "loss": 0.8838, + "step": 14791 + }, + { + "epoch": 3.1552901023890785, + "grad_norm": 0.22795199629669632, + "learning_rate": 0.00010716515828349424, + "loss": 0.9454, + "step": 14792 + }, + { + "epoch": 3.155503412969283, + "grad_norm": 0.21119080426060485, + "learning_rate": 0.00010715715399099954, + "loss": 0.9022, + "step": 14793 + }, + { + "epoch": 3.155716723549488, + "grad_norm": 0.1896938113038917, + "learning_rate": 0.00010714914939122117, + "loss": 0.8805, + "step": 14794 + }, + { + "epoch": 3.155930034129693, + "grad_norm": 0.24358508762187603, + "learning_rate": 0.0001071411444842497, + "loss": 0.8904, + "step": 14795 + }, + { + "epoch": 3.1561433447098977, + "grad_norm": 0.2645938433937103, + "learning_rate": 0.00010713313927017575, + "loss": 0.9028, + "step": 14796 + }, + { + "epoch": 3.1563566552901023, + "grad_norm": 0.2782605989209117, + "learning_rate": 0.00010712513374908988, + "loss": 0.919, + "step": 14797 + }, + { + "epoch": 3.156569965870307, + "grad_norm": 0.34142857768939794, + "learning_rate": 0.00010711712792108266, + "loss": 0.9155, + "step": 14798 + }, + { + "epoch": 3.156783276450512, + "grad_norm": 0.4862520442886251, + "learning_rate": 0.00010710912178624466, + "loss": 0.8918, + "step": 14799 + }, + { + "epoch": 3.156996587030717, + "grad_norm": 0.5076425678798154, + "learning_rate": 0.0001071011153446665, + "loss": 0.9491, + "step": 14800 + }, + { + "epoch": 3.1572098976109215, + "grad_norm": 0.38808827185670014, + "learning_rate": 0.00010709310859643875, + "loss": 0.9113, + "step": 14801 + }, + { + "epoch": 3.157423208191126, + "grad_norm": 0.29656033654483416, + "learning_rate": 0.00010708510154165208, + "loss": 0.9106, + "step": 14802 + }, + { + "epoch": 3.157636518771331, + "grad_norm": 0.22132235229371305, + "learning_rate": 0.00010707709418039698, + "loss": 0.8899, + "step": 14803 + }, + { + "epoch": 3.157849829351536, + "grad_norm": 0.3287625720738437, + "learning_rate": 0.00010706908651276412, + "loss": 0.924, + "step": 14804 + }, + { + "epoch": 3.1580631399317407, + "grad_norm": 0.4564628469448267, + "learning_rate": 0.00010706107853884407, + "loss": 0.9357, + "step": 14805 + }, + { + "epoch": 3.1582764505119454, + "grad_norm": 0.547070574590287, + "learning_rate": 0.00010705307025872747, + "loss": 0.9231, + "step": 14806 + }, + { + "epoch": 3.15848976109215, + "grad_norm": 0.5994712247166581, + "learning_rate": 0.00010704506167250494, + "loss": 0.9418, + "step": 14807 + }, + { + "epoch": 3.1587030716723548, + "grad_norm": 0.5748354284619165, + "learning_rate": 0.00010703705278026708, + "loss": 0.9417, + "step": 14808 + }, + { + "epoch": 3.15891638225256, + "grad_norm": 0.5066783966340508, + "learning_rate": 0.00010702904358210451, + "loss": 0.9745, + "step": 14809 + }, + { + "epoch": 3.1591296928327646, + "grad_norm": 0.39490930113504774, + "learning_rate": 0.00010702103407810787, + "loss": 0.9323, + "step": 14810 + }, + { + "epoch": 3.1593430034129693, + "grad_norm": 0.3135775000116561, + "learning_rate": 0.00010701302426836772, + "loss": 0.9273, + "step": 14811 + }, + { + "epoch": 3.159556313993174, + "grad_norm": 0.287702853210361, + "learning_rate": 0.0001070050141529748, + "loss": 0.9141, + "step": 14812 + }, + { + "epoch": 3.1597696245733786, + "grad_norm": 0.2492050986112439, + "learning_rate": 0.00010699700373201966, + "loss": 0.9104, + "step": 14813 + }, + { + "epoch": 3.1599829351535837, + "grad_norm": 0.3258450763386082, + "learning_rate": 0.00010698899300559302, + "loss": 0.8857, + "step": 14814 + }, + { + "epoch": 3.1601962457337884, + "grad_norm": 0.3698561589861668, + "learning_rate": 0.0001069809819737854, + "loss": 0.9006, + "step": 14815 + }, + { + "epoch": 3.160409556313993, + "grad_norm": 0.34127873117671714, + "learning_rate": 0.00010697297063668757, + "loss": 0.9306, + "step": 14816 + }, + { + "epoch": 3.160622866894198, + "grad_norm": 0.2171454809954868, + "learning_rate": 0.00010696495899439008, + "loss": 0.9294, + "step": 14817 + }, + { + "epoch": 3.160836177474403, + "grad_norm": 0.2107118677894414, + "learning_rate": 0.00010695694704698367, + "loss": 0.9129, + "step": 14818 + }, + { + "epoch": 3.1610494880546076, + "grad_norm": 0.2218544715568659, + "learning_rate": 0.00010694893479455893, + "loss": 0.9284, + "step": 14819 + }, + { + "epoch": 3.1612627986348123, + "grad_norm": 0.20774044475312398, + "learning_rate": 0.00010694092223720653, + "loss": 0.8863, + "step": 14820 + }, + { + "epoch": 3.161476109215017, + "grad_norm": 0.21853237356859134, + "learning_rate": 0.00010693290937501715, + "loss": 0.9047, + "step": 14821 + }, + { + "epoch": 3.1616894197952217, + "grad_norm": 0.19912927316508106, + "learning_rate": 0.00010692489620808146, + "loss": 0.9194, + "step": 14822 + }, + { + "epoch": 3.161902730375427, + "grad_norm": 0.23930194419393863, + "learning_rate": 0.00010691688273649007, + "loss": 0.9371, + "step": 14823 + }, + { + "epoch": 3.1621160409556315, + "grad_norm": 0.2267436538283135, + "learning_rate": 0.00010690886896033376, + "loss": 0.9222, + "step": 14824 + }, + { + "epoch": 3.162329351535836, + "grad_norm": 0.28499704483428284, + "learning_rate": 0.00010690085487970312, + "loss": 0.9136, + "step": 14825 + }, + { + "epoch": 3.162542662116041, + "grad_norm": 0.36592899820591585, + "learning_rate": 0.00010689284049468885, + "loss": 0.9032, + "step": 14826 + }, + { + "epoch": 3.1627559726962455, + "grad_norm": 0.31241170474634616, + "learning_rate": 0.00010688482580538163, + "loss": 0.93, + "step": 14827 + }, + { + "epoch": 3.1629692832764507, + "grad_norm": 0.21941921117081192, + "learning_rate": 0.00010687681081187218, + "loss": 0.9533, + "step": 14828 + }, + { + "epoch": 3.1631825938566553, + "grad_norm": 0.19628154255191493, + "learning_rate": 0.00010686879551425112, + "loss": 0.9285, + "step": 14829 + }, + { + "epoch": 3.16339590443686, + "grad_norm": 0.25345164619876576, + "learning_rate": 0.00010686077991260924, + "loss": 0.9071, + "step": 14830 + }, + { + "epoch": 3.1636092150170647, + "grad_norm": 0.27555227627572215, + "learning_rate": 0.00010685276400703715, + "loss": 0.9163, + "step": 14831 + }, + { + "epoch": 3.1638225255972694, + "grad_norm": 0.34964874662893636, + "learning_rate": 0.00010684474779762562, + "loss": 0.9032, + "step": 14832 + }, + { + "epoch": 3.1640358361774745, + "grad_norm": 0.3387588941071458, + "learning_rate": 0.00010683673128446527, + "loss": 0.9179, + "step": 14833 + }, + { + "epoch": 3.164249146757679, + "grad_norm": 0.2963708736617827, + "learning_rate": 0.00010682871446764687, + "loss": 0.9382, + "step": 14834 + }, + { + "epoch": 3.164462457337884, + "grad_norm": 0.2288530774420225, + "learning_rate": 0.00010682069734726111, + "loss": 0.9086, + "step": 14835 + }, + { + "epoch": 3.1646757679180886, + "grad_norm": 0.18502309192205302, + "learning_rate": 0.00010681267992339872, + "loss": 0.9199, + "step": 14836 + }, + { + "epoch": 3.1648890784982937, + "grad_norm": 0.28748714399818115, + "learning_rate": 0.00010680466219615039, + "loss": 0.9456, + "step": 14837 + }, + { + "epoch": 3.1651023890784984, + "grad_norm": 0.325985285931118, + "learning_rate": 0.00010679664416560688, + "loss": 0.8813, + "step": 14838 + }, + { + "epoch": 3.165315699658703, + "grad_norm": 0.3978263092439038, + "learning_rate": 0.00010678862583185886, + "loss": 0.9141, + "step": 14839 + }, + { + "epoch": 3.1655290102389078, + "grad_norm": 0.4055378870174837, + "learning_rate": 0.00010678060719499713, + "loss": 0.8694, + "step": 14840 + }, + { + "epoch": 3.1657423208191124, + "grad_norm": 0.25988756631191223, + "learning_rate": 0.00010677258825511233, + "loss": 0.9219, + "step": 14841 + }, + { + "epoch": 3.1659556313993176, + "grad_norm": 0.23549376227596525, + "learning_rate": 0.00010676456901229529, + "loss": 0.925, + "step": 14842 + }, + { + "epoch": 3.1661689419795223, + "grad_norm": 0.30140458266440406, + "learning_rate": 0.00010675654946663667, + "loss": 0.9056, + "step": 14843 + }, + { + "epoch": 3.166382252559727, + "grad_norm": 0.3449165266553205, + "learning_rate": 0.00010674852961822727, + "loss": 0.9354, + "step": 14844 + }, + { + "epoch": 3.1665955631399316, + "grad_norm": 0.32305986848185303, + "learning_rate": 0.00010674050946715778, + "loss": 0.9295, + "step": 14845 + }, + { + "epoch": 3.1668088737201368, + "grad_norm": 0.2887576545981588, + "learning_rate": 0.00010673248901351898, + "loss": 0.935, + "step": 14846 + }, + { + "epoch": 3.1670221843003414, + "grad_norm": 0.3008896581546897, + "learning_rate": 0.00010672446825740165, + "loss": 0.9412, + "step": 14847 + }, + { + "epoch": 3.167235494880546, + "grad_norm": 0.20507701466164124, + "learning_rate": 0.00010671644719889646, + "loss": 0.9143, + "step": 14848 + }, + { + "epoch": 3.167448805460751, + "grad_norm": 0.2607448342551891, + "learning_rate": 0.00010670842583809425, + "loss": 0.9027, + "step": 14849 + }, + { + "epoch": 3.1676621160409555, + "grad_norm": 0.3492016219215431, + "learning_rate": 0.00010670040417508577, + "loss": 0.927, + "step": 14850 + }, + { + "epoch": 3.1678754266211606, + "grad_norm": 0.3533951696956762, + "learning_rate": 0.00010669238220996172, + "loss": 0.9209, + "step": 14851 + }, + { + "epoch": 3.1680887372013653, + "grad_norm": 0.3527972068353694, + "learning_rate": 0.00010668435994281295, + "loss": 0.9055, + "step": 14852 + }, + { + "epoch": 3.16830204778157, + "grad_norm": 0.3821333627618867, + "learning_rate": 0.00010667633737373021, + "loss": 0.9128, + "step": 14853 + }, + { + "epoch": 3.1685153583617747, + "grad_norm": 0.33642117978844716, + "learning_rate": 0.00010666831450280421, + "loss": 0.9214, + "step": 14854 + }, + { + "epoch": 3.1687286689419794, + "grad_norm": 0.2772473941258076, + "learning_rate": 0.00010666029133012583, + "loss": 0.9328, + "step": 14855 + }, + { + "epoch": 3.1689419795221845, + "grad_norm": 0.2542137759750625, + "learning_rate": 0.00010665226785578579, + "loss": 0.9147, + "step": 14856 + }, + { + "epoch": 3.169155290102389, + "grad_norm": 0.2774451929272628, + "learning_rate": 0.0001066442440798749, + "loss": 0.9016, + "step": 14857 + }, + { + "epoch": 3.169368600682594, + "grad_norm": 0.33171842036011157, + "learning_rate": 0.00010663622000248392, + "loss": 0.9251, + "step": 14858 + }, + { + "epoch": 3.1695819112627985, + "grad_norm": 0.37620423663979263, + "learning_rate": 0.0001066281956237037, + "loss": 0.8909, + "step": 14859 + }, + { + "epoch": 3.169795221843003, + "grad_norm": 0.2944708168277345, + "learning_rate": 0.00010662017094362496, + "loss": 0.9193, + "step": 14860 + }, + { + "epoch": 3.1700085324232083, + "grad_norm": 0.21737979288320822, + "learning_rate": 0.00010661214596233856, + "loss": 0.9049, + "step": 14861 + }, + { + "epoch": 3.170221843003413, + "grad_norm": 0.19557306324643048, + "learning_rate": 0.00010660412067993527, + "loss": 0.9214, + "step": 14862 + }, + { + "epoch": 3.1704351535836177, + "grad_norm": 0.23213921303316126, + "learning_rate": 0.00010659609509650591, + "loss": 0.8889, + "step": 14863 + }, + { + "epoch": 3.1706484641638224, + "grad_norm": 0.22990537545933032, + "learning_rate": 0.0001065880692121413, + "loss": 0.8867, + "step": 14864 + }, + { + "epoch": 3.1708617747440275, + "grad_norm": 0.2573260469191571, + "learning_rate": 0.00010658004302693222, + "loss": 0.8961, + "step": 14865 + }, + { + "epoch": 3.171075085324232, + "grad_norm": 0.21972026571378522, + "learning_rate": 0.0001065720165409695, + "loss": 0.8933, + "step": 14866 + }, + { + "epoch": 3.171288395904437, + "grad_norm": 0.20346825567491642, + "learning_rate": 0.000106563989754344, + "loss": 0.9385, + "step": 14867 + }, + { + "epoch": 3.1715017064846416, + "grad_norm": 0.22107935227917333, + "learning_rate": 0.00010655596266714649, + "loss": 0.9246, + "step": 14868 + }, + { + "epoch": 3.1717150170648463, + "grad_norm": 0.2654945645435002, + "learning_rate": 0.0001065479352794678, + "loss": 0.941, + "step": 14869 + }, + { + "epoch": 3.1719283276450514, + "grad_norm": 0.23999734332261233, + "learning_rate": 0.0001065399075913988, + "loss": 0.8844, + "step": 14870 + }, + { + "epoch": 3.172141638225256, + "grad_norm": 0.19938762142005853, + "learning_rate": 0.00010653187960303028, + "loss": 0.9251, + "step": 14871 + }, + { + "epoch": 3.1723549488054608, + "grad_norm": 0.2919058460721898, + "learning_rate": 0.00010652385131445309, + "loss": 0.9359, + "step": 14872 + }, + { + "epoch": 3.1725682593856654, + "grad_norm": 0.3395352213602784, + "learning_rate": 0.0001065158227257581, + "loss": 0.8899, + "step": 14873 + }, + { + "epoch": 3.17278156996587, + "grad_norm": 0.29482934472112415, + "learning_rate": 0.00010650779383703613, + "loss": 0.8839, + "step": 14874 + }, + { + "epoch": 3.1729948805460753, + "grad_norm": 0.19890583814994184, + "learning_rate": 0.00010649976464837803, + "loss": 0.9331, + "step": 14875 + }, + { + "epoch": 3.17320819112628, + "grad_norm": 0.2900511892126773, + "learning_rate": 0.00010649173515987462, + "loss": 0.9258, + "step": 14876 + }, + { + "epoch": 3.1734215017064846, + "grad_norm": 0.2542935860884813, + "learning_rate": 0.00010648370537161679, + "loss": 0.9287, + "step": 14877 + }, + { + "epoch": 3.1736348122866893, + "grad_norm": 0.22212885305536773, + "learning_rate": 0.0001064756752836954, + "loss": 0.9446, + "step": 14878 + }, + { + "epoch": 3.173848122866894, + "grad_norm": 0.2397172365227824, + "learning_rate": 0.0001064676448962013, + "loss": 0.9133, + "step": 14879 + }, + { + "epoch": 3.174061433447099, + "grad_norm": 0.19029020119658255, + "learning_rate": 0.00010645961420922534, + "loss": 0.9116, + "step": 14880 + }, + { + "epoch": 3.174274744027304, + "grad_norm": 0.7224672935597308, + "learning_rate": 0.00010645158322285842, + "loss": 0.9393, + "step": 14881 + }, + { + "epoch": 3.1744880546075085, + "grad_norm": 0.2596318941640243, + "learning_rate": 0.00010644355193719137, + "loss": 0.9273, + "step": 14882 + }, + { + "epoch": 3.174701365187713, + "grad_norm": 0.2883721458905465, + "learning_rate": 0.00010643552035231509, + "loss": 0.9128, + "step": 14883 + }, + { + "epoch": 3.1749146757679183, + "grad_norm": 0.32186152434137194, + "learning_rate": 0.00010642748846832047, + "loss": 0.9397, + "step": 14884 + }, + { + "epoch": 3.175127986348123, + "grad_norm": 0.2730620430042458, + "learning_rate": 0.00010641945628529833, + "loss": 0.9319, + "step": 14885 + }, + { + "epoch": 3.1753412969283277, + "grad_norm": 0.36389452149062446, + "learning_rate": 0.00010641142380333965, + "loss": 0.9131, + "step": 14886 + }, + { + "epoch": 3.1755546075085324, + "grad_norm": 0.2875141857433807, + "learning_rate": 0.00010640339102253525, + "loss": 0.918, + "step": 14887 + }, + { + "epoch": 3.175767918088737, + "grad_norm": 0.27205212638025833, + "learning_rate": 0.00010639535794297601, + "loss": 0.905, + "step": 14888 + }, + { + "epoch": 3.175981228668942, + "grad_norm": 0.3434022490612068, + "learning_rate": 0.00010638732456475286, + "loss": 0.9334, + "step": 14889 + }, + { + "epoch": 3.176194539249147, + "grad_norm": 0.3190576468154695, + "learning_rate": 0.00010637929088795671, + "loss": 0.9351, + "step": 14890 + }, + { + "epoch": 3.1764078498293515, + "grad_norm": 0.24855196137418376, + "learning_rate": 0.00010637125691267842, + "loss": 0.9402, + "step": 14891 + }, + { + "epoch": 3.176621160409556, + "grad_norm": 0.310903405480293, + "learning_rate": 0.00010636322263900892, + "loss": 0.9534, + "step": 14892 + }, + { + "epoch": 3.176834470989761, + "grad_norm": 0.28660134874488713, + "learning_rate": 0.00010635518806703912, + "loss": 0.9432, + "step": 14893 + }, + { + "epoch": 3.177047781569966, + "grad_norm": 0.2577084543081467, + "learning_rate": 0.00010634715319685992, + "loss": 0.926, + "step": 14894 + }, + { + "epoch": 3.1772610921501707, + "grad_norm": 0.2721966582151683, + "learning_rate": 0.00010633911802856222, + "loss": 0.8983, + "step": 14895 + }, + { + "epoch": 3.1774744027303754, + "grad_norm": 0.29914495033780264, + "learning_rate": 0.00010633108256223699, + "loss": 0.9566, + "step": 14896 + }, + { + "epoch": 3.17768771331058, + "grad_norm": 0.2827924193222019, + "learning_rate": 0.00010632304679797509, + "loss": 0.9199, + "step": 14897 + }, + { + "epoch": 3.1779010238907848, + "grad_norm": 0.22530755292248322, + "learning_rate": 0.0001063150107358675, + "loss": 0.9368, + "step": 14898 + }, + { + "epoch": 3.17811433447099, + "grad_norm": 0.20751695976017312, + "learning_rate": 0.00010630697437600511, + "loss": 0.9161, + "step": 14899 + }, + { + "epoch": 3.1783276450511946, + "grad_norm": 0.264736669767448, + "learning_rate": 0.00010629893771847884, + "loss": 0.9435, + "step": 14900 + }, + { + "epoch": 3.1785409556313993, + "grad_norm": 0.2496532847834462, + "learning_rate": 0.00010629090076337967, + "loss": 0.9116, + "step": 14901 + }, + { + "epoch": 3.178754266211604, + "grad_norm": 0.2500012308721754, + "learning_rate": 0.00010628286351079852, + "loss": 0.9149, + "step": 14902 + }, + { + "epoch": 3.178967576791809, + "grad_norm": 0.24804578307279423, + "learning_rate": 0.00010627482596082632, + "loss": 0.9311, + "step": 14903 + }, + { + "epoch": 3.1791808873720138, + "grad_norm": 0.243109285670866, + "learning_rate": 0.00010626678811355402, + "loss": 0.9105, + "step": 14904 + }, + { + "epoch": 3.1793941979522184, + "grad_norm": 0.278603654108003, + "learning_rate": 0.00010625874996907256, + "loss": 0.9393, + "step": 14905 + }, + { + "epoch": 3.179607508532423, + "grad_norm": 0.34123296401098724, + "learning_rate": 0.0001062507115274729, + "loss": 0.9206, + "step": 14906 + }, + { + "epoch": 3.179820819112628, + "grad_norm": 0.3098134146220605, + "learning_rate": 0.00010624267278884601, + "loss": 0.9183, + "step": 14907 + }, + { + "epoch": 3.180034129692833, + "grad_norm": 0.19503530484945752, + "learning_rate": 0.00010623463375328286, + "loss": 0.9267, + "step": 14908 + }, + { + "epoch": 3.1802474402730376, + "grad_norm": 0.27486777709786736, + "learning_rate": 0.00010622659442087436, + "loss": 0.8903, + "step": 14909 + }, + { + "epoch": 3.1804607508532423, + "grad_norm": 0.30504964777577964, + "learning_rate": 0.0001062185547917115, + "loss": 0.9259, + "step": 14910 + }, + { + "epoch": 3.180674061433447, + "grad_norm": 0.3602619461218083, + "learning_rate": 0.00010621051486588527, + "loss": 0.922, + "step": 14911 + }, + { + "epoch": 3.1808873720136517, + "grad_norm": 0.33287782801080223, + "learning_rate": 0.0001062024746434866, + "loss": 0.8925, + "step": 14912 + }, + { + "epoch": 3.181100682593857, + "grad_norm": 0.3617453765022946, + "learning_rate": 0.0001061944341246065, + "loss": 0.9231, + "step": 14913 + }, + { + "epoch": 3.1813139931740615, + "grad_norm": 0.32207408113539227, + "learning_rate": 0.00010618639330933595, + "loss": 0.9002, + "step": 14914 + }, + { + "epoch": 3.181527303754266, + "grad_norm": 0.2456856235029604, + "learning_rate": 0.0001061783521977659, + "loss": 0.9329, + "step": 14915 + }, + { + "epoch": 3.181740614334471, + "grad_norm": 0.23305034229508528, + "learning_rate": 0.00010617031078998737, + "loss": 0.8964, + "step": 14916 + }, + { + "epoch": 3.1819539249146755, + "grad_norm": 0.25933488672080907, + "learning_rate": 0.00010616226908609132, + "loss": 0.8802, + "step": 14917 + }, + { + "epoch": 3.1821672354948807, + "grad_norm": 0.27853205530098746, + "learning_rate": 0.00010615422708616873, + "loss": 0.9301, + "step": 14918 + }, + { + "epoch": 3.1823805460750854, + "grad_norm": 0.2694622457738746, + "learning_rate": 0.00010614618479031067, + "loss": 0.9292, + "step": 14919 + }, + { + "epoch": 3.18259385665529, + "grad_norm": 0.24195065867362542, + "learning_rate": 0.00010613814219860808, + "loss": 0.9058, + "step": 14920 + }, + { + "epoch": 3.1828071672354947, + "grad_norm": 0.17103784899063537, + "learning_rate": 0.00010613009931115197, + "loss": 0.9007, + "step": 14921 + }, + { + "epoch": 3.1830204778157, + "grad_norm": 0.2287200972667561, + "learning_rate": 0.00010612205612803332, + "loss": 0.9272, + "step": 14922 + }, + { + "epoch": 3.1832337883959045, + "grad_norm": 0.2553916552212111, + "learning_rate": 0.0001061140126493432, + "loss": 0.9224, + "step": 14923 + }, + { + "epoch": 3.1834470989761092, + "grad_norm": 0.2721406750732867, + "learning_rate": 0.00010610596887517254, + "loss": 0.9215, + "step": 14924 + }, + { + "epoch": 3.183660409556314, + "grad_norm": 0.3090400342663582, + "learning_rate": 0.00010609792480561247, + "loss": 0.8876, + "step": 14925 + }, + { + "epoch": 3.1838737201365186, + "grad_norm": 0.29386749440793214, + "learning_rate": 0.00010608988044075389, + "loss": 0.9111, + "step": 14926 + }, + { + "epoch": 3.1840870307167237, + "grad_norm": 0.24805402995386341, + "learning_rate": 0.0001060818357806879, + "loss": 0.9192, + "step": 14927 + }, + { + "epoch": 3.1843003412969284, + "grad_norm": 0.16748416374117664, + "learning_rate": 0.00010607379082550546, + "loss": 0.921, + "step": 14928 + }, + { + "epoch": 3.184513651877133, + "grad_norm": 0.30892097032727284, + "learning_rate": 0.00010606574557529769, + "loss": 0.9183, + "step": 14929 + }, + { + "epoch": 3.1847269624573378, + "grad_norm": 0.3022872275076786, + "learning_rate": 0.00010605770003015552, + "loss": 0.9194, + "step": 14930 + }, + { + "epoch": 3.1849402730375425, + "grad_norm": 0.23003142990351894, + "learning_rate": 0.00010604965419017011, + "loss": 0.9131, + "step": 14931 + }, + { + "epoch": 3.1851535836177476, + "grad_norm": 0.25242746252053483, + "learning_rate": 0.00010604160805543235, + "loss": 0.9423, + "step": 14932 + }, + { + "epoch": 3.1853668941979523, + "grad_norm": 0.2825007856875137, + "learning_rate": 0.00010603356162603339, + "loss": 0.9261, + "step": 14933 + }, + { + "epoch": 3.185580204778157, + "grad_norm": 0.32418727191658675, + "learning_rate": 0.00010602551490206424, + "loss": 0.9331, + "step": 14934 + }, + { + "epoch": 3.1857935153583616, + "grad_norm": 0.3019887933980289, + "learning_rate": 0.00010601746788361596, + "loss": 0.9504, + "step": 14935 + }, + { + "epoch": 3.1860068259385663, + "grad_norm": 0.26793484244573873, + "learning_rate": 0.00010600942057077956, + "loss": 0.915, + "step": 14936 + }, + { + "epoch": 3.1862201365187715, + "grad_norm": 0.27299419771854494, + "learning_rate": 0.00010600137296364618, + "loss": 0.9064, + "step": 14937 + }, + { + "epoch": 3.186433447098976, + "grad_norm": 0.206059212719793, + "learning_rate": 0.00010599332506230678, + "loss": 0.9458, + "step": 14938 + }, + { + "epoch": 3.186646757679181, + "grad_norm": 0.23819972234367323, + "learning_rate": 0.00010598527686685249, + "loss": 0.9419, + "step": 14939 + }, + { + "epoch": 3.1868600682593855, + "grad_norm": 0.24517782422003864, + "learning_rate": 0.00010597722837737435, + "loss": 0.9154, + "step": 14940 + }, + { + "epoch": 3.1870733788395906, + "grad_norm": 0.19857619348287364, + "learning_rate": 0.00010596917959396344, + "loss": 0.9025, + "step": 14941 + }, + { + "epoch": 3.1872866894197953, + "grad_norm": 0.23794338444156085, + "learning_rate": 0.00010596113051671083, + "loss": 0.9325, + "step": 14942 + }, + { + "epoch": 3.1875, + "grad_norm": 0.2588132977160279, + "learning_rate": 0.00010595308114570758, + "loss": 0.9293, + "step": 14943 + }, + { + "epoch": 3.1877133105802047, + "grad_norm": 0.211316946897708, + "learning_rate": 0.00010594503148104478, + "loss": 0.9149, + "step": 14944 + }, + { + "epoch": 3.1879266211604094, + "grad_norm": 0.1938633179351432, + "learning_rate": 0.00010593698152281353, + "loss": 0.9468, + "step": 14945 + }, + { + "epoch": 3.1881399317406145, + "grad_norm": 0.22789315117051973, + "learning_rate": 0.00010592893127110486, + "loss": 0.9151, + "step": 14946 + }, + { + "epoch": 3.188353242320819, + "grad_norm": 0.3970139182034834, + "learning_rate": 0.00010592088072600995, + "loss": 0.9297, + "step": 14947 + }, + { + "epoch": 3.188566552901024, + "grad_norm": 0.26136147103315777, + "learning_rate": 0.00010591282988761979, + "loss": 0.903, + "step": 14948 + }, + { + "epoch": 3.1887798634812285, + "grad_norm": 0.32512012270591467, + "learning_rate": 0.00010590477875602556, + "loss": 0.9287, + "step": 14949 + }, + { + "epoch": 3.1889931740614337, + "grad_norm": 0.3267266342679507, + "learning_rate": 0.0001058967273313183, + "loss": 0.9105, + "step": 14950 + }, + { + "epoch": 3.1892064846416384, + "grad_norm": 0.32675536955712936, + "learning_rate": 0.00010588867561358915, + "loss": 0.9238, + "step": 14951 + }, + { + "epoch": 3.189419795221843, + "grad_norm": 0.29093246869904515, + "learning_rate": 0.00010588062360292917, + "loss": 0.9238, + "step": 14952 + }, + { + "epoch": 3.1896331058020477, + "grad_norm": 0.3031853803393525, + "learning_rate": 0.00010587257129942956, + "loss": 0.9096, + "step": 14953 + }, + { + "epoch": 3.1898464163822524, + "grad_norm": 0.30737295285076915, + "learning_rate": 0.00010586451870318132, + "loss": 0.9043, + "step": 14954 + }, + { + "epoch": 3.1900597269624575, + "grad_norm": 0.2629542693722116, + "learning_rate": 0.00010585646581427562, + "loss": 0.9069, + "step": 14955 + }, + { + "epoch": 3.1902730375426622, + "grad_norm": 0.25193437180952744, + "learning_rate": 0.00010584841263280357, + "loss": 0.9119, + "step": 14956 + }, + { + "epoch": 3.190486348122867, + "grad_norm": 0.26411967721244767, + "learning_rate": 0.00010584035915885632, + "loss": 0.9153, + "step": 14957 + }, + { + "epoch": 3.1906996587030716, + "grad_norm": 0.2539885387972006, + "learning_rate": 0.00010583230539252495, + "loss": 0.9027, + "step": 14958 + }, + { + "epoch": 3.1909129692832763, + "grad_norm": 0.24550070963700185, + "learning_rate": 0.00010582425133390063, + "loss": 0.9316, + "step": 14959 + }, + { + "epoch": 3.1911262798634814, + "grad_norm": 0.23985523462108163, + "learning_rate": 0.00010581619698307447, + "loss": 0.9116, + "step": 14960 + }, + { + "epoch": 3.191339590443686, + "grad_norm": 0.24746665837221157, + "learning_rate": 0.00010580814234013759, + "loss": 0.9417, + "step": 14961 + }, + { + "epoch": 3.1915529010238908, + "grad_norm": 0.2424901511173345, + "learning_rate": 0.00010580008740518115, + "loss": 0.9472, + "step": 14962 + }, + { + "epoch": 3.1917662116040955, + "grad_norm": 0.20658162129347699, + "learning_rate": 0.00010579203217829629, + "loss": 0.8957, + "step": 14963 + }, + { + "epoch": 3.1919795221843, + "grad_norm": 0.24138462736839542, + "learning_rate": 0.00010578397665957414, + "loss": 0.9396, + "step": 14964 + }, + { + "epoch": 3.1921928327645053, + "grad_norm": 0.28928420438508345, + "learning_rate": 0.0001057759208491059, + "loss": 0.9143, + "step": 14965 + }, + { + "epoch": 3.19240614334471, + "grad_norm": 0.23220200772553667, + "learning_rate": 0.00010576786474698264, + "loss": 0.9254, + "step": 14966 + }, + { + "epoch": 3.1926194539249146, + "grad_norm": 0.19043777541721457, + "learning_rate": 0.00010575980835329557, + "loss": 0.9303, + "step": 14967 + }, + { + "epoch": 3.1928327645051193, + "grad_norm": 0.21737494613677358, + "learning_rate": 0.00010575175166813584, + "loss": 0.9126, + "step": 14968 + }, + { + "epoch": 3.1930460750853245, + "grad_norm": 0.23462388113857754, + "learning_rate": 0.0001057436946915946, + "loss": 0.9301, + "step": 14969 + }, + { + "epoch": 3.193259385665529, + "grad_norm": 0.2750950399312132, + "learning_rate": 0.00010573563742376304, + "loss": 0.9249, + "step": 14970 + }, + { + "epoch": 3.193472696245734, + "grad_norm": 0.257578585577835, + "learning_rate": 0.00010572757986473228, + "loss": 0.9033, + "step": 14971 + }, + { + "epoch": 3.1936860068259385, + "grad_norm": 0.206989621025199, + "learning_rate": 0.00010571952201459353, + "loss": 0.8966, + "step": 14972 + }, + { + "epoch": 3.193899317406143, + "grad_norm": 0.2557370468028468, + "learning_rate": 0.00010571146387343797, + "loss": 0.9207, + "step": 14973 + }, + { + "epoch": 3.1941126279863483, + "grad_norm": 0.21892187982207914, + "learning_rate": 0.00010570340544135674, + "loss": 0.9338, + "step": 14974 + }, + { + "epoch": 3.194325938566553, + "grad_norm": 0.2548799851826879, + "learning_rate": 0.00010569534671844108, + "loss": 0.9234, + "step": 14975 + }, + { + "epoch": 3.1945392491467577, + "grad_norm": 0.2847033906027049, + "learning_rate": 0.00010568728770478212, + "loss": 0.9067, + "step": 14976 + }, + { + "epoch": 3.1947525597269624, + "grad_norm": 0.2520756238348573, + "learning_rate": 0.00010567922840047108, + "loss": 0.9315, + "step": 14977 + }, + { + "epoch": 3.194965870307167, + "grad_norm": 0.22620628772837534, + "learning_rate": 0.00010567116880559911, + "loss": 0.9026, + "step": 14978 + }, + { + "epoch": 3.195179180887372, + "grad_norm": 0.18818305934615145, + "learning_rate": 0.00010566310892025744, + "loss": 0.9008, + "step": 14979 + }, + { + "epoch": 3.195392491467577, + "grad_norm": 0.2042766312944654, + "learning_rate": 0.00010565504874453727, + "loss": 0.9007, + "step": 14980 + }, + { + "epoch": 3.1956058020477816, + "grad_norm": 0.3002695277535274, + "learning_rate": 0.00010564698827852981, + "loss": 0.9532, + "step": 14981 + }, + { + "epoch": 3.1958191126279862, + "grad_norm": 0.36672166156904795, + "learning_rate": 0.00010563892752232623, + "loss": 0.9097, + "step": 14982 + }, + { + "epoch": 3.196032423208191, + "grad_norm": 0.25650798202588343, + "learning_rate": 0.00010563086647601776, + "loss": 0.9159, + "step": 14983 + }, + { + "epoch": 3.196245733788396, + "grad_norm": 0.23829151002437032, + "learning_rate": 0.00010562280513969557, + "loss": 0.9144, + "step": 14984 + }, + { + "epoch": 3.1964590443686007, + "grad_norm": 0.3045296122541189, + "learning_rate": 0.00010561474351345094, + "loss": 0.9441, + "step": 14985 + }, + { + "epoch": 3.1966723549488054, + "grad_norm": 0.28352470234148297, + "learning_rate": 0.00010560668159737504, + "loss": 0.9508, + "step": 14986 + }, + { + "epoch": 3.19688566552901, + "grad_norm": 0.3196555391424945, + "learning_rate": 0.00010559861939155915, + "loss": 0.8849, + "step": 14987 + }, + { + "epoch": 3.1970989761092152, + "grad_norm": 0.28638466970907045, + "learning_rate": 0.00010559055689609441, + "loss": 0.9377, + "step": 14988 + }, + { + "epoch": 3.19731228668942, + "grad_norm": 0.23225668061397675, + "learning_rate": 0.00010558249411107209, + "loss": 0.9295, + "step": 14989 + }, + { + "epoch": 3.1975255972696246, + "grad_norm": 0.2018617461166211, + "learning_rate": 0.00010557443103658342, + "loss": 0.9429, + "step": 14990 + }, + { + "epoch": 3.1977389078498293, + "grad_norm": 0.2643778630947104, + "learning_rate": 0.00010556636767271965, + "loss": 0.9484, + "step": 14991 + }, + { + "epoch": 3.197952218430034, + "grad_norm": 0.30791322606042404, + "learning_rate": 0.00010555830401957199, + "loss": 0.8817, + "step": 14992 + }, + { + "epoch": 3.198165529010239, + "grad_norm": 0.3145601378254888, + "learning_rate": 0.0001055502400772317, + "loss": 0.9048, + "step": 14993 + }, + { + "epoch": 3.198378839590444, + "grad_norm": 0.3046378155685759, + "learning_rate": 0.00010554217584579001, + "loss": 0.894, + "step": 14994 + }, + { + "epoch": 3.1985921501706485, + "grad_norm": 0.26983651802127745, + "learning_rate": 0.00010553411132533816, + "loss": 0.8943, + "step": 14995 + }, + { + "epoch": 3.198805460750853, + "grad_norm": 0.24878533429595318, + "learning_rate": 0.00010552604651596741, + "loss": 0.9423, + "step": 14996 + }, + { + "epoch": 3.199018771331058, + "grad_norm": 0.2107249510085241, + "learning_rate": 0.00010551798141776902, + "loss": 0.8958, + "step": 14997 + }, + { + "epoch": 3.199232081911263, + "grad_norm": 0.20783832225465956, + "learning_rate": 0.00010550991603083424, + "loss": 0.8909, + "step": 14998 + }, + { + "epoch": 3.1994453924914676, + "grad_norm": 0.3193139139169396, + "learning_rate": 0.00010550185035525434, + "loss": 0.9278, + "step": 14999 + }, + { + "epoch": 3.1996587030716723, + "grad_norm": 0.3345494517612536, + "learning_rate": 0.00010549378439112055, + "loss": 0.9425, + "step": 15000 + }, + { + "epoch": 3.199872013651877, + "grad_norm": 0.2906106816534745, + "learning_rate": 0.00010548571813852418, + "loss": 0.9036, + "step": 15001 + }, + { + "epoch": 3.2000853242320817, + "grad_norm": 0.21745113485677847, + "learning_rate": 0.00010547765159755648, + "loss": 0.8725, + "step": 15002 + }, + { + "epoch": 3.200298634812287, + "grad_norm": 0.20631069827551354, + "learning_rate": 0.00010546958476830873, + "loss": 0.8956, + "step": 15003 + }, + { + "epoch": 3.2005119453924915, + "grad_norm": 0.21227308174100856, + "learning_rate": 0.0001054615176508722, + "loss": 0.9352, + "step": 15004 + }, + { + "epoch": 3.200725255972696, + "grad_norm": 0.18189390918461265, + "learning_rate": 0.00010545345024533815, + "loss": 0.9161, + "step": 15005 + }, + { + "epoch": 3.200938566552901, + "grad_norm": 0.2119885218957302, + "learning_rate": 0.00010544538255179789, + "loss": 0.9413, + "step": 15006 + }, + { + "epoch": 3.201151877133106, + "grad_norm": 0.23257841494892856, + "learning_rate": 0.0001054373145703427, + "loss": 0.9072, + "step": 15007 + }, + { + "epoch": 3.2013651877133107, + "grad_norm": 0.2365758740583184, + "learning_rate": 0.00010542924630106385, + "loss": 0.9235, + "step": 15008 + }, + { + "epoch": 3.2015784982935154, + "grad_norm": 0.20341342937581156, + "learning_rate": 0.0001054211777440527, + "loss": 0.8945, + "step": 15009 + }, + { + "epoch": 3.20179180887372, + "grad_norm": 0.20455418536753944, + "learning_rate": 0.00010541310889940046, + "loss": 0.9107, + "step": 15010 + }, + { + "epoch": 3.2020051194539247, + "grad_norm": 0.2028046689770077, + "learning_rate": 0.00010540503976719848, + "loss": 0.9335, + "step": 15011 + }, + { + "epoch": 3.20221843003413, + "grad_norm": 0.2074296525144755, + "learning_rate": 0.00010539697034753802, + "loss": 0.9329, + "step": 15012 + }, + { + "epoch": 3.2024317406143346, + "grad_norm": 0.23267691079517766, + "learning_rate": 0.00010538890064051043, + "loss": 0.9264, + "step": 15013 + }, + { + "epoch": 3.2026450511945392, + "grad_norm": 0.25527809434149007, + "learning_rate": 0.00010538083064620699, + "loss": 0.9316, + "step": 15014 + }, + { + "epoch": 3.202858361774744, + "grad_norm": 0.22884273662092106, + "learning_rate": 0.00010537276036471907, + "loss": 0.9327, + "step": 15015 + }, + { + "epoch": 3.2030716723549486, + "grad_norm": 0.22152039194131823, + "learning_rate": 0.0001053646897961379, + "loss": 0.8915, + "step": 15016 + }, + { + "epoch": 3.2032849829351537, + "grad_norm": 0.24128857230432763, + "learning_rate": 0.00010535661894055484, + "loss": 0.9262, + "step": 15017 + }, + { + "epoch": 3.2034982935153584, + "grad_norm": 0.43504812400151127, + "learning_rate": 0.00010534854779806122, + "loss": 0.9077, + "step": 15018 + }, + { + "epoch": 3.203711604095563, + "grad_norm": 0.26378994155300006, + "learning_rate": 0.00010534047636874835, + "loss": 0.9142, + "step": 15019 + }, + { + "epoch": 3.203924914675768, + "grad_norm": 0.3327301616300579, + "learning_rate": 0.00010533240465270756, + "loss": 0.9685, + "step": 15020 + }, + { + "epoch": 3.2041382252559725, + "grad_norm": 0.3331577490897741, + "learning_rate": 0.00010532433265003021, + "loss": 0.9144, + "step": 15021 + }, + { + "epoch": 3.2043515358361776, + "grad_norm": 0.25129921642114583, + "learning_rate": 0.00010531626036080759, + "loss": 0.9168, + "step": 15022 + }, + { + "epoch": 3.2045648464163823, + "grad_norm": 0.26833371984495, + "learning_rate": 0.00010530818778513106, + "loss": 0.9396, + "step": 15023 + }, + { + "epoch": 3.204778156996587, + "grad_norm": 0.3244372320961423, + "learning_rate": 0.00010530011492309193, + "loss": 0.9587, + "step": 15024 + }, + { + "epoch": 3.2049914675767917, + "grad_norm": 0.28820837555982265, + "learning_rate": 0.00010529204177478161, + "loss": 0.9044, + "step": 15025 + }, + { + "epoch": 3.205204778156997, + "grad_norm": 0.27329013912740724, + "learning_rate": 0.00010528396834029143, + "loss": 0.9209, + "step": 15026 + }, + { + "epoch": 3.2054180887372015, + "grad_norm": 0.3157121758497693, + "learning_rate": 0.0001052758946197127, + "loss": 0.9437, + "step": 15027 + }, + { + "epoch": 3.205631399317406, + "grad_norm": 0.519018334261617, + "learning_rate": 0.00010526782061313678, + "loss": 0.9627, + "step": 15028 + }, + { + "epoch": 3.205844709897611, + "grad_norm": 0.22011555478789524, + "learning_rate": 0.00010525974632065506, + "loss": 0.9085, + "step": 15029 + }, + { + "epoch": 3.2060580204778155, + "grad_norm": 0.3091450589962283, + "learning_rate": 0.00010525167174235887, + "loss": 0.9032, + "step": 15030 + }, + { + "epoch": 3.2062713310580206, + "grad_norm": 0.28334713717919335, + "learning_rate": 0.00010524359687833962, + "loss": 0.9322, + "step": 15031 + }, + { + "epoch": 3.2064846416382253, + "grad_norm": 0.3064084667311317, + "learning_rate": 0.00010523552172868863, + "loss": 0.9291, + "step": 15032 + }, + { + "epoch": 3.20669795221843, + "grad_norm": 0.2822451750073967, + "learning_rate": 0.0001052274462934973, + "loss": 0.9092, + "step": 15033 + }, + { + "epoch": 3.2069112627986347, + "grad_norm": 0.2594731016026407, + "learning_rate": 0.00010521937057285697, + "loss": 0.9284, + "step": 15034 + }, + { + "epoch": 3.2071245733788394, + "grad_norm": 0.34897975682319493, + "learning_rate": 0.00010521129456685904, + "loss": 0.9593, + "step": 15035 + }, + { + "epoch": 3.2073378839590445, + "grad_norm": 0.36125604893757746, + "learning_rate": 0.0001052032182755949, + "loss": 0.9339, + "step": 15036 + }, + { + "epoch": 3.207551194539249, + "grad_norm": 0.26136167245437186, + "learning_rate": 0.00010519514169915593, + "loss": 0.9463, + "step": 15037 + }, + { + "epoch": 3.207764505119454, + "grad_norm": 0.29390368148758567, + "learning_rate": 0.00010518706483763348, + "loss": 0.9487, + "step": 15038 + }, + { + "epoch": 3.2079778156996586, + "grad_norm": 0.2960847669851117, + "learning_rate": 0.000105178987691119, + "loss": 0.9167, + "step": 15039 + }, + { + "epoch": 3.2081911262798632, + "grad_norm": 0.2795741140789984, + "learning_rate": 0.0001051709102597038, + "loss": 0.9019, + "step": 15040 + }, + { + "epoch": 3.2084044368600684, + "grad_norm": 1.2154937669150567, + "learning_rate": 0.00010516283254347937, + "loss": 0.9607, + "step": 15041 + }, + { + "epoch": 3.208617747440273, + "grad_norm": 0.38314120409826724, + "learning_rate": 0.00010515475454253704, + "loss": 0.9251, + "step": 15042 + }, + { + "epoch": 3.2088310580204777, + "grad_norm": 0.37454396631408776, + "learning_rate": 0.0001051466762569683, + "loss": 0.9282, + "step": 15043 + }, + { + "epoch": 3.2090443686006824, + "grad_norm": 0.4023957822393742, + "learning_rate": 0.00010513859768686443, + "loss": 0.9493, + "step": 15044 + }, + { + "epoch": 3.2092576791808876, + "grad_norm": 0.3275458369692838, + "learning_rate": 0.00010513051883231694, + "loss": 0.9102, + "step": 15045 + }, + { + "epoch": 3.2094709897610922, + "grad_norm": 0.3583766329162235, + "learning_rate": 0.00010512243969341717, + "loss": 0.9265, + "step": 15046 + }, + { + "epoch": 3.209684300341297, + "grad_norm": 0.38159536500846714, + "learning_rate": 0.0001051143602702566, + "loss": 0.9495, + "step": 15047 + }, + { + "epoch": 3.2098976109215016, + "grad_norm": 0.34943993012032887, + "learning_rate": 0.0001051062805629266, + "loss": 0.9559, + "step": 15048 + }, + { + "epoch": 3.2101109215017063, + "grad_norm": 0.3261434464513141, + "learning_rate": 0.00010509820057151865, + "loss": 0.903, + "step": 15049 + }, + { + "epoch": 3.2103242320819114, + "grad_norm": 0.35348100122380155, + "learning_rate": 0.00010509012029612411, + "loss": 0.9148, + "step": 15050 + }, + { + "epoch": 3.210537542662116, + "grad_norm": 0.41749131252455346, + "learning_rate": 0.00010508203973683444, + "loss": 0.9535, + "step": 15051 + }, + { + "epoch": 3.210750853242321, + "grad_norm": 0.31908253311297713, + "learning_rate": 0.00010507395889374106, + "loss": 0.946, + "step": 15052 + }, + { + "epoch": 3.2109641638225255, + "grad_norm": 0.33694863150054694, + "learning_rate": 0.00010506587776693544, + "loss": 0.9438, + "step": 15053 + }, + { + "epoch": 3.2111774744027306, + "grad_norm": 0.41075406556304583, + "learning_rate": 0.00010505779635650898, + "loss": 0.9149, + "step": 15054 + }, + { + "epoch": 3.2113907849829353, + "grad_norm": 0.4533532992091694, + "learning_rate": 0.00010504971466255314, + "loss": 0.9423, + "step": 15055 + }, + { + "epoch": 3.21160409556314, + "grad_norm": 0.5707894494926351, + "learning_rate": 0.00010504163268515932, + "loss": 0.9291, + "step": 15056 + }, + { + "epoch": 3.2118174061433447, + "grad_norm": 0.4458124826588269, + "learning_rate": 0.00010503355042441904, + "loss": 0.9261, + "step": 15057 + }, + { + "epoch": 3.2120307167235493, + "grad_norm": 0.37115765006577245, + "learning_rate": 0.0001050254678804237, + "loss": 0.9044, + "step": 15058 + }, + { + "epoch": 3.2122440273037545, + "grad_norm": 0.33509855699930297, + "learning_rate": 0.0001050173850532648, + "loss": 0.9424, + "step": 15059 + }, + { + "epoch": 3.212457337883959, + "grad_norm": 0.3120767065272005, + "learning_rate": 0.00010500930194303375, + "loss": 0.9319, + "step": 15060 + }, + { + "epoch": 3.212670648464164, + "grad_norm": 0.28723203670187447, + "learning_rate": 0.00010500121854982202, + "loss": 0.9438, + "step": 15061 + }, + { + "epoch": 3.2128839590443685, + "grad_norm": 0.3384934117245678, + "learning_rate": 0.00010499313487372107, + "loss": 0.9392, + "step": 15062 + }, + { + "epoch": 3.213097269624573, + "grad_norm": 0.42497773627509167, + "learning_rate": 0.0001049850509148224, + "loss": 0.8803, + "step": 15063 + }, + { + "epoch": 3.2133105802047783, + "grad_norm": 0.401298534073313, + "learning_rate": 0.00010497696667321744, + "loss": 0.9384, + "step": 15064 + }, + { + "epoch": 3.213523890784983, + "grad_norm": 0.3081833479578697, + "learning_rate": 0.0001049688821489977, + "loss": 0.9546, + "step": 15065 + }, + { + "epoch": 3.2137372013651877, + "grad_norm": 0.28566957139934857, + "learning_rate": 0.00010496079734225464, + "loss": 0.9159, + "step": 15066 + }, + { + "epoch": 3.2139505119453924, + "grad_norm": 0.22936757742865782, + "learning_rate": 0.00010495271225307972, + "loss": 0.9337, + "step": 15067 + }, + { + "epoch": 3.214163822525597, + "grad_norm": 0.21119334012165622, + "learning_rate": 0.00010494462688156446, + "loss": 0.9419, + "step": 15068 + }, + { + "epoch": 3.214377133105802, + "grad_norm": 0.22438904164207374, + "learning_rate": 0.00010493654122780032, + "loss": 0.9187, + "step": 15069 + }, + { + "epoch": 3.214590443686007, + "grad_norm": 0.2424066135182687, + "learning_rate": 0.00010492845529187878, + "loss": 0.9606, + "step": 15070 + }, + { + "epoch": 3.2148037542662116, + "grad_norm": 0.2183118211167899, + "learning_rate": 0.00010492036907389135, + "loss": 0.944, + "step": 15071 + }, + { + "epoch": 3.2150170648464163, + "grad_norm": 0.25143695552546746, + "learning_rate": 0.00010491228257392955, + "loss": 0.9142, + "step": 15072 + }, + { + "epoch": 3.2152303754266214, + "grad_norm": 0.2535132698133515, + "learning_rate": 0.00010490419579208485, + "loss": 0.9003, + "step": 15073 + }, + { + "epoch": 3.215443686006826, + "grad_norm": 0.27016200875942953, + "learning_rate": 0.00010489610872844875, + "loss": 0.9181, + "step": 15074 + }, + { + "epoch": 3.2156569965870307, + "grad_norm": 0.23739847527907437, + "learning_rate": 0.00010488802138311276, + "loss": 0.9429, + "step": 15075 + }, + { + "epoch": 3.2158703071672354, + "grad_norm": 0.2587632734258729, + "learning_rate": 0.00010487993375616837, + "loss": 0.9169, + "step": 15076 + }, + { + "epoch": 3.21608361774744, + "grad_norm": 0.2547391894195866, + "learning_rate": 0.00010487184584770714, + "loss": 0.9159, + "step": 15077 + }, + { + "epoch": 3.2162969283276452, + "grad_norm": 0.23987044558809711, + "learning_rate": 0.00010486375765782056, + "loss": 0.9193, + "step": 15078 + }, + { + "epoch": 3.21651023890785, + "grad_norm": 0.23278166619683746, + "learning_rate": 0.00010485566918660013, + "loss": 0.9059, + "step": 15079 + }, + { + "epoch": 3.2167235494880546, + "grad_norm": 0.2976505003023815, + "learning_rate": 0.00010484758043413738, + "loss": 0.942, + "step": 15080 + }, + { + "epoch": 3.2169368600682593, + "grad_norm": 0.3268404866368607, + "learning_rate": 0.00010483949140052386, + "loss": 0.9263, + "step": 15081 + }, + { + "epoch": 3.217150170648464, + "grad_norm": 0.3203951414701456, + "learning_rate": 0.00010483140208585106, + "loss": 0.8912, + "step": 15082 + }, + { + "epoch": 3.217363481228669, + "grad_norm": 0.23435111311030507, + "learning_rate": 0.00010482331249021052, + "loss": 0.8867, + "step": 15083 + }, + { + "epoch": 3.217576791808874, + "grad_norm": 0.2773818173961909, + "learning_rate": 0.00010481522261369379, + "loss": 0.9217, + "step": 15084 + }, + { + "epoch": 3.2177901023890785, + "grad_norm": 0.2765624394778277, + "learning_rate": 0.00010480713245639241, + "loss": 0.8999, + "step": 15085 + }, + { + "epoch": 3.218003412969283, + "grad_norm": 0.26408643784901126, + "learning_rate": 0.00010479904201839787, + "loss": 0.9037, + "step": 15086 + }, + { + "epoch": 3.218216723549488, + "grad_norm": 0.27281409219649955, + "learning_rate": 0.0001047909512998018, + "loss": 0.916, + "step": 15087 + }, + { + "epoch": 3.218430034129693, + "grad_norm": 0.23240401074501138, + "learning_rate": 0.00010478286030069567, + "loss": 0.9028, + "step": 15088 + }, + { + "epoch": 3.2186433447098977, + "grad_norm": 0.24344464237771948, + "learning_rate": 0.00010477476902117107, + "loss": 0.932, + "step": 15089 + }, + { + "epoch": 3.2188566552901023, + "grad_norm": 0.2745468378699692, + "learning_rate": 0.00010476667746131954, + "loss": 0.931, + "step": 15090 + }, + { + "epoch": 3.219069965870307, + "grad_norm": 0.21335825541893094, + "learning_rate": 0.00010475858562123265, + "loss": 0.9197, + "step": 15091 + }, + { + "epoch": 3.219283276450512, + "grad_norm": 0.18796035248989856, + "learning_rate": 0.00010475049350100192, + "loss": 0.8863, + "step": 15092 + }, + { + "epoch": 3.219496587030717, + "grad_norm": 0.23843818967523633, + "learning_rate": 0.00010474240110071896, + "loss": 0.9151, + "step": 15093 + }, + { + "epoch": 3.2197098976109215, + "grad_norm": 0.2811512606400257, + "learning_rate": 0.00010473430842047532, + "loss": 0.8959, + "step": 15094 + }, + { + "epoch": 3.219923208191126, + "grad_norm": 0.3141675259622372, + "learning_rate": 0.00010472621546036254, + "loss": 0.9348, + "step": 15095 + }, + { + "epoch": 3.220136518771331, + "grad_norm": 0.28786561440174036, + "learning_rate": 0.00010471812222047224, + "loss": 0.923, + "step": 15096 + }, + { + "epoch": 3.220349829351536, + "grad_norm": 0.27978399096923484, + "learning_rate": 0.00010471002870089597, + "loss": 0.9146, + "step": 15097 + }, + { + "epoch": 3.2205631399317407, + "grad_norm": 0.2846615158568675, + "learning_rate": 0.00010470193490172528, + "loss": 0.8894, + "step": 15098 + }, + { + "epoch": 3.2207764505119454, + "grad_norm": 0.18055498118837965, + "learning_rate": 0.00010469384082305182, + "loss": 0.9416, + "step": 15099 + }, + { + "epoch": 3.22098976109215, + "grad_norm": 0.28441368179346327, + "learning_rate": 0.00010468574646496711, + "loss": 0.9146, + "step": 15100 + }, + { + "epoch": 3.2212030716723548, + "grad_norm": 0.3239562401591524, + "learning_rate": 0.00010467765182756276, + "loss": 0.9336, + "step": 15101 + }, + { + "epoch": 3.22141638225256, + "grad_norm": 0.2675577108950843, + "learning_rate": 0.00010466955691093039, + "loss": 0.9317, + "step": 15102 + }, + { + "epoch": 3.2216296928327646, + "grad_norm": 0.3459576001891592, + "learning_rate": 0.00010466146171516155, + "loss": 0.9274, + "step": 15103 + }, + { + "epoch": 3.2218430034129693, + "grad_norm": 0.27335941189663354, + "learning_rate": 0.00010465336624034788, + "loss": 0.8885, + "step": 15104 + }, + { + "epoch": 3.222056313993174, + "grad_norm": 0.20144670511192606, + "learning_rate": 0.00010464527048658093, + "loss": 0.9289, + "step": 15105 + }, + { + "epoch": 3.2222696245733786, + "grad_norm": 0.2556849568660395, + "learning_rate": 0.00010463717445395234, + "loss": 0.9312, + "step": 15106 + }, + { + "epoch": 3.2224829351535837, + "grad_norm": 0.26090054303934895, + "learning_rate": 0.0001046290781425537, + "loss": 0.9435, + "step": 15107 + }, + { + "epoch": 3.2226962457337884, + "grad_norm": 0.2529723825526319, + "learning_rate": 0.00010462098155247664, + "loss": 0.9296, + "step": 15108 + }, + { + "epoch": 3.222909556313993, + "grad_norm": 0.29635453780651444, + "learning_rate": 0.00010461288468381273, + "loss": 0.9007, + "step": 15109 + }, + { + "epoch": 3.223122866894198, + "grad_norm": 0.28835809564840736, + "learning_rate": 0.00010460478753665366, + "loss": 0.9166, + "step": 15110 + }, + { + "epoch": 3.223336177474403, + "grad_norm": 0.24209180261046018, + "learning_rate": 0.00010459669011109097, + "loss": 0.9253, + "step": 15111 + }, + { + "epoch": 3.2235494880546076, + "grad_norm": 0.31110190418726097, + "learning_rate": 0.00010458859240721632, + "loss": 0.9152, + "step": 15112 + }, + { + "epoch": 3.2237627986348123, + "grad_norm": 0.31553612067128856, + "learning_rate": 0.00010458049442512133, + "loss": 0.9205, + "step": 15113 + }, + { + "epoch": 3.223976109215017, + "grad_norm": 0.3653849594402948, + "learning_rate": 0.00010457239616489767, + "loss": 0.9193, + "step": 15114 + }, + { + "epoch": 3.2241894197952217, + "grad_norm": 0.3712423864798894, + "learning_rate": 0.00010456429762663689, + "loss": 0.9277, + "step": 15115 + }, + { + "epoch": 3.224402730375427, + "grad_norm": 0.2902836023763064, + "learning_rate": 0.0001045561988104307, + "loss": 0.9499, + "step": 15116 + }, + { + "epoch": 3.2246160409556315, + "grad_norm": 0.3755655012524581, + "learning_rate": 0.00010454809971637069, + "loss": 0.9385, + "step": 15117 + }, + { + "epoch": 3.224829351535836, + "grad_norm": 0.3851412714579881, + "learning_rate": 0.00010454000034454853, + "loss": 0.9159, + "step": 15118 + }, + { + "epoch": 3.225042662116041, + "grad_norm": 0.43316687663778564, + "learning_rate": 0.00010453190069505583, + "loss": 0.9319, + "step": 15119 + }, + { + "epoch": 3.2252559726962455, + "grad_norm": 0.47910419610577465, + "learning_rate": 0.00010452380076798427, + "loss": 0.9375, + "step": 15120 + }, + { + "epoch": 3.2254692832764507, + "grad_norm": 0.32789843420076215, + "learning_rate": 0.0001045157005634255, + "loss": 0.9335, + "step": 15121 + }, + { + "epoch": 3.2256825938566553, + "grad_norm": 0.24613446595147298, + "learning_rate": 0.0001045076000814712, + "loss": 0.9474, + "step": 15122 + }, + { + "epoch": 3.22589590443686, + "grad_norm": 0.25951077233799086, + "learning_rate": 0.00010449949932221295, + "loss": 0.9065, + "step": 15123 + }, + { + "epoch": 3.2261092150170647, + "grad_norm": 0.23349812946025603, + "learning_rate": 0.00010449139828574245, + "loss": 0.9164, + "step": 15124 + }, + { + "epoch": 3.2263225255972694, + "grad_norm": 0.2998740969914053, + "learning_rate": 0.00010448329697215137, + "loss": 0.9365, + "step": 15125 + }, + { + "epoch": 3.2265358361774745, + "grad_norm": 0.28812537059034243, + "learning_rate": 0.0001044751953815314, + "loss": 0.9163, + "step": 15126 + }, + { + "epoch": 3.226749146757679, + "grad_norm": 0.21097386467908144, + "learning_rate": 0.00010446709351397417, + "loss": 0.94, + "step": 15127 + }, + { + "epoch": 3.226962457337884, + "grad_norm": 0.1805609206344194, + "learning_rate": 0.00010445899136957135, + "loss": 0.9346, + "step": 15128 + }, + { + "epoch": 3.2271757679180886, + "grad_norm": 0.22621078658398433, + "learning_rate": 0.00010445088894841463, + "loss": 0.9012, + "step": 15129 + }, + { + "epoch": 3.2273890784982937, + "grad_norm": 0.24195749794315866, + "learning_rate": 0.00010444278625059572, + "loss": 0.8993, + "step": 15130 + }, + { + "epoch": 3.2276023890784984, + "grad_norm": 0.24340243405492062, + "learning_rate": 0.00010443468327620623, + "loss": 0.8968, + "step": 15131 + }, + { + "epoch": 3.227815699658703, + "grad_norm": 0.21560091027947284, + "learning_rate": 0.00010442658002533795, + "loss": 0.9085, + "step": 15132 + }, + { + "epoch": 3.2280290102389078, + "grad_norm": 0.21745605311674815, + "learning_rate": 0.00010441847649808245, + "loss": 0.9104, + "step": 15133 + }, + { + "epoch": 3.2282423208191124, + "grad_norm": 0.19041053151183093, + "learning_rate": 0.00010441037269453148, + "loss": 0.9315, + "step": 15134 + }, + { + "epoch": 3.2284556313993176, + "grad_norm": 0.21478796335176234, + "learning_rate": 0.00010440226861477675, + "loss": 0.9393, + "step": 15135 + }, + { + "epoch": 3.2286689419795223, + "grad_norm": 0.26901989239249147, + "learning_rate": 0.00010439416425890994, + "loss": 0.9168, + "step": 15136 + }, + { + "epoch": 3.228882252559727, + "grad_norm": 0.2871884675882724, + "learning_rate": 0.00010438605962702273, + "loss": 0.9143, + "step": 15137 + }, + { + "epoch": 3.2290955631399316, + "grad_norm": 0.2757484593229232, + "learning_rate": 0.0001043779547192069, + "loss": 0.9243, + "step": 15138 + }, + { + "epoch": 3.2293088737201368, + "grad_norm": 0.2651761709791492, + "learning_rate": 0.00010436984953555405, + "loss": 0.9302, + "step": 15139 + }, + { + "epoch": 3.2295221843003414, + "grad_norm": 0.24125460005200608, + "learning_rate": 0.00010436174407615597, + "loss": 0.9237, + "step": 15140 + }, + { + "epoch": 3.229735494880546, + "grad_norm": 0.22685490319147175, + "learning_rate": 0.00010435363834110431, + "loss": 0.911, + "step": 15141 + }, + { + "epoch": 3.229948805460751, + "grad_norm": 0.2930644704162913, + "learning_rate": 0.00010434553233049086, + "loss": 0.9188, + "step": 15142 + }, + { + "epoch": 3.2301621160409555, + "grad_norm": 0.29917043434301877, + "learning_rate": 0.00010433742604440729, + "loss": 0.9115, + "step": 15143 + }, + { + "epoch": 3.2303754266211606, + "grad_norm": 0.28198819310909873, + "learning_rate": 0.00010432931948294533, + "loss": 0.9174, + "step": 15144 + }, + { + "epoch": 3.2305887372013653, + "grad_norm": 0.26345445460845696, + "learning_rate": 0.00010432121264619672, + "loss": 0.9085, + "step": 15145 + }, + { + "epoch": 3.23080204778157, + "grad_norm": 0.21071081138162695, + "learning_rate": 0.00010431310553425317, + "loss": 0.9336, + "step": 15146 + }, + { + "epoch": 3.2310153583617747, + "grad_norm": 0.23828240839075118, + "learning_rate": 0.00010430499814720642, + "loss": 0.9288, + "step": 15147 + }, + { + "epoch": 3.2312286689419794, + "grad_norm": 0.32245432547052555, + "learning_rate": 0.00010429689048514822, + "loss": 0.9243, + "step": 15148 + }, + { + "epoch": 3.2314419795221845, + "grad_norm": 0.4461083090189353, + "learning_rate": 0.0001042887825481703, + "loss": 0.9352, + "step": 15149 + }, + { + "epoch": 3.231655290102389, + "grad_norm": 0.4776343433147862, + "learning_rate": 0.00010428067433636439, + "loss": 0.9134, + "step": 15150 + }, + { + "epoch": 3.231868600682594, + "grad_norm": 0.3335592337057475, + "learning_rate": 0.00010427256584982222, + "loss": 0.8995, + "step": 15151 + }, + { + "epoch": 3.2320819112627985, + "grad_norm": 0.2514077006924182, + "learning_rate": 0.00010426445708863558, + "loss": 0.9186, + "step": 15152 + }, + { + "epoch": 3.232295221843003, + "grad_norm": 0.2800780490757421, + "learning_rate": 0.00010425634805289618, + "loss": 0.9206, + "step": 15153 + }, + { + "epoch": 3.2325085324232083, + "grad_norm": 0.2598340110268746, + "learning_rate": 0.00010424823874269581, + "loss": 0.912, + "step": 15154 + }, + { + "epoch": 3.232721843003413, + "grad_norm": 0.28962992372588553, + "learning_rate": 0.00010424012915812623, + "loss": 0.9056, + "step": 15155 + }, + { + "epoch": 3.2329351535836177, + "grad_norm": 0.26292691662451, + "learning_rate": 0.00010423201929927916, + "loss": 0.9238, + "step": 15156 + }, + { + "epoch": 3.2331484641638224, + "grad_norm": 0.3094064699940731, + "learning_rate": 0.00010422390916624637, + "loss": 0.8969, + "step": 15157 + }, + { + "epoch": 3.2333617747440275, + "grad_norm": 0.2796940311971153, + "learning_rate": 0.00010421579875911967, + "loss": 0.9433, + "step": 15158 + }, + { + "epoch": 3.233575085324232, + "grad_norm": 0.21607130953305417, + "learning_rate": 0.00010420768807799077, + "loss": 0.95, + "step": 15159 + }, + { + "epoch": 3.233788395904437, + "grad_norm": 0.24056355779969107, + "learning_rate": 0.00010419957712295152, + "loss": 0.9235, + "step": 15160 + }, + { + "epoch": 3.2340017064846416, + "grad_norm": 0.2655868735138013, + "learning_rate": 0.00010419146589409359, + "loss": 0.9284, + "step": 15161 + }, + { + "epoch": 3.2342150170648463, + "grad_norm": 0.2915048415607289, + "learning_rate": 0.00010418335439150886, + "loss": 0.9355, + "step": 15162 + }, + { + "epoch": 3.2344283276450514, + "grad_norm": 0.33967609333525267, + "learning_rate": 0.00010417524261528905, + "loss": 0.9383, + "step": 15163 + }, + { + "epoch": 3.234641638225256, + "grad_norm": 0.32587919204035737, + "learning_rate": 0.00010416713056552595, + "loss": 0.9201, + "step": 15164 + }, + { + "epoch": 3.2348549488054608, + "grad_norm": 0.2598231821945357, + "learning_rate": 0.00010415901824231138, + "loss": 0.9072, + "step": 15165 + }, + { + "epoch": 3.2350682593856654, + "grad_norm": 0.23283861011502024, + "learning_rate": 0.00010415090564573713, + "loss": 0.9288, + "step": 15166 + }, + { + "epoch": 3.23528156996587, + "grad_norm": 0.28782054793448525, + "learning_rate": 0.00010414279277589495, + "loss": 0.9426, + "step": 15167 + }, + { + "epoch": 3.2354948805460753, + "grad_norm": 0.39904105547300384, + "learning_rate": 0.00010413467963287666, + "loss": 0.8869, + "step": 15168 + }, + { + "epoch": 3.23570819112628, + "grad_norm": 0.4845661036001737, + "learning_rate": 0.00010412656621677405, + "loss": 0.9154, + "step": 15169 + }, + { + "epoch": 3.2359215017064846, + "grad_norm": 0.5034013417954786, + "learning_rate": 0.00010411845252767898, + "loss": 0.9059, + "step": 15170 + }, + { + "epoch": 3.2361348122866893, + "grad_norm": 0.3785141912919353, + "learning_rate": 0.00010411033856568317, + "loss": 0.9093, + "step": 15171 + }, + { + "epoch": 3.236348122866894, + "grad_norm": 0.2596648981867384, + "learning_rate": 0.00010410222433087851, + "loss": 0.894, + "step": 15172 + }, + { + "epoch": 3.236561433447099, + "grad_norm": 0.2885368034555491, + "learning_rate": 0.00010409410982335676, + "loss": 0.9004, + "step": 15173 + }, + { + "epoch": 3.236774744027304, + "grad_norm": 0.4014151900287557, + "learning_rate": 0.00010408599504320975, + "loss": 0.8961, + "step": 15174 + }, + { + "epoch": 3.2369880546075085, + "grad_norm": 0.5260359955177205, + "learning_rate": 0.00010407787999052931, + "loss": 0.9241, + "step": 15175 + }, + { + "epoch": 3.237201365187713, + "grad_norm": 0.5062358127092862, + "learning_rate": 0.00010406976466540726, + "loss": 0.9288, + "step": 15176 + }, + { + "epoch": 3.2374146757679183, + "grad_norm": 0.3078558071269443, + "learning_rate": 0.00010406164906793539, + "loss": 0.9328, + "step": 15177 + }, + { + "epoch": 3.237627986348123, + "grad_norm": 0.2606633353118723, + "learning_rate": 0.00010405353319820559, + "loss": 0.8976, + "step": 15178 + }, + { + "epoch": 3.2378412969283277, + "grad_norm": 0.3913283053908651, + "learning_rate": 0.00010404541705630962, + "loss": 0.925, + "step": 15179 + }, + { + "epoch": 3.2380546075085324, + "grad_norm": 0.38869963252266204, + "learning_rate": 0.00010403730064233937, + "loss": 0.8884, + "step": 15180 + }, + { + "epoch": 3.238267918088737, + "grad_norm": 0.3323726974003557, + "learning_rate": 0.00010402918395638665, + "loss": 0.931, + "step": 15181 + }, + { + "epoch": 3.238481228668942, + "grad_norm": 0.23416268742862964, + "learning_rate": 0.00010402106699854334, + "loss": 0.9278, + "step": 15182 + }, + { + "epoch": 3.238694539249147, + "grad_norm": 0.21003755509221633, + "learning_rate": 0.00010401294976890124, + "loss": 0.9292, + "step": 15183 + }, + { + "epoch": 3.2389078498293515, + "grad_norm": 0.26073371284353913, + "learning_rate": 0.00010400483226755221, + "loss": 0.8947, + "step": 15184 + }, + { + "epoch": 3.239121160409556, + "grad_norm": 0.34104239448915913, + "learning_rate": 0.00010399671449458807, + "loss": 0.9559, + "step": 15185 + }, + { + "epoch": 3.239334470989761, + "grad_norm": 0.3433977561883899, + "learning_rate": 0.00010398859645010073, + "loss": 0.9062, + "step": 15186 + }, + { + "epoch": 3.239547781569966, + "grad_norm": 0.22520336604662147, + "learning_rate": 0.00010398047813418199, + "loss": 0.9064, + "step": 15187 + }, + { + "epoch": 3.2397610921501707, + "grad_norm": 0.2844318616485535, + "learning_rate": 0.0001039723595469238, + "loss": 0.9137, + "step": 15188 + }, + { + "epoch": 3.2399744027303754, + "grad_norm": 0.3863856389028521, + "learning_rate": 0.0001039642406884179, + "loss": 0.9216, + "step": 15189 + }, + { + "epoch": 3.24018771331058, + "grad_norm": 0.33128743971798996, + "learning_rate": 0.00010395612155875623, + "loss": 0.9739, + "step": 15190 + }, + { + "epoch": 3.2404010238907848, + "grad_norm": 0.28722704543135796, + "learning_rate": 0.00010394800215803063, + "loss": 0.9193, + "step": 15191 + }, + { + "epoch": 3.24061433447099, + "grad_norm": 0.2850108160202971, + "learning_rate": 0.000103939882486333, + "loss": 0.9502, + "step": 15192 + }, + { + "epoch": 3.2408276450511946, + "grad_norm": 0.26042808913967425, + "learning_rate": 0.00010393176254375517, + "loss": 0.9478, + "step": 15193 + }, + { + "epoch": 3.2410409556313993, + "grad_norm": 0.19609614126132793, + "learning_rate": 0.00010392364233038909, + "loss": 0.895, + "step": 15194 + }, + { + "epoch": 3.241254266211604, + "grad_norm": 0.2563854932727501, + "learning_rate": 0.00010391552184632655, + "loss": 0.8985, + "step": 15195 + }, + { + "epoch": 3.241467576791809, + "grad_norm": 0.2835083217206359, + "learning_rate": 0.00010390740109165951, + "loss": 0.9126, + "step": 15196 + }, + { + "epoch": 3.2416808873720138, + "grad_norm": 0.21314912970613897, + "learning_rate": 0.0001038992800664798, + "loss": 0.9068, + "step": 15197 + }, + { + "epoch": 3.2418941979522184, + "grad_norm": 0.20404059670691063, + "learning_rate": 0.00010389115877087936, + "loss": 0.9034, + "step": 15198 + }, + { + "epoch": 3.242107508532423, + "grad_norm": 0.22395308342198522, + "learning_rate": 0.00010388303720495004, + "loss": 0.8918, + "step": 15199 + }, + { + "epoch": 3.242320819112628, + "grad_norm": 0.22484113535791878, + "learning_rate": 0.00010387491536878376, + "loss": 0.979, + "step": 15200 + }, + { + "epoch": 3.242534129692833, + "grad_norm": 0.4029220020777203, + "learning_rate": 0.0001038667932624724, + "loss": 0.9305, + "step": 15201 + }, + { + "epoch": 3.2427474402730376, + "grad_norm": 0.2566915515669527, + "learning_rate": 0.00010385867088610788, + "loss": 0.9246, + "step": 15202 + }, + { + "epoch": 3.2429607508532423, + "grad_norm": 0.2572141741907169, + "learning_rate": 0.0001038505482397821, + "loss": 0.8884, + "step": 15203 + }, + { + "epoch": 3.243174061433447, + "grad_norm": 0.30963331094338153, + "learning_rate": 0.00010384242532358695, + "loss": 0.9389, + "step": 15204 + }, + { + "epoch": 3.2433873720136517, + "grad_norm": 0.3848674397214624, + "learning_rate": 0.00010383430213761439, + "loss": 0.9048, + "step": 15205 + }, + { + "epoch": 3.243600682593857, + "grad_norm": 0.5051933506624284, + "learning_rate": 0.00010382617868195628, + "loss": 0.9383, + "step": 15206 + }, + { + "epoch": 3.2438139931740615, + "grad_norm": 0.4682322122357225, + "learning_rate": 0.00010381805495670455, + "loss": 0.9242, + "step": 15207 + }, + { + "epoch": 3.244027303754266, + "grad_norm": 0.3776090209206025, + "learning_rate": 0.00010380993096195113, + "loss": 0.8996, + "step": 15208 + }, + { + "epoch": 3.244240614334471, + "grad_norm": 0.39837410365386294, + "learning_rate": 0.00010380180669778792, + "loss": 0.9269, + "step": 15209 + }, + { + "epoch": 3.2444539249146755, + "grad_norm": 0.26121616909480844, + "learning_rate": 0.00010379368216430691, + "loss": 0.9023, + "step": 15210 + }, + { + "epoch": 3.2446672354948807, + "grad_norm": 0.2473707672529868, + "learning_rate": 0.00010378555736159996, + "loss": 0.9024, + "step": 15211 + }, + { + "epoch": 3.2448805460750854, + "grad_norm": 0.298598751998529, + "learning_rate": 0.00010377743228975906, + "loss": 0.9318, + "step": 15212 + }, + { + "epoch": 3.24509385665529, + "grad_norm": 0.2147515893016044, + "learning_rate": 0.00010376930694887608, + "loss": 0.9365, + "step": 15213 + }, + { + "epoch": 3.2453071672354947, + "grad_norm": 0.23434735597876902, + "learning_rate": 0.000103761181339043, + "loss": 0.8892, + "step": 15214 + }, + { + "epoch": 3.2455204778157, + "grad_norm": 0.21529407388358465, + "learning_rate": 0.00010375305546035175, + "loss": 0.89, + "step": 15215 + }, + { + "epoch": 3.2457337883959045, + "grad_norm": 0.24504137143283747, + "learning_rate": 0.0001037449293128943, + "loss": 0.9188, + "step": 15216 + }, + { + "epoch": 3.2459470989761092, + "grad_norm": 0.26137022686040084, + "learning_rate": 0.00010373680289676255, + "loss": 0.9407, + "step": 15217 + }, + { + "epoch": 3.246160409556314, + "grad_norm": 0.2646637191710948, + "learning_rate": 0.00010372867621204848, + "loss": 0.9458, + "step": 15218 + }, + { + "epoch": 3.2463737201365186, + "grad_norm": 0.2511320859275299, + "learning_rate": 0.00010372054925884403, + "loss": 0.9193, + "step": 15219 + }, + { + "epoch": 3.2465870307167237, + "grad_norm": 0.23069727562732442, + "learning_rate": 0.0001037124220372412, + "loss": 0.9235, + "step": 15220 + }, + { + "epoch": 3.2468003412969284, + "grad_norm": 0.2219356402077531, + "learning_rate": 0.00010370429454733189, + "loss": 0.9122, + "step": 15221 + }, + { + "epoch": 3.247013651877133, + "grad_norm": 0.2841632100719675, + "learning_rate": 0.00010369616678920811, + "loss": 0.933, + "step": 15222 + }, + { + "epoch": 3.2472269624573378, + "grad_norm": 0.2846775017404106, + "learning_rate": 0.00010368803876296178, + "loss": 0.8996, + "step": 15223 + }, + { + "epoch": 3.2474402730375425, + "grad_norm": 0.28538789525663794, + "learning_rate": 0.00010367991046868491, + "loss": 0.8949, + "step": 15224 + }, + { + "epoch": 3.2476535836177476, + "grad_norm": 0.31234397734188574, + "learning_rate": 0.00010367178190646944, + "loss": 0.9314, + "step": 15225 + }, + { + "epoch": 3.2478668941979523, + "grad_norm": 0.216336108803536, + "learning_rate": 0.00010366365307640735, + "loss": 0.91, + "step": 15226 + }, + { + "epoch": 3.248080204778157, + "grad_norm": 0.21852591486319192, + "learning_rate": 0.00010365552397859063, + "loss": 0.9302, + "step": 15227 + }, + { + "epoch": 3.2482935153583616, + "grad_norm": 0.23691764386764272, + "learning_rate": 0.0001036473946131113, + "loss": 0.9168, + "step": 15228 + }, + { + "epoch": 3.2485068259385663, + "grad_norm": 0.20336238271081863, + "learning_rate": 0.00010363926498006128, + "loss": 0.916, + "step": 15229 + }, + { + "epoch": 3.2487201365187715, + "grad_norm": 0.2792692066735282, + "learning_rate": 0.00010363113507953256, + "loss": 0.9023, + "step": 15230 + }, + { + "epoch": 3.248933447098976, + "grad_norm": 0.2128034097272654, + "learning_rate": 0.00010362300491161715, + "loss": 0.909, + "step": 15231 + }, + { + "epoch": 3.249146757679181, + "grad_norm": 0.22697172370680477, + "learning_rate": 0.00010361487447640706, + "loss": 0.9273, + "step": 15232 + }, + { + "epoch": 3.2493600682593855, + "grad_norm": 0.24280562856879623, + "learning_rate": 0.00010360674377399428, + "loss": 0.9263, + "step": 15233 + }, + { + "epoch": 3.2495733788395906, + "grad_norm": 0.22719575130140363, + "learning_rate": 0.00010359861280447079, + "loss": 0.9208, + "step": 15234 + }, + { + "epoch": 3.2497866894197953, + "grad_norm": 0.23149475724438162, + "learning_rate": 0.00010359048156792856, + "loss": 0.9397, + "step": 15235 + }, + { + "epoch": 3.25, + "grad_norm": 0.2008976225099499, + "learning_rate": 0.00010358235006445966, + "loss": 0.9129, + "step": 15236 + }, + { + "epoch": 3.2502133105802047, + "grad_norm": 0.25444744440245826, + "learning_rate": 0.00010357421829415607, + "loss": 0.9301, + "step": 15237 + }, + { + "epoch": 3.2504266211604094, + "grad_norm": 0.25251788584320217, + "learning_rate": 0.00010356608625710982, + "loss": 0.9355, + "step": 15238 + }, + { + "epoch": 3.2506399317406145, + "grad_norm": 0.20273483234061235, + "learning_rate": 0.0001035579539534129, + "loss": 0.9138, + "step": 15239 + }, + { + "epoch": 3.250853242320819, + "grad_norm": 0.20664992319935596, + "learning_rate": 0.00010354982138315734, + "loss": 0.9219, + "step": 15240 + }, + { + "epoch": 3.251066552901024, + "grad_norm": 0.26531593534231007, + "learning_rate": 0.00010354168854643514, + "loss": 0.9141, + "step": 15241 + }, + { + "epoch": 3.2512798634812285, + "grad_norm": 0.2453217243314362, + "learning_rate": 0.00010353355544333835, + "loss": 0.8994, + "step": 15242 + }, + { + "epoch": 3.2514931740614337, + "grad_norm": 0.22310278531694425, + "learning_rate": 0.00010352542207395896, + "loss": 0.9078, + "step": 15243 + }, + { + "epoch": 3.2517064846416384, + "grad_norm": 0.29155459050200505, + "learning_rate": 0.00010351728843838906, + "loss": 0.921, + "step": 15244 + }, + { + "epoch": 3.251919795221843, + "grad_norm": 0.22333980274434323, + "learning_rate": 0.0001035091545367206, + "loss": 0.9162, + "step": 15245 + }, + { + "epoch": 3.2521331058020477, + "grad_norm": 0.19166230746952376, + "learning_rate": 0.00010350102036904571, + "loss": 0.8836, + "step": 15246 + }, + { + "epoch": 3.2523464163822524, + "grad_norm": 0.22937223782075514, + "learning_rate": 0.00010349288593545635, + "loss": 0.9468, + "step": 15247 + }, + { + "epoch": 3.252559726962457, + "grad_norm": 0.24449466192903027, + "learning_rate": 0.00010348475123604459, + "loss": 0.8679, + "step": 15248 + }, + { + "epoch": 3.2527730375426622, + "grad_norm": 0.18943296312742308, + "learning_rate": 0.00010347661627090248, + "loss": 0.903, + "step": 15249 + }, + { + "epoch": 3.252986348122867, + "grad_norm": 0.20304344706085461, + "learning_rate": 0.00010346848104012209, + "loss": 0.9298, + "step": 15250 + }, + { + "epoch": 3.2531996587030716, + "grad_norm": 0.2509390513450269, + "learning_rate": 0.0001034603455437954, + "loss": 0.951, + "step": 15251 + }, + { + "epoch": 3.2534129692832763, + "grad_norm": 0.2143425521151022, + "learning_rate": 0.00010345220978201454, + "loss": 0.9157, + "step": 15252 + }, + { + "epoch": 3.2536262798634814, + "grad_norm": 0.20032071177784413, + "learning_rate": 0.00010344407375487151, + "loss": 0.9097, + "step": 15253 + }, + { + "epoch": 3.253839590443686, + "grad_norm": 0.2752109733177723, + "learning_rate": 0.0001034359374624584, + "loss": 0.9162, + "step": 15254 + }, + { + "epoch": 3.2540529010238908, + "grad_norm": 0.2472391177334409, + "learning_rate": 0.00010342780090486724, + "loss": 0.9166, + "step": 15255 + }, + { + "epoch": 3.2542662116040955, + "grad_norm": 0.19078823730520086, + "learning_rate": 0.00010341966408219016, + "loss": 0.9134, + "step": 15256 + }, + { + "epoch": 3.2544795221843, + "grad_norm": 0.25653604106888983, + "learning_rate": 0.00010341152699451915, + "loss": 0.9315, + "step": 15257 + }, + { + "epoch": 3.2546928327645053, + "grad_norm": 0.26511255427461095, + "learning_rate": 0.00010340338964194634, + "loss": 0.9146, + "step": 15258 + }, + { + "epoch": 3.25490614334471, + "grad_norm": 0.18835551113950272, + "learning_rate": 0.00010339525202456376, + "loss": 0.9102, + "step": 15259 + }, + { + "epoch": 3.2551194539249146, + "grad_norm": 0.1974825559209799, + "learning_rate": 0.00010338711414246354, + "loss": 0.909, + "step": 15260 + }, + { + "epoch": 3.2553327645051193, + "grad_norm": 0.20586173003413127, + "learning_rate": 0.00010337897599573773, + "loss": 0.9254, + "step": 15261 + }, + { + "epoch": 3.2555460750853245, + "grad_norm": 0.18660054305189255, + "learning_rate": 0.0001033708375844784, + "loss": 0.9043, + "step": 15262 + }, + { + "epoch": 3.255759385665529, + "grad_norm": 0.19142205508615878, + "learning_rate": 0.00010336269890877763, + "loss": 0.9258, + "step": 15263 + }, + { + "epoch": 3.255972696245734, + "grad_norm": 0.22252203282790772, + "learning_rate": 0.00010335455996872756, + "loss": 0.9338, + "step": 15264 + }, + { + "epoch": 3.2561860068259385, + "grad_norm": 0.2507243833265392, + "learning_rate": 0.00010334642076442022, + "loss": 0.897, + "step": 15265 + }, + { + "epoch": 3.256399317406143, + "grad_norm": 0.2318400866999355, + "learning_rate": 0.00010333828129594775, + "loss": 0.906, + "step": 15266 + }, + { + "epoch": 3.256612627986348, + "grad_norm": 0.2104495968875058, + "learning_rate": 0.00010333014156340225, + "loss": 0.9241, + "step": 15267 + }, + { + "epoch": 3.256825938566553, + "grad_norm": 0.22518686100050794, + "learning_rate": 0.00010332200156687582, + "loss": 0.8977, + "step": 15268 + }, + { + "epoch": 3.2570392491467577, + "grad_norm": 0.18633872305717947, + "learning_rate": 0.0001033138613064605, + "loss": 0.9259, + "step": 15269 + }, + { + "epoch": 3.2572525597269624, + "grad_norm": 0.18127911335579414, + "learning_rate": 0.00010330572078224848, + "loss": 0.9186, + "step": 15270 + }, + { + "epoch": 3.257465870307167, + "grad_norm": 0.21255115703322874, + "learning_rate": 0.00010329757999433182, + "loss": 0.9502, + "step": 15271 + }, + { + "epoch": 3.257679180887372, + "grad_norm": 0.25509876562337785, + "learning_rate": 0.00010328943894280268, + "loss": 0.9175, + "step": 15272 + }, + { + "epoch": 3.257892491467577, + "grad_norm": 0.20938613495060113, + "learning_rate": 0.00010328129762775311, + "loss": 0.937, + "step": 15273 + }, + { + "epoch": 3.2581058020477816, + "grad_norm": 0.22285354738379182, + "learning_rate": 0.0001032731560492753, + "loss": 0.9085, + "step": 15274 + }, + { + "epoch": 3.2583191126279862, + "grad_norm": 0.19297220075372012, + "learning_rate": 0.0001032650142074613, + "loss": 0.9071, + "step": 15275 + }, + { + "epoch": 3.258532423208191, + "grad_norm": 0.2153689751466058, + "learning_rate": 0.00010325687210240329, + "loss": 0.9052, + "step": 15276 + }, + { + "epoch": 3.258745733788396, + "grad_norm": 0.2576821643831896, + "learning_rate": 0.00010324872973419339, + "loss": 0.9415, + "step": 15277 + }, + { + "epoch": 3.2589590443686007, + "grad_norm": 0.21859419491558268, + "learning_rate": 0.00010324058710292374, + "loss": 0.9009, + "step": 15278 + }, + { + "epoch": 3.2591723549488054, + "grad_norm": 0.18820052965438241, + "learning_rate": 0.00010323244420868641, + "loss": 0.951, + "step": 15279 + }, + { + "epoch": 3.25938566552901, + "grad_norm": 0.22455471160795606, + "learning_rate": 0.00010322430105157362, + "loss": 0.9309, + "step": 15280 + }, + { + "epoch": 3.2595989761092152, + "grad_norm": 0.21488500879514336, + "learning_rate": 0.00010321615763167744, + "loss": 0.9275, + "step": 15281 + }, + { + "epoch": 3.25981228668942, + "grad_norm": 0.2219145018530456, + "learning_rate": 0.00010320801394909007, + "loss": 0.9333, + "step": 15282 + }, + { + "epoch": 3.2600255972696246, + "grad_norm": 0.20150566813400472, + "learning_rate": 0.00010319987000390363, + "loss": 0.9418, + "step": 15283 + }, + { + "epoch": 3.2602389078498293, + "grad_norm": 0.19977974761071318, + "learning_rate": 0.00010319172579621028, + "loss": 0.9111, + "step": 15284 + }, + { + "epoch": 3.260452218430034, + "grad_norm": 0.24770851296116111, + "learning_rate": 0.00010318358132610213, + "loss": 0.94, + "step": 15285 + }, + { + "epoch": 3.260665529010239, + "grad_norm": 0.22457884202968803, + "learning_rate": 0.0001031754365936714, + "loss": 0.8787, + "step": 15286 + }, + { + "epoch": 3.260878839590444, + "grad_norm": 0.22134577368140892, + "learning_rate": 0.00010316729159901018, + "loss": 0.9386, + "step": 15287 + }, + { + "epoch": 3.2610921501706485, + "grad_norm": 0.22485933576732967, + "learning_rate": 0.00010315914634221071, + "loss": 0.9048, + "step": 15288 + }, + { + "epoch": 3.261305460750853, + "grad_norm": 0.2673446881909096, + "learning_rate": 0.00010315100082336507, + "loss": 0.9604, + "step": 15289 + }, + { + "epoch": 3.261518771331058, + "grad_norm": 0.19229475693852988, + "learning_rate": 0.0001031428550425655, + "loss": 0.9494, + "step": 15290 + }, + { + "epoch": 3.261732081911263, + "grad_norm": 0.2220688790791312, + "learning_rate": 0.00010313470899990411, + "loss": 0.9325, + "step": 15291 + }, + { + "epoch": 3.2619453924914676, + "grad_norm": 0.23462531338248724, + "learning_rate": 0.00010312656269547312, + "loss": 0.9292, + "step": 15292 + }, + { + "epoch": 3.2621587030716723, + "grad_norm": 0.2005681526260672, + "learning_rate": 0.00010311841612936466, + "loss": 0.93, + "step": 15293 + }, + { + "epoch": 3.262372013651877, + "grad_norm": 0.20945829360246798, + "learning_rate": 0.00010311026930167095, + "loss": 0.8923, + "step": 15294 + }, + { + "epoch": 3.2625853242320817, + "grad_norm": 0.20328411606447452, + "learning_rate": 0.00010310212221248414, + "loss": 0.949, + "step": 15295 + }, + { + "epoch": 3.262798634812287, + "grad_norm": 0.23277555474821077, + "learning_rate": 0.00010309397486189646, + "loss": 0.9534, + "step": 15296 + }, + { + "epoch": 3.2630119453924915, + "grad_norm": 0.22406290248419633, + "learning_rate": 0.00010308582725000002, + "loss": 0.919, + "step": 15297 + }, + { + "epoch": 3.263225255972696, + "grad_norm": 0.2552112481537869, + "learning_rate": 0.00010307767937688709, + "loss": 0.928, + "step": 15298 + }, + { + "epoch": 3.263438566552901, + "grad_norm": 0.22744153878592366, + "learning_rate": 0.00010306953124264982, + "loss": 0.9417, + "step": 15299 + }, + { + "epoch": 3.263651877133106, + "grad_norm": 0.2968265756146963, + "learning_rate": 0.0001030613828473804, + "loss": 0.9159, + "step": 15300 + }, + { + "epoch": 3.2638651877133107, + "grad_norm": 0.25248084913116486, + "learning_rate": 0.00010305323419117107, + "loss": 0.8906, + "step": 15301 + }, + { + "epoch": 3.2640784982935154, + "grad_norm": 0.2137782872432702, + "learning_rate": 0.00010304508527411401, + "loss": 0.8996, + "step": 15302 + }, + { + "epoch": 3.26429180887372, + "grad_norm": 0.23792884314255316, + "learning_rate": 0.0001030369360963014, + "loss": 0.9107, + "step": 15303 + }, + { + "epoch": 3.2645051194539247, + "grad_norm": 0.2365758866362402, + "learning_rate": 0.00010302878665782549, + "loss": 0.9254, + "step": 15304 + }, + { + "epoch": 3.26471843003413, + "grad_norm": 0.21816450355821176, + "learning_rate": 0.00010302063695877846, + "loss": 0.9254, + "step": 15305 + }, + { + "epoch": 3.2649317406143346, + "grad_norm": 0.19723731670644803, + "learning_rate": 0.00010301248699925254, + "loss": 0.9164, + "step": 15306 + }, + { + "epoch": 3.2651450511945392, + "grad_norm": 0.21410219891923973, + "learning_rate": 0.00010300433677933994, + "loss": 0.9519, + "step": 15307 + }, + { + "epoch": 3.265358361774744, + "grad_norm": 0.16025239503083546, + "learning_rate": 0.00010299618629913287, + "loss": 0.9013, + "step": 15308 + }, + { + "epoch": 3.265571672354949, + "grad_norm": 0.19724829343481015, + "learning_rate": 0.00010298803555872358, + "loss": 0.9381, + "step": 15309 + }, + { + "epoch": 3.2657849829351537, + "grad_norm": 0.2555028293579534, + "learning_rate": 0.00010297988455820428, + "loss": 0.9198, + "step": 15310 + }, + { + "epoch": 3.2659982935153584, + "grad_norm": 0.23279020291289615, + "learning_rate": 0.00010297173329766722, + "loss": 0.8889, + "step": 15311 + }, + { + "epoch": 3.266211604095563, + "grad_norm": 0.23360587263954918, + "learning_rate": 0.00010296358177720457, + "loss": 0.9236, + "step": 15312 + }, + { + "epoch": 3.266424914675768, + "grad_norm": 0.2993043076313333, + "learning_rate": 0.00010295542999690864, + "loss": 0.9352, + "step": 15313 + }, + { + "epoch": 3.2666382252559725, + "grad_norm": 0.27770610166408355, + "learning_rate": 0.00010294727795687162, + "loss": 0.9242, + "step": 15314 + }, + { + "epoch": 3.2668515358361776, + "grad_norm": 0.25978964684270994, + "learning_rate": 0.00010293912565718576, + "loss": 0.9249, + "step": 15315 + }, + { + "epoch": 3.2670648464163823, + "grad_norm": 0.35528843155834666, + "learning_rate": 0.00010293097309794331, + "loss": 0.9225, + "step": 15316 + }, + { + "epoch": 3.267278156996587, + "grad_norm": 0.3266029737204738, + "learning_rate": 0.00010292282027923652, + "loss": 0.9485, + "step": 15317 + }, + { + "epoch": 3.2674914675767917, + "grad_norm": 0.3578735019989659, + "learning_rate": 0.00010291466720115761, + "loss": 0.9121, + "step": 15318 + }, + { + "epoch": 3.267704778156997, + "grad_norm": 0.36341288091673823, + "learning_rate": 0.00010290651386379888, + "loss": 0.9524, + "step": 15319 + }, + { + "epoch": 3.2679180887372015, + "grad_norm": 0.3345938607835613, + "learning_rate": 0.00010289836026725256, + "loss": 0.9062, + "step": 15320 + }, + { + "epoch": 3.268131399317406, + "grad_norm": 0.24953523307746667, + "learning_rate": 0.00010289020641161088, + "loss": 0.9465, + "step": 15321 + }, + { + "epoch": 3.268344709897611, + "grad_norm": 0.18072182174302545, + "learning_rate": 0.00010288205229696616, + "loss": 0.909, + "step": 15322 + }, + { + "epoch": 3.2685580204778155, + "grad_norm": 0.2189839827658688, + "learning_rate": 0.00010287389792341064, + "loss": 0.9108, + "step": 15323 + }, + { + "epoch": 3.2687713310580206, + "grad_norm": 0.2385522567079946, + "learning_rate": 0.00010286574329103655, + "loss": 0.9297, + "step": 15324 + }, + { + "epoch": 3.2689846416382253, + "grad_norm": 0.21668390052975803, + "learning_rate": 0.00010285758839993622, + "loss": 0.946, + "step": 15325 + }, + { + "epoch": 3.26919795221843, + "grad_norm": 0.20896490143870713, + "learning_rate": 0.00010284943325020187, + "loss": 0.9372, + "step": 15326 + }, + { + "epoch": 3.2694112627986347, + "grad_norm": 0.21472887904475588, + "learning_rate": 0.0001028412778419258, + "loss": 0.9316, + "step": 15327 + }, + { + "epoch": 3.26962457337884, + "grad_norm": 0.20516306360834677, + "learning_rate": 0.0001028331221752003, + "loss": 0.9219, + "step": 15328 + }, + { + "epoch": 3.2698378839590445, + "grad_norm": 0.20165584071790116, + "learning_rate": 0.00010282496625011764, + "loss": 0.9135, + "step": 15329 + }, + { + "epoch": 3.270051194539249, + "grad_norm": 0.20582255966779006, + "learning_rate": 0.00010281681006677009, + "loss": 0.9137, + "step": 15330 + }, + { + "epoch": 3.270264505119454, + "grad_norm": 0.21192384608315137, + "learning_rate": 0.00010280865362524995, + "loss": 0.9114, + "step": 15331 + }, + { + "epoch": 3.2704778156996586, + "grad_norm": 0.22823898559749745, + "learning_rate": 0.00010280049692564952, + "loss": 0.9047, + "step": 15332 + }, + { + "epoch": 3.2706911262798632, + "grad_norm": 0.21718756542898446, + "learning_rate": 0.00010279233996806107, + "loss": 0.8854, + "step": 15333 + }, + { + "epoch": 3.2709044368600684, + "grad_norm": 0.21011527747638442, + "learning_rate": 0.00010278418275257693, + "loss": 0.9332, + "step": 15334 + }, + { + "epoch": 3.271117747440273, + "grad_norm": 0.22656071884782505, + "learning_rate": 0.00010277602527928937, + "loss": 0.9022, + "step": 15335 + }, + { + "epoch": 3.2713310580204777, + "grad_norm": 0.20853676464937707, + "learning_rate": 0.0001027678675482907, + "loss": 0.9149, + "step": 15336 + }, + { + "epoch": 3.2715443686006824, + "grad_norm": 0.2378303183402542, + "learning_rate": 0.00010275970955967321, + "loss": 0.9032, + "step": 15337 + }, + { + "epoch": 3.2717576791808876, + "grad_norm": 0.22401306015443415, + "learning_rate": 0.00010275155131352926, + "loss": 0.9252, + "step": 15338 + }, + { + "epoch": 3.2719709897610922, + "grad_norm": 0.23593630862242942, + "learning_rate": 0.0001027433928099511, + "loss": 0.8953, + "step": 15339 + }, + { + "epoch": 3.272184300341297, + "grad_norm": 0.2784739360880278, + "learning_rate": 0.00010273523404903106, + "loss": 0.9211, + "step": 15340 + }, + { + "epoch": 3.2723976109215016, + "grad_norm": 0.26398369665256227, + "learning_rate": 0.00010272707503086147, + "loss": 0.9505, + "step": 15341 + }, + { + "epoch": 3.2726109215017063, + "grad_norm": 0.250087812948648, + "learning_rate": 0.00010271891575553464, + "loss": 0.9597, + "step": 15342 + }, + { + "epoch": 3.2728242320819114, + "grad_norm": 0.19963244678548442, + "learning_rate": 0.00010271075622314291, + "loss": 0.8867, + "step": 15343 + }, + { + "epoch": 3.273037542662116, + "grad_norm": 0.2018342419444822, + "learning_rate": 0.0001027025964337786, + "loss": 0.9465, + "step": 15344 + }, + { + "epoch": 3.273250853242321, + "grad_norm": 0.25966108489024525, + "learning_rate": 0.00010269443638753402, + "loss": 0.9089, + "step": 15345 + }, + { + "epoch": 3.2734641638225255, + "grad_norm": 0.20797113334184408, + "learning_rate": 0.00010268627608450148, + "loss": 0.9048, + "step": 15346 + }, + { + "epoch": 3.2736774744027306, + "grad_norm": 0.2508985497833432, + "learning_rate": 0.00010267811552477338, + "loss": 0.9369, + "step": 15347 + }, + { + "epoch": 3.2738907849829353, + "grad_norm": 0.29350813682238486, + "learning_rate": 0.000102669954708442, + "loss": 0.888, + "step": 15348 + }, + { + "epoch": 3.27410409556314, + "grad_norm": 0.2954207334971932, + "learning_rate": 0.00010266179363559971, + "loss": 0.9236, + "step": 15349 + }, + { + "epoch": 3.2743174061433447, + "grad_norm": 0.3107990166253723, + "learning_rate": 0.00010265363230633887, + "loss": 0.9017, + "step": 15350 + }, + { + "epoch": 3.2745307167235493, + "grad_norm": 0.24602245823443905, + "learning_rate": 0.00010264547072075177, + "loss": 0.9113, + "step": 15351 + }, + { + "epoch": 3.274744027303754, + "grad_norm": 0.26983873876286774, + "learning_rate": 0.0001026373088789308, + "loss": 0.9176, + "step": 15352 + }, + { + "epoch": 3.274957337883959, + "grad_norm": 0.2484959391956442, + "learning_rate": 0.00010262914678096828, + "loss": 0.9356, + "step": 15353 + }, + { + "epoch": 3.275170648464164, + "grad_norm": 0.2944124113953426, + "learning_rate": 0.00010262098442695658, + "loss": 0.9379, + "step": 15354 + }, + { + "epoch": 3.2753839590443685, + "grad_norm": 0.4623191469697338, + "learning_rate": 0.00010261282181698809, + "loss": 0.9372, + "step": 15355 + }, + { + "epoch": 3.275597269624573, + "grad_norm": 0.5265241774819847, + "learning_rate": 0.00010260465895115512, + "loss": 0.9297, + "step": 15356 + }, + { + "epoch": 3.2758105802047783, + "grad_norm": 0.49273922400846093, + "learning_rate": 0.00010259649582955008, + "loss": 0.9149, + "step": 15357 + }, + { + "epoch": 3.276023890784983, + "grad_norm": 0.45785451024322044, + "learning_rate": 0.00010258833245226529, + "loss": 0.9242, + "step": 15358 + }, + { + "epoch": 3.2762372013651877, + "grad_norm": 0.3462617936715466, + "learning_rate": 0.00010258016881939313, + "loss": 0.9006, + "step": 15359 + }, + { + "epoch": 3.2764505119453924, + "grad_norm": 0.2322250319504287, + "learning_rate": 0.00010257200493102599, + "loss": 0.9165, + "step": 15360 + }, + { + "epoch": 3.276663822525597, + "grad_norm": 0.2627604035752736, + "learning_rate": 0.00010256384078725625, + "loss": 0.9422, + "step": 15361 + }, + { + "epoch": 3.276877133105802, + "grad_norm": 0.33524817252270045, + "learning_rate": 0.00010255567638817625, + "loss": 0.8998, + "step": 15362 + }, + { + "epoch": 3.277090443686007, + "grad_norm": 0.4454495293034551, + "learning_rate": 0.00010254751173387841, + "loss": 0.8984, + "step": 15363 + }, + { + "epoch": 3.2773037542662116, + "grad_norm": 0.4395253166271464, + "learning_rate": 0.00010253934682445506, + "loss": 0.9336, + "step": 15364 + }, + { + "epoch": 3.2775170648464163, + "grad_norm": 0.3491798145246105, + "learning_rate": 0.00010253118165999866, + "loss": 0.9354, + "step": 15365 + }, + { + "epoch": 3.2777303754266214, + "grad_norm": 0.25915144215235736, + "learning_rate": 0.00010252301624060154, + "loss": 0.9075, + "step": 15366 + }, + { + "epoch": 3.277943686006826, + "grad_norm": 0.33058777358051195, + "learning_rate": 0.00010251485056635613, + "loss": 0.9114, + "step": 15367 + }, + { + "epoch": 3.2781569965870307, + "grad_norm": 0.21848497338009257, + "learning_rate": 0.00010250668463735478, + "loss": 0.9256, + "step": 15368 + }, + { + "epoch": 3.2783703071672354, + "grad_norm": 0.2220464652483339, + "learning_rate": 0.00010249851845368995, + "loss": 0.9315, + "step": 15369 + }, + { + "epoch": 3.27858361774744, + "grad_norm": 0.29723023249534675, + "learning_rate": 0.00010249035201545398, + "loss": 0.9185, + "step": 15370 + }, + { + "epoch": 3.2787969283276452, + "grad_norm": 0.2781422313562608, + "learning_rate": 0.0001024821853227393, + "loss": 0.9189, + "step": 15371 + }, + { + "epoch": 3.27901023890785, + "grad_norm": 0.23614054515935504, + "learning_rate": 0.00010247401837563831, + "loss": 0.9215, + "step": 15372 + }, + { + "epoch": 3.2792235494880546, + "grad_norm": 0.29136428916481477, + "learning_rate": 0.00010246585117424346, + "loss": 0.9215, + "step": 15373 + }, + { + "epoch": 3.2794368600682593, + "grad_norm": 0.2575907538957896, + "learning_rate": 0.00010245768371864709, + "loss": 0.9202, + "step": 15374 + }, + { + "epoch": 3.279650170648464, + "grad_norm": 0.24306506893415672, + "learning_rate": 0.00010244951600894165, + "loss": 0.9133, + "step": 15375 + }, + { + "epoch": 3.279863481228669, + "grad_norm": 0.2825528677663059, + "learning_rate": 0.00010244134804521956, + "loss": 0.9139, + "step": 15376 + }, + { + "epoch": 3.280076791808874, + "grad_norm": 0.2503350376678418, + "learning_rate": 0.00010243317982757327, + "loss": 0.936, + "step": 15377 + }, + { + "epoch": 3.2802901023890785, + "grad_norm": 0.359460422017877, + "learning_rate": 0.00010242501135609512, + "loss": 0.9371, + "step": 15378 + }, + { + "epoch": 3.280503412969283, + "grad_norm": 0.38539919913939846, + "learning_rate": 0.00010241684263087765, + "loss": 0.91, + "step": 15379 + }, + { + "epoch": 3.280716723549488, + "grad_norm": 0.2442246505769423, + "learning_rate": 0.00010240867365201319, + "loss": 0.8957, + "step": 15380 + }, + { + "epoch": 3.280930034129693, + "grad_norm": 0.2497503525955563, + "learning_rate": 0.00010240050441959422, + "loss": 0.9386, + "step": 15381 + }, + { + "epoch": 3.2811433447098977, + "grad_norm": 0.2523570780397261, + "learning_rate": 0.00010239233493371316, + "loss": 0.9474, + "step": 15382 + }, + { + "epoch": 3.2813566552901023, + "grad_norm": 0.23244213086051196, + "learning_rate": 0.00010238416519446248, + "loss": 0.9315, + "step": 15383 + }, + { + "epoch": 3.281569965870307, + "grad_norm": 0.27381937161434333, + "learning_rate": 0.00010237599520193457, + "loss": 0.8804, + "step": 15384 + }, + { + "epoch": 3.281783276450512, + "grad_norm": 0.28398782306278497, + "learning_rate": 0.00010236782495622191, + "loss": 0.9425, + "step": 15385 + }, + { + "epoch": 3.281996587030717, + "grad_norm": 0.28930628237016426, + "learning_rate": 0.00010235965445741692, + "loss": 0.9452, + "step": 15386 + }, + { + "epoch": 3.2822098976109215, + "grad_norm": 0.18025820021509278, + "learning_rate": 0.00010235148370561206, + "loss": 0.9209, + "step": 15387 + }, + { + "epoch": 3.282423208191126, + "grad_norm": 0.2877078241741059, + "learning_rate": 0.0001023433127008998, + "loss": 0.9523, + "step": 15388 + }, + { + "epoch": 3.282636518771331, + "grad_norm": 0.298163210819962, + "learning_rate": 0.00010233514144337257, + "loss": 0.9465, + "step": 15389 + }, + { + "epoch": 3.282849829351536, + "grad_norm": 0.33006419177299584, + "learning_rate": 0.00010232696993312286, + "loss": 0.9049, + "step": 15390 + }, + { + "epoch": 3.2830631399317407, + "grad_norm": 0.36791108875960843, + "learning_rate": 0.0001023187981702431, + "loss": 0.9306, + "step": 15391 + }, + { + "epoch": 3.2832764505119454, + "grad_norm": 0.34421456536587663, + "learning_rate": 0.00010231062615482574, + "loss": 0.9159, + "step": 15392 + }, + { + "epoch": 3.28348976109215, + "grad_norm": 0.22756146003268385, + "learning_rate": 0.00010230245388696329, + "loss": 0.9223, + "step": 15393 + }, + { + "epoch": 3.2837030716723548, + "grad_norm": 0.21695902189361405, + "learning_rate": 0.00010229428136674819, + "loss": 0.9329, + "step": 15394 + }, + { + "epoch": 3.28391638225256, + "grad_norm": 0.36619958400753844, + "learning_rate": 0.00010228610859427294, + "loss": 0.9448, + "step": 15395 + }, + { + "epoch": 3.2841296928327646, + "grad_norm": 0.3781701106654912, + "learning_rate": 0.00010227793556962997, + "loss": 0.9336, + "step": 15396 + }, + { + "epoch": 3.2843430034129693, + "grad_norm": 0.37140778095455007, + "learning_rate": 0.00010226976229291181, + "loss": 0.9319, + "step": 15397 + }, + { + "epoch": 3.284556313993174, + "grad_norm": 0.31196482935221925, + "learning_rate": 0.00010226158876421089, + "loss": 0.8895, + "step": 15398 + }, + { + "epoch": 3.2847696245733786, + "grad_norm": 0.2731706200611014, + "learning_rate": 0.00010225341498361975, + "loss": 0.8989, + "step": 15399 + }, + { + "epoch": 3.2849829351535837, + "grad_norm": 0.2039704465779975, + "learning_rate": 0.0001022452409512308, + "loss": 0.9222, + "step": 15400 + }, + { + "epoch": 3.2851962457337884, + "grad_norm": 0.2284167589548842, + "learning_rate": 0.00010223706666713665, + "loss": 0.9241, + "step": 15401 + }, + { + "epoch": 3.285409556313993, + "grad_norm": 0.31455754109884254, + "learning_rate": 0.00010222889213142964, + "loss": 0.9331, + "step": 15402 + }, + { + "epoch": 3.285622866894198, + "grad_norm": 0.30173157673648426, + "learning_rate": 0.00010222071734420238, + "loss": 0.9244, + "step": 15403 + }, + { + "epoch": 3.285836177474403, + "grad_norm": 0.3084568554759862, + "learning_rate": 0.00010221254230554731, + "loss": 0.9064, + "step": 15404 + }, + { + "epoch": 3.2860494880546076, + "grad_norm": 0.34128669855196986, + "learning_rate": 0.00010220436701555697, + "loss": 0.888, + "step": 15405 + }, + { + "epoch": 3.2862627986348123, + "grad_norm": 0.3053446955577211, + "learning_rate": 0.00010219619147432383, + "loss": 0.9233, + "step": 15406 + }, + { + "epoch": 3.286476109215017, + "grad_norm": 0.2064018772062696, + "learning_rate": 0.00010218801568194042, + "loss": 0.929, + "step": 15407 + }, + { + "epoch": 3.2866894197952217, + "grad_norm": 0.21341819276363483, + "learning_rate": 0.00010217983963849923, + "loss": 0.9206, + "step": 15408 + }, + { + "epoch": 3.286902730375427, + "grad_norm": 0.3251124857487953, + "learning_rate": 0.00010217166334409278, + "loss": 0.9323, + "step": 15409 + }, + { + "epoch": 3.2871160409556315, + "grad_norm": 0.32146159112184464, + "learning_rate": 0.0001021634867988136, + "loss": 0.9194, + "step": 15410 + }, + { + "epoch": 3.287329351535836, + "grad_norm": 0.308084993456279, + "learning_rate": 0.00010215531000275418, + "loss": 0.9102, + "step": 15411 + }, + { + "epoch": 3.287542662116041, + "grad_norm": 0.2922760096313012, + "learning_rate": 0.00010214713295600707, + "loss": 0.9306, + "step": 15412 + }, + { + "epoch": 3.287755972696246, + "grad_norm": 0.2272131591482984, + "learning_rate": 0.00010213895565866477, + "loss": 0.9102, + "step": 15413 + }, + { + "epoch": 3.2879692832764507, + "grad_norm": 0.20933327358132958, + "learning_rate": 0.00010213077811081981, + "loss": 0.9102, + "step": 15414 + }, + { + "epoch": 3.2881825938566553, + "grad_norm": 0.25066734282820713, + "learning_rate": 0.00010212260031256472, + "loss": 0.912, + "step": 15415 + }, + { + "epoch": 3.28839590443686, + "grad_norm": 0.2510044105143302, + "learning_rate": 0.00010211442226399205, + "loss": 0.905, + "step": 15416 + }, + { + "epoch": 3.2886092150170647, + "grad_norm": 0.2929255499991047, + "learning_rate": 0.00010210624396519431, + "loss": 0.9145, + "step": 15417 + }, + { + "epoch": 3.2888225255972694, + "grad_norm": 0.27013852762029517, + "learning_rate": 0.00010209806541626405, + "loss": 0.9223, + "step": 15418 + }, + { + "epoch": 3.2890358361774745, + "grad_norm": 0.24298058800106526, + "learning_rate": 0.00010208988661729382, + "loss": 0.9181, + "step": 15419 + }, + { + "epoch": 3.289249146757679, + "grad_norm": 0.25440176518086205, + "learning_rate": 0.00010208170756837612, + "loss": 0.9323, + "step": 15420 + }, + { + "epoch": 3.289462457337884, + "grad_norm": 0.2318261325908236, + "learning_rate": 0.00010207352826960355, + "loss": 0.9181, + "step": 15421 + }, + { + "epoch": 3.2896757679180886, + "grad_norm": 0.21498949987333835, + "learning_rate": 0.00010206534872106862, + "loss": 0.8979, + "step": 15422 + }, + { + "epoch": 3.2898890784982937, + "grad_norm": 0.21790328349051646, + "learning_rate": 0.00010205716892286394, + "loss": 0.9299, + "step": 15423 + }, + { + "epoch": 3.2901023890784984, + "grad_norm": 0.20688635965902835, + "learning_rate": 0.00010204898887508198, + "loss": 0.9187, + "step": 15424 + }, + { + "epoch": 3.290315699658703, + "grad_norm": 0.23311273911995192, + "learning_rate": 0.00010204080857781537, + "loss": 0.9203, + "step": 15425 + }, + { + "epoch": 3.2905290102389078, + "grad_norm": 0.2509957036002831, + "learning_rate": 0.00010203262803115661, + "loss": 0.9317, + "step": 15426 + }, + { + "epoch": 3.2907423208191124, + "grad_norm": 0.2802638547938892, + "learning_rate": 0.0001020244472351983, + "loss": 0.9513, + "step": 15427 + }, + { + "epoch": 3.2909556313993176, + "grad_norm": 0.27340576214457885, + "learning_rate": 0.00010201626619003301, + "loss": 0.9295, + "step": 15428 + }, + { + "epoch": 3.2911689419795223, + "grad_norm": 0.26702280957579355, + "learning_rate": 0.00010200808489575331, + "loss": 0.9151, + "step": 15429 + }, + { + "epoch": 3.291382252559727, + "grad_norm": 0.2277685813643412, + "learning_rate": 0.00010199990335245173, + "loss": 0.9028, + "step": 15430 + }, + { + "epoch": 3.2915955631399316, + "grad_norm": 0.1888181792151092, + "learning_rate": 0.0001019917215602209, + "loss": 0.9158, + "step": 15431 + }, + { + "epoch": 3.2918088737201368, + "grad_norm": 0.20736842657758595, + "learning_rate": 0.00010198353951915334, + "loss": 0.8767, + "step": 15432 + }, + { + "epoch": 3.2920221843003414, + "grad_norm": 0.22943635907487325, + "learning_rate": 0.00010197535722934167, + "loss": 0.9086, + "step": 15433 + }, + { + "epoch": 3.292235494880546, + "grad_norm": 0.1845075422765288, + "learning_rate": 0.00010196717469087846, + "loss": 0.8962, + "step": 15434 + }, + { + "epoch": 3.292448805460751, + "grad_norm": 0.2062652966212931, + "learning_rate": 0.00010195899190385633, + "loss": 0.9112, + "step": 15435 + }, + { + "epoch": 3.2926621160409555, + "grad_norm": 0.22363964917198137, + "learning_rate": 0.00010195080886836782, + "loss": 0.9276, + "step": 15436 + }, + { + "epoch": 3.29287542662116, + "grad_norm": 0.1926936632139916, + "learning_rate": 0.00010194262558450554, + "loss": 0.8894, + "step": 15437 + }, + { + "epoch": 3.2930887372013653, + "grad_norm": 0.20142710460296717, + "learning_rate": 0.00010193444205236207, + "loss": 0.9459, + "step": 15438 + }, + { + "epoch": 3.29330204778157, + "grad_norm": 0.19755238224789198, + "learning_rate": 0.00010192625827203004, + "loss": 0.9289, + "step": 15439 + }, + { + "epoch": 3.2935153583617747, + "grad_norm": 0.2248200291475634, + "learning_rate": 0.000101918074243602, + "loss": 0.9084, + "step": 15440 + }, + { + "epoch": 3.2937286689419794, + "grad_norm": 0.23808054965338216, + "learning_rate": 0.00010190988996717061, + "loss": 0.9173, + "step": 15441 + }, + { + "epoch": 3.2939419795221845, + "grad_norm": 0.32271600140598555, + "learning_rate": 0.00010190170544282842, + "loss": 0.9162, + "step": 15442 + }, + { + "epoch": 3.294155290102389, + "grad_norm": 0.2717345821746575, + "learning_rate": 0.00010189352067066809, + "loss": 0.8786, + "step": 15443 + }, + { + "epoch": 3.294368600682594, + "grad_norm": 0.19843948931161243, + "learning_rate": 0.00010188533565078217, + "loss": 0.8967, + "step": 15444 + }, + { + "epoch": 3.2945819112627985, + "grad_norm": 0.2862642925543552, + "learning_rate": 0.00010187715038326335, + "loss": 0.9285, + "step": 15445 + }, + { + "epoch": 3.294795221843003, + "grad_norm": 0.26088165448517664, + "learning_rate": 0.00010186896486820418, + "loss": 0.8998, + "step": 15446 + }, + { + "epoch": 3.2950085324232083, + "grad_norm": 0.29180705091452197, + "learning_rate": 0.00010186077910569733, + "loss": 0.8902, + "step": 15447 + }, + { + "epoch": 3.295221843003413, + "grad_norm": 0.35500818928208977, + "learning_rate": 0.00010185259309583537, + "loss": 0.9063, + "step": 15448 + }, + { + "epoch": 3.2954351535836177, + "grad_norm": 0.29384518581435604, + "learning_rate": 0.00010184440683871096, + "loss": 0.9343, + "step": 15449 + }, + { + "epoch": 3.2956484641638224, + "grad_norm": 0.23355455103148756, + "learning_rate": 0.00010183622033441672, + "loss": 0.9074, + "step": 15450 + }, + { + "epoch": 3.2958617747440275, + "grad_norm": 0.21670990898545914, + "learning_rate": 0.0001018280335830453, + "loss": 0.8936, + "step": 15451 + }, + { + "epoch": 3.296075085324232, + "grad_norm": 0.24654303658397278, + "learning_rate": 0.00010181984658468928, + "loss": 0.9082, + "step": 15452 + }, + { + "epoch": 3.296288395904437, + "grad_norm": 0.3228189810027144, + "learning_rate": 0.00010181165933944134, + "loss": 0.9094, + "step": 15453 + }, + { + "epoch": 3.2965017064846416, + "grad_norm": 0.3539872886417855, + "learning_rate": 0.0001018034718473941, + "loss": 0.8992, + "step": 15454 + }, + { + "epoch": 3.2967150170648463, + "grad_norm": 0.30969176596478704, + "learning_rate": 0.00010179528410864022, + "loss": 0.9341, + "step": 15455 + }, + { + "epoch": 3.296928327645051, + "grad_norm": 0.21838916758640048, + "learning_rate": 0.00010178709612327233, + "loss": 0.9058, + "step": 15456 + }, + { + "epoch": 3.297141638225256, + "grad_norm": 0.20482858909753185, + "learning_rate": 0.00010177890789138311, + "loss": 0.9014, + "step": 15457 + }, + { + "epoch": 3.2973549488054608, + "grad_norm": 0.2204504769546143, + "learning_rate": 0.00010177071941306514, + "loss": 0.9226, + "step": 15458 + }, + { + "epoch": 3.2975682593856654, + "grad_norm": 0.2895496630726177, + "learning_rate": 0.00010176253068841112, + "loss": 0.8924, + "step": 15459 + }, + { + "epoch": 3.29778156996587, + "grad_norm": 0.40063733388516254, + "learning_rate": 0.0001017543417175137, + "loss": 0.9085, + "step": 15460 + }, + { + "epoch": 3.2979948805460753, + "grad_norm": 0.4045599962724859, + "learning_rate": 0.00010174615250046555, + "loss": 0.9224, + "step": 15461 + }, + { + "epoch": 3.29820819112628, + "grad_norm": 0.311039314453404, + "learning_rate": 0.00010173796303735932, + "loss": 0.9382, + "step": 15462 + }, + { + "epoch": 3.2984215017064846, + "grad_norm": 0.22992134037672887, + "learning_rate": 0.00010172977332828768, + "loss": 0.8933, + "step": 15463 + }, + { + "epoch": 3.2986348122866893, + "grad_norm": 0.23665873917212363, + "learning_rate": 0.00010172158337334327, + "loss": 0.8954, + "step": 15464 + }, + { + "epoch": 3.298848122866894, + "grad_norm": 0.21507166855337054, + "learning_rate": 0.00010171339317261878, + "loss": 0.8861, + "step": 15465 + }, + { + "epoch": 3.299061433447099, + "grad_norm": 0.23561440209658033, + "learning_rate": 0.00010170520272620688, + "loss": 0.9133, + "step": 15466 + }, + { + "epoch": 3.299274744027304, + "grad_norm": 0.24104674629445777, + "learning_rate": 0.00010169701203420024, + "loss": 0.9548, + "step": 15467 + }, + { + "epoch": 3.2994880546075085, + "grad_norm": 0.31462565936699194, + "learning_rate": 0.00010168882109669157, + "loss": 0.9065, + "step": 15468 + }, + { + "epoch": 3.299701365187713, + "grad_norm": 0.3856247988294411, + "learning_rate": 0.00010168062991377352, + "loss": 0.9061, + "step": 15469 + }, + { + "epoch": 3.2999146757679183, + "grad_norm": 0.32692822019271306, + "learning_rate": 0.00010167243848553874, + "loss": 0.9023, + "step": 15470 + }, + { + "epoch": 3.300127986348123, + "grad_norm": 0.19836888886112822, + "learning_rate": 0.00010166424681208, + "loss": 0.9023, + "step": 15471 + }, + { + "epoch": 3.3003412969283277, + "grad_norm": 0.2508264562084452, + "learning_rate": 0.00010165605489348992, + "loss": 0.9717, + "step": 15472 + }, + { + "epoch": 3.3005546075085324, + "grad_norm": 0.29364545542048753, + "learning_rate": 0.00010164786272986121, + "loss": 0.9299, + "step": 15473 + }, + { + "epoch": 3.300767918088737, + "grad_norm": 0.29400108019390764, + "learning_rate": 0.00010163967032128658, + "loss": 0.9214, + "step": 15474 + }, + { + "epoch": 3.300981228668942, + "grad_norm": 0.33539120280639273, + "learning_rate": 0.00010163147766785873, + "loss": 0.9296, + "step": 15475 + }, + { + "epoch": 3.301194539249147, + "grad_norm": 0.39288091240749445, + "learning_rate": 0.00010162328476967031, + "loss": 0.9276, + "step": 15476 + }, + { + "epoch": 3.3014078498293515, + "grad_norm": 0.35389189783745634, + "learning_rate": 0.0001016150916268141, + "loss": 0.9191, + "step": 15477 + }, + { + "epoch": 3.301621160409556, + "grad_norm": 0.24449206330514162, + "learning_rate": 0.00010160689823938272, + "loss": 0.9297, + "step": 15478 + }, + { + "epoch": 3.301834470989761, + "grad_norm": 0.2219582020277601, + "learning_rate": 0.00010159870460746897, + "loss": 0.9225, + "step": 15479 + }, + { + "epoch": 3.302047781569966, + "grad_norm": 0.3689031252340116, + "learning_rate": 0.00010159051073116548, + "loss": 0.9226, + "step": 15480 + }, + { + "epoch": 3.3022610921501707, + "grad_norm": 0.3139095871621471, + "learning_rate": 0.00010158231661056501, + "loss": 0.9197, + "step": 15481 + }, + { + "epoch": 3.3024744027303754, + "grad_norm": 0.28350965161498487, + "learning_rate": 0.00010157412224576025, + "loss": 0.9014, + "step": 15482 + }, + { + "epoch": 3.30268771331058, + "grad_norm": 0.3309019623018606, + "learning_rate": 0.00010156592763684396, + "loss": 0.9287, + "step": 15483 + }, + { + "epoch": 3.3029010238907848, + "grad_norm": 0.280261004875457, + "learning_rate": 0.0001015577327839088, + "loss": 0.8947, + "step": 15484 + }, + { + "epoch": 3.30311433447099, + "grad_norm": 0.22440246042917486, + "learning_rate": 0.00010154953768704758, + "loss": 0.9363, + "step": 15485 + }, + { + "epoch": 3.3033276450511946, + "grad_norm": 0.2730425240681108, + "learning_rate": 0.00010154134234635295, + "loss": 0.9013, + "step": 15486 + }, + { + "epoch": 3.3035409556313993, + "grad_norm": 0.33307906715557783, + "learning_rate": 0.00010153314676191769, + "loss": 0.936, + "step": 15487 + }, + { + "epoch": 3.303754266211604, + "grad_norm": 0.26826291961137105, + "learning_rate": 0.00010152495093383448, + "loss": 0.9564, + "step": 15488 + }, + { + "epoch": 3.303967576791809, + "grad_norm": 0.22767414222875976, + "learning_rate": 0.0001015167548621961, + "loss": 0.9199, + "step": 15489 + }, + { + "epoch": 3.3041808873720138, + "grad_norm": 0.27176964329040015, + "learning_rate": 0.00010150855854709528, + "loss": 0.9276, + "step": 15490 + }, + { + "epoch": 3.3043941979522184, + "grad_norm": 0.2681169939389418, + "learning_rate": 0.00010150036198862477, + "loss": 0.9228, + "step": 15491 + }, + { + "epoch": 3.304607508532423, + "grad_norm": 0.21410754063642498, + "learning_rate": 0.00010149216518687728, + "loss": 0.9004, + "step": 15492 + }, + { + "epoch": 3.304820819112628, + "grad_norm": 0.28806282199087513, + "learning_rate": 0.0001014839681419456, + "loss": 0.9198, + "step": 15493 + }, + { + "epoch": 3.305034129692833, + "grad_norm": 0.3749194823135545, + "learning_rate": 0.00010147577085392244, + "loss": 0.9139, + "step": 15494 + }, + { + "epoch": 3.3052474402730376, + "grad_norm": 0.3514731620611743, + "learning_rate": 0.00010146757332290058, + "loss": 0.9208, + "step": 15495 + }, + { + "epoch": 3.3054607508532423, + "grad_norm": 0.2891091812972248, + "learning_rate": 0.00010145937554897277, + "loss": 0.9177, + "step": 15496 + }, + { + "epoch": 3.305674061433447, + "grad_norm": 0.3085907849204405, + "learning_rate": 0.00010145117753223176, + "loss": 0.8871, + "step": 15497 + }, + { + "epoch": 3.305887372013652, + "grad_norm": 0.24757430056389415, + "learning_rate": 0.0001014429792727703, + "loss": 0.9762, + "step": 15498 + }, + { + "epoch": 3.306100682593857, + "grad_norm": 0.2329913968724521, + "learning_rate": 0.0001014347807706812, + "loss": 0.876, + "step": 15499 + }, + { + "epoch": 3.3063139931740615, + "grad_norm": 0.35672716662913895, + "learning_rate": 0.00010142658202605717, + "loss": 0.9429, + "step": 15500 + }, + { + "epoch": 3.306527303754266, + "grad_norm": 0.4444682625392062, + "learning_rate": 0.00010141838303899103, + "loss": 0.933, + "step": 15501 + }, + { + "epoch": 3.306740614334471, + "grad_norm": 0.33698397355612253, + "learning_rate": 0.00010141018380957552, + "loss": 0.9093, + "step": 15502 + }, + { + "epoch": 3.3069539249146755, + "grad_norm": 0.2845149004705969, + "learning_rate": 0.0001014019843379034, + "loss": 0.8956, + "step": 15503 + }, + { + "epoch": 3.3071672354948807, + "grad_norm": 0.2412807308187734, + "learning_rate": 0.00010139378462406747, + "loss": 0.9334, + "step": 15504 + }, + { + "epoch": 3.3073805460750854, + "grad_norm": 0.22705849651621138, + "learning_rate": 0.00010138558466816052, + "loss": 0.9105, + "step": 15505 + }, + { + "epoch": 3.30759385665529, + "grad_norm": 0.23912866384209835, + "learning_rate": 0.00010137738447027531, + "loss": 0.8944, + "step": 15506 + }, + { + "epoch": 3.3078071672354947, + "grad_norm": 0.2882382137438124, + "learning_rate": 0.00010136918403050465, + "loss": 0.9151, + "step": 15507 + }, + { + "epoch": 3.3080204778157, + "grad_norm": 0.35775334562337524, + "learning_rate": 0.0001013609833489413, + "loss": 0.9227, + "step": 15508 + }, + { + "epoch": 3.3082337883959045, + "grad_norm": 0.28858347426809206, + "learning_rate": 0.00010135278242567808, + "loss": 0.9119, + "step": 15509 + }, + { + "epoch": 3.3084470989761092, + "grad_norm": 0.23621422696117242, + "learning_rate": 0.00010134458126080774, + "loss": 0.9255, + "step": 15510 + }, + { + "epoch": 3.308660409556314, + "grad_norm": 0.24527071406117368, + "learning_rate": 0.00010133637985442312, + "loss": 0.9041, + "step": 15511 + }, + { + "epoch": 3.3088737201365186, + "grad_norm": 0.2054255785330727, + "learning_rate": 0.00010132817820661699, + "loss": 0.8982, + "step": 15512 + }, + { + "epoch": 3.3090870307167237, + "grad_norm": 0.26164252460376447, + "learning_rate": 0.0001013199763174822, + "loss": 0.93, + "step": 15513 + }, + { + "epoch": 3.3093003412969284, + "grad_norm": 0.2361498531554305, + "learning_rate": 0.00010131177418711148, + "loss": 0.8888, + "step": 15514 + }, + { + "epoch": 3.309513651877133, + "grad_norm": 0.26757222934499303, + "learning_rate": 0.00010130357181559771, + "loss": 0.9285, + "step": 15515 + }, + { + "epoch": 3.3097269624573378, + "grad_norm": 0.1946584100197964, + "learning_rate": 0.00010129536920303364, + "loss": 0.9101, + "step": 15516 + }, + { + "epoch": 3.309940273037543, + "grad_norm": 0.2292254904952346, + "learning_rate": 0.00010128716634951211, + "loss": 0.962, + "step": 15517 + }, + { + "epoch": 3.3101535836177476, + "grad_norm": 0.19974760095177413, + "learning_rate": 0.00010127896325512595, + "loss": 0.9348, + "step": 15518 + }, + { + "epoch": 3.3103668941979523, + "grad_norm": 0.3062970791618409, + "learning_rate": 0.00010127075991996796, + "loss": 0.9441, + "step": 15519 + }, + { + "epoch": 3.310580204778157, + "grad_norm": 0.21961831877839025, + "learning_rate": 0.00010126255634413095, + "loss": 0.95, + "step": 15520 + }, + { + "epoch": 3.3107935153583616, + "grad_norm": 0.1832415341471482, + "learning_rate": 0.00010125435252770777, + "loss": 0.9023, + "step": 15521 + }, + { + "epoch": 3.3110068259385663, + "grad_norm": 0.24268740703414762, + "learning_rate": 0.00010124614847079123, + "loss": 0.9344, + "step": 15522 + }, + { + "epoch": 3.3112201365187715, + "grad_norm": 0.24303739954823855, + "learning_rate": 0.00010123794417347417, + "loss": 0.887, + "step": 15523 + }, + { + "epoch": 3.311433447098976, + "grad_norm": 0.19155037110555, + "learning_rate": 0.00010122973963584941, + "loss": 0.9585, + "step": 15524 + }, + { + "epoch": 3.311646757679181, + "grad_norm": 0.24238444892039365, + "learning_rate": 0.0001012215348580098, + "loss": 0.9239, + "step": 15525 + }, + { + "epoch": 3.3118600682593855, + "grad_norm": 0.18001613443260164, + "learning_rate": 0.00010121332984004813, + "loss": 0.9122, + "step": 15526 + }, + { + "epoch": 3.3120733788395906, + "grad_norm": 0.2695055342731684, + "learning_rate": 0.00010120512458205732, + "loss": 0.9088, + "step": 15527 + }, + { + "epoch": 3.3122866894197953, + "grad_norm": 0.23337510333188122, + "learning_rate": 0.00010119691908413013, + "loss": 0.8928, + "step": 15528 + }, + { + "epoch": 3.3125, + "grad_norm": 0.17364876712602392, + "learning_rate": 0.00010118871334635945, + "loss": 0.8886, + "step": 15529 + }, + { + "epoch": 3.3127133105802047, + "grad_norm": 0.23210609178159802, + "learning_rate": 0.00010118050736883815, + "loss": 0.9148, + "step": 15530 + }, + { + "epoch": 3.3129266211604094, + "grad_norm": 0.1900127522975518, + "learning_rate": 0.00010117230115165903, + "loss": 0.9312, + "step": 15531 + }, + { + "epoch": 3.3131399317406145, + "grad_norm": 0.19260343663627785, + "learning_rate": 0.00010116409469491498, + "loss": 0.9179, + "step": 15532 + }, + { + "epoch": 3.313353242320819, + "grad_norm": 0.23338101297842748, + "learning_rate": 0.00010115588799869884, + "loss": 0.9471, + "step": 15533 + }, + { + "epoch": 3.313566552901024, + "grad_norm": 0.2263254039681459, + "learning_rate": 0.00010114768106310344, + "loss": 0.9457, + "step": 15534 + }, + { + "epoch": 3.3137798634812285, + "grad_norm": 0.19208896616058202, + "learning_rate": 0.00010113947388822171, + "loss": 0.9125, + "step": 15535 + }, + { + "epoch": 3.3139931740614337, + "grad_norm": 0.20256580178434727, + "learning_rate": 0.00010113126647414648, + "loss": 0.9009, + "step": 15536 + }, + { + "epoch": 3.3142064846416384, + "grad_norm": 0.23024387362101248, + "learning_rate": 0.0001011230588209706, + "loss": 0.9211, + "step": 15537 + }, + { + "epoch": 3.314419795221843, + "grad_norm": 0.21100281144258196, + "learning_rate": 0.00010111485092878695, + "loss": 0.9226, + "step": 15538 + }, + { + "epoch": 3.3146331058020477, + "grad_norm": 0.20579139556871803, + "learning_rate": 0.00010110664279768841, + "loss": 0.9348, + "step": 15539 + }, + { + "epoch": 3.3148464163822524, + "grad_norm": 0.18503784465989198, + "learning_rate": 0.00010109843442776785, + "loss": 0.8964, + "step": 15540 + }, + { + "epoch": 3.315059726962457, + "grad_norm": 0.21035726300788013, + "learning_rate": 0.00010109022581911818, + "loss": 0.9539, + "step": 15541 + }, + { + "epoch": 3.3152730375426622, + "grad_norm": 0.21159823775542252, + "learning_rate": 0.00010108201697183222, + "loss": 0.9184, + "step": 15542 + }, + { + "epoch": 3.315486348122867, + "grad_norm": 0.19006213437226238, + "learning_rate": 0.00010107380788600289, + "loss": 0.9047, + "step": 15543 + }, + { + "epoch": 3.3156996587030716, + "grad_norm": 0.5721401955446281, + "learning_rate": 0.00010106559856172305, + "loss": 0.9665, + "step": 15544 + }, + { + "epoch": 3.3159129692832763, + "grad_norm": 0.19937710144262355, + "learning_rate": 0.00010105738899908563, + "loss": 0.9454, + "step": 15545 + }, + { + "epoch": 3.3161262798634814, + "grad_norm": 0.21149985938705773, + "learning_rate": 0.0001010491791981835, + "loss": 0.9326, + "step": 15546 + }, + { + "epoch": 3.316339590443686, + "grad_norm": 0.21526150595040205, + "learning_rate": 0.00010104096915910954, + "loss": 0.8823, + "step": 15547 + }, + { + "epoch": 3.3165529010238908, + "grad_norm": 0.2320808437460468, + "learning_rate": 0.00010103275888195667, + "loss": 0.9392, + "step": 15548 + }, + { + "epoch": 3.3167662116040955, + "grad_norm": 0.24010020950620117, + "learning_rate": 0.00010102454836681779, + "loss": 0.9416, + "step": 15549 + }, + { + "epoch": 3.3169795221843, + "grad_norm": 0.21870268413536215, + "learning_rate": 0.00010101633761378576, + "loss": 0.9291, + "step": 15550 + }, + { + "epoch": 3.3171928327645053, + "grad_norm": 0.2065529163018662, + "learning_rate": 0.00010100812662295354, + "loss": 0.9331, + "step": 15551 + }, + { + "epoch": 3.31740614334471, + "grad_norm": 0.22384402949991303, + "learning_rate": 0.00010099991539441402, + "loss": 0.9259, + "step": 15552 + }, + { + "epoch": 3.3176194539249146, + "grad_norm": 0.28713019211795415, + "learning_rate": 0.00010099170392826008, + "loss": 0.919, + "step": 15553 + }, + { + "epoch": 3.3178327645051193, + "grad_norm": 0.2673315131993411, + "learning_rate": 0.00010098349222458468, + "loss": 0.8956, + "step": 15554 + }, + { + "epoch": 3.3180460750853245, + "grad_norm": 0.19590948703156186, + "learning_rate": 0.0001009752802834807, + "loss": 0.9208, + "step": 15555 + }, + { + "epoch": 3.318259385665529, + "grad_norm": 0.2085384727084801, + "learning_rate": 0.00010096706810504107, + "loss": 0.9352, + "step": 15556 + }, + { + "epoch": 3.318472696245734, + "grad_norm": 0.20278873582082413, + "learning_rate": 0.0001009588556893587, + "loss": 0.9229, + "step": 15557 + }, + { + "epoch": 3.3186860068259385, + "grad_norm": 0.30312279974854767, + "learning_rate": 0.00010095064303652656, + "loss": 0.9421, + "step": 15558 + }, + { + "epoch": 3.318899317406143, + "grad_norm": 0.3281460132969356, + "learning_rate": 0.0001009424301466375, + "loss": 0.9314, + "step": 15559 + }, + { + "epoch": 3.319112627986348, + "grad_norm": 0.34861527990196806, + "learning_rate": 0.00010093421701978453, + "loss": 0.9118, + "step": 15560 + }, + { + "epoch": 3.319325938566553, + "grad_norm": 0.3334352649503409, + "learning_rate": 0.00010092600365606052, + "loss": 0.9284, + "step": 15561 + }, + { + "epoch": 3.3195392491467577, + "grad_norm": 0.2554191098427971, + "learning_rate": 0.00010091779005555842, + "loss": 0.8962, + "step": 15562 + }, + { + "epoch": 3.3197525597269624, + "grad_norm": 0.21593746841558925, + "learning_rate": 0.00010090957621837119, + "loss": 0.9056, + "step": 15563 + }, + { + "epoch": 3.319965870307167, + "grad_norm": 0.25457867787783667, + "learning_rate": 0.00010090136214459175, + "loss": 0.9152, + "step": 15564 + }, + { + "epoch": 3.320179180887372, + "grad_norm": 0.28749590065402486, + "learning_rate": 0.00010089314783431304, + "loss": 0.9046, + "step": 15565 + }, + { + "epoch": 3.320392491467577, + "grad_norm": 0.315113324358811, + "learning_rate": 0.00010088493328762803, + "loss": 0.9196, + "step": 15566 + }, + { + "epoch": 3.3206058020477816, + "grad_norm": 0.3496536381258312, + "learning_rate": 0.00010087671850462963, + "loss": 0.9448, + "step": 15567 + }, + { + "epoch": 3.3208191126279862, + "grad_norm": 0.27374270896056674, + "learning_rate": 0.00010086850348541079, + "loss": 0.9266, + "step": 15568 + }, + { + "epoch": 3.321032423208191, + "grad_norm": 0.2202849663242612, + "learning_rate": 0.00010086028823006452, + "loss": 0.8983, + "step": 15569 + }, + { + "epoch": 3.321245733788396, + "grad_norm": 0.2324722554352824, + "learning_rate": 0.00010085207273868373, + "loss": 0.9015, + "step": 15570 + }, + { + "epoch": 3.3214590443686007, + "grad_norm": 0.18832659201455093, + "learning_rate": 0.00010084385701136136, + "loss": 0.9221, + "step": 15571 + }, + { + "epoch": 3.3216723549488054, + "grad_norm": 0.19443485815155964, + "learning_rate": 0.00010083564104819042, + "loss": 0.9147, + "step": 15572 + }, + { + "epoch": 3.32188566552901, + "grad_norm": 0.2362536919047268, + "learning_rate": 0.00010082742484926386, + "loss": 0.9097, + "step": 15573 + }, + { + "epoch": 3.3220989761092152, + "grad_norm": 0.23334038642372476, + "learning_rate": 0.00010081920841467462, + "loss": 0.9497, + "step": 15574 + }, + { + "epoch": 3.32231228668942, + "grad_norm": 0.20390725631264295, + "learning_rate": 0.00010081099174451569, + "loss": 0.9494, + "step": 15575 + }, + { + "epoch": 3.3225255972696246, + "grad_norm": 0.21020297275616337, + "learning_rate": 0.00010080277483888004, + "loss": 0.8864, + "step": 15576 + }, + { + "epoch": 3.3227389078498293, + "grad_norm": 0.21292033696213072, + "learning_rate": 0.00010079455769786063, + "loss": 0.9537, + "step": 15577 + }, + { + "epoch": 3.322952218430034, + "grad_norm": 0.223939102515254, + "learning_rate": 0.00010078634032155049, + "loss": 0.9165, + "step": 15578 + }, + { + "epoch": 3.323165529010239, + "grad_norm": 0.2395412397894361, + "learning_rate": 0.00010077812271004253, + "loss": 0.9222, + "step": 15579 + }, + { + "epoch": 3.323378839590444, + "grad_norm": 0.23530495307722968, + "learning_rate": 0.00010076990486342976, + "loss": 0.8971, + "step": 15580 + }, + { + "epoch": 3.3235921501706485, + "grad_norm": 0.20878450468620546, + "learning_rate": 0.00010076168678180517, + "loss": 0.889, + "step": 15581 + }, + { + "epoch": 3.323805460750853, + "grad_norm": 0.24871933183510875, + "learning_rate": 0.00010075346846526174, + "loss": 0.9433, + "step": 15582 + }, + { + "epoch": 3.324018771331058, + "grad_norm": 0.2890702375853585, + "learning_rate": 0.00010074524991389245, + "loss": 0.8796, + "step": 15583 + }, + { + "epoch": 3.324232081911263, + "grad_norm": 0.22540951124164255, + "learning_rate": 0.00010073703112779035, + "loss": 0.9358, + "step": 15584 + }, + { + "epoch": 3.3244453924914676, + "grad_norm": 1.062398318813812, + "learning_rate": 0.00010072881210704836, + "loss": 1.0035, + "step": 15585 + }, + { + "epoch": 3.3246587030716723, + "grad_norm": 0.3134122264675552, + "learning_rate": 0.00010072059285175954, + "loss": 0.9016, + "step": 15586 + }, + { + "epoch": 3.324872013651877, + "grad_norm": 0.2990034740029439, + "learning_rate": 0.00010071237336201685, + "loss": 0.9382, + "step": 15587 + }, + { + "epoch": 3.3250853242320817, + "grad_norm": 0.2887030914942807, + "learning_rate": 0.00010070415363791331, + "loss": 0.9357, + "step": 15588 + }, + { + "epoch": 3.325298634812287, + "grad_norm": 0.33407596681992036, + "learning_rate": 0.0001006959336795419, + "loss": 0.8991, + "step": 15589 + }, + { + "epoch": 3.3255119453924915, + "grad_norm": 0.38831102607833357, + "learning_rate": 0.00010068771348699567, + "loss": 0.8919, + "step": 15590 + }, + { + "epoch": 3.325725255972696, + "grad_norm": 0.4305107810005645, + "learning_rate": 0.00010067949306036763, + "loss": 0.9167, + "step": 15591 + }, + { + "epoch": 3.325938566552901, + "grad_norm": 0.3826982406995613, + "learning_rate": 0.00010067127239975076, + "loss": 0.9305, + "step": 15592 + }, + { + "epoch": 3.326151877133106, + "grad_norm": 0.4090924778900884, + "learning_rate": 0.0001006630515052381, + "loss": 0.9343, + "step": 15593 + }, + { + "epoch": 3.3263651877133107, + "grad_norm": 0.37888427396086183, + "learning_rate": 0.00010065483037692266, + "loss": 0.9769, + "step": 15594 + }, + { + "epoch": 3.3265784982935154, + "grad_norm": 0.3472016876376058, + "learning_rate": 0.00010064660901489746, + "loss": 0.9094, + "step": 15595 + }, + { + "epoch": 3.32679180887372, + "grad_norm": 0.2449083179667724, + "learning_rate": 0.00010063838741925555, + "loss": 0.9233, + "step": 15596 + }, + { + "epoch": 3.3270051194539247, + "grad_norm": 0.32281312010891433, + "learning_rate": 0.00010063016559008994, + "loss": 0.9371, + "step": 15597 + }, + { + "epoch": 3.32721843003413, + "grad_norm": 0.3922315634990177, + "learning_rate": 0.00010062194352749367, + "loss": 0.9293, + "step": 15598 + }, + { + "epoch": 3.3274317406143346, + "grad_norm": 0.4213515287369562, + "learning_rate": 0.00010061372123155973, + "loss": 0.9275, + "step": 15599 + }, + { + "epoch": 3.3276450511945392, + "grad_norm": 0.38450520582321396, + "learning_rate": 0.00010060549870238119, + "loss": 0.9727, + "step": 15600 + }, + { + "epoch": 3.327858361774744, + "grad_norm": 0.3324719577625751, + "learning_rate": 0.00010059727594005108, + "loss": 0.9309, + "step": 15601 + }, + { + "epoch": 3.328071672354949, + "grad_norm": 0.25433166260561785, + "learning_rate": 0.00010058905294466248, + "loss": 0.9255, + "step": 15602 + }, + { + "epoch": 3.3282849829351537, + "grad_norm": 0.23028065174252643, + "learning_rate": 0.00010058082971630837, + "loss": 0.924, + "step": 15603 + }, + { + "epoch": 3.3284982935153584, + "grad_norm": 0.31603300712860066, + "learning_rate": 0.00010057260625508186, + "loss": 0.9106, + "step": 15604 + }, + { + "epoch": 3.328711604095563, + "grad_norm": 0.29347436981675545, + "learning_rate": 0.00010056438256107592, + "loss": 0.9468, + "step": 15605 + }, + { + "epoch": 3.328924914675768, + "grad_norm": 0.24617004193064412, + "learning_rate": 0.00010055615863438368, + "loss": 0.8945, + "step": 15606 + }, + { + "epoch": 3.3291382252559725, + "grad_norm": 0.22409989459501034, + "learning_rate": 0.00010054793447509813, + "loss": 0.921, + "step": 15607 + }, + { + "epoch": 3.3293515358361776, + "grad_norm": 0.23427583045276604, + "learning_rate": 0.00010053971008331239, + "loss": 0.9221, + "step": 15608 + }, + { + "epoch": 3.3295648464163823, + "grad_norm": 0.2691832761958728, + "learning_rate": 0.00010053148545911945, + "loss": 0.9402, + "step": 15609 + }, + { + "epoch": 3.329778156996587, + "grad_norm": 0.33342730346112304, + "learning_rate": 0.00010052326060261245, + "loss": 0.9203, + "step": 15610 + }, + { + "epoch": 3.3299914675767917, + "grad_norm": 0.3330395759139851, + "learning_rate": 0.00010051503551388436, + "loss": 0.9229, + "step": 15611 + }, + { + "epoch": 3.330204778156997, + "grad_norm": 0.3230999371773979, + "learning_rate": 0.00010050681019302832, + "loss": 0.9147, + "step": 15612 + }, + { + "epoch": 3.3304180887372015, + "grad_norm": 0.3228880919314504, + "learning_rate": 0.0001004985846401374, + "loss": 0.9063, + "step": 15613 + }, + { + "epoch": 3.330631399317406, + "grad_norm": 0.28370203678254613, + "learning_rate": 0.00010049035885530464, + "loss": 0.8957, + "step": 15614 + }, + { + "epoch": 3.330844709897611, + "grad_norm": 0.2235650244140983, + "learning_rate": 0.00010048213283862311, + "loss": 0.9289, + "step": 15615 + }, + { + "epoch": 3.3310580204778155, + "grad_norm": 0.2775344102402608, + "learning_rate": 0.0001004739065901859, + "loss": 0.927, + "step": 15616 + }, + { + "epoch": 3.3312713310580206, + "grad_norm": 0.33354752572526136, + "learning_rate": 0.00010046568011008608, + "loss": 0.8872, + "step": 15617 + }, + { + "epoch": 3.3314846416382253, + "grad_norm": 0.26707902993708327, + "learning_rate": 0.00010045745339841677, + "loss": 0.9071, + "step": 15618 + }, + { + "epoch": 3.33169795221843, + "grad_norm": 0.27275366733093115, + "learning_rate": 0.00010044922645527104, + "loss": 0.9336, + "step": 15619 + }, + { + "epoch": 3.3319112627986347, + "grad_norm": 0.27378013966921894, + "learning_rate": 0.00010044099928074197, + "loss": 0.9135, + "step": 15620 + }, + { + "epoch": 3.33212457337884, + "grad_norm": 0.2713000945762902, + "learning_rate": 0.00010043277187492262, + "loss": 0.9504, + "step": 15621 + }, + { + "epoch": 3.3323378839590445, + "grad_norm": 0.3366596598802374, + "learning_rate": 0.00010042454423790614, + "loss": 0.9341, + "step": 15622 + }, + { + "epoch": 3.332551194539249, + "grad_norm": 0.3453464590972913, + "learning_rate": 0.00010041631636978556, + "loss": 0.9153, + "step": 15623 + }, + { + "epoch": 3.332764505119454, + "grad_norm": 0.3287863581004956, + "learning_rate": 0.00010040808827065406, + "loss": 0.9277, + "step": 15624 + }, + { + "epoch": 3.3329778156996586, + "grad_norm": 0.21965640663566252, + "learning_rate": 0.00010039985994060469, + "loss": 0.9122, + "step": 15625 + }, + { + "epoch": 3.3331911262798632, + "grad_norm": 0.20798702591590162, + "learning_rate": 0.00010039163137973054, + "loss": 0.9416, + "step": 15626 + }, + { + "epoch": 3.3334044368600684, + "grad_norm": 0.27012699281488034, + "learning_rate": 0.00010038340258812475, + "loss": 0.9088, + "step": 15627 + }, + { + "epoch": 3.333617747440273, + "grad_norm": 0.3264418546108216, + "learning_rate": 0.0001003751735658804, + "loss": 0.883, + "step": 15628 + }, + { + "epoch": 3.3338310580204777, + "grad_norm": 0.3643585047903687, + "learning_rate": 0.00010036694431309065, + "loss": 0.925, + "step": 15629 + }, + { + "epoch": 3.3340443686006824, + "grad_norm": 0.23309362004913928, + "learning_rate": 0.00010035871482984858, + "loss": 0.8768, + "step": 15630 + }, + { + "epoch": 3.3342576791808876, + "grad_norm": 0.2110003519983153, + "learning_rate": 0.00010035048511624729, + "loss": 0.9064, + "step": 15631 + }, + { + "epoch": 3.3344709897610922, + "grad_norm": 0.21264324878382398, + "learning_rate": 0.00010034225517237993, + "loss": 0.926, + "step": 15632 + }, + { + "epoch": 3.334684300341297, + "grad_norm": 0.2806968557098419, + "learning_rate": 0.00010033402499833959, + "loss": 0.9007, + "step": 15633 + }, + { + "epoch": 3.3348976109215016, + "grad_norm": 0.3078044017575303, + "learning_rate": 0.00010032579459421943, + "loss": 0.9438, + "step": 15634 + }, + { + "epoch": 3.3351109215017063, + "grad_norm": 0.279901644298786, + "learning_rate": 0.00010031756396011256, + "loss": 0.9305, + "step": 15635 + }, + { + "epoch": 3.3353242320819114, + "grad_norm": 0.25220874923278075, + "learning_rate": 0.00010030933309611215, + "loss": 0.9137, + "step": 15636 + }, + { + "epoch": 3.335537542662116, + "grad_norm": 0.5027292629090705, + "learning_rate": 0.00010030110200231123, + "loss": 0.9344, + "step": 15637 + }, + { + "epoch": 3.335750853242321, + "grad_norm": 0.24910439031380344, + "learning_rate": 0.00010029287067880304, + "loss": 0.9325, + "step": 15638 + }, + { + "epoch": 3.3359641638225255, + "grad_norm": 0.25300832985896726, + "learning_rate": 0.00010028463912568065, + "loss": 0.9499, + "step": 15639 + }, + { + "epoch": 3.3361774744027306, + "grad_norm": 0.293539310432947, + "learning_rate": 0.00010027640734303726, + "loss": 0.9279, + "step": 15640 + }, + { + "epoch": 3.3363907849829353, + "grad_norm": 0.3622820968910396, + "learning_rate": 0.00010026817533096597, + "loss": 0.9217, + "step": 15641 + }, + { + "epoch": 3.33660409556314, + "grad_norm": 0.32205371520843734, + "learning_rate": 0.00010025994308955993, + "loss": 0.9207, + "step": 15642 + }, + { + "epoch": 3.3368174061433447, + "grad_norm": 0.30361559645863007, + "learning_rate": 0.00010025171061891228, + "loss": 0.9192, + "step": 15643 + }, + { + "epoch": 3.3370307167235493, + "grad_norm": 0.24679731512723696, + "learning_rate": 0.0001002434779191162, + "loss": 0.9194, + "step": 15644 + }, + { + "epoch": 3.337244027303754, + "grad_norm": 0.2233021284254193, + "learning_rate": 0.0001002352449902648, + "loss": 0.9013, + "step": 15645 + }, + { + "epoch": 3.337457337883959, + "grad_norm": 0.2800315422252971, + "learning_rate": 0.00010022701183245129, + "loss": 0.9282, + "step": 15646 + }, + { + "epoch": 3.337670648464164, + "grad_norm": 0.32112640120811614, + "learning_rate": 0.00010021877844576882, + "loss": 0.9074, + "step": 15647 + }, + { + "epoch": 3.3378839590443685, + "grad_norm": 0.3331642889566838, + "learning_rate": 0.0001002105448303105, + "loss": 0.9343, + "step": 15648 + }, + { + "epoch": 3.338097269624573, + "grad_norm": 0.2932201326740236, + "learning_rate": 0.00010020231098616953, + "loss": 0.9517, + "step": 15649 + }, + { + "epoch": 3.3383105802047783, + "grad_norm": 0.23763371493115604, + "learning_rate": 0.00010019407691343906, + "loss": 0.9556, + "step": 15650 + }, + { + "epoch": 3.338523890784983, + "grad_norm": 0.2095456065980501, + "learning_rate": 0.0001001858426122123, + "loss": 0.9307, + "step": 15651 + }, + { + "epoch": 3.3387372013651877, + "grad_norm": 0.23746024401412233, + "learning_rate": 0.00010017760808258237, + "loss": 0.8941, + "step": 15652 + }, + { + "epoch": 3.3389505119453924, + "grad_norm": 0.22005130263038775, + "learning_rate": 0.00010016937332464248, + "loss": 0.9181, + "step": 15653 + }, + { + "epoch": 3.339163822525597, + "grad_norm": 0.2852280482040426, + "learning_rate": 0.00010016113833848577, + "loss": 0.9375, + "step": 15654 + }, + { + "epoch": 3.339377133105802, + "grad_norm": 0.3074483462290859, + "learning_rate": 0.00010015290312420545, + "loss": 0.9118, + "step": 15655 + }, + { + "epoch": 3.339590443686007, + "grad_norm": 0.23543933914472587, + "learning_rate": 0.00010014466768189471, + "loss": 0.9196, + "step": 15656 + }, + { + "epoch": 3.3398037542662116, + "grad_norm": 0.2256974526311712, + "learning_rate": 0.00010013643201164668, + "loss": 0.9322, + "step": 15657 + }, + { + "epoch": 3.3400170648464163, + "grad_norm": 0.27032633017984775, + "learning_rate": 0.00010012819611355464, + "loss": 0.9146, + "step": 15658 + }, + { + "epoch": 3.3402303754266214, + "grad_norm": 0.25835695128889885, + "learning_rate": 0.00010011995998771167, + "loss": 0.9178, + "step": 15659 + }, + { + "epoch": 3.340443686006826, + "grad_norm": 0.27448535401211355, + "learning_rate": 0.00010011172363421104, + "loss": 0.9416, + "step": 15660 + }, + { + "epoch": 3.3406569965870307, + "grad_norm": 0.31325922988605376, + "learning_rate": 0.00010010348705314589, + "loss": 0.9077, + "step": 15661 + }, + { + "epoch": 3.3408703071672354, + "grad_norm": 0.3188280615480169, + "learning_rate": 0.00010009525024460945, + "loss": 0.9124, + "step": 15662 + }, + { + "epoch": 3.34108361774744, + "grad_norm": 0.21237401284306706, + "learning_rate": 0.00010008701320869494, + "loss": 0.9151, + "step": 15663 + }, + { + "epoch": 3.3412969283276452, + "grad_norm": 0.30527269216969366, + "learning_rate": 0.00010007877594549556, + "loss": 0.9425, + "step": 15664 + }, + { + "epoch": 3.34151023890785, + "grad_norm": 0.35475525093575827, + "learning_rate": 0.00010007053845510443, + "loss": 0.9185, + "step": 15665 + }, + { + "epoch": 3.3417235494880546, + "grad_norm": 0.30208226355753975, + "learning_rate": 0.00010006230073761487, + "loss": 0.9297, + "step": 15666 + }, + { + "epoch": 3.3419368600682593, + "grad_norm": 0.2857634031781239, + "learning_rate": 0.00010005406279312, + "loss": 0.9388, + "step": 15667 + }, + { + "epoch": 3.342150170648464, + "grad_norm": 0.31924656962745684, + "learning_rate": 0.00010004582462171311, + "loss": 0.9067, + "step": 15668 + }, + { + "epoch": 3.342363481228669, + "grad_norm": 0.28750293972105617, + "learning_rate": 0.00010003758622348736, + "loss": 0.9406, + "step": 15669 + }, + { + "epoch": 3.342576791808874, + "grad_norm": 0.24999672265072173, + "learning_rate": 0.00010002934759853599, + "loss": 0.9263, + "step": 15670 + }, + { + "epoch": 3.3427901023890785, + "grad_norm": 0.2755492973877373, + "learning_rate": 0.00010002110874695221, + "loss": 0.9313, + "step": 15671 + }, + { + "epoch": 3.343003412969283, + "grad_norm": 0.20478462224863614, + "learning_rate": 0.00010001286966882924, + "loss": 0.8844, + "step": 15672 + }, + { + "epoch": 3.343216723549488, + "grad_norm": 0.2671831583943498, + "learning_rate": 0.00010000463036426032, + "loss": 0.9488, + "step": 15673 + }, + { + "epoch": 3.343430034129693, + "grad_norm": 0.2700727645163485, + "learning_rate": 9.999639083333867e-05, + "loss": 0.9008, + "step": 15674 + }, + { + "epoch": 3.3436433447098977, + "grad_norm": 0.20166685108081467, + "learning_rate": 9.998815107615752e-05, + "loss": 0.9098, + "step": 15675 + }, + { + "epoch": 3.3438566552901023, + "grad_norm": 0.2606443610323613, + "learning_rate": 9.997991109281012e-05, + "loss": 0.9008, + "step": 15676 + }, + { + "epoch": 3.344069965870307, + "grad_norm": 0.21083428581478283, + "learning_rate": 9.997167088338966e-05, + "loss": 0.8969, + "step": 15677 + }, + { + "epoch": 3.344283276450512, + "grad_norm": 0.18595605150549135, + "learning_rate": 9.996343044798943e-05, + "loss": 0.9281, + "step": 15678 + }, + { + "epoch": 3.344496587030717, + "grad_norm": 0.23800000383050493, + "learning_rate": 9.995518978670264e-05, + "loss": 0.976, + "step": 15679 + }, + { + "epoch": 3.3447098976109215, + "grad_norm": 0.2386976532816069, + "learning_rate": 9.994694889962255e-05, + "loss": 0.9072, + "step": 15680 + }, + { + "epoch": 3.344923208191126, + "grad_norm": 0.2820065673687434, + "learning_rate": 9.99387077868424e-05, + "loss": 0.9253, + "step": 15681 + }, + { + "epoch": 3.345136518771331, + "grad_norm": 0.3232568340511505, + "learning_rate": 9.993046644845542e-05, + "loss": 0.9227, + "step": 15682 + }, + { + "epoch": 3.345349829351536, + "grad_norm": 0.24078603597842624, + "learning_rate": 9.992222488455489e-05, + "loss": 0.8998, + "step": 15683 + }, + { + "epoch": 3.3455631399317407, + "grad_norm": 0.24287959567002884, + "learning_rate": 9.991398309523405e-05, + "loss": 0.9072, + "step": 15684 + }, + { + "epoch": 3.3457764505119454, + "grad_norm": 0.2524855882672864, + "learning_rate": 9.990574108058614e-05, + "loss": 0.9077, + "step": 15685 + }, + { + "epoch": 3.34598976109215, + "grad_norm": 0.22887218824067676, + "learning_rate": 9.989749884070447e-05, + "loss": 0.8856, + "step": 15686 + }, + { + "epoch": 3.3462030716723548, + "grad_norm": 0.27560397085787003, + "learning_rate": 9.988925637568223e-05, + "loss": 0.9119, + "step": 15687 + }, + { + "epoch": 3.34641638225256, + "grad_norm": 0.2460109199163382, + "learning_rate": 9.988101368561276e-05, + "loss": 0.8903, + "step": 15688 + }, + { + "epoch": 3.3466296928327646, + "grad_norm": 0.244570724850134, + "learning_rate": 9.987277077058925e-05, + "loss": 0.9266, + "step": 15689 + }, + { + "epoch": 3.3468430034129693, + "grad_norm": 0.24146826586028758, + "learning_rate": 9.9864527630705e-05, + "loss": 0.9282, + "step": 15690 + }, + { + "epoch": 3.347056313993174, + "grad_norm": 0.172630506652515, + "learning_rate": 9.98562842660533e-05, + "loss": 0.9095, + "step": 15691 + }, + { + "epoch": 3.3472696245733786, + "grad_norm": 0.2221859273109451, + "learning_rate": 9.984804067672742e-05, + "loss": 0.9244, + "step": 15692 + }, + { + "epoch": 3.3474829351535837, + "grad_norm": 0.2849291075014626, + "learning_rate": 9.98397968628206e-05, + "loss": 0.9334, + "step": 15693 + }, + { + "epoch": 3.3476962457337884, + "grad_norm": 0.26798214024004413, + "learning_rate": 9.983155282442617e-05, + "loss": 0.9074, + "step": 15694 + }, + { + "epoch": 3.347909556313993, + "grad_norm": 0.21770573764130519, + "learning_rate": 9.982330856163738e-05, + "loss": 0.9373, + "step": 15695 + }, + { + "epoch": 3.348122866894198, + "grad_norm": 0.2134700982296643, + "learning_rate": 9.981506407454751e-05, + "loss": 0.9598, + "step": 15696 + }, + { + "epoch": 3.348336177474403, + "grad_norm": 0.22248479841125737, + "learning_rate": 9.980681936324987e-05, + "loss": 0.9292, + "step": 15697 + }, + { + "epoch": 3.3485494880546076, + "grad_norm": 0.173201740084726, + "learning_rate": 9.979857442783774e-05, + "loss": 0.8935, + "step": 15698 + }, + { + "epoch": 3.3487627986348123, + "grad_norm": 0.2577790622609815, + "learning_rate": 9.979032926840438e-05, + "loss": 0.9134, + "step": 15699 + }, + { + "epoch": 3.348976109215017, + "grad_norm": 0.34658877875642663, + "learning_rate": 9.978208388504313e-05, + "loss": 0.9439, + "step": 15700 + }, + { + "epoch": 3.3491894197952217, + "grad_norm": 0.23490678456539943, + "learning_rate": 9.977383827784725e-05, + "loss": 0.9636, + "step": 15701 + }, + { + "epoch": 3.349402730375427, + "grad_norm": 0.24109534063577595, + "learning_rate": 9.976559244691008e-05, + "loss": 0.955, + "step": 15702 + }, + { + "epoch": 3.3496160409556315, + "grad_norm": 0.31458562331452145, + "learning_rate": 9.975734639232489e-05, + "loss": 0.9157, + "step": 15703 + }, + { + "epoch": 3.349829351535836, + "grad_norm": 0.2195686294345002, + "learning_rate": 9.974910011418502e-05, + "loss": 0.9462, + "step": 15704 + }, + { + "epoch": 3.350042662116041, + "grad_norm": 0.23942408704208612, + "learning_rate": 9.97408536125837e-05, + "loss": 0.9101, + "step": 15705 + }, + { + "epoch": 3.350255972696246, + "grad_norm": 0.24830672491914743, + "learning_rate": 9.973260688761433e-05, + "loss": 0.9043, + "step": 15706 + }, + { + "epoch": 3.3504692832764507, + "grad_norm": 0.18914453310697021, + "learning_rate": 9.972435993937015e-05, + "loss": 0.9077, + "step": 15707 + }, + { + "epoch": 3.3506825938566553, + "grad_norm": 0.21271043879208976, + "learning_rate": 9.971611276794452e-05, + "loss": 0.9339, + "step": 15708 + }, + { + "epoch": 3.35089590443686, + "grad_norm": 0.1951184843330005, + "learning_rate": 9.970786537343076e-05, + "loss": 0.9419, + "step": 15709 + }, + { + "epoch": 3.3511092150170647, + "grad_norm": 0.21405286683825825, + "learning_rate": 9.969961775592215e-05, + "loss": 0.9145, + "step": 15710 + }, + { + "epoch": 3.3513225255972694, + "grad_norm": 0.28041771798759996, + "learning_rate": 9.969136991551203e-05, + "loss": 0.924, + "step": 15711 + }, + { + "epoch": 3.3515358361774745, + "grad_norm": 0.24318314571522262, + "learning_rate": 9.968312185229375e-05, + "loss": 0.8952, + "step": 15712 + }, + { + "epoch": 3.351749146757679, + "grad_norm": 0.19843850649512193, + "learning_rate": 9.967487356636059e-05, + "loss": 0.9353, + "step": 15713 + }, + { + "epoch": 3.351962457337884, + "grad_norm": 0.237513919416305, + "learning_rate": 9.966662505780595e-05, + "loss": 0.8854, + "step": 15714 + }, + { + "epoch": 3.3521757679180886, + "grad_norm": 0.23925408166535284, + "learning_rate": 9.965837632672305e-05, + "loss": 0.8979, + "step": 15715 + }, + { + "epoch": 3.3523890784982937, + "grad_norm": 0.25319932143927665, + "learning_rate": 9.965012737320534e-05, + "loss": 0.9117, + "step": 15716 + }, + { + "epoch": 3.3526023890784984, + "grad_norm": 0.2364993514215574, + "learning_rate": 9.964187819734609e-05, + "loss": 0.8975, + "step": 15717 + }, + { + "epoch": 3.352815699658703, + "grad_norm": 0.160238506192588, + "learning_rate": 9.963362879923866e-05, + "loss": 0.9026, + "step": 15718 + }, + { + "epoch": 3.3530290102389078, + "grad_norm": 0.2372302563363426, + "learning_rate": 9.962537917897637e-05, + "loss": 0.9676, + "step": 15719 + }, + { + "epoch": 3.3532423208191124, + "grad_norm": 0.2350415462068065, + "learning_rate": 9.961712933665264e-05, + "loss": 0.9081, + "step": 15720 + }, + { + "epoch": 3.3534556313993176, + "grad_norm": 0.20756031524874105, + "learning_rate": 9.960887927236072e-05, + "loss": 0.9261, + "step": 15721 + }, + { + "epoch": 3.3536689419795223, + "grad_norm": 0.2077674553969975, + "learning_rate": 9.9600628986194e-05, + "loss": 0.9182, + "step": 15722 + }, + { + "epoch": 3.353882252559727, + "grad_norm": 0.2359953997180276, + "learning_rate": 9.959237847824583e-05, + "loss": 0.8994, + "step": 15723 + }, + { + "epoch": 3.3540955631399316, + "grad_norm": 0.22453238454250066, + "learning_rate": 9.95841277486096e-05, + "loss": 0.8923, + "step": 15724 + }, + { + "epoch": 3.3543088737201368, + "grad_norm": 0.1806496354270532, + "learning_rate": 9.957587679737859e-05, + "loss": 0.9039, + "step": 15725 + }, + { + "epoch": 3.3545221843003414, + "grad_norm": 0.20283389943563282, + "learning_rate": 9.956762562464626e-05, + "loss": 0.9321, + "step": 15726 + }, + { + "epoch": 3.354735494880546, + "grad_norm": 0.24411003886934518, + "learning_rate": 9.955937423050586e-05, + "loss": 0.9273, + "step": 15727 + }, + { + "epoch": 3.354948805460751, + "grad_norm": 0.22366213040797586, + "learning_rate": 9.955112261505085e-05, + "loss": 0.9197, + "step": 15728 + }, + { + "epoch": 3.3551621160409555, + "grad_norm": 0.2209832969101347, + "learning_rate": 9.954287077837454e-05, + "loss": 0.9195, + "step": 15729 + }, + { + "epoch": 3.35537542662116, + "grad_norm": 0.2472027664932478, + "learning_rate": 9.953461872057033e-05, + "loss": 0.8995, + "step": 15730 + }, + { + "epoch": 3.3555887372013653, + "grad_norm": 0.23341640788851123, + "learning_rate": 9.952636644173158e-05, + "loss": 0.8849, + "step": 15731 + }, + { + "epoch": 3.35580204778157, + "grad_norm": 0.19476321863211835, + "learning_rate": 9.951811394195168e-05, + "loss": 0.9017, + "step": 15732 + }, + { + "epoch": 3.3560153583617747, + "grad_norm": 0.25031727699483436, + "learning_rate": 9.950986122132394e-05, + "loss": 0.9249, + "step": 15733 + }, + { + "epoch": 3.3562286689419794, + "grad_norm": 0.24377934884921498, + "learning_rate": 9.950160827994183e-05, + "loss": 0.9156, + "step": 15734 + }, + { + "epoch": 3.3564419795221845, + "grad_norm": 0.1729521993686368, + "learning_rate": 9.94933551178987e-05, + "loss": 0.9341, + "step": 15735 + }, + { + "epoch": 3.356655290102389, + "grad_norm": 0.21627048969305956, + "learning_rate": 9.94851017352879e-05, + "loss": 0.9142, + "step": 15736 + }, + { + "epoch": 3.356868600682594, + "grad_norm": 0.20723875256171, + "learning_rate": 9.947684813220289e-05, + "loss": 0.931, + "step": 15737 + }, + { + "epoch": 3.3570819112627985, + "grad_norm": 0.18028578959346653, + "learning_rate": 9.946859430873698e-05, + "loss": 0.9499, + "step": 15738 + }, + { + "epoch": 3.357295221843003, + "grad_norm": 0.21142792153843654, + "learning_rate": 9.94603402649836e-05, + "loss": 0.8968, + "step": 15739 + }, + { + "epoch": 3.3575085324232083, + "grad_norm": 0.21822783204530266, + "learning_rate": 9.945208600103616e-05, + "loss": 0.8931, + "step": 15740 + }, + { + "epoch": 3.357721843003413, + "grad_norm": 0.1683480011667381, + "learning_rate": 9.944383151698803e-05, + "loss": 0.9336, + "step": 15741 + }, + { + "epoch": 3.3579351535836177, + "grad_norm": 0.17189975695620005, + "learning_rate": 9.943557681293261e-05, + "loss": 0.9313, + "step": 15742 + }, + { + "epoch": 3.3581484641638224, + "grad_norm": 0.19789705108493452, + "learning_rate": 9.942732188896334e-05, + "loss": 0.9565, + "step": 15743 + }, + { + "epoch": 3.3583617747440275, + "grad_norm": 0.20185823118281698, + "learning_rate": 9.941906674517357e-05, + "loss": 0.9448, + "step": 15744 + }, + { + "epoch": 3.358575085324232, + "grad_norm": 0.21907335812009107, + "learning_rate": 9.941081138165674e-05, + "loss": 0.9316, + "step": 15745 + }, + { + "epoch": 3.358788395904437, + "grad_norm": 0.17350783308257156, + "learning_rate": 9.940255579850625e-05, + "loss": 0.9139, + "step": 15746 + }, + { + "epoch": 3.3590017064846416, + "grad_norm": 0.19896904780345198, + "learning_rate": 9.939429999581552e-05, + "loss": 0.9191, + "step": 15747 + }, + { + "epoch": 3.3592150170648463, + "grad_norm": 0.2166719294688831, + "learning_rate": 9.938604397367796e-05, + "loss": 0.8941, + "step": 15748 + }, + { + "epoch": 3.359428327645051, + "grad_norm": 0.2708220258098135, + "learning_rate": 9.937778773218698e-05, + "loss": 0.9124, + "step": 15749 + }, + { + "epoch": 3.359641638225256, + "grad_norm": 0.29178697113354835, + "learning_rate": 9.936953127143603e-05, + "loss": 0.912, + "step": 15750 + }, + { + "epoch": 3.3598549488054608, + "grad_norm": 0.22581695536364946, + "learning_rate": 9.936127459151847e-05, + "loss": 0.9287, + "step": 15751 + }, + { + "epoch": 3.3600682593856654, + "grad_norm": 0.19070218865881447, + "learning_rate": 9.935301769252779e-05, + "loss": 0.9011, + "step": 15752 + }, + { + "epoch": 3.36028156996587, + "grad_norm": 0.1890442628928393, + "learning_rate": 9.934476057455737e-05, + "loss": 0.9071, + "step": 15753 + }, + { + "epoch": 3.3604948805460753, + "grad_norm": 0.21293669507611826, + "learning_rate": 9.933650323770066e-05, + "loss": 0.928, + "step": 15754 + }, + { + "epoch": 3.36070819112628, + "grad_norm": 0.23676615668965248, + "learning_rate": 9.93282456820511e-05, + "loss": 0.9024, + "step": 15755 + }, + { + "epoch": 3.3609215017064846, + "grad_norm": 0.2387668093882373, + "learning_rate": 9.931998790770212e-05, + "loss": 0.918, + "step": 15756 + }, + { + "epoch": 3.3611348122866893, + "grad_norm": 0.36137344503626145, + "learning_rate": 9.931172991474715e-05, + "loss": 0.9162, + "step": 15757 + }, + { + "epoch": 3.361348122866894, + "grad_norm": 0.38051968190279567, + "learning_rate": 9.930347170327962e-05, + "loss": 0.9317, + "step": 15758 + }, + { + "epoch": 3.361561433447099, + "grad_norm": 0.3133773073052418, + "learning_rate": 9.9295213273393e-05, + "loss": 0.908, + "step": 15759 + }, + { + "epoch": 3.361774744027304, + "grad_norm": 0.27773315239029267, + "learning_rate": 9.928695462518069e-05, + "loss": 0.9217, + "step": 15760 + }, + { + "epoch": 3.3619880546075085, + "grad_norm": 0.2365952874689617, + "learning_rate": 9.92786957587362e-05, + "loss": 0.954, + "step": 15761 + }, + { + "epoch": 3.362201365187713, + "grad_norm": 0.2177291518512259, + "learning_rate": 9.927043667415291e-05, + "loss": 0.9166, + "step": 15762 + }, + { + "epoch": 3.3624146757679183, + "grad_norm": 0.29947649940490356, + "learning_rate": 9.926217737152431e-05, + "loss": 0.9268, + "step": 15763 + }, + { + "epoch": 3.362627986348123, + "grad_norm": 0.3658035229847826, + "learning_rate": 9.925391785094388e-05, + "loss": 0.9517, + "step": 15764 + }, + { + "epoch": 3.3628412969283277, + "grad_norm": 0.3422456846463605, + "learning_rate": 9.924565811250502e-05, + "loss": 0.9138, + "step": 15765 + }, + { + "epoch": 3.3630546075085324, + "grad_norm": 0.259972581566884, + "learning_rate": 9.92373981563012e-05, + "loss": 0.9101, + "step": 15766 + }, + { + "epoch": 3.363267918088737, + "grad_norm": 0.19870707200844048, + "learning_rate": 9.922913798242592e-05, + "loss": 0.9546, + "step": 15767 + }, + { + "epoch": 3.363481228668942, + "grad_norm": 0.17523939211139158, + "learning_rate": 9.922087759097263e-05, + "loss": 0.9185, + "step": 15768 + }, + { + "epoch": 3.363694539249147, + "grad_norm": 0.22756993117183882, + "learning_rate": 9.921261698203476e-05, + "loss": 0.9182, + "step": 15769 + }, + { + "epoch": 3.3639078498293515, + "grad_norm": 0.2628370820410863, + "learning_rate": 9.920435615570582e-05, + "loss": 0.8828, + "step": 15770 + }, + { + "epoch": 3.364121160409556, + "grad_norm": 0.30268459683262916, + "learning_rate": 9.919609511207928e-05, + "loss": 0.9132, + "step": 15771 + }, + { + "epoch": 3.364334470989761, + "grad_norm": 0.32804866251996784, + "learning_rate": 9.918783385124857e-05, + "loss": 0.9091, + "step": 15772 + }, + { + "epoch": 3.364547781569966, + "grad_norm": 0.28982543060524024, + "learning_rate": 9.91795723733072e-05, + "loss": 0.8855, + "step": 15773 + }, + { + "epoch": 3.3647610921501707, + "grad_norm": 0.18723093451609082, + "learning_rate": 9.917131067834866e-05, + "loss": 0.9113, + "step": 15774 + }, + { + "epoch": 3.3649744027303754, + "grad_norm": 0.18122288161612013, + "learning_rate": 9.916304876646642e-05, + "loss": 0.9129, + "step": 15775 + }, + { + "epoch": 3.36518771331058, + "grad_norm": 0.22431230122763662, + "learning_rate": 9.915478663775397e-05, + "loss": 0.8944, + "step": 15776 + }, + { + "epoch": 3.3654010238907848, + "grad_norm": 0.19323563064741933, + "learning_rate": 9.914652429230478e-05, + "loss": 0.9184, + "step": 15777 + }, + { + "epoch": 3.36561433447099, + "grad_norm": 0.20095063424388424, + "learning_rate": 9.913826173021232e-05, + "loss": 0.9187, + "step": 15778 + }, + { + "epoch": 3.3658276450511946, + "grad_norm": 0.2040382471349102, + "learning_rate": 9.912999895157014e-05, + "loss": 0.9112, + "step": 15779 + }, + { + "epoch": 3.3660409556313993, + "grad_norm": 0.1965092890530384, + "learning_rate": 9.912173595647168e-05, + "loss": 0.9061, + "step": 15780 + }, + { + "epoch": 3.366254266211604, + "grad_norm": 0.2617321097711209, + "learning_rate": 9.911347274501048e-05, + "loss": 0.9126, + "step": 15781 + }, + { + "epoch": 3.366467576791809, + "grad_norm": 0.24792225825166778, + "learning_rate": 9.910520931727999e-05, + "loss": 0.9131, + "step": 15782 + }, + { + "epoch": 3.3666808873720138, + "grad_norm": 0.200268602846276, + "learning_rate": 9.909694567337374e-05, + "loss": 0.8906, + "step": 15783 + }, + { + "epoch": 3.3668941979522184, + "grad_norm": 0.21176781977443482, + "learning_rate": 9.908868181338522e-05, + "loss": 0.9113, + "step": 15784 + }, + { + "epoch": 3.367107508532423, + "grad_norm": 0.21978766242477696, + "learning_rate": 9.908041773740796e-05, + "loss": 0.8771, + "step": 15785 + }, + { + "epoch": 3.367320819112628, + "grad_norm": 0.20707517243100973, + "learning_rate": 9.907215344553544e-05, + "loss": 0.9193, + "step": 15786 + }, + { + "epoch": 3.367534129692833, + "grad_norm": 0.21585669252582237, + "learning_rate": 9.90638889378612e-05, + "loss": 0.9192, + "step": 15787 + }, + { + "epoch": 3.3677474402730376, + "grad_norm": 0.19061337303665737, + "learning_rate": 9.905562421447871e-05, + "loss": 0.8964, + "step": 15788 + }, + { + "epoch": 3.3679607508532423, + "grad_norm": 0.2016410932185528, + "learning_rate": 9.904735927548153e-05, + "loss": 0.9081, + "step": 15789 + }, + { + "epoch": 3.368174061433447, + "grad_norm": 0.193493135672179, + "learning_rate": 9.903909412096318e-05, + "loss": 0.9238, + "step": 15790 + }, + { + "epoch": 3.368387372013652, + "grad_norm": 0.20294887663180747, + "learning_rate": 9.903082875101712e-05, + "loss": 0.9169, + "step": 15791 + }, + { + "epoch": 3.368600682593857, + "grad_norm": 0.22931808488046224, + "learning_rate": 9.902256316573693e-05, + "loss": 0.933, + "step": 15792 + }, + { + "epoch": 3.3688139931740615, + "grad_norm": 0.24940289843620989, + "learning_rate": 9.901429736521612e-05, + "loss": 0.8892, + "step": 15793 + }, + { + "epoch": 3.369027303754266, + "grad_norm": 0.2653158840522195, + "learning_rate": 9.90060313495482e-05, + "loss": 0.9268, + "step": 15794 + }, + { + "epoch": 3.369240614334471, + "grad_norm": 0.3091776197141675, + "learning_rate": 9.899776511882674e-05, + "loss": 0.9027, + "step": 15795 + }, + { + "epoch": 3.3694539249146755, + "grad_norm": 0.26170177564880215, + "learning_rate": 9.898949867314523e-05, + "loss": 0.891, + "step": 15796 + }, + { + "epoch": 3.3696672354948807, + "grad_norm": 0.2082132087015784, + "learning_rate": 9.898123201259722e-05, + "loss": 0.9514, + "step": 15797 + }, + { + "epoch": 3.3698805460750854, + "grad_norm": 0.3516247514654912, + "learning_rate": 9.897296513727628e-05, + "loss": 0.9291, + "step": 15798 + }, + { + "epoch": 3.37009385665529, + "grad_norm": 0.23577280389922312, + "learning_rate": 9.89646980472759e-05, + "loss": 0.9428, + "step": 15799 + }, + { + "epoch": 3.3703071672354947, + "grad_norm": 0.24025889983996626, + "learning_rate": 9.895643074268964e-05, + "loss": 0.8623, + "step": 15800 + }, + { + "epoch": 3.3705204778157, + "grad_norm": 0.30240955260807895, + "learning_rate": 9.894816322361105e-05, + "loss": 0.9461, + "step": 15801 + }, + { + "epoch": 3.3707337883959045, + "grad_norm": 0.3908489689277122, + "learning_rate": 9.893989549013366e-05, + "loss": 0.9407, + "step": 15802 + }, + { + "epoch": 3.3709470989761092, + "grad_norm": 0.39583274729314855, + "learning_rate": 9.893162754235106e-05, + "loss": 0.9082, + "step": 15803 + }, + { + "epoch": 3.371160409556314, + "grad_norm": 0.3992044089593501, + "learning_rate": 9.892335938035676e-05, + "loss": 0.9178, + "step": 15804 + }, + { + "epoch": 3.3713737201365186, + "grad_norm": 0.45129418917644143, + "learning_rate": 9.891509100424433e-05, + "loss": 0.938, + "step": 15805 + }, + { + "epoch": 3.3715870307167237, + "grad_norm": 0.40202150167307216, + "learning_rate": 9.890682241410733e-05, + "loss": 0.9195, + "step": 15806 + }, + { + "epoch": 3.3718003412969284, + "grad_norm": 0.3656976963309999, + "learning_rate": 9.889855361003932e-05, + "loss": 0.9468, + "step": 15807 + }, + { + "epoch": 3.372013651877133, + "grad_norm": 0.309111726536017, + "learning_rate": 9.889028459213386e-05, + "loss": 0.9049, + "step": 15808 + }, + { + "epoch": 3.3722269624573378, + "grad_norm": 0.2258290292424988, + "learning_rate": 9.888201536048451e-05, + "loss": 0.9298, + "step": 15809 + }, + { + "epoch": 3.372440273037543, + "grad_norm": 0.32053139864628466, + "learning_rate": 9.887374591518484e-05, + "loss": 0.934, + "step": 15810 + }, + { + "epoch": 3.3726535836177476, + "grad_norm": 0.30234043377148684, + "learning_rate": 9.886547625632841e-05, + "loss": 0.8964, + "step": 15811 + }, + { + "epoch": 3.3728668941979523, + "grad_norm": 0.2571193344953173, + "learning_rate": 9.88572063840088e-05, + "loss": 0.9323, + "step": 15812 + }, + { + "epoch": 3.373080204778157, + "grad_norm": 0.34340802584463226, + "learning_rate": 9.884893629831958e-05, + "loss": 0.9208, + "step": 15813 + }, + { + "epoch": 3.3732935153583616, + "grad_norm": 0.3810147049329409, + "learning_rate": 9.884066599935435e-05, + "loss": 0.9316, + "step": 15814 + }, + { + "epoch": 3.3735068259385663, + "grad_norm": 0.2915812652147138, + "learning_rate": 9.883239548720666e-05, + "loss": 0.9338, + "step": 15815 + }, + { + "epoch": 3.3737201365187715, + "grad_norm": 0.2641945964046202, + "learning_rate": 9.882412476197007e-05, + "loss": 0.9289, + "step": 15816 + }, + { + "epoch": 3.373933447098976, + "grad_norm": 0.27247788777308596, + "learning_rate": 9.881585382373821e-05, + "loss": 0.9397, + "step": 15817 + }, + { + "epoch": 3.374146757679181, + "grad_norm": 0.21627994998774153, + "learning_rate": 9.880758267260466e-05, + "loss": 0.9091, + "step": 15818 + }, + { + "epoch": 3.3743600682593855, + "grad_norm": 0.25102263904418043, + "learning_rate": 9.879931130866297e-05, + "loss": 0.9201, + "step": 15819 + }, + { + "epoch": 3.3745733788395906, + "grad_norm": 0.2655399151434814, + "learning_rate": 9.879103973200678e-05, + "loss": 0.9143, + "step": 15820 + }, + { + "epoch": 3.3747866894197953, + "grad_norm": 0.21446117127067385, + "learning_rate": 9.878276794272966e-05, + "loss": 0.9229, + "step": 15821 + }, + { + "epoch": 3.375, + "grad_norm": 0.18990621105534852, + "learning_rate": 9.877449594092518e-05, + "loss": 0.907, + "step": 15822 + }, + { + "epoch": 3.3752133105802047, + "grad_norm": 0.2405242028354802, + "learning_rate": 9.8766223726687e-05, + "loss": 0.9353, + "step": 15823 + }, + { + "epoch": 3.3754266211604094, + "grad_norm": 0.3050418335755779, + "learning_rate": 9.875795130010864e-05, + "loss": 0.9104, + "step": 15824 + }, + { + "epoch": 3.3756399317406145, + "grad_norm": 0.3001227220261654, + "learning_rate": 9.874967866128377e-05, + "loss": 0.9379, + "step": 15825 + }, + { + "epoch": 3.375853242320819, + "grad_norm": 0.2501414970391061, + "learning_rate": 9.874140581030598e-05, + "loss": 0.9212, + "step": 15826 + }, + { + "epoch": 3.376066552901024, + "grad_norm": 0.2414281497171241, + "learning_rate": 9.873313274726886e-05, + "loss": 0.9084, + "step": 15827 + }, + { + "epoch": 3.3762798634812285, + "grad_norm": 0.22364350233285418, + "learning_rate": 9.8724859472266e-05, + "loss": 0.8876, + "step": 15828 + }, + { + "epoch": 3.3764931740614337, + "grad_norm": 0.300083939829494, + "learning_rate": 9.871658598539106e-05, + "loss": 0.9294, + "step": 15829 + }, + { + "epoch": 3.3767064846416384, + "grad_norm": 0.3675362501569966, + "learning_rate": 9.870831228673764e-05, + "loss": 0.9299, + "step": 15830 + }, + { + "epoch": 3.376919795221843, + "grad_norm": 0.366591123757611, + "learning_rate": 9.870003837639936e-05, + "loss": 0.9359, + "step": 15831 + }, + { + "epoch": 3.3771331058020477, + "grad_norm": 0.2657698647503007, + "learning_rate": 9.869176425446983e-05, + "loss": 0.8944, + "step": 15832 + }, + { + "epoch": 3.3773464163822524, + "grad_norm": 0.21640044550121273, + "learning_rate": 9.868348992104265e-05, + "loss": 0.9132, + "step": 15833 + }, + { + "epoch": 3.377559726962457, + "grad_norm": 0.2307199336353246, + "learning_rate": 9.867521537621148e-05, + "loss": 0.925, + "step": 15834 + }, + { + "epoch": 3.3777730375426622, + "grad_norm": 0.23875576519813232, + "learning_rate": 9.866694062006995e-05, + "loss": 0.9345, + "step": 15835 + }, + { + "epoch": 3.377986348122867, + "grad_norm": 0.23983783801422193, + "learning_rate": 9.865866565271165e-05, + "loss": 0.9108, + "step": 15836 + }, + { + "epoch": 3.3781996587030716, + "grad_norm": 0.2584822372835871, + "learning_rate": 9.865039047423026e-05, + "loss": 0.9231, + "step": 15837 + }, + { + "epoch": 3.3784129692832763, + "grad_norm": 0.24017897458538814, + "learning_rate": 9.864211508471935e-05, + "loss": 0.9174, + "step": 15838 + }, + { + "epoch": 3.3786262798634814, + "grad_norm": 0.2385475472216365, + "learning_rate": 9.863383948427261e-05, + "loss": 0.8987, + "step": 15839 + }, + { + "epoch": 3.378839590443686, + "grad_norm": 0.2181820174465385, + "learning_rate": 9.862556367298367e-05, + "loss": 0.9131, + "step": 15840 + }, + { + "epoch": 3.3790529010238908, + "grad_norm": 0.22299745948038913, + "learning_rate": 9.861728765094615e-05, + "loss": 0.8986, + "step": 15841 + }, + { + "epoch": 3.3792662116040955, + "grad_norm": 0.26883724692759575, + "learning_rate": 9.860901141825372e-05, + "loss": 0.9374, + "step": 15842 + }, + { + "epoch": 3.3794795221843, + "grad_norm": 0.26831490997072976, + "learning_rate": 9.860073497500002e-05, + "loss": 0.9291, + "step": 15843 + }, + { + "epoch": 3.3796928327645053, + "grad_norm": 0.27175482266484347, + "learning_rate": 9.85924583212787e-05, + "loss": 0.9179, + "step": 15844 + }, + { + "epoch": 3.37990614334471, + "grad_norm": 0.2533788152513007, + "learning_rate": 9.858418145718336e-05, + "loss": 0.9165, + "step": 15845 + }, + { + "epoch": 3.3801194539249146, + "grad_norm": 0.19074710273465964, + "learning_rate": 9.857590438280772e-05, + "loss": 0.8781, + "step": 15846 + }, + { + "epoch": 3.3803327645051193, + "grad_norm": 0.19163102531310508, + "learning_rate": 9.856762709824541e-05, + "loss": 0.9132, + "step": 15847 + }, + { + "epoch": 3.3805460750853245, + "grad_norm": 0.19246562437435047, + "learning_rate": 9.855934960359009e-05, + "loss": 0.9285, + "step": 15848 + }, + { + "epoch": 3.380759385665529, + "grad_norm": 0.1962552587627084, + "learning_rate": 9.855107189893543e-05, + "loss": 0.9162, + "step": 15849 + }, + { + "epoch": 3.380972696245734, + "grad_norm": 0.1939973315758593, + "learning_rate": 9.854279398437504e-05, + "loss": 0.926, + "step": 15850 + }, + { + "epoch": 3.3811860068259385, + "grad_norm": 0.1897008773528078, + "learning_rate": 9.853451586000264e-05, + "loss": 0.8936, + "step": 15851 + }, + { + "epoch": 3.381399317406143, + "grad_norm": 0.19907128136512522, + "learning_rate": 9.852623752591187e-05, + "loss": 0.9431, + "step": 15852 + }, + { + "epoch": 3.381612627986348, + "grad_norm": 0.26515059246611394, + "learning_rate": 9.851795898219645e-05, + "loss": 0.9376, + "step": 15853 + }, + { + "epoch": 3.381825938566553, + "grad_norm": 0.3074512793643621, + "learning_rate": 9.850968022894999e-05, + "loss": 0.9159, + "step": 15854 + }, + { + "epoch": 3.3820392491467577, + "grad_norm": 0.27693025183948594, + "learning_rate": 9.850140126626618e-05, + "loss": 0.9367, + "step": 15855 + }, + { + "epoch": 3.3822525597269624, + "grad_norm": 0.230873792363584, + "learning_rate": 9.84931220942387e-05, + "loss": 0.9127, + "step": 15856 + }, + { + "epoch": 3.382465870307167, + "grad_norm": 0.22507607992880999, + "learning_rate": 9.848484271296125e-05, + "loss": 0.9159, + "step": 15857 + }, + { + "epoch": 3.382679180887372, + "grad_norm": 0.23778814901468853, + "learning_rate": 9.847656312252747e-05, + "loss": 0.9174, + "step": 15858 + }, + { + "epoch": 3.382892491467577, + "grad_norm": 0.30713407953421107, + "learning_rate": 9.846828332303111e-05, + "loss": 0.9208, + "step": 15859 + }, + { + "epoch": 3.3831058020477816, + "grad_norm": 0.2871285964791612, + "learning_rate": 9.846000331456579e-05, + "loss": 0.9336, + "step": 15860 + }, + { + "epoch": 3.3833191126279862, + "grad_norm": 0.20816820973922476, + "learning_rate": 9.845172309722521e-05, + "loss": 0.915, + "step": 15861 + }, + { + "epoch": 3.383532423208191, + "grad_norm": 0.19256519412194673, + "learning_rate": 9.844344267110308e-05, + "loss": 0.9288, + "step": 15862 + }, + { + "epoch": 3.383745733788396, + "grad_norm": 0.2864684575326765, + "learning_rate": 9.843516203629309e-05, + "loss": 0.9402, + "step": 15863 + }, + { + "epoch": 3.3839590443686007, + "grad_norm": 0.3531834397076718, + "learning_rate": 9.842688119288892e-05, + "loss": 0.9166, + "step": 15864 + }, + { + "epoch": 3.3841723549488054, + "grad_norm": 0.3923493780171064, + "learning_rate": 9.84186001409843e-05, + "loss": 0.9215, + "step": 15865 + }, + { + "epoch": 3.38438566552901, + "grad_norm": 0.32165704910926146, + "learning_rate": 9.841031888067289e-05, + "loss": 0.9297, + "step": 15866 + }, + { + "epoch": 3.3845989761092152, + "grad_norm": 0.24252180398904072, + "learning_rate": 9.840203741204843e-05, + "loss": 0.9004, + "step": 15867 + }, + { + "epoch": 3.38481228668942, + "grad_norm": 0.2710486522856011, + "learning_rate": 9.839375573520459e-05, + "loss": 0.9452, + "step": 15868 + }, + { + "epoch": 3.3850255972696246, + "grad_norm": 0.27620117311430464, + "learning_rate": 9.83854738502351e-05, + "loss": 0.9227, + "step": 15869 + }, + { + "epoch": 3.3852389078498293, + "grad_norm": 0.32723558370137956, + "learning_rate": 9.837719175723365e-05, + "loss": 0.9258, + "step": 15870 + }, + { + "epoch": 3.385452218430034, + "grad_norm": 0.30105428055346334, + "learning_rate": 9.836890945629402e-05, + "loss": 0.9092, + "step": 15871 + }, + { + "epoch": 3.385665529010239, + "grad_norm": 0.20792084429385607, + "learning_rate": 9.836062694750982e-05, + "loss": 0.951, + "step": 15872 + }, + { + "epoch": 3.385878839590444, + "grad_norm": 0.2347210214408477, + "learning_rate": 9.835234423097483e-05, + "loss": 0.9103, + "step": 15873 + }, + { + "epoch": 3.3860921501706485, + "grad_norm": 0.22053690527027225, + "learning_rate": 9.834406130678274e-05, + "loss": 0.9278, + "step": 15874 + }, + { + "epoch": 3.386305460750853, + "grad_norm": 0.2048469322068287, + "learning_rate": 9.833577817502732e-05, + "loss": 0.8888, + "step": 15875 + }, + { + "epoch": 3.386518771331058, + "grad_norm": 0.23862314303226007, + "learning_rate": 9.832749483580225e-05, + "loss": 0.9489, + "step": 15876 + }, + { + "epoch": 3.386732081911263, + "grad_norm": 0.23902042479455668, + "learning_rate": 9.831921128920127e-05, + "loss": 0.927, + "step": 15877 + }, + { + "epoch": 3.3869453924914676, + "grad_norm": 0.22369970926361418, + "learning_rate": 9.831092753531808e-05, + "loss": 0.9428, + "step": 15878 + }, + { + "epoch": 3.3871587030716723, + "grad_norm": 0.1935874284264603, + "learning_rate": 9.830264357424646e-05, + "loss": 0.8805, + "step": 15879 + }, + { + "epoch": 3.387372013651877, + "grad_norm": 0.18296491502620496, + "learning_rate": 9.82943594060801e-05, + "loss": 0.9015, + "step": 15880 + }, + { + "epoch": 3.3875853242320817, + "grad_norm": 0.24021249159146799, + "learning_rate": 9.828607503091277e-05, + "loss": 0.9104, + "step": 15881 + }, + { + "epoch": 3.387798634812287, + "grad_norm": 0.32111520932575577, + "learning_rate": 9.827779044883821e-05, + "loss": 0.8999, + "step": 15882 + }, + { + "epoch": 3.3880119453924915, + "grad_norm": 0.251186619127743, + "learning_rate": 9.826950565995014e-05, + "loss": 0.9102, + "step": 15883 + }, + { + "epoch": 3.388225255972696, + "grad_norm": 0.1991492741060273, + "learning_rate": 9.82612206643423e-05, + "loss": 0.9312, + "step": 15884 + }, + { + "epoch": 3.388438566552901, + "grad_norm": 0.2638445306350368, + "learning_rate": 9.825293546210843e-05, + "loss": 0.9326, + "step": 15885 + }, + { + "epoch": 3.388651877133106, + "grad_norm": 0.2325621107873009, + "learning_rate": 9.824465005334228e-05, + "loss": 0.9071, + "step": 15886 + }, + { + "epoch": 3.3888651877133107, + "grad_norm": 0.2098297632952719, + "learning_rate": 9.823636443813763e-05, + "loss": 0.9462, + "step": 15887 + }, + { + "epoch": 3.3890784982935154, + "grad_norm": 0.2368589846047495, + "learning_rate": 9.822807861658821e-05, + "loss": 0.8896, + "step": 15888 + }, + { + "epoch": 3.38929180887372, + "grad_norm": 0.20182619091565784, + "learning_rate": 9.821979258878778e-05, + "loss": 0.8933, + "step": 15889 + }, + { + "epoch": 3.3895051194539247, + "grad_norm": 0.21200742336432396, + "learning_rate": 9.821150635483005e-05, + "loss": 0.918, + "step": 15890 + }, + { + "epoch": 3.38971843003413, + "grad_norm": 0.21183478553268184, + "learning_rate": 9.820321991480885e-05, + "loss": 0.9112, + "step": 15891 + }, + { + "epoch": 3.3899317406143346, + "grad_norm": 0.22096870476169267, + "learning_rate": 9.81949332688179e-05, + "loss": 0.8879, + "step": 15892 + }, + { + "epoch": 3.3901450511945392, + "grad_norm": 0.22975275838005568, + "learning_rate": 9.8186646416951e-05, + "loss": 0.9405, + "step": 15893 + }, + { + "epoch": 3.390358361774744, + "grad_norm": 0.22776646064865783, + "learning_rate": 9.817835935930186e-05, + "loss": 0.9138, + "step": 15894 + }, + { + "epoch": 3.390571672354949, + "grad_norm": 0.22859191120596836, + "learning_rate": 9.817007209596431e-05, + "loss": 0.9112, + "step": 15895 + }, + { + "epoch": 3.3907849829351537, + "grad_norm": 0.23132906798273792, + "learning_rate": 9.816178462703203e-05, + "loss": 0.9212, + "step": 15896 + }, + { + "epoch": 3.3909982935153584, + "grad_norm": 0.2335176430755124, + "learning_rate": 9.815349695259891e-05, + "loss": 0.9224, + "step": 15897 + }, + { + "epoch": 3.391211604095563, + "grad_norm": 0.1993118596960267, + "learning_rate": 9.814520907275862e-05, + "loss": 0.9119, + "step": 15898 + }, + { + "epoch": 3.391424914675768, + "grad_norm": 0.1848263949756331, + "learning_rate": 9.813692098760504e-05, + "loss": 0.9058, + "step": 15899 + }, + { + "epoch": 3.3916382252559725, + "grad_norm": 0.20000253038084712, + "learning_rate": 9.812863269723185e-05, + "loss": 0.9366, + "step": 15900 + }, + { + "epoch": 3.3918515358361776, + "grad_norm": 0.282613783928331, + "learning_rate": 9.81203442017329e-05, + "loss": 0.9289, + "step": 15901 + }, + { + "epoch": 3.3920648464163823, + "grad_norm": 0.30214924760209916, + "learning_rate": 9.811205550120191e-05, + "loss": 0.9016, + "step": 15902 + }, + { + "epoch": 3.392278156996587, + "grad_norm": 0.31034390283390456, + "learning_rate": 9.810376659573275e-05, + "loss": 0.9342, + "step": 15903 + }, + { + "epoch": 3.3924914675767917, + "grad_norm": 0.3259993325514052, + "learning_rate": 9.809547748541915e-05, + "loss": 0.9177, + "step": 15904 + }, + { + "epoch": 3.392704778156997, + "grad_norm": 0.32728637964407903, + "learning_rate": 9.808718817035493e-05, + "loss": 0.9388, + "step": 15905 + }, + { + "epoch": 3.3929180887372015, + "grad_norm": 0.29695933722998974, + "learning_rate": 9.807889865063387e-05, + "loss": 0.9158, + "step": 15906 + }, + { + "epoch": 3.393131399317406, + "grad_norm": 0.30686513907360685, + "learning_rate": 9.807060892634976e-05, + "loss": 0.9391, + "step": 15907 + }, + { + "epoch": 3.393344709897611, + "grad_norm": 0.28510546248689767, + "learning_rate": 9.806231899759639e-05, + "loss": 0.9547, + "step": 15908 + }, + { + "epoch": 3.3935580204778155, + "grad_norm": 0.25289825454132703, + "learning_rate": 9.80540288644676e-05, + "loss": 0.9118, + "step": 15909 + }, + { + "epoch": 3.3937713310580206, + "grad_norm": 0.26339580437508103, + "learning_rate": 9.804573852705716e-05, + "loss": 0.9067, + "step": 15910 + }, + { + "epoch": 3.3939846416382253, + "grad_norm": 0.33478034316612765, + "learning_rate": 9.803744798545889e-05, + "loss": 0.9042, + "step": 15911 + }, + { + "epoch": 3.39419795221843, + "grad_norm": 0.3561283576346762, + "learning_rate": 9.80291572397666e-05, + "loss": 0.9069, + "step": 15912 + }, + { + "epoch": 3.3944112627986347, + "grad_norm": 0.3276492131494054, + "learning_rate": 9.802086629007409e-05, + "loss": 0.902, + "step": 15913 + }, + { + "epoch": 3.39462457337884, + "grad_norm": 0.2780444999129726, + "learning_rate": 9.801257513647514e-05, + "loss": 0.9358, + "step": 15914 + }, + { + "epoch": 3.3948378839590445, + "grad_norm": 0.20101957212219515, + "learning_rate": 9.800428377906364e-05, + "loss": 0.9274, + "step": 15915 + }, + { + "epoch": 3.395051194539249, + "grad_norm": 0.20852781417207933, + "learning_rate": 9.799599221793336e-05, + "loss": 0.8825, + "step": 15916 + }, + { + "epoch": 3.395264505119454, + "grad_norm": 0.20243659682575763, + "learning_rate": 9.798770045317812e-05, + "loss": 0.914, + "step": 15917 + }, + { + "epoch": 3.3954778156996586, + "grad_norm": 0.23600470827696896, + "learning_rate": 9.797940848489174e-05, + "loss": 0.9303, + "step": 15918 + }, + { + "epoch": 3.3956911262798632, + "grad_norm": 0.255982348473085, + "learning_rate": 9.797111631316806e-05, + "loss": 0.9013, + "step": 15919 + }, + { + "epoch": 3.3959044368600684, + "grad_norm": 0.2689804550730078, + "learning_rate": 9.796282393810087e-05, + "loss": 0.9215, + "step": 15920 + }, + { + "epoch": 3.396117747440273, + "grad_norm": 0.22809627995276896, + "learning_rate": 9.795453135978409e-05, + "loss": 0.9117, + "step": 15921 + }, + { + "epoch": 3.3963310580204777, + "grad_norm": 0.20523387553607816, + "learning_rate": 9.794623857831142e-05, + "loss": 0.9384, + "step": 15922 + }, + { + "epoch": 3.3965443686006824, + "grad_norm": 0.2528156270838161, + "learning_rate": 9.793794559377679e-05, + "loss": 0.9137, + "step": 15923 + }, + { + "epoch": 3.3967576791808876, + "grad_norm": 0.2655807445914216, + "learning_rate": 9.792965240627399e-05, + "loss": 0.9377, + "step": 15924 + }, + { + "epoch": 3.3969709897610922, + "grad_norm": 0.27392503670952895, + "learning_rate": 9.792135901589687e-05, + "loss": 0.9124, + "step": 15925 + }, + { + "epoch": 3.397184300341297, + "grad_norm": 0.23289497483457303, + "learning_rate": 9.791306542273927e-05, + "loss": 0.8933, + "step": 15926 + }, + { + "epoch": 3.3973976109215016, + "grad_norm": 0.2231079703841486, + "learning_rate": 9.790477162689506e-05, + "loss": 0.8938, + "step": 15927 + }, + { + "epoch": 3.3976109215017063, + "grad_norm": 0.2424740065068826, + "learning_rate": 9.789647762845804e-05, + "loss": 0.91, + "step": 15928 + }, + { + "epoch": 3.3978242320819114, + "grad_norm": 0.2700214304611953, + "learning_rate": 9.788818342752208e-05, + "loss": 0.9148, + "step": 15929 + }, + { + "epoch": 3.398037542662116, + "grad_norm": 0.22959190059758794, + "learning_rate": 9.7879889024181e-05, + "loss": 0.8997, + "step": 15930 + }, + { + "epoch": 3.398250853242321, + "grad_norm": 0.1942783221768272, + "learning_rate": 9.78715944185287e-05, + "loss": 0.9021, + "step": 15931 + }, + { + "epoch": 3.3984641638225255, + "grad_norm": 0.2077792314853457, + "learning_rate": 9.786329961065902e-05, + "loss": 0.9403, + "step": 15932 + }, + { + "epoch": 3.3986774744027306, + "grad_norm": 0.18894418346683597, + "learning_rate": 9.785500460066579e-05, + "loss": 0.8663, + "step": 15933 + }, + { + "epoch": 3.3988907849829353, + "grad_norm": 0.1794214452674251, + "learning_rate": 9.784670938864285e-05, + "loss": 0.9121, + "step": 15934 + }, + { + "epoch": 3.39910409556314, + "grad_norm": 0.24414231568514272, + "learning_rate": 9.783841397468413e-05, + "loss": 0.9505, + "step": 15935 + }, + { + "epoch": 3.3993174061433447, + "grad_norm": 0.25950197931761226, + "learning_rate": 9.783011835888345e-05, + "loss": 0.9298, + "step": 15936 + }, + { + "epoch": 3.3995307167235493, + "grad_norm": 0.2011964015182644, + "learning_rate": 9.782182254133468e-05, + "loss": 0.9376, + "step": 15937 + }, + { + "epoch": 3.399744027303754, + "grad_norm": 0.23127498942809144, + "learning_rate": 9.78135265221317e-05, + "loss": 0.9045, + "step": 15938 + }, + { + "epoch": 3.399957337883959, + "grad_norm": 0.2541524594384983, + "learning_rate": 9.780523030136835e-05, + "loss": 0.9247, + "step": 15939 + }, + { + "epoch": 3.400170648464164, + "grad_norm": 0.2156609000315612, + "learning_rate": 9.779693387913851e-05, + "loss": 0.9004, + "step": 15940 + }, + { + "epoch": 3.4003839590443685, + "grad_norm": 0.20887101999592828, + "learning_rate": 9.778863725553608e-05, + "loss": 0.9076, + "step": 15941 + }, + { + "epoch": 3.400597269624573, + "grad_norm": 0.21787621192503925, + "learning_rate": 9.778034043065491e-05, + "loss": 0.8946, + "step": 15942 + }, + { + "epoch": 3.4008105802047783, + "grad_norm": 0.24750561693948742, + "learning_rate": 9.777204340458891e-05, + "loss": 0.9072, + "step": 15943 + }, + { + "epoch": 3.401023890784983, + "grad_norm": 0.2678113655620243, + "learning_rate": 9.776374617743193e-05, + "loss": 0.9406, + "step": 15944 + }, + { + "epoch": 3.4012372013651877, + "grad_norm": 0.2545238492581046, + "learning_rate": 9.775544874927786e-05, + "loss": 0.9229, + "step": 15945 + }, + { + "epoch": 3.4014505119453924, + "grad_norm": 0.2550704035181218, + "learning_rate": 9.774715112022058e-05, + "loss": 0.916, + "step": 15946 + }, + { + "epoch": 3.401663822525597, + "grad_norm": 0.21475647626410746, + "learning_rate": 9.7738853290354e-05, + "loss": 0.9081, + "step": 15947 + }, + { + "epoch": 3.401877133105802, + "grad_norm": 0.23851049361907403, + "learning_rate": 9.773055525977197e-05, + "loss": 0.904, + "step": 15948 + }, + { + "epoch": 3.402090443686007, + "grad_norm": 0.2599623779303059, + "learning_rate": 9.772225702856848e-05, + "loss": 0.9235, + "step": 15949 + }, + { + "epoch": 3.4023037542662116, + "grad_norm": 0.19905314532611917, + "learning_rate": 9.771395859683728e-05, + "loss": 0.9245, + "step": 15950 + }, + { + "epoch": 3.4025170648464163, + "grad_norm": 0.23920964245973086, + "learning_rate": 9.770565996467238e-05, + "loss": 0.9127, + "step": 15951 + }, + { + "epoch": 3.4027303754266214, + "grad_norm": 0.21443063658980585, + "learning_rate": 9.769736113216761e-05, + "loss": 0.9178, + "step": 15952 + }, + { + "epoch": 3.402943686006826, + "grad_norm": 0.222583194767303, + "learning_rate": 9.768906209941692e-05, + "loss": 0.9224, + "step": 15953 + }, + { + "epoch": 3.4031569965870307, + "grad_norm": 0.2037884106336813, + "learning_rate": 9.768076286651418e-05, + "loss": 0.9248, + "step": 15954 + }, + { + "epoch": 3.4033703071672354, + "grad_norm": 0.2566397180773253, + "learning_rate": 9.767246343355334e-05, + "loss": 0.9187, + "step": 15955 + }, + { + "epoch": 3.40358361774744, + "grad_norm": 0.20179427808730208, + "learning_rate": 9.766416380062826e-05, + "loss": 0.9177, + "step": 15956 + }, + { + "epoch": 3.4037969283276452, + "grad_norm": 0.23972116606533606, + "learning_rate": 9.765586396783286e-05, + "loss": 0.9362, + "step": 15957 + }, + { + "epoch": 3.40401023890785, + "grad_norm": 0.2665780670688675, + "learning_rate": 9.764756393526107e-05, + "loss": 0.9155, + "step": 15958 + }, + { + "epoch": 3.4042235494880546, + "grad_norm": 0.21267506680057294, + "learning_rate": 9.763926370300681e-05, + "loss": 0.9146, + "step": 15959 + }, + { + "epoch": 3.4044368600682593, + "grad_norm": 0.5922506975573798, + "learning_rate": 9.763096327116398e-05, + "loss": 0.9126, + "step": 15960 + }, + { + "epoch": 3.404650170648464, + "grad_norm": 0.24335145699030616, + "learning_rate": 9.762266263982651e-05, + "loss": 0.9504, + "step": 15961 + }, + { + "epoch": 3.404863481228669, + "grad_norm": 0.18620042718738997, + "learning_rate": 9.761436180908828e-05, + "loss": 0.9103, + "step": 15962 + }, + { + "epoch": 3.405076791808874, + "grad_norm": 0.24453859197153952, + "learning_rate": 9.760606077904328e-05, + "loss": 0.9063, + "step": 15963 + }, + { + "epoch": 3.4052901023890785, + "grad_norm": 0.24394323349957597, + "learning_rate": 9.75977595497854e-05, + "loss": 0.8842, + "step": 15964 + }, + { + "epoch": 3.405503412969283, + "grad_norm": 0.26349535607075414, + "learning_rate": 9.758945812140857e-05, + "loss": 0.935, + "step": 15965 + }, + { + "epoch": 3.405716723549488, + "grad_norm": 0.2983177237028029, + "learning_rate": 9.758115649400673e-05, + "loss": 0.9073, + "step": 15966 + }, + { + "epoch": 3.405930034129693, + "grad_norm": 0.29810794006312386, + "learning_rate": 9.757285466767383e-05, + "loss": 0.8987, + "step": 15967 + }, + { + "epoch": 3.4061433447098977, + "grad_norm": 0.3352663930215604, + "learning_rate": 9.756455264250374e-05, + "loss": 0.9251, + "step": 15968 + }, + { + "epoch": 3.4063566552901023, + "grad_norm": 0.2870764324468999, + "learning_rate": 9.755625041859047e-05, + "loss": 0.9251, + "step": 15969 + }, + { + "epoch": 3.406569965870307, + "grad_norm": 0.20169902180899663, + "learning_rate": 9.754794799602792e-05, + "loss": 0.9142, + "step": 15970 + }, + { + "epoch": 3.406783276450512, + "grad_norm": 0.2059868315662207, + "learning_rate": 9.753964537491005e-05, + "loss": 0.9069, + "step": 15971 + }, + { + "epoch": 3.406996587030717, + "grad_norm": 0.18593964179687036, + "learning_rate": 9.75313425553308e-05, + "loss": 0.9454, + "step": 15972 + }, + { + "epoch": 3.4072098976109215, + "grad_norm": 0.23464995262533728, + "learning_rate": 9.752303953738412e-05, + "loss": 0.8792, + "step": 15973 + }, + { + "epoch": 3.407423208191126, + "grad_norm": 0.21884058767542403, + "learning_rate": 9.751473632116393e-05, + "loss": 0.9226, + "step": 15974 + }, + { + "epoch": 3.407636518771331, + "grad_norm": 0.24619689966699856, + "learning_rate": 9.750643290676423e-05, + "loss": 0.9405, + "step": 15975 + }, + { + "epoch": 3.407849829351536, + "grad_norm": 0.272543412125847, + "learning_rate": 9.749812929427892e-05, + "loss": 0.9247, + "step": 15976 + }, + { + "epoch": 3.4080631399317407, + "grad_norm": 0.24742120115530988, + "learning_rate": 9.7489825483802e-05, + "loss": 0.8768, + "step": 15977 + }, + { + "epoch": 3.4082764505119454, + "grad_norm": 0.22439909346127085, + "learning_rate": 9.748152147542741e-05, + "loss": 0.9113, + "step": 15978 + }, + { + "epoch": 3.40848976109215, + "grad_norm": 0.26773181849023314, + "learning_rate": 9.747321726924912e-05, + "loss": 0.9141, + "step": 15979 + }, + { + "epoch": 3.4087030716723548, + "grad_norm": 0.25662799952389964, + "learning_rate": 9.746491286536105e-05, + "loss": 0.9138, + "step": 15980 + }, + { + "epoch": 3.40891638225256, + "grad_norm": 0.2641360163544836, + "learning_rate": 9.745660826385722e-05, + "loss": 0.8918, + "step": 15981 + }, + { + "epoch": 3.4091296928327646, + "grad_norm": 0.536806377308859, + "learning_rate": 9.744830346483156e-05, + "loss": 0.9377, + "step": 15982 + }, + { + "epoch": 3.4093430034129693, + "grad_norm": 0.24956871113438303, + "learning_rate": 9.743999846837809e-05, + "loss": 0.9282, + "step": 15983 + }, + { + "epoch": 3.409556313993174, + "grad_norm": 0.32724640897261087, + "learning_rate": 9.743169327459073e-05, + "loss": 0.9344, + "step": 15984 + }, + { + "epoch": 3.4097696245733786, + "grad_norm": 0.3155026073518927, + "learning_rate": 9.742338788356345e-05, + "loss": 0.9429, + "step": 15985 + }, + { + "epoch": 3.4099829351535837, + "grad_norm": 0.3153935538772832, + "learning_rate": 9.741508229539023e-05, + "loss": 0.8803, + "step": 15986 + }, + { + "epoch": 3.4101962457337884, + "grad_norm": 0.3218670768703375, + "learning_rate": 9.740677651016508e-05, + "loss": 0.9169, + "step": 15987 + }, + { + "epoch": 3.410409556313993, + "grad_norm": 0.23002271718496717, + "learning_rate": 9.739847052798198e-05, + "loss": 0.9055, + "step": 15988 + }, + { + "epoch": 3.410622866894198, + "grad_norm": 0.21484114992809128, + "learning_rate": 9.739016434893485e-05, + "loss": 0.9058, + "step": 15989 + }, + { + "epoch": 3.410836177474403, + "grad_norm": 0.2906750206781211, + "learning_rate": 9.738185797311776e-05, + "loss": 0.9285, + "step": 15990 + }, + { + "epoch": 3.4110494880546076, + "grad_norm": 0.25963598945773303, + "learning_rate": 9.737355140062463e-05, + "loss": 0.9083, + "step": 15991 + }, + { + "epoch": 3.4112627986348123, + "grad_norm": 0.232827736983095, + "learning_rate": 9.736524463154947e-05, + "loss": 0.9067, + "step": 15992 + }, + { + "epoch": 3.411476109215017, + "grad_norm": 0.21532150949273018, + "learning_rate": 9.73569376659863e-05, + "loss": 0.9308, + "step": 15993 + }, + { + "epoch": 3.4116894197952217, + "grad_norm": 0.20876075155109122, + "learning_rate": 9.734863050402908e-05, + "loss": 0.9341, + "step": 15994 + }, + { + "epoch": 3.411902730375427, + "grad_norm": 0.21276391118239665, + "learning_rate": 9.734032314577178e-05, + "loss": 0.9096, + "step": 15995 + }, + { + "epoch": 3.4121160409556315, + "grad_norm": 0.2233510811515992, + "learning_rate": 9.733201559130849e-05, + "loss": 0.9067, + "step": 15996 + }, + { + "epoch": 3.412329351535836, + "grad_norm": 0.21865349356897737, + "learning_rate": 9.732370784073312e-05, + "loss": 0.8981, + "step": 15997 + }, + { + "epoch": 3.412542662116041, + "grad_norm": 0.23822986922622974, + "learning_rate": 9.731539989413968e-05, + "loss": 0.9204, + "step": 15998 + }, + { + "epoch": 3.412755972696246, + "grad_norm": 0.2038435541401482, + "learning_rate": 9.730709175162225e-05, + "loss": 0.941, + "step": 15999 + }, + { + "epoch": 3.4129692832764507, + "grad_norm": 0.21138289340364652, + "learning_rate": 9.729878341327475e-05, + "loss": 0.9233, + "step": 16000 + }, + { + "epoch": 3.4131825938566553, + "grad_norm": 0.20632072472201005, + "learning_rate": 9.729047487919123e-05, + "loss": 0.9284, + "step": 16001 + }, + { + "epoch": 3.41339590443686, + "grad_norm": 0.22634859484038442, + "learning_rate": 9.728216614946571e-05, + "loss": 0.9129, + "step": 16002 + }, + { + "epoch": 3.4136092150170647, + "grad_norm": 0.16859751080636234, + "learning_rate": 9.727385722419219e-05, + "loss": 0.8766, + "step": 16003 + }, + { + "epoch": 3.4138225255972694, + "grad_norm": 0.1950279137474181, + "learning_rate": 9.726554810346464e-05, + "loss": 0.9163, + "step": 16004 + }, + { + "epoch": 3.4140358361774745, + "grad_norm": 0.2530943574222073, + "learning_rate": 9.725723878737718e-05, + "loss": 0.9378, + "step": 16005 + }, + { + "epoch": 3.414249146757679, + "grad_norm": 0.2256187577057243, + "learning_rate": 9.724892927602377e-05, + "loss": 0.9435, + "step": 16006 + }, + { + "epoch": 3.414462457337884, + "grad_norm": 0.25749349521544584, + "learning_rate": 9.72406195694984e-05, + "loss": 0.9119, + "step": 16007 + }, + { + "epoch": 3.4146757679180886, + "grad_norm": 0.3016382737284667, + "learning_rate": 9.723230966789515e-05, + "loss": 0.9132, + "step": 16008 + }, + { + "epoch": 3.4148890784982937, + "grad_norm": 0.37530897088150905, + "learning_rate": 9.722399957130801e-05, + "loss": 0.9329, + "step": 16009 + }, + { + "epoch": 3.4151023890784984, + "grad_norm": 0.3227119874575711, + "learning_rate": 9.721568927983104e-05, + "loss": 0.9052, + "step": 16010 + }, + { + "epoch": 3.415315699658703, + "grad_norm": 0.2894171405710371, + "learning_rate": 9.720737879355827e-05, + "loss": 0.9066, + "step": 16011 + }, + { + "epoch": 3.4155290102389078, + "grad_norm": 0.2339548129183884, + "learning_rate": 9.719906811258369e-05, + "loss": 0.9713, + "step": 16012 + }, + { + "epoch": 3.4157423208191124, + "grad_norm": 0.18090750792041696, + "learning_rate": 9.719075723700137e-05, + "loss": 0.9199, + "step": 16013 + }, + { + "epoch": 3.4159556313993176, + "grad_norm": 0.22205507379649123, + "learning_rate": 9.718244616690536e-05, + "loss": 0.9554, + "step": 16014 + }, + { + "epoch": 3.4161689419795223, + "grad_norm": 0.24002443912079224, + "learning_rate": 9.717413490238968e-05, + "loss": 0.9, + "step": 16015 + }, + { + "epoch": 3.416382252559727, + "grad_norm": 0.20896644472411308, + "learning_rate": 9.716582344354838e-05, + "loss": 0.9246, + "step": 16016 + }, + { + "epoch": 3.4165955631399316, + "grad_norm": 0.20750275583180558, + "learning_rate": 9.715751179047547e-05, + "loss": 0.8779, + "step": 16017 + }, + { + "epoch": 3.4168088737201368, + "grad_norm": 0.22630042534927686, + "learning_rate": 9.714919994326504e-05, + "loss": 0.9434, + "step": 16018 + }, + { + "epoch": 3.4170221843003414, + "grad_norm": 0.23138238155499494, + "learning_rate": 9.714088790201113e-05, + "loss": 0.9187, + "step": 16019 + }, + { + "epoch": 3.417235494880546, + "grad_norm": 0.808835300082674, + "learning_rate": 9.713257566680778e-05, + "loss": 0.9045, + "step": 16020 + }, + { + "epoch": 3.417448805460751, + "grad_norm": 0.2372296371501032, + "learning_rate": 9.712426323774905e-05, + "loss": 0.9168, + "step": 16021 + }, + { + "epoch": 3.4176621160409555, + "grad_norm": 0.2706353847451262, + "learning_rate": 9.7115950614929e-05, + "loss": 0.9319, + "step": 16022 + }, + { + "epoch": 3.41787542662116, + "grad_norm": 0.2619555631754232, + "learning_rate": 9.710763779844167e-05, + "loss": 0.9199, + "step": 16023 + }, + { + "epoch": 3.4180887372013653, + "grad_norm": 0.2594809627485521, + "learning_rate": 9.709932478838114e-05, + "loss": 0.9382, + "step": 16024 + }, + { + "epoch": 3.41830204778157, + "grad_norm": 0.22571236219008092, + "learning_rate": 9.709101158484144e-05, + "loss": 0.9188, + "step": 16025 + }, + { + "epoch": 3.4185153583617747, + "grad_norm": 0.2282390760279398, + "learning_rate": 9.708269818791668e-05, + "loss": 0.9354, + "step": 16026 + }, + { + "epoch": 3.4187286689419794, + "grad_norm": 0.18930538668783012, + "learning_rate": 9.707438459770091e-05, + "loss": 0.9202, + "step": 16027 + }, + { + "epoch": 3.4189419795221845, + "grad_norm": 0.20409906547903284, + "learning_rate": 9.706607081428819e-05, + "loss": 0.9304, + "step": 16028 + }, + { + "epoch": 3.419155290102389, + "grad_norm": 0.2079466060149938, + "learning_rate": 9.705775683777255e-05, + "loss": 0.914, + "step": 16029 + }, + { + "epoch": 3.419368600682594, + "grad_norm": 0.25888328133461785, + "learning_rate": 9.704944266824815e-05, + "loss": 0.9369, + "step": 16030 + }, + { + "epoch": 3.4195819112627985, + "grad_norm": 0.3110484246023354, + "learning_rate": 9.704112830580898e-05, + "loss": 0.9286, + "step": 16031 + }, + { + "epoch": 3.419795221843003, + "grad_norm": 0.3714994772589989, + "learning_rate": 9.703281375054918e-05, + "loss": 0.9369, + "step": 16032 + }, + { + "epoch": 3.4200085324232083, + "grad_norm": 0.3989911530080284, + "learning_rate": 9.702449900256282e-05, + "loss": 0.9394, + "step": 16033 + }, + { + "epoch": 3.420221843003413, + "grad_norm": 0.3836539548636733, + "learning_rate": 9.701618406194396e-05, + "loss": 0.9112, + "step": 16034 + }, + { + "epoch": 3.4204351535836177, + "grad_norm": 0.3084522767318285, + "learning_rate": 9.700786892878667e-05, + "loss": 0.9337, + "step": 16035 + }, + { + "epoch": 3.4206484641638224, + "grad_norm": 0.2216881195158351, + "learning_rate": 9.699955360318507e-05, + "loss": 0.9413, + "step": 16036 + }, + { + "epoch": 3.4208617747440275, + "grad_norm": 0.2273672478895177, + "learning_rate": 9.699123808523324e-05, + "loss": 0.9499, + "step": 16037 + }, + { + "epoch": 3.421075085324232, + "grad_norm": 0.37588489365845573, + "learning_rate": 9.698292237502525e-05, + "loss": 0.9213, + "step": 16038 + }, + { + "epoch": 3.421288395904437, + "grad_norm": 0.5252879577016369, + "learning_rate": 9.697460647265523e-05, + "loss": 0.91, + "step": 16039 + }, + { + "epoch": 3.4215017064846416, + "grad_norm": 0.5586149721560133, + "learning_rate": 9.696629037821725e-05, + "loss": 0.9288, + "step": 16040 + }, + { + "epoch": 3.4217150170648463, + "grad_norm": 0.46843452370070937, + "learning_rate": 9.69579740918054e-05, + "loss": 0.9497, + "step": 16041 + }, + { + "epoch": 3.421928327645051, + "grad_norm": 0.281754459485931, + "learning_rate": 9.694965761351378e-05, + "loss": 0.9044, + "step": 16042 + }, + { + "epoch": 3.422141638225256, + "grad_norm": 0.24411983914638768, + "learning_rate": 9.694134094343651e-05, + "loss": 0.9634, + "step": 16043 + }, + { + "epoch": 3.4223549488054608, + "grad_norm": 0.3450396999669149, + "learning_rate": 9.693302408166769e-05, + "loss": 0.9491, + "step": 16044 + }, + { + "epoch": 3.4225682593856654, + "grad_norm": 0.36100759200994964, + "learning_rate": 9.692470702830139e-05, + "loss": 0.8957, + "step": 16045 + }, + { + "epoch": 3.42278156996587, + "grad_norm": 0.3939359468356345, + "learning_rate": 9.691638978343177e-05, + "loss": 0.9487, + "step": 16046 + }, + { + "epoch": 3.4229948805460753, + "grad_norm": 0.4085518144468984, + "learning_rate": 9.690807234715291e-05, + "loss": 0.9141, + "step": 16047 + }, + { + "epoch": 3.42320819112628, + "grad_norm": 0.32090566210837207, + "learning_rate": 9.689975471955892e-05, + "loss": 0.8982, + "step": 16048 + }, + { + "epoch": 3.4234215017064846, + "grad_norm": 0.2464793538528731, + "learning_rate": 9.689143690074391e-05, + "loss": 0.9482, + "step": 16049 + }, + { + "epoch": 3.4236348122866893, + "grad_norm": 0.2765516541160573, + "learning_rate": 9.688311889080204e-05, + "loss": 0.9354, + "step": 16050 + }, + { + "epoch": 3.423848122866894, + "grad_norm": 0.30776931094515264, + "learning_rate": 9.687480068982737e-05, + "loss": 0.8789, + "step": 16051 + }, + { + "epoch": 3.424061433447099, + "grad_norm": 0.31385582693391506, + "learning_rate": 9.686648229791407e-05, + "loss": 0.9017, + "step": 16052 + }, + { + "epoch": 3.424274744027304, + "grad_norm": 0.23904815423219977, + "learning_rate": 9.685816371515622e-05, + "loss": 0.8947, + "step": 16053 + }, + { + "epoch": 3.4244880546075085, + "grad_norm": 0.19438871416082346, + "learning_rate": 9.684984494164797e-05, + "loss": 0.9216, + "step": 16054 + }, + { + "epoch": 3.424701365187713, + "grad_norm": 0.2538916647306563, + "learning_rate": 9.684152597748345e-05, + "loss": 0.925, + "step": 16055 + }, + { + "epoch": 3.4249146757679183, + "grad_norm": 0.36067811621382306, + "learning_rate": 9.683320682275677e-05, + "loss": 0.9291, + "step": 16056 + }, + { + "epoch": 3.425127986348123, + "grad_norm": 0.31157689500631736, + "learning_rate": 9.682488747756205e-05, + "loss": 0.9119, + "step": 16057 + }, + { + "epoch": 3.4253412969283277, + "grad_norm": 0.23130352926647244, + "learning_rate": 9.681656794199347e-05, + "loss": 0.9251, + "step": 16058 + }, + { + "epoch": 3.4255546075085324, + "grad_norm": 0.19631253407941612, + "learning_rate": 9.680824821614513e-05, + "loss": 0.9101, + "step": 16059 + }, + { + "epoch": 3.425767918088737, + "grad_norm": 0.21787354435771028, + "learning_rate": 9.679992830011121e-05, + "loss": 0.8964, + "step": 16060 + }, + { + "epoch": 3.425981228668942, + "grad_norm": 0.2615200521657107, + "learning_rate": 9.679160819398579e-05, + "loss": 0.8985, + "step": 16061 + }, + { + "epoch": 3.426194539249147, + "grad_norm": 0.3030164375445614, + "learning_rate": 9.678328789786305e-05, + "loss": 0.8869, + "step": 16062 + }, + { + "epoch": 3.4264078498293515, + "grad_norm": 0.321240061855736, + "learning_rate": 9.67749674118371e-05, + "loss": 0.8779, + "step": 16063 + }, + { + "epoch": 3.426621160409556, + "grad_norm": 0.29579238988735945, + "learning_rate": 9.676664673600216e-05, + "loss": 0.925, + "step": 16064 + }, + { + "epoch": 3.426834470989761, + "grad_norm": 0.2081968698899903, + "learning_rate": 9.675832587045227e-05, + "loss": 0.918, + "step": 16065 + }, + { + "epoch": 3.427047781569966, + "grad_norm": 0.23531192066950563, + "learning_rate": 9.675000481528168e-05, + "loss": 0.9389, + "step": 16066 + }, + { + "epoch": 3.4272610921501707, + "grad_norm": 0.21395920158904627, + "learning_rate": 9.674168357058451e-05, + "loss": 0.9422, + "step": 16067 + }, + { + "epoch": 3.4274744027303754, + "grad_norm": 0.22106279143774565, + "learning_rate": 9.673336213645491e-05, + "loss": 0.9363, + "step": 16068 + }, + { + "epoch": 3.42768771331058, + "grad_norm": 0.2006516659916746, + "learning_rate": 9.672504051298699e-05, + "loss": 0.9279, + "step": 16069 + }, + { + "epoch": 3.4279010238907848, + "grad_norm": 0.21565246864540005, + "learning_rate": 9.6716718700275e-05, + "loss": 0.895, + "step": 16070 + }, + { + "epoch": 3.42811433447099, + "grad_norm": 0.23721595206870433, + "learning_rate": 9.670839669841305e-05, + "loss": 0.9318, + "step": 16071 + }, + { + "epoch": 3.4283276450511946, + "grad_norm": 0.20840385642538328, + "learning_rate": 9.670007450749531e-05, + "loss": 0.9215, + "step": 16072 + }, + { + "epoch": 3.4285409556313993, + "grad_norm": 0.23474645412031098, + "learning_rate": 9.669175212761593e-05, + "loss": 0.9096, + "step": 16073 + }, + { + "epoch": 3.428754266211604, + "grad_norm": 0.2721304145166035, + "learning_rate": 9.66834295588691e-05, + "loss": 0.9064, + "step": 16074 + }, + { + "epoch": 3.428967576791809, + "grad_norm": 0.20010590141665044, + "learning_rate": 9.667510680134898e-05, + "loss": 0.9111, + "step": 16075 + }, + { + "epoch": 3.4291808873720138, + "grad_norm": 0.23498306079871614, + "learning_rate": 9.666678385514976e-05, + "loss": 0.9315, + "step": 16076 + }, + { + "epoch": 3.4293941979522184, + "grad_norm": 0.2579640062037586, + "learning_rate": 9.66584607203656e-05, + "loss": 0.9099, + "step": 16077 + }, + { + "epoch": 3.429607508532423, + "grad_norm": 0.20264107995421005, + "learning_rate": 9.665013739709068e-05, + "loss": 0.9518, + "step": 16078 + }, + { + "epoch": 3.429820819112628, + "grad_norm": 0.21548375263601846, + "learning_rate": 9.664181388541915e-05, + "loss": 0.9214, + "step": 16079 + }, + { + "epoch": 3.430034129692833, + "grad_norm": 0.2487844709206371, + "learning_rate": 9.663349018544525e-05, + "loss": 0.9035, + "step": 16080 + }, + { + "epoch": 3.4302474402730376, + "grad_norm": 0.26559656186371583, + "learning_rate": 9.662516629726309e-05, + "loss": 0.8923, + "step": 16081 + }, + { + "epoch": 3.4304607508532423, + "grad_norm": 0.2023742356715966, + "learning_rate": 9.661684222096693e-05, + "loss": 0.8909, + "step": 16082 + }, + { + "epoch": 3.430674061433447, + "grad_norm": 0.22399165410622662, + "learning_rate": 9.660851795665091e-05, + "loss": 0.9348, + "step": 16083 + }, + { + "epoch": 3.430887372013652, + "grad_norm": 0.2210762140638707, + "learning_rate": 9.660019350440924e-05, + "loss": 0.9133, + "step": 16084 + }, + { + "epoch": 3.431100682593857, + "grad_norm": 0.1928768523314705, + "learning_rate": 9.659186886433607e-05, + "loss": 0.9337, + "step": 16085 + }, + { + "epoch": 3.4313139931740615, + "grad_norm": 0.22058892329076132, + "learning_rate": 9.658354403652563e-05, + "loss": 0.9442, + "step": 16086 + }, + { + "epoch": 3.431527303754266, + "grad_norm": 0.21882161664042038, + "learning_rate": 9.657521902107211e-05, + "loss": 0.924, + "step": 16087 + }, + { + "epoch": 3.431740614334471, + "grad_norm": 0.19035221433656613, + "learning_rate": 9.656689381806974e-05, + "loss": 0.9391, + "step": 16088 + }, + { + "epoch": 3.4319539249146755, + "grad_norm": 0.19774943871013173, + "learning_rate": 9.655856842761268e-05, + "loss": 0.9031, + "step": 16089 + }, + { + "epoch": 3.4321672354948807, + "grad_norm": 0.2574152173924193, + "learning_rate": 9.655024284979513e-05, + "loss": 0.8906, + "step": 16090 + }, + { + "epoch": 3.4323805460750854, + "grad_norm": 0.2135943637301545, + "learning_rate": 9.654191708471128e-05, + "loss": 0.9274, + "step": 16091 + }, + { + "epoch": 3.43259385665529, + "grad_norm": 0.2175302595541897, + "learning_rate": 9.65335911324554e-05, + "loss": 0.9105, + "step": 16092 + }, + { + "epoch": 3.4328071672354947, + "grad_norm": 0.242068097624455, + "learning_rate": 9.652526499312162e-05, + "loss": 0.9205, + "step": 16093 + }, + { + "epoch": 3.4330204778157, + "grad_norm": 0.26509754323875945, + "learning_rate": 9.65169386668042e-05, + "loss": 0.9098, + "step": 16094 + }, + { + "epoch": 3.4332337883959045, + "grad_norm": 0.2696193264755348, + "learning_rate": 9.650861215359738e-05, + "loss": 0.9114, + "step": 16095 + }, + { + "epoch": 3.4334470989761092, + "grad_norm": 0.27516374543451555, + "learning_rate": 9.650028545359529e-05, + "loss": 0.9325, + "step": 16096 + }, + { + "epoch": 3.433660409556314, + "grad_norm": 0.23124529727502272, + "learning_rate": 9.649195856689221e-05, + "loss": 0.9032, + "step": 16097 + }, + { + "epoch": 3.4338737201365186, + "grad_norm": 0.26915921289081574, + "learning_rate": 9.648363149358232e-05, + "loss": 0.9203, + "step": 16098 + }, + { + "epoch": 3.4340870307167237, + "grad_norm": 0.26534901107686915, + "learning_rate": 9.647530423375989e-05, + "loss": 0.9349, + "step": 16099 + }, + { + "epoch": 3.4343003412969284, + "grad_norm": 0.28841330648067226, + "learning_rate": 9.646697678751914e-05, + "loss": 0.9204, + "step": 16100 + }, + { + "epoch": 3.434513651877133, + "grad_norm": 0.30541986877279986, + "learning_rate": 9.645864915495421e-05, + "loss": 0.9094, + "step": 16101 + }, + { + "epoch": 3.4347269624573378, + "grad_norm": 0.22410442904457872, + "learning_rate": 9.645032133615942e-05, + "loss": 0.9384, + "step": 16102 + }, + { + "epoch": 3.434940273037543, + "grad_norm": 0.27689042505705963, + "learning_rate": 9.644199333122896e-05, + "loss": 0.9271, + "step": 16103 + }, + { + "epoch": 3.4351535836177476, + "grad_norm": 0.2398498052482098, + "learning_rate": 9.643366514025707e-05, + "loss": 0.9239, + "step": 16104 + }, + { + "epoch": 3.4353668941979523, + "grad_norm": 0.33670260800068436, + "learning_rate": 9.642533676333797e-05, + "loss": 0.9447, + "step": 16105 + }, + { + "epoch": 3.435580204778157, + "grad_norm": 0.3039458664756672, + "learning_rate": 9.641700820056595e-05, + "loss": 0.9288, + "step": 16106 + }, + { + "epoch": 3.4357935153583616, + "grad_norm": 0.3156073510286617, + "learning_rate": 9.640867945203516e-05, + "loss": 0.916, + "step": 16107 + }, + { + "epoch": 3.4360068259385663, + "grad_norm": 0.34214789616771757, + "learning_rate": 9.640035051783992e-05, + "loss": 0.9416, + "step": 16108 + }, + { + "epoch": 3.4362201365187715, + "grad_norm": 0.29264653137733393, + "learning_rate": 9.639202139807441e-05, + "loss": 0.9372, + "step": 16109 + }, + { + "epoch": 3.436433447098976, + "grad_norm": 0.21089916323424257, + "learning_rate": 9.638369209283293e-05, + "loss": 0.9185, + "step": 16110 + }, + { + "epoch": 3.436646757679181, + "grad_norm": 0.24083117890107972, + "learning_rate": 9.637536260220968e-05, + "loss": 0.9459, + "step": 16111 + }, + { + "epoch": 3.4368600682593855, + "grad_norm": 0.29375939679258456, + "learning_rate": 9.636703292629893e-05, + "loss": 0.9479, + "step": 16112 + }, + { + "epoch": 3.4370733788395906, + "grad_norm": 0.38136051701124424, + "learning_rate": 9.635870306519492e-05, + "loss": 0.9158, + "step": 16113 + }, + { + "epoch": 3.4372866894197953, + "grad_norm": 0.408153040422899, + "learning_rate": 9.635037301899193e-05, + "loss": 0.9009, + "step": 16114 + }, + { + "epoch": 3.4375, + "grad_norm": 0.336233919166304, + "learning_rate": 9.634204278778417e-05, + "loss": 0.9323, + "step": 16115 + }, + { + "epoch": 3.4377133105802047, + "grad_norm": 0.24448718236929814, + "learning_rate": 9.633371237166593e-05, + "loss": 0.9237, + "step": 16116 + }, + { + "epoch": 3.4379266211604094, + "grad_norm": 0.2254104291378124, + "learning_rate": 9.632538177073147e-05, + "loss": 0.8789, + "step": 16117 + }, + { + "epoch": 3.4381399317406145, + "grad_norm": 0.3117187785453192, + "learning_rate": 9.631705098507504e-05, + "loss": 0.908, + "step": 16118 + }, + { + "epoch": 3.438353242320819, + "grad_norm": 0.3387389538656406, + "learning_rate": 9.630872001479089e-05, + "loss": 0.9073, + "step": 16119 + }, + { + "epoch": 3.438566552901024, + "grad_norm": 0.34605117557915827, + "learning_rate": 9.63003888599733e-05, + "loss": 0.8916, + "step": 16120 + }, + { + "epoch": 3.4387798634812285, + "grad_norm": 0.25962920547245183, + "learning_rate": 9.629205752071652e-05, + "loss": 0.9095, + "step": 16121 + }, + { + "epoch": 3.4389931740614337, + "grad_norm": 0.19762503885210603, + "learning_rate": 9.628372599711487e-05, + "loss": 0.9212, + "step": 16122 + }, + { + "epoch": 3.4392064846416384, + "grad_norm": 0.23300381416934243, + "learning_rate": 9.627539428926257e-05, + "loss": 0.9256, + "step": 16123 + }, + { + "epoch": 3.439419795221843, + "grad_norm": 0.2810553286425311, + "learning_rate": 9.626706239725393e-05, + "loss": 0.9271, + "step": 16124 + }, + { + "epoch": 3.4396331058020477, + "grad_norm": 0.2666713509234472, + "learning_rate": 9.625873032118318e-05, + "loss": 0.9349, + "step": 16125 + }, + { + "epoch": 3.4398464163822524, + "grad_norm": 0.20936209703887926, + "learning_rate": 9.625039806114463e-05, + "loss": 0.947, + "step": 16126 + }, + { + "epoch": 3.440059726962457, + "grad_norm": 0.18692729273652278, + "learning_rate": 9.624206561723255e-05, + "loss": 0.955, + "step": 16127 + }, + { + "epoch": 3.4402730375426622, + "grad_norm": 0.25223464122875683, + "learning_rate": 9.623373298954124e-05, + "loss": 0.9138, + "step": 16128 + }, + { + "epoch": 3.440486348122867, + "grad_norm": 0.19972779098656052, + "learning_rate": 9.622540017816495e-05, + "loss": 0.938, + "step": 16129 + }, + { + "epoch": 3.4406996587030716, + "grad_norm": 0.2014322250767543, + "learning_rate": 9.621706718319799e-05, + "loss": 0.9191, + "step": 16130 + }, + { + "epoch": 3.4409129692832763, + "grad_norm": 0.2160972001433411, + "learning_rate": 9.620873400473462e-05, + "loss": 0.8986, + "step": 16131 + }, + { + "epoch": 3.4411262798634814, + "grad_norm": 0.1954199467053956, + "learning_rate": 9.620040064286919e-05, + "loss": 0.9433, + "step": 16132 + }, + { + "epoch": 3.441339590443686, + "grad_norm": 0.1939068683729343, + "learning_rate": 9.619206709769593e-05, + "loss": 0.9041, + "step": 16133 + }, + { + "epoch": 3.4415529010238908, + "grad_norm": 0.19702478212220592, + "learning_rate": 9.618373336930918e-05, + "loss": 0.9113, + "step": 16134 + }, + { + "epoch": 3.4417662116040955, + "grad_norm": 0.2333561102242433, + "learning_rate": 9.61753994578032e-05, + "loss": 0.906, + "step": 16135 + }, + { + "epoch": 3.4419795221843, + "grad_norm": 0.1791271756489217, + "learning_rate": 9.616706536327229e-05, + "loss": 0.9153, + "step": 16136 + }, + { + "epoch": 3.4421928327645053, + "grad_norm": 0.21299630219212246, + "learning_rate": 9.615873108581078e-05, + "loss": 0.9138, + "step": 16137 + }, + { + "epoch": 3.44240614334471, + "grad_norm": 0.19773783358886904, + "learning_rate": 9.615039662551296e-05, + "loss": 0.954, + "step": 16138 + }, + { + "epoch": 3.4426194539249146, + "grad_norm": 0.21412293387955622, + "learning_rate": 9.614206198247312e-05, + "loss": 0.9072, + "step": 16139 + }, + { + "epoch": 3.4428327645051193, + "grad_norm": 0.25575958855420267, + "learning_rate": 9.613372715678559e-05, + "loss": 0.9214, + "step": 16140 + }, + { + "epoch": 3.4430460750853245, + "grad_norm": 0.27936094750732027, + "learning_rate": 9.612539214854465e-05, + "loss": 0.9326, + "step": 16141 + }, + { + "epoch": 3.443259385665529, + "grad_norm": 0.19681158271720292, + "learning_rate": 9.611705695784464e-05, + "loss": 0.9181, + "step": 16142 + }, + { + "epoch": 3.443472696245734, + "grad_norm": 0.21689643827065452, + "learning_rate": 9.610872158477984e-05, + "loss": 0.9224, + "step": 16143 + }, + { + "epoch": 3.4436860068259385, + "grad_norm": 0.242454326114206, + "learning_rate": 9.610038602944461e-05, + "loss": 0.9682, + "step": 16144 + }, + { + "epoch": 3.443899317406143, + "grad_norm": 0.2378114402336617, + "learning_rate": 9.609205029193323e-05, + "loss": 0.9471, + "step": 16145 + }, + { + "epoch": 3.444112627986348, + "grad_norm": 0.28198005741694715, + "learning_rate": 9.608371437234003e-05, + "loss": 0.9345, + "step": 16146 + }, + { + "epoch": 3.444325938566553, + "grad_norm": 0.2899003970072762, + "learning_rate": 9.607537827075932e-05, + "loss": 0.9123, + "step": 16147 + }, + { + "epoch": 3.4445392491467577, + "grad_norm": 0.3052899506119497, + "learning_rate": 9.606704198728545e-05, + "loss": 0.9259, + "step": 16148 + }, + { + "epoch": 3.4447525597269624, + "grad_norm": 0.25480307273353114, + "learning_rate": 9.605870552201271e-05, + "loss": 0.9359, + "step": 16149 + }, + { + "epoch": 3.444965870307167, + "grad_norm": 0.23586109534401575, + "learning_rate": 9.605036887503549e-05, + "loss": 0.9223, + "step": 16150 + }, + { + "epoch": 3.445179180887372, + "grad_norm": 0.2614034877458149, + "learning_rate": 9.604203204644803e-05, + "loss": 0.9271, + "step": 16151 + }, + { + "epoch": 3.445392491467577, + "grad_norm": 0.24603447043203278, + "learning_rate": 9.603369503634474e-05, + "loss": 0.9171, + "step": 16152 + }, + { + "epoch": 3.4456058020477816, + "grad_norm": 0.21081271505043503, + "learning_rate": 9.602535784481989e-05, + "loss": 0.9295, + "step": 16153 + }, + { + "epoch": 3.4458191126279862, + "grad_norm": 0.2345382424633492, + "learning_rate": 9.601702047196787e-05, + "loss": 0.9064, + "step": 16154 + }, + { + "epoch": 3.446032423208191, + "grad_norm": 0.2601350606972677, + "learning_rate": 9.600868291788298e-05, + "loss": 0.9269, + "step": 16155 + }, + { + "epoch": 3.446245733788396, + "grad_norm": 0.22086675149297605, + "learning_rate": 9.60003451826596e-05, + "loss": 0.9148, + "step": 16156 + }, + { + "epoch": 3.4464590443686007, + "grad_norm": 0.2144038481635675, + "learning_rate": 9.599200726639201e-05, + "loss": 0.9319, + "step": 16157 + }, + { + "epoch": 3.4466723549488054, + "grad_norm": 0.22473304800475968, + "learning_rate": 9.59836691691746e-05, + "loss": 0.935, + "step": 16158 + }, + { + "epoch": 3.44688566552901, + "grad_norm": 0.2708348687701805, + "learning_rate": 9.597533089110171e-05, + "loss": 0.9118, + "step": 16159 + }, + { + "epoch": 3.4470989761092152, + "grad_norm": 0.2442375176985486, + "learning_rate": 9.596699243226768e-05, + "loss": 0.942, + "step": 16160 + }, + { + "epoch": 3.44731228668942, + "grad_norm": 0.23541684350284622, + "learning_rate": 9.595865379276685e-05, + "loss": 0.9566, + "step": 16161 + }, + { + "epoch": 3.4475255972696246, + "grad_norm": 0.2374502805267931, + "learning_rate": 9.595031497269363e-05, + "loss": 0.9148, + "step": 16162 + }, + { + "epoch": 3.4477389078498293, + "grad_norm": 0.20327360349367568, + "learning_rate": 9.594197597214226e-05, + "loss": 0.9275, + "step": 16163 + }, + { + "epoch": 3.447952218430034, + "grad_norm": 0.30876417431373504, + "learning_rate": 9.593363679120721e-05, + "loss": 0.9204, + "step": 16164 + }, + { + "epoch": 3.448165529010239, + "grad_norm": 0.29685372965222867, + "learning_rate": 9.592529742998278e-05, + "loss": 0.942, + "step": 16165 + }, + { + "epoch": 3.448378839590444, + "grad_norm": 0.2972756971075792, + "learning_rate": 9.591695788856336e-05, + "loss": 0.9422, + "step": 16166 + }, + { + "epoch": 3.4485921501706485, + "grad_norm": 0.24762037532395592, + "learning_rate": 9.590861816704326e-05, + "loss": 0.9112, + "step": 16167 + }, + { + "epoch": 3.448805460750853, + "grad_norm": 0.24158552796575095, + "learning_rate": 9.590027826551691e-05, + "loss": 0.9468, + "step": 16168 + }, + { + "epoch": 3.449018771331058, + "grad_norm": 0.21188082537469724, + "learning_rate": 9.589193818407861e-05, + "loss": 0.932, + "step": 16169 + }, + { + "epoch": 3.449232081911263, + "grad_norm": 0.17656787050609793, + "learning_rate": 9.588359792282279e-05, + "loss": 0.8938, + "step": 16170 + }, + { + "epoch": 3.4494453924914676, + "grad_norm": 0.2784800957827779, + "learning_rate": 9.587525748184377e-05, + "loss": 0.9204, + "step": 16171 + }, + { + "epoch": 3.4496587030716723, + "grad_norm": 0.26539634000587203, + "learning_rate": 9.586691686123596e-05, + "loss": 0.9152, + "step": 16172 + }, + { + "epoch": 3.449872013651877, + "grad_norm": 0.32864159750660304, + "learning_rate": 9.585857606109373e-05, + "loss": 0.945, + "step": 16173 + }, + { + "epoch": 3.4500853242320817, + "grad_norm": 0.3888420339018493, + "learning_rate": 9.585023508151143e-05, + "loss": 0.9342, + "step": 16174 + }, + { + "epoch": 3.450298634812287, + "grad_norm": 0.37709950950581805, + "learning_rate": 9.584189392258344e-05, + "loss": 0.8792, + "step": 16175 + }, + { + "epoch": 3.4505119453924915, + "grad_norm": 0.2740470856297605, + "learning_rate": 9.583355258440418e-05, + "loss": 0.9593, + "step": 16176 + }, + { + "epoch": 3.450725255972696, + "grad_norm": 0.2583698493887959, + "learning_rate": 9.582521106706797e-05, + "loss": 0.9265, + "step": 16177 + }, + { + "epoch": 3.450938566552901, + "grad_norm": 0.31095109718286407, + "learning_rate": 9.581686937066928e-05, + "loss": 0.9425, + "step": 16178 + }, + { + "epoch": 3.451151877133106, + "grad_norm": 0.3943934365343056, + "learning_rate": 9.580852749530242e-05, + "loss": 0.9192, + "step": 16179 + }, + { + "epoch": 3.4513651877133107, + "grad_norm": 0.49538869778855316, + "learning_rate": 9.580018544106181e-05, + "loss": 0.9139, + "step": 16180 + }, + { + "epoch": 3.4515784982935154, + "grad_norm": 0.565431114437667, + "learning_rate": 9.579184320804182e-05, + "loss": 0.9019, + "step": 16181 + }, + { + "epoch": 3.45179180887372, + "grad_norm": 0.41816265492963256, + "learning_rate": 9.578350079633687e-05, + "loss": 0.9011, + "step": 16182 + }, + { + "epoch": 3.4520051194539247, + "grad_norm": 0.2963665308689712, + "learning_rate": 9.577515820604134e-05, + "loss": 0.9373, + "step": 16183 + }, + { + "epoch": 3.45221843003413, + "grad_norm": 0.33207986620577046, + "learning_rate": 9.576681543724966e-05, + "loss": 0.9131, + "step": 16184 + }, + { + "epoch": 3.4524317406143346, + "grad_norm": 0.32084917946132585, + "learning_rate": 9.575847249005614e-05, + "loss": 0.9083, + "step": 16185 + }, + { + "epoch": 3.4526450511945392, + "grad_norm": 0.4554422662310421, + "learning_rate": 9.575012936455528e-05, + "loss": 0.9127, + "step": 16186 + }, + { + "epoch": 3.452858361774744, + "grad_norm": 0.5075617991521745, + "learning_rate": 9.574178606084142e-05, + "loss": 0.8948, + "step": 16187 + }, + { + "epoch": 3.453071672354949, + "grad_norm": 0.42470499530660694, + "learning_rate": 9.5733442579009e-05, + "loss": 0.9222, + "step": 16188 + }, + { + "epoch": 3.4532849829351537, + "grad_norm": 0.3302181385383185, + "learning_rate": 9.57250989191524e-05, + "loss": 0.9183, + "step": 16189 + }, + { + "epoch": 3.4534982935153584, + "grad_norm": 0.23750957338579662, + "learning_rate": 9.571675508136609e-05, + "loss": 0.9034, + "step": 16190 + }, + { + "epoch": 3.453711604095563, + "grad_norm": 0.21461853879443019, + "learning_rate": 9.570841106574436e-05, + "loss": 0.9165, + "step": 16191 + }, + { + "epoch": 3.453924914675768, + "grad_norm": 0.2518016078581616, + "learning_rate": 9.570006687238175e-05, + "loss": 0.9044, + "step": 16192 + }, + { + "epoch": 3.4541382252559725, + "grad_norm": 0.2586247776671018, + "learning_rate": 9.569172250137258e-05, + "loss": 0.8847, + "step": 16193 + }, + { + "epoch": 3.4543515358361776, + "grad_norm": 0.24601547317886086, + "learning_rate": 9.568337795281134e-05, + "loss": 0.9285, + "step": 16194 + }, + { + "epoch": 3.4545648464163823, + "grad_norm": 0.20929786004437206, + "learning_rate": 9.567503322679239e-05, + "loss": 0.9285, + "step": 16195 + }, + { + "epoch": 3.454778156996587, + "grad_norm": 0.22421743256494162, + "learning_rate": 9.56666883234102e-05, + "loss": 0.9363, + "step": 16196 + }, + { + "epoch": 3.4549914675767917, + "grad_norm": 0.22109767354625082, + "learning_rate": 9.565834324275912e-05, + "loss": 0.9201, + "step": 16197 + }, + { + "epoch": 3.455204778156997, + "grad_norm": 0.24380303394360509, + "learning_rate": 9.564999798493367e-05, + "loss": 0.9309, + "step": 16198 + }, + { + "epoch": 3.4554180887372015, + "grad_norm": 0.21330660260168852, + "learning_rate": 9.56416525500282e-05, + "loss": 0.9278, + "step": 16199 + }, + { + "epoch": 3.455631399317406, + "grad_norm": 0.22077227122455373, + "learning_rate": 9.563330693813719e-05, + "loss": 0.9327, + "step": 16200 + }, + { + "epoch": 3.455844709897611, + "grad_norm": 0.2665340734848011, + "learning_rate": 9.562496114935504e-05, + "loss": 0.9422, + "step": 16201 + }, + { + "epoch": 3.4560580204778155, + "grad_norm": 0.231218333719128, + "learning_rate": 9.561661518377622e-05, + "loss": 0.9519, + "step": 16202 + }, + { + "epoch": 3.4562713310580206, + "grad_norm": 0.25121619685072544, + "learning_rate": 9.560826904149508e-05, + "loss": 0.9092, + "step": 16203 + }, + { + "epoch": 3.4564846416382253, + "grad_norm": 0.21255211706237837, + "learning_rate": 9.559992272260616e-05, + "loss": 0.9153, + "step": 16204 + }, + { + "epoch": 3.45669795221843, + "grad_norm": 0.17622442142258368, + "learning_rate": 9.559157622720381e-05, + "loss": 0.9153, + "step": 16205 + }, + { + "epoch": 3.4569112627986347, + "grad_norm": 0.27662292970192043, + "learning_rate": 9.558322955538257e-05, + "loss": 0.9043, + "step": 16206 + }, + { + "epoch": 3.45712457337884, + "grad_norm": 0.28117827879853446, + "learning_rate": 9.557488270723678e-05, + "loss": 0.9445, + "step": 16207 + }, + { + "epoch": 3.4573378839590445, + "grad_norm": 0.2274189415372487, + "learning_rate": 9.556653568286096e-05, + "loss": 0.9, + "step": 16208 + }, + { + "epoch": 3.457551194539249, + "grad_norm": 0.24790805037917601, + "learning_rate": 9.555818848234949e-05, + "loss": 0.8818, + "step": 16209 + }, + { + "epoch": 3.457764505119454, + "grad_norm": 0.2071470901788836, + "learning_rate": 9.554984110579687e-05, + "loss": 0.9033, + "step": 16210 + }, + { + "epoch": 3.4579778156996586, + "grad_norm": 0.21903547479921856, + "learning_rate": 9.554149355329753e-05, + "loss": 0.902, + "step": 16211 + }, + { + "epoch": 3.4581911262798632, + "grad_norm": 0.22160197008732924, + "learning_rate": 9.553314582494596e-05, + "loss": 0.9339, + "step": 16212 + }, + { + "epoch": 3.4584044368600684, + "grad_norm": 0.3368981271260609, + "learning_rate": 9.552479792083656e-05, + "loss": 0.918, + "step": 16213 + }, + { + "epoch": 3.458617747440273, + "grad_norm": 0.23707950202914532, + "learning_rate": 9.551644984106382e-05, + "loss": 0.9213, + "step": 16214 + }, + { + "epoch": 3.4588310580204777, + "grad_norm": 0.23514332494036294, + "learning_rate": 9.550810158572218e-05, + "loss": 0.9195, + "step": 16215 + }, + { + "epoch": 3.4590443686006824, + "grad_norm": 0.2499928695544366, + "learning_rate": 9.549975315490612e-05, + "loss": 0.9145, + "step": 16216 + }, + { + "epoch": 3.4592576791808876, + "grad_norm": 0.21503905662708528, + "learning_rate": 9.549140454871007e-05, + "loss": 0.9053, + "step": 16217 + }, + { + "epoch": 3.4594709897610922, + "grad_norm": 0.2372020208817401, + "learning_rate": 9.548305576722854e-05, + "loss": 0.9416, + "step": 16218 + }, + { + "epoch": 3.459684300341297, + "grad_norm": 0.3123152365864854, + "learning_rate": 9.547470681055597e-05, + "loss": 0.8952, + "step": 16219 + }, + { + "epoch": 3.4598976109215016, + "grad_norm": 0.31868509922419325, + "learning_rate": 9.546635767878684e-05, + "loss": 0.9245, + "step": 16220 + }, + { + "epoch": 3.4601109215017063, + "grad_norm": 0.2225552662473432, + "learning_rate": 9.545800837201558e-05, + "loss": 0.9201, + "step": 16221 + }, + { + "epoch": 3.4603242320819114, + "grad_norm": 0.254080328287532, + "learning_rate": 9.544965889033674e-05, + "loss": 0.9406, + "step": 16222 + }, + { + "epoch": 3.460537542662116, + "grad_norm": 0.19738469479982856, + "learning_rate": 9.544130923384473e-05, + "loss": 0.9354, + "step": 16223 + }, + { + "epoch": 3.460750853242321, + "grad_norm": 0.24094550997369535, + "learning_rate": 9.543295940263403e-05, + "loss": 0.9291, + "step": 16224 + }, + { + "epoch": 3.4609641638225255, + "grad_norm": 0.2787543677362625, + "learning_rate": 9.542460939679915e-05, + "loss": 0.905, + "step": 16225 + }, + { + "epoch": 3.4611774744027306, + "grad_norm": 0.24622242561424776, + "learning_rate": 9.541625921643455e-05, + "loss": 0.9343, + "step": 16226 + }, + { + "epoch": 3.4613907849829353, + "grad_norm": 0.18971174715044228, + "learning_rate": 9.540790886163472e-05, + "loss": 0.9286, + "step": 16227 + }, + { + "epoch": 3.46160409556314, + "grad_norm": 0.21474678072943618, + "learning_rate": 9.539955833249414e-05, + "loss": 0.9148, + "step": 16228 + }, + { + "epoch": 3.4618174061433447, + "grad_norm": 0.1929800959692385, + "learning_rate": 9.539120762910732e-05, + "loss": 0.9403, + "step": 16229 + }, + { + "epoch": 3.4620307167235493, + "grad_norm": 0.20268471455998882, + "learning_rate": 9.538285675156869e-05, + "loss": 0.931, + "step": 16230 + }, + { + "epoch": 3.462244027303754, + "grad_norm": 0.26005470581515133, + "learning_rate": 9.537450569997281e-05, + "loss": 0.9318, + "step": 16231 + }, + { + "epoch": 3.462457337883959, + "grad_norm": 0.22251692207934448, + "learning_rate": 9.536615447441412e-05, + "loss": 0.9262, + "step": 16232 + }, + { + "epoch": 3.462670648464164, + "grad_norm": 0.25773132963763873, + "learning_rate": 9.535780307498713e-05, + "loss": 0.9245, + "step": 16233 + }, + { + "epoch": 3.4628839590443685, + "grad_norm": 0.27826106419957264, + "learning_rate": 9.534945150178635e-05, + "loss": 0.9305, + "step": 16234 + }, + { + "epoch": 3.463097269624573, + "grad_norm": 0.22461368649940755, + "learning_rate": 9.534109975490627e-05, + "loss": 0.9419, + "step": 16235 + }, + { + "epoch": 3.4633105802047783, + "grad_norm": 0.19955082824844306, + "learning_rate": 9.533274783444137e-05, + "loss": 0.9418, + "step": 16236 + }, + { + "epoch": 3.463523890784983, + "grad_norm": 0.2567659125038565, + "learning_rate": 9.53243957404862e-05, + "loss": 0.9125, + "step": 16237 + }, + { + "epoch": 3.4637372013651877, + "grad_norm": 0.2049835361810622, + "learning_rate": 9.531604347313523e-05, + "loss": 0.897, + "step": 16238 + }, + { + "epoch": 3.4639505119453924, + "grad_norm": 0.18906277011797956, + "learning_rate": 9.530769103248293e-05, + "loss": 0.9039, + "step": 16239 + }, + { + "epoch": 3.464163822525597, + "grad_norm": 0.24237680872053124, + "learning_rate": 9.529933841862388e-05, + "loss": 0.915, + "step": 16240 + }, + { + "epoch": 3.464377133105802, + "grad_norm": 0.1800713586933588, + "learning_rate": 9.529098563165257e-05, + "loss": 0.9504, + "step": 16241 + }, + { + "epoch": 3.464590443686007, + "grad_norm": 0.22454247724731172, + "learning_rate": 9.528263267166347e-05, + "loss": 0.8969, + "step": 16242 + }, + { + "epoch": 3.4648037542662116, + "grad_norm": 0.26780609719170245, + "learning_rate": 9.527427953875117e-05, + "loss": 0.9363, + "step": 16243 + }, + { + "epoch": 3.4650170648464163, + "grad_norm": 0.21193833898127334, + "learning_rate": 9.526592623301011e-05, + "loss": 0.9342, + "step": 16244 + }, + { + "epoch": 3.4652303754266214, + "grad_norm": 0.20536652090206642, + "learning_rate": 9.525757275453481e-05, + "loss": 0.9184, + "step": 16245 + }, + { + "epoch": 3.465443686006826, + "grad_norm": 0.3044105683144326, + "learning_rate": 9.524921910341986e-05, + "loss": 0.9082, + "step": 16246 + }, + { + "epoch": 3.4656569965870307, + "grad_norm": 0.27035831221647455, + "learning_rate": 9.524086527975974e-05, + "loss": 0.9508, + "step": 16247 + }, + { + "epoch": 3.4658703071672354, + "grad_norm": 0.2152974637345174, + "learning_rate": 9.523251128364895e-05, + "loss": 0.9152, + "step": 16248 + }, + { + "epoch": 3.46608361774744, + "grad_norm": 0.2701572616997403, + "learning_rate": 9.522415711518204e-05, + "loss": 0.9141, + "step": 16249 + }, + { + "epoch": 3.4662969283276452, + "grad_norm": 0.24880645136837307, + "learning_rate": 9.521580277445355e-05, + "loss": 0.8961, + "step": 16250 + }, + { + "epoch": 3.46651023890785, + "grad_norm": 0.23767612092060636, + "learning_rate": 9.5207448261558e-05, + "loss": 0.9066, + "step": 16251 + }, + { + "epoch": 3.4667235494880546, + "grad_norm": 0.2479164809352497, + "learning_rate": 9.51990935765899e-05, + "loss": 0.9326, + "step": 16252 + }, + { + "epoch": 3.4669368600682593, + "grad_norm": 0.200154028300068, + "learning_rate": 9.519073871964381e-05, + "loss": 0.9222, + "step": 16253 + }, + { + "epoch": 3.467150170648464, + "grad_norm": 0.21701805479166955, + "learning_rate": 9.518238369081424e-05, + "loss": 0.9286, + "step": 16254 + }, + { + "epoch": 3.467363481228669, + "grad_norm": 0.19997380769269682, + "learning_rate": 9.517402849019576e-05, + "loss": 0.9004, + "step": 16255 + }, + { + "epoch": 3.467576791808874, + "grad_norm": 0.23933539071562776, + "learning_rate": 9.516567311788289e-05, + "loss": 0.9295, + "step": 16256 + }, + { + "epoch": 3.4677901023890785, + "grad_norm": 0.2014206530464684, + "learning_rate": 9.515731757397017e-05, + "loss": 0.8969, + "step": 16257 + }, + { + "epoch": 3.468003412969283, + "grad_norm": 0.22272216219544466, + "learning_rate": 9.514896185855215e-05, + "loss": 0.9181, + "step": 16258 + }, + { + "epoch": 3.468216723549488, + "grad_norm": 0.18769003821210137, + "learning_rate": 9.514060597172338e-05, + "loss": 0.8868, + "step": 16259 + }, + { + "epoch": 3.468430034129693, + "grad_norm": 0.21762617044081384, + "learning_rate": 9.513224991357837e-05, + "loss": 0.9001, + "step": 16260 + }, + { + "epoch": 3.4686433447098977, + "grad_norm": 0.19131503078812445, + "learning_rate": 9.512389368421172e-05, + "loss": 0.9015, + "step": 16261 + }, + { + "epoch": 3.4688566552901023, + "grad_norm": 0.19234459759892236, + "learning_rate": 9.511553728371796e-05, + "loss": 0.9196, + "step": 16262 + }, + { + "epoch": 3.469069965870307, + "grad_norm": 0.2008296477672604, + "learning_rate": 9.510718071219165e-05, + "loss": 0.9197, + "step": 16263 + }, + { + "epoch": 3.469283276450512, + "grad_norm": 0.22747816760575973, + "learning_rate": 9.50988239697273e-05, + "loss": 0.9094, + "step": 16264 + }, + { + "epoch": 3.469496587030717, + "grad_norm": 0.1947037581153705, + "learning_rate": 9.509046705641954e-05, + "loss": 0.9146, + "step": 16265 + }, + { + "epoch": 3.4697098976109215, + "grad_norm": 0.21539977985444028, + "learning_rate": 9.508210997236288e-05, + "loss": 0.918, + "step": 16266 + }, + { + "epoch": 3.469923208191126, + "grad_norm": 0.23372833335971485, + "learning_rate": 9.507375271765188e-05, + "loss": 0.9684, + "step": 16267 + }, + { + "epoch": 3.470136518771331, + "grad_norm": 0.22524087908569287, + "learning_rate": 9.506539529238114e-05, + "loss": 0.9151, + "step": 16268 + }, + { + "epoch": 3.470349829351536, + "grad_norm": 0.22428655849930026, + "learning_rate": 9.505703769664518e-05, + "loss": 0.9261, + "step": 16269 + }, + { + "epoch": 3.4705631399317407, + "grad_norm": 0.2225891171102543, + "learning_rate": 9.504867993053858e-05, + "loss": 0.8969, + "step": 16270 + }, + { + "epoch": 3.4707764505119454, + "grad_norm": 0.197412511251734, + "learning_rate": 9.504032199415593e-05, + "loss": 0.9212, + "step": 16271 + }, + { + "epoch": 3.47098976109215, + "grad_norm": 0.21391157625970675, + "learning_rate": 9.503196388759176e-05, + "loss": 0.9475, + "step": 16272 + }, + { + "epoch": 3.4712030716723548, + "grad_norm": 0.2328196188493763, + "learning_rate": 9.50236056109407e-05, + "loss": 0.9377, + "step": 16273 + }, + { + "epoch": 3.47141638225256, + "grad_norm": 0.23487647406800807, + "learning_rate": 9.501524716429727e-05, + "loss": 0.9168, + "step": 16274 + }, + { + "epoch": 3.4716296928327646, + "grad_norm": 0.2017616576866866, + "learning_rate": 9.500688854775606e-05, + "loss": 0.9325, + "step": 16275 + }, + { + "epoch": 3.4718430034129693, + "grad_norm": 0.2307095736120444, + "learning_rate": 9.499852976141166e-05, + "loss": 0.9317, + "step": 16276 + }, + { + "epoch": 3.472056313993174, + "grad_norm": 0.291497468217418, + "learning_rate": 9.499017080535864e-05, + "loss": 0.9213, + "step": 16277 + }, + { + "epoch": 3.4722696245733786, + "grad_norm": 0.22352062339038273, + "learning_rate": 9.498181167969158e-05, + "loss": 0.9153, + "step": 16278 + }, + { + "epoch": 3.4724829351535837, + "grad_norm": 0.2141329643882877, + "learning_rate": 9.497345238450509e-05, + "loss": 0.9285, + "step": 16279 + }, + { + "epoch": 3.4726962457337884, + "grad_norm": 0.3336409160824078, + "learning_rate": 9.496509291989371e-05, + "loss": 0.9214, + "step": 16280 + }, + { + "epoch": 3.472909556313993, + "grad_norm": 0.26481973943534265, + "learning_rate": 9.495673328595206e-05, + "loss": 0.9257, + "step": 16281 + }, + { + "epoch": 3.473122866894198, + "grad_norm": 0.184361271568094, + "learning_rate": 9.494837348277474e-05, + "loss": 0.9171, + "step": 16282 + }, + { + "epoch": 3.473336177474403, + "grad_norm": 0.2592259026285455, + "learning_rate": 9.49400135104563e-05, + "loss": 0.9262, + "step": 16283 + }, + { + "epoch": 3.4735494880546076, + "grad_norm": 0.1904988789523786, + "learning_rate": 9.493165336909135e-05, + "loss": 0.8802, + "step": 16284 + }, + { + "epoch": 3.4737627986348123, + "grad_norm": 0.17318116074631662, + "learning_rate": 9.492329305877454e-05, + "loss": 0.9508, + "step": 16285 + }, + { + "epoch": 3.473976109215017, + "grad_norm": 0.15745230510200267, + "learning_rate": 9.491493257960037e-05, + "loss": 0.9535, + "step": 16286 + }, + { + "epoch": 3.4741894197952217, + "grad_norm": 0.18715110329804435, + "learning_rate": 9.49065719316635e-05, + "loss": 0.9166, + "step": 16287 + }, + { + "epoch": 3.474402730375427, + "grad_norm": 0.19360707583152625, + "learning_rate": 9.489821111505851e-05, + "loss": 0.9526, + "step": 16288 + }, + { + "epoch": 3.4746160409556315, + "grad_norm": 0.213189628307777, + "learning_rate": 9.488985012988002e-05, + "loss": 0.9584, + "step": 16289 + }, + { + "epoch": 3.474829351535836, + "grad_norm": 0.19912938814242515, + "learning_rate": 9.488148897622262e-05, + "loss": 0.9045, + "step": 16290 + }, + { + "epoch": 3.475042662116041, + "grad_norm": 0.18540044012414947, + "learning_rate": 9.487312765418094e-05, + "loss": 0.931, + "step": 16291 + }, + { + "epoch": 3.475255972696246, + "grad_norm": 0.1964025297866963, + "learning_rate": 9.486476616384955e-05, + "loss": 0.9049, + "step": 16292 + }, + { + "epoch": 3.4754692832764507, + "grad_norm": 0.16673110739226019, + "learning_rate": 9.485640450532309e-05, + "loss": 0.9095, + "step": 16293 + }, + { + "epoch": 3.4756825938566553, + "grad_norm": 0.1973997893755743, + "learning_rate": 9.484804267869615e-05, + "loss": 0.9383, + "step": 16294 + }, + { + "epoch": 3.47589590443686, + "grad_norm": 0.19666985751718027, + "learning_rate": 9.483968068406336e-05, + "loss": 0.8957, + "step": 16295 + }, + { + "epoch": 3.4761092150170647, + "grad_norm": 0.22313440271939983, + "learning_rate": 9.483131852151935e-05, + "loss": 0.9099, + "step": 16296 + }, + { + "epoch": 3.4763225255972694, + "grad_norm": 0.23678817083204717, + "learning_rate": 9.482295619115873e-05, + "loss": 0.9162, + "step": 16297 + }, + { + "epoch": 3.4765358361774745, + "grad_norm": 0.18186617879516356, + "learning_rate": 9.481459369307608e-05, + "loss": 0.9325, + "step": 16298 + }, + { + "epoch": 3.476749146757679, + "grad_norm": 0.19458136671530096, + "learning_rate": 9.480623102736607e-05, + "loss": 0.8945, + "step": 16299 + }, + { + "epoch": 3.476962457337884, + "grad_norm": 0.19401022426978812, + "learning_rate": 9.479786819412329e-05, + "loss": 0.9289, + "step": 16300 + }, + { + "epoch": 3.4771757679180886, + "grad_norm": 0.19416848603071737, + "learning_rate": 9.478950519344241e-05, + "loss": 0.8939, + "step": 16301 + }, + { + "epoch": 3.4773890784982937, + "grad_norm": 0.1713183929115882, + "learning_rate": 9.478114202541801e-05, + "loss": 0.899, + "step": 16302 + }, + { + "epoch": 3.4776023890784984, + "grad_norm": 0.18747438237266445, + "learning_rate": 9.477277869014476e-05, + "loss": 0.926, + "step": 16303 + }, + { + "epoch": 3.477815699658703, + "grad_norm": 0.19252247417315416, + "learning_rate": 9.476441518771724e-05, + "loss": 0.9214, + "step": 16304 + }, + { + "epoch": 3.4780290102389078, + "grad_norm": 0.22932128966486204, + "learning_rate": 9.475605151823014e-05, + "loss": 0.8923, + "step": 16305 + }, + { + "epoch": 3.4782423208191124, + "grad_norm": 0.21878223768875105, + "learning_rate": 9.474768768177804e-05, + "loss": 0.9323, + "step": 16306 + }, + { + "epoch": 3.4784556313993176, + "grad_norm": 0.19321332142365535, + "learning_rate": 9.473932367845564e-05, + "loss": 0.9213, + "step": 16307 + }, + { + "epoch": 3.4786689419795223, + "grad_norm": 0.2075016289458637, + "learning_rate": 9.473095950835753e-05, + "loss": 0.9316, + "step": 16308 + }, + { + "epoch": 3.478882252559727, + "grad_norm": 0.2186551313928091, + "learning_rate": 9.472259517157837e-05, + "loss": 0.8962, + "step": 16309 + }, + { + "epoch": 3.4790955631399316, + "grad_norm": 0.1988139956507641, + "learning_rate": 9.471423066821278e-05, + "loss": 0.9082, + "step": 16310 + }, + { + "epoch": 3.4793088737201368, + "grad_norm": 0.29865542955755997, + "learning_rate": 9.470586599835543e-05, + "loss": 0.9565, + "step": 16311 + }, + { + "epoch": 3.4795221843003414, + "grad_norm": 0.3188007576500055, + "learning_rate": 9.469750116210096e-05, + "loss": 0.9099, + "step": 16312 + }, + { + "epoch": 3.479735494880546, + "grad_norm": 0.28860577399390647, + "learning_rate": 9.468913615954404e-05, + "loss": 0.9148, + "step": 16313 + }, + { + "epoch": 3.479948805460751, + "grad_norm": 0.2875614208832122, + "learning_rate": 9.468077099077926e-05, + "loss": 0.8996, + "step": 16314 + }, + { + "epoch": 3.4801621160409555, + "grad_norm": 0.26113528725689805, + "learning_rate": 9.467240565590134e-05, + "loss": 0.917, + "step": 16315 + }, + { + "epoch": 3.48037542662116, + "grad_norm": 0.250892082387786, + "learning_rate": 9.466404015500488e-05, + "loss": 0.9015, + "step": 16316 + }, + { + "epoch": 3.4805887372013653, + "grad_norm": 0.26427842242840854, + "learning_rate": 9.465567448818456e-05, + "loss": 0.951, + "step": 16317 + }, + { + "epoch": 3.48080204778157, + "grad_norm": 0.3110053306501365, + "learning_rate": 9.464730865553507e-05, + "loss": 0.8872, + "step": 16318 + }, + { + "epoch": 3.4810153583617747, + "grad_norm": 0.33188840366785405, + "learning_rate": 9.463894265715102e-05, + "loss": 0.9194, + "step": 16319 + }, + { + "epoch": 3.4812286689419794, + "grad_norm": 0.3652877926949079, + "learning_rate": 9.463057649312706e-05, + "loss": 0.9269, + "step": 16320 + }, + { + "epoch": 3.4814419795221845, + "grad_norm": 0.27969317237409314, + "learning_rate": 9.462221016355792e-05, + "loss": 0.9014, + "step": 16321 + }, + { + "epoch": 3.481655290102389, + "grad_norm": 0.18709885715514174, + "learning_rate": 9.461384366853819e-05, + "loss": 0.9451, + "step": 16322 + }, + { + "epoch": 3.481868600682594, + "grad_norm": 0.27390584655068206, + "learning_rate": 9.460547700816259e-05, + "loss": 0.9342, + "step": 16323 + }, + { + "epoch": 3.4820819112627985, + "grad_norm": 0.37694682787516715, + "learning_rate": 9.459711018252577e-05, + "loss": 0.9245, + "step": 16324 + }, + { + "epoch": 3.482295221843003, + "grad_norm": 0.3600015302045724, + "learning_rate": 9.458874319172243e-05, + "loss": 0.9211, + "step": 16325 + }, + { + "epoch": 3.4825085324232083, + "grad_norm": 0.31323742048043285, + "learning_rate": 9.458037603584716e-05, + "loss": 0.9229, + "step": 16326 + }, + { + "epoch": 3.482721843003413, + "grad_norm": 0.27468369506770257, + "learning_rate": 9.457200871499475e-05, + "loss": 0.9311, + "step": 16327 + }, + { + "epoch": 3.4829351535836177, + "grad_norm": 0.16656964254882708, + "learning_rate": 9.456364122925976e-05, + "loss": 0.8932, + "step": 16328 + }, + { + "epoch": 3.4831484641638224, + "grad_norm": 0.2669722392357657, + "learning_rate": 9.455527357873698e-05, + "loss": 0.9281, + "step": 16329 + }, + { + "epoch": 3.4833617747440275, + "grad_norm": 0.3259524805854258, + "learning_rate": 9.454690576352099e-05, + "loss": 0.9305, + "step": 16330 + }, + { + "epoch": 3.483575085324232, + "grad_norm": 0.24779928633343004, + "learning_rate": 9.453853778370655e-05, + "loss": 0.8991, + "step": 16331 + }, + { + "epoch": 3.483788395904437, + "grad_norm": 0.1671986369901524, + "learning_rate": 9.453016963938826e-05, + "loss": 0.8831, + "step": 16332 + }, + { + "epoch": 3.4840017064846416, + "grad_norm": 0.23734855459902218, + "learning_rate": 9.452180133066091e-05, + "loss": 0.9394, + "step": 16333 + }, + { + "epoch": 3.4842150170648463, + "grad_norm": 0.19674710893230693, + "learning_rate": 9.451343285761909e-05, + "loss": 0.9201, + "step": 16334 + }, + { + "epoch": 3.484428327645051, + "grad_norm": 0.22029679360200435, + "learning_rate": 9.450506422035759e-05, + "loss": 0.914, + "step": 16335 + }, + { + "epoch": 3.484641638225256, + "grad_norm": 0.22060903463816278, + "learning_rate": 9.449669541897098e-05, + "loss": 0.9126, + "step": 16336 + }, + { + "epoch": 3.4848549488054608, + "grad_norm": 0.23137018456160635, + "learning_rate": 9.448832645355407e-05, + "loss": 0.9341, + "step": 16337 + }, + { + "epoch": 3.4850682593856654, + "grad_norm": 0.3681166113394351, + "learning_rate": 9.447995732420145e-05, + "loss": 0.936, + "step": 16338 + }, + { + "epoch": 3.48528156996587, + "grad_norm": 0.1863235787719258, + "learning_rate": 9.447158803100792e-05, + "loss": 0.9151, + "step": 16339 + }, + { + "epoch": 3.4854948805460753, + "grad_norm": 0.16807581871717434, + "learning_rate": 9.446321857406809e-05, + "loss": 0.9354, + "step": 16340 + }, + { + "epoch": 3.48570819112628, + "grad_norm": 0.17266290421613267, + "learning_rate": 9.445484895347675e-05, + "loss": 0.8864, + "step": 16341 + }, + { + "epoch": 3.4859215017064846, + "grad_norm": 0.24136126119504303, + "learning_rate": 9.444647916932851e-05, + "loss": 0.916, + "step": 16342 + }, + { + "epoch": 3.4861348122866893, + "grad_norm": 0.25493763569013295, + "learning_rate": 9.44381092217181e-05, + "loss": 0.9321, + "step": 16343 + }, + { + "epoch": 3.486348122866894, + "grad_norm": 0.32169662778899916, + "learning_rate": 9.442973911074026e-05, + "loss": 0.9075, + "step": 16344 + }, + { + "epoch": 3.486561433447099, + "grad_norm": 0.2370846668335913, + "learning_rate": 9.44213688364897e-05, + "loss": 0.9083, + "step": 16345 + }, + { + "epoch": 3.486774744027304, + "grad_norm": 0.21070725688173483, + "learning_rate": 9.441299839906109e-05, + "loss": 0.9137, + "step": 16346 + }, + { + "epoch": 3.4869880546075085, + "grad_norm": 0.2994189572015661, + "learning_rate": 9.440462779854918e-05, + "loss": 0.952, + "step": 16347 + }, + { + "epoch": 3.487201365187713, + "grad_norm": 0.2988646585596652, + "learning_rate": 9.439625703504862e-05, + "loss": 0.9446, + "step": 16348 + }, + { + "epoch": 3.4874146757679183, + "grad_norm": 0.3333831892907063, + "learning_rate": 9.43878861086542e-05, + "loss": 0.9256, + "step": 16349 + }, + { + "epoch": 3.487627986348123, + "grad_norm": 0.3227838056644848, + "learning_rate": 9.43795150194606e-05, + "loss": 0.9015, + "step": 16350 + }, + { + "epoch": 3.4878412969283277, + "grad_norm": 0.2640926502552899, + "learning_rate": 9.437114376756255e-05, + "loss": 0.938, + "step": 16351 + }, + { + "epoch": 3.4880546075085324, + "grad_norm": 0.2553795305937057, + "learning_rate": 9.436277235305478e-05, + "loss": 0.9146, + "step": 16352 + }, + { + "epoch": 3.488267918088737, + "grad_norm": 0.35120410271716895, + "learning_rate": 9.435440077603197e-05, + "loss": 0.9096, + "step": 16353 + }, + { + "epoch": 3.488481228668942, + "grad_norm": 0.24290703060317498, + "learning_rate": 9.434602903658888e-05, + "loss": 0.9166, + "step": 16354 + }, + { + "epoch": 3.488694539249147, + "grad_norm": 0.2453760078190787, + "learning_rate": 9.433765713482026e-05, + "loss": 0.896, + "step": 16355 + }, + { + "epoch": 3.4889078498293515, + "grad_norm": 0.22866305828101396, + "learning_rate": 9.432928507082077e-05, + "loss": 0.8841, + "step": 16356 + }, + { + "epoch": 3.489121160409556, + "grad_norm": 0.23809435798304968, + "learning_rate": 9.432091284468521e-05, + "loss": 0.9741, + "step": 16357 + }, + { + "epoch": 3.489334470989761, + "grad_norm": 0.27015875273209294, + "learning_rate": 9.431254045650826e-05, + "loss": 0.9325, + "step": 16358 + }, + { + "epoch": 3.489547781569966, + "grad_norm": 0.284138405716332, + "learning_rate": 9.430416790638469e-05, + "loss": 0.8955, + "step": 16359 + }, + { + "epoch": 3.4897610921501707, + "grad_norm": 0.30114451133014924, + "learning_rate": 9.429579519440919e-05, + "loss": 0.9272, + "step": 16360 + }, + { + "epoch": 3.4899744027303754, + "grad_norm": 0.21512595442616195, + "learning_rate": 9.428742232067654e-05, + "loss": 0.9233, + "step": 16361 + }, + { + "epoch": 3.49018771331058, + "grad_norm": 0.31128879260407216, + "learning_rate": 9.427904928528147e-05, + "loss": 0.9011, + "step": 16362 + }, + { + "epoch": 3.4904010238907848, + "grad_norm": 0.29260298207023666, + "learning_rate": 9.427067608831874e-05, + "loss": 0.9065, + "step": 16363 + }, + { + "epoch": 3.49061433447099, + "grad_norm": 0.24352118076298157, + "learning_rate": 9.426230272988303e-05, + "loss": 0.9166, + "step": 16364 + }, + { + "epoch": 3.4908276450511946, + "grad_norm": 0.23687127513403636, + "learning_rate": 9.425392921006917e-05, + "loss": 0.8866, + "step": 16365 + }, + { + "epoch": 3.4910409556313993, + "grad_norm": 0.23056201835438137, + "learning_rate": 9.424555552897181e-05, + "loss": 0.9262, + "step": 16366 + }, + { + "epoch": 3.491254266211604, + "grad_norm": 0.22270282743067427, + "learning_rate": 9.423718168668578e-05, + "loss": 0.9301, + "step": 16367 + }, + { + "epoch": 3.491467576791809, + "grad_norm": 0.18746445314257146, + "learning_rate": 9.422880768330578e-05, + "loss": 0.9136, + "step": 16368 + }, + { + "epoch": 3.4916808873720138, + "grad_norm": 0.2782427743133095, + "learning_rate": 9.422043351892662e-05, + "loss": 0.9011, + "step": 16369 + }, + { + "epoch": 3.4918941979522184, + "grad_norm": 0.31601614349038376, + "learning_rate": 9.421205919364297e-05, + "loss": 0.9404, + "step": 16370 + }, + { + "epoch": 3.492107508532423, + "grad_norm": 0.2837787626511694, + "learning_rate": 9.420368470754966e-05, + "loss": 0.9093, + "step": 16371 + }, + { + "epoch": 3.492320819112628, + "grad_norm": 0.2033009733420747, + "learning_rate": 9.419531006074141e-05, + "loss": 0.9392, + "step": 16372 + }, + { + "epoch": 3.492534129692833, + "grad_norm": 0.19892978003354597, + "learning_rate": 9.418693525331301e-05, + "loss": 0.9294, + "step": 16373 + }, + { + "epoch": 3.4927474402730376, + "grad_norm": 0.21349373017285644, + "learning_rate": 9.417856028535916e-05, + "loss": 0.9329, + "step": 16374 + }, + { + "epoch": 3.4929607508532423, + "grad_norm": 0.2423059453465231, + "learning_rate": 9.41701851569747e-05, + "loss": 0.8809, + "step": 16375 + }, + { + "epoch": 3.493174061433447, + "grad_norm": 0.3293821707789319, + "learning_rate": 9.416180986825432e-05, + "loss": 0.9237, + "step": 16376 + }, + { + "epoch": 3.493387372013652, + "grad_norm": 0.33786614171312457, + "learning_rate": 9.415343441929285e-05, + "loss": 0.9167, + "step": 16377 + }, + { + "epoch": 3.493600682593857, + "grad_norm": 0.27442960823995527, + "learning_rate": 9.4145058810185e-05, + "loss": 0.9214, + "step": 16378 + }, + { + "epoch": 3.4938139931740615, + "grad_norm": 0.1709364525517389, + "learning_rate": 9.41366830410256e-05, + "loss": 0.897, + "step": 16379 + }, + { + "epoch": 3.494027303754266, + "grad_norm": 0.2815705599439172, + "learning_rate": 9.41283071119094e-05, + "loss": 0.9238, + "step": 16380 + }, + { + "epoch": 3.494240614334471, + "grad_norm": 0.30898530488407616, + "learning_rate": 9.411993102293117e-05, + "loss": 0.922, + "step": 16381 + }, + { + "epoch": 3.4944539249146755, + "grad_norm": 0.4269648247229623, + "learning_rate": 9.411155477418564e-05, + "loss": 0.9213, + "step": 16382 + }, + { + "epoch": 3.4946672354948807, + "grad_norm": 0.25571788821414587, + "learning_rate": 9.410317836576768e-05, + "loss": 0.9045, + "step": 16383 + }, + { + "epoch": 3.4948805460750854, + "grad_norm": 0.2691780616923264, + "learning_rate": 9.409480179777197e-05, + "loss": 0.9161, + "step": 16384 + }, + { + "epoch": 3.49509385665529, + "grad_norm": 0.21419170535631274, + "learning_rate": 9.408642507029338e-05, + "loss": 0.8805, + "step": 16385 + }, + { + "epoch": 3.4953071672354947, + "grad_norm": 0.24384895792588673, + "learning_rate": 9.407804818342664e-05, + "loss": 0.9262, + "step": 16386 + }, + { + "epoch": 3.4955204778157, + "grad_norm": 0.25157196847797236, + "learning_rate": 9.406967113726655e-05, + "loss": 0.9162, + "step": 16387 + }, + { + "epoch": 3.4957337883959045, + "grad_norm": 0.24260940576453827, + "learning_rate": 9.406129393190789e-05, + "loss": 0.9318, + "step": 16388 + }, + { + "epoch": 3.4959470989761092, + "grad_norm": 0.20546066031060287, + "learning_rate": 9.405291656744546e-05, + "loss": 0.9068, + "step": 16389 + }, + { + "epoch": 3.496160409556314, + "grad_norm": 0.2358818054482151, + "learning_rate": 9.404453904397402e-05, + "loss": 0.9488, + "step": 16390 + }, + { + "epoch": 3.4963737201365186, + "grad_norm": 0.25895164410772664, + "learning_rate": 9.403616136158843e-05, + "loss": 0.9217, + "step": 16391 + }, + { + "epoch": 3.4965870307167237, + "grad_norm": 0.2560770167136468, + "learning_rate": 9.402778352038341e-05, + "loss": 0.8952, + "step": 16392 + }, + { + "epoch": 3.4968003412969284, + "grad_norm": 0.209831734262831, + "learning_rate": 9.401940552045377e-05, + "loss": 0.9171, + "step": 16393 + }, + { + "epoch": 3.497013651877133, + "grad_norm": 0.1986466898160466, + "learning_rate": 9.401102736189433e-05, + "loss": 0.8908, + "step": 16394 + }, + { + "epoch": 3.4972269624573378, + "grad_norm": 0.20982138139882003, + "learning_rate": 9.400264904479988e-05, + "loss": 0.9345, + "step": 16395 + }, + { + "epoch": 3.497440273037543, + "grad_norm": 0.22928872724938962, + "learning_rate": 9.399427056926522e-05, + "loss": 0.9218, + "step": 16396 + }, + { + "epoch": 3.4976535836177476, + "grad_norm": 0.18693479507261881, + "learning_rate": 9.398589193538518e-05, + "loss": 0.9003, + "step": 16397 + }, + { + "epoch": 3.4978668941979523, + "grad_norm": 0.19729920876114562, + "learning_rate": 9.397751314325452e-05, + "loss": 0.8787, + "step": 16398 + }, + { + "epoch": 3.498080204778157, + "grad_norm": 0.18177784766500302, + "learning_rate": 9.396913419296805e-05, + "loss": 0.9018, + "step": 16399 + }, + { + "epoch": 3.4982935153583616, + "grad_norm": 0.18394910174132828, + "learning_rate": 9.396075508462059e-05, + "loss": 0.9478, + "step": 16400 + }, + { + "epoch": 3.4985068259385663, + "grad_norm": 0.18000110828234878, + "learning_rate": 9.395237581830697e-05, + "loss": 0.8942, + "step": 16401 + }, + { + "epoch": 3.4987201365187715, + "grad_norm": 0.17431519124820222, + "learning_rate": 9.394399639412199e-05, + "loss": 0.8921, + "step": 16402 + }, + { + "epoch": 3.498933447098976, + "grad_norm": 0.1780396697876723, + "learning_rate": 9.393561681216044e-05, + "loss": 0.9308, + "step": 16403 + }, + { + "epoch": 3.499146757679181, + "grad_norm": 0.21071460190715197, + "learning_rate": 9.392723707251715e-05, + "loss": 0.9197, + "step": 16404 + }, + { + "epoch": 3.4993600682593855, + "grad_norm": 0.24093177169572036, + "learning_rate": 9.391885717528693e-05, + "loss": 0.9159, + "step": 16405 + }, + { + "epoch": 3.4995733788395906, + "grad_norm": 0.2251327536644065, + "learning_rate": 9.391047712056462e-05, + "loss": 0.9172, + "step": 16406 + }, + { + "epoch": 3.4997866894197953, + "grad_norm": 0.22062575346601634, + "learning_rate": 9.390209690844502e-05, + "loss": 0.8865, + "step": 16407 + }, + { + "epoch": 3.5, + "grad_norm": 0.20771077919957948, + "learning_rate": 9.389371653902298e-05, + "loss": 0.9196, + "step": 16408 + }, + { + "epoch": 3.5002133105802047, + "grad_norm": 0.17174145593057286, + "learning_rate": 9.38853360123933e-05, + "loss": 0.8983, + "step": 16409 + }, + { + "epoch": 3.5004266211604094, + "grad_norm": 0.2289494980203451, + "learning_rate": 9.387695532865078e-05, + "loss": 0.9295, + "step": 16410 + }, + { + "epoch": 3.5006399317406145, + "grad_norm": 0.2819611465997033, + "learning_rate": 9.386857448789031e-05, + "loss": 0.8792, + "step": 16411 + }, + { + "epoch": 3.500853242320819, + "grad_norm": 0.28890371739062193, + "learning_rate": 9.386019349020664e-05, + "loss": 0.9381, + "step": 16412 + }, + { + "epoch": 3.501066552901024, + "grad_norm": 0.2694081192597696, + "learning_rate": 9.38518123356947e-05, + "loss": 0.9236, + "step": 16413 + }, + { + "epoch": 3.5012798634812285, + "grad_norm": 0.25616189758425106, + "learning_rate": 9.384343102444926e-05, + "loss": 0.9273, + "step": 16414 + }, + { + "epoch": 3.5014931740614337, + "grad_norm": 0.2456352452355817, + "learning_rate": 9.383504955656515e-05, + "loss": 0.9274, + "step": 16415 + }, + { + "epoch": 3.5017064846416384, + "grad_norm": 0.18713744152894232, + "learning_rate": 9.382666793213722e-05, + "loss": 0.8837, + "step": 16416 + }, + { + "epoch": 3.501919795221843, + "grad_norm": 0.14881679155179442, + "learning_rate": 9.381828615126034e-05, + "loss": 0.8956, + "step": 16417 + }, + { + "epoch": 3.5021331058020477, + "grad_norm": 0.17466241868411295, + "learning_rate": 9.380990421402926e-05, + "loss": 0.9123, + "step": 16418 + }, + { + "epoch": 3.5023464163822524, + "grad_norm": 0.1727939991909462, + "learning_rate": 9.380152212053895e-05, + "loss": 0.9078, + "step": 16419 + }, + { + "epoch": 3.502559726962457, + "grad_norm": 0.22053465247157467, + "learning_rate": 9.379313987088414e-05, + "loss": 0.9234, + "step": 16420 + }, + { + "epoch": 3.5027730375426622, + "grad_norm": 0.24424595477802913, + "learning_rate": 9.378475746515974e-05, + "loss": 0.9091, + "step": 16421 + }, + { + "epoch": 3.502986348122867, + "grad_norm": 0.24102025719348943, + "learning_rate": 9.377637490346056e-05, + "loss": 0.9203, + "step": 16422 + }, + { + "epoch": 3.5031996587030716, + "grad_norm": 0.27740301143915863, + "learning_rate": 9.376799218588148e-05, + "loss": 0.9125, + "step": 16423 + }, + { + "epoch": 3.5034129692832767, + "grad_norm": 0.28730765440358963, + "learning_rate": 9.375960931251732e-05, + "loss": 0.9365, + "step": 16424 + }, + { + "epoch": 3.5036262798634814, + "grad_norm": 0.3321221275772309, + "learning_rate": 9.3751226283463e-05, + "loss": 0.9066, + "step": 16425 + }, + { + "epoch": 3.503839590443686, + "grad_norm": 0.2171451584312355, + "learning_rate": 9.374284309881325e-05, + "loss": 0.8982, + "step": 16426 + }, + { + "epoch": 3.5040529010238908, + "grad_norm": 0.2653304006308943, + "learning_rate": 9.373445975866306e-05, + "loss": 0.9049, + "step": 16427 + }, + { + "epoch": 3.5042662116040955, + "grad_norm": 0.3427568740010789, + "learning_rate": 9.372607626310719e-05, + "loss": 0.9367, + "step": 16428 + }, + { + "epoch": 3.5044795221843, + "grad_norm": 0.3391691002157982, + "learning_rate": 9.371769261224055e-05, + "loss": 0.9488, + "step": 16429 + }, + { + "epoch": 3.5046928327645053, + "grad_norm": 0.2664033642030548, + "learning_rate": 9.370930880615799e-05, + "loss": 0.9058, + "step": 16430 + }, + { + "epoch": 3.50490614334471, + "grad_norm": 0.17866705781923597, + "learning_rate": 9.370092484495438e-05, + "loss": 0.9163, + "step": 16431 + }, + { + "epoch": 3.5051194539249146, + "grad_norm": 0.2442131138840779, + "learning_rate": 9.369254072872453e-05, + "loss": 0.9448, + "step": 16432 + }, + { + "epoch": 3.5053327645051193, + "grad_norm": 0.23242433270771232, + "learning_rate": 9.368415645756341e-05, + "loss": 0.9239, + "step": 16433 + }, + { + "epoch": 3.5055460750853245, + "grad_norm": 0.19181970624985423, + "learning_rate": 9.367577203156578e-05, + "loss": 0.9089, + "step": 16434 + }, + { + "epoch": 3.505759385665529, + "grad_norm": 0.23545002447163244, + "learning_rate": 9.366738745082659e-05, + "loss": 0.9323, + "step": 16435 + }, + { + "epoch": 3.505972696245734, + "grad_norm": 0.26584552727556404, + "learning_rate": 9.365900271544068e-05, + "loss": 0.9142, + "step": 16436 + }, + { + "epoch": 3.5061860068259385, + "grad_norm": 0.2184491290762037, + "learning_rate": 9.365061782550293e-05, + "loss": 0.924, + "step": 16437 + }, + { + "epoch": 3.506399317406143, + "grad_norm": 0.22323155932559297, + "learning_rate": 9.364223278110817e-05, + "loss": 0.9025, + "step": 16438 + }, + { + "epoch": 3.506612627986348, + "grad_norm": 0.2529414955034027, + "learning_rate": 9.363384758235135e-05, + "loss": 0.911, + "step": 16439 + }, + { + "epoch": 3.506825938566553, + "grad_norm": 0.2642507388699785, + "learning_rate": 9.36254622293273e-05, + "loss": 0.8904, + "step": 16440 + }, + { + "epoch": 3.5070392491467577, + "grad_norm": 0.18064514752258748, + "learning_rate": 9.361707672213092e-05, + "loss": 0.9158, + "step": 16441 + }, + { + "epoch": 3.5072525597269624, + "grad_norm": 0.24956002950734008, + "learning_rate": 9.360869106085711e-05, + "loss": 0.9028, + "step": 16442 + }, + { + "epoch": 3.5074658703071675, + "grad_norm": 0.22509390130955698, + "learning_rate": 9.360030524560071e-05, + "loss": 0.8971, + "step": 16443 + }, + { + "epoch": 3.507679180887372, + "grad_norm": 0.2443832653349523, + "learning_rate": 9.359191927645664e-05, + "loss": 0.9077, + "step": 16444 + }, + { + "epoch": 3.507892491467577, + "grad_norm": 0.2300866193942796, + "learning_rate": 9.358353315351976e-05, + "loss": 0.9273, + "step": 16445 + }, + { + "epoch": 3.5081058020477816, + "grad_norm": 0.20876385462164138, + "learning_rate": 9.357514687688497e-05, + "loss": 0.8993, + "step": 16446 + }, + { + "epoch": 3.5083191126279862, + "grad_norm": 0.23088519730102317, + "learning_rate": 9.356676044664717e-05, + "loss": 0.9164, + "step": 16447 + }, + { + "epoch": 3.508532423208191, + "grad_norm": 0.2295566160142163, + "learning_rate": 9.355837386290126e-05, + "loss": 0.9114, + "step": 16448 + }, + { + "epoch": 3.508745733788396, + "grad_norm": 0.25836328737368924, + "learning_rate": 9.354998712574212e-05, + "loss": 0.9284, + "step": 16449 + }, + { + "epoch": 3.5089590443686007, + "grad_norm": 0.24850630378335295, + "learning_rate": 9.354160023526464e-05, + "loss": 0.9236, + "step": 16450 + }, + { + "epoch": 3.5091723549488054, + "grad_norm": 0.17650032486577052, + "learning_rate": 9.353321319156373e-05, + "loss": 0.8978, + "step": 16451 + }, + { + "epoch": 3.50938566552901, + "grad_norm": 0.21983858762372052, + "learning_rate": 9.352482599473426e-05, + "loss": 0.9246, + "step": 16452 + }, + { + "epoch": 3.5095989761092152, + "grad_norm": 0.2228219885898541, + "learning_rate": 9.351643864487119e-05, + "loss": 0.9353, + "step": 16453 + }, + { + "epoch": 3.50981228668942, + "grad_norm": 0.19195711059590065, + "learning_rate": 9.35080511420694e-05, + "loss": 0.889, + "step": 16454 + }, + { + "epoch": 3.5100255972696246, + "grad_norm": 0.21812777507879996, + "learning_rate": 9.349966348642374e-05, + "loss": 0.9338, + "step": 16455 + }, + { + "epoch": 3.5102389078498293, + "grad_norm": 0.23811450500848216, + "learning_rate": 9.349127567802918e-05, + "loss": 0.9303, + "step": 16456 + }, + { + "epoch": 3.510452218430034, + "grad_norm": 0.21375888292569523, + "learning_rate": 9.348288771698062e-05, + "loss": 0.8914, + "step": 16457 + }, + { + "epoch": 3.5106655290102387, + "grad_norm": 0.22174288594445907, + "learning_rate": 9.347449960337296e-05, + "loss": 0.9174, + "step": 16458 + }, + { + "epoch": 3.510878839590444, + "grad_norm": 0.26829547819166866, + "learning_rate": 9.346611133730108e-05, + "loss": 0.9456, + "step": 16459 + }, + { + "epoch": 3.5110921501706485, + "grad_norm": 0.2113492682720501, + "learning_rate": 9.345772291885996e-05, + "loss": 0.9314, + "step": 16460 + }, + { + "epoch": 3.511305460750853, + "grad_norm": 0.22088257468450528, + "learning_rate": 9.344933434814446e-05, + "loss": 0.9002, + "step": 16461 + }, + { + "epoch": 3.5115187713310583, + "grad_norm": 0.23041089069875717, + "learning_rate": 9.34409456252495e-05, + "loss": 0.9001, + "step": 16462 + }, + { + "epoch": 3.511732081911263, + "grad_norm": 0.3154652706088021, + "learning_rate": 9.343255675027006e-05, + "loss": 0.9106, + "step": 16463 + }, + { + "epoch": 3.5119453924914676, + "grad_norm": 0.17053803996603317, + "learning_rate": 9.342416772330098e-05, + "loss": 0.8896, + "step": 16464 + }, + { + "epoch": 3.5121587030716723, + "grad_norm": 0.20620967104006752, + "learning_rate": 9.341577854443721e-05, + "loss": 0.9088, + "step": 16465 + }, + { + "epoch": 3.512372013651877, + "grad_norm": 0.22021270831095863, + "learning_rate": 9.34073892137737e-05, + "loss": 0.8981, + "step": 16466 + }, + { + "epoch": 3.5125853242320817, + "grad_norm": 0.1931211086280529, + "learning_rate": 9.339899973140536e-05, + "loss": 0.9196, + "step": 16467 + }, + { + "epoch": 3.512798634812287, + "grad_norm": 0.19805504384926526, + "learning_rate": 9.339061009742709e-05, + "loss": 0.9436, + "step": 16468 + }, + { + "epoch": 3.5130119453924915, + "grad_norm": 0.295586520217555, + "learning_rate": 9.338222031193385e-05, + "loss": 0.9082, + "step": 16469 + }, + { + "epoch": 3.513225255972696, + "grad_norm": 0.2321514433690841, + "learning_rate": 9.337383037502058e-05, + "loss": 0.9108, + "step": 16470 + }, + { + "epoch": 3.513438566552901, + "grad_norm": 0.17992343636851638, + "learning_rate": 9.336544028678216e-05, + "loss": 0.9112, + "step": 16471 + }, + { + "epoch": 3.513651877133106, + "grad_norm": 0.23450116545066446, + "learning_rate": 9.335705004731358e-05, + "loss": 0.9739, + "step": 16472 + }, + { + "epoch": 3.5138651877133107, + "grad_norm": 0.1884728715393466, + "learning_rate": 9.334865965670975e-05, + "loss": 0.9049, + "step": 16473 + }, + { + "epoch": 3.5140784982935154, + "grad_norm": 0.1771299754845702, + "learning_rate": 9.334026911506561e-05, + "loss": 0.9435, + "step": 16474 + }, + { + "epoch": 3.51429180887372, + "grad_norm": 0.21595782540338962, + "learning_rate": 9.333187842247612e-05, + "loss": 0.9049, + "step": 16475 + }, + { + "epoch": 3.5145051194539247, + "grad_norm": 0.19734827064194616, + "learning_rate": 9.332348757903619e-05, + "loss": 0.9507, + "step": 16476 + }, + { + "epoch": 3.5147184300341294, + "grad_norm": 0.1767527767646651, + "learning_rate": 9.331509658484076e-05, + "loss": 0.9249, + "step": 16477 + }, + { + "epoch": 3.5149317406143346, + "grad_norm": 0.2122306802182123, + "learning_rate": 9.33067054399848e-05, + "loss": 0.9216, + "step": 16478 + }, + { + "epoch": 3.5151450511945392, + "grad_norm": 0.18932995175246853, + "learning_rate": 9.329831414456324e-05, + "loss": 0.925, + "step": 16479 + }, + { + "epoch": 3.515358361774744, + "grad_norm": 0.1838248352690665, + "learning_rate": 9.328992269867103e-05, + "loss": 0.9205, + "step": 16480 + }, + { + "epoch": 3.515571672354949, + "grad_norm": 0.20001962881824797, + "learning_rate": 9.328153110240312e-05, + "loss": 0.9031, + "step": 16481 + }, + { + "epoch": 3.5157849829351537, + "grad_norm": 0.2448166120633118, + "learning_rate": 9.327313935585448e-05, + "loss": 0.9025, + "step": 16482 + }, + { + "epoch": 3.5159982935153584, + "grad_norm": 0.24668718707610665, + "learning_rate": 9.326474745912002e-05, + "loss": 0.9318, + "step": 16483 + }, + { + "epoch": 3.516211604095563, + "grad_norm": 0.2680019994524869, + "learning_rate": 9.325635541229474e-05, + "loss": 0.9302, + "step": 16484 + }, + { + "epoch": 3.516424914675768, + "grad_norm": 0.28090393656678514, + "learning_rate": 9.324796321547358e-05, + "loss": 0.9285, + "step": 16485 + }, + { + "epoch": 3.5166382252559725, + "grad_norm": 0.3039413892673307, + "learning_rate": 9.323957086875147e-05, + "loss": 0.9058, + "step": 16486 + }, + { + "epoch": 3.5168515358361776, + "grad_norm": 0.25790587325822995, + "learning_rate": 9.32311783722234e-05, + "loss": 0.944, + "step": 16487 + }, + { + "epoch": 3.5170648464163823, + "grad_norm": 0.21509552569236046, + "learning_rate": 9.322278572598433e-05, + "loss": 0.8978, + "step": 16488 + }, + { + "epoch": 3.517278156996587, + "grad_norm": 0.23756046754721943, + "learning_rate": 9.32143929301292e-05, + "loss": 0.923, + "step": 16489 + }, + { + "epoch": 3.5174914675767917, + "grad_norm": 0.2193763955908573, + "learning_rate": 9.320599998475302e-05, + "loss": 0.9479, + "step": 16490 + }, + { + "epoch": 3.517704778156997, + "grad_norm": 0.18358543030059057, + "learning_rate": 9.319760688995072e-05, + "loss": 0.8785, + "step": 16491 + }, + { + "epoch": 3.5179180887372015, + "grad_norm": 0.22544984754396674, + "learning_rate": 9.318921364581727e-05, + "loss": 0.938, + "step": 16492 + }, + { + "epoch": 3.518131399317406, + "grad_norm": 0.23865541324654643, + "learning_rate": 9.318082025244764e-05, + "loss": 0.9182, + "step": 16493 + }, + { + "epoch": 3.518344709897611, + "grad_norm": 0.2527968482558154, + "learning_rate": 9.317242670993682e-05, + "loss": 0.9219, + "step": 16494 + }, + { + "epoch": 3.5185580204778155, + "grad_norm": 0.28139840910349995, + "learning_rate": 9.316403301837976e-05, + "loss": 0.9134, + "step": 16495 + }, + { + "epoch": 3.51877133105802, + "grad_norm": 0.31359121658670885, + "learning_rate": 9.315563917787144e-05, + "loss": 0.9074, + "step": 16496 + }, + { + "epoch": 3.5189846416382253, + "grad_norm": 0.35200747087192924, + "learning_rate": 9.314724518850686e-05, + "loss": 0.9038, + "step": 16497 + }, + { + "epoch": 3.51919795221843, + "grad_norm": 0.2545687163212687, + "learning_rate": 9.313885105038097e-05, + "loss": 0.9267, + "step": 16498 + }, + { + "epoch": 3.5194112627986347, + "grad_norm": 0.21777016164640806, + "learning_rate": 9.313045676358874e-05, + "loss": 0.9337, + "step": 16499 + }, + { + "epoch": 3.51962457337884, + "grad_norm": 0.3242636360250655, + "learning_rate": 9.31220623282252e-05, + "loss": 0.9059, + "step": 16500 + }, + { + "epoch": 3.5198378839590445, + "grad_norm": 0.32377306589392224, + "learning_rate": 9.311366774438527e-05, + "loss": 0.904, + "step": 16501 + }, + { + "epoch": 3.520051194539249, + "grad_norm": 0.2514898450931751, + "learning_rate": 9.310527301216401e-05, + "loss": 0.9196, + "step": 16502 + }, + { + "epoch": 3.520264505119454, + "grad_norm": 0.23251359545722658, + "learning_rate": 9.309687813165636e-05, + "loss": 0.9133, + "step": 16503 + }, + { + "epoch": 3.5204778156996586, + "grad_norm": 0.2339791321715079, + "learning_rate": 9.308848310295731e-05, + "loss": 0.9322, + "step": 16504 + }, + { + "epoch": 3.5206911262798632, + "grad_norm": 0.32186518029470085, + "learning_rate": 9.308008792616182e-05, + "loss": 0.9704, + "step": 16505 + }, + { + "epoch": 3.5209044368600684, + "grad_norm": 0.37585202116577787, + "learning_rate": 9.307169260136495e-05, + "loss": 0.8996, + "step": 16506 + }, + { + "epoch": 3.521117747440273, + "grad_norm": 0.3057487496523417, + "learning_rate": 9.306329712866163e-05, + "loss": 0.9129, + "step": 16507 + }, + { + "epoch": 3.5213310580204777, + "grad_norm": 0.29060680467828753, + "learning_rate": 9.305490150814694e-05, + "loss": 0.961, + "step": 16508 + }, + { + "epoch": 3.5215443686006824, + "grad_norm": 0.2120641608509262, + "learning_rate": 9.304650573991576e-05, + "loss": 0.9686, + "step": 16509 + }, + { + "epoch": 3.5217576791808876, + "grad_norm": 0.21475788353140202, + "learning_rate": 9.303810982406317e-05, + "loss": 0.9169, + "step": 16510 + }, + { + "epoch": 3.5219709897610922, + "grad_norm": 0.2645360219177458, + "learning_rate": 9.302971376068414e-05, + "loss": 0.9209, + "step": 16511 + }, + { + "epoch": 3.522184300341297, + "grad_norm": 0.34214408129042534, + "learning_rate": 9.302131754987369e-05, + "loss": 0.9183, + "step": 16512 + }, + { + "epoch": 3.5223976109215016, + "grad_norm": 0.35017056936083873, + "learning_rate": 9.301292119172679e-05, + "loss": 0.9403, + "step": 16513 + }, + { + "epoch": 3.5226109215017063, + "grad_norm": 0.2932990813314364, + "learning_rate": 9.300452468633852e-05, + "loss": 0.8859, + "step": 16514 + }, + { + "epoch": 3.5228242320819114, + "grad_norm": 0.19701753507967185, + "learning_rate": 9.29961280338038e-05, + "loss": 0.9132, + "step": 16515 + }, + { + "epoch": 3.523037542662116, + "grad_norm": 0.2179553760922293, + "learning_rate": 9.298773123421769e-05, + "loss": 0.9034, + "step": 16516 + }, + { + "epoch": 3.523250853242321, + "grad_norm": 0.2779951377355877, + "learning_rate": 9.297933428767516e-05, + "loss": 0.9261, + "step": 16517 + }, + { + "epoch": 3.5234641638225255, + "grad_norm": 0.37148740051464635, + "learning_rate": 9.297093719427123e-05, + "loss": 0.9241, + "step": 16518 + }, + { + "epoch": 3.5236774744027306, + "grad_norm": 0.4159654619592958, + "learning_rate": 9.296253995410094e-05, + "loss": 0.9062, + "step": 16519 + }, + { + "epoch": 3.5238907849829353, + "grad_norm": 0.3623673580898021, + "learning_rate": 9.295414256725933e-05, + "loss": 0.9391, + "step": 16520 + }, + { + "epoch": 3.52410409556314, + "grad_norm": 0.23800954905900887, + "learning_rate": 9.294574503384134e-05, + "loss": 0.913, + "step": 16521 + }, + { + "epoch": 3.5243174061433447, + "grad_norm": 0.2799943450768508, + "learning_rate": 9.293734735394203e-05, + "loss": 0.9221, + "step": 16522 + }, + { + "epoch": 3.5245307167235493, + "grad_norm": 0.3300757541911953, + "learning_rate": 9.292894952765644e-05, + "loss": 0.9157, + "step": 16523 + }, + { + "epoch": 3.524744027303754, + "grad_norm": 0.28063115460282795, + "learning_rate": 9.292055155507955e-05, + "loss": 0.8997, + "step": 16524 + }, + { + "epoch": 3.524957337883959, + "grad_norm": 0.3048598236326187, + "learning_rate": 9.29121534363064e-05, + "loss": 0.931, + "step": 16525 + }, + { + "epoch": 3.525170648464164, + "grad_norm": 0.2693136927371717, + "learning_rate": 9.290375517143205e-05, + "loss": 0.9169, + "step": 16526 + }, + { + "epoch": 3.5253839590443685, + "grad_norm": 1.890344937764786, + "learning_rate": 9.289535676055144e-05, + "loss": 0.9292, + "step": 16527 + }, + { + "epoch": 3.5255972696245736, + "grad_norm": 0.3022648801464415, + "learning_rate": 9.288695820375967e-05, + "loss": 0.8977, + "step": 16528 + }, + { + "epoch": 3.5258105802047783, + "grad_norm": 0.5664386807200346, + "learning_rate": 9.287855950115178e-05, + "loss": 0.8997, + "step": 16529 + }, + { + "epoch": 3.526023890784983, + "grad_norm": 0.3541393658759368, + "learning_rate": 9.287016065282273e-05, + "loss": 0.9334, + "step": 16530 + }, + { + "epoch": 3.5262372013651877, + "grad_norm": 0.4465457873687278, + "learning_rate": 9.286176165886763e-05, + "loss": 0.9039, + "step": 16531 + }, + { + "epoch": 3.5264505119453924, + "grad_norm": 0.3634659240271472, + "learning_rate": 9.285336251938147e-05, + "loss": 0.9204, + "step": 16532 + }, + { + "epoch": 3.526663822525597, + "grad_norm": 0.3403233591356122, + "learning_rate": 9.28449632344593e-05, + "loss": 0.9012, + "step": 16533 + }, + { + "epoch": 3.526877133105802, + "grad_norm": 0.2738244089976775, + "learning_rate": 9.283656380419614e-05, + "loss": 0.9458, + "step": 16534 + }, + { + "epoch": 3.527090443686007, + "grad_norm": 0.2253401978132658, + "learning_rate": 9.282816422868703e-05, + "loss": 0.9518, + "step": 16535 + }, + { + "epoch": 3.5273037542662116, + "grad_norm": 0.2583964847766965, + "learning_rate": 9.281976450802708e-05, + "loss": 0.9129, + "step": 16536 + }, + { + "epoch": 3.5275170648464163, + "grad_norm": 0.257391268749305, + "learning_rate": 9.281136464231124e-05, + "loss": 0.8929, + "step": 16537 + }, + { + "epoch": 3.5277303754266214, + "grad_norm": 0.2764308480700756, + "learning_rate": 9.28029646316346e-05, + "loss": 0.932, + "step": 16538 + }, + { + "epoch": 3.527943686006826, + "grad_norm": 0.28567932053314954, + "learning_rate": 9.279456447609219e-05, + "loss": 0.9148, + "step": 16539 + }, + { + "epoch": 3.5281569965870307, + "grad_norm": 0.24447739521709577, + "learning_rate": 9.278616417577908e-05, + "loss": 0.9164, + "step": 16540 + }, + { + "epoch": 3.5283703071672354, + "grad_norm": 0.20728393946069124, + "learning_rate": 9.27777637307903e-05, + "loss": 0.9044, + "step": 16541 + }, + { + "epoch": 3.52858361774744, + "grad_norm": 0.2097333504882998, + "learning_rate": 9.276936314122093e-05, + "loss": 0.9115, + "step": 16542 + }, + { + "epoch": 3.528796928327645, + "grad_norm": 0.23166368219290123, + "learning_rate": 9.276096240716598e-05, + "loss": 0.9258, + "step": 16543 + }, + { + "epoch": 3.52901023890785, + "grad_norm": 0.2658639941975586, + "learning_rate": 9.275256152872054e-05, + "loss": 0.9236, + "step": 16544 + }, + { + "epoch": 3.5292235494880546, + "grad_norm": 0.24218024466636928, + "learning_rate": 9.274416050597965e-05, + "loss": 0.903, + "step": 16545 + }, + { + "epoch": 3.5294368600682593, + "grad_norm": 0.2580380437719632, + "learning_rate": 9.273575933903836e-05, + "loss": 0.8913, + "step": 16546 + }, + { + "epoch": 3.5296501706484644, + "grad_norm": 0.2511330267594008, + "learning_rate": 9.272735802799173e-05, + "loss": 0.8947, + "step": 16547 + }, + { + "epoch": 3.529863481228669, + "grad_norm": 0.21665612204585644, + "learning_rate": 9.271895657293487e-05, + "loss": 0.8887, + "step": 16548 + }, + { + "epoch": 3.530076791808874, + "grad_norm": 0.19957848319637359, + "learning_rate": 9.271055497396277e-05, + "loss": 0.9381, + "step": 16549 + }, + { + "epoch": 3.5302901023890785, + "grad_norm": 0.22910451502434565, + "learning_rate": 9.270215323117053e-05, + "loss": 0.9407, + "step": 16550 + }, + { + "epoch": 3.530503412969283, + "grad_norm": 0.21461837674348536, + "learning_rate": 9.26937513446532e-05, + "loss": 0.9354, + "step": 16551 + }, + { + "epoch": 3.530716723549488, + "grad_norm": 0.15803613368826658, + "learning_rate": 9.268534931450588e-05, + "loss": 0.9375, + "step": 16552 + }, + { + "epoch": 3.530930034129693, + "grad_norm": 0.2700758454593627, + "learning_rate": 9.267694714082361e-05, + "loss": 0.9078, + "step": 16553 + }, + { + "epoch": 3.5311433447098977, + "grad_norm": 0.3014361328432326, + "learning_rate": 9.266854482370148e-05, + "loss": 0.9113, + "step": 16554 + }, + { + "epoch": 3.5313566552901023, + "grad_norm": 0.3328868627006711, + "learning_rate": 9.266014236323455e-05, + "loss": 0.8987, + "step": 16555 + }, + { + "epoch": 3.531569965870307, + "grad_norm": 0.2957146392731107, + "learning_rate": 9.265173975951789e-05, + "loss": 0.9355, + "step": 16556 + }, + { + "epoch": 3.531783276450512, + "grad_norm": 0.2607013706300809, + "learning_rate": 9.264333701264656e-05, + "loss": 0.9319, + "step": 16557 + }, + { + "epoch": 3.531996587030717, + "grad_norm": 0.25980653402856346, + "learning_rate": 9.26349341227157e-05, + "loss": 0.8873, + "step": 16558 + }, + { + "epoch": 3.5322098976109215, + "grad_norm": 0.19250858033416776, + "learning_rate": 9.262653108982033e-05, + "loss": 0.9135, + "step": 16559 + }, + { + "epoch": 3.532423208191126, + "grad_norm": 0.2410890499091242, + "learning_rate": 9.261812791405554e-05, + "loss": 0.9306, + "step": 16560 + }, + { + "epoch": 3.532636518771331, + "grad_norm": 0.3331398219514464, + "learning_rate": 9.260972459551642e-05, + "loss": 0.9592, + "step": 16561 + }, + { + "epoch": 3.5328498293515356, + "grad_norm": 0.24071256671751182, + "learning_rate": 9.260132113429807e-05, + "loss": 0.9262, + "step": 16562 + }, + { + "epoch": 3.5330631399317407, + "grad_norm": 0.2797631224544592, + "learning_rate": 9.259291753049555e-05, + "loss": 0.927, + "step": 16563 + }, + { + "epoch": 3.5332764505119454, + "grad_norm": 0.21925612045085696, + "learning_rate": 9.258451378420398e-05, + "loss": 0.8899, + "step": 16564 + }, + { + "epoch": 3.53348976109215, + "grad_norm": 0.21643603453092955, + "learning_rate": 9.257610989551839e-05, + "loss": 0.8805, + "step": 16565 + }, + { + "epoch": 3.533703071672355, + "grad_norm": 0.39492097120145825, + "learning_rate": 9.256770586453393e-05, + "loss": 0.9032, + "step": 16566 + }, + { + "epoch": 3.53391638225256, + "grad_norm": 0.3644033637554775, + "learning_rate": 9.255930169134564e-05, + "loss": 0.9169, + "step": 16567 + }, + { + "epoch": 3.5341296928327646, + "grad_norm": 0.30027059704627485, + "learning_rate": 9.255089737604867e-05, + "loss": 0.9127, + "step": 16568 + }, + { + "epoch": 3.5343430034129693, + "grad_norm": 0.29643100347052, + "learning_rate": 9.254249291873806e-05, + "loss": 0.9311, + "step": 16569 + }, + { + "epoch": 3.534556313993174, + "grad_norm": 0.2842213254771401, + "learning_rate": 9.253408831950895e-05, + "loss": 0.91, + "step": 16570 + }, + { + "epoch": 3.5347696245733786, + "grad_norm": 0.22390800992199503, + "learning_rate": 9.25256835784564e-05, + "loss": 0.9319, + "step": 16571 + }, + { + "epoch": 3.5349829351535837, + "grad_norm": 0.20880383170350814, + "learning_rate": 9.251727869567556e-05, + "loss": 0.9039, + "step": 16572 + }, + { + "epoch": 3.5351962457337884, + "grad_norm": 0.2777664442160136, + "learning_rate": 9.250887367126146e-05, + "loss": 0.936, + "step": 16573 + }, + { + "epoch": 3.535409556313993, + "grad_norm": 0.2988435639252392, + "learning_rate": 9.250046850530928e-05, + "loss": 0.9325, + "step": 16574 + }, + { + "epoch": 3.535622866894198, + "grad_norm": 0.28768790903769126, + "learning_rate": 9.249206319791406e-05, + "loss": 0.919, + "step": 16575 + }, + { + "epoch": 3.535836177474403, + "grad_norm": 0.2162505016693702, + "learning_rate": 9.248365774917097e-05, + "loss": 0.9272, + "step": 16576 + }, + { + "epoch": 3.5360494880546076, + "grad_norm": 0.20196439745788677, + "learning_rate": 9.247525215917505e-05, + "loss": 0.9064, + "step": 16577 + }, + { + "epoch": 3.5362627986348123, + "grad_norm": 0.21140248033052284, + "learning_rate": 9.246684642802147e-05, + "loss": 0.9021, + "step": 16578 + }, + { + "epoch": 3.536476109215017, + "grad_norm": 0.26619891764910375, + "learning_rate": 9.245844055580527e-05, + "loss": 0.9108, + "step": 16579 + }, + { + "epoch": 3.5366894197952217, + "grad_norm": 0.2804125526209945, + "learning_rate": 9.245003454262165e-05, + "loss": 0.8881, + "step": 16580 + }, + { + "epoch": 3.5369027303754264, + "grad_norm": 0.2074194275191021, + "learning_rate": 9.244162838856566e-05, + "loss": 0.8972, + "step": 16581 + }, + { + "epoch": 3.5371160409556315, + "grad_norm": 0.2458538062257983, + "learning_rate": 9.243322209373244e-05, + "loss": 0.938, + "step": 16582 + }, + { + "epoch": 3.537329351535836, + "grad_norm": 0.33543483018672493, + "learning_rate": 9.24248156582171e-05, + "loss": 0.974, + "step": 16583 + }, + { + "epoch": 3.537542662116041, + "grad_norm": 0.37343172715239775, + "learning_rate": 9.241640908211477e-05, + "loss": 0.9192, + "step": 16584 + }, + { + "epoch": 3.537755972696246, + "grad_norm": 0.35425673529159757, + "learning_rate": 9.240800236552054e-05, + "loss": 0.9269, + "step": 16585 + }, + { + "epoch": 3.5379692832764507, + "grad_norm": 0.3479513979602841, + "learning_rate": 9.239959550852958e-05, + "loss": 0.9237, + "step": 16586 + }, + { + "epoch": 3.5381825938566553, + "grad_norm": 0.26879764886839225, + "learning_rate": 9.239118851123698e-05, + "loss": 0.9075, + "step": 16587 + }, + { + "epoch": 3.53839590443686, + "grad_norm": 0.22088709621368638, + "learning_rate": 9.238278137373787e-05, + "loss": 0.8929, + "step": 16588 + }, + { + "epoch": 3.5386092150170647, + "grad_norm": 0.3293155421784442, + "learning_rate": 9.237437409612737e-05, + "loss": 0.93, + "step": 16589 + }, + { + "epoch": 3.5388225255972694, + "grad_norm": 0.40362108542013986, + "learning_rate": 9.236596667850065e-05, + "loss": 0.9308, + "step": 16590 + }, + { + "epoch": 3.5390358361774745, + "grad_norm": 0.3796398804550808, + "learning_rate": 9.235755912095278e-05, + "loss": 0.8805, + "step": 16591 + }, + { + "epoch": 3.539249146757679, + "grad_norm": 0.3151317706046654, + "learning_rate": 9.234915142357894e-05, + "loss": 0.9313, + "step": 16592 + }, + { + "epoch": 3.539462457337884, + "grad_norm": 0.3173413571059781, + "learning_rate": 9.234074358647423e-05, + "loss": 0.9683, + "step": 16593 + }, + { + "epoch": 3.5396757679180886, + "grad_norm": 0.31958969130953524, + "learning_rate": 9.233233560973381e-05, + "loss": 0.94, + "step": 16594 + }, + { + "epoch": 3.5398890784982937, + "grad_norm": 0.33712112622868534, + "learning_rate": 9.232392749345279e-05, + "loss": 0.9252, + "step": 16595 + }, + { + "epoch": 3.5401023890784984, + "grad_norm": 0.3497793542423886, + "learning_rate": 9.231551923772631e-05, + "loss": 0.8692, + "step": 16596 + }, + { + "epoch": 3.540315699658703, + "grad_norm": 0.3033162572566138, + "learning_rate": 9.230711084264955e-05, + "loss": 0.8955, + "step": 16597 + }, + { + "epoch": 3.5405290102389078, + "grad_norm": 0.23560020144080612, + "learning_rate": 9.229870230831765e-05, + "loss": 0.9331, + "step": 16598 + }, + { + "epoch": 3.5407423208191124, + "grad_norm": 0.17894229267399112, + "learning_rate": 9.229029363482568e-05, + "loss": 0.8849, + "step": 16599 + }, + { + "epoch": 3.5409556313993176, + "grad_norm": 0.2882289782336022, + "learning_rate": 9.228188482226885e-05, + "loss": 0.9399, + "step": 16600 + }, + { + "epoch": 3.5411689419795223, + "grad_norm": 0.2933812386705326, + "learning_rate": 9.227347587074227e-05, + "loss": 0.9242, + "step": 16601 + }, + { + "epoch": 3.541382252559727, + "grad_norm": 0.24877055903628967, + "learning_rate": 9.226506678034112e-05, + "loss": 0.9077, + "step": 16602 + }, + { + "epoch": 3.5415955631399316, + "grad_norm": 0.20487946280759683, + "learning_rate": 9.225665755116051e-05, + "loss": 0.9183, + "step": 16603 + }, + { + "epoch": 3.5418088737201368, + "grad_norm": 0.30474681889239225, + "learning_rate": 9.224824818329567e-05, + "loss": 0.9259, + "step": 16604 + }, + { + "epoch": 3.5420221843003414, + "grad_norm": 0.26440641173443663, + "learning_rate": 9.223983867684163e-05, + "loss": 0.9266, + "step": 16605 + }, + { + "epoch": 3.542235494880546, + "grad_norm": 0.16773758220729573, + "learning_rate": 9.223142903189365e-05, + "loss": 0.8824, + "step": 16606 + }, + { + "epoch": 3.542448805460751, + "grad_norm": 0.24041156507060857, + "learning_rate": 9.222301924854682e-05, + "loss": 0.8904, + "step": 16607 + }, + { + "epoch": 3.5426621160409555, + "grad_norm": 0.2528117243205692, + "learning_rate": 9.221460932689635e-05, + "loss": 0.9463, + "step": 16608 + }, + { + "epoch": 3.54287542662116, + "grad_norm": 0.18821170549579697, + "learning_rate": 9.220619926703736e-05, + "loss": 0.9086, + "step": 16609 + }, + { + "epoch": 3.5430887372013653, + "grad_norm": 0.2010193981472195, + "learning_rate": 9.219778906906502e-05, + "loss": 0.9227, + "step": 16610 + }, + { + "epoch": 3.54330204778157, + "grad_norm": 0.2578668417108459, + "learning_rate": 9.218937873307446e-05, + "loss": 0.9301, + "step": 16611 + }, + { + "epoch": 3.5435153583617747, + "grad_norm": 0.21268402042800838, + "learning_rate": 9.218096825916093e-05, + "loss": 0.9019, + "step": 16612 + }, + { + "epoch": 3.54372866894198, + "grad_norm": 0.17468309669643103, + "learning_rate": 9.217255764741949e-05, + "loss": 0.9302, + "step": 16613 + }, + { + "epoch": 3.5439419795221845, + "grad_norm": 0.19895525861482283, + "learning_rate": 9.216414689794537e-05, + "loss": 0.9132, + "step": 16614 + }, + { + "epoch": 3.544155290102389, + "grad_norm": 0.19971225296637843, + "learning_rate": 9.215573601083376e-05, + "loss": 0.9264, + "step": 16615 + }, + { + "epoch": 3.544368600682594, + "grad_norm": 0.1862631315742585, + "learning_rate": 9.214732498617974e-05, + "loss": 0.9552, + "step": 16616 + }, + { + "epoch": 3.5445819112627985, + "grad_norm": 0.2105908689679597, + "learning_rate": 9.213891382407856e-05, + "loss": 0.9108, + "step": 16617 + }, + { + "epoch": 3.544795221843003, + "grad_norm": 0.21195281781085465, + "learning_rate": 9.213050252462536e-05, + "loss": 0.9086, + "step": 16618 + }, + { + "epoch": 3.5450085324232083, + "grad_norm": 0.2218698880892248, + "learning_rate": 9.212209108791531e-05, + "loss": 0.922, + "step": 16619 + }, + { + "epoch": 3.545221843003413, + "grad_norm": 0.2704282492978338, + "learning_rate": 9.211367951404363e-05, + "loss": 0.9279, + "step": 16620 + }, + { + "epoch": 3.5454351535836177, + "grad_norm": 0.29251968253627086, + "learning_rate": 9.210526780310542e-05, + "loss": 0.926, + "step": 16621 + }, + { + "epoch": 3.5456484641638224, + "grad_norm": 0.3250318173579104, + "learning_rate": 9.209685595519592e-05, + "loss": 0.932, + "step": 16622 + }, + { + "epoch": 3.5458617747440275, + "grad_norm": 0.32550950210848956, + "learning_rate": 9.20884439704103e-05, + "loss": 0.911, + "step": 16623 + }, + { + "epoch": 3.546075085324232, + "grad_norm": 0.29656168121510595, + "learning_rate": 9.208003184884371e-05, + "loss": 0.9303, + "step": 16624 + }, + { + "epoch": 3.546288395904437, + "grad_norm": 0.26130860387876254, + "learning_rate": 9.207161959059136e-05, + "loss": 0.9411, + "step": 16625 + }, + { + "epoch": 3.5465017064846416, + "grad_norm": 0.19164476880878056, + "learning_rate": 9.206320719574849e-05, + "loss": 0.918, + "step": 16626 + }, + { + "epoch": 3.5467150170648463, + "grad_norm": 0.17575799136555456, + "learning_rate": 9.205479466441015e-05, + "loss": 0.8753, + "step": 16627 + }, + { + "epoch": 3.546928327645051, + "grad_norm": 0.2049785690674802, + "learning_rate": 9.204638199667165e-05, + "loss": 0.9027, + "step": 16628 + }, + { + "epoch": 3.547141638225256, + "grad_norm": 0.22040566227400193, + "learning_rate": 9.203796919262809e-05, + "loss": 0.9054, + "step": 16629 + }, + { + "epoch": 3.5473549488054608, + "grad_norm": 0.2258574118503236, + "learning_rate": 9.202955625237476e-05, + "loss": 0.9249, + "step": 16630 + }, + { + "epoch": 3.5475682593856654, + "grad_norm": 0.21692150859269188, + "learning_rate": 9.202114317600675e-05, + "loss": 0.9158, + "step": 16631 + }, + { + "epoch": 3.5477815699658706, + "grad_norm": 0.204959438275041, + "learning_rate": 9.201272996361935e-05, + "loss": 0.9388, + "step": 16632 + }, + { + "epoch": 3.5479948805460753, + "grad_norm": 0.20169235333405575, + "learning_rate": 9.200431661530768e-05, + "loss": 0.91, + "step": 16633 + }, + { + "epoch": 3.54820819112628, + "grad_norm": 0.17234396749459355, + "learning_rate": 9.199590313116698e-05, + "loss": 0.9003, + "step": 16634 + }, + { + "epoch": 3.5484215017064846, + "grad_norm": 0.1883389004756079, + "learning_rate": 9.198748951129242e-05, + "loss": 0.9564, + "step": 16635 + }, + { + "epoch": 3.5486348122866893, + "grad_norm": 0.2149714207380604, + "learning_rate": 9.197907575577922e-05, + "loss": 0.9103, + "step": 16636 + }, + { + "epoch": 3.548848122866894, + "grad_norm": 0.21377327791044234, + "learning_rate": 9.197066186472259e-05, + "loss": 0.9313, + "step": 16637 + }, + { + "epoch": 3.549061433447099, + "grad_norm": 0.22833136588963562, + "learning_rate": 9.19622478382177e-05, + "loss": 0.9029, + "step": 16638 + }, + { + "epoch": 3.549274744027304, + "grad_norm": 0.1993265657037535, + "learning_rate": 9.195383367635978e-05, + "loss": 0.9276, + "step": 16639 + }, + { + "epoch": 3.5494880546075085, + "grad_norm": 0.17582574355173453, + "learning_rate": 9.194541937924403e-05, + "loss": 0.8917, + "step": 16640 + }, + { + "epoch": 3.549701365187713, + "grad_norm": 0.21917292510607767, + "learning_rate": 9.193700494696565e-05, + "loss": 0.9256, + "step": 16641 + }, + { + "epoch": 3.5499146757679183, + "grad_norm": 0.2709082457126093, + "learning_rate": 9.192859037961988e-05, + "loss": 0.9304, + "step": 16642 + }, + { + "epoch": 3.550127986348123, + "grad_norm": 0.3078782804405115, + "learning_rate": 9.19201756773019e-05, + "loss": 0.9358, + "step": 16643 + }, + { + "epoch": 3.5503412969283277, + "grad_norm": 0.2660571918749242, + "learning_rate": 9.191176084010694e-05, + "loss": 0.9465, + "step": 16644 + }, + { + "epoch": 3.5505546075085324, + "grad_norm": 0.2522157233960951, + "learning_rate": 9.190334586813019e-05, + "loss": 0.9202, + "step": 16645 + }, + { + "epoch": 3.550767918088737, + "grad_norm": 0.19515777123433473, + "learning_rate": 9.189493076146688e-05, + "loss": 0.8789, + "step": 16646 + }, + { + "epoch": 3.5509812286689417, + "grad_norm": 0.19422123347367612, + "learning_rate": 9.188651552021222e-05, + "loss": 0.9472, + "step": 16647 + }, + { + "epoch": 3.551194539249147, + "grad_norm": 0.2068613452376978, + "learning_rate": 9.187810014446147e-05, + "loss": 0.8986, + "step": 16648 + }, + { + "epoch": 3.5514078498293515, + "grad_norm": 0.20675042143201097, + "learning_rate": 9.18696846343098e-05, + "loss": 0.9399, + "step": 16649 + }, + { + "epoch": 3.551621160409556, + "grad_norm": 0.2348027299489997, + "learning_rate": 9.186126898985245e-05, + "loss": 0.9253, + "step": 16650 + }, + { + "epoch": 3.5518344709897613, + "grad_norm": 0.19974819631680196, + "learning_rate": 9.185285321118463e-05, + "loss": 0.9517, + "step": 16651 + }, + { + "epoch": 3.552047781569966, + "grad_norm": 0.1954828208532, + "learning_rate": 9.184443729840159e-05, + "loss": 0.9259, + "step": 16652 + }, + { + "epoch": 3.5522610921501707, + "grad_norm": 0.21302914434285178, + "learning_rate": 9.183602125159854e-05, + "loss": 0.9067, + "step": 16653 + }, + { + "epoch": 3.5524744027303754, + "grad_norm": 0.18353917303246514, + "learning_rate": 9.182760507087071e-05, + "loss": 0.9207, + "step": 16654 + }, + { + "epoch": 3.55268771331058, + "grad_norm": 0.19249566310650515, + "learning_rate": 9.181918875631334e-05, + "loss": 0.898, + "step": 16655 + }, + { + "epoch": 3.5529010238907848, + "grad_norm": 0.18857656714977783, + "learning_rate": 9.181077230802164e-05, + "loss": 0.9111, + "step": 16656 + }, + { + "epoch": 3.55311433447099, + "grad_norm": 0.1736048830135009, + "learning_rate": 9.180235572609084e-05, + "loss": 0.9281, + "step": 16657 + }, + { + "epoch": 3.5533276450511946, + "grad_norm": 0.200814633948352, + "learning_rate": 9.17939390106162e-05, + "loss": 0.9171, + "step": 16658 + }, + { + "epoch": 3.5535409556313993, + "grad_norm": 0.24807228473830129, + "learning_rate": 9.178552216169293e-05, + "loss": 0.9319, + "step": 16659 + }, + { + "epoch": 3.553754266211604, + "grad_norm": 0.32140368378079065, + "learning_rate": 9.17771051794163e-05, + "loss": 0.9131, + "step": 16660 + }, + { + "epoch": 3.553967576791809, + "grad_norm": 0.25981251498171204, + "learning_rate": 9.176868806388153e-05, + "loss": 0.9148, + "step": 16661 + }, + { + "epoch": 3.5541808873720138, + "grad_norm": 0.20199773933600831, + "learning_rate": 9.176027081518384e-05, + "loss": 0.9312, + "step": 16662 + }, + { + "epoch": 3.5543941979522184, + "grad_norm": 0.18459630030148233, + "learning_rate": 9.175185343341847e-05, + "loss": 0.8987, + "step": 16663 + }, + { + "epoch": 3.554607508532423, + "grad_norm": 0.19674381591866508, + "learning_rate": 9.174343591868072e-05, + "loss": 0.8967, + "step": 16664 + }, + { + "epoch": 3.554820819112628, + "grad_norm": 0.18289531323883873, + "learning_rate": 9.173501827106577e-05, + "loss": 0.9548, + "step": 16665 + }, + { + "epoch": 3.5550341296928325, + "grad_norm": 0.19118875703840382, + "learning_rate": 9.172660049066889e-05, + "loss": 0.934, + "step": 16666 + }, + { + "epoch": 3.5552474402730376, + "grad_norm": 0.1770638758318294, + "learning_rate": 9.171818257758535e-05, + "loss": 0.8864, + "step": 16667 + }, + { + "epoch": 3.5554607508532423, + "grad_norm": 0.1974279837023248, + "learning_rate": 9.170976453191035e-05, + "loss": 0.9317, + "step": 16668 + }, + { + "epoch": 3.555674061433447, + "grad_norm": 0.22270185361614714, + "learning_rate": 9.170134635373918e-05, + "loss": 0.9074, + "step": 16669 + }, + { + "epoch": 3.555887372013652, + "grad_norm": 0.18301167439468985, + "learning_rate": 9.169292804316708e-05, + "loss": 0.932, + "step": 16670 + }, + { + "epoch": 3.556100682593857, + "grad_norm": 0.1766790906513762, + "learning_rate": 9.168450960028932e-05, + "loss": 0.9042, + "step": 16671 + }, + { + "epoch": 3.5563139931740615, + "grad_norm": 0.21430316729807247, + "learning_rate": 9.16760910252011e-05, + "loss": 0.961, + "step": 16672 + }, + { + "epoch": 3.556527303754266, + "grad_norm": 0.21575749878365652, + "learning_rate": 9.166767231799773e-05, + "loss": 0.9297, + "step": 16673 + }, + { + "epoch": 3.556740614334471, + "grad_norm": 0.19986476523443072, + "learning_rate": 9.165925347877447e-05, + "loss": 0.8849, + "step": 16674 + }, + { + "epoch": 3.5569539249146755, + "grad_norm": 0.23608911123506235, + "learning_rate": 9.165083450762653e-05, + "loss": 0.9021, + "step": 16675 + }, + { + "epoch": 3.5571672354948807, + "grad_norm": 0.21191869962131288, + "learning_rate": 9.164241540464922e-05, + "loss": 0.8926, + "step": 16676 + }, + { + "epoch": 3.5573805460750854, + "grad_norm": 0.23700137632161136, + "learning_rate": 9.16339961699378e-05, + "loss": 0.9344, + "step": 16677 + }, + { + "epoch": 3.55759385665529, + "grad_norm": 0.22309462967997776, + "learning_rate": 9.162557680358747e-05, + "loss": 0.9393, + "step": 16678 + }, + { + "epoch": 3.5578071672354947, + "grad_norm": 0.16511290348553467, + "learning_rate": 9.161715730569358e-05, + "loss": 0.8818, + "step": 16679 + }, + { + "epoch": 3.5580204778157, + "grad_norm": 0.2062871952721899, + "learning_rate": 9.160873767635135e-05, + "loss": 0.9136, + "step": 16680 + }, + { + "epoch": 3.5582337883959045, + "grad_norm": 0.20970175646784767, + "learning_rate": 9.160031791565604e-05, + "loss": 0.8818, + "step": 16681 + }, + { + "epoch": 3.5584470989761092, + "grad_norm": 0.24234142828217556, + "learning_rate": 9.159189802370296e-05, + "loss": 0.9502, + "step": 16682 + }, + { + "epoch": 3.558660409556314, + "grad_norm": 0.25579533033267116, + "learning_rate": 9.158347800058737e-05, + "loss": 0.9469, + "step": 16683 + }, + { + "epoch": 3.5588737201365186, + "grad_norm": 0.2681294743309397, + "learning_rate": 9.157505784640452e-05, + "loss": 0.9391, + "step": 16684 + }, + { + "epoch": 3.5590870307167233, + "grad_norm": 0.32676064488194245, + "learning_rate": 9.156663756124968e-05, + "loss": 0.9313, + "step": 16685 + }, + { + "epoch": 3.5593003412969284, + "grad_norm": 0.2408714516174114, + "learning_rate": 9.155821714521817e-05, + "loss": 0.9261, + "step": 16686 + }, + { + "epoch": 3.559513651877133, + "grad_norm": 0.2139906754992848, + "learning_rate": 9.154979659840521e-05, + "loss": 0.9168, + "step": 16687 + }, + { + "epoch": 3.5597269624573378, + "grad_norm": 0.3228348783017199, + "learning_rate": 9.154137592090612e-05, + "loss": 0.9129, + "step": 16688 + }, + { + "epoch": 3.559940273037543, + "grad_norm": 0.261832983894678, + "learning_rate": 9.153295511281617e-05, + "loss": 0.9102, + "step": 16689 + }, + { + "epoch": 3.5601535836177476, + "grad_norm": 0.24403656582508504, + "learning_rate": 9.152453417423063e-05, + "loss": 0.9306, + "step": 16690 + }, + { + "epoch": 3.5603668941979523, + "grad_norm": 0.2738551894855129, + "learning_rate": 9.151611310524481e-05, + "loss": 0.9173, + "step": 16691 + }, + { + "epoch": 3.560580204778157, + "grad_norm": 0.2081475736380433, + "learning_rate": 9.150769190595396e-05, + "loss": 0.9322, + "step": 16692 + }, + { + "epoch": 3.5607935153583616, + "grad_norm": 0.25769168320933383, + "learning_rate": 9.14992705764534e-05, + "loss": 0.9595, + "step": 16693 + }, + { + "epoch": 3.5610068259385663, + "grad_norm": 0.3256044438039443, + "learning_rate": 9.149084911683837e-05, + "loss": 0.9278, + "step": 16694 + }, + { + "epoch": 3.5612201365187715, + "grad_norm": 0.3337157597761392, + "learning_rate": 9.148242752720422e-05, + "loss": 0.8951, + "step": 16695 + }, + { + "epoch": 3.561433447098976, + "grad_norm": 0.3006242114004511, + "learning_rate": 9.147400580764619e-05, + "loss": 0.9213, + "step": 16696 + }, + { + "epoch": 3.561646757679181, + "grad_norm": 0.27473847439505944, + "learning_rate": 9.146558395825959e-05, + "loss": 0.9243, + "step": 16697 + }, + { + "epoch": 3.5618600682593855, + "grad_norm": 0.1907624672337887, + "learning_rate": 9.145716197913973e-05, + "loss": 0.9148, + "step": 16698 + }, + { + "epoch": 3.5620733788395906, + "grad_norm": 0.235032209669029, + "learning_rate": 9.144873987038188e-05, + "loss": 0.9204, + "step": 16699 + }, + { + "epoch": 3.5622866894197953, + "grad_norm": 0.32669722445187094, + "learning_rate": 9.144031763208134e-05, + "loss": 0.9132, + "step": 16700 + }, + { + "epoch": 3.5625, + "grad_norm": 0.18042756276687136, + "learning_rate": 9.143189526433341e-05, + "loss": 0.8969, + "step": 16701 + }, + { + "epoch": 3.5627133105802047, + "grad_norm": 0.24451417194414818, + "learning_rate": 9.142347276723342e-05, + "loss": 0.8959, + "step": 16702 + }, + { + "epoch": 3.5629266211604094, + "grad_norm": 0.2653263746461672, + "learning_rate": 9.141505014087662e-05, + "loss": 0.9227, + "step": 16703 + }, + { + "epoch": 3.5631399317406145, + "grad_norm": 0.2163543535345907, + "learning_rate": 9.140662738535834e-05, + "loss": 0.9169, + "step": 16704 + }, + { + "epoch": 3.563353242320819, + "grad_norm": 0.20116312759950844, + "learning_rate": 9.139820450077388e-05, + "loss": 0.9167, + "step": 16705 + }, + { + "epoch": 3.563566552901024, + "grad_norm": 0.2617242109458198, + "learning_rate": 9.138978148721855e-05, + "loss": 0.9331, + "step": 16706 + }, + { + "epoch": 3.5637798634812285, + "grad_norm": 0.1918600702066726, + "learning_rate": 9.138135834478764e-05, + "loss": 0.9489, + "step": 16707 + }, + { + "epoch": 3.5639931740614337, + "grad_norm": 0.22413932680315746, + "learning_rate": 9.137293507357648e-05, + "loss": 0.9345, + "step": 16708 + }, + { + "epoch": 3.5642064846416384, + "grad_norm": 0.2737886475131055, + "learning_rate": 9.136451167368036e-05, + "loss": 0.9344, + "step": 16709 + }, + { + "epoch": 3.564419795221843, + "grad_norm": 0.23083948629860565, + "learning_rate": 9.13560881451946e-05, + "loss": 0.9232, + "step": 16710 + }, + { + "epoch": 3.5646331058020477, + "grad_norm": 0.24763609509677764, + "learning_rate": 9.134766448821452e-05, + "loss": 0.9111, + "step": 16711 + }, + { + "epoch": 3.5648464163822524, + "grad_norm": 0.2498868494378917, + "learning_rate": 9.133924070283542e-05, + "loss": 0.9076, + "step": 16712 + }, + { + "epoch": 3.565059726962457, + "grad_norm": 0.2153806621824494, + "learning_rate": 9.133081678915261e-05, + "loss": 0.9153, + "step": 16713 + }, + { + "epoch": 3.5652730375426622, + "grad_norm": 0.23183424976118097, + "learning_rate": 9.132239274726145e-05, + "loss": 0.9249, + "step": 16714 + }, + { + "epoch": 3.565486348122867, + "grad_norm": 0.17563082924149584, + "learning_rate": 9.131396857725719e-05, + "loss": 0.9053, + "step": 16715 + }, + { + "epoch": 3.5656996587030716, + "grad_norm": 0.22662371205974208, + "learning_rate": 9.13055442792352e-05, + "loss": 0.9222, + "step": 16716 + }, + { + "epoch": 3.5659129692832767, + "grad_norm": 0.23691511792689193, + "learning_rate": 9.129711985329077e-05, + "loss": 0.9151, + "step": 16717 + }, + { + "epoch": 3.5661262798634814, + "grad_norm": 0.25213217018727824, + "learning_rate": 9.128869529951926e-05, + "loss": 0.9098, + "step": 16718 + }, + { + "epoch": 3.566339590443686, + "grad_norm": 0.2511829496820534, + "learning_rate": 9.128027061801597e-05, + "loss": 0.889, + "step": 16719 + }, + { + "epoch": 3.5665529010238908, + "grad_norm": 0.21395824753407128, + "learning_rate": 9.127184580887623e-05, + "loss": 0.9184, + "step": 16720 + }, + { + "epoch": 3.5667662116040955, + "grad_norm": 0.17647528798689532, + "learning_rate": 9.126342087219537e-05, + "loss": 0.9409, + "step": 16721 + }, + { + "epoch": 3.5669795221843, + "grad_norm": 0.25543359051912473, + "learning_rate": 9.12549958080687e-05, + "loss": 0.9531, + "step": 16722 + }, + { + "epoch": 3.5671928327645053, + "grad_norm": 0.32529443721501927, + "learning_rate": 9.124657061659156e-05, + "loss": 0.9285, + "step": 16723 + }, + { + "epoch": 3.56740614334471, + "grad_norm": 0.2838291785531942, + "learning_rate": 9.123814529785928e-05, + "loss": 0.9356, + "step": 16724 + }, + { + "epoch": 3.5676194539249146, + "grad_norm": 0.25089054301426406, + "learning_rate": 9.122971985196721e-05, + "loss": 0.9319, + "step": 16725 + }, + { + "epoch": 3.5678327645051193, + "grad_norm": 0.3265411915354465, + "learning_rate": 9.122129427901069e-05, + "loss": 0.9177, + "step": 16726 + }, + { + "epoch": 3.5680460750853245, + "grad_norm": 0.24656928620287127, + "learning_rate": 9.121286857908502e-05, + "loss": 0.9166, + "step": 16727 + }, + { + "epoch": 3.568259385665529, + "grad_norm": 0.33550287931896733, + "learning_rate": 9.120444275228552e-05, + "loss": 0.9263, + "step": 16728 + }, + { + "epoch": 3.568472696245734, + "grad_norm": 0.3853794297377655, + "learning_rate": 9.11960167987076e-05, + "loss": 0.9343, + "step": 16729 + }, + { + "epoch": 3.5686860068259385, + "grad_norm": 0.2954474711578379, + "learning_rate": 9.118759071844655e-05, + "loss": 0.9224, + "step": 16730 + }, + { + "epoch": 3.568899317406143, + "grad_norm": 0.33591704099147096, + "learning_rate": 9.117916451159772e-05, + "loss": 0.8796, + "step": 16731 + }, + { + "epoch": 3.569112627986348, + "grad_norm": 0.23710768632092954, + "learning_rate": 9.117073817825647e-05, + "loss": 0.8906, + "step": 16732 + }, + { + "epoch": 3.569325938566553, + "grad_norm": 0.18827624284941688, + "learning_rate": 9.116231171851812e-05, + "loss": 0.9264, + "step": 16733 + }, + { + "epoch": 3.5695392491467577, + "grad_norm": 0.24552518055807987, + "learning_rate": 9.115388513247801e-05, + "loss": 0.9252, + "step": 16734 + }, + { + "epoch": 3.5697525597269624, + "grad_norm": 0.2340840232756508, + "learning_rate": 9.114545842023154e-05, + "loss": 0.8975, + "step": 16735 + }, + { + "epoch": 3.5699658703071675, + "grad_norm": 0.19011128670082486, + "learning_rate": 9.113703158187399e-05, + "loss": 0.9317, + "step": 16736 + }, + { + "epoch": 3.570179180887372, + "grad_norm": 0.19253999983129266, + "learning_rate": 9.112860461750075e-05, + "loss": 0.9328, + "step": 16737 + }, + { + "epoch": 3.570392491467577, + "grad_norm": 0.22260084188632043, + "learning_rate": 9.112017752720716e-05, + "loss": 0.903, + "step": 16738 + }, + { + "epoch": 3.5706058020477816, + "grad_norm": 0.1970467415672955, + "learning_rate": 9.111175031108857e-05, + "loss": 0.9026, + "step": 16739 + }, + { + "epoch": 3.5708191126279862, + "grad_norm": 0.21482406264666018, + "learning_rate": 9.110332296924034e-05, + "loss": 0.9297, + "step": 16740 + }, + { + "epoch": 3.571032423208191, + "grad_norm": 0.2213203344361181, + "learning_rate": 9.109489550175784e-05, + "loss": 0.9374, + "step": 16741 + }, + { + "epoch": 3.571245733788396, + "grad_norm": 0.2029310364765209, + "learning_rate": 9.108646790873638e-05, + "loss": 0.9099, + "step": 16742 + }, + { + "epoch": 3.5714590443686007, + "grad_norm": 0.2329827821576141, + "learning_rate": 9.10780401902714e-05, + "loss": 0.906, + "step": 16743 + }, + { + "epoch": 3.5716723549488054, + "grad_norm": 0.20561585681906075, + "learning_rate": 9.106961234645815e-05, + "loss": 0.9294, + "step": 16744 + }, + { + "epoch": 3.57188566552901, + "grad_norm": 0.2037164221951895, + "learning_rate": 9.106118437739207e-05, + "loss": 0.9062, + "step": 16745 + }, + { + "epoch": 3.5720989761092152, + "grad_norm": 0.2361639770317045, + "learning_rate": 9.10527562831685e-05, + "loss": 0.8913, + "step": 16746 + }, + { + "epoch": 3.57231228668942, + "grad_norm": 0.20488936428348262, + "learning_rate": 9.104432806388283e-05, + "loss": 0.8975, + "step": 16747 + }, + { + "epoch": 3.5725255972696246, + "grad_norm": 0.19423555933721004, + "learning_rate": 9.103589971963038e-05, + "loss": 0.949, + "step": 16748 + }, + { + "epoch": 3.5727389078498293, + "grad_norm": 0.23021175194174331, + "learning_rate": 9.102747125050656e-05, + "loss": 0.919, + "step": 16749 + }, + { + "epoch": 3.572952218430034, + "grad_norm": 0.24006263720438495, + "learning_rate": 9.10190426566067e-05, + "loss": 0.9358, + "step": 16750 + }, + { + "epoch": 3.5731655290102387, + "grad_norm": 0.27468499987385914, + "learning_rate": 9.101061393802619e-05, + "loss": 0.9332, + "step": 16751 + }, + { + "epoch": 3.573378839590444, + "grad_norm": 0.33649250863701247, + "learning_rate": 9.100218509486039e-05, + "loss": 0.8928, + "step": 16752 + }, + { + "epoch": 3.5735921501706485, + "grad_norm": 0.24128210125586222, + "learning_rate": 9.099375612720469e-05, + "loss": 0.8872, + "step": 16753 + }, + { + "epoch": 3.573805460750853, + "grad_norm": 0.22821730471593973, + "learning_rate": 9.098532703515445e-05, + "loss": 0.9154, + "step": 16754 + }, + { + "epoch": 3.5740187713310583, + "grad_norm": 0.2831841328281821, + "learning_rate": 9.097689781880506e-05, + "loss": 0.886, + "step": 16755 + }, + { + "epoch": 3.574232081911263, + "grad_norm": 0.20529696943733436, + "learning_rate": 9.096846847825188e-05, + "loss": 0.8862, + "step": 16756 + }, + { + "epoch": 3.5744453924914676, + "grad_norm": 0.26092529983869744, + "learning_rate": 9.09600390135903e-05, + "loss": 0.9267, + "step": 16757 + }, + { + "epoch": 3.5746587030716723, + "grad_norm": 0.2383424824038217, + "learning_rate": 9.095160942491569e-05, + "loss": 0.93, + "step": 16758 + }, + { + "epoch": 3.574872013651877, + "grad_norm": 0.23888663624220402, + "learning_rate": 9.094317971232343e-05, + "loss": 0.9283, + "step": 16759 + }, + { + "epoch": 3.5750853242320817, + "grad_norm": 0.21847782495429624, + "learning_rate": 9.093474987590892e-05, + "loss": 0.9248, + "step": 16760 + }, + { + "epoch": 3.575298634812287, + "grad_norm": 0.240803196984832, + "learning_rate": 9.092631991576753e-05, + "loss": 0.9297, + "step": 16761 + }, + { + "epoch": 3.5755119453924915, + "grad_norm": 0.26237427446973943, + "learning_rate": 9.091788983199461e-05, + "loss": 0.9157, + "step": 16762 + }, + { + "epoch": 3.575725255972696, + "grad_norm": 0.2233660163594995, + "learning_rate": 9.090945962468562e-05, + "loss": 0.9194, + "step": 16763 + }, + { + "epoch": 3.575938566552901, + "grad_norm": 0.30581785146529966, + "learning_rate": 9.09010292939359e-05, + "loss": 0.9359, + "step": 16764 + }, + { + "epoch": 3.576151877133106, + "grad_norm": 0.336404341324729, + "learning_rate": 9.089259883984086e-05, + "loss": 0.9121, + "step": 16765 + }, + { + "epoch": 3.5763651877133107, + "grad_norm": 0.19578830305638434, + "learning_rate": 9.088416826249588e-05, + "loss": 0.9178, + "step": 16766 + }, + { + "epoch": 3.5765784982935154, + "grad_norm": 0.21574177373716874, + "learning_rate": 9.087573756199634e-05, + "loss": 0.8986, + "step": 16767 + }, + { + "epoch": 3.57679180887372, + "grad_norm": 0.2533976746982529, + "learning_rate": 9.086730673843765e-05, + "loss": 0.906, + "step": 16768 + }, + { + "epoch": 3.5770051194539247, + "grad_norm": 0.3481446675237384, + "learning_rate": 9.085887579191522e-05, + "loss": 0.9117, + "step": 16769 + }, + { + "epoch": 3.5772184300341294, + "grad_norm": 0.3983103482585481, + "learning_rate": 9.085044472252439e-05, + "loss": 0.8864, + "step": 16770 + }, + { + "epoch": 3.5774317406143346, + "grad_norm": 0.327723958839645, + "learning_rate": 9.084201353036064e-05, + "loss": 0.9291, + "step": 16771 + }, + { + "epoch": 3.5776450511945392, + "grad_norm": 0.22463774534532518, + "learning_rate": 9.08335822155193e-05, + "loss": 0.9459, + "step": 16772 + }, + { + "epoch": 3.577858361774744, + "grad_norm": 0.2001899732156699, + "learning_rate": 9.082515077809579e-05, + "loss": 0.9037, + "step": 16773 + }, + { + "epoch": 3.578071672354949, + "grad_norm": 0.31637022601013115, + "learning_rate": 9.081671921818552e-05, + "loss": 0.9049, + "step": 16774 + }, + { + "epoch": 3.5782849829351537, + "grad_norm": 0.4021599752800921, + "learning_rate": 9.08082875358839e-05, + "loss": 0.9059, + "step": 16775 + }, + { + "epoch": 3.5784982935153584, + "grad_norm": 0.3949547908284647, + "learning_rate": 9.079985573128631e-05, + "loss": 0.8995, + "step": 16776 + }, + { + "epoch": 3.578711604095563, + "grad_norm": 0.328629245273382, + "learning_rate": 9.079142380448819e-05, + "loss": 0.9134, + "step": 16777 + }, + { + "epoch": 3.578924914675768, + "grad_norm": 0.21363969445185158, + "learning_rate": 9.078299175558492e-05, + "loss": 0.9143, + "step": 16778 + }, + { + "epoch": 3.5791382252559725, + "grad_norm": 0.33360756406435393, + "learning_rate": 9.077455958467192e-05, + "loss": 0.917, + "step": 16779 + }, + { + "epoch": 3.5793515358361776, + "grad_norm": 0.4890826505800377, + "learning_rate": 9.076612729184458e-05, + "loss": 0.8891, + "step": 16780 + }, + { + "epoch": 3.5795648464163823, + "grad_norm": 0.49378572403352033, + "learning_rate": 9.075769487719834e-05, + "loss": 0.9176, + "step": 16781 + }, + { + "epoch": 3.579778156996587, + "grad_norm": 0.38851573075262463, + "learning_rate": 9.074926234082861e-05, + "loss": 0.904, + "step": 16782 + }, + { + "epoch": 3.5799914675767917, + "grad_norm": 0.23663956002674202, + "learning_rate": 9.07408296828308e-05, + "loss": 0.907, + "step": 16783 + }, + { + "epoch": 3.580204778156997, + "grad_norm": 0.1640186808535594, + "learning_rate": 9.073239690330029e-05, + "loss": 0.9024, + "step": 16784 + }, + { + "epoch": 3.5804180887372015, + "grad_norm": 0.2898606049912141, + "learning_rate": 9.072396400233257e-05, + "loss": 0.9357, + "step": 16785 + }, + { + "epoch": 3.580631399317406, + "grad_norm": 0.3971632589225573, + "learning_rate": 9.071553098002296e-05, + "loss": 0.9282, + "step": 16786 + }, + { + "epoch": 3.580844709897611, + "grad_norm": 0.4005279272891894, + "learning_rate": 9.0707097836467e-05, + "loss": 0.9353, + "step": 16787 + }, + { + "epoch": 3.5810580204778155, + "grad_norm": 0.3106843537007251, + "learning_rate": 9.069866457176001e-05, + "loss": 0.9069, + "step": 16788 + }, + { + "epoch": 3.58127133105802, + "grad_norm": 0.21633537732473576, + "learning_rate": 9.069023118599747e-05, + "loss": 0.9289, + "step": 16789 + }, + { + "epoch": 3.5814846416382253, + "grad_norm": 0.2891907345459864, + "learning_rate": 9.068179767927476e-05, + "loss": 0.8977, + "step": 16790 + }, + { + "epoch": 3.58169795221843, + "grad_norm": 0.345542364905394, + "learning_rate": 9.067336405168736e-05, + "loss": 0.9183, + "step": 16791 + }, + { + "epoch": 3.5819112627986347, + "grad_norm": 0.3478838875141136, + "learning_rate": 9.066493030333063e-05, + "loss": 0.976, + "step": 16792 + }, + { + "epoch": 3.58212457337884, + "grad_norm": 0.30015416147739543, + "learning_rate": 9.065649643430006e-05, + "loss": 0.912, + "step": 16793 + }, + { + "epoch": 3.5823378839590445, + "grad_norm": 0.2664943198516501, + "learning_rate": 9.064806244469104e-05, + "loss": 0.9161, + "step": 16794 + }, + { + "epoch": 3.582551194539249, + "grad_norm": 0.2141795654373821, + "learning_rate": 9.063962833459904e-05, + "loss": 0.916, + "step": 16795 + }, + { + "epoch": 3.582764505119454, + "grad_norm": 0.2574489617297598, + "learning_rate": 9.063119410411944e-05, + "loss": 0.9228, + "step": 16796 + }, + { + "epoch": 3.5829778156996586, + "grad_norm": 0.2892521650385926, + "learning_rate": 9.062275975334772e-05, + "loss": 0.9386, + "step": 16797 + }, + { + "epoch": 3.5831911262798632, + "grad_norm": 0.2612666275338504, + "learning_rate": 9.061432528237928e-05, + "loss": 0.9333, + "step": 16798 + }, + { + "epoch": 3.5834044368600684, + "grad_norm": 0.2078778956351431, + "learning_rate": 9.060589069130958e-05, + "loss": 0.9134, + "step": 16799 + }, + { + "epoch": 3.583617747440273, + "grad_norm": 0.23972472226453, + "learning_rate": 9.059745598023404e-05, + "loss": 0.9049, + "step": 16800 + }, + { + "epoch": 3.5838310580204777, + "grad_norm": 0.26290406531290905, + "learning_rate": 9.05890211492481e-05, + "loss": 0.9007, + "step": 16801 + }, + { + "epoch": 3.5840443686006824, + "grad_norm": 0.3065939475455041, + "learning_rate": 9.058058619844721e-05, + "loss": 0.9079, + "step": 16802 + }, + { + "epoch": 3.5842576791808876, + "grad_norm": 0.3107063657084852, + "learning_rate": 9.057215112792682e-05, + "loss": 0.9062, + "step": 16803 + }, + { + "epoch": 3.5844709897610922, + "grad_norm": 0.2720596225172823, + "learning_rate": 9.056371593778235e-05, + "loss": 0.9491, + "step": 16804 + }, + { + "epoch": 3.584684300341297, + "grad_norm": 0.24623605229657317, + "learning_rate": 9.05552806281093e-05, + "loss": 0.893, + "step": 16805 + }, + { + "epoch": 3.5848976109215016, + "grad_norm": 0.2331194020680974, + "learning_rate": 9.054684519900301e-05, + "loss": 0.8943, + "step": 16806 + }, + { + "epoch": 3.5851109215017063, + "grad_norm": 0.22337832758293716, + "learning_rate": 9.053840965055902e-05, + "loss": 0.8995, + "step": 16807 + }, + { + "epoch": 3.5853242320819114, + "grad_norm": 0.2929646772402708, + "learning_rate": 9.052997398287273e-05, + "loss": 0.918, + "step": 16808 + }, + { + "epoch": 3.585537542662116, + "grad_norm": 0.28684135075973566, + "learning_rate": 9.052153819603962e-05, + "loss": 0.9302, + "step": 16809 + }, + { + "epoch": 3.585750853242321, + "grad_norm": 0.2489550107026264, + "learning_rate": 9.051310229015514e-05, + "loss": 0.9586, + "step": 16810 + }, + { + "epoch": 3.5859641638225255, + "grad_norm": 0.2098523981921272, + "learning_rate": 9.050466626531474e-05, + "loss": 0.9655, + "step": 16811 + }, + { + "epoch": 3.5861774744027306, + "grad_norm": 0.2243830498272589, + "learning_rate": 9.049623012161384e-05, + "loss": 0.911, + "step": 16812 + }, + { + "epoch": 3.5863907849829353, + "grad_norm": 0.2371582895385503, + "learning_rate": 9.048779385914792e-05, + "loss": 0.9139, + "step": 16813 + }, + { + "epoch": 3.58660409556314, + "grad_norm": 0.27779353057112294, + "learning_rate": 9.047935747801243e-05, + "loss": 0.9634, + "step": 16814 + }, + { + "epoch": 3.5868174061433447, + "grad_norm": 0.30547692064237564, + "learning_rate": 9.047092097830284e-05, + "loss": 0.9366, + "step": 16815 + }, + { + "epoch": 3.5870307167235493, + "grad_norm": 0.2562323508096785, + "learning_rate": 9.046248436011463e-05, + "loss": 0.877, + "step": 16816 + }, + { + "epoch": 3.587244027303754, + "grad_norm": 0.2055300205255474, + "learning_rate": 9.045404762354321e-05, + "loss": 0.9119, + "step": 16817 + }, + { + "epoch": 3.587457337883959, + "grad_norm": 0.19193236810504274, + "learning_rate": 9.044561076868406e-05, + "loss": 0.9132, + "step": 16818 + }, + { + "epoch": 3.587670648464164, + "grad_norm": 0.19686865215948057, + "learning_rate": 9.043717379563266e-05, + "loss": 0.9188, + "step": 16819 + }, + { + "epoch": 3.5878839590443685, + "grad_norm": 0.21308862910011603, + "learning_rate": 9.042873670448444e-05, + "loss": 0.9267, + "step": 16820 + }, + { + "epoch": 3.5880972696245736, + "grad_norm": 0.17176908375039857, + "learning_rate": 9.042029949533492e-05, + "loss": 0.9226, + "step": 16821 + }, + { + "epoch": 3.5883105802047783, + "grad_norm": 0.19646993198440596, + "learning_rate": 9.041186216827953e-05, + "loss": 0.9333, + "step": 16822 + }, + { + "epoch": 3.588523890784983, + "grad_norm": 0.18104777397423913, + "learning_rate": 9.040342472341373e-05, + "loss": 0.9098, + "step": 16823 + }, + { + "epoch": 3.5887372013651877, + "grad_norm": 0.1819997729155731, + "learning_rate": 9.0394987160833e-05, + "loss": 0.9474, + "step": 16824 + }, + { + "epoch": 3.5889505119453924, + "grad_norm": 0.22480740428787496, + "learning_rate": 9.038654948063283e-05, + "loss": 0.9039, + "step": 16825 + }, + { + "epoch": 3.589163822525597, + "grad_norm": 0.18649946681423343, + "learning_rate": 9.037811168290868e-05, + "loss": 0.927, + "step": 16826 + }, + { + "epoch": 3.589377133105802, + "grad_norm": 0.1754871898527514, + "learning_rate": 9.036967376775604e-05, + "loss": 0.9168, + "step": 16827 + }, + { + "epoch": 3.589590443686007, + "grad_norm": 0.1962704860523616, + "learning_rate": 9.036123573527033e-05, + "loss": 0.8861, + "step": 16828 + }, + { + "epoch": 3.5898037542662116, + "grad_norm": 0.16898578800071043, + "learning_rate": 9.035279758554706e-05, + "loss": 0.9107, + "step": 16829 + }, + { + "epoch": 3.5900170648464163, + "grad_norm": 0.2329196807507714, + "learning_rate": 9.034435931868173e-05, + "loss": 0.9333, + "step": 16830 + }, + { + "epoch": 3.5902303754266214, + "grad_norm": 0.2738144025696322, + "learning_rate": 9.03359209347698e-05, + "loss": 0.9139, + "step": 16831 + }, + { + "epoch": 3.590443686006826, + "grad_norm": 0.2446488096214398, + "learning_rate": 9.032748243390675e-05, + "loss": 0.902, + "step": 16832 + }, + { + "epoch": 3.5906569965870307, + "grad_norm": 0.19530287922615794, + "learning_rate": 9.031904381618808e-05, + "loss": 0.9094, + "step": 16833 + }, + { + "epoch": 3.5908703071672354, + "grad_norm": 0.20422853262285368, + "learning_rate": 9.031060508170923e-05, + "loss": 0.9055, + "step": 16834 + }, + { + "epoch": 3.59108361774744, + "grad_norm": 0.19483188351797948, + "learning_rate": 9.030216623056572e-05, + "loss": 0.9009, + "step": 16835 + }, + { + "epoch": 3.591296928327645, + "grad_norm": 0.22844961639846145, + "learning_rate": 9.029372726285302e-05, + "loss": 0.9152, + "step": 16836 + }, + { + "epoch": 3.59151023890785, + "grad_norm": 0.2478443784893515, + "learning_rate": 9.028528817866662e-05, + "loss": 0.9144, + "step": 16837 + }, + { + "epoch": 3.5917235494880546, + "grad_norm": 0.25187699190393814, + "learning_rate": 9.027684897810203e-05, + "loss": 0.8862, + "step": 16838 + }, + { + "epoch": 3.5919368600682593, + "grad_norm": 0.23126143328855805, + "learning_rate": 9.02684096612547e-05, + "loss": 0.8987, + "step": 16839 + }, + { + "epoch": 3.5921501706484644, + "grad_norm": 0.2102641070807965, + "learning_rate": 9.025997022822014e-05, + "loss": 0.9111, + "step": 16840 + }, + { + "epoch": 3.592363481228669, + "grad_norm": 0.2550973648492022, + "learning_rate": 9.025153067909386e-05, + "loss": 0.9374, + "step": 16841 + }, + { + "epoch": 3.592576791808874, + "grad_norm": 0.31446193537394573, + "learning_rate": 9.024309101397133e-05, + "loss": 0.8647, + "step": 16842 + }, + { + "epoch": 3.5927901023890785, + "grad_norm": 0.238917915503262, + "learning_rate": 9.023465123294806e-05, + "loss": 0.9248, + "step": 16843 + }, + { + "epoch": 3.593003412969283, + "grad_norm": 0.22131723791604155, + "learning_rate": 9.022621133611953e-05, + "loss": 0.9164, + "step": 16844 + }, + { + "epoch": 3.593216723549488, + "grad_norm": 0.24296348375695967, + "learning_rate": 9.021777132358125e-05, + "loss": 0.9433, + "step": 16845 + }, + { + "epoch": 3.593430034129693, + "grad_norm": 0.21068340531783492, + "learning_rate": 9.020933119542872e-05, + "loss": 0.9262, + "step": 16846 + }, + { + "epoch": 3.5936433447098977, + "grad_norm": 0.2747553383553695, + "learning_rate": 9.020089095175744e-05, + "loss": 0.8952, + "step": 16847 + }, + { + "epoch": 3.5938566552901023, + "grad_norm": 0.3189177623133275, + "learning_rate": 9.019245059266288e-05, + "loss": 0.9533, + "step": 16848 + }, + { + "epoch": 3.594069965870307, + "grad_norm": 0.27465220509130095, + "learning_rate": 9.01840101182406e-05, + "loss": 0.9377, + "step": 16849 + }, + { + "epoch": 3.594283276450512, + "grad_norm": 0.23644189421989723, + "learning_rate": 9.017556952858606e-05, + "loss": 0.9169, + "step": 16850 + }, + { + "epoch": 3.594496587030717, + "grad_norm": 0.25534792095627923, + "learning_rate": 9.01671288237948e-05, + "loss": 0.905, + "step": 16851 + }, + { + "epoch": 3.5947098976109215, + "grad_norm": 0.27593924654578317, + "learning_rate": 9.015868800396227e-05, + "loss": 0.9135, + "step": 16852 + }, + { + "epoch": 3.594923208191126, + "grad_norm": 0.2640815483144139, + "learning_rate": 9.015024706918404e-05, + "loss": 0.9134, + "step": 16853 + }, + { + "epoch": 3.595136518771331, + "grad_norm": 0.2748450113140971, + "learning_rate": 9.014180601955557e-05, + "loss": 0.9209, + "step": 16854 + }, + { + "epoch": 3.5953498293515356, + "grad_norm": 0.1985819647817874, + "learning_rate": 9.013336485517244e-05, + "loss": 0.9145, + "step": 16855 + }, + { + "epoch": 3.5955631399317407, + "grad_norm": 0.20940908941689032, + "learning_rate": 9.012492357613008e-05, + "loss": 0.9148, + "step": 16856 + }, + { + "epoch": 3.5957764505119454, + "grad_norm": 0.26483059636678213, + "learning_rate": 9.011648218252407e-05, + "loss": 0.9176, + "step": 16857 + }, + { + "epoch": 3.59598976109215, + "grad_norm": 0.22319835850141326, + "learning_rate": 9.010804067444984e-05, + "loss": 0.8895, + "step": 16858 + }, + { + "epoch": 3.596203071672355, + "grad_norm": 0.25707326153713783, + "learning_rate": 9.009959905200301e-05, + "loss": 0.9224, + "step": 16859 + }, + { + "epoch": 3.59641638225256, + "grad_norm": 0.28596002957476674, + "learning_rate": 9.009115731527901e-05, + "loss": 0.9409, + "step": 16860 + }, + { + "epoch": 3.5966296928327646, + "grad_norm": 0.20175899669520833, + "learning_rate": 9.008271546437345e-05, + "loss": 0.9537, + "step": 16861 + }, + { + "epoch": 3.5968430034129693, + "grad_norm": 0.22250162776846466, + "learning_rate": 9.007427349938173e-05, + "loss": 0.927, + "step": 16862 + }, + { + "epoch": 3.597056313993174, + "grad_norm": 0.2746332412240832, + "learning_rate": 9.006583142039947e-05, + "loss": 0.925, + "step": 16863 + }, + { + "epoch": 3.5972696245733786, + "grad_norm": 0.284898498780904, + "learning_rate": 9.005738922752214e-05, + "loss": 0.9303, + "step": 16864 + }, + { + "epoch": 3.5974829351535837, + "grad_norm": 0.22600047967144482, + "learning_rate": 9.004894692084531e-05, + "loss": 0.9346, + "step": 16865 + }, + { + "epoch": 3.5976962457337884, + "grad_norm": 0.2055869398921551, + "learning_rate": 9.004050450046446e-05, + "loss": 0.9139, + "step": 16866 + }, + { + "epoch": 3.597909556313993, + "grad_norm": 0.19462916437834418, + "learning_rate": 9.003206196647513e-05, + "loss": 0.9401, + "step": 16867 + }, + { + "epoch": 3.598122866894198, + "grad_norm": 0.18602509780637055, + "learning_rate": 9.002361931897286e-05, + "loss": 0.8808, + "step": 16868 + }, + { + "epoch": 3.598336177474403, + "grad_norm": 0.17353827395024424, + "learning_rate": 9.001517655805317e-05, + "loss": 0.9036, + "step": 16869 + }, + { + "epoch": 3.5985494880546076, + "grad_norm": 0.1579956743825147, + "learning_rate": 9.000673368381156e-05, + "loss": 0.9349, + "step": 16870 + }, + { + "epoch": 3.5987627986348123, + "grad_norm": 0.19685439312396558, + "learning_rate": 8.999829069634363e-05, + "loss": 0.9163, + "step": 16871 + }, + { + "epoch": 3.598976109215017, + "grad_norm": 0.18807192590783633, + "learning_rate": 8.998984759574486e-05, + "loss": 0.9167, + "step": 16872 + }, + { + "epoch": 3.5991894197952217, + "grad_norm": 0.19122133754142276, + "learning_rate": 8.998140438211078e-05, + "loss": 0.9094, + "step": 16873 + }, + { + "epoch": 3.5994027303754264, + "grad_norm": 0.23209866610060248, + "learning_rate": 8.997296105553695e-05, + "loss": 0.9542, + "step": 16874 + }, + { + "epoch": 3.5996160409556315, + "grad_norm": 0.22595037954038583, + "learning_rate": 8.99645176161189e-05, + "loss": 0.8844, + "step": 16875 + }, + { + "epoch": 3.599829351535836, + "grad_norm": 0.2177326951543429, + "learning_rate": 8.995607406395216e-05, + "loss": 0.9398, + "step": 16876 + }, + { + "epoch": 3.600042662116041, + "grad_norm": 0.21732462822769855, + "learning_rate": 8.994763039913228e-05, + "loss": 0.9014, + "step": 16877 + }, + { + "epoch": 3.600255972696246, + "grad_norm": 0.20182630818650701, + "learning_rate": 8.99391866217548e-05, + "loss": 0.914, + "step": 16878 + }, + { + "epoch": 3.6004692832764507, + "grad_norm": 0.18663690211620604, + "learning_rate": 8.993074273191525e-05, + "loss": 0.9307, + "step": 16879 + }, + { + "epoch": 3.6006825938566553, + "grad_norm": 0.17151243699586385, + "learning_rate": 8.992229872970918e-05, + "loss": 0.8637, + "step": 16880 + }, + { + "epoch": 3.60089590443686, + "grad_norm": 0.21423609908271513, + "learning_rate": 8.991385461523214e-05, + "loss": 0.8984, + "step": 16881 + }, + { + "epoch": 3.6011092150170647, + "grad_norm": 0.21971434451921745, + "learning_rate": 8.990541038857965e-05, + "loss": 0.9218, + "step": 16882 + }, + { + "epoch": 3.6013225255972694, + "grad_norm": 0.2370436223784005, + "learning_rate": 8.989696604984729e-05, + "loss": 0.9213, + "step": 16883 + }, + { + "epoch": 3.6015358361774745, + "grad_norm": 0.2361006129262206, + "learning_rate": 8.988852159913059e-05, + "loss": 0.9133, + "step": 16884 + }, + { + "epoch": 3.601749146757679, + "grad_norm": 0.3443645449784196, + "learning_rate": 8.988007703652511e-05, + "loss": 0.8927, + "step": 16885 + }, + { + "epoch": 3.601962457337884, + "grad_norm": 0.3060649647694569, + "learning_rate": 8.987163236212636e-05, + "loss": 0.9218, + "step": 16886 + }, + { + "epoch": 3.6021757679180886, + "grad_norm": 0.18987962713413786, + "learning_rate": 8.986318757602998e-05, + "loss": 0.9316, + "step": 16887 + }, + { + "epoch": 3.6023890784982937, + "grad_norm": 0.20233854779232607, + "learning_rate": 8.985474267833141e-05, + "loss": 0.9054, + "step": 16888 + }, + { + "epoch": 3.6026023890784984, + "grad_norm": 0.3103963079244483, + "learning_rate": 8.984629766912628e-05, + "loss": 0.8793, + "step": 16889 + }, + { + "epoch": 3.602815699658703, + "grad_norm": 0.3373677440366234, + "learning_rate": 8.983785254851013e-05, + "loss": 0.945, + "step": 16890 + }, + { + "epoch": 3.6030290102389078, + "grad_norm": 0.2706901144974451, + "learning_rate": 8.982940731657852e-05, + "loss": 0.9345, + "step": 16891 + }, + { + "epoch": 3.6032423208191124, + "grad_norm": 0.21564851581772992, + "learning_rate": 8.9820961973427e-05, + "loss": 0.92, + "step": 16892 + }, + { + "epoch": 3.6034556313993176, + "grad_norm": 0.18097376702309106, + "learning_rate": 8.981251651915111e-05, + "loss": 0.9217, + "step": 16893 + }, + { + "epoch": 3.6036689419795223, + "grad_norm": 0.23415388362140596, + "learning_rate": 8.980407095384646e-05, + "loss": 0.9435, + "step": 16894 + }, + { + "epoch": 3.603882252559727, + "grad_norm": 0.2898862714803468, + "learning_rate": 8.979562527760856e-05, + "loss": 0.9612, + "step": 16895 + }, + { + "epoch": 3.6040955631399316, + "grad_norm": 0.29150231405777816, + "learning_rate": 8.9787179490533e-05, + "loss": 0.9046, + "step": 16896 + }, + { + "epoch": 3.6043088737201368, + "grad_norm": 0.23812181110641917, + "learning_rate": 8.977873359271535e-05, + "loss": 0.9077, + "step": 16897 + }, + { + "epoch": 3.6045221843003414, + "grad_norm": 0.2011542906372038, + "learning_rate": 8.977028758425115e-05, + "loss": 0.9343, + "step": 16898 + }, + { + "epoch": 3.604735494880546, + "grad_norm": 0.20705578381658216, + "learning_rate": 8.976184146523599e-05, + "loss": 0.8969, + "step": 16899 + }, + { + "epoch": 3.604948805460751, + "grad_norm": 0.2372897610678673, + "learning_rate": 8.975339523576543e-05, + "loss": 0.9249, + "step": 16900 + }, + { + "epoch": 3.6051621160409555, + "grad_norm": 0.24999885736691532, + "learning_rate": 8.974494889593501e-05, + "loss": 0.9381, + "step": 16901 + }, + { + "epoch": 3.60537542662116, + "grad_norm": 0.20058502740739356, + "learning_rate": 8.973650244584038e-05, + "loss": 0.9329, + "step": 16902 + }, + { + "epoch": 3.6055887372013653, + "grad_norm": 0.2127257833941484, + "learning_rate": 8.972805588557704e-05, + "loss": 0.9233, + "step": 16903 + }, + { + "epoch": 3.60580204778157, + "grad_norm": 0.23140643660767143, + "learning_rate": 8.971960921524056e-05, + "loss": 0.9007, + "step": 16904 + }, + { + "epoch": 3.6060153583617747, + "grad_norm": 0.1997382637063818, + "learning_rate": 8.971116243492658e-05, + "loss": 0.9633, + "step": 16905 + }, + { + "epoch": 3.60622866894198, + "grad_norm": 0.23209933520796033, + "learning_rate": 8.970271554473061e-05, + "loss": 0.8951, + "step": 16906 + }, + { + "epoch": 3.6064419795221845, + "grad_norm": 0.23838877100469746, + "learning_rate": 8.969426854474825e-05, + "loss": 0.9146, + "step": 16907 + }, + { + "epoch": 3.606655290102389, + "grad_norm": 0.21444823168910204, + "learning_rate": 8.968582143507509e-05, + "loss": 0.8982, + "step": 16908 + }, + { + "epoch": 3.606868600682594, + "grad_norm": 0.23583317107938778, + "learning_rate": 8.96773742158067e-05, + "loss": 0.9087, + "step": 16909 + }, + { + "epoch": 3.6070819112627985, + "grad_norm": 0.240857146018082, + "learning_rate": 8.966892688703862e-05, + "loss": 0.9081, + "step": 16910 + }, + { + "epoch": 3.607295221843003, + "grad_norm": 0.2229685456426983, + "learning_rate": 8.96604794488665e-05, + "loss": 0.9263, + "step": 16911 + }, + { + "epoch": 3.6075085324232083, + "grad_norm": 0.2302607959628027, + "learning_rate": 8.965203190138591e-05, + "loss": 0.9359, + "step": 16912 + }, + { + "epoch": 3.607721843003413, + "grad_norm": 0.21804512012402288, + "learning_rate": 8.964358424469237e-05, + "loss": 0.9262, + "step": 16913 + }, + { + "epoch": 3.6079351535836177, + "grad_norm": 0.24236042857796714, + "learning_rate": 8.963513647888156e-05, + "loss": 0.9318, + "step": 16914 + }, + { + "epoch": 3.6081484641638224, + "grad_norm": 0.24754437820989686, + "learning_rate": 8.962668860404898e-05, + "loss": 0.922, + "step": 16915 + }, + { + "epoch": 3.6083617747440275, + "grad_norm": 0.21662483056167028, + "learning_rate": 8.961824062029028e-05, + "loss": 0.9087, + "step": 16916 + }, + { + "epoch": 3.608575085324232, + "grad_norm": 0.23391891692689037, + "learning_rate": 8.960979252770102e-05, + "loss": 0.9154, + "step": 16917 + }, + { + "epoch": 3.608788395904437, + "grad_norm": 0.25811317255749655, + "learning_rate": 8.960134432637679e-05, + "loss": 0.9325, + "step": 16918 + }, + { + "epoch": 3.6090017064846416, + "grad_norm": 0.19518089682989317, + "learning_rate": 8.959289601641318e-05, + "loss": 0.9525, + "step": 16919 + }, + { + "epoch": 3.6092150170648463, + "grad_norm": 0.24545489891822003, + "learning_rate": 8.958444759790582e-05, + "loss": 0.9362, + "step": 16920 + }, + { + "epoch": 3.609428327645051, + "grad_norm": 0.2481569328587769, + "learning_rate": 8.957599907095026e-05, + "loss": 0.9019, + "step": 16921 + }, + { + "epoch": 3.609641638225256, + "grad_norm": 0.1985192551554891, + "learning_rate": 8.956755043564211e-05, + "loss": 0.8929, + "step": 16922 + }, + { + "epoch": 3.6098549488054608, + "grad_norm": 0.23947293057464508, + "learning_rate": 8.955910169207695e-05, + "loss": 0.933, + "step": 16923 + }, + { + "epoch": 3.6100682593856654, + "grad_norm": 0.23942966627436446, + "learning_rate": 8.95506528403504e-05, + "loss": 0.9233, + "step": 16924 + }, + { + "epoch": 3.6102815699658706, + "grad_norm": 0.19568485553853468, + "learning_rate": 8.954220388055806e-05, + "loss": 0.8862, + "step": 16925 + }, + { + "epoch": 3.6104948805460753, + "grad_norm": 0.20812312538214175, + "learning_rate": 8.953375481279552e-05, + "loss": 0.9314, + "step": 16926 + }, + { + "epoch": 3.61070819112628, + "grad_norm": 0.2691800490844441, + "learning_rate": 8.952530563715838e-05, + "loss": 0.9327, + "step": 16927 + }, + { + "epoch": 3.6109215017064846, + "grad_norm": 0.1893748962188842, + "learning_rate": 8.951685635374226e-05, + "loss": 0.8999, + "step": 16928 + }, + { + "epoch": 3.6111348122866893, + "grad_norm": 0.2288079569958444, + "learning_rate": 8.950840696264275e-05, + "loss": 0.9006, + "step": 16929 + }, + { + "epoch": 3.611348122866894, + "grad_norm": 0.23053339064276393, + "learning_rate": 8.949995746395544e-05, + "loss": 0.9475, + "step": 16930 + }, + { + "epoch": 3.611561433447099, + "grad_norm": 0.18654061099534874, + "learning_rate": 8.949150785777595e-05, + "loss": 0.9291, + "step": 16931 + }, + { + "epoch": 3.611774744027304, + "grad_norm": 0.23351133441017927, + "learning_rate": 8.94830581441999e-05, + "loss": 0.9339, + "step": 16932 + }, + { + "epoch": 3.6119880546075085, + "grad_norm": 0.21922841159286224, + "learning_rate": 8.947460832332289e-05, + "loss": 0.904, + "step": 16933 + }, + { + "epoch": 3.612201365187713, + "grad_norm": 0.25888063873720224, + "learning_rate": 8.946615839524053e-05, + "loss": 0.8963, + "step": 16934 + }, + { + "epoch": 3.6124146757679183, + "grad_norm": 0.2415310270250106, + "learning_rate": 8.94577083600484e-05, + "loss": 0.9117, + "step": 16935 + }, + { + "epoch": 3.612627986348123, + "grad_norm": 0.16701371958932407, + "learning_rate": 8.944925821784217e-05, + "loss": 0.8851, + "step": 16936 + }, + { + "epoch": 3.6128412969283277, + "grad_norm": 0.21018195866980274, + "learning_rate": 8.94408079687174e-05, + "loss": 0.9669, + "step": 16937 + }, + { + "epoch": 3.6130546075085324, + "grad_norm": 0.2670542351574269, + "learning_rate": 8.943235761276977e-05, + "loss": 0.8863, + "step": 16938 + }, + { + "epoch": 3.613267918088737, + "grad_norm": 0.24328161248486635, + "learning_rate": 8.942390715009483e-05, + "loss": 0.9091, + "step": 16939 + }, + { + "epoch": 3.6134812286689417, + "grad_norm": 0.2131328602562798, + "learning_rate": 8.941545658078822e-05, + "loss": 0.9348, + "step": 16940 + }, + { + "epoch": 3.613694539249147, + "grad_norm": 0.18135243271038884, + "learning_rate": 8.940700590494555e-05, + "loss": 0.938, + "step": 16941 + }, + { + "epoch": 3.6139078498293515, + "grad_norm": 0.214583523801609, + "learning_rate": 8.939855512266248e-05, + "loss": 0.9141, + "step": 16942 + }, + { + "epoch": 3.614121160409556, + "grad_norm": 0.29055079474214857, + "learning_rate": 8.939010423403457e-05, + "loss": 0.9485, + "step": 16943 + }, + { + "epoch": 3.6143344709897613, + "grad_norm": 0.3466325795788685, + "learning_rate": 8.938165323915749e-05, + "loss": 0.9393, + "step": 16944 + }, + { + "epoch": 3.614547781569966, + "grad_norm": 0.30729822076359103, + "learning_rate": 8.937320213812683e-05, + "loss": 0.9465, + "step": 16945 + }, + { + "epoch": 3.6147610921501707, + "grad_norm": 0.2629849479474489, + "learning_rate": 8.936475093103825e-05, + "loss": 0.9488, + "step": 16946 + }, + { + "epoch": 3.6149744027303754, + "grad_norm": 0.2315300049495154, + "learning_rate": 8.935629961798734e-05, + "loss": 0.9127, + "step": 16947 + }, + { + "epoch": 3.61518771331058, + "grad_norm": 0.1899159239688593, + "learning_rate": 8.934784819906976e-05, + "loss": 0.9582, + "step": 16948 + }, + { + "epoch": 3.6154010238907848, + "grad_norm": 0.2186277641774816, + "learning_rate": 8.933939667438109e-05, + "loss": 0.9097, + "step": 16949 + }, + { + "epoch": 3.61561433447099, + "grad_norm": 0.2418447978461663, + "learning_rate": 8.933094504401704e-05, + "loss": 0.9216, + "step": 16950 + }, + { + "epoch": 3.6158276450511946, + "grad_norm": 0.2073804977989427, + "learning_rate": 8.932249330807315e-05, + "loss": 0.923, + "step": 16951 + }, + { + "epoch": 3.6160409556313993, + "grad_norm": 0.1881884426228258, + "learning_rate": 8.93140414666451e-05, + "loss": 0.8923, + "step": 16952 + }, + { + "epoch": 3.616254266211604, + "grad_norm": 0.308439462036395, + "learning_rate": 8.93055895198285e-05, + "loss": 0.9207, + "step": 16953 + }, + { + "epoch": 3.616467576791809, + "grad_norm": 0.1758112084499686, + "learning_rate": 8.929713746771903e-05, + "loss": 0.9192, + "step": 16954 + }, + { + "epoch": 3.6166808873720138, + "grad_norm": 0.1921046711882369, + "learning_rate": 8.928868531041225e-05, + "loss": 0.8905, + "step": 16955 + }, + { + "epoch": 3.6168941979522184, + "grad_norm": 0.19251982854572372, + "learning_rate": 8.92802330480039e-05, + "loss": 0.8992, + "step": 16956 + }, + { + "epoch": 3.617107508532423, + "grad_norm": 0.1896190582239161, + "learning_rate": 8.92717806805895e-05, + "loss": 0.9115, + "step": 16957 + }, + { + "epoch": 3.617320819112628, + "grad_norm": 0.2094273399829705, + "learning_rate": 8.926332820826478e-05, + "loss": 0.9392, + "step": 16958 + }, + { + "epoch": 3.6175341296928325, + "grad_norm": 0.191656494377503, + "learning_rate": 8.925487563112534e-05, + "loss": 0.9285, + "step": 16959 + }, + { + "epoch": 3.6177474402730376, + "grad_norm": 0.20448717536503228, + "learning_rate": 8.924642294926682e-05, + "loss": 0.9349, + "step": 16960 + }, + { + "epoch": 3.6179607508532423, + "grad_norm": 0.1983377758585253, + "learning_rate": 8.923797016278485e-05, + "loss": 0.9054, + "step": 16961 + }, + { + "epoch": 3.618174061433447, + "grad_norm": 0.20493077758772746, + "learning_rate": 8.922951727177513e-05, + "loss": 0.9029, + "step": 16962 + }, + { + "epoch": 3.618387372013652, + "grad_norm": 0.22553713803924602, + "learning_rate": 8.922106427633324e-05, + "loss": 0.9029, + "step": 16963 + }, + { + "epoch": 3.618600682593857, + "grad_norm": 0.24495392555240778, + "learning_rate": 8.921261117655487e-05, + "loss": 0.9109, + "step": 16964 + }, + { + "epoch": 3.6188139931740615, + "grad_norm": 0.2697786035443202, + "learning_rate": 8.920415797253564e-05, + "loss": 0.9198, + "step": 16965 + }, + { + "epoch": 3.619027303754266, + "grad_norm": 0.19178958253936154, + "learning_rate": 8.919570466437121e-05, + "loss": 0.918, + "step": 16966 + }, + { + "epoch": 3.619240614334471, + "grad_norm": 0.22965289184495824, + "learning_rate": 8.918725125215723e-05, + "loss": 0.9228, + "step": 16967 + }, + { + "epoch": 3.6194539249146755, + "grad_norm": 0.2957195069274185, + "learning_rate": 8.917879773598935e-05, + "loss": 0.9155, + "step": 16968 + }, + { + "epoch": 3.6196672354948807, + "grad_norm": 0.20095117512259494, + "learning_rate": 8.91703441159632e-05, + "loss": 0.8819, + "step": 16969 + }, + { + "epoch": 3.6198805460750854, + "grad_norm": 0.1844439348452048, + "learning_rate": 8.916189039217447e-05, + "loss": 0.9567, + "step": 16970 + }, + { + "epoch": 3.62009385665529, + "grad_norm": 0.2678518200902118, + "learning_rate": 8.915343656471876e-05, + "loss": 0.9324, + "step": 16971 + }, + { + "epoch": 3.6203071672354947, + "grad_norm": 0.2216986779221716, + "learning_rate": 8.914498263369182e-05, + "loss": 0.9056, + "step": 16972 + }, + { + "epoch": 3.6205204778157, + "grad_norm": 0.1970311554812973, + "learning_rate": 8.913652859918922e-05, + "loss": 0.9068, + "step": 16973 + }, + { + "epoch": 3.6207337883959045, + "grad_norm": 0.321302459951899, + "learning_rate": 8.912807446130665e-05, + "loss": 0.954, + "step": 16974 + }, + { + "epoch": 3.6209470989761092, + "grad_norm": 0.35100810741152094, + "learning_rate": 8.911962022013974e-05, + "loss": 0.8905, + "step": 16975 + }, + { + "epoch": 3.621160409556314, + "grad_norm": 0.32899708025436436, + "learning_rate": 8.91111658757842e-05, + "loss": 0.9531, + "step": 16976 + }, + { + "epoch": 3.6213737201365186, + "grad_norm": 0.31658893504264524, + "learning_rate": 8.910271142833564e-05, + "loss": 0.9254, + "step": 16977 + }, + { + "epoch": 3.6215870307167233, + "grad_norm": 0.27743944882560573, + "learning_rate": 8.909425687788979e-05, + "loss": 0.9301, + "step": 16978 + }, + { + "epoch": 3.6218003412969284, + "grad_norm": 0.2220906515955294, + "learning_rate": 8.908580222454224e-05, + "loss": 0.9055, + "step": 16979 + }, + { + "epoch": 3.622013651877133, + "grad_norm": 0.283667717023736, + "learning_rate": 8.907734746838868e-05, + "loss": 0.9037, + "step": 16980 + }, + { + "epoch": 3.6222269624573378, + "grad_norm": 0.3827979004564645, + "learning_rate": 8.90688926095248e-05, + "loss": 0.9263, + "step": 16981 + }, + { + "epoch": 3.622440273037543, + "grad_norm": 0.3998724380139027, + "learning_rate": 8.906043764804623e-05, + "loss": 0.9146, + "step": 16982 + }, + { + "epoch": 3.6226535836177476, + "grad_norm": 0.24121972444926046, + "learning_rate": 8.905198258404864e-05, + "loss": 0.8932, + "step": 16983 + }, + { + "epoch": 3.6228668941979523, + "grad_norm": 0.1984728535780312, + "learning_rate": 8.904352741762778e-05, + "loss": 0.8987, + "step": 16984 + }, + { + "epoch": 3.623080204778157, + "grad_norm": 0.2823267107785445, + "learning_rate": 8.903507214887919e-05, + "loss": 0.921, + "step": 16985 + }, + { + "epoch": 3.6232935153583616, + "grad_norm": 0.29085601826013935, + "learning_rate": 8.902661677789864e-05, + "loss": 0.9133, + "step": 16986 + }, + { + "epoch": 3.6235068259385663, + "grad_norm": 0.33310099290205986, + "learning_rate": 8.901816130478175e-05, + "loss": 0.9013, + "step": 16987 + }, + { + "epoch": 3.6237201365187715, + "grad_norm": 0.3115821782199631, + "learning_rate": 8.900970572962422e-05, + "loss": 0.935, + "step": 16988 + }, + { + "epoch": 3.623933447098976, + "grad_norm": 0.22624923977627903, + "learning_rate": 8.900125005252172e-05, + "loss": 0.9234, + "step": 16989 + }, + { + "epoch": 3.624146757679181, + "grad_norm": 0.2647137320492485, + "learning_rate": 8.899279427356995e-05, + "loss": 0.9169, + "step": 16990 + }, + { + "epoch": 3.6243600682593855, + "grad_norm": 0.21601880083374808, + "learning_rate": 8.898433839286451e-05, + "loss": 0.9181, + "step": 16991 + }, + { + "epoch": 3.6245733788395906, + "grad_norm": 0.2009788008014019, + "learning_rate": 8.897588241050116e-05, + "loss": 0.9371, + "step": 16992 + }, + { + "epoch": 3.6247866894197953, + "grad_norm": 0.24481878971701476, + "learning_rate": 8.896742632657555e-05, + "loss": 0.8928, + "step": 16993 + }, + { + "epoch": 3.625, + "grad_norm": 0.26360245735495874, + "learning_rate": 8.895897014118335e-05, + "loss": 0.9292, + "step": 16994 + }, + { + "epoch": 3.6252133105802047, + "grad_norm": 0.24340212424903618, + "learning_rate": 8.895051385442028e-05, + "loss": 0.9438, + "step": 16995 + }, + { + "epoch": 3.6254266211604094, + "grad_norm": 0.1914454175690088, + "learning_rate": 8.894205746638197e-05, + "loss": 0.9027, + "step": 16996 + }, + { + "epoch": 3.6256399317406145, + "grad_norm": 0.19135854910275857, + "learning_rate": 8.893360097716411e-05, + "loss": 0.9281, + "step": 16997 + }, + { + "epoch": 3.625853242320819, + "grad_norm": 0.2279655874379276, + "learning_rate": 8.892514438686244e-05, + "loss": 0.9215, + "step": 16998 + }, + { + "epoch": 3.626066552901024, + "grad_norm": 0.2847557435881913, + "learning_rate": 8.89166876955726e-05, + "loss": 0.932, + "step": 16999 + }, + { + "epoch": 3.6262798634812285, + "grad_norm": 0.3350929447450729, + "learning_rate": 8.890823090339029e-05, + "loss": 0.9293, + "step": 17000 + }, + { + "epoch": 3.6264931740614337, + "grad_norm": 0.30426964475770746, + "learning_rate": 8.889977401041119e-05, + "loss": 0.9207, + "step": 17001 + }, + { + "epoch": 3.6267064846416384, + "grad_norm": 0.21152194241378441, + "learning_rate": 8.889131701673102e-05, + "loss": 0.9155, + "step": 17002 + }, + { + "epoch": 3.626919795221843, + "grad_norm": 0.24339376834204268, + "learning_rate": 8.888285992244542e-05, + "loss": 0.939, + "step": 17003 + }, + { + "epoch": 3.6271331058020477, + "grad_norm": 0.23280076345068676, + "learning_rate": 8.887440272765013e-05, + "loss": 0.9198, + "step": 17004 + }, + { + "epoch": 3.6273464163822524, + "grad_norm": 0.22486202943106556, + "learning_rate": 8.886594543244079e-05, + "loss": 0.8861, + "step": 17005 + }, + { + "epoch": 3.627559726962457, + "grad_norm": 0.3204629268413157, + "learning_rate": 8.885748803691318e-05, + "loss": 0.9271, + "step": 17006 + }, + { + "epoch": 3.6277730375426622, + "grad_norm": 0.32688645187452314, + "learning_rate": 8.88490305411629e-05, + "loss": 0.8778, + "step": 17007 + }, + { + "epoch": 3.627986348122867, + "grad_norm": 0.28739083415743144, + "learning_rate": 8.884057294528572e-05, + "loss": 0.9151, + "step": 17008 + }, + { + "epoch": 3.6281996587030716, + "grad_norm": 0.20611743381948264, + "learning_rate": 8.88321152493773e-05, + "loss": 0.9237, + "step": 17009 + }, + { + "epoch": 3.6284129692832767, + "grad_norm": 0.25377712139087666, + "learning_rate": 8.882365745353333e-05, + "loss": 0.9083, + "step": 17010 + }, + { + "epoch": 3.6286262798634814, + "grad_norm": 0.3340346890091451, + "learning_rate": 8.881519955784953e-05, + "loss": 0.9186, + "step": 17011 + }, + { + "epoch": 3.628839590443686, + "grad_norm": 0.35500039771231173, + "learning_rate": 8.880674156242163e-05, + "loss": 0.9427, + "step": 17012 + }, + { + "epoch": 3.6290529010238908, + "grad_norm": 0.3142187021000118, + "learning_rate": 8.879828346734527e-05, + "loss": 0.9252, + "step": 17013 + }, + { + "epoch": 3.6292662116040955, + "grad_norm": 0.20134294067143646, + "learning_rate": 8.878982527271622e-05, + "loss": 0.8959, + "step": 17014 + }, + { + "epoch": 3.6294795221843, + "grad_norm": 0.20858987169340562, + "learning_rate": 8.87813669786301e-05, + "loss": 0.926, + "step": 17015 + }, + { + "epoch": 3.6296928327645053, + "grad_norm": 0.20793556340483066, + "learning_rate": 8.877290858518269e-05, + "loss": 0.9098, + "step": 17016 + }, + { + "epoch": 3.62990614334471, + "grad_norm": 0.23827519578063663, + "learning_rate": 8.876445009246969e-05, + "loss": 0.8963, + "step": 17017 + }, + { + "epoch": 3.6301194539249146, + "grad_norm": 0.25416213688942435, + "learning_rate": 8.875599150058677e-05, + "loss": 0.9137, + "step": 17018 + }, + { + "epoch": 3.6303327645051193, + "grad_norm": 0.20027550496283716, + "learning_rate": 8.874753280962964e-05, + "loss": 0.9565, + "step": 17019 + }, + { + "epoch": 3.6305460750853245, + "grad_norm": 0.25426290110562044, + "learning_rate": 8.873907401969407e-05, + "loss": 0.9006, + "step": 17020 + }, + { + "epoch": 3.630759385665529, + "grad_norm": 0.23268275049492226, + "learning_rate": 8.873061513087569e-05, + "loss": 0.897, + "step": 17021 + }, + { + "epoch": 3.630972696245734, + "grad_norm": 0.24325632864479296, + "learning_rate": 8.872215614327028e-05, + "loss": 0.93, + "step": 17022 + }, + { + "epoch": 3.6311860068259385, + "grad_norm": 0.25632208347891244, + "learning_rate": 8.871369705697353e-05, + "loss": 0.9418, + "step": 17023 + }, + { + "epoch": 3.631399317406143, + "grad_norm": 0.22964204111428724, + "learning_rate": 8.870523787208115e-05, + "loss": 0.9172, + "step": 17024 + }, + { + "epoch": 3.631612627986348, + "grad_norm": 0.25095151399375043, + "learning_rate": 8.869677858868883e-05, + "loss": 0.9049, + "step": 17025 + }, + { + "epoch": 3.631825938566553, + "grad_norm": 0.19338525467331086, + "learning_rate": 8.868831920689234e-05, + "loss": 0.9165, + "step": 17026 + }, + { + "epoch": 3.6320392491467577, + "grad_norm": 0.2832427617444291, + "learning_rate": 8.867985972678734e-05, + "loss": 0.9514, + "step": 17027 + }, + { + "epoch": 3.6322525597269624, + "grad_norm": 0.22843066203706663, + "learning_rate": 8.867140014846961e-05, + "loss": 0.9318, + "step": 17028 + }, + { + "epoch": 3.6324658703071675, + "grad_norm": 0.2516877204477367, + "learning_rate": 8.866294047203484e-05, + "loss": 0.9363, + "step": 17029 + }, + { + "epoch": 3.632679180887372, + "grad_norm": 0.2699430928310845, + "learning_rate": 8.865448069757876e-05, + "loss": 0.9388, + "step": 17030 + }, + { + "epoch": 3.632892491467577, + "grad_norm": 0.21009925477572763, + "learning_rate": 8.864602082519707e-05, + "loss": 0.9435, + "step": 17031 + }, + { + "epoch": 3.6331058020477816, + "grad_norm": 0.19682295529532243, + "learning_rate": 8.863756085498551e-05, + "loss": 0.907, + "step": 17032 + }, + { + "epoch": 3.6333191126279862, + "grad_norm": 0.2203412646416129, + "learning_rate": 8.86291007870398e-05, + "loss": 0.9076, + "step": 17033 + }, + { + "epoch": 3.633532423208191, + "grad_norm": 0.16804677249728472, + "learning_rate": 8.862064062145569e-05, + "loss": 0.9304, + "step": 17034 + }, + { + "epoch": 3.633745733788396, + "grad_norm": 0.18537211340202903, + "learning_rate": 8.861218035832887e-05, + "loss": 0.9083, + "step": 17035 + }, + { + "epoch": 3.6339590443686007, + "grad_norm": 0.20945823916399234, + "learning_rate": 8.86037199977551e-05, + "loss": 0.9067, + "step": 17036 + }, + { + "epoch": 3.6341723549488054, + "grad_norm": 0.19569328271794437, + "learning_rate": 8.859525953983007e-05, + "loss": 0.9208, + "step": 17037 + }, + { + "epoch": 3.63438566552901, + "grad_norm": 0.22908127872045417, + "learning_rate": 8.858679898464955e-05, + "loss": 0.8979, + "step": 17038 + }, + { + "epoch": 3.6345989761092152, + "grad_norm": 0.258655357414626, + "learning_rate": 8.857833833230922e-05, + "loss": 0.9059, + "step": 17039 + }, + { + "epoch": 3.63481228668942, + "grad_norm": 0.2184745470303212, + "learning_rate": 8.85698775829049e-05, + "loss": 0.9004, + "step": 17040 + }, + { + "epoch": 3.6350255972696246, + "grad_norm": 0.22429074044508135, + "learning_rate": 8.856141673653225e-05, + "loss": 0.8988, + "step": 17041 + }, + { + "epoch": 3.6352389078498293, + "grad_norm": 0.15820435473899871, + "learning_rate": 8.855295579328703e-05, + "loss": 0.917, + "step": 17042 + }, + { + "epoch": 3.635452218430034, + "grad_norm": 0.20722496788239408, + "learning_rate": 8.854449475326494e-05, + "loss": 0.9046, + "step": 17043 + }, + { + "epoch": 3.6356655290102387, + "grad_norm": 0.19464305554409259, + "learning_rate": 8.853603361656179e-05, + "loss": 0.9147, + "step": 17044 + }, + { + "epoch": 3.635878839590444, + "grad_norm": 0.21266674271317737, + "learning_rate": 8.852757238327327e-05, + "loss": 0.9119, + "step": 17045 + }, + { + "epoch": 3.6360921501706485, + "grad_norm": 0.2471612756737512, + "learning_rate": 8.851911105349512e-05, + "loss": 0.8931, + "step": 17046 + }, + { + "epoch": 3.636305460750853, + "grad_norm": 0.1929731555355194, + "learning_rate": 8.851064962732306e-05, + "loss": 0.932, + "step": 17047 + }, + { + "epoch": 3.6365187713310583, + "grad_norm": 0.19655362908017873, + "learning_rate": 8.850218810485289e-05, + "loss": 0.9636, + "step": 17048 + }, + { + "epoch": 3.636732081911263, + "grad_norm": 0.24274152820024564, + "learning_rate": 8.849372648618028e-05, + "loss": 0.8961, + "step": 17049 + }, + { + "epoch": 3.6369453924914676, + "grad_norm": 0.23627074613400953, + "learning_rate": 8.848526477140106e-05, + "loss": 0.8894, + "step": 17050 + }, + { + "epoch": 3.6371587030716723, + "grad_norm": 0.23935785932391695, + "learning_rate": 8.84768029606109e-05, + "loss": 0.9005, + "step": 17051 + }, + { + "epoch": 3.637372013651877, + "grad_norm": 0.1832135798925102, + "learning_rate": 8.846834105390557e-05, + "loss": 0.9008, + "step": 17052 + }, + { + "epoch": 3.6375853242320817, + "grad_norm": 0.18242340105138896, + "learning_rate": 8.845987905138081e-05, + "loss": 0.9128, + "step": 17053 + }, + { + "epoch": 3.637798634812287, + "grad_norm": 0.18675871655094223, + "learning_rate": 8.84514169531324e-05, + "loss": 0.9302, + "step": 17054 + }, + { + "epoch": 3.6380119453924915, + "grad_norm": 0.17756458429783145, + "learning_rate": 8.844295475925605e-05, + "loss": 0.8942, + "step": 17055 + }, + { + "epoch": 3.638225255972696, + "grad_norm": 0.25075186511053366, + "learning_rate": 8.843449246984754e-05, + "loss": 0.9223, + "step": 17056 + }, + { + "epoch": 3.638438566552901, + "grad_norm": 0.1987503805468557, + "learning_rate": 8.84260300850026e-05, + "loss": 0.9045, + "step": 17057 + }, + { + "epoch": 3.638651877133106, + "grad_norm": 0.19290916256773244, + "learning_rate": 8.841756760481698e-05, + "loss": 0.8944, + "step": 17058 + }, + { + "epoch": 3.6388651877133107, + "grad_norm": 0.29072840237268793, + "learning_rate": 8.840910502938643e-05, + "loss": 0.9322, + "step": 17059 + }, + { + "epoch": 3.6390784982935154, + "grad_norm": 0.257031739188666, + "learning_rate": 8.840064235880674e-05, + "loss": 0.9295, + "step": 17060 + }, + { + "epoch": 3.63929180887372, + "grad_norm": 0.18678606563093145, + "learning_rate": 8.839217959317362e-05, + "loss": 0.9574, + "step": 17061 + }, + { + "epoch": 3.6395051194539247, + "grad_norm": 0.24982768154988766, + "learning_rate": 8.838371673258288e-05, + "loss": 0.9179, + "step": 17062 + }, + { + "epoch": 3.6397184300341294, + "grad_norm": 0.2551607547773099, + "learning_rate": 8.83752537771302e-05, + "loss": 0.9061, + "step": 17063 + }, + { + "epoch": 3.6399317406143346, + "grad_norm": 0.24445171013445366, + "learning_rate": 8.836679072691141e-05, + "loss": 0.9376, + "step": 17064 + }, + { + "epoch": 3.6401450511945392, + "grad_norm": 0.3054028547898771, + "learning_rate": 8.835832758202223e-05, + "loss": 0.9305, + "step": 17065 + }, + { + "epoch": 3.640358361774744, + "grad_norm": 0.3007519004435853, + "learning_rate": 8.834986434255843e-05, + "loss": 0.9293, + "step": 17066 + }, + { + "epoch": 3.640571672354949, + "grad_norm": 0.2855313877443164, + "learning_rate": 8.834140100861577e-05, + "loss": 0.89, + "step": 17067 + }, + { + "epoch": 3.6407849829351537, + "grad_norm": 0.17954341433277918, + "learning_rate": 8.833293758029007e-05, + "loss": 0.9459, + "step": 17068 + }, + { + "epoch": 3.6409982935153584, + "grad_norm": 0.2769357815595116, + "learning_rate": 8.832447405767697e-05, + "loss": 0.9121, + "step": 17069 + }, + { + "epoch": 3.641211604095563, + "grad_norm": 0.34764255945881756, + "learning_rate": 8.831601044087236e-05, + "loss": 0.928, + "step": 17070 + }, + { + "epoch": 3.641424914675768, + "grad_norm": 0.3106686515706199, + "learning_rate": 8.830754672997192e-05, + "loss": 0.9056, + "step": 17071 + }, + { + "epoch": 3.6416382252559725, + "grad_norm": 0.286982148476459, + "learning_rate": 8.829908292507146e-05, + "loss": 0.8734, + "step": 17072 + }, + { + "epoch": 3.6418515358361776, + "grad_norm": 0.23747731248527285, + "learning_rate": 8.829061902626674e-05, + "loss": 0.9048, + "step": 17073 + }, + { + "epoch": 3.6420648464163823, + "grad_norm": 0.19738855786230114, + "learning_rate": 8.828215503365352e-05, + "loss": 0.9048, + "step": 17074 + }, + { + "epoch": 3.642278156996587, + "grad_norm": 0.20488750034193995, + "learning_rate": 8.827369094732756e-05, + "loss": 0.8921, + "step": 17075 + }, + { + "epoch": 3.6424914675767917, + "grad_norm": 0.3314987877054391, + "learning_rate": 8.826522676738468e-05, + "loss": 0.9239, + "step": 17076 + }, + { + "epoch": 3.642704778156997, + "grad_norm": 0.4136141209503898, + "learning_rate": 8.82567624939206e-05, + "loss": 0.9036, + "step": 17077 + }, + { + "epoch": 3.6429180887372015, + "grad_norm": 0.4039910132564687, + "learning_rate": 8.824829812703112e-05, + "loss": 0.9414, + "step": 17078 + }, + { + "epoch": 3.643131399317406, + "grad_norm": 0.3625638608454687, + "learning_rate": 8.823983366681202e-05, + "loss": 0.9025, + "step": 17079 + }, + { + "epoch": 3.643344709897611, + "grad_norm": 0.3370338283348912, + "learning_rate": 8.823136911335905e-05, + "loss": 0.9485, + "step": 17080 + }, + { + "epoch": 3.6435580204778155, + "grad_norm": 0.25053992484311594, + "learning_rate": 8.822290446676798e-05, + "loss": 0.913, + "step": 17081 + }, + { + "epoch": 3.64377133105802, + "grad_norm": 0.21897811498569492, + "learning_rate": 8.821443972713465e-05, + "loss": 0.9229, + "step": 17082 + }, + { + "epoch": 3.6439846416382253, + "grad_norm": 0.17827619225124736, + "learning_rate": 8.820597489455476e-05, + "loss": 0.8776, + "step": 17083 + }, + { + "epoch": 3.64419795221843, + "grad_norm": 0.2582815344813213, + "learning_rate": 8.819750996912414e-05, + "loss": 0.9753, + "step": 17084 + }, + { + "epoch": 3.6444112627986347, + "grad_norm": 0.31780474946235177, + "learning_rate": 8.818904495093857e-05, + "loss": 0.9052, + "step": 17085 + }, + { + "epoch": 3.64462457337884, + "grad_norm": 0.26157508551127495, + "learning_rate": 8.818057984009381e-05, + "loss": 0.9465, + "step": 17086 + }, + { + "epoch": 3.6448378839590445, + "grad_norm": 0.20234338763139878, + "learning_rate": 8.817211463668562e-05, + "loss": 0.8989, + "step": 17087 + }, + { + "epoch": 3.645051194539249, + "grad_norm": 0.17214859170255506, + "learning_rate": 8.816364934080986e-05, + "loss": 0.8994, + "step": 17088 + }, + { + "epoch": 3.645264505119454, + "grad_norm": 0.2044819713359969, + "learning_rate": 8.815518395256223e-05, + "loss": 0.92, + "step": 17089 + }, + { + "epoch": 3.6454778156996586, + "grad_norm": 0.22116204950752277, + "learning_rate": 8.814671847203862e-05, + "loss": 0.9325, + "step": 17090 + }, + { + "epoch": 3.6456911262798632, + "grad_norm": 0.19829318820957537, + "learning_rate": 8.81382528993347e-05, + "loss": 0.9192, + "step": 17091 + }, + { + "epoch": 3.6459044368600684, + "grad_norm": 0.1773323083034135, + "learning_rate": 8.812978723454633e-05, + "loss": 0.8952, + "step": 17092 + }, + { + "epoch": 3.646117747440273, + "grad_norm": 0.19777129046453637, + "learning_rate": 8.812132147776926e-05, + "loss": 0.9264, + "step": 17093 + }, + { + "epoch": 3.6463310580204777, + "grad_norm": 0.20259143449932837, + "learning_rate": 8.811285562909932e-05, + "loss": 0.9403, + "step": 17094 + }, + { + "epoch": 3.6465443686006824, + "grad_norm": 0.21599998837984913, + "learning_rate": 8.810438968863227e-05, + "loss": 0.9402, + "step": 17095 + }, + { + "epoch": 3.6467576791808876, + "grad_norm": 0.1865102553973863, + "learning_rate": 8.809592365646394e-05, + "loss": 0.9409, + "step": 17096 + }, + { + "epoch": 3.6469709897610922, + "grad_norm": 0.26730983701449385, + "learning_rate": 8.808745753269006e-05, + "loss": 0.9324, + "step": 17097 + }, + { + "epoch": 3.647184300341297, + "grad_norm": 0.3045574138366282, + "learning_rate": 8.80789913174065e-05, + "loss": 0.9203, + "step": 17098 + }, + { + "epoch": 3.6473976109215016, + "grad_norm": 0.25770975494598697, + "learning_rate": 8.8070525010709e-05, + "loss": 0.8992, + "step": 17099 + }, + { + "epoch": 3.6476109215017063, + "grad_norm": 0.18150316716722187, + "learning_rate": 8.806205861269338e-05, + "loss": 0.9248, + "step": 17100 + }, + { + "epoch": 3.6478242320819114, + "grad_norm": 0.2231604346002703, + "learning_rate": 8.805359212345542e-05, + "loss": 0.9296, + "step": 17101 + }, + { + "epoch": 3.648037542662116, + "grad_norm": 0.24683219516494725, + "learning_rate": 8.804512554309094e-05, + "loss": 0.942, + "step": 17102 + }, + { + "epoch": 3.648250853242321, + "grad_norm": 0.1826189683392419, + "learning_rate": 8.803665887169571e-05, + "loss": 0.9376, + "step": 17103 + }, + { + "epoch": 3.6484641638225255, + "grad_norm": 0.20966881866215475, + "learning_rate": 8.802819210936557e-05, + "loss": 0.931, + "step": 17104 + }, + { + "epoch": 3.6486774744027306, + "grad_norm": 0.21313493910020453, + "learning_rate": 8.801972525619628e-05, + "loss": 0.8733, + "step": 17105 + }, + { + "epoch": 3.6488907849829353, + "grad_norm": 0.17255547741448932, + "learning_rate": 8.80112583122837e-05, + "loss": 0.9449, + "step": 17106 + }, + { + "epoch": 3.64910409556314, + "grad_norm": 0.19711226889772607, + "learning_rate": 8.800279127772356e-05, + "loss": 0.9564, + "step": 17107 + }, + { + "epoch": 3.6493174061433447, + "grad_norm": 0.21651603580064005, + "learning_rate": 8.799432415261173e-05, + "loss": 0.8736, + "step": 17108 + }, + { + "epoch": 3.6495307167235493, + "grad_norm": 0.16742928391000647, + "learning_rate": 8.798585693704397e-05, + "loss": 0.9135, + "step": 17109 + }, + { + "epoch": 3.649744027303754, + "grad_norm": 0.20472398393498012, + "learning_rate": 8.797738963111611e-05, + "loss": 0.9266, + "step": 17110 + }, + { + "epoch": 3.649957337883959, + "grad_norm": 0.1832485131416468, + "learning_rate": 8.796892223492395e-05, + "loss": 0.9419, + "step": 17111 + }, + { + "epoch": 3.650170648464164, + "grad_norm": 0.1866360492943611, + "learning_rate": 8.79604547485633e-05, + "loss": 0.8835, + "step": 17112 + }, + { + "epoch": 3.6503839590443685, + "grad_norm": 0.19785256773455268, + "learning_rate": 8.795198717212998e-05, + "loss": 0.9357, + "step": 17113 + }, + { + "epoch": 3.6505972696245736, + "grad_norm": 0.2128086323076082, + "learning_rate": 8.79435195057198e-05, + "loss": 0.9545, + "step": 17114 + }, + { + "epoch": 3.6508105802047783, + "grad_norm": 0.19389382239495304, + "learning_rate": 8.793505174942853e-05, + "loss": 0.8831, + "step": 17115 + }, + { + "epoch": 3.651023890784983, + "grad_norm": 0.19990810561439357, + "learning_rate": 8.792658390335205e-05, + "loss": 0.8945, + "step": 17116 + }, + { + "epoch": 3.6512372013651877, + "grad_norm": 0.2741338062924713, + "learning_rate": 8.79181159675861e-05, + "loss": 0.8933, + "step": 17117 + }, + { + "epoch": 3.6514505119453924, + "grad_norm": 0.2705045004646097, + "learning_rate": 8.790964794222657e-05, + "loss": 0.9028, + "step": 17118 + }, + { + "epoch": 3.651663822525597, + "grad_norm": 0.22737387817189747, + "learning_rate": 8.790117982736922e-05, + "loss": 0.9521, + "step": 17119 + }, + { + "epoch": 3.651877133105802, + "grad_norm": 0.20295233571483778, + "learning_rate": 8.789271162310992e-05, + "loss": 0.8985, + "step": 17120 + }, + { + "epoch": 3.652090443686007, + "grad_norm": 0.24226762534791743, + "learning_rate": 8.788424332954441e-05, + "loss": 0.9165, + "step": 17121 + }, + { + "epoch": 3.6523037542662116, + "grad_norm": 0.20913117707673462, + "learning_rate": 8.787577494676858e-05, + "loss": 0.931, + "step": 17122 + }, + { + "epoch": 3.6525170648464163, + "grad_norm": 0.22548182602130726, + "learning_rate": 8.786730647487821e-05, + "loss": 0.9215, + "step": 17123 + }, + { + "epoch": 3.6527303754266214, + "grad_norm": 0.24491026619159242, + "learning_rate": 8.785883791396915e-05, + "loss": 0.9142, + "step": 17124 + }, + { + "epoch": 3.652943686006826, + "grad_norm": 0.2753033271616464, + "learning_rate": 8.785036926413722e-05, + "loss": 0.8933, + "step": 17125 + }, + { + "epoch": 3.6531569965870307, + "grad_norm": 0.24100901110537823, + "learning_rate": 8.78419005254782e-05, + "loss": 0.8879, + "step": 17126 + }, + { + "epoch": 3.6533703071672354, + "grad_norm": 0.19779805212041204, + "learning_rate": 8.783343169808796e-05, + "loss": 0.928, + "step": 17127 + }, + { + "epoch": 3.65358361774744, + "grad_norm": 0.21295353347786294, + "learning_rate": 8.78249627820623e-05, + "loss": 0.9289, + "step": 17128 + }, + { + "epoch": 3.653796928327645, + "grad_norm": 0.24688151513818105, + "learning_rate": 8.781649377749705e-05, + "loss": 0.9619, + "step": 17129 + }, + { + "epoch": 3.65401023890785, + "grad_norm": 0.27414249670206997, + "learning_rate": 8.780802468448805e-05, + "loss": 0.9009, + "step": 17130 + }, + { + "epoch": 3.6542235494880546, + "grad_norm": 0.24818891070984792, + "learning_rate": 8.779955550313113e-05, + "loss": 0.904, + "step": 17131 + }, + { + "epoch": 3.6544368600682593, + "grad_norm": 0.2123407416412618, + "learning_rate": 8.779108623352209e-05, + "loss": 0.8981, + "step": 17132 + }, + { + "epoch": 3.6546501706484644, + "grad_norm": 0.1964703648578909, + "learning_rate": 8.778261687575678e-05, + "loss": 0.9295, + "step": 17133 + }, + { + "epoch": 3.654863481228669, + "grad_norm": 0.2501987901489244, + "learning_rate": 8.777414742993105e-05, + "loss": 0.9157, + "step": 17134 + }, + { + "epoch": 3.655076791808874, + "grad_norm": 0.1984026544176532, + "learning_rate": 8.77656778961407e-05, + "loss": 0.8974, + "step": 17135 + }, + { + "epoch": 3.6552901023890785, + "grad_norm": 0.18839727147960078, + "learning_rate": 8.775720827448154e-05, + "loss": 0.9276, + "step": 17136 + }, + { + "epoch": 3.655503412969283, + "grad_norm": 0.21880091839804025, + "learning_rate": 8.774873856504949e-05, + "loss": 0.9177, + "step": 17137 + }, + { + "epoch": 3.655716723549488, + "grad_norm": 0.1836244423412352, + "learning_rate": 8.774026876794031e-05, + "loss": 0.9285, + "step": 17138 + }, + { + "epoch": 3.655930034129693, + "grad_norm": 0.18831321418223934, + "learning_rate": 8.773179888324985e-05, + "loss": 0.9455, + "step": 17139 + }, + { + "epoch": 3.6561433447098977, + "grad_norm": 0.19163685081554463, + "learning_rate": 8.772332891107397e-05, + "loss": 0.9243, + "step": 17140 + }, + { + "epoch": 3.6563566552901023, + "grad_norm": 0.189931679975075, + "learning_rate": 8.77148588515085e-05, + "loss": 0.9339, + "step": 17141 + }, + { + "epoch": 3.656569965870307, + "grad_norm": 0.22096414968991562, + "learning_rate": 8.770638870464924e-05, + "loss": 0.9031, + "step": 17142 + }, + { + "epoch": 3.656783276450512, + "grad_norm": 0.27353727329964217, + "learning_rate": 8.769791847059209e-05, + "loss": 0.8897, + "step": 17143 + }, + { + "epoch": 3.656996587030717, + "grad_norm": 0.2429645228666648, + "learning_rate": 8.768944814943287e-05, + "loss": 0.9414, + "step": 17144 + }, + { + "epoch": 3.6572098976109215, + "grad_norm": 0.21289158203572803, + "learning_rate": 8.768097774126737e-05, + "loss": 0.9137, + "step": 17145 + }, + { + "epoch": 3.657423208191126, + "grad_norm": 0.3151427045963395, + "learning_rate": 8.767250724619151e-05, + "loss": 0.9267, + "step": 17146 + }, + { + "epoch": 3.657636518771331, + "grad_norm": 0.30285864001503787, + "learning_rate": 8.766403666430112e-05, + "loss": 0.9252, + "step": 17147 + }, + { + "epoch": 3.6578498293515356, + "grad_norm": 0.21752501540134844, + "learning_rate": 8.765556599569199e-05, + "loss": 0.9372, + "step": 17148 + }, + { + "epoch": 3.6580631399317407, + "grad_norm": 0.29403177940914277, + "learning_rate": 8.764709524046002e-05, + "loss": 0.9027, + "step": 17149 + }, + { + "epoch": 3.6582764505119454, + "grad_norm": 0.3323103707289725, + "learning_rate": 8.763862439870105e-05, + "loss": 0.915, + "step": 17150 + }, + { + "epoch": 3.65848976109215, + "grad_norm": 0.23224855877733405, + "learning_rate": 8.763015347051089e-05, + "loss": 0.901, + "step": 17151 + }, + { + "epoch": 3.658703071672355, + "grad_norm": 0.27414040391813527, + "learning_rate": 8.762168245598542e-05, + "loss": 0.8929, + "step": 17152 + }, + { + "epoch": 3.65891638225256, + "grad_norm": 0.28473889893465415, + "learning_rate": 8.76132113552205e-05, + "loss": 0.9088, + "step": 17153 + }, + { + "epoch": 3.6591296928327646, + "grad_norm": 0.22808516076146468, + "learning_rate": 8.760474016831195e-05, + "loss": 0.8741, + "step": 17154 + }, + { + "epoch": 3.6593430034129693, + "grad_norm": 0.2690479853039946, + "learning_rate": 8.759626889535565e-05, + "loss": 0.9373, + "step": 17155 + }, + { + "epoch": 3.659556313993174, + "grad_norm": 0.29509028308067464, + "learning_rate": 8.758779753644745e-05, + "loss": 0.9126, + "step": 17156 + }, + { + "epoch": 3.6597696245733786, + "grad_norm": 0.245096998416244, + "learning_rate": 8.757932609168318e-05, + "loss": 0.9344, + "step": 17157 + }, + { + "epoch": 3.6599829351535837, + "grad_norm": 0.28186964802166853, + "learning_rate": 8.75708545611587e-05, + "loss": 0.9014, + "step": 17158 + }, + { + "epoch": 3.6601962457337884, + "grad_norm": 0.35397101816313997, + "learning_rate": 8.756238294496987e-05, + "loss": 0.9383, + "step": 17159 + }, + { + "epoch": 3.660409556313993, + "grad_norm": 0.3004457666149789, + "learning_rate": 8.755391124321255e-05, + "loss": 0.9018, + "step": 17160 + }, + { + "epoch": 3.660622866894198, + "grad_norm": 0.22261901112519375, + "learning_rate": 8.754543945598262e-05, + "loss": 0.9399, + "step": 17161 + }, + { + "epoch": 3.660836177474403, + "grad_norm": 0.26291452587472913, + "learning_rate": 8.75369675833759e-05, + "loss": 0.8926, + "step": 17162 + }, + { + "epoch": 3.6610494880546076, + "grad_norm": 0.2797493041270466, + "learning_rate": 8.752849562548827e-05, + "loss": 0.9136, + "step": 17163 + }, + { + "epoch": 3.6612627986348123, + "grad_norm": 0.2272455603702348, + "learning_rate": 8.752002358241558e-05, + "loss": 0.9249, + "step": 17164 + }, + { + "epoch": 3.661476109215017, + "grad_norm": 0.2622310580370899, + "learning_rate": 8.75115514542537e-05, + "loss": 0.9291, + "step": 17165 + }, + { + "epoch": 3.6616894197952217, + "grad_norm": 0.25898291041571764, + "learning_rate": 8.750307924109848e-05, + "loss": 0.9087, + "step": 17166 + }, + { + "epoch": 3.6619027303754264, + "grad_norm": 0.31339453266233247, + "learning_rate": 8.74946069430458e-05, + "loss": 0.9043, + "step": 17167 + }, + { + "epoch": 3.6621160409556315, + "grad_norm": 0.23715184882941703, + "learning_rate": 8.748613456019153e-05, + "loss": 0.9229, + "step": 17168 + }, + { + "epoch": 3.662329351535836, + "grad_norm": 0.2045815035769929, + "learning_rate": 8.747766209263152e-05, + "loss": 0.9282, + "step": 17169 + }, + { + "epoch": 3.662542662116041, + "grad_norm": 0.2249590887880436, + "learning_rate": 8.746918954046161e-05, + "loss": 0.9307, + "step": 17170 + }, + { + "epoch": 3.662755972696246, + "grad_norm": 0.21032306731122824, + "learning_rate": 8.746071690377772e-05, + "loss": 0.9393, + "step": 17171 + }, + { + "epoch": 3.6629692832764507, + "grad_norm": 0.22442521434232526, + "learning_rate": 8.745224418267566e-05, + "loss": 0.8933, + "step": 17172 + }, + { + "epoch": 3.6631825938566553, + "grad_norm": 0.21654599802763466, + "learning_rate": 8.744377137725136e-05, + "loss": 0.9273, + "step": 17173 + }, + { + "epoch": 3.66339590443686, + "grad_norm": 0.22531189613219177, + "learning_rate": 8.743529848760066e-05, + "loss": 0.9158, + "step": 17174 + }, + { + "epoch": 3.6636092150170647, + "grad_norm": 0.19184228980172238, + "learning_rate": 8.742682551381944e-05, + "loss": 0.8972, + "step": 17175 + }, + { + "epoch": 3.6638225255972694, + "grad_norm": 0.19444104139593277, + "learning_rate": 8.741835245600355e-05, + "loss": 0.9264, + "step": 17176 + }, + { + "epoch": 3.6640358361774745, + "grad_norm": 0.24328387736850557, + "learning_rate": 8.740987931424889e-05, + "loss": 0.9251, + "step": 17177 + }, + { + "epoch": 3.664249146757679, + "grad_norm": 0.21173482937802038, + "learning_rate": 8.74014060886513e-05, + "loss": 0.9158, + "step": 17178 + }, + { + "epoch": 3.664462457337884, + "grad_norm": 0.19699934833804142, + "learning_rate": 8.739293277930668e-05, + "loss": 0.9071, + "step": 17179 + }, + { + "epoch": 3.6646757679180886, + "grad_norm": 0.20640986457336066, + "learning_rate": 8.738445938631092e-05, + "loss": 0.8891, + "step": 17180 + }, + { + "epoch": 3.6648890784982937, + "grad_norm": 0.21347083048075297, + "learning_rate": 8.737598590975987e-05, + "loss": 0.9299, + "step": 17181 + }, + { + "epoch": 3.6651023890784984, + "grad_norm": 0.21923771555896546, + "learning_rate": 8.73675123497494e-05, + "loss": 0.9, + "step": 17182 + }, + { + "epoch": 3.665315699658703, + "grad_norm": 0.22095357215342123, + "learning_rate": 8.735903870637541e-05, + "loss": 0.896, + "step": 17183 + }, + { + "epoch": 3.6655290102389078, + "grad_norm": 0.20513233628820943, + "learning_rate": 8.735056497973377e-05, + "loss": 0.9193, + "step": 17184 + }, + { + "epoch": 3.6657423208191124, + "grad_norm": 0.18929690410156064, + "learning_rate": 8.73420911699204e-05, + "loss": 0.9196, + "step": 17185 + }, + { + "epoch": 3.6659556313993176, + "grad_norm": 0.18069109098291042, + "learning_rate": 8.733361727703111e-05, + "loss": 0.8887, + "step": 17186 + }, + { + "epoch": 3.6661689419795223, + "grad_norm": 0.21411991954202367, + "learning_rate": 8.732514330116183e-05, + "loss": 0.8903, + "step": 17187 + }, + { + "epoch": 3.666382252559727, + "grad_norm": 0.3050605667252554, + "learning_rate": 8.731666924240841e-05, + "loss": 0.9239, + "step": 17188 + }, + { + "epoch": 3.6665955631399316, + "grad_norm": 0.31369330850315746, + "learning_rate": 8.730819510086678e-05, + "loss": 0.8918, + "step": 17189 + }, + { + "epoch": 3.6668088737201368, + "grad_norm": 0.2511126120092316, + "learning_rate": 8.729972087663278e-05, + "loss": 0.9554, + "step": 17190 + }, + { + "epoch": 3.6670221843003414, + "grad_norm": 0.2226554913475782, + "learning_rate": 8.729124656980235e-05, + "loss": 0.9263, + "step": 17191 + }, + { + "epoch": 3.667235494880546, + "grad_norm": 0.20003654395450096, + "learning_rate": 8.72827721804713e-05, + "loss": 0.9105, + "step": 17192 + }, + { + "epoch": 3.667448805460751, + "grad_norm": 0.21172949152712867, + "learning_rate": 8.727429770873558e-05, + "loss": 0.9319, + "step": 17193 + }, + { + "epoch": 3.6676621160409555, + "grad_norm": 0.2603057523007559, + "learning_rate": 8.726582315469104e-05, + "loss": 0.9478, + "step": 17194 + }, + { + "epoch": 3.66787542662116, + "grad_norm": 0.2736408453942166, + "learning_rate": 8.725734851843362e-05, + "loss": 0.9389, + "step": 17195 + }, + { + "epoch": 3.6680887372013653, + "grad_norm": 0.30068259175396705, + "learning_rate": 8.724887380005917e-05, + "loss": 0.904, + "step": 17196 + }, + { + "epoch": 3.66830204778157, + "grad_norm": 0.25866246529451853, + "learning_rate": 8.724039899966359e-05, + "loss": 0.9152, + "step": 17197 + }, + { + "epoch": 3.6685153583617747, + "grad_norm": 0.19903155134584663, + "learning_rate": 8.723192411734275e-05, + "loss": 0.9215, + "step": 17198 + }, + { + "epoch": 3.66872866894198, + "grad_norm": 0.27829299445996225, + "learning_rate": 8.722344915319259e-05, + "loss": 0.949, + "step": 17199 + }, + { + "epoch": 3.6689419795221845, + "grad_norm": 0.2404638593157877, + "learning_rate": 8.721497410730897e-05, + "loss": 0.9042, + "step": 17200 + }, + { + "epoch": 3.669155290102389, + "grad_norm": 0.20451751864915985, + "learning_rate": 8.720649897978782e-05, + "loss": 0.9055, + "step": 17201 + }, + { + "epoch": 3.669368600682594, + "grad_norm": 0.2507218542574296, + "learning_rate": 8.7198023770725e-05, + "loss": 0.8889, + "step": 17202 + }, + { + "epoch": 3.6695819112627985, + "grad_norm": 0.2618569864657427, + "learning_rate": 8.718954848021641e-05, + "loss": 0.9376, + "step": 17203 + }, + { + "epoch": 3.669795221843003, + "grad_norm": 0.18316868154279328, + "learning_rate": 8.718107310835795e-05, + "loss": 0.9219, + "step": 17204 + }, + { + "epoch": 3.6700085324232083, + "grad_norm": 0.3858983372101989, + "learning_rate": 8.717259765524555e-05, + "loss": 0.9107, + "step": 17205 + }, + { + "epoch": 3.670221843003413, + "grad_norm": 0.20990610299513526, + "learning_rate": 8.716412212097507e-05, + "loss": 0.9316, + "step": 17206 + }, + { + "epoch": 3.6704351535836177, + "grad_norm": 0.2144912645084797, + "learning_rate": 8.715564650564244e-05, + "loss": 0.9254, + "step": 17207 + }, + { + "epoch": 3.6706484641638224, + "grad_norm": 0.22624753405516226, + "learning_rate": 8.714717080934355e-05, + "loss": 0.8937, + "step": 17208 + }, + { + "epoch": 3.6708617747440275, + "grad_norm": 0.2283382747401276, + "learning_rate": 8.71386950321743e-05, + "loss": 0.9186, + "step": 17209 + }, + { + "epoch": 3.671075085324232, + "grad_norm": 0.26695902814565486, + "learning_rate": 8.713021917423059e-05, + "loss": 0.921, + "step": 17210 + }, + { + "epoch": 3.671288395904437, + "grad_norm": 0.358281808717219, + "learning_rate": 8.712174323560833e-05, + "loss": 0.9179, + "step": 17211 + }, + { + "epoch": 3.6715017064846416, + "grad_norm": 0.38449023229885665, + "learning_rate": 8.711326721640343e-05, + "loss": 0.9296, + "step": 17212 + }, + { + "epoch": 3.6717150170648463, + "grad_norm": 0.31144463564864455, + "learning_rate": 8.710479111671181e-05, + "loss": 0.9227, + "step": 17213 + }, + { + "epoch": 3.671928327645051, + "grad_norm": 0.2214153627634627, + "learning_rate": 8.709631493662933e-05, + "loss": 0.9067, + "step": 17214 + }, + { + "epoch": 3.672141638225256, + "grad_norm": 0.26563870763821035, + "learning_rate": 8.708783867625194e-05, + "loss": 0.8949, + "step": 17215 + }, + { + "epoch": 3.6723549488054608, + "grad_norm": 0.2587901512787399, + "learning_rate": 8.707936233567551e-05, + "loss": 0.9204, + "step": 17216 + }, + { + "epoch": 3.6725682593856654, + "grad_norm": 0.27957165361087194, + "learning_rate": 8.7070885914996e-05, + "loss": 0.9116, + "step": 17217 + }, + { + "epoch": 3.6727815699658706, + "grad_norm": 0.4006799279296265, + "learning_rate": 8.70624094143093e-05, + "loss": 0.9652, + "step": 17218 + }, + { + "epoch": 3.6729948805460753, + "grad_norm": 0.36168602317108844, + "learning_rate": 8.705393283371134e-05, + "loss": 0.8982, + "step": 17219 + }, + { + "epoch": 3.67320819112628, + "grad_norm": 0.3038543855490614, + "learning_rate": 8.704545617329797e-05, + "loss": 0.909, + "step": 17220 + }, + { + "epoch": 3.6734215017064846, + "grad_norm": 0.28777542168275005, + "learning_rate": 8.703697943316517e-05, + "loss": 0.9414, + "step": 17221 + }, + { + "epoch": 3.6736348122866893, + "grad_norm": 0.2574322704669206, + "learning_rate": 8.70285026134088e-05, + "loss": 0.9123, + "step": 17222 + }, + { + "epoch": 3.673848122866894, + "grad_norm": 0.27659823254561117, + "learning_rate": 8.702002571412484e-05, + "loss": 0.9178, + "step": 17223 + }, + { + "epoch": 3.674061433447099, + "grad_norm": 0.2744556944016753, + "learning_rate": 8.701154873540916e-05, + "loss": 0.9401, + "step": 17224 + }, + { + "epoch": 3.674274744027304, + "grad_norm": 0.443258154958548, + "learning_rate": 8.70030716773577e-05, + "loss": 0.9278, + "step": 17225 + }, + { + "epoch": 3.6744880546075085, + "grad_norm": 0.23901250270666774, + "learning_rate": 8.699459454006634e-05, + "loss": 0.9004, + "step": 17226 + }, + { + "epoch": 3.674701365187713, + "grad_norm": 0.3012422793754425, + "learning_rate": 8.698611732363102e-05, + "loss": 0.9094, + "step": 17227 + }, + { + "epoch": 3.6749146757679183, + "grad_norm": 0.3445641726158621, + "learning_rate": 8.697764002814767e-05, + "loss": 0.9162, + "step": 17228 + }, + { + "epoch": 3.675127986348123, + "grad_norm": 0.3888521430574507, + "learning_rate": 8.696916265371222e-05, + "loss": 0.9023, + "step": 17229 + }, + { + "epoch": 3.6753412969283277, + "grad_norm": 0.37446160287038216, + "learning_rate": 8.696068520042058e-05, + "loss": 0.947, + "step": 17230 + }, + { + "epoch": 3.6755546075085324, + "grad_norm": 0.3507171076224212, + "learning_rate": 8.695220766836867e-05, + "loss": 0.9338, + "step": 17231 + }, + { + "epoch": 3.675767918088737, + "grad_norm": 0.28817257170409266, + "learning_rate": 8.69437300576524e-05, + "loss": 0.9301, + "step": 17232 + }, + { + "epoch": 3.6759812286689417, + "grad_norm": 0.26324972464195384, + "learning_rate": 8.693525236836772e-05, + "loss": 0.9412, + "step": 17233 + }, + { + "epoch": 3.676194539249147, + "grad_norm": 0.2813240315339535, + "learning_rate": 8.692677460061053e-05, + "loss": 0.9538, + "step": 17234 + }, + { + "epoch": 3.6764078498293515, + "grad_norm": 0.35260986763096763, + "learning_rate": 8.691829675447677e-05, + "loss": 0.9439, + "step": 17235 + }, + { + "epoch": 3.676621160409556, + "grad_norm": 0.4200847957439166, + "learning_rate": 8.690981883006238e-05, + "loss": 0.9441, + "step": 17236 + }, + { + "epoch": 3.6768344709897613, + "grad_norm": 0.42163170683414414, + "learning_rate": 8.690134082746328e-05, + "loss": 0.9152, + "step": 17237 + }, + { + "epoch": 3.677047781569966, + "grad_norm": 0.28531959632435466, + "learning_rate": 8.689286274677535e-05, + "loss": 0.9224, + "step": 17238 + }, + { + "epoch": 3.6772610921501707, + "grad_norm": 0.26337571608497284, + "learning_rate": 8.68843845880946e-05, + "loss": 0.9181, + "step": 17239 + }, + { + "epoch": 3.6774744027303754, + "grad_norm": 0.24166667016595647, + "learning_rate": 8.687590635151691e-05, + "loss": 0.9271, + "step": 17240 + }, + { + "epoch": 3.67768771331058, + "grad_norm": 0.33091593952204656, + "learning_rate": 8.686742803713825e-05, + "loss": 0.9285, + "step": 17241 + }, + { + "epoch": 3.6779010238907848, + "grad_norm": 0.30155459170376686, + "learning_rate": 8.68589496450545e-05, + "loss": 0.8866, + "step": 17242 + }, + { + "epoch": 3.67811433447099, + "grad_norm": 0.21966193543811924, + "learning_rate": 8.685047117536162e-05, + "loss": 0.9286, + "step": 17243 + }, + { + "epoch": 3.6783276450511946, + "grad_norm": 0.23141786506530865, + "learning_rate": 8.684199262815553e-05, + "loss": 0.9597, + "step": 17244 + }, + { + "epoch": 3.6785409556313993, + "grad_norm": 0.24928150321376932, + "learning_rate": 8.68335140035322e-05, + "loss": 0.9222, + "step": 17245 + }, + { + "epoch": 3.678754266211604, + "grad_norm": 0.3060161164262306, + "learning_rate": 8.682503530158752e-05, + "loss": 0.9217, + "step": 17246 + }, + { + "epoch": 3.678967576791809, + "grad_norm": 0.3081395393840763, + "learning_rate": 8.681655652241751e-05, + "loss": 0.9056, + "step": 17247 + }, + { + "epoch": 3.6791808873720138, + "grad_norm": 0.244203869431985, + "learning_rate": 8.680807766611799e-05, + "loss": 0.9183, + "step": 17248 + }, + { + "epoch": 3.6793941979522184, + "grad_norm": 0.1982259397432707, + "learning_rate": 8.679959873278497e-05, + "loss": 0.8959, + "step": 17249 + }, + { + "epoch": 3.679607508532423, + "grad_norm": 0.21770251008351352, + "learning_rate": 8.679111972251438e-05, + "loss": 0.9171, + "step": 17250 + }, + { + "epoch": 3.679820819112628, + "grad_norm": 0.21446459036096735, + "learning_rate": 8.678264063540216e-05, + "loss": 0.8904, + "step": 17251 + }, + { + "epoch": 3.6800341296928325, + "grad_norm": 0.22426498197181785, + "learning_rate": 8.677416147154423e-05, + "loss": 0.9073, + "step": 17252 + }, + { + "epoch": 3.6802474402730376, + "grad_norm": 0.19819208992506254, + "learning_rate": 8.676568223103658e-05, + "loss": 0.9051, + "step": 17253 + }, + { + "epoch": 3.6804607508532423, + "grad_norm": 0.2148936623867691, + "learning_rate": 8.67572029139751e-05, + "loss": 0.9307, + "step": 17254 + }, + { + "epoch": 3.680674061433447, + "grad_norm": 0.2210167802942658, + "learning_rate": 8.674872352045576e-05, + "loss": 0.8906, + "step": 17255 + }, + { + "epoch": 3.680887372013652, + "grad_norm": 0.18859898290239654, + "learning_rate": 8.67402440505745e-05, + "loss": 0.9032, + "step": 17256 + }, + { + "epoch": 3.681100682593857, + "grad_norm": 0.1851794604076965, + "learning_rate": 8.673176450442726e-05, + "loss": 0.9453, + "step": 17257 + }, + { + "epoch": 3.6813139931740615, + "grad_norm": 0.2166059865398864, + "learning_rate": 8.672328488211e-05, + "loss": 0.9284, + "step": 17258 + }, + { + "epoch": 3.681527303754266, + "grad_norm": 0.21784316085310249, + "learning_rate": 8.671480518371866e-05, + "loss": 0.9258, + "step": 17259 + }, + { + "epoch": 3.681740614334471, + "grad_norm": 0.22035214342430057, + "learning_rate": 8.670632540934917e-05, + "loss": 0.9378, + "step": 17260 + }, + { + "epoch": 3.6819539249146755, + "grad_norm": 0.21016531666379373, + "learning_rate": 8.669784555909751e-05, + "loss": 0.9132, + "step": 17261 + }, + { + "epoch": 3.6821672354948807, + "grad_norm": 0.19232221494543922, + "learning_rate": 8.668936563305959e-05, + "loss": 0.91, + "step": 17262 + }, + { + "epoch": 3.6823805460750854, + "grad_norm": 0.20931191343862854, + "learning_rate": 8.668088563133142e-05, + "loss": 0.9226, + "step": 17263 + }, + { + "epoch": 3.68259385665529, + "grad_norm": 0.20050357561754054, + "learning_rate": 8.66724055540089e-05, + "loss": 0.94, + "step": 17264 + }, + { + "epoch": 3.6828071672354947, + "grad_norm": 0.19459755566896716, + "learning_rate": 8.6663925401188e-05, + "loss": 0.9064, + "step": 17265 + }, + { + "epoch": 3.6830204778157, + "grad_norm": 0.18339743466666028, + "learning_rate": 8.665544517296468e-05, + "loss": 0.9147, + "step": 17266 + }, + { + "epoch": 3.6832337883959045, + "grad_norm": 0.18032954089491532, + "learning_rate": 8.664696486943488e-05, + "loss": 0.9075, + "step": 17267 + }, + { + "epoch": 3.6834470989761092, + "grad_norm": 0.1591803164975247, + "learning_rate": 8.663848449069455e-05, + "loss": 0.8847, + "step": 17268 + }, + { + "epoch": 3.683660409556314, + "grad_norm": 0.20288014754697722, + "learning_rate": 8.663000403683969e-05, + "loss": 0.9035, + "step": 17269 + }, + { + "epoch": 3.6838737201365186, + "grad_norm": 0.21597091373700747, + "learning_rate": 8.662152350796618e-05, + "loss": 0.9267, + "step": 17270 + }, + { + "epoch": 3.6840870307167233, + "grad_norm": 0.21844597268767535, + "learning_rate": 8.661304290417005e-05, + "loss": 0.899, + "step": 17271 + }, + { + "epoch": 3.6843003412969284, + "grad_norm": 0.21224266830347085, + "learning_rate": 8.66045622255472e-05, + "loss": 0.9045, + "step": 17272 + }, + { + "epoch": 3.684513651877133, + "grad_norm": 0.19550168017844236, + "learning_rate": 8.659608147219365e-05, + "loss": 0.9215, + "step": 17273 + }, + { + "epoch": 3.6847269624573378, + "grad_norm": 0.21037148716502235, + "learning_rate": 8.658760064420529e-05, + "loss": 0.9516, + "step": 17274 + }, + { + "epoch": 3.684940273037543, + "grad_norm": 0.2833007928634473, + "learning_rate": 8.657911974167817e-05, + "loss": 0.9408, + "step": 17275 + }, + { + "epoch": 3.6851535836177476, + "grad_norm": 0.2715394404319238, + "learning_rate": 8.657063876470815e-05, + "loss": 0.9194, + "step": 17276 + }, + { + "epoch": 3.6853668941979523, + "grad_norm": 0.2309767673978472, + "learning_rate": 8.656215771339129e-05, + "loss": 0.9346, + "step": 17277 + }, + { + "epoch": 3.685580204778157, + "grad_norm": 0.18779709329154468, + "learning_rate": 8.655367658782346e-05, + "loss": 0.9093, + "step": 17278 + }, + { + "epoch": 3.6857935153583616, + "grad_norm": 0.22543231126393004, + "learning_rate": 8.654519538810068e-05, + "loss": 0.919, + "step": 17279 + }, + { + "epoch": 3.6860068259385663, + "grad_norm": 0.23245712293436158, + "learning_rate": 8.65367141143189e-05, + "loss": 0.9165, + "step": 17280 + }, + { + "epoch": 3.6862201365187715, + "grad_norm": 0.20597571328494826, + "learning_rate": 8.652823276657412e-05, + "loss": 0.9329, + "step": 17281 + }, + { + "epoch": 3.686433447098976, + "grad_norm": 0.23711521660943408, + "learning_rate": 8.651975134496224e-05, + "loss": 0.9268, + "step": 17282 + }, + { + "epoch": 3.686646757679181, + "grad_norm": 0.28304505397373336, + "learning_rate": 8.651126984957929e-05, + "loss": 0.9054, + "step": 17283 + }, + { + "epoch": 3.6868600682593855, + "grad_norm": 0.26274810578227675, + "learning_rate": 8.650278828052119e-05, + "loss": 0.9373, + "step": 17284 + }, + { + "epoch": 3.6870733788395906, + "grad_norm": 0.21108466901724096, + "learning_rate": 8.649430663788395e-05, + "loss": 0.8977, + "step": 17285 + }, + { + "epoch": 3.6872866894197953, + "grad_norm": 0.19493967367856754, + "learning_rate": 8.648582492176352e-05, + "loss": 0.9165, + "step": 17286 + }, + { + "epoch": 3.6875, + "grad_norm": 0.18167687338047037, + "learning_rate": 8.647734313225586e-05, + "loss": 0.9216, + "step": 17287 + }, + { + "epoch": 3.6877133105802047, + "grad_norm": 0.22265757008379586, + "learning_rate": 8.646886126945696e-05, + "loss": 0.9047, + "step": 17288 + }, + { + "epoch": 3.6879266211604094, + "grad_norm": 0.24869673074726456, + "learning_rate": 8.646037933346279e-05, + "loss": 0.9344, + "step": 17289 + }, + { + "epoch": 3.6881399317406145, + "grad_norm": 0.24759711744549845, + "learning_rate": 8.64518973243693e-05, + "loss": 0.8817, + "step": 17290 + }, + { + "epoch": 3.688353242320819, + "grad_norm": 0.2932176834632707, + "learning_rate": 8.644341524227252e-05, + "loss": 0.8817, + "step": 17291 + }, + { + "epoch": 3.688566552901024, + "grad_norm": 0.2495434581319881, + "learning_rate": 8.643493308726836e-05, + "loss": 0.9206, + "step": 17292 + }, + { + "epoch": 3.6887798634812285, + "grad_norm": 0.17346973301789423, + "learning_rate": 8.642645085945283e-05, + "loss": 0.9485, + "step": 17293 + }, + { + "epoch": 3.6889931740614337, + "grad_norm": 0.22480699885722052, + "learning_rate": 8.641796855892189e-05, + "loss": 0.9109, + "step": 17294 + }, + { + "epoch": 3.6892064846416384, + "grad_norm": 0.24936580439727468, + "learning_rate": 8.640948618577154e-05, + "loss": 0.9287, + "step": 17295 + }, + { + "epoch": 3.689419795221843, + "grad_norm": 0.28617722826844977, + "learning_rate": 8.640100374009774e-05, + "loss": 0.9148, + "step": 17296 + }, + { + "epoch": 3.6896331058020477, + "grad_norm": 0.2191130961273722, + "learning_rate": 8.639252122199651e-05, + "loss": 0.9259, + "step": 17297 + }, + { + "epoch": 3.6898464163822524, + "grad_norm": 0.23015790720162801, + "learning_rate": 8.638403863156375e-05, + "loss": 0.9374, + "step": 17298 + }, + { + "epoch": 3.690059726962457, + "grad_norm": 0.23243457959822678, + "learning_rate": 8.637555596889551e-05, + "loss": 0.9083, + "step": 17299 + }, + { + "epoch": 3.6902730375426622, + "grad_norm": 0.22990010071815406, + "learning_rate": 8.636707323408772e-05, + "loss": 0.9057, + "step": 17300 + }, + { + "epoch": 3.690486348122867, + "grad_norm": 0.29426270299954255, + "learning_rate": 8.63585904272364e-05, + "loss": 0.9374, + "step": 17301 + }, + { + "epoch": 3.6906996587030716, + "grad_norm": 0.2816203478676137, + "learning_rate": 8.635010754843752e-05, + "loss": 0.8956, + "step": 17302 + }, + { + "epoch": 3.6909129692832767, + "grad_norm": 0.22089965578952844, + "learning_rate": 8.63416245977871e-05, + "loss": 0.9085, + "step": 17303 + }, + { + "epoch": 3.6911262798634814, + "grad_norm": 0.319249480416733, + "learning_rate": 8.633314157538106e-05, + "loss": 0.919, + "step": 17304 + }, + { + "epoch": 3.691339590443686, + "grad_norm": 0.3743411739301631, + "learning_rate": 8.632465848131542e-05, + "loss": 0.9168, + "step": 17305 + }, + { + "epoch": 3.6915529010238908, + "grad_norm": 0.3417013182070861, + "learning_rate": 8.631617531568615e-05, + "loss": 0.8876, + "step": 17306 + }, + { + "epoch": 3.6917662116040955, + "grad_norm": 0.32836096598808134, + "learning_rate": 8.630769207858927e-05, + "loss": 0.9261, + "step": 17307 + }, + { + "epoch": 3.6919795221843, + "grad_norm": 0.23264931837872851, + "learning_rate": 8.629920877012076e-05, + "loss": 0.9195, + "step": 17308 + }, + { + "epoch": 3.6921928327645053, + "grad_norm": 0.5865659476604624, + "learning_rate": 8.629072539037659e-05, + "loss": 0.9406, + "step": 17309 + }, + { + "epoch": 3.69240614334471, + "grad_norm": 0.27896788002549733, + "learning_rate": 8.628224193945272e-05, + "loss": 0.8855, + "step": 17310 + }, + { + "epoch": 3.6926194539249146, + "grad_norm": 0.27535250852574406, + "learning_rate": 8.627375841744523e-05, + "loss": 0.9134, + "step": 17311 + }, + { + "epoch": 3.6928327645051193, + "grad_norm": 0.2554816856291132, + "learning_rate": 8.626527482445001e-05, + "loss": 0.922, + "step": 17312 + }, + { + "epoch": 3.6930460750853245, + "grad_norm": 0.25831082549802903, + "learning_rate": 8.625679116056312e-05, + "loss": 0.9093, + "step": 17313 + }, + { + "epoch": 3.693259385665529, + "grad_norm": 0.19978527964152124, + "learning_rate": 8.624830742588055e-05, + "loss": 0.8981, + "step": 17314 + }, + { + "epoch": 3.693472696245734, + "grad_norm": 0.28178927520491553, + "learning_rate": 8.623982362049827e-05, + "loss": 0.8965, + "step": 17315 + }, + { + "epoch": 3.6936860068259385, + "grad_norm": 0.34624855709602137, + "learning_rate": 8.623133974451225e-05, + "loss": 0.9323, + "step": 17316 + }, + { + "epoch": 3.693899317406143, + "grad_norm": 0.2938003674742484, + "learning_rate": 8.622285579801855e-05, + "loss": 0.901, + "step": 17317 + }, + { + "epoch": 3.694112627986348, + "grad_norm": 0.27534694288366074, + "learning_rate": 8.62143717811131e-05, + "loss": 0.94, + "step": 17318 + }, + { + "epoch": 3.694325938566553, + "grad_norm": 0.25474355834518164, + "learning_rate": 8.620588769389195e-05, + "loss": 0.9021, + "step": 17319 + }, + { + "epoch": 3.6945392491467577, + "grad_norm": 0.2120310465092784, + "learning_rate": 8.619740353645108e-05, + "loss": 0.9201, + "step": 17320 + }, + { + "epoch": 3.6947525597269624, + "grad_norm": 0.2547123635455223, + "learning_rate": 8.618891930888648e-05, + "loss": 0.9197, + "step": 17321 + }, + { + "epoch": 3.6949658703071675, + "grad_norm": 0.2971924453859022, + "learning_rate": 8.618043501129415e-05, + "loss": 0.932, + "step": 17322 + }, + { + "epoch": 3.695179180887372, + "grad_norm": 0.20602576899601513, + "learning_rate": 8.61719506437701e-05, + "loss": 0.9209, + "step": 17323 + }, + { + "epoch": 3.695392491467577, + "grad_norm": 0.21830542374798256, + "learning_rate": 8.61634662064103e-05, + "loss": 0.9228, + "step": 17324 + }, + { + "epoch": 3.6956058020477816, + "grad_norm": 0.24896430063663186, + "learning_rate": 8.615498169931083e-05, + "loss": 0.94, + "step": 17325 + }, + { + "epoch": 3.6958191126279862, + "grad_norm": 0.23133390046762367, + "learning_rate": 8.61464971225676e-05, + "loss": 0.9107, + "step": 17326 + }, + { + "epoch": 3.696032423208191, + "grad_norm": 0.23332643856321206, + "learning_rate": 8.613801247627665e-05, + "loss": 0.9224, + "step": 17327 + }, + { + "epoch": 3.696245733788396, + "grad_norm": 0.20976790708978854, + "learning_rate": 8.6129527760534e-05, + "loss": 0.9196, + "step": 17328 + }, + { + "epoch": 3.6964590443686007, + "grad_norm": 0.23948003397264678, + "learning_rate": 8.612104297543563e-05, + "loss": 0.9141, + "step": 17329 + }, + { + "epoch": 3.6966723549488054, + "grad_norm": 0.2686390049200307, + "learning_rate": 8.611255812107754e-05, + "loss": 0.9055, + "step": 17330 + }, + { + "epoch": 3.69688566552901, + "grad_norm": 0.26382582968758705, + "learning_rate": 8.610407319755582e-05, + "loss": 0.9139, + "step": 17331 + }, + { + "epoch": 3.6970989761092152, + "grad_norm": 0.2842989689982686, + "learning_rate": 8.609558820496632e-05, + "loss": 0.902, + "step": 17332 + }, + { + "epoch": 3.69731228668942, + "grad_norm": 0.2743721947909269, + "learning_rate": 8.608710314340518e-05, + "loss": 0.9198, + "step": 17333 + }, + { + "epoch": 3.6975255972696246, + "grad_norm": 0.2584903547167332, + "learning_rate": 8.607861801296837e-05, + "loss": 0.9194, + "step": 17334 + }, + { + "epoch": 3.6977389078498293, + "grad_norm": 0.29147755603958175, + "learning_rate": 8.607013281375189e-05, + "loss": 0.9097, + "step": 17335 + }, + { + "epoch": 3.697952218430034, + "grad_norm": 0.33227046403101185, + "learning_rate": 8.606164754585177e-05, + "loss": 0.9259, + "step": 17336 + }, + { + "epoch": 3.6981655290102387, + "grad_norm": 0.23764814885284136, + "learning_rate": 8.6053162209364e-05, + "loss": 0.9324, + "step": 17337 + }, + { + "epoch": 3.698378839590444, + "grad_norm": 0.24096263596657497, + "learning_rate": 8.604467680438457e-05, + "loss": 0.9391, + "step": 17338 + }, + { + "epoch": 3.6985921501706485, + "grad_norm": 0.33063662445357744, + "learning_rate": 8.603619133100954e-05, + "loss": 0.9262, + "step": 17339 + }, + { + "epoch": 3.698805460750853, + "grad_norm": 0.3190684269561447, + "learning_rate": 8.602770578933488e-05, + "loss": 0.9103, + "step": 17340 + }, + { + "epoch": 3.6990187713310583, + "grad_norm": 0.28205457746966267, + "learning_rate": 8.601922017945665e-05, + "loss": 0.8868, + "step": 17341 + }, + { + "epoch": 3.699232081911263, + "grad_norm": 0.27123729786641976, + "learning_rate": 8.601073450147086e-05, + "loss": 0.9104, + "step": 17342 + }, + { + "epoch": 3.6994453924914676, + "grad_norm": 0.192095954143755, + "learning_rate": 8.600224875547348e-05, + "loss": 0.8791, + "step": 17343 + }, + { + "epoch": 3.6996587030716723, + "grad_norm": 0.24911458600320546, + "learning_rate": 8.599376294156056e-05, + "loss": 0.9209, + "step": 17344 + }, + { + "epoch": 3.699872013651877, + "grad_norm": 0.2926090272011675, + "learning_rate": 8.598527705982811e-05, + "loss": 0.8822, + "step": 17345 + }, + { + "epoch": 3.7000853242320817, + "grad_norm": 0.304147088163405, + "learning_rate": 8.597679111037215e-05, + "loss": 0.9221, + "step": 17346 + }, + { + "epoch": 3.700298634812287, + "grad_norm": 0.2955688415328898, + "learning_rate": 8.596830509328871e-05, + "loss": 0.9239, + "step": 17347 + }, + { + "epoch": 3.7005119453924915, + "grad_norm": 0.2533450888464828, + "learning_rate": 8.59598190086738e-05, + "loss": 0.9449, + "step": 17348 + }, + { + "epoch": 3.700725255972696, + "grad_norm": 0.22527958824459623, + "learning_rate": 8.595133285662343e-05, + "loss": 0.9213, + "step": 17349 + }, + { + "epoch": 3.700938566552901, + "grad_norm": 0.2336403051510735, + "learning_rate": 8.594284663723361e-05, + "loss": 0.9103, + "step": 17350 + }, + { + "epoch": 3.701151877133106, + "grad_norm": 0.2426183616012926, + "learning_rate": 8.593436035060041e-05, + "loss": 0.8994, + "step": 17351 + }, + { + "epoch": 3.7013651877133107, + "grad_norm": 0.25747128785006385, + "learning_rate": 8.592587399681978e-05, + "loss": 0.9227, + "step": 17352 + }, + { + "epoch": 3.7015784982935154, + "grad_norm": 0.22835974025456202, + "learning_rate": 8.591738757598783e-05, + "loss": 0.9312, + "step": 17353 + }, + { + "epoch": 3.70179180887372, + "grad_norm": 0.2307728390108988, + "learning_rate": 8.590890108820053e-05, + "loss": 0.9141, + "step": 17354 + }, + { + "epoch": 3.7020051194539247, + "grad_norm": 0.2556548832070048, + "learning_rate": 8.590041453355393e-05, + "loss": 0.9128, + "step": 17355 + }, + { + "epoch": 3.7022184300341294, + "grad_norm": 0.2289449646435334, + "learning_rate": 8.589192791214401e-05, + "loss": 0.9286, + "step": 17356 + }, + { + "epoch": 3.7024317406143346, + "grad_norm": 0.21771846289246358, + "learning_rate": 8.588344122406684e-05, + "loss": 0.9173, + "step": 17357 + }, + { + "epoch": 3.7026450511945392, + "grad_norm": 0.24709668447512717, + "learning_rate": 8.587495446941842e-05, + "loss": 0.914, + "step": 17358 + }, + { + "epoch": 3.702858361774744, + "grad_norm": 0.1867938376043729, + "learning_rate": 8.58664676482948e-05, + "loss": 0.9206, + "step": 17359 + }, + { + "epoch": 3.703071672354949, + "grad_norm": 0.24601703454466817, + "learning_rate": 8.585798076079201e-05, + "loss": 0.9253, + "step": 17360 + }, + { + "epoch": 3.7032849829351537, + "grad_norm": 0.21534584351420644, + "learning_rate": 8.584949380700605e-05, + "loss": 0.9141, + "step": 17361 + }, + { + "epoch": 3.7034982935153584, + "grad_norm": 0.2065623194821484, + "learning_rate": 8.584100678703298e-05, + "loss": 0.9303, + "step": 17362 + }, + { + "epoch": 3.703711604095563, + "grad_norm": 0.20154270167134453, + "learning_rate": 8.583251970096882e-05, + "loss": 0.9105, + "step": 17363 + }, + { + "epoch": 3.703924914675768, + "grad_norm": 0.2924512455531151, + "learning_rate": 8.582403254890962e-05, + "loss": 0.9556, + "step": 17364 + }, + { + "epoch": 3.7041382252559725, + "grad_norm": 0.21533932551028395, + "learning_rate": 8.581554533095135e-05, + "loss": 0.9154, + "step": 17365 + }, + { + "epoch": 3.7043515358361776, + "grad_norm": 0.2343170432156552, + "learning_rate": 8.580705804719012e-05, + "loss": 0.9692, + "step": 17366 + }, + { + "epoch": 3.7045648464163823, + "grad_norm": 0.2530417867847374, + "learning_rate": 8.579857069772193e-05, + "loss": 0.8867, + "step": 17367 + }, + { + "epoch": 3.704778156996587, + "grad_norm": 0.22277663405952622, + "learning_rate": 8.57900832826428e-05, + "loss": 0.9734, + "step": 17368 + }, + { + "epoch": 3.7049914675767917, + "grad_norm": 0.22808464869821335, + "learning_rate": 8.57815958020488e-05, + "loss": 0.895, + "step": 17369 + }, + { + "epoch": 3.705204778156997, + "grad_norm": 0.21274296083104696, + "learning_rate": 8.577310825603594e-05, + "loss": 0.8974, + "step": 17370 + }, + { + "epoch": 3.7054180887372015, + "grad_norm": 0.21473898601655872, + "learning_rate": 8.576462064470024e-05, + "loss": 0.9408, + "step": 17371 + }, + { + "epoch": 3.705631399317406, + "grad_norm": 0.2722465269742027, + "learning_rate": 8.575613296813779e-05, + "loss": 0.9277, + "step": 17372 + }, + { + "epoch": 3.705844709897611, + "grad_norm": 0.28623477923366186, + "learning_rate": 8.574764522644458e-05, + "loss": 0.9228, + "step": 17373 + }, + { + "epoch": 3.7060580204778155, + "grad_norm": 0.20422197193612737, + "learning_rate": 8.573915741971667e-05, + "loss": 0.9223, + "step": 17374 + }, + { + "epoch": 3.70627133105802, + "grad_norm": 0.19807434374409852, + "learning_rate": 8.57306695480501e-05, + "loss": 0.9073, + "step": 17375 + }, + { + "epoch": 3.7064846416382253, + "grad_norm": 0.1885217280032424, + "learning_rate": 8.572218161154091e-05, + "loss": 0.9159, + "step": 17376 + }, + { + "epoch": 3.70669795221843, + "grad_norm": 0.19222519230951043, + "learning_rate": 8.571369361028513e-05, + "loss": 0.9446, + "step": 17377 + }, + { + "epoch": 3.7069112627986347, + "grad_norm": 0.1827414392040064, + "learning_rate": 8.570520554437883e-05, + "loss": 0.9297, + "step": 17378 + }, + { + "epoch": 3.70712457337884, + "grad_norm": 0.20426674052689545, + "learning_rate": 8.569671741391801e-05, + "loss": 0.922, + "step": 17379 + }, + { + "epoch": 3.7073378839590445, + "grad_norm": 0.17905553173173389, + "learning_rate": 8.568822921899876e-05, + "loss": 0.9049, + "step": 17380 + }, + { + "epoch": 3.707551194539249, + "grad_norm": 0.18049154901675768, + "learning_rate": 8.567974095971709e-05, + "loss": 0.9042, + "step": 17381 + }, + { + "epoch": 3.707764505119454, + "grad_norm": 0.17600261956619237, + "learning_rate": 8.567125263616905e-05, + "loss": 0.9132, + "step": 17382 + }, + { + "epoch": 3.7079778156996586, + "grad_norm": 0.16745068348233355, + "learning_rate": 8.566276424845068e-05, + "loss": 0.918, + "step": 17383 + }, + { + "epoch": 3.7081911262798632, + "grad_norm": 0.2080068433869616, + "learning_rate": 8.565427579665807e-05, + "loss": 0.9514, + "step": 17384 + }, + { + "epoch": 3.7084044368600684, + "grad_norm": 0.21604707435707066, + "learning_rate": 8.564578728088722e-05, + "loss": 0.8933, + "step": 17385 + }, + { + "epoch": 3.708617747440273, + "grad_norm": 0.2137964172194621, + "learning_rate": 8.563729870123418e-05, + "loss": 0.9158, + "step": 17386 + }, + { + "epoch": 3.7088310580204777, + "grad_norm": 0.23817897963589715, + "learning_rate": 8.562881005779503e-05, + "loss": 0.9347, + "step": 17387 + }, + { + "epoch": 3.7090443686006824, + "grad_norm": 0.19308935717735648, + "learning_rate": 8.562032135066577e-05, + "loss": 0.9316, + "step": 17388 + }, + { + "epoch": 3.7092576791808876, + "grad_norm": 0.1924701704607394, + "learning_rate": 8.561183257994251e-05, + "loss": 0.8968, + "step": 17389 + }, + { + "epoch": 3.7094709897610922, + "grad_norm": 0.19750553191677545, + "learning_rate": 8.560334374572126e-05, + "loss": 0.9318, + "step": 17390 + }, + { + "epoch": 3.709684300341297, + "grad_norm": 0.17690030517521954, + "learning_rate": 8.559485484809808e-05, + "loss": 0.9377, + "step": 17391 + }, + { + "epoch": 3.7098976109215016, + "grad_norm": 0.2029985370740473, + "learning_rate": 8.558636588716904e-05, + "loss": 0.9095, + "step": 17392 + }, + { + "epoch": 3.7101109215017063, + "grad_norm": 0.2232384663422675, + "learning_rate": 8.557787686303013e-05, + "loss": 0.9306, + "step": 17393 + }, + { + "epoch": 3.7103242320819114, + "grad_norm": 0.17886979570922557, + "learning_rate": 8.556938777577749e-05, + "loss": 0.9024, + "step": 17394 + }, + { + "epoch": 3.710537542662116, + "grad_norm": 0.19494636750178815, + "learning_rate": 8.556089862550711e-05, + "loss": 0.9173, + "step": 17395 + }, + { + "epoch": 3.710750853242321, + "grad_norm": 0.21476023410741898, + "learning_rate": 8.55524094123151e-05, + "loss": 0.9219, + "step": 17396 + }, + { + "epoch": 3.7109641638225255, + "grad_norm": 0.2061030426523565, + "learning_rate": 8.554392013629745e-05, + "loss": 0.9197, + "step": 17397 + }, + { + "epoch": 3.7111774744027306, + "grad_norm": 0.21018675723714408, + "learning_rate": 8.553543079755027e-05, + "loss": 0.9125, + "step": 17398 + }, + { + "epoch": 3.7113907849829353, + "grad_norm": 0.16952912825447866, + "learning_rate": 8.552694139616959e-05, + "loss": 0.9203, + "step": 17399 + }, + { + "epoch": 3.71160409556314, + "grad_norm": 0.17503448455057485, + "learning_rate": 8.551845193225149e-05, + "loss": 0.9603, + "step": 17400 + }, + { + "epoch": 3.7118174061433447, + "grad_norm": 0.17255255853452575, + "learning_rate": 8.550996240589198e-05, + "loss": 0.9394, + "step": 17401 + }, + { + "epoch": 3.7120307167235493, + "grad_norm": 0.17274539784154336, + "learning_rate": 8.550147281718717e-05, + "loss": 0.9289, + "step": 17402 + }, + { + "epoch": 3.712244027303754, + "grad_norm": 0.19300897072050366, + "learning_rate": 8.549298316623312e-05, + "loss": 0.9224, + "step": 17403 + }, + { + "epoch": 3.712457337883959, + "grad_norm": 0.2308595563889261, + "learning_rate": 8.548449345312586e-05, + "loss": 0.9136, + "step": 17404 + }, + { + "epoch": 3.712670648464164, + "grad_norm": 0.2620499261536117, + "learning_rate": 8.547600367796145e-05, + "loss": 0.9517, + "step": 17405 + }, + { + "epoch": 3.7128839590443685, + "grad_norm": 0.23329868543999913, + "learning_rate": 8.5467513840836e-05, + "loss": 0.9098, + "step": 17406 + }, + { + "epoch": 3.7130972696245736, + "grad_norm": 0.1998162255659457, + "learning_rate": 8.54590239418455e-05, + "loss": 0.9194, + "step": 17407 + }, + { + "epoch": 3.7133105802047783, + "grad_norm": 0.17846154624664573, + "learning_rate": 8.545053398108608e-05, + "loss": 0.9117, + "step": 17408 + }, + { + "epoch": 3.713523890784983, + "grad_norm": 0.16691934048725585, + "learning_rate": 8.544204395865377e-05, + "loss": 0.9038, + "step": 17409 + }, + { + "epoch": 3.7137372013651877, + "grad_norm": 0.18666950961645684, + "learning_rate": 8.543355387464464e-05, + "loss": 0.913, + "step": 17410 + }, + { + "epoch": 3.7139505119453924, + "grad_norm": 0.2158110686825726, + "learning_rate": 8.542506372915474e-05, + "loss": 0.9063, + "step": 17411 + }, + { + "epoch": 3.714163822525597, + "grad_norm": 0.19872146962172074, + "learning_rate": 8.541657352228019e-05, + "loss": 0.9364, + "step": 17412 + }, + { + "epoch": 3.714377133105802, + "grad_norm": 0.17444654190230294, + "learning_rate": 8.540808325411697e-05, + "loss": 0.9023, + "step": 17413 + }, + { + "epoch": 3.714590443686007, + "grad_norm": 0.18754080660433686, + "learning_rate": 8.539959292476124e-05, + "loss": 0.9486, + "step": 17414 + }, + { + "epoch": 3.7148037542662116, + "grad_norm": 0.1712846814588955, + "learning_rate": 8.539110253430902e-05, + "loss": 0.9176, + "step": 17415 + }, + { + "epoch": 3.7150170648464163, + "grad_norm": 0.20099671985676607, + "learning_rate": 8.538261208285637e-05, + "loss": 0.9322, + "step": 17416 + }, + { + "epoch": 3.7152303754266214, + "grad_norm": 0.18044744344419358, + "learning_rate": 8.537412157049937e-05, + "loss": 0.9022, + "step": 17417 + }, + { + "epoch": 3.715443686006826, + "grad_norm": 0.23237585630838, + "learning_rate": 8.53656309973341e-05, + "loss": 0.8952, + "step": 17418 + }, + { + "epoch": 3.7156569965870307, + "grad_norm": 0.20665465241497805, + "learning_rate": 8.535714036345662e-05, + "loss": 0.9303, + "step": 17419 + }, + { + "epoch": 3.7158703071672354, + "grad_norm": 0.17440742811930168, + "learning_rate": 8.534864966896303e-05, + "loss": 0.9058, + "step": 17420 + }, + { + "epoch": 3.71608361774744, + "grad_norm": 0.23977705910068262, + "learning_rate": 8.534015891394936e-05, + "loss": 0.9128, + "step": 17421 + }, + { + "epoch": 3.716296928327645, + "grad_norm": 0.1789270919853829, + "learning_rate": 8.53316680985117e-05, + "loss": 0.8847, + "step": 17422 + }, + { + "epoch": 3.71651023890785, + "grad_norm": 0.21054254539520156, + "learning_rate": 8.532317722274612e-05, + "loss": 0.917, + "step": 17423 + }, + { + "epoch": 3.7167235494880546, + "grad_norm": 0.23055986675184165, + "learning_rate": 8.531468628674872e-05, + "loss": 0.9324, + "step": 17424 + }, + { + "epoch": 3.7169368600682593, + "grad_norm": 0.23320027971936938, + "learning_rate": 8.530619529061552e-05, + "loss": 0.9312, + "step": 17425 + }, + { + "epoch": 3.7171501706484644, + "grad_norm": 0.17146584473159854, + "learning_rate": 8.52977042344427e-05, + "loss": 0.9136, + "step": 17426 + }, + { + "epoch": 3.717363481228669, + "grad_norm": 0.21194195100722915, + "learning_rate": 8.52892131183262e-05, + "loss": 0.9368, + "step": 17427 + }, + { + "epoch": 3.717576791808874, + "grad_norm": 0.2274690587569353, + "learning_rate": 8.528072194236218e-05, + "loss": 0.9156, + "step": 17428 + }, + { + "epoch": 3.7177901023890785, + "grad_norm": 0.2732764966286922, + "learning_rate": 8.52722307066467e-05, + "loss": 0.8928, + "step": 17429 + }, + { + "epoch": 3.718003412969283, + "grad_norm": 0.311572086648698, + "learning_rate": 8.526373941127587e-05, + "loss": 0.9076, + "step": 17430 + }, + { + "epoch": 3.718216723549488, + "grad_norm": 0.23028009803350316, + "learning_rate": 8.525524805634572e-05, + "loss": 0.9419, + "step": 17431 + }, + { + "epoch": 3.718430034129693, + "grad_norm": 0.19181432022961292, + "learning_rate": 8.524675664195237e-05, + "loss": 0.9188, + "step": 17432 + }, + { + "epoch": 3.7186433447098977, + "grad_norm": 0.2097150830071183, + "learning_rate": 8.523826516819185e-05, + "loss": 0.9152, + "step": 17433 + }, + { + "epoch": 3.7188566552901023, + "grad_norm": 0.20705180192586192, + "learning_rate": 8.522977363516028e-05, + "loss": 0.8987, + "step": 17434 + }, + { + "epoch": 3.719069965870307, + "grad_norm": 0.19842776578327653, + "learning_rate": 8.522128204295373e-05, + "loss": 0.9357, + "step": 17435 + }, + { + "epoch": 3.719283276450512, + "grad_norm": 0.23299477967292148, + "learning_rate": 8.52127903916683e-05, + "loss": 0.9211, + "step": 17436 + }, + { + "epoch": 3.719496587030717, + "grad_norm": 0.23779940682671835, + "learning_rate": 8.520429868140005e-05, + "loss": 0.8971, + "step": 17437 + }, + { + "epoch": 3.7197098976109215, + "grad_norm": 0.19191635112176011, + "learning_rate": 8.519580691224508e-05, + "loss": 0.9052, + "step": 17438 + }, + { + "epoch": 3.719923208191126, + "grad_norm": 0.20801415477462548, + "learning_rate": 8.518731508429945e-05, + "loss": 0.9023, + "step": 17439 + }, + { + "epoch": 3.720136518771331, + "grad_norm": 0.208183317430295, + "learning_rate": 8.517882319765928e-05, + "loss": 0.9242, + "step": 17440 + }, + { + "epoch": 3.7203498293515356, + "grad_norm": 0.18482420111839748, + "learning_rate": 8.517033125242062e-05, + "loss": 0.8981, + "step": 17441 + }, + { + "epoch": 3.7205631399317407, + "grad_norm": 0.2330527400816158, + "learning_rate": 8.51618392486796e-05, + "loss": 0.9104, + "step": 17442 + }, + { + "epoch": 3.7207764505119454, + "grad_norm": 0.23755904581825907, + "learning_rate": 8.515334718653228e-05, + "loss": 0.8994, + "step": 17443 + }, + { + "epoch": 3.72098976109215, + "grad_norm": 0.2640724804864001, + "learning_rate": 8.514485506607473e-05, + "loss": 0.9159, + "step": 17444 + }, + { + "epoch": 3.721203071672355, + "grad_norm": 0.28984041111577613, + "learning_rate": 8.513636288740308e-05, + "loss": 0.9224, + "step": 17445 + }, + { + "epoch": 3.72141638225256, + "grad_norm": 0.36305623530414577, + "learning_rate": 8.512787065061337e-05, + "loss": 0.9204, + "step": 17446 + }, + { + "epoch": 3.7216296928327646, + "grad_norm": 0.266315985762627, + "learning_rate": 8.511937835580172e-05, + "loss": 0.9036, + "step": 17447 + }, + { + "epoch": 3.7218430034129693, + "grad_norm": 0.19713393969825585, + "learning_rate": 8.511088600306427e-05, + "loss": 0.9125, + "step": 17448 + }, + { + "epoch": 3.722056313993174, + "grad_norm": 0.27751885959900296, + "learning_rate": 8.5102393592497e-05, + "loss": 0.9175, + "step": 17449 + }, + { + "epoch": 3.7222696245733786, + "grad_norm": 0.3187464059722229, + "learning_rate": 8.509390112419609e-05, + "loss": 0.9159, + "step": 17450 + }, + { + "epoch": 3.7224829351535837, + "grad_norm": 0.3027278867812728, + "learning_rate": 8.508540859825758e-05, + "loss": 0.9215, + "step": 17451 + }, + { + "epoch": 3.7226962457337884, + "grad_norm": 0.24750471493032153, + "learning_rate": 8.50769160147776e-05, + "loss": 0.9468, + "step": 17452 + }, + { + "epoch": 3.722909556313993, + "grad_norm": 0.20498280530826724, + "learning_rate": 8.506842337385223e-05, + "loss": 0.9093, + "step": 17453 + }, + { + "epoch": 3.723122866894198, + "grad_norm": 0.27074696593184266, + "learning_rate": 8.505993067557758e-05, + "loss": 0.9079, + "step": 17454 + }, + { + "epoch": 3.723336177474403, + "grad_norm": 0.38964173847249167, + "learning_rate": 8.50514379200497e-05, + "loss": 0.93, + "step": 17455 + }, + { + "epoch": 3.7235494880546076, + "grad_norm": 0.32981715069382994, + "learning_rate": 8.504294510736474e-05, + "loss": 0.9594, + "step": 17456 + }, + { + "epoch": 3.7237627986348123, + "grad_norm": 0.18209955396949912, + "learning_rate": 8.503445223761874e-05, + "loss": 0.8877, + "step": 17457 + }, + { + "epoch": 3.723976109215017, + "grad_norm": 0.2100312070853556, + "learning_rate": 8.502595931090784e-05, + "loss": 0.9146, + "step": 17458 + }, + { + "epoch": 3.7241894197952217, + "grad_norm": 0.2450304012800032, + "learning_rate": 8.501746632732815e-05, + "loss": 0.916, + "step": 17459 + }, + { + "epoch": 3.7244027303754264, + "grad_norm": 0.18819283196618705, + "learning_rate": 8.500897328697575e-05, + "loss": 0.9132, + "step": 17460 + }, + { + "epoch": 3.7246160409556315, + "grad_norm": 0.25124962194452066, + "learning_rate": 8.500048018994669e-05, + "loss": 0.9366, + "step": 17461 + }, + { + "epoch": 3.724829351535836, + "grad_norm": 0.24561392673998392, + "learning_rate": 8.499198703633713e-05, + "loss": 0.8883, + "step": 17462 + }, + { + "epoch": 3.725042662116041, + "grad_norm": 0.21734786986122792, + "learning_rate": 8.498349382624316e-05, + "loss": 0.9113, + "step": 17463 + }, + { + "epoch": 3.725255972696246, + "grad_norm": 0.2967856860837834, + "learning_rate": 8.497500055976087e-05, + "loss": 0.9387, + "step": 17464 + }, + { + "epoch": 3.7254692832764507, + "grad_norm": 0.3124680138343121, + "learning_rate": 8.496650723698637e-05, + "loss": 0.8965, + "step": 17465 + }, + { + "epoch": 3.7256825938566553, + "grad_norm": 0.19403175791825564, + "learning_rate": 8.495801385801576e-05, + "loss": 0.9162, + "step": 17466 + }, + { + "epoch": 3.72589590443686, + "grad_norm": 0.25777087912726276, + "learning_rate": 8.494952042294511e-05, + "loss": 0.9083, + "step": 17467 + }, + { + "epoch": 3.7261092150170647, + "grad_norm": 0.25044110743895964, + "learning_rate": 8.494102693187059e-05, + "loss": 0.9036, + "step": 17468 + }, + { + "epoch": 3.7263225255972694, + "grad_norm": 0.20498205879015183, + "learning_rate": 8.493253338488822e-05, + "loss": 0.8994, + "step": 17469 + }, + { + "epoch": 3.7265358361774745, + "grad_norm": 0.22153391535961742, + "learning_rate": 8.49240397820942e-05, + "loss": 0.9328, + "step": 17470 + }, + { + "epoch": 3.726749146757679, + "grad_norm": 0.21933224107902863, + "learning_rate": 8.491554612358458e-05, + "loss": 0.9081, + "step": 17471 + }, + { + "epoch": 3.726962457337884, + "grad_norm": 0.21290283067054405, + "learning_rate": 8.490705240945547e-05, + "loss": 0.9315, + "step": 17472 + }, + { + "epoch": 3.7271757679180886, + "grad_norm": 0.2133962336786503, + "learning_rate": 8.489855863980297e-05, + "loss": 0.9164, + "step": 17473 + }, + { + "epoch": 3.7273890784982937, + "grad_norm": 0.24272019117778493, + "learning_rate": 8.48900648147232e-05, + "loss": 0.9366, + "step": 17474 + }, + { + "epoch": 3.7276023890784984, + "grad_norm": 0.207496888230388, + "learning_rate": 8.488157093431225e-05, + "loss": 0.9196, + "step": 17475 + }, + { + "epoch": 3.727815699658703, + "grad_norm": 0.22862306458562884, + "learning_rate": 8.487307699866627e-05, + "loss": 0.888, + "step": 17476 + }, + { + "epoch": 3.7280290102389078, + "grad_norm": 0.25355105328591915, + "learning_rate": 8.486458300788132e-05, + "loss": 0.906, + "step": 17477 + }, + { + "epoch": 3.7282423208191124, + "grad_norm": 0.2107844168160261, + "learning_rate": 8.485608896205355e-05, + "loss": 0.8892, + "step": 17478 + }, + { + "epoch": 3.7284556313993176, + "grad_norm": 0.2605616441294855, + "learning_rate": 8.484759486127903e-05, + "loss": 0.8952, + "step": 17479 + }, + { + "epoch": 3.7286689419795223, + "grad_norm": 0.29090718901160645, + "learning_rate": 8.48391007056539e-05, + "loss": 0.9205, + "step": 17480 + }, + { + "epoch": 3.728882252559727, + "grad_norm": 0.18657370955447086, + "learning_rate": 8.483060649527426e-05, + "loss": 0.9102, + "step": 17481 + }, + { + "epoch": 3.7290955631399316, + "grad_norm": 0.2599753368149165, + "learning_rate": 8.482211223023626e-05, + "loss": 0.9453, + "step": 17482 + }, + { + "epoch": 3.7293088737201368, + "grad_norm": 0.2500406806556055, + "learning_rate": 8.481361791063595e-05, + "loss": 0.9101, + "step": 17483 + }, + { + "epoch": 3.7295221843003414, + "grad_norm": 0.19312284333148094, + "learning_rate": 8.480512353656947e-05, + "loss": 0.8983, + "step": 17484 + }, + { + "epoch": 3.729735494880546, + "grad_norm": 0.1863434685035784, + "learning_rate": 8.479662910813293e-05, + "loss": 0.926, + "step": 17485 + }, + { + "epoch": 3.729948805460751, + "grad_norm": 0.19945889721487195, + "learning_rate": 8.478813462542248e-05, + "loss": 0.9171, + "step": 17486 + }, + { + "epoch": 3.7301621160409555, + "grad_norm": 0.19726555350793828, + "learning_rate": 8.47796400885342e-05, + "loss": 0.9125, + "step": 17487 + }, + { + "epoch": 3.73037542662116, + "grad_norm": 0.2260265172507184, + "learning_rate": 8.47711454975642e-05, + "loss": 0.9306, + "step": 17488 + }, + { + "epoch": 3.7305887372013653, + "grad_norm": 0.23994480418825195, + "learning_rate": 8.47626508526086e-05, + "loss": 0.9273, + "step": 17489 + }, + { + "epoch": 3.73080204778157, + "grad_norm": 0.1806778681762367, + "learning_rate": 8.475415615376353e-05, + "loss": 0.8979, + "step": 17490 + }, + { + "epoch": 3.7310153583617747, + "grad_norm": 0.1866069321354577, + "learning_rate": 8.47456614011251e-05, + "loss": 0.9028, + "step": 17491 + }, + { + "epoch": 3.73122866894198, + "grad_norm": 0.22901816577819592, + "learning_rate": 8.473716659478945e-05, + "loss": 0.8824, + "step": 17492 + }, + { + "epoch": 3.7314419795221845, + "grad_norm": 0.18198643979211904, + "learning_rate": 8.472867173485267e-05, + "loss": 0.9137, + "step": 17493 + }, + { + "epoch": 3.731655290102389, + "grad_norm": 0.24297447548148574, + "learning_rate": 8.472017682141091e-05, + "loss": 0.9234, + "step": 17494 + }, + { + "epoch": 3.731868600682594, + "grad_norm": 0.25052021726236207, + "learning_rate": 8.471168185456024e-05, + "loss": 0.9195, + "step": 17495 + }, + { + "epoch": 3.7320819112627985, + "grad_norm": 0.20447686682937521, + "learning_rate": 8.470318683439681e-05, + "loss": 0.9218, + "step": 17496 + }, + { + "epoch": 3.732295221843003, + "grad_norm": 0.20293397300280047, + "learning_rate": 8.469469176101674e-05, + "loss": 0.8973, + "step": 17497 + }, + { + "epoch": 3.7325085324232083, + "grad_norm": 0.2140834919534603, + "learning_rate": 8.468619663451619e-05, + "loss": 0.8848, + "step": 17498 + }, + { + "epoch": 3.732721843003413, + "grad_norm": 0.19869771705514144, + "learning_rate": 8.467770145499121e-05, + "loss": 0.9425, + "step": 17499 + }, + { + "epoch": 3.7329351535836177, + "grad_norm": 0.19791998875174183, + "learning_rate": 8.466920622253797e-05, + "loss": 0.9323, + "step": 17500 + }, + { + "epoch": 3.7331484641638224, + "grad_norm": 0.16658052740561413, + "learning_rate": 8.466071093725258e-05, + "loss": 0.9114, + "step": 17501 + }, + { + "epoch": 3.7333617747440275, + "grad_norm": 0.1929266582846319, + "learning_rate": 8.465221559923118e-05, + "loss": 0.9102, + "step": 17502 + }, + { + "epoch": 3.733575085324232, + "grad_norm": 0.17228519642642096, + "learning_rate": 8.464372020856987e-05, + "loss": 0.9045, + "step": 17503 + }, + { + "epoch": 3.733788395904437, + "grad_norm": 0.2565195470740108, + "learning_rate": 8.463522476536481e-05, + "loss": 0.9292, + "step": 17504 + }, + { + "epoch": 3.7340017064846416, + "grad_norm": 0.2393225400952661, + "learning_rate": 8.462672926971208e-05, + "loss": 0.949, + "step": 17505 + }, + { + "epoch": 3.7342150170648463, + "grad_norm": 0.20590653932137667, + "learning_rate": 8.461823372170785e-05, + "loss": 0.9053, + "step": 17506 + }, + { + "epoch": 3.734428327645051, + "grad_norm": 0.21812078151974953, + "learning_rate": 8.460973812144819e-05, + "loss": 0.9017, + "step": 17507 + }, + { + "epoch": 3.734641638225256, + "grad_norm": 0.19905107215557674, + "learning_rate": 8.46012424690293e-05, + "loss": 0.8874, + "step": 17508 + }, + { + "epoch": 3.7348549488054608, + "grad_norm": 0.19223167117092124, + "learning_rate": 8.459274676454725e-05, + "loss": 0.9137, + "step": 17509 + }, + { + "epoch": 3.7350682593856654, + "grad_norm": 0.1757591852872795, + "learning_rate": 8.458425100809823e-05, + "loss": 0.8831, + "step": 17510 + }, + { + "epoch": 3.7352815699658706, + "grad_norm": 0.21286862827094533, + "learning_rate": 8.457575519977829e-05, + "loss": 0.9427, + "step": 17511 + }, + { + "epoch": 3.7354948805460753, + "grad_norm": 0.20261196136533116, + "learning_rate": 8.456725933968363e-05, + "loss": 0.9075, + "step": 17512 + }, + { + "epoch": 3.73570819112628, + "grad_norm": 0.19316896379541432, + "learning_rate": 8.455876342791034e-05, + "loss": 0.8936, + "step": 17513 + }, + { + "epoch": 3.7359215017064846, + "grad_norm": 0.2032313281050393, + "learning_rate": 8.455026746455458e-05, + "loss": 0.8932, + "step": 17514 + }, + { + "epoch": 3.7361348122866893, + "grad_norm": 0.18825559489751725, + "learning_rate": 8.454177144971246e-05, + "loss": 0.9199, + "step": 17515 + }, + { + "epoch": 3.736348122866894, + "grad_norm": 0.21566401143668704, + "learning_rate": 8.453327538348012e-05, + "loss": 0.9032, + "step": 17516 + }, + { + "epoch": 3.736561433447099, + "grad_norm": 0.17139574187765794, + "learning_rate": 8.452477926595368e-05, + "loss": 0.9002, + "step": 17517 + }, + { + "epoch": 3.736774744027304, + "grad_norm": 0.1810446540409975, + "learning_rate": 8.45162830972293e-05, + "loss": 0.9131, + "step": 17518 + }, + { + "epoch": 3.7369880546075085, + "grad_norm": 0.2080215425906886, + "learning_rate": 8.450778687740309e-05, + "loss": 0.9058, + "step": 17519 + }, + { + "epoch": 3.737201365187713, + "grad_norm": 0.15680544452217315, + "learning_rate": 8.449929060657122e-05, + "loss": 0.9263, + "step": 17520 + }, + { + "epoch": 3.7374146757679183, + "grad_norm": 0.219762936854807, + "learning_rate": 8.449079428482979e-05, + "loss": 0.9164, + "step": 17521 + }, + { + "epoch": 3.737627986348123, + "grad_norm": 0.21055111654953773, + "learning_rate": 8.448229791227495e-05, + "loss": 0.9019, + "step": 17522 + }, + { + "epoch": 3.7378412969283277, + "grad_norm": 0.22757017396787327, + "learning_rate": 8.44738014890028e-05, + "loss": 0.9483, + "step": 17523 + }, + { + "epoch": 3.7380546075085324, + "grad_norm": 0.23739732831045304, + "learning_rate": 8.446530501510955e-05, + "loss": 0.9371, + "step": 17524 + }, + { + "epoch": 3.738267918088737, + "grad_norm": 0.26318756594490667, + "learning_rate": 8.445680849069128e-05, + "loss": 0.9336, + "step": 17525 + }, + { + "epoch": 3.7384812286689417, + "grad_norm": 0.2787540371001305, + "learning_rate": 8.444831191584415e-05, + "loss": 0.9104, + "step": 17526 + }, + { + "epoch": 3.738694539249147, + "grad_norm": 0.2836030447884412, + "learning_rate": 8.44398152906643e-05, + "loss": 0.9071, + "step": 17527 + }, + { + "epoch": 3.7389078498293515, + "grad_norm": 0.23010291584932363, + "learning_rate": 8.443131861524788e-05, + "loss": 0.9041, + "step": 17528 + }, + { + "epoch": 3.739121160409556, + "grad_norm": 0.20746652587284845, + "learning_rate": 8.442282188969097e-05, + "loss": 0.9044, + "step": 17529 + }, + { + "epoch": 3.7393344709897613, + "grad_norm": 0.27656107013241243, + "learning_rate": 8.44143251140898e-05, + "loss": 0.8859, + "step": 17530 + }, + { + "epoch": 3.739547781569966, + "grad_norm": 0.2575533293731345, + "learning_rate": 8.440582828854045e-05, + "loss": 0.9005, + "step": 17531 + }, + { + "epoch": 3.7397610921501707, + "grad_norm": 0.19276266037869577, + "learning_rate": 8.439733141313909e-05, + "loss": 0.9156, + "step": 17532 + }, + { + "epoch": 3.7399744027303754, + "grad_norm": 0.2597980825696073, + "learning_rate": 8.438883448798184e-05, + "loss": 0.9188, + "step": 17533 + }, + { + "epoch": 3.74018771331058, + "grad_norm": 0.29373094814495265, + "learning_rate": 8.438033751316487e-05, + "loss": 0.9175, + "step": 17534 + }, + { + "epoch": 3.7404010238907848, + "grad_norm": 0.22273793713084342, + "learning_rate": 8.437184048878428e-05, + "loss": 0.8954, + "step": 17535 + }, + { + "epoch": 3.74061433447099, + "grad_norm": 0.24140706436079415, + "learning_rate": 8.436334341493624e-05, + "loss": 0.9216, + "step": 17536 + }, + { + "epoch": 3.7408276450511946, + "grad_norm": 0.2639485842147496, + "learning_rate": 8.435484629171691e-05, + "loss": 0.9245, + "step": 17537 + }, + { + "epoch": 3.7410409556313993, + "grad_norm": 0.20719502553378075, + "learning_rate": 8.434634911922245e-05, + "loss": 0.9026, + "step": 17538 + }, + { + "epoch": 3.741254266211604, + "grad_norm": 0.2706173562189773, + "learning_rate": 8.433785189754893e-05, + "loss": 0.8902, + "step": 17539 + }, + { + "epoch": 3.741467576791809, + "grad_norm": 0.252923227849676, + "learning_rate": 8.432935462679256e-05, + "loss": 0.867, + "step": 17540 + }, + { + "epoch": 3.7416808873720138, + "grad_norm": 0.23313369097161094, + "learning_rate": 8.432085730704947e-05, + "loss": 0.9364, + "step": 17541 + }, + { + "epoch": 3.7418941979522184, + "grad_norm": 0.24540728206968557, + "learning_rate": 8.431235993841581e-05, + "loss": 0.9256, + "step": 17542 + }, + { + "epoch": 3.742107508532423, + "grad_norm": 0.2399552235779389, + "learning_rate": 8.430386252098772e-05, + "loss": 0.9479, + "step": 17543 + }, + { + "epoch": 3.742320819112628, + "grad_norm": 0.23803407584581782, + "learning_rate": 8.429536505486136e-05, + "loss": 0.9119, + "step": 17544 + }, + { + "epoch": 3.7425341296928325, + "grad_norm": 0.29032981651240386, + "learning_rate": 8.428686754013286e-05, + "loss": 0.9276, + "step": 17545 + }, + { + "epoch": 3.7427474402730376, + "grad_norm": 0.23734697755520284, + "learning_rate": 8.42783699768984e-05, + "loss": 0.9195, + "step": 17546 + }, + { + "epoch": 3.7429607508532423, + "grad_norm": 0.17545190144181164, + "learning_rate": 8.426987236525408e-05, + "loss": 0.911, + "step": 17547 + }, + { + "epoch": 3.743174061433447, + "grad_norm": 0.24260149962152014, + "learning_rate": 8.426137470529611e-05, + "loss": 0.9011, + "step": 17548 + }, + { + "epoch": 3.743387372013652, + "grad_norm": 0.46581346090359355, + "learning_rate": 8.42528769971206e-05, + "loss": 0.902, + "step": 17549 + }, + { + "epoch": 3.743600682593857, + "grad_norm": 0.2410185945644049, + "learning_rate": 8.424437924082374e-05, + "loss": 0.9069, + "step": 17550 + }, + { + "epoch": 3.7438139931740615, + "grad_norm": 0.19959475763835205, + "learning_rate": 8.423588143650164e-05, + "loss": 0.9105, + "step": 17551 + }, + { + "epoch": 3.744027303754266, + "grad_norm": 0.2352078053443052, + "learning_rate": 8.422738358425045e-05, + "loss": 0.9076, + "step": 17552 + }, + { + "epoch": 3.744240614334471, + "grad_norm": 0.19724519228223342, + "learning_rate": 8.421888568416636e-05, + "loss": 0.9204, + "step": 17553 + }, + { + "epoch": 3.7444539249146755, + "grad_norm": 0.21713353148889808, + "learning_rate": 8.421038773634552e-05, + "loss": 0.9447, + "step": 17554 + }, + { + "epoch": 3.7446672354948807, + "grad_norm": 0.21425927891402272, + "learning_rate": 8.420188974088407e-05, + "loss": 0.9411, + "step": 17555 + }, + { + "epoch": 3.7448805460750854, + "grad_norm": 0.23138375062969244, + "learning_rate": 8.419339169787818e-05, + "loss": 0.9247, + "step": 17556 + }, + { + "epoch": 3.74509385665529, + "grad_norm": 0.2131238874101732, + "learning_rate": 8.418489360742394e-05, + "loss": 0.9376, + "step": 17557 + }, + { + "epoch": 3.7453071672354947, + "grad_norm": 0.21787297231541905, + "learning_rate": 8.41763954696176e-05, + "loss": 0.9113, + "step": 17558 + }, + { + "epoch": 3.7455204778157, + "grad_norm": 0.2153553747907588, + "learning_rate": 8.416789728455526e-05, + "loss": 0.9086, + "step": 17559 + }, + { + "epoch": 3.7457337883959045, + "grad_norm": 0.21933429095037082, + "learning_rate": 8.41593990523331e-05, + "loss": 0.89, + "step": 17560 + }, + { + "epoch": 3.7459470989761092, + "grad_norm": 0.22899939710209635, + "learning_rate": 8.415090077304728e-05, + "loss": 0.9224, + "step": 17561 + }, + { + "epoch": 3.746160409556314, + "grad_norm": 0.19153083428188117, + "learning_rate": 8.414240244679393e-05, + "loss": 0.8857, + "step": 17562 + }, + { + "epoch": 3.7463737201365186, + "grad_norm": 0.2569996679289191, + "learning_rate": 8.413390407366923e-05, + "loss": 0.9087, + "step": 17563 + }, + { + "epoch": 3.7465870307167233, + "grad_norm": 0.19194650713744876, + "learning_rate": 8.412540565376934e-05, + "loss": 0.9128, + "step": 17564 + }, + { + "epoch": 3.7468003412969284, + "grad_norm": 0.18736223764541066, + "learning_rate": 8.41169071871904e-05, + "loss": 0.9191, + "step": 17565 + }, + { + "epoch": 3.747013651877133, + "grad_norm": 0.19940484817385354, + "learning_rate": 8.41084086740286e-05, + "loss": 0.9039, + "step": 17566 + }, + { + "epoch": 3.7472269624573378, + "grad_norm": 0.23675222734783857, + "learning_rate": 8.40999101143801e-05, + "loss": 0.9283, + "step": 17567 + }, + { + "epoch": 3.747440273037543, + "grad_norm": 0.2619840603117448, + "learning_rate": 8.409141150834103e-05, + "loss": 0.9417, + "step": 17568 + }, + { + "epoch": 3.7476535836177476, + "grad_norm": 0.18712664076080293, + "learning_rate": 8.408291285600757e-05, + "loss": 0.9225, + "step": 17569 + }, + { + "epoch": 3.7478668941979523, + "grad_norm": 0.22582874522596008, + "learning_rate": 8.407441415747589e-05, + "loss": 0.8739, + "step": 17570 + }, + { + "epoch": 3.748080204778157, + "grad_norm": 0.1936839793737442, + "learning_rate": 8.406591541284216e-05, + "loss": 0.8945, + "step": 17571 + }, + { + "epoch": 3.7482935153583616, + "grad_norm": 0.22141292741879245, + "learning_rate": 8.405741662220249e-05, + "loss": 0.923, + "step": 17572 + }, + { + "epoch": 3.7485068259385663, + "grad_norm": 0.2546479280527574, + "learning_rate": 8.40489177856531e-05, + "loss": 0.9158, + "step": 17573 + }, + { + "epoch": 3.7487201365187715, + "grad_norm": 0.17605280434126455, + "learning_rate": 8.404041890329016e-05, + "loss": 0.8902, + "step": 17574 + }, + { + "epoch": 3.748933447098976, + "grad_norm": 0.21701164959258878, + "learning_rate": 8.403191997520976e-05, + "loss": 0.9194, + "step": 17575 + }, + { + "epoch": 3.749146757679181, + "grad_norm": 0.2054101687000345, + "learning_rate": 8.402342100150817e-05, + "loss": 0.9033, + "step": 17576 + }, + { + "epoch": 3.7493600682593855, + "grad_norm": 0.22477009687327468, + "learning_rate": 8.401492198228148e-05, + "loss": 0.9188, + "step": 17577 + }, + { + "epoch": 3.7495733788395906, + "grad_norm": 0.2342704048757253, + "learning_rate": 8.400642291762588e-05, + "loss": 0.9313, + "step": 17578 + }, + { + "epoch": 3.7497866894197953, + "grad_norm": 0.22552039795799803, + "learning_rate": 8.399792380763755e-05, + "loss": 0.9094, + "step": 17579 + }, + { + "epoch": 3.75, + "grad_norm": 0.22439082767332788, + "learning_rate": 8.398942465241265e-05, + "loss": 0.9245, + "step": 17580 + }, + { + "epoch": 3.7502133105802047, + "grad_norm": 0.1963752330104886, + "learning_rate": 8.39809254520473e-05, + "loss": 0.9246, + "step": 17581 + }, + { + "epoch": 3.7504266211604094, + "grad_norm": 0.19974810991568567, + "learning_rate": 8.397242620663776e-05, + "loss": 0.912, + "step": 17582 + }, + { + "epoch": 3.7506399317406145, + "grad_norm": 0.23819882247712176, + "learning_rate": 8.396392691628014e-05, + "loss": 0.8994, + "step": 17583 + }, + { + "epoch": 3.750853242320819, + "grad_norm": 0.2927374338786201, + "learning_rate": 8.39554275810706e-05, + "loss": 0.9215, + "step": 17584 + }, + { + "epoch": 3.751066552901024, + "grad_norm": 0.30711454712438185, + "learning_rate": 8.394692820110534e-05, + "loss": 0.8864, + "step": 17585 + }, + { + "epoch": 3.7512798634812285, + "grad_norm": 0.3590264890999308, + "learning_rate": 8.393842877648052e-05, + "loss": 0.9197, + "step": 17586 + }, + { + "epoch": 3.7514931740614337, + "grad_norm": 0.3682056898709778, + "learning_rate": 8.392992930729231e-05, + "loss": 0.9197, + "step": 17587 + }, + { + "epoch": 3.7517064846416384, + "grad_norm": 0.2905114415305505, + "learning_rate": 8.392142979363689e-05, + "loss": 0.9281, + "step": 17588 + }, + { + "epoch": 3.751919795221843, + "grad_norm": 0.2265364150793092, + "learning_rate": 8.391293023561043e-05, + "loss": 0.8912, + "step": 17589 + }, + { + "epoch": 3.7521331058020477, + "grad_norm": 0.2525929196429265, + "learning_rate": 8.390443063330908e-05, + "loss": 0.9177, + "step": 17590 + }, + { + "epoch": 3.7523464163822524, + "grad_norm": 0.28645780083197925, + "learning_rate": 8.389593098682906e-05, + "loss": 0.904, + "step": 17591 + }, + { + "epoch": 3.752559726962457, + "grad_norm": 0.29416547071525423, + "learning_rate": 8.388743129626651e-05, + "loss": 0.8744, + "step": 17592 + }, + { + "epoch": 3.7527730375426622, + "grad_norm": 0.3214078265498949, + "learning_rate": 8.387893156171759e-05, + "loss": 0.945, + "step": 17593 + }, + { + "epoch": 3.752986348122867, + "grad_norm": 0.3810183979815006, + "learning_rate": 8.38704317832785e-05, + "loss": 0.8987, + "step": 17594 + }, + { + "epoch": 3.7531996587030716, + "grad_norm": 0.28676269048298547, + "learning_rate": 8.386193196104542e-05, + "loss": 0.9138, + "step": 17595 + }, + { + "epoch": 3.7534129692832767, + "grad_norm": 0.24323223737124752, + "learning_rate": 8.38534320951145e-05, + "loss": 0.9049, + "step": 17596 + }, + { + "epoch": 3.7536262798634814, + "grad_norm": 0.23300022465262563, + "learning_rate": 8.384493218558193e-05, + "loss": 0.9454, + "step": 17597 + }, + { + "epoch": 3.753839590443686, + "grad_norm": 0.21728824069516797, + "learning_rate": 8.383643223254391e-05, + "loss": 0.9441, + "step": 17598 + }, + { + "epoch": 3.7540529010238908, + "grad_norm": 0.25521691267474733, + "learning_rate": 8.38279322360966e-05, + "loss": 0.9301, + "step": 17599 + }, + { + "epoch": 3.7542662116040955, + "grad_norm": 0.30648323295818214, + "learning_rate": 8.381943219633613e-05, + "loss": 0.9297, + "step": 17600 + }, + { + "epoch": 3.7544795221843, + "grad_norm": 0.24060958691473283, + "learning_rate": 8.381093211335876e-05, + "loss": 0.9379, + "step": 17601 + }, + { + "epoch": 3.7546928327645053, + "grad_norm": 0.2261962440803473, + "learning_rate": 8.380243198726063e-05, + "loss": 0.9371, + "step": 17602 + }, + { + "epoch": 3.75490614334471, + "grad_norm": 0.2292728201039924, + "learning_rate": 8.379393181813788e-05, + "loss": 0.9001, + "step": 17603 + }, + { + "epoch": 3.7551194539249146, + "grad_norm": 0.29508649774743184, + "learning_rate": 8.378543160608677e-05, + "loss": 0.8937, + "step": 17604 + }, + { + "epoch": 3.7553327645051193, + "grad_norm": 0.4471841605946028, + "learning_rate": 8.377693135120341e-05, + "loss": 0.9262, + "step": 17605 + }, + { + "epoch": 3.7555460750853245, + "grad_norm": 0.44768871875664584, + "learning_rate": 8.3768431053584e-05, + "loss": 0.8918, + "step": 17606 + }, + { + "epoch": 3.755759385665529, + "grad_norm": 0.24349007228335762, + "learning_rate": 8.375993071332474e-05, + "loss": 0.9057, + "step": 17607 + }, + { + "epoch": 3.755972696245734, + "grad_norm": 0.2356349886244443, + "learning_rate": 8.375143033052182e-05, + "loss": 0.9428, + "step": 17608 + }, + { + "epoch": 3.7561860068259385, + "grad_norm": 0.25267603513621567, + "learning_rate": 8.374292990527137e-05, + "loss": 0.9243, + "step": 17609 + }, + { + "epoch": 3.756399317406143, + "grad_norm": 0.34610490099133034, + "learning_rate": 8.373442943766962e-05, + "loss": 0.9416, + "step": 17610 + }, + { + "epoch": 3.756612627986348, + "grad_norm": 0.412709038525292, + "learning_rate": 8.372592892781277e-05, + "loss": 0.898, + "step": 17611 + }, + { + "epoch": 3.756825938566553, + "grad_norm": 0.3818033683344565, + "learning_rate": 8.371742837579692e-05, + "loss": 0.9012, + "step": 17612 + }, + { + "epoch": 3.7570392491467577, + "grad_norm": 0.29115544180569486, + "learning_rate": 8.370892778171833e-05, + "loss": 0.9022, + "step": 17613 + }, + { + "epoch": 3.7572525597269624, + "grad_norm": 0.1855478399267463, + "learning_rate": 8.370042714567318e-05, + "loss": 0.896, + "step": 17614 + }, + { + "epoch": 3.7574658703071675, + "grad_norm": 0.23990867509179808, + "learning_rate": 8.36919264677576e-05, + "loss": 0.9042, + "step": 17615 + }, + { + "epoch": 3.757679180887372, + "grad_norm": 0.31047097959461467, + "learning_rate": 8.368342574806783e-05, + "loss": 0.9214, + "step": 17616 + }, + { + "epoch": 3.757892491467577, + "grad_norm": 0.34831078202382076, + "learning_rate": 8.367492498670003e-05, + "loss": 0.9125, + "step": 17617 + }, + { + "epoch": 3.7581058020477816, + "grad_norm": 0.3486604467192358, + "learning_rate": 8.366642418375038e-05, + "loss": 0.9232, + "step": 17618 + }, + { + "epoch": 3.7583191126279862, + "grad_norm": 0.2528702556495638, + "learning_rate": 8.365792333931508e-05, + "loss": 0.9171, + "step": 17619 + }, + { + "epoch": 3.758532423208191, + "grad_norm": 0.19962723150568365, + "learning_rate": 8.364942245349033e-05, + "loss": 0.9406, + "step": 17620 + }, + { + "epoch": 3.758745733788396, + "grad_norm": 0.22454511902127486, + "learning_rate": 8.364092152637229e-05, + "loss": 0.8922, + "step": 17621 + }, + { + "epoch": 3.7589590443686007, + "grad_norm": 0.24880435943876977, + "learning_rate": 8.363242055805717e-05, + "loss": 0.8968, + "step": 17622 + }, + { + "epoch": 3.7591723549488054, + "grad_norm": 0.3596624039294406, + "learning_rate": 8.362391954864117e-05, + "loss": 0.936, + "step": 17623 + }, + { + "epoch": 3.75938566552901, + "grad_norm": 0.2746941239983171, + "learning_rate": 8.361541849822042e-05, + "loss": 0.8962, + "step": 17624 + }, + { + "epoch": 3.7595989761092152, + "grad_norm": 0.18658056790921257, + "learning_rate": 8.360691740689118e-05, + "loss": 0.9146, + "step": 17625 + }, + { + "epoch": 3.75981228668942, + "grad_norm": 0.2613807564061061, + "learning_rate": 8.359841627474961e-05, + "loss": 0.9145, + "step": 17626 + }, + { + "epoch": 3.7600255972696246, + "grad_norm": 0.3206889969058971, + "learning_rate": 8.358991510189189e-05, + "loss": 0.9014, + "step": 17627 + }, + { + "epoch": 3.7602389078498293, + "grad_norm": 0.3360760232741043, + "learning_rate": 8.35814138884142e-05, + "loss": 0.9239, + "step": 17628 + }, + { + "epoch": 3.760452218430034, + "grad_norm": 0.254921263182504, + "learning_rate": 8.357291263441276e-05, + "loss": 0.8822, + "step": 17629 + }, + { + "epoch": 3.7606655290102387, + "grad_norm": 0.20986998891348974, + "learning_rate": 8.356441133998376e-05, + "loss": 0.9182, + "step": 17630 + }, + { + "epoch": 3.760878839590444, + "grad_norm": 0.2275420226190855, + "learning_rate": 8.355591000522339e-05, + "loss": 0.8879, + "step": 17631 + }, + { + "epoch": 3.7610921501706485, + "grad_norm": 0.29684322570354155, + "learning_rate": 8.354740863022785e-05, + "loss": 0.8986, + "step": 17632 + }, + { + "epoch": 3.761305460750853, + "grad_norm": 0.324039278483345, + "learning_rate": 8.35389072150933e-05, + "loss": 0.8924, + "step": 17633 + }, + { + "epoch": 3.7615187713310583, + "grad_norm": 0.29290272765275144, + "learning_rate": 8.353040575991596e-05, + "loss": 0.9125, + "step": 17634 + }, + { + "epoch": 3.761732081911263, + "grad_norm": 0.21523954520665248, + "learning_rate": 8.352190426479204e-05, + "loss": 0.8982, + "step": 17635 + }, + { + "epoch": 3.7619453924914676, + "grad_norm": 0.21165430516111972, + "learning_rate": 8.351340272981769e-05, + "loss": 0.9277, + "step": 17636 + }, + { + "epoch": 3.7621587030716723, + "grad_norm": 0.21151458575036486, + "learning_rate": 8.350490115508913e-05, + "loss": 0.8895, + "step": 17637 + }, + { + "epoch": 3.762372013651877, + "grad_norm": 0.1746231440042262, + "learning_rate": 8.349639954070257e-05, + "loss": 0.9071, + "step": 17638 + }, + { + "epoch": 3.7625853242320817, + "grad_norm": 0.20939397714784028, + "learning_rate": 8.348789788675417e-05, + "loss": 0.8949, + "step": 17639 + }, + { + "epoch": 3.762798634812287, + "grad_norm": 0.2048157219348791, + "learning_rate": 8.347939619334016e-05, + "loss": 0.9093, + "step": 17640 + }, + { + "epoch": 3.7630119453924915, + "grad_norm": 0.2515855351160068, + "learning_rate": 8.347089446055671e-05, + "loss": 0.9057, + "step": 17641 + }, + { + "epoch": 3.763225255972696, + "grad_norm": 0.23106823571669569, + "learning_rate": 8.346239268850005e-05, + "loss": 0.9083, + "step": 17642 + }, + { + "epoch": 3.763438566552901, + "grad_norm": 0.190604496693113, + "learning_rate": 8.345389087726634e-05, + "loss": 0.8977, + "step": 17643 + }, + { + "epoch": 3.763651877133106, + "grad_norm": 0.23373046514325932, + "learning_rate": 8.344538902695182e-05, + "loss": 0.9263, + "step": 17644 + }, + { + "epoch": 3.7638651877133107, + "grad_norm": 0.2858398093056102, + "learning_rate": 8.343688713765266e-05, + "loss": 0.9214, + "step": 17645 + }, + { + "epoch": 3.7640784982935154, + "grad_norm": 0.2774275276689199, + "learning_rate": 8.342838520946505e-05, + "loss": 0.9018, + "step": 17646 + }, + { + "epoch": 3.76429180887372, + "grad_norm": 0.23850098077533627, + "learning_rate": 8.341988324248521e-05, + "loss": 0.9489, + "step": 17647 + }, + { + "epoch": 3.7645051194539247, + "grad_norm": 0.2123738714984515, + "learning_rate": 8.341138123680932e-05, + "loss": 0.9142, + "step": 17648 + }, + { + "epoch": 3.7647184300341294, + "grad_norm": 0.23741600745341715, + "learning_rate": 8.340287919253362e-05, + "loss": 0.9169, + "step": 17649 + }, + { + "epoch": 3.7649317406143346, + "grad_norm": 0.20939112759444334, + "learning_rate": 8.339437710975427e-05, + "loss": 0.9149, + "step": 17650 + }, + { + "epoch": 3.7651450511945392, + "grad_norm": 0.19089935363481275, + "learning_rate": 8.338587498856748e-05, + "loss": 0.9203, + "step": 17651 + }, + { + "epoch": 3.765358361774744, + "grad_norm": 0.21630582871044962, + "learning_rate": 8.337737282906947e-05, + "loss": 0.9469, + "step": 17652 + }, + { + "epoch": 3.765571672354949, + "grad_norm": 0.2190204942849017, + "learning_rate": 8.336887063135642e-05, + "loss": 0.9116, + "step": 17653 + }, + { + "epoch": 3.7657849829351537, + "grad_norm": 0.26887935019279524, + "learning_rate": 8.336036839552455e-05, + "loss": 0.9326, + "step": 17654 + }, + { + "epoch": 3.7659982935153584, + "grad_norm": 0.2994672530443058, + "learning_rate": 8.335186612167008e-05, + "loss": 0.9135, + "step": 17655 + }, + { + "epoch": 3.766211604095563, + "grad_norm": 0.22566841200910928, + "learning_rate": 8.334336380988915e-05, + "loss": 0.9121, + "step": 17656 + }, + { + "epoch": 3.766424914675768, + "grad_norm": 0.2197070636696584, + "learning_rate": 8.333486146027802e-05, + "loss": 0.9299, + "step": 17657 + }, + { + "epoch": 3.7666382252559725, + "grad_norm": 0.25876844223604684, + "learning_rate": 8.332635907293284e-05, + "loss": 0.9199, + "step": 17658 + }, + { + "epoch": 3.7668515358361776, + "grad_norm": 0.22933637840907067, + "learning_rate": 8.33178566479499e-05, + "loss": 0.8862, + "step": 17659 + }, + { + "epoch": 3.7670648464163823, + "grad_norm": 0.25061050713464184, + "learning_rate": 8.330935418542532e-05, + "loss": 0.9118, + "step": 17660 + }, + { + "epoch": 3.767278156996587, + "grad_norm": 0.2664739533032306, + "learning_rate": 8.330085168545537e-05, + "loss": 0.8938, + "step": 17661 + }, + { + "epoch": 3.7674914675767917, + "grad_norm": 0.1902424079023715, + "learning_rate": 8.329234914813623e-05, + "loss": 0.8928, + "step": 17662 + }, + { + "epoch": 3.767704778156997, + "grad_norm": 0.21419171118557315, + "learning_rate": 8.328384657356408e-05, + "loss": 0.9032, + "step": 17663 + }, + { + "epoch": 3.7679180887372015, + "grad_norm": 0.20692265469095997, + "learning_rate": 8.327534396183515e-05, + "loss": 0.9431, + "step": 17664 + }, + { + "epoch": 3.768131399317406, + "grad_norm": 0.19226858456093207, + "learning_rate": 8.326684131304566e-05, + "loss": 0.9401, + "step": 17665 + }, + { + "epoch": 3.768344709897611, + "grad_norm": 0.19302451618774508, + "learning_rate": 8.325833862729182e-05, + "loss": 0.9219, + "step": 17666 + }, + { + "epoch": 3.7685580204778155, + "grad_norm": 0.22897730736384442, + "learning_rate": 8.32498359046698e-05, + "loss": 0.933, + "step": 17667 + }, + { + "epoch": 3.76877133105802, + "grad_norm": 0.2857904876781296, + "learning_rate": 8.324133314527584e-05, + "loss": 0.9279, + "step": 17668 + }, + { + "epoch": 3.7689846416382253, + "grad_norm": 0.22535484959029278, + "learning_rate": 8.323283034920615e-05, + "loss": 0.9227, + "step": 17669 + }, + { + "epoch": 3.76919795221843, + "grad_norm": 0.23030331772304702, + "learning_rate": 8.32243275165569e-05, + "loss": 0.9049, + "step": 17670 + }, + { + "epoch": 3.7694112627986347, + "grad_norm": 0.1686722129042094, + "learning_rate": 8.321582464742437e-05, + "loss": 0.9062, + "step": 17671 + }, + { + "epoch": 3.76962457337884, + "grad_norm": 0.17894690588052048, + "learning_rate": 8.32073217419047e-05, + "loss": 0.9038, + "step": 17672 + }, + { + "epoch": 3.7698378839590445, + "grad_norm": 0.17476415679142662, + "learning_rate": 8.319881880009416e-05, + "loss": 0.9314, + "step": 17673 + }, + { + "epoch": 3.770051194539249, + "grad_norm": 0.19574410828191716, + "learning_rate": 8.319031582208891e-05, + "loss": 0.8933, + "step": 17674 + }, + { + "epoch": 3.770264505119454, + "grad_norm": 0.19523598093907377, + "learning_rate": 8.318181280798519e-05, + "loss": 0.9145, + "step": 17675 + }, + { + "epoch": 3.7704778156996586, + "grad_norm": 0.25872133814540793, + "learning_rate": 8.317330975787919e-05, + "loss": 0.9034, + "step": 17676 + }, + { + "epoch": 3.7706911262798632, + "grad_norm": 0.2039363807453844, + "learning_rate": 8.316480667186714e-05, + "loss": 0.9412, + "step": 17677 + }, + { + "epoch": 3.7709044368600684, + "grad_norm": 0.2297011421352189, + "learning_rate": 8.315630355004526e-05, + "loss": 0.9281, + "step": 17678 + }, + { + "epoch": 3.771117747440273, + "grad_norm": 0.2024296728026232, + "learning_rate": 8.314780039250975e-05, + "loss": 0.8978, + "step": 17679 + }, + { + "epoch": 3.7713310580204777, + "grad_norm": 0.2056336700682489, + "learning_rate": 8.313929719935683e-05, + "loss": 0.9136, + "step": 17680 + }, + { + "epoch": 3.7715443686006824, + "grad_norm": 0.2226701019110002, + "learning_rate": 8.313079397068271e-05, + "loss": 0.9171, + "step": 17681 + }, + { + "epoch": 3.7717576791808876, + "grad_norm": 0.21003531387216678, + "learning_rate": 8.312229070658358e-05, + "loss": 0.9195, + "step": 17682 + }, + { + "epoch": 3.7719709897610922, + "grad_norm": 0.2226789570000619, + "learning_rate": 8.311378740715573e-05, + "loss": 0.9281, + "step": 17683 + }, + { + "epoch": 3.772184300341297, + "grad_norm": 0.18004714283833945, + "learning_rate": 8.310528407249527e-05, + "loss": 0.9204, + "step": 17684 + }, + { + "epoch": 3.7723976109215016, + "grad_norm": 0.20089145231640987, + "learning_rate": 8.30967807026985e-05, + "loss": 0.9316, + "step": 17685 + }, + { + "epoch": 3.7726109215017063, + "grad_norm": 0.20238986021501396, + "learning_rate": 8.308827729786157e-05, + "loss": 0.9137, + "step": 17686 + }, + { + "epoch": 3.7728242320819114, + "grad_norm": 0.2032771329860881, + "learning_rate": 8.307977385808075e-05, + "loss": 0.8997, + "step": 17687 + }, + { + "epoch": 3.773037542662116, + "grad_norm": 0.17575756527691427, + "learning_rate": 8.307127038345225e-05, + "loss": 0.9315, + "step": 17688 + }, + { + "epoch": 3.773250853242321, + "grad_norm": 0.1920328297430542, + "learning_rate": 8.306276687407227e-05, + "loss": 0.9085, + "step": 17689 + }, + { + "epoch": 3.7734641638225255, + "grad_norm": 0.19743513451930983, + "learning_rate": 8.305426333003702e-05, + "loss": 0.9556, + "step": 17690 + }, + { + "epoch": 3.7736774744027306, + "grad_norm": 0.19237798699126452, + "learning_rate": 8.304575975144274e-05, + "loss": 0.8721, + "step": 17691 + }, + { + "epoch": 3.7738907849829353, + "grad_norm": 0.169751540534979, + "learning_rate": 8.30372561383856e-05, + "loss": 0.9287, + "step": 17692 + }, + { + "epoch": 3.77410409556314, + "grad_norm": 0.2369012317821829, + "learning_rate": 8.30287524909619e-05, + "loss": 0.9602, + "step": 17693 + }, + { + "epoch": 3.7743174061433447, + "grad_norm": 0.24224292612224635, + "learning_rate": 8.30202488092678e-05, + "loss": 0.9039, + "step": 17694 + }, + { + "epoch": 3.7745307167235493, + "grad_norm": 0.2341905295379988, + "learning_rate": 8.301174509339956e-05, + "loss": 0.9315, + "step": 17695 + }, + { + "epoch": 3.774744027303754, + "grad_norm": 0.17735202398143426, + "learning_rate": 8.300324134345334e-05, + "loss": 0.9102, + "step": 17696 + }, + { + "epoch": 3.774957337883959, + "grad_norm": 0.15886967413725342, + "learning_rate": 8.299473755952539e-05, + "loss": 0.9094, + "step": 17697 + }, + { + "epoch": 3.775170648464164, + "grad_norm": 0.23342876443423627, + "learning_rate": 8.298623374171193e-05, + "loss": 0.9457, + "step": 17698 + }, + { + "epoch": 3.7753839590443685, + "grad_norm": 0.20928838567938796, + "learning_rate": 8.297772989010922e-05, + "loss": 0.9319, + "step": 17699 + }, + { + "epoch": 3.7755972696245736, + "grad_norm": 0.19601372245012252, + "learning_rate": 8.296922600481344e-05, + "loss": 0.9073, + "step": 17700 + }, + { + "epoch": 3.7758105802047783, + "grad_norm": 0.2365937141223371, + "learning_rate": 8.296072208592079e-05, + "loss": 0.9843, + "step": 17701 + }, + { + "epoch": 3.776023890784983, + "grad_norm": 0.26853322104932087, + "learning_rate": 8.295221813352755e-05, + "loss": 0.9258, + "step": 17702 + }, + { + "epoch": 3.7762372013651877, + "grad_norm": 0.2032741864320317, + "learning_rate": 8.294371414772988e-05, + "loss": 0.9285, + "step": 17703 + }, + { + "epoch": 3.7764505119453924, + "grad_norm": 0.19175070038288602, + "learning_rate": 8.293521012862406e-05, + "loss": 0.903, + "step": 17704 + }, + { + "epoch": 3.776663822525597, + "grad_norm": 0.20633211989460157, + "learning_rate": 8.292670607630628e-05, + "loss": 0.938, + "step": 17705 + }, + { + "epoch": 3.776877133105802, + "grad_norm": 0.20930336123972704, + "learning_rate": 8.291820199087277e-05, + "loss": 0.925, + "step": 17706 + }, + { + "epoch": 3.777090443686007, + "grad_norm": 0.17529778852852942, + "learning_rate": 8.290969787241975e-05, + "loss": 0.9045, + "step": 17707 + }, + { + "epoch": 3.7773037542662116, + "grad_norm": 0.18525750742339195, + "learning_rate": 8.290119372104346e-05, + "loss": 0.8978, + "step": 17708 + }, + { + "epoch": 3.7775170648464163, + "grad_norm": 0.17150607177650612, + "learning_rate": 8.289268953684012e-05, + "loss": 0.8963, + "step": 17709 + }, + { + "epoch": 3.7777303754266214, + "grad_norm": 0.17991928162418067, + "learning_rate": 8.288418531990593e-05, + "loss": 0.9042, + "step": 17710 + }, + { + "epoch": 3.777943686006826, + "grad_norm": 0.2027126388259898, + "learning_rate": 8.287568107033717e-05, + "loss": 0.9162, + "step": 17711 + }, + { + "epoch": 3.7781569965870307, + "grad_norm": 0.20566555403239647, + "learning_rate": 8.286717678822997e-05, + "loss": 0.903, + "step": 17712 + }, + { + "epoch": 3.7783703071672354, + "grad_norm": 0.21849076086219535, + "learning_rate": 8.285867247368068e-05, + "loss": 0.922, + "step": 17713 + }, + { + "epoch": 3.77858361774744, + "grad_norm": 0.26012449022610096, + "learning_rate": 8.285016812678541e-05, + "loss": 0.8878, + "step": 17714 + }, + { + "epoch": 3.778796928327645, + "grad_norm": 0.2436255998488376, + "learning_rate": 8.284166374764047e-05, + "loss": 0.9355, + "step": 17715 + }, + { + "epoch": 3.77901023890785, + "grad_norm": 0.2527766633463667, + "learning_rate": 8.283315933634205e-05, + "loss": 0.8953, + "step": 17716 + }, + { + "epoch": 3.7792235494880546, + "grad_norm": 0.26599589719425504, + "learning_rate": 8.28246548929864e-05, + "loss": 0.9415, + "step": 17717 + }, + { + "epoch": 3.7794368600682593, + "grad_norm": 0.20524377805248992, + "learning_rate": 8.281615041766971e-05, + "loss": 0.899, + "step": 17718 + }, + { + "epoch": 3.7796501706484644, + "grad_norm": 0.21073607346486667, + "learning_rate": 8.280764591048824e-05, + "loss": 0.9201, + "step": 17719 + }, + { + "epoch": 3.779863481228669, + "grad_norm": 0.23161919859449637, + "learning_rate": 8.279914137153819e-05, + "loss": 0.9068, + "step": 17720 + }, + { + "epoch": 3.780076791808874, + "grad_norm": 0.20440498018569658, + "learning_rate": 8.279063680091584e-05, + "loss": 0.935, + "step": 17721 + }, + { + "epoch": 3.7802901023890785, + "grad_norm": 0.20855094525615583, + "learning_rate": 8.278213219871738e-05, + "loss": 0.9016, + "step": 17722 + }, + { + "epoch": 3.780503412969283, + "grad_norm": 0.23094430918921713, + "learning_rate": 8.277362756503905e-05, + "loss": 0.9478, + "step": 17723 + }, + { + "epoch": 3.780716723549488, + "grad_norm": 0.28498526672930313, + "learning_rate": 8.276512289997704e-05, + "loss": 0.8952, + "step": 17724 + }, + { + "epoch": 3.780930034129693, + "grad_norm": 0.28010693631163674, + "learning_rate": 8.275661820362766e-05, + "loss": 0.9013, + "step": 17725 + }, + { + "epoch": 3.7811433447098977, + "grad_norm": 0.234990110056351, + "learning_rate": 8.274811347608709e-05, + "loss": 0.9171, + "step": 17726 + }, + { + "epoch": 3.7813566552901023, + "grad_norm": 0.18873742989243616, + "learning_rate": 8.273960871745156e-05, + "loss": 0.9254, + "step": 17727 + }, + { + "epoch": 3.781569965870307, + "grad_norm": 0.22406239334467107, + "learning_rate": 8.273110392781733e-05, + "loss": 0.9358, + "step": 17728 + }, + { + "epoch": 3.781783276450512, + "grad_norm": 0.24280388271992184, + "learning_rate": 8.27225991072806e-05, + "loss": 0.9132, + "step": 17729 + }, + { + "epoch": 3.781996587030717, + "grad_norm": 0.24419010478039818, + "learning_rate": 8.27140942559376e-05, + "loss": 0.9111, + "step": 17730 + }, + { + "epoch": 3.7822098976109215, + "grad_norm": 0.21836455778125316, + "learning_rate": 8.27055893738846e-05, + "loss": 0.9102, + "step": 17731 + }, + { + "epoch": 3.782423208191126, + "grad_norm": 0.17240779408834683, + "learning_rate": 8.26970844612178e-05, + "loss": 0.9018, + "step": 17732 + }, + { + "epoch": 3.782636518771331, + "grad_norm": 0.18997275928041402, + "learning_rate": 8.268857951803346e-05, + "loss": 0.9172, + "step": 17733 + }, + { + "epoch": 3.7828498293515356, + "grad_norm": 0.27545363109387283, + "learning_rate": 8.268007454442779e-05, + "loss": 0.9, + "step": 17734 + }, + { + "epoch": 3.7830631399317407, + "grad_norm": 0.35340646886844906, + "learning_rate": 8.267156954049702e-05, + "loss": 0.9473, + "step": 17735 + }, + { + "epoch": 3.7832764505119454, + "grad_norm": 0.3173537938173453, + "learning_rate": 8.26630645063374e-05, + "loss": 0.8959, + "step": 17736 + }, + { + "epoch": 3.78348976109215, + "grad_norm": 0.23032406170443678, + "learning_rate": 8.265455944204515e-05, + "loss": 0.9156, + "step": 17737 + }, + { + "epoch": 3.783703071672355, + "grad_norm": 0.24662801551583605, + "learning_rate": 8.264605434771652e-05, + "loss": 0.8979, + "step": 17738 + }, + { + "epoch": 3.78391638225256, + "grad_norm": 0.20426351301282916, + "learning_rate": 8.263754922344778e-05, + "loss": 0.9179, + "step": 17739 + }, + { + "epoch": 3.7841296928327646, + "grad_norm": 0.2650261309338209, + "learning_rate": 8.262904406933507e-05, + "loss": 0.9456, + "step": 17740 + }, + { + "epoch": 3.7843430034129693, + "grad_norm": 0.36385782506297437, + "learning_rate": 8.262053888547471e-05, + "loss": 0.9291, + "step": 17741 + }, + { + "epoch": 3.784556313993174, + "grad_norm": 0.3402778204333826, + "learning_rate": 8.26120336719629e-05, + "loss": 0.9289, + "step": 17742 + }, + { + "epoch": 3.7847696245733786, + "grad_norm": 0.2775390363111514, + "learning_rate": 8.260352842889589e-05, + "loss": 0.9272, + "step": 17743 + }, + { + "epoch": 3.7849829351535837, + "grad_norm": 0.22423718455710587, + "learning_rate": 8.25950231563699e-05, + "loss": 0.885, + "step": 17744 + }, + { + "epoch": 3.7851962457337884, + "grad_norm": 0.1780653671863376, + "learning_rate": 8.258651785448122e-05, + "loss": 0.8942, + "step": 17745 + }, + { + "epoch": 3.785409556313993, + "grad_norm": 0.2499257386660475, + "learning_rate": 8.257801252332601e-05, + "loss": 0.9105, + "step": 17746 + }, + { + "epoch": 3.785622866894198, + "grad_norm": 0.2835421885060544, + "learning_rate": 8.256950716300056e-05, + "loss": 0.9343, + "step": 17747 + }, + { + "epoch": 3.785836177474403, + "grad_norm": 0.24685627239956692, + "learning_rate": 8.256100177360108e-05, + "loss": 0.8942, + "step": 17748 + }, + { + "epoch": 3.7860494880546076, + "grad_norm": 0.2309897016297758, + "learning_rate": 8.255249635522383e-05, + "loss": 0.9233, + "step": 17749 + }, + { + "epoch": 3.7862627986348123, + "grad_norm": 0.21353100094081232, + "learning_rate": 8.254399090796504e-05, + "loss": 0.8626, + "step": 17750 + }, + { + "epoch": 3.786476109215017, + "grad_norm": 0.20072222168116727, + "learning_rate": 8.253548543192095e-05, + "loss": 0.9308, + "step": 17751 + }, + { + "epoch": 3.7866894197952217, + "grad_norm": 0.28338939047150485, + "learning_rate": 8.252697992718779e-05, + "loss": 0.9031, + "step": 17752 + }, + { + "epoch": 3.7869027303754264, + "grad_norm": 0.36039688730554953, + "learning_rate": 8.251847439386182e-05, + "loss": 0.8953, + "step": 17753 + }, + { + "epoch": 3.7871160409556315, + "grad_norm": 0.32392578149082246, + "learning_rate": 8.250996883203926e-05, + "loss": 0.9214, + "step": 17754 + }, + { + "epoch": 3.787329351535836, + "grad_norm": 0.2564100145469822, + "learning_rate": 8.250146324181638e-05, + "loss": 0.9227, + "step": 17755 + }, + { + "epoch": 3.787542662116041, + "grad_norm": 0.201497537660611, + "learning_rate": 8.249295762328939e-05, + "loss": 0.9328, + "step": 17756 + }, + { + "epoch": 3.787755972696246, + "grad_norm": 0.17745207475813757, + "learning_rate": 8.248445197655455e-05, + "loss": 0.9505, + "step": 17757 + }, + { + "epoch": 3.7879692832764507, + "grad_norm": 0.25319155867675014, + "learning_rate": 8.247594630170808e-05, + "loss": 0.9059, + "step": 17758 + }, + { + "epoch": 3.7881825938566553, + "grad_norm": 0.2597454491750508, + "learning_rate": 8.246744059884623e-05, + "loss": 0.9386, + "step": 17759 + }, + { + "epoch": 3.78839590443686, + "grad_norm": 0.24670537470723813, + "learning_rate": 8.245893486806525e-05, + "loss": 0.9275, + "step": 17760 + }, + { + "epoch": 3.7886092150170647, + "grad_norm": 0.2695343930649752, + "learning_rate": 8.245042910946142e-05, + "loss": 0.9182, + "step": 17761 + }, + { + "epoch": 3.7888225255972694, + "grad_norm": 0.23727520854070994, + "learning_rate": 8.244192332313092e-05, + "loss": 0.9123, + "step": 17762 + }, + { + "epoch": 3.7890358361774745, + "grad_norm": 0.2563689804020803, + "learning_rate": 8.243341750917e-05, + "loss": 0.9111, + "step": 17763 + }, + { + "epoch": 3.789249146757679, + "grad_norm": 0.3163662102211356, + "learning_rate": 8.242491166767493e-05, + "loss": 0.9181, + "step": 17764 + }, + { + "epoch": 3.789462457337884, + "grad_norm": 0.24704330783853518, + "learning_rate": 8.241640579874195e-05, + "loss": 0.9025, + "step": 17765 + }, + { + "epoch": 3.7896757679180886, + "grad_norm": 0.24406176776502825, + "learning_rate": 8.240789990246728e-05, + "loss": 0.9036, + "step": 17766 + }, + { + "epoch": 3.7898890784982937, + "grad_norm": 0.23234583501577116, + "learning_rate": 8.239939397894723e-05, + "loss": 0.8913, + "step": 17767 + }, + { + "epoch": 3.7901023890784984, + "grad_norm": 0.2404488049472226, + "learning_rate": 8.239088802827794e-05, + "loss": 0.9064, + "step": 17768 + }, + { + "epoch": 3.790315699658703, + "grad_norm": 0.2182057028559241, + "learning_rate": 8.238238205055574e-05, + "loss": 0.9215, + "step": 17769 + }, + { + "epoch": 3.7905290102389078, + "grad_norm": 0.23941807689267552, + "learning_rate": 8.237387604587682e-05, + "loss": 0.9379, + "step": 17770 + }, + { + "epoch": 3.7907423208191124, + "grad_norm": 0.20317899177985807, + "learning_rate": 8.236537001433748e-05, + "loss": 0.9262, + "step": 17771 + }, + { + "epoch": 3.7909556313993176, + "grad_norm": 0.18908020788652333, + "learning_rate": 8.235686395603392e-05, + "loss": 0.9254, + "step": 17772 + }, + { + "epoch": 3.7911689419795223, + "grad_norm": 0.27714235072411403, + "learning_rate": 8.234835787106245e-05, + "loss": 0.9559, + "step": 17773 + }, + { + "epoch": 3.791382252559727, + "grad_norm": 0.2753891255290502, + "learning_rate": 8.233985175951921e-05, + "loss": 0.9558, + "step": 17774 + }, + { + "epoch": 3.7915955631399316, + "grad_norm": 0.21813428660090034, + "learning_rate": 8.233134562150054e-05, + "loss": 0.9024, + "step": 17775 + }, + { + "epoch": 3.7918088737201368, + "grad_norm": 0.19510696061118635, + "learning_rate": 8.232283945710263e-05, + "loss": 0.908, + "step": 17776 + }, + { + "epoch": 3.7920221843003414, + "grad_norm": 0.17574690523117528, + "learning_rate": 8.23143332664218e-05, + "loss": 0.9088, + "step": 17777 + }, + { + "epoch": 3.792235494880546, + "grad_norm": 0.1959714729449513, + "learning_rate": 8.230582704955423e-05, + "loss": 0.9201, + "step": 17778 + }, + { + "epoch": 3.792448805460751, + "grad_norm": 0.2152579742757292, + "learning_rate": 8.229732080659619e-05, + "loss": 0.9161, + "step": 17779 + }, + { + "epoch": 3.7926621160409555, + "grad_norm": 0.2580628229496799, + "learning_rate": 8.22888145376439e-05, + "loss": 0.9009, + "step": 17780 + }, + { + "epoch": 3.79287542662116, + "grad_norm": 0.26034043388486, + "learning_rate": 8.228030824279365e-05, + "loss": 0.9044, + "step": 17781 + }, + { + "epoch": 3.7930887372013653, + "grad_norm": 0.22384302847225734, + "learning_rate": 8.227180192214166e-05, + "loss": 0.9189, + "step": 17782 + }, + { + "epoch": 3.79330204778157, + "grad_norm": 0.2027630321129204, + "learning_rate": 8.226329557578421e-05, + "loss": 0.9205, + "step": 17783 + }, + { + "epoch": 3.7935153583617747, + "grad_norm": 0.26189844565469816, + "learning_rate": 8.225478920381755e-05, + "loss": 0.913, + "step": 17784 + }, + { + "epoch": 3.79372866894198, + "grad_norm": 0.23004113887098987, + "learning_rate": 8.22462828063379e-05, + "loss": 0.8769, + "step": 17785 + }, + { + "epoch": 3.7939419795221845, + "grad_norm": 0.1918964159886764, + "learning_rate": 8.22377763834415e-05, + "loss": 0.9515, + "step": 17786 + }, + { + "epoch": 3.794155290102389, + "grad_norm": 0.2572027380663258, + "learning_rate": 8.222926993522465e-05, + "loss": 0.9476, + "step": 17787 + }, + { + "epoch": 3.794368600682594, + "grad_norm": 0.22192500477071575, + "learning_rate": 8.222076346178356e-05, + "loss": 0.919, + "step": 17788 + }, + { + "epoch": 3.7945819112627985, + "grad_norm": 0.22405212949773876, + "learning_rate": 8.221225696321449e-05, + "loss": 0.9232, + "step": 17789 + }, + { + "epoch": 3.794795221843003, + "grad_norm": 0.2671054577885077, + "learning_rate": 8.220375043961372e-05, + "loss": 0.9435, + "step": 17790 + }, + { + "epoch": 3.7950085324232083, + "grad_norm": 0.2102866181084317, + "learning_rate": 8.219524389107745e-05, + "loss": 0.9055, + "step": 17791 + }, + { + "epoch": 3.795221843003413, + "grad_norm": 0.21853544486475743, + "learning_rate": 8.218673731770196e-05, + "loss": 0.91, + "step": 17792 + }, + { + "epoch": 3.7954351535836177, + "grad_norm": 0.19486812996075192, + "learning_rate": 8.21782307195835e-05, + "loss": 0.8995, + "step": 17793 + }, + { + "epoch": 3.7956484641638224, + "grad_norm": 0.20142694023341443, + "learning_rate": 8.216972409681832e-05, + "loss": 0.9124, + "step": 17794 + }, + { + "epoch": 3.7958617747440275, + "grad_norm": 0.21392689111251226, + "learning_rate": 8.21612174495027e-05, + "loss": 0.9398, + "step": 17795 + }, + { + "epoch": 3.796075085324232, + "grad_norm": 0.1975207882621033, + "learning_rate": 8.215271077773286e-05, + "loss": 0.9308, + "step": 17796 + }, + { + "epoch": 3.796288395904437, + "grad_norm": 0.16967665116447522, + "learning_rate": 8.214420408160505e-05, + "loss": 0.8931, + "step": 17797 + }, + { + "epoch": 3.7965017064846416, + "grad_norm": 0.20910139677314177, + "learning_rate": 8.213569736121554e-05, + "loss": 0.9108, + "step": 17798 + }, + { + "epoch": 3.7967150170648463, + "grad_norm": 0.18532430045458395, + "learning_rate": 8.212719061666058e-05, + "loss": 0.9139, + "step": 17799 + }, + { + "epoch": 3.796928327645051, + "grad_norm": 0.2112178820324542, + "learning_rate": 8.211868384803641e-05, + "loss": 0.9123, + "step": 17800 + }, + { + "epoch": 3.797141638225256, + "grad_norm": 0.20735112880448353, + "learning_rate": 8.211017705543932e-05, + "loss": 0.8944, + "step": 17801 + }, + { + "epoch": 3.7973549488054608, + "grad_norm": 0.1895072128661845, + "learning_rate": 8.210167023896553e-05, + "loss": 0.8955, + "step": 17802 + }, + { + "epoch": 3.7975682593856654, + "grad_norm": 0.1996614873722625, + "learning_rate": 8.20931633987113e-05, + "loss": 0.9421, + "step": 17803 + }, + { + "epoch": 3.7977815699658706, + "grad_norm": 0.23672097097034567, + "learning_rate": 8.20846565347729e-05, + "loss": 0.9306, + "step": 17804 + }, + { + "epoch": 3.7979948805460753, + "grad_norm": 0.1757068813697972, + "learning_rate": 8.207614964724656e-05, + "loss": 0.9045, + "step": 17805 + }, + { + "epoch": 3.79820819112628, + "grad_norm": 0.20720329093910161, + "learning_rate": 8.206764273622858e-05, + "loss": 0.9355, + "step": 17806 + }, + { + "epoch": 3.7984215017064846, + "grad_norm": 0.20970481684504896, + "learning_rate": 8.205913580181515e-05, + "loss": 0.9273, + "step": 17807 + }, + { + "epoch": 3.7986348122866893, + "grad_norm": 0.21088305765971296, + "learning_rate": 8.205062884410259e-05, + "loss": 0.8898, + "step": 17808 + }, + { + "epoch": 3.798848122866894, + "grad_norm": 0.19610129999626144, + "learning_rate": 8.204212186318714e-05, + "loss": 0.9092, + "step": 17809 + }, + { + "epoch": 3.799061433447099, + "grad_norm": 0.17110370690064522, + "learning_rate": 8.203361485916501e-05, + "loss": 0.9048, + "step": 17810 + }, + { + "epoch": 3.799274744027304, + "grad_norm": 0.22834866041005303, + "learning_rate": 8.202510783213253e-05, + "loss": 0.9152, + "step": 17811 + }, + { + "epoch": 3.7994880546075085, + "grad_norm": 0.2251458036707767, + "learning_rate": 8.20166007821859e-05, + "loss": 0.8944, + "step": 17812 + }, + { + "epoch": 3.799701365187713, + "grad_norm": 0.17840793289860404, + "learning_rate": 8.20080937094214e-05, + "loss": 0.908, + "step": 17813 + }, + { + "epoch": 3.7999146757679183, + "grad_norm": 0.16644981364173003, + "learning_rate": 8.19995866139353e-05, + "loss": 0.8973, + "step": 17814 + }, + { + "epoch": 3.800127986348123, + "grad_norm": 0.20505370036756085, + "learning_rate": 8.199107949582383e-05, + "loss": 0.9014, + "step": 17815 + }, + { + "epoch": 3.8003412969283277, + "grad_norm": 0.1706091922007949, + "learning_rate": 8.198257235518326e-05, + "loss": 0.8801, + "step": 17816 + }, + { + "epoch": 3.8005546075085324, + "grad_norm": 0.17020795225810104, + "learning_rate": 8.197406519210987e-05, + "loss": 0.9189, + "step": 17817 + }, + { + "epoch": 3.800767918088737, + "grad_norm": 0.18553803146749984, + "learning_rate": 8.196555800669988e-05, + "loss": 0.9338, + "step": 17818 + }, + { + "epoch": 3.8009812286689417, + "grad_norm": 0.1976658430604271, + "learning_rate": 8.195705079904957e-05, + "loss": 0.9183, + "step": 17819 + }, + { + "epoch": 3.801194539249147, + "grad_norm": 0.19058667907935142, + "learning_rate": 8.194854356925521e-05, + "loss": 0.8796, + "step": 17820 + }, + { + "epoch": 3.8014078498293515, + "grad_norm": 0.20417628340507185, + "learning_rate": 8.194003631741304e-05, + "loss": 0.909, + "step": 17821 + }, + { + "epoch": 3.801621160409556, + "grad_norm": 0.2042912761396208, + "learning_rate": 8.193152904361931e-05, + "loss": 0.9297, + "step": 17822 + }, + { + "epoch": 3.8018344709897613, + "grad_norm": 0.17746993050357143, + "learning_rate": 8.192302174797031e-05, + "loss": 0.9346, + "step": 17823 + }, + { + "epoch": 3.802047781569966, + "grad_norm": 0.2142446726154715, + "learning_rate": 8.191451443056229e-05, + "loss": 0.8974, + "step": 17824 + }, + { + "epoch": 3.8022610921501707, + "grad_norm": 0.20645529180211522, + "learning_rate": 8.190600709149148e-05, + "loss": 0.916, + "step": 17825 + }, + { + "epoch": 3.8024744027303754, + "grad_norm": 0.21630249314002292, + "learning_rate": 8.189749973085422e-05, + "loss": 0.9127, + "step": 17826 + }, + { + "epoch": 3.80268771331058, + "grad_norm": 0.2734601031213688, + "learning_rate": 8.188899234874669e-05, + "loss": 0.9525, + "step": 17827 + }, + { + "epoch": 3.8029010238907848, + "grad_norm": 0.26770417435972427, + "learning_rate": 8.188048494526516e-05, + "loss": 0.8897, + "step": 17828 + }, + { + "epoch": 3.80311433447099, + "grad_norm": 0.17681664993966326, + "learning_rate": 8.187197752050595e-05, + "loss": 0.9425, + "step": 17829 + }, + { + "epoch": 3.8033276450511946, + "grad_norm": 0.2440654220375448, + "learning_rate": 8.186347007456526e-05, + "loss": 0.9257, + "step": 17830 + }, + { + "epoch": 3.8035409556313993, + "grad_norm": 0.20625436274108866, + "learning_rate": 8.185496260753937e-05, + "loss": 0.9042, + "step": 17831 + }, + { + "epoch": 3.803754266211604, + "grad_norm": 0.21305905711678383, + "learning_rate": 8.184645511952456e-05, + "loss": 0.9166, + "step": 17832 + }, + { + "epoch": 3.803967576791809, + "grad_norm": 0.22107674833899033, + "learning_rate": 8.183794761061709e-05, + "loss": 0.9541, + "step": 17833 + }, + { + "epoch": 3.8041808873720138, + "grad_norm": 0.2634395882032644, + "learning_rate": 8.182944008091319e-05, + "loss": 0.9164, + "step": 17834 + }, + { + "epoch": 3.8043941979522184, + "grad_norm": 0.2656070393486541, + "learning_rate": 8.182093253050914e-05, + "loss": 0.9444, + "step": 17835 + }, + { + "epoch": 3.804607508532423, + "grad_norm": 0.227318867049357, + "learning_rate": 8.181242495950123e-05, + "loss": 0.8974, + "step": 17836 + }, + { + "epoch": 3.804820819112628, + "grad_norm": 0.23132462185931082, + "learning_rate": 8.180391736798568e-05, + "loss": 0.89, + "step": 17837 + }, + { + "epoch": 3.8050341296928325, + "grad_norm": 0.22751269282408398, + "learning_rate": 8.17954097560588e-05, + "loss": 0.9148, + "step": 17838 + }, + { + "epoch": 3.8052474402730376, + "grad_norm": 0.23194539584195062, + "learning_rate": 8.17869021238168e-05, + "loss": 0.9203, + "step": 17839 + }, + { + "epoch": 3.8054607508532423, + "grad_norm": 0.29929114358301795, + "learning_rate": 8.177839447135601e-05, + "loss": 0.9029, + "step": 17840 + }, + { + "epoch": 3.805674061433447, + "grad_norm": 0.2752747833889816, + "learning_rate": 8.176988679877261e-05, + "loss": 0.9072, + "step": 17841 + }, + { + "epoch": 3.805887372013652, + "grad_norm": 0.2098604891564562, + "learning_rate": 8.176137910616294e-05, + "loss": 0.9224, + "step": 17842 + }, + { + "epoch": 3.806100682593857, + "grad_norm": 0.23027530069889693, + "learning_rate": 8.175287139362322e-05, + "loss": 0.9034, + "step": 17843 + }, + { + "epoch": 3.8063139931740615, + "grad_norm": 0.2736468616559874, + "learning_rate": 8.174436366124975e-05, + "loss": 0.9143, + "step": 17844 + }, + { + "epoch": 3.806527303754266, + "grad_norm": 0.36024286508462994, + "learning_rate": 8.173585590913877e-05, + "loss": 0.923, + "step": 17845 + }, + { + "epoch": 3.806740614334471, + "grad_norm": 0.37537810283109463, + "learning_rate": 8.172734813738657e-05, + "loss": 0.9191, + "step": 17846 + }, + { + "epoch": 3.8069539249146755, + "grad_norm": 0.28381629595383195, + "learning_rate": 8.171884034608934e-05, + "loss": 0.9059, + "step": 17847 + }, + { + "epoch": 3.8071672354948807, + "grad_norm": 0.2133844810335149, + "learning_rate": 8.171033253534343e-05, + "loss": 0.9248, + "step": 17848 + }, + { + "epoch": 3.8073805460750854, + "grad_norm": 0.2270922447537374, + "learning_rate": 8.170182470524509e-05, + "loss": 0.8864, + "step": 17849 + }, + { + "epoch": 3.80759385665529, + "grad_norm": 0.2259763753745459, + "learning_rate": 8.169331685589058e-05, + "loss": 0.9153, + "step": 17850 + }, + { + "epoch": 3.8078071672354947, + "grad_norm": 0.33447686847564834, + "learning_rate": 8.168480898737615e-05, + "loss": 0.9392, + "step": 17851 + }, + { + "epoch": 3.8080204778157, + "grad_norm": 0.3549295219461371, + "learning_rate": 8.167630109979808e-05, + "loss": 0.8952, + "step": 17852 + }, + { + "epoch": 3.8082337883959045, + "grad_norm": 0.29200173309175814, + "learning_rate": 8.166779319325263e-05, + "loss": 0.9128, + "step": 17853 + }, + { + "epoch": 3.8084470989761092, + "grad_norm": 0.19490890621455662, + "learning_rate": 8.165928526783609e-05, + "loss": 0.9026, + "step": 17854 + }, + { + "epoch": 3.808660409556314, + "grad_norm": 0.2491407542260762, + "learning_rate": 8.165077732364468e-05, + "loss": 0.9378, + "step": 17855 + }, + { + "epoch": 3.8088737201365186, + "grad_norm": 0.3615711830577022, + "learning_rate": 8.164226936077472e-05, + "loss": 0.9082, + "step": 17856 + }, + { + "epoch": 3.8090870307167233, + "grad_norm": 0.9536496348928898, + "learning_rate": 8.163376137932246e-05, + "loss": 0.9844, + "step": 17857 + }, + { + "epoch": 3.8093003412969284, + "grad_norm": 0.29233624581638035, + "learning_rate": 8.162525337938414e-05, + "loss": 0.9184, + "step": 17858 + }, + { + "epoch": 3.809513651877133, + "grad_norm": 0.18432001054495747, + "learning_rate": 8.161674536105605e-05, + "loss": 0.9192, + "step": 17859 + }, + { + "epoch": 3.8097269624573378, + "grad_norm": 0.2071206942190806, + "learning_rate": 8.160823732443447e-05, + "loss": 0.9226, + "step": 17860 + }, + { + "epoch": 3.809940273037543, + "grad_norm": 0.26579400811336507, + "learning_rate": 8.159972926961566e-05, + "loss": 0.8986, + "step": 17861 + }, + { + "epoch": 3.8101535836177476, + "grad_norm": 0.2417927742522908, + "learning_rate": 8.15912211966959e-05, + "loss": 0.9214, + "step": 17862 + }, + { + "epoch": 3.8103668941979523, + "grad_norm": 0.196204909812909, + "learning_rate": 8.15827131057714e-05, + "loss": 0.8924, + "step": 17863 + }, + { + "epoch": 3.810580204778157, + "grad_norm": 0.19721664801791297, + "learning_rate": 8.157420499693852e-05, + "loss": 0.8932, + "step": 17864 + }, + { + "epoch": 3.8107935153583616, + "grad_norm": 0.2805506817169414, + "learning_rate": 8.156569687029345e-05, + "loss": 0.9421, + "step": 17865 + }, + { + "epoch": 3.8110068259385663, + "grad_norm": 0.29899116264220416, + "learning_rate": 8.155718872593252e-05, + "loss": 0.927, + "step": 17866 + }, + { + "epoch": 3.8112201365187715, + "grad_norm": 0.27375788560581643, + "learning_rate": 8.154868056395195e-05, + "loss": 0.9123, + "step": 17867 + }, + { + "epoch": 3.811433447098976, + "grad_norm": 0.1826134415535833, + "learning_rate": 8.154017238444807e-05, + "loss": 0.9075, + "step": 17868 + }, + { + "epoch": 3.811646757679181, + "grad_norm": 0.1975460951327243, + "learning_rate": 8.153166418751708e-05, + "loss": 0.921, + "step": 17869 + }, + { + "epoch": 3.8118600682593855, + "grad_norm": 0.3253164276446281, + "learning_rate": 8.152315597325529e-05, + "loss": 0.9103, + "step": 17870 + }, + { + "epoch": 3.8120733788395906, + "grad_norm": 0.320120410312672, + "learning_rate": 8.151464774175899e-05, + "loss": 0.8919, + "step": 17871 + }, + { + "epoch": 3.8122866894197953, + "grad_norm": 0.2565334845386494, + "learning_rate": 8.15061394931244e-05, + "loss": 0.9302, + "step": 17872 + }, + { + "epoch": 3.8125, + "grad_norm": 0.1980415636409031, + "learning_rate": 8.149763122744782e-05, + "loss": 0.9074, + "step": 17873 + }, + { + "epoch": 3.8127133105802047, + "grad_norm": 0.22972879817720412, + "learning_rate": 8.148912294482552e-05, + "loss": 0.9316, + "step": 17874 + }, + { + "epoch": 3.8129266211604094, + "grad_norm": 0.20868780182198585, + "learning_rate": 8.148061464535376e-05, + "loss": 0.8989, + "step": 17875 + }, + { + "epoch": 3.8131399317406145, + "grad_norm": 0.20779034962765944, + "learning_rate": 8.147210632912883e-05, + "loss": 0.9241, + "step": 17876 + }, + { + "epoch": 3.813353242320819, + "grad_norm": 0.18897755492696258, + "learning_rate": 8.146359799624698e-05, + "loss": 0.8819, + "step": 17877 + }, + { + "epoch": 3.813566552901024, + "grad_norm": 0.20727879461421486, + "learning_rate": 8.145508964680452e-05, + "loss": 0.9178, + "step": 17878 + }, + { + "epoch": 3.8137798634812285, + "grad_norm": 0.24284284671759546, + "learning_rate": 8.144658128089769e-05, + "loss": 0.9354, + "step": 17879 + }, + { + "epoch": 3.8139931740614337, + "grad_norm": 0.2247664946924056, + "learning_rate": 8.143807289862275e-05, + "loss": 0.9125, + "step": 17880 + }, + { + "epoch": 3.8142064846416384, + "grad_norm": 0.24820102635518004, + "learning_rate": 8.142956450007599e-05, + "loss": 0.9038, + "step": 17881 + }, + { + "epoch": 3.814419795221843, + "grad_norm": 0.17592723754241477, + "learning_rate": 8.142105608535369e-05, + "loss": 0.9647, + "step": 17882 + }, + { + "epoch": 3.8146331058020477, + "grad_norm": 0.23807775383590424, + "learning_rate": 8.14125476545521e-05, + "loss": 0.8944, + "step": 17883 + }, + { + "epoch": 3.8148464163822524, + "grad_norm": 0.2788190816283778, + "learning_rate": 8.140403920776758e-05, + "loss": 0.9112, + "step": 17884 + }, + { + "epoch": 3.815059726962457, + "grad_norm": 0.28621682218135197, + "learning_rate": 8.139553074509626e-05, + "loss": 0.9014, + "step": 17885 + }, + { + "epoch": 3.8152730375426622, + "grad_norm": 0.2619147045004941, + "learning_rate": 8.138702226663451e-05, + "loss": 0.9263, + "step": 17886 + }, + { + "epoch": 3.815486348122867, + "grad_norm": 0.223275624909368, + "learning_rate": 8.137851377247856e-05, + "loss": 0.9302, + "step": 17887 + }, + { + "epoch": 3.8156996587030716, + "grad_norm": 0.16495898386651267, + "learning_rate": 8.137000526272473e-05, + "loss": 0.9071, + "step": 17888 + }, + { + "epoch": 3.8159129692832767, + "grad_norm": 0.2531093292665552, + "learning_rate": 8.136149673746926e-05, + "loss": 0.8955, + "step": 17889 + }, + { + "epoch": 3.8161262798634814, + "grad_norm": 0.29235166328944795, + "learning_rate": 8.135298819680846e-05, + "loss": 0.9338, + "step": 17890 + }, + { + "epoch": 3.816339590443686, + "grad_norm": 0.24185337514172167, + "learning_rate": 8.134447964083852e-05, + "loss": 0.9333, + "step": 17891 + }, + { + "epoch": 3.8165529010238908, + "grad_norm": 0.9888797226541776, + "learning_rate": 8.13359710696558e-05, + "loss": 0.9877, + "step": 17892 + }, + { + "epoch": 3.8167662116040955, + "grad_norm": 0.21136438888855177, + "learning_rate": 8.132746248335655e-05, + "loss": 0.9432, + "step": 17893 + }, + { + "epoch": 3.8169795221843, + "grad_norm": 0.22708419534371777, + "learning_rate": 8.131895388203703e-05, + "loss": 0.9152, + "step": 17894 + }, + { + "epoch": 3.8171928327645053, + "grad_norm": 0.19990876476852176, + "learning_rate": 8.131044526579351e-05, + "loss": 0.905, + "step": 17895 + }, + { + "epoch": 3.81740614334471, + "grad_norm": 0.1911722797421461, + "learning_rate": 8.130193663472231e-05, + "loss": 0.9409, + "step": 17896 + }, + { + "epoch": 3.8176194539249146, + "grad_norm": 0.1981439093526603, + "learning_rate": 8.129342798891964e-05, + "loss": 0.8979, + "step": 17897 + }, + { + "epoch": 3.8178327645051193, + "grad_norm": 0.1665193948790254, + "learning_rate": 8.128491932848186e-05, + "loss": 0.9237, + "step": 17898 + }, + { + "epoch": 3.8180460750853245, + "grad_norm": 0.17454194900823644, + "learning_rate": 8.127641065350514e-05, + "loss": 0.9004, + "step": 17899 + }, + { + "epoch": 3.818259385665529, + "grad_norm": 0.19737117864758624, + "learning_rate": 8.126790196408584e-05, + "loss": 0.9275, + "step": 17900 + }, + { + "epoch": 3.818472696245734, + "grad_norm": 0.2096634700406153, + "learning_rate": 8.125939326032023e-05, + "loss": 0.9309, + "step": 17901 + }, + { + "epoch": 3.8186860068259385, + "grad_norm": 0.23384250047365307, + "learning_rate": 8.125088454230452e-05, + "loss": 0.9189, + "step": 17902 + }, + { + "epoch": 3.818899317406143, + "grad_norm": 0.262293524648673, + "learning_rate": 8.124237581013505e-05, + "loss": 0.9358, + "step": 17903 + }, + { + "epoch": 3.819112627986348, + "grad_norm": 0.20657793640710664, + "learning_rate": 8.12338670639081e-05, + "loss": 0.9337, + "step": 17904 + }, + { + "epoch": 3.819325938566553, + "grad_norm": 0.2273213172224784, + "learning_rate": 8.122535830371987e-05, + "loss": 0.9022, + "step": 17905 + }, + { + "epoch": 3.8195392491467577, + "grad_norm": 0.3175845498729179, + "learning_rate": 8.121684952966674e-05, + "loss": 0.9068, + "step": 17906 + }, + { + "epoch": 3.8197525597269624, + "grad_norm": 0.3134313507665577, + "learning_rate": 8.120834074184492e-05, + "loss": 0.9208, + "step": 17907 + }, + { + "epoch": 3.8199658703071675, + "grad_norm": 0.2554691065082809, + "learning_rate": 8.119983194035071e-05, + "loss": 0.9177, + "step": 17908 + }, + { + "epoch": 3.820179180887372, + "grad_norm": 0.2727859085020103, + "learning_rate": 8.119132312528036e-05, + "loss": 0.8949, + "step": 17909 + }, + { + "epoch": 3.820392491467577, + "grad_norm": 0.3025912506857549, + "learning_rate": 8.118281429673019e-05, + "loss": 0.9406, + "step": 17910 + }, + { + "epoch": 3.8206058020477816, + "grad_norm": 0.32805170932264177, + "learning_rate": 8.117430545479644e-05, + "loss": 0.9643, + "step": 17911 + }, + { + "epoch": 3.8208191126279862, + "grad_norm": 0.31159643782855106, + "learning_rate": 8.116579659957543e-05, + "loss": 0.8943, + "step": 17912 + }, + { + "epoch": 3.821032423208191, + "grad_norm": 0.2188187717060116, + "learning_rate": 8.115728773116338e-05, + "loss": 0.9183, + "step": 17913 + }, + { + "epoch": 3.821245733788396, + "grad_norm": 0.19830755576706857, + "learning_rate": 8.114877884965663e-05, + "loss": 0.8861, + "step": 17914 + }, + { + "epoch": 3.8214590443686007, + "grad_norm": 0.22800418009534698, + "learning_rate": 8.11402699551514e-05, + "loss": 0.9128, + "step": 17915 + }, + { + "epoch": 3.8216723549488054, + "grad_norm": 0.2570223225954155, + "learning_rate": 8.113176104774402e-05, + "loss": 0.9033, + "step": 17916 + }, + { + "epoch": 3.82188566552901, + "grad_norm": 0.2916630649584671, + "learning_rate": 8.112325212753073e-05, + "loss": 0.9176, + "step": 17917 + }, + { + "epoch": 3.8220989761092152, + "grad_norm": 0.23801322088784518, + "learning_rate": 8.111474319460785e-05, + "loss": 0.9248, + "step": 17918 + }, + { + "epoch": 3.82231228668942, + "grad_norm": 0.18498856796286106, + "learning_rate": 8.110623424907159e-05, + "loss": 0.9429, + "step": 17919 + }, + { + "epoch": 3.8225255972696246, + "grad_norm": 0.2974830451439502, + "learning_rate": 8.10977252910183e-05, + "loss": 0.8786, + "step": 17920 + }, + { + "epoch": 3.8227389078498293, + "grad_norm": 0.243427924540797, + "learning_rate": 8.108921632054421e-05, + "loss": 0.9243, + "step": 17921 + }, + { + "epoch": 3.822952218430034, + "grad_norm": 0.230434806130578, + "learning_rate": 8.108070733774563e-05, + "loss": 0.9156, + "step": 17922 + }, + { + "epoch": 3.8231655290102387, + "grad_norm": 0.19465327566356755, + "learning_rate": 8.107219834271881e-05, + "loss": 0.9246, + "step": 17923 + }, + { + "epoch": 3.823378839590444, + "grad_norm": 0.1674325486283458, + "learning_rate": 8.10636893355601e-05, + "loss": 0.9704, + "step": 17924 + }, + { + "epoch": 3.8235921501706485, + "grad_norm": 0.16784648878463718, + "learning_rate": 8.105518031636568e-05, + "loss": 0.9186, + "step": 17925 + }, + { + "epoch": 3.823805460750853, + "grad_norm": 0.1798530006255107, + "learning_rate": 8.104667128523188e-05, + "loss": 0.9147, + "step": 17926 + }, + { + "epoch": 3.8240187713310583, + "grad_norm": 0.22571109192149744, + "learning_rate": 8.1038162242255e-05, + "loss": 0.954, + "step": 17927 + }, + { + "epoch": 3.824232081911263, + "grad_norm": 0.18219492451578312, + "learning_rate": 8.102965318753127e-05, + "loss": 0.927, + "step": 17928 + }, + { + "epoch": 3.8244453924914676, + "grad_norm": 0.20291616058830597, + "learning_rate": 8.102114412115702e-05, + "loss": 0.9102, + "step": 17929 + }, + { + "epoch": 3.8246587030716723, + "grad_norm": 0.20031703421118796, + "learning_rate": 8.10126350432285e-05, + "loss": 0.9257, + "step": 17930 + }, + { + "epoch": 3.824872013651877, + "grad_norm": 0.25073199543176405, + "learning_rate": 8.100412595384199e-05, + "loss": 0.9056, + "step": 17931 + }, + { + "epoch": 3.8250853242320817, + "grad_norm": 0.2268287762153643, + "learning_rate": 8.09956168530938e-05, + "loss": 0.9288, + "step": 17932 + }, + { + "epoch": 3.825298634812287, + "grad_norm": 0.22965505018637883, + "learning_rate": 8.098710774108014e-05, + "loss": 0.907, + "step": 17933 + }, + { + "epoch": 3.8255119453924915, + "grad_norm": 0.253826363693125, + "learning_rate": 8.097859861789738e-05, + "loss": 0.9312, + "step": 17934 + }, + { + "epoch": 3.825725255972696, + "grad_norm": 0.171470171131302, + "learning_rate": 8.097008948364176e-05, + "loss": 0.895, + "step": 17935 + }, + { + "epoch": 3.825938566552901, + "grad_norm": 0.19889369053955952, + "learning_rate": 8.096158033840957e-05, + "loss": 0.9101, + "step": 17936 + }, + { + "epoch": 3.826151877133106, + "grad_norm": 0.22699560238383726, + "learning_rate": 8.095307118229705e-05, + "loss": 0.9409, + "step": 17937 + }, + { + "epoch": 3.8263651877133107, + "grad_norm": 0.21877816739821845, + "learning_rate": 8.094456201540052e-05, + "loss": 0.9066, + "step": 17938 + }, + { + "epoch": 3.8265784982935154, + "grad_norm": 0.25130722899949426, + "learning_rate": 8.093605283781625e-05, + "loss": 0.9263, + "step": 17939 + }, + { + "epoch": 3.82679180887372, + "grad_norm": 0.22386544878535716, + "learning_rate": 8.092754364964057e-05, + "loss": 0.9308, + "step": 17940 + }, + { + "epoch": 3.8270051194539247, + "grad_norm": 0.22131646059572063, + "learning_rate": 8.091903445096967e-05, + "loss": 0.9066, + "step": 17941 + }, + { + "epoch": 3.8272184300341294, + "grad_norm": 0.17070932781982018, + "learning_rate": 8.091052524189991e-05, + "loss": 0.8786, + "step": 17942 + }, + { + "epoch": 3.8274317406143346, + "grad_norm": 0.21902741409183502, + "learning_rate": 8.09020160225275e-05, + "loss": 0.9056, + "step": 17943 + }, + { + "epoch": 3.8276450511945392, + "grad_norm": 0.2507448058067841, + "learning_rate": 8.08935067929488e-05, + "loss": 0.9197, + "step": 17944 + }, + { + "epoch": 3.827858361774744, + "grad_norm": 0.23435473729673337, + "learning_rate": 8.088499755326004e-05, + "loss": 0.9409, + "step": 17945 + }, + { + "epoch": 3.828071672354949, + "grad_norm": 0.1792455299514742, + "learning_rate": 8.087648830355754e-05, + "loss": 0.9266, + "step": 17946 + }, + { + "epoch": 3.8282849829351537, + "grad_norm": 0.20051732906051412, + "learning_rate": 8.086797904393754e-05, + "loss": 0.9044, + "step": 17947 + }, + { + "epoch": 3.8284982935153584, + "grad_norm": 0.2545954897428454, + "learning_rate": 8.085946977449637e-05, + "loss": 0.9097, + "step": 17948 + }, + { + "epoch": 3.828711604095563, + "grad_norm": 0.2631977691089654, + "learning_rate": 8.085096049533024e-05, + "loss": 0.9213, + "step": 17949 + }, + { + "epoch": 3.828924914675768, + "grad_norm": 0.22352403894898176, + "learning_rate": 8.084245120653551e-05, + "loss": 0.8986, + "step": 17950 + }, + { + "epoch": 3.8291382252559725, + "grad_norm": 0.169993592846994, + "learning_rate": 8.083394190820842e-05, + "loss": 0.9239, + "step": 17951 + }, + { + "epoch": 3.8293515358361776, + "grad_norm": 0.24082456572464664, + "learning_rate": 8.082543260044527e-05, + "loss": 0.9085, + "step": 17952 + }, + { + "epoch": 3.8295648464163823, + "grad_norm": 0.2171160597174873, + "learning_rate": 8.081692328334232e-05, + "loss": 0.9076, + "step": 17953 + }, + { + "epoch": 3.829778156996587, + "grad_norm": 0.18478867930856543, + "learning_rate": 8.080841395699588e-05, + "loss": 0.9091, + "step": 17954 + }, + { + "epoch": 3.8299914675767917, + "grad_norm": 0.2286148032425788, + "learning_rate": 8.079990462150222e-05, + "loss": 0.9258, + "step": 17955 + }, + { + "epoch": 3.830204778156997, + "grad_norm": 0.1983384747204922, + "learning_rate": 8.079139527695763e-05, + "loss": 0.9067, + "step": 17956 + }, + { + "epoch": 3.8304180887372015, + "grad_norm": 0.25648538409830557, + "learning_rate": 8.07828859234584e-05, + "loss": 0.8842, + "step": 17957 + }, + { + "epoch": 3.830631399317406, + "grad_norm": 0.2283775945142326, + "learning_rate": 8.077437656110079e-05, + "loss": 0.9551, + "step": 17958 + }, + { + "epoch": 3.830844709897611, + "grad_norm": 0.165796126308495, + "learning_rate": 8.076586718998108e-05, + "loss": 0.9045, + "step": 17959 + }, + { + "epoch": 3.8310580204778155, + "grad_norm": 0.2193250344541518, + "learning_rate": 8.075735781019559e-05, + "loss": 0.9234, + "step": 17960 + }, + { + "epoch": 3.83127133105802, + "grad_norm": 0.22708091454265322, + "learning_rate": 8.074884842184057e-05, + "loss": 0.9084, + "step": 17961 + }, + { + "epoch": 3.8314846416382253, + "grad_norm": 0.2171374362904843, + "learning_rate": 8.074033902501233e-05, + "loss": 0.8985, + "step": 17962 + }, + { + "epoch": 3.83169795221843, + "grad_norm": 0.2167893417913059, + "learning_rate": 8.073182961980715e-05, + "loss": 0.9155, + "step": 17963 + }, + { + "epoch": 3.8319112627986347, + "grad_norm": 0.1817305134665644, + "learning_rate": 8.07233202063213e-05, + "loss": 0.9104, + "step": 17964 + }, + { + "epoch": 3.83212457337884, + "grad_norm": 0.1877574936802497, + "learning_rate": 8.071481078465106e-05, + "loss": 0.9052, + "step": 17965 + }, + { + "epoch": 3.8323378839590445, + "grad_norm": 0.22118165875721743, + "learning_rate": 8.070630135489271e-05, + "loss": 0.8991, + "step": 17966 + }, + { + "epoch": 3.832551194539249, + "grad_norm": 0.20329905536815454, + "learning_rate": 8.069779191714257e-05, + "loss": 0.9074, + "step": 17967 + }, + { + "epoch": 3.832764505119454, + "grad_norm": 0.15646043299348183, + "learning_rate": 8.068928247149692e-05, + "loss": 0.944, + "step": 17968 + }, + { + "epoch": 3.8329778156996586, + "grad_norm": 0.201917542420289, + "learning_rate": 8.068077301805199e-05, + "loss": 0.9338, + "step": 17969 + }, + { + "epoch": 3.8331911262798632, + "grad_norm": 0.23280184453558903, + "learning_rate": 8.067226355690413e-05, + "loss": 0.9233, + "step": 17970 + }, + { + "epoch": 3.8334044368600684, + "grad_norm": 0.19711024415020967, + "learning_rate": 8.066375408814957e-05, + "loss": 0.9023, + "step": 17971 + }, + { + "epoch": 3.833617747440273, + "grad_norm": 0.19961639606237436, + "learning_rate": 8.065524461188464e-05, + "loss": 0.9407, + "step": 17972 + }, + { + "epoch": 3.8338310580204777, + "grad_norm": 0.20799025309685626, + "learning_rate": 8.06467351282056e-05, + "loss": 0.925, + "step": 17973 + }, + { + "epoch": 3.8340443686006824, + "grad_norm": 0.19495152963174445, + "learning_rate": 8.063822563720876e-05, + "loss": 0.8821, + "step": 17974 + }, + { + "epoch": 3.8342576791808876, + "grad_norm": 0.19743085760959114, + "learning_rate": 8.062971613899037e-05, + "loss": 0.8897, + "step": 17975 + }, + { + "epoch": 3.8344709897610922, + "grad_norm": 0.18565720862799304, + "learning_rate": 8.062120663364673e-05, + "loss": 0.9367, + "step": 17976 + }, + { + "epoch": 3.834684300341297, + "grad_norm": 0.23302050041128664, + "learning_rate": 8.061269712127411e-05, + "loss": 0.9168, + "step": 17977 + }, + { + "epoch": 3.8348976109215016, + "grad_norm": 0.2250773949837012, + "learning_rate": 8.060418760196882e-05, + "loss": 0.9343, + "step": 17978 + }, + { + "epoch": 3.8351109215017063, + "grad_norm": 0.17500546001851874, + "learning_rate": 8.059567807582713e-05, + "loss": 0.9156, + "step": 17979 + }, + { + "epoch": 3.8353242320819114, + "grad_norm": 0.22605533300090216, + "learning_rate": 8.058716854294538e-05, + "loss": 0.9216, + "step": 17980 + }, + { + "epoch": 3.835537542662116, + "grad_norm": 0.24906296450566567, + "learning_rate": 8.057865900341977e-05, + "loss": 0.9382, + "step": 17981 + }, + { + "epoch": 3.835750853242321, + "grad_norm": 0.23671432125096334, + "learning_rate": 8.057014945734663e-05, + "loss": 0.9525, + "step": 17982 + }, + { + "epoch": 3.8359641638225255, + "grad_norm": 0.19338548407385267, + "learning_rate": 8.056163990482222e-05, + "loss": 0.9144, + "step": 17983 + }, + { + "epoch": 3.8361774744027306, + "grad_norm": 0.21920911575220275, + "learning_rate": 8.055313034594289e-05, + "loss": 0.913, + "step": 17984 + }, + { + "epoch": 3.8363907849829353, + "grad_norm": 0.17295412767260218, + "learning_rate": 8.054462078080486e-05, + "loss": 0.9153, + "step": 17985 + }, + { + "epoch": 3.83660409556314, + "grad_norm": 0.19445308476209072, + "learning_rate": 8.053611120950443e-05, + "loss": 0.9524, + "step": 17986 + }, + { + "epoch": 3.8368174061433447, + "grad_norm": 0.18900288000712367, + "learning_rate": 8.052760163213789e-05, + "loss": 0.9082, + "step": 17987 + }, + { + "epoch": 3.8370307167235493, + "grad_norm": 0.19537945784859803, + "learning_rate": 8.051909204880153e-05, + "loss": 0.895, + "step": 17988 + }, + { + "epoch": 3.837244027303754, + "grad_norm": 0.157385534986151, + "learning_rate": 8.051058245959163e-05, + "loss": 0.9, + "step": 17989 + }, + { + "epoch": 3.837457337883959, + "grad_norm": 0.21079554968236547, + "learning_rate": 8.05020728646045e-05, + "loss": 0.905, + "step": 17990 + }, + { + "epoch": 3.837670648464164, + "grad_norm": 0.18838141244024395, + "learning_rate": 8.049356326393639e-05, + "loss": 0.9425, + "step": 17991 + }, + { + "epoch": 3.8378839590443685, + "grad_norm": 0.19987926962434222, + "learning_rate": 8.048505365768363e-05, + "loss": 0.8891, + "step": 17992 + }, + { + "epoch": 3.8380972696245736, + "grad_norm": 0.23112996136983738, + "learning_rate": 8.047654404594245e-05, + "loss": 0.905, + "step": 17993 + }, + { + "epoch": 3.8383105802047783, + "grad_norm": 0.18951828864361567, + "learning_rate": 8.046803442880917e-05, + "loss": 0.9099, + "step": 17994 + }, + { + "epoch": 3.838523890784983, + "grad_norm": 0.18030187420877977, + "learning_rate": 8.045952480638006e-05, + "loss": 0.9111, + "step": 17995 + }, + { + "epoch": 3.8387372013651877, + "grad_norm": 0.25310175342608393, + "learning_rate": 8.045101517875146e-05, + "loss": 0.915, + "step": 17996 + }, + { + "epoch": 3.8389505119453924, + "grad_norm": 0.23223659131039845, + "learning_rate": 8.04425055460196e-05, + "loss": 0.9353, + "step": 17997 + }, + { + "epoch": 3.839163822525597, + "grad_norm": 0.20157292154539713, + "learning_rate": 8.043399590828076e-05, + "loss": 0.9148, + "step": 17998 + }, + { + "epoch": 3.839377133105802, + "grad_norm": 0.3063783414037973, + "learning_rate": 8.042548626563125e-05, + "loss": 0.8966, + "step": 17999 + }, + { + "epoch": 3.839590443686007, + "grad_norm": 0.25710831565649467, + "learning_rate": 8.041697661816738e-05, + "loss": 0.916, + "step": 18000 + }, + { + "epoch": 3.8398037542662116, + "grad_norm": 0.21373944981106702, + "learning_rate": 8.040846696598539e-05, + "loss": 0.9302, + "step": 18001 + }, + { + "epoch": 3.8400170648464163, + "grad_norm": 0.2885793094231703, + "learning_rate": 8.039995730918163e-05, + "loss": 0.9227, + "step": 18002 + }, + { + "epoch": 3.8402303754266214, + "grad_norm": 0.2644614497174806, + "learning_rate": 8.03914476478523e-05, + "loss": 0.9261, + "step": 18003 + }, + { + "epoch": 3.840443686006826, + "grad_norm": 0.23817528812441474, + "learning_rate": 8.038293798209376e-05, + "loss": 0.9049, + "step": 18004 + }, + { + "epoch": 3.8406569965870307, + "grad_norm": 0.29044272442710134, + "learning_rate": 8.037442831200224e-05, + "loss": 0.9072, + "step": 18005 + }, + { + "epoch": 3.8408703071672354, + "grad_norm": 0.21217235900022643, + "learning_rate": 8.03659186376741e-05, + "loss": 0.9136, + "step": 18006 + }, + { + "epoch": 3.84108361774744, + "grad_norm": 0.21710283109582149, + "learning_rate": 8.035740895920555e-05, + "loss": 0.891, + "step": 18007 + }, + { + "epoch": 3.841296928327645, + "grad_norm": 0.27741453627580304, + "learning_rate": 8.034889927669294e-05, + "loss": 0.9191, + "step": 18008 + }, + { + "epoch": 3.84151023890785, + "grad_norm": 0.24505728319283737, + "learning_rate": 8.034038959023252e-05, + "loss": 0.8973, + "step": 18009 + }, + { + "epoch": 3.8417235494880546, + "grad_norm": 0.21486404253799918, + "learning_rate": 8.033187989992057e-05, + "loss": 0.9388, + "step": 18010 + }, + { + "epoch": 3.8419368600682593, + "grad_norm": 0.23152248611925616, + "learning_rate": 8.032337020585338e-05, + "loss": 0.8988, + "step": 18011 + }, + { + "epoch": 3.8421501706484644, + "grad_norm": 0.18751842159089877, + "learning_rate": 8.03148605081273e-05, + "loss": 0.9127, + "step": 18012 + }, + { + "epoch": 3.842363481228669, + "grad_norm": 0.21297300571710828, + "learning_rate": 8.030635080683853e-05, + "loss": 0.9047, + "step": 18013 + }, + { + "epoch": 3.842576791808874, + "grad_norm": 0.21229428052958804, + "learning_rate": 8.029784110208342e-05, + "loss": 0.9791, + "step": 18014 + }, + { + "epoch": 3.8427901023890785, + "grad_norm": 0.20426293673095763, + "learning_rate": 8.028933139395822e-05, + "loss": 0.9122, + "step": 18015 + }, + { + "epoch": 3.843003412969283, + "grad_norm": 0.22198478698787022, + "learning_rate": 8.028082168255923e-05, + "loss": 0.9137, + "step": 18016 + }, + { + "epoch": 3.843216723549488, + "grad_norm": 0.2125622705905414, + "learning_rate": 8.027231196798273e-05, + "loss": 0.9101, + "step": 18017 + }, + { + "epoch": 3.843430034129693, + "grad_norm": 0.17563879716901457, + "learning_rate": 8.026380225032504e-05, + "loss": 0.9373, + "step": 18018 + }, + { + "epoch": 3.8436433447098977, + "grad_norm": 0.20986830757870104, + "learning_rate": 8.02552925296824e-05, + "loss": 0.8928, + "step": 18019 + }, + { + "epoch": 3.8438566552901023, + "grad_norm": 0.18998983372480785, + "learning_rate": 8.024678280615114e-05, + "loss": 0.9423, + "step": 18020 + }, + { + "epoch": 3.844069965870307, + "grad_norm": 0.21028960587094447, + "learning_rate": 8.02382730798275e-05, + "loss": 0.898, + "step": 18021 + }, + { + "epoch": 3.844283276450512, + "grad_norm": 0.2172559788581714, + "learning_rate": 8.022976335080781e-05, + "loss": 0.939, + "step": 18022 + }, + { + "epoch": 3.844496587030717, + "grad_norm": 0.20660723052914753, + "learning_rate": 8.022125361918834e-05, + "loss": 0.8853, + "step": 18023 + }, + { + "epoch": 3.8447098976109215, + "grad_norm": 0.2489209117708991, + "learning_rate": 8.02127438850654e-05, + "loss": 0.9006, + "step": 18024 + }, + { + "epoch": 3.844923208191126, + "grad_norm": 0.21917554378264556, + "learning_rate": 8.020423414853524e-05, + "loss": 0.9377, + "step": 18025 + }, + { + "epoch": 3.845136518771331, + "grad_norm": 0.1770058879908612, + "learning_rate": 8.019572440969418e-05, + "loss": 0.9137, + "step": 18026 + }, + { + "epoch": 3.8453498293515356, + "grad_norm": 0.19668724884324137, + "learning_rate": 8.018721466863849e-05, + "loss": 0.9158, + "step": 18027 + }, + { + "epoch": 3.8455631399317407, + "grad_norm": 0.20205080344328746, + "learning_rate": 8.017870492546445e-05, + "loss": 0.9094, + "step": 18028 + }, + { + "epoch": 3.8457764505119454, + "grad_norm": 0.22505510527852557, + "learning_rate": 8.017019518026838e-05, + "loss": 0.9193, + "step": 18029 + }, + { + "epoch": 3.84598976109215, + "grad_norm": 0.17721397355177732, + "learning_rate": 8.016168543314654e-05, + "loss": 0.9318, + "step": 18030 + }, + { + "epoch": 3.846203071672355, + "grad_norm": 0.2559162436890489, + "learning_rate": 8.015317568419524e-05, + "loss": 0.9245, + "step": 18031 + }, + { + "epoch": 3.84641638225256, + "grad_norm": 0.31577583036655993, + "learning_rate": 8.014466593351074e-05, + "loss": 0.9228, + "step": 18032 + }, + { + "epoch": 3.8466296928327646, + "grad_norm": 0.21469879826702995, + "learning_rate": 8.013615618118934e-05, + "loss": 0.909, + "step": 18033 + }, + { + "epoch": 3.8468430034129693, + "grad_norm": 0.17729425049100683, + "learning_rate": 8.012764642732734e-05, + "loss": 0.9192, + "step": 18034 + }, + { + "epoch": 3.847056313993174, + "grad_norm": 0.2686703664167492, + "learning_rate": 8.0119136672021e-05, + "loss": 0.8999, + "step": 18035 + }, + { + "epoch": 3.8472696245733786, + "grad_norm": 0.26962958290982225, + "learning_rate": 8.011062691536664e-05, + "loss": 0.9086, + "step": 18036 + }, + { + "epoch": 3.8474829351535837, + "grad_norm": 0.28550680355179003, + "learning_rate": 8.010211715746054e-05, + "loss": 0.9179, + "step": 18037 + }, + { + "epoch": 3.8476962457337884, + "grad_norm": 0.2787846510446049, + "learning_rate": 8.0093607398399e-05, + "loss": 0.9244, + "step": 18038 + }, + { + "epoch": 3.847909556313993, + "grad_norm": 0.20343225759580863, + "learning_rate": 8.008509763827824e-05, + "loss": 0.8914, + "step": 18039 + }, + { + "epoch": 3.848122866894198, + "grad_norm": 0.17233830733040015, + "learning_rate": 8.007658787719465e-05, + "loss": 0.9479, + "step": 18040 + }, + { + "epoch": 3.848336177474403, + "grad_norm": 0.19239564530143755, + "learning_rate": 8.006807811524445e-05, + "loss": 0.9207, + "step": 18041 + }, + { + "epoch": 3.8485494880546076, + "grad_norm": 0.17891887902721712, + "learning_rate": 8.005956835252391e-05, + "loss": 0.9006, + "step": 18042 + }, + { + "epoch": 3.8487627986348123, + "grad_norm": 0.18024269896061834, + "learning_rate": 8.00510585891294e-05, + "loss": 0.9307, + "step": 18043 + }, + { + "epoch": 3.848976109215017, + "grad_norm": 0.21164929542495844, + "learning_rate": 8.004254882515715e-05, + "loss": 0.8911, + "step": 18044 + }, + { + "epoch": 3.8491894197952217, + "grad_norm": 0.20010574522424415, + "learning_rate": 8.003403906070343e-05, + "loss": 0.9213, + "step": 18045 + }, + { + "epoch": 3.8494027303754264, + "grad_norm": 0.17449312699486935, + "learning_rate": 8.002552929586459e-05, + "loss": 0.8984, + "step": 18046 + }, + { + "epoch": 3.8496160409556315, + "grad_norm": 0.22945929278562674, + "learning_rate": 8.00170195307369e-05, + "loss": 0.9252, + "step": 18047 + }, + { + "epoch": 3.849829351535836, + "grad_norm": 0.1841656454204226, + "learning_rate": 8.000850976541659e-05, + "loss": 0.8988, + "step": 18048 + }, + { + "epoch": 3.850042662116041, + "grad_norm": 0.18045169196881797, + "learning_rate": 8e-05, + "loss": 0.901, + "step": 18049 + }, + { + "epoch": 3.850255972696246, + "grad_norm": 0.2154294080883672, + "learning_rate": 7.999149023458344e-05, + "loss": 0.9093, + "step": 18050 + }, + { + "epoch": 3.8504692832764507, + "grad_norm": 0.18540755108409748, + "learning_rate": 7.998298046926313e-05, + "loss": 0.9492, + "step": 18051 + }, + { + "epoch": 3.8506825938566553, + "grad_norm": 0.19253767747884507, + "learning_rate": 7.997447070413542e-05, + "loss": 0.9235, + "step": 18052 + }, + { + "epoch": 3.85089590443686, + "grad_norm": 0.19932864915440468, + "learning_rate": 7.996596093929657e-05, + "loss": 0.9104, + "step": 18053 + }, + { + "epoch": 3.8511092150170647, + "grad_norm": 0.19308933900586905, + "learning_rate": 7.995745117484287e-05, + "loss": 0.9253, + "step": 18054 + }, + { + "epoch": 3.8513225255972694, + "grad_norm": 0.20993442778108926, + "learning_rate": 7.994894141087065e-05, + "loss": 0.9341, + "step": 18055 + }, + { + "epoch": 3.8515358361774745, + "grad_norm": 0.22677682683801467, + "learning_rate": 7.99404316474761e-05, + "loss": 0.9235, + "step": 18056 + }, + { + "epoch": 3.851749146757679, + "grad_norm": 0.246191159980036, + "learning_rate": 7.99319218847556e-05, + "loss": 0.94, + "step": 18057 + }, + { + "epoch": 3.851962457337884, + "grad_norm": 0.16992016294681048, + "learning_rate": 7.992341212280538e-05, + "loss": 0.9253, + "step": 18058 + }, + { + "epoch": 3.8521757679180886, + "grad_norm": 0.18038922514018962, + "learning_rate": 7.991490236172177e-05, + "loss": 0.9119, + "step": 18059 + }, + { + "epoch": 3.8523890784982937, + "grad_norm": 0.2021244207374573, + "learning_rate": 7.990639260160103e-05, + "loss": 0.9344, + "step": 18060 + }, + { + "epoch": 3.8526023890784984, + "grad_norm": 0.19844588755125872, + "learning_rate": 7.989788284253948e-05, + "loss": 0.949, + "step": 18061 + }, + { + "epoch": 3.852815699658703, + "grad_norm": 0.22518203035732773, + "learning_rate": 7.988937308463338e-05, + "loss": 0.9152, + "step": 18062 + }, + { + "epoch": 3.8530290102389078, + "grad_norm": 0.2357334969360696, + "learning_rate": 7.988086332797901e-05, + "loss": 0.9383, + "step": 18063 + }, + { + "epoch": 3.8532423208191124, + "grad_norm": 0.21720391324838312, + "learning_rate": 7.987235357267268e-05, + "loss": 0.9025, + "step": 18064 + }, + { + "epoch": 3.8534556313993176, + "grad_norm": 0.17978653589956042, + "learning_rate": 7.986384381881067e-05, + "loss": 0.9027, + "step": 18065 + }, + { + "epoch": 3.8536689419795223, + "grad_norm": 0.21769818339707284, + "learning_rate": 7.98553340664893e-05, + "loss": 0.9016, + "step": 18066 + }, + { + "epoch": 3.853882252559727, + "grad_norm": 0.20004214019063107, + "learning_rate": 7.98468243158048e-05, + "loss": 0.9157, + "step": 18067 + }, + { + "epoch": 3.8540955631399316, + "grad_norm": 0.22029801678038044, + "learning_rate": 7.983831456685349e-05, + "loss": 0.9198, + "step": 18068 + }, + { + "epoch": 3.8543088737201368, + "grad_norm": 0.26160972656359427, + "learning_rate": 7.982980481973165e-05, + "loss": 0.9197, + "step": 18069 + }, + { + "epoch": 3.8545221843003414, + "grad_norm": 0.22555570850850895, + "learning_rate": 7.982129507453556e-05, + "loss": 0.9244, + "step": 18070 + }, + { + "epoch": 3.854735494880546, + "grad_norm": 0.1780527955770795, + "learning_rate": 7.981278533136155e-05, + "loss": 0.8983, + "step": 18071 + }, + { + "epoch": 3.854948805460751, + "grad_norm": 0.2317266506008649, + "learning_rate": 7.980427559030584e-05, + "loss": 0.911, + "step": 18072 + }, + { + "epoch": 3.8551621160409555, + "grad_norm": 0.18982106075107275, + "learning_rate": 7.979576585146477e-05, + "loss": 0.9048, + "step": 18073 + }, + { + "epoch": 3.85537542662116, + "grad_norm": 0.21100135266674763, + "learning_rate": 7.978725611493464e-05, + "loss": 0.9054, + "step": 18074 + }, + { + "epoch": 3.8555887372013653, + "grad_norm": 0.200730982881468, + "learning_rate": 7.977874638081166e-05, + "loss": 0.9123, + "step": 18075 + }, + { + "epoch": 3.85580204778157, + "grad_norm": 0.215525476083251, + "learning_rate": 7.97702366491922e-05, + "loss": 0.9019, + "step": 18076 + }, + { + "epoch": 3.8560153583617747, + "grad_norm": 0.20113330565877252, + "learning_rate": 7.976172692017254e-05, + "loss": 0.859, + "step": 18077 + }, + { + "epoch": 3.85622866894198, + "grad_norm": 0.2565358343871447, + "learning_rate": 7.97532171938489e-05, + "loss": 0.9163, + "step": 18078 + }, + { + "epoch": 3.8564419795221845, + "grad_norm": 0.32971993668817434, + "learning_rate": 7.974470747031764e-05, + "loss": 0.9004, + "step": 18079 + }, + { + "epoch": 3.856655290102389, + "grad_norm": 0.4101324531509742, + "learning_rate": 7.9736197749675e-05, + "loss": 0.9055, + "step": 18080 + }, + { + "epoch": 3.856868600682594, + "grad_norm": 0.3710161728782138, + "learning_rate": 7.97276880320173e-05, + "loss": 0.9374, + "step": 18081 + }, + { + "epoch": 3.8570819112627985, + "grad_norm": 0.2421669780484657, + "learning_rate": 7.97191783174408e-05, + "loss": 0.9214, + "step": 18082 + }, + { + "epoch": 3.857295221843003, + "grad_norm": 0.1902956552264604, + "learning_rate": 7.971066860604181e-05, + "loss": 0.907, + "step": 18083 + }, + { + "epoch": 3.8575085324232083, + "grad_norm": 0.2124577394166644, + "learning_rate": 7.97021588979166e-05, + "loss": 0.9096, + "step": 18084 + }, + { + "epoch": 3.857721843003413, + "grad_norm": 0.275284985280446, + "learning_rate": 7.969364919316148e-05, + "loss": 0.937, + "step": 18085 + }, + { + "epoch": 3.8579351535836177, + "grad_norm": 0.29525705366969807, + "learning_rate": 7.968513949187273e-05, + "loss": 0.9483, + "step": 18086 + }, + { + "epoch": 3.8581484641638224, + "grad_norm": 0.2625979843194037, + "learning_rate": 7.96766297941466e-05, + "loss": 0.9439, + "step": 18087 + }, + { + "epoch": 3.8583617747440275, + "grad_norm": 0.19740498277393864, + "learning_rate": 7.966812010007946e-05, + "loss": 0.8971, + "step": 18088 + }, + { + "epoch": 3.858575085324232, + "grad_norm": 0.20218463520825744, + "learning_rate": 7.965961040976754e-05, + "loss": 0.9092, + "step": 18089 + }, + { + "epoch": 3.858788395904437, + "grad_norm": 0.2873839129251301, + "learning_rate": 7.96511007233071e-05, + "loss": 0.8884, + "step": 18090 + }, + { + "epoch": 3.8590017064846416, + "grad_norm": 0.28231723742277887, + "learning_rate": 7.964259104079447e-05, + "loss": 0.9245, + "step": 18091 + }, + { + "epoch": 3.8592150170648463, + "grad_norm": 0.2195338317991507, + "learning_rate": 7.963408136232592e-05, + "loss": 0.9509, + "step": 18092 + }, + { + "epoch": 3.859428327645051, + "grad_norm": 0.20075921233226382, + "learning_rate": 7.962557168799777e-05, + "loss": 0.921, + "step": 18093 + }, + { + "epoch": 3.859641638225256, + "grad_norm": 0.20492646112195734, + "learning_rate": 7.961706201790627e-05, + "loss": 0.9196, + "step": 18094 + }, + { + "epoch": 3.8598549488054608, + "grad_norm": 0.5963174202023449, + "learning_rate": 7.960855235214772e-05, + "loss": 0.9416, + "step": 18095 + }, + { + "epoch": 3.8600682593856654, + "grad_norm": 0.22259491168914383, + "learning_rate": 7.960004269081839e-05, + "loss": 0.909, + "step": 18096 + }, + { + "epoch": 3.8602815699658706, + "grad_norm": 0.2185762515452708, + "learning_rate": 7.959153303401461e-05, + "loss": 0.9141, + "step": 18097 + }, + { + "epoch": 3.8604948805460753, + "grad_norm": 0.25172501566184896, + "learning_rate": 7.958302338183264e-05, + "loss": 0.9232, + "step": 18098 + }, + { + "epoch": 3.86070819112628, + "grad_norm": 0.20554576414694767, + "learning_rate": 7.957451373436875e-05, + "loss": 0.9131, + "step": 18099 + }, + { + "epoch": 3.8609215017064846, + "grad_norm": 0.20027541699776938, + "learning_rate": 7.956600409171928e-05, + "loss": 0.9094, + "step": 18100 + }, + { + "epoch": 3.8611348122866893, + "grad_norm": 0.21876026684472094, + "learning_rate": 7.955749445398046e-05, + "loss": 0.8972, + "step": 18101 + }, + { + "epoch": 3.861348122866894, + "grad_norm": 0.19090965895491224, + "learning_rate": 7.954898482124858e-05, + "loss": 0.8966, + "step": 18102 + }, + { + "epoch": 3.861561433447099, + "grad_norm": 0.2025875090597653, + "learning_rate": 7.954047519361995e-05, + "loss": 0.8955, + "step": 18103 + }, + { + "epoch": 3.861774744027304, + "grad_norm": 0.18997100474897385, + "learning_rate": 7.953196557119086e-05, + "loss": 0.9203, + "step": 18104 + }, + { + "epoch": 3.8619880546075085, + "grad_norm": 0.28702782914768726, + "learning_rate": 7.952345595405758e-05, + "loss": 0.9489, + "step": 18105 + }, + { + "epoch": 3.862201365187713, + "grad_norm": 0.2942838320071693, + "learning_rate": 7.95149463423164e-05, + "loss": 0.8656, + "step": 18106 + }, + { + "epoch": 3.8624146757679183, + "grad_norm": 0.3020061851472156, + "learning_rate": 7.950643673606362e-05, + "loss": 0.9136, + "step": 18107 + }, + { + "epoch": 3.862627986348123, + "grad_norm": 0.23984149383538522, + "learning_rate": 7.949792713539552e-05, + "loss": 0.916, + "step": 18108 + }, + { + "epoch": 3.8628412969283277, + "grad_norm": 0.18324331560752946, + "learning_rate": 7.948941754040837e-05, + "loss": 0.9078, + "step": 18109 + }, + { + "epoch": 3.8630546075085324, + "grad_norm": 0.2319262202569442, + "learning_rate": 7.948090795119848e-05, + "loss": 0.9314, + "step": 18110 + }, + { + "epoch": 3.863267918088737, + "grad_norm": 0.23433420682584363, + "learning_rate": 7.947239836786216e-05, + "loss": 0.9249, + "step": 18111 + }, + { + "epoch": 3.8634812286689417, + "grad_norm": 0.19997128619561644, + "learning_rate": 7.946388879049561e-05, + "loss": 0.9208, + "step": 18112 + }, + { + "epoch": 3.863694539249147, + "grad_norm": 0.21438148840118929, + "learning_rate": 7.945537921919518e-05, + "loss": 0.9079, + "step": 18113 + }, + { + "epoch": 3.8639078498293515, + "grad_norm": 0.24553539062199417, + "learning_rate": 7.944686965405714e-05, + "loss": 0.9121, + "step": 18114 + }, + { + "epoch": 3.864121160409556, + "grad_norm": 0.22157928724750248, + "learning_rate": 7.943836009517779e-05, + "loss": 0.9063, + "step": 18115 + }, + { + "epoch": 3.8643344709897613, + "grad_norm": 0.1977073282629572, + "learning_rate": 7.942985054265339e-05, + "loss": 0.8823, + "step": 18116 + }, + { + "epoch": 3.864547781569966, + "grad_norm": 0.2748912973785187, + "learning_rate": 7.942134099658025e-05, + "loss": 0.921, + "step": 18117 + }, + { + "epoch": 3.8647610921501707, + "grad_norm": 0.25550272096296456, + "learning_rate": 7.941283145705463e-05, + "loss": 0.8982, + "step": 18118 + }, + { + "epoch": 3.8649744027303754, + "grad_norm": 0.2296610901337069, + "learning_rate": 7.940432192417285e-05, + "loss": 0.9477, + "step": 18119 + }, + { + "epoch": 3.86518771331058, + "grad_norm": 0.3591148950421046, + "learning_rate": 7.939581239803117e-05, + "loss": 0.8848, + "step": 18120 + }, + { + "epoch": 3.8654010238907848, + "grad_norm": 0.3114743330302245, + "learning_rate": 7.93873028787259e-05, + "loss": 0.9096, + "step": 18121 + }, + { + "epoch": 3.86561433447099, + "grad_norm": 0.22697372709650407, + "learning_rate": 7.937879336635331e-05, + "loss": 0.9203, + "step": 18122 + }, + { + "epoch": 3.8658276450511946, + "grad_norm": 0.28560771210007635, + "learning_rate": 7.937028386100968e-05, + "loss": 0.9185, + "step": 18123 + }, + { + "epoch": 3.8660409556313993, + "grad_norm": 0.31561701837214456, + "learning_rate": 7.936177436279128e-05, + "loss": 0.8944, + "step": 18124 + }, + { + "epoch": 3.866254266211604, + "grad_norm": 0.25400241019427455, + "learning_rate": 7.935326487179443e-05, + "loss": 0.9051, + "step": 18125 + }, + { + "epoch": 3.866467576791809, + "grad_norm": 0.33073802729892027, + "learning_rate": 7.934475538811539e-05, + "loss": 0.9347, + "step": 18126 + }, + { + "epoch": 3.8666808873720138, + "grad_norm": 0.33289317436109217, + "learning_rate": 7.933624591185045e-05, + "loss": 0.8829, + "step": 18127 + }, + { + "epoch": 3.8668941979522184, + "grad_norm": 0.1934858802990554, + "learning_rate": 7.93277364430959e-05, + "loss": 0.8969, + "step": 18128 + }, + { + "epoch": 3.867107508532423, + "grad_norm": 0.2931936135089298, + "learning_rate": 7.931922698194803e-05, + "loss": 0.9231, + "step": 18129 + }, + { + "epoch": 3.867320819112628, + "grad_norm": 0.2884097589696534, + "learning_rate": 7.93107175285031e-05, + "loss": 0.8992, + "step": 18130 + }, + { + "epoch": 3.8675341296928325, + "grad_norm": 0.3055456016568225, + "learning_rate": 7.930220808285744e-05, + "loss": 0.9101, + "step": 18131 + }, + { + "epoch": 3.8677474402730376, + "grad_norm": 0.39437696340480755, + "learning_rate": 7.929369864510728e-05, + "loss": 0.9371, + "step": 18132 + }, + { + "epoch": 3.8679607508532423, + "grad_norm": 0.26772064418843544, + "learning_rate": 7.9285189215349e-05, + "loss": 0.8863, + "step": 18133 + }, + { + "epoch": 3.868174061433447, + "grad_norm": 0.22994392159514718, + "learning_rate": 7.927667979367874e-05, + "loss": 0.9309, + "step": 18134 + }, + { + "epoch": 3.868387372013652, + "grad_norm": 0.337985249947552, + "learning_rate": 7.926817038019289e-05, + "loss": 0.9517, + "step": 18135 + }, + { + "epoch": 3.868600682593857, + "grad_norm": 0.3764718085946127, + "learning_rate": 7.925966097498768e-05, + "loss": 0.9041, + "step": 18136 + }, + { + "epoch": 3.8688139931740615, + "grad_norm": 0.35408480148228916, + "learning_rate": 7.925115157815945e-05, + "loss": 0.9531, + "step": 18137 + }, + { + "epoch": 3.869027303754266, + "grad_norm": 0.3474620955393877, + "learning_rate": 7.924264218980444e-05, + "loss": 0.8892, + "step": 18138 + }, + { + "epoch": 3.869240614334471, + "grad_norm": 0.22395193104664105, + "learning_rate": 7.923413281001894e-05, + "loss": 0.9289, + "step": 18139 + }, + { + "epoch": 3.8694539249146755, + "grad_norm": 0.23357781030078648, + "learning_rate": 7.922562343889923e-05, + "loss": 0.909, + "step": 18140 + }, + { + "epoch": 3.8696672354948807, + "grad_norm": 0.3036764772715418, + "learning_rate": 7.921711407654163e-05, + "loss": 0.9544, + "step": 18141 + }, + { + "epoch": 3.8698805460750854, + "grad_norm": 0.33524169310635626, + "learning_rate": 7.920860472304237e-05, + "loss": 0.9216, + "step": 18142 + }, + { + "epoch": 3.87009385665529, + "grad_norm": 0.328569385380405, + "learning_rate": 7.920009537849778e-05, + "loss": 0.9198, + "step": 18143 + }, + { + "epoch": 3.8703071672354947, + "grad_norm": 0.2604229732475075, + "learning_rate": 7.919158604300413e-05, + "loss": 0.9164, + "step": 18144 + }, + { + "epoch": 3.8705204778157, + "grad_norm": 0.19120411023172068, + "learning_rate": 7.918307671665772e-05, + "loss": 0.9318, + "step": 18145 + }, + { + "epoch": 3.8707337883959045, + "grad_norm": 0.2575798773813322, + "learning_rate": 7.917456739955476e-05, + "loss": 0.9406, + "step": 18146 + }, + { + "epoch": 3.8709470989761092, + "grad_norm": 0.34522130971389836, + "learning_rate": 7.916605809179162e-05, + "loss": 0.9187, + "step": 18147 + }, + { + "epoch": 3.871160409556314, + "grad_norm": 0.26802348062382936, + "learning_rate": 7.915754879346452e-05, + "loss": 0.9209, + "step": 18148 + }, + { + "epoch": 3.8713737201365186, + "grad_norm": 0.2530366567786691, + "learning_rate": 7.914903950466978e-05, + "loss": 0.9109, + "step": 18149 + }, + { + "epoch": 3.8715870307167233, + "grad_norm": 0.20829612107930817, + "learning_rate": 7.914053022550367e-05, + "loss": 0.9021, + "step": 18150 + }, + { + "epoch": 3.8718003412969284, + "grad_norm": 0.21371380226856865, + "learning_rate": 7.913202095606247e-05, + "loss": 0.9276, + "step": 18151 + }, + { + "epoch": 3.872013651877133, + "grad_norm": 0.2655183788462951, + "learning_rate": 7.912351169644247e-05, + "loss": 0.9077, + "step": 18152 + }, + { + "epoch": 3.8722269624573378, + "grad_norm": 0.2765432752321806, + "learning_rate": 7.911500244673997e-05, + "loss": 0.9178, + "step": 18153 + }, + { + "epoch": 3.872440273037543, + "grad_norm": 0.236460295540294, + "learning_rate": 7.91064932070512e-05, + "loss": 0.9286, + "step": 18154 + }, + { + "epoch": 3.8726535836177476, + "grad_norm": 0.21882991325101714, + "learning_rate": 7.909798397747249e-05, + "loss": 0.9248, + "step": 18155 + }, + { + "epoch": 3.8728668941979523, + "grad_norm": 0.2615660223634716, + "learning_rate": 7.908947475810014e-05, + "loss": 0.9193, + "step": 18156 + }, + { + "epoch": 3.873080204778157, + "grad_norm": 0.31318029037939976, + "learning_rate": 7.908096554903037e-05, + "loss": 0.9023, + "step": 18157 + }, + { + "epoch": 3.8732935153583616, + "grad_norm": 0.291567708908622, + "learning_rate": 7.907245635035948e-05, + "loss": 0.9473, + "step": 18158 + }, + { + "epoch": 3.8735068259385663, + "grad_norm": 0.27076903161164945, + "learning_rate": 7.906394716218378e-05, + "loss": 0.941, + "step": 18159 + }, + { + "epoch": 3.8737201365187715, + "grad_norm": 0.1952341305149494, + "learning_rate": 7.90554379845995e-05, + "loss": 0.9079, + "step": 18160 + }, + { + "epoch": 3.873933447098976, + "grad_norm": 0.22433175386152082, + "learning_rate": 7.904692881770298e-05, + "loss": 0.9246, + "step": 18161 + }, + { + "epoch": 3.874146757679181, + "grad_norm": 0.22472327033121187, + "learning_rate": 7.903841966159047e-05, + "loss": 0.9007, + "step": 18162 + }, + { + "epoch": 3.8743600682593855, + "grad_norm": 0.18047564432012717, + "learning_rate": 7.902991051635827e-05, + "loss": 0.8954, + "step": 18163 + }, + { + "epoch": 3.8745733788395906, + "grad_norm": 0.16424195487355134, + "learning_rate": 7.902140138210263e-05, + "loss": 0.9039, + "step": 18164 + }, + { + "epoch": 3.8747866894197953, + "grad_norm": 0.18070709157703482, + "learning_rate": 7.901289225891986e-05, + "loss": 0.8996, + "step": 18165 + }, + { + "epoch": 3.875, + "grad_norm": 0.19566731374083607, + "learning_rate": 7.900438314690622e-05, + "loss": 0.926, + "step": 18166 + }, + { + "epoch": 3.8752133105802047, + "grad_norm": 0.20653567125608976, + "learning_rate": 7.899587404615805e-05, + "loss": 0.868, + "step": 18167 + }, + { + "epoch": 3.8754266211604094, + "grad_norm": 0.20384483969615272, + "learning_rate": 7.898736495677153e-05, + "loss": 0.9201, + "step": 18168 + }, + { + "epoch": 3.8756399317406145, + "grad_norm": 0.1659563142096319, + "learning_rate": 7.897885587884302e-05, + "loss": 0.9176, + "step": 18169 + }, + { + "epoch": 3.875853242320819, + "grad_norm": 0.22177309586678243, + "learning_rate": 7.897034681246874e-05, + "loss": 0.9463, + "step": 18170 + }, + { + "epoch": 3.876066552901024, + "grad_norm": 0.23902241973558155, + "learning_rate": 7.896183775774503e-05, + "loss": 0.9069, + "step": 18171 + }, + { + "epoch": 3.8762798634812285, + "grad_norm": 0.20014430245339088, + "learning_rate": 7.895332871476813e-05, + "loss": 0.9145, + "step": 18172 + }, + { + "epoch": 3.8764931740614337, + "grad_norm": 0.19926476700977147, + "learning_rate": 7.894481968363434e-05, + "loss": 0.9089, + "step": 18173 + }, + { + "epoch": 3.8767064846416384, + "grad_norm": 0.16327098032659, + "learning_rate": 7.893631066443992e-05, + "loss": 0.881, + "step": 18174 + }, + { + "epoch": 3.876919795221843, + "grad_norm": 0.19766544527524038, + "learning_rate": 7.892780165728119e-05, + "loss": 0.9291, + "step": 18175 + }, + { + "epoch": 3.8771331058020477, + "grad_norm": 0.17192776151764905, + "learning_rate": 7.891929266225437e-05, + "loss": 0.8988, + "step": 18176 + }, + { + "epoch": 3.8773464163822524, + "grad_norm": 0.1755598852201228, + "learning_rate": 7.89107836794558e-05, + "loss": 0.9317, + "step": 18177 + }, + { + "epoch": 3.877559726962457, + "grad_norm": 0.20086486788292351, + "learning_rate": 7.890227470898174e-05, + "loss": 0.9327, + "step": 18178 + }, + { + "epoch": 3.8777730375426622, + "grad_norm": 0.3304514134656197, + "learning_rate": 7.889376575092845e-05, + "loss": 0.9399, + "step": 18179 + }, + { + "epoch": 3.877986348122867, + "grad_norm": 0.18023050530015386, + "learning_rate": 7.888525680539219e-05, + "loss": 0.9305, + "step": 18180 + }, + { + "epoch": 3.8781996587030716, + "grad_norm": 0.1969226369559769, + "learning_rate": 7.88767478724693e-05, + "loss": 0.913, + "step": 18181 + }, + { + "epoch": 3.8784129692832767, + "grad_norm": 0.20626921443831084, + "learning_rate": 7.8868238952256e-05, + "loss": 0.8927, + "step": 18182 + }, + { + "epoch": 3.8786262798634814, + "grad_norm": 0.21304819532862307, + "learning_rate": 7.885973004484861e-05, + "loss": 0.9173, + "step": 18183 + }, + { + "epoch": 3.878839590443686, + "grad_norm": 0.20049215809520568, + "learning_rate": 7.88512211503434e-05, + "loss": 0.924, + "step": 18184 + }, + { + "epoch": 3.8790529010238908, + "grad_norm": 0.17800330075182563, + "learning_rate": 7.884271226883663e-05, + "loss": 0.9324, + "step": 18185 + }, + { + "epoch": 3.8792662116040955, + "grad_norm": 0.21240783028540366, + "learning_rate": 7.883420340042458e-05, + "loss": 0.948, + "step": 18186 + }, + { + "epoch": 3.8794795221843, + "grad_norm": 0.2588292110139375, + "learning_rate": 7.882569454520357e-05, + "loss": 0.9055, + "step": 18187 + }, + { + "epoch": 3.8796928327645053, + "grad_norm": 0.2362126077360876, + "learning_rate": 7.88171857032698e-05, + "loss": 0.9076, + "step": 18188 + }, + { + "epoch": 3.87990614334471, + "grad_norm": 0.179375479225382, + "learning_rate": 7.880867687471965e-05, + "loss": 0.9091, + "step": 18189 + }, + { + "epoch": 3.8801194539249146, + "grad_norm": 0.1862843414576542, + "learning_rate": 7.880016805964933e-05, + "loss": 0.9356, + "step": 18190 + }, + { + "epoch": 3.8803327645051193, + "grad_norm": 0.19746484599953074, + "learning_rate": 7.879165925815512e-05, + "loss": 0.9108, + "step": 18191 + }, + { + "epoch": 3.8805460750853245, + "grad_norm": 0.22342575404963325, + "learning_rate": 7.878315047033329e-05, + "loss": 0.9194, + "step": 18192 + }, + { + "epoch": 3.880759385665529, + "grad_norm": 0.21069493098094488, + "learning_rate": 7.877464169628014e-05, + "loss": 0.9025, + "step": 18193 + }, + { + "epoch": 3.880972696245734, + "grad_norm": 0.17033232363897174, + "learning_rate": 7.876613293609193e-05, + "loss": 0.9251, + "step": 18194 + }, + { + "epoch": 3.8811860068259385, + "grad_norm": 0.21340804202846278, + "learning_rate": 7.875762418986496e-05, + "loss": 0.9084, + "step": 18195 + }, + { + "epoch": 3.881399317406143, + "grad_norm": 0.30922189876597855, + "learning_rate": 7.874911545769549e-05, + "loss": 0.908, + "step": 18196 + }, + { + "epoch": 3.881612627986348, + "grad_norm": 0.3704129056123164, + "learning_rate": 7.87406067396798e-05, + "loss": 0.907, + "step": 18197 + }, + { + "epoch": 3.881825938566553, + "grad_norm": 0.22225945679152084, + "learning_rate": 7.873209803591415e-05, + "loss": 0.9226, + "step": 18198 + }, + { + "epoch": 3.8820392491467577, + "grad_norm": 0.19839506874779203, + "learning_rate": 7.872358934649486e-05, + "loss": 0.9125, + "step": 18199 + }, + { + "epoch": 3.8822525597269624, + "grad_norm": 0.22864405514734099, + "learning_rate": 7.871508067151815e-05, + "loss": 0.9168, + "step": 18200 + }, + { + "epoch": 3.8824658703071675, + "grad_norm": 0.3198812285251376, + "learning_rate": 7.870657201108038e-05, + "loss": 0.9599, + "step": 18201 + }, + { + "epoch": 3.882679180887372, + "grad_norm": 0.3352152361931455, + "learning_rate": 7.869806336527773e-05, + "loss": 0.8845, + "step": 18202 + }, + { + "epoch": 3.882892491467577, + "grad_norm": 0.2602343242639918, + "learning_rate": 7.868955473420652e-05, + "loss": 0.9332, + "step": 18203 + }, + { + "epoch": 3.8831058020477816, + "grad_norm": 0.17944407399712928, + "learning_rate": 7.8681046117963e-05, + "loss": 0.91, + "step": 18204 + }, + { + "epoch": 3.8833191126279862, + "grad_norm": 0.28552850169051797, + "learning_rate": 7.86725375166435e-05, + "loss": 0.9184, + "step": 18205 + }, + { + "epoch": 3.883532423208191, + "grad_norm": 0.33273554420033447, + "learning_rate": 7.866402893034422e-05, + "loss": 0.8938, + "step": 18206 + }, + { + "epoch": 3.883745733788396, + "grad_norm": 0.26096789568583934, + "learning_rate": 7.865552035916149e-05, + "loss": 0.9058, + "step": 18207 + }, + { + "epoch": 3.8839590443686007, + "grad_norm": 0.2216968821769515, + "learning_rate": 7.864701180319158e-05, + "loss": 0.9158, + "step": 18208 + }, + { + "epoch": 3.8841723549488054, + "grad_norm": 0.22881048177084728, + "learning_rate": 7.863850326253076e-05, + "loss": 0.9425, + "step": 18209 + }, + { + "epoch": 3.88438566552901, + "grad_norm": 0.26823104033311085, + "learning_rate": 7.862999473727527e-05, + "loss": 0.9251, + "step": 18210 + }, + { + "epoch": 3.8845989761092152, + "grad_norm": 0.27081830668706175, + "learning_rate": 7.862148622752144e-05, + "loss": 0.9088, + "step": 18211 + }, + { + "epoch": 3.88481228668942, + "grad_norm": 0.2752132263860447, + "learning_rate": 7.861297773336553e-05, + "loss": 0.8959, + "step": 18212 + }, + { + "epoch": 3.8850255972696246, + "grad_norm": 0.1960925904785296, + "learning_rate": 7.860446925490376e-05, + "loss": 0.8983, + "step": 18213 + }, + { + "epoch": 3.8852389078498293, + "grad_norm": 0.2361685442176977, + "learning_rate": 7.859596079223247e-05, + "loss": 0.9114, + "step": 18214 + }, + { + "epoch": 3.885452218430034, + "grad_norm": 0.23265154699698007, + "learning_rate": 7.858745234544791e-05, + "loss": 0.9203, + "step": 18215 + }, + { + "epoch": 3.8856655290102387, + "grad_norm": 0.22347124798379026, + "learning_rate": 7.857894391464632e-05, + "loss": 0.9169, + "step": 18216 + }, + { + "epoch": 3.885878839590444, + "grad_norm": 0.25565878934354647, + "learning_rate": 7.857043549992403e-05, + "loss": 0.8981, + "step": 18217 + }, + { + "epoch": 3.8860921501706485, + "grad_norm": 0.2329033837413968, + "learning_rate": 7.856192710137727e-05, + "loss": 0.895, + "step": 18218 + }, + { + "epoch": 3.886305460750853, + "grad_norm": 0.14588880547739766, + "learning_rate": 7.855341871910235e-05, + "loss": 0.8745, + "step": 18219 + }, + { + "epoch": 3.8865187713310583, + "grad_norm": 0.18318521653519776, + "learning_rate": 7.85449103531955e-05, + "loss": 0.9192, + "step": 18220 + }, + { + "epoch": 3.886732081911263, + "grad_norm": 0.17000439033329254, + "learning_rate": 7.853640200375302e-05, + "loss": 0.9087, + "step": 18221 + }, + { + "epoch": 3.8869453924914676, + "grad_norm": 0.177021232404973, + "learning_rate": 7.852789367087118e-05, + "loss": 0.927, + "step": 18222 + }, + { + "epoch": 3.8871587030716723, + "grad_norm": 0.17860141587076528, + "learning_rate": 7.851938535464626e-05, + "loss": 0.9061, + "step": 18223 + }, + { + "epoch": 3.887372013651877, + "grad_norm": 0.16712171966869174, + "learning_rate": 7.85108770551745e-05, + "loss": 0.8745, + "step": 18224 + }, + { + "epoch": 3.8875853242320817, + "grad_norm": 0.18152441045870404, + "learning_rate": 7.85023687725522e-05, + "loss": 0.921, + "step": 18225 + }, + { + "epoch": 3.887798634812287, + "grad_norm": 0.17112492075782368, + "learning_rate": 7.849386050687562e-05, + "loss": 0.9361, + "step": 18226 + }, + { + "epoch": 3.8880119453924915, + "grad_norm": 0.17886764038830108, + "learning_rate": 7.848535225824105e-05, + "loss": 0.9263, + "step": 18227 + }, + { + "epoch": 3.888225255972696, + "grad_norm": 0.19151196196815268, + "learning_rate": 7.847684402674472e-05, + "loss": 0.9132, + "step": 18228 + }, + { + "epoch": 3.888438566552901, + "grad_norm": 0.17263011360906563, + "learning_rate": 7.846833581248293e-05, + "loss": 0.9092, + "step": 18229 + }, + { + "epoch": 3.888651877133106, + "grad_norm": 0.19215406628185577, + "learning_rate": 7.845982761555195e-05, + "loss": 0.924, + "step": 18230 + }, + { + "epoch": 3.8888651877133107, + "grad_norm": 0.21233963986045326, + "learning_rate": 7.845131943604805e-05, + "loss": 0.8862, + "step": 18231 + }, + { + "epoch": 3.8890784982935154, + "grad_norm": 0.21342729149227416, + "learning_rate": 7.844281127406748e-05, + "loss": 0.9072, + "step": 18232 + }, + { + "epoch": 3.88929180887372, + "grad_norm": 0.17587338760086596, + "learning_rate": 7.843430312970655e-05, + "loss": 0.9301, + "step": 18233 + }, + { + "epoch": 3.8895051194539247, + "grad_norm": 0.17884221281615548, + "learning_rate": 7.842579500306152e-05, + "loss": 0.9223, + "step": 18234 + }, + { + "epoch": 3.8897184300341294, + "grad_norm": 0.22761496644029203, + "learning_rate": 7.841728689422861e-05, + "loss": 0.8769, + "step": 18235 + }, + { + "epoch": 3.8899317406143346, + "grad_norm": 0.1978776675121589, + "learning_rate": 7.840877880330414e-05, + "loss": 0.9325, + "step": 18236 + }, + { + "epoch": 3.8901450511945392, + "grad_norm": 0.16425695707731994, + "learning_rate": 7.840027073038437e-05, + "loss": 0.9394, + "step": 18237 + }, + { + "epoch": 3.890358361774744, + "grad_norm": 0.20689443874308866, + "learning_rate": 7.839176267556554e-05, + "loss": 0.8957, + "step": 18238 + }, + { + "epoch": 3.890571672354949, + "grad_norm": 0.17614980409830278, + "learning_rate": 7.838325463894398e-05, + "loss": 0.9007, + "step": 18239 + }, + { + "epoch": 3.8907849829351537, + "grad_norm": 0.2262728577285916, + "learning_rate": 7.837474662061588e-05, + "loss": 0.9143, + "step": 18240 + }, + { + "epoch": 3.8909982935153584, + "grad_norm": 0.25970691524245965, + "learning_rate": 7.836623862067758e-05, + "loss": 0.8644, + "step": 18241 + }, + { + "epoch": 3.891211604095563, + "grad_norm": 0.2068946879532755, + "learning_rate": 7.835773063922529e-05, + "loss": 0.9187, + "step": 18242 + }, + { + "epoch": 3.891424914675768, + "grad_norm": 0.26394047521278535, + "learning_rate": 7.834922267635533e-05, + "loss": 0.9345, + "step": 18243 + }, + { + "epoch": 3.8916382252559725, + "grad_norm": 0.1954869523532516, + "learning_rate": 7.834071473216391e-05, + "loss": 0.9125, + "step": 18244 + }, + { + "epoch": 3.8918515358361776, + "grad_norm": 0.2751651997980627, + "learning_rate": 7.83322068067474e-05, + "loss": 0.9123, + "step": 18245 + }, + { + "epoch": 3.8920648464163823, + "grad_norm": 0.2619392900059039, + "learning_rate": 7.832369890020196e-05, + "loss": 0.9116, + "step": 18246 + }, + { + "epoch": 3.892278156996587, + "grad_norm": 0.19693855115704664, + "learning_rate": 7.831519101262386e-05, + "loss": 0.9071, + "step": 18247 + }, + { + "epoch": 3.8924914675767917, + "grad_norm": 0.24315693238820443, + "learning_rate": 7.830668314410944e-05, + "loss": 0.9068, + "step": 18248 + }, + { + "epoch": 3.892704778156997, + "grad_norm": 0.23451053609808145, + "learning_rate": 7.829817529475492e-05, + "loss": 0.9124, + "step": 18249 + }, + { + "epoch": 3.8929180887372015, + "grad_norm": 0.3101452487825439, + "learning_rate": 7.828966746465658e-05, + "loss": 0.8963, + "step": 18250 + }, + { + "epoch": 3.893131399317406, + "grad_norm": 0.27367163774888115, + "learning_rate": 7.828115965391068e-05, + "loss": 0.9154, + "step": 18251 + }, + { + "epoch": 3.893344709897611, + "grad_norm": 0.2535600288066789, + "learning_rate": 7.827265186261347e-05, + "loss": 0.8991, + "step": 18252 + }, + { + "epoch": 3.8935580204778155, + "grad_norm": 0.2433898817017865, + "learning_rate": 7.826414409086125e-05, + "loss": 0.9321, + "step": 18253 + }, + { + "epoch": 3.89377133105802, + "grad_norm": 0.19156822170686325, + "learning_rate": 7.825563633875026e-05, + "loss": 0.8925, + "step": 18254 + }, + { + "epoch": 3.8939846416382253, + "grad_norm": 0.2881546436409184, + "learning_rate": 7.824712860637678e-05, + "loss": 0.9139, + "step": 18255 + }, + { + "epoch": 3.89419795221843, + "grad_norm": 0.33932186547114845, + "learning_rate": 7.82386208938371e-05, + "loss": 0.9185, + "step": 18256 + }, + { + "epoch": 3.8944112627986347, + "grad_norm": 0.21933735852988218, + "learning_rate": 7.82301132012274e-05, + "loss": 0.9451, + "step": 18257 + }, + { + "epoch": 3.89462457337884, + "grad_norm": 0.19969866829526892, + "learning_rate": 7.822160552864404e-05, + "loss": 0.9013, + "step": 18258 + }, + { + "epoch": 3.8948378839590445, + "grad_norm": 0.24566463433372415, + "learning_rate": 7.821309787618321e-05, + "loss": 0.9271, + "step": 18259 + }, + { + "epoch": 3.895051194539249, + "grad_norm": 0.3282106175385877, + "learning_rate": 7.820459024394123e-05, + "loss": 0.9311, + "step": 18260 + }, + { + "epoch": 3.895264505119454, + "grad_norm": 0.3809440344696742, + "learning_rate": 7.819608263201434e-05, + "loss": 0.8925, + "step": 18261 + }, + { + "epoch": 3.8954778156996586, + "grad_norm": 0.30279207714843426, + "learning_rate": 7.818757504049878e-05, + "loss": 0.8786, + "step": 18262 + }, + { + "epoch": 3.8956911262798632, + "grad_norm": 0.22871051516217816, + "learning_rate": 7.817906746949088e-05, + "loss": 0.901, + "step": 18263 + }, + { + "epoch": 3.8959044368600684, + "grad_norm": 0.282811893430074, + "learning_rate": 7.817055991908682e-05, + "loss": 0.8946, + "step": 18264 + }, + { + "epoch": 3.896117747440273, + "grad_norm": 0.4064749470750576, + "learning_rate": 7.816205238938294e-05, + "loss": 0.8906, + "step": 18265 + }, + { + "epoch": 3.8963310580204777, + "grad_norm": 0.4433789971489793, + "learning_rate": 7.815354488047545e-05, + "loss": 0.8823, + "step": 18266 + }, + { + "epoch": 3.8965443686006824, + "grad_norm": 0.33778333860387744, + "learning_rate": 7.814503739246063e-05, + "loss": 0.9012, + "step": 18267 + }, + { + "epoch": 3.8967576791808876, + "grad_norm": 0.22972210780550195, + "learning_rate": 7.813652992543478e-05, + "loss": 0.9387, + "step": 18268 + }, + { + "epoch": 3.8969709897610922, + "grad_norm": 0.2980850461071916, + "learning_rate": 7.812802247949408e-05, + "loss": 0.9003, + "step": 18269 + }, + { + "epoch": 3.897184300341297, + "grad_norm": 0.35462878697851613, + "learning_rate": 7.811951505473485e-05, + "loss": 0.9242, + "step": 18270 + }, + { + "epoch": 3.8973976109215016, + "grad_norm": 0.24891133358519682, + "learning_rate": 7.811100765125333e-05, + "loss": 0.9325, + "step": 18271 + }, + { + "epoch": 3.8976109215017063, + "grad_norm": 0.23212199568435057, + "learning_rate": 7.810250026914582e-05, + "loss": 0.952, + "step": 18272 + }, + { + "epoch": 3.8978242320819114, + "grad_norm": 0.291106038079641, + "learning_rate": 7.809399290850853e-05, + "loss": 0.9107, + "step": 18273 + }, + { + "epoch": 3.898037542662116, + "grad_norm": 0.26647288178706346, + "learning_rate": 7.808548556943772e-05, + "loss": 0.9003, + "step": 18274 + }, + { + "epoch": 3.898250853242321, + "grad_norm": 0.2720943129670257, + "learning_rate": 7.80769782520297e-05, + "loss": 0.9092, + "step": 18275 + }, + { + "epoch": 3.8984641638225255, + "grad_norm": 0.215659076595819, + "learning_rate": 7.80684709563807e-05, + "loss": 0.9132, + "step": 18276 + }, + { + "epoch": 3.8986774744027306, + "grad_norm": 0.22362129594459615, + "learning_rate": 7.8059963682587e-05, + "loss": 0.9231, + "step": 18277 + }, + { + "epoch": 3.8988907849829353, + "grad_norm": 0.25271636385069834, + "learning_rate": 7.80514564307448e-05, + "loss": 0.927, + "step": 18278 + }, + { + "epoch": 3.89910409556314, + "grad_norm": 0.19939283517656162, + "learning_rate": 7.804294920095046e-05, + "loss": 0.9366, + "step": 18279 + }, + { + "epoch": 3.8993174061433447, + "grad_norm": 0.164878857781755, + "learning_rate": 7.803444199330016e-05, + "loss": 0.9122, + "step": 18280 + }, + { + "epoch": 3.8995307167235493, + "grad_norm": 0.18601213455114055, + "learning_rate": 7.802593480789017e-05, + "loss": 0.8915, + "step": 18281 + }, + { + "epoch": 3.899744027303754, + "grad_norm": 0.1824799760545635, + "learning_rate": 7.801742764481677e-05, + "loss": 0.9207, + "step": 18282 + }, + { + "epoch": 3.899957337883959, + "grad_norm": 0.17045294713537337, + "learning_rate": 7.800892050417618e-05, + "loss": 0.9126, + "step": 18283 + }, + { + "epoch": 3.900170648464164, + "grad_norm": 0.17375202662198716, + "learning_rate": 7.800041338606473e-05, + "loss": 0.9018, + "step": 18284 + }, + { + "epoch": 3.9003839590443685, + "grad_norm": 0.20162499407703546, + "learning_rate": 7.799190629057863e-05, + "loss": 0.8979, + "step": 18285 + }, + { + "epoch": 3.9005972696245736, + "grad_norm": 0.2525777741832287, + "learning_rate": 7.798339921781411e-05, + "loss": 0.9203, + "step": 18286 + }, + { + "epoch": 3.9008105802047783, + "grad_norm": 0.1878383522208501, + "learning_rate": 7.797489216786749e-05, + "loss": 0.9195, + "step": 18287 + }, + { + "epoch": 3.901023890784983, + "grad_norm": 0.1842382517086575, + "learning_rate": 7.796638514083499e-05, + "loss": 0.9425, + "step": 18288 + }, + { + "epoch": 3.9012372013651877, + "grad_norm": 0.24638928392368617, + "learning_rate": 7.795787813681288e-05, + "loss": 0.9377, + "step": 18289 + }, + { + "epoch": 3.9014505119453924, + "grad_norm": 0.2053969539867817, + "learning_rate": 7.794937115589745e-05, + "loss": 0.8906, + "step": 18290 + }, + { + "epoch": 3.901663822525597, + "grad_norm": 0.1781790482354573, + "learning_rate": 7.794086419818486e-05, + "loss": 0.9265, + "step": 18291 + }, + { + "epoch": 3.901877133105802, + "grad_norm": 0.1693924631417778, + "learning_rate": 7.793235726377146e-05, + "loss": 0.914, + "step": 18292 + }, + { + "epoch": 3.902090443686007, + "grad_norm": 0.21919891022728558, + "learning_rate": 7.792385035275346e-05, + "loss": 0.897, + "step": 18293 + }, + { + "epoch": 3.9023037542662116, + "grad_norm": 0.24019370741532675, + "learning_rate": 7.791534346522714e-05, + "loss": 0.9444, + "step": 18294 + }, + { + "epoch": 3.9025170648464163, + "grad_norm": 0.22656292775627473, + "learning_rate": 7.790683660128872e-05, + "loss": 0.9492, + "step": 18295 + }, + { + "epoch": 3.9027303754266214, + "grad_norm": 0.2084057134939029, + "learning_rate": 7.78983297610345e-05, + "loss": 0.9012, + "step": 18296 + }, + { + "epoch": 3.902943686006826, + "grad_norm": 0.19629979862548155, + "learning_rate": 7.788982294456071e-05, + "loss": 0.9106, + "step": 18297 + }, + { + "epoch": 3.9031569965870307, + "grad_norm": 0.18443480641016316, + "learning_rate": 7.78813161519636e-05, + "loss": 0.9293, + "step": 18298 + }, + { + "epoch": 3.9033703071672354, + "grad_norm": 0.23344122097475134, + "learning_rate": 7.787280938333945e-05, + "loss": 0.9223, + "step": 18299 + }, + { + "epoch": 3.90358361774744, + "grad_norm": 0.2136979520427049, + "learning_rate": 7.786430263878447e-05, + "loss": 0.9141, + "step": 18300 + }, + { + "epoch": 3.903796928327645, + "grad_norm": 0.1732234412291611, + "learning_rate": 7.785579591839499e-05, + "loss": 0.9069, + "step": 18301 + }, + { + "epoch": 3.90401023890785, + "grad_norm": 0.255999307737127, + "learning_rate": 7.784728922226718e-05, + "loss": 0.9385, + "step": 18302 + }, + { + "epoch": 3.9042235494880546, + "grad_norm": 0.2045093817982117, + "learning_rate": 7.783878255049733e-05, + "loss": 0.9122, + "step": 18303 + }, + { + "epoch": 3.9044368600682593, + "grad_norm": 0.17106651190314065, + "learning_rate": 7.78302759031817e-05, + "loss": 0.9282, + "step": 18304 + }, + { + "epoch": 3.9046501706484644, + "grad_norm": 0.19846827078933832, + "learning_rate": 7.782176928041651e-05, + "loss": 0.8947, + "step": 18305 + }, + { + "epoch": 3.904863481228669, + "grad_norm": 0.1989368276880227, + "learning_rate": 7.781326268229807e-05, + "loss": 0.8776, + "step": 18306 + }, + { + "epoch": 3.905076791808874, + "grad_norm": 0.1656626202879836, + "learning_rate": 7.780475610892257e-05, + "loss": 0.9053, + "step": 18307 + }, + { + "epoch": 3.9052901023890785, + "grad_norm": 0.20870005938773134, + "learning_rate": 7.779624956038631e-05, + "loss": 0.9023, + "step": 18308 + }, + { + "epoch": 3.905503412969283, + "grad_norm": 0.20891609335365852, + "learning_rate": 7.778774303678553e-05, + "loss": 0.9181, + "step": 18309 + }, + { + "epoch": 3.905716723549488, + "grad_norm": 0.22939603848485213, + "learning_rate": 7.777923653821645e-05, + "loss": 0.9142, + "step": 18310 + }, + { + "epoch": 3.905930034129693, + "grad_norm": 0.18963139634580767, + "learning_rate": 7.777073006477537e-05, + "loss": 0.9098, + "step": 18311 + }, + { + "epoch": 3.9061433447098977, + "grad_norm": 0.22371744120573844, + "learning_rate": 7.776222361655854e-05, + "loss": 0.9153, + "step": 18312 + }, + { + "epoch": 3.9063566552901023, + "grad_norm": 0.24977087462545286, + "learning_rate": 7.775371719366214e-05, + "loss": 0.9055, + "step": 18313 + }, + { + "epoch": 3.906569965870307, + "grad_norm": 0.2829312277109494, + "learning_rate": 7.774521079618249e-05, + "loss": 0.9285, + "step": 18314 + }, + { + "epoch": 3.906783276450512, + "grad_norm": 0.2187957151655399, + "learning_rate": 7.77367044242158e-05, + "loss": 0.9189, + "step": 18315 + }, + { + "epoch": 3.906996587030717, + "grad_norm": 0.20114903175356796, + "learning_rate": 7.772819807785835e-05, + "loss": 0.9281, + "step": 18316 + }, + { + "epoch": 3.9072098976109215, + "grad_norm": 0.20802862791620388, + "learning_rate": 7.771969175720637e-05, + "loss": 0.9178, + "step": 18317 + }, + { + "epoch": 3.907423208191126, + "grad_norm": 0.2432753292890643, + "learning_rate": 7.771118546235613e-05, + "loss": 0.9272, + "step": 18318 + }, + { + "epoch": 3.907636518771331, + "grad_norm": 0.26277996317035845, + "learning_rate": 7.770267919340384e-05, + "loss": 0.9251, + "step": 18319 + }, + { + "epoch": 3.9078498293515356, + "grad_norm": 0.23089059996331238, + "learning_rate": 7.76941729504458e-05, + "loss": 0.8957, + "step": 18320 + }, + { + "epoch": 3.9080631399317407, + "grad_norm": 0.18204169374764662, + "learning_rate": 7.768566673357823e-05, + "loss": 0.8972, + "step": 18321 + }, + { + "epoch": 3.9082764505119454, + "grad_norm": 0.20123775315744788, + "learning_rate": 7.767716054289736e-05, + "loss": 0.9546, + "step": 18322 + }, + { + "epoch": 3.90848976109215, + "grad_norm": 0.2483433244169504, + "learning_rate": 7.766865437849948e-05, + "loss": 0.9203, + "step": 18323 + }, + { + "epoch": 3.908703071672355, + "grad_norm": 0.1892221892096288, + "learning_rate": 7.766014824048083e-05, + "loss": 0.9053, + "step": 18324 + }, + { + "epoch": 3.90891638225256, + "grad_norm": 0.21845852218981301, + "learning_rate": 7.765164212893761e-05, + "loss": 0.913, + "step": 18325 + }, + { + "epoch": 3.9091296928327646, + "grad_norm": 0.1885525935907355, + "learning_rate": 7.76431360439661e-05, + "loss": 0.9116, + "step": 18326 + }, + { + "epoch": 3.9093430034129693, + "grad_norm": 0.18442589349740646, + "learning_rate": 7.763462998566255e-05, + "loss": 0.9216, + "step": 18327 + }, + { + "epoch": 3.909556313993174, + "grad_norm": 0.22767226034431676, + "learning_rate": 7.76261239541232e-05, + "loss": 0.9011, + "step": 18328 + }, + { + "epoch": 3.9097696245733786, + "grad_norm": 0.22744556105430624, + "learning_rate": 7.761761794944429e-05, + "loss": 0.9026, + "step": 18329 + }, + { + "epoch": 3.9099829351535837, + "grad_norm": 0.1618111522729807, + "learning_rate": 7.760911197172209e-05, + "loss": 0.9197, + "step": 18330 + }, + { + "epoch": 3.9101962457337884, + "grad_norm": 0.2025458693620623, + "learning_rate": 7.760060602105281e-05, + "loss": 0.8857, + "step": 18331 + }, + { + "epoch": 3.910409556313993, + "grad_norm": 0.2136303302278779, + "learning_rate": 7.759210009753273e-05, + "loss": 0.9256, + "step": 18332 + }, + { + "epoch": 3.910622866894198, + "grad_norm": 0.1980578948842617, + "learning_rate": 7.758359420125808e-05, + "loss": 0.9226, + "step": 18333 + }, + { + "epoch": 3.910836177474403, + "grad_norm": 0.1876085295329448, + "learning_rate": 7.757508833232509e-05, + "loss": 0.9401, + "step": 18334 + }, + { + "epoch": 3.9110494880546076, + "grad_norm": 0.20848404620784178, + "learning_rate": 7.756658249083003e-05, + "loss": 0.9404, + "step": 18335 + }, + { + "epoch": 3.9112627986348123, + "grad_norm": 0.24674107914446208, + "learning_rate": 7.755807667686914e-05, + "loss": 0.8804, + "step": 18336 + }, + { + "epoch": 3.911476109215017, + "grad_norm": 0.22375550920679585, + "learning_rate": 7.754957089053862e-05, + "loss": 0.9092, + "step": 18337 + }, + { + "epoch": 3.9116894197952217, + "grad_norm": 0.17816703822296176, + "learning_rate": 7.754106513193477e-05, + "loss": 0.8719, + "step": 18338 + }, + { + "epoch": 3.9119027303754264, + "grad_norm": 0.23653445057769532, + "learning_rate": 7.753255940115378e-05, + "loss": 0.9452, + "step": 18339 + }, + { + "epoch": 3.9121160409556315, + "grad_norm": 0.2164735557435997, + "learning_rate": 7.752405369829195e-05, + "loss": 0.9014, + "step": 18340 + }, + { + "epoch": 3.912329351535836, + "grad_norm": 0.19804858743764708, + "learning_rate": 7.751554802344548e-05, + "loss": 0.889, + "step": 18341 + }, + { + "epoch": 3.912542662116041, + "grad_norm": 0.19999049159176488, + "learning_rate": 7.750704237671064e-05, + "loss": 0.9236, + "step": 18342 + }, + { + "epoch": 3.912755972696246, + "grad_norm": 0.20758245401170897, + "learning_rate": 7.749853675818363e-05, + "loss": 0.901, + "step": 18343 + }, + { + "epoch": 3.9129692832764507, + "grad_norm": 0.1743167272821454, + "learning_rate": 7.749003116796075e-05, + "loss": 0.9107, + "step": 18344 + }, + { + "epoch": 3.9131825938566553, + "grad_norm": 0.19545188800029512, + "learning_rate": 7.74815256061382e-05, + "loss": 0.8986, + "step": 18345 + }, + { + "epoch": 3.91339590443686, + "grad_norm": 0.16953260637087977, + "learning_rate": 7.747302007281225e-05, + "loss": 0.8959, + "step": 18346 + }, + { + "epoch": 3.9136092150170647, + "grad_norm": 0.19250528076350482, + "learning_rate": 7.74645145680791e-05, + "loss": 0.9142, + "step": 18347 + }, + { + "epoch": 3.9138225255972694, + "grad_norm": 0.22157977474461488, + "learning_rate": 7.7456009092035e-05, + "loss": 0.8756, + "step": 18348 + }, + { + "epoch": 3.9140358361774745, + "grad_norm": 0.17380708483267066, + "learning_rate": 7.74475036447762e-05, + "loss": 0.9009, + "step": 18349 + }, + { + "epoch": 3.914249146757679, + "grad_norm": 0.18681849305462322, + "learning_rate": 7.743899822639896e-05, + "loss": 0.8757, + "step": 18350 + }, + { + "epoch": 3.914462457337884, + "grad_norm": 0.20748520370895782, + "learning_rate": 7.743049283699946e-05, + "loss": 0.9232, + "step": 18351 + }, + { + "epoch": 3.9146757679180886, + "grad_norm": 0.220037355047771, + "learning_rate": 7.742198747667401e-05, + "loss": 0.9055, + "step": 18352 + }, + { + "epoch": 3.9148890784982937, + "grad_norm": 0.18508076422107617, + "learning_rate": 7.741348214551881e-05, + "loss": 0.8745, + "step": 18353 + }, + { + "epoch": 3.9151023890784984, + "grad_norm": 0.23446244132565944, + "learning_rate": 7.74049768436301e-05, + "loss": 0.9069, + "step": 18354 + }, + { + "epoch": 3.915315699658703, + "grad_norm": 0.18279385829726405, + "learning_rate": 7.739647157110411e-05, + "loss": 0.9155, + "step": 18355 + }, + { + "epoch": 3.9155290102389078, + "grad_norm": 0.19052232921281298, + "learning_rate": 7.73879663280371e-05, + "loss": 0.9285, + "step": 18356 + }, + { + "epoch": 3.9157423208191124, + "grad_norm": 0.21032266065500016, + "learning_rate": 7.737946111452533e-05, + "loss": 0.9203, + "step": 18357 + }, + { + "epoch": 3.9159556313993176, + "grad_norm": 0.20839817043495815, + "learning_rate": 7.737095593066497e-05, + "loss": 0.9139, + "step": 18358 + }, + { + "epoch": 3.9161689419795223, + "grad_norm": 0.19016579647769577, + "learning_rate": 7.736245077655227e-05, + "loss": 0.8896, + "step": 18359 + }, + { + "epoch": 3.916382252559727, + "grad_norm": 0.20734995736677472, + "learning_rate": 7.73539456522835e-05, + "loss": 0.9062, + "step": 18360 + }, + { + "epoch": 3.9165955631399316, + "grad_norm": 0.22273272908171418, + "learning_rate": 7.734544055795486e-05, + "loss": 0.9119, + "step": 18361 + }, + { + "epoch": 3.9168088737201368, + "grad_norm": 0.24761430864778175, + "learning_rate": 7.733693549366264e-05, + "loss": 0.9315, + "step": 18362 + }, + { + "epoch": 3.9170221843003414, + "grad_norm": 0.2038769263734648, + "learning_rate": 7.7328430459503e-05, + "loss": 0.9169, + "step": 18363 + }, + { + "epoch": 3.917235494880546, + "grad_norm": 0.23248432876400577, + "learning_rate": 7.731992545557225e-05, + "loss": 0.9183, + "step": 18364 + }, + { + "epoch": 3.917448805460751, + "grad_norm": 0.18580846647431604, + "learning_rate": 7.731142048196657e-05, + "loss": 0.9067, + "step": 18365 + }, + { + "epoch": 3.9176621160409555, + "grad_norm": 0.19438167310046445, + "learning_rate": 7.730291553878221e-05, + "loss": 0.9099, + "step": 18366 + }, + { + "epoch": 3.91787542662116, + "grad_norm": 0.17986491608819993, + "learning_rate": 7.729441062611541e-05, + "loss": 0.9163, + "step": 18367 + }, + { + "epoch": 3.9180887372013653, + "grad_norm": 0.1877524684898725, + "learning_rate": 7.728590574406244e-05, + "loss": 0.9216, + "step": 18368 + }, + { + "epoch": 3.91830204778157, + "grad_norm": 0.18494794686683041, + "learning_rate": 7.727740089271944e-05, + "loss": 0.9234, + "step": 18369 + }, + { + "epoch": 3.9185153583617747, + "grad_norm": 0.20363749931999622, + "learning_rate": 7.726889607218272e-05, + "loss": 0.9198, + "step": 18370 + }, + { + "epoch": 3.91872866894198, + "grad_norm": 0.18131616071609322, + "learning_rate": 7.726039128254845e-05, + "loss": 0.8863, + "step": 18371 + }, + { + "epoch": 3.9189419795221845, + "grad_norm": 0.18249072519481446, + "learning_rate": 7.725188652391294e-05, + "loss": 0.9529, + "step": 18372 + }, + { + "epoch": 3.919155290102389, + "grad_norm": 0.42183443984769786, + "learning_rate": 7.724338179637236e-05, + "loss": 0.8851, + "step": 18373 + }, + { + "epoch": 3.919368600682594, + "grad_norm": 0.18699107561964573, + "learning_rate": 7.723487710002297e-05, + "loss": 0.9043, + "step": 18374 + }, + { + "epoch": 3.9195819112627985, + "grad_norm": 0.16802842859855305, + "learning_rate": 7.722637243496097e-05, + "loss": 0.9502, + "step": 18375 + }, + { + "epoch": 3.919795221843003, + "grad_norm": 0.20560698507916583, + "learning_rate": 7.721786780128263e-05, + "loss": 0.9156, + "step": 18376 + }, + { + "epoch": 3.9200085324232083, + "grad_norm": 0.17392033344425514, + "learning_rate": 7.720936319908416e-05, + "loss": 0.9287, + "step": 18377 + }, + { + "epoch": 3.920221843003413, + "grad_norm": 0.19691692103895572, + "learning_rate": 7.720085862846181e-05, + "loss": 0.8937, + "step": 18378 + }, + { + "epoch": 3.9204351535836177, + "grad_norm": 0.20361432748830388, + "learning_rate": 7.719235408951176e-05, + "loss": 0.9309, + "step": 18379 + }, + { + "epoch": 3.9206484641638224, + "grad_norm": 0.17032485825522115, + "learning_rate": 7.718384958233033e-05, + "loss": 0.9165, + "step": 18380 + }, + { + "epoch": 3.9208617747440275, + "grad_norm": 0.18397231625901456, + "learning_rate": 7.717534510701364e-05, + "loss": 0.9142, + "step": 18381 + }, + { + "epoch": 3.921075085324232, + "grad_norm": 0.1950950601328175, + "learning_rate": 7.716684066365798e-05, + "loss": 0.9257, + "step": 18382 + }, + { + "epoch": 3.921288395904437, + "grad_norm": 0.21121428587100521, + "learning_rate": 7.715833625235954e-05, + "loss": 0.9266, + "step": 18383 + }, + { + "epoch": 3.9215017064846416, + "grad_norm": 0.18476603894614788, + "learning_rate": 7.71498318732146e-05, + "loss": 0.9284, + "step": 18384 + }, + { + "epoch": 3.9217150170648463, + "grad_norm": 0.19812519867830086, + "learning_rate": 7.714132752631935e-05, + "loss": 0.8994, + "step": 18385 + }, + { + "epoch": 3.921928327645051, + "grad_norm": 0.19927495990228355, + "learning_rate": 7.713282321177004e-05, + "loss": 0.9364, + "step": 18386 + }, + { + "epoch": 3.922141638225256, + "grad_norm": 0.2069433182637676, + "learning_rate": 7.712431892966286e-05, + "loss": 0.8906, + "step": 18387 + }, + { + "epoch": 3.9223549488054608, + "grad_norm": 0.21345038457426643, + "learning_rate": 7.711581468009408e-05, + "loss": 0.9102, + "step": 18388 + }, + { + "epoch": 3.9225682593856654, + "grad_norm": 0.16675990570727472, + "learning_rate": 7.710731046315989e-05, + "loss": 0.9117, + "step": 18389 + }, + { + "epoch": 3.9227815699658706, + "grad_norm": 0.20235079757693852, + "learning_rate": 7.709880627895655e-05, + "loss": 0.882, + "step": 18390 + }, + { + "epoch": 3.9229948805460753, + "grad_norm": 0.1748391386006672, + "learning_rate": 7.709030212758027e-05, + "loss": 0.9255, + "step": 18391 + }, + { + "epoch": 3.92320819112628, + "grad_norm": 0.18508145823554673, + "learning_rate": 7.708179800912726e-05, + "loss": 0.921, + "step": 18392 + }, + { + "epoch": 3.9234215017064846, + "grad_norm": 0.21361511717614845, + "learning_rate": 7.707329392369375e-05, + "loss": 0.9051, + "step": 18393 + }, + { + "epoch": 3.9236348122866893, + "grad_norm": 0.22488363898971184, + "learning_rate": 7.706478987137598e-05, + "loss": 0.9225, + "step": 18394 + }, + { + "epoch": 3.923848122866894, + "grad_norm": 0.179305819618139, + "learning_rate": 7.705628585227013e-05, + "loss": 0.9001, + "step": 18395 + }, + { + "epoch": 3.924061433447099, + "grad_norm": 0.17925513345697847, + "learning_rate": 7.704778186647248e-05, + "loss": 0.8957, + "step": 18396 + }, + { + "epoch": 3.924274744027304, + "grad_norm": 0.17894769278458605, + "learning_rate": 7.703927791407922e-05, + "loss": 0.8905, + "step": 18397 + }, + { + "epoch": 3.9244880546075085, + "grad_norm": 0.20254548578219708, + "learning_rate": 7.703077399518659e-05, + "loss": 0.9218, + "step": 18398 + }, + { + "epoch": 3.924701365187713, + "grad_norm": 0.21706017297510016, + "learning_rate": 7.702227010989079e-05, + "loss": 0.9163, + "step": 18399 + }, + { + "epoch": 3.9249146757679183, + "grad_norm": 0.2047450931962478, + "learning_rate": 7.701376625828807e-05, + "loss": 0.9289, + "step": 18400 + }, + { + "epoch": 3.925127986348123, + "grad_norm": 0.19198864382695283, + "learning_rate": 7.700526244047461e-05, + "loss": 0.881, + "step": 18401 + }, + { + "epoch": 3.9253412969283277, + "grad_norm": 0.18303000236482436, + "learning_rate": 7.699675865654672e-05, + "loss": 0.8911, + "step": 18402 + }, + { + "epoch": 3.9255546075085324, + "grad_norm": 0.21189783500605575, + "learning_rate": 7.698825490660048e-05, + "loss": 0.9312, + "step": 18403 + }, + { + "epoch": 3.925767918088737, + "grad_norm": 0.2930554740860849, + "learning_rate": 7.697975119073223e-05, + "loss": 0.9392, + "step": 18404 + }, + { + "epoch": 3.9259812286689417, + "grad_norm": 0.15770206073599397, + "learning_rate": 7.697124750903812e-05, + "loss": 0.9056, + "step": 18405 + }, + { + "epoch": 3.926194539249147, + "grad_norm": 0.25039684839464854, + "learning_rate": 7.696274386161441e-05, + "loss": 0.9256, + "step": 18406 + }, + { + "epoch": 3.9264078498293515, + "grad_norm": 0.9099357424774623, + "learning_rate": 7.695424024855729e-05, + "loss": 0.9734, + "step": 18407 + }, + { + "epoch": 3.926621160409556, + "grad_norm": 0.19636510027976592, + "learning_rate": 7.6945736669963e-05, + "loss": 0.9598, + "step": 18408 + }, + { + "epoch": 3.9268344709897613, + "grad_norm": 0.2033309440154379, + "learning_rate": 7.693723312592775e-05, + "loss": 0.9273, + "step": 18409 + }, + { + "epoch": 3.927047781569966, + "grad_norm": 0.21502089006765915, + "learning_rate": 7.692872961654777e-05, + "loss": 0.9133, + "step": 18410 + }, + { + "epoch": 3.9272610921501707, + "grad_norm": 0.27041968520022586, + "learning_rate": 7.692022614191924e-05, + "loss": 0.894, + "step": 18411 + }, + { + "epoch": 3.9274744027303754, + "grad_norm": 0.24370249228769383, + "learning_rate": 7.691172270213843e-05, + "loss": 0.9277, + "step": 18412 + }, + { + "epoch": 3.92768771331058, + "grad_norm": 0.15891299773222087, + "learning_rate": 7.690321929730154e-05, + "loss": 0.8999, + "step": 18413 + }, + { + "epoch": 3.9279010238907848, + "grad_norm": 0.22654315219951665, + "learning_rate": 7.689471592750477e-05, + "loss": 0.9211, + "step": 18414 + }, + { + "epoch": 3.92811433447099, + "grad_norm": 0.2604973603359315, + "learning_rate": 7.688621259284432e-05, + "loss": 0.9213, + "step": 18415 + }, + { + "epoch": 3.9283276450511946, + "grad_norm": 0.24771877132132078, + "learning_rate": 7.687770929341644e-05, + "loss": 0.9288, + "step": 18416 + }, + { + "epoch": 3.9285409556313993, + "grad_norm": 0.18278964581497498, + "learning_rate": 7.686920602931732e-05, + "loss": 0.9111, + "step": 18417 + }, + { + "epoch": 3.928754266211604, + "grad_norm": 0.22333470725362375, + "learning_rate": 7.68607028006432e-05, + "loss": 0.8894, + "step": 18418 + }, + { + "epoch": 3.928967576791809, + "grad_norm": 0.27312099762281145, + "learning_rate": 7.685219960749026e-05, + "loss": 0.9054, + "step": 18419 + }, + { + "epoch": 3.9291808873720138, + "grad_norm": 0.3324274376587657, + "learning_rate": 7.684369644995475e-05, + "loss": 0.9164, + "step": 18420 + }, + { + "epoch": 3.9293941979522184, + "grad_norm": 0.28700063374834506, + "learning_rate": 7.683519332813286e-05, + "loss": 0.938, + "step": 18421 + }, + { + "epoch": 3.929607508532423, + "grad_norm": 0.23937881978862616, + "learning_rate": 7.682669024212082e-05, + "loss": 0.9032, + "step": 18422 + }, + { + "epoch": 3.929820819112628, + "grad_norm": 0.1616359430017806, + "learning_rate": 7.681818719201481e-05, + "loss": 0.937, + "step": 18423 + }, + { + "epoch": 3.9300341296928325, + "grad_norm": 0.19184230222357324, + "learning_rate": 7.680968417791112e-05, + "loss": 0.8988, + "step": 18424 + }, + { + "epoch": 3.9302474402730376, + "grad_norm": 0.1961808001665531, + "learning_rate": 7.680118119990588e-05, + "loss": 0.9138, + "step": 18425 + }, + { + "epoch": 3.9304607508532423, + "grad_norm": 0.21263916134109878, + "learning_rate": 7.679267825809532e-05, + "loss": 0.9098, + "step": 18426 + }, + { + "epoch": 3.930674061433447, + "grad_norm": 0.23678948072284228, + "learning_rate": 7.678417535257566e-05, + "loss": 0.9361, + "step": 18427 + }, + { + "epoch": 3.930887372013652, + "grad_norm": 0.2381631868369424, + "learning_rate": 7.677567248344311e-05, + "loss": 0.9511, + "step": 18428 + }, + { + "epoch": 3.931100682593857, + "grad_norm": 0.22770475099851015, + "learning_rate": 7.676716965079388e-05, + "loss": 0.917, + "step": 18429 + }, + { + "epoch": 3.9313139931740615, + "grad_norm": 0.2406242926812305, + "learning_rate": 7.675866685472418e-05, + "loss": 0.9213, + "step": 18430 + }, + { + "epoch": 3.931527303754266, + "grad_norm": 0.1717534281008099, + "learning_rate": 7.675016409533021e-05, + "loss": 0.9353, + "step": 18431 + }, + { + "epoch": 3.931740614334471, + "grad_norm": 0.31941216983068893, + "learning_rate": 7.67416613727082e-05, + "loss": 0.8908, + "step": 18432 + }, + { + "epoch": 3.9319539249146755, + "grad_norm": 0.24373036527921854, + "learning_rate": 7.673315868695434e-05, + "loss": 0.9269, + "step": 18433 + }, + { + "epoch": 3.9321672354948807, + "grad_norm": 0.18116033859251116, + "learning_rate": 7.672465603816485e-05, + "loss": 0.9012, + "step": 18434 + }, + { + "epoch": 3.9323805460750854, + "grad_norm": 0.22035312312970523, + "learning_rate": 7.671615342643596e-05, + "loss": 0.9127, + "step": 18435 + }, + { + "epoch": 3.93259385665529, + "grad_norm": 0.19327203900117804, + "learning_rate": 7.67076508518638e-05, + "loss": 0.9187, + "step": 18436 + }, + { + "epoch": 3.9328071672354947, + "grad_norm": 0.18009178679622415, + "learning_rate": 7.669914831454467e-05, + "loss": 0.9247, + "step": 18437 + }, + { + "epoch": 3.9330204778157, + "grad_norm": 0.1824441081707815, + "learning_rate": 7.66906458145747e-05, + "loss": 0.9326, + "step": 18438 + }, + { + "epoch": 3.9332337883959045, + "grad_norm": 0.17406481718687777, + "learning_rate": 7.668214335205013e-05, + "loss": 0.9232, + "step": 18439 + }, + { + "epoch": 3.9334470989761092, + "grad_norm": 0.19240046263643915, + "learning_rate": 7.667364092706717e-05, + "loss": 0.893, + "step": 18440 + }, + { + "epoch": 3.933660409556314, + "grad_norm": 0.22708794313600164, + "learning_rate": 7.666513853972201e-05, + "loss": 0.9269, + "step": 18441 + }, + { + "epoch": 3.9338737201365186, + "grad_norm": 0.21467377370840734, + "learning_rate": 7.665663619011089e-05, + "loss": 0.9444, + "step": 18442 + }, + { + "epoch": 3.9340870307167233, + "grad_norm": 0.1985842067932164, + "learning_rate": 7.664813387832996e-05, + "loss": 0.8962, + "step": 18443 + }, + { + "epoch": 3.9343003412969284, + "grad_norm": 0.21478561572391774, + "learning_rate": 7.663963160447546e-05, + "loss": 0.9302, + "step": 18444 + }, + { + "epoch": 3.934513651877133, + "grad_norm": 0.17105324704459082, + "learning_rate": 7.663112936864357e-05, + "loss": 0.9255, + "step": 18445 + }, + { + "epoch": 3.9347269624573378, + "grad_norm": 0.20060770902837538, + "learning_rate": 7.662262717093053e-05, + "loss": 0.9229, + "step": 18446 + }, + { + "epoch": 3.934940273037543, + "grad_norm": 0.1592385863876596, + "learning_rate": 7.661412501143254e-05, + "loss": 0.9159, + "step": 18447 + }, + { + "epoch": 3.9351535836177476, + "grad_norm": 0.1818199637050759, + "learning_rate": 7.660562289024575e-05, + "loss": 0.8876, + "step": 18448 + }, + { + "epoch": 3.9353668941979523, + "grad_norm": 0.1899414137676646, + "learning_rate": 7.65971208074664e-05, + "loss": 0.9026, + "step": 18449 + }, + { + "epoch": 3.935580204778157, + "grad_norm": 0.2208922077983203, + "learning_rate": 7.65886187631907e-05, + "loss": 0.9009, + "step": 18450 + }, + { + "epoch": 3.9357935153583616, + "grad_norm": 0.28073303163530117, + "learning_rate": 7.658011675751482e-05, + "loss": 0.9193, + "step": 18451 + }, + { + "epoch": 3.9360068259385663, + "grad_norm": 0.22995262126864965, + "learning_rate": 7.657161479053498e-05, + "loss": 0.9145, + "step": 18452 + }, + { + "epoch": 3.9362201365187715, + "grad_norm": 0.17921561762781463, + "learning_rate": 7.656311286234736e-05, + "loss": 0.9423, + "step": 18453 + }, + { + "epoch": 3.936433447098976, + "grad_norm": 0.19759215966168567, + "learning_rate": 7.65546109730482e-05, + "loss": 0.9196, + "step": 18454 + }, + { + "epoch": 3.936646757679181, + "grad_norm": 0.2459206251942759, + "learning_rate": 7.654610912273366e-05, + "loss": 0.9001, + "step": 18455 + }, + { + "epoch": 3.9368600682593855, + "grad_norm": 0.28198494233133387, + "learning_rate": 7.653760731149996e-05, + "loss": 0.9516, + "step": 18456 + }, + { + "epoch": 3.9370733788395906, + "grad_norm": 0.2117015369109289, + "learning_rate": 7.652910553944327e-05, + "loss": 0.8828, + "step": 18457 + }, + { + "epoch": 3.9372866894197953, + "grad_norm": 0.22012048478785018, + "learning_rate": 7.652060380665987e-05, + "loss": 0.9429, + "step": 18458 + }, + { + "epoch": 3.9375, + "grad_norm": 0.22420024102922764, + "learning_rate": 7.651210211324585e-05, + "loss": 0.9343, + "step": 18459 + }, + { + "epoch": 3.9377133105802047, + "grad_norm": 0.2018859276209295, + "learning_rate": 7.650360045929746e-05, + "loss": 0.8913, + "step": 18460 + }, + { + "epoch": 3.9379266211604094, + "grad_norm": 0.1931075867542452, + "learning_rate": 7.649509884491089e-05, + "loss": 0.9075, + "step": 18461 + }, + { + "epoch": 3.9381399317406145, + "grad_norm": 0.17522789739912778, + "learning_rate": 7.648659727018235e-05, + "loss": 0.8926, + "step": 18462 + }, + { + "epoch": 3.938353242320819, + "grad_norm": 0.19004772487749605, + "learning_rate": 7.647809573520799e-05, + "loss": 0.9303, + "step": 18463 + }, + { + "epoch": 3.938566552901024, + "grad_norm": 0.25133976394133106, + "learning_rate": 7.646959424008406e-05, + "loss": 0.9219, + "step": 18464 + }, + { + "epoch": 3.9387798634812285, + "grad_norm": 0.21504771300588835, + "learning_rate": 7.646109278490671e-05, + "loss": 0.9088, + "step": 18465 + }, + { + "epoch": 3.9389931740614337, + "grad_norm": 0.2138682992568459, + "learning_rate": 7.645259136977217e-05, + "loss": 0.9188, + "step": 18466 + }, + { + "epoch": 3.9392064846416384, + "grad_norm": 0.2456453524017724, + "learning_rate": 7.644408999477661e-05, + "loss": 0.938, + "step": 18467 + }, + { + "epoch": 3.939419795221843, + "grad_norm": 0.20426838147793064, + "learning_rate": 7.643558866001625e-05, + "loss": 0.8999, + "step": 18468 + }, + { + "epoch": 3.9396331058020477, + "grad_norm": 0.22094437540989165, + "learning_rate": 7.642708736558726e-05, + "loss": 0.9279, + "step": 18469 + }, + { + "epoch": 3.9398464163822524, + "grad_norm": 0.21495889423750802, + "learning_rate": 7.641858611158582e-05, + "loss": 0.8749, + "step": 18470 + }, + { + "epoch": 3.940059726962457, + "grad_norm": 0.21055052461079038, + "learning_rate": 7.641008489810816e-05, + "loss": 0.924, + "step": 18471 + }, + { + "epoch": 3.9402730375426622, + "grad_norm": 0.23228901749892777, + "learning_rate": 7.640158372525042e-05, + "loss": 0.9212, + "step": 18472 + }, + { + "epoch": 3.940486348122867, + "grad_norm": 0.2686866354408977, + "learning_rate": 7.639308259310884e-05, + "loss": 0.8918, + "step": 18473 + }, + { + "epoch": 3.9406996587030716, + "grad_norm": 0.2426209309875773, + "learning_rate": 7.63845815017796e-05, + "loss": 0.9329, + "step": 18474 + }, + { + "epoch": 3.9409129692832767, + "grad_norm": 0.20266837020037567, + "learning_rate": 7.637608045135886e-05, + "loss": 0.9148, + "step": 18475 + }, + { + "epoch": 3.9411262798634814, + "grad_norm": 0.25045898940977246, + "learning_rate": 7.636757944194284e-05, + "loss": 0.8975, + "step": 18476 + }, + { + "epoch": 3.941339590443686, + "grad_norm": 0.32876113770225573, + "learning_rate": 7.635907847362771e-05, + "loss": 0.9275, + "step": 18477 + }, + { + "epoch": 3.9415529010238908, + "grad_norm": 0.21657033953418067, + "learning_rate": 7.63505775465097e-05, + "loss": 0.913, + "step": 18478 + }, + { + "epoch": 3.9417662116040955, + "grad_norm": 0.24585254110419436, + "learning_rate": 7.634207666068492e-05, + "loss": 0.9224, + "step": 18479 + }, + { + "epoch": 3.9419795221843, + "grad_norm": 0.2982047303347703, + "learning_rate": 7.633357581624965e-05, + "loss": 0.9358, + "step": 18480 + }, + { + "epoch": 3.9421928327645053, + "grad_norm": 0.23344538546726454, + "learning_rate": 7.632507501330001e-05, + "loss": 0.9077, + "step": 18481 + }, + { + "epoch": 3.94240614334471, + "grad_norm": 0.20851324318457784, + "learning_rate": 7.63165742519322e-05, + "loss": 0.9119, + "step": 18482 + }, + { + "epoch": 3.9426194539249146, + "grad_norm": 0.2341210635684881, + "learning_rate": 7.630807353224243e-05, + "loss": 0.8975, + "step": 18483 + }, + { + "epoch": 3.9428327645051193, + "grad_norm": 0.2084770961178561, + "learning_rate": 7.629957285432685e-05, + "loss": 0.891, + "step": 18484 + }, + { + "epoch": 3.9430460750853245, + "grad_norm": 0.22605089476681736, + "learning_rate": 7.629107221828168e-05, + "loss": 0.9378, + "step": 18485 + }, + { + "epoch": 3.943259385665529, + "grad_norm": 0.2177798978450423, + "learning_rate": 7.62825716242031e-05, + "loss": 0.9103, + "step": 18486 + }, + { + "epoch": 3.943472696245734, + "grad_norm": 0.19243303915821203, + "learning_rate": 7.627407107218726e-05, + "loss": 0.9448, + "step": 18487 + }, + { + "epoch": 3.9436860068259385, + "grad_norm": 0.16465201938568105, + "learning_rate": 7.626557056233039e-05, + "loss": 0.9027, + "step": 18488 + }, + { + "epoch": 3.943899317406143, + "grad_norm": 0.2125737622613089, + "learning_rate": 7.625707009472863e-05, + "loss": 0.9231, + "step": 18489 + }, + { + "epoch": 3.944112627986348, + "grad_norm": 0.2391937318401895, + "learning_rate": 7.624856966947821e-05, + "loss": 0.9271, + "step": 18490 + }, + { + "epoch": 3.944325938566553, + "grad_norm": 0.1742634994563566, + "learning_rate": 7.624006928667529e-05, + "loss": 0.9121, + "step": 18491 + }, + { + "epoch": 3.9445392491467577, + "grad_norm": 0.21923202936501013, + "learning_rate": 7.623156894641602e-05, + "loss": 0.9159, + "step": 18492 + }, + { + "epoch": 3.9447525597269624, + "grad_norm": 0.23028001499970505, + "learning_rate": 7.622306864879664e-05, + "loss": 0.9124, + "step": 18493 + }, + { + "epoch": 3.9449658703071675, + "grad_norm": 0.29861991040251046, + "learning_rate": 7.621456839391327e-05, + "loss": 0.9144, + "step": 18494 + }, + { + "epoch": 3.945179180887372, + "grad_norm": 0.31673455435335485, + "learning_rate": 7.620606818186216e-05, + "loss": 0.9112, + "step": 18495 + }, + { + "epoch": 3.945392491467577, + "grad_norm": 0.23567846406694148, + "learning_rate": 7.619756801273941e-05, + "loss": 0.9119, + "step": 18496 + }, + { + "epoch": 3.9456058020477816, + "grad_norm": 0.1615636733259144, + "learning_rate": 7.618906788664127e-05, + "loss": 0.8923, + "step": 18497 + }, + { + "epoch": 3.9458191126279862, + "grad_norm": 0.21264699512855573, + "learning_rate": 7.618056780366389e-05, + "loss": 0.9225, + "step": 18498 + }, + { + "epoch": 3.946032423208191, + "grad_norm": 0.24829889726819623, + "learning_rate": 7.617206776390343e-05, + "loss": 0.9107, + "step": 18499 + }, + { + "epoch": 3.946245733788396, + "grad_norm": 0.28125184211785426, + "learning_rate": 7.61635677674561e-05, + "loss": 0.9476, + "step": 18500 + }, + { + "epoch": 3.9464590443686007, + "grad_norm": 0.21465435070169658, + "learning_rate": 7.615506781441805e-05, + "loss": 0.9355, + "step": 18501 + }, + { + "epoch": 3.9466723549488054, + "grad_norm": 0.17938018037943498, + "learning_rate": 7.614656790488551e-05, + "loss": 0.9214, + "step": 18502 + }, + { + "epoch": 3.94688566552901, + "grad_norm": 0.2258955397006034, + "learning_rate": 7.613806803895462e-05, + "loss": 0.919, + "step": 18503 + }, + { + "epoch": 3.9470989761092152, + "grad_norm": 0.2729483843568333, + "learning_rate": 7.612956821672152e-05, + "loss": 0.9184, + "step": 18504 + }, + { + "epoch": 3.94731228668942, + "grad_norm": 0.21050182279995763, + "learning_rate": 7.612106843828244e-05, + "loss": 0.9606, + "step": 18505 + }, + { + "epoch": 3.9475255972696246, + "grad_norm": 0.201619518123864, + "learning_rate": 7.611256870373351e-05, + "loss": 0.8957, + "step": 18506 + }, + { + "epoch": 3.9477389078498293, + "grad_norm": 0.1889745166641562, + "learning_rate": 7.610406901317097e-05, + "loss": 0.9114, + "step": 18507 + }, + { + "epoch": 3.947952218430034, + "grad_norm": 0.16512532524568121, + "learning_rate": 7.609556936669093e-05, + "loss": 0.9321, + "step": 18508 + }, + { + "epoch": 3.9481655290102387, + "grad_norm": 0.18565971334003406, + "learning_rate": 7.608706976438958e-05, + "loss": 0.9132, + "step": 18509 + }, + { + "epoch": 3.948378839590444, + "grad_norm": 0.18274937654654236, + "learning_rate": 7.607857020636312e-05, + "loss": 0.9157, + "step": 18510 + }, + { + "epoch": 3.9485921501706485, + "grad_norm": 0.1504629975607936, + "learning_rate": 7.607007069270769e-05, + "loss": 0.8914, + "step": 18511 + }, + { + "epoch": 3.948805460750853, + "grad_norm": 0.17058495612292823, + "learning_rate": 7.60615712235195e-05, + "loss": 0.9182, + "step": 18512 + }, + { + "epoch": 3.9490187713310583, + "grad_norm": 0.15524087894875352, + "learning_rate": 7.605307179889467e-05, + "loss": 0.9022, + "step": 18513 + }, + { + "epoch": 3.949232081911263, + "grad_norm": 0.17902307550726554, + "learning_rate": 7.604457241892944e-05, + "loss": 0.8894, + "step": 18514 + }, + { + "epoch": 3.9494453924914676, + "grad_norm": 0.15892246108836758, + "learning_rate": 7.60360730837199e-05, + "loss": 0.8968, + "step": 18515 + }, + { + "epoch": 3.9496587030716723, + "grad_norm": 0.18158444563594037, + "learning_rate": 7.602757379336227e-05, + "loss": 0.9232, + "step": 18516 + }, + { + "epoch": 3.949872013651877, + "grad_norm": 0.23301100976533487, + "learning_rate": 7.601907454795272e-05, + "loss": 0.9216, + "step": 18517 + }, + { + "epoch": 3.9500853242320817, + "grad_norm": 0.26477944016653604, + "learning_rate": 7.601057534758739e-05, + "loss": 0.934, + "step": 18518 + }, + { + "epoch": 3.950298634812287, + "grad_norm": 0.22239532888232744, + "learning_rate": 7.600207619236247e-05, + "loss": 0.9315, + "step": 18519 + }, + { + "epoch": 3.9505119453924915, + "grad_norm": 0.19263690292167188, + "learning_rate": 7.599357708237414e-05, + "loss": 0.9345, + "step": 18520 + }, + { + "epoch": 3.950725255972696, + "grad_norm": 0.1804978205349106, + "learning_rate": 7.598507801771853e-05, + "loss": 0.9231, + "step": 18521 + }, + { + "epoch": 3.950938566552901, + "grad_norm": 0.19030005743773887, + "learning_rate": 7.597657899849186e-05, + "loss": 0.9299, + "step": 18522 + }, + { + "epoch": 3.951151877133106, + "grad_norm": 0.215468554734861, + "learning_rate": 7.596808002479024e-05, + "loss": 0.9107, + "step": 18523 + }, + { + "epoch": 3.9513651877133107, + "grad_norm": 0.2941404940128805, + "learning_rate": 7.595958109670988e-05, + "loss": 0.9381, + "step": 18524 + }, + { + "epoch": 3.9515784982935154, + "grad_norm": 0.24907660543328894, + "learning_rate": 7.595108221434693e-05, + "loss": 0.9139, + "step": 18525 + }, + { + "epoch": 3.95179180887372, + "grad_norm": 0.1677940364915644, + "learning_rate": 7.594258337779753e-05, + "loss": 0.8877, + "step": 18526 + }, + { + "epoch": 3.9520051194539247, + "grad_norm": 0.2128901002342081, + "learning_rate": 7.59340845871579e-05, + "loss": 0.8991, + "step": 18527 + }, + { + "epoch": 3.9522184300341294, + "grad_norm": 0.18901057073443522, + "learning_rate": 7.592558584252414e-05, + "loss": 0.914, + "step": 18528 + }, + { + "epoch": 3.9524317406143346, + "grad_norm": 0.18770429341511122, + "learning_rate": 7.591708714399246e-05, + "loss": 0.9308, + "step": 18529 + }, + { + "epoch": 3.9526450511945392, + "grad_norm": 0.2248236350864428, + "learning_rate": 7.590858849165899e-05, + "loss": 0.8735, + "step": 18530 + }, + { + "epoch": 3.952858361774744, + "grad_norm": 0.23773536219566824, + "learning_rate": 7.590008988561993e-05, + "loss": 0.9305, + "step": 18531 + }, + { + "epoch": 3.953071672354949, + "grad_norm": 0.1792507000106335, + "learning_rate": 7.589159132597142e-05, + "loss": 0.8738, + "step": 18532 + }, + { + "epoch": 3.9532849829351537, + "grad_norm": 0.2066005489486114, + "learning_rate": 7.588309281280961e-05, + "loss": 0.9336, + "step": 18533 + }, + { + "epoch": 3.9534982935153584, + "grad_norm": 0.17875366422241826, + "learning_rate": 7.587459434623069e-05, + "loss": 0.9192, + "step": 18534 + }, + { + "epoch": 3.953711604095563, + "grad_norm": 0.18812917776777077, + "learning_rate": 7.586609592633078e-05, + "loss": 0.9038, + "step": 18535 + }, + { + "epoch": 3.953924914675768, + "grad_norm": 0.24118835496339822, + "learning_rate": 7.585759755320611e-05, + "loss": 0.9044, + "step": 18536 + }, + { + "epoch": 3.9541382252559725, + "grad_norm": 0.2103114196310924, + "learning_rate": 7.584909922695277e-05, + "loss": 0.9274, + "step": 18537 + }, + { + "epoch": 3.9543515358361776, + "grad_norm": 0.21092754039840136, + "learning_rate": 7.584060094766692e-05, + "loss": 0.9219, + "step": 18538 + }, + { + "epoch": 3.9545648464163823, + "grad_norm": 0.2969966264284281, + "learning_rate": 7.583210271544477e-05, + "loss": 0.9195, + "step": 18539 + }, + { + "epoch": 3.954778156996587, + "grad_norm": 0.23973325631316197, + "learning_rate": 7.582360453038241e-05, + "loss": 0.9375, + "step": 18540 + }, + { + "epoch": 3.9549914675767917, + "grad_norm": 0.23185024295617532, + "learning_rate": 7.581510639257608e-05, + "loss": 0.8964, + "step": 18541 + }, + { + "epoch": 3.955204778156997, + "grad_norm": 0.29013863692784925, + "learning_rate": 7.580660830212186e-05, + "loss": 0.9115, + "step": 18542 + }, + { + "epoch": 3.9554180887372015, + "grad_norm": 0.25480479659798216, + "learning_rate": 7.579811025911596e-05, + "loss": 0.9285, + "step": 18543 + }, + { + "epoch": 3.955631399317406, + "grad_norm": 0.27434984917232663, + "learning_rate": 7.57896122636545e-05, + "loss": 0.9603, + "step": 18544 + }, + { + "epoch": 3.955844709897611, + "grad_norm": 0.23652004623446865, + "learning_rate": 7.578111431583364e-05, + "loss": 0.9378, + "step": 18545 + }, + { + "epoch": 3.9560580204778155, + "grad_norm": 0.2528018113411377, + "learning_rate": 7.577261641574956e-05, + "loss": 0.8975, + "step": 18546 + }, + { + "epoch": 3.95627133105802, + "grad_norm": 0.29030696798407496, + "learning_rate": 7.576411856349842e-05, + "loss": 0.9316, + "step": 18547 + }, + { + "epoch": 3.9564846416382253, + "grad_norm": 0.20901651881670144, + "learning_rate": 7.57556207591763e-05, + "loss": 0.9193, + "step": 18548 + }, + { + "epoch": 3.95669795221843, + "grad_norm": 0.23234439862303172, + "learning_rate": 7.574712300287943e-05, + "loss": 0.9177, + "step": 18549 + }, + { + "epoch": 3.9569112627986347, + "grad_norm": 0.23961499348212242, + "learning_rate": 7.573862529470391e-05, + "loss": 0.9019, + "step": 18550 + }, + { + "epoch": 3.95712457337884, + "grad_norm": 0.22146799029313022, + "learning_rate": 7.573012763474593e-05, + "loss": 0.9289, + "step": 18551 + }, + { + "epoch": 3.9573378839590445, + "grad_norm": 0.224388449519489, + "learning_rate": 7.572163002310163e-05, + "loss": 0.9325, + "step": 18552 + }, + { + "epoch": 3.957551194539249, + "grad_norm": 0.1788089851109168, + "learning_rate": 7.571313245986716e-05, + "loss": 0.9159, + "step": 18553 + }, + { + "epoch": 3.957764505119454, + "grad_norm": 0.20797449068922086, + "learning_rate": 7.570463494513867e-05, + "loss": 0.9262, + "step": 18554 + }, + { + "epoch": 3.9579778156996586, + "grad_norm": 0.19612764293813556, + "learning_rate": 7.56961374790123e-05, + "loss": 0.9037, + "step": 18555 + }, + { + "epoch": 3.9581911262798632, + "grad_norm": 0.19900949649761904, + "learning_rate": 7.568764006158421e-05, + "loss": 0.8995, + "step": 18556 + }, + { + "epoch": 3.9584044368600684, + "grad_norm": 0.2494906932928656, + "learning_rate": 7.567914269295055e-05, + "loss": 0.9213, + "step": 18557 + }, + { + "epoch": 3.958617747440273, + "grad_norm": 0.1929964811155621, + "learning_rate": 7.567064537320746e-05, + "loss": 0.918, + "step": 18558 + }, + { + "epoch": 3.9588310580204777, + "grad_norm": 0.1944117001005607, + "learning_rate": 7.566214810245111e-05, + "loss": 0.9509, + "step": 18559 + }, + { + "epoch": 3.9590443686006824, + "grad_norm": 0.1801540620071115, + "learning_rate": 7.56536508807776e-05, + "loss": 0.8959, + "step": 18560 + }, + { + "epoch": 3.9592576791808876, + "grad_norm": 1.257997229428571, + "learning_rate": 7.564515370828312e-05, + "loss": 0.9758, + "step": 18561 + }, + { + "epoch": 3.9594709897610922, + "grad_norm": 0.17414262357093077, + "learning_rate": 7.563665658506378e-05, + "loss": 0.903, + "step": 18562 + }, + { + "epoch": 3.959684300341297, + "grad_norm": 0.17927576629438371, + "learning_rate": 7.562815951121576e-05, + "loss": 0.8922, + "step": 18563 + }, + { + "epoch": 3.9598976109215016, + "grad_norm": 0.16934645081016172, + "learning_rate": 7.561966248683517e-05, + "loss": 0.9071, + "step": 18564 + }, + { + "epoch": 3.9601109215017063, + "grad_norm": 0.19485460094730808, + "learning_rate": 7.561116551201819e-05, + "loss": 0.9129, + "step": 18565 + }, + { + "epoch": 3.9603242320819114, + "grad_norm": 0.2362172627228471, + "learning_rate": 7.560266858686094e-05, + "loss": 0.9022, + "step": 18566 + }, + { + "epoch": 3.960537542662116, + "grad_norm": 0.20226395987339868, + "learning_rate": 7.559417171145957e-05, + "loss": 0.9198, + "step": 18567 + }, + { + "epoch": 3.960750853242321, + "grad_norm": 0.23611570911636076, + "learning_rate": 7.558567488591022e-05, + "loss": 0.9368, + "step": 18568 + }, + { + "epoch": 3.9609641638225255, + "grad_norm": 0.24971561353843277, + "learning_rate": 7.557717811030902e-05, + "loss": 0.9463, + "step": 18569 + }, + { + "epoch": 3.9611774744027306, + "grad_norm": 0.25526693138396317, + "learning_rate": 7.556868138475216e-05, + "loss": 0.9495, + "step": 18570 + }, + { + "epoch": 3.9613907849829353, + "grad_norm": 0.20993231179592667, + "learning_rate": 7.556018470933573e-05, + "loss": 0.9044, + "step": 18571 + }, + { + "epoch": 3.96160409556314, + "grad_norm": 0.2263858423680969, + "learning_rate": 7.555168808415588e-05, + "loss": 0.9301, + "step": 18572 + }, + { + "epoch": 3.9618174061433447, + "grad_norm": 0.27396974431133975, + "learning_rate": 7.554319150930876e-05, + "loss": 0.9272, + "step": 18573 + }, + { + "epoch": 3.9620307167235493, + "grad_norm": 0.2543845991029926, + "learning_rate": 7.553469498489048e-05, + "loss": 0.9082, + "step": 18574 + }, + { + "epoch": 3.962244027303754, + "grad_norm": 0.2513236803545968, + "learning_rate": 7.552619851099721e-05, + "loss": 0.9047, + "step": 18575 + }, + { + "epoch": 3.962457337883959, + "grad_norm": 0.2448851921217016, + "learning_rate": 7.551770208772508e-05, + "loss": 0.9211, + "step": 18576 + }, + { + "epoch": 3.962670648464164, + "grad_norm": 0.5563838483958772, + "learning_rate": 7.550920571517024e-05, + "loss": 0.9569, + "step": 18577 + }, + { + "epoch": 3.9628839590443685, + "grad_norm": 0.2636020162072787, + "learning_rate": 7.55007093934288e-05, + "loss": 0.8992, + "step": 18578 + }, + { + "epoch": 3.9630972696245736, + "grad_norm": 0.24235442268509713, + "learning_rate": 7.549221312259691e-05, + "loss": 0.9032, + "step": 18579 + }, + { + "epoch": 3.9633105802047783, + "grad_norm": 0.2645347476959568, + "learning_rate": 7.548371690277072e-05, + "loss": 0.9213, + "step": 18580 + }, + { + "epoch": 3.963523890784983, + "grad_norm": 0.32498660381163075, + "learning_rate": 7.547522073404636e-05, + "loss": 0.932, + "step": 18581 + }, + { + "epoch": 3.9637372013651877, + "grad_norm": 0.18842145702511384, + "learning_rate": 7.546672461651992e-05, + "loss": 0.9228, + "step": 18582 + }, + { + "epoch": 3.9639505119453924, + "grad_norm": 0.25646849817927203, + "learning_rate": 7.545822855028758e-05, + "loss": 0.932, + "step": 18583 + }, + { + "epoch": 3.964163822525597, + "grad_norm": 0.26880180056145125, + "learning_rate": 7.544973253544544e-05, + "loss": 0.9218, + "step": 18584 + }, + { + "epoch": 3.964377133105802, + "grad_norm": 0.22693966516643602, + "learning_rate": 7.544123657208969e-05, + "loss": 0.9374, + "step": 18585 + }, + { + "epoch": 3.964590443686007, + "grad_norm": 0.2608215631196811, + "learning_rate": 7.543274066031638e-05, + "loss": 0.9506, + "step": 18586 + }, + { + "epoch": 3.9648037542662116, + "grad_norm": 0.18021910447805578, + "learning_rate": 7.542424480022172e-05, + "loss": 0.8944, + "step": 18587 + }, + { + "epoch": 3.9650170648464163, + "grad_norm": 0.27607075537870746, + "learning_rate": 7.54157489919018e-05, + "loss": 0.8851, + "step": 18588 + }, + { + "epoch": 3.9652303754266214, + "grad_norm": 0.2343421875740891, + "learning_rate": 7.540725323545275e-05, + "loss": 0.9179, + "step": 18589 + }, + { + "epoch": 3.965443686006826, + "grad_norm": 0.21459582972955582, + "learning_rate": 7.53987575309707e-05, + "loss": 0.9366, + "step": 18590 + }, + { + "epoch": 3.9656569965870307, + "grad_norm": 0.23785455077836754, + "learning_rate": 7.539026187855181e-05, + "loss": 0.8803, + "step": 18591 + }, + { + "epoch": 3.9658703071672354, + "grad_norm": 0.17296310886694913, + "learning_rate": 7.53817662782922e-05, + "loss": 0.9224, + "step": 18592 + }, + { + "epoch": 3.96608361774744, + "grad_norm": 0.21340731676259164, + "learning_rate": 7.537327073028796e-05, + "loss": 0.9342, + "step": 18593 + }, + { + "epoch": 3.966296928327645, + "grad_norm": 0.23912849038638564, + "learning_rate": 7.536477523463523e-05, + "loss": 0.9032, + "step": 18594 + }, + { + "epoch": 3.96651023890785, + "grad_norm": 0.18161791384090847, + "learning_rate": 7.535627979143017e-05, + "loss": 0.8977, + "step": 18595 + }, + { + "epoch": 3.9667235494880546, + "grad_norm": 0.2551156229741495, + "learning_rate": 7.534778440076885e-05, + "loss": 0.9393, + "step": 18596 + }, + { + "epoch": 3.9669368600682593, + "grad_norm": 0.2101689541281721, + "learning_rate": 7.533928906274743e-05, + "loss": 0.9047, + "step": 18597 + }, + { + "epoch": 3.9671501706484644, + "grad_norm": 0.2123796164698897, + "learning_rate": 7.533079377746204e-05, + "loss": 0.8866, + "step": 18598 + }, + { + "epoch": 3.967363481228669, + "grad_norm": 0.19996214719973013, + "learning_rate": 7.53222985450088e-05, + "loss": 0.9327, + "step": 18599 + }, + { + "epoch": 3.967576791808874, + "grad_norm": 0.22450315286100295, + "learning_rate": 7.531380336548383e-05, + "loss": 0.9727, + "step": 18600 + }, + { + "epoch": 3.9677901023890785, + "grad_norm": 0.2581013612146061, + "learning_rate": 7.530530823898326e-05, + "loss": 0.885, + "step": 18601 + }, + { + "epoch": 3.968003412969283, + "grad_norm": 0.23121024205414728, + "learning_rate": 7.529681316560319e-05, + "loss": 0.9138, + "step": 18602 + }, + { + "epoch": 3.968216723549488, + "grad_norm": 0.199851219384391, + "learning_rate": 7.52883181454398e-05, + "loss": 0.9243, + "step": 18603 + }, + { + "epoch": 3.968430034129693, + "grad_norm": 0.24671362514847833, + "learning_rate": 7.527982317858913e-05, + "loss": 0.9044, + "step": 18604 + }, + { + "epoch": 3.9686433447098977, + "grad_norm": 0.19316109484244995, + "learning_rate": 7.527132826514737e-05, + "loss": 0.9043, + "step": 18605 + }, + { + "epoch": 3.9688566552901023, + "grad_norm": 0.18842701243446464, + "learning_rate": 7.526283340521058e-05, + "loss": 0.9098, + "step": 18606 + }, + { + "epoch": 3.969069965870307, + "grad_norm": 0.15503227986114934, + "learning_rate": 7.525433859887493e-05, + "loss": 0.9336, + "step": 18607 + }, + { + "epoch": 3.969283276450512, + "grad_norm": 0.17538491851120258, + "learning_rate": 7.524584384623649e-05, + "loss": 0.9126, + "step": 18608 + }, + { + "epoch": 3.969496587030717, + "grad_norm": 0.16653374353617162, + "learning_rate": 7.523734914739143e-05, + "loss": 0.9268, + "step": 18609 + }, + { + "epoch": 3.9697098976109215, + "grad_norm": 0.1758291592335125, + "learning_rate": 7.522885450243582e-05, + "loss": 0.941, + "step": 18610 + }, + { + "epoch": 3.969923208191126, + "grad_norm": 0.18367600120750632, + "learning_rate": 7.522035991146583e-05, + "loss": 0.9148, + "step": 18611 + }, + { + "epoch": 3.970136518771331, + "grad_norm": 0.20336202544793383, + "learning_rate": 7.521186537457753e-05, + "loss": 0.9282, + "step": 18612 + }, + { + "epoch": 3.9703498293515356, + "grad_norm": 0.18740017187798683, + "learning_rate": 7.520337089186707e-05, + "loss": 0.9287, + "step": 18613 + }, + { + "epoch": 3.9705631399317407, + "grad_norm": 0.17985843131968227, + "learning_rate": 7.519487646343056e-05, + "loss": 0.9043, + "step": 18614 + }, + { + "epoch": 3.9707764505119454, + "grad_norm": 0.18845746930698798, + "learning_rate": 7.518638208936409e-05, + "loss": 0.9249, + "step": 18615 + }, + { + "epoch": 3.97098976109215, + "grad_norm": 0.2078556198965565, + "learning_rate": 7.517788776976378e-05, + "loss": 0.8878, + "step": 18616 + }, + { + "epoch": 3.971203071672355, + "grad_norm": 0.20066492708564426, + "learning_rate": 7.516939350472576e-05, + "loss": 0.9098, + "step": 18617 + }, + { + "epoch": 3.97141638225256, + "grad_norm": 0.18272877292336193, + "learning_rate": 7.516089929434612e-05, + "loss": 0.9317, + "step": 18618 + }, + { + "epoch": 3.9716296928327646, + "grad_norm": 0.20015403461935133, + "learning_rate": 7.5152405138721e-05, + "loss": 0.9292, + "step": 18619 + }, + { + "epoch": 3.9718430034129693, + "grad_norm": 0.19993614506624655, + "learning_rate": 7.514391103794647e-05, + "loss": 0.9026, + "step": 18620 + }, + { + "epoch": 3.972056313993174, + "grad_norm": 0.19365583303570252, + "learning_rate": 7.51354169921187e-05, + "loss": 0.9268, + "step": 18621 + }, + { + "epoch": 3.9722696245733786, + "grad_norm": 0.21508575584254458, + "learning_rate": 7.512692300133374e-05, + "loss": 0.9172, + "step": 18622 + }, + { + "epoch": 3.9724829351535837, + "grad_norm": 0.17397260086216557, + "learning_rate": 7.511842906568776e-05, + "loss": 0.9149, + "step": 18623 + }, + { + "epoch": 3.9726962457337884, + "grad_norm": 0.24854398646308443, + "learning_rate": 7.510993518527681e-05, + "loss": 0.9435, + "step": 18624 + }, + { + "epoch": 3.972909556313993, + "grad_norm": 0.17321812101148307, + "learning_rate": 7.510144136019705e-05, + "loss": 0.9028, + "step": 18625 + }, + { + "epoch": 3.973122866894198, + "grad_norm": 0.23477420937175555, + "learning_rate": 7.509294759054457e-05, + "loss": 0.9062, + "step": 18626 + }, + { + "epoch": 3.973336177474403, + "grad_norm": 0.2466478686105503, + "learning_rate": 7.508445387641546e-05, + "loss": 0.9134, + "step": 18627 + }, + { + "epoch": 3.9735494880546076, + "grad_norm": 0.269941892224447, + "learning_rate": 7.507596021790582e-05, + "loss": 0.929, + "step": 18628 + }, + { + "epoch": 3.9737627986348123, + "grad_norm": 0.21275699431134593, + "learning_rate": 7.506746661511179e-05, + "loss": 0.9322, + "step": 18629 + }, + { + "epoch": 3.973976109215017, + "grad_norm": 0.1900176911031592, + "learning_rate": 7.505897306812944e-05, + "loss": 0.9105, + "step": 18630 + }, + { + "epoch": 3.9741894197952217, + "grad_norm": 0.24882202850032345, + "learning_rate": 7.505047957705492e-05, + "loss": 0.8871, + "step": 18631 + }, + { + "epoch": 3.9744027303754264, + "grad_norm": 0.2700207007976555, + "learning_rate": 7.504198614198426e-05, + "loss": 0.8851, + "step": 18632 + }, + { + "epoch": 3.9746160409556315, + "grad_norm": 0.2469161661671166, + "learning_rate": 7.503349276301366e-05, + "loss": 0.8976, + "step": 18633 + }, + { + "epoch": 3.974829351535836, + "grad_norm": 0.23357824564410837, + "learning_rate": 7.502499944023914e-05, + "loss": 0.9415, + "step": 18634 + }, + { + "epoch": 3.975042662116041, + "grad_norm": 0.21542388642903193, + "learning_rate": 7.501650617375685e-05, + "loss": 0.918, + "step": 18635 + }, + { + "epoch": 3.975255972696246, + "grad_norm": 0.2982819626482869, + "learning_rate": 7.500801296366287e-05, + "loss": 0.8916, + "step": 18636 + }, + { + "epoch": 3.9754692832764507, + "grad_norm": 0.2442393265618691, + "learning_rate": 7.499951981005335e-05, + "loss": 0.8909, + "step": 18637 + }, + { + "epoch": 3.9756825938566553, + "grad_norm": 0.2071793489302909, + "learning_rate": 7.49910267130243e-05, + "loss": 0.9094, + "step": 18638 + }, + { + "epoch": 3.97589590443686, + "grad_norm": 0.34313209996142674, + "learning_rate": 7.498253367267189e-05, + "loss": 0.9232, + "step": 18639 + }, + { + "epoch": 3.9761092150170647, + "grad_norm": 0.3464514127630806, + "learning_rate": 7.497404068909217e-05, + "loss": 0.8817, + "step": 18640 + }, + { + "epoch": 3.9763225255972694, + "grad_norm": 0.2528243356608358, + "learning_rate": 7.496554776238127e-05, + "loss": 0.9107, + "step": 18641 + }, + { + "epoch": 3.9765358361774745, + "grad_norm": 0.2156797141362094, + "learning_rate": 7.495705489263529e-05, + "loss": 0.9295, + "step": 18642 + }, + { + "epoch": 3.976749146757679, + "grad_norm": 0.2615423812010121, + "learning_rate": 7.494856207995032e-05, + "loss": 0.9227, + "step": 18643 + }, + { + "epoch": 3.976962457337884, + "grad_norm": 0.3189344352978965, + "learning_rate": 7.494006932442243e-05, + "loss": 0.9067, + "step": 18644 + }, + { + "epoch": 3.9771757679180886, + "grad_norm": 0.2803207501398837, + "learning_rate": 7.493157662614778e-05, + "loss": 0.9073, + "step": 18645 + }, + { + "epoch": 3.9773890784982937, + "grad_norm": 0.21702814109082424, + "learning_rate": 7.49230839852224e-05, + "loss": 0.9288, + "step": 18646 + }, + { + "epoch": 3.9776023890784984, + "grad_norm": 0.2344194332222383, + "learning_rate": 7.491459140174242e-05, + "loss": 0.9228, + "step": 18647 + }, + { + "epoch": 3.977815699658703, + "grad_norm": 0.21034250504690924, + "learning_rate": 7.490609887580395e-05, + "loss": 0.8931, + "step": 18648 + }, + { + "epoch": 3.9780290102389078, + "grad_norm": 0.22436821560407336, + "learning_rate": 7.489760640750304e-05, + "loss": 0.9121, + "step": 18649 + }, + { + "epoch": 3.9782423208191124, + "grad_norm": 0.27220032381861337, + "learning_rate": 7.488911399693577e-05, + "loss": 0.9168, + "step": 18650 + }, + { + "epoch": 3.9784556313993176, + "grad_norm": 0.1929057231245066, + "learning_rate": 7.488062164419829e-05, + "loss": 0.8914, + "step": 18651 + }, + { + "epoch": 3.9786689419795223, + "grad_norm": 0.2658742810103133, + "learning_rate": 7.487212934938664e-05, + "loss": 0.9014, + "step": 18652 + }, + { + "epoch": 3.978882252559727, + "grad_norm": 0.23237423205292002, + "learning_rate": 7.486363711259695e-05, + "loss": 0.8911, + "step": 18653 + }, + { + "epoch": 3.9790955631399316, + "grad_norm": 0.24468937378989145, + "learning_rate": 7.485514493392528e-05, + "loss": 0.9121, + "step": 18654 + }, + { + "epoch": 3.9793088737201368, + "grad_norm": 0.2841830321711004, + "learning_rate": 7.484665281346775e-05, + "loss": 0.906, + "step": 18655 + }, + { + "epoch": 3.9795221843003414, + "grad_norm": 0.1698510402188455, + "learning_rate": 7.483816075132041e-05, + "loss": 0.9481, + "step": 18656 + }, + { + "epoch": 3.979735494880546, + "grad_norm": 0.25516489054758706, + "learning_rate": 7.482966874757938e-05, + "loss": 0.8781, + "step": 18657 + }, + { + "epoch": 3.979948805460751, + "grad_norm": 0.22863556055490208, + "learning_rate": 7.482117680234072e-05, + "loss": 0.89, + "step": 18658 + }, + { + "epoch": 3.9801621160409555, + "grad_norm": 0.1834654313673168, + "learning_rate": 7.481268491570056e-05, + "loss": 0.9111, + "step": 18659 + }, + { + "epoch": 3.98037542662116, + "grad_norm": 0.22206206737113499, + "learning_rate": 7.480419308775496e-05, + "loss": 0.9286, + "step": 18660 + }, + { + "epoch": 3.9805887372013653, + "grad_norm": 0.2086557643775174, + "learning_rate": 7.479570131859999e-05, + "loss": 0.9143, + "step": 18661 + }, + { + "epoch": 3.98080204778157, + "grad_norm": 0.20907504714219005, + "learning_rate": 7.478720960833172e-05, + "loss": 0.9302, + "step": 18662 + }, + { + "epoch": 3.9810153583617747, + "grad_norm": 0.23812843350008817, + "learning_rate": 7.47787179570463e-05, + "loss": 0.936, + "step": 18663 + }, + { + "epoch": 3.98122866894198, + "grad_norm": 0.20111019087247187, + "learning_rate": 7.477022636483973e-05, + "loss": 0.8984, + "step": 18664 + }, + { + "epoch": 3.9814419795221845, + "grad_norm": 0.1805349963907811, + "learning_rate": 7.476173483180818e-05, + "loss": 0.9346, + "step": 18665 + }, + { + "epoch": 3.981655290102389, + "grad_norm": 0.2675287374691009, + "learning_rate": 7.475324335804766e-05, + "loss": 0.9021, + "step": 18666 + }, + { + "epoch": 3.981868600682594, + "grad_norm": 0.24330359226489848, + "learning_rate": 7.474475194365429e-05, + "loss": 0.9065, + "step": 18667 + }, + { + "epoch": 3.9820819112627985, + "grad_norm": 0.2126614538239932, + "learning_rate": 7.473626058872414e-05, + "loss": 0.9333, + "step": 18668 + }, + { + "epoch": 3.982295221843003, + "grad_norm": 0.18122499964073605, + "learning_rate": 7.47277692933533e-05, + "loss": 0.903, + "step": 18669 + }, + { + "epoch": 3.9825085324232083, + "grad_norm": 0.16996728320733337, + "learning_rate": 7.471927805763784e-05, + "loss": 0.8953, + "step": 18670 + }, + { + "epoch": 3.982721843003413, + "grad_norm": 0.180674918896848, + "learning_rate": 7.471078688167381e-05, + "loss": 0.8893, + "step": 18671 + }, + { + "epoch": 3.9829351535836177, + "grad_norm": 0.2488623659359621, + "learning_rate": 7.470229576555735e-05, + "loss": 0.9501, + "step": 18672 + }, + { + "epoch": 3.9831484641638224, + "grad_norm": 0.24398764315172627, + "learning_rate": 7.46938047093845e-05, + "loss": 0.9056, + "step": 18673 + }, + { + "epoch": 3.9833617747440275, + "grad_norm": 0.20032509952068978, + "learning_rate": 7.468531371325131e-05, + "loss": 0.9337, + "step": 18674 + }, + { + "epoch": 3.983575085324232, + "grad_norm": 0.1624440242271978, + "learning_rate": 7.46768227772539e-05, + "loss": 0.9215, + "step": 18675 + }, + { + "epoch": 3.983788395904437, + "grad_norm": 0.2335830291001403, + "learning_rate": 7.466833190148832e-05, + "loss": 0.9048, + "step": 18676 + }, + { + "epoch": 3.9840017064846416, + "grad_norm": 0.254170415102033, + "learning_rate": 7.465984108605068e-05, + "loss": 0.9294, + "step": 18677 + }, + { + "epoch": 3.9842150170648463, + "grad_norm": 0.19553686504355952, + "learning_rate": 7.4651350331037e-05, + "loss": 0.9111, + "step": 18678 + }, + { + "epoch": 3.984428327645051, + "grad_norm": 0.16453722397904505, + "learning_rate": 7.46428596365434e-05, + "loss": 0.8794, + "step": 18679 + }, + { + "epoch": 3.984641638225256, + "grad_norm": 0.24927286285815908, + "learning_rate": 7.463436900266591e-05, + "loss": 0.907, + "step": 18680 + }, + { + "epoch": 3.9848549488054608, + "grad_norm": 0.2515535496155219, + "learning_rate": 7.462587842950065e-05, + "loss": 0.8938, + "step": 18681 + }, + { + "epoch": 3.9850682593856654, + "grad_norm": 0.24429087239533118, + "learning_rate": 7.461738791714367e-05, + "loss": 0.9021, + "step": 18682 + }, + { + "epoch": 3.9852815699658706, + "grad_norm": 0.18127379648204148, + "learning_rate": 7.460889746569101e-05, + "loss": 0.9262, + "step": 18683 + }, + { + "epoch": 3.9854948805460753, + "grad_norm": 0.1941655812406831, + "learning_rate": 7.460040707523879e-05, + "loss": 0.9404, + "step": 18684 + }, + { + "epoch": 3.98570819112628, + "grad_norm": 0.3198877970297564, + "learning_rate": 7.459191674588304e-05, + "loss": 0.9703, + "step": 18685 + }, + { + "epoch": 3.9859215017064846, + "grad_norm": 0.3945553322405024, + "learning_rate": 7.458342647771984e-05, + "loss": 0.9717, + "step": 18686 + }, + { + "epoch": 3.9861348122866893, + "grad_norm": 0.3554391915603964, + "learning_rate": 7.457493627084527e-05, + "loss": 0.8896, + "step": 18687 + }, + { + "epoch": 3.986348122866894, + "grad_norm": 0.2096390549153505, + "learning_rate": 7.456644612535538e-05, + "loss": 0.9314, + "step": 18688 + }, + { + "epoch": 3.986561433447099, + "grad_norm": 0.23441715795929083, + "learning_rate": 7.455795604134625e-05, + "loss": 0.8714, + "step": 18689 + }, + { + "epoch": 3.986774744027304, + "grad_norm": 0.3817240809166655, + "learning_rate": 7.454946601891394e-05, + "loss": 0.9293, + "step": 18690 + }, + { + "epoch": 3.9869880546075085, + "grad_norm": 0.38864436405181024, + "learning_rate": 7.454097605815451e-05, + "loss": 0.878, + "step": 18691 + }, + { + "epoch": 3.987201365187713, + "grad_norm": 0.30968730386003995, + "learning_rate": 7.453248615916402e-05, + "loss": 0.9375, + "step": 18692 + }, + { + "epoch": 3.9874146757679183, + "grad_norm": 0.23429581852321332, + "learning_rate": 7.452399632203856e-05, + "loss": 0.9169, + "step": 18693 + }, + { + "epoch": 3.987627986348123, + "grad_norm": 0.3312577485698021, + "learning_rate": 7.451550654687418e-05, + "loss": 0.9434, + "step": 18694 + }, + { + "epoch": 3.9878412969283277, + "grad_norm": 0.4570285930792127, + "learning_rate": 7.450701683376691e-05, + "loss": 0.8827, + "step": 18695 + }, + { + "epoch": 3.9880546075085324, + "grad_norm": 0.37349601501362534, + "learning_rate": 7.449852718281284e-05, + "loss": 0.8958, + "step": 18696 + }, + { + "epoch": 3.988267918088737, + "grad_norm": 0.18579450488354046, + "learning_rate": 7.449003759410804e-05, + "loss": 0.8724, + "step": 18697 + }, + { + "epoch": 3.9884812286689417, + "grad_norm": 0.2661550654449217, + "learning_rate": 7.448154806774854e-05, + "loss": 0.925, + "step": 18698 + }, + { + "epoch": 3.988694539249147, + "grad_norm": 0.3363200861934729, + "learning_rate": 7.447305860383044e-05, + "loss": 0.9184, + "step": 18699 + }, + { + "epoch": 3.9889078498293515, + "grad_norm": 0.36820891715748494, + "learning_rate": 7.446456920244975e-05, + "loss": 0.8995, + "step": 18700 + }, + { + "epoch": 3.989121160409556, + "grad_norm": 0.24688611828743112, + "learning_rate": 7.445607986370257e-05, + "loss": 0.9041, + "step": 18701 + }, + { + "epoch": 3.9893344709897613, + "grad_norm": 0.20301655258394047, + "learning_rate": 7.444759058768492e-05, + "loss": 0.9066, + "step": 18702 + }, + { + "epoch": 3.989547781569966, + "grad_norm": 0.3472005757675432, + "learning_rate": 7.44391013744929e-05, + "loss": 0.9056, + "step": 18703 + }, + { + "epoch": 3.9897610921501707, + "grad_norm": 0.3364971622538116, + "learning_rate": 7.443061222422256e-05, + "loss": 0.9477, + "step": 18704 + }, + { + "epoch": 3.9899744027303754, + "grad_norm": 0.20620608996954612, + "learning_rate": 7.442212313696989e-05, + "loss": 0.9029, + "step": 18705 + }, + { + "epoch": 3.99018771331058, + "grad_norm": 0.19886649656837072, + "learning_rate": 7.441363411283102e-05, + "loss": 0.9106, + "step": 18706 + }, + { + "epoch": 3.9904010238907848, + "grad_norm": 0.30754494820672923, + "learning_rate": 7.440514515190195e-05, + "loss": 0.9221, + "step": 18707 + }, + { + "epoch": 3.99061433447099, + "grad_norm": 0.28247998611871233, + "learning_rate": 7.439665625427877e-05, + "loss": 0.9023, + "step": 18708 + }, + { + "epoch": 3.9908276450511946, + "grad_norm": 0.19181859772174534, + "learning_rate": 7.438816742005752e-05, + "loss": 0.9203, + "step": 18709 + }, + { + "epoch": 3.9910409556313993, + "grad_norm": 0.19315338730046444, + "learning_rate": 7.437967864933424e-05, + "loss": 0.9079, + "step": 18710 + }, + { + "epoch": 3.991254266211604, + "grad_norm": 0.27067170182747097, + "learning_rate": 7.4371189942205e-05, + "loss": 0.9438, + "step": 18711 + }, + { + "epoch": 3.991467576791809, + "grad_norm": 0.36265922208999696, + "learning_rate": 7.436270129876584e-05, + "loss": 0.9021, + "step": 18712 + }, + { + "epoch": 3.9916808873720138, + "grad_norm": 0.2751139361516766, + "learning_rate": 7.43542127191128e-05, + "loss": 0.9188, + "step": 18713 + }, + { + "epoch": 3.9918941979522184, + "grad_norm": 0.1901240791248012, + "learning_rate": 7.434572420334194e-05, + "loss": 0.9339, + "step": 18714 + }, + { + "epoch": 3.992107508532423, + "grad_norm": 0.25382606827242027, + "learning_rate": 7.433723575154933e-05, + "loss": 0.9372, + "step": 18715 + }, + { + "epoch": 3.992320819112628, + "grad_norm": 0.2955292747106926, + "learning_rate": 7.432874736383099e-05, + "loss": 0.9371, + "step": 18716 + }, + { + "epoch": 3.9925341296928325, + "grad_norm": 0.2439440797961598, + "learning_rate": 7.432025904028294e-05, + "loss": 0.9281, + "step": 18717 + }, + { + "epoch": 3.9927474402730376, + "grad_norm": 0.22853219446017356, + "learning_rate": 7.431177078100128e-05, + "loss": 0.9262, + "step": 18718 + }, + { + "epoch": 3.9929607508532423, + "grad_norm": 0.20697021604714674, + "learning_rate": 7.4303282586082e-05, + "loss": 0.9636, + "step": 18719 + }, + { + "epoch": 3.993174061433447, + "grad_norm": 0.2618233125127848, + "learning_rate": 7.42947944556212e-05, + "loss": 0.9116, + "step": 18720 + }, + { + "epoch": 3.993387372013652, + "grad_norm": 0.3274937559591442, + "learning_rate": 7.428630638971488e-05, + "loss": 0.9297, + "step": 18721 + }, + { + "epoch": 3.993600682593857, + "grad_norm": 0.21291439496789621, + "learning_rate": 7.427781838845911e-05, + "loss": 0.8911, + "step": 18722 + }, + { + "epoch": 3.9938139931740615, + "grad_norm": 0.27535869739153457, + "learning_rate": 7.426933045194992e-05, + "loss": 0.9114, + "step": 18723 + }, + { + "epoch": 3.994027303754266, + "grad_norm": 0.37212069736732734, + "learning_rate": 7.426084258028333e-05, + "loss": 0.9088, + "step": 18724 + }, + { + "epoch": 3.994240614334471, + "grad_norm": 0.32158073450593283, + "learning_rate": 7.425235477355544e-05, + "loss": 0.9389, + "step": 18725 + }, + { + "epoch": 3.9944539249146755, + "grad_norm": 0.2585842191284546, + "learning_rate": 7.424386703186227e-05, + "loss": 0.9066, + "step": 18726 + }, + { + "epoch": 3.9946672354948807, + "grad_norm": 0.192689852358035, + "learning_rate": 7.423537935529979e-05, + "loss": 0.9208, + "step": 18727 + }, + { + "epoch": 3.9948805460750854, + "grad_norm": 0.30742393613209484, + "learning_rate": 7.42268917439641e-05, + "loss": 0.9346, + "step": 18728 + }, + { + "epoch": 3.99509385665529, + "grad_norm": 0.30401302516637196, + "learning_rate": 7.421840419795123e-05, + "loss": 0.9237, + "step": 18729 + }, + { + "epoch": 3.9953071672354947, + "grad_norm": 0.2500249325526292, + "learning_rate": 7.420991671735722e-05, + "loss": 0.8927, + "step": 18730 + }, + { + "epoch": 3.9955204778157, + "grad_norm": 0.2195850441622297, + "learning_rate": 7.420142930227808e-05, + "loss": 0.8928, + "step": 18731 + }, + { + "epoch": 3.9957337883959045, + "grad_norm": 0.174040503830858, + "learning_rate": 7.41929419528099e-05, + "loss": 0.8874, + "step": 18732 + }, + { + "epoch": 3.9959470989761092, + "grad_norm": 0.17071578238932825, + "learning_rate": 7.418445466904866e-05, + "loss": 0.8746, + "step": 18733 + }, + { + "epoch": 3.996160409556314, + "grad_norm": 0.23017124698976651, + "learning_rate": 7.417596745109042e-05, + "loss": 0.935, + "step": 18734 + }, + { + "epoch": 3.9963737201365186, + "grad_norm": 0.17497691191561351, + "learning_rate": 7.41674802990312e-05, + "loss": 0.8979, + "step": 18735 + }, + { + "epoch": 3.9965870307167233, + "grad_norm": 0.17717715536831213, + "learning_rate": 7.415899321296702e-05, + "loss": 0.9104, + "step": 18736 + }, + { + "epoch": 3.9968003412969284, + "grad_norm": 0.20435863356091388, + "learning_rate": 7.415050619299396e-05, + "loss": 0.9237, + "step": 18737 + }, + { + "epoch": 3.997013651877133, + "grad_norm": 0.1601874704363427, + "learning_rate": 7.414201923920804e-05, + "loss": 0.9321, + "step": 18738 + }, + { + "epoch": 3.9972269624573378, + "grad_norm": 0.1990119336534985, + "learning_rate": 7.413353235170523e-05, + "loss": 0.9399, + "step": 18739 + }, + { + "epoch": 3.997440273037543, + "grad_norm": 0.17941484935030164, + "learning_rate": 7.412504553058161e-05, + "loss": 0.9061, + "step": 18740 + }, + { + "epoch": 3.9976535836177476, + "grad_norm": 0.18887897688228156, + "learning_rate": 7.411655877593319e-05, + "loss": 0.8944, + "step": 18741 + }, + { + "epoch": 3.9978668941979523, + "grad_norm": 0.1867045583949194, + "learning_rate": 7.410807208785602e-05, + "loss": 0.9264, + "step": 18742 + }, + { + "epoch": 3.998080204778157, + "grad_norm": 0.1821039824902216, + "learning_rate": 7.409958546644611e-05, + "loss": 0.906, + "step": 18743 + }, + { + "epoch": 3.9982935153583616, + "grad_norm": 0.19277174055630847, + "learning_rate": 7.409109891179949e-05, + "loss": 0.923, + "step": 18744 + }, + { + "epoch": 3.9985068259385663, + "grad_norm": 0.18463621084315057, + "learning_rate": 7.408261242401219e-05, + "loss": 0.915, + "step": 18745 + }, + { + "epoch": 3.9987201365187715, + "grad_norm": 0.16442158992388722, + "learning_rate": 7.407412600318022e-05, + "loss": 0.9168, + "step": 18746 + }, + { + "epoch": 3.998933447098976, + "grad_norm": 0.21043792489855892, + "learning_rate": 7.406563964939961e-05, + "loss": 0.9449, + "step": 18747 + }, + { + "epoch": 3.999146757679181, + "grad_norm": 0.1812699188223429, + "learning_rate": 7.405715336276639e-05, + "loss": 0.9051, + "step": 18748 + }, + { + "epoch": 3.9993600682593855, + "grad_norm": 0.1798026297411328, + "learning_rate": 7.404866714337661e-05, + "loss": 0.907, + "step": 18749 + }, + { + "epoch": 3.9995733788395906, + "grad_norm": 0.18929727204686922, + "learning_rate": 7.404018099132624e-05, + "loss": 0.9066, + "step": 18750 + }, + { + "epoch": 3.9997866894197953, + "grad_norm": 0.1959547299409873, + "learning_rate": 7.403169490671131e-05, + "loss": 0.922, + "step": 18751 + }, + { + "epoch": 4.0, + "grad_norm": 0.1720523557404637, + "learning_rate": 7.402320888962788e-05, + "loss": 0.9124, + "step": 18752 + }, + { + "epoch": 4.000213310580205, + "grad_norm": 0.22765291932640358, + "learning_rate": 7.40147229401719e-05, + "loss": 0.9197, + "step": 18753 + }, + { + "epoch": 4.000426621160409, + "grad_norm": 0.1697618624030363, + "learning_rate": 7.400623705843946e-05, + "loss": 0.8799, + "step": 18754 + }, + { + "epoch": 4.000639931740614, + "grad_norm": 0.2103492161248223, + "learning_rate": 7.399775124452654e-05, + "loss": 0.9068, + "step": 18755 + }, + { + "epoch": 4.000853242320819, + "grad_norm": 0.19616229786484474, + "learning_rate": 7.398926549852917e-05, + "loss": 0.9014, + "step": 18756 + }, + { + "epoch": 4.001066552901024, + "grad_norm": 0.17867898860215328, + "learning_rate": 7.398077982054336e-05, + "loss": 0.9073, + "step": 18757 + }, + { + "epoch": 4.001279863481229, + "grad_norm": 0.20864791139403027, + "learning_rate": 7.397229421066512e-05, + "loss": 0.8927, + "step": 18758 + }, + { + "epoch": 4.001493174061434, + "grad_norm": 0.1942568524669516, + "learning_rate": 7.396380866899049e-05, + "loss": 0.9095, + "step": 18759 + }, + { + "epoch": 4.001706484641638, + "grad_norm": 0.19832400092996802, + "learning_rate": 7.395532319561547e-05, + "loss": 0.8789, + "step": 18760 + }, + { + "epoch": 4.001919795221843, + "grad_norm": 0.23650393460717747, + "learning_rate": 7.394683779063606e-05, + "loss": 0.8815, + "step": 18761 + }, + { + "epoch": 4.002133105802048, + "grad_norm": 0.19225676151797502, + "learning_rate": 7.393835245414827e-05, + "loss": 0.923, + "step": 18762 + }, + { + "epoch": 4.002346416382252, + "grad_norm": 0.19970582656064084, + "learning_rate": 7.392986718624813e-05, + "loss": 0.9153, + "step": 18763 + }, + { + "epoch": 4.002559726962457, + "grad_norm": 0.17838781216510852, + "learning_rate": 7.392138198703166e-05, + "loss": 0.8911, + "step": 18764 + }, + { + "epoch": 4.002773037542662, + "grad_norm": 0.19994366775495942, + "learning_rate": 7.391289685659483e-05, + "loss": 0.8873, + "step": 18765 + }, + { + "epoch": 4.0029863481228665, + "grad_norm": 0.16652958556240596, + "learning_rate": 7.390441179503369e-05, + "loss": 0.9058, + "step": 18766 + }, + { + "epoch": 4.003199658703072, + "grad_norm": 0.20821304285954514, + "learning_rate": 7.389592680244422e-05, + "loss": 0.913, + "step": 18767 + }, + { + "epoch": 4.003412969283277, + "grad_norm": 0.203100976136041, + "learning_rate": 7.388744187892246e-05, + "loss": 0.9051, + "step": 18768 + }, + { + "epoch": 4.003626279863481, + "grad_norm": 0.1672095848946508, + "learning_rate": 7.387895702456439e-05, + "loss": 0.928, + "step": 18769 + }, + { + "epoch": 4.003839590443686, + "grad_norm": 0.21874516176926967, + "learning_rate": 7.387047223946602e-05, + "loss": 0.8753, + "step": 18770 + }, + { + "epoch": 4.004052901023891, + "grad_norm": 0.1839567150173498, + "learning_rate": 7.386198752372337e-05, + "loss": 0.8985, + "step": 18771 + }, + { + "epoch": 4.0042662116040955, + "grad_norm": 0.1946515875014558, + "learning_rate": 7.385350287743244e-05, + "loss": 0.9017, + "step": 18772 + }, + { + "epoch": 4.0044795221843, + "grad_norm": 0.20741801103498309, + "learning_rate": 7.38450183006892e-05, + "loss": 0.9138, + "step": 18773 + }, + { + "epoch": 4.004692832764505, + "grad_norm": 0.1704922366723106, + "learning_rate": 7.383653379358971e-05, + "loss": 0.8525, + "step": 18774 + }, + { + "epoch": 4.0049061433447095, + "grad_norm": 0.18120529861602594, + "learning_rate": 7.382804935622992e-05, + "loss": 0.8793, + "step": 18775 + }, + { + "epoch": 4.005119453924915, + "grad_norm": 0.1641336469972174, + "learning_rate": 7.381956498870586e-05, + "loss": 0.8919, + "step": 18776 + }, + { + "epoch": 4.00533276450512, + "grad_norm": 0.18353109505464307, + "learning_rate": 7.381108069111353e-05, + "loss": 0.9043, + "step": 18777 + }, + { + "epoch": 4.0055460750853245, + "grad_norm": 0.21322272690434602, + "learning_rate": 7.380259646354893e-05, + "loss": 0.9194, + "step": 18778 + }, + { + "epoch": 4.005759385665529, + "grad_norm": 0.1764846872541533, + "learning_rate": 7.379411230610806e-05, + "loss": 0.9084, + "step": 18779 + }, + { + "epoch": 4.005972696245734, + "grad_norm": 0.15845799313853798, + "learning_rate": 7.37856282188869e-05, + "loss": 0.8862, + "step": 18780 + }, + { + "epoch": 4.0061860068259385, + "grad_norm": 0.17735516792671352, + "learning_rate": 7.377714420198149e-05, + "loss": 0.8922, + "step": 18781 + }, + { + "epoch": 4.006399317406143, + "grad_norm": 0.16906775024127, + "learning_rate": 7.376866025548779e-05, + "loss": 0.9362, + "step": 18782 + }, + { + "epoch": 4.006612627986348, + "grad_norm": 0.26285312294737423, + "learning_rate": 7.376017637950178e-05, + "loss": 0.9445, + "step": 18783 + }, + { + "epoch": 4.006825938566553, + "grad_norm": 0.2430634189920027, + "learning_rate": 7.375169257411949e-05, + "loss": 0.8867, + "step": 18784 + }, + { + "epoch": 4.007039249146757, + "grad_norm": 0.20551377656260003, + "learning_rate": 7.374320883943689e-05, + "loss": 0.8957, + "step": 18785 + }, + { + "epoch": 4.007252559726963, + "grad_norm": 0.22417348937682502, + "learning_rate": 7.373472517555002e-05, + "loss": 0.8817, + "step": 18786 + }, + { + "epoch": 4.0074658703071675, + "grad_norm": 0.23758690450239892, + "learning_rate": 7.37262415825548e-05, + "loss": 0.8925, + "step": 18787 + }, + { + "epoch": 4.007679180887372, + "grad_norm": 0.29478835881319143, + "learning_rate": 7.371775806054729e-05, + "loss": 0.9236, + "step": 18788 + }, + { + "epoch": 4.007892491467577, + "grad_norm": 0.33018291929493343, + "learning_rate": 7.370927460962343e-05, + "loss": 0.8925, + "step": 18789 + }, + { + "epoch": 4.0081058020477816, + "grad_norm": 0.23152000848642887, + "learning_rate": 7.370079122987927e-05, + "loss": 0.9111, + "step": 18790 + }, + { + "epoch": 4.008319112627986, + "grad_norm": 0.2199306582833486, + "learning_rate": 7.369230792141074e-05, + "loss": 0.9014, + "step": 18791 + }, + { + "epoch": 4.008532423208191, + "grad_norm": 0.33216924560815975, + "learning_rate": 7.368382468431385e-05, + "loss": 0.8842, + "step": 18792 + }, + { + "epoch": 4.008745733788396, + "grad_norm": 0.3065794464957261, + "learning_rate": 7.367534151868462e-05, + "loss": 0.9097, + "step": 18793 + }, + { + "epoch": 4.0089590443686, + "grad_norm": 0.2357815395544929, + "learning_rate": 7.366685842461898e-05, + "loss": 0.884, + "step": 18794 + }, + { + "epoch": 4.009172354948806, + "grad_norm": 0.2444097822239271, + "learning_rate": 7.365837540221293e-05, + "loss": 0.8892, + "step": 18795 + }, + { + "epoch": 4.0093856655290105, + "grad_norm": 0.20552795219939998, + "learning_rate": 7.364989245156249e-05, + "loss": 0.912, + "step": 18796 + }, + { + "epoch": 4.009598976109215, + "grad_norm": 0.2266720578573887, + "learning_rate": 7.364140957276361e-05, + "loss": 0.9233, + "step": 18797 + }, + { + "epoch": 4.00981228668942, + "grad_norm": 0.24637640177047324, + "learning_rate": 7.36329267659123e-05, + "loss": 0.8883, + "step": 18798 + }, + { + "epoch": 4.010025597269625, + "grad_norm": 0.23030997115731808, + "learning_rate": 7.362444403110453e-05, + "loss": 0.8823, + "step": 18799 + }, + { + "epoch": 4.010238907849829, + "grad_norm": 0.21051015052993174, + "learning_rate": 7.361596136843627e-05, + "loss": 0.8731, + "step": 18800 + }, + { + "epoch": 4.010452218430034, + "grad_norm": 0.20842454960205986, + "learning_rate": 7.360747877800352e-05, + "loss": 0.9052, + "step": 18801 + }, + { + "epoch": 4.010665529010239, + "grad_norm": 0.20966419927945087, + "learning_rate": 7.359899625990226e-05, + "loss": 0.9282, + "step": 18802 + }, + { + "epoch": 4.010878839590443, + "grad_norm": 0.2265319357465981, + "learning_rate": 7.359051381422847e-05, + "loss": 0.9071, + "step": 18803 + }, + { + "epoch": 4.011092150170649, + "grad_norm": 0.19095841587086382, + "learning_rate": 7.358203144107811e-05, + "loss": 0.8837, + "step": 18804 + }, + { + "epoch": 4.011305460750854, + "grad_norm": 0.21634173227657016, + "learning_rate": 7.35735491405472e-05, + "loss": 0.8853, + "step": 18805 + }, + { + "epoch": 4.011518771331058, + "grad_norm": 0.19295550170048462, + "learning_rate": 7.356506691273168e-05, + "loss": 0.9178, + "step": 18806 + }, + { + "epoch": 4.011732081911263, + "grad_norm": 0.2004722117113387, + "learning_rate": 7.355658475772752e-05, + "loss": 0.9388, + "step": 18807 + }, + { + "epoch": 4.011945392491468, + "grad_norm": 0.2680909963466983, + "learning_rate": 7.354810267563071e-05, + "loss": 0.9096, + "step": 18808 + }, + { + "epoch": 4.012158703071672, + "grad_norm": 0.17487455524461382, + "learning_rate": 7.353962066653724e-05, + "loss": 0.8792, + "step": 18809 + }, + { + "epoch": 4.012372013651877, + "grad_norm": 0.19184337274884253, + "learning_rate": 7.353113873054307e-05, + "loss": 0.8807, + "step": 18810 + }, + { + "epoch": 4.012585324232082, + "grad_norm": 0.21293039384386633, + "learning_rate": 7.352265686774415e-05, + "loss": 0.9189, + "step": 18811 + }, + { + "epoch": 4.012798634812286, + "grad_norm": 0.17357450783155362, + "learning_rate": 7.35141750782365e-05, + "loss": 0.9221, + "step": 18812 + }, + { + "epoch": 4.013011945392491, + "grad_norm": 0.21975237419150087, + "learning_rate": 7.350569336211605e-05, + "loss": 0.9161, + "step": 18813 + }, + { + "epoch": 4.013225255972697, + "grad_norm": 0.1780503768604549, + "learning_rate": 7.349721171947881e-05, + "loss": 0.9121, + "step": 18814 + }, + { + "epoch": 4.013438566552901, + "grad_norm": 0.2106772592917406, + "learning_rate": 7.348873015042073e-05, + "loss": 0.9011, + "step": 18815 + }, + { + "epoch": 4.013651877133106, + "grad_norm": 0.21066862364053754, + "learning_rate": 7.34802486550378e-05, + "loss": 0.8992, + "step": 18816 + }, + { + "epoch": 4.013865187713311, + "grad_norm": 0.15779670944222657, + "learning_rate": 7.347176723342592e-05, + "loss": 0.8979, + "step": 18817 + }, + { + "epoch": 4.014078498293515, + "grad_norm": 0.23235419642622154, + "learning_rate": 7.346328588568113e-05, + "loss": 0.8811, + "step": 18818 + }, + { + "epoch": 4.01429180887372, + "grad_norm": 0.2535488329164307, + "learning_rate": 7.345480461189934e-05, + "loss": 0.8819, + "step": 18819 + }, + { + "epoch": 4.014505119453925, + "grad_norm": 0.18246850355618446, + "learning_rate": 7.344632341217657e-05, + "loss": 0.8922, + "step": 18820 + }, + { + "epoch": 4.014718430034129, + "grad_norm": 0.24522506720531126, + "learning_rate": 7.343784228660875e-05, + "loss": 0.9141, + "step": 18821 + }, + { + "epoch": 4.014931740614334, + "grad_norm": 0.22545591589709285, + "learning_rate": 7.342936123529186e-05, + "loss": 0.879, + "step": 18822 + }, + { + "epoch": 4.01514505119454, + "grad_norm": 0.22698938252490203, + "learning_rate": 7.342088025832185e-05, + "loss": 0.9198, + "step": 18823 + }, + { + "epoch": 4.015358361774744, + "grad_norm": 0.18343127434594628, + "learning_rate": 7.341239935579471e-05, + "loss": 0.9254, + "step": 18824 + }, + { + "epoch": 4.015571672354949, + "grad_norm": 0.18044330515855853, + "learning_rate": 7.340391852780636e-05, + "loss": 0.9163, + "step": 18825 + }, + { + "epoch": 4.015784982935154, + "grad_norm": 0.7618943289615181, + "learning_rate": 7.339543777445281e-05, + "loss": 0.9257, + "step": 18826 + }, + { + "epoch": 4.015998293515358, + "grad_norm": 0.19387061866641692, + "learning_rate": 7.338695709582999e-05, + "loss": 0.8973, + "step": 18827 + }, + { + "epoch": 4.016211604095563, + "grad_norm": 0.34714848992114133, + "learning_rate": 7.337847649203386e-05, + "loss": 0.9519, + "step": 18828 + }, + { + "epoch": 4.016424914675768, + "grad_norm": 0.2323091145364031, + "learning_rate": 7.336999596316035e-05, + "loss": 0.8935, + "step": 18829 + }, + { + "epoch": 4.0166382252559725, + "grad_norm": 0.2012698188472845, + "learning_rate": 7.336151550930547e-05, + "loss": 0.9069, + "step": 18830 + }, + { + "epoch": 4.016851535836177, + "grad_norm": 0.18111660838970747, + "learning_rate": 7.335303513056515e-05, + "loss": 0.8741, + "step": 18831 + }, + { + "epoch": 4.017064846416382, + "grad_norm": 0.22972442699120504, + "learning_rate": 7.334455482703534e-05, + "loss": 0.8809, + "step": 18832 + }, + { + "epoch": 4.017278156996587, + "grad_norm": 0.2333478786769796, + "learning_rate": 7.333607459881201e-05, + "loss": 0.8933, + "step": 18833 + }, + { + "epoch": 4.017491467576792, + "grad_norm": 0.21939514001448823, + "learning_rate": 7.332759444599113e-05, + "loss": 0.9126, + "step": 18834 + }, + { + "epoch": 4.017704778156997, + "grad_norm": 0.20088614264886373, + "learning_rate": 7.331911436866858e-05, + "loss": 0.8832, + "step": 18835 + }, + { + "epoch": 4.0179180887372015, + "grad_norm": 0.23824943934147802, + "learning_rate": 7.331063436694041e-05, + "loss": 0.925, + "step": 18836 + }, + { + "epoch": 4.018131399317406, + "grad_norm": 0.18304717636130094, + "learning_rate": 7.33021544409025e-05, + "loss": 0.9152, + "step": 18837 + }, + { + "epoch": 4.018344709897611, + "grad_norm": 0.22941857231879115, + "learning_rate": 7.329367459065087e-05, + "loss": 0.9172, + "step": 18838 + }, + { + "epoch": 4.0185580204778155, + "grad_norm": 0.20739233867053875, + "learning_rate": 7.32851948162814e-05, + "loss": 0.8794, + "step": 18839 + }, + { + "epoch": 4.01877133105802, + "grad_norm": 0.18264682413422825, + "learning_rate": 7.327671511789004e-05, + "loss": 0.8781, + "step": 18840 + }, + { + "epoch": 4.018984641638225, + "grad_norm": 0.19160572648773316, + "learning_rate": 7.326823549557277e-05, + "loss": 0.8914, + "step": 18841 + }, + { + "epoch": 4.0191979522184305, + "grad_norm": 0.1756704313278657, + "learning_rate": 7.325975594942555e-05, + "loss": 0.9277, + "step": 18842 + }, + { + "epoch": 4.019411262798635, + "grad_norm": 0.17986613766178952, + "learning_rate": 7.325127647954426e-05, + "loss": 0.8803, + "step": 18843 + }, + { + "epoch": 4.01962457337884, + "grad_norm": 0.18816350586227, + "learning_rate": 7.324279708602492e-05, + "loss": 0.9031, + "step": 18844 + }, + { + "epoch": 4.0198378839590445, + "grad_norm": 0.19085867999100628, + "learning_rate": 7.323431776896344e-05, + "loss": 0.9151, + "step": 18845 + }, + { + "epoch": 4.020051194539249, + "grad_norm": 0.19545268444124986, + "learning_rate": 7.322583852845578e-05, + "loss": 0.9233, + "step": 18846 + }, + { + "epoch": 4.020264505119454, + "grad_norm": 0.17511453930203294, + "learning_rate": 7.321735936459785e-05, + "loss": 0.8813, + "step": 18847 + }, + { + "epoch": 4.020477815699659, + "grad_norm": 0.14953272355262845, + "learning_rate": 7.320888027748564e-05, + "loss": 0.909, + "step": 18848 + }, + { + "epoch": 4.020691126279863, + "grad_norm": 0.1658437055836705, + "learning_rate": 7.320040126721507e-05, + "loss": 0.8961, + "step": 18849 + }, + { + "epoch": 4.020904436860068, + "grad_norm": 0.1711159937261424, + "learning_rate": 7.319192233388205e-05, + "loss": 0.9005, + "step": 18850 + }, + { + "epoch": 4.021117747440273, + "grad_norm": 0.192816338311827, + "learning_rate": 7.318344347758253e-05, + "loss": 0.9025, + "step": 18851 + }, + { + "epoch": 4.021331058020478, + "grad_norm": 0.19234284007359098, + "learning_rate": 7.317496469841249e-05, + "loss": 0.9267, + "step": 18852 + }, + { + "epoch": 4.021544368600683, + "grad_norm": 0.18721336653333012, + "learning_rate": 7.316648599646783e-05, + "loss": 0.9111, + "step": 18853 + }, + { + "epoch": 4.021757679180888, + "grad_norm": 0.15017907638007053, + "learning_rate": 7.315800737184449e-05, + "loss": 0.8759, + "step": 18854 + }, + { + "epoch": 4.021970989761092, + "grad_norm": 0.18834327655437957, + "learning_rate": 7.31495288246384e-05, + "loss": 0.9116, + "step": 18855 + }, + { + "epoch": 4.022184300341297, + "grad_norm": 0.24294320450717746, + "learning_rate": 7.314105035494554e-05, + "loss": 0.8971, + "step": 18856 + }, + { + "epoch": 4.022397610921502, + "grad_norm": 0.27251838084810365, + "learning_rate": 7.313257196286177e-05, + "loss": 0.8996, + "step": 18857 + }, + { + "epoch": 4.022610921501706, + "grad_norm": 0.24035027637072728, + "learning_rate": 7.312409364848309e-05, + "loss": 0.9042, + "step": 18858 + }, + { + "epoch": 4.022824232081911, + "grad_norm": 0.187115831838777, + "learning_rate": 7.311561541190539e-05, + "loss": 0.9347, + "step": 18859 + }, + { + "epoch": 4.023037542662116, + "grad_norm": 0.1575462471575112, + "learning_rate": 7.310713725322464e-05, + "loss": 0.8902, + "step": 18860 + }, + { + "epoch": 4.023250853242321, + "grad_norm": 0.18908700205838558, + "learning_rate": 7.309865917253677e-05, + "loss": 0.8801, + "step": 18861 + }, + { + "epoch": 4.023464163822526, + "grad_norm": 0.15247005981398135, + "learning_rate": 7.309018116993766e-05, + "loss": 0.8908, + "step": 18862 + }, + { + "epoch": 4.023677474402731, + "grad_norm": 0.19888873233830912, + "learning_rate": 7.308170324552324e-05, + "loss": 0.9499, + "step": 18863 + }, + { + "epoch": 4.023890784982935, + "grad_norm": 0.17846653158864983, + "learning_rate": 7.30732253993895e-05, + "loss": 0.8702, + "step": 18864 + }, + { + "epoch": 4.02410409556314, + "grad_norm": 0.21494832618491677, + "learning_rate": 7.306474763163229e-05, + "loss": 0.9102, + "step": 18865 + }, + { + "epoch": 4.024317406143345, + "grad_norm": 0.2675884829035356, + "learning_rate": 7.305626994234762e-05, + "loss": 0.9011, + "step": 18866 + }, + { + "epoch": 4.024530716723549, + "grad_norm": 0.1633608677283549, + "learning_rate": 7.304779233163134e-05, + "loss": 0.9004, + "step": 18867 + }, + { + "epoch": 4.024744027303754, + "grad_norm": 0.18858399924938885, + "learning_rate": 7.303931479957943e-05, + "loss": 0.8809, + "step": 18868 + }, + { + "epoch": 4.024957337883959, + "grad_norm": 0.18192840203117328, + "learning_rate": 7.303083734628778e-05, + "loss": 0.9057, + "step": 18869 + }, + { + "epoch": 4.025170648464163, + "grad_norm": 0.16468163936446134, + "learning_rate": 7.302235997185233e-05, + "loss": 0.8751, + "step": 18870 + }, + { + "epoch": 4.025383959044369, + "grad_norm": 0.16346382813778823, + "learning_rate": 7.301388267636898e-05, + "loss": 0.9313, + "step": 18871 + }, + { + "epoch": 4.025597269624574, + "grad_norm": 0.1561848840085716, + "learning_rate": 7.300540545993372e-05, + "loss": 0.9063, + "step": 18872 + }, + { + "epoch": 4.025810580204778, + "grad_norm": 0.1693156212717911, + "learning_rate": 7.299692832264235e-05, + "loss": 0.889, + "step": 18873 + }, + { + "epoch": 4.026023890784983, + "grad_norm": 0.6994209712457822, + "learning_rate": 7.298845126459088e-05, + "loss": 0.928, + "step": 18874 + }, + { + "epoch": 4.026237201365188, + "grad_norm": 0.1787657398078274, + "learning_rate": 7.297997428587518e-05, + "loss": 0.9066, + "step": 18875 + }, + { + "epoch": 4.026450511945392, + "grad_norm": 0.18380742152430776, + "learning_rate": 7.297149738659121e-05, + "loss": 0.8812, + "step": 18876 + }, + { + "epoch": 4.026663822525597, + "grad_norm": 0.19249313501405024, + "learning_rate": 7.296302056683486e-05, + "loss": 0.8821, + "step": 18877 + }, + { + "epoch": 4.026877133105802, + "grad_norm": 0.19513635331847523, + "learning_rate": 7.295454382670205e-05, + "loss": 0.8955, + "step": 18878 + }, + { + "epoch": 4.027090443686006, + "grad_norm": 0.23294501429152084, + "learning_rate": 7.294606716628869e-05, + "loss": 0.8978, + "step": 18879 + }, + { + "epoch": 4.027303754266212, + "grad_norm": 0.20802427765290515, + "learning_rate": 7.293759058569071e-05, + "loss": 0.8851, + "step": 18880 + }, + { + "epoch": 4.027517064846417, + "grad_norm": 0.2218236947839509, + "learning_rate": 7.292911408500398e-05, + "loss": 0.9146, + "step": 18881 + }, + { + "epoch": 4.027730375426621, + "grad_norm": 0.16767127529360276, + "learning_rate": 7.292063766432449e-05, + "loss": 0.9074, + "step": 18882 + }, + { + "epoch": 4.027943686006826, + "grad_norm": 0.161907071000398, + "learning_rate": 7.29121613237481e-05, + "loss": 0.8939, + "step": 18883 + }, + { + "epoch": 4.028156996587031, + "grad_norm": 0.2030070808749553, + "learning_rate": 7.290368506337071e-05, + "loss": 0.9245, + "step": 18884 + }, + { + "epoch": 4.028370307167235, + "grad_norm": 0.18925118948789446, + "learning_rate": 7.289520888328823e-05, + "loss": 0.9073, + "step": 18885 + }, + { + "epoch": 4.02858361774744, + "grad_norm": 0.21509188132646898, + "learning_rate": 7.28867327835966e-05, + "loss": 0.9113, + "step": 18886 + }, + { + "epoch": 4.028796928327645, + "grad_norm": 0.1745883254805128, + "learning_rate": 7.287825676439168e-05, + "loss": 0.8919, + "step": 18887 + }, + { + "epoch": 4.0290102389078495, + "grad_norm": 0.17275498921241841, + "learning_rate": 7.286978082576944e-05, + "loss": 0.9072, + "step": 18888 + }, + { + "epoch": 4.029223549488054, + "grad_norm": 0.17075354120053735, + "learning_rate": 7.286130496782571e-05, + "loss": 0.8855, + "step": 18889 + }, + { + "epoch": 4.02943686006826, + "grad_norm": 0.1819748015492152, + "learning_rate": 7.285282919065647e-05, + "loss": 0.8872, + "step": 18890 + }, + { + "epoch": 4.029650170648464, + "grad_norm": 0.1743136278663961, + "learning_rate": 7.284435349435756e-05, + "loss": 0.9249, + "step": 18891 + }, + { + "epoch": 4.029863481228669, + "grad_norm": 0.19649280956702578, + "learning_rate": 7.283587787902493e-05, + "loss": 0.9327, + "step": 18892 + }, + { + "epoch": 4.030076791808874, + "grad_norm": 0.16529582283612582, + "learning_rate": 7.282740234475445e-05, + "loss": 0.9031, + "step": 18893 + }, + { + "epoch": 4.0302901023890785, + "grad_norm": 0.18322453736862832, + "learning_rate": 7.281892689164206e-05, + "loss": 0.9125, + "step": 18894 + }, + { + "epoch": 4.030503412969283, + "grad_norm": 0.18745758484730013, + "learning_rate": 7.281045151978363e-05, + "loss": 0.8982, + "step": 18895 + }, + { + "epoch": 4.030716723549488, + "grad_norm": 0.19683583809529318, + "learning_rate": 7.280197622927505e-05, + "loss": 0.9055, + "step": 18896 + }, + { + "epoch": 4.0309300341296925, + "grad_norm": 0.19169261019143044, + "learning_rate": 7.279350102021221e-05, + "loss": 0.9047, + "step": 18897 + }, + { + "epoch": 4.031143344709897, + "grad_norm": 0.183940052193737, + "learning_rate": 7.278502589269106e-05, + "loss": 0.943, + "step": 18898 + }, + { + "epoch": 4.031356655290103, + "grad_norm": 0.2562466682010815, + "learning_rate": 7.277655084680743e-05, + "loss": 0.8689, + "step": 18899 + }, + { + "epoch": 4.0315699658703075, + "grad_norm": 0.23715502675275366, + "learning_rate": 7.276807588265727e-05, + "loss": 0.8913, + "step": 18900 + }, + { + "epoch": 4.031783276450512, + "grad_norm": 0.31839058488653876, + "learning_rate": 7.275960100033644e-05, + "loss": 0.8954, + "step": 18901 + }, + { + "epoch": 4.031996587030717, + "grad_norm": 0.25904083319076937, + "learning_rate": 7.275112619994086e-05, + "loss": 0.9099, + "step": 18902 + }, + { + "epoch": 4.0322098976109215, + "grad_norm": 0.18998661512316792, + "learning_rate": 7.27426514815664e-05, + "loss": 0.8981, + "step": 18903 + }, + { + "epoch": 4.032423208191126, + "grad_norm": 0.33726094144103114, + "learning_rate": 7.273417684530896e-05, + "loss": 0.8839, + "step": 18904 + }, + { + "epoch": 4.032636518771331, + "grad_norm": 0.31887416597014734, + "learning_rate": 7.272570229126447e-05, + "loss": 0.9014, + "step": 18905 + }, + { + "epoch": 4.032849829351536, + "grad_norm": 0.2572764974268434, + "learning_rate": 7.271722781952872e-05, + "loss": 0.8781, + "step": 18906 + }, + { + "epoch": 4.03306313993174, + "grad_norm": 0.21449530409320589, + "learning_rate": 7.27087534301977e-05, + "loss": 0.8904, + "step": 18907 + }, + { + "epoch": 4.033276450511946, + "grad_norm": 0.2756139962343466, + "learning_rate": 7.270027912336725e-05, + "loss": 0.8582, + "step": 18908 + }, + { + "epoch": 4.0334897610921505, + "grad_norm": 0.2654188787047054, + "learning_rate": 7.269180489913324e-05, + "loss": 0.9017, + "step": 18909 + }, + { + "epoch": 4.033703071672355, + "grad_norm": 0.27318037151588254, + "learning_rate": 7.268333075759161e-05, + "loss": 0.9203, + "step": 18910 + }, + { + "epoch": 4.03391638225256, + "grad_norm": 0.2103642514531009, + "learning_rate": 7.267485669883819e-05, + "loss": 0.9041, + "step": 18911 + }, + { + "epoch": 4.034129692832765, + "grad_norm": 0.21135023354966023, + "learning_rate": 7.266638272296892e-05, + "loss": 0.903, + "step": 18912 + }, + { + "epoch": 4.034343003412969, + "grad_norm": 0.28121581635662346, + "learning_rate": 7.265790883007961e-05, + "loss": 0.9025, + "step": 18913 + }, + { + "epoch": 4.034556313993174, + "grad_norm": 0.1621328186090075, + "learning_rate": 7.264943502026622e-05, + "loss": 0.8817, + "step": 18914 + }, + { + "epoch": 4.034769624573379, + "grad_norm": 0.21870901799538175, + "learning_rate": 7.264096129362459e-05, + "loss": 0.8889, + "step": 18915 + }, + { + "epoch": 4.034982935153583, + "grad_norm": 0.22385685215513115, + "learning_rate": 7.26324876502506e-05, + "loss": 0.9074, + "step": 18916 + }, + { + "epoch": 4.035196245733788, + "grad_norm": 0.20962552118283428, + "learning_rate": 7.262401409024017e-05, + "loss": 0.9011, + "step": 18917 + }, + { + "epoch": 4.035409556313994, + "grad_norm": 0.21757827894375142, + "learning_rate": 7.26155406136891e-05, + "loss": 0.8955, + "step": 18918 + }, + { + "epoch": 4.035622866894198, + "grad_norm": 0.20332456808728203, + "learning_rate": 7.260706722069333e-05, + "loss": 0.9076, + "step": 18919 + }, + { + "epoch": 4.035836177474403, + "grad_norm": 0.2691004613795711, + "learning_rate": 7.259859391134873e-05, + "loss": 0.9239, + "step": 18920 + }, + { + "epoch": 4.036049488054608, + "grad_norm": 0.29947821661408963, + "learning_rate": 7.259012068575114e-05, + "loss": 0.8832, + "step": 18921 + }, + { + "epoch": 4.036262798634812, + "grad_norm": 0.25227515295949854, + "learning_rate": 7.258164754399648e-05, + "loss": 0.9, + "step": 18922 + }, + { + "epoch": 4.036476109215017, + "grad_norm": 0.19038576490305337, + "learning_rate": 7.257317448618057e-05, + "loss": 0.9086, + "step": 18923 + }, + { + "epoch": 4.036689419795222, + "grad_norm": 0.21529670967499617, + "learning_rate": 7.256470151239935e-05, + "loss": 0.9003, + "step": 18924 + }, + { + "epoch": 4.036902730375426, + "grad_norm": 0.29879328464931204, + "learning_rate": 7.255622862274865e-05, + "loss": 0.8981, + "step": 18925 + }, + { + "epoch": 4.037116040955631, + "grad_norm": 0.2963241638481162, + "learning_rate": 7.254775581732434e-05, + "loss": 0.9359, + "step": 18926 + }, + { + "epoch": 4.037329351535837, + "grad_norm": 0.2154757516483228, + "learning_rate": 7.253928309622229e-05, + "loss": 0.9091, + "step": 18927 + }, + { + "epoch": 4.037542662116041, + "grad_norm": 0.17267359545820335, + "learning_rate": 7.253081045953841e-05, + "loss": 0.9167, + "step": 18928 + }, + { + "epoch": 4.037755972696246, + "grad_norm": 0.24500493143166754, + "learning_rate": 7.252233790736853e-05, + "loss": 0.9246, + "step": 18929 + }, + { + "epoch": 4.037969283276451, + "grad_norm": 0.2361718991296966, + "learning_rate": 7.25138654398085e-05, + "loss": 0.8756, + "step": 18930 + }, + { + "epoch": 4.038182593856655, + "grad_norm": 0.18648571785746215, + "learning_rate": 7.250539305695421e-05, + "loss": 0.9168, + "step": 18931 + }, + { + "epoch": 4.03839590443686, + "grad_norm": 0.19259307829662187, + "learning_rate": 7.249692075890154e-05, + "loss": 0.9083, + "step": 18932 + }, + { + "epoch": 4.038609215017065, + "grad_norm": 0.16231183533292615, + "learning_rate": 7.248844854574632e-05, + "loss": 0.9188, + "step": 18933 + }, + { + "epoch": 4.038822525597269, + "grad_norm": 0.17800393809450518, + "learning_rate": 7.247997641758444e-05, + "loss": 0.9015, + "step": 18934 + }, + { + "epoch": 4.039035836177474, + "grad_norm": 0.19626594889292273, + "learning_rate": 7.247150437451174e-05, + "loss": 0.8753, + "step": 18935 + }, + { + "epoch": 4.039249146757679, + "grad_norm": 0.16445586402829346, + "learning_rate": 7.246303241662411e-05, + "loss": 0.9089, + "step": 18936 + }, + { + "epoch": 4.039462457337884, + "grad_norm": 0.23130893164848404, + "learning_rate": 7.245456054401739e-05, + "loss": 0.8996, + "step": 18937 + }, + { + "epoch": 4.039675767918089, + "grad_norm": 0.24544014166634942, + "learning_rate": 7.244608875678745e-05, + "loss": 0.9054, + "step": 18938 + }, + { + "epoch": 4.039889078498294, + "grad_norm": 0.1798452571547201, + "learning_rate": 7.243761705503016e-05, + "loss": 0.8524, + "step": 18939 + }, + { + "epoch": 4.040102389078498, + "grad_norm": 0.16969043401831213, + "learning_rate": 7.242914543884133e-05, + "loss": 0.8971, + "step": 18940 + }, + { + "epoch": 4.040315699658703, + "grad_norm": 0.16926177832379127, + "learning_rate": 7.242067390831686e-05, + "loss": 0.9092, + "step": 18941 + }, + { + "epoch": 4.040529010238908, + "grad_norm": 0.17307195771250364, + "learning_rate": 7.241220246355258e-05, + "loss": 0.9023, + "step": 18942 + }, + { + "epoch": 4.040742320819112, + "grad_norm": 0.18102021747166383, + "learning_rate": 7.240373110464436e-05, + "loss": 0.8746, + "step": 18943 + }, + { + "epoch": 4.040955631399317, + "grad_norm": 0.19532966711585834, + "learning_rate": 7.239525983168807e-05, + "loss": 0.889, + "step": 18944 + }, + { + "epoch": 4.041168941979522, + "grad_norm": 0.15907842985292692, + "learning_rate": 7.238678864477951e-05, + "loss": 0.8945, + "step": 18945 + }, + { + "epoch": 4.041382252559727, + "grad_norm": 0.1885183976620177, + "learning_rate": 7.237831754401459e-05, + "loss": 0.9121, + "step": 18946 + }, + { + "epoch": 4.041595563139932, + "grad_norm": 0.18509682126471955, + "learning_rate": 7.236984652948912e-05, + "loss": 0.8764, + "step": 18947 + }, + { + "epoch": 4.041808873720137, + "grad_norm": 0.20816297922596053, + "learning_rate": 7.236137560129899e-05, + "loss": 0.9137, + "step": 18948 + }, + { + "epoch": 4.042022184300341, + "grad_norm": 0.19160743369266392, + "learning_rate": 7.235290475953998e-05, + "loss": 0.9018, + "step": 18949 + }, + { + "epoch": 4.042235494880546, + "grad_norm": 0.20584415697080263, + "learning_rate": 7.234443400430803e-05, + "loss": 0.8929, + "step": 18950 + }, + { + "epoch": 4.042448805460751, + "grad_norm": 0.1662282627509928, + "learning_rate": 7.233596333569892e-05, + "loss": 0.9111, + "step": 18951 + }, + { + "epoch": 4.0426621160409555, + "grad_norm": 0.22734925387764587, + "learning_rate": 7.23274927538085e-05, + "loss": 0.9132, + "step": 18952 + }, + { + "epoch": 4.04287542662116, + "grad_norm": 0.18184972595438878, + "learning_rate": 7.231902225873264e-05, + "loss": 0.9158, + "step": 18953 + }, + { + "epoch": 4.043088737201365, + "grad_norm": 0.21111916759540988, + "learning_rate": 7.231055185056717e-05, + "loss": 0.9025, + "step": 18954 + }, + { + "epoch": 4.0433020477815695, + "grad_norm": 0.2611664812752606, + "learning_rate": 7.230208152940794e-05, + "loss": 0.9138, + "step": 18955 + }, + { + "epoch": 4.043515358361775, + "grad_norm": 0.22129807245794264, + "learning_rate": 7.229361129535079e-05, + "loss": 0.9036, + "step": 18956 + }, + { + "epoch": 4.04372866894198, + "grad_norm": 0.22050534295633406, + "learning_rate": 7.228514114849154e-05, + "loss": 0.9232, + "step": 18957 + }, + { + "epoch": 4.0439419795221845, + "grad_norm": 0.19365934719164574, + "learning_rate": 7.227667108892606e-05, + "loss": 0.9071, + "step": 18958 + }, + { + "epoch": 4.044155290102389, + "grad_norm": 0.22445305133986154, + "learning_rate": 7.226820111675016e-05, + "loss": 0.9102, + "step": 18959 + }, + { + "epoch": 4.044368600682594, + "grad_norm": 0.19370107891840074, + "learning_rate": 7.225973123205971e-05, + "loss": 0.8861, + "step": 18960 + }, + { + "epoch": 4.0445819112627985, + "grad_norm": 0.22057433116657493, + "learning_rate": 7.225126143495056e-05, + "loss": 0.8849, + "step": 18961 + }, + { + "epoch": 4.044795221843003, + "grad_norm": 0.23222414405630531, + "learning_rate": 7.224279172551847e-05, + "loss": 0.8767, + "step": 18962 + }, + { + "epoch": 4.045008532423208, + "grad_norm": 0.17214745720834806, + "learning_rate": 7.223432210385935e-05, + "loss": 0.9458, + "step": 18963 + }, + { + "epoch": 4.045221843003413, + "grad_norm": 0.3206293812509373, + "learning_rate": 7.222585257006898e-05, + "loss": 0.8736, + "step": 18964 + }, + { + "epoch": 4.045435153583618, + "grad_norm": 0.2783689084546326, + "learning_rate": 7.221738312424324e-05, + "loss": 0.8853, + "step": 18965 + }, + { + "epoch": 4.045648464163823, + "grad_norm": 0.19496058145800682, + "learning_rate": 7.220891376647793e-05, + "loss": 0.8785, + "step": 18966 + }, + { + "epoch": 4.0458617747440275, + "grad_norm": 0.21890462015814885, + "learning_rate": 7.22004444968689e-05, + "loss": 0.9327, + "step": 18967 + }, + { + "epoch": 4.046075085324232, + "grad_norm": 0.3039623433441697, + "learning_rate": 7.219197531551197e-05, + "loss": 0.8936, + "step": 18968 + }, + { + "epoch": 4.046288395904437, + "grad_norm": 0.3727332982612738, + "learning_rate": 7.218350622250296e-05, + "loss": 0.8851, + "step": 18969 + }, + { + "epoch": 4.046501706484642, + "grad_norm": 0.30856461649590883, + "learning_rate": 7.217503721793772e-05, + "loss": 0.9195, + "step": 18970 + }, + { + "epoch": 4.046715017064846, + "grad_norm": 0.22986381844410875, + "learning_rate": 7.216656830191205e-05, + "loss": 0.9289, + "step": 18971 + }, + { + "epoch": 4.046928327645051, + "grad_norm": 0.1941011197565062, + "learning_rate": 7.215809947452184e-05, + "loss": 0.8754, + "step": 18972 + }, + { + "epoch": 4.047141638225256, + "grad_norm": 0.27561172144476304, + "learning_rate": 7.214963073586283e-05, + "loss": 0.884, + "step": 18973 + }, + { + "epoch": 4.04735494880546, + "grad_norm": 0.2693554319065806, + "learning_rate": 7.214116208603088e-05, + "loss": 0.9149, + "step": 18974 + }, + { + "epoch": 4.047568259385666, + "grad_norm": 0.5342460535063547, + "learning_rate": 7.213269352512182e-05, + "loss": 0.8882, + "step": 18975 + }, + { + "epoch": 4.047781569965871, + "grad_norm": 0.16693105749082787, + "learning_rate": 7.212422505323144e-05, + "loss": 0.9018, + "step": 18976 + }, + { + "epoch": 4.047994880546075, + "grad_norm": 0.17197044639585146, + "learning_rate": 7.211575667045562e-05, + "loss": 0.8903, + "step": 18977 + }, + { + "epoch": 4.04820819112628, + "grad_norm": 0.1744823482233091, + "learning_rate": 7.210728837689012e-05, + "loss": 0.8831, + "step": 18978 + }, + { + "epoch": 4.048421501706485, + "grad_norm": 0.16592298628939664, + "learning_rate": 7.209882017263079e-05, + "loss": 0.9382, + "step": 18979 + }, + { + "epoch": 4.048634812286689, + "grad_norm": 0.15499839655879416, + "learning_rate": 7.209035205777346e-05, + "loss": 0.9006, + "step": 18980 + }, + { + "epoch": 4.048848122866894, + "grad_norm": 0.15325401598554034, + "learning_rate": 7.208188403241391e-05, + "loss": 0.9007, + "step": 18981 + }, + { + "epoch": 4.049061433447099, + "grad_norm": 0.18848618077236018, + "learning_rate": 7.207341609664798e-05, + "loss": 0.8819, + "step": 18982 + }, + { + "epoch": 4.049274744027303, + "grad_norm": 0.1964498034452194, + "learning_rate": 7.206494825057148e-05, + "loss": 0.8806, + "step": 18983 + }, + { + "epoch": 4.049488054607509, + "grad_norm": 0.14879303204703295, + "learning_rate": 7.205648049428024e-05, + "loss": 0.8843, + "step": 18984 + }, + { + "epoch": 4.049701365187714, + "grad_norm": 0.2115603161570412, + "learning_rate": 7.204801282787006e-05, + "loss": 0.8964, + "step": 18985 + }, + { + "epoch": 4.049914675767918, + "grad_norm": 0.1655060264429649, + "learning_rate": 7.203954525143673e-05, + "loss": 0.9367, + "step": 18986 + }, + { + "epoch": 4.050127986348123, + "grad_norm": 0.2024277821183838, + "learning_rate": 7.203107776507608e-05, + "loss": 0.9325, + "step": 18987 + }, + { + "epoch": 4.050341296928328, + "grad_norm": 0.2548904511816051, + "learning_rate": 7.202261036888392e-05, + "loss": 0.892, + "step": 18988 + }, + { + "epoch": 4.050554607508532, + "grad_norm": 0.18786747817296678, + "learning_rate": 7.201414306295606e-05, + "loss": 0.9379, + "step": 18989 + }, + { + "epoch": 4.050767918088737, + "grad_norm": 0.246099909090372, + "learning_rate": 7.20056758473883e-05, + "loss": 0.9385, + "step": 18990 + }, + { + "epoch": 4.050981228668942, + "grad_norm": 0.20698531305642393, + "learning_rate": 7.199720872227645e-05, + "loss": 0.9215, + "step": 18991 + }, + { + "epoch": 4.051194539249146, + "grad_norm": 0.18551849133846102, + "learning_rate": 7.198874168771633e-05, + "loss": 0.8877, + "step": 18992 + }, + { + "epoch": 4.051407849829351, + "grad_norm": 0.1894412009243541, + "learning_rate": 7.198027474380373e-05, + "loss": 0.8949, + "step": 18993 + }, + { + "epoch": 4.051621160409557, + "grad_norm": 0.1748713537548594, + "learning_rate": 7.197180789063446e-05, + "loss": 0.8726, + "step": 18994 + }, + { + "epoch": 4.051834470989761, + "grad_norm": 0.2047235852314749, + "learning_rate": 7.196334112830433e-05, + "loss": 0.9189, + "step": 18995 + }, + { + "epoch": 4.052047781569966, + "grad_norm": 0.21703457047239788, + "learning_rate": 7.19548744569091e-05, + "loss": 0.9318, + "step": 18996 + }, + { + "epoch": 4.052261092150171, + "grad_norm": 0.14992937754506488, + "learning_rate": 7.194640787654462e-05, + "loss": 0.8725, + "step": 18997 + }, + { + "epoch": 4.052474402730375, + "grad_norm": 0.1841467822907492, + "learning_rate": 7.193794138730665e-05, + "loss": 0.9195, + "step": 18998 + }, + { + "epoch": 4.05268771331058, + "grad_norm": 0.1558417170588507, + "learning_rate": 7.192947498929103e-05, + "loss": 0.898, + "step": 18999 + }, + { + "epoch": 4.052901023890785, + "grad_norm": 0.20094651469627559, + "learning_rate": 7.192100868259352e-05, + "loss": 0.9481, + "step": 19000 + }, + { + "epoch": 4.0531143344709895, + "grad_norm": 0.20656118826947842, + "learning_rate": 7.191254246730995e-05, + "loss": 0.8971, + "step": 19001 + }, + { + "epoch": 4.053327645051194, + "grad_norm": 0.19183495194800457, + "learning_rate": 7.190407634353607e-05, + "loss": 0.914, + "step": 19002 + }, + { + "epoch": 4.0535409556314, + "grad_norm": 0.21397346438927684, + "learning_rate": 7.189561031136774e-05, + "loss": 0.9177, + "step": 19003 + }, + { + "epoch": 4.053754266211604, + "grad_norm": 0.17615502125924762, + "learning_rate": 7.18871443709007e-05, + "loss": 0.9255, + "step": 19004 + }, + { + "epoch": 4.053967576791809, + "grad_norm": 0.19013994604539483, + "learning_rate": 7.187867852223074e-05, + "loss": 0.8954, + "step": 19005 + }, + { + "epoch": 4.054180887372014, + "grad_norm": 0.1909390398496337, + "learning_rate": 7.187021276545372e-05, + "loss": 0.9163, + "step": 19006 + }, + { + "epoch": 4.0543941979522184, + "grad_norm": 0.2236682333127409, + "learning_rate": 7.186174710066534e-05, + "loss": 0.874, + "step": 19007 + }, + { + "epoch": 4.054607508532423, + "grad_norm": 0.16697429798651872, + "learning_rate": 7.185328152796143e-05, + "loss": 0.9058, + "step": 19008 + }, + { + "epoch": 4.054820819112628, + "grad_norm": 0.18644804389622535, + "learning_rate": 7.184481604743778e-05, + "loss": 0.8931, + "step": 19009 + }, + { + "epoch": 4.0550341296928325, + "grad_norm": 0.18963694093274072, + "learning_rate": 7.183635065919017e-05, + "loss": 0.8914, + "step": 19010 + }, + { + "epoch": 4.055247440273037, + "grad_norm": 0.17243745841664398, + "learning_rate": 7.182788536331439e-05, + "loss": 0.9427, + "step": 19011 + }, + { + "epoch": 4.055460750853243, + "grad_norm": 0.1959287611462518, + "learning_rate": 7.181942015990623e-05, + "loss": 0.9039, + "step": 19012 + }, + { + "epoch": 4.055674061433447, + "grad_norm": 0.16880592308698703, + "learning_rate": 7.181095504906146e-05, + "loss": 0.8731, + "step": 19013 + }, + { + "epoch": 4.055887372013652, + "grad_norm": 0.17194920628246738, + "learning_rate": 7.180249003087586e-05, + "loss": 0.9087, + "step": 19014 + }, + { + "epoch": 4.056100682593857, + "grad_norm": 0.1667247351537085, + "learning_rate": 7.179402510544525e-05, + "loss": 0.8886, + "step": 19015 + }, + { + "epoch": 4.0563139931740615, + "grad_norm": 0.17730584933456206, + "learning_rate": 7.178556027286538e-05, + "loss": 0.869, + "step": 19016 + }, + { + "epoch": 4.056527303754266, + "grad_norm": 0.1681872011032125, + "learning_rate": 7.177709553323206e-05, + "loss": 0.8783, + "step": 19017 + }, + { + "epoch": 4.056740614334471, + "grad_norm": 0.21785299770384514, + "learning_rate": 7.176863088664099e-05, + "loss": 0.8864, + "step": 19018 + }, + { + "epoch": 4.0569539249146755, + "grad_norm": 0.17632820877959848, + "learning_rate": 7.176016633318803e-05, + "loss": 0.8955, + "step": 19019 + }, + { + "epoch": 4.05716723549488, + "grad_norm": 0.2218202261268626, + "learning_rate": 7.175170187296891e-05, + "loss": 0.9254, + "step": 19020 + }, + { + "epoch": 4.057380546075085, + "grad_norm": 0.2537277841949147, + "learning_rate": 7.174323750607943e-05, + "loss": 0.8985, + "step": 19021 + }, + { + "epoch": 4.0575938566552905, + "grad_norm": 0.16561055028474286, + "learning_rate": 7.173477323261533e-05, + "loss": 0.9156, + "step": 19022 + }, + { + "epoch": 4.057807167235495, + "grad_norm": 0.22782553590181578, + "learning_rate": 7.172630905267246e-05, + "loss": 0.8895, + "step": 19023 + }, + { + "epoch": 4.0580204778157, + "grad_norm": 0.22508223126494456, + "learning_rate": 7.17178449663465e-05, + "loss": 0.8758, + "step": 19024 + }, + { + "epoch": 4.0582337883959045, + "grad_norm": 0.16904667430345532, + "learning_rate": 7.170938097373329e-05, + "loss": 0.9202, + "step": 19025 + }, + { + "epoch": 4.058447098976109, + "grad_norm": 0.199417265385077, + "learning_rate": 7.170091707492855e-05, + "loss": 0.9008, + "step": 19026 + }, + { + "epoch": 4.058660409556314, + "grad_norm": 0.1885983832236727, + "learning_rate": 7.16924532700281e-05, + "loss": 0.885, + "step": 19027 + }, + { + "epoch": 4.058873720136519, + "grad_norm": 0.17203447303289138, + "learning_rate": 7.168398955912769e-05, + "loss": 0.8889, + "step": 19028 + }, + { + "epoch": 4.059087030716723, + "grad_norm": 0.21347416667643268, + "learning_rate": 7.167552594232305e-05, + "loss": 0.9017, + "step": 19029 + }, + { + "epoch": 4.059300341296928, + "grad_norm": 0.2019859242115258, + "learning_rate": 7.166706241970998e-05, + "loss": 0.9107, + "step": 19030 + }, + { + "epoch": 4.0595136518771335, + "grad_norm": 0.18339591139649564, + "learning_rate": 7.165859899138424e-05, + "loss": 0.9135, + "step": 19031 + }, + { + "epoch": 4.059726962457338, + "grad_norm": 0.15741478297433706, + "learning_rate": 7.165013565744159e-05, + "loss": 0.9245, + "step": 19032 + }, + { + "epoch": 4.059940273037543, + "grad_norm": 0.19617713423097988, + "learning_rate": 7.16416724179778e-05, + "loss": 0.9343, + "step": 19033 + }, + { + "epoch": 4.060153583617748, + "grad_norm": 0.21833588117666008, + "learning_rate": 7.163320927308861e-05, + "loss": 0.8919, + "step": 19034 + }, + { + "epoch": 4.060366894197952, + "grad_norm": 0.20655629777756299, + "learning_rate": 7.162474622286982e-05, + "loss": 0.903, + "step": 19035 + }, + { + "epoch": 4.060580204778157, + "grad_norm": 0.18534299209469543, + "learning_rate": 7.161628326741715e-05, + "loss": 0.8933, + "step": 19036 + }, + { + "epoch": 4.060793515358362, + "grad_norm": 0.19745316276135796, + "learning_rate": 7.16078204068264e-05, + "loss": 0.9446, + "step": 19037 + }, + { + "epoch": 4.061006825938566, + "grad_norm": 0.15869660700813773, + "learning_rate": 7.159935764119326e-05, + "loss": 0.8753, + "step": 19038 + }, + { + "epoch": 4.061220136518771, + "grad_norm": 0.20698234631717896, + "learning_rate": 7.159089497061357e-05, + "loss": 0.9152, + "step": 19039 + }, + { + "epoch": 4.061433447098976, + "grad_norm": 0.18061168150193466, + "learning_rate": 7.158243239518306e-05, + "loss": 0.8886, + "step": 19040 + }, + { + "epoch": 4.061646757679181, + "grad_norm": 0.1605366642445959, + "learning_rate": 7.157396991499744e-05, + "loss": 0.9037, + "step": 19041 + }, + { + "epoch": 4.061860068259386, + "grad_norm": 0.22016279758361612, + "learning_rate": 7.156550753015249e-05, + "loss": 0.8992, + "step": 19042 + }, + { + "epoch": 4.062073378839591, + "grad_norm": 0.21456938806893247, + "learning_rate": 7.155704524074398e-05, + "loss": 0.8957, + "step": 19043 + }, + { + "epoch": 4.062286689419795, + "grad_norm": 0.23077926613421856, + "learning_rate": 7.154858304686762e-05, + "loss": 0.8941, + "step": 19044 + }, + { + "epoch": 4.0625, + "grad_norm": 0.16140023189932823, + "learning_rate": 7.15401209486192e-05, + "loss": 0.8883, + "step": 19045 + }, + { + "epoch": 4.062713310580205, + "grad_norm": 0.17634860501750646, + "learning_rate": 7.153165894609444e-05, + "loss": 0.8981, + "step": 19046 + }, + { + "epoch": 4.062926621160409, + "grad_norm": 0.20411400149669842, + "learning_rate": 7.152319703938913e-05, + "loss": 0.8974, + "step": 19047 + }, + { + "epoch": 4.063139931740614, + "grad_norm": 0.20950977413167504, + "learning_rate": 7.151473522859896e-05, + "loss": 0.8676, + "step": 19048 + }, + { + "epoch": 4.063353242320819, + "grad_norm": 0.1875686591270837, + "learning_rate": 7.150627351381972e-05, + "loss": 0.9087, + "step": 19049 + }, + { + "epoch": 4.063566552901024, + "grad_norm": 0.16921385616497772, + "learning_rate": 7.149781189514714e-05, + "loss": 0.9114, + "step": 19050 + }, + { + "epoch": 4.063779863481229, + "grad_norm": 0.17342328053478223, + "learning_rate": 7.148935037267698e-05, + "loss": 0.9047, + "step": 19051 + }, + { + "epoch": 4.063993174061434, + "grad_norm": 0.1944604531764924, + "learning_rate": 7.148088894650492e-05, + "loss": 0.8894, + "step": 19052 + }, + { + "epoch": 4.064206484641638, + "grad_norm": 0.19026976210117597, + "learning_rate": 7.147242761672677e-05, + "loss": 0.8588, + "step": 19053 + }, + { + "epoch": 4.064419795221843, + "grad_norm": 0.18148893429282495, + "learning_rate": 7.146396638343823e-05, + "loss": 0.8719, + "step": 19054 + }, + { + "epoch": 4.064633105802048, + "grad_norm": 0.17252331086249617, + "learning_rate": 7.145550524673507e-05, + "loss": 0.8967, + "step": 19055 + }, + { + "epoch": 4.064846416382252, + "grad_norm": 0.16791161788546868, + "learning_rate": 7.144704420671299e-05, + "loss": 0.9178, + "step": 19056 + }, + { + "epoch": 4.065059726962457, + "grad_norm": 0.18596037748390803, + "learning_rate": 7.143858326346777e-05, + "loss": 0.8964, + "step": 19057 + }, + { + "epoch": 4.065273037542662, + "grad_norm": 0.18291164981399516, + "learning_rate": 7.143012241709511e-05, + "loss": 0.9082, + "step": 19058 + }, + { + "epoch": 4.0654863481228665, + "grad_norm": 0.22130612794308568, + "learning_rate": 7.142166166769076e-05, + "loss": 0.9237, + "step": 19059 + }, + { + "epoch": 4.065699658703072, + "grad_norm": 0.21635131882152311, + "learning_rate": 7.141320101535046e-05, + "loss": 0.8868, + "step": 19060 + }, + { + "epoch": 4.065912969283277, + "grad_norm": 0.19725007661692215, + "learning_rate": 7.140474046016994e-05, + "loss": 0.9296, + "step": 19061 + }, + { + "epoch": 4.066126279863481, + "grad_norm": 0.2038509095334118, + "learning_rate": 7.139628000224495e-05, + "loss": 0.9154, + "step": 19062 + }, + { + "epoch": 4.066339590443686, + "grad_norm": 0.21236488742330467, + "learning_rate": 7.138781964167116e-05, + "loss": 0.8953, + "step": 19063 + }, + { + "epoch": 4.066552901023891, + "grad_norm": 0.1739939251177372, + "learning_rate": 7.137935937854433e-05, + "loss": 0.8779, + "step": 19064 + }, + { + "epoch": 4.0667662116040955, + "grad_norm": 0.16684120738490218, + "learning_rate": 7.137089921296021e-05, + "loss": 0.9062, + "step": 19065 + }, + { + "epoch": 4.0669795221843, + "grad_norm": 0.18276707013965862, + "learning_rate": 7.13624391450145e-05, + "loss": 0.8777, + "step": 19066 + }, + { + "epoch": 4.067192832764505, + "grad_norm": 0.16169729527820145, + "learning_rate": 7.135397917480295e-05, + "loss": 0.8689, + "step": 19067 + }, + { + "epoch": 4.0674061433447095, + "grad_norm": 0.19964447059216323, + "learning_rate": 7.134551930242126e-05, + "loss": 0.927, + "step": 19068 + }, + { + "epoch": 4.067619453924915, + "grad_norm": 0.23731612541000033, + "learning_rate": 7.133705952796517e-05, + "loss": 0.8696, + "step": 19069 + }, + { + "epoch": 4.06783276450512, + "grad_norm": 0.1852728555635631, + "learning_rate": 7.132859985153039e-05, + "loss": 0.924, + "step": 19070 + }, + { + "epoch": 4.0680460750853245, + "grad_norm": 0.18754762076276046, + "learning_rate": 7.132014027321266e-05, + "loss": 0.8867, + "step": 19071 + }, + { + "epoch": 4.068259385665529, + "grad_norm": 0.21957882496045436, + "learning_rate": 7.131168079310767e-05, + "loss": 0.8783, + "step": 19072 + }, + { + "epoch": 4.068472696245734, + "grad_norm": 0.20650339482981006, + "learning_rate": 7.13032214113112e-05, + "loss": 0.9145, + "step": 19073 + }, + { + "epoch": 4.0686860068259385, + "grad_norm": 0.18815822367713897, + "learning_rate": 7.12947621279189e-05, + "loss": 0.8996, + "step": 19074 + }, + { + "epoch": 4.068899317406143, + "grad_norm": 0.2021874549756334, + "learning_rate": 7.128630294302652e-05, + "loss": 0.8945, + "step": 19075 + }, + { + "epoch": 4.069112627986348, + "grad_norm": 0.17448266477563373, + "learning_rate": 7.127784385672975e-05, + "loss": 0.877, + "step": 19076 + }, + { + "epoch": 4.069325938566553, + "grad_norm": 0.2127597495217092, + "learning_rate": 7.126938486912432e-05, + "loss": 0.9155, + "step": 19077 + }, + { + "epoch": 4.069539249146757, + "grad_norm": 0.22015161346488427, + "learning_rate": 7.126092598030596e-05, + "loss": 0.8702, + "step": 19078 + }, + { + "epoch": 4.069752559726963, + "grad_norm": 0.20224417536219083, + "learning_rate": 7.125246719037038e-05, + "loss": 0.9308, + "step": 19079 + }, + { + "epoch": 4.0699658703071675, + "grad_norm": 0.2256060061839972, + "learning_rate": 7.124400849941326e-05, + "loss": 0.9091, + "step": 19080 + }, + { + "epoch": 4.070179180887372, + "grad_norm": 0.2029834275807346, + "learning_rate": 7.123554990753034e-05, + "loss": 0.8816, + "step": 19081 + }, + { + "epoch": 4.070392491467577, + "grad_norm": 0.18106958830012881, + "learning_rate": 7.122709141481731e-05, + "loss": 0.8965, + "step": 19082 + }, + { + "epoch": 4.0706058020477816, + "grad_norm": 0.22518760039123684, + "learning_rate": 7.121863302136991e-05, + "loss": 0.9233, + "step": 19083 + }, + { + "epoch": 4.070819112627986, + "grad_norm": 0.23200849910830873, + "learning_rate": 7.121017472728383e-05, + "loss": 0.9047, + "step": 19084 + }, + { + "epoch": 4.071032423208191, + "grad_norm": 0.22026277833300206, + "learning_rate": 7.120171653265477e-05, + "loss": 0.866, + "step": 19085 + }, + { + "epoch": 4.071245733788396, + "grad_norm": 0.2814878904637903, + "learning_rate": 7.11932584375784e-05, + "loss": 0.8972, + "step": 19086 + }, + { + "epoch": 4.0714590443686, + "grad_norm": 0.18332605185962805, + "learning_rate": 7.118480044215048e-05, + "loss": 0.886, + "step": 19087 + }, + { + "epoch": 4.071672354948806, + "grad_norm": 0.2521926675611739, + "learning_rate": 7.117634254646668e-05, + "loss": 0.9215, + "step": 19088 + }, + { + "epoch": 4.0718856655290105, + "grad_norm": 0.27272543110291514, + "learning_rate": 7.116788475062275e-05, + "loss": 0.9285, + "step": 19089 + }, + { + "epoch": 4.072098976109215, + "grad_norm": 0.15483098842347984, + "learning_rate": 7.11594270547143e-05, + "loss": 0.898, + "step": 19090 + }, + { + "epoch": 4.07231228668942, + "grad_norm": 0.19336315284487268, + "learning_rate": 7.115096945883712e-05, + "loss": 0.9291, + "step": 19091 + }, + { + "epoch": 4.072525597269625, + "grad_norm": 0.17938394185155213, + "learning_rate": 7.114251196308684e-05, + "loss": 0.8963, + "step": 19092 + }, + { + "epoch": 4.072738907849829, + "grad_norm": 0.17126089420009286, + "learning_rate": 7.113405456755921e-05, + "loss": 0.9096, + "step": 19093 + }, + { + "epoch": 4.072952218430034, + "grad_norm": 0.18580900632010364, + "learning_rate": 7.112559727234989e-05, + "loss": 0.8926, + "step": 19094 + }, + { + "epoch": 4.073165529010239, + "grad_norm": 0.19681653409435068, + "learning_rate": 7.111714007755461e-05, + "loss": 0.8793, + "step": 19095 + }, + { + "epoch": 4.073378839590443, + "grad_norm": 0.20574113505078795, + "learning_rate": 7.110868298326902e-05, + "loss": 0.9213, + "step": 19096 + }, + { + "epoch": 4.073592150170649, + "grad_norm": 0.18146405878306757, + "learning_rate": 7.110022598958885e-05, + "loss": 0.925, + "step": 19097 + }, + { + "epoch": 4.073805460750854, + "grad_norm": 0.21853049224433654, + "learning_rate": 7.109176909660974e-05, + "loss": 0.9493, + "step": 19098 + }, + { + "epoch": 4.074018771331058, + "grad_norm": 0.1753116997406031, + "learning_rate": 7.108331230442743e-05, + "loss": 0.8606, + "step": 19099 + }, + { + "epoch": 4.074232081911263, + "grad_norm": 0.17967670923581616, + "learning_rate": 7.107485561313757e-05, + "loss": 0.8756, + "step": 19100 + }, + { + "epoch": 4.074445392491468, + "grad_norm": 0.19035748457037627, + "learning_rate": 7.10663990228359e-05, + "loss": 0.8711, + "step": 19101 + }, + { + "epoch": 4.074658703071672, + "grad_norm": 0.1870183430205177, + "learning_rate": 7.105794253361806e-05, + "loss": 0.905, + "step": 19102 + }, + { + "epoch": 4.074872013651877, + "grad_norm": 0.18642008687265624, + "learning_rate": 7.104948614557976e-05, + "loss": 0.8929, + "step": 19103 + }, + { + "epoch": 4.075085324232082, + "grad_norm": 0.17766233313099816, + "learning_rate": 7.104102985881664e-05, + "loss": 0.8863, + "step": 19104 + }, + { + "epoch": 4.075298634812286, + "grad_norm": 0.1908375168878039, + "learning_rate": 7.103257367342446e-05, + "loss": 0.8732, + "step": 19105 + }, + { + "epoch": 4.075511945392491, + "grad_norm": 0.17548037675886421, + "learning_rate": 7.102411758949884e-05, + "loss": 0.9183, + "step": 19106 + }, + { + "epoch": 4.075725255972697, + "grad_norm": 0.20927005208611715, + "learning_rate": 7.10156616071355e-05, + "loss": 0.9002, + "step": 19107 + }, + { + "epoch": 4.075938566552901, + "grad_norm": 0.22487209028586944, + "learning_rate": 7.10072057264301e-05, + "loss": 0.8901, + "step": 19108 + }, + { + "epoch": 4.076151877133106, + "grad_norm": 0.15753083468053444, + "learning_rate": 7.099874994747832e-05, + "loss": 0.9093, + "step": 19109 + }, + { + "epoch": 4.076365187713311, + "grad_norm": 0.21308875060806926, + "learning_rate": 7.09902942703758e-05, + "loss": 0.8976, + "step": 19110 + }, + { + "epoch": 4.076578498293515, + "grad_norm": 0.22770848823803827, + "learning_rate": 7.098183869521827e-05, + "loss": 0.9199, + "step": 19111 + }, + { + "epoch": 4.07679180887372, + "grad_norm": 0.23924277931451943, + "learning_rate": 7.097338322210138e-05, + "loss": 0.9174, + "step": 19112 + }, + { + "epoch": 4.077005119453925, + "grad_norm": 0.2513228508934973, + "learning_rate": 7.096492785112083e-05, + "loss": 0.909, + "step": 19113 + }, + { + "epoch": 4.077218430034129, + "grad_norm": 0.17991794483716586, + "learning_rate": 7.095647258237225e-05, + "loss": 0.888, + "step": 19114 + }, + { + "epoch": 4.077431740614334, + "grad_norm": 0.21823754645859808, + "learning_rate": 7.094801741595136e-05, + "loss": 0.8827, + "step": 19115 + }, + { + "epoch": 4.07764505119454, + "grad_norm": 0.26129505874627124, + "learning_rate": 7.093956235195377e-05, + "loss": 0.8843, + "step": 19116 + }, + { + "epoch": 4.077858361774744, + "grad_norm": 0.18134331348828714, + "learning_rate": 7.093110739047522e-05, + "loss": 0.9019, + "step": 19117 + }, + { + "epoch": 4.078071672354949, + "grad_norm": 0.2646875718740682, + "learning_rate": 7.092265253161136e-05, + "loss": 0.9057, + "step": 19118 + }, + { + "epoch": 4.078284982935154, + "grad_norm": 0.231832456219117, + "learning_rate": 7.091419777545779e-05, + "loss": 0.9067, + "step": 19119 + }, + { + "epoch": 4.078498293515358, + "grad_norm": 0.16342282684040793, + "learning_rate": 7.090574312211025e-05, + "loss": 0.9158, + "step": 19120 + }, + { + "epoch": 4.078711604095563, + "grad_norm": 0.2527107995192089, + "learning_rate": 7.089728857166438e-05, + "loss": 0.876, + "step": 19121 + }, + { + "epoch": 4.078924914675768, + "grad_norm": 0.17992431471388445, + "learning_rate": 7.088883412421583e-05, + "loss": 0.8719, + "step": 19122 + }, + { + "epoch": 4.0791382252559725, + "grad_norm": 0.21011116581696496, + "learning_rate": 7.088037977986028e-05, + "loss": 0.9182, + "step": 19123 + }, + { + "epoch": 4.079351535836177, + "grad_norm": 0.2220245800207486, + "learning_rate": 7.087192553869338e-05, + "loss": 0.9068, + "step": 19124 + }, + { + "epoch": 4.079564846416382, + "grad_norm": 0.1715291261949037, + "learning_rate": 7.086347140081082e-05, + "loss": 0.8955, + "step": 19125 + }, + { + "epoch": 4.079778156996587, + "grad_norm": 0.23241815994416998, + "learning_rate": 7.085501736630819e-05, + "loss": 0.8853, + "step": 19126 + }, + { + "epoch": 4.079991467576792, + "grad_norm": 0.189181691326604, + "learning_rate": 7.084656343528124e-05, + "loss": 0.8993, + "step": 19127 + }, + { + "epoch": 4.080204778156997, + "grad_norm": 0.18869042703652944, + "learning_rate": 7.083810960782554e-05, + "loss": 0.8975, + "step": 19128 + }, + { + "epoch": 4.0804180887372015, + "grad_norm": 0.1845095849298965, + "learning_rate": 7.082965588403683e-05, + "loss": 0.8969, + "step": 19129 + }, + { + "epoch": 4.080631399317406, + "grad_norm": 0.1912833771184159, + "learning_rate": 7.08212022640107e-05, + "loss": 0.8812, + "step": 19130 + }, + { + "epoch": 4.080844709897611, + "grad_norm": 0.2665308774979836, + "learning_rate": 7.081274874784279e-05, + "loss": 0.8985, + "step": 19131 + }, + { + "epoch": 4.0810580204778155, + "grad_norm": 0.214823120846056, + "learning_rate": 7.080429533562882e-05, + "loss": 0.9209, + "step": 19132 + }, + { + "epoch": 4.08127133105802, + "grad_norm": 0.17804267395371667, + "learning_rate": 7.07958420274644e-05, + "loss": 0.9156, + "step": 19133 + }, + { + "epoch": 4.081484641638225, + "grad_norm": 0.17528100539872019, + "learning_rate": 7.078738882344516e-05, + "loss": 0.8975, + "step": 19134 + }, + { + "epoch": 4.0816979522184305, + "grad_norm": 0.1883504125602975, + "learning_rate": 7.077893572366678e-05, + "loss": 0.9264, + "step": 19135 + }, + { + "epoch": 4.081911262798635, + "grad_norm": 0.19060473955451124, + "learning_rate": 7.077048272822488e-05, + "loss": 0.8956, + "step": 19136 + }, + { + "epoch": 4.08212457337884, + "grad_norm": 0.19028668898768378, + "learning_rate": 7.076202983721516e-05, + "loss": 0.9537, + "step": 19137 + }, + { + "epoch": 4.0823378839590445, + "grad_norm": 0.17668328190425897, + "learning_rate": 7.075357705073318e-05, + "loss": 0.8849, + "step": 19138 + }, + { + "epoch": 4.082551194539249, + "grad_norm": 0.1999092296067889, + "learning_rate": 7.074512436887467e-05, + "loss": 0.9112, + "step": 19139 + }, + { + "epoch": 4.082764505119454, + "grad_norm": 0.20845547403824308, + "learning_rate": 7.073667179173525e-05, + "loss": 0.9275, + "step": 19140 + }, + { + "epoch": 4.082977815699659, + "grad_norm": 0.1640381470710446, + "learning_rate": 7.072821931941051e-05, + "loss": 0.9362, + "step": 19141 + }, + { + "epoch": 4.083191126279863, + "grad_norm": 0.16424646592775194, + "learning_rate": 7.071976695199615e-05, + "loss": 0.9072, + "step": 19142 + }, + { + "epoch": 4.083404436860068, + "grad_norm": 0.1636691900274001, + "learning_rate": 7.071131468958776e-05, + "loss": 0.8953, + "step": 19143 + }, + { + "epoch": 4.083617747440273, + "grad_norm": 0.1549698980106344, + "learning_rate": 7.070286253228099e-05, + "loss": 0.8783, + "step": 19144 + }, + { + "epoch": 4.083831058020478, + "grad_norm": 0.2202310395354978, + "learning_rate": 7.069441048017151e-05, + "loss": 0.9106, + "step": 19145 + }, + { + "epoch": 4.084044368600683, + "grad_norm": 0.1718356010323095, + "learning_rate": 7.068595853335492e-05, + "loss": 0.922, + "step": 19146 + }, + { + "epoch": 4.084257679180888, + "grad_norm": 0.17396054532017746, + "learning_rate": 7.067750669192687e-05, + "loss": 0.9067, + "step": 19147 + }, + { + "epoch": 4.084470989761092, + "grad_norm": 0.21370526438899057, + "learning_rate": 7.066905495598299e-05, + "loss": 0.9037, + "step": 19148 + }, + { + "epoch": 4.084684300341297, + "grad_norm": 0.18806656270236494, + "learning_rate": 7.06606033256189e-05, + "loss": 0.884, + "step": 19149 + }, + { + "epoch": 4.084897610921502, + "grad_norm": 0.1647457453038508, + "learning_rate": 7.065215180093025e-05, + "loss": 0.8994, + "step": 19150 + }, + { + "epoch": 4.085110921501706, + "grad_norm": 0.17342810916059476, + "learning_rate": 7.064370038201267e-05, + "loss": 0.9021, + "step": 19151 + }, + { + "epoch": 4.085324232081911, + "grad_norm": 0.1786710786206049, + "learning_rate": 7.063524906896179e-05, + "loss": 0.9349, + "step": 19152 + }, + { + "epoch": 4.085537542662116, + "grad_norm": 0.17325313059286657, + "learning_rate": 7.062679786187318e-05, + "loss": 0.9188, + "step": 19153 + }, + { + "epoch": 4.085750853242321, + "grad_norm": 0.16884652967463146, + "learning_rate": 7.061834676084253e-05, + "loss": 0.8879, + "step": 19154 + }, + { + "epoch": 4.085964163822526, + "grad_norm": 0.17866718167825046, + "learning_rate": 7.060989576596546e-05, + "loss": 0.9204, + "step": 19155 + }, + { + "epoch": 4.086177474402731, + "grad_norm": 0.19298765080682886, + "learning_rate": 7.060144487733755e-05, + "loss": 0.9002, + "step": 19156 + }, + { + "epoch": 4.086390784982935, + "grad_norm": 0.18690976505626475, + "learning_rate": 7.059299409505446e-05, + "loss": 0.8749, + "step": 19157 + }, + { + "epoch": 4.08660409556314, + "grad_norm": 0.18034650473774633, + "learning_rate": 7.05845434192118e-05, + "loss": 0.903, + "step": 19158 + }, + { + "epoch": 4.086817406143345, + "grad_norm": 0.16548335062645148, + "learning_rate": 7.05760928499052e-05, + "loss": 0.9228, + "step": 19159 + }, + { + "epoch": 4.087030716723549, + "grad_norm": 0.18605236103417158, + "learning_rate": 7.056764238723025e-05, + "loss": 0.9233, + "step": 19160 + }, + { + "epoch": 4.087244027303754, + "grad_norm": 0.17419057534627383, + "learning_rate": 7.055919203128259e-05, + "loss": 0.9485, + "step": 19161 + }, + { + "epoch": 4.087457337883959, + "grad_norm": 0.17947719595711326, + "learning_rate": 7.055074178215783e-05, + "loss": 0.8901, + "step": 19162 + }, + { + "epoch": 4.087670648464163, + "grad_norm": 0.2066820937975054, + "learning_rate": 7.054229163995161e-05, + "loss": 0.9116, + "step": 19163 + }, + { + "epoch": 4.087883959044369, + "grad_norm": 0.18976069254319639, + "learning_rate": 7.053384160475951e-05, + "loss": 0.8955, + "step": 19164 + }, + { + "epoch": 4.088097269624574, + "grad_norm": 0.16209143494297804, + "learning_rate": 7.052539167667713e-05, + "loss": 0.9257, + "step": 19165 + }, + { + "epoch": 4.088310580204778, + "grad_norm": 0.17172510089867404, + "learning_rate": 7.051694185580011e-05, + "loss": 0.92, + "step": 19166 + }, + { + "epoch": 4.088523890784983, + "grad_norm": 0.1702681574295968, + "learning_rate": 7.050849214222406e-05, + "loss": 0.8998, + "step": 19167 + }, + { + "epoch": 4.088737201365188, + "grad_norm": 0.18474318892862127, + "learning_rate": 7.050004253604458e-05, + "loss": 0.9253, + "step": 19168 + }, + { + "epoch": 4.088950511945392, + "grad_norm": 0.15203941215936775, + "learning_rate": 7.049159303735729e-05, + "loss": 0.8859, + "step": 19169 + }, + { + "epoch": 4.089163822525597, + "grad_norm": 0.1719528699024187, + "learning_rate": 7.048314364625775e-05, + "loss": 0.9154, + "step": 19170 + }, + { + "epoch": 4.089377133105802, + "grad_norm": 0.19370912272431337, + "learning_rate": 7.047469436284163e-05, + "loss": 0.9163, + "step": 19171 + }, + { + "epoch": 4.089590443686006, + "grad_norm": 0.16358586299866626, + "learning_rate": 7.046624518720448e-05, + "loss": 0.8939, + "step": 19172 + }, + { + "epoch": 4.089803754266212, + "grad_norm": 0.16705254322070037, + "learning_rate": 7.045779611944196e-05, + "loss": 0.8913, + "step": 19173 + }, + { + "epoch": 4.090017064846417, + "grad_norm": 0.15380492851388128, + "learning_rate": 7.044934715964964e-05, + "loss": 0.9053, + "step": 19174 + }, + { + "epoch": 4.090230375426621, + "grad_norm": 0.19808949551909247, + "learning_rate": 7.044089830792307e-05, + "loss": 0.9085, + "step": 19175 + }, + { + "epoch": 4.090443686006826, + "grad_norm": 0.1861974389960845, + "learning_rate": 7.043244956435793e-05, + "loss": 0.9049, + "step": 19176 + }, + { + "epoch": 4.090656996587031, + "grad_norm": 0.21060310415359185, + "learning_rate": 7.042400092904977e-05, + "loss": 0.8914, + "step": 19177 + }, + { + "epoch": 4.090870307167235, + "grad_norm": 0.2846608755663735, + "learning_rate": 7.041555240209421e-05, + "loss": 0.9421, + "step": 19178 + }, + { + "epoch": 4.09108361774744, + "grad_norm": 0.23989893438511736, + "learning_rate": 7.040710398358683e-05, + "loss": 0.8838, + "step": 19179 + }, + { + "epoch": 4.091296928327645, + "grad_norm": 0.1876414378263498, + "learning_rate": 7.039865567362322e-05, + "loss": 0.8995, + "step": 19180 + }, + { + "epoch": 4.0915102389078495, + "grad_norm": 0.21470576297033667, + "learning_rate": 7.0390207472299e-05, + "loss": 0.8846, + "step": 19181 + }, + { + "epoch": 4.091723549488055, + "grad_norm": 0.18754727264480592, + "learning_rate": 7.038175937970974e-05, + "loss": 0.9238, + "step": 19182 + }, + { + "epoch": 4.09193686006826, + "grad_norm": 0.20983709798436243, + "learning_rate": 7.037331139595103e-05, + "loss": 0.9006, + "step": 19183 + }, + { + "epoch": 4.092150170648464, + "grad_norm": 0.18917757914324682, + "learning_rate": 7.036486352111845e-05, + "loss": 0.9156, + "step": 19184 + }, + { + "epoch": 4.092363481228669, + "grad_norm": 0.1841837647563135, + "learning_rate": 7.035641575530765e-05, + "loss": 0.9053, + "step": 19185 + }, + { + "epoch": 4.092576791808874, + "grad_norm": 0.17995867845781466, + "learning_rate": 7.034796809861414e-05, + "loss": 0.9182, + "step": 19186 + }, + { + "epoch": 4.0927901023890785, + "grad_norm": 0.20106372412856371, + "learning_rate": 7.033952055113352e-05, + "loss": 0.8668, + "step": 19187 + }, + { + "epoch": 4.093003412969283, + "grad_norm": 0.18872096978459812, + "learning_rate": 7.033107311296139e-05, + "loss": 0.9125, + "step": 19188 + }, + { + "epoch": 4.093216723549488, + "grad_norm": 0.22375730293865068, + "learning_rate": 7.032262578419334e-05, + "loss": 0.8898, + "step": 19189 + }, + { + "epoch": 4.0934300341296925, + "grad_norm": 0.21481143317665466, + "learning_rate": 7.031417856492494e-05, + "loss": 0.8987, + "step": 19190 + }, + { + "epoch": 4.093643344709897, + "grad_norm": 0.1633512234293111, + "learning_rate": 7.030573145525178e-05, + "loss": 0.8952, + "step": 19191 + }, + { + "epoch": 4.093856655290103, + "grad_norm": 0.2591994763331339, + "learning_rate": 7.02972844552694e-05, + "loss": 0.883, + "step": 19192 + }, + { + "epoch": 4.0940699658703075, + "grad_norm": 0.20121991328731134, + "learning_rate": 7.028883756507345e-05, + "loss": 0.9079, + "step": 19193 + }, + { + "epoch": 4.094283276450512, + "grad_norm": 0.16727400687183314, + "learning_rate": 7.028039078475944e-05, + "loss": 0.8908, + "step": 19194 + }, + { + "epoch": 4.094496587030717, + "grad_norm": 0.184530175533007, + "learning_rate": 7.027194411442298e-05, + "loss": 0.9023, + "step": 19195 + }, + { + "epoch": 4.0947098976109215, + "grad_norm": 0.18025095904101138, + "learning_rate": 7.026349755415967e-05, + "loss": 0.8968, + "step": 19196 + }, + { + "epoch": 4.094923208191126, + "grad_norm": 0.18389199036959336, + "learning_rate": 7.0255051104065e-05, + "loss": 0.8833, + "step": 19197 + }, + { + "epoch": 4.095136518771331, + "grad_norm": 0.2279378823095614, + "learning_rate": 7.024660476423461e-05, + "loss": 0.903, + "step": 19198 + }, + { + "epoch": 4.095349829351536, + "grad_norm": 0.20707119729038834, + "learning_rate": 7.023815853476404e-05, + "loss": 0.8975, + "step": 19199 + }, + { + "epoch": 4.09556313993174, + "grad_norm": 0.17009263776499134, + "learning_rate": 7.022971241574888e-05, + "loss": 0.9232, + "step": 19200 + }, + { + "epoch": 4.095776450511945, + "grad_norm": 0.17748211753646806, + "learning_rate": 7.022126640728468e-05, + "loss": 0.9022, + "step": 19201 + }, + { + "epoch": 4.0959897610921505, + "grad_norm": 0.17756778113508687, + "learning_rate": 7.021282050946702e-05, + "loss": 0.8979, + "step": 19202 + }, + { + "epoch": 4.096203071672355, + "grad_norm": 0.19925432835570686, + "learning_rate": 7.020437472239147e-05, + "loss": 0.916, + "step": 19203 + }, + { + "epoch": 4.09641638225256, + "grad_norm": 0.190028977612443, + "learning_rate": 7.019592904615355e-05, + "loss": 0.9047, + "step": 19204 + }, + { + "epoch": 4.096629692832765, + "grad_norm": 0.15863511684150444, + "learning_rate": 7.01874834808489e-05, + "loss": 0.9173, + "step": 19205 + }, + { + "epoch": 4.096843003412969, + "grad_norm": 0.20586638230428392, + "learning_rate": 7.017903802657301e-05, + "loss": 0.8971, + "step": 19206 + }, + { + "epoch": 4.097056313993174, + "grad_norm": 0.21597832392209085, + "learning_rate": 7.017059268342152e-05, + "loss": 0.9105, + "step": 19207 + }, + { + "epoch": 4.097269624573379, + "grad_norm": 0.1961584384893726, + "learning_rate": 7.01621474514899e-05, + "loss": 0.9134, + "step": 19208 + }, + { + "epoch": 4.097482935153583, + "grad_norm": 0.26707343207356427, + "learning_rate": 7.015370233087373e-05, + "loss": 0.9172, + "step": 19209 + }, + { + "epoch": 4.097696245733788, + "grad_norm": 0.20627975402009577, + "learning_rate": 7.014525732166861e-05, + "loss": 0.9265, + "step": 19210 + }, + { + "epoch": 4.097909556313994, + "grad_norm": 0.1995335496266041, + "learning_rate": 7.013681242397007e-05, + "loss": 0.9489, + "step": 19211 + }, + { + "epoch": 4.098122866894198, + "grad_norm": 0.23242685037491723, + "learning_rate": 7.012836763787365e-05, + "loss": 0.9009, + "step": 19212 + }, + { + "epoch": 4.098336177474403, + "grad_norm": 0.21638014533473665, + "learning_rate": 7.011992296347492e-05, + "loss": 0.9014, + "step": 19213 + }, + { + "epoch": 4.098549488054608, + "grad_norm": 0.2349429247304521, + "learning_rate": 7.011147840086944e-05, + "loss": 0.8884, + "step": 19214 + }, + { + "epoch": 4.098762798634812, + "grad_norm": 0.17762147636113754, + "learning_rate": 7.010303395015272e-05, + "loss": 0.9099, + "step": 19215 + }, + { + "epoch": 4.098976109215017, + "grad_norm": 0.21522573264543673, + "learning_rate": 7.009458961142037e-05, + "loss": 0.8821, + "step": 19216 + }, + { + "epoch": 4.099189419795222, + "grad_norm": 0.19010665798706702, + "learning_rate": 7.008614538476788e-05, + "loss": 0.9347, + "step": 19217 + }, + { + "epoch": 4.099402730375426, + "grad_norm": 0.19927340232248142, + "learning_rate": 7.007770127029083e-05, + "loss": 0.8955, + "step": 19218 + }, + { + "epoch": 4.099616040955631, + "grad_norm": 0.17465255434394772, + "learning_rate": 7.006925726808477e-05, + "loss": 0.9107, + "step": 19219 + }, + { + "epoch": 4.099829351535837, + "grad_norm": 0.2111330733562721, + "learning_rate": 7.006081337824524e-05, + "loss": 0.8799, + "step": 19220 + }, + { + "epoch": 4.100042662116041, + "grad_norm": 0.18686414582488925, + "learning_rate": 7.005236960086775e-05, + "loss": 0.9046, + "step": 19221 + }, + { + "epoch": 4.100255972696246, + "grad_norm": 0.21113462587825407, + "learning_rate": 7.004392593604787e-05, + "loss": 0.88, + "step": 19222 + }, + { + "epoch": 4.100469283276451, + "grad_norm": 0.19389823330887926, + "learning_rate": 7.003548238388112e-05, + "loss": 0.8893, + "step": 19223 + }, + { + "epoch": 4.100682593856655, + "grad_norm": 0.16567558023568837, + "learning_rate": 7.002703894446308e-05, + "loss": 0.88, + "step": 19224 + }, + { + "epoch": 4.10089590443686, + "grad_norm": 0.2127243729454617, + "learning_rate": 7.001859561788923e-05, + "loss": 0.87, + "step": 19225 + }, + { + "epoch": 4.101109215017065, + "grad_norm": 0.19572588481349654, + "learning_rate": 7.001015240425517e-05, + "loss": 0.8591, + "step": 19226 + }, + { + "epoch": 4.101322525597269, + "grad_norm": 0.1916148397368714, + "learning_rate": 7.00017093036564e-05, + "loss": 0.9016, + "step": 19227 + }, + { + "epoch": 4.101535836177474, + "grad_norm": 0.1530556239129705, + "learning_rate": 6.999326631618844e-05, + "loss": 0.9168, + "step": 19228 + }, + { + "epoch": 4.101749146757679, + "grad_norm": 0.18595830792653084, + "learning_rate": 6.998482344194686e-05, + "loss": 0.9066, + "step": 19229 + }, + { + "epoch": 4.101962457337884, + "grad_norm": 0.17983021380015973, + "learning_rate": 6.997638068102718e-05, + "loss": 0.8957, + "step": 19230 + }, + { + "epoch": 4.102175767918089, + "grad_norm": 0.2166111399188493, + "learning_rate": 6.99679380335249e-05, + "loss": 0.9376, + "step": 19231 + }, + { + "epoch": 4.102389078498294, + "grad_norm": 0.1899120620633539, + "learning_rate": 6.995949549953558e-05, + "loss": 0.8933, + "step": 19232 + }, + { + "epoch": 4.102602389078498, + "grad_norm": 0.2108334055366925, + "learning_rate": 6.995105307915471e-05, + "loss": 0.8772, + "step": 19233 + }, + { + "epoch": 4.102815699658703, + "grad_norm": 0.17298969702732853, + "learning_rate": 6.994261077247787e-05, + "loss": 0.8812, + "step": 19234 + }, + { + "epoch": 4.103029010238908, + "grad_norm": 0.2781305359024516, + "learning_rate": 6.993416857960054e-05, + "loss": 0.9089, + "step": 19235 + }, + { + "epoch": 4.103242320819112, + "grad_norm": 0.2200460639354933, + "learning_rate": 6.992572650061828e-05, + "loss": 0.9128, + "step": 19236 + }, + { + "epoch": 4.103455631399317, + "grad_norm": 0.16421920812636887, + "learning_rate": 6.991728453562659e-05, + "loss": 0.9048, + "step": 19237 + }, + { + "epoch": 4.103668941979522, + "grad_norm": 0.16752274775559264, + "learning_rate": 6.9908842684721e-05, + "loss": 0.884, + "step": 19238 + }, + { + "epoch": 4.103882252559727, + "grad_norm": 0.20304703623980652, + "learning_rate": 6.990040094799702e-05, + "loss": 0.8999, + "step": 19239 + }, + { + "epoch": 4.104095563139932, + "grad_norm": 0.21822833272146894, + "learning_rate": 6.989195932555015e-05, + "loss": 0.8989, + "step": 19240 + }, + { + "epoch": 4.104308873720137, + "grad_norm": 0.20064948904634589, + "learning_rate": 6.988351781747599e-05, + "loss": 0.905, + "step": 19241 + }, + { + "epoch": 4.104522184300341, + "grad_norm": 0.17926894501993174, + "learning_rate": 6.987507642386996e-05, + "loss": 0.8873, + "step": 19242 + }, + { + "epoch": 4.104735494880546, + "grad_norm": 0.21362639773352574, + "learning_rate": 6.986663514482759e-05, + "loss": 0.9075, + "step": 19243 + }, + { + "epoch": 4.104948805460751, + "grad_norm": 0.6315216656185263, + "learning_rate": 6.985819398044444e-05, + "loss": 0.8971, + "step": 19244 + }, + { + "epoch": 4.1051621160409555, + "grad_norm": 0.20917145527897105, + "learning_rate": 6.984975293081598e-05, + "loss": 0.922, + "step": 19245 + }, + { + "epoch": 4.10537542662116, + "grad_norm": 0.2027011023288179, + "learning_rate": 6.984131199603774e-05, + "loss": 0.8818, + "step": 19246 + }, + { + "epoch": 4.105588737201365, + "grad_norm": 0.1918675192690831, + "learning_rate": 6.983287117620522e-05, + "loss": 0.8807, + "step": 19247 + }, + { + "epoch": 4.1058020477815695, + "grad_norm": 0.24265637828995557, + "learning_rate": 6.982443047141395e-05, + "loss": 0.9191, + "step": 19248 + }, + { + "epoch": 4.106015358361775, + "grad_norm": 0.17283643402572801, + "learning_rate": 6.98159898817594e-05, + "loss": 0.8686, + "step": 19249 + }, + { + "epoch": 4.10622866894198, + "grad_norm": 0.20653226345526965, + "learning_rate": 6.980754940733712e-05, + "loss": 0.9018, + "step": 19250 + }, + { + "epoch": 4.1064419795221845, + "grad_norm": 0.18045896892724747, + "learning_rate": 6.979910904824259e-05, + "loss": 0.8923, + "step": 19251 + }, + { + "epoch": 4.106655290102389, + "grad_norm": 0.18805565822549916, + "learning_rate": 6.979066880457132e-05, + "loss": 0.9258, + "step": 19252 + }, + { + "epoch": 4.106868600682594, + "grad_norm": 0.20410017293014984, + "learning_rate": 6.978222867641877e-05, + "loss": 0.8824, + "step": 19253 + }, + { + "epoch": 4.1070819112627985, + "grad_norm": 0.1894468276464364, + "learning_rate": 6.977378866388051e-05, + "loss": 0.8849, + "step": 19254 + }, + { + "epoch": 4.107295221843003, + "grad_norm": 0.16006064639055684, + "learning_rate": 6.976534876705198e-05, + "loss": 0.9445, + "step": 19255 + }, + { + "epoch": 4.107508532423208, + "grad_norm": 0.174731319207215, + "learning_rate": 6.975690898602869e-05, + "loss": 0.9382, + "step": 19256 + }, + { + "epoch": 4.107721843003413, + "grad_norm": 0.19562431433276153, + "learning_rate": 6.974846932090616e-05, + "loss": 0.9211, + "step": 19257 + }, + { + "epoch": 4.107935153583618, + "grad_norm": 0.18987920646720394, + "learning_rate": 6.974002977177987e-05, + "loss": 0.9021, + "step": 19258 + }, + { + "epoch": 4.108148464163823, + "grad_norm": 0.17366455844719375, + "learning_rate": 6.973159033874531e-05, + "loss": 0.9004, + "step": 19259 + }, + { + "epoch": 4.1083617747440275, + "grad_norm": 0.20867320635934852, + "learning_rate": 6.9723151021898e-05, + "loss": 0.9093, + "step": 19260 + }, + { + "epoch": 4.108575085324232, + "grad_norm": 0.18456255440943334, + "learning_rate": 6.971471182133338e-05, + "loss": 0.8982, + "step": 19261 + }, + { + "epoch": 4.108788395904437, + "grad_norm": 0.2667815965972143, + "learning_rate": 6.9706272737147e-05, + "loss": 0.9036, + "step": 19262 + }, + { + "epoch": 4.109001706484642, + "grad_norm": 0.21301662305983282, + "learning_rate": 6.969783376943432e-05, + "loss": 0.8633, + "step": 19263 + }, + { + "epoch": 4.109215017064846, + "grad_norm": 0.18396024966018976, + "learning_rate": 6.968939491829082e-05, + "loss": 0.9035, + "step": 19264 + }, + { + "epoch": 4.109428327645051, + "grad_norm": 0.21708144767421927, + "learning_rate": 6.968095618381196e-05, + "loss": 0.8957, + "step": 19265 + }, + { + "epoch": 4.109641638225256, + "grad_norm": 0.1888083209592285, + "learning_rate": 6.967251756609327e-05, + "loss": 0.9059, + "step": 19266 + }, + { + "epoch": 4.10985494880546, + "grad_norm": 0.23276038757340195, + "learning_rate": 6.966407906523022e-05, + "loss": 0.9302, + "step": 19267 + }, + { + "epoch": 4.110068259385666, + "grad_norm": 0.2531575699727373, + "learning_rate": 6.96556406813183e-05, + "loss": 0.8935, + "step": 19268 + }, + { + "epoch": 4.110281569965871, + "grad_norm": 0.22268750210377236, + "learning_rate": 6.964720241445295e-05, + "loss": 0.9316, + "step": 19269 + }, + { + "epoch": 4.110494880546075, + "grad_norm": 0.18751905992022402, + "learning_rate": 6.963876426472971e-05, + "loss": 0.9302, + "step": 19270 + }, + { + "epoch": 4.11070819112628, + "grad_norm": 0.2325668227686575, + "learning_rate": 6.963032623224399e-05, + "loss": 0.8969, + "step": 19271 + }, + { + "epoch": 4.110921501706485, + "grad_norm": 0.1895458953723721, + "learning_rate": 6.962188831709133e-05, + "loss": 0.916, + "step": 19272 + }, + { + "epoch": 4.111134812286689, + "grad_norm": 0.19021382414353272, + "learning_rate": 6.961345051936716e-05, + "loss": 0.8909, + "step": 19273 + }, + { + "epoch": 4.111348122866894, + "grad_norm": 0.20680513717940477, + "learning_rate": 6.960501283916703e-05, + "loss": 0.8987, + "step": 19274 + }, + { + "epoch": 4.111561433447099, + "grad_norm": 0.20462660371255767, + "learning_rate": 6.95965752765863e-05, + "loss": 0.9125, + "step": 19275 + }, + { + "epoch": 4.111774744027303, + "grad_norm": 0.27071256119334763, + "learning_rate": 6.958813783172051e-05, + "loss": 0.8988, + "step": 19276 + }, + { + "epoch": 4.111988054607509, + "grad_norm": 0.29403002447112153, + "learning_rate": 6.957970050466512e-05, + "loss": 0.9398, + "step": 19277 + }, + { + "epoch": 4.112201365187714, + "grad_norm": 0.2038286489905469, + "learning_rate": 6.957126329551557e-05, + "loss": 0.916, + "step": 19278 + }, + { + "epoch": 4.112414675767918, + "grad_norm": 0.2073473899152025, + "learning_rate": 6.956282620436737e-05, + "loss": 0.885, + "step": 19279 + }, + { + "epoch": 4.112627986348123, + "grad_norm": 0.26207542293566344, + "learning_rate": 6.955438923131597e-05, + "loss": 0.8961, + "step": 19280 + }, + { + "epoch": 4.112841296928328, + "grad_norm": 0.2789982726642691, + "learning_rate": 6.95459523764568e-05, + "loss": 0.8968, + "step": 19281 + }, + { + "epoch": 4.113054607508532, + "grad_norm": 0.20388980409986848, + "learning_rate": 6.95375156398854e-05, + "loss": 0.8815, + "step": 19282 + }, + { + "epoch": 4.113267918088737, + "grad_norm": 0.1755096720706545, + "learning_rate": 6.952907902169715e-05, + "loss": 0.8975, + "step": 19283 + }, + { + "epoch": 4.113481228668942, + "grad_norm": 0.2401413015184329, + "learning_rate": 6.952064252198757e-05, + "loss": 0.9044, + "step": 19284 + }, + { + "epoch": 4.113694539249146, + "grad_norm": 0.22553627339884108, + "learning_rate": 6.951220614085208e-05, + "loss": 0.9132, + "step": 19285 + }, + { + "epoch": 4.113907849829351, + "grad_norm": 0.23240697467769458, + "learning_rate": 6.950376987838621e-05, + "loss": 0.9049, + "step": 19286 + }, + { + "epoch": 4.114121160409557, + "grad_norm": 0.1889690832452959, + "learning_rate": 6.94953337346853e-05, + "loss": 0.8812, + "step": 19287 + }, + { + "epoch": 4.114334470989761, + "grad_norm": 0.1819455684089713, + "learning_rate": 6.948689770984489e-05, + "loss": 0.928, + "step": 19288 + }, + { + "epoch": 4.114547781569966, + "grad_norm": 0.2036060072035157, + "learning_rate": 6.947846180396039e-05, + "loss": 0.8839, + "step": 19289 + }, + { + "epoch": 4.114761092150171, + "grad_norm": 0.19091929575531594, + "learning_rate": 6.947002601712728e-05, + "loss": 0.8833, + "step": 19290 + }, + { + "epoch": 4.114974402730375, + "grad_norm": 0.17705218927846153, + "learning_rate": 6.9461590349441e-05, + "loss": 0.941, + "step": 19291 + }, + { + "epoch": 4.11518771331058, + "grad_norm": 0.214202209479532, + "learning_rate": 6.945315480099701e-05, + "loss": 0.9097, + "step": 19292 + }, + { + "epoch": 4.115401023890785, + "grad_norm": 0.2005513500934144, + "learning_rate": 6.944471937189073e-05, + "loss": 0.8749, + "step": 19293 + }, + { + "epoch": 4.1156143344709895, + "grad_norm": 0.2155636509210359, + "learning_rate": 6.943628406221765e-05, + "loss": 0.9208, + "step": 19294 + }, + { + "epoch": 4.115827645051194, + "grad_norm": 0.1720688934616961, + "learning_rate": 6.942784887207318e-05, + "loss": 0.886, + "step": 19295 + }, + { + "epoch": 4.1160409556314, + "grad_norm": 0.17658581280354588, + "learning_rate": 6.941941380155279e-05, + "loss": 0.8971, + "step": 19296 + }, + { + "epoch": 4.116254266211604, + "grad_norm": 0.2175385360339506, + "learning_rate": 6.941097885075193e-05, + "loss": 0.9037, + "step": 19297 + }, + { + "epoch": 4.116467576791809, + "grad_norm": 0.21821171689573624, + "learning_rate": 6.940254401976601e-05, + "loss": 0.8616, + "step": 19298 + }, + { + "epoch": 4.116680887372014, + "grad_norm": 0.27132676508042575, + "learning_rate": 6.939410930869046e-05, + "loss": 0.9146, + "step": 19299 + }, + { + "epoch": 4.1168941979522184, + "grad_norm": 0.20134459531305535, + "learning_rate": 6.938567471762076e-05, + "loss": 0.9039, + "step": 19300 + }, + { + "epoch": 4.117107508532423, + "grad_norm": 0.21954012314677146, + "learning_rate": 6.937724024665232e-05, + "loss": 0.932, + "step": 19301 + }, + { + "epoch": 4.117320819112628, + "grad_norm": 0.22137039106741271, + "learning_rate": 6.936880589588058e-05, + "loss": 0.9437, + "step": 19302 + }, + { + "epoch": 4.1175341296928325, + "grad_norm": 0.18715204840005598, + "learning_rate": 6.936037166540098e-05, + "loss": 0.8923, + "step": 19303 + }, + { + "epoch": 4.117747440273037, + "grad_norm": 0.15517879064658177, + "learning_rate": 6.935193755530898e-05, + "loss": 0.9149, + "step": 19304 + }, + { + "epoch": 4.117960750853243, + "grad_norm": 0.19628052337870003, + "learning_rate": 6.934350356569995e-05, + "loss": 0.8908, + "step": 19305 + }, + { + "epoch": 4.118174061433447, + "grad_norm": 0.15620703827411658, + "learning_rate": 6.933506969666937e-05, + "loss": 0.8689, + "step": 19306 + }, + { + "epoch": 4.118387372013652, + "grad_norm": 0.2023717071680834, + "learning_rate": 6.932663594831265e-05, + "loss": 0.9046, + "step": 19307 + }, + { + "epoch": 4.118600682593857, + "grad_norm": 0.14446870406506412, + "learning_rate": 6.931820232072528e-05, + "loss": 0.9027, + "step": 19308 + }, + { + "epoch": 4.1188139931740615, + "grad_norm": 0.17739100894874754, + "learning_rate": 6.930976881400257e-05, + "loss": 0.8842, + "step": 19309 + }, + { + "epoch": 4.119027303754266, + "grad_norm": 0.17887141181449817, + "learning_rate": 6.930133542824003e-05, + "loss": 0.8949, + "step": 19310 + }, + { + "epoch": 4.119240614334471, + "grad_norm": 0.18420993522451712, + "learning_rate": 6.929290216353303e-05, + "loss": 0.922, + "step": 19311 + }, + { + "epoch": 4.1194539249146755, + "grad_norm": 0.21642156851360375, + "learning_rate": 6.928446901997705e-05, + "loss": 0.9088, + "step": 19312 + }, + { + "epoch": 4.11966723549488, + "grad_norm": 0.16775943735211468, + "learning_rate": 6.927603599766747e-05, + "loss": 0.8987, + "step": 19313 + }, + { + "epoch": 4.119880546075085, + "grad_norm": 0.19055526594169656, + "learning_rate": 6.926760309669974e-05, + "loss": 0.9123, + "step": 19314 + }, + { + "epoch": 4.1200938566552905, + "grad_norm": 0.17812725640183258, + "learning_rate": 6.925917031716923e-05, + "loss": 0.9021, + "step": 19315 + }, + { + "epoch": 4.120307167235495, + "grad_norm": 0.16655949686785593, + "learning_rate": 6.92507376591714e-05, + "loss": 0.8759, + "step": 19316 + }, + { + "epoch": 4.1205204778157, + "grad_norm": 0.18538078679119568, + "learning_rate": 6.924230512280166e-05, + "loss": 0.9161, + "step": 19317 + }, + { + "epoch": 4.1207337883959045, + "grad_norm": 0.18050382711469712, + "learning_rate": 6.923387270815544e-05, + "loss": 0.9028, + "step": 19318 + }, + { + "epoch": 4.120947098976109, + "grad_norm": 0.17453857229760944, + "learning_rate": 6.922544041532812e-05, + "loss": 0.9298, + "step": 19319 + }, + { + "epoch": 4.121160409556314, + "grad_norm": 0.18615940619442928, + "learning_rate": 6.921700824441513e-05, + "loss": 0.9009, + "step": 19320 + }, + { + "epoch": 4.121373720136519, + "grad_norm": 0.1777948333314721, + "learning_rate": 6.920857619551184e-05, + "loss": 0.918, + "step": 19321 + }, + { + "epoch": 4.121587030716723, + "grad_norm": 0.18547946394455186, + "learning_rate": 6.920014426871372e-05, + "loss": 0.8704, + "step": 19322 + }, + { + "epoch": 4.121800341296928, + "grad_norm": 0.16660568208331583, + "learning_rate": 6.919171246411611e-05, + "loss": 0.9333, + "step": 19323 + }, + { + "epoch": 4.1220136518771335, + "grad_norm": 0.19097182390201295, + "learning_rate": 6.91832807818145e-05, + "loss": 0.8872, + "step": 19324 + }, + { + "epoch": 4.122226962457338, + "grad_norm": 0.19280340771030194, + "learning_rate": 6.917484922190422e-05, + "loss": 0.895, + "step": 19325 + }, + { + "epoch": 4.122440273037543, + "grad_norm": 0.21798746382267367, + "learning_rate": 6.916641778448073e-05, + "loss": 0.9338, + "step": 19326 + }, + { + "epoch": 4.122653583617748, + "grad_norm": 0.23234141026875998, + "learning_rate": 6.915798646963938e-05, + "loss": 0.9227, + "step": 19327 + }, + { + "epoch": 4.122866894197952, + "grad_norm": 0.21258301138332616, + "learning_rate": 6.914955527747562e-05, + "loss": 0.8959, + "step": 19328 + }, + { + "epoch": 4.123080204778157, + "grad_norm": 0.3439849636320236, + "learning_rate": 6.91411242080848e-05, + "loss": 0.8958, + "step": 19329 + }, + { + "epoch": 4.123293515358362, + "grad_norm": 0.29063498556129297, + "learning_rate": 6.913269326156238e-05, + "loss": 0.9157, + "step": 19330 + }, + { + "epoch": 4.123506825938566, + "grad_norm": 0.18391588351074212, + "learning_rate": 6.912426243800369e-05, + "loss": 0.8978, + "step": 19331 + }, + { + "epoch": 4.123720136518771, + "grad_norm": 0.2428209109003394, + "learning_rate": 6.911583173750417e-05, + "loss": 0.8963, + "step": 19332 + }, + { + "epoch": 4.123933447098976, + "grad_norm": 0.2396113707357451, + "learning_rate": 6.910740116015916e-05, + "loss": 0.8756, + "step": 19333 + }, + { + "epoch": 4.124146757679181, + "grad_norm": 0.25984949277418506, + "learning_rate": 6.909897070606412e-05, + "loss": 0.8943, + "step": 19334 + }, + { + "epoch": 4.124360068259386, + "grad_norm": 0.2275671048096443, + "learning_rate": 6.909054037531439e-05, + "loss": 0.8992, + "step": 19335 + }, + { + "epoch": 4.124573378839591, + "grad_norm": 0.1919327300040831, + "learning_rate": 6.90821101680054e-05, + "loss": 0.9088, + "step": 19336 + }, + { + "epoch": 4.124786689419795, + "grad_norm": 0.16254613984499044, + "learning_rate": 6.907368008423251e-05, + "loss": 0.8866, + "step": 19337 + }, + { + "epoch": 4.125, + "grad_norm": 0.200265180052803, + "learning_rate": 6.906525012409111e-05, + "loss": 0.889, + "step": 19338 + }, + { + "epoch": 4.125213310580205, + "grad_norm": 0.18257896515929306, + "learning_rate": 6.905682028767658e-05, + "loss": 0.8974, + "step": 19339 + }, + { + "epoch": 4.125426621160409, + "grad_norm": 0.17908985828401078, + "learning_rate": 6.904839057508433e-05, + "loss": 0.8868, + "step": 19340 + }, + { + "epoch": 4.125639931740614, + "grad_norm": 0.24427046107526032, + "learning_rate": 6.903996098640971e-05, + "loss": 0.9095, + "step": 19341 + }, + { + "epoch": 4.125853242320819, + "grad_norm": 0.22097082101962484, + "learning_rate": 6.903153152174815e-05, + "loss": 0.9106, + "step": 19342 + }, + { + "epoch": 4.126066552901024, + "grad_norm": 0.25522059167700156, + "learning_rate": 6.902310218119496e-05, + "loss": 0.8836, + "step": 19343 + }, + { + "epoch": 4.126279863481229, + "grad_norm": 0.16706078905015967, + "learning_rate": 6.901467296484558e-05, + "loss": 0.9061, + "step": 19344 + }, + { + "epoch": 4.126493174061434, + "grad_norm": 0.18425788577558508, + "learning_rate": 6.900624387279533e-05, + "loss": 0.885, + "step": 19345 + }, + { + "epoch": 4.126706484641638, + "grad_norm": 0.19567203197183552, + "learning_rate": 6.899781490513963e-05, + "loss": 0.889, + "step": 19346 + }, + { + "epoch": 4.126919795221843, + "grad_norm": 0.1853125758965342, + "learning_rate": 6.898938606197383e-05, + "loss": 0.8847, + "step": 19347 + }, + { + "epoch": 4.127133105802048, + "grad_norm": 0.2114238954513467, + "learning_rate": 6.898095734339333e-05, + "loss": 0.9233, + "step": 19348 + }, + { + "epoch": 4.127346416382252, + "grad_norm": 0.17987048695341157, + "learning_rate": 6.897252874949345e-05, + "loss": 0.8785, + "step": 19349 + }, + { + "epoch": 4.127559726962457, + "grad_norm": 0.16343199084268623, + "learning_rate": 6.896410028036963e-05, + "loss": 0.8942, + "step": 19350 + }, + { + "epoch": 4.127773037542662, + "grad_norm": 0.1726177265040212, + "learning_rate": 6.895567193611717e-05, + "loss": 0.905, + "step": 19351 + }, + { + "epoch": 4.1279863481228665, + "grad_norm": 0.20970961730930707, + "learning_rate": 6.89472437168315e-05, + "loss": 0.8953, + "step": 19352 + }, + { + "epoch": 4.128199658703072, + "grad_norm": 0.18083086469825715, + "learning_rate": 6.893881562260796e-05, + "loss": 0.9329, + "step": 19353 + }, + { + "epoch": 4.128412969283277, + "grad_norm": 0.16243395103473715, + "learning_rate": 6.893038765354187e-05, + "loss": 0.909, + "step": 19354 + }, + { + "epoch": 4.128626279863481, + "grad_norm": 0.19615155567130166, + "learning_rate": 6.892195980972866e-05, + "loss": 0.9168, + "step": 19355 + }, + { + "epoch": 4.128839590443686, + "grad_norm": 0.18981161049481965, + "learning_rate": 6.891353209126365e-05, + "loss": 0.9406, + "step": 19356 + }, + { + "epoch": 4.129052901023891, + "grad_norm": 0.22262132678646754, + "learning_rate": 6.890510449824218e-05, + "loss": 0.9212, + "step": 19357 + }, + { + "epoch": 4.1292662116040955, + "grad_norm": 0.2263383735514129, + "learning_rate": 6.889667703075967e-05, + "loss": 0.8839, + "step": 19358 + }, + { + "epoch": 4.1294795221843, + "grad_norm": 0.2066773904174208, + "learning_rate": 6.888824968891144e-05, + "loss": 0.8981, + "step": 19359 + }, + { + "epoch": 4.129692832764505, + "grad_norm": 0.17910824723185037, + "learning_rate": 6.887982247279285e-05, + "loss": 0.8887, + "step": 19360 + }, + { + "epoch": 4.1299061433447095, + "grad_norm": 0.16854729723034692, + "learning_rate": 6.887139538249925e-05, + "loss": 0.8948, + "step": 19361 + }, + { + "epoch": 4.130119453924915, + "grad_norm": 0.20446398623094972, + "learning_rate": 6.886296841812602e-05, + "loss": 0.9121, + "step": 19362 + }, + { + "epoch": 4.13033276450512, + "grad_norm": 0.17021327761782903, + "learning_rate": 6.885454157976846e-05, + "loss": 0.9312, + "step": 19363 + }, + { + "epoch": 4.1305460750853245, + "grad_norm": 0.13990951282554887, + "learning_rate": 6.8846114867522e-05, + "loss": 0.9169, + "step": 19364 + }, + { + "epoch": 4.130759385665529, + "grad_norm": 0.15394415051551696, + "learning_rate": 6.88376882814819e-05, + "loss": 0.9321, + "step": 19365 + }, + { + "epoch": 4.130972696245734, + "grad_norm": 0.1621772274333901, + "learning_rate": 6.882926182174354e-05, + "loss": 0.9017, + "step": 19366 + }, + { + "epoch": 4.1311860068259385, + "grad_norm": 0.1707188463774342, + "learning_rate": 6.88208354884023e-05, + "loss": 0.9109, + "step": 19367 + }, + { + "epoch": 4.131399317406143, + "grad_norm": 0.16187644759025954, + "learning_rate": 6.881240928155346e-05, + "loss": 0.9109, + "step": 19368 + }, + { + "epoch": 4.131612627986348, + "grad_norm": 0.14831526834236064, + "learning_rate": 6.880398320129242e-05, + "loss": 0.9347, + "step": 19369 + }, + { + "epoch": 4.131825938566553, + "grad_norm": 0.17442778132904657, + "learning_rate": 6.879555724771449e-05, + "loss": 0.9105, + "step": 19370 + }, + { + "epoch": 4.132039249146757, + "grad_norm": 0.1595597342098337, + "learning_rate": 6.878713142091501e-05, + "loss": 0.9229, + "step": 19371 + }, + { + "epoch": 4.132252559726963, + "grad_norm": 0.20220997261163165, + "learning_rate": 6.877870572098934e-05, + "loss": 0.9082, + "step": 19372 + }, + { + "epoch": 4.1324658703071675, + "grad_norm": 0.18870766673154185, + "learning_rate": 6.877028014803279e-05, + "loss": 0.9092, + "step": 19373 + }, + { + "epoch": 4.132679180887372, + "grad_norm": 0.16149291294246873, + "learning_rate": 6.876185470214073e-05, + "loss": 0.8828, + "step": 19374 + }, + { + "epoch": 4.132892491467577, + "grad_norm": 0.21220282444994948, + "learning_rate": 6.875342938340848e-05, + "loss": 0.8705, + "step": 19375 + }, + { + "epoch": 4.1331058020477816, + "grad_norm": 0.23906781804809368, + "learning_rate": 6.874500419193133e-05, + "loss": 0.9157, + "step": 19376 + }, + { + "epoch": 4.133319112627986, + "grad_norm": 0.23822060704216372, + "learning_rate": 6.873657912780467e-05, + "loss": 0.9299, + "step": 19377 + }, + { + "epoch": 4.133532423208191, + "grad_norm": 0.2036457938424112, + "learning_rate": 6.872815419112379e-05, + "loss": 0.8988, + "step": 19378 + }, + { + "epoch": 4.133745733788396, + "grad_norm": 0.158477443148685, + "learning_rate": 6.871972938198404e-05, + "loss": 0.8834, + "step": 19379 + }, + { + "epoch": 4.1339590443686, + "grad_norm": 0.19622303899122034, + "learning_rate": 6.871130470048076e-05, + "loss": 0.9234, + "step": 19380 + }, + { + "epoch": 4.134172354948806, + "grad_norm": 0.2261307807245374, + "learning_rate": 6.870288014670924e-05, + "loss": 0.8963, + "step": 19381 + }, + { + "epoch": 4.1343856655290105, + "grad_norm": 0.216345377841486, + "learning_rate": 6.869445572076483e-05, + "loss": 0.9104, + "step": 19382 + }, + { + "epoch": 4.134598976109215, + "grad_norm": 0.19265169529408852, + "learning_rate": 6.868603142274283e-05, + "loss": 0.915, + "step": 19383 + }, + { + "epoch": 4.13481228668942, + "grad_norm": 0.15735213770879528, + "learning_rate": 6.867760725273858e-05, + "loss": 0.8926, + "step": 19384 + }, + { + "epoch": 4.135025597269625, + "grad_norm": 0.15252796692165244, + "learning_rate": 6.866918321084737e-05, + "loss": 0.9081, + "step": 19385 + }, + { + "epoch": 4.135238907849829, + "grad_norm": 0.16615082014496305, + "learning_rate": 6.866075929716461e-05, + "loss": 0.895, + "step": 19386 + }, + { + "epoch": 4.135452218430034, + "grad_norm": 0.17150856880076473, + "learning_rate": 6.865233551178552e-05, + "loss": 0.8953, + "step": 19387 + }, + { + "epoch": 4.135665529010239, + "grad_norm": 0.15474616161755592, + "learning_rate": 6.864391185480542e-05, + "loss": 0.8859, + "step": 19388 + }, + { + "epoch": 4.135878839590443, + "grad_norm": 0.20571341342595936, + "learning_rate": 6.863548832631967e-05, + "loss": 0.8726, + "step": 19389 + }, + { + "epoch": 4.136092150170649, + "grad_norm": 0.18323917602413423, + "learning_rate": 6.862706492642354e-05, + "loss": 0.8922, + "step": 19390 + }, + { + "epoch": 4.136305460750854, + "grad_norm": 0.185513330192231, + "learning_rate": 6.861864165521237e-05, + "loss": 0.909, + "step": 19391 + }, + { + "epoch": 4.136518771331058, + "grad_norm": 0.21153640840825066, + "learning_rate": 6.861021851278149e-05, + "loss": 0.8706, + "step": 19392 + }, + { + "epoch": 4.136732081911263, + "grad_norm": 0.15166527697823476, + "learning_rate": 6.860179549922613e-05, + "loss": 0.8795, + "step": 19393 + }, + { + "epoch": 4.136945392491468, + "grad_norm": 0.19539435681253495, + "learning_rate": 6.859337261464168e-05, + "loss": 0.9055, + "step": 19394 + }, + { + "epoch": 4.137158703071672, + "grad_norm": 0.22088606613815642, + "learning_rate": 6.85849498591234e-05, + "loss": 0.903, + "step": 19395 + }, + { + "epoch": 4.137372013651877, + "grad_norm": 0.23228128019020403, + "learning_rate": 6.85765272327666e-05, + "loss": 0.9309, + "step": 19396 + }, + { + "epoch": 4.137585324232082, + "grad_norm": 0.20792927631697733, + "learning_rate": 6.856810473566659e-05, + "loss": 0.8983, + "step": 19397 + }, + { + "epoch": 4.137798634812286, + "grad_norm": 0.14721872377609, + "learning_rate": 6.855968236791869e-05, + "loss": 0.8961, + "step": 19398 + }, + { + "epoch": 4.138011945392491, + "grad_norm": 0.20459104428025623, + "learning_rate": 6.855126012961815e-05, + "loss": 0.9077, + "step": 19399 + }, + { + "epoch": 4.138225255972697, + "grad_norm": 0.1526611268781651, + "learning_rate": 6.85428380208603e-05, + "loss": 0.8896, + "step": 19400 + }, + { + "epoch": 4.138438566552901, + "grad_norm": 0.1929708525593898, + "learning_rate": 6.853441604174044e-05, + "loss": 0.9143, + "step": 19401 + }, + { + "epoch": 4.138651877133106, + "grad_norm": 0.20377767351678247, + "learning_rate": 6.852599419235382e-05, + "loss": 0.9028, + "step": 19402 + }, + { + "epoch": 4.138865187713311, + "grad_norm": 0.214439344824188, + "learning_rate": 6.851757247279582e-05, + "loss": 0.9066, + "step": 19403 + }, + { + "epoch": 4.139078498293515, + "grad_norm": 0.18543616564695883, + "learning_rate": 6.850915088316165e-05, + "loss": 0.8799, + "step": 19404 + }, + { + "epoch": 4.13929180887372, + "grad_norm": 0.22921203782766827, + "learning_rate": 6.850072942354663e-05, + "loss": 0.9241, + "step": 19405 + }, + { + "epoch": 4.139505119453925, + "grad_norm": 0.2327063046186468, + "learning_rate": 6.849230809404606e-05, + "loss": 0.8889, + "step": 19406 + }, + { + "epoch": 4.139718430034129, + "grad_norm": 0.257976464205091, + "learning_rate": 6.84838868947552e-05, + "loss": 0.8929, + "step": 19407 + }, + { + "epoch": 4.139931740614334, + "grad_norm": 0.22612909406516585, + "learning_rate": 6.847546582576938e-05, + "loss": 0.8812, + "step": 19408 + }, + { + "epoch": 4.140145051194539, + "grad_norm": 0.17084338420103426, + "learning_rate": 6.846704488718386e-05, + "loss": 0.9034, + "step": 19409 + }, + { + "epoch": 4.140358361774744, + "grad_norm": 0.21991702648530065, + "learning_rate": 6.84586240790939e-05, + "loss": 0.9013, + "step": 19410 + }, + { + "epoch": 4.140571672354949, + "grad_norm": 0.26086754615819846, + "learning_rate": 6.845020340159481e-05, + "loss": 0.9173, + "step": 19411 + }, + { + "epoch": 4.140784982935154, + "grad_norm": 0.25317442809028595, + "learning_rate": 6.844178285478185e-05, + "loss": 0.8945, + "step": 19412 + }, + { + "epoch": 4.140998293515358, + "grad_norm": 0.1964494863292625, + "learning_rate": 6.843336243875033e-05, + "loss": 0.8779, + "step": 19413 + }, + { + "epoch": 4.141211604095563, + "grad_norm": 0.16813812167698092, + "learning_rate": 6.842494215359552e-05, + "loss": 0.8943, + "step": 19414 + }, + { + "epoch": 4.141424914675768, + "grad_norm": 0.22630962027452514, + "learning_rate": 6.841652199941265e-05, + "loss": 0.9175, + "step": 19415 + }, + { + "epoch": 4.1416382252559725, + "grad_norm": 0.2567406361066553, + "learning_rate": 6.840810197629705e-05, + "loss": 0.8924, + "step": 19416 + }, + { + "epoch": 4.141851535836177, + "grad_norm": 0.24281820660665213, + "learning_rate": 6.839968208434396e-05, + "loss": 0.9305, + "step": 19417 + }, + { + "epoch": 4.142064846416382, + "grad_norm": 0.19162981378406457, + "learning_rate": 6.839126232364867e-05, + "loss": 0.9054, + "step": 19418 + }, + { + "epoch": 4.142278156996587, + "grad_norm": 0.20766072777022584, + "learning_rate": 6.838284269430642e-05, + "loss": 0.9242, + "step": 19419 + }, + { + "epoch": 4.142491467576792, + "grad_norm": 0.5014767714407657, + "learning_rate": 6.837442319641254e-05, + "loss": 1.011, + "step": 19420 + }, + { + "epoch": 4.142704778156997, + "grad_norm": 0.2298182895875529, + "learning_rate": 6.836600383006225e-05, + "loss": 0.8837, + "step": 19421 + }, + { + "epoch": 4.1429180887372015, + "grad_norm": 0.19452379290439162, + "learning_rate": 6.83575845953508e-05, + "loss": 0.9306, + "step": 19422 + }, + { + "epoch": 4.143131399317406, + "grad_norm": 0.16858949384883856, + "learning_rate": 6.834916549237349e-05, + "loss": 0.894, + "step": 19423 + }, + { + "epoch": 4.143344709897611, + "grad_norm": 0.18392858380674948, + "learning_rate": 6.834074652122556e-05, + "loss": 0.909, + "step": 19424 + }, + { + "epoch": 4.1435580204778155, + "grad_norm": 0.20942262492058367, + "learning_rate": 6.833232768200228e-05, + "loss": 0.8803, + "step": 19425 + }, + { + "epoch": 4.14377133105802, + "grad_norm": 0.2366765696666555, + "learning_rate": 6.832390897479892e-05, + "loss": 0.9005, + "step": 19426 + }, + { + "epoch": 4.143984641638225, + "grad_norm": 0.20317708915946767, + "learning_rate": 6.831549039971071e-05, + "loss": 0.9076, + "step": 19427 + }, + { + "epoch": 4.1441979522184305, + "grad_norm": 0.18690065640653064, + "learning_rate": 6.830707195683294e-05, + "loss": 0.8985, + "step": 19428 + }, + { + "epoch": 4.144411262798635, + "grad_norm": 0.2291896070572061, + "learning_rate": 6.829865364626083e-05, + "loss": 0.9047, + "step": 19429 + }, + { + "epoch": 4.14462457337884, + "grad_norm": 0.243225093927697, + "learning_rate": 6.829023546808966e-05, + "loss": 0.8761, + "step": 19430 + }, + { + "epoch": 4.1448378839590445, + "grad_norm": 0.21480821321369437, + "learning_rate": 6.82818174224147e-05, + "loss": 0.8971, + "step": 19431 + }, + { + "epoch": 4.145051194539249, + "grad_norm": 0.18202306820930714, + "learning_rate": 6.827339950933112e-05, + "loss": 0.9337, + "step": 19432 + }, + { + "epoch": 4.145264505119454, + "grad_norm": 0.16443688776227872, + "learning_rate": 6.826498172893427e-05, + "loss": 0.9036, + "step": 19433 + }, + { + "epoch": 4.145477815699659, + "grad_norm": 0.22094544852775558, + "learning_rate": 6.825656408131931e-05, + "loss": 0.9164, + "step": 19434 + }, + { + "epoch": 4.145691126279863, + "grad_norm": 0.22887878293732702, + "learning_rate": 6.824814656658155e-05, + "loss": 0.9133, + "step": 19435 + }, + { + "epoch": 4.145904436860068, + "grad_norm": 0.2641838873770841, + "learning_rate": 6.823972918481617e-05, + "loss": 0.9, + "step": 19436 + }, + { + "epoch": 4.146117747440273, + "grad_norm": 0.1884208733924772, + "learning_rate": 6.82313119361185e-05, + "loss": 0.8988, + "step": 19437 + }, + { + "epoch": 4.146331058020478, + "grad_norm": 0.2148132175625571, + "learning_rate": 6.822289482058372e-05, + "loss": 0.8975, + "step": 19438 + }, + { + "epoch": 4.146544368600683, + "grad_norm": 0.20491599181543757, + "learning_rate": 6.821447783830707e-05, + "loss": 0.9097, + "step": 19439 + }, + { + "epoch": 4.146757679180888, + "grad_norm": 0.226591090853086, + "learning_rate": 6.820606098938381e-05, + "loss": 0.92, + "step": 19440 + }, + { + "epoch": 4.146970989761092, + "grad_norm": 0.21948599311246006, + "learning_rate": 6.819764427390916e-05, + "loss": 0.9253, + "step": 19441 + }, + { + "epoch": 4.147184300341297, + "grad_norm": 0.16769505451153496, + "learning_rate": 6.81892276919784e-05, + "loss": 0.8677, + "step": 19442 + }, + { + "epoch": 4.147397610921502, + "grad_norm": 0.19879537330429986, + "learning_rate": 6.81808112436867e-05, + "loss": 0.8966, + "step": 19443 + }, + { + "epoch": 4.147610921501706, + "grad_norm": 0.20379007161486462, + "learning_rate": 6.817239492912931e-05, + "loss": 0.9092, + "step": 19444 + }, + { + "epoch": 4.147824232081911, + "grad_norm": 0.23516767605089303, + "learning_rate": 6.816397874840149e-05, + "loss": 0.9031, + "step": 19445 + }, + { + "epoch": 4.148037542662116, + "grad_norm": 0.1649174734205185, + "learning_rate": 6.815556270159842e-05, + "loss": 0.8987, + "step": 19446 + }, + { + "epoch": 4.148250853242321, + "grad_norm": 0.1888186540767343, + "learning_rate": 6.814714678881538e-05, + "loss": 0.8794, + "step": 19447 + }, + { + "epoch": 4.148464163822526, + "grad_norm": 0.26392032872686644, + "learning_rate": 6.813873101014757e-05, + "loss": 0.8845, + "step": 19448 + }, + { + "epoch": 4.148677474402731, + "grad_norm": 0.2162594080591019, + "learning_rate": 6.813031536569023e-05, + "loss": 0.8893, + "step": 19449 + }, + { + "epoch": 4.148890784982935, + "grad_norm": 0.20658215721070503, + "learning_rate": 6.812189985553856e-05, + "loss": 0.8871, + "step": 19450 + }, + { + "epoch": 4.14910409556314, + "grad_norm": 0.22572204098047424, + "learning_rate": 6.811348447978778e-05, + "loss": 0.9167, + "step": 19451 + }, + { + "epoch": 4.149317406143345, + "grad_norm": 0.2360576698159991, + "learning_rate": 6.810506923853313e-05, + "loss": 0.9201, + "step": 19452 + }, + { + "epoch": 4.149530716723549, + "grad_norm": 0.21388741743442163, + "learning_rate": 6.809665413186987e-05, + "loss": 0.8565, + "step": 19453 + }, + { + "epoch": 4.149744027303754, + "grad_norm": 0.20054560233340032, + "learning_rate": 6.80882391598931e-05, + "loss": 0.8652, + "step": 19454 + }, + { + "epoch": 4.149957337883959, + "grad_norm": 0.23704679303167617, + "learning_rate": 6.807982432269814e-05, + "loss": 0.8875, + "step": 19455 + }, + { + "epoch": 4.150170648464163, + "grad_norm": 0.23334056525417562, + "learning_rate": 6.807140962038014e-05, + "loss": 0.9191, + "step": 19456 + }, + { + "epoch": 4.150383959044369, + "grad_norm": 0.2121951352732674, + "learning_rate": 6.806299505303437e-05, + "loss": 0.8961, + "step": 19457 + }, + { + "epoch": 4.150597269624574, + "grad_norm": 0.17006358575754163, + "learning_rate": 6.805458062075599e-05, + "loss": 0.9299, + "step": 19458 + }, + { + "epoch": 4.150810580204778, + "grad_norm": 0.214103529179401, + "learning_rate": 6.804616632364025e-05, + "loss": 0.9346, + "step": 19459 + }, + { + "epoch": 4.151023890784983, + "grad_norm": 0.22649564100343555, + "learning_rate": 6.80377521617823e-05, + "loss": 0.8986, + "step": 19460 + }, + { + "epoch": 4.151237201365188, + "grad_norm": 0.19700838200826107, + "learning_rate": 6.802933813527744e-05, + "loss": 0.8938, + "step": 19461 + }, + { + "epoch": 4.151450511945392, + "grad_norm": 0.18055290299848847, + "learning_rate": 6.80209242442208e-05, + "loss": 0.8987, + "step": 19462 + }, + { + "epoch": 4.151663822525597, + "grad_norm": 0.1809023101182603, + "learning_rate": 6.801251048870759e-05, + "loss": 0.9041, + "step": 19463 + }, + { + "epoch": 4.151877133105802, + "grad_norm": 0.1852394656249978, + "learning_rate": 6.800409686883305e-05, + "loss": 0.8822, + "step": 19464 + }, + { + "epoch": 4.152090443686006, + "grad_norm": 0.17251183066230164, + "learning_rate": 6.799568338469236e-05, + "loss": 0.9051, + "step": 19465 + }, + { + "epoch": 4.152303754266212, + "grad_norm": 0.1908280815912848, + "learning_rate": 6.798727003638069e-05, + "loss": 0.8952, + "step": 19466 + }, + { + "epoch": 4.152517064846417, + "grad_norm": 0.2185694011965761, + "learning_rate": 6.797885682399327e-05, + "loss": 0.9, + "step": 19467 + }, + { + "epoch": 4.152730375426621, + "grad_norm": 0.23524940774838385, + "learning_rate": 6.797044374762528e-05, + "loss": 0.8789, + "step": 19468 + }, + { + "epoch": 4.152943686006826, + "grad_norm": 0.2175592471206583, + "learning_rate": 6.796203080737192e-05, + "loss": 0.8955, + "step": 19469 + }, + { + "epoch": 4.153156996587031, + "grad_norm": 0.19479252611325826, + "learning_rate": 6.795361800332838e-05, + "loss": 0.9022, + "step": 19470 + }, + { + "epoch": 4.153370307167235, + "grad_norm": 0.2734930153683194, + "learning_rate": 6.794520533558987e-05, + "loss": 0.8891, + "step": 19471 + }, + { + "epoch": 4.15358361774744, + "grad_norm": 0.27914318255219506, + "learning_rate": 6.793679280425155e-05, + "loss": 0.897, + "step": 19472 + }, + { + "epoch": 4.153796928327645, + "grad_norm": 0.24149996679720404, + "learning_rate": 6.792838040940864e-05, + "loss": 0.9057, + "step": 19473 + }, + { + "epoch": 4.1540102389078495, + "grad_norm": 0.14964807743798916, + "learning_rate": 6.79199681511563e-05, + "loss": 0.9056, + "step": 19474 + }, + { + "epoch": 4.154223549488055, + "grad_norm": 0.23240567389915504, + "learning_rate": 6.791155602958972e-05, + "loss": 0.8929, + "step": 19475 + }, + { + "epoch": 4.15443686006826, + "grad_norm": 0.2232459350384526, + "learning_rate": 6.790314404480411e-05, + "loss": 0.9183, + "step": 19476 + }, + { + "epoch": 4.154650170648464, + "grad_norm": 0.22631092436868794, + "learning_rate": 6.789473219689461e-05, + "loss": 0.9326, + "step": 19477 + }, + { + "epoch": 4.154863481228669, + "grad_norm": 0.19226843216724243, + "learning_rate": 6.788632048595641e-05, + "loss": 0.9168, + "step": 19478 + }, + { + "epoch": 4.155076791808874, + "grad_norm": 0.17638539242485224, + "learning_rate": 6.787790891208472e-05, + "loss": 0.875, + "step": 19479 + }, + { + "epoch": 4.1552901023890785, + "grad_norm": 0.27285467612890574, + "learning_rate": 6.786949747537467e-05, + "loss": 0.9339, + "step": 19480 + }, + { + "epoch": 4.155503412969283, + "grad_norm": 0.28090219198795363, + "learning_rate": 6.786108617592147e-05, + "loss": 0.894, + "step": 19481 + }, + { + "epoch": 4.155716723549488, + "grad_norm": 0.25687761795286956, + "learning_rate": 6.785267501382027e-05, + "loss": 0.9089, + "step": 19482 + }, + { + "epoch": 4.1559300341296925, + "grad_norm": 0.1604636871414189, + "learning_rate": 6.784426398916628e-05, + "loss": 0.8773, + "step": 19483 + }, + { + "epoch": 4.156143344709897, + "grad_norm": 0.2008955328499508, + "learning_rate": 6.783585310205462e-05, + "loss": 0.8813, + "step": 19484 + }, + { + "epoch": 4.156356655290103, + "grad_norm": 0.1993800581156485, + "learning_rate": 6.782744235258052e-05, + "loss": 0.9214, + "step": 19485 + }, + { + "epoch": 4.1565699658703075, + "grad_norm": 0.2548723939722683, + "learning_rate": 6.78190317408391e-05, + "loss": 0.8981, + "step": 19486 + }, + { + "epoch": 4.156783276450512, + "grad_norm": 0.2032662049650553, + "learning_rate": 6.781062126692558e-05, + "loss": 0.8975, + "step": 19487 + }, + { + "epoch": 4.156996587030717, + "grad_norm": 0.21359306863936336, + "learning_rate": 6.780221093093503e-05, + "loss": 0.8866, + "step": 19488 + }, + { + "epoch": 4.1572098976109215, + "grad_norm": 0.32031850829847314, + "learning_rate": 6.779380073296268e-05, + "loss": 0.8959, + "step": 19489 + }, + { + "epoch": 4.157423208191126, + "grad_norm": 0.2519042618211932, + "learning_rate": 6.778539067310368e-05, + "loss": 0.9295, + "step": 19490 + }, + { + "epoch": 4.157636518771331, + "grad_norm": 0.1937974337836924, + "learning_rate": 6.77769807514532e-05, + "loss": 0.8601, + "step": 19491 + }, + { + "epoch": 4.157849829351536, + "grad_norm": 0.26277828699907435, + "learning_rate": 6.776857096810636e-05, + "loss": 0.913, + "step": 19492 + }, + { + "epoch": 4.15806313993174, + "grad_norm": 0.26575645723054786, + "learning_rate": 6.776016132315838e-05, + "loss": 0.8961, + "step": 19493 + }, + { + "epoch": 4.158276450511945, + "grad_norm": 0.20567582842360968, + "learning_rate": 6.775175181670436e-05, + "loss": 0.9094, + "step": 19494 + }, + { + "epoch": 4.1584897610921505, + "grad_norm": 0.3160485292881384, + "learning_rate": 6.774334244883949e-05, + "loss": 0.8898, + "step": 19495 + }, + { + "epoch": 4.158703071672355, + "grad_norm": 0.2107232362027704, + "learning_rate": 6.773493321965888e-05, + "loss": 0.878, + "step": 19496 + }, + { + "epoch": 4.15891638225256, + "grad_norm": 0.2046227091346482, + "learning_rate": 6.772652412925773e-05, + "loss": 0.8774, + "step": 19497 + }, + { + "epoch": 4.159129692832765, + "grad_norm": 0.2238212140553338, + "learning_rate": 6.771811517773118e-05, + "loss": 0.8773, + "step": 19498 + }, + { + "epoch": 4.159343003412969, + "grad_norm": 0.16046714634754083, + "learning_rate": 6.770970636517437e-05, + "loss": 0.8912, + "step": 19499 + }, + { + "epoch": 4.159556313993174, + "grad_norm": 0.19160510989927065, + "learning_rate": 6.77012976916824e-05, + "loss": 0.8794, + "step": 19500 + }, + { + "epoch": 4.159769624573379, + "grad_norm": 0.15908145200519314, + "learning_rate": 6.769288915735046e-05, + "loss": 0.8932, + "step": 19501 + }, + { + "epoch": 4.159982935153583, + "grad_norm": 0.17427970138530988, + "learning_rate": 6.76844807622737e-05, + "loss": 0.8843, + "step": 19502 + }, + { + "epoch": 4.160196245733788, + "grad_norm": 0.18507882941198667, + "learning_rate": 6.767607250654724e-05, + "loss": 0.9013, + "step": 19503 + }, + { + "epoch": 4.160409556313994, + "grad_norm": 0.15999287782139393, + "learning_rate": 6.766766439026622e-05, + "loss": 0.9161, + "step": 19504 + }, + { + "epoch": 4.160622866894198, + "grad_norm": 0.17463591743592788, + "learning_rate": 6.76592564135258e-05, + "loss": 0.9107, + "step": 19505 + }, + { + "epoch": 4.160836177474403, + "grad_norm": 0.1867489279945869, + "learning_rate": 6.765084857642108e-05, + "loss": 0.8931, + "step": 19506 + }, + { + "epoch": 4.161049488054608, + "grad_norm": 0.15242947722816713, + "learning_rate": 6.764244087904723e-05, + "loss": 0.8848, + "step": 19507 + }, + { + "epoch": 4.161262798634812, + "grad_norm": 0.19462294563498486, + "learning_rate": 6.763403332149936e-05, + "loss": 0.8887, + "step": 19508 + }, + { + "epoch": 4.161476109215017, + "grad_norm": 0.1359972182572596, + "learning_rate": 6.762562590387267e-05, + "loss": 0.8728, + "step": 19509 + }, + { + "epoch": 4.161689419795222, + "grad_norm": 0.17517336929211916, + "learning_rate": 6.761721862626217e-05, + "loss": 0.8937, + "step": 19510 + }, + { + "epoch": 4.161902730375426, + "grad_norm": 0.15733416390906446, + "learning_rate": 6.760881148876307e-05, + "loss": 0.9183, + "step": 19511 + }, + { + "epoch": 4.162116040955631, + "grad_norm": 0.1807748015119384, + "learning_rate": 6.760040449147046e-05, + "loss": 0.9282, + "step": 19512 + }, + { + "epoch": 4.162329351535837, + "grad_norm": 0.1460097147734864, + "learning_rate": 6.759199763447949e-05, + "loss": 0.9185, + "step": 19513 + }, + { + "epoch": 4.162542662116041, + "grad_norm": 0.1809049526738848, + "learning_rate": 6.758359091788527e-05, + "loss": 0.9083, + "step": 19514 + }, + { + "epoch": 4.162755972696246, + "grad_norm": 0.16142827880987123, + "learning_rate": 6.757518434178293e-05, + "loss": 0.9161, + "step": 19515 + }, + { + "epoch": 4.162969283276451, + "grad_norm": 0.1836708904705673, + "learning_rate": 6.756677790626757e-05, + "loss": 0.9162, + "step": 19516 + }, + { + "epoch": 4.163182593856655, + "grad_norm": 0.1822348490780701, + "learning_rate": 6.755837161143437e-05, + "loss": 0.9137, + "step": 19517 + }, + { + "epoch": 4.16339590443686, + "grad_norm": 0.25979773961438973, + "learning_rate": 6.754996545737836e-05, + "loss": 0.91, + "step": 19518 + }, + { + "epoch": 4.163609215017065, + "grad_norm": 0.25836665659321334, + "learning_rate": 6.754155944419473e-05, + "loss": 0.892, + "step": 19519 + }, + { + "epoch": 4.163822525597269, + "grad_norm": 0.17187939147833306, + "learning_rate": 6.753315357197854e-05, + "loss": 0.9049, + "step": 19520 + }, + { + "epoch": 4.164035836177474, + "grad_norm": 0.22936978666709135, + "learning_rate": 6.752474784082499e-05, + "loss": 0.9128, + "step": 19521 + }, + { + "epoch": 4.164249146757679, + "grad_norm": 0.2661157595075417, + "learning_rate": 6.751634225082907e-05, + "loss": 0.9003, + "step": 19522 + }, + { + "epoch": 4.164462457337884, + "grad_norm": 0.252836521812368, + "learning_rate": 6.750793680208597e-05, + "loss": 0.8988, + "step": 19523 + }, + { + "epoch": 4.164675767918089, + "grad_norm": 0.22803560215784657, + "learning_rate": 6.749953149469076e-05, + "loss": 0.905, + "step": 19524 + }, + { + "epoch": 4.164889078498294, + "grad_norm": 0.17935496506481766, + "learning_rate": 6.749112632873856e-05, + "loss": 0.9122, + "step": 19525 + }, + { + "epoch": 4.165102389078498, + "grad_norm": 0.22688449780052977, + "learning_rate": 6.748272130432447e-05, + "loss": 0.9365, + "step": 19526 + }, + { + "epoch": 4.165315699658703, + "grad_norm": 0.2872897382266194, + "learning_rate": 6.747431642154361e-05, + "loss": 0.9329, + "step": 19527 + }, + { + "epoch": 4.165529010238908, + "grad_norm": 0.19793210751530027, + "learning_rate": 6.746591168049107e-05, + "loss": 0.8879, + "step": 19528 + }, + { + "epoch": 4.165742320819112, + "grad_norm": 0.17756863655320745, + "learning_rate": 6.745750708126195e-05, + "loss": 0.8966, + "step": 19529 + }, + { + "epoch": 4.165955631399317, + "grad_norm": 0.26293043050554293, + "learning_rate": 6.744910262395134e-05, + "loss": 0.9019, + "step": 19530 + }, + { + "epoch": 4.166168941979522, + "grad_norm": 0.22901549763444437, + "learning_rate": 6.744069830865437e-05, + "loss": 0.8648, + "step": 19531 + }, + { + "epoch": 4.166382252559727, + "grad_norm": 0.21370668256665268, + "learning_rate": 6.743229413546612e-05, + "loss": 0.9272, + "step": 19532 + }, + { + "epoch": 4.166595563139932, + "grad_norm": 0.1755959233971404, + "learning_rate": 6.742389010448165e-05, + "loss": 0.8941, + "step": 19533 + }, + { + "epoch": 4.166808873720137, + "grad_norm": 0.20405295498363518, + "learning_rate": 6.741548621579606e-05, + "loss": 0.8917, + "step": 19534 + }, + { + "epoch": 4.167022184300341, + "grad_norm": 0.20236801394163617, + "learning_rate": 6.740708246950448e-05, + "loss": 0.9061, + "step": 19535 + }, + { + "epoch": 4.167235494880546, + "grad_norm": 0.1719509626517103, + "learning_rate": 6.739867886570196e-05, + "loss": 0.9233, + "step": 19536 + }, + { + "epoch": 4.167448805460751, + "grad_norm": 0.18810803044411256, + "learning_rate": 6.739027540448359e-05, + "loss": 0.8939, + "step": 19537 + }, + { + "epoch": 4.1676621160409555, + "grad_norm": 0.18861577561771894, + "learning_rate": 6.738187208594447e-05, + "loss": 0.8981, + "step": 19538 + }, + { + "epoch": 4.16787542662116, + "grad_norm": 0.15119405646870285, + "learning_rate": 6.73734689101797e-05, + "loss": 0.8931, + "step": 19539 + }, + { + "epoch": 4.168088737201365, + "grad_norm": 0.20172670472570872, + "learning_rate": 6.736506587728432e-05, + "loss": 0.8789, + "step": 19540 + }, + { + "epoch": 4.1683020477815695, + "grad_norm": 0.15778843399417225, + "learning_rate": 6.735666298735345e-05, + "loss": 0.8685, + "step": 19541 + }, + { + "epoch": 4.168515358361775, + "grad_norm": 0.15267369226229857, + "learning_rate": 6.734826024048212e-05, + "loss": 0.9115, + "step": 19542 + }, + { + "epoch": 4.16872866894198, + "grad_norm": 0.19837696508012562, + "learning_rate": 6.73398576367655e-05, + "loss": 0.8826, + "step": 19543 + }, + { + "epoch": 4.1689419795221845, + "grad_norm": 0.14185004685151478, + "learning_rate": 6.733145517629856e-05, + "loss": 0.9194, + "step": 19544 + }, + { + "epoch": 4.169155290102389, + "grad_norm": 0.17235494256593722, + "learning_rate": 6.732305285917642e-05, + "loss": 0.9086, + "step": 19545 + }, + { + "epoch": 4.169368600682594, + "grad_norm": 0.21520204023283784, + "learning_rate": 6.731465068549415e-05, + "loss": 0.915, + "step": 19546 + }, + { + "epoch": 4.1695819112627985, + "grad_norm": 0.1717109808504144, + "learning_rate": 6.730624865534684e-05, + "loss": 0.8992, + "step": 19547 + }, + { + "epoch": 4.169795221843003, + "grad_norm": 0.18314005998439611, + "learning_rate": 6.72978467688295e-05, + "loss": 0.8883, + "step": 19548 + }, + { + "epoch": 4.170008532423208, + "grad_norm": 0.2020845567816302, + "learning_rate": 6.728944502603726e-05, + "loss": 0.9031, + "step": 19549 + }, + { + "epoch": 4.170221843003413, + "grad_norm": 0.1739979633821067, + "learning_rate": 6.728104342706516e-05, + "loss": 0.9161, + "step": 19550 + }, + { + "epoch": 4.170435153583618, + "grad_norm": 0.20736754601573548, + "learning_rate": 6.727264197200827e-05, + "loss": 0.9046, + "step": 19551 + }, + { + "epoch": 4.170648464163823, + "grad_norm": 0.21783601588724444, + "learning_rate": 6.726424066096166e-05, + "loss": 0.8927, + "step": 19552 + }, + { + "epoch": 4.1708617747440275, + "grad_norm": 0.2030440723156305, + "learning_rate": 6.725583949402037e-05, + "loss": 0.889, + "step": 19553 + }, + { + "epoch": 4.171075085324232, + "grad_norm": 0.1741338290249476, + "learning_rate": 6.72474384712795e-05, + "loss": 0.8676, + "step": 19554 + }, + { + "epoch": 4.171288395904437, + "grad_norm": 0.20963361374166745, + "learning_rate": 6.723903759283405e-05, + "loss": 0.9145, + "step": 19555 + }, + { + "epoch": 4.171501706484642, + "grad_norm": 0.20806902428618076, + "learning_rate": 6.72306368587791e-05, + "loss": 0.8956, + "step": 19556 + }, + { + "epoch": 4.171715017064846, + "grad_norm": 0.23678960217958953, + "learning_rate": 6.722223626920972e-05, + "loss": 0.9046, + "step": 19557 + }, + { + "epoch": 4.171928327645051, + "grad_norm": 0.15070637359256703, + "learning_rate": 6.721383582422093e-05, + "loss": 0.9041, + "step": 19558 + }, + { + "epoch": 4.172141638225256, + "grad_norm": 0.2458287531124291, + "learning_rate": 6.720543552390782e-05, + "loss": 0.9006, + "step": 19559 + }, + { + "epoch": 4.172354948805461, + "grad_norm": 0.20499381539599087, + "learning_rate": 6.719703536836542e-05, + "loss": 0.8685, + "step": 19560 + }, + { + "epoch": 4.172568259385666, + "grad_norm": 0.223272650207709, + "learning_rate": 6.718863535768879e-05, + "loss": 0.8719, + "step": 19561 + }, + { + "epoch": 4.172781569965871, + "grad_norm": 0.21006469035025788, + "learning_rate": 6.718023549197294e-05, + "loss": 0.914, + "step": 19562 + }, + { + "epoch": 4.172994880546075, + "grad_norm": 0.17034530912743728, + "learning_rate": 6.717183577131297e-05, + "loss": 0.9194, + "step": 19563 + }, + { + "epoch": 4.17320819112628, + "grad_norm": 0.19351247073526173, + "learning_rate": 6.716343619580386e-05, + "loss": 0.9048, + "step": 19564 + }, + { + "epoch": 4.173421501706485, + "grad_norm": 0.2096266390322716, + "learning_rate": 6.715503676554074e-05, + "loss": 0.9005, + "step": 19565 + }, + { + "epoch": 4.173634812286689, + "grad_norm": 0.1583437742510222, + "learning_rate": 6.714663748061857e-05, + "loss": 0.8999, + "step": 19566 + }, + { + "epoch": 4.173848122866894, + "grad_norm": 0.20892655789303619, + "learning_rate": 6.713823834113241e-05, + "loss": 0.9169, + "step": 19567 + }, + { + "epoch": 4.174061433447099, + "grad_norm": 0.2547129824706913, + "learning_rate": 6.712983934717729e-05, + "loss": 0.9043, + "step": 19568 + }, + { + "epoch": 4.174274744027303, + "grad_norm": 0.22495816832345417, + "learning_rate": 6.712144049884826e-05, + "loss": 0.9004, + "step": 19569 + }, + { + "epoch": 4.174488054607509, + "grad_norm": 0.2009907718838033, + "learning_rate": 6.711304179624034e-05, + "loss": 0.9073, + "step": 19570 + }, + { + "epoch": 4.174701365187714, + "grad_norm": 0.1508280058653189, + "learning_rate": 6.710464323944858e-05, + "loss": 0.8865, + "step": 19571 + }, + { + "epoch": 4.174914675767918, + "grad_norm": 0.21027592680694054, + "learning_rate": 6.709624482856798e-05, + "loss": 0.8681, + "step": 19572 + }, + { + "epoch": 4.175127986348123, + "grad_norm": 0.22656687121956226, + "learning_rate": 6.708784656369361e-05, + "loss": 0.8972, + "step": 19573 + }, + { + "epoch": 4.175341296928328, + "grad_norm": 0.25457182006285667, + "learning_rate": 6.707944844492047e-05, + "loss": 0.9168, + "step": 19574 + }, + { + "epoch": 4.175554607508532, + "grad_norm": 0.22942060561010466, + "learning_rate": 6.707105047234357e-05, + "loss": 0.8846, + "step": 19575 + }, + { + "epoch": 4.175767918088737, + "grad_norm": 0.22168417922457007, + "learning_rate": 6.706265264605795e-05, + "loss": 0.9098, + "step": 19576 + }, + { + "epoch": 4.175981228668942, + "grad_norm": 0.26930888689311655, + "learning_rate": 6.705425496615869e-05, + "loss": 0.9, + "step": 19577 + }, + { + "epoch": 4.176194539249146, + "grad_norm": 0.31973182409446876, + "learning_rate": 6.70458574327407e-05, + "loss": 0.9125, + "step": 19578 + }, + { + "epoch": 4.176407849829351, + "grad_norm": 0.33896875073767435, + "learning_rate": 6.703746004589907e-05, + "loss": 0.92, + "step": 19579 + }, + { + "epoch": 4.176621160409557, + "grad_norm": 0.16970991664679275, + "learning_rate": 6.702906280572878e-05, + "loss": 0.8738, + "step": 19580 + }, + { + "epoch": 4.176834470989761, + "grad_norm": 0.29834218961322895, + "learning_rate": 6.702066571232488e-05, + "loss": 0.9216, + "step": 19581 + }, + { + "epoch": 4.177047781569966, + "grad_norm": 0.35631599260821367, + "learning_rate": 6.701226876578235e-05, + "loss": 0.9115, + "step": 19582 + }, + { + "epoch": 4.177261092150171, + "grad_norm": 0.24743373542445393, + "learning_rate": 6.700387196619622e-05, + "loss": 0.9047, + "step": 19583 + }, + { + "epoch": 4.177474402730375, + "grad_norm": 0.2752791874148006, + "learning_rate": 6.69954753136615e-05, + "loss": 0.8986, + "step": 19584 + }, + { + "epoch": 4.17768771331058, + "grad_norm": 0.24893841216653798, + "learning_rate": 6.69870788082732e-05, + "loss": 0.928, + "step": 19585 + }, + { + "epoch": 4.177901023890785, + "grad_norm": 0.33491980194691934, + "learning_rate": 6.697868245012631e-05, + "loss": 0.9058, + "step": 19586 + }, + { + "epoch": 4.1781143344709895, + "grad_norm": 0.3754746004494082, + "learning_rate": 6.697028623931586e-05, + "loss": 0.9041, + "step": 19587 + }, + { + "epoch": 4.178327645051194, + "grad_norm": 0.19032772807277315, + "learning_rate": 6.696189017593687e-05, + "loss": 0.9155, + "step": 19588 + }, + { + "epoch": 4.1785409556314, + "grad_norm": 0.2705589342989055, + "learning_rate": 6.695349426008426e-05, + "loss": 0.9108, + "step": 19589 + }, + { + "epoch": 4.178754266211604, + "grad_norm": 0.33142826902345063, + "learning_rate": 6.69450984918531e-05, + "loss": 0.9162, + "step": 19590 + }, + { + "epoch": 4.178967576791809, + "grad_norm": 0.21616930328974335, + "learning_rate": 6.693670287133839e-05, + "loss": 0.9138, + "step": 19591 + }, + { + "epoch": 4.179180887372014, + "grad_norm": 0.2624414462634933, + "learning_rate": 6.692830739863506e-05, + "loss": 0.9232, + "step": 19592 + }, + { + "epoch": 4.1793941979522184, + "grad_norm": 0.20559230357671812, + "learning_rate": 6.69199120738382e-05, + "loss": 0.9119, + "step": 19593 + }, + { + "epoch": 4.179607508532423, + "grad_norm": 0.20975889583442595, + "learning_rate": 6.691151689704273e-05, + "loss": 0.8965, + "step": 19594 + }, + { + "epoch": 4.179820819112628, + "grad_norm": 0.2478049315339897, + "learning_rate": 6.690312186834367e-05, + "loss": 0.9048, + "step": 19595 + }, + { + "epoch": 4.1800341296928325, + "grad_norm": 0.17387250230561074, + "learning_rate": 6.689472698783599e-05, + "loss": 0.9028, + "step": 19596 + }, + { + "epoch": 4.180247440273037, + "grad_norm": 0.2422538576943508, + "learning_rate": 6.688633225561473e-05, + "loss": 0.9131, + "step": 19597 + }, + { + "epoch": 4.180460750853243, + "grad_norm": 0.2966537555915311, + "learning_rate": 6.687793767177481e-05, + "loss": 0.8845, + "step": 19598 + }, + { + "epoch": 4.180674061433447, + "grad_norm": 0.23248778575078768, + "learning_rate": 6.686954323641128e-05, + "loss": 0.9029, + "step": 19599 + }, + { + "epoch": 4.180887372013652, + "grad_norm": 0.26723163872838096, + "learning_rate": 6.686114894961907e-05, + "loss": 0.9011, + "step": 19600 + }, + { + "epoch": 4.181100682593857, + "grad_norm": 0.26959711159678096, + "learning_rate": 6.685275481149318e-05, + "loss": 0.8801, + "step": 19601 + }, + { + "epoch": 4.1813139931740615, + "grad_norm": 0.24729264226753644, + "learning_rate": 6.684436082212858e-05, + "loss": 0.911, + "step": 19602 + }, + { + "epoch": 4.181527303754266, + "grad_norm": 0.23287534491105377, + "learning_rate": 6.683596698162028e-05, + "loss": 0.9329, + "step": 19603 + }, + { + "epoch": 4.181740614334471, + "grad_norm": 0.19675761215514784, + "learning_rate": 6.682757329006321e-05, + "loss": 0.8679, + "step": 19604 + }, + { + "epoch": 4.1819539249146755, + "grad_norm": 0.31124723830748796, + "learning_rate": 6.68191797475524e-05, + "loss": 0.9169, + "step": 19605 + }, + { + "epoch": 4.18216723549488, + "grad_norm": 0.31081837249995564, + "learning_rate": 6.681078635418276e-05, + "loss": 0.9032, + "step": 19606 + }, + { + "epoch": 4.182380546075085, + "grad_norm": 0.18911089709715947, + "learning_rate": 6.680239311004932e-05, + "loss": 0.882, + "step": 19607 + }, + { + "epoch": 4.1825938566552905, + "grad_norm": 0.24713497998661427, + "learning_rate": 6.679400001524698e-05, + "loss": 0.9125, + "step": 19608 + }, + { + "epoch": 4.182807167235495, + "grad_norm": 0.33351935587431425, + "learning_rate": 6.67856070698708e-05, + "loss": 0.9125, + "step": 19609 + }, + { + "epoch": 4.1830204778157, + "grad_norm": 0.23643178227518985, + "learning_rate": 6.67772142740157e-05, + "loss": 0.8712, + "step": 19610 + }, + { + "epoch": 4.1832337883959045, + "grad_norm": 0.20474787455399926, + "learning_rate": 6.676882162777663e-05, + "loss": 0.8978, + "step": 19611 + }, + { + "epoch": 4.183447098976109, + "grad_norm": 0.2178403432852358, + "learning_rate": 6.676042913124857e-05, + "loss": 0.9165, + "step": 19612 + }, + { + "epoch": 4.183660409556314, + "grad_norm": 0.32679555746802075, + "learning_rate": 6.675203678452646e-05, + "loss": 0.9094, + "step": 19613 + }, + { + "epoch": 4.183873720136519, + "grad_norm": 0.3220438189754824, + "learning_rate": 6.674364458770528e-05, + "loss": 0.8781, + "step": 19614 + }, + { + "epoch": 4.184087030716723, + "grad_norm": 0.2034049182050231, + "learning_rate": 6.673525254088e-05, + "loss": 0.9261, + "step": 19615 + }, + { + "epoch": 4.184300341296928, + "grad_norm": 0.26138776046360296, + "learning_rate": 6.672686064414553e-05, + "loss": 0.8712, + "step": 19616 + }, + { + "epoch": 4.1845136518771335, + "grad_norm": 0.3518948499060934, + "learning_rate": 6.671846889759689e-05, + "loss": 0.8879, + "step": 19617 + }, + { + "epoch": 4.184726962457338, + "grad_norm": 0.27154804879802147, + "learning_rate": 6.671007730132898e-05, + "loss": 0.9358, + "step": 19618 + }, + { + "epoch": 4.184940273037543, + "grad_norm": 0.20130831566186552, + "learning_rate": 6.670168585543677e-05, + "loss": 0.8987, + "step": 19619 + }, + { + "epoch": 4.185153583617748, + "grad_norm": 0.22533820699040055, + "learning_rate": 6.669329456001521e-05, + "loss": 0.9119, + "step": 19620 + }, + { + "epoch": 4.185366894197952, + "grad_norm": 0.22910931499066403, + "learning_rate": 6.668490341515926e-05, + "loss": 0.8816, + "step": 19621 + }, + { + "epoch": 4.185580204778157, + "grad_norm": 0.20088527293784877, + "learning_rate": 6.667651242096385e-05, + "loss": 0.9048, + "step": 19622 + }, + { + "epoch": 4.185793515358362, + "grad_norm": 0.16730689900119658, + "learning_rate": 6.66681215775239e-05, + "loss": 0.8956, + "step": 19623 + }, + { + "epoch": 4.186006825938566, + "grad_norm": 0.2282349056217834, + "learning_rate": 6.66597308849344e-05, + "loss": 0.9446, + "step": 19624 + }, + { + "epoch": 4.186220136518771, + "grad_norm": 0.29835619238725425, + "learning_rate": 6.665134034329026e-05, + "loss": 0.8739, + "step": 19625 + }, + { + "epoch": 4.186433447098976, + "grad_norm": 0.21271874606748542, + "learning_rate": 6.664294995268644e-05, + "loss": 0.9159, + "step": 19626 + }, + { + "epoch": 4.186646757679181, + "grad_norm": 0.17322448149709854, + "learning_rate": 6.663455971321786e-05, + "loss": 0.898, + "step": 19627 + }, + { + "epoch": 4.186860068259386, + "grad_norm": 0.28738300918709553, + "learning_rate": 6.662616962497945e-05, + "loss": 0.9124, + "step": 19628 + }, + { + "epoch": 4.187073378839591, + "grad_norm": 0.22757868093054237, + "learning_rate": 6.661777968806616e-05, + "loss": 0.8952, + "step": 19629 + }, + { + "epoch": 4.187286689419795, + "grad_norm": 0.17442294397095479, + "learning_rate": 6.660938990257291e-05, + "loss": 0.9007, + "step": 19630 + }, + { + "epoch": 4.1875, + "grad_norm": 0.20734199389887462, + "learning_rate": 6.660100026859466e-05, + "loss": 0.8861, + "step": 19631 + }, + { + "epoch": 4.187713310580205, + "grad_norm": 0.25028669722005575, + "learning_rate": 6.659261078622634e-05, + "loss": 0.8965, + "step": 19632 + }, + { + "epoch": 4.187926621160409, + "grad_norm": 0.18423383856200323, + "learning_rate": 6.658422145556282e-05, + "loss": 0.8973, + "step": 19633 + }, + { + "epoch": 4.188139931740614, + "grad_norm": 0.17245172761090288, + "learning_rate": 6.657583227669906e-05, + "loss": 0.903, + "step": 19634 + }, + { + "epoch": 4.188353242320819, + "grad_norm": 0.22095942409045585, + "learning_rate": 6.656744324972999e-05, + "loss": 0.9136, + "step": 19635 + }, + { + "epoch": 4.188566552901024, + "grad_norm": 0.2575522380257501, + "learning_rate": 6.655905437475051e-05, + "loss": 0.9093, + "step": 19636 + }, + { + "epoch": 4.188779863481229, + "grad_norm": 0.21583441058217237, + "learning_rate": 6.655066565185557e-05, + "loss": 0.904, + "step": 19637 + }, + { + "epoch": 4.188993174061434, + "grad_norm": 0.18019767754821345, + "learning_rate": 6.654227708114007e-05, + "loss": 0.8762, + "step": 19638 + }, + { + "epoch": 4.189206484641638, + "grad_norm": 0.20041545400448277, + "learning_rate": 6.653388866269894e-05, + "loss": 0.9011, + "step": 19639 + }, + { + "epoch": 4.189419795221843, + "grad_norm": 0.1778444216163986, + "learning_rate": 6.652550039662707e-05, + "loss": 0.9153, + "step": 19640 + }, + { + "epoch": 4.189633105802048, + "grad_norm": 0.18427647968704614, + "learning_rate": 6.65171122830194e-05, + "loss": 0.8891, + "step": 19641 + }, + { + "epoch": 4.189846416382252, + "grad_norm": 0.16914752288370322, + "learning_rate": 6.650872432197082e-05, + "loss": 0.8981, + "step": 19642 + }, + { + "epoch": 4.190059726962457, + "grad_norm": 0.1865173386236494, + "learning_rate": 6.650033651357627e-05, + "loss": 0.8991, + "step": 19643 + }, + { + "epoch": 4.190273037542662, + "grad_norm": 0.20130013592512735, + "learning_rate": 6.649194885793066e-05, + "loss": 0.8995, + "step": 19644 + }, + { + "epoch": 4.1904863481228665, + "grad_norm": 0.20037954053865667, + "learning_rate": 6.648356135512882e-05, + "loss": 0.9246, + "step": 19645 + }, + { + "epoch": 4.190699658703072, + "grad_norm": 0.15544464758649468, + "learning_rate": 6.647517400526575e-05, + "loss": 0.8789, + "step": 19646 + }, + { + "epoch": 4.190912969283277, + "grad_norm": 0.2153847211839712, + "learning_rate": 6.64667868084363e-05, + "loss": 0.8891, + "step": 19647 + }, + { + "epoch": 4.191126279863481, + "grad_norm": 0.19131858762327433, + "learning_rate": 6.64583997647354e-05, + "loss": 0.8997, + "step": 19648 + }, + { + "epoch": 4.191339590443686, + "grad_norm": 0.17004013862398362, + "learning_rate": 6.64500128742579e-05, + "loss": 0.9012, + "step": 19649 + }, + { + "epoch": 4.191552901023891, + "grad_norm": 0.15862231930222664, + "learning_rate": 6.644162613709876e-05, + "loss": 0.8902, + "step": 19650 + }, + { + "epoch": 4.1917662116040955, + "grad_norm": 0.18616575100363744, + "learning_rate": 6.643323955335285e-05, + "loss": 0.9, + "step": 19651 + }, + { + "epoch": 4.1919795221843, + "grad_norm": 0.20282947330737394, + "learning_rate": 6.642485312311504e-05, + "loss": 0.9151, + "step": 19652 + }, + { + "epoch": 4.192192832764505, + "grad_norm": 0.24769133379205585, + "learning_rate": 6.641646684648027e-05, + "loss": 0.8988, + "step": 19653 + }, + { + "epoch": 4.1924061433447095, + "grad_norm": 0.20667855486061962, + "learning_rate": 6.640808072354337e-05, + "loss": 0.8952, + "step": 19654 + }, + { + "epoch": 4.192619453924915, + "grad_norm": 0.2135906359159788, + "learning_rate": 6.639969475439933e-05, + "loss": 0.928, + "step": 19655 + }, + { + "epoch": 4.19283276450512, + "grad_norm": 0.18915559442601848, + "learning_rate": 6.639130893914293e-05, + "loss": 0.8977, + "step": 19656 + }, + { + "epoch": 4.1930460750853245, + "grad_norm": 0.21981966141309478, + "learning_rate": 6.638292327786909e-05, + "loss": 0.8798, + "step": 19657 + }, + { + "epoch": 4.193259385665529, + "grad_norm": 0.16391892823795712, + "learning_rate": 6.637453777067272e-05, + "loss": 0.8843, + "step": 19658 + }, + { + "epoch": 4.193472696245734, + "grad_norm": 0.19971216234565928, + "learning_rate": 6.636615241764867e-05, + "loss": 0.9154, + "step": 19659 + }, + { + "epoch": 4.1936860068259385, + "grad_norm": 0.2127500580204205, + "learning_rate": 6.635776721889185e-05, + "loss": 0.931, + "step": 19660 + }, + { + "epoch": 4.193899317406143, + "grad_norm": 0.1931302641051414, + "learning_rate": 6.63493821744971e-05, + "loss": 0.9019, + "step": 19661 + }, + { + "epoch": 4.194112627986348, + "grad_norm": 0.24619866625757142, + "learning_rate": 6.634099728455936e-05, + "loss": 0.9131, + "step": 19662 + }, + { + "epoch": 4.194325938566553, + "grad_norm": 0.1815108242900881, + "learning_rate": 6.633261254917344e-05, + "loss": 0.8945, + "step": 19663 + }, + { + "epoch": 4.194539249146757, + "grad_norm": 0.27432355768055144, + "learning_rate": 6.632422796843422e-05, + "loss": 0.9267, + "step": 19664 + }, + { + "epoch": 4.194752559726963, + "grad_norm": 0.23562621914345552, + "learning_rate": 6.631584354243663e-05, + "loss": 0.9389, + "step": 19665 + }, + { + "epoch": 4.1949658703071675, + "grad_norm": 0.20729358630966385, + "learning_rate": 6.63074592712755e-05, + "loss": 0.9111, + "step": 19666 + }, + { + "epoch": 4.195179180887372, + "grad_norm": 0.22026831908569752, + "learning_rate": 6.629907515504566e-05, + "loss": 0.8984, + "step": 19667 + }, + { + "epoch": 4.195392491467577, + "grad_norm": 0.19181698432523475, + "learning_rate": 6.629069119384205e-05, + "loss": 0.8891, + "step": 19668 + }, + { + "epoch": 4.1956058020477816, + "grad_norm": 0.2455165329361939, + "learning_rate": 6.628230738775947e-05, + "loss": 0.9046, + "step": 19669 + }, + { + "epoch": 4.195819112627986, + "grad_norm": 0.19817884905694233, + "learning_rate": 6.627392373689284e-05, + "loss": 0.9115, + "step": 19670 + }, + { + "epoch": 4.196032423208191, + "grad_norm": 0.18965106211791885, + "learning_rate": 6.626554024133697e-05, + "loss": 0.8923, + "step": 19671 + }, + { + "epoch": 4.196245733788396, + "grad_norm": 0.227343151075273, + "learning_rate": 6.625715690118676e-05, + "loss": 0.8887, + "step": 19672 + }, + { + "epoch": 4.1964590443686, + "grad_norm": 0.23541713853556068, + "learning_rate": 6.624877371653704e-05, + "loss": 0.9286, + "step": 19673 + }, + { + "epoch": 4.196672354948806, + "grad_norm": 0.21954774406463923, + "learning_rate": 6.624039068748268e-05, + "loss": 0.8721, + "step": 19674 + }, + { + "epoch": 4.1968856655290105, + "grad_norm": 0.18911561260908086, + "learning_rate": 6.623200781411853e-05, + "loss": 0.8992, + "step": 19675 + }, + { + "epoch": 4.197098976109215, + "grad_norm": 0.28002447109885437, + "learning_rate": 6.622362509653944e-05, + "loss": 0.911, + "step": 19676 + }, + { + "epoch": 4.19731228668942, + "grad_norm": 0.2589052354912959, + "learning_rate": 6.62152425348403e-05, + "loss": 0.9247, + "step": 19677 + }, + { + "epoch": 4.197525597269625, + "grad_norm": 0.19915294457290592, + "learning_rate": 6.62068601291159e-05, + "loss": 0.9219, + "step": 19678 + }, + { + "epoch": 4.197738907849829, + "grad_norm": 0.33903909906366, + "learning_rate": 6.619847787946109e-05, + "loss": 0.9301, + "step": 19679 + }, + { + "epoch": 4.197952218430034, + "grad_norm": 0.2301500262007325, + "learning_rate": 6.619009578597075e-05, + "loss": 0.904, + "step": 19680 + }, + { + "epoch": 4.198165529010239, + "grad_norm": 0.2993806292149219, + "learning_rate": 6.61817138487397e-05, + "loss": 0.8869, + "step": 19681 + }, + { + "epoch": 4.198378839590443, + "grad_norm": 0.36701142197920783, + "learning_rate": 6.61733320678628e-05, + "loss": 0.9152, + "step": 19682 + }, + { + "epoch": 4.198592150170649, + "grad_norm": 0.2175805438726208, + "learning_rate": 6.616495044343486e-05, + "loss": 0.8953, + "step": 19683 + }, + { + "epoch": 4.198805460750854, + "grad_norm": 0.3359421428203866, + "learning_rate": 6.615656897555076e-05, + "loss": 0.9331, + "step": 19684 + }, + { + "epoch": 4.199018771331058, + "grad_norm": 0.2724489849943168, + "learning_rate": 6.614818766430532e-05, + "loss": 0.9088, + "step": 19685 + }, + { + "epoch": 4.199232081911263, + "grad_norm": 0.19116924538778227, + "learning_rate": 6.613980650979334e-05, + "loss": 0.9028, + "step": 19686 + }, + { + "epoch": 4.199445392491468, + "grad_norm": 0.27519857402610426, + "learning_rate": 6.613142551210973e-05, + "loss": 0.9236, + "step": 19687 + }, + { + "epoch": 4.199658703071672, + "grad_norm": 0.23216048591914906, + "learning_rate": 6.612304467134926e-05, + "loss": 0.9049, + "step": 19688 + }, + { + "epoch": 4.199872013651877, + "grad_norm": 0.2302728253230909, + "learning_rate": 6.611466398760674e-05, + "loss": 0.9089, + "step": 19689 + }, + { + "epoch": 4.200085324232082, + "grad_norm": 0.2542455184407163, + "learning_rate": 6.610628346097706e-05, + "loss": 0.8987, + "step": 19690 + }, + { + "epoch": 4.200298634812286, + "grad_norm": 0.15981531311404898, + "learning_rate": 6.609790309155499e-05, + "loss": 0.8845, + "step": 19691 + }, + { + "epoch": 4.200511945392491, + "grad_norm": 0.2891309385240794, + "learning_rate": 6.60895228794354e-05, + "loss": 0.9244, + "step": 19692 + }, + { + "epoch": 4.200725255972697, + "grad_norm": 0.23698087548518143, + "learning_rate": 6.608114282471308e-05, + "loss": 0.9216, + "step": 19693 + }, + { + "epoch": 4.200938566552901, + "grad_norm": 0.20576129086528855, + "learning_rate": 6.607276292748288e-05, + "loss": 0.9232, + "step": 19694 + }, + { + "epoch": 4.201151877133106, + "grad_norm": 0.20555417898788053, + "learning_rate": 6.606438318783958e-05, + "loss": 0.9099, + "step": 19695 + }, + { + "epoch": 4.201365187713311, + "grad_norm": 0.16397020423919145, + "learning_rate": 6.605600360587804e-05, + "loss": 0.884, + "step": 19696 + }, + { + "epoch": 4.201578498293515, + "grad_norm": 0.21077991660562698, + "learning_rate": 6.604762418169305e-05, + "loss": 0.9246, + "step": 19697 + }, + { + "epoch": 4.20179180887372, + "grad_norm": 0.19074095028717641, + "learning_rate": 6.603924491537941e-05, + "loss": 0.8766, + "step": 19698 + }, + { + "epoch": 4.202005119453925, + "grad_norm": 0.17532285294529504, + "learning_rate": 6.603086580703196e-05, + "loss": 0.9218, + "step": 19699 + }, + { + "epoch": 4.202218430034129, + "grad_norm": 0.2187644958640961, + "learning_rate": 6.602248685674554e-05, + "loss": 0.8812, + "step": 19700 + }, + { + "epoch": 4.202431740614334, + "grad_norm": 0.21062210116462918, + "learning_rate": 6.601410806461486e-05, + "loss": 0.9139, + "step": 19701 + }, + { + "epoch": 4.202645051194539, + "grad_norm": 0.26765367490369024, + "learning_rate": 6.60057294307348e-05, + "loss": 0.8819, + "step": 19702 + }, + { + "epoch": 4.202858361774744, + "grad_norm": 0.23645932451360957, + "learning_rate": 6.599735095520013e-05, + "loss": 0.9046, + "step": 19703 + }, + { + "epoch": 4.203071672354949, + "grad_norm": 0.25959465175197327, + "learning_rate": 6.59889726381057e-05, + "loss": 0.9157, + "step": 19704 + }, + { + "epoch": 4.203284982935154, + "grad_norm": 0.2555086168190437, + "learning_rate": 6.598059447954624e-05, + "loss": 0.8912, + "step": 19705 + }, + { + "epoch": 4.203498293515358, + "grad_norm": 0.27301156858948944, + "learning_rate": 6.597221647961663e-05, + "loss": 0.9062, + "step": 19706 + }, + { + "epoch": 4.203711604095563, + "grad_norm": 0.21459710744783503, + "learning_rate": 6.59638386384116e-05, + "loss": 0.9071, + "step": 19707 + }, + { + "epoch": 4.203924914675768, + "grad_norm": 0.28877259864893795, + "learning_rate": 6.5955460956026e-05, + "loss": 0.8912, + "step": 19708 + }, + { + "epoch": 4.2041382252559725, + "grad_norm": 0.19452599795844805, + "learning_rate": 6.594708343255458e-05, + "loss": 0.9136, + "step": 19709 + }, + { + "epoch": 4.204351535836177, + "grad_norm": 0.21822519319889389, + "learning_rate": 6.593870606809213e-05, + "loss": 0.882, + "step": 19710 + }, + { + "epoch": 4.204564846416382, + "grad_norm": 0.24643940417183613, + "learning_rate": 6.59303288627335e-05, + "loss": 0.9174, + "step": 19711 + }, + { + "epoch": 4.204778156996587, + "grad_norm": 0.1569134454093207, + "learning_rate": 6.59219518165734e-05, + "loss": 0.9138, + "step": 19712 + }, + { + "epoch": 4.204991467576792, + "grad_norm": 0.20277195097112394, + "learning_rate": 6.591357492970666e-05, + "loss": 0.9085, + "step": 19713 + }, + { + "epoch": 4.205204778156997, + "grad_norm": 0.19634489738320832, + "learning_rate": 6.590519820222805e-05, + "loss": 0.8581, + "step": 19714 + }, + { + "epoch": 4.2054180887372015, + "grad_norm": 0.18193715622680404, + "learning_rate": 6.589682163423237e-05, + "loss": 0.8978, + "step": 19715 + }, + { + "epoch": 4.205631399317406, + "grad_norm": 0.19030068365811634, + "learning_rate": 6.588844522581438e-05, + "loss": 0.8801, + "step": 19716 + }, + { + "epoch": 4.205844709897611, + "grad_norm": 0.19005020196702843, + "learning_rate": 6.588006897706887e-05, + "loss": 0.9224, + "step": 19717 + }, + { + "epoch": 4.2060580204778155, + "grad_norm": 0.20940662188868892, + "learning_rate": 6.587169288809063e-05, + "loss": 0.9065, + "step": 19718 + }, + { + "epoch": 4.20627133105802, + "grad_norm": 0.23386552442528988, + "learning_rate": 6.58633169589744e-05, + "loss": 0.9041, + "step": 19719 + }, + { + "epoch": 4.206484641638225, + "grad_norm": 0.20622554580391558, + "learning_rate": 6.5854941189815e-05, + "loss": 0.9063, + "step": 19720 + }, + { + "epoch": 4.2066979522184305, + "grad_norm": 0.20477181889105064, + "learning_rate": 6.584656558070718e-05, + "loss": 0.9191, + "step": 19721 + }, + { + "epoch": 4.206911262798635, + "grad_norm": 0.17997459586509962, + "learning_rate": 6.583819013174572e-05, + "loss": 0.9096, + "step": 19722 + }, + { + "epoch": 4.20712457337884, + "grad_norm": 0.19625182317389428, + "learning_rate": 6.582981484302535e-05, + "loss": 0.904, + "step": 19723 + }, + { + "epoch": 4.2073378839590445, + "grad_norm": 0.22384610608196265, + "learning_rate": 6.582143971464087e-05, + "loss": 0.9307, + "step": 19724 + }, + { + "epoch": 4.207551194539249, + "grad_norm": 0.16471720734924966, + "learning_rate": 6.581306474668703e-05, + "loss": 0.9158, + "step": 19725 + }, + { + "epoch": 4.207764505119454, + "grad_norm": 0.2082255741954891, + "learning_rate": 6.58046899392586e-05, + "loss": 0.92, + "step": 19726 + }, + { + "epoch": 4.207977815699659, + "grad_norm": 0.1785269679578008, + "learning_rate": 6.579631529245036e-05, + "loss": 0.893, + "step": 19727 + }, + { + "epoch": 4.208191126279863, + "grad_norm": 0.217022336750887, + "learning_rate": 6.578794080635704e-05, + "loss": 0.9224, + "step": 19728 + }, + { + "epoch": 4.208404436860068, + "grad_norm": 0.20346130977028049, + "learning_rate": 6.57795664810734e-05, + "loss": 0.879, + "step": 19729 + }, + { + "epoch": 4.208617747440273, + "grad_norm": 0.17865044893916993, + "learning_rate": 6.577119231669422e-05, + "loss": 0.8989, + "step": 19730 + }, + { + "epoch": 4.208831058020478, + "grad_norm": 0.17177779874010682, + "learning_rate": 6.576281831331422e-05, + "loss": 0.9104, + "step": 19731 + }, + { + "epoch": 4.209044368600683, + "grad_norm": 0.1782701792443659, + "learning_rate": 6.57544444710282e-05, + "loss": 0.878, + "step": 19732 + }, + { + "epoch": 4.209257679180888, + "grad_norm": 0.19339583439432656, + "learning_rate": 6.574607078993089e-05, + "loss": 0.889, + "step": 19733 + }, + { + "epoch": 4.209470989761092, + "grad_norm": 0.14930731347802878, + "learning_rate": 6.5737697270117e-05, + "loss": 0.9097, + "step": 19734 + }, + { + "epoch": 4.209684300341297, + "grad_norm": 0.18968754916864014, + "learning_rate": 6.57293239116813e-05, + "loss": 0.9204, + "step": 19735 + }, + { + "epoch": 4.209897610921502, + "grad_norm": 0.17372170022383163, + "learning_rate": 6.572095071471855e-05, + "loss": 0.9137, + "step": 19736 + }, + { + "epoch": 4.210110921501706, + "grad_norm": 0.2154842140056884, + "learning_rate": 6.571257767932347e-05, + "loss": 0.8862, + "step": 19737 + }, + { + "epoch": 4.210324232081911, + "grad_norm": 0.2423750050163339, + "learning_rate": 6.570420480559083e-05, + "loss": 0.92, + "step": 19738 + }, + { + "epoch": 4.210537542662116, + "grad_norm": 0.20820434604577148, + "learning_rate": 6.569583209361535e-05, + "loss": 0.9044, + "step": 19739 + }, + { + "epoch": 4.210750853242321, + "grad_norm": 0.2415680182687197, + "learning_rate": 6.568745954349176e-05, + "loss": 0.9171, + "step": 19740 + }, + { + "epoch": 4.210964163822526, + "grad_norm": 0.2396900958190166, + "learning_rate": 6.56790871553148e-05, + "loss": 0.9061, + "step": 19741 + }, + { + "epoch": 4.211177474402731, + "grad_norm": 0.19283252297228606, + "learning_rate": 6.567071492917924e-05, + "loss": 0.8888, + "step": 19742 + }, + { + "epoch": 4.211390784982935, + "grad_norm": 0.19896193197181158, + "learning_rate": 6.566234286517975e-05, + "loss": 0.896, + "step": 19743 + }, + { + "epoch": 4.21160409556314, + "grad_norm": 0.20077415067374998, + "learning_rate": 6.565397096341116e-05, + "loss": 0.8823, + "step": 19744 + }, + { + "epoch": 4.211817406143345, + "grad_norm": 0.22686771230893954, + "learning_rate": 6.564559922396805e-05, + "loss": 0.9135, + "step": 19745 + }, + { + "epoch": 4.212030716723549, + "grad_norm": 0.20719094936928617, + "learning_rate": 6.563722764694528e-05, + "loss": 0.8769, + "step": 19746 + }, + { + "epoch": 4.212244027303754, + "grad_norm": 0.18987054279422366, + "learning_rate": 6.562885623243748e-05, + "loss": 0.8898, + "step": 19747 + }, + { + "epoch": 4.212457337883959, + "grad_norm": 0.1661018045794649, + "learning_rate": 6.562048498053942e-05, + "loss": 0.8962, + "step": 19748 + }, + { + "epoch": 4.212670648464163, + "grad_norm": 0.17924177510121128, + "learning_rate": 6.561211389134581e-05, + "loss": 0.906, + "step": 19749 + }, + { + "epoch": 4.212883959044369, + "grad_norm": 0.19308089458278646, + "learning_rate": 6.560374296495139e-05, + "loss": 0.9343, + "step": 19750 + }, + { + "epoch": 4.213097269624574, + "grad_norm": 0.18939846442258246, + "learning_rate": 6.559537220145085e-05, + "loss": 0.9162, + "step": 19751 + }, + { + "epoch": 4.213310580204778, + "grad_norm": 0.22756905243743092, + "learning_rate": 6.558700160093894e-05, + "loss": 0.8915, + "step": 19752 + }, + { + "epoch": 4.213523890784983, + "grad_norm": 0.20441964271472718, + "learning_rate": 6.55786311635103e-05, + "loss": 0.8922, + "step": 19753 + }, + { + "epoch": 4.213737201365188, + "grad_norm": 0.24545955474620954, + "learning_rate": 6.557026088925974e-05, + "loss": 0.9226, + "step": 19754 + }, + { + "epoch": 4.213950511945392, + "grad_norm": 0.1707165682935097, + "learning_rate": 6.556189077828188e-05, + "loss": 0.9191, + "step": 19755 + }, + { + "epoch": 4.214163822525597, + "grad_norm": 0.20416286547801135, + "learning_rate": 6.555352083067155e-05, + "loss": 0.8505, + "step": 19756 + }, + { + "epoch": 4.214377133105802, + "grad_norm": 0.17072988948680176, + "learning_rate": 6.554515104652331e-05, + "loss": 0.89, + "step": 19757 + }, + { + "epoch": 4.214590443686006, + "grad_norm": 0.1994928639792735, + "learning_rate": 6.553678142593193e-05, + "loss": 0.8676, + "step": 19758 + }, + { + "epoch": 4.214803754266212, + "grad_norm": 0.19613331475049514, + "learning_rate": 6.55284119689921e-05, + "loss": 0.9116, + "step": 19759 + }, + { + "epoch": 4.215017064846417, + "grad_norm": 0.18456924840759742, + "learning_rate": 6.552004267579856e-05, + "loss": 0.9124, + "step": 19760 + }, + { + "epoch": 4.215230375426621, + "grad_norm": 0.24666601610571448, + "learning_rate": 6.551167354644597e-05, + "loss": 0.9298, + "step": 19761 + }, + { + "epoch": 4.215443686006826, + "grad_norm": 0.2273589373020304, + "learning_rate": 6.550330458102903e-05, + "loss": 0.8831, + "step": 19762 + }, + { + "epoch": 4.215656996587031, + "grad_norm": 0.16922292034433065, + "learning_rate": 6.549493577964243e-05, + "loss": 0.9331, + "step": 19763 + }, + { + "epoch": 4.215870307167235, + "grad_norm": 0.19040264120203534, + "learning_rate": 6.548656714238091e-05, + "loss": 0.8628, + "step": 19764 + }, + { + "epoch": 4.21608361774744, + "grad_norm": 0.1541015431704434, + "learning_rate": 6.54781986693391e-05, + "loss": 0.8869, + "step": 19765 + }, + { + "epoch": 4.216296928327645, + "grad_norm": 0.2097069422515439, + "learning_rate": 6.546983036061174e-05, + "loss": 0.9042, + "step": 19766 + }, + { + "epoch": 4.2165102389078495, + "grad_norm": 0.1641229501916126, + "learning_rate": 6.546146221629351e-05, + "loss": 0.8908, + "step": 19767 + }, + { + "epoch": 4.216723549488055, + "grad_norm": 0.16619252377538007, + "learning_rate": 6.545309423647905e-05, + "loss": 0.9125, + "step": 19768 + }, + { + "epoch": 4.21693686006826, + "grad_norm": 0.20257886372890727, + "learning_rate": 6.544472642126306e-05, + "loss": 0.8799, + "step": 19769 + }, + { + "epoch": 4.217150170648464, + "grad_norm": 0.16927928816798252, + "learning_rate": 6.543635877074026e-05, + "loss": 0.9049, + "step": 19770 + }, + { + "epoch": 4.217363481228669, + "grad_norm": 0.16623034475672893, + "learning_rate": 6.542799128500529e-05, + "loss": 0.9, + "step": 19771 + }, + { + "epoch": 4.217576791808874, + "grad_norm": 0.1912582130618933, + "learning_rate": 6.541962396415285e-05, + "loss": 0.8989, + "step": 19772 + }, + { + "epoch": 4.2177901023890785, + "grad_norm": 0.25872402480157414, + "learning_rate": 6.54112568082776e-05, + "loss": 0.9012, + "step": 19773 + }, + { + "epoch": 4.218003412969283, + "grad_norm": 0.23175664346513045, + "learning_rate": 6.540288981747424e-05, + "loss": 0.9139, + "step": 19774 + }, + { + "epoch": 4.218216723549488, + "grad_norm": 0.17461155667160644, + "learning_rate": 6.539452299183741e-05, + "loss": 0.9158, + "step": 19775 + }, + { + "epoch": 4.2184300341296925, + "grad_norm": 1.0013674570512883, + "learning_rate": 6.538615633146181e-05, + "loss": 0.9354, + "step": 19776 + }, + { + "epoch": 4.218643344709897, + "grad_norm": 0.1939911168258529, + "learning_rate": 6.53777898364421e-05, + "loss": 0.8886, + "step": 19777 + }, + { + "epoch": 4.218856655290103, + "grad_norm": 0.2247469805241528, + "learning_rate": 6.536942350687297e-05, + "loss": 0.8985, + "step": 19778 + }, + { + "epoch": 4.2190699658703075, + "grad_norm": 0.24271019747692818, + "learning_rate": 6.536105734284903e-05, + "loss": 0.9256, + "step": 19779 + }, + { + "epoch": 4.219283276450512, + "grad_norm": 0.2001144206700291, + "learning_rate": 6.535269134446497e-05, + "loss": 0.9073, + "step": 19780 + }, + { + "epoch": 4.219496587030717, + "grad_norm": 0.2560683747351866, + "learning_rate": 6.534432551181545e-05, + "loss": 0.8681, + "step": 19781 + }, + { + "epoch": 4.2197098976109215, + "grad_norm": 0.22898384557752363, + "learning_rate": 6.533595984499515e-05, + "loss": 0.9221, + "step": 19782 + }, + { + "epoch": 4.219923208191126, + "grad_norm": 0.16573422683784791, + "learning_rate": 6.532759434409868e-05, + "loss": 0.8938, + "step": 19783 + }, + { + "epoch": 4.220136518771331, + "grad_norm": 0.2332217686728847, + "learning_rate": 6.531922900922076e-05, + "loss": 0.9188, + "step": 19784 + }, + { + "epoch": 4.220349829351536, + "grad_norm": 0.1879709658712598, + "learning_rate": 6.531086384045598e-05, + "loss": 0.9019, + "step": 19785 + }, + { + "epoch": 4.22056313993174, + "grad_norm": 0.1781325731678905, + "learning_rate": 6.530249883789904e-05, + "loss": 0.8925, + "step": 19786 + }, + { + "epoch": 4.220776450511945, + "grad_norm": 0.19879204922686236, + "learning_rate": 6.529413400164457e-05, + "loss": 0.9016, + "step": 19787 + }, + { + "epoch": 4.2209897610921505, + "grad_norm": 0.1896635810070223, + "learning_rate": 6.528576933178724e-05, + "loss": 0.9245, + "step": 19788 + }, + { + "epoch": 4.221203071672355, + "grad_norm": 0.17558130809025174, + "learning_rate": 6.527740482842167e-05, + "loss": 0.9261, + "step": 19789 + }, + { + "epoch": 4.22141638225256, + "grad_norm": 0.19294393961730633, + "learning_rate": 6.526904049164252e-05, + "loss": 0.9185, + "step": 19790 + }, + { + "epoch": 4.221629692832765, + "grad_norm": 0.16889581040362378, + "learning_rate": 6.526067632154439e-05, + "loss": 0.9211, + "step": 19791 + }, + { + "epoch": 4.221843003412969, + "grad_norm": 0.1752751928284164, + "learning_rate": 6.525231231822198e-05, + "loss": 0.8938, + "step": 19792 + }, + { + "epoch": 4.222056313993174, + "grad_norm": 0.149655612323033, + "learning_rate": 6.524394848176989e-05, + "loss": 0.8646, + "step": 19793 + }, + { + "epoch": 4.222269624573379, + "grad_norm": 0.17805489939013375, + "learning_rate": 6.523558481228278e-05, + "loss": 0.8813, + "step": 19794 + }, + { + "epoch": 4.222482935153583, + "grad_norm": 0.18407416617517144, + "learning_rate": 6.522722130985527e-05, + "loss": 0.8712, + "step": 19795 + }, + { + "epoch": 4.222696245733788, + "grad_norm": 0.20540602563404614, + "learning_rate": 6.5218857974582e-05, + "loss": 0.8896, + "step": 19796 + }, + { + "epoch": 4.222909556313994, + "grad_norm": 0.1447439832072696, + "learning_rate": 6.52104948065576e-05, + "loss": 0.9227, + "step": 19797 + }, + { + "epoch": 4.223122866894198, + "grad_norm": 0.21733385331013857, + "learning_rate": 6.520213180587672e-05, + "loss": 0.8796, + "step": 19798 + }, + { + "epoch": 4.223336177474403, + "grad_norm": 0.18455514404746934, + "learning_rate": 6.519376897263394e-05, + "loss": 0.8985, + "step": 19799 + }, + { + "epoch": 4.223549488054608, + "grad_norm": 0.17070964130765676, + "learning_rate": 6.518540630692394e-05, + "loss": 0.8926, + "step": 19800 + }, + { + "epoch": 4.223762798634812, + "grad_norm": 0.17246103748964725, + "learning_rate": 6.517704380884132e-05, + "loss": 0.8918, + "step": 19801 + }, + { + "epoch": 4.223976109215017, + "grad_norm": 0.18541171528831887, + "learning_rate": 6.516868147848069e-05, + "loss": 0.913, + "step": 19802 + }, + { + "epoch": 4.224189419795222, + "grad_norm": 0.16633274542506685, + "learning_rate": 6.516031931593665e-05, + "loss": 0.9043, + "step": 19803 + }, + { + "epoch": 4.224402730375426, + "grad_norm": 0.17007474618634297, + "learning_rate": 6.515195732130387e-05, + "loss": 0.8931, + "step": 19804 + }, + { + "epoch": 4.224616040955631, + "grad_norm": 0.1774000421243793, + "learning_rate": 6.514359549467694e-05, + "loss": 0.8877, + "step": 19805 + }, + { + "epoch": 4.224829351535837, + "grad_norm": 0.1804556585242663, + "learning_rate": 6.513523383615048e-05, + "loss": 0.8883, + "step": 19806 + }, + { + "epoch": 4.225042662116041, + "grad_norm": 0.1453830344810504, + "learning_rate": 6.512687234581907e-05, + "loss": 0.8975, + "step": 19807 + }, + { + "epoch": 4.225255972696246, + "grad_norm": 0.1767080424247428, + "learning_rate": 6.51185110237774e-05, + "loss": 0.8978, + "step": 19808 + }, + { + "epoch": 4.225469283276451, + "grad_norm": 0.1537365108453532, + "learning_rate": 6.511014987011998e-05, + "loss": 0.879, + "step": 19809 + }, + { + "epoch": 4.225682593856655, + "grad_norm": 0.17838405389653203, + "learning_rate": 6.51017888849415e-05, + "loss": 0.9072, + "step": 19810 + }, + { + "epoch": 4.22589590443686, + "grad_norm": 0.17494571795772867, + "learning_rate": 6.509342806833654e-05, + "loss": 0.9127, + "step": 19811 + }, + { + "epoch": 4.226109215017065, + "grad_norm": 0.17957988046431694, + "learning_rate": 6.508506742039966e-05, + "loss": 0.9104, + "step": 19812 + }, + { + "epoch": 4.226322525597269, + "grad_norm": 0.16791012235106945, + "learning_rate": 6.50767069412255e-05, + "loss": 0.9268, + "step": 19813 + }, + { + "epoch": 4.226535836177474, + "grad_norm": 0.16717308538296224, + "learning_rate": 6.506834663090866e-05, + "loss": 0.8993, + "step": 19814 + }, + { + "epoch": 4.226749146757679, + "grad_norm": 0.168013878231467, + "learning_rate": 6.505998648954371e-05, + "loss": 0.9186, + "step": 19815 + }, + { + "epoch": 4.226962457337884, + "grad_norm": 0.1986038433006903, + "learning_rate": 6.505162651722529e-05, + "loss": 0.9084, + "step": 19816 + }, + { + "epoch": 4.227175767918089, + "grad_norm": 0.1727207526621501, + "learning_rate": 6.504326671404795e-05, + "loss": 0.9087, + "step": 19817 + }, + { + "epoch": 4.227389078498294, + "grad_norm": 0.18509307111406986, + "learning_rate": 6.50349070801063e-05, + "loss": 0.9099, + "step": 19818 + }, + { + "epoch": 4.227602389078498, + "grad_norm": 0.18385591944352253, + "learning_rate": 6.502654761549493e-05, + "loss": 0.8707, + "step": 19819 + }, + { + "epoch": 4.227815699658703, + "grad_norm": 0.17165966988879564, + "learning_rate": 6.501818832030844e-05, + "loss": 0.9113, + "step": 19820 + }, + { + "epoch": 4.228029010238908, + "grad_norm": 0.18793691218352943, + "learning_rate": 6.500982919464136e-05, + "loss": 0.8859, + "step": 19821 + }, + { + "epoch": 4.228242320819112, + "grad_norm": 0.16167094874861798, + "learning_rate": 6.500147023858835e-05, + "loss": 0.8917, + "step": 19822 + }, + { + "epoch": 4.228455631399317, + "grad_norm": 0.17768771320684906, + "learning_rate": 6.499311145224398e-05, + "loss": 0.9206, + "step": 19823 + }, + { + "epoch": 4.228668941979522, + "grad_norm": 0.18839025516783456, + "learning_rate": 6.498475283570277e-05, + "loss": 0.9142, + "step": 19824 + }, + { + "epoch": 4.228882252559727, + "grad_norm": 0.17562310837207998, + "learning_rate": 6.497639438905934e-05, + "loss": 0.8754, + "step": 19825 + }, + { + "epoch": 4.229095563139932, + "grad_norm": 0.2203299075099592, + "learning_rate": 6.496803611240826e-05, + "loss": 0.9196, + "step": 19826 + }, + { + "epoch": 4.229308873720137, + "grad_norm": 0.17726772355822495, + "learning_rate": 6.495967800584411e-05, + "loss": 0.9001, + "step": 19827 + }, + { + "epoch": 4.229522184300341, + "grad_norm": 0.18833015338649145, + "learning_rate": 6.495132006946145e-05, + "loss": 0.9034, + "step": 19828 + }, + { + "epoch": 4.229735494880546, + "grad_norm": 0.17331534599866907, + "learning_rate": 6.494296230335484e-05, + "loss": 0.8935, + "step": 19829 + }, + { + "epoch": 4.229948805460751, + "grad_norm": 0.17489902106429162, + "learning_rate": 6.49346047076189e-05, + "loss": 0.9013, + "step": 19830 + }, + { + "epoch": 4.2301621160409555, + "grad_norm": 0.17421221434607082, + "learning_rate": 6.492624728234813e-05, + "loss": 0.9217, + "step": 19831 + }, + { + "epoch": 4.23037542662116, + "grad_norm": 0.15851452882145078, + "learning_rate": 6.491789002763714e-05, + "loss": 0.923, + "step": 19832 + }, + { + "epoch": 4.230588737201365, + "grad_norm": 0.1887608568029804, + "learning_rate": 6.490953294358046e-05, + "loss": 0.8784, + "step": 19833 + }, + { + "epoch": 4.2308020477815695, + "grad_norm": 0.16678071547734138, + "learning_rate": 6.490117603027271e-05, + "loss": 0.8993, + "step": 19834 + }, + { + "epoch": 4.231015358361775, + "grad_norm": 0.18718412442271126, + "learning_rate": 6.48928192878084e-05, + "loss": 0.8973, + "step": 19835 + }, + { + "epoch": 4.23122866894198, + "grad_norm": 0.18445572371707836, + "learning_rate": 6.488446271628205e-05, + "loss": 0.8676, + "step": 19836 + }, + { + "epoch": 4.2314419795221845, + "grad_norm": 0.16069278998946837, + "learning_rate": 6.487610631578829e-05, + "loss": 0.9109, + "step": 19837 + }, + { + "epoch": 4.231655290102389, + "grad_norm": 0.15656148668791345, + "learning_rate": 6.486775008642165e-05, + "loss": 0.8811, + "step": 19838 + }, + { + "epoch": 4.231868600682594, + "grad_norm": 0.1710429850806226, + "learning_rate": 6.485939402827665e-05, + "loss": 0.9257, + "step": 19839 + }, + { + "epoch": 4.2320819112627985, + "grad_norm": 0.18136135258698663, + "learning_rate": 6.485103814144788e-05, + "loss": 0.889, + "step": 19840 + }, + { + "epoch": 4.232295221843003, + "grad_norm": 0.20438249876124281, + "learning_rate": 6.484268242602984e-05, + "loss": 0.8999, + "step": 19841 + }, + { + "epoch": 4.232508532423208, + "grad_norm": 0.2090040820115672, + "learning_rate": 6.483432688211713e-05, + "loss": 0.919, + "step": 19842 + }, + { + "epoch": 4.232721843003413, + "grad_norm": 0.18041167963312454, + "learning_rate": 6.482597150980425e-05, + "loss": 0.9137, + "step": 19843 + }, + { + "epoch": 4.232935153583618, + "grad_norm": 0.17389845540932977, + "learning_rate": 6.481761630918577e-05, + "loss": 0.9184, + "step": 19844 + }, + { + "epoch": 4.233148464163823, + "grad_norm": 0.18151140477822084, + "learning_rate": 6.480926128035623e-05, + "loss": 0.8784, + "step": 19845 + }, + { + "epoch": 4.2333617747440275, + "grad_norm": 0.19322011430924835, + "learning_rate": 6.480090642341013e-05, + "loss": 0.9353, + "step": 19846 + }, + { + "epoch": 4.233575085324232, + "grad_norm": 0.1835439314134685, + "learning_rate": 6.479255173844204e-05, + "loss": 0.8909, + "step": 19847 + }, + { + "epoch": 4.233788395904437, + "grad_norm": 0.15975085539837056, + "learning_rate": 6.478419722554646e-05, + "loss": 0.8968, + "step": 19848 + }, + { + "epoch": 4.234001706484642, + "grad_norm": 0.1495144255982273, + "learning_rate": 6.477584288481799e-05, + "loss": 0.8954, + "step": 19849 + }, + { + "epoch": 4.234215017064846, + "grad_norm": 0.18658826455577113, + "learning_rate": 6.476748871635108e-05, + "loss": 0.8629, + "step": 19850 + }, + { + "epoch": 4.234428327645051, + "grad_norm": 0.19515578464020372, + "learning_rate": 6.47591347202403e-05, + "loss": 0.9171, + "step": 19851 + }, + { + "epoch": 4.234641638225256, + "grad_norm": 0.1612011081287848, + "learning_rate": 6.475078089658016e-05, + "loss": 0.8996, + "step": 19852 + }, + { + "epoch": 4.234854948805461, + "grad_norm": 0.1927167208754033, + "learning_rate": 6.474242724546519e-05, + "loss": 0.9278, + "step": 19853 + }, + { + "epoch": 4.235068259385666, + "grad_norm": 0.18359561075645173, + "learning_rate": 6.473407376698992e-05, + "loss": 0.9051, + "step": 19854 + }, + { + "epoch": 4.235281569965871, + "grad_norm": 0.14819970865054968, + "learning_rate": 6.472572046124885e-05, + "loss": 0.8762, + "step": 19855 + }, + { + "epoch": 4.235494880546075, + "grad_norm": 0.18363018673472808, + "learning_rate": 6.471736732833654e-05, + "loss": 0.869, + "step": 19856 + }, + { + "epoch": 4.23570819112628, + "grad_norm": 0.1656730115447951, + "learning_rate": 6.470901436834747e-05, + "loss": 0.8785, + "step": 19857 + }, + { + "epoch": 4.235921501706485, + "grad_norm": 0.1572323766087864, + "learning_rate": 6.470066158137614e-05, + "loss": 0.8916, + "step": 19858 + }, + { + "epoch": 4.236134812286689, + "grad_norm": 0.21589813650071762, + "learning_rate": 6.469230896751708e-05, + "loss": 0.9174, + "step": 19859 + }, + { + "epoch": 4.236348122866894, + "grad_norm": 0.15763139486867622, + "learning_rate": 6.468395652686481e-05, + "loss": 0.9282, + "step": 19860 + }, + { + "epoch": 4.236561433447099, + "grad_norm": 0.22869403706086583, + "learning_rate": 6.467560425951383e-05, + "loss": 0.9286, + "step": 19861 + }, + { + "epoch": 4.236774744027303, + "grad_norm": 0.16903126808820496, + "learning_rate": 6.466725216555865e-05, + "loss": 0.8841, + "step": 19862 + }, + { + "epoch": 4.236988054607509, + "grad_norm": 0.16692498197890338, + "learning_rate": 6.465890024509375e-05, + "loss": 0.8751, + "step": 19863 + }, + { + "epoch": 4.237201365187714, + "grad_norm": 0.1385210090863351, + "learning_rate": 6.465054849821369e-05, + "loss": 0.9042, + "step": 19864 + }, + { + "epoch": 4.237414675767918, + "grad_norm": 0.16822360264920055, + "learning_rate": 6.464219692501287e-05, + "loss": 0.8998, + "step": 19865 + }, + { + "epoch": 4.237627986348123, + "grad_norm": 0.15924513051369202, + "learning_rate": 6.463384552558591e-05, + "loss": 0.923, + "step": 19866 + }, + { + "epoch": 4.237841296928328, + "grad_norm": 0.15959799102457967, + "learning_rate": 6.462549430002725e-05, + "loss": 0.9181, + "step": 19867 + }, + { + "epoch": 4.238054607508532, + "grad_norm": 0.1404778326268816, + "learning_rate": 6.461714324843133e-05, + "loss": 0.9011, + "step": 19868 + }, + { + "epoch": 4.238267918088737, + "grad_norm": 0.1612768289397601, + "learning_rate": 6.460879237089274e-05, + "loss": 0.8829, + "step": 19869 + }, + { + "epoch": 4.238481228668942, + "grad_norm": 0.16193537320162418, + "learning_rate": 6.460044166750588e-05, + "loss": 0.9034, + "step": 19870 + }, + { + "epoch": 4.238694539249146, + "grad_norm": 0.18011883971665746, + "learning_rate": 6.459209113836531e-05, + "loss": 0.898, + "step": 19871 + }, + { + "epoch": 4.238907849829351, + "grad_norm": 0.162985968819706, + "learning_rate": 6.458374078356547e-05, + "loss": 0.8651, + "step": 19872 + }, + { + "epoch": 4.239121160409557, + "grad_norm": 0.18755325463242106, + "learning_rate": 6.457539060320088e-05, + "loss": 0.9026, + "step": 19873 + }, + { + "epoch": 4.239334470989761, + "grad_norm": 0.20469447204930716, + "learning_rate": 6.4567040597366e-05, + "loss": 0.9037, + "step": 19874 + }, + { + "epoch": 4.239547781569966, + "grad_norm": 0.18444265948766891, + "learning_rate": 6.455869076615531e-05, + "loss": 0.9033, + "step": 19875 + }, + { + "epoch": 4.239761092150171, + "grad_norm": 0.17748031176332277, + "learning_rate": 6.45503411096633e-05, + "loss": 0.9165, + "step": 19876 + }, + { + "epoch": 4.239974402730375, + "grad_norm": 0.18667270873573366, + "learning_rate": 6.454199162798441e-05, + "loss": 0.9006, + "step": 19877 + }, + { + "epoch": 4.24018771331058, + "grad_norm": 0.19444170710442715, + "learning_rate": 6.453364232121319e-05, + "loss": 0.9037, + "step": 19878 + }, + { + "epoch": 4.240401023890785, + "grad_norm": 0.22226147141391986, + "learning_rate": 6.452529318944407e-05, + "loss": 0.8892, + "step": 19879 + }, + { + "epoch": 4.2406143344709895, + "grad_norm": 0.22809384698626245, + "learning_rate": 6.451694423277148e-05, + "loss": 0.854, + "step": 19880 + }, + { + "epoch": 4.240827645051194, + "grad_norm": 0.15821594578178425, + "learning_rate": 6.450859545128995e-05, + "loss": 0.8936, + "step": 19881 + }, + { + "epoch": 4.2410409556314, + "grad_norm": 0.19165274186240996, + "learning_rate": 6.450024684509391e-05, + "loss": 0.9107, + "step": 19882 + }, + { + "epoch": 4.241254266211604, + "grad_norm": 0.17147336311935588, + "learning_rate": 6.449189841427784e-05, + "loss": 0.9065, + "step": 19883 + }, + { + "epoch": 4.241467576791809, + "grad_norm": 0.1827107303072908, + "learning_rate": 6.44835501589362e-05, + "loss": 0.9064, + "step": 19884 + }, + { + "epoch": 4.241680887372014, + "grad_norm": 0.20976904280727837, + "learning_rate": 6.447520207916346e-05, + "loss": 0.9174, + "step": 19885 + }, + { + "epoch": 4.2418941979522184, + "grad_norm": 0.15497357123780495, + "learning_rate": 6.446685417505407e-05, + "loss": 0.9344, + "step": 19886 + }, + { + "epoch": 4.242107508532423, + "grad_norm": 0.21969164483271522, + "learning_rate": 6.445850644670247e-05, + "loss": 0.9411, + "step": 19887 + }, + { + "epoch": 4.242320819112628, + "grad_norm": 0.19743010926017895, + "learning_rate": 6.445015889420314e-05, + "loss": 0.9297, + "step": 19888 + }, + { + "epoch": 4.2425341296928325, + "grad_norm": 0.27130335453233617, + "learning_rate": 6.444181151765053e-05, + "loss": 0.9008, + "step": 19889 + }, + { + "epoch": 4.242747440273037, + "grad_norm": 0.26789607859234726, + "learning_rate": 6.44334643171391e-05, + "loss": 0.9079, + "step": 19890 + }, + { + "epoch": 4.242960750853243, + "grad_norm": 0.20438496951032484, + "learning_rate": 6.442511729276326e-05, + "loss": 0.8878, + "step": 19891 + }, + { + "epoch": 4.243174061433447, + "grad_norm": 0.22550387393382326, + "learning_rate": 6.441677044461747e-05, + "loss": 0.8847, + "step": 19892 + }, + { + "epoch": 4.243387372013652, + "grad_norm": 0.2996175040896547, + "learning_rate": 6.44084237727962e-05, + "loss": 0.9487, + "step": 19893 + }, + { + "epoch": 4.243600682593857, + "grad_norm": 0.2839364309040698, + "learning_rate": 6.440007727739387e-05, + "loss": 0.9029, + "step": 19894 + }, + { + "epoch": 4.2438139931740615, + "grad_norm": 0.1921843689114039, + "learning_rate": 6.439173095850493e-05, + "loss": 0.8988, + "step": 19895 + }, + { + "epoch": 4.244027303754266, + "grad_norm": 0.18150188404179737, + "learning_rate": 6.438338481622382e-05, + "loss": 0.9042, + "step": 19896 + }, + { + "epoch": 4.244240614334471, + "grad_norm": 0.24877502035582882, + "learning_rate": 6.437503885064497e-05, + "loss": 0.9138, + "step": 19897 + }, + { + "epoch": 4.2444539249146755, + "grad_norm": 0.22718766708914484, + "learning_rate": 6.436669306186284e-05, + "loss": 0.9193, + "step": 19898 + }, + { + "epoch": 4.24466723549488, + "grad_norm": 0.15370209820358222, + "learning_rate": 6.43583474499718e-05, + "loss": 0.9267, + "step": 19899 + }, + { + "epoch": 4.244880546075085, + "grad_norm": 0.2240717805472958, + "learning_rate": 6.435000201506634e-05, + "loss": 0.9042, + "step": 19900 + }, + { + "epoch": 4.2450938566552905, + "grad_norm": 0.21626733655065128, + "learning_rate": 6.43416567572409e-05, + "loss": 0.8931, + "step": 19901 + }, + { + "epoch": 4.245307167235495, + "grad_norm": 0.20976734858273113, + "learning_rate": 6.433331167658985e-05, + "loss": 0.9189, + "step": 19902 + }, + { + "epoch": 4.2455204778157, + "grad_norm": 0.18259230740704072, + "learning_rate": 6.432496677320765e-05, + "loss": 0.9041, + "step": 19903 + }, + { + "epoch": 4.2457337883959045, + "grad_norm": 0.23354078776181367, + "learning_rate": 6.431662204718869e-05, + "loss": 0.9163, + "step": 19904 + }, + { + "epoch": 4.245947098976109, + "grad_norm": 0.25499089310826023, + "learning_rate": 6.430827749862744e-05, + "loss": 0.9165, + "step": 19905 + }, + { + "epoch": 4.246160409556314, + "grad_norm": 0.15642140744773086, + "learning_rate": 6.429993312761827e-05, + "loss": 0.8966, + "step": 19906 + }, + { + "epoch": 4.246373720136519, + "grad_norm": 0.2153579131482555, + "learning_rate": 6.429158893425565e-05, + "loss": 0.899, + "step": 19907 + }, + { + "epoch": 4.246587030716723, + "grad_norm": 0.25650873931799784, + "learning_rate": 6.428324491863394e-05, + "loss": 0.9007, + "step": 19908 + }, + { + "epoch": 4.246800341296928, + "grad_norm": 0.22753108094455482, + "learning_rate": 6.427490108084761e-05, + "loss": 0.9017, + "step": 19909 + }, + { + "epoch": 4.2470136518771335, + "grad_norm": 0.14242427464078145, + "learning_rate": 6.426655742099101e-05, + "loss": 0.8771, + "step": 19910 + }, + { + "epoch": 4.247226962457338, + "grad_norm": 0.19777092671916408, + "learning_rate": 6.425821393915858e-05, + "loss": 0.869, + "step": 19911 + }, + { + "epoch": 4.247440273037543, + "grad_norm": 0.19747635319551424, + "learning_rate": 6.424987063544476e-05, + "loss": 0.897, + "step": 19912 + }, + { + "epoch": 4.247653583617748, + "grad_norm": 0.18138853833376783, + "learning_rate": 6.42415275099439e-05, + "loss": 0.8842, + "step": 19913 + }, + { + "epoch": 4.247866894197952, + "grad_norm": 0.1916397763751772, + "learning_rate": 6.42331845627504e-05, + "loss": 0.905, + "step": 19914 + }, + { + "epoch": 4.248080204778157, + "grad_norm": 0.21860008253334462, + "learning_rate": 6.422484179395869e-05, + "loss": 0.9136, + "step": 19915 + }, + { + "epoch": 4.248293515358362, + "grad_norm": 0.25191021477196607, + "learning_rate": 6.421649920366316e-05, + "loss": 0.9367, + "step": 19916 + }, + { + "epoch": 4.248506825938566, + "grad_norm": 0.18985266564043912, + "learning_rate": 6.420815679195821e-05, + "loss": 0.9074, + "step": 19917 + }, + { + "epoch": 4.248720136518771, + "grad_norm": 0.1673275465656302, + "learning_rate": 6.419981455893823e-05, + "loss": 0.9016, + "step": 19918 + }, + { + "epoch": 4.248933447098976, + "grad_norm": 0.2079081991944822, + "learning_rate": 6.419147250469762e-05, + "loss": 0.8854, + "step": 19919 + }, + { + "epoch": 4.249146757679181, + "grad_norm": 0.24422308777964788, + "learning_rate": 6.418313062933074e-05, + "loss": 0.8871, + "step": 19920 + }, + { + "epoch": 4.249360068259386, + "grad_norm": 0.1793870555209805, + "learning_rate": 6.417478893293203e-05, + "loss": 0.8913, + "step": 19921 + }, + { + "epoch": 4.249573378839591, + "grad_norm": 0.18074806771313107, + "learning_rate": 6.416644741559586e-05, + "loss": 0.9389, + "step": 19922 + }, + { + "epoch": 4.249786689419795, + "grad_norm": 0.24852029911761683, + "learning_rate": 6.41581060774166e-05, + "loss": 0.9171, + "step": 19923 + }, + { + "epoch": 4.25, + "grad_norm": 0.23514563166530464, + "learning_rate": 6.414976491848861e-05, + "loss": 0.9154, + "step": 19924 + }, + { + "epoch": 4.250213310580205, + "grad_norm": 0.15719739770758917, + "learning_rate": 6.414142393890633e-05, + "loss": 0.889, + "step": 19925 + }, + { + "epoch": 4.250426621160409, + "grad_norm": 0.17042166258758823, + "learning_rate": 6.413308313876407e-05, + "loss": 0.8951, + "step": 19926 + }, + { + "epoch": 4.250639931740614, + "grad_norm": 0.19651624889480349, + "learning_rate": 6.412474251815627e-05, + "loss": 0.9024, + "step": 19927 + }, + { + "epoch": 4.250853242320819, + "grad_norm": 0.23716489336898766, + "learning_rate": 6.411640207717724e-05, + "loss": 0.9237, + "step": 19928 + }, + { + "epoch": 4.251066552901024, + "grad_norm": 0.20300767580889495, + "learning_rate": 6.41080618159214e-05, + "loss": 0.9089, + "step": 19929 + }, + { + "epoch": 4.251279863481229, + "grad_norm": 0.15804472331267586, + "learning_rate": 6.409972173448312e-05, + "loss": 0.8931, + "step": 19930 + }, + { + "epoch": 4.251493174061434, + "grad_norm": 0.21827851689228125, + "learning_rate": 6.409138183295675e-05, + "loss": 0.8841, + "step": 19931 + }, + { + "epoch": 4.251706484641638, + "grad_norm": 0.19513165645812605, + "learning_rate": 6.408304211143666e-05, + "loss": 0.9565, + "step": 19932 + }, + { + "epoch": 4.251919795221843, + "grad_norm": 0.15547308684254943, + "learning_rate": 6.407470257001723e-05, + "loss": 0.8736, + "step": 19933 + }, + { + "epoch": 4.252133105802048, + "grad_norm": 0.16773022753216527, + "learning_rate": 6.40663632087928e-05, + "loss": 0.8891, + "step": 19934 + }, + { + "epoch": 4.252346416382252, + "grad_norm": 0.14712823280522191, + "learning_rate": 6.405802402785776e-05, + "loss": 0.9114, + "step": 19935 + }, + { + "epoch": 4.252559726962457, + "grad_norm": 0.1510679769587163, + "learning_rate": 6.404968502730641e-05, + "loss": 0.895, + "step": 19936 + }, + { + "epoch": 4.252773037542662, + "grad_norm": 0.14605638548101732, + "learning_rate": 6.404134620723316e-05, + "loss": 0.8789, + "step": 19937 + }, + { + "epoch": 4.252986348122867, + "grad_norm": 0.1341299727626215, + "learning_rate": 6.403300756773233e-05, + "loss": 0.877, + "step": 19938 + }, + { + "epoch": 4.253199658703072, + "grad_norm": 0.13867888805927794, + "learning_rate": 6.402466910889832e-05, + "loss": 0.8731, + "step": 19939 + }, + { + "epoch": 4.253412969283277, + "grad_norm": 0.1606218161789329, + "learning_rate": 6.401633083082541e-05, + "loss": 0.9235, + "step": 19940 + }, + { + "epoch": 4.253626279863481, + "grad_norm": 0.14936996670474928, + "learning_rate": 6.400799273360802e-05, + "loss": 0.9007, + "step": 19941 + }, + { + "epoch": 4.253839590443686, + "grad_norm": 0.1975125577943235, + "learning_rate": 6.399965481734043e-05, + "loss": 0.9167, + "step": 19942 + }, + { + "epoch": 4.254052901023891, + "grad_norm": 0.19469559821804008, + "learning_rate": 6.399131708211704e-05, + "loss": 0.9169, + "step": 19943 + }, + { + "epoch": 4.2542662116040955, + "grad_norm": 0.15467524766786453, + "learning_rate": 6.398297952803214e-05, + "loss": 0.8944, + "step": 19944 + }, + { + "epoch": 4.2544795221843, + "grad_norm": 0.19429926952692525, + "learning_rate": 6.397464215518011e-05, + "loss": 0.9037, + "step": 19945 + }, + { + "epoch": 4.254692832764505, + "grad_norm": 0.22932878738110377, + "learning_rate": 6.396630496365531e-05, + "loss": 0.8996, + "step": 19946 + }, + { + "epoch": 4.2549061433447095, + "grad_norm": 0.26010664829168945, + "learning_rate": 6.395796795355201e-05, + "loss": 0.9092, + "step": 19947 + }, + { + "epoch": 4.255119453924915, + "grad_norm": 0.2024389651381048, + "learning_rate": 6.394963112496455e-05, + "loss": 0.9037, + "step": 19948 + }, + { + "epoch": 4.25533276450512, + "grad_norm": 0.1459156562367151, + "learning_rate": 6.39412944779873e-05, + "loss": 0.9059, + "step": 19949 + }, + { + "epoch": 4.2555460750853245, + "grad_norm": 0.17898510861993494, + "learning_rate": 6.393295801271457e-05, + "loss": 0.9086, + "step": 19950 + }, + { + "epoch": 4.255759385665529, + "grad_norm": 0.18288125091022492, + "learning_rate": 6.39246217292407e-05, + "loss": 0.9142, + "step": 19951 + }, + { + "epoch": 4.255972696245734, + "grad_norm": 0.16767252757386325, + "learning_rate": 6.391628562765998e-05, + "loss": 0.8904, + "step": 19952 + }, + { + "epoch": 4.2561860068259385, + "grad_norm": 0.15149943771540048, + "learning_rate": 6.39079497080668e-05, + "loss": 0.8943, + "step": 19953 + }, + { + "epoch": 4.256399317406143, + "grad_norm": 0.17634979924297345, + "learning_rate": 6.38996139705554e-05, + "loss": 0.9072, + "step": 19954 + }, + { + "epoch": 4.256612627986348, + "grad_norm": 0.1816004496438849, + "learning_rate": 6.389127841522016e-05, + "loss": 0.9034, + "step": 19955 + }, + { + "epoch": 4.256825938566553, + "grad_norm": 0.15892218282076864, + "learning_rate": 6.388294304215539e-05, + "loss": 0.9136, + "step": 19956 + }, + { + "epoch": 4.257039249146757, + "grad_norm": 0.1794845524960937, + "learning_rate": 6.387460785145539e-05, + "loss": 0.8838, + "step": 19957 + }, + { + "epoch": 4.257252559726963, + "grad_norm": 0.17214177632047226, + "learning_rate": 6.386627284321444e-05, + "loss": 0.9306, + "step": 19958 + }, + { + "epoch": 4.2574658703071675, + "grad_norm": 0.16184867646043388, + "learning_rate": 6.385793801752692e-05, + "loss": 0.9238, + "step": 19959 + }, + { + "epoch": 4.257679180887372, + "grad_norm": 0.17948409445122343, + "learning_rate": 6.384960337448707e-05, + "loss": 0.9068, + "step": 19960 + }, + { + "epoch": 4.257892491467577, + "grad_norm": 0.1531714863725489, + "learning_rate": 6.384126891418924e-05, + "loss": 0.8681, + "step": 19961 + }, + { + "epoch": 4.2581058020477816, + "grad_norm": 0.1940799551363312, + "learning_rate": 6.383293463672772e-05, + "loss": 0.8627, + "step": 19962 + }, + { + "epoch": 4.258319112627986, + "grad_norm": 0.17995104710878082, + "learning_rate": 6.382460054219684e-05, + "loss": 0.9047, + "step": 19963 + }, + { + "epoch": 4.258532423208191, + "grad_norm": 0.22495540728691296, + "learning_rate": 6.381626663069083e-05, + "loss": 0.9049, + "step": 19964 + }, + { + "epoch": 4.258745733788396, + "grad_norm": 0.21732870247123262, + "learning_rate": 6.380793290230408e-05, + "loss": 0.8977, + "step": 19965 + }, + { + "epoch": 4.2589590443686, + "grad_norm": 0.16658269834074269, + "learning_rate": 6.379959935713082e-05, + "loss": 0.9276, + "step": 19966 + }, + { + "epoch": 4.259172354948806, + "grad_norm": 0.23469897951526755, + "learning_rate": 6.379126599526538e-05, + "loss": 0.8863, + "step": 19967 + }, + { + "epoch": 4.2593856655290105, + "grad_norm": 0.21459953381180508, + "learning_rate": 6.378293281680205e-05, + "loss": 0.9476, + "step": 19968 + }, + { + "epoch": 4.259598976109215, + "grad_norm": 0.20647934405871562, + "learning_rate": 6.377459982183511e-05, + "loss": 0.9211, + "step": 19969 + }, + { + "epoch": 4.25981228668942, + "grad_norm": 0.18652781038730948, + "learning_rate": 6.37662670104588e-05, + "loss": 0.9123, + "step": 19970 + }, + { + "epoch": 4.260025597269625, + "grad_norm": 0.1622543320172809, + "learning_rate": 6.375793438276748e-05, + "loss": 0.9076, + "step": 19971 + }, + { + "epoch": 4.260238907849829, + "grad_norm": 0.1854617606389392, + "learning_rate": 6.374960193885538e-05, + "loss": 0.9228, + "step": 19972 + }, + { + "epoch": 4.260452218430034, + "grad_norm": 0.21509588332616716, + "learning_rate": 6.374126967881685e-05, + "loss": 0.9369, + "step": 19973 + }, + { + "epoch": 4.260665529010239, + "grad_norm": 0.16126928618223643, + "learning_rate": 6.37329376027461e-05, + "loss": 0.8911, + "step": 19974 + }, + { + "epoch": 4.260878839590443, + "grad_norm": 0.19667130250815582, + "learning_rate": 6.372460571073744e-05, + "loss": 0.8887, + "step": 19975 + }, + { + "epoch": 4.261092150170649, + "grad_norm": 0.20319312720049912, + "learning_rate": 6.371627400288513e-05, + "loss": 0.9131, + "step": 19976 + }, + { + "epoch": 4.261305460750854, + "grad_norm": 0.19836114142206696, + "learning_rate": 6.370794247928348e-05, + "loss": 0.9078, + "step": 19977 + }, + { + "epoch": 4.261518771331058, + "grad_norm": 0.21740056181621656, + "learning_rate": 6.369961114002672e-05, + "loss": 0.8605, + "step": 19978 + }, + { + "epoch": 4.261732081911263, + "grad_norm": 0.19786316822493533, + "learning_rate": 6.369127998520915e-05, + "loss": 0.905, + "step": 19979 + }, + { + "epoch": 4.261945392491468, + "grad_norm": 0.21023354617745715, + "learning_rate": 6.3682949014925e-05, + "loss": 0.9189, + "step": 19980 + }, + { + "epoch": 4.262158703071672, + "grad_norm": 0.1931558362471058, + "learning_rate": 6.367461822926857e-05, + "loss": 0.8877, + "step": 19981 + }, + { + "epoch": 4.262372013651877, + "grad_norm": 0.21616741134476422, + "learning_rate": 6.36662876283341e-05, + "loss": 0.9189, + "step": 19982 + }, + { + "epoch": 4.262585324232082, + "grad_norm": 0.1732029162976782, + "learning_rate": 6.365795721221587e-05, + "loss": 0.9017, + "step": 19983 + }, + { + "epoch": 4.262798634812286, + "grad_norm": 0.22702908640018687, + "learning_rate": 6.36496269810081e-05, + "loss": 0.8756, + "step": 19984 + }, + { + "epoch": 4.263011945392491, + "grad_norm": 0.17305523071430798, + "learning_rate": 6.36412969348051e-05, + "loss": 0.9236, + "step": 19985 + }, + { + "epoch": 4.263225255972697, + "grad_norm": 0.2242695186210594, + "learning_rate": 6.363296707370108e-05, + "loss": 0.8834, + "step": 19986 + }, + { + "epoch": 4.263438566552901, + "grad_norm": 0.18397931265764178, + "learning_rate": 6.362463739779034e-05, + "loss": 0.9033, + "step": 19987 + }, + { + "epoch": 4.263651877133106, + "grad_norm": 0.2168924945650043, + "learning_rate": 6.361630790716708e-05, + "loss": 0.9084, + "step": 19988 + }, + { + "epoch": 4.263865187713311, + "grad_norm": 0.22797487318748683, + "learning_rate": 6.36079786019256e-05, + "loss": 0.8828, + "step": 19989 + }, + { + "epoch": 4.264078498293515, + "grad_norm": 0.1718560823506376, + "learning_rate": 6.359964948216012e-05, + "loss": 0.8987, + "step": 19990 + }, + { + "epoch": 4.26429180887372, + "grad_norm": 0.2100776284742305, + "learning_rate": 6.359132054796486e-05, + "loss": 0.9021, + "step": 19991 + }, + { + "epoch": 4.264505119453925, + "grad_norm": 0.15635627106039435, + "learning_rate": 6.358299179943409e-05, + "loss": 0.8893, + "step": 19992 + }, + { + "epoch": 4.264718430034129, + "grad_norm": 0.17571778998200446, + "learning_rate": 6.357466323666204e-05, + "loss": 0.8986, + "step": 19993 + }, + { + "epoch": 4.264931740614334, + "grad_norm": 0.17462208828107237, + "learning_rate": 6.356633485974294e-05, + "loss": 0.8891, + "step": 19994 + }, + { + "epoch": 4.265145051194539, + "grad_norm": 0.15465702645331153, + "learning_rate": 6.355800666877106e-05, + "loss": 0.9102, + "step": 19995 + }, + { + "epoch": 4.265358361774744, + "grad_norm": 0.17267388547445628, + "learning_rate": 6.354967866384059e-05, + "loss": 0.9097, + "step": 19996 + }, + { + "epoch": 4.265571672354949, + "grad_norm": 0.15634149335820396, + "learning_rate": 6.354135084504581e-05, + "loss": 0.9142, + "step": 19997 + }, + { + "epoch": 4.265784982935154, + "grad_norm": 0.17562485586100274, + "learning_rate": 6.353302321248089e-05, + "loss": 0.8751, + "step": 19998 + }, + { + "epoch": 4.265998293515358, + "grad_norm": 0.16568099829068572, + "learning_rate": 6.352469576624011e-05, + "loss": 0.8798, + "step": 19999 + }, + { + "epoch": 4.266211604095563, + "grad_norm": 0.17100786793648717, + "learning_rate": 6.351636850641766e-05, + "loss": 0.8999, + "step": 20000 + }, + { + "epoch": 4.266424914675768, + "grad_norm": 0.1891453943358859, + "learning_rate": 6.35080414331078e-05, + "loss": 0.9058, + "step": 20001 + }, + { + "epoch": 4.2666382252559725, + "grad_norm": 0.18370575311526638, + "learning_rate": 6.349971454640475e-05, + "loss": 0.914, + "step": 20002 + }, + { + "epoch": 4.266851535836177, + "grad_norm": 0.18238357891083626, + "learning_rate": 6.349138784640266e-05, + "loss": 0.9033, + "step": 20003 + }, + { + "epoch": 4.267064846416382, + "grad_norm": 0.19592630472399983, + "learning_rate": 6.348306133319581e-05, + "loss": 0.8828, + "step": 20004 + }, + { + "epoch": 4.267278156996587, + "grad_norm": 0.19038421127588545, + "learning_rate": 6.34747350068784e-05, + "loss": 0.8948, + "step": 20005 + }, + { + "epoch": 4.267491467576792, + "grad_norm": 0.22208024781639418, + "learning_rate": 6.346640886754463e-05, + "loss": 0.9187, + "step": 20006 + }, + { + "epoch": 4.267704778156997, + "grad_norm": 0.2062390090015615, + "learning_rate": 6.345808291528874e-05, + "loss": 0.8916, + "step": 20007 + }, + { + "epoch": 4.2679180887372015, + "grad_norm": 0.22222943654037877, + "learning_rate": 6.344975715020489e-05, + "loss": 0.9264, + "step": 20008 + }, + { + "epoch": 4.268131399317406, + "grad_norm": 0.1847977156597027, + "learning_rate": 6.344143157238736e-05, + "loss": 0.8954, + "step": 20009 + }, + { + "epoch": 4.268344709897611, + "grad_norm": 0.18603740629971294, + "learning_rate": 6.343310618193027e-05, + "loss": 0.9331, + "step": 20010 + }, + { + "epoch": 4.2685580204778155, + "grad_norm": 0.24767892712888365, + "learning_rate": 6.342478097892788e-05, + "loss": 0.9144, + "step": 20011 + }, + { + "epoch": 4.26877133105802, + "grad_norm": 0.23674504144965747, + "learning_rate": 6.341645596347437e-05, + "loss": 0.9166, + "step": 20012 + }, + { + "epoch": 4.268984641638225, + "grad_norm": 0.22294387606969346, + "learning_rate": 6.340813113566396e-05, + "loss": 0.8955, + "step": 20013 + }, + { + "epoch": 4.2691979522184305, + "grad_norm": 0.17116337925338124, + "learning_rate": 6.339980649559082e-05, + "loss": 0.942, + "step": 20014 + }, + { + "epoch": 4.269411262798635, + "grad_norm": 0.27178469509210046, + "learning_rate": 6.339148204334914e-05, + "loss": 0.8776, + "step": 20015 + }, + { + "epoch": 4.26962457337884, + "grad_norm": 0.2479221038829858, + "learning_rate": 6.33831577790331e-05, + "loss": 0.9234, + "step": 20016 + }, + { + "epoch": 4.2698378839590445, + "grad_norm": 0.19055023666627507, + "learning_rate": 6.337483370273693e-05, + "loss": 0.8867, + "step": 20017 + }, + { + "epoch": 4.270051194539249, + "grad_norm": 0.21093306934904146, + "learning_rate": 6.336650981455478e-05, + "loss": 0.9354, + "step": 20018 + }, + { + "epoch": 4.270264505119454, + "grad_norm": 0.23780717935213053, + "learning_rate": 6.335818611458086e-05, + "loss": 0.8639, + "step": 20019 + }, + { + "epoch": 4.270477815699659, + "grad_norm": 0.23015366029429793, + "learning_rate": 6.334986260290934e-05, + "loss": 0.9212, + "step": 20020 + }, + { + "epoch": 4.270691126279863, + "grad_norm": 0.21407189042429137, + "learning_rate": 6.334153927963442e-05, + "loss": 0.9086, + "step": 20021 + }, + { + "epoch": 4.270904436860068, + "grad_norm": 0.1979362555649636, + "learning_rate": 6.333321614485024e-05, + "loss": 0.9228, + "step": 20022 + }, + { + "epoch": 4.2711177474402735, + "grad_norm": 0.24195607964386898, + "learning_rate": 6.332489319865102e-05, + "loss": 0.8953, + "step": 20023 + }, + { + "epoch": 4.271331058020478, + "grad_norm": 0.16927818236162817, + "learning_rate": 6.331657044113094e-05, + "loss": 0.9133, + "step": 20024 + }, + { + "epoch": 4.271544368600683, + "grad_norm": 0.16218829108340357, + "learning_rate": 6.330824787238409e-05, + "loss": 0.8916, + "step": 20025 + }, + { + "epoch": 4.271757679180888, + "grad_norm": 0.22931147431163132, + "learning_rate": 6.329992549250473e-05, + "loss": 0.8935, + "step": 20026 + }, + { + "epoch": 4.271970989761092, + "grad_norm": 0.17811957357898237, + "learning_rate": 6.329160330158699e-05, + "loss": 0.9043, + "step": 20027 + }, + { + "epoch": 4.272184300341297, + "grad_norm": 0.1607776831475439, + "learning_rate": 6.328328129972502e-05, + "loss": 0.9003, + "step": 20028 + }, + { + "epoch": 4.272397610921502, + "grad_norm": 0.20109277243183143, + "learning_rate": 6.327495948701303e-05, + "loss": 0.9016, + "step": 20029 + }, + { + "epoch": 4.272610921501706, + "grad_norm": 0.22219930480289618, + "learning_rate": 6.326663786354513e-05, + "loss": 0.8762, + "step": 20030 + }, + { + "epoch": 4.272824232081911, + "grad_norm": 0.20739455850965444, + "learning_rate": 6.325831642941552e-05, + "loss": 0.9058, + "step": 20031 + }, + { + "epoch": 4.273037542662116, + "grad_norm": 0.1431290866168131, + "learning_rate": 6.324999518471833e-05, + "loss": 0.9179, + "step": 20032 + }, + { + "epoch": 4.273250853242321, + "grad_norm": 0.21850935463702656, + "learning_rate": 6.324167412954773e-05, + "loss": 0.9314, + "step": 20033 + }, + { + "epoch": 4.273464163822526, + "grad_norm": 0.2246015779738975, + "learning_rate": 6.323335326399786e-05, + "loss": 0.9276, + "step": 20034 + }, + { + "epoch": 4.273677474402731, + "grad_norm": 0.18814290193467892, + "learning_rate": 6.322503258816291e-05, + "loss": 0.8892, + "step": 20035 + }, + { + "epoch": 4.273890784982935, + "grad_norm": 0.16139370457627084, + "learning_rate": 6.3216712102137e-05, + "loss": 0.9134, + "step": 20036 + }, + { + "epoch": 4.27410409556314, + "grad_norm": 0.17162198164646766, + "learning_rate": 6.320839180601424e-05, + "loss": 0.9296, + "step": 20037 + }, + { + "epoch": 4.274317406143345, + "grad_norm": 0.1812728104496565, + "learning_rate": 6.320007169988883e-05, + "loss": 0.8946, + "step": 20038 + }, + { + "epoch": 4.274530716723549, + "grad_norm": 0.18492485446376863, + "learning_rate": 6.319175178385488e-05, + "loss": 0.9163, + "step": 20039 + }, + { + "epoch": 4.274744027303754, + "grad_norm": 0.1648936069924789, + "learning_rate": 6.318343205800654e-05, + "loss": 0.893, + "step": 20040 + }, + { + "epoch": 4.274957337883959, + "grad_norm": 0.1734517291287044, + "learning_rate": 6.317511252243797e-05, + "loss": 0.9104, + "step": 20041 + }, + { + "epoch": 4.275170648464163, + "grad_norm": 0.15005473006966796, + "learning_rate": 6.316679317724326e-05, + "loss": 0.8917, + "step": 20042 + }, + { + "epoch": 4.275383959044369, + "grad_norm": 0.1740110418213955, + "learning_rate": 6.315847402251659e-05, + "loss": 0.8947, + "step": 20043 + }, + { + "epoch": 4.275597269624574, + "grad_norm": 0.15341559521195644, + "learning_rate": 6.315015505835204e-05, + "loss": 0.9133, + "step": 20044 + }, + { + "epoch": 4.275810580204778, + "grad_norm": 0.1714382145245882, + "learning_rate": 6.31418362848438e-05, + "loss": 0.9185, + "step": 20045 + }, + { + "epoch": 4.276023890784983, + "grad_norm": 0.15800925518944392, + "learning_rate": 6.313351770208597e-05, + "loss": 0.9352, + "step": 20046 + }, + { + "epoch": 4.276237201365188, + "grad_norm": 0.19373673250214915, + "learning_rate": 6.312519931017264e-05, + "loss": 0.9099, + "step": 20047 + }, + { + "epoch": 4.276450511945392, + "grad_norm": 0.22017963767349225, + "learning_rate": 6.311688110919798e-05, + "loss": 0.9211, + "step": 20048 + }, + { + "epoch": 4.276663822525597, + "grad_norm": 0.18461481411168268, + "learning_rate": 6.31085630992561e-05, + "loss": 0.8955, + "step": 20049 + }, + { + "epoch": 4.276877133105802, + "grad_norm": 0.21798328892673283, + "learning_rate": 6.31002452804411e-05, + "loss": 0.8729, + "step": 20050 + }, + { + "epoch": 4.277090443686006, + "grad_norm": 0.1882160421519919, + "learning_rate": 6.309192765284713e-05, + "loss": 0.8713, + "step": 20051 + }, + { + "epoch": 4.277303754266212, + "grad_norm": 0.21041958819526904, + "learning_rate": 6.308361021656824e-05, + "loss": 0.8849, + "step": 20052 + }, + { + "epoch": 4.277517064846417, + "grad_norm": 0.22436324108542297, + "learning_rate": 6.307529297169862e-05, + "loss": 0.9048, + "step": 20053 + }, + { + "epoch": 4.277730375426621, + "grad_norm": 0.21922510370327875, + "learning_rate": 6.306697591833233e-05, + "loss": 0.8864, + "step": 20054 + }, + { + "epoch": 4.277943686006826, + "grad_norm": 0.19593645790451739, + "learning_rate": 6.30586590565635e-05, + "loss": 0.8843, + "step": 20055 + }, + { + "epoch": 4.278156996587031, + "grad_norm": 0.1937258892290609, + "learning_rate": 6.305034238648623e-05, + "loss": 0.9362, + "step": 20056 + }, + { + "epoch": 4.278370307167235, + "grad_norm": 0.22739290970350592, + "learning_rate": 6.304202590819461e-05, + "loss": 0.9326, + "step": 20057 + }, + { + "epoch": 4.27858361774744, + "grad_norm": 0.31623640867097313, + "learning_rate": 6.30337096217828e-05, + "loss": 0.8962, + "step": 20058 + }, + { + "epoch": 4.278796928327645, + "grad_norm": 0.18807308787244176, + "learning_rate": 6.302539352734479e-05, + "loss": 0.8795, + "step": 20059 + }, + { + "epoch": 4.2790102389078495, + "grad_norm": 0.16730904493920554, + "learning_rate": 6.301707762497476e-05, + "loss": 0.9213, + "step": 20060 + }, + { + "epoch": 4.279223549488055, + "grad_norm": 0.19811314254974707, + "learning_rate": 6.30087619147668e-05, + "loss": 0.9095, + "step": 20061 + }, + { + "epoch": 4.27943686006826, + "grad_norm": 0.1658586252987201, + "learning_rate": 6.300044639681495e-05, + "loss": 0.8975, + "step": 20062 + }, + { + "epoch": 4.279650170648464, + "grad_norm": 0.18369050708308646, + "learning_rate": 6.299213107121335e-05, + "loss": 0.9066, + "step": 20063 + }, + { + "epoch": 4.279863481228669, + "grad_norm": 0.17451808011415731, + "learning_rate": 6.298381593805606e-05, + "loss": 0.8937, + "step": 20064 + }, + { + "epoch": 4.280076791808874, + "grad_norm": 0.16008268503925832, + "learning_rate": 6.29755009974372e-05, + "loss": 0.8766, + "step": 20065 + }, + { + "epoch": 4.2802901023890785, + "grad_norm": 0.15571110091288148, + "learning_rate": 6.296718624945081e-05, + "loss": 0.9093, + "step": 20066 + }, + { + "epoch": 4.280503412969283, + "grad_norm": 0.16292431933444176, + "learning_rate": 6.295887169419102e-05, + "loss": 0.8912, + "step": 20067 + }, + { + "epoch": 4.280716723549488, + "grad_norm": 0.17414064968270257, + "learning_rate": 6.295055733175186e-05, + "loss": 0.9004, + "step": 20068 + }, + { + "epoch": 4.2809300341296925, + "grad_norm": 0.16384371562643957, + "learning_rate": 6.294224316222748e-05, + "loss": 0.8817, + "step": 20069 + }, + { + "epoch": 4.281143344709897, + "grad_norm": 0.1623368778483837, + "learning_rate": 6.293392918571187e-05, + "loss": 0.9311, + "step": 20070 + }, + { + "epoch": 4.281356655290103, + "grad_norm": 0.1622121835096384, + "learning_rate": 6.292561540229912e-05, + "loss": 0.903, + "step": 20071 + }, + { + "epoch": 4.2815699658703075, + "grad_norm": 0.14960374256410033, + "learning_rate": 6.291730181208335e-05, + "loss": 0.8804, + "step": 20072 + }, + { + "epoch": 4.281783276450512, + "grad_norm": 0.1733979616491972, + "learning_rate": 6.290898841515857e-05, + "loss": 0.9234, + "step": 20073 + }, + { + "epoch": 4.281996587030717, + "grad_norm": 0.15786884838091722, + "learning_rate": 6.290067521161889e-05, + "loss": 0.8911, + "step": 20074 + }, + { + "epoch": 4.2822098976109215, + "grad_norm": 0.1601856361208456, + "learning_rate": 6.289236220155836e-05, + "loss": 0.9004, + "step": 20075 + }, + { + "epoch": 4.282423208191126, + "grad_norm": 0.17733276511488563, + "learning_rate": 6.288404938507101e-05, + "loss": 0.9037, + "step": 20076 + }, + { + "epoch": 4.282636518771331, + "grad_norm": 0.2016144564399419, + "learning_rate": 6.287573676225096e-05, + "loss": 0.8595, + "step": 20077 + }, + { + "epoch": 4.282849829351536, + "grad_norm": 0.18878759865102218, + "learning_rate": 6.286742433319223e-05, + "loss": 0.9072, + "step": 20078 + }, + { + "epoch": 4.28306313993174, + "grad_norm": 0.19681051441430483, + "learning_rate": 6.285911209798888e-05, + "loss": 0.925, + "step": 20079 + }, + { + "epoch": 4.283276450511945, + "grad_norm": 0.19042124535387364, + "learning_rate": 6.2850800056735e-05, + "loss": 0.9135, + "step": 20080 + }, + { + "epoch": 4.2834897610921505, + "grad_norm": 0.17054973606034612, + "learning_rate": 6.284248820952456e-05, + "loss": 0.9215, + "step": 20081 + }, + { + "epoch": 4.283703071672355, + "grad_norm": 0.16867007527178496, + "learning_rate": 6.283417655645167e-05, + "loss": 0.8885, + "step": 20082 + }, + { + "epoch": 4.28391638225256, + "grad_norm": 0.17767872130425538, + "learning_rate": 6.282586509761034e-05, + "loss": 0.8968, + "step": 20083 + }, + { + "epoch": 4.284129692832765, + "grad_norm": 0.18419303482509647, + "learning_rate": 6.281755383309467e-05, + "loss": 0.9224, + "step": 20084 + }, + { + "epoch": 4.284343003412969, + "grad_norm": 0.15941171342376995, + "learning_rate": 6.280924276299866e-05, + "loss": 0.8846, + "step": 20085 + }, + { + "epoch": 4.284556313993174, + "grad_norm": 0.17446756858140472, + "learning_rate": 6.280093188741632e-05, + "loss": 0.9054, + "step": 20086 + }, + { + "epoch": 4.284769624573379, + "grad_norm": 0.17786863594557062, + "learning_rate": 6.279262120644177e-05, + "loss": 0.8845, + "step": 20087 + }, + { + "epoch": 4.284982935153583, + "grad_norm": 0.17647994194657765, + "learning_rate": 6.278431072016897e-05, + "loss": 0.9327, + "step": 20088 + }, + { + "epoch": 4.285196245733788, + "grad_norm": 0.17528072619845983, + "learning_rate": 6.2776000428692e-05, + "loss": 0.914, + "step": 20089 + }, + { + "epoch": 4.285409556313994, + "grad_norm": 0.2440567804448373, + "learning_rate": 6.276769033210487e-05, + "loss": 0.8969, + "step": 20090 + }, + { + "epoch": 4.285622866894198, + "grad_norm": 0.22757508003236432, + "learning_rate": 6.275938043050164e-05, + "loss": 0.8955, + "step": 20091 + }, + { + "epoch": 4.285836177474403, + "grad_norm": 0.16985769866006997, + "learning_rate": 6.275107072397629e-05, + "loss": 0.9049, + "step": 20092 + }, + { + "epoch": 4.286049488054608, + "grad_norm": 0.21713995921041093, + "learning_rate": 6.274276121262285e-05, + "loss": 0.8923, + "step": 20093 + }, + { + "epoch": 4.286262798634812, + "grad_norm": 0.1480996717450969, + "learning_rate": 6.273445189653537e-05, + "loss": 0.8915, + "step": 20094 + }, + { + "epoch": 4.286476109215017, + "grad_norm": 0.24333009766457933, + "learning_rate": 6.272614277580784e-05, + "loss": 0.9066, + "step": 20095 + }, + { + "epoch": 4.286689419795222, + "grad_norm": 0.18604881853845232, + "learning_rate": 6.271783385053432e-05, + "loss": 0.8916, + "step": 20096 + }, + { + "epoch": 4.286902730375426, + "grad_norm": 0.171016705016292, + "learning_rate": 6.270952512080879e-05, + "loss": 0.878, + "step": 20097 + }, + { + "epoch": 4.287116040955631, + "grad_norm": 0.16228518987424606, + "learning_rate": 6.270121658672526e-05, + "loss": 0.8982, + "step": 20098 + }, + { + "epoch": 4.287329351535837, + "grad_norm": 0.1933468529995191, + "learning_rate": 6.26929082483778e-05, + "loss": 0.9137, + "step": 20099 + }, + { + "epoch": 4.287542662116041, + "grad_norm": 0.16398536191060697, + "learning_rate": 6.268460010586031e-05, + "loss": 0.8927, + "step": 20100 + }, + { + "epoch": 4.287755972696246, + "grad_norm": 0.161365651971045, + "learning_rate": 6.267629215926691e-05, + "loss": 0.8806, + "step": 20101 + }, + { + "epoch": 4.287969283276451, + "grad_norm": 0.18598391468047526, + "learning_rate": 6.266798440869157e-05, + "loss": 0.9004, + "step": 20102 + }, + { + "epoch": 4.288182593856655, + "grad_norm": 0.17800134918834776, + "learning_rate": 6.265967685422823e-05, + "loss": 0.9072, + "step": 20103 + }, + { + "epoch": 4.28839590443686, + "grad_norm": 0.22171916827029162, + "learning_rate": 6.265136949597096e-05, + "loss": 0.8848, + "step": 20104 + }, + { + "epoch": 4.288609215017065, + "grad_norm": 0.19152242128341976, + "learning_rate": 6.264306233401373e-05, + "loss": 0.9199, + "step": 20105 + }, + { + "epoch": 4.288822525597269, + "grad_norm": 0.17316211334895404, + "learning_rate": 6.263475536845055e-05, + "loss": 0.8991, + "step": 20106 + }, + { + "epoch": 4.289035836177474, + "grad_norm": 0.19120344357645708, + "learning_rate": 6.262644859937539e-05, + "loss": 0.8906, + "step": 20107 + }, + { + "epoch": 4.289249146757679, + "grad_norm": 0.17457165075399708, + "learning_rate": 6.261814202688227e-05, + "loss": 0.8915, + "step": 20108 + }, + { + "epoch": 4.289462457337884, + "grad_norm": 0.18646811589629717, + "learning_rate": 6.260983565106516e-05, + "loss": 0.9162, + "step": 20109 + }, + { + "epoch": 4.289675767918089, + "grad_norm": 0.3388328718907714, + "learning_rate": 6.260152947201805e-05, + "loss": 0.9365, + "step": 20110 + }, + { + "epoch": 4.289889078498294, + "grad_norm": 0.20834967469701113, + "learning_rate": 6.259322348983493e-05, + "loss": 0.9265, + "step": 20111 + }, + { + "epoch": 4.290102389078498, + "grad_norm": 0.2491736522828083, + "learning_rate": 6.258491770460977e-05, + "loss": 0.8837, + "step": 20112 + }, + { + "epoch": 4.290315699658703, + "grad_norm": 0.2533287821288041, + "learning_rate": 6.257661211643658e-05, + "loss": 0.89, + "step": 20113 + }, + { + "epoch": 4.290529010238908, + "grad_norm": 0.248372186031206, + "learning_rate": 6.256830672540933e-05, + "loss": 0.8929, + "step": 20114 + }, + { + "epoch": 4.290742320819112, + "grad_norm": 0.19298245468016764, + "learning_rate": 6.256000153162195e-05, + "loss": 0.9332, + "step": 20115 + }, + { + "epoch": 4.290955631399317, + "grad_norm": 0.2017984185948763, + "learning_rate": 6.255169653516846e-05, + "loss": 0.8788, + "step": 20116 + }, + { + "epoch": 4.291168941979522, + "grad_norm": 0.2203689463321757, + "learning_rate": 6.254339173614279e-05, + "loss": 0.9071, + "step": 20117 + }, + { + "epoch": 4.2913822525597265, + "grad_norm": 0.15665545759473765, + "learning_rate": 6.253508713463896e-05, + "loss": 0.9271, + "step": 20118 + }, + { + "epoch": 4.291595563139932, + "grad_norm": 0.21856041521529443, + "learning_rate": 6.25267827307509e-05, + "loss": 0.9087, + "step": 20119 + }, + { + "epoch": 4.291808873720137, + "grad_norm": 0.18561766888012513, + "learning_rate": 6.25184785245726e-05, + "loss": 0.8903, + "step": 20120 + }, + { + "epoch": 4.292022184300341, + "grad_norm": 0.19619522084310717, + "learning_rate": 6.251017451619802e-05, + "loss": 0.9046, + "step": 20121 + }, + { + "epoch": 4.292235494880546, + "grad_norm": 0.1676697095712939, + "learning_rate": 6.25018707057211e-05, + "loss": 0.8843, + "step": 20122 + }, + { + "epoch": 4.292448805460751, + "grad_norm": 0.15998068727607437, + "learning_rate": 6.24935670932358e-05, + "loss": 0.8961, + "step": 20123 + }, + { + "epoch": 4.2926621160409555, + "grad_norm": 0.1700089072002001, + "learning_rate": 6.248526367883607e-05, + "loss": 0.9172, + "step": 20124 + }, + { + "epoch": 4.29287542662116, + "grad_norm": 0.1907249486712934, + "learning_rate": 6.247696046261592e-05, + "loss": 0.9139, + "step": 20125 + }, + { + "epoch": 4.293088737201365, + "grad_norm": 0.18264176245988914, + "learning_rate": 6.246865744466923e-05, + "loss": 0.9142, + "step": 20126 + }, + { + "epoch": 4.2933020477815695, + "grad_norm": 0.18138564521962458, + "learning_rate": 6.246035462508998e-05, + "loss": 0.9138, + "step": 20127 + }, + { + "epoch": 4.293515358361775, + "grad_norm": 0.16488605732375858, + "learning_rate": 6.245205200397211e-05, + "loss": 0.9075, + "step": 20128 + }, + { + "epoch": 4.29372866894198, + "grad_norm": 0.15633264368623745, + "learning_rate": 6.244374958140955e-05, + "loss": 0.9056, + "step": 20129 + }, + { + "epoch": 4.2939419795221845, + "grad_norm": 0.174423348725519, + "learning_rate": 6.243544735749628e-05, + "loss": 0.9338, + "step": 20130 + }, + { + "epoch": 4.294155290102389, + "grad_norm": 0.1783271996586622, + "learning_rate": 6.24271453323262e-05, + "loss": 0.9417, + "step": 20131 + }, + { + "epoch": 4.294368600682594, + "grad_norm": 0.1619937153491971, + "learning_rate": 6.241884350599328e-05, + "loss": 0.9521, + "step": 20132 + }, + { + "epoch": 4.2945819112627985, + "grad_norm": 0.19451717985306732, + "learning_rate": 6.241054187859144e-05, + "loss": 0.9393, + "step": 20133 + }, + { + "epoch": 4.294795221843003, + "grad_norm": 0.18467411120477636, + "learning_rate": 6.24022404502146e-05, + "loss": 0.9005, + "step": 20134 + }, + { + "epoch": 4.295008532423208, + "grad_norm": 0.1606977869563022, + "learning_rate": 6.239393922095674e-05, + "loss": 0.9031, + "step": 20135 + }, + { + "epoch": 4.295221843003413, + "grad_norm": 0.18882943120648074, + "learning_rate": 6.238563819091176e-05, + "loss": 0.8916, + "step": 20136 + }, + { + "epoch": 4.295435153583618, + "grad_norm": 0.16294683037371854, + "learning_rate": 6.237733736017353e-05, + "loss": 0.9011, + "step": 20137 + }, + { + "epoch": 4.295648464163823, + "grad_norm": 0.1798158335620019, + "learning_rate": 6.236903672883606e-05, + "loss": 0.9203, + "step": 20138 + }, + { + "epoch": 4.2958617747440275, + "grad_norm": 0.15524468771010752, + "learning_rate": 6.236073629699322e-05, + "loss": 0.8748, + "step": 20139 + }, + { + "epoch": 4.296075085324232, + "grad_norm": 0.16629464732267538, + "learning_rate": 6.235243606473895e-05, + "loss": 0.8878, + "step": 20140 + }, + { + "epoch": 4.296288395904437, + "grad_norm": 0.16663149687700204, + "learning_rate": 6.234413603216715e-05, + "loss": 0.881, + "step": 20141 + }, + { + "epoch": 4.296501706484642, + "grad_norm": 0.19538123701033583, + "learning_rate": 6.233583619937177e-05, + "loss": 0.8867, + "step": 20142 + }, + { + "epoch": 4.296715017064846, + "grad_norm": 0.20162983273558743, + "learning_rate": 6.232753656644668e-05, + "loss": 0.9334, + "step": 20143 + }, + { + "epoch": 4.296928327645051, + "grad_norm": 0.16166873383471803, + "learning_rate": 6.231923713348583e-05, + "loss": 0.9068, + "step": 20144 + }, + { + "epoch": 4.297141638225256, + "grad_norm": 0.2320095037876775, + "learning_rate": 6.23109379005831e-05, + "loss": 0.8991, + "step": 20145 + }, + { + "epoch": 4.297354948805461, + "grad_norm": 0.1829668659703685, + "learning_rate": 6.23026388678324e-05, + "loss": 0.913, + "step": 20146 + }, + { + "epoch": 4.297568259385666, + "grad_norm": 0.20998970855359841, + "learning_rate": 6.229434003532766e-05, + "loss": 0.9436, + "step": 20147 + }, + { + "epoch": 4.297781569965871, + "grad_norm": 0.19241738539604708, + "learning_rate": 6.228604140316275e-05, + "loss": 0.9162, + "step": 20148 + }, + { + "epoch": 4.297994880546075, + "grad_norm": 0.175705275196866, + "learning_rate": 6.227774297143157e-05, + "loss": 0.9147, + "step": 20149 + }, + { + "epoch": 4.29820819112628, + "grad_norm": 0.1867527167227308, + "learning_rate": 6.226944474022804e-05, + "loss": 0.9028, + "step": 20150 + }, + { + "epoch": 4.298421501706485, + "grad_norm": 0.1818036486199741, + "learning_rate": 6.226114670964601e-05, + "loss": 0.8558, + "step": 20151 + }, + { + "epoch": 4.298634812286689, + "grad_norm": 0.16325932550071529, + "learning_rate": 6.225284887977944e-05, + "loss": 0.9045, + "step": 20152 + }, + { + "epoch": 4.298848122866894, + "grad_norm": 0.1836692083423937, + "learning_rate": 6.224455125072215e-05, + "loss": 0.9201, + "step": 20153 + }, + { + "epoch": 4.299061433447099, + "grad_norm": 0.19190976823421155, + "learning_rate": 6.223625382256808e-05, + "loss": 0.9056, + "step": 20154 + }, + { + "epoch": 4.299274744027303, + "grad_norm": 0.18048435822160613, + "learning_rate": 6.22279565954111e-05, + "loss": 0.8981, + "step": 20155 + }, + { + "epoch": 4.299488054607509, + "grad_norm": 0.18566503624162176, + "learning_rate": 6.221965956934509e-05, + "loss": 0.8857, + "step": 20156 + }, + { + "epoch": 4.299701365187714, + "grad_norm": 0.17803499905928322, + "learning_rate": 6.221136274446393e-05, + "loss": 0.9252, + "step": 20157 + }, + { + "epoch": 4.299914675767918, + "grad_norm": 0.2030634396743664, + "learning_rate": 6.220306612086153e-05, + "loss": 0.9157, + "step": 20158 + }, + { + "epoch": 4.300127986348123, + "grad_norm": 0.18131982184989004, + "learning_rate": 6.219476969863169e-05, + "loss": 0.9219, + "step": 20159 + }, + { + "epoch": 4.300341296928328, + "grad_norm": 0.16215746894192956, + "learning_rate": 6.218647347786835e-05, + "loss": 0.906, + "step": 20160 + }, + { + "epoch": 4.300554607508532, + "grad_norm": 0.18028886072114117, + "learning_rate": 6.217817745866534e-05, + "loss": 0.8945, + "step": 20161 + }, + { + "epoch": 4.300767918088737, + "grad_norm": 0.19432151764978006, + "learning_rate": 6.216988164111658e-05, + "loss": 0.9158, + "step": 20162 + }, + { + "epoch": 4.300981228668942, + "grad_norm": 0.18215681661663571, + "learning_rate": 6.216158602531588e-05, + "loss": 0.8949, + "step": 20163 + }, + { + "epoch": 4.301194539249146, + "grad_norm": 0.17962800632143378, + "learning_rate": 6.215329061135716e-05, + "loss": 0.8874, + "step": 20164 + }, + { + "epoch": 4.301407849829351, + "grad_norm": 0.20878795992243485, + "learning_rate": 6.214499539933424e-05, + "loss": 0.8895, + "step": 20165 + }, + { + "epoch": 4.301621160409557, + "grad_norm": 0.2211416859609287, + "learning_rate": 6.213670038934101e-05, + "loss": 0.8794, + "step": 20166 + }, + { + "epoch": 4.301834470989761, + "grad_norm": 0.1994803514107806, + "learning_rate": 6.21284055814713e-05, + "loss": 0.8794, + "step": 20167 + }, + { + "epoch": 4.302047781569966, + "grad_norm": 0.15286599210426227, + "learning_rate": 6.2120110975819e-05, + "loss": 0.9108, + "step": 20168 + }, + { + "epoch": 4.302261092150171, + "grad_norm": 0.17116585490170633, + "learning_rate": 6.211181657247796e-05, + "loss": 0.9287, + "step": 20169 + }, + { + "epoch": 4.302474402730375, + "grad_norm": 0.1797050737092659, + "learning_rate": 6.2103522371542e-05, + "loss": 0.9032, + "step": 20170 + }, + { + "epoch": 4.30268771331058, + "grad_norm": 0.15895764743093016, + "learning_rate": 6.209522837310497e-05, + "loss": 0.8984, + "step": 20171 + }, + { + "epoch": 4.302901023890785, + "grad_norm": 0.16406745867248104, + "learning_rate": 6.208693457726074e-05, + "loss": 0.8978, + "step": 20172 + }, + { + "epoch": 4.3031143344709895, + "grad_norm": 0.20024583664236126, + "learning_rate": 6.207864098410315e-05, + "loss": 0.9209, + "step": 20173 + }, + { + "epoch": 4.303327645051194, + "grad_norm": 0.15303100161824465, + "learning_rate": 6.207034759372604e-05, + "loss": 0.8741, + "step": 20174 + }, + { + "epoch": 4.3035409556314, + "grad_norm": 0.1891987947739781, + "learning_rate": 6.206205440622324e-05, + "loss": 0.9353, + "step": 20175 + }, + { + "epoch": 4.303754266211604, + "grad_norm": 0.19450681211109144, + "learning_rate": 6.20537614216886e-05, + "loss": 0.9031, + "step": 20176 + }, + { + "epoch": 4.303967576791809, + "grad_norm": 0.15380941023304173, + "learning_rate": 6.204546864021594e-05, + "loss": 0.9323, + "step": 20177 + }, + { + "epoch": 4.304180887372014, + "grad_norm": 0.1895502664859565, + "learning_rate": 6.203717606189913e-05, + "loss": 0.8881, + "step": 20178 + }, + { + "epoch": 4.3043941979522184, + "grad_norm": 0.22291326106687462, + "learning_rate": 6.202888368683194e-05, + "loss": 0.8817, + "step": 20179 + }, + { + "epoch": 4.304607508532423, + "grad_norm": 0.17771178944229196, + "learning_rate": 6.202059151510826e-05, + "loss": 0.8893, + "step": 20180 + }, + { + "epoch": 4.304820819112628, + "grad_norm": 0.1746937159779041, + "learning_rate": 6.201229954682192e-05, + "loss": 0.9067, + "step": 20181 + }, + { + "epoch": 4.3050341296928325, + "grad_norm": 0.18819350872572382, + "learning_rate": 6.200400778206668e-05, + "loss": 0.9013, + "step": 20182 + }, + { + "epoch": 4.305247440273037, + "grad_norm": 0.1680745293615089, + "learning_rate": 6.199571622093638e-05, + "loss": 0.8855, + "step": 20183 + }, + { + "epoch": 4.305460750853243, + "grad_norm": 0.17346026985168855, + "learning_rate": 6.198742486352488e-05, + "loss": 0.8728, + "step": 20184 + }, + { + "epoch": 4.305674061433447, + "grad_norm": 0.17493847956807249, + "learning_rate": 6.197913370992594e-05, + "loss": 0.9157, + "step": 20185 + }, + { + "epoch": 4.305887372013652, + "grad_norm": 0.20800091127546647, + "learning_rate": 6.197084276023343e-05, + "loss": 0.8859, + "step": 20186 + }, + { + "epoch": 4.306100682593857, + "grad_norm": 0.2581179004414566, + "learning_rate": 6.196255201454112e-05, + "loss": 0.9124, + "step": 20187 + }, + { + "epoch": 4.3063139931740615, + "grad_norm": 0.1777377211508226, + "learning_rate": 6.195426147294285e-05, + "loss": 0.9232, + "step": 20188 + }, + { + "epoch": 4.306527303754266, + "grad_norm": 0.19481677070785083, + "learning_rate": 6.19459711355324e-05, + "loss": 0.8911, + "step": 20189 + }, + { + "epoch": 4.306740614334471, + "grad_norm": 0.16411500458959163, + "learning_rate": 6.193768100240361e-05, + "loss": 0.8852, + "step": 20190 + }, + { + "epoch": 4.3069539249146755, + "grad_norm": 0.19897100873784518, + "learning_rate": 6.192939107365025e-05, + "loss": 0.8835, + "step": 20191 + }, + { + "epoch": 4.30716723549488, + "grad_norm": 0.15656011383378923, + "learning_rate": 6.192110134936619e-05, + "loss": 0.8965, + "step": 20192 + }, + { + "epoch": 4.307380546075085, + "grad_norm": 0.2251063852325308, + "learning_rate": 6.19128118296451e-05, + "loss": 0.8892, + "step": 20193 + }, + { + "epoch": 4.3075938566552905, + "grad_norm": 0.17554612716604273, + "learning_rate": 6.190452251458087e-05, + "loss": 0.9014, + "step": 20194 + }, + { + "epoch": 4.307807167235495, + "grad_norm": 0.19269893477275618, + "learning_rate": 6.189623340426726e-05, + "loss": 0.913, + "step": 20195 + }, + { + "epoch": 4.3080204778157, + "grad_norm": 0.1759683892885481, + "learning_rate": 6.18879444987981e-05, + "loss": 0.8845, + "step": 20196 + }, + { + "epoch": 4.3082337883959045, + "grad_norm": 0.17313249173692213, + "learning_rate": 6.187965579826714e-05, + "loss": 0.8742, + "step": 20197 + }, + { + "epoch": 4.308447098976109, + "grad_norm": 0.17534594619032381, + "learning_rate": 6.187136730276817e-05, + "loss": 0.9034, + "step": 20198 + }, + { + "epoch": 4.308660409556314, + "grad_norm": 0.1855365643992431, + "learning_rate": 6.186307901239499e-05, + "loss": 0.881, + "step": 20199 + }, + { + "epoch": 4.308873720136519, + "grad_norm": 0.15473811880806518, + "learning_rate": 6.185479092724138e-05, + "loss": 0.8907, + "step": 20200 + }, + { + "epoch": 4.309087030716723, + "grad_norm": 0.18263591856772005, + "learning_rate": 6.18465030474011e-05, + "loss": 0.8874, + "step": 20201 + }, + { + "epoch": 4.309300341296928, + "grad_norm": 0.19421795548738047, + "learning_rate": 6.183821537296797e-05, + "loss": 0.9126, + "step": 20202 + }, + { + "epoch": 4.309513651877133, + "grad_norm": 0.23141670514153018, + "learning_rate": 6.182992790403575e-05, + "loss": 0.899, + "step": 20203 + }, + { + "epoch": 4.309726962457338, + "grad_norm": 0.17888218402239936, + "learning_rate": 6.182164064069818e-05, + "loss": 0.8914, + "step": 20204 + }, + { + "epoch": 4.309940273037543, + "grad_norm": 0.1641200383624321, + "learning_rate": 6.181335358304903e-05, + "loss": 0.871, + "step": 20205 + }, + { + "epoch": 4.310153583617748, + "grad_norm": 0.19529133657164324, + "learning_rate": 6.180506673118213e-05, + "loss": 0.9315, + "step": 20206 + }, + { + "epoch": 4.310366894197952, + "grad_norm": 0.22992630288671664, + "learning_rate": 6.179678008519116e-05, + "loss": 0.9066, + "step": 20207 + }, + { + "epoch": 4.310580204778157, + "grad_norm": 0.19067072998711293, + "learning_rate": 6.178849364516997e-05, + "loss": 0.8712, + "step": 20208 + }, + { + "epoch": 4.310793515358362, + "grad_norm": 0.17660415688093586, + "learning_rate": 6.178020741121225e-05, + "loss": 0.8947, + "step": 20209 + }, + { + "epoch": 4.311006825938566, + "grad_norm": 0.1648896939418932, + "learning_rate": 6.177192138341182e-05, + "loss": 0.8952, + "step": 20210 + }, + { + "epoch": 4.311220136518771, + "grad_norm": 0.19080994861467307, + "learning_rate": 6.176363556186237e-05, + "loss": 0.9308, + "step": 20211 + }, + { + "epoch": 4.311433447098976, + "grad_norm": 0.15861315726687844, + "learning_rate": 6.175534994665772e-05, + "loss": 0.8929, + "step": 20212 + }, + { + "epoch": 4.311646757679181, + "grad_norm": 0.17783094547471218, + "learning_rate": 6.174706453789157e-05, + "loss": 0.9252, + "step": 20213 + }, + { + "epoch": 4.311860068259386, + "grad_norm": 0.2152721531923236, + "learning_rate": 6.173877933565776e-05, + "loss": 0.8854, + "step": 20214 + }, + { + "epoch": 4.312073378839591, + "grad_norm": 0.23685558138791177, + "learning_rate": 6.17304943400499e-05, + "loss": 0.9188, + "step": 20215 + }, + { + "epoch": 4.312286689419795, + "grad_norm": 0.1664321320416988, + "learning_rate": 6.172220955116181e-05, + "loss": 0.8927, + "step": 20216 + }, + { + "epoch": 4.3125, + "grad_norm": 0.20416090012970967, + "learning_rate": 6.171392496908724e-05, + "loss": 0.9049, + "step": 20217 + }, + { + "epoch": 4.312713310580205, + "grad_norm": 0.27535777862771693, + "learning_rate": 6.170564059391992e-05, + "loss": 0.9139, + "step": 20218 + }, + { + "epoch": 4.312926621160409, + "grad_norm": 0.17378116237078273, + "learning_rate": 6.169735642575357e-05, + "loss": 0.8889, + "step": 20219 + }, + { + "epoch": 4.313139931740614, + "grad_norm": 0.15499734288741263, + "learning_rate": 6.168907246468194e-05, + "loss": 0.888, + "step": 20220 + }, + { + "epoch": 4.313353242320819, + "grad_norm": 0.1615335561997503, + "learning_rate": 6.168078871079876e-05, + "loss": 0.899, + "step": 20221 + }, + { + "epoch": 4.313566552901024, + "grad_norm": 0.175943914897803, + "learning_rate": 6.167250516419777e-05, + "loss": 0.8959, + "step": 20222 + }, + { + "epoch": 4.313779863481229, + "grad_norm": 0.18988121904789998, + "learning_rate": 6.16642218249727e-05, + "loss": 0.9023, + "step": 20223 + }, + { + "epoch": 4.313993174061434, + "grad_norm": 0.17208227346846966, + "learning_rate": 6.165593869321725e-05, + "loss": 0.9227, + "step": 20224 + }, + { + "epoch": 4.314206484641638, + "grad_norm": 0.1928514821099501, + "learning_rate": 6.164765576902521e-05, + "loss": 0.9381, + "step": 20225 + }, + { + "epoch": 4.314419795221843, + "grad_norm": 0.15374124961502791, + "learning_rate": 6.163937305249022e-05, + "loss": 0.8673, + "step": 20226 + }, + { + "epoch": 4.314633105802048, + "grad_norm": 0.21120276859663856, + "learning_rate": 6.163109054370603e-05, + "loss": 0.9127, + "step": 20227 + }, + { + "epoch": 4.314846416382252, + "grad_norm": 0.1656490256415965, + "learning_rate": 6.162280824276636e-05, + "loss": 0.8927, + "step": 20228 + }, + { + "epoch": 4.315059726962457, + "grad_norm": 0.17128009085832552, + "learning_rate": 6.161452614976492e-05, + "loss": 0.8867, + "step": 20229 + }, + { + "epoch": 4.315273037542662, + "grad_norm": 0.18309409358480805, + "learning_rate": 6.160624426479544e-05, + "loss": 0.8992, + "step": 20230 + }, + { + "epoch": 4.315486348122867, + "grad_norm": 0.1530841692711955, + "learning_rate": 6.15979625879516e-05, + "loss": 0.8753, + "step": 20231 + }, + { + "epoch": 4.315699658703072, + "grad_norm": 0.16796300590481075, + "learning_rate": 6.158968111932713e-05, + "loss": 0.942, + "step": 20232 + }, + { + "epoch": 4.315912969283277, + "grad_norm": 0.16019897326975197, + "learning_rate": 6.158139985901571e-05, + "loss": 0.8783, + "step": 20233 + }, + { + "epoch": 4.316126279863481, + "grad_norm": 0.19541468497847747, + "learning_rate": 6.15731188071111e-05, + "loss": 0.9065, + "step": 20234 + }, + { + "epoch": 4.316339590443686, + "grad_norm": 0.1798861089202676, + "learning_rate": 6.156483796370691e-05, + "loss": 0.9108, + "step": 20235 + }, + { + "epoch": 4.316552901023891, + "grad_norm": 0.19068906814366737, + "learning_rate": 6.155655732889693e-05, + "loss": 0.8931, + "step": 20236 + }, + { + "epoch": 4.3167662116040955, + "grad_norm": 0.22499256706699594, + "learning_rate": 6.154827690277483e-05, + "loss": 0.8956, + "step": 20237 + }, + { + "epoch": 4.3169795221843, + "grad_norm": 0.5712007735390947, + "learning_rate": 6.153999668543427e-05, + "loss": 0.9219, + "step": 20238 + }, + { + "epoch": 4.317192832764505, + "grad_norm": 0.19446661592617692, + "learning_rate": 6.153171667696893e-05, + "loss": 0.8761, + "step": 20239 + }, + { + "epoch": 4.3174061433447095, + "grad_norm": 0.20502526385883674, + "learning_rate": 6.152343687747254e-05, + "loss": 0.9006, + "step": 20240 + }, + { + "epoch": 4.317619453924915, + "grad_norm": 0.18647952152566014, + "learning_rate": 6.151515728703876e-05, + "loss": 0.9135, + "step": 20241 + }, + { + "epoch": 4.31783276450512, + "grad_norm": 0.1792378933018431, + "learning_rate": 6.150687790576132e-05, + "loss": 0.8978, + "step": 20242 + }, + { + "epoch": 4.3180460750853245, + "grad_norm": 0.18827197799437442, + "learning_rate": 6.149859873373383e-05, + "loss": 0.9099, + "step": 20243 + }, + { + "epoch": 4.318259385665529, + "grad_norm": 0.19362359383456237, + "learning_rate": 6.149031977105003e-05, + "loss": 0.9012, + "step": 20244 + }, + { + "epoch": 4.318472696245734, + "grad_norm": 0.1780227618670038, + "learning_rate": 6.148204101780357e-05, + "loss": 0.9198, + "step": 20245 + }, + { + "epoch": 4.3186860068259385, + "grad_norm": 0.1514529668573112, + "learning_rate": 6.147376247408813e-05, + "loss": 0.8823, + "step": 20246 + }, + { + "epoch": 4.318899317406143, + "grad_norm": 0.16487138824856742, + "learning_rate": 6.146548413999736e-05, + "loss": 0.9008, + "step": 20247 + }, + { + "epoch": 4.319112627986348, + "grad_norm": 0.15966534464364926, + "learning_rate": 6.145720601562499e-05, + "loss": 0.9054, + "step": 20248 + }, + { + "epoch": 4.319325938566553, + "grad_norm": 0.15788687077025193, + "learning_rate": 6.144892810106461e-05, + "loss": 0.9199, + "step": 20249 + }, + { + "epoch": 4.319539249146757, + "grad_norm": 0.1898975482110807, + "learning_rate": 6.144065039640994e-05, + "loss": 0.8893, + "step": 20250 + }, + { + "epoch": 4.319752559726963, + "grad_norm": 0.17166072808793767, + "learning_rate": 6.14323729017546e-05, + "loss": 0.9299, + "step": 20251 + }, + { + "epoch": 4.3199658703071675, + "grad_norm": 0.1694192495233523, + "learning_rate": 6.14240956171923e-05, + "loss": 0.9, + "step": 20252 + }, + { + "epoch": 4.320179180887372, + "grad_norm": 0.21434219650357605, + "learning_rate": 6.141581854281665e-05, + "loss": 0.922, + "step": 20253 + }, + { + "epoch": 4.320392491467577, + "grad_norm": 0.2648781405134989, + "learning_rate": 6.140754167872134e-05, + "loss": 0.9047, + "step": 20254 + }, + { + "epoch": 4.3206058020477816, + "grad_norm": 0.18586671387788378, + "learning_rate": 6.139926502499999e-05, + "loss": 0.9159, + "step": 20255 + }, + { + "epoch": 4.320819112627986, + "grad_norm": 0.16197871259506982, + "learning_rate": 6.139098858174628e-05, + "loss": 0.9268, + "step": 20256 + }, + { + "epoch": 4.321032423208191, + "grad_norm": 0.19711355122154575, + "learning_rate": 6.138271234905384e-05, + "loss": 0.8881, + "step": 20257 + }, + { + "epoch": 4.321245733788396, + "grad_norm": 0.1871302283713989, + "learning_rate": 6.137443632701635e-05, + "loss": 0.8965, + "step": 20258 + }, + { + "epoch": 4.3214590443686, + "grad_norm": 0.17153288839977557, + "learning_rate": 6.136616051572743e-05, + "loss": 0.9065, + "step": 20259 + }, + { + "epoch": 4.321672354948806, + "grad_norm": 0.14833623107323793, + "learning_rate": 6.135788491528068e-05, + "loss": 0.9215, + "step": 20260 + }, + { + "epoch": 4.3218856655290105, + "grad_norm": 0.15735404666498612, + "learning_rate": 6.13496095257698e-05, + "loss": 0.9052, + "step": 20261 + }, + { + "epoch": 4.322098976109215, + "grad_norm": 0.19400361979064756, + "learning_rate": 6.134133434728839e-05, + "loss": 0.9039, + "step": 20262 + }, + { + "epoch": 4.32231228668942, + "grad_norm": 0.14453141720532997, + "learning_rate": 6.133305937993008e-05, + "loss": 0.9096, + "step": 20263 + }, + { + "epoch": 4.322525597269625, + "grad_norm": 0.1747546389608765, + "learning_rate": 6.132478462378855e-05, + "loss": 0.9023, + "step": 20264 + }, + { + "epoch": 4.322738907849829, + "grad_norm": 0.1883614748970747, + "learning_rate": 6.131651007895736e-05, + "loss": 0.8884, + "step": 20265 + }, + { + "epoch": 4.322952218430034, + "grad_norm": 0.14428977988040068, + "learning_rate": 6.13082357455302e-05, + "loss": 0.8884, + "step": 20266 + }, + { + "epoch": 4.323165529010239, + "grad_norm": 0.16376597821741123, + "learning_rate": 6.129996162360065e-05, + "loss": 0.9199, + "step": 20267 + }, + { + "epoch": 4.323378839590443, + "grad_norm": 0.15840836411614462, + "learning_rate": 6.129168771326237e-05, + "loss": 0.8677, + "step": 20268 + }, + { + "epoch": 4.323592150170649, + "grad_norm": 0.16908690472780116, + "learning_rate": 6.128341401460893e-05, + "loss": 0.8971, + "step": 20269 + }, + { + "epoch": 4.323805460750854, + "grad_norm": 0.18204083126782844, + "learning_rate": 6.127514052773402e-05, + "loss": 0.8579, + "step": 20270 + }, + { + "epoch": 4.324018771331058, + "grad_norm": 0.16857600214503057, + "learning_rate": 6.126686725273118e-05, + "loss": 0.9215, + "step": 20271 + }, + { + "epoch": 4.324232081911263, + "grad_norm": 0.15715118898931366, + "learning_rate": 6.125859418969405e-05, + "loss": 0.9015, + "step": 20272 + }, + { + "epoch": 4.324445392491468, + "grad_norm": 0.15053410010573065, + "learning_rate": 6.125032133871625e-05, + "loss": 0.9208, + "step": 20273 + }, + { + "epoch": 4.324658703071672, + "grad_norm": 0.14585235741293207, + "learning_rate": 6.124204869989138e-05, + "loss": 0.9185, + "step": 20274 + }, + { + "epoch": 4.324872013651877, + "grad_norm": 0.15069798971807777, + "learning_rate": 6.123377627331303e-05, + "loss": 0.9164, + "step": 20275 + }, + { + "epoch": 4.325085324232082, + "grad_norm": 0.1701052087460525, + "learning_rate": 6.122550405907483e-05, + "loss": 0.8559, + "step": 20276 + }, + { + "epoch": 4.325298634812286, + "grad_norm": 0.16704335204073198, + "learning_rate": 6.121723205727037e-05, + "loss": 0.8918, + "step": 20277 + }, + { + "epoch": 4.325511945392491, + "grad_norm": 0.15957832402749958, + "learning_rate": 6.120896026799323e-05, + "loss": 0.9093, + "step": 20278 + }, + { + "epoch": 4.325725255972697, + "grad_norm": 0.18570833976122297, + "learning_rate": 6.120068869133703e-05, + "loss": 0.9224, + "step": 20279 + }, + { + "epoch": 4.325938566552901, + "grad_norm": 0.15737245212914977, + "learning_rate": 6.119241732739536e-05, + "loss": 0.8942, + "step": 20280 + }, + { + "epoch": 4.326151877133106, + "grad_norm": 0.18167758488692165, + "learning_rate": 6.118414617626181e-05, + "loss": 0.9089, + "step": 20281 + }, + { + "epoch": 4.326365187713311, + "grad_norm": 0.1496187894050043, + "learning_rate": 6.117587523802994e-05, + "loss": 0.8801, + "step": 20282 + }, + { + "epoch": 4.326578498293515, + "grad_norm": 0.163412281481679, + "learning_rate": 6.116760451279338e-05, + "loss": 0.9165, + "step": 20283 + }, + { + "epoch": 4.32679180887372, + "grad_norm": 0.19300731176146563, + "learning_rate": 6.115933400064568e-05, + "loss": 0.8906, + "step": 20284 + }, + { + "epoch": 4.327005119453925, + "grad_norm": 0.19417088152071, + "learning_rate": 6.115106370168043e-05, + "loss": 0.8956, + "step": 20285 + }, + { + "epoch": 4.327218430034129, + "grad_norm": 0.18845295971814768, + "learning_rate": 6.114279361599122e-05, + "loss": 0.9233, + "step": 20286 + }, + { + "epoch": 4.327431740614334, + "grad_norm": 0.18486732371541753, + "learning_rate": 6.11345237436716e-05, + "loss": 0.907, + "step": 20287 + }, + { + "epoch": 4.327645051194539, + "grad_norm": 0.21373653887226893, + "learning_rate": 6.112625408481518e-05, + "loss": 0.9188, + "step": 20288 + }, + { + "epoch": 4.327858361774744, + "grad_norm": 0.20956338529116655, + "learning_rate": 6.11179846395155e-05, + "loss": 0.8413, + "step": 20289 + }, + { + "epoch": 4.328071672354949, + "grad_norm": 0.16196311007633027, + "learning_rate": 6.110971540786615e-05, + "loss": 0.8821, + "step": 20290 + }, + { + "epoch": 4.328284982935154, + "grad_norm": 0.16500085049446134, + "learning_rate": 6.110144638996068e-05, + "loss": 0.9188, + "step": 20291 + }, + { + "epoch": 4.328498293515358, + "grad_norm": 0.19082954086906834, + "learning_rate": 6.109317758589268e-05, + "loss": 0.9164, + "step": 20292 + }, + { + "epoch": 4.328711604095563, + "grad_norm": 0.23828124343389387, + "learning_rate": 6.10849089957557e-05, + "loss": 0.899, + "step": 20293 + }, + { + "epoch": 4.328924914675768, + "grad_norm": 0.19420146843582753, + "learning_rate": 6.107664061964327e-05, + "loss": 0.8917, + "step": 20294 + }, + { + "epoch": 4.3291382252559725, + "grad_norm": 0.17412541146244576, + "learning_rate": 6.106837245764897e-05, + "loss": 0.8977, + "step": 20295 + }, + { + "epoch": 4.329351535836177, + "grad_norm": 0.17829330168484922, + "learning_rate": 6.106010450986635e-05, + "loss": 0.8922, + "step": 20296 + }, + { + "epoch": 4.329564846416382, + "grad_norm": 0.2252832062967727, + "learning_rate": 6.105183677638898e-05, + "loss": 0.8905, + "step": 20297 + }, + { + "epoch": 4.329778156996587, + "grad_norm": 0.2505446689887191, + "learning_rate": 6.104356925731039e-05, + "loss": 0.9173, + "step": 20298 + }, + { + "epoch": 4.329991467576792, + "grad_norm": 0.16356567837207087, + "learning_rate": 6.103530195272412e-05, + "loss": 0.9065, + "step": 20299 + }, + { + "epoch": 4.330204778156997, + "grad_norm": 0.18246469552324274, + "learning_rate": 6.102703486272375e-05, + "loss": 0.9062, + "step": 20300 + }, + { + "epoch": 4.3304180887372015, + "grad_norm": 0.19624659317577292, + "learning_rate": 6.1018767987402775e-05, + "loss": 0.9388, + "step": 20301 + }, + { + "epoch": 4.330631399317406, + "grad_norm": 0.15296873706458325, + "learning_rate": 6.101050132685478e-05, + "loss": 0.8999, + "step": 20302 + }, + { + "epoch": 4.330844709897611, + "grad_norm": 0.17603381199771256, + "learning_rate": 6.1002234881173264e-05, + "loss": 0.8998, + "step": 20303 + }, + { + "epoch": 4.3310580204778155, + "grad_norm": 0.19476937687066764, + "learning_rate": 6.099396865045181e-05, + "loss": 0.8789, + "step": 20304 + }, + { + "epoch": 4.33127133105802, + "grad_norm": 0.16596218125407736, + "learning_rate": 6.098570263478392e-05, + "loss": 0.9129, + "step": 20305 + }, + { + "epoch": 4.331484641638225, + "grad_norm": 0.1677537386288283, + "learning_rate": 6.0977436834263094e-05, + "loss": 0.9076, + "step": 20306 + }, + { + "epoch": 4.3316979522184305, + "grad_norm": 0.17810540759496354, + "learning_rate": 6.096917124898291e-05, + "loss": 0.8865, + "step": 20307 + }, + { + "epoch": 4.331911262798635, + "grad_norm": 0.18888773882380216, + "learning_rate": 6.0960905879036855e-05, + "loss": 0.8869, + "step": 20308 + }, + { + "epoch": 4.33212457337884, + "grad_norm": 0.1840391137334139, + "learning_rate": 6.095264072451848e-05, + "loss": 0.8942, + "step": 20309 + }, + { + "epoch": 4.3323378839590445, + "grad_norm": 0.1363125753875405, + "learning_rate": 6.09443757855213e-05, + "loss": 0.9092, + "step": 20310 + }, + { + "epoch": 4.332551194539249, + "grad_norm": 0.1979908407901838, + "learning_rate": 6.0936111062138816e-05, + "loss": 0.9271, + "step": 20311 + }, + { + "epoch": 4.332764505119454, + "grad_norm": 0.24271209447783368, + "learning_rate": 6.092784655446458e-05, + "loss": 0.9075, + "step": 20312 + }, + { + "epoch": 4.332977815699659, + "grad_norm": 0.21148557933814277, + "learning_rate": 6.0919582262592055e-05, + "loss": 0.893, + "step": 20313 + }, + { + "epoch": 4.333191126279863, + "grad_norm": 0.16091811762357733, + "learning_rate": 6.091131818661479e-05, + "loss": 0.9117, + "step": 20314 + }, + { + "epoch": 4.333404436860068, + "grad_norm": 0.20086965091437214, + "learning_rate": 6.09030543266263e-05, + "loss": 0.908, + "step": 20315 + }, + { + "epoch": 4.3336177474402735, + "grad_norm": 0.2284379830190712, + "learning_rate": 6.0894790682720046e-05, + "loss": 0.8927, + "step": 20316 + }, + { + "epoch": 4.333831058020478, + "grad_norm": 0.15570608546779088, + "learning_rate": 6.088652725498957e-05, + "loss": 0.8946, + "step": 20317 + }, + { + "epoch": 4.334044368600683, + "grad_norm": 0.18362267294324858, + "learning_rate": 6.087826404352834e-05, + "loss": 0.9458, + "step": 20318 + }, + { + "epoch": 4.334257679180888, + "grad_norm": 0.20518292386311873, + "learning_rate": 6.087000104842989e-05, + "loss": 0.8957, + "step": 20319 + }, + { + "epoch": 4.334470989761092, + "grad_norm": 0.16941681075226217, + "learning_rate": 6.08617382697877e-05, + "loss": 0.9291, + "step": 20320 + }, + { + "epoch": 4.334684300341297, + "grad_norm": 0.18771790662976445, + "learning_rate": 6.085347570769525e-05, + "loss": 0.8722, + "step": 20321 + }, + { + "epoch": 4.334897610921502, + "grad_norm": 0.14888135136951913, + "learning_rate": 6.0845213362246054e-05, + "loss": 0.895, + "step": 20322 + }, + { + "epoch": 4.335110921501706, + "grad_norm": 0.16690341938471281, + "learning_rate": 6.083695123353359e-05, + "loss": 0.9095, + "step": 20323 + }, + { + "epoch": 4.335324232081911, + "grad_norm": 0.16436481963521382, + "learning_rate": 6.082868932165134e-05, + "loss": 0.8967, + "step": 20324 + }, + { + "epoch": 4.335537542662116, + "grad_norm": 0.16921453142797804, + "learning_rate": 6.082042762669279e-05, + "loss": 0.9069, + "step": 20325 + }, + { + "epoch": 4.335750853242321, + "grad_norm": 0.19275222914494786, + "learning_rate": 6.0812166148751454e-05, + "loss": 0.8973, + "step": 20326 + }, + { + "epoch": 4.335964163822526, + "grad_norm": 0.15143815366369828, + "learning_rate": 6.080390488792077e-05, + "loss": 0.9148, + "step": 20327 + }, + { + "epoch": 4.336177474402731, + "grad_norm": 0.19306443326766934, + "learning_rate": 6.07956438442942e-05, + "loss": 0.8978, + "step": 20328 + }, + { + "epoch": 4.336390784982935, + "grad_norm": 0.19298315087044937, + "learning_rate": 6.0787383017965264e-05, + "loss": 0.9209, + "step": 20329 + }, + { + "epoch": 4.33660409556314, + "grad_norm": 0.20766313637952216, + "learning_rate": 6.0779122409027395e-05, + "loss": 0.9035, + "step": 20330 + }, + { + "epoch": 4.336817406143345, + "grad_norm": 0.15769672120602188, + "learning_rate": 6.077086201757409e-05, + "loss": 0.8982, + "step": 20331 + }, + { + "epoch": 4.337030716723549, + "grad_norm": 0.1766347945193819, + "learning_rate": 6.076260184369881e-05, + "loss": 0.8855, + "step": 20332 + }, + { + "epoch": 4.337244027303754, + "grad_norm": 0.1751722120625148, + "learning_rate": 6.075434188749501e-05, + "loss": 0.8854, + "step": 20333 + }, + { + "epoch": 4.337457337883959, + "grad_norm": 0.1580444698032015, + "learning_rate": 6.074608214905615e-05, + "loss": 0.8977, + "step": 20334 + }, + { + "epoch": 4.337670648464163, + "grad_norm": 0.18187019833285054, + "learning_rate": 6.0737822628475685e-05, + "loss": 0.883, + "step": 20335 + }, + { + "epoch": 4.337883959044369, + "grad_norm": 0.1858108488298675, + "learning_rate": 6.07295633258471e-05, + "loss": 0.9129, + "step": 20336 + }, + { + "epoch": 4.338097269624574, + "grad_norm": 0.1984634721345133, + "learning_rate": 6.0721304241263845e-05, + "loss": 0.8839, + "step": 20337 + }, + { + "epoch": 4.338310580204778, + "grad_norm": 0.2352857924028113, + "learning_rate": 6.0713045374819326e-05, + "loss": 0.9191, + "step": 20338 + }, + { + "epoch": 4.338523890784983, + "grad_norm": 0.1497800159658386, + "learning_rate": 6.0704786726607045e-05, + "loss": 0.9337, + "step": 20339 + }, + { + "epoch": 4.338737201365188, + "grad_norm": 0.1984242858599518, + "learning_rate": 6.06965282967204e-05, + "loss": 0.8994, + "step": 20340 + }, + { + "epoch": 4.338950511945392, + "grad_norm": 0.202021730223961, + "learning_rate": 6.068827008525288e-05, + "loss": 0.8914, + "step": 20341 + }, + { + "epoch": 4.339163822525597, + "grad_norm": 0.23728795870616176, + "learning_rate": 6.06800120922979e-05, + "loss": 0.8819, + "step": 20342 + }, + { + "epoch": 4.339377133105802, + "grad_norm": 0.1730382532102203, + "learning_rate": 6.067175431794892e-05, + "loss": 0.9094, + "step": 20343 + }, + { + "epoch": 4.339590443686006, + "grad_norm": 0.183158777182005, + "learning_rate": 6.0663496762299365e-05, + "loss": 0.8781, + "step": 20344 + }, + { + "epoch": 4.339803754266212, + "grad_norm": 0.17337446394319678, + "learning_rate": 6.065523942544264e-05, + "loss": 0.8704, + "step": 20345 + }, + { + "epoch": 4.340017064846417, + "grad_norm": 0.1961447787177089, + "learning_rate": 6.064698230747224e-05, + "loss": 0.9103, + "step": 20346 + }, + { + "epoch": 4.340230375426621, + "grad_norm": 0.16737199192507154, + "learning_rate": 6.0638725408481545e-05, + "loss": 0.9237, + "step": 20347 + }, + { + "epoch": 4.340443686006826, + "grad_norm": 0.14892241315103139, + "learning_rate": 6.063046872856402e-05, + "loss": 0.8998, + "step": 20348 + }, + { + "epoch": 4.340656996587031, + "grad_norm": 0.1828700292608507, + "learning_rate": 6.062221226781305e-05, + "loss": 0.8667, + "step": 20349 + }, + { + "epoch": 4.340870307167235, + "grad_norm": 0.20380419889463866, + "learning_rate": 6.061395602632207e-05, + "loss": 0.9, + "step": 20350 + }, + { + "epoch": 4.34108361774744, + "grad_norm": 0.18360429539713682, + "learning_rate": 6.060570000418451e-05, + "loss": 0.9189, + "step": 20351 + }, + { + "epoch": 4.341296928327645, + "grad_norm": 0.16266116739157974, + "learning_rate": 6.0597444201493774e-05, + "loss": 0.8771, + "step": 20352 + }, + { + "epoch": 4.3415102389078495, + "grad_norm": 0.14037252367533062, + "learning_rate": 6.0589188618343294e-05, + "loss": 0.8828, + "step": 20353 + }, + { + "epoch": 4.341723549488055, + "grad_norm": 0.17618331628766604, + "learning_rate": 6.0580933254826445e-05, + "loss": 0.8988, + "step": 20354 + }, + { + "epoch": 4.34193686006826, + "grad_norm": 0.15541439429276693, + "learning_rate": 6.0572678111036685e-05, + "loss": 0.9219, + "step": 20355 + }, + { + "epoch": 4.342150170648464, + "grad_norm": 0.15761762032239174, + "learning_rate": 6.0564423187067405e-05, + "loss": 0.9165, + "step": 20356 + }, + { + "epoch": 4.342363481228669, + "grad_norm": 0.1528066040619342, + "learning_rate": 6.055616848301198e-05, + "loss": 0.8977, + "step": 20357 + }, + { + "epoch": 4.342576791808874, + "grad_norm": 0.1535950363099324, + "learning_rate": 6.0547913998963864e-05, + "loss": 0.9144, + "step": 20358 + }, + { + "epoch": 4.3427901023890785, + "grad_norm": 0.1663573890439794, + "learning_rate": 6.0539659735016405e-05, + "loss": 0.8873, + "step": 20359 + }, + { + "epoch": 4.343003412969283, + "grad_norm": 0.1678912036631377, + "learning_rate": 6.053140569126305e-05, + "loss": 0.9258, + "step": 20360 + }, + { + "epoch": 4.343216723549488, + "grad_norm": 0.1710787909418228, + "learning_rate": 6.0523151867797156e-05, + "loss": 0.8924, + "step": 20361 + }, + { + "epoch": 4.3434300341296925, + "grad_norm": 0.173828427071023, + "learning_rate": 6.0514898264712114e-05, + "loss": 0.9162, + "step": 20362 + }, + { + "epoch": 4.343643344709897, + "grad_norm": 0.2006573680469815, + "learning_rate": 6.050664488210133e-05, + "loss": 0.8739, + "step": 20363 + }, + { + "epoch": 4.343856655290103, + "grad_norm": 0.19767242722611314, + "learning_rate": 6.049839172005819e-05, + "loss": 0.9084, + "step": 20364 + }, + { + "epoch": 4.3440699658703075, + "grad_norm": 0.4081307913307579, + "learning_rate": 6.049013877867607e-05, + "loss": 0.8967, + "step": 20365 + }, + { + "epoch": 4.344283276450512, + "grad_norm": 0.16703703114730784, + "learning_rate": 6.0481886058048357e-05, + "loss": 0.9018, + "step": 20366 + }, + { + "epoch": 4.344496587030717, + "grad_norm": 0.16486608249878776, + "learning_rate": 6.0473633558268445e-05, + "loss": 0.9019, + "step": 20367 + }, + { + "epoch": 4.3447098976109215, + "grad_norm": 0.158260271013061, + "learning_rate": 6.04653812794297e-05, + "loss": 0.8714, + "step": 20368 + }, + { + "epoch": 4.344923208191126, + "grad_norm": 0.1942940790644424, + "learning_rate": 6.0457129221625466e-05, + "loss": 0.8956, + "step": 20369 + }, + { + "epoch": 4.345136518771331, + "grad_norm": 0.15643809643043904, + "learning_rate": 6.044887738494917e-05, + "loss": 0.907, + "step": 20370 + }, + { + "epoch": 4.345349829351536, + "grad_norm": 0.217730794856957, + "learning_rate": 6.044062576949417e-05, + "loss": 0.8819, + "step": 20371 + }, + { + "epoch": 4.34556313993174, + "grad_norm": 0.20590513317835812, + "learning_rate": 6.0432374375353784e-05, + "loss": 0.8827, + "step": 20372 + }, + { + "epoch": 4.345776450511945, + "grad_norm": 0.17634801060500474, + "learning_rate": 6.0424123202621434e-05, + "loss": 0.9047, + "step": 20373 + }, + { + "epoch": 4.3459897610921505, + "grad_norm": 0.18773772397599983, + "learning_rate": 6.0415872251390436e-05, + "loss": 0.9097, + "step": 20374 + }, + { + "epoch": 4.346203071672355, + "grad_norm": 0.21007243461388092, + "learning_rate": 6.040762152175419e-05, + "loss": 0.895, + "step": 20375 + }, + { + "epoch": 4.34641638225256, + "grad_norm": 0.22491102816664726, + "learning_rate": 6.0399371013806024e-05, + "loss": 0.9078, + "step": 20376 + }, + { + "epoch": 4.346629692832765, + "grad_norm": 0.17438781452023777, + "learning_rate": 6.0391120727639316e-05, + "loss": 0.9054, + "step": 20377 + }, + { + "epoch": 4.346843003412969, + "grad_norm": 0.2817428897032607, + "learning_rate": 6.038287066334739e-05, + "loss": 0.9005, + "step": 20378 + }, + { + "epoch": 4.347056313993174, + "grad_norm": 0.27156493833845874, + "learning_rate": 6.0374620821023626e-05, + "loss": 0.9131, + "step": 20379 + }, + { + "epoch": 4.347269624573379, + "grad_norm": 0.2907484254083831, + "learning_rate": 6.036637120076137e-05, + "loss": 0.8838, + "step": 20380 + }, + { + "epoch": 4.347482935153583, + "grad_norm": 0.2165981764153767, + "learning_rate": 6.035812180265392e-05, + "loss": 0.9133, + "step": 20381 + }, + { + "epoch": 4.347696245733788, + "grad_norm": 0.17337980030321312, + "learning_rate": 6.03498726267947e-05, + "loss": 0.9022, + "step": 20382 + }, + { + "epoch": 4.347909556313994, + "grad_norm": 0.22316767124847925, + "learning_rate": 6.034162367327698e-05, + "loss": 0.8997, + "step": 20383 + }, + { + "epoch": 4.348122866894198, + "grad_norm": 0.2788056226174771, + "learning_rate": 6.03333749421941e-05, + "loss": 0.9281, + "step": 20384 + }, + { + "epoch": 4.348336177474403, + "grad_norm": 0.25076779579998226, + "learning_rate": 6.032512643363943e-05, + "loss": 0.9401, + "step": 20385 + }, + { + "epoch": 4.348549488054608, + "grad_norm": 0.18553106463428862, + "learning_rate": 6.031687814770628e-05, + "loss": 0.9434, + "step": 20386 + }, + { + "epoch": 4.348762798634812, + "grad_norm": 0.3274517696321328, + "learning_rate": 6.030863008448799e-05, + "loss": 0.906, + "step": 20387 + }, + { + "epoch": 4.348976109215017, + "grad_norm": 0.23005613048041085, + "learning_rate": 6.0300382244077863e-05, + "loss": 0.9185, + "step": 20388 + }, + { + "epoch": 4.349189419795222, + "grad_norm": 0.25524075463592966, + "learning_rate": 6.029213462656927e-05, + "loss": 0.9192, + "step": 20389 + }, + { + "epoch": 4.349402730375426, + "grad_norm": 0.2279083286200654, + "learning_rate": 6.028388723205548e-05, + "loss": 0.9025, + "step": 20390 + }, + { + "epoch": 4.349616040955631, + "grad_norm": 0.17894835061465109, + "learning_rate": 6.027564006062985e-05, + "loss": 0.9036, + "step": 20391 + }, + { + "epoch": 4.349829351535837, + "grad_norm": 0.2487420403748506, + "learning_rate": 6.02673931123857e-05, + "loss": 0.9078, + "step": 20392 + }, + { + "epoch": 4.350042662116041, + "grad_norm": 0.21503812064060612, + "learning_rate": 6.0259146387416336e-05, + "loss": 0.9124, + "step": 20393 + }, + { + "epoch": 4.350255972696246, + "grad_norm": 0.2143641689579798, + "learning_rate": 6.025089988581503e-05, + "loss": 0.8797, + "step": 20394 + }, + { + "epoch": 4.350469283276451, + "grad_norm": 0.23905999364290975, + "learning_rate": 6.024265360767514e-05, + "loss": 0.9111, + "step": 20395 + }, + { + "epoch": 4.350682593856655, + "grad_norm": 0.20096955612975131, + "learning_rate": 6.0234407553089936e-05, + "loss": 0.9068, + "step": 20396 + }, + { + "epoch": 4.35089590443686, + "grad_norm": 0.18288021233061863, + "learning_rate": 6.022616172215276e-05, + "loss": 0.9028, + "step": 20397 + }, + { + "epoch": 4.351109215017065, + "grad_norm": 0.16526996157566293, + "learning_rate": 6.021791611495689e-05, + "loss": 0.933, + "step": 20398 + }, + { + "epoch": 4.351322525597269, + "grad_norm": 0.22220243164231004, + "learning_rate": 6.020967073159565e-05, + "loss": 0.9104, + "step": 20399 + }, + { + "epoch": 4.351535836177474, + "grad_norm": 0.1976777750115134, + "learning_rate": 6.020142557216229e-05, + "loss": 0.9129, + "step": 20400 + }, + { + "epoch": 4.351749146757679, + "grad_norm": 0.19519309304432125, + "learning_rate": 6.019318063675015e-05, + "loss": 0.9104, + "step": 20401 + }, + { + "epoch": 4.351962457337884, + "grad_norm": 0.19095873854018947, + "learning_rate": 6.0184935925452494e-05, + "loss": 0.8848, + "step": 20402 + }, + { + "epoch": 4.352175767918089, + "grad_norm": 0.1856961163291044, + "learning_rate": 6.017669143836264e-05, + "loss": 0.897, + "step": 20403 + }, + { + "epoch": 4.352389078498294, + "grad_norm": 0.2324076210175416, + "learning_rate": 6.0168447175573863e-05, + "loss": 0.9025, + "step": 20404 + }, + { + "epoch": 4.352602389078498, + "grad_norm": 0.1911718713476114, + "learning_rate": 6.016020313717943e-05, + "loss": 0.8926, + "step": 20405 + }, + { + "epoch": 4.352815699658703, + "grad_norm": 0.16135992018108344, + "learning_rate": 6.0151959323272615e-05, + "loss": 0.9148, + "step": 20406 + }, + { + "epoch": 4.353029010238908, + "grad_norm": 0.19551192293922068, + "learning_rate": 6.014371573394673e-05, + "loss": 0.8958, + "step": 20407 + }, + { + "epoch": 4.353242320819112, + "grad_norm": 0.15492488218807324, + "learning_rate": 6.013547236929501e-05, + "loss": 0.9414, + "step": 20408 + }, + { + "epoch": 4.353455631399317, + "grad_norm": 0.17443188254648706, + "learning_rate": 6.012722922941079e-05, + "loss": 0.9057, + "step": 20409 + }, + { + "epoch": 4.353668941979522, + "grad_norm": 0.1844169239739476, + "learning_rate": 6.011898631438728e-05, + "loss": 0.8983, + "step": 20410 + }, + { + "epoch": 4.3538822525597265, + "grad_norm": 0.15569757379845162, + "learning_rate": 6.011074362431778e-05, + "loss": 0.9156, + "step": 20411 + }, + { + "epoch": 4.354095563139932, + "grad_norm": 0.1761260536783438, + "learning_rate": 6.010250115929554e-05, + "loss": 0.9037, + "step": 20412 + }, + { + "epoch": 4.354308873720137, + "grad_norm": 0.1670404115570161, + "learning_rate": 6.0094258919413854e-05, + "loss": 0.8927, + "step": 20413 + }, + { + "epoch": 4.354522184300341, + "grad_norm": 0.17870222127127552, + "learning_rate": 6.008601690476595e-05, + "loss": 0.9225, + "step": 20414 + }, + { + "epoch": 4.354735494880546, + "grad_norm": 0.2070120723031013, + "learning_rate": 6.007777511544511e-05, + "loss": 0.8855, + "step": 20415 + }, + { + "epoch": 4.354948805460751, + "grad_norm": 0.17067167985380524, + "learning_rate": 6.0069533551544604e-05, + "loss": 0.8868, + "step": 20416 + }, + { + "epoch": 4.3551621160409555, + "grad_norm": 0.18911499004565674, + "learning_rate": 6.006129221315764e-05, + "loss": 0.9124, + "step": 20417 + }, + { + "epoch": 4.35537542662116, + "grad_norm": 0.16916021897400324, + "learning_rate": 6.005305110037747e-05, + "loss": 0.9128, + "step": 20418 + }, + { + "epoch": 4.355588737201365, + "grad_norm": 0.16220172575212047, + "learning_rate": 6.004481021329739e-05, + "loss": 0.9035, + "step": 20419 + }, + { + "epoch": 4.3558020477815695, + "grad_norm": 0.15920215758120518, + "learning_rate": 6.003656955201059e-05, + "loss": 0.9014, + "step": 20420 + }, + { + "epoch": 4.356015358361775, + "grad_norm": 0.14277593261111718, + "learning_rate": 6.002832911661036e-05, + "loss": 0.898, + "step": 20421 + }, + { + "epoch": 4.35622866894198, + "grad_norm": 0.16886857798554622, + "learning_rate": 6.0020088907189903e-05, + "loss": 0.8764, + "step": 20422 + }, + { + "epoch": 4.3564419795221845, + "grad_norm": 0.17777439154657956, + "learning_rate": 6.001184892384249e-05, + "loss": 0.8863, + "step": 20423 + }, + { + "epoch": 4.356655290102389, + "grad_norm": 0.17808960389185932, + "learning_rate": 6.0003609166661336e-05, + "loss": 0.9397, + "step": 20424 + }, + { + "epoch": 4.356868600682594, + "grad_norm": 0.1397992533822976, + "learning_rate": 5.99953696357397e-05, + "loss": 0.8954, + "step": 20425 + }, + { + "epoch": 4.3570819112627985, + "grad_norm": 0.20965289927616448, + "learning_rate": 5.998713033117076e-05, + "loss": 0.8799, + "step": 20426 + }, + { + "epoch": 4.357295221843003, + "grad_norm": 0.17217716762827992, + "learning_rate": 5.9978891253047845e-05, + "loss": 0.911, + "step": 20427 + }, + { + "epoch": 4.357508532423208, + "grad_norm": 0.22120613475022685, + "learning_rate": 5.997065240146405e-05, + "loss": 0.907, + "step": 20428 + }, + { + "epoch": 4.357721843003413, + "grad_norm": 0.20512852946642463, + "learning_rate": 5.9962413776512687e-05, + "loss": 0.906, + "step": 20429 + }, + { + "epoch": 4.357935153583618, + "grad_norm": 0.17926384006300006, + "learning_rate": 5.995417537828692e-05, + "loss": 0.8993, + "step": 20430 + }, + { + "epoch": 4.358148464163823, + "grad_norm": 0.19871014202545573, + "learning_rate": 5.9945937206880024e-05, + "loss": 0.9086, + "step": 20431 + }, + { + "epoch": 4.3583617747440275, + "grad_norm": 0.1545509061479152, + "learning_rate": 5.993769926238516e-05, + "loss": 0.9216, + "step": 20432 + }, + { + "epoch": 4.358575085324232, + "grad_norm": 0.1844503199933128, + "learning_rate": 5.9929461544895586e-05, + "loss": 0.8745, + "step": 20433 + }, + { + "epoch": 4.358788395904437, + "grad_norm": 0.17641194111693775, + "learning_rate": 5.9921224054504473e-05, + "loss": 0.918, + "step": 20434 + }, + { + "epoch": 4.359001706484642, + "grad_norm": 0.16050557903269788, + "learning_rate": 5.9912986791305066e-05, + "loss": 0.9165, + "step": 20435 + }, + { + "epoch": 4.359215017064846, + "grad_norm": 0.18253789542704113, + "learning_rate": 5.990474975539053e-05, + "loss": 0.9109, + "step": 20436 + }, + { + "epoch": 4.359428327645051, + "grad_norm": 0.19091707006081773, + "learning_rate": 5.989651294685411e-05, + "loss": 0.8716, + "step": 20437 + }, + { + "epoch": 4.359641638225256, + "grad_norm": 0.18603777440520478, + "learning_rate": 5.988827636578901e-05, + "loss": 0.887, + "step": 20438 + }, + { + "epoch": 4.359854948805461, + "grad_norm": 0.20804135886900912, + "learning_rate": 5.988004001228838e-05, + "loss": 0.9004, + "step": 20439 + }, + { + "epoch": 4.360068259385666, + "grad_norm": 0.1787753901387602, + "learning_rate": 5.987180388644542e-05, + "loss": 0.9094, + "step": 20440 + }, + { + "epoch": 4.360281569965871, + "grad_norm": 0.1864978362792754, + "learning_rate": 5.9863567988353344e-05, + "loss": 0.9058, + "step": 20441 + }, + { + "epoch": 4.360494880546075, + "grad_norm": 0.16494747341871585, + "learning_rate": 5.985533231810532e-05, + "loss": 0.9006, + "step": 20442 + }, + { + "epoch": 4.36070819112628, + "grad_norm": 0.20695919310095942, + "learning_rate": 5.9847096875794575e-05, + "loss": 0.9195, + "step": 20443 + }, + { + "epoch": 4.360921501706485, + "grad_norm": 0.17685269045394963, + "learning_rate": 5.983886166151424e-05, + "loss": 0.8893, + "step": 20444 + }, + { + "epoch": 4.361134812286689, + "grad_norm": 0.15320170940720265, + "learning_rate": 5.983062667535756e-05, + "loss": 0.9326, + "step": 20445 + }, + { + "epoch": 4.361348122866894, + "grad_norm": 0.15958330975021257, + "learning_rate": 5.982239191741764e-05, + "loss": 0.8973, + "step": 20446 + }, + { + "epoch": 4.361561433447099, + "grad_norm": 0.17935329047972334, + "learning_rate": 5.981415738778772e-05, + "loss": 0.931, + "step": 20447 + }, + { + "epoch": 4.361774744027303, + "grad_norm": 0.18626750484178392, + "learning_rate": 5.980592308656093e-05, + "loss": 0.9105, + "step": 20448 + }, + { + "epoch": 4.361988054607509, + "grad_norm": 0.17005876325614555, + "learning_rate": 5.9797689013830525e-05, + "loss": 0.8922, + "step": 20449 + }, + { + "epoch": 4.362201365187714, + "grad_norm": 0.1498353124979341, + "learning_rate": 5.978945516968954e-05, + "loss": 0.9063, + "step": 20450 + }, + { + "epoch": 4.362414675767918, + "grad_norm": 0.17832920058890825, + "learning_rate": 5.978122155423123e-05, + "loss": 0.9188, + "step": 20451 + }, + { + "epoch": 4.362627986348123, + "grad_norm": 0.1874969184414263, + "learning_rate": 5.977298816754873e-05, + "loss": 0.917, + "step": 20452 + }, + { + "epoch": 4.362841296928328, + "grad_norm": 0.18008814522572647, + "learning_rate": 5.976475500973522e-05, + "loss": 0.9255, + "step": 20453 + }, + { + "epoch": 4.363054607508532, + "grad_norm": 0.22825174449038346, + "learning_rate": 5.975652208088383e-05, + "loss": 0.9189, + "step": 20454 + }, + { + "epoch": 4.363267918088737, + "grad_norm": 0.21211092203819198, + "learning_rate": 5.974828938108775e-05, + "loss": 0.8905, + "step": 20455 + }, + { + "epoch": 4.363481228668942, + "grad_norm": 0.15380881375553224, + "learning_rate": 5.97400569104401e-05, + "loss": 0.921, + "step": 20456 + }, + { + "epoch": 4.363694539249146, + "grad_norm": 0.19244863105789176, + "learning_rate": 5.973182466903406e-05, + "loss": 0.9095, + "step": 20457 + }, + { + "epoch": 4.363907849829351, + "grad_norm": 0.2046833720223548, + "learning_rate": 5.9723592656962755e-05, + "loss": 0.9014, + "step": 20458 + }, + { + "epoch": 4.364121160409557, + "grad_norm": 0.20672767409074044, + "learning_rate": 5.9715360874319346e-05, + "loss": 0.8911, + "step": 20459 + }, + { + "epoch": 4.364334470989761, + "grad_norm": 0.2181825861380641, + "learning_rate": 5.9707129321196997e-05, + "loss": 0.9229, + "step": 20460 + }, + { + "epoch": 4.364547781569966, + "grad_norm": 0.149605473935832, + "learning_rate": 5.9698897997688806e-05, + "loss": 0.8997, + "step": 20461 + }, + { + "epoch": 4.364761092150171, + "grad_norm": 0.16282453490335008, + "learning_rate": 5.96906669038879e-05, + "loss": 0.879, + "step": 20462 + }, + { + "epoch": 4.364974402730375, + "grad_norm": 0.19457544795616002, + "learning_rate": 5.968243603988747e-05, + "loss": 0.9113, + "step": 20463 + }, + { + "epoch": 4.36518771331058, + "grad_norm": 0.14301462703251502, + "learning_rate": 5.967420540578058e-05, + "loss": 0.945, + "step": 20464 + }, + { + "epoch": 4.365401023890785, + "grad_norm": 0.1632145098737486, + "learning_rate": 5.966597500166043e-05, + "loss": 0.9039, + "step": 20465 + }, + { + "epoch": 4.3656143344709895, + "grad_norm": 0.14124951764935514, + "learning_rate": 5.96577448276201e-05, + "loss": 0.9167, + "step": 20466 + }, + { + "epoch": 4.365827645051194, + "grad_norm": 0.14314889076499188, + "learning_rate": 5.9649514883752744e-05, + "loss": 0.8818, + "step": 20467 + }, + { + "epoch": 4.3660409556314, + "grad_norm": 0.16298369736114465, + "learning_rate": 5.964128517015144e-05, + "loss": 0.8937, + "step": 20468 + }, + { + "epoch": 4.366254266211604, + "grad_norm": 0.17877305306766036, + "learning_rate": 5.9633055686909365e-05, + "loss": 0.8953, + "step": 20469 + }, + { + "epoch": 4.366467576791809, + "grad_norm": 0.163452449247864, + "learning_rate": 5.962482643411959e-05, + "loss": 0.9074, + "step": 20470 + }, + { + "epoch": 4.366680887372014, + "grad_norm": 0.14574981464402015, + "learning_rate": 5.961659741187526e-05, + "loss": 0.9078, + "step": 20471 + }, + { + "epoch": 4.3668941979522184, + "grad_norm": 0.17092210829931812, + "learning_rate": 5.96083686202695e-05, + "loss": 0.9066, + "step": 20472 + }, + { + "epoch": 4.367107508532423, + "grad_norm": 0.1885110598310342, + "learning_rate": 5.960014005939537e-05, + "loss": 0.8793, + "step": 20473 + }, + { + "epoch": 4.367320819112628, + "grad_norm": 0.15979379918450617, + "learning_rate": 5.959191172934598e-05, + "loss": 0.9072, + "step": 20474 + }, + { + "epoch": 4.3675341296928325, + "grad_norm": 0.15638108463148373, + "learning_rate": 5.958368363021446e-05, + "loss": 0.9299, + "step": 20475 + }, + { + "epoch": 4.367747440273037, + "grad_norm": 0.1763349630788273, + "learning_rate": 5.9575455762093895e-05, + "loss": 0.901, + "step": 20476 + }, + { + "epoch": 4.367960750853243, + "grad_norm": 0.15260241001932867, + "learning_rate": 5.956722812507741e-05, + "loss": 0.9133, + "step": 20477 + }, + { + "epoch": 4.368174061433447, + "grad_norm": 0.16845054993192832, + "learning_rate": 5.955900071925806e-05, + "loss": 0.9343, + "step": 20478 + }, + { + "epoch": 4.368387372013652, + "grad_norm": 0.14717562713743823, + "learning_rate": 5.955077354472898e-05, + "loss": 0.9407, + "step": 20479 + }, + { + "epoch": 4.368600682593857, + "grad_norm": 0.18757826479657932, + "learning_rate": 5.954254660158322e-05, + "loss": 0.9103, + "step": 20480 + }, + { + "epoch": 4.3688139931740615, + "grad_norm": 0.18815791475883573, + "learning_rate": 5.9534319889913904e-05, + "loss": 0.8951, + "step": 20481 + }, + { + "epoch": 4.369027303754266, + "grad_norm": 0.18565625977345313, + "learning_rate": 5.9526093409814096e-05, + "loss": 0.8894, + "step": 20482 + }, + { + "epoch": 4.369240614334471, + "grad_norm": 0.18984374575717886, + "learning_rate": 5.951786716137692e-05, + "loss": 0.8949, + "step": 20483 + }, + { + "epoch": 4.3694539249146755, + "grad_norm": 0.16647977198566907, + "learning_rate": 5.950964114469541e-05, + "loss": 0.9218, + "step": 20484 + }, + { + "epoch": 4.36966723549488, + "grad_norm": 0.20984189694648592, + "learning_rate": 5.950141535986264e-05, + "loss": 0.9152, + "step": 20485 + }, + { + "epoch": 4.369880546075085, + "grad_norm": 0.1588040517161528, + "learning_rate": 5.949318980697168e-05, + "loss": 0.8831, + "step": 20486 + }, + { + "epoch": 4.3700938566552905, + "grad_norm": 0.17847562808159728, + "learning_rate": 5.9484964486115656e-05, + "loss": 0.8929, + "step": 20487 + }, + { + "epoch": 4.370307167235495, + "grad_norm": 0.17372699044506276, + "learning_rate": 5.947673939738759e-05, + "loss": 0.917, + "step": 20488 + }, + { + "epoch": 4.3705204778157, + "grad_norm": 0.19137519781994605, + "learning_rate": 5.946851454088056e-05, + "loss": 0.9084, + "step": 20489 + }, + { + "epoch": 4.3707337883959045, + "grad_norm": 0.18010750229065806, + "learning_rate": 5.946028991668762e-05, + "loss": 0.8851, + "step": 20490 + }, + { + "epoch": 4.370947098976109, + "grad_norm": 0.18305475795319132, + "learning_rate": 5.945206552490187e-05, + "loss": 0.8669, + "step": 20491 + }, + { + "epoch": 4.371160409556314, + "grad_norm": 0.14595072093498168, + "learning_rate": 5.944384136561633e-05, + "loss": 0.8757, + "step": 20492 + }, + { + "epoch": 4.371373720136519, + "grad_norm": 0.1709908566306144, + "learning_rate": 5.943561743892409e-05, + "loss": 0.8917, + "step": 20493 + }, + { + "epoch": 4.371587030716723, + "grad_norm": 0.1518467858305938, + "learning_rate": 5.9427393744918186e-05, + "loss": 0.8897, + "step": 20494 + }, + { + "epoch": 4.371800341296928, + "grad_norm": 0.1755963947818493, + "learning_rate": 5.9419170283691636e-05, + "loss": 0.9123, + "step": 20495 + }, + { + "epoch": 4.372013651877133, + "grad_norm": 0.16572636253727377, + "learning_rate": 5.941094705533754e-05, + "loss": 0.899, + "step": 20496 + }, + { + "epoch": 4.372226962457338, + "grad_norm": 0.15167866242380135, + "learning_rate": 5.940272405994894e-05, + "loss": 0.9069, + "step": 20497 + }, + { + "epoch": 4.372440273037543, + "grad_norm": 0.1592122923710713, + "learning_rate": 5.939450129761883e-05, + "loss": 0.9052, + "step": 20498 + }, + { + "epoch": 4.372653583617748, + "grad_norm": 0.18800190072561473, + "learning_rate": 5.938627876844031e-05, + "loss": 0.9179, + "step": 20499 + }, + { + "epoch": 4.372866894197952, + "grad_norm": 0.13983565280877452, + "learning_rate": 5.9378056472506365e-05, + "loss": 0.8861, + "step": 20500 + }, + { + "epoch": 4.373080204778157, + "grad_norm": 0.3219281890367115, + "learning_rate": 5.9369834409910084e-05, + "loss": 0.9155, + "step": 20501 + }, + { + "epoch": 4.373293515358362, + "grad_norm": 0.16532155469649884, + "learning_rate": 5.9361612580744454e-05, + "loss": 0.9103, + "step": 20502 + }, + { + "epoch": 4.373506825938566, + "grad_norm": 0.20369306831196218, + "learning_rate": 5.9353390985102536e-05, + "loss": 0.9125, + "step": 20503 + }, + { + "epoch": 4.373720136518771, + "grad_norm": 0.17466866417675436, + "learning_rate": 5.934516962307734e-05, + "loss": 0.9083, + "step": 20504 + }, + { + "epoch": 4.373933447098976, + "grad_norm": 0.1571705027953539, + "learning_rate": 5.933694849476192e-05, + "loss": 0.891, + "step": 20505 + }, + { + "epoch": 4.374146757679181, + "grad_norm": 0.14347171591219393, + "learning_rate": 5.932872760024927e-05, + "loss": 0.921, + "step": 20506 + }, + { + "epoch": 4.374360068259386, + "grad_norm": 0.18903190414459384, + "learning_rate": 5.93205069396324e-05, + "loss": 0.8942, + "step": 20507 + }, + { + "epoch": 4.374573378839591, + "grad_norm": 0.16489096738887946, + "learning_rate": 5.9312286513004345e-05, + "loss": 0.9151, + "step": 20508 + }, + { + "epoch": 4.374786689419795, + "grad_norm": 0.19298317141401455, + "learning_rate": 5.9304066320458125e-05, + "loss": 0.8978, + "step": 20509 + }, + { + "epoch": 4.375, + "grad_norm": 0.18234549116808105, + "learning_rate": 5.9295846362086725e-05, + "loss": 0.921, + "step": 20510 + }, + { + "epoch": 4.375213310580205, + "grad_norm": 0.21472073470342615, + "learning_rate": 5.928762663798318e-05, + "loss": 0.9302, + "step": 20511 + }, + { + "epoch": 4.375426621160409, + "grad_norm": 0.22681247497578194, + "learning_rate": 5.927940714824047e-05, + "loss": 0.8941, + "step": 20512 + }, + { + "epoch": 4.375639931740614, + "grad_norm": 0.15865837556326576, + "learning_rate": 5.927118789295165e-05, + "loss": 0.9249, + "step": 20513 + }, + { + "epoch": 4.375853242320819, + "grad_norm": 0.20350127119720526, + "learning_rate": 5.9262968872209664e-05, + "loss": 0.8834, + "step": 20514 + }, + { + "epoch": 4.376066552901024, + "grad_norm": 0.2448610302836095, + "learning_rate": 5.925475008610754e-05, + "loss": 0.8454, + "step": 20515 + }, + { + "epoch": 4.376279863481229, + "grad_norm": 0.1627641635433334, + "learning_rate": 5.92465315347383e-05, + "loss": 0.9175, + "step": 20516 + }, + { + "epoch": 4.376493174061434, + "grad_norm": 0.17295536422969268, + "learning_rate": 5.923831321819486e-05, + "loss": 0.9042, + "step": 20517 + }, + { + "epoch": 4.376706484641638, + "grad_norm": 0.17705958086039014, + "learning_rate": 5.923009513657028e-05, + "loss": 0.8932, + "step": 20518 + }, + { + "epoch": 4.376919795221843, + "grad_norm": 0.2124456850214404, + "learning_rate": 5.9221877289957504e-05, + "loss": 0.9064, + "step": 20519 + }, + { + "epoch": 4.377133105802048, + "grad_norm": 0.1632030509744981, + "learning_rate": 5.9213659678449545e-05, + "loss": 0.9016, + "step": 20520 + }, + { + "epoch": 4.377346416382252, + "grad_norm": 0.22941642978688725, + "learning_rate": 5.9205442302139385e-05, + "loss": 0.9099, + "step": 20521 + }, + { + "epoch": 4.377559726962457, + "grad_norm": 0.23691733857450725, + "learning_rate": 5.919722516111998e-05, + "loss": 0.8744, + "step": 20522 + }, + { + "epoch": 4.377773037542662, + "grad_norm": 0.14122144877969506, + "learning_rate": 5.918900825548433e-05, + "loss": 0.8721, + "step": 20523 + }, + { + "epoch": 4.377986348122867, + "grad_norm": 0.2315171783261746, + "learning_rate": 5.918079158532539e-05, + "loss": 0.9068, + "step": 20524 + }, + { + "epoch": 4.378199658703072, + "grad_norm": 0.23999177435206967, + "learning_rate": 5.9172575150736155e-05, + "loss": 0.8921, + "step": 20525 + }, + { + "epoch": 4.378412969283277, + "grad_norm": 0.19761261028886754, + "learning_rate": 5.916435895180957e-05, + "loss": 0.8955, + "step": 20526 + }, + { + "epoch": 4.378626279863481, + "grad_norm": 0.1611024973630294, + "learning_rate": 5.915614298863865e-05, + "loss": 0.8936, + "step": 20527 + }, + { + "epoch": 4.378839590443686, + "grad_norm": 0.18899441513686555, + "learning_rate": 5.914792726131631e-05, + "loss": 0.8773, + "step": 20528 + }, + { + "epoch": 4.379052901023891, + "grad_norm": 0.19701141821549065, + "learning_rate": 5.91397117699355e-05, + "loss": 0.8759, + "step": 20529 + }, + { + "epoch": 4.3792662116040955, + "grad_norm": 0.17534991422143323, + "learning_rate": 5.9131496514589216e-05, + "loss": 0.9258, + "step": 20530 + }, + { + "epoch": 4.3794795221843, + "grad_norm": 0.1759541060779276, + "learning_rate": 5.91232814953704e-05, + "loss": 0.8821, + "step": 20531 + }, + { + "epoch": 4.379692832764505, + "grad_norm": 0.1943987781692066, + "learning_rate": 5.911506671237201e-05, + "loss": 0.9012, + "step": 20532 + }, + { + "epoch": 4.3799061433447095, + "grad_norm": 0.2515239055235923, + "learning_rate": 5.910685216568699e-05, + "loss": 0.9524, + "step": 20533 + }, + { + "epoch": 4.380119453924915, + "grad_norm": 0.19286822817629723, + "learning_rate": 5.9098637855408265e-05, + "loss": 0.9094, + "step": 20534 + }, + { + "epoch": 4.38033276450512, + "grad_norm": 0.1944694078303897, + "learning_rate": 5.9090423781628826e-05, + "loss": 0.8992, + "step": 20535 + }, + { + "epoch": 4.3805460750853245, + "grad_norm": 0.26744622732146345, + "learning_rate": 5.908220994444158e-05, + "loss": 0.9175, + "step": 20536 + }, + { + "epoch": 4.380759385665529, + "grad_norm": 0.2536414860846121, + "learning_rate": 5.907399634393949e-05, + "loss": 0.9259, + "step": 20537 + }, + { + "epoch": 4.380972696245734, + "grad_norm": 0.16911294289149864, + "learning_rate": 5.906578298021548e-05, + "loss": 0.8603, + "step": 20538 + }, + { + "epoch": 4.3811860068259385, + "grad_norm": 0.15563341616732937, + "learning_rate": 5.9057569853362517e-05, + "loss": 0.884, + "step": 20539 + }, + { + "epoch": 4.381399317406143, + "grad_norm": 0.18059138636761177, + "learning_rate": 5.9049356963473486e-05, + "loss": 0.872, + "step": 20540 + }, + { + "epoch": 4.381612627986348, + "grad_norm": 0.1908111996826232, + "learning_rate": 5.904114431064132e-05, + "loss": 0.8824, + "step": 20541 + }, + { + "epoch": 4.381825938566553, + "grad_norm": 0.15517028199341346, + "learning_rate": 5.9032931894958966e-05, + "loss": 0.8756, + "step": 20542 + }, + { + "epoch": 4.382039249146757, + "grad_norm": 0.16844511764667724, + "learning_rate": 5.902471971651933e-05, + "loss": 0.877, + "step": 20543 + }, + { + "epoch": 4.382252559726963, + "grad_norm": 0.16701458475591788, + "learning_rate": 5.9016507775415355e-05, + "loss": 0.9032, + "step": 20544 + }, + { + "epoch": 4.3824658703071675, + "grad_norm": 0.2170998753263561, + "learning_rate": 5.900829607173995e-05, + "loss": 0.8973, + "step": 20545 + }, + { + "epoch": 4.382679180887372, + "grad_norm": 0.16367937614201641, + "learning_rate": 5.900008460558601e-05, + "loss": 0.925, + "step": 20546 + }, + { + "epoch": 4.382892491467577, + "grad_norm": 0.2015114816532804, + "learning_rate": 5.8991873377046484e-05, + "loss": 0.9185, + "step": 20547 + }, + { + "epoch": 4.3831058020477816, + "grad_norm": 0.24456721575234533, + "learning_rate": 5.8983662386214246e-05, + "loss": 0.9142, + "step": 20548 + }, + { + "epoch": 4.383319112627986, + "grad_norm": 0.19593834660553736, + "learning_rate": 5.8975451633182245e-05, + "loss": 0.8907, + "step": 20549 + }, + { + "epoch": 4.383532423208191, + "grad_norm": 0.1764910641554964, + "learning_rate": 5.896724111804338e-05, + "loss": 0.9216, + "step": 20550 + }, + { + "epoch": 4.383745733788396, + "grad_norm": 0.19707153452377385, + "learning_rate": 5.8959030840890486e-05, + "loss": 0.8803, + "step": 20551 + }, + { + "epoch": 4.3839590443686, + "grad_norm": 0.18672296451676548, + "learning_rate": 5.895082080181654e-05, + "loss": 0.9072, + "step": 20552 + }, + { + "epoch": 4.384172354948806, + "grad_norm": 0.1684165271633204, + "learning_rate": 5.89426110009144e-05, + "loss": 0.8898, + "step": 20553 + }, + { + "epoch": 4.3843856655290105, + "grad_norm": 0.16217874392208426, + "learning_rate": 5.893440143827697e-05, + "loss": 0.9113, + "step": 20554 + }, + { + "epoch": 4.384598976109215, + "grad_norm": 0.16110222874886435, + "learning_rate": 5.892619211399714e-05, + "loss": 0.9013, + "step": 20555 + }, + { + "epoch": 4.38481228668942, + "grad_norm": 0.16382517159043322, + "learning_rate": 5.891798302816782e-05, + "loss": 0.8985, + "step": 20556 + }, + { + "epoch": 4.385025597269625, + "grad_norm": 0.17336433015658392, + "learning_rate": 5.8909774180881865e-05, + "loss": 0.8975, + "step": 20557 + }, + { + "epoch": 4.385238907849829, + "grad_norm": 0.16856627710190653, + "learning_rate": 5.890156557223215e-05, + "loss": 0.8915, + "step": 20558 + }, + { + "epoch": 4.385452218430034, + "grad_norm": 0.20216783821864331, + "learning_rate": 5.8893357202311593e-05, + "loss": 0.9265, + "step": 20559 + }, + { + "epoch": 4.385665529010239, + "grad_norm": 0.17168558518123764, + "learning_rate": 5.888514907121305e-05, + "loss": 0.9028, + "step": 20560 + }, + { + "epoch": 4.385878839590443, + "grad_norm": 0.17200649835114887, + "learning_rate": 5.887694117902943e-05, + "loss": 0.8907, + "step": 20561 + }, + { + "epoch": 4.386092150170649, + "grad_norm": 0.18065082853886832, + "learning_rate": 5.886873352585356e-05, + "loss": 0.9163, + "step": 20562 + }, + { + "epoch": 4.386305460750854, + "grad_norm": 0.16507993981844146, + "learning_rate": 5.88605261117783e-05, + "loss": 0.917, + "step": 20563 + }, + { + "epoch": 4.386518771331058, + "grad_norm": 0.21379511104820487, + "learning_rate": 5.885231893689657e-05, + "loss": 0.8995, + "step": 20564 + }, + { + "epoch": 4.386732081911263, + "grad_norm": 0.19618425090657118, + "learning_rate": 5.884411200130119e-05, + "loss": 0.9654, + "step": 20565 + }, + { + "epoch": 4.386945392491468, + "grad_norm": 0.21877641596549185, + "learning_rate": 5.883590530508505e-05, + "loss": 0.8985, + "step": 20566 + }, + { + "epoch": 4.387158703071672, + "grad_norm": 0.20109942827338273, + "learning_rate": 5.882769884834098e-05, + "loss": 0.8943, + "step": 20567 + }, + { + "epoch": 4.387372013651877, + "grad_norm": 0.1720985662035243, + "learning_rate": 5.881949263116188e-05, + "loss": 0.9132, + "step": 20568 + }, + { + "epoch": 4.387585324232082, + "grad_norm": 0.202902217063218, + "learning_rate": 5.8811286653640566e-05, + "loss": 0.9055, + "step": 20569 + }, + { + "epoch": 4.387798634812286, + "grad_norm": 0.18934501632238573, + "learning_rate": 5.880308091586988e-05, + "loss": 0.9216, + "step": 20570 + }, + { + "epoch": 4.388011945392491, + "grad_norm": 0.1637222229329198, + "learning_rate": 5.879487541794271e-05, + "loss": 0.9243, + "step": 20571 + }, + { + "epoch": 4.388225255972697, + "grad_norm": 0.16341702288277765, + "learning_rate": 5.878667015995191e-05, + "loss": 0.8934, + "step": 20572 + }, + { + "epoch": 4.388438566552901, + "grad_norm": 0.18600840105669883, + "learning_rate": 5.877846514199025e-05, + "loss": 0.9076, + "step": 20573 + }, + { + "epoch": 4.388651877133106, + "grad_norm": 0.17544181367642808, + "learning_rate": 5.877026036415063e-05, + "loss": 0.8812, + "step": 20574 + }, + { + "epoch": 4.388865187713311, + "grad_norm": 0.14745401107187206, + "learning_rate": 5.876205582652586e-05, + "loss": 0.9099, + "step": 20575 + }, + { + "epoch": 4.389078498293515, + "grad_norm": 0.16674704179199792, + "learning_rate": 5.87538515292088e-05, + "loss": 0.9253, + "step": 20576 + }, + { + "epoch": 4.38929180887372, + "grad_norm": 0.14376632074249934, + "learning_rate": 5.874564747229224e-05, + "loss": 0.9257, + "step": 20577 + }, + { + "epoch": 4.389505119453925, + "grad_norm": 0.16237350041647267, + "learning_rate": 5.873744365586907e-05, + "loss": 0.8976, + "step": 20578 + }, + { + "epoch": 4.389718430034129, + "grad_norm": 0.169211727101068, + "learning_rate": 5.872924008003206e-05, + "loss": 0.8919, + "step": 20579 + }, + { + "epoch": 4.389931740614334, + "grad_norm": 0.16782534495854803, + "learning_rate": 5.8721036744874066e-05, + "loss": 0.9089, + "step": 20580 + }, + { + "epoch": 4.390145051194539, + "grad_norm": 0.16383268436421441, + "learning_rate": 5.8712833650487905e-05, + "loss": 0.8711, + "step": 20581 + }, + { + "epoch": 4.390358361774744, + "grad_norm": 0.16972972827944435, + "learning_rate": 5.8704630796966384e-05, + "loss": 0.9048, + "step": 20582 + }, + { + "epoch": 4.390571672354949, + "grad_norm": 0.16089689541795957, + "learning_rate": 5.869642818440234e-05, + "loss": 0.8569, + "step": 20583 + }, + { + "epoch": 4.390784982935154, + "grad_norm": 0.2350992134447624, + "learning_rate": 5.8688225812888554e-05, + "loss": 0.8977, + "step": 20584 + }, + { + "epoch": 4.390998293515358, + "grad_norm": 0.2618412756868963, + "learning_rate": 5.868002368251784e-05, + "loss": 0.8854, + "step": 20585 + }, + { + "epoch": 4.391211604095563, + "grad_norm": 0.20199202665933574, + "learning_rate": 5.867182179338303e-05, + "loss": 0.9117, + "step": 20586 + }, + { + "epoch": 4.391424914675768, + "grad_norm": 0.20436305503933855, + "learning_rate": 5.86636201455769e-05, + "loss": 0.9209, + "step": 20587 + }, + { + "epoch": 4.3916382252559725, + "grad_norm": 0.24379582243996534, + "learning_rate": 5.8655418739192294e-05, + "loss": 0.8863, + "step": 20588 + }, + { + "epoch": 4.391851535836177, + "grad_norm": 0.2298696973842396, + "learning_rate": 5.864721757432195e-05, + "loss": 0.886, + "step": 20589 + }, + { + "epoch": 4.392064846416382, + "grad_norm": 0.15993859743274036, + "learning_rate": 5.8639016651058725e-05, + "loss": 0.8916, + "step": 20590 + }, + { + "epoch": 4.392278156996587, + "grad_norm": 0.2453054838738741, + "learning_rate": 5.863081596949539e-05, + "loss": 0.8992, + "step": 20591 + }, + { + "epoch": 4.392491467576792, + "grad_norm": 0.2134003101973868, + "learning_rate": 5.86226155297247e-05, + "loss": 0.9043, + "step": 20592 + }, + { + "epoch": 4.392704778156997, + "grad_norm": 0.16157167501760206, + "learning_rate": 5.86144153318395e-05, + "loss": 0.9145, + "step": 20593 + }, + { + "epoch": 4.3929180887372015, + "grad_norm": 0.19746333952863404, + "learning_rate": 5.8606215375932544e-05, + "loss": 0.8948, + "step": 20594 + }, + { + "epoch": 4.393131399317406, + "grad_norm": 0.18897773880510393, + "learning_rate": 5.859801566209664e-05, + "loss": 0.9014, + "step": 20595 + }, + { + "epoch": 4.393344709897611, + "grad_norm": 0.18123963281163954, + "learning_rate": 5.8589816190424536e-05, + "loss": 0.9345, + "step": 20596 + }, + { + "epoch": 4.3935580204778155, + "grad_norm": 0.17302521496691303, + "learning_rate": 5.8581616961009e-05, + "loss": 0.9058, + "step": 20597 + }, + { + "epoch": 4.39377133105802, + "grad_norm": 0.19210373498991964, + "learning_rate": 5.8573417973942854e-05, + "loss": 0.896, + "step": 20598 + }, + { + "epoch": 4.393984641638225, + "grad_norm": 0.24688964275297975, + "learning_rate": 5.8565219229318814e-05, + "loss": 0.9366, + "step": 20599 + }, + { + "epoch": 4.3941979522184305, + "grad_norm": 0.1927028240410765, + "learning_rate": 5.8557020727229715e-05, + "loss": 0.9095, + "step": 20600 + }, + { + "epoch": 4.394411262798635, + "grad_norm": 0.19039757766443188, + "learning_rate": 5.854882246776825e-05, + "loss": 0.9134, + "step": 20601 + }, + { + "epoch": 4.39462457337884, + "grad_norm": 0.3261674696635106, + "learning_rate": 5.8540624451027256e-05, + "loss": 0.8914, + "step": 20602 + }, + { + "epoch": 4.3948378839590445, + "grad_norm": 0.2282394752925299, + "learning_rate": 5.853242667709944e-05, + "loss": 0.9269, + "step": 20603 + }, + { + "epoch": 4.395051194539249, + "grad_norm": 0.19654476181306918, + "learning_rate": 5.852422914607758e-05, + "loss": 0.9041, + "step": 20604 + }, + { + "epoch": 4.395264505119454, + "grad_norm": 0.16688589659082995, + "learning_rate": 5.8516031858054437e-05, + "loss": 0.9404, + "step": 20605 + }, + { + "epoch": 4.395477815699659, + "grad_norm": 0.17705832321682566, + "learning_rate": 5.850783481312276e-05, + "loss": 0.8878, + "step": 20606 + }, + { + "epoch": 4.395691126279863, + "grad_norm": 0.18449281313946, + "learning_rate": 5.8499638011375275e-05, + "loss": 0.89, + "step": 20607 + }, + { + "epoch": 4.395904436860068, + "grad_norm": 0.15844855820601075, + "learning_rate": 5.849144145290476e-05, + "loss": 0.9227, + "step": 20608 + }, + { + "epoch": 4.3961177474402735, + "grad_norm": 0.1820670062363834, + "learning_rate": 5.8483245137803925e-05, + "loss": 0.8788, + "step": 20609 + }, + { + "epoch": 4.396331058020478, + "grad_norm": 0.1476840922526989, + "learning_rate": 5.847504906616556e-05, + "loss": 0.9091, + "step": 20610 + }, + { + "epoch": 4.396544368600683, + "grad_norm": 0.2117158115178228, + "learning_rate": 5.8466853238082346e-05, + "loss": 0.8929, + "step": 20611 + }, + { + "epoch": 4.396757679180888, + "grad_norm": 0.16758453792054243, + "learning_rate": 5.845865765364707e-05, + "loss": 0.9463, + "step": 20612 + }, + { + "epoch": 4.396970989761092, + "grad_norm": 0.16904343801446142, + "learning_rate": 5.845046231295243e-05, + "loss": 0.8765, + "step": 20613 + }, + { + "epoch": 4.397184300341297, + "grad_norm": 0.17752288896301946, + "learning_rate": 5.844226721609119e-05, + "loss": 0.915, + "step": 20614 + }, + { + "epoch": 4.397397610921502, + "grad_norm": 0.15394321069789926, + "learning_rate": 5.8434072363156066e-05, + "loss": 0.8862, + "step": 20615 + }, + { + "epoch": 4.397610921501706, + "grad_norm": 0.17503984527120037, + "learning_rate": 5.8425877754239745e-05, + "loss": 0.8851, + "step": 20616 + }, + { + "epoch": 4.397824232081911, + "grad_norm": 0.22457699624888314, + "learning_rate": 5.841768338943502e-05, + "loss": 0.9078, + "step": 20617 + }, + { + "epoch": 4.398037542662116, + "grad_norm": 0.2437775976429422, + "learning_rate": 5.8409489268834556e-05, + "loss": 0.8973, + "step": 20618 + }, + { + "epoch": 4.398250853242321, + "grad_norm": 0.18215965739285345, + "learning_rate": 5.8401295392531066e-05, + "loss": 0.9114, + "step": 20619 + }, + { + "epoch": 4.398464163822526, + "grad_norm": 0.16277549412193332, + "learning_rate": 5.83931017606173e-05, + "loss": 0.911, + "step": 20620 + }, + { + "epoch": 4.398677474402731, + "grad_norm": 0.17728070051826883, + "learning_rate": 5.838490837318594e-05, + "loss": 0.9249, + "step": 20621 + }, + { + "epoch": 4.398890784982935, + "grad_norm": 0.14333269909260238, + "learning_rate": 5.8376715230329706e-05, + "loss": 0.9074, + "step": 20622 + }, + { + "epoch": 4.39910409556314, + "grad_norm": 0.1607486458025067, + "learning_rate": 5.83685223321413e-05, + "loss": 0.9191, + "step": 20623 + }, + { + "epoch": 4.399317406143345, + "grad_norm": 0.14662209669380968, + "learning_rate": 5.836032967871344e-05, + "loss": 0.9131, + "step": 20624 + }, + { + "epoch": 4.399530716723549, + "grad_norm": 0.17532609977063254, + "learning_rate": 5.835213727013879e-05, + "loss": 0.9072, + "step": 20625 + }, + { + "epoch": 4.399744027303754, + "grad_norm": 0.15672622911806838, + "learning_rate": 5.8343945106510096e-05, + "loss": 0.8824, + "step": 20626 + }, + { + "epoch": 4.399957337883959, + "grad_norm": 0.1890380381805472, + "learning_rate": 5.833575318792003e-05, + "loss": 0.9023, + "step": 20627 + }, + { + "epoch": 4.400170648464163, + "grad_norm": 0.207336664191302, + "learning_rate": 5.832756151446129e-05, + "loss": 0.9094, + "step": 20628 + }, + { + "epoch": 4.400383959044369, + "grad_norm": 0.19547483176439567, + "learning_rate": 5.831937008622653e-05, + "loss": 0.8954, + "step": 20629 + }, + { + "epoch": 4.400597269624574, + "grad_norm": 0.15770792284013413, + "learning_rate": 5.8311178903308466e-05, + "loss": 0.9034, + "step": 20630 + }, + { + "epoch": 4.400810580204778, + "grad_norm": 0.19383551888192227, + "learning_rate": 5.830298796579978e-05, + "loss": 0.8906, + "step": 20631 + }, + { + "epoch": 4.401023890784983, + "grad_norm": 0.16734456761817604, + "learning_rate": 5.8294797273793156e-05, + "loss": 0.904, + "step": 20632 + }, + { + "epoch": 4.401237201365188, + "grad_norm": 0.19049168935464342, + "learning_rate": 5.828660682738124e-05, + "loss": 0.891, + "step": 20633 + }, + { + "epoch": 4.401450511945392, + "grad_norm": 0.1583084790653989, + "learning_rate": 5.827841662665677e-05, + "loss": 0.9059, + "step": 20634 + }, + { + "epoch": 4.401663822525597, + "grad_norm": 0.16195420347439793, + "learning_rate": 5.827022667171235e-05, + "loss": 0.8897, + "step": 20635 + }, + { + "epoch": 4.401877133105802, + "grad_norm": 0.16300324632953495, + "learning_rate": 5.8262036962640704e-05, + "loss": 0.9009, + "step": 20636 + }, + { + "epoch": 4.402090443686006, + "grad_norm": 0.17627530222109575, + "learning_rate": 5.825384749953445e-05, + "loss": 0.904, + "step": 20637 + }, + { + "epoch": 4.402303754266212, + "grad_norm": 0.19734660965648662, + "learning_rate": 5.8245658282486296e-05, + "loss": 0.9038, + "step": 20638 + }, + { + "epoch": 4.402517064846417, + "grad_norm": 0.16015171609756582, + "learning_rate": 5.823746931158891e-05, + "loss": 0.8811, + "step": 20639 + }, + { + "epoch": 4.402730375426621, + "grad_norm": 0.22880643706939086, + "learning_rate": 5.8229280586934906e-05, + "loss": 0.927, + "step": 20640 + }, + { + "epoch": 4.402943686006826, + "grad_norm": 0.2112179834149033, + "learning_rate": 5.822109210861694e-05, + "loss": 0.903, + "step": 20641 + }, + { + "epoch": 4.403156996587031, + "grad_norm": 0.16486869843599533, + "learning_rate": 5.8212903876727694e-05, + "loss": 0.9041, + "step": 20642 + }, + { + "epoch": 4.403370307167235, + "grad_norm": 0.20306066161543662, + "learning_rate": 5.820471589135979e-05, + "loss": 0.901, + "step": 20643 + }, + { + "epoch": 4.40358361774744, + "grad_norm": 0.19439599720051778, + "learning_rate": 5.8196528152605915e-05, + "loss": 0.9077, + "step": 20644 + }, + { + "epoch": 4.403796928327645, + "grad_norm": 0.26295193311249415, + "learning_rate": 5.818834066055868e-05, + "loss": 0.8912, + "step": 20645 + }, + { + "epoch": 4.4040102389078495, + "grad_norm": 0.1715659841317903, + "learning_rate": 5.818015341531075e-05, + "loss": 0.9015, + "step": 20646 + }, + { + "epoch": 4.404223549488055, + "grad_norm": 0.2621543832235721, + "learning_rate": 5.817196641695473e-05, + "loss": 0.9079, + "step": 20647 + }, + { + "epoch": 4.40443686006826, + "grad_norm": 0.28652704281427727, + "learning_rate": 5.8163779665583294e-05, + "loss": 0.9233, + "step": 20648 + }, + { + "epoch": 4.404650170648464, + "grad_norm": 0.18519055168925047, + "learning_rate": 5.8155593161289036e-05, + "loss": 0.8911, + "step": 20649 + }, + { + "epoch": 4.404863481228669, + "grad_norm": 0.23424354848334344, + "learning_rate": 5.8147406904164637e-05, + "loss": 0.8797, + "step": 20650 + }, + { + "epoch": 4.405076791808874, + "grad_norm": 0.20848868681790686, + "learning_rate": 5.8139220894302714e-05, + "loss": 0.8916, + "step": 20651 + }, + { + "epoch": 4.4052901023890785, + "grad_norm": 0.17216349754991778, + "learning_rate": 5.8131035131795845e-05, + "loss": 0.8751, + "step": 20652 + }, + { + "epoch": 4.405503412969283, + "grad_norm": 0.22420031760646247, + "learning_rate": 5.8122849616736675e-05, + "loss": 0.9305, + "step": 20653 + }, + { + "epoch": 4.405716723549488, + "grad_norm": 0.1891579663886695, + "learning_rate": 5.8114664349217836e-05, + "loss": 0.923, + "step": 20654 + }, + { + "epoch": 4.4059300341296925, + "grad_norm": 0.14709250591642453, + "learning_rate": 5.810647932933194e-05, + "loss": 0.8848, + "step": 20655 + }, + { + "epoch": 4.406143344709897, + "grad_norm": 0.1820546752283117, + "learning_rate": 5.8098294557171594e-05, + "loss": 0.9129, + "step": 20656 + }, + { + "epoch": 4.406356655290103, + "grad_norm": 0.17331527956607415, + "learning_rate": 5.809011003282941e-05, + "loss": 0.907, + "step": 20657 + }, + { + "epoch": 4.4065699658703075, + "grad_norm": 0.1705369984151223, + "learning_rate": 5.8081925756398015e-05, + "loss": 0.9138, + "step": 20658 + }, + { + "epoch": 4.406783276450512, + "grad_norm": 0.1935763886510339, + "learning_rate": 5.8073741727969976e-05, + "loss": 0.8953, + "step": 20659 + }, + { + "epoch": 4.406996587030717, + "grad_norm": 0.16213788848250368, + "learning_rate": 5.806555794763794e-05, + "loss": 0.9097, + "step": 20660 + }, + { + "epoch": 4.4072098976109215, + "grad_norm": 0.2238696367207361, + "learning_rate": 5.805737441549447e-05, + "loss": 0.8932, + "step": 20661 + }, + { + "epoch": 4.407423208191126, + "grad_norm": 0.16813423688241447, + "learning_rate": 5.804919113163222e-05, + "loss": 0.8983, + "step": 20662 + }, + { + "epoch": 4.407636518771331, + "grad_norm": 0.1572866190262917, + "learning_rate": 5.80410080961437e-05, + "loss": 0.903, + "step": 20663 + }, + { + "epoch": 4.407849829351536, + "grad_norm": 0.1634321221033688, + "learning_rate": 5.803282530912155e-05, + "loss": 0.9048, + "step": 20664 + }, + { + "epoch": 4.40806313993174, + "grad_norm": 0.15254976072209878, + "learning_rate": 5.8024642770658346e-05, + "loss": 0.8872, + "step": 20665 + }, + { + "epoch": 4.408276450511945, + "grad_norm": 0.17323733505105388, + "learning_rate": 5.801646048084669e-05, + "loss": 0.9534, + "step": 20666 + }, + { + "epoch": 4.4084897610921505, + "grad_norm": 0.17527776269761894, + "learning_rate": 5.800827843977914e-05, + "loss": 0.9046, + "step": 20667 + }, + { + "epoch": 4.408703071672355, + "grad_norm": 0.18751297141772014, + "learning_rate": 5.8000096647548295e-05, + "loss": 0.8863, + "step": 20668 + }, + { + "epoch": 4.40891638225256, + "grad_norm": 0.14716460950921867, + "learning_rate": 5.7991915104246714e-05, + "loss": 0.8874, + "step": 20669 + }, + { + "epoch": 4.409129692832765, + "grad_norm": 0.18134231490391697, + "learning_rate": 5.798373380996701e-05, + "loss": 0.8845, + "step": 20670 + }, + { + "epoch": 4.409343003412969, + "grad_norm": 0.1524428051146184, + "learning_rate": 5.79755527648017e-05, + "loss": 0.9111, + "step": 20671 + }, + { + "epoch": 4.409556313993174, + "grad_norm": 0.16176133524054132, + "learning_rate": 5.7967371968843404e-05, + "loss": 0.8878, + "step": 20672 + }, + { + "epoch": 4.409769624573379, + "grad_norm": 0.16572598212034134, + "learning_rate": 5.795919142218468e-05, + "loss": 0.9066, + "step": 20673 + }, + { + "epoch": 4.409982935153583, + "grad_norm": 0.17323514878231916, + "learning_rate": 5.795101112491805e-05, + "loss": 0.9234, + "step": 20674 + }, + { + "epoch": 4.410196245733788, + "grad_norm": 0.19999052286961064, + "learning_rate": 5.7942831077136094e-05, + "loss": 0.8696, + "step": 20675 + }, + { + "epoch": 4.410409556313994, + "grad_norm": 0.1618634791705696, + "learning_rate": 5.79346512789314e-05, + "loss": 0.8874, + "step": 20676 + }, + { + "epoch": 4.410622866894198, + "grad_norm": 0.19571892741518052, + "learning_rate": 5.7926471730396465e-05, + "loss": 0.8525, + "step": 20677 + }, + { + "epoch": 4.410836177474403, + "grad_norm": 0.19779200998459384, + "learning_rate": 5.79182924316239e-05, + "loss": 0.9019, + "step": 20678 + }, + { + "epoch": 4.411049488054608, + "grad_norm": 0.1587406933565869, + "learning_rate": 5.79101133827062e-05, + "loss": 0.9344, + "step": 20679 + }, + { + "epoch": 4.411262798634812, + "grad_norm": 0.22360825656972627, + "learning_rate": 5.790193458373597e-05, + "loss": 0.8701, + "step": 20680 + }, + { + "epoch": 4.411476109215017, + "grad_norm": 0.1840718625094341, + "learning_rate": 5.78937560348057e-05, + "loss": 0.9065, + "step": 20681 + }, + { + "epoch": 4.411689419795222, + "grad_norm": 0.16403134304177844, + "learning_rate": 5.7885577736007965e-05, + "loss": 0.8813, + "step": 20682 + }, + { + "epoch": 4.411902730375426, + "grad_norm": 0.13987800025713637, + "learning_rate": 5.787739968743527e-05, + "loss": 0.9071, + "step": 20683 + }, + { + "epoch": 4.412116040955631, + "grad_norm": 0.20102619405974864, + "learning_rate": 5.786922188918024e-05, + "loss": 0.8751, + "step": 20684 + }, + { + "epoch": 4.412329351535837, + "grad_norm": 0.25368905905297623, + "learning_rate": 5.786104434133526e-05, + "loss": 0.9088, + "step": 20685 + }, + { + "epoch": 4.412542662116041, + "grad_norm": 0.26898913302098987, + "learning_rate": 5.785286704399296e-05, + "loss": 0.8849, + "step": 20686 + }, + { + "epoch": 4.412755972696246, + "grad_norm": 0.15189421155173125, + "learning_rate": 5.784468999724583e-05, + "loss": 0.907, + "step": 20687 + }, + { + "epoch": 4.412969283276451, + "grad_norm": 0.18496579820954592, + "learning_rate": 5.783651320118642e-05, + "loss": 0.9051, + "step": 20688 + }, + { + "epoch": 4.413182593856655, + "grad_norm": 0.2260773471384197, + "learning_rate": 5.782833665590723e-05, + "loss": 0.9094, + "step": 20689 + }, + { + "epoch": 4.41339590443686, + "grad_norm": 0.20194818582490834, + "learning_rate": 5.782016036150078e-05, + "loss": 0.9243, + "step": 20690 + }, + { + "epoch": 4.413609215017065, + "grad_norm": 0.15646792925590647, + "learning_rate": 5.781198431805958e-05, + "loss": 0.9017, + "step": 20691 + }, + { + "epoch": 4.413822525597269, + "grad_norm": 0.16377460483747866, + "learning_rate": 5.780380852567618e-05, + "loss": 0.907, + "step": 20692 + }, + { + "epoch": 4.414035836177474, + "grad_norm": 0.22207006905782536, + "learning_rate": 5.779563298444304e-05, + "loss": 0.8893, + "step": 20693 + }, + { + "epoch": 4.414249146757679, + "grad_norm": 0.22412105944878155, + "learning_rate": 5.77874576944527e-05, + "loss": 0.9047, + "step": 20694 + }, + { + "epoch": 4.414462457337884, + "grad_norm": 0.142907090343143, + "learning_rate": 5.7779282655797656e-05, + "loss": 0.9104, + "step": 20695 + }, + { + "epoch": 4.414675767918089, + "grad_norm": 0.19147396068976438, + "learning_rate": 5.777110786857039e-05, + "loss": 0.9033, + "step": 20696 + }, + { + "epoch": 4.414889078498294, + "grad_norm": 0.19302275972607902, + "learning_rate": 5.77629333328634e-05, + "loss": 0.9034, + "step": 20697 + }, + { + "epoch": 4.415102389078498, + "grad_norm": 0.19289521214379438, + "learning_rate": 5.775475904876921e-05, + "loss": 0.8553, + "step": 20698 + }, + { + "epoch": 4.415315699658703, + "grad_norm": 0.1425762152932914, + "learning_rate": 5.7746585016380275e-05, + "loss": 0.8884, + "step": 20699 + }, + { + "epoch": 4.415529010238908, + "grad_norm": 0.23160936868715612, + "learning_rate": 5.773841123578912e-05, + "loss": 0.9279, + "step": 20700 + }, + { + "epoch": 4.415742320819112, + "grad_norm": 0.2243639645952262, + "learning_rate": 5.77302377070882e-05, + "loss": 0.9393, + "step": 20701 + }, + { + "epoch": 4.415955631399317, + "grad_norm": 0.19244378834698053, + "learning_rate": 5.7722064430370036e-05, + "loss": 0.9213, + "step": 20702 + }, + { + "epoch": 4.416168941979522, + "grad_norm": 0.14009290244248668, + "learning_rate": 5.771389140572707e-05, + "loss": 0.8948, + "step": 20703 + }, + { + "epoch": 4.4163822525597265, + "grad_norm": 0.17868886037276824, + "learning_rate": 5.770571863325182e-05, + "loss": 0.9255, + "step": 20704 + }, + { + "epoch": 4.416595563139932, + "grad_norm": 0.15147596565049387, + "learning_rate": 5.7697546113036716e-05, + "loss": 0.8887, + "step": 20705 + }, + { + "epoch": 4.416808873720137, + "grad_norm": 0.15591842704846268, + "learning_rate": 5.768937384517428e-05, + "loss": 0.8655, + "step": 20706 + }, + { + "epoch": 4.417022184300341, + "grad_norm": 0.14933767921981414, + "learning_rate": 5.7681201829756953e-05, + "loss": 0.8938, + "step": 20707 + }, + { + "epoch": 4.417235494880546, + "grad_norm": 0.1649136735930021, + "learning_rate": 5.767303006687719e-05, + "loss": 0.8879, + "step": 20708 + }, + { + "epoch": 4.417448805460751, + "grad_norm": 0.16423886491403825, + "learning_rate": 5.766485855662746e-05, + "loss": 0.8801, + "step": 20709 + }, + { + "epoch": 4.4176621160409555, + "grad_norm": 0.18274135787586415, + "learning_rate": 5.765668729910023e-05, + "loss": 0.9022, + "step": 20710 + }, + { + "epoch": 4.41787542662116, + "grad_norm": 0.19638837091436573, + "learning_rate": 5.764851629438796e-05, + "loss": 0.8926, + "step": 20711 + }, + { + "epoch": 4.418088737201365, + "grad_norm": 0.20009219239257822, + "learning_rate": 5.7640345542583117e-05, + "loss": 0.9112, + "step": 20712 + }, + { + "epoch": 4.4183020477815695, + "grad_norm": 0.22519314550959854, + "learning_rate": 5.763217504377812e-05, + "loss": 0.9064, + "step": 20713 + }, + { + "epoch": 4.418515358361775, + "grad_norm": 0.21731350000052205, + "learning_rate": 5.762400479806546e-05, + "loss": 0.8938, + "step": 20714 + }, + { + "epoch": 4.41872866894198, + "grad_norm": 0.21013642445593023, + "learning_rate": 5.761583480553753e-05, + "loss": 0.9029, + "step": 20715 + }, + { + "epoch": 4.4189419795221845, + "grad_norm": 0.15624269139786676, + "learning_rate": 5.7607665066286845e-05, + "loss": 0.8599, + "step": 20716 + }, + { + "epoch": 4.419155290102389, + "grad_norm": 0.17056186170711704, + "learning_rate": 5.759949558040578e-05, + "loss": 0.9214, + "step": 20717 + }, + { + "epoch": 4.419368600682594, + "grad_norm": 0.17562831391000341, + "learning_rate": 5.759132634798683e-05, + "loss": 0.9135, + "step": 20718 + }, + { + "epoch": 4.4195819112627985, + "grad_norm": 0.16065803158565564, + "learning_rate": 5.758315736912238e-05, + "loss": 0.905, + "step": 20719 + }, + { + "epoch": 4.419795221843003, + "grad_norm": 0.2173567906923477, + "learning_rate": 5.7574988643904896e-05, + "loss": 0.9112, + "step": 20720 + }, + { + "epoch": 4.420008532423208, + "grad_norm": 0.16646962265548357, + "learning_rate": 5.7566820172426764e-05, + "loss": 0.894, + "step": 20721 + }, + { + "epoch": 4.420221843003413, + "grad_norm": 0.17918948393374, + "learning_rate": 5.755865195478046e-05, + "loss": 0.9175, + "step": 20722 + }, + { + "epoch": 4.420435153583618, + "grad_norm": 0.23462117114946518, + "learning_rate": 5.755048399105836e-05, + "loss": 0.9017, + "step": 20723 + }, + { + "epoch": 4.420648464163823, + "grad_norm": 0.20626553018816154, + "learning_rate": 5.754231628135294e-05, + "loss": 0.9226, + "step": 20724 + }, + { + "epoch": 4.4208617747440275, + "grad_norm": 0.19409097520410654, + "learning_rate": 5.753414882575657e-05, + "loss": 0.9084, + "step": 20725 + }, + { + "epoch": 4.421075085324232, + "grad_norm": 0.21000422531676147, + "learning_rate": 5.75259816243617e-05, + "loss": 0.8991, + "step": 20726 + }, + { + "epoch": 4.421288395904437, + "grad_norm": 0.20885725171201908, + "learning_rate": 5.751781467726071e-05, + "loss": 0.8945, + "step": 20727 + }, + { + "epoch": 4.421501706484642, + "grad_norm": 0.18958333901907962, + "learning_rate": 5.750964798454604e-05, + "loss": 0.9044, + "step": 20728 + }, + { + "epoch": 4.421715017064846, + "grad_norm": 0.20608132879176244, + "learning_rate": 5.7501481546310094e-05, + "loss": 0.8773, + "step": 20729 + }, + { + "epoch": 4.421928327645051, + "grad_norm": 0.16641398136525146, + "learning_rate": 5.7493315362645225e-05, + "loss": 0.9025, + "step": 20730 + }, + { + "epoch": 4.422141638225256, + "grad_norm": 0.2420004672204304, + "learning_rate": 5.74851494336439e-05, + "loss": 0.9011, + "step": 20731 + }, + { + "epoch": 4.422354948805461, + "grad_norm": 0.16361415976930138, + "learning_rate": 5.7476983759398484e-05, + "loss": 0.9016, + "step": 20732 + }, + { + "epoch": 4.422568259385666, + "grad_norm": 0.1923264172385189, + "learning_rate": 5.746881834000136e-05, + "loss": 0.8965, + "step": 20733 + }, + { + "epoch": 4.422781569965871, + "grad_norm": 0.1997699581603998, + "learning_rate": 5.746065317554495e-05, + "loss": 0.9442, + "step": 20734 + }, + { + "epoch": 4.422994880546075, + "grad_norm": 0.1685615834905963, + "learning_rate": 5.745248826612162e-05, + "loss": 0.9112, + "step": 20735 + }, + { + "epoch": 4.42320819112628, + "grad_norm": 0.17962269647937407, + "learning_rate": 5.744432361182377e-05, + "loss": 0.9241, + "step": 20736 + }, + { + "epoch": 4.423421501706485, + "grad_norm": 0.18887004200749233, + "learning_rate": 5.743615921274377e-05, + "loss": 0.9136, + "step": 20737 + }, + { + "epoch": 4.423634812286689, + "grad_norm": 0.15297313357828504, + "learning_rate": 5.7427995068974014e-05, + "loss": 0.9126, + "step": 20738 + }, + { + "epoch": 4.423848122866894, + "grad_norm": 0.17370995299652803, + "learning_rate": 5.7419831180606866e-05, + "loss": 0.9069, + "step": 20739 + }, + { + "epoch": 4.424061433447099, + "grad_norm": 0.22278483322787257, + "learning_rate": 5.7411667547734744e-05, + "loss": 0.9031, + "step": 20740 + }, + { + "epoch": 4.424274744027303, + "grad_norm": 0.20028672843043757, + "learning_rate": 5.7403504170449956e-05, + "loss": 0.9014, + "step": 20741 + }, + { + "epoch": 4.424488054607509, + "grad_norm": 0.17125694654807316, + "learning_rate": 5.739534104884488e-05, + "loss": 0.8959, + "step": 20742 + }, + { + "epoch": 4.424701365187714, + "grad_norm": 0.2032317530602496, + "learning_rate": 5.738717818301193e-05, + "loss": 0.8859, + "step": 20743 + }, + { + "epoch": 4.424914675767918, + "grad_norm": 0.23660401716768265, + "learning_rate": 5.7379015573043425e-05, + "loss": 0.9156, + "step": 20744 + }, + { + "epoch": 4.425127986348123, + "grad_norm": 0.16292052794132747, + "learning_rate": 5.737085321903174e-05, + "loss": 0.88, + "step": 20745 + }, + { + "epoch": 4.425341296928328, + "grad_norm": 0.19303125085355, + "learning_rate": 5.736269112106923e-05, + "loss": 0.8813, + "step": 20746 + }, + { + "epoch": 4.425554607508532, + "grad_norm": 0.239522623599738, + "learning_rate": 5.735452927924825e-05, + "loss": 0.8968, + "step": 20747 + }, + { + "epoch": 4.425767918088737, + "grad_norm": 0.2896142468233598, + "learning_rate": 5.734636769366116e-05, + "loss": 0.902, + "step": 20748 + }, + { + "epoch": 4.425981228668942, + "grad_norm": 0.21627394360869426, + "learning_rate": 5.733820636440028e-05, + "loss": 0.9013, + "step": 20749 + }, + { + "epoch": 4.426194539249146, + "grad_norm": 0.16314649864750058, + "learning_rate": 5.7330045291558e-05, + "loss": 0.8998, + "step": 20750 + }, + { + "epoch": 4.426407849829351, + "grad_norm": 0.22473571931138878, + "learning_rate": 5.7321884475226656e-05, + "loss": 0.9128, + "step": 20751 + }, + { + "epoch": 4.426621160409557, + "grad_norm": 0.18187831236077173, + "learning_rate": 5.731372391549853e-05, + "loss": 0.8651, + "step": 20752 + }, + { + "epoch": 4.426834470989761, + "grad_norm": 0.20782463286520908, + "learning_rate": 5.730556361246603e-05, + "loss": 0.8751, + "step": 20753 + }, + { + "epoch": 4.427047781569966, + "grad_norm": 0.21386616971648087, + "learning_rate": 5.729740356622143e-05, + "loss": 0.8929, + "step": 20754 + }, + { + "epoch": 4.427261092150171, + "grad_norm": 0.23672823654722605, + "learning_rate": 5.728924377685712e-05, + "loss": 0.9361, + "step": 20755 + }, + { + "epoch": 4.427474402730375, + "grad_norm": 0.1815269614808984, + "learning_rate": 5.7281084244465376e-05, + "loss": 0.9186, + "step": 20756 + }, + { + "epoch": 4.42768771331058, + "grad_norm": 0.19104081793512914, + "learning_rate": 5.727292496913855e-05, + "loss": 0.8989, + "step": 20757 + }, + { + "epoch": 4.427901023890785, + "grad_norm": 0.2549123658521883, + "learning_rate": 5.7264765950968964e-05, + "loss": 0.9131, + "step": 20758 + }, + { + "epoch": 4.4281143344709895, + "grad_norm": 0.18734263835982748, + "learning_rate": 5.725660719004893e-05, + "loss": 0.8935, + "step": 20759 + }, + { + "epoch": 4.428327645051194, + "grad_norm": 0.2143993467422317, + "learning_rate": 5.7248448686470775e-05, + "loss": 0.8829, + "step": 20760 + }, + { + "epoch": 4.4285409556314, + "grad_norm": 0.25573536181037254, + "learning_rate": 5.724029044032679e-05, + "loss": 0.913, + "step": 20761 + }, + { + "epoch": 4.428754266211604, + "grad_norm": 0.27097515709127845, + "learning_rate": 5.723213245170934e-05, + "loss": 0.8988, + "step": 20762 + }, + { + "epoch": 4.428967576791809, + "grad_norm": 0.18082225860209933, + "learning_rate": 5.722397472071067e-05, + "loss": 0.9046, + "step": 20763 + }, + { + "epoch": 4.429180887372014, + "grad_norm": 0.19629854751448486, + "learning_rate": 5.7215817247423096e-05, + "loss": 0.8847, + "step": 20764 + }, + { + "epoch": 4.4293941979522184, + "grad_norm": 0.21704858081780473, + "learning_rate": 5.720766003193896e-05, + "loss": 0.8898, + "step": 20765 + }, + { + "epoch": 4.429607508532423, + "grad_norm": 0.19024267549973292, + "learning_rate": 5.71995030743505e-05, + "loss": 0.9089, + "step": 20766 + }, + { + "epoch": 4.429820819112628, + "grad_norm": 0.1590610606519438, + "learning_rate": 5.719134637475007e-05, + "loss": 0.9513, + "step": 20767 + }, + { + "epoch": 4.4300341296928325, + "grad_norm": 0.24546544710709609, + "learning_rate": 5.718318993322994e-05, + "loss": 0.9042, + "step": 20768 + }, + { + "epoch": 4.430247440273037, + "grad_norm": 0.2612908284962224, + "learning_rate": 5.717503374988239e-05, + "loss": 0.8888, + "step": 20769 + }, + { + "epoch": 4.430460750853243, + "grad_norm": 0.21868507720844324, + "learning_rate": 5.716687782479972e-05, + "loss": 0.9238, + "step": 20770 + }, + { + "epoch": 4.430674061433447, + "grad_norm": 0.16290974143608553, + "learning_rate": 5.71587221580742e-05, + "loss": 0.8986, + "step": 20771 + }, + { + "epoch": 4.430887372013652, + "grad_norm": 0.22588796862854899, + "learning_rate": 5.7150566749798154e-05, + "loss": 0.9092, + "step": 20772 + }, + { + "epoch": 4.431100682593857, + "grad_norm": 0.21179072548922245, + "learning_rate": 5.7142411600063795e-05, + "loss": 0.9096, + "step": 20773 + }, + { + "epoch": 4.4313139931740615, + "grad_norm": 0.22695493450948265, + "learning_rate": 5.713425670896347e-05, + "loss": 0.9185, + "step": 20774 + }, + { + "epoch": 4.431527303754266, + "grad_norm": 0.17844509638415312, + "learning_rate": 5.71261020765894e-05, + "loss": 0.9234, + "step": 20775 + }, + { + "epoch": 4.431740614334471, + "grad_norm": 0.22668616160980123, + "learning_rate": 5.7117947703033856e-05, + "loss": 0.8783, + "step": 20776 + }, + { + "epoch": 4.4319539249146755, + "grad_norm": 0.21884250594227542, + "learning_rate": 5.710979358838913e-05, + "loss": 0.9182, + "step": 20777 + }, + { + "epoch": 4.43216723549488, + "grad_norm": 0.17687335060654433, + "learning_rate": 5.710163973274746e-05, + "loss": 0.9138, + "step": 20778 + }, + { + "epoch": 4.432380546075085, + "grad_norm": 0.220152261093573, + "learning_rate": 5.7093486136201136e-05, + "loss": 0.8961, + "step": 20779 + }, + { + "epoch": 4.4325938566552905, + "grad_norm": 0.20879063268517392, + "learning_rate": 5.7085332798842405e-05, + "loss": 0.9046, + "step": 20780 + }, + { + "epoch": 4.432807167235495, + "grad_norm": 0.22202112564478896, + "learning_rate": 5.707717972076351e-05, + "loss": 0.8862, + "step": 20781 + }, + { + "epoch": 4.4330204778157, + "grad_norm": 0.20082515339351295, + "learning_rate": 5.706902690205672e-05, + "loss": 0.9236, + "step": 20782 + }, + { + "epoch": 4.4332337883959045, + "grad_norm": 0.19252377833162712, + "learning_rate": 5.7060874342814254e-05, + "loss": 0.9302, + "step": 20783 + }, + { + "epoch": 4.433447098976109, + "grad_norm": 0.3259805192933008, + "learning_rate": 5.70527220431284e-05, + "loss": 0.9198, + "step": 20784 + }, + { + "epoch": 4.433660409556314, + "grad_norm": 0.23001519878876883, + "learning_rate": 5.704457000309141e-05, + "loss": 0.9161, + "step": 20785 + }, + { + "epoch": 4.433873720136519, + "grad_norm": 0.18778641859537396, + "learning_rate": 5.703641822279545e-05, + "loss": 0.932, + "step": 20786 + }, + { + "epoch": 4.434087030716723, + "grad_norm": 0.2981041843907388, + "learning_rate": 5.702826670233283e-05, + "loss": 0.9301, + "step": 20787 + }, + { + "epoch": 4.434300341296928, + "grad_norm": 0.2545611499409301, + "learning_rate": 5.702011544179574e-05, + "loss": 0.8985, + "step": 20788 + }, + { + "epoch": 4.434513651877133, + "grad_norm": 0.18011084242814232, + "learning_rate": 5.701196444127644e-05, + "loss": 0.8717, + "step": 20789 + }, + { + "epoch": 4.434726962457338, + "grad_norm": 0.2846281639442336, + "learning_rate": 5.700381370086714e-05, + "loss": 0.8895, + "step": 20790 + }, + { + "epoch": 4.434940273037543, + "grad_norm": 0.28819369348737744, + "learning_rate": 5.699566322066009e-05, + "loss": 0.9084, + "step": 20791 + }, + { + "epoch": 4.435153583617748, + "grad_norm": 0.20865890503183865, + "learning_rate": 5.6987513000747495e-05, + "loss": 0.9033, + "step": 20792 + }, + { + "epoch": 4.435366894197952, + "grad_norm": 0.18971442139194525, + "learning_rate": 5.697936304122156e-05, + "loss": 0.9064, + "step": 20793 + }, + { + "epoch": 4.435580204778157, + "grad_norm": 0.2798817759760903, + "learning_rate": 5.6971213342174534e-05, + "loss": 0.873, + "step": 20794 + }, + { + "epoch": 4.435793515358362, + "grad_norm": 0.3411776186109747, + "learning_rate": 5.6963063903698613e-05, + "loss": 0.8982, + "step": 20795 + }, + { + "epoch": 4.436006825938566, + "grad_norm": 0.22185337959978557, + "learning_rate": 5.695491472588603e-05, + "loss": 0.9129, + "step": 20796 + }, + { + "epoch": 4.436220136518771, + "grad_norm": 0.21010043115283783, + "learning_rate": 5.694676580882896e-05, + "loss": 0.8734, + "step": 20797 + }, + { + "epoch": 4.436433447098976, + "grad_norm": 0.2738355736499211, + "learning_rate": 5.693861715261961e-05, + "loss": 0.8717, + "step": 20798 + }, + { + "epoch": 4.436646757679181, + "grad_norm": 0.1937088447447295, + "learning_rate": 5.6930468757350216e-05, + "loss": 0.9338, + "step": 20799 + }, + { + "epoch": 4.436860068259386, + "grad_norm": 0.1860471576450958, + "learning_rate": 5.6922320623112934e-05, + "loss": 0.8941, + "step": 20800 + }, + { + "epoch": 4.437073378839591, + "grad_norm": 0.20990729985751036, + "learning_rate": 5.691417275e-05, + "loss": 0.8958, + "step": 20801 + }, + { + "epoch": 4.437286689419795, + "grad_norm": 0.2055080315182677, + "learning_rate": 5.690602513810357e-05, + "loss": 0.8922, + "step": 20802 + }, + { + "epoch": 4.4375, + "grad_norm": 0.16567396105755935, + "learning_rate": 5.689787778751587e-05, + "loss": 0.8902, + "step": 20803 + }, + { + "epoch": 4.437713310580205, + "grad_norm": 0.18732211468445345, + "learning_rate": 5.688973069832908e-05, + "loss": 0.9042, + "step": 20804 + }, + { + "epoch": 4.437926621160409, + "grad_norm": 0.2135948936145696, + "learning_rate": 5.688158387063535e-05, + "loss": 0.9194, + "step": 20805 + }, + { + "epoch": 4.438139931740614, + "grad_norm": 0.18678911533582346, + "learning_rate": 5.687343730452691e-05, + "loss": 0.8966, + "step": 20806 + }, + { + "epoch": 4.438353242320819, + "grad_norm": 0.21976485882029298, + "learning_rate": 5.6865291000095926e-05, + "loss": 0.8776, + "step": 20807 + }, + { + "epoch": 4.438566552901024, + "grad_norm": 0.15147388528405192, + "learning_rate": 5.685714495743454e-05, + "loss": 0.9133, + "step": 20808 + }, + { + "epoch": 4.438779863481229, + "grad_norm": 0.1835652065804719, + "learning_rate": 5.684899917663495e-05, + "loss": 0.8979, + "step": 20809 + }, + { + "epoch": 4.438993174061434, + "grad_norm": 0.17055049662619912, + "learning_rate": 5.6840853657789325e-05, + "loss": 0.9134, + "step": 20810 + }, + { + "epoch": 4.439206484641638, + "grad_norm": 0.19523381332872025, + "learning_rate": 5.683270840098983e-05, + "loss": 0.9107, + "step": 20811 + }, + { + "epoch": 4.439419795221843, + "grad_norm": 0.14630843887377967, + "learning_rate": 5.682456340632863e-05, + "loss": 0.887, + "step": 20812 + }, + { + "epoch": 4.439633105802048, + "grad_norm": 0.18786054332737517, + "learning_rate": 5.681641867389789e-05, + "loss": 0.8804, + "step": 20813 + }, + { + "epoch": 4.439846416382252, + "grad_norm": 0.15038241832551777, + "learning_rate": 5.680827420378975e-05, + "loss": 0.9098, + "step": 20814 + }, + { + "epoch": 4.440059726962457, + "grad_norm": 0.1775899558993131, + "learning_rate": 5.6800129996096385e-05, + "loss": 0.9107, + "step": 20815 + }, + { + "epoch": 4.440273037542662, + "grad_norm": 0.1718976503999234, + "learning_rate": 5.679198605090995e-05, + "loss": 0.8848, + "step": 20816 + }, + { + "epoch": 4.440486348122867, + "grad_norm": 0.1605454673998409, + "learning_rate": 5.6783842368322565e-05, + "loss": 0.9087, + "step": 20817 + }, + { + "epoch": 4.440699658703072, + "grad_norm": 0.21498031041662596, + "learning_rate": 5.6775698948426424e-05, + "loss": 0.9037, + "step": 20818 + }, + { + "epoch": 4.440912969283277, + "grad_norm": 0.18582111806945606, + "learning_rate": 5.676755579131362e-05, + "loss": 0.8974, + "step": 20819 + }, + { + "epoch": 4.441126279863481, + "grad_norm": 0.18317322229752123, + "learning_rate": 5.675941289707631e-05, + "loss": 0.8949, + "step": 20820 + }, + { + "epoch": 4.441339590443686, + "grad_norm": 0.21716062941710984, + "learning_rate": 5.6751270265806634e-05, + "loss": 0.8927, + "step": 20821 + }, + { + "epoch": 4.441552901023891, + "grad_norm": 0.17486032891832218, + "learning_rate": 5.674312789759672e-05, + "loss": 0.9082, + "step": 20822 + }, + { + "epoch": 4.4417662116040955, + "grad_norm": 0.18511798271295096, + "learning_rate": 5.673498579253872e-05, + "loss": 0.9168, + "step": 20823 + }, + { + "epoch": 4.4419795221843, + "grad_norm": 0.1878589941720404, + "learning_rate": 5.672684395072473e-05, + "loss": 0.8974, + "step": 20824 + }, + { + "epoch": 4.442192832764505, + "grad_norm": 0.15967244350043358, + "learning_rate": 5.6718702372246906e-05, + "loss": 0.9066, + "step": 20825 + }, + { + "epoch": 4.4424061433447095, + "grad_norm": 0.1874953246537096, + "learning_rate": 5.671056105719734e-05, + "loss": 0.915, + "step": 20826 + }, + { + "epoch": 4.442619453924915, + "grad_norm": 0.17196398152699602, + "learning_rate": 5.6702420005668184e-05, + "loss": 0.906, + "step": 20827 + }, + { + "epoch": 4.44283276450512, + "grad_norm": 0.15062973371184385, + "learning_rate": 5.669427921775154e-05, + "loss": 0.8962, + "step": 20828 + }, + { + "epoch": 4.4430460750853245, + "grad_norm": 0.17165479008864104, + "learning_rate": 5.6686138693539535e-05, + "loss": 0.9229, + "step": 20829 + }, + { + "epoch": 4.443259385665529, + "grad_norm": 0.16910912465775294, + "learning_rate": 5.6677998433124236e-05, + "loss": 0.9135, + "step": 20830 + }, + { + "epoch": 4.443472696245734, + "grad_norm": 0.15255539206091687, + "learning_rate": 5.666985843659778e-05, + "loss": 0.8928, + "step": 20831 + }, + { + "epoch": 4.4436860068259385, + "grad_norm": 0.18087480319617535, + "learning_rate": 5.666171870405226e-05, + "loss": 0.9203, + "step": 20832 + }, + { + "epoch": 4.443899317406143, + "grad_norm": 0.16999524457540222, + "learning_rate": 5.6653579235579804e-05, + "loss": 0.8625, + "step": 20833 + }, + { + "epoch": 4.444112627986348, + "grad_norm": 0.17775116386789774, + "learning_rate": 5.6645440031272464e-05, + "loss": 0.8704, + "step": 20834 + }, + { + "epoch": 4.444325938566553, + "grad_norm": 0.2001871943938236, + "learning_rate": 5.6637301091222396e-05, + "loss": 0.8986, + "step": 20835 + }, + { + "epoch": 4.444539249146757, + "grad_norm": 0.20362712429675445, + "learning_rate": 5.662916241552163e-05, + "loss": 0.895, + "step": 20836 + }, + { + "epoch": 4.444752559726963, + "grad_norm": 0.170275925004011, + "learning_rate": 5.66210240042623e-05, + "loss": 0.9105, + "step": 20837 + }, + { + "epoch": 4.4449658703071675, + "grad_norm": 0.16124379529208907, + "learning_rate": 5.661288585753647e-05, + "loss": 0.8671, + "step": 20838 + }, + { + "epoch": 4.445179180887372, + "grad_norm": 0.19809952941279052, + "learning_rate": 5.6604747975436236e-05, + "loss": 0.8912, + "step": 20839 + }, + { + "epoch": 4.445392491467577, + "grad_norm": 0.16836156912498895, + "learning_rate": 5.659661035805368e-05, + "loss": 0.8976, + "step": 20840 + }, + { + "epoch": 4.4456058020477816, + "grad_norm": 0.16686703258667582, + "learning_rate": 5.658847300548089e-05, + "loss": 0.8775, + "step": 20841 + }, + { + "epoch": 4.445819112627986, + "grad_norm": 0.1826709589084152, + "learning_rate": 5.658033591780988e-05, + "loss": 0.9119, + "step": 20842 + }, + { + "epoch": 4.446032423208191, + "grad_norm": 0.16995965274501468, + "learning_rate": 5.657219909513279e-05, + "loss": 0.8981, + "step": 20843 + }, + { + "epoch": 4.446245733788396, + "grad_norm": 0.15116249293827172, + "learning_rate": 5.6564062537541635e-05, + "loss": 0.9253, + "step": 20844 + }, + { + "epoch": 4.4464590443686, + "grad_norm": 0.17838058417685165, + "learning_rate": 5.655592624512852e-05, + "loss": 0.9137, + "step": 20845 + }, + { + "epoch": 4.446672354948806, + "grad_norm": 0.16155125450987937, + "learning_rate": 5.654779021798549e-05, + "loss": 0.888, + "step": 20846 + }, + { + "epoch": 4.4468856655290105, + "grad_norm": 0.1491291078841316, + "learning_rate": 5.653965445620461e-05, + "loss": 0.8958, + "step": 20847 + }, + { + "epoch": 4.447098976109215, + "grad_norm": 0.1906022344307465, + "learning_rate": 5.653151895987793e-05, + "loss": 0.8973, + "step": 20848 + }, + { + "epoch": 4.44731228668942, + "grad_norm": 0.17200685943045455, + "learning_rate": 5.6523383729097516e-05, + "loss": 0.8543, + "step": 20849 + }, + { + "epoch": 4.447525597269625, + "grad_norm": 0.179122702735344, + "learning_rate": 5.65152487639554e-05, + "loss": 0.8756, + "step": 20850 + }, + { + "epoch": 4.447738907849829, + "grad_norm": 0.22441550473471655, + "learning_rate": 5.650711406454365e-05, + "loss": 0.8915, + "step": 20851 + }, + { + "epoch": 4.447952218430034, + "grad_norm": 0.13951310326553193, + "learning_rate": 5.6498979630954326e-05, + "loss": 0.8932, + "step": 20852 + }, + { + "epoch": 4.448165529010239, + "grad_norm": 0.22409804505649733, + "learning_rate": 5.649084546327942e-05, + "loss": 0.8897, + "step": 20853 + }, + { + "epoch": 4.448378839590443, + "grad_norm": 0.21892353521774727, + "learning_rate": 5.648271156161098e-05, + "loss": 0.8677, + "step": 20854 + }, + { + "epoch": 4.448592150170649, + "grad_norm": 0.18279789005808975, + "learning_rate": 5.6474577926041064e-05, + "loss": 0.9168, + "step": 20855 + }, + { + "epoch": 4.448805460750854, + "grad_norm": 0.2017977182598186, + "learning_rate": 5.646644455666168e-05, + "loss": 0.9127, + "step": 20856 + }, + { + "epoch": 4.449018771331058, + "grad_norm": 0.15508058593190946, + "learning_rate": 5.645831145356489e-05, + "loss": 0.901, + "step": 20857 + }, + { + "epoch": 4.449232081911263, + "grad_norm": 0.18557648047705178, + "learning_rate": 5.645017861684269e-05, + "loss": 0.9168, + "step": 20858 + }, + { + "epoch": 4.449445392491468, + "grad_norm": 0.17682850643466203, + "learning_rate": 5.644204604658712e-05, + "loss": 0.889, + "step": 20859 + }, + { + "epoch": 4.449658703071672, + "grad_norm": 0.15146820020400573, + "learning_rate": 5.643391374289019e-05, + "loss": 0.8782, + "step": 20860 + }, + { + "epoch": 4.449872013651877, + "grad_norm": 0.22520453009568475, + "learning_rate": 5.6425781705843925e-05, + "loss": 0.8886, + "step": 20861 + }, + { + "epoch": 4.450085324232082, + "grad_norm": 0.18107302153651508, + "learning_rate": 5.641764993554034e-05, + "loss": 0.882, + "step": 20862 + }, + { + "epoch": 4.450298634812286, + "grad_norm": 0.18553930434032667, + "learning_rate": 5.640951843207148e-05, + "loss": 0.8739, + "step": 20863 + }, + { + "epoch": 4.450511945392491, + "grad_norm": 0.17702105844765226, + "learning_rate": 5.6401387195529265e-05, + "loss": 0.9397, + "step": 20864 + }, + { + "epoch": 4.450725255972697, + "grad_norm": 0.16110339471620913, + "learning_rate": 5.639325622600577e-05, + "loss": 0.9192, + "step": 20865 + }, + { + "epoch": 4.450938566552901, + "grad_norm": 0.17700864989736612, + "learning_rate": 5.6385125523592954e-05, + "loss": 0.9341, + "step": 20866 + }, + { + "epoch": 4.451151877133106, + "grad_norm": 0.18450460812791278, + "learning_rate": 5.637699508838287e-05, + "loss": 0.9247, + "step": 20867 + }, + { + "epoch": 4.451365187713311, + "grad_norm": 0.16213715369141732, + "learning_rate": 5.6368864920467454e-05, + "loss": 0.895, + "step": 20868 + }, + { + "epoch": 4.451578498293515, + "grad_norm": 0.16030854339442813, + "learning_rate": 5.636073501993876e-05, + "loss": 0.9037, + "step": 20869 + }, + { + "epoch": 4.45179180887372, + "grad_norm": 0.18618552904496366, + "learning_rate": 5.635260538688872e-05, + "loss": 0.9247, + "step": 20870 + }, + { + "epoch": 4.452005119453925, + "grad_norm": 0.16657680198634636, + "learning_rate": 5.6344476021409364e-05, + "loss": 0.9, + "step": 20871 + }, + { + "epoch": 4.452218430034129, + "grad_norm": 0.13958942016563605, + "learning_rate": 5.633634692359265e-05, + "loss": 0.8838, + "step": 20872 + }, + { + "epoch": 4.452431740614334, + "grad_norm": 0.2351603609078008, + "learning_rate": 5.632821809353058e-05, + "loss": 0.8948, + "step": 20873 + }, + { + "epoch": 4.452645051194539, + "grad_norm": 0.16361506494045042, + "learning_rate": 5.632008953131514e-05, + "loss": 0.9312, + "step": 20874 + }, + { + "epoch": 4.452858361774744, + "grad_norm": 0.12601620076499356, + "learning_rate": 5.6311961237038266e-05, + "loss": 0.8892, + "step": 20875 + }, + { + "epoch": 4.453071672354949, + "grad_norm": 0.14687449964492477, + "learning_rate": 5.630383321079193e-05, + "loss": 0.9048, + "step": 20876 + }, + { + "epoch": 4.453284982935154, + "grad_norm": 0.135851958683499, + "learning_rate": 5.629570545266815e-05, + "loss": 0.9044, + "step": 20877 + }, + { + "epoch": 4.453498293515358, + "grad_norm": 0.16223552452428316, + "learning_rate": 5.6287577962758834e-05, + "loss": 0.9222, + "step": 20878 + }, + { + "epoch": 4.453711604095563, + "grad_norm": 0.17092811175027686, + "learning_rate": 5.627945074115598e-05, + "loss": 0.9018, + "step": 20879 + }, + { + "epoch": 4.453924914675768, + "grad_norm": 0.2002225848988954, + "learning_rate": 5.627132378795153e-05, + "loss": 0.89, + "step": 20880 + }, + { + "epoch": 4.4541382252559725, + "grad_norm": 0.15843152711036532, + "learning_rate": 5.626319710323747e-05, + "loss": 0.9005, + "step": 20881 + }, + { + "epoch": 4.454351535836177, + "grad_norm": 0.2020800998889744, + "learning_rate": 5.625507068710573e-05, + "loss": 0.9054, + "step": 20882 + }, + { + "epoch": 4.454564846416382, + "grad_norm": 0.16307517066077304, + "learning_rate": 5.624694453964826e-05, + "loss": 0.8653, + "step": 20883 + }, + { + "epoch": 4.454778156996587, + "grad_norm": 0.1896564704800219, + "learning_rate": 5.6238818660957004e-05, + "loss": 0.8929, + "step": 20884 + }, + { + "epoch": 4.454991467576792, + "grad_norm": 0.19595691216727407, + "learning_rate": 5.6230693051123965e-05, + "loss": 0.8847, + "step": 20885 + }, + { + "epoch": 4.455204778156997, + "grad_norm": 0.1782373751558197, + "learning_rate": 5.6222567710240984e-05, + "loss": 0.9063, + "step": 20886 + }, + { + "epoch": 4.4554180887372015, + "grad_norm": 0.15403190665925348, + "learning_rate": 5.621444263840007e-05, + "loss": 0.888, + "step": 20887 + }, + { + "epoch": 4.455631399317406, + "grad_norm": 0.15281078778654544, + "learning_rate": 5.6206317835693105e-05, + "loss": 0.8817, + "step": 20888 + }, + { + "epoch": 4.455844709897611, + "grad_norm": 0.16110202170980872, + "learning_rate": 5.6198193302212087e-05, + "loss": 0.8887, + "step": 20889 + }, + { + "epoch": 4.4560580204778155, + "grad_norm": 0.152513868259878, + "learning_rate": 5.619006903804889e-05, + "loss": 0.8691, + "step": 20890 + }, + { + "epoch": 4.45627133105802, + "grad_norm": 0.20092118371935747, + "learning_rate": 5.618194504329548e-05, + "loss": 0.898, + "step": 20891 + }, + { + "epoch": 4.456484641638225, + "grad_norm": 0.20250629678700374, + "learning_rate": 5.617382131804375e-05, + "loss": 0.8856, + "step": 20892 + }, + { + "epoch": 4.4566979522184305, + "grad_norm": 0.16933196679191978, + "learning_rate": 5.616569786238564e-05, + "loss": 0.8766, + "step": 20893 + }, + { + "epoch": 4.456911262798635, + "grad_norm": 0.17210716782085778, + "learning_rate": 5.615757467641304e-05, + "loss": 0.907, + "step": 20894 + }, + { + "epoch": 4.45712457337884, + "grad_norm": 0.1964516759109809, + "learning_rate": 5.614945176021792e-05, + "loss": 0.9054, + "step": 20895 + }, + { + "epoch": 4.4573378839590445, + "grad_norm": 0.19747243755837976, + "learning_rate": 5.6141329113892114e-05, + "loss": 0.9074, + "step": 20896 + }, + { + "epoch": 4.457551194539249, + "grad_norm": 0.19221429565723155, + "learning_rate": 5.613320673752764e-05, + "loss": 0.9054, + "step": 20897 + }, + { + "epoch": 4.457764505119454, + "grad_norm": 0.26718102438743074, + "learning_rate": 5.612508463121627e-05, + "loss": 0.8893, + "step": 20898 + }, + { + "epoch": 4.457977815699659, + "grad_norm": 0.21939168957921382, + "learning_rate": 5.6116962795049994e-05, + "loss": 0.9274, + "step": 20899 + }, + { + "epoch": 4.458191126279863, + "grad_norm": 0.17571604954388742, + "learning_rate": 5.610884122912066e-05, + "loss": 0.9374, + "step": 20900 + }, + { + "epoch": 4.458404436860068, + "grad_norm": 0.2033850336198857, + "learning_rate": 5.610071993352021e-05, + "loss": 0.8673, + "step": 20901 + }, + { + "epoch": 4.4586177474402735, + "grad_norm": 0.16325279859408034, + "learning_rate": 5.60925989083405e-05, + "loss": 0.8889, + "step": 20902 + }, + { + "epoch": 4.458831058020478, + "grad_norm": 0.22811906371219234, + "learning_rate": 5.6084478153673455e-05, + "loss": 0.8961, + "step": 20903 + }, + { + "epoch": 4.459044368600683, + "grad_norm": 0.16056571110933285, + "learning_rate": 5.6076357669610925e-05, + "loss": 0.9127, + "step": 20904 + }, + { + "epoch": 4.459257679180888, + "grad_norm": 0.2583841773208901, + "learning_rate": 5.606823745624483e-05, + "loss": 0.8863, + "step": 20905 + }, + { + "epoch": 4.459470989761092, + "grad_norm": 0.20355302457492522, + "learning_rate": 5.6060117513667005e-05, + "loss": 0.9014, + "step": 20906 + }, + { + "epoch": 4.459684300341297, + "grad_norm": 0.21680928304035707, + "learning_rate": 5.605199784196938e-05, + "loss": 0.9074, + "step": 20907 + }, + { + "epoch": 4.459897610921502, + "grad_norm": 0.22358516914280885, + "learning_rate": 5.604387844124381e-05, + "loss": 0.9055, + "step": 20908 + }, + { + "epoch": 4.460110921501706, + "grad_norm": 0.1832093939760536, + "learning_rate": 5.6035759311582145e-05, + "loss": 0.8729, + "step": 20909 + }, + { + "epoch": 4.460324232081911, + "grad_norm": 0.23687696051834842, + "learning_rate": 5.602764045307625e-05, + "loss": 0.8756, + "step": 20910 + }, + { + "epoch": 4.460537542662116, + "grad_norm": 0.26878304905943545, + "learning_rate": 5.6019521865818016e-05, + "loss": 0.8838, + "step": 20911 + }, + { + "epoch": 4.460750853242321, + "grad_norm": 0.18104559582637145, + "learning_rate": 5.6011403549899284e-05, + "loss": 0.9246, + "step": 20912 + }, + { + "epoch": 4.460964163822526, + "grad_norm": 0.2628906094297275, + "learning_rate": 5.600328550541195e-05, + "loss": 0.9014, + "step": 20913 + }, + { + "epoch": 4.461177474402731, + "grad_norm": 0.22572110153639832, + "learning_rate": 5.599516773244782e-05, + "loss": 0.8959, + "step": 20914 + }, + { + "epoch": 4.461390784982935, + "grad_norm": 0.18539983756835315, + "learning_rate": 5.5987050231098785e-05, + "loss": 0.9118, + "step": 20915 + }, + { + "epoch": 4.46160409556314, + "grad_norm": 0.20710499994478188, + "learning_rate": 5.5978933001456663e-05, + "loss": 0.9085, + "step": 20916 + }, + { + "epoch": 4.461817406143345, + "grad_norm": 0.1731010277609166, + "learning_rate": 5.597081604361334e-05, + "loss": 0.9202, + "step": 20917 + }, + { + "epoch": 4.462030716723549, + "grad_norm": 0.2053576945934668, + "learning_rate": 5.596269935766062e-05, + "loss": 0.9207, + "step": 20918 + }, + { + "epoch": 4.462244027303754, + "grad_norm": 0.2143379281925529, + "learning_rate": 5.5954582943690396e-05, + "loss": 0.8873, + "step": 20919 + }, + { + "epoch": 4.462457337883959, + "grad_norm": 0.17446489779293847, + "learning_rate": 5.5946466801794446e-05, + "loss": 0.8663, + "step": 20920 + }, + { + "epoch": 4.462670648464163, + "grad_norm": 0.16830604427439586, + "learning_rate": 5.5938350932064636e-05, + "loss": 0.9244, + "step": 20921 + }, + { + "epoch": 4.462883959044369, + "grad_norm": 0.17923307475041247, + "learning_rate": 5.593023533459278e-05, + "loss": 0.9069, + "step": 20922 + }, + { + "epoch": 4.463097269624574, + "grad_norm": 0.19257394895878543, + "learning_rate": 5.592212000947072e-05, + "loss": 0.8773, + "step": 20923 + }, + { + "epoch": 4.463310580204778, + "grad_norm": 0.15011277133818202, + "learning_rate": 5.591400495679026e-05, + "loss": 0.8792, + "step": 20924 + }, + { + "epoch": 4.463523890784983, + "grad_norm": 0.20580100560501016, + "learning_rate": 5.590589017664326e-05, + "loss": 0.8967, + "step": 20925 + }, + { + "epoch": 4.463737201365188, + "grad_norm": 0.21533887071755406, + "learning_rate": 5.58977756691215e-05, + "loss": 0.9125, + "step": 20926 + }, + { + "epoch": 4.463950511945392, + "grad_norm": 0.13751765905140942, + "learning_rate": 5.588966143431684e-05, + "loss": 0.8967, + "step": 20927 + }, + { + "epoch": 4.464163822525597, + "grad_norm": 0.2428979707698114, + "learning_rate": 5.5881547472321034e-05, + "loss": 0.8932, + "step": 20928 + }, + { + "epoch": 4.464377133105802, + "grad_norm": 0.18855312147740855, + "learning_rate": 5.587343378322596e-05, + "loss": 0.8981, + "step": 20929 + }, + { + "epoch": 4.464590443686006, + "grad_norm": 0.16836839795261477, + "learning_rate": 5.586532036712339e-05, + "loss": 0.9049, + "step": 20930 + }, + { + "epoch": 4.464803754266212, + "grad_norm": 0.194007462783793, + "learning_rate": 5.585720722410509e-05, + "loss": 0.8987, + "step": 20931 + }, + { + "epoch": 4.465017064846417, + "grad_norm": 0.16179080687722017, + "learning_rate": 5.584909435426293e-05, + "loss": 0.9115, + "step": 20932 + }, + { + "epoch": 4.465230375426621, + "grad_norm": 0.13707881930882582, + "learning_rate": 5.5840981757688655e-05, + "loss": 0.878, + "step": 20933 + }, + { + "epoch": 4.465443686006826, + "grad_norm": 0.1591150314097079, + "learning_rate": 5.583286943447407e-05, + "loss": 0.9058, + "step": 20934 + }, + { + "epoch": 4.465656996587031, + "grad_norm": 0.17580818057237357, + "learning_rate": 5.5824757384710984e-05, + "loss": 0.9031, + "step": 20935 + }, + { + "epoch": 4.465870307167235, + "grad_norm": 0.15425932139929532, + "learning_rate": 5.581664560849117e-05, + "loss": 0.8841, + "step": 20936 + }, + { + "epoch": 4.46608361774744, + "grad_norm": 0.20835734409355094, + "learning_rate": 5.580853410590642e-05, + "loss": 0.8699, + "step": 20937 + }, + { + "epoch": 4.466296928327645, + "grad_norm": 0.21697737336619083, + "learning_rate": 5.580042287704851e-05, + "loss": 0.9069, + "step": 20938 + }, + { + "epoch": 4.4665102389078495, + "grad_norm": 0.16935648048817278, + "learning_rate": 5.5792311922009235e-05, + "loss": 0.8852, + "step": 20939 + }, + { + "epoch": 4.466723549488055, + "grad_norm": 0.19422852290690254, + "learning_rate": 5.578420124088034e-05, + "loss": 0.894, + "step": 20940 + }, + { + "epoch": 4.46693686006826, + "grad_norm": 0.2339147538174954, + "learning_rate": 5.577609083375365e-05, + "loss": 0.902, + "step": 20941 + }, + { + "epoch": 4.467150170648464, + "grad_norm": 0.25959499314548684, + "learning_rate": 5.576798070072087e-05, + "loss": 0.9142, + "step": 20942 + }, + { + "epoch": 4.467363481228669, + "grad_norm": 0.1787324577253613, + "learning_rate": 5.57598708418738e-05, + "loss": 0.8907, + "step": 20943 + }, + { + "epoch": 4.467576791808874, + "grad_norm": 0.17021601126271707, + "learning_rate": 5.5751761257304205e-05, + "loss": 0.8817, + "step": 20944 + }, + { + "epoch": 4.4677901023890785, + "grad_norm": 0.21054570943504872, + "learning_rate": 5.574365194710384e-05, + "loss": 0.9045, + "step": 20945 + }, + { + "epoch": 4.468003412969283, + "grad_norm": 0.1882575163044737, + "learning_rate": 5.5735542911364446e-05, + "loss": 0.8866, + "step": 20946 + }, + { + "epoch": 4.468216723549488, + "grad_norm": 0.14301874942792256, + "learning_rate": 5.572743415017781e-05, + "loss": 0.9177, + "step": 20947 + }, + { + "epoch": 4.4684300341296925, + "grad_norm": 0.18847738824200533, + "learning_rate": 5.5719325663635646e-05, + "loss": 0.9072, + "step": 20948 + }, + { + "epoch": 4.468643344709897, + "grad_norm": 0.19348481235236553, + "learning_rate": 5.571121745182973e-05, + "loss": 0.9128, + "step": 20949 + }, + { + "epoch": 4.468856655290103, + "grad_norm": 0.19172050335219018, + "learning_rate": 5.570310951485179e-05, + "loss": 0.8759, + "step": 20950 + }, + { + "epoch": 4.4690699658703075, + "grad_norm": 0.1549754458288463, + "learning_rate": 5.569500185279359e-05, + "loss": 0.9427, + "step": 20951 + }, + { + "epoch": 4.469283276450512, + "grad_norm": 0.2393529926211253, + "learning_rate": 5.568689446574683e-05, + "loss": 0.8697, + "step": 20952 + }, + { + "epoch": 4.469496587030717, + "grad_norm": 0.2533286788270828, + "learning_rate": 5.5678787353803315e-05, + "loss": 0.8716, + "step": 20953 + }, + { + "epoch": 4.4697098976109215, + "grad_norm": 0.18652776312239458, + "learning_rate": 5.5670680517054695e-05, + "loss": 0.9028, + "step": 20954 + }, + { + "epoch": 4.469923208191126, + "grad_norm": 0.16560879674902895, + "learning_rate": 5.5662573955592746e-05, + "loss": 0.9165, + "step": 20955 + }, + { + "epoch": 4.470136518771331, + "grad_norm": 0.18788095381074393, + "learning_rate": 5.565446766950917e-05, + "loss": 0.9007, + "step": 20956 + }, + { + "epoch": 4.470349829351536, + "grad_norm": 0.24735102696599157, + "learning_rate": 5.564636165889571e-05, + "loss": 0.9033, + "step": 20957 + }, + { + "epoch": 4.47056313993174, + "grad_norm": 0.2333014016561511, + "learning_rate": 5.5638255923844066e-05, + "loss": 0.9131, + "step": 20958 + }, + { + "epoch": 4.470776450511945, + "grad_norm": 0.14209066395268355, + "learning_rate": 5.563015046444598e-05, + "loss": 0.9244, + "step": 20959 + }, + { + "epoch": 4.4709897610921505, + "grad_norm": 0.17992342540120893, + "learning_rate": 5.5622045280793137e-05, + "loss": 0.8942, + "step": 20960 + }, + { + "epoch": 4.471203071672355, + "grad_norm": 0.1710449998574779, + "learning_rate": 5.561394037297727e-05, + "loss": 0.8834, + "step": 20961 + }, + { + "epoch": 4.47141638225256, + "grad_norm": 0.15498664647407656, + "learning_rate": 5.5605835741090076e-05, + "loss": 0.9136, + "step": 20962 + }, + { + "epoch": 4.471629692832765, + "grad_norm": 0.19074897824047213, + "learning_rate": 5.559773138522326e-05, + "loss": 0.9114, + "step": 20963 + }, + { + "epoch": 4.471843003412969, + "grad_norm": 0.22609926721947962, + "learning_rate": 5.558962730546855e-05, + "loss": 0.884, + "step": 20964 + }, + { + "epoch": 4.472056313993174, + "grad_norm": 0.16371438120453297, + "learning_rate": 5.5581523501917585e-05, + "loss": 0.9089, + "step": 20965 + }, + { + "epoch": 4.472269624573379, + "grad_norm": 0.1804833740271737, + "learning_rate": 5.557341997466211e-05, + "loss": 0.9017, + "step": 20966 + }, + { + "epoch": 4.472482935153583, + "grad_norm": 0.21483593267701104, + "learning_rate": 5.556531672379379e-05, + "loss": 0.8875, + "step": 20967 + }, + { + "epoch": 4.472696245733788, + "grad_norm": 0.1657694979125311, + "learning_rate": 5.5557213749404315e-05, + "loss": 0.8712, + "step": 20968 + }, + { + "epoch": 4.472909556313994, + "grad_norm": 0.17116772878922068, + "learning_rate": 5.554911105158539e-05, + "loss": 0.9287, + "step": 20969 + }, + { + "epoch": 4.473122866894198, + "grad_norm": 0.20956386447421443, + "learning_rate": 5.554100863042866e-05, + "loss": 0.9161, + "step": 20970 + }, + { + "epoch": 4.473336177474403, + "grad_norm": 0.17289339815708737, + "learning_rate": 5.553290648602586e-05, + "loss": 0.8866, + "step": 20971 + }, + { + "epoch": 4.473549488054608, + "grad_norm": 0.1898538490585646, + "learning_rate": 5.5524804618468606e-05, + "loss": 0.9257, + "step": 20972 + }, + { + "epoch": 4.473762798634812, + "grad_norm": 0.1654938951729583, + "learning_rate": 5.551670302784863e-05, + "loss": 0.9056, + "step": 20973 + }, + { + "epoch": 4.473976109215017, + "grad_norm": 0.17042977115388425, + "learning_rate": 5.550860171425754e-05, + "loss": 0.922, + "step": 20974 + }, + { + "epoch": 4.474189419795222, + "grad_norm": 0.138672923362406, + "learning_rate": 5.550050067778708e-05, + "loss": 0.8954, + "step": 20975 + }, + { + "epoch": 4.474402730375426, + "grad_norm": 0.17221146661754508, + "learning_rate": 5.5492399918528845e-05, + "loss": 0.8996, + "step": 20976 + }, + { + "epoch": 4.474616040955631, + "grad_norm": 0.17825942202372208, + "learning_rate": 5.54842994365745e-05, + "loss": 0.8993, + "step": 20977 + }, + { + "epoch": 4.474829351535837, + "grad_norm": 0.16940621171919554, + "learning_rate": 5.5476199232015734e-05, + "loss": 0.9242, + "step": 20978 + }, + { + "epoch": 4.475042662116041, + "grad_norm": 0.14068996553889784, + "learning_rate": 5.5468099304944196e-05, + "loss": 0.9245, + "step": 20979 + }, + { + "epoch": 4.475255972696246, + "grad_norm": 0.16665606713469286, + "learning_rate": 5.5459999655451494e-05, + "loss": 0.9046, + "step": 20980 + }, + { + "epoch": 4.475469283276451, + "grad_norm": 0.16542225020996537, + "learning_rate": 5.545190028362934e-05, + "loss": 0.8964, + "step": 20981 + }, + { + "epoch": 4.475682593856655, + "grad_norm": 0.14617269722465434, + "learning_rate": 5.544380118956932e-05, + "loss": 0.8882, + "step": 20982 + }, + { + "epoch": 4.47589590443686, + "grad_norm": 0.1691047308312596, + "learning_rate": 5.543570237336312e-05, + "loss": 0.8753, + "step": 20983 + }, + { + "epoch": 4.476109215017065, + "grad_norm": 0.12849415173385653, + "learning_rate": 5.542760383510235e-05, + "loss": 0.8968, + "step": 20984 + }, + { + "epoch": 4.476322525597269, + "grad_norm": 0.19596592569609067, + "learning_rate": 5.541950557487866e-05, + "loss": 0.8941, + "step": 20985 + }, + { + "epoch": 4.476535836177474, + "grad_norm": 0.1737394975777477, + "learning_rate": 5.541140759278371e-05, + "loss": 0.8686, + "step": 20986 + }, + { + "epoch": 4.476749146757679, + "grad_norm": 0.17158155166326602, + "learning_rate": 5.540330988890907e-05, + "loss": 0.9086, + "step": 20987 + }, + { + "epoch": 4.476962457337884, + "grad_norm": 0.18923223252345775, + "learning_rate": 5.539521246334639e-05, + "loss": 0.9138, + "step": 20988 + }, + { + "epoch": 4.477175767918089, + "grad_norm": 0.5227155283223988, + "learning_rate": 5.5387115316187273e-05, + "loss": 0.9162, + "step": 20989 + }, + { + "epoch": 4.477389078498294, + "grad_norm": 0.20214203411758677, + "learning_rate": 5.5379018447523395e-05, + "loss": 0.9046, + "step": 20990 + }, + { + "epoch": 4.477602389078498, + "grad_norm": 0.20198648058674726, + "learning_rate": 5.5370921857446336e-05, + "loss": 0.9007, + "step": 20991 + }, + { + "epoch": 4.477815699658703, + "grad_norm": 0.19037523639760187, + "learning_rate": 5.536282554604768e-05, + "loss": 0.9122, + "step": 20992 + }, + { + "epoch": 4.478029010238908, + "grad_norm": 0.21778141136484483, + "learning_rate": 5.5354729513419094e-05, + "loss": 0.8889, + "step": 20993 + }, + { + "epoch": 4.478242320819112, + "grad_norm": 0.24376609019306972, + "learning_rate": 5.534663375965214e-05, + "loss": 0.8759, + "step": 20994 + }, + { + "epoch": 4.478455631399317, + "grad_norm": 0.19864596205966384, + "learning_rate": 5.533853828483846e-05, + "loss": 0.8915, + "step": 20995 + }, + { + "epoch": 4.478668941979522, + "grad_norm": 0.1807845190593868, + "learning_rate": 5.533044308906961e-05, + "loss": 0.9102, + "step": 20996 + }, + { + "epoch": 4.4788822525597265, + "grad_norm": 0.25936060795476157, + "learning_rate": 5.5322348172437255e-05, + "loss": 0.9201, + "step": 20997 + }, + { + "epoch": 4.479095563139932, + "grad_norm": 0.2055699563868357, + "learning_rate": 5.531425353503291e-05, + "loss": 0.9077, + "step": 20998 + }, + { + "epoch": 4.479308873720137, + "grad_norm": 0.216180988644024, + "learning_rate": 5.530615917694821e-05, + "loss": 0.8941, + "step": 20999 + }, + { + "epoch": 4.479522184300341, + "grad_norm": 0.22565087778993026, + "learning_rate": 5.529806509827474e-05, + "loss": 0.8964, + "step": 21000 + }, + { + "epoch": 4.479735494880546, + "grad_norm": 0.18520605089727277, + "learning_rate": 5.528997129910406e-05, + "loss": 0.9164, + "step": 21001 + }, + { + "epoch": 4.479948805460751, + "grad_norm": 0.2276822567562948, + "learning_rate": 5.528187777952779e-05, + "loss": 0.9133, + "step": 21002 + }, + { + "epoch": 4.4801621160409555, + "grad_norm": 0.2161253008594147, + "learning_rate": 5.5273784539637476e-05, + "loss": 0.8987, + "step": 21003 + }, + { + "epoch": 4.48037542662116, + "grad_norm": 0.18086424848008167, + "learning_rate": 5.5265691579524707e-05, + "loss": 0.8814, + "step": 21004 + }, + { + "epoch": 4.480588737201365, + "grad_norm": 0.18279552924114353, + "learning_rate": 5.5257598899281055e-05, + "loss": 0.9388, + "step": 21005 + }, + { + "epoch": 4.4808020477815695, + "grad_norm": 0.22107296578699256, + "learning_rate": 5.524950649899809e-05, + "loss": 0.889, + "step": 21006 + }, + { + "epoch": 4.481015358361775, + "grad_norm": 0.21113762786565604, + "learning_rate": 5.524141437876738e-05, + "loss": 0.9179, + "step": 21007 + }, + { + "epoch": 4.48122866894198, + "grad_norm": 0.16828072546986095, + "learning_rate": 5.5233322538680495e-05, + "loss": 0.8962, + "step": 21008 + }, + { + "epoch": 4.4814419795221845, + "grad_norm": 0.2243521273320677, + "learning_rate": 5.522523097882896e-05, + "loss": 0.8921, + "step": 21009 + }, + { + "epoch": 4.481655290102389, + "grad_norm": 0.19890900396490485, + "learning_rate": 5.521713969930436e-05, + "loss": 0.9192, + "step": 21010 + }, + { + "epoch": 4.481868600682594, + "grad_norm": 0.20402837227314302, + "learning_rate": 5.5209048700198225e-05, + "loss": 0.9237, + "step": 21011 + }, + { + "epoch": 4.4820819112627985, + "grad_norm": 0.17304981662969304, + "learning_rate": 5.520095798160214e-05, + "loss": 0.8835, + "step": 21012 + }, + { + "epoch": 4.482295221843003, + "grad_norm": 0.22849831145413937, + "learning_rate": 5.519286754360762e-05, + "loss": 0.9032, + "step": 21013 + }, + { + "epoch": 4.482508532423208, + "grad_norm": 0.1626076649619388, + "learning_rate": 5.518477738630624e-05, + "loss": 0.8894, + "step": 21014 + }, + { + "epoch": 4.482721843003413, + "grad_norm": 0.21208626380247037, + "learning_rate": 5.517668750978951e-05, + "loss": 0.9017, + "step": 21015 + }, + { + "epoch": 4.482935153583618, + "grad_norm": 0.2063247386019735, + "learning_rate": 5.516859791414897e-05, + "loss": 0.9012, + "step": 21016 + }, + { + "epoch": 4.483148464163823, + "grad_norm": 0.18184987746550008, + "learning_rate": 5.5160508599476174e-05, + "loss": 0.9175, + "step": 21017 + }, + { + "epoch": 4.4833617747440275, + "grad_norm": 0.1724034413756537, + "learning_rate": 5.5152419565862626e-05, + "loss": 0.8818, + "step": 21018 + }, + { + "epoch": 4.483575085324232, + "grad_norm": 0.1898111079351743, + "learning_rate": 5.5144330813399887e-05, + "loss": 0.9242, + "step": 21019 + }, + { + "epoch": 4.483788395904437, + "grad_norm": 1.2903932746157478, + "learning_rate": 5.513624234217949e-05, + "loss": 0.9013, + "step": 21020 + }, + { + "epoch": 4.484001706484642, + "grad_norm": 0.21014801889368787, + "learning_rate": 5.512815415229288e-05, + "loss": 0.8934, + "step": 21021 + }, + { + "epoch": 4.484215017064846, + "grad_norm": 0.309715336783883, + "learning_rate": 5.5120066243831635e-05, + "loss": 0.9102, + "step": 21022 + }, + { + "epoch": 4.484428327645051, + "grad_norm": 0.24805105549808143, + "learning_rate": 5.5111978616887255e-05, + "loss": 0.9208, + "step": 21023 + }, + { + "epoch": 4.484641638225256, + "grad_norm": 0.2749871833098527, + "learning_rate": 5.5103891271551277e-05, + "loss": 0.8999, + "step": 21024 + }, + { + "epoch": 4.484854948805461, + "grad_norm": 0.20562606580200268, + "learning_rate": 5.509580420791515e-05, + "loss": 0.8856, + "step": 21025 + }, + { + "epoch": 4.485068259385666, + "grad_norm": 0.29570832633116756, + "learning_rate": 5.508771742607046e-05, + "loss": 0.9083, + "step": 21026 + }, + { + "epoch": 4.485281569965871, + "grad_norm": 0.24386695172913356, + "learning_rate": 5.507963092610865e-05, + "loss": 0.9102, + "step": 21027 + }, + { + "epoch": 4.485494880546075, + "grad_norm": 0.21993433593194237, + "learning_rate": 5.507154470812122e-05, + "loss": 0.8865, + "step": 21028 + }, + { + "epoch": 4.48570819112628, + "grad_norm": 0.20181444957280076, + "learning_rate": 5.5063458772199705e-05, + "loss": 0.8949, + "step": 21029 + }, + { + "epoch": 4.485921501706485, + "grad_norm": 0.2472732898473638, + "learning_rate": 5.5055373118435556e-05, + "loss": 0.8949, + "step": 21030 + }, + { + "epoch": 4.486134812286689, + "grad_norm": 0.19561307658081248, + "learning_rate": 5.50472877469203e-05, + "loss": 0.9231, + "step": 21031 + }, + { + "epoch": 4.486348122866894, + "grad_norm": 0.1730710476160895, + "learning_rate": 5.5039202657745395e-05, + "loss": 0.9317, + "step": 21032 + }, + { + "epoch": 4.486561433447099, + "grad_norm": 0.1943771630162677, + "learning_rate": 5.503111785100233e-05, + "loss": 0.8865, + "step": 21033 + }, + { + "epoch": 4.486774744027303, + "grad_norm": 0.1800423332130355, + "learning_rate": 5.502303332678258e-05, + "loss": 0.8888, + "step": 21034 + }, + { + "epoch": 4.486988054607509, + "grad_norm": 0.18685004448925213, + "learning_rate": 5.501494908517762e-05, + "loss": 0.8997, + "step": 21035 + }, + { + "epoch": 4.487201365187714, + "grad_norm": 0.1964477375313758, + "learning_rate": 5.500686512627895e-05, + "loss": 0.8928, + "step": 21036 + }, + { + "epoch": 4.487414675767918, + "grad_norm": 0.22432067508960005, + "learning_rate": 5.4998781450177995e-05, + "loss": 0.9161, + "step": 21037 + }, + { + "epoch": 4.487627986348123, + "grad_norm": 0.1923794133304841, + "learning_rate": 5.499069805696628e-05, + "loss": 0.9094, + "step": 21038 + }, + { + "epoch": 4.487841296928328, + "grad_norm": 0.15496685755594217, + "learning_rate": 5.498261494673523e-05, + "loss": 0.908, + "step": 21039 + }, + { + "epoch": 4.488054607508532, + "grad_norm": 0.19126153485249156, + "learning_rate": 5.49745321195763e-05, + "loss": 0.905, + "step": 21040 + }, + { + "epoch": 4.488267918088737, + "grad_norm": 0.16913978377304137, + "learning_rate": 5.496644957558097e-05, + "loss": 0.9076, + "step": 21041 + }, + { + "epoch": 4.488481228668942, + "grad_norm": 0.15494372263586872, + "learning_rate": 5.4958367314840705e-05, + "loss": 0.9142, + "step": 21042 + }, + { + "epoch": 4.488694539249146, + "grad_norm": 0.17586738515242908, + "learning_rate": 5.49502853374469e-05, + "loss": 0.9029, + "step": 21043 + }, + { + "epoch": 4.488907849829351, + "grad_norm": 0.201486014469851, + "learning_rate": 5.4942203643491064e-05, + "loss": 0.916, + "step": 21044 + }, + { + "epoch": 4.489121160409557, + "grad_norm": 0.16396451157426647, + "learning_rate": 5.4934122233064585e-05, + "loss": 0.9227, + "step": 21045 + }, + { + "epoch": 4.489334470989761, + "grad_norm": 0.18258940580982255, + "learning_rate": 5.492604110625895e-05, + "loss": 0.9107, + "step": 21046 + }, + { + "epoch": 4.489547781569966, + "grad_norm": 0.19054902321169007, + "learning_rate": 5.491796026316557e-05, + "loss": 0.9027, + "step": 21047 + }, + { + "epoch": 4.489761092150171, + "grad_norm": 0.16263186882768263, + "learning_rate": 5.490987970387592e-05, + "loss": 0.8992, + "step": 21048 + }, + { + "epoch": 4.489974402730375, + "grad_norm": 0.1702208090592537, + "learning_rate": 5.4901799428481375e-05, + "loss": 0.913, + "step": 21049 + }, + { + "epoch": 4.49018771331058, + "grad_norm": 0.1569456211798378, + "learning_rate": 5.48937194370734e-05, + "loss": 0.9017, + "step": 21050 + }, + { + "epoch": 4.490401023890785, + "grad_norm": 0.1782039991776634, + "learning_rate": 5.488563972974342e-05, + "loss": 0.8861, + "step": 21051 + }, + { + "epoch": 4.4906143344709895, + "grad_norm": 0.1389814811738734, + "learning_rate": 5.487756030658284e-05, + "loss": 0.8828, + "step": 21052 + }, + { + "epoch": 4.490827645051194, + "grad_norm": 0.21035061859316534, + "learning_rate": 5.486948116768311e-05, + "loss": 0.9342, + "step": 21053 + }, + { + "epoch": 4.4910409556314, + "grad_norm": 0.14739744942182517, + "learning_rate": 5.4861402313135614e-05, + "loss": 0.8962, + "step": 21054 + }, + { + "epoch": 4.491254266211604, + "grad_norm": 0.19107138070722832, + "learning_rate": 5.485332374303175e-05, + "loss": 0.9273, + "step": 21055 + }, + { + "epoch": 4.491467576791809, + "grad_norm": 0.15891611934936062, + "learning_rate": 5.484524545746297e-05, + "loss": 0.9017, + "step": 21056 + }, + { + "epoch": 4.491680887372014, + "grad_norm": 0.19527504470268042, + "learning_rate": 5.4837167456520645e-05, + "loss": 0.907, + "step": 21057 + }, + { + "epoch": 4.4918941979522184, + "grad_norm": 0.16174467875264056, + "learning_rate": 5.482908974029621e-05, + "loss": 0.916, + "step": 21058 + }, + { + "epoch": 4.492107508532423, + "grad_norm": 0.20130223558859336, + "learning_rate": 5.482101230888103e-05, + "loss": 0.9001, + "step": 21059 + }, + { + "epoch": 4.492320819112628, + "grad_norm": 0.1756495732979336, + "learning_rate": 5.481293516236654e-05, + "loss": 0.9045, + "step": 21060 + }, + { + "epoch": 4.4925341296928325, + "grad_norm": 0.1765098045180678, + "learning_rate": 5.4804858300844086e-05, + "loss": 0.8957, + "step": 21061 + }, + { + "epoch": 4.492747440273037, + "grad_norm": 0.17260559039131973, + "learning_rate": 5.479678172440512e-05, + "loss": 0.9198, + "step": 21062 + }, + { + "epoch": 4.492960750853243, + "grad_norm": 0.1957734832907157, + "learning_rate": 5.478870543314097e-05, + "loss": 0.8968, + "step": 21063 + }, + { + "epoch": 4.493174061433447, + "grad_norm": 0.1621492991716509, + "learning_rate": 5.478062942714308e-05, + "loss": 0.9301, + "step": 21064 + }, + { + "epoch": 4.493387372013652, + "grad_norm": 0.1636404357664064, + "learning_rate": 5.477255370650274e-05, + "loss": 0.907, + "step": 21065 + }, + { + "epoch": 4.493600682593857, + "grad_norm": 0.18304860449945398, + "learning_rate": 5.476447827131141e-05, + "loss": 0.8827, + "step": 21066 + }, + { + "epoch": 4.4938139931740615, + "grad_norm": 0.20141694284394454, + "learning_rate": 5.4756403121660404e-05, + "loss": 0.8908, + "step": 21067 + }, + { + "epoch": 4.494027303754266, + "grad_norm": 0.16435002977167892, + "learning_rate": 5.4748328257641146e-05, + "loss": 0.8944, + "step": 21068 + }, + { + "epoch": 4.494240614334471, + "grad_norm": 0.1904752307210373, + "learning_rate": 5.474025367934496e-05, + "loss": 0.8937, + "step": 21069 + }, + { + "epoch": 4.4944539249146755, + "grad_norm": 0.1965052021105525, + "learning_rate": 5.4732179386863244e-05, + "loss": 0.9045, + "step": 21070 + }, + { + "epoch": 4.49466723549488, + "grad_norm": 0.18130319597148528, + "learning_rate": 5.4724105380287334e-05, + "loss": 0.8868, + "step": 21071 + }, + { + "epoch": 4.494880546075085, + "grad_norm": 0.17415522161868024, + "learning_rate": 5.47160316597086e-05, + "loss": 0.9238, + "step": 21072 + }, + { + "epoch": 4.4950938566552905, + "grad_norm": 0.19622410312224742, + "learning_rate": 5.470795822521839e-05, + "loss": 0.9149, + "step": 21073 + }, + { + "epoch": 4.495307167235495, + "grad_norm": 0.17143487252544548, + "learning_rate": 5.4699885076908064e-05, + "loss": 0.9026, + "step": 21074 + }, + { + "epoch": 4.4955204778157, + "grad_norm": 0.16451721595613653, + "learning_rate": 5.4691812214868965e-05, + "loss": 0.9221, + "step": 21075 + }, + { + "epoch": 4.4957337883959045, + "grad_norm": 0.19925713892154007, + "learning_rate": 5.468373963919246e-05, + "loss": 0.914, + "step": 21076 + }, + { + "epoch": 4.495947098976109, + "grad_norm": 0.17740617508704426, + "learning_rate": 5.467566734996983e-05, + "loss": 0.9049, + "step": 21077 + }, + { + "epoch": 4.496160409556314, + "grad_norm": 0.17329127188453292, + "learning_rate": 5.466759534729246e-05, + "loss": 0.8543, + "step": 21078 + }, + { + "epoch": 4.496373720136519, + "grad_norm": 0.1526727902133574, + "learning_rate": 5.4659523631251674e-05, + "loss": 0.8914, + "step": 21079 + }, + { + "epoch": 4.496587030716723, + "grad_norm": 0.17511448406678864, + "learning_rate": 5.465145220193881e-05, + "loss": 0.8968, + "step": 21080 + }, + { + "epoch": 4.496800341296928, + "grad_norm": 0.24446752963533971, + "learning_rate": 5.464338105944517e-05, + "loss": 0.9029, + "step": 21081 + }, + { + "epoch": 4.497013651877133, + "grad_norm": 0.17992961601717777, + "learning_rate": 5.463531020386212e-05, + "loss": 0.8962, + "step": 21082 + }, + { + "epoch": 4.497226962457338, + "grad_norm": 0.17143855872818345, + "learning_rate": 5.4627239635280956e-05, + "loss": 0.9023, + "step": 21083 + }, + { + "epoch": 4.497440273037543, + "grad_norm": 0.204846626323696, + "learning_rate": 5.4619169353793e-05, + "loss": 0.9085, + "step": 21084 + }, + { + "epoch": 4.497653583617748, + "grad_norm": 0.21472760814315806, + "learning_rate": 5.4611099359489565e-05, + "loss": 0.8925, + "step": 21085 + }, + { + "epoch": 4.497866894197952, + "grad_norm": 0.15177560839419518, + "learning_rate": 5.460302965246198e-05, + "loss": 0.9228, + "step": 21086 + }, + { + "epoch": 4.498080204778157, + "grad_norm": 0.1922326383241882, + "learning_rate": 5.459496023280156e-05, + "loss": 0.9421, + "step": 21087 + }, + { + "epoch": 4.498293515358362, + "grad_norm": 0.18366559231686252, + "learning_rate": 5.458689110059959e-05, + "loss": 0.918, + "step": 21088 + }, + { + "epoch": 4.498506825938566, + "grad_norm": 0.16130265273574546, + "learning_rate": 5.457882225594734e-05, + "loss": 0.9074, + "step": 21089 + }, + { + "epoch": 4.498720136518771, + "grad_norm": 0.2057377214243835, + "learning_rate": 5.457075369893616e-05, + "loss": 0.8929, + "step": 21090 + }, + { + "epoch": 4.498933447098976, + "grad_norm": 0.19949502570101196, + "learning_rate": 5.456268542965732e-05, + "loss": 0.8941, + "step": 21091 + }, + { + "epoch": 4.499146757679181, + "grad_norm": 0.14771154005212084, + "learning_rate": 5.455461744820214e-05, + "loss": 0.8858, + "step": 21092 + }, + { + "epoch": 4.499360068259386, + "grad_norm": 0.18057755564644976, + "learning_rate": 5.454654975466187e-05, + "loss": 0.9061, + "step": 21093 + }, + { + "epoch": 4.499573378839591, + "grad_norm": 0.1755951927232757, + "learning_rate": 5.4538482349127835e-05, + "loss": 0.9194, + "step": 21094 + }, + { + "epoch": 4.499786689419795, + "grad_norm": 0.18358833257753296, + "learning_rate": 5.453041523169129e-05, + "loss": 0.8995, + "step": 21095 + }, + { + "epoch": 4.5, + "grad_norm": 0.1755833708468824, + "learning_rate": 5.452234840244354e-05, + "loss": 0.8934, + "step": 21096 + }, + { + "epoch": 4.500213310580205, + "grad_norm": 0.22453930093266306, + "learning_rate": 5.451428186147581e-05, + "loss": 0.893, + "step": 21097 + }, + { + "epoch": 4.500426621160409, + "grad_norm": 0.24255598433259973, + "learning_rate": 5.450621560887948e-05, + "loss": 0.9121, + "step": 21098 + }, + { + "epoch": 4.500639931740614, + "grad_norm": 0.15086665954970796, + "learning_rate": 5.44981496447457e-05, + "loss": 0.8946, + "step": 21099 + }, + { + "epoch": 4.500853242320819, + "grad_norm": 0.1988133633434748, + "learning_rate": 5.449008396916579e-05, + "loss": 0.894, + "step": 21100 + }, + { + "epoch": 4.501066552901024, + "grad_norm": 0.2099821821765104, + "learning_rate": 5.4482018582231014e-05, + "loss": 0.8968, + "step": 21101 + }, + { + "epoch": 4.501279863481229, + "grad_norm": 0.1483644410345662, + "learning_rate": 5.4473953484032626e-05, + "loss": 0.8886, + "step": 21102 + }, + { + "epoch": 4.501493174061434, + "grad_norm": 0.18321184591937223, + "learning_rate": 5.446588867466187e-05, + "loss": 0.9357, + "step": 21103 + }, + { + "epoch": 4.501706484641638, + "grad_norm": 0.20067646977772322, + "learning_rate": 5.4457824154210023e-05, + "loss": 0.8698, + "step": 21104 + }, + { + "epoch": 4.501919795221843, + "grad_norm": 0.19610245466475623, + "learning_rate": 5.4449759922768314e-05, + "loss": 0.8853, + "step": 21105 + }, + { + "epoch": 4.502133105802048, + "grad_norm": 0.153381841564804, + "learning_rate": 5.444169598042802e-05, + "loss": 0.9083, + "step": 21106 + }, + { + "epoch": 4.502346416382252, + "grad_norm": 0.20596715727825995, + "learning_rate": 5.443363232728035e-05, + "loss": 0.9274, + "step": 21107 + }, + { + "epoch": 4.502559726962457, + "grad_norm": 0.20560246981029262, + "learning_rate": 5.442556896341657e-05, + "loss": 0.9203, + "step": 21108 + }, + { + "epoch": 4.502773037542662, + "grad_norm": 0.19789028266561762, + "learning_rate": 5.4417505888927934e-05, + "loss": 0.8949, + "step": 21109 + }, + { + "epoch": 4.502986348122867, + "grad_norm": 0.146374340480137, + "learning_rate": 5.440944310390563e-05, + "loss": 0.9109, + "step": 21110 + }, + { + "epoch": 4.503199658703072, + "grad_norm": 0.23668243795101274, + "learning_rate": 5.4401380608440885e-05, + "loss": 0.9018, + "step": 21111 + }, + { + "epoch": 4.503412969283277, + "grad_norm": 0.22420651939350805, + "learning_rate": 5.439331840262498e-05, + "loss": 0.8981, + "step": 21112 + }, + { + "epoch": 4.503626279863481, + "grad_norm": 0.16758438754078614, + "learning_rate": 5.4385256486549076e-05, + "loss": 0.9, + "step": 21113 + }, + { + "epoch": 4.503839590443686, + "grad_norm": 0.1683000536345334, + "learning_rate": 5.4377194860304444e-05, + "loss": 0.9159, + "step": 21114 + }, + { + "epoch": 4.504052901023891, + "grad_norm": 0.2221085591367428, + "learning_rate": 5.436913352398227e-05, + "loss": 0.9119, + "step": 21115 + }, + { + "epoch": 4.5042662116040955, + "grad_norm": 0.24583601431267946, + "learning_rate": 5.43610724776738e-05, + "loss": 0.9202, + "step": 21116 + }, + { + "epoch": 4.5044795221843, + "grad_norm": 0.16448597146367036, + "learning_rate": 5.43530117214702e-05, + "loss": 0.9055, + "step": 21117 + }, + { + "epoch": 4.504692832764505, + "grad_norm": 0.2273981273325653, + "learning_rate": 5.434495125546273e-05, + "loss": 0.9115, + "step": 21118 + }, + { + "epoch": 4.5049061433447095, + "grad_norm": 0.20962415786230817, + "learning_rate": 5.433689107974255e-05, + "loss": 0.9184, + "step": 21119 + }, + { + "epoch": 4.505119453924914, + "grad_norm": 0.1933621627623657, + "learning_rate": 5.432883119440093e-05, + "loss": 0.9315, + "step": 21120 + }, + { + "epoch": 4.50533276450512, + "grad_norm": 0.2195714998030658, + "learning_rate": 5.432077159952897e-05, + "loss": 0.8849, + "step": 21121 + }, + { + "epoch": 4.5055460750853245, + "grad_norm": 0.2674118870020395, + "learning_rate": 5.431271229521793e-05, + "loss": 0.8916, + "step": 21122 + }, + { + "epoch": 4.505759385665529, + "grad_norm": 0.19458287504521563, + "learning_rate": 5.4304653281558956e-05, + "loss": 0.9198, + "step": 21123 + }, + { + "epoch": 4.505972696245734, + "grad_norm": 0.17125318415934365, + "learning_rate": 5.4296594558643275e-05, + "loss": 0.8865, + "step": 21124 + }, + { + "epoch": 4.5061860068259385, + "grad_norm": 0.25184688854754356, + "learning_rate": 5.4288536126562056e-05, + "loss": 0.9172, + "step": 21125 + }, + { + "epoch": 4.506399317406143, + "grad_norm": 0.22307065324240247, + "learning_rate": 5.4280477985406486e-05, + "loss": 0.9003, + "step": 21126 + }, + { + "epoch": 4.506612627986348, + "grad_norm": 0.19984596791868087, + "learning_rate": 5.4272420135267726e-05, + "loss": 0.8909, + "step": 21127 + }, + { + "epoch": 4.506825938566553, + "grad_norm": 0.20725761266263554, + "learning_rate": 5.426436257623699e-05, + "loss": 0.9221, + "step": 21128 + }, + { + "epoch": 4.507039249146757, + "grad_norm": 0.3148142516372082, + "learning_rate": 5.4256305308405405e-05, + "loss": 0.8962, + "step": 21129 + }, + { + "epoch": 4.507252559726963, + "grad_norm": 0.19149352757455904, + "learning_rate": 5.424824833186417e-05, + "loss": 0.902, + "step": 21130 + }, + { + "epoch": 4.5074658703071675, + "grad_norm": 0.2217773361135215, + "learning_rate": 5.424019164670442e-05, + "loss": 0.861, + "step": 21131 + }, + { + "epoch": 4.507679180887372, + "grad_norm": 0.2653198118176663, + "learning_rate": 5.423213525301741e-05, + "loss": 0.923, + "step": 21132 + }, + { + "epoch": 4.507892491467577, + "grad_norm": 0.26128796853738806, + "learning_rate": 5.422407915089414e-05, + "loss": 0.9093, + "step": 21133 + }, + { + "epoch": 4.5081058020477816, + "grad_norm": 0.16664359576068577, + "learning_rate": 5.4216023340425876e-05, + "loss": 0.8925, + "step": 21134 + }, + { + "epoch": 4.508319112627986, + "grad_norm": 0.25392845030467065, + "learning_rate": 5.420796782170374e-05, + "loss": 0.895, + "step": 21135 + }, + { + "epoch": 4.508532423208191, + "grad_norm": 0.24546413897930128, + "learning_rate": 5.419991259481889e-05, + "loss": 0.8766, + "step": 21136 + }, + { + "epoch": 4.508745733788396, + "grad_norm": 0.17664965324199522, + "learning_rate": 5.4191857659862425e-05, + "loss": 0.9053, + "step": 21137 + }, + { + "epoch": 4.5089590443686, + "grad_norm": 0.2088216008501999, + "learning_rate": 5.4183803016925564e-05, + "loss": 0.8891, + "step": 21138 + }, + { + "epoch": 4.509172354948806, + "grad_norm": 0.20988235619204343, + "learning_rate": 5.417574866609938e-05, + "loss": 0.9135, + "step": 21139 + }, + { + "epoch": 4.5093856655290105, + "grad_norm": 0.18990916668866817, + "learning_rate": 5.416769460747506e-05, + "loss": 0.8811, + "step": 21140 + }, + { + "epoch": 4.509598976109215, + "grad_norm": 0.15116709460554012, + "learning_rate": 5.415964084114368e-05, + "loss": 0.9206, + "step": 21141 + }, + { + "epoch": 4.50981228668942, + "grad_norm": 0.18720022398203306, + "learning_rate": 5.415158736719643e-05, + "loss": 0.9111, + "step": 21142 + }, + { + "epoch": 4.510025597269625, + "grad_norm": 0.1389444465197559, + "learning_rate": 5.4143534185724415e-05, + "loss": 0.8955, + "step": 21143 + }, + { + "epoch": 4.510238907849829, + "grad_norm": 0.21065622948646984, + "learning_rate": 5.4135481296818734e-05, + "loss": 0.8954, + "step": 21144 + }, + { + "epoch": 4.510452218430034, + "grad_norm": 0.18673350435178532, + "learning_rate": 5.412742870057049e-05, + "loss": 0.8836, + "step": 21145 + }, + { + "epoch": 4.510665529010239, + "grad_norm": 0.20571870073274817, + "learning_rate": 5.411937639707084e-05, + "loss": 0.916, + "step": 21146 + }, + { + "epoch": 4.510878839590443, + "grad_norm": 0.19561897717140433, + "learning_rate": 5.411132438641087e-05, + "loss": 0.9082, + "step": 21147 + }, + { + "epoch": 4.511092150170649, + "grad_norm": 0.2404370619275617, + "learning_rate": 5.410327266868172e-05, + "loss": 0.8811, + "step": 21148 + }, + { + "epoch": 4.511305460750854, + "grad_norm": 0.2054418505783604, + "learning_rate": 5.409522124397446e-05, + "loss": 0.8943, + "step": 21149 + }, + { + "epoch": 4.511518771331058, + "grad_norm": 0.17971798678051118, + "learning_rate": 5.4087170112380216e-05, + "loss": 0.8907, + "step": 21150 + }, + { + "epoch": 4.511732081911263, + "grad_norm": 0.22212821517463435, + "learning_rate": 5.4079119273990067e-05, + "loss": 0.8668, + "step": 21151 + }, + { + "epoch": 4.511945392491468, + "grad_norm": 0.1766156368067852, + "learning_rate": 5.407106872889513e-05, + "loss": 0.9365, + "step": 21152 + }, + { + "epoch": 4.512158703071672, + "grad_norm": 0.1702359284893224, + "learning_rate": 5.4063018477186477e-05, + "loss": 0.9251, + "step": 21153 + }, + { + "epoch": 4.512372013651877, + "grad_norm": 0.1601544761289793, + "learning_rate": 5.4054968518955236e-05, + "loss": 0.8811, + "step": 21154 + }, + { + "epoch": 4.512585324232082, + "grad_norm": 0.1792788098773457, + "learning_rate": 5.404691885429245e-05, + "loss": 0.9049, + "step": 21155 + }, + { + "epoch": 4.512798634812286, + "grad_norm": 0.15190798843028586, + "learning_rate": 5.4038869483289214e-05, + "loss": 0.9092, + "step": 21156 + }, + { + "epoch": 4.513011945392492, + "grad_norm": 0.17417745510253618, + "learning_rate": 5.403082040603658e-05, + "loss": 0.9027, + "step": 21157 + }, + { + "epoch": 4.513225255972696, + "grad_norm": 0.1461948269578023, + "learning_rate": 5.402277162262568e-05, + "loss": 0.9086, + "step": 21158 + }, + { + "epoch": 4.513438566552901, + "grad_norm": 0.16319655537975356, + "learning_rate": 5.401472313314753e-05, + "loss": 0.8829, + "step": 21159 + }, + { + "epoch": 4.513651877133106, + "grad_norm": 0.18667664121638236, + "learning_rate": 5.400667493769324e-05, + "loss": 0.8925, + "step": 21160 + }, + { + "epoch": 4.513865187713311, + "grad_norm": 0.1623876671359484, + "learning_rate": 5.399862703635385e-05, + "loss": 0.8937, + "step": 21161 + }, + { + "epoch": 4.514078498293515, + "grad_norm": 0.17402371923519255, + "learning_rate": 5.399057942922044e-05, + "loss": 0.8984, + "step": 21162 + }, + { + "epoch": 4.51429180887372, + "grad_norm": 0.14573118874840577, + "learning_rate": 5.3982532116384056e-05, + "loss": 0.8993, + "step": 21163 + }, + { + "epoch": 4.514505119453925, + "grad_norm": 0.14871274141545407, + "learning_rate": 5.3974485097935774e-05, + "loss": 0.9167, + "step": 21164 + }, + { + "epoch": 4.514718430034129, + "grad_norm": 0.1477367328076374, + "learning_rate": 5.3966438373966635e-05, + "loss": 0.9135, + "step": 21165 + }, + { + "epoch": 4.514931740614334, + "grad_norm": 0.1612139599751264, + "learning_rate": 5.395839194456766e-05, + "loss": 0.8942, + "step": 21166 + }, + { + "epoch": 4.515145051194539, + "grad_norm": 0.16373401977681312, + "learning_rate": 5.3950345809829946e-05, + "loss": 0.8927, + "step": 21167 + }, + { + "epoch": 4.515358361774744, + "grad_norm": 0.14261363853967984, + "learning_rate": 5.394229996984448e-05, + "loss": 0.8997, + "step": 21168 + }, + { + "epoch": 4.515571672354949, + "grad_norm": 0.15668304092849925, + "learning_rate": 5.393425442470234e-05, + "loss": 0.9233, + "step": 21169 + }, + { + "epoch": 4.515784982935154, + "grad_norm": 0.14996876895681027, + "learning_rate": 5.3926209174494544e-05, + "loss": 0.897, + "step": 21170 + }, + { + "epoch": 4.515998293515358, + "grad_norm": 0.17789631334803263, + "learning_rate": 5.391816421931213e-05, + "loss": 0.8909, + "step": 21171 + }, + { + "epoch": 4.516211604095563, + "grad_norm": 0.15938854567132105, + "learning_rate": 5.3910119559246134e-05, + "loss": 0.9205, + "step": 21172 + }, + { + "epoch": 4.516424914675768, + "grad_norm": 0.17816453980971003, + "learning_rate": 5.3902075194387555e-05, + "loss": 0.9085, + "step": 21173 + }, + { + "epoch": 4.5166382252559725, + "grad_norm": 0.15823978474730296, + "learning_rate": 5.3894031124827456e-05, + "loss": 0.9071, + "step": 21174 + }, + { + "epoch": 4.516851535836177, + "grad_norm": 0.16467930314725152, + "learning_rate": 5.388598735065682e-05, + "loss": 0.8811, + "step": 21175 + }, + { + "epoch": 4.517064846416382, + "grad_norm": 0.1445777675711055, + "learning_rate": 5.38779438719667e-05, + "loss": 0.883, + "step": 21176 + }, + { + "epoch": 4.517278156996587, + "grad_norm": 0.19180004307285414, + "learning_rate": 5.3869900688848085e-05, + "loss": 0.9139, + "step": 21177 + }, + { + "epoch": 4.517491467576792, + "grad_norm": 0.17663575272353602, + "learning_rate": 5.386185780139195e-05, + "loss": 0.8874, + "step": 21178 + }, + { + "epoch": 4.517704778156997, + "grad_norm": 0.1916155162645462, + "learning_rate": 5.3853815209689346e-05, + "loss": 0.9256, + "step": 21179 + }, + { + "epoch": 4.5179180887372015, + "grad_norm": 0.1612001317646179, + "learning_rate": 5.384577291383128e-05, + "loss": 0.8961, + "step": 21180 + }, + { + "epoch": 4.518131399317406, + "grad_norm": 0.17861051696870714, + "learning_rate": 5.383773091390871e-05, + "loss": 0.922, + "step": 21181 + }, + { + "epoch": 4.518344709897611, + "grad_norm": 0.18605010575752595, + "learning_rate": 5.3829689210012665e-05, + "loss": 0.899, + "step": 21182 + }, + { + "epoch": 4.5185580204778155, + "grad_norm": 0.24696835217582855, + "learning_rate": 5.382164780223412e-05, + "loss": 0.8934, + "step": 21183 + }, + { + "epoch": 4.51877133105802, + "grad_norm": 0.19121836589483726, + "learning_rate": 5.381360669066409e-05, + "loss": 0.911, + "step": 21184 + }, + { + "epoch": 4.518984641638225, + "grad_norm": 0.1755910559225854, + "learning_rate": 5.380556587539351e-05, + "loss": 0.9092, + "step": 21185 + }, + { + "epoch": 4.5191979522184305, + "grad_norm": 0.18334483584346453, + "learning_rate": 5.379752535651341e-05, + "loss": 0.877, + "step": 21186 + }, + { + "epoch": 4.519411262798635, + "grad_norm": 0.17594760351583574, + "learning_rate": 5.378948513411478e-05, + "loss": 0.9271, + "step": 21187 + }, + { + "epoch": 4.51962457337884, + "grad_norm": 0.24191235413567405, + "learning_rate": 5.378144520828853e-05, + "loss": 0.9042, + "step": 21188 + }, + { + "epoch": 4.5198378839590445, + "grad_norm": 0.18451947450810288, + "learning_rate": 5.3773405579125685e-05, + "loss": 0.8744, + "step": 21189 + }, + { + "epoch": 4.520051194539249, + "grad_norm": 0.1670394815088228, + "learning_rate": 5.376536624671717e-05, + "loss": 0.8916, + "step": 21190 + }, + { + "epoch": 4.520264505119454, + "grad_norm": 0.198289187848074, + "learning_rate": 5.3757327211154e-05, + "loss": 0.8771, + "step": 21191 + }, + { + "epoch": 4.520477815699659, + "grad_norm": 0.15904420825089702, + "learning_rate": 5.374928847252712e-05, + "loss": 0.9008, + "step": 21192 + }, + { + "epoch": 4.520691126279863, + "grad_norm": 0.1970377455086047, + "learning_rate": 5.374125003092745e-05, + "loss": 0.9168, + "step": 21193 + }, + { + "epoch": 4.520904436860068, + "grad_norm": 0.18786294309516224, + "learning_rate": 5.373321188644601e-05, + "loss": 0.9049, + "step": 21194 + }, + { + "epoch": 4.5211177474402735, + "grad_norm": 0.15956684498165516, + "learning_rate": 5.37251740391737e-05, + "loss": 0.9132, + "step": 21195 + }, + { + "epoch": 4.521331058020478, + "grad_norm": 0.1836487768772693, + "learning_rate": 5.3717136489201507e-05, + "loss": 0.9093, + "step": 21196 + }, + { + "epoch": 4.521544368600683, + "grad_norm": 0.13848046148629098, + "learning_rate": 5.3709099236620335e-05, + "loss": 0.9618, + "step": 21197 + }, + { + "epoch": 4.521757679180888, + "grad_norm": 0.21736644358354518, + "learning_rate": 5.370106228152116e-05, + "loss": 0.903, + "step": 21198 + }, + { + "epoch": 4.521970989761092, + "grad_norm": 0.1998547058571313, + "learning_rate": 5.369302562399494e-05, + "loss": 0.8969, + "step": 21199 + }, + { + "epoch": 4.522184300341297, + "grad_norm": 0.16232776966241946, + "learning_rate": 5.368498926413253e-05, + "loss": 0.898, + "step": 21200 + }, + { + "epoch": 4.522397610921502, + "grad_norm": 0.17235775851583754, + "learning_rate": 5.3676953202024926e-05, + "loss": 0.8981, + "step": 21201 + }, + { + "epoch": 4.522610921501706, + "grad_norm": 0.164712902594422, + "learning_rate": 5.366891743776302e-05, + "loss": 0.9122, + "step": 21202 + }, + { + "epoch": 4.522824232081911, + "grad_norm": 0.156847732100595, + "learning_rate": 5.366088197143779e-05, + "loss": 0.8884, + "step": 21203 + }, + { + "epoch": 4.523037542662116, + "grad_norm": 0.1783280613297688, + "learning_rate": 5.365284680314011e-05, + "loss": 0.9273, + "step": 21204 + }, + { + "epoch": 4.52325085324232, + "grad_norm": 0.15642626895667242, + "learning_rate": 5.364481193296089e-05, + "loss": 0.884, + "step": 21205 + }, + { + "epoch": 4.523464163822526, + "grad_norm": 0.17518979035754684, + "learning_rate": 5.3636777360991095e-05, + "loss": 0.9446, + "step": 21206 + }, + { + "epoch": 4.523677474402731, + "grad_norm": 0.17129649885866208, + "learning_rate": 5.362874308732158e-05, + "loss": 0.9057, + "step": 21207 + }, + { + "epoch": 4.523890784982935, + "grad_norm": 0.15901154338957965, + "learning_rate": 5.3620709112043314e-05, + "loss": 0.8894, + "step": 21208 + }, + { + "epoch": 4.52410409556314, + "grad_norm": 0.19002286490700698, + "learning_rate": 5.3612675435247135e-05, + "loss": 0.8912, + "step": 21209 + }, + { + "epoch": 4.524317406143345, + "grad_norm": 0.16580874514080993, + "learning_rate": 5.360464205702402e-05, + "loss": 0.9243, + "step": 21210 + }, + { + "epoch": 4.524530716723549, + "grad_norm": 0.16639555231770156, + "learning_rate": 5.35966089774648e-05, + "loss": 0.8986, + "step": 21211 + }, + { + "epoch": 4.524744027303754, + "grad_norm": 0.16479509590260522, + "learning_rate": 5.358857619666039e-05, + "loss": 0.9147, + "step": 21212 + }, + { + "epoch": 4.524957337883959, + "grad_norm": 0.1679604013604711, + "learning_rate": 5.358054371470168e-05, + "loss": 0.9042, + "step": 21213 + }, + { + "epoch": 4.525170648464163, + "grad_norm": 0.17646299179579053, + "learning_rate": 5.357251153167956e-05, + "loss": 0.9009, + "step": 21214 + }, + { + "epoch": 4.525383959044369, + "grad_norm": 0.1582756179402587, + "learning_rate": 5.356447964768492e-05, + "loss": 0.8953, + "step": 21215 + }, + { + "epoch": 4.525597269624574, + "grad_norm": 0.19883118937779443, + "learning_rate": 5.355644806280866e-05, + "loss": 0.9344, + "step": 21216 + }, + { + "epoch": 4.525810580204778, + "grad_norm": 0.15591335352518784, + "learning_rate": 5.3548416777141603e-05, + "loss": 0.9094, + "step": 21217 + }, + { + "epoch": 4.526023890784983, + "grad_norm": 0.2137697123280138, + "learning_rate": 5.354038579077467e-05, + "loss": 0.9192, + "step": 21218 + }, + { + "epoch": 4.526237201365188, + "grad_norm": 0.15434946030264987, + "learning_rate": 5.353235510379871e-05, + "loss": 0.9029, + "step": 21219 + }, + { + "epoch": 4.526450511945392, + "grad_norm": 0.20226702281572922, + "learning_rate": 5.352432471630461e-05, + "loss": 0.891, + "step": 21220 + }, + { + "epoch": 4.526663822525597, + "grad_norm": 0.19470611277873479, + "learning_rate": 5.3516294628383235e-05, + "loss": 0.924, + "step": 21221 + }, + { + "epoch": 4.526877133105802, + "grad_norm": 0.16012718554065233, + "learning_rate": 5.3508264840125406e-05, + "loss": 0.8898, + "step": 21222 + }, + { + "epoch": 4.527090443686006, + "grad_norm": 0.19224764752308995, + "learning_rate": 5.3500235351622013e-05, + "loss": 0.8602, + "step": 21223 + }, + { + "epoch": 4.527303754266212, + "grad_norm": 0.14414825539717027, + "learning_rate": 5.349220616296389e-05, + "loss": 0.9431, + "step": 21224 + }, + { + "epoch": 4.527517064846417, + "grad_norm": 0.19591040924876935, + "learning_rate": 5.348417727424192e-05, + "loss": 0.876, + "step": 21225 + }, + { + "epoch": 4.527730375426621, + "grad_norm": 0.17141062714314842, + "learning_rate": 5.347614868554692e-05, + "loss": 0.9058, + "step": 21226 + }, + { + "epoch": 4.527943686006826, + "grad_norm": 0.16620453971277813, + "learning_rate": 5.3468120396969735e-05, + "loss": 0.9103, + "step": 21227 + }, + { + "epoch": 4.528156996587031, + "grad_norm": 0.20098477829329656, + "learning_rate": 5.346009240860123e-05, + "loss": 0.8896, + "step": 21228 + }, + { + "epoch": 4.528370307167235, + "grad_norm": 0.13577604062027798, + "learning_rate": 5.345206472053221e-05, + "loss": 0.9045, + "step": 21229 + }, + { + "epoch": 4.52858361774744, + "grad_norm": 0.20700957866434977, + "learning_rate": 5.3444037332853536e-05, + "loss": 0.9226, + "step": 21230 + }, + { + "epoch": 4.528796928327645, + "grad_norm": 0.20415066552585914, + "learning_rate": 5.343601024565601e-05, + "loss": 0.8954, + "step": 21231 + }, + { + "epoch": 4.5290102389078495, + "grad_norm": 0.1813755168214692, + "learning_rate": 5.3427983459030516e-05, + "loss": 0.9355, + "step": 21232 + }, + { + "epoch": 4.529223549488055, + "grad_norm": 0.23109079775546643, + "learning_rate": 5.341995697306782e-05, + "loss": 0.9076, + "step": 21233 + }, + { + "epoch": 4.52943686006826, + "grad_norm": 0.1612552164792885, + "learning_rate": 5.341193078785873e-05, + "loss": 0.9218, + "step": 21234 + }, + { + "epoch": 4.529650170648464, + "grad_norm": 0.2380726105066518, + "learning_rate": 5.340390490349411e-05, + "loss": 0.9224, + "step": 21235 + }, + { + "epoch": 4.529863481228669, + "grad_norm": 0.1597840219273059, + "learning_rate": 5.339587932006474e-05, + "loss": 0.8803, + "step": 21236 + }, + { + "epoch": 4.530076791808874, + "grad_norm": 0.20037097842961218, + "learning_rate": 5.338785403766146e-05, + "loss": 0.9191, + "step": 21237 + }, + { + "epoch": 4.5302901023890785, + "grad_norm": 0.19312028312786741, + "learning_rate": 5.337982905637506e-05, + "loss": 0.9124, + "step": 21238 + }, + { + "epoch": 4.530503412969283, + "grad_norm": 0.17189131415608253, + "learning_rate": 5.3371804376296325e-05, + "loss": 0.913, + "step": 21239 + }, + { + "epoch": 4.530716723549488, + "grad_norm": 0.21586353956048798, + "learning_rate": 5.336377999751609e-05, + "loss": 0.9123, + "step": 21240 + }, + { + "epoch": 4.5309300341296925, + "grad_norm": 0.1806728934620099, + "learning_rate": 5.335575592012511e-05, + "loss": 0.878, + "step": 21241 + }, + { + "epoch": 4.531143344709898, + "grad_norm": 0.1795689952505545, + "learning_rate": 5.3347732144214215e-05, + "loss": 0.8894, + "step": 21242 + }, + { + "epoch": 4.531356655290102, + "grad_norm": 0.19742816359303322, + "learning_rate": 5.33397086698742e-05, + "loss": 0.8883, + "step": 21243 + }, + { + "epoch": 4.5315699658703075, + "grad_norm": 0.14033965389236883, + "learning_rate": 5.33316854971958e-05, + "loss": 0.932, + "step": 21244 + }, + { + "epoch": 4.531783276450512, + "grad_norm": 0.1597627680717712, + "learning_rate": 5.3323662626269844e-05, + "loss": 0.8745, + "step": 21245 + }, + { + "epoch": 4.531996587030717, + "grad_norm": 0.16114447145286392, + "learning_rate": 5.331564005718707e-05, + "loss": 0.8723, + "step": 21246 + }, + { + "epoch": 4.5322098976109215, + "grad_norm": 0.14244253610137833, + "learning_rate": 5.33076177900383e-05, + "loss": 0.8725, + "step": 21247 + }, + { + "epoch": 4.532423208191126, + "grad_norm": 0.17923932075902174, + "learning_rate": 5.329959582491427e-05, + "loss": 0.9198, + "step": 21248 + }, + { + "epoch": 4.532636518771331, + "grad_norm": 0.15456439157482174, + "learning_rate": 5.329157416190577e-05, + "loss": 0.8742, + "step": 21249 + }, + { + "epoch": 4.532849829351536, + "grad_norm": 0.16865912693815222, + "learning_rate": 5.328355280110356e-05, + "loss": 0.887, + "step": 21250 + }, + { + "epoch": 4.53306313993174, + "grad_norm": 0.17774212808854706, + "learning_rate": 5.327553174259839e-05, + "loss": 0.8787, + "step": 21251 + }, + { + "epoch": 4.533276450511945, + "grad_norm": 0.166688725071525, + "learning_rate": 5.326751098648103e-05, + "loss": 0.9107, + "step": 21252 + }, + { + "epoch": 4.5334897610921505, + "grad_norm": 0.18263363600158694, + "learning_rate": 5.325949053284223e-05, + "loss": 0.9028, + "step": 21253 + }, + { + "epoch": 4.533703071672355, + "grad_norm": 0.16234006329780257, + "learning_rate": 5.325147038177275e-05, + "loss": 0.8943, + "step": 21254 + }, + { + "epoch": 4.53391638225256, + "grad_norm": 0.1662923017415023, + "learning_rate": 5.324345053336336e-05, + "loss": 0.8928, + "step": 21255 + }, + { + "epoch": 4.534129692832765, + "grad_norm": 0.16810116477185688, + "learning_rate": 5.323543098770474e-05, + "loss": 0.903, + "step": 21256 + }, + { + "epoch": 4.534343003412969, + "grad_norm": 0.16631180448043054, + "learning_rate": 5.322741174488769e-05, + "loss": 0.9224, + "step": 21257 + }, + { + "epoch": 4.534556313993174, + "grad_norm": 0.19854139530977297, + "learning_rate": 5.321939280500291e-05, + "loss": 0.9043, + "step": 21258 + }, + { + "epoch": 4.534769624573379, + "grad_norm": 0.14985840035570705, + "learning_rate": 5.321137416814115e-05, + "loss": 0.8941, + "step": 21259 + }, + { + "epoch": 4.534982935153583, + "grad_norm": 0.15166730895385352, + "learning_rate": 5.320335583439313e-05, + "loss": 0.9015, + "step": 21260 + }, + { + "epoch": 4.535196245733788, + "grad_norm": 0.1553154666559326, + "learning_rate": 5.3195337803849625e-05, + "loss": 0.8749, + "step": 21261 + }, + { + "epoch": 4.535409556313994, + "grad_norm": 0.17121555984375053, + "learning_rate": 5.3187320076601316e-05, + "loss": 0.8976, + "step": 21262 + }, + { + "epoch": 4.535622866894198, + "grad_norm": 0.15718816872125238, + "learning_rate": 5.317930265273891e-05, + "loss": 0.9198, + "step": 21263 + }, + { + "epoch": 4.535836177474403, + "grad_norm": 0.17783909418921326, + "learning_rate": 5.317128553235316e-05, + "loss": 0.9138, + "step": 21264 + }, + { + "epoch": 4.536049488054608, + "grad_norm": 0.1531422151670245, + "learning_rate": 5.3163268715534754e-05, + "loss": 0.8962, + "step": 21265 + }, + { + "epoch": 4.536262798634812, + "grad_norm": 0.17517238323090167, + "learning_rate": 5.315525220237444e-05, + "loss": 0.9068, + "step": 21266 + }, + { + "epoch": 4.536476109215017, + "grad_norm": 0.18600688685237016, + "learning_rate": 5.314723599296289e-05, + "loss": 0.9326, + "step": 21267 + }, + { + "epoch": 4.536689419795222, + "grad_norm": 0.18185685018218314, + "learning_rate": 5.313922008739079e-05, + "loss": 0.937, + "step": 21268 + }, + { + "epoch": 4.536902730375426, + "grad_norm": 0.15288742290745297, + "learning_rate": 5.3131204485748894e-05, + "loss": 0.9229, + "step": 21269 + }, + { + "epoch": 4.537116040955631, + "grad_norm": 0.14033451852991496, + "learning_rate": 5.312318918812785e-05, + "loss": 0.8951, + "step": 21270 + }, + { + "epoch": 4.537329351535837, + "grad_norm": 0.15058330637809242, + "learning_rate": 5.311517419461838e-05, + "loss": 0.9068, + "step": 21271 + }, + { + "epoch": 4.537542662116041, + "grad_norm": 0.13688666560273646, + "learning_rate": 5.3107159505311165e-05, + "loss": 0.8834, + "step": 21272 + }, + { + "epoch": 4.537755972696246, + "grad_norm": 0.13743364611051453, + "learning_rate": 5.309914512029691e-05, + "loss": 0.91, + "step": 21273 + }, + { + "epoch": 4.537969283276451, + "grad_norm": 0.17400287242512205, + "learning_rate": 5.309113103966627e-05, + "loss": 0.918, + "step": 21274 + }, + { + "epoch": 4.538182593856655, + "grad_norm": 0.15038351940165326, + "learning_rate": 5.308311726350992e-05, + "loss": 0.905, + "step": 21275 + }, + { + "epoch": 4.53839590443686, + "grad_norm": 0.15000048689822765, + "learning_rate": 5.307510379191858e-05, + "loss": 0.8803, + "step": 21276 + }, + { + "epoch": 4.538609215017065, + "grad_norm": 0.15264716039978435, + "learning_rate": 5.306709062498289e-05, + "loss": 0.9322, + "step": 21277 + }, + { + "epoch": 4.538822525597269, + "grad_norm": 0.14471083852733713, + "learning_rate": 5.30590777627935e-05, + "loss": 0.88, + "step": 21278 + }, + { + "epoch": 4.539035836177474, + "grad_norm": 0.15686583011760122, + "learning_rate": 5.3051065205441114e-05, + "loss": 0.869, + "step": 21279 + }, + { + "epoch": 4.53924914675768, + "grad_norm": 0.14824245954251758, + "learning_rate": 5.3043052953016355e-05, + "loss": 0.9037, + "step": 21280 + }, + { + "epoch": 4.539462457337884, + "grad_norm": 0.1546809898285799, + "learning_rate": 5.303504100560993e-05, + "loss": 0.8774, + "step": 21281 + }, + { + "epoch": 4.539675767918089, + "grad_norm": 0.15326318066350014, + "learning_rate": 5.302702936331246e-05, + "loss": 0.909, + "step": 21282 + }, + { + "epoch": 4.539889078498294, + "grad_norm": 0.16629797904274096, + "learning_rate": 5.301901802621461e-05, + "loss": 0.9254, + "step": 21283 + }, + { + "epoch": 4.540102389078498, + "grad_norm": 0.16887240894299893, + "learning_rate": 5.301100699440701e-05, + "loss": 0.9137, + "step": 21284 + }, + { + "epoch": 4.540315699658703, + "grad_norm": 0.16671960885151138, + "learning_rate": 5.300299626798034e-05, + "loss": 0.8921, + "step": 21285 + }, + { + "epoch": 4.540529010238908, + "grad_norm": 0.16869976872988995, + "learning_rate": 5.2994985847025225e-05, + "loss": 0.8638, + "step": 21286 + }, + { + "epoch": 4.540742320819112, + "grad_norm": 0.14800079368308003, + "learning_rate": 5.298697573163228e-05, + "loss": 0.9154, + "step": 21287 + }, + { + "epoch": 4.540955631399317, + "grad_norm": 0.1494349199743993, + "learning_rate": 5.297896592189219e-05, + "loss": 0.9161, + "step": 21288 + }, + { + "epoch": 4.541168941979522, + "grad_norm": 0.1590380831401856, + "learning_rate": 5.297095641789554e-05, + "loss": 0.9465, + "step": 21289 + }, + { + "epoch": 4.5413822525597265, + "grad_norm": 0.18938196438505076, + "learning_rate": 5.296294721973295e-05, + "loss": 0.9081, + "step": 21290 + }, + { + "epoch": 4.541595563139932, + "grad_norm": 0.15429932001750263, + "learning_rate": 5.295493832749509e-05, + "loss": 0.8866, + "step": 21291 + }, + { + "epoch": 4.541808873720137, + "grad_norm": 0.14383408631672315, + "learning_rate": 5.2946929741272534e-05, + "loss": 0.9233, + "step": 21292 + }, + { + "epoch": 4.542022184300341, + "grad_norm": 0.1535746794450157, + "learning_rate": 5.2938921461155954e-05, + "loss": 0.8771, + "step": 21293 + }, + { + "epoch": 4.542235494880546, + "grad_norm": 0.15096619092974461, + "learning_rate": 5.29309134872359e-05, + "loss": 0.8772, + "step": 21294 + }, + { + "epoch": 4.542448805460751, + "grad_norm": 0.1397334262293558, + "learning_rate": 5.292290581960305e-05, + "loss": 0.9465, + "step": 21295 + }, + { + "epoch": 4.5426621160409555, + "grad_norm": 0.1704697584923902, + "learning_rate": 5.291489845834795e-05, + "loss": 0.8712, + "step": 21296 + }, + { + "epoch": 4.54287542662116, + "grad_norm": 0.15139697145004086, + "learning_rate": 5.290689140356124e-05, + "loss": 0.9084, + "step": 21297 + }, + { + "epoch": 4.543088737201365, + "grad_norm": 0.15201062566555096, + "learning_rate": 5.289888465533351e-05, + "loss": 0.895, + "step": 21298 + }, + { + "epoch": 4.5433020477815695, + "grad_norm": 0.1655362355049139, + "learning_rate": 5.289087821375538e-05, + "loss": 0.9135, + "step": 21299 + }, + { + "epoch": 4.543515358361775, + "grad_norm": 0.15174506786094044, + "learning_rate": 5.288287207891739e-05, + "loss": 0.8757, + "step": 21300 + }, + { + "epoch": 4.54372866894198, + "grad_norm": 0.1500291987092767, + "learning_rate": 5.2874866250910164e-05, + "loss": 0.884, + "step": 21301 + }, + { + "epoch": 4.5439419795221845, + "grad_norm": 0.19856063624120865, + "learning_rate": 5.286686072982427e-05, + "loss": 0.9544, + "step": 21302 + }, + { + "epoch": 4.544155290102389, + "grad_norm": 0.18142022638615687, + "learning_rate": 5.285885551575032e-05, + "loss": 0.9089, + "step": 21303 + }, + { + "epoch": 4.544368600682594, + "grad_norm": 0.1750747133738544, + "learning_rate": 5.285085060877886e-05, + "loss": 0.9237, + "step": 21304 + }, + { + "epoch": 4.5445819112627985, + "grad_norm": 0.21550438866915114, + "learning_rate": 5.28428460090005e-05, + "loss": 0.9166, + "step": 21305 + }, + { + "epoch": 4.544795221843003, + "grad_norm": 0.1928659720698643, + "learning_rate": 5.2834841716505776e-05, + "loss": 0.8872, + "step": 21306 + }, + { + "epoch": 4.545008532423208, + "grad_norm": 0.15668719754652977, + "learning_rate": 5.282683773138529e-05, + "loss": 0.8878, + "step": 21307 + }, + { + "epoch": 4.545221843003413, + "grad_norm": 0.18412696174161855, + "learning_rate": 5.281883405372957e-05, + "loss": 0.9053, + "step": 21308 + }, + { + "epoch": 4.545435153583618, + "grad_norm": 0.25011711970784634, + "learning_rate": 5.2810830683629234e-05, + "loss": 0.9035, + "step": 21309 + }, + { + "epoch": 4.545648464163823, + "grad_norm": 0.18672399075017515, + "learning_rate": 5.280282762117479e-05, + "loss": 0.9223, + "step": 21310 + }, + { + "epoch": 4.5458617747440275, + "grad_norm": 0.15757528386173097, + "learning_rate": 5.279482486645684e-05, + "loss": 0.9046, + "step": 21311 + }, + { + "epoch": 4.546075085324232, + "grad_norm": 0.21367006544023337, + "learning_rate": 5.278682241956586e-05, + "loss": 0.9158, + "step": 21312 + }, + { + "epoch": 4.546288395904437, + "grad_norm": 0.20879518031624839, + "learning_rate": 5.277882028059246e-05, + "loss": 0.883, + "step": 21313 + }, + { + "epoch": 4.546501706484642, + "grad_norm": 0.16631236584303946, + "learning_rate": 5.277081844962714e-05, + "loss": 0.8949, + "step": 21314 + }, + { + "epoch": 4.546715017064846, + "grad_norm": 0.19705352125301334, + "learning_rate": 5.2762816926760506e-05, + "loss": 0.8964, + "step": 21315 + }, + { + "epoch": 4.546928327645051, + "grad_norm": 0.17244060739412206, + "learning_rate": 5.275481571208303e-05, + "loss": 0.8895, + "step": 21316 + }, + { + "epoch": 4.547141638225256, + "grad_norm": 0.175487676224009, + "learning_rate": 5.2746814805685284e-05, + "loss": 0.8999, + "step": 21317 + }, + { + "epoch": 4.547354948805461, + "grad_norm": 0.1810360943109805, + "learning_rate": 5.273881420765777e-05, + "loss": 0.9225, + "step": 21318 + }, + { + "epoch": 4.547568259385666, + "grad_norm": 0.20771497920955878, + "learning_rate": 5.273081391809106e-05, + "loss": 0.8797, + "step": 21319 + }, + { + "epoch": 4.547781569965871, + "grad_norm": 0.14424644421182437, + "learning_rate": 5.2722813937075624e-05, + "loss": 0.8889, + "step": 21320 + }, + { + "epoch": 4.547994880546075, + "grad_norm": 0.17007932153497293, + "learning_rate": 5.271481426470202e-05, + "loss": 0.8882, + "step": 21321 + }, + { + "epoch": 4.54820819112628, + "grad_norm": 0.15837913026740477, + "learning_rate": 5.270681490106077e-05, + "loss": 0.918, + "step": 21322 + }, + { + "epoch": 4.548421501706485, + "grad_norm": 0.16721941885833944, + "learning_rate": 5.269881584624235e-05, + "loss": 0.9201, + "step": 21323 + }, + { + "epoch": 4.548634812286689, + "grad_norm": 0.14222935113365343, + "learning_rate": 5.269081710033728e-05, + "loss": 0.9129, + "step": 21324 + }, + { + "epoch": 4.548848122866894, + "grad_norm": 0.16168651186569713, + "learning_rate": 5.268281866343609e-05, + "loss": 0.8748, + "step": 21325 + }, + { + "epoch": 4.549061433447099, + "grad_norm": 0.14311944253868267, + "learning_rate": 5.267482053562925e-05, + "loss": 0.8773, + "step": 21326 + }, + { + "epoch": 4.549274744027304, + "grad_norm": 0.15979130942254768, + "learning_rate": 5.266682271700729e-05, + "loss": 0.9241, + "step": 21327 + }, + { + "epoch": 4.549488054607508, + "grad_norm": 0.15199272935673858, + "learning_rate": 5.2658825207660664e-05, + "loss": 0.9373, + "step": 21328 + }, + { + "epoch": 4.549701365187714, + "grad_norm": 0.15570574599850057, + "learning_rate": 5.265082800767992e-05, + "loss": 0.9061, + "step": 21329 + }, + { + "epoch": 4.549914675767918, + "grad_norm": 0.17307733930652422, + "learning_rate": 5.26428311171555e-05, + "loss": 0.8569, + "step": 21330 + }, + { + "epoch": 4.550127986348123, + "grad_norm": 0.15424458685177356, + "learning_rate": 5.263483453617792e-05, + "loss": 0.9138, + "step": 21331 + }, + { + "epoch": 4.550341296928328, + "grad_norm": 0.207587813743161, + "learning_rate": 5.262683826483763e-05, + "loss": 0.9031, + "step": 21332 + }, + { + "epoch": 4.550554607508532, + "grad_norm": 0.16441156285953426, + "learning_rate": 5.261884230322517e-05, + "loss": 0.9082, + "step": 21333 + }, + { + "epoch": 4.550767918088737, + "grad_norm": 0.1880274295085845, + "learning_rate": 5.261084665143092e-05, + "loss": 0.888, + "step": 21334 + }, + { + "epoch": 4.550981228668942, + "grad_norm": 0.1687959339673221, + "learning_rate": 5.260285130954542e-05, + "loss": 0.9282, + "step": 21335 + }, + { + "epoch": 4.551194539249146, + "grad_norm": 0.17131895029651503, + "learning_rate": 5.259485627765911e-05, + "loss": 0.8875, + "step": 21336 + }, + { + "epoch": 4.551407849829351, + "grad_norm": 0.17851740571280827, + "learning_rate": 5.2586861555862474e-05, + "loss": 0.9057, + "step": 21337 + }, + { + "epoch": 4.551621160409557, + "grad_norm": 0.15013132886659697, + "learning_rate": 5.257886714424594e-05, + "loss": 0.9109, + "step": 21338 + }, + { + "epoch": 4.551834470989761, + "grad_norm": 0.14855441587247076, + "learning_rate": 5.2570873042900004e-05, + "loss": 0.9238, + "step": 21339 + }, + { + "epoch": 4.552047781569966, + "grad_norm": 0.21223918331367478, + "learning_rate": 5.256287925191508e-05, + "loss": 0.8707, + "step": 21340 + }, + { + "epoch": 4.552261092150171, + "grad_norm": 0.17720153438104788, + "learning_rate": 5.255488577138167e-05, + "loss": 0.8786, + "step": 21341 + }, + { + "epoch": 4.552474402730375, + "grad_norm": 0.16264019040604386, + "learning_rate": 5.2546892601390155e-05, + "loss": 0.9121, + "step": 21342 + }, + { + "epoch": 4.55268771331058, + "grad_norm": 0.17560462058589665, + "learning_rate": 5.253889974203103e-05, + "loss": 0.8993, + "step": 21343 + }, + { + "epoch": 4.552901023890785, + "grad_norm": 0.17926836466795165, + "learning_rate": 5.253090719339476e-05, + "loss": 0.9036, + "step": 21344 + }, + { + "epoch": 4.5531143344709895, + "grad_norm": 0.18607435385414214, + "learning_rate": 5.2522914955571685e-05, + "loss": 0.905, + "step": 21345 + }, + { + "epoch": 4.553327645051194, + "grad_norm": 0.15110239600837172, + "learning_rate": 5.251492302865229e-05, + "loss": 0.8731, + "step": 21346 + }, + { + "epoch": 4.5535409556314, + "grad_norm": 0.16923136789149223, + "learning_rate": 5.2506931412727024e-05, + "loss": 0.9197, + "step": 21347 + }, + { + "epoch": 4.553754266211604, + "grad_norm": 0.14917869868726127, + "learning_rate": 5.2498940107886264e-05, + "loss": 0.8898, + "step": 21348 + }, + { + "epoch": 4.553967576791809, + "grad_norm": 0.15055155970883133, + "learning_rate": 5.2490949114220485e-05, + "loss": 0.8853, + "step": 21349 + }, + { + "epoch": 4.554180887372014, + "grad_norm": 0.14624322738486792, + "learning_rate": 5.248295843182005e-05, + "loss": 0.8937, + "step": 21350 + }, + { + "epoch": 4.5543941979522184, + "grad_norm": 0.13931338705809093, + "learning_rate": 5.247496806077543e-05, + "loss": 0.9025, + "step": 21351 + }, + { + "epoch": 4.554607508532423, + "grad_norm": 0.14670882964815893, + "learning_rate": 5.2466978001177e-05, + "loss": 0.8907, + "step": 21352 + }, + { + "epoch": 4.554820819112628, + "grad_norm": 0.14631006028559035, + "learning_rate": 5.245898825311518e-05, + "loss": 0.8851, + "step": 21353 + }, + { + "epoch": 4.5550341296928325, + "grad_norm": 0.17316753181285272, + "learning_rate": 5.2450998816680366e-05, + "loss": 0.9303, + "step": 21354 + }, + { + "epoch": 4.555247440273037, + "grad_norm": 0.1296135810949645, + "learning_rate": 5.2443009691963e-05, + "loss": 0.9228, + "step": 21355 + }, + { + "epoch": 4.555460750853243, + "grad_norm": 0.1686736214194126, + "learning_rate": 5.24350208790534e-05, + "loss": 0.8847, + "step": 21356 + }, + { + "epoch": 4.555674061433447, + "grad_norm": 0.14619616129980417, + "learning_rate": 5.2427032378042025e-05, + "loss": 0.9006, + "step": 21357 + }, + { + "epoch": 4.555887372013652, + "grad_norm": 0.14737034655696152, + "learning_rate": 5.241904418901921e-05, + "loss": 0.8808, + "step": 21358 + }, + { + "epoch": 4.556100682593857, + "grad_norm": 0.1351579240305471, + "learning_rate": 5.241105631207541e-05, + "loss": 0.9081, + "step": 21359 + }, + { + "epoch": 4.5563139931740615, + "grad_norm": 0.16045018466908564, + "learning_rate": 5.2403068747300936e-05, + "loss": 0.9357, + "step": 21360 + }, + { + "epoch": 4.556527303754266, + "grad_norm": 0.17790994197535934, + "learning_rate": 5.239508149478623e-05, + "loss": 0.915, + "step": 21361 + }, + { + "epoch": 4.556740614334471, + "grad_norm": 0.18939672625511358, + "learning_rate": 5.2387094554621625e-05, + "loss": 0.884, + "step": 21362 + }, + { + "epoch": 4.5569539249146755, + "grad_norm": 0.14300330758416344, + "learning_rate": 5.237910792689753e-05, + "loss": 0.9184, + "step": 21363 + }, + { + "epoch": 4.55716723549488, + "grad_norm": 0.15989784915652047, + "learning_rate": 5.2371121611704265e-05, + "loss": 0.9152, + "step": 21364 + }, + { + "epoch": 4.557380546075086, + "grad_norm": 0.16538966619321194, + "learning_rate": 5.236313560913225e-05, + "loss": 0.8977, + "step": 21365 + }, + { + "epoch": 4.5575938566552905, + "grad_norm": 0.14571213768941393, + "learning_rate": 5.235514991927184e-05, + "loss": 0.9202, + "step": 21366 + }, + { + "epoch": 4.557807167235495, + "grad_norm": 0.18388063418231188, + "learning_rate": 5.234716454221334e-05, + "loss": 0.8926, + "step": 21367 + }, + { + "epoch": 4.5580204778157, + "grad_norm": 0.13674053210964246, + "learning_rate": 5.2339179478047145e-05, + "loss": 0.8976, + "step": 21368 + }, + { + "epoch": 4.5582337883959045, + "grad_norm": 0.18285876672437593, + "learning_rate": 5.23311947268636e-05, + "loss": 0.9141, + "step": 21369 + }, + { + "epoch": 4.558447098976109, + "grad_norm": 0.16643011714281566, + "learning_rate": 5.232321028875303e-05, + "loss": 0.8731, + "step": 21370 + }, + { + "epoch": 4.558660409556314, + "grad_norm": 0.18956642543440688, + "learning_rate": 5.231522616380583e-05, + "loss": 0.9064, + "step": 21371 + }, + { + "epoch": 4.558873720136519, + "grad_norm": 0.1539907390706703, + "learning_rate": 5.23072423521123e-05, + "loss": 0.9083, + "step": 21372 + }, + { + "epoch": 4.559087030716723, + "grad_norm": 0.16759203948491566, + "learning_rate": 5.229925885376279e-05, + "loss": 0.9017, + "step": 21373 + }, + { + "epoch": 4.559300341296928, + "grad_norm": 0.17737736465637613, + "learning_rate": 5.229127566884763e-05, + "loss": 0.8951, + "step": 21374 + }, + { + "epoch": 4.559513651877133, + "grad_norm": 0.1361688812359171, + "learning_rate": 5.228329279745715e-05, + "loss": 0.9114, + "step": 21375 + }, + { + "epoch": 4.559726962457338, + "grad_norm": 0.18606768290628745, + "learning_rate": 5.2275310239681686e-05, + "loss": 0.9183, + "step": 21376 + }, + { + "epoch": 4.559940273037543, + "grad_norm": 0.15312325032891566, + "learning_rate": 5.226732799561156e-05, + "loss": 0.9111, + "step": 21377 + }, + { + "epoch": 4.560153583617748, + "grad_norm": 0.15211736246158222, + "learning_rate": 5.2259346065337095e-05, + "loss": 0.923, + "step": 21378 + }, + { + "epoch": 4.560366894197952, + "grad_norm": 0.16957694272584503, + "learning_rate": 5.2251364448948584e-05, + "loss": 0.91, + "step": 21379 + }, + { + "epoch": 4.560580204778157, + "grad_norm": 0.17482046491374822, + "learning_rate": 5.224338314653633e-05, + "loss": 0.9056, + "step": 21380 + }, + { + "epoch": 4.560793515358362, + "grad_norm": 0.1694146541216786, + "learning_rate": 5.223540215819068e-05, + "loss": 0.9139, + "step": 21381 + }, + { + "epoch": 4.561006825938566, + "grad_norm": 0.20127504577815547, + "learning_rate": 5.222742148400191e-05, + "loss": 0.8908, + "step": 21382 + }, + { + "epoch": 4.561220136518771, + "grad_norm": 0.1807260200256073, + "learning_rate": 5.221944112406034e-05, + "loss": 0.9313, + "step": 21383 + }, + { + "epoch": 4.561433447098976, + "grad_norm": 0.19814151567628366, + "learning_rate": 5.221146107845625e-05, + "loss": 0.891, + "step": 21384 + }, + { + "epoch": 4.561646757679181, + "grad_norm": 0.1593076085636094, + "learning_rate": 5.220348134727996e-05, + "loss": 0.9169, + "step": 21385 + }, + { + "epoch": 4.561860068259386, + "grad_norm": 0.1940562380862254, + "learning_rate": 5.219550193062173e-05, + "loss": 0.8796, + "step": 21386 + }, + { + "epoch": 4.562073378839591, + "grad_norm": 0.19952896068431683, + "learning_rate": 5.218752282857189e-05, + "loss": 0.8806, + "step": 21387 + }, + { + "epoch": 4.562286689419795, + "grad_norm": 0.16233992621729312, + "learning_rate": 5.2179544041220666e-05, + "loss": 0.8819, + "step": 21388 + }, + { + "epoch": 4.5625, + "grad_norm": 0.18556253675553447, + "learning_rate": 5.21715655686584e-05, + "loss": 0.8867, + "step": 21389 + }, + { + "epoch": 4.562713310580205, + "grad_norm": 0.22654011659838438, + "learning_rate": 5.216358741097534e-05, + "loss": 0.8868, + "step": 21390 + }, + { + "epoch": 4.562926621160409, + "grad_norm": 0.24169877203318826, + "learning_rate": 5.215560956826174e-05, + "loss": 0.894, + "step": 21391 + }, + { + "epoch": 4.563139931740614, + "grad_norm": 0.14552250975280123, + "learning_rate": 5.214763204060788e-05, + "loss": 0.8957, + "step": 21392 + }, + { + "epoch": 4.563353242320819, + "grad_norm": 0.18996484741505415, + "learning_rate": 5.2139654828104044e-05, + "loss": 0.8932, + "step": 21393 + }, + { + "epoch": 4.563566552901024, + "grad_norm": 0.2101033835076072, + "learning_rate": 5.213167793084047e-05, + "loss": 0.8892, + "step": 21394 + }, + { + "epoch": 4.563779863481229, + "grad_norm": 0.1822767184373559, + "learning_rate": 5.212370134890744e-05, + "loss": 0.8922, + "step": 21395 + }, + { + "epoch": 4.563993174061434, + "grad_norm": 0.17201296400789343, + "learning_rate": 5.211572508239518e-05, + "loss": 0.8882, + "step": 21396 + }, + { + "epoch": 4.564206484641638, + "grad_norm": 0.15766016252404896, + "learning_rate": 5.2107749131393983e-05, + "loss": 0.9146, + "step": 21397 + }, + { + "epoch": 4.564419795221843, + "grad_norm": 0.16076690497992907, + "learning_rate": 5.2099773495994045e-05, + "loss": 0.9018, + "step": 21398 + }, + { + "epoch": 4.564633105802048, + "grad_norm": 0.18177253697384346, + "learning_rate": 5.2091798176285665e-05, + "loss": 0.9237, + "step": 21399 + }, + { + "epoch": 4.564846416382252, + "grad_norm": 0.15284556563451374, + "learning_rate": 5.2083823172359076e-05, + "loss": 0.8984, + "step": 21400 + }, + { + "epoch": 4.565059726962457, + "grad_norm": 0.13506212816734797, + "learning_rate": 5.207584848430446e-05, + "loss": 0.8851, + "step": 21401 + }, + { + "epoch": 4.565273037542662, + "grad_norm": 0.15545021679825122, + "learning_rate": 5.2067874112212105e-05, + "loss": 0.8706, + "step": 21402 + }, + { + "epoch": 4.565486348122867, + "grad_norm": 0.15005449331042964, + "learning_rate": 5.205990005617222e-05, + "loss": 0.9243, + "step": 21403 + }, + { + "epoch": 4.565699658703072, + "grad_norm": 0.18501685528829093, + "learning_rate": 5.205192631627503e-05, + "loss": 0.8904, + "step": 21404 + }, + { + "epoch": 4.565912969283277, + "grad_norm": 0.18553991672822848, + "learning_rate": 5.2043952892610774e-05, + "loss": 0.9092, + "step": 21405 + }, + { + "epoch": 4.566126279863481, + "grad_norm": 0.16500684412014416, + "learning_rate": 5.203597978526965e-05, + "loss": 0.9027, + "step": 21406 + }, + { + "epoch": 4.566339590443686, + "grad_norm": 0.18832898668552603, + "learning_rate": 5.20280069943419e-05, + "loss": 0.8998, + "step": 21407 + }, + { + "epoch": 4.566552901023891, + "grad_norm": 0.1548333488071847, + "learning_rate": 5.20200345199177e-05, + "loss": 0.9066, + "step": 21408 + }, + { + "epoch": 4.5667662116040955, + "grad_norm": 0.14981504306045243, + "learning_rate": 5.2012062362087305e-05, + "loss": 0.8674, + "step": 21409 + }, + { + "epoch": 4.5669795221843, + "grad_norm": 0.17514282483180724, + "learning_rate": 5.2004090520940867e-05, + "loss": 0.9113, + "step": 21410 + }, + { + "epoch": 4.567192832764505, + "grad_norm": 0.14243900989582933, + "learning_rate": 5.1996118996568656e-05, + "loss": 0.8792, + "step": 21411 + }, + { + "epoch": 4.5674061433447095, + "grad_norm": 0.17093641698647888, + "learning_rate": 5.198814778906082e-05, + "loss": 0.9261, + "step": 21412 + }, + { + "epoch": 4.567619453924914, + "grad_norm": 0.15239403159281267, + "learning_rate": 5.198017689850753e-05, + "loss": 0.894, + "step": 21413 + }, + { + "epoch": 4.56783276450512, + "grad_norm": 0.15266088670749728, + "learning_rate": 5.1972206324999034e-05, + "loss": 0.9053, + "step": 21414 + }, + { + "epoch": 4.5680460750853245, + "grad_norm": 0.1448797441771742, + "learning_rate": 5.1964236068625496e-05, + "loss": 0.8978, + "step": 21415 + }, + { + "epoch": 4.568259385665529, + "grad_norm": 0.1590677742923291, + "learning_rate": 5.195626612947708e-05, + "loss": 0.8688, + "step": 21416 + }, + { + "epoch": 4.568472696245734, + "grad_norm": 0.1471216297104965, + "learning_rate": 5.1948296507644004e-05, + "loss": 0.8955, + "step": 21417 + }, + { + "epoch": 4.5686860068259385, + "grad_norm": 0.17921899545556075, + "learning_rate": 5.194032720321641e-05, + "loss": 0.9009, + "step": 21418 + }, + { + "epoch": 4.568899317406143, + "grad_norm": 0.13179511884849823, + "learning_rate": 5.1932358216284486e-05, + "loss": 0.9204, + "step": 21419 + }, + { + "epoch": 4.569112627986348, + "grad_norm": 0.18436956697317208, + "learning_rate": 5.19243895469384e-05, + "loss": 0.9002, + "step": 21420 + }, + { + "epoch": 4.569325938566553, + "grad_norm": 0.15296748560375859, + "learning_rate": 5.191642119526833e-05, + "loss": 0.8942, + "step": 21421 + }, + { + "epoch": 4.569539249146757, + "grad_norm": 0.15936309587443143, + "learning_rate": 5.190845316136445e-05, + "loss": 0.8664, + "step": 21422 + }, + { + "epoch": 4.569752559726963, + "grad_norm": 0.16728287506916256, + "learning_rate": 5.190048544531686e-05, + "loss": 0.9206, + "step": 21423 + }, + { + "epoch": 4.5699658703071675, + "grad_norm": 0.18716097687956476, + "learning_rate": 5.189251804721576e-05, + "loss": 0.8966, + "step": 21424 + }, + { + "epoch": 4.570179180887372, + "grad_norm": 0.225839298673002, + "learning_rate": 5.1884550967151284e-05, + "loss": 0.9393, + "step": 21425 + }, + { + "epoch": 4.570392491467577, + "grad_norm": 0.13935965122734223, + "learning_rate": 5.187658420521359e-05, + "loss": 0.9146, + "step": 21426 + }, + { + "epoch": 4.5706058020477816, + "grad_norm": 0.23931850862092371, + "learning_rate": 5.1868617761492825e-05, + "loss": 0.906, + "step": 21427 + }, + { + "epoch": 4.570819112627986, + "grad_norm": 0.17960643996585504, + "learning_rate": 5.186065163607911e-05, + "loss": 0.8789, + "step": 21428 + }, + { + "epoch": 4.571032423208191, + "grad_norm": 0.191788146507552, + "learning_rate": 5.185268582906261e-05, + "loss": 0.8992, + "step": 21429 + }, + { + "epoch": 4.571245733788396, + "grad_norm": 0.17125957316971285, + "learning_rate": 5.184472034053343e-05, + "loss": 0.9162, + "step": 21430 + }, + { + "epoch": 4.5714590443686, + "grad_norm": 0.16006967202844197, + "learning_rate": 5.183675517058172e-05, + "loss": 0.9008, + "step": 21431 + }, + { + "epoch": 4.571672354948806, + "grad_norm": 0.18959893620602086, + "learning_rate": 5.182879031929759e-05, + "loss": 0.9041, + "step": 21432 + }, + { + "epoch": 4.5718856655290105, + "grad_norm": 0.18834370251864663, + "learning_rate": 5.182082578677119e-05, + "loss": 0.8956, + "step": 21433 + }, + { + "epoch": 4.572098976109215, + "grad_norm": 0.18312469786315935, + "learning_rate": 5.181286157309263e-05, + "loss": 0.8921, + "step": 21434 + }, + { + "epoch": 4.57231228668942, + "grad_norm": 0.1656137255720562, + "learning_rate": 5.180489767835198e-05, + "loss": 0.8827, + "step": 21435 + }, + { + "epoch": 4.572525597269625, + "grad_norm": 0.1786777091155474, + "learning_rate": 5.179693410263941e-05, + "loss": 0.9155, + "step": 21436 + }, + { + "epoch": 4.572738907849829, + "grad_norm": 0.17902189408439312, + "learning_rate": 5.178897084604499e-05, + "loss": 0.8996, + "step": 21437 + }, + { + "epoch": 4.572952218430034, + "grad_norm": 0.19189390805275838, + "learning_rate": 5.1781007908658847e-05, + "loss": 0.9145, + "step": 21438 + }, + { + "epoch": 4.573165529010239, + "grad_norm": 0.17200563493541457, + "learning_rate": 5.1773045290571085e-05, + "loss": 0.8765, + "step": 21439 + }, + { + "epoch": 4.573378839590443, + "grad_norm": 0.20481627428879898, + "learning_rate": 5.176508299187177e-05, + "loss": 0.911, + "step": 21440 + }, + { + "epoch": 4.573592150170649, + "grad_norm": 0.1679787680406184, + "learning_rate": 5.1757121012651024e-05, + "loss": 0.9062, + "step": 21441 + }, + { + "epoch": 4.573805460750854, + "grad_norm": 0.16257564382692663, + "learning_rate": 5.174915935299892e-05, + "loss": 0.885, + "step": 21442 + }, + { + "epoch": 4.574018771331058, + "grad_norm": 0.17518077588119796, + "learning_rate": 5.1741198013005574e-05, + "loss": 0.9198, + "step": 21443 + }, + { + "epoch": 4.574232081911263, + "grad_norm": 0.16924226145933122, + "learning_rate": 5.173323699276102e-05, + "loss": 0.9043, + "step": 21444 + }, + { + "epoch": 4.574445392491468, + "grad_norm": 0.15732330819459706, + "learning_rate": 5.172527629235541e-05, + "loss": 0.8791, + "step": 21445 + }, + { + "epoch": 4.574658703071672, + "grad_norm": 0.1670219430498303, + "learning_rate": 5.1717315911878755e-05, + "loss": 0.9048, + "step": 21446 + }, + { + "epoch": 4.574872013651877, + "grad_norm": 0.16814712492020417, + "learning_rate": 5.170935585142112e-05, + "loss": 0.8887, + "step": 21447 + }, + { + "epoch": 4.575085324232082, + "grad_norm": 0.2048865930001784, + "learning_rate": 5.170139611107262e-05, + "loss": 0.8994, + "step": 21448 + }, + { + "epoch": 4.575298634812286, + "grad_norm": 0.1824526983279884, + "learning_rate": 5.1693436690923285e-05, + "loss": 0.878, + "step": 21449 + }, + { + "epoch": 4.575511945392492, + "grad_norm": 0.15640031980446248, + "learning_rate": 5.168547759106322e-05, + "loss": 0.9017, + "step": 21450 + }, + { + "epoch": 4.575725255972696, + "grad_norm": 0.18839384989127658, + "learning_rate": 5.167751881158243e-05, + "loss": 0.8774, + "step": 21451 + }, + { + "epoch": 4.575938566552901, + "grad_norm": 0.18977702374189487, + "learning_rate": 5.166956035257098e-05, + "loss": 0.9226, + "step": 21452 + }, + { + "epoch": 4.576151877133106, + "grad_norm": 0.24310795128004495, + "learning_rate": 5.166160221411895e-05, + "loss": 0.8982, + "step": 21453 + }, + { + "epoch": 4.576365187713311, + "grad_norm": 0.20262367905109066, + "learning_rate": 5.165364439631635e-05, + "loss": 0.9095, + "step": 21454 + }, + { + "epoch": 4.576578498293515, + "grad_norm": 0.16681483094176267, + "learning_rate": 5.164568689925326e-05, + "loss": 0.8705, + "step": 21455 + }, + { + "epoch": 4.57679180887372, + "grad_norm": 0.2255241796904324, + "learning_rate": 5.163772972301972e-05, + "loss": 0.904, + "step": 21456 + }, + { + "epoch": 4.577005119453925, + "grad_norm": 0.13625730048753484, + "learning_rate": 5.16297728677057e-05, + "loss": 0.889, + "step": 21457 + }, + { + "epoch": 4.577218430034129, + "grad_norm": 0.17889726466050979, + "learning_rate": 5.1621816333401314e-05, + "loss": 0.9068, + "step": 21458 + }, + { + "epoch": 4.577431740614334, + "grad_norm": 0.14757590017424835, + "learning_rate": 5.161386012019652e-05, + "loss": 0.8781, + "step": 21459 + }, + { + "epoch": 4.577645051194539, + "grad_norm": 0.15938857755151545, + "learning_rate": 5.160590422818139e-05, + "loss": 0.9197, + "step": 21460 + }, + { + "epoch": 4.577858361774744, + "grad_norm": 0.1849129694138264, + "learning_rate": 5.159794865744593e-05, + "loss": 0.8966, + "step": 21461 + }, + { + "epoch": 4.578071672354949, + "grad_norm": 0.1507042212580263, + "learning_rate": 5.1589993408080165e-05, + "loss": 0.8963, + "step": 21462 + }, + { + "epoch": 4.578284982935154, + "grad_norm": 0.19263454160352217, + "learning_rate": 5.15820384801741e-05, + "loss": 0.9035, + "step": 21463 + }, + { + "epoch": 4.578498293515358, + "grad_norm": 0.15458569193344032, + "learning_rate": 5.157408387381773e-05, + "loss": 0.8798, + "step": 21464 + }, + { + "epoch": 4.578711604095563, + "grad_norm": 0.19161051015850136, + "learning_rate": 5.156612958910111e-05, + "loss": 0.9414, + "step": 21465 + }, + { + "epoch": 4.578924914675768, + "grad_norm": 0.19807123919317707, + "learning_rate": 5.155817562611418e-05, + "loss": 0.9159, + "step": 21466 + }, + { + "epoch": 4.5791382252559725, + "grad_norm": 0.14986813834653914, + "learning_rate": 5.155022198494702e-05, + "loss": 0.9217, + "step": 21467 + }, + { + "epoch": 4.579351535836177, + "grad_norm": 0.19736715003901817, + "learning_rate": 5.154226866568955e-05, + "loss": 0.8911, + "step": 21468 + }, + { + "epoch": 4.579564846416382, + "grad_norm": 0.1642694125617202, + "learning_rate": 5.153431566843177e-05, + "loss": 0.9047, + "step": 21469 + }, + { + "epoch": 4.579778156996587, + "grad_norm": 0.19228692636812766, + "learning_rate": 5.1526362993263703e-05, + "loss": 0.9024, + "step": 21470 + }, + { + "epoch": 4.579991467576792, + "grad_norm": 0.1962661712394802, + "learning_rate": 5.1518410640275314e-05, + "loss": 0.919, + "step": 21471 + }, + { + "epoch": 4.580204778156997, + "grad_norm": 0.18292589095142467, + "learning_rate": 5.1510458609556594e-05, + "loss": 0.9016, + "step": 21472 + }, + { + "epoch": 4.5804180887372015, + "grad_norm": 0.1874589006399552, + "learning_rate": 5.15025069011975e-05, + "loss": 0.889, + "step": 21473 + }, + { + "epoch": 4.580631399317406, + "grad_norm": 0.16586659547505495, + "learning_rate": 5.149455551528803e-05, + "loss": 0.9028, + "step": 21474 + }, + { + "epoch": 4.580844709897611, + "grad_norm": 0.20082042848870077, + "learning_rate": 5.148660445191815e-05, + "loss": 0.8989, + "step": 21475 + }, + { + "epoch": 4.5810580204778155, + "grad_norm": 0.21010618819076296, + "learning_rate": 5.14786537111778e-05, + "loss": 0.8911, + "step": 21476 + }, + { + "epoch": 4.58127133105802, + "grad_norm": 0.20502256162466817, + "learning_rate": 5.147070329315698e-05, + "loss": 0.8957, + "step": 21477 + }, + { + "epoch": 4.581484641638225, + "grad_norm": 0.19551374332296814, + "learning_rate": 5.146275319794565e-05, + "loss": 0.9227, + "step": 21478 + }, + { + "epoch": 4.5816979522184305, + "grad_norm": 0.14938980761513523, + "learning_rate": 5.145480342563372e-05, + "loss": 0.8752, + "step": 21479 + }, + { + "epoch": 4.581911262798635, + "grad_norm": 0.16363710614723728, + "learning_rate": 5.1446853976311176e-05, + "loss": 0.8713, + "step": 21480 + }, + { + "epoch": 4.58212457337884, + "grad_norm": 0.15630761975402113, + "learning_rate": 5.143890485006795e-05, + "loss": 0.8916, + "step": 21481 + }, + { + "epoch": 4.5823378839590445, + "grad_norm": 0.16575331019321995, + "learning_rate": 5.143095604699401e-05, + "loss": 0.886, + "step": 21482 + }, + { + "epoch": 4.582551194539249, + "grad_norm": 0.15003671614154498, + "learning_rate": 5.142300756717926e-05, + "loss": 0.9214, + "step": 21483 + }, + { + "epoch": 4.582764505119454, + "grad_norm": 0.14134736692827227, + "learning_rate": 5.1415059410713696e-05, + "loss": 0.8732, + "step": 21484 + }, + { + "epoch": 4.582977815699659, + "grad_norm": 0.1765870772861526, + "learning_rate": 5.140711157768718e-05, + "loss": 0.9284, + "step": 21485 + }, + { + "epoch": 4.583191126279863, + "grad_norm": 0.14674150919006354, + "learning_rate": 5.13991640681897e-05, + "loss": 0.878, + "step": 21486 + }, + { + "epoch": 4.583404436860068, + "grad_norm": 0.16508718242607837, + "learning_rate": 5.1391216882311165e-05, + "loss": 0.8995, + "step": 21487 + }, + { + "epoch": 4.5836177474402735, + "grad_norm": 0.2214400568401179, + "learning_rate": 5.138327002014147e-05, + "loss": 0.8974, + "step": 21488 + }, + { + "epoch": 4.583831058020478, + "grad_norm": 0.14387471976253546, + "learning_rate": 5.1375323481770574e-05, + "loss": 0.8947, + "step": 21489 + }, + { + "epoch": 4.584044368600683, + "grad_norm": 0.25861304961534937, + "learning_rate": 5.1367377267288385e-05, + "loss": 0.91, + "step": 21490 + }, + { + "epoch": 4.584257679180888, + "grad_norm": 0.24389612167069274, + "learning_rate": 5.135943137678478e-05, + "loss": 0.9041, + "step": 21491 + }, + { + "epoch": 4.584470989761092, + "grad_norm": 0.2053027572093757, + "learning_rate": 5.13514858103497e-05, + "loss": 0.9451, + "step": 21492 + }, + { + "epoch": 4.584684300341297, + "grad_norm": 0.17707343672373002, + "learning_rate": 5.1343540568073024e-05, + "loss": 0.8926, + "step": 21493 + }, + { + "epoch": 4.584897610921502, + "grad_norm": 0.23584368739692543, + "learning_rate": 5.1335595650044685e-05, + "loss": 0.9038, + "step": 21494 + }, + { + "epoch": 4.585110921501706, + "grad_norm": 0.17561594689728, + "learning_rate": 5.132765105635456e-05, + "loss": 0.8877, + "step": 21495 + }, + { + "epoch": 4.585324232081911, + "grad_norm": 0.179728006284379, + "learning_rate": 5.1319706787092545e-05, + "loss": 0.8916, + "step": 21496 + }, + { + "epoch": 4.585537542662116, + "grad_norm": 0.23985324470188138, + "learning_rate": 5.131176284234854e-05, + "loss": 0.9233, + "step": 21497 + }, + { + "epoch": 4.58575085324232, + "grad_norm": 0.2250728586381829, + "learning_rate": 5.13038192222124e-05, + "loss": 0.8854, + "step": 21498 + }, + { + "epoch": 4.585964163822526, + "grad_norm": 0.19085421269749353, + "learning_rate": 5.129587592677404e-05, + "loss": 0.9226, + "step": 21499 + }, + { + "epoch": 4.586177474402731, + "grad_norm": 0.15986451755512196, + "learning_rate": 5.128793295612331e-05, + "loss": 0.9168, + "step": 21500 + }, + { + "epoch": 4.586390784982935, + "grad_norm": 0.17473367065863737, + "learning_rate": 5.127999031035015e-05, + "loss": 0.8589, + "step": 21501 + }, + { + "epoch": 4.58660409556314, + "grad_norm": 0.16045932511187694, + "learning_rate": 5.1272047989544365e-05, + "loss": 0.919, + "step": 21502 + }, + { + "epoch": 4.586817406143345, + "grad_norm": 0.15355684342250983, + "learning_rate": 5.126410599379581e-05, + "loss": 0.8796, + "step": 21503 + }, + { + "epoch": 4.587030716723549, + "grad_norm": 0.14132208081830838, + "learning_rate": 5.12561643231944e-05, + "loss": 0.8989, + "step": 21504 + }, + { + "epoch": 4.587244027303754, + "grad_norm": 0.16617470308951035, + "learning_rate": 5.1248222977829954e-05, + "loss": 0.8904, + "step": 21505 + }, + { + "epoch": 4.587457337883959, + "grad_norm": 0.13126827804654467, + "learning_rate": 5.124028195779238e-05, + "loss": 0.8913, + "step": 21506 + }, + { + "epoch": 4.587670648464163, + "grad_norm": 0.1712573306083382, + "learning_rate": 5.123234126317146e-05, + "loss": 0.8992, + "step": 21507 + }, + { + "epoch": 4.587883959044369, + "grad_norm": 0.1596584181502584, + "learning_rate": 5.122440089405712e-05, + "loss": 0.9037, + "step": 21508 + }, + { + "epoch": 4.588097269624574, + "grad_norm": 0.14544033589423289, + "learning_rate": 5.121646085053915e-05, + "loss": 0.8952, + "step": 21509 + }, + { + "epoch": 4.588310580204778, + "grad_norm": 0.15648625466293659, + "learning_rate": 5.120852113270741e-05, + "loss": 0.9112, + "step": 21510 + }, + { + "epoch": 4.588523890784983, + "grad_norm": 0.15507117054852135, + "learning_rate": 5.120058174065174e-05, + "loss": 0.8975, + "step": 21511 + }, + { + "epoch": 4.588737201365188, + "grad_norm": 0.1899657860710615, + "learning_rate": 5.1192642674462e-05, + "loss": 0.9249, + "step": 21512 + }, + { + "epoch": 4.588950511945392, + "grad_norm": 0.16432078249781304, + "learning_rate": 5.1184703934227955e-05, + "loss": 0.901, + "step": 21513 + }, + { + "epoch": 4.589163822525597, + "grad_norm": 0.1741439657073746, + "learning_rate": 5.117676552003948e-05, + "loss": 0.9143, + "step": 21514 + }, + { + "epoch": 4.589377133105802, + "grad_norm": 0.1834651937116246, + "learning_rate": 5.116882743198639e-05, + "loss": 0.9097, + "step": 21515 + }, + { + "epoch": 4.589590443686006, + "grad_norm": 0.1609670935204354, + "learning_rate": 5.1160889670158505e-05, + "loss": 0.9375, + "step": 21516 + }, + { + "epoch": 4.589803754266212, + "grad_norm": 0.15365624333408637, + "learning_rate": 5.115295223464563e-05, + "loss": 0.9112, + "step": 21517 + }, + { + "epoch": 4.590017064846417, + "grad_norm": 0.1532731724550834, + "learning_rate": 5.114501512553759e-05, + "loss": 0.9316, + "step": 21518 + }, + { + "epoch": 4.590230375426621, + "grad_norm": 0.14699476289678842, + "learning_rate": 5.113707834292418e-05, + "loss": 0.8802, + "step": 21519 + }, + { + "epoch": 4.590443686006826, + "grad_norm": 0.16540945155652098, + "learning_rate": 5.112914188689522e-05, + "loss": 0.9077, + "step": 21520 + }, + { + "epoch": 4.590656996587031, + "grad_norm": 0.1824151205960765, + "learning_rate": 5.112120575754052e-05, + "loss": 0.9224, + "step": 21521 + }, + { + "epoch": 4.590870307167235, + "grad_norm": 0.14446004095035497, + "learning_rate": 5.1113269954949836e-05, + "loss": 0.8973, + "step": 21522 + }, + { + "epoch": 4.59108361774744, + "grad_norm": 0.17797719510175103, + "learning_rate": 5.110533447921302e-05, + "loss": 0.9279, + "step": 21523 + }, + { + "epoch": 4.591296928327645, + "grad_norm": 0.1459099859096006, + "learning_rate": 5.109739933041983e-05, + "loss": 0.9198, + "step": 21524 + }, + { + "epoch": 4.5915102389078495, + "grad_norm": 0.14386529242729001, + "learning_rate": 5.108946450866002e-05, + "loss": 0.9032, + "step": 21525 + }, + { + "epoch": 4.591723549488055, + "grad_norm": 0.15141168888353648, + "learning_rate": 5.1081530014023434e-05, + "loss": 0.8949, + "step": 21526 + }, + { + "epoch": 4.59193686006826, + "grad_norm": 0.1858262929610512, + "learning_rate": 5.10735958465998e-05, + "loss": 0.9262, + "step": 21527 + }, + { + "epoch": 4.592150170648464, + "grad_norm": 0.1434546821152744, + "learning_rate": 5.1065662006478925e-05, + "loss": 0.897, + "step": 21528 + }, + { + "epoch": 4.592363481228669, + "grad_norm": 0.1622223643848656, + "learning_rate": 5.105772849375056e-05, + "loss": 0.9264, + "step": 21529 + }, + { + "epoch": 4.592576791808874, + "grad_norm": 0.17267065858645625, + "learning_rate": 5.1049795308504504e-05, + "loss": 0.8998, + "step": 21530 + }, + { + "epoch": 4.5927901023890785, + "grad_norm": 0.17366385368014095, + "learning_rate": 5.104186245083048e-05, + "loss": 0.8995, + "step": 21531 + }, + { + "epoch": 4.593003412969283, + "grad_norm": 0.191490028438346, + "learning_rate": 5.103392992081829e-05, + "loss": 0.8978, + "step": 21532 + }, + { + "epoch": 4.593216723549488, + "grad_norm": 0.1473213073027269, + "learning_rate": 5.102599771855767e-05, + "loss": 0.9372, + "step": 21533 + }, + { + "epoch": 4.5934300341296925, + "grad_norm": 0.2101223460766167, + "learning_rate": 5.101806584413839e-05, + "loss": 0.9076, + "step": 21534 + }, + { + "epoch": 4.593643344709898, + "grad_norm": 0.18552479111753278, + "learning_rate": 5.1010134297650143e-05, + "loss": 0.8878, + "step": 21535 + }, + { + "epoch": 4.593856655290102, + "grad_norm": 0.15863935446908942, + "learning_rate": 5.100220307918274e-05, + "loss": 0.9104, + "step": 21536 + }, + { + "epoch": 4.5940699658703075, + "grad_norm": 0.15574343818226305, + "learning_rate": 5.0994272188825874e-05, + "loss": 0.9078, + "step": 21537 + }, + { + "epoch": 4.594283276450512, + "grad_norm": 0.18933218488120857, + "learning_rate": 5.098634162666932e-05, + "loss": 0.9051, + "step": 21538 + }, + { + "epoch": 4.594496587030717, + "grad_norm": 0.14390456158529988, + "learning_rate": 5.097841139280279e-05, + "loss": 0.9072, + "step": 21539 + }, + { + "epoch": 4.5947098976109215, + "grad_norm": 0.1855076171758625, + "learning_rate": 5.0970481487316036e-05, + "loss": 0.8792, + "step": 21540 + }, + { + "epoch": 4.594923208191126, + "grad_norm": 0.1408904612382675, + "learning_rate": 5.096255191029875e-05, + "loss": 0.8839, + "step": 21541 + }, + { + "epoch": 4.595136518771331, + "grad_norm": 0.21564354265443458, + "learning_rate": 5.09546226618407e-05, + "loss": 0.9087, + "step": 21542 + }, + { + "epoch": 4.595349829351536, + "grad_norm": 0.1482029692881069, + "learning_rate": 5.0946693742031564e-05, + "loss": 0.8882, + "step": 21543 + }, + { + "epoch": 4.59556313993174, + "grad_norm": 0.2028784959279536, + "learning_rate": 5.09387651509611e-05, + "loss": 0.8785, + "step": 21544 + }, + { + "epoch": 4.595776450511945, + "grad_norm": 0.15737792616700738, + "learning_rate": 5.0930836888719005e-05, + "loss": 0.896, + "step": 21545 + }, + { + "epoch": 4.5959897610921505, + "grad_norm": 0.16759291271478577, + "learning_rate": 5.092290895539497e-05, + "loss": 0.8964, + "step": 21546 + }, + { + "epoch": 4.596203071672355, + "grad_norm": 0.1970195847239095, + "learning_rate": 5.091498135107869e-05, + "loss": 0.8926, + "step": 21547 + }, + { + "epoch": 4.59641638225256, + "grad_norm": 0.18238207695544667, + "learning_rate": 5.09070540758599e-05, + "loss": 0.8928, + "step": 21548 + }, + { + "epoch": 4.596629692832765, + "grad_norm": 0.1551870417996046, + "learning_rate": 5.089912712982827e-05, + "loss": 0.8863, + "step": 21549 + }, + { + "epoch": 4.596843003412969, + "grad_norm": 0.16446292679428262, + "learning_rate": 5.089120051307352e-05, + "loss": 0.92, + "step": 21550 + }, + { + "epoch": 4.597056313993174, + "grad_norm": 0.196733939222949, + "learning_rate": 5.088327422568531e-05, + "loss": 0.9302, + "step": 21551 + }, + { + "epoch": 4.597269624573379, + "grad_norm": 0.21091819032694303, + "learning_rate": 5.087534826775336e-05, + "loss": 0.8912, + "step": 21552 + }, + { + "epoch": 4.597482935153583, + "grad_norm": 0.16231801198352658, + "learning_rate": 5.0867422639367317e-05, + "loss": 0.9039, + "step": 21553 + }, + { + "epoch": 4.597696245733788, + "grad_norm": 0.16369713635915742, + "learning_rate": 5.0859497340616886e-05, + "loss": 0.8988, + "step": 21554 + }, + { + "epoch": 4.597909556313994, + "grad_norm": 0.17651313735785013, + "learning_rate": 5.0851572371591724e-05, + "loss": 0.8962, + "step": 21555 + }, + { + "epoch": 4.598122866894198, + "grad_norm": 0.16553016740082688, + "learning_rate": 5.084364773238152e-05, + "loss": 0.8927, + "step": 21556 + }, + { + "epoch": 4.598336177474403, + "grad_norm": 0.1565500952290554, + "learning_rate": 5.0835723423075964e-05, + "loss": 0.9286, + "step": 21557 + }, + { + "epoch": 4.598549488054608, + "grad_norm": 0.19198219782847223, + "learning_rate": 5.082779944376466e-05, + "loss": 0.9163, + "step": 21558 + }, + { + "epoch": 4.598762798634812, + "grad_norm": 0.14810660255397162, + "learning_rate": 5.081987579453728e-05, + "loss": 0.9077, + "step": 21559 + }, + { + "epoch": 4.598976109215017, + "grad_norm": 0.17186179786422934, + "learning_rate": 5.081195247548351e-05, + "loss": 0.9109, + "step": 21560 + }, + { + "epoch": 4.599189419795222, + "grad_norm": 0.14810144265663636, + "learning_rate": 5.080402948669298e-05, + "loss": 0.9194, + "step": 21561 + }, + { + "epoch": 4.599402730375426, + "grad_norm": 0.15605281640127777, + "learning_rate": 5.079610682825536e-05, + "loss": 0.902, + "step": 21562 + }, + { + "epoch": 4.599616040955631, + "grad_norm": 0.14899371327435348, + "learning_rate": 5.0788184500260266e-05, + "loss": 0.8818, + "step": 21563 + }, + { + "epoch": 4.599829351535837, + "grad_norm": 0.1691662134308979, + "learning_rate": 5.078026250279737e-05, + "loss": 0.8767, + "step": 21564 + }, + { + "epoch": 4.600042662116041, + "grad_norm": 0.17308678140574765, + "learning_rate": 5.077234083595628e-05, + "loss": 0.8992, + "step": 21565 + }, + { + "epoch": 4.600255972696246, + "grad_norm": 0.16995125199136482, + "learning_rate": 5.0764419499826665e-05, + "loss": 0.8816, + "step": 21566 + }, + { + "epoch": 4.600469283276451, + "grad_norm": 0.154872010403236, + "learning_rate": 5.0756498494498115e-05, + "loss": 0.8933, + "step": 21567 + }, + { + "epoch": 4.600682593856655, + "grad_norm": 0.16262445191432756, + "learning_rate": 5.0748577820060336e-05, + "loss": 0.9169, + "step": 21568 + }, + { + "epoch": 4.60089590443686, + "grad_norm": 0.21204292365700333, + "learning_rate": 5.074065747660282e-05, + "loss": 0.8859, + "step": 21569 + }, + { + "epoch": 4.601109215017065, + "grad_norm": 0.1656779638500579, + "learning_rate": 5.07327374642153e-05, + "loss": 0.8789, + "step": 21570 + }, + { + "epoch": 4.601322525597269, + "grad_norm": 0.16905321019543393, + "learning_rate": 5.072481778298731e-05, + "loss": 0.8942, + "step": 21571 + }, + { + "epoch": 4.601535836177474, + "grad_norm": 0.18951971430970183, + "learning_rate": 5.0716898433008525e-05, + "loss": 0.906, + "step": 21572 + }, + { + "epoch": 4.60174914675768, + "grad_norm": 0.15676267463273455, + "learning_rate": 5.070897941436851e-05, + "loss": 0.8731, + "step": 21573 + }, + { + "epoch": 4.601962457337884, + "grad_norm": 0.1537566081307109, + "learning_rate": 5.07010607271569e-05, + "loss": 0.9033, + "step": 21574 + }, + { + "epoch": 4.602175767918089, + "grad_norm": 0.13679667973957524, + "learning_rate": 5.069314237146326e-05, + "loss": 0.8982, + "step": 21575 + }, + { + "epoch": 4.602389078498294, + "grad_norm": 0.15935066171706894, + "learning_rate": 5.068522434737723e-05, + "loss": 0.9188, + "step": 21576 + }, + { + "epoch": 4.602602389078498, + "grad_norm": 0.14592846044687113, + "learning_rate": 5.067730665498836e-05, + "loss": 0.8795, + "step": 21577 + }, + { + "epoch": 4.602815699658703, + "grad_norm": 0.20383974740117794, + "learning_rate": 5.0669389294386284e-05, + "loss": 0.8744, + "step": 21578 + }, + { + "epoch": 4.603029010238908, + "grad_norm": 0.17057750391397614, + "learning_rate": 5.066147226566056e-05, + "loss": 0.8783, + "step": 21579 + }, + { + "epoch": 4.603242320819112, + "grad_norm": 0.16432411278034117, + "learning_rate": 5.065355556890077e-05, + "loss": 0.8978, + "step": 21580 + }, + { + "epoch": 4.603455631399317, + "grad_norm": 0.22944839986171914, + "learning_rate": 5.0645639204196464e-05, + "loss": 0.9101, + "step": 21581 + }, + { + "epoch": 4.603668941979522, + "grad_norm": 0.23441154598118508, + "learning_rate": 5.063772317163727e-05, + "loss": 0.8772, + "step": 21582 + }, + { + "epoch": 4.6038822525597265, + "grad_norm": 0.1660907736763685, + "learning_rate": 5.0629807471312713e-05, + "loss": 0.9087, + "step": 21583 + }, + { + "epoch": 4.604095563139932, + "grad_norm": 0.18020652010100327, + "learning_rate": 5.0621892103312394e-05, + "loss": 0.9112, + "step": 21584 + }, + { + "epoch": 4.604308873720137, + "grad_norm": 0.2239659766909593, + "learning_rate": 5.061397706772585e-05, + "loss": 0.8902, + "step": 21585 + }, + { + "epoch": 4.604522184300341, + "grad_norm": 0.17903077522632255, + "learning_rate": 5.0606062364642655e-05, + "loss": 0.9345, + "step": 21586 + }, + { + "epoch": 4.604735494880546, + "grad_norm": 0.18554538677257257, + "learning_rate": 5.059814799415235e-05, + "loss": 0.9027, + "step": 21587 + }, + { + "epoch": 4.604948805460751, + "grad_norm": 0.2080686180843486, + "learning_rate": 5.0590233956344507e-05, + "loss": 0.8874, + "step": 21588 + }, + { + "epoch": 4.6051621160409555, + "grad_norm": 0.18372775116486792, + "learning_rate": 5.058232025130865e-05, + "loss": 0.9074, + "step": 21589 + }, + { + "epoch": 4.60537542662116, + "grad_norm": 0.19179980523940057, + "learning_rate": 5.057440687913438e-05, + "loss": 0.9008, + "step": 21590 + }, + { + "epoch": 4.605588737201365, + "grad_norm": 0.1775429062272161, + "learning_rate": 5.056649383991114e-05, + "loss": 0.8984, + "step": 21591 + }, + { + "epoch": 4.6058020477815695, + "grad_norm": 0.18511673201901488, + "learning_rate": 5.0558581133728534e-05, + "loss": 0.884, + "step": 21592 + }, + { + "epoch": 4.606015358361775, + "grad_norm": 0.15660471119795338, + "learning_rate": 5.0550668760676064e-05, + "loss": 0.9134, + "step": 21593 + }, + { + "epoch": 4.60622866894198, + "grad_norm": 0.22337702152773634, + "learning_rate": 5.054275672084329e-05, + "loss": 0.8898, + "step": 21594 + }, + { + "epoch": 4.6064419795221845, + "grad_norm": 0.17354463506298073, + "learning_rate": 5.053484501431971e-05, + "loss": 0.9085, + "step": 21595 + }, + { + "epoch": 4.606655290102389, + "grad_norm": 0.18199262966616042, + "learning_rate": 5.052693364119486e-05, + "loss": 0.9496, + "step": 21596 + }, + { + "epoch": 4.606868600682594, + "grad_norm": 0.13966137939093767, + "learning_rate": 5.051902260155823e-05, + "loss": 0.9016, + "step": 21597 + }, + { + "epoch": 4.6070819112627985, + "grad_norm": 0.1762761684484577, + "learning_rate": 5.0511111895499375e-05, + "loss": 0.859, + "step": 21598 + }, + { + "epoch": 4.607295221843003, + "grad_norm": 0.17153523975493148, + "learning_rate": 5.050320152310778e-05, + "loss": 0.9092, + "step": 21599 + }, + { + "epoch": 4.607508532423208, + "grad_norm": 0.155970343874257, + "learning_rate": 5.049529148447296e-05, + "loss": 0.9058, + "step": 21600 + }, + { + "epoch": 4.607721843003413, + "grad_norm": 0.14241087610318873, + "learning_rate": 5.048738177968444e-05, + "loss": 0.8965, + "step": 21601 + }, + { + "epoch": 4.607935153583618, + "grad_norm": 0.16697000478452723, + "learning_rate": 5.047947240883167e-05, + "loss": 0.9335, + "step": 21602 + }, + { + "epoch": 4.608148464163823, + "grad_norm": 0.1900449744328079, + "learning_rate": 5.047156337200415e-05, + "loss": 0.8967, + "step": 21603 + }, + { + "epoch": 4.6083617747440275, + "grad_norm": 0.19761525534254676, + "learning_rate": 5.046365466929139e-05, + "loss": 0.907, + "step": 21604 + }, + { + "epoch": 4.608575085324232, + "grad_norm": 0.1351185104484305, + "learning_rate": 5.045574630078287e-05, + "loss": 0.9006, + "step": 21605 + }, + { + "epoch": 4.608788395904437, + "grad_norm": 0.18517311571088507, + "learning_rate": 5.04478382665681e-05, + "loss": 0.9319, + "step": 21606 + }, + { + "epoch": 4.609001706484642, + "grad_norm": 0.2133780449867732, + "learning_rate": 5.043993056673652e-05, + "loss": 0.921, + "step": 21607 + }, + { + "epoch": 4.609215017064846, + "grad_norm": 0.17268425307198534, + "learning_rate": 5.043202320137763e-05, + "loss": 0.9035, + "step": 21608 + }, + { + "epoch": 4.609428327645051, + "grad_norm": 0.15229177668138008, + "learning_rate": 5.042411617058088e-05, + "loss": 0.8999, + "step": 21609 + }, + { + "epoch": 4.609641638225256, + "grad_norm": 0.14423694442861568, + "learning_rate": 5.0416209474435776e-05, + "loss": 0.8659, + "step": 21610 + }, + { + "epoch": 4.609854948805461, + "grad_norm": 0.17100401921616182, + "learning_rate": 5.0408303113031734e-05, + "loss": 0.8994, + "step": 21611 + }, + { + "epoch": 4.610068259385666, + "grad_norm": 0.14696693184357296, + "learning_rate": 5.040039708645827e-05, + "loss": 0.8961, + "step": 21612 + }, + { + "epoch": 4.610281569965871, + "grad_norm": 0.14733340874362297, + "learning_rate": 5.0392491394804806e-05, + "loss": 0.8749, + "step": 21613 + }, + { + "epoch": 4.610494880546075, + "grad_norm": 0.156185350867967, + "learning_rate": 5.0384586038160807e-05, + "loss": 0.8795, + "step": 21614 + }, + { + "epoch": 4.61070819112628, + "grad_norm": 0.15196556977044495, + "learning_rate": 5.0376681016615686e-05, + "loss": 0.8817, + "step": 21615 + }, + { + "epoch": 4.610921501706485, + "grad_norm": 0.16967226714977401, + "learning_rate": 5.036877633025894e-05, + "loss": 0.9022, + "step": 21616 + }, + { + "epoch": 4.611134812286689, + "grad_norm": 0.1445443097454849, + "learning_rate": 5.0360871979179955e-05, + "loss": 0.8624, + "step": 21617 + }, + { + "epoch": 4.611348122866894, + "grad_norm": 0.18008263282394826, + "learning_rate": 5.0352967963468245e-05, + "loss": 0.908, + "step": 21618 + }, + { + "epoch": 4.611561433447099, + "grad_norm": 0.1560360392585138, + "learning_rate": 5.034506428321317e-05, + "loss": 0.8975, + "step": 21619 + }, + { + "epoch": 4.611774744027304, + "grad_norm": 0.18381435212001085, + "learning_rate": 5.0337160938504205e-05, + "loss": 0.9131, + "step": 21620 + }, + { + "epoch": 4.611988054607508, + "grad_norm": 0.14341941774944997, + "learning_rate": 5.032925792943075e-05, + "loss": 0.8709, + "step": 21621 + }, + { + "epoch": 4.612201365187714, + "grad_norm": 0.17245102205650634, + "learning_rate": 5.032135525608226e-05, + "loss": 0.8792, + "step": 21622 + }, + { + "epoch": 4.612414675767918, + "grad_norm": 0.1686747169265182, + "learning_rate": 5.031345291854811e-05, + "loss": 0.9146, + "step": 21623 + }, + { + "epoch": 4.612627986348123, + "grad_norm": 0.15562566981421636, + "learning_rate": 5.030555091691778e-05, + "loss": 0.8814, + "step": 21624 + }, + { + "epoch": 4.612841296928328, + "grad_norm": 0.17362569093208333, + "learning_rate": 5.0297649251280616e-05, + "loss": 0.8682, + "step": 21625 + }, + { + "epoch": 4.613054607508532, + "grad_norm": 0.16297703537421146, + "learning_rate": 5.028974792172606e-05, + "loss": 0.8986, + "step": 21626 + }, + { + "epoch": 4.613267918088737, + "grad_norm": 0.1520437552533542, + "learning_rate": 5.028184692834348e-05, + "loss": 0.9097, + "step": 21627 + }, + { + "epoch": 4.613481228668942, + "grad_norm": 0.17228520274489612, + "learning_rate": 5.027394627122233e-05, + "loss": 0.909, + "step": 21628 + }, + { + "epoch": 4.613694539249146, + "grad_norm": 0.14291024258875076, + "learning_rate": 5.0266045950451956e-05, + "loss": 0.8931, + "step": 21629 + }, + { + "epoch": 4.613907849829351, + "grad_norm": 0.1494746848097124, + "learning_rate": 5.025814596612179e-05, + "loss": 0.8963, + "step": 21630 + }, + { + "epoch": 4.614121160409557, + "grad_norm": 0.15702801882503764, + "learning_rate": 5.0250246318321176e-05, + "loss": 0.9013, + "step": 21631 + }, + { + "epoch": 4.614334470989761, + "grad_norm": 0.15679639755822408, + "learning_rate": 5.024234700713955e-05, + "loss": 0.8969, + "step": 21632 + }, + { + "epoch": 4.614547781569966, + "grad_norm": 0.1296856412564758, + "learning_rate": 5.0234448032666265e-05, + "loss": 0.8725, + "step": 21633 + }, + { + "epoch": 4.614761092150171, + "grad_norm": 0.14920311691515947, + "learning_rate": 5.02265493949907e-05, + "loss": 0.9183, + "step": 21634 + }, + { + "epoch": 4.614974402730375, + "grad_norm": 0.2088166500856261, + "learning_rate": 5.021865109420225e-05, + "loss": 0.907, + "step": 21635 + }, + { + "epoch": 4.61518771331058, + "grad_norm": 0.22199721025320565, + "learning_rate": 5.0210753130390234e-05, + "loss": 0.9518, + "step": 21636 + }, + { + "epoch": 4.615401023890785, + "grad_norm": 0.19308874655537217, + "learning_rate": 5.0202855503644066e-05, + "loss": 0.9134, + "step": 21637 + }, + { + "epoch": 4.6156143344709895, + "grad_norm": 0.2121788375051534, + "learning_rate": 5.019495821405308e-05, + "loss": 0.9229, + "step": 21638 + }, + { + "epoch": 4.615827645051194, + "grad_norm": 0.14315060952856928, + "learning_rate": 5.0187061261706624e-05, + "loss": 0.8635, + "step": 21639 + }, + { + "epoch": 4.6160409556314, + "grad_norm": 0.22080605832078723, + "learning_rate": 5.017916464669411e-05, + "loss": 0.8949, + "step": 21640 + }, + { + "epoch": 4.616254266211604, + "grad_norm": 0.23643263138984646, + "learning_rate": 5.01712683691048e-05, + "loss": 0.8878, + "step": 21641 + }, + { + "epoch": 4.616467576791809, + "grad_norm": 0.14342691372609384, + "learning_rate": 5.016337242902813e-05, + "loss": 0.9074, + "step": 21642 + }, + { + "epoch": 4.616680887372014, + "grad_norm": 0.25510659167747957, + "learning_rate": 5.0155476826553383e-05, + "loss": 0.908, + "step": 21643 + }, + { + "epoch": 4.6168941979522184, + "grad_norm": 0.21750099040777823, + "learning_rate": 5.014758156176993e-05, + "loss": 0.9016, + "step": 21644 + }, + { + "epoch": 4.617107508532423, + "grad_norm": 0.1684708567726051, + "learning_rate": 5.013968663476708e-05, + "loss": 0.878, + "step": 21645 + }, + { + "epoch": 4.617320819112628, + "grad_norm": 0.2692639499753348, + "learning_rate": 5.013179204563422e-05, + "loss": 0.9042, + "step": 21646 + }, + { + "epoch": 4.6175341296928325, + "grad_norm": 0.19122362596545397, + "learning_rate": 5.012389779446061e-05, + "loss": 0.8924, + "step": 21647 + }, + { + "epoch": 4.617747440273037, + "grad_norm": 0.2582830139667529, + "learning_rate": 5.011600388133557e-05, + "loss": 0.9227, + "step": 21648 + }, + { + "epoch": 4.617960750853243, + "grad_norm": 0.15747162690108502, + "learning_rate": 5.010811030634848e-05, + "loss": 0.882, + "step": 21649 + }, + { + "epoch": 4.618174061433447, + "grad_norm": 0.26030349703721123, + "learning_rate": 5.010021706958862e-05, + "loss": 0.9205, + "step": 21650 + }, + { + "epoch": 4.618387372013652, + "grad_norm": 0.24403641532593312, + "learning_rate": 5.009232417114529e-05, + "loss": 0.8824, + "step": 21651 + }, + { + "epoch": 4.618600682593857, + "grad_norm": 0.1562280274322041, + "learning_rate": 5.008443161110783e-05, + "loss": 0.9198, + "step": 21652 + }, + { + "epoch": 4.6188139931740615, + "grad_norm": 0.23698811426653016, + "learning_rate": 5.0076539389565524e-05, + "loss": 0.9228, + "step": 21653 + }, + { + "epoch": 4.619027303754266, + "grad_norm": 0.1953727025451405, + "learning_rate": 5.006864750660769e-05, + "loss": 0.891, + "step": 21654 + }, + { + "epoch": 4.619240614334471, + "grad_norm": 0.21239842291295197, + "learning_rate": 5.006075596232359e-05, + "loss": 0.8926, + "step": 21655 + }, + { + "epoch": 4.6194539249146755, + "grad_norm": 0.23168959769681058, + "learning_rate": 5.0052864756802564e-05, + "loss": 0.8959, + "step": 21656 + }, + { + "epoch": 4.61966723549488, + "grad_norm": 0.22381036185019554, + "learning_rate": 5.0044973890133886e-05, + "loss": 0.9118, + "step": 21657 + }, + { + "epoch": 4.619880546075086, + "grad_norm": 0.19773235335691122, + "learning_rate": 5.0037083362406796e-05, + "loss": 0.9228, + "step": 21658 + }, + { + "epoch": 4.6200938566552905, + "grad_norm": 0.35497398018434567, + "learning_rate": 5.002919317371064e-05, + "loss": 0.8969, + "step": 21659 + }, + { + "epoch": 4.620307167235495, + "grad_norm": 0.264969778062183, + "learning_rate": 5.002130332413466e-05, + "loss": 0.8993, + "step": 21660 + }, + { + "epoch": 4.6205204778157, + "grad_norm": 0.20692159412946293, + "learning_rate": 5.0013413813768134e-05, + "loss": 0.9013, + "step": 21661 + }, + { + "epoch": 4.6207337883959045, + "grad_norm": 0.1801875085055243, + "learning_rate": 5.0005524642700336e-05, + "loss": 0.897, + "step": 21662 + }, + { + "epoch": 4.620947098976109, + "grad_norm": 0.22311108786090642, + "learning_rate": 4.999763581102052e-05, + "loss": 0.9454, + "step": 21663 + }, + { + "epoch": 4.621160409556314, + "grad_norm": 0.21202072202097338, + "learning_rate": 4.998974731881797e-05, + "loss": 0.888, + "step": 21664 + }, + { + "epoch": 4.621373720136519, + "grad_norm": 0.17572228335158022, + "learning_rate": 4.9981859166181926e-05, + "loss": 0.9163, + "step": 21665 + }, + { + "epoch": 4.621587030716723, + "grad_norm": 0.19570113972747788, + "learning_rate": 4.997397135320167e-05, + "loss": 0.8829, + "step": 21666 + }, + { + "epoch": 4.621800341296928, + "grad_norm": 0.28340040748569306, + "learning_rate": 4.99660838799664e-05, + "loss": 0.9301, + "step": 21667 + }, + { + "epoch": 4.622013651877133, + "grad_norm": 0.188986761210145, + "learning_rate": 4.995819674656543e-05, + "loss": 0.8819, + "step": 21668 + }, + { + "epoch": 4.622226962457338, + "grad_norm": 0.1970445965514272, + "learning_rate": 4.995030995308798e-05, + "loss": 0.892, + "step": 21669 + }, + { + "epoch": 4.622440273037543, + "grad_norm": 0.2140974782834734, + "learning_rate": 4.9942423499623246e-05, + "loss": 0.8819, + "step": 21670 + }, + { + "epoch": 4.622653583617748, + "grad_norm": 0.1840420782949343, + "learning_rate": 4.9934537386260504e-05, + "loss": 0.9032, + "step": 21671 + }, + { + "epoch": 4.622866894197952, + "grad_norm": 0.1764412252770795, + "learning_rate": 4.992665161308896e-05, + "loss": 0.9066, + "step": 21672 + }, + { + "epoch": 4.623080204778157, + "grad_norm": 0.2124968043497098, + "learning_rate": 4.9918766180197885e-05, + "loss": 0.9005, + "step": 21673 + }, + { + "epoch": 4.623293515358362, + "grad_norm": 0.16841274376065382, + "learning_rate": 4.991088108767647e-05, + "loss": 0.8948, + "step": 21674 + }, + { + "epoch": 4.623506825938566, + "grad_norm": 0.20297132536271315, + "learning_rate": 4.990299633561393e-05, + "loss": 0.91, + "step": 21675 + }, + { + "epoch": 4.623720136518771, + "grad_norm": 0.17920320756734923, + "learning_rate": 4.9895111924099514e-05, + "loss": 0.8831, + "step": 21676 + }, + { + "epoch": 4.623933447098976, + "grad_norm": 0.16971834007281478, + "learning_rate": 4.9887227853222386e-05, + "loss": 0.8941, + "step": 21677 + }, + { + "epoch": 4.624146757679181, + "grad_norm": 0.16668157116166343, + "learning_rate": 4.98793441230718e-05, + "loss": 0.8846, + "step": 21678 + }, + { + "epoch": 4.624360068259386, + "grad_norm": 0.1680215093393306, + "learning_rate": 4.987146073373693e-05, + "loss": 0.9111, + "step": 21679 + }, + { + "epoch": 4.624573378839591, + "grad_norm": 0.1741393245262516, + "learning_rate": 4.986357768530703e-05, + "loss": 0.8943, + "step": 21680 + }, + { + "epoch": 4.624786689419795, + "grad_norm": 0.17113736990463346, + "learning_rate": 4.985569497787122e-05, + "loss": 0.8988, + "step": 21681 + }, + { + "epoch": 4.625, + "grad_norm": 0.15021258588321498, + "learning_rate": 4.984781261151872e-05, + "loss": 0.8989, + "step": 21682 + }, + { + "epoch": 4.625213310580205, + "grad_norm": 0.19102705618838373, + "learning_rate": 4.983993058633875e-05, + "loss": 0.9246, + "step": 21683 + }, + { + "epoch": 4.625426621160409, + "grad_norm": 0.16592720670562583, + "learning_rate": 4.983204890242045e-05, + "loss": 0.9026, + "step": 21684 + }, + { + "epoch": 4.625639931740614, + "grad_norm": 0.18193660230529465, + "learning_rate": 4.982416755985304e-05, + "loss": 0.8933, + "step": 21685 + }, + { + "epoch": 4.625853242320819, + "grad_norm": 0.1694484402086342, + "learning_rate": 4.981628655872568e-05, + "loss": 0.8871, + "step": 21686 + }, + { + "epoch": 4.626066552901024, + "grad_norm": 0.17190206859567553, + "learning_rate": 4.980840589912752e-05, + "loss": 0.8727, + "step": 21687 + }, + { + "epoch": 4.626279863481229, + "grad_norm": 0.15949140022015454, + "learning_rate": 4.9800525581147794e-05, + "loss": 0.912, + "step": 21688 + }, + { + "epoch": 4.626493174061434, + "grad_norm": 0.16097780228443198, + "learning_rate": 4.97926456048756e-05, + "loss": 0.9017, + "step": 21689 + }, + { + "epoch": 4.626706484641638, + "grad_norm": 0.18656670620426785, + "learning_rate": 4.978476597040015e-05, + "loss": 0.8908, + "step": 21690 + }, + { + "epoch": 4.626919795221843, + "grad_norm": 0.14976059930272964, + "learning_rate": 4.977688667781059e-05, + "loss": 0.8905, + "step": 21691 + }, + { + "epoch": 4.627133105802048, + "grad_norm": 0.16696278046995983, + "learning_rate": 4.976900772719604e-05, + "loss": 0.9134, + "step": 21692 + }, + { + "epoch": 4.627346416382252, + "grad_norm": 0.21373304297368842, + "learning_rate": 4.976112911864569e-05, + "loss": 0.8975, + "step": 21693 + }, + { + "epoch": 4.627559726962457, + "grad_norm": 0.16160458728903754, + "learning_rate": 4.975325085224866e-05, + "loss": 0.9047, + "step": 21694 + }, + { + "epoch": 4.627773037542662, + "grad_norm": 0.19378019867198876, + "learning_rate": 4.9745372928094105e-05, + "loss": 0.9048, + "step": 21695 + }, + { + "epoch": 4.627986348122867, + "grad_norm": 0.21388582545638893, + "learning_rate": 4.973749534627116e-05, + "loss": 0.9073, + "step": 21696 + }, + { + "epoch": 4.628199658703072, + "grad_norm": 0.16572360204881104, + "learning_rate": 4.972961810686899e-05, + "loss": 0.907, + "step": 21697 + }, + { + "epoch": 4.628412969283277, + "grad_norm": 0.19998343674316219, + "learning_rate": 4.972174120997669e-05, + "loss": 0.9287, + "step": 21698 + }, + { + "epoch": 4.628626279863481, + "grad_norm": 0.1654679631450012, + "learning_rate": 4.9713864655683387e-05, + "loss": 0.8859, + "step": 21699 + }, + { + "epoch": 4.628839590443686, + "grad_norm": 0.17756157143893356, + "learning_rate": 4.970598844407822e-05, + "loss": 0.8957, + "step": 21700 + }, + { + "epoch": 4.629052901023891, + "grad_norm": 0.13884681268423338, + "learning_rate": 4.9698112575250294e-05, + "loss": 0.914, + "step": 21701 + }, + { + "epoch": 4.6292662116040955, + "grad_norm": 0.16607538946872166, + "learning_rate": 4.969023704928877e-05, + "loss": 0.8751, + "step": 21702 + }, + { + "epoch": 4.6294795221843, + "grad_norm": 0.15318850048187296, + "learning_rate": 4.9682361866282696e-05, + "loss": 0.9123, + "step": 21703 + }, + { + "epoch": 4.629692832764505, + "grad_norm": 0.15128827191592062, + "learning_rate": 4.967448702632121e-05, + "loss": 0.8961, + "step": 21704 + }, + { + "epoch": 4.6299061433447095, + "grad_norm": 0.19383874260988257, + "learning_rate": 4.966661252949341e-05, + "loss": 0.9122, + "step": 21705 + }, + { + "epoch": 4.630119453924914, + "grad_norm": 0.20819583095043095, + "learning_rate": 4.96587383758884e-05, + "loss": 0.8945, + "step": 21706 + }, + { + "epoch": 4.63033276450512, + "grad_norm": 0.15584401956463526, + "learning_rate": 4.965086456559528e-05, + "loss": 0.8947, + "step": 21707 + }, + { + "epoch": 4.6305460750853245, + "grad_norm": 0.1534820213296568, + "learning_rate": 4.964299109870313e-05, + "loss": 0.8911, + "step": 21708 + }, + { + "epoch": 4.630759385665529, + "grad_norm": 0.15756619281765105, + "learning_rate": 4.963511797530105e-05, + "loss": 0.8883, + "step": 21709 + }, + { + "epoch": 4.630972696245734, + "grad_norm": 0.14075491132473478, + "learning_rate": 4.962724519547813e-05, + "loss": 0.9124, + "step": 21710 + }, + { + "epoch": 4.6311860068259385, + "grad_norm": 0.17987557897833656, + "learning_rate": 4.961937275932342e-05, + "loss": 0.9032, + "step": 21711 + }, + { + "epoch": 4.631399317406143, + "grad_norm": 0.16703355030876496, + "learning_rate": 4.961150066692604e-05, + "loss": 0.895, + "step": 21712 + }, + { + "epoch": 4.631612627986348, + "grad_norm": 0.14576221738068235, + "learning_rate": 4.960362891837505e-05, + "loss": 0.8808, + "step": 21713 + }, + { + "epoch": 4.631825938566553, + "grad_norm": 0.13591249821195356, + "learning_rate": 4.959575751375949e-05, + "loss": 0.893, + "step": 21714 + }, + { + "epoch": 4.632039249146757, + "grad_norm": 0.14729455753323728, + "learning_rate": 4.9587886453168456e-05, + "loss": 0.8994, + "step": 21715 + }, + { + "epoch": 4.632252559726963, + "grad_norm": 0.13737580574792077, + "learning_rate": 4.958001573669098e-05, + "loss": 0.8819, + "step": 21716 + }, + { + "epoch": 4.6324658703071675, + "grad_norm": 0.1364975159568764, + "learning_rate": 4.9572145364416155e-05, + "loss": 0.8923, + "step": 21717 + }, + { + "epoch": 4.632679180887372, + "grad_norm": 0.1365306751888294, + "learning_rate": 4.9564275336433004e-05, + "loss": 0.9009, + "step": 21718 + }, + { + "epoch": 4.632892491467577, + "grad_norm": 0.15761447958517188, + "learning_rate": 4.95564056528306e-05, + "loss": 0.8617, + "step": 21719 + }, + { + "epoch": 4.6331058020477816, + "grad_norm": 0.1316867524088532, + "learning_rate": 4.954853631369796e-05, + "loss": 0.87, + "step": 21720 + }, + { + "epoch": 4.633319112627986, + "grad_norm": 0.15907257441802963, + "learning_rate": 4.9540667319124164e-05, + "loss": 0.9016, + "step": 21721 + }, + { + "epoch": 4.633532423208191, + "grad_norm": 0.15392285935771932, + "learning_rate": 4.953279866919823e-05, + "loss": 0.8929, + "step": 21722 + }, + { + "epoch": 4.633745733788396, + "grad_norm": 0.15440935816214804, + "learning_rate": 4.952493036400917e-05, + "loss": 0.8866, + "step": 21723 + }, + { + "epoch": 4.6339590443686, + "grad_norm": 0.16887491008373076, + "learning_rate": 4.951706240364608e-05, + "loss": 0.9274, + "step": 21724 + }, + { + "epoch": 4.634172354948806, + "grad_norm": 0.1587012811693411, + "learning_rate": 4.9509194788197924e-05, + "loss": 0.9091, + "step": 21725 + }, + { + "epoch": 4.6343856655290105, + "grad_norm": 0.1557813435109071, + "learning_rate": 4.9501327517753714e-05, + "loss": 0.9153, + "step": 21726 + }, + { + "epoch": 4.634598976109215, + "grad_norm": 0.15829335371070516, + "learning_rate": 4.949346059240252e-05, + "loss": 0.9055, + "step": 21727 + }, + { + "epoch": 4.63481228668942, + "grad_norm": 0.14463061429164853, + "learning_rate": 4.948559401223331e-05, + "loss": 0.8854, + "step": 21728 + }, + { + "epoch": 4.635025597269625, + "grad_norm": 0.16913367307410254, + "learning_rate": 4.947772777733514e-05, + "loss": 0.9219, + "step": 21729 + }, + { + "epoch": 4.635238907849829, + "grad_norm": 0.190540628670273, + "learning_rate": 4.946986188779697e-05, + "loss": 0.9197, + "step": 21730 + }, + { + "epoch": 4.635452218430034, + "grad_norm": 0.17072765703273302, + "learning_rate": 4.9461996343707854e-05, + "loss": 0.9, + "step": 21731 + }, + { + "epoch": 4.635665529010239, + "grad_norm": 0.1664562273648936, + "learning_rate": 4.9454131145156744e-05, + "loss": 0.8826, + "step": 21732 + }, + { + "epoch": 4.635878839590443, + "grad_norm": 0.1950286784836744, + "learning_rate": 4.944626629223266e-05, + "loss": 0.8954, + "step": 21733 + }, + { + "epoch": 4.636092150170649, + "grad_norm": 0.15719708836398266, + "learning_rate": 4.943840178502459e-05, + "loss": 0.9039, + "step": 21734 + }, + { + "epoch": 4.636305460750854, + "grad_norm": 0.1637332117642876, + "learning_rate": 4.943053762362152e-05, + "loss": 0.9031, + "step": 21735 + }, + { + "epoch": 4.636518771331058, + "grad_norm": 0.14756098627236777, + "learning_rate": 4.942267380811244e-05, + "loss": 0.8744, + "step": 21736 + }, + { + "epoch": 4.636732081911263, + "grad_norm": 0.19622759997263287, + "learning_rate": 4.941481033858632e-05, + "loss": 0.9436, + "step": 21737 + }, + { + "epoch": 4.636945392491468, + "grad_norm": 0.15137831335323954, + "learning_rate": 4.94069472151321e-05, + "loss": 0.8879, + "step": 21738 + }, + { + "epoch": 4.637158703071672, + "grad_norm": 0.19344583408874316, + "learning_rate": 4.939908443783882e-05, + "loss": 0.9068, + "step": 21739 + }, + { + "epoch": 4.637372013651877, + "grad_norm": 0.14621150720323084, + "learning_rate": 4.939122200679538e-05, + "loss": 0.9053, + "step": 21740 + }, + { + "epoch": 4.637585324232082, + "grad_norm": 0.17339402354950276, + "learning_rate": 4.938335992209082e-05, + "loss": 0.9237, + "step": 21741 + }, + { + "epoch": 4.637798634812286, + "grad_norm": 0.15416667706212575, + "learning_rate": 4.937549818381403e-05, + "loss": 0.8999, + "step": 21742 + }, + { + "epoch": 4.638011945392492, + "grad_norm": 0.18823568344848743, + "learning_rate": 4.936763679205401e-05, + "loss": 0.8955, + "step": 21743 + }, + { + "epoch": 4.638225255972696, + "grad_norm": 0.16721917876627498, + "learning_rate": 4.9359775746899666e-05, + "loss": 0.8936, + "step": 21744 + }, + { + "epoch": 4.638438566552901, + "grad_norm": 0.21129381269323763, + "learning_rate": 4.9351915048440004e-05, + "loss": 0.9163, + "step": 21745 + }, + { + "epoch": 4.638651877133106, + "grad_norm": 0.1921028080815802, + "learning_rate": 4.9344054696763944e-05, + "loss": 0.9114, + "step": 21746 + }, + { + "epoch": 4.638865187713311, + "grad_norm": 0.1689002651957936, + "learning_rate": 4.9336194691960426e-05, + "loss": 0.8825, + "step": 21747 + }, + { + "epoch": 4.639078498293515, + "grad_norm": 0.1694200923435184, + "learning_rate": 4.932833503411834e-05, + "loss": 0.945, + "step": 21748 + }, + { + "epoch": 4.63929180887372, + "grad_norm": 0.15573698148429824, + "learning_rate": 4.9320475723326694e-05, + "loss": 0.9124, + "step": 21749 + }, + { + "epoch": 4.639505119453925, + "grad_norm": 0.16776953502471315, + "learning_rate": 4.9312616759674357e-05, + "loss": 0.8969, + "step": 21750 + }, + { + "epoch": 4.639718430034129, + "grad_norm": 0.14268989400323442, + "learning_rate": 4.930475814325029e-05, + "loss": 0.9091, + "step": 21751 + }, + { + "epoch": 4.639931740614334, + "grad_norm": 0.17819759911057398, + "learning_rate": 4.929689987414339e-05, + "loss": 0.8903, + "step": 21752 + }, + { + "epoch": 4.640145051194539, + "grad_norm": 0.15636405016752025, + "learning_rate": 4.928904195244259e-05, + "loss": 0.9156, + "step": 21753 + }, + { + "epoch": 4.640358361774744, + "grad_norm": 0.1794771046375531, + "learning_rate": 4.92811843782368e-05, + "loss": 0.9104, + "step": 21754 + }, + { + "epoch": 4.640571672354949, + "grad_norm": 0.14888675129607415, + "learning_rate": 4.927332715161493e-05, + "loss": 0.8945, + "step": 21755 + }, + { + "epoch": 4.640784982935154, + "grad_norm": 0.1811509680810581, + "learning_rate": 4.926547027266587e-05, + "loss": 0.8828, + "step": 21756 + }, + { + "epoch": 4.640998293515358, + "grad_norm": 0.15992920832273433, + "learning_rate": 4.9257613741478544e-05, + "loss": 0.8797, + "step": 21757 + }, + { + "epoch": 4.641211604095563, + "grad_norm": 0.18542270683884518, + "learning_rate": 4.924975755814185e-05, + "loss": 0.9213, + "step": 21758 + }, + { + "epoch": 4.641424914675768, + "grad_norm": 0.16108986370367775, + "learning_rate": 4.924190172274465e-05, + "loss": 0.8894, + "step": 21759 + }, + { + "epoch": 4.6416382252559725, + "grad_norm": 0.17455937736022295, + "learning_rate": 4.9234046235375835e-05, + "loss": 0.8987, + "step": 21760 + }, + { + "epoch": 4.641851535836177, + "grad_norm": 0.14333232873386884, + "learning_rate": 4.922619109612432e-05, + "loss": 0.8688, + "step": 21761 + }, + { + "epoch": 4.642064846416382, + "grad_norm": 0.20112349848133293, + "learning_rate": 4.921833630507895e-05, + "loss": 0.9353, + "step": 21762 + }, + { + "epoch": 4.642278156996587, + "grad_norm": 0.14153282287409727, + "learning_rate": 4.921048186232865e-05, + "loss": 0.8718, + "step": 21763 + }, + { + "epoch": 4.642491467576792, + "grad_norm": 0.20157955186886217, + "learning_rate": 4.9202627767962235e-05, + "loss": 0.8643, + "step": 21764 + }, + { + "epoch": 4.642704778156997, + "grad_norm": 0.13740944194136906, + "learning_rate": 4.9194774022068624e-05, + "loss": 0.8832, + "step": 21765 + }, + { + "epoch": 4.6429180887372015, + "grad_norm": 0.17829409292763823, + "learning_rate": 4.918692062473666e-05, + "loss": 0.902, + "step": 21766 + }, + { + "epoch": 4.643131399317406, + "grad_norm": 0.1783587060620696, + "learning_rate": 4.917906757605521e-05, + "loss": 0.9232, + "step": 21767 + }, + { + "epoch": 4.643344709897611, + "grad_norm": 0.1629815315754447, + "learning_rate": 4.917121487611313e-05, + "loss": 0.9181, + "step": 21768 + }, + { + "epoch": 4.6435580204778155, + "grad_norm": 0.1886626679968537, + "learning_rate": 4.91633625249993e-05, + "loss": 0.9153, + "step": 21769 + }, + { + "epoch": 4.64377133105802, + "grad_norm": 0.1450918912428082, + "learning_rate": 4.91555105228025e-05, + "loss": 0.8872, + "step": 21770 + }, + { + "epoch": 4.643984641638225, + "grad_norm": 0.16816934317358315, + "learning_rate": 4.9147658869611636e-05, + "loss": 0.8825, + "step": 21771 + }, + { + "epoch": 4.6441979522184305, + "grad_norm": 0.16169001876247124, + "learning_rate": 4.913980756551551e-05, + "loss": 0.9012, + "step": 21772 + }, + { + "epoch": 4.644411262798635, + "grad_norm": 0.19308342310107807, + "learning_rate": 4.9131956610603e-05, + "loss": 0.8827, + "step": 21773 + }, + { + "epoch": 4.64462457337884, + "grad_norm": 0.16027824405340638, + "learning_rate": 4.91241060049629e-05, + "loss": 0.9388, + "step": 21774 + }, + { + "epoch": 4.6448378839590445, + "grad_norm": 0.1490974845464314, + "learning_rate": 4.911625574868408e-05, + "loss": 0.8994, + "step": 21775 + }, + { + "epoch": 4.645051194539249, + "grad_norm": 0.16939025726577114, + "learning_rate": 4.9108405841855326e-05, + "loss": 0.9191, + "step": 21776 + }, + { + "epoch": 4.645264505119454, + "grad_norm": 0.16138415678725856, + "learning_rate": 4.9100556284565504e-05, + "loss": 0.8583, + "step": 21777 + }, + { + "epoch": 4.645477815699659, + "grad_norm": 0.15663429405055354, + "learning_rate": 4.909270707690338e-05, + "loss": 0.9112, + "step": 21778 + }, + { + "epoch": 4.645691126279863, + "grad_norm": 0.16880358746095622, + "learning_rate": 4.908485821895783e-05, + "loss": 0.9287, + "step": 21779 + }, + { + "epoch": 4.645904436860068, + "grad_norm": 0.13845574863964585, + "learning_rate": 4.907700971081763e-05, + "loss": 0.8891, + "step": 21780 + }, + { + "epoch": 4.6461177474402735, + "grad_norm": 0.16178512040547882, + "learning_rate": 4.9069161552571566e-05, + "loss": 0.9027, + "step": 21781 + }, + { + "epoch": 4.646331058020478, + "grad_norm": 0.15067233144146713, + "learning_rate": 4.906131374430845e-05, + "loss": 0.8645, + "step": 21782 + }, + { + "epoch": 4.646544368600683, + "grad_norm": 0.1588380365281352, + "learning_rate": 4.9053466286117105e-05, + "loss": 0.8923, + "step": 21783 + }, + { + "epoch": 4.646757679180888, + "grad_norm": 0.14312350908032312, + "learning_rate": 4.9045619178086294e-05, + "loss": 0.894, + "step": 21784 + }, + { + "epoch": 4.646970989761092, + "grad_norm": 0.156911474621075, + "learning_rate": 4.903777242030484e-05, + "loss": 0.8961, + "step": 21785 + }, + { + "epoch": 4.647184300341297, + "grad_norm": 0.15232437480731498, + "learning_rate": 4.90299260128615e-05, + "loss": 0.8892, + "step": 21786 + }, + { + "epoch": 4.647397610921502, + "grad_norm": 0.15222596769639102, + "learning_rate": 4.9022079955845065e-05, + "loss": 0.9079, + "step": 21787 + }, + { + "epoch": 4.647610921501706, + "grad_norm": 0.15386641245870517, + "learning_rate": 4.901423424934432e-05, + "loss": 0.9057, + "step": 21788 + }, + { + "epoch": 4.647824232081911, + "grad_norm": 0.14755318595650674, + "learning_rate": 4.900638889344804e-05, + "loss": 0.894, + "step": 21789 + }, + { + "epoch": 4.648037542662116, + "grad_norm": 0.16062076481386142, + "learning_rate": 4.8998543888244976e-05, + "loss": 0.8748, + "step": 21790 + }, + { + "epoch": 4.64825085324232, + "grad_norm": 0.13594359579281506, + "learning_rate": 4.899069923382394e-05, + "loss": 0.9034, + "step": 21791 + }, + { + "epoch": 4.648464163822526, + "grad_norm": 0.13731100522256953, + "learning_rate": 4.898285493027366e-05, + "loss": 0.8731, + "step": 21792 + }, + { + "epoch": 4.648677474402731, + "grad_norm": 0.1539748489693993, + "learning_rate": 4.89750109776829e-05, + "loss": 0.8909, + "step": 21793 + }, + { + "epoch": 4.648890784982935, + "grad_norm": 0.14745472996210954, + "learning_rate": 4.896716737614038e-05, + "loss": 0.9132, + "step": 21794 + }, + { + "epoch": 4.64910409556314, + "grad_norm": 0.17021671093474477, + "learning_rate": 4.895932412573491e-05, + "loss": 0.8916, + "step": 21795 + }, + { + "epoch": 4.649317406143345, + "grad_norm": 0.12682368684368742, + "learning_rate": 4.8951481226555186e-05, + "loss": 0.8688, + "step": 21796 + }, + { + "epoch": 4.649530716723549, + "grad_norm": 0.17325165126362735, + "learning_rate": 4.894363867869e-05, + "loss": 0.9177, + "step": 21797 + }, + { + "epoch": 4.649744027303754, + "grad_norm": 0.13210848925750837, + "learning_rate": 4.893579648222804e-05, + "loss": 0.9267, + "step": 21798 + }, + { + "epoch": 4.649957337883959, + "grad_norm": 0.1653382716389978, + "learning_rate": 4.8927954637258086e-05, + "loss": 0.9045, + "step": 21799 + }, + { + "epoch": 4.650170648464163, + "grad_norm": 0.16362462683191975, + "learning_rate": 4.892011314386883e-05, + "loss": 0.8906, + "step": 21800 + }, + { + "epoch": 4.650383959044369, + "grad_norm": 0.15732131490679505, + "learning_rate": 4.891227200214904e-05, + "loss": 0.8903, + "step": 21801 + }, + { + "epoch": 4.650597269624574, + "grad_norm": 0.14877123727152852, + "learning_rate": 4.890443121218739e-05, + "loss": 0.9004, + "step": 21802 + }, + { + "epoch": 4.650810580204778, + "grad_norm": 0.14657050715673153, + "learning_rate": 4.8896590774072675e-05, + "loss": 0.9179, + "step": 21803 + }, + { + "epoch": 4.651023890784983, + "grad_norm": 0.16490935386322586, + "learning_rate": 4.8888750687893503e-05, + "loss": 0.895, + "step": 21804 + }, + { + "epoch": 4.651237201365188, + "grad_norm": 0.1505942909012278, + "learning_rate": 4.888091095373867e-05, + "loss": 0.9089, + "step": 21805 + }, + { + "epoch": 4.651450511945392, + "grad_norm": 0.1516500847387714, + "learning_rate": 4.887307157169683e-05, + "loss": 0.9027, + "step": 21806 + }, + { + "epoch": 4.651663822525597, + "grad_norm": 0.1537156896567164, + "learning_rate": 4.886523254185672e-05, + "loss": 0.9035, + "step": 21807 + }, + { + "epoch": 4.651877133105802, + "grad_norm": 0.1426876771824842, + "learning_rate": 4.8857393864307024e-05, + "loss": 0.8886, + "step": 21808 + }, + { + "epoch": 4.652090443686006, + "grad_norm": 0.1474318480916076, + "learning_rate": 4.884955553913644e-05, + "loss": 0.9312, + "step": 21809 + }, + { + "epoch": 4.652303754266212, + "grad_norm": 0.15506409787263312, + "learning_rate": 4.8841717566433656e-05, + "loss": 0.9021, + "step": 21810 + }, + { + "epoch": 4.652517064846417, + "grad_norm": 0.17672486299846407, + "learning_rate": 4.8833879946287376e-05, + "loss": 0.8831, + "step": 21811 + }, + { + "epoch": 4.652730375426621, + "grad_norm": 0.16124271740401913, + "learning_rate": 4.882604267878625e-05, + "loss": 0.9078, + "step": 21812 + }, + { + "epoch": 4.652943686006826, + "grad_norm": 0.18169773032470848, + "learning_rate": 4.881820576401898e-05, + "loss": 0.8881, + "step": 21813 + }, + { + "epoch": 4.653156996587031, + "grad_norm": 0.1654545091793923, + "learning_rate": 4.881036920207427e-05, + "loss": 0.906, + "step": 21814 + }, + { + "epoch": 4.653370307167235, + "grad_norm": 0.18681339935966568, + "learning_rate": 4.880253299304074e-05, + "loss": 0.8936, + "step": 21815 + }, + { + "epoch": 4.65358361774744, + "grad_norm": 0.17720509389217073, + "learning_rate": 4.8794697137007045e-05, + "loss": 0.8739, + "step": 21816 + }, + { + "epoch": 4.653796928327645, + "grad_norm": 0.20366935751033136, + "learning_rate": 4.878686163406191e-05, + "loss": 0.9238, + "step": 21817 + }, + { + "epoch": 4.6540102389078495, + "grad_norm": 0.1673769266358239, + "learning_rate": 4.877902648429393e-05, + "loss": 0.897, + "step": 21818 + }, + { + "epoch": 4.654223549488055, + "grad_norm": 0.20694266678474116, + "learning_rate": 4.877119168779182e-05, + "loss": 0.9193, + "step": 21819 + }, + { + "epoch": 4.65443686006826, + "grad_norm": 0.14906278652787414, + "learning_rate": 4.8763357244644166e-05, + "loss": 0.8759, + "step": 21820 + }, + { + "epoch": 4.654650170648464, + "grad_norm": 0.2459730758629174, + "learning_rate": 4.875552315493968e-05, + "loss": 0.8963, + "step": 21821 + }, + { + "epoch": 4.654863481228669, + "grad_norm": 0.16486117597951294, + "learning_rate": 4.8747689418766954e-05, + "loss": 0.8863, + "step": 21822 + }, + { + "epoch": 4.655076791808874, + "grad_norm": 0.19122253901923808, + "learning_rate": 4.8739856036214674e-05, + "loss": 0.9197, + "step": 21823 + }, + { + "epoch": 4.6552901023890785, + "grad_norm": 0.16198073397138057, + "learning_rate": 4.8732023007371426e-05, + "loss": 0.8987, + "step": 21824 + }, + { + "epoch": 4.655503412969283, + "grad_norm": 0.19609466886833213, + "learning_rate": 4.872419033232589e-05, + "loss": 0.9041, + "step": 21825 + }, + { + "epoch": 4.655716723549488, + "grad_norm": 0.20361756755733793, + "learning_rate": 4.8716358011166664e-05, + "loss": 0.8959, + "step": 21826 + }, + { + "epoch": 4.6559300341296925, + "grad_norm": 0.16373645280887386, + "learning_rate": 4.870852604398236e-05, + "loss": 0.8979, + "step": 21827 + }, + { + "epoch": 4.656143344709898, + "grad_norm": 0.21871687299151846, + "learning_rate": 4.8700694430861594e-05, + "loss": 0.9118, + "step": 21828 + }, + { + "epoch": 4.656356655290102, + "grad_norm": 0.21350860765191207, + "learning_rate": 4.869286317189302e-05, + "loss": 0.9082, + "step": 21829 + }, + { + "epoch": 4.6565699658703075, + "grad_norm": 0.22242331146641423, + "learning_rate": 4.86850322671652e-05, + "loss": 0.893, + "step": 21830 + }, + { + "epoch": 4.656783276450512, + "grad_norm": 0.22025496860855037, + "learning_rate": 4.8677201716766786e-05, + "loss": 0.8987, + "step": 21831 + }, + { + "epoch": 4.656996587030717, + "grad_norm": 0.19348058912773572, + "learning_rate": 4.866937152078636e-05, + "loss": 0.9089, + "step": 21832 + }, + { + "epoch": 4.6572098976109215, + "grad_norm": 0.1997164845299252, + "learning_rate": 4.866154167931253e-05, + "loss": 0.9056, + "step": 21833 + }, + { + "epoch": 4.657423208191126, + "grad_norm": 0.19948042204233038, + "learning_rate": 4.865371219243386e-05, + "loss": 0.8961, + "step": 21834 + }, + { + "epoch": 4.657636518771331, + "grad_norm": 0.23823302953200878, + "learning_rate": 4.8645883060238986e-05, + "loss": 0.9241, + "step": 21835 + }, + { + "epoch": 4.657849829351536, + "grad_norm": 0.18642305333791023, + "learning_rate": 4.863805428281648e-05, + "loss": 0.9023, + "step": 21836 + }, + { + "epoch": 4.65806313993174, + "grad_norm": 0.18495412462123212, + "learning_rate": 4.86302258602549e-05, + "loss": 0.8804, + "step": 21837 + }, + { + "epoch": 4.658276450511945, + "grad_norm": 0.19995056180382562, + "learning_rate": 4.862239779264284e-05, + "loss": 0.9081, + "step": 21838 + }, + { + "epoch": 4.6584897610921505, + "grad_norm": 0.1718583501611271, + "learning_rate": 4.86145700800689e-05, + "loss": 0.9067, + "step": 21839 + }, + { + "epoch": 4.658703071672355, + "grad_norm": 0.1729441452735198, + "learning_rate": 4.860674272262159e-05, + "loss": 0.8882, + "step": 21840 + }, + { + "epoch": 4.65891638225256, + "grad_norm": 0.18669326386504495, + "learning_rate": 4.859891572038955e-05, + "loss": 0.8773, + "step": 21841 + }, + { + "epoch": 4.659129692832765, + "grad_norm": 0.15761568271861895, + "learning_rate": 4.859108907346126e-05, + "loss": 0.9092, + "step": 21842 + }, + { + "epoch": 4.659343003412969, + "grad_norm": 0.16002089579353895, + "learning_rate": 4.8583262781925376e-05, + "loss": 0.8974, + "step": 21843 + }, + { + "epoch": 4.659556313993174, + "grad_norm": 0.17436204060206728, + "learning_rate": 4.857543684587037e-05, + "loss": 0.8978, + "step": 21844 + }, + { + "epoch": 4.659769624573379, + "grad_norm": 0.18220978720706169, + "learning_rate": 4.856761126538484e-05, + "loss": 0.8923, + "step": 21845 + }, + { + "epoch": 4.659982935153583, + "grad_norm": 0.1352866195772156, + "learning_rate": 4.8559786040557303e-05, + "loss": 0.9025, + "step": 21846 + }, + { + "epoch": 4.660196245733788, + "grad_norm": 0.22197991700695427, + "learning_rate": 4.855196117147634e-05, + "loss": 0.8966, + "step": 21847 + }, + { + "epoch": 4.660409556313994, + "grad_norm": 0.23123556123427524, + "learning_rate": 4.854413665823046e-05, + "loss": 0.9239, + "step": 21848 + }, + { + "epoch": 4.660622866894198, + "grad_norm": 0.18088350885772145, + "learning_rate": 4.8536312500908184e-05, + "loss": 0.8926, + "step": 21849 + }, + { + "epoch": 4.660836177474403, + "grad_norm": 0.18824462403511955, + "learning_rate": 4.8528488699598076e-05, + "loss": 0.9089, + "step": 21850 + }, + { + "epoch": 4.661049488054608, + "grad_norm": 0.2235610673474824, + "learning_rate": 4.852066525438865e-05, + "loss": 0.8849, + "step": 21851 + }, + { + "epoch": 4.661262798634812, + "grad_norm": 0.155586508886769, + "learning_rate": 4.851284216536841e-05, + "loss": 0.8809, + "step": 21852 + }, + { + "epoch": 4.661476109215017, + "grad_norm": 0.19489903528519034, + "learning_rate": 4.850501943262591e-05, + "loss": 0.8792, + "step": 21853 + }, + { + "epoch": 4.661689419795222, + "grad_norm": 0.2093688976464701, + "learning_rate": 4.849719705624962e-05, + "loss": 0.907, + "step": 21854 + }, + { + "epoch": 4.661902730375426, + "grad_norm": 0.22094987291056692, + "learning_rate": 4.84893750363281e-05, + "loss": 0.9006, + "step": 21855 + }, + { + "epoch": 4.662116040955631, + "grad_norm": 0.20287489283314872, + "learning_rate": 4.848155337294981e-05, + "loss": 0.9027, + "step": 21856 + }, + { + "epoch": 4.662329351535837, + "grad_norm": 0.19642685947614966, + "learning_rate": 4.847373206620329e-05, + "loss": 0.9214, + "step": 21857 + }, + { + "epoch": 4.662542662116041, + "grad_norm": 0.23574012104658373, + "learning_rate": 4.8465911116177e-05, + "loss": 0.8871, + "step": 21858 + }, + { + "epoch": 4.662755972696246, + "grad_norm": 0.1448144072008103, + "learning_rate": 4.84580905229595e-05, + "loss": 0.9033, + "step": 21859 + }, + { + "epoch": 4.662969283276451, + "grad_norm": 0.20002666815439663, + "learning_rate": 4.845027028663921e-05, + "loss": 0.9068, + "step": 21860 + }, + { + "epoch": 4.663182593856655, + "grad_norm": 0.15624245167485182, + "learning_rate": 4.8442450407304655e-05, + "loss": 0.9084, + "step": 21861 + }, + { + "epoch": 4.66339590443686, + "grad_norm": 0.1985549494962351, + "learning_rate": 4.8434630885044283e-05, + "loss": 0.8931, + "step": 21862 + }, + { + "epoch": 4.663609215017065, + "grad_norm": 0.1458617413219945, + "learning_rate": 4.84268117199466e-05, + "loss": 0.8894, + "step": 21863 + }, + { + "epoch": 4.663822525597269, + "grad_norm": 0.18580631294349456, + "learning_rate": 4.841899291210008e-05, + "loss": 0.9111, + "step": 21864 + }, + { + "epoch": 4.664035836177474, + "grad_norm": 0.19118823392270856, + "learning_rate": 4.841117446159318e-05, + "loss": 0.9017, + "step": 21865 + }, + { + "epoch": 4.66424914675768, + "grad_norm": 0.1488138467417055, + "learning_rate": 4.840335636851435e-05, + "loss": 0.9097, + "step": 21866 + }, + { + "epoch": 4.664462457337884, + "grad_norm": 0.2061136488772595, + "learning_rate": 4.839553863295211e-05, + "loss": 0.8642, + "step": 21867 + }, + { + "epoch": 4.664675767918089, + "grad_norm": 0.1572941593971697, + "learning_rate": 4.8387721254994866e-05, + "loss": 0.9167, + "step": 21868 + }, + { + "epoch": 4.664889078498294, + "grad_norm": 0.17022468020711992, + "learning_rate": 4.837990423473109e-05, + "loss": 0.8976, + "step": 21869 + }, + { + "epoch": 4.665102389078498, + "grad_norm": 0.16195121081084607, + "learning_rate": 4.8372087572249254e-05, + "loss": 0.8997, + "step": 21870 + }, + { + "epoch": 4.665315699658703, + "grad_norm": 0.1596394491504641, + "learning_rate": 4.8364271267637754e-05, + "loss": 0.9308, + "step": 21871 + }, + { + "epoch": 4.665529010238908, + "grad_norm": 0.1575732955418912, + "learning_rate": 4.8356455320985054e-05, + "loss": 0.8958, + "step": 21872 + }, + { + "epoch": 4.665742320819112, + "grad_norm": 0.15422856967453583, + "learning_rate": 4.834863973237961e-05, + "loss": 0.8848, + "step": 21873 + }, + { + "epoch": 4.665955631399317, + "grad_norm": 0.17727761546058005, + "learning_rate": 4.834082450190982e-05, + "loss": 0.9126, + "step": 21874 + }, + { + "epoch": 4.666168941979522, + "grad_norm": 0.14130763524521983, + "learning_rate": 4.833300962966414e-05, + "loss": 0.882, + "step": 21875 + }, + { + "epoch": 4.6663822525597265, + "grad_norm": 0.1753912794434019, + "learning_rate": 4.832519511573099e-05, + "loss": 0.9002, + "step": 21876 + }, + { + "epoch": 4.666595563139932, + "grad_norm": 0.16463958343615048, + "learning_rate": 4.83173809601988e-05, + "loss": 0.9026, + "step": 21877 + }, + { + "epoch": 4.666808873720137, + "grad_norm": 0.16169346281737043, + "learning_rate": 4.830956716315595e-05, + "loss": 0.9163, + "step": 21878 + }, + { + "epoch": 4.667022184300341, + "grad_norm": 0.14261655007502586, + "learning_rate": 4.8301753724690914e-05, + "loss": 0.9058, + "step": 21879 + }, + { + "epoch": 4.667235494880546, + "grad_norm": 0.15137136494423356, + "learning_rate": 4.8293940644892036e-05, + "loss": 0.9088, + "step": 21880 + }, + { + "epoch": 4.667448805460751, + "grad_norm": 0.13944043260523178, + "learning_rate": 4.828612792384779e-05, + "loss": 0.8975, + "step": 21881 + }, + { + "epoch": 4.6676621160409555, + "grad_norm": 0.16381741811074593, + "learning_rate": 4.827831556164652e-05, + "loss": 0.8901, + "step": 21882 + }, + { + "epoch": 4.66787542662116, + "grad_norm": 0.17553795065540953, + "learning_rate": 4.8270503558376634e-05, + "loss": 0.8939, + "step": 21883 + }, + { + "epoch": 4.668088737201365, + "grad_norm": 0.13914618752796004, + "learning_rate": 4.826269191412655e-05, + "loss": 0.8863, + "step": 21884 + }, + { + "epoch": 4.6683020477815695, + "grad_norm": 0.17457728443288248, + "learning_rate": 4.825488062898464e-05, + "loss": 0.9277, + "step": 21885 + }, + { + "epoch": 4.668515358361775, + "grad_norm": 0.1605745093470875, + "learning_rate": 4.8247069703039274e-05, + "loss": 0.9292, + "step": 21886 + }, + { + "epoch": 4.66872866894198, + "grad_norm": 0.16265736085702448, + "learning_rate": 4.823925913637887e-05, + "loss": 0.8789, + "step": 21887 + }, + { + "epoch": 4.6689419795221845, + "grad_norm": 0.1625989814600512, + "learning_rate": 4.8231448929091756e-05, + "loss": 0.8742, + "step": 21888 + }, + { + "epoch": 4.669155290102389, + "grad_norm": 0.16739863328256457, + "learning_rate": 4.822363908126635e-05, + "loss": 0.9243, + "step": 21889 + }, + { + "epoch": 4.669368600682594, + "grad_norm": 0.15215368690191974, + "learning_rate": 4.821582959299099e-05, + "loss": 0.9108, + "step": 21890 + }, + { + "epoch": 4.6695819112627985, + "grad_norm": 0.15866277582818325, + "learning_rate": 4.8208020464354064e-05, + "loss": 0.9027, + "step": 21891 + }, + { + "epoch": 4.669795221843003, + "grad_norm": 0.18983099309838194, + "learning_rate": 4.8200211695443936e-05, + "loss": 0.8927, + "step": 21892 + }, + { + "epoch": 4.670008532423208, + "grad_norm": 0.1881264894575783, + "learning_rate": 4.8192403286348925e-05, + "loss": 0.8854, + "step": 21893 + }, + { + "epoch": 4.670221843003413, + "grad_norm": 0.16739511937647086, + "learning_rate": 4.818459523715742e-05, + "loss": 0.9273, + "step": 21894 + }, + { + "epoch": 4.670435153583618, + "grad_norm": 0.15443127295397627, + "learning_rate": 4.817678754795774e-05, + "loss": 0.8543, + "step": 21895 + }, + { + "epoch": 4.670648464163823, + "grad_norm": 0.17429492223197016, + "learning_rate": 4.816898021883825e-05, + "loss": 0.9006, + "step": 21896 + }, + { + "epoch": 4.6708617747440275, + "grad_norm": 0.14899375516671057, + "learning_rate": 4.816117324988729e-05, + "loss": 0.8682, + "step": 21897 + }, + { + "epoch": 4.671075085324232, + "grad_norm": 0.18836040755005926, + "learning_rate": 4.815336664119317e-05, + "loss": 0.8982, + "step": 21898 + }, + { + "epoch": 4.671288395904437, + "grad_norm": 0.1717048612477129, + "learning_rate": 4.8145560392844265e-05, + "loss": 0.9157, + "step": 21899 + }, + { + "epoch": 4.671501706484642, + "grad_norm": 0.1676184618506187, + "learning_rate": 4.8137754504928855e-05, + "loss": 0.8835, + "step": 21900 + }, + { + "epoch": 4.671715017064846, + "grad_norm": 0.14464461178411114, + "learning_rate": 4.812994897753532e-05, + "loss": 0.8929, + "step": 21901 + }, + { + "epoch": 4.671928327645051, + "grad_norm": 0.20803866899982515, + "learning_rate": 4.812214381075192e-05, + "loss": 0.9107, + "step": 21902 + }, + { + "epoch": 4.672141638225256, + "grad_norm": 0.2114477998688759, + "learning_rate": 4.811433900466704e-05, + "loss": 0.8933, + "step": 21903 + }, + { + "epoch": 4.672354948805461, + "grad_norm": 0.15401630707538133, + "learning_rate": 4.810653455936893e-05, + "loss": 0.8977, + "step": 21904 + }, + { + "epoch": 4.672568259385666, + "grad_norm": 0.20164137455049708, + "learning_rate": 4.80987304749459e-05, + "loss": 0.9294, + "step": 21905 + }, + { + "epoch": 4.672781569965871, + "grad_norm": 0.22058359485459755, + "learning_rate": 4.8090926751486284e-05, + "loss": 0.9036, + "step": 21906 + }, + { + "epoch": 4.672994880546075, + "grad_norm": 0.14809524856556464, + "learning_rate": 4.808312338907836e-05, + "loss": 0.9237, + "step": 21907 + }, + { + "epoch": 4.67320819112628, + "grad_norm": 0.19908988863135516, + "learning_rate": 4.807532038781045e-05, + "loss": 0.8838, + "step": 21908 + }, + { + "epoch": 4.673421501706485, + "grad_norm": 0.1599822505478524, + "learning_rate": 4.8067517747770816e-05, + "loss": 0.9129, + "step": 21909 + }, + { + "epoch": 4.673634812286689, + "grad_norm": 0.15216637532559318, + "learning_rate": 4.805971546904774e-05, + "loss": 0.9087, + "step": 21910 + }, + { + "epoch": 4.673848122866894, + "grad_norm": 0.15045124662410306, + "learning_rate": 4.805191355172954e-05, + "loss": 0.8919, + "step": 21911 + }, + { + "epoch": 4.674061433447099, + "grad_norm": 0.18198576030703004, + "learning_rate": 4.8044111995904456e-05, + "loss": 0.881, + "step": 21912 + }, + { + "epoch": 4.674274744027304, + "grad_norm": 0.1508020640987771, + "learning_rate": 4.8036310801660814e-05, + "loss": 0.9014, + "step": 21913 + }, + { + "epoch": 4.674488054607508, + "grad_norm": 0.15888382910595628, + "learning_rate": 4.8028509969086825e-05, + "loss": 0.8785, + "step": 21914 + }, + { + "epoch": 4.674701365187714, + "grad_norm": 0.1603941718732473, + "learning_rate": 4.802070949827082e-05, + "loss": 0.9305, + "step": 21915 + }, + { + "epoch": 4.674914675767918, + "grad_norm": 0.1528389509554431, + "learning_rate": 4.8012909389301e-05, + "loss": 0.9212, + "step": 21916 + }, + { + "epoch": 4.675127986348123, + "grad_norm": 0.18524960633598678, + "learning_rate": 4.8005109642265636e-05, + "loss": 0.9195, + "step": 21917 + }, + { + "epoch": 4.675341296928328, + "grad_norm": 0.16116507694168922, + "learning_rate": 4.7997310257253016e-05, + "loss": 0.918, + "step": 21918 + }, + { + "epoch": 4.675554607508532, + "grad_norm": 0.14056074534326543, + "learning_rate": 4.798951123435135e-05, + "loss": 0.8869, + "step": 21919 + }, + { + "epoch": 4.675767918088737, + "grad_norm": 0.147349793733215, + "learning_rate": 4.7981712573648924e-05, + "loss": 0.8796, + "step": 21920 + }, + { + "epoch": 4.675981228668942, + "grad_norm": 0.22342974503184618, + "learning_rate": 4.797391427523394e-05, + "loss": 0.9007, + "step": 21921 + }, + { + "epoch": 4.676194539249146, + "grad_norm": 0.18164014970137393, + "learning_rate": 4.796611633919466e-05, + "loss": 0.9085, + "step": 21922 + }, + { + "epoch": 4.676407849829351, + "grad_norm": 0.14752755857949984, + "learning_rate": 4.7958318765619326e-05, + "loss": 0.888, + "step": 21923 + }, + { + "epoch": 4.676621160409557, + "grad_norm": 0.24339292142678418, + "learning_rate": 4.795052155459613e-05, + "loss": 0.9208, + "step": 21924 + }, + { + "epoch": 4.676834470989761, + "grad_norm": 0.24900796394903332, + "learning_rate": 4.794272470621334e-05, + "loss": 0.8753, + "step": 21925 + }, + { + "epoch": 4.677047781569966, + "grad_norm": 0.16499891579722697, + "learning_rate": 4.793492822055918e-05, + "loss": 0.9283, + "step": 21926 + }, + { + "epoch": 4.677261092150171, + "grad_norm": 0.244497056413768, + "learning_rate": 4.7927132097721796e-05, + "loss": 0.8979, + "step": 21927 + }, + { + "epoch": 4.677474402730375, + "grad_norm": 0.284885030648419, + "learning_rate": 4.791933633778948e-05, + "loss": 0.8928, + "step": 21928 + }, + { + "epoch": 4.67768771331058, + "grad_norm": 0.21976072608416164, + "learning_rate": 4.7911540940850404e-05, + "loss": 0.92, + "step": 21929 + }, + { + "epoch": 4.677901023890785, + "grad_norm": 0.17662583143157198, + "learning_rate": 4.790374590699279e-05, + "loss": 0.885, + "step": 21930 + }, + { + "epoch": 4.6781143344709895, + "grad_norm": 0.26239458473516747, + "learning_rate": 4.789595123630483e-05, + "loss": 0.9126, + "step": 21931 + }, + { + "epoch": 4.678327645051194, + "grad_norm": 0.22639160348800672, + "learning_rate": 4.788815692887472e-05, + "loss": 0.8776, + "step": 21932 + }, + { + "epoch": 4.6785409556314, + "grad_norm": 0.15909723834188544, + "learning_rate": 4.788036298479066e-05, + "loss": 0.9204, + "step": 21933 + }, + { + "epoch": 4.678754266211604, + "grad_norm": 0.2867549325788384, + "learning_rate": 4.7872569404140807e-05, + "loss": 0.9457, + "step": 21934 + }, + { + "epoch": 4.678967576791809, + "grad_norm": 0.2589233993659332, + "learning_rate": 4.78647761870134e-05, + "loss": 0.9121, + "step": 21935 + }, + { + "epoch": 4.679180887372014, + "grad_norm": 0.1443572137694807, + "learning_rate": 4.7856983333496576e-05, + "loss": 0.9104, + "step": 21936 + }, + { + "epoch": 4.6793941979522184, + "grad_norm": 0.24337162416224578, + "learning_rate": 4.7849190843678556e-05, + "loss": 0.9096, + "step": 21937 + }, + { + "epoch": 4.679607508532423, + "grad_norm": 0.16926430388062452, + "learning_rate": 4.784139871764746e-05, + "loss": 0.8959, + "step": 21938 + }, + { + "epoch": 4.679820819112628, + "grad_norm": 0.15623475560353156, + "learning_rate": 4.783360695549147e-05, + "loss": 0.9016, + "step": 21939 + }, + { + "epoch": 4.6800341296928325, + "grad_norm": 0.20410434389830934, + "learning_rate": 4.7825815557298775e-05, + "loss": 0.8766, + "step": 21940 + }, + { + "epoch": 4.680247440273037, + "grad_norm": 0.1830793068814987, + "learning_rate": 4.781802452315749e-05, + "loss": 0.8818, + "step": 21941 + }, + { + "epoch": 4.680460750853243, + "grad_norm": 0.13390322495833148, + "learning_rate": 4.7810233853155825e-05, + "loss": 0.8926, + "step": 21942 + }, + { + "epoch": 4.680674061433447, + "grad_norm": 0.20745989194746411, + "learning_rate": 4.780244354738189e-05, + "loss": 0.8881, + "step": 21943 + }, + { + "epoch": 4.680887372013652, + "grad_norm": 0.17895842577816293, + "learning_rate": 4.779465360592386e-05, + "loss": 0.9204, + "step": 21944 + }, + { + "epoch": 4.681100682593857, + "grad_norm": 0.18492862121512663, + "learning_rate": 4.778686402886987e-05, + "loss": 0.9099, + "step": 21945 + }, + { + "epoch": 4.6813139931740615, + "grad_norm": 0.2189672225511778, + "learning_rate": 4.777907481630804e-05, + "loss": 0.9231, + "step": 21946 + }, + { + "epoch": 4.681527303754266, + "grad_norm": 0.1827012498933924, + "learning_rate": 4.777128596832653e-05, + "loss": 0.8773, + "step": 21947 + }, + { + "epoch": 4.681740614334471, + "grad_norm": 0.16646227694131743, + "learning_rate": 4.776349748501348e-05, + "loss": 0.9056, + "step": 21948 + }, + { + "epoch": 4.6819539249146755, + "grad_norm": 0.2130888174610256, + "learning_rate": 4.7755709366456965e-05, + "loss": 0.9151, + "step": 21949 + }, + { + "epoch": 4.68216723549488, + "grad_norm": 0.1925364965844941, + "learning_rate": 4.774792161274517e-05, + "loss": 0.897, + "step": 21950 + }, + { + "epoch": 4.682380546075086, + "grad_norm": 0.18227223631344538, + "learning_rate": 4.774013422396615e-05, + "loss": 0.9206, + "step": 21951 + }, + { + "epoch": 4.6825938566552905, + "grad_norm": 0.20302282543560912, + "learning_rate": 4.773234720020808e-05, + "loss": 0.891, + "step": 21952 + }, + { + "epoch": 4.682807167235495, + "grad_norm": 0.18302524110186688, + "learning_rate": 4.7724560541559023e-05, + "loss": 0.9013, + "step": 21953 + }, + { + "epoch": 4.6830204778157, + "grad_norm": 0.1978740213816824, + "learning_rate": 4.771677424810713e-05, + "loss": 0.9171, + "step": 21954 + }, + { + "epoch": 4.6832337883959045, + "grad_norm": 0.21320204474923454, + "learning_rate": 4.770898831994045e-05, + "loss": 0.9128, + "step": 21955 + }, + { + "epoch": 4.683447098976109, + "grad_norm": 0.19415710031607378, + "learning_rate": 4.770120275714714e-05, + "loss": 0.9251, + "step": 21956 + }, + { + "epoch": 4.683660409556314, + "grad_norm": 0.1699951573206785, + "learning_rate": 4.7693417559815255e-05, + "loss": 0.8673, + "step": 21957 + }, + { + "epoch": 4.683873720136519, + "grad_norm": 0.23350794398431587, + "learning_rate": 4.768563272803288e-05, + "loss": 0.8958, + "step": 21958 + }, + { + "epoch": 4.684087030716723, + "grad_norm": 0.14862285456712326, + "learning_rate": 4.767784826188815e-05, + "loss": 0.8922, + "step": 21959 + }, + { + "epoch": 4.684300341296928, + "grad_norm": 0.20450089480222025, + "learning_rate": 4.7670064161469096e-05, + "loss": 0.9054, + "step": 21960 + }, + { + "epoch": 4.684513651877133, + "grad_norm": 0.19831196260277473, + "learning_rate": 4.766228042686379e-05, + "loss": 0.8851, + "step": 21961 + }, + { + "epoch": 4.684726962457338, + "grad_norm": 0.1569124201665598, + "learning_rate": 4.765449705816034e-05, + "loss": 0.9091, + "step": 21962 + }, + { + "epoch": 4.684940273037543, + "grad_norm": 0.18974909105934745, + "learning_rate": 4.764671405544679e-05, + "loss": 0.897, + "step": 21963 + }, + { + "epoch": 4.685153583617748, + "grad_norm": 0.20548727004683598, + "learning_rate": 4.763893141881122e-05, + "loss": 0.8754, + "step": 21964 + }, + { + "epoch": 4.685366894197952, + "grad_norm": 0.14900870930577345, + "learning_rate": 4.763114914834168e-05, + "loss": 0.8826, + "step": 21965 + }, + { + "epoch": 4.685580204778157, + "grad_norm": 0.1546210967679022, + "learning_rate": 4.762336724412624e-05, + "loss": 0.893, + "step": 21966 + }, + { + "epoch": 4.685793515358362, + "grad_norm": 0.1835694592694175, + "learning_rate": 4.7615585706252925e-05, + "loss": 0.921, + "step": 21967 + }, + { + "epoch": 4.686006825938566, + "grad_norm": 0.13614635414585333, + "learning_rate": 4.7607804534809826e-05, + "loss": 0.9086, + "step": 21968 + }, + { + "epoch": 4.686220136518771, + "grad_norm": 0.15327355024791486, + "learning_rate": 4.760002372988496e-05, + "loss": 0.8973, + "step": 21969 + }, + { + "epoch": 4.686433447098976, + "grad_norm": 0.14395819948612942, + "learning_rate": 4.7592243291566365e-05, + "loss": 0.9166, + "step": 21970 + }, + { + "epoch": 4.686646757679181, + "grad_norm": 0.15567287203264898, + "learning_rate": 4.7584463219942103e-05, + "loss": 0.9051, + "step": 21971 + }, + { + "epoch": 4.686860068259386, + "grad_norm": 0.13544653874127577, + "learning_rate": 4.757668351510017e-05, + "loss": 0.896, + "step": 21972 + }, + { + "epoch": 4.687073378839591, + "grad_norm": 0.15517876813095136, + "learning_rate": 4.7568904177128584e-05, + "loss": 0.8797, + "step": 21973 + }, + { + "epoch": 4.687286689419795, + "grad_norm": 0.15231757555828723, + "learning_rate": 4.756112520611543e-05, + "loss": 0.9112, + "step": 21974 + }, + { + "epoch": 4.6875, + "grad_norm": 0.15126499481710604, + "learning_rate": 4.755334660214866e-05, + "loss": 0.8956, + "step": 21975 + }, + { + "epoch": 4.687713310580205, + "grad_norm": 0.18502452116200396, + "learning_rate": 4.754556836531634e-05, + "loss": 0.9238, + "step": 21976 + }, + { + "epoch": 4.687926621160409, + "grad_norm": 0.15265814905836592, + "learning_rate": 4.753779049570644e-05, + "loss": 0.9275, + "step": 21977 + }, + { + "epoch": 4.688139931740614, + "grad_norm": 0.15342945049978313, + "learning_rate": 4.7530012993406997e-05, + "loss": 0.9073, + "step": 21978 + }, + { + "epoch": 4.688353242320819, + "grad_norm": 0.15631192100948074, + "learning_rate": 4.7522235858505995e-05, + "loss": 0.9067, + "step": 21979 + }, + { + "epoch": 4.688566552901024, + "grad_norm": 0.1267251675741933, + "learning_rate": 4.751445909109146e-05, + "loss": 0.8938, + "step": 21980 + }, + { + "epoch": 4.688779863481229, + "grad_norm": 0.17085542620763705, + "learning_rate": 4.750668269125136e-05, + "loss": 0.8853, + "step": 21981 + }, + { + "epoch": 4.688993174061434, + "grad_norm": 0.1427527966630262, + "learning_rate": 4.749890665907371e-05, + "loss": 0.8607, + "step": 21982 + }, + { + "epoch": 4.689206484641638, + "grad_norm": 0.19776139478894522, + "learning_rate": 4.749113099464644e-05, + "loss": 0.9056, + "step": 21983 + }, + { + "epoch": 4.689419795221843, + "grad_norm": 0.12312590314328568, + "learning_rate": 4.7483355698057596e-05, + "loss": 0.8991, + "step": 21984 + }, + { + "epoch": 4.689633105802048, + "grad_norm": 0.21079209401565374, + "learning_rate": 4.747558076939511e-05, + "loss": 0.9162, + "step": 21985 + }, + { + "epoch": 4.689846416382252, + "grad_norm": 0.17164388427743765, + "learning_rate": 4.746780620874699e-05, + "loss": 0.9002, + "step": 21986 + }, + { + "epoch": 4.690059726962457, + "grad_norm": 0.19438919889685816, + "learning_rate": 4.7460032016201176e-05, + "loss": 0.9061, + "step": 21987 + }, + { + "epoch": 4.690273037542662, + "grad_norm": 0.24526617934846648, + "learning_rate": 4.7452258191845656e-05, + "loss": 0.9005, + "step": 21988 + }, + { + "epoch": 4.690486348122867, + "grad_norm": 0.14752208475052153, + "learning_rate": 4.744448473576837e-05, + "loss": 0.9465, + "step": 21989 + }, + { + "epoch": 4.690699658703072, + "grad_norm": 0.20502072475607355, + "learning_rate": 4.7436711648057315e-05, + "loss": 0.8822, + "step": 21990 + }, + { + "epoch": 4.690912969283277, + "grad_norm": 0.20833053729465098, + "learning_rate": 4.7428938928800385e-05, + "loss": 0.9121, + "step": 21991 + }, + { + "epoch": 4.691126279863481, + "grad_norm": 0.19412171919255833, + "learning_rate": 4.742116657808557e-05, + "loss": 0.8961, + "step": 21992 + }, + { + "epoch": 4.691339590443686, + "grad_norm": 0.16225721440572768, + "learning_rate": 4.741339459600083e-05, + "loss": 0.8819, + "step": 21993 + }, + { + "epoch": 4.691552901023891, + "grad_norm": 0.21010408567669528, + "learning_rate": 4.740562298263406e-05, + "loss": 0.9077, + "step": 21994 + }, + { + "epoch": 4.6917662116040955, + "grad_norm": 0.1579936878755055, + "learning_rate": 4.7397851738073206e-05, + "loss": 0.8904, + "step": 21995 + }, + { + "epoch": 4.6919795221843, + "grad_norm": 0.15308020109930148, + "learning_rate": 4.7390080862406206e-05, + "loss": 0.8912, + "step": 21996 + }, + { + "epoch": 4.692192832764505, + "grad_norm": 0.1694046931997776, + "learning_rate": 4.738231035572098e-05, + "loss": 0.9108, + "step": 21997 + }, + { + "epoch": 4.6924061433447095, + "grad_norm": 0.18031730625525946, + "learning_rate": 4.737454021810548e-05, + "loss": 0.9214, + "step": 21998 + }, + { + "epoch": 4.692619453924914, + "grad_norm": 0.15338831434419087, + "learning_rate": 4.7366770449647596e-05, + "loss": 0.9169, + "step": 21999 + }, + { + "epoch": 4.69283276450512, + "grad_norm": 0.16906751256864236, + "learning_rate": 4.7359001050435263e-05, + "loss": 0.9118, + "step": 22000 + }, + { + "epoch": 4.6930460750853245, + "grad_norm": 0.20770383413961677, + "learning_rate": 4.735123202055636e-05, + "loss": 0.9429, + "step": 22001 + }, + { + "epoch": 4.693259385665529, + "grad_norm": 0.14531565349507966, + "learning_rate": 4.7343463360098836e-05, + "loss": 0.9216, + "step": 22002 + }, + { + "epoch": 4.693472696245734, + "grad_norm": 0.21861300324210406, + "learning_rate": 4.733569506915057e-05, + "loss": 0.8804, + "step": 22003 + }, + { + "epoch": 4.6936860068259385, + "grad_norm": 0.1868044083807829, + "learning_rate": 4.7327927147799496e-05, + "loss": 0.9035, + "step": 22004 + }, + { + "epoch": 4.693899317406143, + "grad_norm": 0.15248843881170254, + "learning_rate": 4.7320159596133436e-05, + "loss": 0.9258, + "step": 22005 + }, + { + "epoch": 4.694112627986348, + "grad_norm": 0.1603735196976712, + "learning_rate": 4.7312392414240334e-05, + "loss": 0.8895, + "step": 22006 + }, + { + "epoch": 4.694325938566553, + "grad_norm": 0.15862230577817896, + "learning_rate": 4.7304625602208045e-05, + "loss": 0.9124, + "step": 22007 + }, + { + "epoch": 4.694539249146757, + "grad_norm": 0.17155779801792315, + "learning_rate": 4.729685916012448e-05, + "loss": 0.8852, + "step": 22008 + }, + { + "epoch": 4.694752559726963, + "grad_norm": 0.15712565817997262, + "learning_rate": 4.7289093088077496e-05, + "loss": 0.8756, + "step": 22009 + }, + { + "epoch": 4.6949658703071675, + "grad_norm": 0.16938933294916045, + "learning_rate": 4.728132738615499e-05, + "loss": 0.899, + "step": 22010 + }, + { + "epoch": 4.695179180887372, + "grad_norm": 0.18151145840106347, + "learning_rate": 4.727356205444478e-05, + "loss": 0.9066, + "step": 22011 + }, + { + "epoch": 4.695392491467577, + "grad_norm": 0.14548838676624323, + "learning_rate": 4.7265797093034795e-05, + "loss": 0.9059, + "step": 22012 + }, + { + "epoch": 4.6956058020477816, + "grad_norm": 0.1703079473158037, + "learning_rate": 4.725803250201286e-05, + "loss": 0.8963, + "step": 22013 + }, + { + "epoch": 4.695819112627986, + "grad_norm": 0.14389588568918013, + "learning_rate": 4.7250268281466836e-05, + "loss": 0.8917, + "step": 22014 + }, + { + "epoch": 4.696032423208191, + "grad_norm": 0.14126302162118465, + "learning_rate": 4.72425044314846e-05, + "loss": 0.8914, + "step": 22015 + }, + { + "epoch": 4.696245733788396, + "grad_norm": 0.1600196211446449, + "learning_rate": 4.723474095215397e-05, + "loss": 0.9096, + "step": 22016 + }, + { + "epoch": 4.6964590443686, + "grad_norm": 0.146064701762471, + "learning_rate": 4.722697784356278e-05, + "loss": 0.872, + "step": 22017 + }, + { + "epoch": 4.696672354948806, + "grad_norm": 0.16446216732963057, + "learning_rate": 4.72192151057989e-05, + "loss": 0.9156, + "step": 22018 + }, + { + "epoch": 4.6968856655290105, + "grad_norm": 0.18590840604673653, + "learning_rate": 4.721145273895013e-05, + "loss": 0.9073, + "step": 22019 + }, + { + "epoch": 4.697098976109215, + "grad_norm": 0.1681843753553369, + "learning_rate": 4.720369074310434e-05, + "loss": 0.8982, + "step": 22020 + }, + { + "epoch": 4.69731228668942, + "grad_norm": 0.12945037051453148, + "learning_rate": 4.719592911834934e-05, + "loss": 0.8916, + "step": 22021 + }, + { + "epoch": 4.697525597269625, + "grad_norm": 0.15759655393315003, + "learning_rate": 4.718816786477295e-05, + "loss": 0.9077, + "step": 22022 + }, + { + "epoch": 4.697738907849829, + "grad_norm": 0.14705004752069106, + "learning_rate": 4.718040698246299e-05, + "loss": 0.8851, + "step": 22023 + }, + { + "epoch": 4.697952218430034, + "grad_norm": 0.14882772772458652, + "learning_rate": 4.717264647150728e-05, + "loss": 0.9041, + "step": 22024 + }, + { + "epoch": 4.698165529010239, + "grad_norm": 0.16876928587602907, + "learning_rate": 4.7164886331993616e-05, + "loss": 0.8892, + "step": 22025 + }, + { + "epoch": 4.698378839590443, + "grad_norm": 0.1635653164496142, + "learning_rate": 4.715712656400984e-05, + "loss": 0.8821, + "step": 22026 + }, + { + "epoch": 4.698592150170649, + "grad_norm": 0.15393847793756474, + "learning_rate": 4.714936716764374e-05, + "loss": 0.8807, + "step": 22027 + }, + { + "epoch": 4.698805460750854, + "grad_norm": 0.15863946075205587, + "learning_rate": 4.714160814298309e-05, + "loss": 0.909, + "step": 22028 + }, + { + "epoch": 4.699018771331058, + "grad_norm": 0.22215788678665913, + "learning_rate": 4.713384949011568e-05, + "loss": 0.8974, + "step": 22029 + }, + { + "epoch": 4.699232081911263, + "grad_norm": 0.17775995023869626, + "learning_rate": 4.712609120912932e-05, + "loss": 0.8712, + "step": 22030 + }, + { + "epoch": 4.699445392491468, + "grad_norm": 0.15891956147454556, + "learning_rate": 4.711833330011179e-05, + "loss": 0.923, + "step": 22031 + }, + { + "epoch": 4.699658703071672, + "grad_norm": 0.1998512993382485, + "learning_rate": 4.711057576315089e-05, + "loss": 0.852, + "step": 22032 + }, + { + "epoch": 4.699872013651877, + "grad_norm": 0.17016133735022834, + "learning_rate": 4.710281859833435e-05, + "loss": 0.8691, + "step": 22033 + }, + { + "epoch": 4.700085324232082, + "grad_norm": 0.19639953901748322, + "learning_rate": 4.7095061805749994e-05, + "loss": 0.9004, + "step": 22034 + }, + { + "epoch": 4.700298634812286, + "grad_norm": 0.17216258318754618, + "learning_rate": 4.708730538548555e-05, + "loss": 0.8857, + "step": 22035 + }, + { + "epoch": 4.700511945392492, + "grad_norm": 0.15108010533959298, + "learning_rate": 4.707954933762881e-05, + "loss": 0.8925, + "step": 22036 + }, + { + "epoch": 4.700725255972696, + "grad_norm": 0.1752700055986825, + "learning_rate": 4.707179366226751e-05, + "loss": 0.9314, + "step": 22037 + }, + { + "epoch": 4.700938566552901, + "grad_norm": 0.1652147806253744, + "learning_rate": 4.706403835948947e-05, + "loss": 0.9128, + "step": 22038 + }, + { + "epoch": 4.701151877133106, + "grad_norm": 0.17540206489257498, + "learning_rate": 4.705628342938234e-05, + "loss": 0.8923, + "step": 22039 + }, + { + "epoch": 4.701365187713311, + "grad_norm": 0.16574407612873973, + "learning_rate": 4.704852887203393e-05, + "loss": 0.8861, + "step": 22040 + }, + { + "epoch": 4.701578498293515, + "grad_norm": 0.1660304499976913, + "learning_rate": 4.7040774687531955e-05, + "loss": 0.8846, + "step": 22041 + }, + { + "epoch": 4.70179180887372, + "grad_norm": 0.208050899127387, + "learning_rate": 4.703302087596418e-05, + "loss": 0.8785, + "step": 22042 + }, + { + "epoch": 4.702005119453925, + "grad_norm": 0.17476897911113246, + "learning_rate": 4.7025267437418326e-05, + "loss": 0.9139, + "step": 22043 + }, + { + "epoch": 4.702218430034129, + "grad_norm": 0.18286653215149165, + "learning_rate": 4.7017514371982135e-05, + "loss": 0.9093, + "step": 22044 + }, + { + "epoch": 4.702431740614334, + "grad_norm": 0.17338009240710658, + "learning_rate": 4.700976167974331e-05, + "loss": 0.9003, + "step": 22045 + }, + { + "epoch": 4.702645051194539, + "grad_norm": 0.18707061008380657, + "learning_rate": 4.700200936078959e-05, + "loss": 0.9367, + "step": 22046 + }, + { + "epoch": 4.702858361774744, + "grad_norm": 0.16708087657851411, + "learning_rate": 4.6994257415208694e-05, + "loss": 0.9029, + "step": 22047 + }, + { + "epoch": 4.703071672354949, + "grad_norm": 0.14759572008716101, + "learning_rate": 4.698650584308834e-05, + "loss": 0.9198, + "step": 22048 + }, + { + "epoch": 4.703284982935154, + "grad_norm": 0.1717036173800319, + "learning_rate": 4.697875464451623e-05, + "loss": 0.9069, + "step": 22049 + }, + { + "epoch": 4.703498293515358, + "grad_norm": 0.15516245277847515, + "learning_rate": 4.697100381958006e-05, + "loss": 0.8945, + "step": 22050 + }, + { + "epoch": 4.703711604095563, + "grad_norm": 0.15790705004984742, + "learning_rate": 4.696325336836752e-05, + "loss": 0.9176, + "step": 22051 + }, + { + "epoch": 4.703924914675768, + "grad_norm": 0.15642210100059215, + "learning_rate": 4.695550329096634e-05, + "loss": 0.8991, + "step": 22052 + }, + { + "epoch": 4.7041382252559725, + "grad_norm": 0.6163628292440821, + "learning_rate": 4.694775358746419e-05, + "loss": 0.9033, + "step": 22053 + }, + { + "epoch": 4.704351535836177, + "grad_norm": 0.17745002389257186, + "learning_rate": 4.694000425794876e-05, + "loss": 0.8951, + "step": 22054 + }, + { + "epoch": 4.704564846416382, + "grad_norm": 0.22911638820977945, + "learning_rate": 4.6932255302507736e-05, + "loss": 0.8783, + "step": 22055 + }, + { + "epoch": 4.704778156996587, + "grad_norm": 0.18784070685638093, + "learning_rate": 4.692450672122881e-05, + "loss": 0.8879, + "step": 22056 + }, + { + "epoch": 4.704991467576792, + "grad_norm": 0.20920284187535512, + "learning_rate": 4.691675851419964e-05, + "loss": 0.906, + "step": 22057 + }, + { + "epoch": 4.705204778156997, + "grad_norm": 0.20121212448000658, + "learning_rate": 4.6909010681507905e-05, + "loss": 0.9067, + "step": 22058 + }, + { + "epoch": 4.7054180887372015, + "grad_norm": 0.1746541633848856, + "learning_rate": 4.690126322324126e-05, + "loss": 0.9128, + "step": 22059 + }, + { + "epoch": 4.705631399317406, + "grad_norm": 0.2066802425241575, + "learning_rate": 4.6893516139487425e-05, + "loss": 0.8982, + "step": 22060 + }, + { + "epoch": 4.705844709897611, + "grad_norm": 0.1866724734950302, + "learning_rate": 4.688576943033399e-05, + "loss": 0.8984, + "step": 22061 + }, + { + "epoch": 4.7060580204778155, + "grad_norm": 0.19435509441172213, + "learning_rate": 4.687802309586861e-05, + "loss": 0.9047, + "step": 22062 + }, + { + "epoch": 4.70627133105802, + "grad_norm": 0.17639269068560542, + "learning_rate": 4.6870277136178965e-05, + "loss": 0.9029, + "step": 22063 + }, + { + "epoch": 4.706484641638225, + "grad_norm": 0.2007202265662113, + "learning_rate": 4.686253155135269e-05, + "loss": 0.904, + "step": 22064 + }, + { + "epoch": 4.7066979522184305, + "grad_norm": 0.16049823807731226, + "learning_rate": 4.685478634147742e-05, + "loss": 0.9245, + "step": 22065 + }, + { + "epoch": 4.706911262798635, + "grad_norm": 0.2356728243799868, + "learning_rate": 4.684704150664082e-05, + "loss": 0.8734, + "step": 22066 + }, + { + "epoch": 4.70712457337884, + "grad_norm": 0.1596054695815253, + "learning_rate": 4.6839297046930486e-05, + "loss": 0.8964, + "step": 22067 + }, + { + "epoch": 4.7073378839590445, + "grad_norm": 0.22166537721922705, + "learning_rate": 4.683155296243406e-05, + "loss": 0.9067, + "step": 22068 + }, + { + "epoch": 4.707551194539249, + "grad_norm": 0.14086368265348442, + "learning_rate": 4.682380925323918e-05, + "loss": 0.8928, + "step": 22069 + }, + { + "epoch": 4.707764505119454, + "grad_norm": 0.20858197145959775, + "learning_rate": 4.681606591943346e-05, + "loss": 0.8962, + "step": 22070 + }, + { + "epoch": 4.707977815699659, + "grad_norm": 0.1492961427125241, + "learning_rate": 4.6808322961104525e-05, + "loss": 0.8732, + "step": 22071 + }, + { + "epoch": 4.708191126279863, + "grad_norm": 0.16024172780366472, + "learning_rate": 4.680058037833995e-05, + "loss": 0.8871, + "step": 22072 + }, + { + "epoch": 4.708404436860068, + "grad_norm": 0.1596186980049945, + "learning_rate": 4.679283817122738e-05, + "loss": 0.8931, + "step": 22073 + }, + { + "epoch": 4.7086177474402735, + "grad_norm": 0.1338261868752505, + "learning_rate": 4.678509633985441e-05, + "loss": 0.897, + "step": 22074 + }, + { + "epoch": 4.708831058020478, + "grad_norm": 0.15973851943075906, + "learning_rate": 4.677735488430861e-05, + "loss": 0.8836, + "step": 22075 + }, + { + "epoch": 4.709044368600683, + "grad_norm": 0.14713845059080619, + "learning_rate": 4.676961380467761e-05, + "loss": 0.9041, + "step": 22076 + }, + { + "epoch": 4.709257679180888, + "grad_norm": 0.16215531382708345, + "learning_rate": 4.676187310104898e-05, + "loss": 0.9116, + "step": 22077 + }, + { + "epoch": 4.709470989761092, + "grad_norm": 0.18960959328126425, + "learning_rate": 4.675413277351034e-05, + "loss": 0.9151, + "step": 22078 + }, + { + "epoch": 4.709684300341297, + "grad_norm": 0.1583266780882555, + "learning_rate": 4.674639282214921e-05, + "loss": 0.8763, + "step": 22079 + }, + { + "epoch": 4.709897610921502, + "grad_norm": 0.19354328336733062, + "learning_rate": 4.673865324705324e-05, + "loss": 0.8886, + "step": 22080 + }, + { + "epoch": 4.710110921501706, + "grad_norm": 0.22238804698389097, + "learning_rate": 4.6730914048309936e-05, + "loss": 0.9234, + "step": 22081 + }, + { + "epoch": 4.710324232081911, + "grad_norm": 0.18449142839945565, + "learning_rate": 4.672317522600694e-05, + "loss": 0.8782, + "step": 22082 + }, + { + "epoch": 4.710537542662116, + "grad_norm": 0.2622749743785863, + "learning_rate": 4.671543678023176e-05, + "loss": 0.9469, + "step": 22083 + }, + { + "epoch": 4.71075085324232, + "grad_norm": 0.25664897523418145, + "learning_rate": 4.670769871107198e-05, + "loss": 0.9339, + "step": 22084 + }, + { + "epoch": 4.710964163822526, + "grad_norm": 0.22016042198739444, + "learning_rate": 4.6699961018615125e-05, + "loss": 0.9146, + "step": 22085 + }, + { + "epoch": 4.711177474402731, + "grad_norm": 0.15573532639632426, + "learning_rate": 4.6692223702948756e-05, + "loss": 0.9099, + "step": 22086 + }, + { + "epoch": 4.711390784982935, + "grad_norm": 0.2511769402868526, + "learning_rate": 4.668448676416047e-05, + "loss": 0.8981, + "step": 22087 + }, + { + "epoch": 4.71160409556314, + "grad_norm": 0.22773034149887003, + "learning_rate": 4.667675020233778e-05, + "loss": 0.9177, + "step": 22088 + }, + { + "epoch": 4.711817406143345, + "grad_norm": 0.14775024392080457, + "learning_rate": 4.666901401756821e-05, + "loss": 0.8814, + "step": 22089 + }, + { + "epoch": 4.712030716723549, + "grad_norm": 0.23551843712717221, + "learning_rate": 4.6661278209939284e-05, + "loss": 0.8951, + "step": 22090 + }, + { + "epoch": 4.712244027303754, + "grad_norm": 0.1518448000356994, + "learning_rate": 4.665354277953858e-05, + "loss": 0.9084, + "step": 22091 + }, + { + "epoch": 4.712457337883959, + "grad_norm": 0.19157201201014237, + "learning_rate": 4.6645807726453605e-05, + "loss": 0.8825, + "step": 22092 + }, + { + "epoch": 4.712670648464163, + "grad_norm": 0.23832886960689095, + "learning_rate": 4.6638073050771876e-05, + "loss": 0.9201, + "step": 22093 + }, + { + "epoch": 4.712883959044369, + "grad_norm": 0.14867154827762447, + "learning_rate": 4.663033875258091e-05, + "loss": 0.8816, + "step": 22094 + }, + { + "epoch": 4.713097269624574, + "grad_norm": 0.1837573217690972, + "learning_rate": 4.662260483196823e-05, + "loss": 0.8868, + "step": 22095 + }, + { + "epoch": 4.713310580204778, + "grad_norm": 0.1957022955008359, + "learning_rate": 4.661487128902133e-05, + "loss": 0.8788, + "step": 22096 + }, + { + "epoch": 4.713523890784983, + "grad_norm": 0.1433038499312134, + "learning_rate": 4.6607138123827727e-05, + "loss": 0.8919, + "step": 22097 + }, + { + "epoch": 4.713737201365188, + "grad_norm": 0.18352635661034833, + "learning_rate": 4.659940533647488e-05, + "loss": 0.8915, + "step": 22098 + }, + { + "epoch": 4.713950511945392, + "grad_norm": 0.17889043045471376, + "learning_rate": 4.659167292705036e-05, + "loss": 0.8895, + "step": 22099 + }, + { + "epoch": 4.714163822525597, + "grad_norm": 0.1648337057376573, + "learning_rate": 4.658394089564162e-05, + "loss": 0.8986, + "step": 22100 + }, + { + "epoch": 4.714377133105802, + "grad_norm": 0.14209549276780054, + "learning_rate": 4.657620924233615e-05, + "loss": 0.8969, + "step": 22101 + }, + { + "epoch": 4.714590443686006, + "grad_norm": 0.29182063283733317, + "learning_rate": 4.65684779672214e-05, + "loss": 0.9431, + "step": 22102 + }, + { + "epoch": 4.714803754266212, + "grad_norm": 0.13029781830153703, + "learning_rate": 4.6560747070384926e-05, + "loss": 0.9027, + "step": 22103 + }, + { + "epoch": 4.715017064846417, + "grad_norm": 0.1747908219300324, + "learning_rate": 4.6553016551914156e-05, + "loss": 0.9082, + "step": 22104 + }, + { + "epoch": 4.715230375426621, + "grad_norm": 0.15323842783894032, + "learning_rate": 4.6545286411896576e-05, + "loss": 0.9198, + "step": 22105 + }, + { + "epoch": 4.715443686006826, + "grad_norm": 0.15943646609762308, + "learning_rate": 4.6537556650419625e-05, + "loss": 0.8739, + "step": 22106 + }, + { + "epoch": 4.715656996587031, + "grad_norm": 0.1463128796331052, + "learning_rate": 4.652982726757079e-05, + "loss": 0.8965, + "step": 22107 + }, + { + "epoch": 4.715870307167235, + "grad_norm": 0.17505975165082618, + "learning_rate": 4.652209826343753e-05, + "loss": 0.9042, + "step": 22108 + }, + { + "epoch": 4.71608361774744, + "grad_norm": 0.13660400598659264, + "learning_rate": 4.651436963810728e-05, + "loss": 0.9159, + "step": 22109 + }, + { + "epoch": 4.716296928327645, + "grad_norm": 0.16253617958981043, + "learning_rate": 4.650664139166748e-05, + "loss": 0.9134, + "step": 22110 + }, + { + "epoch": 4.7165102389078495, + "grad_norm": 0.15863639125268794, + "learning_rate": 4.649891352420563e-05, + "loss": 0.9109, + "step": 22111 + }, + { + "epoch": 4.716723549488055, + "grad_norm": 0.15896609011862647, + "learning_rate": 4.6491186035809126e-05, + "loss": 0.8733, + "step": 22112 + }, + { + "epoch": 4.71693686006826, + "grad_norm": 0.15801897780672122, + "learning_rate": 4.648345892656542e-05, + "loss": 0.8934, + "step": 22113 + }, + { + "epoch": 4.717150170648464, + "grad_norm": 0.16514132536752535, + "learning_rate": 4.64757321965619e-05, + "loss": 0.9077, + "step": 22114 + }, + { + "epoch": 4.717363481228669, + "grad_norm": 0.18738975040466813, + "learning_rate": 4.646800584588608e-05, + "loss": 0.8937, + "step": 22115 + }, + { + "epoch": 4.717576791808874, + "grad_norm": 0.17123469378793973, + "learning_rate": 4.646027987462535e-05, + "loss": 0.8925, + "step": 22116 + }, + { + "epoch": 4.7177901023890785, + "grad_norm": 0.16424857113482674, + "learning_rate": 4.645255428286708e-05, + "loss": 0.9035, + "step": 22117 + }, + { + "epoch": 4.718003412969283, + "grad_norm": 0.13766360790668955, + "learning_rate": 4.6444829070698735e-05, + "loss": 0.8915, + "step": 22118 + }, + { + "epoch": 4.718216723549488, + "grad_norm": 0.19799735694195725, + "learning_rate": 4.643710423820772e-05, + "loss": 0.9358, + "step": 22119 + }, + { + "epoch": 4.7184300341296925, + "grad_norm": 0.1656334629435977, + "learning_rate": 4.6429379785481444e-05, + "loss": 0.9255, + "step": 22120 + }, + { + "epoch": 4.718643344709898, + "grad_norm": 0.15739585202072642, + "learning_rate": 4.642165571260728e-05, + "loss": 0.9042, + "step": 22121 + }, + { + "epoch": 4.718856655290102, + "grad_norm": 0.19101067826876483, + "learning_rate": 4.6413932019672625e-05, + "loss": 0.895, + "step": 22122 + }, + { + "epoch": 4.7190699658703075, + "grad_norm": 0.1400498302038313, + "learning_rate": 4.640620870676493e-05, + "loss": 0.9101, + "step": 22123 + }, + { + "epoch": 4.719283276450512, + "grad_norm": 0.13566163670809595, + "learning_rate": 4.639848577397155e-05, + "loss": 0.8688, + "step": 22124 + }, + { + "epoch": 4.719496587030717, + "grad_norm": 0.14466116133697932, + "learning_rate": 4.6390763221379855e-05, + "loss": 0.8528, + "step": 22125 + }, + { + "epoch": 4.7197098976109215, + "grad_norm": 0.1444572919129014, + "learning_rate": 4.6383041049077216e-05, + "loss": 0.9214, + "step": 22126 + }, + { + "epoch": 4.719923208191126, + "grad_norm": 0.1360890021059237, + "learning_rate": 4.637531925715111e-05, + "loss": 0.8946, + "step": 22127 + }, + { + "epoch": 4.720136518771331, + "grad_norm": 0.153071801667784, + "learning_rate": 4.636759784568877e-05, + "loss": 0.8965, + "step": 22128 + }, + { + "epoch": 4.720349829351536, + "grad_norm": 0.15026635042397266, + "learning_rate": 4.635987681477762e-05, + "loss": 0.9036, + "step": 22129 + }, + { + "epoch": 4.72056313993174, + "grad_norm": 0.17762716207094487, + "learning_rate": 4.635215616450506e-05, + "loss": 0.8901, + "step": 22130 + }, + { + "epoch": 4.720776450511945, + "grad_norm": 0.17559396323925985, + "learning_rate": 4.63444358949584e-05, + "loss": 0.9134, + "step": 22131 + }, + { + "epoch": 4.7209897610921505, + "grad_norm": 0.19764847640625907, + "learning_rate": 4.633671600622503e-05, + "loss": 0.8629, + "step": 22132 + }, + { + "epoch": 4.721203071672355, + "grad_norm": 0.19950906529252993, + "learning_rate": 4.632899649839228e-05, + "loss": 0.8931, + "step": 22133 + }, + { + "epoch": 4.72141638225256, + "grad_norm": 0.18095790639859105, + "learning_rate": 4.632127737154747e-05, + "loss": 0.8928, + "step": 22134 + }, + { + "epoch": 4.721629692832765, + "grad_norm": 0.18140749541884066, + "learning_rate": 4.631355862577801e-05, + "loss": 0.8998, + "step": 22135 + }, + { + "epoch": 4.721843003412969, + "grad_norm": 0.16155933368600298, + "learning_rate": 4.6305840261171194e-05, + "loss": 0.9001, + "step": 22136 + }, + { + "epoch": 4.722056313993174, + "grad_norm": 0.18359829837967057, + "learning_rate": 4.629812227781436e-05, + "loss": 0.8922, + "step": 22137 + }, + { + "epoch": 4.722269624573379, + "grad_norm": 0.155513330264654, + "learning_rate": 4.6290404675794846e-05, + "loss": 0.881, + "step": 22138 + }, + { + "epoch": 4.722482935153583, + "grad_norm": 0.16842596276964436, + "learning_rate": 4.6282687455199966e-05, + "loss": 0.9166, + "step": 22139 + }, + { + "epoch": 4.722696245733788, + "grad_norm": 0.17179222766722632, + "learning_rate": 4.6274970616117046e-05, + "loss": 0.9061, + "step": 22140 + }, + { + "epoch": 4.722909556313994, + "grad_norm": 0.15353675791574306, + "learning_rate": 4.626725415863337e-05, + "loss": 0.9131, + "step": 22141 + }, + { + "epoch": 4.723122866894198, + "grad_norm": 0.21002460205687623, + "learning_rate": 4.6259538082836316e-05, + "loss": 0.9227, + "step": 22142 + }, + { + "epoch": 4.723336177474403, + "grad_norm": 0.18491056335516817, + "learning_rate": 4.625182238881315e-05, + "loss": 0.8915, + "step": 22143 + }, + { + "epoch": 4.723549488054608, + "grad_norm": 0.19943074351861947, + "learning_rate": 4.6244107076651175e-05, + "loss": 0.9104, + "step": 22144 + }, + { + "epoch": 4.723762798634812, + "grad_norm": 0.21032324973832825, + "learning_rate": 4.623639214643771e-05, + "loss": 0.9081, + "step": 22145 + }, + { + "epoch": 4.723976109215017, + "grad_norm": 0.20479652562431955, + "learning_rate": 4.622867759825999e-05, + "loss": 0.8924, + "step": 22146 + }, + { + "epoch": 4.724189419795222, + "grad_norm": 0.15027210082671336, + "learning_rate": 4.6220963432205394e-05, + "loss": 0.8703, + "step": 22147 + }, + { + "epoch": 4.724402730375426, + "grad_norm": 0.19748628692717013, + "learning_rate": 4.621324964836117e-05, + "loss": 0.9317, + "step": 22148 + }, + { + "epoch": 4.724616040955631, + "grad_norm": 0.14687570178360917, + "learning_rate": 4.620553624681458e-05, + "loss": 0.9202, + "step": 22149 + }, + { + "epoch": 4.724829351535837, + "grad_norm": 0.1748233102162134, + "learning_rate": 4.619782322765292e-05, + "loss": 0.8947, + "step": 22150 + }, + { + "epoch": 4.725042662116041, + "grad_norm": 0.1947144347161981, + "learning_rate": 4.619011059096347e-05, + "loss": 0.9001, + "step": 22151 + }, + { + "epoch": 4.725255972696246, + "grad_norm": 0.21122166456698857, + "learning_rate": 4.618239833683347e-05, + "loss": 0.9105, + "step": 22152 + }, + { + "epoch": 4.725469283276451, + "grad_norm": 0.16441935127812096, + "learning_rate": 4.6174686465350176e-05, + "loss": 0.8876, + "step": 22153 + }, + { + "epoch": 4.725682593856655, + "grad_norm": 0.1763507261359361, + "learning_rate": 4.616697497660091e-05, + "loss": 0.8972, + "step": 22154 + }, + { + "epoch": 4.72589590443686, + "grad_norm": 0.1436569366659335, + "learning_rate": 4.615926387067289e-05, + "loss": 0.8806, + "step": 22155 + }, + { + "epoch": 4.726109215017065, + "grad_norm": 0.19609439026478498, + "learning_rate": 4.615155314765336e-05, + "loss": 0.9218, + "step": 22156 + }, + { + "epoch": 4.726322525597269, + "grad_norm": 0.20791652250229992, + "learning_rate": 4.614384280762958e-05, + "loss": 0.8786, + "step": 22157 + }, + { + "epoch": 4.726535836177474, + "grad_norm": 0.1371946797800036, + "learning_rate": 4.613613285068876e-05, + "loss": 0.8999, + "step": 22158 + }, + { + "epoch": 4.72674914675768, + "grad_norm": 0.1855097148078945, + "learning_rate": 4.61284232769182e-05, + "loss": 0.8477, + "step": 22159 + }, + { + "epoch": 4.726962457337884, + "grad_norm": 0.17953904368659013, + "learning_rate": 4.61207140864051e-05, + "loss": 0.8834, + "step": 22160 + }, + { + "epoch": 4.727175767918089, + "grad_norm": 0.1400088222491213, + "learning_rate": 4.611300527923669e-05, + "loss": 0.9016, + "step": 22161 + }, + { + "epoch": 4.727389078498294, + "grad_norm": 0.16745852981512846, + "learning_rate": 4.610529685550018e-05, + "loss": 0.908, + "step": 22162 + }, + { + "epoch": 4.727602389078498, + "grad_norm": 0.1642375631470094, + "learning_rate": 4.609758881528283e-05, + "loss": 0.87, + "step": 22163 + }, + { + "epoch": 4.727815699658703, + "grad_norm": 0.168825402576141, + "learning_rate": 4.608988115867181e-05, + "loss": 0.9139, + "step": 22164 + }, + { + "epoch": 4.728029010238908, + "grad_norm": 0.16312639513725555, + "learning_rate": 4.6082173885754344e-05, + "loss": 0.8775, + "step": 22165 + }, + { + "epoch": 4.728242320819112, + "grad_norm": 0.1567725347215018, + "learning_rate": 4.6074466996617666e-05, + "loss": 0.8739, + "step": 22166 + }, + { + "epoch": 4.728455631399317, + "grad_norm": 0.20518206443571793, + "learning_rate": 4.6066760491348974e-05, + "loss": 0.9127, + "step": 22167 + }, + { + "epoch": 4.728668941979522, + "grad_norm": 0.22306286526893124, + "learning_rate": 4.6059054370035466e-05, + "loss": 0.877, + "step": 22168 + }, + { + "epoch": 4.7288822525597265, + "grad_norm": 0.15097063683232526, + "learning_rate": 4.605134863276432e-05, + "loss": 0.9219, + "step": 22169 + }, + { + "epoch": 4.729095563139932, + "grad_norm": 0.18434011039670486, + "learning_rate": 4.60436432796227e-05, + "loss": 0.9249, + "step": 22170 + }, + { + "epoch": 4.729308873720137, + "grad_norm": 0.2071442378646061, + "learning_rate": 4.603593831069786e-05, + "loss": 0.8827, + "step": 22171 + }, + { + "epoch": 4.729522184300341, + "grad_norm": 0.16239116976854528, + "learning_rate": 4.602823372607698e-05, + "loss": 0.9238, + "step": 22172 + }, + { + "epoch": 4.729735494880546, + "grad_norm": 0.1517534930631821, + "learning_rate": 4.6020529525847144e-05, + "loss": 0.8997, + "step": 22173 + }, + { + "epoch": 4.729948805460751, + "grad_norm": 0.19482832221905447, + "learning_rate": 4.601282571009562e-05, + "loss": 0.8746, + "step": 22174 + }, + { + "epoch": 4.7301621160409555, + "grad_norm": 0.18198034999807292, + "learning_rate": 4.600512227890954e-05, + "loss": 0.927, + "step": 22175 + }, + { + "epoch": 4.73037542662116, + "grad_norm": 0.15296543945420182, + "learning_rate": 4.599741923237608e-05, + "loss": 0.9155, + "step": 22176 + }, + { + "epoch": 4.730588737201365, + "grad_norm": 0.204303589488604, + "learning_rate": 4.598971657058236e-05, + "loss": 0.872, + "step": 22177 + }, + { + "epoch": 4.7308020477815695, + "grad_norm": 0.16434884714171238, + "learning_rate": 4.5982014293615616e-05, + "loss": 0.9298, + "step": 22178 + }, + { + "epoch": 4.731015358361775, + "grad_norm": 0.1810555174822843, + "learning_rate": 4.597431240156293e-05, + "loss": 0.9024, + "step": 22179 + }, + { + "epoch": 4.73122866894198, + "grad_norm": 0.2085030332499992, + "learning_rate": 4.5966610894511484e-05, + "loss": 0.9133, + "step": 22180 + }, + { + "epoch": 4.7314419795221845, + "grad_norm": 0.1602948349001761, + "learning_rate": 4.5958909772548396e-05, + "loss": 0.9143, + "step": 22181 + }, + { + "epoch": 4.731655290102389, + "grad_norm": 0.20333028243249177, + "learning_rate": 4.5951209035760795e-05, + "loss": 0.877, + "step": 22182 + }, + { + "epoch": 4.731868600682594, + "grad_norm": 0.18105788489437644, + "learning_rate": 4.59435086842359e-05, + "loss": 0.8768, + "step": 22183 + }, + { + "epoch": 4.7320819112627985, + "grad_norm": 0.21110749202526446, + "learning_rate": 4.593580871806074e-05, + "loss": 0.8981, + "step": 22184 + }, + { + "epoch": 4.732295221843003, + "grad_norm": 0.17121222824280966, + "learning_rate": 4.592810913732245e-05, + "loss": 0.8877, + "step": 22185 + }, + { + "epoch": 4.732508532423208, + "grad_norm": 0.2313605283703681, + "learning_rate": 4.592040994210821e-05, + "loss": 0.9081, + "step": 22186 + }, + { + "epoch": 4.732721843003413, + "grad_norm": 0.1452383840312633, + "learning_rate": 4.5912711132505106e-05, + "loss": 0.9189, + "step": 22187 + }, + { + "epoch": 4.732935153583618, + "grad_norm": 0.25481516910393776, + "learning_rate": 4.590501270860024e-05, + "loss": 0.9158, + "step": 22188 + }, + { + "epoch": 4.733148464163823, + "grad_norm": 0.18682408525729322, + "learning_rate": 4.58973146704807e-05, + "loss": 0.9182, + "step": 22189 + }, + { + "epoch": 4.7333617747440275, + "grad_norm": 0.2286470021207743, + "learning_rate": 4.588961701823365e-05, + "loss": 0.8971, + "step": 22190 + }, + { + "epoch": 4.733575085324232, + "grad_norm": 0.1747321479965984, + "learning_rate": 4.588191975194617e-05, + "loss": 0.9355, + "step": 22191 + }, + { + "epoch": 4.733788395904437, + "grad_norm": 0.21337776041107043, + "learning_rate": 4.587422287170533e-05, + "loss": 0.8941, + "step": 22192 + }, + { + "epoch": 4.734001706484642, + "grad_norm": 0.18332514681254336, + "learning_rate": 4.586652637759823e-05, + "loss": 0.8764, + "step": 22193 + }, + { + "epoch": 4.734215017064846, + "grad_norm": 0.18759384512368882, + "learning_rate": 4.5858830269711964e-05, + "loss": 0.9072, + "step": 22194 + }, + { + "epoch": 4.734428327645051, + "grad_norm": 0.1689894309274495, + "learning_rate": 4.5851134548133606e-05, + "loss": 0.8966, + "step": 22195 + }, + { + "epoch": 4.734641638225256, + "grad_norm": 0.18332978584788065, + "learning_rate": 4.584343921295023e-05, + "loss": 0.9113, + "step": 22196 + }, + { + "epoch": 4.734854948805461, + "grad_norm": 0.17871514793227758, + "learning_rate": 4.583574426424889e-05, + "loss": 0.8973, + "step": 22197 + }, + { + "epoch": 4.735068259385666, + "grad_norm": 0.16748400047350845, + "learning_rate": 4.5828049702116706e-05, + "loss": 0.9256, + "step": 22198 + }, + { + "epoch": 4.735281569965871, + "grad_norm": 0.1228264703253726, + "learning_rate": 4.5820355526640725e-05, + "loss": 0.913, + "step": 22199 + }, + { + "epoch": 4.735494880546075, + "grad_norm": 0.163558059397304, + "learning_rate": 4.581266173790798e-05, + "loss": 0.908, + "step": 22200 + }, + { + "epoch": 4.73570819112628, + "grad_norm": 0.13485007102098948, + "learning_rate": 4.580496833600552e-05, + "loss": 0.8956, + "step": 22201 + }, + { + "epoch": 4.735921501706485, + "grad_norm": 0.145109075536207, + "learning_rate": 4.5797275321020447e-05, + "loss": 0.9303, + "step": 22202 + }, + { + "epoch": 4.736134812286689, + "grad_norm": 0.15992018172083497, + "learning_rate": 4.578958269303978e-05, + "loss": 0.8975, + "step": 22203 + }, + { + "epoch": 4.736348122866894, + "grad_norm": 0.128283187580517, + "learning_rate": 4.578189045215056e-05, + "loss": 0.8996, + "step": 22204 + }, + { + "epoch": 4.736561433447099, + "grad_norm": 0.17571139822430332, + "learning_rate": 4.577419859843982e-05, + "loss": 0.8982, + "step": 22205 + }, + { + "epoch": 4.736774744027304, + "grad_norm": 0.14352098855563145, + "learning_rate": 4.57665071319946e-05, + "loss": 0.8915, + "step": 22206 + }, + { + "epoch": 4.736988054607508, + "grad_norm": 0.16359143513264757, + "learning_rate": 4.575881605290194e-05, + "loss": 0.9023, + "step": 22207 + }, + { + "epoch": 4.737201365187714, + "grad_norm": 0.137940003302205, + "learning_rate": 4.5751125361248844e-05, + "loss": 0.8998, + "step": 22208 + }, + { + "epoch": 4.737414675767918, + "grad_norm": 0.15542399589034378, + "learning_rate": 4.5743435057122305e-05, + "loss": 0.9004, + "step": 22209 + }, + { + "epoch": 4.737627986348123, + "grad_norm": 0.13979472391800732, + "learning_rate": 4.573574514060941e-05, + "loss": 0.8991, + "step": 22210 + }, + { + "epoch": 4.737841296928328, + "grad_norm": 0.15466540908248708, + "learning_rate": 4.5728055611797136e-05, + "loss": 0.8905, + "step": 22211 + }, + { + "epoch": 4.738054607508532, + "grad_norm": 0.17113533120095956, + "learning_rate": 4.5720366470772486e-05, + "loss": 0.908, + "step": 22212 + }, + { + "epoch": 4.738267918088737, + "grad_norm": 0.14609083815976392, + "learning_rate": 4.5712677717622444e-05, + "loss": 0.8676, + "step": 22213 + }, + { + "epoch": 4.738481228668942, + "grad_norm": 0.17301221748796328, + "learning_rate": 4.570498935243405e-05, + "loss": 0.8936, + "step": 22214 + }, + { + "epoch": 4.738694539249146, + "grad_norm": 0.16821046092751216, + "learning_rate": 4.569730137529428e-05, + "loss": 0.9099, + "step": 22215 + }, + { + "epoch": 4.738907849829351, + "grad_norm": 0.17133050907604344, + "learning_rate": 4.568961378629013e-05, + "loss": 0.907, + "step": 22216 + }, + { + "epoch": 4.739121160409557, + "grad_norm": 0.1627122786795229, + "learning_rate": 4.568192658550857e-05, + "loss": 0.9243, + "step": 22217 + }, + { + "epoch": 4.739334470989761, + "grad_norm": 0.16793229492192982, + "learning_rate": 4.567423977303659e-05, + "loss": 0.8876, + "step": 22218 + }, + { + "epoch": 4.739547781569966, + "grad_norm": 0.13451294576305245, + "learning_rate": 4.5666553348961154e-05, + "loss": 0.9001, + "step": 22219 + }, + { + "epoch": 4.739761092150171, + "grad_norm": 0.14588843841253907, + "learning_rate": 4.5658867313369265e-05, + "loss": 0.903, + "step": 22220 + }, + { + "epoch": 4.739974402730375, + "grad_norm": 0.13491567699145177, + "learning_rate": 4.5651181666347825e-05, + "loss": 0.8974, + "step": 22221 + }, + { + "epoch": 4.74018771331058, + "grad_norm": 0.1719111389648611, + "learning_rate": 4.5643496407983875e-05, + "loss": 0.8932, + "step": 22222 + }, + { + "epoch": 4.740401023890785, + "grad_norm": 0.13751183114670637, + "learning_rate": 4.563581153836435e-05, + "loss": 0.9227, + "step": 22223 + }, + { + "epoch": 4.7406143344709895, + "grad_norm": 0.15720105621891436, + "learning_rate": 4.562812705757618e-05, + "loss": 0.9156, + "step": 22224 + }, + { + "epoch": 4.740827645051194, + "grad_norm": 0.1471752412360856, + "learning_rate": 4.562044296570631e-05, + "loss": 0.8994, + "step": 22225 + }, + { + "epoch": 4.7410409556314, + "grad_norm": 0.14683226713259082, + "learning_rate": 4.5612759262841736e-05, + "loss": 0.9179, + "step": 22226 + }, + { + "epoch": 4.741254266211604, + "grad_norm": 0.15533687134394378, + "learning_rate": 4.560507594906936e-05, + "loss": 0.8912, + "step": 22227 + }, + { + "epoch": 4.741467576791809, + "grad_norm": 0.14168309698536233, + "learning_rate": 4.559739302447617e-05, + "loss": 0.9068, + "step": 22228 + }, + { + "epoch": 4.741680887372014, + "grad_norm": 0.15236859529772506, + "learning_rate": 4.5589710489148986e-05, + "loss": 0.889, + "step": 22229 + }, + { + "epoch": 4.7418941979522184, + "grad_norm": 0.16173219256934535, + "learning_rate": 4.5582028343174845e-05, + "loss": 0.8983, + "step": 22230 + }, + { + "epoch": 4.742107508532423, + "grad_norm": 0.17347425843648623, + "learning_rate": 4.557434658664063e-05, + "loss": 0.9052, + "step": 22231 + }, + { + "epoch": 4.742320819112628, + "grad_norm": 0.15374126909033234, + "learning_rate": 4.556666521963326e-05, + "loss": 0.9209, + "step": 22232 + }, + { + "epoch": 4.7425341296928325, + "grad_norm": 0.1523507350444448, + "learning_rate": 4.555898424223963e-05, + "loss": 0.9017, + "step": 22233 + }, + { + "epoch": 4.742747440273037, + "grad_norm": 0.15548520856339182, + "learning_rate": 4.5551303654546696e-05, + "loss": 0.9307, + "step": 22234 + }, + { + "epoch": 4.742960750853243, + "grad_norm": 0.1399924535275591, + "learning_rate": 4.554362345664134e-05, + "loss": 0.9041, + "step": 22235 + }, + { + "epoch": 4.743174061433447, + "grad_norm": 0.15632414376553613, + "learning_rate": 4.553594364861047e-05, + "loss": 0.9251, + "step": 22236 + }, + { + "epoch": 4.743387372013652, + "grad_norm": 0.1568589187736829, + "learning_rate": 4.5528264230540946e-05, + "loss": 0.89, + "step": 22237 + }, + { + "epoch": 4.743600682593857, + "grad_norm": 0.16565827652915216, + "learning_rate": 4.552058520251972e-05, + "loss": 0.8998, + "step": 22238 + }, + { + "epoch": 4.7438139931740615, + "grad_norm": 0.14867754634907135, + "learning_rate": 4.5512906564633686e-05, + "loss": 0.9056, + "step": 22239 + }, + { + "epoch": 4.744027303754266, + "grad_norm": 0.14498406741865474, + "learning_rate": 4.550522831696965e-05, + "loss": 0.9391, + "step": 22240 + }, + { + "epoch": 4.744240614334471, + "grad_norm": 0.15125762507992768, + "learning_rate": 4.5497550459614524e-05, + "loss": 0.8777, + "step": 22241 + }, + { + "epoch": 4.7444539249146755, + "grad_norm": 0.14703452944198026, + "learning_rate": 4.5489872992655214e-05, + "loss": 0.8478, + "step": 22242 + }, + { + "epoch": 4.74466723549488, + "grad_norm": 0.13096098754435373, + "learning_rate": 4.548219591617857e-05, + "loss": 0.9019, + "step": 22243 + }, + { + "epoch": 4.744880546075086, + "grad_norm": 0.14019922640041957, + "learning_rate": 4.5474519230271475e-05, + "loss": 0.8842, + "step": 22244 + }, + { + "epoch": 4.7450938566552905, + "grad_norm": 0.14792764740341788, + "learning_rate": 4.546684293502073e-05, + "loss": 0.9405, + "step": 22245 + }, + { + "epoch": 4.745307167235495, + "grad_norm": 0.13421310506796416, + "learning_rate": 4.545916703051327e-05, + "loss": 0.9165, + "step": 22246 + }, + { + "epoch": 4.7455204778157, + "grad_norm": 0.12551133986824153, + "learning_rate": 4.545149151683593e-05, + "loss": 0.9089, + "step": 22247 + }, + { + "epoch": 4.7457337883959045, + "grad_norm": 0.15666998933009849, + "learning_rate": 4.544381639407552e-05, + "loss": 0.9178, + "step": 22248 + }, + { + "epoch": 4.745947098976109, + "grad_norm": 0.15688407292692907, + "learning_rate": 4.543614166231889e-05, + "loss": 0.9131, + "step": 22249 + }, + { + "epoch": 4.746160409556314, + "grad_norm": 0.15495100285271746, + "learning_rate": 4.5428467321652966e-05, + "loss": 0.8896, + "step": 22250 + }, + { + "epoch": 4.746373720136519, + "grad_norm": 0.1674860761427171, + "learning_rate": 4.542079337216448e-05, + "loss": 0.9105, + "step": 22251 + }, + { + "epoch": 4.746587030716723, + "grad_norm": 0.16316513428943738, + "learning_rate": 4.54131198139403e-05, + "loss": 0.9025, + "step": 22252 + }, + { + "epoch": 4.746800341296928, + "grad_norm": 0.16605550525800564, + "learning_rate": 4.540544664706723e-05, + "loss": 0.9031, + "step": 22253 + }, + { + "epoch": 4.747013651877133, + "grad_norm": 0.1911224705850865, + "learning_rate": 4.5397773871632126e-05, + "loss": 0.8871, + "step": 22254 + }, + { + "epoch": 4.747226962457338, + "grad_norm": 0.16027938786216878, + "learning_rate": 4.53901014877218e-05, + "loss": 0.8897, + "step": 22255 + }, + { + "epoch": 4.747440273037543, + "grad_norm": 0.1793516343590049, + "learning_rate": 4.538242949542305e-05, + "loss": 0.8826, + "step": 22256 + }, + { + "epoch": 4.747653583617748, + "grad_norm": 0.17557780178512616, + "learning_rate": 4.5374757894822675e-05, + "loss": 0.8956, + "step": 22257 + }, + { + "epoch": 4.747866894197952, + "grad_norm": 0.16571984056548733, + "learning_rate": 4.536708668600753e-05, + "loss": 0.9255, + "step": 22258 + }, + { + "epoch": 4.748080204778157, + "grad_norm": 0.16290580976207253, + "learning_rate": 4.535941586906437e-05, + "loss": 0.8987, + "step": 22259 + }, + { + "epoch": 4.748293515358362, + "grad_norm": 0.19127389998305142, + "learning_rate": 4.5351745444080006e-05, + "loss": 0.8874, + "step": 22260 + }, + { + "epoch": 4.748506825938566, + "grad_norm": 0.14315844502762726, + "learning_rate": 4.5344075411141224e-05, + "loss": 0.913, + "step": 22261 + }, + { + "epoch": 4.748720136518771, + "grad_norm": 0.15579067732827734, + "learning_rate": 4.533640577033481e-05, + "loss": 0.8899, + "step": 22262 + }, + { + "epoch": 4.748933447098976, + "grad_norm": 0.15816670760460647, + "learning_rate": 4.532873652174756e-05, + "loss": 0.9146, + "step": 22263 + }, + { + "epoch": 4.749146757679181, + "grad_norm": 0.1534113157551625, + "learning_rate": 4.532106766546623e-05, + "loss": 0.9283, + "step": 22264 + }, + { + "epoch": 4.749360068259386, + "grad_norm": 0.17784331118150917, + "learning_rate": 4.531339920157757e-05, + "loss": 0.904, + "step": 22265 + }, + { + "epoch": 4.749573378839591, + "grad_norm": 0.1606619434738964, + "learning_rate": 4.530573113016841e-05, + "loss": 0.8892, + "step": 22266 + }, + { + "epoch": 4.749786689419795, + "grad_norm": 0.1504040160674464, + "learning_rate": 4.529806345132549e-05, + "loss": 0.8994, + "step": 22267 + }, + { + "epoch": 4.75, + "grad_norm": 0.1372666151600627, + "learning_rate": 4.529039616513556e-05, + "loss": 0.9249, + "step": 22268 + }, + { + "epoch": 4.750213310580205, + "grad_norm": 0.16097509940445837, + "learning_rate": 4.5282729271685356e-05, + "loss": 0.9139, + "step": 22269 + }, + { + "epoch": 4.750426621160409, + "grad_norm": 0.17773029813077415, + "learning_rate": 4.527506277106169e-05, + "loss": 0.9356, + "step": 22270 + }, + { + "epoch": 4.750639931740614, + "grad_norm": 0.1569837783188563, + "learning_rate": 4.526739666335127e-05, + "loss": 0.9054, + "step": 22271 + }, + { + "epoch": 4.750853242320819, + "grad_norm": 0.16537055181389698, + "learning_rate": 4.5259730948640836e-05, + "loss": 0.9049, + "step": 22272 + }, + { + "epoch": 4.751066552901024, + "grad_norm": 0.1667483164706433, + "learning_rate": 4.525206562701714e-05, + "loss": 0.8922, + "step": 22273 + }, + { + "epoch": 4.751279863481229, + "grad_norm": 0.1425667927568013, + "learning_rate": 4.52444006985669e-05, + "loss": 0.934, + "step": 22274 + }, + { + "epoch": 4.751493174061434, + "grad_norm": 0.1519916273758332, + "learning_rate": 4.523673616337684e-05, + "loss": 0.8829, + "step": 22275 + }, + { + "epoch": 4.751706484641638, + "grad_norm": 0.16752480736376144, + "learning_rate": 4.522907202153372e-05, + "loss": 0.8896, + "step": 22276 + }, + { + "epoch": 4.751919795221843, + "grad_norm": 0.18091414764372218, + "learning_rate": 4.5221408273124184e-05, + "loss": 0.9053, + "step": 22277 + }, + { + "epoch": 4.752133105802048, + "grad_norm": 0.13410727387961072, + "learning_rate": 4.521374491823503e-05, + "loss": 0.8862, + "step": 22278 + }, + { + "epoch": 4.752346416382252, + "grad_norm": 0.15973946759707833, + "learning_rate": 4.5206081956952943e-05, + "loss": 0.8687, + "step": 22279 + }, + { + "epoch": 4.752559726962457, + "grad_norm": 0.17066796378125085, + "learning_rate": 4.519841938936463e-05, + "loss": 0.8561, + "step": 22280 + }, + { + "epoch": 4.752773037542662, + "grad_norm": 0.12681361279726203, + "learning_rate": 4.519075721555674e-05, + "loss": 0.8931, + "step": 22281 + }, + { + "epoch": 4.752986348122867, + "grad_norm": 0.1544206403411365, + "learning_rate": 4.518309543561605e-05, + "loss": 0.9124, + "step": 22282 + }, + { + "epoch": 4.753199658703072, + "grad_norm": 0.12184363385566951, + "learning_rate": 4.517543404962924e-05, + "loss": 0.8768, + "step": 22283 + }, + { + "epoch": 4.753412969283277, + "grad_norm": 0.14823785324298874, + "learning_rate": 4.516777305768297e-05, + "loss": 0.9142, + "step": 22284 + }, + { + "epoch": 4.753626279863481, + "grad_norm": 0.16839234645860762, + "learning_rate": 4.516011245986393e-05, + "loss": 0.9034, + "step": 22285 + }, + { + "epoch": 4.753839590443686, + "grad_norm": 0.14867571177847813, + "learning_rate": 4.51524522562588e-05, + "loss": 0.9026, + "step": 22286 + }, + { + "epoch": 4.754052901023891, + "grad_norm": 0.19715730192502554, + "learning_rate": 4.514479244695427e-05, + "loss": 0.9287, + "step": 22287 + }, + { + "epoch": 4.7542662116040955, + "grad_norm": 0.17221096062962324, + "learning_rate": 4.513713303203699e-05, + "loss": 0.9134, + "step": 22288 + }, + { + "epoch": 4.7544795221843, + "grad_norm": 0.21558916593057284, + "learning_rate": 4.512947401159361e-05, + "loss": 0.8982, + "step": 22289 + }, + { + "epoch": 4.754692832764505, + "grad_norm": 0.12729380907380597, + "learning_rate": 4.512181538571085e-05, + "loss": 0.8861, + "step": 22290 + }, + { + "epoch": 4.7549061433447095, + "grad_norm": 0.21395541447239927, + "learning_rate": 4.511415715447533e-05, + "loss": 0.9188, + "step": 22291 + }, + { + "epoch": 4.755119453924914, + "grad_norm": 0.14359644599714427, + "learning_rate": 4.510649931797371e-05, + "loss": 0.8681, + "step": 22292 + }, + { + "epoch": 4.75533276450512, + "grad_norm": 0.1600748342549389, + "learning_rate": 4.509884187629261e-05, + "loss": 0.9138, + "step": 22293 + }, + { + "epoch": 4.7555460750853245, + "grad_norm": 0.1525711535885422, + "learning_rate": 4.5091184829518734e-05, + "loss": 0.9193, + "step": 22294 + }, + { + "epoch": 4.755759385665529, + "grad_norm": 0.16231451267851013, + "learning_rate": 4.5083528177738717e-05, + "loss": 0.9405, + "step": 22295 + }, + { + "epoch": 4.755972696245734, + "grad_norm": 0.19283974804335538, + "learning_rate": 4.50758719210391e-05, + "loss": 0.9141, + "step": 22296 + }, + { + "epoch": 4.7561860068259385, + "grad_norm": 0.14985126920901332, + "learning_rate": 4.506821605950661e-05, + "loss": 0.893, + "step": 22297 + }, + { + "epoch": 4.756399317406143, + "grad_norm": 0.22165328300035234, + "learning_rate": 4.506056059322784e-05, + "loss": 0.8863, + "step": 22298 + }, + { + "epoch": 4.756612627986348, + "grad_norm": 0.13625079154712644, + "learning_rate": 4.505290552228942e-05, + "loss": 0.8926, + "step": 22299 + }, + { + "epoch": 4.756825938566553, + "grad_norm": 0.22483425381513608, + "learning_rate": 4.5045250846777974e-05, + "loss": 0.9088, + "step": 22300 + }, + { + "epoch": 4.757039249146757, + "grad_norm": 0.16224218079542344, + "learning_rate": 4.503759656678006e-05, + "loss": 0.8899, + "step": 22301 + }, + { + "epoch": 4.757252559726963, + "grad_norm": 0.23793403229505475, + "learning_rate": 4.502994268238237e-05, + "loss": 0.9057, + "step": 22302 + }, + { + "epoch": 4.7574658703071675, + "grad_norm": 0.18939344954924203, + "learning_rate": 4.502228919367145e-05, + "loss": 0.888, + "step": 22303 + }, + { + "epoch": 4.757679180887372, + "grad_norm": 0.18020211890558785, + "learning_rate": 4.501463610073392e-05, + "loss": 0.8889, + "step": 22304 + }, + { + "epoch": 4.757892491467577, + "grad_norm": 0.23104883162646594, + "learning_rate": 4.500698340365635e-05, + "loss": 0.9055, + "step": 22305 + }, + { + "epoch": 4.7581058020477816, + "grad_norm": 0.1337799182435498, + "learning_rate": 4.49993311025254e-05, + "loss": 0.9009, + "step": 22306 + }, + { + "epoch": 4.758319112627986, + "grad_norm": 0.21835470562409784, + "learning_rate": 4.4991679197427585e-05, + "loss": 0.9255, + "step": 22307 + }, + { + "epoch": 4.758532423208191, + "grad_norm": 0.2032069549933857, + "learning_rate": 4.498402768844951e-05, + "loss": 0.9436, + "step": 22308 + }, + { + "epoch": 4.758745733788396, + "grad_norm": 0.1981056121956158, + "learning_rate": 4.4976376575677716e-05, + "loss": 0.9084, + "step": 22309 + }, + { + "epoch": 4.7589590443686, + "grad_norm": 0.13354848997241467, + "learning_rate": 4.4968725859198845e-05, + "loss": 0.89, + "step": 22310 + }, + { + "epoch": 4.759172354948806, + "grad_norm": 0.20849322801739412, + "learning_rate": 4.496107553909943e-05, + "loss": 0.9327, + "step": 22311 + }, + { + "epoch": 4.7593856655290105, + "grad_norm": 0.17269111140167961, + "learning_rate": 4.495342561546602e-05, + "loss": 0.8902, + "step": 22312 + }, + { + "epoch": 4.759598976109215, + "grad_norm": 0.14616955014230834, + "learning_rate": 4.494577608838517e-05, + "loss": 0.9124, + "step": 22313 + }, + { + "epoch": 4.75981228668942, + "grad_norm": 0.1718015033002124, + "learning_rate": 4.493812695794349e-05, + "loss": 0.9075, + "step": 22314 + }, + { + "epoch": 4.760025597269625, + "grad_norm": 0.1404557605621696, + "learning_rate": 4.4930478224227476e-05, + "loss": 0.8955, + "step": 22315 + }, + { + "epoch": 4.760238907849829, + "grad_norm": 0.18947895348888696, + "learning_rate": 4.4922829887323705e-05, + "loss": 0.8849, + "step": 22316 + }, + { + "epoch": 4.760452218430034, + "grad_norm": 0.2002914690169119, + "learning_rate": 4.491518194731871e-05, + "loss": 0.936, + "step": 22317 + }, + { + "epoch": 4.760665529010239, + "grad_norm": 0.18869624714001018, + "learning_rate": 4.490753440429901e-05, + "loss": 0.8884, + "step": 22318 + }, + { + "epoch": 4.760878839590443, + "grad_norm": 0.1720197438922995, + "learning_rate": 4.489988725835115e-05, + "loss": 0.8871, + "step": 22319 + }, + { + "epoch": 4.761092150170649, + "grad_norm": 0.20371928859426233, + "learning_rate": 4.489224050956166e-05, + "loss": 0.8923, + "step": 22320 + }, + { + "epoch": 4.761305460750854, + "grad_norm": 0.14856287457489603, + "learning_rate": 4.4884594158017016e-05, + "loss": 0.9185, + "step": 22321 + }, + { + "epoch": 4.761518771331058, + "grad_norm": 0.17071806867454636, + "learning_rate": 4.487694820380382e-05, + "loss": 0.897, + "step": 22322 + }, + { + "epoch": 4.761732081911263, + "grad_norm": 0.1494134888438019, + "learning_rate": 4.486930264700855e-05, + "loss": 0.9048, + "step": 22323 + }, + { + "epoch": 4.761945392491468, + "grad_norm": 0.19502828204971895, + "learning_rate": 4.48616574877177e-05, + "loss": 0.8962, + "step": 22324 + }, + { + "epoch": 4.762158703071672, + "grad_norm": 0.17328617880454666, + "learning_rate": 4.4854012726017764e-05, + "loss": 0.8932, + "step": 22325 + }, + { + "epoch": 4.762372013651877, + "grad_norm": 0.19062443811152446, + "learning_rate": 4.4846368361995296e-05, + "loss": 0.9511, + "step": 22326 + }, + { + "epoch": 4.762585324232082, + "grad_norm": 0.16970525972694614, + "learning_rate": 4.4838724395736755e-05, + "loss": 0.8568, + "step": 22327 + }, + { + "epoch": 4.762798634812286, + "grad_norm": 0.1515382351501689, + "learning_rate": 4.483108082732864e-05, + "loss": 0.8825, + "step": 22328 + }, + { + "epoch": 4.763011945392492, + "grad_norm": 0.17546995603641122, + "learning_rate": 4.4823437656857445e-05, + "loss": 0.903, + "step": 22329 + }, + { + "epoch": 4.763225255972696, + "grad_norm": 0.1599145532249187, + "learning_rate": 4.481579488440965e-05, + "loss": 0.9162, + "step": 22330 + }, + { + "epoch": 4.763438566552901, + "grad_norm": 0.19558667649843833, + "learning_rate": 4.4808152510071705e-05, + "loss": 0.912, + "step": 22331 + }, + { + "epoch": 4.763651877133106, + "grad_norm": 0.18540360371234257, + "learning_rate": 4.4800510533930125e-05, + "loss": 0.8971, + "step": 22332 + }, + { + "epoch": 4.763865187713311, + "grad_norm": 0.14417789981911236, + "learning_rate": 4.479286895607132e-05, + "loss": 0.8939, + "step": 22333 + }, + { + "epoch": 4.764078498293515, + "grad_norm": 0.20852548098813622, + "learning_rate": 4.4785227776581836e-05, + "loss": 0.9172, + "step": 22334 + }, + { + "epoch": 4.76429180887372, + "grad_norm": 0.17046691061872077, + "learning_rate": 4.477758699554808e-05, + "loss": 0.8861, + "step": 22335 + }, + { + "epoch": 4.764505119453925, + "grad_norm": 0.18411740335415752, + "learning_rate": 4.476994661305653e-05, + "loss": 0.8648, + "step": 22336 + }, + { + "epoch": 4.764718430034129, + "grad_norm": 0.1503370217725381, + "learning_rate": 4.476230662919358e-05, + "loss": 0.8773, + "step": 22337 + }, + { + "epoch": 4.764931740614334, + "grad_norm": 0.23172041510834815, + "learning_rate": 4.475466704404577e-05, + "loss": 0.9016, + "step": 22338 + }, + { + "epoch": 4.765145051194539, + "grad_norm": 0.1426044943446384, + "learning_rate": 4.4747027857699495e-05, + "loss": 0.899, + "step": 22339 + }, + { + "epoch": 4.765358361774744, + "grad_norm": 0.16588398407012622, + "learning_rate": 4.473938907024119e-05, + "loss": 0.9128, + "step": 22340 + }, + { + "epoch": 4.765571672354949, + "grad_norm": 0.15783540793671727, + "learning_rate": 4.473175068175728e-05, + "loss": 0.9002, + "step": 22341 + }, + { + "epoch": 4.765784982935154, + "grad_norm": 0.13473572276350415, + "learning_rate": 4.472411269233421e-05, + "loss": 0.9282, + "step": 22342 + }, + { + "epoch": 4.765998293515358, + "grad_norm": 0.13725564006252786, + "learning_rate": 4.4716475102058406e-05, + "loss": 0.8863, + "step": 22343 + }, + { + "epoch": 4.766211604095563, + "grad_norm": 0.15455844226645085, + "learning_rate": 4.470883791101628e-05, + "loss": 0.8979, + "step": 22344 + }, + { + "epoch": 4.766424914675768, + "grad_norm": 0.1295708836772537, + "learning_rate": 4.4701201119294204e-05, + "loss": 0.9252, + "step": 22345 + }, + { + "epoch": 4.7666382252559725, + "grad_norm": 0.1577400185191118, + "learning_rate": 4.469356472697866e-05, + "loss": 0.9018, + "step": 22346 + }, + { + "epoch": 4.766851535836177, + "grad_norm": 0.14129906215304616, + "learning_rate": 4.4685928734156044e-05, + "loss": 0.8816, + "step": 22347 + }, + { + "epoch": 4.767064846416382, + "grad_norm": 0.15456477690740272, + "learning_rate": 4.467829314091272e-05, + "loss": 0.9262, + "step": 22348 + }, + { + "epoch": 4.767278156996587, + "grad_norm": 0.1401810704572264, + "learning_rate": 4.467065794733509e-05, + "loss": 0.9056, + "step": 22349 + }, + { + "epoch": 4.767491467576792, + "grad_norm": 0.1712216275765307, + "learning_rate": 4.466302315350957e-05, + "loss": 0.893, + "step": 22350 + }, + { + "epoch": 4.767704778156997, + "grad_norm": 0.15410557295270774, + "learning_rate": 4.465538875952259e-05, + "loss": 0.9091, + "step": 22351 + }, + { + "epoch": 4.7679180887372015, + "grad_norm": 0.1531155369019552, + "learning_rate": 4.46477547654604e-05, + "loss": 0.9031, + "step": 22352 + }, + { + "epoch": 4.768131399317406, + "grad_norm": 0.16360559489429533, + "learning_rate": 4.46401211714095e-05, + "loss": 0.9075, + "step": 22353 + }, + { + "epoch": 4.768344709897611, + "grad_norm": 0.17181334532637219, + "learning_rate": 4.463248797745623e-05, + "loss": 0.9278, + "step": 22354 + }, + { + "epoch": 4.7685580204778155, + "grad_norm": 0.15518472883529488, + "learning_rate": 4.462485518368693e-05, + "loss": 0.9019, + "step": 22355 + }, + { + "epoch": 4.76877133105802, + "grad_norm": 0.16659418870734563, + "learning_rate": 4.4617222790188014e-05, + "loss": 0.9302, + "step": 22356 + }, + { + "epoch": 4.768984641638225, + "grad_norm": 0.15492257919633914, + "learning_rate": 4.460959079704577e-05, + "loss": 0.9384, + "step": 22357 + }, + { + "epoch": 4.7691979522184305, + "grad_norm": 0.16706518770285753, + "learning_rate": 4.460195920434664e-05, + "loss": 0.9279, + "step": 22358 + }, + { + "epoch": 4.769411262798635, + "grad_norm": 0.15831602904111722, + "learning_rate": 4.4594328012176936e-05, + "loss": 0.9356, + "step": 22359 + }, + { + "epoch": 4.76962457337884, + "grad_norm": 0.16925211011943744, + "learning_rate": 4.4586697220623e-05, + "loss": 0.9095, + "step": 22360 + }, + { + "epoch": 4.7698378839590445, + "grad_norm": 0.15491723714015765, + "learning_rate": 4.457906682977116e-05, + "loss": 0.9029, + "step": 22361 + }, + { + "epoch": 4.770051194539249, + "grad_norm": 0.1684844693809218, + "learning_rate": 4.457143683970784e-05, + "loss": 0.9053, + "step": 22362 + }, + { + "epoch": 4.770264505119454, + "grad_norm": 0.14136616012581066, + "learning_rate": 4.4563807250519265e-05, + "loss": 0.8769, + "step": 22363 + }, + { + "epoch": 4.770477815699659, + "grad_norm": 0.13412924769561987, + "learning_rate": 4.4556178062291784e-05, + "loss": 0.9039, + "step": 22364 + }, + { + "epoch": 4.770691126279863, + "grad_norm": 0.15221493983364415, + "learning_rate": 4.454854927511177e-05, + "loss": 0.9251, + "step": 22365 + }, + { + "epoch": 4.770904436860068, + "grad_norm": 0.1458788847446254, + "learning_rate": 4.454092088906551e-05, + "loss": 0.8956, + "step": 22366 + }, + { + "epoch": 4.7711177474402735, + "grad_norm": 0.15808024215345795, + "learning_rate": 4.4533292904239344e-05, + "loss": 0.8968, + "step": 22367 + }, + { + "epoch": 4.771331058020478, + "grad_norm": 0.15208380901212554, + "learning_rate": 4.4525665320719556e-05, + "loss": 0.8885, + "step": 22368 + }, + { + "epoch": 4.771544368600683, + "grad_norm": 0.18249807010190233, + "learning_rate": 4.451803813859243e-05, + "loss": 0.9119, + "step": 22369 + }, + { + "epoch": 4.771757679180888, + "grad_norm": 0.16346392467074328, + "learning_rate": 4.451041135794433e-05, + "loss": 0.8967, + "step": 22370 + }, + { + "epoch": 4.771970989761092, + "grad_norm": 0.18233521877533587, + "learning_rate": 4.450278497886152e-05, + "loss": 0.8851, + "step": 22371 + }, + { + "epoch": 4.772184300341297, + "grad_norm": 0.1536906466338222, + "learning_rate": 4.449515900143031e-05, + "loss": 0.873, + "step": 22372 + }, + { + "epoch": 4.772397610921502, + "grad_norm": 0.173756139001204, + "learning_rate": 4.448753342573696e-05, + "loss": 0.934, + "step": 22373 + }, + { + "epoch": 4.772610921501706, + "grad_norm": 0.15872356641156757, + "learning_rate": 4.447990825186777e-05, + "loss": 0.9021, + "step": 22374 + }, + { + "epoch": 4.772824232081911, + "grad_norm": 0.15798773715637235, + "learning_rate": 4.447228347990901e-05, + "loss": 0.9113, + "step": 22375 + }, + { + "epoch": 4.773037542662116, + "grad_norm": 0.16518429122205341, + "learning_rate": 4.446465910994693e-05, + "loss": 0.8962, + "step": 22376 + }, + { + "epoch": 4.77325085324232, + "grad_norm": 0.173278738208545, + "learning_rate": 4.4457035142067856e-05, + "loss": 0.9279, + "step": 22377 + }, + { + "epoch": 4.773464163822526, + "grad_norm": 0.14125726712084333, + "learning_rate": 4.4449411576358045e-05, + "loss": 0.9087, + "step": 22378 + }, + { + "epoch": 4.773677474402731, + "grad_norm": 0.17378857628379016, + "learning_rate": 4.4441788412903726e-05, + "loss": 0.8994, + "step": 22379 + }, + { + "epoch": 4.773890784982935, + "grad_norm": 0.1467921857410203, + "learning_rate": 4.4434165651791175e-05, + "loss": 0.8788, + "step": 22380 + }, + { + "epoch": 4.77410409556314, + "grad_norm": 0.18966672525608824, + "learning_rate": 4.44265432931066e-05, + "loss": 0.9241, + "step": 22381 + }, + { + "epoch": 4.774317406143345, + "grad_norm": 0.1345171255929432, + "learning_rate": 4.441892133693634e-05, + "loss": 0.9038, + "step": 22382 + }, + { + "epoch": 4.774530716723549, + "grad_norm": 0.16294603952502462, + "learning_rate": 4.441129978336656e-05, + "loss": 0.8927, + "step": 22383 + }, + { + "epoch": 4.774744027303754, + "grad_norm": 0.14859711112474772, + "learning_rate": 4.440367863248352e-05, + "loss": 0.8808, + "step": 22384 + }, + { + "epoch": 4.774957337883959, + "grad_norm": 0.14957816577828897, + "learning_rate": 4.439605788437346e-05, + "loss": 0.875, + "step": 22385 + }, + { + "epoch": 4.775170648464163, + "grad_norm": 0.1647290302476898, + "learning_rate": 4.438843753912261e-05, + "loss": 0.8992, + "step": 22386 + }, + { + "epoch": 4.775383959044369, + "grad_norm": 0.14223409519082525, + "learning_rate": 4.438081759681719e-05, + "loss": 0.9187, + "step": 22387 + }, + { + "epoch": 4.775597269624574, + "grad_norm": 0.1507536479088936, + "learning_rate": 4.437319805754338e-05, + "loss": 0.9015, + "step": 22388 + }, + { + "epoch": 4.775810580204778, + "grad_norm": 0.13893710181419475, + "learning_rate": 4.436557892138746e-05, + "loss": 0.9004, + "step": 22389 + }, + { + "epoch": 4.776023890784983, + "grad_norm": 0.16608381211857823, + "learning_rate": 4.435796018843563e-05, + "loss": 0.9222, + "step": 22390 + }, + { + "epoch": 4.776237201365188, + "grad_norm": 0.1599125168915645, + "learning_rate": 4.4350341858774064e-05, + "loss": 0.9336, + "step": 22391 + }, + { + "epoch": 4.776450511945392, + "grad_norm": 0.15195459623714305, + "learning_rate": 4.4342723932488984e-05, + "loss": 0.9326, + "step": 22392 + }, + { + "epoch": 4.776663822525597, + "grad_norm": 0.1679339386349615, + "learning_rate": 4.4335106409666555e-05, + "loss": 0.9105, + "step": 22393 + }, + { + "epoch": 4.776877133105802, + "grad_norm": 0.16312192143527612, + "learning_rate": 4.432748929039303e-05, + "loss": 0.9079, + "step": 22394 + }, + { + "epoch": 4.777090443686006, + "grad_norm": 0.15279005012797234, + "learning_rate": 4.431987257475456e-05, + "loss": 0.8904, + "step": 22395 + }, + { + "epoch": 4.777303754266212, + "grad_norm": 0.16790404178616794, + "learning_rate": 4.4312256262837324e-05, + "loss": 0.911, + "step": 22396 + }, + { + "epoch": 4.777517064846417, + "grad_norm": 0.1325225597796427, + "learning_rate": 4.430464035472752e-05, + "loss": 0.9179, + "step": 22397 + }, + { + "epoch": 4.777730375426621, + "grad_norm": 0.16094852007240706, + "learning_rate": 4.429702485051131e-05, + "loss": 0.9028, + "step": 22398 + }, + { + "epoch": 4.777943686006826, + "grad_norm": 0.13350135772943877, + "learning_rate": 4.428940975027486e-05, + "loss": 0.91, + "step": 22399 + }, + { + "epoch": 4.778156996587031, + "grad_norm": 0.15571267980475656, + "learning_rate": 4.4281795054104304e-05, + "loss": 0.8721, + "step": 22400 + }, + { + "epoch": 4.778370307167235, + "grad_norm": 0.1480699130456993, + "learning_rate": 4.427418076208588e-05, + "loss": 0.9139, + "step": 22401 + }, + { + "epoch": 4.77858361774744, + "grad_norm": 0.1721585148409807, + "learning_rate": 4.4266566874305695e-05, + "loss": 0.9198, + "step": 22402 + }, + { + "epoch": 4.778796928327645, + "grad_norm": 0.151595060194217, + "learning_rate": 4.4258953390849905e-05, + "loss": 0.9002, + "step": 22403 + }, + { + "epoch": 4.7790102389078495, + "grad_norm": 0.2171198727750067, + "learning_rate": 4.425134031180466e-05, + "loss": 0.886, + "step": 22404 + }, + { + "epoch": 4.779223549488055, + "grad_norm": 0.19975658510781097, + "learning_rate": 4.4243727637256075e-05, + "loss": 0.8908, + "step": 22405 + }, + { + "epoch": 4.77943686006826, + "grad_norm": 0.13276279343715439, + "learning_rate": 4.4236115367290334e-05, + "loss": 0.8955, + "step": 22406 + }, + { + "epoch": 4.779650170648464, + "grad_norm": 0.20645461274549695, + "learning_rate": 4.422850350199358e-05, + "loss": 0.9057, + "step": 22407 + }, + { + "epoch": 4.779863481228669, + "grad_norm": 0.1510306684373197, + "learning_rate": 4.422089204145186e-05, + "loss": 0.9147, + "step": 22408 + }, + { + "epoch": 4.780076791808874, + "grad_norm": 0.16901921274081222, + "learning_rate": 4.4213280985751375e-05, + "loss": 0.8905, + "step": 22409 + }, + { + "epoch": 4.7802901023890785, + "grad_norm": 0.1570621857968843, + "learning_rate": 4.420567033497822e-05, + "loss": 0.9257, + "step": 22410 + }, + { + "epoch": 4.780503412969283, + "grad_norm": 0.13678706114903147, + "learning_rate": 4.41980600892185e-05, + "loss": 0.859, + "step": 22411 + }, + { + "epoch": 4.780716723549488, + "grad_norm": 0.15271650430777714, + "learning_rate": 4.41904502485583e-05, + "loss": 0.9078, + "step": 22412 + }, + { + "epoch": 4.7809300341296925, + "grad_norm": 0.1323241361568801, + "learning_rate": 4.418284081308381e-05, + "loss": 0.8962, + "step": 22413 + }, + { + "epoch": 4.781143344709898, + "grad_norm": 0.16888996095909878, + "learning_rate": 4.417523178288106e-05, + "loss": 0.9077, + "step": 22414 + }, + { + "epoch": 4.781356655290102, + "grad_norm": 0.13943493175742272, + "learning_rate": 4.4167623158036165e-05, + "loss": 0.9145, + "step": 22415 + }, + { + "epoch": 4.7815699658703075, + "grad_norm": 0.18037965826102628, + "learning_rate": 4.416001493863522e-05, + "loss": 0.9004, + "step": 22416 + }, + { + "epoch": 4.781783276450512, + "grad_norm": 0.149501528529477, + "learning_rate": 4.4152407124764274e-05, + "loss": 0.9251, + "step": 22417 + }, + { + "epoch": 4.781996587030717, + "grad_norm": 0.1607851988899867, + "learning_rate": 4.414479971650951e-05, + "loss": 0.8743, + "step": 22418 + }, + { + "epoch": 4.7822098976109215, + "grad_norm": 0.15919424076081218, + "learning_rate": 4.413719271395691e-05, + "loss": 0.8892, + "step": 22419 + }, + { + "epoch": 4.782423208191126, + "grad_norm": 0.1366858263081141, + "learning_rate": 4.412958611719255e-05, + "loss": 0.8677, + "step": 22420 + }, + { + "epoch": 4.782636518771331, + "grad_norm": 0.1743017393505263, + "learning_rate": 4.4121979926302544e-05, + "loss": 0.8789, + "step": 22421 + }, + { + "epoch": 4.782849829351536, + "grad_norm": 0.14874908034637885, + "learning_rate": 4.411437414137295e-05, + "loss": 0.8987, + "step": 22422 + }, + { + "epoch": 4.78306313993174, + "grad_norm": 0.16999623466038893, + "learning_rate": 4.410676876248982e-05, + "loss": 0.8879, + "step": 22423 + }, + { + "epoch": 4.783276450511945, + "grad_norm": 0.15739514834758642, + "learning_rate": 4.409916378973916e-05, + "loss": 0.9022, + "step": 22424 + }, + { + "epoch": 4.7834897610921505, + "grad_norm": 0.1549743759417274, + "learning_rate": 4.409155922320711e-05, + "loss": 0.9097, + "step": 22425 + }, + { + "epoch": 4.783703071672355, + "grad_norm": 0.1406377843656878, + "learning_rate": 4.408395506297966e-05, + "loss": 0.8881, + "step": 22426 + }, + { + "epoch": 4.78391638225256, + "grad_norm": 0.14929767382255482, + "learning_rate": 4.407635130914287e-05, + "loss": 0.9023, + "step": 22427 + }, + { + "epoch": 4.784129692832765, + "grad_norm": 0.1462907378181503, + "learning_rate": 4.4068747961782766e-05, + "loss": 0.8932, + "step": 22428 + }, + { + "epoch": 4.784343003412969, + "grad_norm": 0.14596021452211552, + "learning_rate": 4.406114502098539e-05, + "loss": 0.9068, + "step": 22429 + }, + { + "epoch": 4.784556313993174, + "grad_norm": 0.161108470785787, + "learning_rate": 4.4053542486836756e-05, + "loss": 0.8979, + "step": 22430 + }, + { + "epoch": 4.784769624573379, + "grad_norm": 0.17993193651215217, + "learning_rate": 4.404594035942289e-05, + "loss": 0.8803, + "step": 22431 + }, + { + "epoch": 4.784982935153583, + "grad_norm": 0.13988545676668768, + "learning_rate": 4.403833863882979e-05, + "loss": 0.8946, + "step": 22432 + }, + { + "epoch": 4.785196245733788, + "grad_norm": 0.16905483890466966, + "learning_rate": 4.4030737325143524e-05, + "loss": 0.8971, + "step": 22433 + }, + { + "epoch": 4.785409556313994, + "grad_norm": 0.17923346526767509, + "learning_rate": 4.402313641845007e-05, + "loss": 0.9286, + "step": 22434 + }, + { + "epoch": 4.785622866894198, + "grad_norm": 0.1513781029954373, + "learning_rate": 4.401553591883543e-05, + "loss": 0.8911, + "step": 22435 + }, + { + "epoch": 4.785836177474403, + "grad_norm": 0.1688111599093453, + "learning_rate": 4.4007935826385576e-05, + "loss": 0.9203, + "step": 22436 + }, + { + "epoch": 4.786049488054608, + "grad_norm": 0.17633525773027633, + "learning_rate": 4.400033614118655e-05, + "loss": 0.9341, + "step": 22437 + }, + { + "epoch": 4.786262798634812, + "grad_norm": 0.15642524013824186, + "learning_rate": 4.3992736863324354e-05, + "loss": 0.8812, + "step": 22438 + }, + { + "epoch": 4.786476109215017, + "grad_norm": 0.16577474883405208, + "learning_rate": 4.398513799288493e-05, + "loss": 0.8937, + "step": 22439 + }, + { + "epoch": 4.786689419795222, + "grad_norm": 0.15669256403747708, + "learning_rate": 4.397753952995428e-05, + "loss": 0.9331, + "step": 22440 + }, + { + "epoch": 4.786902730375426, + "grad_norm": 0.37735178622688914, + "learning_rate": 4.396994147461837e-05, + "loss": 0.9217, + "step": 22441 + }, + { + "epoch": 4.787116040955631, + "grad_norm": 0.20516810398357102, + "learning_rate": 4.396234382696318e-05, + "loss": 0.9034, + "step": 22442 + }, + { + "epoch": 4.787329351535837, + "grad_norm": 0.17951551582116887, + "learning_rate": 4.395474658707468e-05, + "loss": 0.8736, + "step": 22443 + }, + { + "epoch": 4.787542662116041, + "grad_norm": 0.14691686912622812, + "learning_rate": 4.394714975503879e-05, + "loss": 0.876, + "step": 22444 + }, + { + "epoch": 4.787755972696246, + "grad_norm": 0.21181058477485276, + "learning_rate": 4.393955333094155e-05, + "loss": 0.8947, + "step": 22445 + }, + { + "epoch": 4.787969283276451, + "grad_norm": 0.14160076384838893, + "learning_rate": 4.393195731486886e-05, + "loss": 0.8889, + "step": 22446 + }, + { + "epoch": 4.788182593856655, + "grad_norm": 0.2068146292146356, + "learning_rate": 4.392436170690669e-05, + "loss": 0.9053, + "step": 22447 + }, + { + "epoch": 4.78839590443686, + "grad_norm": 0.22227663227828887, + "learning_rate": 4.391676650714093e-05, + "loss": 0.8837, + "step": 22448 + }, + { + "epoch": 4.788609215017065, + "grad_norm": 0.15268783298444905, + "learning_rate": 4.390917171565761e-05, + "loss": 0.9596, + "step": 22449 + }, + { + "epoch": 4.788822525597269, + "grad_norm": 0.1835470364378336, + "learning_rate": 4.3901577332542605e-05, + "loss": 0.8796, + "step": 22450 + }, + { + "epoch": 4.789035836177474, + "grad_norm": 0.14195331619085466, + "learning_rate": 4.3893983357881866e-05, + "loss": 0.9165, + "step": 22451 + }, + { + "epoch": 4.78924914675768, + "grad_norm": 0.1373739359653884, + "learning_rate": 4.388638979176131e-05, + "loss": 0.8792, + "step": 22452 + }, + { + "epoch": 4.789462457337884, + "grad_norm": 0.1513451358265319, + "learning_rate": 4.387879663426686e-05, + "loss": 0.9045, + "step": 22453 + }, + { + "epoch": 4.789675767918089, + "grad_norm": 0.14764649901761162, + "learning_rate": 4.387120388548445e-05, + "loss": 0.8849, + "step": 22454 + }, + { + "epoch": 4.789889078498294, + "grad_norm": 0.1359642260155748, + "learning_rate": 4.386361154549995e-05, + "loss": 0.9181, + "step": 22455 + }, + { + "epoch": 4.790102389078498, + "grad_norm": 0.16592455797124098, + "learning_rate": 4.3856019614399286e-05, + "loss": 0.855, + "step": 22456 + }, + { + "epoch": 4.790315699658703, + "grad_norm": 0.18048048772558528, + "learning_rate": 4.384842809226838e-05, + "loss": 0.8947, + "step": 22457 + }, + { + "epoch": 4.790529010238908, + "grad_norm": 0.15606258626228775, + "learning_rate": 4.384083697919313e-05, + "loss": 0.9446, + "step": 22458 + }, + { + "epoch": 4.790742320819112, + "grad_norm": 0.15412806829239484, + "learning_rate": 4.383324627525941e-05, + "loss": 0.9154, + "step": 22459 + }, + { + "epoch": 4.790955631399317, + "grad_norm": 0.19651092094260245, + "learning_rate": 4.3825655980553095e-05, + "loss": 0.9029, + "step": 22460 + }, + { + "epoch": 4.791168941979522, + "grad_norm": 0.133852301478565, + "learning_rate": 4.381806609516011e-05, + "loss": 0.8911, + "step": 22461 + }, + { + "epoch": 4.7913822525597265, + "grad_norm": 0.22798819346168003, + "learning_rate": 4.3810476619166323e-05, + "loss": 0.9283, + "step": 22462 + }, + { + "epoch": 4.791595563139932, + "grad_norm": 0.1783204844698422, + "learning_rate": 4.3802887552657636e-05, + "loss": 0.9069, + "step": 22463 + }, + { + "epoch": 4.791808873720137, + "grad_norm": 0.16187596546186256, + "learning_rate": 4.3795298895719824e-05, + "loss": 0.8803, + "step": 22464 + }, + { + "epoch": 4.792022184300341, + "grad_norm": 0.15453668454613476, + "learning_rate": 4.378771064843884e-05, + "loss": 0.9282, + "step": 22465 + }, + { + "epoch": 4.792235494880546, + "grad_norm": 0.15368090844023777, + "learning_rate": 4.378012281090051e-05, + "loss": 0.8888, + "step": 22466 + }, + { + "epoch": 4.792448805460751, + "grad_norm": 0.13675320328794716, + "learning_rate": 4.377253538319071e-05, + "loss": 0.8783, + "step": 22467 + }, + { + "epoch": 4.7926621160409555, + "grad_norm": 0.1360373161171371, + "learning_rate": 4.376494836539525e-05, + "loss": 0.8964, + "step": 22468 + }, + { + "epoch": 4.79287542662116, + "grad_norm": 0.15611553567611067, + "learning_rate": 4.375736175760004e-05, + "loss": 0.9193, + "step": 22469 + }, + { + "epoch": 4.793088737201365, + "grad_norm": 0.1584077851810648, + "learning_rate": 4.374977555989089e-05, + "loss": 0.9009, + "step": 22470 + }, + { + "epoch": 4.7933020477815695, + "grad_norm": 0.17423229001761914, + "learning_rate": 4.374218977235364e-05, + "loss": 0.9186, + "step": 22471 + }, + { + "epoch": 4.793515358361775, + "grad_norm": 0.1846783317545217, + "learning_rate": 4.3734604395074094e-05, + "loss": 0.9263, + "step": 22472 + }, + { + "epoch": 4.79372866894198, + "grad_norm": 0.13483312034310874, + "learning_rate": 4.3727019428138145e-05, + "loss": 0.8881, + "step": 22473 + }, + { + "epoch": 4.7939419795221845, + "grad_norm": 0.208916071239985, + "learning_rate": 4.37194348716316e-05, + "loss": 0.9182, + "step": 22474 + }, + { + "epoch": 4.794155290102389, + "grad_norm": 0.20073157445368203, + "learning_rate": 4.371185072564023e-05, + "loss": 0.9054, + "step": 22475 + }, + { + "epoch": 4.794368600682594, + "grad_norm": 0.11744489348390397, + "learning_rate": 4.370426699024985e-05, + "loss": 0.9054, + "step": 22476 + }, + { + "epoch": 4.7945819112627985, + "grad_norm": 0.2148519081124318, + "learning_rate": 4.369668366554633e-05, + "loss": 0.9015, + "step": 22477 + }, + { + "epoch": 4.794795221843003, + "grad_norm": 0.18613031201250785, + "learning_rate": 4.368910075161545e-05, + "loss": 0.9115, + "step": 22478 + }, + { + "epoch": 4.795008532423208, + "grad_norm": 0.16517479782302696, + "learning_rate": 4.3681518248543e-05, + "loss": 0.9047, + "step": 22479 + }, + { + "epoch": 4.795221843003413, + "grad_norm": 0.1487586207753769, + "learning_rate": 4.367393615641476e-05, + "loss": 0.9071, + "step": 22480 + }, + { + "epoch": 4.795435153583618, + "grad_norm": 0.19024821906501285, + "learning_rate": 4.366635447531657e-05, + "loss": 0.9243, + "step": 22481 + }, + { + "epoch": 4.795648464163823, + "grad_norm": 0.13721394900426398, + "learning_rate": 4.365877320533418e-05, + "loss": 0.8878, + "step": 22482 + }, + { + "epoch": 4.7958617747440275, + "grad_norm": 0.16149436015315938, + "learning_rate": 4.36511923465534e-05, + "loss": 0.9191, + "step": 22483 + }, + { + "epoch": 4.796075085324232, + "grad_norm": 0.14004243543012795, + "learning_rate": 4.364361189905995e-05, + "loss": 0.8946, + "step": 22484 + }, + { + "epoch": 4.796288395904437, + "grad_norm": 0.15694362820007193, + "learning_rate": 4.36360318629397e-05, + "loss": 0.8928, + "step": 22485 + }, + { + "epoch": 4.796501706484642, + "grad_norm": 0.15898889513576792, + "learning_rate": 4.362845223827834e-05, + "loss": 0.8529, + "step": 22486 + }, + { + "epoch": 4.796715017064846, + "grad_norm": 0.15918519908606757, + "learning_rate": 4.362087302516164e-05, + "loss": 0.8962, + "step": 22487 + }, + { + "epoch": 4.796928327645051, + "grad_norm": 0.17692769813078918, + "learning_rate": 4.361329422367536e-05, + "loss": 0.8862, + "step": 22488 + }, + { + "epoch": 4.797141638225256, + "grad_norm": 0.19176387104810166, + "learning_rate": 4.3605715833905295e-05, + "loss": 0.9172, + "step": 22489 + }, + { + "epoch": 4.797354948805461, + "grad_norm": 0.14769500464526025, + "learning_rate": 4.3598137855937154e-05, + "loss": 0.9004, + "step": 22490 + }, + { + "epoch": 4.797568259385666, + "grad_norm": 0.19533932700688686, + "learning_rate": 4.359056028985672e-05, + "loss": 0.9108, + "step": 22491 + }, + { + "epoch": 4.797781569965871, + "grad_norm": 0.16357903058776885, + "learning_rate": 4.3582983135749653e-05, + "loss": 0.8963, + "step": 22492 + }, + { + "epoch": 4.797994880546075, + "grad_norm": 0.17403405820584592, + "learning_rate": 4.357540639370179e-05, + "loss": 0.9108, + "step": 22493 + }, + { + "epoch": 4.79820819112628, + "grad_norm": 0.20284363051472945, + "learning_rate": 4.356783006379881e-05, + "loss": 0.8948, + "step": 22494 + }, + { + "epoch": 4.798421501706485, + "grad_norm": 0.16452812394162555, + "learning_rate": 4.356025414612645e-05, + "loss": 0.9073, + "step": 22495 + }, + { + "epoch": 4.798634812286689, + "grad_norm": 0.18719897748611422, + "learning_rate": 4.355267864077043e-05, + "loss": 0.8595, + "step": 22496 + }, + { + "epoch": 4.798848122866894, + "grad_norm": 0.18669280728061735, + "learning_rate": 4.354510354781646e-05, + "loss": 0.8821, + "step": 22497 + }, + { + "epoch": 4.799061433447099, + "grad_norm": 0.17097863102035046, + "learning_rate": 4.353752886735026e-05, + "loss": 0.9066, + "step": 22498 + }, + { + "epoch": 4.799274744027304, + "grad_norm": 0.20507015231209455, + "learning_rate": 4.352995459945751e-05, + "loss": 0.9119, + "step": 22499 + }, + { + "epoch": 4.799488054607508, + "grad_norm": 0.19674630585763217, + "learning_rate": 4.352238074422394e-05, + "loss": 0.8895, + "step": 22500 + }, + { + "epoch": 4.799701365187714, + "grad_norm": 0.23208340728353208, + "learning_rate": 4.351480730173526e-05, + "loss": 0.9063, + "step": 22501 + }, + { + "epoch": 4.799914675767918, + "grad_norm": 0.15959846161321695, + "learning_rate": 4.350723427207715e-05, + "loss": 0.9159, + "step": 22502 + }, + { + "epoch": 4.800127986348123, + "grad_norm": 0.28366856623646053, + "learning_rate": 4.3499661655335296e-05, + "loss": 0.9138, + "step": 22503 + }, + { + "epoch": 4.800341296928328, + "grad_norm": 0.18218192782723877, + "learning_rate": 4.349208945159536e-05, + "loss": 0.8991, + "step": 22504 + }, + { + "epoch": 4.800554607508532, + "grad_norm": 0.17311601113490843, + "learning_rate": 4.3484517660943075e-05, + "loss": 0.8845, + "step": 22505 + }, + { + "epoch": 4.800767918088737, + "grad_norm": 0.1633040391121445, + "learning_rate": 4.347694628346408e-05, + "loss": 0.9095, + "step": 22506 + }, + { + "epoch": 4.800981228668942, + "grad_norm": 0.17167828950822261, + "learning_rate": 4.346937531924406e-05, + "loss": 0.8826, + "step": 22507 + }, + { + "epoch": 4.801194539249146, + "grad_norm": 0.13713669907479104, + "learning_rate": 4.346180476836867e-05, + "loss": 0.8973, + "step": 22508 + }, + { + "epoch": 4.801407849829351, + "grad_norm": 0.1757084330500075, + "learning_rate": 4.345423463092358e-05, + "loss": 0.8792, + "step": 22509 + }, + { + "epoch": 4.801621160409557, + "grad_norm": 0.13284112028888467, + "learning_rate": 4.344666490699442e-05, + "loss": 0.9071, + "step": 22510 + }, + { + "epoch": 4.801834470989761, + "grad_norm": 0.14314548772546828, + "learning_rate": 4.343909559666688e-05, + "loss": 0.9171, + "step": 22511 + }, + { + "epoch": 4.802047781569966, + "grad_norm": 0.1446087899189499, + "learning_rate": 4.3431526700026554e-05, + "loss": 0.928, + "step": 22512 + }, + { + "epoch": 4.802261092150171, + "grad_norm": 0.14959567601858834, + "learning_rate": 4.342395821715915e-05, + "loss": 0.9426, + "step": 22513 + }, + { + "epoch": 4.802474402730375, + "grad_norm": 0.15867431871273174, + "learning_rate": 4.341639014815027e-05, + "loss": 0.9094, + "step": 22514 + }, + { + "epoch": 4.80268771331058, + "grad_norm": 0.14332917160967446, + "learning_rate": 4.340882249308555e-05, + "loss": 0.9091, + "step": 22515 + }, + { + "epoch": 4.802901023890785, + "grad_norm": 0.18281694972147278, + "learning_rate": 4.340125525205059e-05, + "loss": 0.91, + "step": 22516 + }, + { + "epoch": 4.8031143344709895, + "grad_norm": 0.15097464838760744, + "learning_rate": 4.339368842513108e-05, + "loss": 0.8845, + "step": 22517 + }, + { + "epoch": 4.803327645051194, + "grad_norm": 0.15949175419021633, + "learning_rate": 4.33861220124126e-05, + "loss": 0.918, + "step": 22518 + }, + { + "epoch": 4.8035409556314, + "grad_norm": 0.15735351512618984, + "learning_rate": 4.337855601398076e-05, + "loss": 0.8765, + "step": 22519 + }, + { + "epoch": 4.803754266211604, + "grad_norm": 0.14319285047319197, + "learning_rate": 4.337099042992117e-05, + "loss": 0.8861, + "step": 22520 + }, + { + "epoch": 4.803967576791809, + "grad_norm": 0.18689090162636277, + "learning_rate": 4.336342526031945e-05, + "loss": 0.8919, + "step": 22521 + }, + { + "epoch": 4.804180887372014, + "grad_norm": 0.18390036119588152, + "learning_rate": 4.335586050526118e-05, + "loss": 0.9272, + "step": 22522 + }, + { + "epoch": 4.8043941979522184, + "grad_norm": 0.14064576209944238, + "learning_rate": 4.334829616483196e-05, + "loss": 0.9073, + "step": 22523 + }, + { + "epoch": 4.804607508532423, + "grad_norm": 0.14735071433813088, + "learning_rate": 4.334073223911738e-05, + "loss": 0.9071, + "step": 22524 + }, + { + "epoch": 4.804820819112628, + "grad_norm": 0.1378263594568493, + "learning_rate": 4.3333168728203033e-05, + "loss": 0.8944, + "step": 22525 + }, + { + "epoch": 4.8050341296928325, + "grad_norm": 0.16205968944232815, + "learning_rate": 4.332560563217451e-05, + "loss": 0.9095, + "step": 22526 + }, + { + "epoch": 4.805247440273037, + "grad_norm": 0.14068706022443198, + "learning_rate": 4.331804295111738e-05, + "loss": 0.9026, + "step": 22527 + }, + { + "epoch": 4.805460750853243, + "grad_norm": 0.14558311540648408, + "learning_rate": 4.331048068511718e-05, + "loss": 0.8982, + "step": 22528 + }, + { + "epoch": 4.805674061433447, + "grad_norm": 0.1383788958995141, + "learning_rate": 4.330291883425954e-05, + "loss": 0.9113, + "step": 22529 + }, + { + "epoch": 4.805887372013652, + "grad_norm": 0.1403243917122269, + "learning_rate": 4.329535739863002e-05, + "loss": 0.9238, + "step": 22530 + }, + { + "epoch": 4.806100682593857, + "grad_norm": 0.15300503857720096, + "learning_rate": 4.3287796378314084e-05, + "loss": 0.9272, + "step": 22531 + }, + { + "epoch": 4.8063139931740615, + "grad_norm": 0.18658115861163177, + "learning_rate": 4.328023577339737e-05, + "loss": 0.8916, + "step": 22532 + }, + { + "epoch": 4.806527303754266, + "grad_norm": 0.16269119043134972, + "learning_rate": 4.327267558396543e-05, + "loss": 0.8689, + "step": 22533 + }, + { + "epoch": 4.806740614334471, + "grad_norm": 0.1689072526513778, + "learning_rate": 4.326511581010376e-05, + "loss": 0.9071, + "step": 22534 + }, + { + "epoch": 4.8069539249146755, + "grad_norm": 0.15688118945438018, + "learning_rate": 4.325755645189794e-05, + "loss": 0.8663, + "step": 22535 + }, + { + "epoch": 4.80716723549488, + "grad_norm": 0.17287691976001918, + "learning_rate": 4.324999750943345e-05, + "loss": 0.8893, + "step": 22536 + }, + { + "epoch": 4.807380546075086, + "grad_norm": 0.1564620738829188, + "learning_rate": 4.324243898279589e-05, + "loss": 0.9199, + "step": 22537 + }, + { + "epoch": 4.8075938566552905, + "grad_norm": 0.16927091465076471, + "learning_rate": 4.323488087207074e-05, + "loss": 0.8729, + "step": 22538 + }, + { + "epoch": 4.807807167235495, + "grad_norm": 0.1343836526009106, + "learning_rate": 4.322732317734354e-05, + "loss": 0.904, + "step": 22539 + }, + { + "epoch": 4.8080204778157, + "grad_norm": 0.16217190661647543, + "learning_rate": 4.321976589869977e-05, + "loss": 0.8844, + "step": 22540 + }, + { + "epoch": 4.8082337883959045, + "grad_norm": 0.1392472566045478, + "learning_rate": 4.321220903622502e-05, + "loss": 0.9176, + "step": 22541 + }, + { + "epoch": 4.808447098976109, + "grad_norm": 0.19301064975620968, + "learning_rate": 4.3204652590004715e-05, + "loss": 0.9161, + "step": 22542 + }, + { + "epoch": 4.808660409556314, + "grad_norm": 0.1720840618965252, + "learning_rate": 4.319709656012436e-05, + "loss": 0.884, + "step": 22543 + }, + { + "epoch": 4.808873720136519, + "grad_norm": 0.15464275072095093, + "learning_rate": 4.31895409466695e-05, + "loss": 0.91, + "step": 22544 + }, + { + "epoch": 4.809087030716723, + "grad_norm": 0.1635413890107987, + "learning_rate": 4.3181985749725606e-05, + "loss": 0.8925, + "step": 22545 + }, + { + "epoch": 4.809300341296928, + "grad_norm": 0.15368260884630444, + "learning_rate": 4.3174430969378156e-05, + "loss": 0.9027, + "step": 22546 + }, + { + "epoch": 4.809513651877133, + "grad_norm": 0.17898674200731307, + "learning_rate": 4.3166876605712653e-05, + "loss": 0.9186, + "step": 22547 + }, + { + "epoch": 4.809726962457338, + "grad_norm": 0.16796522569767144, + "learning_rate": 4.315932265881453e-05, + "loss": 0.9231, + "step": 22548 + }, + { + "epoch": 4.809940273037543, + "grad_norm": 0.151205744428386, + "learning_rate": 4.315176912876931e-05, + "loss": 0.932, + "step": 22549 + }, + { + "epoch": 4.810153583617748, + "grad_norm": 0.145733216916853, + "learning_rate": 4.3144216015662435e-05, + "loss": 0.9346, + "step": 22550 + }, + { + "epoch": 4.810366894197952, + "grad_norm": 0.1570769603371409, + "learning_rate": 4.3136663319579394e-05, + "loss": 0.9111, + "step": 22551 + }, + { + "epoch": 4.810580204778157, + "grad_norm": 0.13891824770473968, + "learning_rate": 4.312911104060562e-05, + "loss": 0.8872, + "step": 22552 + }, + { + "epoch": 4.810793515358362, + "grad_norm": 0.13605145825434192, + "learning_rate": 4.312155917882658e-05, + "loss": 0.9074, + "step": 22553 + }, + { + "epoch": 4.811006825938566, + "grad_norm": 0.18567213577924338, + "learning_rate": 4.311400773432771e-05, + "loss": 0.9002, + "step": 22554 + }, + { + "epoch": 4.811220136518771, + "grad_norm": 0.14291975973524976, + "learning_rate": 4.310645670719443e-05, + "loss": 0.9035, + "step": 22555 + }, + { + "epoch": 4.811433447098976, + "grad_norm": 0.18892248944815704, + "learning_rate": 4.3098906097512254e-05, + "loss": 0.8988, + "step": 22556 + }, + { + "epoch": 4.811646757679181, + "grad_norm": 0.18635862895394678, + "learning_rate": 4.309135590536658e-05, + "loss": 0.9215, + "step": 22557 + }, + { + "epoch": 4.811860068259386, + "grad_norm": 0.16207025335421923, + "learning_rate": 4.308380613084282e-05, + "loss": 0.8993, + "step": 22558 + }, + { + "epoch": 4.812073378839591, + "grad_norm": 0.15343547933923538, + "learning_rate": 4.307625677402642e-05, + "loss": 0.8992, + "step": 22559 + }, + { + "epoch": 4.812286689419795, + "grad_norm": 0.15164897952583622, + "learning_rate": 4.3068707835002764e-05, + "loss": 0.905, + "step": 22560 + }, + { + "epoch": 4.8125, + "grad_norm": 0.14639141032294614, + "learning_rate": 4.306115931385733e-05, + "loss": 0.9065, + "step": 22561 + }, + { + "epoch": 4.812713310580205, + "grad_norm": 0.16139984300198557, + "learning_rate": 4.3053611210675506e-05, + "loss": 0.8925, + "step": 22562 + }, + { + "epoch": 4.812926621160409, + "grad_norm": 0.15330767062767237, + "learning_rate": 4.304606352554268e-05, + "loss": 0.9215, + "step": 22563 + }, + { + "epoch": 4.813139931740614, + "grad_norm": 0.1490743769277784, + "learning_rate": 4.303851625854428e-05, + "loss": 0.9441, + "step": 22564 + }, + { + "epoch": 4.813353242320819, + "grad_norm": 0.14102893285029444, + "learning_rate": 4.303096940976569e-05, + "loss": 0.8822, + "step": 22565 + }, + { + "epoch": 4.813566552901024, + "grad_norm": 0.16088810358051683, + "learning_rate": 4.3023422979292296e-05, + "loss": 0.9135, + "step": 22566 + }, + { + "epoch": 4.813779863481229, + "grad_norm": 0.14666862901584, + "learning_rate": 4.3015876967209464e-05, + "loss": 0.9242, + "step": 22567 + }, + { + "epoch": 4.813993174061434, + "grad_norm": 0.1480599853676691, + "learning_rate": 4.3008331373602636e-05, + "loss": 0.9049, + "step": 22568 + }, + { + "epoch": 4.814206484641638, + "grad_norm": 0.14990247071133983, + "learning_rate": 4.3000786198557166e-05, + "loss": 0.9268, + "step": 22569 + }, + { + "epoch": 4.814419795221843, + "grad_norm": 0.15518047794918502, + "learning_rate": 4.2993241442158414e-05, + "loss": 0.9081, + "step": 22570 + }, + { + "epoch": 4.814633105802048, + "grad_norm": 0.15970659844894722, + "learning_rate": 4.298569710449177e-05, + "loss": 0.8817, + "step": 22571 + }, + { + "epoch": 4.814846416382252, + "grad_norm": 0.15116459921116157, + "learning_rate": 4.2978153185642543e-05, + "loss": 0.9316, + "step": 22572 + }, + { + "epoch": 4.815059726962457, + "grad_norm": 0.19301260273826873, + "learning_rate": 4.297060968569618e-05, + "loss": 0.8966, + "step": 22573 + }, + { + "epoch": 4.815273037542662, + "grad_norm": 0.1338829009812019, + "learning_rate": 4.296306660473798e-05, + "loss": 0.8832, + "step": 22574 + }, + { + "epoch": 4.815486348122867, + "grad_norm": 0.17594362912769562, + "learning_rate": 4.295552394285331e-05, + "loss": 0.9122, + "step": 22575 + }, + { + "epoch": 4.815699658703072, + "grad_norm": 0.13498337388440762, + "learning_rate": 4.29479817001275e-05, + "loss": 0.877, + "step": 22576 + }, + { + "epoch": 4.815912969283277, + "grad_norm": 0.19365799860259056, + "learning_rate": 4.294043987664591e-05, + "loss": 0.8908, + "step": 22577 + }, + { + "epoch": 4.816126279863481, + "grad_norm": 0.13913097655495404, + "learning_rate": 4.293289847249386e-05, + "loss": 0.8985, + "step": 22578 + }, + { + "epoch": 4.816339590443686, + "grad_norm": 0.1879270791717758, + "learning_rate": 4.292535748775668e-05, + "loss": 0.9117, + "step": 22579 + }, + { + "epoch": 4.816552901023891, + "grad_norm": 0.14115722180018284, + "learning_rate": 4.291781692251969e-05, + "loss": 0.8659, + "step": 22580 + }, + { + "epoch": 4.8167662116040955, + "grad_norm": 0.19415490853273762, + "learning_rate": 4.291027677686826e-05, + "loss": 0.8963, + "step": 22581 + }, + { + "epoch": 4.8169795221843, + "grad_norm": 0.13947180642598944, + "learning_rate": 4.290273705088765e-05, + "loss": 0.8872, + "step": 22582 + }, + { + "epoch": 4.817192832764505, + "grad_norm": 0.1790697516737804, + "learning_rate": 4.289519774466321e-05, + "loss": 0.8702, + "step": 22583 + }, + { + "epoch": 4.8174061433447095, + "grad_norm": 0.17745922369175857, + "learning_rate": 4.288765885828019e-05, + "loss": 0.9334, + "step": 22584 + }, + { + "epoch": 4.817619453924914, + "grad_norm": 0.17991046934605948, + "learning_rate": 4.2880120391823975e-05, + "loss": 0.9118, + "step": 22585 + }, + { + "epoch": 4.81783276450512, + "grad_norm": 0.15774682560017636, + "learning_rate": 4.287258234537986e-05, + "loss": 0.9005, + "step": 22586 + }, + { + "epoch": 4.8180460750853245, + "grad_norm": 0.22075580479321955, + "learning_rate": 4.286504471903302e-05, + "loss": 0.898, + "step": 22587 + }, + { + "epoch": 4.818259385665529, + "grad_norm": 0.1717634289794413, + "learning_rate": 4.2857507512868854e-05, + "loss": 0.8681, + "step": 22588 + }, + { + "epoch": 4.818472696245734, + "grad_norm": 0.16874728549628748, + "learning_rate": 4.284997072697262e-05, + "loss": 0.8919, + "step": 22589 + }, + { + "epoch": 4.8186860068259385, + "grad_norm": 0.1921812442898118, + "learning_rate": 4.2842434361429604e-05, + "loss": 0.906, + "step": 22590 + }, + { + "epoch": 4.818899317406143, + "grad_norm": 0.1489663146481419, + "learning_rate": 4.2834898416325055e-05, + "loss": 0.8932, + "step": 22591 + }, + { + "epoch": 4.819112627986348, + "grad_norm": 0.18857262225611454, + "learning_rate": 4.282736289174423e-05, + "loss": 0.9198, + "step": 22592 + }, + { + "epoch": 4.819325938566553, + "grad_norm": 0.14566059543401713, + "learning_rate": 4.281982778777245e-05, + "loss": 0.8596, + "step": 22593 + }, + { + "epoch": 4.819539249146757, + "grad_norm": 0.18882444833891107, + "learning_rate": 4.2812293104494956e-05, + "loss": 0.8793, + "step": 22594 + }, + { + "epoch": 4.819752559726963, + "grad_norm": 0.16110321863862784, + "learning_rate": 4.2804758841996976e-05, + "loss": 0.8941, + "step": 22595 + }, + { + "epoch": 4.8199658703071675, + "grad_norm": 0.1590351435511413, + "learning_rate": 4.279722500036374e-05, + "loss": 0.8813, + "step": 22596 + }, + { + "epoch": 4.820179180887372, + "grad_norm": 0.17655066456920981, + "learning_rate": 4.27896915796806e-05, + "loss": 0.8817, + "step": 22597 + }, + { + "epoch": 4.820392491467577, + "grad_norm": 0.13886095150968805, + "learning_rate": 4.278215858003269e-05, + "loss": 0.8828, + "step": 22598 + }, + { + "epoch": 4.8206058020477816, + "grad_norm": 0.1601279042606362, + "learning_rate": 4.277462600150525e-05, + "loss": 0.9234, + "step": 22599 + }, + { + "epoch": 4.820819112627986, + "grad_norm": 0.1480405591498536, + "learning_rate": 4.276709384418358e-05, + "loss": 0.9126, + "step": 22600 + }, + { + "epoch": 4.821032423208191, + "grad_norm": 0.13287629815806987, + "learning_rate": 4.2759562108152866e-05, + "loss": 0.9017, + "step": 22601 + }, + { + "epoch": 4.821245733788396, + "grad_norm": 0.14029141320900065, + "learning_rate": 4.275203079349833e-05, + "loss": 0.9046, + "step": 22602 + }, + { + "epoch": 4.8214590443686, + "grad_norm": 0.1414439820466827, + "learning_rate": 4.274449990030521e-05, + "loss": 0.8882, + "step": 22603 + }, + { + "epoch": 4.821672354948806, + "grad_norm": 0.1415501182063103, + "learning_rate": 4.273696942865865e-05, + "loss": 0.894, + "step": 22604 + }, + { + "epoch": 4.8218856655290105, + "grad_norm": 0.1502860021054323, + "learning_rate": 4.272943937864394e-05, + "loss": 0.8855, + "step": 22605 + }, + { + "epoch": 4.822098976109215, + "grad_norm": 0.13776598043213467, + "learning_rate": 4.272190975034626e-05, + "loss": 0.8865, + "step": 22606 + }, + { + "epoch": 4.82231228668942, + "grad_norm": 0.13880809885685264, + "learning_rate": 4.271438054385079e-05, + "loss": 0.898, + "step": 22607 + }, + { + "epoch": 4.822525597269625, + "grad_norm": 0.1301168047732928, + "learning_rate": 4.2706851759242734e-05, + "loss": 0.8884, + "step": 22608 + }, + { + "epoch": 4.822738907849829, + "grad_norm": 0.1532063426247866, + "learning_rate": 4.2699323396607276e-05, + "loss": 0.9129, + "step": 22609 + }, + { + "epoch": 4.822952218430034, + "grad_norm": 0.14114308216418353, + "learning_rate": 4.26917954560296e-05, + "loss": 0.9183, + "step": 22610 + }, + { + "epoch": 4.823165529010239, + "grad_norm": 0.16131738293788492, + "learning_rate": 4.268426793759486e-05, + "loss": 0.9074, + "step": 22611 + }, + { + "epoch": 4.823378839590443, + "grad_norm": 0.1693313450288614, + "learning_rate": 4.267674084138828e-05, + "loss": 0.8773, + "step": 22612 + }, + { + "epoch": 4.823592150170649, + "grad_norm": 0.1524971218935779, + "learning_rate": 4.266921416749501e-05, + "loss": 0.8777, + "step": 22613 + }, + { + "epoch": 4.823805460750854, + "grad_norm": 0.179410412197679, + "learning_rate": 4.266168791600021e-05, + "loss": 0.8791, + "step": 22614 + }, + { + "epoch": 4.824018771331058, + "grad_norm": 0.19818679848494564, + "learning_rate": 4.265416208698904e-05, + "loss": 0.9245, + "step": 22615 + }, + { + "epoch": 4.824232081911263, + "grad_norm": 0.1530460432124704, + "learning_rate": 4.2646636680546613e-05, + "loss": 0.9159, + "step": 22616 + }, + { + "epoch": 4.824445392491468, + "grad_norm": 0.17184581371441526, + "learning_rate": 4.263911169675817e-05, + "loss": 0.9105, + "step": 22617 + }, + { + "epoch": 4.824658703071672, + "grad_norm": 0.17701595813437387, + "learning_rate": 4.26315871357088e-05, + "loss": 0.8599, + "step": 22618 + }, + { + "epoch": 4.824872013651877, + "grad_norm": 0.17313076497720228, + "learning_rate": 4.262406299748364e-05, + "loss": 0.903, + "step": 22619 + }, + { + "epoch": 4.825085324232082, + "grad_norm": 0.1905808584346863, + "learning_rate": 4.2616539282167845e-05, + "loss": 0.894, + "step": 22620 + }, + { + "epoch": 4.825298634812286, + "grad_norm": 0.1543564686237655, + "learning_rate": 4.260901598984653e-05, + "loss": 0.9029, + "step": 22621 + }, + { + "epoch": 4.825511945392492, + "grad_norm": 0.1496077656955835, + "learning_rate": 4.260149312060483e-05, + "loss": 0.8881, + "step": 22622 + }, + { + "epoch": 4.825725255972696, + "grad_norm": 0.15798054019281738, + "learning_rate": 4.2593970674527834e-05, + "loss": 0.8994, + "step": 22623 + }, + { + "epoch": 4.825938566552901, + "grad_norm": 0.1453592051154782, + "learning_rate": 4.258644865170073e-05, + "loss": 0.8926, + "step": 22624 + }, + { + "epoch": 4.826151877133106, + "grad_norm": 0.15888141412733583, + "learning_rate": 4.2578927052208584e-05, + "loss": 0.9377, + "step": 22625 + }, + { + "epoch": 4.826365187713311, + "grad_norm": 0.15274470653230626, + "learning_rate": 4.25714058761365e-05, + "loss": 0.8824, + "step": 22626 + }, + { + "epoch": 4.826578498293515, + "grad_norm": 0.12706591103955583, + "learning_rate": 4.2563885123569584e-05, + "loss": 0.8988, + "step": 22627 + }, + { + "epoch": 4.82679180887372, + "grad_norm": 0.1820469391168311, + "learning_rate": 4.2556364794592913e-05, + "loss": 0.8727, + "step": 22628 + }, + { + "epoch": 4.827005119453925, + "grad_norm": 0.17256900482973547, + "learning_rate": 4.254884488929165e-05, + "loss": 0.8699, + "step": 22629 + }, + { + "epoch": 4.827218430034129, + "grad_norm": 0.14509130126305322, + "learning_rate": 4.254132540775082e-05, + "loss": 0.9329, + "step": 22630 + }, + { + "epoch": 4.827431740614334, + "grad_norm": 0.16479314175142207, + "learning_rate": 4.253380635005552e-05, + "loss": 0.9245, + "step": 22631 + }, + { + "epoch": 4.827645051194539, + "grad_norm": 0.12861603112218378, + "learning_rate": 4.252628771629084e-05, + "loss": 0.8877, + "step": 22632 + }, + { + "epoch": 4.827858361774744, + "grad_norm": 0.18670412616616583, + "learning_rate": 4.251876950654184e-05, + "loss": 0.8708, + "step": 22633 + }, + { + "epoch": 4.828071672354949, + "grad_norm": 0.14501374773881925, + "learning_rate": 4.25112517208936e-05, + "loss": 0.8968, + "step": 22634 + }, + { + "epoch": 4.828284982935154, + "grad_norm": 0.15533130692717728, + "learning_rate": 4.2503734359431144e-05, + "loss": 0.8937, + "step": 22635 + }, + { + "epoch": 4.828498293515358, + "grad_norm": 0.16391064973611294, + "learning_rate": 4.2496217422239597e-05, + "loss": 0.8936, + "step": 22636 + }, + { + "epoch": 4.828711604095563, + "grad_norm": 0.1757989395361901, + "learning_rate": 4.248870090940397e-05, + "loss": 0.9021, + "step": 22637 + }, + { + "epoch": 4.828924914675768, + "grad_norm": 0.1502212803063907, + "learning_rate": 4.2481184821009336e-05, + "loss": 0.9063, + "step": 22638 + }, + { + "epoch": 4.8291382252559725, + "grad_norm": 0.1516303720366716, + "learning_rate": 4.247366915714073e-05, + "loss": 0.9076, + "step": 22639 + }, + { + "epoch": 4.829351535836177, + "grad_norm": 0.1365700984071097, + "learning_rate": 4.246615391788314e-05, + "loss": 0.9082, + "step": 22640 + }, + { + "epoch": 4.829564846416382, + "grad_norm": 0.1665648715726086, + "learning_rate": 4.245863910332169e-05, + "loss": 0.8851, + "step": 22641 + }, + { + "epoch": 4.829778156996587, + "grad_norm": 0.15263723792741546, + "learning_rate": 4.2451124713541405e-05, + "loss": 0.8574, + "step": 22642 + }, + { + "epoch": 4.829991467576792, + "grad_norm": 0.16210122943149843, + "learning_rate": 4.24436107486272e-05, + "loss": 0.9079, + "step": 22643 + }, + { + "epoch": 4.830204778156997, + "grad_norm": 0.16681746186526333, + "learning_rate": 4.243609720866422e-05, + "loss": 0.9156, + "step": 22644 + }, + { + "epoch": 4.8304180887372015, + "grad_norm": 0.15249515555874543, + "learning_rate": 4.242858409373742e-05, + "loss": 0.8667, + "step": 22645 + }, + { + "epoch": 4.830631399317406, + "grad_norm": 0.16485148237989047, + "learning_rate": 4.2421071403931825e-05, + "loss": 0.8749, + "step": 22646 + }, + { + "epoch": 4.830844709897611, + "grad_norm": 0.16630765884972623, + "learning_rate": 4.241355913933242e-05, + "loss": 0.8929, + "step": 22647 + }, + { + "epoch": 4.8310580204778155, + "grad_norm": 0.1599161429371025, + "learning_rate": 4.240604730002424e-05, + "loss": 0.9104, + "step": 22648 + }, + { + "epoch": 4.83127133105802, + "grad_norm": 0.18361860179334438, + "learning_rate": 4.239853588609228e-05, + "loss": 0.9178, + "step": 22649 + }, + { + "epoch": 4.831484641638225, + "grad_norm": 0.15329751242384718, + "learning_rate": 4.239102489762151e-05, + "loss": 0.9197, + "step": 22650 + }, + { + "epoch": 4.8316979522184305, + "grad_norm": 0.17053045313069393, + "learning_rate": 4.238351433469692e-05, + "loss": 0.9002, + "step": 22651 + }, + { + "epoch": 4.831911262798635, + "grad_norm": 0.15814224106265662, + "learning_rate": 4.237600419740348e-05, + "loss": 0.9105, + "step": 22652 + }, + { + "epoch": 4.83212457337884, + "grad_norm": 0.1565425900147492, + "learning_rate": 4.236849448582625e-05, + "loss": 0.9022, + "step": 22653 + }, + { + "epoch": 4.8323378839590445, + "grad_norm": 0.129515732594814, + "learning_rate": 4.23609852000501e-05, + "loss": 0.891, + "step": 22654 + }, + { + "epoch": 4.832551194539249, + "grad_norm": 0.15353185500895894, + "learning_rate": 4.235347634016e-05, + "loss": 0.8824, + "step": 22655 + }, + { + "epoch": 4.832764505119454, + "grad_norm": 0.13502667099098933, + "learning_rate": 4.234596790624098e-05, + "loss": 0.9149, + "step": 22656 + }, + { + "epoch": 4.832977815699659, + "grad_norm": 0.14332946851091893, + "learning_rate": 4.233845989837797e-05, + "loss": 0.8883, + "step": 22657 + }, + { + "epoch": 4.833191126279863, + "grad_norm": 0.14928133774269617, + "learning_rate": 4.233095231665592e-05, + "loss": 0.92, + "step": 22658 + }, + { + "epoch": 4.833404436860068, + "grad_norm": 0.1606998225061143, + "learning_rate": 4.2323445161159746e-05, + "loss": 0.9109, + "step": 22659 + }, + { + "epoch": 4.8336177474402735, + "grad_norm": 0.13566892997276508, + "learning_rate": 4.231593843197447e-05, + "loss": 0.876, + "step": 22660 + }, + { + "epoch": 4.833831058020478, + "grad_norm": 0.1456674983592189, + "learning_rate": 4.2308432129184956e-05, + "loss": 0.8858, + "step": 22661 + }, + { + "epoch": 4.834044368600683, + "grad_norm": 0.13802841913216132, + "learning_rate": 4.230092625287618e-05, + "loss": 0.8799, + "step": 22662 + }, + { + "epoch": 4.834257679180888, + "grad_norm": 0.15589736935732917, + "learning_rate": 4.2293420803133065e-05, + "loss": 0.9266, + "step": 22663 + }, + { + "epoch": 4.834470989761092, + "grad_norm": 0.14708059945734075, + "learning_rate": 4.228591578004052e-05, + "loss": 0.8852, + "step": 22664 + }, + { + "epoch": 4.834684300341297, + "grad_norm": 0.19128279762633704, + "learning_rate": 4.227841118368348e-05, + "loss": 0.9049, + "step": 22665 + }, + { + "epoch": 4.834897610921502, + "grad_norm": 0.1500527568776337, + "learning_rate": 4.2270907014146834e-05, + "loss": 0.8731, + "step": 22666 + }, + { + "epoch": 4.835110921501706, + "grad_norm": 0.17225940045089835, + "learning_rate": 4.226340327151549e-05, + "loss": 0.9006, + "step": 22667 + }, + { + "epoch": 4.835324232081911, + "grad_norm": 0.16476684385246013, + "learning_rate": 4.225589995587442e-05, + "loss": 0.8833, + "step": 22668 + }, + { + "epoch": 4.835537542662116, + "grad_norm": 0.14688528414356283, + "learning_rate": 4.224839706730845e-05, + "loss": 0.882, + "step": 22669 + }, + { + "epoch": 4.83575085324232, + "grad_norm": 0.1594207253391259, + "learning_rate": 4.2240894605902516e-05, + "loss": 0.8913, + "step": 22670 + }, + { + "epoch": 4.835964163822526, + "grad_norm": 0.14640570855067508, + "learning_rate": 4.2233392571741465e-05, + "loss": 0.8988, + "step": 22671 + }, + { + "epoch": 4.836177474402731, + "grad_norm": 0.14461567430626002, + "learning_rate": 4.222589096491023e-05, + "loss": 0.9515, + "step": 22672 + }, + { + "epoch": 4.836390784982935, + "grad_norm": 0.14390982303501595, + "learning_rate": 4.221838978549368e-05, + "loss": 0.9202, + "step": 22673 + }, + { + "epoch": 4.83660409556314, + "grad_norm": 0.15601899090800972, + "learning_rate": 4.22108890335767e-05, + "loss": 0.8844, + "step": 22674 + }, + { + "epoch": 4.836817406143345, + "grad_norm": 0.15339972232690433, + "learning_rate": 4.220338870924413e-05, + "loss": 0.9084, + "step": 22675 + }, + { + "epoch": 4.837030716723549, + "grad_norm": 0.16893560995221818, + "learning_rate": 4.219588881258085e-05, + "loss": 0.9069, + "step": 22676 + }, + { + "epoch": 4.837244027303754, + "grad_norm": 0.18086375030614665, + "learning_rate": 4.218838934367174e-05, + "loss": 0.9308, + "step": 22677 + }, + { + "epoch": 4.837457337883959, + "grad_norm": 0.13440401878103467, + "learning_rate": 4.2180890302601627e-05, + "loss": 0.9264, + "step": 22678 + }, + { + "epoch": 4.837670648464163, + "grad_norm": 0.1498534525101072, + "learning_rate": 4.217339168945535e-05, + "loss": 0.9068, + "step": 22679 + }, + { + "epoch": 4.837883959044369, + "grad_norm": 0.1510137234599898, + "learning_rate": 4.216589350431781e-05, + "loss": 0.9055, + "step": 22680 + }, + { + "epoch": 4.838097269624574, + "grad_norm": 0.1457556446737827, + "learning_rate": 4.215839574727383e-05, + "loss": 0.9309, + "step": 22681 + }, + { + "epoch": 4.838310580204778, + "grad_norm": 0.1426949387971526, + "learning_rate": 4.215089841840823e-05, + "loss": 0.8998, + "step": 22682 + }, + { + "epoch": 4.838523890784983, + "grad_norm": 0.14884746326319975, + "learning_rate": 4.214340151780582e-05, + "loss": 0.8603, + "step": 22683 + }, + { + "epoch": 4.838737201365188, + "grad_norm": 0.1352173234764994, + "learning_rate": 4.21359050455515e-05, + "loss": 0.8945, + "step": 22684 + }, + { + "epoch": 4.838950511945392, + "grad_norm": 0.17485092083586826, + "learning_rate": 4.212840900173004e-05, + "loss": 0.8836, + "step": 22685 + }, + { + "epoch": 4.839163822525597, + "grad_norm": 0.16510168499262431, + "learning_rate": 4.212091338642627e-05, + "loss": 0.9063, + "step": 22686 + }, + { + "epoch": 4.839377133105802, + "grad_norm": 0.13738939650002516, + "learning_rate": 4.211341819972501e-05, + "loss": 0.8783, + "step": 22687 + }, + { + "epoch": 4.839590443686006, + "grad_norm": 0.15071118642846082, + "learning_rate": 4.2105923441711055e-05, + "loss": 0.8837, + "step": 22688 + }, + { + "epoch": 4.839803754266212, + "grad_norm": 0.12415857377951593, + "learning_rate": 4.209842911246922e-05, + "loss": 0.8862, + "step": 22689 + }, + { + "epoch": 4.840017064846417, + "grad_norm": 0.15317912200331107, + "learning_rate": 4.2090935212084286e-05, + "loss": 0.9093, + "step": 22690 + }, + { + "epoch": 4.840230375426621, + "grad_norm": 0.1290622486978509, + "learning_rate": 4.2083441740641025e-05, + "loss": 0.9013, + "step": 22691 + }, + { + "epoch": 4.840443686006826, + "grad_norm": 0.1588399700662575, + "learning_rate": 4.20759486982243e-05, + "loss": 0.9013, + "step": 22692 + }, + { + "epoch": 4.840656996587031, + "grad_norm": 0.13094376411609188, + "learning_rate": 4.206845608491885e-05, + "loss": 0.9079, + "step": 22693 + }, + { + "epoch": 4.840870307167235, + "grad_norm": 0.15085534455363261, + "learning_rate": 4.2060963900809465e-05, + "loss": 0.8955, + "step": 22694 + }, + { + "epoch": 4.84108361774744, + "grad_norm": 0.14407505668979126, + "learning_rate": 4.2053472145980867e-05, + "loss": 0.8965, + "step": 22695 + }, + { + "epoch": 4.841296928327645, + "grad_norm": 0.135712416580157, + "learning_rate": 4.2045980820517904e-05, + "loss": 0.8852, + "step": 22696 + }, + { + "epoch": 4.8415102389078495, + "grad_norm": 0.14572297051614297, + "learning_rate": 4.2038489924505316e-05, + "loss": 0.8763, + "step": 22697 + }, + { + "epoch": 4.841723549488055, + "grad_norm": 0.15559246340112068, + "learning_rate": 4.203099945802788e-05, + "loss": 0.9038, + "step": 22698 + }, + { + "epoch": 4.84193686006826, + "grad_norm": 0.1501238541209305, + "learning_rate": 4.202350942117026e-05, + "loss": 0.8989, + "step": 22699 + }, + { + "epoch": 4.842150170648464, + "grad_norm": 0.15440402928419936, + "learning_rate": 4.2016019814017295e-05, + "loss": 0.9141, + "step": 22700 + }, + { + "epoch": 4.842363481228669, + "grad_norm": 0.16937403043571325, + "learning_rate": 4.2008530636653705e-05, + "loss": 0.9214, + "step": 22701 + }, + { + "epoch": 4.842576791808874, + "grad_norm": 0.1500342514605564, + "learning_rate": 4.2001041889164236e-05, + "loss": 0.8993, + "step": 22702 + }, + { + "epoch": 4.8427901023890785, + "grad_norm": 0.165688763680235, + "learning_rate": 4.199355357163358e-05, + "loss": 0.8991, + "step": 22703 + }, + { + "epoch": 4.843003412969283, + "grad_norm": 0.1694098751083391, + "learning_rate": 4.1986065684146545e-05, + "loss": 0.9067, + "step": 22704 + }, + { + "epoch": 4.843216723549488, + "grad_norm": 0.1394615405688542, + "learning_rate": 4.197857822678781e-05, + "loss": 0.8985, + "step": 22705 + }, + { + "epoch": 4.8434300341296925, + "grad_norm": 0.16286052632574238, + "learning_rate": 4.1971091199642106e-05, + "loss": 0.8933, + "step": 22706 + }, + { + "epoch": 4.843643344709898, + "grad_norm": 0.17418164000152772, + "learning_rate": 4.196360460279411e-05, + "loss": 0.8768, + "step": 22707 + }, + { + "epoch": 4.843856655290102, + "grad_norm": 0.14586018528833025, + "learning_rate": 4.195611843632859e-05, + "loss": 0.893, + "step": 22708 + }, + { + "epoch": 4.8440699658703075, + "grad_norm": 0.1617935379894528, + "learning_rate": 4.194863270033028e-05, + "loss": 0.9018, + "step": 22709 + }, + { + "epoch": 4.844283276450512, + "grad_norm": 0.15274765206199117, + "learning_rate": 4.194114739488379e-05, + "loss": 0.9131, + "step": 22710 + }, + { + "epoch": 4.844496587030717, + "grad_norm": 0.14254991888808233, + "learning_rate": 4.1933662520073836e-05, + "loss": 0.872, + "step": 22711 + }, + { + "epoch": 4.8447098976109215, + "grad_norm": 0.16655157556810782, + "learning_rate": 4.192617807598515e-05, + "loss": 0.8788, + "step": 22712 + }, + { + "epoch": 4.844923208191126, + "grad_norm": 0.126670253172191, + "learning_rate": 4.19186940627024e-05, + "loss": 0.9031, + "step": 22713 + }, + { + "epoch": 4.845136518771331, + "grad_norm": 0.1743149619636995, + "learning_rate": 4.191121048031026e-05, + "loss": 0.8999, + "step": 22714 + }, + { + "epoch": 4.845349829351536, + "grad_norm": 0.15985533855560538, + "learning_rate": 4.19037273288934e-05, + "loss": 0.8951, + "step": 22715 + }, + { + "epoch": 4.84556313993174, + "grad_norm": 0.14086398306397693, + "learning_rate": 4.1896244608536524e-05, + "loss": 0.9003, + "step": 22716 + }, + { + "epoch": 4.845776450511945, + "grad_norm": 0.18079376156596916, + "learning_rate": 4.18887623193243e-05, + "loss": 0.9011, + "step": 22717 + }, + { + "epoch": 4.8459897610921505, + "grad_norm": 0.11996695339277375, + "learning_rate": 4.188128046134135e-05, + "loss": 0.907, + "step": 22718 + }, + { + "epoch": 4.846203071672355, + "grad_norm": 0.17525946132669099, + "learning_rate": 4.187379903467233e-05, + "loss": 0.9302, + "step": 22719 + }, + { + "epoch": 4.84641638225256, + "grad_norm": 0.12765037278318098, + "learning_rate": 4.186631803940198e-05, + "loss": 0.9076, + "step": 22720 + }, + { + "epoch": 4.846629692832765, + "grad_norm": 0.15146506977857824, + "learning_rate": 4.185883747561485e-05, + "loss": 0.9263, + "step": 22721 + }, + { + "epoch": 4.846843003412969, + "grad_norm": 0.13083525140408545, + "learning_rate": 4.1851357343395606e-05, + "loss": 0.8811, + "step": 22722 + }, + { + "epoch": 4.847056313993174, + "grad_norm": 0.15081610431272877, + "learning_rate": 4.1843877642828874e-05, + "loss": 0.9224, + "step": 22723 + }, + { + "epoch": 4.847269624573379, + "grad_norm": 0.15466379155561397, + "learning_rate": 4.183639837399934e-05, + "loss": 0.9157, + "step": 22724 + }, + { + "epoch": 4.847482935153583, + "grad_norm": 0.1509052101926848, + "learning_rate": 4.182891953699161e-05, + "loss": 0.8971, + "step": 22725 + }, + { + "epoch": 4.847696245733788, + "grad_norm": 0.16323370033512302, + "learning_rate": 4.182144113189027e-05, + "loss": 0.885, + "step": 22726 + }, + { + "epoch": 4.847909556313994, + "grad_norm": 0.12808458295947117, + "learning_rate": 4.181396315877995e-05, + "loss": 0.9236, + "step": 22727 + }, + { + "epoch": 4.848122866894198, + "grad_norm": 0.16329852077513773, + "learning_rate": 4.1806485617745315e-05, + "loss": 0.8863, + "step": 22728 + }, + { + "epoch": 4.848336177474403, + "grad_norm": 0.1765986458738388, + "learning_rate": 4.179900850887093e-05, + "loss": 0.8836, + "step": 22729 + }, + { + "epoch": 4.848549488054608, + "grad_norm": 0.14054120412239543, + "learning_rate": 4.1791531832241414e-05, + "loss": 0.9361, + "step": 22730 + }, + { + "epoch": 4.848762798634812, + "grad_norm": 0.16762110072112638, + "learning_rate": 4.178405558794135e-05, + "loss": 0.904, + "step": 22731 + }, + { + "epoch": 4.848976109215017, + "grad_norm": 0.19491009087489486, + "learning_rate": 4.1776579776055345e-05, + "loss": 0.9032, + "step": 22732 + }, + { + "epoch": 4.849189419795222, + "grad_norm": 0.15879200180218295, + "learning_rate": 4.1769104396667986e-05, + "loss": 0.8927, + "step": 22733 + }, + { + "epoch": 4.849402730375426, + "grad_norm": 0.14628491199603932, + "learning_rate": 4.176162944986384e-05, + "loss": 0.8934, + "step": 22734 + }, + { + "epoch": 4.849616040955631, + "grad_norm": 0.16580206276889795, + "learning_rate": 4.175415493572748e-05, + "loss": 0.8709, + "step": 22735 + }, + { + "epoch": 4.849829351535837, + "grad_norm": 0.13086091943203498, + "learning_rate": 4.1746680854343536e-05, + "loss": 0.8932, + "step": 22736 + }, + { + "epoch": 4.850042662116041, + "grad_norm": 0.1750385490119855, + "learning_rate": 4.173920720579654e-05, + "loss": 0.9247, + "step": 22737 + }, + { + "epoch": 4.850255972696246, + "grad_norm": 0.15799561043418092, + "learning_rate": 4.173173399017104e-05, + "loss": 0.8926, + "step": 22738 + }, + { + "epoch": 4.850469283276451, + "grad_norm": 0.1572920303074149, + "learning_rate": 4.17242612075516e-05, + "loss": 0.8894, + "step": 22739 + }, + { + "epoch": 4.850682593856655, + "grad_norm": 0.14005070988650312, + "learning_rate": 4.1716788858022814e-05, + "loss": 0.8852, + "step": 22740 + }, + { + "epoch": 4.85089590443686, + "grad_norm": 0.15714842520870403, + "learning_rate": 4.170931694166921e-05, + "loss": 0.8901, + "step": 22741 + }, + { + "epoch": 4.851109215017065, + "grad_norm": 0.13261148513058618, + "learning_rate": 4.170184545857533e-05, + "loss": 0.9192, + "step": 22742 + }, + { + "epoch": 4.851322525597269, + "grad_norm": 0.15150427793670232, + "learning_rate": 4.1694374408825704e-05, + "loss": 0.9016, + "step": 22743 + }, + { + "epoch": 4.851535836177474, + "grad_norm": 0.12920491874356205, + "learning_rate": 4.168690379250488e-05, + "loss": 0.8893, + "step": 22744 + }, + { + "epoch": 4.85174914675768, + "grad_norm": 0.1253426149470411, + "learning_rate": 4.167943360969739e-05, + "loss": 0.8985, + "step": 22745 + }, + { + "epoch": 4.851962457337884, + "grad_norm": 0.14334450741696747, + "learning_rate": 4.167196386048774e-05, + "loss": 0.8993, + "step": 22746 + }, + { + "epoch": 4.852175767918089, + "grad_norm": 0.1399129843377973, + "learning_rate": 4.166449454496045e-05, + "loss": 0.8854, + "step": 22747 + }, + { + "epoch": 4.852389078498294, + "grad_norm": 0.15594313025601078, + "learning_rate": 4.165702566320006e-05, + "loss": 0.8961, + "step": 22748 + }, + { + "epoch": 4.852602389078498, + "grad_norm": 0.2005860348529684, + "learning_rate": 4.164955721529108e-05, + "loss": 0.9262, + "step": 22749 + }, + { + "epoch": 4.852815699658703, + "grad_norm": 0.1838744323084108, + "learning_rate": 4.164208920131801e-05, + "loss": 0.9034, + "step": 22750 + }, + { + "epoch": 4.853029010238908, + "grad_norm": 0.16231180454813507, + "learning_rate": 4.163462162136531e-05, + "loss": 0.9098, + "step": 22751 + }, + { + "epoch": 4.853242320819112, + "grad_norm": 0.179114174722, + "learning_rate": 4.162715447551755e-05, + "loss": 0.9163, + "step": 22752 + }, + { + "epoch": 4.853455631399317, + "grad_norm": 0.16209047462772624, + "learning_rate": 4.1619687763859165e-05, + "loss": 0.9052, + "step": 22753 + }, + { + "epoch": 4.853668941979522, + "grad_norm": 0.15353283107748036, + "learning_rate": 4.161222148647467e-05, + "loss": 0.8888, + "step": 22754 + }, + { + "epoch": 4.8538822525597265, + "grad_norm": 0.17209399587907778, + "learning_rate": 4.160475564344854e-05, + "loss": 0.9413, + "step": 22755 + }, + { + "epoch": 4.854095563139932, + "grad_norm": 0.18504209868473107, + "learning_rate": 4.1597290234865236e-05, + "loss": 0.8684, + "step": 22756 + }, + { + "epoch": 4.854308873720137, + "grad_norm": 0.15380299022676255, + "learning_rate": 4.158982526080923e-05, + "loss": 0.8886, + "step": 22757 + }, + { + "epoch": 4.854522184300341, + "grad_norm": 0.1850744730013322, + "learning_rate": 4.158236072136502e-05, + "loss": 0.9362, + "step": 22758 + }, + { + "epoch": 4.854735494880546, + "grad_norm": 0.14196081635595853, + "learning_rate": 4.1574896616617e-05, + "loss": 0.9105, + "step": 22759 + }, + { + "epoch": 4.854948805460751, + "grad_norm": 0.17809051671640871, + "learning_rate": 4.1567432946649706e-05, + "loss": 0.8922, + "step": 22760 + }, + { + "epoch": 4.8551621160409555, + "grad_norm": 0.1457147048316982, + "learning_rate": 4.155996971154755e-05, + "loss": 0.8937, + "step": 22761 + }, + { + "epoch": 4.85537542662116, + "grad_norm": 0.16350564946715807, + "learning_rate": 4.1552506911394985e-05, + "loss": 0.893, + "step": 22762 + }, + { + "epoch": 4.855588737201365, + "grad_norm": 0.1382280944897633, + "learning_rate": 4.1545044546276427e-05, + "loss": 0.9067, + "step": 22763 + }, + { + "epoch": 4.8558020477815695, + "grad_norm": 0.15716942881151322, + "learning_rate": 4.153758261627636e-05, + "loss": 0.8993, + "step": 22764 + }, + { + "epoch": 4.856015358361775, + "grad_norm": 0.13546046355524446, + "learning_rate": 4.153012112147923e-05, + "loss": 0.8764, + "step": 22765 + }, + { + "epoch": 4.85622866894198, + "grad_norm": 0.1556564763329038, + "learning_rate": 4.1522660061969356e-05, + "loss": 0.89, + "step": 22766 + }, + { + "epoch": 4.8564419795221845, + "grad_norm": 0.15083841043714943, + "learning_rate": 4.151519943783127e-05, + "loss": 0.8968, + "step": 22767 + }, + { + "epoch": 4.856655290102389, + "grad_norm": 0.1596338161666097, + "learning_rate": 4.150773924914934e-05, + "loss": 0.8946, + "step": 22768 + }, + { + "epoch": 4.856868600682594, + "grad_norm": 0.16638475920075496, + "learning_rate": 4.1500279496007995e-05, + "loss": 0.9138, + "step": 22769 + }, + { + "epoch": 4.8570819112627985, + "grad_norm": 0.14340819022773527, + "learning_rate": 4.149282017849164e-05, + "loss": 0.8969, + "step": 22770 + }, + { + "epoch": 4.857295221843003, + "grad_norm": 0.15456542026767886, + "learning_rate": 4.148536129668463e-05, + "loss": 0.87, + "step": 22771 + }, + { + "epoch": 4.857508532423208, + "grad_norm": 0.1667629102141958, + "learning_rate": 4.147790285067144e-05, + "loss": 0.9215, + "step": 22772 + }, + { + "epoch": 4.857721843003413, + "grad_norm": 0.15630612336453406, + "learning_rate": 4.147044484053643e-05, + "loss": 0.889, + "step": 22773 + }, + { + "epoch": 4.857935153583618, + "grad_norm": 0.155887748706732, + "learning_rate": 4.1462987266363995e-05, + "loss": 0.9285, + "step": 22774 + }, + { + "epoch": 4.858148464163823, + "grad_norm": 0.2062667193645358, + "learning_rate": 4.145553012823845e-05, + "loss": 0.9172, + "step": 22775 + }, + { + "epoch": 4.8583617747440275, + "grad_norm": 0.13784589884281526, + "learning_rate": 4.1448073426244306e-05, + "loss": 0.9029, + "step": 22776 + }, + { + "epoch": 4.858575085324232, + "grad_norm": 0.18205326177604478, + "learning_rate": 4.144061716046583e-05, + "loss": 0.8912, + "step": 22777 + }, + { + "epoch": 4.858788395904437, + "grad_norm": 0.18338047222926906, + "learning_rate": 4.143316133098739e-05, + "loss": 0.9112, + "step": 22778 + }, + { + "epoch": 4.859001706484642, + "grad_norm": 0.1394362142349389, + "learning_rate": 4.1425705937893405e-05, + "loss": 0.9284, + "step": 22779 + }, + { + "epoch": 4.859215017064846, + "grad_norm": 0.16817445783035517, + "learning_rate": 4.1418250981268214e-05, + "loss": 0.8865, + "step": 22780 + }, + { + "epoch": 4.859428327645051, + "grad_norm": 0.18585351023174052, + "learning_rate": 4.1410796461196154e-05, + "loss": 0.9091, + "step": 22781 + }, + { + "epoch": 4.859641638225256, + "grad_norm": 0.13057131797336569, + "learning_rate": 4.140334237776158e-05, + "loss": 0.8925, + "step": 22782 + }, + { + "epoch": 4.859854948805461, + "grad_norm": 0.23971123505317166, + "learning_rate": 4.139588873104881e-05, + "loss": 0.9024, + "step": 22783 + }, + { + "epoch": 4.860068259385666, + "grad_norm": 0.18367115832941913, + "learning_rate": 4.138843552114224e-05, + "loss": 0.9248, + "step": 22784 + }, + { + "epoch": 4.860281569965871, + "grad_norm": 0.15522323951405836, + "learning_rate": 4.138098274812617e-05, + "loss": 0.9108, + "step": 22785 + }, + { + "epoch": 4.860494880546075, + "grad_norm": 0.19889837425366264, + "learning_rate": 4.137353041208492e-05, + "loss": 0.8919, + "step": 22786 + }, + { + "epoch": 4.86070819112628, + "grad_norm": 0.16516308350006922, + "learning_rate": 4.1366078513102835e-05, + "loss": 0.9338, + "step": 22787 + }, + { + "epoch": 4.860921501706485, + "grad_norm": 0.13083011593731883, + "learning_rate": 4.135862705126422e-05, + "loss": 0.9156, + "step": 22788 + }, + { + "epoch": 4.861134812286689, + "grad_norm": 0.14943603978276315, + "learning_rate": 4.1351176026653395e-05, + "loss": 0.8888, + "step": 22789 + }, + { + "epoch": 4.861348122866894, + "grad_norm": 0.17442633581024852, + "learning_rate": 4.134372543935461e-05, + "loss": 0.9117, + "step": 22790 + }, + { + "epoch": 4.861561433447099, + "grad_norm": 0.13430930932600355, + "learning_rate": 4.1336275289452284e-05, + "loss": 0.8748, + "step": 22791 + }, + { + "epoch": 4.861774744027304, + "grad_norm": 0.18869549788985518, + "learning_rate": 4.132882557703063e-05, + "loss": 0.8734, + "step": 22792 + }, + { + "epoch": 4.861988054607508, + "grad_norm": 0.1327488308475368, + "learning_rate": 4.132137630217397e-05, + "loss": 0.867, + "step": 22793 + }, + { + "epoch": 4.862201365187714, + "grad_norm": 0.19383127640560505, + "learning_rate": 4.131392746496659e-05, + "loss": 0.904, + "step": 22794 + }, + { + "epoch": 4.862414675767918, + "grad_norm": 0.1480802989397744, + "learning_rate": 4.130647906549275e-05, + "loss": 0.9262, + "step": 22795 + }, + { + "epoch": 4.862627986348123, + "grad_norm": 0.17336839505298063, + "learning_rate": 4.129903110383676e-05, + "loss": 0.9217, + "step": 22796 + }, + { + "epoch": 4.862841296928328, + "grad_norm": 0.15567359149192742, + "learning_rate": 4.12915835800829e-05, + "loss": 0.8996, + "step": 22797 + }, + { + "epoch": 4.863054607508532, + "grad_norm": 0.14131004182725468, + "learning_rate": 4.1284136494315416e-05, + "loss": 0.8836, + "step": 22798 + }, + { + "epoch": 4.863267918088737, + "grad_norm": 0.14093722018415716, + "learning_rate": 4.1276689846618586e-05, + "loss": 0.9042, + "step": 22799 + }, + { + "epoch": 4.863481228668942, + "grad_norm": 0.14069053158133424, + "learning_rate": 4.126924363707666e-05, + "loss": 0.9099, + "step": 22800 + }, + { + "epoch": 4.863694539249146, + "grad_norm": 0.14266107231396635, + "learning_rate": 4.126179786577389e-05, + "loss": 0.931, + "step": 22801 + }, + { + "epoch": 4.863907849829351, + "grad_norm": 0.15094599581078574, + "learning_rate": 4.12543525327945e-05, + "loss": 0.9025, + "step": 22802 + }, + { + "epoch": 4.864121160409557, + "grad_norm": 0.1391078536607726, + "learning_rate": 4.12469076382228e-05, + "loss": 0.8781, + "step": 22803 + }, + { + "epoch": 4.864334470989761, + "grad_norm": 0.14926004631799172, + "learning_rate": 4.123946318214298e-05, + "loss": 0.8991, + "step": 22804 + }, + { + "epoch": 4.864547781569966, + "grad_norm": 0.1356134963141615, + "learning_rate": 4.123201916463929e-05, + "loss": 0.9002, + "step": 22805 + }, + { + "epoch": 4.864761092150171, + "grad_norm": 0.1613660394051271, + "learning_rate": 4.122457558579596e-05, + "loss": 0.9017, + "step": 22806 + }, + { + "epoch": 4.864974402730375, + "grad_norm": 0.14627521207172506, + "learning_rate": 4.121713244569717e-05, + "loss": 0.903, + "step": 22807 + }, + { + "epoch": 4.86518771331058, + "grad_norm": 0.13491312342313289, + "learning_rate": 4.120968974442722e-05, + "loss": 0.8985, + "step": 22808 + }, + { + "epoch": 4.865401023890785, + "grad_norm": 0.14975768268265743, + "learning_rate": 4.1202247482070274e-05, + "loss": 0.8739, + "step": 22809 + }, + { + "epoch": 4.8656143344709895, + "grad_norm": 0.15106886842337372, + "learning_rate": 4.1194805658710555e-05, + "loss": 0.8732, + "step": 22810 + }, + { + "epoch": 4.865827645051194, + "grad_norm": 0.1455535498953201, + "learning_rate": 4.1187364274432264e-05, + "loss": 0.8814, + "step": 22811 + }, + { + "epoch": 4.8660409556314, + "grad_norm": 0.20580013547652493, + "learning_rate": 4.117992332931959e-05, + "loss": 0.9097, + "step": 22812 + }, + { + "epoch": 4.866254266211604, + "grad_norm": 0.17813723254800023, + "learning_rate": 4.1172482823456744e-05, + "loss": 0.9537, + "step": 22813 + }, + { + "epoch": 4.866467576791809, + "grad_norm": 0.1539050957669367, + "learning_rate": 4.116504275692788e-05, + "loss": 0.8801, + "step": 22814 + }, + { + "epoch": 4.866680887372014, + "grad_norm": 0.1625333954857142, + "learning_rate": 4.1157603129817234e-05, + "loss": 0.936, + "step": 22815 + }, + { + "epoch": 4.8668941979522184, + "grad_norm": 0.1557188403865482, + "learning_rate": 4.115016394220897e-05, + "loss": 0.9168, + "step": 22816 + }, + { + "epoch": 4.867107508532423, + "grad_norm": 0.14684798073122443, + "learning_rate": 4.1142725194187246e-05, + "loss": 0.8887, + "step": 22817 + }, + { + "epoch": 4.867320819112628, + "grad_norm": 0.1868292553150555, + "learning_rate": 4.1135286885836246e-05, + "loss": 0.9252, + "step": 22818 + }, + { + "epoch": 4.8675341296928325, + "grad_norm": 0.14277026295657166, + "learning_rate": 4.112784901724009e-05, + "loss": 0.9222, + "step": 22819 + }, + { + "epoch": 4.867747440273037, + "grad_norm": 0.19274995508061402, + "learning_rate": 4.1120411588483015e-05, + "loss": 0.8633, + "step": 22820 + }, + { + "epoch": 4.867960750853243, + "grad_norm": 0.14643847898409593, + "learning_rate": 4.111297459964917e-05, + "loss": 0.8726, + "step": 22821 + }, + { + "epoch": 4.868174061433447, + "grad_norm": 0.14379314770870658, + "learning_rate": 4.11055380508226e-05, + "loss": 0.9085, + "step": 22822 + }, + { + "epoch": 4.868387372013652, + "grad_norm": 0.18796887548681634, + "learning_rate": 4.1098101942087555e-05, + "loss": 0.914, + "step": 22823 + }, + { + "epoch": 4.868600682593857, + "grad_norm": 0.14298007720028713, + "learning_rate": 4.1090666273528145e-05, + "loss": 0.8949, + "step": 22824 + }, + { + "epoch": 4.8688139931740615, + "grad_norm": 0.1498944062968562, + "learning_rate": 4.1083231045228495e-05, + "loss": 0.8812, + "step": 22825 + }, + { + "epoch": 4.869027303754266, + "grad_norm": 0.1646973088823419, + "learning_rate": 4.1075796257272725e-05, + "loss": 0.9207, + "step": 22826 + }, + { + "epoch": 4.869240614334471, + "grad_norm": 0.13589136364470308, + "learning_rate": 4.1068361909744984e-05, + "loss": 0.8856, + "step": 22827 + }, + { + "epoch": 4.8694539249146755, + "grad_norm": 0.18938192394918774, + "learning_rate": 4.10609280027294e-05, + "loss": 0.9295, + "step": 22828 + }, + { + "epoch": 4.86966723549488, + "grad_norm": 0.15538881344477506, + "learning_rate": 4.105349453631006e-05, + "loss": 0.9122, + "step": 22829 + }, + { + "epoch": 4.869880546075086, + "grad_norm": 0.1491954764215763, + "learning_rate": 4.104606151057109e-05, + "loss": 0.9131, + "step": 22830 + }, + { + "epoch": 4.8700938566552905, + "grad_norm": 0.2179926319974086, + "learning_rate": 4.1038628925596554e-05, + "loss": 0.9141, + "step": 22831 + }, + { + "epoch": 4.870307167235495, + "grad_norm": 0.16163881067938707, + "learning_rate": 4.1031196781470656e-05, + "loss": 0.9148, + "step": 22832 + }, + { + "epoch": 4.8705204778157, + "grad_norm": 0.14727201293984604, + "learning_rate": 4.102376507827738e-05, + "loss": 0.9185, + "step": 22833 + }, + { + "epoch": 4.8707337883959045, + "grad_norm": 0.18581299263396822, + "learning_rate": 4.101633381610083e-05, + "loss": 0.9079, + "step": 22834 + }, + { + "epoch": 4.870947098976109, + "grad_norm": 0.12658269924170137, + "learning_rate": 4.100890299502515e-05, + "loss": 0.8992, + "step": 22835 + }, + { + "epoch": 4.871160409556314, + "grad_norm": 0.17215873274538795, + "learning_rate": 4.1001472615134395e-05, + "loss": 0.8978, + "step": 22836 + }, + { + "epoch": 4.871373720136519, + "grad_norm": 0.16019247074689827, + "learning_rate": 4.0994042676512634e-05, + "loss": 0.9252, + "step": 22837 + }, + { + "epoch": 4.871587030716723, + "grad_norm": 0.14924603098245473, + "learning_rate": 4.0986613179243895e-05, + "loss": 0.9083, + "step": 22838 + }, + { + "epoch": 4.871800341296928, + "grad_norm": 0.1600485147771787, + "learning_rate": 4.097918412341232e-05, + "loss": 0.8876, + "step": 22839 + }, + { + "epoch": 4.872013651877133, + "grad_norm": 0.1553077595428621, + "learning_rate": 4.097175550910193e-05, + "loss": 0.886, + "step": 22840 + }, + { + "epoch": 4.872226962457338, + "grad_norm": 0.1367785944445578, + "learning_rate": 4.0964327336396794e-05, + "loss": 0.9191, + "step": 22841 + }, + { + "epoch": 4.872440273037543, + "grad_norm": 0.1382775811071053, + "learning_rate": 4.095689960538094e-05, + "loss": 0.9118, + "step": 22842 + }, + { + "epoch": 4.872653583617748, + "grad_norm": 0.14482829402607186, + "learning_rate": 4.094947231613843e-05, + "loss": 0.8883, + "step": 22843 + }, + { + "epoch": 4.872866894197952, + "grad_norm": 0.14814715751002966, + "learning_rate": 4.094204546875329e-05, + "loss": 0.9001, + "step": 22844 + }, + { + "epoch": 4.873080204778157, + "grad_norm": 0.13333917516169455, + "learning_rate": 4.093461906330957e-05, + "loss": 0.9074, + "step": 22845 + }, + { + "epoch": 4.873293515358362, + "grad_norm": 0.14907936007811357, + "learning_rate": 4.092719309989125e-05, + "loss": 0.8793, + "step": 22846 + }, + { + "epoch": 4.873506825938566, + "grad_norm": 0.1424739038867945, + "learning_rate": 4.091976757858245e-05, + "loss": 0.8776, + "step": 22847 + }, + { + "epoch": 4.873720136518771, + "grad_norm": 0.1238518285624289, + "learning_rate": 4.091234249946712e-05, + "loss": 0.8848, + "step": 22848 + }, + { + "epoch": 4.873933447098976, + "grad_norm": 0.1322673756657174, + "learning_rate": 4.090491786262929e-05, + "loss": 0.8773, + "step": 22849 + }, + { + "epoch": 4.874146757679181, + "grad_norm": 0.14889443382916764, + "learning_rate": 4.0897493668152945e-05, + "loss": 0.8983, + "step": 22850 + }, + { + "epoch": 4.874360068259386, + "grad_norm": 0.127230532349249, + "learning_rate": 4.089006991612215e-05, + "loss": 0.902, + "step": 22851 + }, + { + "epoch": 4.874573378839591, + "grad_norm": 0.1632035391132419, + "learning_rate": 4.088264660662088e-05, + "loss": 0.8994, + "step": 22852 + }, + { + "epoch": 4.874786689419795, + "grad_norm": 0.12509842492802825, + "learning_rate": 4.087522373973311e-05, + "loss": 0.8729, + "step": 22853 + }, + { + "epoch": 4.875, + "grad_norm": 0.142545118266984, + "learning_rate": 4.0867801315542844e-05, + "loss": 0.915, + "step": 22854 + }, + { + "epoch": 4.875213310580205, + "grad_norm": 0.12100760712910269, + "learning_rate": 4.086037933413407e-05, + "loss": 0.8832, + "step": 22855 + }, + { + "epoch": 4.875426621160409, + "grad_norm": 0.14330284138880614, + "learning_rate": 4.085295779559075e-05, + "loss": 0.8958, + "step": 22856 + }, + { + "epoch": 4.875639931740614, + "grad_norm": 0.13682749551424728, + "learning_rate": 4.0845536699996876e-05, + "loss": 0.8887, + "step": 22857 + }, + { + "epoch": 4.875853242320819, + "grad_norm": 0.1326345271520374, + "learning_rate": 4.083811604743639e-05, + "loss": 0.8983, + "step": 22858 + }, + { + "epoch": 4.876066552901024, + "grad_norm": 0.13901623439390387, + "learning_rate": 4.083069583799329e-05, + "loss": 0.9137, + "step": 22859 + }, + { + "epoch": 4.876279863481229, + "grad_norm": 0.13951243846982495, + "learning_rate": 4.0823276071751545e-05, + "loss": 0.9343, + "step": 22860 + }, + { + "epoch": 4.876493174061434, + "grad_norm": 0.1836653135510645, + "learning_rate": 4.081585674879509e-05, + "loss": 0.9087, + "step": 22861 + }, + { + "epoch": 4.876706484641638, + "grad_norm": 0.14051465849828265, + "learning_rate": 4.0808437869207865e-05, + "loss": 0.9116, + "step": 22862 + }, + { + "epoch": 4.876919795221843, + "grad_norm": 0.1560249107595986, + "learning_rate": 4.08010194330738e-05, + "loss": 0.8916, + "step": 22863 + }, + { + "epoch": 4.877133105802048, + "grad_norm": 0.15098105635042425, + "learning_rate": 4.079360144047689e-05, + "loss": 0.8629, + "step": 22864 + }, + { + "epoch": 4.877346416382252, + "grad_norm": 0.15165898110086196, + "learning_rate": 4.078618389150104e-05, + "loss": 0.9178, + "step": 22865 + }, + { + "epoch": 4.877559726962457, + "grad_norm": 0.14354593715524705, + "learning_rate": 4.0778766786230175e-05, + "loss": 0.8605, + "step": 22866 + }, + { + "epoch": 4.877773037542662, + "grad_norm": 0.18035253222371184, + "learning_rate": 4.0771350124748225e-05, + "loss": 0.9267, + "step": 22867 + }, + { + "epoch": 4.877986348122867, + "grad_norm": 0.15732976858143063, + "learning_rate": 4.076393390713911e-05, + "loss": 0.9063, + "step": 22868 + }, + { + "epoch": 4.878199658703072, + "grad_norm": 0.1565550163387331, + "learning_rate": 4.0756518133486746e-05, + "loss": 0.8937, + "step": 22869 + }, + { + "epoch": 4.878412969283277, + "grad_norm": 0.15312498619757714, + "learning_rate": 4.0749102803875004e-05, + "loss": 0.894, + "step": 22870 + }, + { + "epoch": 4.878626279863481, + "grad_norm": 0.13218833003764918, + "learning_rate": 4.074168791838785e-05, + "loss": 0.9154, + "step": 22871 + }, + { + "epoch": 4.878839590443686, + "grad_norm": 0.12521871932125553, + "learning_rate": 4.073427347710917e-05, + "loss": 0.8993, + "step": 22872 + }, + { + "epoch": 4.879052901023891, + "grad_norm": 0.15526890333360696, + "learning_rate": 4.072685948012284e-05, + "loss": 0.8837, + "step": 22873 + }, + { + "epoch": 4.8792662116040955, + "grad_norm": 0.13660471652715714, + "learning_rate": 4.0719445927512744e-05, + "loss": 0.8975, + "step": 22874 + }, + { + "epoch": 4.8794795221843, + "grad_norm": 0.1596819053555723, + "learning_rate": 4.071203281936276e-05, + "loss": 0.8879, + "step": 22875 + }, + { + "epoch": 4.879692832764505, + "grad_norm": 0.1423604396552332, + "learning_rate": 4.0704620155756814e-05, + "loss": 0.9341, + "step": 22876 + }, + { + "epoch": 4.8799061433447095, + "grad_norm": 0.14987997506121253, + "learning_rate": 4.069720793677878e-05, + "loss": 0.9356, + "step": 22877 + }, + { + "epoch": 4.880119453924914, + "grad_norm": 0.13844556849540843, + "learning_rate": 4.0689796162512446e-05, + "loss": 0.9013, + "step": 22878 + }, + { + "epoch": 4.88033276450512, + "grad_norm": 0.1331266908099038, + "learning_rate": 4.068238483304175e-05, + "loss": 0.9035, + "step": 22879 + }, + { + "epoch": 4.8805460750853245, + "grad_norm": 0.1399209580299456, + "learning_rate": 4.0674973948450545e-05, + "loss": 0.8931, + "step": 22880 + }, + { + "epoch": 4.880759385665529, + "grad_norm": 0.15599565399819398, + "learning_rate": 4.0667563508822655e-05, + "loss": 0.9003, + "step": 22881 + }, + { + "epoch": 4.880972696245734, + "grad_norm": 0.13761587317655136, + "learning_rate": 4.0660153514241914e-05, + "loss": 0.8756, + "step": 22882 + }, + { + "epoch": 4.8811860068259385, + "grad_norm": 0.1551152798072272, + "learning_rate": 4.065274396479225e-05, + "loss": 0.9, + "step": 22883 + }, + { + "epoch": 4.881399317406143, + "grad_norm": 0.15997691399056235, + "learning_rate": 4.064533486055743e-05, + "loss": 0.935, + "step": 22884 + }, + { + "epoch": 4.881612627986348, + "grad_norm": 0.1485011426257011, + "learning_rate": 4.063792620162131e-05, + "loss": 0.8962, + "step": 22885 + }, + { + "epoch": 4.881825938566553, + "grad_norm": 0.18891175547646366, + "learning_rate": 4.063051798806772e-05, + "loss": 0.9327, + "step": 22886 + }, + { + "epoch": 4.882039249146757, + "grad_norm": 0.146853546345075, + "learning_rate": 4.0623110219980455e-05, + "loss": 0.8847, + "step": 22887 + }, + { + "epoch": 4.882252559726963, + "grad_norm": 0.1717393500738929, + "learning_rate": 4.0615702897443426e-05, + "loss": 0.9055, + "step": 22888 + }, + { + "epoch": 4.8824658703071675, + "grad_norm": 0.16093544387533856, + "learning_rate": 4.060829602054034e-05, + "loss": 0.9114, + "step": 22889 + }, + { + "epoch": 4.882679180887372, + "grad_norm": 0.17633861557092523, + "learning_rate": 4.060088958935502e-05, + "loss": 0.9226, + "step": 22890 + }, + { + "epoch": 4.882892491467577, + "grad_norm": 0.14810679797997592, + "learning_rate": 4.059348360397132e-05, + "loss": 0.9192, + "step": 22891 + }, + { + "epoch": 4.8831058020477816, + "grad_norm": 0.15530921172281037, + "learning_rate": 4.0586078064473025e-05, + "loss": 0.878, + "step": 22892 + }, + { + "epoch": 4.883319112627986, + "grad_norm": 0.1364053184147392, + "learning_rate": 4.057867297094392e-05, + "loss": 0.9066, + "step": 22893 + }, + { + "epoch": 4.883532423208191, + "grad_norm": 0.19221104724219937, + "learning_rate": 4.057126832346775e-05, + "loss": 0.8791, + "step": 22894 + }, + { + "epoch": 4.883745733788396, + "grad_norm": 0.16301203411925416, + "learning_rate": 4.056386412212838e-05, + "loss": 0.9034, + "step": 22895 + }, + { + "epoch": 4.8839590443686, + "grad_norm": 0.17043686567842728, + "learning_rate": 4.0556460367009565e-05, + "loss": 0.8975, + "step": 22896 + }, + { + "epoch": 4.884172354948806, + "grad_norm": 0.14469894651411896, + "learning_rate": 4.054905705819504e-05, + "loss": 0.91, + "step": 22897 + }, + { + "epoch": 4.8843856655290105, + "grad_norm": 0.1453900120178278, + "learning_rate": 4.054165419576862e-05, + "loss": 0.9132, + "step": 22898 + }, + { + "epoch": 4.884598976109215, + "grad_norm": 0.15079671339373818, + "learning_rate": 4.0534251779814034e-05, + "loss": 0.9094, + "step": 22899 + }, + { + "epoch": 4.88481228668942, + "grad_norm": 0.12923553751350494, + "learning_rate": 4.0526849810415056e-05, + "loss": 0.8823, + "step": 22900 + }, + { + "epoch": 4.885025597269625, + "grad_norm": 0.16242037646513469, + "learning_rate": 4.051944828765544e-05, + "loss": 0.8892, + "step": 22901 + }, + { + "epoch": 4.885238907849829, + "grad_norm": 0.12439065128222641, + "learning_rate": 4.05120472116189e-05, + "loss": 0.8948, + "step": 22902 + }, + { + "epoch": 4.885452218430034, + "grad_norm": 0.15068988271314027, + "learning_rate": 4.050464658238923e-05, + "loss": 0.9335, + "step": 22903 + }, + { + "epoch": 4.885665529010239, + "grad_norm": 0.16530386210239223, + "learning_rate": 4.049724640005016e-05, + "loss": 0.927, + "step": 22904 + }, + { + "epoch": 4.885878839590443, + "grad_norm": 0.14385237833861525, + "learning_rate": 4.04898466646854e-05, + "loss": 0.9352, + "step": 22905 + }, + { + "epoch": 4.886092150170649, + "grad_norm": 0.13678284330926166, + "learning_rate": 4.048244737637867e-05, + "loss": 0.8732, + "step": 22906 + }, + { + "epoch": 4.886305460750854, + "grad_norm": 0.18347461562106832, + "learning_rate": 4.047504853521374e-05, + "loss": 0.9283, + "step": 22907 + }, + { + "epoch": 4.886518771331058, + "grad_norm": 0.1410984904220483, + "learning_rate": 4.0467650141274296e-05, + "loss": 0.9012, + "step": 22908 + }, + { + "epoch": 4.886732081911263, + "grad_norm": 0.1498685404550937, + "learning_rate": 4.046025219464406e-05, + "loss": 0.9003, + "step": 22909 + }, + { + "epoch": 4.886945392491468, + "grad_norm": 0.14958875252077822, + "learning_rate": 4.045285469540673e-05, + "loss": 0.8969, + "step": 22910 + }, + { + "epoch": 4.887158703071672, + "grad_norm": 0.1406845497854628, + "learning_rate": 4.044545764364603e-05, + "loss": 0.8858, + "step": 22911 + }, + { + "epoch": 4.887372013651877, + "grad_norm": 0.13814156707523675, + "learning_rate": 4.043806103944563e-05, + "loss": 0.8782, + "step": 22912 + }, + { + "epoch": 4.887585324232082, + "grad_norm": 0.14192957082203286, + "learning_rate": 4.043066488288924e-05, + "loss": 0.936, + "step": 22913 + }, + { + "epoch": 4.887798634812286, + "grad_norm": 0.14093853868440775, + "learning_rate": 4.0423269174060505e-05, + "loss": 0.9209, + "step": 22914 + }, + { + "epoch": 4.888011945392492, + "grad_norm": 0.17307184228994807, + "learning_rate": 4.0415873913043173e-05, + "loss": 0.873, + "step": 22915 + }, + { + "epoch": 4.888225255972696, + "grad_norm": 0.14919129275151072, + "learning_rate": 4.0408479099920906e-05, + "loss": 0.8816, + "step": 22916 + }, + { + "epoch": 4.888438566552901, + "grad_norm": 0.18138728387167655, + "learning_rate": 4.0401084734777356e-05, + "loss": 0.917, + "step": 22917 + }, + { + "epoch": 4.888651877133106, + "grad_norm": 0.1404889684155193, + "learning_rate": 4.039369081769616e-05, + "loss": 0.9092, + "step": 22918 + }, + { + "epoch": 4.888865187713311, + "grad_norm": 0.16078893556542742, + "learning_rate": 4.038629734876106e-05, + "loss": 0.9069, + "step": 22919 + }, + { + "epoch": 4.889078498293515, + "grad_norm": 0.1344066987607558, + "learning_rate": 4.037890432805567e-05, + "loss": 0.8943, + "step": 22920 + }, + { + "epoch": 4.88929180887372, + "grad_norm": 0.1419599055632045, + "learning_rate": 4.0371511755663674e-05, + "loss": 0.8845, + "step": 22921 + }, + { + "epoch": 4.889505119453925, + "grad_norm": 0.14288265548191337, + "learning_rate": 4.0364119631668634e-05, + "loss": 0.898, + "step": 22922 + }, + { + "epoch": 4.889718430034129, + "grad_norm": 0.15739869580452237, + "learning_rate": 4.0356727956154264e-05, + "loss": 0.8747, + "step": 22923 + }, + { + "epoch": 4.889931740614334, + "grad_norm": 0.13191170097613028, + "learning_rate": 4.034933672920419e-05, + "loss": 0.8694, + "step": 22924 + }, + { + "epoch": 4.890145051194539, + "grad_norm": 0.14066914342851272, + "learning_rate": 4.034194595090205e-05, + "loss": 0.8919, + "step": 22925 + }, + { + "epoch": 4.890358361774744, + "grad_norm": 0.1301779806725859, + "learning_rate": 4.033455562133141e-05, + "loss": 0.9101, + "step": 22926 + }, + { + "epoch": 4.890571672354949, + "grad_norm": 0.12948628587580005, + "learning_rate": 4.032716574057598e-05, + "loss": 0.9031, + "step": 22927 + }, + { + "epoch": 4.890784982935154, + "grad_norm": 0.1264667380018706, + "learning_rate": 4.031977630871934e-05, + "loss": 0.9035, + "step": 22928 + }, + { + "epoch": 4.890998293515358, + "grad_norm": 0.12815290455930176, + "learning_rate": 4.031238732584509e-05, + "loss": 0.9028, + "step": 22929 + }, + { + "epoch": 4.891211604095563, + "grad_norm": 0.13175788016335133, + "learning_rate": 4.0304998792036815e-05, + "loss": 0.9171, + "step": 22930 + }, + { + "epoch": 4.891424914675768, + "grad_norm": 0.13112510495063373, + "learning_rate": 4.0297610707378186e-05, + "loss": 0.9016, + "step": 22931 + }, + { + "epoch": 4.8916382252559725, + "grad_norm": 0.13923586727697815, + "learning_rate": 4.029022307195275e-05, + "loss": 0.8799, + "step": 22932 + }, + { + "epoch": 4.891851535836177, + "grad_norm": 0.1414258678387594, + "learning_rate": 4.028283588584414e-05, + "loss": 0.9102, + "step": 22933 + }, + { + "epoch": 4.892064846416382, + "grad_norm": 0.15344947955669166, + "learning_rate": 4.0275449149135856e-05, + "loss": 0.86, + "step": 22934 + }, + { + "epoch": 4.892278156996587, + "grad_norm": 0.1583976177263719, + "learning_rate": 4.026806286191155e-05, + "loss": 0.9203, + "step": 22935 + }, + { + "epoch": 4.892491467576792, + "grad_norm": 0.15185742947567302, + "learning_rate": 4.026067702425479e-05, + "loss": 0.9036, + "step": 22936 + }, + { + "epoch": 4.892704778156997, + "grad_norm": 0.1581525326290499, + "learning_rate": 4.025329163624914e-05, + "loss": 0.9114, + "step": 22937 + }, + { + "epoch": 4.8929180887372015, + "grad_norm": 0.1762044038616947, + "learning_rate": 4.024590669797814e-05, + "loss": 0.8747, + "step": 22938 + }, + { + "epoch": 4.893131399317406, + "grad_norm": 0.1256463528959064, + "learning_rate": 4.023852220952539e-05, + "loss": 0.8943, + "step": 22939 + }, + { + "epoch": 4.893344709897611, + "grad_norm": 0.17009176581364605, + "learning_rate": 4.023113817097444e-05, + "loss": 0.8702, + "step": 22940 + }, + { + "epoch": 4.8935580204778155, + "grad_norm": 0.15420281251973816, + "learning_rate": 4.022375458240882e-05, + "loss": 0.8718, + "step": 22941 + }, + { + "epoch": 4.89377133105802, + "grad_norm": 0.1391441417904119, + "learning_rate": 4.021637144391207e-05, + "loss": 0.8997, + "step": 22942 + }, + { + "epoch": 4.893984641638225, + "grad_norm": 0.13339822404065793, + "learning_rate": 4.020898875556776e-05, + "loss": 0.8825, + "step": 22943 + }, + { + "epoch": 4.8941979522184305, + "grad_norm": 0.165409608045727, + "learning_rate": 4.0201606517459456e-05, + "loss": 0.9044, + "step": 22944 + }, + { + "epoch": 4.894411262798635, + "grad_norm": 0.14104461457917666, + "learning_rate": 4.0194224729670604e-05, + "loss": 0.92, + "step": 22945 + }, + { + "epoch": 4.89462457337884, + "grad_norm": 0.1516670328608727, + "learning_rate": 4.0186843392284746e-05, + "loss": 0.9105, + "step": 22946 + }, + { + "epoch": 4.8948378839590445, + "grad_norm": 0.13940703041756095, + "learning_rate": 4.0179462505385454e-05, + "loss": 0.9206, + "step": 22947 + }, + { + "epoch": 4.895051194539249, + "grad_norm": 0.15319475353696052, + "learning_rate": 4.01720820690562e-05, + "loss": 0.9002, + "step": 22948 + }, + { + "epoch": 4.895264505119454, + "grad_norm": 0.17577104165260038, + "learning_rate": 4.016470208338053e-05, + "loss": 0.9254, + "step": 22949 + }, + { + "epoch": 4.895477815699659, + "grad_norm": 0.1493909011065905, + "learning_rate": 4.0157322548441876e-05, + "loss": 0.9064, + "step": 22950 + }, + { + "epoch": 4.895691126279863, + "grad_norm": 0.17921389729381793, + "learning_rate": 4.014994346432382e-05, + "loss": 0.8862, + "step": 22951 + }, + { + "epoch": 4.895904436860068, + "grad_norm": 0.13744156710618013, + "learning_rate": 4.014256483110984e-05, + "loss": 0.892, + "step": 22952 + }, + { + "epoch": 4.8961177474402735, + "grad_norm": 0.1910275920684703, + "learning_rate": 4.0135186648883396e-05, + "loss": 0.9132, + "step": 22953 + }, + { + "epoch": 4.896331058020478, + "grad_norm": 0.14298730361681652, + "learning_rate": 4.012780891772796e-05, + "loss": 0.8795, + "step": 22954 + }, + { + "epoch": 4.896544368600683, + "grad_norm": 0.1512024025976769, + "learning_rate": 4.0120431637727096e-05, + "loss": 0.8856, + "step": 22955 + }, + { + "epoch": 4.896757679180888, + "grad_norm": 0.1525172015311731, + "learning_rate": 4.0113054808964184e-05, + "loss": 0.911, + "step": 22956 + }, + { + "epoch": 4.896970989761092, + "grad_norm": 0.13749826016521102, + "learning_rate": 4.0105678431522735e-05, + "loss": 0.898, + "step": 22957 + }, + { + "epoch": 4.897184300341297, + "grad_norm": 0.17077238828900945, + "learning_rate": 4.009830250548617e-05, + "loss": 0.8711, + "step": 22958 + }, + { + "epoch": 4.897397610921502, + "grad_norm": 0.14669853848861492, + "learning_rate": 4.0090927030938014e-05, + "loss": 0.8979, + "step": 22959 + }, + { + "epoch": 4.897610921501706, + "grad_norm": 0.1643442406797045, + "learning_rate": 4.0083552007961695e-05, + "loss": 0.9393, + "step": 22960 + }, + { + "epoch": 4.897824232081911, + "grad_norm": 0.1570103171141148, + "learning_rate": 4.007617743664066e-05, + "loss": 0.8925, + "step": 22961 + }, + { + "epoch": 4.898037542662116, + "grad_norm": 0.15507626664970375, + "learning_rate": 4.00688033170583e-05, + "loss": 0.8764, + "step": 22962 + }, + { + "epoch": 4.89825085324232, + "grad_norm": 0.16646089116241655, + "learning_rate": 4.006142964929814e-05, + "loss": 0.9194, + "step": 22963 + }, + { + "epoch": 4.898464163822526, + "grad_norm": 0.17147107225223643, + "learning_rate": 4.0054056433443584e-05, + "loss": 0.9048, + "step": 22964 + }, + { + "epoch": 4.898677474402731, + "grad_norm": 0.1463585140051819, + "learning_rate": 4.0046683669578045e-05, + "loss": 0.9078, + "step": 22965 + }, + { + "epoch": 4.898890784982935, + "grad_norm": 0.1478939141512975, + "learning_rate": 4.003931135778496e-05, + "loss": 0.9067, + "step": 22966 + }, + { + "epoch": 4.89910409556314, + "grad_norm": 0.13436039458637103, + "learning_rate": 4.0031939498147725e-05, + "loss": 0.9051, + "step": 22967 + }, + { + "epoch": 4.899317406143345, + "grad_norm": 0.1518427983848178, + "learning_rate": 4.002456809074978e-05, + "loss": 0.8727, + "step": 22968 + }, + { + "epoch": 4.899530716723549, + "grad_norm": 0.151215840816311, + "learning_rate": 4.001719713567451e-05, + "loss": 0.8968, + "step": 22969 + }, + { + "epoch": 4.899744027303754, + "grad_norm": 0.13765462377519141, + "learning_rate": 4.00098266330053e-05, + "loss": 0.8814, + "step": 22970 + }, + { + "epoch": 4.899957337883959, + "grad_norm": 0.12065111456699788, + "learning_rate": 4.0002456582825605e-05, + "loss": 0.9152, + "step": 22971 + }, + { + "epoch": 4.900170648464163, + "grad_norm": 0.1517259336909324, + "learning_rate": 3.999508698521879e-05, + "loss": 0.8684, + "step": 22972 + }, + { + "epoch": 4.900383959044369, + "grad_norm": 0.14675907890008105, + "learning_rate": 3.998771784026824e-05, + "loss": 0.9193, + "step": 22973 + }, + { + "epoch": 4.900597269624574, + "grad_norm": 0.16144918188827861, + "learning_rate": 3.9980349148057295e-05, + "loss": 0.9138, + "step": 22974 + }, + { + "epoch": 4.900810580204778, + "grad_norm": 0.15176670237831258, + "learning_rate": 3.997298090866941e-05, + "loss": 0.9019, + "step": 22975 + }, + { + "epoch": 4.901023890784983, + "grad_norm": 0.12964049730287436, + "learning_rate": 3.996561312218792e-05, + "loss": 0.9034, + "step": 22976 + }, + { + "epoch": 4.901237201365188, + "grad_norm": 0.15565889854819148, + "learning_rate": 3.995824578869619e-05, + "loss": 0.8934, + "step": 22977 + }, + { + "epoch": 4.901450511945392, + "grad_norm": 0.1568945916010896, + "learning_rate": 3.995087890827759e-05, + "loss": 0.9047, + "step": 22978 + }, + { + "epoch": 4.901663822525597, + "grad_norm": 0.13059102636172543, + "learning_rate": 3.994351248101545e-05, + "loss": 0.8954, + "step": 22979 + }, + { + "epoch": 4.901877133105802, + "grad_norm": 0.1612734757177285, + "learning_rate": 3.9936146506993154e-05, + "loss": 0.9112, + "step": 22980 + }, + { + "epoch": 4.902090443686006, + "grad_norm": 0.14844317876691604, + "learning_rate": 3.992878098629403e-05, + "loss": 0.8917, + "step": 22981 + }, + { + "epoch": 4.902303754266212, + "grad_norm": 0.1658746502703221, + "learning_rate": 3.9921415919001394e-05, + "loss": 0.8944, + "step": 22982 + }, + { + "epoch": 4.902517064846417, + "grad_norm": 0.11899982793618667, + "learning_rate": 3.991405130519864e-05, + "loss": 0.8703, + "step": 22983 + }, + { + "epoch": 4.902730375426621, + "grad_norm": 0.15428353881046294, + "learning_rate": 3.9906687144969065e-05, + "loss": 0.897, + "step": 22984 + }, + { + "epoch": 4.902943686006826, + "grad_norm": 0.1365084188882907, + "learning_rate": 3.989932343839601e-05, + "loss": 0.9104, + "step": 22985 + }, + { + "epoch": 4.903156996587031, + "grad_norm": 0.14194134917853302, + "learning_rate": 3.989196018556275e-05, + "loss": 0.872, + "step": 22986 + }, + { + "epoch": 4.903370307167235, + "grad_norm": 0.13255779304082077, + "learning_rate": 3.988459738655266e-05, + "loss": 0.8965, + "step": 22987 + }, + { + "epoch": 4.90358361774744, + "grad_norm": 0.14406716699004818, + "learning_rate": 3.987723504144903e-05, + "loss": 0.8897, + "step": 22988 + }, + { + "epoch": 4.903796928327645, + "grad_norm": 0.11458396853320184, + "learning_rate": 3.9869873150335164e-05, + "loss": 0.9032, + "step": 22989 + }, + { + "epoch": 4.9040102389078495, + "grad_norm": 0.1379547475915887, + "learning_rate": 3.9862511713294356e-05, + "loss": 0.9005, + "step": 22990 + }, + { + "epoch": 4.904223549488055, + "grad_norm": 0.12197407141914501, + "learning_rate": 3.98551507304099e-05, + "loss": 0.9337, + "step": 22991 + }, + { + "epoch": 4.90443686006826, + "grad_norm": 0.1280728172019663, + "learning_rate": 3.984779020176509e-05, + "loss": 0.8756, + "step": 22992 + }, + { + "epoch": 4.904650170648464, + "grad_norm": 0.13763368936575276, + "learning_rate": 3.984043012744321e-05, + "loss": 0.8841, + "step": 22993 + }, + { + "epoch": 4.904863481228669, + "grad_norm": 0.14808549835557314, + "learning_rate": 3.9833070507527506e-05, + "loss": 0.9064, + "step": 22994 + }, + { + "epoch": 4.905076791808874, + "grad_norm": 0.14756393141057317, + "learning_rate": 3.982571134210132e-05, + "loss": 0.8941, + "step": 22995 + }, + { + "epoch": 4.9052901023890785, + "grad_norm": 0.13602138399580846, + "learning_rate": 3.981835263124788e-05, + "loss": 0.9046, + "step": 22996 + }, + { + "epoch": 4.905503412969283, + "grad_norm": 0.1509225853249841, + "learning_rate": 3.981099437505046e-05, + "loss": 0.9056, + "step": 22997 + }, + { + "epoch": 4.905716723549488, + "grad_norm": 0.1649481313597445, + "learning_rate": 3.9803636573592285e-05, + "loss": 0.9103, + "step": 22998 + }, + { + "epoch": 4.9059300341296925, + "grad_norm": 0.1353953949384511, + "learning_rate": 3.9796279226956664e-05, + "loss": 0.9282, + "step": 22999 + }, + { + "epoch": 4.906143344709898, + "grad_norm": 0.1513156085874592, + "learning_rate": 3.978892233522685e-05, + "loss": 0.8931, + "step": 23000 + }, + { + "epoch": 4.906356655290102, + "grad_norm": 0.13049537320653404, + "learning_rate": 3.9781565898486e-05, + "loss": 0.8816, + "step": 23001 + }, + { + "epoch": 4.9065699658703075, + "grad_norm": 0.13407057772771386, + "learning_rate": 3.977420991681744e-05, + "loss": 0.9122, + "step": 23002 + }, + { + "epoch": 4.906783276450512, + "grad_norm": 0.1376958103051507, + "learning_rate": 3.976685439030437e-05, + "loss": 0.9093, + "step": 23003 + }, + { + "epoch": 4.906996587030717, + "grad_norm": 0.14341754754083869, + "learning_rate": 3.9759499319030015e-05, + "loss": 0.9135, + "step": 23004 + }, + { + "epoch": 4.9072098976109215, + "grad_norm": 0.13101394628289978, + "learning_rate": 3.9752144703077606e-05, + "loss": 0.9381, + "step": 23005 + }, + { + "epoch": 4.907423208191126, + "grad_norm": 0.15070705133051687, + "learning_rate": 3.974479054253032e-05, + "loss": 0.8943, + "step": 23006 + }, + { + "epoch": 4.907636518771331, + "grad_norm": 0.1408086123409938, + "learning_rate": 3.9737436837471445e-05, + "loss": 0.8982, + "step": 23007 + }, + { + "epoch": 4.907849829351536, + "grad_norm": 0.1565015244742997, + "learning_rate": 3.9730083587984144e-05, + "loss": 0.8835, + "step": 23008 + }, + { + "epoch": 4.90806313993174, + "grad_norm": 0.13203929038949122, + "learning_rate": 3.972273079415163e-05, + "loss": 0.9275, + "step": 23009 + }, + { + "epoch": 4.908276450511945, + "grad_norm": 0.16335487053505152, + "learning_rate": 3.971537845605706e-05, + "loss": 0.9076, + "step": 23010 + }, + { + "epoch": 4.9084897610921505, + "grad_norm": 0.14869897268616866, + "learning_rate": 3.970802657378372e-05, + "loss": 0.8822, + "step": 23011 + }, + { + "epoch": 4.908703071672355, + "grad_norm": 0.163732140889902, + "learning_rate": 3.9700675147414716e-05, + "loss": 0.9068, + "step": 23012 + }, + { + "epoch": 4.90891638225256, + "grad_norm": 0.13447565796211403, + "learning_rate": 3.969332417703321e-05, + "loss": 0.9081, + "step": 23013 + }, + { + "epoch": 4.909129692832765, + "grad_norm": 0.16725459601632814, + "learning_rate": 3.9685973662722446e-05, + "loss": 0.9181, + "step": 23014 + }, + { + "epoch": 4.909343003412969, + "grad_norm": 0.15123507409822268, + "learning_rate": 3.967862360456558e-05, + "loss": 0.9146, + "step": 23015 + }, + { + "epoch": 4.909556313993174, + "grad_norm": 0.13562533096552268, + "learning_rate": 3.967127400264576e-05, + "loss": 0.9107, + "step": 23016 + }, + { + "epoch": 4.909769624573379, + "grad_norm": 0.13624387224481205, + "learning_rate": 3.9663924857046155e-05, + "loss": 0.8712, + "step": 23017 + }, + { + "epoch": 4.909982935153583, + "grad_norm": 0.11936369022011249, + "learning_rate": 3.965657616784988e-05, + "loss": 0.8837, + "step": 23018 + }, + { + "epoch": 4.910196245733788, + "grad_norm": 0.16293166886877228, + "learning_rate": 3.964922793514016e-05, + "loss": 0.8993, + "step": 23019 + }, + { + "epoch": 4.910409556313994, + "grad_norm": 0.1455408619398884, + "learning_rate": 3.9641880159000115e-05, + "loss": 0.8777, + "step": 23020 + }, + { + "epoch": 4.910622866894198, + "grad_norm": 0.1572860103510395, + "learning_rate": 3.963453283951286e-05, + "loss": 0.8858, + "step": 23021 + }, + { + "epoch": 4.910836177474403, + "grad_norm": 0.15321424168081257, + "learning_rate": 3.962718597676156e-05, + "loss": 0.8883, + "step": 23022 + }, + { + "epoch": 4.911049488054608, + "grad_norm": 0.1484432851020081, + "learning_rate": 3.9619839570829316e-05, + "loss": 0.9121, + "step": 23023 + }, + { + "epoch": 4.911262798634812, + "grad_norm": 0.1652893109239212, + "learning_rate": 3.961249362179927e-05, + "loss": 0.915, + "step": 23024 + }, + { + "epoch": 4.911476109215017, + "grad_norm": 0.1880179338196006, + "learning_rate": 3.9605148129754507e-05, + "loss": 0.9043, + "step": 23025 + }, + { + "epoch": 4.911689419795222, + "grad_norm": 0.16081627214023536, + "learning_rate": 3.95978030947782e-05, + "loss": 0.8788, + "step": 23026 + }, + { + "epoch": 4.911902730375426, + "grad_norm": 0.16868074112859913, + "learning_rate": 3.9590458516953435e-05, + "loss": 0.9066, + "step": 23027 + }, + { + "epoch": 4.912116040955631, + "grad_norm": 0.15322199573294681, + "learning_rate": 3.958311439636332e-05, + "loss": 0.9023, + "step": 23028 + }, + { + "epoch": 4.912329351535837, + "grad_norm": 0.15940607024693818, + "learning_rate": 3.957577073309093e-05, + "loss": 0.9009, + "step": 23029 + }, + { + "epoch": 4.912542662116041, + "grad_norm": 0.15687001109851645, + "learning_rate": 3.956842752721935e-05, + "loss": 0.8687, + "step": 23030 + }, + { + "epoch": 4.912755972696246, + "grad_norm": 0.16334540947155438, + "learning_rate": 3.956108477883173e-05, + "loss": 0.9105, + "step": 23031 + }, + { + "epoch": 4.912969283276451, + "grad_norm": 0.1562873719459024, + "learning_rate": 3.955374248801111e-05, + "loss": 0.8998, + "step": 23032 + }, + { + "epoch": 4.913182593856655, + "grad_norm": 0.14503024030955944, + "learning_rate": 3.954640065484057e-05, + "loss": 0.8928, + "step": 23033 + }, + { + "epoch": 4.91339590443686, + "grad_norm": 0.1900251042916604, + "learning_rate": 3.953905927940319e-05, + "loss": 0.8964, + "step": 23034 + }, + { + "epoch": 4.913609215017065, + "grad_norm": 0.13531731051429882, + "learning_rate": 3.953171836178204e-05, + "loss": 0.8938, + "step": 23035 + }, + { + "epoch": 4.913822525597269, + "grad_norm": 0.15901047225400686, + "learning_rate": 3.952437790206017e-05, + "loss": 0.9087, + "step": 23036 + }, + { + "epoch": 4.914035836177474, + "grad_norm": 0.1573331459208321, + "learning_rate": 3.951703790032062e-05, + "loss": 0.9019, + "step": 23037 + }, + { + "epoch": 4.91424914675768, + "grad_norm": 0.1450970194957789, + "learning_rate": 3.950969835664649e-05, + "loss": 0.8865, + "step": 23038 + }, + { + "epoch": 4.914462457337884, + "grad_norm": 0.13509755225490955, + "learning_rate": 3.950235927112081e-05, + "loss": 0.9156, + "step": 23039 + }, + { + "epoch": 4.914675767918089, + "grad_norm": 0.13720182363910136, + "learning_rate": 3.9495020643826614e-05, + "loss": 0.8674, + "step": 23040 + }, + { + "epoch": 4.914889078498294, + "grad_norm": 0.14378888167330217, + "learning_rate": 3.948768247484694e-05, + "loss": 0.8919, + "step": 23041 + }, + { + "epoch": 4.915102389078498, + "grad_norm": 0.13188241466321668, + "learning_rate": 3.9480344764264787e-05, + "loss": 0.8942, + "step": 23042 + }, + { + "epoch": 4.915315699658703, + "grad_norm": 0.18842709279771588, + "learning_rate": 3.9473007512163254e-05, + "loss": 0.9034, + "step": 23043 + }, + { + "epoch": 4.915529010238908, + "grad_norm": 0.12019360231792249, + "learning_rate": 3.946567071862532e-05, + "loss": 0.9195, + "step": 23044 + }, + { + "epoch": 4.915742320819112, + "grad_norm": 0.1750180271962964, + "learning_rate": 3.945833438373401e-05, + "loss": 0.8764, + "step": 23045 + }, + { + "epoch": 4.915955631399317, + "grad_norm": 0.15516862432008272, + "learning_rate": 3.945099850757231e-05, + "loss": 0.8898, + "step": 23046 + }, + { + "epoch": 4.916168941979522, + "grad_norm": 0.14998527920126148, + "learning_rate": 3.944366309022326e-05, + "loss": 0.9238, + "step": 23047 + }, + { + "epoch": 4.9163822525597265, + "grad_norm": 0.17391398720343962, + "learning_rate": 3.943632813176983e-05, + "loss": 0.91, + "step": 23048 + }, + { + "epoch": 4.916595563139932, + "grad_norm": 0.16134712534392193, + "learning_rate": 3.942899363229501e-05, + "loss": 0.9037, + "step": 23049 + }, + { + "epoch": 4.916808873720137, + "grad_norm": 0.14542641323871033, + "learning_rate": 3.942165959188183e-05, + "loss": 0.9059, + "step": 23050 + }, + { + "epoch": 4.917022184300341, + "grad_norm": 0.15287150526757437, + "learning_rate": 3.941432601061327e-05, + "loss": 0.8789, + "step": 23051 + }, + { + "epoch": 4.917235494880546, + "grad_norm": 0.16194091253637546, + "learning_rate": 3.9406992888572285e-05, + "loss": 0.8866, + "step": 23052 + }, + { + "epoch": 4.917448805460751, + "grad_norm": 0.14119885303910426, + "learning_rate": 3.939966022584186e-05, + "loss": 0.8984, + "step": 23053 + }, + { + "epoch": 4.9176621160409555, + "grad_norm": 0.2053161588234763, + "learning_rate": 3.939232802250493e-05, + "loss": 0.8972, + "step": 23054 + }, + { + "epoch": 4.91787542662116, + "grad_norm": 0.1326683301295006, + "learning_rate": 3.9384996278644524e-05, + "loss": 0.9071, + "step": 23055 + }, + { + "epoch": 4.918088737201365, + "grad_norm": 0.16191217051394693, + "learning_rate": 3.93776649943436e-05, + "loss": 0.8966, + "step": 23056 + }, + { + "epoch": 4.9183020477815695, + "grad_norm": 0.16618386284577608, + "learning_rate": 3.937033416968502e-05, + "loss": 0.8915, + "step": 23057 + }, + { + "epoch": 4.918515358361775, + "grad_norm": 0.14610221813467336, + "learning_rate": 3.9363003804751804e-05, + "loss": 0.8851, + "step": 23058 + }, + { + "epoch": 4.91872866894198, + "grad_norm": 0.16489546578424044, + "learning_rate": 3.935567389962691e-05, + "loss": 0.8863, + "step": 23059 + }, + { + "epoch": 4.9189419795221845, + "grad_norm": 0.13937249038422753, + "learning_rate": 3.9348344454393235e-05, + "loss": 0.8678, + "step": 23060 + }, + { + "epoch": 4.919155290102389, + "grad_norm": 0.1628466254959924, + "learning_rate": 3.934101546913369e-05, + "loss": 0.9256, + "step": 23061 + }, + { + "epoch": 4.919368600682594, + "grad_norm": 0.14076127243465875, + "learning_rate": 3.933368694393128e-05, + "loss": 0.8998, + "step": 23062 + }, + { + "epoch": 4.9195819112627985, + "grad_norm": 0.164729870452972, + "learning_rate": 3.932635887886889e-05, + "loss": 0.9069, + "step": 23063 + }, + { + "epoch": 4.919795221843003, + "grad_norm": 0.1638267990053023, + "learning_rate": 3.931903127402943e-05, + "loss": 0.8861, + "step": 23064 + }, + { + "epoch": 4.920008532423208, + "grad_norm": 0.17403457878591266, + "learning_rate": 3.93117041294958e-05, + "loss": 0.9054, + "step": 23065 + }, + { + "epoch": 4.920221843003413, + "grad_norm": 0.14187865329038452, + "learning_rate": 3.930437744535091e-05, + "loss": 0.9407, + "step": 23066 + }, + { + "epoch": 4.920435153583618, + "grad_norm": 0.15939206942065198, + "learning_rate": 3.929705122167773e-05, + "loss": 0.9351, + "step": 23067 + }, + { + "epoch": 4.920648464163823, + "grad_norm": 0.16112690253575834, + "learning_rate": 3.928972545855907e-05, + "loss": 0.8732, + "step": 23068 + }, + { + "epoch": 4.9208617747440275, + "grad_norm": 0.15545045325023538, + "learning_rate": 3.9282400156077816e-05, + "loss": 0.9235, + "step": 23069 + }, + { + "epoch": 4.921075085324232, + "grad_norm": 0.15367863227561926, + "learning_rate": 3.927507531431693e-05, + "loss": 0.907, + "step": 23070 + }, + { + "epoch": 4.921288395904437, + "grad_norm": 0.1390820960259725, + "learning_rate": 3.9267750933359245e-05, + "loss": 0.9076, + "step": 23071 + }, + { + "epoch": 4.921501706484642, + "grad_norm": 0.16625150544710687, + "learning_rate": 3.926042701328764e-05, + "loss": 0.9059, + "step": 23072 + }, + { + "epoch": 4.921715017064846, + "grad_norm": 0.16010649172540278, + "learning_rate": 3.925310355418497e-05, + "loss": 0.881, + "step": 23073 + }, + { + "epoch": 4.921928327645051, + "grad_norm": 0.1598810435475627, + "learning_rate": 3.924578055613413e-05, + "loss": 0.8868, + "step": 23074 + }, + { + "epoch": 4.922141638225256, + "grad_norm": 0.13307150676670915, + "learning_rate": 3.9238458019217984e-05, + "loss": 0.9106, + "step": 23075 + }, + { + "epoch": 4.922354948805461, + "grad_norm": 0.1616101695291739, + "learning_rate": 3.9231135943519365e-05, + "loss": 0.8928, + "step": 23076 + }, + { + "epoch": 4.922568259385666, + "grad_norm": 0.1395504882083149, + "learning_rate": 3.922381432912113e-05, + "loss": 0.8877, + "step": 23077 + }, + { + "epoch": 4.922781569965871, + "grad_norm": 0.16201098638000883, + "learning_rate": 3.9216493176106126e-05, + "loss": 0.9005, + "step": 23078 + }, + { + "epoch": 4.922994880546075, + "grad_norm": 0.12786295183657428, + "learning_rate": 3.920917248455718e-05, + "loss": 0.8829, + "step": 23079 + }, + { + "epoch": 4.92320819112628, + "grad_norm": 0.1449344656405072, + "learning_rate": 3.920185225455714e-05, + "loss": 0.8913, + "step": 23080 + }, + { + "epoch": 4.923421501706485, + "grad_norm": 0.1393111642941492, + "learning_rate": 3.9194532486188794e-05, + "loss": 0.8932, + "step": 23081 + }, + { + "epoch": 4.923634812286689, + "grad_norm": 0.16697980441800553, + "learning_rate": 3.918721317953503e-05, + "loss": 0.903, + "step": 23082 + }, + { + "epoch": 4.923848122866894, + "grad_norm": 0.12549485257609322, + "learning_rate": 3.917989433467863e-05, + "loss": 0.8765, + "step": 23083 + }, + { + "epoch": 4.924061433447099, + "grad_norm": 0.1833308347442328, + "learning_rate": 3.9172575951702425e-05, + "loss": 0.9242, + "step": 23084 + }, + { + "epoch": 4.924274744027304, + "grad_norm": 0.18468322325700148, + "learning_rate": 3.9165258030689163e-05, + "loss": 0.8817, + "step": 23085 + }, + { + "epoch": 4.924488054607508, + "grad_norm": 0.12439408219470859, + "learning_rate": 3.915794057172174e-05, + "loss": 0.8965, + "step": 23086 + }, + { + "epoch": 4.924701365187714, + "grad_norm": 0.18231182000067817, + "learning_rate": 3.9150623574882895e-05, + "loss": 0.893, + "step": 23087 + }, + { + "epoch": 4.924914675767918, + "grad_norm": 0.179375128711515, + "learning_rate": 3.914330704025544e-05, + "loss": 0.8946, + "step": 23088 + }, + { + "epoch": 4.925127986348123, + "grad_norm": 0.12252905494529853, + "learning_rate": 3.913599096792215e-05, + "loss": 0.8637, + "step": 23089 + }, + { + "epoch": 4.925341296928328, + "grad_norm": 0.1403971625569901, + "learning_rate": 3.91286753579658e-05, + "loss": 0.9328, + "step": 23090 + }, + { + "epoch": 4.925554607508532, + "grad_norm": 0.1285958839534881, + "learning_rate": 3.9121360210469187e-05, + "loss": 0.8873, + "step": 23091 + }, + { + "epoch": 4.925767918088737, + "grad_norm": 0.1430230971605508, + "learning_rate": 3.911404552551507e-05, + "loss": 0.8838, + "step": 23092 + }, + { + "epoch": 4.925981228668942, + "grad_norm": 0.1389660780498545, + "learning_rate": 3.910673130318619e-05, + "loss": 0.9258, + "step": 23093 + }, + { + "epoch": 4.926194539249146, + "grad_norm": 0.13460051870019182, + "learning_rate": 3.909941754356535e-05, + "loss": 0.8879, + "step": 23094 + }, + { + "epoch": 4.926407849829351, + "grad_norm": 0.12862805376704897, + "learning_rate": 3.909210424673529e-05, + "loss": 0.896, + "step": 23095 + }, + { + "epoch": 4.926621160409557, + "grad_norm": 0.12714593491571674, + "learning_rate": 3.908479141277876e-05, + "loss": 0.9064, + "step": 23096 + }, + { + "epoch": 4.926834470989761, + "grad_norm": 0.1448135935265196, + "learning_rate": 3.907747904177848e-05, + "loss": 0.9142, + "step": 23097 + }, + { + "epoch": 4.927047781569966, + "grad_norm": 0.12888463594432847, + "learning_rate": 3.907016713381724e-05, + "loss": 0.8912, + "step": 23098 + }, + { + "epoch": 4.927261092150171, + "grad_norm": 0.138662405360882, + "learning_rate": 3.906285568897775e-05, + "loss": 0.9034, + "step": 23099 + }, + { + "epoch": 4.927474402730375, + "grad_norm": 0.12660029868021325, + "learning_rate": 3.905554470734276e-05, + "loss": 0.9365, + "step": 23100 + }, + { + "epoch": 4.92768771331058, + "grad_norm": 0.1523654214616312, + "learning_rate": 3.904823418899492e-05, + "loss": 0.903, + "step": 23101 + }, + { + "epoch": 4.927901023890785, + "grad_norm": 0.1282299161311425, + "learning_rate": 3.904092413401702e-05, + "loss": 0.8941, + "step": 23102 + }, + { + "epoch": 4.9281143344709895, + "grad_norm": 0.15566183437675218, + "learning_rate": 3.9033614542491755e-05, + "loss": 0.8968, + "step": 23103 + }, + { + "epoch": 4.928327645051194, + "grad_norm": 0.14238594047880063, + "learning_rate": 3.902630541450183e-05, + "loss": 0.889, + "step": 23104 + }, + { + "epoch": 4.9285409556314, + "grad_norm": 0.12581566560980792, + "learning_rate": 3.901899675012992e-05, + "loss": 0.8824, + "step": 23105 + }, + { + "epoch": 4.928754266211604, + "grad_norm": 0.16193043393218018, + "learning_rate": 3.901168854945878e-05, + "loss": 0.8854, + "step": 23106 + }, + { + "epoch": 4.928967576791809, + "grad_norm": 0.15255348077168818, + "learning_rate": 3.9004380812571074e-05, + "loss": 0.8888, + "step": 23107 + }, + { + "epoch": 4.929180887372014, + "grad_norm": 0.1486660909244433, + "learning_rate": 3.899707353954948e-05, + "loss": 0.8728, + "step": 23108 + }, + { + "epoch": 4.9293941979522184, + "grad_norm": 0.1432180730080867, + "learning_rate": 3.898976673047668e-05, + "loss": 0.922, + "step": 23109 + }, + { + "epoch": 4.929607508532423, + "grad_norm": 0.15152169700972234, + "learning_rate": 3.898246038543537e-05, + "loss": 0.8923, + "step": 23110 + }, + { + "epoch": 4.929820819112628, + "grad_norm": 0.1327411701817707, + "learning_rate": 3.897515450450822e-05, + "loss": 0.8678, + "step": 23111 + }, + { + "epoch": 4.9300341296928325, + "grad_norm": 0.23142509270661216, + "learning_rate": 3.896784908777791e-05, + "loss": 0.9054, + "step": 23112 + }, + { + "epoch": 4.930247440273037, + "grad_norm": 0.15009289231120523, + "learning_rate": 3.896054413532702e-05, + "loss": 0.8827, + "step": 23113 + }, + { + "epoch": 4.930460750853243, + "grad_norm": 0.14454157487771702, + "learning_rate": 3.8953239647238304e-05, + "loss": 0.9233, + "step": 23114 + }, + { + "epoch": 4.930674061433447, + "grad_norm": 0.17227811676742436, + "learning_rate": 3.894593562359436e-05, + "loss": 0.9002, + "step": 23115 + }, + { + "epoch": 4.930887372013652, + "grad_norm": 0.14938187741065587, + "learning_rate": 3.893863206447785e-05, + "loss": 0.9253, + "step": 23116 + }, + { + "epoch": 4.931100682593857, + "grad_norm": 0.16114688672716493, + "learning_rate": 3.893132896997137e-05, + "loss": 0.8961, + "step": 23117 + }, + { + "epoch": 4.9313139931740615, + "grad_norm": 0.16017282446460057, + "learning_rate": 3.892402634015764e-05, + "loss": 0.9038, + "step": 23118 + }, + { + "epoch": 4.931527303754266, + "grad_norm": 0.14246396091846136, + "learning_rate": 3.891672417511923e-05, + "loss": 0.9065, + "step": 23119 + }, + { + "epoch": 4.931740614334471, + "grad_norm": 0.1560121733318641, + "learning_rate": 3.890942247493877e-05, + "loss": 0.9329, + "step": 23120 + }, + { + "epoch": 4.9319539249146755, + "grad_norm": 0.17287966034675342, + "learning_rate": 3.890212123969887e-05, + "loss": 0.8768, + "step": 23121 + }, + { + "epoch": 4.93216723549488, + "grad_norm": 0.16512954671449795, + "learning_rate": 3.889482046948218e-05, + "loss": 0.9415, + "step": 23122 + }, + { + "epoch": 4.932380546075086, + "grad_norm": 0.15957134228859704, + "learning_rate": 3.888752016437133e-05, + "loss": 0.8934, + "step": 23123 + }, + { + "epoch": 4.9325938566552905, + "grad_norm": 0.1291710532229633, + "learning_rate": 3.888022032444884e-05, + "loss": 0.8826, + "step": 23124 + }, + { + "epoch": 4.932807167235495, + "grad_norm": 0.15435914845995305, + "learning_rate": 3.887292094979732e-05, + "loss": 0.8823, + "step": 23125 + }, + { + "epoch": 4.9330204778157, + "grad_norm": 0.14343820856484013, + "learning_rate": 3.886562204049943e-05, + "loss": 0.8936, + "step": 23126 + }, + { + "epoch": 4.9332337883959045, + "grad_norm": 0.14832181994828314, + "learning_rate": 3.8858323596637706e-05, + "loss": 0.9025, + "step": 23127 + }, + { + "epoch": 4.933447098976109, + "grad_norm": 0.1466006650462106, + "learning_rate": 3.8851025618294756e-05, + "loss": 0.8995, + "step": 23128 + }, + { + "epoch": 4.933660409556314, + "grad_norm": 0.14404198742280125, + "learning_rate": 3.8843728105553104e-05, + "loss": 0.9012, + "step": 23129 + }, + { + "epoch": 4.933873720136519, + "grad_norm": 0.14192109183130916, + "learning_rate": 3.8836431058495395e-05, + "loss": 0.8813, + "step": 23130 + }, + { + "epoch": 4.934087030716723, + "grad_norm": 0.1359494870674809, + "learning_rate": 3.882913447720416e-05, + "loss": 0.899, + "step": 23131 + }, + { + "epoch": 4.934300341296928, + "grad_norm": 0.13760695089783553, + "learning_rate": 3.8821838361761957e-05, + "loss": 0.9455, + "step": 23132 + }, + { + "epoch": 4.934513651877133, + "grad_norm": 0.16502229913714173, + "learning_rate": 3.881454271225135e-05, + "loss": 0.9057, + "step": 23133 + }, + { + "epoch": 4.934726962457338, + "grad_norm": 0.14470906095363656, + "learning_rate": 3.880724752875488e-05, + "loss": 0.8919, + "step": 23134 + }, + { + "epoch": 4.934940273037543, + "grad_norm": 0.13558360496784558, + "learning_rate": 3.8799952811355096e-05, + "loss": 0.8721, + "step": 23135 + }, + { + "epoch": 4.935153583617748, + "grad_norm": 0.15392338393761062, + "learning_rate": 3.8792658560134544e-05, + "loss": 0.9333, + "step": 23136 + }, + { + "epoch": 4.935366894197952, + "grad_norm": 0.15855661909531768, + "learning_rate": 3.878536477517573e-05, + "loss": 0.933, + "step": 23137 + }, + { + "epoch": 4.935580204778157, + "grad_norm": 0.14690895115034136, + "learning_rate": 3.877807145656123e-05, + "loss": 0.9067, + "step": 23138 + }, + { + "epoch": 4.935793515358362, + "grad_norm": 0.17605347324352064, + "learning_rate": 3.877077860437354e-05, + "loss": 0.9006, + "step": 23139 + }, + { + "epoch": 4.936006825938566, + "grad_norm": 0.19908306661067338, + "learning_rate": 3.876348621869519e-05, + "loss": 0.9305, + "step": 23140 + }, + { + "epoch": 4.936220136518771, + "grad_norm": 0.5120630007568594, + "learning_rate": 3.875619429960865e-05, + "loss": 0.926, + "step": 23141 + }, + { + "epoch": 4.936433447098976, + "grad_norm": 0.20160830563115498, + "learning_rate": 3.87489028471965e-05, + "loss": 0.9288, + "step": 23142 + }, + { + "epoch": 4.936646757679181, + "grad_norm": 0.23185300817700222, + "learning_rate": 3.874161186154121e-05, + "loss": 0.8892, + "step": 23143 + }, + { + "epoch": 4.936860068259386, + "grad_norm": 0.1525061068862394, + "learning_rate": 3.8734321342725275e-05, + "loss": 0.908, + "step": 23144 + }, + { + "epoch": 4.937073378839591, + "grad_norm": 0.21233453792621995, + "learning_rate": 3.872703129083119e-05, + "loss": 0.8918, + "step": 23145 + }, + { + "epoch": 4.937286689419795, + "grad_norm": 0.17458663072552663, + "learning_rate": 3.8719741705941436e-05, + "loss": 0.891, + "step": 23146 + }, + { + "epoch": 4.9375, + "grad_norm": 0.17730613033092255, + "learning_rate": 3.87124525881385e-05, + "loss": 0.9084, + "step": 23147 + }, + { + "epoch": 4.937713310580205, + "grad_norm": 0.19679804289685604, + "learning_rate": 3.870516393750485e-05, + "loss": 0.9155, + "step": 23148 + }, + { + "epoch": 4.937926621160409, + "grad_norm": 0.1407145456450002, + "learning_rate": 3.8697875754122946e-05, + "loss": 0.8777, + "step": 23149 + }, + { + "epoch": 4.938139931740614, + "grad_norm": 0.19202495556717464, + "learning_rate": 3.86905880380753e-05, + "loss": 0.8794, + "step": 23150 + }, + { + "epoch": 4.938353242320819, + "grad_norm": 0.15693647916889386, + "learning_rate": 3.8683300789444346e-05, + "loss": 0.9, + "step": 23151 + }, + { + "epoch": 4.938566552901024, + "grad_norm": 0.18459545174351338, + "learning_rate": 3.8676014008312536e-05, + "loss": 0.9052, + "step": 23152 + }, + { + "epoch": 4.938779863481229, + "grad_norm": 0.1505765181507895, + "learning_rate": 3.8668727694762294e-05, + "loss": 0.9018, + "step": 23153 + }, + { + "epoch": 4.938993174061434, + "grad_norm": 0.16668663913484766, + "learning_rate": 3.8661441848876126e-05, + "loss": 0.8958, + "step": 23154 + }, + { + "epoch": 4.939206484641638, + "grad_norm": 0.14968197902482266, + "learning_rate": 3.865415647073643e-05, + "loss": 0.9008, + "step": 23155 + }, + { + "epoch": 4.939419795221843, + "grad_norm": 0.15198670655445443, + "learning_rate": 3.864687156042569e-05, + "loss": 0.8692, + "step": 23156 + }, + { + "epoch": 4.939633105802048, + "grad_norm": 0.12931827962475193, + "learning_rate": 3.863958711802623e-05, + "loss": 0.8947, + "step": 23157 + }, + { + "epoch": 4.939846416382252, + "grad_norm": 0.15548211470638448, + "learning_rate": 3.863230314362056e-05, + "loss": 0.9084, + "step": 23158 + }, + { + "epoch": 4.940059726962457, + "grad_norm": 0.16255798909669805, + "learning_rate": 3.8625019637291085e-05, + "loss": 0.9091, + "step": 23159 + }, + { + "epoch": 4.940273037542662, + "grad_norm": 0.1486343097660208, + "learning_rate": 3.86177365991202e-05, + "loss": 0.9215, + "step": 23160 + }, + { + "epoch": 4.940486348122867, + "grad_norm": 0.1501095698514879, + "learning_rate": 3.86104540291903e-05, + "loss": 0.9207, + "step": 23161 + }, + { + "epoch": 4.940699658703072, + "grad_norm": 0.1224067025007361, + "learning_rate": 3.860317192758383e-05, + "loss": 0.8773, + "step": 23162 + }, + { + "epoch": 4.940912969283277, + "grad_norm": 0.1521702917276998, + "learning_rate": 3.859589029438318e-05, + "loss": 0.9553, + "step": 23163 + }, + { + "epoch": 4.941126279863481, + "grad_norm": 0.12267408149676122, + "learning_rate": 3.858860912967071e-05, + "loss": 0.8753, + "step": 23164 + }, + { + "epoch": 4.941339590443686, + "grad_norm": 0.1269458829346531, + "learning_rate": 3.858132843352881e-05, + "loss": 0.8874, + "step": 23165 + }, + { + "epoch": 4.941552901023891, + "grad_norm": 0.13424614077244543, + "learning_rate": 3.8574048206039895e-05, + "loss": 0.9016, + "step": 23166 + }, + { + "epoch": 4.9417662116040955, + "grad_norm": 0.11971731884479535, + "learning_rate": 3.856676844728632e-05, + "loss": 0.8988, + "step": 23167 + }, + { + "epoch": 4.9419795221843, + "grad_norm": 0.14794360678491777, + "learning_rate": 3.85594891573505e-05, + "loss": 0.8907, + "step": 23168 + }, + { + "epoch": 4.942192832764505, + "grad_norm": 0.13033415517526234, + "learning_rate": 3.855221033631469e-05, + "loss": 0.9193, + "step": 23169 + }, + { + "epoch": 4.9424061433447095, + "grad_norm": 0.14514534797294723, + "learning_rate": 3.854493198426134e-05, + "loss": 0.8919, + "step": 23170 + }, + { + "epoch": 4.942619453924914, + "grad_norm": 0.14608986454877376, + "learning_rate": 3.8537654101272794e-05, + "loss": 0.9247, + "step": 23171 + }, + { + "epoch": 4.94283276450512, + "grad_norm": 0.14564748718447781, + "learning_rate": 3.853037668743138e-05, + "loss": 0.9302, + "step": 23172 + }, + { + "epoch": 4.9430460750853245, + "grad_norm": 0.14774406162530251, + "learning_rate": 3.852309974281942e-05, + "loss": 0.9033, + "step": 23173 + }, + { + "epoch": 4.943259385665529, + "grad_norm": 0.15967432885047841, + "learning_rate": 3.851582326751933e-05, + "loss": 0.8734, + "step": 23174 + }, + { + "epoch": 4.943472696245734, + "grad_norm": 0.14605702559631809, + "learning_rate": 3.8508547261613384e-05, + "loss": 0.928, + "step": 23175 + }, + { + "epoch": 4.9436860068259385, + "grad_norm": 0.14647251632692257, + "learning_rate": 3.850127172518392e-05, + "loss": 0.8979, + "step": 23176 + }, + { + "epoch": 4.943899317406143, + "grad_norm": 0.17654187448806524, + "learning_rate": 3.8493996658313244e-05, + "loss": 0.88, + "step": 23177 + }, + { + "epoch": 4.944112627986348, + "grad_norm": 0.17940268756362895, + "learning_rate": 3.848672206108372e-05, + "loss": 0.8928, + "step": 23178 + }, + { + "epoch": 4.944325938566553, + "grad_norm": 0.12247367034844292, + "learning_rate": 3.847944793357766e-05, + "loss": 0.8973, + "step": 23179 + }, + { + "epoch": 4.944539249146757, + "grad_norm": 0.13282069116329534, + "learning_rate": 3.847217427587733e-05, + "loss": 0.9054, + "step": 23180 + }, + { + "epoch": 4.944752559726963, + "grad_norm": 0.12890780158511136, + "learning_rate": 3.8464901088065e-05, + "loss": 0.9023, + "step": 23181 + }, + { + "epoch": 4.9449658703071675, + "grad_norm": 0.12475985843955777, + "learning_rate": 3.845762837022306e-05, + "loss": 0.9112, + "step": 23182 + }, + { + "epoch": 4.945179180887372, + "grad_norm": 0.14309152848502157, + "learning_rate": 3.845035612243374e-05, + "loss": 0.8885, + "step": 23183 + }, + { + "epoch": 4.945392491467577, + "grad_norm": 0.16129634143118157, + "learning_rate": 3.844308434477934e-05, + "loss": 0.9044, + "step": 23184 + }, + { + "epoch": 4.9456058020477816, + "grad_norm": 0.14149343243420415, + "learning_rate": 3.843581303734211e-05, + "loss": 0.9023, + "step": 23185 + }, + { + "epoch": 4.945819112627986, + "grad_norm": 0.14000398770219785, + "learning_rate": 3.8428542200204384e-05, + "loss": 0.9136, + "step": 23186 + }, + { + "epoch": 4.946032423208191, + "grad_norm": 0.13622368650266006, + "learning_rate": 3.842127183344841e-05, + "loss": 0.8818, + "step": 23187 + }, + { + "epoch": 4.946245733788396, + "grad_norm": 0.1542347505070983, + "learning_rate": 3.841400193715644e-05, + "loss": 0.8965, + "step": 23188 + }, + { + "epoch": 4.9464590443686, + "grad_norm": 0.1411740238224244, + "learning_rate": 3.8406732511410704e-05, + "loss": 0.9023, + "step": 23189 + }, + { + "epoch": 4.946672354948806, + "grad_norm": 0.15088975853584108, + "learning_rate": 3.839946355629355e-05, + "loss": 0.9104, + "step": 23190 + }, + { + "epoch": 4.9468856655290105, + "grad_norm": 0.14810463106826394, + "learning_rate": 3.8392195071887145e-05, + "loss": 0.8863, + "step": 23191 + }, + { + "epoch": 4.947098976109215, + "grad_norm": 0.14688631710527425, + "learning_rate": 3.8384927058273745e-05, + "loss": 0.8815, + "step": 23192 + }, + { + "epoch": 4.94731228668942, + "grad_norm": 0.14621402012116472, + "learning_rate": 3.837765951553556e-05, + "loss": 0.8648, + "step": 23193 + }, + { + "epoch": 4.947525597269625, + "grad_norm": 0.14460781301123868, + "learning_rate": 3.8370392443754895e-05, + "loss": 0.9081, + "step": 23194 + }, + { + "epoch": 4.947738907849829, + "grad_norm": 0.13670002218517674, + "learning_rate": 3.836312584301394e-05, + "loss": 0.9075, + "step": 23195 + }, + { + "epoch": 4.947952218430034, + "grad_norm": 0.1285849671356044, + "learning_rate": 3.835585971339491e-05, + "loss": 0.9245, + "step": 23196 + }, + { + "epoch": 4.948165529010239, + "grad_norm": 0.14767728004436603, + "learning_rate": 3.8348594054980005e-05, + "loss": 0.9205, + "step": 23197 + }, + { + "epoch": 4.948378839590443, + "grad_norm": 0.13181888628763158, + "learning_rate": 3.834132886785149e-05, + "loss": 0.9052, + "step": 23198 + }, + { + "epoch": 4.948592150170649, + "grad_norm": 0.14370641945373228, + "learning_rate": 3.8334064152091526e-05, + "loss": 0.8906, + "step": 23199 + }, + { + "epoch": 4.948805460750854, + "grad_norm": 0.13738387995589865, + "learning_rate": 3.8326799907782344e-05, + "loss": 0.9035, + "step": 23200 + }, + { + "epoch": 4.949018771331058, + "grad_norm": 0.1721127172140413, + "learning_rate": 3.831953613500611e-05, + "loss": 0.8986, + "step": 23201 + }, + { + "epoch": 4.949232081911263, + "grad_norm": 0.14484148650359116, + "learning_rate": 3.831227283384503e-05, + "loss": 0.9095, + "step": 23202 + }, + { + "epoch": 4.949445392491468, + "grad_norm": 0.16568469651097115, + "learning_rate": 3.830501000438129e-05, + "loss": 0.8901, + "step": 23203 + }, + { + "epoch": 4.949658703071672, + "grad_norm": 0.17488231361233977, + "learning_rate": 3.829774764669706e-05, + "loss": 0.9211, + "step": 23204 + }, + { + "epoch": 4.949872013651877, + "grad_norm": 0.15741626559300262, + "learning_rate": 3.8290485760874485e-05, + "loss": 0.9044, + "step": 23205 + }, + { + "epoch": 4.950085324232082, + "grad_norm": 0.156606636088189, + "learning_rate": 3.828322434699579e-05, + "loss": 0.9015, + "step": 23206 + }, + { + "epoch": 4.950298634812286, + "grad_norm": 0.15904045831605346, + "learning_rate": 3.827596340514312e-05, + "loss": 0.9028, + "step": 23207 + }, + { + "epoch": 4.950511945392492, + "grad_norm": 0.17885080207436177, + "learning_rate": 3.826870293539862e-05, + "loss": 0.9196, + "step": 23208 + }, + { + "epoch": 4.950725255972696, + "grad_norm": 0.12855638787560883, + "learning_rate": 3.826144293784443e-05, + "loss": 0.8899, + "step": 23209 + }, + { + "epoch": 4.950938566552901, + "grad_norm": 0.1561593207015214, + "learning_rate": 3.8254183412562736e-05, + "loss": 0.8993, + "step": 23210 + }, + { + "epoch": 4.951151877133106, + "grad_norm": 0.14286043965920422, + "learning_rate": 3.8246924359635655e-05, + "loss": 0.8732, + "step": 23211 + }, + { + "epoch": 4.951365187713311, + "grad_norm": 0.15486437458501232, + "learning_rate": 3.823966577914533e-05, + "loss": 0.9217, + "step": 23212 + }, + { + "epoch": 4.951578498293515, + "grad_norm": 0.14673445223293338, + "learning_rate": 3.8232407671173894e-05, + "loss": 0.9028, + "step": 23213 + }, + { + "epoch": 4.95179180887372, + "grad_norm": 0.14432777909229677, + "learning_rate": 3.8225150035803464e-05, + "loss": 0.9304, + "step": 23214 + }, + { + "epoch": 4.952005119453925, + "grad_norm": 0.1656633301118264, + "learning_rate": 3.8217892873116165e-05, + "loss": 0.9121, + "step": 23215 + }, + { + "epoch": 4.952218430034129, + "grad_norm": 0.13217554784844335, + "learning_rate": 3.821063618319412e-05, + "loss": 0.8869, + "step": 23216 + }, + { + "epoch": 4.952431740614334, + "grad_norm": 0.1971348174342842, + "learning_rate": 3.8203379966119386e-05, + "loss": 0.8917, + "step": 23217 + }, + { + "epoch": 4.952645051194539, + "grad_norm": 0.1656396309555202, + "learning_rate": 3.819612422197414e-05, + "loss": 0.8947, + "step": 23218 + }, + { + "epoch": 4.952858361774744, + "grad_norm": 0.12978489122280457, + "learning_rate": 3.818886895084046e-05, + "loss": 0.9253, + "step": 23219 + }, + { + "epoch": 4.953071672354949, + "grad_norm": 0.14454042633073735, + "learning_rate": 3.8181614152800426e-05, + "loss": 0.8913, + "step": 23220 + }, + { + "epoch": 4.953284982935154, + "grad_norm": 0.14031983150510846, + "learning_rate": 3.81743598279361e-05, + "loss": 0.8821, + "step": 23221 + }, + { + "epoch": 4.953498293515358, + "grad_norm": 0.14465619563580445, + "learning_rate": 3.8167105976329636e-05, + "loss": 0.92, + "step": 23222 + }, + { + "epoch": 4.953711604095563, + "grad_norm": 0.16812278930446234, + "learning_rate": 3.815985259806306e-05, + "loss": 0.891, + "step": 23223 + }, + { + "epoch": 4.953924914675768, + "grad_norm": 0.139325678078141, + "learning_rate": 3.815259969321846e-05, + "loss": 0.9087, + "step": 23224 + }, + { + "epoch": 4.9541382252559725, + "grad_norm": 0.18119361563820932, + "learning_rate": 3.8145347261877906e-05, + "loss": 0.9118, + "step": 23225 + }, + { + "epoch": 4.954351535836177, + "grad_norm": 0.18938258527614277, + "learning_rate": 3.813809530412344e-05, + "loss": 0.9232, + "step": 23226 + }, + { + "epoch": 4.954564846416382, + "grad_norm": 0.14609355895271964, + "learning_rate": 3.8130843820037135e-05, + "loss": 0.9362, + "step": 23227 + }, + { + "epoch": 4.954778156996587, + "grad_norm": 0.2055886028653513, + "learning_rate": 3.8123592809701034e-05, + "loss": 0.8755, + "step": 23228 + }, + { + "epoch": 4.954991467576792, + "grad_norm": 0.12487516494551175, + "learning_rate": 3.811634227319716e-05, + "loss": 0.883, + "step": 23229 + }, + { + "epoch": 4.955204778156997, + "grad_norm": 0.15823245663925034, + "learning_rate": 3.8109092210607596e-05, + "loss": 0.9124, + "step": 23230 + }, + { + "epoch": 4.9554180887372015, + "grad_norm": 0.1557340373265045, + "learning_rate": 3.810184262201437e-05, + "loss": 0.9019, + "step": 23231 + }, + { + "epoch": 4.955631399317406, + "grad_norm": 0.1468917113868215, + "learning_rate": 3.80945935074995e-05, + "loss": 0.8964, + "step": 23232 + }, + { + "epoch": 4.955844709897611, + "grad_norm": 0.15045983666686216, + "learning_rate": 3.808734486714497e-05, + "loss": 0.8826, + "step": 23233 + }, + { + "epoch": 4.9560580204778155, + "grad_norm": 0.1288369136677978, + "learning_rate": 3.808009670103287e-05, + "loss": 0.9137, + "step": 23234 + }, + { + "epoch": 4.95627133105802, + "grad_norm": 0.14664407390319012, + "learning_rate": 3.807284900924522e-05, + "loss": 0.9111, + "step": 23235 + }, + { + "epoch": 4.956484641638225, + "grad_norm": 0.13416372993182135, + "learning_rate": 3.806560179186392e-05, + "loss": 0.9253, + "step": 23236 + }, + { + "epoch": 4.9566979522184305, + "grad_norm": 0.14907667626217777, + "learning_rate": 3.805835504897108e-05, + "loss": 0.9158, + "step": 23237 + }, + { + "epoch": 4.956911262798635, + "grad_norm": 0.1678827915394058, + "learning_rate": 3.8051108780648645e-05, + "loss": 0.9167, + "step": 23238 + }, + { + "epoch": 4.95712457337884, + "grad_norm": 0.13568898271745808, + "learning_rate": 3.804386298697863e-05, + "loss": 0.9312, + "step": 23239 + }, + { + "epoch": 4.9573378839590445, + "grad_norm": 0.16885985307497814, + "learning_rate": 3.8036617668043e-05, + "loss": 0.905, + "step": 23240 + }, + { + "epoch": 4.957551194539249, + "grad_norm": 0.1324037731618426, + "learning_rate": 3.802937282392373e-05, + "loss": 0.8611, + "step": 23241 + }, + { + "epoch": 4.957764505119454, + "grad_norm": 0.19577752733700057, + "learning_rate": 3.8022128454702844e-05, + "loss": 0.8862, + "step": 23242 + }, + { + "epoch": 4.957977815699659, + "grad_norm": 0.17292611248134263, + "learning_rate": 3.801488456046227e-05, + "loss": 0.8767, + "step": 23243 + }, + { + "epoch": 4.958191126279863, + "grad_norm": 0.1438927225326727, + "learning_rate": 3.800764114128399e-05, + "loss": 0.8889, + "step": 23244 + }, + { + "epoch": 4.958404436860068, + "grad_norm": 0.1894898293896863, + "learning_rate": 3.800039819724992e-05, + "loss": 0.9096, + "step": 23245 + }, + { + "epoch": 4.9586177474402735, + "grad_norm": 0.1288844158175094, + "learning_rate": 3.799315572844211e-05, + "loss": 0.8887, + "step": 23246 + }, + { + "epoch": 4.958831058020478, + "grad_norm": 0.18227654708923505, + "learning_rate": 3.798591373494243e-05, + "loss": 0.9192, + "step": 23247 + }, + { + "epoch": 4.959044368600683, + "grad_norm": 0.1350895852184587, + "learning_rate": 3.7978672216832804e-05, + "loss": 0.8952, + "step": 23248 + }, + { + "epoch": 4.959257679180888, + "grad_norm": 0.197799098520417, + "learning_rate": 3.7971431174195245e-05, + "loss": 0.8858, + "step": 23249 + }, + { + "epoch": 4.959470989761092, + "grad_norm": 0.13217625524191304, + "learning_rate": 3.796419060711165e-05, + "loss": 0.9206, + "step": 23250 + }, + { + "epoch": 4.959684300341297, + "grad_norm": 0.1891581545222146, + "learning_rate": 3.7956950515663934e-05, + "loss": 0.8908, + "step": 23251 + }, + { + "epoch": 4.959897610921502, + "grad_norm": 0.15681167075455987, + "learning_rate": 3.794971089993404e-05, + "loss": 0.9149, + "step": 23252 + }, + { + "epoch": 4.960110921501706, + "grad_norm": 0.18030866700755754, + "learning_rate": 3.794247176000384e-05, + "loss": 0.9315, + "step": 23253 + }, + { + "epoch": 4.960324232081911, + "grad_norm": 0.14037493738411969, + "learning_rate": 3.7935233095955313e-05, + "loss": 0.8843, + "step": 23254 + }, + { + "epoch": 4.960537542662116, + "grad_norm": 0.1697816065263737, + "learning_rate": 3.792799490787032e-05, + "loss": 0.8903, + "step": 23255 + }, + { + "epoch": 4.96075085324232, + "grad_norm": 0.1481272033547861, + "learning_rate": 3.7920757195830785e-05, + "loss": 0.9028, + "step": 23256 + }, + { + "epoch": 4.960964163822526, + "grad_norm": 0.18410034811802625, + "learning_rate": 3.791351995991859e-05, + "loss": 0.919, + "step": 23257 + }, + { + "epoch": 4.961177474402731, + "grad_norm": 0.16716104509245655, + "learning_rate": 3.7906283200215624e-05, + "loss": 0.9317, + "step": 23258 + }, + { + "epoch": 4.961390784982935, + "grad_norm": 0.14137570918495465, + "learning_rate": 3.7899046916803766e-05, + "loss": 0.8998, + "step": 23259 + }, + { + "epoch": 4.96160409556314, + "grad_norm": 0.17172533431774906, + "learning_rate": 3.789181110976487e-05, + "loss": 0.9256, + "step": 23260 + }, + { + "epoch": 4.961817406143345, + "grad_norm": 0.13055591465403799, + "learning_rate": 3.788457577918087e-05, + "loss": 0.8836, + "step": 23261 + }, + { + "epoch": 4.962030716723549, + "grad_norm": 0.1560594488133023, + "learning_rate": 3.7877340925133613e-05, + "loss": 0.8905, + "step": 23262 + }, + { + "epoch": 4.962244027303754, + "grad_norm": 0.1368803599803405, + "learning_rate": 3.7870106547704946e-05, + "loss": 0.8825, + "step": 23263 + }, + { + "epoch": 4.962457337883959, + "grad_norm": 0.14202188346900138, + "learning_rate": 3.7862872646976724e-05, + "loss": 0.9046, + "step": 23264 + }, + { + "epoch": 4.962670648464163, + "grad_norm": 0.12917179664316383, + "learning_rate": 3.7855639223030786e-05, + "loss": 0.8606, + "step": 23265 + }, + { + "epoch": 4.962883959044369, + "grad_norm": 0.134003871907523, + "learning_rate": 3.784840627594903e-05, + "loss": 0.9236, + "step": 23266 + }, + { + "epoch": 4.963097269624574, + "grad_norm": 0.13127000051050894, + "learning_rate": 3.784117380581326e-05, + "loss": 0.8884, + "step": 23267 + }, + { + "epoch": 4.963310580204778, + "grad_norm": 0.123346512791377, + "learning_rate": 3.783394181270532e-05, + "loss": 0.9304, + "step": 23268 + }, + { + "epoch": 4.963523890784983, + "grad_norm": 0.12509405622723746, + "learning_rate": 3.782671029670704e-05, + "loss": 0.9113, + "step": 23269 + }, + { + "epoch": 4.963737201365188, + "grad_norm": 0.1313631361170485, + "learning_rate": 3.781947925790024e-05, + "loss": 0.9001, + "step": 23270 + }, + { + "epoch": 4.963950511945392, + "grad_norm": 0.12174233565498968, + "learning_rate": 3.781224869636674e-05, + "loss": 0.9341, + "step": 23271 + }, + { + "epoch": 4.964163822525597, + "grad_norm": 0.1371402089282955, + "learning_rate": 3.780501861218833e-05, + "loss": 0.885, + "step": 23272 + }, + { + "epoch": 4.964377133105802, + "grad_norm": 0.14348529856386277, + "learning_rate": 3.779778900544687e-05, + "loss": 0.9291, + "step": 23273 + }, + { + "epoch": 4.964590443686006, + "grad_norm": 0.13709926008345047, + "learning_rate": 3.779055987622414e-05, + "loss": 0.885, + "step": 23274 + }, + { + "epoch": 4.964803754266212, + "grad_norm": 0.1542790949593442, + "learning_rate": 3.778333122460193e-05, + "loss": 0.9394, + "step": 23275 + }, + { + "epoch": 4.965017064846417, + "grad_norm": 0.12189768116782983, + "learning_rate": 3.7776103050662044e-05, + "loss": 0.8985, + "step": 23276 + }, + { + "epoch": 4.965230375426621, + "grad_norm": 0.15374322569666032, + "learning_rate": 3.776887535448623e-05, + "loss": 0.9154, + "step": 23277 + }, + { + "epoch": 4.965443686006826, + "grad_norm": 0.11919763166078849, + "learning_rate": 3.776164813615634e-05, + "loss": 0.9088, + "step": 23278 + }, + { + "epoch": 4.965656996587031, + "grad_norm": 0.1490096062104433, + "learning_rate": 3.775442139575413e-05, + "loss": 0.8843, + "step": 23279 + }, + { + "epoch": 4.965870307167235, + "grad_norm": 0.12898907910138893, + "learning_rate": 3.774719513336129e-05, + "loss": 0.8982, + "step": 23280 + }, + { + "epoch": 4.96608361774744, + "grad_norm": 0.14490703949384418, + "learning_rate": 3.7739969349059685e-05, + "loss": 0.9114, + "step": 23281 + }, + { + "epoch": 4.966296928327645, + "grad_norm": 0.13240401898974125, + "learning_rate": 3.773274404293103e-05, + "loss": 0.8863, + "step": 23282 + }, + { + "epoch": 4.9665102389078495, + "grad_norm": 0.14130053465524428, + "learning_rate": 3.772551921505709e-05, + "loss": 0.8927, + "step": 23283 + }, + { + "epoch": 4.966723549488055, + "grad_norm": 0.11648910654318069, + "learning_rate": 3.7718294865519595e-05, + "loss": 0.8805, + "step": 23284 + }, + { + "epoch": 4.96693686006826, + "grad_norm": 0.1403427209285768, + "learning_rate": 3.7711070994400326e-05, + "loss": 0.8949, + "step": 23285 + }, + { + "epoch": 4.967150170648464, + "grad_norm": 0.13078824602878716, + "learning_rate": 3.770384760178099e-05, + "loss": 0.9119, + "step": 23286 + }, + { + "epoch": 4.967363481228669, + "grad_norm": 0.14078345190307145, + "learning_rate": 3.769662468774334e-05, + "loss": 0.8974, + "step": 23287 + }, + { + "epoch": 4.967576791808874, + "grad_norm": 0.138450198357375, + "learning_rate": 3.7689402252369093e-05, + "loss": 0.9402, + "step": 23288 + }, + { + "epoch": 4.9677901023890785, + "grad_norm": 0.1368075880118126, + "learning_rate": 3.7682180295739936e-05, + "loss": 0.9218, + "step": 23289 + }, + { + "epoch": 4.968003412969283, + "grad_norm": 0.1367788083630794, + "learning_rate": 3.7674958817937656e-05, + "loss": 0.8691, + "step": 23290 + }, + { + "epoch": 4.968216723549488, + "grad_norm": 0.1400314142504057, + "learning_rate": 3.7667737819043956e-05, + "loss": 0.8905, + "step": 23291 + }, + { + "epoch": 4.9684300341296925, + "grad_norm": 0.13848210555850016, + "learning_rate": 3.766051729914046e-05, + "loss": 0.9102, + "step": 23292 + }, + { + "epoch": 4.968643344709898, + "grad_norm": 0.1355585693521827, + "learning_rate": 3.7653297258308944e-05, + "loss": 0.9057, + "step": 23293 + }, + { + "epoch": 4.968856655290102, + "grad_norm": 0.1363782391893472, + "learning_rate": 3.764607769663108e-05, + "loss": 0.9077, + "step": 23294 + }, + { + "epoch": 4.9690699658703075, + "grad_norm": 0.13039169664907743, + "learning_rate": 3.763885861418858e-05, + "loss": 0.914, + "step": 23295 + }, + { + "epoch": 4.969283276450512, + "grad_norm": 0.12632244157836525, + "learning_rate": 3.763164001106305e-05, + "loss": 0.9233, + "step": 23296 + }, + { + "epoch": 4.969496587030717, + "grad_norm": 0.1445139096995176, + "learning_rate": 3.7624421887336275e-05, + "loss": 0.9423, + "step": 23297 + }, + { + "epoch": 4.9697098976109215, + "grad_norm": 0.15694764797504077, + "learning_rate": 3.761720424308987e-05, + "loss": 0.8979, + "step": 23298 + }, + { + "epoch": 4.969923208191126, + "grad_norm": 0.1371104415412806, + "learning_rate": 3.7609987078405515e-05, + "loss": 0.8778, + "step": 23299 + }, + { + "epoch": 4.970136518771331, + "grad_norm": 0.16183239397364046, + "learning_rate": 3.760277039336487e-05, + "loss": 0.9198, + "step": 23300 + }, + { + "epoch": 4.970349829351536, + "grad_norm": 0.1261306214575447, + "learning_rate": 3.759555418804956e-05, + "loss": 0.8899, + "step": 23301 + }, + { + "epoch": 4.97056313993174, + "grad_norm": 0.14563425320086396, + "learning_rate": 3.758833846254133e-05, + "loss": 0.9175, + "step": 23302 + }, + { + "epoch": 4.970776450511945, + "grad_norm": 0.1375679956194378, + "learning_rate": 3.758112321692173e-05, + "loss": 0.9199, + "step": 23303 + }, + { + "epoch": 4.9709897610921505, + "grad_norm": 0.15027719847195073, + "learning_rate": 3.7573908451272406e-05, + "loss": 0.9422, + "step": 23304 + }, + { + "epoch": 4.971203071672355, + "grad_norm": 0.14817858834181133, + "learning_rate": 3.756669416567505e-05, + "loss": 0.9232, + "step": 23305 + }, + { + "epoch": 4.97141638225256, + "grad_norm": 0.16890925897796447, + "learning_rate": 3.755948036021127e-05, + "loss": 0.8808, + "step": 23306 + }, + { + "epoch": 4.971629692832765, + "grad_norm": 0.1264009959228601, + "learning_rate": 3.755226703496267e-05, + "loss": 0.8859, + "step": 23307 + }, + { + "epoch": 4.971843003412969, + "grad_norm": 0.17539094110377693, + "learning_rate": 3.7545054190010864e-05, + "loss": 0.8934, + "step": 23308 + }, + { + "epoch": 4.972056313993174, + "grad_norm": 0.14830321061900054, + "learning_rate": 3.753784182543751e-05, + "loss": 0.8743, + "step": 23309 + }, + { + "epoch": 4.972269624573379, + "grad_norm": 0.16515916043715362, + "learning_rate": 3.753062994132419e-05, + "loss": 0.8942, + "step": 23310 + }, + { + "epoch": 4.972482935153583, + "grad_norm": 0.17333579336605903, + "learning_rate": 3.7523418537752504e-05, + "loss": 0.895, + "step": 23311 + }, + { + "epoch": 4.972696245733788, + "grad_norm": 0.14109785094519195, + "learning_rate": 3.751620761480406e-05, + "loss": 0.8999, + "step": 23312 + }, + { + "epoch": 4.972909556313994, + "grad_norm": 0.16796241967811198, + "learning_rate": 3.750899717256044e-05, + "loss": 0.9479, + "step": 23313 + }, + { + "epoch": 4.973122866894198, + "grad_norm": 0.14742793615367142, + "learning_rate": 3.750178721110322e-05, + "loss": 0.9083, + "step": 23314 + }, + { + "epoch": 4.973336177474403, + "grad_norm": 0.1433015506046879, + "learning_rate": 3.7494577730514006e-05, + "loss": 0.9008, + "step": 23315 + }, + { + "epoch": 4.973549488054608, + "grad_norm": 0.15407602797505737, + "learning_rate": 3.748736873087432e-05, + "loss": 0.8919, + "step": 23316 + }, + { + "epoch": 4.973762798634812, + "grad_norm": 0.15316176856527394, + "learning_rate": 3.7480160212265804e-05, + "loss": 0.9068, + "step": 23317 + }, + { + "epoch": 4.973976109215017, + "grad_norm": 0.15014572959421552, + "learning_rate": 3.7472952174769996e-05, + "loss": 0.8823, + "step": 23318 + }, + { + "epoch": 4.974189419795222, + "grad_norm": 0.20017637514445855, + "learning_rate": 3.746574461846844e-05, + "loss": 0.8988, + "step": 23319 + }, + { + "epoch": 4.974402730375426, + "grad_norm": 0.14499448824555122, + "learning_rate": 3.745853754344268e-05, + "loss": 0.898, + "step": 23320 + }, + { + "epoch": 4.974616040955631, + "grad_norm": 0.1385928621140934, + "learning_rate": 3.745133094977431e-05, + "loss": 0.8845, + "step": 23321 + }, + { + "epoch": 4.974829351535837, + "grad_norm": 0.14830864279050648, + "learning_rate": 3.7444124837544845e-05, + "loss": 0.9457, + "step": 23322 + }, + { + "epoch": 4.975042662116041, + "grad_norm": 0.165867011688619, + "learning_rate": 3.743691920683582e-05, + "loss": 0.906, + "step": 23323 + }, + { + "epoch": 4.975255972696246, + "grad_norm": 0.142739211747654, + "learning_rate": 3.7429714057728775e-05, + "loss": 0.9106, + "step": 23324 + }, + { + "epoch": 4.975469283276451, + "grad_norm": 0.16329232771412655, + "learning_rate": 3.742250939030522e-05, + "loss": 0.8939, + "step": 23325 + }, + { + "epoch": 4.975682593856655, + "grad_norm": 0.14765191805471156, + "learning_rate": 3.7415305204646705e-05, + "loss": 0.9075, + "step": 23326 + }, + { + "epoch": 4.97589590443686, + "grad_norm": 0.14976006008110218, + "learning_rate": 3.740810150083472e-05, + "loss": 0.9358, + "step": 23327 + }, + { + "epoch": 4.976109215017065, + "grad_norm": 0.14032096785570575, + "learning_rate": 3.740089827895075e-05, + "loss": 0.9249, + "step": 23328 + }, + { + "epoch": 4.976322525597269, + "grad_norm": 0.16944037103734766, + "learning_rate": 3.7393695539076366e-05, + "loss": 0.8951, + "step": 23329 + }, + { + "epoch": 4.976535836177474, + "grad_norm": 0.15448224024008333, + "learning_rate": 3.7386493281293034e-05, + "loss": 0.9016, + "step": 23330 + }, + { + "epoch": 4.97674914675768, + "grad_norm": 0.1413799844268028, + "learning_rate": 3.7379291505682246e-05, + "loss": 0.9255, + "step": 23331 + }, + { + "epoch": 4.976962457337884, + "grad_norm": 0.1329758853195486, + "learning_rate": 3.737209021232546e-05, + "loss": 0.8719, + "step": 23332 + }, + { + "epoch": 4.977175767918089, + "grad_norm": 0.15368767367599856, + "learning_rate": 3.736488940130422e-05, + "loss": 0.8849, + "step": 23333 + }, + { + "epoch": 4.977389078498294, + "grad_norm": 0.12821033354315353, + "learning_rate": 3.735768907269998e-05, + "loss": 0.905, + "step": 23334 + }, + { + "epoch": 4.977602389078498, + "grad_norm": 0.149675449188176, + "learning_rate": 3.7350489226594226e-05, + "loss": 0.8866, + "step": 23335 + }, + { + "epoch": 4.977815699658703, + "grad_norm": 0.14481500028920585, + "learning_rate": 3.7343289863068354e-05, + "loss": 0.889, + "step": 23336 + }, + { + "epoch": 4.978029010238908, + "grad_norm": 0.15331603098101593, + "learning_rate": 3.73360909822039e-05, + "loss": 0.9091, + "step": 23337 + }, + { + "epoch": 4.978242320819112, + "grad_norm": 0.13063181309405208, + "learning_rate": 3.732889258408229e-05, + "loss": 0.8977, + "step": 23338 + }, + { + "epoch": 4.978455631399317, + "grad_norm": 0.13247791677542342, + "learning_rate": 3.7321694668784976e-05, + "loss": 0.9038, + "step": 23339 + }, + { + "epoch": 4.978668941979522, + "grad_norm": 0.15444160736482496, + "learning_rate": 3.7314497236393385e-05, + "loss": 0.9418, + "step": 23340 + }, + { + "epoch": 4.9788822525597265, + "grad_norm": 0.15486683375431334, + "learning_rate": 3.7307300286989e-05, + "loss": 0.9149, + "step": 23341 + }, + { + "epoch": 4.979095563139932, + "grad_norm": 0.1276393753301223, + "learning_rate": 3.730010382065323e-05, + "loss": 0.9249, + "step": 23342 + }, + { + "epoch": 4.979308873720137, + "grad_norm": 0.15435208281344429, + "learning_rate": 3.72929078374675e-05, + "loss": 0.9073, + "step": 23343 + }, + { + "epoch": 4.979522184300341, + "grad_norm": 0.14796200607891552, + "learning_rate": 3.7285712337513214e-05, + "loss": 0.906, + "step": 23344 + }, + { + "epoch": 4.979735494880546, + "grad_norm": 0.12917833130258646, + "learning_rate": 3.727851732087184e-05, + "loss": 0.9123, + "step": 23345 + }, + { + "epoch": 4.979948805460751, + "grad_norm": 0.12420409993046075, + "learning_rate": 3.727132278762476e-05, + "loss": 0.876, + "step": 23346 + }, + { + "epoch": 4.9801621160409555, + "grad_norm": 0.15575180571041142, + "learning_rate": 3.7264128737853414e-05, + "loss": 0.8682, + "step": 23347 + }, + { + "epoch": 4.98037542662116, + "grad_norm": 0.12250350709217782, + "learning_rate": 3.72569351716391e-05, + "loss": 0.9126, + "step": 23348 + }, + { + "epoch": 4.980588737201365, + "grad_norm": 0.15324383916756648, + "learning_rate": 3.7249742089063324e-05, + "loss": 0.9195, + "step": 23349 + }, + { + "epoch": 4.9808020477815695, + "grad_norm": 0.1361120211704319, + "learning_rate": 3.724254949020743e-05, + "loss": 0.9096, + "step": 23350 + }, + { + "epoch": 4.981015358361775, + "grad_norm": 0.14138527915803367, + "learning_rate": 3.723535737515281e-05, + "loss": 0.8951, + "step": 23351 + }, + { + "epoch": 4.98122866894198, + "grad_norm": 0.14306899521085056, + "learning_rate": 3.722816574398081e-05, + "loss": 0.8911, + "step": 23352 + }, + { + "epoch": 4.9814419795221845, + "grad_norm": 0.12150613698542762, + "learning_rate": 3.7220974596772866e-05, + "loss": 0.8862, + "step": 23353 + }, + { + "epoch": 4.981655290102389, + "grad_norm": 0.12428497191513467, + "learning_rate": 3.7213783933610305e-05, + "loss": 0.8844, + "step": 23354 + }, + { + "epoch": 4.981868600682594, + "grad_norm": 0.17575919501310636, + "learning_rate": 3.720659375457451e-05, + "loss": 0.9079, + "step": 23355 + }, + { + "epoch": 4.9820819112627985, + "grad_norm": 0.15042459871132222, + "learning_rate": 3.7199404059746796e-05, + "loss": 0.8766, + "step": 23356 + }, + { + "epoch": 4.982295221843003, + "grad_norm": 0.1379098401497641, + "learning_rate": 3.7192214849208566e-05, + "loss": 0.9028, + "step": 23357 + }, + { + "epoch": 4.982508532423208, + "grad_norm": 0.13738409416534922, + "learning_rate": 3.718502612304118e-05, + "loss": 0.8717, + "step": 23358 + }, + { + "epoch": 4.982721843003413, + "grad_norm": 0.11918991766333853, + "learning_rate": 3.717783788132591e-05, + "loss": 0.8587, + "step": 23359 + }, + { + "epoch": 4.982935153583618, + "grad_norm": 0.1308012253888694, + "learning_rate": 3.71706501241441e-05, + "loss": 0.9121, + "step": 23360 + }, + { + "epoch": 4.983148464163823, + "grad_norm": 0.1221534842510485, + "learning_rate": 3.716346285157714e-05, + "loss": 0.8751, + "step": 23361 + }, + { + "epoch": 4.9833617747440275, + "grad_norm": 0.1362580714857126, + "learning_rate": 3.7156276063706306e-05, + "loss": 0.9119, + "step": 23362 + }, + { + "epoch": 4.983575085324232, + "grad_norm": 0.13812823259882, + "learning_rate": 3.714908976061294e-05, + "loss": 0.8739, + "step": 23363 + }, + { + "epoch": 4.983788395904437, + "grad_norm": 0.11790976448628313, + "learning_rate": 3.714190394237831e-05, + "loss": 0.8791, + "step": 23364 + }, + { + "epoch": 4.984001706484642, + "grad_norm": 0.14498113191909642, + "learning_rate": 3.713471860908378e-05, + "loss": 0.8861, + "step": 23365 + }, + { + "epoch": 4.984215017064846, + "grad_norm": 0.15302722536137411, + "learning_rate": 3.7127533760810635e-05, + "loss": 0.921, + "step": 23366 + }, + { + "epoch": 4.984428327645051, + "grad_norm": 0.12990199286769266, + "learning_rate": 3.7120349397640164e-05, + "loss": 0.9012, + "step": 23367 + }, + { + "epoch": 4.984641638225256, + "grad_norm": 0.13503130845479, + "learning_rate": 3.7113165519653636e-05, + "loss": 0.8928, + "step": 23368 + }, + { + "epoch": 4.984854948805461, + "grad_norm": 0.15572496221648363, + "learning_rate": 3.710598212693242e-05, + "loss": 0.9123, + "step": 23369 + }, + { + "epoch": 4.985068259385666, + "grad_norm": 0.12798685481049277, + "learning_rate": 3.7098799219557705e-05, + "loss": 0.9095, + "step": 23370 + }, + { + "epoch": 4.985281569965871, + "grad_norm": 0.1824474697876127, + "learning_rate": 3.7091616797610804e-05, + "loss": 0.8905, + "step": 23371 + }, + { + "epoch": 4.985494880546075, + "grad_norm": 0.170757452040891, + "learning_rate": 3.708443486117294e-05, + "loss": 0.8766, + "step": 23372 + }, + { + "epoch": 4.98570819112628, + "grad_norm": 0.151430404660875, + "learning_rate": 3.707725341032546e-05, + "loss": 0.9066, + "step": 23373 + }, + { + "epoch": 4.985921501706485, + "grad_norm": 0.17989775130013635, + "learning_rate": 3.707007244514958e-05, + "loss": 0.8828, + "step": 23374 + }, + { + "epoch": 4.986134812286689, + "grad_norm": 0.13910237664089847, + "learning_rate": 3.7062891965726545e-05, + "loss": 0.8943, + "step": 23375 + }, + { + "epoch": 4.986348122866894, + "grad_norm": 0.1866808663617275, + "learning_rate": 3.7055711972137574e-05, + "loss": 0.925, + "step": 23376 + }, + { + "epoch": 4.986561433447099, + "grad_norm": 0.20712181324609893, + "learning_rate": 3.704853246446399e-05, + "loss": 0.9062, + "step": 23377 + }, + { + "epoch": 4.986774744027304, + "grad_norm": 0.16112177482309475, + "learning_rate": 3.7041353442786973e-05, + "loss": 0.8632, + "step": 23378 + }, + { + "epoch": 4.986988054607508, + "grad_norm": 0.1793340327777389, + "learning_rate": 3.703417490718777e-05, + "loss": 0.8901, + "step": 23379 + }, + { + "epoch": 4.987201365187714, + "grad_norm": 0.20496840391448765, + "learning_rate": 3.702699685774759e-05, + "loss": 0.9127, + "step": 23380 + }, + { + "epoch": 4.987414675767918, + "grad_norm": 0.1380195157418121, + "learning_rate": 3.701981929454768e-05, + "loss": 0.9091, + "step": 23381 + }, + { + "epoch": 4.987627986348123, + "grad_norm": 0.1874057656368459, + "learning_rate": 3.7012642217669226e-05, + "loss": 0.8961, + "step": 23382 + }, + { + "epoch": 4.987841296928328, + "grad_norm": 0.16813291628638546, + "learning_rate": 3.700546562719344e-05, + "loss": 0.908, + "step": 23383 + }, + { + "epoch": 4.988054607508532, + "grad_norm": 0.16202248906234615, + "learning_rate": 3.699828952320152e-05, + "loss": 0.8996, + "step": 23384 + }, + { + "epoch": 4.988267918088737, + "grad_norm": 0.1904118093679821, + "learning_rate": 3.6991113905774704e-05, + "loss": 0.855, + "step": 23385 + }, + { + "epoch": 4.988481228668942, + "grad_norm": 0.1745341994512346, + "learning_rate": 3.698393877499416e-05, + "loss": 0.8885, + "step": 23386 + }, + { + "epoch": 4.988694539249146, + "grad_norm": 0.1367549026615287, + "learning_rate": 3.6976764130941064e-05, + "loss": 0.878, + "step": 23387 + }, + { + "epoch": 4.988907849829351, + "grad_norm": 0.15211844163109495, + "learning_rate": 3.6969589973696576e-05, + "loss": 0.8848, + "step": 23388 + }, + { + "epoch": 4.989121160409557, + "grad_norm": 0.14597374052970613, + "learning_rate": 3.696241630334194e-05, + "loss": 0.8729, + "step": 23389 + }, + { + "epoch": 4.989334470989761, + "grad_norm": 0.13291303239464355, + "learning_rate": 3.6955243119958276e-05, + "loss": 0.9087, + "step": 23390 + }, + { + "epoch": 4.989547781569966, + "grad_norm": 0.16126006359607903, + "learning_rate": 3.6948070423626795e-05, + "loss": 0.915, + "step": 23391 + }, + { + "epoch": 4.989761092150171, + "grad_norm": 0.13331992511525922, + "learning_rate": 3.6940898214428564e-05, + "loss": 0.8983, + "step": 23392 + }, + { + "epoch": 4.989974402730375, + "grad_norm": 0.17539646685127963, + "learning_rate": 3.693372649244482e-05, + "loss": 0.8944, + "step": 23393 + }, + { + "epoch": 4.99018771331058, + "grad_norm": 0.13714691504810028, + "learning_rate": 3.692655525775669e-05, + "loss": 0.9188, + "step": 23394 + }, + { + "epoch": 4.990401023890785, + "grad_norm": 0.18197725941310322, + "learning_rate": 3.691938451044531e-05, + "loss": 0.887, + "step": 23395 + }, + { + "epoch": 4.9906143344709895, + "grad_norm": 0.2045939579614235, + "learning_rate": 3.691221425059179e-05, + "loss": 0.9482, + "step": 23396 + }, + { + "epoch": 4.990827645051194, + "grad_norm": 0.13620311420980255, + "learning_rate": 3.6905044478277317e-05, + "loss": 0.8806, + "step": 23397 + }, + { + "epoch": 4.9910409556314, + "grad_norm": 0.1996099850440039, + "learning_rate": 3.6897875193582986e-05, + "loss": 0.917, + "step": 23398 + }, + { + "epoch": 4.991254266211604, + "grad_norm": 0.18276231043085983, + "learning_rate": 3.689070639658994e-05, + "loss": 0.888, + "step": 23399 + }, + { + "epoch": 4.991467576791809, + "grad_norm": 0.1786432481806774, + "learning_rate": 3.688353808737923e-05, + "loss": 0.9104, + "step": 23400 + }, + { + "epoch": 4.991680887372014, + "grad_norm": 0.2678435346145175, + "learning_rate": 3.687637026603205e-05, + "loss": 0.8758, + "step": 23401 + }, + { + "epoch": 4.9918941979522184, + "grad_norm": 0.17443087537274418, + "learning_rate": 3.686920293262946e-05, + "loss": 0.896, + "step": 23402 + }, + { + "epoch": 4.992107508532423, + "grad_norm": 0.20859066912798854, + "learning_rate": 3.68620360872526e-05, + "loss": 0.8957, + "step": 23403 + }, + { + "epoch": 4.992320819112628, + "grad_norm": 0.17201299649688295, + "learning_rate": 3.685486972998246e-05, + "loss": 0.8773, + "step": 23404 + }, + { + "epoch": 4.9925341296928325, + "grad_norm": 0.17694402412763094, + "learning_rate": 3.684770386090023e-05, + "loss": 0.9024, + "step": 23405 + }, + { + "epoch": 4.992747440273037, + "grad_norm": 0.2022202549244334, + "learning_rate": 3.684053848008695e-05, + "loss": 0.8961, + "step": 23406 + }, + { + "epoch": 4.992960750853243, + "grad_norm": 0.15565625467914684, + "learning_rate": 3.683337358762371e-05, + "loss": 0.9015, + "step": 23407 + }, + { + "epoch": 4.993174061433447, + "grad_norm": 0.20174612703978093, + "learning_rate": 3.6826209183591546e-05, + "loss": 0.9148, + "step": 23408 + }, + { + "epoch": 4.993387372013652, + "grad_norm": 0.18181813832538096, + "learning_rate": 3.681904526807157e-05, + "loss": 0.9119, + "step": 23409 + }, + { + "epoch": 4.993600682593857, + "grad_norm": 0.14862611456120453, + "learning_rate": 3.6811881841144835e-05, + "loss": 0.9074, + "step": 23410 + }, + { + "epoch": 4.9938139931740615, + "grad_norm": 0.19908687352126295, + "learning_rate": 3.680471890289238e-05, + "loss": 0.9092, + "step": 23411 + }, + { + "epoch": 4.994027303754266, + "grad_norm": 0.1654129829255273, + "learning_rate": 3.679755645339523e-05, + "loss": 0.8974, + "step": 23412 + }, + { + "epoch": 4.994240614334471, + "grad_norm": 0.15757297679305143, + "learning_rate": 3.679039449273448e-05, + "loss": 0.8952, + "step": 23413 + }, + { + "epoch": 4.9944539249146755, + "grad_norm": 0.20831528878400243, + "learning_rate": 3.678323302099117e-05, + "loss": 0.9114, + "step": 23414 + }, + { + "epoch": 4.99466723549488, + "grad_norm": 0.12397838724156304, + "learning_rate": 3.677607203824628e-05, + "loss": 0.9187, + "step": 23415 + }, + { + "epoch": 4.994880546075086, + "grad_norm": 0.21267147236469755, + "learning_rate": 3.6768911544580836e-05, + "loss": 0.894, + "step": 23416 + }, + { + "epoch": 4.9950938566552905, + "grad_norm": 0.2017350675107202, + "learning_rate": 3.676175154007591e-05, + "loss": 0.8814, + "step": 23417 + }, + { + "epoch": 4.995307167235495, + "grad_norm": 0.15351840252772317, + "learning_rate": 3.675459202481249e-05, + "loss": 0.8976, + "step": 23418 + }, + { + "epoch": 4.9955204778157, + "grad_norm": 0.17863000785416794, + "learning_rate": 3.674743299887159e-05, + "loss": 0.8669, + "step": 23419 + }, + { + "epoch": 4.9957337883959045, + "grad_norm": 0.17065034014631938, + "learning_rate": 3.6740274462334184e-05, + "loss": 0.9151, + "step": 23420 + }, + { + "epoch": 4.995947098976109, + "grad_norm": 0.1251903377748286, + "learning_rate": 3.6733116415281334e-05, + "loss": 0.884, + "step": 23421 + }, + { + "epoch": 4.996160409556314, + "grad_norm": 0.156890195349148, + "learning_rate": 3.6725958857794e-05, + "loss": 0.8949, + "step": 23422 + }, + { + "epoch": 4.996373720136519, + "grad_norm": 0.13746674506741852, + "learning_rate": 3.6718801789953166e-05, + "loss": 0.8999, + "step": 23423 + }, + { + "epoch": 4.996587030716723, + "grad_norm": 0.13825876357496322, + "learning_rate": 3.671164521183978e-05, + "loss": 0.891, + "step": 23424 + }, + { + "epoch": 4.996800341296928, + "grad_norm": 0.13496613406594116, + "learning_rate": 3.670448912353493e-05, + "loss": 0.8964, + "step": 23425 + }, + { + "epoch": 4.997013651877133, + "grad_norm": 0.14772115784286413, + "learning_rate": 3.669733352511948e-05, + "loss": 0.9172, + "step": 23426 + }, + { + "epoch": 4.997226962457338, + "grad_norm": 0.1308189585842881, + "learning_rate": 3.669017841667443e-05, + "loss": 0.8846, + "step": 23427 + }, + { + "epoch": 4.997440273037543, + "grad_norm": 0.1589857179473207, + "learning_rate": 3.66830237982807e-05, + "loss": 0.8971, + "step": 23428 + }, + { + "epoch": 4.997653583617748, + "grad_norm": 0.15978089992445518, + "learning_rate": 3.667586967001932e-05, + "loss": 0.9194, + "step": 23429 + }, + { + "epoch": 4.997866894197952, + "grad_norm": 0.13646699134076795, + "learning_rate": 3.666871603197121e-05, + "loss": 0.896, + "step": 23430 + }, + { + "epoch": 4.998080204778157, + "grad_norm": 0.16857498305644805, + "learning_rate": 3.6661562884217305e-05, + "loss": 0.9183, + "step": 23431 + }, + { + "epoch": 4.998293515358362, + "grad_norm": 0.13359974790918785, + "learning_rate": 3.665441022683851e-05, + "loss": 0.9155, + "step": 23432 + }, + { + "epoch": 4.998506825938566, + "grad_norm": 0.16785691464785144, + "learning_rate": 3.664725805991582e-05, + "loss": 0.8964, + "step": 23433 + }, + { + "epoch": 4.998720136518771, + "grad_norm": 0.15494255010298671, + "learning_rate": 3.664010638353015e-05, + "loss": 0.8947, + "step": 23434 + }, + { + "epoch": 4.998933447098976, + "grad_norm": 0.16590497667468407, + "learning_rate": 3.663295519776239e-05, + "loss": 0.9167, + "step": 23435 + }, + { + "epoch": 4.999146757679181, + "grad_norm": 0.16236457483214814, + "learning_rate": 3.6625804502693477e-05, + "loss": 0.8772, + "step": 23436 + }, + { + "epoch": 4.999360068259386, + "grad_norm": 0.13922742789701711, + "learning_rate": 3.661865429840431e-05, + "loss": 0.8968, + "step": 23437 + }, + { + "epoch": 4.999573378839591, + "grad_norm": 0.13238331843353418, + "learning_rate": 3.6611504584975805e-05, + "loss": 0.9035, + "step": 23438 + }, + { + "epoch": 4.999786689419795, + "grad_norm": 0.14659674929167366, + "learning_rate": 3.660435536248885e-05, + "loss": 0.8893, + "step": 23439 + }, + { + "epoch": 5.0, + "grad_norm": 0.12265619185961109, + "learning_rate": 3.659720663102432e-05, + "loss": 0.9091, + "step": 23440 + }, + { + "epoch": 5.000213310580205, + "grad_norm": 0.13091321256541505, + "learning_rate": 3.659005839066316e-05, + "loss": 0.9019, + "step": 23441 + }, + { + "epoch": 5.000426621160409, + "grad_norm": 0.12406520407522766, + "learning_rate": 3.658291064148621e-05, + "loss": 0.8819, + "step": 23442 + }, + { + "epoch": 5.000639931740614, + "grad_norm": 0.17725574790765866, + "learning_rate": 3.6575763383574366e-05, + "loss": 0.8916, + "step": 23443 + }, + { + "epoch": 5.000853242320819, + "grad_norm": 0.13804372850409655, + "learning_rate": 3.656861661700845e-05, + "loss": 0.8628, + "step": 23444 + }, + { + "epoch": 5.001066552901024, + "grad_norm": 0.1532634197613095, + "learning_rate": 3.6561470341869406e-05, + "loss": 0.9068, + "step": 23445 + }, + { + "epoch": 5.001279863481229, + "grad_norm": 0.1435814438597637, + "learning_rate": 3.655432455823805e-05, + "loss": 0.8467, + "step": 23446 + }, + { + "epoch": 5.001493174061434, + "grad_norm": 0.13368357094309297, + "learning_rate": 3.6547179266195246e-05, + "loss": 0.8959, + "step": 23447 + }, + { + "epoch": 5.001706484641638, + "grad_norm": 0.13225411328761805, + "learning_rate": 3.654003446582183e-05, + "loss": 0.8857, + "step": 23448 + }, + { + "epoch": 5.001919795221843, + "grad_norm": 0.15569959716742007, + "learning_rate": 3.6532890157198666e-05, + "loss": 0.9119, + "step": 23449 + }, + { + "epoch": 5.002133105802048, + "grad_norm": 0.16569906829448286, + "learning_rate": 3.652574634040658e-05, + "loss": 0.8885, + "step": 23450 + }, + { + "epoch": 5.002346416382252, + "grad_norm": 0.13519615650808653, + "learning_rate": 3.65186030155264e-05, + "loss": 0.9168, + "step": 23451 + }, + { + "epoch": 5.002559726962457, + "grad_norm": 0.1262994726077112, + "learning_rate": 3.651146018263893e-05, + "loss": 0.8939, + "step": 23452 + }, + { + "epoch": 5.002773037542662, + "grad_norm": 0.15141698507685386, + "learning_rate": 3.650431784182505e-05, + "loss": 0.9158, + "step": 23453 + }, + { + "epoch": 5.0029863481228665, + "grad_norm": 0.12923753643128205, + "learning_rate": 3.649717599316555e-05, + "loss": 0.8704, + "step": 23454 + }, + { + "epoch": 5.003199658703072, + "grad_norm": 0.16387680359187812, + "learning_rate": 3.649003463674123e-05, + "loss": 0.8568, + "step": 23455 + }, + { + "epoch": 5.003412969283277, + "grad_norm": 0.14350173373063338, + "learning_rate": 3.648289377263288e-05, + "loss": 0.8678, + "step": 23456 + }, + { + "epoch": 5.003626279863481, + "grad_norm": 0.1541466311742419, + "learning_rate": 3.647575340092133e-05, + "loss": 0.9016, + "step": 23457 + }, + { + "epoch": 5.003839590443686, + "grad_norm": 0.16475015342835383, + "learning_rate": 3.646861352168741e-05, + "loss": 0.86, + "step": 23458 + }, + { + "epoch": 5.004052901023891, + "grad_norm": 0.14702838069729654, + "learning_rate": 3.646147413501179e-05, + "loss": 0.8898, + "step": 23459 + }, + { + "epoch": 5.0042662116040955, + "grad_norm": 0.15029194417528174, + "learning_rate": 3.645433524097536e-05, + "loss": 0.8786, + "step": 23460 + }, + { + "epoch": 5.0044795221843, + "grad_norm": 0.17395708130665963, + "learning_rate": 3.6447196839658856e-05, + "loss": 0.8716, + "step": 23461 + }, + { + "epoch": 5.004692832764505, + "grad_norm": 0.13488664224016128, + "learning_rate": 3.644005893114304e-05, + "loss": 0.9033, + "step": 23462 + }, + { + "epoch": 5.0049061433447095, + "grad_norm": 0.1465382105119488, + "learning_rate": 3.64329215155087e-05, + "loss": 0.9081, + "step": 23463 + }, + { + "epoch": 5.005119453924915, + "grad_norm": 0.13481978598548913, + "learning_rate": 3.642578459283656e-05, + "loss": 0.8794, + "step": 23464 + }, + { + "epoch": 5.00533276450512, + "grad_norm": 0.12661126533919848, + "learning_rate": 3.641864816320743e-05, + "loss": 0.8817, + "step": 23465 + }, + { + "epoch": 5.0055460750853245, + "grad_norm": 0.15710751555560484, + "learning_rate": 3.6411512226702034e-05, + "loss": 0.9344, + "step": 23466 + }, + { + "epoch": 5.005759385665529, + "grad_norm": 0.12436135915849457, + "learning_rate": 3.64043767834011e-05, + "loss": 0.8799, + "step": 23467 + }, + { + "epoch": 5.005972696245734, + "grad_norm": 0.12906514491564614, + "learning_rate": 3.639724183338535e-05, + "loss": 0.9094, + "step": 23468 + }, + { + "epoch": 5.0061860068259385, + "grad_norm": 0.13560723768239116, + "learning_rate": 3.639010737673557e-05, + "loss": 0.8617, + "step": 23469 + }, + { + "epoch": 5.006399317406143, + "grad_norm": 0.14341093856147683, + "learning_rate": 3.638297341353249e-05, + "loss": 0.9077, + "step": 23470 + }, + { + "epoch": 5.006612627986348, + "grad_norm": 0.13706784876195124, + "learning_rate": 3.637583994385674e-05, + "loss": 0.8958, + "step": 23471 + }, + { + "epoch": 5.006825938566553, + "grad_norm": 0.18408684584554744, + "learning_rate": 3.6368706967789115e-05, + "loss": 0.8861, + "step": 23472 + }, + { + "epoch": 5.007039249146757, + "grad_norm": 0.12651297504762832, + "learning_rate": 3.636157448541032e-05, + "loss": 0.8769, + "step": 23473 + }, + { + "epoch": 5.007252559726963, + "grad_norm": 0.15090594303213217, + "learning_rate": 3.635444249680103e-05, + "loss": 0.8765, + "step": 23474 + }, + { + "epoch": 5.0074658703071675, + "grad_norm": 0.1408231268530012, + "learning_rate": 3.6347311002041966e-05, + "loss": 0.8697, + "step": 23475 + }, + { + "epoch": 5.007679180887372, + "grad_norm": 0.1426812422998028, + "learning_rate": 3.634018000121378e-05, + "loss": 0.8889, + "step": 23476 + }, + { + "epoch": 5.007892491467577, + "grad_norm": 0.1526879328127725, + "learning_rate": 3.633304949439723e-05, + "loss": 0.8551, + "step": 23477 + }, + { + "epoch": 5.0081058020477816, + "grad_norm": 0.12369956187401593, + "learning_rate": 3.632591948167295e-05, + "loss": 0.8812, + "step": 23478 + }, + { + "epoch": 5.008319112627986, + "grad_norm": 0.15044880512973272, + "learning_rate": 3.631878996312162e-05, + "loss": 0.8923, + "step": 23479 + }, + { + "epoch": 5.008532423208191, + "grad_norm": 0.15160820080425372, + "learning_rate": 3.63116609388239e-05, + "loss": 0.8711, + "step": 23480 + }, + { + "epoch": 5.008745733788396, + "grad_norm": 0.13847438868968148, + "learning_rate": 3.630453240886052e-05, + "loss": 0.8915, + "step": 23481 + }, + { + "epoch": 5.0089590443686, + "grad_norm": 0.15741748810006229, + "learning_rate": 3.629740437331206e-05, + "loss": 0.9142, + "step": 23482 + }, + { + "epoch": 5.009172354948806, + "grad_norm": 0.1148464539284545, + "learning_rate": 3.629027683225918e-05, + "loss": 0.9298, + "step": 23483 + }, + { + "epoch": 5.0093856655290105, + "grad_norm": 0.1502140141789682, + "learning_rate": 3.628314978578259e-05, + "loss": 0.9048, + "step": 23484 + }, + { + "epoch": 5.009598976109215, + "grad_norm": 0.13476290676316502, + "learning_rate": 3.627602323396288e-05, + "loss": 0.9228, + "step": 23485 + }, + { + "epoch": 5.00981228668942, + "grad_norm": 0.14142997304136215, + "learning_rate": 3.626889717688072e-05, + "loss": 0.8855, + "step": 23486 + }, + { + "epoch": 5.010025597269625, + "grad_norm": 0.14111885102842836, + "learning_rate": 3.6261771614616715e-05, + "loss": 0.8855, + "step": 23487 + }, + { + "epoch": 5.010238907849829, + "grad_norm": 0.13257020742139367, + "learning_rate": 3.625464654725146e-05, + "loss": 0.8899, + "step": 23488 + }, + { + "epoch": 5.010452218430034, + "grad_norm": 0.13522500133777138, + "learning_rate": 3.624752197486566e-05, + "loss": 0.8938, + "step": 23489 + }, + { + "epoch": 5.010665529010239, + "grad_norm": 0.1399703516960848, + "learning_rate": 3.624039789753989e-05, + "loss": 0.8997, + "step": 23490 + }, + { + "epoch": 5.010878839590443, + "grad_norm": 0.14898663258787703, + "learning_rate": 3.623327431535473e-05, + "loss": 0.8953, + "step": 23491 + }, + { + "epoch": 5.011092150170649, + "grad_norm": 0.13057682566919612, + "learning_rate": 3.622615122839083e-05, + "loss": 0.8978, + "step": 23492 + }, + { + "epoch": 5.011305460750854, + "grad_norm": 0.1590089141341047, + "learning_rate": 3.621902863672875e-05, + "loss": 0.9035, + "step": 23493 + }, + { + "epoch": 5.011518771331058, + "grad_norm": 0.12092928635870744, + "learning_rate": 3.62119065404491e-05, + "loss": 0.8743, + "step": 23494 + }, + { + "epoch": 5.011732081911263, + "grad_norm": 0.1534811469796886, + "learning_rate": 3.6204784939632445e-05, + "loss": 0.8857, + "step": 23495 + }, + { + "epoch": 5.011945392491468, + "grad_norm": 0.29681024323668964, + "learning_rate": 3.61976638343594e-05, + "loss": 0.9012, + "step": 23496 + }, + { + "epoch": 5.012158703071672, + "grad_norm": 0.15171089160233014, + "learning_rate": 3.619054322471054e-05, + "loss": 0.8847, + "step": 23497 + }, + { + "epoch": 5.012372013651877, + "grad_norm": 0.14717676304151206, + "learning_rate": 3.618342311076642e-05, + "loss": 0.8791, + "step": 23498 + }, + { + "epoch": 5.012585324232082, + "grad_norm": 0.1431131818601867, + "learning_rate": 3.617630349260759e-05, + "loss": 0.9233, + "step": 23499 + }, + { + "epoch": 5.012798634812286, + "grad_norm": 0.17294054307339726, + "learning_rate": 3.61691843703146e-05, + "loss": 0.8773, + "step": 23500 + }, + { + "epoch": 5.013011945392491, + "grad_norm": 0.14539312408414662, + "learning_rate": 3.616206574396806e-05, + "loss": 0.8936, + "step": 23501 + }, + { + "epoch": 5.013225255972697, + "grad_norm": 0.16569037653203625, + "learning_rate": 3.615494761364848e-05, + "loss": 0.8831, + "step": 23502 + }, + { + "epoch": 5.013438566552901, + "grad_norm": 0.16608570624817487, + "learning_rate": 3.614782997943641e-05, + "loss": 0.8882, + "step": 23503 + }, + { + "epoch": 5.013651877133106, + "grad_norm": 0.1801952466104242, + "learning_rate": 3.614071284141237e-05, + "loss": 0.8688, + "step": 23504 + }, + { + "epoch": 5.013865187713311, + "grad_norm": 0.15430150981832566, + "learning_rate": 3.613359619965691e-05, + "loss": 0.9086, + "step": 23505 + }, + { + "epoch": 5.014078498293515, + "grad_norm": 0.13973586627798334, + "learning_rate": 3.612648005425055e-05, + "loss": 0.8642, + "step": 23506 + }, + { + "epoch": 5.01429180887372, + "grad_norm": 0.17795571807427413, + "learning_rate": 3.611936440527377e-05, + "loss": 0.8948, + "step": 23507 + }, + { + "epoch": 5.014505119453925, + "grad_norm": 0.12640450053841568, + "learning_rate": 3.611224925280715e-05, + "loss": 0.8813, + "step": 23508 + }, + { + "epoch": 5.014718430034129, + "grad_norm": 0.17120675410670932, + "learning_rate": 3.610513459693117e-05, + "loss": 0.88, + "step": 23509 + }, + { + "epoch": 5.014931740614334, + "grad_norm": 0.15123270571970582, + "learning_rate": 3.609802043772633e-05, + "loss": 0.8604, + "step": 23510 + }, + { + "epoch": 5.01514505119454, + "grad_norm": 0.15706532858439287, + "learning_rate": 3.609090677527313e-05, + "loss": 0.8748, + "step": 23511 + }, + { + "epoch": 5.015358361774744, + "grad_norm": 0.16072138572876618, + "learning_rate": 3.608379360965202e-05, + "loss": 0.9051, + "step": 23512 + }, + { + "epoch": 5.015571672354949, + "grad_norm": 0.14929036060021728, + "learning_rate": 3.607668094094356e-05, + "loss": 0.8913, + "step": 23513 + }, + { + "epoch": 5.015784982935154, + "grad_norm": 0.14128904775612577, + "learning_rate": 3.606956876922823e-05, + "loss": 0.8786, + "step": 23514 + }, + { + "epoch": 5.015998293515358, + "grad_norm": 0.18381492631087354, + "learning_rate": 3.60624570945864e-05, + "loss": 0.8996, + "step": 23515 + }, + { + "epoch": 5.016211604095563, + "grad_norm": 0.14974022117251806, + "learning_rate": 3.605534591709864e-05, + "loss": 0.8813, + "step": 23516 + }, + { + "epoch": 5.016424914675768, + "grad_norm": 0.24614464232728395, + "learning_rate": 3.6048235236845375e-05, + "loss": 0.8977, + "step": 23517 + }, + { + "epoch": 5.0166382252559725, + "grad_norm": 0.15720310303831336, + "learning_rate": 3.604112505390707e-05, + "loss": 0.8612, + "step": 23518 + }, + { + "epoch": 5.016851535836177, + "grad_norm": 0.1459337885887765, + "learning_rate": 3.6034015368364155e-05, + "loss": 0.8791, + "step": 23519 + }, + { + "epoch": 5.017064846416382, + "grad_norm": 0.14464549584339045, + "learning_rate": 3.602690618029711e-05, + "loss": 0.9062, + "step": 23520 + }, + { + "epoch": 5.017278156996587, + "grad_norm": 0.14084628069832136, + "learning_rate": 3.601979748978637e-05, + "loss": 0.8938, + "step": 23521 + }, + { + "epoch": 5.017491467576792, + "grad_norm": 0.13009825633534142, + "learning_rate": 3.6012689296912355e-05, + "loss": 0.8717, + "step": 23522 + }, + { + "epoch": 5.017704778156997, + "grad_norm": 0.1459061904004551, + "learning_rate": 3.600558160175551e-05, + "loss": 0.8892, + "step": 23523 + }, + { + "epoch": 5.0179180887372015, + "grad_norm": 0.11895907802663354, + "learning_rate": 3.5998474404396216e-05, + "loss": 0.9106, + "step": 23524 + }, + { + "epoch": 5.018131399317406, + "grad_norm": 0.1328580889081959, + "learning_rate": 3.5991367704914946e-05, + "loss": 0.9082, + "step": 23525 + }, + { + "epoch": 5.018344709897611, + "grad_norm": 0.13525736517850048, + "learning_rate": 3.5984261503392135e-05, + "loss": 0.8868, + "step": 23526 + }, + { + "epoch": 5.0185580204778155, + "grad_norm": 0.12818968759813693, + "learning_rate": 3.597715579990807e-05, + "loss": 0.8867, + "step": 23527 + }, + { + "epoch": 5.01877133105802, + "grad_norm": 0.17175468976058625, + "learning_rate": 3.597005059454327e-05, + "loss": 0.904, + "step": 23528 + }, + { + "epoch": 5.018984641638225, + "grad_norm": 0.1618926901097212, + "learning_rate": 3.596294588737807e-05, + "loss": 0.9156, + "step": 23529 + }, + { + "epoch": 5.0191979522184305, + "grad_norm": 0.15320392286729637, + "learning_rate": 3.59558416784929e-05, + "loss": 0.917, + "step": 23530 + }, + { + "epoch": 5.019411262798635, + "grad_norm": 0.17126379292627764, + "learning_rate": 3.594873796796808e-05, + "loss": 0.8936, + "step": 23531 + }, + { + "epoch": 5.01962457337884, + "grad_norm": 0.15828260501728886, + "learning_rate": 3.594163475588407e-05, + "loss": 0.8952, + "step": 23532 + }, + { + "epoch": 5.0198378839590445, + "grad_norm": 0.12874542723109872, + "learning_rate": 3.5934532042321195e-05, + "loss": 0.8587, + "step": 23533 + }, + { + "epoch": 5.020051194539249, + "grad_norm": 0.15633210384544408, + "learning_rate": 3.592742982735984e-05, + "loss": 0.9362, + "step": 23534 + }, + { + "epoch": 5.020264505119454, + "grad_norm": 0.15559945712054343, + "learning_rate": 3.592032811108035e-05, + "loss": 0.8965, + "step": 23535 + }, + { + "epoch": 5.020477815699659, + "grad_norm": 0.1373687793453297, + "learning_rate": 3.5913226893563054e-05, + "loss": 0.9062, + "step": 23536 + }, + { + "epoch": 5.020691126279863, + "grad_norm": 0.15183579152818277, + "learning_rate": 3.5906126174888407e-05, + "loss": 0.9015, + "step": 23537 + }, + { + "epoch": 5.020904436860068, + "grad_norm": 0.12997059390394053, + "learning_rate": 3.589902595513665e-05, + "loss": 0.8823, + "step": 23538 + }, + { + "epoch": 5.021117747440273, + "grad_norm": 0.16208916126661205, + "learning_rate": 3.5891926234388123e-05, + "loss": 0.897, + "step": 23539 + }, + { + "epoch": 5.021331058020478, + "grad_norm": 0.12696942802707642, + "learning_rate": 3.5884827012723234e-05, + "loss": 0.8927, + "step": 23540 + }, + { + "epoch": 5.021544368600683, + "grad_norm": 0.1512823538613776, + "learning_rate": 3.587772829022225e-05, + "loss": 0.9005, + "step": 23541 + }, + { + "epoch": 5.021757679180888, + "grad_norm": 0.13694907923360203, + "learning_rate": 3.5870630066965534e-05, + "loss": 0.8869, + "step": 23542 + }, + { + "epoch": 5.021970989761092, + "grad_norm": 0.14235441391664144, + "learning_rate": 3.586353234303334e-05, + "loss": 0.8546, + "step": 23543 + }, + { + "epoch": 5.022184300341297, + "grad_norm": 0.13856497484519026, + "learning_rate": 3.585643511850605e-05, + "loss": 0.861, + "step": 23544 + }, + { + "epoch": 5.022397610921502, + "grad_norm": 0.1418595954285301, + "learning_rate": 3.584933839346394e-05, + "loss": 0.8749, + "step": 23545 + }, + { + "epoch": 5.022610921501706, + "grad_norm": 0.14193236105795162, + "learning_rate": 3.584224216798731e-05, + "loss": 0.8578, + "step": 23546 + }, + { + "epoch": 5.022824232081911, + "grad_norm": 0.11933244672532396, + "learning_rate": 3.5835146442156454e-05, + "loss": 0.9161, + "step": 23547 + }, + { + "epoch": 5.023037542662116, + "grad_norm": 0.15639031986725915, + "learning_rate": 3.582805121605166e-05, + "loss": 0.9171, + "step": 23548 + }, + { + "epoch": 5.023250853242321, + "grad_norm": 0.1662346355264104, + "learning_rate": 3.5820956489753204e-05, + "loss": 0.8822, + "step": 23549 + }, + { + "epoch": 5.023464163822526, + "grad_norm": 0.17180508548748338, + "learning_rate": 3.5813862263341366e-05, + "loss": 0.9246, + "step": 23550 + }, + { + "epoch": 5.023677474402731, + "grad_norm": 0.1825347141314873, + "learning_rate": 3.58067685368964e-05, + "loss": 0.9031, + "step": 23551 + }, + { + "epoch": 5.023890784982935, + "grad_norm": 0.15732063402392552, + "learning_rate": 3.579967531049861e-05, + "loss": 0.8809, + "step": 23552 + }, + { + "epoch": 5.02410409556314, + "grad_norm": 0.26038901788036245, + "learning_rate": 3.579258258422825e-05, + "loss": 0.889, + "step": 23553 + }, + { + "epoch": 5.024317406143345, + "grad_norm": 0.16023361283487397, + "learning_rate": 3.578549035816555e-05, + "loss": 0.937, + "step": 23554 + }, + { + "epoch": 5.024530716723549, + "grad_norm": 0.17378209978335082, + "learning_rate": 3.577839863239075e-05, + "loss": 0.8724, + "step": 23555 + }, + { + "epoch": 5.024744027303754, + "grad_norm": 0.1660455911997555, + "learning_rate": 3.577130740698413e-05, + "loss": 0.9098, + "step": 23556 + }, + { + "epoch": 5.024957337883959, + "grad_norm": 0.17615622165303277, + "learning_rate": 3.576421668202592e-05, + "loss": 0.9191, + "step": 23557 + }, + { + "epoch": 5.025170648464163, + "grad_norm": 0.14149046559857845, + "learning_rate": 3.5757126457596344e-05, + "loss": 0.9124, + "step": 23558 + }, + { + "epoch": 5.025383959044369, + "grad_norm": 0.1804571155914398, + "learning_rate": 3.575003673377562e-05, + "loss": 0.8894, + "step": 23559 + }, + { + "epoch": 5.025597269624574, + "grad_norm": 0.14176525899939113, + "learning_rate": 3.5742947510643985e-05, + "loss": 0.8948, + "step": 23560 + }, + { + "epoch": 5.025810580204778, + "grad_norm": 0.177022797639501, + "learning_rate": 3.573585878828164e-05, + "loss": 0.8944, + "step": 23561 + }, + { + "epoch": 5.026023890784983, + "grad_norm": 0.16929953997210356, + "learning_rate": 3.572877056676879e-05, + "loss": 0.8933, + "step": 23562 + }, + { + "epoch": 5.026237201365188, + "grad_norm": 0.1452879838976568, + "learning_rate": 3.5721682846185614e-05, + "loss": 0.8925, + "step": 23563 + }, + { + "epoch": 5.026450511945392, + "grad_norm": 0.14928971576012712, + "learning_rate": 3.571459562661238e-05, + "loss": 0.9139, + "step": 23564 + }, + { + "epoch": 5.026663822525597, + "grad_norm": 0.1475389012512217, + "learning_rate": 3.5707508908129236e-05, + "loss": 0.8921, + "step": 23565 + }, + { + "epoch": 5.026877133105802, + "grad_norm": 0.13370758841978453, + "learning_rate": 3.5700422690816383e-05, + "loss": 0.8842, + "step": 23566 + }, + { + "epoch": 5.027090443686006, + "grad_norm": 0.1600258002236351, + "learning_rate": 3.569333697475395e-05, + "loss": 0.8822, + "step": 23567 + }, + { + "epoch": 5.027303754266212, + "grad_norm": 0.13679458951603576, + "learning_rate": 3.568625176002219e-05, + "loss": 0.875, + "step": 23568 + }, + { + "epoch": 5.027517064846417, + "grad_norm": 0.1608125844343694, + "learning_rate": 3.5679167046701235e-05, + "loss": 0.8652, + "step": 23569 + }, + { + "epoch": 5.027730375426621, + "grad_norm": 0.1383863492812832, + "learning_rate": 3.567208283487128e-05, + "loss": 0.8683, + "step": 23570 + }, + { + "epoch": 5.027943686006826, + "grad_norm": 0.13515530291877373, + "learning_rate": 3.5664999124612406e-05, + "loss": 0.9043, + "step": 23571 + }, + { + "epoch": 5.028156996587031, + "grad_norm": 0.15119213009007412, + "learning_rate": 3.5657915916004825e-05, + "loss": 0.8919, + "step": 23572 + }, + { + "epoch": 5.028370307167235, + "grad_norm": 0.1159018711537289, + "learning_rate": 3.565083320912868e-05, + "loss": 0.8765, + "step": 23573 + }, + { + "epoch": 5.02858361774744, + "grad_norm": 0.16266410533531486, + "learning_rate": 3.5643751004064096e-05, + "loss": 0.8957, + "step": 23574 + }, + { + "epoch": 5.028796928327645, + "grad_norm": 0.12927675150167858, + "learning_rate": 3.5636669300891194e-05, + "loss": 0.9128, + "step": 23575 + }, + { + "epoch": 5.0290102389078495, + "grad_norm": 0.15615001780854967, + "learning_rate": 3.5629588099690156e-05, + "loss": 0.9043, + "step": 23576 + }, + { + "epoch": 5.029223549488054, + "grad_norm": 0.15236201785504033, + "learning_rate": 3.562250740054107e-05, + "loss": 0.9135, + "step": 23577 + }, + { + "epoch": 5.02943686006826, + "grad_norm": 0.157649044685218, + "learning_rate": 3.561542720352406e-05, + "loss": 0.886, + "step": 23578 + }, + { + "epoch": 5.029650170648464, + "grad_norm": 0.19420620588633675, + "learning_rate": 3.5608347508719204e-05, + "loss": 0.9005, + "step": 23579 + }, + { + "epoch": 5.029863481228669, + "grad_norm": 0.12949323817611735, + "learning_rate": 3.560126831620668e-05, + "loss": 0.8678, + "step": 23580 + }, + { + "epoch": 5.030076791808874, + "grad_norm": 0.20417081915734503, + "learning_rate": 3.559418962606655e-05, + "loss": 0.8799, + "step": 23581 + }, + { + "epoch": 5.0302901023890785, + "grad_norm": 0.15100527337082312, + "learning_rate": 3.5587111438378934e-05, + "loss": 0.8872, + "step": 23582 + }, + { + "epoch": 5.030503412969283, + "grad_norm": 0.1878484736943831, + "learning_rate": 3.5580033753223843e-05, + "loss": 0.8853, + "step": 23583 + }, + { + "epoch": 5.030716723549488, + "grad_norm": 0.1768452111314104, + "learning_rate": 3.557295657068145e-05, + "loss": 0.8902, + "step": 23584 + }, + { + "epoch": 5.0309300341296925, + "grad_norm": 0.13991134321168436, + "learning_rate": 3.55658798908318e-05, + "loss": 0.8851, + "step": 23585 + }, + { + "epoch": 5.031143344709897, + "grad_norm": 0.1767371828894079, + "learning_rate": 3.555880371375497e-05, + "loss": 0.8874, + "step": 23586 + }, + { + "epoch": 5.031356655290103, + "grad_norm": 0.1412890922972271, + "learning_rate": 3.555172803953099e-05, + "loss": 0.8593, + "step": 23587 + }, + { + "epoch": 5.0315699658703075, + "grad_norm": 0.14729074771018738, + "learning_rate": 3.554465286823998e-05, + "loss": 0.8832, + "step": 23588 + }, + { + "epoch": 5.031783276450512, + "grad_norm": 0.13873420646848084, + "learning_rate": 3.553757819996198e-05, + "loss": 0.8761, + "step": 23589 + }, + { + "epoch": 5.031996587030717, + "grad_norm": 0.12708641458093126, + "learning_rate": 3.553050403477702e-05, + "loss": 0.8891, + "step": 23590 + }, + { + "epoch": 5.0322098976109215, + "grad_norm": 0.15614399376876456, + "learning_rate": 3.5523430372765134e-05, + "loss": 0.8572, + "step": 23591 + }, + { + "epoch": 5.032423208191126, + "grad_norm": 0.12898839322844188, + "learning_rate": 3.551635721400641e-05, + "loss": 0.8917, + "step": 23592 + }, + { + "epoch": 5.032636518771331, + "grad_norm": 0.13665813988728284, + "learning_rate": 3.5509284558580876e-05, + "loss": 0.8948, + "step": 23593 + }, + { + "epoch": 5.032849829351536, + "grad_norm": 0.1525137227958403, + "learning_rate": 3.550221240656851e-05, + "loss": 0.924, + "step": 23594 + }, + { + "epoch": 5.03306313993174, + "grad_norm": 0.12950889080067618, + "learning_rate": 3.5495140758049325e-05, + "loss": 0.8612, + "step": 23595 + }, + { + "epoch": 5.033276450511946, + "grad_norm": 0.1568983391169167, + "learning_rate": 3.548806961310341e-05, + "loss": 0.9099, + "step": 23596 + }, + { + "epoch": 5.0334897610921505, + "grad_norm": 0.15418238684327992, + "learning_rate": 3.5480998971810735e-05, + "loss": 0.8812, + "step": 23597 + }, + { + "epoch": 5.033703071672355, + "grad_norm": 0.12648978742520028, + "learning_rate": 3.547392883425129e-05, + "loss": 0.8954, + "step": 23598 + }, + { + "epoch": 5.03391638225256, + "grad_norm": 0.16626577729894004, + "learning_rate": 3.546685920050507e-05, + "loss": 0.9054, + "step": 23599 + }, + { + "epoch": 5.034129692832765, + "grad_norm": 0.1185668276659042, + "learning_rate": 3.5459790070652124e-05, + "loss": 0.8808, + "step": 23600 + }, + { + "epoch": 5.034343003412969, + "grad_norm": 0.1858489897552144, + "learning_rate": 3.54527214447724e-05, + "loss": 0.8836, + "step": 23601 + }, + { + "epoch": 5.034556313993174, + "grad_norm": 0.13152018151845504, + "learning_rate": 3.544565332294587e-05, + "loss": 0.883, + "step": 23602 + }, + { + "epoch": 5.034769624573379, + "grad_norm": 0.19358132853502605, + "learning_rate": 3.543858570525251e-05, + "loss": 0.8869, + "step": 23603 + }, + { + "epoch": 5.034982935153583, + "grad_norm": 0.14390190607071587, + "learning_rate": 3.543151859177234e-05, + "loss": 0.8844, + "step": 23604 + }, + { + "epoch": 5.035196245733788, + "grad_norm": 0.20741032328476092, + "learning_rate": 3.542445198258528e-05, + "loss": 0.9026, + "step": 23605 + }, + { + "epoch": 5.035409556313994, + "grad_norm": 0.14287517850719778, + "learning_rate": 3.541738587777128e-05, + "loss": 0.8844, + "step": 23606 + }, + { + "epoch": 5.035622866894198, + "grad_norm": 0.15292118635761368, + "learning_rate": 3.541032027741029e-05, + "loss": 0.8945, + "step": 23607 + }, + { + "epoch": 5.035836177474403, + "grad_norm": 0.13443647270317519, + "learning_rate": 3.54032551815823e-05, + "loss": 0.8849, + "step": 23608 + }, + { + "epoch": 5.036049488054608, + "grad_norm": 0.15255113064397352, + "learning_rate": 3.5396190590367223e-05, + "loss": 0.8913, + "step": 23609 + }, + { + "epoch": 5.036262798634812, + "grad_norm": 0.14762202565851376, + "learning_rate": 3.5389126503845e-05, + "loss": 0.8757, + "step": 23610 + }, + { + "epoch": 5.036476109215017, + "grad_norm": 0.1702385271438519, + "learning_rate": 3.5382062922095534e-05, + "loss": 0.8984, + "step": 23611 + }, + { + "epoch": 5.036689419795222, + "grad_norm": 0.1534907185836214, + "learning_rate": 3.537499984519881e-05, + "loss": 0.8736, + "step": 23612 + }, + { + "epoch": 5.036902730375426, + "grad_norm": 0.14785517166561282, + "learning_rate": 3.5367937273234714e-05, + "loss": 0.8716, + "step": 23613 + }, + { + "epoch": 5.037116040955631, + "grad_norm": 0.14996709492251592, + "learning_rate": 3.536087520628315e-05, + "loss": 0.8926, + "step": 23614 + }, + { + "epoch": 5.037329351535837, + "grad_norm": 0.17142175456873052, + "learning_rate": 3.5353813644424044e-05, + "loss": 0.9127, + "step": 23615 + }, + { + "epoch": 5.037542662116041, + "grad_norm": 0.1444607480429534, + "learning_rate": 3.5346752587737284e-05, + "loss": 0.8569, + "step": 23616 + }, + { + "epoch": 5.037755972696246, + "grad_norm": 0.1979040845499463, + "learning_rate": 3.533969203630276e-05, + "loss": 0.8998, + "step": 23617 + }, + { + "epoch": 5.037969283276451, + "grad_norm": 0.1254307634877122, + "learning_rate": 3.5332631990200387e-05, + "loss": 0.88, + "step": 23618 + }, + { + "epoch": 5.038182593856655, + "grad_norm": 0.18654557727136745, + "learning_rate": 3.532557244951e-05, + "loss": 0.8947, + "step": 23619 + }, + { + "epoch": 5.03839590443686, + "grad_norm": 0.16178890683439506, + "learning_rate": 3.5318513414311536e-05, + "loss": 0.9166, + "step": 23620 + }, + { + "epoch": 5.038609215017065, + "grad_norm": 0.1743930747965151, + "learning_rate": 3.531145488468485e-05, + "loss": 0.9067, + "step": 23621 + }, + { + "epoch": 5.038822525597269, + "grad_norm": 0.1539222638213409, + "learning_rate": 3.53043968607098e-05, + "loss": 0.9028, + "step": 23622 + }, + { + "epoch": 5.039035836177474, + "grad_norm": 0.16466668030762252, + "learning_rate": 3.529733934246623e-05, + "loss": 0.8891, + "step": 23623 + }, + { + "epoch": 5.039249146757679, + "grad_norm": 0.14189300340043542, + "learning_rate": 3.5290282330034035e-05, + "loss": 0.9078, + "step": 23624 + }, + { + "epoch": 5.039462457337884, + "grad_norm": 0.16810786550457502, + "learning_rate": 3.528322582349305e-05, + "loss": 0.8883, + "step": 23625 + }, + { + "epoch": 5.039675767918089, + "grad_norm": 0.1981946840425934, + "learning_rate": 3.527616982292315e-05, + "loss": 0.8924, + "step": 23626 + }, + { + "epoch": 5.039889078498294, + "grad_norm": 0.13620366577096055, + "learning_rate": 3.526911432840408e-05, + "loss": 0.9048, + "step": 23627 + }, + { + "epoch": 5.040102389078498, + "grad_norm": 0.1956303647411487, + "learning_rate": 3.5262059340015766e-05, + "loss": 0.8746, + "step": 23628 + }, + { + "epoch": 5.040315699658703, + "grad_norm": 0.1576461249316845, + "learning_rate": 3.5255004857838e-05, + "loss": 0.9046, + "step": 23629 + }, + { + "epoch": 5.040529010238908, + "grad_norm": 0.19079815552772966, + "learning_rate": 3.5247950881950615e-05, + "loss": 0.9008, + "step": 23630 + }, + { + "epoch": 5.040742320819112, + "grad_norm": 0.1683251003849089, + "learning_rate": 3.524089741243338e-05, + "loss": 0.8862, + "step": 23631 + }, + { + "epoch": 5.040955631399317, + "grad_norm": 0.13445733041515742, + "learning_rate": 3.523384444936618e-05, + "loss": 0.8913, + "step": 23632 + }, + { + "epoch": 5.041168941979522, + "grad_norm": 0.1577225762957609, + "learning_rate": 3.5226791992828766e-05, + "loss": 0.8969, + "step": 23633 + }, + { + "epoch": 5.041382252559727, + "grad_norm": 0.1490749030189241, + "learning_rate": 3.5219740042900966e-05, + "loss": 0.8985, + "step": 23634 + }, + { + "epoch": 5.041595563139932, + "grad_norm": 0.13803690938785035, + "learning_rate": 3.5212688599662535e-05, + "loss": 0.8911, + "step": 23635 + }, + { + "epoch": 5.041808873720137, + "grad_norm": 0.16400890584778507, + "learning_rate": 3.520563766319331e-05, + "loss": 0.8869, + "step": 23636 + }, + { + "epoch": 5.042022184300341, + "grad_norm": 0.17180181179116083, + "learning_rate": 3.519858723357308e-05, + "loss": 0.874, + "step": 23637 + }, + { + "epoch": 5.042235494880546, + "grad_norm": 0.13180478587974132, + "learning_rate": 3.519153731088156e-05, + "loss": 0.8815, + "step": 23638 + }, + { + "epoch": 5.042448805460751, + "grad_norm": 0.16885653600834835, + "learning_rate": 3.518448789519852e-05, + "loss": 0.9102, + "step": 23639 + }, + { + "epoch": 5.0426621160409555, + "grad_norm": 0.14464248246199463, + "learning_rate": 3.517743898660378e-05, + "loss": 0.9048, + "step": 23640 + }, + { + "epoch": 5.04287542662116, + "grad_norm": 0.1562595977136278, + "learning_rate": 3.517039058517707e-05, + "loss": 0.8869, + "step": 23641 + }, + { + "epoch": 5.043088737201365, + "grad_norm": 0.17284428707984595, + "learning_rate": 3.516334269099815e-05, + "loss": 0.8652, + "step": 23642 + }, + { + "epoch": 5.0433020477815695, + "grad_norm": 0.12856971389901373, + "learning_rate": 3.515629530414673e-05, + "loss": 0.8986, + "step": 23643 + }, + { + "epoch": 5.043515358361775, + "grad_norm": 0.14732449841613895, + "learning_rate": 3.514924842470261e-05, + "loss": 0.891, + "step": 23644 + }, + { + "epoch": 5.04372866894198, + "grad_norm": 0.12294065410015004, + "learning_rate": 3.51422020527455e-05, + "loss": 0.879, + "step": 23645 + }, + { + "epoch": 5.0439419795221845, + "grad_norm": 0.14039733124867607, + "learning_rate": 3.513515618835513e-05, + "loss": 0.8908, + "step": 23646 + }, + { + "epoch": 5.044155290102389, + "grad_norm": 0.1297126339219931, + "learning_rate": 3.5128110831611195e-05, + "loss": 0.8915, + "step": 23647 + }, + { + "epoch": 5.044368600682594, + "grad_norm": 0.15387791511655502, + "learning_rate": 3.5121065982593465e-05, + "loss": 0.8875, + "step": 23648 + }, + { + "epoch": 5.0445819112627985, + "grad_norm": 0.13096221042610745, + "learning_rate": 3.511402164138166e-05, + "loss": 0.8714, + "step": 23649 + }, + { + "epoch": 5.044795221843003, + "grad_norm": 0.13550989651640402, + "learning_rate": 3.5106977808055417e-05, + "loss": 0.894, + "step": 23650 + }, + { + "epoch": 5.045008532423208, + "grad_norm": 0.1502441751516094, + "learning_rate": 3.5099934482694456e-05, + "loss": 0.8539, + "step": 23651 + }, + { + "epoch": 5.045221843003413, + "grad_norm": 0.16164649475240053, + "learning_rate": 3.509289166537852e-05, + "loss": 0.8696, + "step": 23652 + }, + { + "epoch": 5.045435153583618, + "grad_norm": 0.12238479975731832, + "learning_rate": 3.5085849356187264e-05, + "loss": 0.902, + "step": 23653 + }, + { + "epoch": 5.045648464163823, + "grad_norm": 0.14488245348922346, + "learning_rate": 3.507880755520039e-05, + "loss": 0.8718, + "step": 23654 + }, + { + "epoch": 5.0458617747440275, + "grad_norm": 0.12911643680335358, + "learning_rate": 3.507176626249753e-05, + "loss": 0.8458, + "step": 23655 + }, + { + "epoch": 5.046075085324232, + "grad_norm": 0.1474531484722199, + "learning_rate": 3.506472547815841e-05, + "loss": 0.8911, + "step": 23656 + }, + { + "epoch": 5.046288395904437, + "grad_norm": 0.13585184294818156, + "learning_rate": 3.505768520226269e-05, + "loss": 0.8997, + "step": 23657 + }, + { + "epoch": 5.046501706484642, + "grad_norm": 0.14178081703011958, + "learning_rate": 3.5050645434890015e-05, + "loss": 0.8843, + "step": 23658 + }, + { + "epoch": 5.046715017064846, + "grad_norm": 0.12784948757217915, + "learning_rate": 3.504360617612001e-05, + "loss": 0.9038, + "step": 23659 + }, + { + "epoch": 5.046928327645051, + "grad_norm": 0.1340571940792082, + "learning_rate": 3.503656742603242e-05, + "loss": 0.8712, + "step": 23660 + }, + { + "epoch": 5.047141638225256, + "grad_norm": 0.14216647680649738, + "learning_rate": 3.502952918470679e-05, + "loss": 0.891, + "step": 23661 + }, + { + "epoch": 5.04735494880546, + "grad_norm": 0.1243235336119128, + "learning_rate": 3.50224914522228e-05, + "loss": 0.8773, + "step": 23662 + }, + { + "epoch": 5.047568259385666, + "grad_norm": 0.13821515687408278, + "learning_rate": 3.501545422866004e-05, + "loss": 0.9371, + "step": 23663 + }, + { + "epoch": 5.047781569965871, + "grad_norm": 0.13308666273755956, + "learning_rate": 3.5008417514098206e-05, + "loss": 0.8969, + "step": 23664 + }, + { + "epoch": 5.047994880546075, + "grad_norm": 0.116157733932074, + "learning_rate": 3.500138130861689e-05, + "loss": 0.8779, + "step": 23665 + }, + { + "epoch": 5.04820819112628, + "grad_norm": 0.13863545100036928, + "learning_rate": 3.499434561229569e-05, + "loss": 0.8686, + "step": 23666 + }, + { + "epoch": 5.048421501706485, + "grad_norm": 0.12263893120428407, + "learning_rate": 3.4987310425214196e-05, + "loss": 0.8628, + "step": 23667 + }, + { + "epoch": 5.048634812286689, + "grad_norm": 0.15033696134181984, + "learning_rate": 3.4980275747452074e-05, + "loss": 0.9216, + "step": 23668 + }, + { + "epoch": 5.048848122866894, + "grad_norm": 0.12348843228352743, + "learning_rate": 3.49732415790889e-05, + "loss": 0.8561, + "step": 23669 + }, + { + "epoch": 5.049061433447099, + "grad_norm": 0.14615620277177166, + "learning_rate": 3.4966207920204244e-05, + "loss": 0.8759, + "step": 23670 + }, + { + "epoch": 5.049274744027303, + "grad_norm": 0.12898823165721368, + "learning_rate": 3.495917477087771e-05, + "loss": 0.8978, + "step": 23671 + }, + { + "epoch": 5.049488054607509, + "grad_norm": 0.16042015142915492, + "learning_rate": 3.4952142131188865e-05, + "loss": 0.887, + "step": 23672 + }, + { + "epoch": 5.049701365187714, + "grad_norm": 0.12772790557188804, + "learning_rate": 3.494511000121729e-05, + "loss": 0.9061, + "step": 23673 + }, + { + "epoch": 5.049914675767918, + "grad_norm": 0.1338073820784191, + "learning_rate": 3.493807838104256e-05, + "loss": 0.8949, + "step": 23674 + }, + { + "epoch": 5.050127986348123, + "grad_norm": 0.12092297482924977, + "learning_rate": 3.493104727074419e-05, + "loss": 0.8875, + "step": 23675 + }, + { + "epoch": 5.050341296928328, + "grad_norm": 0.13230680938440442, + "learning_rate": 3.4924016670401814e-05, + "loss": 0.8537, + "step": 23676 + }, + { + "epoch": 5.050554607508532, + "grad_norm": 0.12870840925544622, + "learning_rate": 3.4916986580094945e-05, + "loss": 0.8918, + "step": 23677 + }, + { + "epoch": 5.050767918088737, + "grad_norm": 0.13235975498710376, + "learning_rate": 3.490995699990314e-05, + "loss": 0.9085, + "step": 23678 + }, + { + "epoch": 5.050981228668942, + "grad_norm": 0.14070770032609778, + "learning_rate": 3.490292792990589e-05, + "loss": 0.8855, + "step": 23679 + }, + { + "epoch": 5.051194539249146, + "grad_norm": 0.1316758616452753, + "learning_rate": 3.48958993701828e-05, + "loss": 0.8896, + "step": 23680 + }, + { + "epoch": 5.051407849829351, + "grad_norm": 0.1589684724610993, + "learning_rate": 3.4888871320813374e-05, + "loss": 0.8859, + "step": 23681 + }, + { + "epoch": 5.051621160409557, + "grad_norm": 0.14148192888214706, + "learning_rate": 3.4881843781877126e-05, + "loss": 0.8906, + "step": 23682 + }, + { + "epoch": 5.051834470989761, + "grad_norm": 0.14061806215106218, + "learning_rate": 3.487481675345358e-05, + "loss": 0.9007, + "step": 23683 + }, + { + "epoch": 5.052047781569966, + "grad_norm": 0.13548757167825798, + "learning_rate": 3.486779023562223e-05, + "loss": 0.9023, + "step": 23684 + }, + { + "epoch": 5.052261092150171, + "grad_norm": 0.1380375467415819, + "learning_rate": 3.486076422846259e-05, + "loss": 0.8797, + "step": 23685 + }, + { + "epoch": 5.052474402730375, + "grad_norm": 0.12794559792367377, + "learning_rate": 3.485373873205418e-05, + "loss": 0.8949, + "step": 23686 + }, + { + "epoch": 5.05268771331058, + "grad_norm": 0.15845023651931622, + "learning_rate": 3.484671374647642e-05, + "loss": 0.889, + "step": 23687 + }, + { + "epoch": 5.052901023890785, + "grad_norm": 0.145200428520762, + "learning_rate": 3.48396892718089e-05, + "loss": 0.9261, + "step": 23688 + }, + { + "epoch": 5.0531143344709895, + "grad_norm": 0.13384758837371372, + "learning_rate": 3.4832665308131064e-05, + "loss": 0.8965, + "step": 23689 + }, + { + "epoch": 5.053327645051194, + "grad_norm": 0.14683078915090275, + "learning_rate": 3.482564185552236e-05, + "loss": 0.8767, + "step": 23690 + }, + { + "epoch": 5.0535409556314, + "grad_norm": 0.12852705184082172, + "learning_rate": 3.481861891406226e-05, + "loss": 0.9147, + "step": 23691 + }, + { + "epoch": 5.053754266211604, + "grad_norm": 0.16482615429194214, + "learning_rate": 3.4811596483830274e-05, + "loss": 0.8667, + "step": 23692 + }, + { + "epoch": 5.053967576791809, + "grad_norm": 0.14963444495587036, + "learning_rate": 3.480457456490587e-05, + "loss": 0.9052, + "step": 23693 + }, + { + "epoch": 5.054180887372014, + "grad_norm": 0.162103039071512, + "learning_rate": 3.479755315736839e-05, + "loss": 0.8751, + "step": 23694 + }, + { + "epoch": 5.0543941979522184, + "grad_norm": 0.1944858943445167, + "learning_rate": 3.47905322612974e-05, + "loss": 0.9039, + "step": 23695 + }, + { + "epoch": 5.054607508532423, + "grad_norm": 0.1166409598120375, + "learning_rate": 3.4783511876772294e-05, + "loss": 0.913, + "step": 23696 + }, + { + "epoch": 5.054820819112628, + "grad_norm": 0.1786412719282566, + "learning_rate": 3.4776492003872506e-05, + "loss": 0.886, + "step": 23697 + }, + { + "epoch": 5.0550341296928325, + "grad_norm": 0.13570536456332846, + "learning_rate": 3.476947264267748e-05, + "loss": 0.8836, + "step": 23698 + }, + { + "epoch": 5.055247440273037, + "grad_norm": 0.15027206891599007, + "learning_rate": 3.47624537932666e-05, + "loss": 0.8846, + "step": 23699 + }, + { + "epoch": 5.055460750853243, + "grad_norm": 0.157750532305795, + "learning_rate": 3.475543545571934e-05, + "loss": 0.8967, + "step": 23700 + }, + { + "epoch": 5.055674061433447, + "grad_norm": 0.12870925117748358, + "learning_rate": 3.47484176301151e-05, + "loss": 0.8993, + "step": 23701 + }, + { + "epoch": 5.055887372013652, + "grad_norm": 0.19797215321483574, + "learning_rate": 3.4741400316533264e-05, + "loss": 0.9005, + "step": 23702 + }, + { + "epoch": 5.056100682593857, + "grad_norm": 0.11549516842758667, + "learning_rate": 3.473438351505322e-05, + "loss": 0.8928, + "step": 23703 + }, + { + "epoch": 5.0563139931740615, + "grad_norm": 0.18969357057168043, + "learning_rate": 3.472736722575442e-05, + "loss": 0.8937, + "step": 23704 + }, + { + "epoch": 5.056527303754266, + "grad_norm": 0.11183286651167784, + "learning_rate": 3.472035144871625e-05, + "loss": 0.888, + "step": 23705 + }, + { + "epoch": 5.056740614334471, + "grad_norm": 0.1665577100417904, + "learning_rate": 3.4713336184018e-05, + "loss": 0.8691, + "step": 23706 + }, + { + "epoch": 5.0569539249146755, + "grad_norm": 0.1300516178539664, + "learning_rate": 3.470632143173915e-05, + "loss": 0.9073, + "step": 23707 + }, + { + "epoch": 5.05716723549488, + "grad_norm": 0.14522239321891037, + "learning_rate": 3.4699307191959025e-05, + "loss": 0.8864, + "step": 23708 + }, + { + "epoch": 5.057380546075085, + "grad_norm": 0.1457796454318108, + "learning_rate": 3.469229346475701e-05, + "loss": 0.8755, + "step": 23709 + }, + { + "epoch": 5.0575938566552905, + "grad_norm": 0.16114841024257995, + "learning_rate": 3.468528025021246e-05, + "loss": 0.89, + "step": 23710 + }, + { + "epoch": 5.057807167235495, + "grad_norm": 0.12188567372865798, + "learning_rate": 3.467826754840469e-05, + "loss": 0.8678, + "step": 23711 + }, + { + "epoch": 5.0580204778157, + "grad_norm": 0.14706850444659808, + "learning_rate": 3.467125535941311e-05, + "loss": 0.8806, + "step": 23712 + }, + { + "epoch": 5.0582337883959045, + "grad_norm": 0.13548724417282063, + "learning_rate": 3.466424368331705e-05, + "loss": 0.9052, + "step": 23713 + }, + { + "epoch": 5.058447098976109, + "grad_norm": 0.14737289454964458, + "learning_rate": 3.465723252019583e-05, + "loss": 0.9289, + "step": 23714 + }, + { + "epoch": 5.058660409556314, + "grad_norm": 0.14547056304221623, + "learning_rate": 3.465022187012875e-05, + "loss": 0.9025, + "step": 23715 + }, + { + "epoch": 5.058873720136519, + "grad_norm": 0.11359417865352811, + "learning_rate": 3.464321173319524e-05, + "loss": 0.8813, + "step": 23716 + }, + { + "epoch": 5.059087030716723, + "grad_norm": 0.14728595005394857, + "learning_rate": 3.463620210947453e-05, + "loss": 0.889, + "step": 23717 + }, + { + "epoch": 5.059300341296928, + "grad_norm": 0.12519217125805357, + "learning_rate": 3.462919299904591e-05, + "loss": 0.8899, + "step": 23718 + }, + { + "epoch": 5.0595136518771335, + "grad_norm": 0.13475978356544438, + "learning_rate": 3.462218440198877e-05, + "loss": 0.8915, + "step": 23719 + }, + { + "epoch": 5.059726962457338, + "grad_norm": 0.13326929077333943, + "learning_rate": 3.461517631838238e-05, + "loss": 0.8955, + "step": 23720 + }, + { + "epoch": 5.059940273037543, + "grad_norm": 0.13033560644894956, + "learning_rate": 3.4608168748306024e-05, + "loss": 0.9001, + "step": 23721 + }, + { + "epoch": 5.060153583617748, + "grad_norm": 0.12316527838910901, + "learning_rate": 3.460116169183901e-05, + "loss": 0.8997, + "step": 23722 + }, + { + "epoch": 5.060366894197952, + "grad_norm": 0.1349962443068225, + "learning_rate": 3.4594155149060585e-05, + "loss": 0.9019, + "step": 23723 + }, + { + "epoch": 5.060580204778157, + "grad_norm": 0.1288725544856069, + "learning_rate": 3.458714912005008e-05, + "loss": 0.8862, + "step": 23724 + }, + { + "epoch": 5.060793515358362, + "grad_norm": 0.11566999231822454, + "learning_rate": 3.458014360488676e-05, + "loss": 0.8682, + "step": 23725 + }, + { + "epoch": 5.061006825938566, + "grad_norm": 0.13570192407129777, + "learning_rate": 3.4573138603649875e-05, + "loss": 0.9133, + "step": 23726 + }, + { + "epoch": 5.061220136518771, + "grad_norm": 0.12888543498607644, + "learning_rate": 3.456613411641868e-05, + "loss": 0.8822, + "step": 23727 + }, + { + "epoch": 5.061433447098976, + "grad_norm": 0.12886125054736874, + "learning_rate": 3.455913014327246e-05, + "loss": 0.9019, + "step": 23728 + }, + { + "epoch": 5.061646757679181, + "grad_norm": 0.1450428526854081, + "learning_rate": 3.4552126684290435e-05, + "loss": 0.8904, + "step": 23729 + }, + { + "epoch": 5.061860068259386, + "grad_norm": 0.13278025453274225, + "learning_rate": 3.454512373955183e-05, + "loss": 0.8946, + "step": 23730 + }, + { + "epoch": 5.062073378839591, + "grad_norm": 0.1343933305054186, + "learning_rate": 3.453812130913595e-05, + "loss": 0.872, + "step": 23731 + }, + { + "epoch": 5.062286689419795, + "grad_norm": 0.13607169633842714, + "learning_rate": 3.453111939312199e-05, + "loss": 0.9075, + "step": 23732 + }, + { + "epoch": 5.0625, + "grad_norm": 0.14902401425742595, + "learning_rate": 3.452411799158918e-05, + "loss": 0.8781, + "step": 23733 + }, + { + "epoch": 5.062713310580205, + "grad_norm": 0.12132794024274514, + "learning_rate": 3.4517117104616736e-05, + "loss": 0.8889, + "step": 23734 + }, + { + "epoch": 5.062926621160409, + "grad_norm": 0.13902084904970724, + "learning_rate": 3.4510116732283846e-05, + "loss": 0.8648, + "step": 23735 + }, + { + "epoch": 5.063139931740614, + "grad_norm": 0.14460769535518453, + "learning_rate": 3.450311687466979e-05, + "loss": 0.8678, + "step": 23736 + }, + { + "epoch": 5.063353242320819, + "grad_norm": 0.158241154761469, + "learning_rate": 3.449611753185372e-05, + "loss": 0.871, + "step": 23737 + }, + { + "epoch": 5.063566552901024, + "grad_norm": 0.15965443212713792, + "learning_rate": 3.448911870391485e-05, + "loss": 0.9114, + "step": 23738 + }, + { + "epoch": 5.063779863481229, + "grad_norm": 0.18681827229213388, + "learning_rate": 3.448212039093237e-05, + "loss": 0.8841, + "step": 23739 + }, + { + "epoch": 5.063993174061434, + "grad_norm": 0.1264838810318932, + "learning_rate": 3.4475122592985464e-05, + "loss": 0.9073, + "step": 23740 + }, + { + "epoch": 5.064206484641638, + "grad_norm": 0.20724830033762695, + "learning_rate": 3.446812531015331e-05, + "loss": 0.8792, + "step": 23741 + }, + { + "epoch": 5.064419795221843, + "grad_norm": 0.12770979237088606, + "learning_rate": 3.446112854251506e-05, + "loss": 0.899, + "step": 23742 + }, + { + "epoch": 5.064633105802048, + "grad_norm": 0.1728316982462802, + "learning_rate": 3.445413229014992e-05, + "loss": 0.8751, + "step": 23743 + }, + { + "epoch": 5.064846416382252, + "grad_norm": 0.12979480014510328, + "learning_rate": 3.444713655313705e-05, + "loss": 0.9113, + "step": 23744 + }, + { + "epoch": 5.065059726962457, + "grad_norm": 0.14825197042426635, + "learning_rate": 3.44401413315556e-05, + "loss": 0.9198, + "step": 23745 + }, + { + "epoch": 5.065273037542662, + "grad_norm": 0.14839615924571875, + "learning_rate": 3.4433146625484704e-05, + "loss": 0.8885, + "step": 23746 + }, + { + "epoch": 5.0654863481228665, + "grad_norm": 0.13356153062237822, + "learning_rate": 3.442615243500351e-05, + "loss": 0.8953, + "step": 23747 + }, + { + "epoch": 5.065699658703072, + "grad_norm": 0.16432342918586668, + "learning_rate": 3.4419158760191175e-05, + "loss": 0.8558, + "step": 23748 + }, + { + "epoch": 5.065912969283277, + "grad_norm": 0.4577313273717952, + "learning_rate": 3.441216560112687e-05, + "loss": 0.9316, + "step": 23749 + }, + { + "epoch": 5.066126279863481, + "grad_norm": 0.17744819555435548, + "learning_rate": 3.4405172957889615e-05, + "loss": 0.9218, + "step": 23750 + }, + { + "epoch": 5.066339590443686, + "grad_norm": 0.12374764489226252, + "learning_rate": 3.439818083055863e-05, + "loss": 0.9187, + "step": 23751 + }, + { + "epoch": 5.066552901023891, + "grad_norm": 0.16039683350210804, + "learning_rate": 3.439118921921298e-05, + "loss": 0.876, + "step": 23752 + }, + { + "epoch": 5.0667662116040955, + "grad_norm": 0.13803898897712574, + "learning_rate": 3.438419812393181e-05, + "loss": 0.8943, + "step": 23753 + }, + { + "epoch": 5.0669795221843, + "grad_norm": 0.14499799535700603, + "learning_rate": 3.437720754479417e-05, + "loss": 0.8898, + "step": 23754 + }, + { + "epoch": 5.067192832764505, + "grad_norm": 0.12745750593714378, + "learning_rate": 3.4370217481879224e-05, + "loss": 0.9081, + "step": 23755 + }, + { + "epoch": 5.0674061433447095, + "grad_norm": 0.13603583223253593, + "learning_rate": 3.436322793526603e-05, + "loss": 0.8762, + "step": 23756 + }, + { + "epoch": 5.067619453924915, + "grad_norm": 0.12820406418075697, + "learning_rate": 3.4356238905033676e-05, + "loss": 0.8782, + "step": 23757 + }, + { + "epoch": 5.06783276450512, + "grad_norm": 0.12656178483407818, + "learning_rate": 3.434925039126125e-05, + "loss": 0.9065, + "step": 23758 + }, + { + "epoch": 5.0680460750853245, + "grad_norm": 0.12783319629425263, + "learning_rate": 3.43422623940278e-05, + "loss": 0.8837, + "step": 23759 + }, + { + "epoch": 5.068259385665529, + "grad_norm": 0.15202584152256748, + "learning_rate": 3.4335274913412445e-05, + "loss": 0.8711, + "step": 23760 + }, + { + "epoch": 5.068472696245734, + "grad_norm": 0.12822335616870859, + "learning_rate": 3.432828794949425e-05, + "loss": 0.8742, + "step": 23761 + }, + { + "epoch": 5.0686860068259385, + "grad_norm": 0.1368800507126739, + "learning_rate": 3.432130150235219e-05, + "loss": 0.8793, + "step": 23762 + }, + { + "epoch": 5.068899317406143, + "grad_norm": 0.1342876001982007, + "learning_rate": 3.43143155720654e-05, + "loss": 0.8791, + "step": 23763 + }, + { + "epoch": 5.069112627986348, + "grad_norm": 0.15185183207932768, + "learning_rate": 3.43073301587129e-05, + "loss": 0.896, + "step": 23764 + }, + { + "epoch": 5.069325938566553, + "grad_norm": 0.12637178753883782, + "learning_rate": 3.4300345262373725e-05, + "loss": 0.8737, + "step": 23765 + }, + { + "epoch": 5.069539249146757, + "grad_norm": 0.1562309628829338, + "learning_rate": 3.4293360883126874e-05, + "loss": 0.9091, + "step": 23766 + }, + { + "epoch": 5.069752559726963, + "grad_norm": 0.11844961921022917, + "learning_rate": 3.4286377021051453e-05, + "loss": 0.8612, + "step": 23767 + }, + { + "epoch": 5.0699658703071675, + "grad_norm": 0.1513851845835729, + "learning_rate": 3.427939367622644e-05, + "loss": 0.8895, + "step": 23768 + }, + { + "epoch": 5.070179180887372, + "grad_norm": 0.1394751269644842, + "learning_rate": 3.4272410848730863e-05, + "loss": 0.9004, + "step": 23769 + }, + { + "epoch": 5.070392491467577, + "grad_norm": 0.1494593617101254, + "learning_rate": 3.426542853864373e-05, + "loss": 0.8667, + "step": 23770 + }, + { + "epoch": 5.0706058020477816, + "grad_norm": 0.14724835238775863, + "learning_rate": 3.4258446746044e-05, + "loss": 0.8944, + "step": 23771 + }, + { + "epoch": 5.070819112627986, + "grad_norm": 0.14580262176543496, + "learning_rate": 3.425146547101078e-05, + "loss": 0.8946, + "step": 23772 + }, + { + "epoch": 5.071032423208191, + "grad_norm": 0.1702338468854895, + "learning_rate": 3.424448471362297e-05, + "loss": 0.9158, + "step": 23773 + }, + { + "epoch": 5.071245733788396, + "grad_norm": 0.12745161714596123, + "learning_rate": 3.423750447395955e-05, + "loss": 0.864, + "step": 23774 + }, + { + "epoch": 5.0714590443686, + "grad_norm": 0.14735883939394837, + "learning_rate": 3.423052475209957e-05, + "loss": 0.9122, + "step": 23775 + }, + { + "epoch": 5.071672354948806, + "grad_norm": 0.12767426466253556, + "learning_rate": 3.422354554812197e-05, + "loss": 0.8747, + "step": 23776 + }, + { + "epoch": 5.0718856655290105, + "grad_norm": 0.1440273963420305, + "learning_rate": 3.421656686210573e-05, + "loss": 0.8835, + "step": 23777 + }, + { + "epoch": 5.072098976109215, + "grad_norm": 0.13378561231040983, + "learning_rate": 3.4209588694129764e-05, + "loss": 0.8872, + "step": 23778 + }, + { + "epoch": 5.07231228668942, + "grad_norm": 0.14579986112746782, + "learning_rate": 3.4202611044273105e-05, + "loss": 0.9008, + "step": 23779 + }, + { + "epoch": 5.072525597269625, + "grad_norm": 0.15418483271338362, + "learning_rate": 3.419563391261468e-05, + "loss": 0.8817, + "step": 23780 + }, + { + "epoch": 5.072738907849829, + "grad_norm": 0.1192806307699508, + "learning_rate": 3.418865729923342e-05, + "loss": 0.884, + "step": 23781 + }, + { + "epoch": 5.072952218430034, + "grad_norm": 0.1582815465912036, + "learning_rate": 3.418168120420828e-05, + "loss": 0.8997, + "step": 23782 + }, + { + "epoch": 5.073165529010239, + "grad_norm": 0.13341326207836235, + "learning_rate": 3.417470562761817e-05, + "loss": 0.9214, + "step": 23783 + }, + { + "epoch": 5.073378839590443, + "grad_norm": 0.1366662034118791, + "learning_rate": 3.4167730569542064e-05, + "loss": 0.8732, + "step": 23784 + }, + { + "epoch": 5.073592150170649, + "grad_norm": 0.13258772611909092, + "learning_rate": 3.416075603005884e-05, + "loss": 0.9007, + "step": 23785 + }, + { + "epoch": 5.073805460750854, + "grad_norm": 0.15005029143421317, + "learning_rate": 3.4153782009247407e-05, + "loss": 0.8753, + "step": 23786 + }, + { + "epoch": 5.074018771331058, + "grad_norm": 0.11910405478403921, + "learning_rate": 3.414680850718673e-05, + "loss": 0.8619, + "step": 23787 + }, + { + "epoch": 5.074232081911263, + "grad_norm": 0.13493807038993993, + "learning_rate": 3.4139835523955686e-05, + "loss": 0.8814, + "step": 23788 + }, + { + "epoch": 5.074445392491468, + "grad_norm": 0.1311509709127509, + "learning_rate": 3.4132863059633185e-05, + "loss": 0.8982, + "step": 23789 + }, + { + "epoch": 5.074658703071672, + "grad_norm": 0.1375049179157058, + "learning_rate": 3.4125891114298064e-05, + "loss": 0.8799, + "step": 23790 + }, + { + "epoch": 5.074872013651877, + "grad_norm": 0.12639834784698303, + "learning_rate": 3.411891968802929e-05, + "loss": 0.8818, + "step": 23791 + }, + { + "epoch": 5.075085324232082, + "grad_norm": 0.13371479544866013, + "learning_rate": 3.411194878090572e-05, + "loss": 0.901, + "step": 23792 + }, + { + "epoch": 5.075298634812286, + "grad_norm": 0.14153812136562172, + "learning_rate": 3.410497839300622e-05, + "loss": 0.8814, + "step": 23793 + }, + { + "epoch": 5.075511945392491, + "grad_norm": 0.12369315594862122, + "learning_rate": 3.409800852440966e-05, + "loss": 0.878, + "step": 23794 + }, + { + "epoch": 5.075725255972697, + "grad_norm": 0.15253422757294707, + "learning_rate": 3.40910391751949e-05, + "loss": 0.9116, + "step": 23795 + }, + { + "epoch": 5.075938566552901, + "grad_norm": 0.1339971179430861, + "learning_rate": 3.4084070345440804e-05, + "loss": 0.8847, + "step": 23796 + }, + { + "epoch": 5.076151877133106, + "grad_norm": 0.15830921302602585, + "learning_rate": 3.407710203522623e-05, + "loss": 0.8863, + "step": 23797 + }, + { + "epoch": 5.076365187713311, + "grad_norm": 0.14900441070332476, + "learning_rate": 3.4070134244629984e-05, + "loss": 0.9073, + "step": 23798 + }, + { + "epoch": 5.076578498293515, + "grad_norm": 0.13095412131624326, + "learning_rate": 3.406316697373098e-05, + "loss": 0.8878, + "step": 23799 + }, + { + "epoch": 5.07679180887372, + "grad_norm": 0.15817508245207212, + "learning_rate": 3.4056200222608005e-05, + "loss": 0.8862, + "step": 23800 + }, + { + "epoch": 5.077005119453925, + "grad_norm": 0.15662383384789919, + "learning_rate": 3.40492339913399e-05, + "loss": 0.89, + "step": 23801 + }, + { + "epoch": 5.077218430034129, + "grad_norm": 0.14456535135916007, + "learning_rate": 3.404226828000546e-05, + "loss": 0.8913, + "step": 23802 + }, + { + "epoch": 5.077431740614334, + "grad_norm": 0.1679089138173814, + "learning_rate": 3.4035303088683554e-05, + "loss": 0.8843, + "step": 23803 + }, + { + "epoch": 5.07764505119454, + "grad_norm": 0.13874453415955093, + "learning_rate": 3.402833841745296e-05, + "loss": 0.8526, + "step": 23804 + }, + { + "epoch": 5.077858361774744, + "grad_norm": 0.15611918114303439, + "learning_rate": 3.402137426639253e-05, + "loss": 0.8832, + "step": 23805 + }, + { + "epoch": 5.078071672354949, + "grad_norm": 0.15454783642435604, + "learning_rate": 3.401441063558096e-05, + "loss": 0.8947, + "step": 23806 + }, + { + "epoch": 5.078284982935154, + "grad_norm": 0.14980787476795926, + "learning_rate": 3.400744752509714e-05, + "loss": 0.8828, + "step": 23807 + }, + { + "epoch": 5.078498293515358, + "grad_norm": 0.16439655776925124, + "learning_rate": 3.400048493501982e-05, + "loss": 0.9318, + "step": 23808 + }, + { + "epoch": 5.078711604095563, + "grad_norm": 0.1429280026192111, + "learning_rate": 3.3993522865427785e-05, + "loss": 0.9108, + "step": 23809 + }, + { + "epoch": 5.078924914675768, + "grad_norm": 0.15483047984053253, + "learning_rate": 3.398656131639979e-05, + "loss": 0.8839, + "step": 23810 + }, + { + "epoch": 5.0791382252559725, + "grad_norm": 0.15980481082927206, + "learning_rate": 3.397960028801465e-05, + "loss": 0.8971, + "step": 23811 + }, + { + "epoch": 5.079351535836177, + "grad_norm": 0.171728469157818, + "learning_rate": 3.397263978035111e-05, + "loss": 0.8751, + "step": 23812 + }, + { + "epoch": 5.079564846416382, + "grad_norm": 0.1508216563302014, + "learning_rate": 3.3965679793487924e-05, + "loss": 0.9056, + "step": 23813 + }, + { + "epoch": 5.079778156996587, + "grad_norm": 0.15895911798793508, + "learning_rate": 3.395872032750382e-05, + "loss": 0.8745, + "step": 23814 + }, + { + "epoch": 5.079991467576792, + "grad_norm": 0.11932800865297791, + "learning_rate": 3.3951761382477594e-05, + "loss": 0.8666, + "step": 23815 + }, + { + "epoch": 5.080204778156997, + "grad_norm": 0.16373276411668822, + "learning_rate": 3.3944802958487995e-05, + "loss": 0.8941, + "step": 23816 + }, + { + "epoch": 5.0804180887372015, + "grad_norm": 0.13049075530912652, + "learning_rate": 3.39378450556137e-05, + "loss": 0.8834, + "step": 23817 + }, + { + "epoch": 5.080631399317406, + "grad_norm": 0.15800064611591902, + "learning_rate": 3.3930887673933425e-05, + "loss": 0.8974, + "step": 23818 + }, + { + "epoch": 5.080844709897611, + "grad_norm": 0.1498760900967775, + "learning_rate": 3.3923930813525975e-05, + "loss": 0.8794, + "step": 23819 + }, + { + "epoch": 5.0810580204778155, + "grad_norm": 0.16093801862492488, + "learning_rate": 3.391697447447e-05, + "loss": 0.9142, + "step": 23820 + }, + { + "epoch": 5.08127133105802, + "grad_norm": 0.14251647671622603, + "learning_rate": 3.391001865684425e-05, + "loss": 0.8751, + "step": 23821 + }, + { + "epoch": 5.081484641638225, + "grad_norm": 0.1806338748372225, + "learning_rate": 3.390306336072739e-05, + "loss": 0.9027, + "step": 23822 + }, + { + "epoch": 5.0816979522184305, + "grad_norm": 0.1232020658645886, + "learning_rate": 3.3896108586198154e-05, + "loss": 0.8822, + "step": 23823 + }, + { + "epoch": 5.081911262798635, + "grad_norm": 0.16000254223355925, + "learning_rate": 3.388915433333524e-05, + "loss": 0.8904, + "step": 23824 + }, + { + "epoch": 5.08212457337884, + "grad_norm": 0.12301214764980516, + "learning_rate": 3.388220060221732e-05, + "loss": 0.882, + "step": 23825 + }, + { + "epoch": 5.0823378839590445, + "grad_norm": 0.14568970464772124, + "learning_rate": 3.387524739292303e-05, + "loss": 0.897, + "step": 23826 + }, + { + "epoch": 5.082551194539249, + "grad_norm": 0.12459018705815403, + "learning_rate": 3.386829470553114e-05, + "loss": 0.8949, + "step": 23827 + }, + { + "epoch": 5.082764505119454, + "grad_norm": 0.1421853174661392, + "learning_rate": 3.3861342540120285e-05, + "loss": 0.8474, + "step": 23828 + }, + { + "epoch": 5.082977815699659, + "grad_norm": 0.13873904754910985, + "learning_rate": 3.385439089676911e-05, + "loss": 0.8851, + "step": 23829 + }, + { + "epoch": 5.083191126279863, + "grad_norm": 0.13058075528458937, + "learning_rate": 3.3847439775556225e-05, + "loss": 0.8975, + "step": 23830 + }, + { + "epoch": 5.083404436860068, + "grad_norm": 0.1656838251088018, + "learning_rate": 3.384048917656038e-05, + "loss": 0.8838, + "step": 23831 + }, + { + "epoch": 5.083617747440273, + "grad_norm": 0.12100281831038738, + "learning_rate": 3.3833539099860164e-05, + "loss": 0.9014, + "step": 23832 + }, + { + "epoch": 5.083831058020478, + "grad_norm": 0.12759282222438675, + "learning_rate": 3.3826589545534235e-05, + "loss": 0.8898, + "step": 23833 + }, + { + "epoch": 5.084044368600683, + "grad_norm": 0.14885336097357796, + "learning_rate": 3.381964051366119e-05, + "loss": 0.8939, + "step": 23834 + }, + { + "epoch": 5.084257679180888, + "grad_norm": 0.11653097452431666, + "learning_rate": 3.381269200431972e-05, + "loss": 0.9162, + "step": 23835 + }, + { + "epoch": 5.084470989761092, + "grad_norm": 0.13696220578554513, + "learning_rate": 3.3805744017588416e-05, + "loss": 0.8929, + "step": 23836 + }, + { + "epoch": 5.084684300341297, + "grad_norm": 0.12571469423256326, + "learning_rate": 3.3798796553545905e-05, + "loss": 0.9247, + "step": 23837 + }, + { + "epoch": 5.084897610921502, + "grad_norm": 0.1281831872700319, + "learning_rate": 3.3791849612270744e-05, + "loss": 0.8656, + "step": 23838 + }, + { + "epoch": 5.085110921501706, + "grad_norm": 0.14923535322684284, + "learning_rate": 3.378490319384165e-05, + "loss": 0.906, + "step": 23839 + }, + { + "epoch": 5.085324232081911, + "grad_norm": 0.13271890181919616, + "learning_rate": 3.3777957298337116e-05, + "loss": 0.8877, + "step": 23840 + }, + { + "epoch": 5.085537542662116, + "grad_norm": 0.1457548024220435, + "learning_rate": 3.3771011925835786e-05, + "loss": 0.8974, + "step": 23841 + }, + { + "epoch": 5.085750853242321, + "grad_norm": 0.1641038794841178, + "learning_rate": 3.3764067076416196e-05, + "loss": 0.8695, + "step": 23842 + }, + { + "epoch": 5.085964163822526, + "grad_norm": 0.15378646309206576, + "learning_rate": 3.375712275015699e-05, + "loss": 0.8701, + "step": 23843 + }, + { + "epoch": 5.086177474402731, + "grad_norm": 0.15572209493676545, + "learning_rate": 3.375017894713673e-05, + "loss": 0.8685, + "step": 23844 + }, + { + "epoch": 5.086390784982935, + "grad_norm": 0.16677114235589988, + "learning_rate": 3.374323566743396e-05, + "loss": 0.8833, + "step": 23845 + }, + { + "epoch": 5.08660409556314, + "grad_norm": 0.15613704486873609, + "learning_rate": 3.3736292911127235e-05, + "loss": 0.8957, + "step": 23846 + }, + { + "epoch": 5.086817406143345, + "grad_norm": 0.12674929704996932, + "learning_rate": 3.372935067829517e-05, + "loss": 0.9056, + "step": 23847 + }, + { + "epoch": 5.087030716723549, + "grad_norm": 0.1767575091798489, + "learning_rate": 3.3722408969016265e-05, + "loss": 0.8676, + "step": 23848 + }, + { + "epoch": 5.087244027303754, + "grad_norm": 0.1336011602632613, + "learning_rate": 3.3715467783369086e-05, + "loss": 0.887, + "step": 23849 + }, + { + "epoch": 5.087457337883959, + "grad_norm": 0.17173641528159805, + "learning_rate": 3.3708527121432166e-05, + "loss": 0.8857, + "step": 23850 + }, + { + "epoch": 5.087670648464163, + "grad_norm": 0.17030610346267505, + "learning_rate": 3.370158698328404e-05, + "loss": 0.8905, + "step": 23851 + }, + { + "epoch": 5.087883959044369, + "grad_norm": 0.1468065613249794, + "learning_rate": 3.3694647369003245e-05, + "loss": 0.904, + "step": 23852 + }, + { + "epoch": 5.088097269624574, + "grad_norm": 0.18166004521729592, + "learning_rate": 3.368770827866829e-05, + "loss": 0.9014, + "step": 23853 + }, + { + "epoch": 5.088310580204778, + "grad_norm": 0.1133038039124321, + "learning_rate": 3.368076971235766e-05, + "loss": 0.8781, + "step": 23854 + }, + { + "epoch": 5.088523890784983, + "grad_norm": 0.21640528483268287, + "learning_rate": 3.367383167014994e-05, + "loss": 0.877, + "step": 23855 + }, + { + "epoch": 5.088737201365188, + "grad_norm": 0.15924496257494244, + "learning_rate": 3.366689415212358e-05, + "loss": 0.8674, + "step": 23856 + }, + { + "epoch": 5.088950511945392, + "grad_norm": 0.16254975566881513, + "learning_rate": 3.36599571583571e-05, + "loss": 0.9058, + "step": 23857 + }, + { + "epoch": 5.089163822525597, + "grad_norm": 0.15032522359908962, + "learning_rate": 3.365302068892896e-05, + "loss": 0.8997, + "step": 23858 + }, + { + "epoch": 5.089377133105802, + "grad_norm": 0.1759715584904498, + "learning_rate": 3.364608474391769e-05, + "loss": 0.9082, + "step": 23859 + }, + { + "epoch": 5.089590443686006, + "grad_norm": 0.16261526689998165, + "learning_rate": 3.3639149323401755e-05, + "loss": 0.8824, + "step": 23860 + }, + { + "epoch": 5.089803754266212, + "grad_norm": 0.13360473524729338, + "learning_rate": 3.363221442745965e-05, + "loss": 0.8562, + "step": 23861 + }, + { + "epoch": 5.090017064846417, + "grad_norm": 0.15321861710868118, + "learning_rate": 3.362528005616977e-05, + "loss": 0.8886, + "step": 23862 + }, + { + "epoch": 5.090230375426621, + "grad_norm": 0.14354008602777318, + "learning_rate": 3.3618346209610645e-05, + "loss": 0.8889, + "step": 23863 + }, + { + "epoch": 5.090443686006826, + "grad_norm": 0.1244938990337559, + "learning_rate": 3.3611412887860716e-05, + "loss": 0.8885, + "step": 23864 + }, + { + "epoch": 5.090656996587031, + "grad_norm": 0.14121886221077778, + "learning_rate": 3.3604480090998435e-05, + "loss": 0.8711, + "step": 23865 + }, + { + "epoch": 5.090870307167235, + "grad_norm": 0.1233520039068133, + "learning_rate": 3.359754781910221e-05, + "loss": 0.9083, + "step": 23866 + }, + { + "epoch": 5.09108361774744, + "grad_norm": 0.13664106133527124, + "learning_rate": 3.359061607225054e-05, + "loss": 0.8805, + "step": 23867 + }, + { + "epoch": 5.091296928327645, + "grad_norm": 0.14698523051337664, + "learning_rate": 3.358368485052183e-05, + "loss": 0.8885, + "step": 23868 + }, + { + "epoch": 5.0915102389078495, + "grad_norm": 0.1296865097052755, + "learning_rate": 3.357675415399451e-05, + "loss": 0.8784, + "step": 23869 + }, + { + "epoch": 5.091723549488055, + "grad_norm": 0.14569439205969187, + "learning_rate": 3.3569823982746966e-05, + "loss": 0.9026, + "step": 23870 + }, + { + "epoch": 5.09193686006826, + "grad_norm": 0.13974262238086557, + "learning_rate": 3.356289433685767e-05, + "loss": 0.8902, + "step": 23871 + }, + { + "epoch": 5.092150170648464, + "grad_norm": 0.1276474830621021, + "learning_rate": 3.355596521640504e-05, + "loss": 0.8848, + "step": 23872 + }, + { + "epoch": 5.092363481228669, + "grad_norm": 0.1456219751030236, + "learning_rate": 3.3549036621467415e-05, + "loss": 0.8728, + "step": 23873 + }, + { + "epoch": 5.092576791808874, + "grad_norm": 0.1310442316606975, + "learning_rate": 3.3542108552123194e-05, + "loss": 0.9097, + "step": 23874 + }, + { + "epoch": 5.0927901023890785, + "grad_norm": 0.1302226886574074, + "learning_rate": 3.353518100845082e-05, + "loss": 0.8702, + "step": 23875 + }, + { + "epoch": 5.093003412969283, + "grad_norm": 0.14871688372193817, + "learning_rate": 3.352825399052865e-05, + "loss": 0.8948, + "step": 23876 + }, + { + "epoch": 5.093216723549488, + "grad_norm": 0.12284562201948644, + "learning_rate": 3.3521327498435075e-05, + "loss": 0.8915, + "step": 23877 + }, + { + "epoch": 5.0934300341296925, + "grad_norm": 0.1626238748882789, + "learning_rate": 3.351440153224844e-05, + "loss": 0.8944, + "step": 23878 + }, + { + "epoch": 5.093643344709897, + "grad_norm": 0.11633562638054885, + "learning_rate": 3.350747609204716e-05, + "loss": 0.8987, + "step": 23879 + }, + { + "epoch": 5.093856655290103, + "grad_norm": 0.14213364725272803, + "learning_rate": 3.3500551177909555e-05, + "loss": 0.8948, + "step": 23880 + }, + { + "epoch": 5.0940699658703075, + "grad_norm": 0.12547524285107528, + "learning_rate": 3.3493626789914004e-05, + "loss": 0.8644, + "step": 23881 + }, + { + "epoch": 5.094283276450512, + "grad_norm": 0.13530247652922714, + "learning_rate": 3.348670292813882e-05, + "loss": 0.9061, + "step": 23882 + }, + { + "epoch": 5.094496587030717, + "grad_norm": 0.14277713696283728, + "learning_rate": 3.3479779592662414e-05, + "loss": 0.8395, + "step": 23883 + }, + { + "epoch": 5.0947098976109215, + "grad_norm": 0.12987106959132821, + "learning_rate": 3.34728567835631e-05, + "loss": 0.9006, + "step": 23884 + }, + { + "epoch": 5.094923208191126, + "grad_norm": 0.1478848732881297, + "learning_rate": 3.3465934500919165e-05, + "loss": 0.8895, + "step": 23885 + }, + { + "epoch": 5.095136518771331, + "grad_norm": 0.1505595698148549, + "learning_rate": 3.345901274480894e-05, + "loss": 0.8731, + "step": 23886 + }, + { + "epoch": 5.095349829351536, + "grad_norm": 0.14925925045691166, + "learning_rate": 3.345209151531079e-05, + "loss": 0.8818, + "step": 23887 + }, + { + "epoch": 5.09556313993174, + "grad_norm": 0.1259963971990946, + "learning_rate": 3.344517081250301e-05, + "loss": 0.8617, + "step": 23888 + }, + { + "epoch": 5.095776450511945, + "grad_norm": 0.16432757991645117, + "learning_rate": 3.343825063646392e-05, + "loss": 0.8754, + "step": 23889 + }, + { + "epoch": 5.0959897610921505, + "grad_norm": 0.12766913300390267, + "learning_rate": 3.343133098727177e-05, + "loss": 0.8993, + "step": 23890 + }, + { + "epoch": 5.096203071672355, + "grad_norm": 0.15640297469284878, + "learning_rate": 3.3424411865004914e-05, + "loss": 0.8603, + "step": 23891 + }, + { + "epoch": 5.09641638225256, + "grad_norm": 0.13511218084107976, + "learning_rate": 3.341749326974163e-05, + "loss": 0.8786, + "step": 23892 + }, + { + "epoch": 5.096629692832765, + "grad_norm": 0.13546167493422465, + "learning_rate": 3.341057520156019e-05, + "loss": 0.8927, + "step": 23893 + }, + { + "epoch": 5.096843003412969, + "grad_norm": 0.15810110559030868, + "learning_rate": 3.340365766053884e-05, + "loss": 0.8779, + "step": 23894 + }, + { + "epoch": 5.097056313993174, + "grad_norm": 0.13307692676419733, + "learning_rate": 3.339674064675595e-05, + "loss": 0.8802, + "step": 23895 + }, + { + "epoch": 5.097269624573379, + "grad_norm": 0.1273995953997768, + "learning_rate": 3.3389824160289705e-05, + "loss": 0.892, + "step": 23896 + }, + { + "epoch": 5.097482935153583, + "grad_norm": 0.12101540810962508, + "learning_rate": 3.338290820121837e-05, + "loss": 0.8898, + "step": 23897 + }, + { + "epoch": 5.097696245733788, + "grad_norm": 0.12306439380031578, + "learning_rate": 3.337599276962019e-05, + "loss": 0.8809, + "step": 23898 + }, + { + "epoch": 5.097909556313994, + "grad_norm": 0.15587058370229745, + "learning_rate": 3.336907786557346e-05, + "loss": 0.919, + "step": 23899 + }, + { + "epoch": 5.098122866894198, + "grad_norm": 0.14434243586344248, + "learning_rate": 3.33621634891564e-05, + "loss": 0.9065, + "step": 23900 + }, + { + "epoch": 5.098336177474403, + "grad_norm": 0.13792167014361362, + "learning_rate": 3.3355249640447245e-05, + "loss": 0.8643, + "step": 23901 + }, + { + "epoch": 5.098549488054608, + "grad_norm": 0.13978198786137458, + "learning_rate": 3.3348336319524205e-05, + "loss": 0.8875, + "step": 23902 + }, + { + "epoch": 5.098762798634812, + "grad_norm": 0.12341774856958254, + "learning_rate": 3.334142352646554e-05, + "loss": 0.9028, + "step": 23903 + }, + { + "epoch": 5.098976109215017, + "grad_norm": 0.1335071881633399, + "learning_rate": 3.3334511261349455e-05, + "loss": 0.8909, + "step": 23904 + }, + { + "epoch": 5.099189419795222, + "grad_norm": 0.1394185987703655, + "learning_rate": 3.332759952425416e-05, + "loss": 0.9022, + "step": 23905 + }, + { + "epoch": 5.099402730375426, + "grad_norm": 0.12872332062497122, + "learning_rate": 3.332068831525788e-05, + "loss": 0.9199, + "step": 23906 + }, + { + "epoch": 5.099616040955631, + "grad_norm": 0.12712854457379044, + "learning_rate": 3.3313777634438773e-05, + "loss": 0.8935, + "step": 23907 + }, + { + "epoch": 5.099829351535837, + "grad_norm": 0.15541140362725162, + "learning_rate": 3.3306867481875066e-05, + "loss": 0.8672, + "step": 23908 + }, + { + "epoch": 5.100042662116041, + "grad_norm": 0.12516323355036044, + "learning_rate": 3.329995785764494e-05, + "loss": 0.8808, + "step": 23909 + }, + { + "epoch": 5.100255972696246, + "grad_norm": 0.1810277032419085, + "learning_rate": 3.329304876182654e-05, + "loss": 0.9051, + "step": 23910 + }, + { + "epoch": 5.100469283276451, + "grad_norm": 0.13956405242393766, + "learning_rate": 3.328614019449812e-05, + "loss": 0.8896, + "step": 23911 + }, + { + "epoch": 5.100682593856655, + "grad_norm": 0.1298923478928936, + "learning_rate": 3.3279232155737794e-05, + "loss": 0.8728, + "step": 23912 + }, + { + "epoch": 5.10089590443686, + "grad_norm": 0.13920953484078508, + "learning_rate": 3.327232464562376e-05, + "loss": 0.8882, + "step": 23913 + }, + { + "epoch": 5.101109215017065, + "grad_norm": 0.10995040890820887, + "learning_rate": 3.326541766423412e-05, + "loss": 0.9018, + "step": 23914 + }, + { + "epoch": 5.101322525597269, + "grad_norm": 0.12620854279121843, + "learning_rate": 3.3258511211647096e-05, + "loss": 0.9059, + "step": 23915 + }, + { + "epoch": 5.101535836177474, + "grad_norm": 0.1373618633483841, + "learning_rate": 3.32516052879408e-05, + "loss": 0.8892, + "step": 23916 + }, + { + "epoch": 5.101749146757679, + "grad_norm": 0.11788971963807997, + "learning_rate": 3.3244699893193384e-05, + "loss": 0.8666, + "step": 23917 + }, + { + "epoch": 5.101962457337884, + "grad_norm": 0.13655140211444322, + "learning_rate": 3.323779502748297e-05, + "loss": 0.9016, + "step": 23918 + }, + { + "epoch": 5.102175767918089, + "grad_norm": 0.10876168112280739, + "learning_rate": 3.323089069088769e-05, + "loss": 0.8917, + "step": 23919 + }, + { + "epoch": 5.102389078498294, + "grad_norm": 0.14610445362170277, + "learning_rate": 3.322398688348567e-05, + "loss": 0.872, + "step": 23920 + }, + { + "epoch": 5.102602389078498, + "grad_norm": 0.11473692675706339, + "learning_rate": 3.3217083605355024e-05, + "loss": 0.9008, + "step": 23921 + }, + { + "epoch": 5.102815699658703, + "grad_norm": 0.12191471614426805, + "learning_rate": 3.321018085657384e-05, + "loss": 0.9099, + "step": 23922 + }, + { + "epoch": 5.103029010238908, + "grad_norm": 0.13414100428478215, + "learning_rate": 3.320327863722027e-05, + "loss": 0.8725, + "step": 23923 + }, + { + "epoch": 5.103242320819112, + "grad_norm": 0.11678319584499613, + "learning_rate": 3.319637694737239e-05, + "loss": 0.8358, + "step": 23924 + }, + { + "epoch": 5.103455631399317, + "grad_norm": 0.13617124311084228, + "learning_rate": 3.3189475787108297e-05, + "loss": 0.8736, + "step": 23925 + }, + { + "epoch": 5.103668941979522, + "grad_norm": 0.1327468743595263, + "learning_rate": 3.318257515650604e-05, + "loss": 0.8827, + "step": 23926 + }, + { + "epoch": 5.103882252559727, + "grad_norm": 0.12479264191779997, + "learning_rate": 3.3175675055643756e-05, + "loss": 0.8948, + "step": 23927 + }, + { + "epoch": 5.104095563139932, + "grad_norm": 0.1303228146437887, + "learning_rate": 3.316877548459954e-05, + "loss": 0.8782, + "step": 23928 + }, + { + "epoch": 5.104308873720137, + "grad_norm": 0.14123240409415216, + "learning_rate": 3.316187644345135e-05, + "loss": 0.8928, + "step": 23929 + }, + { + "epoch": 5.104522184300341, + "grad_norm": 0.1285177540417411, + "learning_rate": 3.315497793227734e-05, + "loss": 0.8916, + "step": 23930 + }, + { + "epoch": 5.104735494880546, + "grad_norm": 0.1472976254420712, + "learning_rate": 3.3148079951155554e-05, + "loss": 0.8666, + "step": 23931 + }, + { + "epoch": 5.104948805460751, + "grad_norm": 0.1336447369740465, + "learning_rate": 3.314118250016402e-05, + "loss": 0.9188, + "step": 23932 + }, + { + "epoch": 5.1051621160409555, + "grad_norm": 0.1392736447913198, + "learning_rate": 3.31342855793808e-05, + "loss": 0.8876, + "step": 23933 + }, + { + "epoch": 5.10537542662116, + "grad_norm": 0.13194693546126124, + "learning_rate": 3.31273891888839e-05, + "loss": 0.9242, + "step": 23934 + }, + { + "epoch": 5.105588737201365, + "grad_norm": 0.12817867918433432, + "learning_rate": 3.3120493328751413e-05, + "loss": 0.9277, + "step": 23935 + }, + { + "epoch": 5.1058020477815695, + "grad_norm": 0.1455446512959506, + "learning_rate": 3.311359799906132e-05, + "loss": 0.9385, + "step": 23936 + }, + { + "epoch": 5.106015358361775, + "grad_norm": 0.12561921005695278, + "learning_rate": 3.310670319989166e-05, + "loss": 0.8623, + "step": 23937 + }, + { + "epoch": 5.10622866894198, + "grad_norm": 0.16369671350807935, + "learning_rate": 3.3099808931320416e-05, + "loss": 0.8588, + "step": 23938 + }, + { + "epoch": 5.1064419795221845, + "grad_norm": 0.12625007783148123, + "learning_rate": 3.309291519342565e-05, + "loss": 0.9066, + "step": 23939 + }, + { + "epoch": 5.106655290102389, + "grad_norm": 0.16482404957722047, + "learning_rate": 3.3086021986285366e-05, + "loss": 0.8766, + "step": 23940 + }, + { + "epoch": 5.106868600682594, + "grad_norm": 0.12655176141223404, + "learning_rate": 3.307912930997747e-05, + "loss": 0.8901, + "step": 23941 + }, + { + "epoch": 5.1070819112627985, + "grad_norm": 0.16266136794350466, + "learning_rate": 3.307223716458006e-05, + "loss": 0.8907, + "step": 23942 + }, + { + "epoch": 5.107295221843003, + "grad_norm": 0.1242490657129712, + "learning_rate": 3.3065345550171054e-05, + "loss": 0.8965, + "step": 23943 + }, + { + "epoch": 5.107508532423208, + "grad_norm": 0.15326537326882742, + "learning_rate": 3.305845446682846e-05, + "loss": 0.898, + "step": 23944 + }, + { + "epoch": 5.107721843003413, + "grad_norm": 0.13903098688661755, + "learning_rate": 3.305156391463025e-05, + "loss": 0.89, + "step": 23945 + }, + { + "epoch": 5.107935153583618, + "grad_norm": 0.13576485330214494, + "learning_rate": 3.304467389365435e-05, + "loss": 0.8927, + "step": 23946 + }, + { + "epoch": 5.108148464163823, + "grad_norm": 0.1414502157443228, + "learning_rate": 3.3037784403978785e-05, + "loss": 0.9067, + "step": 23947 + }, + { + "epoch": 5.1083617747440275, + "grad_norm": 0.13936959674899116, + "learning_rate": 3.303089544568148e-05, + "loss": 0.8917, + "step": 23948 + }, + { + "epoch": 5.108575085324232, + "grad_norm": 0.1379397499513956, + "learning_rate": 3.3024007018840385e-05, + "loss": 0.9114, + "step": 23949 + }, + { + "epoch": 5.108788395904437, + "grad_norm": 0.14466414517200585, + "learning_rate": 3.301711912353341e-05, + "loss": 0.9126, + "step": 23950 + }, + { + "epoch": 5.109001706484642, + "grad_norm": 0.13254651987393715, + "learning_rate": 3.301023175983858e-05, + "loss": 0.8773, + "step": 23951 + }, + { + "epoch": 5.109215017064846, + "grad_norm": 0.13621527639586545, + "learning_rate": 3.300334492783374e-05, + "loss": 0.8727, + "step": 23952 + }, + { + "epoch": 5.109428327645051, + "grad_norm": 0.15645792796666408, + "learning_rate": 3.299645862759681e-05, + "loss": 0.8893, + "step": 23953 + }, + { + "epoch": 5.109641638225256, + "grad_norm": 0.12039949816697966, + "learning_rate": 3.298957285920577e-05, + "loss": 0.8941, + "step": 23954 + }, + { + "epoch": 5.10985494880546, + "grad_norm": 0.14683280110673966, + "learning_rate": 3.298268762273851e-05, + "loss": 0.9069, + "step": 23955 + }, + { + "epoch": 5.110068259385666, + "grad_norm": 0.13312485871237179, + "learning_rate": 3.297580291827292e-05, + "loss": 0.9051, + "step": 23956 + }, + { + "epoch": 5.110281569965871, + "grad_norm": 0.1470537187363124, + "learning_rate": 3.296891874588692e-05, + "loss": 0.8486, + "step": 23957 + }, + { + "epoch": 5.110494880546075, + "grad_norm": 0.12005315447078946, + "learning_rate": 3.2962035105658364e-05, + "loss": 0.9092, + "step": 23958 + }, + { + "epoch": 5.11070819112628, + "grad_norm": 0.18147603155792383, + "learning_rate": 3.29551519976652e-05, + "loss": 0.8927, + "step": 23959 + }, + { + "epoch": 5.110921501706485, + "grad_norm": 0.15899415150175936, + "learning_rate": 3.294826942198528e-05, + "loss": 0.8739, + "step": 23960 + }, + { + "epoch": 5.111134812286689, + "grad_norm": 0.14009772989451857, + "learning_rate": 3.2941387378696485e-05, + "loss": 0.9016, + "step": 23961 + }, + { + "epoch": 5.111348122866894, + "grad_norm": 0.18454906893698783, + "learning_rate": 3.293450586787669e-05, + "loss": 0.8976, + "step": 23962 + }, + { + "epoch": 5.111561433447099, + "grad_norm": 0.14212225004736592, + "learning_rate": 3.2927624889603744e-05, + "loss": 0.9035, + "step": 23963 + }, + { + "epoch": 5.111774744027303, + "grad_norm": 0.1616738849405548, + "learning_rate": 3.292074444395552e-05, + "loss": 0.9151, + "step": 23964 + }, + { + "epoch": 5.111988054607509, + "grad_norm": 0.17119212998708472, + "learning_rate": 3.291386453100983e-05, + "loss": 0.8764, + "step": 23965 + }, + { + "epoch": 5.112201365187714, + "grad_norm": 0.12239436618429195, + "learning_rate": 3.290698515084459e-05, + "loss": 0.9024, + "step": 23966 + }, + { + "epoch": 5.112414675767918, + "grad_norm": 0.14199462180367226, + "learning_rate": 3.29001063035376e-05, + "loss": 0.8842, + "step": 23967 + }, + { + "epoch": 5.112627986348123, + "grad_norm": 0.14456047917377307, + "learning_rate": 3.28932279891667e-05, + "loss": 0.8854, + "step": 23968 + }, + { + "epoch": 5.112841296928328, + "grad_norm": 0.13802823184659657, + "learning_rate": 3.288635020780972e-05, + "loss": 0.8782, + "step": 23969 + }, + { + "epoch": 5.113054607508532, + "grad_norm": 0.16100531706067422, + "learning_rate": 3.2879472959544455e-05, + "loss": 0.8515, + "step": 23970 + }, + { + "epoch": 5.113267918088737, + "grad_norm": 0.12373243722864723, + "learning_rate": 3.2872596244448765e-05, + "loss": 0.8978, + "step": 23971 + }, + { + "epoch": 5.113481228668942, + "grad_norm": 0.18961023127653132, + "learning_rate": 3.2865720062600445e-05, + "loss": 0.8802, + "step": 23972 + }, + { + "epoch": 5.113694539249146, + "grad_norm": 0.13728238612469842, + "learning_rate": 3.285884441407729e-05, + "loss": 0.8975, + "step": 23973 + }, + { + "epoch": 5.113907849829351, + "grad_norm": 0.1469013790538392, + "learning_rate": 3.285196929895712e-05, + "loss": 0.9006, + "step": 23974 + }, + { + "epoch": 5.114121160409557, + "grad_norm": 0.1513446539637641, + "learning_rate": 3.28450947173177e-05, + "loss": 0.9204, + "step": 23975 + }, + { + "epoch": 5.114334470989761, + "grad_norm": 0.11515339090900147, + "learning_rate": 3.283822066923682e-05, + "loss": 0.8837, + "step": 23976 + }, + { + "epoch": 5.114547781569966, + "grad_norm": 0.152592763108748, + "learning_rate": 3.283134715479225e-05, + "loss": 0.8874, + "step": 23977 + }, + { + "epoch": 5.114761092150171, + "grad_norm": 0.1306596560102368, + "learning_rate": 3.282447417406181e-05, + "loss": 0.9078, + "step": 23978 + }, + { + "epoch": 5.114974402730375, + "grad_norm": 0.12606549459797814, + "learning_rate": 3.281760172712322e-05, + "loss": 0.8852, + "step": 23979 + }, + { + "epoch": 5.11518771331058, + "grad_norm": 0.13648918529873438, + "learning_rate": 3.2810729814054275e-05, + "loss": 0.9086, + "step": 23980 + }, + { + "epoch": 5.115401023890785, + "grad_norm": 0.11896367137505735, + "learning_rate": 3.280385843493271e-05, + "loss": 0.9079, + "step": 23981 + }, + { + "epoch": 5.1156143344709895, + "grad_norm": 0.1429452405286986, + "learning_rate": 3.2796987589836254e-05, + "loss": 0.8646, + "step": 23982 + }, + { + "epoch": 5.115827645051194, + "grad_norm": 0.12037543553246798, + "learning_rate": 3.279011727884271e-05, + "loss": 0.8982, + "step": 23983 + }, + { + "epoch": 5.1160409556314, + "grad_norm": 0.14834073173012105, + "learning_rate": 3.2783247502029804e-05, + "loss": 0.907, + "step": 23984 + }, + { + "epoch": 5.116254266211604, + "grad_norm": 0.14046565411468004, + "learning_rate": 3.27763782594752e-05, + "loss": 0.8717, + "step": 23985 + }, + { + "epoch": 5.116467576791809, + "grad_norm": 0.13773132778007044, + "learning_rate": 3.276950955125669e-05, + "loss": 0.886, + "step": 23986 + }, + { + "epoch": 5.116680887372014, + "grad_norm": 0.16942973512182172, + "learning_rate": 3.276264137745197e-05, + "loss": 0.8813, + "step": 23987 + }, + { + "epoch": 5.1168941979522184, + "grad_norm": 0.14547169626692327, + "learning_rate": 3.275577373813877e-05, + "loss": 0.8599, + "step": 23988 + }, + { + "epoch": 5.117107508532423, + "grad_norm": 0.16820944694740236, + "learning_rate": 3.2748906633394755e-05, + "loss": 0.9152, + "step": 23989 + }, + { + "epoch": 5.117320819112628, + "grad_norm": 0.1609573765451343, + "learning_rate": 3.2742040063297676e-05, + "loss": 0.9034, + "step": 23990 + }, + { + "epoch": 5.1175341296928325, + "grad_norm": 0.16095228802024908, + "learning_rate": 3.273517402792521e-05, + "loss": 0.8836, + "step": 23991 + }, + { + "epoch": 5.117747440273037, + "grad_norm": 0.14865798011440798, + "learning_rate": 3.272830852735505e-05, + "loss": 0.8758, + "step": 23992 + }, + { + "epoch": 5.117960750853243, + "grad_norm": 0.1547394072665976, + "learning_rate": 3.272144356166487e-05, + "loss": 0.887, + "step": 23993 + }, + { + "epoch": 5.118174061433447, + "grad_norm": 0.18153177777629814, + "learning_rate": 3.271457913093232e-05, + "loss": 0.8771, + "step": 23994 + }, + { + "epoch": 5.118387372013652, + "grad_norm": 0.14422624044197566, + "learning_rate": 3.270771523523517e-05, + "loss": 0.8628, + "step": 23995 + }, + { + "epoch": 5.118600682593857, + "grad_norm": 0.12944792592349164, + "learning_rate": 3.270085187465098e-05, + "loss": 0.8735, + "step": 23996 + }, + { + "epoch": 5.1188139931740615, + "grad_norm": 0.12272154558295137, + "learning_rate": 3.2693989049257425e-05, + "loss": 0.8845, + "step": 23997 + }, + { + "epoch": 5.119027303754266, + "grad_norm": 0.1322059161763732, + "learning_rate": 3.2687126759132197e-05, + "loss": 0.8946, + "step": 23998 + }, + { + "epoch": 5.119240614334471, + "grad_norm": 0.12845994030495514, + "learning_rate": 3.268026500435293e-05, + "loss": 0.8977, + "step": 23999 + }, + { + "epoch": 5.1194539249146755, + "grad_norm": 0.12775425579938923, + "learning_rate": 3.267340378499727e-05, + "loss": 0.9084, + "step": 24000 + }, + { + "epoch": 5.11966723549488, + "grad_norm": 0.1297023311170608, + "learning_rate": 3.26665431011428e-05, + "loss": 0.899, + "step": 24001 + }, + { + "epoch": 5.119880546075085, + "grad_norm": 0.1279987254965824, + "learning_rate": 3.265968295286723e-05, + "loss": 0.8772, + "step": 24002 + }, + { + "epoch": 5.1200938566552905, + "grad_norm": 0.15453334712139344, + "learning_rate": 3.2652823340248144e-05, + "loss": 0.873, + "step": 24003 + }, + { + "epoch": 5.120307167235495, + "grad_norm": 0.11886671781638725, + "learning_rate": 3.264596426336315e-05, + "loss": 0.8872, + "step": 24004 + }, + { + "epoch": 5.1205204778157, + "grad_norm": 0.15809659111934635, + "learning_rate": 3.2639105722289874e-05, + "loss": 0.8912, + "step": 24005 + }, + { + "epoch": 5.1207337883959045, + "grad_norm": 0.15640755831872258, + "learning_rate": 3.2632247717105883e-05, + "loss": 0.915, + "step": 24006 + }, + { + "epoch": 5.120947098976109, + "grad_norm": 0.12309094581720612, + "learning_rate": 3.262539024788886e-05, + "loss": 0.8942, + "step": 24007 + }, + { + "epoch": 5.121160409556314, + "grad_norm": 0.13167234928634897, + "learning_rate": 3.2618533314716314e-05, + "loss": 0.8824, + "step": 24008 + }, + { + "epoch": 5.121373720136519, + "grad_norm": 0.12366396976479446, + "learning_rate": 3.261167691766583e-05, + "loss": 0.8699, + "step": 24009 + }, + { + "epoch": 5.121587030716723, + "grad_norm": 0.1533949743429172, + "learning_rate": 3.260482105681506e-05, + "loss": 0.8598, + "step": 24010 + }, + { + "epoch": 5.121800341296928, + "grad_norm": 0.14866222334420492, + "learning_rate": 3.2597965732241524e-05, + "loss": 0.8922, + "step": 24011 + }, + { + "epoch": 5.1220136518771335, + "grad_norm": 0.1438869668144732, + "learning_rate": 3.259111094402281e-05, + "loss": 0.884, + "step": 24012 + }, + { + "epoch": 5.122226962457338, + "grad_norm": 0.1247242029891644, + "learning_rate": 3.258425669223645e-05, + "loss": 0.873, + "step": 24013 + }, + { + "epoch": 5.122440273037543, + "grad_norm": 0.12243186037926067, + "learning_rate": 3.257740297696003e-05, + "loss": 0.91, + "step": 24014 + }, + { + "epoch": 5.122653583617748, + "grad_norm": 0.15157524986071674, + "learning_rate": 3.2570549798271114e-05, + "loss": 0.8632, + "step": 24015 + }, + { + "epoch": 5.122866894197952, + "grad_norm": 0.12156693587512124, + "learning_rate": 3.256369715624722e-05, + "loss": 0.862, + "step": 24016 + }, + { + "epoch": 5.123080204778157, + "grad_norm": 0.15354220942170246, + "learning_rate": 3.255684505096589e-05, + "loss": 0.8849, + "step": 24017 + }, + { + "epoch": 5.123293515358362, + "grad_norm": 0.11717163666871391, + "learning_rate": 3.254999348250465e-05, + "loss": 0.8997, + "step": 24018 + }, + { + "epoch": 5.123506825938566, + "grad_norm": 0.16538466813770109, + "learning_rate": 3.2543142450941034e-05, + "loss": 0.8736, + "step": 24019 + }, + { + "epoch": 5.123720136518771, + "grad_norm": 0.1464660170820894, + "learning_rate": 3.253629195635257e-05, + "loss": 0.9003, + "step": 24020 + }, + { + "epoch": 5.123933447098976, + "grad_norm": 0.1521338695183554, + "learning_rate": 3.252944199881672e-05, + "loss": 0.9196, + "step": 24021 + }, + { + "epoch": 5.124146757679181, + "grad_norm": 0.16247517175536694, + "learning_rate": 3.252259257841107e-05, + "loss": 0.9155, + "step": 24022 + }, + { + "epoch": 5.124360068259386, + "grad_norm": 0.14713860642627125, + "learning_rate": 3.251574369521307e-05, + "loss": 0.8793, + "step": 24023 + }, + { + "epoch": 5.124573378839591, + "grad_norm": 0.1310694759039312, + "learning_rate": 3.250889534930024e-05, + "loss": 0.8788, + "step": 24024 + }, + { + "epoch": 5.124786689419795, + "grad_norm": 0.14448012453565878, + "learning_rate": 3.2502047540750025e-05, + "loss": 0.8915, + "step": 24025 + }, + { + "epoch": 5.125, + "grad_norm": 0.15181255605440835, + "learning_rate": 3.249520026963997e-05, + "loss": 0.8746, + "step": 24026 + }, + { + "epoch": 5.125213310580205, + "grad_norm": 0.1291210395776349, + "learning_rate": 3.248835353604753e-05, + "loss": 0.9154, + "step": 24027 + }, + { + "epoch": 5.125426621160409, + "grad_norm": 0.11662544880365842, + "learning_rate": 3.248150734005016e-05, + "loss": 0.902, + "step": 24028 + }, + { + "epoch": 5.125639931740614, + "grad_norm": 0.13668809108751384, + "learning_rate": 3.247466168172533e-05, + "loss": 0.8791, + "step": 24029 + }, + { + "epoch": 5.125853242320819, + "grad_norm": 0.13226572061384936, + "learning_rate": 3.2467816561150517e-05, + "loss": 0.8604, + "step": 24030 + }, + { + "epoch": 5.126066552901024, + "grad_norm": 0.127702747255411, + "learning_rate": 3.246097197840315e-05, + "loss": 0.8886, + "step": 24031 + }, + { + "epoch": 5.126279863481229, + "grad_norm": 0.14332104709964105, + "learning_rate": 3.245412793356069e-05, + "loss": 0.8872, + "step": 24032 + }, + { + "epoch": 5.126493174061434, + "grad_norm": 0.15245295055060645, + "learning_rate": 3.244728442670054e-05, + "loss": 0.887, + "step": 24033 + }, + { + "epoch": 5.126706484641638, + "grad_norm": 0.11426666074015501, + "learning_rate": 3.244044145790019e-05, + "loss": 0.8718, + "step": 24034 + }, + { + "epoch": 5.126919795221843, + "grad_norm": 0.14405854028419818, + "learning_rate": 3.243359902723705e-05, + "loss": 0.9177, + "step": 24035 + }, + { + "epoch": 5.127133105802048, + "grad_norm": 0.15804396184687877, + "learning_rate": 3.242675713478853e-05, + "loss": 0.8623, + "step": 24036 + }, + { + "epoch": 5.127346416382252, + "grad_norm": 0.12366981882532753, + "learning_rate": 3.241991578063204e-05, + "loss": 0.8865, + "step": 24037 + }, + { + "epoch": 5.127559726962457, + "grad_norm": 0.13897781302136555, + "learning_rate": 3.2413074964845024e-05, + "loss": 0.8842, + "step": 24038 + }, + { + "epoch": 5.127773037542662, + "grad_norm": 0.14022802108108035, + "learning_rate": 3.2406234687504864e-05, + "loss": 0.8642, + "step": 24039 + }, + { + "epoch": 5.1279863481228665, + "grad_norm": 0.14721507066228487, + "learning_rate": 3.239939494868899e-05, + "loss": 0.8907, + "step": 24040 + }, + { + "epoch": 5.128199658703072, + "grad_norm": 0.15939202790536744, + "learning_rate": 3.23925557484747e-05, + "loss": 0.9078, + "step": 24041 + }, + { + "epoch": 5.128412969283277, + "grad_norm": 0.13083799605242621, + "learning_rate": 3.238571708693948e-05, + "loss": 0.8794, + "step": 24042 + }, + { + "epoch": 5.128626279863481, + "grad_norm": 0.12111796220333432, + "learning_rate": 3.237887896416066e-05, + "loss": 0.8673, + "step": 24043 + }, + { + "epoch": 5.128839590443686, + "grad_norm": 0.14355165316621007, + "learning_rate": 3.2372041380215634e-05, + "loss": 0.8879, + "step": 24044 + }, + { + "epoch": 5.129052901023891, + "grad_norm": 0.13718275852587652, + "learning_rate": 3.236520433518173e-05, + "loss": 0.8563, + "step": 24045 + }, + { + "epoch": 5.1292662116040955, + "grad_norm": 0.13596044490766984, + "learning_rate": 3.235836782913637e-05, + "loss": 0.8743, + "step": 24046 + }, + { + "epoch": 5.1294795221843, + "grad_norm": 0.14298800499338762, + "learning_rate": 3.235153186215688e-05, + "loss": 0.8702, + "step": 24047 + }, + { + "epoch": 5.129692832764505, + "grad_norm": 0.1386369072556237, + "learning_rate": 3.234469643432061e-05, + "loss": 0.8969, + "step": 24048 + }, + { + "epoch": 5.1299061433447095, + "grad_norm": 0.14437109709308413, + "learning_rate": 3.233786154570487e-05, + "loss": 0.8875, + "step": 24049 + }, + { + "epoch": 5.130119453924915, + "grad_norm": 0.1290156014453787, + "learning_rate": 3.233102719638705e-05, + "loss": 0.8836, + "step": 24050 + }, + { + "epoch": 5.13033276450512, + "grad_norm": 0.12432014483856356, + "learning_rate": 3.2324193386444497e-05, + "loss": 0.8807, + "step": 24051 + }, + { + "epoch": 5.1305460750853245, + "grad_norm": 0.1299883177643243, + "learning_rate": 3.2317360115954474e-05, + "loss": 0.9036, + "step": 24052 + }, + { + "epoch": 5.130759385665529, + "grad_norm": 0.1579193925638168, + "learning_rate": 3.231052738499428e-05, + "loss": 0.8782, + "step": 24053 + }, + { + "epoch": 5.130972696245734, + "grad_norm": 0.12044193755895165, + "learning_rate": 3.2303695193641313e-05, + "loss": 0.899, + "step": 24054 + }, + { + "epoch": 5.1311860068259385, + "grad_norm": 0.13760491039716502, + "learning_rate": 3.229686354197282e-05, + "loss": 0.8822, + "step": 24055 + }, + { + "epoch": 5.131399317406143, + "grad_norm": 0.12549520565358674, + "learning_rate": 3.229003243006614e-05, + "loss": 0.8759, + "step": 24056 + }, + { + "epoch": 5.131612627986348, + "grad_norm": 0.13522868027316895, + "learning_rate": 3.2283201857998506e-05, + "loss": 0.9021, + "step": 24057 + }, + { + "epoch": 5.131825938566553, + "grad_norm": 0.14231209124982558, + "learning_rate": 3.2276371825847266e-05, + "loss": 0.8926, + "step": 24058 + }, + { + "epoch": 5.132039249146757, + "grad_norm": 0.15268021422537542, + "learning_rate": 3.22695423336897e-05, + "loss": 0.862, + "step": 24059 + }, + { + "epoch": 5.132252559726963, + "grad_norm": 0.1305401529786756, + "learning_rate": 3.226271338160305e-05, + "loss": 0.9021, + "step": 24060 + }, + { + "epoch": 5.1324658703071675, + "grad_norm": 0.1423941684192291, + "learning_rate": 3.2255884969664575e-05, + "loss": 0.904, + "step": 24061 + }, + { + "epoch": 5.132679180887372, + "grad_norm": 0.14487239827432988, + "learning_rate": 3.2249057097951607e-05, + "loss": 0.9003, + "step": 24062 + }, + { + "epoch": 5.132892491467577, + "grad_norm": 0.15408372354986025, + "learning_rate": 3.224222976654138e-05, + "loss": 0.8962, + "step": 24063 + }, + { + "epoch": 5.1331058020477816, + "grad_norm": 0.1420313829720018, + "learning_rate": 3.223540297551109e-05, + "loss": 0.8859, + "step": 24064 + }, + { + "epoch": 5.133319112627986, + "grad_norm": 0.13652120231416662, + "learning_rate": 3.2228576724938e-05, + "loss": 0.8892, + "step": 24065 + }, + { + "epoch": 5.133532423208191, + "grad_norm": 0.14177189166040344, + "learning_rate": 3.2221751014899394e-05, + "loss": 0.9009, + "step": 24066 + }, + { + "epoch": 5.133745733788396, + "grad_norm": 0.1259459216417382, + "learning_rate": 3.221492584547248e-05, + "loss": 0.8693, + "step": 24067 + }, + { + "epoch": 5.1339590443686, + "grad_norm": 0.13143347392552837, + "learning_rate": 3.220810121673448e-05, + "loss": 0.8849, + "step": 24068 + }, + { + "epoch": 5.134172354948806, + "grad_norm": 0.1252250202686958, + "learning_rate": 3.220127712876259e-05, + "loss": 0.8637, + "step": 24069 + }, + { + "epoch": 5.1343856655290105, + "grad_norm": 0.14535458469530863, + "learning_rate": 3.219445358163408e-05, + "loss": 0.9373, + "step": 24070 + }, + { + "epoch": 5.134598976109215, + "grad_norm": 0.13872134004316988, + "learning_rate": 3.218763057542614e-05, + "loss": 0.9057, + "step": 24071 + }, + { + "epoch": 5.13481228668942, + "grad_norm": 0.12576755382013607, + "learning_rate": 3.218080811021596e-05, + "loss": 0.8813, + "step": 24072 + }, + { + "epoch": 5.135025597269625, + "grad_norm": 0.14878411072027223, + "learning_rate": 3.217398618608071e-05, + "loss": 0.8738, + "step": 24073 + }, + { + "epoch": 5.135238907849829, + "grad_norm": 0.12500012822608036, + "learning_rate": 3.2167164803097674e-05, + "loss": 0.8848, + "step": 24074 + }, + { + "epoch": 5.135452218430034, + "grad_norm": 0.168811057625122, + "learning_rate": 3.2160343961343935e-05, + "loss": 0.8934, + "step": 24075 + }, + { + "epoch": 5.135665529010239, + "grad_norm": 0.11909086779035483, + "learning_rate": 3.215352366089672e-05, + "loss": 0.8928, + "step": 24076 + }, + { + "epoch": 5.135878839590443, + "grad_norm": 0.19723652105592912, + "learning_rate": 3.214670390183315e-05, + "loss": 0.9089, + "step": 24077 + }, + { + "epoch": 5.136092150170649, + "grad_norm": 0.11941914713404118, + "learning_rate": 3.213988468423047e-05, + "loss": 0.8761, + "step": 24078 + }, + { + "epoch": 5.136305460750854, + "grad_norm": 0.16044369763311755, + "learning_rate": 3.213306600816579e-05, + "loss": 0.8748, + "step": 24079 + }, + { + "epoch": 5.136518771331058, + "grad_norm": 0.1285267756743851, + "learning_rate": 3.2126247873716273e-05, + "loss": 0.8747, + "step": 24080 + }, + { + "epoch": 5.136732081911263, + "grad_norm": 0.14245715955939495, + "learning_rate": 3.2119430280959045e-05, + "loss": 0.8818, + "step": 24081 + }, + { + "epoch": 5.136945392491468, + "grad_norm": 0.1538493902524147, + "learning_rate": 3.211261322997129e-05, + "loss": 0.9095, + "step": 24082 + }, + { + "epoch": 5.137158703071672, + "grad_norm": 0.11895650625135785, + "learning_rate": 3.210579672083013e-05, + "loss": 0.8844, + "step": 24083 + }, + { + "epoch": 5.137372013651877, + "grad_norm": 0.15018809548263917, + "learning_rate": 3.2098980753612685e-05, + "loss": 0.8799, + "step": 24084 + }, + { + "epoch": 5.137585324232082, + "grad_norm": 0.13383101846895548, + "learning_rate": 3.209216532839607e-05, + "loss": 0.8612, + "step": 24085 + }, + { + "epoch": 5.137798634812286, + "grad_norm": 0.14432650530852778, + "learning_rate": 3.208535044525742e-05, + "loss": 0.9031, + "step": 24086 + }, + { + "epoch": 5.138011945392491, + "grad_norm": 0.15432227242446886, + "learning_rate": 3.207853610427383e-05, + "loss": 0.9096, + "step": 24087 + }, + { + "epoch": 5.138225255972697, + "grad_norm": 0.14260140530791457, + "learning_rate": 3.2071722305522406e-05, + "loss": 0.87, + "step": 24088 + }, + { + "epoch": 5.138438566552901, + "grad_norm": 0.16952749531068914, + "learning_rate": 3.206490904908023e-05, + "loss": 0.8748, + "step": 24089 + }, + { + "epoch": 5.138651877133106, + "grad_norm": 0.11999714905825964, + "learning_rate": 3.205809633502444e-05, + "loss": 0.8486, + "step": 24090 + }, + { + "epoch": 5.138865187713311, + "grad_norm": 0.15322758699200809, + "learning_rate": 3.205128416343209e-05, + "loss": 0.8817, + "step": 24091 + }, + { + "epoch": 5.139078498293515, + "grad_norm": 0.13202986182915596, + "learning_rate": 3.2044472534380284e-05, + "loss": 0.8718, + "step": 24092 + }, + { + "epoch": 5.13929180887372, + "grad_norm": 0.14829776456512753, + "learning_rate": 3.203766144794603e-05, + "loss": 0.887, + "step": 24093 + }, + { + "epoch": 5.139505119453925, + "grad_norm": 0.17810748374002563, + "learning_rate": 3.203085090420648e-05, + "loss": 0.9037, + "step": 24094 + }, + { + "epoch": 5.139718430034129, + "grad_norm": 0.12362242166883212, + "learning_rate": 3.2024040903238665e-05, + "loss": 0.8786, + "step": 24095 + }, + { + "epoch": 5.139931740614334, + "grad_norm": 0.18608187518585198, + "learning_rate": 3.201723144511962e-05, + "loss": 0.8749, + "step": 24096 + }, + { + "epoch": 5.140145051194539, + "grad_norm": 0.16718898849569455, + "learning_rate": 3.201042252992641e-05, + "loss": 0.8926, + "step": 24097 + }, + { + "epoch": 5.140358361774744, + "grad_norm": 0.13622232425818115, + "learning_rate": 3.2003614157736086e-05, + "loss": 0.8946, + "step": 24098 + }, + { + "epoch": 5.140571672354949, + "grad_norm": 0.1884655804313836, + "learning_rate": 3.199680632862568e-05, + "loss": 0.9015, + "step": 24099 + }, + { + "epoch": 5.140784982935154, + "grad_norm": 0.11640941444332746, + "learning_rate": 3.19899990426722e-05, + "loss": 0.8914, + "step": 24100 + }, + { + "epoch": 5.140998293515358, + "grad_norm": 0.17080651055218316, + "learning_rate": 3.198319229995267e-05, + "loss": 0.8947, + "step": 24101 + }, + { + "epoch": 5.141211604095563, + "grad_norm": 0.13190908100081503, + "learning_rate": 3.1976386100544155e-05, + "loss": 0.8778, + "step": 24102 + }, + { + "epoch": 5.141424914675768, + "grad_norm": 0.155452254166424, + "learning_rate": 3.196958044452364e-05, + "loss": 0.862, + "step": 24103 + }, + { + "epoch": 5.1416382252559725, + "grad_norm": 0.14551883141432706, + "learning_rate": 3.196277533196813e-05, + "loss": 0.8972, + "step": 24104 + }, + { + "epoch": 5.141851535836177, + "grad_norm": 0.12427732996995937, + "learning_rate": 3.1955970762954595e-05, + "loss": 0.8798, + "step": 24105 + }, + { + "epoch": 5.142064846416382, + "grad_norm": 0.14476893554115186, + "learning_rate": 3.194916673756009e-05, + "loss": 0.8871, + "step": 24106 + }, + { + "epoch": 5.142278156996587, + "grad_norm": 0.12225168689376217, + "learning_rate": 3.1942363255861596e-05, + "loss": 0.893, + "step": 24107 + }, + { + "epoch": 5.142491467576792, + "grad_norm": 0.1402349447053247, + "learning_rate": 3.1935560317936016e-05, + "loss": 0.8866, + "step": 24108 + }, + { + "epoch": 5.142704778156997, + "grad_norm": 0.13859808035967947, + "learning_rate": 3.192875792386041e-05, + "loss": 0.8913, + "step": 24109 + }, + { + "epoch": 5.1429180887372015, + "grad_norm": 0.15591300946589665, + "learning_rate": 3.192195607371171e-05, + "loss": 0.9044, + "step": 24110 + }, + { + "epoch": 5.143131399317406, + "grad_norm": 0.14828705913612422, + "learning_rate": 3.191515476756689e-05, + "loss": 0.8654, + "step": 24111 + }, + { + "epoch": 5.143344709897611, + "grad_norm": 0.1359040941404091, + "learning_rate": 3.19083540055029e-05, + "loss": 0.9089, + "step": 24112 + }, + { + "epoch": 5.1435580204778155, + "grad_norm": 0.17402007825163912, + "learning_rate": 3.190155378759666e-05, + "loss": 0.895, + "step": 24113 + }, + { + "epoch": 5.14377133105802, + "grad_norm": 0.12132578535211687, + "learning_rate": 3.189475411392519e-05, + "loss": 0.8744, + "step": 24114 + }, + { + "epoch": 5.143984641638225, + "grad_norm": 0.18418451553479764, + "learning_rate": 3.1887954984565387e-05, + "loss": 0.8719, + "step": 24115 + }, + { + "epoch": 5.1441979522184305, + "grad_norm": 0.12455617935086635, + "learning_rate": 3.188115639959417e-05, + "loss": 0.8834, + "step": 24116 + }, + { + "epoch": 5.144411262798635, + "grad_norm": 0.17927375060636466, + "learning_rate": 3.187435835908845e-05, + "loss": 0.8975, + "step": 24117 + }, + { + "epoch": 5.14462457337884, + "grad_norm": 0.12581585892178293, + "learning_rate": 3.186756086312521e-05, + "loss": 0.8682, + "step": 24118 + }, + { + "epoch": 5.1448378839590445, + "grad_norm": 0.2010975243814611, + "learning_rate": 3.1860763911781356e-05, + "loss": 0.8968, + "step": 24119 + }, + { + "epoch": 5.145051194539249, + "grad_norm": 0.12487022542980802, + "learning_rate": 3.1853967505133736e-05, + "loss": 0.9046, + "step": 24120 + }, + { + "epoch": 5.145264505119454, + "grad_norm": 0.1533862553308235, + "learning_rate": 3.184717164325925e-05, + "loss": 0.8547, + "step": 24121 + }, + { + "epoch": 5.145477815699659, + "grad_norm": 0.12925570308111126, + "learning_rate": 3.184037632623487e-05, + "loss": 0.8833, + "step": 24122 + }, + { + "epoch": 5.145691126279863, + "grad_norm": 0.12230539789430797, + "learning_rate": 3.1833581554137425e-05, + "loss": 0.8778, + "step": 24123 + }, + { + "epoch": 5.145904436860068, + "grad_norm": 0.12482974282512095, + "learning_rate": 3.182678732704383e-05, + "loss": 0.8989, + "step": 24124 + }, + { + "epoch": 5.146117747440273, + "grad_norm": 0.12117506797631264, + "learning_rate": 3.1819993645030915e-05, + "loss": 0.8998, + "step": 24125 + }, + { + "epoch": 5.146331058020478, + "grad_norm": 0.11592049874674683, + "learning_rate": 3.18132005081756e-05, + "loss": 0.8866, + "step": 24126 + }, + { + "epoch": 5.146544368600683, + "grad_norm": 0.1299968834408574, + "learning_rate": 3.180640791655474e-05, + "loss": 0.8817, + "step": 24127 + }, + { + "epoch": 5.146757679180888, + "grad_norm": 0.12403496392741255, + "learning_rate": 3.179961587024519e-05, + "loss": 0.8945, + "step": 24128 + }, + { + "epoch": 5.146970989761092, + "grad_norm": 0.11668040410216346, + "learning_rate": 3.1792824369323764e-05, + "loss": 0.8764, + "step": 24129 + }, + { + "epoch": 5.147184300341297, + "grad_norm": 0.13328851855385862, + "learning_rate": 3.178603341386739e-05, + "loss": 0.8812, + "step": 24130 + }, + { + "epoch": 5.147397610921502, + "grad_norm": 0.11050991218573326, + "learning_rate": 3.177924300395285e-05, + "loss": 0.8621, + "step": 24131 + }, + { + "epoch": 5.147610921501706, + "grad_norm": 0.1426542194521646, + "learning_rate": 3.1772453139656976e-05, + "loss": 0.8874, + "step": 24132 + }, + { + "epoch": 5.147824232081911, + "grad_norm": 0.12777413070955745, + "learning_rate": 3.1765663821056566e-05, + "loss": 0.8777, + "step": 24133 + }, + { + "epoch": 5.148037542662116, + "grad_norm": 0.1250985886021371, + "learning_rate": 3.175887504822853e-05, + "loss": 0.8978, + "step": 24134 + }, + { + "epoch": 5.148250853242321, + "grad_norm": 0.14599741980629133, + "learning_rate": 3.1752086821249616e-05, + "loss": 0.9003, + "step": 24135 + }, + { + "epoch": 5.148464163822526, + "grad_norm": 0.12901094758710713, + "learning_rate": 3.1745299140196665e-05, + "loss": 0.906, + "step": 24136 + }, + { + "epoch": 5.148677474402731, + "grad_norm": 0.14871088638939467, + "learning_rate": 3.1738512005146424e-05, + "loss": 0.8744, + "step": 24137 + }, + { + "epoch": 5.148890784982935, + "grad_norm": 0.14194447653122308, + "learning_rate": 3.173172541617577e-05, + "loss": 0.8913, + "step": 24138 + }, + { + "epoch": 5.14910409556314, + "grad_norm": 0.1281502151660106, + "learning_rate": 3.172493937336145e-05, + "loss": 0.9048, + "step": 24139 + }, + { + "epoch": 5.149317406143345, + "grad_norm": 0.11936127495844907, + "learning_rate": 3.171815387678026e-05, + "loss": 0.8641, + "step": 24140 + }, + { + "epoch": 5.149530716723549, + "grad_norm": 0.14173830719984265, + "learning_rate": 3.1711368926508966e-05, + "loss": 0.8834, + "step": 24141 + }, + { + "epoch": 5.149744027303754, + "grad_norm": 0.11237848689727441, + "learning_rate": 3.170458452262434e-05, + "loss": 0.8698, + "step": 24142 + }, + { + "epoch": 5.149957337883959, + "grad_norm": 0.1603510758103427, + "learning_rate": 3.169780066520316e-05, + "loss": 0.869, + "step": 24143 + }, + { + "epoch": 5.150170648464163, + "grad_norm": 0.12364151590891433, + "learning_rate": 3.1691017354322174e-05, + "loss": 0.8897, + "step": 24144 + }, + { + "epoch": 5.150383959044369, + "grad_norm": 0.137772059889616, + "learning_rate": 3.168423459005811e-05, + "loss": 0.8714, + "step": 24145 + }, + { + "epoch": 5.150597269624574, + "grad_norm": 0.13227682185770323, + "learning_rate": 3.167745237248777e-05, + "loss": 0.9019, + "step": 24146 + }, + { + "epoch": 5.150810580204778, + "grad_norm": 0.1498202856009756, + "learning_rate": 3.167067070168788e-05, + "loss": 0.8756, + "step": 24147 + }, + { + "epoch": 5.151023890784983, + "grad_norm": 0.1405170790872729, + "learning_rate": 3.166388957773516e-05, + "loss": 0.8977, + "step": 24148 + }, + { + "epoch": 5.151237201365188, + "grad_norm": 0.15834864276288063, + "learning_rate": 3.1657109000706314e-05, + "loss": 0.8725, + "step": 24149 + }, + { + "epoch": 5.151450511945392, + "grad_norm": 0.13257178854851323, + "learning_rate": 3.1650328970678115e-05, + "loss": 0.868, + "step": 24150 + }, + { + "epoch": 5.151663822525597, + "grad_norm": 0.15393619312831278, + "learning_rate": 3.164354948772726e-05, + "loss": 0.9015, + "step": 24151 + }, + { + "epoch": 5.151877133105802, + "grad_norm": 0.1610019762908324, + "learning_rate": 3.1636770551930456e-05, + "loss": 0.9009, + "step": 24152 + }, + { + "epoch": 5.152090443686006, + "grad_norm": 0.14062900335228953, + "learning_rate": 3.1629992163364404e-05, + "loss": 0.8825, + "step": 24153 + }, + { + "epoch": 5.152303754266212, + "grad_norm": 0.19222611659185604, + "learning_rate": 3.162321432210582e-05, + "loss": 0.916, + "step": 24154 + }, + { + "epoch": 5.152517064846417, + "grad_norm": 0.13884330938260447, + "learning_rate": 3.161643702823136e-05, + "loss": 0.8739, + "step": 24155 + }, + { + "epoch": 5.152730375426621, + "grad_norm": 0.15678669091986663, + "learning_rate": 3.1609660281817735e-05, + "loss": 0.882, + "step": 24156 + }, + { + "epoch": 5.152943686006826, + "grad_norm": 0.1727093987058403, + "learning_rate": 3.160288408294159e-05, + "loss": 0.8986, + "step": 24157 + }, + { + "epoch": 5.153156996587031, + "grad_norm": 0.13808332167310033, + "learning_rate": 3.159610843167966e-05, + "loss": 0.8699, + "step": 24158 + }, + { + "epoch": 5.153370307167235, + "grad_norm": 0.14761258256410514, + "learning_rate": 3.1589333328108576e-05, + "loss": 0.8905, + "step": 24159 + }, + { + "epoch": 5.15358361774744, + "grad_norm": 0.16779804985631255, + "learning_rate": 3.1582558772304993e-05, + "loss": 0.9092, + "step": 24160 + }, + { + "epoch": 5.153796928327645, + "grad_norm": 0.13838506496494046, + "learning_rate": 3.157578476434554e-05, + "loss": 0.923, + "step": 24161 + }, + { + "epoch": 5.1540102389078495, + "grad_norm": 0.2123117377391251, + "learning_rate": 3.156901130430693e-05, + "loss": 0.8856, + "step": 24162 + }, + { + "epoch": 5.154223549488055, + "grad_norm": 0.1438772624554876, + "learning_rate": 3.156223839226581e-05, + "loss": 0.9088, + "step": 24163 + }, + { + "epoch": 5.15443686006826, + "grad_norm": 0.1958596168795479, + "learning_rate": 3.155546602829871e-05, + "loss": 0.8678, + "step": 24164 + }, + { + "epoch": 5.154650170648464, + "grad_norm": 0.13771734487274476, + "learning_rate": 3.1548694212482364e-05, + "loss": 0.8803, + "step": 24165 + }, + { + "epoch": 5.154863481228669, + "grad_norm": 0.1836764999221876, + "learning_rate": 3.1541922944893344e-05, + "loss": 0.8749, + "step": 24166 + }, + { + "epoch": 5.155076791808874, + "grad_norm": 0.20523444370868368, + "learning_rate": 3.1535152225608294e-05, + "loss": 0.91, + "step": 24167 + }, + { + "epoch": 5.1552901023890785, + "grad_norm": 0.1442057676626875, + "learning_rate": 3.1528382054703805e-05, + "loss": 0.9307, + "step": 24168 + }, + { + "epoch": 5.155503412969283, + "grad_norm": 0.16930757734457957, + "learning_rate": 3.152161243225646e-05, + "loss": 0.9033, + "step": 24169 + }, + { + "epoch": 5.155716723549488, + "grad_norm": 0.15873580079804356, + "learning_rate": 3.1514843358342916e-05, + "loss": 0.8992, + "step": 24170 + }, + { + "epoch": 5.1559300341296925, + "grad_norm": 0.17538584302483293, + "learning_rate": 3.150807483303973e-05, + "loss": 0.8859, + "step": 24171 + }, + { + "epoch": 5.156143344709897, + "grad_norm": 0.1823000238346213, + "learning_rate": 3.15013068564235e-05, + "loss": 0.8962, + "step": 24172 + }, + { + "epoch": 5.156356655290103, + "grad_norm": 0.13622167135300517, + "learning_rate": 3.149453942857077e-05, + "loss": 0.8877, + "step": 24173 + }, + { + "epoch": 5.1565699658703075, + "grad_norm": 0.18066645421731392, + "learning_rate": 3.148777254955819e-05, + "loss": 0.8877, + "step": 24174 + }, + { + "epoch": 5.156783276450512, + "grad_norm": 0.1446302884034222, + "learning_rate": 3.148100621946227e-05, + "loss": 0.8901, + "step": 24175 + }, + { + "epoch": 5.156996587030717, + "grad_norm": 0.20386105781963987, + "learning_rate": 3.147424043835953e-05, + "loss": 0.9122, + "step": 24176 + }, + { + "epoch": 5.1572098976109215, + "grad_norm": 0.1747862506837845, + "learning_rate": 3.146747520632662e-05, + "loss": 0.8719, + "step": 24177 + }, + { + "epoch": 5.157423208191126, + "grad_norm": 0.15032425438687605, + "learning_rate": 3.146071052344003e-05, + "loss": 0.8803, + "step": 24178 + }, + { + "epoch": 5.157636518771331, + "grad_norm": 0.20251682789649575, + "learning_rate": 3.145394638977633e-05, + "loss": 0.8757, + "step": 24179 + }, + { + "epoch": 5.157849829351536, + "grad_norm": 0.12729861681013696, + "learning_rate": 3.144718280541204e-05, + "loss": 0.9086, + "step": 24180 + }, + { + "epoch": 5.15806313993174, + "grad_norm": 0.1798942674449956, + "learning_rate": 3.144041977042366e-05, + "loss": 0.8947, + "step": 24181 + }, + { + "epoch": 5.158276450511945, + "grad_norm": 0.1380340680117391, + "learning_rate": 3.143365728488778e-05, + "loss": 0.8797, + "step": 24182 + }, + { + "epoch": 5.1584897610921505, + "grad_norm": 0.15440385974407161, + "learning_rate": 3.1426895348880886e-05, + "loss": 0.891, + "step": 24183 + }, + { + "epoch": 5.158703071672355, + "grad_norm": 0.14391199366207083, + "learning_rate": 3.142013396247949e-05, + "loss": 0.9022, + "step": 24184 + }, + { + "epoch": 5.15891638225256, + "grad_norm": 0.13124754300875646, + "learning_rate": 3.141337312576005e-05, + "loss": 0.898, + "step": 24185 + }, + { + "epoch": 5.159129692832765, + "grad_norm": 0.12937547724525247, + "learning_rate": 3.140661283879919e-05, + "loss": 0.905, + "step": 24186 + }, + { + "epoch": 5.159343003412969, + "grad_norm": 0.13315498741152604, + "learning_rate": 3.139985310167328e-05, + "loss": 0.8774, + "step": 24187 + }, + { + "epoch": 5.159556313993174, + "grad_norm": 0.1351726933066459, + "learning_rate": 3.139309391445883e-05, + "loss": 0.8749, + "step": 24188 + }, + { + "epoch": 5.159769624573379, + "grad_norm": 0.12628367345914746, + "learning_rate": 3.138633527723236e-05, + "loss": 0.8576, + "step": 24189 + }, + { + "epoch": 5.159982935153583, + "grad_norm": 0.12333325030188635, + "learning_rate": 3.137957719007033e-05, + "loss": 0.8659, + "step": 24190 + }, + { + "epoch": 5.160196245733788, + "grad_norm": 0.14850951777119387, + "learning_rate": 3.137281965304919e-05, + "loss": 0.8674, + "step": 24191 + }, + { + "epoch": 5.160409556313994, + "grad_norm": 0.1196035267083625, + "learning_rate": 3.1366062666245434e-05, + "loss": 0.8939, + "step": 24192 + }, + { + "epoch": 5.160622866894198, + "grad_norm": 0.1534090693086093, + "learning_rate": 3.135930622973546e-05, + "loss": 0.8747, + "step": 24193 + }, + { + "epoch": 5.160836177474403, + "grad_norm": 0.1456703268796372, + "learning_rate": 3.1352550343595776e-05, + "loss": 0.9145, + "step": 24194 + }, + { + "epoch": 5.161049488054608, + "grad_norm": 0.15723744670791423, + "learning_rate": 3.134579500790281e-05, + "loss": 0.8783, + "step": 24195 + }, + { + "epoch": 5.161262798634812, + "grad_norm": 0.12856082739947805, + "learning_rate": 3.1339040222732994e-05, + "loss": 0.8741, + "step": 24196 + }, + { + "epoch": 5.161476109215017, + "grad_norm": 0.1518146389381079, + "learning_rate": 3.133228598816275e-05, + "loss": 0.88, + "step": 24197 + }, + { + "epoch": 5.161689419795222, + "grad_norm": 0.12782697008268656, + "learning_rate": 3.132553230426851e-05, + "loss": 0.909, + "step": 24198 + }, + { + "epoch": 5.161902730375426, + "grad_norm": 0.16360025749081952, + "learning_rate": 3.131877917112669e-05, + "loss": 0.8923, + "step": 24199 + }, + { + "epoch": 5.162116040955631, + "grad_norm": 0.13275560515361529, + "learning_rate": 3.131202658881368e-05, + "loss": 0.863, + "step": 24200 + }, + { + "epoch": 5.162329351535837, + "grad_norm": 0.13886890832734178, + "learning_rate": 3.130527455740593e-05, + "loss": 0.8672, + "step": 24201 + }, + { + "epoch": 5.162542662116041, + "grad_norm": 0.12623153885187452, + "learning_rate": 3.129852307697982e-05, + "loss": 0.8679, + "step": 24202 + }, + { + "epoch": 5.162755972696246, + "grad_norm": 0.1429490260751375, + "learning_rate": 3.129177214761174e-05, + "loss": 0.8729, + "step": 24203 + }, + { + "epoch": 5.162969283276451, + "grad_norm": 0.1334219200362471, + "learning_rate": 3.128502176937807e-05, + "loss": 0.8878, + "step": 24204 + }, + { + "epoch": 5.163182593856655, + "grad_norm": 0.14216591061164646, + "learning_rate": 3.1278271942355185e-05, + "loss": 0.8849, + "step": 24205 + }, + { + "epoch": 5.16339590443686, + "grad_norm": 0.14545173295774572, + "learning_rate": 3.1271522666619484e-05, + "loss": 0.9103, + "step": 24206 + }, + { + "epoch": 5.163609215017065, + "grad_norm": 0.14169998535224296, + "learning_rate": 3.126477394224734e-05, + "loss": 0.8986, + "step": 24207 + }, + { + "epoch": 5.163822525597269, + "grad_norm": 0.16056859548489522, + "learning_rate": 3.125802576931509e-05, + "loss": 0.8979, + "step": 24208 + }, + { + "epoch": 5.164035836177474, + "grad_norm": 0.16079013026675282, + "learning_rate": 3.12512781478991e-05, + "loss": 0.8907, + "step": 24209 + }, + { + "epoch": 5.164249146757679, + "grad_norm": 0.1303084224848938, + "learning_rate": 3.1244531078075716e-05, + "loss": 0.8724, + "step": 24210 + }, + { + "epoch": 5.164462457337884, + "grad_norm": 0.13207176184498015, + "learning_rate": 3.123778455992129e-05, + "loss": 0.885, + "step": 24211 + }, + { + "epoch": 5.164675767918089, + "grad_norm": 0.14862757124098333, + "learning_rate": 3.1231038593512114e-05, + "loss": 0.8872, + "step": 24212 + }, + { + "epoch": 5.164889078498294, + "grad_norm": 0.14556833866861354, + "learning_rate": 3.122429317892459e-05, + "loss": 0.9014, + "step": 24213 + }, + { + "epoch": 5.165102389078498, + "grad_norm": 0.13679953942636944, + "learning_rate": 3.1217548316235e-05, + "loss": 0.9013, + "step": 24214 + }, + { + "epoch": 5.165315699658703, + "grad_norm": 0.13645536602179756, + "learning_rate": 3.121080400551968e-05, + "loss": 0.9129, + "step": 24215 + }, + { + "epoch": 5.165529010238908, + "grad_norm": 0.1297343077007368, + "learning_rate": 3.120406024685494e-05, + "loss": 0.8596, + "step": 24216 + }, + { + "epoch": 5.165742320819112, + "grad_norm": 0.1212570481973491, + "learning_rate": 3.119731704031704e-05, + "loss": 0.8709, + "step": 24217 + }, + { + "epoch": 5.165955631399317, + "grad_norm": 0.16156506995186784, + "learning_rate": 3.119057438598235e-05, + "loss": 0.9058, + "step": 24218 + }, + { + "epoch": 5.166168941979522, + "grad_norm": 0.12204083225612215, + "learning_rate": 3.1183832283927164e-05, + "loss": 0.8822, + "step": 24219 + }, + { + "epoch": 5.166382252559727, + "grad_norm": 0.16630961400408778, + "learning_rate": 3.117709073422767e-05, + "loss": 0.8816, + "step": 24220 + }, + { + "epoch": 5.166595563139932, + "grad_norm": 0.12045111801838, + "learning_rate": 3.1170349736960246e-05, + "loss": 0.8895, + "step": 24221 + }, + { + "epoch": 5.166808873720137, + "grad_norm": 0.16265054197625362, + "learning_rate": 3.1163609292201144e-05, + "loss": 0.8945, + "step": 24222 + }, + { + "epoch": 5.167022184300341, + "grad_norm": 0.11875539202533916, + "learning_rate": 3.115686940002662e-05, + "loss": 0.8665, + "step": 24223 + }, + { + "epoch": 5.167235494880546, + "grad_norm": 0.21678356554229264, + "learning_rate": 3.1150130060512915e-05, + "loss": 0.8943, + "step": 24224 + }, + { + "epoch": 5.167448805460751, + "grad_norm": 0.12226590854504404, + "learning_rate": 3.114339127373634e-05, + "loss": 0.8625, + "step": 24225 + }, + { + "epoch": 5.1676621160409555, + "grad_norm": 0.1459148934043646, + "learning_rate": 3.1136653039773104e-05, + "loss": 0.8743, + "step": 24226 + }, + { + "epoch": 5.16787542662116, + "grad_norm": 0.1522467398729023, + "learning_rate": 3.112991535869947e-05, + "loss": 0.895, + "step": 24227 + }, + { + "epoch": 5.168088737201365, + "grad_norm": 0.13031504811207165, + "learning_rate": 3.112317823059166e-05, + "loss": 0.8854, + "step": 24228 + }, + { + "epoch": 5.1683020477815695, + "grad_norm": 0.15522913557527027, + "learning_rate": 3.111644165552589e-05, + "loss": 0.9047, + "step": 24229 + }, + { + "epoch": 5.168515358361775, + "grad_norm": 0.12613806720578738, + "learning_rate": 3.1109705633578454e-05, + "loss": 0.9092, + "step": 24230 + }, + { + "epoch": 5.16872866894198, + "grad_norm": 0.14159906492605448, + "learning_rate": 3.11029701648255e-05, + "loss": 0.8658, + "step": 24231 + }, + { + "epoch": 5.1689419795221845, + "grad_norm": 0.1256855737178766, + "learning_rate": 3.109623524934322e-05, + "loss": 0.9119, + "step": 24232 + }, + { + "epoch": 5.169155290102389, + "grad_norm": 0.12702119835699624, + "learning_rate": 3.1089500887207904e-05, + "loss": 0.883, + "step": 24233 + }, + { + "epoch": 5.169368600682594, + "grad_norm": 0.1281887547679004, + "learning_rate": 3.108276707849569e-05, + "loss": 0.8759, + "step": 24234 + }, + { + "epoch": 5.1695819112627985, + "grad_norm": 0.12679884307248462, + "learning_rate": 3.10760338232828e-05, + "loss": 0.9009, + "step": 24235 + }, + { + "epoch": 5.169795221843003, + "grad_norm": 0.14274281876936576, + "learning_rate": 3.10693011216454e-05, + "loss": 0.889, + "step": 24236 + }, + { + "epoch": 5.170008532423208, + "grad_norm": 0.12882633193020354, + "learning_rate": 3.1062568973659686e-05, + "loss": 0.8681, + "step": 24237 + }, + { + "epoch": 5.170221843003413, + "grad_norm": 0.1849457674673202, + "learning_rate": 3.1055837379401835e-05, + "loss": 0.8773, + "step": 24238 + }, + { + "epoch": 5.170435153583618, + "grad_norm": 0.13628900700511512, + "learning_rate": 3.1049106338948e-05, + "loss": 0.9064, + "step": 24239 + }, + { + "epoch": 5.170648464163823, + "grad_norm": 0.15078003541332258, + "learning_rate": 3.104237585237435e-05, + "loss": 0.8944, + "step": 24240 + }, + { + "epoch": 5.1708617747440275, + "grad_norm": 0.11910122025127348, + "learning_rate": 3.103564591975702e-05, + "loss": 0.8851, + "step": 24241 + }, + { + "epoch": 5.171075085324232, + "grad_norm": 0.16131880602156287, + "learning_rate": 3.102891654117223e-05, + "loss": 0.9086, + "step": 24242 + }, + { + "epoch": 5.171288395904437, + "grad_norm": 0.1342375847099497, + "learning_rate": 3.102218771669606e-05, + "loss": 0.8819, + "step": 24243 + }, + { + "epoch": 5.171501706484642, + "grad_norm": 0.12180308500502571, + "learning_rate": 3.101545944640462e-05, + "loss": 0.8572, + "step": 24244 + }, + { + "epoch": 5.171715017064846, + "grad_norm": 0.12797192862933643, + "learning_rate": 3.1008731730374106e-05, + "loss": 0.9116, + "step": 24245 + }, + { + "epoch": 5.171928327645051, + "grad_norm": 0.13935833118332036, + "learning_rate": 3.100200456868062e-05, + "loss": 0.9221, + "step": 24246 + }, + { + "epoch": 5.172141638225256, + "grad_norm": 0.1440129691283257, + "learning_rate": 3.099527796140027e-05, + "loss": 0.9047, + "step": 24247 + }, + { + "epoch": 5.172354948805461, + "grad_norm": 0.12026209392098897, + "learning_rate": 3.098855190860916e-05, + "loss": 0.8762, + "step": 24248 + }, + { + "epoch": 5.172568259385666, + "grad_norm": 0.12474749512557053, + "learning_rate": 3.098182641038343e-05, + "loss": 0.8846, + "step": 24249 + }, + { + "epoch": 5.172781569965871, + "grad_norm": 0.12832642530654256, + "learning_rate": 3.0975101466799156e-05, + "loss": 0.8978, + "step": 24250 + }, + { + "epoch": 5.172994880546075, + "grad_norm": 0.11898268874596579, + "learning_rate": 3.0968377077932435e-05, + "loss": 0.8714, + "step": 24251 + }, + { + "epoch": 5.17320819112628, + "grad_norm": 0.1280445493775943, + "learning_rate": 3.096165324385935e-05, + "loss": 0.8831, + "step": 24252 + }, + { + "epoch": 5.173421501706485, + "grad_norm": 0.13468513546111205, + "learning_rate": 3.095492996465599e-05, + "loss": 0.8641, + "step": 24253 + }, + { + "epoch": 5.173634812286689, + "grad_norm": 0.1327791657167678, + "learning_rate": 3.094820724039842e-05, + "loss": 0.8784, + "step": 24254 + }, + { + "epoch": 5.173848122866894, + "grad_norm": 0.1283789474661391, + "learning_rate": 3.094148507116271e-05, + "loss": 0.8801, + "step": 24255 + }, + { + "epoch": 5.174061433447099, + "grad_norm": 0.12727359338059158, + "learning_rate": 3.093476345702489e-05, + "loss": 0.9145, + "step": 24256 + }, + { + "epoch": 5.174274744027303, + "grad_norm": 0.1451455003811394, + "learning_rate": 3.0928042398061074e-05, + "loss": 0.9014, + "step": 24257 + }, + { + "epoch": 5.174488054607509, + "grad_norm": 0.13448967604667017, + "learning_rate": 3.092132189434729e-05, + "loss": 0.9136, + "step": 24258 + }, + { + "epoch": 5.174701365187714, + "grad_norm": 0.12086493494499659, + "learning_rate": 3.091460194595957e-05, + "loss": 0.8838, + "step": 24259 + }, + { + "epoch": 5.174914675767918, + "grad_norm": 0.14572540746995868, + "learning_rate": 3.090788255297393e-05, + "loss": 0.8847, + "step": 24260 + }, + { + "epoch": 5.175127986348123, + "grad_norm": 0.127903043953503, + "learning_rate": 3.090116371546645e-05, + "loss": 0.927, + "step": 24261 + }, + { + "epoch": 5.175341296928328, + "grad_norm": 0.14006632402360006, + "learning_rate": 3.089444543351313e-05, + "loss": 0.8809, + "step": 24262 + }, + { + "epoch": 5.175554607508532, + "grad_norm": 0.11727312083778244, + "learning_rate": 3.088772770718999e-05, + "loss": 0.8503, + "step": 24263 + }, + { + "epoch": 5.175767918088737, + "grad_norm": 0.1341357534722193, + "learning_rate": 3.088101053657303e-05, + "loss": 0.868, + "step": 24264 + }, + { + "epoch": 5.175981228668942, + "grad_norm": 0.11791062712185289, + "learning_rate": 3.087429392173826e-05, + "loss": 0.8862, + "step": 24265 + }, + { + "epoch": 5.176194539249146, + "grad_norm": 0.12365270569229435, + "learning_rate": 3.0867577862761684e-05, + "loss": 0.8783, + "step": 24266 + }, + { + "epoch": 5.176407849829351, + "grad_norm": 0.1278681212929064, + "learning_rate": 3.086086235971929e-05, + "loss": 0.8905, + "step": 24267 + }, + { + "epoch": 5.176621160409557, + "grad_norm": 0.13733553354134123, + "learning_rate": 3.0854147412687044e-05, + "loss": 0.9041, + "step": 24268 + }, + { + "epoch": 5.176834470989761, + "grad_norm": 0.12039519442972152, + "learning_rate": 3.084743302174097e-05, + "loss": 0.8698, + "step": 24269 + }, + { + "epoch": 5.177047781569966, + "grad_norm": 0.1419222445030525, + "learning_rate": 3.084071918695702e-05, + "loss": 0.8991, + "step": 24270 + }, + { + "epoch": 5.177261092150171, + "grad_norm": 0.11966593945209195, + "learning_rate": 3.0834005908411154e-05, + "loss": 0.8816, + "step": 24271 + }, + { + "epoch": 5.177474402730375, + "grad_norm": 0.15793401352117376, + "learning_rate": 3.082729318617931e-05, + "loss": 0.8878, + "step": 24272 + }, + { + "epoch": 5.17768771331058, + "grad_norm": 0.12692289059876802, + "learning_rate": 3.082058102033751e-05, + "loss": 0.9101, + "step": 24273 + }, + { + "epoch": 5.177901023890785, + "grad_norm": 0.15680650347813266, + "learning_rate": 3.081386941096165e-05, + "loss": 0.8886, + "step": 24274 + }, + { + "epoch": 5.1781143344709895, + "grad_norm": 0.11726600822844656, + "learning_rate": 3.080715835812771e-05, + "loss": 0.8839, + "step": 24275 + }, + { + "epoch": 5.178327645051194, + "grad_norm": 0.12937285719016725, + "learning_rate": 3.080044786191155e-05, + "loss": 0.8838, + "step": 24276 + }, + { + "epoch": 5.1785409556314, + "grad_norm": 0.1254918617431536, + "learning_rate": 3.0793737922389183e-05, + "loss": 0.8991, + "step": 24277 + }, + { + "epoch": 5.178754266211604, + "grad_norm": 0.11962024547082893, + "learning_rate": 3.07870285396365e-05, + "loss": 0.8575, + "step": 24278 + }, + { + "epoch": 5.178967576791809, + "grad_norm": 0.141281085894353, + "learning_rate": 3.078031971372941e-05, + "loss": 0.9082, + "step": 24279 + }, + { + "epoch": 5.179180887372014, + "grad_norm": 0.12850074317642685, + "learning_rate": 3.077361144474381e-05, + "loss": 0.8837, + "step": 24280 + }, + { + "epoch": 5.1793941979522184, + "grad_norm": 0.13395550326598646, + "learning_rate": 3.0766903732755646e-05, + "loss": 0.8616, + "step": 24281 + }, + { + "epoch": 5.179607508532423, + "grad_norm": 0.1545728358913535, + "learning_rate": 3.0760196577840794e-05, + "loss": 0.8661, + "step": 24282 + }, + { + "epoch": 5.179820819112628, + "grad_norm": 0.12536374346073517, + "learning_rate": 3.075348998007515e-05, + "loss": 0.8641, + "step": 24283 + }, + { + "epoch": 5.1800341296928325, + "grad_norm": 0.16067359383178723, + "learning_rate": 3.0746783939534565e-05, + "loss": 0.8792, + "step": 24284 + }, + { + "epoch": 5.180247440273037, + "grad_norm": 0.13036030566130197, + "learning_rate": 3.0740078456294975e-05, + "loss": 0.889, + "step": 24285 + }, + { + "epoch": 5.180460750853243, + "grad_norm": 0.17370331018768342, + "learning_rate": 3.073337353043226e-05, + "loss": 0.8743, + "step": 24286 + }, + { + "epoch": 5.180674061433447, + "grad_norm": 0.13821362003909157, + "learning_rate": 3.072666916202221e-05, + "loss": 0.8925, + "step": 24287 + }, + { + "epoch": 5.180887372013652, + "grad_norm": 0.1549881580269116, + "learning_rate": 3.071996535114071e-05, + "loss": 0.8439, + "step": 24288 + }, + { + "epoch": 5.181100682593857, + "grad_norm": 0.1277182353090593, + "learning_rate": 3.0713262097863665e-05, + "loss": 0.8562, + "step": 24289 + }, + { + "epoch": 5.1813139931740615, + "grad_norm": 0.13582078435418057, + "learning_rate": 3.070655940226687e-05, + "loss": 0.8743, + "step": 24290 + }, + { + "epoch": 5.181527303754266, + "grad_norm": 0.13469799490262988, + "learning_rate": 3.06998572644262e-05, + "loss": 0.8908, + "step": 24291 + }, + { + "epoch": 5.181740614334471, + "grad_norm": 0.12523200925989902, + "learning_rate": 3.0693155684417444e-05, + "loss": 0.8617, + "step": 24292 + }, + { + "epoch": 5.1819539249146755, + "grad_norm": 0.15637211861581343, + "learning_rate": 3.068645466231648e-05, + "loss": 0.8932, + "step": 24293 + }, + { + "epoch": 5.18216723549488, + "grad_norm": 0.11738345749236749, + "learning_rate": 3.067975419819912e-05, + "loss": 0.8816, + "step": 24294 + }, + { + "epoch": 5.182380546075085, + "grad_norm": 0.18006684277947624, + "learning_rate": 3.067305429214117e-05, + "loss": 0.8811, + "step": 24295 + }, + { + "epoch": 5.1825938566552905, + "grad_norm": 0.12950618968637703, + "learning_rate": 3.0666354944218413e-05, + "loss": 0.8744, + "step": 24296 + }, + { + "epoch": 5.182807167235495, + "grad_norm": 0.16467025587001177, + "learning_rate": 3.0659656154506704e-05, + "loss": 0.8895, + "step": 24297 + }, + { + "epoch": 5.1830204778157, + "grad_norm": 0.1275288172748328, + "learning_rate": 3.0652957923081844e-05, + "loss": 0.8849, + "step": 24298 + }, + { + "epoch": 5.1832337883959045, + "grad_norm": 0.15138828280673844, + "learning_rate": 3.064626025001957e-05, + "loss": 0.8884, + "step": 24299 + }, + { + "epoch": 5.183447098976109, + "grad_norm": 0.14303089284202988, + "learning_rate": 3.0639563135395665e-05, + "loss": 0.8942, + "step": 24300 + }, + { + "epoch": 5.183660409556314, + "grad_norm": 0.12777247654278556, + "learning_rate": 3.063286657928595e-05, + "loss": 0.9058, + "step": 24301 + }, + { + "epoch": 5.183873720136519, + "grad_norm": 0.11695841593833663, + "learning_rate": 3.062617058176619e-05, + "loss": 0.879, + "step": 24302 + }, + { + "epoch": 5.184087030716723, + "grad_norm": 0.14057825270520136, + "learning_rate": 3.061947514291214e-05, + "loss": 0.9219, + "step": 24303 + }, + { + "epoch": 5.184300341296928, + "grad_norm": 0.11633471948404128, + "learning_rate": 3.0612780262799525e-05, + "loss": 0.8679, + "step": 24304 + }, + { + "epoch": 5.1845136518771335, + "grad_norm": 0.13130807351742568, + "learning_rate": 3.060608594150416e-05, + "loss": 0.8914, + "step": 24305 + }, + { + "epoch": 5.184726962457338, + "grad_norm": 0.12662312286471647, + "learning_rate": 3.059939217910177e-05, + "loss": 0.9094, + "step": 24306 + }, + { + "epoch": 5.184940273037543, + "grad_norm": 0.13634054539796894, + "learning_rate": 3.059269897566809e-05, + "loss": 0.8887, + "step": 24307 + }, + { + "epoch": 5.185153583617748, + "grad_norm": 0.12862385130553636, + "learning_rate": 3.058600633127882e-05, + "loss": 0.9034, + "step": 24308 + }, + { + "epoch": 5.185366894197952, + "grad_norm": 0.14418639406018632, + "learning_rate": 3.0579314246009775e-05, + "loss": 0.8901, + "step": 24309 + }, + { + "epoch": 5.185580204778157, + "grad_norm": 0.15057200265492718, + "learning_rate": 3.0572622719936595e-05, + "loss": 0.9085, + "step": 24310 + }, + { + "epoch": 5.185793515358362, + "grad_norm": 0.1286243095576554, + "learning_rate": 3.056593175313502e-05, + "loss": 0.8932, + "step": 24311 + }, + { + "epoch": 5.186006825938566, + "grad_norm": 0.11685959857651912, + "learning_rate": 3.0559241345680735e-05, + "loss": 0.8764, + "step": 24312 + }, + { + "epoch": 5.186220136518771, + "grad_norm": 0.12706229375370146, + "learning_rate": 3.0552551497649484e-05, + "loss": 0.8869, + "step": 24313 + }, + { + "epoch": 5.186433447098976, + "grad_norm": 0.130175543455511, + "learning_rate": 3.054586220911695e-05, + "loss": 0.9109, + "step": 24314 + }, + { + "epoch": 5.186646757679181, + "grad_norm": 0.12882796987039252, + "learning_rate": 3.0539173480158814e-05, + "loss": 0.8688, + "step": 24315 + }, + { + "epoch": 5.186860068259386, + "grad_norm": 0.12416670884584378, + "learning_rate": 3.0532485310850736e-05, + "loss": 0.889, + "step": 24316 + }, + { + "epoch": 5.187073378839591, + "grad_norm": 0.13501660328259948, + "learning_rate": 3.052579770126845e-05, + "loss": 0.8788, + "step": 24317 + }, + { + "epoch": 5.187286689419795, + "grad_norm": 0.14121475902598143, + "learning_rate": 3.051911065148759e-05, + "loss": 0.9006, + "step": 24318 + }, + { + "epoch": 5.1875, + "grad_norm": 0.13139958687922382, + "learning_rate": 3.051242416158383e-05, + "loss": 0.8963, + "step": 24319 + }, + { + "epoch": 5.187713310580205, + "grad_norm": 0.14967599707968, + "learning_rate": 3.0505738231632817e-05, + "loss": 0.9004, + "step": 24320 + }, + { + "epoch": 5.187926621160409, + "grad_norm": 0.14065087684342448, + "learning_rate": 3.049905286171021e-05, + "loss": 0.9084, + "step": 24321 + }, + { + "epoch": 5.188139931740614, + "grad_norm": 0.16978664027054202, + "learning_rate": 3.049236805189166e-05, + "loss": 0.891, + "step": 24322 + }, + { + "epoch": 5.188353242320819, + "grad_norm": 0.12675279947047283, + "learning_rate": 3.0485683802252787e-05, + "loss": 0.897, + "step": 24323 + }, + { + "epoch": 5.188566552901024, + "grad_norm": 0.13573818877546603, + "learning_rate": 3.047900011286921e-05, + "loss": 0.9115, + "step": 24324 + }, + { + "epoch": 5.188779863481229, + "grad_norm": 0.13884020671324968, + "learning_rate": 3.047231698381661e-05, + "loss": 0.8922, + "step": 24325 + }, + { + "epoch": 5.188993174061434, + "grad_norm": 0.1361852872230105, + "learning_rate": 3.0465634415170578e-05, + "loss": 0.8777, + "step": 24326 + }, + { + "epoch": 5.189206484641638, + "grad_norm": 0.1290135826658119, + "learning_rate": 3.045895240700672e-05, + "loss": 0.9102, + "step": 24327 + }, + { + "epoch": 5.189419795221843, + "grad_norm": 0.15662455047976112, + "learning_rate": 3.045227095940063e-05, + "loss": 0.8747, + "step": 24328 + }, + { + "epoch": 5.189633105802048, + "grad_norm": 0.12446386295411734, + "learning_rate": 3.0445590072427946e-05, + "loss": 0.889, + "step": 24329 + }, + { + "epoch": 5.189846416382252, + "grad_norm": 0.13788347396492812, + "learning_rate": 3.0438909746164255e-05, + "loss": 0.8813, + "step": 24330 + }, + { + "epoch": 5.190059726962457, + "grad_norm": 0.12690926095500227, + "learning_rate": 3.0432229980685116e-05, + "loss": 0.8429, + "step": 24331 + }, + { + "epoch": 5.190273037542662, + "grad_norm": 0.12413333184813775, + "learning_rate": 3.0425550776066147e-05, + "loss": 0.8711, + "step": 24332 + }, + { + "epoch": 5.1904863481228665, + "grad_norm": 0.12526418132493694, + "learning_rate": 3.0418872132382896e-05, + "loss": 0.8598, + "step": 24333 + }, + { + "epoch": 5.190699658703072, + "grad_norm": 0.11399691947197434, + "learning_rate": 3.041219404971094e-05, + "loss": 0.8851, + "step": 24334 + }, + { + "epoch": 5.190912969283277, + "grad_norm": 0.1342653638030641, + "learning_rate": 3.0405516528125845e-05, + "loss": 0.8399, + "step": 24335 + }, + { + "epoch": 5.191126279863481, + "grad_norm": 0.11084510768237049, + "learning_rate": 3.0398839567703136e-05, + "loss": 0.9014, + "step": 24336 + }, + { + "epoch": 5.191339590443686, + "grad_norm": 0.14738958632265992, + "learning_rate": 3.0392163168518417e-05, + "loss": 0.9034, + "step": 24337 + }, + { + "epoch": 5.191552901023891, + "grad_norm": 0.12371137353019426, + "learning_rate": 3.038548733064721e-05, + "loss": 0.8893, + "step": 24338 + }, + { + "epoch": 5.1917662116040955, + "grad_norm": 0.1399071164942065, + "learning_rate": 3.037881205416505e-05, + "loss": 0.9102, + "step": 24339 + }, + { + "epoch": 5.1919795221843, + "grad_norm": 0.17979872145943196, + "learning_rate": 3.037213733914743e-05, + "loss": 0.8861, + "step": 24340 + }, + { + "epoch": 5.192192832764505, + "grad_norm": 0.128084503241321, + "learning_rate": 3.036546318566995e-05, + "loss": 0.8986, + "step": 24341 + }, + { + "epoch": 5.1924061433447095, + "grad_norm": 0.17539035093984728, + "learning_rate": 3.035878959380811e-05, + "loss": 0.9034, + "step": 24342 + }, + { + "epoch": 5.192619453924915, + "grad_norm": 0.1598043736109558, + "learning_rate": 3.0352116563637347e-05, + "loss": 0.8817, + "step": 24343 + }, + { + "epoch": 5.19283276450512, + "grad_norm": 0.11313261795790439, + "learning_rate": 3.0345444095233242e-05, + "loss": 0.8662, + "step": 24344 + }, + { + "epoch": 5.1930460750853245, + "grad_norm": 0.18252694870289968, + "learning_rate": 3.0338772188671278e-05, + "loss": 0.896, + "step": 24345 + }, + { + "epoch": 5.193259385665529, + "grad_norm": 0.1392735331612707, + "learning_rate": 3.0332100844026943e-05, + "loss": 0.8856, + "step": 24346 + }, + { + "epoch": 5.193472696245734, + "grad_norm": 0.14723467070405194, + "learning_rate": 3.032543006137572e-05, + "loss": 0.8944, + "step": 24347 + }, + { + "epoch": 5.1936860068259385, + "grad_norm": 0.1370942327549011, + "learning_rate": 3.0318759840793065e-05, + "loss": 0.8661, + "step": 24348 + }, + { + "epoch": 5.193899317406143, + "grad_norm": 0.1510548988068153, + "learning_rate": 3.0312090182354504e-05, + "loss": 0.9083, + "step": 24349 + }, + { + "epoch": 5.194112627986348, + "grad_norm": 0.1162690408987469, + "learning_rate": 3.0305421086135482e-05, + "loss": 0.8693, + "step": 24350 + }, + { + "epoch": 5.194325938566553, + "grad_norm": 0.15458466831409437, + "learning_rate": 3.0298752552211455e-05, + "loss": 0.8951, + "step": 24351 + }, + { + "epoch": 5.194539249146757, + "grad_norm": 0.12233655321999258, + "learning_rate": 3.0292084580657843e-05, + "loss": 0.9011, + "step": 24352 + }, + { + "epoch": 5.194752559726963, + "grad_norm": 0.1236253813984127, + "learning_rate": 3.02854171715502e-05, + "loss": 0.8661, + "step": 24353 + }, + { + "epoch": 5.1949658703071675, + "grad_norm": 0.11681798238118948, + "learning_rate": 3.027875032496386e-05, + "loss": 0.8848, + "step": 24354 + }, + { + "epoch": 5.195179180887372, + "grad_norm": 0.13328494798513535, + "learning_rate": 3.0272084040974267e-05, + "loss": 0.8511, + "step": 24355 + }, + { + "epoch": 5.195392491467577, + "grad_norm": 0.1337766522504186, + "learning_rate": 3.026541831965691e-05, + "loss": 0.8873, + "step": 24356 + }, + { + "epoch": 5.1956058020477816, + "grad_norm": 0.12635503850085655, + "learning_rate": 3.025875316108718e-05, + "loss": 0.855, + "step": 24357 + }, + { + "epoch": 5.195819112627986, + "grad_norm": 0.12293707436037943, + "learning_rate": 3.0252088565340504e-05, + "loss": 0.8816, + "step": 24358 + }, + { + "epoch": 5.196032423208191, + "grad_norm": 0.12127641197402986, + "learning_rate": 3.0245424532492272e-05, + "loss": 0.9014, + "step": 24359 + }, + { + "epoch": 5.196245733788396, + "grad_norm": 0.11982116424786071, + "learning_rate": 3.0238761062617875e-05, + "loss": 0.9031, + "step": 24360 + }, + { + "epoch": 5.1964590443686, + "grad_norm": 0.11822744751089818, + "learning_rate": 3.023209815579276e-05, + "loss": 0.9063, + "step": 24361 + }, + { + "epoch": 5.196672354948806, + "grad_norm": 0.125153640140233, + "learning_rate": 3.0225435812092292e-05, + "loss": 0.9057, + "step": 24362 + }, + { + "epoch": 5.1968856655290105, + "grad_norm": 0.1068852927542606, + "learning_rate": 3.021877403159186e-05, + "loss": 0.9032, + "step": 24363 + }, + { + "epoch": 5.197098976109215, + "grad_norm": 0.12333373655407172, + "learning_rate": 3.0212112814366804e-05, + "loss": 0.8596, + "step": 24364 + }, + { + "epoch": 5.19731228668942, + "grad_norm": 0.13004429635784387, + "learning_rate": 3.020545216049259e-05, + "loss": 0.9057, + "step": 24365 + }, + { + "epoch": 5.197525597269625, + "grad_norm": 0.114727852259704, + "learning_rate": 3.019879207004449e-05, + "loss": 0.8759, + "step": 24366 + }, + { + "epoch": 5.197738907849829, + "grad_norm": 0.1448732968273003, + "learning_rate": 3.0192132543097864e-05, + "loss": 0.9254, + "step": 24367 + }, + { + "epoch": 5.197952218430034, + "grad_norm": 0.11395184630551755, + "learning_rate": 3.0185473579728123e-05, + "loss": 0.853, + "step": 24368 + }, + { + "epoch": 5.198165529010239, + "grad_norm": 0.1290763666614251, + "learning_rate": 3.0178815180010592e-05, + "loss": 0.9167, + "step": 24369 + }, + { + "epoch": 5.198378839590443, + "grad_norm": 0.1253923943366385, + "learning_rate": 3.0172157344020607e-05, + "loss": 0.8982, + "step": 24370 + }, + { + "epoch": 5.198592150170649, + "grad_norm": 0.14575530193936792, + "learning_rate": 3.0165500071833498e-05, + "loss": 0.9013, + "step": 24371 + }, + { + "epoch": 5.198805460750854, + "grad_norm": 0.12488224983909428, + "learning_rate": 3.0158843363524563e-05, + "loss": 0.9003, + "step": 24372 + }, + { + "epoch": 5.199018771331058, + "grad_norm": 0.1494777034731735, + "learning_rate": 3.0152187219169186e-05, + "loss": 0.9128, + "step": 24373 + }, + { + "epoch": 5.199232081911263, + "grad_norm": 0.1086120252301538, + "learning_rate": 3.014553163884264e-05, + "loss": 0.8766, + "step": 24374 + }, + { + "epoch": 5.199445392491468, + "grad_norm": 0.14269626941274202, + "learning_rate": 3.0138876622620253e-05, + "loss": 0.8997, + "step": 24375 + }, + { + "epoch": 5.199658703071672, + "grad_norm": 0.1173137978016393, + "learning_rate": 3.013222217057731e-05, + "loss": 0.87, + "step": 24376 + }, + { + "epoch": 5.199872013651877, + "grad_norm": 0.14416658880912073, + "learning_rate": 3.0125568282789116e-05, + "loss": 0.9087, + "step": 24377 + }, + { + "epoch": 5.200085324232082, + "grad_norm": 0.1158249501191283, + "learning_rate": 3.011891495933095e-05, + "loss": 0.8689, + "step": 24378 + }, + { + "epoch": 5.200298634812286, + "grad_norm": 0.1635528843681279, + "learning_rate": 3.0112262200278076e-05, + "loss": 0.904, + "step": 24379 + }, + { + "epoch": 5.200511945392491, + "grad_norm": 0.11871135341274226, + "learning_rate": 3.010561000570582e-05, + "loss": 0.8631, + "step": 24380 + }, + { + "epoch": 5.200725255972697, + "grad_norm": 0.150816880325653, + "learning_rate": 3.0098958375689425e-05, + "loss": 0.9172, + "step": 24381 + }, + { + "epoch": 5.200938566552901, + "grad_norm": 0.12823891672631302, + "learning_rate": 3.0092307310304156e-05, + "loss": 0.8776, + "step": 24382 + }, + { + "epoch": 5.201151877133106, + "grad_norm": 0.14792057114395377, + "learning_rate": 3.0085656809625274e-05, + "loss": 0.8986, + "step": 24383 + }, + { + "epoch": 5.201365187713311, + "grad_norm": 0.1372646683775495, + "learning_rate": 3.007900687372799e-05, + "loss": 0.9045, + "step": 24384 + }, + { + "epoch": 5.201578498293515, + "grad_norm": 0.16399150130669507, + "learning_rate": 3.0072357502687616e-05, + "loss": 0.8909, + "step": 24385 + }, + { + "epoch": 5.20179180887372, + "grad_norm": 0.15139868328779316, + "learning_rate": 3.006570869657936e-05, + "loss": 0.8749, + "step": 24386 + }, + { + "epoch": 5.202005119453925, + "grad_norm": 0.14817706943519413, + "learning_rate": 3.0059060455478444e-05, + "loss": 0.8651, + "step": 24387 + }, + { + "epoch": 5.202218430034129, + "grad_norm": 0.17829267905372626, + "learning_rate": 3.0052412779460104e-05, + "loss": 0.9142, + "step": 24388 + }, + { + "epoch": 5.202431740614334, + "grad_norm": 0.14558267118329185, + "learning_rate": 3.004576566859955e-05, + "loss": 0.8948, + "step": 24389 + }, + { + "epoch": 5.202645051194539, + "grad_norm": 0.1590611630455742, + "learning_rate": 3.0039119122971993e-05, + "loss": 0.9065, + "step": 24390 + }, + { + "epoch": 5.202858361774744, + "grad_norm": 0.15741617550875406, + "learning_rate": 3.0032473142652648e-05, + "loss": 0.8957, + "step": 24391 + }, + { + "epoch": 5.203071672354949, + "grad_norm": 0.13738115830388417, + "learning_rate": 3.0025827727716685e-05, + "loss": 0.8861, + "step": 24392 + }, + { + "epoch": 5.203284982935154, + "grad_norm": 0.14677023183127788, + "learning_rate": 3.0019182878239345e-05, + "loss": 0.875, + "step": 24393 + }, + { + "epoch": 5.203498293515358, + "grad_norm": 0.1540700268417002, + "learning_rate": 3.0012538594295793e-05, + "loss": 0.882, + "step": 24394 + }, + { + "epoch": 5.203711604095563, + "grad_norm": 0.1525094689291411, + "learning_rate": 3.0005894875961207e-05, + "loss": 0.8872, + "step": 24395 + }, + { + "epoch": 5.203924914675768, + "grad_norm": 0.15447473588662763, + "learning_rate": 2.9999251723310733e-05, + "loss": 0.8595, + "step": 24396 + }, + { + "epoch": 5.2041382252559725, + "grad_norm": 0.14559901481448295, + "learning_rate": 2.9992609136419602e-05, + "loss": 0.8772, + "step": 24397 + }, + { + "epoch": 5.204351535836177, + "grad_norm": 0.13534526996554755, + "learning_rate": 2.998596711536296e-05, + "loss": 0.8683, + "step": 24398 + }, + { + "epoch": 5.204564846416382, + "grad_norm": 0.14117927873745198, + "learning_rate": 2.9979325660215885e-05, + "loss": 0.8913, + "step": 24399 + }, + { + "epoch": 5.204778156996587, + "grad_norm": 0.14132484139003265, + "learning_rate": 2.997268477105361e-05, + "loss": 0.9174, + "step": 24400 + }, + { + "epoch": 5.204991467576792, + "grad_norm": 0.13572650654297422, + "learning_rate": 2.996604444795125e-05, + "loss": 0.9214, + "step": 24401 + }, + { + "epoch": 5.205204778156997, + "grad_norm": 0.18742427730524458, + "learning_rate": 2.9959404690983943e-05, + "loss": 0.8513, + "step": 24402 + }, + { + "epoch": 5.2054180887372015, + "grad_norm": 0.1232609381463117, + "learning_rate": 2.9952765500226806e-05, + "loss": 0.8665, + "step": 24403 + }, + { + "epoch": 5.205631399317406, + "grad_norm": 0.12980293195232756, + "learning_rate": 2.9946126875754946e-05, + "loss": 0.8985, + "step": 24404 + }, + { + "epoch": 5.205844709897611, + "grad_norm": 0.12154089257920726, + "learning_rate": 2.993948881764353e-05, + "loss": 0.8925, + "step": 24405 + }, + { + "epoch": 5.2060580204778155, + "grad_norm": 0.132066308302589, + "learning_rate": 2.9932851325967635e-05, + "loss": 0.8835, + "step": 24406 + }, + { + "epoch": 5.20627133105802, + "grad_norm": 0.13841766363007868, + "learning_rate": 2.992621440080237e-05, + "loss": 0.8843, + "step": 24407 + }, + { + "epoch": 5.206484641638225, + "grad_norm": 0.11142762947471177, + "learning_rate": 2.991957804222281e-05, + "loss": 0.8407, + "step": 24408 + }, + { + "epoch": 5.2066979522184305, + "grad_norm": 0.16618156562755373, + "learning_rate": 2.9912942250304122e-05, + "loss": 0.9209, + "step": 24409 + }, + { + "epoch": 5.206911262798635, + "grad_norm": 0.12431638752741463, + "learning_rate": 2.99063070251213e-05, + "loss": 0.899, + "step": 24410 + }, + { + "epoch": 5.20712457337884, + "grad_norm": 0.17284759883652043, + "learning_rate": 2.9899672366749438e-05, + "loss": 0.9115, + "step": 24411 + }, + { + "epoch": 5.2073378839590445, + "grad_norm": 0.17713265444197435, + "learning_rate": 2.9893038275263645e-05, + "loss": 0.9163, + "step": 24412 + }, + { + "epoch": 5.207551194539249, + "grad_norm": 0.12584749322069325, + "learning_rate": 2.9886404750738972e-05, + "loss": 0.8875, + "step": 24413 + }, + { + "epoch": 5.207764505119454, + "grad_norm": 0.15485153716334127, + "learning_rate": 2.9879771793250474e-05, + "loss": 0.8988, + "step": 24414 + }, + { + "epoch": 5.207977815699659, + "grad_norm": 0.12279968358128934, + "learning_rate": 2.9873139402873197e-05, + "loss": 0.8908, + "step": 24415 + }, + { + "epoch": 5.208191126279863, + "grad_norm": 0.16049037353229034, + "learning_rate": 2.9866507579682165e-05, + "loss": 0.8752, + "step": 24416 + }, + { + "epoch": 5.208404436860068, + "grad_norm": 0.12283898366547737, + "learning_rate": 2.9859876323752465e-05, + "loss": 0.893, + "step": 24417 + }, + { + "epoch": 5.208617747440273, + "grad_norm": 0.1566519672830691, + "learning_rate": 2.9853245635159108e-05, + "loss": 0.8926, + "step": 24418 + }, + { + "epoch": 5.208831058020478, + "grad_norm": 0.1627801632875511, + "learning_rate": 2.9846615513977127e-05, + "loss": 0.8608, + "step": 24419 + }, + { + "epoch": 5.209044368600683, + "grad_norm": 0.12834547471904623, + "learning_rate": 2.9839985960281504e-05, + "loss": 0.8551, + "step": 24420 + }, + { + "epoch": 5.209257679180888, + "grad_norm": 0.13701783350518176, + "learning_rate": 2.9833356974147345e-05, + "loss": 0.882, + "step": 24421 + }, + { + "epoch": 5.209470989761092, + "grad_norm": 0.1340044575176057, + "learning_rate": 2.982672855564956e-05, + "loss": 0.8734, + "step": 24422 + }, + { + "epoch": 5.209684300341297, + "grad_norm": 0.17117815263435737, + "learning_rate": 2.9820100704863166e-05, + "loss": 0.8791, + "step": 24423 + }, + { + "epoch": 5.209897610921502, + "grad_norm": 0.14366704413344455, + "learning_rate": 2.9813473421863192e-05, + "loss": 0.8636, + "step": 24424 + }, + { + "epoch": 5.210110921501706, + "grad_norm": 0.14176311140496986, + "learning_rate": 2.980684670672462e-05, + "loss": 0.8639, + "step": 24425 + }, + { + "epoch": 5.210324232081911, + "grad_norm": 0.14528895332663738, + "learning_rate": 2.9800220559522414e-05, + "loss": 0.9059, + "step": 24426 + }, + { + "epoch": 5.210537542662116, + "grad_norm": 0.16665602104597885, + "learning_rate": 2.9793594980331568e-05, + "loss": 0.8955, + "step": 24427 + }, + { + "epoch": 5.210750853242321, + "grad_norm": 0.1392634269569929, + "learning_rate": 2.9786969969227004e-05, + "loss": 0.8706, + "step": 24428 + }, + { + "epoch": 5.210964163822526, + "grad_norm": 0.16880012308007636, + "learning_rate": 2.9780345526283745e-05, + "loss": 0.9248, + "step": 24429 + }, + { + "epoch": 5.211177474402731, + "grad_norm": 0.1495110405079719, + "learning_rate": 2.9773721651576727e-05, + "loss": 0.862, + "step": 24430 + }, + { + "epoch": 5.211390784982935, + "grad_norm": 0.15433015399094813, + "learning_rate": 2.976709834518089e-05, + "loss": 0.8787, + "step": 24431 + }, + { + "epoch": 5.21160409556314, + "grad_norm": 0.1418081859205254, + "learning_rate": 2.9760475607171176e-05, + "loss": 0.8724, + "step": 24432 + }, + { + "epoch": 5.211817406143345, + "grad_norm": 0.14853787521453163, + "learning_rate": 2.9753853437622538e-05, + "loss": 0.8806, + "step": 24433 + }, + { + "epoch": 5.212030716723549, + "grad_norm": 0.13615771867196474, + "learning_rate": 2.974723183660988e-05, + "loss": 0.9001, + "step": 24434 + }, + { + "epoch": 5.212244027303754, + "grad_norm": 0.14389727073441305, + "learning_rate": 2.9740610804208115e-05, + "loss": 0.9002, + "step": 24435 + }, + { + "epoch": 5.212457337883959, + "grad_norm": 0.13784337644856137, + "learning_rate": 2.973399034049222e-05, + "loss": 0.8756, + "step": 24436 + }, + { + "epoch": 5.212670648464163, + "grad_norm": 0.12631087951968828, + "learning_rate": 2.9727370445537064e-05, + "loss": 0.8857, + "step": 24437 + }, + { + "epoch": 5.212883959044369, + "grad_norm": 0.15021644796905573, + "learning_rate": 2.972075111941755e-05, + "loss": 0.8631, + "step": 24438 + }, + { + "epoch": 5.213097269624574, + "grad_norm": 0.1370969086520216, + "learning_rate": 2.9714132362208593e-05, + "loss": 0.9094, + "step": 24439 + }, + { + "epoch": 5.213310580204778, + "grad_norm": 0.14321673985842215, + "learning_rate": 2.9707514173985044e-05, + "loss": 0.8842, + "step": 24440 + }, + { + "epoch": 5.213523890784983, + "grad_norm": 0.1177274807951125, + "learning_rate": 2.9700896554821842e-05, + "loss": 0.9022, + "step": 24441 + }, + { + "epoch": 5.213737201365188, + "grad_norm": 0.13482534636011026, + "learning_rate": 2.969427950479385e-05, + "loss": 0.8664, + "step": 24442 + }, + { + "epoch": 5.213950511945392, + "grad_norm": 0.12861519973107113, + "learning_rate": 2.968766302397592e-05, + "loss": 0.8769, + "step": 24443 + }, + { + "epoch": 5.214163822525597, + "grad_norm": 0.14245815849119037, + "learning_rate": 2.9681047112442934e-05, + "loss": 0.9217, + "step": 24444 + }, + { + "epoch": 5.214377133105802, + "grad_norm": 0.12401723921664011, + "learning_rate": 2.967443177026975e-05, + "loss": 0.9058, + "step": 24445 + }, + { + "epoch": 5.214590443686006, + "grad_norm": 0.12819625548882382, + "learning_rate": 2.9667816997531215e-05, + "loss": 0.9096, + "step": 24446 + }, + { + "epoch": 5.214803754266212, + "grad_norm": 0.12263211069816485, + "learning_rate": 2.9661202794302153e-05, + "loss": 0.8848, + "step": 24447 + }, + { + "epoch": 5.215017064846417, + "grad_norm": 0.14074090317367877, + "learning_rate": 2.9654589160657453e-05, + "loss": 0.9069, + "step": 24448 + }, + { + "epoch": 5.215230375426621, + "grad_norm": 0.14081151770485417, + "learning_rate": 2.9647976096671917e-05, + "loss": 0.8846, + "step": 24449 + }, + { + "epoch": 5.215443686006826, + "grad_norm": 0.11907583611113189, + "learning_rate": 2.9641363602420382e-05, + "loss": 0.8842, + "step": 24450 + }, + { + "epoch": 5.215656996587031, + "grad_norm": 0.12104617607502526, + "learning_rate": 2.963475167797767e-05, + "loss": 0.9018, + "step": 24451 + }, + { + "epoch": 5.215870307167235, + "grad_norm": 0.13914066202473555, + "learning_rate": 2.9628140323418556e-05, + "loss": 0.8837, + "step": 24452 + }, + { + "epoch": 5.21608361774744, + "grad_norm": 0.11739310238868095, + "learning_rate": 2.9621529538817912e-05, + "loss": 0.8844, + "step": 24453 + }, + { + "epoch": 5.216296928327645, + "grad_norm": 0.12840453576842148, + "learning_rate": 2.9614919324250536e-05, + "loss": 0.8971, + "step": 24454 + }, + { + "epoch": 5.2165102389078495, + "grad_norm": 0.11676668608181852, + "learning_rate": 2.9608309679791144e-05, + "loss": 0.888, + "step": 24455 + }, + { + "epoch": 5.216723549488055, + "grad_norm": 0.1235227471017982, + "learning_rate": 2.96017006055146e-05, + "loss": 0.9075, + "step": 24456 + }, + { + "epoch": 5.21693686006826, + "grad_norm": 0.10965948011070757, + "learning_rate": 2.959509210149566e-05, + "loss": 0.8887, + "step": 24457 + }, + { + "epoch": 5.217150170648464, + "grad_norm": 0.12045087276113238, + "learning_rate": 2.95884841678091e-05, + "loss": 0.8941, + "step": 24458 + }, + { + "epoch": 5.217363481228669, + "grad_norm": 0.11355889900076442, + "learning_rate": 2.958187680452966e-05, + "loss": 0.9125, + "step": 24459 + }, + { + "epoch": 5.217576791808874, + "grad_norm": 0.12212260794763999, + "learning_rate": 2.957527001173216e-05, + "loss": 0.8886, + "step": 24460 + }, + { + "epoch": 5.2177901023890785, + "grad_norm": 0.11573718761262586, + "learning_rate": 2.9568663789491335e-05, + "loss": 0.8717, + "step": 24461 + }, + { + "epoch": 5.218003412969283, + "grad_norm": 0.11860692350988356, + "learning_rate": 2.9562058137881925e-05, + "loss": 0.8892, + "step": 24462 + }, + { + "epoch": 5.218216723549488, + "grad_norm": 0.12130880424132987, + "learning_rate": 2.955545305697867e-05, + "loss": 0.903, + "step": 24463 + }, + { + "epoch": 5.2184300341296925, + "grad_norm": 0.1080302807645162, + "learning_rate": 2.954884854685629e-05, + "loss": 0.8796, + "step": 24464 + }, + { + "epoch": 5.218643344709897, + "grad_norm": 0.1379474185206935, + "learning_rate": 2.954224460758959e-05, + "loss": 0.8883, + "step": 24465 + }, + { + "epoch": 5.218856655290103, + "grad_norm": 0.12311375472922055, + "learning_rate": 2.953564123925322e-05, + "loss": 0.8751, + "step": 24466 + }, + { + "epoch": 5.2190699658703075, + "grad_norm": 0.11685855406588523, + "learning_rate": 2.9529038441921885e-05, + "loss": 0.9174, + "step": 24467 + }, + { + "epoch": 5.219283276450512, + "grad_norm": 0.13744213057947946, + "learning_rate": 2.9522436215670356e-05, + "loss": 0.9109, + "step": 24468 + }, + { + "epoch": 5.219496587030717, + "grad_norm": 0.1289775732113052, + "learning_rate": 2.9515834560573312e-05, + "loss": 0.9267, + "step": 24469 + }, + { + "epoch": 5.2197098976109215, + "grad_norm": 0.12371665822996582, + "learning_rate": 2.9509233476705446e-05, + "loss": 0.8869, + "step": 24470 + }, + { + "epoch": 5.219923208191126, + "grad_norm": 0.15514793754627978, + "learning_rate": 2.9502632964141427e-05, + "loss": 0.8995, + "step": 24471 + }, + { + "epoch": 5.220136518771331, + "grad_norm": 0.12066007613594307, + "learning_rate": 2.9496033022955993e-05, + "loss": 0.8963, + "step": 24472 + }, + { + "epoch": 5.220349829351536, + "grad_norm": 0.15514674855637453, + "learning_rate": 2.9489433653223796e-05, + "loss": 0.8893, + "step": 24473 + }, + { + "epoch": 5.22056313993174, + "grad_norm": 0.1196835391151377, + "learning_rate": 2.9482834855019507e-05, + "loss": 0.8704, + "step": 24474 + }, + { + "epoch": 5.220776450511945, + "grad_norm": 0.1566626754833377, + "learning_rate": 2.947623662841779e-05, + "loss": 0.8615, + "step": 24475 + }, + { + "epoch": 5.2209897610921505, + "grad_norm": 0.13199683446702806, + "learning_rate": 2.9469638973493276e-05, + "loss": 0.9137, + "step": 24476 + }, + { + "epoch": 5.221203071672355, + "grad_norm": 0.14927778096836392, + "learning_rate": 2.9463041890320706e-05, + "loss": 0.9117, + "step": 24477 + }, + { + "epoch": 5.22141638225256, + "grad_norm": 0.15715203734838612, + "learning_rate": 2.9456445378974634e-05, + "loss": 0.8991, + "step": 24478 + }, + { + "epoch": 5.221629692832765, + "grad_norm": 0.1426217295198415, + "learning_rate": 2.944984943952971e-05, + "loss": 0.8875, + "step": 24479 + }, + { + "epoch": 5.221843003412969, + "grad_norm": 0.13935110243023163, + "learning_rate": 2.944325407206061e-05, + "loss": 0.8682, + "step": 24480 + }, + { + "epoch": 5.222056313993174, + "grad_norm": 0.1226684679018723, + "learning_rate": 2.9436659276641947e-05, + "loss": 0.9103, + "step": 24481 + }, + { + "epoch": 5.222269624573379, + "grad_norm": 0.11314381433486344, + "learning_rate": 2.9430065053348343e-05, + "loss": 0.916, + "step": 24482 + }, + { + "epoch": 5.222482935153583, + "grad_norm": 0.1308520980134897, + "learning_rate": 2.9423471402254366e-05, + "loss": 0.8871, + "step": 24483 + }, + { + "epoch": 5.222696245733788, + "grad_norm": 0.1312752307953991, + "learning_rate": 2.941687832343469e-05, + "loss": 0.8899, + "step": 24484 + }, + { + "epoch": 5.222909556313994, + "grad_norm": 0.10659953400494039, + "learning_rate": 2.9410285816963885e-05, + "loss": 0.8793, + "step": 24485 + }, + { + "epoch": 5.223122866894198, + "grad_norm": 0.15305971098955265, + "learning_rate": 2.9403693882916548e-05, + "loss": 0.8833, + "step": 24486 + }, + { + "epoch": 5.223336177474403, + "grad_norm": 0.11152402391999355, + "learning_rate": 2.9397102521367267e-05, + "loss": 0.9026, + "step": 24487 + }, + { + "epoch": 5.223549488054608, + "grad_norm": 0.12855458552072221, + "learning_rate": 2.9390511732390615e-05, + "loss": 0.9213, + "step": 24488 + }, + { + "epoch": 5.223762798634812, + "grad_norm": 0.14842345664797252, + "learning_rate": 2.9383921516061185e-05, + "loss": 0.89, + "step": 24489 + }, + { + "epoch": 5.223976109215017, + "grad_norm": 0.12481385158167925, + "learning_rate": 2.9377331872453526e-05, + "loss": 0.9062, + "step": 24490 + }, + { + "epoch": 5.224189419795222, + "grad_norm": 0.1439207684750954, + "learning_rate": 2.937074280164218e-05, + "loss": 0.8729, + "step": 24491 + }, + { + "epoch": 5.224402730375426, + "grad_norm": 0.12244811579002284, + "learning_rate": 2.9364154303701763e-05, + "loss": 0.9005, + "step": 24492 + }, + { + "epoch": 5.224616040955631, + "grad_norm": 0.150526729191903, + "learning_rate": 2.9357566378706796e-05, + "loss": 0.9095, + "step": 24493 + }, + { + "epoch": 5.224829351535837, + "grad_norm": 0.1170014334699624, + "learning_rate": 2.9350979026731818e-05, + "loss": 0.8706, + "step": 24494 + }, + { + "epoch": 5.225042662116041, + "grad_norm": 0.14242079969625393, + "learning_rate": 2.934439224785132e-05, + "loss": 0.8662, + "step": 24495 + }, + { + "epoch": 5.225255972696246, + "grad_norm": 0.1302125659141192, + "learning_rate": 2.9337806042139925e-05, + "loss": 0.8977, + "step": 24496 + }, + { + "epoch": 5.225469283276451, + "grad_norm": 0.16256461294736368, + "learning_rate": 2.9331220409672096e-05, + "loss": 0.8887, + "step": 24497 + }, + { + "epoch": 5.225682593856655, + "grad_norm": 0.13067208573188058, + "learning_rate": 2.932463535052236e-05, + "loss": 0.8841, + "step": 24498 + }, + { + "epoch": 5.22589590443686, + "grad_norm": 0.1453259474083218, + "learning_rate": 2.931805086476523e-05, + "loss": 0.8625, + "step": 24499 + }, + { + "epoch": 5.226109215017065, + "grad_norm": 0.13949916410773522, + "learning_rate": 2.9311466952475202e-05, + "loss": 0.8765, + "step": 24500 + }, + { + "epoch": 5.226322525597269, + "grad_norm": 0.1489267470549169, + "learning_rate": 2.9304883613726783e-05, + "loss": 0.8972, + "step": 24501 + }, + { + "epoch": 5.226535836177474, + "grad_norm": 0.14344166113135, + "learning_rate": 2.929830084859445e-05, + "loss": 0.8866, + "step": 24502 + }, + { + "epoch": 5.226749146757679, + "grad_norm": 0.13724379935361775, + "learning_rate": 2.929171865715268e-05, + "loss": 0.8999, + "step": 24503 + }, + { + "epoch": 5.226962457337884, + "grad_norm": 0.14501301887449344, + "learning_rate": 2.9285137039475987e-05, + "loss": 0.8728, + "step": 24504 + }, + { + "epoch": 5.227175767918089, + "grad_norm": 0.14214262422618248, + "learning_rate": 2.9278555995638825e-05, + "loss": 0.9042, + "step": 24505 + }, + { + "epoch": 5.227389078498294, + "grad_norm": 0.12699090275729702, + "learning_rate": 2.9271975525715653e-05, + "loss": 0.8898, + "step": 24506 + }, + { + "epoch": 5.227602389078498, + "grad_norm": 0.1352173861405247, + "learning_rate": 2.92653956297809e-05, + "loss": 0.8996, + "step": 24507 + }, + { + "epoch": 5.227815699658703, + "grad_norm": 0.13411549341096413, + "learning_rate": 2.9258816307909082e-05, + "loss": 0.903, + "step": 24508 + }, + { + "epoch": 5.228029010238908, + "grad_norm": 0.12591276364435974, + "learning_rate": 2.9252237560174613e-05, + "loss": 0.8722, + "step": 24509 + }, + { + "epoch": 5.228242320819112, + "grad_norm": 0.14373632245226636, + "learning_rate": 2.924565938665195e-05, + "loss": 0.8792, + "step": 24510 + }, + { + "epoch": 5.228455631399317, + "grad_norm": 0.15311040078523527, + "learning_rate": 2.9239081787415452e-05, + "loss": 0.9158, + "step": 24511 + }, + { + "epoch": 5.228668941979522, + "grad_norm": 0.13660410376854767, + "learning_rate": 2.923250476253962e-05, + "loss": 0.8812, + "step": 24512 + }, + { + "epoch": 5.228882252559727, + "grad_norm": 0.14481751488388261, + "learning_rate": 2.9225928312098858e-05, + "loss": 0.8654, + "step": 24513 + }, + { + "epoch": 5.229095563139932, + "grad_norm": 0.1296767158230525, + "learning_rate": 2.9219352436167577e-05, + "loss": 0.8808, + "step": 24514 + }, + { + "epoch": 5.229308873720137, + "grad_norm": 0.13379331784047613, + "learning_rate": 2.921277713482014e-05, + "loss": 0.9061, + "step": 24515 + }, + { + "epoch": 5.229522184300341, + "grad_norm": 0.14829238324212368, + "learning_rate": 2.920620240813102e-05, + "loss": 0.8835, + "step": 24516 + }, + { + "epoch": 5.229735494880546, + "grad_norm": 0.13520522699766968, + "learning_rate": 2.9199628256174565e-05, + "loss": 0.9261, + "step": 24517 + }, + { + "epoch": 5.229948805460751, + "grad_norm": 0.1239927764117236, + "learning_rate": 2.9193054679025176e-05, + "loss": 0.8848, + "step": 24518 + }, + { + "epoch": 5.2301621160409555, + "grad_norm": 0.12690405555397816, + "learning_rate": 2.918648167675719e-05, + "loss": 0.9071, + "step": 24519 + }, + { + "epoch": 5.23037542662116, + "grad_norm": 0.12826928591709436, + "learning_rate": 2.9179909249445056e-05, + "loss": 0.8997, + "step": 24520 + }, + { + "epoch": 5.230588737201365, + "grad_norm": 0.13094292626945636, + "learning_rate": 2.9173337397163126e-05, + "loss": 0.902, + "step": 24521 + }, + { + "epoch": 5.2308020477815695, + "grad_norm": 0.13249850379264064, + "learning_rate": 2.9166766119985717e-05, + "loss": 0.918, + "step": 24522 + }, + { + "epoch": 5.231015358361775, + "grad_norm": 0.1453623684499522, + "learning_rate": 2.9160195417987177e-05, + "loss": 0.9021, + "step": 24523 + }, + { + "epoch": 5.23122866894198, + "grad_norm": 0.15627125040613393, + "learning_rate": 2.91536252912419e-05, + "loss": 0.904, + "step": 24524 + }, + { + "epoch": 5.2314419795221845, + "grad_norm": 0.13180090309251025, + "learning_rate": 2.9147055739824215e-05, + "loss": 0.897, + "step": 24525 + }, + { + "epoch": 5.231655290102389, + "grad_norm": 0.14513003743045494, + "learning_rate": 2.9140486763808452e-05, + "loss": 0.8719, + "step": 24526 + }, + { + "epoch": 5.231868600682594, + "grad_norm": 0.1308317611943283, + "learning_rate": 2.9133918363268906e-05, + "loss": 0.8567, + "step": 24527 + }, + { + "epoch": 5.2320819112627985, + "grad_norm": 0.14256953190294863, + "learning_rate": 2.9127350538279953e-05, + "loss": 0.8925, + "step": 24528 + }, + { + "epoch": 5.232295221843003, + "grad_norm": 0.12322684071873156, + "learning_rate": 2.9120783288915892e-05, + "loss": 0.893, + "step": 24529 + }, + { + "epoch": 5.232508532423208, + "grad_norm": 0.14027692612073778, + "learning_rate": 2.9114216615251014e-05, + "loss": 0.8999, + "step": 24530 + }, + { + "epoch": 5.232721843003413, + "grad_norm": 0.11588365091868752, + "learning_rate": 2.9107650517359612e-05, + "loss": 0.8691, + "step": 24531 + }, + { + "epoch": 5.232935153583618, + "grad_norm": 0.14006876086853653, + "learning_rate": 2.910108499531605e-05, + "loss": 0.8646, + "step": 24532 + }, + { + "epoch": 5.233148464163823, + "grad_norm": 0.11292165637824536, + "learning_rate": 2.9094520049194544e-05, + "loss": 0.9085, + "step": 24533 + }, + { + "epoch": 5.2333617747440275, + "grad_norm": 0.14440192304332858, + "learning_rate": 2.90879556790694e-05, + "loss": 0.8487, + "step": 24534 + }, + { + "epoch": 5.233575085324232, + "grad_norm": 0.11948105649072202, + "learning_rate": 2.908139188501486e-05, + "loss": 0.8879, + "step": 24535 + }, + { + "epoch": 5.233788395904437, + "grad_norm": 0.12402789514897296, + "learning_rate": 2.9074828667105265e-05, + "loss": 0.9253, + "step": 24536 + }, + { + "epoch": 5.234001706484642, + "grad_norm": 0.1337816156451083, + "learning_rate": 2.9068266025414832e-05, + "loss": 0.8736, + "step": 24537 + }, + { + "epoch": 5.234215017064846, + "grad_norm": 0.12655384647818563, + "learning_rate": 2.906170396001783e-05, + "loss": 0.8826, + "step": 24538 + }, + { + "epoch": 5.234428327645051, + "grad_norm": 0.13890024652157928, + "learning_rate": 2.9055142470988476e-05, + "loss": 0.907, + "step": 24539 + }, + { + "epoch": 5.234641638225256, + "grad_norm": 0.12421145481852001, + "learning_rate": 2.9048581558401063e-05, + "loss": 0.9211, + "step": 24540 + }, + { + "epoch": 5.234854948805461, + "grad_norm": 0.11740374573511797, + "learning_rate": 2.904202122232982e-05, + "loss": 0.841, + "step": 24541 + }, + { + "epoch": 5.235068259385666, + "grad_norm": 0.1392118510475982, + "learning_rate": 2.903546146284895e-05, + "loss": 0.8882, + "step": 24542 + }, + { + "epoch": 5.235281569965871, + "grad_norm": 0.11210703980441297, + "learning_rate": 2.9028902280032675e-05, + "loss": 0.895, + "step": 24543 + }, + { + "epoch": 5.235494880546075, + "grad_norm": 0.13644241446211583, + "learning_rate": 2.9022343673955276e-05, + "loss": 0.9072, + "step": 24544 + }, + { + "epoch": 5.23570819112628, + "grad_norm": 0.14033550705843012, + "learning_rate": 2.9015785644690884e-05, + "loss": 0.8661, + "step": 24545 + }, + { + "epoch": 5.235921501706485, + "grad_norm": 0.1286606119095709, + "learning_rate": 2.900922819231374e-05, + "loss": 0.8957, + "step": 24546 + }, + { + "epoch": 5.236134812286689, + "grad_norm": 0.1380443548467628, + "learning_rate": 2.900267131689801e-05, + "loss": 0.874, + "step": 24547 + }, + { + "epoch": 5.236348122866894, + "grad_norm": 0.15907047832006088, + "learning_rate": 2.8996115018517942e-05, + "loss": 0.9026, + "step": 24548 + }, + { + "epoch": 5.236561433447099, + "grad_norm": 0.144137486614925, + "learning_rate": 2.8989559297247685e-05, + "loss": 0.8669, + "step": 24549 + }, + { + "epoch": 5.236774744027303, + "grad_norm": 0.1472996877904525, + "learning_rate": 2.8983004153161424e-05, + "loss": 0.9023, + "step": 24550 + }, + { + "epoch": 5.236988054607509, + "grad_norm": 0.15308938235982159, + "learning_rate": 2.897644958633331e-05, + "loss": 0.8793, + "step": 24551 + }, + { + "epoch": 5.237201365187714, + "grad_norm": 0.14130274371198162, + "learning_rate": 2.8969895596837543e-05, + "loss": 0.8842, + "step": 24552 + }, + { + "epoch": 5.237414675767918, + "grad_norm": 0.13554032815730715, + "learning_rate": 2.8963342184748273e-05, + "loss": 0.8771, + "step": 24553 + }, + { + "epoch": 5.237627986348123, + "grad_norm": 0.13779549405724284, + "learning_rate": 2.8956789350139646e-05, + "loss": 0.8942, + "step": 24554 + }, + { + "epoch": 5.237841296928328, + "grad_norm": 0.1356214441175826, + "learning_rate": 2.89502370930858e-05, + "loss": 0.8892, + "step": 24555 + }, + { + "epoch": 5.238054607508532, + "grad_norm": 0.12126826817857206, + "learning_rate": 2.8943685413660884e-05, + "loss": 0.8738, + "step": 24556 + }, + { + "epoch": 5.238267918088737, + "grad_norm": 0.13891951785978002, + "learning_rate": 2.893713431193902e-05, + "loss": 0.8589, + "step": 24557 + }, + { + "epoch": 5.238481228668942, + "grad_norm": 0.1292783752870521, + "learning_rate": 2.8930583787994345e-05, + "loss": 0.8811, + "step": 24558 + }, + { + "epoch": 5.238694539249146, + "grad_norm": 0.1337376266691112, + "learning_rate": 2.892403384190095e-05, + "loss": 0.8979, + "step": 24559 + }, + { + "epoch": 5.238907849829351, + "grad_norm": 0.13146893072449592, + "learning_rate": 2.8917484473732993e-05, + "loss": 0.8647, + "step": 24560 + }, + { + "epoch": 5.239121160409557, + "grad_norm": 0.1195297693331969, + "learning_rate": 2.891093568356455e-05, + "loss": 0.8712, + "step": 24561 + }, + { + "epoch": 5.239334470989761, + "grad_norm": 0.13647578486905995, + "learning_rate": 2.8904387471469745e-05, + "loss": 0.8879, + "step": 24562 + }, + { + "epoch": 5.239547781569966, + "grad_norm": 0.12546079100544028, + "learning_rate": 2.8897839837522617e-05, + "loss": 0.8931, + "step": 24563 + }, + { + "epoch": 5.239761092150171, + "grad_norm": 0.12139191901897091, + "learning_rate": 2.8891292781797323e-05, + "loss": 0.9164, + "step": 24564 + }, + { + "epoch": 5.239974402730375, + "grad_norm": 0.138652062368148, + "learning_rate": 2.8884746304367914e-05, + "loss": 0.9097, + "step": 24565 + }, + { + "epoch": 5.24018771331058, + "grad_norm": 0.11207329434394898, + "learning_rate": 2.887820040530846e-05, + "loss": 0.8995, + "step": 24566 + }, + { + "epoch": 5.240401023890785, + "grad_norm": 0.15310912489162193, + "learning_rate": 2.887165508469302e-05, + "loss": 0.8773, + "step": 24567 + }, + { + "epoch": 5.2406143344709895, + "grad_norm": 0.12325858726390351, + "learning_rate": 2.8865110342595665e-05, + "loss": 0.8839, + "step": 24568 + }, + { + "epoch": 5.240827645051194, + "grad_norm": 0.1304742018579671, + "learning_rate": 2.8858566179090455e-05, + "loss": 0.8739, + "step": 24569 + }, + { + "epoch": 5.2410409556314, + "grad_norm": 0.12033691120838956, + "learning_rate": 2.8852022594251425e-05, + "loss": 0.9128, + "step": 24570 + }, + { + "epoch": 5.241254266211604, + "grad_norm": 0.12311800646010539, + "learning_rate": 2.884547958815259e-05, + "loss": 0.886, + "step": 24571 + }, + { + "epoch": 5.241467576791809, + "grad_norm": 0.1257214732630617, + "learning_rate": 2.883893716086804e-05, + "loss": 0.892, + "step": 24572 + }, + { + "epoch": 5.241680887372014, + "grad_norm": 0.12331955301697917, + "learning_rate": 2.8832395312471777e-05, + "loss": 0.8564, + "step": 24573 + }, + { + "epoch": 5.2418941979522184, + "grad_norm": 0.12556090182688587, + "learning_rate": 2.8825854043037824e-05, + "loss": 0.8816, + "step": 24574 + }, + { + "epoch": 5.242107508532423, + "grad_norm": 0.11062446683410289, + "learning_rate": 2.8819313352640164e-05, + "loss": 0.8503, + "step": 24575 + }, + { + "epoch": 5.242320819112628, + "grad_norm": 0.12597065988989728, + "learning_rate": 2.8812773241352866e-05, + "loss": 0.8877, + "step": 24576 + }, + { + "epoch": 5.2425341296928325, + "grad_norm": 0.12360353601103663, + "learning_rate": 2.8806233709249922e-05, + "loss": 0.8821, + "step": 24577 + }, + { + "epoch": 5.242747440273037, + "grad_norm": 0.10905489984934269, + "learning_rate": 2.8799694756405255e-05, + "loss": 0.8899, + "step": 24578 + }, + { + "epoch": 5.242960750853243, + "grad_norm": 0.13458948930272682, + "learning_rate": 2.879315638289292e-05, + "loss": 0.9179, + "step": 24579 + }, + { + "epoch": 5.243174061433447, + "grad_norm": 0.11400738782704425, + "learning_rate": 2.8786618588786895e-05, + "loss": 0.8892, + "step": 24580 + }, + { + "epoch": 5.243387372013652, + "grad_norm": 0.11894652084360219, + "learning_rate": 2.8780081374161128e-05, + "loss": 0.8984, + "step": 24581 + }, + { + "epoch": 5.243600682593857, + "grad_norm": 0.11326612229032493, + "learning_rate": 2.8773544739089615e-05, + "loss": 0.8906, + "step": 24582 + }, + { + "epoch": 5.2438139931740615, + "grad_norm": 0.13125977646129128, + "learning_rate": 2.876700868364627e-05, + "loss": 0.8879, + "step": 24583 + }, + { + "epoch": 5.244027303754266, + "grad_norm": 0.11515213279556878, + "learning_rate": 2.8760473207905114e-05, + "loss": 0.8826, + "step": 24584 + }, + { + "epoch": 5.244240614334471, + "grad_norm": 0.14110164094091424, + "learning_rate": 2.8753938311940062e-05, + "loss": 0.9006, + "step": 24585 + }, + { + "epoch": 5.2444539249146755, + "grad_norm": 0.1325813692469219, + "learning_rate": 2.874740399582507e-05, + "loss": 0.9, + "step": 24586 + }, + { + "epoch": 5.24466723549488, + "grad_norm": 0.14637146012285307, + "learning_rate": 2.8740870259634026e-05, + "loss": 0.901, + "step": 24587 + }, + { + "epoch": 5.244880546075085, + "grad_norm": 0.1471137073878006, + "learning_rate": 2.8734337103440953e-05, + "loss": 0.9039, + "step": 24588 + }, + { + "epoch": 5.2450938566552905, + "grad_norm": 0.1360511758100405, + "learning_rate": 2.872780452731969e-05, + "loss": 0.8942, + "step": 24589 + }, + { + "epoch": 5.245307167235495, + "grad_norm": 0.15950663099988205, + "learning_rate": 2.8721272531344156e-05, + "loss": 0.903, + "step": 24590 + }, + { + "epoch": 5.2455204778157, + "grad_norm": 0.12957132671893043, + "learning_rate": 2.871474111558832e-05, + "loss": 0.8434, + "step": 24591 + }, + { + "epoch": 5.2457337883959045, + "grad_norm": 0.17509342822201565, + "learning_rate": 2.8708210280126042e-05, + "loss": 0.8961, + "step": 24592 + }, + { + "epoch": 5.245947098976109, + "grad_norm": 0.16568646784258917, + "learning_rate": 2.8701680025031228e-05, + "loss": 0.9089, + "step": 24593 + }, + { + "epoch": 5.246160409556314, + "grad_norm": 0.14166286585875815, + "learning_rate": 2.8695150350377773e-05, + "loss": 0.9128, + "step": 24594 + }, + { + "epoch": 5.246373720136519, + "grad_norm": 0.14619598459387445, + "learning_rate": 2.8688621256239524e-05, + "loss": 0.8979, + "step": 24595 + }, + { + "epoch": 5.246587030716723, + "grad_norm": 0.393366252255756, + "learning_rate": 2.8682092742690417e-05, + "loss": 0.8693, + "step": 24596 + }, + { + "epoch": 5.246800341296928, + "grad_norm": 0.14778209980598828, + "learning_rate": 2.8675564809804296e-05, + "loss": 0.8689, + "step": 24597 + }, + { + "epoch": 5.2470136518771335, + "grad_norm": 0.1521330733303344, + "learning_rate": 2.866903745765502e-05, + "loss": 0.895, + "step": 24598 + }, + { + "epoch": 5.247226962457338, + "grad_norm": 0.15307490082565403, + "learning_rate": 2.866251068631641e-05, + "loss": 0.8775, + "step": 24599 + }, + { + "epoch": 5.247440273037543, + "grad_norm": 0.16379499548178636, + "learning_rate": 2.8655984495862426e-05, + "loss": 0.9032, + "step": 24600 + }, + { + "epoch": 5.247653583617748, + "grad_norm": 0.16852317473299755, + "learning_rate": 2.8649458886366805e-05, + "loss": 0.8945, + "step": 24601 + }, + { + "epoch": 5.247866894197952, + "grad_norm": 0.14808430385788163, + "learning_rate": 2.8642933857903395e-05, + "loss": 0.8939, + "step": 24602 + }, + { + "epoch": 5.248080204778157, + "grad_norm": 0.20596563891932135, + "learning_rate": 2.863640941054608e-05, + "loss": 0.9029, + "step": 24603 + }, + { + "epoch": 5.248293515358362, + "grad_norm": 0.15301850764320965, + "learning_rate": 2.8629885544368662e-05, + "loss": 0.8597, + "step": 24604 + }, + { + "epoch": 5.248506825938566, + "grad_norm": 0.15636889148848324, + "learning_rate": 2.8623362259444945e-05, + "loss": 0.899, + "step": 24605 + }, + { + "epoch": 5.248720136518771, + "grad_norm": 0.1457527298674803, + "learning_rate": 2.8616839555848756e-05, + "loss": 0.9045, + "step": 24606 + }, + { + "epoch": 5.248933447098976, + "grad_norm": 0.18675404081489375, + "learning_rate": 2.861031743365386e-05, + "loss": 0.9031, + "step": 24607 + }, + { + "epoch": 5.249146757679181, + "grad_norm": 0.1442307181850334, + "learning_rate": 2.860379589293411e-05, + "loss": 0.8655, + "step": 24608 + }, + { + "epoch": 5.249360068259386, + "grad_norm": 0.18378195627121216, + "learning_rate": 2.859727493376328e-05, + "loss": 0.88, + "step": 24609 + }, + { + "epoch": 5.249573378839591, + "grad_norm": 0.1382720944805368, + "learning_rate": 2.8590754556215147e-05, + "loss": 0.8841, + "step": 24610 + }, + { + "epoch": 5.249786689419795, + "grad_norm": 0.16182357823219856, + "learning_rate": 2.8584234760363485e-05, + "loss": 0.8657, + "step": 24611 + }, + { + "epoch": 5.25, + "grad_norm": 0.14108140144904094, + "learning_rate": 2.8577715546282082e-05, + "loss": 0.9087, + "step": 24612 + }, + { + "epoch": 5.250213310580205, + "grad_norm": 0.16979402731625204, + "learning_rate": 2.8571196914044682e-05, + "loss": 0.8906, + "step": 24613 + }, + { + "epoch": 5.250426621160409, + "grad_norm": 0.15095798163451457, + "learning_rate": 2.856467886372503e-05, + "loss": 0.8936, + "step": 24614 + }, + { + "epoch": 5.250639931740614, + "grad_norm": 0.15893711214689368, + "learning_rate": 2.855816139539694e-05, + "loss": 0.9111, + "step": 24615 + }, + { + "epoch": 5.250853242320819, + "grad_norm": 0.1527160596611221, + "learning_rate": 2.8551644509134103e-05, + "loss": 0.8839, + "step": 24616 + }, + { + "epoch": 5.251066552901024, + "grad_norm": 0.132261197728261, + "learning_rate": 2.854512820501028e-05, + "loss": 0.8836, + "step": 24617 + }, + { + "epoch": 5.251279863481229, + "grad_norm": 0.1398219091960873, + "learning_rate": 2.8538612483099198e-05, + "loss": 0.8824, + "step": 24618 + }, + { + "epoch": 5.251493174061434, + "grad_norm": 0.13021850770939616, + "learning_rate": 2.8532097343474556e-05, + "loss": 0.9095, + "step": 24619 + }, + { + "epoch": 5.251706484641638, + "grad_norm": 0.14413831467782606, + "learning_rate": 2.8525582786210128e-05, + "loss": 0.8865, + "step": 24620 + }, + { + "epoch": 5.251919795221843, + "grad_norm": 0.1324642938194843, + "learning_rate": 2.8519068811379588e-05, + "loss": 0.8671, + "step": 24621 + }, + { + "epoch": 5.252133105802048, + "grad_norm": 0.13845050569739406, + "learning_rate": 2.8512555419056654e-05, + "loss": 0.8958, + "step": 24622 + }, + { + "epoch": 5.252346416382252, + "grad_norm": 0.14294946216979637, + "learning_rate": 2.850604260931502e-05, + "loss": 0.8885, + "step": 24623 + }, + { + "epoch": 5.252559726962457, + "grad_norm": 0.12564539649006645, + "learning_rate": 2.8499530382228387e-05, + "loss": 0.908, + "step": 24624 + }, + { + "epoch": 5.252773037542662, + "grad_norm": 0.14293279337510637, + "learning_rate": 2.8493018737870427e-05, + "loss": 0.9116, + "step": 24625 + }, + { + "epoch": 5.252986348122867, + "grad_norm": 0.13282806911952658, + "learning_rate": 2.8486507676314797e-05, + "loss": 0.8944, + "step": 24626 + }, + { + "epoch": 5.253199658703072, + "grad_norm": 0.12340238363204999, + "learning_rate": 2.8479997197635233e-05, + "loss": 0.8925, + "step": 24627 + }, + { + "epoch": 5.253412969283277, + "grad_norm": 0.11068329859220664, + "learning_rate": 2.847348730190536e-05, + "loss": 0.8523, + "step": 24628 + }, + { + "epoch": 5.253626279863481, + "grad_norm": 0.12254938923051589, + "learning_rate": 2.846697798919885e-05, + "loss": 0.882, + "step": 24629 + }, + { + "epoch": 5.253839590443686, + "grad_norm": 0.13321642136964296, + "learning_rate": 2.846046925958936e-05, + "loss": 0.9028, + "step": 24630 + }, + { + "epoch": 5.254052901023891, + "grad_norm": 0.12028188537288682, + "learning_rate": 2.8453961113150482e-05, + "loss": 0.8842, + "step": 24631 + }, + { + "epoch": 5.2542662116040955, + "grad_norm": 0.13463159935639432, + "learning_rate": 2.8447453549955938e-05, + "loss": 0.8894, + "step": 24632 + }, + { + "epoch": 5.2544795221843, + "grad_norm": 0.11015682735688943, + "learning_rate": 2.8440946570079347e-05, + "loss": 0.8629, + "step": 24633 + }, + { + "epoch": 5.254692832764505, + "grad_norm": 0.12767434102952427, + "learning_rate": 2.843444017359427e-05, + "loss": 0.9078, + "step": 24634 + }, + { + "epoch": 5.2549061433447095, + "grad_norm": 0.13883456912744288, + "learning_rate": 2.8427934360574386e-05, + "loss": 0.8898, + "step": 24635 + }, + { + "epoch": 5.255119453924915, + "grad_norm": 0.25828259919454655, + "learning_rate": 2.8421429131093288e-05, + "loss": 0.8662, + "step": 24636 + }, + { + "epoch": 5.25533276450512, + "grad_norm": 0.13881189484686457, + "learning_rate": 2.84149244852246e-05, + "loss": 0.8779, + "step": 24637 + }, + { + "epoch": 5.2555460750853245, + "grad_norm": 0.12241951767997174, + "learning_rate": 2.8408420423041873e-05, + "loss": 0.9218, + "step": 24638 + }, + { + "epoch": 5.255759385665529, + "grad_norm": 0.15356107571290273, + "learning_rate": 2.8401916944618766e-05, + "loss": 0.8947, + "step": 24639 + }, + { + "epoch": 5.255972696245734, + "grad_norm": 0.13615627799435906, + "learning_rate": 2.8395414050028836e-05, + "loss": 0.8965, + "step": 24640 + }, + { + "epoch": 5.2561860068259385, + "grad_norm": 0.14304575873435169, + "learning_rate": 2.838891173934567e-05, + "loss": 0.8915, + "step": 24641 + }, + { + "epoch": 5.256399317406143, + "grad_norm": 0.13419701184966498, + "learning_rate": 2.8382410012642822e-05, + "loss": 0.8634, + "step": 24642 + }, + { + "epoch": 5.256612627986348, + "grad_norm": 0.1262074116450959, + "learning_rate": 2.8375908869993858e-05, + "loss": 0.8777, + "step": 24643 + }, + { + "epoch": 5.256825938566553, + "grad_norm": 0.17217187747420562, + "learning_rate": 2.8369408311472404e-05, + "loss": 0.8745, + "step": 24644 + }, + { + "epoch": 5.257039249146757, + "grad_norm": 0.12099369596571391, + "learning_rate": 2.8362908337151938e-05, + "loss": 0.8688, + "step": 24645 + }, + { + "epoch": 5.257252559726963, + "grad_norm": 0.1484400554304898, + "learning_rate": 2.8356408947106007e-05, + "loss": 0.8927, + "step": 24646 + }, + { + "epoch": 5.2574658703071675, + "grad_norm": 0.13138930065705298, + "learning_rate": 2.83499101414082e-05, + "loss": 0.8918, + "step": 24647 + }, + { + "epoch": 5.257679180887372, + "grad_norm": 0.14914536674151357, + "learning_rate": 2.834341192013203e-05, + "loss": 0.8858, + "step": 24648 + }, + { + "epoch": 5.257892491467577, + "grad_norm": 0.12090386296512128, + "learning_rate": 2.8336914283351018e-05, + "loss": 0.8942, + "step": 24649 + }, + { + "epoch": 5.2581058020477816, + "grad_norm": 0.12809142276510443, + "learning_rate": 2.8330417231138675e-05, + "loss": 0.8967, + "step": 24650 + }, + { + "epoch": 5.258319112627986, + "grad_norm": 0.12236601635323953, + "learning_rate": 2.832392076356855e-05, + "loss": 0.9252, + "step": 24651 + }, + { + "epoch": 5.258532423208191, + "grad_norm": 0.11193657058633141, + "learning_rate": 2.831742488071413e-05, + "loss": 0.9073, + "step": 24652 + }, + { + "epoch": 5.258745733788396, + "grad_norm": 0.14670035541493257, + "learning_rate": 2.8310929582648916e-05, + "loss": 0.8756, + "step": 24653 + }, + { + "epoch": 5.2589590443686, + "grad_norm": 0.12594761441210853, + "learning_rate": 2.83044348694464e-05, + "loss": 0.8888, + "step": 24654 + }, + { + "epoch": 5.259172354948806, + "grad_norm": 0.12197948232074363, + "learning_rate": 2.8297940741180085e-05, + "loss": 0.8794, + "step": 24655 + }, + { + "epoch": 5.2593856655290105, + "grad_norm": 0.11524614553244812, + "learning_rate": 2.8291447197923428e-05, + "loss": 0.8908, + "step": 24656 + }, + { + "epoch": 5.259598976109215, + "grad_norm": 0.12896747856635607, + "learning_rate": 2.8284954239749923e-05, + "loss": 0.8912, + "step": 24657 + }, + { + "epoch": 5.25981228668942, + "grad_norm": 0.11623043152113782, + "learning_rate": 2.8278461866733003e-05, + "loss": 0.8678, + "step": 24658 + }, + { + "epoch": 5.260025597269625, + "grad_norm": 0.13649372100895504, + "learning_rate": 2.8271970078946175e-05, + "loss": 0.8729, + "step": 24659 + }, + { + "epoch": 5.260238907849829, + "grad_norm": 0.1399016605710061, + "learning_rate": 2.826547887646289e-05, + "loss": 0.8989, + "step": 24660 + }, + { + "epoch": 5.260452218430034, + "grad_norm": 0.1327259427822832, + "learning_rate": 2.8258988259356574e-05, + "loss": 0.8835, + "step": 24661 + }, + { + "epoch": 5.260665529010239, + "grad_norm": 0.13502194270008275, + "learning_rate": 2.825249822770065e-05, + "loss": 0.8688, + "step": 24662 + }, + { + "epoch": 5.260878839590443, + "grad_norm": 0.14387791767612987, + "learning_rate": 2.8246008781568605e-05, + "loss": 0.872, + "step": 24663 + }, + { + "epoch": 5.261092150170649, + "grad_norm": 0.1268553261160347, + "learning_rate": 2.8239519921033837e-05, + "loss": 0.8756, + "step": 24664 + }, + { + "epoch": 5.261305460750854, + "grad_norm": 0.14474694910985977, + "learning_rate": 2.823303164616978e-05, + "loss": 0.8831, + "step": 24665 + }, + { + "epoch": 5.261518771331058, + "grad_norm": 0.1366439018072001, + "learning_rate": 2.8226543957049838e-05, + "loss": 0.917, + "step": 24666 + }, + { + "epoch": 5.261732081911263, + "grad_norm": 0.132134771677639, + "learning_rate": 2.822005685374742e-05, + "loss": 0.8699, + "step": 24667 + }, + { + "epoch": 5.261945392491468, + "grad_norm": 0.11758714224834806, + "learning_rate": 2.8213570336335928e-05, + "loss": 0.8958, + "step": 24668 + }, + { + "epoch": 5.262158703071672, + "grad_norm": 0.12625491115299084, + "learning_rate": 2.8207084404888765e-05, + "loss": 0.8917, + "step": 24669 + }, + { + "epoch": 5.262372013651877, + "grad_norm": 0.1210593305418925, + "learning_rate": 2.820059905947928e-05, + "loss": 0.8703, + "step": 24670 + }, + { + "epoch": 5.262585324232082, + "grad_norm": 0.11160014708471563, + "learning_rate": 2.819411430018092e-05, + "loss": 0.8998, + "step": 24671 + }, + { + "epoch": 5.262798634812286, + "grad_norm": 0.12852825482142893, + "learning_rate": 2.818763012706703e-05, + "loss": 0.9068, + "step": 24672 + }, + { + "epoch": 5.263011945392491, + "grad_norm": 0.11267796815746343, + "learning_rate": 2.8181146540210966e-05, + "loss": 0.8987, + "step": 24673 + }, + { + "epoch": 5.263225255972697, + "grad_norm": 0.12299119535670792, + "learning_rate": 2.8174663539686116e-05, + "loss": 0.9011, + "step": 24674 + }, + { + "epoch": 5.263438566552901, + "grad_norm": 0.1380667278162794, + "learning_rate": 2.816818112556578e-05, + "loss": 0.9121, + "step": 24675 + }, + { + "epoch": 5.263651877133106, + "grad_norm": 0.13091245304363078, + "learning_rate": 2.816169929792338e-05, + "loss": 0.8834, + "step": 24676 + }, + { + "epoch": 5.263865187713311, + "grad_norm": 0.12864925151761394, + "learning_rate": 2.815521805683222e-05, + "loss": 0.8587, + "step": 24677 + }, + { + "epoch": 5.264078498293515, + "grad_norm": 0.12376407176477139, + "learning_rate": 2.8148737402365642e-05, + "loss": 0.8665, + "step": 24678 + }, + { + "epoch": 5.26429180887372, + "grad_norm": 0.11637110871311696, + "learning_rate": 2.8142257334596973e-05, + "loss": 0.8747, + "step": 24679 + }, + { + "epoch": 5.264505119453925, + "grad_norm": 0.12928350221583632, + "learning_rate": 2.813577785359953e-05, + "loss": 0.913, + "step": 24680 + }, + { + "epoch": 5.264718430034129, + "grad_norm": 0.10894124247684617, + "learning_rate": 2.812929895944663e-05, + "loss": 0.8815, + "step": 24681 + }, + { + "epoch": 5.264931740614334, + "grad_norm": 0.12428346700231883, + "learning_rate": 2.8122820652211564e-05, + "loss": 0.9072, + "step": 24682 + }, + { + "epoch": 5.265145051194539, + "grad_norm": 0.12768374325840628, + "learning_rate": 2.811634293196769e-05, + "loss": 0.9071, + "step": 24683 + }, + { + "epoch": 5.265358361774744, + "grad_norm": 0.10445626099710018, + "learning_rate": 2.810986579878826e-05, + "loss": 0.8617, + "step": 24684 + }, + { + "epoch": 5.265571672354949, + "grad_norm": 0.14245701010041478, + "learning_rate": 2.810338925274657e-05, + "loss": 0.8787, + "step": 24685 + }, + { + "epoch": 5.265784982935154, + "grad_norm": 0.12266399401128252, + "learning_rate": 2.8096913293915907e-05, + "loss": 0.8852, + "step": 24686 + }, + { + "epoch": 5.265998293515358, + "grad_norm": 0.12375110078269044, + "learning_rate": 2.809043792236951e-05, + "loss": 0.8988, + "step": 24687 + }, + { + "epoch": 5.266211604095563, + "grad_norm": 0.1446265440714694, + "learning_rate": 2.8083963138180713e-05, + "loss": 0.9059, + "step": 24688 + }, + { + "epoch": 5.266424914675768, + "grad_norm": 0.12010349045276403, + "learning_rate": 2.807748894142277e-05, + "loss": 0.903, + "step": 24689 + }, + { + "epoch": 5.2666382252559725, + "grad_norm": 0.13265990550794324, + "learning_rate": 2.8071015332168863e-05, + "loss": 0.8835, + "step": 24690 + }, + { + "epoch": 5.266851535836177, + "grad_norm": 0.11834364684772943, + "learning_rate": 2.806454231049232e-05, + "loss": 0.9187, + "step": 24691 + }, + { + "epoch": 5.267064846416382, + "grad_norm": 0.1327074454360466, + "learning_rate": 2.8058069876466346e-05, + "loss": 0.8603, + "step": 24692 + }, + { + "epoch": 5.267278156996587, + "grad_norm": 0.14138791460895989, + "learning_rate": 2.8051598030164194e-05, + "loss": 0.8875, + "step": 24693 + }, + { + "epoch": 5.267491467576792, + "grad_norm": 0.12053547399563452, + "learning_rate": 2.8045126771659048e-05, + "loss": 0.847, + "step": 24694 + }, + { + "epoch": 5.267704778156997, + "grad_norm": 0.1424683341910296, + "learning_rate": 2.8038656101024203e-05, + "loss": 0.8768, + "step": 24695 + }, + { + "epoch": 5.2679180887372015, + "grad_norm": 0.11005874606464386, + "learning_rate": 2.8032186018332833e-05, + "loss": 0.9037, + "step": 24696 + }, + { + "epoch": 5.268131399317406, + "grad_norm": 0.12775311846257886, + "learning_rate": 2.8025716523658152e-05, + "loss": 0.876, + "step": 24697 + }, + { + "epoch": 5.268344709897611, + "grad_norm": 0.11404445732957452, + "learning_rate": 2.801924761707336e-05, + "loss": 0.904, + "step": 24698 + }, + { + "epoch": 5.2685580204778155, + "grad_norm": 0.1308089003908279, + "learning_rate": 2.8012779298651637e-05, + "loss": 0.8706, + "step": 24699 + }, + { + "epoch": 5.26877133105802, + "grad_norm": 0.12910681278902655, + "learning_rate": 2.800631156846624e-05, + "loss": 0.9017, + "step": 24700 + }, + { + "epoch": 5.268984641638225, + "grad_norm": 0.1366310621091331, + "learning_rate": 2.799984442659027e-05, + "loss": 0.9193, + "step": 24701 + }, + { + "epoch": 5.2691979522184305, + "grad_norm": 0.13622977849162968, + "learning_rate": 2.799337787309692e-05, + "loss": 0.9187, + "step": 24702 + }, + { + "epoch": 5.269411262798635, + "grad_norm": 0.12345749392726942, + "learning_rate": 2.7986911908059392e-05, + "loss": 0.929, + "step": 24703 + }, + { + "epoch": 5.26962457337884, + "grad_norm": 0.15229401608033818, + "learning_rate": 2.798044653155083e-05, + "loss": 0.8947, + "step": 24704 + }, + { + "epoch": 5.2698378839590445, + "grad_norm": 0.12582653374747638, + "learning_rate": 2.79739817436444e-05, + "loss": 0.9004, + "step": 24705 + }, + { + "epoch": 5.270051194539249, + "grad_norm": 0.1418296633355283, + "learning_rate": 2.7967517544413197e-05, + "loss": 0.9047, + "step": 24706 + }, + { + "epoch": 5.270264505119454, + "grad_norm": 0.1300658129873138, + "learning_rate": 2.7961053933930454e-05, + "loss": 0.9068, + "step": 24707 + }, + { + "epoch": 5.270477815699659, + "grad_norm": 0.1240816155368112, + "learning_rate": 2.795459091226925e-05, + "loss": 0.9061, + "step": 24708 + }, + { + "epoch": 5.270691126279863, + "grad_norm": 0.1353579839071055, + "learning_rate": 2.7948128479502733e-05, + "loss": 0.8981, + "step": 24709 + }, + { + "epoch": 5.270904436860068, + "grad_norm": 0.1257379457455396, + "learning_rate": 2.794166663570401e-05, + "loss": 0.8824, + "step": 24710 + }, + { + "epoch": 5.2711177474402735, + "grad_norm": 0.14326032416765416, + "learning_rate": 2.7935205380946207e-05, + "loss": 0.8834, + "step": 24711 + }, + { + "epoch": 5.271331058020478, + "grad_norm": 0.12624076790089958, + "learning_rate": 2.792874471530243e-05, + "loss": 0.8644, + "step": 24712 + }, + { + "epoch": 5.271544368600683, + "grad_norm": 0.14984118660678042, + "learning_rate": 2.7922284638845786e-05, + "loss": 0.898, + "step": 24713 + }, + { + "epoch": 5.271757679180888, + "grad_norm": 0.14528674496056657, + "learning_rate": 2.791582515164934e-05, + "loss": 0.8959, + "step": 24714 + }, + { + "epoch": 5.271970989761092, + "grad_norm": 0.13304257439073924, + "learning_rate": 2.790936625378623e-05, + "loss": 0.9029, + "step": 24715 + }, + { + "epoch": 5.272184300341297, + "grad_norm": 0.14195660691465753, + "learning_rate": 2.7902907945329524e-05, + "loss": 0.901, + "step": 24716 + }, + { + "epoch": 5.272397610921502, + "grad_norm": 0.13195739239851179, + "learning_rate": 2.7896450226352292e-05, + "loss": 0.8891, + "step": 24717 + }, + { + "epoch": 5.272610921501706, + "grad_norm": 0.13281632760126788, + "learning_rate": 2.7889993096927566e-05, + "loss": 0.9038, + "step": 24718 + }, + { + "epoch": 5.272824232081911, + "grad_norm": 0.12063658774756884, + "learning_rate": 2.7883536557128477e-05, + "loss": 0.8815, + "step": 24719 + }, + { + "epoch": 5.273037542662116, + "grad_norm": 0.1259766943582911, + "learning_rate": 2.7877080607028052e-05, + "loss": 0.9181, + "step": 24720 + }, + { + "epoch": 5.273250853242321, + "grad_norm": 0.1375491088992506, + "learning_rate": 2.787062524669933e-05, + "loss": 0.9173, + "step": 24721 + }, + { + "epoch": 5.273464163822526, + "grad_norm": 0.11512453367442103, + "learning_rate": 2.7864170476215365e-05, + "loss": 0.8806, + "step": 24722 + }, + { + "epoch": 5.273677474402731, + "grad_norm": 0.1435288614693094, + "learning_rate": 2.7857716295649183e-05, + "loss": 0.8784, + "step": 24723 + }, + { + "epoch": 5.273890784982935, + "grad_norm": 0.12160746062084764, + "learning_rate": 2.785126270507382e-05, + "loss": 0.8877, + "step": 24724 + }, + { + "epoch": 5.27410409556314, + "grad_norm": 0.12196994885950428, + "learning_rate": 2.7844809704562296e-05, + "loss": 0.8638, + "step": 24725 + }, + { + "epoch": 5.274317406143345, + "grad_norm": 0.1250596412768324, + "learning_rate": 2.78383572941876e-05, + "loss": 0.9157, + "step": 24726 + }, + { + "epoch": 5.274530716723549, + "grad_norm": 0.13295971064612827, + "learning_rate": 2.7831905474022806e-05, + "loss": 0.8899, + "step": 24727 + }, + { + "epoch": 5.274744027303754, + "grad_norm": 0.14781719891548456, + "learning_rate": 2.782545424414087e-05, + "loss": 0.8796, + "step": 24728 + }, + { + "epoch": 5.274957337883959, + "grad_norm": 0.15457339671857304, + "learning_rate": 2.7819003604614788e-05, + "loss": 0.8638, + "step": 24729 + }, + { + "epoch": 5.275170648464163, + "grad_norm": 0.1375453449613832, + "learning_rate": 2.7812553555517543e-05, + "loss": 0.8557, + "step": 24730 + }, + { + "epoch": 5.275383959044369, + "grad_norm": 0.16497889086882664, + "learning_rate": 2.7806104096922157e-05, + "loss": 0.877, + "step": 24731 + }, + { + "epoch": 5.275597269624574, + "grad_norm": 0.13707603191792833, + "learning_rate": 2.779965522890159e-05, + "loss": 0.8891, + "step": 24732 + }, + { + "epoch": 5.275810580204778, + "grad_norm": 0.11746790420197172, + "learning_rate": 2.779320695152879e-05, + "loss": 0.8664, + "step": 24733 + }, + { + "epoch": 5.276023890784983, + "grad_norm": 0.17942954908701164, + "learning_rate": 2.7786759264876734e-05, + "loss": 0.9302, + "step": 24734 + }, + { + "epoch": 5.276237201365188, + "grad_norm": 0.12273830168046168, + "learning_rate": 2.7780312169018376e-05, + "loss": 0.8744, + "step": 24735 + }, + { + "epoch": 5.276450511945392, + "grad_norm": 0.17060015156247102, + "learning_rate": 2.7773865664026664e-05, + "loss": 0.8998, + "step": 24736 + }, + { + "epoch": 5.276663822525597, + "grad_norm": 0.12197785934556504, + "learning_rate": 2.7767419749974542e-05, + "loss": 0.8861, + "step": 24737 + }, + { + "epoch": 5.276877133105802, + "grad_norm": 0.18259847808297716, + "learning_rate": 2.7760974426934912e-05, + "loss": 0.933, + "step": 24738 + }, + { + "epoch": 5.277090443686006, + "grad_norm": 0.12201450047193545, + "learning_rate": 2.7754529694980765e-05, + "loss": 0.9021, + "step": 24739 + }, + { + "epoch": 5.277303754266212, + "grad_norm": 0.16436167136998955, + "learning_rate": 2.7748085554184992e-05, + "loss": 0.8929, + "step": 24740 + }, + { + "epoch": 5.277517064846417, + "grad_norm": 0.14215295745674245, + "learning_rate": 2.7741642004620514e-05, + "loss": 0.8621, + "step": 24741 + }, + { + "epoch": 5.277730375426621, + "grad_norm": 0.14027867193622104, + "learning_rate": 2.773519904636021e-05, + "loss": 0.9013, + "step": 24742 + }, + { + "epoch": 5.277943686006826, + "grad_norm": 0.1497222511911901, + "learning_rate": 2.7728756679477027e-05, + "loss": 0.8817, + "step": 24743 + }, + { + "epoch": 5.278156996587031, + "grad_norm": 0.11928353429858404, + "learning_rate": 2.7722314904043847e-05, + "loss": 0.8884, + "step": 24744 + }, + { + "epoch": 5.278370307167235, + "grad_norm": 0.14028559576286245, + "learning_rate": 2.7715873720133574e-05, + "loss": 0.8551, + "step": 24745 + }, + { + "epoch": 5.27858361774744, + "grad_norm": 0.13006086290947738, + "learning_rate": 2.770943312781902e-05, + "loss": 0.9288, + "step": 24746 + }, + { + "epoch": 5.278796928327645, + "grad_norm": 0.14945854072358195, + "learning_rate": 2.770299312717314e-05, + "loss": 0.8834, + "step": 24747 + }, + { + "epoch": 5.2790102389078495, + "grad_norm": 0.12068336968452589, + "learning_rate": 2.7696553718268764e-05, + "loss": 0.8638, + "step": 24748 + }, + { + "epoch": 5.279223549488055, + "grad_norm": 0.14444512263847953, + "learning_rate": 2.7690114901178772e-05, + "loss": 0.8883, + "step": 24749 + }, + { + "epoch": 5.27943686006826, + "grad_norm": 0.1343079836254993, + "learning_rate": 2.7683676675975973e-05, + "loss": 0.8773, + "step": 24750 + }, + { + "epoch": 5.279650170648464, + "grad_norm": 0.14222616522941134, + "learning_rate": 2.7677239042733286e-05, + "loss": 0.8903, + "step": 24751 + }, + { + "epoch": 5.279863481228669, + "grad_norm": 0.14608739490999847, + "learning_rate": 2.767080200152352e-05, + "loss": 0.892, + "step": 24752 + }, + { + "epoch": 5.280076791808874, + "grad_norm": 0.1447706781761515, + "learning_rate": 2.766436555241951e-05, + "loss": 0.8504, + "step": 24753 + }, + { + "epoch": 5.2802901023890785, + "grad_norm": 0.12486271351226261, + "learning_rate": 2.7657929695494058e-05, + "loss": 0.9197, + "step": 24754 + }, + { + "epoch": 5.280503412969283, + "grad_norm": 0.13912796238423836, + "learning_rate": 2.7651494430820034e-05, + "loss": 0.8947, + "step": 24755 + }, + { + "epoch": 5.280716723549488, + "grad_norm": 0.14239930780110713, + "learning_rate": 2.7645059758470254e-05, + "loss": 0.8835, + "step": 24756 + }, + { + "epoch": 5.2809300341296925, + "grad_norm": 0.15336502095429777, + "learning_rate": 2.7638625678517486e-05, + "loss": 0.8722, + "step": 24757 + }, + { + "epoch": 5.281143344709897, + "grad_norm": 0.14718202069073677, + "learning_rate": 2.7632192191034516e-05, + "loss": 0.8829, + "step": 24758 + }, + { + "epoch": 5.281356655290103, + "grad_norm": 0.17533434475757143, + "learning_rate": 2.7625759296094197e-05, + "loss": 0.8871, + "step": 24759 + }, + { + "epoch": 5.2815699658703075, + "grad_norm": 0.10732223746057035, + "learning_rate": 2.76193269937693e-05, + "loss": 0.8786, + "step": 24760 + }, + { + "epoch": 5.281783276450512, + "grad_norm": 0.1731864470903292, + "learning_rate": 2.76128952841326e-05, + "loss": 0.8785, + "step": 24761 + }, + { + "epoch": 5.281996587030717, + "grad_norm": 0.11416717184534338, + "learning_rate": 2.760646416725684e-05, + "loss": 0.8964, + "step": 24762 + }, + { + "epoch": 5.2822098976109215, + "grad_norm": 0.1573179280945587, + "learning_rate": 2.7600033643214845e-05, + "loss": 0.9008, + "step": 24763 + }, + { + "epoch": 5.282423208191126, + "grad_norm": 0.12377250895279644, + "learning_rate": 2.759360371207935e-05, + "loss": 0.8871, + "step": 24764 + }, + { + "epoch": 5.282636518771331, + "grad_norm": 0.1507714913064502, + "learning_rate": 2.758717437392311e-05, + "loss": 0.875, + "step": 24765 + }, + { + "epoch": 5.282849829351536, + "grad_norm": 0.14984814870649255, + "learning_rate": 2.758074562881885e-05, + "loss": 0.8898, + "step": 24766 + }, + { + "epoch": 5.28306313993174, + "grad_norm": 0.14661911943573605, + "learning_rate": 2.7574317476839394e-05, + "loss": 0.8954, + "step": 24767 + }, + { + "epoch": 5.283276450511945, + "grad_norm": 0.12959852162382157, + "learning_rate": 2.7567889918057386e-05, + "loss": 0.8879, + "step": 24768 + }, + { + "epoch": 5.2834897610921505, + "grad_norm": 0.1301980962849146, + "learning_rate": 2.756146295254558e-05, + "loss": 0.9124, + "step": 24769 + }, + { + "epoch": 5.283703071672355, + "grad_norm": 0.13295391064913872, + "learning_rate": 2.755503658037668e-05, + "loss": 0.8776, + "step": 24770 + }, + { + "epoch": 5.28391638225256, + "grad_norm": 0.11478387654191957, + "learning_rate": 2.7548610801623446e-05, + "loss": 0.8728, + "step": 24771 + }, + { + "epoch": 5.284129692832765, + "grad_norm": 0.13734601040582656, + "learning_rate": 2.754218561635857e-05, + "loss": 0.9008, + "step": 24772 + }, + { + "epoch": 5.284343003412969, + "grad_norm": 0.12993445582079569, + "learning_rate": 2.7535761024654742e-05, + "loss": 0.8759, + "step": 24773 + }, + { + "epoch": 5.284556313993174, + "grad_norm": 0.13530701400973236, + "learning_rate": 2.7529337026584636e-05, + "loss": 0.8916, + "step": 24774 + }, + { + "epoch": 5.284769624573379, + "grad_norm": 0.12552057227068972, + "learning_rate": 2.7522913622220993e-05, + "loss": 0.9136, + "step": 24775 + }, + { + "epoch": 5.284982935153583, + "grad_norm": 0.1422182076794227, + "learning_rate": 2.7516490811636463e-05, + "loss": 0.9011, + "step": 24776 + }, + { + "epoch": 5.285196245733788, + "grad_norm": 0.11012428997094521, + "learning_rate": 2.751006859490372e-05, + "loss": 0.9059, + "step": 24777 + }, + { + "epoch": 5.285409556313994, + "grad_norm": 0.1493499652752374, + "learning_rate": 2.7503646972095405e-05, + "loss": 0.8782, + "step": 24778 + }, + { + "epoch": 5.285622866894198, + "grad_norm": 0.11383137636099229, + "learning_rate": 2.749722594328426e-05, + "loss": 0.8726, + "step": 24779 + }, + { + "epoch": 5.285836177474403, + "grad_norm": 0.11889994301275802, + "learning_rate": 2.749080550854287e-05, + "loss": 0.8968, + "step": 24780 + }, + { + "epoch": 5.286049488054608, + "grad_norm": 0.12284111613250773, + "learning_rate": 2.7484385667943887e-05, + "loss": 0.9123, + "step": 24781 + }, + { + "epoch": 5.286262798634812, + "grad_norm": 0.11042147766595728, + "learning_rate": 2.7477966421559947e-05, + "loss": 0.8568, + "step": 24782 + }, + { + "epoch": 5.286476109215017, + "grad_norm": 0.12018213379507252, + "learning_rate": 2.7471547769463725e-05, + "loss": 0.8897, + "step": 24783 + }, + { + "epoch": 5.286689419795222, + "grad_norm": 0.12318309453811163, + "learning_rate": 2.7465129711727814e-05, + "loss": 0.8902, + "step": 24784 + }, + { + "epoch": 5.286902730375426, + "grad_norm": 0.11412617515715155, + "learning_rate": 2.7458712248424855e-05, + "loss": 0.8666, + "step": 24785 + }, + { + "epoch": 5.287116040955631, + "grad_norm": 0.14880382285318944, + "learning_rate": 2.7452295379627422e-05, + "loss": 0.8962, + "step": 24786 + }, + { + "epoch": 5.287329351535837, + "grad_norm": 0.11723056971744625, + "learning_rate": 2.744587910540817e-05, + "loss": 0.8808, + "step": 24787 + }, + { + "epoch": 5.287542662116041, + "grad_norm": 0.1574585901080996, + "learning_rate": 2.7439463425839688e-05, + "loss": 0.9039, + "step": 24788 + }, + { + "epoch": 5.287755972696246, + "grad_norm": 0.11753768326865148, + "learning_rate": 2.7433048340994557e-05, + "loss": 0.9022, + "step": 24789 + }, + { + "epoch": 5.287969283276451, + "grad_norm": 0.14412452957317645, + "learning_rate": 2.742663385094537e-05, + "loss": 0.8551, + "step": 24790 + }, + { + "epoch": 5.288182593856655, + "grad_norm": 0.12502969353979318, + "learning_rate": 2.7420219955764707e-05, + "loss": 0.8808, + "step": 24791 + }, + { + "epoch": 5.28839590443686, + "grad_norm": 0.13758336237961163, + "learning_rate": 2.7413806655525142e-05, + "loss": 0.8773, + "step": 24792 + }, + { + "epoch": 5.288609215017065, + "grad_norm": 0.1523600560797483, + "learning_rate": 2.7407393950299232e-05, + "loss": 0.8938, + "step": 24793 + }, + { + "epoch": 5.288822525597269, + "grad_norm": 0.1762766742936565, + "learning_rate": 2.7400981840159518e-05, + "loss": 0.8958, + "step": 24794 + }, + { + "epoch": 5.289035836177474, + "grad_norm": 0.1591312948612801, + "learning_rate": 2.7394570325178613e-05, + "loss": 0.8957, + "step": 24795 + }, + { + "epoch": 5.289249146757679, + "grad_norm": 0.1442348255095589, + "learning_rate": 2.738815940542902e-05, + "loss": 0.8903, + "step": 24796 + }, + { + "epoch": 5.289462457337884, + "grad_norm": 0.14624611614505362, + "learning_rate": 2.7381749080983292e-05, + "loss": 0.8776, + "step": 24797 + }, + { + "epoch": 5.289675767918089, + "grad_norm": 0.14433156105855457, + "learning_rate": 2.7375339351913938e-05, + "loss": 0.8565, + "step": 24798 + }, + { + "epoch": 5.289889078498294, + "grad_norm": 0.17070553475791647, + "learning_rate": 2.736893021829352e-05, + "loss": 0.8905, + "step": 24799 + }, + { + "epoch": 5.290102389078498, + "grad_norm": 0.1219696896026522, + "learning_rate": 2.736252168019455e-05, + "loss": 0.8847, + "step": 24800 + }, + { + "epoch": 5.290315699658703, + "grad_norm": 0.14603091812924537, + "learning_rate": 2.7356113737689535e-05, + "loss": 0.878, + "step": 24801 + }, + { + "epoch": 5.290529010238908, + "grad_norm": 0.11957159594330219, + "learning_rate": 2.7349706390850966e-05, + "loss": 0.8745, + "step": 24802 + }, + { + "epoch": 5.290742320819112, + "grad_norm": 0.13464526093327792, + "learning_rate": 2.734329963975137e-05, + "loss": 0.9014, + "step": 24803 + }, + { + "epoch": 5.290955631399317, + "grad_norm": 0.1110597165518412, + "learning_rate": 2.7336893484463208e-05, + "loss": 0.8952, + "step": 24804 + }, + { + "epoch": 5.291168941979522, + "grad_norm": 0.12612068869950108, + "learning_rate": 2.7330487925058996e-05, + "loss": 0.8881, + "step": 24805 + }, + { + "epoch": 5.2913822525597265, + "grad_norm": 0.11912242016926394, + "learning_rate": 2.7324082961611167e-05, + "loss": 0.8675, + "step": 24806 + }, + { + "epoch": 5.291595563139932, + "grad_norm": 0.12283060696644839, + "learning_rate": 2.7317678594192255e-05, + "loss": 0.8855, + "step": 24807 + }, + { + "epoch": 5.291808873720137, + "grad_norm": 0.13253946146262546, + "learning_rate": 2.7311274822874686e-05, + "loss": 0.8886, + "step": 24808 + }, + { + "epoch": 5.292022184300341, + "grad_norm": 0.12871976373211907, + "learning_rate": 2.730487164773094e-05, + "loss": 0.8888, + "step": 24809 + }, + { + "epoch": 5.292235494880546, + "grad_norm": 0.13815559934101096, + "learning_rate": 2.729846906883343e-05, + "loss": 0.8451, + "step": 24810 + }, + { + "epoch": 5.292448805460751, + "grad_norm": 0.14285927485001934, + "learning_rate": 2.7292067086254648e-05, + "loss": 0.8818, + "step": 24811 + }, + { + "epoch": 5.2926621160409555, + "grad_norm": 0.12168205375828921, + "learning_rate": 2.728566570006705e-05, + "loss": 0.8821, + "step": 24812 + }, + { + "epoch": 5.29287542662116, + "grad_norm": 0.16324441174036627, + "learning_rate": 2.727926491034297e-05, + "loss": 0.8655, + "step": 24813 + }, + { + "epoch": 5.293088737201365, + "grad_norm": 0.1094883004481136, + "learning_rate": 2.727286471715492e-05, + "loss": 0.9025, + "step": 24814 + }, + { + "epoch": 5.2933020477815695, + "grad_norm": 0.1480661941652741, + "learning_rate": 2.7266465120575297e-05, + "loss": 0.9034, + "step": 24815 + }, + { + "epoch": 5.293515358361775, + "grad_norm": 0.1426980362060722, + "learning_rate": 2.726006612067651e-05, + "loss": 0.8919, + "step": 24816 + }, + { + "epoch": 5.29372866894198, + "grad_norm": 0.12877822030594346, + "learning_rate": 2.7253667717530957e-05, + "loss": 0.8916, + "step": 24817 + }, + { + "epoch": 5.2939419795221845, + "grad_norm": 0.13889385485941796, + "learning_rate": 2.7247269911211015e-05, + "loss": 0.8762, + "step": 24818 + }, + { + "epoch": 5.294155290102389, + "grad_norm": 0.1395762384719605, + "learning_rate": 2.724087270178913e-05, + "loss": 0.8857, + "step": 24819 + }, + { + "epoch": 5.294368600682594, + "grad_norm": 0.14190969866353945, + "learning_rate": 2.7234476089337662e-05, + "loss": 0.8864, + "step": 24820 + }, + { + "epoch": 5.2945819112627985, + "grad_norm": 0.13973465785144387, + "learning_rate": 2.7228080073928978e-05, + "loss": 0.8852, + "step": 24821 + }, + { + "epoch": 5.294795221843003, + "grad_norm": 0.14475576857736153, + "learning_rate": 2.7221684655635424e-05, + "loss": 0.8876, + "step": 24822 + }, + { + "epoch": 5.295008532423208, + "grad_norm": 0.1424161659982115, + "learning_rate": 2.721528983452946e-05, + "loss": 0.8745, + "step": 24823 + }, + { + "epoch": 5.295221843003413, + "grad_norm": 0.1399310884096165, + "learning_rate": 2.7208895610683348e-05, + "loss": 0.8808, + "step": 24824 + }, + { + "epoch": 5.295435153583618, + "grad_norm": 0.1540478141593815, + "learning_rate": 2.7202501984169443e-05, + "loss": 0.9199, + "step": 24825 + }, + { + "epoch": 5.295648464163823, + "grad_norm": 0.1346420460667129, + "learning_rate": 2.719610895506015e-05, + "loss": 0.8668, + "step": 24826 + }, + { + "epoch": 5.2958617747440275, + "grad_norm": 0.15509558263462492, + "learning_rate": 2.7189716523427762e-05, + "loss": 0.8914, + "step": 24827 + }, + { + "epoch": 5.296075085324232, + "grad_norm": 0.12430571934419049, + "learning_rate": 2.7183324689344625e-05, + "loss": 0.9092, + "step": 24828 + }, + { + "epoch": 5.296288395904437, + "grad_norm": 0.14107677797678247, + "learning_rate": 2.7176933452883053e-05, + "loss": 0.864, + "step": 24829 + }, + { + "epoch": 5.296501706484642, + "grad_norm": 0.13745417414428693, + "learning_rate": 2.7170542814115345e-05, + "loss": 0.898, + "step": 24830 + }, + { + "epoch": 5.296715017064846, + "grad_norm": 0.12894652645671148, + "learning_rate": 2.716415277311386e-05, + "loss": 0.9014, + "step": 24831 + }, + { + "epoch": 5.296928327645051, + "grad_norm": 0.17874221744829655, + "learning_rate": 2.7157763329950878e-05, + "loss": 0.9078, + "step": 24832 + }, + { + "epoch": 5.297141638225256, + "grad_norm": 0.13959179449254883, + "learning_rate": 2.7151374484698688e-05, + "loss": 0.8744, + "step": 24833 + }, + { + "epoch": 5.297354948805461, + "grad_norm": 0.1708917356308437, + "learning_rate": 2.7144986237429586e-05, + "loss": 0.8765, + "step": 24834 + }, + { + "epoch": 5.297568259385666, + "grad_norm": 0.1422310545794028, + "learning_rate": 2.713859858821585e-05, + "loss": 0.9451, + "step": 24835 + }, + { + "epoch": 5.297781569965871, + "grad_norm": 0.14499186848255224, + "learning_rate": 2.7132211537129766e-05, + "loss": 0.8858, + "step": 24836 + }, + { + "epoch": 5.297994880546075, + "grad_norm": 0.1461149044603009, + "learning_rate": 2.7125825084243575e-05, + "loss": 0.8966, + "step": 24837 + }, + { + "epoch": 5.29820819112628, + "grad_norm": 0.1424004043318799, + "learning_rate": 2.7119439229629575e-05, + "loss": 0.8679, + "step": 24838 + }, + { + "epoch": 5.298421501706485, + "grad_norm": 0.14361369394099063, + "learning_rate": 2.7113053973360018e-05, + "loss": 0.8676, + "step": 24839 + }, + { + "epoch": 5.298634812286689, + "grad_norm": 0.16811175277860135, + "learning_rate": 2.710666931550715e-05, + "loss": 0.859, + "step": 24840 + }, + { + "epoch": 5.298848122866894, + "grad_norm": 0.14353488554647273, + "learning_rate": 2.7100285256143198e-05, + "loss": 0.9144, + "step": 24841 + }, + { + "epoch": 5.299061433447099, + "grad_norm": 0.16002963874389187, + "learning_rate": 2.7093901795340394e-05, + "loss": 0.8598, + "step": 24842 + }, + { + "epoch": 5.299274744027303, + "grad_norm": 0.12935832418732465, + "learning_rate": 2.7087518933171e-05, + "loss": 0.8808, + "step": 24843 + }, + { + "epoch": 5.299488054607509, + "grad_norm": 0.16140363192058726, + "learning_rate": 2.7081136669707215e-05, + "loss": 0.8822, + "step": 24844 + }, + { + "epoch": 5.299701365187714, + "grad_norm": 0.15144641132999415, + "learning_rate": 2.7074755005021266e-05, + "loss": 0.9287, + "step": 24845 + }, + { + "epoch": 5.299914675767918, + "grad_norm": 0.12825232615710888, + "learning_rate": 2.7068373939185347e-05, + "loss": 0.8618, + "step": 24846 + }, + { + "epoch": 5.300127986348123, + "grad_norm": 0.12888245605092377, + "learning_rate": 2.706199347227167e-05, + "loss": 0.8836, + "step": 24847 + }, + { + "epoch": 5.300341296928328, + "grad_norm": 0.14587649208535555, + "learning_rate": 2.7055613604352432e-05, + "loss": 0.8857, + "step": 24848 + }, + { + "epoch": 5.300554607508532, + "grad_norm": 0.11467437512326027, + "learning_rate": 2.704923433549979e-05, + "loss": 0.8778, + "step": 24849 + }, + { + "epoch": 5.300767918088737, + "grad_norm": 0.13365521018260648, + "learning_rate": 2.704285566578597e-05, + "loss": 0.891, + "step": 24850 + }, + { + "epoch": 5.300981228668942, + "grad_norm": 0.12212160812742097, + "learning_rate": 2.7036477595283142e-05, + "loss": 0.8781, + "step": 24851 + }, + { + "epoch": 5.301194539249146, + "grad_norm": 0.11833976143073312, + "learning_rate": 2.7030100124063453e-05, + "loss": 0.896, + "step": 24852 + }, + { + "epoch": 5.301407849829351, + "grad_norm": 0.12494658079702496, + "learning_rate": 2.702372325219907e-05, + "loss": 0.87, + "step": 24853 + }, + { + "epoch": 5.301621160409557, + "grad_norm": 0.1147600189576759, + "learning_rate": 2.701734697976212e-05, + "loss": 0.8991, + "step": 24854 + }, + { + "epoch": 5.301834470989761, + "grad_norm": 0.11745826277727892, + "learning_rate": 2.7010971306824817e-05, + "loss": 0.8744, + "step": 24855 + }, + { + "epoch": 5.302047781569966, + "grad_norm": 0.11532627161234513, + "learning_rate": 2.7004596233459266e-05, + "loss": 0.8715, + "step": 24856 + }, + { + "epoch": 5.302261092150171, + "grad_norm": 0.11525472919806468, + "learning_rate": 2.6998221759737585e-05, + "loss": 0.9084, + "step": 24857 + }, + { + "epoch": 5.302474402730375, + "grad_norm": 0.11615760212945747, + "learning_rate": 2.6991847885731934e-05, + "loss": 0.8879, + "step": 24858 + }, + { + "epoch": 5.30268771331058, + "grad_norm": 0.12289047480105723, + "learning_rate": 2.6985474611514403e-05, + "loss": 0.8881, + "step": 24859 + }, + { + "epoch": 5.302901023890785, + "grad_norm": 0.11862315004735295, + "learning_rate": 2.6979101937157124e-05, + "loss": 0.8467, + "step": 24860 + }, + { + "epoch": 5.3031143344709895, + "grad_norm": 0.12683131895726826, + "learning_rate": 2.697272986273217e-05, + "loss": 0.9018, + "step": 24861 + }, + { + "epoch": 5.303327645051194, + "grad_norm": 0.12860741507260734, + "learning_rate": 2.69663583883117e-05, + "loss": 0.8702, + "step": 24862 + }, + { + "epoch": 5.3035409556314, + "grad_norm": 0.12831960951091545, + "learning_rate": 2.6959987513967775e-05, + "loss": 0.8879, + "step": 24863 + }, + { + "epoch": 5.303754266211604, + "grad_norm": 0.11676826128295593, + "learning_rate": 2.6953617239772475e-05, + "loss": 0.8994, + "step": 24864 + }, + { + "epoch": 5.303967576791809, + "grad_norm": 0.135082573574141, + "learning_rate": 2.69472475657979e-05, + "loss": 0.8931, + "step": 24865 + }, + { + "epoch": 5.304180887372014, + "grad_norm": 0.1379498771618381, + "learning_rate": 2.6940878492116078e-05, + "loss": 0.8839, + "step": 24866 + }, + { + "epoch": 5.3043941979522184, + "grad_norm": 0.11133084877107087, + "learning_rate": 2.693451001879914e-05, + "loss": 0.9529, + "step": 24867 + }, + { + "epoch": 5.304607508532423, + "grad_norm": 0.1328523758896589, + "learning_rate": 2.6928142145919137e-05, + "loss": 0.8798, + "step": 24868 + }, + { + "epoch": 5.304820819112628, + "grad_norm": 0.12725475740156814, + "learning_rate": 2.692177487354804e-05, + "loss": 0.8899, + "step": 24869 + }, + { + "epoch": 5.3050341296928325, + "grad_norm": 0.13304038714217256, + "learning_rate": 2.6915408201757985e-05, + "loss": 0.8849, + "step": 24870 + }, + { + "epoch": 5.305247440273037, + "grad_norm": 0.1308767036435762, + "learning_rate": 2.6909042130620975e-05, + "loss": 0.8698, + "step": 24871 + }, + { + "epoch": 5.305460750853243, + "grad_norm": 0.13787424796638623, + "learning_rate": 2.6902676660209057e-05, + "loss": 0.8841, + "step": 24872 + }, + { + "epoch": 5.305674061433447, + "grad_norm": 0.1150744611123245, + "learning_rate": 2.6896311790594212e-05, + "loss": 0.876, + "step": 24873 + }, + { + "epoch": 5.305887372013652, + "grad_norm": 0.11465784664995253, + "learning_rate": 2.6889947521848515e-05, + "loss": 0.898, + "step": 24874 + }, + { + "epoch": 5.306100682593857, + "grad_norm": 0.14374507167266745, + "learning_rate": 2.6883583854043966e-05, + "loss": 0.8716, + "step": 24875 + }, + { + "epoch": 5.3063139931740615, + "grad_norm": 0.1247630323538793, + "learning_rate": 2.687722078725255e-05, + "loss": 0.8719, + "step": 24876 + }, + { + "epoch": 5.306527303754266, + "grad_norm": 0.11479214605167115, + "learning_rate": 2.687085832154628e-05, + "loss": 0.871, + "step": 24877 + }, + { + "epoch": 5.306740614334471, + "grad_norm": 0.11806444511376017, + "learning_rate": 2.6864496456997103e-05, + "loss": 0.8751, + "step": 24878 + }, + { + "epoch": 5.3069539249146755, + "grad_norm": 0.11743500580311929, + "learning_rate": 2.6858135193677114e-05, + "loss": 0.8966, + "step": 24879 + }, + { + "epoch": 5.30716723549488, + "grad_norm": 0.113766581882717, + "learning_rate": 2.6851774531658185e-05, + "loss": 0.9194, + "step": 24880 + }, + { + "epoch": 5.307380546075085, + "grad_norm": 0.11530779999928502, + "learning_rate": 2.6845414471012285e-05, + "loss": 0.9222, + "step": 24881 + }, + { + "epoch": 5.3075938566552905, + "grad_norm": 0.11540167170613974, + "learning_rate": 2.683905501181145e-05, + "loss": 0.9097, + "step": 24882 + }, + { + "epoch": 5.307807167235495, + "grad_norm": 0.10842191823777729, + "learning_rate": 2.6832696154127603e-05, + "loss": 0.8858, + "step": 24883 + }, + { + "epoch": 5.3080204778157, + "grad_norm": 0.12319364334730239, + "learning_rate": 2.682633789803269e-05, + "loss": 0.8932, + "step": 24884 + }, + { + "epoch": 5.3082337883959045, + "grad_norm": 0.11913395624210085, + "learning_rate": 2.6819980243598628e-05, + "loss": 0.892, + "step": 24885 + }, + { + "epoch": 5.308447098976109, + "grad_norm": 0.11927555361485513, + "learning_rate": 2.6813623190897414e-05, + "loss": 0.8736, + "step": 24886 + }, + { + "epoch": 5.308660409556314, + "grad_norm": 0.11097161716044304, + "learning_rate": 2.680726674000095e-05, + "loss": 0.8742, + "step": 24887 + }, + { + "epoch": 5.308873720136519, + "grad_norm": 0.11828312327644737, + "learning_rate": 2.6800910890981157e-05, + "loss": 0.8887, + "step": 24888 + }, + { + "epoch": 5.309087030716723, + "grad_norm": 0.11569447320631618, + "learning_rate": 2.6794555643909953e-05, + "loss": 0.8789, + "step": 24889 + }, + { + "epoch": 5.309300341296928, + "grad_norm": 0.12205902294577661, + "learning_rate": 2.6788200998859245e-05, + "loss": 0.8854, + "step": 24890 + }, + { + "epoch": 5.309513651877133, + "grad_norm": 0.14206328701209917, + "learning_rate": 2.678184695590094e-05, + "loss": 0.9073, + "step": 24891 + }, + { + "epoch": 5.309726962457338, + "grad_norm": 0.1420633630404304, + "learning_rate": 2.6775493515106928e-05, + "loss": 0.9071, + "step": 24892 + }, + { + "epoch": 5.309940273037543, + "grad_norm": 0.14547322818915187, + "learning_rate": 2.6769140676549078e-05, + "loss": 0.9026, + "step": 24893 + }, + { + "epoch": 5.310153583617748, + "grad_norm": 0.13349502755345813, + "learning_rate": 2.6762788440299325e-05, + "loss": 0.8813, + "step": 24894 + }, + { + "epoch": 5.310366894197952, + "grad_norm": 0.12997547328943804, + "learning_rate": 2.675643680642951e-05, + "loss": 0.8308, + "step": 24895 + }, + { + "epoch": 5.310580204778157, + "grad_norm": 0.1207999168660732, + "learning_rate": 2.6750085775011513e-05, + "loss": 0.8944, + "step": 24896 + }, + { + "epoch": 5.310793515358362, + "grad_norm": 0.126771261417157, + "learning_rate": 2.6743735346117163e-05, + "loss": 0.8872, + "step": 24897 + }, + { + "epoch": 5.311006825938566, + "grad_norm": 0.13788973382920763, + "learning_rate": 2.6737385519818375e-05, + "loss": 0.902, + "step": 24898 + }, + { + "epoch": 5.311220136518771, + "grad_norm": 0.13994483444682113, + "learning_rate": 2.673103629618696e-05, + "loss": 0.8977, + "step": 24899 + }, + { + "epoch": 5.311433447098976, + "grad_norm": 0.13878033361181288, + "learning_rate": 2.672468767529477e-05, + "loss": 0.8951, + "step": 24900 + }, + { + "epoch": 5.311646757679181, + "grad_norm": 0.14779867126613955, + "learning_rate": 2.671833965721363e-05, + "loss": 0.9089, + "step": 24901 + }, + { + "epoch": 5.311860068259386, + "grad_norm": 0.11855948454214027, + "learning_rate": 2.671199224201537e-05, + "loss": 0.8847, + "step": 24902 + }, + { + "epoch": 5.312073378839591, + "grad_norm": 0.12571045353248494, + "learning_rate": 2.6705645429771822e-05, + "loss": 0.8825, + "step": 24903 + }, + { + "epoch": 5.312286689419795, + "grad_norm": 0.11087480943192723, + "learning_rate": 2.669929922055479e-05, + "loss": 0.8761, + "step": 24904 + }, + { + "epoch": 5.3125, + "grad_norm": 0.11834455794766537, + "learning_rate": 2.669295361443605e-05, + "loss": 0.902, + "step": 24905 + }, + { + "epoch": 5.312713310580205, + "grad_norm": 0.12641153587849707, + "learning_rate": 2.6686608611487472e-05, + "loss": 0.8841, + "step": 24906 + }, + { + "epoch": 5.312926621160409, + "grad_norm": 0.14334380827356466, + "learning_rate": 2.6680264211780813e-05, + "loss": 0.8815, + "step": 24907 + }, + { + "epoch": 5.313139931740614, + "grad_norm": 0.11561804301386897, + "learning_rate": 2.6673920415387865e-05, + "loss": 0.879, + "step": 24908 + }, + { + "epoch": 5.313353242320819, + "grad_norm": 0.13395392472822823, + "learning_rate": 2.666757722238037e-05, + "loss": 0.8935, + "step": 24909 + }, + { + "epoch": 5.313566552901024, + "grad_norm": 0.12498509900472389, + "learning_rate": 2.6661234632830164e-05, + "loss": 0.8899, + "step": 24910 + }, + { + "epoch": 5.313779863481229, + "grad_norm": 0.13892331206390726, + "learning_rate": 2.6654892646808988e-05, + "loss": 0.8827, + "step": 24911 + }, + { + "epoch": 5.313993174061434, + "grad_norm": 0.11637618058428278, + "learning_rate": 2.66485512643886e-05, + "loss": 0.9146, + "step": 24912 + }, + { + "epoch": 5.314206484641638, + "grad_norm": 0.1267108558244501, + "learning_rate": 2.6642210485640744e-05, + "loss": 0.8552, + "step": 24913 + }, + { + "epoch": 5.314419795221843, + "grad_norm": 0.1199642724646276, + "learning_rate": 2.6635870310637178e-05, + "loss": 0.9063, + "step": 24914 + }, + { + "epoch": 5.314633105802048, + "grad_norm": 0.12064950402489927, + "learning_rate": 2.662953073944964e-05, + "loss": 0.9009, + "step": 24915 + }, + { + "epoch": 5.314846416382252, + "grad_norm": 0.1307399699744884, + "learning_rate": 2.6623191772149863e-05, + "loss": 0.9068, + "step": 24916 + }, + { + "epoch": 5.315059726962457, + "grad_norm": 0.1237567641754155, + "learning_rate": 2.6616853408809533e-05, + "loss": 0.889, + "step": 24917 + }, + { + "epoch": 5.315273037542662, + "grad_norm": 0.11453918695090477, + "learning_rate": 2.661051564950044e-05, + "loss": 0.8784, + "step": 24918 + }, + { + "epoch": 5.315486348122867, + "grad_norm": 0.14732680904305034, + "learning_rate": 2.6604178494294254e-05, + "loss": 0.8663, + "step": 24919 + }, + { + "epoch": 5.315699658703072, + "grad_norm": 0.11619544378501163, + "learning_rate": 2.6597841943262683e-05, + "loss": 0.8818, + "step": 24920 + }, + { + "epoch": 5.315912969283277, + "grad_norm": 0.11325951926207237, + "learning_rate": 2.6591505996477408e-05, + "loss": 0.889, + "step": 24921 + }, + { + "epoch": 5.316126279863481, + "grad_norm": 0.12998146348355546, + "learning_rate": 2.658517065401017e-05, + "loss": 0.9006, + "step": 24922 + }, + { + "epoch": 5.316339590443686, + "grad_norm": 0.11743672153518964, + "learning_rate": 2.6578835915932616e-05, + "loss": 0.8932, + "step": 24923 + }, + { + "epoch": 5.316552901023891, + "grad_norm": 0.12939116010832455, + "learning_rate": 2.657250178231647e-05, + "loss": 0.8973, + "step": 24924 + }, + { + "epoch": 5.3167662116040955, + "grad_norm": 0.11874865638333339, + "learning_rate": 2.6566168253233306e-05, + "loss": 0.8813, + "step": 24925 + }, + { + "epoch": 5.3169795221843, + "grad_norm": 0.12905848879387422, + "learning_rate": 2.655983532875487e-05, + "loss": 0.8967, + "step": 24926 + }, + { + "epoch": 5.317192832764505, + "grad_norm": 0.1374493773751347, + "learning_rate": 2.65535030089528e-05, + "loss": 0.8924, + "step": 24927 + }, + { + "epoch": 5.3174061433447095, + "grad_norm": 0.13049483398299458, + "learning_rate": 2.6547171293898748e-05, + "loss": 0.937, + "step": 24928 + }, + { + "epoch": 5.317619453924915, + "grad_norm": 0.13340535172737655, + "learning_rate": 2.6540840183664313e-05, + "loss": 0.86, + "step": 24929 + }, + { + "epoch": 5.31783276450512, + "grad_norm": 0.13663583271631258, + "learning_rate": 2.653450967832121e-05, + "loss": 0.8841, + "step": 24930 + }, + { + "epoch": 5.3180460750853245, + "grad_norm": 0.12594564492439692, + "learning_rate": 2.6528179777941016e-05, + "loss": 0.8855, + "step": 24931 + }, + { + "epoch": 5.318259385665529, + "grad_norm": 0.12135180742346004, + "learning_rate": 2.6521850482595383e-05, + "loss": 0.8889, + "step": 24932 + }, + { + "epoch": 5.318472696245734, + "grad_norm": 0.1300134513073362, + "learning_rate": 2.651552179235587e-05, + "loss": 0.9013, + "step": 24933 + }, + { + "epoch": 5.3186860068259385, + "grad_norm": 0.1316792023483525, + "learning_rate": 2.6509193707294162e-05, + "loss": 0.8613, + "step": 24934 + }, + { + "epoch": 5.318899317406143, + "grad_norm": 0.12543538571539678, + "learning_rate": 2.6502866227481852e-05, + "loss": 0.872, + "step": 24935 + }, + { + "epoch": 5.319112627986348, + "grad_norm": 0.14887341825403702, + "learning_rate": 2.649653935299048e-05, + "loss": 0.8781, + "step": 24936 + }, + { + "epoch": 5.319325938566553, + "grad_norm": 0.13360844807108185, + "learning_rate": 2.6490213083891646e-05, + "loss": 0.9115, + "step": 24937 + }, + { + "epoch": 5.319539249146757, + "grad_norm": 0.14200031495568383, + "learning_rate": 2.648388742025697e-05, + "loss": 0.9183, + "step": 24938 + }, + { + "epoch": 5.319752559726963, + "grad_norm": 0.12652920236016496, + "learning_rate": 2.647756236215801e-05, + "loss": 0.8826, + "step": 24939 + }, + { + "epoch": 5.3199658703071675, + "grad_norm": 0.1411185014654719, + "learning_rate": 2.6471237909666334e-05, + "loss": 0.8747, + "step": 24940 + }, + { + "epoch": 5.320179180887372, + "grad_norm": 0.12899071345774876, + "learning_rate": 2.646491406285348e-05, + "loss": 0.8835, + "step": 24941 + }, + { + "epoch": 5.320392491467577, + "grad_norm": 0.14978791093667534, + "learning_rate": 2.6458590821791044e-05, + "loss": 0.885, + "step": 24942 + }, + { + "epoch": 5.3206058020477816, + "grad_norm": 0.12062402591565569, + "learning_rate": 2.6452268186550553e-05, + "loss": 0.8711, + "step": 24943 + }, + { + "epoch": 5.320819112627986, + "grad_norm": 0.15745334897274307, + "learning_rate": 2.6445946157203545e-05, + "loss": 0.8779, + "step": 24944 + }, + { + "epoch": 5.321032423208191, + "grad_norm": 0.12489015804319673, + "learning_rate": 2.6439624733821562e-05, + "loss": 0.878, + "step": 24945 + }, + { + "epoch": 5.321245733788396, + "grad_norm": 0.15142888389297163, + "learning_rate": 2.6433303916476128e-05, + "loss": 0.8805, + "step": 24946 + }, + { + "epoch": 5.3214590443686, + "grad_norm": 0.15944137349462142, + "learning_rate": 2.642698370523876e-05, + "loss": 0.907, + "step": 24947 + }, + { + "epoch": 5.321672354948806, + "grad_norm": 0.16895816276926723, + "learning_rate": 2.642066410018097e-05, + "loss": 0.8856, + "step": 24948 + }, + { + "epoch": 5.3218856655290105, + "grad_norm": 0.14612793158935722, + "learning_rate": 2.6414345101374243e-05, + "loss": 0.9024, + "step": 24949 + }, + { + "epoch": 5.322098976109215, + "grad_norm": 0.15214703255754883, + "learning_rate": 2.640802670889013e-05, + "loss": 0.8888, + "step": 24950 + }, + { + "epoch": 5.32231228668942, + "grad_norm": 0.1437346096104024, + "learning_rate": 2.6401708922800085e-05, + "loss": 0.8744, + "step": 24951 + }, + { + "epoch": 5.322525597269625, + "grad_norm": 0.15613660429638995, + "learning_rate": 2.6395391743175615e-05, + "loss": 0.8697, + "step": 24952 + }, + { + "epoch": 5.322738907849829, + "grad_norm": 0.15489642318107644, + "learning_rate": 2.638907517008817e-05, + "loss": 0.843, + "step": 24953 + }, + { + "epoch": 5.322952218430034, + "grad_norm": 0.1422411093722535, + "learning_rate": 2.6382759203609257e-05, + "loss": 0.9038, + "step": 24954 + }, + { + "epoch": 5.323165529010239, + "grad_norm": 0.1581308735503488, + "learning_rate": 2.6376443843810327e-05, + "loss": 0.858, + "step": 24955 + }, + { + "epoch": 5.323378839590443, + "grad_norm": 0.11441435510106468, + "learning_rate": 2.6370129090762832e-05, + "loss": 0.8819, + "step": 24956 + }, + { + "epoch": 5.323592150170649, + "grad_norm": 0.14221591892495747, + "learning_rate": 2.636381494453824e-05, + "loss": 0.8638, + "step": 24957 + }, + { + "epoch": 5.323805460750854, + "grad_norm": 0.12677245002405857, + "learning_rate": 2.6357501405207977e-05, + "loss": 0.8967, + "step": 24958 + }, + { + "epoch": 5.324018771331058, + "grad_norm": 0.13636162602492002, + "learning_rate": 2.6351188472843488e-05, + "loss": 0.8841, + "step": 24959 + }, + { + "epoch": 5.324232081911263, + "grad_norm": 0.12111350095041287, + "learning_rate": 2.6344876147516206e-05, + "loss": 0.9029, + "step": 24960 + }, + { + "epoch": 5.324445392491468, + "grad_norm": 0.12920236799118792, + "learning_rate": 2.6338564429297525e-05, + "loss": 0.872, + "step": 24961 + }, + { + "epoch": 5.324658703071672, + "grad_norm": 0.13694370783426665, + "learning_rate": 2.6332253318258918e-05, + "loss": 0.8798, + "step": 24962 + }, + { + "epoch": 5.324872013651877, + "grad_norm": 0.14180727048489222, + "learning_rate": 2.632594281447176e-05, + "loss": 0.8716, + "step": 24963 + }, + { + "epoch": 5.325085324232082, + "grad_norm": 0.15035297529221653, + "learning_rate": 2.6319632918007473e-05, + "loss": 0.894, + "step": 24964 + }, + { + "epoch": 5.325298634812286, + "grad_norm": 0.1353000389361934, + "learning_rate": 2.6313323628937407e-05, + "loss": 0.9082, + "step": 24965 + }, + { + "epoch": 5.325511945392491, + "grad_norm": 0.15206692361948734, + "learning_rate": 2.6307014947333018e-05, + "loss": 0.917, + "step": 24966 + }, + { + "epoch": 5.325725255972697, + "grad_norm": 0.13478373396593554, + "learning_rate": 2.6300706873265663e-05, + "loss": 0.8823, + "step": 24967 + }, + { + "epoch": 5.325938566552901, + "grad_norm": 0.1653577502574618, + "learning_rate": 2.629439940680671e-05, + "loss": 0.9316, + "step": 24968 + }, + { + "epoch": 5.326151877133106, + "grad_norm": 0.11626629573353144, + "learning_rate": 2.628809254802753e-05, + "loss": 0.9389, + "step": 24969 + }, + { + "epoch": 5.326365187713311, + "grad_norm": 0.14996144411503226, + "learning_rate": 2.6281786296999477e-05, + "loss": 0.8887, + "step": 24970 + }, + { + "epoch": 5.326578498293515, + "grad_norm": 0.14675901562692079, + "learning_rate": 2.627548065379393e-05, + "loss": 0.9054, + "step": 24971 + }, + { + "epoch": 5.32679180887372, + "grad_norm": 0.12125448694246015, + "learning_rate": 2.6269175618482212e-05, + "loss": 0.89, + "step": 24972 + }, + { + "epoch": 5.327005119453925, + "grad_norm": 0.15709732103601853, + "learning_rate": 2.626287119113565e-05, + "loss": 0.9026, + "step": 24973 + }, + { + "epoch": 5.327218430034129, + "grad_norm": 0.1207882210137354, + "learning_rate": 2.6256567371825643e-05, + "loss": 0.9189, + "step": 24974 + }, + { + "epoch": 5.327431740614334, + "grad_norm": 0.133656222333488, + "learning_rate": 2.6250264160623468e-05, + "loss": 0.9092, + "step": 24975 + }, + { + "epoch": 5.327645051194539, + "grad_norm": 0.12385570500653281, + "learning_rate": 2.624396155760046e-05, + "loss": 0.9201, + "step": 24976 + }, + { + "epoch": 5.327858361774744, + "grad_norm": 0.1750408085997485, + "learning_rate": 2.6237659562827907e-05, + "loss": 0.8992, + "step": 24977 + }, + { + "epoch": 5.328071672354949, + "grad_norm": 0.11170401628729586, + "learning_rate": 2.6231358176377165e-05, + "loss": 0.8995, + "step": 24978 + }, + { + "epoch": 5.328284982935154, + "grad_norm": 0.15453347733503742, + "learning_rate": 2.6225057398319517e-05, + "loss": 0.8911, + "step": 24979 + }, + { + "epoch": 5.328498293515358, + "grad_norm": 0.11532709983140602, + "learning_rate": 2.621875722872627e-05, + "loss": 0.9123, + "step": 24980 + }, + { + "epoch": 5.328711604095563, + "grad_norm": 0.14946967847765277, + "learning_rate": 2.6212457667668634e-05, + "loss": 0.8934, + "step": 24981 + }, + { + "epoch": 5.328924914675768, + "grad_norm": 0.11959901836793177, + "learning_rate": 2.620615871521797e-05, + "loss": 0.8553, + "step": 24982 + }, + { + "epoch": 5.3291382252559725, + "grad_norm": 0.1656977604226802, + "learning_rate": 2.6199860371445533e-05, + "loss": 0.8818, + "step": 24983 + }, + { + "epoch": 5.329351535836177, + "grad_norm": 0.12848496925090694, + "learning_rate": 2.6193562636422574e-05, + "loss": 0.8936, + "step": 24984 + }, + { + "epoch": 5.329564846416382, + "grad_norm": 0.13479145314493673, + "learning_rate": 2.618726551022033e-05, + "loss": 0.9027, + "step": 24985 + }, + { + "epoch": 5.329778156996587, + "grad_norm": 0.14775080648278022, + "learning_rate": 2.618096899291011e-05, + "loss": 0.8809, + "step": 24986 + }, + { + "epoch": 5.329991467576792, + "grad_norm": 0.12374707274696518, + "learning_rate": 2.6174673084563136e-05, + "loss": 0.8665, + "step": 24987 + }, + { + "epoch": 5.330204778156997, + "grad_norm": 0.14175816340232886, + "learning_rate": 2.6168377785250644e-05, + "loss": 0.8945, + "step": 24988 + }, + { + "epoch": 5.3304180887372015, + "grad_norm": 0.11836219674010355, + "learning_rate": 2.6162083095043827e-05, + "loss": 0.9066, + "step": 24989 + }, + { + "epoch": 5.330631399317406, + "grad_norm": 0.14110772917256126, + "learning_rate": 2.6155789014013975e-05, + "loss": 0.9075, + "step": 24990 + }, + { + "epoch": 5.330844709897611, + "grad_norm": 0.12302246862492203, + "learning_rate": 2.6149495542232302e-05, + "loss": 0.8846, + "step": 24991 + }, + { + "epoch": 5.3310580204778155, + "grad_norm": 0.13045980281788233, + "learning_rate": 2.6143202679769974e-05, + "loss": 0.8844, + "step": 24992 + }, + { + "epoch": 5.33127133105802, + "grad_norm": 0.12166049495060924, + "learning_rate": 2.6136910426698174e-05, + "loss": 0.8867, + "step": 24993 + }, + { + "epoch": 5.331484641638225, + "grad_norm": 0.11638889830633165, + "learning_rate": 2.613061878308818e-05, + "loss": 0.8813, + "step": 24994 + }, + { + "epoch": 5.3316979522184305, + "grad_norm": 0.12647204422629285, + "learning_rate": 2.6124327749011124e-05, + "loss": 0.9041, + "step": 24995 + }, + { + "epoch": 5.331911262798635, + "grad_norm": 0.11971045802949, + "learning_rate": 2.6118037324538225e-05, + "loss": 0.8795, + "step": 24996 + }, + { + "epoch": 5.33212457337884, + "grad_norm": 0.14664757904723327, + "learning_rate": 2.6111747509740594e-05, + "loss": 0.8964, + "step": 24997 + }, + { + "epoch": 5.3323378839590445, + "grad_norm": 0.11312090434981856, + "learning_rate": 2.6105458304689482e-05, + "loss": 0.8989, + "step": 24998 + }, + { + "epoch": 5.332551194539249, + "grad_norm": 0.13973268567516844, + "learning_rate": 2.6099169709456017e-05, + "loss": 0.8765, + "step": 24999 + }, + { + "epoch": 5.332764505119454, + "grad_norm": 0.11302066782552889, + "learning_rate": 2.6092881724111357e-05, + "loss": 0.8871, + "step": 25000 + }, + { + "epoch": 5.332977815699659, + "grad_norm": 0.11938945044720231, + "learning_rate": 2.608659434872661e-05, + "loss": 0.8479, + "step": 25001 + }, + { + "epoch": 5.333191126279863, + "grad_norm": 0.13200645934868357, + "learning_rate": 2.608030758337301e-05, + "loss": 0.8845, + "step": 25002 + }, + { + "epoch": 5.333404436860068, + "grad_norm": 0.13312379537727273, + "learning_rate": 2.6074021428121618e-05, + "loss": 0.8873, + "step": 25003 + }, + { + "epoch": 5.3336177474402735, + "grad_norm": 0.121303311114347, + "learning_rate": 2.606773588304357e-05, + "loss": 0.8892, + "step": 25004 + }, + { + "epoch": 5.333831058020478, + "grad_norm": 0.11840479273148487, + "learning_rate": 2.606145094820998e-05, + "loss": 0.8694, + "step": 25005 + }, + { + "epoch": 5.334044368600683, + "grad_norm": 0.1363579397183888, + "learning_rate": 2.6055166623692004e-05, + "loss": 0.8874, + "step": 25006 + }, + { + "epoch": 5.334257679180888, + "grad_norm": 0.1130387058864126, + "learning_rate": 2.6048882909560728e-05, + "loss": 0.8726, + "step": 25007 + }, + { + "epoch": 5.334470989761092, + "grad_norm": 0.14100221929391912, + "learning_rate": 2.6042599805887242e-05, + "loss": 0.893, + "step": 25008 + }, + { + "epoch": 5.334684300341297, + "grad_norm": 0.10797325658412922, + "learning_rate": 2.6036317312742626e-05, + "loss": 0.895, + "step": 25009 + }, + { + "epoch": 5.334897610921502, + "grad_norm": 0.1319673390817302, + "learning_rate": 2.603003543019801e-05, + "loss": 0.9033, + "step": 25010 + }, + { + "epoch": 5.335110921501706, + "grad_norm": 0.12418074865631844, + "learning_rate": 2.6023754158324455e-05, + "loss": 0.8904, + "step": 25011 + }, + { + "epoch": 5.335324232081911, + "grad_norm": 0.11950886083232741, + "learning_rate": 2.601747349719302e-05, + "loss": 0.8601, + "step": 25012 + }, + { + "epoch": 5.335537542662116, + "grad_norm": 0.12362790119184537, + "learning_rate": 2.6011193446874793e-05, + "loss": 0.9275, + "step": 25013 + }, + { + "epoch": 5.335750853242321, + "grad_norm": 0.10102684335412397, + "learning_rate": 2.6004914007440816e-05, + "loss": 0.8889, + "step": 25014 + }, + { + "epoch": 5.335964163822526, + "grad_norm": 0.12396465603408807, + "learning_rate": 2.5998635178962142e-05, + "loss": 0.8648, + "step": 25015 + }, + { + "epoch": 5.336177474402731, + "grad_norm": 0.11052580880210056, + "learning_rate": 2.5992356961509818e-05, + "loss": 0.8694, + "step": 25016 + }, + { + "epoch": 5.336390784982935, + "grad_norm": 0.11623460360343663, + "learning_rate": 2.598607935515486e-05, + "loss": 0.873, + "step": 25017 + }, + { + "epoch": 5.33660409556314, + "grad_norm": 0.11653833479621879, + "learning_rate": 2.5979802359968352e-05, + "loss": 0.914, + "step": 25018 + }, + { + "epoch": 5.336817406143345, + "grad_norm": 0.12553167257777212, + "learning_rate": 2.597352597602129e-05, + "loss": 0.8827, + "step": 25019 + }, + { + "epoch": 5.337030716723549, + "grad_norm": 0.1228476505181958, + "learning_rate": 2.596725020338468e-05, + "loss": 0.9046, + "step": 25020 + }, + { + "epoch": 5.337244027303754, + "grad_norm": 0.12511033193173876, + "learning_rate": 2.5960975042129527e-05, + "loss": 0.9021, + "step": 25021 + }, + { + "epoch": 5.337457337883959, + "grad_norm": 0.10816398326481308, + "learning_rate": 2.5954700492326873e-05, + "loss": 0.8952, + "step": 25022 + }, + { + "epoch": 5.337670648464163, + "grad_norm": 0.13450827590996142, + "learning_rate": 2.5948426554047688e-05, + "loss": 0.8971, + "step": 25023 + }, + { + "epoch": 5.337883959044369, + "grad_norm": 0.13371301924592116, + "learning_rate": 2.5942153227362967e-05, + "loss": 0.8922, + "step": 25024 + }, + { + "epoch": 5.338097269624574, + "grad_norm": 0.11222220986483873, + "learning_rate": 2.5935880512343693e-05, + "loss": 0.8835, + "step": 25025 + }, + { + "epoch": 5.338310580204778, + "grad_norm": 0.13744686237135165, + "learning_rate": 2.5929608409060842e-05, + "loss": 0.875, + "step": 25026 + }, + { + "epoch": 5.338523890784983, + "grad_norm": 0.11917038482366547, + "learning_rate": 2.5923336917585386e-05, + "loss": 0.8769, + "step": 25027 + }, + { + "epoch": 5.338737201365188, + "grad_norm": 0.12942888619834741, + "learning_rate": 2.5917066037988276e-05, + "loss": 0.8563, + "step": 25028 + }, + { + "epoch": 5.338950511945392, + "grad_norm": 0.11935489066033428, + "learning_rate": 2.5910795770340445e-05, + "loss": 0.9037, + "step": 25029 + }, + { + "epoch": 5.339163822525597, + "grad_norm": 0.1402342274238331, + "learning_rate": 2.5904526114712896e-05, + "loss": 0.8917, + "step": 25030 + }, + { + "epoch": 5.339377133105802, + "grad_norm": 0.13708726410554656, + "learning_rate": 2.5898257071176556e-05, + "loss": 0.9002, + "step": 25031 + }, + { + "epoch": 5.339590443686006, + "grad_norm": 0.14162702923529102, + "learning_rate": 2.5891988639802338e-05, + "loss": 0.8784, + "step": 25032 + }, + { + "epoch": 5.339803754266212, + "grad_norm": 0.13322701646500382, + "learning_rate": 2.5885720820661152e-05, + "loss": 0.8982, + "step": 25033 + }, + { + "epoch": 5.340017064846417, + "grad_norm": 0.1257728551252426, + "learning_rate": 2.5879453613823967e-05, + "loss": 0.8842, + "step": 25034 + }, + { + "epoch": 5.340230375426621, + "grad_norm": 0.13543621715191503, + "learning_rate": 2.587318701936168e-05, + "loss": 0.8844, + "step": 25035 + }, + { + "epoch": 5.340443686006826, + "grad_norm": 0.12403508262467164, + "learning_rate": 2.586692103734519e-05, + "loss": 0.8819, + "step": 25036 + }, + { + "epoch": 5.340656996587031, + "grad_norm": 0.12435240264096938, + "learning_rate": 2.586065566784541e-05, + "loss": 0.8627, + "step": 25037 + }, + { + "epoch": 5.340870307167235, + "grad_norm": 0.13332002601064366, + "learning_rate": 2.5854390910933208e-05, + "loss": 0.9022, + "step": 25038 + }, + { + "epoch": 5.34108361774744, + "grad_norm": 0.12891336067758266, + "learning_rate": 2.584812676667949e-05, + "loss": 0.8655, + "step": 25039 + }, + { + "epoch": 5.341296928327645, + "grad_norm": 0.12856162440886157, + "learning_rate": 2.5841863235155123e-05, + "loss": 0.8798, + "step": 25040 + }, + { + "epoch": 5.3415102389078495, + "grad_norm": 0.12351810417899842, + "learning_rate": 2.5835600316430967e-05, + "loss": 0.879, + "step": 25041 + }, + { + "epoch": 5.341723549488055, + "grad_norm": 0.12865920409320628, + "learning_rate": 2.5829338010577924e-05, + "loss": 0.8608, + "step": 25042 + }, + { + "epoch": 5.34193686006826, + "grad_norm": 0.13249238521439852, + "learning_rate": 2.5823076317666837e-05, + "loss": 0.8568, + "step": 25043 + }, + { + "epoch": 5.342150170648464, + "grad_norm": 0.13058076096567642, + "learning_rate": 2.5816815237768546e-05, + "loss": 0.8774, + "step": 25044 + }, + { + "epoch": 5.342363481228669, + "grad_norm": 0.13719268192373482, + "learning_rate": 2.5810554770953885e-05, + "loss": 0.8943, + "step": 25045 + }, + { + "epoch": 5.342576791808874, + "grad_norm": 0.12952541274409074, + "learning_rate": 2.5804294917293727e-05, + "loss": 0.9132, + "step": 25046 + }, + { + "epoch": 5.3427901023890785, + "grad_norm": 0.12226906865201588, + "learning_rate": 2.5798035676858907e-05, + "loss": 0.9006, + "step": 25047 + }, + { + "epoch": 5.343003412969283, + "grad_norm": 0.14521553621444233, + "learning_rate": 2.5791777049720183e-05, + "loss": 0.9142, + "step": 25048 + }, + { + "epoch": 5.343216723549488, + "grad_norm": 0.12862801589787512, + "learning_rate": 2.5785519035948428e-05, + "loss": 0.8727, + "step": 25049 + }, + { + "epoch": 5.3434300341296925, + "grad_norm": 0.1388969470445441, + "learning_rate": 2.5779261635614444e-05, + "loss": 0.8747, + "step": 25050 + }, + { + "epoch": 5.343643344709897, + "grad_norm": 0.12354574664500508, + "learning_rate": 2.5773004848789026e-05, + "loss": 0.9011, + "step": 25051 + }, + { + "epoch": 5.343856655290103, + "grad_norm": 0.15164980502660041, + "learning_rate": 2.5766748675542975e-05, + "loss": 0.9069, + "step": 25052 + }, + { + "epoch": 5.3440699658703075, + "grad_norm": 0.1312116168173551, + "learning_rate": 2.5760493115947043e-05, + "loss": 0.9299, + "step": 25053 + }, + { + "epoch": 5.344283276450512, + "grad_norm": 0.15167487791029047, + "learning_rate": 2.5754238170072068e-05, + "loss": 0.9067, + "step": 25054 + }, + { + "epoch": 5.344496587030717, + "grad_norm": 0.13172924661399496, + "learning_rate": 2.5747983837988802e-05, + "loss": 0.8939, + "step": 25055 + }, + { + "epoch": 5.3447098976109215, + "grad_norm": 0.1255777728246534, + "learning_rate": 2.574173011976802e-05, + "loss": 0.8948, + "step": 25056 + }, + { + "epoch": 5.344923208191126, + "grad_norm": 0.15660365999679535, + "learning_rate": 2.5735477015480433e-05, + "loss": 0.8968, + "step": 25057 + }, + { + "epoch": 5.345136518771331, + "grad_norm": 0.1417727410771331, + "learning_rate": 2.5729224525196904e-05, + "loss": 0.8981, + "step": 25058 + }, + { + "epoch": 5.345349829351536, + "grad_norm": 0.13096452647477771, + "learning_rate": 2.5722972648988077e-05, + "loss": 0.9133, + "step": 25059 + }, + { + "epoch": 5.34556313993174, + "grad_norm": 0.13341289009282378, + "learning_rate": 2.5716721386924706e-05, + "loss": 0.8685, + "step": 25060 + }, + { + "epoch": 5.345776450511945, + "grad_norm": 0.17099680680868093, + "learning_rate": 2.5710470739077567e-05, + "loss": 0.911, + "step": 25061 + }, + { + "epoch": 5.3459897610921505, + "grad_norm": 0.11096336473294793, + "learning_rate": 2.570422070551737e-05, + "loss": 0.8701, + "step": 25062 + }, + { + "epoch": 5.346203071672355, + "grad_norm": 0.1630478952867607, + "learning_rate": 2.569797128631482e-05, + "loss": 0.8924, + "step": 25063 + }, + { + "epoch": 5.34641638225256, + "grad_norm": 0.12970005676640572, + "learning_rate": 2.5691722481540646e-05, + "loss": 0.8904, + "step": 25064 + }, + { + "epoch": 5.346629692832765, + "grad_norm": 0.13115088566258792, + "learning_rate": 2.568547429126552e-05, + "loss": 0.8765, + "step": 25065 + }, + { + "epoch": 5.346843003412969, + "grad_norm": 0.15493560974555132, + "learning_rate": 2.5679226715560184e-05, + "loss": 0.9078, + "step": 25066 + }, + { + "epoch": 5.347056313993174, + "grad_norm": 0.1274542797830225, + "learning_rate": 2.567297975449532e-05, + "loss": 0.8924, + "step": 25067 + }, + { + "epoch": 5.347269624573379, + "grad_norm": 0.15704734239990215, + "learning_rate": 2.56667334081416e-05, + "loss": 0.8712, + "step": 25068 + }, + { + "epoch": 5.347482935153583, + "grad_norm": 0.1277348188127989, + "learning_rate": 2.5660487676569705e-05, + "loss": 0.8617, + "step": 25069 + }, + { + "epoch": 5.347696245733788, + "grad_norm": 0.14640020479762625, + "learning_rate": 2.565424255985031e-05, + "loss": 0.8823, + "step": 25070 + }, + { + "epoch": 5.347909556313994, + "grad_norm": 0.12134649802117413, + "learning_rate": 2.5647998058054063e-05, + "loss": 0.8731, + "step": 25071 + }, + { + "epoch": 5.348122866894198, + "grad_norm": 0.14435248733814487, + "learning_rate": 2.5641754171251617e-05, + "loss": 0.8792, + "step": 25072 + }, + { + "epoch": 5.348336177474403, + "grad_norm": 0.1414467802504071, + "learning_rate": 2.563551089951366e-05, + "loss": 0.8976, + "step": 25073 + }, + { + "epoch": 5.348549488054608, + "grad_norm": 0.14229585106500653, + "learning_rate": 2.562926824291082e-05, + "loss": 0.8793, + "step": 25074 + }, + { + "epoch": 5.348762798634812, + "grad_norm": 0.13346754134689787, + "learning_rate": 2.5623026201513712e-05, + "loss": 0.8754, + "step": 25075 + }, + { + "epoch": 5.348976109215017, + "grad_norm": 0.14549913178551602, + "learning_rate": 2.5616784775392986e-05, + "loss": 0.8863, + "step": 25076 + }, + { + "epoch": 5.349189419795222, + "grad_norm": 0.11641639163638363, + "learning_rate": 2.5610543964619226e-05, + "loss": 0.8837, + "step": 25077 + }, + { + "epoch": 5.349402730375426, + "grad_norm": 0.12580918285223294, + "learning_rate": 2.560430376926309e-05, + "loss": 0.8793, + "step": 25078 + }, + { + "epoch": 5.349616040955631, + "grad_norm": 0.11292285444286193, + "learning_rate": 2.5598064189395188e-05, + "loss": 0.8939, + "step": 25079 + }, + { + "epoch": 5.349829351535837, + "grad_norm": 0.13014920448156853, + "learning_rate": 2.5591825225086097e-05, + "loss": 0.886, + "step": 25080 + }, + { + "epoch": 5.350042662116041, + "grad_norm": 0.11220794268605588, + "learning_rate": 2.558558687640642e-05, + "loss": 0.8845, + "step": 25081 + }, + { + "epoch": 5.350255972696246, + "grad_norm": 0.11387510269795773, + "learning_rate": 2.557934914342675e-05, + "loss": 0.855, + "step": 25082 + }, + { + "epoch": 5.350469283276451, + "grad_norm": 0.13792832137331412, + "learning_rate": 2.5573112026217652e-05, + "loss": 0.8453, + "step": 25083 + }, + { + "epoch": 5.350682593856655, + "grad_norm": 0.10968533662455485, + "learning_rate": 2.5566875524849684e-05, + "loss": 0.8927, + "step": 25084 + }, + { + "epoch": 5.35089590443686, + "grad_norm": 0.15532839264250262, + "learning_rate": 2.5560639639393453e-05, + "loss": 0.9157, + "step": 25085 + }, + { + "epoch": 5.351109215017065, + "grad_norm": 0.14051114479987997, + "learning_rate": 2.5554404369919513e-05, + "loss": 0.9077, + "step": 25086 + }, + { + "epoch": 5.351322525597269, + "grad_norm": 0.12716989469760626, + "learning_rate": 2.5548169716498396e-05, + "loss": 0.8745, + "step": 25087 + }, + { + "epoch": 5.351535836177474, + "grad_norm": 0.14775107126427947, + "learning_rate": 2.5541935679200656e-05, + "loss": 0.8902, + "step": 25088 + }, + { + "epoch": 5.351749146757679, + "grad_norm": 0.12944638700595779, + "learning_rate": 2.5535702258096803e-05, + "loss": 0.8653, + "step": 25089 + }, + { + "epoch": 5.351962457337884, + "grad_norm": 0.1402397573637897, + "learning_rate": 2.552946945325742e-05, + "loss": 0.8825, + "step": 25090 + }, + { + "epoch": 5.352175767918089, + "grad_norm": 0.14114999255839658, + "learning_rate": 2.552323726475301e-05, + "loss": 0.8993, + "step": 25091 + }, + { + "epoch": 5.352389078498294, + "grad_norm": 0.11957627136244867, + "learning_rate": 2.5517005692654078e-05, + "loss": 0.912, + "step": 25092 + }, + { + "epoch": 5.352602389078498, + "grad_norm": 0.14715419344621367, + "learning_rate": 2.5510774737031155e-05, + "loss": 0.9327, + "step": 25093 + }, + { + "epoch": 5.352815699658703, + "grad_norm": 0.12359656369553841, + "learning_rate": 2.5504544397954723e-05, + "loss": 0.8615, + "step": 25094 + }, + { + "epoch": 5.353029010238908, + "grad_norm": 0.13089842262372373, + "learning_rate": 2.5498314675495292e-05, + "loss": 0.9001, + "step": 25095 + }, + { + "epoch": 5.353242320819112, + "grad_norm": 0.1306620822999463, + "learning_rate": 2.5492085569723327e-05, + "loss": 0.8605, + "step": 25096 + }, + { + "epoch": 5.353455631399317, + "grad_norm": 0.12244113532748988, + "learning_rate": 2.5485857080709346e-05, + "loss": 0.8794, + "step": 25097 + }, + { + "epoch": 5.353668941979522, + "grad_norm": 0.14731103856382968, + "learning_rate": 2.5479629208523813e-05, + "loss": 0.8817, + "step": 25098 + }, + { + "epoch": 5.3538822525597265, + "grad_norm": 0.12634098948779382, + "learning_rate": 2.5473401953237193e-05, + "loss": 0.8979, + "step": 25099 + }, + { + "epoch": 5.354095563139932, + "grad_norm": 0.13263945684616402, + "learning_rate": 2.546717531491995e-05, + "loss": 0.8717, + "step": 25100 + }, + { + "epoch": 5.354308873720137, + "grad_norm": 0.1143444889315538, + "learning_rate": 2.5460949293642507e-05, + "loss": 0.9048, + "step": 25101 + }, + { + "epoch": 5.354522184300341, + "grad_norm": 0.1349830101570032, + "learning_rate": 2.5454723889475358e-05, + "loss": 0.8723, + "step": 25102 + }, + { + "epoch": 5.354735494880546, + "grad_norm": 0.10934937378529073, + "learning_rate": 2.5448499102488963e-05, + "loss": 0.8787, + "step": 25103 + }, + { + "epoch": 5.354948805460751, + "grad_norm": 0.13005957367934426, + "learning_rate": 2.544227493275367e-05, + "loss": 0.8924, + "step": 25104 + }, + { + "epoch": 5.3551621160409555, + "grad_norm": 0.11344091298291241, + "learning_rate": 2.5436051380339963e-05, + "loss": 0.9015, + "step": 25105 + }, + { + "epoch": 5.35537542662116, + "grad_norm": 0.11777401790474022, + "learning_rate": 2.5429828445318263e-05, + "loss": 0.8981, + "step": 25106 + }, + { + "epoch": 5.355588737201365, + "grad_norm": 0.11207983411788833, + "learning_rate": 2.5423606127758974e-05, + "loss": 0.8938, + "step": 25107 + }, + { + "epoch": 5.3558020477815695, + "grad_norm": 0.10640487344903943, + "learning_rate": 2.5417384427732473e-05, + "loss": 0.8894, + "step": 25108 + }, + { + "epoch": 5.356015358361775, + "grad_norm": 0.11936131864893991, + "learning_rate": 2.5411163345309204e-05, + "loss": 0.9018, + "step": 25109 + }, + { + "epoch": 5.35622866894198, + "grad_norm": 0.1094418295158827, + "learning_rate": 2.540494288055954e-05, + "loss": 0.9073, + "step": 25110 + }, + { + "epoch": 5.3564419795221845, + "grad_norm": 0.11092943457408251, + "learning_rate": 2.539872303355388e-05, + "loss": 0.8693, + "step": 25111 + }, + { + "epoch": 5.356655290102389, + "grad_norm": 0.11597444856768217, + "learning_rate": 2.5392503804362578e-05, + "loss": 0.8736, + "step": 25112 + }, + { + "epoch": 5.356868600682594, + "grad_norm": 0.1201397121556447, + "learning_rate": 2.538628519305598e-05, + "loss": 0.9005, + "step": 25113 + }, + { + "epoch": 5.3570819112627985, + "grad_norm": 0.1303071374791088, + "learning_rate": 2.5380067199704542e-05, + "loss": 0.8576, + "step": 25114 + }, + { + "epoch": 5.357295221843003, + "grad_norm": 0.12410547631179766, + "learning_rate": 2.537384982437854e-05, + "loss": 0.8916, + "step": 25115 + }, + { + "epoch": 5.357508532423208, + "grad_norm": 0.12195894048798298, + "learning_rate": 2.536763306714832e-05, + "loss": 0.8615, + "step": 25116 + }, + { + "epoch": 5.357721843003413, + "grad_norm": 0.12854526480119846, + "learning_rate": 2.5361416928084274e-05, + "loss": 0.878, + "step": 25117 + }, + { + "epoch": 5.357935153583618, + "grad_norm": 0.13550574980617058, + "learning_rate": 2.5355201407256714e-05, + "loss": 0.8685, + "step": 25118 + }, + { + "epoch": 5.358148464163823, + "grad_norm": 0.13615273967409156, + "learning_rate": 2.5348986504735972e-05, + "loss": 0.8827, + "step": 25119 + }, + { + "epoch": 5.3583617747440275, + "grad_norm": 0.13878026868460483, + "learning_rate": 2.5342772220592337e-05, + "loss": 0.895, + "step": 25120 + }, + { + "epoch": 5.358575085324232, + "grad_norm": 0.1559088195644144, + "learning_rate": 2.5336558554896185e-05, + "loss": 0.8697, + "step": 25121 + }, + { + "epoch": 5.358788395904437, + "grad_norm": 0.13776221694267535, + "learning_rate": 2.5330345507717793e-05, + "loss": 0.904, + "step": 25122 + }, + { + "epoch": 5.359001706484642, + "grad_norm": 0.12815465470881524, + "learning_rate": 2.5324133079127454e-05, + "loss": 0.8734, + "step": 25123 + }, + { + "epoch": 5.359215017064846, + "grad_norm": 0.12719129055487471, + "learning_rate": 2.5317921269195474e-05, + "loss": 0.8616, + "step": 25124 + }, + { + "epoch": 5.359428327645051, + "grad_norm": 0.12492361768349264, + "learning_rate": 2.5311710077992125e-05, + "loss": 0.8853, + "step": 25125 + }, + { + "epoch": 5.359641638225256, + "grad_norm": 0.12286182587016256, + "learning_rate": 2.5305499505587713e-05, + "loss": 0.8893, + "step": 25126 + }, + { + "epoch": 5.359854948805461, + "grad_norm": 0.1153106049394704, + "learning_rate": 2.5299289552052483e-05, + "loss": 0.8855, + "step": 25127 + }, + { + "epoch": 5.360068259385666, + "grad_norm": 0.1451913662715737, + "learning_rate": 2.529308021745669e-05, + "loss": 0.8601, + "step": 25128 + }, + { + "epoch": 5.360281569965871, + "grad_norm": 0.12064000879542951, + "learning_rate": 2.5286871501870637e-05, + "loss": 0.8923, + "step": 25129 + }, + { + "epoch": 5.360494880546075, + "grad_norm": 0.12510014735372954, + "learning_rate": 2.5280663405364553e-05, + "loss": 0.877, + "step": 25130 + }, + { + "epoch": 5.36070819112628, + "grad_norm": 0.11721701365311052, + "learning_rate": 2.5274455928008684e-05, + "loss": 0.8607, + "step": 25131 + }, + { + "epoch": 5.360921501706485, + "grad_norm": 0.1029726181574373, + "learning_rate": 2.5268249069873242e-05, + "loss": 0.8809, + "step": 25132 + }, + { + "epoch": 5.361134812286689, + "grad_norm": 0.14778126998377017, + "learning_rate": 2.5262042831028505e-05, + "loss": 0.9096, + "step": 25133 + }, + { + "epoch": 5.361348122866894, + "grad_norm": 0.10947777726157751, + "learning_rate": 2.525583721154468e-05, + "loss": 0.9137, + "step": 25134 + }, + { + "epoch": 5.361561433447099, + "grad_norm": 0.13921576589383722, + "learning_rate": 2.5249632211491974e-05, + "loss": 0.8893, + "step": 25135 + }, + { + "epoch": 5.361774744027303, + "grad_norm": 0.11673503320492686, + "learning_rate": 2.52434278309406e-05, + "loss": 0.8603, + "step": 25136 + }, + { + "epoch": 5.361988054607509, + "grad_norm": 0.1391703657281721, + "learning_rate": 2.5237224069960763e-05, + "loss": 0.8886, + "step": 25137 + }, + { + "epoch": 5.362201365187714, + "grad_norm": 0.11217150826437931, + "learning_rate": 2.5231020928622655e-05, + "loss": 0.91, + "step": 25138 + }, + { + "epoch": 5.362414675767918, + "grad_norm": 0.12797424490379541, + "learning_rate": 2.522481840699647e-05, + "loss": 0.904, + "step": 25139 + }, + { + "epoch": 5.362627986348123, + "grad_norm": 0.10590423418937038, + "learning_rate": 2.521861650515236e-05, + "loss": 0.892, + "step": 25140 + }, + { + "epoch": 5.362841296928328, + "grad_norm": 0.1297227276199274, + "learning_rate": 2.5212415223160544e-05, + "loss": 0.9043, + "step": 25141 + }, + { + "epoch": 5.363054607508532, + "grad_norm": 0.11269717431341603, + "learning_rate": 2.520621456109118e-05, + "loss": 0.9102, + "step": 25142 + }, + { + "epoch": 5.363267918088737, + "grad_norm": 0.12698576477250662, + "learning_rate": 2.5200014519014415e-05, + "loss": 0.8717, + "step": 25143 + }, + { + "epoch": 5.363481228668942, + "grad_norm": 0.11312139204544662, + "learning_rate": 2.519381509700038e-05, + "loss": 0.9002, + "step": 25144 + }, + { + "epoch": 5.363694539249146, + "grad_norm": 0.12201818409160085, + "learning_rate": 2.5187616295119286e-05, + "loss": 0.9068, + "step": 25145 + }, + { + "epoch": 5.363907849829351, + "grad_norm": 0.1372669773392686, + "learning_rate": 2.5181418113441227e-05, + "loss": 0.8802, + "step": 25146 + }, + { + "epoch": 5.364121160409557, + "grad_norm": 0.11874213660106187, + "learning_rate": 2.5175220552036343e-05, + "loss": 0.887, + "step": 25147 + }, + { + "epoch": 5.364334470989761, + "grad_norm": 0.14218529947229505, + "learning_rate": 2.5169023610974757e-05, + "loss": 0.9014, + "step": 25148 + }, + { + "epoch": 5.364547781569966, + "grad_norm": 0.11428608122765845, + "learning_rate": 2.5162827290326597e-05, + "loss": 0.8934, + "step": 25149 + }, + { + "epoch": 5.364761092150171, + "grad_norm": 0.14168302988188972, + "learning_rate": 2.515663159016197e-05, + "loss": 0.8848, + "step": 25150 + }, + { + "epoch": 5.364974402730375, + "grad_norm": 0.1205599734637947, + "learning_rate": 2.5150436510550977e-05, + "loss": 0.8836, + "step": 25151 + }, + { + "epoch": 5.36518771331058, + "grad_norm": 0.16447498045632475, + "learning_rate": 2.5144242051563694e-05, + "loss": 0.8835, + "step": 25152 + }, + { + "epoch": 5.365401023890785, + "grad_norm": 0.10936011643183041, + "learning_rate": 2.5138048213270255e-05, + "loss": 0.8896, + "step": 25153 + }, + { + "epoch": 5.3656143344709895, + "grad_norm": 0.15945030912819416, + "learning_rate": 2.513185499574073e-05, + "loss": 0.9307, + "step": 25154 + }, + { + "epoch": 5.365827645051194, + "grad_norm": 0.11494659378930278, + "learning_rate": 2.5125662399045182e-05, + "loss": 0.9057, + "step": 25155 + }, + { + "epoch": 5.3660409556314, + "grad_norm": 0.13856486183068104, + "learning_rate": 2.5119470423253667e-05, + "loss": 0.867, + "step": 25156 + }, + { + "epoch": 5.366254266211604, + "grad_norm": 0.1275811728952546, + "learning_rate": 2.5113279068436293e-05, + "loss": 0.8671, + "step": 25157 + }, + { + "epoch": 5.366467576791809, + "grad_norm": 0.13618138628945428, + "learning_rate": 2.5107088334663086e-05, + "loss": 0.8829, + "step": 25158 + }, + { + "epoch": 5.366680887372014, + "grad_norm": 0.14364503399625994, + "learning_rate": 2.5100898222004127e-05, + "loss": 0.9145, + "step": 25159 + }, + { + "epoch": 5.3668941979522184, + "grad_norm": 0.11645053092439893, + "learning_rate": 2.509470873052937e-05, + "loss": 0.8998, + "step": 25160 + }, + { + "epoch": 5.367107508532423, + "grad_norm": 0.1321350494691117, + "learning_rate": 2.5088519860308936e-05, + "loss": 0.8731, + "step": 25161 + }, + { + "epoch": 5.367320819112628, + "grad_norm": 0.11329148614904845, + "learning_rate": 2.5082331611412824e-05, + "loss": 0.9069, + "step": 25162 + }, + { + "epoch": 5.3675341296928325, + "grad_norm": 0.1172127051406838, + "learning_rate": 2.5076143983911055e-05, + "loss": 0.8956, + "step": 25163 + }, + { + "epoch": 5.367747440273037, + "grad_norm": 0.12654179760001324, + "learning_rate": 2.5069956977873613e-05, + "loss": 0.9058, + "step": 25164 + }, + { + "epoch": 5.367960750853243, + "grad_norm": 0.10853348541045757, + "learning_rate": 2.506377059337056e-05, + "loss": 0.8755, + "step": 25165 + }, + { + "epoch": 5.368174061433447, + "grad_norm": 0.14223724637680107, + "learning_rate": 2.5057584830471866e-05, + "loss": 0.8713, + "step": 25166 + }, + { + "epoch": 5.368387372013652, + "grad_norm": 0.1271995929268378, + "learning_rate": 2.505139968924752e-05, + "loss": 0.8849, + "step": 25167 + }, + { + "epoch": 5.368600682593857, + "grad_norm": 0.1399904578023463, + "learning_rate": 2.5045215169767493e-05, + "loss": 0.9243, + "step": 25168 + }, + { + "epoch": 5.3688139931740615, + "grad_norm": 0.1303692611675867, + "learning_rate": 2.5039031272101796e-05, + "loss": 0.8958, + "step": 25169 + }, + { + "epoch": 5.369027303754266, + "grad_norm": 0.13887221491795532, + "learning_rate": 2.5032847996320414e-05, + "loss": 0.871, + "step": 25170 + }, + { + "epoch": 5.369240614334471, + "grad_norm": 0.13206735547199097, + "learning_rate": 2.5026665342493262e-05, + "loss": 0.8962, + "step": 25171 + }, + { + "epoch": 5.3694539249146755, + "grad_norm": 0.1510432082038811, + "learning_rate": 2.5020483310690288e-05, + "loss": 0.8876, + "step": 25172 + }, + { + "epoch": 5.36966723549488, + "grad_norm": 0.12689561706526084, + "learning_rate": 2.501430190098149e-05, + "loss": 0.9076, + "step": 25173 + }, + { + "epoch": 5.369880546075085, + "grad_norm": 0.14326528658449278, + "learning_rate": 2.5008121113436793e-05, + "loss": 0.8896, + "step": 25174 + }, + { + "epoch": 5.3700938566552905, + "grad_norm": 0.15609905575423377, + "learning_rate": 2.5001940948126134e-05, + "loss": 0.9186, + "step": 25175 + }, + { + "epoch": 5.370307167235495, + "grad_norm": 0.12528683881081565, + "learning_rate": 2.499576140511941e-05, + "loss": 0.8719, + "step": 25176 + }, + { + "epoch": 5.3705204778157, + "grad_norm": 0.15141696807625302, + "learning_rate": 2.49895824844866e-05, + "loss": 0.9029, + "step": 25177 + }, + { + "epoch": 5.3707337883959045, + "grad_norm": 0.11503220586249019, + "learning_rate": 2.4983404186297585e-05, + "loss": 0.9043, + "step": 25178 + }, + { + "epoch": 5.370947098976109, + "grad_norm": 0.14110579430382683, + "learning_rate": 2.497722651062228e-05, + "loss": 0.8695, + "step": 25179 + }, + { + "epoch": 5.371160409556314, + "grad_norm": 0.12692745304306807, + "learning_rate": 2.4971049457530558e-05, + "loss": 0.8906, + "step": 25180 + }, + { + "epoch": 5.371373720136519, + "grad_norm": 0.1246135687593732, + "learning_rate": 2.4964873027092384e-05, + "loss": 0.8813, + "step": 25181 + }, + { + "epoch": 5.371587030716723, + "grad_norm": 0.11653687448287334, + "learning_rate": 2.4958697219377577e-05, + "loss": 0.8603, + "step": 25182 + }, + { + "epoch": 5.371800341296928, + "grad_norm": 0.1365853033783176, + "learning_rate": 2.4952522034456035e-05, + "loss": 0.8655, + "step": 25183 + }, + { + "epoch": 5.372013651877133, + "grad_norm": 0.11601399056542472, + "learning_rate": 2.4946347472397604e-05, + "loss": 0.8928, + "step": 25184 + }, + { + "epoch": 5.372226962457338, + "grad_norm": 0.14520510300436576, + "learning_rate": 2.4940173533272206e-05, + "loss": 0.891, + "step": 25185 + }, + { + "epoch": 5.372440273037543, + "grad_norm": 0.15608274636958644, + "learning_rate": 2.4934000217149665e-05, + "loss": 0.8847, + "step": 25186 + }, + { + "epoch": 5.372653583617748, + "grad_norm": 0.12179587994150286, + "learning_rate": 2.4927827524099843e-05, + "loss": 0.8888, + "step": 25187 + }, + { + "epoch": 5.372866894197952, + "grad_norm": 0.1448157143190353, + "learning_rate": 2.4921655454192547e-05, + "loss": 0.8764, + "step": 25188 + }, + { + "epoch": 5.373080204778157, + "grad_norm": 0.122613306916575, + "learning_rate": 2.4915484007497677e-05, + "loss": 0.8971, + "step": 25189 + }, + { + "epoch": 5.373293515358362, + "grad_norm": 0.12535027640237015, + "learning_rate": 2.490931318408503e-05, + "loss": 0.8866, + "step": 25190 + }, + { + "epoch": 5.373506825938566, + "grad_norm": 0.12376486321279846, + "learning_rate": 2.490314298402444e-05, + "loss": 0.8845, + "step": 25191 + }, + { + "epoch": 5.373720136518771, + "grad_norm": 0.13356499565293067, + "learning_rate": 2.4896973407385705e-05, + "loss": 0.8988, + "step": 25192 + }, + { + "epoch": 5.373933447098976, + "grad_norm": 0.12099386937744369, + "learning_rate": 2.4890804454238646e-05, + "loss": 0.8953, + "step": 25193 + }, + { + "epoch": 5.374146757679181, + "grad_norm": 0.13690861206023075, + "learning_rate": 2.4884636124653056e-05, + "loss": 0.8941, + "step": 25194 + }, + { + "epoch": 5.374360068259386, + "grad_norm": 0.11516731739292212, + "learning_rate": 2.4878468418698738e-05, + "loss": 0.8859, + "step": 25195 + }, + { + "epoch": 5.374573378839591, + "grad_norm": 0.12663878262166622, + "learning_rate": 2.4872301336445456e-05, + "loss": 0.9029, + "step": 25196 + }, + { + "epoch": 5.374786689419795, + "grad_norm": 0.1259587862014678, + "learning_rate": 2.486613487796303e-05, + "loss": 0.8844, + "step": 25197 + }, + { + "epoch": 5.375, + "grad_norm": 0.11219048452895466, + "learning_rate": 2.4859969043321223e-05, + "loss": 0.8794, + "step": 25198 + }, + { + "epoch": 5.375213310580205, + "grad_norm": 0.12083424838070737, + "learning_rate": 2.4853803832589786e-05, + "loss": 0.8539, + "step": 25199 + }, + { + "epoch": 5.375426621160409, + "grad_norm": 0.11350667301342764, + "learning_rate": 2.484763924583846e-05, + "loss": 0.895, + "step": 25200 + }, + { + "epoch": 5.375639931740614, + "grad_norm": 0.11580493073708252, + "learning_rate": 2.484147528313706e-05, + "loss": 0.8886, + "step": 25201 + }, + { + "epoch": 5.375853242320819, + "grad_norm": 0.116751746195301, + "learning_rate": 2.4835311944555285e-05, + "loss": 0.8926, + "step": 25202 + }, + { + "epoch": 5.376066552901024, + "grad_norm": 0.11017870461466743, + "learning_rate": 2.4829149230162887e-05, + "loss": 0.8682, + "step": 25203 + }, + { + "epoch": 5.376279863481229, + "grad_norm": 0.1298976021330644, + "learning_rate": 2.4822987140029593e-05, + "loss": 0.8856, + "step": 25204 + }, + { + "epoch": 5.376493174061434, + "grad_norm": 0.10222266512806191, + "learning_rate": 2.481682567422513e-05, + "loss": 0.8772, + "step": 25205 + }, + { + "epoch": 5.376706484641638, + "grad_norm": 0.12537780620553235, + "learning_rate": 2.4810664832819207e-05, + "loss": 0.8527, + "step": 25206 + }, + { + "epoch": 5.376919795221843, + "grad_norm": 0.1029616552285126, + "learning_rate": 2.480450461588155e-05, + "loss": 0.8797, + "step": 25207 + }, + { + "epoch": 5.377133105802048, + "grad_norm": 0.11158805893446727, + "learning_rate": 2.4798345023481812e-05, + "loss": 0.9183, + "step": 25208 + }, + { + "epoch": 5.377346416382252, + "grad_norm": 0.11030242812786421, + "learning_rate": 2.4792186055689768e-05, + "loss": 0.916, + "step": 25209 + }, + { + "epoch": 5.377559726962457, + "grad_norm": 0.11076595691588507, + "learning_rate": 2.4786027712575056e-05, + "loss": 0.8869, + "step": 25210 + }, + { + "epoch": 5.377773037542662, + "grad_norm": 0.11978790404703668, + "learning_rate": 2.477986999420737e-05, + "loss": 0.8867, + "step": 25211 + }, + { + "epoch": 5.377986348122867, + "grad_norm": 0.1260790992892105, + "learning_rate": 2.4773712900656363e-05, + "loss": 0.911, + "step": 25212 + }, + { + "epoch": 5.378199658703072, + "grad_norm": 0.11317390556216701, + "learning_rate": 2.476755643199175e-05, + "loss": 0.8747, + "step": 25213 + }, + { + "epoch": 5.378412969283277, + "grad_norm": 0.12532894788176432, + "learning_rate": 2.4761400588283165e-05, + "loss": 0.8971, + "step": 25214 + }, + { + "epoch": 5.378626279863481, + "grad_norm": 0.11801253391199092, + "learning_rate": 2.475524536960029e-05, + "loss": 0.8902, + "step": 25215 + }, + { + "epoch": 5.378839590443686, + "grad_norm": 0.12686308398413626, + "learning_rate": 2.474909077601268e-05, + "loss": 0.8851, + "step": 25216 + }, + { + "epoch": 5.379052901023891, + "grad_norm": 0.10684200456619192, + "learning_rate": 2.474293680759007e-05, + "loss": 0.8788, + "step": 25217 + }, + { + "epoch": 5.3792662116040955, + "grad_norm": 0.11987453458581351, + "learning_rate": 2.473678346440206e-05, + "loss": 0.9008, + "step": 25218 + }, + { + "epoch": 5.3794795221843, + "grad_norm": 0.11134969835838907, + "learning_rate": 2.4730630746518277e-05, + "loss": 0.9003, + "step": 25219 + }, + { + "epoch": 5.379692832764505, + "grad_norm": 0.1135804941238402, + "learning_rate": 2.47244786540083e-05, + "loss": 0.8429, + "step": 25220 + }, + { + "epoch": 5.3799061433447095, + "grad_norm": 0.12570686666648895, + "learning_rate": 2.471832718694182e-05, + "loss": 0.8276, + "step": 25221 + }, + { + "epoch": 5.380119453924915, + "grad_norm": 0.11374008348494, + "learning_rate": 2.471217634538839e-05, + "loss": 0.8854, + "step": 25222 + }, + { + "epoch": 5.38033276450512, + "grad_norm": 0.12529520646549328, + "learning_rate": 2.470602612941761e-05, + "loss": 0.8659, + "step": 25223 + }, + { + "epoch": 5.3805460750853245, + "grad_norm": 0.11438567313548967, + "learning_rate": 2.4699876539099047e-05, + "loss": 0.8963, + "step": 25224 + }, + { + "epoch": 5.380759385665529, + "grad_norm": 0.12492765059904662, + "learning_rate": 2.469372757450234e-05, + "loss": 0.8587, + "step": 25225 + }, + { + "epoch": 5.380972696245734, + "grad_norm": 0.13691157046021005, + "learning_rate": 2.4687579235697065e-05, + "loss": 0.8801, + "step": 25226 + }, + { + "epoch": 5.3811860068259385, + "grad_norm": 0.12987022242481466, + "learning_rate": 2.468143152275273e-05, + "loss": 0.8885, + "step": 25227 + }, + { + "epoch": 5.381399317406143, + "grad_norm": 0.13943210068219686, + "learning_rate": 2.4675284435738912e-05, + "loss": 0.9041, + "step": 25228 + }, + { + "epoch": 5.381612627986348, + "grad_norm": 0.11593103459679212, + "learning_rate": 2.4669137974725202e-05, + "loss": 0.8782, + "step": 25229 + }, + { + "epoch": 5.381825938566553, + "grad_norm": 0.15588705781575699, + "learning_rate": 2.466299213978112e-05, + "loss": 0.8959, + "step": 25230 + }, + { + "epoch": 5.382039249146757, + "grad_norm": 0.11662763714565222, + "learning_rate": 2.4656846930976216e-05, + "loss": 0.9072, + "step": 25231 + }, + { + "epoch": 5.382252559726963, + "grad_norm": 0.1345912946789485, + "learning_rate": 2.465070234838e-05, + "loss": 0.9108, + "step": 25232 + }, + { + "epoch": 5.3824658703071675, + "grad_norm": 0.1394305773565634, + "learning_rate": 2.4644558392062033e-05, + "loss": 0.8886, + "step": 25233 + }, + { + "epoch": 5.382679180887372, + "grad_norm": 0.1200866966494992, + "learning_rate": 2.4638415062091827e-05, + "loss": 0.8806, + "step": 25234 + }, + { + "epoch": 5.382892491467577, + "grad_norm": 0.14701736093721635, + "learning_rate": 2.4632272358538884e-05, + "loss": 0.8812, + "step": 25235 + }, + { + "epoch": 5.3831058020477816, + "grad_norm": 0.1312225160491254, + "learning_rate": 2.4626130281472687e-05, + "loss": 0.885, + "step": 25236 + }, + { + "epoch": 5.383319112627986, + "grad_norm": 0.12324415798960678, + "learning_rate": 2.4619988830962803e-05, + "loss": 0.8608, + "step": 25237 + }, + { + "epoch": 5.383532423208191, + "grad_norm": 0.1851481249026515, + "learning_rate": 2.4613848007078654e-05, + "loss": 0.91, + "step": 25238 + }, + { + "epoch": 5.383745733788396, + "grad_norm": 0.11883240974174543, + "learning_rate": 2.460770780988975e-05, + "loss": 0.8621, + "step": 25239 + }, + { + "epoch": 5.3839590443686, + "grad_norm": 0.1619072765780127, + "learning_rate": 2.4601568239465536e-05, + "loss": 0.9233, + "step": 25240 + }, + { + "epoch": 5.384172354948806, + "grad_norm": 0.135561496284347, + "learning_rate": 2.4595429295875538e-05, + "loss": 0.9185, + "step": 25241 + }, + { + "epoch": 5.3843856655290105, + "grad_norm": 0.16601376050776367, + "learning_rate": 2.458929097918918e-05, + "loss": 0.8915, + "step": 25242 + }, + { + "epoch": 5.384598976109215, + "grad_norm": 0.12919760750222686, + "learning_rate": 2.4583153289475936e-05, + "loss": 0.8807, + "step": 25243 + }, + { + "epoch": 5.38481228668942, + "grad_norm": 0.13969498008300393, + "learning_rate": 2.4577016226805217e-05, + "loss": 0.876, + "step": 25244 + }, + { + "epoch": 5.385025597269625, + "grad_norm": 0.13390303944649903, + "learning_rate": 2.4570879791246515e-05, + "loss": 0.8972, + "step": 25245 + }, + { + "epoch": 5.385238907849829, + "grad_norm": 0.1277242439347769, + "learning_rate": 2.4564743982869236e-05, + "loss": 0.8638, + "step": 25246 + }, + { + "epoch": 5.385452218430034, + "grad_norm": 0.11283381249578953, + "learning_rate": 2.4558608801742815e-05, + "loss": 0.8666, + "step": 25247 + }, + { + "epoch": 5.385665529010239, + "grad_norm": 0.11263742415311755, + "learning_rate": 2.4552474247936672e-05, + "loss": 0.8848, + "step": 25248 + }, + { + "epoch": 5.385878839590443, + "grad_norm": 0.1325138336293512, + "learning_rate": 2.4546340321520214e-05, + "loss": 0.8815, + "step": 25249 + }, + { + "epoch": 5.386092150170649, + "grad_norm": 0.10680793656634531, + "learning_rate": 2.4540207022562842e-05, + "loss": 0.874, + "step": 25250 + }, + { + "epoch": 5.386305460750854, + "grad_norm": 0.1485056507320219, + "learning_rate": 2.4534074351133966e-05, + "loss": 0.8778, + "step": 25251 + }, + { + "epoch": 5.386518771331058, + "grad_norm": 0.11650330177872285, + "learning_rate": 2.4527942307302944e-05, + "loss": 0.8916, + "step": 25252 + }, + { + "epoch": 5.386732081911263, + "grad_norm": 0.1555398337656728, + "learning_rate": 2.452181089113922e-05, + "loss": 0.9084, + "step": 25253 + }, + { + "epoch": 5.386945392491468, + "grad_norm": 0.11634179888061143, + "learning_rate": 2.4515680102712135e-05, + "loss": 0.9007, + "step": 25254 + }, + { + "epoch": 5.387158703071672, + "grad_norm": 0.13321134042109828, + "learning_rate": 2.4509549942091058e-05, + "loss": 0.8878, + "step": 25255 + }, + { + "epoch": 5.387372013651877, + "grad_norm": 0.11943673175834832, + "learning_rate": 2.4503420409345332e-05, + "loss": 0.8661, + "step": 25256 + }, + { + "epoch": 5.387585324232082, + "grad_norm": 0.1624318445844602, + "learning_rate": 2.4497291504544373e-05, + "loss": 0.9043, + "step": 25257 + }, + { + "epoch": 5.387798634812286, + "grad_norm": 0.11449230978321556, + "learning_rate": 2.4491163227757485e-05, + "loss": 0.8881, + "step": 25258 + }, + { + "epoch": 5.388011945392491, + "grad_norm": 0.11450582464343818, + "learning_rate": 2.4485035579054016e-05, + "loss": 0.8831, + "step": 25259 + }, + { + "epoch": 5.388225255972697, + "grad_norm": 0.13433454687094473, + "learning_rate": 2.4478908558503312e-05, + "loss": 0.8799, + "step": 25260 + }, + { + "epoch": 5.388438566552901, + "grad_norm": 0.11384005375497691, + "learning_rate": 2.447278216617469e-05, + "loss": 0.8984, + "step": 25261 + }, + { + "epoch": 5.388651877133106, + "grad_norm": 0.14169028142295054, + "learning_rate": 2.4466656402137464e-05, + "loss": 0.8942, + "step": 25262 + }, + { + "epoch": 5.388865187713311, + "grad_norm": 0.1203081185961274, + "learning_rate": 2.4460531266460965e-05, + "loss": 0.8714, + "step": 25263 + }, + { + "epoch": 5.389078498293515, + "grad_norm": 0.14115453925253668, + "learning_rate": 2.445440675921446e-05, + "loss": 0.8536, + "step": 25264 + }, + { + "epoch": 5.38929180887372, + "grad_norm": 0.11478434545704805, + "learning_rate": 2.444828288046729e-05, + "loss": 0.899, + "step": 25265 + }, + { + "epoch": 5.389505119453925, + "grad_norm": 0.1218407685124129, + "learning_rate": 2.444215963028875e-05, + "loss": 0.8879, + "step": 25266 + }, + { + "epoch": 5.389718430034129, + "grad_norm": 0.11850263303425487, + "learning_rate": 2.4436037008748104e-05, + "loss": 0.9157, + "step": 25267 + }, + { + "epoch": 5.389931740614334, + "grad_norm": 0.11871094464481943, + "learning_rate": 2.4429915015914603e-05, + "loss": 0.8867, + "step": 25268 + }, + { + "epoch": 5.390145051194539, + "grad_norm": 0.12130883251198288, + "learning_rate": 2.442379365185758e-05, + "loss": 0.9001, + "step": 25269 + }, + { + "epoch": 5.390358361774744, + "grad_norm": 0.10974190155445915, + "learning_rate": 2.4417672916646264e-05, + "loss": 0.8728, + "step": 25270 + }, + { + "epoch": 5.390571672354949, + "grad_norm": 0.12402276474300707, + "learning_rate": 2.4411552810349913e-05, + "loss": 0.9029, + "step": 25271 + }, + { + "epoch": 5.390784982935154, + "grad_norm": 0.1133060339777519, + "learning_rate": 2.4405433333037785e-05, + "loss": 0.8881, + "step": 25272 + }, + { + "epoch": 5.390998293515358, + "grad_norm": 0.10864976558336226, + "learning_rate": 2.4399314484779108e-05, + "loss": 0.8801, + "step": 25273 + }, + { + "epoch": 5.391211604095563, + "grad_norm": 0.11864867671086568, + "learning_rate": 2.439319626564312e-05, + "loss": 0.9116, + "step": 25274 + }, + { + "epoch": 5.391424914675768, + "grad_norm": 0.10501461404580298, + "learning_rate": 2.438707867569905e-05, + "loss": 0.8816, + "step": 25275 + }, + { + "epoch": 5.3916382252559725, + "grad_norm": 0.11358886506078805, + "learning_rate": 2.4380961715016102e-05, + "loss": 0.8692, + "step": 25276 + }, + { + "epoch": 5.391851535836177, + "grad_norm": 0.13456445973741454, + "learning_rate": 2.437484538366353e-05, + "loss": 0.8691, + "step": 25277 + }, + { + "epoch": 5.392064846416382, + "grad_norm": 0.11453399582328289, + "learning_rate": 2.4368729681710512e-05, + "loss": 0.9045, + "step": 25278 + }, + { + "epoch": 5.392278156996587, + "grad_norm": 0.12054260902344037, + "learning_rate": 2.4362614609226266e-05, + "loss": 0.8783, + "step": 25279 + }, + { + "epoch": 5.392491467576792, + "grad_norm": 0.12669119681638663, + "learning_rate": 2.4356500166279933e-05, + "loss": 0.9112, + "step": 25280 + }, + { + "epoch": 5.392704778156997, + "grad_norm": 0.11677695526846776, + "learning_rate": 2.435038635294077e-05, + "loss": 0.9094, + "step": 25281 + }, + { + "epoch": 5.3929180887372015, + "grad_norm": 0.1251430107390608, + "learning_rate": 2.434427316927795e-05, + "loss": 0.8841, + "step": 25282 + }, + { + "epoch": 5.393131399317406, + "grad_norm": 0.1201561466903124, + "learning_rate": 2.4338160615360557e-05, + "loss": 0.8504, + "step": 25283 + }, + { + "epoch": 5.393344709897611, + "grad_norm": 0.12595453748860327, + "learning_rate": 2.433204869125784e-05, + "loss": 0.8857, + "step": 25284 + }, + { + "epoch": 5.3935580204778155, + "grad_norm": 0.12928274485063943, + "learning_rate": 2.4325937397038927e-05, + "loss": 0.8665, + "step": 25285 + }, + { + "epoch": 5.39377133105802, + "grad_norm": 0.12597404033707713, + "learning_rate": 2.4319826732772968e-05, + "loss": 0.9146, + "step": 25286 + }, + { + "epoch": 5.393984641638225, + "grad_norm": 0.13538081287121811, + "learning_rate": 2.4313716698529116e-05, + "loss": 0.8955, + "step": 25287 + }, + { + "epoch": 5.3941979522184305, + "grad_norm": 0.11232874769200389, + "learning_rate": 2.4307607294376458e-05, + "loss": 0.8848, + "step": 25288 + }, + { + "epoch": 5.394411262798635, + "grad_norm": 0.12899429531512133, + "learning_rate": 2.4301498520384197e-05, + "loss": 0.8816, + "step": 25289 + }, + { + "epoch": 5.39462457337884, + "grad_norm": 0.11373389618072555, + "learning_rate": 2.4295390376621413e-05, + "loss": 0.8765, + "step": 25290 + }, + { + "epoch": 5.3948378839590445, + "grad_norm": 0.13145395393956205, + "learning_rate": 2.4289282863157214e-05, + "loss": 0.9129, + "step": 25291 + }, + { + "epoch": 5.395051194539249, + "grad_norm": 0.10923500210518945, + "learning_rate": 2.4283175980060705e-05, + "loss": 0.8587, + "step": 25292 + }, + { + "epoch": 5.395264505119454, + "grad_norm": 0.13920511915370892, + "learning_rate": 2.4277069727401043e-05, + "loss": 0.868, + "step": 25293 + }, + { + "epoch": 5.395477815699659, + "grad_norm": 0.12481732943328569, + "learning_rate": 2.4270964105247242e-05, + "loss": 0.8881, + "step": 25294 + }, + { + "epoch": 5.395691126279863, + "grad_norm": 0.1196327904146778, + "learning_rate": 2.4264859113668403e-05, + "loss": 0.8881, + "step": 25295 + }, + { + "epoch": 5.395904436860068, + "grad_norm": 0.12191867614749156, + "learning_rate": 2.4258754752733643e-05, + "loss": 0.8974, + "step": 25296 + }, + { + "epoch": 5.3961177474402735, + "grad_norm": 0.12145887355547766, + "learning_rate": 2.425265102251201e-05, + "loss": 0.8992, + "step": 25297 + }, + { + "epoch": 5.396331058020478, + "grad_norm": 0.13421510010925058, + "learning_rate": 2.4246547923072576e-05, + "loss": 0.8791, + "step": 25298 + }, + { + "epoch": 5.396544368600683, + "grad_norm": 0.13058465317152443, + "learning_rate": 2.424044545448438e-05, + "loss": 0.9049, + "step": 25299 + }, + { + "epoch": 5.396757679180888, + "grad_norm": 0.1183555938786481, + "learning_rate": 2.423434361681646e-05, + "loss": 0.8796, + "step": 25300 + }, + { + "epoch": 5.396970989761092, + "grad_norm": 0.13267915857752638, + "learning_rate": 2.4228242410137908e-05, + "loss": 0.8974, + "step": 25301 + }, + { + "epoch": 5.397184300341297, + "grad_norm": 0.15930522616226114, + "learning_rate": 2.4222141834517726e-05, + "loss": 0.8914, + "step": 25302 + }, + { + "epoch": 5.397397610921502, + "grad_norm": 0.10669703329842455, + "learning_rate": 2.421604189002495e-05, + "loss": 0.8678, + "step": 25303 + }, + { + "epoch": 5.397610921501706, + "grad_norm": 0.13084429624365698, + "learning_rate": 2.4209942576728594e-05, + "loss": 0.8797, + "step": 25304 + }, + { + "epoch": 5.397824232081911, + "grad_norm": 0.10952444716608259, + "learning_rate": 2.420384389469768e-05, + "loss": 0.907, + "step": 25305 + }, + { + "epoch": 5.398037542662116, + "grad_norm": 0.1455836460820557, + "learning_rate": 2.4197745844001217e-05, + "loss": 0.8999, + "step": 25306 + }, + { + "epoch": 5.398250853242321, + "grad_norm": 0.1225318899611953, + "learning_rate": 2.4191648424708163e-05, + "loss": 0.904, + "step": 25307 + }, + { + "epoch": 5.398464163822526, + "grad_norm": 0.10692836128882237, + "learning_rate": 2.4185551636887573e-05, + "loss": 0.8981, + "step": 25308 + }, + { + "epoch": 5.398677474402731, + "grad_norm": 0.11444636212200414, + "learning_rate": 2.417945548060841e-05, + "loss": 0.8985, + "step": 25309 + }, + { + "epoch": 5.398890784982935, + "grad_norm": 0.1237269860730103, + "learning_rate": 2.4173359955939645e-05, + "loss": 0.8798, + "step": 25310 + }, + { + "epoch": 5.39910409556314, + "grad_norm": 0.12204143829129048, + "learning_rate": 2.4167265062950244e-05, + "loss": 0.8613, + "step": 25311 + }, + { + "epoch": 5.399317406143345, + "grad_norm": 0.11818246366179098, + "learning_rate": 2.4161170801709163e-05, + "loss": 0.9096, + "step": 25312 + }, + { + "epoch": 5.399530716723549, + "grad_norm": 0.13349436967023656, + "learning_rate": 2.4155077172285396e-05, + "loss": 0.9157, + "step": 25313 + }, + { + "epoch": 5.399744027303754, + "grad_norm": 0.12381519551162179, + "learning_rate": 2.4148984174747866e-05, + "loss": 0.915, + "step": 25314 + }, + { + "epoch": 5.399957337883959, + "grad_norm": 0.13150276783622553, + "learning_rate": 2.4142891809165524e-05, + "loss": 0.883, + "step": 25315 + }, + { + "epoch": 5.400170648464163, + "grad_norm": 0.12364664717290198, + "learning_rate": 2.4136800075607302e-05, + "loss": 0.8952, + "step": 25316 + }, + { + "epoch": 5.400383959044369, + "grad_norm": 0.11670188369224765, + "learning_rate": 2.4130708974142118e-05, + "loss": 0.8912, + "step": 25317 + }, + { + "epoch": 5.400597269624574, + "grad_norm": 0.1392903348711812, + "learning_rate": 2.412461850483891e-05, + "loss": 0.8905, + "step": 25318 + }, + { + "epoch": 5.400810580204778, + "grad_norm": 0.14297718666029122, + "learning_rate": 2.4118528667766555e-05, + "loss": 0.8827, + "step": 25319 + }, + { + "epoch": 5.401023890784983, + "grad_norm": 0.12198385084059979, + "learning_rate": 2.4112439462994016e-05, + "loss": 0.9211, + "step": 25320 + }, + { + "epoch": 5.401237201365188, + "grad_norm": 0.12395407011091022, + "learning_rate": 2.4106350890590163e-05, + "loss": 0.8864, + "step": 25321 + }, + { + "epoch": 5.401450511945392, + "grad_norm": 0.1300829773891754, + "learning_rate": 2.410026295062389e-05, + "loss": 0.8974, + "step": 25322 + }, + { + "epoch": 5.401663822525597, + "grad_norm": 0.13149979610714374, + "learning_rate": 2.4094175643164082e-05, + "loss": 0.877, + "step": 25323 + }, + { + "epoch": 5.401877133105802, + "grad_norm": 0.13052574557609195, + "learning_rate": 2.4088088968279592e-05, + "loss": 0.8722, + "step": 25324 + }, + { + "epoch": 5.402090443686006, + "grad_norm": 0.1294661774040806, + "learning_rate": 2.4082002926039342e-05, + "loss": 0.9117, + "step": 25325 + }, + { + "epoch": 5.402303754266212, + "grad_norm": 0.12809063341628374, + "learning_rate": 2.407591751651216e-05, + "loss": 0.8971, + "step": 25326 + }, + { + "epoch": 5.402517064846417, + "grad_norm": 0.12707409681061502, + "learning_rate": 2.4069832739766926e-05, + "loss": 0.8637, + "step": 25327 + }, + { + "epoch": 5.402730375426621, + "grad_norm": 0.11896226402773763, + "learning_rate": 2.4063748595872465e-05, + "loss": 0.9158, + "step": 25328 + }, + { + "epoch": 5.402943686006826, + "grad_norm": 0.13862863300257539, + "learning_rate": 2.4057665084897634e-05, + "loss": 0.8692, + "step": 25329 + }, + { + "epoch": 5.403156996587031, + "grad_norm": 0.1263023616687029, + "learning_rate": 2.4051582206911263e-05, + "loss": 0.8934, + "step": 25330 + }, + { + "epoch": 5.403370307167235, + "grad_norm": 0.12302462057398222, + "learning_rate": 2.4045499961982158e-05, + "loss": 0.8847, + "step": 25331 + }, + { + "epoch": 5.40358361774744, + "grad_norm": 0.13666907709930937, + "learning_rate": 2.4039418350179177e-05, + "loss": 0.8681, + "step": 25332 + }, + { + "epoch": 5.403796928327645, + "grad_norm": 0.12313749731957245, + "learning_rate": 2.403333737157113e-05, + "loss": 0.9219, + "step": 25333 + }, + { + "epoch": 5.4040102389078495, + "grad_norm": 0.12180948871153499, + "learning_rate": 2.4027257026226812e-05, + "loss": 0.9027, + "step": 25334 + }, + { + "epoch": 5.404223549488055, + "grad_norm": 0.1282920840602814, + "learning_rate": 2.4021177314215017e-05, + "loss": 0.8921, + "step": 25335 + }, + { + "epoch": 5.40443686006826, + "grad_norm": 0.12405054205865511, + "learning_rate": 2.401509823560452e-05, + "loss": 0.8854, + "step": 25336 + }, + { + "epoch": 5.404650170648464, + "grad_norm": 0.12153713138074047, + "learning_rate": 2.400901979046415e-05, + "loss": 0.9022, + "step": 25337 + }, + { + "epoch": 5.404863481228669, + "grad_norm": 0.1338770516167368, + "learning_rate": 2.400294197886269e-05, + "loss": 0.8974, + "step": 25338 + }, + { + "epoch": 5.405076791808874, + "grad_norm": 0.12068607151748906, + "learning_rate": 2.3996864800868834e-05, + "loss": 0.8774, + "step": 25339 + }, + { + "epoch": 5.4052901023890785, + "grad_norm": 0.11184964270044087, + "learning_rate": 2.399078825655142e-05, + "loss": 0.8803, + "step": 25340 + }, + { + "epoch": 5.405503412969283, + "grad_norm": 0.12807798476777651, + "learning_rate": 2.398471234597917e-05, + "loss": 0.899, + "step": 25341 + }, + { + "epoch": 5.405716723549488, + "grad_norm": 0.11853340806454302, + "learning_rate": 2.3978637069220856e-05, + "loss": 0.8859, + "step": 25342 + }, + { + "epoch": 5.4059300341296925, + "grad_norm": 0.12178175362585639, + "learning_rate": 2.3972562426345175e-05, + "loss": 0.8876, + "step": 25343 + }, + { + "epoch": 5.406143344709897, + "grad_norm": 0.1512786817054329, + "learning_rate": 2.396648841742091e-05, + "loss": 0.8777, + "step": 25344 + }, + { + "epoch": 5.406356655290103, + "grad_norm": 0.11312443143307332, + "learning_rate": 2.396041504251678e-05, + "loss": 0.8917, + "step": 25345 + }, + { + "epoch": 5.4065699658703075, + "grad_norm": 0.14150079402881313, + "learning_rate": 2.3954342301701498e-05, + "loss": 0.8747, + "step": 25346 + }, + { + "epoch": 5.406783276450512, + "grad_norm": 0.11142753438582666, + "learning_rate": 2.3948270195043773e-05, + "loss": 0.8776, + "step": 25347 + }, + { + "epoch": 5.406996587030717, + "grad_norm": 0.13167102799378044, + "learning_rate": 2.3942198722612285e-05, + "loss": 0.8555, + "step": 25348 + }, + { + "epoch": 5.4072098976109215, + "grad_norm": 0.1083705961859697, + "learning_rate": 2.393612788447581e-05, + "loss": 0.8512, + "step": 25349 + }, + { + "epoch": 5.407423208191126, + "grad_norm": 0.13297267561898962, + "learning_rate": 2.3930057680702977e-05, + "loss": 0.8637, + "step": 25350 + }, + { + "epoch": 5.407636518771331, + "grad_norm": 0.11633597663747428, + "learning_rate": 2.3923988111362448e-05, + "loss": 0.8725, + "step": 25351 + }, + { + "epoch": 5.407849829351536, + "grad_norm": 0.13837794373748674, + "learning_rate": 2.391791917652297e-05, + "loss": 0.8752, + "step": 25352 + }, + { + "epoch": 5.40806313993174, + "grad_norm": 0.11898746889976623, + "learning_rate": 2.3911850876253172e-05, + "loss": 0.8729, + "step": 25353 + }, + { + "epoch": 5.408276450511945, + "grad_norm": 0.1306599233400291, + "learning_rate": 2.3905783210621733e-05, + "loss": 0.9045, + "step": 25354 + }, + { + "epoch": 5.4084897610921505, + "grad_norm": 0.12987174833229811, + "learning_rate": 2.3899716179697272e-05, + "loss": 0.8999, + "step": 25355 + }, + { + "epoch": 5.408703071672355, + "grad_norm": 0.13351522918948341, + "learning_rate": 2.3893649783548484e-05, + "loss": 0.9174, + "step": 25356 + }, + { + "epoch": 5.40891638225256, + "grad_norm": 0.12463282336739263, + "learning_rate": 2.3887584022243998e-05, + "loss": 0.9098, + "step": 25357 + }, + { + "epoch": 5.409129692832765, + "grad_norm": 0.12170582145743063, + "learning_rate": 2.3881518895852444e-05, + "loss": 0.8555, + "step": 25358 + }, + { + "epoch": 5.409343003412969, + "grad_norm": 0.1279524164347781, + "learning_rate": 2.387545440444245e-05, + "loss": 0.9056, + "step": 25359 + }, + { + "epoch": 5.409556313993174, + "grad_norm": 0.1096359087977269, + "learning_rate": 2.3869390548082628e-05, + "loss": 0.907, + "step": 25360 + }, + { + "epoch": 5.409769624573379, + "grad_norm": 0.13299307858244855, + "learning_rate": 2.38633273268416e-05, + "loss": 0.9039, + "step": 25361 + }, + { + "epoch": 5.409982935153583, + "grad_norm": 0.10443260223344408, + "learning_rate": 2.385726474078797e-05, + "loss": 0.9107, + "step": 25362 + }, + { + "epoch": 5.410196245733788, + "grad_norm": 0.12856912551692726, + "learning_rate": 2.38512027899903e-05, + "loss": 0.9362, + "step": 25363 + }, + { + "epoch": 5.410409556313994, + "grad_norm": 0.10435432603495429, + "learning_rate": 2.384514147451725e-05, + "loss": 0.8684, + "step": 25364 + }, + { + "epoch": 5.410622866894198, + "grad_norm": 0.13067481681917945, + "learning_rate": 2.383908079443736e-05, + "loss": 0.9075, + "step": 25365 + }, + { + "epoch": 5.410836177474403, + "grad_norm": 0.12231145353704344, + "learning_rate": 2.3833020749819218e-05, + "loss": 0.9063, + "step": 25366 + }, + { + "epoch": 5.411049488054608, + "grad_norm": 0.13267342366846516, + "learning_rate": 2.382696134073137e-05, + "loss": 0.8897, + "step": 25367 + }, + { + "epoch": 5.411262798634812, + "grad_norm": 0.1380981094967519, + "learning_rate": 2.382090256724242e-05, + "loss": 0.8864, + "step": 25368 + }, + { + "epoch": 5.411476109215017, + "grad_norm": 0.10953033926810073, + "learning_rate": 2.3814844429420908e-05, + "loss": 0.9134, + "step": 25369 + }, + { + "epoch": 5.411689419795222, + "grad_norm": 0.13608073009416646, + "learning_rate": 2.3808786927335367e-05, + "loss": 0.8709, + "step": 25370 + }, + { + "epoch": 5.411902730375426, + "grad_norm": 0.09994344844366104, + "learning_rate": 2.3802730061054352e-05, + "loss": 0.9144, + "step": 25371 + }, + { + "epoch": 5.412116040955631, + "grad_norm": 0.11502352988837394, + "learning_rate": 2.379667383064639e-05, + "loss": 0.8783, + "step": 25372 + }, + { + "epoch": 5.412329351535837, + "grad_norm": 0.12566986285951662, + "learning_rate": 2.3790618236180018e-05, + "loss": 0.8997, + "step": 25373 + }, + { + "epoch": 5.412542662116041, + "grad_norm": 0.11538088263669657, + "learning_rate": 2.3784563277723738e-05, + "loss": 0.8575, + "step": 25374 + }, + { + "epoch": 5.412755972696246, + "grad_norm": 0.15313828164416848, + "learning_rate": 2.3778508955346043e-05, + "loss": 0.931, + "step": 25375 + }, + { + "epoch": 5.412969283276451, + "grad_norm": 0.11825662835034463, + "learning_rate": 2.3772455269115496e-05, + "loss": 0.8718, + "step": 25376 + }, + { + "epoch": 5.413182593856655, + "grad_norm": 0.11532339114434469, + "learning_rate": 2.376640221910056e-05, + "loss": 0.8825, + "step": 25377 + }, + { + "epoch": 5.41339590443686, + "grad_norm": 0.12843604100932318, + "learning_rate": 2.3760349805369725e-05, + "loss": 0.8828, + "step": 25378 + }, + { + "epoch": 5.413609215017065, + "grad_norm": 0.10926204431182479, + "learning_rate": 2.375429802799146e-05, + "loss": 0.8827, + "step": 25379 + }, + { + "epoch": 5.413822525597269, + "grad_norm": 0.13677138655150942, + "learning_rate": 2.374824688703428e-05, + "loss": 0.864, + "step": 25380 + }, + { + "epoch": 5.414035836177474, + "grad_norm": 0.11187018954477342, + "learning_rate": 2.374219638256663e-05, + "loss": 0.8905, + "step": 25381 + }, + { + "epoch": 5.414249146757679, + "grad_norm": 0.1410197967121332, + "learning_rate": 2.3736146514656975e-05, + "loss": 0.8776, + "step": 25382 + }, + { + "epoch": 5.414462457337884, + "grad_norm": 0.11507713234796246, + "learning_rate": 2.3730097283373778e-05, + "loss": 0.8934, + "step": 25383 + }, + { + "epoch": 5.414675767918089, + "grad_norm": 0.1353250855635002, + "learning_rate": 2.3724048688785464e-05, + "loss": 0.9067, + "step": 25384 + }, + { + "epoch": 5.414889078498294, + "grad_norm": 0.128271418608666, + "learning_rate": 2.3718000730960498e-05, + "loss": 0.8703, + "step": 25385 + }, + { + "epoch": 5.415102389078498, + "grad_norm": 0.1309343571139305, + "learning_rate": 2.371195340996729e-05, + "loss": 0.8846, + "step": 25386 + }, + { + "epoch": 5.415315699658703, + "grad_norm": 0.14650192538372414, + "learning_rate": 2.370590672587425e-05, + "loss": 0.8982, + "step": 25387 + }, + { + "epoch": 5.415529010238908, + "grad_norm": 0.11897100498979382, + "learning_rate": 2.3699860678749856e-05, + "loss": 0.8912, + "step": 25388 + }, + { + "epoch": 5.415742320819112, + "grad_norm": 0.1442086946981894, + "learning_rate": 2.3693815268662484e-05, + "loss": 0.8813, + "step": 25389 + }, + { + "epoch": 5.415955631399317, + "grad_norm": 0.12051904335463812, + "learning_rate": 2.3687770495680535e-05, + "loss": 0.8799, + "step": 25390 + }, + { + "epoch": 5.416168941979522, + "grad_norm": 0.1497752352669307, + "learning_rate": 2.3681726359872385e-05, + "loss": 0.9022, + "step": 25391 + }, + { + "epoch": 5.4163822525597265, + "grad_norm": 0.1161039335833536, + "learning_rate": 2.3675682861306477e-05, + "loss": 0.8984, + "step": 25392 + }, + { + "epoch": 5.416595563139932, + "grad_norm": 0.15381746663609724, + "learning_rate": 2.3669640000051165e-05, + "loss": 0.8816, + "step": 25393 + }, + { + "epoch": 5.416808873720137, + "grad_norm": 0.12728778122257575, + "learning_rate": 2.3663597776174843e-05, + "loss": 0.8653, + "step": 25394 + }, + { + "epoch": 5.417022184300341, + "grad_norm": 0.13580910625454967, + "learning_rate": 2.365755618974581e-05, + "loss": 0.9066, + "step": 25395 + }, + { + "epoch": 5.417235494880546, + "grad_norm": 0.15508927345498322, + "learning_rate": 2.3651515240832507e-05, + "loss": 0.8897, + "step": 25396 + }, + { + "epoch": 5.417448805460751, + "grad_norm": 0.1340675625493045, + "learning_rate": 2.364547492950326e-05, + "loss": 0.8945, + "step": 25397 + }, + { + "epoch": 5.4176621160409555, + "grad_norm": 0.14869085873041551, + "learning_rate": 2.3639435255826405e-05, + "loss": 0.9166, + "step": 25398 + }, + { + "epoch": 5.41787542662116, + "grad_norm": 0.1533018645966718, + "learning_rate": 2.3633396219870263e-05, + "loss": 0.8968, + "step": 25399 + }, + { + "epoch": 5.418088737201365, + "grad_norm": 0.14255320290579804, + "learning_rate": 2.3627357821703222e-05, + "loss": 0.8814, + "step": 25400 + }, + { + "epoch": 5.4183020477815695, + "grad_norm": 0.15540137615359564, + "learning_rate": 2.3621320061393566e-05, + "loss": 0.8669, + "step": 25401 + }, + { + "epoch": 5.418515358361775, + "grad_norm": 0.1226217199100409, + "learning_rate": 2.361528293900963e-05, + "loss": 0.9114, + "step": 25402 + }, + { + "epoch": 5.41872866894198, + "grad_norm": 0.14204618662992796, + "learning_rate": 2.3609246454619693e-05, + "loss": 0.8832, + "step": 25403 + }, + { + "epoch": 5.4189419795221845, + "grad_norm": 0.1443146972987076, + "learning_rate": 2.3603210608292102e-05, + "loss": 0.8883, + "step": 25404 + }, + { + "epoch": 5.419155290102389, + "grad_norm": 0.14793622916059748, + "learning_rate": 2.3597175400095152e-05, + "loss": 0.9087, + "step": 25405 + }, + { + "epoch": 5.419368600682594, + "grad_norm": 0.15154755934560066, + "learning_rate": 2.3591140830097094e-05, + "loss": 0.9118, + "step": 25406 + }, + { + "epoch": 5.4195819112627985, + "grad_norm": 0.14107781063275882, + "learning_rate": 2.3585106898366192e-05, + "loss": 0.9084, + "step": 25407 + }, + { + "epoch": 5.419795221843003, + "grad_norm": 0.1691915434343511, + "learning_rate": 2.3579073604970784e-05, + "loss": 0.8679, + "step": 25408 + }, + { + "epoch": 5.420008532423208, + "grad_norm": 0.13507500785749546, + "learning_rate": 2.357304094997911e-05, + "loss": 0.8885, + "step": 25409 + }, + { + "epoch": 5.420221843003413, + "grad_norm": 0.1445813759680493, + "learning_rate": 2.3567008933459424e-05, + "loss": 0.8667, + "step": 25410 + }, + { + "epoch": 5.420435153583618, + "grad_norm": 0.13960304430655635, + "learning_rate": 2.356097755547996e-05, + "loss": 0.9004, + "step": 25411 + }, + { + "epoch": 5.420648464163823, + "grad_norm": 0.13532102830400003, + "learning_rate": 2.3554946816109007e-05, + "loss": 0.9031, + "step": 25412 + }, + { + "epoch": 5.4208617747440275, + "grad_norm": 0.11415269988111665, + "learning_rate": 2.3548916715414772e-05, + "loss": 0.8832, + "step": 25413 + }, + { + "epoch": 5.421075085324232, + "grad_norm": 0.13519772088251575, + "learning_rate": 2.3542887253465504e-05, + "loss": 0.8696, + "step": 25414 + }, + { + "epoch": 5.421288395904437, + "grad_norm": 0.11539391490533316, + "learning_rate": 2.3536858430329393e-05, + "loss": 0.8998, + "step": 25415 + }, + { + "epoch": 5.421501706484642, + "grad_norm": 0.14230587247733567, + "learning_rate": 2.3530830246074717e-05, + "loss": 0.8968, + "step": 25416 + }, + { + "epoch": 5.421715017064846, + "grad_norm": 0.126710110946048, + "learning_rate": 2.3524802700769627e-05, + "loss": 0.9057, + "step": 25417 + }, + { + "epoch": 5.421928327645051, + "grad_norm": 0.12482009084974996, + "learning_rate": 2.3518775794482353e-05, + "loss": 0.9165, + "step": 25418 + }, + { + "epoch": 5.422141638225256, + "grad_norm": 0.11500771393084422, + "learning_rate": 2.3512749527281046e-05, + "loss": 0.8818, + "step": 25419 + }, + { + "epoch": 5.422354948805461, + "grad_norm": 0.14269925076986872, + "learning_rate": 2.3506723899233954e-05, + "loss": 0.8821, + "step": 25420 + }, + { + "epoch": 5.422568259385666, + "grad_norm": 0.10983624348637573, + "learning_rate": 2.3500698910409234e-05, + "loss": 0.8802, + "step": 25421 + }, + { + "epoch": 5.422781569965871, + "grad_norm": 0.15752891199560787, + "learning_rate": 2.3494674560875056e-05, + "loss": 0.8806, + "step": 25422 + }, + { + "epoch": 5.422994880546075, + "grad_norm": 0.12168041096419986, + "learning_rate": 2.3488650850699557e-05, + "loss": 0.9149, + "step": 25423 + }, + { + "epoch": 5.42320819112628, + "grad_norm": 0.14572033768488687, + "learning_rate": 2.348262777995095e-05, + "loss": 0.9013, + "step": 25424 + }, + { + "epoch": 5.423421501706485, + "grad_norm": 0.11802743378151578, + "learning_rate": 2.3476605348697364e-05, + "loss": 0.8678, + "step": 25425 + }, + { + "epoch": 5.423634812286689, + "grad_norm": 0.12537444916738127, + "learning_rate": 2.3470583557006944e-05, + "loss": 0.8812, + "step": 25426 + }, + { + "epoch": 5.423848122866894, + "grad_norm": 0.12887492088282484, + "learning_rate": 2.3464562404947816e-05, + "loss": 0.9039, + "step": 25427 + }, + { + "epoch": 5.424061433447099, + "grad_norm": 0.11147345428668136, + "learning_rate": 2.3458541892588115e-05, + "loss": 0.8691, + "step": 25428 + }, + { + "epoch": 5.424274744027303, + "grad_norm": 0.12253399472181926, + "learning_rate": 2.3452522019995964e-05, + "loss": 0.8836, + "step": 25429 + }, + { + "epoch": 5.424488054607509, + "grad_norm": 0.11409693473938408, + "learning_rate": 2.3446502787239477e-05, + "loss": 0.8938, + "step": 25430 + }, + { + "epoch": 5.424701365187714, + "grad_norm": 0.12164164034554438, + "learning_rate": 2.3440484194386733e-05, + "loss": 0.8716, + "step": 25431 + }, + { + "epoch": 5.424914675767918, + "grad_norm": 0.12333165593484859, + "learning_rate": 2.3434466241505893e-05, + "loss": 0.8646, + "step": 25432 + }, + { + "epoch": 5.425127986348123, + "grad_norm": 0.12170731893151793, + "learning_rate": 2.342844892866502e-05, + "loss": 0.8844, + "step": 25433 + }, + { + "epoch": 5.425341296928328, + "grad_norm": 0.13061510619044717, + "learning_rate": 2.3422432255932196e-05, + "loss": 0.9063, + "step": 25434 + }, + { + "epoch": 5.425554607508532, + "grad_norm": 0.12801761001405337, + "learning_rate": 2.341641622337548e-05, + "loss": 0.8845, + "step": 25435 + }, + { + "epoch": 5.425767918088737, + "grad_norm": 0.11420609171899389, + "learning_rate": 2.3410400831062983e-05, + "loss": 0.901, + "step": 25436 + }, + { + "epoch": 5.425981228668942, + "grad_norm": 0.1366567885817426, + "learning_rate": 2.3404386079062764e-05, + "loss": 0.8965, + "step": 25437 + }, + { + "epoch": 5.426194539249146, + "grad_norm": 0.11469110744574233, + "learning_rate": 2.3398371967442862e-05, + "loss": 0.8877, + "step": 25438 + }, + { + "epoch": 5.426407849829351, + "grad_norm": 0.12770775989341349, + "learning_rate": 2.3392358496271327e-05, + "loss": 0.915, + "step": 25439 + }, + { + "epoch": 5.426621160409557, + "grad_norm": 0.1282380984994156, + "learning_rate": 2.3386345665616216e-05, + "loss": 0.8621, + "step": 25440 + }, + { + "epoch": 5.426834470989761, + "grad_norm": 0.12305169585274088, + "learning_rate": 2.3380333475545554e-05, + "loss": 0.8675, + "step": 25441 + }, + { + "epoch": 5.427047781569966, + "grad_norm": 0.1387689990104274, + "learning_rate": 2.337432192612737e-05, + "loss": 0.889, + "step": 25442 + }, + { + "epoch": 5.427261092150171, + "grad_norm": 0.12321480905387813, + "learning_rate": 2.3368311017429673e-05, + "loss": 0.8785, + "step": 25443 + }, + { + "epoch": 5.427474402730375, + "grad_norm": 0.12279187036136185, + "learning_rate": 2.3362300749520497e-05, + "loss": 0.8687, + "step": 25444 + }, + { + "epoch": 5.42768771331058, + "grad_norm": 0.1274841003723085, + "learning_rate": 2.3356291122467853e-05, + "loss": 0.8505, + "step": 25445 + }, + { + "epoch": 5.427901023890785, + "grad_norm": 0.1419663434938675, + "learning_rate": 2.3350282136339724e-05, + "loss": 0.8871, + "step": 25446 + }, + { + "epoch": 5.4281143344709895, + "grad_norm": 0.1184634315350086, + "learning_rate": 2.334427379120408e-05, + "loss": 0.9156, + "step": 25447 + }, + { + "epoch": 5.428327645051194, + "grad_norm": 0.1260349805650679, + "learning_rate": 2.3338266087128947e-05, + "loss": 0.9382, + "step": 25448 + }, + { + "epoch": 5.4285409556314, + "grad_norm": 0.11763238303613589, + "learning_rate": 2.3332259024182297e-05, + "loss": 0.857, + "step": 25449 + }, + { + "epoch": 5.428754266211604, + "grad_norm": 0.1184141418761141, + "learning_rate": 2.3326252602432115e-05, + "loss": 0.8901, + "step": 25450 + }, + { + "epoch": 5.428967576791809, + "grad_norm": 0.1148513957211723, + "learning_rate": 2.3320246821946288e-05, + "loss": 0.8862, + "step": 25451 + }, + { + "epoch": 5.429180887372014, + "grad_norm": 0.1325202308317282, + "learning_rate": 2.3314241682792842e-05, + "loss": 0.9055, + "step": 25452 + }, + { + "epoch": 5.4293941979522184, + "grad_norm": 0.11559124427545595, + "learning_rate": 2.33082371850397e-05, + "loss": 0.9222, + "step": 25453 + }, + { + "epoch": 5.429607508532423, + "grad_norm": 0.11610053919838208, + "learning_rate": 2.3302233328754814e-05, + "loss": 0.8871, + "step": 25454 + }, + { + "epoch": 5.429820819112628, + "grad_norm": 0.11944269985827058, + "learning_rate": 2.3296230114006086e-05, + "loss": 0.8675, + "step": 25455 + }, + { + "epoch": 5.4300341296928325, + "grad_norm": 0.12474530047607078, + "learning_rate": 2.3290227540861494e-05, + "loss": 0.8598, + "step": 25456 + }, + { + "epoch": 5.430247440273037, + "grad_norm": 0.1230931660130015, + "learning_rate": 2.3284225609388934e-05, + "loss": 0.9049, + "step": 25457 + }, + { + "epoch": 5.430460750853243, + "grad_norm": 0.13251543767059626, + "learning_rate": 2.3278224319656308e-05, + "loss": 0.8675, + "step": 25458 + }, + { + "epoch": 5.430674061433447, + "grad_norm": 0.12325503085165325, + "learning_rate": 2.3272223671731507e-05, + "loss": 0.848, + "step": 25459 + }, + { + "epoch": 5.430887372013652, + "grad_norm": 0.11798785686051633, + "learning_rate": 2.3266223665682475e-05, + "loss": 0.8922, + "step": 25460 + }, + { + "epoch": 5.431100682593857, + "grad_norm": 0.11241553589623973, + "learning_rate": 2.3260224301577106e-05, + "loss": 0.8749, + "step": 25461 + }, + { + "epoch": 5.4313139931740615, + "grad_norm": 0.138588908990023, + "learning_rate": 2.3254225579483222e-05, + "loss": 0.8926, + "step": 25462 + }, + { + "epoch": 5.431527303754266, + "grad_norm": 0.1113047923877155, + "learning_rate": 2.3248227499468704e-05, + "loss": 0.8772, + "step": 25463 + }, + { + "epoch": 5.431740614334471, + "grad_norm": 0.13960526840412066, + "learning_rate": 2.3242230061601472e-05, + "loss": 0.8912, + "step": 25464 + }, + { + "epoch": 5.4319539249146755, + "grad_norm": 0.11103434030609073, + "learning_rate": 2.3236233265949365e-05, + "loss": 0.8644, + "step": 25465 + }, + { + "epoch": 5.43216723549488, + "grad_norm": 0.15288629163372325, + "learning_rate": 2.3230237112580234e-05, + "loss": 0.937, + "step": 25466 + }, + { + "epoch": 5.432380546075085, + "grad_norm": 0.10719811135013006, + "learning_rate": 2.3224241601561897e-05, + "loss": 0.8861, + "step": 25467 + }, + { + "epoch": 5.4325938566552905, + "grad_norm": 0.14493542871163664, + "learning_rate": 2.3218246732962248e-05, + "loss": 0.8753, + "step": 25468 + }, + { + "epoch": 5.432807167235495, + "grad_norm": 0.11359196424354209, + "learning_rate": 2.3212252506849086e-05, + "loss": 0.8883, + "step": 25469 + }, + { + "epoch": 5.4330204778157, + "grad_norm": 0.1373965391533773, + "learning_rate": 2.320625892329025e-05, + "loss": 0.9016, + "step": 25470 + }, + { + "epoch": 5.4332337883959045, + "grad_norm": 0.11720316775476679, + "learning_rate": 2.3200265982353525e-05, + "loss": 0.8964, + "step": 25471 + }, + { + "epoch": 5.433447098976109, + "grad_norm": 0.11147226756833883, + "learning_rate": 2.3194273684106792e-05, + "loss": 0.8955, + "step": 25472 + }, + { + "epoch": 5.433660409556314, + "grad_norm": 0.11822896592345203, + "learning_rate": 2.318828202861778e-05, + "loss": 0.9088, + "step": 25473 + }, + { + "epoch": 5.433873720136519, + "grad_norm": 0.11177545721584768, + "learning_rate": 2.3182291015954315e-05, + "loss": 0.8737, + "step": 25474 + }, + { + "epoch": 5.434087030716723, + "grad_norm": 0.11602929001056535, + "learning_rate": 2.317630064618416e-05, + "loss": 0.8754, + "step": 25475 + }, + { + "epoch": 5.434300341296928, + "grad_norm": 0.11147977970801967, + "learning_rate": 2.317031091937514e-05, + "loss": 0.8597, + "step": 25476 + }, + { + "epoch": 5.434513651877133, + "grad_norm": 0.11380575239246789, + "learning_rate": 2.316432183559501e-05, + "loss": 0.8779, + "step": 25477 + }, + { + "epoch": 5.434726962457338, + "grad_norm": 0.10998856070240932, + "learning_rate": 2.315833339491154e-05, + "loss": 0.8714, + "step": 25478 + }, + { + "epoch": 5.434940273037543, + "grad_norm": 0.13703691254944722, + "learning_rate": 2.3152345597392454e-05, + "loss": 0.9076, + "step": 25479 + }, + { + "epoch": 5.435153583617748, + "grad_norm": 0.10530960338279753, + "learning_rate": 2.3146358443105558e-05, + "loss": 0.9274, + "step": 25480 + }, + { + "epoch": 5.435366894197952, + "grad_norm": 0.1326441864524656, + "learning_rate": 2.3140371932118573e-05, + "loss": 0.8879, + "step": 25481 + }, + { + "epoch": 5.435580204778157, + "grad_norm": 0.1225817426574205, + "learning_rate": 2.313438606449923e-05, + "loss": 0.9187, + "step": 25482 + }, + { + "epoch": 5.435793515358362, + "grad_norm": 0.1287945456970333, + "learning_rate": 2.312840084031527e-05, + "loss": 0.9025, + "step": 25483 + }, + { + "epoch": 5.436006825938566, + "grad_norm": 0.14971075398261602, + "learning_rate": 2.3122416259634414e-05, + "loss": 0.8855, + "step": 25484 + }, + { + "epoch": 5.436220136518771, + "grad_norm": 0.12156156409489768, + "learning_rate": 2.311643232252438e-05, + "loss": 0.8868, + "step": 25485 + }, + { + "epoch": 5.436433447098976, + "grad_norm": 0.18791451149461214, + "learning_rate": 2.3110449029052856e-05, + "loss": 0.8751, + "step": 25486 + }, + { + "epoch": 5.436646757679181, + "grad_norm": 0.10864847602941372, + "learning_rate": 2.3104466379287543e-05, + "loss": 0.8934, + "step": 25487 + }, + { + "epoch": 5.436860068259386, + "grad_norm": 0.1728323302482133, + "learning_rate": 2.3098484373296177e-05, + "loss": 0.9216, + "step": 25488 + }, + { + "epoch": 5.437073378839591, + "grad_norm": 0.12484611552517415, + "learning_rate": 2.3092503011146416e-05, + "loss": 0.8939, + "step": 25489 + }, + { + "epoch": 5.437286689419795, + "grad_norm": 0.14250120748600104, + "learning_rate": 2.308652229290593e-05, + "loss": 0.8674, + "step": 25490 + }, + { + "epoch": 5.4375, + "grad_norm": 0.14231105012558473, + "learning_rate": 2.3080542218642387e-05, + "loss": 0.8837, + "step": 25491 + }, + { + "epoch": 5.437713310580205, + "grad_norm": 0.14631967849122174, + "learning_rate": 2.3074562788423485e-05, + "loss": 0.92, + "step": 25492 + }, + { + "epoch": 5.437926621160409, + "grad_norm": 0.13372116919485535, + "learning_rate": 2.3068584002316873e-05, + "loss": 0.8797, + "step": 25493 + }, + { + "epoch": 5.438139931740614, + "grad_norm": 0.1299491176519433, + "learning_rate": 2.3062605860390185e-05, + "loss": 0.8844, + "step": 25494 + }, + { + "epoch": 5.438353242320819, + "grad_norm": 0.15414274239715894, + "learning_rate": 2.3056628362711067e-05, + "loss": 0.898, + "step": 25495 + }, + { + "epoch": 5.438566552901024, + "grad_norm": 0.12122003125181864, + "learning_rate": 2.3050651509347154e-05, + "loss": 0.8758, + "step": 25496 + }, + { + "epoch": 5.438779863481229, + "grad_norm": 0.1556243860704009, + "learning_rate": 2.3044675300366084e-05, + "loss": 0.8872, + "step": 25497 + }, + { + "epoch": 5.438993174061434, + "grad_norm": 0.12403036013510199, + "learning_rate": 2.303869973583547e-05, + "loss": 0.8883, + "step": 25498 + }, + { + "epoch": 5.439206484641638, + "grad_norm": 0.1579404243371923, + "learning_rate": 2.3032724815822896e-05, + "loss": 0.8823, + "step": 25499 + }, + { + "epoch": 5.439419795221843, + "grad_norm": 0.12932154305954463, + "learning_rate": 2.302675054039603e-05, + "loss": 0.8698, + "step": 25500 + }, + { + "epoch": 5.439633105802048, + "grad_norm": 0.12051361045977349, + "learning_rate": 2.3020776909622437e-05, + "loss": 0.8847, + "step": 25501 + }, + { + "epoch": 5.439846416382252, + "grad_norm": 0.15144501405669122, + "learning_rate": 2.3014803923569716e-05, + "loss": 0.882, + "step": 25502 + }, + { + "epoch": 5.440059726962457, + "grad_norm": 0.10632937067472925, + "learning_rate": 2.3008831582305414e-05, + "loss": 0.8767, + "step": 25503 + }, + { + "epoch": 5.440273037542662, + "grad_norm": 0.15722001499417615, + "learning_rate": 2.3002859885897174e-05, + "loss": 0.8517, + "step": 25504 + }, + { + "epoch": 5.440486348122867, + "grad_norm": 0.1137161148305822, + "learning_rate": 2.2996888834412526e-05, + "loss": 0.8875, + "step": 25505 + }, + { + "epoch": 5.440699658703072, + "grad_norm": 0.13480423365205588, + "learning_rate": 2.2990918427919046e-05, + "loss": 0.906, + "step": 25506 + }, + { + "epoch": 5.440912969283277, + "grad_norm": 0.1297028819956929, + "learning_rate": 2.298494866648428e-05, + "loss": 0.875, + "step": 25507 + }, + { + "epoch": 5.441126279863481, + "grad_norm": 0.11309755669934773, + "learning_rate": 2.297897955017578e-05, + "loss": 0.88, + "step": 25508 + }, + { + "epoch": 5.441339590443686, + "grad_norm": 0.12774660437107643, + "learning_rate": 2.297301107906108e-05, + "loss": 0.8869, + "step": 25509 + }, + { + "epoch": 5.441552901023891, + "grad_norm": 0.11213352685413039, + "learning_rate": 2.2967043253207733e-05, + "loss": 0.8593, + "step": 25510 + }, + { + "epoch": 5.4417662116040955, + "grad_norm": 0.1261479672769432, + "learning_rate": 2.2961076072683213e-05, + "loss": 0.8823, + "step": 25511 + }, + { + "epoch": 5.4419795221843, + "grad_norm": 0.12504128844608303, + "learning_rate": 2.2955109537555106e-05, + "loss": 0.9016, + "step": 25512 + }, + { + "epoch": 5.442192832764505, + "grad_norm": 0.10611546138235914, + "learning_rate": 2.294914364789089e-05, + "loss": 0.8624, + "step": 25513 + }, + { + "epoch": 5.4424061433447095, + "grad_norm": 0.11622759041363291, + "learning_rate": 2.294317840375808e-05, + "loss": 0.9074, + "step": 25514 + }, + { + "epoch": 5.442619453924915, + "grad_norm": 0.1132350079551248, + "learning_rate": 2.2937213805224133e-05, + "loss": 0.8647, + "step": 25515 + }, + { + "epoch": 5.44283276450512, + "grad_norm": 0.12321404352962052, + "learning_rate": 2.29312498523566e-05, + "loss": 0.9014, + "step": 25516 + }, + { + "epoch": 5.4430460750853245, + "grad_norm": 0.13914549623232472, + "learning_rate": 2.292528654522297e-05, + "loss": 0.8796, + "step": 25517 + }, + { + "epoch": 5.443259385665529, + "grad_norm": 0.12396878381575571, + "learning_rate": 2.2919323883890616e-05, + "loss": 0.8762, + "step": 25518 + }, + { + "epoch": 5.443472696245734, + "grad_norm": 0.13238423157861642, + "learning_rate": 2.2913361868427105e-05, + "loss": 0.9118, + "step": 25519 + }, + { + "epoch": 5.4436860068259385, + "grad_norm": 0.12023108146073976, + "learning_rate": 2.290740049889987e-05, + "loss": 0.8744, + "step": 25520 + }, + { + "epoch": 5.443899317406143, + "grad_norm": 0.1175840430156688, + "learning_rate": 2.2901439775376356e-05, + "loss": 0.9114, + "step": 25521 + }, + { + "epoch": 5.444112627986348, + "grad_norm": 0.12770267792015333, + "learning_rate": 2.2895479697924002e-05, + "loss": 0.8819, + "step": 25522 + }, + { + "epoch": 5.444325938566553, + "grad_norm": 0.12953066103634445, + "learning_rate": 2.288952026661024e-05, + "loss": 0.8837, + "step": 25523 + }, + { + "epoch": 5.444539249146757, + "grad_norm": 0.10492615650737361, + "learning_rate": 2.2883561481502542e-05, + "loss": 0.9082, + "step": 25524 + }, + { + "epoch": 5.444752559726963, + "grad_norm": 0.12097925652938915, + "learning_rate": 2.28776033426683e-05, + "loss": 0.8879, + "step": 25525 + }, + { + "epoch": 5.4449658703071675, + "grad_norm": 0.10763846556327815, + "learning_rate": 2.287164585017494e-05, + "loss": 0.9005, + "step": 25526 + }, + { + "epoch": 5.445179180887372, + "grad_norm": 0.1128742622117954, + "learning_rate": 2.2865689004089848e-05, + "loss": 0.8662, + "step": 25527 + }, + { + "epoch": 5.445392491467577, + "grad_norm": 0.11950307259397012, + "learning_rate": 2.2859732804480494e-05, + "loss": 0.893, + "step": 25528 + }, + { + "epoch": 5.4456058020477816, + "grad_norm": 0.12216998331989473, + "learning_rate": 2.2853777251414197e-05, + "loss": 0.8963, + "step": 25529 + }, + { + "epoch": 5.445819112627986, + "grad_norm": 0.113689737129661, + "learning_rate": 2.2847822344958344e-05, + "loss": 0.8699, + "step": 25530 + }, + { + "epoch": 5.446032423208191, + "grad_norm": 0.11109293810161291, + "learning_rate": 2.284186808518037e-05, + "loss": 0.8773, + "step": 25531 + }, + { + "epoch": 5.446245733788396, + "grad_norm": 0.13143467511406437, + "learning_rate": 2.2835914472147618e-05, + "loss": 0.8912, + "step": 25532 + }, + { + "epoch": 5.4464590443686, + "grad_norm": 0.10230684226642732, + "learning_rate": 2.2829961505927463e-05, + "loss": 0.8537, + "step": 25533 + }, + { + "epoch": 5.446672354948806, + "grad_norm": 0.14503716166991826, + "learning_rate": 2.2824009186587245e-05, + "loss": 0.8996, + "step": 25534 + }, + { + "epoch": 5.4468856655290105, + "grad_norm": 0.11599659972263231, + "learning_rate": 2.2818057514194312e-05, + "loss": 0.8993, + "step": 25535 + }, + { + "epoch": 5.447098976109215, + "grad_norm": 0.13889234872046943, + "learning_rate": 2.2812106488816034e-05, + "loss": 0.9078, + "step": 25536 + }, + { + "epoch": 5.44731228668942, + "grad_norm": 0.12049947354706611, + "learning_rate": 2.2806156110519736e-05, + "loss": 0.9019, + "step": 25537 + }, + { + "epoch": 5.447525597269625, + "grad_norm": 0.13440197785178004, + "learning_rate": 2.2800206379372744e-05, + "loss": 0.8819, + "step": 25538 + }, + { + "epoch": 5.447738907849829, + "grad_norm": 0.12341681886945535, + "learning_rate": 2.2794257295442384e-05, + "loss": 0.8634, + "step": 25539 + }, + { + "epoch": 5.447952218430034, + "grad_norm": 0.1086581259524208, + "learning_rate": 2.2788308858795957e-05, + "loss": 0.8508, + "step": 25540 + }, + { + "epoch": 5.448165529010239, + "grad_norm": 0.13377243924769547, + "learning_rate": 2.2782361069500782e-05, + "loss": 0.8674, + "step": 25541 + }, + { + "epoch": 5.448378839590443, + "grad_norm": 0.1143623457973377, + "learning_rate": 2.277641392762413e-05, + "loss": 0.8727, + "step": 25542 + }, + { + "epoch": 5.448592150170649, + "grad_norm": 0.12077844570998664, + "learning_rate": 2.2770467433233332e-05, + "loss": 0.8848, + "step": 25543 + }, + { + "epoch": 5.448805460750854, + "grad_norm": 0.11547495046966001, + "learning_rate": 2.2764521586395674e-05, + "loss": 0.9017, + "step": 25544 + }, + { + "epoch": 5.449018771331058, + "grad_norm": 0.14295254584430445, + "learning_rate": 2.2758576387178408e-05, + "loss": 0.8798, + "step": 25545 + }, + { + "epoch": 5.449232081911263, + "grad_norm": 0.10765783369515451, + "learning_rate": 2.2752631835648805e-05, + "loss": 0.9069, + "step": 25546 + }, + { + "epoch": 5.449445392491468, + "grad_norm": 0.13332026552823095, + "learning_rate": 2.274668793187412e-05, + "loss": 0.9213, + "step": 25547 + }, + { + "epoch": 5.449658703071672, + "grad_norm": 0.11266084943830373, + "learning_rate": 2.2740744675921647e-05, + "loss": 0.9329, + "step": 25548 + }, + { + "epoch": 5.449872013651877, + "grad_norm": 0.11169253902465397, + "learning_rate": 2.2734802067858618e-05, + "loss": 0.8786, + "step": 25549 + }, + { + "epoch": 5.450085324232082, + "grad_norm": 0.11754807151059311, + "learning_rate": 2.272886010775226e-05, + "loss": 0.9065, + "step": 25550 + }, + { + "epoch": 5.450298634812286, + "grad_norm": 0.12226567632043435, + "learning_rate": 2.2722918795669813e-05, + "loss": 0.8799, + "step": 25551 + }, + { + "epoch": 5.450511945392491, + "grad_norm": 0.11302428607200364, + "learning_rate": 2.27169781316785e-05, + "loss": 0.9156, + "step": 25552 + }, + { + "epoch": 5.450725255972697, + "grad_norm": 0.1193045874000873, + "learning_rate": 2.2711038115845553e-05, + "loss": 0.853, + "step": 25553 + }, + { + "epoch": 5.450938566552901, + "grad_norm": 0.1263340165019057, + "learning_rate": 2.270509874823814e-05, + "loss": 0.8732, + "step": 25554 + }, + { + "epoch": 5.451151877133106, + "grad_norm": 0.11484333923723045, + "learning_rate": 2.2699160028923527e-05, + "loss": 0.8812, + "step": 25555 + }, + { + "epoch": 5.451365187713311, + "grad_norm": 0.13932245700494375, + "learning_rate": 2.2693221957968882e-05, + "loss": 0.9074, + "step": 25556 + }, + { + "epoch": 5.451578498293515, + "grad_norm": 0.12869632517376065, + "learning_rate": 2.2687284535441396e-05, + "loss": 0.8884, + "step": 25557 + }, + { + "epoch": 5.45179180887372, + "grad_norm": 0.11985524354148637, + "learning_rate": 2.2681347761408244e-05, + "loss": 0.8995, + "step": 25558 + }, + { + "epoch": 5.452005119453925, + "grad_norm": 0.14470831361212777, + "learning_rate": 2.2675411635936585e-05, + "loss": 0.9185, + "step": 25559 + }, + { + "epoch": 5.452218430034129, + "grad_norm": 0.12739974459508105, + "learning_rate": 2.2669476159093636e-05, + "loss": 0.9205, + "step": 25560 + }, + { + "epoch": 5.452431740614334, + "grad_norm": 0.11554405663447262, + "learning_rate": 2.2663541330946526e-05, + "loss": 0.8703, + "step": 25561 + }, + { + "epoch": 5.452645051194539, + "grad_norm": 0.13590262389060642, + "learning_rate": 2.2657607151562404e-05, + "loss": 0.8843, + "step": 25562 + }, + { + "epoch": 5.452858361774744, + "grad_norm": 0.11884292319064091, + "learning_rate": 2.2651673621008427e-05, + "loss": 0.8955, + "step": 25563 + }, + { + "epoch": 5.453071672354949, + "grad_norm": 0.13453176326671135, + "learning_rate": 2.2645740739351724e-05, + "loss": 0.853, + "step": 25564 + }, + { + "epoch": 5.453284982935154, + "grad_norm": 0.11919567897234395, + "learning_rate": 2.2639808506659434e-05, + "loss": 0.8919, + "step": 25565 + }, + { + "epoch": 5.453498293515358, + "grad_norm": 0.12830724897079998, + "learning_rate": 2.2633876922998655e-05, + "loss": 0.8922, + "step": 25566 + }, + { + "epoch": 5.453711604095563, + "grad_norm": 0.13102580081413065, + "learning_rate": 2.2627945988436536e-05, + "loss": 0.8948, + "step": 25567 + }, + { + "epoch": 5.453924914675768, + "grad_norm": 0.12907885780474554, + "learning_rate": 2.2622015703040184e-05, + "loss": 0.8721, + "step": 25568 + }, + { + "epoch": 5.4541382252559725, + "grad_norm": 0.11926145760191482, + "learning_rate": 2.261608606687668e-05, + "loss": 0.9077, + "step": 25569 + }, + { + "epoch": 5.454351535836177, + "grad_norm": 0.12177234109403219, + "learning_rate": 2.2610157080013133e-05, + "loss": 0.8789, + "step": 25570 + }, + { + "epoch": 5.454564846416382, + "grad_norm": 0.13472742083569225, + "learning_rate": 2.26042287425166e-05, + "loss": 0.869, + "step": 25571 + }, + { + "epoch": 5.454778156996587, + "grad_norm": 0.11757399943781274, + "learning_rate": 2.2598301054454206e-05, + "loss": 0.8805, + "step": 25572 + }, + { + "epoch": 5.454991467576792, + "grad_norm": 0.12195962005996001, + "learning_rate": 2.259237401589303e-05, + "loss": 0.8712, + "step": 25573 + }, + { + "epoch": 5.455204778156997, + "grad_norm": 0.13146506106637457, + "learning_rate": 2.258644762690005e-05, + "loss": 0.8692, + "step": 25574 + }, + { + "epoch": 5.4554180887372015, + "grad_norm": 0.12540932933077859, + "learning_rate": 2.258052188754242e-05, + "loss": 0.8837, + "step": 25575 + }, + { + "epoch": 5.455631399317406, + "grad_norm": 0.1218337979985551, + "learning_rate": 2.2574596797887135e-05, + "loss": 0.8795, + "step": 25576 + }, + { + "epoch": 5.455844709897611, + "grad_norm": 0.11597403906186074, + "learning_rate": 2.256867235800126e-05, + "loss": 0.8914, + "step": 25577 + }, + { + "epoch": 5.4560580204778155, + "grad_norm": 0.12031877464747294, + "learning_rate": 2.25627485679518e-05, + "loss": 0.8809, + "step": 25578 + }, + { + "epoch": 5.45627133105802, + "grad_norm": 0.11179270871627013, + "learning_rate": 2.2556825427805827e-05, + "loss": 0.88, + "step": 25579 + }, + { + "epoch": 5.456484641638225, + "grad_norm": 0.11754444984837917, + "learning_rate": 2.2550902937630344e-05, + "loss": 0.8881, + "step": 25580 + }, + { + "epoch": 5.4566979522184305, + "grad_norm": 0.11939933770896548, + "learning_rate": 2.2544981097492357e-05, + "loss": 0.8972, + "step": 25581 + }, + { + "epoch": 5.456911262798635, + "grad_norm": 0.1211386581730984, + "learning_rate": 2.253905990745888e-05, + "loss": 0.8913, + "step": 25582 + }, + { + "epoch": 5.45712457337884, + "grad_norm": 0.11491103044822765, + "learning_rate": 2.2533139367596883e-05, + "loss": 0.9045, + "step": 25583 + }, + { + "epoch": 5.4573378839590445, + "grad_norm": 0.11530470626756245, + "learning_rate": 2.2527219477973423e-05, + "loss": 0.9045, + "step": 25584 + }, + { + "epoch": 5.457551194539249, + "grad_norm": 0.11762709405555523, + "learning_rate": 2.252130023865542e-05, + "loss": 0.8728, + "step": 25585 + }, + { + "epoch": 5.457764505119454, + "grad_norm": 0.09883624585374175, + "learning_rate": 2.2515381649709845e-05, + "loss": 0.8669, + "step": 25586 + }, + { + "epoch": 5.457977815699659, + "grad_norm": 0.14938323639878454, + "learning_rate": 2.2509463711203717e-05, + "loss": 0.8998, + "step": 25587 + }, + { + "epoch": 5.458191126279863, + "grad_norm": 0.10048730321333835, + "learning_rate": 2.2503546423203984e-05, + "loss": 0.8944, + "step": 25588 + }, + { + "epoch": 5.458404436860068, + "grad_norm": 0.1480320479659771, + "learning_rate": 2.249762978577758e-05, + "loss": 0.9101, + "step": 25589 + }, + { + "epoch": 5.4586177474402735, + "grad_norm": 0.11126875472617889, + "learning_rate": 2.249171379899144e-05, + "loss": 0.8724, + "step": 25590 + }, + { + "epoch": 5.458831058020478, + "grad_norm": 0.14779019518869435, + "learning_rate": 2.2485798462912554e-05, + "loss": 0.8801, + "step": 25591 + }, + { + "epoch": 5.459044368600683, + "grad_norm": 0.12225221975378946, + "learning_rate": 2.247988377760782e-05, + "loss": 0.9035, + "step": 25592 + }, + { + "epoch": 5.459257679180888, + "grad_norm": 0.12188182688253012, + "learning_rate": 2.247396974314418e-05, + "loss": 0.8629, + "step": 25593 + }, + { + "epoch": 5.459470989761092, + "grad_norm": 0.1332500819123187, + "learning_rate": 2.246805635958853e-05, + "loss": 0.8922, + "step": 25594 + }, + { + "epoch": 5.459684300341297, + "grad_norm": 0.11918430959077529, + "learning_rate": 2.246214362700779e-05, + "loss": 0.9339, + "step": 25595 + }, + { + "epoch": 5.459897610921502, + "grad_norm": 0.13386884647170436, + "learning_rate": 2.2456231545468867e-05, + "loss": 0.904, + "step": 25596 + }, + { + "epoch": 5.460110921501706, + "grad_norm": 0.11832983304160896, + "learning_rate": 2.2450320115038647e-05, + "loss": 0.8667, + "step": 25597 + }, + { + "epoch": 5.460324232081911, + "grad_norm": 0.14897675439404814, + "learning_rate": 2.2444409335783995e-05, + "loss": 0.8879, + "step": 25598 + }, + { + "epoch": 5.460537542662116, + "grad_norm": 0.13653742227350263, + "learning_rate": 2.2438499207771852e-05, + "loss": 0.8777, + "step": 25599 + }, + { + "epoch": 5.460750853242321, + "grad_norm": 0.14422855012805588, + "learning_rate": 2.2432589731069064e-05, + "loss": 0.9126, + "step": 25600 + }, + { + "epoch": 5.460964163822526, + "grad_norm": 0.12636465400394548, + "learning_rate": 2.242668090574248e-05, + "loss": 0.8953, + "step": 25601 + }, + { + "epoch": 5.461177474402731, + "grad_norm": 0.13309532538420318, + "learning_rate": 2.2420772731858947e-05, + "loss": 0.8794, + "step": 25602 + }, + { + "epoch": 5.461390784982935, + "grad_norm": 0.11026530343158529, + "learning_rate": 2.241486520948537e-05, + "loss": 0.8791, + "step": 25603 + }, + { + "epoch": 5.46160409556314, + "grad_norm": 0.1402264413700835, + "learning_rate": 2.240895833868856e-05, + "loss": 0.8725, + "step": 25604 + }, + { + "epoch": 5.461817406143345, + "grad_norm": 0.10959644185464475, + "learning_rate": 2.240305211953535e-05, + "loss": 0.8844, + "step": 25605 + }, + { + "epoch": 5.462030716723549, + "grad_norm": 0.1313882135988591, + "learning_rate": 2.239714655209258e-05, + "loss": 0.8888, + "step": 25606 + }, + { + "epoch": 5.462244027303754, + "grad_norm": 0.12368818808662654, + "learning_rate": 2.239124163642706e-05, + "loss": 0.8901, + "step": 25607 + }, + { + "epoch": 5.462457337883959, + "grad_norm": 0.12761887596142196, + "learning_rate": 2.2385337372605614e-05, + "loss": 0.8909, + "step": 25608 + }, + { + "epoch": 5.462670648464163, + "grad_norm": 0.10963789635606946, + "learning_rate": 2.2379433760695043e-05, + "loss": 0.878, + "step": 25609 + }, + { + "epoch": 5.462883959044369, + "grad_norm": 0.11137109610414278, + "learning_rate": 2.2373530800762123e-05, + "loss": 0.8945, + "step": 25610 + }, + { + "epoch": 5.463097269624574, + "grad_norm": 0.12979866832609227, + "learning_rate": 2.2367628492873692e-05, + "loss": 0.8997, + "step": 25611 + }, + { + "epoch": 5.463310580204778, + "grad_norm": 0.10340187668453729, + "learning_rate": 2.2361726837096523e-05, + "loss": 0.8704, + "step": 25612 + }, + { + "epoch": 5.463523890784983, + "grad_norm": 0.12962109641985176, + "learning_rate": 2.2355825833497376e-05, + "loss": 0.8857, + "step": 25613 + }, + { + "epoch": 5.463737201365188, + "grad_norm": 0.12156048316441068, + "learning_rate": 2.2349925482143008e-05, + "loss": 0.8795, + "step": 25614 + }, + { + "epoch": 5.463950511945392, + "grad_norm": 0.1382172987961013, + "learning_rate": 2.2344025783100235e-05, + "loss": 0.8944, + "step": 25615 + }, + { + "epoch": 5.464163822525597, + "grad_norm": 0.13858639705622952, + "learning_rate": 2.233812673643577e-05, + "loss": 0.8932, + "step": 25616 + }, + { + "epoch": 5.464377133105802, + "grad_norm": 0.11642613711270498, + "learning_rate": 2.2332228342216374e-05, + "loss": 0.8975, + "step": 25617 + }, + { + "epoch": 5.464590443686006, + "grad_norm": 0.14952916832366908, + "learning_rate": 2.232633060050878e-05, + "loss": 0.8753, + "step": 25618 + }, + { + "epoch": 5.464803754266212, + "grad_norm": 0.11307490595833607, + "learning_rate": 2.232043351137973e-05, + "loss": 0.902, + "step": 25619 + }, + { + "epoch": 5.465017064846417, + "grad_norm": 0.12965200561602727, + "learning_rate": 2.2314537074895953e-05, + "loss": 0.876, + "step": 25620 + }, + { + "epoch": 5.465230375426621, + "grad_norm": 0.13738945135294767, + "learning_rate": 2.2308641291124147e-05, + "loss": 0.863, + "step": 25621 + }, + { + "epoch": 5.465443686006826, + "grad_norm": 0.145288296285726, + "learning_rate": 2.2302746160131018e-05, + "loss": 0.9076, + "step": 25622 + }, + { + "epoch": 5.465656996587031, + "grad_norm": 0.11883984485205716, + "learning_rate": 2.22968516819833e-05, + "loss": 0.8829, + "step": 25623 + }, + { + "epoch": 5.465870307167235, + "grad_norm": 0.11002497211249768, + "learning_rate": 2.229095785674769e-05, + "loss": 0.9032, + "step": 25624 + }, + { + "epoch": 5.46608361774744, + "grad_norm": 0.12884941950221981, + "learning_rate": 2.2285064684490854e-05, + "loss": 0.8724, + "step": 25625 + }, + { + "epoch": 5.466296928327645, + "grad_norm": 0.0995244428006807, + "learning_rate": 2.2279172165279457e-05, + "loss": 0.8911, + "step": 25626 + }, + { + "epoch": 5.4665102389078495, + "grad_norm": 0.1378644879004077, + "learning_rate": 2.227328029918023e-05, + "loss": 0.8971, + "step": 25627 + }, + { + "epoch": 5.466723549488055, + "grad_norm": 0.10951763506152007, + "learning_rate": 2.2267389086259802e-05, + "loss": 0.8767, + "step": 25628 + }, + { + "epoch": 5.46693686006826, + "grad_norm": 0.14260082587160983, + "learning_rate": 2.2261498526584857e-05, + "loss": 0.8876, + "step": 25629 + }, + { + "epoch": 5.467150170648464, + "grad_norm": 0.12512198240987996, + "learning_rate": 2.2255608620221986e-05, + "loss": 0.8708, + "step": 25630 + }, + { + "epoch": 5.467363481228669, + "grad_norm": 0.11115956812208526, + "learning_rate": 2.2249719367237894e-05, + "loss": 0.8982, + "step": 25631 + }, + { + "epoch": 5.467576791808874, + "grad_norm": 0.14097670897285594, + "learning_rate": 2.224383076769921e-05, + "loss": 0.8878, + "step": 25632 + }, + { + "epoch": 5.4677901023890785, + "grad_norm": 0.11841800073518902, + "learning_rate": 2.223794282167254e-05, + "loss": 0.915, + "step": 25633 + }, + { + "epoch": 5.468003412969283, + "grad_norm": 0.125814422085851, + "learning_rate": 2.2232055529224495e-05, + "loss": 0.8892, + "step": 25634 + }, + { + "epoch": 5.468216723549488, + "grad_norm": 0.1181710399997122, + "learning_rate": 2.2226168890421746e-05, + "loss": 0.9056, + "step": 25635 + }, + { + "epoch": 5.4684300341296925, + "grad_norm": 0.12688661333256288, + "learning_rate": 2.2220282905330856e-05, + "loss": 0.9246, + "step": 25636 + }, + { + "epoch": 5.468643344709897, + "grad_norm": 0.12775073690049754, + "learning_rate": 2.2214397574018447e-05, + "loss": 0.879, + "step": 25637 + }, + { + "epoch": 5.468856655290103, + "grad_norm": 0.12249504092312728, + "learning_rate": 2.2208512896551078e-05, + "loss": 0.8863, + "step": 25638 + }, + { + "epoch": 5.4690699658703075, + "grad_norm": 0.13589683672581115, + "learning_rate": 2.220262887299537e-05, + "loss": 0.9164, + "step": 25639 + }, + { + "epoch": 5.469283276450512, + "grad_norm": 0.10552151699600197, + "learning_rate": 2.2196745503417922e-05, + "loss": 0.8824, + "step": 25640 + }, + { + "epoch": 5.469496587030717, + "grad_norm": 0.13070430595616975, + "learning_rate": 2.2190862787885247e-05, + "loss": 0.8852, + "step": 25641 + }, + { + "epoch": 5.4697098976109215, + "grad_norm": 0.11960659800026681, + "learning_rate": 2.2184980726463906e-05, + "loss": 0.9026, + "step": 25642 + }, + { + "epoch": 5.469923208191126, + "grad_norm": 0.11152185245769514, + "learning_rate": 2.2179099319220504e-05, + "loss": 0.9021, + "step": 25643 + }, + { + "epoch": 5.470136518771331, + "grad_norm": 0.12507070312035462, + "learning_rate": 2.2173218566221565e-05, + "loss": 0.872, + "step": 25644 + }, + { + "epoch": 5.470349829351536, + "grad_norm": 0.1083587210261832, + "learning_rate": 2.2167338467533633e-05, + "loss": 0.8591, + "step": 25645 + }, + { + "epoch": 5.47056313993174, + "grad_norm": 0.1183363481527271, + "learning_rate": 2.216145902322321e-05, + "loss": 0.861, + "step": 25646 + }, + { + "epoch": 5.470776450511945, + "grad_norm": 0.12023331874407932, + "learning_rate": 2.215558023335688e-05, + "loss": 0.8942, + "step": 25647 + }, + { + "epoch": 5.4709897610921505, + "grad_norm": 0.12177088627597808, + "learning_rate": 2.2149702098001137e-05, + "loss": 0.8764, + "step": 25648 + }, + { + "epoch": 5.471203071672355, + "grad_norm": 0.11867514142759639, + "learning_rate": 2.214382461722248e-05, + "loss": 0.8925, + "step": 25649 + }, + { + "epoch": 5.47141638225256, + "grad_norm": 0.11959568516560609, + "learning_rate": 2.2137947791087396e-05, + "loss": 0.8743, + "step": 25650 + }, + { + "epoch": 5.471629692832765, + "grad_norm": 0.11737438373720609, + "learning_rate": 2.2132071619662455e-05, + "loss": 0.8844, + "step": 25651 + }, + { + "epoch": 5.471843003412969, + "grad_norm": 0.11220738056720146, + "learning_rate": 2.2126196103014078e-05, + "loss": 0.8749, + "step": 25652 + }, + { + "epoch": 5.472056313993174, + "grad_norm": 0.12225898592130884, + "learning_rate": 2.2120321241208762e-05, + "loss": 0.8913, + "step": 25653 + }, + { + "epoch": 5.472269624573379, + "grad_norm": 0.0990121003785758, + "learning_rate": 2.2114447034312967e-05, + "loss": 0.8845, + "step": 25654 + }, + { + "epoch": 5.472482935153583, + "grad_norm": 0.1410195762322706, + "learning_rate": 2.210857348239319e-05, + "loss": 0.897, + "step": 25655 + }, + { + "epoch": 5.472696245733788, + "grad_norm": 0.10475189842634695, + "learning_rate": 2.2102700585515883e-05, + "loss": 0.8699, + "step": 25656 + }, + { + "epoch": 5.472909556313994, + "grad_norm": 0.124846159842342, + "learning_rate": 2.20968283437475e-05, + "loss": 0.8694, + "step": 25657 + }, + { + "epoch": 5.473122866894198, + "grad_norm": 0.12290122709700625, + "learning_rate": 2.2090956757154443e-05, + "loss": 0.8925, + "step": 25658 + }, + { + "epoch": 5.473336177474403, + "grad_norm": 0.11090973125946835, + "learning_rate": 2.208508582580321e-05, + "loss": 0.8994, + "step": 25659 + }, + { + "epoch": 5.473549488054608, + "grad_norm": 0.14052269054925112, + "learning_rate": 2.2079215549760207e-05, + "loss": 0.8833, + "step": 25660 + }, + { + "epoch": 5.473762798634812, + "grad_norm": 0.12172979031771543, + "learning_rate": 2.2073345929091853e-05, + "loss": 0.8619, + "step": 25661 + }, + { + "epoch": 5.473976109215017, + "grad_norm": 0.1692465829125335, + "learning_rate": 2.206747696386457e-05, + "loss": 0.8613, + "step": 25662 + }, + { + "epoch": 5.474189419795222, + "grad_norm": 0.15007477728193933, + "learning_rate": 2.206160865414476e-05, + "loss": 0.8959, + "step": 25663 + }, + { + "epoch": 5.474402730375426, + "grad_norm": 0.16492243234491397, + "learning_rate": 2.205574099999882e-05, + "loss": 0.8834, + "step": 25664 + }, + { + "epoch": 5.474616040955631, + "grad_norm": 0.13771829653303796, + "learning_rate": 2.204987400149314e-05, + "loss": 0.8673, + "step": 25665 + }, + { + "epoch": 5.474829351535837, + "grad_norm": 0.16239088654612138, + "learning_rate": 2.2044007658694087e-05, + "loss": 0.866, + "step": 25666 + }, + { + "epoch": 5.475042662116041, + "grad_norm": 0.15799611476937347, + "learning_rate": 2.203814197166809e-05, + "loss": 0.905, + "step": 25667 + }, + { + "epoch": 5.475255972696246, + "grad_norm": 0.1552436126726735, + "learning_rate": 2.2032276940481488e-05, + "loss": 0.901, + "step": 25668 + }, + { + "epoch": 5.475469283276451, + "grad_norm": 0.13483652509763394, + "learning_rate": 2.2026412565200656e-05, + "loss": 0.8836, + "step": 25669 + }, + { + "epoch": 5.475682593856655, + "grad_norm": 0.15406610893972753, + "learning_rate": 2.202054884589191e-05, + "loss": 0.9192, + "step": 25670 + }, + { + "epoch": 5.47589590443686, + "grad_norm": 0.10980506966574723, + "learning_rate": 2.2014685782621648e-05, + "loss": 0.856, + "step": 25671 + }, + { + "epoch": 5.476109215017065, + "grad_norm": 0.1591290310126011, + "learning_rate": 2.20088233754562e-05, + "loss": 0.8986, + "step": 25672 + }, + { + "epoch": 5.476322525597269, + "grad_norm": 0.1042912241557773, + "learning_rate": 2.200296162446189e-05, + "loss": 0.8793, + "step": 25673 + }, + { + "epoch": 5.476535836177474, + "grad_norm": 0.1320847669950219, + "learning_rate": 2.1997100529705044e-05, + "loss": 0.8691, + "step": 25674 + }, + { + "epoch": 5.476749146757679, + "grad_norm": 0.1244154869739152, + "learning_rate": 2.199124009125198e-05, + "loss": 0.8445, + "step": 25675 + }, + { + "epoch": 5.476962457337884, + "grad_norm": 0.11352445274291746, + "learning_rate": 2.1985380309169014e-05, + "loss": 0.8901, + "step": 25676 + }, + { + "epoch": 5.477175767918089, + "grad_norm": 0.10975418735089364, + "learning_rate": 2.1979521183522437e-05, + "loss": 0.878, + "step": 25677 + }, + { + "epoch": 5.477389078498294, + "grad_norm": 0.12582291386369623, + "learning_rate": 2.1973662714378533e-05, + "loss": 0.8599, + "step": 25678 + }, + { + "epoch": 5.477602389078498, + "grad_norm": 0.11033289450424764, + "learning_rate": 2.1967804901803644e-05, + "loss": 0.8863, + "step": 25679 + }, + { + "epoch": 5.477815699658703, + "grad_norm": 0.12939210664125309, + "learning_rate": 2.196194774586401e-05, + "loss": 0.8969, + "step": 25680 + }, + { + "epoch": 5.478029010238908, + "grad_norm": 0.12528107151898077, + "learning_rate": 2.1956091246625918e-05, + "loss": 0.8799, + "step": 25681 + }, + { + "epoch": 5.478242320819112, + "grad_norm": 0.11524941466176924, + "learning_rate": 2.1950235404155605e-05, + "loss": 0.8737, + "step": 25682 + }, + { + "epoch": 5.478455631399317, + "grad_norm": 0.11533363684281468, + "learning_rate": 2.1944380218519374e-05, + "loss": 0.9167, + "step": 25683 + }, + { + "epoch": 5.478668941979522, + "grad_norm": 0.1226169249315074, + "learning_rate": 2.193852568978346e-05, + "loss": 0.8714, + "step": 25684 + }, + { + "epoch": 5.4788822525597265, + "grad_norm": 0.12035491811122773, + "learning_rate": 2.193267181801413e-05, + "loss": 0.8915, + "step": 25685 + }, + { + "epoch": 5.479095563139932, + "grad_norm": 0.13081467768330227, + "learning_rate": 2.1926818603277544e-05, + "loss": 0.8759, + "step": 25686 + }, + { + "epoch": 5.479308873720137, + "grad_norm": 0.11412389793703101, + "learning_rate": 2.192096604564001e-05, + "loss": 0.9139, + "step": 25687 + }, + { + "epoch": 5.479522184300341, + "grad_norm": 0.12771907574580407, + "learning_rate": 2.1915114145167723e-05, + "loss": 0.8991, + "step": 25688 + }, + { + "epoch": 5.479735494880546, + "grad_norm": 0.1119479576405194, + "learning_rate": 2.190926290192689e-05, + "loss": 0.8988, + "step": 25689 + }, + { + "epoch": 5.479948805460751, + "grad_norm": 0.12031940129132801, + "learning_rate": 2.190341231598371e-05, + "loss": 0.8908, + "step": 25690 + }, + { + "epoch": 5.4801621160409555, + "grad_norm": 0.10991816871275825, + "learning_rate": 2.189756238740441e-05, + "loss": 0.8561, + "step": 25691 + }, + { + "epoch": 5.48037542662116, + "grad_norm": 0.10928135962303297, + "learning_rate": 2.1891713116255175e-05, + "loss": 0.8516, + "step": 25692 + }, + { + "epoch": 5.480588737201365, + "grad_norm": 0.1258763269666171, + "learning_rate": 2.1885864502602186e-05, + "loss": 0.9003, + "step": 25693 + }, + { + "epoch": 5.4808020477815695, + "grad_norm": 0.10558735189736373, + "learning_rate": 2.188001654651159e-05, + "loss": 0.8592, + "step": 25694 + }, + { + "epoch": 5.481015358361775, + "grad_norm": 0.11495033617341366, + "learning_rate": 2.1874169248049607e-05, + "loss": 0.8769, + "step": 25695 + }, + { + "epoch": 5.48122866894198, + "grad_norm": 0.10379586593454511, + "learning_rate": 2.1868322607282398e-05, + "loss": 0.8736, + "step": 25696 + }, + { + "epoch": 5.4814419795221845, + "grad_norm": 0.10753861207139637, + "learning_rate": 2.186247662427608e-05, + "loss": 0.9043, + "step": 25697 + }, + { + "epoch": 5.481655290102389, + "grad_norm": 0.10268665014093448, + "learning_rate": 2.1856631299096788e-05, + "loss": 0.905, + "step": 25698 + }, + { + "epoch": 5.481868600682594, + "grad_norm": 0.10140601277063217, + "learning_rate": 2.1850786631810718e-05, + "loss": 0.9072, + "step": 25699 + }, + { + "epoch": 5.4820819112627985, + "grad_norm": 0.10979322115021671, + "learning_rate": 2.1844942622483974e-05, + "loss": 0.8843, + "step": 25700 + }, + { + "epoch": 5.482295221843003, + "grad_norm": 0.11183481378074285, + "learning_rate": 2.1839099271182683e-05, + "loss": 0.89, + "step": 25701 + }, + { + "epoch": 5.482508532423208, + "grad_norm": 0.1071008299267432, + "learning_rate": 2.183325657797294e-05, + "loss": 0.9057, + "step": 25702 + }, + { + "epoch": 5.482721843003413, + "grad_norm": 0.13560830609380978, + "learning_rate": 2.1827414542920894e-05, + "loss": 0.8982, + "step": 25703 + }, + { + "epoch": 5.482935153583618, + "grad_norm": 0.11453202253473277, + "learning_rate": 2.1821573166092635e-05, + "loss": 0.8953, + "step": 25704 + }, + { + "epoch": 5.483148464163823, + "grad_norm": 0.11801107974783605, + "learning_rate": 2.1815732447554262e-05, + "loss": 0.8879, + "step": 25705 + }, + { + "epoch": 5.4833617747440275, + "grad_norm": 0.11705773435929125, + "learning_rate": 2.1809892387371817e-05, + "loss": 0.8674, + "step": 25706 + }, + { + "epoch": 5.483575085324232, + "grad_norm": 0.11991785434908356, + "learning_rate": 2.180405298561148e-05, + "loss": 0.8926, + "step": 25707 + }, + { + "epoch": 5.483788395904437, + "grad_norm": 0.11064930022985636, + "learning_rate": 2.1798214242339225e-05, + "loss": 0.8958, + "step": 25708 + }, + { + "epoch": 5.484001706484642, + "grad_norm": 0.26199939779247255, + "learning_rate": 2.1792376157621165e-05, + "loss": 0.8863, + "step": 25709 + }, + { + "epoch": 5.484215017064846, + "grad_norm": 0.11577779233341697, + "learning_rate": 2.1786538731523318e-05, + "loss": 0.8801, + "step": 25710 + }, + { + "epoch": 5.484428327645051, + "grad_norm": 0.14085432318960434, + "learning_rate": 2.178070196411179e-05, + "loss": 0.8823, + "step": 25711 + }, + { + "epoch": 5.484641638225256, + "grad_norm": 0.12932100799241003, + "learning_rate": 2.1774865855452595e-05, + "loss": 0.8584, + "step": 25712 + }, + { + "epoch": 5.484854948805461, + "grad_norm": 0.12263084413543968, + "learning_rate": 2.176903040561178e-05, + "loss": 0.8698, + "step": 25713 + }, + { + "epoch": 5.485068259385666, + "grad_norm": 0.1324330977468099, + "learning_rate": 2.176319561465534e-05, + "loss": 0.8882, + "step": 25714 + }, + { + "epoch": 5.485281569965871, + "grad_norm": 0.10867312672519151, + "learning_rate": 2.1757361482649344e-05, + "loss": 0.8748, + "step": 25715 + }, + { + "epoch": 5.485494880546075, + "grad_norm": 0.12575759409981407, + "learning_rate": 2.175152800965978e-05, + "loss": 0.8963, + "step": 25716 + }, + { + "epoch": 5.48570819112628, + "grad_norm": 0.11983253591705166, + "learning_rate": 2.1745695195752655e-05, + "loss": 0.9343, + "step": 25717 + }, + { + "epoch": 5.485921501706485, + "grad_norm": 0.1340196011573032, + "learning_rate": 2.1739863040993976e-05, + "loss": 0.9223, + "step": 25718 + }, + { + "epoch": 5.486134812286689, + "grad_norm": 0.11660937288992403, + "learning_rate": 2.173403154544972e-05, + "loss": 0.8893, + "step": 25719 + }, + { + "epoch": 5.486348122866894, + "grad_norm": 0.13117164647418006, + "learning_rate": 2.1728200709185884e-05, + "loss": 0.9056, + "step": 25720 + }, + { + "epoch": 5.486561433447099, + "grad_norm": 0.1131877221828273, + "learning_rate": 2.1722370532268425e-05, + "loss": 0.8961, + "step": 25721 + }, + { + "epoch": 5.486774744027303, + "grad_norm": 0.1484243594408569, + "learning_rate": 2.1716541014763307e-05, + "loss": 0.8958, + "step": 25722 + }, + { + "epoch": 5.486988054607509, + "grad_norm": 0.12276209497106177, + "learning_rate": 2.1710712156736537e-05, + "loss": 0.9032, + "step": 25723 + }, + { + "epoch": 5.487201365187714, + "grad_norm": 0.14634338430531946, + "learning_rate": 2.1704883958254034e-05, + "loss": 0.8748, + "step": 25724 + }, + { + "epoch": 5.487414675767918, + "grad_norm": 0.14442654297303584, + "learning_rate": 2.1699056419381747e-05, + "loss": 0.8763, + "step": 25725 + }, + { + "epoch": 5.487627986348123, + "grad_norm": 0.1451220447800028, + "learning_rate": 2.1693229540185596e-05, + "loss": 0.896, + "step": 25726 + }, + { + "epoch": 5.487841296928328, + "grad_norm": 0.13634769684939357, + "learning_rate": 2.1687403320731556e-05, + "loss": 0.9005, + "step": 25727 + }, + { + "epoch": 5.488054607508532, + "grad_norm": 0.13483396726669475, + "learning_rate": 2.1681577761085527e-05, + "loss": 0.8797, + "step": 25728 + }, + { + "epoch": 5.488267918088737, + "grad_norm": 0.12619418773328533, + "learning_rate": 2.167575286131343e-05, + "loss": 0.8974, + "step": 25729 + }, + { + "epoch": 5.488481228668942, + "grad_norm": 0.13322903656659607, + "learning_rate": 2.1669928621481173e-05, + "loss": 0.8897, + "step": 25730 + }, + { + "epoch": 5.488694539249146, + "grad_norm": 0.12279253170107582, + "learning_rate": 2.166410504165465e-05, + "loss": 0.887, + "step": 25731 + }, + { + "epoch": 5.488907849829351, + "grad_norm": 0.1418979101876651, + "learning_rate": 2.1658282121899764e-05, + "loss": 0.8848, + "step": 25732 + }, + { + "epoch": 5.489121160409557, + "grad_norm": 0.11820150418159162, + "learning_rate": 2.1652459862282394e-05, + "loss": 0.9046, + "step": 25733 + }, + { + "epoch": 5.489334470989761, + "grad_norm": 0.13344342582949245, + "learning_rate": 2.16466382628684e-05, + "loss": 0.9073, + "step": 25734 + }, + { + "epoch": 5.489547781569966, + "grad_norm": 0.13056287809107978, + "learning_rate": 2.1640817323723695e-05, + "loss": 0.8987, + "step": 25735 + }, + { + "epoch": 5.489761092150171, + "grad_norm": 0.10193883315114441, + "learning_rate": 2.163499704491413e-05, + "loss": 0.878, + "step": 25736 + }, + { + "epoch": 5.489974402730375, + "grad_norm": 0.12634064689850474, + "learning_rate": 2.1629177426505548e-05, + "loss": 0.9161, + "step": 25737 + }, + { + "epoch": 5.49018771331058, + "grad_norm": 0.10700523031609992, + "learning_rate": 2.1623358468563785e-05, + "loss": 0.8724, + "step": 25738 + }, + { + "epoch": 5.490401023890785, + "grad_norm": 0.11576292768479636, + "learning_rate": 2.161754017115473e-05, + "loss": 0.8907, + "step": 25739 + }, + { + "epoch": 5.4906143344709895, + "grad_norm": 0.10956233401568109, + "learning_rate": 2.161172253434418e-05, + "loss": 0.9059, + "step": 25740 + }, + { + "epoch": 5.490827645051194, + "grad_norm": 0.11426343003887109, + "learning_rate": 2.1605905558197985e-05, + "loss": 0.8813, + "step": 25741 + }, + { + "epoch": 5.4910409556314, + "grad_norm": 0.11565355807623703, + "learning_rate": 2.160008924278194e-05, + "loss": 0.8768, + "step": 25742 + }, + { + "epoch": 5.491254266211604, + "grad_norm": 0.10788584852710731, + "learning_rate": 2.1594273588161874e-05, + "loss": 0.8697, + "step": 25743 + }, + { + "epoch": 5.491467576791809, + "grad_norm": 0.10558012422464746, + "learning_rate": 2.1588458594403585e-05, + "loss": 0.9006, + "step": 25744 + }, + { + "epoch": 5.491680887372014, + "grad_norm": 0.10647093704803696, + "learning_rate": 2.1582644261572864e-05, + "loss": 0.8938, + "step": 25745 + }, + { + "epoch": 5.4918941979522184, + "grad_norm": 0.1109915844122482, + "learning_rate": 2.1576830589735484e-05, + "loss": 0.8727, + "step": 25746 + }, + { + "epoch": 5.492107508532423, + "grad_norm": 0.09835959038237557, + "learning_rate": 2.1571017578957276e-05, + "loss": 0.8848, + "step": 25747 + }, + { + "epoch": 5.492320819112628, + "grad_norm": 0.1136085314131479, + "learning_rate": 2.1565205229303984e-05, + "loss": 0.8594, + "step": 25748 + }, + { + "epoch": 5.4925341296928325, + "grad_norm": 0.09472549346161371, + "learning_rate": 2.1559393540841372e-05, + "loss": 0.8546, + "step": 25749 + }, + { + "epoch": 5.492747440273037, + "grad_norm": 0.09942801040872029, + "learning_rate": 2.1553582513635193e-05, + "loss": 0.8658, + "step": 25750 + }, + { + "epoch": 5.492960750853243, + "grad_norm": 0.11506018334362375, + "learning_rate": 2.1547772147751224e-05, + "loss": 0.9189, + "step": 25751 + }, + { + "epoch": 5.493174061433447, + "grad_norm": 0.0993183086531194, + "learning_rate": 2.1541962443255225e-05, + "loss": 0.9028, + "step": 25752 + }, + { + "epoch": 5.493387372013652, + "grad_norm": 0.11021839108969946, + "learning_rate": 2.1536153400212863e-05, + "loss": 0.8751, + "step": 25753 + }, + { + "epoch": 5.493600682593857, + "grad_norm": 0.1121323814078575, + "learning_rate": 2.1530345018689917e-05, + "loss": 0.8872, + "step": 25754 + }, + { + "epoch": 5.4938139931740615, + "grad_norm": 0.1135041740147129, + "learning_rate": 2.1524537298752113e-05, + "loss": 0.8934, + "step": 25755 + }, + { + "epoch": 5.494027303754266, + "grad_norm": 0.11639174969747121, + "learning_rate": 2.1518730240465152e-05, + "loss": 0.8732, + "step": 25756 + }, + { + "epoch": 5.494240614334471, + "grad_norm": 0.11645117461787101, + "learning_rate": 2.151292384389474e-05, + "loss": 0.8834, + "step": 25757 + }, + { + "epoch": 5.4944539249146755, + "grad_norm": 0.11650499279992588, + "learning_rate": 2.1507118109106557e-05, + "loss": 0.8826, + "step": 25758 + }, + { + "epoch": 5.49466723549488, + "grad_norm": 0.1233003265726203, + "learning_rate": 2.1501313036166334e-05, + "loss": 0.9255, + "step": 25759 + }, + { + "epoch": 5.494880546075085, + "grad_norm": 0.11141099635238022, + "learning_rate": 2.1495508625139737e-05, + "loss": 0.8669, + "step": 25760 + }, + { + "epoch": 5.4950938566552905, + "grad_norm": 0.117291852693507, + "learning_rate": 2.1489704876092448e-05, + "loss": 0.9216, + "step": 25761 + }, + { + "epoch": 5.495307167235495, + "grad_norm": 0.11947316080278368, + "learning_rate": 2.1483901789090112e-05, + "loss": 0.8829, + "step": 25762 + }, + { + "epoch": 5.4955204778157, + "grad_norm": 0.1039449726532009, + "learning_rate": 2.1478099364198462e-05, + "loss": 0.8917, + "step": 25763 + }, + { + "epoch": 5.4957337883959045, + "grad_norm": 0.10820401655858089, + "learning_rate": 2.1472297601483073e-05, + "loss": 0.8968, + "step": 25764 + }, + { + "epoch": 5.495947098976109, + "grad_norm": 0.11842851113934036, + "learning_rate": 2.146649650100959e-05, + "loss": 0.8845, + "step": 25765 + }, + { + "epoch": 5.496160409556314, + "grad_norm": 0.11003224292091175, + "learning_rate": 2.146069606284372e-05, + "loss": 0.9105, + "step": 25766 + }, + { + "epoch": 5.496373720136519, + "grad_norm": 0.12350790677212778, + "learning_rate": 2.1454896287051056e-05, + "loss": 0.9193, + "step": 25767 + }, + { + "epoch": 5.496587030716723, + "grad_norm": 0.10784197714650237, + "learning_rate": 2.144909717369722e-05, + "loss": 0.879, + "step": 25768 + }, + { + "epoch": 5.496800341296928, + "grad_norm": 0.1287746648186436, + "learning_rate": 2.144329872284783e-05, + "loss": 0.8733, + "step": 25769 + }, + { + "epoch": 5.497013651877133, + "grad_norm": 0.12320162054501659, + "learning_rate": 2.1437500934568486e-05, + "loss": 0.8938, + "step": 25770 + }, + { + "epoch": 5.497226962457338, + "grad_norm": 0.11240826402160893, + "learning_rate": 2.1431703808924825e-05, + "loss": 0.8958, + "step": 25771 + }, + { + "epoch": 5.497440273037543, + "grad_norm": 0.14585722914075203, + "learning_rate": 2.1425907345982422e-05, + "loss": 0.8639, + "step": 25772 + }, + { + "epoch": 5.497653583617748, + "grad_norm": 0.11156619837963906, + "learning_rate": 2.1420111545806864e-05, + "loss": 0.8896, + "step": 25773 + }, + { + "epoch": 5.497866894197952, + "grad_norm": 0.12920187180448692, + "learning_rate": 2.1414316408463725e-05, + "loss": 0.8657, + "step": 25774 + }, + { + "epoch": 5.498080204778157, + "grad_norm": 0.12314674292692038, + "learning_rate": 2.140852193401858e-05, + "loss": 0.8793, + "step": 25775 + }, + { + "epoch": 5.498293515358362, + "grad_norm": 0.11943191442340358, + "learning_rate": 2.1402728122536995e-05, + "loss": 0.8639, + "step": 25776 + }, + { + "epoch": 5.498506825938566, + "grad_norm": 0.11286781039950436, + "learning_rate": 2.139693497408451e-05, + "loss": 0.8789, + "step": 25777 + }, + { + "epoch": 5.498720136518771, + "grad_norm": 0.11214055710862986, + "learning_rate": 2.1391142488726705e-05, + "loss": 0.895, + "step": 25778 + }, + { + "epoch": 5.498933447098976, + "grad_norm": 0.10651508408335773, + "learning_rate": 2.138535066652912e-05, + "loss": 0.8786, + "step": 25779 + }, + { + "epoch": 5.499146757679181, + "grad_norm": 0.1106182583510923, + "learning_rate": 2.1379559507557266e-05, + "loss": 0.9063, + "step": 25780 + }, + { + "epoch": 5.499360068259386, + "grad_norm": 0.10608366251519842, + "learning_rate": 2.1373769011876693e-05, + "loss": 0.8865, + "step": 25781 + }, + { + "epoch": 5.499573378839591, + "grad_norm": 0.1112720985898441, + "learning_rate": 2.1367979179552877e-05, + "loss": 0.9079, + "step": 25782 + }, + { + "epoch": 5.499786689419795, + "grad_norm": 0.1174744110851977, + "learning_rate": 2.1362190010651388e-05, + "loss": 0.9186, + "step": 25783 + }, + { + "epoch": 5.5, + "grad_norm": 0.11633656454756722, + "learning_rate": 2.1356401505237708e-05, + "loss": 0.8937, + "step": 25784 + }, + { + "epoch": 5.500213310580205, + "grad_norm": 0.11362592758290081, + "learning_rate": 2.135061366337734e-05, + "loss": 0.8777, + "step": 25785 + }, + { + "epoch": 5.500426621160409, + "grad_norm": 0.11791649423371359, + "learning_rate": 2.1344826485135756e-05, + "loss": 0.8918, + "step": 25786 + }, + { + "epoch": 5.500639931740614, + "grad_norm": 0.11346056932335732, + "learning_rate": 2.1339039970578446e-05, + "loss": 0.8924, + "step": 25787 + }, + { + "epoch": 5.500853242320819, + "grad_norm": 0.11836835901086064, + "learning_rate": 2.1333254119770884e-05, + "loss": 0.8809, + "step": 25788 + }, + { + "epoch": 5.501066552901024, + "grad_norm": 0.10845004342516862, + "learning_rate": 2.132746893277852e-05, + "loss": 0.8968, + "step": 25789 + }, + { + "epoch": 5.501279863481229, + "grad_norm": 0.1219318137161978, + "learning_rate": 2.1321684409666857e-05, + "loss": 0.8776, + "step": 25790 + }, + { + "epoch": 5.501493174061434, + "grad_norm": 0.11770863111211545, + "learning_rate": 2.131590055050131e-05, + "loss": 0.9055, + "step": 25791 + }, + { + "epoch": 5.501706484641638, + "grad_norm": 0.14148129443293592, + "learning_rate": 2.1310117355347348e-05, + "loss": 0.8648, + "step": 25792 + }, + { + "epoch": 5.501919795221843, + "grad_norm": 0.13853828411669353, + "learning_rate": 2.130433482427039e-05, + "loss": 0.9328, + "step": 25793 + }, + { + "epoch": 5.502133105802048, + "grad_norm": 0.12244423194410547, + "learning_rate": 2.1298552957335846e-05, + "loss": 0.9037, + "step": 25794 + }, + { + "epoch": 5.502346416382252, + "grad_norm": 0.11478787589089923, + "learning_rate": 2.129277175460919e-05, + "loss": 0.8923, + "step": 25795 + }, + { + "epoch": 5.502559726962457, + "grad_norm": 0.12696767482997445, + "learning_rate": 2.128699121615581e-05, + "loss": 0.887, + "step": 25796 + }, + { + "epoch": 5.502773037542662, + "grad_norm": 0.13035003534802725, + "learning_rate": 2.1281211342041105e-05, + "loss": 0.9014, + "step": 25797 + }, + { + "epoch": 5.502986348122867, + "grad_norm": 0.1345252699848552, + "learning_rate": 2.1275432132330484e-05, + "loss": 0.8714, + "step": 25798 + }, + { + "epoch": 5.503199658703072, + "grad_norm": 0.12977834210389944, + "learning_rate": 2.126965358708934e-05, + "loss": 0.8959, + "step": 25799 + }, + { + "epoch": 5.503412969283277, + "grad_norm": 0.13685980048238253, + "learning_rate": 2.126387570638305e-05, + "loss": 0.8754, + "step": 25800 + }, + { + "epoch": 5.503626279863481, + "grad_norm": 0.11570462666349994, + "learning_rate": 2.1258098490276975e-05, + "loss": 0.8985, + "step": 25801 + }, + { + "epoch": 5.503839590443686, + "grad_norm": 0.1179364286263526, + "learning_rate": 2.125232193883652e-05, + "loss": 0.8686, + "step": 25802 + }, + { + "epoch": 5.504052901023891, + "grad_norm": 0.12251844088376121, + "learning_rate": 2.1246546052127038e-05, + "loss": 0.8718, + "step": 25803 + }, + { + "epoch": 5.5042662116040955, + "grad_norm": 0.1005346861769776, + "learning_rate": 2.124077083021387e-05, + "loss": 0.876, + "step": 25804 + }, + { + "epoch": 5.5044795221843, + "grad_norm": 0.12942923911746204, + "learning_rate": 2.123499627316237e-05, + "loss": 0.9042, + "step": 25805 + }, + { + "epoch": 5.504692832764505, + "grad_norm": 0.11792931895993407, + "learning_rate": 2.1229222381037856e-05, + "loss": 0.9123, + "step": 25806 + }, + { + "epoch": 5.5049061433447095, + "grad_norm": 0.11688380045584096, + "learning_rate": 2.1223449153905698e-05, + "loss": 0.8657, + "step": 25807 + }, + { + "epoch": 5.505119453924914, + "grad_norm": 0.12636939310246437, + "learning_rate": 2.1217676591831227e-05, + "loss": 0.8999, + "step": 25808 + }, + { + "epoch": 5.50533276450512, + "grad_norm": 0.1299332019500057, + "learning_rate": 2.121190469487968e-05, + "loss": 0.8683, + "step": 25809 + }, + { + "epoch": 5.5055460750853245, + "grad_norm": 0.10387015487231281, + "learning_rate": 2.1206133463116457e-05, + "loss": 0.8874, + "step": 25810 + }, + { + "epoch": 5.505759385665529, + "grad_norm": 0.12823122098623793, + "learning_rate": 2.120036289660681e-05, + "loss": 0.8548, + "step": 25811 + }, + { + "epoch": 5.505972696245734, + "grad_norm": 0.13832519320759984, + "learning_rate": 2.119459299541606e-05, + "loss": 0.9074, + "step": 25812 + }, + { + "epoch": 5.5061860068259385, + "grad_norm": 0.1262467218184708, + "learning_rate": 2.1188823759609446e-05, + "loss": 0.9244, + "step": 25813 + }, + { + "epoch": 5.506399317406143, + "grad_norm": 0.13338524705728622, + "learning_rate": 2.1183055189252305e-05, + "loss": 0.8743, + "step": 25814 + }, + { + "epoch": 5.506612627986348, + "grad_norm": 0.1101318130862633, + "learning_rate": 2.117728728440988e-05, + "loss": 0.9047, + "step": 25815 + }, + { + "epoch": 5.506825938566553, + "grad_norm": 0.13268405621596707, + "learning_rate": 2.1171520045147453e-05, + "loss": 0.8606, + "step": 25816 + }, + { + "epoch": 5.507039249146757, + "grad_norm": 0.10284880314106318, + "learning_rate": 2.116575347153025e-05, + "loss": 0.8809, + "step": 25817 + }, + { + "epoch": 5.507252559726963, + "grad_norm": 0.12155472418791444, + "learning_rate": 2.1159987563623518e-05, + "loss": 0.9259, + "step": 25818 + }, + { + "epoch": 5.5074658703071675, + "grad_norm": 0.11595446331426608, + "learning_rate": 2.115422232149257e-05, + "loss": 0.8946, + "step": 25819 + }, + { + "epoch": 5.507679180887372, + "grad_norm": 0.11519932438606668, + "learning_rate": 2.114845774520255e-05, + "loss": 0.9143, + "step": 25820 + }, + { + "epoch": 5.507892491467577, + "grad_norm": 0.12716252846840986, + "learning_rate": 2.1142693834818706e-05, + "loss": 0.9387, + "step": 25821 + }, + { + "epoch": 5.5081058020477816, + "grad_norm": 0.12615297394128572, + "learning_rate": 2.11369305904063e-05, + "loss": 0.8998, + "step": 25822 + }, + { + "epoch": 5.508319112627986, + "grad_norm": 0.10319269486201958, + "learning_rate": 2.1131168012030505e-05, + "loss": 0.9212, + "step": 25823 + }, + { + "epoch": 5.508532423208191, + "grad_norm": 0.10686070844418724, + "learning_rate": 2.1125406099756533e-05, + "loss": 0.8842, + "step": 25824 + }, + { + "epoch": 5.508745733788396, + "grad_norm": 0.11530094606460926, + "learning_rate": 2.111964485364956e-05, + "loss": 0.9079, + "step": 25825 + }, + { + "epoch": 5.5089590443686, + "grad_norm": 0.10631366965552791, + "learning_rate": 2.111388427377483e-05, + "loss": 0.8991, + "step": 25826 + }, + { + "epoch": 5.509172354948806, + "grad_norm": 0.11770309465711724, + "learning_rate": 2.110812436019748e-05, + "loss": 0.9014, + "step": 25827 + }, + { + "epoch": 5.5093856655290105, + "grad_norm": 0.1137252836094727, + "learning_rate": 2.1102365112982693e-05, + "loss": 0.9078, + "step": 25828 + }, + { + "epoch": 5.509598976109215, + "grad_norm": 0.12312467905751434, + "learning_rate": 2.109660653219563e-05, + "loss": 0.8798, + "step": 25829 + }, + { + "epoch": 5.50981228668942, + "grad_norm": 0.10257960641942328, + "learning_rate": 2.109084861790146e-05, + "loss": 0.8732, + "step": 25830 + }, + { + "epoch": 5.510025597269625, + "grad_norm": 0.12073187843910115, + "learning_rate": 2.1085091370165332e-05, + "loss": 0.904, + "step": 25831 + }, + { + "epoch": 5.510238907849829, + "grad_norm": 0.12167411583006617, + "learning_rate": 2.1079334789052383e-05, + "loss": 0.8828, + "step": 25832 + }, + { + "epoch": 5.510452218430034, + "grad_norm": 0.10478764023939686, + "learning_rate": 2.1073578874627733e-05, + "loss": 0.8723, + "step": 25833 + }, + { + "epoch": 5.510665529010239, + "grad_norm": 0.12930623343421815, + "learning_rate": 2.106782362695655e-05, + "loss": 0.8777, + "step": 25834 + }, + { + "epoch": 5.510878839590443, + "grad_norm": 0.11164836417652999, + "learning_rate": 2.1062069046103934e-05, + "loss": 0.9166, + "step": 25835 + }, + { + "epoch": 5.511092150170649, + "grad_norm": 0.11644943022314706, + "learning_rate": 2.1056315132135e-05, + "loss": 0.8669, + "step": 25836 + }, + { + "epoch": 5.511305460750854, + "grad_norm": 0.13447305597730702, + "learning_rate": 2.1050561885114825e-05, + "loss": 0.8661, + "step": 25837 + }, + { + "epoch": 5.511518771331058, + "grad_norm": 0.11192054315964883, + "learning_rate": 2.104480930510856e-05, + "loss": 0.8869, + "step": 25838 + }, + { + "epoch": 5.511732081911263, + "grad_norm": 0.1283631608827786, + "learning_rate": 2.103905739218126e-05, + "loss": 0.8789, + "step": 25839 + }, + { + "epoch": 5.511945392491468, + "grad_norm": 0.11384293873572032, + "learning_rate": 2.103330614639803e-05, + "loss": 0.8774, + "step": 25840 + }, + { + "epoch": 5.512158703071672, + "grad_norm": 0.12252041142241386, + "learning_rate": 2.102755556782393e-05, + "loss": 0.8699, + "step": 25841 + }, + { + "epoch": 5.512372013651877, + "grad_norm": 0.1321047193234046, + "learning_rate": 2.1021805656524022e-05, + "loss": 0.8889, + "step": 25842 + }, + { + "epoch": 5.512585324232082, + "grad_norm": 0.12344702525976195, + "learning_rate": 2.1016056412563382e-05, + "loss": 0.8826, + "step": 25843 + }, + { + "epoch": 5.512798634812286, + "grad_norm": 0.11835317685759525, + "learning_rate": 2.101030783600706e-05, + "loss": 0.8717, + "step": 25844 + }, + { + "epoch": 5.513011945392492, + "grad_norm": 0.12956316051321637, + "learning_rate": 2.1004559926920063e-05, + "loss": 0.8811, + "step": 25845 + }, + { + "epoch": 5.513225255972696, + "grad_norm": 0.11962590482373468, + "learning_rate": 2.0998812685367493e-05, + "loss": 0.8804, + "step": 25846 + }, + { + "epoch": 5.513438566552901, + "grad_norm": 0.1219007429340471, + "learning_rate": 2.0993066111414348e-05, + "loss": 0.8812, + "step": 25847 + }, + { + "epoch": 5.513651877133106, + "grad_norm": 0.1247969622212006, + "learning_rate": 2.0987320205125653e-05, + "loss": 0.9115, + "step": 25848 + }, + { + "epoch": 5.513865187713311, + "grad_norm": 0.12989071210728986, + "learning_rate": 2.09815749665664e-05, + "loss": 0.9129, + "step": 25849 + }, + { + "epoch": 5.514078498293515, + "grad_norm": 0.10973022862248187, + "learning_rate": 2.0975830395801642e-05, + "loss": 0.8992, + "step": 25850 + }, + { + "epoch": 5.51429180887372, + "grad_norm": 0.11976363603388682, + "learning_rate": 2.0970086492896352e-05, + "loss": 0.8932, + "step": 25851 + }, + { + "epoch": 5.514505119453925, + "grad_norm": 0.11686505931929977, + "learning_rate": 2.0964343257915535e-05, + "loss": 0.8886, + "step": 25852 + }, + { + "epoch": 5.514718430034129, + "grad_norm": 0.127708667449125, + "learning_rate": 2.0958600690924165e-05, + "loss": 0.8589, + "step": 25853 + }, + { + "epoch": 5.514931740614334, + "grad_norm": 0.11352250387344073, + "learning_rate": 2.0952858791987216e-05, + "loss": 0.8767, + "step": 25854 + }, + { + "epoch": 5.515145051194539, + "grad_norm": 0.12945617374953497, + "learning_rate": 2.0947117561169666e-05, + "loss": 0.8717, + "step": 25855 + }, + { + "epoch": 5.515358361774744, + "grad_norm": 0.11503006646753507, + "learning_rate": 2.0941376998536472e-05, + "loss": 0.8417, + "step": 25856 + }, + { + "epoch": 5.515571672354949, + "grad_norm": 0.12025885381720322, + "learning_rate": 2.093563710415257e-05, + "loss": 0.8647, + "step": 25857 + }, + { + "epoch": 5.515784982935154, + "grad_norm": 0.11755777497009136, + "learning_rate": 2.0929897878082945e-05, + "loss": 0.8819, + "step": 25858 + }, + { + "epoch": 5.515998293515358, + "grad_norm": 0.12924692238943028, + "learning_rate": 2.092415932039253e-05, + "loss": 0.8968, + "step": 25859 + }, + { + "epoch": 5.516211604095563, + "grad_norm": 0.12138128982306935, + "learning_rate": 2.091842143114623e-05, + "loss": 0.8879, + "step": 25860 + }, + { + "epoch": 5.516424914675768, + "grad_norm": 0.12032791128472592, + "learning_rate": 2.0912684210408978e-05, + "loss": 0.8871, + "step": 25861 + }, + { + "epoch": 5.5166382252559725, + "grad_norm": 0.11765726742663232, + "learning_rate": 2.0906947658245712e-05, + "loss": 0.8659, + "step": 25862 + }, + { + "epoch": 5.516851535836177, + "grad_norm": 0.127152611953908, + "learning_rate": 2.090121177472133e-05, + "loss": 0.9121, + "step": 25863 + }, + { + "epoch": 5.517064846416382, + "grad_norm": 0.11313223222980166, + "learning_rate": 2.0895476559900753e-05, + "loss": 0.8769, + "step": 25864 + }, + { + "epoch": 5.517278156996587, + "grad_norm": 0.11893001067885589, + "learning_rate": 2.0889742013848808e-05, + "loss": 0.8861, + "step": 25865 + }, + { + "epoch": 5.517491467576792, + "grad_norm": 0.09735398784265534, + "learning_rate": 2.0884008136630447e-05, + "loss": 0.8559, + "step": 25866 + }, + { + "epoch": 5.517704778156997, + "grad_norm": 0.11542777273918224, + "learning_rate": 2.0878274928310534e-05, + "loss": 0.8896, + "step": 25867 + }, + { + "epoch": 5.5179180887372015, + "grad_norm": 0.11139475720500097, + "learning_rate": 2.087254238895393e-05, + "loss": 0.8818, + "step": 25868 + }, + { + "epoch": 5.518131399317406, + "grad_norm": 0.11480438945657702, + "learning_rate": 2.0866810518625486e-05, + "loss": 0.8695, + "step": 25869 + }, + { + "epoch": 5.518344709897611, + "grad_norm": 0.11248235831229081, + "learning_rate": 2.0861079317390088e-05, + "loss": 0.8863, + "step": 25870 + }, + { + "epoch": 5.5185580204778155, + "grad_norm": 0.12329555065248259, + "learning_rate": 2.085534878531258e-05, + "loss": 0.9, + "step": 25871 + }, + { + "epoch": 5.51877133105802, + "grad_norm": 0.12010932542254414, + "learning_rate": 2.0849618922457796e-05, + "loss": 0.9019, + "step": 25872 + }, + { + "epoch": 5.518984641638225, + "grad_norm": 0.1140082356744167, + "learning_rate": 2.0843889728890545e-05, + "loss": 0.8757, + "step": 25873 + }, + { + "epoch": 5.5191979522184305, + "grad_norm": 0.13218999642060097, + "learning_rate": 2.083816120467569e-05, + "loss": 0.8858, + "step": 25874 + }, + { + "epoch": 5.519411262798635, + "grad_norm": 0.1187018227306332, + "learning_rate": 2.083243334987807e-05, + "loss": 0.8725, + "step": 25875 + }, + { + "epoch": 5.51962457337884, + "grad_norm": 0.12075998635047724, + "learning_rate": 2.0826706164562437e-05, + "loss": 0.8695, + "step": 25876 + }, + { + "epoch": 5.5198378839590445, + "grad_norm": 0.1318885454593858, + "learning_rate": 2.08209796487936e-05, + "loss": 0.8851, + "step": 25877 + }, + { + "epoch": 5.520051194539249, + "grad_norm": 0.12586369794484636, + "learning_rate": 2.0815253802636385e-05, + "loss": 0.8974, + "step": 25878 + }, + { + "epoch": 5.520264505119454, + "grad_norm": 0.13144592586786133, + "learning_rate": 2.080952862615558e-05, + "loss": 0.8611, + "step": 25879 + }, + { + "epoch": 5.520477815699659, + "grad_norm": 0.12110575385637129, + "learning_rate": 2.0803804119415948e-05, + "loss": 0.886, + "step": 25880 + }, + { + "epoch": 5.520691126279863, + "grad_norm": 0.1424530133020847, + "learning_rate": 2.0798080282482244e-05, + "loss": 0.9118, + "step": 25881 + }, + { + "epoch": 5.520904436860068, + "grad_norm": 0.1159655756234447, + "learning_rate": 2.0792357115419284e-05, + "loss": 0.8824, + "step": 25882 + }, + { + "epoch": 5.5211177474402735, + "grad_norm": 0.15555458439450737, + "learning_rate": 2.07866346182918e-05, + "loss": 0.8844, + "step": 25883 + }, + { + "epoch": 5.521331058020478, + "grad_norm": 0.1352856975004171, + "learning_rate": 2.0780912791164544e-05, + "loss": 0.8947, + "step": 25884 + }, + { + "epoch": 5.521544368600683, + "grad_norm": 0.11683657320902317, + "learning_rate": 2.077519163410222e-05, + "loss": 0.8897, + "step": 25885 + }, + { + "epoch": 5.521757679180888, + "grad_norm": 0.13100515891857945, + "learning_rate": 2.0769471147169656e-05, + "loss": 0.877, + "step": 25886 + }, + { + "epoch": 5.521970989761092, + "grad_norm": 0.1434120960052769, + "learning_rate": 2.0763751330431495e-05, + "loss": 0.8671, + "step": 25887 + }, + { + "epoch": 5.522184300341297, + "grad_norm": 0.11181939008123105, + "learning_rate": 2.075803218395249e-05, + "loss": 0.8712, + "step": 25888 + }, + { + "epoch": 5.522397610921502, + "grad_norm": 0.15600974051571012, + "learning_rate": 2.075231370779732e-05, + "loss": 0.9232, + "step": 25889 + }, + { + "epoch": 5.522610921501706, + "grad_norm": 0.11561141284409575, + "learning_rate": 2.0746595902030737e-05, + "loss": 0.8923, + "step": 25890 + }, + { + "epoch": 5.522824232081911, + "grad_norm": 0.14679347347734858, + "learning_rate": 2.0740878766717422e-05, + "loss": 0.8924, + "step": 25891 + }, + { + "epoch": 5.523037542662116, + "grad_norm": 0.1009368495901304, + "learning_rate": 2.0735162301922065e-05, + "loss": 0.8928, + "step": 25892 + }, + { + "epoch": 5.52325085324232, + "grad_norm": 0.1260267086216131, + "learning_rate": 2.0729446507709313e-05, + "loss": 0.8992, + "step": 25893 + }, + { + "epoch": 5.523464163822526, + "grad_norm": 0.13203293627518503, + "learning_rate": 2.0723731384143893e-05, + "loss": 0.8835, + "step": 25894 + }, + { + "epoch": 5.523677474402731, + "grad_norm": 0.107430598362536, + "learning_rate": 2.0718016931290456e-05, + "loss": 0.8904, + "step": 25895 + }, + { + "epoch": 5.523890784982935, + "grad_norm": 0.13155709062957832, + "learning_rate": 2.0712303149213646e-05, + "loss": 0.88, + "step": 25896 + }, + { + "epoch": 5.52410409556314, + "grad_norm": 0.1213015793245774, + "learning_rate": 2.070659003797813e-05, + "loss": 0.8979, + "step": 25897 + }, + { + "epoch": 5.524317406143345, + "grad_norm": 0.11672954470118228, + "learning_rate": 2.070087759764854e-05, + "loss": 0.8657, + "step": 25898 + }, + { + "epoch": 5.524530716723549, + "grad_norm": 0.1489850895181672, + "learning_rate": 2.0695165828289527e-05, + "loss": 0.8895, + "step": 25899 + }, + { + "epoch": 5.524744027303754, + "grad_norm": 0.1091597561749928, + "learning_rate": 2.0689454729965696e-05, + "loss": 0.879, + "step": 25900 + }, + { + "epoch": 5.524957337883959, + "grad_norm": 0.1558310880316525, + "learning_rate": 2.068374430274167e-05, + "loss": 0.8887, + "step": 25901 + }, + { + "epoch": 5.525170648464163, + "grad_norm": 0.11156174658528371, + "learning_rate": 2.0678034546682095e-05, + "loss": 0.884, + "step": 25902 + }, + { + "epoch": 5.525383959044369, + "grad_norm": 0.13559758861354088, + "learning_rate": 2.0672325461851555e-05, + "loss": 0.9, + "step": 25903 + }, + { + "epoch": 5.525597269624574, + "grad_norm": 0.12560411659307277, + "learning_rate": 2.0666617048314654e-05, + "loss": 0.8734, + "step": 25904 + }, + { + "epoch": 5.525810580204778, + "grad_norm": 0.13378826215320933, + "learning_rate": 2.066090930613595e-05, + "loss": 0.8955, + "step": 25905 + }, + { + "epoch": 5.526023890784983, + "grad_norm": 0.12324363680202227, + "learning_rate": 2.0655202235380082e-05, + "loss": 0.8904, + "step": 25906 + }, + { + "epoch": 5.526237201365188, + "grad_norm": 0.11255159145720044, + "learning_rate": 2.0649495836111598e-05, + "loss": 0.8797, + "step": 25907 + }, + { + "epoch": 5.526450511945392, + "grad_norm": 0.1255262338540184, + "learning_rate": 2.064379010839507e-05, + "loss": 0.9256, + "step": 25908 + }, + { + "epoch": 5.526663822525597, + "grad_norm": 0.13223216959920012, + "learning_rate": 2.0638085052295052e-05, + "loss": 0.8676, + "step": 25909 + }, + { + "epoch": 5.526877133105802, + "grad_norm": 0.1293181000403215, + "learning_rate": 2.0632380667876105e-05, + "loss": 0.9094, + "step": 25910 + }, + { + "epoch": 5.527090443686006, + "grad_norm": 0.135926195531412, + "learning_rate": 2.0626676955202766e-05, + "loss": 0.8856, + "step": 25911 + }, + { + "epoch": 5.527303754266212, + "grad_norm": 0.11244132111639678, + "learning_rate": 2.062097391433958e-05, + "loss": 0.901, + "step": 25912 + }, + { + "epoch": 5.527517064846417, + "grad_norm": 0.1508912389394444, + "learning_rate": 2.061527154535104e-05, + "loss": 0.888, + "step": 25913 + }, + { + "epoch": 5.527730375426621, + "grad_norm": 0.10831185102032938, + "learning_rate": 2.060956984830173e-05, + "loss": 0.9273, + "step": 25914 + }, + { + "epoch": 5.527943686006826, + "grad_norm": 0.15824446708080672, + "learning_rate": 2.060386882325614e-05, + "loss": 0.8825, + "step": 25915 + }, + { + "epoch": 5.528156996587031, + "grad_norm": 0.12017340422170847, + "learning_rate": 2.059816847027877e-05, + "loss": 0.9082, + "step": 25916 + }, + { + "epoch": 5.528370307167235, + "grad_norm": 0.12338831322411169, + "learning_rate": 2.0592468789434096e-05, + "loss": 0.8833, + "step": 25917 + }, + { + "epoch": 5.52858361774744, + "grad_norm": 0.14228410914066927, + "learning_rate": 2.0586769780786662e-05, + "loss": 0.8846, + "step": 25918 + }, + { + "epoch": 5.528796928327645, + "grad_norm": 0.11195638951684464, + "learning_rate": 2.0581071444400927e-05, + "loss": 0.9041, + "step": 25919 + }, + { + "epoch": 5.5290102389078495, + "grad_norm": 0.14149795111382552, + "learning_rate": 2.0575373780341373e-05, + "loss": 0.8832, + "step": 25920 + }, + { + "epoch": 5.529223549488055, + "grad_norm": 0.11857868640222304, + "learning_rate": 2.0569676788672464e-05, + "loss": 0.9051, + "step": 25921 + }, + { + "epoch": 5.52943686006826, + "grad_norm": 0.1297063854912559, + "learning_rate": 2.0563980469458657e-05, + "loss": 0.8855, + "step": 25922 + }, + { + "epoch": 5.529650170648464, + "grad_norm": 0.13434359494511283, + "learning_rate": 2.0558284822764412e-05, + "loss": 0.8905, + "step": 25923 + }, + { + "epoch": 5.529863481228669, + "grad_norm": 0.11881696756837144, + "learning_rate": 2.0552589848654172e-05, + "loss": 0.9104, + "step": 25924 + }, + { + "epoch": 5.530076791808874, + "grad_norm": 0.1388189083065923, + "learning_rate": 2.054689554719236e-05, + "loss": 0.8883, + "step": 25925 + }, + { + "epoch": 5.5302901023890785, + "grad_norm": 0.11081323740342433, + "learning_rate": 2.0541201918443444e-05, + "loss": 0.8937, + "step": 25926 + }, + { + "epoch": 5.530503412969283, + "grad_norm": 0.1420222004030205, + "learning_rate": 2.0535508962471827e-05, + "loss": 0.9134, + "step": 25927 + }, + { + "epoch": 5.530716723549488, + "grad_norm": 0.13398084021590156, + "learning_rate": 2.0529816679341933e-05, + "loss": 0.8915, + "step": 25928 + }, + { + "epoch": 5.5309300341296925, + "grad_norm": 0.11114044307486783, + "learning_rate": 2.0524125069118136e-05, + "loss": 0.8931, + "step": 25929 + }, + { + "epoch": 5.531143344709898, + "grad_norm": 0.1470380623713373, + "learning_rate": 2.0518434131864886e-05, + "loss": 0.9008, + "step": 25930 + }, + { + "epoch": 5.531356655290102, + "grad_norm": 0.11815336265537826, + "learning_rate": 2.051274386764658e-05, + "loss": 0.8945, + "step": 25931 + }, + { + "epoch": 5.5315699658703075, + "grad_norm": 0.13915374019416224, + "learning_rate": 2.050705427652755e-05, + "loss": 0.8896, + "step": 25932 + }, + { + "epoch": 5.531783276450512, + "grad_norm": 0.13569889668990032, + "learning_rate": 2.0501365358572185e-05, + "loss": 0.892, + "step": 25933 + }, + { + "epoch": 5.531996587030717, + "grad_norm": 0.16967776355702888, + "learning_rate": 2.049567711384489e-05, + "loss": 0.8948, + "step": 25934 + }, + { + "epoch": 5.5322098976109215, + "grad_norm": 0.12814535021457035, + "learning_rate": 2.048998954241001e-05, + "loss": 0.8964, + "step": 25935 + }, + { + "epoch": 5.532423208191126, + "grad_norm": 0.14717147465148928, + "learning_rate": 2.04843026443319e-05, + "loss": 0.8733, + "step": 25936 + }, + { + "epoch": 5.532636518771331, + "grad_norm": 0.12684851751747223, + "learning_rate": 2.0478616419674882e-05, + "loss": 0.8804, + "step": 25937 + }, + { + "epoch": 5.532849829351536, + "grad_norm": 0.14325320742561387, + "learning_rate": 2.047293086850335e-05, + "loss": 0.8559, + "step": 25938 + }, + { + "epoch": 5.53306313993174, + "grad_norm": 0.1254062663246203, + "learning_rate": 2.0467245990881597e-05, + "loss": 0.8906, + "step": 25939 + }, + { + "epoch": 5.533276450511945, + "grad_norm": 0.13142780260301345, + "learning_rate": 2.0461561786873964e-05, + "loss": 0.8797, + "step": 25940 + }, + { + "epoch": 5.5334897610921505, + "grad_norm": 0.1436004347669196, + "learning_rate": 2.045587825654474e-05, + "loss": 0.9066, + "step": 25941 + }, + { + "epoch": 5.533703071672355, + "grad_norm": 0.11423513939307453, + "learning_rate": 2.0450195399958294e-05, + "loss": 0.8655, + "step": 25942 + }, + { + "epoch": 5.53391638225256, + "grad_norm": 0.13750035740458183, + "learning_rate": 2.0444513217178878e-05, + "loss": 0.908, + "step": 25943 + }, + { + "epoch": 5.534129692832765, + "grad_norm": 0.11190038027170018, + "learning_rate": 2.043883170827078e-05, + "loss": 0.8815, + "step": 25944 + }, + { + "epoch": 5.534343003412969, + "grad_norm": 0.10870827326786486, + "learning_rate": 2.0433150873298286e-05, + "loss": 0.8862, + "step": 25945 + }, + { + "epoch": 5.534556313993174, + "grad_norm": 0.11845410387487305, + "learning_rate": 2.042747071232572e-05, + "loss": 0.9114, + "step": 25946 + }, + { + "epoch": 5.534769624573379, + "grad_norm": 0.10777831497822761, + "learning_rate": 2.0421791225417324e-05, + "loss": 0.8714, + "step": 25947 + }, + { + "epoch": 5.534982935153583, + "grad_norm": 0.10469220920312577, + "learning_rate": 2.0416112412637357e-05, + "loss": 0.8832, + "step": 25948 + }, + { + "epoch": 5.535196245733788, + "grad_norm": 0.11065750046903376, + "learning_rate": 2.041043427405006e-05, + "loss": 0.8948, + "step": 25949 + }, + { + "epoch": 5.535409556313994, + "grad_norm": 0.11656620670564226, + "learning_rate": 2.040475680971972e-05, + "loss": 0.8753, + "step": 25950 + }, + { + "epoch": 5.535622866894198, + "grad_norm": 0.11797381379475583, + "learning_rate": 2.039908001971057e-05, + "loss": 0.8977, + "step": 25951 + }, + { + "epoch": 5.535836177474403, + "grad_norm": 0.11685020422513032, + "learning_rate": 2.0393403904086825e-05, + "loss": 0.8719, + "step": 25952 + }, + { + "epoch": 5.536049488054608, + "grad_norm": 0.12507556048721719, + "learning_rate": 2.0387728462912716e-05, + "loss": 0.8524, + "step": 25953 + }, + { + "epoch": 5.536262798634812, + "grad_norm": 0.12787221543105942, + "learning_rate": 2.0382053696252467e-05, + "loss": 0.8971, + "step": 25954 + }, + { + "epoch": 5.536476109215017, + "grad_norm": 0.12090022319432607, + "learning_rate": 2.037637960417028e-05, + "loss": 0.8986, + "step": 25955 + }, + { + "epoch": 5.536689419795222, + "grad_norm": 0.15631527646845597, + "learning_rate": 2.0370706186730356e-05, + "loss": 0.9058, + "step": 25956 + }, + { + "epoch": 5.536902730375426, + "grad_norm": 0.10514066563742025, + "learning_rate": 2.0365033443996883e-05, + "loss": 0.8834, + "step": 25957 + }, + { + "epoch": 5.537116040955631, + "grad_norm": 0.136722641437859, + "learning_rate": 2.0359361376034074e-05, + "loss": 0.9047, + "step": 25958 + }, + { + "epoch": 5.537329351535837, + "grad_norm": 0.11188132318817155, + "learning_rate": 2.0353689982906096e-05, + "loss": 0.9179, + "step": 25959 + }, + { + "epoch": 5.537542662116041, + "grad_norm": 0.13567874416930933, + "learning_rate": 2.0348019264677116e-05, + "loss": 0.909, + "step": 25960 + }, + { + "epoch": 5.537755972696246, + "grad_norm": 0.12111755254753286, + "learning_rate": 2.0342349221411286e-05, + "loss": 0.8759, + "step": 25961 + }, + { + "epoch": 5.537969283276451, + "grad_norm": 0.11512962812587725, + "learning_rate": 2.0336679853172798e-05, + "loss": 0.8822, + "step": 25962 + }, + { + "epoch": 5.538182593856655, + "grad_norm": 0.12161839837881987, + "learning_rate": 2.0331011160025777e-05, + "loss": 0.8559, + "step": 25963 + }, + { + "epoch": 5.53839590443686, + "grad_norm": 0.1100219805981976, + "learning_rate": 2.032534314203437e-05, + "loss": 0.8921, + "step": 25964 + }, + { + "epoch": 5.538609215017065, + "grad_norm": 0.11622802314394506, + "learning_rate": 2.0319675799262714e-05, + "loss": 0.8953, + "step": 25965 + }, + { + "epoch": 5.538822525597269, + "grad_norm": 0.1124667955411786, + "learning_rate": 2.0314009131774925e-05, + "loss": 0.8875, + "step": 25966 + }, + { + "epoch": 5.539035836177474, + "grad_norm": 0.1121453730962247, + "learning_rate": 2.0308343139635127e-05, + "loss": 0.9148, + "step": 25967 + }, + { + "epoch": 5.53924914675768, + "grad_norm": 0.11404041195901989, + "learning_rate": 2.0302677822907428e-05, + "loss": 0.8841, + "step": 25968 + }, + { + "epoch": 5.539462457337884, + "grad_norm": 0.11172107886293448, + "learning_rate": 2.029701318165592e-05, + "loss": 0.8817, + "step": 25969 + }, + { + "epoch": 5.539675767918089, + "grad_norm": 0.11068070666907273, + "learning_rate": 2.029134921594473e-05, + "loss": 0.8878, + "step": 25970 + }, + { + "epoch": 5.539889078498294, + "grad_norm": 0.11381016283403468, + "learning_rate": 2.0285685925837928e-05, + "loss": 0.8671, + "step": 25971 + }, + { + "epoch": 5.540102389078498, + "grad_norm": 0.12136461474085385, + "learning_rate": 2.028002331139959e-05, + "loss": 0.8752, + "step": 25972 + }, + { + "epoch": 5.540315699658703, + "grad_norm": 0.11737247819740693, + "learning_rate": 2.0274361372693772e-05, + "loss": 0.8868, + "step": 25973 + }, + { + "epoch": 5.540529010238908, + "grad_norm": 0.12386848606158847, + "learning_rate": 2.0268700109784582e-05, + "loss": 0.8894, + "step": 25974 + }, + { + "epoch": 5.540742320819112, + "grad_norm": 0.13566010479915092, + "learning_rate": 2.0263039522736055e-05, + "loss": 0.9043, + "step": 25975 + }, + { + "epoch": 5.540955631399317, + "grad_norm": 0.12912432122673884, + "learning_rate": 2.0257379611612248e-05, + "loss": 0.8971, + "step": 25976 + }, + { + "epoch": 5.541168941979522, + "grad_norm": 0.11652242896464363, + "learning_rate": 2.0251720376477183e-05, + "loss": 0.9086, + "step": 25977 + }, + { + "epoch": 5.5413822525597265, + "grad_norm": 0.11301924636073397, + "learning_rate": 2.0246061817394915e-05, + "loss": 0.8559, + "step": 25978 + }, + { + "epoch": 5.541595563139932, + "grad_norm": 0.12534503052646864, + "learning_rate": 2.0240403934429463e-05, + "loss": 0.8547, + "step": 25979 + }, + { + "epoch": 5.541808873720137, + "grad_norm": 0.11215440805512433, + "learning_rate": 2.0234746727644846e-05, + "loss": 0.8708, + "step": 25980 + }, + { + "epoch": 5.542022184300341, + "grad_norm": 0.12632943141440725, + "learning_rate": 2.0229090197105046e-05, + "loss": 0.8956, + "step": 25981 + }, + { + "epoch": 5.542235494880546, + "grad_norm": 0.10240489256603952, + "learning_rate": 2.0223434342874126e-05, + "loss": 0.8745, + "step": 25982 + }, + { + "epoch": 5.542448805460751, + "grad_norm": 0.14186997533821577, + "learning_rate": 2.0217779165016046e-05, + "loss": 0.9056, + "step": 25983 + }, + { + "epoch": 5.5426621160409555, + "grad_norm": 0.12113005207588418, + "learning_rate": 2.0212124663594805e-05, + "loss": 0.8978, + "step": 25984 + }, + { + "epoch": 5.54287542662116, + "grad_norm": 0.13180946813456668, + "learning_rate": 2.0206470838674353e-05, + "loss": 0.8805, + "step": 25985 + }, + { + "epoch": 5.543088737201365, + "grad_norm": 0.11957464442087433, + "learning_rate": 2.0200817690318714e-05, + "loss": 0.8926, + "step": 25986 + }, + { + "epoch": 5.5433020477815695, + "grad_norm": 0.12493728321163958, + "learning_rate": 2.019516521859186e-05, + "loss": 0.8972, + "step": 25987 + }, + { + "epoch": 5.543515358361775, + "grad_norm": 0.14256567472430512, + "learning_rate": 2.0189513423557656e-05, + "loss": 0.8894, + "step": 25988 + }, + { + "epoch": 5.54372866894198, + "grad_norm": 0.11886587093771647, + "learning_rate": 2.018386230528014e-05, + "loss": 0.8811, + "step": 25989 + }, + { + "epoch": 5.5439419795221845, + "grad_norm": 0.13414124950953824, + "learning_rate": 2.0178211863823237e-05, + "loss": 0.8994, + "step": 25990 + }, + { + "epoch": 5.544155290102389, + "grad_norm": 0.11786609719234258, + "learning_rate": 2.0172562099250862e-05, + "loss": 0.8668, + "step": 25991 + }, + { + "epoch": 5.544368600682594, + "grad_norm": 0.12589858723071204, + "learning_rate": 2.0166913011626964e-05, + "loss": 0.8792, + "step": 25992 + }, + { + "epoch": 5.5445819112627985, + "grad_norm": 0.1143246094225261, + "learning_rate": 2.0161264601015417e-05, + "loss": 0.9134, + "step": 25993 + }, + { + "epoch": 5.544795221843003, + "grad_norm": 0.1336584463838856, + "learning_rate": 2.01556168674802e-05, + "loss": 0.8946, + "step": 25994 + }, + { + "epoch": 5.545008532423208, + "grad_norm": 0.12944393917824554, + "learning_rate": 2.014996981108518e-05, + "loss": 0.9136, + "step": 25995 + }, + { + "epoch": 5.545221843003413, + "grad_norm": 0.12429660268195476, + "learning_rate": 2.0144323431894255e-05, + "loss": 0.8915, + "step": 25996 + }, + { + "epoch": 5.545435153583618, + "grad_norm": 0.13364600757060507, + "learning_rate": 2.0138677729971308e-05, + "loss": 0.8841, + "step": 25997 + }, + { + "epoch": 5.545648464163823, + "grad_norm": 0.11150812708164967, + "learning_rate": 2.0133032705380266e-05, + "loss": 0.9003, + "step": 25998 + }, + { + "epoch": 5.5458617747440275, + "grad_norm": 0.12953923342013351, + "learning_rate": 2.012738835818494e-05, + "loss": 0.8872, + "step": 25999 + }, + { + "epoch": 5.546075085324232, + "grad_norm": 0.11157550885044197, + "learning_rate": 2.0121744688449204e-05, + "loss": 0.8803, + "step": 26000 + }, + { + "epoch": 5.546288395904437, + "grad_norm": 0.13049054261660387, + "learning_rate": 2.011610169623695e-05, + "loss": 0.8797, + "step": 26001 + }, + { + "epoch": 5.546501706484642, + "grad_norm": 0.11460305574168232, + "learning_rate": 2.0110459381612024e-05, + "loss": 0.859, + "step": 26002 + }, + { + "epoch": 5.546715017064846, + "grad_norm": 0.13169912581383797, + "learning_rate": 2.0104817744638252e-05, + "loss": 0.8904, + "step": 26003 + }, + { + "epoch": 5.546928327645051, + "grad_norm": 0.10841828238970416, + "learning_rate": 2.0099176785379465e-05, + "loss": 0.8847, + "step": 26004 + }, + { + "epoch": 5.547141638225256, + "grad_norm": 0.11794870830159102, + "learning_rate": 2.0093536503899492e-05, + "loss": 0.9002, + "step": 26005 + }, + { + "epoch": 5.547354948805461, + "grad_norm": 0.11065257486607326, + "learning_rate": 2.008789690026218e-05, + "loss": 0.88, + "step": 26006 + }, + { + "epoch": 5.547568259385666, + "grad_norm": 0.12112229475912771, + "learning_rate": 2.0082257974531316e-05, + "loss": 0.9114, + "step": 26007 + }, + { + "epoch": 5.547781569965871, + "grad_norm": 0.10962253422803216, + "learning_rate": 2.0076619726770714e-05, + "loss": 0.8903, + "step": 26008 + }, + { + "epoch": 5.547994880546075, + "grad_norm": 0.11328101702736242, + "learning_rate": 2.0070982157044173e-05, + "loss": 0.8632, + "step": 26009 + }, + { + "epoch": 5.54820819112628, + "grad_norm": 0.11048730899916526, + "learning_rate": 2.006534526541547e-05, + "loss": 0.8865, + "step": 26010 + }, + { + "epoch": 5.548421501706485, + "grad_norm": 0.10673542253549285, + "learning_rate": 2.0059709051948397e-05, + "loss": 0.9036, + "step": 26011 + }, + { + "epoch": 5.548634812286689, + "grad_norm": 0.12061524618310145, + "learning_rate": 2.0054073516706704e-05, + "loss": 0.9024, + "step": 26012 + }, + { + "epoch": 5.548848122866894, + "grad_norm": 0.10486075624599517, + "learning_rate": 2.0048438659754196e-05, + "loss": 0.9131, + "step": 26013 + }, + { + "epoch": 5.549061433447099, + "grad_norm": 0.11854011585332258, + "learning_rate": 2.004280448115461e-05, + "loss": 0.9025, + "step": 26014 + }, + { + "epoch": 5.549274744027304, + "grad_norm": 0.1224472685147247, + "learning_rate": 2.0037170980971706e-05, + "loss": 0.8928, + "step": 26015 + }, + { + "epoch": 5.549488054607508, + "grad_norm": 0.11098916432056025, + "learning_rate": 2.0031538159269218e-05, + "loss": 0.873, + "step": 26016 + }, + { + "epoch": 5.549701365187714, + "grad_norm": 0.13209066282794815, + "learning_rate": 2.0025906016110862e-05, + "loss": 0.9176, + "step": 26017 + }, + { + "epoch": 5.549914675767918, + "grad_norm": 0.10944221195370217, + "learning_rate": 2.002027455156041e-05, + "loss": 0.8858, + "step": 26018 + }, + { + "epoch": 5.550127986348123, + "grad_norm": 0.10972012756041419, + "learning_rate": 2.0014643765681566e-05, + "loss": 0.8893, + "step": 26019 + }, + { + "epoch": 5.550341296928328, + "grad_norm": 0.11882991743873539, + "learning_rate": 2.0009013658538033e-05, + "loss": 0.9194, + "step": 26020 + }, + { + "epoch": 5.550554607508532, + "grad_norm": 0.11396259309992367, + "learning_rate": 2.0003384230193514e-05, + "loss": 0.9143, + "step": 26021 + }, + { + "epoch": 5.550767918088737, + "grad_norm": 0.11141150985426314, + "learning_rate": 1.9997755480711723e-05, + "loss": 0.8862, + "step": 26022 + }, + { + "epoch": 5.550981228668942, + "grad_norm": 0.11402592341306553, + "learning_rate": 1.999212741015633e-05, + "loss": 0.9062, + "step": 26023 + }, + { + "epoch": 5.551194539249146, + "grad_norm": 0.10554608580650977, + "learning_rate": 1.9986500018591004e-05, + "loss": 0.891, + "step": 26024 + }, + { + "epoch": 5.551407849829351, + "grad_norm": 0.1140105647170255, + "learning_rate": 1.9980873306079455e-05, + "loss": 0.8879, + "step": 26025 + }, + { + "epoch": 5.551621160409557, + "grad_norm": 0.11400783612960255, + "learning_rate": 1.9975247272685347e-05, + "loss": 0.9152, + "step": 26026 + }, + { + "epoch": 5.551834470989761, + "grad_norm": 0.10648331213518972, + "learning_rate": 1.9969621918472316e-05, + "loss": 0.9167, + "step": 26027 + }, + { + "epoch": 5.552047781569966, + "grad_norm": 0.1091580673379735, + "learning_rate": 1.996399724350403e-05, + "loss": 0.8755, + "step": 26028 + }, + { + "epoch": 5.552261092150171, + "grad_norm": 0.11626773235267415, + "learning_rate": 1.995837324784409e-05, + "loss": 0.8657, + "step": 26029 + }, + { + "epoch": 5.552474402730375, + "grad_norm": 0.09939807085483336, + "learning_rate": 1.9952749931556204e-05, + "loss": 0.8811, + "step": 26030 + }, + { + "epoch": 5.55268771331058, + "grad_norm": 0.10808109997577012, + "learning_rate": 1.994712729470396e-05, + "loss": 0.8577, + "step": 26031 + }, + { + "epoch": 5.552901023890785, + "grad_norm": 0.11687574837196565, + "learning_rate": 1.9941505337350975e-05, + "loss": 0.8927, + "step": 26032 + }, + { + "epoch": 5.5531143344709895, + "grad_norm": 0.10874336237247159, + "learning_rate": 1.9935884059560866e-05, + "loss": 0.8952, + "step": 26033 + }, + { + "epoch": 5.553327645051194, + "grad_norm": 0.11062854665014099, + "learning_rate": 1.9930263461397247e-05, + "loss": 0.8693, + "step": 26034 + }, + { + "epoch": 5.5535409556314, + "grad_norm": 0.11204258567567953, + "learning_rate": 1.9924643542923708e-05, + "loss": 0.909, + "step": 26035 + }, + { + "epoch": 5.553754266211604, + "grad_norm": 0.10502632112455709, + "learning_rate": 1.9919024304203808e-05, + "loss": 0.8948, + "step": 26036 + }, + { + "epoch": 5.553967576791809, + "grad_norm": 0.10352022270699689, + "learning_rate": 1.991340574530118e-05, + "loss": 0.8851, + "step": 26037 + }, + { + "epoch": 5.554180887372014, + "grad_norm": 0.10380815204852739, + "learning_rate": 1.990778786627938e-05, + "loss": 0.8948, + "step": 26038 + }, + { + "epoch": 5.5543941979522184, + "grad_norm": 0.09964324636703532, + "learning_rate": 1.990217066720197e-05, + "loss": 0.8856, + "step": 26039 + }, + { + "epoch": 5.554607508532423, + "grad_norm": 0.11176745336404958, + "learning_rate": 1.9896554148132513e-05, + "loss": 0.8996, + "step": 26040 + }, + { + "epoch": 5.554820819112628, + "grad_norm": 0.10088470057912097, + "learning_rate": 1.9890938309134534e-05, + "loss": 0.8917, + "step": 26041 + }, + { + "epoch": 5.5550341296928325, + "grad_norm": 0.10877392258628867, + "learning_rate": 1.988532315027162e-05, + "loss": 0.8996, + "step": 26042 + }, + { + "epoch": 5.555247440273037, + "grad_norm": 0.10976874284882092, + "learning_rate": 1.987970867160731e-05, + "loss": 0.8909, + "step": 26043 + }, + { + "epoch": 5.555460750853243, + "grad_norm": 0.11965126236655992, + "learning_rate": 1.9874094873205053e-05, + "loss": 0.8922, + "step": 26044 + }, + { + "epoch": 5.555674061433447, + "grad_norm": 0.10715075199694833, + "learning_rate": 1.9868481755128456e-05, + "loss": 0.8918, + "step": 26045 + }, + { + "epoch": 5.555887372013652, + "grad_norm": 0.13124367894881026, + "learning_rate": 1.9862869317441004e-05, + "loss": 0.8893, + "step": 26046 + }, + { + "epoch": 5.556100682593857, + "grad_norm": 0.1264222344643473, + "learning_rate": 1.985725756020619e-05, + "loss": 0.8722, + "step": 26047 + }, + { + "epoch": 5.5563139931740615, + "grad_norm": 0.11062931005268437, + "learning_rate": 1.985164648348749e-05, + "loss": 0.9051, + "step": 26048 + }, + { + "epoch": 5.556527303754266, + "grad_norm": 0.11190325204072414, + "learning_rate": 1.9846036087348458e-05, + "loss": 0.89, + "step": 26049 + }, + { + "epoch": 5.556740614334471, + "grad_norm": 0.11297540857728347, + "learning_rate": 1.9840426371852528e-05, + "loss": 0.9007, + "step": 26050 + }, + { + "epoch": 5.5569539249146755, + "grad_norm": 0.12137942398610092, + "learning_rate": 1.9834817337063194e-05, + "loss": 0.8882, + "step": 26051 + }, + { + "epoch": 5.55716723549488, + "grad_norm": 0.12019231836226171, + "learning_rate": 1.982920898304391e-05, + "loss": 0.9014, + "step": 26052 + }, + { + "epoch": 5.557380546075086, + "grad_norm": 0.11804036165888533, + "learning_rate": 1.9823601309858107e-05, + "loss": 0.8908, + "step": 26053 + }, + { + "epoch": 5.5575938566552905, + "grad_norm": 0.10848407476448775, + "learning_rate": 1.981799431756932e-05, + "loss": 0.8746, + "step": 26054 + }, + { + "epoch": 5.557807167235495, + "grad_norm": 0.10534175043928966, + "learning_rate": 1.981238800624093e-05, + "loss": 0.8885, + "step": 26055 + }, + { + "epoch": 5.5580204778157, + "grad_norm": 0.11696909401876401, + "learning_rate": 1.9806782375936338e-05, + "loss": 0.8851, + "step": 26056 + }, + { + "epoch": 5.5582337883959045, + "grad_norm": 0.10215485501075051, + "learning_rate": 1.9801177426719054e-05, + "loss": 0.9038, + "step": 26057 + }, + { + "epoch": 5.558447098976109, + "grad_norm": 0.10432679550950039, + "learning_rate": 1.979557315865245e-05, + "loss": 0.8704, + "step": 26058 + }, + { + "epoch": 5.558660409556314, + "grad_norm": 0.10428853885607939, + "learning_rate": 1.9789969571799952e-05, + "loss": 0.9132, + "step": 26059 + }, + { + "epoch": 5.558873720136519, + "grad_norm": 0.10427167364212636, + "learning_rate": 1.9784366666224935e-05, + "loss": 0.8843, + "step": 26060 + }, + { + "epoch": 5.559087030716723, + "grad_norm": 0.11223396916246642, + "learning_rate": 1.977876444199085e-05, + "loss": 0.8915, + "step": 26061 + }, + { + "epoch": 5.559300341296928, + "grad_norm": 0.11034866375065527, + "learning_rate": 1.9773162899161057e-05, + "loss": 0.9002, + "step": 26062 + }, + { + "epoch": 5.559513651877133, + "grad_norm": 0.11705448760566663, + "learning_rate": 1.9767562037798943e-05, + "loss": 0.8843, + "step": 26063 + }, + { + "epoch": 5.559726962457338, + "grad_norm": 0.1101890767318309, + "learning_rate": 1.976196185796787e-05, + "loss": 0.8864, + "step": 26064 + }, + { + "epoch": 5.559940273037543, + "grad_norm": 0.10874119668783297, + "learning_rate": 1.975636235973122e-05, + "loss": 0.8832, + "step": 26065 + }, + { + "epoch": 5.560153583617748, + "grad_norm": 0.12079590444170177, + "learning_rate": 1.9750763543152338e-05, + "loss": 0.8924, + "step": 26066 + }, + { + "epoch": 5.560366894197952, + "grad_norm": 0.11228196449035228, + "learning_rate": 1.9745165408294573e-05, + "loss": 0.8451, + "step": 26067 + }, + { + "epoch": 5.560580204778157, + "grad_norm": 0.11799589969895855, + "learning_rate": 1.973956795522126e-05, + "loss": 0.8635, + "step": 26068 + }, + { + "epoch": 5.560793515358362, + "grad_norm": 0.13580794270047822, + "learning_rate": 1.973397118399578e-05, + "loss": 0.8944, + "step": 26069 + }, + { + "epoch": 5.561006825938566, + "grad_norm": 0.11563694048576964, + "learning_rate": 1.9728375094681418e-05, + "loss": 0.8623, + "step": 26070 + }, + { + "epoch": 5.561220136518771, + "grad_norm": 0.14536906521585288, + "learning_rate": 1.972277968734151e-05, + "loss": 0.8707, + "step": 26071 + }, + { + "epoch": 5.561433447098976, + "grad_norm": 0.11654428695712014, + "learning_rate": 1.971718496203933e-05, + "loss": 0.9209, + "step": 26072 + }, + { + "epoch": 5.561646757679181, + "grad_norm": 0.14026315885512522, + "learning_rate": 1.9711590918838253e-05, + "loss": 0.8842, + "step": 26073 + }, + { + "epoch": 5.561860068259386, + "grad_norm": 0.11336143871105686, + "learning_rate": 1.9705997557801532e-05, + "loss": 0.8624, + "step": 26074 + }, + { + "epoch": 5.562073378839591, + "grad_norm": 0.12453568315331084, + "learning_rate": 1.9700404878992466e-05, + "loss": 0.9205, + "step": 26075 + }, + { + "epoch": 5.562286689419795, + "grad_norm": 0.1268323303951129, + "learning_rate": 1.9694812882474333e-05, + "loss": 0.8957, + "step": 26076 + }, + { + "epoch": 5.5625, + "grad_norm": 0.10051072804358777, + "learning_rate": 1.968922156831041e-05, + "loss": 0.8812, + "step": 26077 + }, + { + "epoch": 5.562713310580205, + "grad_norm": 0.11054386598671971, + "learning_rate": 1.9683630936563948e-05, + "loss": 0.8953, + "step": 26078 + }, + { + "epoch": 5.562926621160409, + "grad_norm": 0.11906069207071085, + "learning_rate": 1.967804098729823e-05, + "loss": 0.8797, + "step": 26079 + }, + { + "epoch": 5.563139931740614, + "grad_norm": 0.10188348464056478, + "learning_rate": 1.967245172057646e-05, + "loss": 0.8918, + "step": 26080 + }, + { + "epoch": 5.563353242320819, + "grad_norm": 0.1015948848160121, + "learning_rate": 1.966686313646194e-05, + "loss": 0.8601, + "step": 26081 + }, + { + "epoch": 5.563566552901024, + "grad_norm": 0.10614914490274445, + "learning_rate": 1.966127523501788e-05, + "loss": 0.8792, + "step": 26082 + }, + { + "epoch": 5.563779863481229, + "grad_norm": 0.10183649509305649, + "learning_rate": 1.9655688016307505e-05, + "loss": 0.9107, + "step": 26083 + }, + { + "epoch": 5.563993174061434, + "grad_norm": 0.10636615063085723, + "learning_rate": 1.9650101480394007e-05, + "loss": 0.8891, + "step": 26084 + }, + { + "epoch": 5.564206484641638, + "grad_norm": 0.10817204159836212, + "learning_rate": 1.9644515627340654e-05, + "loss": 0.8959, + "step": 26085 + }, + { + "epoch": 5.564419795221843, + "grad_norm": 0.1069174062749495, + "learning_rate": 1.9638930457210622e-05, + "loss": 0.9082, + "step": 26086 + }, + { + "epoch": 5.564633105802048, + "grad_norm": 0.09502768624470904, + "learning_rate": 1.9633345970067133e-05, + "loss": 0.8693, + "step": 26087 + }, + { + "epoch": 5.564846416382252, + "grad_norm": 0.11742784855839282, + "learning_rate": 1.9627762165973295e-05, + "loss": 0.8983, + "step": 26088 + }, + { + "epoch": 5.565059726962457, + "grad_norm": 0.1161646723633956, + "learning_rate": 1.9622179044992373e-05, + "loss": 0.8762, + "step": 26089 + }, + { + "epoch": 5.565273037542662, + "grad_norm": 0.10911243140748438, + "learning_rate": 1.961659660718751e-05, + "loss": 0.8885, + "step": 26090 + }, + { + "epoch": 5.565486348122867, + "grad_norm": 0.11677640847669919, + "learning_rate": 1.961101485262187e-05, + "loss": 0.8599, + "step": 26091 + }, + { + "epoch": 5.565699658703072, + "grad_norm": 0.11862934062531781, + "learning_rate": 1.960543378135859e-05, + "loss": 0.9009, + "step": 26092 + }, + { + "epoch": 5.565912969283277, + "grad_norm": 0.1091072009875969, + "learning_rate": 1.959985339346087e-05, + "loss": 0.9011, + "step": 26093 + }, + { + "epoch": 5.566126279863481, + "grad_norm": 0.1385575897597338, + "learning_rate": 1.959427368899182e-05, + "loss": 0.8937, + "step": 26094 + }, + { + "epoch": 5.566339590443686, + "grad_norm": 0.11283771480689263, + "learning_rate": 1.958869466801458e-05, + "loss": 0.8927, + "step": 26095 + }, + { + "epoch": 5.566552901023891, + "grad_norm": 0.12476243900283968, + "learning_rate": 1.958311633059225e-05, + "loss": 0.8989, + "step": 26096 + }, + { + "epoch": 5.5667662116040955, + "grad_norm": 0.11608938131987234, + "learning_rate": 1.9577538676788003e-05, + "loss": 0.8853, + "step": 26097 + }, + { + "epoch": 5.5669795221843, + "grad_norm": 0.10307241196488831, + "learning_rate": 1.9571961706664922e-05, + "loss": 0.8932, + "step": 26098 + }, + { + "epoch": 5.567192832764505, + "grad_norm": 0.11097536007731258, + "learning_rate": 1.9566385420286132e-05, + "loss": 0.9107, + "step": 26099 + }, + { + "epoch": 5.5674061433447095, + "grad_norm": 0.1149394885014542, + "learning_rate": 1.9560809817714662e-05, + "loss": 0.8895, + "step": 26100 + }, + { + "epoch": 5.567619453924914, + "grad_norm": 0.11212908521886532, + "learning_rate": 1.955523489901367e-05, + "loss": 0.9037, + "step": 26101 + }, + { + "epoch": 5.56783276450512, + "grad_norm": 0.12360066281158025, + "learning_rate": 1.9549660664246203e-05, + "loss": 0.9048, + "step": 26102 + }, + { + "epoch": 5.5680460750853245, + "grad_norm": 0.10249666301404146, + "learning_rate": 1.9544087113475357e-05, + "loss": 0.8444, + "step": 26103 + }, + { + "epoch": 5.568259385665529, + "grad_norm": 0.1144084591064022, + "learning_rate": 1.9538514246764155e-05, + "loss": 0.897, + "step": 26104 + }, + { + "epoch": 5.568472696245734, + "grad_norm": 0.12477016263315023, + "learning_rate": 1.95329420641757e-05, + "loss": 0.8927, + "step": 26105 + }, + { + "epoch": 5.5686860068259385, + "grad_norm": 0.10558119293617216, + "learning_rate": 1.952737056577302e-05, + "loss": 0.8823, + "step": 26106 + }, + { + "epoch": 5.568899317406143, + "grad_norm": 0.12469270888076533, + "learning_rate": 1.952179975161916e-05, + "loss": 0.8683, + "step": 26107 + }, + { + "epoch": 5.569112627986348, + "grad_norm": 0.11280930178185679, + "learning_rate": 1.9516229621777133e-05, + "loss": 0.8775, + "step": 26108 + }, + { + "epoch": 5.569325938566553, + "grad_norm": 0.10857137768934236, + "learning_rate": 1.9510660176310006e-05, + "loss": 0.8806, + "step": 26109 + }, + { + "epoch": 5.569539249146757, + "grad_norm": 0.12441059083044553, + "learning_rate": 1.9505091415280802e-05, + "loss": 0.9088, + "step": 26110 + }, + { + "epoch": 5.569752559726963, + "grad_norm": 0.1276359514596909, + "learning_rate": 1.9499523338752482e-05, + "loss": 0.8768, + "step": 26111 + }, + { + "epoch": 5.5699658703071675, + "grad_norm": 0.15031032162451646, + "learning_rate": 1.949395594678806e-05, + "loss": 0.8721, + "step": 26112 + }, + { + "epoch": 5.570179180887372, + "grad_norm": 0.1159226691314059, + "learning_rate": 1.948838923945055e-05, + "loss": 0.8946, + "step": 26113 + }, + { + "epoch": 5.570392491467577, + "grad_norm": 0.11547383022574169, + "learning_rate": 1.9482823216802947e-05, + "loss": 0.8839, + "step": 26114 + }, + { + "epoch": 5.5706058020477816, + "grad_norm": 0.13095452542338065, + "learning_rate": 1.9477257878908213e-05, + "loss": 0.9254, + "step": 26115 + }, + { + "epoch": 5.570819112627986, + "grad_norm": 0.10875535921939766, + "learning_rate": 1.94716932258293e-05, + "loss": 0.8836, + "step": 26116 + }, + { + "epoch": 5.571032423208191, + "grad_norm": 0.12165141988033291, + "learning_rate": 1.9466129257629217e-05, + "loss": 0.9148, + "step": 26117 + }, + { + "epoch": 5.571245733788396, + "grad_norm": 0.10197785992754478, + "learning_rate": 1.94605659743709e-05, + "loss": 0.8901, + "step": 26118 + }, + { + "epoch": 5.5714590443686, + "grad_norm": 0.12523281631531127, + "learning_rate": 1.94550033761173e-05, + "loss": 0.8693, + "step": 26119 + }, + { + "epoch": 5.571672354948806, + "grad_norm": 0.10368242982722652, + "learning_rate": 1.944944146293133e-05, + "loss": 0.8872, + "step": 26120 + }, + { + "epoch": 5.5718856655290105, + "grad_norm": 0.1297296068492606, + "learning_rate": 1.9443880234875996e-05, + "loss": 0.8908, + "step": 26121 + }, + { + "epoch": 5.572098976109215, + "grad_norm": 0.1070877113554156, + "learning_rate": 1.9438319692014156e-05, + "loss": 0.8762, + "step": 26122 + }, + { + "epoch": 5.57231228668942, + "grad_norm": 0.14545904068727608, + "learning_rate": 1.9432759834408744e-05, + "loss": 0.9219, + "step": 26123 + }, + { + "epoch": 5.572525597269625, + "grad_norm": 0.11408268376227086, + "learning_rate": 1.942720066212265e-05, + "loss": 0.8962, + "step": 26124 + }, + { + "epoch": 5.572738907849829, + "grad_norm": 0.13087395049081793, + "learning_rate": 1.9421642175218816e-05, + "loss": 0.8891, + "step": 26125 + }, + { + "epoch": 5.572952218430034, + "grad_norm": 0.11368934399767826, + "learning_rate": 1.9416084373760134e-05, + "loss": 0.914, + "step": 26126 + }, + { + "epoch": 5.573165529010239, + "grad_norm": 0.11519813578552196, + "learning_rate": 1.9410527257809467e-05, + "loss": 0.8811, + "step": 26127 + }, + { + "epoch": 5.573378839590443, + "grad_norm": 0.1182315904334226, + "learning_rate": 1.940497082742968e-05, + "loss": 0.8683, + "step": 26128 + }, + { + "epoch": 5.573592150170649, + "grad_norm": 0.12104754005764834, + "learning_rate": 1.939941508268369e-05, + "loss": 0.9026, + "step": 26129 + }, + { + "epoch": 5.573805460750854, + "grad_norm": 0.1322016158964489, + "learning_rate": 1.9393860023634338e-05, + "loss": 0.8926, + "step": 26130 + }, + { + "epoch": 5.574018771331058, + "grad_norm": 0.13793614140308091, + "learning_rate": 1.9388305650344483e-05, + "loss": 0.9128, + "step": 26131 + }, + { + "epoch": 5.574232081911263, + "grad_norm": 0.11254835474370373, + "learning_rate": 1.9382751962876964e-05, + "loss": 0.8781, + "step": 26132 + }, + { + "epoch": 5.574445392491468, + "grad_norm": 0.12092670859158479, + "learning_rate": 1.9377198961294628e-05, + "loss": 0.8838, + "step": 26133 + }, + { + "epoch": 5.574658703071672, + "grad_norm": 0.10548063101440654, + "learning_rate": 1.937164664566031e-05, + "loss": 0.8928, + "step": 26134 + }, + { + "epoch": 5.574872013651877, + "grad_norm": 0.1259233350721244, + "learning_rate": 1.936609501603683e-05, + "loss": 0.8873, + "step": 26135 + }, + { + "epoch": 5.575085324232082, + "grad_norm": 0.09949960145767239, + "learning_rate": 1.9360544072486975e-05, + "loss": 0.884, + "step": 26136 + }, + { + "epoch": 5.575298634812286, + "grad_norm": 0.12803167702365534, + "learning_rate": 1.935499381507361e-05, + "loss": 0.8881, + "step": 26137 + }, + { + "epoch": 5.575511945392492, + "grad_norm": 0.11648060253409351, + "learning_rate": 1.9349444243859513e-05, + "loss": 0.8636, + "step": 26138 + }, + { + "epoch": 5.575725255972696, + "grad_norm": 0.11443943972885662, + "learning_rate": 1.9343895358907466e-05, + "loss": 0.8805, + "step": 26139 + }, + { + "epoch": 5.575938566552901, + "grad_norm": 0.139927056229877, + "learning_rate": 1.9338347160280246e-05, + "loss": 0.8721, + "step": 26140 + }, + { + "epoch": 5.576151877133106, + "grad_norm": 0.11384645243879937, + "learning_rate": 1.9332799648040676e-05, + "loss": 0.8925, + "step": 26141 + }, + { + "epoch": 5.576365187713311, + "grad_norm": 0.1311038912025374, + "learning_rate": 1.9327252822251486e-05, + "loss": 0.8835, + "step": 26142 + }, + { + "epoch": 5.576578498293515, + "grad_norm": 0.12174373334062492, + "learning_rate": 1.9321706682975464e-05, + "loss": 0.8781, + "step": 26143 + }, + { + "epoch": 5.57679180887372, + "grad_norm": 0.10949947275447175, + "learning_rate": 1.931616123027535e-05, + "loss": 0.8676, + "step": 26144 + }, + { + "epoch": 5.577005119453925, + "grad_norm": 0.12672408806901733, + "learning_rate": 1.9310616464213884e-05, + "loss": 0.8905, + "step": 26145 + }, + { + "epoch": 5.577218430034129, + "grad_norm": 0.10274700707758504, + "learning_rate": 1.9305072384853823e-05, + "loss": 0.8876, + "step": 26146 + }, + { + "epoch": 5.577431740614334, + "grad_norm": 0.11842339617357221, + "learning_rate": 1.9299528992257885e-05, + "loss": 0.8939, + "step": 26147 + }, + { + "epoch": 5.577645051194539, + "grad_norm": 0.11124870013765828, + "learning_rate": 1.929398628648877e-05, + "loss": 0.8935, + "step": 26148 + }, + { + "epoch": 5.577858361774744, + "grad_norm": 0.1173310492146507, + "learning_rate": 1.928844426760924e-05, + "loss": 0.8865, + "step": 26149 + }, + { + "epoch": 5.578071672354949, + "grad_norm": 0.12399497594037256, + "learning_rate": 1.9282902935681996e-05, + "loss": 0.8925, + "step": 26150 + }, + { + "epoch": 5.578284982935154, + "grad_norm": 0.11280748158110598, + "learning_rate": 1.927736229076972e-05, + "loss": 0.9226, + "step": 26151 + }, + { + "epoch": 5.578498293515358, + "grad_norm": 0.12216321724067664, + "learning_rate": 1.9271822332935093e-05, + "loss": 0.9025, + "step": 26152 + }, + { + "epoch": 5.578711604095563, + "grad_norm": 0.11223302011045455, + "learning_rate": 1.9266283062240837e-05, + "loss": 0.8876, + "step": 26153 + }, + { + "epoch": 5.578924914675768, + "grad_norm": 0.11879458516073323, + "learning_rate": 1.9260744478749612e-05, + "loss": 0.8898, + "step": 26154 + }, + { + "epoch": 5.5791382252559725, + "grad_norm": 0.13206722566067117, + "learning_rate": 1.925520658252409e-05, + "loss": 0.8904, + "step": 26155 + }, + { + "epoch": 5.579351535836177, + "grad_norm": 0.11942334609306694, + "learning_rate": 1.9249669373626912e-05, + "loss": 0.8763, + "step": 26156 + }, + { + "epoch": 5.579564846416382, + "grad_norm": 0.12352639927244631, + "learning_rate": 1.9244132852120758e-05, + "loss": 0.8883, + "step": 26157 + }, + { + "epoch": 5.579778156996587, + "grad_norm": 0.11433791637378955, + "learning_rate": 1.9238597018068264e-05, + "loss": 0.9049, + "step": 26158 + }, + { + "epoch": 5.579991467576792, + "grad_norm": 0.10901971654127436, + "learning_rate": 1.923306187153206e-05, + "loss": 0.9208, + "step": 26159 + }, + { + "epoch": 5.580204778156997, + "grad_norm": 0.1187937034201311, + "learning_rate": 1.922752741257477e-05, + "loss": 0.8704, + "step": 26160 + }, + { + "epoch": 5.5804180887372015, + "grad_norm": 0.11039734084155127, + "learning_rate": 1.922199364125904e-05, + "loss": 0.8735, + "step": 26161 + }, + { + "epoch": 5.580631399317406, + "grad_norm": 0.11573153302025742, + "learning_rate": 1.9216460557647484e-05, + "loss": 0.8849, + "step": 26162 + }, + { + "epoch": 5.580844709897611, + "grad_norm": 0.12093179711717292, + "learning_rate": 1.9210928161802693e-05, + "loss": 0.8697, + "step": 26163 + }, + { + "epoch": 5.5810580204778155, + "grad_norm": 0.11678227476302508, + "learning_rate": 1.9205396453787256e-05, + "loss": 0.894, + "step": 26164 + }, + { + "epoch": 5.58127133105802, + "grad_norm": 0.1136219206113573, + "learning_rate": 1.9199865433663793e-05, + "loss": 0.8899, + "step": 26165 + }, + { + "epoch": 5.581484641638225, + "grad_norm": 0.1248176679810631, + "learning_rate": 1.9194335101494906e-05, + "loss": 0.9119, + "step": 26166 + }, + { + "epoch": 5.5816979522184305, + "grad_norm": 0.11028874367632531, + "learning_rate": 1.9188805457343092e-05, + "loss": 0.8775, + "step": 26167 + }, + { + "epoch": 5.581911262798635, + "grad_norm": 0.10849415255793528, + "learning_rate": 1.9183276501270995e-05, + "loss": 0.8858, + "step": 26168 + }, + { + "epoch": 5.58212457337884, + "grad_norm": 0.11012748852301793, + "learning_rate": 1.9177748233341144e-05, + "loss": 0.8785, + "step": 26169 + }, + { + "epoch": 5.5823378839590445, + "grad_norm": 0.11973209996479674, + "learning_rate": 1.9172220653616095e-05, + "loss": 0.8796, + "step": 26170 + }, + { + "epoch": 5.582551194539249, + "grad_norm": 0.1183799526207885, + "learning_rate": 1.9166693762158387e-05, + "loss": 0.8703, + "step": 26171 + }, + { + "epoch": 5.582764505119454, + "grad_norm": 0.11197948430086617, + "learning_rate": 1.9161167559030546e-05, + "loss": 0.9204, + "step": 26172 + }, + { + "epoch": 5.582977815699659, + "grad_norm": 0.11964290610553503, + "learning_rate": 1.9155642044295132e-05, + "loss": 0.8812, + "step": 26173 + }, + { + "epoch": 5.583191126279863, + "grad_norm": 0.1239665806723024, + "learning_rate": 1.9150117218014664e-05, + "loss": 0.8892, + "step": 26174 + }, + { + "epoch": 5.583404436860068, + "grad_norm": 0.13375846371346117, + "learning_rate": 1.9144593080251636e-05, + "loss": 0.8541, + "step": 26175 + }, + { + "epoch": 5.5836177474402735, + "grad_norm": 0.13108894453011063, + "learning_rate": 1.9139069631068538e-05, + "loss": 0.8999, + "step": 26176 + }, + { + "epoch": 5.583831058020478, + "grad_norm": 0.11249013802050375, + "learning_rate": 1.9133546870527935e-05, + "loss": 0.9168, + "step": 26177 + }, + { + "epoch": 5.584044368600683, + "grad_norm": 0.15061631846329102, + "learning_rate": 1.9128024798692255e-05, + "loss": 0.8606, + "step": 26178 + }, + { + "epoch": 5.584257679180888, + "grad_norm": 0.10549237639570908, + "learning_rate": 1.9122503415623973e-05, + "loss": 0.8824, + "step": 26179 + }, + { + "epoch": 5.584470989761092, + "grad_norm": 0.14234429467652326, + "learning_rate": 1.9116982721385612e-05, + "loss": 0.8908, + "step": 26180 + }, + { + "epoch": 5.584684300341297, + "grad_norm": 0.12740648820995976, + "learning_rate": 1.911146271603961e-05, + "loss": 0.9227, + "step": 26181 + }, + { + "epoch": 5.584897610921502, + "grad_norm": 0.134247610320711, + "learning_rate": 1.9105943399648444e-05, + "loss": 0.8935, + "step": 26182 + }, + { + "epoch": 5.585110921501706, + "grad_norm": 0.1306544223548918, + "learning_rate": 1.9100424772274548e-05, + "loss": 0.908, + "step": 26183 + }, + { + "epoch": 5.585324232081911, + "grad_norm": 0.11718806904938066, + "learning_rate": 1.909490683398034e-05, + "loss": 0.8838, + "step": 26184 + }, + { + "epoch": 5.585537542662116, + "grad_norm": 0.12376193580969846, + "learning_rate": 1.908938958482832e-05, + "loss": 0.8702, + "step": 26185 + }, + { + "epoch": 5.58575085324232, + "grad_norm": 0.11828022042549365, + "learning_rate": 1.9083873024880876e-05, + "loss": 0.895, + "step": 26186 + }, + { + "epoch": 5.585964163822526, + "grad_norm": 0.11916265364872601, + "learning_rate": 1.907835715420043e-05, + "loss": 0.8872, + "step": 26187 + }, + { + "epoch": 5.586177474402731, + "grad_norm": 0.11809552733186703, + "learning_rate": 1.9072841972849395e-05, + "loss": 0.8713, + "step": 26188 + }, + { + "epoch": 5.586390784982935, + "grad_norm": 0.13087383901496086, + "learning_rate": 1.9067327480890183e-05, + "loss": 0.8819, + "step": 26189 + }, + { + "epoch": 5.58660409556314, + "grad_norm": 0.11312530581430445, + "learning_rate": 1.906181367838518e-05, + "loss": 0.8852, + "step": 26190 + }, + { + "epoch": 5.586817406143345, + "grad_norm": 0.11814227026751299, + "learning_rate": 1.9056300565396765e-05, + "loss": 0.8645, + "step": 26191 + }, + { + "epoch": 5.587030716723549, + "grad_norm": 0.12362557846264756, + "learning_rate": 1.9050788141987358e-05, + "loss": 0.9168, + "step": 26192 + }, + { + "epoch": 5.587244027303754, + "grad_norm": 0.09482944614168795, + "learning_rate": 1.9045276408219306e-05, + "loss": 0.8896, + "step": 26193 + }, + { + "epoch": 5.587457337883959, + "grad_norm": 0.12153751143064997, + "learning_rate": 1.9039765364154973e-05, + "loss": 0.8917, + "step": 26194 + }, + { + "epoch": 5.587670648464163, + "grad_norm": 0.10470351941965163, + "learning_rate": 1.903425500985672e-05, + "loss": 0.8663, + "step": 26195 + }, + { + "epoch": 5.587883959044369, + "grad_norm": 0.12249631381119412, + "learning_rate": 1.902874534538688e-05, + "loss": 0.8897, + "step": 26196 + }, + { + "epoch": 5.588097269624574, + "grad_norm": 0.11264799179399608, + "learning_rate": 1.902323637080783e-05, + "loss": 0.9272, + "step": 26197 + }, + { + "epoch": 5.588310580204778, + "grad_norm": 0.11706166562250203, + "learning_rate": 1.9017728086181888e-05, + "loss": 0.9018, + "step": 26198 + }, + { + "epoch": 5.588523890784983, + "grad_norm": 0.12710938800573715, + "learning_rate": 1.901222049157138e-05, + "loss": 0.8718, + "step": 26199 + }, + { + "epoch": 5.588737201365188, + "grad_norm": 0.11309438506035457, + "learning_rate": 1.9006713587038625e-05, + "loss": 0.8841, + "step": 26200 + }, + { + "epoch": 5.588950511945392, + "grad_norm": 0.1159246610193361, + "learning_rate": 1.9001207372645927e-05, + "loss": 0.8891, + "step": 26201 + }, + { + "epoch": 5.589163822525597, + "grad_norm": 0.10485757860281354, + "learning_rate": 1.899570184845559e-05, + "loss": 0.8704, + "step": 26202 + }, + { + "epoch": 5.589377133105802, + "grad_norm": 0.10179293567570841, + "learning_rate": 1.899019701452992e-05, + "loss": 0.8874, + "step": 26203 + }, + { + "epoch": 5.589590443686006, + "grad_norm": 0.11009701957981223, + "learning_rate": 1.8984692870931168e-05, + "loss": 0.8785, + "step": 26204 + }, + { + "epoch": 5.589803754266212, + "grad_norm": 0.11401233731723831, + "learning_rate": 1.897918941772167e-05, + "loss": 0.9008, + "step": 26205 + }, + { + "epoch": 5.590017064846417, + "grad_norm": 0.112326291427361, + "learning_rate": 1.897368665496367e-05, + "loss": 0.9142, + "step": 26206 + }, + { + "epoch": 5.590230375426621, + "grad_norm": 0.11640835881103738, + "learning_rate": 1.896818458271943e-05, + "loss": 0.9131, + "step": 26207 + }, + { + "epoch": 5.590443686006826, + "grad_norm": 0.12052675365515036, + "learning_rate": 1.8962683201051185e-05, + "loss": 0.9012, + "step": 26208 + }, + { + "epoch": 5.590656996587031, + "grad_norm": 0.10967838692380022, + "learning_rate": 1.8957182510021237e-05, + "loss": 0.8552, + "step": 26209 + }, + { + "epoch": 5.590870307167235, + "grad_norm": 0.10465690676399797, + "learning_rate": 1.895168250969178e-05, + "loss": 0.8712, + "step": 26210 + }, + { + "epoch": 5.59108361774744, + "grad_norm": 0.11266984236198802, + "learning_rate": 1.8946183200125078e-05, + "loss": 0.8837, + "step": 26211 + }, + { + "epoch": 5.591296928327645, + "grad_norm": 0.11328243901720955, + "learning_rate": 1.8940684581383326e-05, + "loss": 0.9017, + "step": 26212 + }, + { + "epoch": 5.5915102389078495, + "grad_norm": 0.10760396682828782, + "learning_rate": 1.8935186653528765e-05, + "loss": 0.8607, + "step": 26213 + }, + { + "epoch": 5.591723549488055, + "grad_norm": 0.10879177385294506, + "learning_rate": 1.8929689416623597e-05, + "loss": 0.8962, + "step": 26214 + }, + { + "epoch": 5.59193686006826, + "grad_norm": 0.11660271107633731, + "learning_rate": 1.8924192870730014e-05, + "loss": 0.8793, + "step": 26215 + }, + { + "epoch": 5.592150170648464, + "grad_norm": 0.11154197469263508, + "learning_rate": 1.891869701591019e-05, + "loss": 0.8786, + "step": 26216 + }, + { + "epoch": 5.592363481228669, + "grad_norm": 0.20961295631308138, + "learning_rate": 1.891320185222636e-05, + "loss": 0.9022, + "step": 26217 + }, + { + "epoch": 5.592576791808874, + "grad_norm": 0.11220450371342434, + "learning_rate": 1.8907707379740684e-05, + "loss": 0.909, + "step": 26218 + }, + { + "epoch": 5.5927901023890785, + "grad_norm": 0.10394758165695604, + "learning_rate": 1.8902213598515323e-05, + "loss": 0.8725, + "step": 26219 + }, + { + "epoch": 5.593003412969283, + "grad_norm": 0.11177623989564953, + "learning_rate": 1.8896720508612425e-05, + "loss": 0.9158, + "step": 26220 + }, + { + "epoch": 5.593216723549488, + "grad_norm": 0.11485742024244204, + "learning_rate": 1.8891228110094184e-05, + "loss": 0.8838, + "step": 26221 + }, + { + "epoch": 5.5934300341296925, + "grad_norm": 0.10784200090618772, + "learning_rate": 1.8885736403022745e-05, + "loss": 0.8955, + "step": 26222 + }, + { + "epoch": 5.593643344709898, + "grad_norm": 0.10568905538149236, + "learning_rate": 1.8880245387460174e-05, + "loss": 0.9063, + "step": 26223 + }, + { + "epoch": 5.593856655290102, + "grad_norm": 0.10261947023382599, + "learning_rate": 1.8874755063468688e-05, + "loss": 0.883, + "step": 26224 + }, + { + "epoch": 5.5940699658703075, + "grad_norm": 0.1063914208428227, + "learning_rate": 1.886926543111037e-05, + "loss": 0.8957, + "step": 26225 + }, + { + "epoch": 5.594283276450512, + "grad_norm": 0.11067789664308346, + "learning_rate": 1.886377649044735e-05, + "loss": 0.8897, + "step": 26226 + }, + { + "epoch": 5.594496587030717, + "grad_norm": 0.11891380663226918, + "learning_rate": 1.8858288241541717e-05, + "loss": 0.8675, + "step": 26227 + }, + { + "epoch": 5.5947098976109215, + "grad_norm": 0.12490575413677119, + "learning_rate": 1.8852800684455568e-05, + "loss": 0.8783, + "step": 26228 + }, + { + "epoch": 5.594923208191126, + "grad_norm": 0.12180138201516393, + "learning_rate": 1.8847313819251026e-05, + "loss": 0.9024, + "step": 26229 + }, + { + "epoch": 5.595136518771331, + "grad_norm": 0.12715443900684562, + "learning_rate": 1.8841827645990155e-05, + "loss": 0.9129, + "step": 26230 + }, + { + "epoch": 5.595349829351536, + "grad_norm": 0.11209564270428446, + "learning_rate": 1.883634216473503e-05, + "loss": 0.9245, + "step": 26231 + }, + { + "epoch": 5.59556313993174, + "grad_norm": 0.1262758309796189, + "learning_rate": 1.8830857375547706e-05, + "loss": 0.8978, + "step": 26232 + }, + { + "epoch": 5.595776450511945, + "grad_norm": 0.11714378587811289, + "learning_rate": 1.882537327849031e-05, + "loss": 0.9032, + "step": 26233 + }, + { + "epoch": 5.5959897610921505, + "grad_norm": 0.12351800701736246, + "learning_rate": 1.8819889873624814e-05, + "loss": 0.8892, + "step": 26234 + }, + { + "epoch": 5.596203071672355, + "grad_norm": 0.10794817496936286, + "learning_rate": 1.8814407161013265e-05, + "loss": 0.8862, + "step": 26235 + }, + { + "epoch": 5.59641638225256, + "grad_norm": 0.11975777413410939, + "learning_rate": 1.8808925140717757e-05, + "loss": 0.8869, + "step": 26236 + }, + { + "epoch": 5.596629692832765, + "grad_norm": 0.11624431045291729, + "learning_rate": 1.880344381280029e-05, + "loss": 0.8905, + "step": 26237 + }, + { + "epoch": 5.596843003412969, + "grad_norm": 0.11181951529111939, + "learning_rate": 1.8797963177322892e-05, + "loss": 0.8715, + "step": 26238 + }, + { + "epoch": 5.597056313993174, + "grad_norm": 0.1291281026085339, + "learning_rate": 1.879248323434756e-05, + "loss": 0.8895, + "step": 26239 + }, + { + "epoch": 5.597269624573379, + "grad_norm": 0.11831842072188442, + "learning_rate": 1.8787003983936297e-05, + "loss": 0.886, + "step": 26240 + }, + { + "epoch": 5.597482935153583, + "grad_norm": 0.12132097528270087, + "learning_rate": 1.878152542615112e-05, + "loss": 0.8675, + "step": 26241 + }, + { + "epoch": 5.597696245733788, + "grad_norm": 0.11783395510266413, + "learning_rate": 1.877604756105403e-05, + "loss": 0.8926, + "step": 26242 + }, + { + "epoch": 5.597909556313994, + "grad_norm": 0.11697951932834817, + "learning_rate": 1.8770570388706985e-05, + "loss": 0.912, + "step": 26243 + }, + { + "epoch": 5.598122866894198, + "grad_norm": 0.11845845074731857, + "learning_rate": 1.876509390917196e-05, + "loss": 0.8808, + "step": 26244 + }, + { + "epoch": 5.598336177474403, + "grad_norm": 0.11124892441071997, + "learning_rate": 1.8759618122510934e-05, + "loss": 0.9049, + "step": 26245 + }, + { + "epoch": 5.598549488054608, + "grad_norm": 0.11490265103680056, + "learning_rate": 1.8754143028785857e-05, + "loss": 0.876, + "step": 26246 + }, + { + "epoch": 5.598762798634812, + "grad_norm": 0.10850196648486297, + "learning_rate": 1.874866862805866e-05, + "loss": 0.8749, + "step": 26247 + }, + { + "epoch": 5.598976109215017, + "grad_norm": 0.11596686413236594, + "learning_rate": 1.874319492039133e-05, + "loss": 0.8908, + "step": 26248 + }, + { + "epoch": 5.599189419795222, + "grad_norm": 0.12371218644001716, + "learning_rate": 1.873772190584578e-05, + "loss": 0.9052, + "step": 26249 + }, + { + "epoch": 5.599402730375426, + "grad_norm": 0.1113711044280047, + "learning_rate": 1.8732249584483942e-05, + "loss": 0.8807, + "step": 26250 + }, + { + "epoch": 5.599616040955631, + "grad_norm": 0.12783727385210145, + "learning_rate": 1.872677795636772e-05, + "loss": 0.9022, + "step": 26251 + }, + { + "epoch": 5.599829351535837, + "grad_norm": 0.1158518416421426, + "learning_rate": 1.8721307021559027e-05, + "loss": 0.8524, + "step": 26252 + }, + { + "epoch": 5.600042662116041, + "grad_norm": 0.11508597313783814, + "learning_rate": 1.8715836780119784e-05, + "loss": 0.8832, + "step": 26253 + }, + { + "epoch": 5.600255972696246, + "grad_norm": 0.11025273895524801, + "learning_rate": 1.871036723211189e-05, + "loss": 0.9006, + "step": 26254 + }, + { + "epoch": 5.600469283276451, + "grad_norm": 0.10246897307472735, + "learning_rate": 1.8704898377597214e-05, + "loss": 0.8812, + "step": 26255 + }, + { + "epoch": 5.600682593856655, + "grad_norm": 0.10606567045186532, + "learning_rate": 1.8699430216637643e-05, + "loss": 0.8748, + "step": 26256 + }, + { + "epoch": 5.60089590443686, + "grad_norm": 0.10566552564434839, + "learning_rate": 1.869396274929506e-05, + "loss": 0.8844, + "step": 26257 + }, + { + "epoch": 5.601109215017065, + "grad_norm": 0.1015018166036555, + "learning_rate": 1.8688495975631314e-05, + "loss": 0.8764, + "step": 26258 + }, + { + "epoch": 5.601322525597269, + "grad_norm": 0.09945066709252927, + "learning_rate": 1.8683029895708244e-05, + "loss": 0.8856, + "step": 26259 + }, + { + "epoch": 5.601535836177474, + "grad_norm": 0.10207813734043084, + "learning_rate": 1.8677564509587744e-05, + "loss": 0.8904, + "step": 26260 + }, + { + "epoch": 5.60174914675768, + "grad_norm": 0.10664343884195124, + "learning_rate": 1.8672099817331633e-05, + "loss": 0.9077, + "step": 26261 + }, + { + "epoch": 5.601962457337884, + "grad_norm": 0.1209817518121199, + "learning_rate": 1.8666635819001745e-05, + "loss": 0.9116, + "step": 26262 + }, + { + "epoch": 5.602175767918089, + "grad_norm": 0.10692944762781087, + "learning_rate": 1.8661172514659906e-05, + "loss": 0.8947, + "step": 26263 + }, + { + "epoch": 5.602389078498294, + "grad_norm": 0.1158244730452086, + "learning_rate": 1.8655709904367906e-05, + "loss": 0.9092, + "step": 26264 + }, + { + "epoch": 5.602602389078498, + "grad_norm": 0.11700296799012486, + "learning_rate": 1.8650247988187605e-05, + "loss": 0.8832, + "step": 26265 + }, + { + "epoch": 5.602815699658703, + "grad_norm": 0.12277640745940789, + "learning_rate": 1.86447867661808e-05, + "loss": 0.9164, + "step": 26266 + }, + { + "epoch": 5.603029010238908, + "grad_norm": 0.12328680973402219, + "learning_rate": 1.8639326238409225e-05, + "loss": 0.8686, + "step": 26267 + }, + { + "epoch": 5.603242320819112, + "grad_norm": 0.11787696720464073, + "learning_rate": 1.8633866404934727e-05, + "loss": 0.8906, + "step": 26268 + }, + { + "epoch": 5.603455631399317, + "grad_norm": 0.12972073722636587, + "learning_rate": 1.8628407265819063e-05, + "loss": 0.8651, + "step": 26269 + }, + { + "epoch": 5.603668941979522, + "grad_norm": 0.11185386881095477, + "learning_rate": 1.8622948821123997e-05, + "loss": 0.8662, + "step": 26270 + }, + { + "epoch": 5.6038822525597265, + "grad_norm": 0.1443842881889124, + "learning_rate": 1.861749107091128e-05, + "loss": 0.882, + "step": 26271 + }, + { + "epoch": 5.604095563139932, + "grad_norm": 0.11219514051235933, + "learning_rate": 1.8612034015242707e-05, + "loss": 0.8984, + "step": 26272 + }, + { + "epoch": 5.604308873720137, + "grad_norm": 0.16273503410539936, + "learning_rate": 1.860657765418e-05, + "loss": 0.873, + "step": 26273 + }, + { + "epoch": 5.604522184300341, + "grad_norm": 0.11498563769405232, + "learning_rate": 1.8601121987784898e-05, + "loss": 0.8751, + "step": 26274 + }, + { + "epoch": 5.604735494880546, + "grad_norm": 0.14014293649236736, + "learning_rate": 1.8595667016119137e-05, + "loss": 0.9058, + "step": 26275 + }, + { + "epoch": 5.604948805460751, + "grad_norm": 0.13050786029785927, + "learning_rate": 1.859021273924441e-05, + "loss": 0.8789, + "step": 26276 + }, + { + "epoch": 5.6051621160409555, + "grad_norm": 0.12790429778558404, + "learning_rate": 1.8584759157222478e-05, + "loss": 0.8754, + "step": 26277 + }, + { + "epoch": 5.60537542662116, + "grad_norm": 0.11617149593637896, + "learning_rate": 1.8579306270115066e-05, + "loss": 0.9089, + "step": 26278 + }, + { + "epoch": 5.605588737201365, + "grad_norm": 0.10661057769937655, + "learning_rate": 1.857385407798378e-05, + "loss": 0.902, + "step": 26279 + }, + { + "epoch": 5.6058020477815695, + "grad_norm": 0.10986837190655772, + "learning_rate": 1.8568402580890393e-05, + "loss": 0.8799, + "step": 26280 + }, + { + "epoch": 5.606015358361775, + "grad_norm": 0.10437897209861256, + "learning_rate": 1.8562951778896564e-05, + "loss": 0.9021, + "step": 26281 + }, + { + "epoch": 5.60622866894198, + "grad_norm": 0.11827253968076121, + "learning_rate": 1.855750167206397e-05, + "loss": 0.8659, + "step": 26282 + }, + { + "epoch": 5.6064419795221845, + "grad_norm": 0.11450147797020506, + "learning_rate": 1.855205226045426e-05, + "loss": 0.9005, + "step": 26283 + }, + { + "epoch": 5.606655290102389, + "grad_norm": 0.11541281104236589, + "learning_rate": 1.8546603544129125e-05, + "loss": 0.8884, + "step": 26284 + }, + { + "epoch": 5.606868600682594, + "grad_norm": 0.12701335151556656, + "learning_rate": 1.8541155523150206e-05, + "loss": 0.8629, + "step": 26285 + }, + { + "epoch": 5.6070819112627985, + "grad_norm": 0.10567231562291994, + "learning_rate": 1.8535708197579154e-05, + "loss": 0.8917, + "step": 26286 + }, + { + "epoch": 5.607295221843003, + "grad_norm": 0.10793607933999061, + "learning_rate": 1.8530261567477588e-05, + "loss": 0.9021, + "step": 26287 + }, + { + "epoch": 5.607508532423208, + "grad_norm": 0.12279986353568392, + "learning_rate": 1.8524815632907128e-05, + "loss": 0.9204, + "step": 26288 + }, + { + "epoch": 5.607721843003413, + "grad_norm": 0.12948979839617042, + "learning_rate": 1.8519370393929463e-05, + "loss": 0.9137, + "step": 26289 + }, + { + "epoch": 5.607935153583618, + "grad_norm": 0.11825453934693878, + "learning_rate": 1.8513925850606133e-05, + "loss": 0.8994, + "step": 26290 + }, + { + "epoch": 5.608148464163823, + "grad_norm": 0.1220609340445689, + "learning_rate": 1.850848200299874e-05, + "loss": 0.8894, + "step": 26291 + }, + { + "epoch": 5.6083617747440275, + "grad_norm": 0.11214829421422717, + "learning_rate": 1.850303885116893e-05, + "loss": 0.8843, + "step": 26292 + }, + { + "epoch": 5.608575085324232, + "grad_norm": 0.10591926591630751, + "learning_rate": 1.849759639517827e-05, + "loss": 0.8573, + "step": 26293 + }, + { + "epoch": 5.608788395904437, + "grad_norm": 0.11675324156412631, + "learning_rate": 1.8492154635088343e-05, + "loss": 0.8639, + "step": 26294 + }, + { + "epoch": 5.609001706484642, + "grad_norm": 0.10544621067294437, + "learning_rate": 1.8486713570960697e-05, + "loss": 0.8744, + "step": 26295 + }, + { + "epoch": 5.609215017064846, + "grad_norm": 0.1114259271068407, + "learning_rate": 1.848127320285694e-05, + "loss": 0.8995, + "step": 26296 + }, + { + "epoch": 5.609428327645051, + "grad_norm": 0.11476056373446364, + "learning_rate": 1.847583353083861e-05, + "loss": 0.8831, + "step": 26297 + }, + { + "epoch": 5.609641638225256, + "grad_norm": 0.1100568066393092, + "learning_rate": 1.847039455496727e-05, + "loss": 0.9078, + "step": 26298 + }, + { + "epoch": 5.609854948805461, + "grad_norm": 0.10847750038953645, + "learning_rate": 1.8464956275304447e-05, + "loss": 0.8768, + "step": 26299 + }, + { + "epoch": 5.610068259385666, + "grad_norm": 0.10744118246778024, + "learning_rate": 1.8459518691911674e-05, + "loss": 0.8689, + "step": 26300 + }, + { + "epoch": 5.610281569965871, + "grad_norm": 0.10426530585864702, + "learning_rate": 1.8454081804850484e-05, + "loss": 0.8925, + "step": 26301 + }, + { + "epoch": 5.610494880546075, + "grad_norm": 0.12352557414383528, + "learning_rate": 1.8448645614182395e-05, + "loss": 0.8719, + "step": 26302 + }, + { + "epoch": 5.61070819112628, + "grad_norm": 0.11298508409458798, + "learning_rate": 1.844321011996889e-05, + "loss": 0.9027, + "step": 26303 + }, + { + "epoch": 5.610921501706485, + "grad_norm": 0.11303897088302362, + "learning_rate": 1.8437775322271527e-05, + "loss": 0.8684, + "step": 26304 + }, + { + "epoch": 5.611134812286689, + "grad_norm": 0.11115555771121348, + "learning_rate": 1.8432341221151766e-05, + "loss": 0.8811, + "step": 26305 + }, + { + "epoch": 5.611348122866894, + "grad_norm": 0.11952571792938445, + "learning_rate": 1.84269078166711e-05, + "loss": 0.9127, + "step": 26306 + }, + { + "epoch": 5.611561433447099, + "grad_norm": 0.11059382696270108, + "learning_rate": 1.842147510889099e-05, + "loss": 0.8748, + "step": 26307 + }, + { + "epoch": 5.611774744027304, + "grad_norm": 0.11226248867751165, + "learning_rate": 1.841604309787294e-05, + "loss": 0.8755, + "step": 26308 + }, + { + "epoch": 5.611988054607508, + "grad_norm": 0.12150335964568452, + "learning_rate": 1.8410611783678406e-05, + "loss": 0.8751, + "step": 26309 + }, + { + "epoch": 5.612201365187714, + "grad_norm": 0.10963278686451663, + "learning_rate": 1.8405181166368827e-05, + "loss": 0.8542, + "step": 26310 + }, + { + "epoch": 5.612414675767918, + "grad_norm": 0.12331330568071361, + "learning_rate": 1.8399751246005665e-05, + "loss": 0.8972, + "step": 26311 + }, + { + "epoch": 5.612627986348123, + "grad_norm": 0.1078716499391453, + "learning_rate": 1.8394322022650353e-05, + "loss": 0.8708, + "step": 26312 + }, + { + "epoch": 5.612841296928328, + "grad_norm": 0.11032353287190337, + "learning_rate": 1.838889349636432e-05, + "loss": 0.8578, + "step": 26313 + }, + { + "epoch": 5.613054607508532, + "grad_norm": 0.10919132285934992, + "learning_rate": 1.8383465667209003e-05, + "loss": 0.9111, + "step": 26314 + }, + { + "epoch": 5.613267918088737, + "grad_norm": 0.10671150297644215, + "learning_rate": 1.8378038535245774e-05, + "loss": 0.8537, + "step": 26315 + }, + { + "epoch": 5.613481228668942, + "grad_norm": 0.11630213174459599, + "learning_rate": 1.837261210053611e-05, + "loss": 0.8961, + "step": 26316 + }, + { + "epoch": 5.613694539249146, + "grad_norm": 0.1008706157629231, + "learning_rate": 1.836718636314137e-05, + "loss": 0.8992, + "step": 26317 + }, + { + "epoch": 5.613907849829351, + "grad_norm": 0.11478206821649234, + "learning_rate": 1.8361761323122945e-05, + "loss": 0.8993, + "step": 26318 + }, + { + "epoch": 5.614121160409557, + "grad_norm": 0.11112739266456893, + "learning_rate": 1.8356336980542224e-05, + "loss": 0.9194, + "step": 26319 + }, + { + "epoch": 5.614334470989761, + "grad_norm": 0.11559003560441143, + "learning_rate": 1.8350913335460595e-05, + "loss": 0.8928, + "step": 26320 + }, + { + "epoch": 5.614547781569966, + "grad_norm": 0.14280907962386874, + "learning_rate": 1.8345490387939425e-05, + "loss": 0.8754, + "step": 26321 + }, + { + "epoch": 5.614761092150171, + "grad_norm": 0.10934265088332164, + "learning_rate": 1.834006813804008e-05, + "loss": 0.9035, + "step": 26322 + }, + { + "epoch": 5.614974402730375, + "grad_norm": 0.13981965296497068, + "learning_rate": 1.8334646585823865e-05, + "loss": 0.9027, + "step": 26323 + }, + { + "epoch": 5.61518771331058, + "grad_norm": 0.1101873462367189, + "learning_rate": 1.8329225731352186e-05, + "loss": 0.8856, + "step": 26324 + }, + { + "epoch": 5.615401023890785, + "grad_norm": 0.133055904753486, + "learning_rate": 1.8323805574686355e-05, + "loss": 0.8895, + "step": 26325 + }, + { + "epoch": 5.6156143344709895, + "grad_norm": 0.11896618507183537, + "learning_rate": 1.8318386115887693e-05, + "loss": 0.888, + "step": 26326 + }, + { + "epoch": 5.615827645051194, + "grad_norm": 0.12500344908698904, + "learning_rate": 1.831296735501751e-05, + "loss": 0.9078, + "step": 26327 + }, + { + "epoch": 5.6160409556314, + "grad_norm": 0.12487356839722097, + "learning_rate": 1.8307549292137165e-05, + "loss": 0.9028, + "step": 26328 + }, + { + "epoch": 5.616254266211604, + "grad_norm": 0.11631022423349363, + "learning_rate": 1.8302131927307937e-05, + "loss": 0.8786, + "step": 26329 + }, + { + "epoch": 5.616467576791809, + "grad_norm": 0.12234206592956762, + "learning_rate": 1.829671526059112e-05, + "loss": 0.9014, + "step": 26330 + }, + { + "epoch": 5.616680887372014, + "grad_norm": 0.11249487731858743, + "learning_rate": 1.8291299292047995e-05, + "loss": 0.8775, + "step": 26331 + }, + { + "epoch": 5.6168941979522184, + "grad_norm": 0.10870924595739902, + "learning_rate": 1.8285884021739873e-05, + "loss": 0.8905, + "step": 26332 + }, + { + "epoch": 5.617107508532423, + "grad_norm": 0.1420737979544312, + "learning_rate": 1.828046944972801e-05, + "loss": 0.881, + "step": 26333 + }, + { + "epoch": 5.617320819112628, + "grad_norm": 0.11016321135575582, + "learning_rate": 1.8275055576073698e-05, + "loss": 0.9082, + "step": 26334 + }, + { + "epoch": 5.6175341296928325, + "grad_norm": 0.10817149226954627, + "learning_rate": 1.826964240083812e-05, + "loss": 0.8769, + "step": 26335 + }, + { + "epoch": 5.617747440273037, + "grad_norm": 0.13576892201929391, + "learning_rate": 1.8264229924082604e-05, + "loss": 0.887, + "step": 26336 + }, + { + "epoch": 5.617960750853243, + "grad_norm": 0.11244187221742159, + "learning_rate": 1.8258818145868362e-05, + "loss": 0.8892, + "step": 26337 + }, + { + "epoch": 5.618174061433447, + "grad_norm": 0.13503969708458716, + "learning_rate": 1.8253407066256638e-05, + "loss": 0.8633, + "step": 26338 + }, + { + "epoch": 5.618387372013652, + "grad_norm": 0.10989902166964544, + "learning_rate": 1.8247996685308626e-05, + "loss": 0.8944, + "step": 26339 + }, + { + "epoch": 5.618600682593857, + "grad_norm": 0.1427262904124253, + "learning_rate": 1.8242587003085583e-05, + "loss": 0.8773, + "step": 26340 + }, + { + "epoch": 5.6188139931740615, + "grad_norm": 0.11381754108061498, + "learning_rate": 1.8237178019648717e-05, + "loss": 0.8986, + "step": 26341 + }, + { + "epoch": 5.619027303754266, + "grad_norm": 0.13934309955204113, + "learning_rate": 1.8231769735059223e-05, + "loss": 0.9043, + "step": 26342 + }, + { + "epoch": 5.619240614334471, + "grad_norm": 0.11975802823159207, + "learning_rate": 1.822636214937826e-05, + "loss": 0.8968, + "step": 26343 + }, + { + "epoch": 5.6194539249146755, + "grad_norm": 0.1312457661261168, + "learning_rate": 1.8220955262667076e-05, + "loss": 0.9033, + "step": 26344 + }, + { + "epoch": 5.61966723549488, + "grad_norm": 0.12013555931213198, + "learning_rate": 1.821554907498685e-05, + "loss": 0.9047, + "step": 26345 + }, + { + "epoch": 5.619880546075086, + "grad_norm": 0.125003119183239, + "learning_rate": 1.8210143586398698e-05, + "loss": 0.9244, + "step": 26346 + }, + { + "epoch": 5.6200938566552905, + "grad_norm": 0.10992346112857973, + "learning_rate": 1.820473879696379e-05, + "loss": 0.8987, + "step": 26347 + }, + { + "epoch": 5.620307167235495, + "grad_norm": 0.11567207378707549, + "learning_rate": 1.8199334706743322e-05, + "loss": 0.8554, + "step": 26348 + }, + { + "epoch": 5.6205204778157, + "grad_norm": 0.12477227374564231, + "learning_rate": 1.819393131579842e-05, + "loss": 0.8876, + "step": 26349 + }, + { + "epoch": 5.6207337883959045, + "grad_norm": 0.13063179565783317, + "learning_rate": 1.818852862419023e-05, + "loss": 0.8872, + "step": 26350 + }, + { + "epoch": 5.620947098976109, + "grad_norm": 0.14053887420695033, + "learning_rate": 1.8183126631979854e-05, + "loss": 0.9104, + "step": 26351 + }, + { + "epoch": 5.621160409556314, + "grad_norm": 0.1405280773839949, + "learning_rate": 1.817772533922846e-05, + "loss": 0.8946, + "step": 26352 + }, + { + "epoch": 5.621373720136519, + "grad_norm": 0.129122454995126, + "learning_rate": 1.817232474599715e-05, + "loss": 0.8865, + "step": 26353 + }, + { + "epoch": 5.621587030716723, + "grad_norm": 0.13218818337986185, + "learning_rate": 1.816692485234702e-05, + "loss": 0.8885, + "step": 26354 + }, + { + "epoch": 5.621800341296928, + "grad_norm": 0.1283121738342534, + "learning_rate": 1.8161525658339157e-05, + "loss": 0.9184, + "step": 26355 + }, + { + "epoch": 5.622013651877133, + "grad_norm": 0.11261399835150003, + "learning_rate": 1.8156127164034708e-05, + "loss": 0.8908, + "step": 26356 + }, + { + "epoch": 5.622226962457338, + "grad_norm": 0.13824576666953248, + "learning_rate": 1.81507293694947e-05, + "loss": 0.8756, + "step": 26357 + }, + { + "epoch": 5.622440273037543, + "grad_norm": 0.113368481223803, + "learning_rate": 1.814533227478023e-05, + "loss": 0.8607, + "step": 26358 + }, + { + "epoch": 5.622653583617748, + "grad_norm": 0.11071738433208853, + "learning_rate": 1.813993587995235e-05, + "loss": 0.8873, + "step": 26359 + }, + { + "epoch": 5.622866894197952, + "grad_norm": 0.13018456465330885, + "learning_rate": 1.8134540185072156e-05, + "loss": 0.8466, + "step": 26360 + }, + { + "epoch": 5.623080204778157, + "grad_norm": 0.10447120371344155, + "learning_rate": 1.812914519020068e-05, + "loss": 0.8963, + "step": 26361 + }, + { + "epoch": 5.623293515358362, + "grad_norm": 0.11306644775513874, + "learning_rate": 1.812375089539896e-05, + "loss": 0.89, + "step": 26362 + }, + { + "epoch": 5.623506825938566, + "grad_norm": 0.10770892152431168, + "learning_rate": 1.8118357300728028e-05, + "loss": 0.8694, + "step": 26363 + }, + { + "epoch": 5.623720136518771, + "grad_norm": 0.10782158645041, + "learning_rate": 1.8112964406248937e-05, + "loss": 0.8839, + "step": 26364 + }, + { + "epoch": 5.623933447098976, + "grad_norm": 0.11073757754329658, + "learning_rate": 1.8107572212022697e-05, + "loss": 0.9022, + "step": 26365 + }, + { + "epoch": 5.624146757679181, + "grad_norm": 0.1137017392446604, + "learning_rate": 1.8102180718110328e-05, + "loss": 0.919, + "step": 26366 + }, + { + "epoch": 5.624360068259386, + "grad_norm": 0.10984708479061098, + "learning_rate": 1.809678992457281e-05, + "loss": 0.8865, + "step": 26367 + }, + { + "epoch": 5.624573378839591, + "grad_norm": 0.11408041142471938, + "learning_rate": 1.8091399831471168e-05, + "loss": 0.8761, + "step": 26368 + }, + { + "epoch": 5.624786689419795, + "grad_norm": 0.1035678442986189, + "learning_rate": 1.808601043886637e-05, + "loss": 0.8879, + "step": 26369 + }, + { + "epoch": 5.625, + "grad_norm": 0.12004335165363476, + "learning_rate": 1.8080621746819407e-05, + "loss": 0.8878, + "step": 26370 + }, + { + "epoch": 5.625213310580205, + "grad_norm": 0.12205104915208057, + "learning_rate": 1.8075233755391233e-05, + "loss": 0.887, + "step": 26371 + }, + { + "epoch": 5.625426621160409, + "grad_norm": 0.12876433591654887, + "learning_rate": 1.8069846464642847e-05, + "loss": 0.8881, + "step": 26372 + }, + { + "epoch": 5.625639931740614, + "grad_norm": 0.12007750522634339, + "learning_rate": 1.806445987463519e-05, + "loss": 0.8816, + "step": 26373 + }, + { + "epoch": 5.625853242320819, + "grad_norm": 0.11293370922172628, + "learning_rate": 1.8059073985429217e-05, + "loss": 0.8837, + "step": 26374 + }, + { + "epoch": 5.626066552901024, + "grad_norm": 0.15461637874118853, + "learning_rate": 1.805368879708583e-05, + "loss": 0.8927, + "step": 26375 + }, + { + "epoch": 5.626279863481229, + "grad_norm": 0.10281180565843917, + "learning_rate": 1.804830430966603e-05, + "loss": 0.8914, + "step": 26376 + }, + { + "epoch": 5.626493174061434, + "grad_norm": 0.1476354056599913, + "learning_rate": 1.8042920523230708e-05, + "loss": 0.8711, + "step": 26377 + }, + { + "epoch": 5.626706484641638, + "grad_norm": 0.12522310016547858, + "learning_rate": 1.8037537437840776e-05, + "loss": 0.9166, + "step": 26378 + }, + { + "epoch": 5.626919795221843, + "grad_norm": 0.10497939427039266, + "learning_rate": 1.8032155053557155e-05, + "loss": 0.8848, + "step": 26379 + }, + { + "epoch": 5.627133105802048, + "grad_norm": 0.13607761290844758, + "learning_rate": 1.8026773370440736e-05, + "loss": 0.9404, + "step": 26380 + }, + { + "epoch": 5.627346416382252, + "grad_norm": 0.10464561312049268, + "learning_rate": 1.802139238855243e-05, + "loss": 0.8839, + "step": 26381 + }, + { + "epoch": 5.627559726962457, + "grad_norm": 0.1061087966428769, + "learning_rate": 1.80160121079531e-05, + "loss": 0.8672, + "step": 26382 + }, + { + "epoch": 5.627773037542662, + "grad_norm": 0.12523548235641713, + "learning_rate": 1.801063252870362e-05, + "loss": 0.9191, + "step": 26383 + }, + { + "epoch": 5.627986348122867, + "grad_norm": 0.10605527472551474, + "learning_rate": 1.8005253650864897e-05, + "loss": 0.8744, + "step": 26384 + }, + { + "epoch": 5.628199658703072, + "grad_norm": 0.11547502479909535, + "learning_rate": 1.799987547449777e-05, + "loss": 0.8607, + "step": 26385 + }, + { + "epoch": 5.628412969283277, + "grad_norm": 0.10065018106354665, + "learning_rate": 1.7994497999663094e-05, + "loss": 0.8469, + "step": 26386 + }, + { + "epoch": 5.628626279863481, + "grad_norm": 0.11084094782820562, + "learning_rate": 1.79891212264217e-05, + "loss": 0.8918, + "step": 26387 + }, + { + "epoch": 5.628839590443686, + "grad_norm": 0.10023732129424141, + "learning_rate": 1.7983745154834463e-05, + "loss": 0.8998, + "step": 26388 + }, + { + "epoch": 5.629052901023891, + "grad_norm": 0.10812844284222077, + "learning_rate": 1.7978369784962203e-05, + "loss": 0.9178, + "step": 26389 + }, + { + "epoch": 5.6292662116040955, + "grad_norm": 0.10802977976186862, + "learning_rate": 1.7972995116865703e-05, + "loss": 0.9204, + "step": 26390 + }, + { + "epoch": 5.6294795221843, + "grad_norm": 0.10036527344368447, + "learning_rate": 1.7967621150605817e-05, + "loss": 0.8981, + "step": 26391 + }, + { + "epoch": 5.629692832764505, + "grad_norm": 0.11075418859260144, + "learning_rate": 1.7962247886243353e-05, + "loss": 0.8822, + "step": 26392 + }, + { + "epoch": 5.6299061433447095, + "grad_norm": 0.09402697237344468, + "learning_rate": 1.7956875323839086e-05, + "loss": 0.861, + "step": 26393 + }, + { + "epoch": 5.630119453924914, + "grad_norm": 0.12221034751528476, + "learning_rate": 1.7951503463453824e-05, + "loss": 0.883, + "step": 26394 + }, + { + "epoch": 5.63033276450512, + "grad_norm": 0.11687625238130545, + "learning_rate": 1.7946132305148324e-05, + "loss": 0.8804, + "step": 26395 + }, + { + "epoch": 5.6305460750853245, + "grad_norm": 0.10020892997107969, + "learning_rate": 1.79407618489834e-05, + "loss": 0.9063, + "step": 26396 + }, + { + "epoch": 5.630759385665529, + "grad_norm": 0.11213638564492401, + "learning_rate": 1.7935392095019803e-05, + "loss": 0.8873, + "step": 26397 + }, + { + "epoch": 5.630972696245734, + "grad_norm": 0.107633714554126, + "learning_rate": 1.793002304331828e-05, + "loss": 0.8917, + "step": 26398 + }, + { + "epoch": 5.6311860068259385, + "grad_norm": 0.11503297461040109, + "learning_rate": 1.7924654693939578e-05, + "loss": 0.8818, + "step": 26399 + }, + { + "epoch": 5.631399317406143, + "grad_norm": 0.11522045707786385, + "learning_rate": 1.7919287046944464e-05, + "loss": 0.8655, + "step": 26400 + }, + { + "epoch": 5.631612627986348, + "grad_norm": 0.09755738227399728, + "learning_rate": 1.7913920102393696e-05, + "loss": 0.8756, + "step": 26401 + }, + { + "epoch": 5.631825938566553, + "grad_norm": 0.13647403608369163, + "learning_rate": 1.7908553860347914e-05, + "loss": 0.8892, + "step": 26402 + }, + { + "epoch": 5.632039249146757, + "grad_norm": 0.11125916054131534, + "learning_rate": 1.7903188320867907e-05, + "loss": 0.882, + "step": 26403 + }, + { + "epoch": 5.632252559726963, + "grad_norm": 0.11145598613862855, + "learning_rate": 1.7897823484014375e-05, + "loss": 0.8373, + "step": 26404 + }, + { + "epoch": 5.6324658703071675, + "grad_norm": 0.10536307095045015, + "learning_rate": 1.789245934984802e-05, + "loss": 0.8762, + "step": 26405 + }, + { + "epoch": 5.632679180887372, + "grad_norm": 0.12837805811213776, + "learning_rate": 1.7887095918429524e-05, + "loss": 0.8662, + "step": 26406 + }, + { + "epoch": 5.632892491467577, + "grad_norm": 0.11798210333258746, + "learning_rate": 1.788173318981956e-05, + "loss": 0.889, + "step": 26407 + }, + { + "epoch": 5.6331058020477816, + "grad_norm": 0.11130446564061304, + "learning_rate": 1.7876371164078855e-05, + "loss": 0.8723, + "step": 26408 + }, + { + "epoch": 5.633319112627986, + "grad_norm": 0.130410162573009, + "learning_rate": 1.7871009841268058e-05, + "loss": 0.8877, + "step": 26409 + }, + { + "epoch": 5.633532423208191, + "grad_norm": 0.10594968934898816, + "learning_rate": 1.7865649221447824e-05, + "loss": 0.8871, + "step": 26410 + }, + { + "epoch": 5.633745733788396, + "grad_norm": 0.12111125255101084, + "learning_rate": 1.7860289304678797e-05, + "loss": 0.8791, + "step": 26411 + }, + { + "epoch": 5.6339590443686, + "grad_norm": 0.12070886038314309, + "learning_rate": 1.785493009102168e-05, + "loss": 0.8654, + "step": 26412 + }, + { + "epoch": 5.634172354948806, + "grad_norm": 0.1162909909720897, + "learning_rate": 1.7849571580537056e-05, + "loss": 0.8645, + "step": 26413 + }, + { + "epoch": 5.6343856655290105, + "grad_norm": 0.1354499362883824, + "learning_rate": 1.7844213773285553e-05, + "loss": 0.8991, + "step": 26414 + }, + { + "epoch": 5.634598976109215, + "grad_norm": 0.11057517857619989, + "learning_rate": 1.7838856669327843e-05, + "loss": 0.8755, + "step": 26415 + }, + { + "epoch": 5.63481228668942, + "grad_norm": 0.16627225221623093, + "learning_rate": 1.7833500268724507e-05, + "loss": 0.8861, + "step": 26416 + }, + { + "epoch": 5.635025597269625, + "grad_norm": 0.13657219237478888, + "learning_rate": 1.7828144571536166e-05, + "loss": 0.8893, + "step": 26417 + }, + { + "epoch": 5.635238907849829, + "grad_norm": 0.11573998836643154, + "learning_rate": 1.7822789577823415e-05, + "loss": 0.88, + "step": 26418 + }, + { + "epoch": 5.635452218430034, + "grad_norm": 0.1517980051433995, + "learning_rate": 1.7817435287646825e-05, + "loss": 0.8914, + "step": 26419 + }, + { + "epoch": 5.635665529010239, + "grad_norm": 0.10508206117632786, + "learning_rate": 1.7812081701067015e-05, + "loss": 0.891, + "step": 26420 + }, + { + "epoch": 5.635878839590443, + "grad_norm": 0.14493554030147995, + "learning_rate": 1.7806728818144553e-05, + "loss": 0.8843, + "step": 26421 + }, + { + "epoch": 5.636092150170649, + "grad_norm": 0.12845772893184193, + "learning_rate": 1.780137663894e-05, + "loss": 0.909, + "step": 26422 + }, + { + "epoch": 5.636305460750854, + "grad_norm": 0.12352391449478055, + "learning_rate": 1.7796025163513915e-05, + "loss": 0.8865, + "step": 26423 + }, + { + "epoch": 5.636518771331058, + "grad_norm": 0.14625990925087973, + "learning_rate": 1.7790674391926855e-05, + "loss": 0.9029, + "step": 26424 + }, + { + "epoch": 5.636732081911263, + "grad_norm": 0.12486665913049251, + "learning_rate": 1.7785324324239353e-05, + "loss": 0.8795, + "step": 26425 + }, + { + "epoch": 5.636945392491468, + "grad_norm": 0.11654948519401065, + "learning_rate": 1.777997496051194e-05, + "loss": 0.9008, + "step": 26426 + }, + { + "epoch": 5.637158703071672, + "grad_norm": 0.13490767839936704, + "learning_rate": 1.7774626300805176e-05, + "loss": 0.9017, + "step": 26427 + }, + { + "epoch": 5.637372013651877, + "grad_norm": 0.10414812618081906, + "learning_rate": 1.776927834517956e-05, + "loss": 0.872, + "step": 26428 + }, + { + "epoch": 5.637585324232082, + "grad_norm": 0.12712710666873892, + "learning_rate": 1.7763931093695602e-05, + "loss": 0.8895, + "step": 26429 + }, + { + "epoch": 5.637798634812286, + "grad_norm": 0.13077031773326409, + "learning_rate": 1.7758584546413818e-05, + "loss": 0.8979, + "step": 26430 + }, + { + "epoch": 5.638011945392492, + "grad_norm": 0.12117994825144497, + "learning_rate": 1.7753238703394674e-05, + "loss": 0.8873, + "step": 26431 + }, + { + "epoch": 5.638225255972696, + "grad_norm": 0.1393480752086312, + "learning_rate": 1.77478935646987e-05, + "loss": 0.8825, + "step": 26432 + }, + { + "epoch": 5.638438566552901, + "grad_norm": 0.12175016003174903, + "learning_rate": 1.774254913038636e-05, + "loss": 0.9039, + "step": 26433 + }, + { + "epoch": 5.638651877133106, + "grad_norm": 0.12293338103160302, + "learning_rate": 1.7737205400518124e-05, + "loss": 0.8678, + "step": 26434 + }, + { + "epoch": 5.638865187713311, + "grad_norm": 0.12956058078343966, + "learning_rate": 1.7731862375154455e-05, + "loss": 0.8931, + "step": 26435 + }, + { + "epoch": 5.639078498293515, + "grad_norm": 0.12112038608730581, + "learning_rate": 1.772652005435581e-05, + "loss": 0.8577, + "step": 26436 + }, + { + "epoch": 5.63929180887372, + "grad_norm": 0.14891640305438736, + "learning_rate": 1.772117843818264e-05, + "loss": 0.8923, + "step": 26437 + }, + { + "epoch": 5.639505119453925, + "grad_norm": 0.10674844047425033, + "learning_rate": 1.7715837526695355e-05, + "loss": 0.8944, + "step": 26438 + }, + { + "epoch": 5.639718430034129, + "grad_norm": 0.15783470892920606, + "learning_rate": 1.7710497319954445e-05, + "loss": 0.9008, + "step": 26439 + }, + { + "epoch": 5.639931740614334, + "grad_norm": 0.11732177253239727, + "learning_rate": 1.77051578180203e-05, + "loss": 0.8679, + "step": 26440 + }, + { + "epoch": 5.640145051194539, + "grad_norm": 0.10903304627804075, + "learning_rate": 1.769981902095335e-05, + "loss": 0.8917, + "step": 26441 + }, + { + "epoch": 5.640358361774744, + "grad_norm": 0.15918544385458772, + "learning_rate": 1.7694480928813992e-05, + "loss": 0.8789, + "step": 26442 + }, + { + "epoch": 5.640571672354949, + "grad_norm": 0.10940786865465058, + "learning_rate": 1.7689143541662605e-05, + "loss": 0.8762, + "step": 26443 + }, + { + "epoch": 5.640784982935154, + "grad_norm": 0.12685579400610994, + "learning_rate": 1.7683806859559627e-05, + "loss": 0.8747, + "step": 26444 + }, + { + "epoch": 5.640998293515358, + "grad_norm": 0.11671777781380673, + "learning_rate": 1.767847088256545e-05, + "loss": 0.9084, + "step": 26445 + }, + { + "epoch": 5.641211604095563, + "grad_norm": 0.10938648480980913, + "learning_rate": 1.767313561074038e-05, + "loss": 0.8568, + "step": 26446 + }, + { + "epoch": 5.641424914675768, + "grad_norm": 0.11491571538883327, + "learning_rate": 1.766780104414485e-05, + "loss": 0.8747, + "step": 26447 + }, + { + "epoch": 5.6416382252559725, + "grad_norm": 0.12853103349037237, + "learning_rate": 1.7662467182839203e-05, + "loss": 0.8805, + "step": 26448 + }, + { + "epoch": 5.641851535836177, + "grad_norm": 0.10869099555125752, + "learning_rate": 1.765713402688378e-05, + "loss": 0.9124, + "step": 26449 + }, + { + "epoch": 5.642064846416382, + "grad_norm": 0.13592058344400776, + "learning_rate": 1.765180157633892e-05, + "loss": 0.8812, + "step": 26450 + }, + { + "epoch": 5.642278156996587, + "grad_norm": 0.10593686283694455, + "learning_rate": 1.7646469831264985e-05, + "loss": 0.868, + "step": 26451 + }, + { + "epoch": 5.642491467576792, + "grad_norm": 0.15053146248967386, + "learning_rate": 1.7641138791722302e-05, + "loss": 0.8925, + "step": 26452 + }, + { + "epoch": 5.642704778156997, + "grad_norm": 0.10479401964943333, + "learning_rate": 1.763580845777118e-05, + "loss": 0.8693, + "step": 26453 + }, + { + "epoch": 5.6429180887372015, + "grad_norm": 0.13843075550226394, + "learning_rate": 1.7630478829471938e-05, + "loss": 0.8736, + "step": 26454 + }, + { + "epoch": 5.643131399317406, + "grad_norm": 0.0999054887737848, + "learning_rate": 1.7625149906884847e-05, + "loss": 0.8749, + "step": 26455 + }, + { + "epoch": 5.643344709897611, + "grad_norm": 0.14656918628155768, + "learning_rate": 1.761982169007026e-05, + "loss": 0.9181, + "step": 26456 + }, + { + "epoch": 5.6435580204778155, + "grad_norm": 0.10617535863297001, + "learning_rate": 1.761449417908847e-05, + "loss": 0.8962, + "step": 26457 + }, + { + "epoch": 5.64377133105802, + "grad_norm": 0.12784956041866047, + "learning_rate": 1.760916737399967e-05, + "loss": 0.8685, + "step": 26458 + }, + { + "epoch": 5.643984641638225, + "grad_norm": 0.11530658788511305, + "learning_rate": 1.760384127486422e-05, + "loss": 0.8908, + "step": 26459 + }, + { + "epoch": 5.6441979522184305, + "grad_norm": 0.12423728849823518, + "learning_rate": 1.759851588174236e-05, + "loss": 0.879, + "step": 26460 + }, + { + "epoch": 5.644411262798635, + "grad_norm": 0.13897617703391255, + "learning_rate": 1.7593191194694333e-05, + "loss": 0.8977, + "step": 26461 + }, + { + "epoch": 5.64462457337884, + "grad_norm": 0.10789684455432508, + "learning_rate": 1.758786721378038e-05, + "loss": 0.9057, + "step": 26462 + }, + { + "epoch": 5.6448378839590445, + "grad_norm": 0.14480580634745435, + "learning_rate": 1.7582543939060777e-05, + "loss": 0.9031, + "step": 26463 + }, + { + "epoch": 5.645051194539249, + "grad_norm": 0.11064047840488024, + "learning_rate": 1.7577221370595748e-05, + "loss": 0.896, + "step": 26464 + }, + { + "epoch": 5.645264505119454, + "grad_norm": 0.12305988136251578, + "learning_rate": 1.7571899508445504e-05, + "loss": 0.8736, + "step": 26465 + }, + { + "epoch": 5.645477815699659, + "grad_norm": 0.11048821211974723, + "learning_rate": 1.756657835267026e-05, + "loss": 0.8785, + "step": 26466 + }, + { + "epoch": 5.645691126279863, + "grad_norm": 0.12448045851479933, + "learning_rate": 1.7561257903330223e-05, + "loss": 0.896, + "step": 26467 + }, + { + "epoch": 5.645904436860068, + "grad_norm": 0.10705250866800466, + "learning_rate": 1.755593816048564e-05, + "loss": 0.8992, + "step": 26468 + }, + { + "epoch": 5.6461177474402735, + "grad_norm": 0.11713180881131624, + "learning_rate": 1.755061912419664e-05, + "loss": 0.9022, + "step": 26469 + }, + { + "epoch": 5.646331058020478, + "grad_norm": 0.11361610371546911, + "learning_rate": 1.7545300794523426e-05, + "loss": 0.8653, + "step": 26470 + }, + { + "epoch": 5.646544368600683, + "grad_norm": 0.10199548482017519, + "learning_rate": 1.7539983171526193e-05, + "loss": 0.8421, + "step": 26471 + }, + { + "epoch": 5.646757679180888, + "grad_norm": 0.11501920116202159, + "learning_rate": 1.7534666255265112e-05, + "loss": 0.8811, + "step": 26472 + }, + { + "epoch": 5.646970989761092, + "grad_norm": 0.1341820488244789, + "learning_rate": 1.7529350045800324e-05, + "loss": 0.9123, + "step": 26473 + }, + { + "epoch": 5.647184300341297, + "grad_norm": 0.13217920552518206, + "learning_rate": 1.7524034543191977e-05, + "loss": 0.8772, + "step": 26474 + }, + { + "epoch": 5.647397610921502, + "grad_norm": 0.12311584072267752, + "learning_rate": 1.7518719747500244e-05, + "loss": 0.9173, + "step": 26475 + }, + { + "epoch": 5.647610921501706, + "grad_norm": 0.12084681085932454, + "learning_rate": 1.7513405658785253e-05, + "loss": 0.8977, + "step": 26476 + }, + { + "epoch": 5.647824232081911, + "grad_norm": 0.1159814840322584, + "learning_rate": 1.750809227710713e-05, + "loss": 0.8704, + "step": 26477 + }, + { + "epoch": 5.648037542662116, + "grad_norm": 0.12847226959923858, + "learning_rate": 1.750277960252599e-05, + "loss": 0.8715, + "step": 26478 + }, + { + "epoch": 5.64825085324232, + "grad_norm": 0.12647528338341021, + "learning_rate": 1.749746763510195e-05, + "loss": 0.8887, + "step": 26479 + }, + { + "epoch": 5.648464163822526, + "grad_norm": 0.1256108413322722, + "learning_rate": 1.7492156374895113e-05, + "loss": 0.9052, + "step": 26480 + }, + { + "epoch": 5.648677474402731, + "grad_norm": 0.12232261622343928, + "learning_rate": 1.7486845821965586e-05, + "loss": 0.857, + "step": 26481 + }, + { + "epoch": 5.648890784982935, + "grad_norm": 0.12251101871371477, + "learning_rate": 1.748153597637342e-05, + "loss": 0.9077, + "step": 26482 + }, + { + "epoch": 5.64910409556314, + "grad_norm": 0.11081588711472591, + "learning_rate": 1.747622683817875e-05, + "loss": 0.8668, + "step": 26483 + }, + { + "epoch": 5.649317406143345, + "grad_norm": 0.10035020296709825, + "learning_rate": 1.7470918407441626e-05, + "loss": 0.888, + "step": 26484 + }, + { + "epoch": 5.649530716723549, + "grad_norm": 0.11986321871551334, + "learning_rate": 1.7465610684222117e-05, + "loss": 0.8791, + "step": 26485 + }, + { + "epoch": 5.649744027303754, + "grad_norm": 0.11131951011820401, + "learning_rate": 1.746030366858027e-05, + "loss": 0.8989, + "step": 26486 + }, + { + "epoch": 5.649957337883959, + "grad_norm": 0.10961453318702899, + "learning_rate": 1.745499736057611e-05, + "loss": 0.8875, + "step": 26487 + }, + { + "epoch": 5.650170648464163, + "grad_norm": 0.12935502340047403, + "learning_rate": 1.7449691760269738e-05, + "loss": 0.8443, + "step": 26488 + }, + { + "epoch": 5.650383959044369, + "grad_norm": 0.1138295318927952, + "learning_rate": 1.7444386867721155e-05, + "loss": 0.8773, + "step": 26489 + }, + { + "epoch": 5.650597269624574, + "grad_norm": 0.12449209567129811, + "learning_rate": 1.7439082682990385e-05, + "loss": 0.9099, + "step": 26490 + }, + { + "epoch": 5.650810580204778, + "grad_norm": 0.11470112714620179, + "learning_rate": 1.743377920613745e-05, + "loss": 0.9116, + "step": 26491 + }, + { + "epoch": 5.651023890784983, + "grad_norm": 0.12047574615542174, + "learning_rate": 1.742847643722235e-05, + "loss": 0.8802, + "step": 26492 + }, + { + "epoch": 5.651237201365188, + "grad_norm": 0.1320712152859885, + "learning_rate": 1.7423174376305095e-05, + "loss": 0.9111, + "step": 26493 + }, + { + "epoch": 5.651450511945392, + "grad_norm": 0.11842687999856366, + "learning_rate": 1.741787302344565e-05, + "loss": 0.9075, + "step": 26494 + }, + { + "epoch": 5.651663822525597, + "grad_norm": 0.10374930763453287, + "learning_rate": 1.7412572378704053e-05, + "loss": 0.8638, + "step": 26495 + }, + { + "epoch": 5.651877133105802, + "grad_norm": 0.12030729509435938, + "learning_rate": 1.740727244214025e-05, + "loss": 0.8751, + "step": 26496 + }, + { + "epoch": 5.652090443686006, + "grad_norm": 0.10401209829143311, + "learning_rate": 1.7401973213814206e-05, + "loss": 0.8799, + "step": 26497 + }, + { + "epoch": 5.652303754266212, + "grad_norm": 0.11245415049251277, + "learning_rate": 1.7396674693785894e-05, + "loss": 0.8822, + "step": 26498 + }, + { + "epoch": 5.652517064846417, + "grad_norm": 0.1059121498768753, + "learning_rate": 1.7391376882115238e-05, + "loss": 0.8838, + "step": 26499 + }, + { + "epoch": 5.652730375426621, + "grad_norm": 0.106131662488545, + "learning_rate": 1.7386079778862227e-05, + "loss": 0.9398, + "step": 26500 + }, + { + "epoch": 5.652943686006826, + "grad_norm": 0.12218802891442805, + "learning_rate": 1.7380783384086797e-05, + "loss": 0.9279, + "step": 26501 + }, + { + "epoch": 5.653156996587031, + "grad_norm": 0.09688510184278112, + "learning_rate": 1.7375487697848816e-05, + "loss": 0.8672, + "step": 26502 + }, + { + "epoch": 5.653370307167235, + "grad_norm": 0.12108077696632594, + "learning_rate": 1.7370192720208256e-05, + "loss": 0.8994, + "step": 26503 + }, + { + "epoch": 5.65358361774744, + "grad_norm": 0.11075903989779776, + "learning_rate": 1.7364898451225034e-05, + "loss": 0.8828, + "step": 26504 + }, + { + "epoch": 5.653796928327645, + "grad_norm": 0.12899482989429675, + "learning_rate": 1.735960489095903e-05, + "loss": 0.8823, + "step": 26505 + }, + { + "epoch": 5.6540102389078495, + "grad_norm": 0.11701097418325579, + "learning_rate": 1.7354312039470133e-05, + "loss": 0.908, + "step": 26506 + }, + { + "epoch": 5.654223549488055, + "grad_norm": 0.12702340696697714, + "learning_rate": 1.7349019896818272e-05, + "loss": 0.9186, + "step": 26507 + }, + { + "epoch": 5.65443686006826, + "grad_norm": 0.11399753040623696, + "learning_rate": 1.7343728463063295e-05, + "loss": 0.8808, + "step": 26508 + }, + { + "epoch": 5.654650170648464, + "grad_norm": 0.12618333076424784, + "learning_rate": 1.7338437738265092e-05, + "loss": 0.8819, + "step": 26509 + }, + { + "epoch": 5.654863481228669, + "grad_norm": 0.11041173120220446, + "learning_rate": 1.7333147722483515e-05, + "loss": 0.8956, + "step": 26510 + }, + { + "epoch": 5.655076791808874, + "grad_norm": 0.1224197742086748, + "learning_rate": 1.732785841577841e-05, + "loss": 0.8985, + "step": 26511 + }, + { + "epoch": 5.6552901023890785, + "grad_norm": 0.1426645490624345, + "learning_rate": 1.7322569818209664e-05, + "loss": 0.8807, + "step": 26512 + }, + { + "epoch": 5.655503412969283, + "grad_norm": 0.10440578110793829, + "learning_rate": 1.7317281929837112e-05, + "loss": 0.9047, + "step": 26513 + }, + { + "epoch": 5.655716723549488, + "grad_norm": 0.1175842669964981, + "learning_rate": 1.731199475072053e-05, + "loss": 0.8704, + "step": 26514 + }, + { + "epoch": 5.6559300341296925, + "grad_norm": 0.11491505729075291, + "learning_rate": 1.73067082809198e-05, + "loss": 0.8906, + "step": 26515 + }, + { + "epoch": 5.656143344709898, + "grad_norm": 0.11942345285908623, + "learning_rate": 1.730142252049472e-05, + "loss": 0.8728, + "step": 26516 + }, + { + "epoch": 5.656356655290102, + "grad_norm": 0.1128096494795703, + "learning_rate": 1.72961374695051e-05, + "loss": 0.8739, + "step": 26517 + }, + { + "epoch": 5.6565699658703075, + "grad_norm": 0.10907626266581713, + "learning_rate": 1.729085312801072e-05, + "loss": 0.8636, + "step": 26518 + }, + { + "epoch": 5.656783276450512, + "grad_norm": 0.11888236474930722, + "learning_rate": 1.728556949607141e-05, + "loss": 0.8954, + "step": 26519 + }, + { + "epoch": 5.656996587030717, + "grad_norm": 0.12774741854336244, + "learning_rate": 1.7280286573746935e-05, + "loss": 0.9105, + "step": 26520 + }, + { + "epoch": 5.6572098976109215, + "grad_norm": 0.11705468842719473, + "learning_rate": 1.7275004361097075e-05, + "loss": 0.9034, + "step": 26521 + }, + { + "epoch": 5.657423208191126, + "grad_norm": 0.12818129529315225, + "learning_rate": 1.7269722858181593e-05, + "loss": 0.9037, + "step": 26522 + }, + { + "epoch": 5.657636518771331, + "grad_norm": 0.11200230572312068, + "learning_rate": 1.726444206506023e-05, + "loss": 0.8765, + "step": 26523 + }, + { + "epoch": 5.657849829351536, + "grad_norm": 0.12895177621509737, + "learning_rate": 1.725916198179281e-05, + "loss": 0.8878, + "step": 26524 + }, + { + "epoch": 5.65806313993174, + "grad_norm": 0.11846334618435914, + "learning_rate": 1.7253882608439e-05, + "loss": 0.9072, + "step": 26525 + }, + { + "epoch": 5.658276450511945, + "grad_norm": 0.11611144082491254, + "learning_rate": 1.7248603945058543e-05, + "loss": 0.8846, + "step": 26526 + }, + { + "epoch": 5.6584897610921505, + "grad_norm": 0.11142592432131364, + "learning_rate": 1.7243325991711202e-05, + "loss": 0.8615, + "step": 26527 + }, + { + "epoch": 5.658703071672355, + "grad_norm": 0.12202446039142091, + "learning_rate": 1.72380487484567e-05, + "loss": 0.8765, + "step": 26528 + }, + { + "epoch": 5.65891638225256, + "grad_norm": 0.10266855155008306, + "learning_rate": 1.7232772215354724e-05, + "loss": 0.8883, + "step": 26529 + }, + { + "epoch": 5.659129692832765, + "grad_norm": 0.12536870611760234, + "learning_rate": 1.722749639246496e-05, + "loss": 0.8871, + "step": 26530 + }, + { + "epoch": 5.659343003412969, + "grad_norm": 0.1211281875898225, + "learning_rate": 1.7222221279847156e-05, + "loss": 0.8759, + "step": 26531 + }, + { + "epoch": 5.659556313993174, + "grad_norm": 0.10684557558535454, + "learning_rate": 1.721694687756097e-05, + "loss": 0.9151, + "step": 26532 + }, + { + "epoch": 5.659769624573379, + "grad_norm": 0.12823377975716138, + "learning_rate": 1.721167318566609e-05, + "loss": 0.901, + "step": 26533 + }, + { + "epoch": 5.659982935153583, + "grad_norm": 0.11159490220644794, + "learning_rate": 1.7206400204222183e-05, + "loss": 0.9058, + "step": 26534 + }, + { + "epoch": 5.660196245733788, + "grad_norm": 0.11051996332879135, + "learning_rate": 1.720112793328892e-05, + "loss": 0.8918, + "step": 26535 + }, + { + "epoch": 5.660409556313994, + "grad_norm": 0.11390986239398078, + "learning_rate": 1.7195856372925942e-05, + "loss": 0.8866, + "step": 26536 + }, + { + "epoch": 5.660622866894198, + "grad_norm": 0.10185485096979335, + "learning_rate": 1.7190585523192912e-05, + "loss": 0.8757, + "step": 26537 + }, + { + "epoch": 5.660836177474403, + "grad_norm": 0.12421772777032979, + "learning_rate": 1.7185315384149436e-05, + "loss": 0.8759, + "step": 26538 + }, + { + "epoch": 5.661049488054608, + "grad_norm": 0.11121727089910567, + "learning_rate": 1.7180045955855195e-05, + "loss": 0.8737, + "step": 26539 + }, + { + "epoch": 5.661262798634812, + "grad_norm": 0.11389553539107101, + "learning_rate": 1.7174777238369797e-05, + "loss": 0.8631, + "step": 26540 + }, + { + "epoch": 5.661476109215017, + "grad_norm": 0.11257304255448532, + "learning_rate": 1.7169509231752846e-05, + "loss": 0.8776, + "step": 26541 + }, + { + "epoch": 5.661689419795222, + "grad_norm": 0.10905396113257489, + "learning_rate": 1.7164241936063933e-05, + "loss": 0.894, + "step": 26542 + }, + { + "epoch": 5.661902730375426, + "grad_norm": 0.11333055093286935, + "learning_rate": 1.7158975351362705e-05, + "loss": 0.8453, + "step": 26543 + }, + { + "epoch": 5.662116040955631, + "grad_norm": 0.10414272037812619, + "learning_rate": 1.7153709477708726e-05, + "loss": 0.8827, + "step": 26544 + }, + { + "epoch": 5.662329351535837, + "grad_norm": 0.1143067314557398, + "learning_rate": 1.7148444315161573e-05, + "loss": 0.8858, + "step": 26545 + }, + { + "epoch": 5.662542662116041, + "grad_norm": 0.11471574057767445, + "learning_rate": 1.7143179863780847e-05, + "loss": 0.907, + "step": 26546 + }, + { + "epoch": 5.662755972696246, + "grad_norm": 0.10943896431999076, + "learning_rate": 1.7137916123626083e-05, + "loss": 0.9055, + "step": 26547 + }, + { + "epoch": 5.662969283276451, + "grad_norm": 0.12377416996162643, + "learning_rate": 1.713265309475686e-05, + "loss": 0.88, + "step": 26548 + }, + { + "epoch": 5.663182593856655, + "grad_norm": 0.10496246115123811, + "learning_rate": 1.7127390777232724e-05, + "loss": 0.8797, + "step": 26549 + }, + { + "epoch": 5.66339590443686, + "grad_norm": 0.11192786410356784, + "learning_rate": 1.7122129171113204e-05, + "loss": 0.8782, + "step": 26550 + }, + { + "epoch": 5.663609215017065, + "grad_norm": 0.10582971054129524, + "learning_rate": 1.711686827645786e-05, + "loss": 0.8905, + "step": 26551 + }, + { + "epoch": 5.663822525597269, + "grad_norm": 0.11255968132174908, + "learning_rate": 1.7111608093326218e-05, + "loss": 0.8682, + "step": 26552 + }, + { + "epoch": 5.664035836177474, + "grad_norm": 0.12055234559513221, + "learning_rate": 1.710634862177779e-05, + "loss": 0.9218, + "step": 26553 + }, + { + "epoch": 5.66424914675768, + "grad_norm": 0.11496364087824888, + "learning_rate": 1.7101089861872058e-05, + "loss": 0.8933, + "step": 26554 + }, + { + "epoch": 5.664462457337884, + "grad_norm": 0.11963632663621117, + "learning_rate": 1.709583181366857e-05, + "loss": 0.8963, + "step": 26555 + }, + { + "epoch": 5.664675767918089, + "grad_norm": 0.1032696809103366, + "learning_rate": 1.7090574477226817e-05, + "loss": 0.8802, + "step": 26556 + }, + { + "epoch": 5.664889078498294, + "grad_norm": 0.12224739237502623, + "learning_rate": 1.708531785260628e-05, + "loss": 0.8895, + "step": 26557 + }, + { + "epoch": 5.665102389078498, + "grad_norm": 0.12052708307737038, + "learning_rate": 1.7080061939866396e-05, + "loss": 0.8917, + "step": 26558 + }, + { + "epoch": 5.665315699658703, + "grad_norm": 0.10942969971062996, + "learning_rate": 1.7074806739066686e-05, + "loss": 0.8763, + "step": 26559 + }, + { + "epoch": 5.665529010238908, + "grad_norm": 0.11624087780973803, + "learning_rate": 1.7069552250266593e-05, + "loss": 0.8701, + "step": 26560 + }, + { + "epoch": 5.665742320819112, + "grad_norm": 0.1134179027234764, + "learning_rate": 1.706429847352558e-05, + "loss": 0.9016, + "step": 26561 + }, + { + "epoch": 5.665955631399317, + "grad_norm": 0.10578641448033985, + "learning_rate": 1.705904540890306e-05, + "loss": 0.9025, + "step": 26562 + }, + { + "epoch": 5.666168941979522, + "grad_norm": 0.11863343377920647, + "learning_rate": 1.7053793056458534e-05, + "loss": 0.8928, + "step": 26563 + }, + { + "epoch": 5.6663822525597265, + "grad_norm": 0.1042338026556535, + "learning_rate": 1.7048541416251384e-05, + "loss": 0.8704, + "step": 26564 + }, + { + "epoch": 5.666595563139932, + "grad_norm": 0.11361141705584613, + "learning_rate": 1.7043290488341063e-05, + "loss": 0.8824, + "step": 26565 + }, + { + "epoch": 5.666808873720137, + "grad_norm": 0.10784531216524766, + "learning_rate": 1.703804027278693e-05, + "loss": 0.8614, + "step": 26566 + }, + { + "epoch": 5.667022184300341, + "grad_norm": 0.10903695710670326, + "learning_rate": 1.7032790769648464e-05, + "loss": 0.8915, + "step": 26567 + }, + { + "epoch": 5.667235494880546, + "grad_norm": 0.09870107293317468, + "learning_rate": 1.7027541978985046e-05, + "loss": 0.8918, + "step": 26568 + }, + { + "epoch": 5.667448805460751, + "grad_norm": 0.1087767256030566, + "learning_rate": 1.7022293900856032e-05, + "loss": 0.9152, + "step": 26569 + }, + { + "epoch": 5.6676621160409555, + "grad_norm": 0.10572681882755752, + "learning_rate": 1.7017046535320797e-05, + "loss": 0.9088, + "step": 26570 + }, + { + "epoch": 5.66787542662116, + "grad_norm": 0.12137165552464742, + "learning_rate": 1.701179988243876e-05, + "loss": 0.8647, + "step": 26571 + }, + { + "epoch": 5.668088737201365, + "grad_norm": 0.11529026132662634, + "learning_rate": 1.7006553942269272e-05, + "loss": 0.9247, + "step": 26572 + }, + { + "epoch": 5.6683020477815695, + "grad_norm": 0.11951683710202704, + "learning_rate": 1.7001308714871685e-05, + "loss": 0.8894, + "step": 26573 + }, + { + "epoch": 5.668515358361775, + "grad_norm": 0.11038065807275915, + "learning_rate": 1.6996064200305324e-05, + "loss": 0.9116, + "step": 26574 + }, + { + "epoch": 5.66872866894198, + "grad_norm": 0.14305521541875396, + "learning_rate": 1.6990820398629583e-05, + "loss": 0.8957, + "step": 26575 + }, + { + "epoch": 5.6689419795221845, + "grad_norm": 0.10439842010887922, + "learning_rate": 1.698557730990376e-05, + "loss": 0.8762, + "step": 26576 + }, + { + "epoch": 5.669155290102389, + "grad_norm": 0.1027883407201751, + "learning_rate": 1.69803349341872e-05, + "loss": 0.9004, + "step": 26577 + }, + { + "epoch": 5.669368600682594, + "grad_norm": 0.13232938180205248, + "learning_rate": 1.6975093271539183e-05, + "loss": 0.9039, + "step": 26578 + }, + { + "epoch": 5.6695819112627985, + "grad_norm": 0.10657703356223368, + "learning_rate": 1.6969852322019064e-05, + "loss": 0.9089, + "step": 26579 + }, + { + "epoch": 5.669795221843003, + "grad_norm": 0.13542925467918943, + "learning_rate": 1.696461208568615e-05, + "loss": 0.9063, + "step": 26580 + }, + { + "epoch": 5.670008532423208, + "grad_norm": 0.10889405125304687, + "learning_rate": 1.6959372562599696e-05, + "loss": 0.8934, + "step": 26581 + }, + { + "epoch": 5.670221843003413, + "grad_norm": 0.12613826035956197, + "learning_rate": 1.6954133752818975e-05, + "loss": 0.9173, + "step": 26582 + }, + { + "epoch": 5.670435153583618, + "grad_norm": 0.10944898757682826, + "learning_rate": 1.694889565640331e-05, + "loss": 0.9019, + "step": 26583 + }, + { + "epoch": 5.670648464163823, + "grad_norm": 0.1413905908455325, + "learning_rate": 1.694365827341196e-05, + "loss": 0.8983, + "step": 26584 + }, + { + "epoch": 5.6708617747440275, + "grad_norm": 0.11414814477139235, + "learning_rate": 1.693842160390417e-05, + "loss": 0.898, + "step": 26585 + }, + { + "epoch": 5.671075085324232, + "grad_norm": 0.15324076251888108, + "learning_rate": 1.6933185647939183e-05, + "loss": 0.8869, + "step": 26586 + }, + { + "epoch": 5.671288395904437, + "grad_norm": 0.1208246532129267, + "learning_rate": 1.692795040557628e-05, + "loss": 0.8908, + "step": 26587 + }, + { + "epoch": 5.671501706484642, + "grad_norm": 0.15971021141742756, + "learning_rate": 1.6922715876874674e-05, + "loss": 0.9184, + "step": 26588 + }, + { + "epoch": 5.671715017064846, + "grad_norm": 0.13398535664054126, + "learning_rate": 1.6917482061893605e-05, + "loss": 0.9032, + "step": 26589 + }, + { + "epoch": 5.671928327645051, + "grad_norm": 0.1385178942351551, + "learning_rate": 1.691224896069226e-05, + "loss": 0.8926, + "step": 26590 + }, + { + "epoch": 5.672141638225256, + "grad_norm": 0.13977270197160852, + "learning_rate": 1.6907016573329934e-05, + "loss": 0.876, + "step": 26591 + }, + { + "epoch": 5.672354948805461, + "grad_norm": 0.12744178366237166, + "learning_rate": 1.6901784899865744e-05, + "loss": 0.9154, + "step": 26592 + }, + { + "epoch": 5.672568259385666, + "grad_norm": 0.1403918066531266, + "learning_rate": 1.689655394035892e-05, + "loss": 0.8767, + "step": 26593 + }, + { + "epoch": 5.672781569965871, + "grad_norm": 0.13642458598244295, + "learning_rate": 1.6891323694868633e-05, + "loss": 0.8934, + "step": 26594 + }, + { + "epoch": 5.672994880546075, + "grad_norm": 0.16867230319012125, + "learning_rate": 1.6886094163454104e-05, + "loss": 0.8947, + "step": 26595 + }, + { + "epoch": 5.67320819112628, + "grad_norm": 0.10406988777435189, + "learning_rate": 1.6880865346174475e-05, + "loss": 0.8381, + "step": 26596 + }, + { + "epoch": 5.673421501706485, + "grad_norm": 0.17317356625517968, + "learning_rate": 1.687563724308892e-05, + "loss": 0.8723, + "step": 26597 + }, + { + "epoch": 5.673634812286689, + "grad_norm": 0.10885705756529751, + "learning_rate": 1.6870409854256564e-05, + "loss": 0.8792, + "step": 26598 + }, + { + "epoch": 5.673848122866894, + "grad_norm": 0.17111450329510153, + "learning_rate": 1.6865183179736613e-05, + "loss": 0.8963, + "step": 26599 + }, + { + "epoch": 5.674061433447099, + "grad_norm": 0.10352372978441786, + "learning_rate": 1.6859957219588166e-05, + "loss": 0.8618, + "step": 26600 + }, + { + "epoch": 5.674274744027304, + "grad_norm": 0.1452404044731857, + "learning_rate": 1.6854731973870376e-05, + "loss": 0.8857, + "step": 26601 + }, + { + "epoch": 5.674488054607508, + "grad_norm": 0.11774506459356567, + "learning_rate": 1.684950744264235e-05, + "loss": 0.9087, + "step": 26602 + }, + { + "epoch": 5.674701365187714, + "grad_norm": 0.13597373472259391, + "learning_rate": 1.6844283625963207e-05, + "loss": 0.8961, + "step": 26603 + }, + { + "epoch": 5.674914675767918, + "grad_norm": 0.13302873511423716, + "learning_rate": 1.6839060523892063e-05, + "loss": 0.9255, + "step": 26604 + }, + { + "epoch": 5.675127986348123, + "grad_norm": 0.13176400674706748, + "learning_rate": 1.6833838136488e-05, + "loss": 0.8931, + "step": 26605 + }, + { + "epoch": 5.675341296928328, + "grad_norm": 0.11690759840602313, + "learning_rate": 1.682861646381011e-05, + "loss": 0.912, + "step": 26606 + }, + { + "epoch": 5.675554607508532, + "grad_norm": 0.11184588598618642, + "learning_rate": 1.6823395505917508e-05, + "loss": 0.875, + "step": 26607 + }, + { + "epoch": 5.675767918088737, + "grad_norm": 0.12871847491102476, + "learning_rate": 1.681817526286925e-05, + "loss": 0.8835, + "step": 26608 + }, + { + "epoch": 5.675981228668942, + "grad_norm": 0.10991195869907261, + "learning_rate": 1.6812955734724398e-05, + "loss": 0.8628, + "step": 26609 + }, + { + "epoch": 5.676194539249146, + "grad_norm": 0.11713815044238692, + "learning_rate": 1.6807736921541994e-05, + "loss": 0.8626, + "step": 26610 + }, + { + "epoch": 5.676407849829351, + "grad_norm": 0.11047963334016056, + "learning_rate": 1.680251882338113e-05, + "loss": 0.8892, + "step": 26611 + }, + { + "epoch": 5.676621160409557, + "grad_norm": 0.12095512487729597, + "learning_rate": 1.6797301440300833e-05, + "loss": 0.9151, + "step": 26612 + }, + { + "epoch": 5.676834470989761, + "grad_norm": 0.10763444959627003, + "learning_rate": 1.6792084772360126e-05, + "loss": 0.8806, + "step": 26613 + }, + { + "epoch": 5.677047781569966, + "grad_norm": 0.10847992071294381, + "learning_rate": 1.678686881961805e-05, + "loss": 0.8713, + "step": 26614 + }, + { + "epoch": 5.677261092150171, + "grad_norm": 0.12669457696233433, + "learning_rate": 1.6781653582133612e-05, + "loss": 0.8888, + "step": 26615 + }, + { + "epoch": 5.677474402730375, + "grad_norm": 0.10384903270013486, + "learning_rate": 1.6776439059965827e-05, + "loss": 0.8955, + "step": 26616 + }, + { + "epoch": 5.67768771331058, + "grad_norm": 0.13857719400698879, + "learning_rate": 1.67712252531737e-05, + "loss": 0.8744, + "step": 26617 + }, + { + "epoch": 5.677901023890785, + "grad_norm": 0.10342150225710789, + "learning_rate": 1.67660121618162e-05, + "loss": 0.9113, + "step": 26618 + }, + { + "epoch": 5.6781143344709895, + "grad_norm": 0.12965263502290475, + "learning_rate": 1.676079978595236e-05, + "loss": 0.8796, + "step": 26619 + }, + { + "epoch": 5.678327645051194, + "grad_norm": 0.10802457362783853, + "learning_rate": 1.6755588125641125e-05, + "loss": 0.8658, + "step": 26620 + }, + { + "epoch": 5.6785409556314, + "grad_norm": 0.10867701571890374, + "learning_rate": 1.6750377180941483e-05, + "loss": 0.8905, + "step": 26621 + }, + { + "epoch": 5.678754266211604, + "grad_norm": 0.09859020285298017, + "learning_rate": 1.6745166951912367e-05, + "loss": 0.8962, + "step": 26622 + }, + { + "epoch": 5.678967576791809, + "grad_norm": 0.11139382516276658, + "learning_rate": 1.6739957438612764e-05, + "loss": 0.9121, + "step": 26623 + }, + { + "epoch": 5.679180887372014, + "grad_norm": 0.09477387429706081, + "learning_rate": 1.6734748641101635e-05, + "loss": 0.8617, + "step": 26624 + }, + { + "epoch": 5.6793941979522184, + "grad_norm": 0.10924451044418577, + "learning_rate": 1.6729540559437847e-05, + "loss": 0.904, + "step": 26625 + }, + { + "epoch": 5.679607508532423, + "grad_norm": 0.10102997328277334, + "learning_rate": 1.6724333193680387e-05, + "loss": 0.8747, + "step": 26626 + }, + { + "epoch": 5.679820819112628, + "grad_norm": 0.10735496014315495, + "learning_rate": 1.6719126543888168e-05, + "loss": 0.8977, + "step": 26627 + }, + { + "epoch": 5.6800341296928325, + "grad_norm": 0.10576156348972424, + "learning_rate": 1.671392061012009e-05, + "loss": 0.8811, + "step": 26628 + }, + { + "epoch": 5.680247440273037, + "grad_norm": 0.11284001620671502, + "learning_rate": 1.670871539243507e-05, + "loss": 0.8808, + "step": 26629 + }, + { + "epoch": 5.680460750853243, + "grad_norm": 0.1101647608081347, + "learning_rate": 1.6703510890891975e-05, + "loss": 0.8789, + "step": 26630 + }, + { + "epoch": 5.680674061433447, + "grad_norm": 0.11394570539222655, + "learning_rate": 1.6698307105549743e-05, + "loss": 0.8851, + "step": 26631 + }, + { + "epoch": 5.680887372013652, + "grad_norm": 0.12977720056189807, + "learning_rate": 1.6693104036467227e-05, + "loss": 0.8905, + "step": 26632 + }, + { + "epoch": 5.681100682593857, + "grad_norm": 0.11892649393200508, + "learning_rate": 1.6687901683703307e-05, + "loss": 0.8717, + "step": 26633 + }, + { + "epoch": 5.6813139931740615, + "grad_norm": 0.12011494404473784, + "learning_rate": 1.6682700047316824e-05, + "loss": 0.9112, + "step": 26634 + }, + { + "epoch": 5.681527303754266, + "grad_norm": 0.12441132881163874, + "learning_rate": 1.6677499127366675e-05, + "loss": 0.9062, + "step": 26635 + }, + { + "epoch": 5.681740614334471, + "grad_norm": 0.11825148058828078, + "learning_rate": 1.6672298923911708e-05, + "loss": 0.8769, + "step": 26636 + }, + { + "epoch": 5.6819539249146755, + "grad_norm": 0.10756321360635712, + "learning_rate": 1.6667099437010704e-05, + "loss": 0.883, + "step": 26637 + }, + { + "epoch": 5.68216723549488, + "grad_norm": 0.11048270638107459, + "learning_rate": 1.6661900666722557e-05, + "loss": 0.9037, + "step": 26638 + }, + { + "epoch": 5.682380546075086, + "grad_norm": 0.0963214893509337, + "learning_rate": 1.6656702613106067e-05, + "loss": 0.8808, + "step": 26639 + }, + { + "epoch": 5.6825938566552905, + "grad_norm": 0.1136516297238221, + "learning_rate": 1.6651505276220055e-05, + "loss": 0.9025, + "step": 26640 + }, + { + "epoch": 5.682807167235495, + "grad_norm": 0.09273457408318209, + "learning_rate": 1.664630865612332e-05, + "loss": 0.8585, + "step": 26641 + }, + { + "epoch": 5.6830204778157, + "grad_norm": 0.10017833312150352, + "learning_rate": 1.6641112752874652e-05, + "loss": 0.8697, + "step": 26642 + }, + { + "epoch": 5.6832337883959045, + "grad_norm": 0.12002214352361845, + "learning_rate": 1.663591756653288e-05, + "loss": 0.9034, + "step": 26643 + }, + { + "epoch": 5.683447098976109, + "grad_norm": 0.10085100772597473, + "learning_rate": 1.663072309715677e-05, + "loss": 0.9047, + "step": 26644 + }, + { + "epoch": 5.683660409556314, + "grad_norm": 0.117494044071365, + "learning_rate": 1.6625529344805088e-05, + "loss": 0.8997, + "step": 26645 + }, + { + "epoch": 5.683873720136519, + "grad_norm": 0.10950277386121228, + "learning_rate": 1.6620336309536592e-05, + "loss": 0.8868, + "step": 26646 + }, + { + "epoch": 5.684087030716723, + "grad_norm": 0.11123480814243131, + "learning_rate": 1.6615143991410096e-05, + "loss": 0.8822, + "step": 26647 + }, + { + "epoch": 5.684300341296928, + "grad_norm": 0.11693095241638755, + "learning_rate": 1.660995239048429e-05, + "loss": 0.878, + "step": 26648 + }, + { + "epoch": 5.684513651877133, + "grad_norm": 0.11088165799767445, + "learning_rate": 1.6604761506817916e-05, + "loss": 0.9183, + "step": 26649 + }, + { + "epoch": 5.684726962457338, + "grad_norm": 0.11098389022554082, + "learning_rate": 1.659957134046976e-05, + "loss": 0.8971, + "step": 26650 + }, + { + "epoch": 5.684940273037543, + "grad_norm": 0.10698875948875256, + "learning_rate": 1.659438189149851e-05, + "loss": 0.8684, + "step": 26651 + }, + { + "epoch": 5.685153583617748, + "grad_norm": 0.1111543471141405, + "learning_rate": 1.65891931599629e-05, + "loss": 0.8777, + "step": 26652 + }, + { + "epoch": 5.685366894197952, + "grad_norm": 0.10765737062565874, + "learning_rate": 1.658400514592164e-05, + "loss": 0.8726, + "step": 26653 + }, + { + "epoch": 5.685580204778157, + "grad_norm": 0.10204203361017733, + "learning_rate": 1.6578817849433402e-05, + "loss": 0.8921, + "step": 26654 + }, + { + "epoch": 5.685793515358362, + "grad_norm": 0.1063018922305769, + "learning_rate": 1.6573631270556934e-05, + "loss": 0.8943, + "step": 26655 + }, + { + "epoch": 5.686006825938566, + "grad_norm": 0.10826560547812995, + "learning_rate": 1.6568445409350883e-05, + "loss": 0.8839, + "step": 26656 + }, + { + "epoch": 5.686220136518771, + "grad_norm": 0.1006427397167866, + "learning_rate": 1.6563260265873954e-05, + "loss": 0.8805, + "step": 26657 + }, + { + "epoch": 5.686433447098976, + "grad_norm": 0.10927101749838995, + "learning_rate": 1.6558075840184792e-05, + "loss": 0.8831, + "step": 26658 + }, + { + "epoch": 5.686646757679181, + "grad_norm": 0.10928499405564657, + "learning_rate": 1.655289213234207e-05, + "loss": 0.8862, + "step": 26659 + }, + { + "epoch": 5.686860068259386, + "grad_norm": 0.10972413811761342, + "learning_rate": 1.6547709142404445e-05, + "loss": 0.8744, + "step": 26660 + }, + { + "epoch": 5.687073378839591, + "grad_norm": 0.11971829984815203, + "learning_rate": 1.6542526870430542e-05, + "loss": 0.9012, + "step": 26661 + }, + { + "epoch": 5.687286689419795, + "grad_norm": 0.12605425640295737, + "learning_rate": 1.653734531647903e-05, + "loss": 0.8765, + "step": 26662 + }, + { + "epoch": 5.6875, + "grad_norm": 0.10905726299252438, + "learning_rate": 1.6532164480608523e-05, + "loss": 0.8773, + "step": 26663 + }, + { + "epoch": 5.687713310580205, + "grad_norm": 0.10801133314640719, + "learning_rate": 1.6526984362877635e-05, + "loss": 0.9193, + "step": 26664 + }, + { + "epoch": 5.687926621160409, + "grad_norm": 0.09948090850074039, + "learning_rate": 1.6521804963344993e-05, + "loss": 0.8928, + "step": 26665 + }, + { + "epoch": 5.688139931740614, + "grad_norm": 0.09867214782732252, + "learning_rate": 1.6516626282069176e-05, + "loss": 0.8953, + "step": 26666 + }, + { + "epoch": 5.688353242320819, + "grad_norm": 0.12569962553462288, + "learning_rate": 1.651144831910882e-05, + "loss": 0.9082, + "step": 26667 + }, + { + "epoch": 5.688566552901024, + "grad_norm": 0.11289255792404411, + "learning_rate": 1.65062710745225e-05, + "loss": 0.8678, + "step": 26668 + }, + { + "epoch": 5.688779863481229, + "grad_norm": 0.1302232663927287, + "learning_rate": 1.650109454836878e-05, + "loss": 0.9162, + "step": 26669 + }, + { + "epoch": 5.688993174061434, + "grad_norm": 0.10764920698460553, + "learning_rate": 1.6495918740706248e-05, + "loss": 0.8931, + "step": 26670 + }, + { + "epoch": 5.689206484641638, + "grad_norm": 0.1090172040133924, + "learning_rate": 1.649074365159347e-05, + "loss": 0.8718, + "step": 26671 + }, + { + "epoch": 5.689419795221843, + "grad_norm": 0.11049575598315517, + "learning_rate": 1.6485569281088994e-05, + "loss": 0.8769, + "step": 26672 + }, + { + "epoch": 5.689633105802048, + "grad_norm": 0.09964368122592096, + "learning_rate": 1.648039562925134e-05, + "loss": 0.9066, + "step": 26673 + }, + { + "epoch": 5.689846416382252, + "grad_norm": 0.11070787257198214, + "learning_rate": 1.6475222696139104e-05, + "loss": 0.9156, + "step": 26674 + }, + { + "epoch": 5.690059726962457, + "grad_norm": 0.1043342609832954, + "learning_rate": 1.647005048181079e-05, + "loss": 0.89, + "step": 26675 + }, + { + "epoch": 5.690273037542662, + "grad_norm": 0.10898850681376136, + "learning_rate": 1.6464878986324926e-05, + "loss": 0.9063, + "step": 26676 + }, + { + "epoch": 5.690486348122867, + "grad_norm": 0.11004254542591431, + "learning_rate": 1.6459708209740028e-05, + "loss": 0.8653, + "step": 26677 + }, + { + "epoch": 5.690699658703072, + "grad_norm": 0.1069471725820836, + "learning_rate": 1.6454538152114575e-05, + "loss": 0.8834, + "step": 26678 + }, + { + "epoch": 5.690912969283277, + "grad_norm": 0.1061355818861039, + "learning_rate": 1.644936881350711e-05, + "loss": 0.8919, + "step": 26679 + }, + { + "epoch": 5.691126279863481, + "grad_norm": 0.11536973761565139, + "learning_rate": 1.6444200193976136e-05, + "loss": 0.8933, + "step": 26680 + }, + { + "epoch": 5.691339590443686, + "grad_norm": 0.10158502996464176, + "learning_rate": 1.6439032293580055e-05, + "loss": 0.8782, + "step": 26681 + }, + { + "epoch": 5.691552901023891, + "grad_norm": 0.11139330466593855, + "learning_rate": 1.6433865112377416e-05, + "loss": 0.8954, + "step": 26682 + }, + { + "epoch": 5.6917662116040955, + "grad_norm": 0.11336915636513158, + "learning_rate": 1.642869865042666e-05, + "loss": 0.9055, + "step": 26683 + }, + { + "epoch": 5.6919795221843, + "grad_norm": 0.11504795400052173, + "learning_rate": 1.6423532907786254e-05, + "loss": 0.8743, + "step": 26684 + }, + { + "epoch": 5.692192832764505, + "grad_norm": 0.10455892161658209, + "learning_rate": 1.641836788451462e-05, + "loss": 0.9019, + "step": 26685 + }, + { + "epoch": 5.6924061433447095, + "grad_norm": 0.10317873320610081, + "learning_rate": 1.6413203580670236e-05, + "loss": 0.8686, + "step": 26686 + }, + { + "epoch": 5.692619453924914, + "grad_norm": 0.1251493359011284, + "learning_rate": 1.640803999631153e-05, + "loss": 0.8817, + "step": 26687 + }, + { + "epoch": 5.69283276450512, + "grad_norm": 0.10631349918681753, + "learning_rate": 1.640287713149692e-05, + "loss": 0.8818, + "step": 26688 + }, + { + "epoch": 5.6930460750853245, + "grad_norm": 0.11411119830213003, + "learning_rate": 1.639771498628483e-05, + "loss": 0.8851, + "step": 26689 + }, + { + "epoch": 5.693259385665529, + "grad_norm": 0.11968380016620123, + "learning_rate": 1.639255356073364e-05, + "loss": 0.9307, + "step": 26690 + }, + { + "epoch": 5.693472696245734, + "grad_norm": 0.10317300662066274, + "learning_rate": 1.6387392854901795e-05, + "loss": 0.9178, + "step": 26691 + }, + { + "epoch": 5.6936860068259385, + "grad_norm": 0.11198786890377375, + "learning_rate": 1.6382232868847693e-05, + "loss": 0.8827, + "step": 26692 + }, + { + "epoch": 5.693899317406143, + "grad_norm": 0.10835966309195352, + "learning_rate": 1.6377073602629653e-05, + "loss": 0.921, + "step": 26693 + }, + { + "epoch": 5.694112627986348, + "grad_norm": 0.09843757483060851, + "learning_rate": 1.6371915056306118e-05, + "loss": 0.8691, + "step": 26694 + }, + { + "epoch": 5.694325938566553, + "grad_norm": 0.12008679292836653, + "learning_rate": 1.6366757229935438e-05, + "loss": 0.8761, + "step": 26695 + }, + { + "epoch": 5.694539249146757, + "grad_norm": 0.10192649568349747, + "learning_rate": 1.6361600123575964e-05, + "loss": 0.88, + "step": 26696 + }, + { + "epoch": 5.694752559726963, + "grad_norm": 0.11300289721230385, + "learning_rate": 1.6356443737286028e-05, + "loss": 0.8995, + "step": 26697 + }, + { + "epoch": 5.6949658703071675, + "grad_norm": 0.11235479349553756, + "learning_rate": 1.6351288071124036e-05, + "loss": 0.8832, + "step": 26698 + }, + { + "epoch": 5.695179180887372, + "grad_norm": 0.09926282162717079, + "learning_rate": 1.6346133125148284e-05, + "loss": 0.8414, + "step": 26699 + }, + { + "epoch": 5.695392491467577, + "grad_norm": 0.12804155690975488, + "learning_rate": 1.6340978899417105e-05, + "loss": 0.8848, + "step": 26700 + }, + { + "epoch": 5.6956058020477816, + "grad_norm": 0.09750167052666467, + "learning_rate": 1.6335825393988826e-05, + "loss": 0.873, + "step": 26701 + }, + { + "epoch": 5.695819112627986, + "grad_norm": 0.130605109071788, + "learning_rate": 1.6330672608921726e-05, + "loss": 0.8661, + "step": 26702 + }, + { + "epoch": 5.696032423208191, + "grad_norm": 0.11947772461187439, + "learning_rate": 1.632552054427418e-05, + "loss": 0.8592, + "step": 26703 + }, + { + "epoch": 5.696245733788396, + "grad_norm": 0.10756016252512107, + "learning_rate": 1.6320369200104424e-05, + "loss": 0.8929, + "step": 26704 + }, + { + "epoch": 5.6964590443686, + "grad_norm": 0.12229876486810044, + "learning_rate": 1.6315218576470737e-05, + "loss": 0.8927, + "step": 26705 + }, + { + "epoch": 5.696672354948806, + "grad_norm": 0.11548156638128088, + "learning_rate": 1.631006867343144e-05, + "loss": 0.8905, + "step": 26706 + }, + { + "epoch": 5.6968856655290105, + "grad_norm": 0.10511606455577324, + "learning_rate": 1.630491949104479e-05, + "loss": 0.8635, + "step": 26707 + }, + { + "epoch": 5.697098976109215, + "grad_norm": 0.11720060004838061, + "learning_rate": 1.6299771029369048e-05, + "loss": 0.8873, + "step": 26708 + }, + { + "epoch": 5.69731228668942, + "grad_norm": 0.12221460779082308, + "learning_rate": 1.6294623288462448e-05, + "loss": 0.8744, + "step": 26709 + }, + { + "epoch": 5.697525597269625, + "grad_norm": 0.09987490639669022, + "learning_rate": 1.6289476268383282e-05, + "loss": 0.8825, + "step": 26710 + }, + { + "epoch": 5.697738907849829, + "grad_norm": 0.11033371654579292, + "learning_rate": 1.628432996918976e-05, + "loss": 0.8434, + "step": 26711 + }, + { + "epoch": 5.697952218430034, + "grad_norm": 0.1152621499129717, + "learning_rate": 1.6279184390940113e-05, + "loss": 0.9078, + "step": 26712 + }, + { + "epoch": 5.698165529010239, + "grad_norm": 0.10564946255987502, + "learning_rate": 1.6274039533692567e-05, + "loss": 0.8936, + "step": 26713 + }, + { + "epoch": 5.698378839590443, + "grad_norm": 0.10947297936440131, + "learning_rate": 1.6268895397505334e-05, + "loss": 0.9057, + "step": 26714 + }, + { + "epoch": 5.698592150170649, + "grad_norm": 0.10149147183954563, + "learning_rate": 1.626375198243663e-05, + "loss": 0.9091, + "step": 26715 + }, + { + "epoch": 5.698805460750854, + "grad_norm": 0.11043011708669978, + "learning_rate": 1.625860928854464e-05, + "loss": 0.8715, + "step": 26716 + }, + { + "epoch": 5.699018771331058, + "grad_norm": 0.09856657935240794, + "learning_rate": 1.6253467315887537e-05, + "loss": 0.914, + "step": 26717 + }, + { + "epoch": 5.699232081911263, + "grad_norm": 0.10732771535497508, + "learning_rate": 1.6248326064523537e-05, + "loss": 0.8736, + "step": 26718 + }, + { + "epoch": 5.699445392491468, + "grad_norm": 0.10268646405802391, + "learning_rate": 1.6243185534510813e-05, + "loss": 0.8812, + "step": 26719 + }, + { + "epoch": 5.699658703071672, + "grad_norm": 0.10475275589432097, + "learning_rate": 1.6238045725907506e-05, + "loss": 0.9197, + "step": 26720 + }, + { + "epoch": 5.699872013651877, + "grad_norm": 0.10634271123519272, + "learning_rate": 1.6232906638771772e-05, + "loss": 0.883, + "step": 26721 + }, + { + "epoch": 5.700085324232082, + "grad_norm": 0.10850010110023607, + "learning_rate": 1.6227768273161793e-05, + "loss": 0.9003, + "step": 26722 + }, + { + "epoch": 5.700298634812286, + "grad_norm": 0.11647874513233017, + "learning_rate": 1.622263062913568e-05, + "loss": 0.9058, + "step": 26723 + }, + { + "epoch": 5.700511945392492, + "grad_norm": 0.1015201307667616, + "learning_rate": 1.6217493706751578e-05, + "loss": 0.8797, + "step": 26724 + }, + { + "epoch": 5.700725255972696, + "grad_norm": 0.10969307939408059, + "learning_rate": 1.6212357506067613e-05, + "loss": 0.8731, + "step": 26725 + }, + { + "epoch": 5.700938566552901, + "grad_norm": 0.10842553374375673, + "learning_rate": 1.6207222027141893e-05, + "loss": 0.875, + "step": 26726 + }, + { + "epoch": 5.701151877133106, + "grad_norm": 0.09630073475540046, + "learning_rate": 1.620208727003252e-05, + "loss": 0.8811, + "step": 26727 + }, + { + "epoch": 5.701365187713311, + "grad_norm": 0.09964816858413922, + "learning_rate": 1.6196953234797613e-05, + "loss": 0.9147, + "step": 26728 + }, + { + "epoch": 5.701578498293515, + "grad_norm": 0.10108111164109684, + "learning_rate": 1.619181992149523e-05, + "loss": 0.8909, + "step": 26729 + }, + { + "epoch": 5.70179180887372, + "grad_norm": 0.10315523272418946, + "learning_rate": 1.61866873301835e-05, + "loss": 0.8949, + "step": 26730 + }, + { + "epoch": 5.702005119453925, + "grad_norm": 0.10018087702601988, + "learning_rate": 1.618155546092047e-05, + "loss": 0.8931, + "step": 26731 + }, + { + "epoch": 5.702218430034129, + "grad_norm": 0.11057754233752252, + "learning_rate": 1.6176424313764218e-05, + "loss": 0.8944, + "step": 26732 + }, + { + "epoch": 5.702431740614334, + "grad_norm": 0.10944134611947559, + "learning_rate": 1.6171293888772776e-05, + "loss": 0.8871, + "step": 26733 + }, + { + "epoch": 5.702645051194539, + "grad_norm": 0.10388491544250304, + "learning_rate": 1.6166164186004243e-05, + "loss": 0.8719, + "step": 26734 + }, + { + "epoch": 5.702858361774744, + "grad_norm": 0.10745456923023261, + "learning_rate": 1.616103520551663e-05, + "loss": 0.8792, + "step": 26735 + }, + { + "epoch": 5.703071672354949, + "grad_norm": 0.11149771873748004, + "learning_rate": 1.6155906947368e-05, + "loss": 0.8751, + "step": 26736 + }, + { + "epoch": 5.703284982935154, + "grad_norm": 0.10068425525498247, + "learning_rate": 1.615077941161632e-05, + "loss": 0.9378, + "step": 26737 + }, + { + "epoch": 5.703498293515358, + "grad_norm": 0.1083629477065371, + "learning_rate": 1.6145652598319658e-05, + "loss": 0.9036, + "step": 26738 + }, + { + "epoch": 5.703711604095563, + "grad_norm": 0.10653896622462126, + "learning_rate": 1.6140526507536013e-05, + "loss": 0.8822, + "step": 26739 + }, + { + "epoch": 5.703924914675768, + "grad_norm": 0.12827501028270258, + "learning_rate": 1.613540113932339e-05, + "loss": 0.8839, + "step": 26740 + }, + { + "epoch": 5.7041382252559725, + "grad_norm": 0.11699410200952912, + "learning_rate": 1.613027649373975e-05, + "loss": 0.9279, + "step": 26741 + }, + { + "epoch": 5.704351535836177, + "grad_norm": 0.1078046709611983, + "learning_rate": 1.6125152570843125e-05, + "loss": 0.9002, + "step": 26742 + }, + { + "epoch": 5.704564846416382, + "grad_norm": 0.10653271498481885, + "learning_rate": 1.612002937069148e-05, + "loss": 0.9166, + "step": 26743 + }, + { + "epoch": 5.704778156996587, + "grad_norm": 0.10370221118980655, + "learning_rate": 1.6114906893342766e-05, + "loss": 0.8817, + "step": 26744 + }, + { + "epoch": 5.704991467576792, + "grad_norm": 0.12362518314270933, + "learning_rate": 1.6109785138854946e-05, + "loss": 0.8783, + "step": 26745 + }, + { + "epoch": 5.705204778156997, + "grad_norm": 0.10744327342318584, + "learning_rate": 1.6104664107286003e-05, + "loss": 0.8595, + "step": 26746 + }, + { + "epoch": 5.7054180887372015, + "grad_norm": 0.131320358207827, + "learning_rate": 1.6099543798693876e-05, + "loss": 0.8988, + "step": 26747 + }, + { + "epoch": 5.705631399317406, + "grad_norm": 0.10699310957120194, + "learning_rate": 1.6094424213136466e-05, + "loss": 0.8598, + "step": 26748 + }, + { + "epoch": 5.705844709897611, + "grad_norm": 0.1128888287460762, + "learning_rate": 1.6089305350671703e-05, + "loss": 0.9078, + "step": 26749 + }, + { + "epoch": 5.7060580204778155, + "grad_norm": 0.10782513164529914, + "learning_rate": 1.608418721135754e-05, + "loss": 0.8728, + "step": 26750 + }, + { + "epoch": 5.70627133105802, + "grad_norm": 0.1032656550443067, + "learning_rate": 1.607906979525188e-05, + "loss": 0.914, + "step": 26751 + }, + { + "epoch": 5.706484641638225, + "grad_norm": 0.100692079467142, + "learning_rate": 1.6073953102412616e-05, + "loss": 0.8892, + "step": 26752 + }, + { + "epoch": 5.7066979522184305, + "grad_norm": 0.10054832163120007, + "learning_rate": 1.6068837132897622e-05, + "loss": 0.8723, + "step": 26753 + }, + { + "epoch": 5.706911262798635, + "grad_norm": 0.10294992230735431, + "learning_rate": 1.6063721886764838e-05, + "loss": 0.8611, + "step": 26754 + }, + { + "epoch": 5.70712457337884, + "grad_norm": 0.12307157286693249, + "learning_rate": 1.6058607364072116e-05, + "loss": 0.9348, + "step": 26755 + }, + { + "epoch": 5.7073378839590445, + "grad_norm": 0.1010671280652817, + "learning_rate": 1.605349356487733e-05, + "loss": 0.8858, + "step": 26756 + }, + { + "epoch": 5.707551194539249, + "grad_norm": 0.10284653443177826, + "learning_rate": 1.604838048923834e-05, + "loss": 0.8877, + "step": 26757 + }, + { + "epoch": 5.707764505119454, + "grad_norm": 0.2808814734683852, + "learning_rate": 1.604326813721298e-05, + "loss": 0.897, + "step": 26758 + }, + { + "epoch": 5.707977815699659, + "grad_norm": 0.10702149768362389, + "learning_rate": 1.603815650885915e-05, + "loss": 0.9065, + "step": 26759 + }, + { + "epoch": 5.708191126279863, + "grad_norm": 0.10104438557579208, + "learning_rate": 1.6033045604234646e-05, + "loss": 0.8839, + "step": 26760 + }, + { + "epoch": 5.708404436860068, + "grad_norm": 0.110859857667406, + "learning_rate": 1.602793542339728e-05, + "loss": 0.8828, + "step": 26761 + }, + { + "epoch": 5.7086177474402735, + "grad_norm": 0.10827465610649133, + "learning_rate": 1.6022825966404914e-05, + "loss": 0.8971, + "step": 26762 + }, + { + "epoch": 5.708831058020478, + "grad_norm": 0.11692626869743496, + "learning_rate": 1.6017717233315365e-05, + "loss": 0.8846, + "step": 26763 + }, + { + "epoch": 5.709044368600683, + "grad_norm": 0.11416727063896764, + "learning_rate": 1.601260922418641e-05, + "loss": 0.8374, + "step": 26764 + }, + { + "epoch": 5.709257679180888, + "grad_norm": 0.10430596673907845, + "learning_rate": 1.600750193907584e-05, + "loss": 0.8916, + "step": 26765 + }, + { + "epoch": 5.709470989761092, + "grad_norm": 0.12175243435754264, + "learning_rate": 1.600239537804148e-05, + "loss": 0.8913, + "step": 26766 + }, + { + "epoch": 5.709684300341297, + "grad_norm": 0.10304738415830589, + "learning_rate": 1.5997289541141093e-05, + "loss": 0.875, + "step": 26767 + }, + { + "epoch": 5.709897610921502, + "grad_norm": 0.1073618874859402, + "learning_rate": 1.5992184428432455e-05, + "loss": 0.9259, + "step": 26768 + }, + { + "epoch": 5.710110921501706, + "grad_norm": 0.13307804816115257, + "learning_rate": 1.5987080039973325e-05, + "loss": 0.9062, + "step": 26769 + }, + { + "epoch": 5.710324232081911, + "grad_norm": 0.09877116133928418, + "learning_rate": 1.5981976375821463e-05, + "loss": 0.885, + "step": 26770 + }, + { + "epoch": 5.710537542662116, + "grad_norm": 0.11602673883118444, + "learning_rate": 1.597687343603461e-05, + "loss": 0.8622, + "step": 26771 + }, + { + "epoch": 5.71075085324232, + "grad_norm": 0.11466220208673311, + "learning_rate": 1.597177122067052e-05, + "loss": 0.865, + "step": 26772 + }, + { + "epoch": 5.710964163822526, + "grad_norm": 0.11286403502051602, + "learning_rate": 1.5966669729786887e-05, + "loss": 0.8814, + "step": 26773 + }, + { + "epoch": 5.711177474402731, + "grad_norm": 0.11728285052691521, + "learning_rate": 1.596156896344149e-05, + "loss": 0.882, + "step": 26774 + }, + { + "epoch": 5.711390784982935, + "grad_norm": 0.10368796466896645, + "learning_rate": 1.595646892169201e-05, + "loss": 0.897, + "step": 26775 + }, + { + "epoch": 5.71160409556314, + "grad_norm": 0.12032176468600846, + "learning_rate": 1.595136960459616e-05, + "loss": 0.8786, + "step": 26776 + }, + { + "epoch": 5.711817406143345, + "grad_norm": 0.11723636647426434, + "learning_rate": 1.594627101221163e-05, + "loss": 0.8815, + "step": 26777 + }, + { + "epoch": 5.712030716723549, + "grad_norm": 0.12462215469516406, + "learning_rate": 1.594117314459614e-05, + "loss": 0.8569, + "step": 26778 + }, + { + "epoch": 5.712244027303754, + "grad_norm": 0.10993546170817046, + "learning_rate": 1.593607600180734e-05, + "loss": 0.8948, + "step": 26779 + }, + { + "epoch": 5.712457337883959, + "grad_norm": 0.136522900202896, + "learning_rate": 1.5930979583902928e-05, + "loss": 0.8876, + "step": 26780 + }, + { + "epoch": 5.712670648464163, + "grad_norm": 0.11095109270851276, + "learning_rate": 1.592588389094056e-05, + "loss": 0.8918, + "step": 26781 + }, + { + "epoch": 5.712883959044369, + "grad_norm": 0.12073626173631814, + "learning_rate": 1.5920788922977893e-05, + "loss": 0.8639, + "step": 26782 + }, + { + "epoch": 5.713097269624574, + "grad_norm": 0.12278366671183197, + "learning_rate": 1.5915694680072576e-05, + "loss": 0.8797, + "step": 26783 + }, + { + "epoch": 5.713310580204778, + "grad_norm": 0.11806750175218489, + "learning_rate": 1.5910601162282252e-05, + "loss": 0.89, + "step": 26784 + }, + { + "epoch": 5.713523890784983, + "grad_norm": 0.11512393556875235, + "learning_rate": 1.5905508369664532e-05, + "loss": 0.887, + "step": 26785 + }, + { + "epoch": 5.713737201365188, + "grad_norm": 0.12029610838910053, + "learning_rate": 1.5900416302277085e-05, + "loss": 0.9012, + "step": 26786 + }, + { + "epoch": 5.713950511945392, + "grad_norm": 0.13238024781614907, + "learning_rate": 1.5895324960177507e-05, + "loss": 0.888, + "step": 26787 + }, + { + "epoch": 5.714163822525597, + "grad_norm": 0.11954152771334596, + "learning_rate": 1.5890234343423402e-05, + "loss": 0.8697, + "step": 26788 + }, + { + "epoch": 5.714377133105802, + "grad_norm": 0.11600587812371918, + "learning_rate": 1.5885144452072355e-05, + "loss": 0.8879, + "step": 26789 + }, + { + "epoch": 5.714590443686006, + "grad_norm": 0.10470474381807081, + "learning_rate": 1.5880055286181998e-05, + "loss": 0.8783, + "step": 26790 + }, + { + "epoch": 5.714803754266212, + "grad_norm": 0.11426406110261249, + "learning_rate": 1.5874966845809893e-05, + "loss": 0.8729, + "step": 26791 + }, + { + "epoch": 5.715017064846417, + "grad_norm": 0.114491042579915, + "learning_rate": 1.586987913101364e-05, + "loss": 0.8927, + "step": 26792 + }, + { + "epoch": 5.715230375426621, + "grad_norm": 0.10414289341256898, + "learning_rate": 1.586479214185074e-05, + "loss": 0.872, + "step": 26793 + }, + { + "epoch": 5.715443686006826, + "grad_norm": 0.09749822293672707, + "learning_rate": 1.585970587837881e-05, + "loss": 0.9057, + "step": 26794 + }, + { + "epoch": 5.715656996587031, + "grad_norm": 0.1251753624353697, + "learning_rate": 1.5854620340655395e-05, + "loss": 0.888, + "step": 26795 + }, + { + "epoch": 5.715870307167235, + "grad_norm": 0.10547138138119863, + "learning_rate": 1.5849535528738034e-05, + "loss": 0.882, + "step": 26796 + }, + { + "epoch": 5.71608361774744, + "grad_norm": 0.10780521140603239, + "learning_rate": 1.584445144268423e-05, + "loss": 0.9205, + "step": 26797 + }, + { + "epoch": 5.716296928327645, + "grad_norm": 0.10890383210773034, + "learning_rate": 1.583936808255156e-05, + "loss": 0.9005, + "step": 26798 + }, + { + "epoch": 5.7165102389078495, + "grad_norm": 0.1083262075807314, + "learning_rate": 1.583428544839752e-05, + "loss": 0.9375, + "step": 26799 + }, + { + "epoch": 5.716723549488055, + "grad_norm": 0.10106129569613405, + "learning_rate": 1.5829203540279615e-05, + "loss": 0.8636, + "step": 26800 + }, + { + "epoch": 5.71693686006826, + "grad_norm": 0.10760752677122283, + "learning_rate": 1.5824122358255333e-05, + "loss": 0.9054, + "step": 26801 + }, + { + "epoch": 5.717150170648464, + "grad_norm": 0.10520048644744548, + "learning_rate": 1.5819041902382208e-05, + "loss": 0.871, + "step": 26802 + }, + { + "epoch": 5.717363481228669, + "grad_norm": 0.10463464349580447, + "learning_rate": 1.581396217271772e-05, + "loss": 0.894, + "step": 26803 + }, + { + "epoch": 5.717576791808874, + "grad_norm": 0.11311042483671162, + "learning_rate": 1.5808883169319305e-05, + "loss": 0.9044, + "step": 26804 + }, + { + "epoch": 5.7177901023890785, + "grad_norm": 0.10524121945124051, + "learning_rate": 1.580380489224445e-05, + "loss": 0.9086, + "step": 26805 + }, + { + "epoch": 5.718003412969283, + "grad_norm": 0.1125750279963255, + "learning_rate": 1.579872734155063e-05, + "loss": 0.9211, + "step": 26806 + }, + { + "epoch": 5.718216723549488, + "grad_norm": 0.1123766454132586, + "learning_rate": 1.5793650517295294e-05, + "loss": 0.906, + "step": 26807 + }, + { + "epoch": 5.7184300341296925, + "grad_norm": 0.09993627976442412, + "learning_rate": 1.578857441953589e-05, + "loss": 0.8869, + "step": 26808 + }, + { + "epoch": 5.718643344709898, + "grad_norm": 0.11008779734444273, + "learning_rate": 1.5783499048329812e-05, + "loss": 0.8821, + "step": 26809 + }, + { + "epoch": 5.718856655290102, + "grad_norm": 0.11912681133809909, + "learning_rate": 1.5778424403734555e-05, + "loss": 0.8804, + "step": 26810 + }, + { + "epoch": 5.7190699658703075, + "grad_norm": 0.09631638488243167, + "learning_rate": 1.57733504858075e-05, + "loss": 0.8991, + "step": 26811 + }, + { + "epoch": 5.719283276450512, + "grad_norm": 0.10888327518390328, + "learning_rate": 1.5768277294606075e-05, + "loss": 0.8798, + "step": 26812 + }, + { + "epoch": 5.719496587030717, + "grad_norm": 0.1099523174044507, + "learning_rate": 1.5763204830187636e-05, + "loss": 0.8797, + "step": 26813 + }, + { + "epoch": 5.7197098976109215, + "grad_norm": 0.1026517230263105, + "learning_rate": 1.5758133092609647e-05, + "loss": 0.9173, + "step": 26814 + }, + { + "epoch": 5.719923208191126, + "grad_norm": 0.10591299740792326, + "learning_rate": 1.5753062081929478e-05, + "loss": 0.8852, + "step": 26815 + }, + { + "epoch": 5.720136518771331, + "grad_norm": 0.11846934822981454, + "learning_rate": 1.574799179820447e-05, + "loss": 0.8725, + "step": 26816 + }, + { + "epoch": 5.720349829351536, + "grad_norm": 0.09938908476282536, + "learning_rate": 1.5742922241492005e-05, + "loss": 0.9033, + "step": 26817 + }, + { + "epoch": 5.72056313993174, + "grad_norm": 0.10906514295937102, + "learning_rate": 1.573785341184946e-05, + "loss": 0.8906, + "step": 26818 + }, + { + "epoch": 5.720776450511945, + "grad_norm": 0.10337869758465149, + "learning_rate": 1.57327853093342e-05, + "loss": 0.8797, + "step": 26819 + }, + { + "epoch": 5.7209897610921505, + "grad_norm": 0.09755356714529494, + "learning_rate": 1.5727717934003548e-05, + "loss": 0.9328, + "step": 26820 + }, + { + "epoch": 5.721203071672355, + "grad_norm": 0.10261622642111506, + "learning_rate": 1.5722651285914833e-05, + "loss": 0.905, + "step": 26821 + }, + { + "epoch": 5.72141638225256, + "grad_norm": 0.10496527792178763, + "learning_rate": 1.5717585365125413e-05, + "loss": 0.9113, + "step": 26822 + }, + { + "epoch": 5.721629692832765, + "grad_norm": 0.09610213239171389, + "learning_rate": 1.5712520171692603e-05, + "loss": 0.8637, + "step": 26823 + }, + { + "epoch": 5.721843003412969, + "grad_norm": 0.09555964341410592, + "learning_rate": 1.57074557056737e-05, + "loss": 0.8916, + "step": 26824 + }, + { + "epoch": 5.722056313993174, + "grad_norm": 0.10395815882689398, + "learning_rate": 1.5702391967126e-05, + "loss": 0.8792, + "step": 26825 + }, + { + "epoch": 5.722269624573379, + "grad_norm": 0.09798323879972887, + "learning_rate": 1.5697328956106863e-05, + "loss": 0.929, + "step": 26826 + }, + { + "epoch": 5.722482935153583, + "grad_norm": 0.12165776065630265, + "learning_rate": 1.5692266672673505e-05, + "loss": 0.8989, + "step": 26827 + }, + { + "epoch": 5.722696245733788, + "grad_norm": 0.11856107621445426, + "learning_rate": 1.568720511688323e-05, + "loss": 0.8727, + "step": 26828 + }, + { + "epoch": 5.722909556313994, + "grad_norm": 0.11272466482710215, + "learning_rate": 1.56821442887933e-05, + "loss": 0.9151, + "step": 26829 + }, + { + "epoch": 5.723122866894198, + "grad_norm": 0.12135982539670591, + "learning_rate": 1.5677084188460998e-05, + "loss": 0.8805, + "step": 26830 + }, + { + "epoch": 5.723336177474403, + "grad_norm": 0.10408332000174908, + "learning_rate": 1.5672024815943573e-05, + "loss": 0.8547, + "step": 26831 + }, + { + "epoch": 5.723549488054608, + "grad_norm": 0.13331524816215284, + "learning_rate": 1.566696617129827e-05, + "loss": 0.8975, + "step": 26832 + }, + { + "epoch": 5.723762798634812, + "grad_norm": 0.0992943495504011, + "learning_rate": 1.5661908254582307e-05, + "loss": 0.8929, + "step": 26833 + }, + { + "epoch": 5.723976109215017, + "grad_norm": 0.11428577615113528, + "learning_rate": 1.5656851065852954e-05, + "loss": 0.9063, + "step": 26834 + }, + { + "epoch": 5.724189419795222, + "grad_norm": 0.10388440281724344, + "learning_rate": 1.5651794605167417e-05, + "loss": 0.9014, + "step": 26835 + }, + { + "epoch": 5.724402730375426, + "grad_norm": 0.1244708911620501, + "learning_rate": 1.56467388725829e-05, + "loss": 0.9, + "step": 26836 + }, + { + "epoch": 5.724616040955631, + "grad_norm": 0.1160922127694245, + "learning_rate": 1.5641683868156617e-05, + "loss": 0.9002, + "step": 26837 + }, + { + "epoch": 5.724829351535837, + "grad_norm": 0.10640030679816283, + "learning_rate": 1.563662959194577e-05, + "loss": 0.8767, + "step": 26838 + }, + { + "epoch": 5.725042662116041, + "grad_norm": 0.1252651917181581, + "learning_rate": 1.5631576044007538e-05, + "loss": 0.8745, + "step": 26839 + }, + { + "epoch": 5.725255972696246, + "grad_norm": 0.10813544222321801, + "learning_rate": 1.5626523224399102e-05, + "loss": 0.8479, + "step": 26840 + }, + { + "epoch": 5.725469283276451, + "grad_norm": 0.125486728121006, + "learning_rate": 1.5621471133177618e-05, + "loss": 0.8905, + "step": 26841 + }, + { + "epoch": 5.725682593856655, + "grad_norm": 0.10590381404610827, + "learning_rate": 1.56164197704003e-05, + "loss": 0.9077, + "step": 26842 + }, + { + "epoch": 5.72589590443686, + "grad_norm": 0.11587704958446189, + "learning_rate": 1.561136913612427e-05, + "loss": 0.882, + "step": 26843 + }, + { + "epoch": 5.726109215017065, + "grad_norm": 0.10570354297042907, + "learning_rate": 1.560631923040668e-05, + "loss": 0.9166, + "step": 26844 + }, + { + "epoch": 5.726322525597269, + "grad_norm": 0.1134197358441658, + "learning_rate": 1.5601270053304655e-05, + "loss": 0.8869, + "step": 26845 + }, + { + "epoch": 5.726535836177474, + "grad_norm": 0.10525276216268727, + "learning_rate": 1.5596221604875354e-05, + "loss": 0.8876, + "step": 26846 + }, + { + "epoch": 5.72674914675768, + "grad_norm": 0.12646536456265822, + "learning_rate": 1.5591173885175895e-05, + "loss": 0.8507, + "step": 26847 + }, + { + "epoch": 5.726962457337884, + "grad_norm": 0.10356175283965745, + "learning_rate": 1.5586126894263383e-05, + "loss": 0.8972, + "step": 26848 + }, + { + "epoch": 5.727175767918089, + "grad_norm": 0.12610275840329513, + "learning_rate": 1.5581080632194937e-05, + "loss": 0.8948, + "step": 26849 + }, + { + "epoch": 5.727389078498294, + "grad_norm": 0.1259998744221279, + "learning_rate": 1.557603509902764e-05, + "loss": 0.874, + "step": 26850 + }, + { + "epoch": 5.727602389078498, + "grad_norm": 0.09599007923270782, + "learning_rate": 1.55709902948186e-05, + "loss": 0.879, + "step": 26851 + }, + { + "epoch": 5.727815699658703, + "grad_norm": 0.12659450593645716, + "learning_rate": 1.5565946219624874e-05, + "loss": 0.8577, + "step": 26852 + }, + { + "epoch": 5.728029010238908, + "grad_norm": 0.11177836646960079, + "learning_rate": 1.5560902873503537e-05, + "loss": 0.8538, + "step": 26853 + }, + { + "epoch": 5.728242320819112, + "grad_norm": 0.10245894335991516, + "learning_rate": 1.5555860256511682e-05, + "loss": 0.8971, + "step": 26854 + }, + { + "epoch": 5.728455631399317, + "grad_norm": 0.13918791108052442, + "learning_rate": 1.5550818368706364e-05, + "loss": 0.8778, + "step": 26855 + }, + { + "epoch": 5.728668941979522, + "grad_norm": 0.10283870861984587, + "learning_rate": 1.554577721014462e-05, + "loss": 0.9074, + "step": 26856 + }, + { + "epoch": 5.7288822525597265, + "grad_norm": 0.12821892855788572, + "learning_rate": 1.5540736780883456e-05, + "loss": 0.8748, + "step": 26857 + }, + { + "epoch": 5.729095563139932, + "grad_norm": 0.12798585008570887, + "learning_rate": 1.5535697080979968e-05, + "loss": 0.8976, + "step": 26858 + }, + { + "epoch": 5.729308873720137, + "grad_norm": 0.10356812951826239, + "learning_rate": 1.553065811049117e-05, + "loss": 0.8921, + "step": 26859 + }, + { + "epoch": 5.729522184300341, + "grad_norm": 0.12808567995654893, + "learning_rate": 1.552561986947403e-05, + "loss": 0.9144, + "step": 26860 + }, + { + "epoch": 5.729735494880546, + "grad_norm": 0.10731995948095345, + "learning_rate": 1.5520582357985593e-05, + "loss": 0.8653, + "step": 26861 + }, + { + "epoch": 5.729948805460751, + "grad_norm": 0.09852416143684452, + "learning_rate": 1.5515545576082853e-05, + "loss": 0.8881, + "step": 26862 + }, + { + "epoch": 5.7301621160409555, + "grad_norm": 0.12456375172501374, + "learning_rate": 1.5510509523822808e-05, + "loss": 0.8638, + "step": 26863 + }, + { + "epoch": 5.73037542662116, + "grad_norm": 0.10365639061852243, + "learning_rate": 1.5505474201262423e-05, + "loss": 0.856, + "step": 26864 + }, + { + "epoch": 5.730588737201365, + "grad_norm": 0.09706740975982459, + "learning_rate": 1.5500439608458665e-05, + "loss": 0.8739, + "step": 26865 + }, + { + "epoch": 5.7308020477815695, + "grad_norm": 0.11720131054305354, + "learning_rate": 1.549540574546854e-05, + "loss": 0.8794, + "step": 26866 + }, + { + "epoch": 5.731015358361775, + "grad_norm": 0.11161178351865522, + "learning_rate": 1.549037261234898e-05, + "loss": 0.8964, + "step": 26867 + }, + { + "epoch": 5.73122866894198, + "grad_norm": 0.10203341565220458, + "learning_rate": 1.5485340209156943e-05, + "loss": 0.9134, + "step": 26868 + }, + { + "epoch": 5.7314419795221845, + "grad_norm": 0.11479993993024443, + "learning_rate": 1.548030853594935e-05, + "loss": 0.8822, + "step": 26869 + }, + { + "epoch": 5.731655290102389, + "grad_norm": 0.11335135538281546, + "learning_rate": 1.5475277592783165e-05, + "loss": 0.9235, + "step": 26870 + }, + { + "epoch": 5.731868600682594, + "grad_norm": 0.11324513614085073, + "learning_rate": 1.5470247379715324e-05, + "loss": 0.8908, + "step": 26871 + }, + { + "epoch": 5.7320819112627985, + "grad_norm": 0.13655608896392857, + "learning_rate": 1.5465217896802674e-05, + "loss": 0.8908, + "step": 26872 + }, + { + "epoch": 5.732295221843003, + "grad_norm": 0.10177401532705685, + "learning_rate": 1.546018914410219e-05, + "loss": 0.8868, + "step": 26873 + }, + { + "epoch": 5.732508532423208, + "grad_norm": 0.14242327481973424, + "learning_rate": 1.545516112167075e-05, + "loss": 0.8953, + "step": 26874 + }, + { + "epoch": 5.732721843003413, + "grad_norm": 0.11365749202490812, + "learning_rate": 1.5450133829565254e-05, + "loss": 0.8621, + "step": 26875 + }, + { + "epoch": 5.732935153583618, + "grad_norm": 0.1134545509656465, + "learning_rate": 1.5445107267842577e-05, + "loss": 0.8821, + "step": 26876 + }, + { + "epoch": 5.733148464163823, + "grad_norm": 0.13129959975717728, + "learning_rate": 1.5440081436559574e-05, + "loss": 0.8999, + "step": 26877 + }, + { + "epoch": 5.7333617747440275, + "grad_norm": 0.10855620163167282, + "learning_rate": 1.5435056335773158e-05, + "loss": 0.8788, + "step": 26878 + }, + { + "epoch": 5.733575085324232, + "grad_norm": 0.10704318647936051, + "learning_rate": 1.5430031965540166e-05, + "loss": 0.8902, + "step": 26879 + }, + { + "epoch": 5.733788395904437, + "grad_norm": 0.116731035677095, + "learning_rate": 1.5425008325917445e-05, + "loss": 0.8854, + "step": 26880 + }, + { + "epoch": 5.734001706484642, + "grad_norm": 0.10370910987979924, + "learning_rate": 1.5419985416961818e-05, + "loss": 0.8743, + "step": 26881 + }, + { + "epoch": 5.734215017064846, + "grad_norm": 0.10813669444242689, + "learning_rate": 1.5414963238730188e-05, + "loss": 0.8676, + "step": 26882 + }, + { + "epoch": 5.734428327645051, + "grad_norm": 0.11549347643052608, + "learning_rate": 1.5409941791279316e-05, + "loss": 0.8636, + "step": 26883 + }, + { + "epoch": 5.734641638225256, + "grad_norm": 0.10159681001669664, + "learning_rate": 1.5404921074666015e-05, + "loss": 0.8817, + "step": 26884 + }, + { + "epoch": 5.734854948805461, + "grad_norm": 0.10989044269099314, + "learning_rate": 1.539990108894714e-05, + "loss": 0.9023, + "step": 26885 + }, + { + "epoch": 5.735068259385666, + "grad_norm": 0.09748736646749052, + "learning_rate": 1.5394881834179482e-05, + "loss": 0.8756, + "step": 26886 + }, + { + "epoch": 5.735281569965871, + "grad_norm": 0.11625759542770574, + "learning_rate": 1.538986331041981e-05, + "loss": 0.8991, + "step": 26887 + }, + { + "epoch": 5.735494880546075, + "grad_norm": 0.115079033790864, + "learning_rate": 1.5384845517724923e-05, + "loss": 0.8764, + "step": 26888 + }, + { + "epoch": 5.73570819112628, + "grad_norm": 0.1292126986862092, + "learning_rate": 1.537982845615158e-05, + "loss": 0.9006, + "step": 26889 + }, + { + "epoch": 5.735921501706485, + "grad_norm": 0.11510885974991102, + "learning_rate": 1.5374812125756583e-05, + "loss": 0.9154, + "step": 26890 + }, + { + "epoch": 5.736134812286689, + "grad_norm": 0.09641724653624435, + "learning_rate": 1.5369796526596665e-05, + "loss": 0.9107, + "step": 26891 + }, + { + "epoch": 5.736348122866894, + "grad_norm": 0.10361928283033764, + "learning_rate": 1.5364781658728592e-05, + "loss": 0.907, + "step": 26892 + }, + { + "epoch": 5.736561433447099, + "grad_norm": 0.09936245492012415, + "learning_rate": 1.5359767522209112e-05, + "loss": 0.8924, + "step": 26893 + }, + { + "epoch": 5.736774744027304, + "grad_norm": 0.11759913652363353, + "learning_rate": 1.535475411709494e-05, + "loss": 0.8668, + "step": 26894 + }, + { + "epoch": 5.736988054607508, + "grad_norm": 0.10920456331315072, + "learning_rate": 1.5349741443442817e-05, + "loss": 0.8989, + "step": 26895 + }, + { + "epoch": 5.737201365187714, + "grad_norm": 0.12504551408651376, + "learning_rate": 1.534472950130943e-05, + "loss": 0.9037, + "step": 26896 + }, + { + "epoch": 5.737414675767918, + "grad_norm": 0.11106875467004444, + "learning_rate": 1.5339718290751538e-05, + "loss": 0.8781, + "step": 26897 + }, + { + "epoch": 5.737627986348123, + "grad_norm": 0.10862302337106695, + "learning_rate": 1.5334707811825823e-05, + "loss": 0.8952, + "step": 26898 + }, + { + "epoch": 5.737841296928328, + "grad_norm": 0.109168200564231, + "learning_rate": 1.5329698064588977e-05, + "loss": 0.88, + "step": 26899 + }, + { + "epoch": 5.738054607508532, + "grad_norm": 0.10955003601081711, + "learning_rate": 1.532468904909769e-05, + "loss": 0.8845, + "step": 26900 + }, + { + "epoch": 5.738267918088737, + "grad_norm": 0.11453530269724746, + "learning_rate": 1.5319680765408607e-05, + "loss": 0.8729, + "step": 26901 + }, + { + "epoch": 5.738481228668942, + "grad_norm": 0.12562893656504465, + "learning_rate": 1.5314673213578447e-05, + "loss": 0.8923, + "step": 26902 + }, + { + "epoch": 5.738694539249146, + "grad_norm": 0.10135776631736383, + "learning_rate": 1.530966639366385e-05, + "loss": 0.8826, + "step": 26903 + }, + { + "epoch": 5.738907849829351, + "grad_norm": 0.12476312534337383, + "learning_rate": 1.5304660305721465e-05, + "loss": 0.9001, + "step": 26904 + }, + { + "epoch": 5.739121160409557, + "grad_norm": 0.10172425507810982, + "learning_rate": 1.5299654949807933e-05, + "loss": 0.8935, + "step": 26905 + }, + { + "epoch": 5.739334470989761, + "grad_norm": 0.11075602600447254, + "learning_rate": 1.5294650325979896e-05, + "loss": 0.8827, + "step": 26906 + }, + { + "epoch": 5.739547781569966, + "grad_norm": 0.10896571850127405, + "learning_rate": 1.528964643429398e-05, + "loss": 0.889, + "step": 26907 + }, + { + "epoch": 5.739761092150171, + "grad_norm": 0.1036864780762033, + "learning_rate": 1.528464327480678e-05, + "loss": 0.8522, + "step": 26908 + }, + { + "epoch": 5.739974402730375, + "grad_norm": 0.11540801508782046, + "learning_rate": 1.527964084757495e-05, + "loss": 0.9076, + "step": 26909 + }, + { + "epoch": 5.74018771331058, + "grad_norm": 0.10715159347090474, + "learning_rate": 1.527463915265507e-05, + "loss": 0.8763, + "step": 26910 + }, + { + "epoch": 5.740401023890785, + "grad_norm": 0.11345482152863826, + "learning_rate": 1.5269638190103746e-05, + "loss": 0.88, + "step": 26911 + }, + { + "epoch": 5.7406143344709895, + "grad_norm": 0.10934696344650957, + "learning_rate": 1.5264637959977548e-05, + "loss": 0.8942, + "step": 26912 + }, + { + "epoch": 5.740827645051194, + "grad_norm": 0.10185540294535779, + "learning_rate": 1.525963846233304e-05, + "loss": 0.88, + "step": 26913 + }, + { + "epoch": 5.7410409556314, + "grad_norm": 0.11735077041476281, + "learning_rate": 1.5254639697226839e-05, + "loss": 0.8368, + "step": 26914 + }, + { + "epoch": 5.741254266211604, + "grad_norm": 0.10620935404589608, + "learning_rate": 1.5249641664715493e-05, + "loss": 0.8853, + "step": 26915 + }, + { + "epoch": 5.741467576791809, + "grad_norm": 0.12313233197793746, + "learning_rate": 1.5244644364855502e-05, + "loss": 0.8984, + "step": 26916 + }, + { + "epoch": 5.741680887372014, + "grad_norm": 0.11028830557630469, + "learning_rate": 1.5239647797703465e-05, + "loss": 0.9233, + "step": 26917 + }, + { + "epoch": 5.7418941979522184, + "grad_norm": 0.11621846984471275, + "learning_rate": 1.5234651963315901e-05, + "loss": 0.8808, + "step": 26918 + }, + { + "epoch": 5.742107508532423, + "grad_norm": 0.10150884950946873, + "learning_rate": 1.5229656861749349e-05, + "loss": 0.8651, + "step": 26919 + }, + { + "epoch": 5.742320819112628, + "grad_norm": 0.11353286090307076, + "learning_rate": 1.5224662493060289e-05, + "loss": 0.8664, + "step": 26920 + }, + { + "epoch": 5.7425341296928325, + "grad_norm": 0.10815805101148102, + "learning_rate": 1.521966885730529e-05, + "loss": 0.8742, + "step": 26921 + }, + { + "epoch": 5.742747440273037, + "grad_norm": 0.09978454121064503, + "learning_rate": 1.5214675954540829e-05, + "loss": 0.897, + "step": 26922 + }, + { + "epoch": 5.742960750853243, + "grad_norm": 0.10422275898492259, + "learning_rate": 1.5209683784823396e-05, + "loss": 0.8647, + "step": 26923 + }, + { + "epoch": 5.743174061433447, + "grad_norm": 0.10388718451785046, + "learning_rate": 1.520469234820948e-05, + "loss": 0.891, + "step": 26924 + }, + { + "epoch": 5.743387372013652, + "grad_norm": 0.11351047150470714, + "learning_rate": 1.519970164475554e-05, + "loss": 0.9094, + "step": 26925 + }, + { + "epoch": 5.743600682593857, + "grad_norm": 0.09748020183322828, + "learning_rate": 1.5194711674518101e-05, + "loss": 0.8799, + "step": 26926 + }, + { + "epoch": 5.7438139931740615, + "grad_norm": 0.10501905624911777, + "learning_rate": 1.518972243755358e-05, + "loss": 0.8769, + "step": 26927 + }, + { + "epoch": 5.744027303754266, + "grad_norm": 0.11101664863018196, + "learning_rate": 1.5184733933918411e-05, + "loss": 0.8352, + "step": 26928 + }, + { + "epoch": 5.744240614334471, + "grad_norm": 0.09593053420018088, + "learning_rate": 1.5179746163669094e-05, + "loss": 0.8813, + "step": 26929 + }, + { + "epoch": 5.7444539249146755, + "grad_norm": 0.10127239890895912, + "learning_rate": 1.5174759126862032e-05, + "loss": 0.8743, + "step": 26930 + }, + { + "epoch": 5.74466723549488, + "grad_norm": 0.10447595582446484, + "learning_rate": 1.5169772823553665e-05, + "loss": 0.8761, + "step": 26931 + }, + { + "epoch": 5.744880546075086, + "grad_norm": 0.0943394363634889, + "learning_rate": 1.516478725380039e-05, + "loss": 0.8657, + "step": 26932 + }, + { + "epoch": 5.7450938566552905, + "grad_norm": 0.11661534739648509, + "learning_rate": 1.515980241765866e-05, + "loss": 0.915, + "step": 26933 + }, + { + "epoch": 5.745307167235495, + "grad_norm": 0.11594166355734456, + "learning_rate": 1.5154818315184848e-05, + "loss": 0.8786, + "step": 26934 + }, + { + "epoch": 5.7455204778157, + "grad_norm": 0.10137792638645551, + "learning_rate": 1.514983494643536e-05, + "loss": 0.926, + "step": 26935 + }, + { + "epoch": 5.7457337883959045, + "grad_norm": 0.12421640196653204, + "learning_rate": 1.514485231146659e-05, + "loss": 0.8939, + "step": 26936 + }, + { + "epoch": 5.745947098976109, + "grad_norm": 0.1058569709327165, + "learning_rate": 1.5139870410334881e-05, + "loss": 0.8949, + "step": 26937 + }, + { + "epoch": 5.746160409556314, + "grad_norm": 0.10282521547848603, + "learning_rate": 1.5134889243096668e-05, + "loss": 0.8743, + "step": 26938 + }, + { + "epoch": 5.746373720136519, + "grad_norm": 0.11417246510224961, + "learning_rate": 1.5129908809808256e-05, + "loss": 0.8883, + "step": 26939 + }, + { + "epoch": 5.746587030716723, + "grad_norm": 0.111954173150103, + "learning_rate": 1.5124929110525988e-05, + "loss": 0.8585, + "step": 26940 + }, + { + "epoch": 5.746800341296928, + "grad_norm": 0.10733104070545524, + "learning_rate": 1.5119950145306272e-05, + "loss": 0.9054, + "step": 26941 + }, + { + "epoch": 5.747013651877133, + "grad_norm": 0.11967493417259283, + "learning_rate": 1.5114971914205403e-05, + "loss": 0.874, + "step": 26942 + }, + { + "epoch": 5.747226962457338, + "grad_norm": 0.12780720302493587, + "learning_rate": 1.5109994417279717e-05, + "loss": 0.8532, + "step": 26943 + }, + { + "epoch": 5.747440273037543, + "grad_norm": 0.1105442129976184, + "learning_rate": 1.5105017654585523e-05, + "loss": 0.8713, + "step": 26944 + }, + { + "epoch": 5.747653583617748, + "grad_norm": 0.14898804030272597, + "learning_rate": 1.510004162617916e-05, + "loss": 0.8775, + "step": 26945 + }, + { + "epoch": 5.747866894197952, + "grad_norm": 0.12400612500053536, + "learning_rate": 1.5095066332116922e-05, + "loss": 0.8708, + "step": 26946 + }, + { + "epoch": 5.748080204778157, + "grad_norm": 0.12097622109035483, + "learning_rate": 1.5090091772455098e-05, + "loss": 0.8715, + "step": 26947 + }, + { + "epoch": 5.748293515358362, + "grad_norm": 0.13171908600003884, + "learning_rate": 1.5085117947249974e-05, + "loss": 0.8822, + "step": 26948 + }, + { + "epoch": 5.748506825938566, + "grad_norm": 0.1102294034442173, + "learning_rate": 1.5080144856557835e-05, + "loss": 0.8709, + "step": 26949 + }, + { + "epoch": 5.748720136518771, + "grad_norm": 0.12908786604012523, + "learning_rate": 1.5075172500434954e-05, + "loss": 0.9145, + "step": 26950 + }, + { + "epoch": 5.748933447098976, + "grad_norm": 0.1075481855379522, + "learning_rate": 1.507020087893758e-05, + "loss": 0.8852, + "step": 26951 + }, + { + "epoch": 5.749146757679181, + "grad_norm": 0.11314358099057897, + "learning_rate": 1.5065229992121961e-05, + "loss": 0.9271, + "step": 26952 + }, + { + "epoch": 5.749360068259386, + "grad_norm": 0.11130499134917722, + "learning_rate": 1.5060259840044373e-05, + "loss": 0.8969, + "step": 26953 + }, + { + "epoch": 5.749573378839591, + "grad_norm": 0.11587628487723746, + "learning_rate": 1.5055290422761038e-05, + "loss": 0.8724, + "step": 26954 + }, + { + "epoch": 5.749786689419795, + "grad_norm": 0.1276043412554321, + "learning_rate": 1.5050321740328185e-05, + "loss": 0.915, + "step": 26955 + }, + { + "epoch": 5.75, + "grad_norm": 0.11320707201155741, + "learning_rate": 1.5045353792802018e-05, + "loss": 0.8852, + "step": 26956 + }, + { + "epoch": 5.750213310580205, + "grad_norm": 0.11185294489008381, + "learning_rate": 1.5040386580238778e-05, + "loss": 0.8958, + "step": 26957 + }, + { + "epoch": 5.750426621160409, + "grad_norm": 0.12731302095771993, + "learning_rate": 1.5035420102694657e-05, + "loss": 0.8935, + "step": 26958 + }, + { + "epoch": 5.750639931740614, + "grad_norm": 0.10180016534460733, + "learning_rate": 1.5030454360225855e-05, + "loss": 0.886, + "step": 26959 + }, + { + "epoch": 5.750853242320819, + "grad_norm": 0.12661261480936317, + "learning_rate": 1.5025489352888554e-05, + "loss": 0.8872, + "step": 26960 + }, + { + "epoch": 5.751066552901024, + "grad_norm": 0.12204625906674593, + "learning_rate": 1.5020525080738933e-05, + "loss": 0.8991, + "step": 26961 + }, + { + "epoch": 5.751279863481229, + "grad_norm": 0.10208627417650948, + "learning_rate": 1.501556154383316e-05, + "loss": 0.8816, + "step": 26962 + }, + { + "epoch": 5.751493174061434, + "grad_norm": 0.14370556989484357, + "learning_rate": 1.5010598742227412e-05, + "loss": 0.9019, + "step": 26963 + }, + { + "epoch": 5.751706484641638, + "grad_norm": 0.12975576990460091, + "learning_rate": 1.5005636675977804e-05, + "loss": 0.8802, + "step": 26964 + }, + { + "epoch": 5.751919795221843, + "grad_norm": 0.10684637452827371, + "learning_rate": 1.5000675345140532e-05, + "loss": 0.9049, + "step": 26965 + }, + { + "epoch": 5.752133105802048, + "grad_norm": 0.13180742082339003, + "learning_rate": 1.499571474977171e-05, + "loss": 0.8911, + "step": 26966 + }, + { + "epoch": 5.752346416382252, + "grad_norm": 0.1176782542074564, + "learning_rate": 1.4990754889927463e-05, + "loss": 0.8458, + "step": 26967 + }, + { + "epoch": 5.752559726962457, + "grad_norm": 0.11583058814344271, + "learning_rate": 1.4985795765663906e-05, + "loss": 0.8946, + "step": 26968 + }, + { + "epoch": 5.752773037542662, + "grad_norm": 0.11822415133711418, + "learning_rate": 1.498083737703718e-05, + "loss": 0.8831, + "step": 26969 + }, + { + "epoch": 5.752986348122867, + "grad_norm": 0.11486053835070581, + "learning_rate": 1.4975879724103384e-05, + "loss": 0.8703, + "step": 26970 + }, + { + "epoch": 5.753199658703072, + "grad_norm": 0.09861169059213432, + "learning_rate": 1.4970922806918613e-05, + "loss": 0.8626, + "step": 26971 + }, + { + "epoch": 5.753412969283277, + "grad_norm": 0.13886498168006176, + "learning_rate": 1.4965966625538904e-05, + "loss": 0.8662, + "step": 26972 + }, + { + "epoch": 5.753626279863481, + "grad_norm": 0.11628486464789554, + "learning_rate": 1.4961011180020396e-05, + "loss": 0.8996, + "step": 26973 + }, + { + "epoch": 5.753839590443686, + "grad_norm": 0.12002460204511646, + "learning_rate": 1.4956056470419138e-05, + "loss": 0.8973, + "step": 26974 + }, + { + "epoch": 5.754052901023891, + "grad_norm": 0.10391942162862944, + "learning_rate": 1.4951102496791205e-05, + "loss": 0.8878, + "step": 26975 + }, + { + "epoch": 5.7542662116040955, + "grad_norm": 0.10641008813403169, + "learning_rate": 1.4946149259192617e-05, + "loss": 0.9172, + "step": 26976 + }, + { + "epoch": 5.7544795221843, + "grad_norm": 0.11387338390136183, + "learning_rate": 1.4941196757679461e-05, + "loss": 0.8636, + "step": 26977 + }, + { + "epoch": 5.754692832764505, + "grad_norm": 0.11372215477698897, + "learning_rate": 1.4936244992307756e-05, + "loss": 0.8993, + "step": 26978 + }, + { + "epoch": 5.7549061433447095, + "grad_norm": 0.1108252163423043, + "learning_rate": 1.4931293963133535e-05, + "loss": 0.8568, + "step": 26979 + }, + { + "epoch": 5.755119453924914, + "grad_norm": 0.11895842964218042, + "learning_rate": 1.49263436702128e-05, + "loss": 0.9013, + "step": 26980 + }, + { + "epoch": 5.75533276450512, + "grad_norm": 0.12955271954746214, + "learning_rate": 1.4921394113601602e-05, + "loss": 0.8877, + "step": 26981 + }, + { + "epoch": 5.7555460750853245, + "grad_norm": 0.12485993697055828, + "learning_rate": 1.4916445293355941e-05, + "loss": 0.8844, + "step": 26982 + }, + { + "epoch": 5.755759385665529, + "grad_norm": 0.1127226464512596, + "learning_rate": 1.4911497209531781e-05, + "loss": 0.8985, + "step": 26983 + }, + { + "epoch": 5.755972696245734, + "grad_norm": 0.10548256862466338, + "learning_rate": 1.4906549862185107e-05, + "loss": 0.894, + "step": 26984 + }, + { + "epoch": 5.7561860068259385, + "grad_norm": 0.12332006521144354, + "learning_rate": 1.4901603251371923e-05, + "loss": 0.8901, + "step": 26985 + }, + { + "epoch": 5.756399317406143, + "grad_norm": 0.11014683234847109, + "learning_rate": 1.4896657377148204e-05, + "loss": 0.9035, + "step": 26986 + }, + { + "epoch": 5.756612627986348, + "grad_norm": 0.11699986919285937, + "learning_rate": 1.4891712239569897e-05, + "loss": 0.8741, + "step": 26987 + }, + { + "epoch": 5.756825938566553, + "grad_norm": 0.1280063019683189, + "learning_rate": 1.4886767838692948e-05, + "loss": 0.899, + "step": 26988 + }, + { + "epoch": 5.757039249146757, + "grad_norm": 0.11043991743163593, + "learning_rate": 1.4881824174573329e-05, + "loss": 0.8951, + "step": 26989 + }, + { + "epoch": 5.757252559726963, + "grad_norm": 0.13328846286234022, + "learning_rate": 1.487688124726697e-05, + "loss": 0.8944, + "step": 26990 + }, + { + "epoch": 5.7574658703071675, + "grad_norm": 0.11263207454724475, + "learning_rate": 1.4871939056829789e-05, + "loss": 0.8649, + "step": 26991 + }, + { + "epoch": 5.757679180887372, + "grad_norm": 0.12477947602949212, + "learning_rate": 1.48669976033177e-05, + "loss": 0.8979, + "step": 26992 + }, + { + "epoch": 5.757892491467577, + "grad_norm": 0.11574580829155921, + "learning_rate": 1.4862056886786644e-05, + "loss": 0.9091, + "step": 26993 + }, + { + "epoch": 5.7581058020477816, + "grad_norm": 0.13107991759006427, + "learning_rate": 1.4857116907292529e-05, + "loss": 0.8825, + "step": 26994 + }, + { + "epoch": 5.758319112627986, + "grad_norm": 0.10515609053353023, + "learning_rate": 1.4852177664891216e-05, + "loss": 0.8956, + "step": 26995 + }, + { + "epoch": 5.758532423208191, + "grad_norm": 0.10963590137416752, + "learning_rate": 1.4847239159638584e-05, + "loss": 0.9232, + "step": 26996 + }, + { + "epoch": 5.758745733788396, + "grad_norm": 0.10336503578919214, + "learning_rate": 1.4842301391590561e-05, + "loss": 0.8665, + "step": 26997 + }, + { + "epoch": 5.7589590443686, + "grad_norm": 0.1190906850749521, + "learning_rate": 1.4837364360802985e-05, + "loss": 0.8947, + "step": 26998 + }, + { + "epoch": 5.759172354948806, + "grad_norm": 0.09529419590861513, + "learning_rate": 1.4832428067331735e-05, + "loss": 0.8892, + "step": 26999 + }, + { + "epoch": 5.7593856655290105, + "grad_norm": 0.11604054603030226, + "learning_rate": 1.4827492511232637e-05, + "loss": 0.8846, + "step": 27000 + }, + { + "epoch": 5.759598976109215, + "grad_norm": 0.1358965429761216, + "learning_rate": 1.4822557692561578e-05, + "loss": 0.8878, + "step": 27001 + }, + { + "epoch": 5.75981228668942, + "grad_norm": 0.11101279161184993, + "learning_rate": 1.4817623611374373e-05, + "loss": 0.9208, + "step": 27002 + }, + { + "epoch": 5.760025597269625, + "grad_norm": 0.1206978120615116, + "learning_rate": 1.4812690267726858e-05, + "loss": 0.8859, + "step": 27003 + }, + { + "epoch": 5.760238907849829, + "grad_norm": 0.10837939122717694, + "learning_rate": 1.4807757661674828e-05, + "loss": 0.8979, + "step": 27004 + }, + { + "epoch": 5.760452218430034, + "grad_norm": 0.12465445384524648, + "learning_rate": 1.4802825793274153e-05, + "loss": 0.88, + "step": 27005 + }, + { + "epoch": 5.760665529010239, + "grad_norm": 0.12184680323255517, + "learning_rate": 1.4797894662580582e-05, + "loss": 0.8946, + "step": 27006 + }, + { + "epoch": 5.760878839590443, + "grad_norm": 0.12395467836092419, + "learning_rate": 1.479296426964993e-05, + "loss": 0.9223, + "step": 27007 + }, + { + "epoch": 5.761092150170649, + "grad_norm": 0.11615011873385757, + "learning_rate": 1.4788034614537963e-05, + "loss": 0.8695, + "step": 27008 + }, + { + "epoch": 5.761305460750854, + "grad_norm": 0.1130952060894419, + "learning_rate": 1.4783105697300495e-05, + "loss": 0.8787, + "step": 27009 + }, + { + "epoch": 5.761518771331058, + "grad_norm": 0.11599873599981202, + "learning_rate": 1.4778177517993296e-05, + "loss": 0.8918, + "step": 27010 + }, + { + "epoch": 5.761732081911263, + "grad_norm": 0.12599565428479156, + "learning_rate": 1.4773250076672106e-05, + "loss": 0.8772, + "step": 27011 + }, + { + "epoch": 5.761945392491468, + "grad_norm": 0.0949804886004329, + "learning_rate": 1.4768323373392677e-05, + "loss": 0.8627, + "step": 27012 + }, + { + "epoch": 5.762158703071672, + "grad_norm": 0.13642972926578345, + "learning_rate": 1.4763397408210778e-05, + "loss": 0.9118, + "step": 27013 + }, + { + "epoch": 5.762372013651877, + "grad_norm": 0.10448588458329448, + "learning_rate": 1.4758472181182132e-05, + "loss": 0.8956, + "step": 27014 + }, + { + "epoch": 5.762585324232082, + "grad_norm": 0.1300771701439366, + "learning_rate": 1.4753547692362484e-05, + "loss": 0.8975, + "step": 27015 + }, + { + "epoch": 5.762798634812286, + "grad_norm": 0.12041239991515278, + "learning_rate": 1.4748623941807529e-05, + "loss": 0.8568, + "step": 27016 + }, + { + "epoch": 5.763011945392492, + "grad_norm": 0.13207825829279193, + "learning_rate": 1.4743700929573002e-05, + "loss": 0.8831, + "step": 27017 + }, + { + "epoch": 5.763225255972696, + "grad_norm": 0.11802519866120392, + "learning_rate": 1.473877865571459e-05, + "loss": 0.9103, + "step": 27018 + }, + { + "epoch": 5.763438566552901, + "grad_norm": 0.09697785643423727, + "learning_rate": 1.4733857120287991e-05, + "loss": 0.8925, + "step": 27019 + }, + { + "epoch": 5.763651877133106, + "grad_norm": 0.12579888282429794, + "learning_rate": 1.4728936323348886e-05, + "loss": 0.8899, + "step": 27020 + }, + { + "epoch": 5.763865187713311, + "grad_norm": 0.11356074921789665, + "learning_rate": 1.472401626495298e-05, + "loss": 0.8795, + "step": 27021 + }, + { + "epoch": 5.764078498293515, + "grad_norm": 0.12134920624197723, + "learning_rate": 1.4719096945155929e-05, + "loss": 0.8755, + "step": 27022 + }, + { + "epoch": 5.76429180887372, + "grad_norm": 0.10211354194075771, + "learning_rate": 1.4714178364013392e-05, + "loss": 0.8778, + "step": 27023 + }, + { + "epoch": 5.764505119453925, + "grad_norm": 0.14829437079994578, + "learning_rate": 1.4709260521581005e-05, + "loss": 0.8797, + "step": 27024 + }, + { + "epoch": 5.764718430034129, + "grad_norm": 0.10464824999563917, + "learning_rate": 1.470434341791445e-05, + "loss": 0.8992, + "step": 27025 + }, + { + "epoch": 5.764931740614334, + "grad_norm": 0.13086612170820747, + "learning_rate": 1.4699427053069353e-05, + "loss": 0.8986, + "step": 27026 + }, + { + "epoch": 5.765145051194539, + "grad_norm": 0.11169769847765687, + "learning_rate": 1.4694511427101357e-05, + "loss": 0.9022, + "step": 27027 + }, + { + "epoch": 5.765358361774744, + "grad_norm": 0.1207356714335687, + "learning_rate": 1.4689596540066018e-05, + "loss": 0.8997, + "step": 27028 + }, + { + "epoch": 5.765571672354949, + "grad_norm": 0.13268034726751232, + "learning_rate": 1.4684682392019016e-05, + "loss": 0.8927, + "step": 27029 + }, + { + "epoch": 5.765784982935154, + "grad_norm": 0.09879865538974712, + "learning_rate": 1.4679768983015934e-05, + "loss": 0.9151, + "step": 27030 + }, + { + "epoch": 5.765998293515358, + "grad_norm": 0.1406140663284096, + "learning_rate": 1.4674856313112362e-05, + "loss": 0.9061, + "step": 27031 + }, + { + "epoch": 5.766211604095563, + "grad_norm": 0.11722773495592147, + "learning_rate": 1.4669944382363875e-05, + "loss": 0.8803, + "step": 27032 + }, + { + "epoch": 5.766424914675768, + "grad_norm": 0.11612715346739572, + "learning_rate": 1.4665033190826079e-05, + "loss": 0.8921, + "step": 27033 + }, + { + "epoch": 5.7666382252559725, + "grad_norm": 0.12357248908736014, + "learning_rate": 1.4660122738554533e-05, + "loss": 0.9084, + "step": 27034 + }, + { + "epoch": 5.766851535836177, + "grad_norm": 0.11055216492487242, + "learning_rate": 1.4655213025604797e-05, + "loss": 0.883, + "step": 27035 + }, + { + "epoch": 5.767064846416382, + "grad_norm": 0.13001634037826051, + "learning_rate": 1.4650304052032404e-05, + "loss": 0.8755, + "step": 27036 + }, + { + "epoch": 5.767278156996587, + "grad_norm": 0.11825813889421287, + "learning_rate": 1.4645395817892944e-05, + "loss": 0.8967, + "step": 27037 + }, + { + "epoch": 5.767491467576792, + "grad_norm": 0.10576293376776569, + "learning_rate": 1.4640488323241946e-05, + "loss": 0.8679, + "step": 27038 + }, + { + "epoch": 5.767704778156997, + "grad_norm": 0.12401533313771379, + "learning_rate": 1.4635581568134902e-05, + "loss": 0.9, + "step": 27039 + }, + { + "epoch": 5.7679180887372015, + "grad_norm": 0.12339246501063628, + "learning_rate": 1.4630675552627334e-05, + "loss": 0.8964, + "step": 27040 + }, + { + "epoch": 5.768131399317406, + "grad_norm": 0.09685568736125207, + "learning_rate": 1.4625770276774796e-05, + "loss": 0.9036, + "step": 27041 + }, + { + "epoch": 5.768344709897611, + "grad_norm": 0.13622363266436546, + "learning_rate": 1.4620865740632755e-05, + "loss": 0.8933, + "step": 27042 + }, + { + "epoch": 5.7685580204778155, + "grad_norm": 0.11849421717154113, + "learning_rate": 1.4615961944256731e-05, + "loss": 0.9011, + "step": 27043 + }, + { + "epoch": 5.76877133105802, + "grad_norm": 0.11719503049870539, + "learning_rate": 1.4611058887702165e-05, + "loss": 0.8936, + "step": 27044 + }, + { + "epoch": 5.768984641638225, + "grad_norm": 0.12559646458073187, + "learning_rate": 1.4606156571024594e-05, + "loss": 0.8878, + "step": 27045 + }, + { + "epoch": 5.7691979522184305, + "grad_norm": 0.09558393498265823, + "learning_rate": 1.4601254994279458e-05, + "loss": 0.8777, + "step": 27046 + }, + { + "epoch": 5.769411262798635, + "grad_norm": 0.11915140165872576, + "learning_rate": 1.4596354157522225e-05, + "loss": 0.8781, + "step": 27047 + }, + { + "epoch": 5.76962457337884, + "grad_norm": 0.1231262095848561, + "learning_rate": 1.4591454060808326e-05, + "loss": 0.9072, + "step": 27048 + }, + { + "epoch": 5.7698378839590445, + "grad_norm": 0.12116495487584386, + "learning_rate": 1.4586554704193242e-05, + "loss": 0.921, + "step": 27049 + }, + { + "epoch": 5.770051194539249, + "grad_norm": 0.11632518358263313, + "learning_rate": 1.458165608773242e-05, + "loss": 0.8978, + "step": 27050 + }, + { + "epoch": 5.770264505119454, + "grad_norm": 0.11044680384315173, + "learning_rate": 1.457675821148123e-05, + "loss": 0.8767, + "step": 27051 + }, + { + "epoch": 5.770477815699659, + "grad_norm": 0.1137271712895577, + "learning_rate": 1.45718610754951e-05, + "loss": 0.856, + "step": 27052 + }, + { + "epoch": 5.770691126279863, + "grad_norm": 0.127074495424866, + "learning_rate": 1.4566964679829485e-05, + "loss": 0.8997, + "step": 27053 + }, + { + "epoch": 5.770904436860068, + "grad_norm": 0.1099146113307804, + "learning_rate": 1.4562069024539774e-05, + "loss": 0.9036, + "step": 27054 + }, + { + "epoch": 5.7711177474402735, + "grad_norm": 0.11216146809541262, + "learning_rate": 1.4557174109681342e-05, + "loss": 0.8764, + "step": 27055 + }, + { + "epoch": 5.771331058020478, + "grad_norm": 0.11475115955960191, + "learning_rate": 1.455227993530957e-05, + "loss": 0.8694, + "step": 27056 + }, + { + "epoch": 5.771544368600683, + "grad_norm": 0.10427793923920382, + "learning_rate": 1.454738650147987e-05, + "loss": 0.8808, + "step": 27057 + }, + { + "epoch": 5.771757679180888, + "grad_norm": 0.11346482767574641, + "learning_rate": 1.4542493808247592e-05, + "loss": 0.8788, + "step": 27058 + }, + { + "epoch": 5.771970989761092, + "grad_norm": 0.1169777006691139, + "learning_rate": 1.45376018556681e-05, + "loss": 0.8834, + "step": 27059 + }, + { + "epoch": 5.772184300341297, + "grad_norm": 0.10398074125331652, + "learning_rate": 1.4532710643796724e-05, + "loss": 0.8832, + "step": 27060 + }, + { + "epoch": 5.772397610921502, + "grad_norm": 0.12957094632850058, + "learning_rate": 1.452782017268886e-05, + "loss": 0.8559, + "step": 27061 + }, + { + "epoch": 5.772610921501706, + "grad_norm": 0.10974954641970594, + "learning_rate": 1.4522930442399799e-05, + "loss": 0.8972, + "step": 27062 + }, + { + "epoch": 5.772824232081911, + "grad_norm": 0.11889757899706119, + "learning_rate": 1.4518041452984881e-05, + "loss": 0.9183, + "step": 27063 + }, + { + "epoch": 5.773037542662116, + "grad_norm": 0.12414361069046671, + "learning_rate": 1.4513153204499407e-05, + "loss": 0.8731, + "step": 27064 + }, + { + "epoch": 5.77325085324232, + "grad_norm": 0.12328532882541238, + "learning_rate": 1.4508265696998725e-05, + "loss": 0.909, + "step": 27065 + }, + { + "epoch": 5.773464163822526, + "grad_norm": 0.1271521721468895, + "learning_rate": 1.4503378930538116e-05, + "loss": 0.8927, + "step": 27066 + }, + { + "epoch": 5.773677474402731, + "grad_norm": 0.1099671972865646, + "learning_rate": 1.449849290517288e-05, + "loss": 0.8637, + "step": 27067 + }, + { + "epoch": 5.773890784982935, + "grad_norm": 0.14885667935875335, + "learning_rate": 1.4493607620958279e-05, + "loss": 0.9283, + "step": 27068 + }, + { + "epoch": 5.77410409556314, + "grad_norm": 0.10121664859523336, + "learning_rate": 1.4488723077949627e-05, + "loss": 0.9208, + "step": 27069 + }, + { + "epoch": 5.774317406143345, + "grad_norm": 0.13744629172163367, + "learning_rate": 1.4483839276202169e-05, + "loss": 0.8772, + "step": 27070 + }, + { + "epoch": 5.774530716723549, + "grad_norm": 0.10926250244121476, + "learning_rate": 1.4478956215771178e-05, + "loss": 0.859, + "step": 27071 + }, + { + "epoch": 5.774744027303754, + "grad_norm": 0.12140824897501537, + "learning_rate": 1.4474073896711906e-05, + "loss": 0.8745, + "step": 27072 + }, + { + "epoch": 5.774957337883959, + "grad_norm": 0.12727171472434637, + "learning_rate": 1.4469192319079581e-05, + "loss": 0.8858, + "step": 27073 + }, + { + "epoch": 5.775170648464163, + "grad_norm": 0.09643418951112953, + "learning_rate": 1.4464311482929448e-05, + "loss": 0.8766, + "step": 27074 + }, + { + "epoch": 5.775383959044369, + "grad_norm": 0.11305944515228367, + "learning_rate": 1.4459431388316741e-05, + "loss": 0.8941, + "step": 27075 + }, + { + "epoch": 5.775597269624574, + "grad_norm": 0.11339761154482476, + "learning_rate": 1.4454552035296643e-05, + "loss": 0.8573, + "step": 27076 + }, + { + "epoch": 5.775810580204778, + "grad_norm": 0.10455582884944634, + "learning_rate": 1.4449673423924409e-05, + "loss": 0.9018, + "step": 27077 + }, + { + "epoch": 5.776023890784983, + "grad_norm": 0.14462253071678943, + "learning_rate": 1.4444795554255229e-05, + "loss": 0.9057, + "step": 27078 + }, + { + "epoch": 5.776237201365188, + "grad_norm": 0.11602033004409823, + "learning_rate": 1.4439918426344286e-05, + "loss": 0.8958, + "step": 27079 + }, + { + "epoch": 5.776450511945392, + "grad_norm": 0.14211554978667243, + "learning_rate": 1.4435042040246745e-05, + "loss": 0.8706, + "step": 27080 + }, + { + "epoch": 5.776663822525597, + "grad_norm": 0.14801016891985297, + "learning_rate": 1.4430166396017833e-05, + "loss": 0.9096, + "step": 27081 + }, + { + "epoch": 5.776877133105802, + "grad_norm": 0.14116432581100044, + "learning_rate": 1.4425291493712687e-05, + "loss": 0.9133, + "step": 27082 + }, + { + "epoch": 5.777090443686006, + "grad_norm": 0.1360740203368126, + "learning_rate": 1.4420417333386474e-05, + "loss": 0.8866, + "step": 27083 + }, + { + "epoch": 5.777303754266212, + "grad_norm": 0.16200044915431652, + "learning_rate": 1.4415543915094339e-05, + "loss": 0.8893, + "step": 27084 + }, + { + "epoch": 5.777517064846417, + "grad_norm": 0.12619676951740705, + "learning_rate": 1.441067123889143e-05, + "loss": 0.9134, + "step": 27085 + }, + { + "epoch": 5.777730375426621, + "grad_norm": 0.1464673514402921, + "learning_rate": 1.4405799304832874e-05, + "loss": 0.883, + "step": 27086 + }, + { + "epoch": 5.777943686006826, + "grad_norm": 0.12321183639914622, + "learning_rate": 1.4400928112973813e-05, + "loss": 0.916, + "step": 27087 + }, + { + "epoch": 5.778156996587031, + "grad_norm": 0.11227707566099826, + "learning_rate": 1.4396057663369321e-05, + "loss": 0.9151, + "step": 27088 + }, + { + "epoch": 5.778370307167235, + "grad_norm": 0.13974408274565953, + "learning_rate": 1.439118795607457e-05, + "loss": 0.8799, + "step": 27089 + }, + { + "epoch": 5.77858361774744, + "grad_norm": 0.11321526297577622, + "learning_rate": 1.438631899114463e-05, + "loss": 0.8874, + "step": 27090 + }, + { + "epoch": 5.778796928327645, + "grad_norm": 0.11266515705808582, + "learning_rate": 1.4381450768634592e-05, + "loss": 0.8639, + "step": 27091 + }, + { + "epoch": 5.7790102389078495, + "grad_norm": 0.1185873390669323, + "learning_rate": 1.4376583288599516e-05, + "loss": 0.8905, + "step": 27092 + }, + { + "epoch": 5.779223549488055, + "grad_norm": 0.11550901944292392, + "learning_rate": 1.4371716551094532e-05, + "loss": 0.8765, + "step": 27093 + }, + { + "epoch": 5.77943686006826, + "grad_norm": 0.12323642452569473, + "learning_rate": 1.4366850556174696e-05, + "loss": 0.8963, + "step": 27094 + }, + { + "epoch": 5.779650170648464, + "grad_norm": 0.09938562182839512, + "learning_rate": 1.4361985303895014e-05, + "loss": 0.8872, + "step": 27095 + }, + { + "epoch": 5.779863481228669, + "grad_norm": 0.13706776439395724, + "learning_rate": 1.435712079431059e-05, + "loss": 0.9083, + "step": 27096 + }, + { + "epoch": 5.780076791808874, + "grad_norm": 0.11232887830539078, + "learning_rate": 1.4352257027476441e-05, + "loss": 0.8976, + "step": 27097 + }, + { + "epoch": 5.7802901023890785, + "grad_norm": 0.10846595050969789, + "learning_rate": 1.4347394003447615e-05, + "loss": 0.894, + "step": 27098 + }, + { + "epoch": 5.780503412969283, + "grad_norm": 0.12109457366713831, + "learning_rate": 1.434253172227913e-05, + "loss": 0.885, + "step": 27099 + }, + { + "epoch": 5.780716723549488, + "grad_norm": 0.10581274436060607, + "learning_rate": 1.4337670184025983e-05, + "loss": 0.8666, + "step": 27100 + }, + { + "epoch": 5.7809300341296925, + "grad_norm": 0.10014366407746754, + "learning_rate": 1.4332809388743222e-05, + "loss": 0.8994, + "step": 27101 + }, + { + "epoch": 5.781143344709898, + "grad_norm": 0.11792238220215281, + "learning_rate": 1.4327949336485828e-05, + "loss": 0.8752, + "step": 27102 + }, + { + "epoch": 5.781356655290102, + "grad_norm": 0.10653054906235192, + "learning_rate": 1.4323090027308789e-05, + "loss": 0.8889, + "step": 27103 + }, + { + "epoch": 5.7815699658703075, + "grad_norm": 0.11863696910709959, + "learning_rate": 1.431823146126707e-05, + "loss": 0.8906, + "step": 27104 + }, + { + "epoch": 5.781783276450512, + "grad_norm": 0.113829062891533, + "learning_rate": 1.4313373638415712e-05, + "loss": 0.8623, + "step": 27105 + }, + { + "epoch": 5.781996587030717, + "grad_norm": 0.11028903076042851, + "learning_rate": 1.4308516558809618e-05, + "loss": 0.9093, + "step": 27106 + }, + { + "epoch": 5.7822098976109215, + "grad_norm": 0.11770720019509805, + "learning_rate": 1.4303660222503735e-05, + "loss": 0.8817, + "step": 27107 + }, + { + "epoch": 5.782423208191126, + "grad_norm": 0.10701227637493815, + "learning_rate": 1.429880462955307e-05, + "loss": 0.8812, + "step": 27108 + }, + { + "epoch": 5.782636518771331, + "grad_norm": 0.12855929446251324, + "learning_rate": 1.4293949780012529e-05, + "loss": 0.894, + "step": 27109 + }, + { + "epoch": 5.782849829351536, + "grad_norm": 0.11309818782810967, + "learning_rate": 1.4289095673937055e-05, + "loss": 0.8992, + "step": 27110 + }, + { + "epoch": 5.78306313993174, + "grad_norm": 0.10616695192700987, + "learning_rate": 1.4284242311381572e-05, + "loss": 0.8663, + "step": 27111 + }, + { + "epoch": 5.783276450511945, + "grad_norm": 0.23189742955284792, + "learning_rate": 1.4279389692400965e-05, + "loss": 0.9047, + "step": 27112 + }, + { + "epoch": 5.7834897610921505, + "grad_norm": 0.09602471086962923, + "learning_rate": 1.4274537817050198e-05, + "loss": 0.8663, + "step": 27113 + }, + { + "epoch": 5.783703071672355, + "grad_norm": 0.11995113816037285, + "learning_rate": 1.4269686685384136e-05, + "loss": 0.8786, + "step": 27114 + }, + { + "epoch": 5.78391638225256, + "grad_norm": 0.11300735797887965, + "learning_rate": 1.4264836297457669e-05, + "loss": 0.8573, + "step": 27115 + }, + { + "epoch": 5.784129692832765, + "grad_norm": 0.2549687518634942, + "learning_rate": 1.4259986653325678e-05, + "loss": 0.9131, + "step": 27116 + }, + { + "epoch": 5.784343003412969, + "grad_norm": 0.10970430070521041, + "learning_rate": 1.4255137753043076e-05, + "loss": 0.8888, + "step": 27117 + }, + { + "epoch": 5.784556313993174, + "grad_norm": 0.09215170065853222, + "learning_rate": 1.4250289596664675e-05, + "loss": 0.8478, + "step": 27118 + }, + { + "epoch": 5.784769624573379, + "grad_norm": 0.12391405186432125, + "learning_rate": 1.4245442184245337e-05, + "loss": 0.8955, + "step": 27119 + }, + { + "epoch": 5.784982935153583, + "grad_norm": 0.10315303279463148, + "learning_rate": 1.4240595515839943e-05, + "loss": 0.8843, + "step": 27120 + }, + { + "epoch": 5.785196245733788, + "grad_norm": 0.10828900091781286, + "learning_rate": 1.4235749591503321e-05, + "loss": 0.92, + "step": 27121 + }, + { + "epoch": 5.785409556313994, + "grad_norm": 0.11430286357526691, + "learning_rate": 1.4230904411290296e-05, + "loss": 0.8835, + "step": 27122 + }, + { + "epoch": 5.785622866894198, + "grad_norm": 0.09771666851357917, + "learning_rate": 1.4226059975255689e-05, + "loss": 0.8958, + "step": 27123 + }, + { + "epoch": 5.785836177474403, + "grad_norm": 0.10403216699085405, + "learning_rate": 1.4221216283454306e-05, + "loss": 0.9037, + "step": 27124 + }, + { + "epoch": 5.786049488054608, + "grad_norm": 0.12860426684633733, + "learning_rate": 1.4216373335940988e-05, + "loss": 0.8885, + "step": 27125 + }, + { + "epoch": 5.786262798634812, + "grad_norm": 0.09379398888578812, + "learning_rate": 1.4211531132770513e-05, + "loss": 0.8702, + "step": 27126 + }, + { + "epoch": 5.786476109215017, + "grad_norm": 0.10701788179588054, + "learning_rate": 1.4206689673997668e-05, + "loss": 0.8526, + "step": 27127 + }, + { + "epoch": 5.786689419795222, + "grad_norm": 0.12396064582441663, + "learning_rate": 1.4201848959677231e-05, + "loss": 0.8807, + "step": 27128 + }, + { + "epoch": 5.786902730375426, + "grad_norm": 0.10807089970820545, + "learning_rate": 1.4197008989863981e-05, + "loss": 0.8918, + "step": 27129 + }, + { + "epoch": 5.787116040955631, + "grad_norm": 0.1069071826586442, + "learning_rate": 1.4192169764612689e-05, + "loss": 0.8698, + "step": 27130 + }, + { + "epoch": 5.787329351535837, + "grad_norm": 0.12080910329254099, + "learning_rate": 1.4187331283978077e-05, + "loss": 0.8637, + "step": 27131 + }, + { + "epoch": 5.787542662116041, + "grad_norm": 0.09081093465402872, + "learning_rate": 1.4182493548014936e-05, + "loss": 0.8448, + "step": 27132 + }, + { + "epoch": 5.787755972696246, + "grad_norm": 0.10647629293279333, + "learning_rate": 1.4177656556777995e-05, + "loss": 0.8716, + "step": 27133 + }, + { + "epoch": 5.787969283276451, + "grad_norm": 0.10663900657409621, + "learning_rate": 1.4172820310321966e-05, + "loss": 0.8592, + "step": 27134 + }, + { + "epoch": 5.788182593856655, + "grad_norm": 0.09867577561854037, + "learning_rate": 1.4167984808701598e-05, + "loss": 0.9022, + "step": 27135 + }, + { + "epoch": 5.78839590443686, + "grad_norm": 0.1305356121351861, + "learning_rate": 1.4163150051971556e-05, + "loss": 0.9146, + "step": 27136 + }, + { + "epoch": 5.788609215017065, + "grad_norm": 0.11344875526140247, + "learning_rate": 1.4158316040186607e-05, + "loss": 0.8846, + "step": 27137 + }, + { + "epoch": 5.788822525597269, + "grad_norm": 0.10561191927356571, + "learning_rate": 1.415348277340141e-05, + "loss": 0.9075, + "step": 27138 + }, + { + "epoch": 5.789035836177474, + "grad_norm": 0.11821624413533946, + "learning_rate": 1.4148650251670675e-05, + "loss": 0.8807, + "step": 27139 + }, + { + "epoch": 5.78924914675768, + "grad_norm": 0.10102148418752223, + "learning_rate": 1.414381847504906e-05, + "loss": 0.9021, + "step": 27140 + }, + { + "epoch": 5.789462457337884, + "grad_norm": 0.11333778132387441, + "learning_rate": 1.4138987443591252e-05, + "loss": 0.8974, + "step": 27141 + }, + { + "epoch": 5.789675767918089, + "grad_norm": 0.11905888199885753, + "learning_rate": 1.4134157157351908e-05, + "loss": 0.8487, + "step": 27142 + }, + { + "epoch": 5.789889078498294, + "grad_norm": 0.10528824676555454, + "learning_rate": 1.4129327616385662e-05, + "loss": 0.9319, + "step": 27143 + }, + { + "epoch": 5.790102389078498, + "grad_norm": 0.12437632114162576, + "learning_rate": 1.4124498820747205e-05, + "loss": 0.8818, + "step": 27144 + }, + { + "epoch": 5.790315699658703, + "grad_norm": 0.10538801621434737, + "learning_rate": 1.4119670770491145e-05, + "loss": 0.885, + "step": 27145 + }, + { + "epoch": 5.790529010238908, + "grad_norm": 0.14004636035373585, + "learning_rate": 1.411484346567213e-05, + "loss": 0.8991, + "step": 27146 + }, + { + "epoch": 5.790742320819112, + "grad_norm": 0.11112596118668434, + "learning_rate": 1.4110016906344757e-05, + "loss": 0.8686, + "step": 27147 + }, + { + "epoch": 5.790955631399317, + "grad_norm": 0.10386142896285357, + "learning_rate": 1.4105191092563639e-05, + "loss": 0.8577, + "step": 27148 + }, + { + "epoch": 5.791168941979522, + "grad_norm": 0.10926644060396247, + "learning_rate": 1.4100366024383409e-05, + "loss": 0.8502, + "step": 27149 + }, + { + "epoch": 5.7913822525597265, + "grad_norm": 0.11355835169448711, + "learning_rate": 1.4095541701858664e-05, + "loss": 0.8774, + "step": 27150 + }, + { + "epoch": 5.791595563139932, + "grad_norm": 0.12083667000804679, + "learning_rate": 1.4090718125043936e-05, + "loss": 0.8677, + "step": 27151 + }, + { + "epoch": 5.791808873720137, + "grad_norm": 0.12081892774571523, + "learning_rate": 1.4085895293993867e-05, + "loss": 0.9096, + "step": 27152 + }, + { + "epoch": 5.792022184300341, + "grad_norm": 0.11498977494880425, + "learning_rate": 1.4081073208762992e-05, + "loss": 0.8683, + "step": 27153 + }, + { + "epoch": 5.792235494880546, + "grad_norm": 0.10403661610725264, + "learning_rate": 1.4076251869405888e-05, + "loss": 0.9018, + "step": 27154 + }, + { + "epoch": 5.792448805460751, + "grad_norm": 0.12247669421187618, + "learning_rate": 1.4071431275977082e-05, + "loss": 0.8782, + "step": 27155 + }, + { + "epoch": 5.7926621160409555, + "grad_norm": 0.10006255604737033, + "learning_rate": 1.4066611428531158e-05, + "loss": 0.9116, + "step": 27156 + }, + { + "epoch": 5.79287542662116, + "grad_norm": 0.10557908324937637, + "learning_rate": 1.4061792327122636e-05, + "loss": 0.8656, + "step": 27157 + }, + { + "epoch": 5.793088737201365, + "grad_norm": 0.10983387598947693, + "learning_rate": 1.405697397180604e-05, + "loss": 0.8606, + "step": 27158 + }, + { + "epoch": 5.7933020477815695, + "grad_norm": 0.1055500689880748, + "learning_rate": 1.4052156362635896e-05, + "loss": 0.9234, + "step": 27159 + }, + { + "epoch": 5.793515358361775, + "grad_norm": 0.09875122071198154, + "learning_rate": 1.4047339499666692e-05, + "loss": 0.8912, + "step": 27160 + }, + { + "epoch": 5.79372866894198, + "grad_norm": 0.10407153115771663, + "learning_rate": 1.404252338295299e-05, + "loss": 0.9175, + "step": 27161 + }, + { + "epoch": 5.7939419795221845, + "grad_norm": 0.09619014255339545, + "learning_rate": 1.4037708012549226e-05, + "loss": 0.8817, + "step": 27162 + }, + { + "epoch": 5.794155290102389, + "grad_norm": 0.10231297105915295, + "learning_rate": 1.4032893388509883e-05, + "loss": 0.8965, + "step": 27163 + }, + { + "epoch": 5.794368600682594, + "grad_norm": 0.10216897513332038, + "learning_rate": 1.4028079510889484e-05, + "loss": 0.8463, + "step": 27164 + }, + { + "epoch": 5.7945819112627985, + "grad_norm": 0.0989034236166702, + "learning_rate": 1.4023266379742474e-05, + "loss": 0.8993, + "step": 27165 + }, + { + "epoch": 5.794795221843003, + "grad_norm": 0.09875460812761282, + "learning_rate": 1.4018453995123315e-05, + "loss": 0.8719, + "step": 27166 + }, + { + "epoch": 5.795008532423208, + "grad_norm": 0.1023588222183409, + "learning_rate": 1.4013642357086438e-05, + "loss": 0.9051, + "step": 27167 + }, + { + "epoch": 5.795221843003413, + "grad_norm": 0.1070808206164738, + "learning_rate": 1.4008831465686323e-05, + "loss": 0.9032, + "step": 27168 + }, + { + "epoch": 5.795435153583618, + "grad_norm": 0.10215615494058121, + "learning_rate": 1.4004021320977397e-05, + "loss": 0.9, + "step": 27169 + }, + { + "epoch": 5.795648464163823, + "grad_norm": 0.11056640433428862, + "learning_rate": 1.3999211923014078e-05, + "loss": 0.8956, + "step": 27170 + }, + { + "epoch": 5.7958617747440275, + "grad_norm": 0.11007460505057261, + "learning_rate": 1.3994403271850785e-05, + "loss": 0.8877, + "step": 27171 + }, + { + "epoch": 5.796075085324232, + "grad_norm": 0.09280549349301383, + "learning_rate": 1.3989595367541911e-05, + "loss": 0.8893, + "step": 27172 + }, + { + "epoch": 5.796288395904437, + "grad_norm": 0.10356086376351767, + "learning_rate": 1.3984788210141918e-05, + "loss": 0.8657, + "step": 27173 + }, + { + "epoch": 5.796501706484642, + "grad_norm": 0.11742516824953149, + "learning_rate": 1.3979981799705127e-05, + "loss": 0.8543, + "step": 27174 + }, + { + "epoch": 5.796715017064846, + "grad_norm": 0.10431908902034977, + "learning_rate": 1.397517613628594e-05, + "loss": 0.8949, + "step": 27175 + }, + { + "epoch": 5.796928327645051, + "grad_norm": 0.11788428857294488, + "learning_rate": 1.3970371219938766e-05, + "loss": 0.895, + "step": 27176 + }, + { + "epoch": 5.797141638225256, + "grad_norm": 0.10398237860049898, + "learning_rate": 1.3965567050717951e-05, + "loss": 0.8878, + "step": 27177 + }, + { + "epoch": 5.797354948805461, + "grad_norm": 0.11493791911643787, + "learning_rate": 1.3960763628677856e-05, + "loss": 0.887, + "step": 27178 + }, + { + "epoch": 5.797568259385666, + "grad_norm": 0.10273771652054513, + "learning_rate": 1.3955960953872806e-05, + "loss": 0.8752, + "step": 27179 + }, + { + "epoch": 5.797781569965871, + "grad_norm": 0.10581449830643273, + "learning_rate": 1.3951159026357198e-05, + "loss": 0.8808, + "step": 27180 + }, + { + "epoch": 5.797994880546075, + "grad_norm": 0.10729425400476601, + "learning_rate": 1.394635784618533e-05, + "loss": 0.9029, + "step": 27181 + }, + { + "epoch": 5.79820819112628, + "grad_norm": 0.09551527135083253, + "learning_rate": 1.3941557413411534e-05, + "loss": 0.8801, + "step": 27182 + }, + { + "epoch": 5.798421501706485, + "grad_norm": 0.1051021014442424, + "learning_rate": 1.3936757728090134e-05, + "loss": 0.8734, + "step": 27183 + }, + { + "epoch": 5.798634812286689, + "grad_norm": 0.10126763384114697, + "learning_rate": 1.3931958790275428e-05, + "loss": 0.8847, + "step": 27184 + }, + { + "epoch": 5.798848122866894, + "grad_norm": 0.10992453334247138, + "learning_rate": 1.3927160600021714e-05, + "loss": 0.8918, + "step": 27185 + }, + { + "epoch": 5.799061433447099, + "grad_norm": 0.10003215374726164, + "learning_rate": 1.3922363157383293e-05, + "loss": 0.891, + "step": 27186 + }, + { + "epoch": 5.799274744027304, + "grad_norm": 0.10632812483888639, + "learning_rate": 1.3917566462414426e-05, + "loss": 0.879, + "step": 27187 + }, + { + "epoch": 5.799488054607508, + "grad_norm": 0.11489122239478819, + "learning_rate": 1.3912770515169424e-05, + "loss": 0.8752, + "step": 27188 + }, + { + "epoch": 5.799701365187714, + "grad_norm": 0.12645359057139915, + "learning_rate": 1.3907975315702538e-05, + "loss": 0.9039, + "step": 27189 + }, + { + "epoch": 5.799914675767918, + "grad_norm": 0.10396293279484332, + "learning_rate": 1.3903180864068016e-05, + "loss": 0.8721, + "step": 27190 + }, + { + "epoch": 5.800127986348123, + "grad_norm": 0.13166303702369503, + "learning_rate": 1.38983871603201e-05, + "loss": 0.8883, + "step": 27191 + }, + { + "epoch": 5.800341296928328, + "grad_norm": 0.0940966710916551, + "learning_rate": 1.3893594204513065e-05, + "loss": 0.8959, + "step": 27192 + }, + { + "epoch": 5.800554607508532, + "grad_norm": 0.10612282504424358, + "learning_rate": 1.388880199670112e-05, + "loss": 0.8851, + "step": 27193 + }, + { + "epoch": 5.800767918088737, + "grad_norm": 0.121672885302778, + "learning_rate": 1.3884010536938504e-05, + "loss": 0.8794, + "step": 27194 + }, + { + "epoch": 5.800981228668942, + "grad_norm": 0.11517042609911994, + "learning_rate": 1.3879219825279404e-05, + "loss": 0.8818, + "step": 27195 + }, + { + "epoch": 5.801194539249146, + "grad_norm": 0.1316152477414643, + "learning_rate": 1.3874429861778054e-05, + "loss": 0.9188, + "step": 27196 + }, + { + "epoch": 5.801407849829351, + "grad_norm": 0.09700166740986864, + "learning_rate": 1.386964064648864e-05, + "loss": 0.872, + "step": 27197 + }, + { + "epoch": 5.801621160409557, + "grad_norm": 0.12481192222107636, + "learning_rate": 1.386485217946536e-05, + "loss": 0.8962, + "step": 27198 + }, + { + "epoch": 5.801834470989761, + "grad_norm": 0.13330248284892426, + "learning_rate": 1.3860064460762357e-05, + "loss": 0.8912, + "step": 27199 + }, + { + "epoch": 5.802047781569966, + "grad_norm": 0.11146569014593863, + "learning_rate": 1.3855277490433872e-05, + "loss": 0.8817, + "step": 27200 + }, + { + "epoch": 5.802261092150171, + "grad_norm": 0.1486785204842073, + "learning_rate": 1.3850491268534021e-05, + "loss": 0.8947, + "step": 27201 + }, + { + "epoch": 5.802474402730375, + "grad_norm": 0.10138458402479747, + "learning_rate": 1.3845705795116984e-05, + "loss": 0.9147, + "step": 27202 + }, + { + "epoch": 5.80268771331058, + "grad_norm": 0.12470840409053699, + "learning_rate": 1.3840921070236877e-05, + "loss": 0.8749, + "step": 27203 + }, + { + "epoch": 5.802901023890785, + "grad_norm": 0.09102504606519099, + "learning_rate": 1.383613709394788e-05, + "loss": 0.8601, + "step": 27204 + }, + { + "epoch": 5.8031143344709895, + "grad_norm": 0.13617340344849432, + "learning_rate": 1.3831353866304098e-05, + "loss": 0.9212, + "step": 27205 + }, + { + "epoch": 5.803327645051194, + "grad_norm": 0.09662408264818367, + "learning_rate": 1.3826571387359686e-05, + "loss": 0.8766, + "step": 27206 + }, + { + "epoch": 5.8035409556314, + "grad_norm": 0.11973803566187548, + "learning_rate": 1.3821789657168698e-05, + "loss": 0.8889, + "step": 27207 + }, + { + "epoch": 5.803754266211604, + "grad_norm": 0.10173305156333469, + "learning_rate": 1.3817008675785286e-05, + "loss": 0.8665, + "step": 27208 + }, + { + "epoch": 5.803967576791809, + "grad_norm": 0.1196649672477718, + "learning_rate": 1.3812228443263532e-05, + "loss": 0.8999, + "step": 27209 + }, + { + "epoch": 5.804180887372014, + "grad_norm": 0.11648766931119804, + "learning_rate": 1.3807448959657532e-05, + "loss": 0.8648, + "step": 27210 + }, + { + "epoch": 5.8043941979522184, + "grad_norm": 0.1036613566095383, + "learning_rate": 1.3802670225021335e-05, + "loss": 0.8676, + "step": 27211 + }, + { + "epoch": 5.804607508532423, + "grad_norm": 0.12549525665696135, + "learning_rate": 1.3797892239409057e-05, + "loss": 0.9232, + "step": 27212 + }, + { + "epoch": 5.804820819112628, + "grad_norm": 0.10870582267218207, + "learning_rate": 1.3793115002874751e-05, + "loss": 0.848, + "step": 27213 + }, + { + "epoch": 5.8050341296928325, + "grad_norm": 0.11641566416310647, + "learning_rate": 1.3788338515472454e-05, + "loss": 0.9256, + "step": 27214 + }, + { + "epoch": 5.805247440273037, + "grad_norm": 0.11591627355485784, + "learning_rate": 1.3783562777256205e-05, + "loss": 0.8951, + "step": 27215 + }, + { + "epoch": 5.805460750853243, + "grad_norm": 0.10913805097586565, + "learning_rate": 1.3778787788280074e-05, + "loss": 0.9023, + "step": 27216 + }, + { + "epoch": 5.805674061433447, + "grad_norm": 0.12873468097912905, + "learning_rate": 1.3774013548598091e-05, + "loss": 0.8974, + "step": 27217 + }, + { + "epoch": 5.805887372013652, + "grad_norm": 0.09944293259982259, + "learning_rate": 1.3769240058264247e-05, + "loss": 0.8704, + "step": 27218 + }, + { + "epoch": 5.806100682593857, + "grad_norm": 0.1315324809245535, + "learning_rate": 1.3764467317332542e-05, + "loss": 0.8726, + "step": 27219 + }, + { + "epoch": 5.8063139931740615, + "grad_norm": 0.11876165543934467, + "learning_rate": 1.3759695325857015e-05, + "loss": 0.8973, + "step": 27220 + }, + { + "epoch": 5.806527303754266, + "grad_norm": 0.13818245390649567, + "learning_rate": 1.3754924083891656e-05, + "loss": 0.8783, + "step": 27221 + }, + { + "epoch": 5.806740614334471, + "grad_norm": 0.1158164772961487, + "learning_rate": 1.3750153591490452e-05, + "loss": 0.8874, + "step": 27222 + }, + { + "epoch": 5.8069539249146755, + "grad_norm": 0.11435091802564379, + "learning_rate": 1.3745383848707347e-05, + "loss": 0.8871, + "step": 27223 + }, + { + "epoch": 5.80716723549488, + "grad_norm": 0.1267723105034114, + "learning_rate": 1.3740614855596354e-05, + "loss": 0.8856, + "step": 27224 + }, + { + "epoch": 5.807380546075086, + "grad_norm": 0.0975873380299851, + "learning_rate": 1.373584661221143e-05, + "loss": 0.8876, + "step": 27225 + }, + { + "epoch": 5.8075938566552905, + "grad_norm": 0.12493680896682251, + "learning_rate": 1.3731079118606512e-05, + "loss": 0.8748, + "step": 27226 + }, + { + "epoch": 5.807807167235495, + "grad_norm": 0.10078297072063291, + "learning_rate": 1.3726312374835522e-05, + "loss": 0.8822, + "step": 27227 + }, + { + "epoch": 5.8080204778157, + "grad_norm": 0.10669296197415433, + "learning_rate": 1.3721546380952444e-05, + "loss": 0.8899, + "step": 27228 + }, + { + "epoch": 5.8082337883959045, + "grad_norm": 0.09643316611252238, + "learning_rate": 1.3716781137011208e-05, + "loss": 0.8859, + "step": 27229 + }, + { + "epoch": 5.808447098976109, + "grad_norm": 0.10789024745112931, + "learning_rate": 1.3712016643065691e-05, + "loss": 0.8964, + "step": 27230 + }, + { + "epoch": 5.808660409556314, + "grad_norm": 0.09986546228967556, + "learning_rate": 1.3707252899169796e-05, + "loss": 0.8757, + "step": 27231 + }, + { + "epoch": 5.808873720136519, + "grad_norm": 0.11722206844990019, + "learning_rate": 1.3702489905377462e-05, + "loss": 0.8687, + "step": 27232 + }, + { + "epoch": 5.809087030716723, + "grad_norm": 0.09995184733133726, + "learning_rate": 1.3697727661742586e-05, + "loss": 0.9128, + "step": 27233 + }, + { + "epoch": 5.809300341296928, + "grad_norm": 0.10685216505347382, + "learning_rate": 1.3692966168319024e-05, + "loss": 0.9009, + "step": 27234 + }, + { + "epoch": 5.809513651877133, + "grad_norm": 0.1048017046694595, + "learning_rate": 1.3688205425160645e-05, + "loss": 0.8446, + "step": 27235 + }, + { + "epoch": 5.809726962457338, + "grad_norm": 0.11161106663623983, + "learning_rate": 1.3683445432321362e-05, + "loss": 0.8845, + "step": 27236 + }, + { + "epoch": 5.809940273037543, + "grad_norm": 0.12177720525041467, + "learning_rate": 1.3678686189855017e-05, + "loss": 0.8712, + "step": 27237 + }, + { + "epoch": 5.810153583617748, + "grad_norm": 0.11899689868744129, + "learning_rate": 1.367392769781544e-05, + "loss": 0.8595, + "step": 27238 + }, + { + "epoch": 5.810366894197952, + "grad_norm": 0.11597711174493944, + "learning_rate": 1.3669169956256477e-05, + "loss": 0.8787, + "step": 27239 + }, + { + "epoch": 5.810580204778157, + "grad_norm": 0.10393389219867964, + "learning_rate": 1.3664412965232e-05, + "loss": 0.8612, + "step": 27240 + }, + { + "epoch": 5.810793515358362, + "grad_norm": 0.10622724698026723, + "learning_rate": 1.3659656724795798e-05, + "loss": 0.8787, + "step": 27241 + }, + { + "epoch": 5.811006825938566, + "grad_norm": 0.1020492001879061, + "learning_rate": 1.3654901235001688e-05, + "loss": 0.8568, + "step": 27242 + }, + { + "epoch": 5.811220136518771, + "grad_norm": 0.10119934130374529, + "learning_rate": 1.3650146495903465e-05, + "loss": 0.8664, + "step": 27243 + }, + { + "epoch": 5.811433447098976, + "grad_norm": 0.11902342806803592, + "learning_rate": 1.364539250755498e-05, + "loss": 0.9012, + "step": 27244 + }, + { + "epoch": 5.811646757679181, + "grad_norm": 0.10638246996863497, + "learning_rate": 1.3640639270009985e-05, + "loss": 0.8761, + "step": 27245 + }, + { + "epoch": 5.811860068259386, + "grad_norm": 0.13114217664926156, + "learning_rate": 1.363588678332228e-05, + "loss": 0.8782, + "step": 27246 + }, + { + "epoch": 5.812073378839591, + "grad_norm": 0.09892104183733436, + "learning_rate": 1.3631135047545612e-05, + "loss": 0.8987, + "step": 27247 + }, + { + "epoch": 5.812286689419795, + "grad_norm": 0.12681895784252625, + "learning_rate": 1.3626384062733783e-05, + "loss": 0.9078, + "step": 27248 + }, + { + "epoch": 5.8125, + "grad_norm": 0.1096393250455689, + "learning_rate": 1.3621633828940532e-05, + "loss": 0.907, + "step": 27249 + }, + { + "epoch": 5.812713310580205, + "grad_norm": 0.11463065661277727, + "learning_rate": 1.3616884346219616e-05, + "loss": 0.8648, + "step": 27250 + }, + { + "epoch": 5.812926621160409, + "grad_norm": 0.11477517839928611, + "learning_rate": 1.361213561462477e-05, + "loss": 0.8869, + "step": 27251 + }, + { + "epoch": 5.813139931740614, + "grad_norm": 0.09269587644568418, + "learning_rate": 1.3607387634209731e-05, + "loss": 0.902, + "step": 27252 + }, + { + "epoch": 5.813353242320819, + "grad_norm": 0.10275982415264433, + "learning_rate": 1.3602640405028206e-05, + "loss": 0.8926, + "step": 27253 + }, + { + "epoch": 5.813566552901024, + "grad_norm": 0.10898396858435723, + "learning_rate": 1.3597893927133927e-05, + "loss": 0.883, + "step": 27254 + }, + { + "epoch": 5.813779863481229, + "grad_norm": 0.0900714130282463, + "learning_rate": 1.3593148200580583e-05, + "loss": 0.8775, + "step": 27255 + }, + { + "epoch": 5.813993174061434, + "grad_norm": 0.12010398429614666, + "learning_rate": 1.3588403225421893e-05, + "loss": 0.8758, + "step": 27256 + }, + { + "epoch": 5.814206484641638, + "grad_norm": 0.10772931205465333, + "learning_rate": 1.3583659001711547e-05, + "loss": 0.8955, + "step": 27257 + }, + { + "epoch": 5.814419795221843, + "grad_norm": 0.12050713294858083, + "learning_rate": 1.3578915529503207e-05, + "loss": 0.8888, + "step": 27258 + }, + { + "epoch": 5.814633105802048, + "grad_norm": 0.10779395227803967, + "learning_rate": 1.3574172808850538e-05, + "loss": 0.9047, + "step": 27259 + }, + { + "epoch": 5.814846416382252, + "grad_norm": 0.11596846551593087, + "learning_rate": 1.356943083980724e-05, + "loss": 0.8895, + "step": 27260 + }, + { + "epoch": 5.815059726962457, + "grad_norm": 0.10014015936905733, + "learning_rate": 1.356468962242696e-05, + "loss": 0.8762, + "step": 27261 + }, + { + "epoch": 5.815273037542662, + "grad_norm": 0.09672924395968643, + "learning_rate": 1.355994915676334e-05, + "loss": 0.8956, + "step": 27262 + }, + { + "epoch": 5.815486348122867, + "grad_norm": 0.11857783476971265, + "learning_rate": 1.3555209442869984e-05, + "loss": 0.883, + "step": 27263 + }, + { + "epoch": 5.815699658703072, + "grad_norm": 0.10856697791662002, + "learning_rate": 1.3550470480800564e-05, + "loss": 0.857, + "step": 27264 + }, + { + "epoch": 5.815912969283277, + "grad_norm": 0.10757303838151232, + "learning_rate": 1.354573227060869e-05, + "loss": 0.8692, + "step": 27265 + }, + { + "epoch": 5.816126279863481, + "grad_norm": 0.11980320359199352, + "learning_rate": 1.3540994812347976e-05, + "loss": 0.9123, + "step": 27266 + }, + { + "epoch": 5.816339590443686, + "grad_norm": 0.09914596520119706, + "learning_rate": 1.3536258106071992e-05, + "loss": 0.8667, + "step": 27267 + }, + { + "epoch": 5.816552901023891, + "grad_norm": 0.11516590557385292, + "learning_rate": 1.3531522151834397e-05, + "loss": 0.8771, + "step": 27268 + }, + { + "epoch": 5.8167662116040955, + "grad_norm": 0.10015274908371777, + "learning_rate": 1.3526786949688747e-05, + "loss": 0.8986, + "step": 27269 + }, + { + "epoch": 5.8169795221843, + "grad_norm": 0.10028738062096763, + "learning_rate": 1.3522052499688614e-05, + "loss": 0.8993, + "step": 27270 + }, + { + "epoch": 5.817192832764505, + "grad_norm": 0.10891582086377356, + "learning_rate": 1.3517318801887558e-05, + "loss": 0.8979, + "step": 27271 + }, + { + "epoch": 5.8174061433447095, + "grad_norm": 0.10631553771289295, + "learning_rate": 1.351258585633918e-05, + "loss": 0.8831, + "step": 27272 + }, + { + "epoch": 5.817619453924914, + "grad_norm": 0.09736728037511368, + "learning_rate": 1.3507853663097029e-05, + "loss": 0.862, + "step": 27273 + }, + { + "epoch": 5.81783276450512, + "grad_norm": 0.11995866399782136, + "learning_rate": 1.3503122222214616e-05, + "loss": 0.8914, + "step": 27274 + }, + { + "epoch": 5.8180460750853245, + "grad_norm": 0.09612562926437147, + "learning_rate": 1.3498391533745472e-05, + "loss": 0.8835, + "step": 27275 + }, + { + "epoch": 5.818259385665529, + "grad_norm": 0.10764334003146489, + "learning_rate": 1.3493661597743164e-05, + "loss": 0.8805, + "step": 27276 + }, + { + "epoch": 5.818472696245734, + "grad_norm": 0.11701908456372846, + "learning_rate": 1.3488932414261195e-05, + "loss": 0.8677, + "step": 27277 + }, + { + "epoch": 5.8186860068259385, + "grad_norm": 0.10109712761781527, + "learning_rate": 1.348420398335308e-05, + "loss": 0.8689, + "step": 27278 + }, + { + "epoch": 5.818899317406143, + "grad_norm": 0.09649714053926817, + "learning_rate": 1.3479476305072283e-05, + "loss": 0.8885, + "step": 27279 + }, + { + "epoch": 5.819112627986348, + "grad_norm": 0.13172443411461182, + "learning_rate": 1.3474749379472365e-05, + "loss": 0.8869, + "step": 27280 + }, + { + "epoch": 5.819325938566553, + "grad_norm": 0.10815225606054062, + "learning_rate": 1.3470023206606766e-05, + "loss": 0.9013, + "step": 27281 + }, + { + "epoch": 5.819539249146757, + "grad_norm": 0.10845106935089958, + "learning_rate": 1.3465297786528982e-05, + "loss": 0.8723, + "step": 27282 + }, + { + "epoch": 5.819752559726963, + "grad_norm": 0.11432291126761858, + "learning_rate": 1.3460573119292443e-05, + "loss": 0.8514, + "step": 27283 + }, + { + "epoch": 5.8199658703071675, + "grad_norm": 0.10950452107782323, + "learning_rate": 1.3455849204950684e-05, + "loss": 0.9092, + "step": 27284 + }, + { + "epoch": 5.820179180887372, + "grad_norm": 0.1266164024621757, + "learning_rate": 1.345112604355709e-05, + "loss": 0.9024, + "step": 27285 + }, + { + "epoch": 5.820392491467577, + "grad_norm": 0.09805055948821917, + "learning_rate": 1.3446403635165122e-05, + "loss": 0.8966, + "step": 27286 + }, + { + "epoch": 5.8206058020477816, + "grad_norm": 0.1181369441862614, + "learning_rate": 1.3441681979828203e-05, + "loss": 0.8808, + "step": 27287 + }, + { + "epoch": 5.820819112627986, + "grad_norm": 0.12081847696710993, + "learning_rate": 1.3436961077599784e-05, + "loss": 0.9002, + "step": 27288 + }, + { + "epoch": 5.821032423208191, + "grad_norm": 0.09864425509970885, + "learning_rate": 1.3432240928533271e-05, + "loss": 0.8973, + "step": 27289 + }, + { + "epoch": 5.821245733788396, + "grad_norm": 0.11762954726186765, + "learning_rate": 1.342752153268207e-05, + "loss": 0.9002, + "step": 27290 + }, + { + "epoch": 5.8214590443686, + "grad_norm": 0.09927228364005125, + "learning_rate": 1.3422802890099572e-05, + "loss": 0.8625, + "step": 27291 + }, + { + "epoch": 5.821672354948806, + "grad_norm": 0.10171816494873395, + "learning_rate": 1.3418085000839196e-05, + "loss": 0.891, + "step": 27292 + }, + { + "epoch": 5.8218856655290105, + "grad_norm": 0.1140020156653963, + "learning_rate": 1.3413367864954298e-05, + "loss": 0.8769, + "step": 27293 + }, + { + "epoch": 5.822098976109215, + "grad_norm": 0.10602558555528863, + "learning_rate": 1.3408651482498276e-05, + "loss": 0.8989, + "step": 27294 + }, + { + "epoch": 5.82231228668942, + "grad_norm": 0.10138737054390004, + "learning_rate": 1.3403935853524455e-05, + "loss": 0.903, + "step": 27295 + }, + { + "epoch": 5.822525597269625, + "grad_norm": 0.1063319383175032, + "learning_rate": 1.3399220978086258e-05, + "loss": 0.8777, + "step": 27296 + }, + { + "epoch": 5.822738907849829, + "grad_norm": 0.102293668666599, + "learning_rate": 1.3394506856236987e-05, + "loss": 0.8773, + "step": 27297 + }, + { + "epoch": 5.822952218430034, + "grad_norm": 0.11725193062433817, + "learning_rate": 1.3389793488029983e-05, + "loss": 0.8739, + "step": 27298 + }, + { + "epoch": 5.823165529010239, + "grad_norm": 0.11189233333951354, + "learning_rate": 1.3385080873518565e-05, + "loss": 0.8819, + "step": 27299 + }, + { + "epoch": 5.823378839590443, + "grad_norm": 0.10377430901388623, + "learning_rate": 1.3380369012756096e-05, + "loss": 0.8688, + "step": 27300 + }, + { + "epoch": 5.823592150170649, + "grad_norm": 0.10839424017243307, + "learning_rate": 1.3375657905795874e-05, + "loss": 0.9035, + "step": 27301 + }, + { + "epoch": 5.823805460750854, + "grad_norm": 0.11232083627597694, + "learning_rate": 1.3370947552691207e-05, + "loss": 0.9157, + "step": 27302 + }, + { + "epoch": 5.824018771331058, + "grad_norm": 0.0997932984235143, + "learning_rate": 1.3366237953495369e-05, + "loss": 0.884, + "step": 27303 + }, + { + "epoch": 5.824232081911263, + "grad_norm": 0.10393483882529937, + "learning_rate": 1.3361529108261687e-05, + "loss": 0.906, + "step": 27304 + }, + { + "epoch": 5.824445392491468, + "grad_norm": 0.09727322941714793, + "learning_rate": 1.3356821017043422e-05, + "loss": 0.8735, + "step": 27305 + }, + { + "epoch": 5.824658703071672, + "grad_norm": 0.09531719696104662, + "learning_rate": 1.335211367989385e-05, + "loss": 0.8951, + "step": 27306 + }, + { + "epoch": 5.824872013651877, + "grad_norm": 0.10287336190287445, + "learning_rate": 1.3347407096866233e-05, + "loss": 0.8585, + "step": 27307 + }, + { + "epoch": 5.825085324232082, + "grad_norm": 0.09627614396693293, + "learning_rate": 1.334270126801383e-05, + "loss": 0.8915, + "step": 27308 + }, + { + "epoch": 5.825298634812286, + "grad_norm": 0.10524852289937024, + "learning_rate": 1.3337996193389881e-05, + "loss": 0.9163, + "step": 27309 + }, + { + "epoch": 5.825511945392492, + "grad_norm": 0.10732930925120883, + "learning_rate": 1.333329187304762e-05, + "loss": 0.8887, + "step": 27310 + }, + { + "epoch": 5.825725255972696, + "grad_norm": 0.104642916982263, + "learning_rate": 1.3328588307040274e-05, + "loss": 0.933, + "step": 27311 + }, + { + "epoch": 5.825938566552901, + "grad_norm": 0.0891631433610209, + "learning_rate": 1.3323885495421087e-05, + "loss": 0.8945, + "step": 27312 + }, + { + "epoch": 5.826151877133106, + "grad_norm": 0.10500912710561106, + "learning_rate": 1.3319183438243256e-05, + "loss": 0.9009, + "step": 27313 + }, + { + "epoch": 5.826365187713311, + "grad_norm": 0.1066417135853658, + "learning_rate": 1.3314482135559988e-05, + "loss": 0.8649, + "step": 27314 + }, + { + "epoch": 5.826578498293515, + "grad_norm": 0.09410613226291006, + "learning_rate": 1.3309781587424451e-05, + "loss": 0.8728, + "step": 27315 + }, + { + "epoch": 5.82679180887372, + "grad_norm": 0.1040981218241545, + "learning_rate": 1.3305081793889882e-05, + "loss": 0.9091, + "step": 27316 + }, + { + "epoch": 5.827005119453925, + "grad_norm": 0.10900207724193571, + "learning_rate": 1.330038275500943e-05, + "loss": 0.8895, + "step": 27317 + }, + { + "epoch": 5.827218430034129, + "grad_norm": 0.1076392483940155, + "learning_rate": 1.3295684470836272e-05, + "loss": 0.896, + "step": 27318 + }, + { + "epoch": 5.827431740614334, + "grad_norm": 0.10033657793290283, + "learning_rate": 1.3290986941423572e-05, + "loss": 0.8818, + "step": 27319 + }, + { + "epoch": 5.827645051194539, + "grad_norm": 0.11646791824559533, + "learning_rate": 1.328629016682447e-05, + "loss": 0.9015, + "step": 27320 + }, + { + "epoch": 5.827858361774744, + "grad_norm": 0.10918081616010959, + "learning_rate": 1.3281594147092113e-05, + "loss": 0.8957, + "step": 27321 + }, + { + "epoch": 5.828071672354949, + "grad_norm": 0.10231099769174885, + "learning_rate": 1.3276898882279655e-05, + "loss": 0.9003, + "step": 27322 + }, + { + "epoch": 5.828284982935154, + "grad_norm": 0.11510237576529317, + "learning_rate": 1.3272204372440176e-05, + "loss": 0.8821, + "step": 27323 + }, + { + "epoch": 5.828498293515358, + "grad_norm": 0.09693561080766952, + "learning_rate": 1.3267510617626847e-05, + "loss": 0.885, + "step": 27324 + }, + { + "epoch": 5.828711604095563, + "grad_norm": 0.10386679035918824, + "learning_rate": 1.326281761789276e-05, + "loss": 0.8776, + "step": 27325 + }, + { + "epoch": 5.828924914675768, + "grad_norm": 0.11848636906172995, + "learning_rate": 1.3258125373291023e-05, + "loss": 0.8784, + "step": 27326 + }, + { + "epoch": 5.8291382252559725, + "grad_norm": 0.09512415078155129, + "learning_rate": 1.325343388387469e-05, + "loss": 0.9288, + "step": 27327 + }, + { + "epoch": 5.829351535836177, + "grad_norm": 0.1847720653162486, + "learning_rate": 1.3248743149696898e-05, + "loss": 0.8669, + "step": 27328 + }, + { + "epoch": 5.829564846416382, + "grad_norm": 0.11409587073596412, + "learning_rate": 1.3244053170810713e-05, + "loss": 0.9135, + "step": 27329 + }, + { + "epoch": 5.829778156996587, + "grad_norm": 0.10217424855031865, + "learning_rate": 1.3239363947269159e-05, + "loss": 0.8728, + "step": 27330 + }, + { + "epoch": 5.829991467576792, + "grad_norm": 0.10165465953858979, + "learning_rate": 1.323467547912534e-05, + "loss": 0.8782, + "step": 27331 + }, + { + "epoch": 5.830204778156997, + "grad_norm": 0.10172419424248129, + "learning_rate": 1.322998776643229e-05, + "loss": 0.8951, + "step": 27332 + }, + { + "epoch": 5.8304180887372015, + "grad_norm": 0.09540781361293542, + "learning_rate": 1.322530080924306e-05, + "loss": 0.8644, + "step": 27333 + }, + { + "epoch": 5.830631399317406, + "grad_norm": 0.09762686484964612, + "learning_rate": 1.3220614607610665e-05, + "loss": 0.92, + "step": 27334 + }, + { + "epoch": 5.830844709897611, + "grad_norm": 0.09926995826132771, + "learning_rate": 1.3215929161588127e-05, + "loss": 0.8689, + "step": 27335 + }, + { + "epoch": 5.8310580204778155, + "grad_norm": 0.09529837764701142, + "learning_rate": 1.3211244471228484e-05, + "loss": 0.8793, + "step": 27336 + }, + { + "epoch": 5.83127133105802, + "grad_norm": 0.09636099816149564, + "learning_rate": 1.3206560536584739e-05, + "loss": 0.8991, + "step": 27337 + }, + { + "epoch": 5.831484641638225, + "grad_norm": 0.1078440761473319, + "learning_rate": 1.320187735770989e-05, + "loss": 0.8864, + "step": 27338 + }, + { + "epoch": 5.8316979522184305, + "grad_norm": 0.09765637697878533, + "learning_rate": 1.3197194934656898e-05, + "loss": 0.8971, + "step": 27339 + }, + { + "epoch": 5.831911262798635, + "grad_norm": 0.10648284540668476, + "learning_rate": 1.3192513267478809e-05, + "loss": 0.9174, + "step": 27340 + }, + { + "epoch": 5.83212457337884, + "grad_norm": 0.09396983558746094, + "learning_rate": 1.3187832356228527e-05, + "loss": 0.9158, + "step": 27341 + }, + { + "epoch": 5.8323378839590445, + "grad_norm": 0.11900206584316796, + "learning_rate": 1.3183152200959036e-05, + "loss": 0.8744, + "step": 27342 + }, + { + "epoch": 5.832551194539249, + "grad_norm": 0.10113414103838816, + "learning_rate": 1.3178472801723321e-05, + "loss": 0.9109, + "step": 27343 + }, + { + "epoch": 5.832764505119454, + "grad_norm": 0.09922107111354898, + "learning_rate": 1.3173794158574311e-05, + "loss": 0.8676, + "step": 27344 + }, + { + "epoch": 5.832977815699659, + "grad_norm": 0.10690381026892003, + "learning_rate": 1.316911627156494e-05, + "loss": 0.9016, + "step": 27345 + }, + { + "epoch": 5.833191126279863, + "grad_norm": 0.10499050023708367, + "learning_rate": 1.3164439140748147e-05, + "loss": 0.9094, + "step": 27346 + }, + { + "epoch": 5.833404436860068, + "grad_norm": 0.11919296803466892, + "learning_rate": 1.3159762766176827e-05, + "loss": 0.9095, + "step": 27347 + }, + { + "epoch": 5.8336177474402735, + "grad_norm": 0.10587752914112172, + "learning_rate": 1.3155087147903926e-05, + "loss": 0.8602, + "step": 27348 + }, + { + "epoch": 5.833831058020478, + "grad_norm": 0.11050923163444049, + "learning_rate": 1.3150412285982355e-05, + "loss": 0.9039, + "step": 27349 + }, + { + "epoch": 5.834044368600683, + "grad_norm": 0.11258543424693339, + "learning_rate": 1.3145738180464985e-05, + "loss": 0.9333, + "step": 27350 + }, + { + "epoch": 5.834257679180888, + "grad_norm": 0.10421338217329523, + "learning_rate": 1.3141064831404688e-05, + "loss": 0.8872, + "step": 27351 + }, + { + "epoch": 5.834470989761092, + "grad_norm": 0.11288730333692665, + "learning_rate": 1.3136392238854412e-05, + "loss": 0.9148, + "step": 27352 + }, + { + "epoch": 5.834684300341297, + "grad_norm": 0.0912211257234805, + "learning_rate": 1.3131720402866965e-05, + "loss": 0.8675, + "step": 27353 + }, + { + "epoch": 5.834897610921502, + "grad_norm": 0.11032143924536497, + "learning_rate": 1.3127049323495205e-05, + "loss": 0.907, + "step": 27354 + }, + { + "epoch": 5.835110921501706, + "grad_norm": 0.10068843555002832, + "learning_rate": 1.312237900079202e-05, + "loss": 0.8675, + "step": 27355 + }, + { + "epoch": 5.835324232081911, + "grad_norm": 0.09973324528077229, + "learning_rate": 1.3117709434810251e-05, + "loss": 0.8873, + "step": 27356 + }, + { + "epoch": 5.835537542662116, + "grad_norm": 0.10796410063955304, + "learning_rate": 1.3113040625602725e-05, + "loss": 0.9111, + "step": 27357 + }, + { + "epoch": 5.83575085324232, + "grad_norm": 0.09639279585044114, + "learning_rate": 1.310837257322226e-05, + "loss": 0.8468, + "step": 27358 + }, + { + "epoch": 5.835964163822526, + "grad_norm": 0.0997877022964795, + "learning_rate": 1.3103705277721673e-05, + "loss": 0.8753, + "step": 27359 + }, + { + "epoch": 5.836177474402731, + "grad_norm": 0.09773960912976563, + "learning_rate": 1.3099038739153805e-05, + "loss": 0.903, + "step": 27360 + }, + { + "epoch": 5.836390784982935, + "grad_norm": 0.10320667870789765, + "learning_rate": 1.3094372957571433e-05, + "loss": 0.8707, + "step": 27361 + }, + { + "epoch": 5.83660409556314, + "grad_norm": 0.09375720958611562, + "learning_rate": 1.3089707933027361e-05, + "loss": 0.8816, + "step": 27362 + }, + { + "epoch": 5.836817406143345, + "grad_norm": 0.09548411333348077, + "learning_rate": 1.3085043665574362e-05, + "loss": 0.9085, + "step": 27363 + }, + { + "epoch": 5.837030716723549, + "grad_norm": 0.09741574756675304, + "learning_rate": 1.3080380155265232e-05, + "loss": 0.8803, + "step": 27364 + }, + { + "epoch": 5.837244027303754, + "grad_norm": 0.09807524573249123, + "learning_rate": 1.3075717402152716e-05, + "loss": 0.8871, + "step": 27365 + }, + { + "epoch": 5.837457337883959, + "grad_norm": 0.09568485763885429, + "learning_rate": 1.3071055406289568e-05, + "loss": 0.8935, + "step": 27366 + }, + { + "epoch": 5.837670648464163, + "grad_norm": 0.09367364951487628, + "learning_rate": 1.3066394167728569e-05, + "loss": 0.8524, + "step": 27367 + }, + { + "epoch": 5.837883959044369, + "grad_norm": 0.1221187548657444, + "learning_rate": 1.3061733686522451e-05, + "loss": 0.8871, + "step": 27368 + }, + { + "epoch": 5.838097269624574, + "grad_norm": 0.09590665108419653, + "learning_rate": 1.3057073962723945e-05, + "loss": 0.8787, + "step": 27369 + }, + { + "epoch": 5.838310580204778, + "grad_norm": 0.11089733378977124, + "learning_rate": 1.3052414996385765e-05, + "loss": 0.8733, + "step": 27370 + }, + { + "epoch": 5.838523890784983, + "grad_norm": 0.1079906627523042, + "learning_rate": 1.3047756787560623e-05, + "loss": 0.9066, + "step": 27371 + }, + { + "epoch": 5.838737201365188, + "grad_norm": 0.0946294032361234, + "learning_rate": 1.3043099336301253e-05, + "loss": 0.9079, + "step": 27372 + }, + { + "epoch": 5.838950511945392, + "grad_norm": 0.09683728634694647, + "learning_rate": 1.3038442642660347e-05, + "loss": 0.8927, + "step": 27373 + }, + { + "epoch": 5.839163822525597, + "grad_norm": 0.10489747486273285, + "learning_rate": 1.3033786706690586e-05, + "loss": 0.9024, + "step": 27374 + }, + { + "epoch": 5.839377133105802, + "grad_norm": 0.09443315785810732, + "learning_rate": 1.3029131528444645e-05, + "loss": 0.9016, + "step": 27375 + }, + { + "epoch": 5.839590443686006, + "grad_norm": 0.08924276736644232, + "learning_rate": 1.3024477107975223e-05, + "loss": 0.9143, + "step": 27376 + }, + { + "epoch": 5.839803754266212, + "grad_norm": 0.09710671512636311, + "learning_rate": 1.3019823445334958e-05, + "loss": 0.8984, + "step": 27377 + }, + { + "epoch": 5.840017064846417, + "grad_norm": 0.10564215360637898, + "learning_rate": 1.3015170540576505e-05, + "loss": 0.8861, + "step": 27378 + }, + { + "epoch": 5.840230375426621, + "grad_norm": 0.09395331742613817, + "learning_rate": 1.301051839375254e-05, + "loss": 0.9323, + "step": 27379 + }, + { + "epoch": 5.840443686006826, + "grad_norm": 0.11080848064456499, + "learning_rate": 1.300586700491568e-05, + "loss": 0.8787, + "step": 27380 + }, + { + "epoch": 5.840656996587031, + "grad_norm": 0.09336754307785212, + "learning_rate": 1.3001216374118575e-05, + "loss": 0.8932, + "step": 27381 + }, + { + "epoch": 5.840870307167235, + "grad_norm": 0.09591508083923105, + "learning_rate": 1.2996566501413824e-05, + "loss": 0.8925, + "step": 27382 + }, + { + "epoch": 5.84108361774744, + "grad_norm": 0.09833153825753149, + "learning_rate": 1.2991917386854031e-05, + "loss": 0.8597, + "step": 27383 + }, + { + "epoch": 5.841296928327645, + "grad_norm": 0.10549407075068161, + "learning_rate": 1.2987269030491843e-05, + "loss": 0.8978, + "step": 27384 + }, + { + "epoch": 5.8415102389078495, + "grad_norm": 0.0931126877324453, + "learning_rate": 1.2982621432379854e-05, + "loss": 0.8813, + "step": 27385 + }, + { + "epoch": 5.841723549488055, + "grad_norm": 0.10069616694759446, + "learning_rate": 1.2977974592570583e-05, + "loss": 0.8763, + "step": 27386 + }, + { + "epoch": 5.84193686006826, + "grad_norm": 0.113773577589572, + "learning_rate": 1.2973328511116684e-05, + "loss": 0.8876, + "step": 27387 + }, + { + "epoch": 5.842150170648464, + "grad_norm": 0.09383273921204081, + "learning_rate": 1.2968683188070696e-05, + "loss": 0.8853, + "step": 27388 + }, + { + "epoch": 5.842363481228669, + "grad_norm": 0.10709740155027211, + "learning_rate": 1.2964038623485186e-05, + "loss": 0.8934, + "step": 27389 + }, + { + "epoch": 5.842576791808874, + "grad_norm": 0.09814234242593389, + "learning_rate": 1.295939481741269e-05, + "loss": 0.8748, + "step": 27390 + }, + { + "epoch": 5.8427901023890785, + "grad_norm": 0.10578084897303136, + "learning_rate": 1.2954751769905794e-05, + "loss": 0.8869, + "step": 27391 + }, + { + "epoch": 5.843003412969283, + "grad_norm": 0.10432883136679881, + "learning_rate": 1.2950109481017e-05, + "loss": 0.8956, + "step": 27392 + }, + { + "epoch": 5.843216723549488, + "grad_norm": 0.09238490572571595, + "learning_rate": 1.2945467950798842e-05, + "loss": 0.9015, + "step": 27393 + }, + { + "epoch": 5.8434300341296925, + "grad_norm": 0.1052472933149561, + "learning_rate": 1.294082717930385e-05, + "loss": 0.9199, + "step": 27394 + }, + { + "epoch": 5.843643344709898, + "grad_norm": 0.09699165461708255, + "learning_rate": 1.2936187166584504e-05, + "loss": 0.8812, + "step": 27395 + }, + { + "epoch": 5.843856655290102, + "grad_norm": 0.09028610373134216, + "learning_rate": 1.2931547912693368e-05, + "loss": 0.8805, + "step": 27396 + }, + { + "epoch": 5.8440699658703075, + "grad_norm": 0.09073163975625896, + "learning_rate": 1.2926909417682878e-05, + "loss": 0.8929, + "step": 27397 + }, + { + "epoch": 5.844283276450512, + "grad_norm": 0.10349687375467073, + "learning_rate": 1.2922271681605518e-05, + "loss": 0.9003, + "step": 27398 + }, + { + "epoch": 5.844496587030717, + "grad_norm": 0.09290857355331879, + "learning_rate": 1.2917634704513797e-05, + "loss": 0.8849, + "step": 27399 + }, + { + "epoch": 5.8447098976109215, + "grad_norm": 0.10597626158877371, + "learning_rate": 1.291299848646017e-05, + "loss": 0.9092, + "step": 27400 + }, + { + "epoch": 5.844923208191126, + "grad_norm": 0.09859427600466769, + "learning_rate": 1.2908363027497095e-05, + "loss": 0.8634, + "step": 27401 + }, + { + "epoch": 5.845136518771331, + "grad_norm": 0.10388964747148609, + "learning_rate": 1.2903728327677e-05, + "loss": 0.8738, + "step": 27402 + }, + { + "epoch": 5.845349829351536, + "grad_norm": 0.103496761337364, + "learning_rate": 1.2899094387052364e-05, + "loss": 0.9307, + "step": 27403 + }, + { + "epoch": 5.84556313993174, + "grad_norm": 0.10933707680294193, + "learning_rate": 1.2894461205675603e-05, + "loss": 0.9002, + "step": 27404 + }, + { + "epoch": 5.845776450511945, + "grad_norm": 0.10473234082345932, + "learning_rate": 1.2889828783599146e-05, + "loss": 0.8537, + "step": 27405 + }, + { + "epoch": 5.8459897610921505, + "grad_norm": 0.10491721674286338, + "learning_rate": 1.2885197120875401e-05, + "loss": 0.8813, + "step": 27406 + }, + { + "epoch": 5.846203071672355, + "grad_norm": 0.09135502691618094, + "learning_rate": 1.2880566217556765e-05, + "loss": 0.8595, + "step": 27407 + }, + { + "epoch": 5.84641638225256, + "grad_norm": 0.09777145647213593, + "learning_rate": 1.2875936073695692e-05, + "loss": 0.9086, + "step": 27408 + }, + { + "epoch": 5.846629692832765, + "grad_norm": 0.09901346646553663, + "learning_rate": 1.2871306689344504e-05, + "loss": 0.8646, + "step": 27409 + }, + { + "epoch": 5.846843003412969, + "grad_norm": 0.09384530526854966, + "learning_rate": 1.2866678064555598e-05, + "loss": 0.9002, + "step": 27410 + }, + { + "epoch": 5.847056313993174, + "grad_norm": 0.08798122674191909, + "learning_rate": 1.2862050199381373e-05, + "loss": 0.854, + "step": 27411 + }, + { + "epoch": 5.847269624573379, + "grad_norm": 0.1103205337509795, + "learning_rate": 1.285742309387418e-05, + "loss": 0.8968, + "step": 27412 + }, + { + "epoch": 5.847482935153583, + "grad_norm": 0.1002805050001961, + "learning_rate": 1.2852796748086376e-05, + "loss": 0.8711, + "step": 27413 + }, + { + "epoch": 5.847696245733788, + "grad_norm": 0.09361100672812091, + "learning_rate": 1.2848171162070284e-05, + "loss": 0.8909, + "step": 27414 + }, + { + "epoch": 5.847909556313994, + "grad_norm": 0.11252980079533038, + "learning_rate": 1.284354633587829e-05, + "loss": 0.8808, + "step": 27415 + }, + { + "epoch": 5.848122866894198, + "grad_norm": 0.10472868229276228, + "learning_rate": 1.2838922269562688e-05, + "loss": 0.8842, + "step": 27416 + }, + { + "epoch": 5.848336177474403, + "grad_norm": 0.09766113315304176, + "learning_rate": 1.283429896317582e-05, + "loss": 0.9048, + "step": 27417 + }, + { + "epoch": 5.848549488054608, + "grad_norm": 0.10059766432087522, + "learning_rate": 1.282967641676999e-05, + "loss": 0.8677, + "step": 27418 + }, + { + "epoch": 5.848762798634812, + "grad_norm": 0.10086698593433088, + "learning_rate": 1.2825054630397492e-05, + "loss": 0.8549, + "step": 27419 + }, + { + "epoch": 5.848976109215017, + "grad_norm": 0.10487432383458697, + "learning_rate": 1.2820433604110636e-05, + "loss": 0.8851, + "step": 27420 + }, + { + "epoch": 5.849189419795222, + "grad_norm": 0.09522719314356785, + "learning_rate": 1.2815813337961705e-05, + "loss": 0.8879, + "step": 27421 + }, + { + "epoch": 5.849402730375426, + "grad_norm": 0.10143676902720887, + "learning_rate": 1.2811193832002959e-05, + "loss": 0.899, + "step": 27422 + }, + { + "epoch": 5.849616040955631, + "grad_norm": 0.09179930396920115, + "learning_rate": 1.2806575086286696e-05, + "loss": 0.8735, + "step": 27423 + }, + { + "epoch": 5.849829351535837, + "grad_norm": 0.09758612344001404, + "learning_rate": 1.2801957100865177e-05, + "loss": 0.8572, + "step": 27424 + }, + { + "epoch": 5.850042662116041, + "grad_norm": 0.09926759234113676, + "learning_rate": 1.2797339875790633e-05, + "loss": 0.8974, + "step": 27425 + }, + { + "epoch": 5.850255972696246, + "grad_norm": 0.09616633164430685, + "learning_rate": 1.2792723411115308e-05, + "loss": 0.8798, + "step": 27426 + }, + { + "epoch": 5.850469283276451, + "grad_norm": 0.10506538133769411, + "learning_rate": 1.2788107706891464e-05, + "loss": 0.8823, + "step": 27427 + }, + { + "epoch": 5.850682593856655, + "grad_norm": 0.09418495647764374, + "learning_rate": 1.2783492763171305e-05, + "loss": 0.8784, + "step": 27428 + }, + { + "epoch": 5.85089590443686, + "grad_norm": 0.10353062384719981, + "learning_rate": 1.2778878580007068e-05, + "loss": 0.8751, + "step": 27429 + }, + { + "epoch": 5.851109215017065, + "grad_norm": 0.10285469976311136, + "learning_rate": 1.277426515745094e-05, + "loss": 0.8708, + "step": 27430 + }, + { + "epoch": 5.851322525597269, + "grad_norm": 0.09818427958452446, + "learning_rate": 1.2769652495555138e-05, + "loss": 0.9108, + "step": 27431 + }, + { + "epoch": 5.851535836177474, + "grad_norm": 0.10422057909787129, + "learning_rate": 1.2765040594371852e-05, + "loss": 0.884, + "step": 27432 + }, + { + "epoch": 5.85174914675768, + "grad_norm": 0.09445998664428677, + "learning_rate": 1.2760429453953261e-05, + "loss": 0.8843, + "step": 27433 + }, + { + "epoch": 5.851962457337884, + "grad_norm": 0.10121655889563433, + "learning_rate": 1.2755819074351518e-05, + "loss": 0.9239, + "step": 27434 + }, + { + "epoch": 5.852175767918089, + "grad_norm": 0.09871431605654145, + "learning_rate": 1.2751209455618833e-05, + "loss": 0.8719, + "step": 27435 + }, + { + "epoch": 5.852389078498294, + "grad_norm": 0.12289343481987482, + "learning_rate": 1.274660059780735e-05, + "loss": 0.9119, + "step": 27436 + }, + { + "epoch": 5.852602389078498, + "grad_norm": 0.10413761773705424, + "learning_rate": 1.2741992500969204e-05, + "loss": 0.8522, + "step": 27437 + }, + { + "epoch": 5.852815699658703, + "grad_norm": 0.11322712151021376, + "learning_rate": 1.2737385165156532e-05, + "loss": 0.8666, + "step": 27438 + }, + { + "epoch": 5.853029010238908, + "grad_norm": 0.1193329094016395, + "learning_rate": 1.2732778590421498e-05, + "loss": 0.9301, + "step": 27439 + }, + { + "epoch": 5.853242320819112, + "grad_norm": 0.09887886221909978, + "learning_rate": 1.2728172776816199e-05, + "loss": 0.9037, + "step": 27440 + }, + { + "epoch": 5.853455631399317, + "grad_norm": 0.14011891821183395, + "learning_rate": 1.2723567724392783e-05, + "loss": 0.9089, + "step": 27441 + }, + { + "epoch": 5.853668941979522, + "grad_norm": 0.11818788989834815, + "learning_rate": 1.2718963433203287e-05, + "loss": 0.9144, + "step": 27442 + }, + { + "epoch": 5.8538822525597265, + "grad_norm": 0.10639306777735, + "learning_rate": 1.2714359903299865e-05, + "loss": 0.878, + "step": 27443 + }, + { + "epoch": 5.854095563139932, + "grad_norm": 0.111546750676509, + "learning_rate": 1.2709757134734599e-05, + "loss": 0.8944, + "step": 27444 + }, + { + "epoch": 5.854308873720137, + "grad_norm": 0.11339113295207083, + "learning_rate": 1.2705155127559566e-05, + "loss": 0.8843, + "step": 27445 + }, + { + "epoch": 5.854522184300341, + "grad_norm": 0.10194789833067407, + "learning_rate": 1.2700553881826809e-05, + "loss": 0.8877, + "step": 27446 + }, + { + "epoch": 5.854735494880546, + "grad_norm": 0.10418274391677301, + "learning_rate": 1.2695953397588441e-05, + "loss": 0.8739, + "step": 27447 + }, + { + "epoch": 5.854948805460751, + "grad_norm": 0.09890821831442549, + "learning_rate": 1.2691353674896489e-05, + "loss": 0.868, + "step": 27448 + }, + { + "epoch": 5.8551621160409555, + "grad_norm": 0.09446365723786643, + "learning_rate": 1.2686754713803002e-05, + "loss": 0.8891, + "step": 27449 + }, + { + "epoch": 5.85537542662116, + "grad_norm": 0.10566077618235102, + "learning_rate": 1.268215651436e-05, + "loss": 0.8947, + "step": 27450 + }, + { + "epoch": 5.855588737201365, + "grad_norm": 0.09254433537600308, + "learning_rate": 1.2677559076619548e-05, + "loss": 0.8804, + "step": 27451 + }, + { + "epoch": 5.8558020477815695, + "grad_norm": 0.11310258474057114, + "learning_rate": 1.2672962400633675e-05, + "loss": 0.8974, + "step": 27452 + }, + { + "epoch": 5.856015358361775, + "grad_norm": 0.10421757660543131, + "learning_rate": 1.2668366486454331e-05, + "loss": 0.8931, + "step": 27453 + }, + { + "epoch": 5.85622866894198, + "grad_norm": 0.10143569072196677, + "learning_rate": 1.2663771334133545e-05, + "loss": 0.8878, + "step": 27454 + }, + { + "epoch": 5.8564419795221845, + "grad_norm": 0.11459908194903691, + "learning_rate": 1.265917694372333e-05, + "loss": 0.8987, + "step": 27455 + }, + { + "epoch": 5.856655290102389, + "grad_norm": 0.11682150469897447, + "learning_rate": 1.265458331527567e-05, + "loss": 0.8832, + "step": 27456 + }, + { + "epoch": 5.856868600682594, + "grad_norm": 0.11139947879634372, + "learning_rate": 1.2649990448842523e-05, + "loss": 0.8822, + "step": 27457 + }, + { + "epoch": 5.8570819112627985, + "grad_norm": 0.1059769537863646, + "learning_rate": 1.264539834447585e-05, + "loss": 0.8894, + "step": 27458 + }, + { + "epoch": 5.857295221843003, + "grad_norm": 0.10840285942378651, + "learning_rate": 1.2640807002227651e-05, + "loss": 0.868, + "step": 27459 + }, + { + "epoch": 5.857508532423208, + "grad_norm": 0.1002456125659717, + "learning_rate": 1.263621642214985e-05, + "loss": 0.8786, + "step": 27460 + }, + { + "epoch": 5.857721843003413, + "grad_norm": 0.10180924153169822, + "learning_rate": 1.2631626604294396e-05, + "loss": 0.8896, + "step": 27461 + }, + { + "epoch": 5.857935153583618, + "grad_norm": 0.09894283296058275, + "learning_rate": 1.2627037548713204e-05, + "loss": 0.8787, + "step": 27462 + }, + { + "epoch": 5.858148464163823, + "grad_norm": 0.09920628531488891, + "learning_rate": 1.262244925545825e-05, + "loss": 0.8953, + "step": 27463 + }, + { + "epoch": 5.8583617747440275, + "grad_norm": 0.10831903783090335, + "learning_rate": 1.2617861724581392e-05, + "loss": 0.9014, + "step": 27464 + }, + { + "epoch": 5.858575085324232, + "grad_norm": 0.09692477578013092, + "learning_rate": 1.2613274956134567e-05, + "loss": 0.8854, + "step": 27465 + }, + { + "epoch": 5.858788395904437, + "grad_norm": 0.09067452642855797, + "learning_rate": 1.260868895016965e-05, + "loss": 0.8837, + "step": 27466 + }, + { + "epoch": 5.859001706484642, + "grad_norm": 0.09681512190336097, + "learning_rate": 1.2604103706738563e-05, + "loss": 0.927, + "step": 27467 + }, + { + "epoch": 5.859215017064846, + "grad_norm": 0.09183670469986503, + "learning_rate": 1.259951922589319e-05, + "loss": 0.897, + "step": 27468 + }, + { + "epoch": 5.859428327645051, + "grad_norm": 0.1031449281880891, + "learning_rate": 1.2594935507685385e-05, + "loss": 0.8781, + "step": 27469 + }, + { + "epoch": 5.859641638225256, + "grad_norm": 0.0940286587615258, + "learning_rate": 1.2590352552166993e-05, + "loss": 0.8668, + "step": 27470 + }, + { + "epoch": 5.859854948805461, + "grad_norm": 0.09241834138072537, + "learning_rate": 1.2585770359389922e-05, + "loss": 0.884, + "step": 27471 + }, + { + "epoch": 5.860068259385666, + "grad_norm": 0.09304024740304777, + "learning_rate": 1.2581188929405984e-05, + "loss": 0.8939, + "step": 27472 + }, + { + "epoch": 5.860281569965871, + "grad_norm": 0.10474629051564738, + "learning_rate": 1.2576608262267036e-05, + "loss": 0.9033, + "step": 27473 + }, + { + "epoch": 5.860494880546075, + "grad_norm": 0.08892716501136698, + "learning_rate": 1.2572028358024881e-05, + "loss": 0.8855, + "step": 27474 + }, + { + "epoch": 5.86070819112628, + "grad_norm": 0.08971645091868838, + "learning_rate": 1.2567449216731391e-05, + "loss": 0.891, + "step": 27475 + }, + { + "epoch": 5.860921501706485, + "grad_norm": 0.11293549266281985, + "learning_rate": 1.2562870838438337e-05, + "loss": 0.8803, + "step": 27476 + }, + { + "epoch": 5.861134812286689, + "grad_norm": 0.09798750813266663, + "learning_rate": 1.2558293223197526e-05, + "loss": 0.8747, + "step": 27477 + }, + { + "epoch": 5.861348122866894, + "grad_norm": 0.09864882272781268, + "learning_rate": 1.2553716371060748e-05, + "loss": 0.885, + "step": 27478 + }, + { + "epoch": 5.861561433447099, + "grad_norm": 0.11651267294494982, + "learning_rate": 1.2549140282079821e-05, + "loss": 0.8674, + "step": 27479 + }, + { + "epoch": 5.861774744027304, + "grad_norm": 0.09467199213693342, + "learning_rate": 1.2544564956306506e-05, + "loss": 0.8751, + "step": 27480 + }, + { + "epoch": 5.861988054607508, + "grad_norm": 0.10547069045924228, + "learning_rate": 1.2539990393792567e-05, + "loss": 0.8917, + "step": 27481 + }, + { + "epoch": 5.862201365187714, + "grad_norm": 0.11099370287079827, + "learning_rate": 1.2535416594589767e-05, + "loss": 0.8942, + "step": 27482 + }, + { + "epoch": 5.862414675767918, + "grad_norm": 0.09079996704825774, + "learning_rate": 1.2530843558749867e-05, + "loss": 0.8968, + "step": 27483 + }, + { + "epoch": 5.862627986348123, + "grad_norm": 0.09844943522145354, + "learning_rate": 1.2526271286324615e-05, + "loss": 0.8857, + "step": 27484 + }, + { + "epoch": 5.862841296928328, + "grad_norm": 0.11647524174027232, + "learning_rate": 1.2521699777365739e-05, + "loss": 0.8669, + "step": 27485 + }, + { + "epoch": 5.863054607508532, + "grad_norm": 0.10328074432871398, + "learning_rate": 1.2517129031924963e-05, + "loss": 0.8567, + "step": 27486 + }, + { + "epoch": 5.863267918088737, + "grad_norm": 0.09481594161310479, + "learning_rate": 1.2512559050054e-05, + "loss": 0.8882, + "step": 27487 + }, + { + "epoch": 5.863481228668942, + "grad_norm": 0.10338270450334242, + "learning_rate": 1.2507989831804577e-05, + "loss": 0.9068, + "step": 27488 + }, + { + "epoch": 5.863694539249146, + "grad_norm": 0.10015136957514399, + "learning_rate": 1.2503421377228375e-05, + "loss": 0.8917, + "step": 27489 + }, + { + "epoch": 5.863907849829351, + "grad_norm": 0.0932273160510571, + "learning_rate": 1.2498853686377089e-05, + "loss": 0.8901, + "step": 27490 + }, + { + "epoch": 5.864121160409557, + "grad_norm": 0.0915635074800073, + "learning_rate": 1.2494286759302416e-05, + "loss": 0.8785, + "step": 27491 + }, + { + "epoch": 5.864334470989761, + "grad_norm": 0.10621850863096385, + "learning_rate": 1.2489720596056034e-05, + "loss": 0.9031, + "step": 27492 + }, + { + "epoch": 5.864547781569966, + "grad_norm": 0.09536426273030392, + "learning_rate": 1.2485155196689589e-05, + "loss": 0.897, + "step": 27493 + }, + { + "epoch": 5.864761092150171, + "grad_norm": 0.10921711226817887, + "learning_rate": 1.2480590561254737e-05, + "loss": 0.8665, + "step": 27494 + }, + { + "epoch": 5.864974402730375, + "grad_norm": 0.09604670190549852, + "learning_rate": 1.2476026689803153e-05, + "loss": 0.9195, + "step": 27495 + }, + { + "epoch": 5.86518771331058, + "grad_norm": 0.09289625196887961, + "learning_rate": 1.2471463582386467e-05, + "loss": 0.8912, + "step": 27496 + }, + { + "epoch": 5.865401023890785, + "grad_norm": 0.09669831837764879, + "learning_rate": 1.2466901239056317e-05, + "loss": 0.8895, + "step": 27497 + }, + { + "epoch": 5.8656143344709895, + "grad_norm": 0.09265058255504909, + "learning_rate": 1.2462339659864288e-05, + "loss": 0.8793, + "step": 27498 + }, + { + "epoch": 5.865827645051194, + "grad_norm": 0.09824323034098909, + "learning_rate": 1.2457778844862038e-05, + "loss": 0.8819, + "step": 27499 + }, + { + "epoch": 5.8660409556314, + "grad_norm": 0.10208539457693537, + "learning_rate": 1.2453218794101152e-05, + "loss": 0.9339, + "step": 27500 + }, + { + "epoch": 5.866254266211604, + "grad_norm": 0.09354178563888907, + "learning_rate": 1.2448659507633231e-05, + "loss": 0.8992, + "step": 27501 + }, + { + "epoch": 5.866467576791809, + "grad_norm": 0.09192408442459289, + "learning_rate": 1.2444100985509846e-05, + "loss": 0.8771, + "step": 27502 + }, + { + "epoch": 5.866680887372014, + "grad_norm": 0.10041850926695693, + "learning_rate": 1.2439543227782616e-05, + "loss": 0.9094, + "step": 27503 + }, + { + "epoch": 5.8668941979522184, + "grad_norm": 0.09064479354752823, + "learning_rate": 1.243498623450309e-05, + "loss": 0.9058, + "step": 27504 + }, + { + "epoch": 5.867107508532423, + "grad_norm": 0.10039189982409118, + "learning_rate": 1.2430430005722838e-05, + "loss": 0.8687, + "step": 27505 + }, + { + "epoch": 5.867320819112628, + "grad_norm": 0.09325247682746207, + "learning_rate": 1.2425874541493382e-05, + "loss": 0.8709, + "step": 27506 + }, + { + "epoch": 5.8675341296928325, + "grad_norm": 0.11244888754481581, + "learning_rate": 1.2421319841866307e-05, + "loss": 0.8919, + "step": 27507 + }, + { + "epoch": 5.867747440273037, + "grad_norm": 0.10015255283824702, + "learning_rate": 1.2416765906893154e-05, + "loss": 0.8766, + "step": 27508 + }, + { + "epoch": 5.867960750853243, + "grad_norm": 0.09597121620273466, + "learning_rate": 1.241221273662542e-05, + "loss": 0.8651, + "step": 27509 + }, + { + "epoch": 5.868174061433447, + "grad_norm": 0.1126763578293267, + "learning_rate": 1.240766033111461e-05, + "loss": 0.9253, + "step": 27510 + }, + { + "epoch": 5.868387372013652, + "grad_norm": 0.10127484082619785, + "learning_rate": 1.2403108690412285e-05, + "loss": 0.8799, + "step": 27511 + }, + { + "epoch": 5.868600682593857, + "grad_norm": 0.10261423081848174, + "learning_rate": 1.2398557814569921e-05, + "loss": 0.8619, + "step": 27512 + }, + { + "epoch": 5.8688139931740615, + "grad_norm": 0.11529183839314469, + "learning_rate": 1.2394007703639007e-05, + "loss": 0.885, + "step": 27513 + }, + { + "epoch": 5.869027303754266, + "grad_norm": 0.10606614087927382, + "learning_rate": 1.238945835767102e-05, + "loss": 0.8869, + "step": 27514 + }, + { + "epoch": 5.869240614334471, + "grad_norm": 0.10071709010870901, + "learning_rate": 1.238490977671746e-05, + "loss": 0.8745, + "step": 27515 + }, + { + "epoch": 5.8694539249146755, + "grad_norm": 0.1185617883040224, + "learning_rate": 1.2380361960829793e-05, + "loss": 0.898, + "step": 27516 + }, + { + "epoch": 5.86966723549488, + "grad_norm": 0.1143234613083758, + "learning_rate": 1.2375814910059459e-05, + "loss": 0.8649, + "step": 27517 + }, + { + "epoch": 5.869880546075086, + "grad_norm": 0.11452540969262014, + "learning_rate": 1.2371268624457904e-05, + "loss": 0.91, + "step": 27518 + }, + { + "epoch": 5.8700938566552905, + "grad_norm": 0.11538045972787829, + "learning_rate": 1.236672310407662e-05, + "loss": 0.8957, + "step": 27519 + }, + { + "epoch": 5.870307167235495, + "grad_norm": 0.09811164040100305, + "learning_rate": 1.2362178348966984e-05, + "loss": 0.8792, + "step": 27520 + }, + { + "epoch": 5.8705204778157, + "grad_norm": 0.11294243184618213, + "learning_rate": 1.2357634359180435e-05, + "loss": 0.9099, + "step": 27521 + }, + { + "epoch": 5.8707337883959045, + "grad_norm": 0.1371045797322052, + "learning_rate": 1.2353091134768369e-05, + "loss": 0.8953, + "step": 27522 + }, + { + "epoch": 5.870947098976109, + "grad_norm": 0.09247263439710357, + "learning_rate": 1.234854867578223e-05, + "loss": 0.9033, + "step": 27523 + }, + { + "epoch": 5.871160409556314, + "grad_norm": 0.12206875902834498, + "learning_rate": 1.234400698227341e-05, + "loss": 0.8885, + "step": 27524 + }, + { + "epoch": 5.871373720136519, + "grad_norm": 0.10762702128921232, + "learning_rate": 1.2339466054293285e-05, + "loss": 0.8989, + "step": 27525 + }, + { + "epoch": 5.871587030716723, + "grad_norm": 0.10476517979093798, + "learning_rate": 1.2334925891893223e-05, + "loss": 0.9144, + "step": 27526 + }, + { + "epoch": 5.871800341296928, + "grad_norm": 0.11241953745467986, + "learning_rate": 1.233038649512463e-05, + "loss": 0.9408, + "step": 27527 + }, + { + "epoch": 5.872013651877133, + "grad_norm": 0.10999156940976035, + "learning_rate": 1.2325847864038853e-05, + "loss": 0.9315, + "step": 27528 + }, + { + "epoch": 5.872226962457338, + "grad_norm": 0.1099767754491219, + "learning_rate": 1.2321309998687245e-05, + "loss": 0.9071, + "step": 27529 + }, + { + "epoch": 5.872440273037543, + "grad_norm": 0.10425287044538026, + "learning_rate": 1.2316772899121134e-05, + "loss": 0.8813, + "step": 27530 + }, + { + "epoch": 5.872653583617748, + "grad_norm": 0.09030004602557268, + "learning_rate": 1.2312236565391911e-05, + "loss": 0.8803, + "step": 27531 + }, + { + "epoch": 5.872866894197952, + "grad_norm": 0.10111678161529356, + "learning_rate": 1.2307700997550857e-05, + "loss": 0.872, + "step": 27532 + }, + { + "epoch": 5.873080204778157, + "grad_norm": 0.098563341796231, + "learning_rate": 1.2303166195649303e-05, + "loss": 0.9096, + "step": 27533 + }, + { + "epoch": 5.873293515358362, + "grad_norm": 0.09289425456703562, + "learning_rate": 1.2298632159738539e-05, + "loss": 0.8632, + "step": 27534 + }, + { + "epoch": 5.873506825938566, + "grad_norm": 0.1051255265514558, + "learning_rate": 1.2294098889869912e-05, + "loss": 0.8979, + "step": 27535 + }, + { + "epoch": 5.873720136518771, + "grad_norm": 0.1102628387115376, + "learning_rate": 1.2289566386094696e-05, + "loss": 0.895, + "step": 27536 + }, + { + "epoch": 5.873933447098976, + "grad_norm": 0.09375817123862366, + "learning_rate": 1.2285034648464174e-05, + "loss": 0.8349, + "step": 27537 + }, + { + "epoch": 5.874146757679181, + "grad_norm": 0.10470076061132741, + "learning_rate": 1.2280503677029603e-05, + "loss": 0.8847, + "step": 27538 + }, + { + "epoch": 5.874360068259386, + "grad_norm": 0.09656829839991599, + "learning_rate": 1.2275973471842294e-05, + "loss": 0.9086, + "step": 27539 + }, + { + "epoch": 5.874573378839591, + "grad_norm": 0.0952877526697995, + "learning_rate": 1.2271444032953484e-05, + "loss": 0.8607, + "step": 27540 + }, + { + "epoch": 5.874786689419795, + "grad_norm": 0.09932343921089817, + "learning_rate": 1.2266915360414422e-05, + "loss": 0.8608, + "step": 27541 + }, + { + "epoch": 5.875, + "grad_norm": 0.09163118467869046, + "learning_rate": 1.2262387454276357e-05, + "loss": 0.884, + "step": 27542 + }, + { + "epoch": 5.875213310580205, + "grad_norm": 0.1005087527699744, + "learning_rate": 1.2257860314590517e-05, + "loss": 0.917, + "step": 27543 + }, + { + "epoch": 5.875426621160409, + "grad_norm": 0.097668435124023, + "learning_rate": 1.2253333941408125e-05, + "loss": 0.8715, + "step": 27544 + }, + { + "epoch": 5.875639931740614, + "grad_norm": 0.09704990364819437, + "learning_rate": 1.2248808334780393e-05, + "loss": 0.9272, + "step": 27545 + }, + { + "epoch": 5.875853242320819, + "grad_norm": 0.10247302032889988, + "learning_rate": 1.2244283494758525e-05, + "loss": 0.8944, + "step": 27546 + }, + { + "epoch": 5.876066552901024, + "grad_norm": 0.0924186215782654, + "learning_rate": 1.223975942139374e-05, + "loss": 0.9139, + "step": 27547 + }, + { + "epoch": 5.876279863481229, + "grad_norm": 0.09936217958150365, + "learning_rate": 1.2235236114737217e-05, + "loss": 0.9005, + "step": 27548 + }, + { + "epoch": 5.876493174061434, + "grad_norm": 0.09619232927028637, + "learning_rate": 1.2230713574840141e-05, + "loss": 0.9004, + "step": 27549 + }, + { + "epoch": 5.876706484641638, + "grad_norm": 0.09746569552375964, + "learning_rate": 1.2226191801753661e-05, + "loss": 0.8888, + "step": 27550 + }, + { + "epoch": 5.876919795221843, + "grad_norm": 0.09516676491486989, + "learning_rate": 1.2221670795528983e-05, + "loss": 0.8913, + "step": 27551 + }, + { + "epoch": 5.877133105802048, + "grad_norm": 0.10138747433715253, + "learning_rate": 1.2217150556217238e-05, + "loss": 0.9099, + "step": 27552 + }, + { + "epoch": 5.877346416382252, + "grad_norm": 0.11372927662733724, + "learning_rate": 1.2212631083869569e-05, + "loss": 0.9132, + "step": 27553 + }, + { + "epoch": 5.877559726962457, + "grad_norm": 0.10892817287098036, + "learning_rate": 1.2208112378537122e-05, + "loss": 0.8742, + "step": 27554 + }, + { + "epoch": 5.877773037542662, + "grad_norm": 0.09151799529140034, + "learning_rate": 1.2203594440271034e-05, + "loss": 0.8861, + "step": 27555 + }, + { + "epoch": 5.877986348122867, + "grad_norm": 0.10852615645378445, + "learning_rate": 1.21990772691224e-05, + "loss": 0.8923, + "step": 27556 + }, + { + "epoch": 5.878199658703072, + "grad_norm": 0.10147393278861538, + "learning_rate": 1.219456086514236e-05, + "loss": 0.8844, + "step": 27557 + }, + { + "epoch": 5.878412969283277, + "grad_norm": 0.09026270724534853, + "learning_rate": 1.2190045228381977e-05, + "loss": 0.8944, + "step": 27558 + }, + { + "epoch": 5.878626279863481, + "grad_norm": 0.09580754821674786, + "learning_rate": 1.2185530358892392e-05, + "loss": 0.9037, + "step": 27559 + }, + { + "epoch": 5.878839590443686, + "grad_norm": 0.1011415632103244, + "learning_rate": 1.2181016256724676e-05, + "loss": 0.8841, + "step": 27560 + }, + { + "epoch": 5.879052901023891, + "grad_norm": 0.10685294141696017, + "learning_rate": 1.2176502921929897e-05, + "loss": 0.9087, + "step": 27561 + }, + { + "epoch": 5.8792662116040955, + "grad_norm": 0.09263758562945178, + "learning_rate": 1.217199035455911e-05, + "loss": 0.8845, + "step": 27562 + }, + { + "epoch": 5.8794795221843, + "grad_norm": 0.10038265140440163, + "learning_rate": 1.2167478554663422e-05, + "loss": 0.8935, + "step": 27563 + }, + { + "epoch": 5.879692832764505, + "grad_norm": 0.09885827406941367, + "learning_rate": 1.2162967522293863e-05, + "loss": 0.8593, + "step": 27564 + }, + { + "epoch": 5.8799061433447095, + "grad_norm": 0.0931643305651572, + "learning_rate": 1.2158457257501434e-05, + "loss": 0.8966, + "step": 27565 + }, + { + "epoch": 5.880119453924914, + "grad_norm": 0.1025083322784292, + "learning_rate": 1.2153947760337217e-05, + "loss": 0.8898, + "step": 27566 + }, + { + "epoch": 5.88033276450512, + "grad_norm": 0.10446127412871528, + "learning_rate": 1.2149439030852226e-05, + "loss": 0.9122, + "step": 27567 + }, + { + "epoch": 5.8805460750853245, + "grad_norm": 0.0987000532410908, + "learning_rate": 1.214493106909747e-05, + "loss": 0.8996, + "step": 27568 + }, + { + "epoch": 5.880759385665529, + "grad_norm": 0.11379140248486126, + "learning_rate": 1.2140423875123966e-05, + "loss": 0.8834, + "step": 27569 + }, + { + "epoch": 5.880972696245734, + "grad_norm": 0.09670720198077075, + "learning_rate": 1.2135917448982685e-05, + "loss": 0.8574, + "step": 27570 + }, + { + "epoch": 5.8811860068259385, + "grad_norm": 0.10180593862950929, + "learning_rate": 1.2131411790724657e-05, + "loss": 0.8786, + "step": 27571 + }, + { + "epoch": 5.881399317406143, + "grad_norm": 0.12147829222819401, + "learning_rate": 1.212690690040085e-05, + "loss": 0.8759, + "step": 27572 + }, + { + "epoch": 5.881612627986348, + "grad_norm": 0.09952905714987506, + "learning_rate": 1.212240277806223e-05, + "loss": 0.8899, + "step": 27573 + }, + { + "epoch": 5.881825938566553, + "grad_norm": 0.11763691814923165, + "learning_rate": 1.2117899423759752e-05, + "loss": 0.8765, + "step": 27574 + }, + { + "epoch": 5.882039249146757, + "grad_norm": 0.11324032842323761, + "learning_rate": 1.2113396837544414e-05, + "loss": 0.8834, + "step": 27575 + }, + { + "epoch": 5.882252559726963, + "grad_norm": 0.10344404952439154, + "learning_rate": 1.2108895019467118e-05, + "loss": 0.8851, + "step": 27576 + }, + { + "epoch": 5.8824658703071675, + "grad_norm": 0.10906574098227595, + "learning_rate": 1.2104393969578792e-05, + "loss": 0.8775, + "step": 27577 + }, + { + "epoch": 5.882679180887372, + "grad_norm": 0.10431167641599087, + "learning_rate": 1.2099893687930409e-05, + "loss": 0.8945, + "step": 27578 + }, + { + "epoch": 5.882892491467577, + "grad_norm": 0.11676826239261776, + "learning_rate": 1.2095394174572879e-05, + "loss": 0.8725, + "step": 27579 + }, + { + "epoch": 5.8831058020477816, + "grad_norm": 0.09699306066225835, + "learning_rate": 1.2090895429557104e-05, + "loss": 0.8841, + "step": 27580 + }, + { + "epoch": 5.883319112627986, + "grad_norm": 0.10581992802503973, + "learning_rate": 1.2086397452933984e-05, + "loss": 0.8894, + "step": 27581 + }, + { + "epoch": 5.883532423208191, + "grad_norm": 0.12204955314498571, + "learning_rate": 1.2081900244754406e-05, + "loss": 0.9055, + "step": 27582 + }, + { + "epoch": 5.883745733788396, + "grad_norm": 0.09799075559192442, + "learning_rate": 1.2077403805069286e-05, + "loss": 0.867, + "step": 27583 + }, + { + "epoch": 5.8839590443686, + "grad_norm": 0.11483036606392383, + "learning_rate": 1.2072908133929481e-05, + "loss": 0.8939, + "step": 27584 + }, + { + "epoch": 5.884172354948806, + "grad_norm": 0.10781484569216845, + "learning_rate": 1.2068413231385869e-05, + "loss": 0.9027, + "step": 27585 + }, + { + "epoch": 5.8843856655290105, + "grad_norm": 0.09892446122474856, + "learning_rate": 1.2063919097489278e-05, + "loss": 0.8973, + "step": 27586 + }, + { + "epoch": 5.884598976109215, + "grad_norm": 0.1081259688626212, + "learning_rate": 1.2059425732290629e-05, + "loss": 0.9015, + "step": 27587 + }, + { + "epoch": 5.88481228668942, + "grad_norm": 0.09468774246236256, + "learning_rate": 1.2054933135840695e-05, + "loss": 0.9002, + "step": 27588 + }, + { + "epoch": 5.885025597269625, + "grad_norm": 0.10060235133112953, + "learning_rate": 1.205044130819033e-05, + "loss": 0.8837, + "step": 27589 + }, + { + "epoch": 5.885238907849829, + "grad_norm": 0.10541088115234178, + "learning_rate": 1.204595024939037e-05, + "loss": 0.8571, + "step": 27590 + }, + { + "epoch": 5.885452218430034, + "grad_norm": 0.10503873555028272, + "learning_rate": 1.2041459959491629e-05, + "loss": 0.8816, + "step": 27591 + }, + { + "epoch": 5.885665529010239, + "grad_norm": 0.10142736540928948, + "learning_rate": 1.2036970438544916e-05, + "loss": 0.876, + "step": 27592 + }, + { + "epoch": 5.885878839590443, + "grad_norm": 0.10009511264426386, + "learning_rate": 1.2032481686601023e-05, + "loss": 0.8741, + "step": 27593 + }, + { + "epoch": 5.886092150170649, + "grad_norm": 0.09817029800119677, + "learning_rate": 1.2027993703710724e-05, + "loss": 0.8834, + "step": 27594 + }, + { + "epoch": 5.886305460750854, + "grad_norm": 0.1016070698037276, + "learning_rate": 1.2023506489924838e-05, + "loss": 0.8912, + "step": 27595 + }, + { + "epoch": 5.886518771331058, + "grad_norm": 0.09238385683049692, + "learning_rate": 1.201902004529412e-05, + "loss": 0.861, + "step": 27596 + }, + { + "epoch": 5.886732081911263, + "grad_norm": 0.08952895585644333, + "learning_rate": 1.2014534369869337e-05, + "loss": 0.8459, + "step": 27597 + }, + { + "epoch": 5.886945392491468, + "grad_norm": 0.10539838606267096, + "learning_rate": 1.2010049463701239e-05, + "loss": 0.8958, + "step": 27598 + }, + { + "epoch": 5.887158703071672, + "grad_norm": 0.10226267069644486, + "learning_rate": 1.2005565326840566e-05, + "loss": 0.9177, + "step": 27599 + }, + { + "epoch": 5.887372013651877, + "grad_norm": 0.09284185546878795, + "learning_rate": 1.2001081959338082e-05, + "loss": 0.8821, + "step": 27600 + }, + { + "epoch": 5.887585324232082, + "grad_norm": 0.12791612897624172, + "learning_rate": 1.199659936124447e-05, + "loss": 0.936, + "step": 27601 + }, + { + "epoch": 5.887798634812286, + "grad_norm": 0.09638802447653792, + "learning_rate": 1.19921175326105e-05, + "loss": 0.9079, + "step": 27602 + }, + { + "epoch": 5.888011945392492, + "grad_norm": 0.10217927430616505, + "learning_rate": 1.1987636473486863e-05, + "loss": 0.8949, + "step": 27603 + }, + { + "epoch": 5.888225255972696, + "grad_norm": 0.1016419204772831, + "learning_rate": 1.1983156183924262e-05, + "loss": 0.8746, + "step": 27604 + }, + { + "epoch": 5.888438566552901, + "grad_norm": 0.10488410348540535, + "learning_rate": 1.1978676663973397e-05, + "loss": 0.8782, + "step": 27605 + }, + { + "epoch": 5.888651877133106, + "grad_norm": 0.1016812575046086, + "learning_rate": 1.1974197913684926e-05, + "loss": 0.906, + "step": 27606 + }, + { + "epoch": 5.888865187713311, + "grad_norm": 0.11254424209003164, + "learning_rate": 1.1969719933109576e-05, + "loss": 0.8932, + "step": 27607 + }, + { + "epoch": 5.889078498293515, + "grad_norm": 0.09097071261790224, + "learning_rate": 1.1965242722297981e-05, + "loss": 0.9015, + "step": 27608 + }, + { + "epoch": 5.88929180887372, + "grad_norm": 0.10744329313131715, + "learning_rate": 1.196076628130082e-05, + "loss": 0.882, + "step": 27609 + }, + { + "epoch": 5.889505119453925, + "grad_norm": 0.09263850717368964, + "learning_rate": 1.1956290610168719e-05, + "loss": 0.921, + "step": 27610 + }, + { + "epoch": 5.889718430034129, + "grad_norm": 0.11208135167326559, + "learning_rate": 1.195181570895234e-05, + "loss": 0.8835, + "step": 27611 + }, + { + "epoch": 5.889931740614334, + "grad_norm": 0.09747929552810425, + "learning_rate": 1.194734157770232e-05, + "loss": 0.8972, + "step": 27612 + }, + { + "epoch": 5.890145051194539, + "grad_norm": 0.09973657772934263, + "learning_rate": 1.1942868216469247e-05, + "loss": 0.8761, + "step": 27613 + }, + { + "epoch": 5.890358361774744, + "grad_norm": 0.10467522589315753, + "learning_rate": 1.1938395625303785e-05, + "loss": 0.8872, + "step": 27614 + }, + { + "epoch": 5.890571672354949, + "grad_norm": 0.10324253411626583, + "learning_rate": 1.1933923804256522e-05, + "loss": 0.9234, + "step": 27615 + }, + { + "epoch": 5.890784982935154, + "grad_norm": 0.09779971887890707, + "learning_rate": 1.1929452753378051e-05, + "loss": 0.8913, + "step": 27616 + }, + { + "epoch": 5.890998293515358, + "grad_norm": 0.10286732562004072, + "learning_rate": 1.1924982472718968e-05, + "loss": 0.9156, + "step": 27617 + }, + { + "epoch": 5.891211604095563, + "grad_norm": 0.11081837930718519, + "learning_rate": 1.1920512962329842e-05, + "loss": 0.88, + "step": 27618 + }, + { + "epoch": 5.891424914675768, + "grad_norm": 0.09298773054774562, + "learning_rate": 1.1916044222261269e-05, + "loss": 0.8583, + "step": 27619 + }, + { + "epoch": 5.8916382252559725, + "grad_norm": 0.10367434002435821, + "learning_rate": 1.1911576252563823e-05, + "loss": 0.8806, + "step": 27620 + }, + { + "epoch": 5.891851535836177, + "grad_norm": 0.09814230151757869, + "learning_rate": 1.1907109053287998e-05, + "loss": 0.9079, + "step": 27621 + }, + { + "epoch": 5.892064846416382, + "grad_norm": 0.1119145502842402, + "learning_rate": 1.19026426244844e-05, + "loss": 0.8752, + "step": 27622 + }, + { + "epoch": 5.892278156996587, + "grad_norm": 0.09767610438041674, + "learning_rate": 1.189817696620354e-05, + "loss": 0.8647, + "step": 27623 + }, + { + "epoch": 5.892491467576792, + "grad_norm": 0.10389449895077979, + "learning_rate": 1.1893712078495967e-05, + "loss": 0.8975, + "step": 27624 + }, + { + "epoch": 5.892704778156997, + "grad_norm": 0.1076765614754894, + "learning_rate": 1.1889247961412153e-05, + "loss": 0.8831, + "step": 27625 + }, + { + "epoch": 5.8929180887372015, + "grad_norm": 0.10493911320189939, + "learning_rate": 1.1884784615002674e-05, + "loss": 0.8852, + "step": 27626 + }, + { + "epoch": 5.893131399317406, + "grad_norm": 0.099984782519914, + "learning_rate": 1.1880322039317993e-05, + "loss": 0.8866, + "step": 27627 + }, + { + "epoch": 5.893344709897611, + "grad_norm": 0.09979244408045863, + "learning_rate": 1.1875860234408623e-05, + "loss": 0.9033, + "step": 27628 + }, + { + "epoch": 5.8935580204778155, + "grad_norm": 0.1008168580152854, + "learning_rate": 1.1871399200325037e-05, + "loss": 0.8889, + "step": 27629 + }, + { + "epoch": 5.89377133105802, + "grad_norm": 0.09277540320541257, + "learning_rate": 1.1866938937117695e-05, + "loss": 0.891, + "step": 27630 + }, + { + "epoch": 5.893984641638225, + "grad_norm": 0.09971258903665053, + "learning_rate": 1.1862479444837124e-05, + "loss": 0.8671, + "step": 27631 + }, + { + "epoch": 5.8941979522184305, + "grad_norm": 0.10644358506242893, + "learning_rate": 1.1858020723533721e-05, + "loss": 0.8936, + "step": 27632 + }, + { + "epoch": 5.894411262798635, + "grad_norm": 0.1012216582039515, + "learning_rate": 1.185356277325795e-05, + "loss": 0.8893, + "step": 27633 + }, + { + "epoch": 5.89462457337884, + "grad_norm": 0.1000082628877127, + "learning_rate": 1.1849105594060268e-05, + "loss": 0.8909, + "step": 27634 + }, + { + "epoch": 5.8948378839590445, + "grad_norm": 0.09991685521354816, + "learning_rate": 1.1844649185991118e-05, + "loss": 0.8973, + "step": 27635 + }, + { + "epoch": 5.895051194539249, + "grad_norm": 0.10073036141318974, + "learning_rate": 1.18401935491009e-05, + "loss": 0.8795, + "step": 27636 + }, + { + "epoch": 5.895264505119454, + "grad_norm": 0.09556051995008644, + "learning_rate": 1.183573868344002e-05, + "loss": 0.8854, + "step": 27637 + }, + { + "epoch": 5.895477815699659, + "grad_norm": 0.10655838698146448, + "learning_rate": 1.183128458905892e-05, + "loss": 0.9034, + "step": 27638 + }, + { + "epoch": 5.895691126279863, + "grad_norm": 0.10685231289620158, + "learning_rate": 1.1826831266007983e-05, + "loss": 0.9207, + "step": 27639 + }, + { + "epoch": 5.895904436860068, + "grad_norm": 0.11565805400448012, + "learning_rate": 1.1822378714337596e-05, + "loss": 0.9181, + "step": 27640 + }, + { + "epoch": 5.8961177474402735, + "grad_norm": 0.09744472888254486, + "learning_rate": 1.1817926934098148e-05, + "loss": 0.9035, + "step": 27641 + }, + { + "epoch": 5.896331058020478, + "grad_norm": 0.11361074658091815, + "learning_rate": 1.1813475925339994e-05, + "loss": 0.8563, + "step": 27642 + }, + { + "epoch": 5.896544368600683, + "grad_norm": 0.11073839767223431, + "learning_rate": 1.1809025688113515e-05, + "loss": 0.8964, + "step": 27643 + }, + { + "epoch": 5.896757679180888, + "grad_norm": 0.09151807680867066, + "learning_rate": 1.1804576222469053e-05, + "loss": 0.8863, + "step": 27644 + }, + { + "epoch": 5.896970989761092, + "grad_norm": 0.1407227979668464, + "learning_rate": 1.180012752845694e-05, + "loss": 0.8627, + "step": 27645 + }, + { + "epoch": 5.897184300341297, + "grad_norm": 0.09629495487281481, + "learning_rate": 1.1795679606127552e-05, + "loss": 0.8871, + "step": 27646 + }, + { + "epoch": 5.897397610921502, + "grad_norm": 0.09738666688067042, + "learning_rate": 1.1791232455531189e-05, + "loss": 0.8589, + "step": 27647 + }, + { + "epoch": 5.897610921501706, + "grad_norm": 0.10619075998298591, + "learning_rate": 1.1786786076718183e-05, + "loss": 0.8803, + "step": 27648 + }, + { + "epoch": 5.897824232081911, + "grad_norm": 0.08817866163652134, + "learning_rate": 1.1782340469738824e-05, + "loss": 0.8915, + "step": 27649 + }, + { + "epoch": 5.898037542662116, + "grad_norm": 0.12322441242617978, + "learning_rate": 1.1777895634643443e-05, + "loss": 0.9006, + "step": 27650 + }, + { + "epoch": 5.89825085324232, + "grad_norm": 0.09460764385364384, + "learning_rate": 1.177345157148233e-05, + "loss": 0.9099, + "step": 27651 + }, + { + "epoch": 5.898464163822526, + "grad_norm": 0.10474424230022682, + "learning_rate": 1.1769008280305747e-05, + "loss": 0.8691, + "step": 27652 + }, + { + "epoch": 5.898677474402731, + "grad_norm": 0.11903073878612086, + "learning_rate": 1.1764565761163991e-05, + "loss": 0.8919, + "step": 27653 + }, + { + "epoch": 5.898890784982935, + "grad_norm": 0.09759203561286567, + "learning_rate": 1.1760124014107324e-05, + "loss": 0.854, + "step": 27654 + }, + { + "epoch": 5.89910409556314, + "grad_norm": 0.11113326775941393, + "learning_rate": 1.1755683039185999e-05, + "loss": 0.8886, + "step": 27655 + }, + { + "epoch": 5.899317406143345, + "grad_norm": 0.11093844356071166, + "learning_rate": 1.175124283645026e-05, + "loss": 0.9031, + "step": 27656 + }, + { + "epoch": 5.899530716723549, + "grad_norm": 0.1095101468066133, + "learning_rate": 1.1746803405950344e-05, + "loss": 0.9085, + "step": 27657 + }, + { + "epoch": 5.899744027303754, + "grad_norm": 0.13740021745479064, + "learning_rate": 1.1742364747736521e-05, + "loss": 0.9081, + "step": 27658 + }, + { + "epoch": 5.899957337883959, + "grad_norm": 0.09608692049255467, + "learning_rate": 1.1737926861858981e-05, + "loss": 0.8643, + "step": 27659 + }, + { + "epoch": 5.900170648464163, + "grad_norm": 0.102326692835477, + "learning_rate": 1.1733489748367943e-05, + "loss": 0.8791, + "step": 27660 + }, + { + "epoch": 5.900383959044369, + "grad_norm": 0.12546558173655753, + "learning_rate": 1.1729053407313606e-05, + "loss": 0.9044, + "step": 27661 + }, + { + "epoch": 5.900597269624574, + "grad_norm": 0.094737393746313, + "learning_rate": 1.1724617838746198e-05, + "loss": 0.8726, + "step": 27662 + }, + { + "epoch": 5.900810580204778, + "grad_norm": 0.1144488237266937, + "learning_rate": 1.172018304271588e-05, + "loss": 0.8829, + "step": 27663 + }, + { + "epoch": 5.901023890784983, + "grad_norm": 0.09299211213393335, + "learning_rate": 1.1715749019272847e-05, + "loss": 0.8632, + "step": 27664 + }, + { + "epoch": 5.901237201365188, + "grad_norm": 0.09330628586412198, + "learning_rate": 1.171131576846726e-05, + "loss": 0.8728, + "step": 27665 + }, + { + "epoch": 5.901450511945392, + "grad_norm": 0.11542256703104294, + "learning_rate": 1.1706883290349276e-05, + "loss": 0.8708, + "step": 27666 + }, + { + "epoch": 5.901663822525597, + "grad_norm": 0.09126868331972471, + "learning_rate": 1.1702451584969067e-05, + "loss": 0.8887, + "step": 27667 + }, + { + "epoch": 5.901877133105802, + "grad_norm": 0.10526791243107261, + "learning_rate": 1.1698020652376756e-05, + "loss": 0.9054, + "step": 27668 + }, + { + "epoch": 5.902090443686006, + "grad_norm": 0.1016935781655226, + "learning_rate": 1.1693590492622487e-05, + "loss": 0.8894, + "step": 27669 + }, + { + "epoch": 5.902303754266212, + "grad_norm": 0.10134772814185176, + "learning_rate": 1.1689161105756397e-05, + "loss": 0.8784, + "step": 27670 + }, + { + "epoch": 5.902517064846417, + "grad_norm": 0.11732623755687795, + "learning_rate": 1.16847324918286e-05, + "loss": 0.8817, + "step": 27671 + }, + { + "epoch": 5.902730375426621, + "grad_norm": 0.08955563217045631, + "learning_rate": 1.1680304650889202e-05, + "loss": 0.891, + "step": 27672 + }, + { + "epoch": 5.902943686006826, + "grad_norm": 0.12127593432623258, + "learning_rate": 1.1675877582988293e-05, + "loss": 0.8735, + "step": 27673 + }, + { + "epoch": 5.903156996587031, + "grad_norm": 0.09601921246613959, + "learning_rate": 1.1671451288175999e-05, + "loss": 0.8831, + "step": 27674 + }, + { + "epoch": 5.903370307167235, + "grad_norm": 0.1008297587582097, + "learning_rate": 1.1667025766502378e-05, + "loss": 0.9008, + "step": 27675 + }, + { + "epoch": 5.90358361774744, + "grad_norm": 0.09785597180490672, + "learning_rate": 1.1662601018017523e-05, + "loss": 0.8969, + "step": 27676 + }, + { + "epoch": 5.903796928327645, + "grad_norm": 0.09948917640933315, + "learning_rate": 1.1658177042771457e-05, + "loss": 0.9113, + "step": 27677 + }, + { + "epoch": 5.9040102389078495, + "grad_norm": 0.11789205387366881, + "learning_rate": 1.165375384081429e-05, + "loss": 0.8574, + "step": 27678 + }, + { + "epoch": 5.904223549488055, + "grad_norm": 0.08846831044139454, + "learning_rate": 1.1649331412196036e-05, + "loss": 0.8675, + "step": 27679 + }, + { + "epoch": 5.90443686006826, + "grad_norm": 0.09428223914770492, + "learning_rate": 1.1644909756966754e-05, + "loss": 0.875, + "step": 27680 + }, + { + "epoch": 5.904650170648464, + "grad_norm": 0.11013977821476811, + "learning_rate": 1.1640488875176454e-05, + "loss": 0.8691, + "step": 27681 + }, + { + "epoch": 5.904863481228669, + "grad_norm": 0.09661295481957023, + "learning_rate": 1.163606876687518e-05, + "loss": 0.8517, + "step": 27682 + }, + { + "epoch": 5.905076791808874, + "grad_norm": 0.10538886708425549, + "learning_rate": 1.1631649432112941e-05, + "loss": 0.8892, + "step": 27683 + }, + { + "epoch": 5.9052901023890785, + "grad_norm": 0.09725819731814062, + "learning_rate": 1.1627230870939746e-05, + "loss": 0.8907, + "step": 27684 + }, + { + "epoch": 5.905503412969283, + "grad_norm": 0.10823472963131299, + "learning_rate": 1.162281308340556e-05, + "loss": 0.9105, + "step": 27685 + }, + { + "epoch": 5.905716723549488, + "grad_norm": 0.09750340824875983, + "learning_rate": 1.1618396069560415e-05, + "loss": 0.8632, + "step": 27686 + }, + { + "epoch": 5.9059300341296925, + "grad_norm": 0.10428803922849784, + "learning_rate": 1.1613979829454283e-05, + "loss": 0.8693, + "step": 27687 + }, + { + "epoch": 5.906143344709898, + "grad_norm": 0.10053361934556339, + "learning_rate": 1.1609564363137102e-05, + "loss": 0.8515, + "step": 27688 + }, + { + "epoch": 5.906356655290102, + "grad_norm": 0.11587842899650118, + "learning_rate": 1.1605149670658835e-05, + "loss": 0.9039, + "step": 27689 + }, + { + "epoch": 5.9065699658703075, + "grad_norm": 0.10129721201437937, + "learning_rate": 1.1600735752069468e-05, + "loss": 0.886, + "step": 27690 + }, + { + "epoch": 5.906783276450512, + "grad_norm": 0.12096231430465747, + "learning_rate": 1.1596322607418924e-05, + "loss": 0.8823, + "step": 27691 + }, + { + "epoch": 5.906996587030717, + "grad_norm": 0.09809557832611565, + "learning_rate": 1.1591910236757138e-05, + "loss": 0.8647, + "step": 27692 + }, + { + "epoch": 5.9072098976109215, + "grad_norm": 0.09664897543230559, + "learning_rate": 1.1587498640134025e-05, + "loss": 0.8647, + "step": 27693 + }, + { + "epoch": 5.907423208191126, + "grad_norm": 0.11527028293542181, + "learning_rate": 1.1583087817599527e-05, + "loss": 0.923, + "step": 27694 + }, + { + "epoch": 5.907636518771331, + "grad_norm": 0.10723817805885251, + "learning_rate": 1.1578677769203552e-05, + "loss": 0.8665, + "step": 27695 + }, + { + "epoch": 5.907849829351536, + "grad_norm": 0.09507062285202102, + "learning_rate": 1.157426849499598e-05, + "loss": 0.9094, + "step": 27696 + }, + { + "epoch": 5.90806313993174, + "grad_norm": 0.09786334833806858, + "learning_rate": 1.1569859995026698e-05, + "loss": 0.8916, + "step": 27697 + }, + { + "epoch": 5.908276450511945, + "grad_norm": 0.11376372778862949, + "learning_rate": 1.1565452269345632e-05, + "loss": 0.8812, + "step": 27698 + }, + { + "epoch": 5.9084897610921505, + "grad_norm": 0.09921944907466475, + "learning_rate": 1.1561045318002609e-05, + "loss": 0.8711, + "step": 27699 + }, + { + "epoch": 5.908703071672355, + "grad_norm": 0.10611220453730107, + "learning_rate": 1.1556639141047517e-05, + "loss": 0.8777, + "step": 27700 + }, + { + "epoch": 5.90891638225256, + "grad_norm": 0.10682618153981303, + "learning_rate": 1.1552233738530174e-05, + "loss": 0.9085, + "step": 27701 + }, + { + "epoch": 5.909129692832765, + "grad_norm": 0.10550962276294508, + "learning_rate": 1.1547829110500478e-05, + "loss": 0.9036, + "step": 27702 + }, + { + "epoch": 5.909343003412969, + "grad_norm": 0.10122162484040287, + "learning_rate": 1.1543425257008258e-05, + "loss": 0.8874, + "step": 27703 + }, + { + "epoch": 5.909556313993174, + "grad_norm": 0.10680847432241829, + "learning_rate": 1.153902217810332e-05, + "loss": 0.8781, + "step": 27704 + }, + { + "epoch": 5.909769624573379, + "grad_norm": 0.10826330896520561, + "learning_rate": 1.1534619873835484e-05, + "loss": 0.9031, + "step": 27705 + }, + { + "epoch": 5.909982935153583, + "grad_norm": 0.09615225787261189, + "learning_rate": 1.1530218344254594e-05, + "loss": 0.9147, + "step": 27706 + }, + { + "epoch": 5.910196245733788, + "grad_norm": 0.11489991427791799, + "learning_rate": 1.152581758941044e-05, + "loss": 0.9328, + "step": 27707 + }, + { + "epoch": 5.910409556313994, + "grad_norm": 0.10702803452101024, + "learning_rate": 1.1521417609352805e-05, + "loss": 0.8755, + "step": 27708 + }, + { + "epoch": 5.910622866894198, + "grad_norm": 0.1036736842212557, + "learning_rate": 1.1517018404131463e-05, + "loss": 0.8923, + "step": 27709 + }, + { + "epoch": 5.910836177474403, + "grad_norm": 0.11355698106651778, + "learning_rate": 1.151261997379625e-05, + "loss": 0.8631, + "step": 27710 + }, + { + "epoch": 5.911049488054608, + "grad_norm": 0.1125193480047908, + "learning_rate": 1.1508222318396869e-05, + "loss": 0.8958, + "step": 27711 + }, + { + "epoch": 5.911262798634812, + "grad_norm": 0.09901285645683107, + "learning_rate": 1.1503825437983119e-05, + "loss": 0.8712, + "step": 27712 + }, + { + "epoch": 5.911476109215017, + "grad_norm": 0.12128644627160637, + "learning_rate": 1.1499429332604702e-05, + "loss": 0.8921, + "step": 27713 + }, + { + "epoch": 5.911689419795222, + "grad_norm": 0.1038764297387713, + "learning_rate": 1.1495034002311418e-05, + "loss": 0.8893, + "step": 27714 + }, + { + "epoch": 5.911902730375426, + "grad_norm": 0.0893793672359418, + "learning_rate": 1.149063944715298e-05, + "loss": 0.8833, + "step": 27715 + }, + { + "epoch": 5.912116040955631, + "grad_norm": 0.11000796553464597, + "learning_rate": 1.1486245667179108e-05, + "loss": 0.87, + "step": 27716 + }, + { + "epoch": 5.912329351535837, + "grad_norm": 0.11140014890867776, + "learning_rate": 1.1481852662439494e-05, + "loss": 0.9058, + "step": 27717 + }, + { + "epoch": 5.912542662116041, + "grad_norm": 0.10702917819963904, + "learning_rate": 1.1477460432983895e-05, + "loss": 0.8748, + "step": 27718 + }, + { + "epoch": 5.912755972696246, + "grad_norm": 0.096391595934757, + "learning_rate": 1.1473068978861988e-05, + "loss": 0.862, + "step": 27719 + }, + { + "epoch": 5.912969283276451, + "grad_norm": 0.09536350867971978, + "learning_rate": 1.1468678300123445e-05, + "loss": 0.8801, + "step": 27720 + }, + { + "epoch": 5.913182593856655, + "grad_norm": 0.11284749008032406, + "learning_rate": 1.1464288396817973e-05, + "loss": 0.8535, + "step": 27721 + }, + { + "epoch": 5.91339590443686, + "grad_norm": 0.09510795528246478, + "learning_rate": 1.1459899268995218e-05, + "loss": 0.8549, + "step": 27722 + }, + { + "epoch": 5.913609215017065, + "grad_norm": 0.10072952750156704, + "learning_rate": 1.1455510916704866e-05, + "loss": 0.8864, + "step": 27723 + }, + { + "epoch": 5.913822525597269, + "grad_norm": 0.10624086001721991, + "learning_rate": 1.1451123339996558e-05, + "loss": 0.9132, + "step": 27724 + }, + { + "epoch": 5.914035836177474, + "grad_norm": 0.10337624101524198, + "learning_rate": 1.1446736538919922e-05, + "loss": 0.889, + "step": 27725 + }, + { + "epoch": 5.91424914675768, + "grad_norm": 0.10256387979354938, + "learning_rate": 1.1442350513524638e-05, + "loss": 0.8677, + "step": 27726 + }, + { + "epoch": 5.914462457337884, + "grad_norm": 0.11177119025005423, + "learning_rate": 1.143796526386031e-05, + "loss": 0.9163, + "step": 27727 + }, + { + "epoch": 5.914675767918089, + "grad_norm": 0.11249303273539356, + "learning_rate": 1.1433580789976549e-05, + "loss": 0.8844, + "step": 27728 + }, + { + "epoch": 5.914889078498294, + "grad_norm": 0.09757174918024324, + "learning_rate": 1.142919709192296e-05, + "loss": 0.8803, + "step": 27729 + }, + { + "epoch": 5.915102389078498, + "grad_norm": 0.10392187674199904, + "learning_rate": 1.1424814169749188e-05, + "loss": 0.8868, + "step": 27730 + }, + { + "epoch": 5.915315699658703, + "grad_norm": 0.11538217985432239, + "learning_rate": 1.1420432023504789e-05, + "loss": 0.8921, + "step": 27731 + }, + { + "epoch": 5.915529010238908, + "grad_norm": 0.1254256304692071, + "learning_rate": 1.1416050653239358e-05, + "loss": 0.9183, + "step": 27732 + }, + { + "epoch": 5.915742320819112, + "grad_norm": 0.11028971037380667, + "learning_rate": 1.141167005900246e-05, + "loss": 0.8897, + "step": 27733 + }, + { + "epoch": 5.915955631399317, + "grad_norm": 0.10029351963407213, + "learning_rate": 1.1407290240843679e-05, + "loss": 0.8565, + "step": 27734 + }, + { + "epoch": 5.916168941979522, + "grad_norm": 0.11693658049086217, + "learning_rate": 1.1402911198812557e-05, + "loss": 0.9296, + "step": 27735 + }, + { + "epoch": 5.9163822525597265, + "grad_norm": 0.0957465714346255, + "learning_rate": 1.1398532932958646e-05, + "loss": 0.8946, + "step": 27736 + }, + { + "epoch": 5.916595563139932, + "grad_norm": 0.10150041633031565, + "learning_rate": 1.139415544333148e-05, + "loss": 0.8826, + "step": 27737 + }, + { + "epoch": 5.916808873720137, + "grad_norm": 0.1029182403340422, + "learning_rate": 1.1389778729980611e-05, + "loss": 0.8971, + "step": 27738 + }, + { + "epoch": 5.917022184300341, + "grad_norm": 0.08798198668799473, + "learning_rate": 1.1385402792955547e-05, + "loss": 0.8792, + "step": 27739 + }, + { + "epoch": 5.917235494880546, + "grad_norm": 0.09410048759020173, + "learning_rate": 1.138102763230581e-05, + "loss": 0.887, + "step": 27740 + }, + { + "epoch": 5.917448805460751, + "grad_norm": 0.0997226882809456, + "learning_rate": 1.1376653248080882e-05, + "loss": 0.8841, + "step": 27741 + }, + { + "epoch": 5.9176621160409555, + "grad_norm": 0.0991814064166305, + "learning_rate": 1.1372279640330288e-05, + "loss": 0.8624, + "step": 27742 + }, + { + "epoch": 5.91787542662116, + "grad_norm": 0.09295275274699014, + "learning_rate": 1.1367906809103527e-05, + "loss": 0.8821, + "step": 27743 + }, + { + "epoch": 5.918088737201365, + "grad_norm": 0.1119723271695398, + "learning_rate": 1.136353475445004e-05, + "loss": 0.8874, + "step": 27744 + }, + { + "epoch": 5.9183020477815695, + "grad_norm": 0.10161028693473116, + "learning_rate": 1.1359163476419285e-05, + "loss": 0.8924, + "step": 27745 + }, + { + "epoch": 5.918515358361775, + "grad_norm": 0.09347408387447927, + "learning_rate": 1.1354792975060778e-05, + "loss": 0.8825, + "step": 27746 + }, + { + "epoch": 5.91872866894198, + "grad_norm": 0.10398054884620152, + "learning_rate": 1.1350423250423934e-05, + "loss": 0.8603, + "step": 27747 + }, + { + "epoch": 5.9189419795221845, + "grad_norm": 0.09057304534184124, + "learning_rate": 1.134605430255821e-05, + "loss": 0.8753, + "step": 27748 + }, + { + "epoch": 5.919155290102389, + "grad_norm": 0.09407844067992065, + "learning_rate": 1.134168613151302e-05, + "loss": 0.8679, + "step": 27749 + }, + { + "epoch": 5.919368600682594, + "grad_norm": 0.1006079058625713, + "learning_rate": 1.1337318737337823e-05, + "loss": 0.861, + "step": 27750 + }, + { + "epoch": 5.9195819112627985, + "grad_norm": 0.1015670783892113, + "learning_rate": 1.1332952120082016e-05, + "loss": 0.8688, + "step": 27751 + }, + { + "epoch": 5.919795221843003, + "grad_norm": 0.10007917992460955, + "learning_rate": 1.132858627979501e-05, + "loss": 0.8862, + "step": 27752 + }, + { + "epoch": 5.920008532423208, + "grad_norm": 0.10502811079917837, + "learning_rate": 1.1324221216526197e-05, + "loss": 0.897, + "step": 27753 + }, + { + "epoch": 5.920221843003413, + "grad_norm": 0.09666171955902526, + "learning_rate": 1.1319856930325002e-05, + "loss": 0.8943, + "step": 27754 + }, + { + "epoch": 5.920435153583618, + "grad_norm": 0.10075457685024904, + "learning_rate": 1.1315493421240773e-05, + "loss": 0.8762, + "step": 27755 + }, + { + "epoch": 5.920648464163823, + "grad_norm": 0.10154480378792835, + "learning_rate": 1.1311130689322885e-05, + "loss": 0.8962, + "step": 27756 + }, + { + "epoch": 5.9208617747440275, + "grad_norm": 0.09949879719201273, + "learning_rate": 1.1306768734620683e-05, + "loss": 0.8995, + "step": 27757 + }, + { + "epoch": 5.921075085324232, + "grad_norm": 0.1031344151306799, + "learning_rate": 1.130240755718358e-05, + "loss": 0.8966, + "step": 27758 + }, + { + "epoch": 5.921288395904437, + "grad_norm": 0.09455433712881324, + "learning_rate": 1.1298047157060882e-05, + "loss": 0.8545, + "step": 27759 + }, + { + "epoch": 5.921501706484642, + "grad_norm": 0.10050535481182618, + "learning_rate": 1.1293687534301943e-05, + "loss": 0.9096, + "step": 27760 + }, + { + "epoch": 5.921715017064846, + "grad_norm": 0.10647771986301843, + "learning_rate": 1.1289328688956061e-05, + "loss": 0.9087, + "step": 27761 + }, + { + "epoch": 5.921928327645051, + "grad_norm": 0.09106692288597343, + "learning_rate": 1.1284970621072601e-05, + "loss": 0.8944, + "step": 27762 + }, + { + "epoch": 5.922141638225256, + "grad_norm": 0.10230252136867829, + "learning_rate": 1.1280613330700852e-05, + "loss": 0.8759, + "step": 27763 + }, + { + "epoch": 5.922354948805461, + "grad_norm": 0.09341321726343335, + "learning_rate": 1.1276256817890122e-05, + "loss": 0.8662, + "step": 27764 + }, + { + "epoch": 5.922568259385666, + "grad_norm": 0.09615319198151105, + "learning_rate": 1.1271901082689686e-05, + "loss": 0.9029, + "step": 27765 + }, + { + "epoch": 5.922781569965871, + "grad_norm": 0.09645497922082, + "learning_rate": 1.1267546125148878e-05, + "loss": 0.8865, + "step": 27766 + }, + { + "epoch": 5.922994880546075, + "grad_norm": 0.10293762195528872, + "learning_rate": 1.1263191945316917e-05, + "loss": 0.898, + "step": 27767 + }, + { + "epoch": 5.92320819112628, + "grad_norm": 0.10573414863942873, + "learning_rate": 1.1258838543243109e-05, + "loss": 0.8718, + "step": 27768 + }, + { + "epoch": 5.923421501706485, + "grad_norm": 0.09650515921108349, + "learning_rate": 1.1254485918976674e-05, + "loss": 0.8952, + "step": 27769 + }, + { + "epoch": 5.923634812286689, + "grad_norm": 0.09037817449004328, + "learning_rate": 1.1250134072566907e-05, + "loss": 0.8791, + "step": 27770 + }, + { + "epoch": 5.923848122866894, + "grad_norm": 0.1192466686491923, + "learning_rate": 1.1245783004063022e-05, + "loss": 0.87, + "step": 27771 + }, + { + "epoch": 5.924061433447099, + "grad_norm": 0.09439141440232383, + "learning_rate": 1.124143271351427e-05, + "loss": 0.909, + "step": 27772 + }, + { + "epoch": 5.924274744027304, + "grad_norm": 0.09372580143632556, + "learning_rate": 1.1237083200969842e-05, + "loss": 0.8899, + "step": 27773 + }, + { + "epoch": 5.924488054607508, + "grad_norm": 0.09985517761217651, + "learning_rate": 1.1232734466478986e-05, + "loss": 0.8755, + "step": 27774 + }, + { + "epoch": 5.924701365187714, + "grad_norm": 0.10629673450687505, + "learning_rate": 1.12283865100909e-05, + "loss": 0.8708, + "step": 27775 + }, + { + "epoch": 5.924914675767918, + "grad_norm": 0.09328907733723453, + "learning_rate": 1.1224039331854777e-05, + "loss": 0.8938, + "step": 27776 + }, + { + "epoch": 5.925127986348123, + "grad_norm": 0.0885948398183563, + "learning_rate": 1.121969293181981e-05, + "loss": 0.8887, + "step": 27777 + }, + { + "epoch": 5.925341296928328, + "grad_norm": 0.09743716537390112, + "learning_rate": 1.121534731003517e-05, + "loss": 0.8645, + "step": 27778 + }, + { + "epoch": 5.925554607508532, + "grad_norm": 0.1085209330155667, + "learning_rate": 1.1211002466550034e-05, + "loss": 0.925, + "step": 27779 + }, + { + "epoch": 5.925767918088737, + "grad_norm": 0.09636795521288956, + "learning_rate": 1.1206658401413563e-05, + "loss": 0.9058, + "step": 27780 + }, + { + "epoch": 5.925981228668942, + "grad_norm": 0.09090613864608524, + "learning_rate": 1.1202315114674894e-05, + "loss": 0.8947, + "step": 27781 + }, + { + "epoch": 5.926194539249146, + "grad_norm": 0.09553481972277532, + "learning_rate": 1.1197972606383191e-05, + "loss": 0.9132, + "step": 27782 + }, + { + "epoch": 5.926407849829351, + "grad_norm": 0.10941284514439774, + "learning_rate": 1.1193630876587602e-05, + "loss": 0.892, + "step": 27783 + }, + { + "epoch": 5.926621160409557, + "grad_norm": 0.08947580774052549, + "learning_rate": 1.1189289925337223e-05, + "loss": 0.9, + "step": 27784 + }, + { + "epoch": 5.926834470989761, + "grad_norm": 0.10421069180905171, + "learning_rate": 1.118494975268118e-05, + "loss": 0.9156, + "step": 27785 + }, + { + "epoch": 5.927047781569966, + "grad_norm": 0.09870232422268295, + "learning_rate": 1.1180610358668599e-05, + "loss": 0.899, + "step": 27786 + }, + { + "epoch": 5.927261092150171, + "grad_norm": 0.09298804014414612, + "learning_rate": 1.1176271743348575e-05, + "loss": 0.8816, + "step": 27787 + }, + { + "epoch": 5.927474402730375, + "grad_norm": 0.09114606890292154, + "learning_rate": 1.1171933906770182e-05, + "loss": 0.8602, + "step": 27788 + }, + { + "epoch": 5.92768771331058, + "grad_norm": 0.09444798064541574, + "learning_rate": 1.116759684898253e-05, + "loss": 0.8676, + "step": 27789 + }, + { + "epoch": 5.927901023890785, + "grad_norm": 0.09632941747765575, + "learning_rate": 1.1163260570034668e-05, + "loss": 0.8899, + "step": 27790 + }, + { + "epoch": 5.9281143344709895, + "grad_norm": 0.12050857629864474, + "learning_rate": 1.1158925069975673e-05, + "loss": 0.9027, + "step": 27791 + }, + { + "epoch": 5.928327645051194, + "grad_norm": 0.09024496635982164, + "learning_rate": 1.1154590348854594e-05, + "loss": 0.8646, + "step": 27792 + }, + { + "epoch": 5.9285409556314, + "grad_norm": 0.10945268061009575, + "learning_rate": 1.1150256406720473e-05, + "loss": 0.8746, + "step": 27793 + }, + { + "epoch": 5.928754266211604, + "grad_norm": 0.11285349888780605, + "learning_rate": 1.1145923243622377e-05, + "loss": 0.879, + "step": 27794 + }, + { + "epoch": 5.928967576791809, + "grad_norm": 0.10279605449584459, + "learning_rate": 1.1141590859609307e-05, + "loss": 0.8613, + "step": 27795 + }, + { + "epoch": 5.929180887372014, + "grad_norm": 0.10784333133592292, + "learning_rate": 1.1137259254730304e-05, + "loss": 0.8832, + "step": 27796 + }, + { + "epoch": 5.9293941979522184, + "grad_norm": 0.11039101264197465, + "learning_rate": 1.1132928429034346e-05, + "loss": 0.8794, + "step": 27797 + }, + { + "epoch": 5.929607508532423, + "grad_norm": 0.10877511501006508, + "learning_rate": 1.1128598382570491e-05, + "loss": 0.9074, + "step": 27798 + }, + { + "epoch": 5.929820819112628, + "grad_norm": 0.11149600864637235, + "learning_rate": 1.1124269115387708e-05, + "loss": 0.902, + "step": 27799 + }, + { + "epoch": 5.9300341296928325, + "grad_norm": 0.10424451676003417, + "learning_rate": 1.1119940627534951e-05, + "loss": 0.8697, + "step": 27800 + }, + { + "epoch": 5.930247440273037, + "grad_norm": 0.10664265122064838, + "learning_rate": 1.1115612919061243e-05, + "loss": 0.8946, + "step": 27801 + }, + { + "epoch": 5.930460750853243, + "grad_norm": 0.1133000848626823, + "learning_rate": 1.1111285990015533e-05, + "loss": 0.9016, + "step": 27802 + }, + { + "epoch": 5.930674061433447, + "grad_norm": 0.10492585175426443, + "learning_rate": 1.1106959840446777e-05, + "loss": 0.9171, + "step": 27803 + }, + { + "epoch": 5.930887372013652, + "grad_norm": 0.09420065531807027, + "learning_rate": 1.110263447040393e-05, + "loss": 0.8454, + "step": 27804 + }, + { + "epoch": 5.931100682593857, + "grad_norm": 0.09400097621332028, + "learning_rate": 1.1098309879935916e-05, + "loss": 0.8735, + "step": 27805 + }, + { + "epoch": 5.9313139931740615, + "grad_norm": 0.09705599810381765, + "learning_rate": 1.1093986069091697e-05, + "loss": 0.9002, + "step": 27806 + }, + { + "epoch": 5.931527303754266, + "grad_norm": 0.10573599580685222, + "learning_rate": 1.1089663037920188e-05, + "loss": 0.8963, + "step": 27807 + }, + { + "epoch": 5.931740614334471, + "grad_norm": 0.09782344158938784, + "learning_rate": 1.1085340786470308e-05, + "loss": 0.852, + "step": 27808 + }, + { + "epoch": 5.9319539249146755, + "grad_norm": 0.09456526150585212, + "learning_rate": 1.1081019314790926e-05, + "loss": 0.8793, + "step": 27809 + }, + { + "epoch": 5.93216723549488, + "grad_norm": 0.09589201135968008, + "learning_rate": 1.1076698622931014e-05, + "loss": 0.8992, + "step": 27810 + }, + { + "epoch": 5.932380546075086, + "grad_norm": 0.10065864373971187, + "learning_rate": 1.1072378710939397e-05, + "loss": 0.8912, + "step": 27811 + }, + { + "epoch": 5.9325938566552905, + "grad_norm": 0.0951529391059483, + "learning_rate": 1.106805957886496e-05, + "loss": 0.8693, + "step": 27812 + }, + { + "epoch": 5.932807167235495, + "grad_norm": 0.2295962966505365, + "learning_rate": 1.1063741226756598e-05, + "loss": 0.8612, + "step": 27813 + }, + { + "epoch": 5.9330204778157, + "grad_norm": 0.10646718847385664, + "learning_rate": 1.1059423654663175e-05, + "loss": 0.8933, + "step": 27814 + }, + { + "epoch": 5.9332337883959045, + "grad_norm": 0.09741550229576491, + "learning_rate": 1.1055106862633517e-05, + "loss": 0.8712, + "step": 27815 + }, + { + "epoch": 5.933447098976109, + "grad_norm": 0.09306007840097, + "learning_rate": 1.1050790850716502e-05, + "loss": 0.9414, + "step": 27816 + }, + { + "epoch": 5.933660409556314, + "grad_norm": 0.10677495955674533, + "learning_rate": 1.1046475618960915e-05, + "loss": 0.8813, + "step": 27817 + }, + { + "epoch": 5.933873720136519, + "grad_norm": 0.0967318593770208, + "learning_rate": 1.1042161167415641e-05, + "loss": 0.8608, + "step": 27818 + }, + { + "epoch": 5.934087030716723, + "grad_norm": 0.09293429892731817, + "learning_rate": 1.1037847496129479e-05, + "loss": 0.8665, + "step": 27819 + }, + { + "epoch": 5.934300341296928, + "grad_norm": 0.0950671361410723, + "learning_rate": 1.1033534605151224e-05, + "loss": 0.8986, + "step": 27820 + }, + { + "epoch": 5.934513651877133, + "grad_norm": 0.09704660148099033, + "learning_rate": 1.102922249452969e-05, + "loss": 0.8979, + "step": 27821 + }, + { + "epoch": 5.934726962457338, + "grad_norm": 0.09599856914563662, + "learning_rate": 1.1024911164313657e-05, + "loss": 0.9028, + "step": 27822 + }, + { + "epoch": 5.934940273037543, + "grad_norm": 0.0892503653878591, + "learning_rate": 1.102060061455192e-05, + "loss": 0.8834, + "step": 27823 + }, + { + "epoch": 5.935153583617748, + "grad_norm": 0.10684380870627003, + "learning_rate": 1.1016290845293232e-05, + "loss": 0.876, + "step": 27824 + }, + { + "epoch": 5.935366894197952, + "grad_norm": 0.0997902231020508, + "learning_rate": 1.101198185658638e-05, + "loss": 0.8964, + "step": 27825 + }, + { + "epoch": 5.935580204778157, + "grad_norm": 0.11160129614856597, + "learning_rate": 1.1007673648480126e-05, + "loss": 0.8802, + "step": 27826 + }, + { + "epoch": 5.935793515358362, + "grad_norm": 0.09749224337415754, + "learning_rate": 1.1003366221023196e-05, + "loss": 0.8875, + "step": 27827 + }, + { + "epoch": 5.936006825938566, + "grad_norm": 0.10046122980535721, + "learning_rate": 1.0999059574264348e-05, + "loss": 0.8791, + "step": 27828 + }, + { + "epoch": 5.936220136518771, + "grad_norm": 0.1021616972865899, + "learning_rate": 1.0994753708252276e-05, + "loss": 0.9109, + "step": 27829 + }, + { + "epoch": 5.936433447098976, + "grad_norm": 0.10899364125779609, + "learning_rate": 1.0990448623035745e-05, + "loss": 0.8761, + "step": 27830 + }, + { + "epoch": 5.936646757679181, + "grad_norm": 0.0996878305291929, + "learning_rate": 1.098614431866345e-05, + "loss": 0.8895, + "step": 27831 + }, + { + "epoch": 5.936860068259386, + "grad_norm": 0.10899768672620765, + "learning_rate": 1.0981840795184087e-05, + "loss": 0.8937, + "step": 27832 + }, + { + "epoch": 5.937073378839591, + "grad_norm": 0.11085444020733068, + "learning_rate": 1.0977538052646364e-05, + "loss": 0.9051, + "step": 27833 + }, + { + "epoch": 5.937286689419795, + "grad_norm": 0.10186966491458323, + "learning_rate": 1.0973236091098961e-05, + "loss": 0.8761, + "step": 27834 + }, + { + "epoch": 5.9375, + "grad_norm": 0.10635426914887254, + "learning_rate": 1.0968934910590545e-05, + "loss": 0.8703, + "step": 27835 + }, + { + "epoch": 5.937713310580205, + "grad_norm": 0.1066844360722462, + "learning_rate": 1.0964634511169775e-05, + "loss": 0.8906, + "step": 27836 + }, + { + "epoch": 5.937926621160409, + "grad_norm": 0.0947315550756174, + "learning_rate": 1.0960334892885345e-05, + "loss": 0.8898, + "step": 27837 + }, + { + "epoch": 5.938139931740614, + "grad_norm": 0.09944213712541286, + "learning_rate": 1.0956036055785888e-05, + "loss": 0.9034, + "step": 27838 + }, + { + "epoch": 5.938353242320819, + "grad_norm": 0.09623165628840677, + "learning_rate": 1.0951737999920051e-05, + "loss": 0.8591, + "step": 27839 + }, + { + "epoch": 5.938566552901024, + "grad_norm": 0.1003454656638247, + "learning_rate": 1.0947440725336453e-05, + "loss": 0.8928, + "step": 27840 + }, + { + "epoch": 5.938779863481229, + "grad_norm": 0.09519772571945957, + "learning_rate": 1.0943144232083704e-05, + "loss": 0.8964, + "step": 27841 + }, + { + "epoch": 5.938993174061434, + "grad_norm": 0.09744538331178948, + "learning_rate": 1.0938848520210467e-05, + "loss": 0.8867, + "step": 27842 + }, + { + "epoch": 5.939206484641638, + "grad_norm": 0.09621327265724544, + "learning_rate": 1.0934553589765309e-05, + "loss": 0.883, + "step": 27843 + }, + { + "epoch": 5.939419795221843, + "grad_norm": 0.08934971167313632, + "learning_rate": 1.0930259440796846e-05, + "loss": 0.9122, + "step": 27844 + }, + { + "epoch": 5.939633105802048, + "grad_norm": 0.08898312342211541, + "learning_rate": 1.0925966073353655e-05, + "loss": 0.8915, + "step": 27845 + }, + { + "epoch": 5.939846416382252, + "grad_norm": 0.09454210729391758, + "learning_rate": 1.0921673487484328e-05, + "loss": 0.8728, + "step": 27846 + }, + { + "epoch": 5.940059726962457, + "grad_norm": 0.0989333856032722, + "learning_rate": 1.091738168323742e-05, + "loss": 0.8629, + "step": 27847 + }, + { + "epoch": 5.940273037542662, + "grad_norm": 0.087402139574361, + "learning_rate": 1.091309066066148e-05, + "loss": 0.873, + "step": 27848 + }, + { + "epoch": 5.940486348122867, + "grad_norm": 0.09904877691166511, + "learning_rate": 1.090880041980511e-05, + "loss": 0.8963, + "step": 27849 + }, + { + "epoch": 5.940699658703072, + "grad_norm": 0.09510399336125026, + "learning_rate": 1.0904510960716821e-05, + "loss": 0.919, + "step": 27850 + }, + { + "epoch": 5.940912969283277, + "grad_norm": 0.09666698863208725, + "learning_rate": 1.0900222283445144e-05, + "loss": 0.8935, + "step": 27851 + }, + { + "epoch": 5.941126279863481, + "grad_norm": 0.09723254502421333, + "learning_rate": 1.0895934388038626e-05, + "loss": 0.9161, + "step": 27852 + }, + { + "epoch": 5.941339590443686, + "grad_norm": 0.10184005235678338, + "learning_rate": 1.0891647274545743e-05, + "loss": 0.9081, + "step": 27853 + }, + { + "epoch": 5.941552901023891, + "grad_norm": 0.09138785370778943, + "learning_rate": 1.0887360943015062e-05, + "loss": 0.8891, + "step": 27854 + }, + { + "epoch": 5.9417662116040955, + "grad_norm": 0.09872606333458352, + "learning_rate": 1.0883075393495069e-05, + "loss": 0.8729, + "step": 27855 + }, + { + "epoch": 5.9419795221843, + "grad_norm": 0.09875623299911106, + "learning_rate": 1.0878790626034204e-05, + "loss": 0.8669, + "step": 27856 + }, + { + "epoch": 5.942192832764505, + "grad_norm": 0.09004165245339807, + "learning_rate": 1.0874506640681007e-05, + "loss": 0.8675, + "step": 27857 + }, + { + "epoch": 5.9424061433447095, + "grad_norm": 0.10000525655818288, + "learning_rate": 1.0870223437483917e-05, + "loss": 0.8837, + "step": 27858 + }, + { + "epoch": 5.942619453924914, + "grad_norm": 0.11438948804318125, + "learning_rate": 1.0865941016491424e-05, + "loss": 0.8719, + "step": 27859 + }, + { + "epoch": 5.94283276450512, + "grad_norm": 0.09071458218176495, + "learning_rate": 1.0861659377751957e-05, + "loss": 0.8981, + "step": 27860 + }, + { + "epoch": 5.9430460750853245, + "grad_norm": 0.10560514309383098, + "learning_rate": 1.0857378521313983e-05, + "loss": 0.8921, + "step": 27861 + }, + { + "epoch": 5.943259385665529, + "grad_norm": 0.10357406484340198, + "learning_rate": 1.0853098447225938e-05, + "loss": 0.9116, + "step": 27862 + }, + { + "epoch": 5.943472696245734, + "grad_norm": 0.08887633837388315, + "learning_rate": 1.0848819155536253e-05, + "loss": 0.9025, + "step": 27863 + }, + { + "epoch": 5.9436860068259385, + "grad_norm": 0.09738378372923125, + "learning_rate": 1.084454064629334e-05, + "loss": 0.8755, + "step": 27864 + }, + { + "epoch": 5.943899317406143, + "grad_norm": 0.09704658552121752, + "learning_rate": 1.084026291954559e-05, + "loss": 0.898, + "step": 27865 + }, + { + "epoch": 5.944112627986348, + "grad_norm": 0.09432537151859968, + "learning_rate": 1.083598597534147e-05, + "loss": 0.9041, + "step": 27866 + }, + { + "epoch": 5.944325938566553, + "grad_norm": 0.09028241953063644, + "learning_rate": 1.0831709813729305e-05, + "loss": 0.8844, + "step": 27867 + }, + { + "epoch": 5.944539249146757, + "grad_norm": 0.10685507570005592, + "learning_rate": 1.0827434434757494e-05, + "loss": 0.88, + "step": 27868 + }, + { + "epoch": 5.944752559726963, + "grad_norm": 0.09576269298778828, + "learning_rate": 1.0823159838474448e-05, + "loss": 0.8998, + "step": 27869 + }, + { + "epoch": 5.9449658703071675, + "grad_norm": 0.09975363530507465, + "learning_rate": 1.0818886024928506e-05, + "loss": 0.8966, + "step": 27870 + }, + { + "epoch": 5.945179180887372, + "grad_norm": 0.10457232415562165, + "learning_rate": 1.0814612994168043e-05, + "loss": 0.8584, + "step": 27871 + }, + { + "epoch": 5.945392491467577, + "grad_norm": 0.09772989236102798, + "learning_rate": 1.081034074624137e-05, + "loss": 0.8834, + "step": 27872 + }, + { + "epoch": 5.9456058020477816, + "grad_norm": 0.1021609706478721, + "learning_rate": 1.0806069281196873e-05, + "loss": 0.8857, + "step": 27873 + }, + { + "epoch": 5.945819112627986, + "grad_norm": 0.10593480890561342, + "learning_rate": 1.0801798599082871e-05, + "loss": 0.8828, + "step": 27874 + }, + { + "epoch": 5.946032423208191, + "grad_norm": 0.09089855246851393, + "learning_rate": 1.079752869994768e-05, + "loss": 0.8584, + "step": 27875 + }, + { + "epoch": 5.946245733788396, + "grad_norm": 0.09398247446100585, + "learning_rate": 1.0793259583839616e-05, + "loss": 0.8831, + "step": 27876 + }, + { + "epoch": 5.9464590443686, + "grad_norm": 0.10188589267125014, + "learning_rate": 1.0788991250806977e-05, + "loss": 0.8883, + "step": 27877 + }, + { + "epoch": 5.946672354948806, + "grad_norm": 0.09912256160679292, + "learning_rate": 1.0784723700898075e-05, + "loss": 0.9009, + "step": 27878 + }, + { + "epoch": 5.9468856655290105, + "grad_norm": 0.08971414122482853, + "learning_rate": 1.0780456934161178e-05, + "loss": 0.8718, + "step": 27879 + }, + { + "epoch": 5.947098976109215, + "grad_norm": 0.10405556820956005, + "learning_rate": 1.0776190950644572e-05, + "loss": 0.8762, + "step": 27880 + }, + { + "epoch": 5.94731228668942, + "grad_norm": 0.09119665895178126, + "learning_rate": 1.0771925750396535e-05, + "loss": 0.8648, + "step": 27881 + }, + { + "epoch": 5.947525597269625, + "grad_norm": 0.09026639471209853, + "learning_rate": 1.0767661333465332e-05, + "loss": 0.8728, + "step": 27882 + }, + { + "epoch": 5.947738907849829, + "grad_norm": 0.09856112589696832, + "learning_rate": 1.0763397699899199e-05, + "loss": 0.9333, + "step": 27883 + }, + { + "epoch": 5.947952218430034, + "grad_norm": 0.10220199514757808, + "learning_rate": 1.0759134849746373e-05, + "loss": 0.913, + "step": 27884 + }, + { + "epoch": 5.948165529010239, + "grad_norm": 0.10401211681705896, + "learning_rate": 1.075487278305511e-05, + "loss": 0.9, + "step": 27885 + }, + { + "epoch": 5.948378839590443, + "grad_norm": 0.09931558067455991, + "learning_rate": 1.0750611499873638e-05, + "loss": 0.8857, + "step": 27886 + }, + { + "epoch": 5.948592150170649, + "grad_norm": 0.12104302215514975, + "learning_rate": 1.0746351000250156e-05, + "loss": 0.8892, + "step": 27887 + }, + { + "epoch": 5.948805460750854, + "grad_norm": 0.10914513831352296, + "learning_rate": 1.0742091284232878e-05, + "loss": 0.8627, + "step": 27888 + }, + { + "epoch": 5.949018771331058, + "grad_norm": 0.11756374068166414, + "learning_rate": 1.0737832351869995e-05, + "loss": 0.909, + "step": 27889 + }, + { + "epoch": 5.949232081911263, + "grad_norm": 0.12191693358842473, + "learning_rate": 1.0733574203209715e-05, + "loss": 0.8861, + "step": 27890 + }, + { + "epoch": 5.949445392491468, + "grad_norm": 0.10430558984429485, + "learning_rate": 1.07293168383002e-05, + "loss": 0.8943, + "step": 27891 + }, + { + "epoch": 5.949658703071672, + "grad_norm": 0.10746679943603371, + "learning_rate": 1.0725060257189619e-05, + "loss": 0.8851, + "step": 27892 + }, + { + "epoch": 5.949872013651877, + "grad_norm": 0.11602611273063151, + "learning_rate": 1.072080445992616e-05, + "loss": 0.885, + "step": 27893 + }, + { + "epoch": 5.950085324232082, + "grad_norm": 0.09990507614692387, + "learning_rate": 1.071654944655796e-05, + "loss": 0.9251, + "step": 27894 + }, + { + "epoch": 5.950298634812286, + "grad_norm": 0.10967138435616175, + "learning_rate": 1.0712295217133178e-05, + "loss": 0.8562, + "step": 27895 + }, + { + "epoch": 5.950511945392492, + "grad_norm": 0.11442593608951994, + "learning_rate": 1.0708041771699916e-05, + "loss": 0.91, + "step": 27896 + }, + { + "epoch": 5.950725255972696, + "grad_norm": 0.1062229148601796, + "learning_rate": 1.070378911030634e-05, + "loss": 0.9047, + "step": 27897 + }, + { + "epoch": 5.950938566552901, + "grad_norm": 0.10539107116221438, + "learning_rate": 1.0699537233000564e-05, + "loss": 0.8519, + "step": 27898 + }, + { + "epoch": 5.951151877133106, + "grad_norm": 0.1143111952744612, + "learning_rate": 1.069528613983069e-05, + "loss": 0.9067, + "step": 27899 + }, + { + "epoch": 5.951365187713311, + "grad_norm": 0.09035570855854617, + "learning_rate": 1.0691035830844813e-05, + "loss": 0.8897, + "step": 27900 + }, + { + "epoch": 5.951578498293515, + "grad_norm": 0.10650722742891267, + "learning_rate": 1.068678630609104e-05, + "loss": 0.8963, + "step": 27901 + }, + { + "epoch": 5.95179180887372, + "grad_norm": 0.10581567581006601, + "learning_rate": 1.0682537565617444e-05, + "loss": 0.9033, + "step": 27902 + }, + { + "epoch": 5.952005119453925, + "grad_norm": 0.09154890475671774, + "learning_rate": 1.0678289609472095e-05, + "loss": 0.8819, + "step": 27903 + }, + { + "epoch": 5.952218430034129, + "grad_norm": 0.09244986096816876, + "learning_rate": 1.0674042437703052e-05, + "loss": 0.8889, + "step": 27904 + }, + { + "epoch": 5.952431740614334, + "grad_norm": 0.09356974078711386, + "learning_rate": 1.06697960503584e-05, + "loss": 0.8723, + "step": 27905 + }, + { + "epoch": 5.952645051194539, + "grad_norm": 0.09112801539022362, + "learning_rate": 1.0665550447486174e-05, + "loss": 0.8909, + "step": 27906 + }, + { + "epoch": 5.952858361774744, + "grad_norm": 0.09075584595776053, + "learning_rate": 1.0661305629134416e-05, + "loss": 0.8701, + "step": 27907 + }, + { + "epoch": 5.953071672354949, + "grad_norm": 0.09159626291957702, + "learning_rate": 1.0657061595351129e-05, + "loss": 0.8892, + "step": 27908 + }, + { + "epoch": 5.953284982935154, + "grad_norm": 0.10384189737624869, + "learning_rate": 1.0652818346184373e-05, + "loss": 0.9119, + "step": 27909 + }, + { + "epoch": 5.953498293515358, + "grad_norm": 0.09484016503976289, + "learning_rate": 1.0648575881682138e-05, + "loss": 0.9141, + "step": 27910 + }, + { + "epoch": 5.953711604095563, + "grad_norm": 0.10525148005036909, + "learning_rate": 1.0644334201892455e-05, + "loss": 0.8914, + "step": 27911 + }, + { + "epoch": 5.953924914675768, + "grad_norm": 0.09753986548031442, + "learning_rate": 1.0640093306863268e-05, + "loss": 0.8905, + "step": 27912 + }, + { + "epoch": 5.9541382252559725, + "grad_norm": 0.08853699823714296, + "learning_rate": 1.0635853196642603e-05, + "loss": 0.8674, + "step": 27913 + }, + { + "epoch": 5.954351535836177, + "grad_norm": 0.10769846268087009, + "learning_rate": 1.0631613871278427e-05, + "loss": 0.875, + "step": 27914 + }, + { + "epoch": 5.954564846416382, + "grad_norm": 0.09767760797159036, + "learning_rate": 1.0627375330818715e-05, + "loss": 0.8922, + "step": 27915 + }, + { + "epoch": 5.954778156996587, + "grad_norm": 0.09713860634387916, + "learning_rate": 1.062313757531139e-05, + "loss": 0.8902, + "step": 27916 + }, + { + "epoch": 5.954991467576792, + "grad_norm": 0.09846440161948353, + "learning_rate": 1.0618900604804448e-05, + "loss": 0.8652, + "step": 27917 + }, + { + "epoch": 5.955204778156997, + "grad_norm": 0.10672486909222861, + "learning_rate": 1.0614664419345817e-05, + "loss": 0.9133, + "step": 27918 + }, + { + "epoch": 5.9554180887372015, + "grad_norm": 0.08903243705107101, + "learning_rate": 1.0610429018983423e-05, + "loss": 0.887, + "step": 27919 + }, + { + "epoch": 5.955631399317406, + "grad_norm": 0.11215158015550286, + "learning_rate": 1.0606194403765175e-05, + "loss": 0.8754, + "step": 27920 + }, + { + "epoch": 5.955844709897611, + "grad_norm": 0.0963622297663047, + "learning_rate": 1.0601960573739026e-05, + "loss": 0.8825, + "step": 27921 + }, + { + "epoch": 5.9560580204778155, + "grad_norm": 0.0897861782010011, + "learning_rate": 1.0597727528952872e-05, + "loss": 0.8489, + "step": 27922 + }, + { + "epoch": 5.95627133105802, + "grad_norm": 0.10164563715786422, + "learning_rate": 1.0593495269454581e-05, + "loss": 0.8765, + "step": 27923 + }, + { + "epoch": 5.956484641638225, + "grad_norm": 0.10266904152748535, + "learning_rate": 1.058926379529205e-05, + "loss": 0.8982, + "step": 27924 + }, + { + "epoch": 5.9566979522184305, + "grad_norm": 0.09837684955544024, + "learning_rate": 1.0585033106513171e-05, + "loss": 0.8774, + "step": 27925 + }, + { + "epoch": 5.956911262798635, + "grad_norm": 0.11910133212255733, + "learning_rate": 1.0580803203165816e-05, + "loss": 0.881, + "step": 27926 + }, + { + "epoch": 5.95712457337884, + "grad_norm": 0.0873599592538458, + "learning_rate": 1.0576574085297841e-05, + "loss": 0.8852, + "step": 27927 + }, + { + "epoch": 5.9573378839590445, + "grad_norm": 0.1068668552088017, + "learning_rate": 1.0572345752957082e-05, + "loss": 0.8592, + "step": 27928 + }, + { + "epoch": 5.957551194539249, + "grad_norm": 0.10355722414725725, + "learning_rate": 1.0568118206191413e-05, + "loss": 0.8788, + "step": 27929 + }, + { + "epoch": 5.957764505119454, + "grad_norm": 0.09802238487618672, + "learning_rate": 1.0563891445048662e-05, + "loss": 0.9006, + "step": 27930 + }, + { + "epoch": 5.957977815699659, + "grad_norm": 0.103255802445575, + "learning_rate": 1.0559665469576639e-05, + "loss": 0.8677, + "step": 27931 + }, + { + "epoch": 5.958191126279863, + "grad_norm": 0.10144100706580218, + "learning_rate": 1.0555440279823155e-05, + "loss": 0.8959, + "step": 27932 + }, + { + "epoch": 5.958404436860068, + "grad_norm": 0.09270602591414015, + "learning_rate": 1.0551215875836065e-05, + "loss": 0.8743, + "step": 27933 + }, + { + "epoch": 5.9586177474402735, + "grad_norm": 0.09979591164879366, + "learning_rate": 1.0546992257663119e-05, + "loss": 0.8917, + "step": 27934 + }, + { + "epoch": 5.958831058020478, + "grad_norm": 0.0973578090876847, + "learning_rate": 1.0542769425352125e-05, + "loss": 0.893, + "step": 27935 + }, + { + "epoch": 5.959044368600683, + "grad_norm": 0.09110886840306168, + "learning_rate": 1.053854737895084e-05, + "loss": 0.8548, + "step": 27936 + }, + { + "epoch": 5.959257679180888, + "grad_norm": 0.09398589456761199, + "learning_rate": 1.0534326118507078e-05, + "loss": 0.9081, + "step": 27937 + }, + { + "epoch": 5.959470989761092, + "grad_norm": 0.11980453419962296, + "learning_rate": 1.0530105644068596e-05, + "loss": 0.9177, + "step": 27938 + }, + { + "epoch": 5.959684300341297, + "grad_norm": 0.09422455447137769, + "learning_rate": 1.052588595568313e-05, + "loss": 0.8908, + "step": 27939 + }, + { + "epoch": 5.959897610921502, + "grad_norm": 0.0972417761462753, + "learning_rate": 1.0521667053398415e-05, + "loss": 0.8508, + "step": 27940 + }, + { + "epoch": 5.960110921501706, + "grad_norm": 0.11853711723747848, + "learning_rate": 1.0517448937262222e-05, + "loss": 0.8847, + "step": 27941 + }, + { + "epoch": 5.960324232081911, + "grad_norm": 0.09582854545622142, + "learning_rate": 1.0513231607322258e-05, + "loss": 0.8839, + "step": 27942 + }, + { + "epoch": 5.960537542662116, + "grad_norm": 0.09131968376098362, + "learning_rate": 1.0509015063626254e-05, + "loss": 0.8679, + "step": 27943 + }, + { + "epoch": 5.96075085324232, + "grad_norm": 0.09809241992564158, + "learning_rate": 1.0504799306221903e-05, + "loss": 0.8945, + "step": 27944 + }, + { + "epoch": 5.960964163822526, + "grad_norm": 0.09569373001771714, + "learning_rate": 1.0500584335156936e-05, + "loss": 0.8919, + "step": 27945 + }, + { + "epoch": 5.961177474402731, + "grad_norm": 0.09680403203887007, + "learning_rate": 1.0496370150479023e-05, + "loss": 0.9066, + "step": 27946 + }, + { + "epoch": 5.961390784982935, + "grad_norm": 0.10188126797450926, + "learning_rate": 1.0492156752235841e-05, + "loss": 0.8721, + "step": 27947 + }, + { + "epoch": 5.96160409556314, + "grad_norm": 0.09413125813631637, + "learning_rate": 1.0487944140475056e-05, + "loss": 0.8795, + "step": 27948 + }, + { + "epoch": 5.961817406143345, + "grad_norm": 0.10290270384266993, + "learning_rate": 1.0483732315244377e-05, + "loss": 0.846, + "step": 27949 + }, + { + "epoch": 5.962030716723549, + "grad_norm": 0.09564739284409238, + "learning_rate": 1.0479521276591434e-05, + "loss": 0.8786, + "step": 27950 + }, + { + "epoch": 5.962244027303754, + "grad_norm": 0.0947235132176661, + "learning_rate": 1.0475311024563877e-05, + "loss": 0.906, + "step": 27951 + }, + { + "epoch": 5.962457337883959, + "grad_norm": 0.08863093836927398, + "learning_rate": 1.047110155920933e-05, + "loss": 0.8823, + "step": 27952 + }, + { + "epoch": 5.962670648464163, + "grad_norm": 0.09232775468997108, + "learning_rate": 1.0466892880575457e-05, + "loss": 0.8766, + "step": 27953 + }, + { + "epoch": 5.962883959044369, + "grad_norm": 0.10477011561323243, + "learning_rate": 1.046268498870985e-05, + "loss": 0.9075, + "step": 27954 + }, + { + "epoch": 5.963097269624574, + "grad_norm": 0.09931314021705594, + "learning_rate": 1.0458477883660145e-05, + "loss": 0.8958, + "step": 27955 + }, + { + "epoch": 5.963310580204778, + "grad_norm": 0.0908433416808798, + "learning_rate": 1.0454271565473922e-05, + "loss": 0.9054, + "step": 27956 + }, + { + "epoch": 5.963523890784983, + "grad_norm": 0.09526292717937954, + "learning_rate": 1.0450066034198793e-05, + "loss": 0.9353, + "step": 27957 + }, + { + "epoch": 5.963737201365188, + "grad_norm": 0.09875933373036824, + "learning_rate": 1.0445861289882333e-05, + "loss": 0.8723, + "step": 27958 + }, + { + "epoch": 5.963950511945392, + "grad_norm": 0.10010512970716963, + "learning_rate": 1.044165733257212e-05, + "loss": 0.8821, + "step": 27959 + }, + { + "epoch": 5.964163822525597, + "grad_norm": 0.09525291616321094, + "learning_rate": 1.0437454162315715e-05, + "loss": 0.8712, + "step": 27960 + }, + { + "epoch": 5.964377133105802, + "grad_norm": 0.09608627209636515, + "learning_rate": 1.0433251779160698e-05, + "loss": 0.884, + "step": 27961 + }, + { + "epoch": 5.964590443686006, + "grad_norm": 0.10412607655292982, + "learning_rate": 1.0429050183154605e-05, + "loss": 0.8781, + "step": 27962 + }, + { + "epoch": 5.964803754266212, + "grad_norm": 0.09366896299549232, + "learning_rate": 1.042484937434498e-05, + "loss": 0.8708, + "step": 27963 + }, + { + "epoch": 5.965017064846417, + "grad_norm": 0.09666166889959542, + "learning_rate": 1.0420649352779342e-05, + "loss": 0.8895, + "step": 27964 + }, + { + "epoch": 5.965230375426621, + "grad_norm": 0.10284074070518953, + "learning_rate": 1.0416450118505241e-05, + "loss": 0.926, + "step": 27965 + }, + { + "epoch": 5.965443686006826, + "grad_norm": 0.09841210257809448, + "learning_rate": 1.041225167157018e-05, + "loss": 0.9087, + "step": 27966 + }, + { + "epoch": 5.965656996587031, + "grad_norm": 0.1085663375983951, + "learning_rate": 1.0408054012021665e-05, + "loss": 0.8839, + "step": 27967 + }, + { + "epoch": 5.965870307167235, + "grad_norm": 0.0918733655841888, + "learning_rate": 1.040385713990718e-05, + "loss": 0.9016, + "step": 27968 + }, + { + "epoch": 5.96608361774744, + "grad_norm": 0.08814548150377616, + "learning_rate": 1.039966105527423e-05, + "loss": 0.8918, + "step": 27969 + }, + { + "epoch": 5.966296928327645, + "grad_norm": 0.09642597629118091, + "learning_rate": 1.0395465758170285e-05, + "loss": 0.9125, + "step": 27970 + }, + { + "epoch": 5.9665102389078495, + "grad_norm": 0.09723622514045037, + "learning_rate": 1.0391271248642806e-05, + "loss": 0.9017, + "step": 27971 + }, + { + "epoch": 5.966723549488055, + "grad_norm": 0.09533866131631091, + "learning_rate": 1.0387077526739256e-05, + "loss": 0.8806, + "step": 27972 + }, + { + "epoch": 5.96693686006826, + "grad_norm": 0.11038515266118935, + "learning_rate": 1.038288459250711e-05, + "loss": 0.9065, + "step": 27973 + }, + { + "epoch": 5.967150170648464, + "grad_norm": 0.09384426348756392, + "learning_rate": 1.0378692445993797e-05, + "loss": 0.8692, + "step": 27974 + }, + { + "epoch": 5.967363481228669, + "grad_norm": 0.10227878979584266, + "learning_rate": 1.0374501087246745e-05, + "loss": 0.8675, + "step": 27975 + }, + { + "epoch": 5.967576791808874, + "grad_norm": 0.10967028928857601, + "learning_rate": 1.0370310516313373e-05, + "loss": 0.909, + "step": 27976 + }, + { + "epoch": 5.9677901023890785, + "grad_norm": 0.11244865578384422, + "learning_rate": 1.036612073324112e-05, + "loss": 0.8712, + "step": 27977 + }, + { + "epoch": 5.968003412969283, + "grad_norm": 0.09611835661717187, + "learning_rate": 1.0361931738077407e-05, + "loss": 0.8689, + "step": 27978 + }, + { + "epoch": 5.968216723549488, + "grad_norm": 0.09840779231192334, + "learning_rate": 1.035774353086957e-05, + "loss": 0.8653, + "step": 27979 + }, + { + "epoch": 5.9684300341296925, + "grad_norm": 0.12133366765583886, + "learning_rate": 1.0353556111665051e-05, + "loss": 0.8904, + "step": 27980 + }, + { + "epoch": 5.968643344709898, + "grad_norm": 0.10101425384613756, + "learning_rate": 1.0349369480511213e-05, + "loss": 0.909, + "step": 27981 + }, + { + "epoch": 5.968856655290102, + "grad_norm": 0.10388733628287337, + "learning_rate": 1.0345183637455443e-05, + "loss": 0.8636, + "step": 27982 + }, + { + "epoch": 5.9690699658703075, + "grad_norm": 0.1066040429945914, + "learning_rate": 1.034099858254508e-05, + "loss": 0.8793, + "step": 27983 + }, + { + "epoch": 5.969283276450512, + "grad_norm": 0.0998591257742846, + "learning_rate": 1.0336814315827479e-05, + "loss": 0.902, + "step": 27984 + }, + { + "epoch": 5.969496587030717, + "grad_norm": 0.0899757456404912, + "learning_rate": 1.0332630837350011e-05, + "loss": 0.8857, + "step": 27985 + }, + { + "epoch": 5.9697098976109215, + "grad_norm": 0.10023024940132537, + "learning_rate": 1.0328448147159995e-05, + "loss": 0.9022, + "step": 27986 + }, + { + "epoch": 5.969923208191126, + "grad_norm": 0.09949439676160035, + "learning_rate": 1.0324266245304764e-05, + "loss": 0.918, + "step": 27987 + }, + { + "epoch": 5.970136518771331, + "grad_norm": 0.08609721228849272, + "learning_rate": 1.0320085131831612e-05, + "loss": 0.8911, + "step": 27988 + }, + { + "epoch": 5.970349829351536, + "grad_norm": 0.09197676232454909, + "learning_rate": 1.0315904806787906e-05, + "loss": 0.8684, + "step": 27989 + }, + { + "epoch": 5.97056313993174, + "grad_norm": 0.08661964637291732, + "learning_rate": 1.031172527022089e-05, + "loss": 0.8819, + "step": 27990 + }, + { + "epoch": 5.970776450511945, + "grad_norm": 0.09796321019916333, + "learning_rate": 1.0307546522177857e-05, + "loss": 0.8926, + "step": 27991 + }, + { + "epoch": 5.9709897610921505, + "grad_norm": 0.10255844680583659, + "learning_rate": 1.030336856270613e-05, + "loss": 0.8783, + "step": 27992 + }, + { + "epoch": 5.971203071672355, + "grad_norm": 0.09691572074549075, + "learning_rate": 1.0299191391852949e-05, + "loss": 0.8552, + "step": 27993 + }, + { + "epoch": 5.97141638225256, + "grad_norm": 0.10191465139723026, + "learning_rate": 1.0295015009665596e-05, + "loss": 0.8815, + "step": 27994 + }, + { + "epoch": 5.971629692832765, + "grad_norm": 0.0959687172115516, + "learning_rate": 1.029083941619132e-05, + "loss": 0.8675, + "step": 27995 + }, + { + "epoch": 5.971843003412969, + "grad_norm": 0.09711289397279009, + "learning_rate": 1.0286664611477354e-05, + "loss": 0.8765, + "step": 27996 + }, + { + "epoch": 5.972056313993174, + "grad_norm": 0.09447914070792582, + "learning_rate": 1.0282490595570969e-05, + "loss": 0.8731, + "step": 27997 + }, + { + "epoch": 5.972269624573379, + "grad_norm": 0.10427037828478945, + "learning_rate": 1.0278317368519369e-05, + "loss": 0.8835, + "step": 27998 + }, + { + "epoch": 5.972482935153583, + "grad_norm": 0.09265717960424785, + "learning_rate": 1.0274144930369787e-05, + "loss": 0.8912, + "step": 27999 + }, + { + "epoch": 5.972696245733788, + "grad_norm": 0.10249951879495618, + "learning_rate": 1.0269973281169423e-05, + "loss": 0.9299, + "step": 28000 + }, + { + "epoch": 5.972909556313994, + "grad_norm": 0.10682900900753084, + "learning_rate": 1.0265802420965483e-05, + "loss": 0.8815, + "step": 28001 + }, + { + "epoch": 5.973122866894198, + "grad_norm": 0.09613112930339376, + "learning_rate": 1.0261632349805172e-05, + "loss": 0.9129, + "step": 28002 + }, + { + "epoch": 5.973336177474403, + "grad_norm": 0.10021924630079115, + "learning_rate": 1.0257463067735638e-05, + "loss": 0.8633, + "step": 28003 + }, + { + "epoch": 5.973549488054608, + "grad_norm": 0.08922140122096842, + "learning_rate": 1.0253294574804103e-05, + "loss": 0.8892, + "step": 28004 + }, + { + "epoch": 5.973762798634812, + "grad_norm": 0.10066810603173655, + "learning_rate": 1.0249126871057711e-05, + "loss": 0.8833, + "step": 28005 + }, + { + "epoch": 5.973976109215017, + "grad_norm": 0.0942135921159144, + "learning_rate": 1.0244959956543617e-05, + "loss": 0.9066, + "step": 28006 + }, + { + "epoch": 5.974189419795222, + "grad_norm": 0.09053275238467058, + "learning_rate": 1.0240793831308981e-05, + "loss": 0.8569, + "step": 28007 + }, + { + "epoch": 5.974402730375426, + "grad_norm": 0.09521523436078957, + "learning_rate": 1.0236628495400915e-05, + "loss": 0.892, + "step": 28008 + }, + { + "epoch": 5.974616040955631, + "grad_norm": 0.10608384901937505, + "learning_rate": 1.0232463948866599e-05, + "loss": 0.8855, + "step": 28009 + }, + { + "epoch": 5.974829351535837, + "grad_norm": 0.09751426636827402, + "learning_rate": 1.0228300191753111e-05, + "loss": 0.9099, + "step": 28010 + }, + { + "epoch": 5.975042662116041, + "grad_norm": 0.08684929061988916, + "learning_rate": 1.0224137224107591e-05, + "loss": 0.8981, + "step": 28011 + }, + { + "epoch": 5.975255972696246, + "grad_norm": 0.0940614146491318, + "learning_rate": 1.0219975045977128e-05, + "loss": 0.88, + "step": 28012 + }, + { + "epoch": 5.975469283276451, + "grad_norm": 0.08809617346461054, + "learning_rate": 1.0215813657408823e-05, + "loss": 0.8858, + "step": 28013 + }, + { + "epoch": 5.975682593856655, + "grad_norm": 0.0914900069126791, + "learning_rate": 1.021165305844976e-05, + "loss": 0.8505, + "step": 28014 + }, + { + "epoch": 5.97589590443686, + "grad_norm": 0.09763263152230532, + "learning_rate": 1.0207493249147009e-05, + "loss": 0.9052, + "step": 28015 + }, + { + "epoch": 5.976109215017065, + "grad_norm": 0.09488768252678173, + "learning_rate": 1.0203334229547637e-05, + "loss": 0.9034, + "step": 28016 + }, + { + "epoch": 5.976322525597269, + "grad_norm": 0.08646130561282088, + "learning_rate": 1.0199175999698726e-05, + "loss": 0.8986, + "step": 28017 + }, + { + "epoch": 5.976535836177474, + "grad_norm": 0.0967158069423329, + "learning_rate": 1.019501855964732e-05, + "loss": 0.8631, + "step": 28018 + }, + { + "epoch": 5.97674914675768, + "grad_norm": 0.0984274068129745, + "learning_rate": 1.0190861909440443e-05, + "loss": 0.8999, + "step": 28019 + }, + { + "epoch": 5.976962457337884, + "grad_norm": 0.09819146466955656, + "learning_rate": 1.0186706049125124e-05, + "loss": 0.8945, + "step": 28020 + }, + { + "epoch": 5.977175767918089, + "grad_norm": 0.09513248566772152, + "learning_rate": 1.0182550978748415e-05, + "loss": 0.8886, + "step": 28021 + }, + { + "epoch": 5.977389078498294, + "grad_norm": 0.09439518827561974, + "learning_rate": 1.0178396698357323e-05, + "loss": 0.8727, + "step": 28022 + }, + { + "epoch": 5.977602389078498, + "grad_norm": 0.09577608583356366, + "learning_rate": 1.0174243207998846e-05, + "loss": 0.9039, + "step": 28023 + }, + { + "epoch": 5.977815699658703, + "grad_norm": 0.09743076276090692, + "learning_rate": 1.0170090507719972e-05, + "loss": 0.8826, + "step": 28024 + }, + { + "epoch": 5.978029010238908, + "grad_norm": 0.09316061675890634, + "learning_rate": 1.016593859756771e-05, + "loss": 0.8963, + "step": 28025 + }, + { + "epoch": 5.978242320819112, + "grad_norm": 0.11011715081573119, + "learning_rate": 1.016178747758902e-05, + "loss": 0.9016, + "step": 28026 + }, + { + "epoch": 5.978455631399317, + "grad_norm": 0.09241297454781207, + "learning_rate": 1.0157637147830885e-05, + "loss": 0.9251, + "step": 28027 + }, + { + "epoch": 5.978668941979522, + "grad_norm": 0.0928099559071538, + "learning_rate": 1.0153487608340238e-05, + "loss": 0.8929, + "step": 28028 + }, + { + "epoch": 5.9788822525597265, + "grad_norm": 0.08980041368049399, + "learning_rate": 1.014933885916407e-05, + "loss": 0.8435, + "step": 28029 + }, + { + "epoch": 5.979095563139932, + "grad_norm": 0.1040739707658869, + "learning_rate": 1.0145190900349316e-05, + "loss": 0.9258, + "step": 28030 + }, + { + "epoch": 5.979308873720137, + "grad_norm": 0.1017373752735148, + "learning_rate": 1.0141043731942894e-05, + "loss": 0.8666, + "step": 28031 + }, + { + "epoch": 5.979522184300341, + "grad_norm": 0.09449166089056411, + "learning_rate": 1.0136897353991721e-05, + "loss": 0.8908, + "step": 28032 + }, + { + "epoch": 5.979735494880546, + "grad_norm": 0.10425262486641529, + "learning_rate": 1.0132751766542754e-05, + "loss": 0.9137, + "step": 28033 + }, + { + "epoch": 5.979948805460751, + "grad_norm": 0.09638572224980216, + "learning_rate": 1.012860696964288e-05, + "loss": 0.8915, + "step": 28034 + }, + { + "epoch": 5.9801621160409555, + "grad_norm": 0.09507157821663151, + "learning_rate": 1.0124462963338967e-05, + "loss": 0.8791, + "step": 28035 + }, + { + "epoch": 5.98037542662116, + "grad_norm": 0.09589966418529644, + "learning_rate": 1.0120319747677948e-05, + "loss": 0.894, + "step": 28036 + }, + { + "epoch": 5.980588737201365, + "grad_norm": 0.1017424972283446, + "learning_rate": 1.0116177322706693e-05, + "loss": 0.8729, + "step": 28037 + }, + { + "epoch": 5.9808020477815695, + "grad_norm": 0.09653319675034228, + "learning_rate": 1.0112035688472059e-05, + "loss": 0.8924, + "step": 28038 + }, + { + "epoch": 5.981015358361775, + "grad_norm": 0.10561595201851134, + "learning_rate": 1.0107894845020918e-05, + "loss": 0.8828, + "step": 28039 + }, + { + "epoch": 5.98122866894198, + "grad_norm": 0.10200724263756868, + "learning_rate": 1.0103754792400102e-05, + "loss": 0.8881, + "step": 28040 + }, + { + "epoch": 5.9814419795221845, + "grad_norm": 0.09911627983423439, + "learning_rate": 1.0099615530656499e-05, + "loss": 0.8521, + "step": 28041 + }, + { + "epoch": 5.981655290102389, + "grad_norm": 0.10546963877512287, + "learning_rate": 1.0095477059836915e-05, + "loss": 0.8648, + "step": 28042 + }, + { + "epoch": 5.981868600682594, + "grad_norm": 0.10015539273025122, + "learning_rate": 1.0091339379988185e-05, + "loss": 0.9121, + "step": 28043 + }, + { + "epoch": 5.9820819112627985, + "grad_norm": 0.08974608897822296, + "learning_rate": 1.0087202491157116e-05, + "loss": 0.857, + "step": 28044 + }, + { + "epoch": 5.982295221843003, + "grad_norm": 0.10333182330892908, + "learning_rate": 1.0083066393390556e-05, + "loss": 0.8668, + "step": 28045 + }, + { + "epoch": 5.982508532423208, + "grad_norm": 0.09499749733642529, + "learning_rate": 1.0078931086735265e-05, + "loss": 0.8688, + "step": 28046 + }, + { + "epoch": 5.982721843003413, + "grad_norm": 0.09140871641549264, + "learning_rate": 1.0074796571238016e-05, + "loss": 0.8966, + "step": 28047 + }, + { + "epoch": 5.982935153583618, + "grad_norm": 0.09368183929416832, + "learning_rate": 1.007066284694565e-05, + "loss": 0.8958, + "step": 28048 + }, + { + "epoch": 5.983148464163823, + "grad_norm": 0.09481511786679045, + "learning_rate": 1.0066529913904896e-05, + "loss": 0.9171, + "step": 28049 + }, + { + "epoch": 5.9833617747440275, + "grad_norm": 0.08809944151211745, + "learning_rate": 1.006239777216254e-05, + "loss": 0.8937, + "step": 28050 + }, + { + "epoch": 5.983575085324232, + "grad_norm": 0.10606551717168211, + "learning_rate": 1.0058266421765333e-05, + "loss": 0.8971, + "step": 28051 + }, + { + "epoch": 5.983788395904437, + "grad_norm": 0.09978590507189611, + "learning_rate": 1.0054135862759993e-05, + "loss": 0.8942, + "step": 28052 + }, + { + "epoch": 5.984001706484642, + "grad_norm": 0.08792596078429522, + "learning_rate": 1.005000609519331e-05, + "loss": 0.8903, + "step": 28053 + }, + { + "epoch": 5.984215017064846, + "grad_norm": 0.09755021526762452, + "learning_rate": 1.0045877119111976e-05, + "loss": 0.8886, + "step": 28054 + }, + { + "epoch": 5.984428327645051, + "grad_norm": 0.09787740866628927, + "learning_rate": 1.0041748934562725e-05, + "loss": 0.9083, + "step": 28055 + }, + { + "epoch": 5.984641638225256, + "grad_norm": 0.09743842659214787, + "learning_rate": 1.0037621541592251e-05, + "loss": 0.8999, + "step": 28056 + }, + { + "epoch": 5.984854948805461, + "grad_norm": 0.08570743851500916, + "learning_rate": 1.003349494024728e-05, + "loss": 0.917, + "step": 28057 + }, + { + "epoch": 5.985068259385666, + "grad_norm": 0.10464427150404054, + "learning_rate": 1.0029369130574475e-05, + "loss": 0.9147, + "step": 28058 + }, + { + "epoch": 5.985281569965871, + "grad_norm": 0.10478563534229984, + "learning_rate": 1.002524411262053e-05, + "loss": 0.8806, + "step": 28059 + }, + { + "epoch": 5.985494880546075, + "grad_norm": 0.09216771175488855, + "learning_rate": 1.0021119886432138e-05, + "loss": 0.8985, + "step": 28060 + }, + { + "epoch": 5.98570819112628, + "grad_norm": 0.10630802046619402, + "learning_rate": 1.0016996452055951e-05, + "loss": 0.9111, + "step": 28061 + }, + { + "epoch": 5.985921501706485, + "grad_norm": 0.10778858961359704, + "learning_rate": 1.0012873809538629e-05, + "loss": 0.8919, + "step": 28062 + }, + { + "epoch": 5.986134812286689, + "grad_norm": 0.09407328364310137, + "learning_rate": 1.0008751958926815e-05, + "loss": 0.8947, + "step": 28063 + }, + { + "epoch": 5.986348122866894, + "grad_norm": 0.11070059335726361, + "learning_rate": 1.0004630900267131e-05, + "loss": 0.8876, + "step": 28064 + }, + { + "epoch": 5.986561433447099, + "grad_norm": 0.09576487107484144, + "learning_rate": 1.0000510633606244e-05, + "loss": 0.8904, + "step": 28065 + }, + { + "epoch": 5.986774744027304, + "grad_norm": 0.10208258279427253, + "learning_rate": 9.996391158990764e-06, + "loss": 0.8817, + "step": 28066 + }, + { + "epoch": 5.986988054607508, + "grad_norm": 0.10234372560326147, + "learning_rate": 9.992272476467284e-06, + "loss": 0.8942, + "step": 28067 + }, + { + "epoch": 5.987201365187714, + "grad_norm": 0.0935938074348315, + "learning_rate": 9.988154586082426e-06, + "loss": 0.8613, + "step": 28068 + }, + { + "epoch": 5.987414675767918, + "grad_norm": 0.09714436809887887, + "learning_rate": 9.984037487882774e-06, + "loss": 0.8959, + "step": 28069 + }, + { + "epoch": 5.987627986348123, + "grad_norm": 0.10027750970243472, + "learning_rate": 9.979921181914913e-06, + "loss": 0.9073, + "step": 28070 + }, + { + "epoch": 5.987841296928328, + "grad_norm": 0.08840530638419122, + "learning_rate": 9.97580566822541e-06, + "loss": 0.8426, + "step": 28071 + }, + { + "epoch": 5.988054607508532, + "grad_norm": 0.0921205335307959, + "learning_rate": 9.971690946860861e-06, + "loss": 0.8853, + "step": 28072 + }, + { + "epoch": 5.988267918088737, + "grad_norm": 0.09835198807534812, + "learning_rate": 9.967577017867795e-06, + "loss": 0.8891, + "step": 28073 + }, + { + "epoch": 5.988481228668942, + "grad_norm": 0.09406075902931572, + "learning_rate": 9.96346388129279e-06, + "loss": 0.9054, + "step": 28074 + }, + { + "epoch": 5.988694539249146, + "grad_norm": 0.0928368649260942, + "learning_rate": 9.959351537182358e-06, + "loss": 0.9048, + "step": 28075 + }, + { + "epoch": 5.988907849829351, + "grad_norm": 0.0887061468928466, + "learning_rate": 9.955239985583023e-06, + "loss": 0.9116, + "step": 28076 + }, + { + "epoch": 5.989121160409557, + "grad_norm": 0.09868963308865779, + "learning_rate": 9.951129226541352e-06, + "loss": 0.902, + "step": 28077 + }, + { + "epoch": 5.989334470989761, + "grad_norm": 0.09403596513288479, + "learning_rate": 9.947019260103823e-06, + "loss": 0.9039, + "step": 28078 + }, + { + "epoch": 5.989547781569966, + "grad_norm": 0.09470980074759047, + "learning_rate": 9.942910086316959e-06, + "loss": 0.9467, + "step": 28079 + }, + { + "epoch": 5.989761092150171, + "grad_norm": 0.09427954538343462, + "learning_rate": 9.93880170522724e-06, + "loss": 0.9103, + "step": 28080 + }, + { + "epoch": 5.989974402730375, + "grad_norm": 0.10082318228449898, + "learning_rate": 9.934694116881158e-06, + "loss": 0.8882, + "step": 28081 + }, + { + "epoch": 5.99018771331058, + "grad_norm": 0.09453899394412901, + "learning_rate": 9.930587321325186e-06, + "loss": 0.8674, + "step": 28082 + }, + { + "epoch": 5.990401023890785, + "grad_norm": 0.09894295744773193, + "learning_rate": 9.926481318605785e-06, + "loss": 0.9173, + "step": 28083 + }, + { + "epoch": 5.9906143344709895, + "grad_norm": 0.09162532370235689, + "learning_rate": 9.92237610876945e-06, + "loss": 0.9043, + "step": 28084 + }, + { + "epoch": 5.990827645051194, + "grad_norm": 0.09145748268004991, + "learning_rate": 9.918271691862595e-06, + "loss": 0.9029, + "step": 28085 + }, + { + "epoch": 5.9910409556314, + "grad_norm": 0.10851104001019124, + "learning_rate": 9.914168067931685e-06, + "loss": 0.897, + "step": 28086 + }, + { + "epoch": 5.991254266211604, + "grad_norm": 0.09807434266055047, + "learning_rate": 9.91006523702314e-06, + "loss": 0.8732, + "step": 28087 + }, + { + "epoch": 5.991467576791809, + "grad_norm": 0.10284557787805222, + "learning_rate": 9.905963199183364e-06, + "loss": 0.8752, + "step": 28088 + }, + { + "epoch": 5.991680887372014, + "grad_norm": 0.09823695636041627, + "learning_rate": 9.901861954458813e-06, + "loss": 0.8925, + "step": 28089 + }, + { + "epoch": 5.9918941979522184, + "grad_norm": 0.0935907286328235, + "learning_rate": 9.897761502895889e-06, + "loss": 0.893, + "step": 28090 + }, + { + "epoch": 5.992107508532423, + "grad_norm": 0.09064787072701656, + "learning_rate": 9.893661844540942e-06, + "loss": 0.887, + "step": 28091 + }, + { + "epoch": 5.992320819112628, + "grad_norm": 0.10256101520842259, + "learning_rate": 9.88956297944041e-06, + "loss": 0.8832, + "step": 28092 + }, + { + "epoch": 5.9925341296928325, + "grad_norm": 0.08654916414583552, + "learning_rate": 9.885464907640645e-06, + "loss": 0.8864, + "step": 28093 + }, + { + "epoch": 5.992747440273037, + "grad_norm": 0.10559908584794858, + "learning_rate": 9.881367629188028e-06, + "loss": 0.8588, + "step": 28094 + }, + { + "epoch": 5.992960750853243, + "grad_norm": 0.09006166839373214, + "learning_rate": 9.877271144128895e-06, + "loss": 0.8965, + "step": 28095 + }, + { + "epoch": 5.993174061433447, + "grad_norm": 0.09812353276361478, + "learning_rate": 9.873175452509645e-06, + "loss": 0.8958, + "step": 28096 + }, + { + "epoch": 5.993387372013652, + "grad_norm": 0.09314870908478835, + "learning_rate": 9.869080554376586e-06, + "loss": 0.8556, + "step": 28097 + }, + { + "epoch": 5.993600682593857, + "grad_norm": 0.10257188567563402, + "learning_rate": 9.864986449776057e-06, + "loss": 0.9012, + "step": 28098 + }, + { + "epoch": 5.9938139931740615, + "grad_norm": 0.09959985065527137, + "learning_rate": 9.860893138754393e-06, + "loss": 0.9008, + "step": 28099 + }, + { + "epoch": 5.994027303754266, + "grad_norm": 0.1031262793770626, + "learning_rate": 9.856800621357885e-06, + "loss": 0.8875, + "step": 28100 + }, + { + "epoch": 5.994240614334471, + "grad_norm": 0.11878531141551722, + "learning_rate": 9.852708897632888e-06, + "loss": 0.8705, + "step": 28101 + }, + { + "epoch": 5.9944539249146755, + "grad_norm": 0.09709839834598047, + "learning_rate": 9.848617967625657e-06, + "loss": 0.8788, + "step": 28102 + }, + { + "epoch": 5.99466723549488, + "grad_norm": 0.09636044022468525, + "learning_rate": 9.84452783138247e-06, + "loss": 0.8911, + "step": 28103 + }, + { + "epoch": 5.994880546075086, + "grad_norm": 0.09464312964673446, + "learning_rate": 9.840438488949657e-06, + "loss": 0.8912, + "step": 28104 + }, + { + "epoch": 5.9950938566552905, + "grad_norm": 0.11033675865785897, + "learning_rate": 9.836349940373452e-06, + "loss": 0.8748, + "step": 28105 + }, + { + "epoch": 5.995307167235495, + "grad_norm": 0.09642003320982902, + "learning_rate": 9.832262185700139e-06, + "loss": 0.9016, + "step": 28106 + }, + { + "epoch": 5.9955204778157, + "grad_norm": 0.09994865105814138, + "learning_rate": 9.828175224975935e-06, + "loss": 0.905, + "step": 28107 + }, + { + "epoch": 5.9957337883959045, + "grad_norm": 0.10364877047053715, + "learning_rate": 9.824089058247127e-06, + "loss": 0.8918, + "step": 28108 + }, + { + "epoch": 5.995947098976109, + "grad_norm": 0.09165782593820437, + "learning_rate": 9.820003685559932e-06, + "loss": 0.8728, + "step": 28109 + }, + { + "epoch": 5.996160409556314, + "grad_norm": 0.0886135517357887, + "learning_rate": 9.81591910696058e-06, + "loss": 0.8856, + "step": 28110 + }, + { + "epoch": 5.996373720136519, + "grad_norm": 0.09741934617174619, + "learning_rate": 9.811835322495278e-06, + "loss": 0.8952, + "step": 28111 + }, + { + "epoch": 5.996587030716723, + "grad_norm": 0.10359115654775265, + "learning_rate": 9.807752332210244e-06, + "loss": 0.8851, + "step": 28112 + }, + { + "epoch": 5.996800341296928, + "grad_norm": 0.090661980857536, + "learning_rate": 9.803670136151671e-06, + "loss": 0.8739, + "step": 28113 + }, + { + "epoch": 5.997013651877133, + "grad_norm": 0.10408853522783942, + "learning_rate": 9.799588734365746e-06, + "loss": 0.8943, + "step": 28114 + }, + { + "epoch": 5.997226962457338, + "grad_norm": 0.10029866543190764, + "learning_rate": 9.795508126898646e-06, + "loss": 0.9192, + "step": 28115 + }, + { + "epoch": 5.997440273037543, + "grad_norm": 0.10292367679271745, + "learning_rate": 9.791428313796563e-06, + "loss": 0.8914, + "step": 28116 + }, + { + "epoch": 5.997653583617748, + "grad_norm": 0.1015644381735873, + "learning_rate": 9.787349295105656e-06, + "loss": 0.8961, + "step": 28117 + }, + { + "epoch": 5.997866894197952, + "grad_norm": 0.09702930426242967, + "learning_rate": 9.783271070872069e-06, + "loss": 0.8954, + "step": 28118 + }, + { + "epoch": 5.998080204778157, + "grad_norm": 0.104690451612149, + "learning_rate": 9.77919364114194e-06, + "loss": 0.8912, + "step": 28119 + }, + { + "epoch": 5.998293515358362, + "grad_norm": 0.0953181916193492, + "learning_rate": 9.775117005961427e-06, + "loss": 0.8876, + "step": 28120 + }, + { + "epoch": 5.998506825938566, + "grad_norm": 0.0909528390806837, + "learning_rate": 9.771041165376646e-06, + "loss": 0.8815, + "step": 28121 + }, + { + "epoch": 5.998720136518771, + "grad_norm": 0.10384182321996606, + "learning_rate": 9.766966119433712e-06, + "loss": 0.8878, + "step": 28122 + }, + { + "epoch": 5.998933447098976, + "grad_norm": 0.10499618693930275, + "learning_rate": 9.762891868178745e-06, + "loss": 0.9324, + "step": 28123 + }, + { + "epoch": 5.999146757679181, + "grad_norm": 0.10428166249228714, + "learning_rate": 9.758818411657836e-06, + "loss": 0.8791, + "step": 28124 + }, + { + "epoch": 5.999360068259386, + "grad_norm": 0.08881670377357072, + "learning_rate": 9.754745749917079e-06, + "loss": 0.9014, + "step": 28125 + }, + { + "epoch": 5.999573378839591, + "grad_norm": 0.09755515729207295, + "learning_rate": 9.750673883002553e-06, + "loss": 0.8936, + "step": 28126 + }, + { + "epoch": 5.999786689419795, + "grad_norm": 0.1005843255613879, + "learning_rate": 9.74660281096032e-06, + "loss": 0.8657, + "step": 28127 + }, + { + "epoch": 6.0, + "grad_norm": 0.09901542378276186, + "learning_rate": 9.742532533836467e-06, + "loss": 0.8656, + "step": 28128 + }, + { + "epoch": 6.000213310580205, + "grad_norm": 0.0974971172838636, + "learning_rate": 9.738463051677045e-06, + "loss": 0.8742, + "step": 28129 + }, + { + "epoch": 6.000426621160409, + "grad_norm": 0.09153123788253344, + "learning_rate": 9.7343943645281e-06, + "loss": 0.8785, + "step": 28130 + }, + { + "epoch": 6.000639931740614, + "grad_norm": 0.09982488988163718, + "learning_rate": 9.730326472435645e-06, + "loss": 0.9388, + "step": 28131 + }, + { + "epoch": 6.000853242320819, + "grad_norm": 0.10345336906529992, + "learning_rate": 9.726259375445743e-06, + "loss": 0.8963, + "step": 28132 + }, + { + "epoch": 6.001066552901024, + "grad_norm": 0.09444640213057026, + "learning_rate": 9.722193073604394e-06, + "loss": 0.8872, + "step": 28133 + }, + { + "epoch": 6.001279863481229, + "grad_norm": 0.09743576828604338, + "learning_rate": 9.71812756695762e-06, + "loss": 0.8817, + "step": 28134 + }, + { + "epoch": 6.001493174061434, + "grad_norm": 0.0994544610390936, + "learning_rate": 9.714062855551414e-06, + "loss": 0.8793, + "step": 28135 + }, + { + "epoch": 6.001706484641638, + "grad_norm": 0.10120891213012785, + "learning_rate": 9.709998939431765e-06, + "loss": 0.864, + "step": 28136 + }, + { + "epoch": 6.001919795221843, + "grad_norm": 0.09029341701057114, + "learning_rate": 9.70593581864466e-06, + "loss": 0.8862, + "step": 28137 + }, + { + "epoch": 6.002133105802048, + "grad_norm": 0.09602384009165921, + "learning_rate": 9.701873493236076e-06, + "loss": 0.9034, + "step": 28138 + }, + { + "epoch": 6.002346416382252, + "grad_norm": 0.10113264307478251, + "learning_rate": 9.697811963251964e-06, + "loss": 0.8646, + "step": 28139 + }, + { + "epoch": 6.002559726962457, + "grad_norm": 0.09283631172766317, + "learning_rate": 9.693751228738298e-06, + "loss": 0.8732, + "step": 28140 + }, + { + "epoch": 6.002773037542662, + "grad_norm": 0.10136782554364902, + "learning_rate": 9.68969128974103e-06, + "loss": 0.8857, + "step": 28141 + }, + { + "epoch": 6.0029863481228665, + "grad_norm": 0.09755281189535286, + "learning_rate": 9.685632146306082e-06, + "loss": 0.8974, + "step": 28142 + }, + { + "epoch": 6.003199658703072, + "grad_norm": 0.0866566514867746, + "learning_rate": 9.681573798479369e-06, + "loss": 0.8773, + "step": 28143 + }, + { + "epoch": 6.003412969283277, + "grad_norm": 0.1068733572486085, + "learning_rate": 9.677516246306858e-06, + "loss": 0.8901, + "step": 28144 + }, + { + "epoch": 6.003626279863481, + "grad_norm": 0.10387508733017065, + "learning_rate": 9.67345948983443e-06, + "loss": 0.9171, + "step": 28145 + }, + { + "epoch": 6.003839590443686, + "grad_norm": 0.08328277715616567, + "learning_rate": 9.669403529108003e-06, + "loss": 0.9073, + "step": 28146 + }, + { + "epoch": 6.004052901023891, + "grad_norm": 0.0972867880610281, + "learning_rate": 9.665348364173428e-06, + "loss": 0.8661, + "step": 28147 + }, + { + "epoch": 6.0042662116040955, + "grad_norm": 0.08939085720309764, + "learning_rate": 9.66129399507663e-06, + "loss": 0.8688, + "step": 28148 + }, + { + "epoch": 6.0044795221843, + "grad_norm": 0.10291269772275749, + "learning_rate": 9.657240421863476e-06, + "loss": 0.8967, + "step": 28149 + }, + { + "epoch": 6.004692832764505, + "grad_norm": 0.09386972252694856, + "learning_rate": 9.65318764457984e-06, + "loss": 0.8841, + "step": 28150 + }, + { + "epoch": 6.0049061433447095, + "grad_norm": 0.0862366889685804, + "learning_rate": 9.649135663271543e-06, + "loss": 0.8699, + "step": 28151 + }, + { + "epoch": 6.005119453924915, + "grad_norm": 0.09307449319212856, + "learning_rate": 9.645084477984476e-06, + "loss": 0.897, + "step": 28152 + }, + { + "epoch": 6.00533276450512, + "grad_norm": 0.09375384521487315, + "learning_rate": 9.641034088764463e-06, + "loss": 0.8428, + "step": 28153 + }, + { + "epoch": 6.0055460750853245, + "grad_norm": 0.09031331169859505, + "learning_rate": 9.636984495657332e-06, + "loss": 0.9078, + "step": 28154 + }, + { + "epoch": 6.005759385665529, + "grad_norm": 0.09107467589153934, + "learning_rate": 9.632935698708886e-06, + "loss": 0.8762, + "step": 28155 + }, + { + "epoch": 6.005972696245734, + "grad_norm": 0.09378739345561282, + "learning_rate": 9.628887697964982e-06, + "loss": 0.8637, + "step": 28156 + }, + { + "epoch": 6.0061860068259385, + "grad_norm": 0.09931868549103325, + "learning_rate": 9.624840493471406e-06, + "loss": 0.8948, + "step": 28157 + }, + { + "epoch": 6.006399317406143, + "grad_norm": 0.09656960953159398, + "learning_rate": 9.620794085273925e-06, + "loss": 0.8828, + "step": 28158 + }, + { + "epoch": 6.006612627986348, + "grad_norm": 0.09822472020688253, + "learning_rate": 9.616748473418336e-06, + "loss": 0.8784, + "step": 28159 + }, + { + "epoch": 6.006825938566553, + "grad_norm": 0.09948312107922297, + "learning_rate": 9.612703657950431e-06, + "loss": 0.8528, + "step": 28160 + }, + { + "epoch": 6.007039249146757, + "grad_norm": 0.08796564288281093, + "learning_rate": 9.608659638915969e-06, + "loss": 0.8831, + "step": 28161 + }, + { + "epoch": 6.007252559726963, + "grad_norm": 0.09567865405945195, + "learning_rate": 9.60461641636071e-06, + "loss": 0.8741, + "step": 28162 + }, + { + "epoch": 6.0074658703071675, + "grad_norm": 0.09853761592081618, + "learning_rate": 9.600573990330377e-06, + "loss": 0.8694, + "step": 28163 + }, + { + "epoch": 6.007679180887372, + "grad_norm": 0.09283122598469502, + "learning_rate": 9.596532360870757e-06, + "loss": 0.8793, + "step": 28164 + }, + { + "epoch": 6.007892491467577, + "grad_norm": 0.09238709220770026, + "learning_rate": 9.592491528027553e-06, + "loss": 0.8618, + "step": 28165 + }, + { + "epoch": 6.0081058020477816, + "grad_norm": 0.10191120835740287, + "learning_rate": 9.588451491846499e-06, + "loss": 0.8607, + "step": 28166 + }, + { + "epoch": 6.008319112627986, + "grad_norm": 0.09991015210700076, + "learning_rate": 9.584412252373272e-06, + "loss": 0.88, + "step": 28167 + }, + { + "epoch": 6.008532423208191, + "grad_norm": 0.09883076609834074, + "learning_rate": 9.580373809653651e-06, + "loss": 0.9042, + "step": 28168 + }, + { + "epoch": 6.008745733788396, + "grad_norm": 0.10294609773663714, + "learning_rate": 9.576336163733261e-06, + "loss": 0.8969, + "step": 28169 + }, + { + "epoch": 6.0089590443686, + "grad_norm": 0.11726127052014194, + "learning_rate": 9.572299314657809e-06, + "loss": 0.8754, + "step": 28170 + }, + { + "epoch": 6.009172354948806, + "grad_norm": 0.0859315898620537, + "learning_rate": 9.568263262472964e-06, + "loss": 0.8646, + "step": 28171 + }, + { + "epoch": 6.0093856655290105, + "grad_norm": 0.09269992502790408, + "learning_rate": 9.564228007224421e-06, + "loss": 0.9067, + "step": 28172 + }, + { + "epoch": 6.009598976109215, + "grad_norm": 0.1121039095018153, + "learning_rate": 9.560193548957818e-06, + "loss": 0.9121, + "step": 28173 + }, + { + "epoch": 6.00981228668942, + "grad_norm": 0.0919700530836512, + "learning_rate": 9.556159887718812e-06, + "loss": 0.8771, + "step": 28174 + }, + { + "epoch": 6.010025597269625, + "grad_norm": 0.09489500829252794, + "learning_rate": 9.552127023553034e-06, + "loss": 0.8987, + "step": 28175 + }, + { + "epoch": 6.010238907849829, + "grad_norm": 0.09257471055833057, + "learning_rate": 9.548094956506131e-06, + "loss": 0.8887, + "step": 28176 + }, + { + "epoch": 6.010452218430034, + "grad_norm": 0.09944086149548645, + "learning_rate": 9.544063686623714e-06, + "loss": 0.9139, + "step": 28177 + }, + { + "epoch": 6.010665529010239, + "grad_norm": 0.0933672162264182, + "learning_rate": 9.540033213951409e-06, + "loss": 0.8667, + "step": 28178 + }, + { + "epoch": 6.010878839590443, + "grad_norm": 0.09921292982919175, + "learning_rate": 9.536003538534815e-06, + "loss": 0.8654, + "step": 28179 + }, + { + "epoch": 6.011092150170649, + "grad_norm": 0.09775026873407164, + "learning_rate": 9.53197466041952e-06, + "loss": 0.881, + "step": 28180 + }, + { + "epoch": 6.011305460750854, + "grad_norm": 0.09398116655752216, + "learning_rate": 9.527946579651126e-06, + "loss": 0.8834, + "step": 28181 + }, + { + "epoch": 6.011518771331058, + "grad_norm": 0.09489044249539828, + "learning_rate": 9.523919296275195e-06, + "loss": 0.867, + "step": 28182 + }, + { + "epoch": 6.011732081911263, + "grad_norm": 0.0896588420134646, + "learning_rate": 9.519892810337281e-06, + "loss": 0.8963, + "step": 28183 + }, + { + "epoch": 6.011945392491468, + "grad_norm": 0.10117541720382578, + "learning_rate": 9.515867121882994e-06, + "loss": 0.8896, + "step": 28184 + }, + { + "epoch": 6.012158703071672, + "grad_norm": 0.0888434208608876, + "learning_rate": 9.51184223095785e-06, + "loss": 0.8998, + "step": 28185 + }, + { + "epoch": 6.012372013651877, + "grad_norm": 0.09605099554164731, + "learning_rate": 9.507818137607399e-06, + "loss": 0.8909, + "step": 28186 + }, + { + "epoch": 6.012585324232082, + "grad_norm": 0.09944012282253137, + "learning_rate": 9.503794841877148e-06, + "loss": 0.8927, + "step": 28187 + }, + { + "epoch": 6.012798634812286, + "grad_norm": 0.09577356464634497, + "learning_rate": 9.49977234381266e-06, + "loss": 0.9292, + "step": 28188 + }, + { + "epoch": 6.013011945392491, + "grad_norm": 0.08754316122716381, + "learning_rate": 9.49575064345944e-06, + "loss": 0.8719, + "step": 28189 + }, + { + "epoch": 6.013225255972697, + "grad_norm": 0.1052029450073877, + "learning_rate": 9.491729740862987e-06, + "loss": 0.8455, + "step": 28190 + }, + { + "epoch": 6.013438566552901, + "grad_norm": 0.08958639463145772, + "learning_rate": 9.487709636068802e-06, + "loss": 0.8815, + "step": 28191 + }, + { + "epoch": 6.013651877133106, + "grad_norm": 0.08914276663190318, + "learning_rate": 9.483690329122361e-06, + "loss": 0.8593, + "step": 28192 + }, + { + "epoch": 6.013865187713311, + "grad_norm": 0.09328320321630586, + "learning_rate": 9.479671820069157e-06, + "loss": 0.8751, + "step": 28193 + }, + { + "epoch": 6.014078498293515, + "grad_norm": 0.09647627801190468, + "learning_rate": 9.475654108954643e-06, + "loss": 0.8791, + "step": 28194 + }, + { + "epoch": 6.01429180887372, + "grad_norm": 0.09429315729964623, + "learning_rate": 9.471637195824282e-06, + "loss": 0.8798, + "step": 28195 + }, + { + "epoch": 6.014505119453925, + "grad_norm": 0.09069487082767234, + "learning_rate": 9.467621080723544e-06, + "loss": 0.8674, + "step": 28196 + }, + { + "epoch": 6.014718430034129, + "grad_norm": 0.0888510778577492, + "learning_rate": 9.46360576369786e-06, + "loss": 0.8928, + "step": 28197 + }, + { + "epoch": 6.014931740614334, + "grad_norm": 0.09714075774057904, + "learning_rate": 9.459591244792663e-06, + "loss": 0.8766, + "step": 28198 + }, + { + "epoch": 6.01514505119454, + "grad_norm": 0.09341898429124532, + "learning_rate": 9.455577524053364e-06, + "loss": 0.8874, + "step": 28199 + }, + { + "epoch": 6.015358361774744, + "grad_norm": 0.09188444850309921, + "learning_rate": 9.451564601525409e-06, + "loss": 0.8973, + "step": 28200 + }, + { + "epoch": 6.015571672354949, + "grad_norm": 0.10025442736758455, + "learning_rate": 9.447552477254186e-06, + "loss": 0.8863, + "step": 28201 + }, + { + "epoch": 6.015784982935154, + "grad_norm": 0.09041576185703892, + "learning_rate": 9.443541151285091e-06, + "loss": 0.8703, + "step": 28202 + }, + { + "epoch": 6.015998293515358, + "grad_norm": 0.0984222721877431, + "learning_rate": 9.439530623663517e-06, + "loss": 0.8994, + "step": 28203 + }, + { + "epoch": 6.016211604095563, + "grad_norm": 0.08905955688146495, + "learning_rate": 9.435520894434842e-06, + "loss": 0.8791, + "step": 28204 + }, + { + "epoch": 6.016424914675768, + "grad_norm": 0.08593675545948856, + "learning_rate": 9.431511963644433e-06, + "loss": 0.8667, + "step": 28205 + }, + { + "epoch": 6.0166382252559725, + "grad_norm": 0.09761458770038887, + "learning_rate": 9.42750383133765e-06, + "loss": 0.8908, + "step": 28206 + }, + { + "epoch": 6.016851535836177, + "grad_norm": 0.09748734906779775, + "learning_rate": 9.423496497559835e-06, + "loss": 0.8872, + "step": 28207 + }, + { + "epoch": 6.017064846416382, + "grad_norm": 0.09608006688001772, + "learning_rate": 9.419489962356363e-06, + "loss": 0.8906, + "step": 28208 + }, + { + "epoch": 6.017278156996587, + "grad_norm": 0.08696618156250921, + "learning_rate": 9.415484225772542e-06, + "loss": 0.8572, + "step": 28209 + }, + { + "epoch": 6.017491467576792, + "grad_norm": 0.09846902568256813, + "learning_rate": 9.411479287853714e-06, + "loss": 0.8812, + "step": 28210 + }, + { + "epoch": 6.017704778156997, + "grad_norm": 0.09752693970125498, + "learning_rate": 9.407475148645156e-06, + "loss": 0.8684, + "step": 28211 + }, + { + "epoch": 6.0179180887372015, + "grad_norm": 0.09654795312598122, + "learning_rate": 9.403471808192229e-06, + "loss": 0.8753, + "step": 28212 + }, + { + "epoch": 6.018131399317406, + "grad_norm": 0.09622091122560494, + "learning_rate": 9.39946926654022e-06, + "loss": 0.8863, + "step": 28213 + }, + { + "epoch": 6.018344709897611, + "grad_norm": 0.09288119322841969, + "learning_rate": 9.395467523734374e-06, + "loss": 0.8972, + "step": 28214 + }, + { + "epoch": 6.0185580204778155, + "grad_norm": 0.09429299145486238, + "learning_rate": 9.391466579820014e-06, + "loss": 0.8757, + "step": 28215 + }, + { + "epoch": 6.01877133105802, + "grad_norm": 0.10961273680691364, + "learning_rate": 9.387466434842403e-06, + "loss": 0.8924, + "step": 28216 + }, + { + "epoch": 6.018984641638225, + "grad_norm": 0.10093239526172419, + "learning_rate": 9.38346708884679e-06, + "loss": 0.8996, + "step": 28217 + }, + { + "epoch": 6.0191979522184305, + "grad_norm": 0.10150525102270389, + "learning_rate": 9.379468541878443e-06, + "loss": 0.9006, + "step": 28218 + }, + { + "epoch": 6.019411262798635, + "grad_norm": 0.10847228404116765, + "learning_rate": 9.375470793982572e-06, + "loss": 0.8634, + "step": 28219 + }, + { + "epoch": 6.01962457337884, + "grad_norm": 0.11002233051653361, + "learning_rate": 9.371473845204453e-06, + "loss": 0.9276, + "step": 28220 + }, + { + "epoch": 6.0198378839590445, + "grad_norm": 0.08366384978463665, + "learning_rate": 9.3674776955893e-06, + "loss": 0.8575, + "step": 28221 + }, + { + "epoch": 6.020051194539249, + "grad_norm": 0.10012941521952161, + "learning_rate": 9.363482345182321e-06, + "loss": 0.8904, + "step": 28222 + }, + { + "epoch": 6.020264505119454, + "grad_norm": 0.10496246271720412, + "learning_rate": 9.359487794028719e-06, + "loss": 0.8713, + "step": 28223 + }, + { + "epoch": 6.020477815699659, + "grad_norm": 0.09299808487358815, + "learning_rate": 9.355494042173725e-06, + "loss": 0.8864, + "step": 28224 + }, + { + "epoch": 6.020691126279863, + "grad_norm": 0.09015858395758297, + "learning_rate": 9.351501089662496e-06, + "loss": 0.8582, + "step": 28225 + }, + { + "epoch": 6.020904436860068, + "grad_norm": 0.10806497177640682, + "learning_rate": 9.347508936540195e-06, + "loss": 0.9158, + "step": 28226 + }, + { + "epoch": 6.021117747440273, + "grad_norm": 0.0897637593195232, + "learning_rate": 9.343517582852039e-06, + "loss": 0.8749, + "step": 28227 + }, + { + "epoch": 6.021331058020478, + "grad_norm": 0.08905405809028817, + "learning_rate": 9.339527028643176e-06, + "loss": 0.8847, + "step": 28228 + }, + { + "epoch": 6.021544368600683, + "grad_norm": 0.0888815067309343, + "learning_rate": 9.33553727395875e-06, + "loss": 0.8807, + "step": 28229 + }, + { + "epoch": 6.021757679180888, + "grad_norm": 0.08631569353779368, + "learning_rate": 9.3315483188439e-06, + "loss": 0.8452, + "step": 28230 + }, + { + "epoch": 6.021970989761092, + "grad_norm": 0.0992590836311488, + "learning_rate": 9.32756016334376e-06, + "loss": 0.8665, + "step": 28231 + }, + { + "epoch": 6.022184300341297, + "grad_norm": 0.08979613276890012, + "learning_rate": 9.32357280750348e-06, + "loss": 0.908, + "step": 28232 + }, + { + "epoch": 6.022397610921502, + "grad_norm": 0.08726511185141372, + "learning_rate": 9.319586251368169e-06, + "loss": 0.9205, + "step": 28233 + }, + { + "epoch": 6.022610921501706, + "grad_norm": 0.10460006345318833, + "learning_rate": 9.31560049498292e-06, + "loss": 0.9036, + "step": 28234 + }, + { + "epoch": 6.022824232081911, + "grad_norm": 0.09435470931062312, + "learning_rate": 9.311615538392842e-06, + "loss": 0.8909, + "step": 28235 + }, + { + "epoch": 6.023037542662116, + "grad_norm": 0.10181448556790057, + "learning_rate": 9.307631381643029e-06, + "loss": 0.8851, + "step": 28236 + }, + { + "epoch": 6.023250853242321, + "grad_norm": 0.0899744925754162, + "learning_rate": 9.30364802477855e-06, + "loss": 0.8887, + "step": 28237 + }, + { + "epoch": 6.023464163822526, + "grad_norm": 0.08125212096233207, + "learning_rate": 9.299665467844465e-06, + "loss": 0.8511, + "step": 28238 + }, + { + "epoch": 6.023677474402731, + "grad_norm": 0.6314785867898285, + "learning_rate": 9.295683710885872e-06, + "loss": 0.9431, + "step": 28239 + }, + { + "epoch": 6.023890784982935, + "grad_norm": 0.100318179400802, + "learning_rate": 9.29170275394781e-06, + "loss": 0.8743, + "step": 28240 + }, + { + "epoch": 6.02410409556314, + "grad_norm": 0.09601249257293126, + "learning_rate": 9.287722597075323e-06, + "loss": 0.9135, + "step": 28241 + }, + { + "epoch": 6.024317406143345, + "grad_norm": 0.09242092356340825, + "learning_rate": 9.283743240313438e-06, + "loss": 0.8859, + "step": 28242 + }, + { + "epoch": 6.024530716723549, + "grad_norm": 0.10243416950783983, + "learning_rate": 9.279764683707175e-06, + "loss": 0.8599, + "step": 28243 + }, + { + "epoch": 6.024744027303754, + "grad_norm": 0.10240773509565314, + "learning_rate": 9.275786927301583e-06, + "loss": 0.8383, + "step": 28244 + }, + { + "epoch": 6.024957337883959, + "grad_norm": 0.09100008946729639, + "learning_rate": 9.271809971141646e-06, + "loss": 0.8513, + "step": 28245 + }, + { + "epoch": 6.025170648464163, + "grad_norm": 0.10002785296996389, + "learning_rate": 9.26783381527237e-06, + "loss": 0.8626, + "step": 28246 + }, + { + "epoch": 6.025383959044369, + "grad_norm": 0.11427079842373845, + "learning_rate": 9.263858459738748e-06, + "loss": 0.8738, + "step": 28247 + }, + { + "epoch": 6.025597269624574, + "grad_norm": 0.09590932831737474, + "learning_rate": 9.259883904585752e-06, + "loss": 0.8859, + "step": 28248 + }, + { + "epoch": 6.025810580204778, + "grad_norm": 0.10934663383877768, + "learning_rate": 9.255910149858356e-06, + "loss": 0.8772, + "step": 28249 + }, + { + "epoch": 6.026023890784983, + "grad_norm": 0.11184671178625864, + "learning_rate": 9.251937195601522e-06, + "loss": 0.8777, + "step": 28250 + }, + { + "epoch": 6.026237201365188, + "grad_norm": 0.09809519451468626, + "learning_rate": 9.247965041860217e-06, + "loss": 0.9037, + "step": 28251 + }, + { + "epoch": 6.026450511945392, + "grad_norm": 0.10072475194686628, + "learning_rate": 9.243993688679383e-06, + "loss": 0.8805, + "step": 28252 + }, + { + "epoch": 6.026663822525597, + "grad_norm": 0.0996159491521778, + "learning_rate": 9.240023136103943e-06, + "loss": 0.881, + "step": 28253 + }, + { + "epoch": 6.026877133105802, + "grad_norm": 0.09740658218418151, + "learning_rate": 9.236053384178833e-06, + "loss": 0.9012, + "step": 28254 + }, + { + "epoch": 6.027090443686006, + "grad_norm": 0.10479862260763306, + "learning_rate": 9.232084432948958e-06, + "loss": 0.8771, + "step": 28255 + }, + { + "epoch": 6.027303754266212, + "grad_norm": 0.10852008019605329, + "learning_rate": 9.22811628245925e-06, + "loss": 0.8785, + "step": 28256 + }, + { + "epoch": 6.027517064846417, + "grad_norm": 0.10481320103375738, + "learning_rate": 9.224148932754597e-06, + "loss": 0.8854, + "step": 28257 + }, + { + "epoch": 6.027730375426621, + "grad_norm": 0.09086448978197326, + "learning_rate": 9.220182383879898e-06, + "loss": 0.8312, + "step": 28258 + }, + { + "epoch": 6.027943686006826, + "grad_norm": 0.09256972537546065, + "learning_rate": 9.216216635880016e-06, + "loss": 0.8884, + "step": 28259 + }, + { + "epoch": 6.028156996587031, + "grad_norm": 0.1011289848420609, + "learning_rate": 9.212251688799836e-06, + "loss": 0.8716, + "step": 28260 + }, + { + "epoch": 6.028370307167235, + "grad_norm": 0.09313197679071311, + "learning_rate": 9.208287542684222e-06, + "loss": 0.9043, + "step": 28261 + }, + { + "epoch": 6.02858361774744, + "grad_norm": 0.10681149866374962, + "learning_rate": 9.204324197578e-06, + "loss": 0.881, + "step": 28262 + }, + { + "epoch": 6.028796928327645, + "grad_norm": 0.09531732415801393, + "learning_rate": 9.200361653526069e-06, + "loss": 0.8715, + "step": 28263 + }, + { + "epoch": 6.0290102389078495, + "grad_norm": 0.10871747683645766, + "learning_rate": 9.196399910573227e-06, + "loss": 0.8818, + "step": 28264 + }, + { + "epoch": 6.029223549488054, + "grad_norm": 0.09694668268274795, + "learning_rate": 9.192438968764317e-06, + "loss": 0.8673, + "step": 28265 + }, + { + "epoch": 6.02943686006826, + "grad_norm": 0.09004057419867684, + "learning_rate": 9.188478828144141e-06, + "loss": 0.8508, + "step": 28266 + }, + { + "epoch": 6.029650170648464, + "grad_norm": 0.10328271308024804, + "learning_rate": 9.184519488757515e-06, + "loss": 0.8952, + "step": 28267 + }, + { + "epoch": 6.029863481228669, + "grad_norm": 0.09946579195172907, + "learning_rate": 9.180560950649248e-06, + "loss": 0.9084, + "step": 28268 + }, + { + "epoch": 6.030076791808874, + "grad_norm": 0.10511724058238073, + "learning_rate": 9.17660321386415e-06, + "loss": 0.901, + "step": 28269 + }, + { + "epoch": 6.0302901023890785, + "grad_norm": 0.1035929441222999, + "learning_rate": 9.17264627844694e-06, + "loss": 0.8703, + "step": 28270 + }, + { + "epoch": 6.030503412969283, + "grad_norm": 0.08803427750015283, + "learning_rate": 9.168690144442443e-06, + "loss": 0.8766, + "step": 28271 + }, + { + "epoch": 6.030716723549488, + "grad_norm": 0.10661103819117547, + "learning_rate": 9.164734811895415e-06, + "loss": 0.8902, + "step": 28272 + }, + { + "epoch": 6.0309300341296925, + "grad_norm": 0.11276269454532699, + "learning_rate": 9.160780280850603e-06, + "loss": 0.8364, + "step": 28273 + }, + { + "epoch": 6.031143344709897, + "grad_norm": 0.09054316644085966, + "learning_rate": 9.156826551352735e-06, + "loss": 0.8659, + "step": 28274 + }, + { + "epoch": 6.031356655290103, + "grad_norm": 0.10985944037597375, + "learning_rate": 9.152873623446576e-06, + "loss": 0.8884, + "step": 28275 + }, + { + "epoch": 6.0315699658703075, + "grad_norm": 0.09485685418079297, + "learning_rate": 9.148921497176855e-06, + "loss": 0.8669, + "step": 28276 + }, + { + "epoch": 6.031783276450512, + "grad_norm": 0.11627612321631241, + "learning_rate": 9.144970172588271e-06, + "loss": 0.8793, + "step": 28277 + }, + { + "epoch": 6.031996587030717, + "grad_norm": 0.10117814448422356, + "learning_rate": 9.14101964972555e-06, + "loss": 0.8801, + "step": 28278 + }, + { + "epoch": 6.0322098976109215, + "grad_norm": 0.10567932825235896, + "learning_rate": 9.13706992863336e-06, + "loss": 0.8881, + "step": 28279 + }, + { + "epoch": 6.032423208191126, + "grad_norm": 0.11300405423678234, + "learning_rate": 9.133121009356451e-06, + "loss": 0.8781, + "step": 28280 + }, + { + "epoch": 6.032636518771331, + "grad_norm": 0.10035556737351439, + "learning_rate": 9.129172891939447e-06, + "loss": 0.8822, + "step": 28281 + }, + { + "epoch": 6.032849829351536, + "grad_norm": 0.10974479055083258, + "learning_rate": 9.125225576427038e-06, + "loss": 0.8884, + "step": 28282 + }, + { + "epoch": 6.03306313993174, + "grad_norm": 0.116023731192945, + "learning_rate": 9.1212790628639e-06, + "loss": 0.8987, + "step": 28283 + }, + { + "epoch": 6.033276450511946, + "grad_norm": 0.0997140839801004, + "learning_rate": 9.11733335129469e-06, + "loss": 0.9132, + "step": 28284 + }, + { + "epoch": 6.0334897610921505, + "grad_norm": 0.09814068107308327, + "learning_rate": 9.113388441764042e-06, + "loss": 0.9272, + "step": 28285 + }, + { + "epoch": 6.033703071672355, + "grad_norm": 0.10283595749739945, + "learning_rate": 9.109444334316576e-06, + "loss": 0.8584, + "step": 28286 + }, + { + "epoch": 6.03391638225256, + "grad_norm": 0.09916373159754974, + "learning_rate": 9.105501028996966e-06, + "loss": 0.877, + "step": 28287 + }, + { + "epoch": 6.034129692832765, + "grad_norm": 0.09914001839210897, + "learning_rate": 9.101558525849791e-06, + "loss": 0.8737, + "step": 28288 + }, + { + "epoch": 6.034343003412969, + "grad_norm": 0.09084253336260917, + "learning_rate": 9.097616824919685e-06, + "loss": 0.8808, + "step": 28289 + }, + { + "epoch": 6.034556313993174, + "grad_norm": 0.10600230672640794, + "learning_rate": 9.093675926251237e-06, + "loss": 0.8535, + "step": 28290 + }, + { + "epoch": 6.034769624573379, + "grad_norm": 0.08957411425520917, + "learning_rate": 9.08973582988903e-06, + "loss": 0.838, + "step": 28291 + }, + { + "epoch": 6.034982935153583, + "grad_norm": 0.090029419264021, + "learning_rate": 9.085796535877658e-06, + "loss": 0.8526, + "step": 28292 + }, + { + "epoch": 6.035196245733788, + "grad_norm": 0.08999252208373722, + "learning_rate": 9.081858044261699e-06, + "loss": 0.87, + "step": 28293 + }, + { + "epoch": 6.035409556313994, + "grad_norm": 0.11267723718775163, + "learning_rate": 9.077920355085683e-06, + "loss": 0.8873, + "step": 28294 + }, + { + "epoch": 6.035622866894198, + "grad_norm": 0.09948511200359633, + "learning_rate": 9.07398346839421e-06, + "loss": 0.8814, + "step": 28295 + }, + { + "epoch": 6.035836177474403, + "grad_norm": 0.0915665118501857, + "learning_rate": 9.07004738423181e-06, + "loss": 0.877, + "step": 28296 + }, + { + "epoch": 6.036049488054608, + "grad_norm": 0.11630912018198937, + "learning_rate": 9.06611210264301e-06, + "loss": 0.8855, + "step": 28297 + }, + { + "epoch": 6.036262798634812, + "grad_norm": 0.10006918055865387, + "learning_rate": 9.06217762367235e-06, + "loss": 0.857, + "step": 28298 + }, + { + "epoch": 6.036476109215017, + "grad_norm": 0.09554862816108378, + "learning_rate": 9.058243947364329e-06, + "loss": 0.8696, + "step": 28299 + }, + { + "epoch": 6.036689419795222, + "grad_norm": 0.10161216441000422, + "learning_rate": 9.054311073763489e-06, + "loss": 0.8974, + "step": 28300 + }, + { + "epoch": 6.036902730375426, + "grad_norm": 0.09560160222413673, + "learning_rate": 9.05037900291431e-06, + "loss": 0.8628, + "step": 28301 + }, + { + "epoch": 6.037116040955631, + "grad_norm": 0.09135798926566108, + "learning_rate": 9.04644773486128e-06, + "loss": 0.9009, + "step": 28302 + }, + { + "epoch": 6.037329351535837, + "grad_norm": 0.0976960175250766, + "learning_rate": 9.042517269648887e-06, + "loss": 0.8604, + "step": 28303 + }, + { + "epoch": 6.037542662116041, + "grad_norm": 0.10427993662593789, + "learning_rate": 9.038587607321613e-06, + "loss": 0.869, + "step": 28304 + }, + { + "epoch": 6.037755972696246, + "grad_norm": 0.08828572327777662, + "learning_rate": 9.034658747923913e-06, + "loss": 0.8583, + "step": 28305 + }, + { + "epoch": 6.037969283276451, + "grad_norm": 0.10267133673557251, + "learning_rate": 9.030730691500217e-06, + "loss": 0.8914, + "step": 28306 + }, + { + "epoch": 6.038182593856655, + "grad_norm": 0.10555769962789797, + "learning_rate": 9.026803438095028e-06, + "loss": 0.8732, + "step": 28307 + }, + { + "epoch": 6.03839590443686, + "grad_norm": 0.10142154879488732, + "learning_rate": 9.022876987752744e-06, + "loss": 0.8907, + "step": 28308 + }, + { + "epoch": 6.038609215017065, + "grad_norm": 0.10768897378827216, + "learning_rate": 9.018951340517801e-06, + "loss": 0.9012, + "step": 28309 + }, + { + "epoch": 6.038822525597269, + "grad_norm": 0.09228373927730299, + "learning_rate": 9.015026496434616e-06, + "loss": 0.8692, + "step": 28310 + }, + { + "epoch": 6.039035836177474, + "grad_norm": 0.11854019798174636, + "learning_rate": 9.01110245554758e-06, + "loss": 0.9012, + "step": 28311 + }, + { + "epoch": 6.039249146757679, + "grad_norm": 0.08610060662459611, + "learning_rate": 9.007179217901138e-06, + "loss": 0.8774, + "step": 28312 + }, + { + "epoch": 6.039462457337884, + "grad_norm": 0.1053833960230978, + "learning_rate": 9.003256783539656e-06, + "loss": 0.8717, + "step": 28313 + }, + { + "epoch": 6.039675767918089, + "grad_norm": 0.10224737311608219, + "learning_rate": 8.999335152507514e-06, + "loss": 0.8705, + "step": 28314 + }, + { + "epoch": 6.039889078498294, + "grad_norm": 0.09864644709194803, + "learning_rate": 8.995414324849094e-06, + "loss": 0.9095, + "step": 28315 + }, + { + "epoch": 6.040102389078498, + "grad_norm": 0.10091727052565881, + "learning_rate": 8.991494300608746e-06, + "loss": 0.8843, + "step": 28316 + }, + { + "epoch": 6.040315699658703, + "grad_norm": 0.11465325508466782, + "learning_rate": 8.987575079830848e-06, + "loss": 0.8786, + "step": 28317 + }, + { + "epoch": 6.040529010238908, + "grad_norm": 0.09665888685521906, + "learning_rate": 8.983656662559705e-06, + "loss": 0.9306, + "step": 28318 + }, + { + "epoch": 6.040742320819112, + "grad_norm": 0.09862921413545117, + "learning_rate": 8.9797390488397e-06, + "loss": 0.9207, + "step": 28319 + }, + { + "epoch": 6.040955631399317, + "grad_norm": 0.09811054315932712, + "learning_rate": 8.975822238715142e-06, + "loss": 0.8583, + "step": 28320 + }, + { + "epoch": 6.041168941979522, + "grad_norm": 0.09543099550090646, + "learning_rate": 8.971906232230352e-06, + "loss": 0.8824, + "step": 28321 + }, + { + "epoch": 6.041382252559727, + "grad_norm": 0.09480617912497583, + "learning_rate": 8.967991029429641e-06, + "loss": 0.9181, + "step": 28322 + }, + { + "epoch": 6.041595563139932, + "grad_norm": 0.1092969059983424, + "learning_rate": 8.964076630357287e-06, + "loss": 0.8717, + "step": 28323 + }, + { + "epoch": 6.041808873720137, + "grad_norm": 0.11138273133919674, + "learning_rate": 8.960163035057614e-06, + "loss": 0.8982, + "step": 28324 + }, + { + "epoch": 6.042022184300341, + "grad_norm": 0.09152378208946735, + "learning_rate": 8.956250243574911e-06, + "loss": 0.8748, + "step": 28325 + }, + { + "epoch": 6.042235494880546, + "grad_norm": 0.0935587362064016, + "learning_rate": 8.952338255953398e-06, + "loss": 0.865, + "step": 28326 + }, + { + "epoch": 6.042448805460751, + "grad_norm": 0.10467070214224407, + "learning_rate": 8.948427072237388e-06, + "loss": 0.8658, + "step": 28327 + }, + { + "epoch": 6.0426621160409555, + "grad_norm": 0.09117751900505475, + "learning_rate": 8.944516692471121e-06, + "loss": 0.8587, + "step": 28328 + }, + { + "epoch": 6.04287542662116, + "grad_norm": 0.10328893773714172, + "learning_rate": 8.940607116698846e-06, + "loss": 0.8654, + "step": 28329 + }, + { + "epoch": 6.043088737201365, + "grad_norm": 0.10542676501965881, + "learning_rate": 8.936698344964774e-06, + "loss": 0.8972, + "step": 28330 + }, + { + "epoch": 6.0433020477815695, + "grad_norm": 0.09539903633545113, + "learning_rate": 8.932790377313182e-06, + "loss": 0.8661, + "step": 28331 + }, + { + "epoch": 6.043515358361775, + "grad_norm": 0.09554543840172172, + "learning_rate": 8.928883213788252e-06, + "loss": 0.8936, + "step": 28332 + }, + { + "epoch": 6.04372866894198, + "grad_norm": 0.1101341866028672, + "learning_rate": 8.924976854434209e-06, + "loss": 0.9035, + "step": 28333 + }, + { + "epoch": 6.0439419795221845, + "grad_norm": 0.1023853604485907, + "learning_rate": 8.921071299295244e-06, + "loss": 0.9131, + "step": 28334 + }, + { + "epoch": 6.044155290102389, + "grad_norm": 0.0900659585313298, + "learning_rate": 8.917166548415541e-06, + "loss": 0.8776, + "step": 28335 + }, + { + "epoch": 6.044368600682594, + "grad_norm": 0.09491463684874916, + "learning_rate": 8.913262601839326e-06, + "loss": 0.8607, + "step": 28336 + }, + { + "epoch": 6.0445819112627985, + "grad_norm": 0.09710919896887862, + "learning_rate": 8.90935945961072e-06, + "loss": 0.8973, + "step": 28337 + }, + { + "epoch": 6.044795221843003, + "grad_norm": 0.09692627682290066, + "learning_rate": 8.905457121773895e-06, + "loss": 0.8936, + "step": 28338 + }, + { + "epoch": 6.045008532423208, + "grad_norm": 0.08861055541259302, + "learning_rate": 8.901555588373036e-06, + "loss": 0.8969, + "step": 28339 + }, + { + "epoch": 6.045221843003413, + "grad_norm": 0.08849527545970526, + "learning_rate": 8.89765485945227e-06, + "loss": 0.8845, + "step": 28340 + }, + { + "epoch": 6.045435153583618, + "grad_norm": 0.09317131607556872, + "learning_rate": 8.893754935055736e-06, + "loss": 0.8637, + "step": 28341 + }, + { + "epoch": 6.045648464163823, + "grad_norm": 0.09959985679252147, + "learning_rate": 8.889855815227544e-06, + "loss": 0.8908, + "step": 28342 + }, + { + "epoch": 6.0458617747440275, + "grad_norm": 0.09055280052579005, + "learning_rate": 8.885957500011843e-06, + "loss": 0.9091, + "step": 28343 + }, + { + "epoch": 6.046075085324232, + "grad_norm": 0.0917371415630452, + "learning_rate": 8.882059989452734e-06, + "loss": 0.8641, + "step": 28344 + }, + { + "epoch": 6.046288395904437, + "grad_norm": 0.11070231534188728, + "learning_rate": 8.878163283594313e-06, + "loss": 0.899, + "step": 28345 + }, + { + "epoch": 6.046501706484642, + "grad_norm": 0.10238583465896256, + "learning_rate": 8.874267382480669e-06, + "loss": 0.8869, + "step": 28346 + }, + { + "epoch": 6.046715017064846, + "grad_norm": 0.090883967374759, + "learning_rate": 8.870372286155882e-06, + "loss": 0.9081, + "step": 28347 + }, + { + "epoch": 6.046928327645051, + "grad_norm": 0.11290092100631932, + "learning_rate": 8.866477994664033e-06, + "loss": 0.8759, + "step": 28348 + }, + { + "epoch": 6.047141638225256, + "grad_norm": 0.08837289642224125, + "learning_rate": 8.862584508049176e-06, + "loss": 0.8954, + "step": 28349 + }, + { + "epoch": 6.04735494880546, + "grad_norm": 0.08624188864044303, + "learning_rate": 8.858691826355366e-06, + "loss": 0.8815, + "step": 28350 + }, + { + "epoch": 6.047568259385666, + "grad_norm": 0.09152033912842636, + "learning_rate": 8.854799949626663e-06, + "loss": 0.8577, + "step": 28351 + }, + { + "epoch": 6.047781569965871, + "grad_norm": 0.08453413508482735, + "learning_rate": 8.850908877907102e-06, + "loss": 0.8642, + "step": 28352 + }, + { + "epoch": 6.047994880546075, + "grad_norm": 0.09456988170161192, + "learning_rate": 8.847018611240697e-06, + "loss": 0.8711, + "step": 28353 + }, + { + "epoch": 6.04820819112628, + "grad_norm": 0.08775827334679738, + "learning_rate": 8.843129149671461e-06, + "loss": 0.8853, + "step": 28354 + }, + { + "epoch": 6.048421501706485, + "grad_norm": 0.08990625039636731, + "learning_rate": 8.839240493243424e-06, + "loss": 0.9097, + "step": 28355 + }, + { + "epoch": 6.048634812286689, + "grad_norm": 0.09312359082780337, + "learning_rate": 8.835352642000585e-06, + "loss": 0.8931, + "step": 28356 + }, + { + "epoch": 6.048848122866894, + "grad_norm": 0.08900667663259908, + "learning_rate": 8.831465595986928e-06, + "loss": 0.8847, + "step": 28357 + }, + { + "epoch": 6.049061433447099, + "grad_norm": 0.0907222076641844, + "learning_rate": 8.827579355246434e-06, + "loss": 0.8974, + "step": 28358 + }, + { + "epoch": 6.049274744027303, + "grad_norm": 0.09260840846793468, + "learning_rate": 8.823693919823077e-06, + "loss": 0.8879, + "step": 28359 + }, + { + "epoch": 6.049488054607509, + "grad_norm": 0.08623234237368883, + "learning_rate": 8.819809289760823e-06, + "loss": 0.8733, + "step": 28360 + }, + { + "epoch": 6.049701365187714, + "grad_norm": 0.08209338671665363, + "learning_rate": 8.81592546510362e-06, + "loss": 0.8603, + "step": 28361 + }, + { + "epoch": 6.049914675767918, + "grad_norm": 0.08849414561600798, + "learning_rate": 8.8120424458954e-06, + "loss": 0.8771, + "step": 28362 + }, + { + "epoch": 6.050127986348123, + "grad_norm": 0.09089019399473967, + "learning_rate": 8.808160232180135e-06, + "loss": 0.8847, + "step": 28363 + }, + { + "epoch": 6.050341296928328, + "grad_norm": 0.0803056287414193, + "learning_rate": 8.804278824001735e-06, + "loss": 0.8749, + "step": 28364 + }, + { + "epoch": 6.050554607508532, + "grad_norm": 0.08018458214828372, + "learning_rate": 8.800398221404118e-06, + "loss": 0.8757, + "step": 28365 + }, + { + "epoch": 6.050767918088737, + "grad_norm": 0.0903277157209915, + "learning_rate": 8.79651842443118e-06, + "loss": 0.9189, + "step": 28366 + }, + { + "epoch": 6.050981228668942, + "grad_norm": 0.0847474063329261, + "learning_rate": 8.79263943312684e-06, + "loss": 0.8803, + "step": 28367 + }, + { + "epoch": 6.051194539249146, + "grad_norm": 0.0901953100033971, + "learning_rate": 8.788761247534992e-06, + "loss": 0.889, + "step": 28368 + }, + { + "epoch": 6.051407849829351, + "grad_norm": 0.11804865346145837, + "learning_rate": 8.784883867699502e-06, + "loss": 0.8923, + "step": 28369 + }, + { + "epoch": 6.051621160409557, + "grad_norm": 0.09460241360256616, + "learning_rate": 8.781007293664258e-06, + "loss": 0.8887, + "step": 28370 + }, + { + "epoch": 6.051834470989761, + "grad_norm": 0.10607944453528632, + "learning_rate": 8.777131525473109e-06, + "loss": 0.8413, + "step": 28371 + }, + { + "epoch": 6.052047781569966, + "grad_norm": 0.09328577117284421, + "learning_rate": 8.77325656316991e-06, + "loss": 0.9092, + "step": 28372 + }, + { + "epoch": 6.052261092150171, + "grad_norm": 0.09401931945440598, + "learning_rate": 8.769382406798512e-06, + "loss": 0.8792, + "step": 28373 + }, + { + "epoch": 6.052474402730375, + "grad_norm": 0.10249950618590957, + "learning_rate": 8.765509056402739e-06, + "loss": 0.8602, + "step": 28374 + }, + { + "epoch": 6.05268771331058, + "grad_norm": 0.09050845044990315, + "learning_rate": 8.761636512026447e-06, + "loss": 0.85, + "step": 28375 + }, + { + "epoch": 6.052901023890785, + "grad_norm": 0.08978079365864092, + "learning_rate": 8.757764773713436e-06, + "loss": 0.8524, + "step": 28376 + }, + { + "epoch": 6.0531143344709895, + "grad_norm": 0.09881529781566384, + "learning_rate": 8.753893841507506e-06, + "loss": 0.869, + "step": 28377 + }, + { + "epoch": 6.053327645051194, + "grad_norm": 0.0943460929545283, + "learning_rate": 8.750023715452456e-06, + "loss": 0.851, + "step": 28378 + }, + { + "epoch": 6.0535409556314, + "grad_norm": 0.08926698377712501, + "learning_rate": 8.7461543955921e-06, + "loss": 0.8796, + "step": 28379 + }, + { + "epoch": 6.053754266211604, + "grad_norm": 0.09691937982101283, + "learning_rate": 8.742285881970205e-06, + "loss": 0.9008, + "step": 28380 + }, + { + "epoch": 6.053967576791809, + "grad_norm": 0.09539843522851164, + "learning_rate": 8.73841817463057e-06, + "loss": 0.8985, + "step": 28381 + }, + { + "epoch": 6.054180887372014, + "grad_norm": 0.09133946393576602, + "learning_rate": 8.73455127361689e-06, + "loss": 0.893, + "step": 28382 + }, + { + "epoch": 6.0543941979522184, + "grad_norm": 0.09555438863198626, + "learning_rate": 8.730685178972974e-06, + "loss": 0.8881, + "step": 28383 + }, + { + "epoch": 6.054607508532423, + "grad_norm": 0.092204128290865, + "learning_rate": 8.726819890742564e-06, + "loss": 0.8965, + "step": 28384 + }, + { + "epoch": 6.054820819112628, + "grad_norm": 0.08828852091234753, + "learning_rate": 8.722955408969373e-06, + "loss": 0.8848, + "step": 28385 + }, + { + "epoch": 6.0550341296928325, + "grad_norm": 0.09876940302548541, + "learning_rate": 8.719091733697138e-06, + "loss": 0.8813, + "step": 28386 + }, + { + "epoch": 6.055247440273037, + "grad_norm": 0.0905949812884694, + "learning_rate": 8.715228864969582e-06, + "loss": 0.8853, + "step": 28387 + }, + { + "epoch": 6.055460750853243, + "grad_norm": 0.08425687572806749, + "learning_rate": 8.711366802830422e-06, + "loss": 0.8965, + "step": 28388 + }, + { + "epoch": 6.055674061433447, + "grad_norm": 0.09179233830363272, + "learning_rate": 8.70750554732334e-06, + "loss": 0.862, + "step": 28389 + }, + { + "epoch": 6.055887372013652, + "grad_norm": 0.09214467675091188, + "learning_rate": 8.703645098492016e-06, + "loss": 0.8893, + "step": 28390 + }, + { + "epoch": 6.056100682593857, + "grad_norm": 0.09220791124435, + "learning_rate": 8.699785456380163e-06, + "loss": 0.8644, + "step": 28391 + }, + { + "epoch": 6.0563139931740615, + "grad_norm": 0.08854738969415589, + "learning_rate": 8.695926621031457e-06, + "loss": 0.864, + "step": 28392 + }, + { + "epoch": 6.056527303754266, + "grad_norm": 0.09556420930737505, + "learning_rate": 8.692068592489521e-06, + "loss": 0.918, + "step": 28393 + }, + { + "epoch": 6.056740614334471, + "grad_norm": 0.09288948307002544, + "learning_rate": 8.688211370798015e-06, + "loss": 0.8637, + "step": 28394 + }, + { + "epoch": 6.0569539249146755, + "grad_norm": 0.09772078044845887, + "learning_rate": 8.684354956000604e-06, + "loss": 0.8789, + "step": 28395 + }, + { + "epoch": 6.05716723549488, + "grad_norm": 0.08958293765325873, + "learning_rate": 8.68049934814093e-06, + "loss": 0.8848, + "step": 28396 + }, + { + "epoch": 6.057380546075085, + "grad_norm": 0.08852089744402082, + "learning_rate": 8.676644547262598e-06, + "loss": 0.8688, + "step": 28397 + }, + { + "epoch": 6.0575938566552905, + "grad_norm": 0.0893267187989629, + "learning_rate": 8.672790553409212e-06, + "loss": 0.9032, + "step": 28398 + }, + { + "epoch": 6.057807167235495, + "grad_norm": 0.08604800335150012, + "learning_rate": 8.668937366624424e-06, + "loss": 0.8901, + "step": 28399 + }, + { + "epoch": 6.0580204778157, + "grad_norm": 0.093597532524229, + "learning_rate": 8.665084986951808e-06, + "loss": 0.8713, + "step": 28400 + }, + { + "epoch": 6.0582337883959045, + "grad_norm": 0.09539311053853544, + "learning_rate": 8.661233414434949e-06, + "loss": 0.8777, + "step": 28401 + }, + { + "epoch": 6.058447098976109, + "grad_norm": 0.0873739197775986, + "learning_rate": 8.657382649117428e-06, + "loss": 0.8941, + "step": 28402 + }, + { + "epoch": 6.058660409556314, + "grad_norm": 0.10136791268305914, + "learning_rate": 8.653532691042845e-06, + "loss": 0.9076, + "step": 28403 + }, + { + "epoch": 6.058873720136519, + "grad_norm": 0.08475489041589379, + "learning_rate": 8.649683540254732e-06, + "loss": 0.8675, + "step": 28404 + }, + { + "epoch": 6.059087030716723, + "grad_norm": 0.09811181119758525, + "learning_rate": 8.645835196796643e-06, + "loss": 0.8633, + "step": 28405 + }, + { + "epoch": 6.059300341296928, + "grad_norm": 0.09098016885665518, + "learning_rate": 8.641987660712119e-06, + "loss": 0.8971, + "step": 28406 + }, + { + "epoch": 6.0595136518771335, + "grad_norm": 0.09025184069601389, + "learning_rate": 8.638140932044713e-06, + "loss": 0.8821, + "step": 28407 + }, + { + "epoch": 6.059726962457338, + "grad_norm": 0.08410618420953495, + "learning_rate": 8.63429501083795e-06, + "loss": 0.8525, + "step": 28408 + }, + { + "epoch": 6.059940273037543, + "grad_norm": 0.09298226363212292, + "learning_rate": 8.630449897135338e-06, + "loss": 0.8926, + "step": 28409 + }, + { + "epoch": 6.060153583617748, + "grad_norm": 0.08337292579118454, + "learning_rate": 8.626605590980373e-06, + "loss": 0.8465, + "step": 28410 + }, + { + "epoch": 6.060366894197952, + "grad_norm": 0.08849970581597841, + "learning_rate": 8.622762092416574e-06, + "loss": 0.8682, + "step": 28411 + }, + { + "epoch": 6.060580204778157, + "grad_norm": 0.08710455898206554, + "learning_rate": 8.618919401487429e-06, + "loss": 0.9024, + "step": 28412 + }, + { + "epoch": 6.060793515358362, + "grad_norm": 0.09007114432608604, + "learning_rate": 8.615077518236403e-06, + "loss": 0.8785, + "step": 28413 + }, + { + "epoch": 6.061006825938566, + "grad_norm": 0.09093967274671023, + "learning_rate": 8.611236442706982e-06, + "loss": 0.8648, + "step": 28414 + }, + { + "epoch": 6.061220136518771, + "grad_norm": 0.09771446140837242, + "learning_rate": 8.607396174942626e-06, + "loss": 0.8666, + "step": 28415 + }, + { + "epoch": 6.061433447098976, + "grad_norm": 0.09158206034926197, + "learning_rate": 8.603556714986773e-06, + "loss": 0.8704, + "step": 28416 + }, + { + "epoch": 6.061646757679181, + "grad_norm": 0.08556671802327909, + "learning_rate": 8.599718062882885e-06, + "loss": 0.8558, + "step": 28417 + }, + { + "epoch": 6.061860068259386, + "grad_norm": 0.11383928676574849, + "learning_rate": 8.595880218674373e-06, + "loss": 0.8822, + "step": 28418 + }, + { + "epoch": 6.062073378839591, + "grad_norm": 0.09141632042044762, + "learning_rate": 8.592043182404687e-06, + "loss": 0.8725, + "step": 28419 + }, + { + "epoch": 6.062286689419795, + "grad_norm": 0.09579812904359865, + "learning_rate": 8.588206954117236e-06, + "loss": 0.8829, + "step": 28420 + }, + { + "epoch": 6.0625, + "grad_norm": 0.09994818539723951, + "learning_rate": 8.58437153385542e-06, + "loss": 0.8913, + "step": 28421 + }, + { + "epoch": 6.062713310580205, + "grad_norm": 0.09824760902806995, + "learning_rate": 8.58053692166263e-06, + "loss": 0.8702, + "step": 28422 + }, + { + "epoch": 6.062926621160409, + "grad_norm": 0.10010072898560658, + "learning_rate": 8.576703117582288e-06, + "loss": 0.8949, + "step": 28423 + }, + { + "epoch": 6.063139931740614, + "grad_norm": 0.08454194725121954, + "learning_rate": 8.572870121657747e-06, + "loss": 0.8679, + "step": 28424 + }, + { + "epoch": 6.063353242320819, + "grad_norm": 0.09616747100257486, + "learning_rate": 8.569037933932373e-06, + "loss": 0.8931, + "step": 28425 + }, + { + "epoch": 6.063566552901024, + "grad_norm": 0.0941572236957632, + "learning_rate": 8.565206554449541e-06, + "loss": 0.8806, + "step": 28426 + }, + { + "epoch": 6.063779863481229, + "grad_norm": 0.09514587966205576, + "learning_rate": 8.561375983252609e-06, + "loss": 0.9038, + "step": 28427 + }, + { + "epoch": 6.063993174061434, + "grad_norm": 0.10178586659716571, + "learning_rate": 8.557546220384893e-06, + "loss": 0.8685, + "step": 28428 + }, + { + "epoch": 6.064206484641638, + "grad_norm": 0.09351595213478514, + "learning_rate": 8.553717265889757e-06, + "loss": 0.8834, + "step": 28429 + }, + { + "epoch": 6.064419795221843, + "grad_norm": 0.09061133608166427, + "learning_rate": 8.549889119810485e-06, + "loss": 0.8834, + "step": 28430 + }, + { + "epoch": 6.064633105802048, + "grad_norm": 0.10746574146153859, + "learning_rate": 8.546061782190445e-06, + "loss": 0.878, + "step": 28431 + }, + { + "epoch": 6.064846416382252, + "grad_norm": 0.09966271890557005, + "learning_rate": 8.542235253072913e-06, + "loss": 0.9094, + "step": 28432 + }, + { + "epoch": 6.065059726962457, + "grad_norm": 0.10237788873468026, + "learning_rate": 8.538409532501197e-06, + "loss": 0.8962, + "step": 28433 + }, + { + "epoch": 6.065273037542662, + "grad_norm": 0.11107941861858338, + "learning_rate": 8.534584620518554e-06, + "loss": 0.8607, + "step": 28434 + }, + { + "epoch": 6.0654863481228665, + "grad_norm": 0.1024514773608353, + "learning_rate": 8.530760517168306e-06, + "loss": 0.8559, + "step": 28435 + }, + { + "epoch": 6.065699658703072, + "grad_norm": 0.10123686112783176, + "learning_rate": 8.52693722249371e-06, + "loss": 0.8682, + "step": 28436 + }, + { + "epoch": 6.065912969283277, + "grad_norm": 0.09311912296148152, + "learning_rate": 8.523114736538019e-06, + "loss": 0.9013, + "step": 28437 + }, + { + "epoch": 6.066126279863481, + "grad_norm": 0.1057992379925198, + "learning_rate": 8.519293059344486e-06, + "loss": 0.9142, + "step": 28438 + }, + { + "epoch": 6.066339590443686, + "grad_norm": 0.09946798264181504, + "learning_rate": 8.515472190956351e-06, + "loss": 0.8784, + "step": 28439 + }, + { + "epoch": 6.066552901023891, + "grad_norm": 0.10293287069196523, + "learning_rate": 8.511652131416857e-06, + "loss": 0.8505, + "step": 28440 + }, + { + "epoch": 6.0667662116040955, + "grad_norm": 0.08773421037688311, + "learning_rate": 8.507832880769222e-06, + "loss": 0.8878, + "step": 28441 + }, + { + "epoch": 6.0669795221843, + "grad_norm": 0.11460740512908138, + "learning_rate": 8.50401443905664e-06, + "loss": 0.8796, + "step": 28442 + }, + { + "epoch": 6.067192832764505, + "grad_norm": 0.10236307519380464, + "learning_rate": 8.500196806322356e-06, + "loss": 0.8791, + "step": 28443 + }, + { + "epoch": 6.0674061433447095, + "grad_norm": 0.08686540582736502, + "learning_rate": 8.496379982609544e-06, + "loss": 0.8462, + "step": 28444 + }, + { + "epoch": 6.067619453924915, + "grad_norm": 0.11015327778348373, + "learning_rate": 8.492563967961396e-06, + "loss": 0.8736, + "step": 28445 + }, + { + "epoch": 6.06783276450512, + "grad_norm": 0.1011789071230804, + "learning_rate": 8.488748762421078e-06, + "loss": 0.8808, + "step": 28446 + }, + { + "epoch": 6.0680460750853245, + "grad_norm": 0.09602799970779069, + "learning_rate": 8.48493436603178e-06, + "loss": 0.8863, + "step": 28447 + }, + { + "epoch": 6.068259385665529, + "grad_norm": 0.10492488421709469, + "learning_rate": 8.481120778836672e-06, + "loss": 0.8868, + "step": 28448 + }, + { + "epoch": 6.068472696245734, + "grad_norm": 0.10630923663103473, + "learning_rate": 8.477308000878852e-06, + "loss": 0.8872, + "step": 28449 + }, + { + "epoch": 6.0686860068259385, + "grad_norm": 0.09719312764250207, + "learning_rate": 8.473496032201516e-06, + "loss": 0.8869, + "step": 28450 + }, + { + "epoch": 6.068899317406143, + "grad_norm": 0.10337253505356553, + "learning_rate": 8.469684872847775e-06, + "loss": 0.8771, + "step": 28451 + }, + { + "epoch": 6.069112627986348, + "grad_norm": 0.09342506199531667, + "learning_rate": 8.465874522860748e-06, + "loss": 0.8741, + "step": 28452 + }, + { + "epoch": 6.069325938566553, + "grad_norm": 0.11623612324504311, + "learning_rate": 8.462064982283551e-06, + "loss": 0.9084, + "step": 28453 + }, + { + "epoch": 6.069539249146757, + "grad_norm": 0.10671869906684511, + "learning_rate": 8.458256251159285e-06, + "loss": 0.9026, + "step": 28454 + }, + { + "epoch": 6.069752559726963, + "grad_norm": 0.10876838961286814, + "learning_rate": 8.454448329531063e-06, + "loss": 0.8785, + "step": 28455 + }, + { + "epoch": 6.0699658703071675, + "grad_norm": 0.11554977233397187, + "learning_rate": 8.450641217441969e-06, + "loss": 0.8999, + "step": 28456 + }, + { + "epoch": 6.070179180887372, + "grad_norm": 0.08999518064305188, + "learning_rate": 8.446834914935062e-06, + "loss": 0.8917, + "step": 28457 + }, + { + "epoch": 6.070392491467577, + "grad_norm": 0.09089208603992077, + "learning_rate": 8.443029422053412e-06, + "loss": 0.8834, + "step": 28458 + }, + { + "epoch": 6.0706058020477816, + "grad_norm": 0.09556545267596779, + "learning_rate": 8.439224738840122e-06, + "loss": 0.8862, + "step": 28459 + }, + { + "epoch": 6.070819112627986, + "grad_norm": 0.10140943991316341, + "learning_rate": 8.435420865338187e-06, + "loss": 0.8551, + "step": 28460 + }, + { + "epoch": 6.071032423208191, + "grad_norm": 0.09845188168940248, + "learning_rate": 8.431617801590648e-06, + "loss": 0.865, + "step": 28461 + }, + { + "epoch": 6.071245733788396, + "grad_norm": 0.09429301869069198, + "learning_rate": 8.427815547640584e-06, + "loss": 0.9151, + "step": 28462 + }, + { + "epoch": 6.0714590443686, + "grad_norm": 0.09739786005797955, + "learning_rate": 8.424014103530988e-06, + "loss": 0.8876, + "step": 28463 + }, + { + "epoch": 6.071672354948806, + "grad_norm": 0.1004686505772986, + "learning_rate": 8.420213469304869e-06, + "loss": 0.8741, + "step": 28464 + }, + { + "epoch": 6.0718856655290105, + "grad_norm": 0.09863381976080826, + "learning_rate": 8.41641364500525e-06, + "loss": 0.8925, + "step": 28465 + }, + { + "epoch": 6.072098976109215, + "grad_norm": 0.08508611247964094, + "learning_rate": 8.41261463067509e-06, + "loss": 0.9054, + "step": 28466 + }, + { + "epoch": 6.07231228668942, + "grad_norm": 0.08674275885115973, + "learning_rate": 8.408816426357416e-06, + "loss": 0.8866, + "step": 28467 + }, + { + "epoch": 6.072525597269625, + "grad_norm": 0.11021286889948555, + "learning_rate": 8.405019032095193e-06, + "loss": 0.9035, + "step": 28468 + }, + { + "epoch": 6.072738907849829, + "grad_norm": 0.09166328500616051, + "learning_rate": 8.401222447931379e-06, + "loss": 0.8776, + "step": 28469 + }, + { + "epoch": 6.072952218430034, + "grad_norm": 0.08838365382011099, + "learning_rate": 8.39742667390894e-06, + "loss": 0.883, + "step": 28470 + }, + { + "epoch": 6.073165529010239, + "grad_norm": 0.09660748742354243, + "learning_rate": 8.393631710070829e-06, + "loss": 0.8853, + "step": 28471 + }, + { + "epoch": 6.073378839590443, + "grad_norm": 0.09762467538669567, + "learning_rate": 8.389837556459972e-06, + "loss": 0.8692, + "step": 28472 + }, + { + "epoch": 6.073592150170649, + "grad_norm": 0.09584872854404912, + "learning_rate": 8.386044213119296e-06, + "loss": 0.8841, + "step": 28473 + }, + { + "epoch": 6.073805460750854, + "grad_norm": 0.09698700616837834, + "learning_rate": 8.382251680091742e-06, + "loss": 0.8966, + "step": 28474 + }, + { + "epoch": 6.074018771331058, + "grad_norm": 0.08993142001700806, + "learning_rate": 8.37845995742022e-06, + "loss": 0.8878, + "step": 28475 + }, + { + "epoch": 6.074232081911263, + "grad_norm": 0.09449865690147034, + "learning_rate": 8.374669045147628e-06, + "loss": 0.8939, + "step": 28476 + }, + { + "epoch": 6.074445392491468, + "grad_norm": 0.092554237066391, + "learning_rate": 8.370878943316864e-06, + "loss": 0.8767, + "step": 28477 + }, + { + "epoch": 6.074658703071672, + "grad_norm": 0.09784146313875301, + "learning_rate": 8.367089651970785e-06, + "loss": 0.8917, + "step": 28478 + }, + { + "epoch": 6.074872013651877, + "grad_norm": 0.10354913833159801, + "learning_rate": 8.363301171152315e-06, + "loss": 0.862, + "step": 28479 + }, + { + "epoch": 6.075085324232082, + "grad_norm": 0.09692130908969274, + "learning_rate": 8.359513500904293e-06, + "loss": 0.8759, + "step": 28480 + }, + { + "epoch": 6.075298634812286, + "grad_norm": 0.09427959968969388, + "learning_rate": 8.355726641269588e-06, + "loss": 0.8902, + "step": 28481 + }, + { + "epoch": 6.075511945392491, + "grad_norm": 0.09001264818608715, + "learning_rate": 8.35194059229103e-06, + "loss": 0.8718, + "step": 28482 + }, + { + "epoch": 6.075725255972697, + "grad_norm": 0.09896290245221348, + "learning_rate": 8.348155354011482e-06, + "loss": 0.888, + "step": 28483 + }, + { + "epoch": 6.075938566552901, + "grad_norm": 0.0970287421662115, + "learning_rate": 8.344370926473756e-06, + "loss": 0.8901, + "step": 28484 + }, + { + "epoch": 6.076151877133106, + "grad_norm": 0.09894071935079896, + "learning_rate": 8.340587309720657e-06, + "loss": 0.8826, + "step": 28485 + }, + { + "epoch": 6.076365187713311, + "grad_norm": 0.09549566338253349, + "learning_rate": 8.336804503795038e-06, + "loss": 0.8907, + "step": 28486 + }, + { + "epoch": 6.076578498293515, + "grad_norm": 0.09292937670479273, + "learning_rate": 8.333022508739677e-06, + "loss": 0.8959, + "step": 28487 + }, + { + "epoch": 6.07679180887372, + "grad_norm": 0.0895708934695949, + "learning_rate": 8.329241324597376e-06, + "loss": 0.8937, + "step": 28488 + }, + { + "epoch": 6.077005119453925, + "grad_norm": 0.10471549995743332, + "learning_rate": 8.325460951410922e-06, + "loss": 0.8487, + "step": 28489 + }, + { + "epoch": 6.077218430034129, + "grad_norm": 0.09685300274842656, + "learning_rate": 8.321681389223059e-06, + "loss": 0.8767, + "step": 28490 + }, + { + "epoch": 6.077431740614334, + "grad_norm": 0.09942254077730138, + "learning_rate": 8.317902638076591e-06, + "loss": 0.8701, + "step": 28491 + }, + { + "epoch": 6.07764505119454, + "grad_norm": 0.11816395494952771, + "learning_rate": 8.314124698014266e-06, + "loss": 0.8929, + "step": 28492 + }, + { + "epoch": 6.077858361774744, + "grad_norm": 0.09550764305171455, + "learning_rate": 8.31034756907882e-06, + "loss": 0.8645, + "step": 28493 + }, + { + "epoch": 6.078071672354949, + "grad_norm": 0.0968997225837837, + "learning_rate": 8.306571251312997e-06, + "loss": 0.8838, + "step": 28494 + }, + { + "epoch": 6.078284982935154, + "grad_norm": 0.09415650993223693, + "learning_rate": 8.30279574475953e-06, + "loss": 0.8797, + "step": 28495 + }, + { + "epoch": 6.078498293515358, + "grad_norm": 0.10034371854417642, + "learning_rate": 8.299021049461138e-06, + "loss": 0.8741, + "step": 28496 + }, + { + "epoch": 6.078711604095563, + "grad_norm": 0.09514566975683492, + "learning_rate": 8.295247165460512e-06, + "loss": 0.893, + "step": 28497 + }, + { + "epoch": 6.078924914675768, + "grad_norm": 0.09465599123368315, + "learning_rate": 8.291474092800382e-06, + "loss": 0.8775, + "step": 28498 + }, + { + "epoch": 6.0791382252559725, + "grad_norm": 0.100426342943543, + "learning_rate": 8.287701831523426e-06, + "loss": 0.8993, + "step": 28499 + }, + { + "epoch": 6.079351535836177, + "grad_norm": 0.09204044531951042, + "learning_rate": 8.283930381672332e-06, + "loss": 0.8654, + "step": 28500 + }, + { + "epoch": 6.079564846416382, + "grad_norm": 0.094515606498481, + "learning_rate": 8.280159743289782e-06, + "loss": 0.8773, + "step": 28501 + }, + { + "epoch": 6.079778156996587, + "grad_norm": 0.09160090094457869, + "learning_rate": 8.2763899164184e-06, + "loss": 0.8398, + "step": 28502 + }, + { + "epoch": 6.079991467576792, + "grad_norm": 0.09757861893947929, + "learning_rate": 8.272620901100902e-06, + "loss": 0.9054, + "step": 28503 + }, + { + "epoch": 6.080204778156997, + "grad_norm": 0.10062710184495825, + "learning_rate": 8.26885269737991e-06, + "loss": 0.8663, + "step": 28504 + }, + { + "epoch": 6.0804180887372015, + "grad_norm": 0.09055836729140064, + "learning_rate": 8.265085305298029e-06, + "loss": 0.9189, + "step": 28505 + }, + { + "epoch": 6.080631399317406, + "grad_norm": 0.08467200297125727, + "learning_rate": 8.261318724897936e-06, + "loss": 0.8634, + "step": 28506 + }, + { + "epoch": 6.080844709897611, + "grad_norm": 0.1001536341413875, + "learning_rate": 8.257552956222219e-06, + "loss": 0.8851, + "step": 28507 + }, + { + "epoch": 6.0810580204778155, + "grad_norm": 0.08924227006511572, + "learning_rate": 8.253787999313503e-06, + "loss": 0.8796, + "step": 28508 + }, + { + "epoch": 6.08127133105802, + "grad_norm": 0.09948436412694847, + "learning_rate": 8.250023854214357e-06, + "loss": 0.9006, + "step": 28509 + }, + { + "epoch": 6.081484641638225, + "grad_norm": 0.09573662605813826, + "learning_rate": 8.246260520967424e-06, + "loss": 0.8823, + "step": 28510 + }, + { + "epoch": 6.0816979522184305, + "grad_norm": 0.09588841750355911, + "learning_rate": 8.242497999615256e-06, + "loss": 0.885, + "step": 28511 + }, + { + "epoch": 6.081911262798635, + "grad_norm": 0.09629499138117129, + "learning_rate": 8.238736290200431e-06, + "loss": 0.8895, + "step": 28512 + }, + { + "epoch": 6.08212457337884, + "grad_norm": 0.09392404996130535, + "learning_rate": 8.234975392765512e-06, + "loss": 0.9281, + "step": 28513 + }, + { + "epoch": 6.0823378839590445, + "grad_norm": 0.09306389683076974, + "learning_rate": 8.231215307353041e-06, + "loss": 0.8705, + "step": 28514 + }, + { + "epoch": 6.082551194539249, + "grad_norm": 0.09980710673928614, + "learning_rate": 8.227456034005601e-06, + "loss": 0.8741, + "step": 28515 + }, + { + "epoch": 6.082764505119454, + "grad_norm": 0.08630185511288421, + "learning_rate": 8.223697572765697e-06, + "loss": 0.8765, + "step": 28516 + }, + { + "epoch": 6.082977815699659, + "grad_norm": 0.09824831043874148, + "learning_rate": 8.219939923675837e-06, + "loss": 0.8861, + "step": 28517 + }, + { + "epoch": 6.083191126279863, + "grad_norm": 0.11120969233197697, + "learning_rate": 8.216183086778584e-06, + "loss": 0.895, + "step": 28518 + }, + { + "epoch": 6.083404436860068, + "grad_norm": 0.08884686636458433, + "learning_rate": 8.212427062116428e-06, + "loss": 0.8673, + "step": 28519 + }, + { + "epoch": 6.083617747440273, + "grad_norm": 0.09327503464761704, + "learning_rate": 8.20867184973187e-06, + "loss": 0.9048, + "step": 28520 + }, + { + "epoch": 6.083831058020478, + "grad_norm": 0.09693280548422237, + "learning_rate": 8.204917449667369e-06, + "loss": 0.8787, + "step": 28521 + }, + { + "epoch": 6.084044368600683, + "grad_norm": 0.09551921089474784, + "learning_rate": 8.201163861965465e-06, + "loss": 0.8656, + "step": 28522 + }, + { + "epoch": 6.084257679180888, + "grad_norm": 0.08866596548621297, + "learning_rate": 8.197411086668591e-06, + "loss": 0.892, + "step": 28523 + }, + { + "epoch": 6.084470989761092, + "grad_norm": 0.09836532916643002, + "learning_rate": 8.193659123819224e-06, + "loss": 0.8835, + "step": 28524 + }, + { + "epoch": 6.084684300341297, + "grad_norm": 0.10239349925634672, + "learning_rate": 8.189907973459808e-06, + "loss": 0.8778, + "step": 28525 + }, + { + "epoch": 6.084897610921502, + "grad_norm": 0.09359194239615706, + "learning_rate": 8.186157635632787e-06, + "loss": 0.8783, + "step": 28526 + }, + { + "epoch": 6.085110921501706, + "grad_norm": 0.09287958504421498, + "learning_rate": 8.182408110380602e-06, + "loss": 0.9057, + "step": 28527 + }, + { + "epoch": 6.085324232081911, + "grad_norm": 0.0978248026968824, + "learning_rate": 8.178659397745687e-06, + "loss": 0.8638, + "step": 28528 + }, + { + "epoch": 6.085537542662116, + "grad_norm": 0.08728652736360004, + "learning_rate": 8.17491149777042e-06, + "loss": 0.9082, + "step": 28529 + }, + { + "epoch": 6.085750853242321, + "grad_norm": 0.0930249858993434, + "learning_rate": 8.171164410497261e-06, + "loss": 0.9081, + "step": 28530 + }, + { + "epoch": 6.085964163822526, + "grad_norm": 0.09616063980070255, + "learning_rate": 8.167418135968588e-06, + "loss": 0.8689, + "step": 28531 + }, + { + "epoch": 6.086177474402731, + "grad_norm": 0.09210854516893394, + "learning_rate": 8.163672674226784e-06, + "loss": 0.8675, + "step": 28532 + }, + { + "epoch": 6.086390784982935, + "grad_norm": 0.10257141860812859, + "learning_rate": 8.159928025314214e-06, + "loss": 0.8797, + "step": 28533 + }, + { + "epoch": 6.08660409556314, + "grad_norm": 0.08641535229260196, + "learning_rate": 8.156184189273283e-06, + "loss": 0.8823, + "step": 28534 + }, + { + "epoch": 6.086817406143345, + "grad_norm": 0.10280298688851654, + "learning_rate": 8.152441166146335e-06, + "loss": 0.8934, + "step": 28535 + }, + { + "epoch": 6.087030716723549, + "grad_norm": 0.09082602848013725, + "learning_rate": 8.14869895597572e-06, + "loss": 0.9109, + "step": 28536 + }, + { + "epoch": 6.087244027303754, + "grad_norm": 0.08335934130820179, + "learning_rate": 8.144957558803797e-06, + "loss": 0.8978, + "step": 28537 + }, + { + "epoch": 6.087457337883959, + "grad_norm": 0.09084197241779123, + "learning_rate": 8.141216974672877e-06, + "loss": 0.897, + "step": 28538 + }, + { + "epoch": 6.087670648464163, + "grad_norm": 0.08946143572675357, + "learning_rate": 8.1374772036253e-06, + "loss": 0.8792, + "step": 28539 + }, + { + "epoch": 6.087883959044369, + "grad_norm": 0.09003974764304994, + "learning_rate": 8.133738245703377e-06, + "loss": 0.9005, + "step": 28540 + }, + { + "epoch": 6.088097269624574, + "grad_norm": 0.10436160673215432, + "learning_rate": 8.130000100949399e-06, + "loss": 0.8989, + "step": 28541 + }, + { + "epoch": 6.088310580204778, + "grad_norm": 0.08894716876204813, + "learning_rate": 8.126262769405691e-06, + "loss": 0.8872, + "step": 28542 + }, + { + "epoch": 6.088523890784983, + "grad_norm": 0.09891184173758555, + "learning_rate": 8.122526251114533e-06, + "loss": 0.8886, + "step": 28543 + }, + { + "epoch": 6.088737201365188, + "grad_norm": 0.09753257266440182, + "learning_rate": 8.118790546118203e-06, + "loss": 0.9006, + "step": 28544 + }, + { + "epoch": 6.088950511945392, + "grad_norm": 0.10108197974770203, + "learning_rate": 8.115055654458941e-06, + "loss": 0.8802, + "step": 28545 + }, + { + "epoch": 6.089163822525597, + "grad_norm": 0.10322362552873499, + "learning_rate": 8.11132157617906e-06, + "loss": 0.8982, + "step": 28546 + }, + { + "epoch": 6.089377133105802, + "grad_norm": 0.10709907923508043, + "learning_rate": 8.107588311320777e-06, + "loss": 0.8799, + "step": 28547 + }, + { + "epoch": 6.089590443686006, + "grad_norm": 0.10764978644082218, + "learning_rate": 8.103855859926341e-06, + "loss": 0.8742, + "step": 28548 + }, + { + "epoch": 6.089803754266212, + "grad_norm": 0.09367763159714959, + "learning_rate": 8.100124222037994e-06, + "loss": 0.8948, + "step": 28549 + }, + { + "epoch": 6.090017064846417, + "grad_norm": 0.1033808902528766, + "learning_rate": 8.096393397697944e-06, + "loss": 0.8926, + "step": 28550 + }, + { + "epoch": 6.090230375426621, + "grad_norm": 0.11698762407540121, + "learning_rate": 8.092663386948411e-06, + "loss": 0.8721, + "step": 28551 + }, + { + "epoch": 6.090443686006826, + "grad_norm": 0.09345362632152399, + "learning_rate": 8.088934189831606e-06, + "loss": 0.8796, + "step": 28552 + }, + { + "epoch": 6.090656996587031, + "grad_norm": 0.09227333175613829, + "learning_rate": 8.085205806389695e-06, + "loss": 0.8989, + "step": 28553 + }, + { + "epoch": 6.090870307167235, + "grad_norm": 0.10421435371238788, + "learning_rate": 8.081478236664914e-06, + "loss": 0.8888, + "step": 28554 + }, + { + "epoch": 6.09108361774744, + "grad_norm": 0.10065871485141091, + "learning_rate": 8.077751480699416e-06, + "loss": 0.9322, + "step": 28555 + }, + { + "epoch": 6.091296928327645, + "grad_norm": 0.0918753096950729, + "learning_rate": 8.07402553853537e-06, + "loss": 0.9037, + "step": 28556 + }, + { + "epoch": 6.0915102389078495, + "grad_norm": 0.0954617415735847, + "learning_rate": 8.070300410214922e-06, + "loss": 0.8803, + "step": 28557 + }, + { + "epoch": 6.091723549488055, + "grad_norm": 0.10437916256748897, + "learning_rate": 8.06657609578024e-06, + "loss": 0.8892, + "step": 28558 + }, + { + "epoch": 6.09193686006826, + "grad_norm": 0.08630789099313092, + "learning_rate": 8.062852595273472e-06, + "loss": 0.8697, + "step": 28559 + }, + { + "epoch": 6.092150170648464, + "grad_norm": 0.10163818609769715, + "learning_rate": 8.059129908736741e-06, + "loss": 0.8956, + "step": 28560 + }, + { + "epoch": 6.092363481228669, + "grad_norm": 0.087130727330393, + "learning_rate": 8.055408036212138e-06, + "loss": 0.8655, + "step": 28561 + }, + { + "epoch": 6.092576791808874, + "grad_norm": 0.09407293140443157, + "learning_rate": 8.051686977741826e-06, + "loss": 0.8956, + "step": 28562 + }, + { + "epoch": 6.0927901023890785, + "grad_norm": 0.09410478395668544, + "learning_rate": 8.047966733367878e-06, + "loss": 0.8775, + "step": 28563 + }, + { + "epoch": 6.093003412969283, + "grad_norm": 0.08925935886089109, + "learning_rate": 8.044247303132392e-06, + "loss": 0.8516, + "step": 28564 + }, + { + "epoch": 6.093216723549488, + "grad_norm": 0.09281703702999519, + "learning_rate": 8.040528687077453e-06, + "loss": 0.893, + "step": 28565 + }, + { + "epoch": 6.0934300341296925, + "grad_norm": 0.08730212080944079, + "learning_rate": 8.03681088524516e-06, + "loss": 0.8912, + "step": 28566 + }, + { + "epoch": 6.093643344709897, + "grad_norm": 0.09120000330208794, + "learning_rate": 8.033093897677556e-06, + "loss": 0.8595, + "step": 28567 + }, + { + "epoch": 6.093856655290103, + "grad_norm": 0.08741308786834205, + "learning_rate": 8.029377724416699e-06, + "loss": 0.8687, + "step": 28568 + }, + { + "epoch": 6.0940699658703075, + "grad_norm": 0.08846133257423919, + "learning_rate": 8.025662365504652e-06, + "loss": 0.8662, + "step": 28569 + }, + { + "epoch": 6.094283276450512, + "grad_norm": 0.09835693312754867, + "learning_rate": 8.021947820983427e-06, + "loss": 0.8886, + "step": 28570 + }, + { + "epoch": 6.094496587030717, + "grad_norm": 0.0869773094515597, + "learning_rate": 8.018234090895106e-06, + "loss": 0.8869, + "step": 28571 + }, + { + "epoch": 6.0947098976109215, + "grad_norm": 0.08944434443433781, + "learning_rate": 8.014521175281663e-06, + "loss": 0.8826, + "step": 28572 + }, + { + "epoch": 6.094923208191126, + "grad_norm": 0.08818039493837185, + "learning_rate": 8.010809074185108e-06, + "loss": 0.882, + "step": 28573 + }, + { + "epoch": 6.095136518771331, + "grad_norm": 0.09755099251817874, + "learning_rate": 8.007097787647472e-06, + "loss": 0.8777, + "step": 28574 + }, + { + "epoch": 6.095349829351536, + "grad_norm": 0.0968676792762162, + "learning_rate": 8.003387315710731e-06, + "loss": 0.8588, + "step": 28575 + }, + { + "epoch": 6.09556313993174, + "grad_norm": 0.0918123030372762, + "learning_rate": 7.999677658416885e-06, + "loss": 0.8862, + "step": 28576 + }, + { + "epoch": 6.095776450511945, + "grad_norm": 0.10333461174439426, + "learning_rate": 7.995968815807872e-06, + "loss": 0.8852, + "step": 28577 + }, + { + "epoch": 6.0959897610921505, + "grad_norm": 0.10555056771752981, + "learning_rate": 7.9922607879257e-06, + "loss": 0.8657, + "step": 28578 + }, + { + "epoch": 6.096203071672355, + "grad_norm": 0.11092126567446238, + "learning_rate": 7.98855357481231e-06, + "loss": 0.8588, + "step": 28579 + }, + { + "epoch": 6.09641638225256, + "grad_norm": 0.0931444025511287, + "learning_rate": 7.984847176509656e-06, + "loss": 0.8993, + "step": 28580 + }, + { + "epoch": 6.096629692832765, + "grad_norm": 0.10306414931155645, + "learning_rate": 7.981141593059658e-06, + "loss": 0.8704, + "step": 28581 + }, + { + "epoch": 6.096843003412969, + "grad_norm": 0.08961711024663924, + "learning_rate": 7.977436824504257e-06, + "loss": 0.8907, + "step": 28582 + }, + { + "epoch": 6.097056313993174, + "grad_norm": 0.08363294320075298, + "learning_rate": 7.973732870885365e-06, + "loss": 0.9013, + "step": 28583 + }, + { + "epoch": 6.097269624573379, + "grad_norm": 0.09320338563017719, + "learning_rate": 7.970029732244903e-06, + "loss": 0.9036, + "step": 28584 + }, + { + "epoch": 6.097482935153583, + "grad_norm": 0.09559665872897628, + "learning_rate": 7.966327408624742e-06, + "loss": 0.8849, + "step": 28585 + }, + { + "epoch": 6.097696245733788, + "grad_norm": 0.0865902971869194, + "learning_rate": 7.962625900066823e-06, + "loss": 0.9159, + "step": 28586 + }, + { + "epoch": 6.097909556313994, + "grad_norm": 0.10388111079346904, + "learning_rate": 7.958925206613002e-06, + "loss": 0.8829, + "step": 28587 + }, + { + "epoch": 6.098122866894198, + "grad_norm": 0.10120573052454666, + "learning_rate": 7.955225328305149e-06, + "loss": 0.8956, + "step": 28588 + }, + { + "epoch": 6.098336177474403, + "grad_norm": 0.09481136530874357, + "learning_rate": 7.951526265185126e-06, + "loss": 0.8961, + "step": 28589 + }, + { + "epoch": 6.098549488054608, + "grad_norm": 0.09467114169339678, + "learning_rate": 7.947828017294807e-06, + "loss": 0.8736, + "step": 28590 + }, + { + "epoch": 6.098762798634812, + "grad_norm": 0.09613880455496666, + "learning_rate": 7.94413058467602e-06, + "loss": 0.8548, + "step": 28591 + }, + { + "epoch": 6.098976109215017, + "grad_norm": 0.11103942555291975, + "learning_rate": 7.940433967370613e-06, + "loss": 0.8511, + "step": 28592 + }, + { + "epoch": 6.099189419795222, + "grad_norm": 0.09839171523045494, + "learning_rate": 7.936738165420412e-06, + "loss": 0.8636, + "step": 28593 + }, + { + "epoch": 6.099402730375426, + "grad_norm": 0.09213196187486232, + "learning_rate": 7.933043178867223e-06, + "loss": 0.8718, + "step": 28594 + }, + { + "epoch": 6.099616040955631, + "grad_norm": 0.10350110489311082, + "learning_rate": 7.929349007752862e-06, + "loss": 0.844, + "step": 28595 + }, + { + "epoch": 6.099829351535837, + "grad_norm": 0.10467832661976309, + "learning_rate": 7.925655652119136e-06, + "loss": 0.8662, + "step": 28596 + }, + { + "epoch": 6.100042662116041, + "grad_norm": 0.09979538823928012, + "learning_rate": 7.921963112007814e-06, + "loss": 0.8828, + "step": 28597 + }, + { + "epoch": 6.100255972696246, + "grad_norm": 0.08695350848081368, + "learning_rate": 7.918271387460703e-06, + "loss": 0.8913, + "step": 28598 + }, + { + "epoch": 6.100469283276451, + "grad_norm": 0.08890323039684557, + "learning_rate": 7.914580478519558e-06, + "loss": 0.8699, + "step": 28599 + }, + { + "epoch": 6.100682593856655, + "grad_norm": 0.10756934939309103, + "learning_rate": 7.910890385226158e-06, + "loss": 0.8618, + "step": 28600 + }, + { + "epoch": 6.10089590443686, + "grad_norm": 0.0914728624099966, + "learning_rate": 7.90720110762222e-06, + "loss": 0.8663, + "step": 28601 + }, + { + "epoch": 6.101109215017065, + "grad_norm": 0.10422001329120631, + "learning_rate": 7.903512645749534e-06, + "loss": 0.8858, + "step": 28602 + }, + { + "epoch": 6.101322525597269, + "grad_norm": 0.09526199758865987, + "learning_rate": 7.899824999649816e-06, + "loss": 0.8655, + "step": 28603 + }, + { + "epoch": 6.101535836177474, + "grad_norm": 0.09540977210497933, + "learning_rate": 7.896138169364795e-06, + "loss": 0.8759, + "step": 28604 + }, + { + "epoch": 6.101749146757679, + "grad_norm": 0.09816308800619801, + "learning_rate": 7.892452154936178e-06, + "loss": 0.8759, + "step": 28605 + }, + { + "epoch": 6.101962457337884, + "grad_norm": 0.10530585871899693, + "learning_rate": 7.888766956405685e-06, + "loss": 0.9209, + "step": 28606 + }, + { + "epoch": 6.102175767918089, + "grad_norm": 0.10455056974653243, + "learning_rate": 7.885082573814995e-06, + "loss": 0.9049, + "step": 28607 + }, + { + "epoch": 6.102389078498294, + "grad_norm": 0.0902997948557341, + "learning_rate": 7.881399007205819e-06, + "loss": 0.8598, + "step": 28608 + }, + { + "epoch": 6.102602389078498, + "grad_norm": 0.09433496919723129, + "learning_rate": 7.877716256619803e-06, + "loss": 0.8535, + "step": 28609 + }, + { + "epoch": 6.102815699658703, + "grad_norm": 0.10571101676307024, + "learning_rate": 7.874034322098665e-06, + "loss": 0.8708, + "step": 28610 + }, + { + "epoch": 6.103029010238908, + "grad_norm": 0.0912528947152628, + "learning_rate": 7.870353203684034e-06, + "loss": 0.8621, + "step": 28611 + }, + { + "epoch": 6.103242320819112, + "grad_norm": 0.11000017485207644, + "learning_rate": 7.866672901417574e-06, + "loss": 0.8778, + "step": 28612 + }, + { + "epoch": 6.103455631399317, + "grad_norm": 0.10689254804700772, + "learning_rate": 7.862993415340905e-06, + "loss": 0.8849, + "step": 28613 + }, + { + "epoch": 6.103668941979522, + "grad_norm": 0.09907658980896288, + "learning_rate": 7.859314745495692e-06, + "loss": 0.8938, + "step": 28614 + }, + { + "epoch": 6.103882252559727, + "grad_norm": 0.09161445951341027, + "learning_rate": 7.855636891923555e-06, + "loss": 0.8693, + "step": 28615 + }, + { + "epoch": 6.104095563139932, + "grad_norm": 0.11154068468850466, + "learning_rate": 7.851959854666105e-06, + "loss": 0.8982, + "step": 28616 + }, + { + "epoch": 6.104308873720137, + "grad_norm": 0.08793927008484127, + "learning_rate": 7.848283633764917e-06, + "loss": 0.8885, + "step": 28617 + }, + { + "epoch": 6.104522184300341, + "grad_norm": 0.08848856974683011, + "learning_rate": 7.84460822926163e-06, + "loss": 0.8775, + "step": 28618 + }, + { + "epoch": 6.104735494880546, + "grad_norm": 0.09279832877780066, + "learning_rate": 7.840933641197819e-06, + "loss": 0.8626, + "step": 28619 + }, + { + "epoch": 6.104948805460751, + "grad_norm": 0.08940957217103135, + "learning_rate": 7.83725986961505e-06, + "loss": 0.8584, + "step": 28620 + }, + { + "epoch": 6.1051621160409555, + "grad_norm": 0.08481338622499135, + "learning_rate": 7.833586914554882e-06, + "loss": 0.875, + "step": 28621 + }, + { + "epoch": 6.10537542662116, + "grad_norm": 0.09173307147715307, + "learning_rate": 7.829914776058909e-06, + "loss": 0.8913, + "step": 28622 + }, + { + "epoch": 6.105588737201365, + "grad_norm": 0.09574152114728766, + "learning_rate": 7.826243454168669e-06, + "loss": 0.8986, + "step": 28623 + }, + { + "epoch": 6.1058020477815695, + "grad_norm": 0.09126136174063676, + "learning_rate": 7.822572948925695e-06, + "loss": 0.8702, + "step": 28624 + }, + { + "epoch": 6.106015358361775, + "grad_norm": 0.09636340950860471, + "learning_rate": 7.818903260371508e-06, + "loss": 0.8869, + "step": 28625 + }, + { + "epoch": 6.10622866894198, + "grad_norm": 0.09454352843004404, + "learning_rate": 7.81523438854766e-06, + "loss": 0.8709, + "step": 28626 + }, + { + "epoch": 6.1064419795221845, + "grad_norm": 0.1028368969895542, + "learning_rate": 7.811566333495669e-06, + "loss": 0.8943, + "step": 28627 + }, + { + "epoch": 6.106655290102389, + "grad_norm": 0.08580853897790719, + "learning_rate": 7.807899095256997e-06, + "loss": 0.8845, + "step": 28628 + }, + { + "epoch": 6.106868600682594, + "grad_norm": 0.08937977279875442, + "learning_rate": 7.80423267387315e-06, + "loss": 0.8794, + "step": 28629 + }, + { + "epoch": 6.1070819112627985, + "grad_norm": 0.08963774253052328, + "learning_rate": 7.80056706938563e-06, + "loss": 0.8725, + "step": 28630 + }, + { + "epoch": 6.107295221843003, + "grad_norm": 0.08795955726629896, + "learning_rate": 7.796902281835916e-06, + "loss": 0.883, + "step": 28631 + }, + { + "epoch": 6.107508532423208, + "grad_norm": 0.08824023087908063, + "learning_rate": 7.79323831126547e-06, + "loss": 0.9014, + "step": 28632 + }, + { + "epoch": 6.107721843003413, + "grad_norm": 0.0890721751899562, + "learning_rate": 7.789575157715731e-06, + "loss": 0.9047, + "step": 28633 + }, + { + "epoch": 6.107935153583618, + "grad_norm": 0.08931024923344245, + "learning_rate": 7.785912821228172e-06, + "loss": 0.8506, + "step": 28634 + }, + { + "epoch": 6.108148464163823, + "grad_norm": 0.09289061526347749, + "learning_rate": 7.782251301844232e-06, + "loss": 0.8978, + "step": 28635 + }, + { + "epoch": 6.1083617747440275, + "grad_norm": 0.08564775158671631, + "learning_rate": 7.77859059960533e-06, + "loss": 0.9051, + "step": 28636 + }, + { + "epoch": 6.108575085324232, + "grad_norm": 0.09406287087365466, + "learning_rate": 7.774930714552877e-06, + "loss": 0.8688, + "step": 28637 + }, + { + "epoch": 6.108788395904437, + "grad_norm": 0.08724277558787077, + "learning_rate": 7.77127164672832e-06, + "loss": 0.8881, + "step": 28638 + }, + { + "epoch": 6.109001706484642, + "grad_norm": 0.09195919275703708, + "learning_rate": 7.767613396173028e-06, + "loss": 0.8742, + "step": 28639 + }, + { + "epoch": 6.109215017064846, + "grad_norm": 0.0957365219251857, + "learning_rate": 7.763955962928409e-06, + "loss": 0.878, + "step": 28640 + }, + { + "epoch": 6.109428327645051, + "grad_norm": 0.08677940820470832, + "learning_rate": 7.760299347035834e-06, + "loss": 0.8748, + "step": 28641 + }, + { + "epoch": 6.109641638225256, + "grad_norm": 0.09866012659297892, + "learning_rate": 7.75664354853669e-06, + "loss": 0.8686, + "step": 28642 + }, + { + "epoch": 6.10985494880546, + "grad_norm": 0.08927200572771048, + "learning_rate": 7.752988567472344e-06, + "loss": 0.858, + "step": 28643 + }, + { + "epoch": 6.110068259385666, + "grad_norm": 0.08898290186218, + "learning_rate": 7.749334403884146e-06, + "loss": 0.8624, + "step": 28644 + }, + { + "epoch": 6.110281569965871, + "grad_norm": 0.09282146781764027, + "learning_rate": 7.745681057813432e-06, + "loss": 0.8714, + "step": 28645 + }, + { + "epoch": 6.110494880546075, + "grad_norm": 0.09496974635988252, + "learning_rate": 7.742028529301566e-06, + "loss": 0.8661, + "step": 28646 + }, + { + "epoch": 6.11070819112628, + "grad_norm": 0.09420832351049441, + "learning_rate": 7.738376818389865e-06, + "loss": 0.8978, + "step": 28647 + }, + { + "epoch": 6.110921501706485, + "grad_norm": 0.09045915443022173, + "learning_rate": 7.73472592511964e-06, + "loss": 0.8862, + "step": 28648 + }, + { + "epoch": 6.111134812286689, + "grad_norm": 0.09369939516258616, + "learning_rate": 7.731075849532214e-06, + "loss": 0.8702, + "step": 28649 + }, + { + "epoch": 6.111348122866894, + "grad_norm": 0.09854942115296017, + "learning_rate": 7.727426591668874e-06, + "loss": 0.8829, + "step": 28650 + }, + { + "epoch": 6.111561433447099, + "grad_norm": 0.08976848992930664, + "learning_rate": 7.723778151570927e-06, + "loss": 0.8895, + "step": 28651 + }, + { + "epoch": 6.111774744027303, + "grad_norm": 0.0856238769376531, + "learning_rate": 7.720130529279636e-06, + "loss": 0.8849, + "step": 28652 + }, + { + "epoch": 6.111988054607509, + "grad_norm": 0.09511628278150698, + "learning_rate": 7.71648372483627e-06, + "loss": 0.9031, + "step": 28653 + }, + { + "epoch": 6.112201365187714, + "grad_norm": 0.08537286508373637, + "learning_rate": 7.712837738282126e-06, + "loss": 0.879, + "step": 28654 + }, + { + "epoch": 6.112414675767918, + "grad_norm": 0.09373758877125546, + "learning_rate": 7.709192569658435e-06, + "loss": 0.8947, + "step": 28655 + }, + { + "epoch": 6.112627986348123, + "grad_norm": 0.08857737852056626, + "learning_rate": 7.705548219006451e-06, + "loss": 0.8738, + "step": 28656 + }, + { + "epoch": 6.112841296928328, + "grad_norm": 0.08375233820476047, + "learning_rate": 7.701904686367388e-06, + "loss": 0.8872, + "step": 28657 + }, + { + "epoch": 6.113054607508532, + "grad_norm": 0.10489077389789601, + "learning_rate": 7.6982619717825e-06, + "loss": 0.8811, + "step": 28658 + }, + { + "epoch": 6.113267918088737, + "grad_norm": 0.08495589830538618, + "learning_rate": 7.694620075292998e-06, + "loss": 0.8787, + "step": 28659 + }, + { + "epoch": 6.113481228668942, + "grad_norm": 0.10461295331047661, + "learning_rate": 7.690978996940086e-06, + "loss": 0.8608, + "step": 28660 + }, + { + "epoch": 6.113694539249146, + "grad_norm": 0.09717652663484859, + "learning_rate": 7.687338736764957e-06, + "loss": 0.8663, + "step": 28661 + }, + { + "epoch": 6.113907849829351, + "grad_norm": 0.09065972949364923, + "learning_rate": 7.683699294808816e-06, + "loss": 0.8844, + "step": 28662 + }, + { + "epoch": 6.114121160409557, + "grad_norm": 0.09018961286407318, + "learning_rate": 7.680060671112825e-06, + "loss": 0.8714, + "step": 28663 + }, + { + "epoch": 6.114334470989761, + "grad_norm": 0.0980214827901829, + "learning_rate": 7.676422865718174e-06, + "loss": 0.8385, + "step": 28664 + }, + { + "epoch": 6.114547781569966, + "grad_norm": 0.09699900982841914, + "learning_rate": 7.672785878665992e-06, + "loss": 0.8906, + "step": 28665 + }, + { + "epoch": 6.114761092150171, + "grad_norm": 0.08329261340664382, + "learning_rate": 7.669149709997472e-06, + "loss": 0.9088, + "step": 28666 + }, + { + "epoch": 6.114974402730375, + "grad_norm": 0.08930641515204338, + "learning_rate": 7.665514359753739e-06, + "loss": 0.8859, + "step": 28667 + }, + { + "epoch": 6.11518771331058, + "grad_norm": 0.09212819047771391, + "learning_rate": 7.661879827975922e-06, + "loss": 0.9137, + "step": 28668 + }, + { + "epoch": 6.115401023890785, + "grad_norm": 0.08882384456173549, + "learning_rate": 7.658246114705146e-06, + "loss": 0.8876, + "step": 28669 + }, + { + "epoch": 6.1156143344709895, + "grad_norm": 0.1026216719866823, + "learning_rate": 7.65461321998254e-06, + "loss": 0.8506, + "step": 28670 + }, + { + "epoch": 6.115827645051194, + "grad_norm": 0.0869309436482694, + "learning_rate": 7.650981143849212e-06, + "loss": 0.8919, + "step": 28671 + }, + { + "epoch": 6.1160409556314, + "grad_norm": 0.1003281388221, + "learning_rate": 7.647349886346238e-06, + "loss": 0.8821, + "step": 28672 + }, + { + "epoch": 6.116254266211604, + "grad_norm": 0.09359453839678905, + "learning_rate": 7.643719447514724e-06, + "loss": 0.9032, + "step": 28673 + }, + { + "epoch": 6.116467576791809, + "grad_norm": 0.10166229615410628, + "learning_rate": 7.640089827395738e-06, + "loss": 0.8852, + "step": 28674 + }, + { + "epoch": 6.116680887372014, + "grad_norm": 0.09626533555295631, + "learning_rate": 7.63646102603036e-06, + "loss": 0.9002, + "step": 28675 + }, + { + "epoch": 6.1168941979522184, + "grad_norm": 0.09700094675583806, + "learning_rate": 7.63283304345963e-06, + "loss": 0.8653, + "step": 28676 + }, + { + "epoch": 6.117107508532423, + "grad_norm": 0.08752539447088563, + "learning_rate": 7.629205879724604e-06, + "loss": 0.8904, + "step": 28677 + }, + { + "epoch": 6.117320819112628, + "grad_norm": 0.10701778943711239, + "learning_rate": 7.625579534866348e-06, + "loss": 0.863, + "step": 28678 + }, + { + "epoch": 6.1175341296928325, + "grad_norm": 0.09797144617073195, + "learning_rate": 7.621954008925869e-06, + "loss": 0.86, + "step": 28679 + }, + { + "epoch": 6.117747440273037, + "grad_norm": 0.08458714171650622, + "learning_rate": 7.618329301944203e-06, + "loss": 0.8639, + "step": 28680 + }, + { + "epoch": 6.117960750853243, + "grad_norm": 0.08811625311137324, + "learning_rate": 7.6147054139623465e-06, + "loss": 0.8675, + "step": 28681 + }, + { + "epoch": 6.118174061433447, + "grad_norm": 0.09822978848592291, + "learning_rate": 7.611082345021325e-06, + "loss": 0.8794, + "step": 28682 + }, + { + "epoch": 6.118387372013652, + "grad_norm": 0.09385976315570689, + "learning_rate": 7.607460095162147e-06, + "loss": 0.8798, + "step": 28683 + }, + { + "epoch": 6.118600682593857, + "grad_norm": 0.08886932279421002, + "learning_rate": 7.603838664425738e-06, + "loss": 0.9041, + "step": 28684 + }, + { + "epoch": 6.1188139931740615, + "grad_norm": 0.0911886927823477, + "learning_rate": 7.600218052853132e-06, + "loss": 0.8826, + "step": 28685 + }, + { + "epoch": 6.119027303754266, + "grad_norm": 0.09970933837443008, + "learning_rate": 7.596598260485275e-06, + "loss": 0.8986, + "step": 28686 + }, + { + "epoch": 6.119240614334471, + "grad_norm": 0.0995383971960659, + "learning_rate": 7.59297928736312e-06, + "loss": 0.8987, + "step": 28687 + }, + { + "epoch": 6.1194539249146755, + "grad_norm": 0.08644426957840763, + "learning_rate": 7.5893611335276304e-06, + "loss": 0.8683, + "step": 28688 + }, + { + "epoch": 6.11966723549488, + "grad_norm": 0.08704783217707422, + "learning_rate": 7.585743799019716e-06, + "loss": 0.8803, + "step": 28689 + }, + { + "epoch": 6.119880546075085, + "grad_norm": 0.08593242112032586, + "learning_rate": 7.582127283880347e-06, + "loss": 0.8428, + "step": 28690 + }, + { + "epoch": 6.1200938566552905, + "grad_norm": 0.09920920961959016, + "learning_rate": 7.578511588150434e-06, + "loss": 0.8701, + "step": 28691 + }, + { + "epoch": 6.120307167235495, + "grad_norm": 0.09665364372123345, + "learning_rate": 7.574896711870869e-06, + "loss": 0.8577, + "step": 28692 + }, + { + "epoch": 6.1205204778157, + "grad_norm": 0.10000795361396875, + "learning_rate": 7.571282655082553e-06, + "loss": 0.8602, + "step": 28693 + }, + { + "epoch": 6.1207337883959045, + "grad_norm": 0.10028392906005575, + "learning_rate": 7.567669417826429e-06, + "loss": 0.8688, + "step": 28694 + }, + { + "epoch": 6.120947098976109, + "grad_norm": 0.09289208597461905, + "learning_rate": 7.564057000143319e-06, + "loss": 0.8874, + "step": 28695 + }, + { + "epoch": 6.121160409556314, + "grad_norm": 0.09657484873492617, + "learning_rate": 7.560445402074114e-06, + "loss": 0.8931, + "step": 28696 + }, + { + "epoch": 6.121373720136519, + "grad_norm": 0.1026686246631937, + "learning_rate": 7.556834623659699e-06, + "loss": 0.8562, + "step": 28697 + }, + { + "epoch": 6.121587030716723, + "grad_norm": 0.09398901479530103, + "learning_rate": 7.553224664940928e-06, + "loss": 0.8612, + "step": 28698 + }, + { + "epoch": 6.121800341296928, + "grad_norm": 0.0936412624315702, + "learning_rate": 7.549615525958631e-06, + "loss": 0.8843, + "step": 28699 + }, + { + "epoch": 6.1220136518771335, + "grad_norm": 0.0888530066379063, + "learning_rate": 7.546007206753647e-06, + "loss": 0.8562, + "step": 28700 + }, + { + "epoch": 6.122226962457338, + "grad_norm": 0.09455744111439846, + "learning_rate": 7.542399707366806e-06, + "loss": 0.8986, + "step": 28701 + }, + { + "epoch": 6.122440273037543, + "grad_norm": 0.0922374289782902, + "learning_rate": 7.5387930278389446e-06, + "loss": 0.8666, + "step": 28702 + }, + { + "epoch": 6.122653583617748, + "grad_norm": 0.0847190026539935, + "learning_rate": 7.535187168210849e-06, + "loss": 0.8758, + "step": 28703 + }, + { + "epoch": 6.122866894197952, + "grad_norm": 0.08681924705552534, + "learning_rate": 7.5315821285233406e-06, + "loss": 0.8575, + "step": 28704 + }, + { + "epoch": 6.123080204778157, + "grad_norm": 0.09591962176962852, + "learning_rate": 7.527977908817194e-06, + "loss": 0.8895, + "step": 28705 + }, + { + "epoch": 6.123293515358362, + "grad_norm": 0.08738223978054756, + "learning_rate": 7.524374509133197e-06, + "loss": 0.8548, + "step": 28706 + }, + { + "epoch": 6.123506825938566, + "grad_norm": 0.09436683293625006, + "learning_rate": 7.520771929512123e-06, + "loss": 0.9002, + "step": 28707 + }, + { + "epoch": 6.123720136518771, + "grad_norm": 0.088585766196111, + "learning_rate": 7.517170169994714e-06, + "loss": 0.8681, + "step": 28708 + }, + { + "epoch": 6.123933447098976, + "grad_norm": 0.08802741118874718, + "learning_rate": 7.5135692306217645e-06, + "loss": 0.8863, + "step": 28709 + }, + { + "epoch": 6.124146757679181, + "grad_norm": 0.09621911981820237, + "learning_rate": 7.509969111433996e-06, + "loss": 0.9028, + "step": 28710 + }, + { + "epoch": 6.124360068259386, + "grad_norm": 0.08835010995887042, + "learning_rate": 7.506369812472143e-06, + "loss": 0.8914, + "step": 28711 + }, + { + "epoch": 6.124573378839591, + "grad_norm": 0.08655773516297913, + "learning_rate": 7.502771333776944e-06, + "loss": 0.8898, + "step": 28712 + }, + { + "epoch": 6.124786689419795, + "grad_norm": 0.10623805042185627, + "learning_rate": 7.499173675389087e-06, + "loss": 0.8979, + "step": 28713 + }, + { + "epoch": 6.125, + "grad_norm": 0.0943001538217978, + "learning_rate": 7.495576837349312e-06, + "loss": 0.8718, + "step": 28714 + }, + { + "epoch": 6.125213310580205, + "grad_norm": 0.08444434130287348, + "learning_rate": 7.491980819698308e-06, + "loss": 0.8837, + "step": 28715 + }, + { + "epoch": 6.125426621160409, + "grad_norm": 0.09674033759857298, + "learning_rate": 7.488385622476762e-06, + "loss": 0.8659, + "step": 28716 + }, + { + "epoch": 6.125639931740614, + "grad_norm": 0.10530803851743775, + "learning_rate": 7.4847912457253425e-06, + "loss": 0.911, + "step": 28717 + }, + { + "epoch": 6.125853242320819, + "grad_norm": 0.08548548834910609, + "learning_rate": 7.481197689484738e-06, + "loss": 0.8791, + "step": 28718 + }, + { + "epoch": 6.126066552901024, + "grad_norm": 0.09922245978195829, + "learning_rate": 7.477604953795601e-06, + "loss": 0.8812, + "step": 28719 + }, + { + "epoch": 6.126279863481229, + "grad_norm": 0.0949784968818824, + "learning_rate": 7.474013038698564e-06, + "loss": 0.8911, + "step": 28720 + }, + { + "epoch": 6.126493174061434, + "grad_norm": 0.08890388335579401, + "learning_rate": 7.470421944234308e-06, + "loss": 0.8747, + "step": 28721 + }, + { + "epoch": 6.126706484641638, + "grad_norm": 0.09823043249432005, + "learning_rate": 7.466831670443455e-06, + "loss": 0.8773, + "step": 28722 + }, + { + "epoch": 6.126919795221843, + "grad_norm": 0.09493751734940056, + "learning_rate": 7.4632422173666155e-06, + "loss": 0.8669, + "step": 28723 + }, + { + "epoch": 6.127133105802048, + "grad_norm": 0.08510484614886436, + "learning_rate": 7.4596535850444044e-06, + "loss": 0.8856, + "step": 28724 + }, + { + "epoch": 6.127346416382252, + "grad_norm": 0.08577694241262175, + "learning_rate": 7.45606577351742e-06, + "loss": 0.8882, + "step": 28725 + }, + { + "epoch": 6.127559726962457, + "grad_norm": 0.09100840248299556, + "learning_rate": 7.452478782826289e-06, + "loss": 0.8969, + "step": 28726 + }, + { + "epoch": 6.127773037542662, + "grad_norm": 0.09617598734605194, + "learning_rate": 7.4488926130115825e-06, + "loss": 0.8756, + "step": 28727 + }, + { + "epoch": 6.1279863481228665, + "grad_norm": 0.0961302873982918, + "learning_rate": 7.445307264113882e-06, + "loss": 0.8916, + "step": 28728 + }, + { + "epoch": 6.128199658703072, + "grad_norm": 0.09675530193086207, + "learning_rate": 7.44172273617374e-06, + "loss": 0.8891, + "step": 28729 + }, + { + "epoch": 6.128412969283277, + "grad_norm": 0.09680753441329601, + "learning_rate": 7.4381390292317304e-06, + "loss": 0.8541, + "step": 28730 + }, + { + "epoch": 6.128626279863481, + "grad_norm": 0.08686104415122149, + "learning_rate": 7.434556143328398e-06, + "loss": 0.868, + "step": 28731 + }, + { + "epoch": 6.128839590443686, + "grad_norm": 0.09023572527747371, + "learning_rate": 7.430974078504269e-06, + "loss": 0.8937, + "step": 28732 + }, + { + "epoch": 6.129052901023891, + "grad_norm": 0.09541151301824445, + "learning_rate": 7.4273928347999e-06, + "loss": 0.8765, + "step": 28733 + }, + { + "epoch": 6.1292662116040955, + "grad_norm": 0.09348258269706243, + "learning_rate": 7.423812412255809e-06, + "loss": 0.8686, + "step": 28734 + }, + { + "epoch": 6.1294795221843, + "grad_norm": 0.0898405364847295, + "learning_rate": 7.420232810912496e-06, + "loss": 0.8919, + "step": 28735 + }, + { + "epoch": 6.129692832764505, + "grad_norm": 0.08664090834888107, + "learning_rate": 7.416654030810471e-06, + "loss": 0.8829, + "step": 28736 + }, + { + "epoch": 6.1299061433447095, + "grad_norm": 0.08956133804635422, + "learning_rate": 7.413076071990208e-06, + "loss": 0.8873, + "step": 28737 + }, + { + "epoch": 6.130119453924915, + "grad_norm": 0.09194483265026411, + "learning_rate": 7.4094989344922275e-06, + "loss": 0.9082, + "step": 28738 + }, + { + "epoch": 6.13033276450512, + "grad_norm": 0.09107016857385829, + "learning_rate": 7.405922618357002e-06, + "loss": 0.8856, + "step": 28739 + }, + { + "epoch": 6.1305460750853245, + "grad_norm": 0.0863815762906628, + "learning_rate": 7.402347123624953e-06, + "loss": 0.8825, + "step": 28740 + }, + { + "epoch": 6.130759385665529, + "grad_norm": 0.0926940526414329, + "learning_rate": 7.398772450336582e-06, + "loss": 0.8504, + "step": 28741 + }, + { + "epoch": 6.130972696245734, + "grad_norm": 0.08685269707269735, + "learning_rate": 7.395198598532327e-06, + "loss": 0.8871, + "step": 28742 + }, + { + "epoch": 6.1311860068259385, + "grad_norm": 0.08950783551965767, + "learning_rate": 7.391625568252618e-06, + "loss": 0.8883, + "step": 28743 + }, + { + "epoch": 6.131399317406143, + "grad_norm": 0.08630539251759033, + "learning_rate": 7.388053359537868e-06, + "loss": 0.8935, + "step": 28744 + }, + { + "epoch": 6.131612627986348, + "grad_norm": 0.09357639517478049, + "learning_rate": 7.384481972428532e-06, + "loss": 0.8756, + "step": 28745 + }, + { + "epoch": 6.131825938566553, + "grad_norm": 0.08658284454942555, + "learning_rate": 7.380911406965006e-06, + "loss": 0.8829, + "step": 28746 + }, + { + "epoch": 6.132039249146757, + "grad_norm": 0.08911336887749935, + "learning_rate": 7.377341663187691e-06, + "loss": 0.8537, + "step": 28747 + }, + { + "epoch": 6.132252559726963, + "grad_norm": 0.0884492802829001, + "learning_rate": 7.373772741136975e-06, + "loss": 0.8802, + "step": 28748 + }, + { + "epoch": 6.1324658703071675, + "grad_norm": 0.09437495754523406, + "learning_rate": 7.370204640853225e-06, + "loss": 0.8663, + "step": 28749 + }, + { + "epoch": 6.132679180887372, + "grad_norm": 0.0875911230352892, + "learning_rate": 7.366637362376869e-06, + "loss": 0.8507, + "step": 28750 + }, + { + "epoch": 6.132892491467577, + "grad_norm": 0.08294639186140194, + "learning_rate": 7.363070905748207e-06, + "loss": 0.8866, + "step": 28751 + }, + { + "epoch": 6.1331058020477816, + "grad_norm": 0.09962583665673615, + "learning_rate": 7.359505271007612e-06, + "loss": 0.8948, + "step": 28752 + }, + { + "epoch": 6.133319112627986, + "grad_norm": 0.0938607197630674, + "learning_rate": 7.355940458195445e-06, + "loss": 0.8902, + "step": 28753 + }, + { + "epoch": 6.133532423208191, + "grad_norm": 0.09410288537265979, + "learning_rate": 7.352376467352038e-06, + "loss": 0.8847, + "step": 28754 + }, + { + "epoch": 6.133745733788396, + "grad_norm": 0.0872254471849064, + "learning_rate": 7.348813298517714e-06, + "loss": 0.906, + "step": 28755 + }, + { + "epoch": 6.1339590443686, + "grad_norm": 0.08517198873018003, + "learning_rate": 7.345250951732769e-06, + "loss": 0.887, + "step": 28756 + }, + { + "epoch": 6.134172354948806, + "grad_norm": 0.10037373956947684, + "learning_rate": 7.341689427037555e-06, + "loss": 0.8862, + "step": 28757 + }, + { + "epoch": 6.1343856655290105, + "grad_norm": 0.08322896845718936, + "learning_rate": 7.33812872447234e-06, + "loss": 0.8679, + "step": 28758 + }, + { + "epoch": 6.134598976109215, + "grad_norm": 0.08681780418772292, + "learning_rate": 7.33456884407743e-06, + "loss": 0.8697, + "step": 28759 + }, + { + "epoch": 6.13481228668942, + "grad_norm": 0.08683463692072166, + "learning_rate": 7.331009785893086e-06, + "loss": 0.8727, + "step": 28760 + }, + { + "epoch": 6.135025597269625, + "grad_norm": 0.08824632908511658, + "learning_rate": 7.327451549959588e-06, + "loss": 0.8501, + "step": 28761 + }, + { + "epoch": 6.135238907849829, + "grad_norm": 0.09076743485725894, + "learning_rate": 7.323894136317205e-06, + "loss": 0.8791, + "step": 28762 + }, + { + "epoch": 6.135452218430034, + "grad_norm": 0.08515225412338841, + "learning_rate": 7.32033754500618e-06, + "loss": 0.8613, + "step": 28763 + }, + { + "epoch": 6.135665529010239, + "grad_norm": 0.09341515143380598, + "learning_rate": 7.316781776066739e-06, + "loss": 0.8993, + "step": 28764 + }, + { + "epoch": 6.135878839590443, + "grad_norm": 0.09530535857363505, + "learning_rate": 7.313226829539161e-06, + "loss": 0.8737, + "step": 28765 + }, + { + "epoch": 6.136092150170649, + "grad_norm": 0.09715236724812208, + "learning_rate": 7.309672705463629e-06, + "loss": 0.8529, + "step": 28766 + }, + { + "epoch": 6.136305460750854, + "grad_norm": 0.09900297903617442, + "learning_rate": 7.306119403880383e-06, + "loss": 0.9002, + "step": 28767 + }, + { + "epoch": 6.136518771331058, + "grad_norm": 0.09034413836824134, + "learning_rate": 7.302566924829606e-06, + "loss": 0.8852, + "step": 28768 + }, + { + "epoch": 6.136732081911263, + "grad_norm": 0.09954850478165062, + "learning_rate": 7.299015268351515e-06, + "loss": 0.9171, + "step": 28769 + }, + { + "epoch": 6.136945392491468, + "grad_norm": 0.10007326051480919, + "learning_rate": 7.29546443448629e-06, + "loss": 0.8903, + "step": 28770 + }, + { + "epoch": 6.137158703071672, + "grad_norm": 0.08780379583799562, + "learning_rate": 7.291914423274105e-06, + "loss": 0.846, + "step": 28771 + }, + { + "epoch": 6.137372013651877, + "grad_norm": 0.08259227530930811, + "learning_rate": 7.2883652347551395e-06, + "loss": 0.8654, + "step": 28772 + }, + { + "epoch": 6.137585324232082, + "grad_norm": 0.0939763806793215, + "learning_rate": 7.28481686896954e-06, + "loss": 0.8797, + "step": 28773 + }, + { + "epoch": 6.137798634812286, + "grad_norm": 0.10105254351972984, + "learning_rate": 7.281269325957461e-06, + "loss": 0.8943, + "step": 28774 + }, + { + "epoch": 6.138011945392491, + "grad_norm": 0.08870511135016847, + "learning_rate": 7.277722605759039e-06, + "loss": 0.8682, + "step": 28775 + }, + { + "epoch": 6.138225255972697, + "grad_norm": 0.09693145748373765, + "learning_rate": 7.274176708414402e-06, + "loss": 0.8479, + "step": 28776 + }, + { + "epoch": 6.138438566552901, + "grad_norm": 0.09564033894152943, + "learning_rate": 7.270631633963687e-06, + "loss": 0.8668, + "step": 28777 + }, + { + "epoch": 6.138651877133106, + "grad_norm": 0.09719215969343974, + "learning_rate": 7.267087382447004e-06, + "loss": 0.8518, + "step": 28778 + }, + { + "epoch": 6.138865187713311, + "grad_norm": 0.08550209471191579, + "learning_rate": 7.263543953904446e-06, + "loss": 0.8893, + "step": 28779 + }, + { + "epoch": 6.139078498293515, + "grad_norm": 0.10762785211776806, + "learning_rate": 7.2600013483760955e-06, + "loss": 0.9165, + "step": 28780 + }, + { + "epoch": 6.13929180887372, + "grad_norm": 0.09812873546613779, + "learning_rate": 7.256459565902072e-06, + "loss": 0.8807, + "step": 28781 + }, + { + "epoch": 6.139505119453925, + "grad_norm": 0.09549419043544664, + "learning_rate": 7.2529186065224325e-06, + "loss": 0.8811, + "step": 28782 + }, + { + "epoch": 6.139718430034129, + "grad_norm": 0.08510215294781705, + "learning_rate": 7.249378470277242e-06, + "loss": 0.8707, + "step": 28783 + }, + { + "epoch": 6.139931740614334, + "grad_norm": 0.09013062105337478, + "learning_rate": 7.245839157206558e-06, + "loss": 0.8795, + "step": 28784 + }, + { + "epoch": 6.140145051194539, + "grad_norm": 0.09338809437136397, + "learning_rate": 7.242300667350428e-06, + "loss": 0.9033, + "step": 28785 + }, + { + "epoch": 6.140358361774744, + "grad_norm": 0.09637896671018023, + "learning_rate": 7.238763000748883e-06, + "loss": 0.8559, + "step": 28786 + }, + { + "epoch": 6.140571672354949, + "grad_norm": 0.0871953932464285, + "learning_rate": 7.235226157441961e-06, + "loss": 0.9176, + "step": 28787 + }, + { + "epoch": 6.140784982935154, + "grad_norm": 0.08858797018474486, + "learning_rate": 7.231690137469667e-06, + "loss": 0.8992, + "step": 28788 + }, + { + "epoch": 6.140998293515358, + "grad_norm": 0.0818035970182693, + "learning_rate": 7.228154940872038e-06, + "loss": 0.8784, + "step": 28789 + }, + { + "epoch": 6.141211604095563, + "grad_norm": 0.0893087399078235, + "learning_rate": 7.224620567689062e-06, + "loss": 0.8846, + "step": 28790 + }, + { + "epoch": 6.141424914675768, + "grad_norm": 0.09423899386575225, + "learning_rate": 7.221087017960724e-06, + "loss": 0.8811, + "step": 28791 + }, + { + "epoch": 6.1416382252559725, + "grad_norm": 0.08447681565415402, + "learning_rate": 7.217554291727e-06, + "loss": 0.8787, + "step": 28792 + }, + { + "epoch": 6.141851535836177, + "grad_norm": 0.0828343099949115, + "learning_rate": 7.214022389027886e-06, + "loss": 0.8873, + "step": 28793 + }, + { + "epoch": 6.142064846416382, + "grad_norm": 0.08633172323459896, + "learning_rate": 7.210491309903331e-06, + "loss": 0.8695, + "step": 28794 + }, + { + "epoch": 6.142278156996587, + "grad_norm": 0.08801616088568054, + "learning_rate": 7.206961054393313e-06, + "loss": 0.8903, + "step": 28795 + }, + { + "epoch": 6.142491467576792, + "grad_norm": 0.08773433922823265, + "learning_rate": 7.2034316225377195e-06, + "loss": 0.905, + "step": 28796 + }, + { + "epoch": 6.142704778156997, + "grad_norm": 0.08864595509944909, + "learning_rate": 7.199903014376537e-06, + "loss": 0.8701, + "step": 28797 + }, + { + "epoch": 6.1429180887372015, + "grad_norm": 0.08616638525710239, + "learning_rate": 7.196375229949679e-06, + "loss": 0.8772, + "step": 28798 + }, + { + "epoch": 6.143131399317406, + "grad_norm": 0.1019066587939404, + "learning_rate": 7.192848269297052e-06, + "loss": 0.8737, + "step": 28799 + }, + { + "epoch": 6.143344709897611, + "grad_norm": 0.08400793362490656, + "learning_rate": 7.189322132458563e-06, + "loss": 0.8849, + "step": 28800 + }, + { + "epoch": 6.1435580204778155, + "grad_norm": 0.09028058095738085, + "learning_rate": 7.185796819474134e-06, + "loss": 0.8905, + "step": 28801 + }, + { + "epoch": 6.14377133105802, + "grad_norm": 0.08606238788695779, + "learning_rate": 7.182272330383635e-06, + "loss": 0.8276, + "step": 28802 + }, + { + "epoch": 6.143984641638225, + "grad_norm": 0.09530097751712835, + "learning_rate": 7.178748665226956e-06, + "loss": 0.911, + "step": 28803 + }, + { + "epoch": 6.1441979522184305, + "grad_norm": 0.08503057882572176, + "learning_rate": 7.175225824043939e-06, + "loss": 0.8884, + "step": 28804 + }, + { + "epoch": 6.144411262798635, + "grad_norm": 0.09184824062409255, + "learning_rate": 7.17170380687449e-06, + "loss": 0.8802, + "step": 28805 + }, + { + "epoch": 6.14462457337884, + "grad_norm": 0.08335736606242572, + "learning_rate": 7.168182613758446e-06, + "loss": 0.8795, + "step": 28806 + }, + { + "epoch": 6.1448378839590445, + "grad_norm": 0.09370086685687755, + "learning_rate": 7.164662244735621e-06, + "loss": 0.8908, + "step": 28807 + }, + { + "epoch": 6.145051194539249, + "grad_norm": 0.08938282471624806, + "learning_rate": 7.16114269984586e-06, + "loss": 0.8621, + "step": 28808 + }, + { + "epoch": 6.145264505119454, + "grad_norm": 0.0959761510658647, + "learning_rate": 7.157623979129007e-06, + "loss": 0.8779, + "step": 28809 + }, + { + "epoch": 6.145477815699659, + "grad_norm": 0.09122521771328417, + "learning_rate": 7.154106082624869e-06, + "loss": 0.8714, + "step": 28810 + }, + { + "epoch": 6.145691126279863, + "grad_norm": 0.08565552496172667, + "learning_rate": 7.150589010373239e-06, + "loss": 0.8775, + "step": 28811 + }, + { + "epoch": 6.145904436860068, + "grad_norm": 0.09039287455140524, + "learning_rate": 7.1470727624139135e-06, + "loss": 0.9024, + "step": 28812 + }, + { + "epoch": 6.146117747440273, + "grad_norm": 0.09253172697077994, + "learning_rate": 7.143557338786693e-06, + "loss": 0.8885, + "step": 28813 + }, + { + "epoch": 6.146331058020478, + "grad_norm": 0.0965170787228203, + "learning_rate": 7.14004273953135e-06, + "loss": 0.8963, + "step": 28814 + }, + { + "epoch": 6.146544368600683, + "grad_norm": 0.08937982903265476, + "learning_rate": 7.136528964687639e-06, + "loss": 0.8798, + "step": 28815 + }, + { + "epoch": 6.146757679180888, + "grad_norm": 0.09900014858467224, + "learning_rate": 7.133016014295324e-06, + "loss": 0.8753, + "step": 28816 + }, + { + "epoch": 6.146970989761092, + "grad_norm": 0.09874437258588646, + "learning_rate": 7.129503888394187e-06, + "loss": 0.8901, + "step": 28817 + }, + { + "epoch": 6.147184300341297, + "grad_norm": 0.0897524642285472, + "learning_rate": 7.12599258702392e-06, + "loss": 0.8901, + "step": 28818 + }, + { + "epoch": 6.147397610921502, + "grad_norm": 0.09281443978553566, + "learning_rate": 7.122482110224278e-06, + "loss": 0.8775, + "step": 28819 + }, + { + "epoch": 6.147610921501706, + "grad_norm": 0.1010469996580348, + "learning_rate": 7.118972458034963e-06, + "loss": 0.8925, + "step": 28820 + }, + { + "epoch": 6.147824232081911, + "grad_norm": 0.09011233568440881, + "learning_rate": 7.115463630495711e-06, + "loss": 0.8379, + "step": 28821 + }, + { + "epoch": 6.148037542662116, + "grad_norm": 0.08824878740180966, + "learning_rate": 7.111955627646207e-06, + "loss": 0.8634, + "step": 28822 + }, + { + "epoch": 6.148250853242321, + "grad_norm": 0.09269614088302, + "learning_rate": 7.108448449526162e-06, + "loss": 0.8734, + "step": 28823 + }, + { + "epoch": 6.148464163822526, + "grad_norm": 0.0907980517967156, + "learning_rate": 7.104942096175231e-06, + "loss": 0.8772, + "step": 28824 + }, + { + "epoch": 6.148677474402731, + "grad_norm": 0.08880717143360842, + "learning_rate": 7.101436567633126e-06, + "loss": 0.8711, + "step": 28825 + }, + { + "epoch": 6.148890784982935, + "grad_norm": 0.09375357602321376, + "learning_rate": 7.097931863939487e-06, + "loss": 0.8668, + "step": 28826 + }, + { + "epoch": 6.14910409556314, + "grad_norm": 0.08435442153137505, + "learning_rate": 7.094427985133969e-06, + "loss": 0.8591, + "step": 28827 + }, + { + "epoch": 6.149317406143345, + "grad_norm": 0.10240045338582628, + "learning_rate": 7.090924931256231e-06, + "loss": 0.873, + "step": 28828 + }, + { + "epoch": 6.149530716723549, + "grad_norm": 0.09852704300377862, + "learning_rate": 7.087422702345903e-06, + "loss": 0.8748, + "step": 28829 + }, + { + "epoch": 6.149744027303754, + "grad_norm": 0.08837735131844202, + "learning_rate": 7.083921298442615e-06, + "loss": 0.8735, + "step": 28830 + }, + { + "epoch": 6.149957337883959, + "grad_norm": 0.09020687300807465, + "learning_rate": 7.0804207195859896e-06, + "loss": 0.9218, + "step": 28831 + }, + { + "epoch": 6.150170648464163, + "grad_norm": 0.10228890098683432, + "learning_rate": 7.0769209658156035e-06, + "loss": 0.8857, + "step": 28832 + }, + { + "epoch": 6.150383959044369, + "grad_norm": 0.09994656220494276, + "learning_rate": 7.073422037171105e-06, + "loss": 0.8607, + "step": 28833 + }, + { + "epoch": 6.150597269624574, + "grad_norm": 0.09993497874273384, + "learning_rate": 7.0699239336920625e-06, + "loss": 0.8902, + "step": 28834 + }, + { + "epoch": 6.150810580204778, + "grad_norm": 0.09617888527260607, + "learning_rate": 7.066426655418061e-06, + "loss": 0.8857, + "step": 28835 + }, + { + "epoch": 6.151023890784983, + "grad_norm": 0.10764078838501932, + "learning_rate": 7.062930202388645e-06, + "loss": 0.8806, + "step": 28836 + }, + { + "epoch": 6.151237201365188, + "grad_norm": 0.09070964975257555, + "learning_rate": 7.059434574643425e-06, + "loss": 0.8719, + "step": 28837 + }, + { + "epoch": 6.151450511945392, + "grad_norm": 0.10504347444503975, + "learning_rate": 7.055939772221925e-06, + "loss": 0.8779, + "step": 28838 + }, + { + "epoch": 6.151663822525597, + "grad_norm": 0.10696059894288376, + "learning_rate": 7.052445795163705e-06, + "loss": 0.8721, + "step": 28839 + }, + { + "epoch": 6.151877133105802, + "grad_norm": 0.0985093562759518, + "learning_rate": 7.048952643508262e-06, + "loss": 0.8951, + "step": 28840 + }, + { + "epoch": 6.152090443686006, + "grad_norm": 0.09690145691338055, + "learning_rate": 7.045460317295157e-06, + "loss": 0.916, + "step": 28841 + }, + { + "epoch": 6.152303754266212, + "grad_norm": 0.09204139470545232, + "learning_rate": 7.041968816563902e-06, + "loss": 0.8562, + "step": 28842 + }, + { + "epoch": 6.152517064846417, + "grad_norm": 0.09341298232379162, + "learning_rate": 7.0384781413539885e-06, + "loss": 0.9104, + "step": 28843 + }, + { + "epoch": 6.152730375426621, + "grad_norm": 0.09189096949460346, + "learning_rate": 7.034988291704921e-06, + "loss": 0.8583, + "step": 28844 + }, + { + "epoch": 6.152943686006826, + "grad_norm": 0.09919329690772771, + "learning_rate": 7.0314992676561875e-06, + "loss": 0.9064, + "step": 28845 + }, + { + "epoch": 6.153156996587031, + "grad_norm": 0.09451696649783616, + "learning_rate": 7.028011069247278e-06, + "loss": 0.8919, + "step": 28846 + }, + { + "epoch": 6.153370307167235, + "grad_norm": 0.08626930303603245, + "learning_rate": 7.0245236965176445e-06, + "loss": 0.8973, + "step": 28847 + }, + { + "epoch": 6.15358361774744, + "grad_norm": 0.08654586519736787, + "learning_rate": 7.021037149506749e-06, + "loss": 0.8893, + "step": 28848 + }, + { + "epoch": 6.153796928327645, + "grad_norm": 0.10380147572370321, + "learning_rate": 7.017551428254044e-06, + "loss": 0.8801, + "step": 28849 + }, + { + "epoch": 6.1540102389078495, + "grad_norm": 0.09310971042039488, + "learning_rate": 7.014066532798982e-06, + "loss": 0.8762, + "step": 28850 + }, + { + "epoch": 6.154223549488055, + "grad_norm": 0.08769169413974834, + "learning_rate": 7.010582463180999e-06, + "loss": 0.8684, + "step": 28851 + }, + { + "epoch": 6.15443686006826, + "grad_norm": 0.09398016534100041, + "learning_rate": 7.007099219439477e-06, + "loss": 0.8805, + "step": 28852 + }, + { + "epoch": 6.154650170648464, + "grad_norm": 0.08532904908511023, + "learning_rate": 7.00361680161386e-06, + "loss": 0.8833, + "step": 28853 + }, + { + "epoch": 6.154863481228669, + "grad_norm": 0.08758025047402744, + "learning_rate": 7.0001352097435546e-06, + "loss": 0.9146, + "step": 28854 + }, + { + "epoch": 6.155076791808874, + "grad_norm": 0.08699495046193546, + "learning_rate": 6.996654443867945e-06, + "loss": 0.8775, + "step": 28855 + }, + { + "epoch": 6.1552901023890785, + "grad_norm": 0.08369972475368913, + "learning_rate": 6.993174504026403e-06, + "loss": 0.8725, + "step": 28856 + }, + { + "epoch": 6.155503412969283, + "grad_norm": 0.09844256469060045, + "learning_rate": 6.989695390258337e-06, + "loss": 0.8835, + "step": 28857 + }, + { + "epoch": 6.155716723549488, + "grad_norm": 0.09825455941226334, + "learning_rate": 6.986217102603095e-06, + "loss": 0.8722, + "step": 28858 + }, + { + "epoch": 6.1559300341296925, + "grad_norm": 0.10209430718138691, + "learning_rate": 6.982739641100029e-06, + "loss": 0.8671, + "step": 28859 + }, + { + "epoch": 6.156143344709897, + "grad_norm": 0.09979623027609041, + "learning_rate": 6.979263005788488e-06, + "loss": 0.8865, + "step": 28860 + }, + { + "epoch": 6.156356655290103, + "grad_norm": 0.08981782126102791, + "learning_rate": 6.975787196707817e-06, + "loss": 0.8615, + "step": 28861 + }, + { + "epoch": 6.1565699658703075, + "grad_norm": 0.08797400517996293, + "learning_rate": 6.972312213897363e-06, + "loss": 0.8591, + "step": 28862 + }, + { + "epoch": 6.156783276450512, + "grad_norm": 0.1197160175933326, + "learning_rate": 6.96883805739641e-06, + "loss": 0.8569, + "step": 28863 + }, + { + "epoch": 6.156996587030717, + "grad_norm": 0.09745309028601125, + "learning_rate": 6.965364727244268e-06, + "loss": 0.8506, + "step": 28864 + }, + { + "epoch": 6.1572098976109215, + "grad_norm": 0.09109440892245371, + "learning_rate": 6.9618922234802665e-06, + "loss": 0.8647, + "step": 28865 + }, + { + "epoch": 6.157423208191126, + "grad_norm": 0.09452774522005297, + "learning_rate": 6.9584205461436804e-06, + "loss": 0.889, + "step": 28866 + }, + { + "epoch": 6.157636518771331, + "grad_norm": 0.10600315185629278, + "learning_rate": 6.954949695273803e-06, + "loss": 0.8887, + "step": 28867 + }, + { + "epoch": 6.157849829351536, + "grad_norm": 0.10072864041118378, + "learning_rate": 6.951479670909873e-06, + "loss": 0.8999, + "step": 28868 + }, + { + "epoch": 6.15806313993174, + "grad_norm": 0.10301810120887425, + "learning_rate": 6.948010473091202e-06, + "loss": 0.8624, + "step": 28869 + }, + { + "epoch": 6.158276450511945, + "grad_norm": 0.09102081902397449, + "learning_rate": 6.944542101857012e-06, + "loss": 0.8926, + "step": 28870 + }, + { + "epoch": 6.1584897610921505, + "grad_norm": 0.10194602929111668, + "learning_rate": 6.94107455724657e-06, + "loss": 0.8648, + "step": 28871 + }, + { + "epoch": 6.158703071672355, + "grad_norm": 0.10549147983459486, + "learning_rate": 6.9376078392990785e-06, + "loss": 0.8541, + "step": 28872 + }, + { + "epoch": 6.15891638225256, + "grad_norm": 0.09125614811846794, + "learning_rate": 6.934141948053806e-06, + "loss": 0.8857, + "step": 28873 + }, + { + "epoch": 6.159129692832765, + "grad_norm": 0.10851915415673798, + "learning_rate": 6.930676883549936e-06, + "loss": 0.9039, + "step": 28874 + }, + { + "epoch": 6.159343003412969, + "grad_norm": 0.10280258902318311, + "learning_rate": 6.927212645826684e-06, + "loss": 0.9027, + "step": 28875 + }, + { + "epoch": 6.159556313993174, + "grad_norm": 0.0860664091096446, + "learning_rate": 6.923749234923245e-06, + "loss": 0.8688, + "step": 28876 + }, + { + "epoch": 6.159769624573379, + "grad_norm": 0.09842197752695378, + "learning_rate": 6.920286650878814e-06, + "loss": 0.8928, + "step": 28877 + }, + { + "epoch": 6.159982935153583, + "grad_norm": 0.1063899859816748, + "learning_rate": 6.916824893732577e-06, + "loss": 0.8764, + "step": 28878 + }, + { + "epoch": 6.160196245733788, + "grad_norm": 0.10603661841042948, + "learning_rate": 6.913363963523694e-06, + "loss": 0.8954, + "step": 28879 + }, + { + "epoch": 6.160409556313994, + "grad_norm": 0.08610123695682799, + "learning_rate": 6.909903860291316e-06, + "loss": 0.8621, + "step": 28880 + }, + { + "epoch": 6.160622866894198, + "grad_norm": 0.09660228063374698, + "learning_rate": 6.906444584074612e-06, + "loss": 0.8744, + "step": 28881 + }, + { + "epoch": 6.160836177474403, + "grad_norm": 0.09949223256725878, + "learning_rate": 6.9029861349127234e-06, + "loss": 0.8813, + "step": 28882 + }, + { + "epoch": 6.161049488054608, + "grad_norm": 0.09070107597023533, + "learning_rate": 6.899528512844766e-06, + "loss": 0.8695, + "step": 28883 + }, + { + "epoch": 6.161262798634812, + "grad_norm": 0.09439749512281537, + "learning_rate": 6.8960717179098825e-06, + "loss": 0.888, + "step": 28884 + }, + { + "epoch": 6.161476109215017, + "grad_norm": 0.10170660268462638, + "learning_rate": 6.892615750147178e-06, + "loss": 0.9068, + "step": 28885 + }, + { + "epoch": 6.161689419795222, + "grad_norm": 0.12031769170863742, + "learning_rate": 6.889160609595751e-06, + "loss": 0.8644, + "step": 28886 + }, + { + "epoch": 6.161902730375426, + "grad_norm": 0.09310246284012294, + "learning_rate": 6.885706296294707e-06, + "loss": 0.8656, + "step": 28887 + }, + { + "epoch": 6.162116040955631, + "grad_norm": 0.09552180791432648, + "learning_rate": 6.88225281028311e-06, + "loss": 0.8807, + "step": 28888 + }, + { + "epoch": 6.162329351535837, + "grad_norm": 0.08549973760909356, + "learning_rate": 6.878800151600065e-06, + "loss": 0.893, + "step": 28889 + }, + { + "epoch": 6.162542662116041, + "grad_norm": 0.09463026177104174, + "learning_rate": 6.875348320284625e-06, + "loss": 0.8665, + "step": 28890 + }, + { + "epoch": 6.162755972696246, + "grad_norm": 0.09249648145708789, + "learning_rate": 6.871897316375853e-06, + "loss": 0.8771, + "step": 28891 + }, + { + "epoch": 6.162969283276451, + "grad_norm": 0.1044681720672931, + "learning_rate": 6.868447139912775e-06, + "loss": 0.9014, + "step": 28892 + }, + { + "epoch": 6.163182593856655, + "grad_norm": 0.09133816351140338, + "learning_rate": 6.864997790934462e-06, + "loss": 0.8745, + "step": 28893 + }, + { + "epoch": 6.16339590443686, + "grad_norm": 0.09157836726650471, + "learning_rate": 6.861549269479923e-06, + "loss": 0.8602, + "step": 28894 + }, + { + "epoch": 6.163609215017065, + "grad_norm": 0.09724810461073118, + "learning_rate": 6.858101575588194e-06, + "loss": 0.8949, + "step": 28895 + }, + { + "epoch": 6.163822525597269, + "grad_norm": 0.09147157224315237, + "learning_rate": 6.854654709298274e-06, + "loss": 0.8835, + "step": 28896 + }, + { + "epoch": 6.164035836177474, + "grad_norm": 0.09100988612132051, + "learning_rate": 6.8512086706491636e-06, + "loss": 0.9234, + "step": 28897 + }, + { + "epoch": 6.164249146757679, + "grad_norm": 0.09319468820487703, + "learning_rate": 6.847763459679861e-06, + "loss": 0.8946, + "step": 28898 + }, + { + "epoch": 6.164462457337884, + "grad_norm": 0.08217687724644929, + "learning_rate": 6.844319076429342e-06, + "loss": 0.907, + "step": 28899 + }, + { + "epoch": 6.164675767918089, + "grad_norm": 0.09355876300297679, + "learning_rate": 6.8408755209365696e-06, + "loss": 0.8934, + "step": 28900 + }, + { + "epoch": 6.164889078498294, + "grad_norm": 0.0898358274131522, + "learning_rate": 6.837432793240535e-06, + "loss": 0.9106, + "step": 28901 + }, + { + "epoch": 6.165102389078498, + "grad_norm": 0.08386115859024765, + "learning_rate": 6.833990893380185e-06, + "loss": 0.8978, + "step": 28902 + }, + { + "epoch": 6.165315699658703, + "grad_norm": 0.08730240961638236, + "learning_rate": 6.8305498213944474e-06, + "loss": 0.8462, + "step": 28903 + }, + { + "epoch": 6.165529010238908, + "grad_norm": 0.08176857837830658, + "learning_rate": 6.827109577322262e-06, + "loss": 0.8676, + "step": 28904 + }, + { + "epoch": 6.165742320819112, + "grad_norm": 0.0851810040030959, + "learning_rate": 6.823670161202582e-06, + "loss": 0.8823, + "step": 28905 + }, + { + "epoch": 6.165955631399317, + "grad_norm": 0.08479933371261097, + "learning_rate": 6.820231573074294e-06, + "loss": 0.8522, + "step": 28906 + }, + { + "epoch": 6.166168941979522, + "grad_norm": 0.0932569508387412, + "learning_rate": 6.8167938129763254e-06, + "loss": 0.8797, + "step": 28907 + }, + { + "epoch": 6.166382252559727, + "grad_norm": 0.0819640788156027, + "learning_rate": 6.8133568809475615e-06, + "loss": 0.8646, + "step": 28908 + }, + { + "epoch": 6.166595563139932, + "grad_norm": 0.0836339809863734, + "learning_rate": 6.809920777026895e-06, + "loss": 0.8796, + "step": 28909 + }, + { + "epoch": 6.166808873720137, + "grad_norm": 0.08682470662840373, + "learning_rate": 6.80648550125321e-06, + "loss": 0.8643, + "step": 28910 + }, + { + "epoch": 6.167022184300341, + "grad_norm": 0.09820161312222431, + "learning_rate": 6.803051053665366e-06, + "loss": 0.9215, + "step": 28911 + }, + { + "epoch": 6.167235494880546, + "grad_norm": 0.09563235650750823, + "learning_rate": 6.799617434302219e-06, + "loss": 0.8677, + "step": 28912 + }, + { + "epoch": 6.167448805460751, + "grad_norm": 0.09135273334754648, + "learning_rate": 6.796184643202655e-06, + "loss": 0.8723, + "step": 28913 + }, + { + "epoch": 6.1676621160409555, + "grad_norm": 0.0998390413939783, + "learning_rate": 6.7927526804054856e-06, + "loss": 0.9049, + "step": 28914 + }, + { + "epoch": 6.16787542662116, + "grad_norm": 0.08738658213714619, + "learning_rate": 6.7893215459495435e-06, + "loss": 0.8571, + "step": 28915 + }, + { + "epoch": 6.168088737201365, + "grad_norm": 0.08548619609925652, + "learning_rate": 6.785891239873658e-06, + "loss": 0.8748, + "step": 28916 + }, + { + "epoch": 6.1683020477815695, + "grad_norm": 0.09484273455383242, + "learning_rate": 6.782461762216655e-06, + "loss": 0.8975, + "step": 28917 + }, + { + "epoch": 6.168515358361775, + "grad_norm": 0.09443710222862922, + "learning_rate": 6.7790331130173345e-06, + "loss": 0.8749, + "step": 28918 + }, + { + "epoch": 6.16872866894198, + "grad_norm": 0.0858879328585858, + "learning_rate": 6.775605292314469e-06, + "loss": 0.8836, + "step": 28919 + }, + { + "epoch": 6.1689419795221845, + "grad_norm": 0.09057251877890922, + "learning_rate": 6.77217830014687e-06, + "loss": 0.8976, + "step": 28920 + }, + { + "epoch": 6.169155290102389, + "grad_norm": 0.09527640169596467, + "learning_rate": 6.768752136553307e-06, + "loss": 0.871, + "step": 28921 + }, + { + "epoch": 6.169368600682594, + "grad_norm": 0.09036436331960587, + "learning_rate": 6.7653268015725405e-06, + "loss": 0.8625, + "step": 28922 + }, + { + "epoch": 6.1695819112627985, + "grad_norm": 0.08096952330416952, + "learning_rate": 6.761902295243339e-06, + "loss": 0.9119, + "step": 28923 + }, + { + "epoch": 6.169795221843003, + "grad_norm": 0.09244925443217032, + "learning_rate": 6.758478617604427e-06, + "loss": 0.8833, + "step": 28924 + }, + { + "epoch": 6.170008532423208, + "grad_norm": 0.08743767345406114, + "learning_rate": 6.755055768694583e-06, + "loss": 0.8678, + "step": 28925 + }, + { + "epoch": 6.170221843003413, + "grad_norm": 0.08143029718337118, + "learning_rate": 6.751633748552504e-06, + "loss": 0.8583, + "step": 28926 + }, + { + "epoch": 6.170435153583618, + "grad_norm": 0.0901519857898675, + "learning_rate": 6.748212557216933e-06, + "loss": 0.8821, + "step": 28927 + }, + { + "epoch": 6.170648464163823, + "grad_norm": 0.08840208659154757, + "learning_rate": 6.744792194726541e-06, + "loss": 0.881, + "step": 28928 + }, + { + "epoch": 6.1708617747440275, + "grad_norm": 0.09749601554857147, + "learning_rate": 6.741372661120098e-06, + "loss": 0.8793, + "step": 28929 + }, + { + "epoch": 6.171075085324232, + "grad_norm": 0.0903989716084252, + "learning_rate": 6.737953956436238e-06, + "loss": 0.8867, + "step": 28930 + }, + { + "epoch": 6.171288395904437, + "grad_norm": 0.09279297665786185, + "learning_rate": 6.734536080713643e-06, + "loss": 0.8818, + "step": 28931 + }, + { + "epoch": 6.171501706484642, + "grad_norm": 0.09724392606333328, + "learning_rate": 6.731119033991027e-06, + "loss": 0.8709, + "step": 28932 + }, + { + "epoch": 6.171715017064846, + "grad_norm": 0.09921498892151463, + "learning_rate": 6.727702816307027e-06, + "loss": 0.9182, + "step": 28933 + }, + { + "epoch": 6.171928327645051, + "grad_norm": 0.08497206658428011, + "learning_rate": 6.724287427700296e-06, + "loss": 0.8448, + "step": 28934 + }, + { + "epoch": 6.172141638225256, + "grad_norm": 0.09101642170199654, + "learning_rate": 6.720872868209487e-06, + "loss": 0.8768, + "step": 28935 + }, + { + "epoch": 6.172354948805461, + "grad_norm": 0.09641923716209531, + "learning_rate": 6.717459137873228e-06, + "loss": 0.8884, + "step": 28936 + }, + { + "epoch": 6.172568259385666, + "grad_norm": 0.09326232679548435, + "learning_rate": 6.714046236730163e-06, + "loss": 0.9002, + "step": 28937 + }, + { + "epoch": 6.172781569965871, + "grad_norm": 0.08495648134468357, + "learning_rate": 6.710634164818892e-06, + "loss": 0.916, + "step": 28938 + }, + { + "epoch": 6.172994880546075, + "grad_norm": 0.10139278550848, + "learning_rate": 6.7072229221780335e-06, + "loss": 0.8345, + "step": 28939 + }, + { + "epoch": 6.17320819112628, + "grad_norm": 0.08729984168872282, + "learning_rate": 6.703812508846179e-06, + "loss": 0.8564, + "step": 28940 + }, + { + "epoch": 6.173421501706485, + "grad_norm": 0.0892823478322736, + "learning_rate": 6.70040292486192e-06, + "loss": 0.8805, + "step": 28941 + }, + { + "epoch": 6.173634812286689, + "grad_norm": 0.09365225568554335, + "learning_rate": 6.696994170263829e-06, + "loss": 0.9061, + "step": 28942 + }, + { + "epoch": 6.173848122866894, + "grad_norm": 0.08987660247693752, + "learning_rate": 6.693586245090471e-06, + "loss": 0.8929, + "step": 28943 + }, + { + "epoch": 6.174061433447099, + "grad_norm": 0.08913050725055596, + "learning_rate": 6.690179149380438e-06, + "loss": 0.8853, + "step": 28944 + }, + { + "epoch": 6.174274744027303, + "grad_norm": 0.08874068119084598, + "learning_rate": 6.68677288317225e-06, + "loss": 0.8703, + "step": 28945 + }, + { + "epoch": 6.174488054607509, + "grad_norm": 0.08628299431927658, + "learning_rate": 6.683367446504472e-06, + "loss": 0.8918, + "step": 28946 + }, + { + "epoch": 6.174701365187714, + "grad_norm": 0.0913131453138336, + "learning_rate": 6.6799628394156145e-06, + "loss": 0.8802, + "step": 28947 + }, + { + "epoch": 6.174914675767918, + "grad_norm": 0.09174767077431434, + "learning_rate": 6.676559061944199e-06, + "loss": 0.877, + "step": 28948 + }, + { + "epoch": 6.175127986348123, + "grad_norm": 0.08605417218316679, + "learning_rate": 6.673156114128772e-06, + "loss": 0.8724, + "step": 28949 + }, + { + "epoch": 6.175341296928328, + "grad_norm": 0.08008659019432249, + "learning_rate": 6.6697539960078084e-06, + "loss": 0.8622, + "step": 28950 + }, + { + "epoch": 6.175554607508532, + "grad_norm": 0.07929476054150995, + "learning_rate": 6.6663527076198205e-06, + "loss": 0.8643, + "step": 28951 + }, + { + "epoch": 6.175767918088737, + "grad_norm": 0.09714053404114488, + "learning_rate": 6.662952249003276e-06, + "loss": 0.8771, + "step": 28952 + }, + { + "epoch": 6.175981228668942, + "grad_norm": 0.08942729561448652, + "learning_rate": 6.659552620196668e-06, + "loss": 0.8357, + "step": 28953 + }, + { + "epoch": 6.176194539249146, + "grad_norm": 0.08627360999307965, + "learning_rate": 6.656153821238445e-06, + "loss": 0.8942, + "step": 28954 + }, + { + "epoch": 6.176407849829351, + "grad_norm": 0.09821936554061585, + "learning_rate": 6.652755852167074e-06, + "loss": 0.8973, + "step": 28955 + }, + { + "epoch": 6.176621160409557, + "grad_norm": 0.09076199157251032, + "learning_rate": 6.6493587130210054e-06, + "loss": 0.8738, + "step": 28956 + }, + { + "epoch": 6.176834470989761, + "grad_norm": 0.08772714307474713, + "learning_rate": 6.6459624038386885e-06, + "loss": 0.8795, + "step": 28957 + }, + { + "epoch": 6.177047781569966, + "grad_norm": 0.0945842085427892, + "learning_rate": 6.6425669246585355e-06, + "loss": 0.8651, + "step": 28958 + }, + { + "epoch": 6.177261092150171, + "grad_norm": 0.09632063179225489, + "learning_rate": 6.6391722755189705e-06, + "loss": 0.8551, + "step": 28959 + }, + { + "epoch": 6.177474402730375, + "grad_norm": 0.08992386453070678, + "learning_rate": 6.635778456458397e-06, + "loss": 0.8688, + "step": 28960 + }, + { + "epoch": 6.17768771331058, + "grad_norm": 0.08984588146469691, + "learning_rate": 6.63238546751523e-06, + "loss": 0.8778, + "step": 28961 + }, + { + "epoch": 6.177901023890785, + "grad_norm": 0.09281551679280724, + "learning_rate": 6.628993308727865e-06, + "loss": 0.8966, + "step": 28962 + }, + { + "epoch": 6.1781143344709895, + "grad_norm": 0.09851608093954, + "learning_rate": 6.625601980134662e-06, + "loss": 0.9006, + "step": 28963 + }, + { + "epoch": 6.178327645051194, + "grad_norm": 0.08901014802347072, + "learning_rate": 6.6222114817740165e-06, + "loss": 0.8804, + "step": 28964 + }, + { + "epoch": 6.1785409556314, + "grad_norm": 0.08596914940555704, + "learning_rate": 6.618821813684282e-06, + "loss": 0.8573, + "step": 28965 + }, + { + "epoch": 6.178754266211604, + "grad_norm": 0.09252442054212479, + "learning_rate": 6.615432975903808e-06, + "loss": 0.8964, + "step": 28966 + }, + { + "epoch": 6.178967576791809, + "grad_norm": 0.10127864970020968, + "learning_rate": 6.6120449684709385e-06, + "loss": 0.8856, + "step": 28967 + }, + { + "epoch": 6.179180887372014, + "grad_norm": 0.08579093885081046, + "learning_rate": 6.6086577914240245e-06, + "loss": 0.8679, + "step": 28968 + }, + { + "epoch": 6.1793941979522184, + "grad_norm": 0.08778246819725452, + "learning_rate": 6.605271444801373e-06, + "loss": 0.8804, + "step": 28969 + }, + { + "epoch": 6.179607508532423, + "grad_norm": 0.09332550072211634, + "learning_rate": 6.601885928641318e-06, + "loss": 0.8668, + "step": 28970 + }, + { + "epoch": 6.179820819112628, + "grad_norm": 0.08880951598431717, + "learning_rate": 6.598501242982158e-06, + "loss": 0.8929, + "step": 28971 + }, + { + "epoch": 6.1800341296928325, + "grad_norm": 0.0840304402022046, + "learning_rate": 6.595117387862182e-06, + "loss": 0.8554, + "step": 28972 + }, + { + "epoch": 6.180247440273037, + "grad_norm": 0.09298543524776644, + "learning_rate": 6.5917343633196975e-06, + "loss": 0.8732, + "step": 28973 + }, + { + "epoch": 6.180460750853243, + "grad_norm": 0.08960414881807655, + "learning_rate": 6.588352169392984e-06, + "loss": 0.9013, + "step": 28974 + }, + { + "epoch": 6.180674061433447, + "grad_norm": 0.0869478480487798, + "learning_rate": 6.58497080612027e-06, + "loss": 0.8721, + "step": 28975 + }, + { + "epoch": 6.180887372013652, + "grad_norm": 0.09731216976342603, + "learning_rate": 6.58159027353987e-06, + "loss": 0.9082, + "step": 28976 + }, + { + "epoch": 6.181100682593857, + "grad_norm": 0.0986778503564987, + "learning_rate": 6.578210571690004e-06, + "loss": 0.8746, + "step": 28977 + }, + { + "epoch": 6.1813139931740615, + "grad_norm": 0.09272200579580948, + "learning_rate": 6.5748317006089256e-06, + "loss": 0.8839, + "step": 28978 + }, + { + "epoch": 6.181527303754266, + "grad_norm": 0.08363974441627314, + "learning_rate": 6.571453660334844e-06, + "loss": 0.9032, + "step": 28979 + }, + { + "epoch": 6.181740614334471, + "grad_norm": 0.09213784020790822, + "learning_rate": 6.568076450906012e-06, + "loss": 0.8462, + "step": 28980 + }, + { + "epoch": 6.1819539249146755, + "grad_norm": 0.0871283901250102, + "learning_rate": 6.5647000723606305e-06, + "loss": 0.8838, + "step": 28981 + }, + { + "epoch": 6.18216723549488, + "grad_norm": 0.08901150489160249, + "learning_rate": 6.561324524736901e-06, + "loss": 0.8657, + "step": 28982 + }, + { + "epoch": 6.182380546075085, + "grad_norm": 0.09220744267858696, + "learning_rate": 6.557949808073023e-06, + "loss": 0.885, + "step": 28983 + }, + { + "epoch": 6.1825938566552905, + "grad_norm": 0.08355533256574187, + "learning_rate": 6.554575922407162e-06, + "loss": 0.8852, + "step": 28984 + }, + { + "epoch": 6.182807167235495, + "grad_norm": 0.08552599349397583, + "learning_rate": 6.551202867777546e-06, + "loss": 0.896, + "step": 28985 + }, + { + "epoch": 6.1830204778157, + "grad_norm": 0.08314070162640279, + "learning_rate": 6.547830644222286e-06, + "loss": 0.8951, + "step": 28986 + }, + { + "epoch": 6.1832337883959045, + "grad_norm": 0.10306436958926754, + "learning_rate": 6.544459251779538e-06, + "loss": 0.8956, + "step": 28987 + }, + { + "epoch": 6.183447098976109, + "grad_norm": 0.08832808426803783, + "learning_rate": 6.541088690487494e-06, + "loss": 0.8975, + "step": 28988 + }, + { + "epoch": 6.183660409556314, + "grad_norm": 0.09354274426454287, + "learning_rate": 6.537718960384256e-06, + "loss": 0.9104, + "step": 28989 + }, + { + "epoch": 6.183873720136519, + "grad_norm": 0.08777366051922222, + "learning_rate": 6.534350061507972e-06, + "loss": 0.8742, + "step": 28990 + }, + { + "epoch": 6.184087030716723, + "grad_norm": 0.0817902487021774, + "learning_rate": 6.5309819938967276e-06, + "loss": 0.8586, + "step": 28991 + }, + { + "epoch": 6.184300341296928, + "grad_norm": 0.0860726611811344, + "learning_rate": 6.527614757588687e-06, + "loss": 0.9018, + "step": 28992 + }, + { + "epoch": 6.1845136518771335, + "grad_norm": 0.09508864083582415, + "learning_rate": 6.524248352621909e-06, + "loss": 0.9099, + "step": 28993 + }, + { + "epoch": 6.184726962457338, + "grad_norm": 0.09448632711282184, + "learning_rate": 6.520882779034506e-06, + "loss": 0.8919, + "step": 28994 + }, + { + "epoch": 6.184940273037543, + "grad_norm": 0.09258915997095656, + "learning_rate": 6.517518036864543e-06, + "loss": 0.8965, + "step": 28995 + }, + { + "epoch": 6.185153583617748, + "grad_norm": 0.08743126510363765, + "learning_rate": 6.5141541261500985e-06, + "loss": 0.8821, + "step": 28996 + }, + { + "epoch": 6.185366894197952, + "grad_norm": 0.0904415709076857, + "learning_rate": 6.510791046929239e-06, + "loss": 0.8707, + "step": 28997 + }, + { + "epoch": 6.185580204778157, + "grad_norm": 0.09139441216551898, + "learning_rate": 6.5074287992400234e-06, + "loss": 0.8504, + "step": 28998 + }, + { + "epoch": 6.185793515358362, + "grad_norm": 0.09330072966794845, + "learning_rate": 6.504067383120465e-06, + "loss": 0.8729, + "step": 28999 + }, + { + "epoch": 6.186006825938566, + "grad_norm": 0.08626551439619362, + "learning_rate": 6.500706798608631e-06, + "loss": 0.8994, + "step": 29000 + }, + { + "epoch": 6.186220136518771, + "grad_norm": 0.09344388844356974, + "learning_rate": 6.497347045742537e-06, + "loss": 0.8983, + "step": 29001 + }, + { + "epoch": 6.186433447098976, + "grad_norm": 0.09255871205856006, + "learning_rate": 6.493988124560204e-06, + "loss": 0.8909, + "step": 29002 + }, + { + "epoch": 6.186646757679181, + "grad_norm": 0.08231082377671864, + "learning_rate": 6.490630035099612e-06, + "loss": 0.8973, + "step": 29003 + }, + { + "epoch": 6.186860068259386, + "grad_norm": 0.0901655797941623, + "learning_rate": 6.487272777398792e-06, + "loss": 0.8718, + "step": 29004 + }, + { + "epoch": 6.187073378839591, + "grad_norm": 0.08658386318637991, + "learning_rate": 6.483916351495722e-06, + "loss": 0.8766, + "step": 29005 + }, + { + "epoch": 6.187286689419795, + "grad_norm": 0.086263848708883, + "learning_rate": 6.480560757428365e-06, + "loss": 0.9192, + "step": 29006 + }, + { + "epoch": 6.1875, + "grad_norm": 0.10183510728151038, + "learning_rate": 6.4772059952347055e-06, + "loss": 0.8856, + "step": 29007 + }, + { + "epoch": 6.187713310580205, + "grad_norm": 0.09530510789111164, + "learning_rate": 6.473852064952706e-06, + "loss": 0.871, + "step": 29008 + }, + { + "epoch": 6.187926621160409, + "grad_norm": 0.08280102753403117, + "learning_rate": 6.470498966620291e-06, + "loss": 0.8719, + "step": 29009 + }, + { + "epoch": 6.188139931740614, + "grad_norm": 0.09092683948655303, + "learning_rate": 6.4671467002754304e-06, + "loss": 0.884, + "step": 29010 + }, + { + "epoch": 6.188353242320819, + "grad_norm": 0.09398404838168783, + "learning_rate": 6.463795265956023e-06, + "loss": 0.8745, + "step": 29011 + }, + { + "epoch": 6.188566552901024, + "grad_norm": 0.08743770120669551, + "learning_rate": 6.460444663700021e-06, + "loss": 0.8971, + "step": 29012 + }, + { + "epoch": 6.188779863481229, + "grad_norm": 0.08290469371131363, + "learning_rate": 6.457094893545321e-06, + "loss": 0.8786, + "step": 29013 + }, + { + "epoch": 6.188993174061434, + "grad_norm": 0.08480071907324244, + "learning_rate": 6.453745955529833e-06, + "loss": 0.8143, + "step": 29014 + }, + { + "epoch": 6.189206484641638, + "grad_norm": 0.08734902731994522, + "learning_rate": 6.450397849691437e-06, + "loss": 0.8581, + "step": 29015 + }, + { + "epoch": 6.189419795221843, + "grad_norm": 0.09082788051830007, + "learning_rate": 6.447050576068039e-06, + "loss": 0.8931, + "step": 29016 + }, + { + "epoch": 6.189633105802048, + "grad_norm": 0.0894825660190718, + "learning_rate": 6.443704134697495e-06, + "loss": 0.8674, + "step": 29017 + }, + { + "epoch": 6.189846416382252, + "grad_norm": 0.0972393826432937, + "learning_rate": 6.440358525617694e-06, + "loss": 0.8847, + "step": 29018 + }, + { + "epoch": 6.190059726962457, + "grad_norm": 0.09463429683206233, + "learning_rate": 6.437013748866445e-06, + "loss": 0.8892, + "step": 29019 + }, + { + "epoch": 6.190273037542662, + "grad_norm": 0.09276567912813344, + "learning_rate": 6.433669804481639e-06, + "loss": 0.8629, + "step": 29020 + }, + { + "epoch": 6.1904863481228665, + "grad_norm": 0.0898491368717424, + "learning_rate": 6.4303266925010944e-06, + "loss": 0.8868, + "step": 29021 + }, + { + "epoch": 6.190699658703072, + "grad_norm": 0.08909886410690418, + "learning_rate": 6.426984412962638e-06, + "loss": 0.8661, + "step": 29022 + }, + { + "epoch": 6.190912969283277, + "grad_norm": 0.09801568017200736, + "learning_rate": 6.42364296590408e-06, + "loss": 0.8747, + "step": 29023 + }, + { + "epoch": 6.191126279863481, + "grad_norm": 0.1008879773694372, + "learning_rate": 6.420302351363257e-06, + "loss": 0.8733, + "step": 29024 + }, + { + "epoch": 6.191339590443686, + "grad_norm": 0.08652308375177548, + "learning_rate": 6.4169625693779425e-06, + "loss": 0.9014, + "step": 29025 + }, + { + "epoch": 6.191552901023891, + "grad_norm": 0.09030895496203362, + "learning_rate": 6.413623619985938e-06, + "loss": 0.8662, + "step": 29026 + }, + { + "epoch": 6.1917662116040955, + "grad_norm": 0.0923371454961497, + "learning_rate": 6.410285503225009e-06, + "loss": 0.873, + "step": 29027 + }, + { + "epoch": 6.1919795221843, + "grad_norm": 0.09047338413510621, + "learning_rate": 6.406948219132947e-06, + "loss": 0.8788, + "step": 29028 + }, + { + "epoch": 6.192192832764505, + "grad_norm": 0.09614284749189826, + "learning_rate": 6.403611767747508e-06, + "loss": 0.8833, + "step": 29029 + }, + { + "epoch": 6.1924061433447095, + "grad_norm": 0.08698498494084483, + "learning_rate": 6.40027614910645e-06, + "loss": 0.8788, + "step": 29030 + }, + { + "epoch": 6.192619453924915, + "grad_norm": 0.09118015760731331, + "learning_rate": 6.396941363247475e-06, + "loss": 0.8932, + "step": 29031 + }, + { + "epoch": 6.19283276450512, + "grad_norm": 0.09246264896981675, + "learning_rate": 6.393607410208366e-06, + "loss": 0.8851, + "step": 29032 + }, + { + "epoch": 6.1930460750853245, + "grad_norm": 0.09359034342410072, + "learning_rate": 6.390274290026827e-06, + "loss": 0.8597, + "step": 29033 + }, + { + "epoch": 6.193259385665529, + "grad_norm": 0.08520752359945381, + "learning_rate": 6.3869420027405705e-06, + "loss": 0.905, + "step": 29034 + }, + { + "epoch": 6.193472696245734, + "grad_norm": 0.08880687892387021, + "learning_rate": 6.383610548387289e-06, + "loss": 0.851, + "step": 29035 + }, + { + "epoch": 6.1936860068259385, + "grad_norm": 0.08562742920300194, + "learning_rate": 6.380279927004705e-06, + "loss": 0.8655, + "step": 29036 + }, + { + "epoch": 6.193899317406143, + "grad_norm": 0.09469480443499674, + "learning_rate": 6.3769501386304935e-06, + "loss": 0.8572, + "step": 29037 + }, + { + "epoch": 6.194112627986348, + "grad_norm": 0.08353125098854564, + "learning_rate": 6.373621183302332e-06, + "loss": 0.8863, + "step": 29038 + }, + { + "epoch": 6.194325938566553, + "grad_norm": 0.08703157397067464, + "learning_rate": 6.370293061057871e-06, + "loss": 0.8749, + "step": 29039 + }, + { + "epoch": 6.194539249146757, + "grad_norm": 0.0979931679642784, + "learning_rate": 6.366965771934795e-06, + "loss": 0.8785, + "step": 29040 + }, + { + "epoch": 6.194752559726963, + "grad_norm": 0.08851339084487209, + "learning_rate": 6.363639315970753e-06, + "loss": 0.8959, + "step": 29041 + }, + { + "epoch": 6.1949658703071675, + "grad_norm": 0.08770333224168135, + "learning_rate": 6.360313693203353e-06, + "loss": 0.9024, + "step": 29042 + }, + { + "epoch": 6.195179180887372, + "grad_norm": 0.08654867389358246, + "learning_rate": 6.356988903670234e-06, + "loss": 0.8799, + "step": 29043 + }, + { + "epoch": 6.195392491467577, + "grad_norm": 0.08009338252078599, + "learning_rate": 6.353664947409037e-06, + "loss": 0.8819, + "step": 29044 + }, + { + "epoch": 6.1956058020477816, + "grad_norm": 0.09608526196309333, + "learning_rate": 6.35034182445736e-06, + "loss": 0.8997, + "step": 29045 + }, + { + "epoch": 6.195819112627986, + "grad_norm": 0.08553650283162588, + "learning_rate": 6.347019534852798e-06, + "loss": 0.8922, + "step": 29046 + }, + { + "epoch": 6.196032423208191, + "grad_norm": 0.09449744251748732, + "learning_rate": 6.34369807863294e-06, + "loss": 0.914, + "step": 29047 + }, + { + "epoch": 6.196245733788396, + "grad_norm": 0.09470563268031176, + "learning_rate": 6.3403774558354e-06, + "loss": 0.8953, + "step": 29048 + }, + { + "epoch": 6.1964590443686, + "grad_norm": 0.08605175249242299, + "learning_rate": 6.3370576664977126e-06, + "loss": 0.8591, + "step": 29049 + }, + { + "epoch": 6.196672354948806, + "grad_norm": 0.08355680552003335, + "learning_rate": 6.3337387106574645e-06, + "loss": 0.8756, + "step": 29050 + }, + { + "epoch": 6.1968856655290105, + "grad_norm": 0.09768106283762362, + "learning_rate": 6.3304205883521905e-06, + "loss": 0.8731, + "step": 29051 + }, + { + "epoch": 6.197098976109215, + "grad_norm": 0.09305117112210878, + "learning_rate": 6.32710329961947e-06, + "loss": 0.8909, + "step": 29052 + }, + { + "epoch": 6.19731228668942, + "grad_norm": 0.08603782611902915, + "learning_rate": 6.323786844496802e-06, + "loss": 0.8843, + "step": 29053 + }, + { + "epoch": 6.197525597269625, + "grad_norm": 0.08613854257390445, + "learning_rate": 6.320471223021729e-06, + "loss": 0.8977, + "step": 29054 + }, + { + "epoch": 6.197738907849829, + "grad_norm": 0.08247016909851855, + "learning_rate": 6.317156435231741e-06, + "loss": 0.8711, + "step": 29055 + }, + { + "epoch": 6.197952218430034, + "grad_norm": 0.08124486038985035, + "learning_rate": 6.313842481164392e-06, + "loss": 0.8855, + "step": 29056 + }, + { + "epoch": 6.198165529010239, + "grad_norm": 0.08140789163399803, + "learning_rate": 6.310529360857143e-06, + "loss": 0.8817, + "step": 29057 + }, + { + "epoch": 6.198378839590443, + "grad_norm": 0.08595898562530949, + "learning_rate": 6.307217074347502e-06, + "loss": 0.8777, + "step": 29058 + }, + { + "epoch": 6.198592150170649, + "grad_norm": 0.080888305601921, + "learning_rate": 6.303905621672925e-06, + "loss": 0.8541, + "step": 29059 + }, + { + "epoch": 6.198805460750854, + "grad_norm": 0.08976249021745994, + "learning_rate": 6.300595002870902e-06, + "loss": 0.9111, + "step": 29060 + }, + { + "epoch": 6.199018771331058, + "grad_norm": 0.08711804068051514, + "learning_rate": 6.297285217978895e-06, + "loss": 0.856, + "step": 29061 + }, + { + "epoch": 6.199232081911263, + "grad_norm": 0.10124831006668415, + "learning_rate": 6.293976267034341e-06, + "loss": 0.915, + "step": 29062 + }, + { + "epoch": 6.199445392491468, + "grad_norm": 0.08137906667321908, + "learning_rate": 6.290668150074686e-06, + "loss": 0.8866, + "step": 29063 + }, + { + "epoch": 6.199658703071672, + "grad_norm": 0.09595012131083468, + "learning_rate": 6.2873608671373665e-06, + "loss": 0.8677, + "step": 29064 + }, + { + "epoch": 6.199872013651877, + "grad_norm": 0.09108619625935874, + "learning_rate": 6.284054418259793e-06, + "loss": 0.8709, + "step": 29065 + }, + { + "epoch": 6.200085324232082, + "grad_norm": 0.08292038191788023, + "learning_rate": 6.280748803479384e-06, + "loss": 0.8847, + "step": 29066 + }, + { + "epoch": 6.200298634812286, + "grad_norm": 0.08651183239548299, + "learning_rate": 6.277444022833523e-06, + "loss": 0.8563, + "step": 29067 + }, + { + "epoch": 6.200511945392491, + "grad_norm": 0.10041316075903213, + "learning_rate": 6.2741400763596475e-06, + "loss": 0.8871, + "step": 29068 + }, + { + "epoch": 6.200725255972697, + "grad_norm": 0.08108011405964213, + "learning_rate": 6.2708369640951125e-06, + "loss": 0.8888, + "step": 29069 + }, + { + "epoch": 6.200938566552901, + "grad_norm": 0.08508319310001675, + "learning_rate": 6.267534686077295e-06, + "loss": 0.8959, + "step": 29070 + }, + { + "epoch": 6.201151877133106, + "grad_norm": 0.08577450979143333, + "learning_rate": 6.264233242343549e-06, + "loss": 0.8628, + "step": 29071 + }, + { + "epoch": 6.201365187713311, + "grad_norm": 0.08884354593171424, + "learning_rate": 6.2609326329312605e-06, + "loss": 0.8875, + "step": 29072 + }, + { + "epoch": 6.201578498293515, + "grad_norm": 0.08750370784475488, + "learning_rate": 6.2576328578777575e-06, + "loss": 0.8963, + "step": 29073 + }, + { + "epoch": 6.20179180887372, + "grad_norm": 0.08722960587927066, + "learning_rate": 6.25433391722039e-06, + "loss": 0.867, + "step": 29074 + }, + { + "epoch": 6.202005119453925, + "grad_norm": 0.08345931929904686, + "learning_rate": 6.2510358109964505e-06, + "loss": 0.8742, + "step": 29075 + }, + { + "epoch": 6.202218430034129, + "grad_norm": 0.0894797227476888, + "learning_rate": 6.247738539243298e-06, + "loss": 0.8953, + "step": 29076 + }, + { + "epoch": 6.202431740614334, + "grad_norm": 0.09225589923153947, + "learning_rate": 6.244442101998225e-06, + "loss": 0.9027, + "step": 29077 + }, + { + "epoch": 6.202645051194539, + "grad_norm": 0.08905672902961854, + "learning_rate": 6.241146499298527e-06, + "loss": 0.8718, + "step": 29078 + }, + { + "epoch": 6.202858361774744, + "grad_norm": 0.10643286539442295, + "learning_rate": 6.237851731181481e-06, + "loss": 0.8797, + "step": 29079 + }, + { + "epoch": 6.203071672354949, + "grad_norm": 0.08642210703906225, + "learning_rate": 6.2345577976844e-06, + "loss": 0.9193, + "step": 29080 + }, + { + "epoch": 6.203284982935154, + "grad_norm": 0.08717941513959544, + "learning_rate": 6.231264698844541e-06, + "loss": 0.8842, + "step": 29081 + }, + { + "epoch": 6.203498293515358, + "grad_norm": 0.08762642548351358, + "learning_rate": 6.227972434699156e-06, + "loss": 0.8828, + "step": 29082 + }, + { + "epoch": 6.203711604095563, + "grad_norm": 0.1035917424107355, + "learning_rate": 6.224681005285495e-06, + "loss": 0.8932, + "step": 29083 + }, + { + "epoch": 6.203924914675768, + "grad_norm": 0.08288081093383856, + "learning_rate": 6.221390410640826e-06, + "loss": 0.8786, + "step": 29084 + }, + { + "epoch": 6.2041382252559725, + "grad_norm": 0.08171976289895941, + "learning_rate": 6.218100650802355e-06, + "loss": 0.8964, + "step": 29085 + }, + { + "epoch": 6.204351535836177, + "grad_norm": 0.10081823591624223, + "learning_rate": 6.2148117258073396e-06, + "loss": 0.8728, + "step": 29086 + }, + { + "epoch": 6.204564846416382, + "grad_norm": 0.09155088320937463, + "learning_rate": 6.2115236356929424e-06, + "loss": 0.8727, + "step": 29087 + }, + { + "epoch": 6.204778156996587, + "grad_norm": 0.08481096905996187, + "learning_rate": 6.208236380496404e-06, + "loss": 0.8923, + "step": 29088 + }, + { + "epoch": 6.204991467576792, + "grad_norm": 0.10288217147427141, + "learning_rate": 6.204949960254913e-06, + "loss": 0.8701, + "step": 29089 + }, + { + "epoch": 6.205204778156997, + "grad_norm": 0.09079915158777123, + "learning_rate": 6.2016643750056585e-06, + "loss": 0.8866, + "step": 29090 + }, + { + "epoch": 6.2054180887372015, + "grad_norm": 0.08951911867399007, + "learning_rate": 6.1983796247858e-06, + "loss": 0.8495, + "step": 29091 + }, + { + "epoch": 6.205631399317406, + "grad_norm": 0.0905970827907074, + "learning_rate": 6.195095709632526e-06, + "loss": 0.8683, + "step": 29092 + }, + { + "epoch": 6.205844709897611, + "grad_norm": 0.09525223238863785, + "learning_rate": 6.19181262958299e-06, + "loss": 0.9009, + "step": 29093 + }, + { + "epoch": 6.2060580204778155, + "grad_norm": 0.08417261990660777, + "learning_rate": 6.188530384674334e-06, + "loss": 0.8493, + "step": 29094 + }, + { + "epoch": 6.20627133105802, + "grad_norm": 0.08864093565800178, + "learning_rate": 6.185248974943685e-06, + "loss": 0.8688, + "step": 29095 + }, + { + "epoch": 6.206484641638225, + "grad_norm": 0.09858976790839905, + "learning_rate": 6.1819684004281955e-06, + "loss": 0.8624, + "step": 29096 + }, + { + "epoch": 6.2066979522184305, + "grad_norm": 0.0892504685258758, + "learning_rate": 6.178688661164991e-06, + "loss": 0.8811, + "step": 29097 + }, + { + "epoch": 6.206911262798635, + "grad_norm": 0.09528145023002213, + "learning_rate": 6.175409757191153e-06, + "loss": 0.9254, + "step": 29098 + }, + { + "epoch": 6.20712457337884, + "grad_norm": 0.09426870686542149, + "learning_rate": 6.1721316885437896e-06, + "loss": 0.8742, + "step": 29099 + }, + { + "epoch": 6.2073378839590445, + "grad_norm": 0.08832665162808775, + "learning_rate": 6.16885445526e-06, + "loss": 0.872, + "step": 29100 + }, + { + "epoch": 6.207551194539249, + "grad_norm": 0.0885919034191408, + "learning_rate": 6.165578057376866e-06, + "loss": 0.904, + "step": 29101 + }, + { + "epoch": 6.207764505119454, + "grad_norm": 0.08380595274932814, + "learning_rate": 6.16230249493146e-06, + "loss": 0.9049, + "step": 29102 + }, + { + "epoch": 6.207977815699659, + "grad_norm": 0.08736230849780668, + "learning_rate": 6.159027767960837e-06, + "loss": 0.8644, + "step": 29103 + }, + { + "epoch": 6.208191126279863, + "grad_norm": 0.08550265492324574, + "learning_rate": 6.15575387650206e-06, + "loss": 0.8834, + "step": 29104 + }, + { + "epoch": 6.208404436860068, + "grad_norm": 0.08606732455907613, + "learning_rate": 6.1524808205921746e-06, + "loss": 0.8802, + "step": 29105 + }, + { + "epoch": 6.208617747440273, + "grad_norm": 0.09073251352443623, + "learning_rate": 6.149208600268211e-06, + "loss": 0.8768, + "step": 29106 + }, + { + "epoch": 6.208831058020478, + "grad_norm": 0.09153019794607746, + "learning_rate": 6.145937215567177e-06, + "loss": 0.8859, + "step": 29107 + }, + { + "epoch": 6.209044368600683, + "grad_norm": 0.0895862381430109, + "learning_rate": 6.142666666526138e-06, + "loss": 0.8609, + "step": 29108 + }, + { + "epoch": 6.209257679180888, + "grad_norm": 0.09415317396278482, + "learning_rate": 6.1393969531820506e-06, + "loss": 0.8506, + "step": 29109 + }, + { + "epoch": 6.209470989761092, + "grad_norm": 0.08666728938708829, + "learning_rate": 6.136128075571926e-06, + "loss": 0.8505, + "step": 29110 + }, + { + "epoch": 6.209684300341297, + "grad_norm": 0.07678088007210479, + "learning_rate": 6.132860033732746e-06, + "loss": 0.868, + "step": 29111 + }, + { + "epoch": 6.209897610921502, + "grad_norm": 0.08690986798763617, + "learning_rate": 6.129592827701514e-06, + "loss": 0.9115, + "step": 29112 + }, + { + "epoch": 6.210110921501706, + "grad_norm": 0.08431889636920925, + "learning_rate": 6.126326457515177e-06, + "loss": 0.8992, + "step": 29113 + }, + { + "epoch": 6.210324232081911, + "grad_norm": 0.09292808282338975, + "learning_rate": 6.123060923210693e-06, + "loss": 0.89, + "step": 29114 + }, + { + "epoch": 6.210537542662116, + "grad_norm": 0.09085183582728117, + "learning_rate": 6.11979622482501e-06, + "loss": 0.8663, + "step": 29115 + }, + { + "epoch": 6.210750853242321, + "grad_norm": 0.08522026288579661, + "learning_rate": 6.1165323623950935e-06, + "loss": 0.8905, + "step": 29116 + }, + { + "epoch": 6.210964163822526, + "grad_norm": 0.08962292851042357, + "learning_rate": 6.1132693359578475e-06, + "loss": 0.8813, + "step": 29117 + }, + { + "epoch": 6.211177474402731, + "grad_norm": 0.09640641697345342, + "learning_rate": 6.110007145550203e-06, + "loss": 0.8749, + "step": 29118 + }, + { + "epoch": 6.211390784982935, + "grad_norm": 0.09737488143001327, + "learning_rate": 6.106745791209081e-06, + "loss": 0.8779, + "step": 29119 + }, + { + "epoch": 6.21160409556314, + "grad_norm": 0.08246349574294318, + "learning_rate": 6.103485272971359e-06, + "loss": 0.8888, + "step": 29120 + }, + { + "epoch": 6.211817406143345, + "grad_norm": 0.08547875640846725, + "learning_rate": 6.1002255908739495e-06, + "loss": 0.8821, + "step": 29121 + }, + { + "epoch": 6.212030716723549, + "grad_norm": 0.08967257085200443, + "learning_rate": 6.096966744953739e-06, + "loss": 0.8696, + "step": 29122 + }, + { + "epoch": 6.212244027303754, + "grad_norm": 0.11369107904852206, + "learning_rate": 6.093708735247568e-06, + "loss": 0.8857, + "step": 29123 + }, + { + "epoch": 6.212457337883959, + "grad_norm": 0.09040951012637016, + "learning_rate": 6.090451561792341e-06, + "loss": 0.9054, + "step": 29124 + }, + { + "epoch": 6.212670648464163, + "grad_norm": 0.08963627115775157, + "learning_rate": 6.0871952246249e-06, + "loss": 0.8771, + "step": 29125 + }, + { + "epoch": 6.212883959044369, + "grad_norm": 0.09804994816538673, + "learning_rate": 6.083939723782086e-06, + "loss": 0.8897, + "step": 29126 + }, + { + "epoch": 6.213097269624574, + "grad_norm": 0.10839197127362847, + "learning_rate": 6.0806850593007325e-06, + "loss": 0.8865, + "step": 29127 + }, + { + "epoch": 6.213310580204778, + "grad_norm": 0.0939618988200617, + "learning_rate": 6.077431231217672e-06, + "loss": 0.9003, + "step": 29128 + }, + { + "epoch": 6.213523890784983, + "grad_norm": 0.08126291195485578, + "learning_rate": 6.074178239569728e-06, + "loss": 0.8614, + "step": 29129 + }, + { + "epoch": 6.213737201365188, + "grad_norm": 0.10427071135587954, + "learning_rate": 6.070926084393698e-06, + "loss": 0.8722, + "step": 29130 + }, + { + "epoch": 6.213950511945392, + "grad_norm": 0.10381464781305837, + "learning_rate": 6.067674765726388e-06, + "loss": 0.8813, + "step": 29131 + }, + { + "epoch": 6.214163822525597, + "grad_norm": 0.08807909069306478, + "learning_rate": 6.064424283604577e-06, + "loss": 0.8567, + "step": 29132 + }, + { + "epoch": 6.214377133105802, + "grad_norm": 0.08389457608925109, + "learning_rate": 6.061174638065046e-06, + "loss": 0.8789, + "step": 29133 + }, + { + "epoch": 6.214590443686006, + "grad_norm": 0.10138919465489314, + "learning_rate": 6.057925829144573e-06, + "loss": 0.8878, + "step": 29134 + }, + { + "epoch": 6.214803754266212, + "grad_norm": 0.11003592828985576, + "learning_rate": 6.0546778568798935e-06, + "loss": 0.902, + "step": 29135 + }, + { + "epoch": 6.215017064846417, + "grad_norm": 0.0840766091677533, + "learning_rate": 6.051430721307796e-06, + "loss": 0.8774, + "step": 29136 + }, + { + "epoch": 6.215230375426621, + "grad_norm": 0.08570421499864467, + "learning_rate": 6.048184422465006e-06, + "loss": 0.8724, + "step": 29137 + }, + { + "epoch": 6.215443686006826, + "grad_norm": 0.10039181935358911, + "learning_rate": 6.044938960388251e-06, + "loss": 0.8645, + "step": 29138 + }, + { + "epoch": 6.215656996587031, + "grad_norm": 0.0962496497759143, + "learning_rate": 6.041694335114239e-06, + "loss": 0.8863, + "step": 29139 + }, + { + "epoch": 6.215870307167235, + "grad_norm": 0.0854493778243445, + "learning_rate": 6.038450546679722e-06, + "loss": 0.8526, + "step": 29140 + }, + { + "epoch": 6.21608361774744, + "grad_norm": 0.08317564423122094, + "learning_rate": 6.035207595121373e-06, + "loss": 0.9086, + "step": 29141 + }, + { + "epoch": 6.216296928327645, + "grad_norm": 0.08849425155017163, + "learning_rate": 6.031965480475892e-06, + "loss": 0.8496, + "step": 29142 + }, + { + "epoch": 6.2165102389078495, + "grad_norm": 0.10133428842634744, + "learning_rate": 6.02872420277997e-06, + "loss": 0.8642, + "step": 29143 + }, + { + "epoch": 6.216723549488055, + "grad_norm": 0.09582850693257326, + "learning_rate": 6.025483762070279e-06, + "loss": 0.8683, + "step": 29144 + }, + { + "epoch": 6.21693686006826, + "grad_norm": 0.09597851264385884, + "learning_rate": 6.022244158383483e-06, + "loss": 0.8888, + "step": 29145 + }, + { + "epoch": 6.217150170648464, + "grad_norm": 0.09139736642268266, + "learning_rate": 6.019005391756239e-06, + "loss": 0.8635, + "step": 29146 + }, + { + "epoch": 6.217363481228669, + "grad_norm": 0.10669367348205185, + "learning_rate": 6.015767462225173e-06, + "loss": 0.8551, + "step": 29147 + }, + { + "epoch": 6.217576791808874, + "grad_norm": 0.10073732785930851, + "learning_rate": 6.012530369826959e-06, + "loss": 0.8986, + "step": 29148 + }, + { + "epoch": 6.2177901023890785, + "grad_norm": 0.08026467726698124, + "learning_rate": 6.009294114598217e-06, + "loss": 0.8714, + "step": 29149 + }, + { + "epoch": 6.218003412969283, + "grad_norm": 0.0891720672524792, + "learning_rate": 6.006058696575547e-06, + "loss": 0.8741, + "step": 29150 + }, + { + "epoch": 6.218216723549488, + "grad_norm": 0.08860340814906084, + "learning_rate": 6.002824115795563e-06, + "loss": 0.8555, + "step": 29151 + }, + { + "epoch": 6.2184300341296925, + "grad_norm": 0.08766557776531668, + "learning_rate": 5.999590372294872e-06, + "loss": 0.8757, + "step": 29152 + }, + { + "epoch": 6.218643344709897, + "grad_norm": 0.0895493583535862, + "learning_rate": 5.996357466110079e-06, + "loss": 0.8825, + "step": 29153 + }, + { + "epoch": 6.218856655290103, + "grad_norm": 0.09282198741214713, + "learning_rate": 5.993125397277722e-06, + "loss": 0.8923, + "step": 29154 + }, + { + "epoch": 6.2190699658703075, + "grad_norm": 0.09126844516701332, + "learning_rate": 5.989894165834402e-06, + "loss": 0.8517, + "step": 29155 + }, + { + "epoch": 6.219283276450512, + "grad_norm": 0.08554064048627978, + "learning_rate": 5.9866637718166784e-06, + "loss": 0.9016, + "step": 29156 + }, + { + "epoch": 6.219496587030717, + "grad_norm": 0.08878527332149659, + "learning_rate": 5.983434215261099e-06, + "loss": 0.8961, + "step": 29157 + }, + { + "epoch": 6.2197098976109215, + "grad_norm": 0.08823835833822478, + "learning_rate": 5.980205496204212e-06, + "loss": 0.8896, + "step": 29158 + }, + { + "epoch": 6.219923208191126, + "grad_norm": 0.08810915390514246, + "learning_rate": 5.976977614682522e-06, + "loss": 0.9036, + "step": 29159 + }, + { + "epoch": 6.220136518771331, + "grad_norm": 0.09234111206082418, + "learning_rate": 5.973750570732595e-06, + "loss": 0.884, + "step": 29160 + }, + { + "epoch": 6.220349829351536, + "grad_norm": 0.08588571288050441, + "learning_rate": 5.970524364390926e-06, + "loss": 0.8506, + "step": 29161 + }, + { + "epoch": 6.22056313993174, + "grad_norm": 0.09006105200359099, + "learning_rate": 5.967298995694011e-06, + "loss": 0.8679, + "step": 29162 + }, + { + "epoch": 6.220776450511945, + "grad_norm": 0.08883934030696007, + "learning_rate": 5.9640744646783445e-06, + "loss": 0.8821, + "step": 29163 + }, + { + "epoch": 6.2209897610921505, + "grad_norm": 0.09177650778769465, + "learning_rate": 5.9608507713804485e-06, + "loss": 0.8659, + "step": 29164 + }, + { + "epoch": 6.221203071672355, + "grad_norm": 0.0822828533854029, + "learning_rate": 5.957627915836765e-06, + "loss": 0.8427, + "step": 29165 + }, + { + "epoch": 6.22141638225256, + "grad_norm": 0.09381335985381892, + "learning_rate": 5.954405898083746e-06, + "loss": 0.9028, + "step": 29166 + }, + { + "epoch": 6.221629692832765, + "grad_norm": 0.09658943644856889, + "learning_rate": 5.9511847181578765e-06, + "loss": 0.8944, + "step": 29167 + }, + { + "epoch": 6.221843003412969, + "grad_norm": 0.0856424022616284, + "learning_rate": 5.947964376095607e-06, + "loss": 0.8743, + "step": 29168 + }, + { + "epoch": 6.222056313993174, + "grad_norm": 0.09280888018554366, + "learning_rate": 5.944744871933363e-06, + "loss": 0.8574, + "step": 29169 + }, + { + "epoch": 6.222269624573379, + "grad_norm": 0.099982353949314, + "learning_rate": 5.941526205707577e-06, + "loss": 0.8862, + "step": 29170 + }, + { + "epoch": 6.222482935153583, + "grad_norm": 0.08411633335037522, + "learning_rate": 5.938308377454656e-06, + "loss": 0.8665, + "step": 29171 + }, + { + "epoch": 6.222696245733788, + "grad_norm": 0.08281923854034072, + "learning_rate": 5.935091387211032e-06, + "loss": 0.8969, + "step": 29172 + }, + { + "epoch": 6.222909556313994, + "grad_norm": 0.09233226113189275, + "learning_rate": 5.931875235013103e-06, + "loss": 0.8743, + "step": 29173 + }, + { + "epoch": 6.223122866894198, + "grad_norm": 0.09096831695654502, + "learning_rate": 5.928659920897239e-06, + "loss": 0.9082, + "step": 29174 + }, + { + "epoch": 6.223336177474403, + "grad_norm": 0.09029343573416164, + "learning_rate": 5.92544544489984e-06, + "loss": 0.861, + "step": 29175 + }, + { + "epoch": 6.223549488054608, + "grad_norm": 0.08623607699610503, + "learning_rate": 5.922231807057275e-06, + "loss": 0.8795, + "step": 29176 + }, + { + "epoch": 6.223762798634812, + "grad_norm": 0.08875192501904071, + "learning_rate": 5.919019007405897e-06, + "loss": 0.8669, + "step": 29177 + }, + { + "epoch": 6.223976109215017, + "grad_norm": 0.09550895347239374, + "learning_rate": 5.915807045982052e-06, + "loss": 0.9136, + "step": 29178 + }, + { + "epoch": 6.224189419795222, + "grad_norm": 0.09071440927951305, + "learning_rate": 5.912595922822108e-06, + "loss": 0.913, + "step": 29179 + }, + { + "epoch": 6.224402730375426, + "grad_norm": 0.09180006073214174, + "learning_rate": 5.909385637962395e-06, + "loss": 0.8539, + "step": 29180 + }, + { + "epoch": 6.224616040955631, + "grad_norm": 0.08765544585577381, + "learning_rate": 5.906176191439228e-06, + "loss": 0.9131, + "step": 29181 + }, + { + "epoch": 6.224829351535837, + "grad_norm": 0.09423438000607899, + "learning_rate": 5.902967583288917e-06, + "loss": 0.8864, + "step": 29182 + }, + { + "epoch": 6.225042662116041, + "grad_norm": 0.08281199385608798, + "learning_rate": 5.89975981354777e-06, + "loss": 0.8925, + "step": 29183 + }, + { + "epoch": 6.225255972696246, + "grad_norm": 0.08304307904193686, + "learning_rate": 5.896552882252096e-06, + "loss": 0.8784, + "step": 29184 + }, + { + "epoch": 6.225469283276451, + "grad_norm": 0.08407377461764796, + "learning_rate": 5.893346789438168e-06, + "loss": 0.8571, + "step": 29185 + }, + { + "epoch": 6.225682593856655, + "grad_norm": 0.07966619788968288, + "learning_rate": 5.890141535142277e-06, + "loss": 0.8486, + "step": 29186 + }, + { + "epoch": 6.22589590443686, + "grad_norm": 0.09061796919000263, + "learning_rate": 5.886937119400671e-06, + "loss": 0.9126, + "step": 29187 + }, + { + "epoch": 6.226109215017065, + "grad_norm": 0.0830192616917367, + "learning_rate": 5.8837335422496215e-06, + "loss": 0.8587, + "step": 29188 + }, + { + "epoch": 6.226322525597269, + "grad_norm": 0.09437983908337716, + "learning_rate": 5.880530803725375e-06, + "loss": 0.8683, + "step": 29189 + }, + { + "epoch": 6.226535836177474, + "grad_norm": 0.08087947803853117, + "learning_rate": 5.877328903864162e-06, + "loss": 0.8475, + "step": 29190 + }, + { + "epoch": 6.226749146757679, + "grad_norm": 0.09006981518622242, + "learning_rate": 5.874127842702226e-06, + "loss": 0.8988, + "step": 29191 + }, + { + "epoch": 6.226962457337884, + "grad_norm": 0.07964877579721141, + "learning_rate": 5.870927620275782e-06, + "loss": 0.8507, + "step": 29192 + }, + { + "epoch": 6.227175767918089, + "grad_norm": 0.08948117020079102, + "learning_rate": 5.867728236621038e-06, + "loss": 0.9024, + "step": 29193 + }, + { + "epoch": 6.227389078498294, + "grad_norm": 0.08772330070054114, + "learning_rate": 5.864529691774188e-06, + "loss": 0.8642, + "step": 29194 + }, + { + "epoch": 6.227602389078498, + "grad_norm": 0.0849954069678778, + "learning_rate": 5.8613319857714255e-06, + "loss": 0.8909, + "step": 29195 + }, + { + "epoch": 6.227815699658703, + "grad_norm": 0.08218684954513039, + "learning_rate": 5.858135118648953e-06, + "loss": 0.8734, + "step": 29196 + }, + { + "epoch": 6.228029010238908, + "grad_norm": 0.07849973755583929, + "learning_rate": 5.854939090442946e-06, + "loss": 0.8704, + "step": 29197 + }, + { + "epoch": 6.228242320819112, + "grad_norm": 0.08325791554735977, + "learning_rate": 5.851743901189517e-06, + "loss": 0.8874, + "step": 29198 + }, + { + "epoch": 6.228455631399317, + "grad_norm": 0.08776651520042578, + "learning_rate": 5.848549550924869e-06, + "loss": 0.8778, + "step": 29199 + }, + { + "epoch": 6.228668941979522, + "grad_norm": 0.08571229400445156, + "learning_rate": 5.845356039685133e-06, + "loss": 0.8923, + "step": 29200 + }, + { + "epoch": 6.228882252559727, + "grad_norm": 0.10212498827284625, + "learning_rate": 5.84216336750643e-06, + "loss": 0.8662, + "step": 29201 + }, + { + "epoch": 6.229095563139932, + "grad_norm": 0.09935335653732562, + "learning_rate": 5.8389715344248935e-06, + "loss": 0.8609, + "step": 29202 + }, + { + "epoch": 6.229308873720137, + "grad_norm": 0.07928031295700055, + "learning_rate": 5.835780540476652e-06, + "loss": 0.8704, + "step": 29203 + }, + { + "epoch": 6.229522184300341, + "grad_norm": 0.09220998494500168, + "learning_rate": 5.832590385697794e-06, + "loss": 0.8797, + "step": 29204 + }, + { + "epoch": 6.229735494880546, + "grad_norm": 0.0863218942558301, + "learning_rate": 5.829401070124432e-06, + "loss": 0.8844, + "step": 29205 + }, + { + "epoch": 6.229948805460751, + "grad_norm": 0.08850775741526454, + "learning_rate": 5.826212593792644e-06, + "loss": 0.8766, + "step": 29206 + }, + { + "epoch": 6.2301621160409555, + "grad_norm": 0.08053902305537104, + "learning_rate": 5.82302495673849e-06, + "loss": 0.879, + "step": 29207 + }, + { + "epoch": 6.23037542662116, + "grad_norm": 0.08772808896049294, + "learning_rate": 5.8198381589980745e-06, + "loss": 0.8713, + "step": 29208 + }, + { + "epoch": 6.230588737201365, + "grad_norm": 0.08915186457109457, + "learning_rate": 5.816652200607449e-06, + "loss": 0.8623, + "step": 29209 + }, + { + "epoch": 6.2308020477815695, + "grad_norm": 0.09416813976959464, + "learning_rate": 5.813467081602628e-06, + "loss": 0.9151, + "step": 29210 + }, + { + "epoch": 6.231015358361775, + "grad_norm": 0.09234226134656882, + "learning_rate": 5.810282802019682e-06, + "loss": 0.8866, + "step": 29211 + }, + { + "epoch": 6.23122866894198, + "grad_norm": 0.08629873507430337, + "learning_rate": 5.807099361894635e-06, + "loss": 0.8774, + "step": 29212 + }, + { + "epoch": 6.2314419795221845, + "grad_norm": 0.08671219173099692, + "learning_rate": 5.803916761263511e-06, + "loss": 0.8903, + "step": 29213 + }, + { + "epoch": 6.231655290102389, + "grad_norm": 0.08260543399041476, + "learning_rate": 5.800735000162298e-06, + "loss": 0.9003, + "step": 29214 + }, + { + "epoch": 6.231868600682594, + "grad_norm": 0.08316893674652888, + "learning_rate": 5.797554078627032e-06, + "loss": 0.8732, + "step": 29215 + }, + { + "epoch": 6.2320819112627985, + "grad_norm": 0.08154430455267933, + "learning_rate": 5.794373996693692e-06, + "loss": 0.8499, + "step": 29216 + }, + { + "epoch": 6.232295221843003, + "grad_norm": 0.08487181091016131, + "learning_rate": 5.791194754398257e-06, + "loss": 0.8913, + "step": 29217 + }, + { + "epoch": 6.232508532423208, + "grad_norm": 0.08363110455059238, + "learning_rate": 5.788016351776709e-06, + "loss": 0.8825, + "step": 29218 + }, + { + "epoch": 6.232721843003413, + "grad_norm": 0.08188855184921812, + "learning_rate": 5.784838788864981e-06, + "loss": 0.8662, + "step": 29219 + }, + { + "epoch": 6.232935153583618, + "grad_norm": 0.09215839893245246, + "learning_rate": 5.781662065699074e-06, + "loss": 0.8827, + "step": 29220 + }, + { + "epoch": 6.233148464163823, + "grad_norm": 0.08554056104255571, + "learning_rate": 5.778486182314904e-06, + "loss": 0.8902, + "step": 29221 + }, + { + "epoch": 6.2333617747440275, + "grad_norm": 0.09110485785339432, + "learning_rate": 5.7753111387483985e-06, + "loss": 0.8944, + "step": 29222 + }, + { + "epoch": 6.233575085324232, + "grad_norm": 0.08868791885773862, + "learning_rate": 5.772136935035502e-06, + "loss": 0.8569, + "step": 29223 + }, + { + "epoch": 6.233788395904437, + "grad_norm": 0.0900265144742736, + "learning_rate": 5.768963571212132e-06, + "loss": 0.88, + "step": 29224 + }, + { + "epoch": 6.234001706484642, + "grad_norm": 0.09456877814049724, + "learning_rate": 5.765791047314188e-06, + "loss": 0.9051, + "step": 29225 + }, + { + "epoch": 6.234215017064846, + "grad_norm": 0.08820939135768746, + "learning_rate": 5.762619363377546e-06, + "loss": 0.8637, + "step": 29226 + }, + { + "epoch": 6.234428327645051, + "grad_norm": 0.09088074560087782, + "learning_rate": 5.759448519438131e-06, + "loss": 0.8944, + "step": 29227 + }, + { + "epoch": 6.234641638225256, + "grad_norm": 0.08361186824292788, + "learning_rate": 5.756278515531808e-06, + "loss": 0.8843, + "step": 29228 + }, + { + "epoch": 6.234854948805461, + "grad_norm": 0.09861043749230616, + "learning_rate": 5.753109351694441e-06, + "loss": 0.8912, + "step": 29229 + }, + { + "epoch": 6.235068259385666, + "grad_norm": 0.09405872742455036, + "learning_rate": 5.749941027961887e-06, + "loss": 0.9138, + "step": 29230 + }, + { + "epoch": 6.235281569965871, + "grad_norm": 0.08574179667607595, + "learning_rate": 5.746773544370001e-06, + "loss": 0.9016, + "step": 29231 + }, + { + "epoch": 6.235494880546075, + "grad_norm": 0.08953643183305603, + "learning_rate": 5.74360690095463e-06, + "loss": 0.8942, + "step": 29232 + }, + { + "epoch": 6.23570819112628, + "grad_norm": 0.09986028289649874, + "learning_rate": 5.7404410977515855e-06, + "loss": 0.8948, + "step": 29233 + }, + { + "epoch": 6.235921501706485, + "grad_norm": 0.0842136427658349, + "learning_rate": 5.737276134796688e-06, + "loss": 0.9095, + "step": 29234 + }, + { + "epoch": 6.236134812286689, + "grad_norm": 0.08898662645585861, + "learning_rate": 5.734112012125774e-06, + "loss": 0.8855, + "step": 29235 + }, + { + "epoch": 6.236348122866894, + "grad_norm": 0.09200315818711038, + "learning_rate": 5.73094872977463e-06, + "loss": 0.8565, + "step": 29236 + }, + { + "epoch": 6.236561433447099, + "grad_norm": 0.08439979309891316, + "learning_rate": 5.727786287779058e-06, + "loss": 0.8754, + "step": 29237 + }, + { + "epoch": 6.236774744027303, + "grad_norm": 0.08898012591122037, + "learning_rate": 5.724624686174807e-06, + "loss": 0.8947, + "step": 29238 + }, + { + "epoch": 6.236988054607509, + "grad_norm": 0.08319363604079928, + "learning_rate": 5.721463924997706e-06, + "loss": 0.8652, + "step": 29239 + }, + { + "epoch": 6.237201365187714, + "grad_norm": 0.08060259450153724, + "learning_rate": 5.718304004283477e-06, + "loss": 0.8536, + "step": 29240 + }, + { + "epoch": 6.237414675767918, + "grad_norm": 0.08502180525086683, + "learning_rate": 5.715144924067888e-06, + "loss": 0.902, + "step": 29241 + }, + { + "epoch": 6.237627986348123, + "grad_norm": 0.09129833061032147, + "learning_rate": 5.711986684386697e-06, + "loss": 0.883, + "step": 29242 + }, + { + "epoch": 6.237841296928328, + "grad_norm": 0.08442035194027878, + "learning_rate": 5.708829285275617e-06, + "loss": 0.8792, + "step": 29243 + }, + { + "epoch": 6.238054607508532, + "grad_norm": 0.09519981429908261, + "learning_rate": 5.70567272677038e-06, + "loss": 0.8841, + "step": 29244 + }, + { + "epoch": 6.238267918088737, + "grad_norm": 0.08779597807435682, + "learning_rate": 5.702517008906707e-06, + "loss": 0.8845, + "step": 29245 + }, + { + "epoch": 6.238481228668942, + "grad_norm": 0.07866919917889838, + "learning_rate": 5.6993621317202965e-06, + "loss": 0.8916, + "step": 29246 + }, + { + "epoch": 6.238694539249146, + "grad_norm": 0.08699192472823174, + "learning_rate": 5.69620809524686e-06, + "loss": 0.9004, + "step": 29247 + }, + { + "epoch": 6.238907849829351, + "grad_norm": 0.08900700066464476, + "learning_rate": 5.6930548995220854e-06, + "loss": 0.8457, + "step": 29248 + }, + { + "epoch": 6.239121160409557, + "grad_norm": 0.08627531281233865, + "learning_rate": 5.689902544581633e-06, + "loss": 0.867, + "step": 29249 + }, + { + "epoch": 6.239334470989761, + "grad_norm": 0.09711388768118669, + "learning_rate": 5.6867510304611814e-06, + "loss": 0.8483, + "step": 29250 + }, + { + "epoch": 6.239547781569966, + "grad_norm": 0.0823056520223198, + "learning_rate": 5.6836003571963994e-06, + "loss": 0.856, + "step": 29251 + }, + { + "epoch": 6.239761092150171, + "grad_norm": 0.08224215892946211, + "learning_rate": 5.6804505248229295e-06, + "loss": 0.8729, + "step": 29252 + }, + { + "epoch": 6.239974402730375, + "grad_norm": 0.08517255398575231, + "learning_rate": 5.677301533376432e-06, + "loss": 0.9016, + "step": 29253 + }, + { + "epoch": 6.24018771331058, + "grad_norm": 0.08278049002568558, + "learning_rate": 5.674153382892487e-06, + "loss": 0.8778, + "step": 29254 + }, + { + "epoch": 6.240401023890785, + "grad_norm": 0.08468921249037745, + "learning_rate": 5.671006073406755e-06, + "loss": 0.8861, + "step": 29255 + }, + { + "epoch": 6.2406143344709895, + "grad_norm": 0.08120133314410159, + "learning_rate": 5.667859604954844e-06, + "loss": 0.8707, + "step": 29256 + }, + { + "epoch": 6.240827645051194, + "grad_norm": 0.08532661332841468, + "learning_rate": 5.6647139775723515e-06, + "loss": 0.8835, + "step": 29257 + }, + { + "epoch": 6.2410409556314, + "grad_norm": 0.0954449133879539, + "learning_rate": 5.661569191294858e-06, + "loss": 0.9158, + "step": 29258 + }, + { + "epoch": 6.241254266211604, + "grad_norm": 0.08300736535615057, + "learning_rate": 5.65842524615797e-06, + "loss": 0.8735, + "step": 29259 + }, + { + "epoch": 6.241467576791809, + "grad_norm": 0.08780508135260137, + "learning_rate": 5.65528214219726e-06, + "loss": 0.8894, + "step": 29260 + }, + { + "epoch": 6.241680887372014, + "grad_norm": 0.08517285497010611, + "learning_rate": 5.652139879448273e-06, + "loss": 0.8575, + "step": 29261 + }, + { + "epoch": 6.2418941979522184, + "grad_norm": 0.08582829684197087, + "learning_rate": 5.648998457946562e-06, + "loss": 0.8357, + "step": 29262 + }, + { + "epoch": 6.242107508532423, + "grad_norm": 0.09067134650721977, + "learning_rate": 5.645857877727699e-06, + "loss": 0.8981, + "step": 29263 + }, + { + "epoch": 6.242320819112628, + "grad_norm": 0.08980614160482205, + "learning_rate": 5.642718138827201e-06, + "loss": 0.8874, + "step": 29264 + }, + { + "epoch": 6.2425341296928325, + "grad_norm": 0.07998030213090279, + "learning_rate": 5.639579241280615e-06, + "loss": 0.8943, + "step": 29265 + }, + { + "epoch": 6.242747440273037, + "grad_norm": 0.09250589605231774, + "learning_rate": 5.636441185123423e-06, + "loss": 0.8571, + "step": 29266 + }, + { + "epoch": 6.242960750853243, + "grad_norm": 0.09758591586815854, + "learning_rate": 5.633303970391151e-06, + "loss": 0.8557, + "step": 29267 + }, + { + "epoch": 6.243174061433447, + "grad_norm": 0.09050653179528945, + "learning_rate": 5.6301675971193e-06, + "loss": 0.8874, + "step": 29268 + }, + { + "epoch": 6.243387372013652, + "grad_norm": 0.08017870127521368, + "learning_rate": 5.627032065343345e-06, + "loss": 0.9069, + "step": 29269 + }, + { + "epoch": 6.243600682593857, + "grad_norm": 0.09283855155970284, + "learning_rate": 5.623897375098768e-06, + "loss": 0.8972, + "step": 29270 + }, + { + "epoch": 6.2438139931740615, + "grad_norm": 0.08441236292244282, + "learning_rate": 5.620763526421061e-06, + "loss": 0.8774, + "step": 29271 + }, + { + "epoch": 6.244027303754266, + "grad_norm": 0.08112003762185512, + "learning_rate": 5.61763051934566e-06, + "loss": 0.855, + "step": 29272 + }, + { + "epoch": 6.244240614334471, + "grad_norm": 0.08187331776103947, + "learning_rate": 5.6144983539080156e-06, + "loss": 0.885, + "step": 29273 + }, + { + "epoch": 6.2444539249146755, + "grad_norm": 0.08246911017793891, + "learning_rate": 5.611367030143573e-06, + "loss": 0.8661, + "step": 29274 + }, + { + "epoch": 6.24466723549488, + "grad_norm": 0.09136632344209462, + "learning_rate": 5.608236548087762e-06, + "loss": 0.8523, + "step": 29275 + }, + { + "epoch": 6.244880546075085, + "grad_norm": 0.08624880268383806, + "learning_rate": 5.60510690777603e-06, + "loss": 0.8815, + "step": 29276 + }, + { + "epoch": 6.2450938566552905, + "grad_norm": 0.07944498986911566, + "learning_rate": 5.601978109243744e-06, + "loss": 0.8902, + "step": 29277 + }, + { + "epoch": 6.245307167235495, + "grad_norm": 0.08792907581133293, + "learning_rate": 5.598850152526325e-06, + "loss": 0.9055, + "step": 29278 + }, + { + "epoch": 6.2455204778157, + "grad_norm": 0.1001185410792124, + "learning_rate": 5.5957230376591755e-06, + "loss": 0.8774, + "step": 29279 + }, + { + "epoch": 6.2457337883959045, + "grad_norm": 0.08167981368161803, + "learning_rate": 5.592596764677662e-06, + "loss": 0.9025, + "step": 29280 + }, + { + "epoch": 6.245947098976109, + "grad_norm": 0.08234785796478435, + "learning_rate": 5.58947133361718e-06, + "loss": 0.8837, + "step": 29281 + }, + { + "epoch": 6.246160409556314, + "grad_norm": 0.0858770607656327, + "learning_rate": 5.58634674451306e-06, + "loss": 0.8893, + "step": 29282 + }, + { + "epoch": 6.246373720136519, + "grad_norm": 0.07593396224018138, + "learning_rate": 5.583222997400697e-06, + "loss": 0.8513, + "step": 29283 + }, + { + "epoch": 6.246587030716723, + "grad_norm": 0.08030422571269022, + "learning_rate": 5.580100092315422e-06, + "loss": 0.8916, + "step": 29284 + }, + { + "epoch": 6.246800341296928, + "grad_norm": 0.08426719062827671, + "learning_rate": 5.576978029292557e-06, + "loss": 0.8922, + "step": 29285 + }, + { + "epoch": 6.2470136518771335, + "grad_norm": 0.08479187656419766, + "learning_rate": 5.573856808367426e-06, + "loss": 0.8783, + "step": 29286 + }, + { + "epoch": 6.247226962457338, + "grad_norm": 0.09057908242583726, + "learning_rate": 5.5707364295753876e-06, + "loss": 0.901, + "step": 29287 + }, + { + "epoch": 6.247440273037543, + "grad_norm": 0.07915504005458446, + "learning_rate": 5.567616892951701e-06, + "loss": 0.8856, + "step": 29288 + }, + { + "epoch": 6.247653583617748, + "grad_norm": 0.08327057490009503, + "learning_rate": 5.564498198531683e-06, + "loss": 0.8802, + "step": 29289 + }, + { + "epoch": 6.247866894197952, + "grad_norm": 0.07897307902678685, + "learning_rate": 5.561380346350609e-06, + "loss": 0.8704, + "step": 29290 + }, + { + "epoch": 6.248080204778157, + "grad_norm": 0.08262794050523493, + "learning_rate": 5.5582633364437764e-06, + "loss": 0.8697, + "step": 29291 + }, + { + "epoch": 6.248293515358362, + "grad_norm": 0.08354954717059869, + "learning_rate": 5.555147168846446e-06, + "loss": 0.8912, + "step": 29292 + }, + { + "epoch": 6.248506825938566, + "grad_norm": 0.08827827516449561, + "learning_rate": 5.552031843593879e-06, + "loss": 0.8558, + "step": 29293 + }, + { + "epoch": 6.248720136518771, + "grad_norm": 0.08487276998846315, + "learning_rate": 5.548917360721318e-06, + "loss": 0.8678, + "step": 29294 + }, + { + "epoch": 6.248933447098976, + "grad_norm": 0.07957470396634556, + "learning_rate": 5.545803720264014e-06, + "loss": 0.8784, + "step": 29295 + }, + { + "epoch": 6.249146757679181, + "grad_norm": 0.08325410307059565, + "learning_rate": 5.542690922257201e-06, + "loss": 0.8706, + "step": 29296 + }, + { + "epoch": 6.249360068259386, + "grad_norm": 0.08902309142602927, + "learning_rate": 5.539578966736088e-06, + "loss": 0.916, + "step": 29297 + }, + { + "epoch": 6.249573378839591, + "grad_norm": 0.09875310466762566, + "learning_rate": 5.53646785373589e-06, + "loss": 0.8874, + "step": 29298 + }, + { + "epoch": 6.249786689419795, + "grad_norm": 0.08471175191383892, + "learning_rate": 5.533357583291805e-06, + "loss": 0.8813, + "step": 29299 + }, + { + "epoch": 6.25, + "grad_norm": 0.08620565593017898, + "learning_rate": 5.530248155439042e-06, + "loss": 0.9166, + "step": 29300 + }, + { + "epoch": 6.250213310580205, + "grad_norm": 0.08630824705887057, + "learning_rate": 5.5271395702127715e-06, + "loss": 0.858, + "step": 29301 + }, + { + "epoch": 6.250426621160409, + "grad_norm": 0.09062914388538841, + "learning_rate": 5.5240318276481576e-06, + "loss": 0.8765, + "step": 29302 + }, + { + "epoch": 6.250639931740614, + "grad_norm": 0.08314250819862871, + "learning_rate": 5.52092492778038e-06, + "loss": 0.8696, + "step": 29303 + }, + { + "epoch": 6.250853242320819, + "grad_norm": 0.08162908413825774, + "learning_rate": 5.517818870644602e-06, + "loss": 0.8545, + "step": 29304 + }, + { + "epoch": 6.251066552901024, + "grad_norm": 0.09004579091616362, + "learning_rate": 5.514713656275943e-06, + "loss": 0.8727, + "step": 29305 + }, + { + "epoch": 6.251279863481229, + "grad_norm": 0.09432877969814471, + "learning_rate": 5.511609284709547e-06, + "loss": 0.8956, + "step": 29306 + }, + { + "epoch": 6.251493174061434, + "grad_norm": 0.08471494883214839, + "learning_rate": 5.50850575598056e-06, + "loss": 0.8774, + "step": 29307 + }, + { + "epoch": 6.251706484641638, + "grad_norm": 0.09784090781003257, + "learning_rate": 5.505403070124082e-06, + "loss": 0.8865, + "step": 29308 + }, + { + "epoch": 6.251919795221843, + "grad_norm": 0.10358407949683854, + "learning_rate": 5.502301227175233e-06, + "loss": 0.8576, + "step": 29309 + }, + { + "epoch": 6.252133105802048, + "grad_norm": 0.08284423508654107, + "learning_rate": 5.4992002271690685e-06, + "loss": 0.8576, + "step": 29310 + }, + { + "epoch": 6.252346416382252, + "grad_norm": 0.0956992270168978, + "learning_rate": 5.496100070140724e-06, + "loss": 0.8835, + "step": 29311 + }, + { + "epoch": 6.252559726962457, + "grad_norm": 0.11334878815106753, + "learning_rate": 5.493000756125266e-06, + "loss": 0.8697, + "step": 29312 + }, + { + "epoch": 6.252773037542662, + "grad_norm": 0.08603985108793397, + "learning_rate": 5.489902285157751e-06, + "loss": 0.8599, + "step": 29313 + }, + { + "epoch": 6.252986348122867, + "grad_norm": 0.0926145084775424, + "learning_rate": 5.486804657273226e-06, + "loss": 0.8765, + "step": 29314 + }, + { + "epoch": 6.253199658703072, + "grad_norm": 0.09049179387383852, + "learning_rate": 5.483707872506783e-06, + "loss": 0.9068, + "step": 29315 + }, + { + "epoch": 6.253412969283277, + "grad_norm": 0.09245715092126869, + "learning_rate": 5.480611930893434e-06, + "loss": 0.8902, + "step": 29316 + }, + { + "epoch": 6.253626279863481, + "grad_norm": 0.07965791744479098, + "learning_rate": 5.477516832468217e-06, + "loss": 0.8741, + "step": 29317 + }, + { + "epoch": 6.253839590443686, + "grad_norm": 0.08390093899734813, + "learning_rate": 5.474422577266137e-06, + "loss": 0.8374, + "step": 29318 + }, + { + "epoch": 6.254052901023891, + "grad_norm": 0.08897795675499591, + "learning_rate": 5.47132916532223e-06, + "loss": 0.8775, + "step": 29319 + }, + { + "epoch": 6.2542662116040955, + "grad_norm": 0.08411462823101976, + "learning_rate": 5.468236596671492e-06, + "loss": 0.8818, + "step": 29320 + }, + { + "epoch": 6.2544795221843, + "grad_norm": 0.16540176309026425, + "learning_rate": 5.465144871348918e-06, + "loss": 0.8718, + "step": 29321 + }, + { + "epoch": 6.254692832764505, + "grad_norm": 0.08467993741974837, + "learning_rate": 5.462053989389455e-06, + "loss": 0.8796, + "step": 29322 + }, + { + "epoch": 6.2549061433447095, + "grad_norm": 0.08662556822634054, + "learning_rate": 5.458963950828127e-06, + "loss": 0.8872, + "step": 29323 + }, + { + "epoch": 6.255119453924915, + "grad_norm": 0.09289322411105766, + "learning_rate": 5.455874755699864e-06, + "loss": 0.895, + "step": 29324 + }, + { + "epoch": 6.25533276450512, + "grad_norm": 0.07752869823823212, + "learning_rate": 5.452786404039643e-06, + "loss": 0.8556, + "step": 29325 + }, + { + "epoch": 6.2555460750853245, + "grad_norm": 0.08595874813405006, + "learning_rate": 5.449698895882378e-06, + "loss": 0.8869, + "step": 29326 + }, + { + "epoch": 6.255759385665529, + "grad_norm": 0.08601467442826467, + "learning_rate": 5.446612231263037e-06, + "loss": 0.8854, + "step": 29327 + }, + { + "epoch": 6.255972696245734, + "grad_norm": 0.08585136225092335, + "learning_rate": 5.443526410216544e-06, + "loss": 0.8612, + "step": 29328 + }, + { + "epoch": 6.2561860068259385, + "grad_norm": 0.08435316584774769, + "learning_rate": 5.440441432777795e-06, + "loss": 0.8607, + "step": 29329 + }, + { + "epoch": 6.256399317406143, + "grad_norm": 0.08560416719058889, + "learning_rate": 5.437357298981685e-06, + "loss": 0.856, + "step": 29330 + }, + { + "epoch": 6.256612627986348, + "grad_norm": 0.08449789129542742, + "learning_rate": 5.434274008863157e-06, + "loss": 0.8746, + "step": 29331 + }, + { + "epoch": 6.256825938566553, + "grad_norm": 0.08624606006743314, + "learning_rate": 5.431191562457079e-06, + "loss": 0.9115, + "step": 29332 + }, + { + "epoch": 6.257039249146757, + "grad_norm": 0.07974843574359974, + "learning_rate": 5.428109959798313e-06, + "loss": 0.8922, + "step": 29333 + }, + { + "epoch": 6.257252559726963, + "grad_norm": 0.08419285955300043, + "learning_rate": 5.425029200921721e-06, + "loss": 0.8648, + "step": 29334 + }, + { + "epoch": 6.2574658703071675, + "grad_norm": 0.08750472601807663, + "learning_rate": 5.421949285862189e-06, + "loss": 0.8706, + "step": 29335 + }, + { + "epoch": 6.257679180887372, + "grad_norm": 0.08276391934409957, + "learning_rate": 5.418870214654561e-06, + "loss": 0.8834, + "step": 29336 + }, + { + "epoch": 6.257892491467577, + "grad_norm": 0.08641564384714007, + "learning_rate": 5.415791987333663e-06, + "loss": 0.8952, + "step": 29337 + }, + { + "epoch": 6.2581058020477816, + "grad_norm": 0.0787316202422787, + "learning_rate": 5.412714603934328e-06, + "loss": 0.8749, + "step": 29338 + }, + { + "epoch": 6.258319112627986, + "grad_norm": 0.0835822795078722, + "learning_rate": 5.409638064491392e-06, + "loss": 0.8791, + "step": 29339 + }, + { + "epoch": 6.258532423208191, + "grad_norm": 0.08020908912474246, + "learning_rate": 5.406562369039661e-06, + "loss": 0.8852, + "step": 29340 + }, + { + "epoch": 6.258745733788396, + "grad_norm": 0.08451174259239849, + "learning_rate": 5.403487517613926e-06, + "loss": 0.9042, + "step": 29341 + }, + { + "epoch": 6.2589590443686, + "grad_norm": 0.08612329049446568, + "learning_rate": 5.400413510248968e-06, + "loss": 0.8812, + "step": 29342 + }, + { + "epoch": 6.259172354948806, + "grad_norm": 0.08718437238046883, + "learning_rate": 5.39734034697962e-06, + "loss": 0.8652, + "step": 29343 + }, + { + "epoch": 6.2593856655290105, + "grad_norm": 0.090895306142095, + "learning_rate": 5.3942680278406035e-06, + "loss": 0.8958, + "step": 29344 + }, + { + "epoch": 6.259598976109215, + "grad_norm": 0.08323942503723049, + "learning_rate": 5.391196552866698e-06, + "loss": 0.8938, + "step": 29345 + }, + { + "epoch": 6.25981228668942, + "grad_norm": 0.08492018127930503, + "learning_rate": 5.388125922092648e-06, + "loss": 0.8496, + "step": 29346 + }, + { + "epoch": 6.260025597269625, + "grad_norm": 0.0918000792566729, + "learning_rate": 5.385056135553219e-06, + "loss": 0.8821, + "step": 29347 + }, + { + "epoch": 6.260238907849829, + "grad_norm": 0.0816028848306835, + "learning_rate": 5.381987193283138e-06, + "loss": 0.8851, + "step": 29348 + }, + { + "epoch": 6.260452218430034, + "grad_norm": 0.08572583790574194, + "learning_rate": 5.378919095317123e-06, + "loss": 0.8661, + "step": 29349 + }, + { + "epoch": 6.260665529010239, + "grad_norm": 0.09061802570318356, + "learning_rate": 5.375851841689885e-06, + "loss": 0.8761, + "step": 29350 + }, + { + "epoch": 6.260878839590443, + "grad_norm": 0.08886323131463097, + "learning_rate": 5.372785432436152e-06, + "loss": 0.8525, + "step": 29351 + }, + { + "epoch": 6.261092150170649, + "grad_norm": 0.08510230731891742, + "learning_rate": 5.369719867590597e-06, + "loss": 0.9083, + "step": 29352 + }, + { + "epoch": 6.261305460750854, + "grad_norm": 0.08253178936711371, + "learning_rate": 5.366655147187931e-06, + "loss": 0.879, + "step": 29353 + }, + { + "epoch": 6.261518771331058, + "grad_norm": 0.08628496790783857, + "learning_rate": 5.3635912712628114e-06, + "loss": 0.8787, + "step": 29354 + }, + { + "epoch": 6.261732081911263, + "grad_norm": 0.08446181669223096, + "learning_rate": 5.360528239849911e-06, + "loss": 0.9135, + "step": 29355 + }, + { + "epoch": 6.261945392491468, + "grad_norm": 0.08553809117079797, + "learning_rate": 5.357466052983888e-06, + "loss": 0.8847, + "step": 29356 + }, + { + "epoch": 6.262158703071672, + "grad_norm": 0.08806856169541002, + "learning_rate": 5.35440471069939e-06, + "loss": 0.8871, + "step": 29357 + }, + { + "epoch": 6.262372013651877, + "grad_norm": 0.08623828543647678, + "learning_rate": 5.351344213031046e-06, + "loss": 0.8796, + "step": 29358 + }, + { + "epoch": 6.262585324232082, + "grad_norm": 0.09991575046179645, + "learning_rate": 5.348284560013514e-06, + "loss": 0.8711, + "step": 29359 + }, + { + "epoch": 6.262798634812286, + "grad_norm": 0.08458195963498441, + "learning_rate": 5.345225751681407e-06, + "loss": 0.9012, + "step": 29360 + }, + { + "epoch": 6.263011945392491, + "grad_norm": 0.08582311663171124, + "learning_rate": 5.3421677880693075e-06, + "loss": 0.8589, + "step": 29361 + }, + { + "epoch": 6.263225255972697, + "grad_norm": 0.08701962866916155, + "learning_rate": 5.33911066921184e-06, + "loss": 0.8876, + "step": 29362 + }, + { + "epoch": 6.263438566552901, + "grad_norm": 0.08561627964545347, + "learning_rate": 5.3360543951435885e-06, + "loss": 0.8771, + "step": 29363 + }, + { + "epoch": 6.263651877133106, + "grad_norm": 0.08705331994258857, + "learning_rate": 5.3329989658991475e-06, + "loss": 0.8721, + "step": 29364 + }, + { + "epoch": 6.263865187713311, + "grad_norm": 0.08982855277768283, + "learning_rate": 5.329944381513077e-06, + "loss": 0.8727, + "step": 29365 + }, + { + "epoch": 6.264078498293515, + "grad_norm": 0.08567614362422125, + "learning_rate": 5.326890642019944e-06, + "loss": 0.8889, + "step": 29366 + }, + { + "epoch": 6.26429180887372, + "grad_norm": 0.08017747767566655, + "learning_rate": 5.3238377474542904e-06, + "loss": 0.8739, + "step": 29367 + }, + { + "epoch": 6.264505119453925, + "grad_norm": 0.08554783809659028, + "learning_rate": 5.320785697850674e-06, + "loss": 0.8638, + "step": 29368 + }, + { + "epoch": 6.264718430034129, + "grad_norm": 0.08652538082942955, + "learning_rate": 5.317734493243629e-06, + "loss": 0.888, + "step": 29369 + }, + { + "epoch": 6.264931740614334, + "grad_norm": 0.09613391788097464, + "learning_rate": 5.31468413366766e-06, + "loss": 0.8735, + "step": 29370 + }, + { + "epoch": 6.265145051194539, + "grad_norm": 0.08129037697089142, + "learning_rate": 5.3116346191572995e-06, + "loss": 0.8625, + "step": 29371 + }, + { + "epoch": 6.265358361774744, + "grad_norm": 0.08382445475285595, + "learning_rate": 5.308585949747063e-06, + "loss": 0.8836, + "step": 29372 + }, + { + "epoch": 6.265571672354949, + "grad_norm": 0.10715025679714199, + "learning_rate": 5.305538125471419e-06, + "loss": 0.8879, + "step": 29373 + }, + { + "epoch": 6.265784982935154, + "grad_norm": 0.09445892244561066, + "learning_rate": 5.302491146364866e-06, + "loss": 0.8909, + "step": 29374 + }, + { + "epoch": 6.265998293515358, + "grad_norm": 0.09803370874336752, + "learning_rate": 5.29944501246189e-06, + "loss": 0.8622, + "step": 29375 + }, + { + "epoch": 6.266211604095563, + "grad_norm": 0.08845781542895101, + "learning_rate": 5.296399723796964e-06, + "loss": 0.8881, + "step": 29376 + }, + { + "epoch": 6.266424914675768, + "grad_norm": 0.09814220454079187, + "learning_rate": 5.293355280404502e-06, + "loss": 0.8642, + "step": 29377 + }, + { + "epoch": 6.2666382252559725, + "grad_norm": 0.09285069811787482, + "learning_rate": 5.2903116823190026e-06, + "loss": 0.8721, + "step": 29378 + }, + { + "epoch": 6.266851535836177, + "grad_norm": 0.09111519338399089, + "learning_rate": 5.287268929574873e-06, + "loss": 0.8956, + "step": 29379 + }, + { + "epoch": 6.267064846416382, + "grad_norm": 0.09316255263085671, + "learning_rate": 5.2842270222065584e-06, + "loss": 0.8868, + "step": 29380 + }, + { + "epoch": 6.267278156996587, + "grad_norm": 0.09432303113856585, + "learning_rate": 5.281185960248474e-06, + "loss": 0.8925, + "step": 29381 + }, + { + "epoch": 6.267491467576792, + "grad_norm": 0.08429957283313934, + "learning_rate": 5.27814574373501e-06, + "loss": 0.8746, + "step": 29382 + }, + { + "epoch": 6.267704778156997, + "grad_norm": 0.09214149322062516, + "learning_rate": 5.275106372700593e-06, + "loss": 0.8697, + "step": 29383 + }, + { + "epoch": 6.2679180887372015, + "grad_norm": 0.08977110458398169, + "learning_rate": 5.272067847179614e-06, + "loss": 0.8761, + "step": 29384 + }, + { + "epoch": 6.268131399317406, + "grad_norm": 0.09182766273182093, + "learning_rate": 5.269030167206444e-06, + "loss": 0.879, + "step": 29385 + }, + { + "epoch": 6.268344709897611, + "grad_norm": 0.08781376438539044, + "learning_rate": 5.265993332815429e-06, + "loss": 0.8602, + "step": 29386 + }, + { + "epoch": 6.2685580204778155, + "grad_norm": 0.08992329733409414, + "learning_rate": 5.2629573440409774e-06, + "loss": 0.8647, + "step": 29387 + }, + { + "epoch": 6.26877133105802, + "grad_norm": 0.09132099395851735, + "learning_rate": 5.259922200917435e-06, + "loss": 0.8756, + "step": 29388 + }, + { + "epoch": 6.268984641638225, + "grad_norm": 0.09006088577176756, + "learning_rate": 5.256887903479104e-06, + "loss": 0.8661, + "step": 29389 + }, + { + "epoch": 6.2691979522184305, + "grad_norm": 0.08668428844899352, + "learning_rate": 5.253854451760356e-06, + "loss": 0.8505, + "step": 29390 + }, + { + "epoch": 6.269411262798635, + "grad_norm": 0.08244934628193454, + "learning_rate": 5.250821845795502e-06, + "loss": 0.8675, + "step": 29391 + }, + { + "epoch": 6.26962457337884, + "grad_norm": 0.08781696916034437, + "learning_rate": 5.24779008561886e-06, + "loss": 0.8801, + "step": 29392 + }, + { + "epoch": 6.2698378839590445, + "grad_norm": 0.08346560440699859, + "learning_rate": 5.244759171264724e-06, + "loss": 0.8935, + "step": 29393 + }, + { + "epoch": 6.270051194539249, + "grad_norm": 0.08630486352904057, + "learning_rate": 5.241729102767386e-06, + "loss": 0.907, + "step": 29394 + }, + { + "epoch": 6.270264505119454, + "grad_norm": 0.08975713303731495, + "learning_rate": 5.238699880161146e-06, + "loss": 0.8554, + "step": 29395 + }, + { + "epoch": 6.270477815699659, + "grad_norm": 0.08200734281893463, + "learning_rate": 5.235671503480282e-06, + "loss": 0.9025, + "step": 29396 + }, + { + "epoch": 6.270691126279863, + "grad_norm": 0.08596916101443047, + "learning_rate": 5.232643972759057e-06, + "loss": 0.8892, + "step": 29397 + }, + { + "epoch": 6.270904436860068, + "grad_norm": 0.08405139769387114, + "learning_rate": 5.2296172880317035e-06, + "loss": 0.8883, + "step": 29398 + }, + { + "epoch": 6.2711177474402735, + "grad_norm": 0.08377728843942664, + "learning_rate": 5.226591449332512e-06, + "loss": 0.8738, + "step": 29399 + }, + { + "epoch": 6.271331058020478, + "grad_norm": 0.09199029886882787, + "learning_rate": 5.223566456695688e-06, + "loss": 0.9052, + "step": 29400 + }, + { + "epoch": 6.271544368600683, + "grad_norm": 0.08092784624871201, + "learning_rate": 5.220542310155452e-06, + "loss": 0.9065, + "step": 29401 + }, + { + "epoch": 6.271757679180888, + "grad_norm": 0.08338555120978267, + "learning_rate": 5.2175190097460525e-06, + "loss": 0.8911, + "step": 29402 + }, + { + "epoch": 6.271970989761092, + "grad_norm": 0.08881657731021883, + "learning_rate": 5.214496555501676e-06, + "loss": 0.878, + "step": 29403 + }, + { + "epoch": 6.272184300341297, + "grad_norm": 0.0924372569086316, + "learning_rate": 5.211474947456534e-06, + "loss": 0.8983, + "step": 29404 + }, + { + "epoch": 6.272397610921502, + "grad_norm": 0.08710452352715518, + "learning_rate": 5.2084541856448135e-06, + "loss": 0.8961, + "step": 29405 + }, + { + "epoch": 6.272610921501706, + "grad_norm": 0.09515062213275753, + "learning_rate": 5.2054342701006734e-06, + "loss": 0.9224, + "step": 29406 + }, + { + "epoch": 6.272824232081911, + "grad_norm": 0.09724756247584396, + "learning_rate": 5.202415200858326e-06, + "loss": 0.8851, + "step": 29407 + }, + { + "epoch": 6.273037542662116, + "grad_norm": 0.09307635363221584, + "learning_rate": 5.199396977951896e-06, + "loss": 0.8982, + "step": 29408 + }, + { + "epoch": 6.273250853242321, + "grad_norm": 0.08519195348309032, + "learning_rate": 5.196379601415551e-06, + "loss": 0.9291, + "step": 29409 + }, + { + "epoch": 6.273464163822526, + "grad_norm": 0.0916085878634787, + "learning_rate": 5.193363071283433e-06, + "loss": 0.8659, + "step": 29410 + }, + { + "epoch": 6.273677474402731, + "grad_norm": 0.09169459661535631, + "learning_rate": 5.190347387589674e-06, + "loss": 0.8697, + "step": 29411 + }, + { + "epoch": 6.273890784982935, + "grad_norm": 0.08549373494254996, + "learning_rate": 5.1873325503683806e-06, + "loss": 0.8842, + "step": 29412 + }, + { + "epoch": 6.27410409556314, + "grad_norm": 0.08624738902507594, + "learning_rate": 5.184318559653676e-06, + "loss": 0.8925, + "step": 29413 + }, + { + "epoch": 6.274317406143345, + "grad_norm": 0.08440934111548455, + "learning_rate": 5.181305415479667e-06, + "loss": 0.8682, + "step": 29414 + }, + { + "epoch": 6.274530716723549, + "grad_norm": 0.08217759730837809, + "learning_rate": 5.1782931178804505e-06, + "loss": 0.8302, + "step": 29415 + }, + { + "epoch": 6.274744027303754, + "grad_norm": 0.09540641279432763, + "learning_rate": 5.175281666890115e-06, + "loss": 0.87, + "step": 29416 + }, + { + "epoch": 6.274957337883959, + "grad_norm": 0.08377166473978023, + "learning_rate": 5.172271062542713e-06, + "loss": 0.8742, + "step": 29417 + }, + { + "epoch": 6.275170648464163, + "grad_norm": 0.08841937029945161, + "learning_rate": 5.169261304872315e-06, + "loss": 0.8888, + "step": 29418 + }, + { + "epoch": 6.275383959044369, + "grad_norm": 0.08618980906921217, + "learning_rate": 5.166252393913e-06, + "loss": 0.8701, + "step": 29419 + }, + { + "epoch": 6.275597269624574, + "grad_norm": 0.08540166766627351, + "learning_rate": 5.1632443296987956e-06, + "loss": 0.9077, + "step": 29420 + }, + { + "epoch": 6.275810580204778, + "grad_norm": 0.0790611269540319, + "learning_rate": 5.160237112263735e-06, + "loss": 0.8817, + "step": 29421 + }, + { + "epoch": 6.276023890784983, + "grad_norm": 0.08284685031928382, + "learning_rate": 5.157230741641854e-06, + "loss": 0.8726, + "step": 29422 + }, + { + "epoch": 6.276237201365188, + "grad_norm": 0.09054644927407138, + "learning_rate": 5.15422521786717e-06, + "loss": 0.8858, + "step": 29423 + }, + { + "epoch": 6.276450511945392, + "grad_norm": 0.091706974940539, + "learning_rate": 5.1512205409736825e-06, + "loss": 0.8922, + "step": 29424 + }, + { + "epoch": 6.276663822525597, + "grad_norm": 0.08661100120047745, + "learning_rate": 5.148216710995381e-06, + "loss": 0.891, + "step": 29425 + }, + { + "epoch": 6.276877133105802, + "grad_norm": 0.08916554927308316, + "learning_rate": 5.145213727966285e-06, + "loss": 0.8759, + "step": 29426 + }, + { + "epoch": 6.277090443686006, + "grad_norm": 0.08622989424712932, + "learning_rate": 5.142211591920348e-06, + "loss": 0.887, + "step": 29427 + }, + { + "epoch": 6.277303754266212, + "grad_norm": 0.08887653882273136, + "learning_rate": 5.139210302891551e-06, + "loss": 0.8603, + "step": 29428 + }, + { + "epoch": 6.277517064846417, + "grad_norm": 0.0775792815991913, + "learning_rate": 5.136209860913841e-06, + "loss": 0.8878, + "step": 29429 + }, + { + "epoch": 6.277730375426621, + "grad_norm": 0.08648663280924791, + "learning_rate": 5.133210266021165e-06, + "loss": 0.8928, + "step": 29430 + }, + { + "epoch": 6.277943686006826, + "grad_norm": 0.08937504785930044, + "learning_rate": 5.130211518247495e-06, + "loss": 0.9, + "step": 29431 + }, + { + "epoch": 6.278156996587031, + "grad_norm": 0.08854466113834393, + "learning_rate": 5.12721361762674e-06, + "loss": 0.8899, + "step": 29432 + }, + { + "epoch": 6.278370307167235, + "grad_norm": 0.08849257874941979, + "learning_rate": 5.124216564192805e-06, + "loss": 0.8803, + "step": 29433 + }, + { + "epoch": 6.27858361774744, + "grad_norm": 0.08989648058206545, + "learning_rate": 5.1212203579796345e-06, + "loss": 0.8884, + "step": 29434 + }, + { + "epoch": 6.278796928327645, + "grad_norm": 0.08561741218476249, + "learning_rate": 5.118224999021113e-06, + "loss": 0.8921, + "step": 29435 + }, + { + "epoch": 6.2790102389078495, + "grad_norm": 0.0945045042645826, + "learning_rate": 5.115230487351132e-06, + "loss": 0.906, + "step": 29436 + }, + { + "epoch": 6.279223549488055, + "grad_norm": 0.09952078267810452, + "learning_rate": 5.112236823003569e-06, + "loss": 0.8869, + "step": 29437 + }, + { + "epoch": 6.27943686006826, + "grad_norm": 0.09139508607621144, + "learning_rate": 5.109244006012315e-06, + "loss": 0.8954, + "step": 29438 + }, + { + "epoch": 6.279650170648464, + "grad_norm": 0.09088816246659377, + "learning_rate": 5.1062520364112275e-06, + "loss": 0.887, + "step": 29439 + }, + { + "epoch": 6.279863481228669, + "grad_norm": 0.09923666272096206, + "learning_rate": 5.103260914234156e-06, + "loss": 0.8668, + "step": 29440 + }, + { + "epoch": 6.280076791808874, + "grad_norm": 0.08514560122392975, + "learning_rate": 5.100270639514948e-06, + "loss": 0.8781, + "step": 29441 + }, + { + "epoch": 6.2802901023890785, + "grad_norm": 0.08603395083158606, + "learning_rate": 5.097281212287426e-06, + "loss": 0.8734, + "step": 29442 + }, + { + "epoch": 6.280503412969283, + "grad_norm": 0.08789511552076588, + "learning_rate": 5.094292632585438e-06, + "loss": 0.8711, + "step": 29443 + }, + { + "epoch": 6.280716723549488, + "grad_norm": 0.08861257771091269, + "learning_rate": 5.091304900442806e-06, + "loss": 0.8828, + "step": 29444 + }, + { + "epoch": 6.2809300341296925, + "grad_norm": 0.08929282384076276, + "learning_rate": 5.08831801589329e-06, + "loss": 0.8756, + "step": 29445 + }, + { + "epoch": 6.281143344709897, + "grad_norm": 0.07983447260719768, + "learning_rate": 5.0853319789707286e-06, + "loss": 0.8877, + "step": 29446 + }, + { + "epoch": 6.281356655290103, + "grad_norm": 0.08507271814346544, + "learning_rate": 5.082346789708892e-06, + "loss": 0.8851, + "step": 29447 + }, + { + "epoch": 6.2815699658703075, + "grad_norm": 0.08695985003982928, + "learning_rate": 5.079362448141565e-06, + "loss": 0.8946, + "step": 29448 + }, + { + "epoch": 6.281783276450512, + "grad_norm": 0.09258031421187461, + "learning_rate": 5.076378954302499e-06, + "loss": 0.8749, + "step": 29449 + }, + { + "epoch": 6.281996587030717, + "grad_norm": 0.08490749667338397, + "learning_rate": 5.07339630822548e-06, + "loss": 0.8665, + "step": 29450 + }, + { + "epoch": 6.2822098976109215, + "grad_norm": 0.08472786007280231, + "learning_rate": 5.070414509944242e-06, + "loss": 0.8758, + "step": 29451 + }, + { + "epoch": 6.282423208191126, + "grad_norm": 0.08857557609548075, + "learning_rate": 5.067433559492516e-06, + "loss": 0.8643, + "step": 29452 + }, + { + "epoch": 6.282636518771331, + "grad_norm": 0.08263545052954202, + "learning_rate": 5.064453456904046e-06, + "loss": 0.8509, + "step": 29453 + }, + { + "epoch": 6.282849829351536, + "grad_norm": 0.09268475590624545, + "learning_rate": 5.0614742022125375e-06, + "loss": 0.8862, + "step": 29454 + }, + { + "epoch": 6.28306313993174, + "grad_norm": 0.08018829827299187, + "learning_rate": 5.058495795451732e-06, + "loss": 0.8682, + "step": 29455 + }, + { + "epoch": 6.283276450511945, + "grad_norm": 0.08806247804366654, + "learning_rate": 5.0555182366552924e-06, + "loss": 0.865, + "step": 29456 + }, + { + "epoch": 6.2834897610921505, + "grad_norm": 0.0953650889774585, + "learning_rate": 5.052541525856907e-06, + "loss": 0.8607, + "step": 29457 + }, + { + "epoch": 6.283703071672355, + "grad_norm": 0.08163375308016745, + "learning_rate": 5.049565663090299e-06, + "loss": 0.8517, + "step": 29458 + }, + { + "epoch": 6.28391638225256, + "grad_norm": 0.08571001112038862, + "learning_rate": 5.046590648389105e-06, + "loss": 0.8814, + "step": 29459 + }, + { + "epoch": 6.284129692832765, + "grad_norm": 0.08655042420721223, + "learning_rate": 5.043616481787004e-06, + "loss": 0.9196, + "step": 29460 + }, + { + "epoch": 6.284343003412969, + "grad_norm": 0.09015004748229469, + "learning_rate": 5.040643163317631e-06, + "loss": 0.8497, + "step": 29461 + }, + { + "epoch": 6.284556313993174, + "grad_norm": 0.09651493539123063, + "learning_rate": 5.037670693014649e-06, + "loss": 0.9042, + "step": 29462 + }, + { + "epoch": 6.284769624573379, + "grad_norm": 0.08134133952119164, + "learning_rate": 5.0346990709116925e-06, + "loss": 0.8789, + "step": 29463 + }, + { + "epoch": 6.284982935153583, + "grad_norm": 0.08328384773675637, + "learning_rate": 5.031728297042371e-06, + "loss": 0.8697, + "step": 29464 + }, + { + "epoch": 6.285196245733788, + "grad_norm": 0.08252351363304053, + "learning_rate": 5.02875837144031e-06, + "loss": 0.8904, + "step": 29465 + }, + { + "epoch": 6.285409556313994, + "grad_norm": 0.09102813559599103, + "learning_rate": 5.0257892941391e-06, + "loss": 0.8486, + "step": 29466 + }, + { + "epoch": 6.285622866894198, + "grad_norm": 0.08882339417070167, + "learning_rate": 5.022821065172352e-06, + "loss": 0.866, + "step": 29467 + }, + { + "epoch": 6.285836177474403, + "grad_norm": 0.08129844593831607, + "learning_rate": 5.0198536845736456e-06, + "loss": 0.8467, + "step": 29468 + }, + { + "epoch": 6.286049488054608, + "grad_norm": 0.08723203599796611, + "learning_rate": 5.016887152376537e-06, + "loss": 0.9001, + "step": 29469 + }, + { + "epoch": 6.286262798634812, + "grad_norm": 0.08680334767361697, + "learning_rate": 5.0139214686146265e-06, + "loss": 0.9014, + "step": 29470 + }, + { + "epoch": 6.286476109215017, + "grad_norm": 0.08215708143422332, + "learning_rate": 5.01095663332146e-06, + "loss": 0.8747, + "step": 29471 + }, + { + "epoch": 6.286689419795222, + "grad_norm": 0.08103086486123445, + "learning_rate": 5.007992646530575e-06, + "loss": 0.8387, + "step": 29472 + }, + { + "epoch": 6.286902730375426, + "grad_norm": 0.08541569212230127, + "learning_rate": 5.0050295082755096e-06, + "loss": 0.8679, + "step": 29473 + }, + { + "epoch": 6.287116040955631, + "grad_norm": 0.0829950256863944, + "learning_rate": 5.002067218589801e-06, + "loss": 0.8965, + "step": 29474 + }, + { + "epoch": 6.287329351535837, + "grad_norm": 0.08810667833700163, + "learning_rate": 4.99910577750697e-06, + "loss": 0.8542, + "step": 29475 + }, + { + "epoch": 6.287542662116041, + "grad_norm": 0.07870095931651076, + "learning_rate": 4.996145185060517e-06, + "loss": 0.8705, + "step": 29476 + }, + { + "epoch": 6.287755972696246, + "grad_norm": 0.07600655807691149, + "learning_rate": 4.993185441283946e-06, + "loss": 0.9036, + "step": 29477 + }, + { + "epoch": 6.287969283276451, + "grad_norm": 0.08111710532621234, + "learning_rate": 4.99022654621073e-06, + "loss": 0.8849, + "step": 29478 + }, + { + "epoch": 6.288182593856655, + "grad_norm": 0.08330245876393323, + "learning_rate": 4.987268499874373e-06, + "loss": 0.8625, + "step": 29479 + }, + { + "epoch": 6.28839590443686, + "grad_norm": 0.08372726908051083, + "learning_rate": 4.984311302308333e-06, + "loss": 0.8657, + "step": 29480 + }, + { + "epoch": 6.288609215017065, + "grad_norm": 0.08016992997259793, + "learning_rate": 4.981354953546058e-06, + "loss": 0.9088, + "step": 29481 + }, + { + "epoch": 6.288822525597269, + "grad_norm": 0.08935491454871569, + "learning_rate": 4.978399453621032e-06, + "loss": 0.8622, + "step": 29482 + }, + { + "epoch": 6.289035836177474, + "grad_norm": 0.08662473975828903, + "learning_rate": 4.975444802566678e-06, + "loss": 0.8655, + "step": 29483 + }, + { + "epoch": 6.289249146757679, + "grad_norm": 0.08043354350467678, + "learning_rate": 4.972491000416417e-06, + "loss": 0.8847, + "step": 29484 + }, + { + "epoch": 6.289462457337884, + "grad_norm": 0.08285606051872504, + "learning_rate": 4.969538047203681e-06, + "loss": 0.8786, + "step": 29485 + }, + { + "epoch": 6.289675767918089, + "grad_norm": 0.07806061814823516, + "learning_rate": 4.966585942961892e-06, + "loss": 0.8832, + "step": 29486 + }, + { + "epoch": 6.289889078498294, + "grad_norm": 0.08603425575561285, + "learning_rate": 4.963634687724446e-06, + "loss": 0.8635, + "step": 29487 + }, + { + "epoch": 6.290102389078498, + "grad_norm": 0.09888862440074651, + "learning_rate": 4.960684281524746e-06, + "loss": 0.8663, + "step": 29488 + }, + { + "epoch": 6.290315699658703, + "grad_norm": 0.08553002723414246, + "learning_rate": 4.957734724396144e-06, + "loss": 0.9021, + "step": 29489 + }, + { + "epoch": 6.290529010238908, + "grad_norm": 0.08596631276218221, + "learning_rate": 4.954786016372044e-06, + "loss": 0.8423, + "step": 29490 + }, + { + "epoch": 6.290742320819112, + "grad_norm": 0.08999007003849809, + "learning_rate": 4.9518381574858064e-06, + "loss": 0.8361, + "step": 29491 + }, + { + "epoch": 6.290955631399317, + "grad_norm": 0.08375840828075594, + "learning_rate": 4.9488911477707825e-06, + "loss": 0.8921, + "step": 29492 + }, + { + "epoch": 6.291168941979522, + "grad_norm": 0.08155066781965871, + "learning_rate": 4.945944987260305e-06, + "loss": 0.8598, + "step": 29493 + }, + { + "epoch": 6.2913822525597265, + "grad_norm": 0.08612546334800322, + "learning_rate": 4.942999675987734e-06, + "loss": 0.8881, + "step": 29494 + }, + { + "epoch": 6.291595563139932, + "grad_norm": 0.08036852549640522, + "learning_rate": 4.940055213986377e-06, + "loss": 0.8715, + "step": 29495 + }, + { + "epoch": 6.291808873720137, + "grad_norm": 0.08302322586046225, + "learning_rate": 4.937111601289566e-06, + "loss": 0.8803, + "step": 29496 + }, + { + "epoch": 6.292022184300341, + "grad_norm": 0.09002337333684836, + "learning_rate": 4.934168837930591e-06, + "loss": 0.892, + "step": 29497 + }, + { + "epoch": 6.292235494880546, + "grad_norm": 0.08666818643722272, + "learning_rate": 4.931226923942758e-06, + "loss": 0.8884, + "step": 29498 + }, + { + "epoch": 6.292448805460751, + "grad_norm": 0.07867109625067523, + "learning_rate": 4.928285859359365e-06, + "loss": 0.8914, + "step": 29499 + }, + { + "epoch": 6.2926621160409555, + "grad_norm": 0.08557714785146973, + "learning_rate": 4.925345644213693e-06, + "loss": 0.8668, + "step": 29500 + }, + { + "epoch": 6.29287542662116, + "grad_norm": 0.07938864974880569, + "learning_rate": 4.922406278538967e-06, + "loss": 0.8353, + "step": 29501 + }, + { + "epoch": 6.293088737201365, + "grad_norm": 0.08585771974104003, + "learning_rate": 4.919467762368495e-06, + "loss": 0.8865, + "step": 29502 + }, + { + "epoch": 6.2933020477815695, + "grad_norm": 0.08102954434037468, + "learning_rate": 4.916530095735503e-06, + "loss": 0.8764, + "step": 29503 + }, + { + "epoch": 6.293515358361775, + "grad_norm": 0.08321153680971642, + "learning_rate": 4.9135932786732364e-06, + "loss": 0.879, + "step": 29504 + }, + { + "epoch": 6.29372866894198, + "grad_norm": 0.09389967578130012, + "learning_rate": 4.910657311214921e-06, + "loss": 0.886, + "step": 29505 + }, + { + "epoch": 6.2939419795221845, + "grad_norm": 0.08179640461434212, + "learning_rate": 4.907722193393785e-06, + "loss": 0.8777, + "step": 29506 + }, + { + "epoch": 6.294155290102389, + "grad_norm": 0.07986703292080473, + "learning_rate": 4.904787925243035e-06, + "loss": 0.8831, + "step": 29507 + }, + { + "epoch": 6.294368600682594, + "grad_norm": 0.08000993974179567, + "learning_rate": 4.901854506795873e-06, + "loss": 0.8668, + "step": 29508 + }, + { + "epoch": 6.2945819112627985, + "grad_norm": 0.08207429129850718, + "learning_rate": 4.898921938085481e-06, + "loss": 0.8984, + "step": 29509 + }, + { + "epoch": 6.294795221843003, + "grad_norm": 0.08313253341066693, + "learning_rate": 4.895990219145068e-06, + "loss": 0.8663, + "step": 29510 + }, + { + "epoch": 6.295008532423208, + "grad_norm": 0.08276629892557202, + "learning_rate": 4.893059350007789e-06, + "loss": 0.8774, + "step": 29511 + }, + { + "epoch": 6.295221843003413, + "grad_norm": 0.08074878888995196, + "learning_rate": 4.890129330706801e-06, + "loss": 0.874, + "step": 29512 + }, + { + "epoch": 6.295435153583618, + "grad_norm": 0.08956663881914446, + "learning_rate": 4.88720016127525e-06, + "loss": 0.8592, + "step": 29513 + }, + { + "epoch": 6.295648464163823, + "grad_norm": 0.0994467624070942, + "learning_rate": 4.884271841746309e-06, + "loss": 0.8991, + "step": 29514 + }, + { + "epoch": 6.2958617747440275, + "grad_norm": 0.07287564452133659, + "learning_rate": 4.881344372153089e-06, + "loss": 0.8443, + "step": 29515 + }, + { + "epoch": 6.296075085324232, + "grad_norm": 0.08695484269536673, + "learning_rate": 4.8784177525287304e-06, + "loss": 0.897, + "step": 29516 + }, + { + "epoch": 6.296288395904437, + "grad_norm": 0.08161297840911164, + "learning_rate": 4.875491982906315e-06, + "loss": 0.9015, + "step": 29517 + }, + { + "epoch": 6.296501706484642, + "grad_norm": 0.0878155298085824, + "learning_rate": 4.872567063318991e-06, + "loss": 0.9068, + "step": 29518 + }, + { + "epoch": 6.296715017064846, + "grad_norm": 0.08965089111633112, + "learning_rate": 4.869642993799826e-06, + "loss": 0.8841, + "step": 29519 + }, + { + "epoch": 6.296928327645051, + "grad_norm": 0.08508602807785252, + "learning_rate": 4.866719774381921e-06, + "loss": 0.8688, + "step": 29520 + }, + { + "epoch": 6.297141638225256, + "grad_norm": 0.08008268770585723, + "learning_rate": 4.8637974050983335e-06, + "loss": 0.8738, + "step": 29521 + }, + { + "epoch": 6.297354948805461, + "grad_norm": 0.10197173299490997, + "learning_rate": 4.860875885982168e-06, + "loss": 0.8798, + "step": 29522 + }, + { + "epoch": 6.297568259385666, + "grad_norm": 0.09251309069571682, + "learning_rate": 4.857955217066437e-06, + "loss": 0.8956, + "step": 29523 + }, + { + "epoch": 6.297781569965871, + "grad_norm": 0.08410792876181608, + "learning_rate": 4.855035398384216e-06, + "loss": 0.876, + "step": 29524 + }, + { + "epoch": 6.297994880546075, + "grad_norm": 0.08586508065396267, + "learning_rate": 4.85211642996851e-06, + "loss": 0.8697, + "step": 29525 + }, + { + "epoch": 6.29820819112628, + "grad_norm": 0.08293141143023033, + "learning_rate": 4.849198311852394e-06, + "loss": 0.8638, + "step": 29526 + }, + { + "epoch": 6.298421501706485, + "grad_norm": 0.08978721954571903, + "learning_rate": 4.846281044068857e-06, + "loss": 0.8812, + "step": 29527 + }, + { + "epoch": 6.298634812286689, + "grad_norm": 0.08156906036930443, + "learning_rate": 4.84336462665091e-06, + "loss": 0.8888, + "step": 29528 + }, + { + "epoch": 6.298848122866894, + "grad_norm": 0.07880990377952564, + "learning_rate": 4.840449059631552e-06, + "loss": 0.858, + "step": 29529 + }, + { + "epoch": 6.299061433447099, + "grad_norm": 0.09075172311209878, + "learning_rate": 4.8375343430437836e-06, + "loss": 0.876, + "step": 29530 + }, + { + "epoch": 6.299274744027303, + "grad_norm": 0.08961671390153984, + "learning_rate": 4.834620476920586e-06, + "loss": 0.8648, + "step": 29531 + }, + { + "epoch": 6.299488054607509, + "grad_norm": 0.09333947080384158, + "learning_rate": 4.831707461294919e-06, + "loss": 0.8866, + "step": 29532 + }, + { + "epoch": 6.299701365187714, + "grad_norm": 0.08603421656581647, + "learning_rate": 4.828795296199742e-06, + "loss": 0.8712, + "step": 29533 + }, + { + "epoch": 6.299914675767918, + "grad_norm": 0.08847548911782763, + "learning_rate": 4.825883981668016e-06, + "loss": 0.8737, + "step": 29534 + }, + { + "epoch": 6.300127986348123, + "grad_norm": 0.08370451771648224, + "learning_rate": 4.822973517732674e-06, + "loss": 0.8666, + "step": 29535 + }, + { + "epoch": 6.300341296928328, + "grad_norm": 0.08647659060854676, + "learning_rate": 4.820063904426651e-06, + "loss": 0.8938, + "step": 29536 + }, + { + "epoch": 6.300554607508532, + "grad_norm": 0.08655672504646605, + "learning_rate": 4.8171551417828524e-06, + "loss": 0.8907, + "step": 29537 + }, + { + "epoch": 6.300767918088737, + "grad_norm": 0.07960195749083972, + "learning_rate": 4.814247229834231e-06, + "loss": 0.8528, + "step": 29538 + }, + { + "epoch": 6.300981228668942, + "grad_norm": 0.09262128547053038, + "learning_rate": 4.811340168613656e-06, + "loss": 0.8835, + "step": 29539 + }, + { + "epoch": 6.301194539249146, + "grad_norm": 0.08686531390758094, + "learning_rate": 4.808433958154037e-06, + "loss": 0.8485, + "step": 29540 + }, + { + "epoch": 6.301407849829351, + "grad_norm": 0.087050444155249, + "learning_rate": 4.805528598488245e-06, + "loss": 0.9025, + "step": 29541 + }, + { + "epoch": 6.301621160409557, + "grad_norm": 0.08122319599942245, + "learning_rate": 4.802624089649169e-06, + "loss": 0.8554, + "step": 29542 + }, + { + "epoch": 6.301834470989761, + "grad_norm": 0.08549222726945638, + "learning_rate": 4.79972043166967e-06, + "loss": 0.8787, + "step": 29543 + }, + { + "epoch": 6.302047781569966, + "grad_norm": 0.08464067387282266, + "learning_rate": 4.796817624582595e-06, + "loss": 0.8573, + "step": 29544 + }, + { + "epoch": 6.302261092150171, + "grad_norm": 0.09188740706310126, + "learning_rate": 4.793915668420788e-06, + "loss": 0.8856, + "step": 29545 + }, + { + "epoch": 6.302474402730375, + "grad_norm": 0.08737887339286025, + "learning_rate": 4.791014563217102e-06, + "loss": 0.8713, + "step": 29546 + }, + { + "epoch": 6.30268771331058, + "grad_norm": 0.08926603239177756, + "learning_rate": 4.7881143090043394e-06, + "loss": 0.8642, + "step": 29547 + }, + { + "epoch": 6.302901023890785, + "grad_norm": 0.0953685421382832, + "learning_rate": 4.785214905815334e-06, + "loss": 0.8752, + "step": 29548 + }, + { + "epoch": 6.3031143344709895, + "grad_norm": 0.08419774457135214, + "learning_rate": 4.782316353682879e-06, + "loss": 0.8788, + "step": 29549 + }, + { + "epoch": 6.303327645051194, + "grad_norm": 0.095067942630069, + "learning_rate": 4.779418652639791e-06, + "loss": 0.8648, + "step": 29550 + }, + { + "epoch": 6.3035409556314, + "grad_norm": 0.09309342762957046, + "learning_rate": 4.7765218027188455e-06, + "loss": 0.904, + "step": 29551 + }, + { + "epoch": 6.303754266211604, + "grad_norm": 0.09247600140897913, + "learning_rate": 4.773625803952815e-06, + "loss": 0.8985, + "step": 29552 + }, + { + "epoch": 6.303967576791809, + "grad_norm": 0.08739809937039093, + "learning_rate": 4.770730656374465e-06, + "loss": 0.8951, + "step": 29553 + }, + { + "epoch": 6.304180887372014, + "grad_norm": 0.10031946008066094, + "learning_rate": 4.767836360016577e-06, + "loss": 0.8979, + "step": 29554 + }, + { + "epoch": 6.3043941979522184, + "grad_norm": 0.0937275563925868, + "learning_rate": 4.764942914911901e-06, + "loss": 0.9043, + "step": 29555 + }, + { + "epoch": 6.304607508532423, + "grad_norm": 0.08496933422450786, + "learning_rate": 4.762050321093146e-06, + "loss": 0.8745, + "step": 29556 + }, + { + "epoch": 6.304820819112628, + "grad_norm": 0.08994211070173401, + "learning_rate": 4.759158578593042e-06, + "loss": 0.8846, + "step": 29557 + }, + { + "epoch": 6.3050341296928325, + "grad_norm": 0.08940660416902971, + "learning_rate": 4.756267687444336e-06, + "loss": 0.9153, + "step": 29558 + }, + { + "epoch": 6.305247440273037, + "grad_norm": 0.10671412697091695, + "learning_rate": 4.753377647679731e-06, + "loss": 0.8801, + "step": 29559 + }, + { + "epoch": 6.305460750853243, + "grad_norm": 0.0793379671906489, + "learning_rate": 4.750488459331921e-06, + "loss": 0.8693, + "step": 29560 + }, + { + "epoch": 6.305674061433447, + "grad_norm": 0.0783650116052269, + "learning_rate": 4.747600122433591e-06, + "loss": 0.8604, + "step": 29561 + }, + { + "epoch": 6.305887372013652, + "grad_norm": 0.0858182564412559, + "learning_rate": 4.7447126370174434e-06, + "loss": 0.8764, + "step": 29562 + }, + { + "epoch": 6.306100682593857, + "grad_norm": 0.08764190998588833, + "learning_rate": 4.741826003116136e-06, + "loss": 0.8831, + "step": 29563 + }, + { + "epoch": 6.3063139931740615, + "grad_norm": 0.09374839106308927, + "learning_rate": 4.738940220762329e-06, + "loss": 0.8802, + "step": 29564 + }, + { + "epoch": 6.306527303754266, + "grad_norm": 0.07693818197014343, + "learning_rate": 4.736055289988679e-06, + "loss": 0.8624, + "step": 29565 + }, + { + "epoch": 6.306740614334471, + "grad_norm": 0.08959247371575871, + "learning_rate": 4.7331712108278276e-06, + "loss": 0.8711, + "step": 29566 + }, + { + "epoch": 6.3069539249146755, + "grad_norm": 0.08439490607629171, + "learning_rate": 4.730287983312432e-06, + "loss": 0.8704, + "step": 29567 + }, + { + "epoch": 6.30716723549488, + "grad_norm": 0.08571285511164956, + "learning_rate": 4.72740560747508e-06, + "loss": 0.8773, + "step": 29568 + }, + { + "epoch": 6.307380546075085, + "grad_norm": 0.08219852685151856, + "learning_rate": 4.724524083348385e-06, + "loss": 0.8843, + "step": 29569 + }, + { + "epoch": 6.3075938566552905, + "grad_norm": 0.07983083586861399, + "learning_rate": 4.72164341096498e-06, + "loss": 0.8542, + "step": 29570 + }, + { + "epoch": 6.307807167235495, + "grad_norm": 0.08334953945094392, + "learning_rate": 4.718763590357443e-06, + "loss": 0.8714, + "step": 29571 + }, + { + "epoch": 6.3080204778157, + "grad_norm": 0.08639016539681417, + "learning_rate": 4.715884621558368e-06, + "loss": 0.9228, + "step": 29572 + }, + { + "epoch": 6.3082337883959045, + "grad_norm": 0.07938683813118519, + "learning_rate": 4.71300650460031e-06, + "loss": 0.8736, + "step": 29573 + }, + { + "epoch": 6.308447098976109, + "grad_norm": 0.08397727920518736, + "learning_rate": 4.710129239515855e-06, + "loss": 0.8886, + "step": 29574 + }, + { + "epoch": 6.308660409556314, + "grad_norm": 0.08411362805546593, + "learning_rate": 4.7072528263375625e-06, + "loss": 0.8813, + "step": 29575 + }, + { + "epoch": 6.308873720136519, + "grad_norm": 0.07789669683029238, + "learning_rate": 4.704377265097968e-06, + "loss": 0.8637, + "step": 29576 + }, + { + "epoch": 6.309087030716723, + "grad_norm": 0.08474242129829486, + "learning_rate": 4.701502555829604e-06, + "loss": 0.8976, + "step": 29577 + }, + { + "epoch": 6.309300341296928, + "grad_norm": 0.07844831574311205, + "learning_rate": 4.698628698565024e-06, + "loss": 0.8764, + "step": 29578 + }, + { + "epoch": 6.309513651877133, + "grad_norm": 0.08250544866901886, + "learning_rate": 4.695755693336717e-06, + "loss": 0.8726, + "step": 29579 + }, + { + "epoch": 6.309726962457338, + "grad_norm": 0.08851052925240979, + "learning_rate": 4.692883540177198e-06, + "loss": 0.8712, + "step": 29580 + }, + { + "epoch": 6.309940273037543, + "grad_norm": 0.08827925448285788, + "learning_rate": 4.690012239118957e-06, + "loss": 0.8765, + "step": 29581 + }, + { + "epoch": 6.310153583617748, + "grad_norm": 0.08172084689102146, + "learning_rate": 4.687141790194503e-06, + "loss": 0.9017, + "step": 29582 + }, + { + "epoch": 6.310366894197952, + "grad_norm": 0.09550533084359433, + "learning_rate": 4.684272193436315e-06, + "loss": 0.927, + "step": 29583 + }, + { + "epoch": 6.310580204778157, + "grad_norm": 0.08456970263633919, + "learning_rate": 4.681403448876846e-06, + "loss": 0.8891, + "step": 29584 + }, + { + "epoch": 6.310793515358362, + "grad_norm": 0.08729822104084448, + "learning_rate": 4.678535556548554e-06, + "loss": 0.8892, + "step": 29585 + }, + { + "epoch": 6.311006825938566, + "grad_norm": 0.08033712318478664, + "learning_rate": 4.6756685164839155e-06, + "loss": 0.8638, + "step": 29586 + }, + { + "epoch": 6.311220136518771, + "grad_norm": 0.0797951614611323, + "learning_rate": 4.672802328715351e-06, + "loss": 0.8532, + "step": 29587 + }, + { + "epoch": 6.311433447098976, + "grad_norm": 0.08739709234478213, + "learning_rate": 4.6699369932752965e-06, + "loss": 0.8598, + "step": 29588 + }, + { + "epoch": 6.311646757679181, + "grad_norm": 0.08586704834086081, + "learning_rate": 4.667072510196171e-06, + "loss": 0.8988, + "step": 29589 + }, + { + "epoch": 6.311860068259386, + "grad_norm": 0.08092754931534567, + "learning_rate": 4.664208879510392e-06, + "loss": 0.8757, + "step": 29590 + }, + { + "epoch": 6.312073378839591, + "grad_norm": 0.09958940254378862, + "learning_rate": 4.6613461012503524e-06, + "loss": 0.8644, + "step": 29591 + }, + { + "epoch": 6.312286689419795, + "grad_norm": 0.0816163898760735, + "learning_rate": 4.658484175448443e-06, + "loss": 0.8524, + "step": 29592 + }, + { + "epoch": 6.3125, + "grad_norm": 0.08380877877950613, + "learning_rate": 4.655623102137048e-06, + "loss": 0.9236, + "step": 29593 + }, + { + "epoch": 6.312713310580205, + "grad_norm": 0.09098353374723227, + "learning_rate": 4.652762881348558e-06, + "loss": 0.8634, + "step": 29594 + }, + { + "epoch": 6.312926621160409, + "grad_norm": 0.0921484487661094, + "learning_rate": 4.6499035131153215e-06, + "loss": 0.8894, + "step": 29595 + }, + { + "epoch": 6.313139931740614, + "grad_norm": 0.08001755449030688, + "learning_rate": 4.647044997469694e-06, + "loss": 0.8828, + "step": 29596 + }, + { + "epoch": 6.313353242320819, + "grad_norm": 0.08777880517462577, + "learning_rate": 4.644187334444006e-06, + "loss": 0.9131, + "step": 29597 + }, + { + "epoch": 6.313566552901024, + "grad_norm": 0.08468481132582743, + "learning_rate": 4.641330524070613e-06, + "loss": 0.8557, + "step": 29598 + }, + { + "epoch": 6.313779863481229, + "grad_norm": 0.09165171133254459, + "learning_rate": 4.638474566381845e-06, + "loss": 0.8618, + "step": 29599 + }, + { + "epoch": 6.313993174061434, + "grad_norm": 0.07991712947003829, + "learning_rate": 4.635619461409997e-06, + "loss": 0.8675, + "step": 29600 + }, + { + "epoch": 6.314206484641638, + "grad_norm": 0.0888014729021301, + "learning_rate": 4.63276520918738e-06, + "loss": 0.8752, + "step": 29601 + }, + { + "epoch": 6.314419795221843, + "grad_norm": 0.09373145772593652, + "learning_rate": 4.629911809746297e-06, + "loss": 0.9071, + "step": 29602 + }, + { + "epoch": 6.314633105802048, + "grad_norm": 0.08345246489280114, + "learning_rate": 4.627059263119034e-06, + "loss": 0.8766, + "step": 29603 + }, + { + "epoch": 6.314846416382252, + "grad_norm": 0.08569160537257625, + "learning_rate": 4.624207569337857e-06, + "loss": 0.8973, + "step": 29604 + }, + { + "epoch": 6.315059726962457, + "grad_norm": 0.08265263015970348, + "learning_rate": 4.621356728435027e-06, + "loss": 0.8788, + "step": 29605 + }, + { + "epoch": 6.315273037542662, + "grad_norm": 0.09183059114116178, + "learning_rate": 4.618506740442827e-06, + "loss": 0.8712, + "step": 29606 + }, + { + "epoch": 6.315486348122867, + "grad_norm": 0.10074300403473781, + "learning_rate": 4.615657605393491e-06, + "loss": 0.8733, + "step": 29607 + }, + { + "epoch": 6.315699658703072, + "grad_norm": 0.07686839840862628, + "learning_rate": 4.612809323319259e-06, + "loss": 0.8623, + "step": 29608 + }, + { + "epoch": 6.315912969283277, + "grad_norm": 0.08765594843335894, + "learning_rate": 4.609961894252344e-06, + "loss": 0.8973, + "step": 29609 + }, + { + "epoch": 6.316126279863481, + "grad_norm": 0.08738349805038745, + "learning_rate": 4.607115318224988e-06, + "loss": 0.8777, + "step": 29610 + }, + { + "epoch": 6.316339590443686, + "grad_norm": 0.08682850580953325, + "learning_rate": 4.6042695952694065e-06, + "loss": 0.8863, + "step": 29611 + }, + { + "epoch": 6.316552901023891, + "grad_norm": 0.0840620762278091, + "learning_rate": 4.601424725417749e-06, + "loss": 0.8723, + "step": 29612 + }, + { + "epoch": 6.3167662116040955, + "grad_norm": 0.08572909621631966, + "learning_rate": 4.598580708702258e-06, + "loss": 0.8884, + "step": 29613 + }, + { + "epoch": 6.3169795221843, + "grad_norm": 0.08766634957017379, + "learning_rate": 4.595737545155094e-06, + "loss": 0.8918, + "step": 29614 + }, + { + "epoch": 6.317192832764505, + "grad_norm": 0.08869378241142886, + "learning_rate": 4.592895234808436e-06, + "loss": 0.8792, + "step": 29615 + }, + { + "epoch": 6.3174061433447095, + "grad_norm": 0.08291397111621215, + "learning_rate": 4.590053777694428e-06, + "loss": 0.8771, + "step": 29616 + }, + { + "epoch": 6.317619453924915, + "grad_norm": 0.08349117585761139, + "learning_rate": 4.58721317384522e-06, + "loss": 0.8625, + "step": 29617 + }, + { + "epoch": 6.31783276450512, + "grad_norm": 0.08950800757175965, + "learning_rate": 4.584373423292974e-06, + "loss": 0.8736, + "step": 29618 + }, + { + "epoch": 6.3180460750853245, + "grad_norm": 0.09417204293190168, + "learning_rate": 4.581534526069815e-06, + "loss": 0.8897, + "step": 29619 + }, + { + "epoch": 6.318259385665529, + "grad_norm": 0.08706033367847711, + "learning_rate": 4.578696482207851e-06, + "loss": 0.8612, + "step": 29620 + }, + { + "epoch": 6.318472696245734, + "grad_norm": 0.08617483922892905, + "learning_rate": 4.5758592917392085e-06, + "loss": 0.8623, + "step": 29621 + }, + { + "epoch": 6.3186860068259385, + "grad_norm": 0.08104197274041625, + "learning_rate": 4.573022954695985e-06, + "loss": 0.8726, + "step": 29622 + }, + { + "epoch": 6.318899317406143, + "grad_norm": 0.08738831608783995, + "learning_rate": 4.570187471110297e-06, + "loss": 0.883, + "step": 29623 + }, + { + "epoch": 6.319112627986348, + "grad_norm": 0.09391086975919512, + "learning_rate": 4.567352841014181e-06, + "loss": 0.8904, + "step": 29624 + }, + { + "epoch": 6.319325938566553, + "grad_norm": 0.07795233988502266, + "learning_rate": 4.564519064439745e-06, + "loss": 0.8783, + "step": 29625 + }, + { + "epoch": 6.319539249146757, + "grad_norm": 0.08212984114495132, + "learning_rate": 4.561686141419044e-06, + "loss": 0.871, + "step": 29626 + }, + { + "epoch": 6.319752559726963, + "grad_norm": 0.08291505004456629, + "learning_rate": 4.558854071984132e-06, + "loss": 0.8878, + "step": 29627 + }, + { + "epoch": 6.3199658703071675, + "grad_norm": 0.08737979101681924, + "learning_rate": 4.556022856167062e-06, + "loss": 0.9054, + "step": 29628 + }, + { + "epoch": 6.320179180887372, + "grad_norm": 0.08628670069331242, + "learning_rate": 4.553192493999845e-06, + "loss": 0.8871, + "step": 29629 + }, + { + "epoch": 6.320392491467577, + "grad_norm": 0.08114216853221728, + "learning_rate": 4.550362985514536e-06, + "loss": 0.8725, + "step": 29630 + }, + { + "epoch": 6.3206058020477816, + "grad_norm": 0.08107659558629726, + "learning_rate": 4.547534330743144e-06, + "loss": 0.888, + "step": 29631 + }, + { + "epoch": 6.320819112627986, + "grad_norm": 0.0872225671866653, + "learning_rate": 4.5447065297176615e-06, + "loss": 0.8505, + "step": 29632 + }, + { + "epoch": 6.321032423208191, + "grad_norm": 0.08749783945409648, + "learning_rate": 4.54187958247009e-06, + "loss": 0.8995, + "step": 29633 + }, + { + "epoch": 6.321245733788396, + "grad_norm": 0.0788971945832578, + "learning_rate": 4.539053489032439e-06, + "loss": 0.8684, + "step": 29634 + }, + { + "epoch": 6.3214590443686, + "grad_norm": 0.0833445778271681, + "learning_rate": 4.536228249436665e-06, + "loss": 0.8904, + "step": 29635 + }, + { + "epoch": 6.321672354948806, + "grad_norm": 0.09009829271304318, + "learning_rate": 4.533403863714716e-06, + "loss": 0.8822, + "step": 29636 + }, + { + "epoch": 6.3218856655290105, + "grad_norm": 0.09086649437214939, + "learning_rate": 4.530580331898584e-06, + "loss": 0.8791, + "step": 29637 + }, + { + "epoch": 6.322098976109215, + "grad_norm": 0.07619469520425277, + "learning_rate": 4.5277576540202085e-06, + "loss": 0.8662, + "step": 29638 + }, + { + "epoch": 6.32231228668942, + "grad_norm": 0.09170271136834503, + "learning_rate": 4.524935830111519e-06, + "loss": 0.8991, + "step": 29639 + }, + { + "epoch": 6.322525597269625, + "grad_norm": 0.09258795011423933, + "learning_rate": 4.522114860204454e-06, + "loss": 0.8952, + "step": 29640 + }, + { + "epoch": 6.322738907849829, + "grad_norm": 0.09009620927972942, + "learning_rate": 4.519294744330918e-06, + "loss": 0.8783, + "step": 29641 + }, + { + "epoch": 6.322952218430034, + "grad_norm": 0.08328280901825616, + "learning_rate": 4.516475482522839e-06, + "loss": 0.896, + "step": 29642 + }, + { + "epoch": 6.323165529010239, + "grad_norm": 0.08114940951054431, + "learning_rate": 4.5136570748121145e-06, + "loss": 0.8742, + "step": 29643 + }, + { + "epoch": 6.323378839590443, + "grad_norm": 0.08947522736492486, + "learning_rate": 4.510839521230627e-06, + "loss": 0.8927, + "step": 29644 + }, + { + "epoch": 6.323592150170649, + "grad_norm": 0.08480674411807917, + "learning_rate": 4.508022821810265e-06, + "loss": 0.8937, + "step": 29645 + }, + { + "epoch": 6.323805460750854, + "grad_norm": 0.0832896184897927, + "learning_rate": 4.505206976582895e-06, + "loss": 0.8927, + "step": 29646 + }, + { + "epoch": 6.324018771331058, + "grad_norm": 0.0919745797076884, + "learning_rate": 4.502391985580365e-06, + "loss": 0.8615, + "step": 29647 + }, + { + "epoch": 6.324232081911263, + "grad_norm": 0.08822909537428458, + "learning_rate": 4.499577848834538e-06, + "loss": 0.8561, + "step": 29648 + }, + { + "epoch": 6.324445392491468, + "grad_norm": 0.08179448592605423, + "learning_rate": 4.49676456637727e-06, + "loss": 0.8893, + "step": 29649 + }, + { + "epoch": 6.324658703071672, + "grad_norm": 0.09443059953809815, + "learning_rate": 4.493952138240376e-06, + "loss": 0.8778, + "step": 29650 + }, + { + "epoch": 6.324872013651877, + "grad_norm": 0.09432825542585759, + "learning_rate": 4.49114056445569e-06, + "loss": 0.8934, + "step": 29651 + }, + { + "epoch": 6.325085324232082, + "grad_norm": 0.0837587333494118, + "learning_rate": 4.488329845055015e-06, + "loss": 0.9106, + "step": 29652 + }, + { + "epoch": 6.325298634812286, + "grad_norm": 0.08374025450021239, + "learning_rate": 4.48551998007015e-06, + "loss": 0.8956, + "step": 29653 + }, + { + "epoch": 6.325511945392491, + "grad_norm": 0.09284789878100404, + "learning_rate": 4.4827109695328995e-06, + "loss": 0.9048, + "step": 29654 + }, + { + "epoch": 6.325725255972697, + "grad_norm": 0.08711662836749962, + "learning_rate": 4.4799028134750525e-06, + "loss": 0.9093, + "step": 29655 + }, + { + "epoch": 6.325938566552901, + "grad_norm": 0.08433610884482846, + "learning_rate": 4.477095511928369e-06, + "loss": 0.8601, + "step": 29656 + }, + { + "epoch": 6.326151877133106, + "grad_norm": 0.08536485626229386, + "learning_rate": 4.474289064924629e-06, + "loss": 0.8893, + "step": 29657 + }, + { + "epoch": 6.326365187713311, + "grad_norm": 0.0935175979440122, + "learning_rate": 4.471483472495566e-06, + "loss": 0.8632, + "step": 29658 + }, + { + "epoch": 6.326578498293515, + "grad_norm": 0.09198286529033875, + "learning_rate": 4.468678734672951e-06, + "loss": 0.8726, + "step": 29659 + }, + { + "epoch": 6.32679180887372, + "grad_norm": 0.08186881743957843, + "learning_rate": 4.465874851488483e-06, + "loss": 0.8809, + "step": 29660 + }, + { + "epoch": 6.327005119453925, + "grad_norm": 0.08284033053352641, + "learning_rate": 4.463071822973932e-06, + "loss": 0.8739, + "step": 29661 + }, + { + "epoch": 6.327218430034129, + "grad_norm": 0.08841119212042253, + "learning_rate": 4.4602696491609886e-06, + "loss": 0.8636, + "step": 29662 + }, + { + "epoch": 6.327431740614334, + "grad_norm": 0.09268066832013767, + "learning_rate": 4.4574683300813696e-06, + "loss": 0.8811, + "step": 29663 + }, + { + "epoch": 6.327645051194539, + "grad_norm": 0.08463375217265424, + "learning_rate": 4.454667865766764e-06, + "loss": 0.8726, + "step": 29664 + }, + { + "epoch": 6.327858361774744, + "grad_norm": 0.08220548520648709, + "learning_rate": 4.451868256248855e-06, + "loss": 0.8573, + "step": 29665 + }, + { + "epoch": 6.328071672354949, + "grad_norm": 0.08153760033946245, + "learning_rate": 4.449069501559331e-06, + "loss": 0.8965, + "step": 29666 + }, + { + "epoch": 6.328284982935154, + "grad_norm": 0.09165775183870331, + "learning_rate": 4.446271601729875e-06, + "loss": 0.8652, + "step": 29667 + }, + { + "epoch": 6.328498293515358, + "grad_norm": 0.0903326025680576, + "learning_rate": 4.443474556792105e-06, + "loss": 0.8691, + "step": 29668 + }, + { + "epoch": 6.328711604095563, + "grad_norm": 0.08533547116807035, + "learning_rate": 4.440678366777693e-06, + "loss": 0.902, + "step": 29669 + }, + { + "epoch": 6.328924914675768, + "grad_norm": 0.08231837275877076, + "learning_rate": 4.437883031718287e-06, + "loss": 0.8917, + "step": 29670 + }, + { + "epoch": 6.3291382252559725, + "grad_norm": 0.08814851827060073, + "learning_rate": 4.435088551645495e-06, + "loss": 0.8442, + "step": 29671 + }, + { + "epoch": 6.329351535836177, + "grad_norm": 0.09373668911089729, + "learning_rate": 4.432294926590946e-06, + "loss": 0.9141, + "step": 29672 + }, + { + "epoch": 6.329564846416382, + "grad_norm": 0.0882095928156848, + "learning_rate": 4.429502156586258e-06, + "loss": 0.9101, + "step": 29673 + }, + { + "epoch": 6.329778156996587, + "grad_norm": 0.08905683265010592, + "learning_rate": 4.426710241663026e-06, + "loss": 0.8934, + "step": 29674 + }, + { + "epoch": 6.329991467576792, + "grad_norm": 0.08374268378928708, + "learning_rate": 4.423919181852832e-06, + "loss": 0.9035, + "step": 29675 + }, + { + "epoch": 6.330204778156997, + "grad_norm": 0.08851573471032328, + "learning_rate": 4.421128977187268e-06, + "loss": 0.8657, + "step": 29676 + }, + { + "epoch": 6.3304180887372015, + "grad_norm": 0.08640152644823716, + "learning_rate": 4.418339627697892e-06, + "loss": 0.8547, + "step": 29677 + }, + { + "epoch": 6.330631399317406, + "grad_norm": 0.08065665002229698, + "learning_rate": 4.415551133416278e-06, + "loss": 0.8606, + "step": 29678 + }, + { + "epoch": 6.330844709897611, + "grad_norm": 0.08117176867944259, + "learning_rate": 4.412763494373992e-06, + "loss": 0.9049, + "step": 29679 + }, + { + "epoch": 6.3310580204778155, + "grad_norm": 0.09313084055478854, + "learning_rate": 4.409976710602539e-06, + "loss": 0.8976, + "step": 29680 + }, + { + "epoch": 6.33127133105802, + "grad_norm": 0.0943943779825378, + "learning_rate": 4.407190782133474e-06, + "loss": 0.8929, + "step": 29681 + }, + { + "epoch": 6.331484641638225, + "grad_norm": 0.08492725968476281, + "learning_rate": 4.40440570899832e-06, + "loss": 0.9003, + "step": 29682 + }, + { + "epoch": 6.3316979522184305, + "grad_norm": 0.08324124781066024, + "learning_rate": 4.401621491228589e-06, + "loss": 0.8854, + "step": 29683 + }, + { + "epoch": 6.331911262798635, + "grad_norm": 0.07725524426808784, + "learning_rate": 4.3988381288557756e-06, + "loss": 0.8835, + "step": 29684 + }, + { + "epoch": 6.33212457337884, + "grad_norm": 0.08433783745812311, + "learning_rate": 4.396055621911384e-06, + "loss": 0.8899, + "step": 29685 + }, + { + "epoch": 6.3323378839590445, + "grad_norm": 0.07947507819237995, + "learning_rate": 4.393273970426899e-06, + "loss": 0.8645, + "step": 29686 + }, + { + "epoch": 6.332551194539249, + "grad_norm": 0.08122821318790767, + "learning_rate": 4.3904931744337806e-06, + "loss": 0.8867, + "step": 29687 + }, + { + "epoch": 6.332764505119454, + "grad_norm": 0.08386833675435107, + "learning_rate": 4.387713233963516e-06, + "loss": 0.8815, + "step": 29688 + }, + { + "epoch": 6.332977815699659, + "grad_norm": 0.0838208440184432, + "learning_rate": 4.384934149047535e-06, + "loss": 0.9149, + "step": 29689 + }, + { + "epoch": 6.333191126279863, + "grad_norm": 0.08417587329863788, + "learning_rate": 4.382155919717317e-06, + "loss": 0.9003, + "step": 29690 + }, + { + "epoch": 6.333404436860068, + "grad_norm": 0.08395740611231449, + "learning_rate": 4.379378546004258e-06, + "loss": 0.8904, + "step": 29691 + }, + { + "epoch": 6.3336177474402735, + "grad_norm": 0.08518649640626345, + "learning_rate": 4.3766020279398e-06, + "loss": 0.8569, + "step": 29692 + }, + { + "epoch": 6.333831058020478, + "grad_norm": 0.08462728648242558, + "learning_rate": 4.373826365555376e-06, + "loss": 0.8759, + "step": 29693 + }, + { + "epoch": 6.334044368600683, + "grad_norm": 0.08356671724462404, + "learning_rate": 4.371051558882373e-06, + "loss": 0.8452, + "step": 29694 + }, + { + "epoch": 6.334257679180888, + "grad_norm": 0.08160007020201637, + "learning_rate": 4.3682776079521895e-06, + "loss": 0.8992, + "step": 29695 + }, + { + "epoch": 6.334470989761092, + "grad_norm": 0.09157159234678007, + "learning_rate": 4.365504512796213e-06, + "loss": 0.8861, + "step": 29696 + }, + { + "epoch": 6.334684300341297, + "grad_norm": 0.08878137000559982, + "learning_rate": 4.362732273445831e-06, + "loss": 0.8474, + "step": 29697 + }, + { + "epoch": 6.334897610921502, + "grad_norm": 0.08672026608801786, + "learning_rate": 4.359960889932415e-06, + "loss": 0.8982, + "step": 29698 + }, + { + "epoch": 6.335110921501706, + "grad_norm": 0.08086578502395499, + "learning_rate": 4.3571903622873e-06, + "loss": 0.8763, + "step": 29699 + }, + { + "epoch": 6.335324232081911, + "grad_norm": 0.08588057600035963, + "learning_rate": 4.354420690541856e-06, + "loss": 0.8897, + "step": 29700 + }, + { + "epoch": 6.335537542662116, + "grad_norm": 0.0903002116215005, + "learning_rate": 4.351651874727418e-06, + "loss": 0.8427, + "step": 29701 + }, + { + "epoch": 6.335750853242321, + "grad_norm": 0.09171400079510711, + "learning_rate": 4.348883914875303e-06, + "loss": 0.8872, + "step": 29702 + }, + { + "epoch": 6.335964163822526, + "grad_norm": 0.0860180669332022, + "learning_rate": 4.346116811016847e-06, + "loss": 0.8836, + "step": 29703 + }, + { + "epoch": 6.336177474402731, + "grad_norm": 0.08867041492403309, + "learning_rate": 4.3433505631833396e-06, + "loss": 0.8625, + "step": 29704 + }, + { + "epoch": 6.336390784982935, + "grad_norm": 0.09189624958735937, + "learning_rate": 4.340585171406097e-06, + "loss": 0.8821, + "step": 29705 + }, + { + "epoch": 6.33660409556314, + "grad_norm": 0.08438328367616041, + "learning_rate": 4.337820635716412e-06, + "loss": 0.8938, + "step": 29706 + }, + { + "epoch": 6.336817406143345, + "grad_norm": 0.08374819161001901, + "learning_rate": 4.335056956145565e-06, + "loss": 0.8901, + "step": 29707 + }, + { + "epoch": 6.337030716723549, + "grad_norm": 0.08559818404290907, + "learning_rate": 4.332294132724801e-06, + "loss": 0.8698, + "step": 29708 + }, + { + "epoch": 6.337244027303754, + "grad_norm": 0.08191367139017423, + "learning_rate": 4.329532165485413e-06, + "loss": 0.8799, + "step": 29709 + }, + { + "epoch": 6.337457337883959, + "grad_norm": 0.08213733851312148, + "learning_rate": 4.326771054458645e-06, + "loss": 0.8954, + "step": 29710 + }, + { + "epoch": 6.337670648464163, + "grad_norm": 0.09105036880868665, + "learning_rate": 4.324010799675735e-06, + "loss": 0.8826, + "step": 29711 + }, + { + "epoch": 6.337883959044369, + "grad_norm": 0.08698833242660604, + "learning_rate": 4.321251401167912e-06, + "loss": 0.8879, + "step": 29712 + }, + { + "epoch": 6.338097269624574, + "grad_norm": 0.07882916213012887, + "learning_rate": 4.318492858966403e-06, + "loss": 0.9011, + "step": 29713 + }, + { + "epoch": 6.338310580204778, + "grad_norm": 0.08779637053162327, + "learning_rate": 4.31573517310242e-06, + "loss": 0.8762, + "step": 29714 + }, + { + "epoch": 6.338523890784983, + "grad_norm": 0.08815518639471157, + "learning_rate": 4.312978343607173e-06, + "loss": 0.8958, + "step": 29715 + }, + { + "epoch": 6.338737201365188, + "grad_norm": 0.07971749547978263, + "learning_rate": 4.310222370511828e-06, + "loss": 0.8673, + "step": 29716 + }, + { + "epoch": 6.338950511945392, + "grad_norm": 0.082722873821915, + "learning_rate": 4.307467253847613e-06, + "loss": 0.9064, + "step": 29717 + }, + { + "epoch": 6.339163822525597, + "grad_norm": 0.08735102644103249, + "learning_rate": 4.3047129936456674e-06, + "loss": 0.8783, + "step": 29718 + }, + { + "epoch": 6.339377133105802, + "grad_norm": 0.08341528001009729, + "learning_rate": 4.301959589937177e-06, + "loss": 0.8866, + "step": 29719 + }, + { + "epoch": 6.339590443686006, + "grad_norm": 0.09239301634619834, + "learning_rate": 4.299207042753271e-06, + "loss": 0.8807, + "step": 29720 + }, + { + "epoch": 6.339803754266212, + "grad_norm": 0.09851833681322897, + "learning_rate": 4.296455352125124e-06, + "loss": 0.9092, + "step": 29721 + }, + { + "epoch": 6.340017064846417, + "grad_norm": 0.08568082786726548, + "learning_rate": 4.2937045180838585e-06, + "loss": 0.866, + "step": 29722 + }, + { + "epoch": 6.340230375426621, + "grad_norm": 0.08760565167585864, + "learning_rate": 4.290954540660606e-06, + "loss": 0.9215, + "step": 29723 + }, + { + "epoch": 6.340443686006826, + "grad_norm": 0.09415424829334042, + "learning_rate": 4.28820541988646e-06, + "loss": 0.8961, + "step": 29724 + }, + { + "epoch": 6.340656996587031, + "grad_norm": 0.08619480189439578, + "learning_rate": 4.285457155792551e-06, + "loss": 0.8795, + "step": 29725 + }, + { + "epoch": 6.340870307167235, + "grad_norm": 0.09706129362822122, + "learning_rate": 4.282709748409968e-06, + "loss": 0.8487, + "step": 29726 + }, + { + "epoch": 6.34108361774744, + "grad_norm": 0.08456010394513998, + "learning_rate": 4.2799631977697945e-06, + "loss": 0.8808, + "step": 29727 + }, + { + "epoch": 6.341296928327645, + "grad_norm": 0.09181115829492466, + "learning_rate": 4.277217503903108e-06, + "loss": 0.8965, + "step": 29728 + }, + { + "epoch": 6.3415102389078495, + "grad_norm": 0.0877822341400924, + "learning_rate": 4.274472666840978e-06, + "loss": 0.8782, + "step": 29729 + }, + { + "epoch": 6.341723549488055, + "grad_norm": 0.09424520025129059, + "learning_rate": 4.2717286866144735e-06, + "loss": 0.9169, + "step": 29730 + }, + { + "epoch": 6.34193686006826, + "grad_norm": 0.09026998800214205, + "learning_rate": 4.268985563254635e-06, + "loss": 0.9086, + "step": 29731 + }, + { + "epoch": 6.342150170648464, + "grad_norm": 0.08446828018606072, + "learning_rate": 4.266243296792478e-06, + "loss": 0.8584, + "step": 29732 + }, + { + "epoch": 6.342363481228669, + "grad_norm": 0.08306871522793062, + "learning_rate": 4.2635018872590715e-06, + "loss": 0.8873, + "step": 29733 + }, + { + "epoch": 6.342576791808874, + "grad_norm": 0.08547912141434234, + "learning_rate": 4.260761334685421e-06, + "loss": 0.8838, + "step": 29734 + }, + { + "epoch": 6.3427901023890785, + "grad_norm": 0.08576595200864878, + "learning_rate": 4.258021639102526e-06, + "loss": 0.8794, + "step": 29735 + }, + { + "epoch": 6.343003412969283, + "grad_norm": 0.08526574150390984, + "learning_rate": 4.255282800541372e-06, + "loss": 0.8984, + "step": 29736 + }, + { + "epoch": 6.343216723549488, + "grad_norm": 0.08376667959873296, + "learning_rate": 4.252544819032975e-06, + "loss": 0.8748, + "step": 29737 + }, + { + "epoch": 6.3434300341296925, + "grad_norm": 0.08487261615242611, + "learning_rate": 4.249807694608317e-06, + "loss": 0.9038, + "step": 29738 + }, + { + "epoch": 6.343643344709897, + "grad_norm": 0.08332869429367067, + "learning_rate": 4.247071427298348e-06, + "loss": 0.879, + "step": 29739 + }, + { + "epoch": 6.343856655290103, + "grad_norm": 0.09740562542019283, + "learning_rate": 4.244336017134041e-06, + "loss": 0.9092, + "step": 29740 + }, + { + "epoch": 6.3440699658703075, + "grad_norm": 0.08033233789492851, + "learning_rate": 4.241601464146347e-06, + "loss": 0.9289, + "step": 29741 + }, + { + "epoch": 6.344283276450512, + "grad_norm": 0.0829670149466526, + "learning_rate": 4.238867768366212e-06, + "loss": 0.8768, + "step": 29742 + }, + { + "epoch": 6.344496587030717, + "grad_norm": 0.08004383910400012, + "learning_rate": 4.236134929824563e-06, + "loss": 0.8822, + "step": 29743 + }, + { + "epoch": 6.3447098976109215, + "grad_norm": 0.0800583199834653, + "learning_rate": 4.233402948552305e-06, + "loss": 0.8751, + "step": 29744 + }, + { + "epoch": 6.344923208191126, + "grad_norm": 0.07977787109883135, + "learning_rate": 4.2306718245803856e-06, + "loss": 0.8749, + "step": 29745 + }, + { + "epoch": 6.345136518771331, + "grad_norm": 0.08288512328389439, + "learning_rate": 4.227941557939703e-06, + "loss": 0.8403, + "step": 29746 + }, + { + "epoch": 6.345349829351536, + "grad_norm": 0.09185017824214153, + "learning_rate": 4.225212148661113e-06, + "loss": 0.8811, + "step": 29747 + }, + { + "epoch": 6.34556313993174, + "grad_norm": 0.083805630128412, + "learning_rate": 4.2224835967755154e-06, + "loss": 0.8818, + "step": 29748 + }, + { + "epoch": 6.345776450511945, + "grad_norm": 0.08292569297900151, + "learning_rate": 4.219755902313809e-06, + "loss": 0.8797, + "step": 29749 + }, + { + "epoch": 6.3459897610921505, + "grad_norm": 0.08860441632024381, + "learning_rate": 4.217029065306832e-06, + "loss": 0.9292, + "step": 29750 + }, + { + "epoch": 6.346203071672355, + "grad_norm": 0.0994592995587165, + "learning_rate": 4.214303085785449e-06, + "loss": 0.8909, + "step": 29751 + }, + { + "epoch": 6.34641638225256, + "grad_norm": 0.09130313180344896, + "learning_rate": 4.211577963780489e-06, + "loss": 0.8899, + "step": 29752 + }, + { + "epoch": 6.346629692832765, + "grad_norm": 0.08272175648745793, + "learning_rate": 4.2088536993228054e-06, + "loss": 0.894, + "step": 29753 + }, + { + "epoch": 6.346843003412969, + "grad_norm": 0.09054614673867584, + "learning_rate": 4.20613029244322e-06, + "loss": 0.8866, + "step": 29754 + }, + { + "epoch": 6.347056313993174, + "grad_norm": 0.09219054347595022, + "learning_rate": 4.203407743172552e-06, + "loss": 0.9156, + "step": 29755 + }, + { + "epoch": 6.347269624573379, + "grad_norm": 0.09389771054097597, + "learning_rate": 4.200686051541585e-06, + "loss": 0.8718, + "step": 29756 + }, + { + "epoch": 6.347482935153583, + "grad_norm": 0.07587185407324783, + "learning_rate": 4.197965217581148e-06, + "loss": 0.8453, + "step": 29757 + }, + { + "epoch": 6.347696245733788, + "grad_norm": 0.09108476218820864, + "learning_rate": 4.195245241321999e-06, + "loss": 0.8889, + "step": 29758 + }, + { + "epoch": 6.347909556313994, + "grad_norm": 0.10099501687552515, + "learning_rate": 4.192526122794931e-06, + "loss": 0.8654, + "step": 29759 + }, + { + "epoch": 6.348122866894198, + "grad_norm": 0.08710625857502691, + "learning_rate": 4.189807862030692e-06, + "loss": 0.8811, + "step": 29760 + }, + { + "epoch": 6.348336177474403, + "grad_norm": 0.0934477221007666, + "learning_rate": 4.187090459060059e-06, + "loss": 0.8662, + "step": 29761 + }, + { + "epoch": 6.348549488054608, + "grad_norm": 0.09732362506262952, + "learning_rate": 4.18437391391378e-06, + "loss": 0.8616, + "step": 29762 + }, + { + "epoch": 6.348762798634812, + "grad_norm": 0.08950509924262794, + "learning_rate": 4.181658226622576e-06, + "loss": 0.8892, + "step": 29763 + }, + { + "epoch": 6.348976109215017, + "grad_norm": 0.07993865625718528, + "learning_rate": 4.178943397217179e-06, + "loss": 0.907, + "step": 29764 + }, + { + "epoch": 6.349189419795222, + "grad_norm": 0.09386113856073319, + "learning_rate": 4.176229425728328e-06, + "loss": 0.8795, + "step": 29765 + }, + { + "epoch": 6.349402730375426, + "grad_norm": 0.09119318071235556, + "learning_rate": 4.173516312186712e-06, + "loss": 0.8886, + "step": 29766 + }, + { + "epoch": 6.349616040955631, + "grad_norm": 0.09698046647601898, + "learning_rate": 4.170804056623032e-06, + "loss": 0.866, + "step": 29767 + }, + { + "epoch": 6.349829351535837, + "grad_norm": 0.09641520675629531, + "learning_rate": 4.168092659067986e-06, + "loss": 0.8579, + "step": 29768 + }, + { + "epoch": 6.350042662116041, + "grad_norm": 0.08582254018914733, + "learning_rate": 4.165382119552242e-06, + "loss": 0.8657, + "step": 29769 + }, + { + "epoch": 6.350255972696246, + "grad_norm": 0.08960784802010974, + "learning_rate": 4.1626724381064764e-06, + "loss": 0.8779, + "step": 29770 + }, + { + "epoch": 6.350469283276451, + "grad_norm": 0.08507961743825415, + "learning_rate": 4.159963614761342e-06, + "loss": 0.9077, + "step": 29771 + }, + { + "epoch": 6.350682593856655, + "grad_norm": 0.08504762564548458, + "learning_rate": 4.1572556495474894e-06, + "loss": 0.8782, + "step": 29772 + }, + { + "epoch": 6.35089590443686, + "grad_norm": 0.09398210799179689, + "learning_rate": 4.154548542495578e-06, + "loss": 0.8892, + "step": 29773 + }, + { + "epoch": 6.351109215017065, + "grad_norm": 0.0857015234233801, + "learning_rate": 4.151842293636223e-06, + "loss": 0.8419, + "step": 29774 + }, + { + "epoch": 6.351322525597269, + "grad_norm": 0.07773899941250931, + "learning_rate": 4.14913690300005e-06, + "loss": 0.8721, + "step": 29775 + }, + { + "epoch": 6.351535836177474, + "grad_norm": 0.09474205982405812, + "learning_rate": 4.1464323706176566e-06, + "loss": 0.8933, + "step": 29776 + }, + { + "epoch": 6.351749146757679, + "grad_norm": 0.11161953408166854, + "learning_rate": 4.143728696519666e-06, + "loss": 0.8828, + "step": 29777 + }, + { + "epoch": 6.351962457337884, + "grad_norm": 0.08926334979002991, + "learning_rate": 4.14102588073666e-06, + "loss": 0.9017, + "step": 29778 + }, + { + "epoch": 6.352175767918089, + "grad_norm": 0.09726669847465205, + "learning_rate": 4.138323923299226e-06, + "loss": 0.8831, + "step": 29779 + }, + { + "epoch": 6.352389078498294, + "grad_norm": 0.09331432268735228, + "learning_rate": 4.135622824237926e-06, + "loss": 0.863, + "step": 29780 + }, + { + "epoch": 6.352602389078498, + "grad_norm": 0.08758567086235568, + "learning_rate": 4.132922583583341e-06, + "loss": 0.874, + "step": 29781 + }, + { + "epoch": 6.352815699658703, + "grad_norm": 0.08736506553813832, + "learning_rate": 4.130223201366006e-06, + "loss": 0.8662, + "step": 29782 + }, + { + "epoch": 6.353029010238908, + "grad_norm": 0.09060702371228407, + "learning_rate": 4.127524677616474e-06, + "loss": 0.9035, + "step": 29783 + }, + { + "epoch": 6.353242320819112, + "grad_norm": 0.0895164264202349, + "learning_rate": 4.124827012365264e-06, + "loss": 0.8778, + "step": 29784 + }, + { + "epoch": 6.353455631399317, + "grad_norm": 0.08021958136953475, + "learning_rate": 4.12213020564292e-06, + "loss": 0.8568, + "step": 29785 + }, + { + "epoch": 6.353668941979522, + "grad_norm": 0.08738779398328822, + "learning_rate": 4.119434257479959e-06, + "loss": 0.898, + "step": 29786 + }, + { + "epoch": 6.3538822525597265, + "grad_norm": 0.09902454759296368, + "learning_rate": 4.116739167906865e-06, + "loss": 0.906, + "step": 29787 + }, + { + "epoch": 6.354095563139932, + "grad_norm": 0.0926486464877848, + "learning_rate": 4.1140449369541355e-06, + "loss": 0.851, + "step": 29788 + }, + { + "epoch": 6.354308873720137, + "grad_norm": 0.08633760248643693, + "learning_rate": 4.111351564652282e-06, + "loss": 0.8748, + "step": 29789 + }, + { + "epoch": 6.354522184300341, + "grad_norm": 0.09482836354195785, + "learning_rate": 4.108659051031767e-06, + "loss": 0.8369, + "step": 29790 + }, + { + "epoch": 6.354735494880546, + "grad_norm": 0.09402016794028552, + "learning_rate": 4.10596739612303e-06, + "loss": 0.8872, + "step": 29791 + }, + { + "epoch": 6.354948805460751, + "grad_norm": 0.08940750368640014, + "learning_rate": 4.103276599956561e-06, + "loss": 0.8877, + "step": 29792 + }, + { + "epoch": 6.3551621160409555, + "grad_norm": 0.08265854480725099, + "learning_rate": 4.1005866625627885e-06, + "loss": 0.8605, + "step": 29793 + }, + { + "epoch": 6.35537542662116, + "grad_norm": 0.07998698568636538, + "learning_rate": 4.097897583972161e-06, + "loss": 0.873, + "step": 29794 + }, + { + "epoch": 6.355588737201365, + "grad_norm": 0.09082720897212862, + "learning_rate": 4.095209364215098e-06, + "loss": 0.8577, + "step": 29795 + }, + { + "epoch": 6.3558020477815695, + "grad_norm": 0.08842108618628222, + "learning_rate": 4.092522003322002e-06, + "loss": 0.8849, + "step": 29796 + }, + { + "epoch": 6.356015358361775, + "grad_norm": 0.07888226839292363, + "learning_rate": 4.089835501323309e-06, + "loss": 0.872, + "step": 29797 + }, + { + "epoch": 6.35622866894198, + "grad_norm": 0.07756925465087623, + "learning_rate": 4.087149858249406e-06, + "loss": 0.8679, + "step": 29798 + }, + { + "epoch": 6.3564419795221845, + "grad_norm": 0.08679234599267517, + "learning_rate": 4.084465074130686e-06, + "loss": 0.8778, + "step": 29799 + }, + { + "epoch": 6.356655290102389, + "grad_norm": 0.08907029673108828, + "learning_rate": 4.081781148997497e-06, + "loss": 0.8731, + "step": 29800 + }, + { + "epoch": 6.356868600682594, + "grad_norm": 0.08029687728616307, + "learning_rate": 4.07909808288025e-06, + "loss": 0.8723, + "step": 29801 + }, + { + "epoch": 6.3570819112627985, + "grad_norm": 0.08671760359099319, + "learning_rate": 4.0764158758092965e-06, + "loss": 0.8809, + "step": 29802 + }, + { + "epoch": 6.357295221843003, + "grad_norm": 0.0868757584579526, + "learning_rate": 4.073734527814947e-06, + "loss": 0.8765, + "step": 29803 + }, + { + "epoch": 6.357508532423208, + "grad_norm": 0.08638967014913825, + "learning_rate": 4.0710540389275885e-06, + "loss": 0.8805, + "step": 29804 + }, + { + "epoch": 6.357721843003413, + "grad_norm": 0.08102267453903958, + "learning_rate": 4.068374409177524e-06, + "loss": 0.8807, + "step": 29805 + }, + { + "epoch": 6.357935153583618, + "grad_norm": 0.07803680721692816, + "learning_rate": 4.0656956385950774e-06, + "loss": 0.9032, + "step": 29806 + }, + { + "epoch": 6.358148464163823, + "grad_norm": 0.08268696524987387, + "learning_rate": 4.063017727210569e-06, + "loss": 0.8777, + "step": 29807 + }, + { + "epoch": 6.3583617747440275, + "grad_norm": 0.07960963236818148, + "learning_rate": 4.06034067505428e-06, + "loss": 0.8864, + "step": 29808 + }, + { + "epoch": 6.358575085324232, + "grad_norm": 0.083957080245376, + "learning_rate": 4.0576644821565205e-06, + "loss": 0.8799, + "step": 29809 + }, + { + "epoch": 6.358788395904437, + "grad_norm": 0.0817201685574693, + "learning_rate": 4.054989148547561e-06, + "loss": 0.8882, + "step": 29810 + }, + { + "epoch": 6.359001706484642, + "grad_norm": 0.08086727918141247, + "learning_rate": 4.052314674257689e-06, + "loss": 0.8706, + "step": 29811 + }, + { + "epoch": 6.359215017064846, + "grad_norm": 0.07897991133248759, + "learning_rate": 4.049641059317128e-06, + "loss": 0.8766, + "step": 29812 + }, + { + "epoch": 6.359428327645051, + "grad_norm": 0.08415268357115539, + "learning_rate": 4.046968303756193e-06, + "loss": 0.9082, + "step": 29813 + }, + { + "epoch": 6.359641638225256, + "grad_norm": 0.07735034467777545, + "learning_rate": 4.044296407605073e-06, + "loss": 0.863, + "step": 29814 + }, + { + "epoch": 6.359854948805461, + "grad_norm": 0.0826725909248381, + "learning_rate": 4.041625370894e-06, + "loss": 0.9069, + "step": 29815 + }, + { + "epoch": 6.360068259385666, + "grad_norm": 0.07822252380365649, + "learning_rate": 4.038955193653227e-06, + "loss": 0.8505, + "step": 29816 + }, + { + "epoch": 6.360281569965871, + "grad_norm": 0.08128885545471176, + "learning_rate": 4.0362858759129595e-06, + "loss": 0.8696, + "step": 29817 + }, + { + "epoch": 6.360494880546075, + "grad_norm": 0.08480232087540339, + "learning_rate": 4.033617417703389e-06, + "loss": 0.8911, + "step": 29818 + }, + { + "epoch": 6.36070819112628, + "grad_norm": 0.08797221119513231, + "learning_rate": 4.030949819054719e-06, + "loss": 0.8678, + "step": 29819 + }, + { + "epoch": 6.360921501706485, + "grad_norm": 0.07889951065873325, + "learning_rate": 4.0282830799971155e-06, + "loss": 0.8802, + "step": 29820 + }, + { + "epoch": 6.361134812286689, + "grad_norm": 0.07996578678372016, + "learning_rate": 4.025617200560783e-06, + "loss": 0.8951, + "step": 29821 + }, + { + "epoch": 6.361348122866894, + "grad_norm": 0.0883019822420691, + "learning_rate": 4.0229521807758585e-06, + "loss": 0.8755, + "step": 29822 + }, + { + "epoch": 6.361561433447099, + "grad_norm": 0.08348329499091751, + "learning_rate": 4.020288020672523e-06, + "loss": 0.8476, + "step": 29823 + }, + { + "epoch": 6.361774744027303, + "grad_norm": 0.08297301740140751, + "learning_rate": 4.017624720280892e-06, + "loss": 0.8867, + "step": 29824 + }, + { + "epoch": 6.361988054607509, + "grad_norm": 0.08359945462437869, + "learning_rate": 4.014962279631123e-06, + "loss": 0.8642, + "step": 29825 + }, + { + "epoch": 6.362201365187714, + "grad_norm": 0.09331158502092195, + "learning_rate": 4.0123006987533396e-06, + "loss": 0.9188, + "step": 29826 + }, + { + "epoch": 6.362414675767918, + "grad_norm": 0.08871916468820687, + "learning_rate": 4.009639977677644e-06, + "loss": 0.9007, + "step": 29827 + }, + { + "epoch": 6.362627986348123, + "grad_norm": 0.0822683633039185, + "learning_rate": 4.006980116434136e-06, + "loss": 0.8632, + "step": 29828 + }, + { + "epoch": 6.362841296928328, + "grad_norm": 0.08332987450040673, + "learning_rate": 4.0043211150529425e-06, + "loss": 0.8903, + "step": 29829 + }, + { + "epoch": 6.363054607508532, + "grad_norm": 0.07842955073387525, + "learning_rate": 4.001662973564138e-06, + "loss": 0.844, + "step": 29830 + }, + { + "epoch": 6.363267918088737, + "grad_norm": 0.0859390177586782, + "learning_rate": 3.999005691997786e-06, + "loss": 0.9095, + "step": 29831 + }, + { + "epoch": 6.363481228668942, + "grad_norm": 0.08399197744593859, + "learning_rate": 3.996349270383961e-06, + "loss": 0.8667, + "step": 29832 + }, + { + "epoch": 6.363694539249146, + "grad_norm": 0.0821546172914099, + "learning_rate": 3.9936937087527286e-06, + "loss": 0.8853, + "step": 29833 + }, + { + "epoch": 6.363907849829351, + "grad_norm": 0.08758812993544807, + "learning_rate": 3.991039007134125e-06, + "loss": 0.8788, + "step": 29834 + }, + { + "epoch": 6.364121160409557, + "grad_norm": 0.08503070428378803, + "learning_rate": 3.9883851655582e-06, + "loss": 0.8625, + "step": 29835 + }, + { + "epoch": 6.364334470989761, + "grad_norm": 0.08964501649573353, + "learning_rate": 3.98573218405498e-06, + "loss": 0.8922, + "step": 29836 + }, + { + "epoch": 6.364547781569966, + "grad_norm": 0.08565944634884118, + "learning_rate": 3.98308006265447e-06, + "loss": 0.8694, + "step": 29837 + }, + { + "epoch": 6.364761092150171, + "grad_norm": 0.081223013073811, + "learning_rate": 3.980428801386689e-06, + "loss": 0.8768, + "step": 29838 + }, + { + "epoch": 6.364974402730375, + "grad_norm": 0.07921532191707423, + "learning_rate": 3.97777840028164e-06, + "loss": 0.852, + "step": 29839 + }, + { + "epoch": 6.36518771331058, + "grad_norm": 0.08085357748973637, + "learning_rate": 3.97512885936929e-06, + "loss": 0.8741, + "step": 29840 + }, + { + "epoch": 6.365401023890785, + "grad_norm": 0.08061244715804028, + "learning_rate": 3.972480178679652e-06, + "loss": 0.8791, + "step": 29841 + }, + { + "epoch": 6.3656143344709895, + "grad_norm": 0.0783919478699981, + "learning_rate": 3.969832358242682e-06, + "loss": 0.8788, + "step": 29842 + }, + { + "epoch": 6.365827645051194, + "grad_norm": 0.08600884813514972, + "learning_rate": 3.967185398088332e-06, + "loss": 0.8871, + "step": 29843 + }, + { + "epoch": 6.3660409556314, + "grad_norm": 0.0791520807816946, + "learning_rate": 3.964539298246548e-06, + "loss": 0.8806, + "step": 29844 + }, + { + "epoch": 6.366254266211604, + "grad_norm": 0.09135963718128139, + "learning_rate": 3.961894058747291e-06, + "loss": 0.8894, + "step": 29845 + }, + { + "epoch": 6.366467576791809, + "grad_norm": 0.08021275522548878, + "learning_rate": 3.959249679620492e-06, + "loss": 0.8497, + "step": 29846 + }, + { + "epoch": 6.366680887372014, + "grad_norm": 0.07946256270629179, + "learning_rate": 3.956606160896046e-06, + "loss": 0.8685, + "step": 29847 + }, + { + "epoch": 6.3668941979522184, + "grad_norm": 0.08680674356441274, + "learning_rate": 3.953963502603886e-06, + "loss": 0.8953, + "step": 29848 + }, + { + "epoch": 6.367107508532423, + "grad_norm": 0.08977222528031543, + "learning_rate": 3.9513217047739075e-06, + "loss": 0.854, + "step": 29849 + }, + { + "epoch": 6.367320819112628, + "grad_norm": 0.08433055357306161, + "learning_rate": 3.948680767436006e-06, + "loss": 0.8737, + "step": 29850 + }, + { + "epoch": 6.3675341296928325, + "grad_norm": 0.08078588795239526, + "learning_rate": 3.946040690620061e-06, + "loss": 0.8499, + "step": 29851 + }, + { + "epoch": 6.367747440273037, + "grad_norm": 0.08325694282166689, + "learning_rate": 3.943401474355932e-06, + "loss": 0.8623, + "step": 29852 + }, + { + "epoch": 6.367960750853243, + "grad_norm": 0.0877470418281589, + "learning_rate": 3.940763118673507e-06, + "loss": 0.9105, + "step": 29853 + }, + { + "epoch": 6.368174061433447, + "grad_norm": 0.08742384450206829, + "learning_rate": 3.93812562360262e-06, + "loss": 0.9166, + "step": 29854 + }, + { + "epoch": 6.368387372013652, + "grad_norm": 0.08466680475219332, + "learning_rate": 3.935488989173131e-06, + "loss": 0.9037, + "step": 29855 + }, + { + "epoch": 6.368600682593857, + "grad_norm": 0.078724956971344, + "learning_rate": 3.9328532154148465e-06, + "loss": 0.8942, + "step": 29856 + }, + { + "epoch": 6.3688139931740615, + "grad_norm": 0.08570454765268136, + "learning_rate": 3.9302183023576205e-06, + "loss": 0.8476, + "step": 29857 + }, + { + "epoch": 6.369027303754266, + "grad_norm": 0.09356824656669492, + "learning_rate": 3.927584250031266e-06, + "loss": 0.882, + "step": 29858 + }, + { + "epoch": 6.369240614334471, + "grad_norm": 0.08670598413752457, + "learning_rate": 3.9249510584655495e-06, + "loss": 0.8898, + "step": 29859 + }, + { + "epoch": 6.3694539249146755, + "grad_norm": 0.07722897048777919, + "learning_rate": 3.922318727690302e-06, + "loss": 0.8454, + "step": 29860 + }, + { + "epoch": 6.36966723549488, + "grad_norm": 0.09524364265391921, + "learning_rate": 3.919687257735305e-06, + "loss": 0.8771, + "step": 29861 + }, + { + "epoch": 6.369880546075085, + "grad_norm": 0.09053397277585892, + "learning_rate": 3.917056648630322e-06, + "loss": 0.9288, + "step": 29862 + }, + { + "epoch": 6.3700938566552905, + "grad_norm": 0.0838107157026426, + "learning_rate": 3.914426900405124e-06, + "loss": 0.8695, + "step": 29863 + }, + { + "epoch": 6.370307167235495, + "grad_norm": 0.08268776212855283, + "learning_rate": 3.911798013089447e-06, + "loss": 0.9075, + "step": 29864 + }, + { + "epoch": 6.3705204778157, + "grad_norm": 0.08183596933149449, + "learning_rate": 3.909169986713073e-06, + "loss": 0.8902, + "step": 29865 + }, + { + "epoch": 6.3707337883959045, + "grad_norm": 0.09659006716272836, + "learning_rate": 3.906542821305719e-06, + "loss": 0.914, + "step": 29866 + }, + { + "epoch": 6.370947098976109, + "grad_norm": 0.07810838713481609, + "learning_rate": 3.903916516897113e-06, + "loss": 0.8651, + "step": 29867 + }, + { + "epoch": 6.371160409556314, + "grad_norm": 0.0821125459158084, + "learning_rate": 3.901291073516964e-06, + "loss": 0.889, + "step": 29868 + }, + { + "epoch": 6.371373720136519, + "grad_norm": 0.09006786421515037, + "learning_rate": 3.898666491195009e-06, + "loss": 0.8607, + "step": 29869 + }, + { + "epoch": 6.371587030716723, + "grad_norm": 0.08620731079344118, + "learning_rate": 3.896042769960904e-06, + "loss": 0.8812, + "step": 29870 + }, + { + "epoch": 6.371800341296928, + "grad_norm": 0.07660228987507459, + "learning_rate": 3.8934199098443495e-06, + "loss": 0.8712, + "step": 29871 + }, + { + "epoch": 6.372013651877133, + "grad_norm": 0.09100646747337758, + "learning_rate": 3.890797910875037e-06, + "loss": 0.9073, + "step": 29872 + }, + { + "epoch": 6.372226962457338, + "grad_norm": 0.09655165584507203, + "learning_rate": 3.888176773082632e-06, + "loss": 0.8779, + "step": 29873 + }, + { + "epoch": 6.372440273037543, + "grad_norm": 0.08398998266256469, + "learning_rate": 3.8855564964967826e-06, + "loss": 0.8581, + "step": 29874 + }, + { + "epoch": 6.372653583617748, + "grad_norm": 0.08383436996172795, + "learning_rate": 3.882937081147145e-06, + "loss": 0.8812, + "step": 29875 + }, + { + "epoch": 6.372866894197952, + "grad_norm": 0.08314566091796917, + "learning_rate": 3.8803185270633385e-06, + "loss": 0.8981, + "step": 29876 + }, + { + "epoch": 6.373080204778157, + "grad_norm": 0.08094658803121926, + "learning_rate": 3.87770083427502e-06, + "loss": 0.9089, + "step": 29877 + }, + { + "epoch": 6.373293515358362, + "grad_norm": 0.0856804423955224, + "learning_rate": 3.875084002811802e-06, + "loss": 0.8763, + "step": 29878 + }, + { + "epoch": 6.373506825938566, + "grad_norm": 0.08107612437748582, + "learning_rate": 3.872468032703279e-06, + "loss": 0.8441, + "step": 29879 + }, + { + "epoch": 6.373720136518771, + "grad_norm": 0.0812589244395749, + "learning_rate": 3.86985292397907e-06, + "loss": 0.8711, + "step": 29880 + }, + { + "epoch": 6.373933447098976, + "grad_norm": 0.08294056385123146, + "learning_rate": 3.867238676668752e-06, + "loss": 0.8461, + "step": 29881 + }, + { + "epoch": 6.374146757679181, + "grad_norm": 0.08662630178430662, + "learning_rate": 3.864625290801911e-06, + "loss": 0.8589, + "step": 29882 + }, + { + "epoch": 6.374360068259386, + "grad_norm": 0.0945825875705561, + "learning_rate": 3.862012766408105e-06, + "loss": 0.8905, + "step": 29883 + }, + { + "epoch": 6.374573378839591, + "grad_norm": 0.08553398379477256, + "learning_rate": 3.859401103516911e-06, + "loss": 0.9114, + "step": 29884 + }, + { + "epoch": 6.374786689419795, + "grad_norm": 0.09419211089865223, + "learning_rate": 3.8567903021578775e-06, + "loss": 0.8701, + "step": 29885 + }, + { + "epoch": 6.375, + "grad_norm": 0.08372504101931594, + "learning_rate": 3.854180362360546e-06, + "loss": 0.8432, + "step": 29886 + }, + { + "epoch": 6.375213310580205, + "grad_norm": 0.0787926497911113, + "learning_rate": 3.8515712841544405e-06, + "loss": 0.8926, + "step": 29887 + }, + { + "epoch": 6.375426621160409, + "grad_norm": 0.08662434949507143, + "learning_rate": 3.848963067569074e-06, + "loss": 0.8565, + "step": 29888 + }, + { + "epoch": 6.375639931740614, + "grad_norm": 0.08379356048708436, + "learning_rate": 3.846355712633987e-06, + "loss": 0.857, + "step": 29889 + }, + { + "epoch": 6.375853242320819, + "grad_norm": 0.0771348369839839, + "learning_rate": 3.843749219378667e-06, + "loss": 0.8614, + "step": 29890 + }, + { + "epoch": 6.376066552901024, + "grad_norm": 0.08025675760084845, + "learning_rate": 3.841143587832603e-06, + "loss": 0.8872, + "step": 29891 + }, + { + "epoch": 6.376279863481229, + "grad_norm": 0.08854072524557828, + "learning_rate": 3.838538818025273e-06, + "loss": 0.9081, + "step": 29892 + }, + { + "epoch": 6.376493174061434, + "grad_norm": 0.0865555720107887, + "learning_rate": 3.835934909986172e-06, + "loss": 0.8895, + "step": 29893 + }, + { + "epoch": 6.376706484641638, + "grad_norm": 0.07791870152168154, + "learning_rate": 3.833331863744736e-06, + "loss": 0.8629, + "step": 29894 + }, + { + "epoch": 6.376919795221843, + "grad_norm": 0.07970435676614657, + "learning_rate": 3.830729679330425e-06, + "loss": 0.8495, + "step": 29895 + }, + { + "epoch": 6.377133105802048, + "grad_norm": 0.08194344101656763, + "learning_rate": 3.8281283567727005e-06, + "loss": 0.8598, + "step": 29896 + }, + { + "epoch": 6.377346416382252, + "grad_norm": 0.09395638840731474, + "learning_rate": 3.825527896100987e-06, + "loss": 0.8952, + "step": 29897 + }, + { + "epoch": 6.377559726962457, + "grad_norm": 0.08333183703302248, + "learning_rate": 3.822928297344702e-06, + "loss": 0.8611, + "step": 29898 + }, + { + "epoch": 6.377773037542662, + "grad_norm": 0.08295008065087058, + "learning_rate": 3.82032956053326e-06, + "loss": 0.8535, + "step": 29899 + }, + { + "epoch": 6.377986348122867, + "grad_norm": 0.07962636975454544, + "learning_rate": 3.817731685696071e-06, + "loss": 0.864, + "step": 29900 + }, + { + "epoch": 6.378199658703072, + "grad_norm": 0.086003904471726, + "learning_rate": 3.8151346728625325e-06, + "loss": 0.875, + "step": 29901 + }, + { + "epoch": 6.378412969283277, + "grad_norm": 0.08743784915541736, + "learning_rate": 3.8125385220620436e-06, + "loss": 0.8599, + "step": 29902 + }, + { + "epoch": 6.378626279863481, + "grad_norm": 0.08717661510371238, + "learning_rate": 3.809943233323932e-06, + "loss": 0.8561, + "step": 29903 + }, + { + "epoch": 6.378839590443686, + "grad_norm": 0.09708908403793193, + "learning_rate": 3.8073488066776134e-06, + "loss": 0.8868, + "step": 29904 + }, + { + "epoch": 6.379052901023891, + "grad_norm": 0.08198943314897111, + "learning_rate": 3.804755242152425e-06, + "loss": 0.9018, + "step": 29905 + }, + { + "epoch": 6.3792662116040955, + "grad_norm": 0.07669067248993638, + "learning_rate": 3.8021625397777117e-06, + "loss": 0.8746, + "step": 29906 + }, + { + "epoch": 6.3794795221843, + "grad_norm": 0.09145075786387283, + "learning_rate": 3.799570699582793e-06, + "loss": 0.8686, + "step": 29907 + }, + { + "epoch": 6.379692832764505, + "grad_norm": 0.08354961156962054, + "learning_rate": 3.796979721597032e-06, + "loss": 0.8838, + "step": 29908 + }, + { + "epoch": 6.3799061433447095, + "grad_norm": 0.0877343673763766, + "learning_rate": 3.7943896058497197e-06, + "loss": 0.8793, + "step": 29909 + }, + { + "epoch": 6.380119453924915, + "grad_norm": 0.08480593665158825, + "learning_rate": 3.791800352370176e-06, + "loss": 0.8534, + "step": 29910 + }, + { + "epoch": 6.38033276450512, + "grad_norm": 0.07986875075450774, + "learning_rate": 3.7892119611876933e-06, + "loss": 0.8813, + "step": 29911 + }, + { + "epoch": 6.3805460750853245, + "grad_norm": 0.08278261478283765, + "learning_rate": 3.7866244323315537e-06, + "loss": 0.8818, + "step": 29912 + }, + { + "epoch": 6.380759385665529, + "grad_norm": 0.09239216755837631, + "learning_rate": 3.78403776583105e-06, + "loss": 0.8931, + "step": 29913 + }, + { + "epoch": 6.380972696245734, + "grad_norm": 0.08134845846581079, + "learning_rate": 3.781451961715439e-06, + "loss": 0.887, + "step": 29914 + }, + { + "epoch": 6.3811860068259385, + "grad_norm": 0.07757250397193922, + "learning_rate": 3.7788670200139677e-06, + "loss": 0.8869, + "step": 29915 + }, + { + "epoch": 6.381399317406143, + "grad_norm": 0.09070462747058541, + "learning_rate": 3.776282940755911e-06, + "loss": 0.9052, + "step": 29916 + }, + { + "epoch": 6.381612627986348, + "grad_norm": 0.08605831712605576, + "learning_rate": 3.7736997239704896e-06, + "loss": 0.8395, + "step": 29917 + }, + { + "epoch": 6.381825938566553, + "grad_norm": 0.08011438131822775, + "learning_rate": 3.771117369686943e-06, + "loss": 0.8604, + "step": 29918 + }, + { + "epoch": 6.382039249146757, + "grad_norm": 0.07710702173685857, + "learning_rate": 3.768535877934474e-06, + "loss": 0.858, + "step": 29919 + }, + { + "epoch": 6.382252559726963, + "grad_norm": 0.07845336517102595, + "learning_rate": 3.7659552487423125e-06, + "loss": 0.8667, + "step": 29920 + }, + { + "epoch": 6.3824658703071675, + "grad_norm": 0.08062737647394208, + "learning_rate": 3.763375482139653e-06, + "loss": 0.8646, + "step": 29921 + }, + { + "epoch": 6.382679180887372, + "grad_norm": 0.078265053783587, + "learning_rate": 3.760796578155681e-06, + "loss": 0.8441, + "step": 29922 + }, + { + "epoch": 6.382892491467577, + "grad_norm": 0.08139971591338899, + "learning_rate": 3.758218536819573e-06, + "loss": 0.8732, + "step": 29923 + }, + { + "epoch": 6.3831058020477816, + "grad_norm": 0.0865887045553944, + "learning_rate": 3.755641358160498e-06, + "loss": 0.8862, + "step": 29924 + }, + { + "epoch": 6.383319112627986, + "grad_norm": 0.08012424389196864, + "learning_rate": 3.7530650422076486e-06, + "loss": 0.8702, + "step": 29925 + }, + { + "epoch": 6.383532423208191, + "grad_norm": 0.07925259034182806, + "learning_rate": 3.750489588990132e-06, + "loss": 0.8779, + "step": 29926 + }, + { + "epoch": 6.383745733788396, + "grad_norm": 0.08655388941355739, + "learning_rate": 3.7479149985371054e-06, + "loss": 0.8876, + "step": 29927 + }, + { + "epoch": 6.3839590443686, + "grad_norm": 0.07435389397345285, + "learning_rate": 3.7453412708777116e-06, + "loss": 0.8725, + "step": 29928 + }, + { + "epoch": 6.384172354948806, + "grad_norm": 0.08187927281072763, + "learning_rate": 3.742768406041064e-06, + "loss": 0.8826, + "step": 29929 + }, + { + "epoch": 6.3843856655290105, + "grad_norm": 0.08792406641121828, + "learning_rate": 3.7401964040562687e-06, + "loss": 0.891, + "step": 29930 + }, + { + "epoch": 6.384598976109215, + "grad_norm": 0.07808665896373872, + "learning_rate": 3.7376252649524316e-06, + "loss": 0.8284, + "step": 29931 + }, + { + "epoch": 6.38481228668942, + "grad_norm": 0.08487520899638952, + "learning_rate": 3.7350549887586487e-06, + "loss": 0.8888, + "step": 29932 + }, + { + "epoch": 6.385025597269625, + "grad_norm": 0.08449460141944669, + "learning_rate": 3.732485575504008e-06, + "loss": 0.8537, + "step": 29933 + }, + { + "epoch": 6.385238907849829, + "grad_norm": 0.0770506645207835, + "learning_rate": 3.7299170252175798e-06, + "loss": 0.909, + "step": 29934 + }, + { + "epoch": 6.385452218430034, + "grad_norm": 0.08006045481335115, + "learning_rate": 3.7273493379284163e-06, + "loss": 0.8777, + "step": 29935 + }, + { + "epoch": 6.385665529010239, + "grad_norm": 0.0830888914268942, + "learning_rate": 3.724782513665579e-06, + "loss": 0.8889, + "step": 29936 + }, + { + "epoch": 6.385878839590443, + "grad_norm": 0.07900589151152493, + "learning_rate": 3.722216552458111e-06, + "loss": 0.8737, + "step": 29937 + }, + { + "epoch": 6.386092150170649, + "grad_norm": 0.09094875924735404, + "learning_rate": 3.719651454335047e-06, + "loss": 0.8796, + "step": 29938 + }, + { + "epoch": 6.386305460750854, + "grad_norm": 0.09318105330112501, + "learning_rate": 3.7170872193253948e-06, + "loss": 0.8865, + "step": 29939 + }, + { + "epoch": 6.386518771331058, + "grad_norm": 0.07224473713356627, + "learning_rate": 3.7145238474581978e-06, + "loss": 0.8759, + "step": 29940 + }, + { + "epoch": 6.386732081911263, + "grad_norm": 0.07919545312711894, + "learning_rate": 3.7119613387624465e-06, + "loss": 0.8966, + "step": 29941 + }, + { + "epoch": 6.386945392491468, + "grad_norm": 0.08784086893829605, + "learning_rate": 3.7093996932671394e-06, + "loss": 0.8749, + "step": 29942 + }, + { + "epoch": 6.387158703071672, + "grad_norm": 0.09353771024160591, + "learning_rate": 3.7068389110012494e-06, + "loss": 0.8639, + "step": 29943 + }, + { + "epoch": 6.387372013651877, + "grad_norm": 0.0825091967847805, + "learning_rate": 3.704278991993766e-06, + "loss": 0.8845, + "step": 29944 + }, + { + "epoch": 6.387585324232082, + "grad_norm": 0.07680718305438171, + "learning_rate": 3.7017199362736445e-06, + "loss": 0.8743, + "step": 29945 + }, + { + "epoch": 6.387798634812286, + "grad_norm": 0.08475108956220302, + "learning_rate": 3.699161743869848e-06, + "loss": 0.8655, + "step": 29946 + }, + { + "epoch": 6.388011945392491, + "grad_norm": 0.0860315820785091, + "learning_rate": 3.696604414811322e-06, + "loss": 0.8739, + "step": 29947 + }, + { + "epoch": 6.388225255972697, + "grad_norm": 0.08817976413036095, + "learning_rate": 3.6940479491270044e-06, + "loss": 0.8711, + "step": 29948 + }, + { + "epoch": 6.388438566552901, + "grad_norm": 0.08469737251421781, + "learning_rate": 3.691492346845813e-06, + "loss": 0.8921, + "step": 29949 + }, + { + "epoch": 6.388651877133106, + "grad_norm": 0.08722921117539056, + "learning_rate": 3.6889376079966678e-06, + "loss": 0.8834, + "step": 29950 + }, + { + "epoch": 6.388865187713311, + "grad_norm": 0.0841045059440292, + "learning_rate": 3.6863837326084693e-06, + "loss": 0.8824, + "step": 29951 + }, + { + "epoch": 6.389078498293515, + "grad_norm": 0.09002684104531285, + "learning_rate": 3.6838307207101285e-06, + "loss": 0.886, + "step": 29952 + }, + { + "epoch": 6.38929180887372, + "grad_norm": 0.08329588320567761, + "learning_rate": 3.6812785723305288e-06, + "loss": 0.9084, + "step": 29953 + }, + { + "epoch": 6.389505119453925, + "grad_norm": 0.08245631806921369, + "learning_rate": 3.6787272874985534e-06, + "loss": 0.8592, + "step": 29954 + }, + { + "epoch": 6.389718430034129, + "grad_norm": 0.08200112782480123, + "learning_rate": 3.676176866243042e-06, + "loss": 0.9088, + "step": 29955 + }, + { + "epoch": 6.389931740614334, + "grad_norm": 0.0779486815815216, + "learning_rate": 3.6736273085928865e-06, + "loss": 0.8666, + "step": 29956 + }, + { + "epoch": 6.390145051194539, + "grad_norm": 0.08793704183663775, + "learning_rate": 3.671078614576926e-06, + "loss": 0.9085, + "step": 29957 + }, + { + "epoch": 6.390358361774744, + "grad_norm": 0.07950535294313651, + "learning_rate": 3.6685307842240004e-06, + "loss": 0.8581, + "step": 29958 + }, + { + "epoch": 6.390571672354949, + "grad_norm": 0.08339136836517491, + "learning_rate": 3.6659838175629125e-06, + "loss": 0.8856, + "step": 29959 + }, + { + "epoch": 6.390784982935154, + "grad_norm": 0.07951778901917918, + "learning_rate": 3.66343771462252e-06, + "loss": 0.8641, + "step": 29960 + }, + { + "epoch": 6.390998293515358, + "grad_norm": 0.08380055645354414, + "learning_rate": 3.6608924754316076e-06, + "loss": 0.8559, + "step": 29961 + }, + { + "epoch": 6.391211604095563, + "grad_norm": 0.08325421867055773, + "learning_rate": 3.65834810001898e-06, + "loss": 0.9001, + "step": 29962 + }, + { + "epoch": 6.391424914675768, + "grad_norm": 0.08158679355590337, + "learning_rate": 3.6558045884134143e-06, + "loss": 0.8866, + "step": 29963 + }, + { + "epoch": 6.3916382252559725, + "grad_norm": 0.08560426666679755, + "learning_rate": 3.653261940643722e-06, + "loss": 0.8668, + "step": 29964 + }, + { + "epoch": 6.391851535836177, + "grad_norm": 0.08236398331329492, + "learning_rate": 3.6507201567386452e-06, + "loss": 0.8984, + "step": 29965 + }, + { + "epoch": 6.392064846416382, + "grad_norm": 0.07877324946403713, + "learning_rate": 3.6481792367269607e-06, + "loss": 0.8802, + "step": 29966 + }, + { + "epoch": 6.392278156996587, + "grad_norm": 0.08656363873049427, + "learning_rate": 3.6456391806374013e-06, + "loss": 0.8601, + "step": 29967 + }, + { + "epoch": 6.392491467576792, + "grad_norm": 0.07924014022417292, + "learning_rate": 3.643099988498726e-06, + "loss": 0.885, + "step": 29968 + }, + { + "epoch": 6.392704778156997, + "grad_norm": 0.08303159078028846, + "learning_rate": 3.6405616603396765e-06, + "loss": 0.8707, + "step": 29969 + }, + { + "epoch": 6.3929180887372015, + "grad_norm": 0.08516580578570454, + "learning_rate": 3.6380241961889407e-06, + "loss": 0.9005, + "step": 29970 + }, + { + "epoch": 6.393131399317406, + "grad_norm": 0.07710646456136469, + "learning_rate": 3.6354875960752335e-06, + "loss": 0.8831, + "step": 29971 + }, + { + "epoch": 6.393344709897611, + "grad_norm": 0.08201548148157252, + "learning_rate": 3.6329518600272783e-06, + "loss": 0.8774, + "step": 29972 + }, + { + "epoch": 6.3935580204778155, + "grad_norm": 0.09505923481333556, + "learning_rate": 3.6304169880737637e-06, + "loss": 0.8828, + "step": 29973 + }, + { + "epoch": 6.39377133105802, + "grad_norm": 0.08543965211737801, + "learning_rate": 3.6278829802433603e-06, + "loss": 0.8725, + "step": 29974 + }, + { + "epoch": 6.393984641638225, + "grad_norm": 0.0784475731117614, + "learning_rate": 3.625349836564738e-06, + "loss": 0.9035, + "step": 29975 + }, + { + "epoch": 6.3941979522184305, + "grad_norm": 0.08236899045161476, + "learning_rate": 3.622817557066576e-06, + "loss": 0.88, + "step": 29976 + }, + { + "epoch": 6.394411262798635, + "grad_norm": 0.08409020086336028, + "learning_rate": 3.620286141777518e-06, + "loss": 0.8736, + "step": 29977 + }, + { + "epoch": 6.39462457337884, + "grad_norm": 0.07872443603800038, + "learning_rate": 3.6177555907262083e-06, + "loss": 0.882, + "step": 29978 + }, + { + "epoch": 6.3948378839590445, + "grad_norm": 0.08173276034556716, + "learning_rate": 3.615225903941264e-06, + "loss": 0.9139, + "step": 29979 + }, + { + "epoch": 6.395051194539249, + "grad_norm": 0.08388087088619219, + "learning_rate": 3.6126970814513373e-06, + "loss": 0.8893, + "step": 29980 + }, + { + "epoch": 6.395264505119454, + "grad_norm": 0.07606910463639686, + "learning_rate": 3.6101691232850457e-06, + "loss": 0.8576, + "step": 29981 + }, + { + "epoch": 6.395477815699659, + "grad_norm": 0.08131156205275088, + "learning_rate": 3.6076420294709523e-06, + "loss": 0.8764, + "step": 29982 + }, + { + "epoch": 6.395691126279863, + "grad_norm": 0.07664896187814683, + "learning_rate": 3.6051158000376753e-06, + "loss": 0.8773, + "step": 29983 + }, + { + "epoch": 6.395904436860068, + "grad_norm": 0.07980319179301697, + "learning_rate": 3.6025904350137954e-06, + "loss": 0.8792, + "step": 29984 + }, + { + "epoch": 6.3961177474402735, + "grad_norm": 0.07701450930812102, + "learning_rate": 3.600065934427903e-06, + "loss": 0.9044, + "step": 29985 + }, + { + "epoch": 6.396331058020478, + "grad_norm": 0.08442593880434442, + "learning_rate": 3.597542298308536e-06, + "loss": 0.886, + "step": 29986 + }, + { + "epoch": 6.396544368600683, + "grad_norm": 0.08840856047963751, + "learning_rate": 3.595019526684267e-06, + "loss": 0.8758, + "step": 29987 + }, + { + "epoch": 6.396757679180888, + "grad_norm": 0.08424351637526924, + "learning_rate": 3.5924976195836327e-06, + "loss": 0.8585, + "step": 29988 + }, + { + "epoch": 6.396970989761092, + "grad_norm": 0.07877521787855761, + "learning_rate": 3.589976577035188e-06, + "loss": 0.8991, + "step": 29989 + }, + { + "epoch": 6.397184300341297, + "grad_norm": 0.08178693158207173, + "learning_rate": 3.587456399067435e-06, + "loss": 0.8756, + "step": 29990 + }, + { + "epoch": 6.397397610921502, + "grad_norm": 0.0767119522683958, + "learning_rate": 3.584937085708884e-06, + "loss": 0.8563, + "step": 29991 + }, + { + "epoch": 6.397610921501706, + "grad_norm": 0.07697796399660874, + "learning_rate": 3.582418636988081e-06, + "loss": 0.8801, + "step": 29992 + }, + { + "epoch": 6.397824232081911, + "grad_norm": 0.08226750780703272, + "learning_rate": 3.579901052933492e-06, + "loss": 0.9004, + "step": 29993 + }, + { + "epoch": 6.398037542662116, + "grad_norm": 0.07896678988277718, + "learning_rate": 3.5773843335736015e-06, + "loss": 0.8536, + "step": 29994 + }, + { + "epoch": 6.398250853242321, + "grad_norm": 0.08531505949597193, + "learning_rate": 3.5748684789368837e-06, + "loss": 0.8795, + "step": 29995 + }, + { + "epoch": 6.398464163822526, + "grad_norm": 0.07897138510399913, + "learning_rate": 3.572353489051823e-06, + "loss": 0.8717, + "step": 29996 + }, + { + "epoch": 6.398677474402731, + "grad_norm": 0.08061609920800157, + "learning_rate": 3.5698393639468675e-06, + "loss": 0.8684, + "step": 29997 + }, + { + "epoch": 6.398890784982935, + "grad_norm": 0.08253807753520793, + "learning_rate": 3.567326103650466e-06, + "loss": 0.896, + "step": 29998 + }, + { + "epoch": 6.39910409556314, + "grad_norm": 0.08394637484744615, + "learning_rate": 3.564813708191048e-06, + "loss": 0.8803, + "step": 29999 + }, + { + "epoch": 6.399317406143345, + "grad_norm": 0.07871189331719637, + "learning_rate": 3.5623021775970544e-06, + "loss": 0.8703, + "step": 30000 + }, + { + "epoch": 6.399530716723549, + "grad_norm": 0.08626548526520049, + "learning_rate": 3.5597915118969063e-06, + "loss": 0.9066, + "step": 30001 + }, + { + "epoch": 6.399744027303754, + "grad_norm": 0.07786593846060087, + "learning_rate": 3.5572817111189896e-06, + "loss": 0.8931, + "step": 30002 + }, + { + "epoch": 6.399957337883959, + "grad_norm": 0.08113526339057023, + "learning_rate": 3.554772775291726e-06, + "loss": 0.8822, + "step": 30003 + }, + { + "epoch": 6.400170648464163, + "grad_norm": 0.07948770207391022, + "learning_rate": 3.552264704443493e-06, + "loss": 0.8689, + "step": 30004 + }, + { + "epoch": 6.400383959044369, + "grad_norm": 0.08194199858749439, + "learning_rate": 3.5497574986026683e-06, + "loss": 0.8935, + "step": 30005 + }, + { + "epoch": 6.400597269624574, + "grad_norm": 0.0779871965530043, + "learning_rate": 3.5472511577976197e-06, + "loss": 0.8858, + "step": 30006 + }, + { + "epoch": 6.400810580204778, + "grad_norm": 0.08167043812985811, + "learning_rate": 3.544745682056707e-06, + "loss": 0.8927, + "step": 30007 + }, + { + "epoch": 6.401023890784983, + "grad_norm": 0.08162544788575597, + "learning_rate": 3.54224107140829e-06, + "loss": 0.8659, + "step": 30008 + }, + { + "epoch": 6.401237201365188, + "grad_norm": 0.0823151360412787, + "learning_rate": 3.539737325880701e-06, + "loss": 0.8935, + "step": 30009 + }, + { + "epoch": 6.401450511945392, + "grad_norm": 0.07761851795395025, + "learning_rate": 3.5372344455022733e-06, + "loss": 0.8883, + "step": 30010 + }, + { + "epoch": 6.401663822525597, + "grad_norm": 0.09024183179445723, + "learning_rate": 3.5347324303013133e-06, + "loss": 0.9279, + "step": 30011 + }, + { + "epoch": 6.401877133105802, + "grad_norm": 0.08349435943367366, + "learning_rate": 3.532231280306153e-06, + "loss": 0.858, + "step": 30012 + }, + { + "epoch": 6.402090443686006, + "grad_norm": 0.0805914154079351, + "learning_rate": 3.529730995545073e-06, + "loss": 0.8772, + "step": 30013 + }, + { + "epoch": 6.402303754266212, + "grad_norm": 0.08744695513711787, + "learning_rate": 3.5272315760463794e-06, + "loss": 0.8773, + "step": 30014 + }, + { + "epoch": 6.402517064846417, + "grad_norm": 0.08514089065195814, + "learning_rate": 3.5247330218383514e-06, + "loss": 0.8708, + "step": 30015 + }, + { + "epoch": 6.402730375426621, + "grad_norm": 0.0881553518050565, + "learning_rate": 3.522235332949251e-06, + "loss": 0.8781, + "step": 30016 + }, + { + "epoch": 6.402943686006826, + "grad_norm": 0.07474116741239235, + "learning_rate": 3.51973850940734e-06, + "loss": 0.8777, + "step": 30017 + }, + { + "epoch": 6.403156996587031, + "grad_norm": 0.08810978610188386, + "learning_rate": 3.5172425512408803e-06, + "loss": 0.8982, + "step": 30018 + }, + { + "epoch": 6.403370307167235, + "grad_norm": 0.07380130054457051, + "learning_rate": 3.5147474584780984e-06, + "loss": 0.8546, + "step": 30019 + }, + { + "epoch": 6.40358361774744, + "grad_norm": 0.08408111634440875, + "learning_rate": 3.5122532311472466e-06, + "loss": 0.88, + "step": 30020 + }, + { + "epoch": 6.403796928327645, + "grad_norm": 0.08077029282270547, + "learning_rate": 3.5097598692765256e-06, + "loss": 0.9098, + "step": 30021 + }, + { + "epoch": 6.4040102389078495, + "grad_norm": 0.0770785933329755, + "learning_rate": 3.5072673728941696e-06, + "loss": 0.8915, + "step": 30022 + }, + { + "epoch": 6.404223549488055, + "grad_norm": 0.07480428973310055, + "learning_rate": 3.5047757420283525e-06, + "loss": 0.8666, + "step": 30023 + }, + { + "epoch": 6.40443686006826, + "grad_norm": 0.07925423391361787, + "learning_rate": 3.5022849767072997e-06, + "loss": 0.8893, + "step": 30024 + }, + { + "epoch": 6.404650170648464, + "grad_norm": 0.0770032816641933, + "learning_rate": 3.499795076959185e-06, + "loss": 0.8714, + "step": 30025 + }, + { + "epoch": 6.404863481228669, + "grad_norm": 0.0794365674981315, + "learning_rate": 3.4973060428121543e-06, + "loss": 0.8764, + "step": 30026 + }, + { + "epoch": 6.405076791808874, + "grad_norm": 0.08254042425313735, + "learning_rate": 3.4948178742944073e-06, + "loss": 0.8941, + "step": 30027 + }, + { + "epoch": 6.4052901023890785, + "grad_norm": 0.07618113030257077, + "learning_rate": 3.4923305714340816e-06, + "loss": 0.8817, + "step": 30028 + }, + { + "epoch": 6.405503412969283, + "grad_norm": 0.0754151230904443, + "learning_rate": 3.4898441342593236e-06, + "loss": 0.884, + "step": 30029 + }, + { + "epoch": 6.405716723549488, + "grad_norm": 0.07673459860039245, + "learning_rate": 3.487358562798262e-06, + "loss": 0.8674, + "step": 30030 + }, + { + "epoch": 6.4059300341296925, + "grad_norm": 0.07837655427121377, + "learning_rate": 3.4848738570790254e-06, + "loss": 0.8444, + "step": 30031 + }, + { + "epoch": 6.406143344709897, + "grad_norm": 0.08172043449910354, + "learning_rate": 3.4823900171297244e-06, + "loss": 0.8723, + "step": 30032 + }, + { + "epoch": 6.406356655290103, + "grad_norm": 0.0801516420398431, + "learning_rate": 3.4799070429784787e-06, + "loss": 0.9023, + "step": 30033 + }, + { + "epoch": 6.4065699658703075, + "grad_norm": 0.07504062636002422, + "learning_rate": 3.477424934653364e-06, + "loss": 0.8975, + "step": 30034 + }, + { + "epoch": 6.406783276450512, + "grad_norm": 0.07629766948943804, + "learning_rate": 3.474943692182464e-06, + "loss": 0.8753, + "step": 30035 + }, + { + "epoch": 6.406996587030717, + "grad_norm": 0.07986295959551362, + "learning_rate": 3.4724633155938813e-06, + "loss": 0.8736, + "step": 30036 + }, + { + "epoch": 6.4072098976109215, + "grad_norm": 0.08155157155593337, + "learning_rate": 3.469983804915664e-06, + "loss": 0.8485, + "step": 30037 + }, + { + "epoch": 6.407423208191126, + "grad_norm": 0.08258708146925817, + "learning_rate": 3.4675051601758524e-06, + "loss": 0.8964, + "step": 30038 + }, + { + "epoch": 6.407636518771331, + "grad_norm": 0.0799266879200829, + "learning_rate": 3.4650273814025126e-06, + "loss": 0.8976, + "step": 30039 + }, + { + "epoch": 6.407849829351536, + "grad_norm": 0.12581159230935227, + "learning_rate": 3.4625504686236843e-06, + "loss": 0.9059, + "step": 30040 + }, + { + "epoch": 6.40806313993174, + "grad_norm": 0.11093166469910389, + "learning_rate": 3.460074421867372e-06, + "loss": 0.8683, + "step": 30041 + }, + { + "epoch": 6.408276450511945, + "grad_norm": 0.07783091680176303, + "learning_rate": 3.4575992411616154e-06, + "loss": 0.876, + "step": 30042 + }, + { + "epoch": 6.4084897610921505, + "grad_norm": 0.07687980379797002, + "learning_rate": 3.4551249265343924e-06, + "loss": 0.8737, + "step": 30043 + }, + { + "epoch": 6.408703071672355, + "grad_norm": 0.08073213353963696, + "learning_rate": 3.4526514780137333e-06, + "loss": 0.8918, + "step": 30044 + }, + { + "epoch": 6.40891638225256, + "grad_norm": 0.07596535223187041, + "learning_rate": 3.450178895627616e-06, + "loss": 0.8828, + "step": 30045 + }, + { + "epoch": 6.409129692832765, + "grad_norm": 0.08067464806394932, + "learning_rate": 3.4477071794040008e-06, + "loss": 0.8765, + "step": 30046 + }, + { + "epoch": 6.409343003412969, + "grad_norm": 0.07697933879871252, + "learning_rate": 3.4452363293708645e-06, + "loss": 0.8733, + "step": 30047 + }, + { + "epoch": 6.409556313993174, + "grad_norm": 0.08221302622887143, + "learning_rate": 3.4427663455561856e-06, + "loss": 0.8743, + "step": 30048 + }, + { + "epoch": 6.409769624573379, + "grad_norm": 0.08267812310106858, + "learning_rate": 3.440297227987879e-06, + "loss": 0.8759, + "step": 30049 + }, + { + "epoch": 6.409982935153583, + "grad_norm": 0.07431507199452445, + "learning_rate": 3.4378289766938953e-06, + "loss": 0.8973, + "step": 30050 + }, + { + "epoch": 6.410196245733788, + "grad_norm": 0.07858087239136054, + "learning_rate": 3.4353615917021686e-06, + "loss": 0.8791, + "step": 30051 + }, + { + "epoch": 6.410409556313994, + "grad_norm": 0.08171323655850936, + "learning_rate": 3.432895073040614e-06, + "loss": 0.8657, + "step": 30052 + }, + { + "epoch": 6.410622866894198, + "grad_norm": 0.07887527118246582, + "learning_rate": 3.430429420737138e-06, + "loss": 0.8822, + "step": 30053 + }, + { + "epoch": 6.410836177474403, + "grad_norm": 0.08487959743061917, + "learning_rate": 3.427964634819647e-06, + "loss": 0.8474, + "step": 30054 + }, + { + "epoch": 6.411049488054608, + "grad_norm": 0.08786302843906459, + "learning_rate": 3.4255007153160037e-06, + "loss": 0.8961, + "step": 30055 + }, + { + "epoch": 6.411262798634812, + "grad_norm": 0.08202482378553046, + "learning_rate": 3.4230376622541227e-06, + "loss": 0.8752, + "step": 30056 + }, + { + "epoch": 6.411476109215017, + "grad_norm": 0.07881271267907528, + "learning_rate": 3.420575475661858e-06, + "loss": 0.8524, + "step": 30057 + }, + { + "epoch": 6.411689419795222, + "grad_norm": 0.08237727434842135, + "learning_rate": 3.4181141555670807e-06, + "loss": 0.8826, + "step": 30058 + }, + { + "epoch": 6.411902730375426, + "grad_norm": 0.07793736288967096, + "learning_rate": 3.4156537019976166e-06, + "loss": 0.8885, + "step": 30059 + }, + { + "epoch": 6.412116040955631, + "grad_norm": 0.07602579278963657, + "learning_rate": 3.4131941149813197e-06, + "loss": 0.8788, + "step": 30060 + }, + { + "epoch": 6.412329351535837, + "grad_norm": 0.07826620157191116, + "learning_rate": 3.4107353945460255e-06, + "loss": 0.8657, + "step": 30061 + }, + { + "epoch": 6.412542662116041, + "grad_norm": 0.07821279170734699, + "learning_rate": 3.4082775407195336e-06, + "loss": 0.8892, + "step": 30062 + }, + { + "epoch": 6.412755972696246, + "grad_norm": 0.08628804196636504, + "learning_rate": 3.4058205535296882e-06, + "loss": 0.871, + "step": 30063 + }, + { + "epoch": 6.412969283276451, + "grad_norm": 0.079476868278291, + "learning_rate": 3.4033644330042635e-06, + "loss": 0.8809, + "step": 30064 + }, + { + "epoch": 6.413182593856655, + "grad_norm": 0.0805370451948277, + "learning_rate": 3.400909179171059e-06, + "loss": 0.9125, + "step": 30065 + }, + { + "epoch": 6.41339590443686, + "grad_norm": 0.08047453970636367, + "learning_rate": 3.398454792057857e-06, + "loss": 0.8964, + "step": 30066 + }, + { + "epoch": 6.413609215017065, + "grad_norm": 0.08352160199229026, + "learning_rate": 3.3960012716924217e-06, + "loss": 0.8562, + "step": 30067 + }, + { + "epoch": 6.413822525597269, + "grad_norm": 0.07928345037650926, + "learning_rate": 3.3935486181025267e-06, + "loss": 0.8514, + "step": 30068 + }, + { + "epoch": 6.414035836177474, + "grad_norm": 0.07980687864159242, + "learning_rate": 3.39109683131591e-06, + "loss": 0.878, + "step": 30069 + }, + { + "epoch": 6.414249146757679, + "grad_norm": 0.07815028411630584, + "learning_rate": 3.3886459113603353e-06, + "loss": 0.8764, + "step": 30070 + }, + { + "epoch": 6.414462457337884, + "grad_norm": 0.08399303824816526, + "learning_rate": 3.386195858263515e-06, + "loss": 0.8809, + "step": 30071 + }, + { + "epoch": 6.414675767918089, + "grad_norm": 0.07658164667751047, + "learning_rate": 3.3837466720531765e-06, + "loss": 0.902, + "step": 30072 + }, + { + "epoch": 6.414889078498294, + "grad_norm": 0.077275449573438, + "learning_rate": 3.381298352757032e-06, + "loss": 0.8901, + "step": 30073 + }, + { + "epoch": 6.415102389078498, + "grad_norm": 0.07651355830127318, + "learning_rate": 3.378850900402775e-06, + "loss": 0.8701, + "step": 30074 + }, + { + "epoch": 6.415315699658703, + "grad_norm": 0.08088562714246045, + "learning_rate": 3.376404315018116e-06, + "loss": 0.8672, + "step": 30075 + }, + { + "epoch": 6.415529010238908, + "grad_norm": 0.08078026109278612, + "learning_rate": 3.3739585966307397e-06, + "loss": 0.9067, + "step": 30076 + }, + { + "epoch": 6.415742320819112, + "grad_norm": 0.07979694602827954, + "learning_rate": 3.3715137452683045e-06, + "loss": 0.8782, + "step": 30077 + }, + { + "epoch": 6.415955631399317, + "grad_norm": 0.07617313275561485, + "learning_rate": 3.3690697609584856e-06, + "loss": 0.891, + "step": 30078 + }, + { + "epoch": 6.416168941979522, + "grad_norm": 0.07921392891201412, + "learning_rate": 3.366626643728914e-06, + "loss": 0.8886, + "step": 30079 + }, + { + "epoch": 6.4163822525597265, + "grad_norm": 0.07874296537481344, + "learning_rate": 3.364184393607266e-06, + "loss": 0.87, + "step": 30080 + }, + { + "epoch": 6.416595563139932, + "grad_norm": 0.0792351570898051, + "learning_rate": 3.3617430106211723e-06, + "loss": 0.8799, + "step": 30081 + }, + { + "epoch": 6.416808873720137, + "grad_norm": 0.07853457986324448, + "learning_rate": 3.35930249479822e-06, + "loss": 0.9136, + "step": 30082 + }, + { + "epoch": 6.417022184300341, + "grad_norm": 0.07760062833096634, + "learning_rate": 3.356862846166058e-06, + "loss": 0.8693, + "step": 30083 + }, + { + "epoch": 6.417235494880546, + "grad_norm": 0.08057299789795926, + "learning_rate": 3.3544240647522907e-06, + "loss": 0.8548, + "step": 30084 + }, + { + "epoch": 6.417448805460751, + "grad_norm": 0.08096302977917101, + "learning_rate": 3.3519861505844966e-06, + "loss": 0.8824, + "step": 30085 + }, + { + "epoch": 6.4176621160409555, + "grad_norm": 0.07726736252656165, + "learning_rate": 3.3495491036902618e-06, + "loss": 0.8815, + "step": 30086 + }, + { + "epoch": 6.41787542662116, + "grad_norm": 0.07706421765109632, + "learning_rate": 3.3471129240971733e-06, + "loss": 0.8436, + "step": 30087 + }, + { + "epoch": 6.418088737201365, + "grad_norm": 0.07996087899842806, + "learning_rate": 3.344677611832801e-06, + "loss": 0.8524, + "step": 30088 + }, + { + "epoch": 6.4183020477815695, + "grad_norm": 0.07825650231731951, + "learning_rate": 3.3422431669246856e-06, + "loss": 0.8962, + "step": 30089 + }, + { + "epoch": 6.418515358361775, + "grad_norm": 0.07901631592764431, + "learning_rate": 3.3398095894003714e-06, + "loss": 0.895, + "step": 30090 + }, + { + "epoch": 6.41872866894198, + "grad_norm": 0.08090210276056244, + "learning_rate": 3.3373768792873997e-06, + "loss": 0.8662, + "step": 30091 + }, + { + "epoch": 6.4189419795221845, + "grad_norm": 0.0798558290767273, + "learning_rate": 3.3349450366133217e-06, + "loss": 0.8542, + "step": 30092 + }, + { + "epoch": 6.419155290102389, + "grad_norm": 0.08479749509721296, + "learning_rate": 3.3325140614056097e-06, + "loss": 0.9008, + "step": 30093 + }, + { + "epoch": 6.419368600682594, + "grad_norm": 0.0826767280053995, + "learning_rate": 3.3300839536917873e-06, + "loss": 0.8701, + "step": 30094 + }, + { + "epoch": 6.4195819112627985, + "grad_norm": 0.08052023586507108, + "learning_rate": 3.3276547134993618e-06, + "loss": 0.8977, + "step": 30095 + }, + { + "epoch": 6.419795221843003, + "grad_norm": 0.08408698175751365, + "learning_rate": 3.325226340855814e-06, + "loss": 0.89, + "step": 30096 + }, + { + "epoch": 6.420008532423208, + "grad_norm": 0.07952152570934976, + "learning_rate": 3.322798835788623e-06, + "loss": 0.8681, + "step": 30097 + }, + { + "epoch": 6.420221843003413, + "grad_norm": 0.0807278690170951, + "learning_rate": 3.3203721983252346e-06, + "loss": 0.8786, + "step": 30098 + }, + { + "epoch": 6.420435153583618, + "grad_norm": 0.08400066765794448, + "learning_rate": 3.3179464284931463e-06, + "loss": 0.8866, + "step": 30099 + }, + { + "epoch": 6.420648464163823, + "grad_norm": 0.07764803704989505, + "learning_rate": 3.315521526319776e-06, + "loss": 0.898, + "step": 30100 + }, + { + "epoch": 6.4208617747440275, + "grad_norm": 0.07573671578981053, + "learning_rate": 3.3130974918325686e-06, + "loss": 0.896, + "step": 30101 + }, + { + "epoch": 6.421075085324232, + "grad_norm": 0.09531560186995759, + "learning_rate": 3.3106743250589512e-06, + "loss": 0.9031, + "step": 30102 + }, + { + "epoch": 6.421288395904437, + "grad_norm": 0.08665079643259119, + "learning_rate": 3.308252026026342e-06, + "loss": 0.884, + "step": 30103 + }, + { + "epoch": 6.421501706484642, + "grad_norm": 0.08016497656318934, + "learning_rate": 3.305830594762167e-06, + "loss": 0.846, + "step": 30104 + }, + { + "epoch": 6.421715017064846, + "grad_norm": 0.0842385285957913, + "learning_rate": 3.303410031293801e-06, + "loss": 0.8747, + "step": 30105 + }, + { + "epoch": 6.421928327645051, + "grad_norm": 0.08548375983483826, + "learning_rate": 3.300990335648626e-06, + "loss": 0.8881, + "step": 30106 + }, + { + "epoch": 6.422141638225256, + "grad_norm": 0.08323465627369016, + "learning_rate": 3.2985715078540427e-06, + "loss": 0.8674, + "step": 30107 + }, + { + "epoch": 6.422354948805461, + "grad_norm": 0.08433090100450884, + "learning_rate": 3.296153547937424e-06, + "loss": 0.8487, + "step": 30108 + }, + { + "epoch": 6.422568259385666, + "grad_norm": 0.08771925071103516, + "learning_rate": 3.2937364559261087e-06, + "loss": 0.8913, + "step": 30109 + }, + { + "epoch": 6.422781569965871, + "grad_norm": 0.08240688245875746, + "learning_rate": 3.2913202318474615e-06, + "loss": 0.8899, + "step": 30110 + }, + { + "epoch": 6.422994880546075, + "grad_norm": 0.08089996626858173, + "learning_rate": 3.2889048757288024e-06, + "loss": 0.9174, + "step": 30111 + }, + { + "epoch": 6.42320819112628, + "grad_norm": 0.08643724759355752, + "learning_rate": 3.2864903875974875e-06, + "loss": 0.8511, + "step": 30112 + }, + { + "epoch": 6.423421501706485, + "grad_norm": 0.0757669365018056, + "learning_rate": 3.2840767674808192e-06, + "loss": 0.8563, + "step": 30113 + }, + { + "epoch": 6.423634812286689, + "grad_norm": 0.08367792994928239, + "learning_rate": 3.2816640154061187e-06, + "loss": 0.8906, + "step": 30114 + }, + { + "epoch": 6.423848122866894, + "grad_norm": 0.08733434360671465, + "learning_rate": 3.2792521314006785e-06, + "loss": 0.8669, + "step": 30115 + }, + { + "epoch": 6.424061433447099, + "grad_norm": 0.07987057699297771, + "learning_rate": 3.2768411154917847e-06, + "loss": 0.9211, + "step": 30116 + }, + { + "epoch": 6.424274744027303, + "grad_norm": 0.08290226031115627, + "learning_rate": 3.2744309677067297e-06, + "loss": 0.8491, + "step": 30117 + }, + { + "epoch": 6.424488054607509, + "grad_norm": 0.07927718317097228, + "learning_rate": 3.272021688072773e-06, + "loss": 0.8969, + "step": 30118 + }, + { + "epoch": 6.424701365187714, + "grad_norm": 0.08992851515022611, + "learning_rate": 3.2696132766171896e-06, + "loss": 0.9029, + "step": 30119 + }, + { + "epoch": 6.424914675767918, + "grad_norm": 0.08597238047951299, + "learning_rate": 3.2672057333672203e-06, + "loss": 0.8853, + "step": 30120 + }, + { + "epoch": 6.425127986348123, + "grad_norm": 0.07961940499319924, + "learning_rate": 3.2647990583501056e-06, + "loss": 0.8851, + "step": 30121 + }, + { + "epoch": 6.425341296928328, + "grad_norm": 0.08710406108550678, + "learning_rate": 3.2623932515930858e-06, + "loss": 0.8666, + "step": 30122 + }, + { + "epoch": 6.425554607508532, + "grad_norm": 0.08245592017841627, + "learning_rate": 3.2599883131233657e-06, + "loss": 0.8727, + "step": 30123 + }, + { + "epoch": 6.425767918088737, + "grad_norm": 0.09668733574749361, + "learning_rate": 3.257584242968177e-06, + "loss": 0.8661, + "step": 30124 + }, + { + "epoch": 6.425981228668942, + "grad_norm": 0.07490238396625659, + "learning_rate": 3.2551810411547156e-06, + "loss": 0.8742, + "step": 30125 + }, + { + "epoch": 6.426194539249146, + "grad_norm": 0.07878273615827353, + "learning_rate": 3.2527787077101693e-06, + "loss": 0.8679, + "step": 30126 + }, + { + "epoch": 6.426407849829351, + "grad_norm": 0.07820636657364102, + "learning_rate": 3.2503772426617242e-06, + "loss": 0.8867, + "step": 30127 + }, + { + "epoch": 6.426621160409557, + "grad_norm": 0.09083017020172109, + "learning_rate": 3.24797664603655e-06, + "loss": 0.8864, + "step": 30128 + }, + { + "epoch": 6.426834470989761, + "grad_norm": 0.07934126158103914, + "learning_rate": 3.245576917861808e-06, + "loss": 0.8797, + "step": 30129 + }, + { + "epoch": 6.427047781569966, + "grad_norm": 0.0832336037658456, + "learning_rate": 3.2431780581646488e-06, + "loss": 0.885, + "step": 30130 + }, + { + "epoch": 6.427261092150171, + "grad_norm": 0.08287425582784623, + "learning_rate": 3.240780066972233e-06, + "loss": 0.8662, + "step": 30131 + }, + { + "epoch": 6.427474402730375, + "grad_norm": 0.08656019471212008, + "learning_rate": 3.238382944311678e-06, + "loss": 0.8636, + "step": 30132 + }, + { + "epoch": 6.42768771331058, + "grad_norm": 0.08667272841921989, + "learning_rate": 3.2359866902101156e-06, + "loss": 0.8847, + "step": 30133 + }, + { + "epoch": 6.427901023890785, + "grad_norm": 0.08612065985800664, + "learning_rate": 3.2335913046946454e-06, + "loss": 0.8551, + "step": 30134 + }, + { + "epoch": 6.4281143344709895, + "grad_norm": 0.08283849033585174, + "learning_rate": 3.231196787792383e-06, + "loss": 0.8854, + "step": 30135 + }, + { + "epoch": 6.428327645051194, + "grad_norm": 0.08092585290546843, + "learning_rate": 3.2288031395304188e-06, + "loss": 0.8731, + "step": 30136 + }, + { + "epoch": 6.4285409556314, + "grad_norm": 0.0810782447634352, + "learning_rate": 3.226410359935859e-06, + "loss": 0.885, + "step": 30137 + }, + { + "epoch": 6.428754266211604, + "grad_norm": 0.08427288741567357, + "learning_rate": 3.2240184490357307e-06, + "loss": 0.888, + "step": 30138 + }, + { + "epoch": 6.428967576791809, + "grad_norm": 0.0822575326349203, + "learning_rate": 3.2216274068571417e-06, + "loss": 0.8605, + "step": 30139 + }, + { + "epoch": 6.429180887372014, + "grad_norm": 0.08474503941773447, + "learning_rate": 3.219237233427119e-06, + "loss": 0.8794, + "step": 30140 + }, + { + "epoch": 6.4293941979522184, + "grad_norm": 0.07960938516284717, + "learning_rate": 3.216847928772726e-06, + "loss": 0.8842, + "step": 30141 + }, + { + "epoch": 6.429607508532423, + "grad_norm": 0.0877672794713554, + "learning_rate": 3.2144594929209805e-06, + "loss": 0.8771, + "step": 30142 + }, + { + "epoch": 6.429820819112628, + "grad_norm": 0.08394289260869349, + "learning_rate": 3.2120719258989275e-06, + "loss": 0.8845, + "step": 30143 + }, + { + "epoch": 6.4300341296928325, + "grad_norm": 0.07905926100524811, + "learning_rate": 3.2096852277335676e-06, + "loss": 0.8384, + "step": 30144 + }, + { + "epoch": 6.430247440273037, + "grad_norm": 0.07843316325803526, + "learning_rate": 3.2072993984519106e-06, + "loss": 0.8778, + "step": 30145 + }, + { + "epoch": 6.430460750853243, + "grad_norm": 0.08058503300787977, + "learning_rate": 3.204914438080957e-06, + "loss": 0.8522, + "step": 30146 + }, + { + "epoch": 6.430674061433447, + "grad_norm": 0.08042623700872815, + "learning_rate": 3.2025303466476808e-06, + "loss": 0.8828, + "step": 30147 + }, + { + "epoch": 6.430887372013652, + "grad_norm": 0.08463243793443247, + "learning_rate": 3.2001471241790826e-06, + "loss": 0.8883, + "step": 30148 + }, + { + "epoch": 6.431100682593857, + "grad_norm": 0.08012158960226827, + "learning_rate": 3.197764770702101e-06, + "loss": 0.8468, + "step": 30149 + }, + { + "epoch": 6.4313139931740615, + "grad_norm": 0.08469857634348596, + "learning_rate": 3.195383286243692e-06, + "loss": 0.8836, + "step": 30150 + }, + { + "epoch": 6.431527303754266, + "grad_norm": 0.07756103378852701, + "learning_rate": 3.1930026708308202e-06, + "loss": 0.8818, + "step": 30151 + }, + { + "epoch": 6.431740614334471, + "grad_norm": 0.07994559470800261, + "learning_rate": 3.190622924490416e-06, + "loss": 0.8862, + "step": 30152 + }, + { + "epoch": 6.4319539249146755, + "grad_norm": 0.07911347285811719, + "learning_rate": 3.1882440472494093e-06, + "loss": 0.8768, + "step": 30153 + }, + { + "epoch": 6.43216723549488, + "grad_norm": 0.08803418721147771, + "learning_rate": 3.185866039134702e-06, + "loss": 0.8864, + "step": 30154 + }, + { + "epoch": 6.432380546075085, + "grad_norm": 0.08455121580629152, + "learning_rate": 3.183488900173224e-06, + "loss": 0.8987, + "step": 30155 + }, + { + "epoch": 6.4325938566552905, + "grad_norm": 0.08382478797266352, + "learning_rate": 3.1811126303918605e-06, + "loss": 0.9012, + "step": 30156 + }, + { + "epoch": 6.432807167235495, + "grad_norm": 0.08125461032807281, + "learning_rate": 3.178737229817497e-06, + "loss": 0.8841, + "step": 30157 + }, + { + "epoch": 6.4330204778157, + "grad_norm": 0.0806892308214298, + "learning_rate": 3.176362698477018e-06, + "loss": 0.8611, + "step": 30158 + }, + { + "epoch": 6.4332337883959045, + "grad_norm": 0.09284231887802527, + "learning_rate": 3.1739890363972738e-06, + "loss": 0.8926, + "step": 30159 + }, + { + "epoch": 6.433447098976109, + "grad_norm": 0.08983916056743259, + "learning_rate": 3.171616243605158e-06, + "loss": 0.9076, + "step": 30160 + }, + { + "epoch": 6.433660409556314, + "grad_norm": 0.10542464706726405, + "learning_rate": 3.1692443201274847e-06, + "loss": 0.8998, + "step": 30161 + }, + { + "epoch": 6.433873720136519, + "grad_norm": 0.09271144140319866, + "learning_rate": 3.166873265991095e-06, + "loss": 0.8896, + "step": 30162 + }, + { + "epoch": 6.434087030716723, + "grad_norm": 0.08277008629873299, + "learning_rate": 3.1645030812228383e-06, + "loss": 0.8758, + "step": 30163 + }, + { + "epoch": 6.434300341296928, + "grad_norm": 0.08862052839445982, + "learning_rate": 3.16213376584952e-06, + "loss": 0.8967, + "step": 30164 + }, + { + "epoch": 6.434513651877133, + "grad_norm": 0.08568803745286328, + "learning_rate": 3.159765319897945e-06, + "loss": 0.8956, + "step": 30165 + }, + { + "epoch": 6.434726962457338, + "grad_norm": 0.08414529888628824, + "learning_rate": 3.157397743394919e-06, + "loss": 0.9172, + "step": 30166 + }, + { + "epoch": 6.434940273037543, + "grad_norm": 0.07935460184555998, + "learning_rate": 3.155031036367229e-06, + "loss": 0.8612, + "step": 30167 + }, + { + "epoch": 6.435153583617748, + "grad_norm": 0.07459247604137878, + "learning_rate": 3.1526651988416624e-06, + "loss": 0.8617, + "step": 30168 + }, + { + "epoch": 6.435366894197952, + "grad_norm": 0.08243002553917936, + "learning_rate": 3.1503002308449805e-06, + "loss": 0.8948, + "step": 30169 + }, + { + "epoch": 6.435580204778157, + "grad_norm": 0.0799729781323778, + "learning_rate": 3.147936132403944e-06, + "loss": 0.8648, + "step": 30170 + }, + { + "epoch": 6.435793515358362, + "grad_norm": 0.08005803122219135, + "learning_rate": 3.1455729035453042e-06, + "loss": 0.8702, + "step": 30171 + }, + { + "epoch": 6.436006825938566, + "grad_norm": 0.07678843300053721, + "learning_rate": 3.143210544295796e-06, + "loss": 0.8899, + "step": 30172 + }, + { + "epoch": 6.436220136518771, + "grad_norm": 0.08101683479452787, + "learning_rate": 3.1408490546821537e-06, + "loss": 0.9132, + "step": 30173 + }, + { + "epoch": 6.436433447098976, + "grad_norm": 0.08078787543117859, + "learning_rate": 3.138488434731093e-06, + "loss": 0.9011, + "step": 30174 + }, + { + "epoch": 6.436646757679181, + "grad_norm": 0.07687015519868831, + "learning_rate": 3.1361286844693306e-06, + "loss": 0.8562, + "step": 30175 + }, + { + "epoch": 6.436860068259386, + "grad_norm": 0.07805533518742036, + "learning_rate": 3.133769803923574e-06, + "loss": 0.8801, + "step": 30176 + }, + { + "epoch": 6.437073378839591, + "grad_norm": 0.08427086238976873, + "learning_rate": 3.1314117931204956e-06, + "loss": 0.9006, + "step": 30177 + }, + { + "epoch": 6.437286689419795, + "grad_norm": 0.07663001156452276, + "learning_rate": 3.129054652086776e-06, + "loss": 0.8614, + "step": 30178 + }, + { + "epoch": 6.4375, + "grad_norm": 0.0815191287217908, + "learning_rate": 3.1266983808491134e-06, + "loss": 0.8957, + "step": 30179 + }, + { + "epoch": 6.437713310580205, + "grad_norm": 0.08224199005647696, + "learning_rate": 3.124342979434145e-06, + "loss": 0.885, + "step": 30180 + }, + { + "epoch": 6.437926621160409, + "grad_norm": 0.07552176029190082, + "learning_rate": 3.1219884478685337e-06, + "loss": 0.8703, + "step": 30181 + }, + { + "epoch": 6.438139931740614, + "grad_norm": 0.08554599046053599, + "learning_rate": 3.1196347861789157e-06, + "loss": 0.8538, + "step": 30182 + }, + { + "epoch": 6.438353242320819, + "grad_norm": 0.08491319240978008, + "learning_rate": 3.1172819943919187e-06, + "loss": 0.873, + "step": 30183 + }, + { + "epoch": 6.438566552901024, + "grad_norm": 0.08559979335245242, + "learning_rate": 3.114930072534179e-06, + "loss": 0.8981, + "step": 30184 + }, + { + "epoch": 6.438779863481229, + "grad_norm": 0.07772436407042209, + "learning_rate": 3.112579020632289e-06, + "loss": 0.8824, + "step": 30185 + }, + { + "epoch": 6.438993174061434, + "grad_norm": 0.08079610566988661, + "learning_rate": 3.11022883871285e-06, + "loss": 0.8824, + "step": 30186 + }, + { + "epoch": 6.439206484641638, + "grad_norm": 0.07571417142306612, + "learning_rate": 3.10787952680248e-06, + "loss": 0.8875, + "step": 30187 + }, + { + "epoch": 6.439419795221843, + "grad_norm": 0.08347407675803971, + "learning_rate": 3.1055310849277444e-06, + "loss": 0.8616, + "step": 30188 + }, + { + "epoch": 6.439633105802048, + "grad_norm": 0.08084072118453557, + "learning_rate": 3.1031835131152266e-06, + "loss": 0.8338, + "step": 30189 + }, + { + "epoch": 6.439846416382252, + "grad_norm": 0.08434822465292294, + "learning_rate": 3.1008368113914565e-06, + "loss": 0.9139, + "step": 30190 + }, + { + "epoch": 6.440059726962457, + "grad_norm": 0.07929039821008362, + "learning_rate": 3.0984909797830354e-06, + "loss": 0.8827, + "step": 30191 + }, + { + "epoch": 6.440273037542662, + "grad_norm": 0.08217575281787866, + "learning_rate": 3.096146018316475e-06, + "loss": 0.8733, + "step": 30192 + }, + { + "epoch": 6.440486348122867, + "grad_norm": 0.08131607913831489, + "learning_rate": 3.093801927018323e-06, + "loss": 0.8588, + "step": 30193 + }, + { + "epoch": 6.440699658703072, + "grad_norm": 0.08410643205287065, + "learning_rate": 3.0914587059150734e-06, + "loss": 0.8971, + "step": 30194 + }, + { + "epoch": 6.440912969283277, + "grad_norm": 0.08524818998682086, + "learning_rate": 3.0891163550332837e-06, + "loss": 0.8686, + "step": 30195 + }, + { + "epoch": 6.441126279863481, + "grad_norm": 0.08753061512661632, + "learning_rate": 3.0867748743994298e-06, + "loss": 0.8807, + "step": 30196 + }, + { + "epoch": 6.441339590443686, + "grad_norm": 0.08115396440967447, + "learning_rate": 3.084434264040006e-06, + "loss": 0.857, + "step": 30197 + }, + { + "epoch": 6.441552901023891, + "grad_norm": 0.08679806267700618, + "learning_rate": 3.082094523981498e-06, + "loss": 0.8817, + "step": 30198 + }, + { + "epoch": 6.4417662116040955, + "grad_norm": 0.08153322962095938, + "learning_rate": 3.0797556542504003e-06, + "loss": 0.8749, + "step": 30199 + }, + { + "epoch": 6.4419795221843, + "grad_norm": 0.08151417859739543, + "learning_rate": 3.077417654873154e-06, + "loss": 0.9022, + "step": 30200 + }, + { + "epoch": 6.442192832764505, + "grad_norm": 0.08090399623063844, + "learning_rate": 3.0750805258762174e-06, + "loss": 0.8898, + "step": 30201 + }, + { + "epoch": 6.4424061433447095, + "grad_norm": 0.08493681873118124, + "learning_rate": 3.072744267286041e-06, + "loss": 0.8606, + "step": 30202 + }, + { + "epoch": 6.442619453924915, + "grad_norm": 0.07986642561337898, + "learning_rate": 3.0704088791290565e-06, + "loss": 0.8821, + "step": 30203 + }, + { + "epoch": 6.44283276450512, + "grad_norm": 0.08560724591416213, + "learning_rate": 3.0680743614317055e-06, + "loss": 0.8733, + "step": 30204 + }, + { + "epoch": 6.4430460750853245, + "grad_norm": 0.07891141569344597, + "learning_rate": 3.0657407142203664e-06, + "loss": 0.8842, + "step": 30205 + }, + { + "epoch": 6.443259385665529, + "grad_norm": 0.07892233267445167, + "learning_rate": 3.063407937521463e-06, + "loss": 0.895, + "step": 30206 + }, + { + "epoch": 6.443472696245734, + "grad_norm": 0.08345033829471755, + "learning_rate": 3.0610760313614097e-06, + "loss": 0.8868, + "step": 30207 + }, + { + "epoch": 6.4436860068259385, + "grad_norm": 0.09076085446702592, + "learning_rate": 3.0587449957665672e-06, + "loss": 0.8947, + "step": 30208 + }, + { + "epoch": 6.443899317406143, + "grad_norm": 0.08113056926792817, + "learning_rate": 3.0564148307633146e-06, + "loss": 0.8861, + "step": 30209 + }, + { + "epoch": 6.444112627986348, + "grad_norm": 0.0825948443064296, + "learning_rate": 3.054085536378022e-06, + "loss": 0.9245, + "step": 30210 + }, + { + "epoch": 6.444325938566553, + "grad_norm": 0.08193205673115342, + "learning_rate": 3.0517571126370505e-06, + "loss": 0.8696, + "step": 30211 + }, + { + "epoch": 6.444539249146757, + "grad_norm": 0.08493753053571362, + "learning_rate": 3.0494295595667435e-06, + "loss": 0.8634, + "step": 30212 + }, + { + "epoch": 6.444752559726963, + "grad_norm": 0.08606739220579328, + "learning_rate": 3.0471028771934354e-06, + "loss": 0.8655, + "step": 30213 + }, + { + "epoch": 6.4449658703071675, + "grad_norm": 0.08703842138899169, + "learning_rate": 3.044777065543443e-06, + "loss": 0.8974, + "step": 30214 + }, + { + "epoch": 6.445179180887372, + "grad_norm": 0.08339040624409726, + "learning_rate": 3.0424521246431094e-06, + "loss": 0.8644, + "step": 30215 + }, + { + "epoch": 6.445392491467577, + "grad_norm": 0.08186318060979647, + "learning_rate": 3.040128054518716e-06, + "loss": 0.8939, + "step": 30216 + }, + { + "epoch": 6.4456058020477816, + "grad_norm": 0.08459046653487948, + "learning_rate": 3.0378048551965712e-06, + "loss": 0.8848, + "step": 30217 + }, + { + "epoch": 6.445819112627986, + "grad_norm": 0.08612223546778991, + "learning_rate": 3.035482526702946e-06, + "loss": 0.8988, + "step": 30218 + }, + { + "epoch": 6.446032423208191, + "grad_norm": 0.08607941190296946, + "learning_rate": 3.0331610690641413e-06, + "loss": 0.8779, + "step": 30219 + }, + { + "epoch": 6.446245733788396, + "grad_norm": 0.07950219758647868, + "learning_rate": 3.03084048230641e-06, + "loss": 0.891, + "step": 30220 + }, + { + "epoch": 6.4464590443686, + "grad_norm": 0.08002261737198207, + "learning_rate": 3.0285207664560155e-06, + "loss": 0.8709, + "step": 30221 + }, + { + "epoch": 6.446672354948806, + "grad_norm": 0.08187371575813407, + "learning_rate": 3.0262019215391958e-06, + "loss": 0.875, + "step": 30222 + }, + { + "epoch": 6.4468856655290105, + "grad_norm": 0.08720702631122072, + "learning_rate": 3.0238839475822046e-06, + "loss": 0.8817, + "step": 30223 + }, + { + "epoch": 6.447098976109215, + "grad_norm": 0.08233798151437105, + "learning_rate": 3.0215668446112613e-06, + "loss": 0.8975, + "step": 30224 + }, + { + "epoch": 6.44731228668942, + "grad_norm": 0.07638064372801705, + "learning_rate": 3.0192506126525843e-06, + "loss": 0.917, + "step": 30225 + }, + { + "epoch": 6.447525597269625, + "grad_norm": 0.07528357296638107, + "learning_rate": 3.016935251732376e-06, + "loss": 0.8833, + "step": 30226 + }, + { + "epoch": 6.447738907849829, + "grad_norm": 0.08299971030655376, + "learning_rate": 3.014620761876854e-06, + "loss": 0.8959, + "step": 30227 + }, + { + "epoch": 6.447952218430034, + "grad_norm": 0.0943605358060179, + "learning_rate": 3.0123071431121854e-06, + "loss": 0.8677, + "step": 30228 + }, + { + "epoch": 6.448165529010239, + "grad_norm": 0.08100625667465994, + "learning_rate": 3.0099943954645528e-06, + "loss": 0.8609, + "step": 30229 + }, + { + "epoch": 6.448378839590443, + "grad_norm": 0.08614232784790748, + "learning_rate": 3.0076825189601224e-06, + "loss": 0.9009, + "step": 30230 + }, + { + "epoch": 6.448592150170649, + "grad_norm": 0.090177526738976, + "learning_rate": 3.0053715136250683e-06, + "loss": 0.9065, + "step": 30231 + }, + { + "epoch": 6.448805460750854, + "grad_norm": 0.08606315757254746, + "learning_rate": 3.00306137948553e-06, + "loss": 0.9008, + "step": 30232 + }, + { + "epoch": 6.449018771331058, + "grad_norm": 0.08250185844209348, + "learning_rate": 3.000752116567647e-06, + "loss": 0.9053, + "step": 30233 + }, + { + "epoch": 6.449232081911263, + "grad_norm": 0.09544339462705778, + "learning_rate": 2.9984437248975396e-06, + "loss": 0.9092, + "step": 30234 + }, + { + "epoch": 6.449445392491468, + "grad_norm": 0.09459544138405848, + "learning_rate": 2.9961362045013474e-06, + "loss": 0.8493, + "step": 30235 + }, + { + "epoch": 6.449658703071672, + "grad_norm": 0.0872019191007116, + "learning_rate": 2.9938295554051746e-06, + "loss": 0.8735, + "step": 30236 + }, + { + "epoch": 6.449872013651877, + "grad_norm": 0.08378753264342506, + "learning_rate": 2.991523777635106e-06, + "loss": 0.8915, + "step": 30237 + }, + { + "epoch": 6.450085324232082, + "grad_norm": 0.09048481983709795, + "learning_rate": 2.9892188712172454e-06, + "loss": 0.8883, + "step": 30238 + }, + { + "epoch": 6.450298634812286, + "grad_norm": 0.08870581349585868, + "learning_rate": 2.98691483617767e-06, + "loss": 0.903, + "step": 30239 + }, + { + "epoch": 6.450511945392491, + "grad_norm": 0.10017327929102074, + "learning_rate": 2.9846116725424477e-06, + "loss": 0.8769, + "step": 30240 + }, + { + "epoch": 6.450725255972697, + "grad_norm": 0.07662612312009075, + "learning_rate": 2.982309380337638e-06, + "loss": 0.8613, + "step": 30241 + }, + { + "epoch": 6.450938566552901, + "grad_norm": 0.07249401684708284, + "learning_rate": 2.9800079595892817e-06, + "loss": 0.8617, + "step": 30242 + }, + { + "epoch": 6.451151877133106, + "grad_norm": 0.08496499606327103, + "learning_rate": 2.977707410323447e-06, + "loss": 0.9019, + "step": 30243 + }, + { + "epoch": 6.451365187713311, + "grad_norm": 0.0930965097759257, + "learning_rate": 2.9754077325661403e-06, + "loss": 0.8269, + "step": 30244 + }, + { + "epoch": 6.451578498293515, + "grad_norm": 0.08850337912911174, + "learning_rate": 2.973108926343393e-06, + "loss": 0.8975, + "step": 30245 + }, + { + "epoch": 6.45179180887372, + "grad_norm": 0.0820994388960557, + "learning_rate": 2.9708109916812124e-06, + "loss": 0.8652, + "step": 30246 + }, + { + "epoch": 6.452005119453925, + "grad_norm": 0.08395831977779326, + "learning_rate": 2.968513928605603e-06, + "loss": 0.8841, + "step": 30247 + }, + { + "epoch": 6.452218430034129, + "grad_norm": 0.0879514458132069, + "learning_rate": 2.966217737142554e-06, + "loss": 0.8973, + "step": 30248 + }, + { + "epoch": 6.452431740614334, + "grad_norm": 0.08050969721430737, + "learning_rate": 2.9639224173180524e-06, + "loss": 0.8672, + "step": 30249 + }, + { + "epoch": 6.452645051194539, + "grad_norm": 0.08826321264193883, + "learning_rate": 2.961627969158061e-06, + "loss": 0.8823, + "step": 30250 + }, + { + "epoch": 6.452858361774744, + "grad_norm": 0.09324064579593673, + "learning_rate": 2.95933439268854e-06, + "loss": 0.9099, + "step": 30251 + }, + { + "epoch": 6.453071672354949, + "grad_norm": 0.08636780861992883, + "learning_rate": 2.9570416879354513e-06, + "loss": 0.8744, + "step": 30252 + }, + { + "epoch": 6.453284982935154, + "grad_norm": 0.08877067728094436, + "learning_rate": 2.95474985492473e-06, + "loss": 0.8862, + "step": 30253 + }, + { + "epoch": 6.453498293515358, + "grad_norm": 0.0817482309830097, + "learning_rate": 2.952458893682302e-06, + "loss": 0.8796, + "step": 30254 + }, + { + "epoch": 6.453711604095563, + "grad_norm": 0.09072042143891246, + "learning_rate": 2.9501688042341102e-06, + "loss": 0.8801, + "step": 30255 + }, + { + "epoch": 6.453924914675768, + "grad_norm": 0.0849956612173831, + "learning_rate": 2.947879586606046e-06, + "loss": 0.8805, + "step": 30256 + }, + { + "epoch": 6.4541382252559725, + "grad_norm": 0.08467821916393578, + "learning_rate": 2.945591240824026e-06, + "loss": 0.8781, + "step": 30257 + }, + { + "epoch": 6.454351535836177, + "grad_norm": 0.07934046489982952, + "learning_rate": 2.9433037669139318e-06, + "loss": 0.8628, + "step": 30258 + }, + { + "epoch": 6.454564846416382, + "grad_norm": 0.07586119347999813, + "learning_rate": 2.9410171649016538e-06, + "loss": 0.8654, + "step": 30259 + }, + { + "epoch": 6.454778156996587, + "grad_norm": 0.08397974472359125, + "learning_rate": 2.9387314348130735e-06, + "loss": 0.9337, + "step": 30260 + }, + { + "epoch": 6.454991467576792, + "grad_norm": 0.09015111092601766, + "learning_rate": 2.936446576674028e-06, + "loss": 0.8688, + "step": 30261 + }, + { + "epoch": 6.455204778156997, + "grad_norm": 0.07854718517590617, + "learning_rate": 2.9341625905103897e-06, + "loss": 0.8677, + "step": 30262 + }, + { + "epoch": 6.4554180887372015, + "grad_norm": 0.08448192623764687, + "learning_rate": 2.931879476348005e-06, + "loss": 0.8739, + "step": 30263 + }, + { + "epoch": 6.455631399317406, + "grad_norm": 0.07996042379327106, + "learning_rate": 2.929597234212693e-06, + "loss": 0.8806, + "step": 30264 + }, + { + "epoch": 6.455844709897611, + "grad_norm": 0.07955086181241525, + "learning_rate": 2.9273158641302913e-06, + "loss": 0.8344, + "step": 30265 + }, + { + "epoch": 6.4560580204778155, + "grad_norm": 0.07912026988896192, + "learning_rate": 2.9250353661265917e-06, + "loss": 0.8804, + "step": 30266 + }, + { + "epoch": 6.45627133105802, + "grad_norm": 0.08691103063365874, + "learning_rate": 2.922755740227432e-06, + "loss": 0.9124, + "step": 30267 + }, + { + "epoch": 6.456484641638225, + "grad_norm": 0.08591269395967761, + "learning_rate": 2.920476986458578e-06, + "loss": 0.8693, + "step": 30268 + }, + { + "epoch": 6.4566979522184305, + "grad_norm": 0.0841987202311267, + "learning_rate": 2.9181991048458225e-06, + "loss": 0.8873, + "step": 30269 + }, + { + "epoch": 6.456911262798635, + "grad_norm": 0.08789888117491894, + "learning_rate": 2.9159220954149404e-06, + "loss": 0.8827, + "step": 30270 + }, + { + "epoch": 6.45712457337884, + "grad_norm": 0.08333150578272575, + "learning_rate": 2.9136459581917064e-06, + "loss": 0.8457, + "step": 30271 + }, + { + "epoch": 6.4573378839590445, + "grad_norm": 0.07689276521873811, + "learning_rate": 2.9113706932018603e-06, + "loss": 0.8921, + "step": 30272 + }, + { + "epoch": 6.457551194539249, + "grad_norm": 0.08451241371662575, + "learning_rate": 2.9090963004711413e-06, + "loss": 0.9105, + "step": 30273 + }, + { + "epoch": 6.457764505119454, + "grad_norm": 0.0785460927031421, + "learning_rate": 2.9068227800253067e-06, + "loss": 0.8719, + "step": 30274 + }, + { + "epoch": 6.457977815699659, + "grad_norm": 0.08125161835341757, + "learning_rate": 2.9045501318900603e-06, + "loss": 0.898, + "step": 30275 + }, + { + "epoch": 6.458191126279863, + "grad_norm": 0.09826409749593866, + "learning_rate": 2.9022783560911327e-06, + "loss": 0.8921, + "step": 30276 + }, + { + "epoch": 6.458404436860068, + "grad_norm": 0.07931459376806611, + "learning_rate": 2.9000074526542187e-06, + "loss": 0.8696, + "step": 30277 + }, + { + "epoch": 6.4586177474402735, + "grad_norm": 0.08000381969455339, + "learning_rate": 2.8977374216050135e-06, + "loss": 0.8685, + "step": 30278 + }, + { + "epoch": 6.458831058020478, + "grad_norm": 0.08025509429045061, + "learning_rate": 2.8954682629692123e-06, + "loss": 0.8868, + "step": 30279 + }, + { + "epoch": 6.459044368600683, + "grad_norm": 0.0793105828709429, + "learning_rate": 2.8931999767724826e-06, + "loss": 0.8707, + "step": 30280 + }, + { + "epoch": 6.459257679180888, + "grad_norm": 0.08065913848777262, + "learning_rate": 2.890932563040494e-06, + "loss": 0.8868, + "step": 30281 + }, + { + "epoch": 6.459470989761092, + "grad_norm": 0.08066409618120657, + "learning_rate": 2.888666021798887e-06, + "loss": 0.8631, + "step": 30282 + }, + { + "epoch": 6.459684300341297, + "grad_norm": 0.07943117122831764, + "learning_rate": 2.8864003530733487e-06, + "loss": 0.8884, + "step": 30283 + }, + { + "epoch": 6.459897610921502, + "grad_norm": 0.07613634474192857, + "learning_rate": 2.884135556889476e-06, + "loss": 0.8879, + "step": 30284 + }, + { + "epoch": 6.460110921501706, + "grad_norm": 0.07605171969120454, + "learning_rate": 2.8818716332728926e-06, + "loss": 0.8706, + "step": 30285 + }, + { + "epoch": 6.460324232081911, + "grad_norm": 0.08680004869274059, + "learning_rate": 2.8796085822492402e-06, + "loss": 0.8913, + "step": 30286 + }, + { + "epoch": 6.460537542662116, + "grad_norm": 0.080508316077922, + "learning_rate": 2.877346403844108e-06, + "loss": 0.8793, + "step": 30287 + }, + { + "epoch": 6.460750853242321, + "grad_norm": 0.08259960731226339, + "learning_rate": 2.87508509808311e-06, + "loss": 0.8742, + "step": 30288 + }, + { + "epoch": 6.460964163822526, + "grad_norm": 0.08182069350292735, + "learning_rate": 2.872824664991818e-06, + "loss": 0.8952, + "step": 30289 + }, + { + "epoch": 6.461177474402731, + "grad_norm": 0.08047379029137673, + "learning_rate": 2.8705651045957925e-06, + "loss": 0.8809, + "step": 30290 + }, + { + "epoch": 6.461390784982935, + "grad_norm": 0.07556324923333864, + "learning_rate": 2.868306416920641e-06, + "loss": 0.8411, + "step": 30291 + }, + { + "epoch": 6.46160409556314, + "grad_norm": 0.07954147451857901, + "learning_rate": 2.866048601991898e-06, + "loss": 0.8468, + "step": 30292 + }, + { + "epoch": 6.461817406143345, + "grad_norm": 0.08050533050259191, + "learning_rate": 2.8637916598351066e-06, + "loss": 0.8996, + "step": 30293 + }, + { + "epoch": 6.462030716723549, + "grad_norm": 0.07743932311022218, + "learning_rate": 2.8615355904758214e-06, + "loss": 0.8732, + "step": 30294 + }, + { + "epoch": 6.462244027303754, + "grad_norm": 0.08445060933823856, + "learning_rate": 2.8592803939395495e-06, + "loss": 0.9275, + "step": 30295 + }, + { + "epoch": 6.462457337883959, + "grad_norm": 0.0788802551607132, + "learning_rate": 2.857026070251818e-06, + "loss": 0.8705, + "step": 30296 + }, + { + "epoch": 6.462670648464163, + "grad_norm": 0.07400088427166303, + "learning_rate": 2.8547726194381266e-06, + "loss": 0.8714, + "step": 30297 + }, + { + "epoch": 6.462883959044369, + "grad_norm": 0.07731930344007043, + "learning_rate": 2.852520041523992e-06, + "loss": 0.8841, + "step": 30298 + }, + { + "epoch": 6.463097269624574, + "grad_norm": 0.07890427861599417, + "learning_rate": 2.8502683365348873e-06, + "loss": 0.9073, + "step": 30299 + }, + { + "epoch": 6.463310580204778, + "grad_norm": 0.0821081006021538, + "learning_rate": 2.8480175044962945e-06, + "loss": 0.8908, + "step": 30300 + }, + { + "epoch": 6.463523890784983, + "grad_norm": 0.08409708420613989, + "learning_rate": 2.8457675454336773e-06, + "loss": 0.8821, + "step": 30301 + }, + { + "epoch": 6.463737201365188, + "grad_norm": 0.07591702459799148, + "learning_rate": 2.8435184593724917e-06, + "loss": 0.9001, + "step": 30302 + }, + { + "epoch": 6.463950511945392, + "grad_norm": 0.08115346387153963, + "learning_rate": 2.8412702463382013e-06, + "loss": 0.9, + "step": 30303 + }, + { + "epoch": 6.464163822525597, + "grad_norm": 0.07571033423164426, + "learning_rate": 2.8390229063562436e-06, + "loss": 0.8553, + "step": 30304 + }, + { + "epoch": 6.464377133105802, + "grad_norm": 0.08360154225895054, + "learning_rate": 2.836776439452029e-06, + "loss": 0.895, + "step": 30305 + }, + { + "epoch": 6.464590443686006, + "grad_norm": 0.07790953449376911, + "learning_rate": 2.8345308456509958e-06, + "loss": 0.8509, + "step": 30306 + }, + { + "epoch": 6.464803754266212, + "grad_norm": 0.07951667021274779, + "learning_rate": 2.832286124978536e-06, + "loss": 0.8833, + "step": 30307 + }, + { + "epoch": 6.465017064846417, + "grad_norm": 0.07883227732998607, + "learning_rate": 2.8300422774600612e-06, + "loss": 0.9069, + "step": 30308 + }, + { + "epoch": 6.465230375426621, + "grad_norm": 0.08441804955512788, + "learning_rate": 2.827799303120946e-06, + "loss": 0.9051, + "step": 30309 + }, + { + "epoch": 6.465443686006826, + "grad_norm": 0.07953294403488992, + "learning_rate": 2.825557201986593e-06, + "loss": 0.8835, + "step": 30310 + }, + { + "epoch": 6.465656996587031, + "grad_norm": 0.074506259648117, + "learning_rate": 2.823315974082359e-06, + "loss": 0.9001, + "step": 30311 + }, + { + "epoch": 6.465870307167235, + "grad_norm": 0.07619862322591872, + "learning_rate": 2.821075619433593e-06, + "loss": 0.8735, + "step": 30312 + }, + { + "epoch": 6.46608361774744, + "grad_norm": 0.07501416927107783, + "learning_rate": 2.8188361380656616e-06, + "loss": 0.89, + "step": 30313 + }, + { + "epoch": 6.466296928327645, + "grad_norm": 0.07756118090682468, + "learning_rate": 2.816597530003886e-06, + "loss": 0.8821, + "step": 30314 + }, + { + "epoch": 6.4665102389078495, + "grad_norm": 0.08052607929636435, + "learning_rate": 2.814359795273616e-06, + "loss": 0.9062, + "step": 30315 + }, + { + "epoch": 6.466723549488055, + "grad_norm": 0.07906610608576851, + "learning_rate": 2.812122933900172e-06, + "loss": 0.9025, + "step": 30316 + }, + { + "epoch": 6.46693686006826, + "grad_norm": 0.08046693401992834, + "learning_rate": 2.8098869459088417e-06, + "loss": 0.9086, + "step": 30317 + }, + { + "epoch": 6.467150170648464, + "grad_norm": 0.07687725302333674, + "learning_rate": 2.807651831324947e-06, + "loss": 0.8843, + "step": 30318 + }, + { + "epoch": 6.467363481228669, + "grad_norm": 0.0802516801722047, + "learning_rate": 2.8054175901737645e-06, + "loss": 0.8776, + "step": 30319 + }, + { + "epoch": 6.467576791808874, + "grad_norm": 0.08017069152986846, + "learning_rate": 2.8031842224805907e-06, + "loss": 0.9062, + "step": 30320 + }, + { + "epoch": 6.4677901023890785, + "grad_norm": 0.07657671392784585, + "learning_rate": 2.8009517282706666e-06, + "loss": 0.8614, + "step": 30321 + }, + { + "epoch": 6.468003412969283, + "grad_norm": 0.0723714574109318, + "learning_rate": 2.798720107569288e-06, + "loss": 0.8408, + "step": 30322 + }, + { + "epoch": 6.468216723549488, + "grad_norm": 0.08050467699627684, + "learning_rate": 2.7964893604016886e-06, + "loss": 0.8752, + "step": 30323 + }, + { + "epoch": 6.4684300341296925, + "grad_norm": 0.08268966819060623, + "learning_rate": 2.7942594867931094e-06, + "loss": 0.8645, + "step": 30324 + }, + { + "epoch": 6.468643344709897, + "grad_norm": 0.07302125871821864, + "learning_rate": 2.792030486768784e-06, + "loss": 0.8488, + "step": 30325 + }, + { + "epoch": 6.468856655290103, + "grad_norm": 0.0787513830684877, + "learning_rate": 2.789802360353919e-06, + "loss": 0.8686, + "step": 30326 + }, + { + "epoch": 6.4690699658703075, + "grad_norm": 0.07525045177647893, + "learning_rate": 2.7875751075737566e-06, + "loss": 0.8926, + "step": 30327 + }, + { + "epoch": 6.469283276450512, + "grad_norm": 0.07532159198358064, + "learning_rate": 2.7853487284534764e-06, + "loss": 0.8799, + "step": 30328 + }, + { + "epoch": 6.469496587030717, + "grad_norm": 0.07707182873496937, + "learning_rate": 2.783123223018267e-06, + "loss": 0.8817, + "step": 30329 + }, + { + "epoch": 6.4697098976109215, + "grad_norm": 0.07419081498276885, + "learning_rate": 2.7808985912933173e-06, + "loss": 0.8695, + "step": 30330 + }, + { + "epoch": 6.469923208191126, + "grad_norm": 0.07882118056363716, + "learning_rate": 2.7786748333038073e-06, + "loss": 0.9067, + "step": 30331 + }, + { + "epoch": 6.470136518771331, + "grad_norm": 0.08276708110926402, + "learning_rate": 2.7764519490748897e-06, + "loss": 0.8787, + "step": 30332 + }, + { + "epoch": 6.470349829351536, + "grad_norm": 0.07272537002197146, + "learning_rate": 2.7742299386317007e-06, + "loss": 0.8824, + "step": 30333 + }, + { + "epoch": 6.47056313993174, + "grad_norm": 0.07610759882279905, + "learning_rate": 2.7720088019994196e-06, + "loss": 0.8595, + "step": 30334 + }, + { + "epoch": 6.470776450511945, + "grad_norm": 0.07785740035266188, + "learning_rate": 2.7697885392031466e-06, + "loss": 0.8734, + "step": 30335 + }, + { + "epoch": 6.4709897610921505, + "grad_norm": 0.08064270615501955, + "learning_rate": 2.767569150268026e-06, + "loss": 0.8979, + "step": 30336 + }, + { + "epoch": 6.471203071672355, + "grad_norm": 0.08391798635246508, + "learning_rate": 2.765350635219148e-06, + "loss": 0.8759, + "step": 30337 + }, + { + "epoch": 6.47141638225256, + "grad_norm": 0.07659752395898944, + "learning_rate": 2.763132994081623e-06, + "loss": 0.8542, + "step": 30338 + }, + { + "epoch": 6.471629692832765, + "grad_norm": 0.07935745400120219, + "learning_rate": 2.7609162268805677e-06, + "loss": 0.8829, + "step": 30339 + }, + { + "epoch": 6.471843003412969, + "grad_norm": 0.07930975879511863, + "learning_rate": 2.7587003336410377e-06, + "loss": 0.8561, + "step": 30340 + }, + { + "epoch": 6.472056313993174, + "grad_norm": 0.07751649304160774, + "learning_rate": 2.7564853143880975e-06, + "loss": 0.8612, + "step": 30341 + }, + { + "epoch": 6.472269624573379, + "grad_norm": 0.08154152661273338, + "learning_rate": 2.754271169146838e-06, + "loss": 0.8472, + "step": 30342 + }, + { + "epoch": 6.472482935153583, + "grad_norm": 0.0798724968999898, + "learning_rate": 2.7520578979422974e-06, + "loss": 0.8945, + "step": 30343 + }, + { + "epoch": 6.472696245733788, + "grad_norm": 0.07689447794585776, + "learning_rate": 2.749845500799522e-06, + "loss": 0.8667, + "step": 30344 + }, + { + "epoch": 6.472909556313994, + "grad_norm": 0.07800141272883697, + "learning_rate": 2.747633977743531e-06, + "loss": 0.8936, + "step": 30345 + }, + { + "epoch": 6.473122866894198, + "grad_norm": 0.07463137972366966, + "learning_rate": 2.7454233287993724e-06, + "loss": 0.8885, + "step": 30346 + }, + { + "epoch": 6.473336177474403, + "grad_norm": 0.08020051233759028, + "learning_rate": 2.7432135539920477e-06, + "loss": 0.898, + "step": 30347 + }, + { + "epoch": 6.473549488054608, + "grad_norm": 0.0799642146906279, + "learning_rate": 2.7410046533465685e-06, + "loss": 0.8595, + "step": 30348 + }, + { + "epoch": 6.473762798634812, + "grad_norm": 0.07702147554325108, + "learning_rate": 2.7387966268879097e-06, + "loss": 0.8847, + "step": 30349 + }, + { + "epoch": 6.473976109215017, + "grad_norm": 0.08377543168343554, + "learning_rate": 2.736589474641074e-06, + "loss": 0.8795, + "step": 30350 + }, + { + "epoch": 6.474189419795222, + "grad_norm": 0.08423246479919551, + "learning_rate": 2.7343831966310273e-06, + "loss": 0.9313, + "step": 30351 + }, + { + "epoch": 6.474402730375426, + "grad_norm": 0.0754840023587779, + "learning_rate": 2.7321777928827377e-06, + "loss": 0.8939, + "step": 30352 + }, + { + "epoch": 6.474616040955631, + "grad_norm": 0.07871574423857068, + "learning_rate": 2.7299732634211353e-06, + "loss": 0.9067, + "step": 30353 + }, + { + "epoch": 6.474829351535837, + "grad_norm": 0.075041854972634, + "learning_rate": 2.7277696082712045e-06, + "loss": 0.8547, + "step": 30354 + }, + { + "epoch": 6.475042662116041, + "grad_norm": 0.08094856077160373, + "learning_rate": 2.7255668274578597e-06, + "loss": 0.8516, + "step": 30355 + }, + { + "epoch": 6.475255972696246, + "grad_norm": 0.07523732642035573, + "learning_rate": 2.723364921006022e-06, + "loss": 0.861, + "step": 30356 + }, + { + "epoch": 6.475469283276451, + "grad_norm": 0.08697174316089787, + "learning_rate": 2.7211638889406057e-06, + "loss": 0.9118, + "step": 30357 + }, + { + "epoch": 6.475682593856655, + "grad_norm": 0.08216278971730528, + "learning_rate": 2.7189637312865237e-06, + "loss": 0.9129, + "step": 30358 + }, + { + "epoch": 6.47589590443686, + "grad_norm": 0.08320309516884834, + "learning_rate": 2.7167644480686717e-06, + "loss": 0.8694, + "step": 30359 + }, + { + "epoch": 6.476109215017065, + "grad_norm": 0.07844607020415227, + "learning_rate": 2.714566039311928e-06, + "loss": 0.8529, + "step": 30360 + }, + { + "epoch": 6.476322525597269, + "grad_norm": 0.0831128388481102, + "learning_rate": 2.7123685050411697e-06, + "loss": 0.8975, + "step": 30361 + }, + { + "epoch": 6.476535836177474, + "grad_norm": 0.07694250496278528, + "learning_rate": 2.7101718452812574e-06, + "loss": 0.8776, + "step": 30362 + }, + { + "epoch": 6.476749146757679, + "grad_norm": 0.07999477268749738, + "learning_rate": 2.7079760600570516e-06, + "loss": 0.8675, + "step": 30363 + }, + { + "epoch": 6.476962457337884, + "grad_norm": 0.07766255575112027, + "learning_rate": 2.705781149393403e-06, + "loss": 0.878, + "step": 30364 + }, + { + "epoch": 6.477175767918089, + "grad_norm": 0.08023242178353968, + "learning_rate": 2.703587113315127e-06, + "loss": 0.8801, + "step": 30365 + }, + { + "epoch": 6.477389078498294, + "grad_norm": 0.0901605537406617, + "learning_rate": 2.7013939518470756e-06, + "loss": 0.883, + "step": 30366 + }, + { + "epoch": 6.477602389078498, + "grad_norm": 0.09079683326161725, + "learning_rate": 2.6992016650140463e-06, + "loss": 0.8872, + "step": 30367 + }, + { + "epoch": 6.477815699658703, + "grad_norm": 0.07430896715909882, + "learning_rate": 2.697010252840855e-06, + "loss": 0.8883, + "step": 30368 + }, + { + "epoch": 6.478029010238908, + "grad_norm": 0.07533645272918656, + "learning_rate": 2.6948197153522815e-06, + "loss": 0.9006, + "step": 30369 + }, + { + "epoch": 6.478242320819112, + "grad_norm": 0.0767402089588698, + "learning_rate": 2.692630052573133e-06, + "loss": 0.8829, + "step": 30370 + }, + { + "epoch": 6.478455631399317, + "grad_norm": 0.08242521952684367, + "learning_rate": 2.690441264528181e-06, + "loss": 0.8557, + "step": 30371 + }, + { + "epoch": 6.478668941979522, + "grad_norm": 0.07635212734665173, + "learning_rate": 2.688253351242187e-06, + "loss": 0.8802, + "step": 30372 + }, + { + "epoch": 6.4788822525597265, + "grad_norm": 0.07425949680682928, + "learning_rate": 2.6860663127398877e-06, + "loss": 0.8822, + "step": 30373 + }, + { + "epoch": 6.479095563139932, + "grad_norm": 0.07509063028942584, + "learning_rate": 2.683880149046063e-06, + "loss": 0.8868, + "step": 30374 + }, + { + "epoch": 6.479308873720137, + "grad_norm": 0.07388299708566318, + "learning_rate": 2.6816948601854397e-06, + "loss": 0.8609, + "step": 30375 + }, + { + "epoch": 6.479522184300341, + "grad_norm": 0.07743540098269025, + "learning_rate": 2.679510446182727e-06, + "loss": 0.8657, + "step": 30376 + }, + { + "epoch": 6.479735494880546, + "grad_norm": 0.08424860782121239, + "learning_rate": 2.6773269070626517e-06, + "loss": 0.8947, + "step": 30377 + }, + { + "epoch": 6.479948805460751, + "grad_norm": 0.078869856908769, + "learning_rate": 2.6751442428499317e-06, + "loss": 0.8943, + "step": 30378 + }, + { + "epoch": 6.4801621160409555, + "grad_norm": 0.07853617063603123, + "learning_rate": 2.672962453569259e-06, + "loss": 0.881, + "step": 30379 + }, + { + "epoch": 6.48037542662116, + "grad_norm": 0.08039175669667063, + "learning_rate": 2.670781539245315e-06, + "loss": 0.8608, + "step": 30380 + }, + { + "epoch": 6.480588737201365, + "grad_norm": 0.07923039702191767, + "learning_rate": 2.6686014999027743e-06, + "loss": 0.8693, + "step": 30381 + }, + { + "epoch": 6.4808020477815695, + "grad_norm": 0.07323937441567775, + "learning_rate": 2.666422335566301e-06, + "loss": 0.8543, + "step": 30382 + }, + { + "epoch": 6.481015358361775, + "grad_norm": 0.08075438798070568, + "learning_rate": 2.664244046260587e-06, + "loss": 0.8562, + "step": 30383 + }, + { + "epoch": 6.48122866894198, + "grad_norm": 0.08618300397567116, + "learning_rate": 2.662066632010234e-06, + "loss": 0.9262, + "step": 30384 + }, + { + "epoch": 6.4814419795221845, + "grad_norm": 0.07673021637014234, + "learning_rate": 2.6598900928398897e-06, + "loss": 0.889, + "step": 30385 + }, + { + "epoch": 6.481655290102389, + "grad_norm": 0.07376675845868666, + "learning_rate": 2.6577144287742006e-06, + "loss": 0.8951, + "step": 30386 + }, + { + "epoch": 6.481868600682594, + "grad_norm": 0.07645653011429236, + "learning_rate": 2.65553963983777e-06, + "loss": 0.8643, + "step": 30387 + }, + { + "epoch": 6.4820819112627985, + "grad_norm": 0.08052518871425256, + "learning_rate": 2.653365726055208e-06, + "loss": 0.8839, + "step": 30388 + }, + { + "epoch": 6.482295221843003, + "grad_norm": 0.07685248099851677, + "learning_rate": 2.6511926874511094e-06, + "loss": 0.8664, + "step": 30389 + }, + { + "epoch": 6.482508532423208, + "grad_norm": 0.07860155906138755, + "learning_rate": 2.6490205240500765e-06, + "loss": 0.8692, + "step": 30390 + }, + { + "epoch": 6.482721843003413, + "grad_norm": 0.07579412908164515, + "learning_rate": 2.6468492358766763e-06, + "loss": 0.885, + "step": 30391 + }, + { + "epoch": 6.482935153583618, + "grad_norm": 0.07959203742781558, + "learning_rate": 2.6446788229554755e-06, + "loss": 0.8708, + "step": 30392 + }, + { + "epoch": 6.483148464163823, + "grad_norm": 0.07651160980168895, + "learning_rate": 2.6425092853110413e-06, + "loss": 0.8855, + "step": 30393 + }, + { + "epoch": 6.4833617747440275, + "grad_norm": 0.07820480413554694, + "learning_rate": 2.640340622967914e-06, + "loss": 0.8708, + "step": 30394 + }, + { + "epoch": 6.483575085324232, + "grad_norm": 0.07952737425741238, + "learning_rate": 2.6381728359506253e-06, + "loss": 0.904, + "step": 30395 + }, + { + "epoch": 6.483788395904437, + "grad_norm": 0.07201220847959172, + "learning_rate": 2.6360059242837244e-06, + "loss": 0.8933, + "step": 30396 + }, + { + "epoch": 6.484001706484642, + "grad_norm": 0.0857749313833109, + "learning_rate": 2.633839887991698e-06, + "loss": 0.9207, + "step": 30397 + }, + { + "epoch": 6.484215017064846, + "grad_norm": 0.07394742058383123, + "learning_rate": 2.6316747270990872e-06, + "loss": 0.865, + "step": 30398 + }, + { + "epoch": 6.484428327645051, + "grad_norm": 0.07575337574558885, + "learning_rate": 2.6295104416303785e-06, + "loss": 0.8754, + "step": 30399 + }, + { + "epoch": 6.484641638225256, + "grad_norm": 0.08321011174890607, + "learning_rate": 2.627347031610068e-06, + "loss": 0.8746, + "step": 30400 + }, + { + "epoch": 6.484854948805461, + "grad_norm": 0.08055958548082297, + "learning_rate": 2.6251844970626074e-06, + "loss": 0.9017, + "step": 30401 + }, + { + "epoch": 6.485068259385666, + "grad_norm": 0.07320979028621234, + "learning_rate": 2.623022838012501e-06, + "loss": 0.882, + "step": 30402 + }, + { + "epoch": 6.485281569965871, + "grad_norm": 0.07895932500034329, + "learning_rate": 2.6208620544841924e-06, + "loss": 0.8734, + "step": 30403 + }, + { + "epoch": 6.485494880546075, + "grad_norm": 0.07328562531739823, + "learning_rate": 2.6187021465021232e-06, + "loss": 0.8723, + "step": 30404 + }, + { + "epoch": 6.48570819112628, + "grad_norm": 0.081965926998936, + "learning_rate": 2.616543114090746e-06, + "loss": 0.9021, + "step": 30405 + }, + { + "epoch": 6.485921501706485, + "grad_norm": 0.08083115438032899, + "learning_rate": 2.614384957274485e-06, + "loss": 0.897, + "step": 30406 + }, + { + "epoch": 6.486134812286689, + "grad_norm": 0.07745966424022967, + "learning_rate": 2.6122276760777654e-06, + "loss": 0.8667, + "step": 30407 + }, + { + "epoch": 6.486348122866894, + "grad_norm": 0.07640305206964343, + "learning_rate": 2.6100712705249855e-06, + "loss": 0.878, + "step": 30408 + }, + { + "epoch": 6.486561433447099, + "grad_norm": 0.07947973593443114, + "learning_rate": 2.6079157406405432e-06, + "loss": 0.8446, + "step": 30409 + }, + { + "epoch": 6.486774744027303, + "grad_norm": 0.07928396854847508, + "learning_rate": 2.605761086448846e-06, + "loss": 0.8829, + "step": 30410 + }, + { + "epoch": 6.486988054607509, + "grad_norm": 0.08107947787910334, + "learning_rate": 2.6036073079742652e-06, + "loss": 0.858, + "step": 30411 + }, + { + "epoch": 6.487201365187714, + "grad_norm": 0.07815199718733117, + "learning_rate": 2.601454405241164e-06, + "loss": 0.8629, + "step": 30412 + }, + { + "epoch": 6.487414675767918, + "grad_norm": 0.07835107136148307, + "learning_rate": 2.599302378273905e-06, + "loss": 0.8572, + "step": 30413 + }, + { + "epoch": 6.487627986348123, + "grad_norm": 0.07034287705640618, + "learning_rate": 2.5971512270968414e-06, + "loss": 0.8602, + "step": 30414 + }, + { + "epoch": 6.487841296928328, + "grad_norm": 0.07953129040181629, + "learning_rate": 2.595000951734328e-06, + "loss": 0.8722, + "step": 30415 + }, + { + "epoch": 6.488054607508532, + "grad_norm": 0.07425449937789987, + "learning_rate": 2.5928515522106645e-06, + "loss": 0.8492, + "step": 30416 + }, + { + "epoch": 6.488267918088737, + "grad_norm": 0.07584927762902964, + "learning_rate": 2.5907030285501966e-06, + "loss": 0.8631, + "step": 30417 + }, + { + "epoch": 6.488481228668942, + "grad_norm": 0.07748519043834541, + "learning_rate": 2.5885553807772247e-06, + "loss": 0.8783, + "step": 30418 + }, + { + "epoch": 6.488694539249146, + "grad_norm": 0.08032069337183498, + "learning_rate": 2.586408608916058e-06, + "loss": 0.8589, + "step": 30419 + }, + { + "epoch": 6.488907849829351, + "grad_norm": 0.08649104051678327, + "learning_rate": 2.5842627129909703e-06, + "loss": 0.8779, + "step": 30420 + }, + { + "epoch": 6.489121160409557, + "grad_norm": 0.08344450598941688, + "learning_rate": 2.582117693026245e-06, + "loss": 0.8719, + "step": 30421 + }, + { + "epoch": 6.489334470989761, + "grad_norm": 0.0790917098755433, + "learning_rate": 2.579973549046173e-06, + "loss": 0.9246, + "step": 30422 + }, + { + "epoch": 6.489547781569966, + "grad_norm": 0.0800838009361868, + "learning_rate": 2.5778302810750024e-06, + "loss": 0.8735, + "step": 30423 + }, + { + "epoch": 6.489761092150171, + "grad_norm": 0.08087092987907558, + "learning_rate": 2.57568788913698e-06, + "loss": 0.8616, + "step": 30424 + }, + { + "epoch": 6.489974402730375, + "grad_norm": 0.07794184214035595, + "learning_rate": 2.573546373256344e-06, + "loss": 0.8887, + "step": 30425 + }, + { + "epoch": 6.49018771331058, + "grad_norm": 0.07454955950212828, + "learning_rate": 2.571405733457342e-06, + "loss": 0.8916, + "step": 30426 + }, + { + "epoch": 6.490401023890785, + "grad_norm": 0.08145181855225643, + "learning_rate": 2.5692659697641853e-06, + "loss": 0.8932, + "step": 30427 + }, + { + "epoch": 6.4906143344709895, + "grad_norm": 0.07400191522210478, + "learning_rate": 2.567127082201104e-06, + "loss": 0.9262, + "step": 30428 + }, + { + "epoch": 6.490827645051194, + "grad_norm": 0.08037160777729306, + "learning_rate": 2.5649890707922566e-06, + "loss": 0.9025, + "step": 30429 + }, + { + "epoch": 6.4910409556314, + "grad_norm": 0.08285605858458636, + "learning_rate": 2.562851935561863e-06, + "loss": 0.8692, + "step": 30430 + }, + { + "epoch": 6.491254266211604, + "grad_norm": 0.08247433858496742, + "learning_rate": 2.560715676534118e-06, + "loss": 0.8529, + "step": 30431 + }, + { + "epoch": 6.491467576791809, + "grad_norm": 0.07844562194145702, + "learning_rate": 2.5585802937331616e-06, + "loss": 0.9061, + "step": 30432 + }, + { + "epoch": 6.491680887372014, + "grad_norm": 0.07710601214818384, + "learning_rate": 2.5564457871831704e-06, + "loss": 0.8669, + "step": 30433 + }, + { + "epoch": 6.4918941979522184, + "grad_norm": 0.08445735279823562, + "learning_rate": 2.554312156908312e-06, + "loss": 0.8873, + "step": 30434 + }, + { + "epoch": 6.492107508532423, + "grad_norm": 0.08387890828601284, + "learning_rate": 2.5521794029327086e-06, + "loss": 0.8647, + "step": 30435 + }, + { + "epoch": 6.492320819112628, + "grad_norm": 0.08771620990247682, + "learning_rate": 2.5500475252805014e-06, + "loss": 0.8645, + "step": 30436 + }, + { + "epoch": 6.4925341296928325, + "grad_norm": 0.07605522735763752, + "learning_rate": 2.547916523975795e-06, + "loss": 0.8728, + "step": 30437 + }, + { + "epoch": 6.492747440273037, + "grad_norm": 0.0790810767905451, + "learning_rate": 2.5457863990427224e-06, + "loss": 0.8889, + "step": 30438 + }, + { + "epoch": 6.492960750853243, + "grad_norm": 0.08179751874434521, + "learning_rate": 2.5436571505053963e-06, + "loss": 0.9085, + "step": 30439 + }, + { + "epoch": 6.493174061433447, + "grad_norm": 0.0835510742971854, + "learning_rate": 2.5415287783878786e-06, + "loss": 0.8911, + "step": 30440 + }, + { + "epoch": 6.493387372013652, + "grad_norm": 0.07875608374026292, + "learning_rate": 2.5394012827142554e-06, + "loss": 0.9225, + "step": 30441 + }, + { + "epoch": 6.493600682593857, + "grad_norm": 0.07479388292100823, + "learning_rate": 2.537274663508624e-06, + "loss": 0.8755, + "step": 30442 + }, + { + "epoch": 6.4938139931740615, + "grad_norm": 0.07487526141911821, + "learning_rate": 2.5351489207950273e-06, + "loss": 0.8766, + "step": 30443 + }, + { + "epoch": 6.494027303754266, + "grad_norm": 0.08032978722255939, + "learning_rate": 2.5330240545975347e-06, + "loss": 0.8746, + "step": 30444 + }, + { + "epoch": 6.494240614334471, + "grad_norm": 0.07670060282035618, + "learning_rate": 2.5309000649401536e-06, + "loss": 0.8598, + "step": 30445 + }, + { + "epoch": 6.4944539249146755, + "grad_norm": 0.07483441109125294, + "learning_rate": 2.528776951846963e-06, + "loss": 0.8935, + "step": 30446 + }, + { + "epoch": 6.49466723549488, + "grad_norm": 0.07214959561989284, + "learning_rate": 2.526654715341952e-06, + "loss": 0.8525, + "step": 30447 + }, + { + "epoch": 6.494880546075085, + "grad_norm": 0.08552348342607476, + "learning_rate": 2.5245333554491546e-06, + "loss": 0.8917, + "step": 30448 + }, + { + "epoch": 6.4950938566552905, + "grad_norm": 0.08133233390713523, + "learning_rate": 2.522412872192561e-06, + "loss": 0.9055, + "step": 30449 + }, + { + "epoch": 6.495307167235495, + "grad_norm": 0.07602544979069262, + "learning_rate": 2.520293265596179e-06, + "loss": 0.8849, + "step": 30450 + }, + { + "epoch": 6.4955204778157, + "grad_norm": 0.0762376011019796, + "learning_rate": 2.5181745356839706e-06, + "loss": 0.8867, + "step": 30451 + }, + { + "epoch": 6.4957337883959045, + "grad_norm": 0.07280189143641802, + "learning_rate": 2.516056682479926e-06, + "loss": 0.8688, + "step": 30452 + }, + { + "epoch": 6.495947098976109, + "grad_norm": 0.07803201002569694, + "learning_rate": 2.5139397060079905e-06, + "loss": 0.9017, + "step": 30453 + }, + { + "epoch": 6.496160409556314, + "grad_norm": 0.0752204227696119, + "learning_rate": 2.511823606292145e-06, + "loss": 0.8836, + "step": 30454 + }, + { + "epoch": 6.496373720136519, + "grad_norm": 0.08192263956880325, + "learning_rate": 2.5097083833563174e-06, + "loss": 0.8749, + "step": 30455 + }, + { + "epoch": 6.496587030716723, + "grad_norm": 0.07876442063906021, + "learning_rate": 2.5075940372244343e-06, + "loss": 0.9062, + "step": 30456 + }, + { + "epoch": 6.496800341296928, + "grad_norm": 0.0788846075668202, + "learning_rate": 2.505480567920433e-06, + "loss": 0.8984, + "step": 30457 + }, + { + "epoch": 6.497013651877133, + "grad_norm": 0.07832295097612078, + "learning_rate": 2.503367975468223e-06, + "loss": 0.8647, + "step": 30458 + }, + { + "epoch": 6.497226962457338, + "grad_norm": 0.07308887023226218, + "learning_rate": 2.5012562598917045e-06, + "loss": 0.8774, + "step": 30459 + }, + { + "epoch": 6.497440273037543, + "grad_norm": 0.08554694017686583, + "learning_rate": 2.499145421214779e-06, + "loss": 0.8837, + "step": 30460 + }, + { + "epoch": 6.497653583617748, + "grad_norm": 0.07932822278675793, + "learning_rate": 2.497035459461321e-06, + "loss": 0.8907, + "step": 30461 + }, + { + "epoch": 6.497866894197952, + "grad_norm": 0.08484504084264959, + "learning_rate": 2.494926374655222e-06, + "loss": 0.8858, + "step": 30462 + }, + { + "epoch": 6.498080204778157, + "grad_norm": 0.08034207901746608, + "learning_rate": 2.49281816682033e-06, + "loss": 0.884, + "step": 30463 + }, + { + "epoch": 6.498293515358362, + "grad_norm": 0.07933845955860766, + "learning_rate": 2.49071083598051e-06, + "loss": 0.8723, + "step": 30464 + }, + { + "epoch": 6.498506825938566, + "grad_norm": 0.07688122334682658, + "learning_rate": 2.488604382159583e-06, + "loss": 0.9065, + "step": 30465 + }, + { + "epoch": 6.498720136518771, + "grad_norm": 0.08094535299698528, + "learning_rate": 2.4864988053814143e-06, + "loss": 0.8729, + "step": 30466 + }, + { + "epoch": 6.498933447098976, + "grad_norm": 0.07609041736326749, + "learning_rate": 2.4843941056698163e-06, + "loss": 0.8838, + "step": 30467 + }, + { + "epoch": 6.499146757679181, + "grad_norm": 0.08235753932110827, + "learning_rate": 2.4822902830485917e-06, + "loss": 0.8855, + "step": 30468 + }, + { + "epoch": 6.499360068259386, + "grad_norm": 0.08355728528004576, + "learning_rate": 2.480187337541562e-06, + "loss": 0.8928, + "step": 30469 + }, + { + "epoch": 6.499573378839591, + "grad_norm": 0.07551957917421084, + "learning_rate": 2.478085269172521e-06, + "loss": 0.9091, + "step": 30470 + }, + { + "epoch": 6.499786689419795, + "grad_norm": 0.07548780825354592, + "learning_rate": 2.4759840779652455e-06, + "loss": 0.8719, + "step": 30471 + }, + { + "epoch": 6.5, + "grad_norm": 0.0816459327553714, + "learning_rate": 2.473883763943512e-06, + "loss": 0.8511, + "step": 30472 + }, + { + "epoch": 6.500213310580205, + "grad_norm": 0.08517289291466924, + "learning_rate": 2.471784327131097e-06, + "loss": 0.8835, + "step": 30473 + }, + { + "epoch": 6.500426621160409, + "grad_norm": 0.07962095393145405, + "learning_rate": 2.4696857675517416e-06, + "loss": 0.8855, + "step": 30474 + }, + { + "epoch": 6.500639931740614, + "grad_norm": 0.07630620331408464, + "learning_rate": 2.4675880852291954e-06, + "loss": 0.918, + "step": 30475 + }, + { + "epoch": 6.500853242320819, + "grad_norm": 0.07931190347164667, + "learning_rate": 2.465491280187191e-06, + "loss": 0.8655, + "step": 30476 + }, + { + "epoch": 6.501066552901024, + "grad_norm": 0.08215505138072295, + "learning_rate": 2.4633953524494514e-06, + "loss": 0.8776, + "step": 30477 + }, + { + "epoch": 6.501279863481229, + "grad_norm": 0.07930274242475623, + "learning_rate": 2.4613003020397086e-06, + "loss": 0.8597, + "step": 30478 + }, + { + "epoch": 6.501493174061434, + "grad_norm": 0.08553339876233304, + "learning_rate": 2.4592061289816505e-06, + "loss": 0.8917, + "step": 30479 + }, + { + "epoch": 6.501706484641638, + "grad_norm": 0.07771203838369764, + "learning_rate": 2.4571128332989823e-06, + "loss": 0.8754, + "step": 30480 + }, + { + "epoch": 6.501919795221843, + "grad_norm": 0.0754589196472451, + "learning_rate": 2.4550204150153832e-06, + "loss": 0.8699, + "step": 30481 + }, + { + "epoch": 6.502133105802048, + "grad_norm": 0.07720438766377517, + "learning_rate": 2.452928874154532e-06, + "loss": 0.9087, + "step": 30482 + }, + { + "epoch": 6.502346416382252, + "grad_norm": 0.08352320408209697, + "learning_rate": 2.4508382107401075e-06, + "loss": 0.8766, + "step": 30483 + }, + { + "epoch": 6.502559726962457, + "grad_norm": 0.08022554080263969, + "learning_rate": 2.4487484247957437e-06, + "loss": 0.9155, + "step": 30484 + }, + { + "epoch": 6.502773037542662, + "grad_norm": 0.08732758798875345, + "learning_rate": 2.4466595163450934e-06, + "loss": 0.8941, + "step": 30485 + }, + { + "epoch": 6.502986348122867, + "grad_norm": 0.09080390759638844, + "learning_rate": 2.4445714854117995e-06, + "loss": 0.8758, + "step": 30486 + }, + { + "epoch": 6.503199658703072, + "grad_norm": 0.07925407333562065, + "learning_rate": 2.442484332019488e-06, + "loss": 0.8582, + "step": 30487 + }, + { + "epoch": 6.503412969283277, + "grad_norm": 0.08052051055091045, + "learning_rate": 2.4403980561917663e-06, + "loss": 0.8784, + "step": 30488 + }, + { + "epoch": 6.503626279863481, + "grad_norm": 0.08677081803545679, + "learning_rate": 2.4383126579522333e-06, + "loss": 0.9114, + "step": 30489 + }, + { + "epoch": 6.503839590443686, + "grad_norm": 0.09286368502047926, + "learning_rate": 2.4362281373245144e-06, + "loss": 0.869, + "step": 30490 + }, + { + "epoch": 6.504052901023891, + "grad_norm": 0.0794457988172354, + "learning_rate": 2.4341444943321737e-06, + "loss": 0.8752, + "step": 30491 + }, + { + "epoch": 6.5042662116040955, + "grad_norm": 0.0809995674625732, + "learning_rate": 2.4320617289987912e-06, + "loss": 0.8892, + "step": 30492 + }, + { + "epoch": 6.5044795221843, + "grad_norm": 0.07850575705146681, + "learning_rate": 2.4299798413479316e-06, + "loss": 0.9011, + "step": 30493 + }, + { + "epoch": 6.504692832764505, + "grad_norm": 0.08685531842272295, + "learning_rate": 2.427898831403157e-06, + "loss": 0.8892, + "step": 30494 + }, + { + "epoch": 6.5049061433447095, + "grad_norm": 0.08662328998697823, + "learning_rate": 2.425818699188023e-06, + "loss": 0.8898, + "step": 30495 + }, + { + "epoch": 6.505119453924914, + "grad_norm": 0.08527429516384312, + "learning_rate": 2.4237394447260386e-06, + "loss": 0.9042, + "step": 30496 + }, + { + "epoch": 6.50533276450512, + "grad_norm": 0.0751951565696061, + "learning_rate": 2.4216610680407595e-06, + "loss": 0.8779, + "step": 30497 + }, + { + "epoch": 6.5055460750853245, + "grad_norm": 0.07672511554571965, + "learning_rate": 2.4195835691556856e-06, + "loss": 0.8513, + "step": 30498 + }, + { + "epoch": 6.505759385665529, + "grad_norm": 0.07828587640110285, + "learning_rate": 2.417506948094328e-06, + "loss": 0.8862, + "step": 30499 + }, + { + "epoch": 6.505972696245734, + "grad_norm": 0.08806781683784375, + "learning_rate": 2.4154312048801874e-06, + "loss": 0.8654, + "step": 30500 + }, + { + "epoch": 6.5061860068259385, + "grad_norm": 0.08114971623905659, + "learning_rate": 2.41335633953673e-06, + "loss": 0.9073, + "step": 30501 + }, + { + "epoch": 6.506399317406143, + "grad_norm": 0.08235493650323975, + "learning_rate": 2.411282352087474e-06, + "loss": 0.8809, + "step": 30502 + }, + { + "epoch": 6.506612627986348, + "grad_norm": 0.07369305707334721, + "learning_rate": 2.40920924255585e-06, + "loss": 0.8714, + "step": 30503 + }, + { + "epoch": 6.506825938566553, + "grad_norm": 0.07932399335852915, + "learning_rate": 2.4071370109653325e-06, + "loss": 0.8652, + "step": 30504 + }, + { + "epoch": 6.507039249146757, + "grad_norm": 0.0816663957150126, + "learning_rate": 2.4050656573393607e-06, + "loss": 0.8744, + "step": 30505 + }, + { + "epoch": 6.507252559726963, + "grad_norm": 0.08031552519374065, + "learning_rate": 2.4029951817013906e-06, + "loss": 0.8721, + "step": 30506 + }, + { + "epoch": 6.5074658703071675, + "grad_norm": 0.07591886007437221, + "learning_rate": 2.4009255840748267e-06, + "loss": 0.8969, + "step": 30507 + }, + { + "epoch": 6.507679180887372, + "grad_norm": 0.07552227912478898, + "learning_rate": 2.3988568644830902e-06, + "loss": 0.881, + "step": 30508 + }, + { + "epoch": 6.507892491467577, + "grad_norm": 0.0862631153921773, + "learning_rate": 2.396789022949593e-06, + "loss": 0.8736, + "step": 30509 + }, + { + "epoch": 6.5081058020477816, + "grad_norm": 0.07933426672604592, + "learning_rate": 2.3947220594977473e-06, + "loss": 0.8812, + "step": 30510 + }, + { + "epoch": 6.508319112627986, + "grad_norm": 0.08299670658520049, + "learning_rate": 2.3926559741509126e-06, + "loss": 0.9143, + "step": 30511 + }, + { + "epoch": 6.508532423208191, + "grad_norm": 0.07531993560635966, + "learning_rate": 2.3905907669324924e-06, + "loss": 0.8531, + "step": 30512 + }, + { + "epoch": 6.508745733788396, + "grad_norm": 0.0762069107674483, + "learning_rate": 2.388526437865828e-06, + "loss": 0.894, + "step": 30513 + }, + { + "epoch": 6.5089590443686, + "grad_norm": 0.08095846463872848, + "learning_rate": 2.386462986974296e-06, + "loss": 0.8965, + "step": 30514 + }, + { + "epoch": 6.509172354948806, + "grad_norm": 0.08206266555817517, + "learning_rate": 2.384400414281247e-06, + "loss": 0.892, + "step": 30515 + }, + { + "epoch": 6.5093856655290105, + "grad_norm": 0.0774890890060145, + "learning_rate": 2.3823387198100134e-06, + "loss": 0.8525, + "step": 30516 + }, + { + "epoch": 6.509598976109215, + "grad_norm": 0.07319051817047686, + "learning_rate": 2.3802779035839186e-06, + "loss": 0.867, + "step": 30517 + }, + { + "epoch": 6.50981228668942, + "grad_norm": 0.08189788676379367, + "learning_rate": 2.378217965626295e-06, + "loss": 0.9058, + "step": 30518 + }, + { + "epoch": 6.510025597269625, + "grad_norm": 0.07652133575689428, + "learning_rate": 2.3761589059604305e-06, + "loss": 0.8786, + "step": 30519 + }, + { + "epoch": 6.510238907849829, + "grad_norm": 0.07504081834878075, + "learning_rate": 2.3741007246096316e-06, + "loss": 0.9037, + "step": 30520 + }, + { + "epoch": 6.510452218430034, + "grad_norm": 0.08353493957350187, + "learning_rate": 2.3720434215971944e-06, + "loss": 0.909, + "step": 30521 + }, + { + "epoch": 6.510665529010239, + "grad_norm": 0.07760715868198995, + "learning_rate": 2.3699869969463895e-06, + "loss": 0.8753, + "step": 30522 + }, + { + "epoch": 6.510878839590443, + "grad_norm": 0.07663217109298154, + "learning_rate": 2.3679314506804874e-06, + "loss": 0.8939, + "step": 30523 + }, + { + "epoch": 6.511092150170649, + "grad_norm": 0.08495444507266663, + "learning_rate": 2.3658767828227493e-06, + "loss": 0.9112, + "step": 30524 + }, + { + "epoch": 6.511305460750854, + "grad_norm": 0.07507489656083284, + "learning_rate": 2.3638229933964097e-06, + "loss": 0.8501, + "step": 30525 + }, + { + "epoch": 6.511518771331058, + "grad_norm": 0.07761486067334618, + "learning_rate": 2.3617700824247304e-06, + "loss": 0.8748, + "step": 30526 + }, + { + "epoch": 6.511732081911263, + "grad_norm": 0.08143740597133163, + "learning_rate": 2.359718049930928e-06, + "loss": 0.8599, + "step": 30527 + }, + { + "epoch": 6.511945392491468, + "grad_norm": 0.07929329601886148, + "learning_rate": 2.35766689593822e-06, + "loss": 0.8917, + "step": 30528 + }, + { + "epoch": 6.512158703071672, + "grad_norm": 0.0772756375008318, + "learning_rate": 2.3556166204698226e-06, + "loss": 0.9219, + "step": 30529 + }, + { + "epoch": 6.512372013651877, + "grad_norm": 0.07953755735489743, + "learning_rate": 2.353567223548927e-06, + "loss": 0.8816, + "step": 30530 + }, + { + "epoch": 6.512585324232082, + "grad_norm": 0.10936622954865242, + "learning_rate": 2.351518705198723e-06, + "loss": 0.8769, + "step": 30531 + }, + { + "epoch": 6.512798634812286, + "grad_norm": 0.07910606463902224, + "learning_rate": 2.3494710654423745e-06, + "loss": 0.8681, + "step": 30532 + }, + { + "epoch": 6.513011945392492, + "grad_norm": 0.0909387334642616, + "learning_rate": 2.34742430430309e-06, + "loss": 0.8767, + "step": 30533 + }, + { + "epoch": 6.513225255972696, + "grad_norm": 0.07917954212486433, + "learning_rate": 2.3453784218039965e-06, + "loss": 0.868, + "step": 30534 + }, + { + "epoch": 6.513438566552901, + "grad_norm": 0.07205162964910469, + "learning_rate": 2.3433334179682498e-06, + "loss": 0.8532, + "step": 30535 + }, + { + "epoch": 6.513651877133106, + "grad_norm": 0.08718042124270427, + "learning_rate": 2.3412892928189958e-06, + "loss": 0.9084, + "step": 30536 + }, + { + "epoch": 6.513865187713311, + "grad_norm": 0.07901792951833114, + "learning_rate": 2.339246046379353e-06, + "loss": 0.8753, + "step": 30537 + }, + { + "epoch": 6.514078498293515, + "grad_norm": 0.07610683958342039, + "learning_rate": 2.3372036786724593e-06, + "loss": 0.8433, + "step": 30538 + }, + { + "epoch": 6.51429180887372, + "grad_norm": 0.0749639235392692, + "learning_rate": 2.335162189721407e-06, + "loss": 0.8689, + "step": 30539 + }, + { + "epoch": 6.514505119453925, + "grad_norm": 0.0812795511972874, + "learning_rate": 2.3331215795492977e-06, + "loss": 0.8817, + "step": 30540 + }, + { + "epoch": 6.514718430034129, + "grad_norm": 0.07592806228859791, + "learning_rate": 2.331081848179233e-06, + "loss": 0.869, + "step": 30541 + }, + { + "epoch": 6.514931740614334, + "grad_norm": 0.07372677592743115, + "learning_rate": 2.3290429956342785e-06, + "loss": 0.8627, + "step": 30542 + }, + { + "epoch": 6.515145051194539, + "grad_norm": 0.07816715931324983, + "learning_rate": 2.3270050219375005e-06, + "loss": 0.8855, + "step": 30543 + }, + { + "epoch": 6.515358361774744, + "grad_norm": 0.07639330203704683, + "learning_rate": 2.324967927111974e-06, + "loss": 0.8786, + "step": 30544 + }, + { + "epoch": 6.515571672354949, + "grad_norm": 0.07750817309067969, + "learning_rate": 2.322931711180738e-06, + "loss": 0.8975, + "step": 30545 + }, + { + "epoch": 6.515784982935154, + "grad_norm": 0.07660944648597573, + "learning_rate": 2.320896374166841e-06, + "loss": 0.853, + "step": 30546 + }, + { + "epoch": 6.515998293515358, + "grad_norm": 0.07534727520564205, + "learning_rate": 2.3188619160933137e-06, + "loss": 0.8883, + "step": 30547 + }, + { + "epoch": 6.516211604095563, + "grad_norm": 0.08269717829654698, + "learning_rate": 2.316828336983168e-06, + "loss": 0.8861, + "step": 30548 + }, + { + "epoch": 6.516424914675768, + "grad_norm": 0.0786684646681526, + "learning_rate": 2.3147956368593993e-06, + "loss": 0.858, + "step": 30549 + }, + { + "epoch": 6.5166382252559725, + "grad_norm": 0.07759281350723533, + "learning_rate": 2.312763815745038e-06, + "loss": 0.8545, + "step": 30550 + }, + { + "epoch": 6.516851535836177, + "grad_norm": 0.07448001391410965, + "learning_rate": 2.310732873663071e-06, + "loss": 0.8515, + "step": 30551 + }, + { + "epoch": 6.517064846416382, + "grad_norm": 0.07985523393472108, + "learning_rate": 2.308702810636456e-06, + "loss": 0.8895, + "step": 30552 + }, + { + "epoch": 6.517278156996587, + "grad_norm": 0.08114024523422803, + "learning_rate": 2.3066736266881805e-06, + "loss": 0.8593, + "step": 30553 + }, + { + "epoch": 6.517491467576792, + "grad_norm": 0.08652570044350776, + "learning_rate": 2.3046453218411947e-06, + "loss": 0.8921, + "step": 30554 + }, + { + "epoch": 6.517704778156997, + "grad_norm": 0.07574150074316119, + "learning_rate": 2.3026178961184575e-06, + "loss": 0.8933, + "step": 30555 + }, + { + "epoch": 6.5179180887372015, + "grad_norm": 0.07754808561180425, + "learning_rate": 2.3005913495428934e-06, + "loss": 0.8613, + "step": 30556 + }, + { + "epoch": 6.518131399317406, + "grad_norm": 0.07917794978355731, + "learning_rate": 2.2985656821374613e-06, + "loss": 0.9092, + "step": 30557 + }, + { + "epoch": 6.518344709897611, + "grad_norm": 0.0783859903007939, + "learning_rate": 2.296540893925059e-06, + "loss": 0.8993, + "step": 30558 + }, + { + "epoch": 6.5185580204778155, + "grad_norm": 0.08247234781750774, + "learning_rate": 2.29451698492861e-06, + "loss": 0.867, + "step": 30559 + }, + { + "epoch": 6.51877133105802, + "grad_norm": 0.07867293080653215, + "learning_rate": 2.2924939551710023e-06, + "loss": 0.8942, + "step": 30560 + }, + { + "epoch": 6.518984641638225, + "grad_norm": 0.0857655364195055, + "learning_rate": 2.290471804675125e-06, + "loss": 0.8882, + "step": 30561 + }, + { + "epoch": 6.5191979522184305, + "grad_norm": 0.08276124045502703, + "learning_rate": 2.288450533463884e-06, + "loss": 0.8998, + "step": 30562 + }, + { + "epoch": 6.519411262798635, + "grad_norm": 0.07998787975419039, + "learning_rate": 2.286430141560123e-06, + "loss": 0.8947, + "step": 30563 + }, + { + "epoch": 6.51962457337884, + "grad_norm": 0.08974048803935819, + "learning_rate": 2.284410628986704e-06, + "loss": 0.9039, + "step": 30564 + }, + { + "epoch": 6.5198378839590445, + "grad_norm": 0.08478625692489095, + "learning_rate": 2.2823919957664973e-06, + "loss": 0.8849, + "step": 30565 + }, + { + "epoch": 6.520051194539249, + "grad_norm": 0.07432761473520083, + "learning_rate": 2.28037424192233e-06, + "loss": 0.9144, + "step": 30566 + }, + { + "epoch": 6.520264505119454, + "grad_norm": 0.08785755444866872, + "learning_rate": 2.2783573674770355e-06, + "loss": 0.8799, + "step": 30567 + }, + { + "epoch": 6.520477815699659, + "grad_norm": 0.09209771081340926, + "learning_rate": 2.2763413724534323e-06, + "loss": 0.8783, + "step": 30568 + }, + { + "epoch": 6.520691126279863, + "grad_norm": 0.08601647033801571, + "learning_rate": 2.2743262568743285e-06, + "loss": 0.8632, + "step": 30569 + }, + { + "epoch": 6.520904436860068, + "grad_norm": 0.07912863008317934, + "learning_rate": 2.2723120207625416e-06, + "loss": 0.8602, + "step": 30570 + }, + { + "epoch": 6.5211177474402735, + "grad_norm": 0.08051682575175063, + "learning_rate": 2.270298664140853e-06, + "loss": 0.8903, + "step": 30571 + }, + { + "epoch": 6.521331058020478, + "grad_norm": 0.07929020085747919, + "learning_rate": 2.268286187032036e-06, + "loss": 0.8896, + "step": 30572 + }, + { + "epoch": 6.521544368600683, + "grad_norm": 0.08101260908174009, + "learning_rate": 2.2662745894588723e-06, + "loss": 0.8682, + "step": 30573 + }, + { + "epoch": 6.521757679180888, + "grad_norm": 0.0823491438561964, + "learning_rate": 2.2642638714441167e-06, + "loss": 0.8861, + "step": 30574 + }, + { + "epoch": 6.521970989761092, + "grad_norm": 0.07457311114238568, + "learning_rate": 2.262254033010516e-06, + "loss": 0.8712, + "step": 30575 + }, + { + "epoch": 6.522184300341297, + "grad_norm": 0.07446353764678276, + "learning_rate": 2.260245074180825e-06, + "loss": 0.8701, + "step": 30576 + }, + { + "epoch": 6.522397610921502, + "grad_norm": 0.08002846911859618, + "learning_rate": 2.2582369949777627e-06, + "loss": 0.8645, + "step": 30577 + }, + { + "epoch": 6.522610921501706, + "grad_norm": 0.07843829771336741, + "learning_rate": 2.2562297954240673e-06, + "loss": 0.8519, + "step": 30578 + }, + { + "epoch": 6.522824232081911, + "grad_norm": 0.08341529181948612, + "learning_rate": 2.254223475542432e-06, + "loss": 0.8634, + "step": 30579 + }, + { + "epoch": 6.523037542662116, + "grad_norm": 0.0762152583182414, + "learning_rate": 2.252218035355558e-06, + "loss": 0.89, + "step": 30580 + }, + { + "epoch": 6.52325085324232, + "grad_norm": 0.07418613869911261, + "learning_rate": 2.2502134748861557e-06, + "loss": 0.8928, + "step": 30581 + }, + { + "epoch": 6.523464163822526, + "grad_norm": 0.07344095476299446, + "learning_rate": 2.2482097941569015e-06, + "loss": 0.8702, + "step": 30582 + }, + { + "epoch": 6.523677474402731, + "grad_norm": 0.07665408863690946, + "learning_rate": 2.2462069931904516e-06, + "loss": 0.8618, + "step": 30583 + }, + { + "epoch": 6.523890784982935, + "grad_norm": 0.0813274786908662, + "learning_rate": 2.244205072009482e-06, + "loss": 0.8652, + "step": 30584 + }, + { + "epoch": 6.52410409556314, + "grad_norm": 0.07618399665196646, + "learning_rate": 2.242204030636641e-06, + "loss": 0.875, + "step": 30585 + }, + { + "epoch": 6.524317406143345, + "grad_norm": 0.07581229770867276, + "learning_rate": 2.2402038690945684e-06, + "loss": 0.8714, + "step": 30586 + }, + { + "epoch": 6.524530716723549, + "grad_norm": 0.0804889453438392, + "learning_rate": 2.2382045874058946e-06, + "loss": 0.8979, + "step": 30587 + }, + { + "epoch": 6.524744027303754, + "grad_norm": 0.07625303462781897, + "learning_rate": 2.236206185593233e-06, + "loss": 0.884, + "step": 30588 + }, + { + "epoch": 6.524957337883959, + "grad_norm": 0.07922668409645013, + "learning_rate": 2.2342086636792225e-06, + "loss": 0.8616, + "step": 30589 + }, + { + "epoch": 6.525170648464163, + "grad_norm": 0.07756133602745709, + "learning_rate": 2.2322120216864417e-06, + "loss": 0.8455, + "step": 30590 + }, + { + "epoch": 6.525383959044369, + "grad_norm": 0.07323105555226314, + "learning_rate": 2.2302162596374944e-06, + "loss": 0.8677, + "step": 30591 + }, + { + "epoch": 6.525597269624574, + "grad_norm": 0.07981210704469087, + "learning_rate": 2.228221377554949e-06, + "loss": 0.8826, + "step": 30592 + }, + { + "epoch": 6.525810580204778, + "grad_norm": 0.073888832557339, + "learning_rate": 2.2262273754614007e-06, + "loss": 0.8895, + "step": 30593 + }, + { + "epoch": 6.526023890784983, + "grad_norm": 0.07255956665125407, + "learning_rate": 2.2242342533793914e-06, + "loss": 0.8725, + "step": 30594 + }, + { + "epoch": 6.526237201365188, + "grad_norm": 0.08049412742191542, + "learning_rate": 2.2222420113314814e-06, + "loss": 0.8862, + "step": 30595 + }, + { + "epoch": 6.526450511945392, + "grad_norm": 0.08409861575272166, + "learning_rate": 2.2202506493402033e-06, + "loss": 0.9047, + "step": 30596 + }, + { + "epoch": 6.526663822525597, + "grad_norm": 0.07820526196893127, + "learning_rate": 2.218260167428108e-06, + "loss": 0.8981, + "step": 30597 + }, + { + "epoch": 6.526877133105802, + "grad_norm": 0.07821319446311485, + "learning_rate": 2.216270565617702e-06, + "loss": 0.8994, + "step": 30598 + }, + { + "epoch": 6.527090443686006, + "grad_norm": 0.07451367144614739, + "learning_rate": 2.2142818439315097e-06, + "loss": 0.8778, + "step": 30599 + }, + { + "epoch": 6.527303754266212, + "grad_norm": 0.08400586595359172, + "learning_rate": 2.2122940023920103e-06, + "loss": 0.8886, + "step": 30600 + }, + { + "epoch": 6.527517064846417, + "grad_norm": 0.07784301329594731, + "learning_rate": 2.21030704102172e-06, + "loss": 0.8743, + "step": 30601 + }, + { + "epoch": 6.527730375426621, + "grad_norm": 0.07964267654189294, + "learning_rate": 2.2083209598431175e-06, + "loss": 0.8766, + "step": 30602 + }, + { + "epoch": 6.527943686006826, + "grad_norm": 0.08392946015666837, + "learning_rate": 2.206335758878675e-06, + "loss": 0.8732, + "step": 30603 + }, + { + "epoch": 6.528156996587031, + "grad_norm": 0.07619856420108648, + "learning_rate": 2.2043514381508445e-06, + "loss": 0.9161, + "step": 30604 + }, + { + "epoch": 6.528370307167235, + "grad_norm": 0.0793671982242182, + "learning_rate": 2.2023679976820887e-06, + "loss": 0.8836, + "step": 30605 + }, + { + "epoch": 6.52858361774744, + "grad_norm": 0.07979782014637284, + "learning_rate": 2.200385437494852e-06, + "loss": 0.8923, + "step": 30606 + }, + { + "epoch": 6.528796928327645, + "grad_norm": 0.08155421302221957, + "learning_rate": 2.198403757611569e-06, + "loss": 0.8614, + "step": 30607 + }, + { + "epoch": 6.5290102389078495, + "grad_norm": 0.07856244416809678, + "learning_rate": 2.1964229580546403e-06, + "loss": 0.8934, + "step": 30608 + }, + { + "epoch": 6.529223549488055, + "grad_norm": 0.08164242130746226, + "learning_rate": 2.194443038846501e-06, + "loss": 0.8822, + "step": 30609 + }, + { + "epoch": 6.52943686006826, + "grad_norm": 0.07678723638961724, + "learning_rate": 2.192464000009542e-06, + "loss": 0.8904, + "step": 30610 + }, + { + "epoch": 6.529650170648464, + "grad_norm": 0.08562657920707799, + "learning_rate": 2.1904858415661723e-06, + "loss": 0.8826, + "step": 30611 + }, + { + "epoch": 6.529863481228669, + "grad_norm": 0.07850804518141394, + "learning_rate": 2.188508563538747e-06, + "loss": 0.8777, + "step": 30612 + }, + { + "epoch": 6.530076791808874, + "grad_norm": 0.07977627641878035, + "learning_rate": 2.1865321659496754e-06, + "loss": 0.8728, + "step": 30613 + }, + { + "epoch": 6.5302901023890785, + "grad_norm": 0.08223859337796911, + "learning_rate": 2.1845566488212942e-06, + "loss": 0.8815, + "step": 30614 + }, + { + "epoch": 6.530503412969283, + "grad_norm": 0.08665864972138823, + "learning_rate": 2.18258201217596e-06, + "loss": 0.9134, + "step": 30615 + }, + { + "epoch": 6.530716723549488, + "grad_norm": 0.07238326477129223, + "learning_rate": 2.1806082560360186e-06, + "loss": 0.8929, + "step": 30616 + }, + { + "epoch": 6.5309300341296925, + "grad_norm": 0.0746890726433525, + "learning_rate": 2.178635380423808e-06, + "loss": 0.8542, + "step": 30617 + }, + { + "epoch": 6.531143344709898, + "grad_norm": 0.07504039325280935, + "learning_rate": 2.1766633853616482e-06, + "loss": 0.8712, + "step": 30618 + }, + { + "epoch": 6.531356655290102, + "grad_norm": 0.08366727814860621, + "learning_rate": 2.17469227087185e-06, + "loss": 0.8993, + "step": 30619 + }, + { + "epoch": 6.5315699658703075, + "grad_norm": 0.08383149111348598, + "learning_rate": 2.1727220369767157e-06, + "loss": 0.8938, + "step": 30620 + }, + { + "epoch": 6.531783276450512, + "grad_norm": 0.07975374921574106, + "learning_rate": 2.1707526836985384e-06, + "loss": 0.8864, + "step": 30621 + }, + { + "epoch": 6.531996587030717, + "grad_norm": 0.07998642562720959, + "learning_rate": 2.1687842110596115e-06, + "loss": 0.8769, + "step": 30622 + }, + { + "epoch": 6.5322098976109215, + "grad_norm": 0.08807432804078433, + "learning_rate": 2.1668166190821927e-06, + "loss": 0.8938, + "step": 30623 + }, + { + "epoch": 6.532423208191126, + "grad_norm": 0.07815238781514015, + "learning_rate": 2.1648499077885486e-06, + "loss": 0.884, + "step": 30624 + }, + { + "epoch": 6.532636518771331, + "grad_norm": 0.07882776535246475, + "learning_rate": 2.162884077200946e-06, + "loss": 0.8631, + "step": 30625 + }, + { + "epoch": 6.532849829351536, + "grad_norm": 0.07444324845376694, + "learning_rate": 2.1609191273416253e-06, + "loss": 0.8847, + "step": 30626 + }, + { + "epoch": 6.53306313993174, + "grad_norm": 0.07802254484099706, + "learning_rate": 2.1589550582328077e-06, + "loss": 0.8944, + "step": 30627 + }, + { + "epoch": 6.533276450511945, + "grad_norm": 0.07955335776993692, + "learning_rate": 2.1569918698967075e-06, + "loss": 0.8832, + "step": 30628 + }, + { + "epoch": 6.5334897610921505, + "grad_norm": 0.07837113295973856, + "learning_rate": 2.155029562355582e-06, + "loss": 0.8908, + "step": 30629 + }, + { + "epoch": 6.533703071672355, + "grad_norm": 0.08351738974247895, + "learning_rate": 2.153068135631591e-06, + "loss": 0.8842, + "step": 30630 + }, + { + "epoch": 6.53391638225256, + "grad_norm": 0.07616970514299912, + "learning_rate": 2.151107589746948e-06, + "loss": 0.9143, + "step": 30631 + }, + { + "epoch": 6.534129692832765, + "grad_norm": 0.07539612812214341, + "learning_rate": 2.1491479247238224e-06, + "loss": 0.875, + "step": 30632 + }, + { + "epoch": 6.534343003412969, + "grad_norm": 0.07701940945510014, + "learning_rate": 2.1471891405844e-06, + "loss": 0.8701, + "step": 30633 + }, + { + "epoch": 6.534556313993174, + "grad_norm": 0.08107253008808236, + "learning_rate": 2.145231237350851e-06, + "loss": 0.8728, + "step": 30634 + }, + { + "epoch": 6.534769624573379, + "grad_norm": 0.08056761909265797, + "learning_rate": 2.1432742150453167e-06, + "loss": 0.8786, + "step": 30635 + }, + { + "epoch": 6.534982935153583, + "grad_norm": 0.08098805742566263, + "learning_rate": 2.1413180736899307e-06, + "loss": 0.8895, + "step": 30636 + }, + { + "epoch": 6.535196245733788, + "grad_norm": 0.07898737482705681, + "learning_rate": 2.1393628133068533e-06, + "loss": 0.8816, + "step": 30637 + }, + { + "epoch": 6.535409556313994, + "grad_norm": 0.07723691624282565, + "learning_rate": 2.1374084339182e-06, + "loss": 0.8723, + "step": 30638 + }, + { + "epoch": 6.535622866894198, + "grad_norm": 0.0719814324200458, + "learning_rate": 2.1354549355460684e-06, + "loss": 0.859, + "step": 30639 + }, + { + "epoch": 6.535836177474403, + "grad_norm": 0.0763331678988202, + "learning_rate": 2.1335023182125745e-06, + "loss": 0.8603, + "step": 30640 + }, + { + "epoch": 6.536049488054608, + "grad_norm": 0.08392334366780677, + "learning_rate": 2.1315505819398252e-06, + "loss": 0.89, + "step": 30641 + }, + { + "epoch": 6.536262798634812, + "grad_norm": 0.08250662793127843, + "learning_rate": 2.1295997267498825e-06, + "loss": 0.8873, + "step": 30642 + }, + { + "epoch": 6.536476109215017, + "grad_norm": 0.0773628730415536, + "learning_rate": 2.1276497526648355e-06, + "loss": 0.8664, + "step": 30643 + }, + { + "epoch": 6.536689419795222, + "grad_norm": 0.08265629041660048, + "learning_rate": 2.1257006597067285e-06, + "loss": 0.8939, + "step": 30644 + }, + { + "epoch": 6.536902730375426, + "grad_norm": 0.07834650408779177, + "learning_rate": 2.1237524478976333e-06, + "loss": 0.9163, + "step": 30645 + }, + { + "epoch": 6.537116040955631, + "grad_norm": 0.08284931342573668, + "learning_rate": 2.1218051172595855e-06, + "loss": 0.869, + "step": 30646 + }, + { + "epoch": 6.537329351535837, + "grad_norm": 0.08291910496808724, + "learning_rate": 2.119858667814629e-06, + "loss": 0.8797, + "step": 30647 + }, + { + "epoch": 6.537542662116041, + "grad_norm": 0.07795045297934373, + "learning_rate": 2.117913099584774e-06, + "loss": 0.8764, + "step": 30648 + }, + { + "epoch": 6.537755972696246, + "grad_norm": 0.07587792868646764, + "learning_rate": 2.1159684125920556e-06, + "loss": 0.9255, + "step": 30649 + }, + { + "epoch": 6.537969283276451, + "grad_norm": 0.08293414861740973, + "learning_rate": 2.1140246068584557e-06, + "loss": 0.9045, + "step": 30650 + }, + { + "epoch": 6.538182593856655, + "grad_norm": 0.0781356037714294, + "learning_rate": 2.112081682405975e-06, + "loss": 0.878, + "step": 30651 + }, + { + "epoch": 6.53839590443686, + "grad_norm": 0.08381663281331266, + "learning_rate": 2.1101396392566055e-06, + "loss": 0.8897, + "step": 30652 + }, + { + "epoch": 6.538609215017065, + "grad_norm": 0.08308416161829942, + "learning_rate": 2.1081984774323196e-06, + "loss": 0.9314, + "step": 30653 + }, + { + "epoch": 6.538822525597269, + "grad_norm": 0.09000502275837742, + "learning_rate": 2.1062581969550733e-06, + "loss": 0.8841, + "step": 30654 + }, + { + "epoch": 6.539035836177474, + "grad_norm": 0.08169451781702626, + "learning_rate": 2.1043187978468227e-06, + "loss": 0.8498, + "step": 30655 + }, + { + "epoch": 6.53924914675768, + "grad_norm": 0.07839518117971185, + "learning_rate": 2.1023802801295147e-06, + "loss": 0.8888, + "step": 30656 + }, + { + "epoch": 6.539462457337884, + "grad_norm": 0.07424256529029302, + "learning_rate": 2.1004426438250868e-06, + "loss": 0.8724, + "step": 30657 + }, + { + "epoch": 6.539675767918089, + "grad_norm": 0.0759543745457323, + "learning_rate": 2.098505888955469e-06, + "loss": 0.8696, + "step": 30658 + }, + { + "epoch": 6.539889078498294, + "grad_norm": 0.08648181630906915, + "learning_rate": 2.096570015542554e-06, + "loss": 0.8917, + "step": 30659 + }, + { + "epoch": 6.540102389078498, + "grad_norm": 0.08773858842798764, + "learning_rate": 2.0946350236082623e-06, + "loss": 0.8825, + "step": 30660 + }, + { + "epoch": 6.540315699658703, + "grad_norm": 0.07399288552582704, + "learning_rate": 2.092700913174488e-06, + "loss": 0.8567, + "step": 30661 + }, + { + "epoch": 6.540529010238908, + "grad_norm": 0.07862541643502659, + "learning_rate": 2.090767684263115e-06, + "loss": 0.866, + "step": 30662 + }, + { + "epoch": 6.540742320819112, + "grad_norm": 0.07736262725430125, + "learning_rate": 2.0888353368960292e-06, + "loss": 0.9017, + "step": 30663 + }, + { + "epoch": 6.540955631399317, + "grad_norm": 0.0785791881282699, + "learning_rate": 2.086903871095061e-06, + "loss": 0.8815, + "step": 30664 + }, + { + "epoch": 6.541168941979522, + "grad_norm": 0.07227886384671557, + "learning_rate": 2.084973286882086e-06, + "loss": 0.8733, + "step": 30665 + }, + { + "epoch": 6.5413822525597265, + "grad_norm": 0.07977669171134404, + "learning_rate": 2.083043584278963e-06, + "loss": 0.8725, + "step": 30666 + }, + { + "epoch": 6.541595563139932, + "grad_norm": 0.08478120684207405, + "learning_rate": 2.0811147633075057e-06, + "loss": 0.8803, + "step": 30667 + }, + { + "epoch": 6.541808873720137, + "grad_norm": 0.08869194389058656, + "learning_rate": 2.079186823989536e-06, + "loss": 0.8829, + "step": 30668 + }, + { + "epoch": 6.542022184300341, + "grad_norm": 0.08036581196793237, + "learning_rate": 2.077259766346886e-06, + "loss": 0.8835, + "step": 30669 + }, + { + "epoch": 6.542235494880546, + "grad_norm": 0.07726231429229845, + "learning_rate": 2.0753335904013606e-06, + "loss": 0.8867, + "step": 30670 + }, + { + "epoch": 6.542448805460751, + "grad_norm": 0.07726184305445721, + "learning_rate": 2.073408296174737e-06, + "loss": 0.8748, + "step": 30671 + }, + { + "epoch": 6.5426621160409555, + "grad_norm": 0.07774997723032996, + "learning_rate": 2.071483883688803e-06, + "loss": 0.8681, + "step": 30672 + }, + { + "epoch": 6.54287542662116, + "grad_norm": 0.08125888897964202, + "learning_rate": 2.069560352965354e-06, + "loss": 0.882, + "step": 30673 + }, + { + "epoch": 6.543088737201365, + "grad_norm": 0.08008948753343872, + "learning_rate": 2.067637704026142e-06, + "loss": 0.8909, + "step": 30674 + }, + { + "epoch": 6.5433020477815695, + "grad_norm": 0.08131678771186628, + "learning_rate": 2.0657159368929093e-06, + "loss": 0.8822, + "step": 30675 + }, + { + "epoch": 6.543515358361775, + "grad_norm": 0.07608397314635323, + "learning_rate": 2.063795051587407e-06, + "loss": 0.8861, + "step": 30676 + }, + { + "epoch": 6.54372866894198, + "grad_norm": 0.07542670283406096, + "learning_rate": 2.0618750481313877e-06, + "loss": 0.8879, + "step": 30677 + }, + { + "epoch": 6.5439419795221845, + "grad_norm": 0.08246769926679971, + "learning_rate": 2.0599559265465575e-06, + "loss": 0.8799, + "step": 30678 + }, + { + "epoch": 6.544155290102389, + "grad_norm": 0.08579743751549676, + "learning_rate": 2.0580376868546416e-06, + "loss": 0.8843, + "step": 30679 + }, + { + "epoch": 6.544368600682594, + "grad_norm": 0.08945799425634836, + "learning_rate": 2.056120329077329e-06, + "loss": 0.9127, + "step": 30680 + }, + { + "epoch": 6.5445819112627985, + "grad_norm": 0.07639151218448434, + "learning_rate": 2.0542038532363362e-06, + "loss": 0.8642, + "step": 30681 + }, + { + "epoch": 6.544795221843003, + "grad_norm": 0.08978503738253693, + "learning_rate": 2.0522882593533344e-06, + "loss": 0.9006, + "step": 30682 + }, + { + "epoch": 6.545008532423208, + "grad_norm": 0.07625753760329211, + "learning_rate": 2.050373547450004e-06, + "loss": 0.9055, + "step": 30683 + }, + { + "epoch": 6.545221843003413, + "grad_norm": 0.07885799065889994, + "learning_rate": 2.0484597175479992e-06, + "loss": 0.8841, + "step": 30684 + }, + { + "epoch": 6.545435153583618, + "grad_norm": 0.0780217517026666, + "learning_rate": 2.046546769669e-06, + "loss": 0.8884, + "step": 30685 + }, + { + "epoch": 6.545648464163823, + "grad_norm": 0.08125242337991807, + "learning_rate": 2.044634703834625e-06, + "loss": 0.8905, + "step": 30686 + }, + { + "epoch": 6.5458617747440275, + "grad_norm": 0.07507653577594159, + "learning_rate": 2.0427235200665274e-06, + "loss": 0.8647, + "step": 30687 + }, + { + "epoch": 6.546075085324232, + "grad_norm": 0.08095455338408206, + "learning_rate": 2.0408132183863083e-06, + "loss": 0.8912, + "step": 30688 + }, + { + "epoch": 6.546288395904437, + "grad_norm": 0.0783457132787491, + "learning_rate": 2.0389037988156124e-06, + "loss": 0.8907, + "step": 30689 + }, + { + "epoch": 6.546501706484642, + "grad_norm": 0.0787472882549461, + "learning_rate": 2.036995261376031e-06, + "loss": 0.8915, + "step": 30690 + }, + { + "epoch": 6.546715017064846, + "grad_norm": 0.07704170193046664, + "learning_rate": 2.0350876060891565e-06, + "loss": 0.8981, + "step": 30691 + }, + { + "epoch": 6.546928327645051, + "grad_norm": 0.07825336288656524, + "learning_rate": 2.033180832976571e-06, + "loss": 0.8667, + "step": 30692 + }, + { + "epoch": 6.547141638225256, + "grad_norm": 0.07984601691169697, + "learning_rate": 2.0312749420598576e-06, + "loss": 0.9007, + "step": 30693 + }, + { + "epoch": 6.547354948805461, + "grad_norm": 0.0858856260345436, + "learning_rate": 2.0293699333605896e-06, + "loss": 0.8701, + "step": 30694 + }, + { + "epoch": 6.547568259385666, + "grad_norm": 0.07685505266232816, + "learning_rate": 2.027465806900306e-06, + "loss": 0.8692, + "step": 30695 + }, + { + "epoch": 6.547781569965871, + "grad_norm": 0.07310558805111278, + "learning_rate": 2.025562562700554e-06, + "loss": 0.8683, + "step": 30696 + }, + { + "epoch": 6.547994880546075, + "grad_norm": 0.07831482648794755, + "learning_rate": 2.0236602007828887e-06, + "loss": 0.9083, + "step": 30697 + }, + { + "epoch": 6.54820819112628, + "grad_norm": 0.0761725600086747, + "learning_rate": 2.021758721168805e-06, + "loss": 0.8637, + "step": 30698 + }, + { + "epoch": 6.548421501706485, + "grad_norm": 0.0757647792485727, + "learning_rate": 2.019858123879841e-06, + "loss": 0.8725, + "step": 30699 + }, + { + "epoch": 6.548634812286689, + "grad_norm": 0.07583820445897353, + "learning_rate": 2.0179584089374816e-06, + "loss": 0.8681, + "step": 30700 + }, + { + "epoch": 6.548848122866894, + "grad_norm": 0.07383770435545853, + "learning_rate": 2.0160595763632475e-06, + "loss": 0.8778, + "step": 30701 + }, + { + "epoch": 6.549061433447099, + "grad_norm": 0.0788395050355647, + "learning_rate": 2.014161626178606e-06, + "loss": 0.8871, + "step": 30702 + }, + { + "epoch": 6.549274744027304, + "grad_norm": 0.08267039803093107, + "learning_rate": 2.0122645584050416e-06, + "loss": 0.8883, + "step": 30703 + }, + { + "epoch": 6.549488054607508, + "grad_norm": 0.0820832227636177, + "learning_rate": 2.0103683730640046e-06, + "loss": 0.9219, + "step": 30704 + }, + { + "epoch": 6.549701365187714, + "grad_norm": 0.07815463000682767, + "learning_rate": 2.0084730701769615e-06, + "loss": 0.8695, + "step": 30705 + }, + { + "epoch": 6.549914675767918, + "grad_norm": 0.07919504877333415, + "learning_rate": 2.0065786497653717e-06, + "loss": 0.8728, + "step": 30706 + }, + { + "epoch": 6.550127986348123, + "grad_norm": 0.07784549713751705, + "learning_rate": 2.0046851118506484e-06, + "loss": 0.8686, + "step": 30707 + }, + { + "epoch": 6.550341296928328, + "grad_norm": 0.08121470826249842, + "learning_rate": 2.0027924564542234e-06, + "loss": 0.8562, + "step": 30708 + }, + { + "epoch": 6.550554607508532, + "grad_norm": 0.08351088067999933, + "learning_rate": 2.0009006835975197e-06, + "loss": 0.8824, + "step": 30709 + }, + { + "epoch": 6.550767918088737, + "grad_norm": 0.08080069905887707, + "learning_rate": 1.9990097933019335e-06, + "loss": 0.8725, + "step": 30710 + }, + { + "epoch": 6.550981228668942, + "grad_norm": 0.07666715794729939, + "learning_rate": 1.99711978558887e-06, + "loss": 0.8633, + "step": 30711 + }, + { + "epoch": 6.551194539249146, + "grad_norm": 0.08782591058471147, + "learning_rate": 1.9952306604796902e-06, + "loss": 0.9003, + "step": 30712 + }, + { + "epoch": 6.551407849829351, + "grad_norm": 0.08447289482978654, + "learning_rate": 1.9933424179957983e-06, + "loss": 0.9138, + "step": 30713 + }, + { + "epoch": 6.551621160409557, + "grad_norm": 0.08258210424471299, + "learning_rate": 1.991455058158556e-06, + "loss": 0.9072, + "step": 30714 + }, + { + "epoch": 6.551834470989761, + "grad_norm": 0.07575924856725538, + "learning_rate": 1.989568580989305e-06, + "loss": 0.8705, + "step": 30715 + }, + { + "epoch": 6.552047781569966, + "grad_norm": 0.07980535427224941, + "learning_rate": 1.9876829865093895e-06, + "loss": 0.8861, + "step": 30716 + }, + { + "epoch": 6.552261092150171, + "grad_norm": 0.07400534244360914, + "learning_rate": 1.9857982747401607e-06, + "loss": 0.8931, + "step": 30717 + }, + { + "epoch": 6.552474402730375, + "grad_norm": 0.0808613566624512, + "learning_rate": 1.983914445702935e-06, + "loss": 0.885, + "step": 30718 + }, + { + "epoch": 6.55268771331058, + "grad_norm": 0.07603429731176067, + "learning_rate": 1.982031499419037e-06, + "loss": 0.9013, + "step": 30719 + }, + { + "epoch": 6.552901023890785, + "grad_norm": 0.07995778864396749, + "learning_rate": 1.980149435909757e-06, + "loss": 0.8889, + "step": 30720 + }, + { + "epoch": 6.5531143344709895, + "grad_norm": 0.08106493907738054, + "learning_rate": 1.9782682551964027e-06, + "loss": 0.8467, + "step": 30721 + }, + { + "epoch": 6.553327645051194, + "grad_norm": 0.07618701264831583, + "learning_rate": 1.976387957300263e-06, + "loss": 0.86, + "step": 30722 + }, + { + "epoch": 6.5535409556314, + "grad_norm": 0.08194776089557082, + "learning_rate": 1.9745085422425925e-06, + "loss": 0.9103, + "step": 30723 + }, + { + "epoch": 6.553754266211604, + "grad_norm": 0.08257568257244126, + "learning_rate": 1.9726300100446714e-06, + "loss": 0.8734, + "step": 30724 + }, + { + "epoch": 6.553967576791809, + "grad_norm": 0.07749232877810888, + "learning_rate": 1.9707523607277544e-06, + "loss": 0.8641, + "step": 30725 + }, + { + "epoch": 6.554180887372014, + "grad_norm": 0.07805371095081244, + "learning_rate": 1.9688755943130957e-06, + "loss": 0.8797, + "step": 30726 + }, + { + "epoch": 6.5543941979522184, + "grad_norm": 0.07635782764180307, + "learning_rate": 1.966999710821913e-06, + "loss": 0.8929, + "step": 30727 + }, + { + "epoch": 6.554607508532423, + "grad_norm": 0.07805672924993147, + "learning_rate": 1.9651247102754434e-06, + "loss": 0.8705, + "step": 30728 + }, + { + "epoch": 6.554820819112628, + "grad_norm": 0.07982886761908979, + "learning_rate": 1.9632505926948963e-06, + "loss": 0.8889, + "step": 30729 + }, + { + "epoch": 6.5550341296928325, + "grad_norm": 0.07198792883785328, + "learning_rate": 1.9613773581014993e-06, + "loss": 0.8796, + "step": 30730 + }, + { + "epoch": 6.555247440273037, + "grad_norm": 0.08779685406552716, + "learning_rate": 1.959505006516409e-06, + "loss": 0.8558, + "step": 30731 + }, + { + "epoch": 6.555460750853243, + "grad_norm": 0.08091819936727225, + "learning_rate": 1.9576335379608434e-06, + "loss": 0.8986, + "step": 30732 + }, + { + "epoch": 6.555674061433447, + "grad_norm": 0.08331272601755411, + "learning_rate": 1.9557629524559687e-06, + "loss": 0.8888, + "step": 30733 + }, + { + "epoch": 6.555887372013652, + "grad_norm": 0.07962594047635552, + "learning_rate": 1.95389325002294e-06, + "loss": 0.8933, + "step": 30734 + }, + { + "epoch": 6.556100682593857, + "grad_norm": 0.07625146219843103, + "learning_rate": 1.9520244306829326e-06, + "loss": 0.8868, + "step": 30735 + }, + { + "epoch": 6.5563139931740615, + "grad_norm": 0.07486184018662145, + "learning_rate": 1.9501564944570674e-06, + "loss": 0.8858, + "step": 30736 + }, + { + "epoch": 6.556527303754266, + "grad_norm": 0.07320855751646316, + "learning_rate": 1.9482894413665e-06, + "loss": 0.8861, + "step": 30737 + }, + { + "epoch": 6.556740614334471, + "grad_norm": 0.08353479064929685, + "learning_rate": 1.946423271432352e-06, + "loss": 0.8881, + "step": 30738 + }, + { + "epoch": 6.5569539249146755, + "grad_norm": 0.07542533116821774, + "learning_rate": 1.9445579846757434e-06, + "loss": 0.8744, + "step": 30739 + }, + { + "epoch": 6.55716723549488, + "grad_norm": 0.0791432383731466, + "learning_rate": 1.9426935811177604e-06, + "loss": 0.9038, + "step": 30740 + }, + { + "epoch": 6.557380546075086, + "grad_norm": 0.07993648693921086, + "learning_rate": 1.9408300607795328e-06, + "loss": 0.8511, + "step": 30741 + }, + { + "epoch": 6.5575938566552905, + "grad_norm": 0.07940762060383777, + "learning_rate": 1.9389674236821098e-06, + "loss": 0.9186, + "step": 30742 + }, + { + "epoch": 6.557807167235495, + "grad_norm": 0.08097134962740281, + "learning_rate": 1.9371056698465863e-06, + "loss": 0.8826, + "step": 30743 + }, + { + "epoch": 6.5580204778157, + "grad_norm": 0.08101252465782512, + "learning_rate": 1.9352447992940294e-06, + "loss": 0.8938, + "step": 30744 + }, + { + "epoch": 6.5582337883959045, + "grad_norm": 0.07776309778030356, + "learning_rate": 1.933384812045489e-06, + "loss": 0.8723, + "step": 30745 + }, + { + "epoch": 6.558447098976109, + "grad_norm": 0.07757562902969875, + "learning_rate": 1.9315257081220063e-06, + "loss": 0.8851, + "step": 30746 + }, + { + "epoch": 6.558660409556314, + "grad_norm": 0.07655947909231713, + "learning_rate": 1.929667487544631e-06, + "loss": 0.896, + "step": 30747 + }, + { + "epoch": 6.558873720136519, + "grad_norm": 0.07834918757879729, + "learning_rate": 1.927810150334377e-06, + "loss": 0.8972, + "step": 30748 + }, + { + "epoch": 6.559087030716723, + "grad_norm": 0.0729595129890596, + "learning_rate": 1.925953696512268e-06, + "loss": 0.8871, + "step": 30749 + }, + { + "epoch": 6.559300341296928, + "grad_norm": 0.08054892672255136, + "learning_rate": 1.9240981260993008e-06, + "loss": 0.8997, + "step": 30750 + }, + { + "epoch": 6.559513651877133, + "grad_norm": 0.07408056810544063, + "learning_rate": 1.9222434391164803e-06, + "loss": 0.8689, + "step": 30751 + }, + { + "epoch": 6.559726962457338, + "grad_norm": 0.07885445022865167, + "learning_rate": 1.920389635584794e-06, + "loss": 0.8843, + "step": 30752 + }, + { + "epoch": 6.559940273037543, + "grad_norm": 0.07488086142142875, + "learning_rate": 1.9185367155252034e-06, + "loss": 0.8568, + "step": 30753 + }, + { + "epoch": 6.560153583617748, + "grad_norm": 0.0758291623987575, + "learning_rate": 1.9166846789586866e-06, + "loss": 0.8691, + "step": 30754 + }, + { + "epoch": 6.560366894197952, + "grad_norm": 0.0779544997827233, + "learning_rate": 1.9148335259061967e-06, + "loss": 0.9298, + "step": 30755 + }, + { + "epoch": 6.560580204778157, + "grad_norm": 0.07719768486026359, + "learning_rate": 1.9129832563886763e-06, + "loss": 0.8725, + "step": 30756 + }, + { + "epoch": 6.560793515358362, + "grad_norm": 0.07331782912946617, + "learning_rate": 1.911133870427069e-06, + "loss": 0.8719, + "step": 30757 + }, + { + "epoch": 6.561006825938566, + "grad_norm": 0.07529454074494979, + "learning_rate": 1.9092853680422906e-06, + "loss": 0.8738, + "step": 30758 + }, + { + "epoch": 6.561220136518771, + "grad_norm": 0.07766123601192221, + "learning_rate": 1.9074377492552675e-06, + "loss": 0.8609, + "step": 30759 + }, + { + "epoch": 6.561433447098976, + "grad_norm": 0.06873853581280809, + "learning_rate": 1.9055910140868893e-06, + "loss": 0.8513, + "step": 30760 + }, + { + "epoch": 6.561646757679181, + "grad_norm": 0.07812666503956074, + "learning_rate": 1.9037451625580639e-06, + "loss": 0.8801, + "step": 30761 + }, + { + "epoch": 6.561860068259386, + "grad_norm": 0.07415843340138925, + "learning_rate": 1.901900194689681e-06, + "loss": 0.8704, + "step": 30762 + }, + { + "epoch": 6.562073378839591, + "grad_norm": 0.07942098213972995, + "learning_rate": 1.9000561105026127e-06, + "loss": 0.884, + "step": 30763 + }, + { + "epoch": 6.562286689419795, + "grad_norm": 0.07676089770937068, + "learning_rate": 1.8982129100177226e-06, + "loss": 0.8894, + "step": 30764 + }, + { + "epoch": 6.5625, + "grad_norm": 0.07630852571770742, + "learning_rate": 1.896370593255865e-06, + "loss": 0.8648, + "step": 30765 + }, + { + "epoch": 6.562713310580205, + "grad_norm": 0.0746109312030182, + "learning_rate": 1.8945291602378946e-06, + "loss": 0.8791, + "step": 30766 + }, + { + "epoch": 6.562926621160409, + "grad_norm": 0.08136735937852513, + "learning_rate": 1.892688610984621e-06, + "loss": 0.8746, + "step": 30767 + }, + { + "epoch": 6.563139931740614, + "grad_norm": 0.08172712950004342, + "learning_rate": 1.8908489455169077e-06, + "loss": 0.8906, + "step": 30768 + }, + { + "epoch": 6.563353242320819, + "grad_norm": 0.08620344627368785, + "learning_rate": 1.8890101638555469e-06, + "loss": 0.8799, + "step": 30769 + }, + { + "epoch": 6.563566552901024, + "grad_norm": 0.0794059876845094, + "learning_rate": 1.8871722660213576e-06, + "loss": 0.8723, + "step": 30770 + }, + { + "epoch": 6.563779863481229, + "grad_norm": 0.08028022258169097, + "learning_rate": 1.8853352520351143e-06, + "loss": 0.8895, + "step": 30771 + }, + { + "epoch": 6.563993174061434, + "grad_norm": 0.0715310786340315, + "learning_rate": 1.883499121917618e-06, + "loss": 0.8811, + "step": 30772 + }, + { + "epoch": 6.564206484641638, + "grad_norm": 0.0739705538671538, + "learning_rate": 1.8816638756896522e-06, + "loss": 0.9021, + "step": 30773 + }, + { + "epoch": 6.564419795221843, + "grad_norm": 0.07041425729714484, + "learning_rate": 1.8798295133719735e-06, + "loss": 0.8679, + "step": 30774 + }, + { + "epoch": 6.564633105802048, + "grad_norm": 0.07488295853864763, + "learning_rate": 1.87799603498533e-06, + "loss": 0.8836, + "step": 30775 + }, + { + "epoch": 6.564846416382252, + "grad_norm": 0.07586020127761842, + "learning_rate": 1.8761634405504781e-06, + "loss": 0.8779, + "step": 30776 + }, + { + "epoch": 6.565059726962457, + "grad_norm": 0.07529612091208096, + "learning_rate": 1.874331730088157e-06, + "loss": 0.8674, + "step": 30777 + }, + { + "epoch": 6.565273037542662, + "grad_norm": 0.07537922861936995, + "learning_rate": 1.8725009036190789e-06, + "loss": 0.8852, + "step": 30778 + }, + { + "epoch": 6.565486348122867, + "grad_norm": 0.0793208582548192, + "learning_rate": 1.870670961163965e-06, + "loss": 0.8736, + "step": 30779 + }, + { + "epoch": 6.565699658703072, + "grad_norm": 0.07945337517595201, + "learning_rate": 1.8688419027435278e-06, + "loss": 0.909, + "step": 30780 + }, + { + "epoch": 6.565912969283277, + "grad_norm": 0.07230120656083976, + "learning_rate": 1.8670137283784616e-06, + "loss": 0.8961, + "step": 30781 + }, + { + "epoch": 6.566126279863481, + "grad_norm": 0.06971434748594722, + "learning_rate": 1.8651864380894525e-06, + "loss": 0.8567, + "step": 30782 + }, + { + "epoch": 6.566339590443686, + "grad_norm": 0.07687160785641234, + "learning_rate": 1.8633600318971678e-06, + "loss": 0.9134, + "step": 30783 + }, + { + "epoch": 6.566552901023891, + "grad_norm": 0.0735338667527814, + "learning_rate": 1.861534509822267e-06, + "loss": 0.8735, + "step": 30784 + }, + { + "epoch": 6.5667662116040955, + "grad_norm": 0.07623047875474206, + "learning_rate": 1.8597098718854356e-06, + "loss": 0.8804, + "step": 30785 + }, + { + "epoch": 6.5669795221843, + "grad_norm": 0.07984175873057382, + "learning_rate": 1.8578861181072971e-06, + "loss": 0.895, + "step": 30786 + }, + { + "epoch": 6.567192832764505, + "grad_norm": 0.06959039669383495, + "learning_rate": 1.856063248508484e-06, + "loss": 0.85, + "step": 30787 + }, + { + "epoch": 6.5674061433447095, + "grad_norm": 0.0739085563779725, + "learning_rate": 1.8542412631096374e-06, + "loss": 0.8924, + "step": 30788 + }, + { + "epoch": 6.567619453924914, + "grad_norm": 0.07318244831767266, + "learning_rate": 1.8524201619313631e-06, + "loss": 0.8593, + "step": 30789 + }, + { + "epoch": 6.56783276450512, + "grad_norm": 0.07842546942650587, + "learning_rate": 1.8505999449942669e-06, + "loss": 0.8912, + "step": 30790 + }, + { + "epoch": 6.5680460750853245, + "grad_norm": 0.07845722634828656, + "learning_rate": 1.8487806123189456e-06, + "loss": 0.8659, + "step": 30791 + }, + { + "epoch": 6.568259385665529, + "grad_norm": 0.0779789275714105, + "learning_rate": 1.8469621639259871e-06, + "loss": 0.8857, + "step": 30792 + }, + { + "epoch": 6.568472696245734, + "grad_norm": 0.07893900540659156, + "learning_rate": 1.8451445998359707e-06, + "loss": 0.9016, + "step": 30793 + }, + { + "epoch": 6.5686860068259385, + "grad_norm": 0.07518471801065991, + "learning_rate": 1.8433279200694576e-06, + "loss": 0.8898, + "step": 30794 + }, + { + "epoch": 6.568899317406143, + "grad_norm": 0.07363550333034953, + "learning_rate": 1.8415121246470002e-06, + "loss": 0.8736, + "step": 30795 + }, + { + "epoch": 6.569112627986348, + "grad_norm": 0.0795603901633734, + "learning_rate": 1.8396972135891422e-06, + "loss": 0.9031, + "step": 30796 + }, + { + "epoch": 6.569325938566553, + "grad_norm": 0.08665356362192574, + "learning_rate": 1.8378831869164448e-06, + "loss": 0.8849, + "step": 30797 + }, + { + "epoch": 6.569539249146757, + "grad_norm": 0.07792943308557311, + "learning_rate": 1.8360700446493984e-06, + "loss": 0.8441, + "step": 30798 + }, + { + "epoch": 6.569752559726963, + "grad_norm": 0.079189445498134, + "learning_rate": 1.8342577868085287e-06, + "loss": 0.8801, + "step": 30799 + }, + { + "epoch": 6.5699658703071675, + "grad_norm": 0.0728216443812128, + "learning_rate": 1.8324464134143617e-06, + "loss": 0.8466, + "step": 30800 + }, + { + "epoch": 6.570179180887372, + "grad_norm": 0.0740268617184192, + "learning_rate": 1.8306359244873694e-06, + "loss": 0.8926, + "step": 30801 + }, + { + "epoch": 6.570392491467577, + "grad_norm": 0.08269000425989169, + "learning_rate": 1.8288263200480517e-06, + "loss": 0.8599, + "step": 30802 + }, + { + "epoch": 6.5706058020477816, + "grad_norm": 0.08606551719069076, + "learning_rate": 1.8270176001168716e-06, + "loss": 0.9008, + "step": 30803 + }, + { + "epoch": 6.570819112627986, + "grad_norm": 0.07610510470875678, + "learning_rate": 1.8252097647143018e-06, + "loss": 0.897, + "step": 30804 + }, + { + "epoch": 6.571032423208191, + "grad_norm": 0.07684681590793654, + "learning_rate": 1.8234028138608063e-06, + "loss": 0.8817, + "step": 30805 + }, + { + "epoch": 6.571245733788396, + "grad_norm": 0.07338970122337866, + "learning_rate": 1.8215967475768214e-06, + "loss": 0.8652, + "step": 30806 + }, + { + "epoch": 6.5714590443686, + "grad_norm": 0.07970665016288195, + "learning_rate": 1.8197915658827847e-06, + "loss": 0.8856, + "step": 30807 + }, + { + "epoch": 6.571672354948806, + "grad_norm": 0.0753472958497041, + "learning_rate": 1.8179872687991152e-06, + "loss": 0.8774, + "step": 30808 + }, + { + "epoch": 6.5718856655290105, + "grad_norm": 0.08527921937227825, + "learning_rate": 1.8161838563462408e-06, + "loss": 0.9049, + "step": 30809 + }, + { + "epoch": 6.572098976109215, + "grad_norm": 0.07578189802657184, + "learning_rate": 1.8143813285445634e-06, + "loss": 0.9217, + "step": 30810 + }, + { + "epoch": 6.57231228668942, + "grad_norm": 0.07516505493319522, + "learning_rate": 1.8125796854144663e-06, + "loss": 0.9171, + "step": 30811 + }, + { + "epoch": 6.572525597269625, + "grad_norm": 0.07814449684000746, + "learning_rate": 1.8107789269763508e-06, + "loss": 0.899, + "step": 30812 + }, + { + "epoch": 6.572738907849829, + "grad_norm": 0.08231949632004912, + "learning_rate": 1.8089790532505925e-06, + "loss": 0.8651, + "step": 30813 + }, + { + "epoch": 6.572952218430034, + "grad_norm": 0.07142959865342886, + "learning_rate": 1.8071800642575477e-06, + "loss": 0.8768, + "step": 30814 + }, + { + "epoch": 6.573165529010239, + "grad_norm": 0.07582908404577422, + "learning_rate": 1.8053819600175648e-06, + "loss": 0.8816, + "step": 30815 + }, + { + "epoch": 6.573378839590443, + "grad_norm": 0.07372979994486938, + "learning_rate": 1.8035847405510099e-06, + "loss": 0.8872, + "step": 30816 + }, + { + "epoch": 6.573592150170649, + "grad_norm": 0.07834234838585251, + "learning_rate": 1.801788405878213e-06, + "loss": 0.8688, + "step": 30817 + }, + { + "epoch": 6.573805460750854, + "grad_norm": 0.0712585268545464, + "learning_rate": 1.7999929560194873e-06, + "loss": 0.8652, + "step": 30818 + }, + { + "epoch": 6.574018771331058, + "grad_norm": 0.08071944995432319, + "learning_rate": 1.7981983909951628e-06, + "loss": 0.8975, + "step": 30819 + }, + { + "epoch": 6.574232081911263, + "grad_norm": 0.08141117298240907, + "learning_rate": 1.7964047108255344e-06, + "loss": 0.9139, + "step": 30820 + }, + { + "epoch": 6.574445392491468, + "grad_norm": 0.07579463632229136, + "learning_rate": 1.7946119155309061e-06, + "loss": 0.8792, + "step": 30821 + }, + { + "epoch": 6.574658703071672, + "grad_norm": 0.08650020474973635, + "learning_rate": 1.7928200051315635e-06, + "loss": 0.853, + "step": 30822 + }, + { + "epoch": 6.574872013651877, + "grad_norm": 0.08055069851847439, + "learning_rate": 1.7910289796477664e-06, + "loss": 0.8716, + "step": 30823 + }, + { + "epoch": 6.575085324232082, + "grad_norm": 0.07841094901356811, + "learning_rate": 1.7892388390998005e-06, + "loss": 0.8667, + "step": 30824 + }, + { + "epoch": 6.575298634812286, + "grad_norm": 0.08079437594148478, + "learning_rate": 1.7874495835079076e-06, + "loss": 0.8893, + "step": 30825 + }, + { + "epoch": 6.575511945392492, + "grad_norm": 0.08184215340954511, + "learning_rate": 1.7856612128923378e-06, + "loss": 0.8848, + "step": 30826 + }, + { + "epoch": 6.575725255972696, + "grad_norm": 0.07924692932124709, + "learning_rate": 1.7838737272733243e-06, + "loss": 0.8997, + "step": 30827 + }, + { + "epoch": 6.575938566552901, + "grad_norm": 0.07503022887880252, + "learning_rate": 1.7820871266711082e-06, + "loss": 0.8701, + "step": 30828 + }, + { + "epoch": 6.576151877133106, + "grad_norm": 0.07538667707289747, + "learning_rate": 1.7803014111058781e-06, + "loss": 0.8875, + "step": 30829 + }, + { + "epoch": 6.576365187713311, + "grad_norm": 0.07233174907557952, + "learning_rate": 1.7785165805978666e-06, + "loss": 0.8496, + "step": 30830 + }, + { + "epoch": 6.576578498293515, + "grad_norm": 0.07999452825929862, + "learning_rate": 1.7767326351672443e-06, + "loss": 0.8863, + "step": 30831 + }, + { + "epoch": 6.57679180887372, + "grad_norm": 0.07237679140482751, + "learning_rate": 1.7749495748342171e-06, + "loss": 0.8942, + "step": 30832 + }, + { + "epoch": 6.577005119453925, + "grad_norm": 0.07441389737678478, + "learning_rate": 1.773167399618947e-06, + "loss": 0.8772, + "step": 30833 + }, + { + "epoch": 6.577218430034129, + "grad_norm": 0.07610895748566254, + "learning_rate": 1.771386109541604e-06, + "loss": 0.8733, + "step": 30834 + }, + { + "epoch": 6.577431740614334, + "grad_norm": 0.07763044171735833, + "learning_rate": 1.7696057046223326e-06, + "loss": 0.8906, + "step": 30835 + }, + { + "epoch": 6.577645051194539, + "grad_norm": 0.0759822723515104, + "learning_rate": 1.7678261848813028e-06, + "loss": 0.8766, + "step": 30836 + }, + { + "epoch": 6.577858361774744, + "grad_norm": 0.07501726629574891, + "learning_rate": 1.7660475503386321e-06, + "loss": 0.8781, + "step": 30837 + }, + { + "epoch": 6.578071672354949, + "grad_norm": 0.07294854308669203, + "learning_rate": 1.7642698010144466e-06, + "loss": 0.8828, + "step": 30838 + }, + { + "epoch": 6.578284982935154, + "grad_norm": 0.07220062593129317, + "learning_rate": 1.7624929369288634e-06, + "loss": 0.8619, + "step": 30839 + }, + { + "epoch": 6.578498293515358, + "grad_norm": 0.07825458244262087, + "learning_rate": 1.7607169581019913e-06, + "loss": 0.8758, + "step": 30840 + }, + { + "epoch": 6.578711604095563, + "grad_norm": 0.07561783621673747, + "learning_rate": 1.7589418645539202e-06, + "loss": 0.8812, + "step": 30841 + }, + { + "epoch": 6.578924914675768, + "grad_norm": 0.0760406203359973, + "learning_rate": 1.7571676563047503e-06, + "loss": 0.8961, + "step": 30842 + }, + { + "epoch": 6.5791382252559725, + "grad_norm": 0.07054977413785965, + "learning_rate": 1.755394333374527e-06, + "loss": 0.8764, + "step": 30843 + }, + { + "epoch": 6.579351535836177, + "grad_norm": 0.07503116455882057, + "learning_rate": 1.7536218957833417e-06, + "loss": 0.8715, + "step": 30844 + }, + { + "epoch": 6.579564846416382, + "grad_norm": 0.07301713250549514, + "learning_rate": 1.7518503435512402e-06, + "loss": 0.8548, + "step": 30845 + }, + { + "epoch": 6.579778156996587, + "grad_norm": 0.07752305626643075, + "learning_rate": 1.7500796766982687e-06, + "loss": 0.8828, + "step": 30846 + }, + { + "epoch": 6.579991467576792, + "grad_norm": 0.07440528808118733, + "learning_rate": 1.7483098952444555e-06, + "loss": 0.8567, + "step": 30847 + }, + { + "epoch": 6.580204778156997, + "grad_norm": 0.07531174271524432, + "learning_rate": 1.7465409992098381e-06, + "loss": 0.8786, + "step": 30848 + }, + { + "epoch": 6.5804180887372015, + "grad_norm": 0.0760374431170557, + "learning_rate": 1.7447729886144183e-06, + "loss": 0.9109, + "step": 30849 + }, + { + "epoch": 6.580631399317406, + "grad_norm": 0.07618333714267021, + "learning_rate": 1.7430058634782154e-06, + "loss": 0.8789, + "step": 30850 + }, + { + "epoch": 6.580844709897611, + "grad_norm": 0.07017697209090401, + "learning_rate": 1.7412396238212137e-06, + "loss": 0.8625, + "step": 30851 + }, + { + "epoch": 6.5810580204778155, + "grad_norm": 0.07839847731077666, + "learning_rate": 1.739474269663406e-06, + "loss": 0.907, + "step": 30852 + }, + { + "epoch": 6.58127133105802, + "grad_norm": 0.07910612331861114, + "learning_rate": 1.7377098010247672e-06, + "loss": 0.8629, + "step": 30853 + }, + { + "epoch": 6.581484641638225, + "grad_norm": 0.07576008888040356, + "learning_rate": 1.7359462179252552e-06, + "loss": 0.8932, + "step": 30854 + }, + { + "epoch": 6.5816979522184305, + "grad_norm": 0.07306157511734124, + "learning_rate": 1.7341835203848179e-06, + "loss": 0.8989, + "step": 30855 + }, + { + "epoch": 6.581911262798635, + "grad_norm": 0.07355238845683265, + "learning_rate": 1.732421708423422e-06, + "loss": 0.8613, + "step": 30856 + }, + { + "epoch": 6.58212457337884, + "grad_norm": 0.07475530526000979, + "learning_rate": 1.730660782060989e-06, + "loss": 0.8804, + "step": 30857 + }, + { + "epoch": 6.5823378839590445, + "grad_norm": 0.09658885509728929, + "learning_rate": 1.7289007413174496e-06, + "loss": 0.8923, + "step": 30858 + }, + { + "epoch": 6.582551194539249, + "grad_norm": 0.07768314981043264, + "learning_rate": 1.7271415862126995e-06, + "loss": 0.9297, + "step": 30859 + }, + { + "epoch": 6.582764505119454, + "grad_norm": 0.07417163429373234, + "learning_rate": 1.7253833167666689e-06, + "loss": 0.8716, + "step": 30860 + }, + { + "epoch": 6.582977815699659, + "grad_norm": 0.07464336613079933, + "learning_rate": 1.7236259329992445e-06, + "loss": 0.8683, + "step": 30861 + }, + { + "epoch": 6.583191126279863, + "grad_norm": 0.07408783219389178, + "learning_rate": 1.7218694349303122e-06, + "loss": 0.8656, + "step": 30862 + }, + { + "epoch": 6.583404436860068, + "grad_norm": 0.07649279213498228, + "learning_rate": 1.720113822579732e-06, + "loss": 0.8744, + "step": 30863 + }, + { + "epoch": 6.5836177474402735, + "grad_norm": 0.08163960624608178, + "learning_rate": 1.718359095967399e-06, + "loss": 0.889, + "step": 30864 + }, + { + "epoch": 6.583831058020478, + "grad_norm": 0.08191236998034881, + "learning_rate": 1.7166052551131375e-06, + "loss": 0.8575, + "step": 30865 + }, + { + "epoch": 6.584044368600683, + "grad_norm": 0.07803946990254068, + "learning_rate": 1.7148523000368066e-06, + "loss": 0.9248, + "step": 30866 + }, + { + "epoch": 6.584257679180888, + "grad_norm": 0.06937056325165256, + "learning_rate": 1.7131002307582312e-06, + "loss": 0.8475, + "step": 30867 + }, + { + "epoch": 6.584470989761092, + "grad_norm": 0.08180888393522555, + "learning_rate": 1.7113490472972526e-06, + "loss": 0.8481, + "step": 30868 + }, + { + "epoch": 6.584684300341297, + "grad_norm": 0.07497117706255929, + "learning_rate": 1.7095987496736776e-06, + "loss": 0.8403, + "step": 30869 + }, + { + "epoch": 6.584897610921502, + "grad_norm": 0.07652713366125599, + "learning_rate": 1.7078493379073035e-06, + "loss": 0.8725, + "step": 30870 + }, + { + "epoch": 6.585110921501706, + "grad_norm": 0.07810420830635988, + "learning_rate": 1.706100812017928e-06, + "loss": 0.9055, + "step": 30871 + }, + { + "epoch": 6.585324232081911, + "grad_norm": 0.08116473635005152, + "learning_rate": 1.7043531720253393e-06, + "loss": 0.8622, + "step": 30872 + }, + { + "epoch": 6.585537542662116, + "grad_norm": 0.07241616863632745, + "learning_rate": 1.7026064179493174e-06, + "loss": 0.8829, + "step": 30873 + }, + { + "epoch": 6.58575085324232, + "grad_norm": 0.07686382163974573, + "learning_rate": 1.7008605498096242e-06, + "loss": 0.8529, + "step": 30874 + }, + { + "epoch": 6.585964163822526, + "grad_norm": 0.0770792506886341, + "learning_rate": 1.6991155676259952e-06, + "loss": 0.8784, + "step": 30875 + }, + { + "epoch": 6.586177474402731, + "grad_norm": 0.08108657791137103, + "learning_rate": 1.6973714714182098e-06, + "loss": 0.8895, + "step": 30876 + }, + { + "epoch": 6.586390784982935, + "grad_norm": 0.07850869064821882, + "learning_rate": 1.695628261205977e-06, + "loss": 0.8847, + "step": 30877 + }, + { + "epoch": 6.58660409556314, + "grad_norm": 0.07603669465886073, + "learning_rate": 1.6938859370090322e-06, + "loss": 0.9028, + "step": 30878 + }, + { + "epoch": 6.586817406143345, + "grad_norm": 0.08359249543544794, + "learning_rate": 1.692144498847066e-06, + "loss": 0.8948, + "step": 30879 + }, + { + "epoch": 6.587030716723549, + "grad_norm": 0.07328018359324247, + "learning_rate": 1.6904039467398226e-06, + "loss": 0.8664, + "step": 30880 + }, + { + "epoch": 6.587244027303754, + "grad_norm": 0.07987379541862008, + "learning_rate": 1.6886642807069664e-06, + "loss": 0.8986, + "step": 30881 + }, + { + "epoch": 6.587457337883959, + "grad_norm": 0.07724411368019869, + "learning_rate": 1.6869255007681973e-06, + "loss": 0.8826, + "step": 30882 + }, + { + "epoch": 6.587670648464163, + "grad_norm": 0.07614443290543393, + "learning_rate": 1.6851876069431705e-06, + "loss": 0.8978, + "step": 30883 + }, + { + "epoch": 6.587883959044369, + "grad_norm": 0.07605593360485231, + "learning_rate": 1.6834505992515683e-06, + "loss": 0.8741, + "step": 30884 + }, + { + "epoch": 6.588097269624574, + "grad_norm": 0.07762885504430322, + "learning_rate": 1.6817144777130457e-06, + "loss": 0.8696, + "step": 30885 + }, + { + "epoch": 6.588310580204778, + "grad_norm": 0.07771909886628246, + "learning_rate": 1.6799792423472406e-06, + "loss": 0.9162, + "step": 30886 + }, + { + "epoch": 6.588523890784983, + "grad_norm": 0.07855410510036032, + "learning_rate": 1.6782448931737815e-06, + "loss": 0.8721, + "step": 30887 + }, + { + "epoch": 6.588737201365188, + "grad_norm": 0.07431920579229155, + "learning_rate": 1.6765114302123064e-06, + "loss": 0.8871, + "step": 30888 + }, + { + "epoch": 6.588950511945392, + "grad_norm": 0.0743550999064739, + "learning_rate": 1.6747788534824171e-06, + "loss": 0.8821, + "step": 30889 + }, + { + "epoch": 6.589163822525597, + "grad_norm": 0.07607101010656941, + "learning_rate": 1.6730471630037248e-06, + "loss": 0.862, + "step": 30890 + }, + { + "epoch": 6.589377133105802, + "grad_norm": 0.07296867670500486, + "learning_rate": 1.6713163587958137e-06, + "loss": 0.8799, + "step": 30891 + }, + { + "epoch": 6.589590443686006, + "grad_norm": 0.08465205507658355, + "learning_rate": 1.6695864408782857e-06, + "loss": 0.8698, + "step": 30892 + }, + { + "epoch": 6.589803754266212, + "grad_norm": 0.07793998834922942, + "learning_rate": 1.667857409270699e-06, + "loss": 0.8429, + "step": 30893 + }, + { + "epoch": 6.590017064846417, + "grad_norm": 0.0794641673574703, + "learning_rate": 1.6661292639926286e-06, + "loss": 0.9062, + "step": 30894 + }, + { + "epoch": 6.590230375426621, + "grad_norm": 0.07702396047348722, + "learning_rate": 1.6644020050636146e-06, + "loss": 0.8761, + "step": 30895 + }, + { + "epoch": 6.590443686006826, + "grad_norm": 0.07690690758540616, + "learning_rate": 1.6626756325032145e-06, + "loss": 0.8625, + "step": 30896 + }, + { + "epoch": 6.590656996587031, + "grad_norm": 0.08223766082480374, + "learning_rate": 1.6609501463309595e-06, + "loss": 0.8753, + "step": 30897 + }, + { + "epoch": 6.590870307167235, + "grad_norm": 0.0754330303757201, + "learning_rate": 1.659225546566372e-06, + "loss": 0.8618, + "step": 30898 + }, + { + "epoch": 6.59108361774744, + "grad_norm": 0.08329517465659479, + "learning_rate": 1.657501833228965e-06, + "loss": 0.9171, + "step": 30899 + }, + { + "epoch": 6.591296928327645, + "grad_norm": 0.0735906094378128, + "learning_rate": 1.655779006338234e-06, + "loss": 0.8796, + "step": 30900 + }, + { + "epoch": 6.5915102389078495, + "grad_norm": 0.07710571409340586, + "learning_rate": 1.6540570659136923e-06, + "loss": 0.8526, + "step": 30901 + }, + { + "epoch": 6.591723549488055, + "grad_norm": 0.07739265395401078, + "learning_rate": 1.6523360119748088e-06, + "loss": 0.8563, + "step": 30902 + }, + { + "epoch": 6.59193686006826, + "grad_norm": 0.07153599735622043, + "learning_rate": 1.6506158445410525e-06, + "loss": 0.8774, + "step": 30903 + }, + { + "epoch": 6.592150170648464, + "grad_norm": 0.07768909287712858, + "learning_rate": 1.648896563631901e-06, + "loss": 0.8899, + "step": 30904 + }, + { + "epoch": 6.592363481228669, + "grad_norm": 0.07616299688581775, + "learning_rate": 1.6471781692668054e-06, + "loss": 0.893, + "step": 30905 + }, + { + "epoch": 6.592576791808874, + "grad_norm": 0.08034221726770253, + "learning_rate": 1.645460661465208e-06, + "loss": 0.914, + "step": 30906 + }, + { + "epoch": 6.5927901023890785, + "grad_norm": 0.08090553490818894, + "learning_rate": 1.6437440402465333e-06, + "loss": 0.8754, + "step": 30907 + }, + { + "epoch": 6.593003412969283, + "grad_norm": 0.0800593520839015, + "learning_rate": 1.6420283056302232e-06, + "loss": 0.8927, + "step": 30908 + }, + { + "epoch": 6.593216723549488, + "grad_norm": 0.0806220593461297, + "learning_rate": 1.6403134576356849e-06, + "loss": 0.8833, + "step": 30909 + }, + { + "epoch": 6.5934300341296925, + "grad_norm": 0.08355003376440148, + "learning_rate": 1.6385994962823071e-06, + "loss": 0.9, + "step": 30910 + }, + { + "epoch": 6.593643344709898, + "grad_norm": 0.0730966934308441, + "learning_rate": 1.6368864215894963e-06, + "loss": 0.8904, + "step": 30911 + }, + { + "epoch": 6.593856655290102, + "grad_norm": 0.07695449438134778, + "learning_rate": 1.635174233576633e-06, + "loss": 0.8926, + "step": 30912 + }, + { + "epoch": 6.5940699658703075, + "grad_norm": 0.07433423504349544, + "learning_rate": 1.6334629322630968e-06, + "loss": 0.8554, + "step": 30913 + }, + { + "epoch": 6.594283276450512, + "grad_norm": 0.07655216672645196, + "learning_rate": 1.6317525176682415e-06, + "loss": 0.8773, + "step": 30914 + }, + { + "epoch": 6.594496587030717, + "grad_norm": 0.07581888371994275, + "learning_rate": 1.6300429898114112e-06, + "loss": 0.8883, + "step": 30915 + }, + { + "epoch": 6.5947098976109215, + "grad_norm": 0.07649124288778794, + "learning_rate": 1.6283343487119774e-06, + "loss": 0.8768, + "step": 30916 + }, + { + "epoch": 6.594923208191126, + "grad_norm": 0.07688048227599947, + "learning_rate": 1.626626594389258e-06, + "loss": 0.8667, + "step": 30917 + }, + { + "epoch": 6.595136518771331, + "grad_norm": 0.07603264070523838, + "learning_rate": 1.6249197268625795e-06, + "loss": 0.8775, + "step": 30918 + }, + { + "epoch": 6.595349829351536, + "grad_norm": 0.08503407111418482, + "learning_rate": 1.623213746151242e-06, + "loss": 0.8841, + "step": 30919 + }, + { + "epoch": 6.59556313993174, + "grad_norm": 0.0741933643142467, + "learning_rate": 1.6215086522745726e-06, + "loss": 0.8648, + "step": 30920 + }, + { + "epoch": 6.595776450511945, + "grad_norm": 0.07613198747007283, + "learning_rate": 1.6198044452518535e-06, + "loss": 0.9045, + "step": 30921 + }, + { + "epoch": 6.5959897610921505, + "grad_norm": 0.07774849898181271, + "learning_rate": 1.6181011251023582e-06, + "loss": 0.8787, + "step": 30922 + }, + { + "epoch": 6.596203071672355, + "grad_norm": 0.07910758305212108, + "learning_rate": 1.61639869184536e-06, + "loss": 0.89, + "step": 30923 + }, + { + "epoch": 6.59641638225256, + "grad_norm": 0.08165718292149138, + "learning_rate": 1.6146971455001326e-06, + "loss": 0.8972, + "step": 30924 + }, + { + "epoch": 6.596629692832765, + "grad_norm": 0.07933423908907125, + "learning_rate": 1.6129964860859315e-06, + "loss": 0.9111, + "step": 30925 + }, + { + "epoch": 6.596843003412969, + "grad_norm": 0.07467731051090543, + "learning_rate": 1.611296713621986e-06, + "loss": 0.8818, + "step": 30926 + }, + { + "epoch": 6.597056313993174, + "grad_norm": 0.07300213289992069, + "learning_rate": 1.6095978281275337e-06, + "loss": 0.8723, + "step": 30927 + }, + { + "epoch": 6.597269624573379, + "grad_norm": 0.0777067787783529, + "learning_rate": 1.6078998296218128e-06, + "loss": 0.8997, + "step": 30928 + }, + { + "epoch": 6.597482935153583, + "grad_norm": 0.08984760111240464, + "learning_rate": 1.606202718124017e-06, + "loss": 0.9128, + "step": 30929 + }, + { + "epoch": 6.597696245733788, + "grad_norm": 0.07344330745581595, + "learning_rate": 1.6045064936533572e-06, + "loss": 0.8722, + "step": 30930 + }, + { + "epoch": 6.597909556313994, + "grad_norm": 0.08107933745993973, + "learning_rate": 1.602811156229027e-06, + "loss": 0.8625, + "step": 30931 + }, + { + "epoch": 6.598122866894198, + "grad_norm": 0.07657795684621042, + "learning_rate": 1.6011167058702115e-06, + "loss": 0.881, + "step": 30932 + }, + { + "epoch": 6.598336177474403, + "grad_norm": 0.07165565256699068, + "learning_rate": 1.5994231425960683e-06, + "loss": 0.8831, + "step": 30933 + }, + { + "epoch": 6.598549488054608, + "grad_norm": 0.07497414486596092, + "learning_rate": 1.597730466425782e-06, + "loss": 0.838, + "step": 30934 + }, + { + "epoch": 6.598762798634812, + "grad_norm": 0.08120023714752356, + "learning_rate": 1.5960386773784753e-06, + "loss": 0.8647, + "step": 30935 + }, + { + "epoch": 6.598976109215017, + "grad_norm": 0.0835528812834648, + "learning_rate": 1.5943477754733238e-06, + "loss": 0.8597, + "step": 30936 + }, + { + "epoch": 6.599189419795222, + "grad_norm": 0.07977459173145285, + "learning_rate": 1.5926577607294503e-06, + "loss": 0.9024, + "step": 30937 + }, + { + "epoch": 6.599402730375426, + "grad_norm": 0.07897259931514275, + "learning_rate": 1.590968633165968e-06, + "loss": 0.8683, + "step": 30938 + }, + { + "epoch": 6.599616040955631, + "grad_norm": 0.07666885347193816, + "learning_rate": 1.5892803928019907e-06, + "loss": 0.8705, + "step": 30939 + }, + { + "epoch": 6.599829351535837, + "grad_norm": 0.08350324592293515, + "learning_rate": 1.587593039656623e-06, + "loss": 0.8552, + "step": 30940 + }, + { + "epoch": 6.600042662116041, + "grad_norm": 0.07439581424158377, + "learning_rate": 1.5859065737489698e-06, + "loss": 0.8948, + "step": 30941 + }, + { + "epoch": 6.600255972696246, + "grad_norm": 0.07824121369192714, + "learning_rate": 1.5842209950981002e-06, + "loss": 0.9101, + "step": 30942 + }, + { + "epoch": 6.600469283276451, + "grad_norm": 0.08050331790258716, + "learning_rate": 1.5825363037230834e-06, + "loss": 0.8744, + "step": 30943 + }, + { + "epoch": 6.600682593856655, + "grad_norm": 0.07721968957217965, + "learning_rate": 1.5808524996429886e-06, + "loss": 0.8951, + "step": 30944 + }, + { + "epoch": 6.60089590443686, + "grad_norm": 0.09050736268257528, + "learning_rate": 1.5791695828768672e-06, + "loss": 0.9134, + "step": 30945 + }, + { + "epoch": 6.601109215017065, + "grad_norm": 0.08022959553639648, + "learning_rate": 1.5774875534437705e-06, + "loss": 0.8754, + "step": 30946 + }, + { + "epoch": 6.601322525597269, + "grad_norm": 0.0831144307906387, + "learning_rate": 1.575806411362706e-06, + "loss": 0.8387, + "step": 30947 + }, + { + "epoch": 6.601535836177474, + "grad_norm": 0.0792569048872012, + "learning_rate": 1.5741261566527154e-06, + "loss": 0.8978, + "step": 30948 + }, + { + "epoch": 6.60174914675768, + "grad_norm": 0.07474680946585247, + "learning_rate": 1.5724467893328066e-06, + "loss": 0.8876, + "step": 30949 + }, + { + "epoch": 6.601962457337884, + "grad_norm": 0.07817543108313754, + "learning_rate": 1.570768309421986e-06, + "loss": 0.872, + "step": 30950 + }, + { + "epoch": 6.602175767918089, + "grad_norm": 0.08680516200868148, + "learning_rate": 1.5690907169392345e-06, + "loss": 0.8654, + "step": 30951 + }, + { + "epoch": 6.602389078498294, + "grad_norm": 0.08038903179713058, + "learning_rate": 1.5674140119035497e-06, + "loss": 0.8857, + "step": 30952 + }, + { + "epoch": 6.602602389078498, + "grad_norm": 0.08400730040607651, + "learning_rate": 1.5657381943338857e-06, + "loss": 0.879, + "step": 30953 + }, + { + "epoch": 6.602815699658703, + "grad_norm": 0.07666848829982245, + "learning_rate": 1.5640632642492226e-06, + "loss": 0.8936, + "step": 30954 + }, + { + "epoch": 6.603029010238908, + "grad_norm": 0.07336735204984564, + "learning_rate": 1.5623892216684966e-06, + "loss": 0.8969, + "step": 30955 + }, + { + "epoch": 6.603242320819112, + "grad_norm": 0.07953322277805656, + "learning_rate": 1.5607160666106614e-06, + "loss": 0.8682, + "step": 30956 + }, + { + "epoch": 6.603455631399317, + "grad_norm": 0.08772709908042158, + "learning_rate": 1.559043799094635e-06, + "loss": 0.9, + "step": 30957 + }, + { + "epoch": 6.603668941979522, + "grad_norm": 0.07874317944463846, + "learning_rate": 1.5573724191393536e-06, + "loss": 0.8527, + "step": 30958 + }, + { + "epoch": 6.6038822525597265, + "grad_norm": 0.07653045701779025, + "learning_rate": 1.5557019267637175e-06, + "loss": 0.8778, + "step": 30959 + }, + { + "epoch": 6.604095563139932, + "grad_norm": 0.07689936664424189, + "learning_rate": 1.554032321986636e-06, + "loss": 0.863, + "step": 30960 + }, + { + "epoch": 6.604308873720137, + "grad_norm": 0.0746816774716364, + "learning_rate": 1.5523636048270007e-06, + "loss": 0.8625, + "step": 30961 + }, + { + "epoch": 6.604522184300341, + "grad_norm": 0.06902287958846974, + "learning_rate": 1.5506957753036855e-06, + "loss": 0.8592, + "step": 30962 + }, + { + "epoch": 6.604735494880546, + "grad_norm": 0.07268416522799333, + "learning_rate": 1.549028833435564e-06, + "loss": 0.8635, + "step": 30963 + }, + { + "epoch": 6.604948805460751, + "grad_norm": 0.07658309052545512, + "learning_rate": 1.5473627792415102e-06, + "loss": 0.8792, + "step": 30964 + }, + { + "epoch": 6.6051621160409555, + "grad_norm": 0.07546160200953693, + "learning_rate": 1.545697612740371e-06, + "loss": 0.9219, + "step": 30965 + }, + { + "epoch": 6.60537542662116, + "grad_norm": 0.07577006479600801, + "learning_rate": 1.544033333950976e-06, + "loss": 0.8826, + "step": 30966 + }, + { + "epoch": 6.605588737201365, + "grad_norm": 0.08032952657339483, + "learning_rate": 1.5423699428921636e-06, + "loss": 0.8719, + "step": 30967 + }, + { + "epoch": 6.6058020477815695, + "grad_norm": 0.07525863916868078, + "learning_rate": 1.5407074395827537e-06, + "loss": 0.8924, + "step": 30968 + }, + { + "epoch": 6.606015358361775, + "grad_norm": 0.07720877956720809, + "learning_rate": 1.5390458240415584e-06, + "loss": 0.9192, + "step": 30969 + }, + { + "epoch": 6.60622866894198, + "grad_norm": 0.07355656778347237, + "learning_rate": 1.5373850962873805e-06, + "loss": 0.8713, + "step": 30970 + }, + { + "epoch": 6.6064419795221845, + "grad_norm": 0.07557784552062681, + "learning_rate": 1.5357252563390045e-06, + "loss": 0.9047, + "step": 30971 + }, + { + "epoch": 6.606655290102389, + "grad_norm": 0.0774991942447956, + "learning_rate": 1.534066304215225e-06, + "loss": 0.8871, + "step": 30972 + }, + { + "epoch": 6.606868600682594, + "grad_norm": 0.07946305868196316, + "learning_rate": 1.5324082399347994e-06, + "loss": 0.8524, + "step": 30973 + }, + { + "epoch": 6.6070819112627985, + "grad_norm": 0.07355195579118032, + "learning_rate": 1.5307510635165047e-06, + "loss": 0.874, + "step": 30974 + }, + { + "epoch": 6.607295221843003, + "grad_norm": 0.0755443972496165, + "learning_rate": 1.529094774979063e-06, + "loss": 0.8787, + "step": 30975 + }, + { + "epoch": 6.607508532423208, + "grad_norm": 0.07628713846387468, + "learning_rate": 1.5274393743412508e-06, + "loss": 0.8655, + "step": 30976 + }, + { + "epoch": 6.607721843003413, + "grad_norm": 0.07742354483115536, + "learning_rate": 1.5257848616217818e-06, + "loss": 0.8636, + "step": 30977 + }, + { + "epoch": 6.607935153583618, + "grad_norm": 0.07733413096353783, + "learning_rate": 1.5241312368393613e-06, + "loss": 0.8956, + "step": 30978 + }, + { + "epoch": 6.608148464163823, + "grad_norm": 0.08020589577283938, + "learning_rate": 1.5224785000127296e-06, + "loss": 0.9178, + "step": 30979 + }, + { + "epoch": 6.6083617747440275, + "grad_norm": 0.07557594631747047, + "learning_rate": 1.520826651160574e-06, + "loss": 0.906, + "step": 30980 + }, + { + "epoch": 6.608575085324232, + "grad_norm": 0.07700842303403156, + "learning_rate": 1.519175690301582e-06, + "loss": 0.8858, + "step": 30981 + }, + { + "epoch": 6.608788395904437, + "grad_norm": 0.07701515918167964, + "learning_rate": 1.5175256174544316e-06, + "loss": 0.8791, + "step": 30982 + }, + { + "epoch": 6.609001706484642, + "grad_norm": 0.07418871032642978, + "learning_rate": 1.5158764326378016e-06, + "loss": 0.853, + "step": 30983 + }, + { + "epoch": 6.609215017064846, + "grad_norm": 0.08285974358991129, + "learning_rate": 1.5142281358703525e-06, + "loss": 0.8902, + "step": 30984 + }, + { + "epoch": 6.609428327645051, + "grad_norm": 0.07521312130374203, + "learning_rate": 1.512580727170736e-06, + "loss": 0.876, + "step": 30985 + }, + { + "epoch": 6.609641638225256, + "grad_norm": 0.07384611236225144, + "learning_rate": 1.5109342065575861e-06, + "loss": 0.912, + "step": 30986 + }, + { + "epoch": 6.609854948805461, + "grad_norm": 0.07509471135900853, + "learning_rate": 1.5092885740495279e-06, + "loss": 0.8835, + "step": 30987 + }, + { + "epoch": 6.610068259385666, + "grad_norm": 0.072960997797797, + "learning_rate": 1.507643829665204e-06, + "loss": 0.8794, + "step": 30988 + }, + { + "epoch": 6.610281569965871, + "grad_norm": 0.07965416067204245, + "learning_rate": 1.5059999734231955e-06, + "loss": 0.865, + "step": 30989 + }, + { + "epoch": 6.610494880546075, + "grad_norm": 0.07604065274835455, + "learning_rate": 1.5043570053421187e-06, + "loss": 0.9117, + "step": 30990 + }, + { + "epoch": 6.61070819112628, + "grad_norm": 0.07644492343441989, + "learning_rate": 1.5027149254405715e-06, + "loss": 0.9014, + "step": 30991 + }, + { + "epoch": 6.610921501706485, + "grad_norm": 0.07836979592199583, + "learning_rate": 1.5010737337371262e-06, + "loss": 0.8679, + "step": 30992 + }, + { + "epoch": 6.611134812286689, + "grad_norm": 0.07786464721609072, + "learning_rate": 1.4994334302503456e-06, + "loss": 0.8695, + "step": 30993 + }, + { + "epoch": 6.611348122866894, + "grad_norm": 0.07960005949233129, + "learning_rate": 1.4977940149988013e-06, + "loss": 0.8604, + "step": 30994 + }, + { + "epoch": 6.611561433447099, + "grad_norm": 0.07725308784439532, + "learning_rate": 1.4961554880010298e-06, + "loss": 0.8716, + "step": 30995 + }, + { + "epoch": 6.611774744027304, + "grad_norm": 0.07582288809070102, + "learning_rate": 1.4945178492755852e-06, + "loss": 0.8954, + "step": 30996 + }, + { + "epoch": 6.611988054607508, + "grad_norm": 0.07833170631200873, + "learning_rate": 1.4928810988409948e-06, + "loss": 0.9077, + "step": 30997 + }, + { + "epoch": 6.612201365187714, + "grad_norm": 0.07257184777402531, + "learning_rate": 1.4912452367157682e-06, + "loss": 0.8879, + "step": 30998 + }, + { + "epoch": 6.612414675767918, + "grad_norm": 0.07445264511575848, + "learning_rate": 1.4896102629184328e-06, + "loss": 0.8743, + "step": 30999 + }, + { + "epoch": 6.612627986348123, + "grad_norm": 0.0795954383522019, + "learning_rate": 1.4879761774674716e-06, + "loss": 0.8869, + "step": 31000 + }, + { + "epoch": 6.612841296928328, + "grad_norm": 0.07158479057992119, + "learning_rate": 1.4863429803813767e-06, + "loss": 0.8605, + "step": 31001 + }, + { + "epoch": 6.613054607508532, + "grad_norm": 0.07480849168902719, + "learning_rate": 1.4847106716786307e-06, + "loss": 0.8935, + "step": 31002 + }, + { + "epoch": 6.613267918088737, + "grad_norm": 0.0833715897283804, + "learning_rate": 1.4830792513777081e-06, + "loss": 0.8969, + "step": 31003 + }, + { + "epoch": 6.613481228668942, + "grad_norm": 0.07422080846575731, + "learning_rate": 1.4814487194970651e-06, + "loss": 0.8615, + "step": 31004 + }, + { + "epoch": 6.613694539249146, + "grad_norm": 0.07601834194142397, + "learning_rate": 1.479819076055149e-06, + "loss": 0.8796, + "step": 31005 + }, + { + "epoch": 6.613907849829351, + "grad_norm": 0.08203116347556007, + "learning_rate": 1.4781903210704073e-06, + "loss": 0.8863, + "step": 31006 + }, + { + "epoch": 6.614121160409557, + "grad_norm": 0.08182703447607517, + "learning_rate": 1.4765624545612522e-06, + "loss": 0.8829, + "step": 31007 + }, + { + "epoch": 6.614334470989761, + "grad_norm": 0.07525328656056542, + "learning_rate": 1.4749354765461222e-06, + "loss": 0.8755, + "step": 31008 + }, + { + "epoch": 6.614547781569966, + "grad_norm": 0.07557514187245701, + "learning_rate": 1.4733093870434113e-06, + "loss": 0.8747, + "step": 31009 + }, + { + "epoch": 6.614761092150171, + "grad_norm": 0.0744722763138371, + "learning_rate": 1.4716841860715313e-06, + "loss": 0.8648, + "step": 31010 + }, + { + "epoch": 6.614974402730375, + "grad_norm": 0.08755520726730848, + "learning_rate": 1.470059873648868e-06, + "loss": 0.8726, + "step": 31011 + }, + { + "epoch": 6.61518771331058, + "grad_norm": 0.07442215299531861, + "learning_rate": 1.4684364497937886e-06, + "loss": 0.8649, + "step": 31012 + }, + { + "epoch": 6.615401023890785, + "grad_norm": 0.0755078799509699, + "learning_rate": 1.4668139145246784e-06, + "loss": 0.8764, + "step": 31013 + }, + { + "epoch": 6.6156143344709895, + "grad_norm": 0.07539642623041513, + "learning_rate": 1.4651922678598873e-06, + "loss": 0.8745, + "step": 31014 + }, + { + "epoch": 6.615827645051194, + "grad_norm": 0.07791625048290154, + "learning_rate": 1.463571509817765e-06, + "loss": 0.8643, + "step": 31015 + }, + { + "epoch": 6.6160409556314, + "grad_norm": 0.07376254082676818, + "learning_rate": 1.4619516404166522e-06, + "loss": 0.868, + "step": 31016 + }, + { + "epoch": 6.616254266211604, + "grad_norm": 0.07778345390156187, + "learning_rate": 1.4603326596748813e-06, + "loss": 0.894, + "step": 31017 + }, + { + "epoch": 6.616467576791809, + "grad_norm": 0.08193694276311092, + "learning_rate": 1.4587145676107661e-06, + "loss": 0.8725, + "step": 31018 + }, + { + "epoch": 6.616680887372014, + "grad_norm": 0.08510812276279274, + "learning_rate": 1.4570973642426122e-06, + "loss": 0.8831, + "step": 31019 + }, + { + "epoch": 6.6168941979522184, + "grad_norm": 0.07865829999767064, + "learning_rate": 1.455481049588734e-06, + "loss": 0.9049, + "step": 31020 + }, + { + "epoch": 6.617107508532423, + "grad_norm": 0.0751071504239721, + "learning_rate": 1.4538656236674097e-06, + "loss": 0.8907, + "step": 31021 + }, + { + "epoch": 6.617320819112628, + "grad_norm": 0.07166129437707551, + "learning_rate": 1.4522510864969007e-06, + "loss": 0.8698, + "step": 31022 + }, + { + "epoch": 6.6175341296928325, + "grad_norm": 0.07724892216325453, + "learning_rate": 1.4506374380955036e-06, + "loss": 0.9038, + "step": 31023 + }, + { + "epoch": 6.617747440273037, + "grad_norm": 0.07583241976124173, + "learning_rate": 1.449024678481461e-06, + "loss": 0.8781, + "step": 31024 + }, + { + "epoch": 6.617960750853243, + "grad_norm": 0.07741910280095597, + "learning_rate": 1.4474128076730254e-06, + "loss": 0.8726, + "step": 31025 + }, + { + "epoch": 6.618174061433447, + "grad_norm": 0.083618028554586, + "learning_rate": 1.445801825688431e-06, + "loss": 0.8828, + "step": 31026 + }, + { + "epoch": 6.618387372013652, + "grad_norm": 0.0783293670114716, + "learning_rate": 1.444191732545912e-06, + "loss": 0.8727, + "step": 31027 + }, + { + "epoch": 6.618600682593857, + "grad_norm": 0.07309011651376006, + "learning_rate": 1.442582528263694e-06, + "loss": 0.8954, + "step": 31028 + }, + { + "epoch": 6.6188139931740615, + "grad_norm": 0.07511343159608098, + "learning_rate": 1.4409742128599668e-06, + "loss": 0.852, + "step": 31029 + }, + { + "epoch": 6.619027303754266, + "grad_norm": 0.07483408031385705, + "learning_rate": 1.439366786352947e-06, + "loss": 0.8662, + "step": 31030 + }, + { + "epoch": 6.619240614334471, + "grad_norm": 0.07656831507024633, + "learning_rate": 1.4377602487607978e-06, + "loss": 0.8731, + "step": 31031 + }, + { + "epoch": 6.6194539249146755, + "grad_norm": 0.08245657296708905, + "learning_rate": 1.4361546001017356e-06, + "loss": 0.8695, + "step": 31032 + }, + { + "epoch": 6.61966723549488, + "grad_norm": 0.07822450171766723, + "learning_rate": 1.4345498403938885e-06, + "loss": 0.8599, + "step": 31033 + }, + { + "epoch": 6.619880546075086, + "grad_norm": 0.07539240221582173, + "learning_rate": 1.4329459696554282e-06, + "loss": 0.9049, + "step": 31034 + }, + { + "epoch": 6.6200938566552905, + "grad_norm": 0.07433715956968888, + "learning_rate": 1.4313429879045183e-06, + "loss": 0.9052, + "step": 31035 + }, + { + "epoch": 6.620307167235495, + "grad_norm": 0.07500276139870042, + "learning_rate": 1.4297408951592772e-06, + "loss": 0.854, + "step": 31036 + }, + { + "epoch": 6.6205204778157, + "grad_norm": 0.07882264630715656, + "learning_rate": 1.428139691437842e-06, + "loss": 0.8798, + "step": 31037 + }, + { + "epoch": 6.6207337883959045, + "grad_norm": 0.07870870460526673, + "learning_rate": 1.4265393767583225e-06, + "loss": 0.8988, + "step": 31038 + }, + { + "epoch": 6.620947098976109, + "grad_norm": 0.08415864927651484, + "learning_rate": 1.4249399511388373e-06, + "loss": 0.8593, + "step": 31039 + }, + { + "epoch": 6.621160409556314, + "grad_norm": 0.08060578586901361, + "learning_rate": 1.423341414597479e-06, + "loss": 0.8821, + "step": 31040 + }, + { + "epoch": 6.621373720136519, + "grad_norm": 0.08435762808573763, + "learning_rate": 1.4217437671523303e-06, + "loss": 0.9072, + "step": 31041 + }, + { + "epoch": 6.621587030716723, + "grad_norm": 0.07828488541623682, + "learning_rate": 1.4201470088214752e-06, + "loss": 0.8735, + "step": 31042 + }, + { + "epoch": 6.621800341296928, + "grad_norm": 0.07550184278988756, + "learning_rate": 1.418551139622979e-06, + "loss": 0.8589, + "step": 31043 + }, + { + "epoch": 6.622013651877133, + "grad_norm": 0.07704202108538095, + "learning_rate": 1.4169561595748982e-06, + "loss": 0.901, + "step": 31044 + }, + { + "epoch": 6.622226962457338, + "grad_norm": 0.07695366244732707, + "learning_rate": 1.4153620686952807e-06, + "loss": 0.8868, + "step": 31045 + }, + { + "epoch": 6.622440273037543, + "grad_norm": 0.08175723532293099, + "learning_rate": 1.4137688670021565e-06, + "loss": 0.867, + "step": 31046 + }, + { + "epoch": 6.622653583617748, + "grad_norm": 0.08004467085455945, + "learning_rate": 1.4121765545135646e-06, + "loss": 0.8975, + "step": 31047 + }, + { + "epoch": 6.622866894197952, + "grad_norm": 0.07547172691380835, + "learning_rate": 1.4105851312475173e-06, + "loss": 0.8794, + "step": 31048 + }, + { + "epoch": 6.623080204778157, + "grad_norm": 0.07607444304534089, + "learning_rate": 1.4089945972220266e-06, + "loss": 0.8969, + "step": 31049 + }, + { + "epoch": 6.623293515358362, + "grad_norm": 0.07792391406353563, + "learning_rate": 1.4074049524550693e-06, + "loss": 0.9075, + "step": 31050 + }, + { + "epoch": 6.623506825938566, + "grad_norm": 0.07569179670106152, + "learning_rate": 1.4058161969646577e-06, + "loss": 0.8565, + "step": 31051 + }, + { + "epoch": 6.623720136518771, + "grad_norm": 0.08113386448360001, + "learning_rate": 1.4042283307687598e-06, + "loss": 0.8668, + "step": 31052 + }, + { + "epoch": 6.623933447098976, + "grad_norm": 0.07505463761907621, + "learning_rate": 1.4026413538853346e-06, + "loss": 0.8768, + "step": 31053 + }, + { + "epoch": 6.624146757679181, + "grad_norm": 0.0781504108740301, + "learning_rate": 1.4010552663323496e-06, + "loss": 0.8465, + "step": 31054 + }, + { + "epoch": 6.624360068259386, + "grad_norm": 0.07442486566720208, + "learning_rate": 1.3994700681277463e-06, + "loss": 0.8858, + "step": 31055 + }, + { + "epoch": 6.624573378839591, + "grad_norm": 0.07613344942294384, + "learning_rate": 1.3978857592894569e-06, + "loss": 0.86, + "step": 31056 + }, + { + "epoch": 6.624786689419795, + "grad_norm": 0.07456626704227787, + "learning_rate": 1.3963023398354136e-06, + "loss": 0.8899, + "step": 31057 + }, + { + "epoch": 6.625, + "grad_norm": 0.07901026773843858, + "learning_rate": 1.3947198097835223e-06, + "loss": 0.8965, + "step": 31058 + }, + { + "epoch": 6.625213310580205, + "grad_norm": 0.07435832054987286, + "learning_rate": 1.3931381691517065e-06, + "loss": 0.8963, + "step": 31059 + }, + { + "epoch": 6.625426621160409, + "grad_norm": 0.07435037152259287, + "learning_rate": 1.3915574179578627e-06, + "loss": 0.8594, + "step": 31060 + }, + { + "epoch": 6.625639931740614, + "grad_norm": 0.07565278209000656, + "learning_rate": 1.3899775562198613e-06, + "loss": 0.8777, + "step": 31061 + }, + { + "epoch": 6.625853242320819, + "grad_norm": 0.07466090255008424, + "learning_rate": 1.3883985839555815e-06, + "loss": 0.8693, + "step": 31062 + }, + { + "epoch": 6.626066552901024, + "grad_norm": 0.07771942072916949, + "learning_rate": 1.3868205011829017e-06, + "loss": 0.9113, + "step": 31063 + }, + { + "epoch": 6.626279863481229, + "grad_norm": 0.07270844794024293, + "learning_rate": 1.385243307919666e-06, + "loss": 0.8594, + "step": 31064 + }, + { + "epoch": 6.626493174061434, + "grad_norm": 0.07330812129236097, + "learning_rate": 1.3836670041837262e-06, + "loss": 0.8513, + "step": 31065 + }, + { + "epoch": 6.626706484641638, + "grad_norm": 0.07674921958356243, + "learning_rate": 1.3820915899929178e-06, + "loss": 0.8692, + "step": 31066 + }, + { + "epoch": 6.626919795221843, + "grad_norm": 0.07990699640323125, + "learning_rate": 1.3805170653650656e-06, + "loss": 0.8773, + "step": 31067 + }, + { + "epoch": 6.627133105802048, + "grad_norm": 0.07530392690254836, + "learning_rate": 1.3789434303179872e-06, + "loss": 0.8819, + "step": 31068 + }, + { + "epoch": 6.627346416382252, + "grad_norm": 0.07069862038090033, + "learning_rate": 1.3773706848694811e-06, + "loss": 0.8846, + "step": 31069 + }, + { + "epoch": 6.627559726962457, + "grad_norm": 0.07719314204440396, + "learning_rate": 1.3757988290373469e-06, + "loss": 0.8868, + "step": 31070 + }, + { + "epoch": 6.627773037542662, + "grad_norm": 0.0752752787474856, + "learning_rate": 1.3742278628393747e-06, + "loss": 0.8645, + "step": 31071 + }, + { + "epoch": 6.627986348122867, + "grad_norm": 0.07429933236291471, + "learning_rate": 1.3726577862933366e-06, + "loss": 0.885, + "step": 31072 + }, + { + "epoch": 6.628199658703072, + "grad_norm": 0.07549540879050004, + "learning_rate": 1.3710885994169964e-06, + "loss": 0.8598, + "step": 31073 + }, + { + "epoch": 6.628412969283277, + "grad_norm": 0.07288752436666474, + "learning_rate": 1.3695203022281089e-06, + "loss": 0.8865, + "step": 31074 + }, + { + "epoch": 6.628626279863481, + "grad_norm": 0.07784399426265794, + "learning_rate": 1.3679528947444288e-06, + "loss": 0.8939, + "step": 31075 + }, + { + "epoch": 6.628839590443686, + "grad_norm": 0.07526244316013268, + "learning_rate": 1.366386376983675e-06, + "loss": 0.8506, + "step": 31076 + }, + { + "epoch": 6.629052901023891, + "grad_norm": 0.07516320250535272, + "learning_rate": 1.3648207489636022e-06, + "loss": 0.8652, + "step": 31077 + }, + { + "epoch": 6.6292662116040955, + "grad_norm": 0.07764862887660032, + "learning_rate": 1.3632560107018856e-06, + "loss": 0.8997, + "step": 31078 + }, + { + "epoch": 6.6294795221843, + "grad_norm": 0.07328711697189269, + "learning_rate": 1.3616921622162526e-06, + "loss": 0.8702, + "step": 31079 + }, + { + "epoch": 6.629692832764505, + "grad_norm": 0.07738444429151389, + "learning_rate": 1.3601292035244051e-06, + "loss": 0.8942, + "step": 31080 + }, + { + "epoch": 6.6299061433447095, + "grad_norm": 0.07439818159612882, + "learning_rate": 1.3585671346440089e-06, + "loss": 0.8706, + "step": 31081 + }, + { + "epoch": 6.630119453924914, + "grad_norm": 0.07501253063784544, + "learning_rate": 1.3570059555927473e-06, + "loss": 0.856, + "step": 31082 + }, + { + "epoch": 6.63033276450512, + "grad_norm": 0.07885246832428577, + "learning_rate": 1.3554456663882865e-06, + "loss": 0.8745, + "step": 31083 + }, + { + "epoch": 6.6305460750853245, + "grad_norm": 0.0795659796868315, + "learning_rate": 1.3538862670482922e-06, + "loss": 0.8808, + "step": 31084 + }, + { + "epoch": 6.630759385665529, + "grad_norm": 0.08236685254308977, + "learning_rate": 1.352327757590386e-06, + "loss": 0.8819, + "step": 31085 + }, + { + "epoch": 6.630972696245734, + "grad_norm": 0.07598093897536032, + "learning_rate": 1.3507701380322158e-06, + "loss": 0.872, + "step": 31086 + }, + { + "epoch": 6.6311860068259385, + "grad_norm": 0.07845648497121367, + "learning_rate": 1.3492134083914032e-06, + "loss": 0.8873, + "step": 31087 + }, + { + "epoch": 6.631399317406143, + "grad_norm": 0.07800556856161511, + "learning_rate": 1.3476575686855786e-06, + "loss": 0.8801, + "step": 31088 + }, + { + "epoch": 6.631612627986348, + "grad_norm": 0.077357313716931, + "learning_rate": 1.3461026189323189e-06, + "loss": 0.8562, + "step": 31089 + }, + { + "epoch": 6.631825938566553, + "grad_norm": 0.08418989165361337, + "learning_rate": 1.344548559149228e-06, + "loss": 0.8511, + "step": 31090 + }, + { + "epoch": 6.632039249146757, + "grad_norm": 0.07265898925852567, + "learning_rate": 1.3429953893539005e-06, + "loss": 0.8997, + "step": 31091 + }, + { + "epoch": 6.632252559726963, + "grad_norm": 0.07471372224301695, + "learning_rate": 1.3414431095638959e-06, + "loss": 0.9039, + "step": 31092 + }, + { + "epoch": 6.6324658703071675, + "grad_norm": 0.0708527760908149, + "learning_rate": 1.339891719796791e-06, + "loss": 0.9006, + "step": 31093 + }, + { + "epoch": 6.632679180887372, + "grad_norm": 0.07645814633374402, + "learning_rate": 1.3383412200701273e-06, + "loss": 0.8741, + "step": 31094 + }, + { + "epoch": 6.632892491467577, + "grad_norm": 0.081510698504324, + "learning_rate": 1.3367916104014645e-06, + "loss": 0.8701, + "step": 31095 + }, + { + "epoch": 6.6331058020477816, + "grad_norm": 0.08309833788761221, + "learning_rate": 1.335242890808326e-06, + "loss": 0.8811, + "step": 31096 + }, + { + "epoch": 6.633319112627986, + "grad_norm": 0.08302811581751761, + "learning_rate": 1.3336950613082355e-06, + "loss": 0.8789, + "step": 31097 + }, + { + "epoch": 6.633532423208191, + "grad_norm": 0.07759448184684788, + "learning_rate": 1.3321481219187083e-06, + "loss": 0.8889, + "step": 31098 + }, + { + "epoch": 6.633745733788396, + "grad_norm": 0.07845863403539943, + "learning_rate": 1.3306020726572587e-06, + "loss": 0.8708, + "step": 31099 + }, + { + "epoch": 6.6339590443686, + "grad_norm": 0.07328437571833912, + "learning_rate": 1.3290569135413578e-06, + "loss": 0.8758, + "step": 31100 + }, + { + "epoch": 6.634172354948806, + "grad_norm": 0.08120692619226755, + "learning_rate": 1.3275126445885023e-06, + "loss": 0.8967, + "step": 31101 + }, + { + "epoch": 6.6343856655290105, + "grad_norm": 0.08151457021900019, + "learning_rate": 1.3259692658161627e-06, + "loss": 0.8838, + "step": 31102 + }, + { + "epoch": 6.634598976109215, + "grad_norm": 0.07433536654190015, + "learning_rate": 1.3244267772418095e-06, + "loss": 0.8423, + "step": 31103 + }, + { + "epoch": 6.63481228668942, + "grad_norm": 0.07746830970736224, + "learning_rate": 1.3228851788828867e-06, + "loss": 0.8898, + "step": 31104 + }, + { + "epoch": 6.635025597269625, + "grad_norm": 0.07453688138567373, + "learning_rate": 1.3213444707568467e-06, + "loss": 0.8693, + "step": 31105 + }, + { + "epoch": 6.635238907849829, + "grad_norm": 0.0823006319346692, + "learning_rate": 1.319804652881107e-06, + "loss": 0.8768, + "step": 31106 + }, + { + "epoch": 6.635452218430034, + "grad_norm": 0.08401994916467487, + "learning_rate": 1.3182657252731112e-06, + "loss": 0.892, + "step": 31107 + }, + { + "epoch": 6.635665529010239, + "grad_norm": 0.07208587235986012, + "learning_rate": 1.316727687950259e-06, + "loss": 0.9001, + "step": 31108 + }, + { + "epoch": 6.635878839590443, + "grad_norm": 0.07562998228668096, + "learning_rate": 1.3151905409299582e-06, + "loss": 0.8677, + "step": 31109 + }, + { + "epoch": 6.636092150170649, + "grad_norm": 0.07305380289253223, + "learning_rate": 1.3136542842295997e-06, + "loss": 0.8696, + "step": 31110 + }, + { + "epoch": 6.636305460750854, + "grad_norm": 0.08096944438264189, + "learning_rate": 1.312118917866565e-06, + "loss": 0.8924, + "step": 31111 + }, + { + "epoch": 6.636518771331058, + "grad_norm": 0.08176568210323407, + "learning_rate": 1.3105844418582358e-06, + "loss": 0.8789, + "step": 31112 + }, + { + "epoch": 6.636732081911263, + "grad_norm": 0.07234106426670622, + "learning_rate": 1.3090508562219583e-06, + "loss": 0.8717, + "step": 31113 + }, + { + "epoch": 6.636945392491468, + "grad_norm": 0.07766909645823508, + "learning_rate": 1.3075181609750876e-06, + "loss": 0.8724, + "step": 31114 + }, + { + "epoch": 6.637158703071672, + "grad_norm": 0.07685991683156405, + "learning_rate": 1.3059863561349873e-06, + "loss": 0.8433, + "step": 31115 + }, + { + "epoch": 6.637372013651877, + "grad_norm": 0.07755935045159633, + "learning_rate": 1.3044554417189681e-06, + "loss": 0.8745, + "step": 31116 + }, + { + "epoch": 6.637585324232082, + "grad_norm": 0.07139062027477298, + "learning_rate": 1.3029254177443584e-06, + "loss": 0.8786, + "step": 31117 + }, + { + "epoch": 6.637798634812286, + "grad_norm": 0.07626272892664523, + "learning_rate": 1.301396284228469e-06, + "loss": 0.9069, + "step": 31118 + }, + { + "epoch": 6.638011945392492, + "grad_norm": 0.07449510135036816, + "learning_rate": 1.29986804118861e-06, + "loss": 0.8592, + "step": 31119 + }, + { + "epoch": 6.638225255972696, + "grad_norm": 0.07865619815070668, + "learning_rate": 1.2983406886420657e-06, + "loss": 0.8784, + "step": 31120 + }, + { + "epoch": 6.638438566552901, + "grad_norm": 0.07285305456694054, + "learning_rate": 1.29681422660612e-06, + "loss": 0.8969, + "step": 31121 + }, + { + "epoch": 6.638651877133106, + "grad_norm": 0.07524195242011217, + "learning_rate": 1.295288655098048e-06, + "loss": 0.9025, + "step": 31122 + }, + { + "epoch": 6.638865187713311, + "grad_norm": 0.08144479400265744, + "learning_rate": 1.293763974135107e-06, + "loss": 0.8852, + "step": 31123 + }, + { + "epoch": 6.639078498293515, + "grad_norm": 0.08219663583401596, + "learning_rate": 1.2922401837345544e-06, + "loss": 0.8685, + "step": 31124 + }, + { + "epoch": 6.63929180887372, + "grad_norm": 0.07861412955084014, + "learning_rate": 1.2907172839136206e-06, + "loss": 0.8615, + "step": 31125 + }, + { + "epoch": 6.639505119453925, + "grad_norm": 0.07605328410497852, + "learning_rate": 1.2891952746895453e-06, + "loss": 0.897, + "step": 31126 + }, + { + "epoch": 6.639718430034129, + "grad_norm": 0.07339351084787231, + "learning_rate": 1.2876741560795502e-06, + "loss": 0.8789, + "step": 31127 + }, + { + "epoch": 6.639931740614334, + "grad_norm": 0.07208962729692285, + "learning_rate": 1.2861539281008484e-06, + "loss": 0.9005, + "step": 31128 + }, + { + "epoch": 6.640145051194539, + "grad_norm": 0.07863369334218054, + "learning_rate": 1.2846345907706349e-06, + "loss": 0.8759, + "step": 31129 + }, + { + "epoch": 6.640358361774744, + "grad_norm": 0.0767313713352374, + "learning_rate": 1.2831161441061046e-06, + "loss": 0.8723, + "step": 31130 + }, + { + "epoch": 6.640571672354949, + "grad_norm": 0.08756714801641892, + "learning_rate": 1.281598588124444e-06, + "loss": 0.8854, + "step": 31131 + }, + { + "epoch": 6.640784982935154, + "grad_norm": 0.07634449090546067, + "learning_rate": 1.2800819228428218e-06, + "loss": 0.8496, + "step": 31132 + }, + { + "epoch": 6.640998293515358, + "grad_norm": 0.07667021884847591, + "learning_rate": 1.2785661482783972e-06, + "loss": 0.8494, + "step": 31133 + }, + { + "epoch": 6.641211604095563, + "grad_norm": 0.07170051410088323, + "learning_rate": 1.2770512644483124e-06, + "loss": 0.8436, + "step": 31134 + }, + { + "epoch": 6.641424914675768, + "grad_norm": 0.07415131677413853, + "learning_rate": 1.275537271369718e-06, + "loss": 0.8989, + "step": 31135 + }, + { + "epoch": 6.6416382252559725, + "grad_norm": 0.08129092462528388, + "learning_rate": 1.2740241690597466e-06, + "loss": 0.8857, + "step": 31136 + }, + { + "epoch": 6.641851535836177, + "grad_norm": 0.10467971492523541, + "learning_rate": 1.2725119575355228e-06, + "loss": 0.8863, + "step": 31137 + }, + { + "epoch": 6.642064846416382, + "grad_norm": 0.0751658336333223, + "learning_rate": 1.2710006368141348e-06, + "loss": 0.8861, + "step": 31138 + }, + { + "epoch": 6.642278156996587, + "grad_norm": 0.07617559137347765, + "learning_rate": 1.269490206912707e-06, + "loss": 0.8687, + "step": 31139 + }, + { + "epoch": 6.642491467576792, + "grad_norm": 0.07263436920286984, + "learning_rate": 1.2679806678483275e-06, + "loss": 0.886, + "step": 31140 + }, + { + "epoch": 6.642704778156997, + "grad_norm": 0.08190876363971729, + "learning_rate": 1.2664720196380675e-06, + "loss": 0.9047, + "step": 31141 + }, + { + "epoch": 6.6429180887372015, + "grad_norm": 0.08097736278768958, + "learning_rate": 1.2649642622989888e-06, + "loss": 0.8905, + "step": 31142 + }, + { + "epoch": 6.643131399317406, + "grad_norm": 0.07431374342069294, + "learning_rate": 1.26345739584818e-06, + "loss": 0.8938, + "step": 31143 + }, + { + "epoch": 6.643344709897611, + "grad_norm": 0.07014194290420633, + "learning_rate": 1.2619514203026761e-06, + "loss": 0.8792, + "step": 31144 + }, + { + "epoch": 6.6435580204778155, + "grad_norm": 0.07366077653997391, + "learning_rate": 1.2604463356795039e-06, + "loss": 0.8816, + "step": 31145 + }, + { + "epoch": 6.64377133105802, + "grad_norm": 0.07677313841124356, + "learning_rate": 1.2589421419957071e-06, + "loss": 0.9126, + "step": 31146 + }, + { + "epoch": 6.643984641638225, + "grad_norm": 0.07484684716858611, + "learning_rate": 1.2574388392683123e-06, + "loss": 0.8598, + "step": 31147 + }, + { + "epoch": 6.6441979522184305, + "grad_norm": 0.07615350525374058, + "learning_rate": 1.2559364275143193e-06, + "loss": 0.8896, + "step": 31148 + }, + { + "epoch": 6.644411262798635, + "grad_norm": 0.07618484313729723, + "learning_rate": 1.2544349067507278e-06, + "loss": 0.8699, + "step": 31149 + }, + { + "epoch": 6.64462457337884, + "grad_norm": 0.07273619724195547, + "learning_rate": 1.2529342769945284e-06, + "loss": 0.8751, + "step": 31150 + }, + { + "epoch": 6.6448378839590445, + "grad_norm": 0.07471665619828205, + "learning_rate": 1.2514345382627036e-06, + "loss": 0.8959, + "step": 31151 + }, + { + "epoch": 6.645051194539249, + "grad_norm": 0.07992337487317792, + "learning_rate": 1.249935690572226e-06, + "loss": 0.8641, + "step": 31152 + }, + { + "epoch": 6.645264505119454, + "grad_norm": 0.07411008675123958, + "learning_rate": 1.2484377339400422e-06, + "loss": 0.8877, + "step": 31153 + }, + { + "epoch": 6.645477815699659, + "grad_norm": 0.07146710869033047, + "learning_rate": 1.2469406683831164e-06, + "loss": 0.8691, + "step": 31154 + }, + { + "epoch": 6.645691126279863, + "grad_norm": 0.07621786222490745, + "learning_rate": 1.2454444939183863e-06, + "loss": 0.8938, + "step": 31155 + }, + { + "epoch": 6.645904436860068, + "grad_norm": 0.08122307587169292, + "learning_rate": 1.2439492105627715e-06, + "loss": 0.9117, + "step": 31156 + }, + { + "epoch": 6.6461177474402735, + "grad_norm": 0.0749276896050774, + "learning_rate": 1.242454818333192e-06, + "loss": 0.8887, + "step": 31157 + }, + { + "epoch": 6.646331058020478, + "grad_norm": 0.07560008780966208, + "learning_rate": 1.2409613172465584e-06, + "loss": 0.8989, + "step": 31158 + }, + { + "epoch": 6.646544368600683, + "grad_norm": 0.0716019865824012, + "learning_rate": 1.2394687073197819e-06, + "loss": 0.8815, + "step": 31159 + }, + { + "epoch": 6.646757679180888, + "grad_norm": 0.07866944648577995, + "learning_rate": 1.2379769885697379e-06, + "loss": 0.8875, + "step": 31160 + }, + { + "epoch": 6.646970989761092, + "grad_norm": 0.07599892449456806, + "learning_rate": 1.2364861610133105e-06, + "loss": 0.8747, + "step": 31161 + }, + { + "epoch": 6.647184300341297, + "grad_norm": 0.0741128700355124, + "learning_rate": 1.2349962246673575e-06, + "loss": 0.8874, + "step": 31162 + }, + { + "epoch": 6.647397610921502, + "grad_norm": 0.07789308774661287, + "learning_rate": 1.2335071795487541e-06, + "loss": 0.8824, + "step": 31163 + }, + { + "epoch": 6.647610921501706, + "grad_norm": 0.08086800211823657, + "learning_rate": 1.232019025674349e-06, + "loss": 0.9, + "step": 31164 + }, + { + "epoch": 6.647824232081911, + "grad_norm": 0.07686584989570353, + "learning_rate": 1.2305317630609648e-06, + "loss": 0.8732, + "step": 31165 + }, + { + "epoch": 6.648037542662116, + "grad_norm": 0.07788099028869108, + "learning_rate": 1.2290453917254408e-06, + "loss": 0.9221, + "step": 31166 + }, + { + "epoch": 6.64825085324232, + "grad_norm": 0.07358123458431545, + "learning_rate": 1.2275599116845994e-06, + "loss": 0.8701, + "step": 31167 + }, + { + "epoch": 6.648464163822526, + "grad_norm": 0.07903925885320034, + "learning_rate": 1.2260753229552357e-06, + "loss": 0.9116, + "step": 31168 + }, + { + "epoch": 6.648677474402731, + "grad_norm": 0.07386570232810172, + "learning_rate": 1.2245916255541545e-06, + "loss": 0.9095, + "step": 31169 + }, + { + "epoch": 6.648890784982935, + "grad_norm": 0.07295200172819158, + "learning_rate": 1.2231088194981423e-06, + "loss": 0.8833, + "step": 31170 + }, + { + "epoch": 6.64910409556314, + "grad_norm": 0.07963439060017301, + "learning_rate": 1.2216269048039764e-06, + "loss": 0.8892, + "step": 31171 + }, + { + "epoch": 6.649317406143345, + "grad_norm": 0.07645307742817485, + "learning_rate": 1.2201458814884348e-06, + "loss": 0.862, + "step": 31172 + }, + { + "epoch": 6.649530716723549, + "grad_norm": 0.07534424855412512, + "learning_rate": 1.2186657495682685e-06, + "loss": 0.9013, + "step": 31173 + }, + { + "epoch": 6.649744027303754, + "grad_norm": 0.07357315287796422, + "learning_rate": 1.2171865090602196e-06, + "loss": 0.8678, + "step": 31174 + }, + { + "epoch": 6.649957337883959, + "grad_norm": 0.0802971031101339, + "learning_rate": 1.2157081599810305e-06, + "loss": 0.854, + "step": 31175 + }, + { + "epoch": 6.650170648464163, + "grad_norm": 0.07965565363095314, + "learning_rate": 1.2142307023474253e-06, + "loss": 0.8916, + "step": 31176 + }, + { + "epoch": 6.650383959044369, + "grad_norm": 0.07728805081722265, + "learning_rate": 1.2127541361761375e-06, + "loss": 0.8806, + "step": 31177 + }, + { + "epoch": 6.650597269624574, + "grad_norm": 0.07538238307644264, + "learning_rate": 1.2112784614838558e-06, + "loss": 0.8563, + "step": 31178 + }, + { + "epoch": 6.650810580204778, + "grad_norm": 0.07520865223338594, + "learning_rate": 1.209803678287278e-06, + "loss": 0.8464, + "step": 31179 + }, + { + "epoch": 6.651023890784983, + "grad_norm": 0.07430466366097455, + "learning_rate": 1.2083297866031018e-06, + "loss": 0.9009, + "step": 31180 + }, + { + "epoch": 6.651237201365188, + "grad_norm": 0.07448151639637814, + "learning_rate": 1.2068567864479985e-06, + "loss": 0.8813, + "step": 31181 + }, + { + "epoch": 6.651450511945392, + "grad_norm": 0.08574602558642413, + "learning_rate": 1.2053846778386303e-06, + "loss": 0.8974, + "step": 31182 + }, + { + "epoch": 6.651663822525597, + "grad_norm": 0.07518344743365651, + "learning_rate": 1.203913460791668e-06, + "loss": 0.905, + "step": 31183 + }, + { + "epoch": 6.651877133105802, + "grad_norm": 0.0725795557747969, + "learning_rate": 1.2024431353237476e-06, + "loss": 0.8818, + "step": 31184 + }, + { + "epoch": 6.652090443686006, + "grad_norm": 0.07704066461328732, + "learning_rate": 1.2009737014515133e-06, + "loss": 0.8999, + "step": 31185 + }, + { + "epoch": 6.652303754266212, + "grad_norm": 0.07424005183093965, + "learning_rate": 1.199505159191574e-06, + "loss": 0.89, + "step": 31186 + }, + { + "epoch": 6.652517064846417, + "grad_norm": 0.08272867545535137, + "learning_rate": 1.1980375085605744e-06, + "loss": 0.902, + "step": 31187 + }, + { + "epoch": 6.652730375426621, + "grad_norm": 0.07434068699018342, + "learning_rate": 1.1965707495750966e-06, + "loss": 0.8743, + "step": 31188 + }, + { + "epoch": 6.652943686006826, + "grad_norm": 0.06809082061388161, + "learning_rate": 1.1951048822517498e-06, + "loss": 0.8424, + "step": 31189 + }, + { + "epoch": 6.653156996587031, + "grad_norm": 0.07476375423324193, + "learning_rate": 1.1936399066071158e-06, + "loss": 0.8485, + "step": 31190 + }, + { + "epoch": 6.653370307167235, + "grad_norm": 0.07353812776027345, + "learning_rate": 1.1921758226577685e-06, + "loss": 0.8528, + "step": 31191 + }, + { + "epoch": 6.65358361774744, + "grad_norm": 0.06996371602369329, + "learning_rate": 1.1907126304202899e-06, + "loss": 0.8776, + "step": 31192 + }, + { + "epoch": 6.653796928327645, + "grad_norm": 0.07791727719823745, + "learning_rate": 1.189250329911209e-06, + "loss": 0.8643, + "step": 31193 + }, + { + "epoch": 6.6540102389078495, + "grad_norm": 0.07652199898558139, + "learning_rate": 1.1877889211470905e-06, + "loss": 0.8484, + "step": 31194 + }, + { + "epoch": 6.654223549488055, + "grad_norm": 0.07770045768805445, + "learning_rate": 1.1863284041444635e-06, + "loss": 0.8911, + "step": 31195 + }, + { + "epoch": 6.65443686006826, + "grad_norm": 0.07381031874581603, + "learning_rate": 1.1848687789198654e-06, + "loss": 0.8971, + "step": 31196 + }, + { + "epoch": 6.654650170648464, + "grad_norm": 0.07019486091025602, + "learning_rate": 1.183410045489799e-06, + "loss": 0.868, + "step": 31197 + }, + { + "epoch": 6.654863481228669, + "grad_norm": 0.0789669518914389, + "learning_rate": 1.1819522038707664e-06, + "loss": 0.9141, + "step": 31198 + }, + { + "epoch": 6.655076791808874, + "grad_norm": 0.07881557005565167, + "learning_rate": 1.1804952540792703e-06, + "loss": 0.8786, + "step": 31199 + }, + { + "epoch": 6.6552901023890785, + "grad_norm": 0.07911312943561583, + "learning_rate": 1.1790391961318127e-06, + "loss": 0.8913, + "step": 31200 + }, + { + "epoch": 6.655503412969283, + "grad_norm": 0.07342101011902906, + "learning_rate": 1.1775840300448337e-06, + "loss": 0.8896, + "step": 31201 + }, + { + "epoch": 6.655716723549488, + "grad_norm": 0.07343954943192274, + "learning_rate": 1.1761297558348273e-06, + "loss": 0.8577, + "step": 31202 + }, + { + "epoch": 6.6559300341296925, + "grad_norm": 0.07704805979188853, + "learning_rate": 1.174676373518233e-06, + "loss": 0.8761, + "step": 31203 + }, + { + "epoch": 6.656143344709898, + "grad_norm": 0.08096949368854665, + "learning_rate": 1.1732238831115095e-06, + "loss": 0.8671, + "step": 31204 + }, + { + "epoch": 6.656356655290102, + "grad_norm": 0.07502445902734398, + "learning_rate": 1.1717722846310787e-06, + "loss": 0.9057, + "step": 31205 + }, + { + "epoch": 6.6565699658703075, + "grad_norm": 0.07979206637462224, + "learning_rate": 1.170321578093363e-06, + "loss": 0.8906, + "step": 31206 + }, + { + "epoch": 6.656783276450512, + "grad_norm": 0.07173954318238082, + "learning_rate": 1.168871763514794e-06, + "loss": 0.8813, + "step": 31207 + }, + { + "epoch": 6.656996587030717, + "grad_norm": 0.07665709273622487, + "learning_rate": 1.1674228409117671e-06, + "loss": 0.8768, + "step": 31208 + }, + { + "epoch": 6.6572098976109215, + "grad_norm": 0.07147967505117185, + "learning_rate": 1.1659748103006696e-06, + "loss": 0.8821, + "step": 31209 + }, + { + "epoch": 6.657423208191126, + "grad_norm": 0.07346292826846096, + "learning_rate": 1.164527671697897e-06, + "loss": 0.8916, + "step": 31210 + }, + { + "epoch": 6.657636518771331, + "grad_norm": 0.0751577078363829, + "learning_rate": 1.1630814251198274e-06, + "loss": 0.9076, + "step": 31211 + }, + { + "epoch": 6.657849829351536, + "grad_norm": 0.07237387791808267, + "learning_rate": 1.1616360705828122e-06, + "loss": 0.8647, + "step": 31212 + }, + { + "epoch": 6.65806313993174, + "grad_norm": 0.07348350639554237, + "learning_rate": 1.1601916081032027e-06, + "loss": 0.879, + "step": 31213 + }, + { + "epoch": 6.658276450511945, + "grad_norm": 0.07474474671464852, + "learning_rate": 1.1587480376973592e-06, + "loss": 0.868, + "step": 31214 + }, + { + "epoch": 6.6584897610921505, + "grad_norm": 0.0766683595672228, + "learning_rate": 1.1573053593816063e-06, + "loss": 0.8718, + "step": 31215 + }, + { + "epoch": 6.658703071672355, + "grad_norm": 0.07703471077010816, + "learning_rate": 1.1558635731722777e-06, + "loss": 0.9032, + "step": 31216 + }, + { + "epoch": 6.65891638225256, + "grad_norm": 0.07644586658224575, + "learning_rate": 1.1544226790856716e-06, + "loss": 0.8637, + "step": 31217 + }, + { + "epoch": 6.659129692832765, + "grad_norm": 0.06948780857219239, + "learning_rate": 1.1529826771380858e-06, + "loss": 0.8653, + "step": 31218 + }, + { + "epoch": 6.659343003412969, + "grad_norm": 0.07171727879905568, + "learning_rate": 1.1515435673458453e-06, + "loss": 0.8611, + "step": 31219 + }, + { + "epoch": 6.659556313993174, + "grad_norm": 0.0764990763031951, + "learning_rate": 1.1501053497252123e-06, + "loss": 0.8944, + "step": 31220 + }, + { + "epoch": 6.659769624573379, + "grad_norm": 0.07400734441281905, + "learning_rate": 1.1486680242924586e-06, + "loss": 0.8612, + "step": 31221 + }, + { + "epoch": 6.659982935153583, + "grad_norm": 0.0730423309686653, + "learning_rate": 1.1472315910638554e-06, + "loss": 0.8682, + "step": 31222 + }, + { + "epoch": 6.660196245733788, + "grad_norm": 0.07921343586597679, + "learning_rate": 1.1457960500556564e-06, + "loss": 0.8905, + "step": 31223 + }, + { + "epoch": 6.660409556313994, + "grad_norm": 0.07541400194658965, + "learning_rate": 1.1443614012840977e-06, + "loss": 0.9099, + "step": 31224 + }, + { + "epoch": 6.660622866894198, + "grad_norm": 0.07679482413174202, + "learning_rate": 1.1429276447654148e-06, + "loss": 0.9184, + "step": 31225 + }, + { + "epoch": 6.660836177474403, + "grad_norm": 0.07664375506217243, + "learning_rate": 1.141494780515835e-06, + "loss": 0.8929, + "step": 31226 + }, + { + "epoch": 6.661049488054608, + "grad_norm": 0.07103176925826145, + "learning_rate": 1.1400628085515674e-06, + "loss": 0.8674, + "step": 31227 + }, + { + "epoch": 6.661262798634812, + "grad_norm": 0.07546167513421102, + "learning_rate": 1.1386317288888127e-06, + "loss": 0.8765, + "step": 31228 + }, + { + "epoch": 6.661476109215017, + "grad_norm": 0.07363396228360955, + "learning_rate": 1.1372015415437709e-06, + "loss": 0.8867, + "step": 31229 + }, + { + "epoch": 6.661689419795222, + "grad_norm": 0.07433805155763064, + "learning_rate": 1.135772246532616e-06, + "loss": 0.8635, + "step": 31230 + }, + { + "epoch": 6.661902730375426, + "grad_norm": 0.0701554033629122, + "learning_rate": 1.1343438438715304e-06, + "loss": 0.8734, + "step": 31231 + }, + { + "epoch": 6.662116040955631, + "grad_norm": 0.07451873594168673, + "learning_rate": 1.1329163335766702e-06, + "loss": 0.8835, + "step": 31232 + }, + { + "epoch": 6.662329351535837, + "grad_norm": 0.07427292721190275, + "learning_rate": 1.1314897156641824e-06, + "loss": 0.8996, + "step": 31233 + }, + { + "epoch": 6.662542662116041, + "grad_norm": 0.07537598321157483, + "learning_rate": 1.1300639901502141e-06, + "loss": 0.878, + "step": 31234 + }, + { + "epoch": 6.662755972696246, + "grad_norm": 0.0783941745988473, + "learning_rate": 1.1286391570509036e-06, + "loss": 0.8969, + "step": 31235 + }, + { + "epoch": 6.662969283276451, + "grad_norm": 0.07853459015431852, + "learning_rate": 1.1272152163823713e-06, + "loss": 0.8789, + "step": 31236 + }, + { + "epoch": 6.663182593856655, + "grad_norm": 0.08074900618103835, + "learning_rate": 1.1257921681607197e-06, + "loss": 0.8611, + "step": 31237 + }, + { + "epoch": 6.66339590443686, + "grad_norm": 0.07317320917041897, + "learning_rate": 1.1243700124020519e-06, + "loss": 0.8785, + "step": 31238 + }, + { + "epoch": 6.663609215017065, + "grad_norm": 0.07179308034592587, + "learning_rate": 1.12294874912247e-06, + "loss": 0.8629, + "step": 31239 + }, + { + "epoch": 6.663822525597269, + "grad_norm": 0.0753121034475193, + "learning_rate": 1.1215283783380504e-06, + "loss": 0.8752, + "step": 31240 + }, + { + "epoch": 6.664035836177474, + "grad_norm": 0.07559424237964811, + "learning_rate": 1.1201089000648691e-06, + "loss": 0.8396, + "step": 31241 + }, + { + "epoch": 6.66424914675768, + "grad_norm": 0.07399759117483529, + "learning_rate": 1.1186903143189666e-06, + "loss": 0.8472, + "step": 31242 + }, + { + "epoch": 6.664462457337884, + "grad_norm": 0.07267023961266615, + "learning_rate": 1.1172726211164187e-06, + "loss": 0.8604, + "step": 31243 + }, + { + "epoch": 6.664675767918089, + "grad_norm": 0.07855814283799321, + "learning_rate": 1.1158558204732661e-06, + "loss": 0.8756, + "step": 31244 + }, + { + "epoch": 6.664889078498294, + "grad_norm": 0.07245139597466164, + "learning_rate": 1.1144399124055228e-06, + "loss": 0.8562, + "step": 31245 + }, + { + "epoch": 6.665102389078498, + "grad_norm": 0.06997589693701523, + "learning_rate": 1.1130248969292202e-06, + "loss": 0.8838, + "step": 31246 + }, + { + "epoch": 6.665315699658703, + "grad_norm": 0.07370568248617729, + "learning_rate": 1.111610774060372e-06, + "loss": 0.8805, + "step": 31247 + }, + { + "epoch": 6.665529010238908, + "grad_norm": 0.07467150028779422, + "learning_rate": 1.1101975438149658e-06, + "loss": 0.8842, + "step": 31248 + }, + { + "epoch": 6.665742320819112, + "grad_norm": 0.07475907404565375, + "learning_rate": 1.1087852062090066e-06, + "loss": 0.8882, + "step": 31249 + }, + { + "epoch": 6.665955631399317, + "grad_norm": 0.0736605867223896, + "learning_rate": 1.1073737612584634e-06, + "loss": 0.8664, + "step": 31250 + }, + { + "epoch": 6.666168941979522, + "grad_norm": 0.07384123483915286, + "learning_rate": 1.1059632089793236e-06, + "loss": 0.8817, + "step": 31251 + }, + { + "epoch": 6.6663822525597265, + "grad_norm": 0.07597557957203935, + "learning_rate": 1.1045535493875304e-06, + "loss": 0.8836, + "step": 31252 + }, + { + "epoch": 6.666595563139932, + "grad_norm": 0.07257264866940268, + "learning_rate": 1.1031447824990438e-06, + "loss": 0.8775, + "step": 31253 + }, + { + "epoch": 6.666808873720137, + "grad_norm": 0.07557119947565055, + "learning_rate": 1.101736908329798e-06, + "loss": 0.9295, + "step": 31254 + }, + { + "epoch": 6.667022184300341, + "grad_norm": 0.07292584327789552, + "learning_rate": 1.1003299268957268e-06, + "loss": 0.8828, + "step": 31255 + }, + { + "epoch": 6.667235494880546, + "grad_norm": 0.07557514793724185, + "learning_rate": 1.0989238382127643e-06, + "loss": 0.8976, + "step": 31256 + }, + { + "epoch": 6.667448805460751, + "grad_norm": 0.07711882545518409, + "learning_rate": 1.097518642296782e-06, + "loss": 0.8793, + "step": 31257 + }, + { + "epoch": 6.6676621160409555, + "grad_norm": 0.07556348655588113, + "learning_rate": 1.0961143391637141e-06, + "loss": 0.8869, + "step": 31258 + }, + { + "epoch": 6.66787542662116, + "grad_norm": 0.08417843854336912, + "learning_rate": 1.094710928829441e-06, + "loss": 0.8988, + "step": 31259 + }, + { + "epoch": 6.668088737201365, + "grad_norm": 0.0713034338569739, + "learning_rate": 1.0933084113098435e-06, + "loss": 0.8764, + "step": 31260 + }, + { + "epoch": 6.6683020477815695, + "grad_norm": 0.08032069802389134, + "learning_rate": 1.0919067866207755e-06, + "loss": 0.8815, + "step": 31261 + }, + { + "epoch": 6.668515358361775, + "grad_norm": 0.0799214822910012, + "learning_rate": 1.0905060547781265e-06, + "loss": 0.8964, + "step": 31262 + }, + { + "epoch": 6.66872866894198, + "grad_norm": 0.07526721860471083, + "learning_rate": 1.089106215797715e-06, + "loss": 0.8536, + "step": 31263 + }, + { + "epoch": 6.6689419795221845, + "grad_norm": 0.0716443671442427, + "learning_rate": 1.0877072696954039e-06, + "loss": 0.8635, + "step": 31264 + }, + { + "epoch": 6.669155290102389, + "grad_norm": 0.07982332655265258, + "learning_rate": 1.0863092164870114e-06, + "loss": 0.8672, + "step": 31265 + }, + { + "epoch": 6.669368600682594, + "grad_norm": 0.07649317963840559, + "learning_rate": 1.0849120561883474e-06, + "loss": 0.8874, + "step": 31266 + }, + { + "epoch": 6.6695819112627985, + "grad_norm": 0.08000922678220425, + "learning_rate": 1.083515788815248e-06, + "loss": 0.889, + "step": 31267 + }, + { + "epoch": 6.669795221843003, + "grad_norm": 0.07628220299109298, + "learning_rate": 1.0821204143834873e-06, + "loss": 0.88, + "step": 31268 + }, + { + "epoch": 6.670008532423208, + "grad_norm": 0.08089428962280246, + "learning_rate": 1.0807259329088483e-06, + "loss": 0.8403, + "step": 31269 + }, + { + "epoch": 6.670221843003413, + "grad_norm": 0.074037463503366, + "learning_rate": 1.0793323444071401e-06, + "loss": 0.8701, + "step": 31270 + }, + { + "epoch": 6.670435153583618, + "grad_norm": 0.07967116124115148, + "learning_rate": 1.0779396488941018e-06, + "loss": 0.8914, + "step": 31271 + }, + { + "epoch": 6.670648464163823, + "grad_norm": 0.07018010798608307, + "learning_rate": 1.0765478463855162e-06, + "loss": 0.8597, + "step": 31272 + }, + { + "epoch": 6.6708617747440275, + "grad_norm": 0.07424208139818574, + "learning_rate": 1.0751569368971038e-06, + "loss": 0.9159, + "step": 31273 + }, + { + "epoch": 6.671075085324232, + "grad_norm": 0.07824044232104906, + "learning_rate": 1.0737669204446299e-06, + "loss": 0.9212, + "step": 31274 + }, + { + "epoch": 6.671288395904437, + "grad_norm": 0.07406287299504671, + "learning_rate": 1.0723777970438066e-06, + "loss": 0.8674, + "step": 31275 + }, + { + "epoch": 6.671501706484642, + "grad_norm": 0.07708792317913508, + "learning_rate": 1.0709895667103543e-06, + "loss": 0.8923, + "step": 31276 + }, + { + "epoch": 6.671715017064846, + "grad_norm": 0.07267686232089594, + "learning_rate": 1.069602229459994e-06, + "loss": 0.8459, + "step": 31277 + }, + { + "epoch": 6.671928327645051, + "grad_norm": 0.07757559830831191, + "learning_rate": 1.068215785308402e-06, + "loss": 0.8667, + "step": 31278 + }, + { + "epoch": 6.672141638225256, + "grad_norm": 0.07113211335407539, + "learning_rate": 1.066830234271281e-06, + "loss": 0.8643, + "step": 31279 + }, + { + "epoch": 6.672354948805461, + "grad_norm": 0.07214114338979177, + "learning_rate": 1.065445576364299e-06, + "loss": 0.8752, + "step": 31280 + }, + { + "epoch": 6.672568259385666, + "grad_norm": 0.07613104385639576, + "learning_rate": 1.0640618116031232e-06, + "loss": 0.882, + "step": 31281 + }, + { + "epoch": 6.672781569965871, + "grad_norm": 0.07501755874312849, + "learning_rate": 1.062678940003421e-06, + "loss": 0.8978, + "step": 31282 + }, + { + "epoch": 6.672994880546075, + "grad_norm": 0.07453786815461433, + "learning_rate": 1.0612969615808423e-06, + "loss": 0.8772, + "step": 31283 + }, + { + "epoch": 6.67320819112628, + "grad_norm": 0.07322571291078042, + "learning_rate": 1.0599158763510098e-06, + "loss": 0.8601, + "step": 31284 + }, + { + "epoch": 6.673421501706485, + "grad_norm": 0.06817694444830809, + "learning_rate": 1.0585356843295468e-06, + "loss": 0.8558, + "step": 31285 + }, + { + "epoch": 6.673634812286689, + "grad_norm": 0.07350499499738627, + "learning_rate": 1.0571563855320943e-06, + "loss": 0.8821, + "step": 31286 + }, + { + "epoch": 6.673848122866894, + "grad_norm": 0.07577608400513328, + "learning_rate": 1.0557779799742397e-06, + "loss": 0.8886, + "step": 31287 + }, + { + "epoch": 6.674061433447099, + "grad_norm": 0.06919332628096071, + "learning_rate": 1.054400467671588e-06, + "loss": 0.8629, + "step": 31288 + }, + { + "epoch": 6.674274744027304, + "grad_norm": 0.07159078597363219, + "learning_rate": 1.0530238486397183e-06, + "loss": 0.8679, + "step": 31289 + }, + { + "epoch": 6.674488054607508, + "grad_norm": 0.07460447909530958, + "learning_rate": 1.0516481228942176e-06, + "loss": 0.8873, + "step": 31290 + }, + { + "epoch": 6.674701365187714, + "grad_norm": 0.07826620928545837, + "learning_rate": 1.0502732904506474e-06, + "loss": 0.8924, + "step": 31291 + }, + { + "epoch": 6.674914675767918, + "grad_norm": 0.07076307907013335, + "learning_rate": 1.0488993513245594e-06, + "loss": 0.8825, + "step": 31292 + }, + { + "epoch": 6.675127986348123, + "grad_norm": 0.06982900810241988, + "learning_rate": 1.0475263055314966e-06, + "loss": 0.8716, + "step": 31293 + }, + { + "epoch": 6.675341296928328, + "grad_norm": 0.07360018711161634, + "learning_rate": 1.0461541530870022e-06, + "loss": 0.872, + "step": 31294 + }, + { + "epoch": 6.675554607508532, + "grad_norm": 0.07242028426676807, + "learning_rate": 1.0447828940066108e-06, + "loss": 0.9026, + "step": 31295 + }, + { + "epoch": 6.675767918088737, + "grad_norm": 0.07443215180229862, + "learning_rate": 1.0434125283058295e-06, + "loss": 0.9063, + "step": 31296 + }, + { + "epoch": 6.675981228668942, + "grad_norm": 0.07456497335791794, + "learning_rate": 1.0420430560001483e-06, + "loss": 0.9335, + "step": 31297 + }, + { + "epoch": 6.676194539249146, + "grad_norm": 0.07425847757872465, + "learning_rate": 1.0406744771050925e-06, + "loss": 0.9082, + "step": 31298 + }, + { + "epoch": 6.676407849829351, + "grad_norm": 0.07323380608628223, + "learning_rate": 1.0393067916361344e-06, + "loss": 0.8856, + "step": 31299 + }, + { + "epoch": 6.676621160409557, + "grad_norm": 0.0785815954646452, + "learning_rate": 1.037939999608737e-06, + "loss": 0.8875, + "step": 31300 + }, + { + "epoch": 6.676834470989761, + "grad_norm": 0.07760255556194709, + "learning_rate": 1.03657410103839e-06, + "loss": 0.8565, + "step": 31301 + }, + { + "epoch": 6.677047781569966, + "grad_norm": 0.07692387164802429, + "learning_rate": 1.0352090959405214e-06, + "loss": 0.8855, + "step": 31302 + }, + { + "epoch": 6.677261092150171, + "grad_norm": 0.0723347874862141, + "learning_rate": 1.0338449843306032e-06, + "loss": 0.8691, + "step": 31303 + }, + { + "epoch": 6.677474402730375, + "grad_norm": 0.07442732616617942, + "learning_rate": 1.0324817662240538e-06, + "loss": 0.8658, + "step": 31304 + }, + { + "epoch": 6.67768771331058, + "grad_norm": 0.07318251352893868, + "learning_rate": 1.0311194416362925e-06, + "loss": 0.8645, + "step": 31305 + }, + { + "epoch": 6.677901023890785, + "grad_norm": 0.0747921418856558, + "learning_rate": 1.0297580105827464e-06, + "loss": 0.8664, + "step": 31306 + }, + { + "epoch": 6.6781143344709895, + "grad_norm": 0.0789857489404531, + "learning_rate": 1.028397473078817e-06, + "loss": 0.8651, + "step": 31307 + }, + { + "epoch": 6.678327645051194, + "grad_norm": 0.07374042984580478, + "learning_rate": 1.0270378291399053e-06, + "loss": 0.8954, + "step": 31308 + }, + { + "epoch": 6.6785409556314, + "grad_norm": 0.07367980184157023, + "learning_rate": 1.0256790787813764e-06, + "loss": 0.8906, + "step": 31309 + }, + { + "epoch": 6.678754266211604, + "grad_norm": 0.0768184583352705, + "learning_rate": 1.0243212220186315e-06, + "loss": 0.8833, + "step": 31310 + }, + { + "epoch": 6.678967576791809, + "grad_norm": 0.07425813453797496, + "learning_rate": 1.0229642588670097e-06, + "loss": 0.898, + "step": 31311 + }, + { + "epoch": 6.679180887372014, + "grad_norm": 0.07453068925724624, + "learning_rate": 1.0216081893418939e-06, + "loss": 0.85, + "step": 31312 + }, + { + "epoch": 6.6793941979522184, + "grad_norm": 0.07906518007965165, + "learning_rate": 1.0202530134585876e-06, + "loss": 0.8899, + "step": 31313 + }, + { + "epoch": 6.679607508532423, + "grad_norm": 0.06849223813555982, + "learning_rate": 1.0188987312324562e-06, + "loss": 0.8812, + "step": 31314 + }, + { + "epoch": 6.679820819112628, + "grad_norm": 0.07063545921025059, + "learning_rate": 1.0175453426788117e-06, + "loss": 0.8827, + "step": 31315 + }, + { + "epoch": 6.6800341296928325, + "grad_norm": 0.07173810208018751, + "learning_rate": 1.0161928478129758e-06, + "loss": 0.8969, + "step": 31316 + }, + { + "epoch": 6.680247440273037, + "grad_norm": 0.07078051658401893, + "learning_rate": 1.0148412466502332e-06, + "loss": 0.874, + "step": 31317 + }, + { + "epoch": 6.680460750853243, + "grad_norm": 0.07337000806921136, + "learning_rate": 1.0134905392059059e-06, + "loss": 0.8538, + "step": 31318 + }, + { + "epoch": 6.680674061433447, + "grad_norm": 0.07329820176008343, + "learning_rate": 1.012140725495252e-06, + "loss": 0.8798, + "step": 31319 + }, + { + "epoch": 6.680887372013652, + "grad_norm": 0.07632477697235547, + "learning_rate": 1.0107918055335574e-06, + "loss": 0.9021, + "step": 31320 + }, + { + "epoch": 6.681100682593857, + "grad_norm": 0.0731388548432476, + "learning_rate": 1.0094437793360812e-06, + "loss": 0.871, + "step": 31321 + }, + { + "epoch": 6.6813139931740615, + "grad_norm": 0.07643694361295984, + "learning_rate": 1.008096646918082e-06, + "loss": 0.887, + "step": 31322 + }, + { + "epoch": 6.681527303754266, + "grad_norm": 0.07167892305667768, + "learning_rate": 1.006750408294801e-06, + "loss": 0.8758, + "step": 31323 + }, + { + "epoch": 6.681740614334471, + "grad_norm": 0.07642621605436561, + "learning_rate": 1.0054050634814616e-06, + "loss": 0.8602, + "step": 31324 + }, + { + "epoch": 6.6819539249146755, + "grad_norm": 0.0732302550899476, + "learning_rate": 1.0040606124932873e-06, + "loss": 0.8553, + "step": 31325 + }, + { + "epoch": 6.68216723549488, + "grad_norm": 0.07401501503133316, + "learning_rate": 1.0027170553455012e-06, + "loss": 0.8664, + "step": 31326 + }, + { + "epoch": 6.682380546075086, + "grad_norm": 0.07596126400031566, + "learning_rate": 1.0013743920533003e-06, + "loss": 0.8679, + "step": 31327 + }, + { + "epoch": 6.6825938566552905, + "grad_norm": 0.0724886480065715, + "learning_rate": 1.0000326226318812e-06, + "loss": 0.9038, + "step": 31328 + }, + { + "epoch": 6.682807167235495, + "grad_norm": 0.07248060737603014, + "learning_rate": 9.986917470964054e-07, + "loss": 0.8682, + "step": 31329 + }, + { + "epoch": 6.6830204778157, + "grad_norm": 0.07761770225569657, + "learning_rate": 9.973517654620779e-07, + "loss": 0.9172, + "step": 31330 + }, + { + "epoch": 6.6832337883959045, + "grad_norm": 0.076634477736274, + "learning_rate": 9.960126777440426e-07, + "loss": 0.903, + "step": 31331 + }, + { + "epoch": 6.683447098976109, + "grad_norm": 0.07755034345370684, + "learning_rate": 9.94674483957443e-07, + "loss": 0.8913, + "step": 31332 + }, + { + "epoch": 6.683660409556314, + "grad_norm": 0.07314722513850301, + "learning_rate": 9.9333718411744e-07, + "loss": 0.8801, + "step": 31333 + }, + { + "epoch": 6.683873720136519, + "grad_norm": 0.07437536818645064, + "learning_rate": 9.920007782391594e-07, + "loss": 0.8573, + "step": 31334 + }, + { + "epoch": 6.684087030716723, + "grad_norm": 0.07970811197708193, + "learning_rate": 9.906652663377092e-07, + "loss": 0.8949, + "step": 31335 + }, + { + "epoch": 6.684300341296928, + "grad_norm": 0.07418063606748723, + "learning_rate": 9.89330648428215e-07, + "loss": 0.9091, + "step": 31336 + }, + { + "epoch": 6.684513651877133, + "grad_norm": 0.0715647639583574, + "learning_rate": 9.87996924525767e-07, + "loss": 0.9081, + "step": 31337 + }, + { + "epoch": 6.684726962457338, + "grad_norm": 0.07913237366543938, + "learning_rate": 9.866640946454643e-07, + "loss": 0.8775, + "step": 31338 + }, + { + "epoch": 6.684940273037543, + "grad_norm": 0.07095042673382881, + "learning_rate": 9.853321588023967e-07, + "loss": 0.8908, + "step": 31339 + }, + { + "epoch": 6.685153583617748, + "grad_norm": 0.07100732599251434, + "learning_rate": 9.840011170116193e-07, + "loss": 0.8598, + "step": 31340 + }, + { + "epoch": 6.685366894197952, + "grad_norm": 0.07711974170184235, + "learning_rate": 9.826709692881864e-07, + "loss": 0.9197, + "step": 31341 + }, + { + "epoch": 6.685580204778157, + "grad_norm": 0.08157244967296824, + "learning_rate": 9.813417156471794e-07, + "loss": 0.8493, + "step": 31342 + }, + { + "epoch": 6.685793515358362, + "grad_norm": 0.07471673989707535, + "learning_rate": 9.800133561036085e-07, + "loss": 0.8925, + "step": 31343 + }, + { + "epoch": 6.686006825938566, + "grad_norm": 0.07863761129297557, + "learning_rate": 9.786858906725195e-07, + "loss": 0.8868, + "step": 31344 + }, + { + "epoch": 6.686220136518771, + "grad_norm": 0.07662910774835674, + "learning_rate": 9.773593193689312e-07, + "loss": 0.8506, + "step": 31345 + }, + { + "epoch": 6.686433447098976, + "grad_norm": 0.07314017579156004, + "learning_rate": 9.760336422078453e-07, + "loss": 0.878, + "step": 31346 + }, + { + "epoch": 6.686646757679181, + "grad_norm": 0.07507366737561953, + "learning_rate": 9.74708859204272e-07, + "loss": 0.8979, + "step": 31347 + }, + { + "epoch": 6.686860068259386, + "grad_norm": 0.07990786451963645, + "learning_rate": 9.733849703731946e-07, + "loss": 0.8838, + "step": 31348 + }, + { + "epoch": 6.687073378839591, + "grad_norm": 0.07382153734831502, + "learning_rate": 9.72061975729597e-07, + "loss": 0.8478, + "step": 31349 + }, + { + "epoch": 6.687286689419795, + "grad_norm": 0.07652192998546183, + "learning_rate": 9.707398752884446e-07, + "loss": 0.904, + "step": 31350 + }, + { + "epoch": 6.6875, + "grad_norm": 0.08108058705351112, + "learning_rate": 9.694186690647034e-07, + "loss": 0.8844, + "step": 31351 + }, + { + "epoch": 6.687713310580205, + "grad_norm": 0.0727983548319847, + "learning_rate": 9.680983570733215e-07, + "loss": 0.8674, + "step": 31352 + }, + { + "epoch": 6.687926621160409, + "grad_norm": 0.07369360156495967, + "learning_rate": 9.667789393292294e-07, + "loss": 0.8717, + "step": 31353 + }, + { + "epoch": 6.688139931740614, + "grad_norm": 0.08156305559709079, + "learning_rate": 9.654604158473658e-07, + "loss": 0.9032, + "step": 31354 + }, + { + "epoch": 6.688353242320819, + "grad_norm": 0.07007070616039994, + "learning_rate": 9.641427866426524e-07, + "loss": 0.8634, + "step": 31355 + }, + { + "epoch": 6.688566552901024, + "grad_norm": 0.07414546009328281, + "learning_rate": 9.628260517299837e-07, + "loss": 0.8751, + "step": 31356 + }, + { + "epoch": 6.688779863481229, + "grad_norm": 0.0751114527521552, + "learning_rate": 9.615102111242812e-07, + "loss": 0.8884, + "step": 31357 + }, + { + "epoch": 6.688993174061434, + "grad_norm": 0.07442974304209508, + "learning_rate": 9.601952648404044e-07, + "loss": 0.9038, + "step": 31358 + }, + { + "epoch": 6.689206484641638, + "grad_norm": 0.07232032886983669, + "learning_rate": 9.588812128932567e-07, + "loss": 0.8703, + "step": 31359 + }, + { + "epoch": 6.689419795221843, + "grad_norm": 0.0725732888128171, + "learning_rate": 9.575680552976974e-07, + "loss": 0.8972, + "step": 31360 + }, + { + "epoch": 6.689633105802048, + "grad_norm": 0.0734905156151811, + "learning_rate": 9.562557920685766e-07, + "loss": 0.8734, + "step": 31361 + }, + { + "epoch": 6.689846416382252, + "grad_norm": 0.07407403583780306, + "learning_rate": 9.549444232207627e-07, + "loss": 0.8785, + "step": 31362 + }, + { + "epoch": 6.690059726962457, + "grad_norm": 0.07316435920151455, + "learning_rate": 9.536339487690793e-07, + "loss": 0.8925, + "step": 31363 + }, + { + "epoch": 6.690273037542662, + "grad_norm": 0.07944537567791446, + "learning_rate": 9.523243687283501e-07, + "loss": 0.8953, + "step": 31364 + }, + { + "epoch": 6.690486348122867, + "grad_norm": 0.07814821467185248, + "learning_rate": 9.510156831133988e-07, + "loss": 0.9075, + "step": 31365 + }, + { + "epoch": 6.690699658703072, + "grad_norm": 0.07464588829678422, + "learning_rate": 9.49707891939049e-07, + "loss": 0.8683, + "step": 31366 + }, + { + "epoch": 6.690912969283277, + "grad_norm": 0.07745624821872982, + "learning_rate": 9.484009952200712e-07, + "loss": 0.8972, + "step": 31367 + }, + { + "epoch": 6.691126279863481, + "grad_norm": 0.07570611988151069, + "learning_rate": 9.470949929712802e-07, + "loss": 0.8982, + "step": 31368 + }, + { + "epoch": 6.691339590443686, + "grad_norm": 0.07471249756964203, + "learning_rate": 9.457898852074288e-07, + "loss": 0.8826, + "step": 31369 + }, + { + "epoch": 6.691552901023891, + "grad_norm": 0.08117833297154121, + "learning_rate": 9.444856719432959e-07, + "loss": 0.8914, + "step": 31370 + }, + { + "epoch": 6.6917662116040955, + "grad_norm": 0.07094754403067882, + "learning_rate": 9.431823531936346e-07, + "loss": 0.8788, + "step": 31371 + }, + { + "epoch": 6.6919795221843, + "grad_norm": 0.07581752779501241, + "learning_rate": 9.418799289731972e-07, + "loss": 0.9053, + "step": 31372 + }, + { + "epoch": 6.692192832764505, + "grad_norm": 0.07975547707008175, + "learning_rate": 9.4057839929671e-07, + "loss": 0.8694, + "step": 31373 + }, + { + "epoch": 6.6924061433447095, + "grad_norm": 0.07169591618710655, + "learning_rate": 9.392777641789164e-07, + "loss": 0.8704, + "step": 31374 + }, + { + "epoch": 6.692619453924914, + "grad_norm": 0.07181936213213752, + "learning_rate": 9.379780236345248e-07, + "loss": 0.8475, + "step": 31375 + }, + { + "epoch": 6.69283276450512, + "grad_norm": 0.07172903877578453, + "learning_rate": 9.366791776782436e-07, + "loss": 0.9034, + "step": 31376 + }, + { + "epoch": 6.6930460750853245, + "grad_norm": 0.07867377367049792, + "learning_rate": 9.353812263247631e-07, + "loss": 0.8524, + "step": 31377 + }, + { + "epoch": 6.693259385665529, + "grad_norm": 0.07536659344227217, + "learning_rate": 9.340841695887737e-07, + "loss": 0.8915, + "step": 31378 + }, + { + "epoch": 6.693472696245734, + "grad_norm": 0.07129115487312747, + "learning_rate": 9.32788007484966e-07, + "loss": 0.8735, + "step": 31379 + }, + { + "epoch": 6.6936860068259385, + "grad_norm": 0.07211483802876346, + "learning_rate": 9.314927400279772e-07, + "loss": 0.8883, + "step": 31380 + }, + { + "epoch": 6.693899317406143, + "grad_norm": 0.0707942774404098, + "learning_rate": 9.3019836723248e-07, + "loss": 0.8664, + "step": 31381 + }, + { + "epoch": 6.694112627986348, + "grad_norm": 0.0726710735321578, + "learning_rate": 9.289048891131114e-07, + "loss": 0.8927, + "step": 31382 + }, + { + "epoch": 6.694325938566553, + "grad_norm": 0.0767731146659312, + "learning_rate": 9.276123056845266e-07, + "loss": 0.8797, + "step": 31383 + }, + { + "epoch": 6.694539249146757, + "grad_norm": 0.07603145844645447, + "learning_rate": 9.263206169613359e-07, + "loss": 0.8738, + "step": 31384 + }, + { + "epoch": 6.694752559726963, + "grad_norm": 0.07843652109125555, + "learning_rate": 9.2502982295815e-07, + "loss": 0.8718, + "step": 31385 + }, + { + "epoch": 6.6949658703071675, + "grad_norm": 0.07289000450672142, + "learning_rate": 9.237399236895883e-07, + "loss": 0.8853, + "step": 31386 + }, + { + "epoch": 6.695179180887372, + "grad_norm": 0.07357304944054721, + "learning_rate": 9.224509191702347e-07, + "loss": 0.8936, + "step": 31387 + }, + { + "epoch": 6.695392491467577, + "grad_norm": 0.07364642393346968, + "learning_rate": 9.211628094146819e-07, + "loss": 0.873, + "step": 31388 + }, + { + "epoch": 6.6956058020477816, + "grad_norm": 0.07579138692636597, + "learning_rate": 9.198755944375049e-07, + "loss": 0.8839, + "step": 31389 + }, + { + "epoch": 6.695819112627986, + "grad_norm": 0.07630172336099164, + "learning_rate": 9.18589274253261e-07, + "loss": 0.8575, + "step": 31390 + }, + { + "epoch": 6.696032423208191, + "grad_norm": 0.07645652384070155, + "learning_rate": 9.173038488765162e-07, + "loss": 0.8784, + "step": 31391 + }, + { + "epoch": 6.696245733788396, + "grad_norm": 0.07070047357406678, + "learning_rate": 9.160193183218013e-07, + "loss": 0.8561, + "step": 31392 + }, + { + "epoch": 6.6964590443686, + "grad_norm": 0.07531480945144638, + "learning_rate": 9.147356826036647e-07, + "loss": 0.8784, + "step": 31393 + }, + { + "epoch": 6.696672354948806, + "grad_norm": 0.07116033899697023, + "learning_rate": 9.134529417366189e-07, + "loss": 0.8722, + "step": 31394 + }, + { + "epoch": 6.6968856655290105, + "grad_norm": 0.07506045531183492, + "learning_rate": 9.12171095735186e-07, + "loss": 0.881, + "step": 31395 + }, + { + "epoch": 6.697098976109215, + "grad_norm": 0.07501412570177145, + "learning_rate": 9.108901446138607e-07, + "loss": 0.8949, + "step": 31396 + }, + { + "epoch": 6.69731228668942, + "grad_norm": 0.076256219200182, + "learning_rate": 9.096100883871472e-07, + "loss": 0.8939, + "step": 31397 + }, + { + "epoch": 6.697525597269625, + "grad_norm": 0.07725359519546832, + "learning_rate": 9.083309270695318e-07, + "loss": 0.8909, + "step": 31398 + }, + { + "epoch": 6.697738907849829, + "grad_norm": 0.0824247674439938, + "learning_rate": 9.070526606754737e-07, + "loss": 0.8867, + "step": 31399 + }, + { + "epoch": 6.697952218430034, + "grad_norm": 0.073585700032404, + "learning_rate": 9.057752892194505e-07, + "loss": 0.8841, + "step": 31400 + }, + { + "epoch": 6.698165529010239, + "grad_norm": 0.07205974955375387, + "learning_rate": 9.044988127159127e-07, + "loss": 0.873, + "step": 31401 + }, + { + "epoch": 6.698378839590443, + "grad_norm": 0.0725422717142762, + "learning_rate": 9.032232311793021e-07, + "loss": 0.873, + "step": 31402 + }, + { + "epoch": 6.698592150170649, + "grad_norm": 0.07122638070474169, + "learning_rate": 9.019485446240427e-07, + "loss": 0.8516, + "step": 31403 + }, + { + "epoch": 6.698805460750854, + "grad_norm": 0.07290937781947825, + "learning_rate": 9.006747530645765e-07, + "loss": 0.8475, + "step": 31404 + }, + { + "epoch": 6.699018771331058, + "grad_norm": 0.08139412792677837, + "learning_rate": 8.994018565152917e-07, + "loss": 0.8941, + "step": 31405 + }, + { + "epoch": 6.699232081911263, + "grad_norm": 0.07577125246460312, + "learning_rate": 8.981298549906126e-07, + "loss": 0.864, + "step": 31406 + }, + { + "epoch": 6.699445392491468, + "grad_norm": 0.07135933909717888, + "learning_rate": 8.968587485049274e-07, + "loss": 0.8745, + "step": 31407 + }, + { + "epoch": 6.699658703071672, + "grad_norm": 0.07620786925141151, + "learning_rate": 8.955885370726158e-07, + "loss": 0.9004, + "step": 31408 + }, + { + "epoch": 6.699872013651877, + "grad_norm": 0.0724766058278419, + "learning_rate": 8.943192207080486e-07, + "loss": 0.8575, + "step": 31409 + }, + { + "epoch": 6.700085324232082, + "grad_norm": 0.0725881640403305, + "learning_rate": 8.930507994255877e-07, + "loss": 0.8599, + "step": 31410 + }, + { + "epoch": 6.700298634812286, + "grad_norm": 0.07632141661834492, + "learning_rate": 8.917832732395948e-07, + "loss": 0.8422, + "step": 31411 + }, + { + "epoch": 6.700511945392492, + "grad_norm": 0.07293518732436742, + "learning_rate": 8.905166421643963e-07, + "loss": 0.8791, + "step": 31412 + }, + { + "epoch": 6.700725255972696, + "grad_norm": 0.07643738672451539, + "learning_rate": 8.892509062143362e-07, + "loss": 0.8933, + "step": 31413 + }, + { + "epoch": 6.700938566552901, + "grad_norm": 0.07249270734822612, + "learning_rate": 8.87986065403732e-07, + "loss": 0.8651, + "step": 31414 + }, + { + "epoch": 6.701151877133106, + "grad_norm": 0.0831102282605425, + "learning_rate": 8.867221197469012e-07, + "loss": 0.8742, + "step": 31415 + }, + { + "epoch": 6.701365187713311, + "grad_norm": 0.07173292421186868, + "learning_rate": 8.854590692581345e-07, + "loss": 0.8722, + "step": 31416 + }, + { + "epoch": 6.701578498293515, + "grad_norm": 0.07343911759719861, + "learning_rate": 8.841969139517315e-07, + "loss": 0.8855, + "step": 31417 + }, + { + "epoch": 6.70179180887372, + "grad_norm": 0.07489590249744546, + "learning_rate": 8.829356538419654e-07, + "loss": 0.8803, + "step": 31418 + }, + { + "epoch": 6.702005119453925, + "grad_norm": 0.07180144679954284, + "learning_rate": 8.816752889431268e-07, + "loss": 0.8886, + "step": 31419 + }, + { + "epoch": 6.702218430034129, + "grad_norm": 0.06963468742714184, + "learning_rate": 8.804158192694534e-07, + "loss": 0.8702, + "step": 31420 + }, + { + "epoch": 6.702431740614334, + "grad_norm": 0.07260410877633212, + "learning_rate": 8.791572448352004e-07, + "loss": 0.8712, + "step": 31421 + }, + { + "epoch": 6.702645051194539, + "grad_norm": 0.08327238000903901, + "learning_rate": 8.77899565654623e-07, + "loss": 0.8567, + "step": 31422 + }, + { + "epoch": 6.702858361774744, + "grad_norm": 0.07442250855410393, + "learning_rate": 8.766427817419409e-07, + "loss": 0.8377, + "step": 31423 + }, + { + "epoch": 6.703071672354949, + "grad_norm": 0.07345531538992817, + "learning_rate": 8.75386893111374e-07, + "loss": 0.8779, + "step": 31424 + }, + { + "epoch": 6.703284982935154, + "grad_norm": 0.07733473442557398, + "learning_rate": 8.741318997771419e-07, + "loss": 0.8639, + "step": 31425 + }, + { + "epoch": 6.703498293515358, + "grad_norm": 0.07513716828945652, + "learning_rate": 8.728778017534378e-07, + "loss": 0.8796, + "step": 31426 + }, + { + "epoch": 6.703711604095563, + "grad_norm": 0.07406722076419182, + "learning_rate": 8.716245990544458e-07, + "loss": 0.87, + "step": 31427 + }, + { + "epoch": 6.703924914675768, + "grad_norm": 0.07377143310024331, + "learning_rate": 8.703722916943591e-07, + "loss": 0.8917, + "step": 31428 + }, + { + "epoch": 6.7041382252559725, + "grad_norm": 0.07300422542792008, + "learning_rate": 8.691208796873263e-07, + "loss": 0.862, + "step": 31429 + }, + { + "epoch": 6.704351535836177, + "grad_norm": 0.07405081115390318, + "learning_rate": 8.678703630475405e-07, + "loss": 0.8642, + "step": 31430 + }, + { + "epoch": 6.704564846416382, + "grad_norm": 0.07644255441550421, + "learning_rate": 8.666207417891237e-07, + "loss": 0.8843, + "step": 31431 + }, + { + "epoch": 6.704778156996587, + "grad_norm": 0.07543056933462064, + "learning_rate": 8.653720159262246e-07, + "loss": 0.8752, + "step": 31432 + }, + { + "epoch": 6.704991467576792, + "grad_norm": 0.0830657724252531, + "learning_rate": 8.641241854729654e-07, + "loss": 0.8947, + "step": 31433 + }, + { + "epoch": 6.705204778156997, + "grad_norm": 0.07147186993627136, + "learning_rate": 8.628772504434857e-07, + "loss": 0.8581, + "step": 31434 + }, + { + "epoch": 6.7054180887372015, + "grad_norm": 0.07564744379351646, + "learning_rate": 8.616312108518809e-07, + "loss": 0.8875, + "step": 31435 + }, + { + "epoch": 6.705631399317406, + "grad_norm": 0.08402273644574336, + "learning_rate": 8.603860667122377e-07, + "loss": 0.8442, + "step": 31436 + }, + { + "epoch": 6.705844709897611, + "grad_norm": 0.069579969472664, + "learning_rate": 8.591418180386601e-07, + "loss": 0.8622, + "step": 31437 + }, + { + "epoch": 6.7060580204778155, + "grad_norm": 0.07301856795545987, + "learning_rate": 8.57898464845226e-07, + "loss": 0.8582, + "step": 31438 + }, + { + "epoch": 6.70627133105802, + "grad_norm": 0.06982301500123575, + "learning_rate": 8.56656007146004e-07, + "loss": 0.8741, + "step": 31439 + }, + { + "epoch": 6.706484641638225, + "grad_norm": 0.07229606593115816, + "learning_rate": 8.554144449550539e-07, + "loss": 0.8615, + "step": 31440 + }, + { + "epoch": 6.7066979522184305, + "grad_norm": 0.07348807781957771, + "learning_rate": 8.54173778286409e-07, + "loss": 0.8795, + "step": 31441 + }, + { + "epoch": 6.706911262798635, + "grad_norm": 0.07294862898230078, + "learning_rate": 8.529340071541292e-07, + "loss": 0.8531, + "step": 31442 + }, + { + "epoch": 6.70712457337884, + "grad_norm": 0.07571233532992869, + "learning_rate": 8.516951315722299e-07, + "loss": 0.8766, + "step": 31443 + }, + { + "epoch": 6.7073378839590445, + "grad_norm": 0.07230049193053609, + "learning_rate": 8.504571515547355e-07, + "loss": 0.8872, + "step": 31444 + }, + { + "epoch": 6.707551194539249, + "grad_norm": 0.07356582143943836, + "learning_rate": 8.492200671156436e-07, + "loss": 0.9004, + "step": 31445 + }, + { + "epoch": 6.707764505119454, + "grad_norm": 0.07601413613692751, + "learning_rate": 8.479838782689608e-07, + "loss": 0.9015, + "step": 31446 + }, + { + "epoch": 6.707977815699659, + "grad_norm": 0.07425517974524819, + "learning_rate": 8.467485850286761e-07, + "loss": 0.8705, + "step": 31447 + }, + { + "epoch": 6.708191126279863, + "grad_norm": 0.07540987081163761, + "learning_rate": 8.455141874087513e-07, + "loss": 0.8722, + "step": 31448 + }, + { + "epoch": 6.708404436860068, + "grad_norm": 0.0710071719108278, + "learning_rate": 8.442806854231756e-07, + "loss": 0.892, + "step": 31449 + }, + { + "epoch": 6.7086177474402735, + "grad_norm": 0.07467818450837954, + "learning_rate": 8.430480790858841e-07, + "loss": 0.8466, + "step": 31450 + }, + { + "epoch": 6.708831058020478, + "grad_norm": 0.07976223475272666, + "learning_rate": 8.418163684108483e-07, + "loss": 0.8972, + "step": 31451 + }, + { + "epoch": 6.709044368600683, + "grad_norm": 0.082171899758794, + "learning_rate": 8.405855534119855e-07, + "loss": 0.8906, + "step": 31452 + }, + { + "epoch": 6.709257679180888, + "grad_norm": 0.07194722004885121, + "learning_rate": 8.393556341032228e-07, + "loss": 0.8793, + "step": 31453 + }, + { + "epoch": 6.709470989761092, + "grad_norm": 0.07268674246212865, + "learning_rate": 8.381266104984864e-07, + "loss": 0.885, + "step": 31454 + }, + { + "epoch": 6.709684300341297, + "grad_norm": 0.0743546707023902, + "learning_rate": 8.368984826116855e-07, + "loss": 0.8846, + "step": 31455 + }, + { + "epoch": 6.709897610921502, + "grad_norm": 0.07188301001493638, + "learning_rate": 8.356712504567022e-07, + "loss": 0.8765, + "step": 31456 + }, + { + "epoch": 6.710110921501706, + "grad_norm": 0.07453636823175942, + "learning_rate": 8.344449140474365e-07, + "loss": 0.8743, + "step": 31457 + }, + { + "epoch": 6.710324232081911, + "grad_norm": 0.0732113429327206, + "learning_rate": 8.332194733977527e-07, + "loss": 0.8884, + "step": 31458 + }, + { + "epoch": 6.710537542662116, + "grad_norm": 0.0737126982240972, + "learning_rate": 8.319949285215245e-07, + "loss": 0.8635, + "step": 31459 + }, + { + "epoch": 6.71075085324232, + "grad_norm": 0.07322763785351677, + "learning_rate": 8.307712794325984e-07, + "loss": 0.8721, + "step": 31460 + }, + { + "epoch": 6.710964163822526, + "grad_norm": 0.07564979721318081, + "learning_rate": 8.295485261448388e-07, + "loss": 0.8972, + "step": 31461 + }, + { + "epoch": 6.711177474402731, + "grad_norm": 0.072602579921506, + "learning_rate": 8.283266686720659e-07, + "loss": 0.8826, + "step": 31462 + }, + { + "epoch": 6.711390784982935, + "grad_norm": 0.07484309695500438, + "learning_rate": 8.271057070281085e-07, + "loss": 0.8775, + "step": 31463 + }, + { + "epoch": 6.71160409556314, + "grad_norm": 0.07307717623424112, + "learning_rate": 8.258856412267779e-07, + "loss": 0.8956, + "step": 31464 + }, + { + "epoch": 6.711817406143345, + "grad_norm": 0.07515886031179772, + "learning_rate": 8.246664712818764e-07, + "loss": 0.8749, + "step": 31465 + }, + { + "epoch": 6.712030716723549, + "grad_norm": 0.07667900086609926, + "learning_rate": 8.23448197207215e-07, + "loss": 0.8931, + "step": 31466 + }, + { + "epoch": 6.712244027303754, + "grad_norm": 0.07670427375741683, + "learning_rate": 8.222308190165696e-07, + "loss": 0.8985, + "step": 31467 + }, + { + "epoch": 6.712457337883959, + "grad_norm": 0.08072502295069228, + "learning_rate": 8.210143367237155e-07, + "loss": 0.866, + "step": 31468 + }, + { + "epoch": 6.712670648464163, + "grad_norm": 0.07834774952686216, + "learning_rate": 8.197987503424109e-07, + "loss": 0.8713, + "step": 31469 + }, + { + "epoch": 6.712883959044369, + "grad_norm": 0.07684864958872638, + "learning_rate": 8.185840598864225e-07, + "loss": 0.8676, + "step": 31470 + }, + { + "epoch": 6.713097269624574, + "grad_norm": 0.07415077102404538, + "learning_rate": 8.173702653694815e-07, + "loss": 0.8651, + "step": 31471 + }, + { + "epoch": 6.713310580204778, + "grad_norm": 0.07605330139889156, + "learning_rate": 8.16157366805328e-07, + "loss": 0.8868, + "step": 31472 + }, + { + "epoch": 6.713523890784983, + "grad_norm": 0.07703013937123077, + "learning_rate": 8.149453642076844e-07, + "loss": 0.8945, + "step": 31473 + }, + { + "epoch": 6.713737201365188, + "grad_norm": 0.07904083035919927, + "learning_rate": 8.137342575902729e-07, + "loss": 0.8779, + "step": 31474 + }, + { + "epoch": 6.713950511945392, + "grad_norm": 0.07216067115585435, + "learning_rate": 8.125240469667806e-07, + "loss": 0.8868, + "step": 31475 + }, + { + "epoch": 6.714163822525597, + "grad_norm": 0.0722047559670227, + "learning_rate": 8.113147323509207e-07, + "loss": 0.882, + "step": 31476 + }, + { + "epoch": 6.714377133105802, + "grad_norm": 0.08587148013379779, + "learning_rate": 8.101063137563536e-07, + "loss": 0.9021, + "step": 31477 + }, + { + "epoch": 6.714590443686006, + "grad_norm": 0.07769013534809575, + "learning_rate": 8.088987911967749e-07, + "loss": 0.8852, + "step": 31478 + }, + { + "epoch": 6.714803754266212, + "grad_norm": 0.07369238385838975, + "learning_rate": 8.07692164685836e-07, + "loss": 0.8708, + "step": 31479 + }, + { + "epoch": 6.715017064846417, + "grad_norm": 0.07617245018105397, + "learning_rate": 8.06486434237197e-07, + "loss": 0.9016, + "step": 31480 + }, + { + "epoch": 6.715230375426621, + "grad_norm": 0.07762165811043127, + "learning_rate": 8.052815998644914e-07, + "loss": 0.8882, + "step": 31481 + }, + { + "epoch": 6.715443686006826, + "grad_norm": 0.06954070212828248, + "learning_rate": 8.040776615813617e-07, + "loss": 0.8747, + "step": 31482 + }, + { + "epoch": 6.715656996587031, + "grad_norm": 0.07519548054010369, + "learning_rate": 8.028746194014148e-07, + "loss": 0.8749, + "step": 31483 + }, + { + "epoch": 6.715870307167235, + "grad_norm": 0.07235038674473325, + "learning_rate": 8.016724733382842e-07, + "loss": 0.8718, + "step": 31484 + }, + { + "epoch": 6.71608361774744, + "grad_norm": 0.07240869449567709, + "learning_rate": 8.004712234055501e-07, + "loss": 0.8947, + "step": 31485 + }, + { + "epoch": 6.716296928327645, + "grad_norm": 0.0754010122730075, + "learning_rate": 7.992708696168284e-07, + "loss": 0.9015, + "step": 31486 + }, + { + "epoch": 6.7165102389078495, + "grad_norm": 0.07587885925411375, + "learning_rate": 7.980714119856814e-07, + "loss": 0.9088, + "step": 31487 + }, + { + "epoch": 6.716723549488055, + "grad_norm": 0.08407084163921734, + "learning_rate": 7.968728505256984e-07, + "loss": 0.8408, + "step": 31488 + }, + { + "epoch": 6.71693686006826, + "grad_norm": 0.07413947021247656, + "learning_rate": 7.956751852504152e-07, + "loss": 0.8869, + "step": 31489 + }, + { + "epoch": 6.717150170648464, + "grad_norm": 0.07227533361690754, + "learning_rate": 7.94478416173412e-07, + "loss": 0.8629, + "step": 31490 + }, + { + "epoch": 6.717363481228669, + "grad_norm": 0.07375366527209552, + "learning_rate": 7.932825433082159e-07, + "loss": 0.9031, + "step": 31491 + }, + { + "epoch": 6.717576791808874, + "grad_norm": 0.07483174740699215, + "learning_rate": 7.920875666683536e-07, + "loss": 0.8886, + "step": 31492 + }, + { + "epoch": 6.7177901023890785, + "grad_norm": 0.07927825809830027, + "learning_rate": 7.908934862673612e-07, + "loss": 0.9117, + "step": 31493 + }, + { + "epoch": 6.718003412969283, + "grad_norm": 0.07046208449684149, + "learning_rate": 7.897003021187388e-07, + "loss": 0.8718, + "step": 31494 + }, + { + "epoch": 6.718216723549488, + "grad_norm": 0.07640539146806387, + "learning_rate": 7.885080142359869e-07, + "loss": 0.8752, + "step": 31495 + }, + { + "epoch": 6.7184300341296925, + "grad_norm": 0.07940067114703656, + "learning_rate": 7.873166226325968e-07, + "loss": 0.8848, + "step": 31496 + }, + { + "epoch": 6.718643344709898, + "grad_norm": 0.07428157441669773, + "learning_rate": 7.861261273220599e-07, + "loss": 0.8956, + "step": 31497 + }, + { + "epoch": 6.718856655290102, + "grad_norm": 0.07062227473981933, + "learning_rate": 7.849365283178323e-07, + "loss": 0.8551, + "step": 31498 + }, + { + "epoch": 6.7190699658703075, + "grad_norm": 0.0740292524989585, + "learning_rate": 7.837478256333875e-07, + "loss": 0.9052, + "step": 31499 + }, + { + "epoch": 6.719283276450512, + "grad_norm": 0.07506602188426337, + "learning_rate": 7.825600192821637e-07, + "loss": 0.8845, + "step": 31500 + }, + { + "epoch": 6.719496587030717, + "grad_norm": 0.07665331227928762, + "learning_rate": 7.813731092775989e-07, + "loss": 0.907, + "step": 31501 + }, + { + "epoch": 6.7197098976109215, + "grad_norm": 0.07068571373706527, + "learning_rate": 7.801870956331403e-07, + "loss": 0.8786, + "step": 31502 + }, + { + "epoch": 6.719923208191126, + "grad_norm": 0.07098920641903678, + "learning_rate": 7.790019783621994e-07, + "loss": 0.8902, + "step": 31503 + }, + { + "epoch": 6.720136518771331, + "grad_norm": 0.07719671978383476, + "learning_rate": 7.778177574781698e-07, + "loss": 0.8663, + "step": 31504 + }, + { + "epoch": 6.720349829351536, + "grad_norm": 0.07312807105253484, + "learning_rate": 7.766344329944719e-07, + "loss": 0.9056, + "step": 31505 + }, + { + "epoch": 6.72056313993174, + "grad_norm": 0.07493969930211934, + "learning_rate": 7.754520049244907e-07, + "loss": 0.8837, + "step": 31506 + }, + { + "epoch": 6.720776450511945, + "grad_norm": 0.0723918556263908, + "learning_rate": 7.74270473281602e-07, + "loss": 0.8858, + "step": 31507 + }, + { + "epoch": 6.7209897610921505, + "grad_norm": 0.07135312823228863, + "learning_rate": 7.73089838079173e-07, + "loss": 0.8617, + "step": 31508 + }, + { + "epoch": 6.721203071672355, + "grad_norm": 0.07309718896821046, + "learning_rate": 7.719100993305617e-07, + "loss": 0.8679, + "step": 31509 + }, + { + "epoch": 6.72141638225256, + "grad_norm": 0.07203518678386957, + "learning_rate": 7.707312570491265e-07, + "loss": 0.849, + "step": 31510 + }, + { + "epoch": 6.721629692832765, + "grad_norm": 0.07388549157226339, + "learning_rate": 7.69553311248199e-07, + "loss": 0.871, + "step": 31511 + }, + { + "epoch": 6.721843003412969, + "grad_norm": 0.07415893354094276, + "learning_rate": 7.683762619411106e-07, + "loss": 0.8908, + "step": 31512 + }, + { + "epoch": 6.722056313993174, + "grad_norm": 0.07161654425048844, + "learning_rate": 7.672001091411752e-07, + "loss": 0.8433, + "step": 31513 + }, + { + "epoch": 6.722269624573379, + "grad_norm": 0.07445970763583022, + "learning_rate": 7.660248528616976e-07, + "loss": 0.9069, + "step": 31514 + }, + { + "epoch": 6.722482935153583, + "grad_norm": 0.07434022745591465, + "learning_rate": 7.648504931159917e-07, + "loss": 0.9103, + "step": 31515 + }, + { + "epoch": 6.722696245733788, + "grad_norm": 0.07365856991259619, + "learning_rate": 7.636770299173268e-07, + "loss": 0.8554, + "step": 31516 + }, + { + "epoch": 6.722909556313994, + "grad_norm": 0.07106475057185081, + "learning_rate": 7.625044632789902e-07, + "loss": 0.9016, + "step": 31517 + }, + { + "epoch": 6.723122866894198, + "grad_norm": 0.08009899957578354, + "learning_rate": 7.613327932142511e-07, + "loss": 0.9016, + "step": 31518 + }, + { + "epoch": 6.723336177474403, + "grad_norm": 0.07038040783237466, + "learning_rate": 7.601620197363612e-07, + "loss": 0.8575, + "step": 31519 + }, + { + "epoch": 6.723549488054608, + "grad_norm": 0.06830246740586918, + "learning_rate": 7.589921428585722e-07, + "loss": 0.8743, + "step": 31520 + }, + { + "epoch": 6.723762798634812, + "grad_norm": 0.07657563812910703, + "learning_rate": 7.578231625941179e-07, + "loss": 0.8926, + "step": 31521 + }, + { + "epoch": 6.723976109215017, + "grad_norm": 0.07757757704148342, + "learning_rate": 7.566550789562321e-07, + "loss": 0.8617, + "step": 31522 + }, + { + "epoch": 6.724189419795222, + "grad_norm": 0.07190325358480987, + "learning_rate": 7.55487891958131e-07, + "loss": 0.8439, + "step": 31523 + }, + { + "epoch": 6.724402730375426, + "grad_norm": 0.0736765265328116, + "learning_rate": 7.543216016130128e-07, + "loss": 0.902, + "step": 31524 + }, + { + "epoch": 6.724616040955631, + "grad_norm": 0.07262481065881007, + "learning_rate": 7.53156207934076e-07, + "loss": 0.8899, + "step": 31525 + }, + { + "epoch": 6.724829351535837, + "grad_norm": 0.07588703041446047, + "learning_rate": 7.519917109345098e-07, + "loss": 0.878, + "step": 31526 + }, + { + "epoch": 6.725042662116041, + "grad_norm": 0.07528569541887478, + "learning_rate": 7.50828110627495e-07, + "loss": 0.8933, + "step": 31527 + }, + { + "epoch": 6.725255972696246, + "grad_norm": 0.07234871600061946, + "learning_rate": 7.496654070261855e-07, + "loss": 0.8792, + "step": 31528 + }, + { + "epoch": 6.725469283276451, + "grad_norm": 0.07024946849944202, + "learning_rate": 7.48503600143744e-07, + "loss": 0.8521, + "step": 31529 + }, + { + "epoch": 6.725682593856655, + "grad_norm": 0.07766994946990882, + "learning_rate": 7.473426899933245e-07, + "loss": 0.8546, + "step": 31530 + }, + { + "epoch": 6.72589590443686, + "grad_norm": 0.07585412249939799, + "learning_rate": 7.461826765880542e-07, + "loss": 0.8739, + "step": 31531 + }, + { + "epoch": 6.726109215017065, + "grad_norm": 0.07085812540148599, + "learning_rate": 7.450235599410605e-07, + "loss": 0.8681, + "step": 31532 + }, + { + "epoch": 6.726322525597269, + "grad_norm": 0.07451280619420611, + "learning_rate": 7.438653400654527e-07, + "loss": 0.8745, + "step": 31533 + }, + { + "epoch": 6.726535836177474, + "grad_norm": 0.07360922338168253, + "learning_rate": 7.427080169743495e-07, + "loss": 0.8887, + "step": 31534 + }, + { + "epoch": 6.72674914675768, + "grad_norm": 0.07406340076567171, + "learning_rate": 7.415515906808335e-07, + "loss": 0.8717, + "step": 31535 + }, + { + "epoch": 6.726962457337884, + "grad_norm": 0.07094086227062407, + "learning_rate": 7.403960611979965e-07, + "loss": 0.869, + "step": 31536 + }, + { + "epoch": 6.727175767918089, + "grad_norm": 0.07239190882352982, + "learning_rate": 7.392414285389126e-07, + "loss": 0.8346, + "step": 31537 + }, + { + "epoch": 6.727389078498294, + "grad_norm": 0.07635124553727062, + "learning_rate": 7.38087692716647e-07, + "loss": 0.8955, + "step": 31538 + }, + { + "epoch": 6.727602389078498, + "grad_norm": 0.0720767312150196, + "learning_rate": 7.369348537442467e-07, + "loss": 0.8885, + "step": 31539 + }, + { + "epoch": 6.727815699658703, + "grad_norm": 0.07220409876179101, + "learning_rate": 7.357829116347593e-07, + "loss": 0.8856, + "step": 31540 + }, + { + "epoch": 6.728029010238908, + "grad_norm": 0.07471544557151404, + "learning_rate": 7.346318664012231e-07, + "loss": 0.8656, + "step": 31541 + }, + { + "epoch": 6.728242320819112, + "grad_norm": 0.06945586146417213, + "learning_rate": 7.334817180566678e-07, + "loss": 0.8909, + "step": 31542 + }, + { + "epoch": 6.728455631399317, + "grad_norm": 0.07304833072921199, + "learning_rate": 7.323324666140963e-07, + "loss": 0.8605, + "step": 31543 + }, + { + "epoch": 6.728668941979522, + "grad_norm": 0.07056514419478806, + "learning_rate": 7.311841120865116e-07, + "loss": 0.8894, + "step": 31544 + }, + { + "epoch": 6.7288822525597265, + "grad_norm": 0.07424614749019152, + "learning_rate": 7.300366544869164e-07, + "loss": 0.891, + "step": 31545 + }, + { + "epoch": 6.729095563139932, + "grad_norm": 0.07829554292679475, + "learning_rate": 7.288900938282872e-07, + "loss": 0.8836, + "step": 31546 + }, + { + "epoch": 6.729308873720137, + "grad_norm": 0.07479358739646136, + "learning_rate": 7.277444301236092e-07, + "loss": 0.8937, + "step": 31547 + }, + { + "epoch": 6.729522184300341, + "grad_norm": 0.07049173716941697, + "learning_rate": 7.26599663385823e-07, + "loss": 0.8966, + "step": 31548 + }, + { + "epoch": 6.729735494880546, + "grad_norm": 0.07308599206091586, + "learning_rate": 7.25455793627905e-07, + "loss": 0.8873, + "step": 31549 + }, + { + "epoch": 6.729948805460751, + "grad_norm": 0.06849966202065642, + "learning_rate": 7.243128208627869e-07, + "loss": 0.8765, + "step": 31550 + }, + { + "epoch": 6.7301621160409555, + "grad_norm": 0.07369589041433723, + "learning_rate": 7.231707451034009e-07, + "loss": 0.8798, + "step": 31551 + }, + { + "epoch": 6.73037542662116, + "grad_norm": 0.07554176955641474, + "learning_rate": 7.220295663626609e-07, + "loss": 0.8713, + "step": 31552 + }, + { + "epoch": 6.730588737201365, + "grad_norm": 0.07493545397634217, + "learning_rate": 7.208892846534988e-07, + "loss": 0.8645, + "step": 31553 + }, + { + "epoch": 6.7308020477815695, + "grad_norm": 0.07510838939347932, + "learning_rate": 7.197498999888108e-07, + "loss": 0.9079, + "step": 31554 + }, + { + "epoch": 6.731015358361775, + "grad_norm": 0.07145959713777732, + "learning_rate": 7.186114123814847e-07, + "loss": 0.8742, + "step": 31555 + }, + { + "epoch": 6.73122866894198, + "grad_norm": 0.07301112624584105, + "learning_rate": 7.174738218443988e-07, + "loss": 0.8828, + "step": 31556 + }, + { + "epoch": 6.7314419795221845, + "grad_norm": 0.07210897342962058, + "learning_rate": 7.163371283904408e-07, + "loss": 0.8832, + "step": 31557 + }, + { + "epoch": 6.731655290102389, + "grad_norm": 0.07522507730673315, + "learning_rate": 7.152013320324536e-07, + "loss": 0.8616, + "step": 31558 + }, + { + "epoch": 6.731868600682594, + "grad_norm": 0.07165156340348182, + "learning_rate": 7.140664327833069e-07, + "loss": 0.9024, + "step": 31559 + }, + { + "epoch": 6.7320819112627985, + "grad_norm": 0.07755870840632861, + "learning_rate": 7.129324306558172e-07, + "loss": 0.8831, + "step": 31560 + }, + { + "epoch": 6.732295221843003, + "grad_norm": 0.07399115809922542, + "learning_rate": 7.117993256628364e-07, + "loss": 0.87, + "step": 31561 + }, + { + "epoch": 6.732508532423208, + "grad_norm": 0.07362002888289389, + "learning_rate": 7.106671178171809e-07, + "loss": 0.8989, + "step": 31562 + }, + { + "epoch": 6.732721843003413, + "grad_norm": 0.07462718989839905, + "learning_rate": 7.095358071316672e-07, + "loss": 0.8736, + "step": 31563 + }, + { + "epoch": 6.732935153583618, + "grad_norm": 0.07346395513141729, + "learning_rate": 7.08405393619076e-07, + "loss": 0.8644, + "step": 31564 + }, + { + "epoch": 6.733148464163823, + "grad_norm": 0.0708040957993606, + "learning_rate": 7.072758772922151e-07, + "loss": 0.8849, + "step": 31565 + }, + { + "epoch": 6.7333617747440275, + "grad_norm": 0.0705951655085821, + "learning_rate": 7.06147258163874e-07, + "loss": 0.8465, + "step": 31566 + }, + { + "epoch": 6.733575085324232, + "grad_norm": 0.07015408637389134, + "learning_rate": 7.050195362467982e-07, + "loss": 0.8527, + "step": 31567 + }, + { + "epoch": 6.733788395904437, + "grad_norm": 0.07180214541429765, + "learning_rate": 7.038927115537597e-07, + "loss": 0.8535, + "step": 31568 + }, + { + "epoch": 6.734001706484642, + "grad_norm": 0.07557936183541471, + "learning_rate": 7.027667840975127e-07, + "loss": 0.8781, + "step": 31569 + }, + { + "epoch": 6.734215017064846, + "grad_norm": 0.07534012276571807, + "learning_rate": 7.016417538907938e-07, + "loss": 0.8693, + "step": 31570 + }, + { + "epoch": 6.734428327645051, + "grad_norm": 0.07544470232651518, + "learning_rate": 7.005176209463394e-07, + "loss": 0.8693, + "step": 31571 + }, + { + "epoch": 6.734641638225256, + "grad_norm": 0.07545401575171728, + "learning_rate": 6.993943852768504e-07, + "loss": 0.8622, + "step": 31572 + }, + { + "epoch": 6.734854948805461, + "grad_norm": 0.07289493320730549, + "learning_rate": 6.982720468950455e-07, + "loss": 0.8914, + "step": 31573 + }, + { + "epoch": 6.735068259385666, + "grad_norm": 0.07621693916614276, + "learning_rate": 6.971506058136346e-07, + "loss": 0.8907, + "step": 31574 + }, + { + "epoch": 6.735281569965871, + "grad_norm": 0.07271061628411524, + "learning_rate": 6.960300620453009e-07, + "loss": 0.8918, + "step": 31575 + }, + { + "epoch": 6.735494880546075, + "grad_norm": 0.07089501399541022, + "learning_rate": 6.949104156027098e-07, + "loss": 0.8851, + "step": 31576 + }, + { + "epoch": 6.73570819112628, + "grad_norm": 0.07900592449147653, + "learning_rate": 6.937916664985533e-07, + "loss": 0.9015, + "step": 31577 + }, + { + "epoch": 6.735921501706485, + "grad_norm": 0.07246883229575755, + "learning_rate": 6.926738147454703e-07, + "loss": 0.8539, + "step": 31578 + }, + { + "epoch": 6.736134812286689, + "grad_norm": 0.07734887805079835, + "learning_rate": 6.915568603561174e-07, + "loss": 0.8745, + "step": 31579 + }, + { + "epoch": 6.736348122866894, + "grad_norm": 0.06933195433399457, + "learning_rate": 6.904408033431331e-07, + "loss": 0.8758, + "step": 31580 + }, + { + "epoch": 6.736561433447099, + "grad_norm": 0.07462912824052574, + "learning_rate": 6.893256437191476e-07, + "loss": 0.8809, + "step": 31581 + }, + { + "epoch": 6.736774744027304, + "grad_norm": 0.08056297219223547, + "learning_rate": 6.882113814967817e-07, + "loss": 0.8676, + "step": 31582 + }, + { + "epoch": 6.736988054607508, + "grad_norm": 0.07076350604511032, + "learning_rate": 6.870980166886299e-07, + "loss": 0.8752, + "step": 31583 + }, + { + "epoch": 6.737201365187714, + "grad_norm": 0.07218262321246792, + "learning_rate": 6.859855493072954e-07, + "loss": 0.8758, + "step": 31584 + }, + { + "epoch": 6.737414675767918, + "grad_norm": 0.07197298589531621, + "learning_rate": 6.848739793653725e-07, + "loss": 0.8717, + "step": 31585 + }, + { + "epoch": 6.737627986348123, + "grad_norm": 0.07040771392201568, + "learning_rate": 6.837633068754379e-07, + "loss": 0.8619, + "step": 31586 + }, + { + "epoch": 6.737841296928328, + "grad_norm": 0.07198197034316979, + "learning_rate": 6.826535318500504e-07, + "loss": 0.86, + "step": 31587 + }, + { + "epoch": 6.738054607508532, + "grad_norm": 0.07280526061343665, + "learning_rate": 6.815446543017779e-07, + "loss": 0.8697, + "step": 31588 + }, + { + "epoch": 6.738267918088737, + "grad_norm": 0.06952065341215222, + "learning_rate": 6.804366742431612e-07, + "loss": 0.9121, + "step": 31589 + }, + { + "epoch": 6.738481228668942, + "grad_norm": 0.07312982079803759, + "learning_rate": 6.793295916867326e-07, + "loss": 0.8673, + "step": 31590 + }, + { + "epoch": 6.738694539249146, + "grad_norm": 0.07244574871425986, + "learning_rate": 6.782234066450244e-07, + "loss": 0.8836, + "step": 31591 + }, + { + "epoch": 6.738907849829351, + "grad_norm": 0.07393320888087346, + "learning_rate": 6.771181191305598e-07, + "loss": 0.8795, + "step": 31592 + }, + { + "epoch": 6.739121160409557, + "grad_norm": 0.07211109223760204, + "learning_rate": 6.760137291558355e-07, + "loss": 0.8944, + "step": 31593 + }, + { + "epoch": 6.739334470989761, + "grad_norm": 0.0718250511433985, + "learning_rate": 6.749102367333394e-07, + "loss": 0.8763, + "step": 31594 + }, + { + "epoch": 6.739547781569966, + "grad_norm": 0.07181503470454073, + "learning_rate": 6.738076418755857e-07, + "loss": 0.87, + "step": 31595 + }, + { + "epoch": 6.739761092150171, + "grad_norm": 0.07023289867247466, + "learning_rate": 6.727059445950179e-07, + "loss": 0.8602, + "step": 31596 + }, + { + "epoch": 6.739974402730375, + "grad_norm": 0.0720795965962603, + "learning_rate": 6.716051449041239e-07, + "loss": 0.8777, + "step": 31597 + }, + { + "epoch": 6.74018771331058, + "grad_norm": 0.07097060328844183, + "learning_rate": 6.705052428153469e-07, + "loss": 0.8698, + "step": 31598 + }, + { + "epoch": 6.740401023890785, + "grad_norm": 0.07197647626664558, + "learning_rate": 6.694062383411393e-07, + "loss": 0.8486, + "step": 31599 + }, + { + "epoch": 6.7406143344709895, + "grad_norm": 0.07513281515777107, + "learning_rate": 6.683081314939355e-07, + "loss": 0.8842, + "step": 31600 + }, + { + "epoch": 6.740827645051194, + "grad_norm": 0.07025864108399146, + "learning_rate": 6.672109222861611e-07, + "loss": 0.8774, + "step": 31601 + }, + { + "epoch": 6.7410409556314, + "grad_norm": 0.07398950165303614, + "learning_rate": 6.661146107302241e-07, + "loss": 0.8788, + "step": 31602 + }, + { + "epoch": 6.741254266211604, + "grad_norm": 0.07672677330764265, + "learning_rate": 6.650191968385411e-07, + "loss": 0.895, + "step": 31603 + }, + { + "epoch": 6.741467576791809, + "grad_norm": 0.0714983538891453, + "learning_rate": 6.639246806235023e-07, + "loss": 0.8684, + "step": 31604 + }, + { + "epoch": 6.741680887372014, + "grad_norm": 0.06836254889491436, + "learning_rate": 6.628310620974887e-07, + "loss": 0.8646, + "step": 31605 + }, + { + "epoch": 6.7418941979522184, + "grad_norm": 0.07392473839244507, + "learning_rate": 6.617383412728728e-07, + "loss": 0.8788, + "step": 31606 + }, + { + "epoch": 6.742107508532423, + "grad_norm": 0.07329616394056443, + "learning_rate": 6.60646518162027e-07, + "loss": 0.8668, + "step": 31607 + }, + { + "epoch": 6.742320819112628, + "grad_norm": 0.07297385021483546, + "learning_rate": 6.595555927772879e-07, + "loss": 0.8986, + "step": 31608 + }, + { + "epoch": 6.7425341296928325, + "grad_norm": 0.07338608969602993, + "learning_rate": 6.58465565131019e-07, + "loss": 0.9191, + "step": 31609 + }, + { + "epoch": 6.742747440273037, + "grad_norm": 0.07106845768813201, + "learning_rate": 6.573764352355483e-07, + "loss": 0.863, + "step": 31610 + }, + { + "epoch": 6.742960750853243, + "grad_norm": 0.0720989384000122, + "learning_rate": 6.562882031032036e-07, + "loss": 0.8936, + "step": 31611 + }, + { + "epoch": 6.743174061433447, + "grad_norm": 0.0737410947416716, + "learning_rate": 6.552008687462774e-07, + "loss": 0.8686, + "step": 31612 + }, + { + "epoch": 6.743387372013652, + "grad_norm": 0.0736486316799895, + "learning_rate": 6.541144321770976e-07, + "loss": 0.8719, + "step": 31613 + }, + { + "epoch": 6.743600682593857, + "grad_norm": 0.0671596470425622, + "learning_rate": 6.530288934079565e-07, + "loss": 0.8593, + "step": 31614 + }, + { + "epoch": 6.7438139931740615, + "grad_norm": 0.06995428071638818, + "learning_rate": 6.519442524511111e-07, + "loss": 0.886, + "step": 31615 + }, + { + "epoch": 6.744027303754266, + "grad_norm": 0.07938656363864985, + "learning_rate": 6.508605093188625e-07, + "loss": 0.8836, + "step": 31616 + }, + { + "epoch": 6.744240614334471, + "grad_norm": 0.07551584208284015, + "learning_rate": 6.497776640234588e-07, + "loss": 0.8906, + "step": 31617 + }, + { + "epoch": 6.7444539249146755, + "grad_norm": 0.3576215499295689, + "learning_rate": 6.486957165771568e-07, + "loss": 0.8805, + "step": 31618 + }, + { + "epoch": 6.74466723549488, + "grad_norm": 0.07088832451281275, + "learning_rate": 6.476146669921957e-07, + "loss": 0.8964, + "step": 31619 + }, + { + "epoch": 6.744880546075086, + "grad_norm": 0.07513492235145405, + "learning_rate": 6.465345152808055e-07, + "loss": 0.8872, + "step": 31620 + }, + { + "epoch": 6.7450938566552905, + "grad_norm": 0.0730450596295687, + "learning_rate": 6.454552614552167e-07, + "loss": 0.8777, + "step": 31621 + }, + { + "epoch": 6.745307167235495, + "grad_norm": 0.07186246665496873, + "learning_rate": 6.443769055276328e-07, + "loss": 0.8982, + "step": 31622 + }, + { + "epoch": 6.7455204778157, + "grad_norm": 0.07561831466124198, + "learning_rate": 6.432994475102571e-07, + "loss": 0.8785, + "step": 31623 + }, + { + "epoch": 6.7457337883959045, + "grad_norm": 0.07205507317412663, + "learning_rate": 6.422228874152847e-07, + "loss": 0.8799, + "step": 31624 + }, + { + "epoch": 6.745947098976109, + "grad_norm": 0.06968251392314033, + "learning_rate": 6.4114722525491e-07, + "loss": 0.8354, + "step": 31625 + }, + { + "epoch": 6.746160409556314, + "grad_norm": 0.07872249446896953, + "learning_rate": 6.400724610412745e-07, + "loss": 0.8801, + "step": 31626 + }, + { + "epoch": 6.746373720136519, + "grad_norm": 0.07406780073890316, + "learning_rate": 6.389985947865551e-07, + "loss": 0.8946, + "step": 31627 + }, + { + "epoch": 6.746587030716723, + "grad_norm": 0.07248576868628738, + "learning_rate": 6.379256265029022e-07, + "loss": 0.9065, + "step": 31628 + }, + { + "epoch": 6.746800341296928, + "grad_norm": 0.07704425491509315, + "learning_rate": 6.368535562024569e-07, + "loss": 0.8907, + "step": 31629 + }, + { + "epoch": 6.747013651877133, + "grad_norm": 0.07486620635239406, + "learning_rate": 6.357823838973432e-07, + "loss": 0.8924, + "step": 31630 + }, + { + "epoch": 6.747226962457338, + "grad_norm": 0.07147297391619854, + "learning_rate": 6.347121095996933e-07, + "loss": 0.8588, + "step": 31631 + }, + { + "epoch": 6.747440273037543, + "grad_norm": 0.07280781347124494, + "learning_rate": 6.336427333216044e-07, + "loss": 0.8888, + "step": 31632 + }, + { + "epoch": 6.747653583617748, + "grad_norm": 0.07135370638797495, + "learning_rate": 6.325742550751912e-07, + "loss": 0.8868, + "step": 31633 + }, + { + "epoch": 6.747866894197952, + "grad_norm": 0.07084749669895475, + "learning_rate": 6.31506674872533e-07, + "loss": 0.8984, + "step": 31634 + }, + { + "epoch": 6.748080204778157, + "grad_norm": 0.07647314886043247, + "learning_rate": 6.304399927257088e-07, + "loss": 0.8968, + "step": 31635 + }, + { + "epoch": 6.748293515358362, + "grad_norm": 0.07230297303713373, + "learning_rate": 6.29374208646798e-07, + "loss": 0.8611, + "step": 31636 + }, + { + "epoch": 6.748506825938566, + "grad_norm": 0.07210266295302455, + "learning_rate": 6.283093226478443e-07, + "loss": 0.8776, + "step": 31637 + }, + { + "epoch": 6.748720136518771, + "grad_norm": 0.07349665476056587, + "learning_rate": 6.272453347409091e-07, + "loss": 0.8784, + "step": 31638 + }, + { + "epoch": 6.748933447098976, + "grad_norm": 0.07362922552300757, + "learning_rate": 6.261822449380362e-07, + "loss": 0.8934, + "step": 31639 + }, + { + "epoch": 6.749146757679181, + "grad_norm": 0.07769486852174644, + "learning_rate": 6.251200532512336e-07, + "loss": 0.8999, + "step": 31640 + }, + { + "epoch": 6.749360068259386, + "grad_norm": 0.07456311914295138, + "learning_rate": 6.240587596925362e-07, + "loss": 0.8883, + "step": 31641 + }, + { + "epoch": 6.749573378839591, + "grad_norm": 0.06999151267454037, + "learning_rate": 6.229983642739524e-07, + "loss": 0.9074, + "step": 31642 + }, + { + "epoch": 6.749786689419795, + "grad_norm": 0.07410162555542066, + "learning_rate": 6.219388670074722e-07, + "loss": 0.8671, + "step": 31643 + }, + { + "epoch": 6.75, + "grad_norm": 0.07315220806060697, + "learning_rate": 6.208802679050863e-07, + "loss": 0.8597, + "step": 31644 + }, + { + "epoch": 6.750213310580205, + "grad_norm": 0.07451217394478314, + "learning_rate": 6.19822566978785e-07, + "loss": 0.8838, + "step": 31645 + }, + { + "epoch": 6.750426621160409, + "grad_norm": 0.07348492891688421, + "learning_rate": 6.187657642405232e-07, + "loss": 0.8835, + "step": 31646 + }, + { + "epoch": 6.750639931740614, + "grad_norm": 0.07456475377611133, + "learning_rate": 6.177098597022646e-07, + "loss": 0.8996, + "step": 31647 + }, + { + "epoch": 6.750853242320819, + "grad_norm": 0.07161801479034738, + "learning_rate": 6.166548533759464e-07, + "loss": 0.8469, + "step": 31648 + }, + { + "epoch": 6.751066552901024, + "grad_norm": 0.07287411800743074, + "learning_rate": 6.156007452735235e-07, + "loss": 0.8782, + "step": 31649 + }, + { + "epoch": 6.751279863481229, + "grad_norm": 0.07896684890254503, + "learning_rate": 6.145475354069063e-07, + "loss": 0.8927, + "step": 31650 + }, + { + "epoch": 6.751493174061434, + "grad_norm": 0.07861924959127424, + "learning_rate": 6.134952237880232e-07, + "loss": 0.885, + "step": 31651 + }, + { + "epoch": 6.751706484641638, + "grad_norm": 0.07783313403285062, + "learning_rate": 6.124438104287666e-07, + "loss": 0.9049, + "step": 31652 + }, + { + "epoch": 6.751919795221843, + "grad_norm": 0.06901337581973727, + "learning_rate": 6.113932953410561e-07, + "loss": 0.9109, + "step": 31653 + }, + { + "epoch": 6.752133105802048, + "grad_norm": 0.07155748899006585, + "learning_rate": 6.103436785367578e-07, + "loss": 0.8884, + "step": 31654 + }, + { + "epoch": 6.752346416382252, + "grad_norm": 0.07318938543769704, + "learning_rate": 6.092949600277642e-07, + "loss": 0.8661, + "step": 31655 + }, + { + "epoch": 6.752559726962457, + "grad_norm": 0.0689894546923197, + "learning_rate": 6.082471398259238e-07, + "loss": 0.85, + "step": 31656 + }, + { + "epoch": 6.752773037542662, + "grad_norm": 0.07502999693573166, + "learning_rate": 6.072002179431114e-07, + "loss": 0.8948, + "step": 31657 + }, + { + "epoch": 6.752986348122867, + "grad_norm": 0.07731306807586565, + "learning_rate": 6.061541943911576e-07, + "loss": 0.8717, + "step": 31658 + }, + { + "epoch": 6.753199658703072, + "grad_norm": 0.07174474972263722, + "learning_rate": 6.051090691819106e-07, + "loss": 0.8822, + "step": 31659 + }, + { + "epoch": 6.753412969283277, + "grad_norm": 0.07075560541394091, + "learning_rate": 6.040648423271833e-07, + "loss": 0.877, + "step": 31660 + }, + { + "epoch": 6.753626279863481, + "grad_norm": 0.07288177257160036, + "learning_rate": 6.030215138387973e-07, + "loss": 0.8703, + "step": 31661 + }, + { + "epoch": 6.753839590443686, + "grad_norm": 0.06699550358335153, + "learning_rate": 6.019790837285655e-07, + "loss": 0.8497, + "step": 31662 + }, + { + "epoch": 6.754052901023891, + "grad_norm": 0.07569285527390157, + "learning_rate": 6.009375520082738e-07, + "loss": 0.9072, + "step": 31663 + }, + { + "epoch": 6.7542662116040955, + "grad_norm": 0.07248769113033969, + "learning_rate": 5.998969186897086e-07, + "loss": 0.8949, + "step": 31664 + }, + { + "epoch": 6.7544795221843, + "grad_norm": 0.07626071581678229, + "learning_rate": 5.98857183784638e-07, + "loss": 0.9399, + "step": 31665 + }, + { + "epoch": 6.754692832764505, + "grad_norm": 0.07315084538782804, + "learning_rate": 5.978183473048393e-07, + "loss": 0.8983, + "step": 31666 + }, + { + "epoch": 6.7549061433447095, + "grad_norm": 0.06831332368297284, + "learning_rate": 5.967804092620632e-07, + "loss": 0.8186, + "step": 31667 + }, + { + "epoch": 6.755119453924914, + "grad_norm": 0.0695249727928124, + "learning_rate": 5.957433696680514e-07, + "loss": 0.8592, + "step": 31668 + }, + { + "epoch": 6.75533276450512, + "grad_norm": 0.07095931804302054, + "learning_rate": 5.947072285345367e-07, + "loss": 0.8724, + "step": 31669 + }, + { + "epoch": 6.7555460750853245, + "grad_norm": 0.07208296364400096, + "learning_rate": 5.93671985873261e-07, + "loss": 0.8857, + "step": 31670 + }, + { + "epoch": 6.755759385665529, + "grad_norm": 0.0738512872373154, + "learning_rate": 5.926376416959034e-07, + "loss": 0.8502, + "step": 31671 + }, + { + "epoch": 6.755972696245734, + "grad_norm": 0.0746119526773863, + "learning_rate": 5.916041960141883e-07, + "loss": 0.8666, + "step": 31672 + }, + { + "epoch": 6.7561860068259385, + "grad_norm": 0.07105223814337651, + "learning_rate": 5.905716488398128e-07, + "loss": 0.8724, + "step": 31673 + }, + { + "epoch": 6.756399317406143, + "grad_norm": 0.07088692179834857, + "learning_rate": 5.895400001844564e-07, + "loss": 0.8794, + "step": 31674 + }, + { + "epoch": 6.756612627986348, + "grad_norm": 0.07181423442404695, + "learning_rate": 5.8850925005979e-07, + "loss": 0.8685, + "step": 31675 + }, + { + "epoch": 6.756825938566553, + "grad_norm": 0.06881275809428014, + "learning_rate": 5.874793984774662e-07, + "loss": 0.8672, + "step": 31676 + }, + { + "epoch": 6.757039249146757, + "grad_norm": 0.06845874652800724, + "learning_rate": 5.864504454491649e-07, + "loss": 0.8741, + "step": 31677 + }, + { + "epoch": 6.757252559726963, + "grad_norm": 0.0702689160003281, + "learning_rate": 5.854223909865031e-07, + "loss": 0.8871, + "step": 31678 + }, + { + "epoch": 6.7574658703071675, + "grad_norm": 0.07256509508036758, + "learning_rate": 5.843952351011251e-07, + "loss": 0.8564, + "step": 31679 + }, + { + "epoch": 6.757679180887372, + "grad_norm": 0.07242825587260063, + "learning_rate": 5.833689778046481e-07, + "loss": 0.8657, + "step": 31680 + }, + { + "epoch": 6.757892491467577, + "grad_norm": 0.07254148056864773, + "learning_rate": 5.823436191086895e-07, + "loss": 0.8952, + "step": 31681 + }, + { + "epoch": 6.7581058020477816, + "grad_norm": 0.07108622246352092, + "learning_rate": 5.813191590248491e-07, + "loss": 0.8631, + "step": 31682 + }, + { + "epoch": 6.758319112627986, + "grad_norm": 0.07414525481371234, + "learning_rate": 5.802955975647173e-07, + "loss": 0.8752, + "step": 31683 + }, + { + "epoch": 6.758532423208191, + "grad_norm": 0.07454640371368326, + "learning_rate": 5.792729347398763e-07, + "loss": 0.8969, + "step": 31684 + }, + { + "epoch": 6.758745733788396, + "grad_norm": 0.07304189259379842, + "learning_rate": 5.782511705618987e-07, + "loss": 0.8764, + "step": 31685 + }, + { + "epoch": 6.7589590443686, + "grad_norm": 0.06993873351428377, + "learning_rate": 5.772303050423489e-07, + "loss": 0.8709, + "step": 31686 + }, + { + "epoch": 6.759172354948806, + "grad_norm": 0.06820579911689294, + "learning_rate": 5.762103381927731e-07, + "loss": 0.8676, + "step": 31687 + }, + { + "epoch": 6.7593856655290105, + "grad_norm": 0.07365505412779813, + "learning_rate": 5.751912700246998e-07, + "loss": 0.8753, + "step": 31688 + }, + { + "epoch": 6.759598976109215, + "grad_norm": 0.07094948858659743, + "learning_rate": 5.741731005496842e-07, + "loss": 0.9044, + "step": 31689 + }, + { + "epoch": 6.75981228668942, + "grad_norm": 0.0721043676829353, + "learning_rate": 5.731558297792372e-07, + "loss": 0.8856, + "step": 31690 + }, + { + "epoch": 6.760025597269625, + "grad_norm": 0.07629191287767892, + "learning_rate": 5.721394577248696e-07, + "loss": 0.8659, + "step": 31691 + }, + { + "epoch": 6.760238907849829, + "grad_norm": 0.06957356403066284, + "learning_rate": 5.711239843980742e-07, + "loss": 0.8797, + "step": 31692 + }, + { + "epoch": 6.760452218430034, + "grad_norm": 0.07016026418945528, + "learning_rate": 5.701094098103532e-07, + "loss": 0.8806, + "step": 31693 + }, + { + "epoch": 6.760665529010239, + "grad_norm": 0.07449366759226471, + "learning_rate": 5.690957339731728e-07, + "loss": 0.8726, + "step": 31694 + }, + { + "epoch": 6.760878839590443, + "grad_norm": 0.07192462964985202, + "learning_rate": 5.680829568980173e-07, + "loss": 0.851, + "step": 31695 + }, + { + "epoch": 6.761092150170649, + "grad_norm": 0.07107657087087141, + "learning_rate": 5.670710785963352e-07, + "loss": 0.8909, + "step": 31696 + }, + { + "epoch": 6.761305460750854, + "grad_norm": 0.06938074263304975, + "learning_rate": 5.66060099079575e-07, + "loss": 0.8776, + "step": 31697 + }, + { + "epoch": 6.761518771331058, + "grad_norm": 0.074005433516341, + "learning_rate": 5.650500183591944e-07, + "loss": 0.8659, + "step": 31698 + }, + { + "epoch": 6.761732081911263, + "grad_norm": 0.07388859613336231, + "learning_rate": 5.640408364465977e-07, + "loss": 0.8607, + "step": 31699 + }, + { + "epoch": 6.761945392491468, + "grad_norm": 0.07451054733875419, + "learning_rate": 5.630325533532155e-07, + "loss": 0.9047, + "step": 31700 + }, + { + "epoch": 6.762158703071672, + "grad_norm": 0.0710707522134694, + "learning_rate": 5.62025169090461e-07, + "loss": 0.8852, + "step": 31701 + }, + { + "epoch": 6.762372013651877, + "grad_norm": 0.07514079544129308, + "learning_rate": 5.610186836697296e-07, + "loss": 0.8794, + "step": 31702 + }, + { + "epoch": 6.762585324232082, + "grad_norm": 0.07925799692396968, + "learning_rate": 5.600130971023987e-07, + "loss": 0.9075, + "step": 31703 + }, + { + "epoch": 6.762798634812286, + "grad_norm": 0.07114815584668395, + "learning_rate": 5.590084093998638e-07, + "loss": 0.8582, + "step": 31704 + }, + { + "epoch": 6.763011945392492, + "grad_norm": 0.07761116439048141, + "learning_rate": 5.580046205734846e-07, + "loss": 0.8827, + "step": 31705 + }, + { + "epoch": 6.763225255972696, + "grad_norm": 0.07327129886775731, + "learning_rate": 5.57001730634621e-07, + "loss": 0.8904, + "step": 31706 + }, + { + "epoch": 6.763438566552901, + "grad_norm": 0.07449841319761151, + "learning_rate": 5.55999739594606e-07, + "loss": 0.8482, + "step": 31707 + }, + { + "epoch": 6.763651877133106, + "grad_norm": 0.07353590212818477, + "learning_rate": 5.549986474647994e-07, + "loss": 0.8844, + "step": 31708 + }, + { + "epoch": 6.763865187713311, + "grad_norm": 0.06915048472379301, + "learning_rate": 5.539984542565257e-07, + "loss": 0.8918, + "step": 31709 + }, + { + "epoch": 6.764078498293515, + "grad_norm": 0.06938452465969279, + "learning_rate": 5.529991599810913e-07, + "loss": 0.8418, + "step": 31710 + }, + { + "epoch": 6.76429180887372, + "grad_norm": 0.07174316904711096, + "learning_rate": 5.520007646498027e-07, + "loss": 0.9112, + "step": 31711 + }, + { + "epoch": 6.764505119453925, + "grad_norm": 0.07088443829219154, + "learning_rate": 5.510032682739574e-07, + "loss": 0.8778, + "step": 31712 + }, + { + "epoch": 6.764718430034129, + "grad_norm": 0.07812431663212148, + "learning_rate": 5.500066708648621e-07, + "loss": 0.8613, + "step": 31713 + }, + { + "epoch": 6.764931740614334, + "grad_norm": 0.06857408264376377, + "learning_rate": 5.4901097243377e-07, + "loss": 0.8404, + "step": 31714 + }, + { + "epoch": 6.765145051194539, + "grad_norm": 0.07779826389688607, + "learning_rate": 5.48016172991952e-07, + "loss": 0.8816, + "step": 31715 + }, + { + "epoch": 6.765358361774744, + "grad_norm": 0.0742131764524723, + "learning_rate": 5.470222725506702e-07, + "loss": 0.8915, + "step": 31716 + }, + { + "epoch": 6.765571672354949, + "grad_norm": 0.07651929471068973, + "learning_rate": 5.460292711211689e-07, + "loss": 0.8841, + "step": 31717 + }, + { + "epoch": 6.765784982935154, + "grad_norm": 0.06928097987847989, + "learning_rate": 5.450371687146839e-07, + "loss": 0.8871, + "step": 31718 + }, + { + "epoch": 6.765998293515358, + "grad_norm": 0.06997102102121901, + "learning_rate": 5.440459653424323e-07, + "loss": 0.868, + "step": 31719 + }, + { + "epoch": 6.766211604095563, + "grad_norm": 0.07591920337630059, + "learning_rate": 5.430556610156412e-07, + "loss": 0.8574, + "step": 31720 + }, + { + "epoch": 6.766424914675768, + "grad_norm": 0.07344233720737987, + "learning_rate": 5.420662557455192e-07, + "loss": 0.8655, + "step": 31721 + }, + { + "epoch": 6.7666382252559725, + "grad_norm": 0.07393821919387854, + "learning_rate": 5.410777495432485e-07, + "loss": 0.8729, + "step": 31722 + }, + { + "epoch": 6.766851535836177, + "grad_norm": 0.07583699417593454, + "learning_rate": 5.4009014242002e-07, + "loss": 0.8829, + "step": 31723 + }, + { + "epoch": 6.767064846416382, + "grad_norm": 0.07445812863899706, + "learning_rate": 5.391034343869983e-07, + "loss": 0.8547, + "step": 31724 + }, + { + "epoch": 6.767278156996587, + "grad_norm": 0.0744071855423837, + "learning_rate": 5.381176254553744e-07, + "loss": 0.8679, + "step": 31725 + }, + { + "epoch": 6.767491467576792, + "grad_norm": 0.07360318978132338, + "learning_rate": 5.371327156362771e-07, + "loss": 0.8908, + "step": 31726 + }, + { + "epoch": 6.767704778156997, + "grad_norm": 0.06777484838269444, + "learning_rate": 5.36148704940862e-07, + "loss": 0.8586, + "step": 31727 + }, + { + "epoch": 6.7679180887372015, + "grad_norm": 0.07370755701584006, + "learning_rate": 5.35165593380258e-07, + "loss": 0.9055, + "step": 31728 + }, + { + "epoch": 6.768131399317406, + "grad_norm": 0.0726179187234236, + "learning_rate": 5.341833809655938e-07, + "loss": 0.8745, + "step": 31729 + }, + { + "epoch": 6.768344709897611, + "grad_norm": 0.0752793384054271, + "learning_rate": 5.332020677079808e-07, + "loss": 0.8737, + "step": 31730 + }, + { + "epoch": 6.7685580204778155, + "grad_norm": 0.07125539895429134, + "learning_rate": 5.32221653618521e-07, + "loss": 0.8727, + "step": 31731 + }, + { + "epoch": 6.76877133105802, + "grad_norm": 0.07356555330263737, + "learning_rate": 5.312421387083166e-07, + "loss": 0.8671, + "step": 31732 + }, + { + "epoch": 6.768984641638225, + "grad_norm": 0.07441819531793462, + "learning_rate": 5.302635229884435e-07, + "loss": 0.8836, + "step": 31733 + }, + { + "epoch": 6.7691979522184305, + "grad_norm": 0.07936108320232528, + "learning_rate": 5.29285806469968e-07, + "loss": 0.8726, + "step": 31734 + }, + { + "epoch": 6.769411262798635, + "grad_norm": 0.07507834215848921, + "learning_rate": 5.283089891639659e-07, + "loss": 0.9158, + "step": 31735 + }, + { + "epoch": 6.76962457337884, + "grad_norm": 0.07254305739005427, + "learning_rate": 5.273330710814773e-07, + "loss": 0.8751, + "step": 31736 + }, + { + "epoch": 6.7698378839590445, + "grad_norm": 0.07484322355363193, + "learning_rate": 5.263580522335598e-07, + "loss": 0.8708, + "step": 31737 + }, + { + "epoch": 6.770051194539249, + "grad_norm": 0.07366429007199658, + "learning_rate": 5.25383932631236e-07, + "loss": 0.9009, + "step": 31738 + }, + { + "epoch": 6.770264505119454, + "grad_norm": 0.06957238032171555, + "learning_rate": 5.244107122855279e-07, + "loss": 0.8765, + "step": 31739 + }, + { + "epoch": 6.770477815699659, + "grad_norm": 0.07457951228513227, + "learning_rate": 5.234383912074492e-07, + "loss": 0.8889, + "step": 31740 + }, + { + "epoch": 6.770691126279863, + "grad_norm": 0.07369982906978234, + "learning_rate": 5.224669694080042e-07, + "loss": 0.8958, + "step": 31741 + }, + { + "epoch": 6.770904436860068, + "grad_norm": 0.07152834175618454, + "learning_rate": 5.214964468981798e-07, + "loss": 0.8889, + "step": 31742 + }, + { + "epoch": 6.7711177474402735, + "grad_norm": 0.07268489060211371, + "learning_rate": 5.205268236889627e-07, + "loss": 0.8907, + "step": 31743 + }, + { + "epoch": 6.771331058020478, + "grad_norm": 0.07744688438267179, + "learning_rate": 5.195580997913219e-07, + "loss": 0.9107, + "step": 31744 + }, + { + "epoch": 6.771544368600683, + "grad_norm": 0.0722996372633527, + "learning_rate": 5.185902752162175e-07, + "loss": 0.8683, + "step": 31745 + }, + { + "epoch": 6.771757679180888, + "grad_norm": 0.0800969864893249, + "learning_rate": 5.176233499746008e-07, + "loss": 0.8889, + "step": 31746 + }, + { + "epoch": 6.771970989761092, + "grad_norm": 0.0667193781340089, + "learning_rate": 5.166573240774053e-07, + "loss": 0.8882, + "step": 31747 + }, + { + "epoch": 6.772184300341297, + "grad_norm": 0.0691144135217262, + "learning_rate": 5.156921975355822e-07, + "loss": 0.8792, + "step": 31748 + }, + { + "epoch": 6.772397610921502, + "grad_norm": 0.07323382525358366, + "learning_rate": 5.147279703600294e-07, + "loss": 0.8855, + "step": 31749 + }, + { + "epoch": 6.772610921501706, + "grad_norm": 0.07366307969463816, + "learning_rate": 5.137646425616716e-07, + "loss": 0.8864, + "step": 31750 + }, + { + "epoch": 6.772824232081911, + "grad_norm": 0.07475429967630805, + "learning_rate": 5.128022141513978e-07, + "loss": 0.9085, + "step": 31751 + }, + { + "epoch": 6.773037542662116, + "grad_norm": 0.07079597875222186, + "learning_rate": 5.118406851401059e-07, + "loss": 0.8679, + "step": 31752 + }, + { + "epoch": 6.77325085324232, + "grad_norm": 0.0752220962907416, + "learning_rate": 5.108800555386761e-07, + "loss": 0.9093, + "step": 31753 + }, + { + "epoch": 6.773464163822526, + "grad_norm": 0.07230704102846404, + "learning_rate": 5.099203253579799e-07, + "loss": 0.8755, + "step": 31754 + }, + { + "epoch": 6.773677474402731, + "grad_norm": 0.07395554886428238, + "learning_rate": 5.089614946088617e-07, + "loss": 0.8843, + "step": 31755 + }, + { + "epoch": 6.773890784982935, + "grad_norm": 0.07895708934629492, + "learning_rate": 5.08003563302184e-07, + "loss": 0.9058, + "step": 31756 + }, + { + "epoch": 6.77410409556314, + "grad_norm": 0.0739336004096263, + "learning_rate": 5.070465314487916e-07, + "loss": 0.8797, + "step": 31757 + }, + { + "epoch": 6.774317406143345, + "grad_norm": 0.0728855753438658, + "learning_rate": 5.060903990594934e-07, + "loss": 0.8591, + "step": 31758 + }, + { + "epoch": 6.774530716723549, + "grad_norm": 0.06823044638537774, + "learning_rate": 5.051351661451254e-07, + "loss": 0.8529, + "step": 31759 + }, + { + "epoch": 6.774744027303754, + "grad_norm": 0.07091729248775179, + "learning_rate": 5.041808327164965e-07, + "loss": 0.8939, + "step": 31760 + }, + { + "epoch": 6.774957337883959, + "grad_norm": 0.07549524272995528, + "learning_rate": 5.032273987843894e-07, + "loss": 0.8784, + "step": 31761 + }, + { + "epoch": 6.775170648464163, + "grad_norm": 0.07205549324498614, + "learning_rate": 5.022748643596043e-07, + "loss": 0.8699, + "step": 31762 + }, + { + "epoch": 6.775383959044369, + "grad_norm": 0.07124068505430342, + "learning_rate": 5.013232294529147e-07, + "loss": 0.841, + "step": 31763 + }, + { + "epoch": 6.775597269624574, + "grad_norm": 0.0754155760354346, + "learning_rate": 5.003724940750942e-07, + "loss": 0.9003, + "step": 31764 + }, + { + "epoch": 6.775810580204778, + "grad_norm": 0.07363258500884749, + "learning_rate": 4.9942265823689e-07, + "loss": 0.8682, + "step": 31765 + }, + { + "epoch": 6.776023890784983, + "grad_norm": 0.06828225042719187, + "learning_rate": 4.984737219490577e-07, + "loss": 0.8843, + "step": 31766 + }, + { + "epoch": 6.776237201365188, + "grad_norm": 0.06922749940669992, + "learning_rate": 4.975256852223353e-07, + "loss": 0.876, + "step": 31767 + }, + { + "epoch": 6.776450511945392, + "grad_norm": 0.0736639990736194, + "learning_rate": 4.965785480674435e-07, + "loss": 0.8857, + "step": 31768 + }, + { + "epoch": 6.776663822525597, + "grad_norm": 0.07663165109873382, + "learning_rate": 4.956323104951022e-07, + "loss": 0.8659, + "step": 31769 + }, + { + "epoch": 6.776877133105802, + "grad_norm": 0.07982734649817054, + "learning_rate": 4.94686972516023e-07, + "loss": 0.864, + "step": 31770 + }, + { + "epoch": 6.777090443686006, + "grad_norm": 0.07229934870892757, + "learning_rate": 4.937425341408996e-07, + "loss": 0.8843, + "step": 31771 + }, + { + "epoch": 6.777303754266212, + "grad_norm": 0.07735898323362347, + "learning_rate": 4.92798995380408e-07, + "loss": 0.8883, + "step": 31772 + }, + { + "epoch": 6.777517064846417, + "grad_norm": 0.07230093250095632, + "learning_rate": 4.918563562452416e-07, + "loss": 0.8871, + "step": 31773 + }, + { + "epoch": 6.777730375426621, + "grad_norm": 0.07325854817848988, + "learning_rate": 4.909146167460588e-07, + "loss": 0.8856, + "step": 31774 + }, + { + "epoch": 6.777943686006826, + "grad_norm": 0.07152258254695629, + "learning_rate": 4.899737768935087e-07, + "loss": 0.8749, + "step": 31775 + }, + { + "epoch": 6.778156996587031, + "grad_norm": 0.06774467172759127, + "learning_rate": 4.890338366982405e-07, + "loss": 0.8675, + "step": 31776 + }, + { + "epoch": 6.778370307167235, + "grad_norm": 0.07098609437484511, + "learning_rate": 4.880947961708948e-07, + "loss": 0.8813, + "step": 31777 + }, + { + "epoch": 6.77858361774744, + "grad_norm": 0.07661332010974907, + "learning_rate": 4.871566553220941e-07, + "loss": 0.8581, + "step": 31778 + }, + { + "epoch": 6.778796928327645, + "grad_norm": 0.07391746495883764, + "learning_rate": 4.862194141624521e-07, + "loss": 0.8866, + "step": 31779 + }, + { + "epoch": 6.7790102389078495, + "grad_norm": 0.07247419226431649, + "learning_rate": 4.852830727025825e-07, + "loss": 0.8524, + "step": 31780 + }, + { + "epoch": 6.779223549488055, + "grad_norm": 0.07691197232978077, + "learning_rate": 4.843476309530637e-07, + "loss": 0.8686, + "step": 31781 + }, + { + "epoch": 6.77943686006826, + "grad_norm": 0.07755142957607698, + "learning_rate": 4.834130889244915e-07, + "loss": 0.8788, + "step": 31782 + }, + { + "epoch": 6.779650170648464, + "grad_norm": 0.0714673558466557, + "learning_rate": 4.824794466274352e-07, + "loss": 0.8748, + "step": 31783 + }, + { + "epoch": 6.779863481228669, + "grad_norm": 0.07188541324252298, + "learning_rate": 4.815467040724642e-07, + "loss": 0.8901, + "step": 31784 + }, + { + "epoch": 6.780076791808874, + "grad_norm": 0.07288889077472199, + "learning_rate": 4.8061486127013e-07, + "loss": 0.8871, + "step": 31785 + }, + { + "epoch": 6.7802901023890785, + "grad_norm": 0.07009175925461501, + "learning_rate": 4.796839182309842e-07, + "loss": 0.8782, + "step": 31786 + }, + { + "epoch": 6.780503412969283, + "grad_norm": 0.0764245952207446, + "learning_rate": 4.787538749655341e-07, + "loss": 0.8986, + "step": 31787 + }, + { + "epoch": 6.780716723549488, + "grad_norm": 0.07540049027057519, + "learning_rate": 4.778247314843309e-07, + "loss": 0.8791, + "step": 31788 + }, + { + "epoch": 6.7809300341296925, + "grad_norm": 0.06986412415589278, + "learning_rate": 4.768964877978821e-07, + "loss": 0.8728, + "step": 31789 + }, + { + "epoch": 6.781143344709898, + "grad_norm": 0.07261029042258535, + "learning_rate": 4.759691439166858e-07, + "loss": 0.8822, + "step": 31790 + }, + { + "epoch": 6.781356655290102, + "grad_norm": 0.07337879436840014, + "learning_rate": 4.750426998512314e-07, + "loss": 0.8601, + "step": 31791 + }, + { + "epoch": 6.7815699658703075, + "grad_norm": 0.0725645254859011, + "learning_rate": 4.741171556120083e-07, + "loss": 0.8711, + "step": 31792 + }, + { + "epoch": 6.781783276450512, + "grad_norm": 0.07026875969352488, + "learning_rate": 4.7319251120949705e-07, + "loss": 0.9039, + "step": 31793 + }, + { + "epoch": 6.781996587030717, + "grad_norm": 0.07278834281256676, + "learning_rate": 4.722687666541426e-07, + "loss": 0.9009, + "step": 31794 + }, + { + "epoch": 6.7822098976109215, + "grad_norm": 0.06905077526899457, + "learning_rate": 4.7134592195639874e-07, + "loss": 0.8548, + "step": 31795 + }, + { + "epoch": 6.782423208191126, + "grad_norm": 0.07293003912016845, + "learning_rate": 4.704239771267194e-07, + "loss": 0.8759, + "step": 31796 + }, + { + "epoch": 6.782636518771331, + "grad_norm": 0.07577435177572842, + "learning_rate": 4.6950293217553176e-07, + "loss": 0.8503, + "step": 31797 + }, + { + "epoch": 6.782849829351536, + "grad_norm": 0.07147405482372438, + "learning_rate": 4.685827871132542e-07, + "loss": 0.8599, + "step": 31798 + }, + { + "epoch": 6.78306313993174, + "grad_norm": 0.07574699670403831, + "learning_rate": 4.676635419502962e-07, + "loss": 0.8887, + "step": 31799 + }, + { + "epoch": 6.783276450511945, + "grad_norm": 0.07629072156551969, + "learning_rate": 4.667451966970671e-07, + "loss": 0.9015, + "step": 31800 + }, + { + "epoch": 6.7834897610921505, + "grad_norm": 0.0703871237782716, + "learning_rate": 4.6582775136394976e-07, + "loss": 0.9041, + "step": 31801 + }, + { + "epoch": 6.783703071672355, + "grad_norm": 0.06936133529208621, + "learning_rate": 4.649112059613359e-07, + "loss": 0.8841, + "step": 31802 + }, + { + "epoch": 6.78391638225256, + "grad_norm": 0.07216946237239755, + "learning_rate": 4.6399556049958163e-07, + "loss": 0.8783, + "step": 31803 + }, + { + "epoch": 6.784129692832765, + "grad_norm": 0.0762789122319891, + "learning_rate": 4.6308081498906976e-07, + "loss": 0.8904, + "step": 31804 + }, + { + "epoch": 6.784343003412969, + "grad_norm": 0.07186908537506338, + "learning_rate": 4.6216696944012984e-07, + "loss": 0.877, + "step": 31805 + }, + { + "epoch": 6.784556313993174, + "grad_norm": 0.0721573988793348, + "learning_rate": 4.612540238631003e-07, + "loss": 0.8753, + "step": 31806 + }, + { + "epoch": 6.784769624573379, + "grad_norm": 0.07712096662353458, + "learning_rate": 4.6034197826831937e-07, + "loss": 0.8906, + "step": 31807 + }, + { + "epoch": 6.784982935153583, + "grad_norm": 0.07822760902233646, + "learning_rate": 4.5943083266611675e-07, + "loss": 0.8901, + "step": 31808 + }, + { + "epoch": 6.785196245733788, + "grad_norm": 0.07213308286041975, + "learning_rate": 4.585205870667864e-07, + "loss": 0.8632, + "step": 31809 + }, + { + "epoch": 6.785409556313994, + "grad_norm": 0.06945321781187627, + "learning_rate": 4.5761124148063993e-07, + "loss": 0.8757, + "step": 31810 + }, + { + "epoch": 6.785622866894198, + "grad_norm": 0.07102088221254452, + "learning_rate": 4.567027959179537e-07, + "loss": 0.8868, + "step": 31811 + }, + { + "epoch": 6.785836177474403, + "grad_norm": 0.07644293314799258, + "learning_rate": 4.5579525038901287e-07, + "loss": 0.8933, + "step": 31812 + }, + { + "epoch": 6.786049488054608, + "grad_norm": 0.0718608330670721, + "learning_rate": 4.5488860490408457e-07, + "loss": 0.8813, + "step": 31813 + }, + { + "epoch": 6.786262798634812, + "grad_norm": 0.07285406899145906, + "learning_rate": 4.5398285947343635e-07, + "loss": 0.8877, + "step": 31814 + }, + { + "epoch": 6.786476109215017, + "grad_norm": 0.07428251936233998, + "learning_rate": 4.530780141073088e-07, + "loss": 0.891, + "step": 31815 + }, + { + "epoch": 6.786689419795222, + "grad_norm": 0.07491540990246222, + "learning_rate": 4.521740688159426e-07, + "loss": 0.8877, + "step": 31816 + }, + { + "epoch": 6.786902730375426, + "grad_norm": 0.06894001350438239, + "learning_rate": 4.5127102360956074e-07, + "loss": 0.864, + "step": 31817 + }, + { + "epoch": 6.787116040955631, + "grad_norm": 0.07047008027013436, + "learning_rate": 4.503688784983862e-07, + "loss": 0.8808, + "step": 31818 + }, + { + "epoch": 6.787329351535837, + "grad_norm": 0.07288183639803027, + "learning_rate": 4.49467633492624e-07, + "loss": 0.8882, + "step": 31819 + }, + { + "epoch": 6.787542662116041, + "grad_norm": 0.07248088100257351, + "learning_rate": 4.4856728860247946e-07, + "loss": 0.8615, + "step": 31820 + }, + { + "epoch": 6.787755972696246, + "grad_norm": 0.06883607346179921, + "learning_rate": 4.47667843838131e-07, + "loss": 0.883, + "step": 31821 + }, + { + "epoch": 6.787969283276451, + "grad_norm": 0.07933566846089062, + "learning_rate": 4.467692992097572e-07, + "loss": 0.8726, + "step": 31822 + }, + { + "epoch": 6.788182593856655, + "grad_norm": 0.07877503741491834, + "learning_rate": 4.458716547275277e-07, + "loss": 0.8843, + "step": 31823 + }, + { + "epoch": 6.78839590443686, + "grad_norm": 0.07251929124358923, + "learning_rate": 4.449749104015943e-07, + "loss": 0.8585, + "step": 31824 + }, + { + "epoch": 6.788609215017065, + "grad_norm": 0.06777011238974076, + "learning_rate": 4.4407906624211795e-07, + "loss": 0.8437, + "step": 31825 + }, + { + "epoch": 6.788822525597269, + "grad_norm": 0.07098727839622584, + "learning_rate": 4.431841222592148e-07, + "loss": 0.8709, + "step": 31826 + }, + { + "epoch": 6.789035836177474, + "grad_norm": 0.0706788191101906, + "learning_rate": 4.4229007846302796e-07, + "loss": 0.8933, + "step": 31827 + }, + { + "epoch": 6.78924914675768, + "grad_norm": 0.07241189863491934, + "learning_rate": 4.413969348636649e-07, + "loss": 0.8869, + "step": 31828 + }, + { + "epoch": 6.789462457337884, + "grad_norm": 0.07313597933217364, + "learning_rate": 4.405046914712419e-07, + "loss": 0.8943, + "step": 31829 + }, + { + "epoch": 6.789675767918089, + "grad_norm": 0.07503700238212213, + "learning_rate": 4.396133482958398e-07, + "loss": 0.8487, + "step": 31830 + }, + { + "epoch": 6.789889078498294, + "grad_norm": 0.06902753171566212, + "learning_rate": 4.3872290534754837e-07, + "loss": 0.888, + "step": 31831 + }, + { + "epoch": 6.790102389078498, + "grad_norm": 0.07158557114423428, + "learning_rate": 4.378333626364395e-07, + "loss": 0.8719, + "step": 31832 + }, + { + "epoch": 6.790315699658703, + "grad_norm": 0.0708874204662874, + "learning_rate": 4.3694472017259404e-07, + "loss": 0.9005, + "step": 31833 + }, + { + "epoch": 6.790529010238908, + "grad_norm": 0.07253611847779302, + "learning_rate": 4.3605697796605727e-07, + "loss": 0.8571, + "step": 31834 + }, + { + "epoch": 6.790742320819112, + "grad_norm": 0.07330162056407563, + "learning_rate": 4.351701360268745e-07, + "loss": 0.9144, + "step": 31835 + }, + { + "epoch": 6.790955631399317, + "grad_norm": 0.07109490265466327, + "learning_rate": 4.342841943650733e-07, + "loss": 0.8673, + "step": 31836 + }, + { + "epoch": 6.791168941979522, + "grad_norm": 0.069579982207961, + "learning_rate": 4.3339915299069e-07, + "loss": 0.8656, + "step": 31837 + }, + { + "epoch": 6.7913822525597265, + "grad_norm": 0.08141885341299303, + "learning_rate": 4.325150119137345e-07, + "loss": 0.8957, + "step": 31838 + }, + { + "epoch": 6.791595563139932, + "grad_norm": 0.07080197382564377, + "learning_rate": 4.316317711442075e-07, + "loss": 0.8645, + "step": 31839 + }, + { + "epoch": 6.791808873720137, + "grad_norm": 0.07335214899002977, + "learning_rate": 4.3074943069211005e-07, + "loss": 0.8676, + "step": 31840 + }, + { + "epoch": 6.792022184300341, + "grad_norm": 0.07520118653575757, + "learning_rate": 4.2986799056741635e-07, + "loss": 0.8872, + "step": 31841 + }, + { + "epoch": 6.792235494880546, + "grad_norm": 0.07657227946536146, + "learning_rate": 4.289874507801095e-07, + "loss": 0.8916, + "step": 31842 + }, + { + "epoch": 6.792448805460751, + "grad_norm": 0.0732824531663012, + "learning_rate": 4.2810781134013713e-07, + "loss": 0.8686, + "step": 31843 + }, + { + "epoch": 6.7926621160409555, + "grad_norm": 0.07465808101535483, + "learning_rate": 4.2722907225747347e-07, + "loss": 0.8862, + "step": 31844 + }, + { + "epoch": 6.79287542662116, + "grad_norm": 0.07416079064587082, + "learning_rate": 4.2635123354204833e-07, + "loss": 0.8715, + "step": 31845 + }, + { + "epoch": 6.793088737201365, + "grad_norm": 0.07794551732019331, + "learning_rate": 4.2547429520380044e-07, + "loss": 0.9152, + "step": 31846 + }, + { + "epoch": 6.7933020477815695, + "grad_norm": 0.07365686672044512, + "learning_rate": 4.2459825725264193e-07, + "loss": 0.8829, + "step": 31847 + }, + { + "epoch": 6.793515358361775, + "grad_norm": 0.07162196916551486, + "learning_rate": 4.2372311969849367e-07, + "loss": 0.8806, + "step": 31848 + }, + { + "epoch": 6.79372866894198, + "grad_norm": 0.07155765492696918, + "learning_rate": 4.2284888255126776e-07, + "loss": 0.8963, + "step": 31849 + }, + { + "epoch": 6.7939419795221845, + "grad_norm": 0.07458355578211616, + "learning_rate": 4.219755458208319e-07, + "loss": 0.8678, + "step": 31850 + }, + { + "epoch": 6.794155290102389, + "grad_norm": 0.07055141513576908, + "learning_rate": 4.211031095170892e-07, + "loss": 0.8863, + "step": 31851 + }, + { + "epoch": 6.794368600682594, + "grad_norm": 0.07250084434563152, + "learning_rate": 4.202315736499074e-07, + "loss": 0.8856, + "step": 31852 + }, + { + "epoch": 6.7945819112627985, + "grad_norm": 0.07165252323702102, + "learning_rate": 4.1936093822913637e-07, + "loss": 0.8711, + "step": 31853 + }, + { + "epoch": 6.794795221843003, + "grad_norm": 0.08128787305947337, + "learning_rate": 4.1849120326463486e-07, + "loss": 0.887, + "step": 31854 + }, + { + "epoch": 6.795008532423208, + "grad_norm": 0.07294082097264243, + "learning_rate": 4.1762236876624397e-07, + "loss": 0.8827, + "step": 31855 + }, + { + "epoch": 6.795221843003413, + "grad_norm": 0.07325163708764235, + "learning_rate": 4.1675443474379574e-07, + "loss": 0.8895, + "step": 31856 + }, + { + "epoch": 6.795435153583618, + "grad_norm": 0.07202192419656397, + "learning_rate": 4.158874012071135e-07, + "loss": 0.8933, + "step": 31857 + }, + { + "epoch": 6.795648464163823, + "grad_norm": 0.07131520023434794, + "learning_rate": 4.1502126816599375e-07, + "loss": 0.8458, + "step": 31858 + }, + { + "epoch": 6.7958617747440275, + "grad_norm": 0.0716524726977004, + "learning_rate": 4.1415603563025097e-07, + "loss": 0.8868, + "step": 31859 + }, + { + "epoch": 6.796075085324232, + "grad_norm": 0.07506983128599126, + "learning_rate": 4.132917036096817e-07, + "loss": 0.8862, + "step": 31860 + }, + { + "epoch": 6.796288395904437, + "grad_norm": 0.07235369361515982, + "learning_rate": 4.1242827211404714e-07, + "loss": 0.8715, + "step": 31861 + }, + { + "epoch": 6.796501706484642, + "grad_norm": 0.07397889233734503, + "learning_rate": 4.1156574115312596e-07, + "loss": 0.8761, + "step": 31862 + }, + { + "epoch": 6.796715017064846, + "grad_norm": 0.07395929038602649, + "learning_rate": 4.1070411073667935e-07, + "loss": 0.8766, + "step": 31863 + }, + { + "epoch": 6.796928327645051, + "grad_norm": 0.0793832941110743, + "learning_rate": 4.098433808744506e-07, + "loss": 0.8969, + "step": 31864 + }, + { + "epoch": 6.797141638225256, + "grad_norm": 0.07298821846973794, + "learning_rate": 4.0898355157618306e-07, + "loss": 0.8918, + "step": 31865 + }, + { + "epoch": 6.797354948805461, + "grad_norm": 0.0711757954585924, + "learning_rate": 4.081246228516111e-07, + "loss": 0.8915, + "step": 31866 + }, + { + "epoch": 6.797568259385666, + "grad_norm": 0.06796407501858098, + "learning_rate": 4.0726659471043374e-07, + "loss": 0.8808, + "step": 31867 + }, + { + "epoch": 6.797781569965871, + "grad_norm": 0.07511479981117147, + "learning_rate": 4.064094671623853e-07, + "loss": 0.8573, + "step": 31868 + }, + { + "epoch": 6.797994880546075, + "grad_norm": 0.07124922386223116, + "learning_rate": 4.055532402171558e-07, + "loss": 0.9113, + "step": 31869 + }, + { + "epoch": 6.79820819112628, + "grad_norm": 0.0735060680083758, + "learning_rate": 4.0469791388442645e-07, + "loss": 0.8978, + "step": 31870 + }, + { + "epoch": 6.798421501706485, + "grad_norm": 0.07022193653275861, + "learning_rate": 4.0384348817387844e-07, + "loss": 0.8542, + "step": 31871 + }, + { + "epoch": 6.798634812286689, + "grad_norm": 0.07096853575852194, + "learning_rate": 4.02989963095175e-07, + "loss": 0.8937, + "step": 31872 + }, + { + "epoch": 6.798848122866894, + "grad_norm": 0.07041657488189718, + "learning_rate": 4.0213733865798854e-07, + "loss": 0.8738, + "step": 31873 + }, + { + "epoch": 6.799061433447099, + "grad_norm": 0.0719714534723802, + "learning_rate": 4.012856148719468e-07, + "loss": 0.8815, + "step": 31874 + }, + { + "epoch": 6.799274744027304, + "grad_norm": 0.07065299722431886, + "learning_rate": 4.004347917467044e-07, + "loss": 0.8829, + "step": 31875 + }, + { + "epoch": 6.799488054607508, + "grad_norm": 0.07442715745963717, + "learning_rate": 3.995848692918802e-07, + "loss": 0.8908, + "step": 31876 + }, + { + "epoch": 6.799701365187714, + "grad_norm": 0.07234068285094794, + "learning_rate": 3.987358475170844e-07, + "loss": 0.8977, + "step": 31877 + }, + { + "epoch": 6.799914675767918, + "grad_norm": 0.0702501459574637, + "learning_rate": 3.9788772643194475e-07, + "loss": 0.8776, + "step": 31878 + }, + { + "epoch": 6.800127986348123, + "grad_norm": 0.07018267879587627, + "learning_rate": 3.9704050604602697e-07, + "loss": 0.8847, + "step": 31879 + }, + { + "epoch": 6.800341296928328, + "grad_norm": 0.06814960058665442, + "learning_rate": 3.9619418636895e-07, + "loss": 0.8931, + "step": 31880 + }, + { + "epoch": 6.800554607508532, + "grad_norm": 0.07039019616555926, + "learning_rate": 3.953487674102707e-07, + "loss": 0.8656, + "step": 31881 + }, + { + "epoch": 6.800767918088737, + "grad_norm": 0.07261750193700799, + "learning_rate": 3.945042491795548e-07, + "loss": 0.8679, + "step": 31882 + }, + { + "epoch": 6.800981228668942, + "grad_norm": 0.07452490017924757, + "learning_rate": 3.936606316863678e-07, + "loss": 0.8804, + "step": 31883 + }, + { + "epoch": 6.801194539249146, + "grad_norm": 0.07361011241498705, + "learning_rate": 3.928179149402489e-07, + "loss": 0.8811, + "step": 31884 + }, + { + "epoch": 6.801407849829351, + "grad_norm": 0.07240327925322852, + "learning_rate": 3.9197609895073707e-07, + "loss": 0.9076, + "step": 31885 + }, + { + "epoch": 6.801621160409557, + "grad_norm": 0.07050944116571747, + "learning_rate": 3.911351837273447e-07, + "loss": 0.8701, + "step": 31886 + }, + { + "epoch": 6.801834470989761, + "grad_norm": 0.0716517094684252, + "learning_rate": 3.9029516927960197e-07, + "loss": 0.897, + "step": 31887 + }, + { + "epoch": 6.802047781569966, + "grad_norm": 0.07187565815384447, + "learning_rate": 3.894560556170124e-07, + "loss": 0.8719, + "step": 31888 + }, + { + "epoch": 6.802261092150171, + "grad_norm": 0.0731995993431973, + "learning_rate": 3.886178427490617e-07, + "loss": 0.8997, + "step": 31889 + }, + { + "epoch": 6.802474402730375, + "grad_norm": 0.07335404938477945, + "learning_rate": 3.877805306852445e-07, + "loss": 0.858, + "step": 31890 + }, + { + "epoch": 6.80268771331058, + "grad_norm": 0.0782006443435835, + "learning_rate": 3.869441194350287e-07, + "loss": 0.8723, + "step": 31891 + }, + { + "epoch": 6.802901023890785, + "grad_norm": 0.06939458993160033, + "learning_rate": 3.8610860900787364e-07, + "loss": 0.8783, + "step": 31892 + }, + { + "epoch": 6.8031143344709895, + "grad_norm": 0.0712655536127403, + "learning_rate": 3.8527399941324706e-07, + "loss": 0.8637, + "step": 31893 + }, + { + "epoch": 6.803327645051194, + "grad_norm": 0.07249718758610818, + "learning_rate": 3.8444029066057263e-07, + "loss": 0.8968, + "step": 31894 + }, + { + "epoch": 6.8035409556314, + "grad_norm": 0.07330554218905941, + "learning_rate": 3.8360748275930945e-07, + "loss": 0.9046, + "step": 31895 + }, + { + "epoch": 6.803754266211604, + "grad_norm": 0.07322974679835301, + "learning_rate": 3.827755757188545e-07, + "loss": 0.8986, + "step": 31896 + }, + { + "epoch": 6.803967576791809, + "grad_norm": 0.07462637559727257, + "learning_rate": 3.819445695486401e-07, + "loss": 0.8569, + "step": 31897 + }, + { + "epoch": 6.804180887372014, + "grad_norm": 0.07151573099167244, + "learning_rate": 3.811144642580544e-07, + "loss": 0.8605, + "step": 31898 + }, + { + "epoch": 6.8043941979522184, + "grad_norm": 0.06983358438087037, + "learning_rate": 3.802852598565121e-07, + "loss": 0.9012, + "step": 31899 + }, + { + "epoch": 6.804607508532423, + "grad_norm": 0.07114118626276762, + "learning_rate": 3.7945695635336564e-07, + "loss": 0.8881, + "step": 31900 + }, + { + "epoch": 6.804820819112628, + "grad_norm": 0.07215862447357192, + "learning_rate": 3.78629553758012e-07, + "loss": 0.9033, + "step": 31901 + }, + { + "epoch": 6.8050341296928325, + "grad_norm": 0.0731251229834952, + "learning_rate": 3.778030520798037e-07, + "loss": 0.8759, + "step": 31902 + }, + { + "epoch": 6.805247440273037, + "grad_norm": 0.07114275334413961, + "learning_rate": 3.769774513280933e-07, + "loss": 0.8952, + "step": 31903 + }, + { + "epoch": 6.805460750853243, + "grad_norm": 0.07576166235587013, + "learning_rate": 3.761527515122154e-07, + "loss": 0.8676, + "step": 31904 + }, + { + "epoch": 6.805674061433447, + "grad_norm": 0.07071958930233378, + "learning_rate": 3.753289526415227e-07, + "loss": 0.873, + "step": 31905 + }, + { + "epoch": 6.805887372013652, + "grad_norm": 0.0733826059824673, + "learning_rate": 3.7450605472530543e-07, + "loss": 0.8784, + "step": 31906 + }, + { + "epoch": 6.806100682593857, + "grad_norm": 0.0737037609156439, + "learning_rate": 3.736840577729073e-07, + "loss": 0.8676, + "step": 31907 + }, + { + "epoch": 6.8063139931740615, + "grad_norm": 0.07206542526406702, + "learning_rate": 3.728629617936008e-07, + "loss": 0.8881, + "step": 31908 + }, + { + "epoch": 6.806527303754266, + "grad_norm": 0.0713713066821171, + "learning_rate": 3.7204276679669417e-07, + "loss": 0.9022, + "step": 31909 + }, + { + "epoch": 6.806740614334471, + "grad_norm": 0.07264747464852667, + "learning_rate": 3.7122347279145986e-07, + "loss": 0.8679, + "step": 31910 + }, + { + "epoch": 6.8069539249146755, + "grad_norm": 0.06912720020717233, + "learning_rate": 3.704050797871794e-07, + "loss": 0.8804, + "step": 31911 + }, + { + "epoch": 6.80716723549488, + "grad_norm": 0.07002256598487709, + "learning_rate": 3.6958758779309876e-07, + "loss": 0.8994, + "step": 31912 + }, + { + "epoch": 6.807380546075086, + "grad_norm": 0.06906819535422912, + "learning_rate": 3.6877099681847274e-07, + "loss": 0.854, + "step": 31913 + }, + { + "epoch": 6.8075938566552905, + "grad_norm": 0.07124764559928276, + "learning_rate": 3.679553068725472e-07, + "loss": 0.8294, + "step": 31914 + }, + { + "epoch": 6.807807167235495, + "grad_norm": 0.07066194593154689, + "learning_rate": 3.671405179645415e-07, + "loss": 0.8658, + "step": 31915 + }, + { + "epoch": 6.8080204778157, + "grad_norm": 0.07628483916189728, + "learning_rate": 3.6632663010368385e-07, + "loss": 0.8792, + "step": 31916 + }, + { + "epoch": 6.8082337883959045, + "grad_norm": 0.07034951255101612, + "learning_rate": 3.655136432991846e-07, + "loss": 0.9072, + "step": 31917 + }, + { + "epoch": 6.808447098976109, + "grad_norm": 0.07662510342187372, + "learning_rate": 3.647015575602275e-07, + "loss": 0.8828, + "step": 31918 + }, + { + "epoch": 6.808660409556314, + "grad_norm": 0.07368256920669292, + "learning_rate": 3.6389037289601415e-07, + "loss": 0.8939, + "step": 31919 + }, + { + "epoch": 6.808873720136519, + "grad_norm": 0.06967796472606007, + "learning_rate": 3.6308008931572825e-07, + "loss": 0.8868, + "step": 31920 + }, + { + "epoch": 6.809087030716723, + "grad_norm": 0.0757966783576771, + "learning_rate": 3.6227070682852697e-07, + "loss": 0.8946, + "step": 31921 + }, + { + "epoch": 6.809300341296928, + "grad_norm": 0.07273917696900062, + "learning_rate": 3.6146222544356736e-07, + "loss": 0.8749, + "step": 31922 + }, + { + "epoch": 6.809513651877133, + "grad_norm": 0.07294224773273522, + "learning_rate": 3.606546451699977e-07, + "loss": 0.8569, + "step": 31923 + }, + { + "epoch": 6.809726962457338, + "grad_norm": 0.06958683723327011, + "learning_rate": 3.5984796601697514e-07, + "loss": 0.8894, + "step": 31924 + }, + { + "epoch": 6.809940273037543, + "grad_norm": 0.07587990731116356, + "learning_rate": 3.590421879936035e-07, + "loss": 0.8742, + "step": 31925 + }, + { + "epoch": 6.810153583617748, + "grad_norm": 0.07043941795272547, + "learning_rate": 3.582373111090132e-07, + "loss": 0.8619, + "step": 31926 + }, + { + "epoch": 6.810366894197952, + "grad_norm": 0.07301220818534603, + "learning_rate": 3.5743333537230806e-07, + "loss": 0.8873, + "step": 31927 + }, + { + "epoch": 6.810580204778157, + "grad_norm": 0.0727530531464241, + "learning_rate": 3.566302607925742e-07, + "loss": 0.894, + "step": 31928 + }, + { + "epoch": 6.810793515358362, + "grad_norm": 0.0775819291655475, + "learning_rate": 3.558280873789244e-07, + "loss": 0.909, + "step": 31929 + }, + { + "epoch": 6.811006825938566, + "grad_norm": 0.0789349974466628, + "learning_rate": 3.5502681514040017e-07, + "loss": 0.8755, + "step": 31930 + }, + { + "epoch": 6.811220136518771, + "grad_norm": 0.0678749151435401, + "learning_rate": 3.542264440861054e-07, + "loss": 0.8759, + "step": 31931 + }, + { + "epoch": 6.811433447098976, + "grad_norm": 0.07304749284699798, + "learning_rate": 3.53426974225064e-07, + "loss": 0.877, + "step": 31932 + }, + { + "epoch": 6.811646757679181, + "grad_norm": 0.07184552973732287, + "learning_rate": 3.5262840556634427e-07, + "loss": 0.8844, + "step": 31933 + }, + { + "epoch": 6.811860068259386, + "grad_norm": 0.07132183549673882, + "learning_rate": 3.5183073811897893e-07, + "loss": 0.8891, + "step": 31934 + }, + { + "epoch": 6.812073378839591, + "grad_norm": 0.07394427779473356, + "learning_rate": 3.510339718919742e-07, + "loss": 0.8867, + "step": 31935 + }, + { + "epoch": 6.812286689419795, + "grad_norm": 0.07202010428560095, + "learning_rate": 3.5023810689437166e-07, + "loss": 0.8554, + "step": 31936 + }, + { + "epoch": 6.8125, + "grad_norm": 0.07618544386150534, + "learning_rate": 3.4944314313515973e-07, + "loss": 0.9107, + "step": 31937 + }, + { + "epoch": 6.812713310580205, + "grad_norm": 0.0722624902279791, + "learning_rate": 3.4864908062333556e-07, + "loss": 0.879, + "step": 31938 + }, + { + "epoch": 6.812926621160409, + "grad_norm": 0.07981970223634198, + "learning_rate": 3.4785591936789654e-07, + "loss": 0.9057, + "step": 31939 + }, + { + "epoch": 6.813139931740614, + "grad_norm": 0.0712459710252154, + "learning_rate": 3.470636593777954e-07, + "loss": 0.8868, + "step": 31940 + }, + { + "epoch": 6.813353242320819, + "grad_norm": 0.07401607541513747, + "learning_rate": 3.4627230066202054e-07, + "loss": 0.8826, + "step": 31941 + }, + { + "epoch": 6.813566552901024, + "grad_norm": 0.0714217013997073, + "learning_rate": 3.454818432294982e-07, + "loss": 0.8937, + "step": 31942 + }, + { + "epoch": 6.813779863481229, + "grad_norm": 0.07044449174508152, + "learning_rate": 3.4469228708920775e-07, + "loss": 0.8861, + "step": 31943 + }, + { + "epoch": 6.813993174061434, + "grad_norm": 0.07903817210099451, + "learning_rate": 3.4390363225004883e-07, + "loss": 0.8713, + "step": 31944 + }, + { + "epoch": 6.814206484641638, + "grad_norm": 0.07356269756107507, + "learning_rate": 3.4311587872096543e-07, + "loss": 0.9043, + "step": 31945 + }, + { + "epoch": 6.814419795221843, + "grad_norm": 0.07485784327164342, + "learning_rate": 3.423290265108747e-07, + "loss": 0.8867, + "step": 31946 + }, + { + "epoch": 6.814633105802048, + "grad_norm": 0.07364593939358231, + "learning_rate": 3.4154307562865865e-07, + "loss": 0.861, + "step": 31947 + }, + { + "epoch": 6.814846416382252, + "grad_norm": 0.07461326476987529, + "learning_rate": 3.407580260832255e-07, + "loss": 0.8696, + "step": 31948 + }, + { + "epoch": 6.815059726962457, + "grad_norm": 0.07452989350495419, + "learning_rate": 3.3997387788345714e-07, + "loss": 0.8457, + "step": 31949 + }, + { + "epoch": 6.815273037542662, + "grad_norm": 0.07000672606324937, + "learning_rate": 3.3919063103822646e-07, + "loss": 0.8783, + "step": 31950 + }, + { + "epoch": 6.815486348122867, + "grad_norm": 0.07254247595398267, + "learning_rate": 3.384082855563886e-07, + "loss": 0.8886, + "step": 31951 + }, + { + "epoch": 6.815699658703072, + "grad_norm": 0.07267082582026363, + "learning_rate": 3.3762684144679867e-07, + "loss": 0.8825, + "step": 31952 + }, + { + "epoch": 6.815912969283277, + "grad_norm": 0.06856373341855244, + "learning_rate": 3.3684629871831186e-07, + "loss": 0.8563, + "step": 31953 + }, + { + "epoch": 6.816126279863481, + "grad_norm": 0.0718310569195707, + "learning_rate": 3.3606665737973886e-07, + "loss": 0.8481, + "step": 31954 + }, + { + "epoch": 6.816339590443686, + "grad_norm": 0.074410088365173, + "learning_rate": 3.352879174399171e-07, + "loss": 0.8812, + "step": 31955 + }, + { + "epoch": 6.816552901023891, + "grad_norm": 0.07522356327113616, + "learning_rate": 3.345100789076483e-07, + "loss": 0.8666, + "step": 31956 + }, + { + "epoch": 6.8167662116040955, + "grad_norm": 0.07200523961090505, + "learning_rate": 3.337331417917433e-07, + "loss": 0.8968, + "step": 31957 + }, + { + "epoch": 6.8169795221843, + "grad_norm": 0.0723604260725178, + "learning_rate": 3.329571061009862e-07, + "loss": 0.8786, + "step": 31958 + }, + { + "epoch": 6.817192832764505, + "grad_norm": 0.0739856738749064, + "learning_rate": 3.3218197184415214e-07, + "loss": 0.8907, + "step": 31959 + }, + { + "epoch": 6.8174061433447095, + "grad_norm": 0.07129740223803768, + "learning_rate": 3.3140773903002523e-07, + "loss": 0.8726, + "step": 31960 + }, + { + "epoch": 6.817619453924914, + "grad_norm": 0.07430793281626673, + "learning_rate": 3.306344076673629e-07, + "loss": 0.8737, + "step": 31961 + }, + { + "epoch": 6.81783276450512, + "grad_norm": 0.07546981695727174, + "learning_rate": 3.298619777649137e-07, + "loss": 0.8891, + "step": 31962 + }, + { + "epoch": 6.8180460750853245, + "grad_norm": 0.07291314091131129, + "learning_rate": 3.2909044933140845e-07, + "loss": 0.8757, + "step": 31963 + }, + { + "epoch": 6.818259385665529, + "grad_norm": 0.08447430003037708, + "learning_rate": 3.283198223755868e-07, + "loss": 0.9101, + "step": 31964 + }, + { + "epoch": 6.818472696245734, + "grad_norm": 0.07257149055134586, + "learning_rate": 3.275500969061707e-07, + "loss": 0.8718, + "step": 31965 + }, + { + "epoch": 6.8186860068259385, + "grad_norm": 0.07250526470331765, + "learning_rate": 3.2678127293185533e-07, + "loss": 0.8923, + "step": 31966 + }, + { + "epoch": 6.818899317406143, + "grad_norm": 0.07292763016523918, + "learning_rate": 3.260133504613627e-07, + "loss": 0.8702, + "step": 31967 + }, + { + "epoch": 6.819112627986348, + "grad_norm": 0.07469372758898438, + "learning_rate": 3.2524632950336145e-07, + "loss": 0.8868, + "step": 31968 + }, + { + "epoch": 6.819325938566553, + "grad_norm": 0.07124505829080434, + "learning_rate": 3.2448021006654675e-07, + "loss": 0.8719, + "step": 31969 + }, + { + "epoch": 6.819539249146757, + "grad_norm": 0.0697454984789921, + "learning_rate": 3.237149921595695e-07, + "loss": 0.8884, + "step": 31970 + }, + { + "epoch": 6.819752559726963, + "grad_norm": 0.07211186881935884, + "learning_rate": 3.229506757910983e-07, + "loss": 0.8892, + "step": 31971 + }, + { + "epoch": 6.8199658703071675, + "grad_norm": 0.07067376719931881, + "learning_rate": 3.22187260969784e-07, + "loss": 0.8559, + "step": 31972 + }, + { + "epoch": 6.820179180887372, + "grad_norm": 0.07012646606047519, + "learning_rate": 3.214247477042598e-07, + "loss": 0.8925, + "step": 31973 + }, + { + "epoch": 6.820392491467577, + "grad_norm": 0.06927135957252295, + "learning_rate": 3.206631360031498e-07, + "loss": 0.8773, + "step": 31974 + }, + { + "epoch": 6.8206058020477816, + "grad_norm": 0.07103699119389323, + "learning_rate": 3.199024258750871e-07, + "loss": 0.8863, + "step": 31975 + }, + { + "epoch": 6.820819112627986, + "grad_norm": 0.07381144406637785, + "learning_rate": 3.191426173286605e-07, + "loss": 0.8967, + "step": 31976 + }, + { + "epoch": 6.821032423208191, + "grad_norm": 0.07086020616607425, + "learning_rate": 3.1838371037247626e-07, + "loss": 0.8899, + "step": 31977 + }, + { + "epoch": 6.821245733788396, + "grad_norm": 0.07176138107532777, + "learning_rate": 3.1762570501512325e-07, + "loss": 0.8708, + "step": 31978 + }, + { + "epoch": 6.8214590443686, + "grad_norm": 0.06915699994094195, + "learning_rate": 3.1686860126517225e-07, + "loss": 0.8619, + "step": 31979 + }, + { + "epoch": 6.821672354948806, + "grad_norm": 0.07369834202402502, + "learning_rate": 3.161123991311943e-07, + "loss": 0.8836, + "step": 31980 + }, + { + "epoch": 6.8218856655290105, + "grad_norm": 0.07024860502718086, + "learning_rate": 3.1535709862175135e-07, + "loss": 0.922, + "step": 31981 + }, + { + "epoch": 6.822098976109215, + "grad_norm": 0.07529041725345281, + "learning_rate": 3.1460269974536996e-07, + "loss": 0.8869, + "step": 31982 + }, + { + "epoch": 6.82231228668942, + "grad_norm": 0.07315765826115322, + "learning_rate": 3.1384920251061215e-07, + "loss": 0.8617, + "step": 31983 + }, + { + "epoch": 6.822525597269625, + "grad_norm": 0.06827482298772682, + "learning_rate": 3.130966069259866e-07, + "loss": 0.8779, + "step": 31984 + }, + { + "epoch": 6.822738907849829, + "grad_norm": 0.07381313417635413, + "learning_rate": 3.1234491300001115e-07, + "loss": 0.8937, + "step": 31985 + }, + { + "epoch": 6.822952218430034, + "grad_norm": 0.07233941472334363, + "learning_rate": 3.1159412074119434e-07, + "loss": 0.9209, + "step": 31986 + }, + { + "epoch": 6.823165529010239, + "grad_norm": 0.07106833295098981, + "learning_rate": 3.108442301580272e-07, + "loss": 0.8997, + "step": 31987 + }, + { + "epoch": 6.823378839590443, + "grad_norm": 0.0733362519696355, + "learning_rate": 3.100952412590097e-07, + "loss": 0.8893, + "step": 31988 + }, + { + "epoch": 6.823592150170649, + "grad_norm": 0.07432123430233688, + "learning_rate": 3.093471540525972e-07, + "loss": 0.9022, + "step": 31989 + }, + { + "epoch": 6.823805460750854, + "grad_norm": 0.07174163846217149, + "learning_rate": 3.0859996854725405e-07, + "loss": 0.8755, + "step": 31990 + }, + { + "epoch": 6.824018771331058, + "grad_norm": 0.0739806370749788, + "learning_rate": 3.078536847514535e-07, + "loss": 0.8686, + "step": 31991 + }, + { + "epoch": 6.824232081911263, + "grad_norm": 0.07523777470727235, + "learning_rate": 3.071083026736243e-07, + "loss": 0.9006, + "step": 31992 + }, + { + "epoch": 6.824445392491468, + "grad_norm": 0.07151576133389173, + "learning_rate": 3.063638223222132e-07, + "loss": 0.8687, + "step": 31993 + }, + { + "epoch": 6.824658703071672, + "grad_norm": 0.06962302480232976, + "learning_rate": 3.0562024370562216e-07, + "loss": 0.8674, + "step": 31994 + }, + { + "epoch": 6.824872013651877, + "grad_norm": 0.07157132670224396, + "learning_rate": 3.04877566832289e-07, + "loss": 0.8537, + "step": 31995 + }, + { + "epoch": 6.825085324232082, + "grad_norm": 0.07078409194406246, + "learning_rate": 3.041357917106069e-07, + "loss": 0.9018, + "step": 31996 + }, + { + "epoch": 6.825298634812286, + "grad_norm": 0.07048132988322062, + "learning_rate": 3.033949183489693e-07, + "loss": 0.8758, + "step": 31997 + }, + { + "epoch": 6.825511945392492, + "grad_norm": 0.07197486511850477, + "learning_rate": 3.0265494675575156e-07, + "loss": 0.8883, + "step": 31998 + }, + { + "epoch": 6.825725255972696, + "grad_norm": 0.0762915418817464, + "learning_rate": 3.019158769393382e-07, + "loss": 0.8741, + "step": 31999 + }, + { + "epoch": 6.825938566552901, + "grad_norm": 0.0703353856907482, + "learning_rate": 3.011777089080781e-07, + "loss": 0.8641, + "step": 32000 + }, + { + "epoch": 6.826151877133106, + "grad_norm": 0.07141493974710336, + "learning_rate": 3.0044044267034666e-07, + "loss": 0.8975, + "step": 32001 + }, + { + "epoch": 6.826365187713311, + "grad_norm": 0.0704870510057112, + "learning_rate": 2.997040782344662e-07, + "loss": 0.8826, + "step": 32002 + }, + { + "epoch": 6.826578498293515, + "grad_norm": 0.07402030360910238, + "learning_rate": 2.9896861560876786e-07, + "loss": 0.8896, + "step": 32003 + }, + { + "epoch": 6.82679180887372, + "grad_norm": 0.07354185132965922, + "learning_rate": 2.982340548015916e-07, + "loss": 0.8808, + "step": 32004 + }, + { + "epoch": 6.827005119453925, + "grad_norm": 0.07031018946096095, + "learning_rate": 2.97500395821233e-07, + "loss": 0.8776, + "step": 32005 + }, + { + "epoch": 6.827218430034129, + "grad_norm": 0.07318431371916406, + "learning_rate": 2.9676763867599655e-07, + "loss": 0.8926, + "step": 32006 + }, + { + "epoch": 6.827431740614334, + "grad_norm": 0.06869850942073917, + "learning_rate": 2.960357833741778e-07, + "loss": 0.8989, + "step": 32007 + }, + { + "epoch": 6.827645051194539, + "grad_norm": 0.0760249930538344, + "learning_rate": 2.953048299240546e-07, + "loss": 0.8959, + "step": 32008 + }, + { + "epoch": 6.827858361774744, + "grad_norm": 0.07991712261607585, + "learning_rate": 2.9457477833389593e-07, + "loss": 0.8679, + "step": 32009 + }, + { + "epoch": 6.828071672354949, + "grad_norm": 0.07179454841410304, + "learning_rate": 2.9384562861196174e-07, + "loss": 0.892, + "step": 32010 + }, + { + "epoch": 6.828284982935154, + "grad_norm": 0.07076504624375249, + "learning_rate": 2.931173807665122e-07, + "loss": 0.8681, + "step": 32011 + }, + { + "epoch": 6.828498293515358, + "grad_norm": 0.07299682878303841, + "learning_rate": 2.9239003480578065e-07, + "loss": 0.8924, + "step": 32012 + }, + { + "epoch": 6.828711604095563, + "grad_norm": 0.08080992511292692, + "learning_rate": 2.9166359073799165e-07, + "loss": 0.9061, + "step": 32013 + }, + { + "epoch": 6.828924914675768, + "grad_norm": 0.07432265431428221, + "learning_rate": 2.909380485713698e-07, + "loss": 0.8727, + "step": 32014 + }, + { + "epoch": 6.8291382252559725, + "grad_norm": 0.07327493483651445, + "learning_rate": 2.902134083141217e-07, + "loss": 0.8691, + "step": 32015 + }, + { + "epoch": 6.829351535836177, + "grad_norm": 0.06815293402268985, + "learning_rate": 2.894896699744543e-07, + "loss": 0.8492, + "step": 32016 + }, + { + "epoch": 6.829564846416382, + "grad_norm": 0.07148169162548115, + "learning_rate": 2.8876683356055646e-07, + "loss": 0.8903, + "step": 32017 + }, + { + "epoch": 6.829778156996587, + "grad_norm": 0.07883940563654228, + "learning_rate": 2.8804489908059065e-07, + "loss": 0.9053, + "step": 32018 + }, + { + "epoch": 6.829991467576792, + "grad_norm": 0.06924998257491435, + "learning_rate": 2.873238665427458e-07, + "loss": 0.8852, + "step": 32019 + }, + { + "epoch": 6.830204778156997, + "grad_norm": 0.0756874183555911, + "learning_rate": 2.866037359551754e-07, + "loss": 0.8969, + "step": 32020 + }, + { + "epoch": 6.8304180887372015, + "grad_norm": 0.07427619011960725, + "learning_rate": 2.8588450732601526e-07, + "loss": 0.8753, + "step": 32021 + }, + { + "epoch": 6.830631399317406, + "grad_norm": 0.06965655323449728, + "learning_rate": 2.851661806634187e-07, + "loss": 0.855, + "step": 32022 + }, + { + "epoch": 6.830844709897611, + "grad_norm": 0.06905360527128922, + "learning_rate": 2.8444875597550383e-07, + "loss": 0.8745, + "step": 32023 + }, + { + "epoch": 6.8310580204778155, + "grad_norm": 0.07178625817495549, + "learning_rate": 2.8373223327039734e-07, + "loss": 0.8616, + "step": 32024 + }, + { + "epoch": 6.83127133105802, + "grad_norm": 0.07183154912841287, + "learning_rate": 2.830166125561995e-07, + "loss": 0.8751, + "step": 32025 + }, + { + "epoch": 6.831484641638225, + "grad_norm": 0.07706000166930131, + "learning_rate": 2.823018938410016e-07, + "loss": 0.8676, + "step": 32026 + }, + { + "epoch": 6.8316979522184305, + "grad_norm": 0.07146430130654473, + "learning_rate": 2.815880771329127e-07, + "loss": 0.8755, + "step": 32027 + }, + { + "epoch": 6.831911262798635, + "grad_norm": 0.06919489126882707, + "learning_rate": 2.808751624399886e-07, + "loss": 0.8951, + "step": 32028 + }, + { + "epoch": 6.83212457337884, + "grad_norm": 0.07401434265732015, + "learning_rate": 2.801631497703028e-07, + "loss": 0.9248, + "step": 32029 + }, + { + "epoch": 6.8323378839590445, + "grad_norm": 0.07130591013638164, + "learning_rate": 2.794520391319111e-07, + "loss": 0.8727, + "step": 32030 + }, + { + "epoch": 6.832551194539249, + "grad_norm": 0.0720335171358114, + "learning_rate": 2.787418305328604e-07, + "loss": 0.8998, + "step": 32031 + }, + { + "epoch": 6.832764505119454, + "grad_norm": 0.0753996316716578, + "learning_rate": 2.7803252398118873e-07, + "loss": 0.8967, + "step": 32032 + }, + { + "epoch": 6.832977815699659, + "grad_norm": 0.07081506306673878, + "learning_rate": 2.773241194849163e-07, + "loss": 0.8697, + "step": 32033 + }, + { + "epoch": 6.833191126279863, + "grad_norm": 0.0718238758533341, + "learning_rate": 2.766166170520634e-07, + "loss": 0.8941, + "step": 32034 + }, + { + "epoch": 6.833404436860068, + "grad_norm": 0.07252216904944281, + "learning_rate": 2.7591001669064143e-07, + "loss": 0.8932, + "step": 32035 + }, + { + "epoch": 6.8336177474402735, + "grad_norm": 0.06962958006089384, + "learning_rate": 2.7520431840863503e-07, + "loss": 0.9105, + "step": 32036 + }, + { + "epoch": 6.833831058020478, + "grad_norm": 0.07325179079420599, + "learning_rate": 2.7449952221402896e-07, + "loss": 0.8841, + "step": 32037 + }, + { + "epoch": 6.834044368600683, + "grad_norm": 0.07155186430169361, + "learning_rate": 2.737956281147991e-07, + "loss": 0.9092, + "step": 32038 + }, + { + "epoch": 6.834257679180888, + "grad_norm": 0.06977685822918364, + "learning_rate": 2.7309263611892125e-07, + "loss": 0.8696, + "step": 32039 + }, + { + "epoch": 6.834470989761092, + "grad_norm": 0.07076979898057362, + "learning_rate": 2.723905462343357e-07, + "loss": 0.8687, + "step": 32040 + }, + { + "epoch": 6.834684300341297, + "grad_norm": 0.07006078409686115, + "learning_rate": 2.716893584690006e-07, + "loss": 0.8816, + "step": 32041 + }, + { + "epoch": 6.834897610921502, + "grad_norm": 0.06686947593595116, + "learning_rate": 2.709890728308295e-07, + "loss": 0.8741, + "step": 32042 + }, + { + "epoch": 6.835110921501706, + "grad_norm": 0.07143781148843223, + "learning_rate": 2.702896893277629e-07, + "loss": 0.8662, + "step": 32043 + }, + { + "epoch": 6.835324232081911, + "grad_norm": 0.07083652125798497, + "learning_rate": 2.6959120796771434e-07, + "loss": 0.8386, + "step": 32044 + }, + { + "epoch": 6.835537542662116, + "grad_norm": 0.06841258061907465, + "learning_rate": 2.6889362875857973e-07, + "loss": 0.8602, + "step": 32045 + }, + { + "epoch": 6.83575085324232, + "grad_norm": 0.07341771752709816, + "learning_rate": 2.681969517082461e-07, + "loss": 0.9148, + "step": 32046 + }, + { + "epoch": 6.835964163822526, + "grad_norm": 0.07706840081721139, + "learning_rate": 2.675011768246183e-07, + "loss": 0.8735, + "step": 32047 + }, + { + "epoch": 6.836177474402731, + "grad_norm": 0.07196946050401609, + "learning_rate": 2.6680630411554777e-07, + "loss": 0.8752, + "step": 32048 + }, + { + "epoch": 6.836390784982935, + "grad_norm": 0.0700796293613132, + "learning_rate": 2.6611233358890377e-07, + "loss": 0.8587, + "step": 32049 + }, + { + "epoch": 6.83660409556314, + "grad_norm": 0.06667011765593354, + "learning_rate": 2.6541926525254667e-07, + "loss": 0.8625, + "step": 32050 + }, + { + "epoch": 6.836817406143345, + "grad_norm": 0.07028254694108675, + "learning_rate": 2.6472709911431025e-07, + "loss": 0.8479, + "step": 32051 + }, + { + "epoch": 6.837030716723549, + "grad_norm": 0.06773822620807943, + "learning_rate": 2.6403583518202823e-07, + "loss": 0.8693, + "step": 32052 + }, + { + "epoch": 6.837244027303754, + "grad_norm": 0.06955122079288133, + "learning_rate": 2.633454734635166e-07, + "loss": 0.8976, + "step": 32053 + }, + { + "epoch": 6.837457337883959, + "grad_norm": 0.0714954535901536, + "learning_rate": 2.6265601396659126e-07, + "loss": 0.8838, + "step": 32054 + }, + { + "epoch": 6.837670648464163, + "grad_norm": 0.07124111228833165, + "learning_rate": 2.619674566990593e-07, + "loss": 0.8749, + "step": 32055 + }, + { + "epoch": 6.837883959044369, + "grad_norm": 0.06901023788222982, + "learning_rate": 2.6127980166871016e-07, + "loss": 0.8726, + "step": 32056 + }, + { + "epoch": 6.838097269624574, + "grad_norm": 0.07290468624952423, + "learning_rate": 2.6059304888331526e-07, + "loss": 0.8568, + "step": 32057 + }, + { + "epoch": 6.838310580204778, + "grad_norm": 0.06910322875224685, + "learning_rate": 2.599071983506463e-07, + "loss": 0.8838, + "step": 32058 + }, + { + "epoch": 6.838523890784983, + "grad_norm": 0.07328491706407685, + "learning_rate": 2.5922225007847466e-07, + "loss": 0.8582, + "step": 32059 + }, + { + "epoch": 6.838737201365188, + "grad_norm": 0.07180571013666824, + "learning_rate": 2.5853820407454544e-07, + "loss": 0.8821, + "step": 32060 + }, + { + "epoch": 6.838950511945392, + "grad_norm": 0.07182470405631644, + "learning_rate": 2.578550603466035e-07, + "loss": 0.8667, + "step": 32061 + }, + { + "epoch": 6.839163822525597, + "grad_norm": 0.0700215419437303, + "learning_rate": 2.57172818902367e-07, + "loss": 0.8823, + "step": 32062 + }, + { + "epoch": 6.839377133105802, + "grad_norm": 0.0739912774536894, + "learning_rate": 2.5649147974956323e-07, + "loss": 0.9027, + "step": 32063 + }, + { + "epoch": 6.839590443686006, + "grad_norm": 0.07105638299249105, + "learning_rate": 2.5581104289589265e-07, + "loss": 0.8741, + "step": 32064 + }, + { + "epoch": 6.839803754266212, + "grad_norm": 0.07367475855208486, + "learning_rate": 2.5513150834906466e-07, + "loss": 0.9004, + "step": 32065 + }, + { + "epoch": 6.840017064846417, + "grad_norm": 0.07332582325830675, + "learning_rate": 2.544528761167708e-07, + "loss": 0.8764, + "step": 32066 + }, + { + "epoch": 6.840230375426621, + "grad_norm": 0.06974718684284105, + "learning_rate": 2.537751462066762e-07, + "loss": 0.8725, + "step": 32067 + }, + { + "epoch": 6.840443686006826, + "grad_norm": 0.07306814208976097, + "learning_rate": 2.530983186264635e-07, + "loss": 0.9051, + "step": 32068 + }, + { + "epoch": 6.840656996587031, + "grad_norm": 0.07015690436640509, + "learning_rate": 2.5242239338377997e-07, + "loss": 0.9227, + "step": 32069 + }, + { + "epoch": 6.840870307167235, + "grad_norm": 0.07280886743979693, + "learning_rate": 2.517473704862816e-07, + "loss": 0.874, + "step": 32070 + }, + { + "epoch": 6.84108361774744, + "grad_norm": 0.0739356178957187, + "learning_rate": 2.51073249941598e-07, + "loss": 0.8767, + "step": 32071 + }, + { + "epoch": 6.841296928327645, + "grad_norm": 0.07279495125878993, + "learning_rate": 2.504000317573674e-07, + "loss": 0.9076, + "step": 32072 + }, + { + "epoch": 6.8415102389078495, + "grad_norm": 0.07116008703411104, + "learning_rate": 2.497277159412015e-07, + "loss": 0.8643, + "step": 32073 + }, + { + "epoch": 6.841723549488055, + "grad_norm": 0.06996003085989698, + "learning_rate": 2.49056302500712e-07, + "loss": 0.8676, + "step": 32074 + }, + { + "epoch": 6.84193686006826, + "grad_norm": 0.07254298783793986, + "learning_rate": 2.4838579144348395e-07, + "loss": 0.883, + "step": 32075 + }, + { + "epoch": 6.842150170648464, + "grad_norm": 0.07152069249952887, + "learning_rate": 2.477161827771113e-07, + "loss": 0.8843, + "step": 32076 + }, + { + "epoch": 6.842363481228669, + "grad_norm": 0.0729692923818268, + "learning_rate": 2.47047476509179e-07, + "loss": 0.8744, + "step": 32077 + }, + { + "epoch": 6.842576791808874, + "grad_norm": 0.06996026149085267, + "learning_rate": 2.463796726472367e-07, + "loss": 0.8755, + "step": 32078 + }, + { + "epoch": 6.8427901023890785, + "grad_norm": 0.0739722655595064, + "learning_rate": 2.457127711988605e-07, + "loss": 0.8698, + "step": 32079 + }, + { + "epoch": 6.843003412969283, + "grad_norm": 0.06924496126726519, + "learning_rate": 2.450467721715821e-07, + "loss": 0.8541, + "step": 32080 + }, + { + "epoch": 6.843216723549488, + "grad_norm": 0.07577277162134682, + "learning_rate": 2.4438167557293336e-07, + "loss": 0.9148, + "step": 32081 + }, + { + "epoch": 6.8434300341296925, + "grad_norm": 0.07343324129959779, + "learning_rate": 2.4371748141045484e-07, + "loss": 0.8707, + "step": 32082 + }, + { + "epoch": 6.843643344709898, + "grad_norm": 0.07201147405389022, + "learning_rate": 2.4305418969165163e-07, + "loss": 0.9063, + "step": 32083 + }, + { + "epoch": 6.843856655290102, + "grad_norm": 0.07060288136242977, + "learning_rate": 2.423918004240378e-07, + "loss": 0.8835, + "step": 32084 + }, + { + "epoch": 6.8440699658703075, + "grad_norm": 0.07089865255195017, + "learning_rate": 2.417303136150917e-07, + "loss": 0.8898, + "step": 32085 + }, + { + "epoch": 6.844283276450512, + "grad_norm": 0.07132862587106077, + "learning_rate": 2.4106972927231855e-07, + "loss": 0.8982, + "step": 32086 + }, + { + "epoch": 6.844496587030717, + "grad_norm": 0.07116396998908939, + "learning_rate": 2.40410047403179e-07, + "loss": 0.8773, + "step": 32087 + }, + { + "epoch": 6.8447098976109215, + "grad_norm": 0.07139605821135503, + "learning_rate": 2.3975126801514257e-07, + "loss": 0.8519, + "step": 32088 + }, + { + "epoch": 6.844923208191126, + "grad_norm": 0.07463033835022478, + "learning_rate": 2.390933911156612e-07, + "loss": 0.8974, + "step": 32089 + }, + { + "epoch": 6.845136518771331, + "grad_norm": 0.06975027744332168, + "learning_rate": 2.3843641671217776e-07, + "loss": 0.8667, + "step": 32090 + }, + { + "epoch": 6.845349829351536, + "grad_norm": 0.07407836068402272, + "learning_rate": 2.377803448121263e-07, + "loss": 0.8843, + "step": 32091 + }, + { + "epoch": 6.84556313993174, + "grad_norm": 0.07121948072416867, + "learning_rate": 2.371251754229409e-07, + "loss": 0.9103, + "step": 32092 + }, + { + "epoch": 6.845776450511945, + "grad_norm": 0.0729497179399725, + "learning_rate": 2.3647090855202004e-07, + "loss": 0.8813, + "step": 32093 + }, + { + "epoch": 6.8459897610921505, + "grad_norm": 0.07097411409158437, + "learning_rate": 2.3581754420677116e-07, + "loss": 0.9126, + "step": 32094 + }, + { + "epoch": 6.846203071672355, + "grad_norm": 0.07104995383800385, + "learning_rate": 2.351650823945928e-07, + "loss": 0.8713, + "step": 32095 + }, + { + "epoch": 6.84641638225256, + "grad_norm": 0.0722721502123427, + "learning_rate": 2.345135231228568e-07, + "loss": 0.9059, + "step": 32096 + }, + { + "epoch": 6.846629692832765, + "grad_norm": 0.06936869624912735, + "learning_rate": 2.3386286639894397e-07, + "loss": 0.8858, + "step": 32097 + }, + { + "epoch": 6.846843003412969, + "grad_norm": 0.06975476970965673, + "learning_rate": 2.3321311223021725e-07, + "loss": 0.8724, + "step": 32098 + }, + { + "epoch": 6.847056313993174, + "grad_norm": 0.07513360359146168, + "learning_rate": 2.325642606240308e-07, + "loss": 0.9023, + "step": 32099 + }, + { + "epoch": 6.847269624573379, + "grad_norm": 0.07041330853808335, + "learning_rate": 2.319163115877121e-07, + "loss": 0.8758, + "step": 32100 + }, + { + "epoch": 6.847482935153583, + "grad_norm": 0.07240370932383831, + "learning_rate": 2.3126926512860638e-07, + "loss": 0.9022, + "step": 32101 + }, + { + "epoch": 6.847696245733788, + "grad_norm": 0.07244950587959624, + "learning_rate": 2.306231212540233e-07, + "loss": 0.8977, + "step": 32102 + }, + { + "epoch": 6.847909556313994, + "grad_norm": 0.07387063774974957, + "learning_rate": 2.2997787997129039e-07, + "loss": 0.8659, + "step": 32103 + }, + { + "epoch": 6.848122866894198, + "grad_norm": 0.0722839546977637, + "learning_rate": 2.2933354128769957e-07, + "loss": 0.8672, + "step": 32104 + }, + { + "epoch": 6.848336177474403, + "grad_norm": 0.07544077132158684, + "learning_rate": 2.2869010521053393e-07, + "loss": 0.8912, + "step": 32105 + }, + { + "epoch": 6.848549488054608, + "grad_norm": 0.07210927125915863, + "learning_rate": 2.2804757174709424e-07, + "loss": 0.8711, + "step": 32106 + }, + { + "epoch": 6.848762798634812, + "grad_norm": 0.06977448699427213, + "learning_rate": 2.2740594090462808e-07, + "loss": 0.898, + "step": 32107 + }, + { + "epoch": 6.848976109215017, + "grad_norm": 0.07098651311303406, + "learning_rate": 2.2676521269040963e-07, + "loss": 0.8781, + "step": 32108 + }, + { + "epoch": 6.849189419795222, + "grad_norm": 0.0714561981316576, + "learning_rate": 2.2612538711167753e-07, + "loss": 0.904, + "step": 32109 + }, + { + "epoch": 6.849402730375426, + "grad_norm": 0.0724331373109696, + "learning_rate": 2.2548646417567932e-07, + "loss": 0.8896, + "step": 32110 + }, + { + "epoch": 6.849616040955631, + "grad_norm": 0.07449918613314291, + "learning_rate": 2.2484844388965366e-07, + "loss": 0.8835, + "step": 32111 + }, + { + "epoch": 6.849829351535837, + "grad_norm": 0.06955225153556872, + "learning_rate": 2.2421132626080365e-07, + "loss": 0.8804, + "step": 32112 + }, + { + "epoch": 6.850042662116041, + "grad_norm": 0.07589090217822031, + "learning_rate": 2.2357511129634135e-07, + "loss": 0.8982, + "step": 32113 + }, + { + "epoch": 6.850255972696246, + "grad_norm": 0.07172562817837207, + "learning_rate": 2.2293979900346984e-07, + "loss": 0.8775, + "step": 32114 + }, + { + "epoch": 6.850469283276451, + "grad_norm": 0.06896335731559508, + "learning_rate": 2.223053893893834e-07, + "loss": 0.8604, + "step": 32115 + }, + { + "epoch": 6.850682593856655, + "grad_norm": 0.07392000102419237, + "learning_rate": 2.216718824612496e-07, + "loss": 0.8895, + "step": 32116 + }, + { + "epoch": 6.85089590443686, + "grad_norm": 0.07437650153802737, + "learning_rate": 2.2103927822623604e-07, + "loss": 0.9186, + "step": 32117 + }, + { + "epoch": 6.851109215017065, + "grad_norm": 0.07080466274201751, + "learning_rate": 2.2040757669151036e-07, + "loss": 0.8616, + "step": 32118 + }, + { + "epoch": 6.851322525597269, + "grad_norm": 0.07010870460397987, + "learning_rate": 2.1977677786421348e-07, + "loss": 0.8794, + "step": 32119 + }, + { + "epoch": 6.851535836177474, + "grad_norm": 0.0707349506557816, + "learning_rate": 2.1914688175147748e-07, + "loss": 0.8759, + "step": 32120 + }, + { + "epoch": 6.85174914675768, + "grad_norm": 0.07270802890564396, + "learning_rate": 2.185178883604433e-07, + "loss": 0.9075, + "step": 32121 + }, + { + "epoch": 6.851962457337884, + "grad_norm": 0.07414513833382834, + "learning_rate": 2.1788979769821638e-07, + "loss": 0.8956, + "step": 32122 + }, + { + "epoch": 6.852175767918089, + "grad_norm": 0.07108925684998425, + "learning_rate": 2.1726260977191104e-07, + "loss": 0.9394, + "step": 32123 + }, + { + "epoch": 6.852389078498294, + "grad_norm": 0.07370685158215615, + "learning_rate": 2.1663632458862383e-07, + "loss": 0.8747, + "step": 32124 + }, + { + "epoch": 6.852602389078498, + "grad_norm": 0.07009004183553325, + "learning_rate": 2.160109421554424e-07, + "loss": 0.8818, + "step": 32125 + }, + { + "epoch": 6.852815699658703, + "grad_norm": 0.07119086851228264, + "learning_rate": 2.1538646247942774e-07, + "loss": 0.9043, + "step": 32126 + }, + { + "epoch": 6.853029010238908, + "grad_norm": 0.0709361815397614, + "learning_rate": 2.1476288556766757e-07, + "loss": 0.8792, + "step": 32127 + }, + { + "epoch": 6.853242320819112, + "grad_norm": 0.0708900712877335, + "learning_rate": 2.141402114272051e-07, + "loss": 0.8984, + "step": 32128 + }, + { + "epoch": 6.853455631399317, + "grad_norm": 0.07107286585308267, + "learning_rate": 2.135184400650836e-07, + "loss": 0.898, + "step": 32129 + }, + { + "epoch": 6.853668941979522, + "grad_norm": 0.07214057459659247, + "learning_rate": 2.1289757148834632e-07, + "loss": 0.8902, + "step": 32130 + }, + { + "epoch": 6.8538822525597265, + "grad_norm": 0.07230589058036029, + "learning_rate": 2.122776057040099e-07, + "loss": 0.8795, + "step": 32131 + }, + { + "epoch": 6.854095563139932, + "grad_norm": 0.0706305635681324, + "learning_rate": 2.116585427190998e-07, + "loss": 0.8869, + "step": 32132 + }, + { + "epoch": 6.854308873720137, + "grad_norm": 0.07159283380657366, + "learning_rate": 2.1104038254061489e-07, + "loss": 0.8919, + "step": 32133 + }, + { + "epoch": 6.854522184300341, + "grad_norm": 0.07302407518588726, + "learning_rate": 2.1042312517554507e-07, + "loss": 0.9234, + "step": 32134 + }, + { + "epoch": 6.854735494880546, + "grad_norm": 0.07088304150241924, + "learning_rate": 2.0980677063088928e-07, + "loss": 0.8883, + "step": 32135 + }, + { + "epoch": 6.854948805460751, + "grad_norm": 0.06912519991326804, + "learning_rate": 2.091913189136019e-07, + "loss": 0.8391, + "step": 32136 + }, + { + "epoch": 6.8551621160409555, + "grad_norm": 0.07025347177343966, + "learning_rate": 2.08576770030664e-07, + "loss": 0.8985, + "step": 32137 + }, + { + "epoch": 6.85537542662116, + "grad_norm": 0.07375536669466107, + "learning_rate": 2.0796312398902118e-07, + "loss": 0.8767, + "step": 32138 + }, + { + "epoch": 6.855588737201365, + "grad_norm": 0.07452956070450001, + "learning_rate": 2.07350380795619e-07, + "loss": 0.8603, + "step": 32139 + }, + { + "epoch": 6.8558020477815695, + "grad_norm": 0.07104157719894574, + "learning_rate": 2.0673854045738517e-07, + "loss": 0.8646, + "step": 32140 + }, + { + "epoch": 6.856015358361775, + "grad_norm": 0.06699271656199643, + "learning_rate": 2.0612760298124756e-07, + "loss": 0.8642, + "step": 32141 + }, + { + "epoch": 6.85622866894198, + "grad_norm": 0.07033958384280575, + "learning_rate": 2.0551756837411618e-07, + "loss": 0.8564, + "step": 32142 + }, + { + "epoch": 6.8564419795221845, + "grad_norm": 0.07023665751656902, + "learning_rate": 2.0490843664290104e-07, + "loss": 0.9004, + "step": 32143 + }, + { + "epoch": 6.856655290102389, + "grad_norm": 0.0688000654375304, + "learning_rate": 2.0430020779448557e-07, + "loss": 0.8909, + "step": 32144 + }, + { + "epoch": 6.856868600682594, + "grad_norm": 0.07184385970113168, + "learning_rate": 2.0369288183576197e-07, + "loss": 0.908, + "step": 32145 + }, + { + "epoch": 6.8570819112627985, + "grad_norm": 0.07468832754986512, + "learning_rate": 2.03086458773587e-07, + "loss": 0.9032, + "step": 32146 + }, + { + "epoch": 6.857295221843003, + "grad_norm": 0.07286397147766951, + "learning_rate": 2.024809386148352e-07, + "loss": 0.8824, + "step": 32147 + }, + { + "epoch": 6.857508532423208, + "grad_norm": 0.07136594073784167, + "learning_rate": 2.0187632136635438e-07, + "loss": 0.8554, + "step": 32148 + }, + { + "epoch": 6.857721843003413, + "grad_norm": 0.07163891807803513, + "learning_rate": 2.0127260703498353e-07, + "loss": 0.915, + "step": 32149 + }, + { + "epoch": 6.857935153583618, + "grad_norm": 0.06728834496169937, + "learning_rate": 2.0066979562755274e-07, + "loss": 0.8383, + "step": 32150 + }, + { + "epoch": 6.858148464163823, + "grad_norm": 0.06894639862837143, + "learning_rate": 2.000678871508921e-07, + "loss": 0.8631, + "step": 32151 + }, + { + "epoch": 6.8583617747440275, + "grad_norm": 0.0684420336617437, + "learning_rate": 1.9946688161179616e-07, + "loss": 0.9002, + "step": 32152 + }, + { + "epoch": 6.858575085324232, + "grad_norm": 0.07154202212790126, + "learning_rate": 1.9886677901708619e-07, + "loss": 0.9037, + "step": 32153 + }, + { + "epoch": 6.858788395904437, + "grad_norm": 0.07324857637651343, + "learning_rate": 1.9826757937353004e-07, + "loss": 0.8837, + "step": 32154 + }, + { + "epoch": 6.859001706484642, + "grad_norm": 0.07437886799640775, + "learning_rate": 1.976692826879223e-07, + "loss": 0.8809, + "step": 32155 + }, + { + "epoch": 6.859215017064846, + "grad_norm": 0.06955692185701771, + "learning_rate": 1.9707188896703088e-07, + "loss": 0.8812, + "step": 32156 + }, + { + "epoch": 6.859428327645051, + "grad_norm": 0.0742022178621059, + "learning_rate": 1.9647539821761487e-07, + "loss": 0.8897, + "step": 32157 + }, + { + "epoch": 6.859641638225256, + "grad_norm": 0.07178989862007813, + "learning_rate": 1.9587981044641546e-07, + "loss": 0.8437, + "step": 32158 + }, + { + "epoch": 6.859854948805461, + "grad_norm": 0.07118695702487532, + "learning_rate": 1.9528512566018287e-07, + "loss": 0.8869, + "step": 32159 + }, + { + "epoch": 6.860068259385666, + "grad_norm": 0.07037960573072989, + "learning_rate": 1.946913438656406e-07, + "loss": 0.8685, + "step": 32160 + }, + { + "epoch": 6.860281569965871, + "grad_norm": 0.07009005972906451, + "learning_rate": 1.9409846506951214e-07, + "loss": 0.8648, + "step": 32161 + }, + { + "epoch": 6.860494880546075, + "grad_norm": 0.07054217974689472, + "learning_rate": 1.9350648927850323e-07, + "loss": 0.8537, + "step": 32162 + }, + { + "epoch": 6.86070819112628, + "grad_norm": 0.07447069787702625, + "learning_rate": 1.929154164993019e-07, + "loss": 0.8857, + "step": 32163 + }, + { + "epoch": 6.860921501706485, + "grad_norm": 0.07058273416911845, + "learning_rate": 1.9232524673861386e-07, + "loss": 0.8983, + "step": 32164 + }, + { + "epoch": 6.861134812286689, + "grad_norm": 0.07453850917267214, + "learning_rate": 1.9173598000310046e-07, + "loss": 0.848, + "step": 32165 + }, + { + "epoch": 6.861348122866894, + "grad_norm": 0.07517603270759887, + "learning_rate": 1.9114761629944078e-07, + "loss": 0.8664, + "step": 32166 + }, + { + "epoch": 6.861561433447099, + "grad_norm": 0.07301907488482923, + "learning_rate": 1.9056015563428732e-07, + "loss": 0.8984, + "step": 32167 + }, + { + "epoch": 6.861774744027304, + "grad_norm": 0.07156061730857334, + "learning_rate": 1.8997359801428362e-07, + "loss": 0.9016, + "step": 32168 + }, + { + "epoch": 6.861988054607508, + "grad_norm": 0.07302247835211406, + "learning_rate": 1.8938794344607326e-07, + "loss": 0.8748, + "step": 32169 + }, + { + "epoch": 6.862201365187714, + "grad_norm": 0.0722221038457824, + "learning_rate": 1.8880319193628206e-07, + "loss": 0.859, + "step": 32170 + }, + { + "epoch": 6.862414675767918, + "grad_norm": 0.07105530381217506, + "learning_rate": 1.8821934349152692e-07, + "loss": 0.8953, + "step": 32171 + }, + { + "epoch": 6.862627986348123, + "grad_norm": 0.07297441023232194, + "learning_rate": 1.8763639811841594e-07, + "loss": 0.9035, + "step": 32172 + }, + { + "epoch": 6.862841296928328, + "grad_norm": 0.06782743316972684, + "learning_rate": 1.8705435582353937e-07, + "loss": 0.8467, + "step": 32173 + }, + { + "epoch": 6.863054607508532, + "grad_norm": 0.07041882870596924, + "learning_rate": 1.8647321661347861e-07, + "loss": 0.8747, + "step": 32174 + }, + { + "epoch": 6.863267918088737, + "grad_norm": 0.07658714173886874, + "learning_rate": 1.8589298049482396e-07, + "loss": 0.8817, + "step": 32175 + }, + { + "epoch": 6.863481228668942, + "grad_norm": 0.07038141483623447, + "learning_rate": 1.8531364747413016e-07, + "loss": 0.8808, + "step": 32176 + }, + { + "epoch": 6.863694539249146, + "grad_norm": 0.06640140188043436, + "learning_rate": 1.84735217557952e-07, + "loss": 0.8679, + "step": 32177 + }, + { + "epoch": 6.863907849829351, + "grad_norm": 0.06972950988326186, + "learning_rate": 1.841576907528353e-07, + "loss": 0.8782, + "step": 32178 + }, + { + "epoch": 6.864121160409557, + "grad_norm": 0.0741356083250461, + "learning_rate": 1.8358106706532597e-07, + "loss": 0.8822, + "step": 32179 + }, + { + "epoch": 6.864334470989761, + "grad_norm": 0.06983566923211013, + "learning_rate": 1.8300534650193436e-07, + "loss": 0.8934, + "step": 32180 + }, + { + "epoch": 6.864547781569966, + "grad_norm": 0.07065755710167196, + "learning_rate": 1.8243052906917081e-07, + "loss": 0.8568, + "step": 32181 + }, + { + "epoch": 6.864761092150171, + "grad_norm": 0.07152176094680558, + "learning_rate": 1.8185661477356341e-07, + "loss": 0.8752, + "step": 32182 + }, + { + "epoch": 6.864974402730375, + "grad_norm": 0.0722886430239459, + "learning_rate": 1.8128360362157815e-07, + "loss": 0.8911, + "step": 32183 + }, + { + "epoch": 6.86518771331058, + "grad_norm": 0.0759397524124154, + "learning_rate": 1.8071149561972533e-07, + "loss": 0.846, + "step": 32184 + }, + { + "epoch": 6.865401023890785, + "grad_norm": 0.07250015935367563, + "learning_rate": 1.8014029077445317e-07, + "loss": 0.8781, + "step": 32185 + }, + { + "epoch": 6.8656143344709895, + "grad_norm": 0.07171942419507929, + "learning_rate": 1.7956998909224533e-07, + "loss": 0.8989, + "step": 32186 + }, + { + "epoch": 6.865827645051194, + "grad_norm": 0.0740909269587647, + "learning_rate": 1.790005905795411e-07, + "loss": 0.8905, + "step": 32187 + }, + { + "epoch": 6.8660409556314, + "grad_norm": 0.07526402762018043, + "learning_rate": 1.784320952427887e-07, + "loss": 0.8781, + "step": 32188 + }, + { + "epoch": 6.866254266211604, + "grad_norm": 0.07176282597962641, + "learning_rate": 1.7786450308840963e-07, + "loss": 0.869, + "step": 32189 + }, + { + "epoch": 6.866467576791809, + "grad_norm": 0.07406949907412538, + "learning_rate": 1.7729781412285206e-07, + "loss": 0.8958, + "step": 32190 + }, + { + "epoch": 6.866680887372014, + "grad_norm": 0.06987273492446999, + "learning_rate": 1.7673202835250202e-07, + "loss": 0.8963, + "step": 32191 + }, + { + "epoch": 6.8668941979522184, + "grad_norm": 0.070604783997601, + "learning_rate": 1.7616714578377215e-07, + "loss": 0.8919, + "step": 32192 + }, + { + "epoch": 6.867107508532423, + "grad_norm": 0.06953329059260324, + "learning_rate": 1.756031664230573e-07, + "loss": 0.9071, + "step": 32193 + }, + { + "epoch": 6.867320819112628, + "grad_norm": 0.07079666426192333, + "learning_rate": 1.7504009027673463e-07, + "loss": 0.856, + "step": 32194 + }, + { + "epoch": 6.8675341296928325, + "grad_norm": 0.07759197248141235, + "learning_rate": 1.7447791735118124e-07, + "loss": 0.897, + "step": 32195 + }, + { + "epoch": 6.867747440273037, + "grad_norm": 0.07014603376514342, + "learning_rate": 1.7391664765274762e-07, + "loss": 0.8615, + "step": 32196 + }, + { + "epoch": 6.867960750853243, + "grad_norm": 0.07399336992940721, + "learning_rate": 1.7335628118778426e-07, + "loss": 0.8975, + "step": 32197 + }, + { + "epoch": 6.868174061433447, + "grad_norm": 0.07083801412521687, + "learning_rate": 1.7279681796265046e-07, + "loss": 0.8712, + "step": 32198 + }, + { + "epoch": 6.868387372013652, + "grad_norm": 0.06929724150599949, + "learning_rate": 1.7223825798365234e-07, + "loss": 0.8716, + "step": 32199 + }, + { + "epoch": 6.868600682593857, + "grad_norm": 0.069248538021253, + "learning_rate": 1.716806012571226e-07, + "loss": 0.8578, + "step": 32200 + }, + { + "epoch": 6.8688139931740615, + "grad_norm": 0.07068896961924423, + "learning_rate": 1.7112384778937617e-07, + "loss": 0.8826, + "step": 32201 + }, + { + "epoch": 6.869027303754266, + "grad_norm": 0.06704759792027873, + "learning_rate": 1.705679975867014e-07, + "loss": 0.8617, + "step": 32202 + }, + { + "epoch": 6.869240614334471, + "grad_norm": 0.07089410090209365, + "learning_rate": 1.7001305065538653e-07, + "loss": 0.8562, + "step": 32203 + }, + { + "epoch": 6.8694539249146755, + "grad_norm": 0.0715831766031572, + "learning_rate": 1.694590070017288e-07, + "loss": 0.8816, + "step": 32204 + }, + { + "epoch": 6.86966723549488, + "grad_norm": 0.07352127804948035, + "learning_rate": 1.6890586663198095e-07, + "loss": 0.8918, + "step": 32205 + }, + { + "epoch": 6.869880546075086, + "grad_norm": 0.0688161608839786, + "learning_rate": 1.6835362955239577e-07, + "loss": 0.8783, + "step": 32206 + }, + { + "epoch": 6.8700938566552905, + "grad_norm": 0.07075873417449258, + "learning_rate": 1.678022957692438e-07, + "loss": 0.8591, + "step": 32207 + }, + { + "epoch": 6.870307167235495, + "grad_norm": 0.07166761509309147, + "learning_rate": 1.6725186528875116e-07, + "loss": 0.8697, + "step": 32208 + }, + { + "epoch": 6.8705204778157, + "grad_norm": 0.0798230890993485, + "learning_rate": 1.6670233811714398e-07, + "loss": 0.9084, + "step": 32209 + }, + { + "epoch": 6.8707337883959045, + "grad_norm": 0.07204292411136123, + "learning_rate": 1.6615371426063954e-07, + "loss": 0.8509, + "step": 32210 + }, + { + "epoch": 6.870947098976109, + "grad_norm": 0.07368074029489843, + "learning_rate": 1.6560599372545506e-07, + "loss": 0.904, + "step": 32211 + }, + { + "epoch": 6.871160409556314, + "grad_norm": 0.07478790692340127, + "learning_rate": 1.6505917651778113e-07, + "loss": 0.8676, + "step": 32212 + }, + { + "epoch": 6.871373720136519, + "grad_norm": 0.0712531743424707, + "learning_rate": 1.6451326264379952e-07, + "loss": 0.8647, + "step": 32213 + }, + { + "epoch": 6.871587030716723, + "grad_norm": 0.0734985426365456, + "learning_rate": 1.6396825210969192e-07, + "loss": 0.8658, + "step": 32214 + }, + { + "epoch": 6.871800341296928, + "grad_norm": 0.07114432084682704, + "learning_rate": 1.6342414492163118e-07, + "loss": 0.8572, + "step": 32215 + }, + { + "epoch": 6.872013651877133, + "grad_norm": 0.07016410022539683, + "learning_rate": 1.6288094108577234e-07, + "loss": 0.869, + "step": 32216 + }, + { + "epoch": 6.872226962457338, + "grad_norm": 0.07453158991508255, + "learning_rate": 1.6233864060825277e-07, + "loss": 0.9054, + "step": 32217 + }, + { + "epoch": 6.872440273037543, + "grad_norm": 0.07326544634445029, + "learning_rate": 1.6179724349521864e-07, + "loss": 0.9176, + "step": 32218 + }, + { + "epoch": 6.872653583617748, + "grad_norm": 0.07126340079771662, + "learning_rate": 1.6125674975278948e-07, + "loss": 0.8642, + "step": 32219 + }, + { + "epoch": 6.872866894197952, + "grad_norm": 0.07619376049973302, + "learning_rate": 1.607171593870849e-07, + "loss": 0.8839, + "step": 32220 + }, + { + "epoch": 6.873080204778157, + "grad_norm": 0.06894125854670705, + "learning_rate": 1.601784724042066e-07, + "loss": 0.8624, + "step": 32221 + }, + { + "epoch": 6.873293515358362, + "grad_norm": 0.07058654779881393, + "learning_rate": 1.5964068881024752e-07, + "loss": 0.8983, + "step": 32222 + }, + { + "epoch": 6.873506825938566, + "grad_norm": 0.07083543445714512, + "learning_rate": 1.591038086113006e-07, + "loss": 0.8969, + "step": 32223 + }, + { + "epoch": 6.873720136518771, + "grad_norm": 0.07178296083314097, + "learning_rate": 1.5856783181344093e-07, + "loss": 0.8566, + "step": 32224 + }, + { + "epoch": 6.873933447098976, + "grad_norm": 0.06998004181134737, + "learning_rate": 1.5803275842272592e-07, + "loss": 0.8739, + "step": 32225 + }, + { + "epoch": 6.874146757679181, + "grad_norm": 0.07182420309042695, + "learning_rate": 1.5749858844521292e-07, + "loss": 0.907, + "step": 32226 + }, + { + "epoch": 6.874360068259386, + "grad_norm": 0.07012385503187737, + "learning_rate": 1.5696532188695046e-07, + "loss": 0.8662, + "step": 32227 + }, + { + "epoch": 6.874573378839591, + "grad_norm": 0.066508438929324, + "learning_rate": 1.5643295875396037e-07, + "loss": 0.864, + "step": 32228 + }, + { + "epoch": 6.874786689419795, + "grad_norm": 0.07198419915493108, + "learning_rate": 1.5590149905228223e-07, + "loss": 0.8505, + "step": 32229 + }, + { + "epoch": 6.875, + "grad_norm": 0.0692580939443758, + "learning_rate": 1.553709427879202e-07, + "loss": 0.8833, + "step": 32230 + }, + { + "epoch": 6.875213310580205, + "grad_norm": 0.07542652288107525, + "learning_rate": 1.5484128996686941e-07, + "loss": 0.8745, + "step": 32231 + }, + { + "epoch": 6.875426621160409, + "grad_norm": 0.06720536478391236, + "learning_rate": 1.5431254059514288e-07, + "loss": 0.8895, + "step": 32232 + }, + { + "epoch": 6.875639931740614, + "grad_norm": 0.07596682955509802, + "learning_rate": 1.5378469467870916e-07, + "loss": 0.8652, + "step": 32233 + }, + { + "epoch": 6.875853242320819, + "grad_norm": 0.06927415897788833, + "learning_rate": 1.5325775222354566e-07, + "loss": 0.8578, + "step": 32234 + }, + { + "epoch": 6.876066552901024, + "grad_norm": 0.07161773434543496, + "learning_rate": 1.527317132356121e-07, + "loss": 0.8777, + "step": 32235 + }, + { + "epoch": 6.876279863481229, + "grad_norm": 0.07311391425066462, + "learning_rate": 1.522065777208681e-07, + "loss": 0.8703, + "step": 32236 + }, + { + "epoch": 6.876493174061434, + "grad_norm": 0.07066126719796087, + "learning_rate": 1.5168234568523788e-07, + "loss": 0.8868, + "step": 32237 + }, + { + "epoch": 6.876706484641638, + "grad_norm": 0.07049265531049959, + "learning_rate": 1.5115901713468106e-07, + "loss": 0.8915, + "step": 32238 + }, + { + "epoch": 6.876919795221843, + "grad_norm": 0.07028610160525887, + "learning_rate": 1.506365920750952e-07, + "loss": 0.8735, + "step": 32239 + }, + { + "epoch": 6.877133105802048, + "grad_norm": 0.06866813644056743, + "learning_rate": 1.501150705123955e-07, + "loss": 0.8794, + "step": 32240 + }, + { + "epoch": 6.877346416382252, + "grad_norm": 0.06843042004603235, + "learning_rate": 1.4959445245248837e-07, + "loss": 0.8527, + "step": 32241 + }, + { + "epoch": 6.877559726962457, + "grad_norm": 0.07059065199474203, + "learning_rate": 1.4907473790126249e-07, + "loss": 0.8962, + "step": 32242 + }, + { + "epoch": 6.877773037542662, + "grad_norm": 0.07397647197074098, + "learning_rate": 1.485559268646064e-07, + "loss": 0.9041, + "step": 32243 + }, + { + "epoch": 6.877986348122867, + "grad_norm": 0.07170027573846982, + "learning_rate": 1.4803801934838214e-07, + "loss": 0.9046, + "step": 32244 + }, + { + "epoch": 6.878199658703072, + "grad_norm": 0.06907926595408953, + "learning_rate": 1.4752101535844276e-07, + "loss": 0.8919, + "step": 32245 + }, + { + "epoch": 6.878412969283277, + "grad_norm": 0.06718774402636814, + "learning_rate": 1.4700491490065028e-07, + "loss": 0.8564, + "step": 32246 + }, + { + "epoch": 6.878626279863481, + "grad_norm": 0.07267183442742484, + "learning_rate": 1.4648971798084e-07, + "loss": 0.8532, + "step": 32247 + }, + { + "epoch": 6.878839590443686, + "grad_norm": 0.07232442872350019, + "learning_rate": 1.4597542460483837e-07, + "loss": 0.8945, + "step": 32248 + }, + { + "epoch": 6.879052901023891, + "grad_norm": 0.07066920505424781, + "learning_rate": 1.4546203477847188e-07, + "loss": 0.8925, + "step": 32249 + }, + { + "epoch": 6.8792662116040955, + "grad_norm": 0.07215263359409296, + "learning_rate": 1.4494954850754917e-07, + "loss": 0.8609, + "step": 32250 + }, + { + "epoch": 6.8794795221843, + "grad_norm": 0.07062527701917617, + "learning_rate": 1.4443796579786117e-07, + "loss": 0.8703, + "step": 32251 + }, + { + "epoch": 6.879692832764505, + "grad_norm": 0.07179223407099194, + "learning_rate": 1.4392728665519885e-07, + "loss": 0.8761, + "step": 32252 + }, + { + "epoch": 6.8799061433447095, + "grad_norm": 0.06865145426977473, + "learning_rate": 1.4341751108534417e-07, + "loss": 0.873, + "step": 32253 + }, + { + "epoch": 6.880119453924914, + "grad_norm": 0.06885071303532016, + "learning_rate": 1.4290863909407038e-07, + "loss": 0.8546, + "step": 32254 + }, + { + "epoch": 6.88033276450512, + "grad_norm": 0.07294365916375252, + "learning_rate": 1.4240067068712393e-07, + "loss": 0.9134, + "step": 32255 + }, + { + "epoch": 6.8805460750853245, + "grad_norm": 0.07068943314239592, + "learning_rate": 1.4189360587025135e-07, + "loss": 0.8944, + "step": 32256 + }, + { + "epoch": 6.880759385665529, + "grad_norm": 0.0686051779072846, + "learning_rate": 1.4138744464919918e-07, + "loss": 0.8709, + "step": 32257 + }, + { + "epoch": 6.880972696245734, + "grad_norm": 0.07463523204258324, + "learning_rate": 1.4088218702969615e-07, + "loss": 0.9228, + "step": 32258 + }, + { + "epoch": 6.8811860068259385, + "grad_norm": 0.07332892062231629, + "learning_rate": 1.4037783301744434e-07, + "loss": 0.8593, + "step": 32259 + }, + { + "epoch": 6.881399317406143, + "grad_norm": 0.07034827688619742, + "learning_rate": 1.3987438261817254e-07, + "loss": 0.8721, + "step": 32260 + }, + { + "epoch": 6.881612627986348, + "grad_norm": 0.07659413676950684, + "learning_rate": 1.393718358375562e-07, + "loss": 0.8562, + "step": 32261 + }, + { + "epoch": 6.881825938566553, + "grad_norm": 0.07219211041750381, + "learning_rate": 1.3887019268129743e-07, + "loss": 0.8765, + "step": 32262 + }, + { + "epoch": 6.882039249146757, + "grad_norm": 0.07029710435251542, + "learning_rate": 1.383694531550628e-07, + "loss": 0.9055, + "step": 32263 + }, + { + "epoch": 6.882252559726963, + "grad_norm": 0.07050163374392593, + "learning_rate": 1.3786961726451885e-07, + "loss": 0.8847, + "step": 32264 + }, + { + "epoch": 6.8824658703071675, + "grad_norm": 0.07086431086454308, + "learning_rate": 1.3737068501532337e-07, + "loss": 0.9019, + "step": 32265 + }, + { + "epoch": 6.882679180887372, + "grad_norm": 0.07259106741904113, + "learning_rate": 1.3687265641312508e-07, + "loss": 0.8603, + "step": 32266 + }, + { + "epoch": 6.882892491467577, + "grad_norm": 0.06985016109727274, + "learning_rate": 1.3637553146355508e-07, + "loss": 0.8608, + "step": 32267 + }, + { + "epoch": 6.8831058020477816, + "grad_norm": 0.07064245840999318, + "learning_rate": 1.3587931017224442e-07, + "loss": 0.8789, + "step": 32268 + }, + { + "epoch": 6.883319112627986, + "grad_norm": 0.07437447872236634, + "learning_rate": 1.353839925447975e-07, + "loss": 0.9145, + "step": 32269 + }, + { + "epoch": 6.883532423208191, + "grad_norm": 0.07149661486203794, + "learning_rate": 1.3488957858681872e-07, + "loss": 0.8882, + "step": 32270 + }, + { + "epoch": 6.883745733788396, + "grad_norm": 0.07461422591286615, + "learning_rate": 1.343960683039125e-07, + "loss": 0.8812, + "step": 32271 + }, + { + "epoch": 6.8839590443686, + "grad_norm": 0.069392997548096, + "learning_rate": 1.3390346170166545e-07, + "loss": 0.8621, + "step": 32272 + }, + { + "epoch": 6.884172354948806, + "grad_norm": 0.06984866953203578, + "learning_rate": 1.3341175878563762e-07, + "loss": 0.8745, + "step": 32273 + }, + { + "epoch": 6.8843856655290105, + "grad_norm": 0.06957406377209584, + "learning_rate": 1.3292095956139783e-07, + "loss": 0.8852, + "step": 32274 + }, + { + "epoch": 6.884598976109215, + "grad_norm": 0.0783549651086819, + "learning_rate": 1.3243106403449724e-07, + "loss": 0.8821, + "step": 32275 + }, + { + "epoch": 6.88481228668942, + "grad_norm": 0.07669365745852383, + "learning_rate": 1.3194207221049581e-07, + "loss": 0.86, + "step": 32276 + }, + { + "epoch": 6.885025597269625, + "grad_norm": 0.07433067007498602, + "learning_rate": 1.3145398409490028e-07, + "loss": 0.8729, + "step": 32277 + }, + { + "epoch": 6.885238907849829, + "grad_norm": 0.07406355151770594, + "learning_rate": 1.3096679969325287e-07, + "loss": 0.9139, + "step": 32278 + }, + { + "epoch": 6.885452218430034, + "grad_norm": 0.07085332900125763, + "learning_rate": 1.3048051901105142e-07, + "loss": 0.8182, + "step": 32279 + }, + { + "epoch": 6.885665529010239, + "grad_norm": 0.0675318859962716, + "learning_rate": 1.2999514205382035e-07, + "loss": 0.84, + "step": 32280 + }, + { + "epoch": 6.885878839590443, + "grad_norm": 0.0702627424894695, + "learning_rate": 1.2951066882702202e-07, + "loss": 0.8639, + "step": 32281 + }, + { + "epoch": 6.886092150170649, + "grad_norm": 0.06944604883839076, + "learning_rate": 1.2902709933616307e-07, + "loss": 0.886, + "step": 32282 + }, + { + "epoch": 6.886305460750854, + "grad_norm": 0.07172163001372049, + "learning_rate": 1.2854443358670587e-07, + "loss": 0.8765, + "step": 32283 + }, + { + "epoch": 6.886518771331058, + "grad_norm": 0.0722514624950182, + "learning_rate": 1.2806267158411267e-07, + "loss": 0.8873, + "step": 32284 + }, + { + "epoch": 6.886732081911263, + "grad_norm": 0.06859610100884819, + "learning_rate": 1.2758181333382801e-07, + "loss": 0.8519, + "step": 32285 + }, + { + "epoch": 6.886945392491468, + "grad_norm": 0.06742748362091916, + "learning_rate": 1.2710185884130532e-07, + "loss": 0.9014, + "step": 32286 + }, + { + "epoch": 6.887158703071672, + "grad_norm": 0.06942888334577743, + "learning_rate": 1.2662280811196248e-07, + "loss": 0.8902, + "step": 32287 + }, + { + "epoch": 6.887372013651877, + "grad_norm": 0.07064953736796316, + "learning_rate": 1.2614466115123515e-07, + "loss": 0.8695, + "step": 32288 + }, + { + "epoch": 6.887585324232082, + "grad_norm": 0.07004367620243188, + "learning_rate": 1.2566741796451452e-07, + "loss": 0.9076, + "step": 32289 + }, + { + "epoch": 6.887798634812286, + "grad_norm": 0.06889595017819249, + "learning_rate": 1.2519107855720968e-07, + "loss": 0.87, + "step": 32290 + }, + { + "epoch": 6.888011945392492, + "grad_norm": 0.07186479819521291, + "learning_rate": 1.2471564293472072e-07, + "loss": 0.8619, + "step": 32291 + }, + { + "epoch": 6.888225255972696, + "grad_norm": 0.07062116301039804, + "learning_rate": 1.2424111110241223e-07, + "loss": 0.8723, + "step": 32292 + }, + { + "epoch": 6.888438566552901, + "grad_norm": 0.06860674741203954, + "learning_rate": 1.2376748306565766e-07, + "loss": 0.9114, + "step": 32293 + }, + { + "epoch": 6.888651877133106, + "grad_norm": 0.06773244762608667, + "learning_rate": 1.2329475882982168e-07, + "loss": 0.8619, + "step": 32294 + }, + { + "epoch": 6.888865187713311, + "grad_norm": 0.07158803172139272, + "learning_rate": 1.2282293840024218e-07, + "loss": 0.8859, + "step": 32295 + }, + { + "epoch": 6.889078498293515, + "grad_norm": 0.07295557151402214, + "learning_rate": 1.223520217822749e-07, + "loss": 0.8886, + "step": 32296 + }, + { + "epoch": 6.88929180887372, + "grad_norm": 0.06901000285197181, + "learning_rate": 1.2188200898123114e-07, + "loss": 0.8933, + "step": 32297 + }, + { + "epoch": 6.889505119453925, + "grad_norm": 0.06889863650162487, + "learning_rate": 1.2141290000244e-07, + "loss": 0.8653, + "step": 32298 + }, + { + "epoch": 6.889718430034129, + "grad_norm": 0.06999564604533604, + "learning_rate": 1.2094469485120385e-07, + "loss": 0.8675, + "step": 32299 + }, + { + "epoch": 6.889931740614334, + "grad_norm": 0.06875755688761599, + "learning_rate": 1.2047739353282517e-07, + "loss": 0.8701, + "step": 32300 + }, + { + "epoch": 6.890145051194539, + "grad_norm": 0.07108495392879687, + "learning_rate": 1.2001099605257972e-07, + "loss": 0.8876, + "step": 32301 + }, + { + "epoch": 6.890358361774744, + "grad_norm": 0.07178895056434295, + "learning_rate": 1.19545502415761e-07, + "loss": 0.8835, + "step": 32302 + }, + { + "epoch": 6.890571672354949, + "grad_norm": 0.06858781224215436, + "learning_rate": 1.190809126276271e-07, + "loss": 0.8945, + "step": 32303 + }, + { + "epoch": 6.890784982935154, + "grad_norm": 0.07183294411200629, + "learning_rate": 1.1861722669343601e-07, + "loss": 0.89, + "step": 32304 + }, + { + "epoch": 6.890998293515358, + "grad_norm": 0.07295562054065786, + "learning_rate": 1.1815444461842795e-07, + "loss": 0.8774, + "step": 32305 + }, + { + "epoch": 6.891211604095563, + "grad_norm": 0.07015669981106022, + "learning_rate": 1.1769256640785209e-07, + "loss": 0.873, + "step": 32306 + }, + { + "epoch": 6.891424914675768, + "grad_norm": 0.07532972666135487, + "learning_rate": 1.172315920669309e-07, + "loss": 0.8493, + "step": 32307 + }, + { + "epoch": 6.8916382252559725, + "grad_norm": 0.0743657939520536, + "learning_rate": 1.1677152160086914e-07, + "loss": 0.8679, + "step": 32308 + }, + { + "epoch": 6.891851535836177, + "grad_norm": 0.06854921957535311, + "learning_rate": 1.1631235501488924e-07, + "loss": 0.8543, + "step": 32309 + }, + { + "epoch": 6.892064846416382, + "grad_norm": 0.07053555636632476, + "learning_rate": 1.1585409231417821e-07, + "loss": 0.9021, + "step": 32310 + }, + { + "epoch": 6.892278156996587, + "grad_norm": 0.07241965212892885, + "learning_rate": 1.15396733503923e-07, + "loss": 0.8764, + "step": 32311 + }, + { + "epoch": 6.892491467576792, + "grad_norm": 0.0765460370310784, + "learning_rate": 1.149402785892928e-07, + "loss": 0.9033, + "step": 32312 + }, + { + "epoch": 6.892704778156997, + "grad_norm": 0.07058737987725354, + "learning_rate": 1.144847275754568e-07, + "loss": 0.8664, + "step": 32313 + }, + { + "epoch": 6.8929180887372015, + "grad_norm": 0.07001644324476791, + "learning_rate": 1.1403008046757535e-07, + "loss": 0.8844, + "step": 32314 + }, + { + "epoch": 6.893131399317406, + "grad_norm": 0.06957055362112181, + "learning_rate": 1.135763372707821e-07, + "loss": 0.8693, + "step": 32315 + }, + { + "epoch": 6.893344709897611, + "grad_norm": 0.07292567269057276, + "learning_rate": 1.131234979902196e-07, + "loss": 0.88, + "step": 32316 + }, + { + "epoch": 6.8935580204778155, + "grad_norm": 0.07429448890212889, + "learning_rate": 1.1267156263100376e-07, + "loss": 0.8759, + "step": 32317 + }, + { + "epoch": 6.89377133105802, + "grad_norm": 0.07084271926321567, + "learning_rate": 1.1222053119825938e-07, + "loss": 0.8685, + "step": 32318 + }, + { + "epoch": 6.893984641638225, + "grad_norm": 0.06731168889589284, + "learning_rate": 1.1177040369708458e-07, + "loss": 0.8742, + "step": 32319 + }, + { + "epoch": 6.8941979522184305, + "grad_norm": 0.07232510197684203, + "learning_rate": 1.1132118013255977e-07, + "loss": 0.9165, + "step": 32320 + }, + { + "epoch": 6.894411262798635, + "grad_norm": 0.07106437581223964, + "learning_rate": 1.1087286050979195e-07, + "loss": 0.8345, + "step": 32321 + }, + { + "epoch": 6.89462457337884, + "grad_norm": 0.07179891603637918, + "learning_rate": 1.1042544483383488e-07, + "loss": 0.8865, + "step": 32322 + }, + { + "epoch": 6.8948378839590445, + "grad_norm": 0.06591600755896729, + "learning_rate": 1.0997893310976004e-07, + "loss": 0.8697, + "step": 32323 + }, + { + "epoch": 6.895051194539249, + "grad_norm": 0.07166992300402907, + "learning_rate": 1.095333253426123e-07, + "loss": 0.8938, + "step": 32324 + }, + { + "epoch": 6.895264505119454, + "grad_norm": 0.07019364470446422, + "learning_rate": 1.0908862153744538e-07, + "loss": 0.9003, + "step": 32325 + }, + { + "epoch": 6.895477815699659, + "grad_norm": 0.06965051875441794, + "learning_rate": 1.0864482169928636e-07, + "loss": 0.8979, + "step": 32326 + }, + { + "epoch": 6.895691126279863, + "grad_norm": 0.06812497242614435, + "learning_rate": 1.082019258331446e-07, + "loss": 0.8548, + "step": 32327 + }, + { + "epoch": 6.895904436860068, + "grad_norm": 0.06983709901957244, + "learning_rate": 1.0775993394404715e-07, + "loss": 0.8971, + "step": 32328 + }, + { + "epoch": 6.8961177474402735, + "grad_norm": 0.072809121241684, + "learning_rate": 1.073188460369945e-07, + "loss": 0.8946, + "step": 32329 + }, + { + "epoch": 6.896331058020478, + "grad_norm": 0.06730494054695071, + "learning_rate": 1.0687866211696929e-07, + "loss": 0.8774, + "step": 32330 + }, + { + "epoch": 6.896544368600683, + "grad_norm": 0.07318051835702155, + "learning_rate": 1.0643938218895422e-07, + "loss": 0.8796, + "step": 32331 + }, + { + "epoch": 6.896757679180888, + "grad_norm": 0.0720637414769725, + "learning_rate": 1.0600100625792309e-07, + "loss": 0.8936, + "step": 32332 + }, + { + "epoch": 6.896970989761092, + "grad_norm": 0.07064813513910638, + "learning_rate": 1.0556353432883192e-07, + "loss": 0.8967, + "step": 32333 + }, + { + "epoch": 6.897184300341297, + "grad_norm": 0.06835637520571544, + "learning_rate": 1.0512696640663678e-07, + "loss": 0.8999, + "step": 32334 + }, + { + "epoch": 6.897397610921502, + "grad_norm": 0.07062808881832298, + "learning_rate": 1.0469130249627591e-07, + "loss": 0.8728, + "step": 32335 + }, + { + "epoch": 6.897610921501706, + "grad_norm": 0.06853755475239182, + "learning_rate": 1.0425654260266982e-07, + "loss": 0.8694, + "step": 32336 + }, + { + "epoch": 6.897824232081911, + "grad_norm": 0.07009415823538125, + "learning_rate": 1.0382268673074791e-07, + "loss": 0.887, + "step": 32337 + }, + { + "epoch": 6.898037542662116, + "grad_norm": 0.07495737241055882, + "learning_rate": 1.033897348854218e-07, + "loss": 0.8878, + "step": 32338 + }, + { + "epoch": 6.89825085324232, + "grad_norm": 0.07250636870585955, + "learning_rate": 1.029576870715765e-07, + "loss": 0.9232, + "step": 32339 + }, + { + "epoch": 6.898464163822526, + "grad_norm": 0.07171068479395919, + "learning_rate": 1.025265432941147e-07, + "loss": 0.8953, + "step": 32340 + }, + { + "epoch": 6.898677474402731, + "grad_norm": 0.07205678550758098, + "learning_rate": 1.0209630355790368e-07, + "loss": 0.8993, + "step": 32341 + }, + { + "epoch": 6.898890784982935, + "grad_norm": 0.06951805532801382, + "learning_rate": 1.0166696786781949e-07, + "loss": 0.8823, + "step": 32342 + }, + { + "epoch": 6.89910409556314, + "grad_norm": 0.0722744214596611, + "learning_rate": 1.0123853622872049e-07, + "loss": 0.8854, + "step": 32343 + }, + { + "epoch": 6.899317406143345, + "grad_norm": 0.0696082547008598, + "learning_rate": 1.0081100864544724e-07, + "loss": 0.8862, + "step": 32344 + }, + { + "epoch": 6.899530716723549, + "grad_norm": 0.06983981216593967, + "learning_rate": 1.0038438512284032e-07, + "loss": 0.8646, + "step": 32345 + }, + { + "epoch": 6.899744027303754, + "grad_norm": 0.0714093049910572, + "learning_rate": 9.995866566573143e-08, + "loss": 0.8725, + "step": 32346 + }, + { + "epoch": 6.899957337883959, + "grad_norm": 0.0730732818986622, + "learning_rate": 9.953385027893447e-08, + "loss": 0.9009, + "step": 32347 + }, + { + "epoch": 6.900170648464163, + "grad_norm": 0.07720825588957102, + "learning_rate": 9.910993896725451e-08, + "loss": 0.8895, + "step": 32348 + }, + { + "epoch": 6.900383959044369, + "grad_norm": 0.0712341599490792, + "learning_rate": 9.868693173548771e-08, + "loss": 0.8993, + "step": 32349 + }, + { + "epoch": 6.900597269624574, + "grad_norm": 0.06854669637589503, + "learning_rate": 9.826482858842135e-08, + "loss": 0.8982, + "step": 32350 + }, + { + "epoch": 6.900810580204778, + "grad_norm": 0.06671383378445093, + "learning_rate": 9.784362953083382e-08, + "loss": 0.8403, + "step": 32351 + }, + { + "epoch": 6.901023890784983, + "grad_norm": 0.06734212928996425, + "learning_rate": 9.742333456749464e-08, + "loss": 0.876, + "step": 32352 + }, + { + "epoch": 6.901237201365188, + "grad_norm": 0.07025312669993564, + "learning_rate": 9.70039437031467e-08, + "loss": 0.885, + "step": 32353 + }, + { + "epoch": 6.901450511945392, + "grad_norm": 0.0689242275238754, + "learning_rate": 9.658545694255062e-08, + "loss": 0.8881, + "step": 32354 + }, + { + "epoch": 6.901663822525597, + "grad_norm": 0.07358839733514964, + "learning_rate": 9.61678742904315e-08, + "loss": 0.8682, + "step": 32355 + }, + { + "epoch": 6.901877133105802, + "grad_norm": 0.07327922082559099, + "learning_rate": 9.575119575151449e-08, + "loss": 0.9007, + "step": 32356 + }, + { + "epoch": 6.902090443686006, + "grad_norm": 0.06801779066605511, + "learning_rate": 9.533542133051577e-08, + "loss": 0.8649, + "step": 32357 + }, + { + "epoch": 6.902303754266212, + "grad_norm": 0.07891474365003949, + "learning_rate": 9.492055103214271e-08, + "loss": 0.8931, + "step": 32358 + }, + { + "epoch": 6.902517064846417, + "grad_norm": 0.07110463258737737, + "learning_rate": 9.450658486109376e-08, + "loss": 0.888, + "step": 32359 + }, + { + "epoch": 6.902730375426621, + "grad_norm": 0.07149007957203772, + "learning_rate": 9.409352282204076e-08, + "loss": 0.8964, + "step": 32360 + }, + { + "epoch": 6.902943686006826, + "grad_norm": 0.07379107294552809, + "learning_rate": 9.368136491965551e-08, + "loss": 0.8961, + "step": 32361 + }, + { + "epoch": 6.903156996587031, + "grad_norm": 0.07215314892710373, + "learning_rate": 9.327011115861873e-08, + "loss": 0.8638, + "step": 32362 + }, + { + "epoch": 6.903370307167235, + "grad_norm": 0.0711811357205943, + "learning_rate": 9.285976154357556e-08, + "loss": 0.9182, + "step": 32363 + }, + { + "epoch": 6.90358361774744, + "grad_norm": 0.07058014721447246, + "learning_rate": 9.245031607917121e-08, + "loss": 0.8884, + "step": 32364 + }, + { + "epoch": 6.903796928327645, + "grad_norm": 0.07093342803538864, + "learning_rate": 9.204177477003307e-08, + "loss": 0.861, + "step": 32365 + }, + { + "epoch": 6.9040102389078495, + "grad_norm": 0.07420956957810973, + "learning_rate": 9.163413762077967e-08, + "loss": 0.9119, + "step": 32366 + }, + { + "epoch": 6.904223549488055, + "grad_norm": 0.06904669349270709, + "learning_rate": 9.122740463603841e-08, + "loss": 0.8397, + "step": 32367 + }, + { + "epoch": 6.90443686006826, + "grad_norm": 0.07128490042520096, + "learning_rate": 9.08215758203923e-08, + "loss": 0.905, + "step": 32368 + }, + { + "epoch": 6.904650170648464, + "grad_norm": 0.07179532050548854, + "learning_rate": 9.041665117845988e-08, + "loss": 0.8673, + "step": 32369 + }, + { + "epoch": 6.904863481228669, + "grad_norm": 0.07251837328050408, + "learning_rate": 9.001263071479749e-08, + "loss": 0.8799, + "step": 32370 + }, + { + "epoch": 6.905076791808874, + "grad_norm": 0.07164339604244027, + "learning_rate": 8.960951443398813e-08, + "loss": 0.8874, + "step": 32371 + }, + { + "epoch": 6.9052901023890785, + "grad_norm": 0.07103993116462351, + "learning_rate": 8.920730234058816e-08, + "loss": 0.8772, + "step": 32372 + }, + { + "epoch": 6.905503412969283, + "grad_norm": 0.06702908076999144, + "learning_rate": 8.880599443915395e-08, + "loss": 0.8721, + "step": 32373 + }, + { + "epoch": 6.905716723549488, + "grad_norm": 0.06863359914730989, + "learning_rate": 8.840559073422405e-08, + "loss": 0.8941, + "step": 32374 + }, + { + "epoch": 6.9059300341296925, + "grad_norm": 0.07242968431615852, + "learning_rate": 8.800609123032822e-08, + "loss": 0.8777, + "step": 32375 + }, + { + "epoch": 6.906143344709898, + "grad_norm": 0.06876623771146896, + "learning_rate": 8.760749593198725e-08, + "loss": 0.8764, + "step": 32376 + }, + { + "epoch": 6.906356655290102, + "grad_norm": 0.06951393488176713, + "learning_rate": 8.720980484371311e-08, + "loss": 0.8621, + "step": 32377 + }, + { + "epoch": 6.9065699658703075, + "grad_norm": 0.0741364403500954, + "learning_rate": 8.681301797000885e-08, + "loss": 0.8547, + "step": 32378 + }, + { + "epoch": 6.906783276450512, + "grad_norm": 0.06875599386958203, + "learning_rate": 8.64171353153509e-08, + "loss": 0.852, + "step": 32379 + }, + { + "epoch": 6.906996587030717, + "grad_norm": 0.06922345251499717, + "learning_rate": 8.602215688423343e-08, + "loss": 0.8827, + "step": 32380 + }, + { + "epoch": 6.9072098976109215, + "grad_norm": 0.0709533524579577, + "learning_rate": 8.562808268111511e-08, + "loss": 0.8751, + "step": 32381 + }, + { + "epoch": 6.907423208191126, + "grad_norm": 0.07226839566810049, + "learning_rate": 8.523491271046346e-08, + "loss": 0.8707, + "step": 32382 + }, + { + "epoch": 6.907636518771331, + "grad_norm": 0.07287532177111794, + "learning_rate": 8.48426469767194e-08, + "loss": 0.9005, + "step": 32383 + }, + { + "epoch": 6.907849829351536, + "grad_norm": 0.06881985265665296, + "learning_rate": 8.44512854843238e-08, + "loss": 0.8741, + "step": 32384 + }, + { + "epoch": 6.90806313993174, + "grad_norm": 0.07504272409776032, + "learning_rate": 8.406082823770866e-08, + "loss": 0.899, + "step": 32385 + }, + { + "epoch": 6.908276450511945, + "grad_norm": 0.07353900029455702, + "learning_rate": 8.367127524128827e-08, + "loss": 0.906, + "step": 32386 + }, + { + "epoch": 6.9084897610921505, + "grad_norm": 0.06961199850045047, + "learning_rate": 8.328262649947683e-08, + "loss": 0.8891, + "step": 32387 + }, + { + "epoch": 6.908703071672355, + "grad_norm": 0.07421654306779008, + "learning_rate": 8.289488201666196e-08, + "loss": 0.876, + "step": 32388 + }, + { + "epoch": 6.90891638225256, + "grad_norm": 0.07207391934278097, + "learning_rate": 8.250804179723126e-08, + "loss": 0.8743, + "step": 32389 + }, + { + "epoch": 6.909129692832765, + "grad_norm": 0.07066188778836421, + "learning_rate": 8.212210584557234e-08, + "loss": 0.874, + "step": 32390 + }, + { + "epoch": 6.909343003412969, + "grad_norm": 0.07394588825622643, + "learning_rate": 8.173707416604615e-08, + "loss": 0.8923, + "step": 32391 + }, + { + "epoch": 6.909556313993174, + "grad_norm": 0.0709430244830677, + "learning_rate": 8.135294676300475e-08, + "loss": 0.8836, + "step": 32392 + }, + { + "epoch": 6.909769624573379, + "grad_norm": 0.06748658140397082, + "learning_rate": 8.096972364080912e-08, + "loss": 0.8711, + "step": 32393 + }, + { + "epoch": 6.909982935153583, + "grad_norm": 0.06937855768386254, + "learning_rate": 8.058740480377581e-08, + "loss": 0.8675, + "step": 32394 + }, + { + "epoch": 6.910196245733788, + "grad_norm": 0.07176178503310073, + "learning_rate": 8.020599025624798e-08, + "loss": 0.9034, + "step": 32395 + }, + { + "epoch": 6.910409556313994, + "grad_norm": 0.07080373386633726, + "learning_rate": 7.982548000252443e-08, + "loss": 0.908, + "step": 32396 + }, + { + "epoch": 6.910622866894198, + "grad_norm": 0.07247521280999569, + "learning_rate": 7.944587404692172e-08, + "loss": 0.8998, + "step": 32397 + }, + { + "epoch": 6.910836177474403, + "grad_norm": 0.06786808560076196, + "learning_rate": 7.906717239373862e-08, + "loss": 0.8775, + "step": 32398 + }, + { + "epoch": 6.911049488054608, + "grad_norm": 0.07362087712410265, + "learning_rate": 7.868937504725616e-08, + "loss": 0.9137, + "step": 32399 + }, + { + "epoch": 6.911262798634812, + "grad_norm": 0.06928865982440104, + "learning_rate": 7.831248201173758e-08, + "loss": 0.8691, + "step": 32400 + }, + { + "epoch": 6.911476109215017, + "grad_norm": 0.07251807674507484, + "learning_rate": 7.793649329146391e-08, + "loss": 0.8693, + "step": 32401 + }, + { + "epoch": 6.911689419795222, + "grad_norm": 0.07109693527587256, + "learning_rate": 7.756140889068065e-08, + "loss": 0.8592, + "step": 32402 + }, + { + "epoch": 6.911902730375426, + "grad_norm": 0.07427774451902382, + "learning_rate": 7.718722881364216e-08, + "loss": 0.8793, + "step": 32403 + }, + { + "epoch": 6.912116040955631, + "grad_norm": 0.06739517814005265, + "learning_rate": 7.681395306456729e-08, + "loss": 0.8688, + "step": 32404 + }, + { + "epoch": 6.912329351535837, + "grad_norm": 0.06837168277556871, + "learning_rate": 7.644158164768379e-08, + "loss": 0.9021, + "step": 32405 + }, + { + "epoch": 6.912542662116041, + "grad_norm": 0.07353604683823421, + "learning_rate": 7.607011456721935e-08, + "loss": 0.8859, + "step": 32406 + }, + { + "epoch": 6.912755972696246, + "grad_norm": 0.0682406469160041, + "learning_rate": 7.569955182735733e-08, + "loss": 0.8994, + "step": 32407 + }, + { + "epoch": 6.912969283276451, + "grad_norm": 0.07157527985890065, + "learning_rate": 7.53298934322988e-08, + "loss": 0.8745, + "step": 32408 + }, + { + "epoch": 6.913182593856655, + "grad_norm": 0.06932685199646337, + "learning_rate": 7.496113938622707e-08, + "loss": 0.8751, + "step": 32409 + }, + { + "epoch": 6.91339590443686, + "grad_norm": 0.07370967298841634, + "learning_rate": 7.459328969331659e-08, + "loss": 0.8923, + "step": 32410 + }, + { + "epoch": 6.913609215017065, + "grad_norm": 0.07179694676953484, + "learning_rate": 7.42263443577329e-08, + "loss": 0.886, + "step": 32411 + }, + { + "epoch": 6.913822525597269, + "grad_norm": 0.06892600700142189, + "learning_rate": 7.386030338361494e-08, + "loss": 0.8931, + "step": 32412 + }, + { + "epoch": 6.914035836177474, + "grad_norm": 0.0707221121801565, + "learning_rate": 7.349516677511048e-08, + "loss": 0.874, + "step": 32413 + }, + { + "epoch": 6.91424914675768, + "grad_norm": 0.06987454007035096, + "learning_rate": 7.313093453635845e-08, + "loss": 0.8855, + "step": 32414 + }, + { + "epoch": 6.914462457337884, + "grad_norm": 0.07005703905036668, + "learning_rate": 7.27676066714711e-08, + "loss": 0.86, + "step": 32415 + }, + { + "epoch": 6.914675767918089, + "grad_norm": 0.076390135478706, + "learning_rate": 7.240518318456069e-08, + "loss": 0.8841, + "step": 32416 + }, + { + "epoch": 6.914889078498294, + "grad_norm": 0.07296067508686988, + "learning_rate": 7.20436640797395e-08, + "loss": 0.8848, + "step": 32417 + }, + { + "epoch": 6.915102389078498, + "grad_norm": 0.07154054664000468, + "learning_rate": 7.16830493610754e-08, + "loss": 0.8562, + "step": 32418 + }, + { + "epoch": 6.915315699658703, + "grad_norm": 0.06985578232259038, + "learning_rate": 7.132333903267175e-08, + "loss": 0.8733, + "step": 32419 + }, + { + "epoch": 6.915529010238908, + "grad_norm": 0.07246794799527732, + "learning_rate": 7.096453309858753e-08, + "loss": 0.8672, + "step": 32420 + }, + { + "epoch": 6.915742320819112, + "grad_norm": 0.07014563909862707, + "learning_rate": 7.060663156288172e-08, + "loss": 0.8742, + "step": 32421 + }, + { + "epoch": 6.915955631399317, + "grad_norm": 0.0671329379185013, + "learning_rate": 7.024963442960442e-08, + "loss": 0.8492, + "step": 32422 + }, + { + "epoch": 6.916168941979522, + "grad_norm": 0.06813153009871067, + "learning_rate": 6.989354170280572e-08, + "loss": 0.8824, + "step": 32423 + }, + { + "epoch": 6.9163822525597265, + "grad_norm": 0.07125239948927269, + "learning_rate": 6.953835338649128e-08, + "loss": 0.8806, + "step": 32424 + }, + { + "epoch": 6.916595563139932, + "grad_norm": 0.0745874779557569, + "learning_rate": 6.918406948471123e-08, + "loss": 0.8969, + "step": 32425 + }, + { + "epoch": 6.916808873720137, + "grad_norm": 0.06979652986059896, + "learning_rate": 6.88306900014446e-08, + "loss": 0.8633, + "step": 32426 + }, + { + "epoch": 6.917022184300341, + "grad_norm": 0.0662718289675497, + "learning_rate": 6.847821494070595e-08, + "loss": 0.8595, + "step": 32427 + }, + { + "epoch": 6.917235494880546, + "grad_norm": 0.07029138467365889, + "learning_rate": 6.812664430648319e-08, + "loss": 0.8518, + "step": 32428 + }, + { + "epoch": 6.917448805460751, + "grad_norm": 0.07215493868847091, + "learning_rate": 6.777597810275538e-08, + "loss": 0.8796, + "step": 32429 + }, + { + "epoch": 6.9176621160409555, + "grad_norm": 0.07109716444281275, + "learning_rate": 6.742621633348378e-08, + "loss": 0.9016, + "step": 32430 + }, + { + "epoch": 6.91787542662116, + "grad_norm": 0.07077681270502861, + "learning_rate": 6.707735900262968e-08, + "loss": 0.8968, + "step": 32431 + }, + { + "epoch": 6.918088737201365, + "grad_norm": 0.06939598063345534, + "learning_rate": 6.672940611413658e-08, + "loss": 0.8654, + "step": 32432 + }, + { + "epoch": 6.9183020477815695, + "grad_norm": 0.07011397459636017, + "learning_rate": 6.638235767194801e-08, + "loss": 0.8874, + "step": 32433 + }, + { + "epoch": 6.918515358361775, + "grad_norm": 0.06761294362019935, + "learning_rate": 6.603621367998969e-08, + "loss": 0.9018, + "step": 32434 + }, + { + "epoch": 6.91872866894198, + "grad_norm": 0.07316615468594169, + "learning_rate": 6.569097414216962e-08, + "loss": 0.8881, + "step": 32435 + }, + { + "epoch": 6.9189419795221845, + "grad_norm": 0.06912723215637867, + "learning_rate": 6.534663906240469e-08, + "loss": 0.8652, + "step": 32436 + }, + { + "epoch": 6.919155290102389, + "grad_norm": 0.06956757347959776, + "learning_rate": 6.500320844458507e-08, + "loss": 0.8576, + "step": 32437 + }, + { + "epoch": 6.919368600682594, + "grad_norm": 0.06789715003209394, + "learning_rate": 6.466068229260103e-08, + "loss": 0.8966, + "step": 32438 + }, + { + "epoch": 6.9195819112627985, + "grad_norm": 0.07167935746701902, + "learning_rate": 6.4319060610325e-08, + "loss": 0.8879, + "step": 32439 + }, + { + "epoch": 6.919795221843003, + "grad_norm": 0.07357456889679466, + "learning_rate": 6.397834340162945e-08, + "loss": 0.911, + "step": 32440 + }, + { + "epoch": 6.920008532423208, + "grad_norm": 0.06839070987087072, + "learning_rate": 6.36385306703513e-08, + "loss": 0.8956, + "step": 32441 + }, + { + "epoch": 6.920221843003413, + "grad_norm": 0.0685199553333184, + "learning_rate": 6.329962242035415e-08, + "loss": 0.8732, + "step": 32442 + }, + { + "epoch": 6.920435153583618, + "grad_norm": 0.07325307999595773, + "learning_rate": 6.296161865546601e-08, + "loss": 0.8853, + "step": 32443 + }, + { + "epoch": 6.920648464163823, + "grad_norm": 0.0685092400044725, + "learning_rate": 6.262451937951497e-08, + "loss": 0.8528, + "step": 32444 + }, + { + "epoch": 6.9208617747440275, + "grad_norm": 0.07089290641129763, + "learning_rate": 6.228832459631129e-08, + "loss": 0.9144, + "step": 32445 + }, + { + "epoch": 6.921075085324232, + "grad_norm": 0.06710705408682315, + "learning_rate": 6.195303430965638e-08, + "loss": 0.8505, + "step": 32446 + }, + { + "epoch": 6.921288395904437, + "grad_norm": 0.06767202699303147, + "learning_rate": 6.161864852334276e-08, + "loss": 0.8826, + "step": 32447 + }, + { + "epoch": 6.921501706484642, + "grad_norm": 0.06937706704303515, + "learning_rate": 6.128516724117183e-08, + "loss": 0.873, + "step": 32448 + }, + { + "epoch": 6.921715017064846, + "grad_norm": 0.06858574520206688, + "learning_rate": 6.095259046689173e-08, + "loss": 0.8981, + "step": 32449 + }, + { + "epoch": 6.921928327645051, + "grad_norm": 0.0683209504716957, + "learning_rate": 6.062091820428606e-08, + "loss": 0.8987, + "step": 32450 + }, + { + "epoch": 6.922141638225256, + "grad_norm": 0.07086810506896928, + "learning_rate": 6.029015045709407e-08, + "loss": 0.8802, + "step": 32451 + }, + { + "epoch": 6.922354948805461, + "grad_norm": 0.07166990091727736, + "learning_rate": 5.996028722906389e-08, + "loss": 0.8787, + "step": 32452 + }, + { + "epoch": 6.922568259385666, + "grad_norm": 0.06907546944243838, + "learning_rate": 5.963132852392584e-08, + "loss": 0.8799, + "step": 32453 + }, + { + "epoch": 6.922781569965871, + "grad_norm": 0.06837434011543618, + "learning_rate": 5.9303274345410275e-08, + "loss": 0.8685, + "step": 32454 + }, + { + "epoch": 6.922994880546075, + "grad_norm": 0.07413671819059608, + "learning_rate": 5.8976124697220915e-08, + "loss": 0.8978, + "step": 32455 + }, + { + "epoch": 6.92320819112628, + "grad_norm": 0.07006338360840748, + "learning_rate": 5.864987958306145e-08, + "loss": 0.8835, + "step": 32456 + }, + { + "epoch": 6.923421501706485, + "grad_norm": 0.07636285286749321, + "learning_rate": 5.832453900661783e-08, + "loss": 0.8985, + "step": 32457 + }, + { + "epoch": 6.923634812286689, + "grad_norm": 0.07228369756316833, + "learning_rate": 5.800010297158487e-08, + "loss": 0.9162, + "step": 32458 + }, + { + "epoch": 6.923848122866894, + "grad_norm": 0.07207969822170476, + "learning_rate": 5.767657148162187e-08, + "loss": 0.8875, + "step": 32459 + }, + { + "epoch": 6.924061433447099, + "grad_norm": 0.07104781301978344, + "learning_rate": 5.7353944540388115e-08, + "loss": 0.8792, + "step": 32460 + }, + { + "epoch": 6.924274744027304, + "grad_norm": 0.07038021692424344, + "learning_rate": 5.703222215154292e-08, + "loss": 0.8848, + "step": 32461 + }, + { + "epoch": 6.924488054607508, + "grad_norm": 0.07170994535912908, + "learning_rate": 5.671140431872779e-08, + "loss": 0.8718, + "step": 32462 + }, + { + "epoch": 6.924701365187714, + "grad_norm": 0.06946131729896332, + "learning_rate": 5.6391491045557634e-08, + "loss": 0.8744, + "step": 32463 + }, + { + "epoch": 6.924914675767918, + "grad_norm": 0.07021394039058039, + "learning_rate": 5.607248233567397e-08, + "loss": 0.9071, + "step": 32464 + }, + { + "epoch": 6.925127986348123, + "grad_norm": 0.06722938134366939, + "learning_rate": 5.575437819266505e-08, + "loss": 0.8652, + "step": 32465 + }, + { + "epoch": 6.925341296928328, + "grad_norm": 0.07075392893032424, + "learning_rate": 5.543717862014575e-08, + "loss": 0.8972, + "step": 32466 + }, + { + "epoch": 6.925554607508532, + "grad_norm": 0.06800779007307878, + "learning_rate": 5.512088362169543e-08, + "loss": 0.8691, + "step": 32467 + }, + { + "epoch": 6.925767918088737, + "grad_norm": 0.06996230116623, + "learning_rate": 5.480549320090234e-08, + "loss": 0.8609, + "step": 32468 + }, + { + "epoch": 6.925981228668942, + "grad_norm": 0.07260781695811662, + "learning_rate": 5.449100736132806e-08, + "loss": 0.8676, + "step": 32469 + }, + { + "epoch": 6.926194539249146, + "grad_norm": 0.0709494112061823, + "learning_rate": 5.4177426106525325e-08, + "loss": 0.8733, + "step": 32470 + }, + { + "epoch": 6.926407849829351, + "grad_norm": 0.06997150636424686, + "learning_rate": 5.386474944005571e-08, + "loss": 0.8817, + "step": 32471 + }, + { + "epoch": 6.926621160409557, + "grad_norm": 0.06945127268496966, + "learning_rate": 5.355297736545417e-08, + "loss": 0.9177, + "step": 32472 + }, + { + "epoch": 6.926834470989761, + "grad_norm": 0.06634676985302955, + "learning_rate": 5.324210988624679e-08, + "loss": 0.8813, + "step": 32473 + }, + { + "epoch": 6.927047781569966, + "grad_norm": 0.07188507094625309, + "learning_rate": 5.293214700594185e-08, + "loss": 0.8736, + "step": 32474 + }, + { + "epoch": 6.927261092150171, + "grad_norm": 0.07001516865549905, + "learning_rate": 5.262308872805655e-08, + "loss": 0.8709, + "step": 32475 + }, + { + "epoch": 6.927474402730375, + "grad_norm": 0.0710779833824509, + "learning_rate": 5.231493505609031e-08, + "loss": 0.8615, + "step": 32476 + }, + { + "epoch": 6.92768771331058, + "grad_norm": 0.07119295249735666, + "learning_rate": 5.2007685993524794e-08, + "loss": 0.8925, + "step": 32477 + }, + { + "epoch": 6.927901023890785, + "grad_norm": 0.07092923977940478, + "learning_rate": 5.170134154383277e-08, + "loss": 0.895, + "step": 32478 + }, + { + "epoch": 6.9281143344709895, + "grad_norm": 0.07323848975164304, + "learning_rate": 5.139590171048703e-08, + "loss": 0.8826, + "step": 32479 + }, + { + "epoch": 6.928327645051194, + "grad_norm": 0.0718949249398631, + "learning_rate": 5.109136649693369e-08, + "loss": 0.9086, + "step": 32480 + }, + { + "epoch": 6.9285409556314, + "grad_norm": 0.06929261263496898, + "learning_rate": 5.078773590663666e-08, + "loss": 0.8641, + "step": 32481 + }, + { + "epoch": 6.928754266211604, + "grad_norm": 0.07441256209115459, + "learning_rate": 5.048500994301542e-08, + "loss": 0.8877, + "step": 32482 + }, + { + "epoch": 6.928967576791809, + "grad_norm": 0.06648571480614553, + "learning_rate": 5.0183188609507214e-08, + "loss": 0.9081, + "step": 32483 + }, + { + "epoch": 6.929180887372014, + "grad_norm": 0.07109839595227489, + "learning_rate": 4.988227190951378e-08, + "loss": 0.8809, + "step": 32484 + }, + { + "epoch": 6.9293941979522184, + "grad_norm": 0.06770844683756817, + "learning_rate": 4.9582259846454596e-08, + "loss": 0.8507, + "step": 32485 + }, + { + "epoch": 6.929607508532423, + "grad_norm": 0.07402469017460603, + "learning_rate": 4.928315242371362e-08, + "loss": 0.8929, + "step": 32486 + }, + { + "epoch": 6.929820819112628, + "grad_norm": 0.07073653459971864, + "learning_rate": 4.8984949644674816e-08, + "loss": 0.8689, + "step": 32487 + }, + { + "epoch": 6.9300341296928325, + "grad_norm": 0.07240684604902817, + "learning_rate": 4.868765151272214e-08, + "loss": 0.8576, + "step": 32488 + }, + { + "epoch": 6.930247440273037, + "grad_norm": 0.07023256968504445, + "learning_rate": 4.8391258031212915e-08, + "loss": 0.9054, + "step": 32489 + }, + { + "epoch": 6.930460750853243, + "grad_norm": 0.07380355367077213, + "learning_rate": 4.809576920349557e-08, + "loss": 0.9018, + "step": 32490 + }, + { + "epoch": 6.930674061433447, + "grad_norm": 0.06945601406246309, + "learning_rate": 4.780118503292741e-08, + "loss": 0.8827, + "step": 32491 + }, + { + "epoch": 6.930887372013652, + "grad_norm": 0.0678865200788852, + "learning_rate": 4.7507505522830235e-08, + "loss": 0.842, + "step": 32492 + }, + { + "epoch": 6.931100682593857, + "grad_norm": 0.07073396884388138, + "learning_rate": 4.721473067652582e-08, + "loss": 0.8817, + "step": 32493 + }, + { + "epoch": 6.9313139931740615, + "grad_norm": 0.06978126701552075, + "learning_rate": 4.692286049733596e-08, + "loss": 0.8662, + "step": 32494 + }, + { + "epoch": 6.931527303754266, + "grad_norm": 0.0720578181622908, + "learning_rate": 4.6631894988555804e-08, + "loss": 0.8868, + "step": 32495 + }, + { + "epoch": 6.931740614334471, + "grad_norm": 0.07397295186735109, + "learning_rate": 4.6341834153489364e-08, + "loss": 0.8693, + "step": 32496 + }, + { + "epoch": 6.9319539249146755, + "grad_norm": 0.06717295194753091, + "learning_rate": 4.6052677995396256e-08, + "loss": 0.8739, + "step": 32497 + }, + { + "epoch": 6.93216723549488, + "grad_norm": 0.07135268863038245, + "learning_rate": 4.576442651757162e-08, + "loss": 0.9307, + "step": 32498 + }, + { + "epoch": 6.932380546075086, + "grad_norm": 0.06806482581270211, + "learning_rate": 4.5477079723266206e-08, + "loss": 0.8394, + "step": 32499 + }, + { + "epoch": 6.9325938566552905, + "grad_norm": 0.07179625659639154, + "learning_rate": 4.5190637615730726e-08, + "loss": 0.8725, + "step": 32500 + }, + { + "epoch": 6.932807167235495, + "grad_norm": 0.07044750180603478, + "learning_rate": 4.490510019820704e-08, + "loss": 0.8439, + "step": 32501 + }, + { + "epoch": 6.9330204778157, + "grad_norm": 0.070484735705431, + "learning_rate": 4.462046747392812e-08, + "loss": 0.8802, + "step": 32502 + }, + { + "epoch": 6.9332337883959045, + "grad_norm": 0.0707198788027287, + "learning_rate": 4.433673944611805e-08, + "loss": 0.8656, + "step": 32503 + }, + { + "epoch": 6.933447098976109, + "grad_norm": 0.07382282958379767, + "learning_rate": 4.405391611797427e-08, + "loss": 0.914, + "step": 32504 + }, + { + "epoch": 6.933660409556314, + "grad_norm": 0.06891258631789318, + "learning_rate": 4.377199749271199e-08, + "loss": 0.8726, + "step": 32505 + }, + { + "epoch": 6.933873720136519, + "grad_norm": 0.07623732667133702, + "learning_rate": 4.3490983573510894e-08, + "loss": 0.8601, + "step": 32506 + }, + { + "epoch": 6.934087030716723, + "grad_norm": 0.06783990541418886, + "learning_rate": 4.321087436355953e-08, + "loss": 0.8683, + "step": 32507 + }, + { + "epoch": 6.934300341296928, + "grad_norm": 0.06853284126140564, + "learning_rate": 4.293166986601094e-08, + "loss": 0.8614, + "step": 32508 + }, + { + "epoch": 6.934513651877133, + "grad_norm": 0.07080992222283511, + "learning_rate": 4.2653370084044796e-08, + "loss": 0.8776, + "step": 32509 + }, + { + "epoch": 6.934726962457338, + "grad_norm": 0.074466114865197, + "learning_rate": 4.237597502080526e-08, + "loss": 0.8847, + "step": 32510 + }, + { + "epoch": 6.934940273037543, + "grad_norm": 0.07022506175962011, + "learning_rate": 4.2099484679418713e-08, + "loss": 0.8395, + "step": 32511 + }, + { + "epoch": 6.935153583617748, + "grad_norm": 0.06902787949813421, + "learning_rate": 4.1823899063020425e-08, + "loss": 0.8872, + "step": 32512 + }, + { + "epoch": 6.935366894197952, + "grad_norm": 0.06993139318774752, + "learning_rate": 4.15492181747279e-08, + "loss": 0.889, + "step": 32513 + }, + { + "epoch": 6.935580204778157, + "grad_norm": 0.07481742698281933, + "learning_rate": 4.127544201765865e-08, + "loss": 0.8884, + "step": 32514 + }, + { + "epoch": 6.935793515358362, + "grad_norm": 0.07258715944731117, + "learning_rate": 4.100257059489465e-08, + "loss": 0.8569, + "step": 32515 + }, + { + "epoch": 6.936006825938566, + "grad_norm": 0.0702684613968795, + "learning_rate": 4.0730603909535646e-08, + "loss": 0.8654, + "step": 32516 + }, + { + "epoch": 6.936220136518771, + "grad_norm": 0.07037257791934651, + "learning_rate": 4.045954196465474e-08, + "loss": 0.8562, + "step": 32517 + }, + { + "epoch": 6.936433447098976, + "grad_norm": 0.06849859242041273, + "learning_rate": 4.018938476331613e-08, + "loss": 0.8876, + "step": 32518 + }, + { + "epoch": 6.936646757679181, + "grad_norm": 0.07001452566804679, + "learning_rate": 3.992013230858405e-08, + "loss": 0.9091, + "step": 32519 + }, + { + "epoch": 6.936860068259386, + "grad_norm": 0.07429988003078777, + "learning_rate": 3.965178460349606e-08, + "loss": 0.8984, + "step": 32520 + }, + { + "epoch": 6.937073378839591, + "grad_norm": 0.06906910333940608, + "learning_rate": 3.938434165109861e-08, + "loss": 0.8938, + "step": 32521 + }, + { + "epoch": 6.937286689419795, + "grad_norm": 0.06680563274436883, + "learning_rate": 3.911780345441152e-08, + "loss": 0.8791, + "step": 32522 + }, + { + "epoch": 6.9375, + "grad_norm": 0.06947283399489465, + "learning_rate": 3.88521700164457e-08, + "loss": 0.8858, + "step": 32523 + }, + { + "epoch": 6.937713310580205, + "grad_norm": 0.07301953873289779, + "learning_rate": 3.8587441340220966e-08, + "loss": 0.8691, + "step": 32524 + }, + { + "epoch": 6.937926621160409, + "grad_norm": 0.06973830465274262, + "learning_rate": 3.832361742871271e-08, + "loss": 0.8857, + "step": 32525 + }, + { + "epoch": 6.938139931740614, + "grad_norm": 0.06870491304817879, + "learning_rate": 3.806069828493186e-08, + "loss": 0.8833, + "step": 32526 + }, + { + "epoch": 6.938353242320819, + "grad_norm": 0.07017279594817749, + "learning_rate": 3.779868391182717e-08, + "loss": 0.857, + "step": 32527 + }, + { + "epoch": 6.938566552901024, + "grad_norm": 0.07443970561126063, + "learning_rate": 3.753757431238292e-08, + "loss": 0.9095, + "step": 32528 + }, + { + "epoch": 6.938779863481229, + "grad_norm": 0.07119831409264982, + "learning_rate": 3.727736948953897e-08, + "loss": 0.8748, + "step": 32529 + }, + { + "epoch": 6.938993174061434, + "grad_norm": 0.07037899972736077, + "learning_rate": 3.701806944625297e-08, + "loss": 0.8667, + "step": 32530 + }, + { + "epoch": 6.939206484641638, + "grad_norm": 0.06902801190222053, + "learning_rate": 3.675967418544701e-08, + "loss": 0.8742, + "step": 32531 + }, + { + "epoch": 6.939419795221843, + "grad_norm": 0.0695294166328845, + "learning_rate": 3.650218371006098e-08, + "loss": 0.8726, + "step": 32532 + }, + { + "epoch": 6.939633105802048, + "grad_norm": 0.06890736006283006, + "learning_rate": 3.6245598022990324e-08, + "loss": 0.8634, + "step": 32533 + }, + { + "epoch": 6.939846416382252, + "grad_norm": 0.07049105405441991, + "learning_rate": 3.59899171271394e-08, + "loss": 0.891, + "step": 32534 + }, + { + "epoch": 6.940059726962457, + "grad_norm": 0.06955714616305267, + "learning_rate": 3.5735141025421415e-08, + "loss": 0.8926, + "step": 32535 + }, + { + "epoch": 6.940273037542662, + "grad_norm": 0.07660024234761208, + "learning_rate": 3.5481269720696324e-08, + "loss": 0.8961, + "step": 32536 + }, + { + "epoch": 6.940486348122867, + "grad_norm": 0.06871635039684447, + "learning_rate": 3.5228303215841806e-08, + "loss": 0.9097, + "step": 32537 + }, + { + "epoch": 6.940699658703072, + "grad_norm": 0.07117145385253393, + "learning_rate": 3.4976241513735576e-08, + "loss": 0.8719, + "step": 32538 + }, + { + "epoch": 6.940912969283277, + "grad_norm": 0.06909553553335376, + "learning_rate": 3.4725084617210914e-08, + "loss": 0.9131, + "step": 32539 + }, + { + "epoch": 6.941126279863481, + "grad_norm": 0.0715392395869208, + "learning_rate": 3.447483252911887e-08, + "loss": 0.8918, + "step": 32540 + }, + { + "epoch": 6.941339590443686, + "grad_norm": 0.06978047655627094, + "learning_rate": 3.422548525228386e-08, + "loss": 0.8631, + "step": 32541 + }, + { + "epoch": 6.941552901023891, + "grad_norm": 0.06941251868014474, + "learning_rate": 3.397704278953917e-08, + "loss": 0.8823, + "step": 32542 + }, + { + "epoch": 6.9417662116040955, + "grad_norm": 0.06993935455966459, + "learning_rate": 3.3729505143691446e-08, + "loss": 0.9035, + "step": 32543 + }, + { + "epoch": 6.9419795221843, + "grad_norm": 0.06843419804162468, + "learning_rate": 3.348287231752956e-08, + "loss": 0.8667, + "step": 32544 + }, + { + "epoch": 6.942192832764505, + "grad_norm": 0.07025473682415923, + "learning_rate": 3.323714431386016e-08, + "loss": 0.8602, + "step": 32545 + }, + { + "epoch": 6.9424061433447095, + "grad_norm": 0.07192713377292131, + "learning_rate": 3.299232113546325e-08, + "loss": 0.8868, + "step": 32546 + }, + { + "epoch": 6.942619453924914, + "grad_norm": 0.07112609908146862, + "learning_rate": 3.274840278509217e-08, + "loss": 0.9012, + "step": 32547 + }, + { + "epoch": 6.94283276450512, + "grad_norm": 0.06726982756075417, + "learning_rate": 3.250538926552693e-08, + "loss": 0.8643, + "step": 32548 + }, + { + "epoch": 6.9430460750853245, + "grad_norm": 0.07416692171708024, + "learning_rate": 3.226328057950312e-08, + "loss": 0.8838, + "step": 32549 + }, + { + "epoch": 6.943259385665529, + "grad_norm": 0.07005000695930926, + "learning_rate": 3.2022076729765204e-08, + "loss": 0.8689, + "step": 32550 + }, + { + "epoch": 6.943472696245734, + "grad_norm": 0.0691058108672948, + "learning_rate": 3.178177771904878e-08, + "loss": 0.887, + "step": 32551 + }, + { + "epoch": 6.9436860068259385, + "grad_norm": 0.18981517793083597, + "learning_rate": 3.1542383550053904e-08, + "loss": 0.8925, + "step": 32552 + }, + { + "epoch": 6.943899317406143, + "grad_norm": 0.06805438992721449, + "learning_rate": 3.130389422551616e-08, + "loss": 0.8932, + "step": 32553 + }, + { + "epoch": 6.944112627986348, + "grad_norm": 0.06773589692334767, + "learning_rate": 3.106630974810898e-08, + "loss": 0.9097, + "step": 32554 + }, + { + "epoch": 6.944325938566553, + "grad_norm": 0.0698161515531652, + "learning_rate": 3.08296301205413e-08, + "loss": 0.9093, + "step": 32555 + }, + { + "epoch": 6.944539249146757, + "grad_norm": 0.07126931841685244, + "learning_rate": 3.059385534547765e-08, + "loss": 0.8607, + "step": 32556 + }, + { + "epoch": 6.944752559726963, + "grad_norm": 0.06910121952686983, + "learning_rate": 3.035898542558258e-08, + "loss": 0.9054, + "step": 32557 + }, + { + "epoch": 6.9449658703071675, + "grad_norm": 0.06975172536268427, + "learning_rate": 3.012502036352949e-08, + "loss": 0.8966, + "step": 32558 + }, + { + "epoch": 6.945179180887372, + "grad_norm": 0.06976962439726468, + "learning_rate": 2.989196016195628e-08, + "loss": 0.8716, + "step": 32559 + }, + { + "epoch": 6.945392491467577, + "grad_norm": 0.07143827094253799, + "learning_rate": 2.9659804823491955e-08, + "loss": 0.8869, + "step": 32560 + }, + { + "epoch": 6.9456058020477816, + "grad_norm": 0.07105231068862247, + "learning_rate": 2.9428554350783288e-08, + "loss": 0.8836, + "step": 32561 + }, + { + "epoch": 6.945819112627986, + "grad_norm": 0.06756044580865772, + "learning_rate": 2.9198208746432642e-08, + "loss": 0.8726, + "step": 32562 + }, + { + "epoch": 6.946032423208191, + "grad_norm": 0.07106612661341843, + "learning_rate": 2.896876801304238e-08, + "loss": 0.8753, + "step": 32563 + }, + { + "epoch": 6.946245733788396, + "grad_norm": 0.06963223085479146, + "learning_rate": 2.8740232153223746e-08, + "loss": 0.8794, + "step": 32564 + }, + { + "epoch": 6.9464590443686, + "grad_norm": 0.07173203141552245, + "learning_rate": 2.8512601169552455e-08, + "loss": 0.8574, + "step": 32565 + }, + { + "epoch": 6.946672354948806, + "grad_norm": 0.06937670625457307, + "learning_rate": 2.8285875064604228e-08, + "loss": 0.8639, + "step": 32566 + }, + { + "epoch": 6.9468856655290105, + "grad_norm": 0.06923340625570744, + "learning_rate": 2.80600538409459e-08, + "loss": 0.8817, + "step": 32567 + }, + { + "epoch": 6.947098976109215, + "grad_norm": 0.069910566878194, + "learning_rate": 2.7835137501135424e-08, + "loss": 0.856, + "step": 32568 + }, + { + "epoch": 6.94731228668942, + "grad_norm": 0.06889430503437065, + "learning_rate": 2.761112604772187e-08, + "loss": 0.8963, + "step": 32569 + }, + { + "epoch": 6.947525597269625, + "grad_norm": 0.0706357818371109, + "learning_rate": 2.7388019483227667e-08, + "loss": 0.8657, + "step": 32570 + }, + { + "epoch": 6.947738907849829, + "grad_norm": 0.07039891046558308, + "learning_rate": 2.7165817810175243e-08, + "loss": 0.8704, + "step": 32571 + }, + { + "epoch": 6.947952218430034, + "grad_norm": 0.06788378390407258, + "learning_rate": 2.6944521031095907e-08, + "loss": 0.852, + "step": 32572 + }, + { + "epoch": 6.948165529010239, + "grad_norm": 0.07391867350767846, + "learning_rate": 2.6724129148485435e-08, + "loss": 0.8708, + "step": 32573 + }, + { + "epoch": 6.948378839590443, + "grad_norm": 0.07317420476805019, + "learning_rate": 2.650464216483073e-08, + "loss": 0.8937, + "step": 32574 + }, + { + "epoch": 6.948592150170649, + "grad_norm": 0.07043332895738658, + "learning_rate": 2.6286060082627575e-08, + "loss": 0.8766, + "step": 32575 + }, + { + "epoch": 6.948805460750854, + "grad_norm": 0.07341443718165215, + "learning_rate": 2.606838290433622e-08, + "loss": 0.8648, + "step": 32576 + }, + { + "epoch": 6.949018771331058, + "grad_norm": 0.07070725047832309, + "learning_rate": 2.585161063243469e-08, + "loss": 0.8685, + "step": 32577 + }, + { + "epoch": 6.949232081911263, + "grad_norm": 0.07064288832969447, + "learning_rate": 2.5635743269356583e-08, + "loss": 0.8717, + "step": 32578 + }, + { + "epoch": 6.949445392491468, + "grad_norm": 0.07091829485060414, + "learning_rate": 2.5420780817562162e-08, + "loss": 0.8559, + "step": 32579 + }, + { + "epoch": 6.949658703071672, + "grad_norm": 0.07477259290048092, + "learning_rate": 2.520672327947615e-08, + "loss": 0.9014, + "step": 32580 + }, + { + "epoch": 6.949872013651877, + "grad_norm": 0.06903996531478229, + "learning_rate": 2.4993570657523282e-08, + "loss": 0.8904, + "step": 32581 + }, + { + "epoch": 6.950085324232082, + "grad_norm": 0.07085844324534456, + "learning_rate": 2.478132295411051e-08, + "loss": 0.887, + "step": 32582 + }, + { + "epoch": 6.950298634812286, + "grad_norm": 0.0685574875366384, + "learning_rate": 2.4569980171644802e-08, + "loss": 0.8525, + "step": 32583 + }, + { + "epoch": 6.950511945392492, + "grad_norm": 0.07010173951684782, + "learning_rate": 2.435954231250648e-08, + "loss": 0.8915, + "step": 32584 + }, + { + "epoch": 6.950725255972696, + "grad_norm": 0.07186954147312415, + "learning_rate": 2.415000937909362e-08, + "loss": 0.8655, + "step": 32585 + }, + { + "epoch": 6.950938566552901, + "grad_norm": 0.07176022401566577, + "learning_rate": 2.394138137376878e-08, + "loss": 0.9071, + "step": 32586 + }, + { + "epoch": 6.951151877133106, + "grad_norm": 0.06854484458782323, + "learning_rate": 2.373365829888563e-08, + "loss": 0.8638, + "step": 32587 + }, + { + "epoch": 6.951365187713311, + "grad_norm": 0.07153865258652042, + "learning_rate": 2.352684015680673e-08, + "loss": 0.8915, + "step": 32588 + }, + { + "epoch": 6.951578498293515, + "grad_norm": 0.07222140483034503, + "learning_rate": 2.3320926949867985e-08, + "loss": 0.869, + "step": 32589 + }, + { + "epoch": 6.95179180887372, + "grad_norm": 0.07001577397378297, + "learning_rate": 2.311591868039642e-08, + "loss": 0.883, + "step": 32590 + }, + { + "epoch": 6.952005119453925, + "grad_norm": 0.07083937010653085, + "learning_rate": 2.2911815350710188e-08, + "loss": 0.8983, + "step": 32591 + }, + { + "epoch": 6.952218430034129, + "grad_norm": 0.07317685296337738, + "learning_rate": 2.2708616963127427e-08, + "loss": 0.889, + "step": 32592 + }, + { + "epoch": 6.952431740614334, + "grad_norm": 0.06871748622397715, + "learning_rate": 2.2506323519939644e-08, + "loss": 0.8942, + "step": 32593 + }, + { + "epoch": 6.952645051194539, + "grad_norm": 0.0709923724562107, + "learning_rate": 2.2304935023438334e-08, + "loss": 0.864, + "step": 32594 + }, + { + "epoch": 6.952858361774744, + "grad_norm": 0.06929046132746373, + "learning_rate": 2.2104451475897238e-08, + "loss": 0.8673, + "step": 32595 + }, + { + "epoch": 6.953071672354949, + "grad_norm": 0.06919934391616407, + "learning_rate": 2.190487287959897e-08, + "loss": 0.8869, + "step": 32596 + }, + { + "epoch": 6.953284982935154, + "grad_norm": 0.07309915962196453, + "learning_rate": 2.1706199236781744e-08, + "loss": 0.9014, + "step": 32597 + }, + { + "epoch": 6.953498293515358, + "grad_norm": 0.07124131990879552, + "learning_rate": 2.1508430549710415e-08, + "loss": 0.8949, + "step": 32598 + }, + { + "epoch": 6.953711604095563, + "grad_norm": 0.06936589088614852, + "learning_rate": 2.1311566820614304e-08, + "loss": 0.8786, + "step": 32599 + }, + { + "epoch": 6.953924914675768, + "grad_norm": 0.07507812076864655, + "learning_rate": 2.111560805173163e-08, + "loss": 0.8789, + "step": 32600 + }, + { + "epoch": 6.9541382252559725, + "grad_norm": 0.07327673271097777, + "learning_rate": 2.0920554245256186e-08, + "loss": 0.8769, + "step": 32601 + }, + { + "epoch": 6.954351535836177, + "grad_norm": 0.0679998696855272, + "learning_rate": 2.0726405403417305e-08, + "loss": 0.8547, + "step": 32602 + }, + { + "epoch": 6.954564846416382, + "grad_norm": 0.07277940848130111, + "learning_rate": 2.0533161528408786e-08, + "loss": 0.8743, + "step": 32603 + }, + { + "epoch": 6.954778156996587, + "grad_norm": 0.06927877133428462, + "learning_rate": 2.0340822622406665e-08, + "loss": 0.8931, + "step": 32604 + }, + { + "epoch": 6.954991467576792, + "grad_norm": 0.06961963660344776, + "learning_rate": 2.014938868759586e-08, + "loss": 0.879, + "step": 32605 + }, + { + "epoch": 6.955204778156997, + "grad_norm": 0.07166409960080228, + "learning_rate": 1.995885972614353e-08, + "loss": 0.8585, + "step": 32606 + }, + { + "epoch": 6.9554180887372015, + "grad_norm": 0.07084029674377595, + "learning_rate": 1.9769235740199066e-08, + "loss": 0.8983, + "step": 32607 + }, + { + "epoch": 6.955631399317406, + "grad_norm": 0.07004350055043787, + "learning_rate": 1.9580516731902978e-08, + "loss": 0.8819, + "step": 32608 + }, + { + "epoch": 6.955844709897611, + "grad_norm": 0.06915219604775227, + "learning_rate": 1.9392702703404652e-08, + "loss": 0.8571, + "step": 32609 + }, + { + "epoch": 6.9560580204778155, + "grad_norm": 0.06901830690959533, + "learning_rate": 1.9205793656826842e-08, + "loss": 0.8829, + "step": 32610 + }, + { + "epoch": 6.95627133105802, + "grad_norm": 0.06711019962850293, + "learning_rate": 1.901978959428341e-08, + "loss": 0.8711, + "step": 32611 + }, + { + "epoch": 6.956484641638225, + "grad_norm": 0.0712987425674497, + "learning_rate": 1.8834690517870458e-08, + "loss": 0.9182, + "step": 32612 + }, + { + "epoch": 6.9566979522184305, + "grad_norm": 0.07142943423177052, + "learning_rate": 1.8650496429684085e-08, + "loss": 0.872, + "step": 32613 + }, + { + "epoch": 6.956911262798635, + "grad_norm": 0.06867266050460889, + "learning_rate": 1.846720733182039e-08, + "loss": 0.9094, + "step": 32614 + }, + { + "epoch": 6.95712457337884, + "grad_norm": 0.0703928866609993, + "learning_rate": 1.8284823226339953e-08, + "loss": 0.894, + "step": 32615 + }, + { + "epoch": 6.9573378839590445, + "grad_norm": 0.07013877805688817, + "learning_rate": 1.8103344115321108e-08, + "loss": 0.8739, + "step": 32616 + }, + { + "epoch": 6.957551194539249, + "grad_norm": 0.07241567572584029, + "learning_rate": 1.7922770000797783e-08, + "loss": 0.8982, + "step": 32617 + }, + { + "epoch": 6.957764505119454, + "grad_norm": 0.07088564398708365, + "learning_rate": 1.7743100884830556e-08, + "loss": 0.8793, + "step": 32618 + }, + { + "epoch": 6.957977815699659, + "grad_norm": 0.07027276360910069, + "learning_rate": 1.7564336769444467e-08, + "loss": 0.8994, + "step": 32619 + }, + { + "epoch": 6.958191126279863, + "grad_norm": 0.0714328202198048, + "learning_rate": 1.7386477656655686e-08, + "loss": 0.8656, + "step": 32620 + }, + { + "epoch": 6.958404436860068, + "grad_norm": 0.0690234730421904, + "learning_rate": 1.720952354848926e-08, + "loss": 0.8701, + "step": 32621 + }, + { + "epoch": 6.9586177474402735, + "grad_norm": 0.07331981786901388, + "learning_rate": 1.703347444694359e-08, + "loss": 0.8964, + "step": 32622 + }, + { + "epoch": 6.958831058020478, + "grad_norm": 0.07130928876899098, + "learning_rate": 1.685833035400819e-08, + "loss": 0.904, + "step": 32623 + }, + { + "epoch": 6.959044368600683, + "grad_norm": 0.07148401027342255, + "learning_rate": 1.6684091271672588e-08, + "loss": 0.8461, + "step": 32624 + }, + { + "epoch": 6.959257679180888, + "grad_norm": 0.06892818599725753, + "learning_rate": 1.651075720189077e-08, + "loss": 0.8802, + "step": 32625 + }, + { + "epoch": 6.959470989761092, + "grad_norm": 0.06969879902462207, + "learning_rate": 1.633832814664338e-08, + "loss": 0.8606, + "step": 32626 + }, + { + "epoch": 6.959684300341297, + "grad_norm": 0.06879741368206595, + "learning_rate": 1.616680410786664e-08, + "loss": 0.8679, + "step": 32627 + }, + { + "epoch": 6.959897610921502, + "grad_norm": 0.07551805434956729, + "learning_rate": 1.599618508751455e-08, + "loss": 0.8881, + "step": 32628 + }, + { + "epoch": 6.960110921501706, + "grad_norm": 0.07163828189285952, + "learning_rate": 1.5826471087505568e-08, + "loss": 0.8898, + "step": 32629 + }, + { + "epoch": 6.960324232081911, + "grad_norm": 0.06890502489073902, + "learning_rate": 1.5657662109767048e-08, + "loss": 0.8903, + "step": 32630 + }, + { + "epoch": 6.960537542662116, + "grad_norm": 0.07159435885936223, + "learning_rate": 1.5489758156208566e-08, + "loss": 0.8669, + "step": 32631 + }, + { + "epoch": 6.96075085324232, + "grad_norm": 0.07503617108375912, + "learning_rate": 1.5322759228730832e-08, + "loss": 0.8835, + "step": 32632 + }, + { + "epoch": 6.960964163822526, + "grad_norm": 0.07183435192737077, + "learning_rate": 1.515666532921678e-08, + "loss": 0.8703, + "step": 32633 + }, + { + "epoch": 6.961177474402731, + "grad_norm": 0.0736391735965013, + "learning_rate": 1.499147645955823e-08, + "loss": 0.8847, + "step": 32634 + }, + { + "epoch": 6.961390784982935, + "grad_norm": 0.07380736289575861, + "learning_rate": 1.4827192621611474e-08, + "loss": 0.8885, + "step": 32635 + }, + { + "epoch": 6.96160409556314, + "grad_norm": 0.06780819191649622, + "learning_rate": 1.466381381724169e-08, + "loss": 0.8655, + "step": 32636 + }, + { + "epoch": 6.961817406143345, + "grad_norm": 0.07052786579110644, + "learning_rate": 1.4501340048287404e-08, + "loss": 0.8631, + "step": 32637 + }, + { + "epoch": 6.962030716723549, + "grad_norm": 0.07038455133921982, + "learning_rate": 1.4339771316604911e-08, + "loss": 0.8884, + "step": 32638 + }, + { + "epoch": 6.962244027303754, + "grad_norm": 0.07344842458052282, + "learning_rate": 1.4179107624014976e-08, + "loss": 0.9111, + "step": 32639 + }, + { + "epoch": 6.962457337883959, + "grad_norm": 0.06809444496842833, + "learning_rate": 1.4019348972329483e-08, + "loss": 0.8719, + "step": 32640 + }, + { + "epoch": 6.962670648464163, + "grad_norm": 0.06989865418970236, + "learning_rate": 1.3860495363360316e-08, + "loss": 0.8584, + "step": 32641 + }, + { + "epoch": 6.962883959044369, + "grad_norm": 0.07169914611165448, + "learning_rate": 1.3702546798901595e-08, + "loss": 0.8741, + "step": 32642 + }, + { + "epoch": 6.963097269624574, + "grad_norm": 0.06784872750022769, + "learning_rate": 1.354550328074744e-08, + "loss": 0.8934, + "step": 32643 + }, + { + "epoch": 6.963310580204778, + "grad_norm": 0.07252162973433425, + "learning_rate": 1.338936481067421e-08, + "loss": 0.8964, + "step": 32644 + }, + { + "epoch": 6.963523890784983, + "grad_norm": 0.07023903391400572, + "learning_rate": 1.3234131390449378e-08, + "loss": 0.8701, + "step": 32645 + }, + { + "epoch": 6.963737201365188, + "grad_norm": 0.06887284796687002, + "learning_rate": 1.3079803021813776e-08, + "loss": 0.8615, + "step": 32646 + }, + { + "epoch": 6.963950511945392, + "grad_norm": 0.07324932676580773, + "learning_rate": 1.2926379706534875e-08, + "loss": 0.8671, + "step": 32647 + }, + { + "epoch": 6.964163822525597, + "grad_norm": 0.07008984719407275, + "learning_rate": 1.2773861446335744e-08, + "loss": 0.8897, + "step": 32648 + }, + { + "epoch": 6.964377133105802, + "grad_norm": 0.07131923020341445, + "learning_rate": 1.2622248242948331e-08, + "loss": 0.8754, + "step": 32649 + }, + { + "epoch": 6.964590443686006, + "grad_norm": 0.07048318065573513, + "learning_rate": 1.2471540098077938e-08, + "loss": 0.858, + "step": 32650 + }, + { + "epoch": 6.964803754266212, + "grad_norm": 0.07221839307396588, + "learning_rate": 1.2321737013447632e-08, + "loss": 0.8906, + "step": 32651 + }, + { + "epoch": 6.965017064846417, + "grad_norm": 0.07692123848305213, + "learning_rate": 1.217283899073607e-08, + "loss": 0.8789, + "step": 32652 + }, + { + "epoch": 6.965230375426621, + "grad_norm": 0.07456407203974938, + "learning_rate": 1.2024846031639669e-08, + "loss": 0.865, + "step": 32653 + }, + { + "epoch": 6.965443686006826, + "grad_norm": 0.0679782745107499, + "learning_rate": 1.1877758137819329e-08, + "loss": 0.8715, + "step": 32654 + }, + { + "epoch": 6.965656996587031, + "grad_norm": 0.08059348028377057, + "learning_rate": 1.1731575310953702e-08, + "loss": 0.8959, + "step": 32655 + }, + { + "epoch": 6.965870307167235, + "grad_norm": 0.06762152561253987, + "learning_rate": 1.1586297552685922e-08, + "loss": 0.8622, + "step": 32656 + }, + { + "epoch": 6.96608361774744, + "grad_norm": 0.07360409606604826, + "learning_rate": 1.1441924864667998e-08, + "loss": 0.9, + "step": 32657 + }, + { + "epoch": 6.966296928327645, + "grad_norm": 0.0735773595728786, + "learning_rate": 1.1298457248534179e-08, + "loss": 0.8747, + "step": 32658 + }, + { + "epoch": 6.9665102389078495, + "grad_norm": 0.07088692135494602, + "learning_rate": 1.115589470590095e-08, + "loss": 0.8914, + "step": 32659 + }, + { + "epoch": 6.966723549488055, + "grad_norm": 0.06996569704921442, + "learning_rate": 1.1014237238384795e-08, + "loss": 0.8842, + "step": 32660 + }, + { + "epoch": 6.96693686006826, + "grad_norm": 0.06820129503531369, + "learning_rate": 1.0873484847584436e-08, + "loss": 0.8631, + "step": 32661 + }, + { + "epoch": 6.967150170648464, + "grad_norm": 0.0748681700929121, + "learning_rate": 1.0733637535098596e-08, + "loss": 0.9025, + "step": 32662 + }, + { + "epoch": 6.967363481228669, + "grad_norm": 0.07230028683996045, + "learning_rate": 1.0594695302499346e-08, + "loss": 0.8882, + "step": 32663 + }, + { + "epoch": 6.967576791808874, + "grad_norm": 0.06842037971475458, + "learning_rate": 1.0456658151376531e-08, + "loss": 0.8545, + "step": 32664 + }, + { + "epoch": 6.9677901023890785, + "grad_norm": 0.07056657276358476, + "learning_rate": 1.0319526083284458e-08, + "loss": 0.8815, + "step": 32665 + }, + { + "epoch": 6.968003412969283, + "grad_norm": 0.07134548131864772, + "learning_rate": 1.018329909975968e-08, + "loss": 0.8765, + "step": 32666 + }, + { + "epoch": 6.968216723549488, + "grad_norm": 0.06891202900431, + "learning_rate": 1.0047977202365389e-08, + "loss": 0.8627, + "step": 32667 + }, + { + "epoch": 6.9684300341296925, + "grad_norm": 0.07075284279576918, + "learning_rate": 9.91356039262037e-09, + "loss": 0.8784, + "step": 32668 + }, + { + "epoch": 6.968643344709898, + "grad_norm": 0.06916300489461383, + "learning_rate": 9.78004867205229e-09, + "loss": 0.8537, + "step": 32669 + }, + { + "epoch": 6.968856655290102, + "grad_norm": 0.07325534461915671, + "learning_rate": 9.647442042171051e-09, + "loss": 0.8781, + "step": 32670 + }, + { + "epoch": 6.9690699658703075, + "grad_norm": 0.06941682535556018, + "learning_rate": 9.515740504468796e-09, + "loss": 0.8738, + "step": 32671 + }, + { + "epoch": 6.969283276450512, + "grad_norm": 0.06740023884970943, + "learning_rate": 9.384944060437662e-09, + "loss": 0.8584, + "step": 32672 + }, + { + "epoch": 6.969496587030717, + "grad_norm": 0.0679530722993636, + "learning_rate": 9.25505271156979e-09, + "loss": 0.8887, + "step": 32673 + }, + { + "epoch": 6.9697098976109215, + "grad_norm": 0.06920855149876107, + "learning_rate": 9.126066459321792e-09, + "loss": 0.8955, + "step": 32674 + }, + { + "epoch": 6.969923208191126, + "grad_norm": 0.0690112925392459, + "learning_rate": 8.997985305159163e-09, + "loss": 0.8529, + "step": 32675 + }, + { + "epoch": 6.970136518771331, + "grad_norm": 0.06836633423291177, + "learning_rate": 8.870809250529632e-09, + "loss": 0.8892, + "step": 32676 + }, + { + "epoch": 6.970349829351536, + "grad_norm": 0.06824556722386847, + "learning_rate": 8.744538296872051e-09, + "loss": 0.8833, + "step": 32677 + }, + { + "epoch": 6.97056313993174, + "grad_norm": 0.0692818820248151, + "learning_rate": 8.619172445616387e-09, + "loss": 0.8787, + "step": 32678 + }, + { + "epoch": 6.970776450511945, + "grad_norm": 0.07011042572038383, + "learning_rate": 8.494711698174841e-09, + "loss": 0.868, + "step": 32679 + }, + { + "epoch": 6.9709897610921505, + "grad_norm": 0.06994448938984985, + "learning_rate": 8.371156055968499e-09, + "loss": 0.8776, + "step": 32680 + }, + { + "epoch": 6.971203071672355, + "grad_norm": 0.06860740868066319, + "learning_rate": 8.248505520382921e-09, + "loss": 0.8663, + "step": 32681 + }, + { + "epoch": 6.97141638225256, + "grad_norm": 0.07289898249591245, + "learning_rate": 8.126760092812547e-09, + "loss": 0.8756, + "step": 32682 + }, + { + "epoch": 6.971629692832765, + "grad_norm": 0.07026124826681329, + "learning_rate": 8.005919774634052e-09, + "loss": 0.8672, + "step": 32683 + }, + { + "epoch": 6.971843003412969, + "grad_norm": 0.07198696616595571, + "learning_rate": 7.885984567215233e-09, + "loss": 0.9024, + "step": 32684 + }, + { + "epoch": 6.972056313993174, + "grad_norm": 0.06942333351082176, + "learning_rate": 7.766954471915001e-09, + "loss": 0.8886, + "step": 32685 + }, + { + "epoch": 6.972269624573379, + "grad_norm": 0.06879862698518198, + "learning_rate": 7.648829490074506e-09, + "loss": 0.881, + "step": 32686 + }, + { + "epoch": 6.972482935153583, + "grad_norm": 0.06946643491132182, + "learning_rate": 7.531609623026015e-09, + "loss": 0.8924, + "step": 32687 + }, + { + "epoch": 6.972696245733788, + "grad_norm": 0.07129194041134501, + "learning_rate": 7.415294872110679e-09, + "loss": 0.8806, + "step": 32688 + }, + { + "epoch": 6.972909556313994, + "grad_norm": 0.06865499869374574, + "learning_rate": 7.299885238634119e-09, + "loss": 0.9034, + "step": 32689 + }, + { + "epoch": 6.973122866894198, + "grad_norm": 0.07230175247517817, + "learning_rate": 7.185380723910839e-09, + "loss": 0.8812, + "step": 32690 + }, + { + "epoch": 6.973336177474403, + "grad_norm": 0.07052221299938866, + "learning_rate": 7.071781329219818e-09, + "loss": 0.8644, + "step": 32691 + }, + { + "epoch": 6.973549488054608, + "grad_norm": 0.06814068044981779, + "learning_rate": 6.959087055866676e-09, + "loss": 0.8739, + "step": 32692 + }, + { + "epoch": 6.973762798634812, + "grad_norm": 0.06926218839309613, + "learning_rate": 6.847297905112626e-09, + "loss": 0.8766, + "step": 32693 + }, + { + "epoch": 6.973976109215017, + "grad_norm": 0.07197124495856701, + "learning_rate": 6.736413878227765e-09, + "loss": 0.8655, + "step": 32694 + }, + { + "epoch": 6.974189419795222, + "grad_norm": 0.07043573772988433, + "learning_rate": 6.626434976464424e-09, + "loss": 0.8545, + "step": 32695 + }, + { + "epoch": 6.974402730375426, + "grad_norm": 0.0703235023765142, + "learning_rate": 6.517361201066053e-09, + "loss": 0.8596, + "step": 32696 + }, + { + "epoch": 6.974616040955631, + "grad_norm": 0.06984360435436507, + "learning_rate": 6.4091925532761e-09, + "loss": 0.8813, + "step": 32697 + }, + { + "epoch": 6.974829351535837, + "grad_norm": 0.07151491036979381, + "learning_rate": 6.30192903430249e-09, + "loss": 0.8729, + "step": 32698 + }, + { + "epoch": 6.975042662116041, + "grad_norm": 0.06999993776853351, + "learning_rate": 6.195570645370907e-09, + "loss": 0.8782, + "step": 32699 + }, + { + "epoch": 6.975255972696246, + "grad_norm": 0.07226416684075553, + "learning_rate": 6.090117387689276e-09, + "loss": 0.8595, + "step": 32700 + }, + { + "epoch": 6.975469283276451, + "grad_norm": 0.07165015077581302, + "learning_rate": 5.985569262438873e-09, + "loss": 0.8959, + "step": 32701 + }, + { + "epoch": 6.975682593856655, + "grad_norm": 0.0720938864703384, + "learning_rate": 5.8819262708098565e-09, + "loss": 0.8782, + "step": 32702 + }, + { + "epoch": 6.97589590443686, + "grad_norm": 0.07270357164366252, + "learning_rate": 5.779188413965742e-09, + "loss": 0.8896, + "step": 32703 + }, + { + "epoch": 6.976109215017065, + "grad_norm": 0.06744190132113016, + "learning_rate": 5.677355693078923e-09, + "loss": 0.8967, + "step": 32704 + }, + { + "epoch": 6.976322525597269, + "grad_norm": 0.07129614296256678, + "learning_rate": 5.576428109304033e-09, + "loss": 0.8913, + "step": 32705 + }, + { + "epoch": 6.976535836177474, + "grad_norm": 0.07045471658621177, + "learning_rate": 5.476405663769057e-09, + "loss": 0.8317, + "step": 32706 + }, + { + "epoch": 6.97674914675768, + "grad_norm": 0.07218320521670778, + "learning_rate": 5.377288357619748e-09, + "loss": 0.8784, + "step": 32707 + }, + { + "epoch": 6.976962457337884, + "grad_norm": 0.06846390599840518, + "learning_rate": 5.279076191966326e-09, + "loss": 0.8858, + "step": 32708 + }, + { + "epoch": 6.977175767918089, + "grad_norm": 0.07112387850133785, + "learning_rate": 5.181769167927897e-09, + "loss": 0.8783, + "step": 32709 + }, + { + "epoch": 6.977389078498294, + "grad_norm": 0.07019437862741906, + "learning_rate": 5.085367286605802e-09, + "loss": 0.8875, + "step": 32710 + }, + { + "epoch": 6.977602389078498, + "grad_norm": 0.06880466307635821, + "learning_rate": 4.98987054908362e-09, + "loss": 0.8588, + "step": 32711 + }, + { + "epoch": 6.977815699658703, + "grad_norm": 0.0720066795672757, + "learning_rate": 4.8952789564538085e-09, + "loss": 0.878, + "step": 32712 + }, + { + "epoch": 6.978029010238908, + "grad_norm": 0.0751899695833281, + "learning_rate": 4.801592509773301e-09, + "loss": 0.9016, + "step": 32713 + }, + { + "epoch": 6.978242320819112, + "grad_norm": 0.07090219939802683, + "learning_rate": 4.708811210107911e-09, + "loss": 0.8477, + "step": 32714 + }, + { + "epoch": 6.978455631399317, + "grad_norm": 0.07068444286759781, + "learning_rate": 4.616935058514571e-09, + "loss": 0.9032, + "step": 32715 + }, + { + "epoch": 6.978668941979522, + "grad_norm": 0.07260915581160181, + "learning_rate": 4.5259640560235685e-09, + "loss": 0.8566, + "step": 32716 + }, + { + "epoch": 6.9788822525597265, + "grad_norm": 0.07616130172526513, + "learning_rate": 4.43589820366519e-09, + "loss": 0.9143, + "step": 32717 + }, + { + "epoch": 6.979095563139932, + "grad_norm": 0.07103679562163644, + "learning_rate": 4.346737502460841e-09, + "loss": 0.8925, + "step": 32718 + }, + { + "epoch": 6.979308873720137, + "grad_norm": 0.07202876746906588, + "learning_rate": 4.258481953414162e-09, + "loss": 0.9054, + "step": 32719 + }, + { + "epoch": 6.979522184300341, + "grad_norm": 0.07097906943821983, + "learning_rate": 4.171131557528796e-09, + "loss": 0.8698, + "step": 32720 + }, + { + "epoch": 6.979735494880546, + "grad_norm": 0.06921779670576624, + "learning_rate": 4.084686315799502e-09, + "loss": 0.8632, + "step": 32721 + }, + { + "epoch": 6.979948805460751, + "grad_norm": 0.06715488086829301, + "learning_rate": 3.999146229194395e-09, + "loss": 0.8491, + "step": 32722 + }, + { + "epoch": 6.9801621160409555, + "grad_norm": 0.07332500413243842, + "learning_rate": 3.9145112986815895e-09, + "loss": 0.892, + "step": 32723 + }, + { + "epoch": 6.98037542662116, + "grad_norm": 0.06904244591961296, + "learning_rate": 3.830781525229199e-09, + "loss": 0.8975, + "step": 32724 + }, + { + "epoch": 6.980588737201365, + "grad_norm": 0.07195230250374449, + "learning_rate": 3.747956909769812e-09, + "loss": 0.8894, + "step": 32725 + }, + { + "epoch": 6.9808020477815695, + "grad_norm": 0.0687349095645044, + "learning_rate": 3.666037453253779e-09, + "loss": 0.8791, + "step": 32726 + }, + { + "epoch": 6.981015358361775, + "grad_norm": 0.07010703278027464, + "learning_rate": 3.5850231565959236e-09, + "loss": 0.8717, + "step": 32727 + }, + { + "epoch": 6.98122866894198, + "grad_norm": 0.06936203039185775, + "learning_rate": 3.5049140207199517e-09, + "loss": 0.8953, + "step": 32728 + }, + { + "epoch": 6.9814419795221845, + "grad_norm": 0.06920066386633875, + "learning_rate": 3.4257100465318047e-09, + "loss": 0.872, + "step": 32729 + }, + { + "epoch": 6.981655290102389, + "grad_norm": 0.06989545736403492, + "learning_rate": 3.3474112349285435e-09, + "loss": 0.8762, + "step": 32730 + }, + { + "epoch": 6.981868600682594, + "grad_norm": 0.07095604405666277, + "learning_rate": 3.2700175867983462e-09, + "loss": 0.9218, + "step": 32731 + }, + { + "epoch": 6.9820819112627985, + "grad_norm": 0.06987956207933343, + "learning_rate": 3.1935291030116277e-09, + "loss": 0.8966, + "step": 32732 + }, + { + "epoch": 6.982295221843003, + "grad_norm": 0.07207343316199846, + "learning_rate": 3.117945784429921e-09, + "loss": 0.8637, + "step": 32733 + }, + { + "epoch": 6.982508532423208, + "grad_norm": 0.07140138035234897, + "learning_rate": 3.0432676319236407e-09, + "loss": 0.8679, + "step": 32734 + }, + { + "epoch": 6.982721843003413, + "grad_norm": 0.06655631120415974, + "learning_rate": 2.9694946463187937e-09, + "loss": 0.8681, + "step": 32735 + }, + { + "epoch": 6.982935153583618, + "grad_norm": 0.07014670580999847, + "learning_rate": 2.89662682846803e-09, + "loss": 0.8585, + "step": 32736 + }, + { + "epoch": 6.983148464163823, + "grad_norm": 0.07233057777746624, + "learning_rate": 2.8246641791795924e-09, + "loss": 0.895, + "step": 32737 + }, + { + "epoch": 6.9833617747440275, + "grad_norm": 0.06809103164916357, + "learning_rate": 2.7536066992794874e-09, + "loss": 0.8675, + "step": 32738 + }, + { + "epoch": 6.983575085324232, + "grad_norm": 0.06778257924491927, + "learning_rate": 2.683454389567075e-09, + "loss": 0.8976, + "step": 32739 + }, + { + "epoch": 6.983788395904437, + "grad_norm": 0.0737894196169385, + "learning_rate": 2.614207250841716e-09, + "loss": 0.9191, + "step": 32740 + }, + { + "epoch": 6.984001706484642, + "grad_norm": 0.07240641752458535, + "learning_rate": 2.5458652838761254e-09, + "loss": 0.8885, + "step": 32741 + }, + { + "epoch": 6.984215017064846, + "grad_norm": 0.07275398725107114, + "learning_rate": 2.478428489451901e-09, + "loss": 0.8968, + "step": 32742 + }, + { + "epoch": 6.984428327645051, + "grad_norm": 0.06982836182791169, + "learning_rate": 2.4118968683328748e-09, + "loss": 0.8576, + "step": 32743 + }, + { + "epoch": 6.984641638225256, + "grad_norm": 0.06735743597689517, + "learning_rate": 2.3462704212651175e-09, + "loss": 0.8651, + "step": 32744 + }, + { + "epoch": 6.984854948805461, + "grad_norm": 0.06794071405319999, + "learning_rate": 2.2815491489946997e-09, + "loss": 0.8672, + "step": 32745 + }, + { + "epoch": 6.985068259385666, + "grad_norm": 0.07393959306778317, + "learning_rate": 2.2177330522588083e-09, + "loss": 0.9138, + "step": 32746 + }, + { + "epoch": 6.985281569965871, + "grad_norm": 0.07191291097989048, + "learning_rate": 2.1548221317679865e-09, + "loss": 0.8997, + "step": 32747 + }, + { + "epoch": 6.985494880546075, + "grad_norm": 0.070152462365933, + "learning_rate": 2.0928163882416585e-09, + "loss": 0.878, + "step": 32748 + }, + { + "epoch": 6.98570819112628, + "grad_norm": 0.0740356627769732, + "learning_rate": 2.0317158223903676e-09, + "loss": 0.8808, + "step": 32749 + }, + { + "epoch": 6.985921501706485, + "grad_norm": 0.07071418568391862, + "learning_rate": 1.971520434889129e-09, + "loss": 0.8663, + "step": 32750 + }, + { + "epoch": 6.986134812286689, + "grad_norm": 0.06959066612263616, + "learning_rate": 1.9122302264218405e-09, + "loss": 0.8591, + "step": 32751 + }, + { + "epoch": 6.986348122866894, + "grad_norm": 0.07284003756536984, + "learning_rate": 1.8538451976635174e-09, + "loss": 0.8948, + "step": 32752 + }, + { + "epoch": 6.986561433447099, + "grad_norm": 0.0710654572228715, + "learning_rate": 1.7963653492802936e-09, + "loss": 0.9045, + "step": 32753 + }, + { + "epoch": 6.986774744027304, + "grad_norm": 0.0672596702974674, + "learning_rate": 1.7397906819116572e-09, + "loss": 0.8921, + "step": 32754 + }, + { + "epoch": 6.986988054607508, + "grad_norm": 0.06998789088300174, + "learning_rate": 1.6841211962059789e-09, + "loss": 0.8989, + "step": 32755 + }, + { + "epoch": 6.987201365187714, + "grad_norm": 0.06882503077294873, + "learning_rate": 1.6293568927938652e-09, + "loss": 0.8973, + "step": 32756 + }, + { + "epoch": 6.987414675767918, + "grad_norm": 0.06688004710248559, + "learning_rate": 1.5754977722881592e-09, + "loss": 0.8903, + "step": 32757 + }, + { + "epoch": 6.987627986348123, + "grad_norm": 0.0688140958545179, + "learning_rate": 1.5225438352928224e-09, + "loss": 0.8827, + "step": 32758 + }, + { + "epoch": 6.987841296928328, + "grad_norm": 0.0709974824018783, + "learning_rate": 1.4704950824295794e-09, + "loss": 0.8773, + "step": 32759 + }, + { + "epoch": 6.988054607508532, + "grad_norm": 0.06971474327804214, + "learning_rate": 1.4193515142579827e-09, + "loss": 0.8847, + "step": 32760 + }, + { + "epoch": 6.988267918088737, + "grad_norm": 0.07371537274784952, + "learning_rate": 1.369113131381994e-09, + "loss": 0.8663, + "step": 32761 + }, + { + "epoch": 6.988481228668942, + "grad_norm": 0.07000485424278861, + "learning_rate": 1.319779934361165e-09, + "loss": 0.8674, + "step": 32762 + }, + { + "epoch": 6.988694539249146, + "grad_norm": 0.06879854243601621, + "learning_rate": 1.2713519237461668e-09, + "loss": 0.8536, + "step": 32763 + }, + { + "epoch": 6.988907849829351, + "grad_norm": 0.0698529498842379, + "learning_rate": 1.2238291000965519e-09, + "loss": 0.8746, + "step": 32764 + }, + { + "epoch": 6.989121160409557, + "grad_norm": 0.07098198779111035, + "learning_rate": 1.177211463945227e-09, + "loss": 0.9066, + "step": 32765 + }, + { + "epoch": 6.989334470989761, + "grad_norm": 0.07093752183443562, + "learning_rate": 1.1314990158162175e-09, + "loss": 0.8918, + "step": 32766 + }, + { + "epoch": 6.989547781569966, + "grad_norm": 0.0715137845832301, + "learning_rate": 1.0866917562335488e-09, + "loss": 0.8821, + "step": 32767 + }, + { + "epoch": 6.989761092150171, + "grad_norm": 0.07022038769401527, + "learning_rate": 1.0427896856946007e-09, + "loss": 0.8768, + "step": 32768 + }, + { + "epoch": 6.989974402730375, + "grad_norm": 0.07003307316183562, + "learning_rate": 9.997928047056349e-10, + "loss": 0.8846, + "step": 32769 + }, + { + "epoch": 6.99018771331058, + "grad_norm": 0.0688993473537093, + "learning_rate": 9.577011137551494e-10, + "loss": 0.8792, + "step": 32770 + }, + { + "epoch": 6.990401023890785, + "grad_norm": 0.06910313988048047, + "learning_rate": 9.165146133049974e-10, + "loss": 0.8509, + "step": 32771 + }, + { + "epoch": 6.9906143344709895, + "grad_norm": 0.07162663902591321, + "learning_rate": 8.762333038347947e-10, + "loss": 0.8897, + "step": 32772 + }, + { + "epoch": 6.990827645051194, + "grad_norm": 0.06790433094735629, + "learning_rate": 8.368571857975128e-10, + "loss": 0.865, + "step": 32773 + }, + { + "epoch": 6.9910409556314, + "grad_norm": 0.07058252311559035, + "learning_rate": 7.983862596283587e-10, + "loss": 0.8818, + "step": 32774 + }, + { + "epoch": 6.991254266211604, + "grad_norm": 0.07115203143820541, + "learning_rate": 7.608205257803036e-10, + "loss": 0.9039, + "step": 32775 + }, + { + "epoch": 6.991467576791809, + "grad_norm": 0.07516857274988865, + "learning_rate": 7.241599846619096e-10, + "loss": 0.9022, + "step": 32776 + }, + { + "epoch": 6.991680887372014, + "grad_norm": 0.06938820207782755, + "learning_rate": 6.884046366995023e-10, + "loss": 0.8784, + "step": 32777 + }, + { + "epoch": 6.9918941979522184, + "grad_norm": 0.06797505961983272, + "learning_rate": 6.535544822927619e-10, + "loss": 0.8577, + "step": 32778 + }, + { + "epoch": 6.992107508532423, + "grad_norm": 0.07159352118038854, + "learning_rate": 6.196095218413689e-10, + "loss": 0.897, + "step": 32779 + }, + { + "epoch": 6.992320819112628, + "grad_norm": 0.07098336893899157, + "learning_rate": 5.865697557183581e-10, + "loss": 0.8764, + "step": 32780 + }, + { + "epoch": 6.9925341296928325, + "grad_norm": 0.07431074248445611, + "learning_rate": 5.544351843056461e-10, + "loss": 0.8624, + "step": 32781 + }, + { + "epoch": 6.992747440273037, + "grad_norm": 0.07156956607889259, + "learning_rate": 5.232058079673863e-10, + "loss": 0.8641, + "step": 32782 + }, + { + "epoch": 6.992960750853243, + "grad_norm": 0.06855364239403307, + "learning_rate": 4.928816270499681e-10, + "loss": 0.8863, + "step": 32783 + }, + { + "epoch": 6.993174061433447, + "grad_norm": 0.07065970654685765, + "learning_rate": 4.634626418997812e-10, + "loss": 0.9018, + "step": 32784 + }, + { + "epoch": 6.993387372013652, + "grad_norm": 0.07247920084973859, + "learning_rate": 4.349488528543333e-10, + "loss": 0.8716, + "step": 32785 + }, + { + "epoch": 6.993600682593857, + "grad_norm": 0.07179848945138864, + "learning_rate": 4.073402602333687e-10, + "loss": 0.9359, + "step": 32786 + }, + { + "epoch": 6.9938139931740615, + "grad_norm": 0.0711174730939516, + "learning_rate": 3.806368643477498e-10, + "loss": 0.8822, + "step": 32787 + }, + { + "epoch": 6.994027303754266, + "grad_norm": 0.06838928765456628, + "learning_rate": 3.548386655083391e-10, + "loss": 0.8505, + "step": 32788 + }, + { + "epoch": 6.994240614334471, + "grad_norm": 0.07285149022127031, + "learning_rate": 3.299456639904719e-10, + "loss": 0.8925, + "step": 32789 + }, + { + "epoch": 6.9944539249146755, + "grad_norm": 0.07167293908207395, + "learning_rate": 3.0595786008724706e-10, + "loss": 0.8757, + "step": 32790 + }, + { + "epoch": 6.99466723549488, + "grad_norm": 0.07228408890022757, + "learning_rate": 2.8287525406511806e-10, + "loss": 0.8984, + "step": 32791 + }, + { + "epoch": 6.994880546075086, + "grad_norm": 0.07372118341700021, + "learning_rate": 2.6069784619942027e-10, + "loss": 0.8792, + "step": 32792 + }, + { + "epoch": 6.9950938566552905, + "grad_norm": 0.07002846911562119, + "learning_rate": 2.394256367210801e-10, + "loss": 0.8889, + "step": 32793 + }, + { + "epoch": 6.995307167235495, + "grad_norm": 0.07105490121742274, + "learning_rate": 2.1905862587878745e-10, + "loss": 0.9017, + "step": 32794 + }, + { + "epoch": 6.9955204778157, + "grad_norm": 0.07188786691777937, + "learning_rate": 1.9959681390346874e-10, + "loss": 0.8831, + "step": 32795 + }, + { + "epoch": 6.9957337883959045, + "grad_norm": 0.07056155320023831, + "learning_rate": 1.8104020101716858e-10, + "loss": 0.8937, + "step": 32796 + }, + { + "epoch": 6.995947098976109, + "grad_norm": 0.07540473191164974, + "learning_rate": 1.6338878742416798e-10, + "loss": 0.9282, + "step": 32797 + }, + { + "epoch": 6.996160409556314, + "grad_norm": 0.06903689458757335, + "learning_rate": 1.46642573328748e-10, + "loss": 0.8812, + "step": 32798 + }, + { + "epoch": 6.996373720136519, + "grad_norm": 0.06757723101396915, + "learning_rate": 1.3080155892630785e-10, + "loss": 0.8966, + "step": 32799 + }, + { + "epoch": 6.996587030716723, + "grad_norm": 0.07068512866837412, + "learning_rate": 1.1586574437671971e-10, + "loss": 0.888, + "step": 32800 + }, + { + "epoch": 6.996800341296928, + "grad_norm": 0.07584512248832304, + "learning_rate": 1.018351298753828e-10, + "loss": 0.8942, + "step": 32801 + }, + { + "epoch": 6.997013651877133, + "grad_norm": 0.0688407024662884, + "learning_rate": 8.870971556440567e-11, + "loss": 0.9015, + "step": 32802 + }, + { + "epoch": 6.997226962457338, + "grad_norm": 0.06697414842925895, + "learning_rate": 7.648950158589685e-11, + "loss": 0.8908, + "step": 32803 + }, + { + "epoch": 6.997440273037543, + "grad_norm": 0.07179321358852844, + "learning_rate": 6.517448809972849e-11, + "loss": 0.8656, + "step": 32804 + }, + { + "epoch": 6.997653583617748, + "grad_norm": 0.06898749155415872, + "learning_rate": 5.4764675212481966e-11, + "loss": 0.9044, + "step": 32805 + }, + { + "epoch": 6.997866894197952, + "grad_norm": 0.071429480282273, + "learning_rate": 4.5260063057384063e-11, + "loss": 0.9095, + "step": 32806 + }, + { + "epoch": 6.998080204778157, + "grad_norm": 0.0771399537101337, + "learning_rate": 3.66606517321344e-11, + "loss": 0.913, + "step": 32807 + }, + { + "epoch": 6.998293515358362, + "grad_norm": 0.07431627608523707, + "learning_rate": 2.8966441343314387e-11, + "loss": 0.8562, + "step": 32808 + }, + { + "epoch": 6.998506825938566, + "grad_norm": 0.0688393861725428, + "learning_rate": 2.2177431961978303e-11, + "loss": 0.9004, + "step": 32809 + }, + { + "epoch": 6.998720136518771, + "grad_norm": 0.06863562106339623, + "learning_rate": 1.6293623685825767e-11, + "loss": 0.8758, + "step": 32810 + }, + { + "epoch": 6.998933447098976, + "grad_norm": 0.070491784891534, + "learning_rate": 1.1315016568147485e-11, + "loss": 0.8622, + "step": 32811 + }, + { + "epoch": 6.999146757679181, + "grad_norm": 0.13335438190574062, + "learning_rate": 7.241610662234167e-12, + "loss": 0.8855, + "step": 32812 + }, + { + "epoch": 6.999360068259386, + "grad_norm": 0.07522380178843784, + "learning_rate": 4.073406021376514e-12, + "loss": 0.8884, + "step": 32813 + }, + { + "epoch": 6.999573378839591, + "grad_norm": 0.06618606334292836, + "learning_rate": 1.8104026899834482e-12, + "loss": 0.863, + "step": 32814 + }, + { + "epoch": 6.999786689419795, + "grad_norm": 0.0697898986177729, + "learning_rate": 4.5260067693675414e-13, + "loss": 0.8965, + "step": 32815 + }, + { + "epoch": 7.0, + "grad_norm": 0.07345674802792301, + "learning_rate": 0.0, + "loss": 0.8834, + "step": 32816 + }, + { + "epoch": 7.0, + "step": 32816, + "total_flos": 1.1955108328583987e+17, + "train_loss": 0.9723419804781719, + "train_runtime": 594145.3794, + "train_samples_per_second": 14.138, + "train_steps_per_second": 0.055 + } + ], + "logging_steps": 1.0, + "max_steps": 32816, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1955108328583987e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}