AgentGen-Rep-8B / trainer_state.json
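A minimal sketch for inspecting this file (an illustrative assumption: a local copy saved as trainer_state.json and pandas installed; the field names simply mirror the log_history entries below):

    import json
    import pandas as pd

    # Load the Hugging Face Trainer state written alongside a checkpoint.
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Each log_history entry records epoch, step, loss, learning_rate and grad_norm.
    logs = pd.DataFrame(state["log_history"])
    print(logs[["step", "epoch", "loss", "learning_rate", "grad_norm"]].tail())
    print("global_step:", state["global_step"], "epoch:", state["epoch"])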
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 1431,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011180992313067784,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.2878,
"step": 2
},
{
"epoch": 0.02236198462613557,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.392,
"step": 4
},
{
"epoch": 0.033542976939203356,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.3594,
"step": 6
},
{
"epoch": 0.04472396925227114,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.2958,
"step": 8
},
{
"epoch": 0.055904961565338925,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.3475,
"step": 10
},
{
"epoch": 0.06708595387840671,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.2303,
"step": 12
},
{
"epoch": 0.07826694619147449,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.1964,
"step": 14
},
{
"epoch": 0.08944793850454227,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.3328,
"step": 16
},
{
"epoch": 0.10062893081761007,
"grad_norm": 6.944002672340658,
"learning_rate": 4.999996106235862e-06,
"loss": 1.3134,
"step": 18
},
{
"epoch": 0.11180992313067785,
"grad_norm": 7.800497498064014,
"learning_rate": 4.999964956195521e-06,
"loss": 1.1147,
"step": 20
},
{
"epoch": 0.12299091544374563,
"grad_norm": 4.4662495771497355,
"learning_rate": 4.999902656502973e-06,
"loss": 1.025,
"step": 22
},
{
"epoch": 0.13417190775681342,
"grad_norm": 4.026851738528776,
"learning_rate": 4.999809207934472e-06,
"loss": 1.0448,
"step": 24
},
{
"epoch": 0.1453529000698812,
"grad_norm": 5.658278761851693,
"learning_rate": 4.999684611654392e-06,
"loss": 0.9826,
"step": 26
},
{
"epoch": 0.15653389238294899,
"grad_norm": 3.9275877006609505,
"learning_rate": 4.9995288692152046e-06,
"loss": 0.9627,
"step": 28
},
{
"epoch": 0.16771488469601678,
"grad_norm": 3.634771950296262,
"learning_rate": 4.9993419825574686e-06,
"loss": 0.9476,
"step": 30
},
{
"epoch": 0.17889587700908455,
"grad_norm": 4.604406424526374,
"learning_rate": 4.9992368608591775e-06,
"loss": 0.9414,
"step": 32
},
{
"epoch": 0.19007686932215234,
"grad_norm": 5.708200502114745,
"learning_rate": 4.999003262361029e-06,
"loss": 0.9572,
"step": 34
},
{
"epoch": 0.20125786163522014,
"grad_norm": 5.020134712294459,
"learning_rate": 4.998738526193412e-06,
"loss": 0.9544,
"step": 36
},
{
"epoch": 0.2124388539482879,
"grad_norm": 4.643332496496484,
"learning_rate": 4.998442655654946e-06,
"loss": 0.8504,
"step": 38
},
{
"epoch": 0.2236198462613557,
"grad_norm": 4.7843514072232125,
"learning_rate": 4.998115654432191e-06,
"loss": 0.914,
"step": 40
},
{
"epoch": 0.2348008385744235,
"grad_norm": 3.973113705087721,
"learning_rate": 4.997757526599592e-06,
"loss": 0.8303,
"step": 42
},
{
"epoch": 0.24598183088749126,
"grad_norm": 5.753323652117126,
"learning_rate": 4.9973682766194355e-06,
"loss": 0.8916,
"step": 44
},
{
"epoch": 0.25716282320055905,
"grad_norm": 4.00607759948128,
"learning_rate": 4.996947909341789e-06,
"loss": 0.9391,
"step": 46
},
{
"epoch": 0.26834381551362685,
"grad_norm": 4.73751358896988,
"learning_rate": 4.996496430004446e-06,
"loss": 0.8445,
"step": 48
},
{
"epoch": 0.27952480782669464,
"grad_norm": 3.801634673248135,
"learning_rate": 4.9960138442328535e-06,
"loss": 0.8354,
"step": 50
},
{
"epoch": 0.2907058001397624,
"grad_norm": 4.998706656181077,
"learning_rate": 4.9955001580400475e-06,
"loss": 0.8556,
"step": 52
},
{
"epoch": 0.3018867924528302,
"grad_norm": 5.235396952388322,
"learning_rate": 4.994955377826577e-06,
"loss": 0.8821,
"step": 54
},
{
"epoch": 0.31306778476589797,
"grad_norm": 4.593843550283633,
"learning_rate": 4.994379510380421e-06,
"loss": 0.7965,
"step": 56
},
{
"epoch": 0.32424877707896577,
"grad_norm": 4.636040406542864,
"learning_rate": 4.993772562876909e-06,
"loss": 0.8576,
"step": 58
},
{
"epoch": 0.33542976939203356,
"grad_norm": 4.422458900120915,
"learning_rate": 4.993134542878631e-06,
"loss": 0.8388,
"step": 60
},
{
"epoch": 0.3466107617051013,
"grad_norm": 4.88515796654498,
"learning_rate": 4.992465458335335e-06,
"loss": 0.8427,
"step": 62
},
{
"epoch": 0.3577917540181691,
"grad_norm": 4.620642626620232,
"learning_rate": 4.991765317583841e-06,
"loss": 0.8088,
"step": 64
},
{
"epoch": 0.3689727463312369,
"grad_norm": 3.0164501013815146,
"learning_rate": 4.991034129347927e-06,
"loss": 0.7643,
"step": 66
},
{
"epoch": 0.3801537386443047,
"grad_norm": 4.0807085306410915,
"learning_rate": 4.990271902738223e-06,
"loss": 0.8304,
"step": 68
},
{
"epoch": 0.3913347309573725,
"grad_norm": 4.913983348963418,
"learning_rate": 4.989478647252101e-06,
"loss": 0.8694,
"step": 70
},
{
"epoch": 0.4025157232704403,
"grad_norm": 5.427166275548586,
"learning_rate": 4.988654372773552e-06,
"loss": 0.8031,
"step": 72
},
{
"epoch": 0.413696715583508,
"grad_norm": 4.976699288607289,
"learning_rate": 4.987799089573066e-06,
"loss": 0.7548,
"step": 74
},
{
"epoch": 0.4248777078965758,
"grad_norm": 5.035712861337141,
"learning_rate": 4.986912808307502e-06,
"loss": 0.7769,
"step": 76
},
{
"epoch": 0.4360587002096436,
"grad_norm": 5.703104314189732,
"learning_rate": 4.985995540019956e-06,
"loss": 0.7744,
"step": 78
},
{
"epoch": 0.4472396925227114,
"grad_norm": 3.6174332203212938,
"learning_rate": 4.985047296139622e-06,
"loss": 0.7215,
"step": 80
},
{
"epoch": 0.4584206848357792,
"grad_norm": 5.084461038739496,
"learning_rate": 4.984068088481654e-06,
"loss": 0.7462,
"step": 82
},
{
"epoch": 0.469601677148847,
"grad_norm": 5.500722673783384,
"learning_rate": 4.983057929247014e-06,
"loss": 0.7937,
"step": 84
},
{
"epoch": 0.4807826694619147,
"grad_norm": 5.76928743736382,
"learning_rate": 4.9820168310223215e-06,
"loss": 0.7701,
"step": 86
},
{
"epoch": 0.4919636617749825,
"grad_norm": 4.3638410984754366,
"learning_rate": 4.980944806779698e-06,
"loss": 0.7063,
"step": 88
},
{
"epoch": 0.5031446540880503,
"grad_norm": 6.6022312070502664,
"learning_rate": 4.979841869876603e-06,
"loss": 0.7829,
"step": 90
},
{
"epoch": 0.5143256464011181,
"grad_norm": 5.114853414480892,
"learning_rate": 4.97870803405567e-06,
"loss": 0.7419,
"step": 92
},
{
"epoch": 0.5255066387141859,
"grad_norm": 5.450293615821356,
"learning_rate": 4.977543313444534e-06,
"loss": 0.7428,
"step": 94
},
{
"epoch": 0.5366876310272537,
"grad_norm": 3.888671786201343,
"learning_rate": 4.976347722555655e-06,
"loss": 0.763,
"step": 96
},
{
"epoch": 0.5478686233403215,
"grad_norm": 5.580018062591517,
"learning_rate": 4.975121276286136e-06,
"loss": 0.7451,
"step": 98
},
{
"epoch": 0.5590496156533893,
"grad_norm": 5.244409209125885,
"learning_rate": 4.973863989917545e-06,
"loss": 0.6658,
"step": 100
},
{
"epoch": 0.570230607966457,
"grad_norm": 6.341201782490113,
"learning_rate": 4.9725758791157105e-06,
"loss": 0.7042,
"step": 102
},
{
"epoch": 0.5814116002795248,
"grad_norm": 3.63864440598579,
"learning_rate": 4.9712569599305415e-06,
"loss": 0.6859,
"step": 104
},
{
"epoch": 0.5925925925925926,
"grad_norm": 5.643540415249962,
"learning_rate": 4.9699072487958185e-06,
"loss": 0.7072,
"step": 106
},
{
"epoch": 0.6037735849056604,
"grad_norm": 4.518214836889502,
"learning_rate": 4.968526762528988e-06,
"loss": 0.6989,
"step": 108
},
{
"epoch": 0.6149545772187281,
"grad_norm": 4.813780988459217,
"learning_rate": 4.96711551833096e-06,
"loss": 0.6213,
"step": 110
},
{
"epoch": 0.6261355695317959,
"grad_norm": 6.534716960952802,
"learning_rate": 4.965673533785887e-06,
"loss": 0.6603,
"step": 112
},
{
"epoch": 0.6373165618448637,
"grad_norm": 4.694700268634709,
"learning_rate": 4.9642008268609455e-06,
"loss": 0.6458,
"step": 114
},
{
"epoch": 0.6484975541579315,
"grad_norm": 3.797163997052886,
"learning_rate": 4.962697415906118e-06,
"loss": 0.6208,
"step": 116
},
{
"epoch": 0.6596785464709993,
"grad_norm": 5.303604758140139,
"learning_rate": 4.961163319653959e-06,
"loss": 0.6175,
"step": 118
},
{
"epoch": 0.6708595387840671,
"grad_norm": 3.8308857949946398,
"learning_rate": 4.959598557219361e-06,
"loss": 0.6178,
"step": 120
},
{
"epoch": 0.6820405310971349,
"grad_norm": 5.611339241664303,
"learning_rate": 4.95800314809932e-06,
"loss": 0.617,
"step": 122
},
{
"epoch": 0.6932215234102026,
"grad_norm": 5.234511261826922,
"learning_rate": 4.956377112172691e-06,
"loss": 0.6557,
"step": 124
},
{
"epoch": 0.7044025157232704,
"grad_norm": 4.381066733905507,
"learning_rate": 4.954720469699939e-06,
"loss": 0.6343,
"step": 126
},
{
"epoch": 0.7155835080363382,
"grad_norm": 5.113989443684452,
"learning_rate": 4.953033241322887e-06,
"loss": 0.6135,
"step": 128
},
{
"epoch": 0.726764500349406,
"grad_norm": 5.138987950069777,
"learning_rate": 4.951315448064462e-06,
"loss": 0.6403,
"step": 130
},
{
"epoch": 0.7379454926624738,
"grad_norm": 4.43583718290579,
"learning_rate": 4.949567111328428e-06,
"loss": 0.6226,
"step": 132
},
{
"epoch": 0.7491264849755416,
"grad_norm": 4.391597448273059,
"learning_rate": 4.947788252899124e-06,
"loss": 0.6333,
"step": 134
},
{
"epoch": 0.7603074772886094,
"grad_norm": 4.193385817962468,
"learning_rate": 4.945978894941189e-06,
"loss": 0.6884,
"step": 136
},
{
"epoch": 0.7714884696016772,
"grad_norm": 5.03154779607414,
"learning_rate": 4.944139059999286e-06,
"loss": 0.5783,
"step": 138
},
{
"epoch": 0.782669461914745,
"grad_norm": 6.345004441163444,
"learning_rate": 4.942268770997825e-06,
"loss": 0.5314,
"step": 140
},
{
"epoch": 0.7938504542278128,
"grad_norm": 4.800013540838224,
"learning_rate": 4.940368051240675e-06,
"loss": 0.5876,
"step": 142
},
{
"epoch": 0.8050314465408805,
"grad_norm": 5.229387760297341,
"learning_rate": 4.938436924410869e-06,
"loss": 0.6266,
"step": 144
},
{
"epoch": 0.8162124388539483,
"grad_norm": 5.663117027843187,
"learning_rate": 4.936475414570317e-06,
"loss": 0.5407,
"step": 146
},
{
"epoch": 0.827393431167016,
"grad_norm": 4.355698674662869,
"learning_rate": 4.9344835461595016e-06,
"loss": 0.5757,
"step": 148
},
{
"epoch": 0.8385744234800838,
"grad_norm": 3.73012661577406,
"learning_rate": 4.932461343997174e-06,
"loss": 0.5671,
"step": 150
},
{
"epoch": 0.8497554157931516,
"grad_norm": 5.17610307953933,
"learning_rate": 4.930408833280044e-06,
"loss": 0.5552,
"step": 152
},
{
"epoch": 0.8609364081062194,
"grad_norm": 4.8108290286110575,
"learning_rate": 4.928326039582468e-06,
"loss": 0.5455,
"step": 154
},
{
"epoch": 0.8721174004192872,
"grad_norm": 4.143977047297293,
"learning_rate": 4.926212988856131e-06,
"loss": 0.5865,
"step": 156
},
{
"epoch": 0.883298392732355,
"grad_norm": 4.809016102192773,
"learning_rate": 4.9240697074297205e-06,
"loss": 0.5904,
"step": 158
},
{
"epoch": 0.8944793850454228,
"grad_norm": 4.329310274878485,
"learning_rate": 4.921896222008598e-06,
"loss": 0.5213,
"step": 160
},
{
"epoch": 0.9056603773584906,
"grad_norm": 6.082276125346202,
"learning_rate": 4.919692559674469e-06,
"loss": 0.5321,
"step": 162
},
{
"epoch": 0.9168413696715584,
"grad_norm": 3.595682377289556,
"learning_rate": 4.917458747885045e-06,
"loss": 0.5589,
"step": 164
},
{
"epoch": 0.9280223619846262,
"grad_norm": 4.759398027424621,
"learning_rate": 4.9151948144737e-06,
"loss": 0.5252,
"step": 166
},
{
"epoch": 0.939203354297694,
"grad_norm": 4.925856740501272,
"learning_rate": 4.912900787649124e-06,
"loss": 0.5688,
"step": 168
},
{
"epoch": 0.9503843466107617,
"grad_norm": 4.9751554778931695,
"learning_rate": 4.910576695994976e-06,
"loss": 0.49,
"step": 170
},
{
"epoch": 0.9615653389238294,
"grad_norm": 4.404002437196143,
"learning_rate": 4.908222568469516e-06,
"loss": 0.5031,
"step": 172
},
{
"epoch": 0.9727463312368972,
"grad_norm": 4.438458089119356,
"learning_rate": 4.905838434405259e-06,
"loss": 0.5015,
"step": 174
},
{
"epoch": 0.983927323549965,
"grad_norm": 3.7675300141289205,
"learning_rate": 4.903424323508601e-06,
"loss": 0.5133,
"step": 176
},
{
"epoch": 0.9951083158630328,
"grad_norm": 5.557474516168906,
"learning_rate": 4.900980265859449e-06,
"loss": 0.4913,
"step": 178
},
{
"epoch": 1.0062893081761006,
"grad_norm": 4.4806858821540585,
"learning_rate": 4.898506291910847e-06,
"loss": 0.4446,
"step": 180
},
{
"epoch": 1.0174703004891683,
"grad_norm": 4.605929975666356,
"learning_rate": 4.896002432488599e-06,
"loss": 0.3632,
"step": 182
},
{
"epoch": 1.0286512928022362,
"grad_norm": 4.9794341930411665,
"learning_rate": 4.893468718790883e-06,
"loss": 0.3868,
"step": 184
},
{
"epoch": 1.039832285115304,
"grad_norm": 3.5317296745452733,
"learning_rate": 4.890905182387862e-06,
"loss": 0.4334,
"step": 186
},
{
"epoch": 1.0510132774283718,
"grad_norm": 4.568181420141649,
"learning_rate": 4.88831185522129e-06,
"loss": 0.456,
"step": 188
},
{
"epoch": 1.0621942697414395,
"grad_norm": 3.570260813698039,
"learning_rate": 4.885688769604115e-06,
"loss": 0.3846,
"step": 190
},
{
"epoch": 1.0733752620545074,
"grad_norm": 3.639759353451614,
"learning_rate": 4.883035958220077e-06,
"loss": 0.4363,
"step": 192
},
{
"epoch": 1.084556254367575,
"grad_norm": 4.074741691986429,
"learning_rate": 4.8803534541233016e-06,
"loss": 0.3782,
"step": 194
},
{
"epoch": 1.095737246680643,
"grad_norm": 4.875221867832197,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.3815,
"step": 196
},
{
"epoch": 1.1069182389937107,
"grad_norm": 3.575182053435755,
"learning_rate": 4.874899501857477e-06,
"loss": 0.4023,
"step": 198
},
{
"epoch": 1.1180992313067786,
"grad_norm": 3.984785984285916,
"learning_rate": 4.8721281216448675e-06,
"loss": 0.305,
"step": 200
},
{
"epoch": 1.1292802236198463,
"grad_norm": 3.997235184408756,
"learning_rate": 4.869327184631552e-06,
"loss": 0.3896,
"step": 202
},
{
"epoch": 1.140461215932914,
"grad_norm": 3.403723018382878,
"learning_rate": 4.866496725717304e-06,
"loss": 0.3332,
"step": 204
},
{
"epoch": 1.1516422082459818,
"grad_norm": 3.5740869992425917,
"learning_rate": 4.8636367801697415e-06,
"loss": 0.3299,
"step": 206
},
{
"epoch": 1.1628232005590495,
"grad_norm": 3.8789874672120033,
"learning_rate": 4.860747383623889e-06,
"loss": 0.4145,
"step": 208
},
{
"epoch": 1.1740041928721174,
"grad_norm": 3.8038820435820084,
"learning_rate": 4.857828572081731e-06,
"loss": 0.3171,
"step": 210
},
{
"epoch": 1.1851851851851851,
"grad_norm": 3.260333619392394,
"learning_rate": 4.854880381911762e-06,
"loss": 0.3474,
"step": 212
},
{
"epoch": 1.196366177498253,
"grad_norm": 2.8989963280714925,
"learning_rate": 4.851902849848536e-06,
"loss": 0.3931,
"step": 214
},
{
"epoch": 1.2075471698113207,
"grad_norm": 3.6383247911373773,
"learning_rate": 4.848896012992208e-06,
"loss": 0.3822,
"step": 216
},
{
"epoch": 1.2187281621243886,
"grad_norm": 3.0864181531286734,
"learning_rate": 4.845859908808074e-06,
"loss": 0.378,
"step": 218
},
{
"epoch": 1.2299091544374563,
"grad_norm": 2.494513481207721,
"learning_rate": 4.842794575126099e-06,
"loss": 0.3655,
"step": 220
},
{
"epoch": 1.2410901467505242,
"grad_norm": 2.6074910342756334,
"learning_rate": 4.839700050140448e-06,
"loss": 0.3973,
"step": 222
},
{
"epoch": 1.2522711390635919,
"grad_norm": 2.2421870374103285,
"learning_rate": 4.836576372409015e-06,
"loss": 0.3784,
"step": 224
},
{
"epoch": 1.2634521313766598,
"grad_norm": 2.451559449193117,
"learning_rate": 4.833423580852933e-06,
"loss": 0.3805,
"step": 226
},
{
"epoch": 1.2746331236897275,
"grad_norm": 2.5374184019501285,
"learning_rate": 4.830241714756099e-06,
"loss": 0.293,
"step": 228
},
{
"epoch": 1.2858141160027952,
"grad_norm": 2.525807489259318,
"learning_rate": 4.827030813764677e-06,
"loss": 0.2665,
"step": 230
},
{
"epoch": 1.296995108315863,
"grad_norm": 2.3755504317471523,
"learning_rate": 4.8237909178866075e-06,
"loss": 0.4108,
"step": 232
},
{
"epoch": 1.3081761006289307,
"grad_norm": 2.7662660096000793,
"learning_rate": 4.8205220674911075e-06,
"loss": 0.3928,
"step": 234
},
{
"epoch": 1.3193570929419987,
"grad_norm": 2.245517906271987,
"learning_rate": 4.81722430330817e-06,
"loss": 0.355,
"step": 236
},
{
"epoch": 1.3305380852550663,
"grad_norm": 2.684087860818518,
"learning_rate": 4.813897666428054e-06,
"loss": 0.3624,
"step": 238
},
{
"epoch": 1.3417190775681342,
"grad_norm": 2.5507370157459865,
"learning_rate": 4.810542198300772e-06,
"loss": 0.3494,
"step": 240
},
{
"epoch": 1.352900069881202,
"grad_norm": 2.157612559104276,
"learning_rate": 4.807157940735577e-06,
"loss": 0.3064,
"step": 242
},
{
"epoch": 1.3640810621942698,
"grad_norm": 1.9389355017962189,
"learning_rate": 4.803744935900439e-06,
"loss": 0.3331,
"step": 244
},
{
"epoch": 1.3752620545073375,
"grad_norm": 2.3147558047608867,
"learning_rate": 4.8003032263215185e-06,
"loss": 0.3538,
"step": 246
},
{
"epoch": 1.3864430468204052,
"grad_norm": 2.414181223767401,
"learning_rate": 4.79683285488264e-06,
"loss": 0.3237,
"step": 248
},
{
"epoch": 1.397624039133473,
"grad_norm": 2.0498128676624368,
"learning_rate": 4.793333864824756e-06,
"loss": 0.3742,
"step": 250
},
{
"epoch": 1.408805031446541,
"grad_norm": 2.2294049255917416,
"learning_rate": 4.789806299745405e-06,
"loss": 0.2948,
"step": 252
},
{
"epoch": 1.4199860237596087,
"grad_norm": 2.2210196470155923,
"learning_rate": 4.786250203598174e-06,
"loss": 0.28,
"step": 254
},
{
"epoch": 1.4311670160726764,
"grad_norm": 2.6896787603814816,
"learning_rate": 4.782665620692147e-06,
"loss": 0.3513,
"step": 256
},
{
"epoch": 1.4423480083857443,
"grad_norm": 2.1151921249556644,
"learning_rate": 4.779052595691355e-06,
"loss": 0.3598,
"step": 258
},
{
"epoch": 1.453529000698812,
"grad_norm": 2.6404538176276047,
"learning_rate": 4.775411173614218e-06,
"loss": 0.3075,
"step": 260
},
{
"epoch": 1.4647099930118799,
"grad_norm": 1.9888888421343762,
"learning_rate": 4.771741399832984e-06,
"loss": 0.356,
"step": 262
},
{
"epoch": 1.4758909853249476,
"grad_norm": 2.284642426340359,
"learning_rate": 4.768043320073165e-06,
"loss": 0.2765,
"step": 264
},
{
"epoch": 1.4870719776380152,
"grad_norm": 2.135563450656965,
"learning_rate": 4.764316980412966e-06,
"loss": 0.2825,
"step": 266
},
{
"epoch": 1.4982529699510831,
"grad_norm": 1.8267552790003188,
"learning_rate": 4.7605624272827125e-06,
"loss": 0.3915,
"step": 268
},
{
"epoch": 1.509433962264151,
"grad_norm": 2.26569092336033,
"learning_rate": 4.75677970746427e-06,
"loss": 0.3859,
"step": 270
},
{
"epoch": 1.5206149545772187,
"grad_norm": 2.3510908940666346,
"learning_rate": 4.75296886809046e-06,
"loss": 0.312,
"step": 272
},
{
"epoch": 1.5317959468902864,
"grad_norm": 2.1562478846600883,
"learning_rate": 4.749129956644477e-06,
"loss": 0.4398,
"step": 274
},
{
"epoch": 1.5429769392033543,
"grad_norm": 2.1811966726037655,
"learning_rate": 4.745263020959296e-06,
"loss": 0.3221,
"step": 276
},
{
"epoch": 1.5541579315164222,
"grad_norm": 2.035643810106488,
"learning_rate": 4.741368109217072e-06,
"loss": 0.3317,
"step": 278
},
{
"epoch": 1.56533892382949,
"grad_norm": 2.0722038381676824,
"learning_rate": 4.737445269948543e-06,
"loss": 0.4627,
"step": 280
},
{
"epoch": 1.5765199161425576,
"grad_norm": 2.2584403073433212,
"learning_rate": 4.733494552032426e-06,
"loss": 0.352,
"step": 282
},
{
"epoch": 1.5877009084556253,
"grad_norm": 3.1127410509937783,
"learning_rate": 4.729516004694808e-06,
"loss": 0.3109,
"step": 284
},
{
"epoch": 1.5988819007686932,
"grad_norm": 1.6930738402579835,
"learning_rate": 4.725509677508528e-06,
"loss": 0.3723,
"step": 286
},
{
"epoch": 1.610062893081761,
"grad_norm": 2.6225330496610573,
"learning_rate": 4.721475620392567e-06,
"loss": 0.2853,
"step": 288
},
{
"epoch": 1.6212438853948288,
"grad_norm": 1.998954970455011,
"learning_rate": 4.71741388361142e-06,
"loss": 0.323,
"step": 290
},
{
"epoch": 1.6324248777078965,
"grad_norm": 2.3952745413220677,
"learning_rate": 4.713324517774471e-06,
"loss": 0.4057,
"step": 292
},
{
"epoch": 1.6436058700209644,
"grad_norm": 1.7339961999135642,
"learning_rate": 4.7092075738353625e-06,
"loss": 0.2855,
"step": 294
},
{
"epoch": 1.6547868623340323,
"grad_norm": 2.3672466509243075,
"learning_rate": 4.705063103091365e-06,
"loss": 0.277,
"step": 296
},
{
"epoch": 1.6659678546471,
"grad_norm": 1.92096238087282,
"learning_rate": 4.700891157182729e-06,
"loss": 0.2699,
"step": 298
},
{
"epoch": 1.6771488469601676,
"grad_norm": 1.6478187267877538,
"learning_rate": 4.696691788092049e-06,
"loss": 0.2875,
"step": 300
},
{
"epoch": 1.6883298392732355,
"grad_norm": 2.6637144089516545,
"learning_rate": 4.692465048143615e-06,
"loss": 0.3229,
"step": 302
},
{
"epoch": 1.6995108315863034,
"grad_norm": 2.0530281428374084,
"learning_rate": 4.688210990002755e-06,
"loss": 0.3546,
"step": 304
},
{
"epoch": 1.7106918238993711,
"grad_norm": 2.150198399781322,
"learning_rate": 4.683929666675185e-06,
"loss": 0.4021,
"step": 306
},
{
"epoch": 1.7218728162124388,
"grad_norm": 2.1752313572704542,
"learning_rate": 4.679621131506347e-06,
"loss": 0.3299,
"step": 308
},
{
"epoch": 1.7330538085255065,
"grad_norm": 1.9055889494341978,
"learning_rate": 4.6752854381807414e-06,
"loss": 0.2514,
"step": 310
},
{
"epoch": 1.7442348008385744,
"grad_norm": 2.469483649303522,
"learning_rate": 4.670922640721261e-06,
"loss": 0.332,
"step": 312
},
{
"epoch": 1.7554157931516423,
"grad_norm": 2.327049750502898,
"learning_rate": 4.666532793488518e-06,
"loss": 0.3482,
"step": 314
},
{
"epoch": 1.76659678546471,
"grad_norm": 2.0224582609864674,
"learning_rate": 4.662115951180164e-06,
"loss": 0.3192,
"step": 316
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.9568416201882894,
"learning_rate": 4.657672168830211e-06,
"loss": 0.2682,
"step": 318
},
{
"epoch": 1.7889587700908456,
"grad_norm": 1.919410926201314,
"learning_rate": 4.653201501808346e-06,
"loss": 0.3602,
"step": 320
},
{
"epoch": 1.8001397624039135,
"grad_norm": 2.239752835185363,
"learning_rate": 4.6487040058192385e-06,
"loss": 0.346,
"step": 322
},
{
"epoch": 1.8113207547169812,
"grad_norm": 2.3820790461811643,
"learning_rate": 4.644179736901848e-06,
"loss": 0.393,
"step": 324
},
{
"epoch": 1.8225017470300489,
"grad_norm": 2.100652056063807,
"learning_rate": 4.639628751428728e-06,
"loss": 0.3348,
"step": 326
},
{
"epoch": 1.8336827393431165,
"grad_norm": 1.839587786014522,
"learning_rate": 4.635051106105316e-06,
"loss": 0.297,
"step": 328
},
{
"epoch": 1.8448637316561844,
"grad_norm": 1.460937373317575,
"learning_rate": 4.630446857969238e-06,
"loss": 0.3291,
"step": 330
},
{
"epoch": 1.8560447239692524,
"grad_norm": 3.066440662132836,
"learning_rate": 4.625816064389589e-06,
"loss": 0.2752,
"step": 332
},
{
"epoch": 1.86722571628232,
"grad_norm": 1.9596525632755366,
"learning_rate": 4.62115878306622e-06,
"loss": 0.3444,
"step": 334
},
{
"epoch": 1.8784067085953877,
"grad_norm": 2.2835299782118335,
"learning_rate": 4.616475072029024e-06,
"loss": 0.3013,
"step": 336
},
{
"epoch": 1.8895877009084556,
"grad_norm": 2.1330589159921756,
"learning_rate": 4.6117649896372055e-06,
"loss": 0.3811,
"step": 338
},
{
"epoch": 1.9007686932215235,
"grad_norm": 2.28792058261577,
"learning_rate": 4.607028594578559e-06,
"loss": 0.304,
"step": 340
},
{
"epoch": 1.9119496855345912,
"grad_norm": 1.8457539990364031,
"learning_rate": 4.602265945868735e-06,
"loss": 0.2817,
"step": 342
},
{
"epoch": 1.923130677847659,
"grad_norm": 1.7860630390403116,
"learning_rate": 4.597477102850506e-06,
"loss": 0.3166,
"step": 344
},
{
"epoch": 1.9343116701607268,
"grad_norm": 1.988441202911347,
"learning_rate": 4.592662125193027e-06,
"loss": 0.2881,
"step": 346
},
{
"epoch": 1.9454926624737947,
"grad_norm": 1.7341207391896365,
"learning_rate": 4.587821072891089e-06,
"loss": 0.3126,
"step": 348
},
{
"epoch": 1.9566736547868624,
"grad_norm": 1.8960045369195677,
"learning_rate": 4.582954006264377e-06,
"loss": 0.32,
"step": 350
},
{
"epoch": 1.96785464709993,
"grad_norm": 1.8028316706058551,
"learning_rate": 4.578060985956714e-06,
"loss": 0.3308,
"step": 352
},
{
"epoch": 1.9790356394129978,
"grad_norm": 1.7537644172052635,
"learning_rate": 4.573142072935307e-06,
"loss": 0.325,
"step": 354
},
{
"epoch": 1.9902166317260657,
"grad_norm": 1.5291097261080726,
"learning_rate": 4.568197328489986e-06,
"loss": 0.3418,
"step": 356
},
{
"epoch": 2.0013976240391336,
"grad_norm": 2.703429613422267,
"learning_rate": 4.563226814232444e-06,
"loss": 0.316,
"step": 358
},
{
"epoch": 2.0125786163522013,
"grad_norm": 1.6677019482039983,
"learning_rate": 4.558230592095465e-06,
"loss": 0.2242,
"step": 360
},
{
"epoch": 2.023759608665269,
"grad_norm": 2.1855279147060527,
"learning_rate": 4.5532087243321536e-06,
"loss": 0.1706,
"step": 362
},
{
"epoch": 2.0349406009783366,
"grad_norm": 1.433260386596143,
"learning_rate": 4.548161273515161e-06,
"loss": 0.2597,
"step": 364
},
{
"epoch": 2.0461215932914047,
"grad_norm": 1.9528007044032762,
"learning_rate": 4.543088302535903e-06,
"loss": 0.2321,
"step": 366
},
{
"epoch": 2.0573025856044724,
"grad_norm": 1.508509476663671,
"learning_rate": 4.53798987460378e-06,
"loss": 0.1975,
"step": 368
},
{
"epoch": 2.06848357791754,
"grad_norm": 1.4870411030447606,
"learning_rate": 4.532866053245385e-06,
"loss": 0.218,
"step": 370
},
{
"epoch": 2.079664570230608,
"grad_norm": 1.984299603467917,
"learning_rate": 4.527716902303713e-06,
"loss": 0.1866,
"step": 372
},
{
"epoch": 2.090845562543676,
"grad_norm": 1.7502708144873231,
"learning_rate": 4.522542485937369e-06,
"loss": 0.2128,
"step": 374
},
{
"epoch": 2.1020265548567436,
"grad_norm": 1.131006072907252,
"learning_rate": 4.517342868619764e-06,
"loss": 0.2418,
"step": 376
},
{
"epoch": 2.1132075471698113,
"grad_norm": 2.365723778930082,
"learning_rate": 4.512118115138315e-06,
"loss": 0.2249,
"step": 378
},
{
"epoch": 2.124388539482879,
"grad_norm": 1.7739738087900154,
"learning_rate": 4.506868290593635e-06,
"loss": 0.225,
"step": 380
},
{
"epoch": 2.135569531795947,
"grad_norm": 2.3920039733015197,
"learning_rate": 4.501593460398726e-06,
"loss": 0.207,
"step": 382
},
{
"epoch": 2.146750524109015,
"grad_norm": 1.3961875749075527,
"learning_rate": 4.49629369027816e-06,
"loss": 0.1847,
"step": 384
},
{
"epoch": 2.1579315164220825,
"grad_norm": 1.740079266616333,
"learning_rate": 4.490969046267258e-06,
"loss": 0.2092,
"step": 386
},
{
"epoch": 2.16911250873515,
"grad_norm": 1.716849109423316,
"learning_rate": 4.485619594711278e-06,
"loss": 0.2512,
"step": 388
},
{
"epoch": 2.180293501048218,
"grad_norm": 2.2256205473256836,
"learning_rate": 4.4802454022645725e-06,
"loss": 0.2212,
"step": 390
},
{
"epoch": 2.191474493361286,
"grad_norm": 1.5080548485099736,
"learning_rate": 4.474846535889773e-06,
"loss": 0.2577,
"step": 392
},
{
"epoch": 2.2026554856743537,
"grad_norm": 1.849350001917602,
"learning_rate": 4.469423062856946e-06,
"loss": 0.2518,
"step": 394
},
{
"epoch": 2.2138364779874213,
"grad_norm": 2.0456903454646937,
"learning_rate": 4.463975050742757e-06,
"loss": 0.2666,
"step": 396
},
{
"epoch": 2.225017470300489,
"grad_norm": 2.1576955140860172,
"learning_rate": 4.4585025674296315e-06,
"loss": 0.1881,
"step": 398
},
{
"epoch": 2.236198462613557,
"grad_norm": 1.959825305986428,
"learning_rate": 4.453005681104906e-06,
"loss": 0.1912,
"step": 400
},
{
"epoch": 2.247379454926625,
"grad_norm": 1.8263078605633967,
"learning_rate": 4.44748446025998e-06,
"loss": 0.177,
"step": 402
},
{
"epoch": 2.2585604472396925,
"grad_norm": 1.3737693376807456,
"learning_rate": 4.44193897368946e-06,
"loss": 0.2083,
"step": 404
},
{
"epoch": 2.26974143955276,
"grad_norm": 1.9216745648550881,
"learning_rate": 4.436369290490307e-06,
"loss": 0.269,
"step": 406
},
{
"epoch": 2.280922431865828,
"grad_norm": 1.5225068983698562,
"learning_rate": 4.430775480060973e-06,
"loss": 0.2043,
"step": 408
},
{
"epoch": 2.292103424178896,
"grad_norm": 1.958524495155971,
"learning_rate": 4.425157612100531e-06,
"loss": 0.2735,
"step": 410
},
{
"epoch": 2.3032844164919637,
"grad_norm": 2.020109840115744,
"learning_rate": 4.419515756607819e-06,
"loss": 0.2623,
"step": 412
},
{
"epoch": 2.3144654088050314,
"grad_norm": 1.6832635446278787,
"learning_rate": 4.413849983880554e-06,
"loss": 0.2122,
"step": 414
},
{
"epoch": 2.325646401118099,
"grad_norm": 1.8238819367042174,
"learning_rate": 4.4081603645144685e-06,
"loss": 0.2141,
"step": 416
},
{
"epoch": 2.336827393431167,
"grad_norm": 1.636664838162331,
"learning_rate": 4.4024469694024194e-06,
"loss": 0.2159,
"step": 418
},
{
"epoch": 2.348008385744235,
"grad_norm": 1.563361723149053,
"learning_rate": 4.396709869733515e-06,
"loss": 0.2636,
"step": 420
},
{
"epoch": 2.3591893780573026,
"grad_norm": 1.7104549540666967,
"learning_rate": 4.39094913699222e-06,
"loss": 0.2059,
"step": 422
},
{
"epoch": 2.3703703703703702,
"grad_norm": 1.7448299629844894,
"learning_rate": 4.385164842957469e-06,
"loss": 0.2076,
"step": 424
},
{
"epoch": 2.381551362683438,
"grad_norm": 2.0760771369111812,
"learning_rate": 4.379357059701771e-06,
"loss": 0.2241,
"step": 426
},
{
"epoch": 2.392732354996506,
"grad_norm": 1.4610379659131663,
"learning_rate": 4.373525859590313e-06,
"loss": 0.2135,
"step": 428
},
{
"epoch": 2.4039133473095737,
"grad_norm": 1.9763200369365506,
"learning_rate": 4.367671315280055e-06,
"loss": 0.2225,
"step": 430
},
{
"epoch": 2.4150943396226414,
"grad_norm": 2.138415914668256,
"learning_rate": 4.3617934997188274e-06,
"loss": 0.2618,
"step": 432
},
{
"epoch": 2.426275331935709,
"grad_norm": 1.6842725394389781,
"learning_rate": 4.355892486144419e-06,
"loss": 0.1691,
"step": 434
},
{
"epoch": 2.4374563242487772,
"grad_norm": 2.056626946764254,
"learning_rate": 4.349968348083673e-06,
"loss": 0.1922,
"step": 436
},
{
"epoch": 2.448637316561845,
"grad_norm": 1.2423274511146358,
"learning_rate": 4.3440211593515556e-06,
"loss": 0.2061,
"step": 438
},
{
"epoch": 2.4598183088749126,
"grad_norm": 1.465237522133527,
"learning_rate": 4.338050994050253e-06,
"loss": 0.1996,
"step": 440
},
{
"epoch": 2.4709993011879803,
"grad_norm": 2.1451900105983315,
"learning_rate": 4.332057926568235e-06,
"loss": 0.2441,
"step": 442
},
{
"epoch": 2.4821802935010484,
"grad_norm": 1.5259606296511572,
"learning_rate": 4.326042031579337e-06,
"loss": 0.2066,
"step": 444
},
{
"epoch": 2.493361285814116,
"grad_norm": 2.4163109674867784,
"learning_rate": 4.320003384041823e-06,
"loss": 0.2393,
"step": 446
},
{
"epoch": 2.5045422781271838,
"grad_norm": 2.1518283309231907,
"learning_rate": 4.313942059197457e-06,
"loss": 0.2467,
"step": 448
},
{
"epoch": 2.5157232704402515,
"grad_norm": 1.6715387204280183,
"learning_rate": 4.3078581325705614e-06,
"loss": 0.2495,
"step": 450
},
{
"epoch": 2.5269042627533196,
"grad_norm": 1.7729216990478125,
"learning_rate": 4.3017516799670785e-06,
"loss": 0.1586,
"step": 452
},
{
"epoch": 2.5380852550663873,
"grad_norm": 1.7853923740535589,
"learning_rate": 4.295622777473625e-06,
"loss": 0.2216,
"step": 454
},
{
"epoch": 2.549266247379455,
"grad_norm": 1.7001940457803237,
"learning_rate": 4.289471501456543e-06,
"loss": 0.2288,
"step": 456
},
{
"epoch": 2.5604472396925226,
"grad_norm": 2.5868877625212354,
"learning_rate": 4.283297928560951e-06,
"loss": 0.2075,
"step": 458
},
{
"epoch": 2.5716282320055903,
"grad_norm": 2.1990912649669823,
"learning_rate": 4.277102135709786e-06,
"loss": 0.2017,
"step": 460
},
{
"epoch": 2.582809224318658,
"grad_norm": 2.2627396419665273,
"learning_rate": 4.270884200102848e-06,
"loss": 0.2144,
"step": 462
},
{
"epoch": 2.593990216631726,
"grad_norm": 2.2283930780278505,
"learning_rate": 4.2646441992158356e-06,
"loss": 0.3,
"step": 464
},
{
"epoch": 2.605171208944794,
"grad_norm": 2.6765537923336087,
"learning_rate": 4.258382210799381e-06,
"loss": 0.2441,
"step": 466
},
{
"epoch": 2.6163522012578615,
"grad_norm": 2.0124117535310706,
"learning_rate": 4.252098312878083e-06,
"loss": 0.2667,
"step": 468
},
{
"epoch": 2.6275331935709296,
"grad_norm": 2.0622543839995586,
"learning_rate": 4.245792583749533e-06,
"loss": 0.2209,
"step": 470
},
{
"epoch": 2.6387141858839973,
"grad_norm": 1.7479329049755916,
"learning_rate": 4.2394651019833385e-06,
"loss": 0.2045,
"step": 472
},
{
"epoch": 2.649895178197065,
"grad_norm": 2.223724201139868,
"learning_rate": 4.23311594642015e-06,
"loss": 0.2283,
"step": 474
},
{
"epoch": 2.6610761705101327,
"grad_norm": 1.8280919056271019,
"learning_rate": 4.226745196170669e-06,
"loss": 0.2319,
"step": 476
},
{
"epoch": 2.6722571628232004,
"grad_norm": 1.6911807333452673,
"learning_rate": 4.220352930614672e-06,
"loss": 0.232,
"step": 478
},
{
"epoch": 2.6834381551362685,
"grad_norm": 1.9242468593637576,
"learning_rate": 4.213939229400014e-06,
"loss": 0.2733,
"step": 480
},
{
"epoch": 2.694619147449336,
"grad_norm": 2.1223012349945254,
"learning_rate": 4.20750417244164e-06,
"loss": 0.2529,
"step": 482
},
{
"epoch": 2.705800139762404,
"grad_norm": 2.1921742273194313,
"learning_rate": 4.201047839920589e-06,
"loss": 0.257,
"step": 484
},
{
"epoch": 2.7169811320754715,
"grad_norm": 2.118251084662083,
"learning_rate": 4.194570312282993e-06,
"loss": 0.235,
"step": 486
},
{
"epoch": 2.7281621243885397,
"grad_norm": 1.9816644323530734,
"learning_rate": 4.1880716702390764e-06,
"loss": 0.1839,
"step": 488
},
{
"epoch": 2.7393431167016074,
"grad_norm": 1.8891363830208663,
"learning_rate": 4.181551994762151e-06,
"loss": 0.2301,
"step": 490
},
{
"epoch": 2.750524109014675,
"grad_norm": 1.7502840233703516,
"learning_rate": 4.1750113670876045e-06,
"loss": 0.1883,
"step": 492
},
{
"epoch": 2.7617051013277427,
"grad_norm": 1.5627429248705165,
"learning_rate": 4.16844986871189e-06,
"loss": 0.2042,
"step": 494
},
{
"epoch": 2.7728860936408104,
"grad_norm": 1.8631447011251083,
"learning_rate": 4.161867581391511e-06,
"loss": 0.2018,
"step": 496
},
{
"epoch": 2.7840670859538785,
"grad_norm": 2.0906363974353765,
"learning_rate": 4.155264587142002e-06,
"loss": 0.2319,
"step": 498
},
{
"epoch": 2.795248078266946,
"grad_norm": 1.7819164584799931,
"learning_rate": 4.148640968236903e-06,
"loss": 0.1703,
"step": 500
},
{
"epoch": 2.806429070580014,
"grad_norm": 1.7607086842324982,
"learning_rate": 4.141996807206745e-06,
"loss": 0.2264,
"step": 502
},
{
"epoch": 2.817610062893082,
"grad_norm": 1.5277530729360727,
"learning_rate": 4.135332186838008e-06,
"loss": 0.2134,
"step": 504
},
{
"epoch": 2.8287910552061497,
"grad_norm": 1.739277840645659,
"learning_rate": 4.128647190172099e-06,
"loss": 0.1952,
"step": 506
},
{
"epoch": 2.8399720475192174,
"grad_norm": 1.9987218712547774,
"learning_rate": 4.121941900504316e-06,
"loss": 0.2364,
"step": 508
},
{
"epoch": 2.851153039832285,
"grad_norm": 2.2244662318443225,
"learning_rate": 4.1152164013828035e-06,
"loss": 0.2072,
"step": 510
},
{
"epoch": 2.8623340321453528,
"grad_norm": 1.526547678145968,
"learning_rate": 4.108470776607521e-06,
"loss": 0.2047,
"step": 512
},
{
"epoch": 2.8735150244584204,
"grad_norm": 2.005093613185987,
"learning_rate": 4.1017051102291946e-06,
"loss": 0.2789,
"step": 514
},
{
"epoch": 2.8846960167714886,
"grad_norm": 2.2990829029486624,
"learning_rate": 4.094919486548266e-06,
"loss": 0.2414,
"step": 516
},
{
"epoch": 2.8958770090845563,
"grad_norm": 2.13743283403912,
"learning_rate": 4.088113990113846e-06,
"loss": 0.2029,
"step": 518
},
{
"epoch": 2.907058001397624,
"grad_norm": 1.9027626030017704,
"learning_rate": 4.081288705722666e-06,
"loss": 0.2229,
"step": 520
},
{
"epoch": 2.918238993710692,
"grad_norm": 2.0076859155071745,
"learning_rate": 4.074443718418009e-06,
"loss": 0.1995,
"step": 522
},
{
"epoch": 2.9294199860237597,
"grad_norm": 1.7985240007466619,
"learning_rate": 4.067579113488661e-06,
"loss": 0.1807,
"step": 524
},
{
"epoch": 2.9406009783368274,
"grad_norm": 2.140934337000471,
"learning_rate": 4.060694976467844e-06,
"loss": 0.2532,
"step": 526
},
{
"epoch": 2.951781970649895,
"grad_norm": 2.323003193893417,
"learning_rate": 4.0537913931321495e-06,
"loss": 0.2421,
"step": 528
},
{
"epoch": 2.962962962962963,
"grad_norm": 1.4532319163010707,
"learning_rate": 4.04686844950047e-06,
"loss": 0.2267,
"step": 530
},
{
"epoch": 2.9741439552760305,
"grad_norm": 2.0854922336923023,
"learning_rate": 4.039926231832931e-06,
"loss": 0.266,
"step": 532
},
{
"epoch": 2.9853249475890986,
"grad_norm": 2.882533995321225,
"learning_rate": 4.032964826629811e-06,
"loss": 0.2079,
"step": 534
},
{
"epoch": 2.9965059399021663,
"grad_norm": 2.7236955724192873,
"learning_rate": 4.025984320630465e-06,
"loss": 0.1657,
"step": 536
},
{
"epoch": 3.007686932215234,
"grad_norm": 1.8432900490614266,
"learning_rate": 4.018984800812248e-06,
"loss": 0.1354,
"step": 538
},
{
"epoch": 3.018867924528302,
"grad_norm": 2.0142515580054017,
"learning_rate": 4.011966354389424e-06,
"loss": 0.1542,
"step": 540
},
{
"epoch": 3.03004891684137,
"grad_norm": 2.756352182005047,
"learning_rate": 4.004929068812086e-06,
"loss": 0.1638,
"step": 542
},
{
"epoch": 3.0412299091544375,
"grad_norm": 2.048077691313813,
"learning_rate": 3.997873031765061e-06,
"loss": 0.156,
"step": 544
},
{
"epoch": 3.052410901467505,
"grad_norm": 1.7442233155652336,
"learning_rate": 3.990798331166822e-06,
"loss": 0.1095,
"step": 546
},
{
"epoch": 3.063591893780573,
"grad_norm": 1.826861973142375,
"learning_rate": 3.983705055168391e-06,
"loss": 0.1195,
"step": 548
},
{
"epoch": 3.074772886093641,
"grad_norm": 1.943175517862748,
"learning_rate": 3.976593292152238e-06,
"loss": 0.1638,
"step": 550
},
{
"epoch": 3.0859538784067087,
"grad_norm": 1.5477727978546996,
"learning_rate": 3.969463130731183e-06,
"loss": 0.1291,
"step": 552
},
{
"epoch": 3.0971348707197763,
"grad_norm": 2.3918080397656034,
"learning_rate": 3.9623146597472915e-06,
"loss": 0.1333,
"step": 554
},
{
"epoch": 3.108315863032844,
"grad_norm": 2.0592865934704,
"learning_rate": 3.955147968270764e-06,
"loss": 0.1692,
"step": 556
},
{
"epoch": 3.119496855345912,
"grad_norm": 1.280306245998938,
"learning_rate": 3.947963145598833e-06,
"loss": 0.1695,
"step": 558
},
{
"epoch": 3.13067784765898,
"grad_norm": 1.5568837418874426,
"learning_rate": 3.940760281254645e-06,
"loss": 0.1614,
"step": 560
},
{
"epoch": 3.1418588399720475,
"grad_norm": 1.6248982612645957,
"learning_rate": 3.933539464986143e-06,
"loss": 0.1184,
"step": 562
},
{
"epoch": 3.153039832285115,
"grad_norm": 1.657284019650329,
"learning_rate": 3.926300786764957e-06,
"loss": 0.1523,
"step": 564
},
{
"epoch": 3.164220824598183,
"grad_norm": 1.9315037734198213,
"learning_rate": 3.919044336785274e-06,
"loss": 0.1411,
"step": 566
},
{
"epoch": 3.175401816911251,
"grad_norm": 1.7456382044347782,
"learning_rate": 3.911770205462717e-06,
"loss": 0.1764,
"step": 568
},
{
"epoch": 3.1865828092243187,
"grad_norm": 1.4045398532057205,
"learning_rate": 3.904478483433223e-06,
"loss": 0.1241,
"step": 570
},
{
"epoch": 3.1977638015373864,
"grad_norm": 2.0886459168414895,
"learning_rate": 3.897169261551907e-06,
"loss": 0.1475,
"step": 572
},
{
"epoch": 3.208944793850454,
"grad_norm": 1.9098750157027404,
"learning_rate": 3.889842630891934e-06,
"loss": 0.138,
"step": 574
},
{
"epoch": 3.220125786163522,
"grad_norm": 2.184899827108709,
"learning_rate": 3.8824986827433804e-06,
"loss": 0.1315,
"step": 576
},
{
"epoch": 3.23130677847659,
"grad_norm": 1.528868394326383,
"learning_rate": 3.875137508612104e-06,
"loss": 0.1447,
"step": 578
},
{
"epoch": 3.2424877707896576,
"grad_norm": 1.6893708687857107,
"learning_rate": 3.867759200218594e-06,
"loss": 0.1746,
"step": 580
},
{
"epoch": 3.2536687631027252,
"grad_norm": 1.2610411246909474,
"learning_rate": 3.860363849496836e-06,
"loss": 0.1301,
"step": 582
},
{
"epoch": 3.264849755415793,
"grad_norm": 1.397542140556738,
"learning_rate": 3.852951548593161e-06,
"loss": 0.1373,
"step": 584
},
{
"epoch": 3.276030747728861,
"grad_norm": 1.9903353672741917,
"learning_rate": 3.845522389865106e-06,
"loss": 0.1609,
"step": 586
},
{
"epoch": 3.2872117400419287,
"grad_norm": 1.8370941337314268,
"learning_rate": 3.838076465880248e-06,
"loss": 0.148,
"step": 588
},
{
"epoch": 3.2983927323549964,
"grad_norm": 2.058865100613852,
"learning_rate": 3.830613869415069e-06,
"loss": 0.1483,
"step": 590
},
{
"epoch": 3.309573724668064,
"grad_norm": 1.5232253694216566,
"learning_rate": 3.823134693453782e-06,
"loss": 0.1621,
"step": 592
},
{
"epoch": 3.3207547169811322,
"grad_norm": 1.4993049111722665,
"learning_rate": 3.8156390311871885e-06,
"loss": 0.1433,
"step": 594
},
{
"epoch": 3.3319357092942,
"grad_norm": 1.555934394379587,
"learning_rate": 3.808126976011505e-06,
"loss": 0.1426,
"step": 596
},
{
"epoch": 3.3431167016072676,
"grad_norm": 1.3356473446523094,
"learning_rate": 3.8005986215272056e-06,
"loss": 0.1706,
"step": 598
},
{
"epoch": 3.3542976939203353,
"grad_norm": 1.9137688829035275,
"learning_rate": 3.7930540615378565e-06,
"loss": 0.1268,
"step": 600
},
{
"epoch": 3.3654786862334034,
"grad_norm": 1.5344748040953766,
"learning_rate": 3.785493390048942e-06,
"loss": 0.1458,
"step": 602
},
{
"epoch": 3.376659678546471,
"grad_norm": 1.602087497610558,
"learning_rate": 3.777916701266699e-06,
"loss": 0.1697,
"step": 604
},
{
"epoch": 3.3878406708595388,
"grad_norm": 1.4842568873334896,
"learning_rate": 3.7703240895969373e-06,
"loss": 0.1519,
"step": 606
},
{
"epoch": 3.3990216631726065,
"grad_norm": 1.53860971256147,
"learning_rate": 3.7627156496438686e-06,
"loss": 0.1691,
"step": 608
},
{
"epoch": 3.4102026554856746,
"grad_norm": 1.4193083610134813,
"learning_rate": 3.755091476208925e-06,
"loss": 0.1211,
"step": 610
},
{
"epoch": 3.4213836477987423,
"grad_norm": 1.8053625548432577,
"learning_rate": 3.7474516642895804e-06,
"loss": 0.131,
"step": 612
},
{
"epoch": 3.43256464011181,
"grad_norm": 1.9235537907938398,
"learning_rate": 3.7397963090781606e-06,
"loss": 0.163,
"step": 614
},
{
"epoch": 3.4437456324248776,
"grad_norm": 1.6022979215271898,
"learning_rate": 3.732125505960665e-06,
"loss": 0.1479,
"step": 616
},
{
"epoch": 3.4549266247379453,
"grad_norm": 1.663918706474492,
"learning_rate": 3.7244393505155713e-06,
"loss": 0.1376,
"step": 618
},
{
"epoch": 3.4661076170510134,
"grad_norm": 1.7974067820999995,
"learning_rate": 3.716737938512651e-06,
"loss": 0.1281,
"step": 620
},
{
"epoch": 3.477288609364081,
"grad_norm": 2.10108609081228,
"learning_rate": 3.709021365911772e-06,
"loss": 0.1388,
"step": 622
},
{
"epoch": 3.488469601677149,
"grad_norm": 1.367826215107555,
"learning_rate": 3.701289728861701e-06,
"loss": 0.1191,
"step": 624
},
{
"epoch": 3.4996505939902165,
"grad_norm": 1.7959553374302317,
"learning_rate": 3.693543123698913e-06,
"loss": 0.1758,
"step": 626
},
{
"epoch": 3.5108315863032846,
"grad_norm": 1.7389366148854988,
"learning_rate": 3.6857816469463806e-06,
"loss": 0.1405,
"step": 628
},
{
"epoch": 3.5220125786163523,
"grad_norm": 2.871162474790627,
"learning_rate": 3.6780053953123836e-06,
"loss": 0.1549,
"step": 630
},
{
"epoch": 3.53319357092942,
"grad_norm": 1.478751565339363,
"learning_rate": 3.6702144656892907e-06,
"loss": 0.1759,
"step": 632
},
{
"epoch": 3.5443745632424877,
"grad_norm": 1.4974413518112613,
"learning_rate": 3.662408955152364e-06,
"loss": 0.1078,
"step": 634
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.7006067350332152,
"learning_rate": 3.6545889609585405e-06,
"loss": 0.1427,
"step": 636
},
{
"epoch": 3.5667365478686235,
"grad_norm": 1.8754398825641954,
"learning_rate": 3.6467545805452266e-06,
"loss": 0.1893,
"step": 638
},
{
"epoch": 3.577917540181691,
"grad_norm": 1.7762501705151392,
"learning_rate": 3.6389059115290813e-06,
"loss": 0.1109,
"step": 640
},
{
"epoch": 3.589098532494759,
"grad_norm": 2.0251975300449327,
"learning_rate": 3.631043051704799e-06,
"loss": 0.121,
"step": 642
},
{
"epoch": 3.6002795248078265,
"grad_norm": 1.3531681902278672,
"learning_rate": 3.6231660990438922e-06,
"loss": 0.1348,
"step": 644
},
{
"epoch": 3.6114605171208947,
"grad_norm": 1.9724391202631109,
"learning_rate": 3.615275151693471e-06,
"loss": 0.1449,
"step": 646
},
{
"epoch": 3.6226415094339623,
"grad_norm": 1.785158595271644,
"learning_rate": 3.6073703079750204e-06,
"loss": 0.1485,
"step": 648
},
{
"epoch": 3.63382250174703,
"grad_norm": 1.829166278099355,
"learning_rate": 3.5994516663831734e-06,
"loss": 0.1192,
"step": 650
},
{
"epoch": 3.6450034940600977,
"grad_norm": 1.9222881871208803,
"learning_rate": 3.591519325584487e-06,
"loss": 0.1635,
"step": 652
},
{
"epoch": 3.6561844863731654,
"grad_norm": 2.052453811112636,
"learning_rate": 3.583573384416209e-06,
"loss": 0.1561,
"step": 654
},
{
"epoch": 3.6673654786862335,
"grad_norm": 1.9190051036571132,
"learning_rate": 3.575613941885047e-06,
"loss": 0.1051,
"step": 656
},
{
"epoch": 3.678546470999301,
"grad_norm": 1.4736638642637576,
"learning_rate": 3.5676410971659404e-06,
"loss": 0.123,
"step": 658
},
{
"epoch": 3.689727463312369,
"grad_norm": 1.7325761695268906,
"learning_rate": 3.5596549496008165e-06,
"loss": 0.1446,
"step": 660
},
{
"epoch": 3.700908455625437,
"grad_norm": 2.0344810615726288,
"learning_rate": 3.551655598697358e-06,
"loss": 0.1629,
"step": 662
},
{
"epoch": 3.7120894479385047,
"grad_norm": 1.936581123166174,
"learning_rate": 3.54364314412776e-06,
"loss": 0.1569,
"step": 664
},
{
"epoch": 3.7232704402515724,
"grad_norm": 1.3525874354992642,
"learning_rate": 3.535617685727494e-06,
"loss": 0.1082,
"step": 666
},
{
"epoch": 3.73445143256464,
"grad_norm": 1.6514309403224916,
"learning_rate": 3.527579323494055e-06,
"loss": 0.1431,
"step": 668
},
{
"epoch": 3.7456324248777078,
"grad_norm": 1.8602451468342234,
"learning_rate": 3.5195281575857228e-06,
"loss": 0.1639,
"step": 670
},
{
"epoch": 3.7568134171907754,
"grad_norm": 1.4731268992440232,
"learning_rate": 3.511464288320311e-06,
"loss": 0.1271,
"step": 672
},
{
"epoch": 3.7679944095038436,
"grad_norm": 1.37724516129253,
"learning_rate": 3.503387816173916e-06,
"loss": 0.1597,
"step": 674
},
{
"epoch": 3.7791754018169113,
"grad_norm": 1.7200144334067748,
"learning_rate": 3.495298841779669e-06,
"loss": 0.117,
"step": 676
},
{
"epoch": 3.790356394129979,
"grad_norm": 1.92538314164391,
"learning_rate": 3.4871974659264786e-06,
"loss": 0.1584,
"step": 678
},
{
"epoch": 3.801537386443047,
"grad_norm": 1.4718208788605616,
"learning_rate": 3.4790837895577752e-06,
"loss": 0.1333,
"step": 680
},
{
"epoch": 3.8127183787561147,
"grad_norm": 1.5582481918696203,
"learning_rate": 3.470957913770255e-06,
"loss": 0.1464,
"step": 682
},
{
"epoch": 3.8238993710691824,
"grad_norm": 1.4618275028428347,
"learning_rate": 3.462819939812618e-06,
"loss": 0.0995,
"step": 684
},
{
"epoch": 3.83508036338225,
"grad_norm": 1.3366351935592664,
"learning_rate": 3.4546699690843123e-06,
"loss": 0.1204,
"step": 686
},
{
"epoch": 3.846261355695318,
"grad_norm": 1.3780079667316787,
"learning_rate": 3.446508103134259e-06,
"loss": 0.1701,
"step": 688
},
{
"epoch": 3.8574423480083855,
"grad_norm": 1.7451718870626607,
"learning_rate": 3.4383344436595992e-06,
"loss": 0.1158,
"step": 690
},
{
"epoch": 3.8686233403214536,
"grad_norm": 2.019474198008684,
"learning_rate": 3.430149092504422e-06,
"loss": 0.1304,
"step": 692
},
{
"epoch": 3.8798043326345213,
"grad_norm": 1.6820935429062616,
"learning_rate": 3.4219521516584912e-06,
"loss": 0.1334,
"step": 694
},
{
"epoch": 3.890985324947589,
"grad_norm": 2.2578057319721236,
"learning_rate": 3.4137437232559834e-06,
"loss": 0.1557,
"step": 696
},
{
"epoch": 3.902166317260657,
"grad_norm": 1.3610116271561221,
"learning_rate": 3.4055239095742067e-06,
"loss": 0.1644,
"step": 698
},
{
"epoch": 3.913347309573725,
"grad_norm": 1.3397050224861815,
"learning_rate": 3.3972928130323322e-06,
"loss": 0.1471,
"step": 700
},
{
"epoch": 3.9245283018867925,
"grad_norm": 1.5234658664307734,
"learning_rate": 3.3890505361901153e-06,
"loss": 0.1195,
"step": 702
},
{
"epoch": 3.93570929419986,
"grad_norm": 1.763362220735128,
"learning_rate": 3.380797181746619e-06,
"loss": 0.1363,
"step": 704
},
{
"epoch": 3.946890286512928,
"grad_norm": 2.038986301246902,
"learning_rate": 3.3725328525389324e-06,
"loss": 0.1203,
"step": 706
},
{
"epoch": 3.958071278825996,
"grad_norm": 1.9046513315579439,
"learning_rate": 3.364257651540891e-06,
"loss": 0.1578,
"step": 708
},
{
"epoch": 3.9692522711390636,
"grad_norm": 1.423399143627221,
"learning_rate": 3.355971681861794e-06,
"loss": 0.1211,
"step": 710
},
{
"epoch": 3.9804332634521313,
"grad_norm": 1.5586817639667492,
"learning_rate": 3.3476750467451176e-06,
"loss": 0.153,
"step": 712
},
{
"epoch": 3.991614255765199,
"grad_norm": 1.4814888460752178,
"learning_rate": 3.33936784956723e-06,
"loss": 0.1288,
"step": 714
},
{
"epoch": 4.002795248078267,
"grad_norm": 1.6561127976965244,
"learning_rate": 3.331050193836104e-06,
"loss": 0.1196,
"step": 716
},
{
"epoch": 4.013976240391335,
"grad_norm": 1.8246755797846792,
"learning_rate": 3.322722183190025e-06,
"loss": 0.0983,
"step": 718
},
{
"epoch": 4.0251572327044025,
"grad_norm": 1.2508646883720782,
"learning_rate": 3.3143839213963026e-06,
"loss": 0.1132,
"step": 720
},
{
"epoch": 4.03633822501747,
"grad_norm": 1.3174073933660169,
"learning_rate": 3.306035512349974e-06,
"loss": 0.0886,
"step": 722
},
{
"epoch": 4.047519217330538,
"grad_norm": 1.4006843207756257,
"learning_rate": 3.297677060072513e-06,
"loss": 0.0907,
"step": 724
},
{
"epoch": 4.058700209643606,
"grad_norm": 2.147633002379955,
"learning_rate": 3.2893086687105324e-06,
"loss": 0.0814,
"step": 726
},
{
"epoch": 4.069881201956673,
"grad_norm": 1.8499679148666142,
"learning_rate": 3.280930442534486e-06,
"loss": 0.0916,
"step": 728
},
{
"epoch": 4.081062194269742,
"grad_norm": 1.5576608674855401,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0814,
"step": 730
},
{
"epoch": 4.0922431865828095,
"grad_norm": 1.5258204722757824,
"learning_rate": 3.264144903433419e-06,
"loss": 0.0929,
"step": 732
},
{
"epoch": 4.103424178895877,
"grad_norm": 1.2377371189448831,
"learning_rate": 3.2557377996568135e-06,
"loss": 0.0933,
"step": 734
},
{
"epoch": 4.114605171208945,
"grad_norm": 1.6706792363129992,
"learning_rate": 3.247321279360363e-06,
"loss": 0.0957,
"step": 736
},
{
"epoch": 4.1257861635220126,
"grad_norm": 1.5205095000978939,
"learning_rate": 3.238895447414211e-06,
"loss": 0.1094,
"step": 738
},
{
"epoch": 4.13696715583508,
"grad_norm": 1.8218111131497405,
"learning_rate": 3.2304604088045206e-06,
"loss": 0.0866,
"step": 740
},
{
"epoch": 4.148148148148148,
"grad_norm": 1.5060146063158792,
"learning_rate": 3.222016268632175e-06,
"loss": 0.0974,
"step": 742
},
{
"epoch": 4.159329140461216,
"grad_norm": 2.33394735696618,
"learning_rate": 3.2135631321114603e-06,
"loss": 0.0767,
"step": 744
},
{
"epoch": 4.170510132774284,
"grad_norm": 1.8304481485687374,
"learning_rate": 3.2051011045687574e-06,
"loss": 0.1027,
"step": 746
},
{
"epoch": 4.181691125087352,
"grad_norm": 1.4496933516097028,
"learning_rate": 3.196630291441231e-06,
"loss": 0.073,
"step": 748
},
{
"epoch": 4.1928721174004195,
"grad_norm": 1.5989097781751378,
"learning_rate": 3.1881507982755126e-06,
"loss": 0.074,
"step": 750
},
{
"epoch": 4.204053109713487,
"grad_norm": 1.5479651084913313,
"learning_rate": 3.17966273072639e-06,
"loss": 0.0941,
"step": 752
},
{
"epoch": 4.215234102026555,
"grad_norm": 1.4844971201883568,
"learning_rate": 3.1711661945554857e-06,
"loss": 0.1171,
"step": 754
},
{
"epoch": 4.226415094339623,
"grad_norm": 1.538555100844062,
"learning_rate": 3.162661295629942e-06,
"loss": 0.0839,
"step": 756
},
{
"epoch": 4.23759608665269,
"grad_norm": 1.511356916861757,
"learning_rate": 3.154148139921102e-06,
"loss": 0.1039,
"step": 758
},
{
"epoch": 4.248777078965758,
"grad_norm": 1.811476489190878,
"learning_rate": 3.1456268335031886e-06,
"loss": 0.0794,
"step": 760
},
{
"epoch": 4.259958071278826,
"grad_norm": 1.6229333309674812,
"learning_rate": 3.137097482551983e-06,
"loss": 0.1152,
"step": 762
},
{
"epoch": 4.271139063591894,
"grad_norm": 1.4723017587041405,
"learning_rate": 3.128560193343501e-06,
"loss": 0.0944,
"step": 764
},
{
"epoch": 4.282320055904962,
"grad_norm": 1.0034690245189755,
"learning_rate": 3.1200150722526693e-06,
"loss": 0.0663,
"step": 766
},
{
"epoch": 4.29350104821803,
"grad_norm": 1.5551415143149132,
"learning_rate": 3.1114622257520004e-06,
"loss": 0.1021,
"step": 768
},
{
"epoch": 4.304682040531097,
"grad_norm": 1.836559018121584,
"learning_rate": 3.1029017604102655e-06,
"loss": 0.099,
"step": 770
},
{
"epoch": 4.315863032844165,
"grad_norm": 1.0818921388079483,
"learning_rate": 3.0943337828911673e-06,
"loss": 0.0899,
"step": 772
},
{
"epoch": 4.327044025157233,
"grad_norm": 0.9784785751112162,
"learning_rate": 3.085758399952011e-06,
"loss": 0.1016,
"step": 774
},
{
"epoch": 4.3382250174703,
"grad_norm": 1.348338975607883,
"learning_rate": 3.0771757184423716e-06,
"loss": 0.1063,
"step": 776
},
{
"epoch": 4.349406009783368,
"grad_norm": 2.1529902019434455,
"learning_rate": 3.0685858453027668e-06,
"loss": 0.089,
"step": 778
},
{
"epoch": 4.360587002096436,
"grad_norm": 1.3031273077449874,
"learning_rate": 3.0599888875633192e-06,
"loss": 0.1077,
"step": 780
},
{
"epoch": 4.371767994409504,
"grad_norm": 1.3772043306307704,
"learning_rate": 3.0513849523424298e-06,
"loss": 0.0879,
"step": 782
},
{
"epoch": 4.382948986722572,
"grad_norm": 1.7829225937512299,
"learning_rate": 3.0427741468454375e-06,
"loss": 0.1099,
"step": 784
},
{
"epoch": 4.39412997903564,
"grad_norm": 1.1143653742483424,
"learning_rate": 3.034156578363284e-06,
"loss": 0.0908,
"step": 786
},
{
"epoch": 4.405310971348707,
"grad_norm": 1.9841896768408593,
"learning_rate": 3.0255323542711784e-06,
"loss": 0.0846,
"step": 788
},
{
"epoch": 4.416491963661775,
"grad_norm": 1.1622503242476587,
"learning_rate": 3.0169015820272595e-06,
"loss": 0.0809,
"step": 790
},
{
"epoch": 4.427672955974843,
"grad_norm": 1.4138977756081776,
"learning_rate": 3.0082643691712572e-06,
"loss": 0.0832,
"step": 792
},
{
"epoch": 4.43885394828791,
"grad_norm": 1.3694425414816003,
"learning_rate": 2.9996208233231506e-06,
"loss": 0.1015,
"step": 794
},
{
"epoch": 4.450034940600978,
"grad_norm": 1.8252502558409327,
"learning_rate": 2.9909710521818265e-06,
"loss": 0.1049,
"step": 796
},
{
"epoch": 4.461215932914046,
"grad_norm": 1.4396307405101365,
"learning_rate": 2.9823151635237424e-06,
"loss": 0.0613,
"step": 798
},
{
"epoch": 4.472396925227114,
"grad_norm": 1.3667673153541864,
"learning_rate": 2.973653265201578e-06,
"loss": 0.1081,
"step": 800
},
{
"epoch": 4.483577917540182,
"grad_norm": 1.761976942384573,
"learning_rate": 2.964985465142895e-06,
"loss": 0.1002,
"step": 802
},
{
"epoch": 4.49475890985325,
"grad_norm": 1.6343471974417978,
"learning_rate": 2.9563118713487895e-06,
"loss": 0.0749,
"step": 804
},
{
"epoch": 4.505939902166317,
"grad_norm": 2.0454570442431046,
"learning_rate": 2.9476325918925484e-06,
"loss": 0.0857,
"step": 806
},
{
"epoch": 4.517120894479385,
"grad_norm": 1.7007295640066746,
"learning_rate": 2.938947734918302e-06,
"loss": 0.1085,
"step": 808
},
{
"epoch": 4.528301886792453,
"grad_norm": 1.5611422829954795,
"learning_rate": 2.9302574086396774e-06,
"loss": 0.0775,
"step": 810
},
{
"epoch": 4.53948287910552,
"grad_norm": 1.7913016893140525,
"learning_rate": 2.9215617213384494e-06,
"loss": 0.0875,
"step": 812
},
{
"epoch": 4.550663871418588,
"grad_norm": 1.5753063947599002,
"learning_rate": 2.91286078136319e-06,
"loss": 0.0805,
"step": 814
},
{
"epoch": 4.561844863731656,
"grad_norm": 1.8942921897754963,
"learning_rate": 2.904154697127921e-06,
"loss": 0.0806,
"step": 816
},
{
"epoch": 4.573025856044724,
"grad_norm": 1.791394910046461,
"learning_rate": 2.8954435771107604e-06,
"loss": 0.0992,
"step": 818
},
{
"epoch": 4.584206848357792,
"grad_norm": 1.245790765054016,
"learning_rate": 2.8867275298525743e-06,
"loss": 0.0886,
"step": 820
},
{
"epoch": 4.59538784067086,
"grad_norm": 1.5133863011334676,
"learning_rate": 2.878006663955621e-06,
"loss": 0.0886,
"step": 822
},
{
"epoch": 4.606568832983927,
"grad_norm": 2.0502622868705993,
"learning_rate": 2.8692810880821997e-06,
"loss": 0.0716,
"step": 824
},
{
"epoch": 4.617749825296995,
"grad_norm": 1.2876873289352964,
"learning_rate": 2.860550910953296e-06,
"loss": 0.0943,
"step": 826
},
{
"epoch": 4.628930817610063,
"grad_norm": 1.440475980645125,
"learning_rate": 2.8518162413472266e-06,
"loss": 0.1083,
"step": 828
},
{
"epoch": 4.64011180992313,
"grad_norm": 1.3754262878787067,
"learning_rate": 2.843077188098286e-06,
"loss": 0.1041,
"step": 830
},
{
"epoch": 4.651292802236198,
"grad_norm": 1.4424213259038674,
"learning_rate": 2.834333860095388e-06,
"loss": 0.0807,
"step": 832
},
{
"epoch": 4.662473794549266,
"grad_norm": 1.994638545215632,
"learning_rate": 2.8255863662807097e-06,
"loss": 0.0819,
"step": 834
},
{
"epoch": 4.673654786862334,
"grad_norm": 1.5478645240921063,
"learning_rate": 2.8168348156483356e-06,
"loss": 0.113,
"step": 836
},
{
"epoch": 4.684835779175402,
"grad_norm": 1.324879005941319,
"learning_rate": 2.8124575531000226e-06,
"loss": 0.11,
"step": 838
},
{
"epoch": 4.69601677148847,
"grad_norm": 1.5993247352100177,
"learning_rate": 2.803700121715214e-06,
"loss": 0.0903,
"step": 840
},
{
"epoch": 4.707197763801537,
"grad_norm": 1.256541482417978,
"learning_rate": 2.7949389062160946e-06,
"loss": 0.0925,
"step": 842
},
{
"epoch": 4.718378756114605,
"grad_norm": 2.706891920194882,
"learning_rate": 2.786174015767721e-06,
"loss": 0.084,
"step": 844
},
{
"epoch": 4.729559748427673,
"grad_norm": 1.3220515828132557,
"learning_rate": 2.7774055595809395e-06,
"loss": 0.0801,
"step": 846
},
{
"epoch": 4.7407407407407405,
"grad_norm": 1.5911477732332153,
"learning_rate": 2.768633646911027e-06,
"loss": 0.0938,
"step": 848
},
{
"epoch": 4.751921733053808,
"grad_norm": 1.1333988378482527,
"learning_rate": 2.759858387056325e-06,
"loss": 0.0721,
"step": 850
},
{
"epoch": 4.763102725366876,
"grad_norm": 1.4690260920140663,
"learning_rate": 2.7510798893568846e-06,
"loss": 0.0769,
"step": 852
},
{
"epoch": 4.774283717679944,
"grad_norm": 1.3785131166774844,
"learning_rate": 2.742298263193099e-06,
"loss": 0.1064,
"step": 854
},
{
"epoch": 4.785464709993012,
"grad_norm": 1.39128795327872,
"learning_rate": 2.733513617984342e-06,
"loss": 0.075,
"step": 856
},
{
"epoch": 4.79664570230608,
"grad_norm": 1.6826021403482612,
"learning_rate": 2.724726063187605e-06,
"loss": 0.1175,
"step": 858
},
{
"epoch": 4.8078266946191475,
"grad_norm": 1.353741266830404,
"learning_rate": 2.715935708296134e-06,
"loss": 0.1146,
"step": 860
},
{
"epoch": 4.819007686932215,
"grad_norm": 1.4488179633464906,
"learning_rate": 2.707142662838062e-06,
"loss": 0.1033,
"step": 862
},
{
"epoch": 4.830188679245283,
"grad_norm": 1.307354977462126,
"learning_rate": 2.6983470363750497e-06,
"loss": 0.093,
"step": 864
},
{
"epoch": 4.8413696715583505,
"grad_norm": 1.4753004858703918,
"learning_rate": 2.689548938500914e-06,
"loss": 0.0905,
"step": 866
},
{
"epoch": 4.852550663871418,
"grad_norm": 1.551558439927485,
"learning_rate": 2.6807484788402676e-06,
"loss": 0.075,
"step": 868
},
{
"epoch": 4.863731656184486,
"grad_norm": 1.499892261020302,
"learning_rate": 2.67194576704715e-06,
"loss": 0.0876,
"step": 870
},
{
"epoch": 4.8749126484975545,
"grad_norm": 1.82643381640813,
"learning_rate": 2.6631409128036637e-06,
"loss": 0.0892,
"step": 872
},
{
"epoch": 4.886093640810622,
"grad_norm": 1.3480606493487655,
"learning_rate": 2.6543340258186063e-06,
"loss": 0.0816,
"step": 874
},
{
"epoch": 4.89727463312369,
"grad_norm": 2.2307067144092407,
"learning_rate": 2.6455252158261015e-06,
"loss": 0.0994,
"step": 876
},
{
"epoch": 4.9084556254367575,
"grad_norm": 1.8646868858712458,
"learning_rate": 2.636714592584235e-06,
"loss": 0.0902,
"step": 878
},
{
"epoch": 4.919636617749825,
"grad_norm": 1.535171207325978,
"learning_rate": 2.6279022658736856e-06,
"loss": 0.0911,
"step": 880
},
{
"epoch": 4.930817610062893,
"grad_norm": 1.1594360070916991,
"learning_rate": 2.619088345496358e-06,
"loss": 0.066,
"step": 882
},
{
"epoch": 4.941998602375961,
"grad_norm": 1.6526631394475477,
"learning_rate": 2.610272941274012e-06,
"loss": 0.1014,
"step": 884
},
{
"epoch": 4.953179594689029,
"grad_norm": 1.8240816325874138,
"learning_rate": 2.6014561630468993e-06,
"loss": 0.0928,
"step": 886
},
{
"epoch": 4.964360587002097,
"grad_norm": 1.3816438884334348,
"learning_rate": 2.5926381206723885e-06,
"loss": 0.088,
"step": 888
},
{
"epoch": 4.9755415793151645,
"grad_norm": 1.3157397283692482,
"learning_rate": 2.583818924023601e-06,
"loss": 0.0938,
"step": 890
},
{
"epoch": 4.986722571628232,
"grad_norm": 1.464557516575305,
"learning_rate": 2.5749986829880423e-06,
"loss": 0.0781,
"step": 892
},
{
"epoch": 4.9979035639413,
"grad_norm": 1.8481309973872981,
"learning_rate": 2.5661775074662276e-06,
"loss": 0.0708,
"step": 894
},
{
"epoch": 5.0090845562543675,
"grad_norm": 1.3777408578534927,
"learning_rate": 2.5573555073703172e-06,
"loss": 0.0574,
"step": 896
},
{
"epoch": 5.020265548567435,
"grad_norm": 1.5585565063610693,
"learning_rate": 2.5485327926227464e-06,
"loss": 0.0533,
"step": 898
},
{
"epoch": 5.031446540880503,
"grad_norm": 3.8488829032344403,
"learning_rate": 2.539709473154855e-06,
"loss": 0.0524,
"step": 900
},
{
"epoch": 5.042627533193571,
"grad_norm": 1.360678519326562,
"learning_rate": 2.5308856589055164e-06,
"loss": 0.0608,
"step": 902
},
{
"epoch": 5.053808525506638,
"grad_norm": 1.4720850175627471,
"learning_rate": 2.5220614598197708e-06,
"loss": 0.0527,
"step": 904
},
{
"epoch": 5.064989517819707,
"grad_norm": 1.2412662972591795,
"learning_rate": 2.513236985847451e-06,
"loss": 0.0488,
"step": 906
},
{
"epoch": 5.0761705101327745,
"grad_norm": 1.3236580966844242,
"learning_rate": 2.5044123469418174e-06,
"loss": 0.0638,
"step": 908
},
{
"epoch": 5.087351502445842,
"grad_norm": 1.8348241342651854,
"learning_rate": 2.495587653058184e-06,
"loss": 0.0629,
"step": 910
},
{
"epoch": 5.09853249475891,
"grad_norm": 0.9662213920921242,
"learning_rate": 2.4867630141525493e-06,
"loss": 0.0722,
"step": 912
},
{
"epoch": 5.109713487071978,
"grad_norm": 1.6784486385619315,
"learning_rate": 2.477938540180231e-06,
"loss": 0.0482,
"step": 914
},
{
"epoch": 5.120894479385045,
"grad_norm": 1.386742744607905,
"learning_rate": 2.4691143410944844e-06,
"loss": 0.0596,
"step": 916
},
{
"epoch": 5.132075471698113,
"grad_norm": 1.5375835898995094,
"learning_rate": 2.4602905268451455e-06,
"loss": 0.0592,
"step": 918
},
{
"epoch": 5.143256464011181,
"grad_norm": 1.334707574114043,
"learning_rate": 2.451467207377254e-06,
"loss": 0.0493,
"step": 920
},
{
"epoch": 5.154437456324249,
"grad_norm": 1.018606004126685,
"learning_rate": 2.442644492629683e-06,
"loss": 0.0544,
"step": 922
},
{
"epoch": 5.165618448637317,
"grad_norm": 1.0236510244569192,
"learning_rate": 2.433822492533774e-06,
"loss": 0.0501,
"step": 924
},
{
"epoch": 5.176799440950385,
"grad_norm": 0.8191759766926784,
"learning_rate": 2.4250013170119585e-06,
"loss": 0.0594,
"step": 926
},
{
"epoch": 5.187980433263452,
"grad_norm": 1.0938612787512558,
"learning_rate": 2.4161810759763993e-06,
"loss": 0.0544,
"step": 928
},
{
"epoch": 5.19916142557652,
"grad_norm": 1.3602285379082586,
"learning_rate": 2.407361879327612e-06,
"loss": 0.0442,
"step": 930
},
{
"epoch": 5.210342417889588,
"grad_norm": 1.1380441045618945,
"learning_rate": 2.398543836953101e-06,
"loss": 0.0563,
"step": 932
},
{
"epoch": 5.221523410202655,
"grad_norm": 1.1080478505241853,
"learning_rate": 2.389727058725989e-06,
"loss": 0.0515,
"step": 934
},
{
"epoch": 5.232704402515723,
"grad_norm": 1.2558697950305333,
"learning_rate": 2.380911654503643e-06,
"loss": 0.0507,
"step": 936
},
{
"epoch": 5.243885394828791,
"grad_norm": 1.2293644348010904,
"learning_rate": 2.3720977341263152e-06,
"loss": 0.0607,
"step": 938
},
{
"epoch": 5.255066387141859,
"grad_norm": 1.292488994918762,
"learning_rate": 2.3632854074157653e-06,
"loss": 0.0474,
"step": 940
},
{
"epoch": 5.266247379454927,
"grad_norm": 1.2671492916227067,
"learning_rate": 2.3544747841738998e-06,
"loss": 0.0769,
"step": 942
},
{
"epoch": 5.277428371767995,
"grad_norm": 1.6102887076835615,
"learning_rate": 2.3456659741813945e-06,
"loss": 0.0496,
"step": 944
},
{
"epoch": 5.288609364081062,
"grad_norm": 1.577997048333656,
"learning_rate": 2.3368590871963367e-06,
"loss": 0.0796,
"step": 946
},
{
"epoch": 5.29979035639413,
"grad_norm": 2.278441135480121,
"learning_rate": 2.328054232952851e-06,
"loss": 0.0679,
"step": 948
},
{
"epoch": 5.310971348707198,
"grad_norm": 1.1443796744340577,
"learning_rate": 2.3192515211597332e-06,
"loss": 0.0589,
"step": 950
},
{
"epoch": 5.322152341020265,
"grad_norm": 1.3246252050774938,
"learning_rate": 2.3104510614990875e-06,
"loss": 0.0711,
"step": 952
},
{
"epoch": 5.333333333333333,
"grad_norm": 2.3404125762291574,
"learning_rate": 2.301652963624951e-06,
"loss": 0.0571,
"step": 954
},
{
"epoch": 5.344514325646401,
"grad_norm": 1.6173224098499974,
"learning_rate": 2.292857337161938e-06,
"loss": 0.0715,
"step": 956
},
{
"epoch": 5.355695317959469,
"grad_norm": 1.416375080557459,
"learning_rate": 2.2840642917038666e-06,
"loss": 0.0555,
"step": 958
},
{
"epoch": 5.366876310272537,
"grad_norm": 1.2819320119071211,
"learning_rate": 2.2752739368123948e-06,
"loss": 0.0486,
"step": 960
},
{
"epoch": 5.378057302585605,
"grad_norm": 1.1198977788924485,
"learning_rate": 2.2664863820156593e-06,
"loss": 0.0408,
"step": 962
},
{
"epoch": 5.389238294898672,
"grad_norm": 1.1451798114445098,
"learning_rate": 2.2577017368069017e-06,
"loss": 0.0626,
"step": 964
},
{
"epoch": 5.40041928721174,
"grad_norm": 1.3380127274735694,
"learning_rate": 2.248920110643116e-06,
"loss": 0.0568,
"step": 966
},
{
"epoch": 5.411600279524808,
"grad_norm": 1.4489239240672898,
"learning_rate": 2.2401416129436753e-06,
"loss": 0.059,
"step": 968
},
{
"epoch": 5.422781271837875,
"grad_norm": 1.3130908635170957,
"learning_rate": 2.2313663530889734e-06,
"loss": 0.0444,
"step": 970
},
{
"epoch": 5.433962264150943,
"grad_norm": 1.2045728193533076,
"learning_rate": 2.222594440419061e-06,
"loss": 0.0952,
"step": 972
},
{
"epoch": 5.445143256464011,
"grad_norm": 1.1505612686257871,
"learning_rate": 2.2138259842322794e-06,
"loss": 0.0536,
"step": 974
},
{
"epoch": 5.456324248777079,
"grad_norm": 1.521719008832957,
"learning_rate": 2.2050610937839058e-06,
"loss": 0.073,
"step": 976
},
{
"epoch": 5.467505241090147,
"grad_norm": 1.3381824532405695,
"learning_rate": 2.1962998782847863e-06,
"loss": 0.0583,
"step": 978
},
{
"epoch": 5.478686233403215,
"grad_norm": 1.1782879600371732,
"learning_rate": 2.1875424468999787e-06,
"loss": 0.052,
"step": 980
},
{
"epoch": 5.489867225716282,
"grad_norm": 1.1689516819440322,
"learning_rate": 2.178788908747387e-06,
"loss": 0.0515,
"step": 982
},
{
"epoch": 5.50104821802935,
"grad_norm": 1.1479989981730907,
"learning_rate": 2.170039372896409e-06,
"loss": 0.055,
"step": 984
},
{
"epoch": 5.512229210342418,
"grad_norm": 1.3922562574409854,
"learning_rate": 2.161293948366573e-06,
"loss": 0.0554,
"step": 986
},
{
"epoch": 5.523410202655485,
"grad_norm": 1.409490849880991,
"learning_rate": 2.152552744126178e-06,
"loss": 0.0392,
"step": 988
},
{
"epoch": 5.534591194968553,
"grad_norm": 1.2479629003574995,
"learning_rate": 2.1438158690909413e-06,
"loss": 0.0599,
"step": 990
},
{
"epoch": 5.545772187281621,
"grad_norm": 1.2371376050465024,
"learning_rate": 2.1350834321226344e-06,
"loss": 0.0664,
"step": 992
},
{
"epoch": 5.556953179594689,
"grad_norm": 1.593505278104288,
"learning_rate": 2.126355542027734e-06,
"loss": 0.0479,
"step": 994
},
{
"epoch": 5.568134171907757,
"grad_norm": 1.2742537988695015,
"learning_rate": 2.117632307556059e-06,
"loss": 0.0803,
"step": 996
},
{
"epoch": 5.579315164220825,
"grad_norm": 1.3748039610126324,
"learning_rate": 2.1089138373994226e-06,
"loss": 0.0416,
"step": 998
},
{
"epoch": 5.590496156533892,
"grad_norm": 2.4084571636039755,
"learning_rate": 2.100200240190273e-06,
"loss": 0.0514,
"step": 1000
},
{
"epoch": 5.60167714884696,
"grad_norm": 1.1933752040503858,
"learning_rate": 2.09149162450034e-06,
"loss": 0.0625,
"step": 1002
},
{
"epoch": 5.612858141160028,
"grad_norm": 1.037709039674537,
"learning_rate": 2.0827880988392856e-06,
"loss": 0.0514,
"step": 1004
},
{
"epoch": 5.6240391334730955,
"grad_norm": 1.315142680072312,
"learning_rate": 2.0740897716533475e-06,
"loss": 0.0593,
"step": 1006
},
{
"epoch": 5.635220125786163,
"grad_norm": 1.0531660230737552,
"learning_rate": 2.0653967513239934e-06,
"loss": 0.0543,
"step": 1008
},
{
"epoch": 5.646401118099231,
"grad_norm": 1.2633776013551097,
"learning_rate": 2.0567091461665636e-06,
"loss": 0.0431,
"step": 1010
},
{
"epoch": 5.657582110412299,
"grad_norm": 1.449959564050197,
"learning_rate": 2.0480270644289282e-06,
"loss": 0.0482,
"step": 1012
},
{
"epoch": 5.668763102725367,
"grad_norm": 1.1071912059302882,
"learning_rate": 2.0393506142901347e-06,
"loss": 0.0564,
"step": 1014
},
{
"epoch": 5.679944095038435,
"grad_norm": 0.9876137346535111,
"learning_rate": 2.0306799038590595e-06,
"loss": 0.0391,
"step": 1016
},
{
"epoch": 5.6911250873515025,
"grad_norm": 1.1071464038310999,
"learning_rate": 2.0220150411730638e-06,
"loss": 0.0636,
"step": 1018
},
{
"epoch": 5.70230607966457,
"grad_norm": 1.0473491285671832,
"learning_rate": 2.013356134196643e-06,
"loss": 0.0581,
"step": 1020
},
{
"epoch": 5.713487071977638,
"grad_norm": 1.1296902267336801,
"learning_rate": 2.004703290820086e-06,
"loss": 0.0604,
"step": 1022
},
{
"epoch": 5.7246680642907055,
"grad_norm": 1.309317661735025,
"learning_rate": 1.9960566188581306e-06,
"loss": 0.0438,
"step": 1024
},
{
"epoch": 5.735849056603773,
"grad_norm": 0.8918766336417149,
"learning_rate": 1.9874162260486146e-06,
"loss": 0.0475,
"step": 1026
},
{
"epoch": 5.747030048916841,
"grad_norm": 1.2095534019736167,
"learning_rate": 1.978782220051142e-06,
"loss": 0.0454,
"step": 1028
},
{
"epoch": 5.7582110412299095,
"grad_norm": 1.1967009451687045,
"learning_rate": 1.9701547084457314e-06,
"loss": 0.0697,
"step": 1030
},
{
"epoch": 5.769392033542977,
"grad_norm": 1.8160556667087309,
"learning_rate": 1.961533798731486e-06,
"loss": 0.0422,
"step": 1032
},
{
"epoch": 5.780573025856045,
"grad_norm": 1.590627053883797,
"learning_rate": 1.952919598325247e-06,
"loss": 0.0602,
"step": 1034
},
{
"epoch": 5.7917540181691125,
"grad_norm": 1.4584761134724722,
"learning_rate": 1.944312214560256e-06,
"loss": 0.0575,
"step": 1036
},
{
"epoch": 5.80293501048218,
"grad_norm": 1.6093909025543798,
"learning_rate": 1.935711754684824e-06,
"loss": 0.0814,
"step": 1038
},
{
"epoch": 5.814116002795248,
"grad_norm": 1.7715253484509736,
"learning_rate": 1.9271183258609836e-06,
"loss": 0.0608,
"step": 1040
},
{
"epoch": 5.825296995108316,
"grad_norm": 0.850327251905485,
"learning_rate": 1.9185320351631654e-06,
"loss": 0.0388,
"step": 1042
},
{
"epoch": 5.836477987421384,
"grad_norm": 1.4837292387797913,
"learning_rate": 1.9099529895768552e-06,
"loss": 0.0567,
"step": 1044
},
{
"epoch": 5.847658979734452,
"grad_norm": 1.0384213631474088,
"learning_rate": 1.901381295997267e-06,
"loss": 0.0661,
"step": 1046
},
{
"epoch": 5.8588399720475195,
"grad_norm": 1.2071171218984706,
"learning_rate": 1.8928170612280067e-06,
"loss": 0.0665,
"step": 1048
},
{
"epoch": 5.870020964360587,
"grad_norm": 1.2020194163974407,
"learning_rate": 1.8842603919797436e-06,
"loss": 0.0466,
"step": 1050
},
{
"epoch": 5.881201956673655,
"grad_norm": 1.141150946131999,
"learning_rate": 1.8757113948688827e-06,
"loss": 0.0562,
"step": 1052
},
{
"epoch": 5.8923829489867225,
"grad_norm": 1.583487458549684,
"learning_rate": 1.8671701764162287e-06,
"loss": 0.0589,
"step": 1054
},
{
"epoch": 5.90356394129979,
"grad_norm": 1.3417276690702418,
"learning_rate": 1.8586368430456708e-06,
"loss": 0.0604,
"step": 1056
},
{
"epoch": 5.914744933612858,
"grad_norm": 1.3294273305641617,
"learning_rate": 1.8501115010828423e-06,
"loss": 0.0628,
"step": 1058
},
{
"epoch": 5.925925925925926,
"grad_norm": 1.2448945324282268,
"learning_rate": 1.8415942567538106e-06,
"loss": 0.0554,
"step": 1060
},
{
"epoch": 5.937106918238994,
"grad_norm": 0.960687093766239,
"learning_rate": 1.8330852161837399e-06,
"loss": 0.0532,
"step": 1062
},
{
"epoch": 5.948287910552062,
"grad_norm": 1.4656893110825278,
"learning_rate": 1.8245844853955786e-06,
"loss": 0.0719,
"step": 1064
},
{
"epoch": 5.9594689028651295,
"grad_norm": 1.6634277575338297,
"learning_rate": 1.8160921703087368e-06,
"loss": 0.0565,
"step": 1066
},
{
"epoch": 5.970649895178197,
"grad_norm": 1.7257111050609335,
"learning_rate": 1.8076083767377595e-06,
"loss": 0.068,
"step": 1068
},
{
"epoch": 5.981830887491265,
"grad_norm": 1.42483183153276,
"learning_rate": 1.7991332103910184e-06,
"loss": 0.0613,
"step": 1070
},
{
"epoch": 5.993011879804333,
"grad_norm": 1.4316025881020678,
"learning_rate": 1.7906667768693853e-06,
"loss": 0.0481,
"step": 1072
},
{
"epoch": 6.0041928721174,
"grad_norm": 1.037376667784287,
"learning_rate": 1.782209181664924e-06,
"loss": 0.0483,
"step": 1074
},
{
"epoch": 6.015373864430468,
"grad_norm": 1.0336168566598631,
"learning_rate": 1.773760530159571e-06,
"loss": 0.0347,
"step": 1076
},
{
"epoch": 6.026554856743536,
"grad_norm": 0.7872905184564322,
"learning_rate": 1.7653209276238242e-06,
"loss": 0.0355,
"step": 1078
},
{
"epoch": 6.037735849056604,
"grad_norm": 1.772389302776251,
"learning_rate": 1.7568904792154328e-06,
"loss": 0.0542,
"step": 1080
},
{
"epoch": 6.048916841369672,
"grad_norm": 1.3577848873845724,
"learning_rate": 1.7484692899780812e-06,
"loss": 0.0583,
"step": 1082
},
{
"epoch": 6.06009783368274,
"grad_norm": 0.7840766650439943,
"learning_rate": 1.740057464840088e-06,
"loss": 0.0289,
"step": 1084
},
{
"epoch": 6.071278825995807,
"grad_norm": 0.9255675051401594,
"learning_rate": 1.7316551086130925e-06,
"loss": 0.0417,
"step": 1086
},
{
"epoch": 6.082459818308875,
"grad_norm": 0.9107219582827843,
"learning_rate": 1.7232623259907538e-06,
"loss": 0.0429,
"step": 1088
},
{
"epoch": 6.093640810621943,
"grad_norm": 1.0296310110561282,
"learning_rate": 1.714879221547439e-06,
"loss": 0.0362,
"step": 1090
},
{
"epoch": 6.10482180293501,
"grad_norm": 0.9575340239366315,
"learning_rate": 1.7065058997369288e-06,
"loss": 0.0471,
"step": 1092
},
{
"epoch": 6.116002795248078,
"grad_norm": 0.7430183397758778,
"learning_rate": 1.6981424648911112e-06,
"loss": 0.0351,
"step": 1094
},
{
"epoch": 6.127183787561146,
"grad_norm": 0.9807593854080312,
"learning_rate": 1.6897890212186804e-06,
"loss": 0.0334,
"step": 1096
},
{
"epoch": 6.138364779874214,
"grad_norm": 1.2961448011313597,
"learning_rate": 1.6814456728038431e-06,
"loss": 0.025,
"step": 1098
},
{
"epoch": 6.149545772187282,
"grad_norm": 0.961636779671174,
"learning_rate": 1.673112523605015e-06,
"loss": 0.0285,
"step": 1100
},
{
"epoch": 6.16072676450035,
"grad_norm": 0.9647606646620928,
"learning_rate": 1.6647896774535324e-06,
"loss": 0.0303,
"step": 1102
},
{
"epoch": 6.171907756813417,
"grad_norm": 1.1381988477100318,
"learning_rate": 1.6564772380523546e-06,
"loss": 0.0358,
"step": 1104
},
{
"epoch": 6.183088749126485,
"grad_norm": 0.7901346245952422,
"learning_rate": 1.648175308974771e-06,
"loss": 0.0279,
"step": 1106
},
{
"epoch": 6.194269741439553,
"grad_norm": 1.2717247572933381,
"learning_rate": 1.6398839936631142e-06,
"loss": 0.0328,
"step": 1108
},
{
"epoch": 6.20545073375262,
"grad_norm": 1.2916496315117834,
"learning_rate": 1.631603395427466e-06,
"loss": 0.055,
"step": 1110
},
{
"epoch": 6.216631726065688,
"grad_norm": 0.9740099844597652,
"learning_rate": 1.6233336174443762e-06,
"loss": 0.048,
"step": 1112
},
{
"epoch": 6.227812718378756,
"grad_norm": 1.0103830292004847,
"learning_rate": 1.6150747627555713e-06,
"loss": 0.0434,
"step": 1114
},
{
"epoch": 6.238993710691824,
"grad_norm": 1.1350854047223082,
"learning_rate": 1.6068269342666749e-06,
"loss": 0.0389,
"step": 1116
},
{
"epoch": 6.250174703004892,
"grad_norm": 0.7884154494279628,
"learning_rate": 1.5985902347459239e-06,
"loss": 0.0432,
"step": 1118
},
{
"epoch": 6.26135569531796,
"grad_norm": 0.8788178903528164,
"learning_rate": 1.5903647668228855e-06,
"loss": 0.0432,
"step": 1120
},
{
"epoch": 6.272536687631027,
"grad_norm": 0.6393918351108393,
"learning_rate": 1.5821506329871834e-06,
"loss": 0.0253,
"step": 1122
},
{
"epoch": 6.283717679944095,
"grad_norm": 1.0870268262489273,
"learning_rate": 1.5739479355872162e-06,
"loss": 0.0364,
"step": 1124
},
{
"epoch": 6.294898672257163,
"grad_norm": 1.1679875063936556,
"learning_rate": 1.5657567768288868e-06,
"loss": 0.0333,
"step": 1126
},
{
"epoch": 6.30607966457023,
"grad_norm": 0.8388447320245327,
"learning_rate": 1.5575772587743222e-06,
"loss": 0.0316,
"step": 1128
},
{
"epoch": 6.317260656883298,
"grad_norm": 0.7710273725047172,
"learning_rate": 1.5494094833406092e-06,
"loss": 0.0308,
"step": 1130
},
{
"epoch": 6.328441649196366,
"grad_norm": 1.3107972415612894,
"learning_rate": 1.5412535522985205e-06,
"loss": 0.0186,
"step": 1132
},
{
"epoch": 6.339622641509434,
"grad_norm": 0.8488196487806184,
"learning_rate": 1.5331095672712463e-06,
"loss": 0.023,
"step": 1134
},
{
"epoch": 6.350803633822502,
"grad_norm": 1.014050814471419,
"learning_rate": 1.5249776297331302e-06,
"loss": 0.0425,
"step": 1136
},
{
"epoch": 6.36198462613557,
"grad_norm": 0.8160528908459946,
"learning_rate": 1.516857841008401e-06,
"loss": 0.0407,
"step": 1138
},
{
"epoch": 6.373165618448637,
"grad_norm": 0.6924190623075557,
"learning_rate": 1.5087503022699168e-06,
"loss": 0.0527,
"step": 1140
},
{
"epoch": 6.384346610761705,
"grad_norm": 1.0149043689805195,
"learning_rate": 1.5006551145378967e-06,
"loss": 0.0367,
"step": 1142
},
{
"epoch": 6.395527603074773,
"grad_norm": 1.5920991707794845,
"learning_rate": 1.4925723786786691e-06,
"loss": 0.0319,
"step": 1144
},
{
"epoch": 6.40670859538784,
"grad_norm": 0.8834798218634231,
"learning_rate": 1.4845021954034106e-06,
"loss": 0.0372,
"step": 1146
},
{
"epoch": 6.417889587700908,
"grad_norm": 1.072104658850445,
"learning_rate": 1.476444665266889e-06,
"loss": 0.0413,
"step": 1148
},
{
"epoch": 6.429070580013976,
"grad_norm": 1.1893734124292998,
"learning_rate": 1.4683998886662187e-06,
"loss": 0.0307,
"step": 1150
},
{
"epoch": 6.440251572327044,
"grad_norm": 1.1513167005422524,
"learning_rate": 1.4603679658396006e-06,
"loss": 0.0402,
"step": 1152
},
{
"epoch": 6.451432564640112,
"grad_norm": 1.0586602700365229,
"learning_rate": 1.4523489968650795e-06,
"loss": 0.0303,
"step": 1154
},
{
"epoch": 6.46261355695318,
"grad_norm": 0.7650987855999634,
"learning_rate": 1.4443430816592936e-06,
"loss": 0.0312,
"step": 1156
},
{
"epoch": 6.473794549266247,
"grad_norm": 0.7470083708652993,
"learning_rate": 1.4363503199762296e-06,
"loss": 0.0298,
"step": 1158
},
{
"epoch": 6.484975541579315,
"grad_norm": 1.2247183517462086,
"learning_rate": 1.4283708114059853e-06,
"loss": 0.0476,
"step": 1160
},
{
"epoch": 6.496156533892383,
"grad_norm": 1.0042001049340177,
"learning_rate": 1.4204046553735174e-06,
"loss": 0.0421,
"step": 1162
},
{
"epoch": 6.5073375262054505,
"grad_norm": 1.0066856707214424,
"learning_rate": 1.4124519511374158e-06,
"loss": 0.0277,
"step": 1164
},
{
"epoch": 6.518518518518518,
"grad_norm": 1.3761888161849996,
"learning_rate": 1.404512797788657e-06,
"loss": 0.0251,
"step": 1166
},
{
"epoch": 6.529699510831586,
"grad_norm": 0.7445041473181229,
"learning_rate": 1.396587294249374e-06,
"loss": 0.0383,
"step": 1168
},
{
"epoch": 6.540880503144654,
"grad_norm": 1.0231799225570892,
"learning_rate": 1.3886755392716225e-06,
"loss": 0.0289,
"step": 1170
},
{
"epoch": 6.552061495457722,
"grad_norm": 1.0842064444530823,
"learning_rate": 1.3807776314361498e-06,
"loss": 0.0341,
"step": 1172
},
{
"epoch": 6.56324248777079,
"grad_norm": 0.9409388421938562,
"learning_rate": 1.3728936691511704e-06,
"loss": 0.0413,
"step": 1174
},
{
"epoch": 6.5744234800838575,
"grad_norm": 0.8052329748698783,
"learning_rate": 1.3650237506511333e-06,
"loss": 0.0399,
"step": 1176
},
{
"epoch": 6.585604472396925,
"grad_norm": 0.6879172446908371,
"learning_rate": 1.3571679739955029e-06,
"loss": 0.0288,
"step": 1178
},
{
"epoch": 6.596785464709993,
"grad_norm": 0.8737080494275846,
"learning_rate": 1.3493264370675352e-06,
"loss": 0.0181,
"step": 1180
},
{
"epoch": 6.6079664570230605,
"grad_norm": 0.8744184416405667,
"learning_rate": 1.3414992375730587e-06,
"loss": 0.0432,
"step": 1182
},
{
"epoch": 6.619147449336128,
"grad_norm": 0.9265074156931595,
"learning_rate": 1.3336864730392587e-06,
"loss": 0.0464,
"step": 1184
},
{
"epoch": 6.630328441649196,
"grad_norm": 1.14003149718633,
"learning_rate": 1.3258882408134582e-06,
"loss": 0.0271,
"step": 1186
},
{
"epoch": 6.6415094339622645,
"grad_norm": 0.8949105583359471,
"learning_rate": 1.3181046380619078e-06,
"loss": 0.0276,
"step": 1188
},
{
"epoch": 6.652690426275332,
"grad_norm": 1.0602768370905677,
"learning_rate": 1.3103357617685746e-06,
"loss": 0.0352,
"step": 1190
},
{
"epoch": 6.6638714185884,
"grad_norm": 1.187406942024327,
"learning_rate": 1.3025817087339335e-06,
"loss": 0.0597,
"step": 1192
},
{
"epoch": 6.6750524109014675,
"grad_norm": 0.8451020033143687,
"learning_rate": 1.2948425755737592e-06,
"loss": 0.0359,
"step": 1194
},
{
"epoch": 6.686233403214535,
"grad_norm": 1.2760921925255864,
"learning_rate": 1.2871184587179286e-06,
"loss": 0.0285,
"step": 1196
},
{
"epoch": 6.697414395527603,
"grad_norm": 0.7781748766075295,
"learning_rate": 1.2794094544092111e-06,
"loss": 0.0346,
"step": 1198
},
{
"epoch": 6.7085953878406706,
"grad_norm": 1.1832623077309767,
"learning_rate": 1.2717156587020746e-06,
"loss": 0.041,
"step": 1200
},
{
"epoch": 6.719776380153739,
"grad_norm": 1.3133094357866473,
"learning_rate": 1.2640371674614866e-06,
"loss": 0.0629,
"step": 1202
},
{
"epoch": 6.730957372466807,
"grad_norm": 0.7218331862903847,
"learning_rate": 1.2563740763617198e-06,
"loss": 0.0366,
"step": 1204
},
{
"epoch": 6.7421383647798745,
"grad_norm": 0.9560652150388108,
"learning_rate": 1.2487264808851654e-06,
"loss": 0.044,
"step": 1206
},
{
"epoch": 6.753319357092942,
"grad_norm": 1.1190106870390395,
"learning_rate": 1.2410944763211302e-06,
"loss": 0.0517,
"step": 1208
},
{
"epoch": 6.76450034940601,
"grad_norm": 0.7835985914687663,
"learning_rate": 1.2334781577646653e-06,
"loss": 0.0272,
"step": 1210
},
{
"epoch": 6.7756813417190775,
"grad_norm": 2.056446636497986,
"learning_rate": 1.2258776201153702e-06,
"loss": 0.0239,
"step": 1212
},
{
"epoch": 6.786862334032145,
"grad_norm": 0.8485551422736736,
"learning_rate": 1.218292958076213e-06,
"loss": 0.0206,
"step": 1214
},
{
"epoch": 6.798043326345213,
"grad_norm": 1.2531964534501892,
"learning_rate": 1.2107242661523544e-06,
"loss": 0.0254,
"step": 1216
},
{
"epoch": 6.809224318658281,
"grad_norm": 1.269537638790587,
"learning_rate": 1.203171638649962e-06,
"loss": 0.0299,
"step": 1218
},
{
"epoch": 6.820405310971349,
"grad_norm": 1.1178764385402225,
"learning_rate": 1.195635169675045e-06,
"loss": 0.0396,
"step": 1220
},
{
"epoch": 6.831586303284417,
"grad_norm": 0.6920818283019613,
"learning_rate": 1.1881149531322744e-06,
"loss": 0.0268,
"step": 1222
},
{
"epoch": 6.8427672955974845,
"grad_norm": 0.80369354175751,
"learning_rate": 1.180611082723814e-06,
"loss": 0.031,
"step": 1224
},
{
"epoch": 6.853948287910552,
"grad_norm": 0.7447389756775401,
"learning_rate": 1.1731236519481593e-06,
"loss": 0.0345,
"step": 1226
},
{
"epoch": 6.86512928022362,
"grad_norm": 1.1115305000722167,
"learning_rate": 1.1656527540989595e-06,
"loss": 0.0283,
"step": 1228
},
{
"epoch": 6.876310272536688,
"grad_norm": 1.2279572164110593,
"learning_rate": 1.1581984822638706e-06,
"loss": 0.0452,
"step": 1230
},
{
"epoch": 6.887491264849755,
"grad_norm": 0.8467749629186313,
"learning_rate": 1.1507609293233837e-06,
"loss": 0.0283,
"step": 1232
},
{
"epoch": 6.898672257162823,
"grad_norm": 1.355703618365484,
"learning_rate": 1.1433401879496723e-06,
"loss": 0.0366,
"step": 1234
},
{
"epoch": 6.909853249475891,
"grad_norm": 1.004917827499692,
"learning_rate": 1.135936350605438e-06,
"loss": 0.0496,
"step": 1236
},
{
"epoch": 6.921034241788959,
"grad_norm": 1.2615070307313305,
"learning_rate": 1.1285495095427563e-06,
"loss": 0.0461,
"step": 1238
},
{
"epoch": 6.932215234102027,
"grad_norm": 0.9861185460727813,
"learning_rate": 1.1211797568019312e-06,
"loss": 0.0366,
"step": 1240
},
{
"epoch": 6.943396226415095,
"grad_norm": 1.6576290169923233,
"learning_rate": 1.113827184210343e-06,
"loss": 0.0337,
"step": 1242
},
{
"epoch": 6.954577218728162,
"grad_norm": 1.1363579065284033,
"learning_rate": 1.1064918833813073e-06,
"loss": 0.0406,
"step": 1244
},
{
"epoch": 6.96575821104123,
"grad_norm": 1.3125191134965577,
"learning_rate": 1.0991739457129333e-06,
"loss": 0.0397,
"step": 1246
},
{
"epoch": 6.976939203354298,
"grad_norm": 0.8904462468667067,
"learning_rate": 1.0918734623869835e-06,
"loss": 0.0407,
"step": 1248
},
{
"epoch": 6.988120195667365,
"grad_norm": 2.263233580582389,
"learning_rate": 1.0845905243677416e-06,
"loss": 0.0307,
"step": 1250
},
{
"epoch": 6.999301187980433,
"grad_norm": 0.791294534235276,
"learning_rate": 1.0773252224008726e-06,
"loss": 0.0387,
"step": 1252
},
{
"epoch": 7.010482180293501,
"grad_norm": 0.76599595030522,
"learning_rate": 1.0700776470122981e-06,
"loss": 0.0269,
"step": 1254
},
{
"epoch": 7.021663172606569,
"grad_norm": 0.7331796337642835,
"learning_rate": 1.0628478885070647e-06,
"loss": 0.0221,
"step": 1256
},
{
"epoch": 7.032844164919637,
"grad_norm": 0.6845784469587074,
"learning_rate": 1.05563603696822e-06,
"loss": 0.0291,
"step": 1258
},
{
"epoch": 7.044025157232705,
"grad_norm": 0.8176233505690059,
"learning_rate": 1.0484421822556904e-06,
"loss": 0.0364,
"step": 1260
},
{
"epoch": 7.055206149545772,
"grad_norm": 0.8629657573128657,
"learning_rate": 1.041266414005162e-06,
"loss": 0.0265,
"step": 1262
},
{
"epoch": 7.06638714185884,
"grad_norm": 1.1172499462707595,
"learning_rate": 1.0341088216269625e-06,
"loss": 0.0157,
"step": 1264
},
{
"epoch": 7.077568134171908,
"grad_norm": 0.5230775744769823,
"learning_rate": 1.0269694943049462e-06,
"loss": 0.0157,
"step": 1266
},
{
"epoch": 7.088749126484975,
"grad_norm": 0.8978199171663125,
"learning_rate": 1.0198485209953865e-06,
"loss": 0.0275,
"step": 1268
},
{
"epoch": 7.099930118798043,
"grad_norm": 0.815308309594077,
"learning_rate": 1.0127459904258621e-06,
"loss": 0.0237,
"step": 1270
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.8967571058386815,
"learning_rate": 1.0056619910941592e-06,
"loss": 0.019,
"step": 1272
},
{
"epoch": 7.122292103424179,
"grad_norm": 0.7843358442700527,
"learning_rate": 9.98596611267158e-07,
"loss": 0.021,
"step": 1274
},
{
"epoch": 7.133473095737247,
"grad_norm": 0.6797830063456453,
"learning_rate": 9.915499389797444e-07,
"loss": 0.0316,
"step": 1276
},
{
"epoch": 7.144654088050315,
"grad_norm": 0.6688875199025872,
"learning_rate": 9.845220620337054e-07,
"loss": 0.0303,
"step": 1278
},
{
"epoch": 7.155835080363382,
"grad_norm": 0.6664970872749731,
"learning_rate": 9.77513067996636e-07,
"loss": 0.0219,
"step": 1280
},
{
"epoch": 7.16701607267645,
"grad_norm": 0.7973098520727987,
"learning_rate": 9.705230442008542e-07,
"loss": 0.0376,
"step": 1282
},
{
"epoch": 7.178197064989518,
"grad_norm": 0.8759703504057706,
"learning_rate": 9.63552077742301e-07,
"loss": 0.0385,
"step": 1284
},
{
"epoch": 7.189378057302585,
"grad_norm": 1.0267904937054426,
"learning_rate": 9.56600255479469e-07,
"loss": 0.0222,
"step": 1286
},
{
"epoch": 7.200559049615653,
"grad_norm": 0.6389768145894307,
"learning_rate": 9.4966766403231e-07,
"loss": 0.018,
"step": 1288
},
{
"epoch": 7.211740041928721,
"grad_norm": 0.5762313893158477,
"learning_rate": 9.427543897811584e-07,
"loss": 0.0165,
"step": 1290
},
{
"epoch": 7.222921034241789,
"grad_norm": 0.5902518126138557,
"learning_rate": 9.358605188656603e-07,
"loss": 0.02,
"step": 1292
},
{
"epoch": 7.234102026554857,
"grad_norm": 0.824105561963567,
"learning_rate": 9.289861371836886e-07,
"loss": 0.0337,
"step": 1294
},
{
"epoch": 7.245283018867925,
"grad_norm": 0.504698332550927,
"learning_rate": 9.22131330390286e-07,
"loss": 0.0283,
"step": 1296
},
{
"epoch": 7.256464011180992,
"grad_norm": 0.5789695393721453,
"learning_rate": 9.152961838965879e-07,
"loss": 0.0169,
"step": 1298
},
{
"epoch": 7.26764500349406,
"grad_norm": 1.4892687104014115,
"learning_rate": 9.084807828687628e-07,
"loss": 0.0314,
"step": 1300
},
{
"epoch": 7.278825995807128,
"grad_norm": 1.0727067281323632,
"learning_rate": 9.016852122269493e-07,
"loss": 0.0274,
"step": 1302
},
{
"epoch": 7.290006988120195,
"grad_norm": 0.7309629553367788,
"learning_rate": 8.949095566441985e-07,
"loss": 0.0219,
"step": 1304
},
{
"epoch": 7.301187980433263,
"grad_norm": 0.6871990809680889,
"learning_rate": 8.881539005454215e-07,
"loss": 0.0339,
"step": 1306
},
{
"epoch": 7.312368972746331,
"grad_norm": 0.8530617423198913,
"learning_rate": 8.814183281063326e-07,
"loss": 0.0248,
"step": 1308
},
{
"epoch": 7.323549965059399,
"grad_norm": 0.76651991997128,
"learning_rate": 8.747029232524037e-07,
"loss": 0.023,
"step": 1310
},
{
"epoch": 7.334730957372467,
"grad_norm": 0.6966547986519114,
"learning_rate": 8.680077696578182e-07,
"loss": 0.0332,
"step": 1312
},
{
"epoch": 7.345911949685535,
"grad_norm": 1.0873098335521205,
"learning_rate": 8.613329507444274e-07,
"loss": 0.0234,
"step": 1314
},
{
"epoch": 7.357092941998602,
"grad_norm": 0.6461932986017782,
"learning_rate": 8.546785496807116e-07,
"loss": 0.0242,
"step": 1316
},
{
"epoch": 7.36827393431167,
"grad_norm": 0.7614414460885182,
"learning_rate": 8.480446493807464e-07,
"loss": 0.031,
"step": 1318
},
{
"epoch": 7.379454926624738,
"grad_norm": 0.641294466328584,
"learning_rate": 8.414313325031642e-07,
"loss": 0.028,
"step": 1320
},
{
"epoch": 7.3906359189378055,
"grad_norm": 0.47088954187562415,
"learning_rate": 8.348386814501286e-07,
"loss": 0.0186,
"step": 1322
},
{
"epoch": 7.401816911250873,
"grad_norm": 0.7909087034714356,
"learning_rate": 8.282667783663056e-07,
"loss": 0.0212,
"step": 1324
},
{
"epoch": 7.412997903563941,
"grad_norm": 0.8059238279425677,
"learning_rate": 8.217157051378411e-07,
"loss": 0.0239,
"step": 1326
},
{
"epoch": 7.424178895877009,
"grad_norm": 0.788531385863816,
"learning_rate": 8.151855433913414e-07,
"loss": 0.0199,
"step": 1328
},
{
"epoch": 7.435359888190077,
"grad_norm": 1.1393964476120448,
"learning_rate": 8.086763744928536e-07,
"loss": 0.0292,
"step": 1330
},
{
"epoch": 7.446540880503145,
"grad_norm": 0.5408108502649198,
"learning_rate": 8.02188279546853e-07,
"loss": 0.0146,
"step": 1332
},
{
"epoch": 7.4577218728162125,
"grad_norm": 0.8749206113652656,
"learning_rate": 7.957213393952335e-07,
"loss": 0.0247,
"step": 1334
},
{
"epoch": 7.46890286512928,
"grad_norm": 0.7053824386402378,
"learning_rate": 7.892756346162986e-07,
"loss": 0.02,
"step": 1336
},
{
"epoch": 7.480083857442348,
"grad_norm": 0.6965900833846856,
"learning_rate": 7.82851245523761e-07,
"loss": 0.0315,
"step": 1338
},
{
"epoch": 7.4912648497554155,
"grad_norm": 0.9392067120327887,
"learning_rate": 7.764482521657343e-07,
"loss": 0.0308,
"step": 1340
},
{
"epoch": 7.502445842068483,
"grad_norm": 0.7074561491918046,
"learning_rate": 7.700667343237453e-07,
"loss": 0.0171,
"step": 1342
},
{
"epoch": 7.513626834381551,
"grad_norm": 0.7697005768650605,
"learning_rate": 7.637067715117327e-07,
"loss": 0.0302,
"step": 1344
},
{
"epoch": 7.5248078266946195,
"grad_norm": 1.176668146060272,
"learning_rate": 7.573684429750583e-07,
"loss": 0.0265,
"step": 1346
},
{
"epoch": 7.535988819007687,
"grad_norm": 0.7258573280389607,
"learning_rate": 7.510518276895234e-07,
"loss": 0.0257,
"step": 1348
},
{
"epoch": 7.547169811320755,
"grad_norm": 1.1195611459347754,
"learning_rate": 7.447570043603755e-07,
"loss": 0.0261,
"step": 1350
},
{
"epoch": 7.5583508036338225,
"grad_norm": 0.9527258409378455,
"learning_rate": 7.384840514213404e-07,
"loss": 0.0524,
"step": 1352
},
{
"epoch": 7.56953179594689,
"grad_norm": 0.7074898357644916,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0205,
"step": 1354
},
{
"epoch": 7.580712788259958,
"grad_norm": 0.9361424266631929,
"learning_rate": 7.26004069084987e-07,
"loss": 0.0217,
"step": 1356
},
{
"epoch": 7.5918937805730256,
"grad_norm": 1.7048958108176762,
"learning_rate": 7.197971951886956e-07,
"loss": 0.0225,
"step": 1358
},
{
"epoch": 7.603074772886094,
"grad_norm": 0.8812767707258257,
"learning_rate": 7.13612502682623e-07,
"loss": 0.0196,
"step": 1360
},
{
"epoch": 7.614255765199162,
"grad_norm": 0.5682027618905875,
"learning_rate": 7.074500686282609e-07,
"loss": 0.019,
"step": 1362
},
{
"epoch": 7.6254367575122295,
"grad_norm": 0.4475598932931596,
"learning_rate": 7.013099698097539e-07,
"loss": 0.0171,
"step": 1364
},
{
"epoch": 7.636617749825297,
"grad_norm": 0.5527498039813922,
"learning_rate": 6.951922827329535e-07,
"loss": 0.0217,
"step": 1366
},
{
"epoch": 7.647798742138365,
"grad_norm": 0.7984442985333638,
"learning_rate": 6.890970836244574e-07,
"loss": 0.0361,
"step": 1368
},
{
"epoch": 7.6589797344514325,
"grad_norm": 0.624268450810696,
"learning_rate": 6.830244484306623e-07,
"loss": 0.0158,
"step": 1370
},
{
"epoch": 7.6701607267645,
"grad_norm": 0.7493822409267487,
"learning_rate": 6.769744528168207e-07,
"loss": 0.0286,
"step": 1372
},
{
"epoch": 7.681341719077568,
"grad_norm": 0.6787647092695418,
"learning_rate": 6.709471721660904e-07,
"loss": 0.0215,
"step": 1374
},
{
"epoch": 7.692522711390636,
"grad_norm": 0.7321502006735149,
"learning_rate": 6.649426815786045e-07,
"loss": 0.0311,
"step": 1376
},
{
"epoch": 7.703703703703704,
"grad_norm": 0.701610396870259,
"learning_rate": 6.589610558705284e-07,
"loss": 0.0235,
"step": 1378
},
{
"epoch": 7.714884696016772,
"grad_norm": 0.6530846520546149,
"learning_rate": 6.53002369573131e-07,
"loss": 0.0245,
"step": 1380
},
{
"epoch": 7.7260656883298395,
"grad_norm": 0.7531427984254183,
"learning_rate": 6.470666969318554e-07,
"loss": 0.0315,
"step": 1382
},
{
"epoch": 7.737246680642907,
"grad_norm": 0.7301669272251805,
"learning_rate": 6.41154111905393e-07,
"loss": 0.0225,
"step": 1384
},
{
"epoch": 7.748427672955975,
"grad_norm": 0.8707140120777088,
"learning_rate": 6.352646881647647e-07,
"loss": 0.0259,
"step": 1386
},
{
"epoch": 7.759608665269043,
"grad_norm": 0.837200588883093,
"learning_rate": 6.29398499092399e-07,
"loss": 0.0474,
"step": 1388
},
{
"epoch": 7.77078965758211,
"grad_norm": 0.973530488120086,
"learning_rate": 6.235556177812205e-07,
"loss": 0.0329,
"step": 1390
},
{
"epoch": 7.781970649895178,
"grad_norm": 0.5813627298678434,
"learning_rate": 6.177361170337376e-07,
"loss": 0.0194,
"step": 1392
},
{
"epoch": 7.793151642208246,
"grad_norm": 0.8597088367336019,
"learning_rate": 6.119400693611358e-07,
"loss": 0.0123,
"step": 1394
},
{
"epoch": 7.804332634521314,
"grad_norm": 0.8368570476462492,
"learning_rate": 6.061675469823763e-07,
"loss": 0.0227,
"step": 1396
},
{
"epoch": 7.815513626834382,
"grad_norm": 0.5203392914919558,
"learning_rate": 6.004186218232933e-07,
"loss": 0.0217,
"step": 1398
},
{
"epoch": 7.82669461914745,
"grad_norm": 0.8572153440435842,
"learning_rate": 5.946933655156976e-07,
"loss": 0.0294,
"step": 1400
},
{
"epoch": 7.837875611460517,
"grad_norm": 0.6862577628733875,
"learning_rate": 5.889918493964869e-07,
"loss": 0.0228,
"step": 1402
},
{
"epoch": 7.849056603773585,
"grad_norm": 0.7097594226614418,
"learning_rate": 5.833141445067541e-07,
"loss": 0.0113,
"step": 1404
},
{
"epoch": 7.860237596086653,
"grad_norm": 0.6322499286175502,
"learning_rate": 5.776603215909041e-07,
"loss": 0.0229,
"step": 1406
},
{
"epoch": 7.87141858839972,
"grad_norm": 0.6798739232739857,
"learning_rate": 5.720304510957722e-07,
"loss": 0.0257,
"step": 1408
},
{
"epoch": 7.882599580712788,
"grad_norm": 0.6568708401714163,
"learning_rate": 5.66424603169744e-07,
"loss": 0.0285,
"step": 1410
},
{
"epoch": 7.893780573025856,
"grad_norm": 1.1483908878505031,
"learning_rate": 5.608428476618843e-07,
"loss": 0.0235,
"step": 1412
},
{
"epoch": 7.904961565338924,
"grad_norm": 0.9297111790590921,
"learning_rate": 5.552852541210651e-07,
"loss": 0.022,
"step": 1414
},
{
"epoch": 7.916142557651992,
"grad_norm": 0.7288896652277049,
"learning_rate": 5.497518917950986e-07,
"loss": 0.033,
"step": 1416
},
{
"epoch": 7.92732354996506,
"grad_norm": 1.3241630685241197,
"learning_rate": 5.44242829629878e-07,
"loss": 0.0236,
"step": 1418
},
{
"epoch": 7.938504542278127,
"grad_norm": 0.6616696784338312,
"learning_rate": 5.387581362685112e-07,
"loss": 0.03,
"step": 1420
},
{
"epoch": 7.949685534591195,
"grad_norm": 0.9223806906428696,
"learning_rate": 5.332978800504742e-07,
"loss": 0.0234,
"step": 1422
},
{
"epoch": 7.960866526904263,
"grad_norm": 1.1302104401143789,
"learning_rate": 5.278621290107533e-07,
"loss": 0.0334,
"step": 1424
},
{
"epoch": 7.97204751921733,
"grad_norm": 0.6145924647383543,
"learning_rate": 5.224509508789987e-07,
"loss": 0.0205,
"step": 1426
},
{
"epoch": 7.983228511530398,
"grad_norm": 0.6724718918142113,
"learning_rate": 5.170644130786842e-07,
"loss": 0.0315,
"step": 1428
},
{
"epoch": 7.994409503843466,
"grad_norm": 0.5897709957691004,
"learning_rate": 5.117025827262598e-07,
"loss": 0.0189,
"step": 1430
}
],
"logging_steps": 2,
"max_steps": 1780,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 598197676277760.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}