prm_gsm_all_data_bon_4_hf / trainer_state.json
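The JSON below appears to be the state file saved by the Hugging Face `transformers` Trainer: top-level run metadata (`epoch`, `global_step`, `eval_steps`, best-checkpoint fields) followed by a `log_history` list with one entry per logged step, each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. A minimal sketch for inspecting the loss curve from this file, assuming it has been downloaded locally as `trainer_state.json` (the path and the specific printout are illustrative, not part of the original file):

```python
import json

# Load the Trainer state file (local path is an assumption; adjust to where
# the checkpoint folder was downloaded).
with open("trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"], "steps,", f"epoch {state['epoch']:.4f}")

# Each log_history entry records one logging step:
# epoch / grad_norm / learning_rate / loss / step.
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

# Quick check: training loss at the first and last logged steps.
print(f"step {steps[0]}: loss {losses[0]:.4f}")
print(f"step {steps[-1]}: loss {losses[-1]:.4f}")
```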
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997159897756319,
"eval_steps": 500,
"global_step": 880,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001136040897472309,
"grad_norm": 6.713736329359423,
"learning_rate": 1.1363636363636364e-07,
"loss": 0.2307,
"step": 1
},
{
"epoch": 0.002272081794944618,
"grad_norm": 6.425061477822205,
"learning_rate": 2.2727272727272729e-07,
"loss": 0.2478,
"step": 2
},
{
"epoch": 0.003408122692416927,
"grad_norm": 6.49997931766097,
"learning_rate": 3.409090909090909e-07,
"loss": 0.2494,
"step": 3
},
{
"epoch": 0.004544163589889236,
"grad_norm": 6.438878495171851,
"learning_rate": 4.5454545454545457e-07,
"loss": 0.2252,
"step": 4
},
{
"epoch": 0.005680204487361545,
"grad_norm": 6.0504803408240395,
"learning_rate": 5.681818181818182e-07,
"loss": 0.2356,
"step": 5
},
{
"epoch": 0.006816245384833854,
"grad_norm": 4.3484407974076635,
"learning_rate": 6.818181818181818e-07,
"loss": 0.2422,
"step": 6
},
{
"epoch": 0.007952286282306162,
"grad_norm": 4.171396401412328,
"learning_rate": 7.954545454545455e-07,
"loss": 0.2181,
"step": 7
},
{
"epoch": 0.009088327179778472,
"grad_norm": 3.4177520767635032,
"learning_rate": 9.090909090909091e-07,
"loss": 0.2267,
"step": 8
},
{
"epoch": 0.010224368077250781,
"grad_norm": 3.2064592077436522,
"learning_rate": 1.0227272727272729e-06,
"loss": 0.2149,
"step": 9
},
{
"epoch": 0.01136040897472309,
"grad_norm": 2.430289501917744,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.1955,
"step": 10
},
{
"epoch": 0.012496449872195399,
"grad_norm": 2.3200048473028634,
"learning_rate": 1.25e-06,
"loss": 0.1946,
"step": 11
},
{
"epoch": 0.013632490769667709,
"grad_norm": 2.0881837538285373,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.1971,
"step": 12
},
{
"epoch": 0.014768531667140017,
"grad_norm": 1.9540608830360986,
"learning_rate": 1.4772727272727275e-06,
"loss": 0.186,
"step": 13
},
{
"epoch": 0.015904572564612324,
"grad_norm": 2.4445533261203884,
"learning_rate": 1.590909090909091e-06,
"loss": 0.1605,
"step": 14
},
{
"epoch": 0.017040613462084634,
"grad_norm": 2.288800657192205,
"learning_rate": 1.7045454545454546e-06,
"loss": 0.1659,
"step": 15
},
{
"epoch": 0.018176654359556944,
"grad_norm": 1.9509519762495295,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.1618,
"step": 16
},
{
"epoch": 0.019312695257029253,
"grad_norm": 1.592909275487469,
"learning_rate": 1.931818181818182e-06,
"loss": 0.1642,
"step": 17
},
{
"epoch": 0.020448736154501563,
"grad_norm": 1.400794979169675,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.1545,
"step": 18
},
{
"epoch": 0.021584777051973873,
"grad_norm": 1.3975579360742723,
"learning_rate": 2.1590909090909092e-06,
"loss": 0.1451,
"step": 19
},
{
"epoch": 0.02272081794944618,
"grad_norm": 1.6759990274286927,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.1531,
"step": 20
},
{
"epoch": 0.02385685884691849,
"grad_norm": 1.6129060518400373,
"learning_rate": 2.3863636363636367e-06,
"loss": 0.1777,
"step": 21
},
{
"epoch": 0.024992899744390798,
"grad_norm": 1.4028499504088532,
"learning_rate": 2.5e-06,
"loss": 0.147,
"step": 22
},
{
"epoch": 0.026128940641863108,
"grad_norm": 1.319887691895735,
"learning_rate": 2.6136363636363637e-06,
"loss": 0.1418,
"step": 23
},
{
"epoch": 0.027264981539335417,
"grad_norm": 1.2184018909974619,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.1493,
"step": 24
},
{
"epoch": 0.028401022436807723,
"grad_norm": 1.202619579827782,
"learning_rate": 2.8409090909090916e-06,
"loss": 0.1379,
"step": 25
},
{
"epoch": 0.029537063334280033,
"grad_norm": 1.0605303626457392,
"learning_rate": 2.954545454545455e-06,
"loss": 0.1293,
"step": 26
},
{
"epoch": 0.030673104231752343,
"grad_norm": 1.054431016591944,
"learning_rate": 3.0681818181818186e-06,
"loss": 0.118,
"step": 27
},
{
"epoch": 0.03180914512922465,
"grad_norm": 1.0923689246123935,
"learning_rate": 3.181818181818182e-06,
"loss": 0.1244,
"step": 28
},
{
"epoch": 0.03294518602669696,
"grad_norm": 1.1788052159109477,
"learning_rate": 3.2954545454545456e-06,
"loss": 0.133,
"step": 29
},
{
"epoch": 0.03408122692416927,
"grad_norm": 1.273315281082378,
"learning_rate": 3.409090909090909e-06,
"loss": 0.1443,
"step": 30
},
{
"epoch": 0.03521726782164158,
"grad_norm": 1.1149014713257173,
"learning_rate": 3.522727272727273e-06,
"loss": 0.1259,
"step": 31
},
{
"epoch": 0.03635330871911389,
"grad_norm": 1.1519957043635232,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.1254,
"step": 32
},
{
"epoch": 0.0374893496165862,
"grad_norm": 1.0308432957952973,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.13,
"step": 33
},
{
"epoch": 0.03862539051405851,
"grad_norm": 1.100380112079228,
"learning_rate": 3.863636363636364e-06,
"loss": 0.126,
"step": 34
},
{
"epoch": 0.039761431411530816,
"grad_norm": 1.206728692480485,
"learning_rate": 3.9772727272727275e-06,
"loss": 0.1377,
"step": 35
},
{
"epoch": 0.040897472309003126,
"grad_norm": 1.0754244396472252,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.1239,
"step": 36
},
{
"epoch": 0.042033513206475435,
"grad_norm": 0.991175124338083,
"learning_rate": 4.204545454545455e-06,
"loss": 0.1208,
"step": 37
},
{
"epoch": 0.043169554103947745,
"grad_norm": 0.9863230162147564,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.1236,
"step": 38
},
{
"epoch": 0.04430559500142005,
"grad_norm": 0.9747917351402464,
"learning_rate": 4.4318181818181824e-06,
"loss": 0.1141,
"step": 39
},
{
"epoch": 0.04544163589889236,
"grad_norm": 0.9526188744768924,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.1123,
"step": 40
},
{
"epoch": 0.04657767679636467,
"grad_norm": 1.1038102555895053,
"learning_rate": 4.6590909090909095e-06,
"loss": 0.1127,
"step": 41
},
{
"epoch": 0.04771371769383698,
"grad_norm": 0.9623246058987326,
"learning_rate": 4.772727272727273e-06,
"loss": 0.1095,
"step": 42
},
{
"epoch": 0.048849758591309286,
"grad_norm": 0.995620380472554,
"learning_rate": 4.8863636363636365e-06,
"loss": 0.1136,
"step": 43
},
{
"epoch": 0.049985799488781596,
"grad_norm": 1.1922367869339325,
"learning_rate": 5e-06,
"loss": 0.1111,
"step": 44
},
{
"epoch": 0.051121840386253906,
"grad_norm": 1.0650960783331873,
"learning_rate": 4.999982347887264e-06,
"loss": 0.1182,
"step": 45
},
{
"epoch": 0.052257881283726215,
"grad_norm": 1.097485598148884,
"learning_rate": 4.9999293917983325e-06,
"loss": 0.1166,
"step": 46
},
{
"epoch": 0.053393922181198525,
"grad_norm": 1.223556692747266,
"learning_rate": 4.999841132481035e-06,
"loss": 0.1135,
"step": 47
},
{
"epoch": 0.054529963078670834,
"grad_norm": 0.8885931835747594,
"learning_rate": 4.999717571181742e-06,
"loss": 0.1067,
"step": 48
},
{
"epoch": 0.055666003976143144,
"grad_norm": 0.8978133832333574,
"learning_rate": 4.999558709645349e-06,
"loss": 0.1062,
"step": 49
},
{
"epoch": 0.05680204487361545,
"grad_norm": 1.167148679651936,
"learning_rate": 4.9993645501152485e-06,
"loss": 0.1126,
"step": 50
},
{
"epoch": 0.057938085771087756,
"grad_norm": 0.8981796373126424,
"learning_rate": 4.999135095333301e-06,
"loss": 0.1059,
"step": 51
},
{
"epoch": 0.059074126668560066,
"grad_norm": 0.9930855230943375,
"learning_rate": 4.998870348539797e-06,
"loss": 0.1274,
"step": 52
},
{
"epoch": 0.060210167566032376,
"grad_norm": 1.098950668279268,
"learning_rate": 4.998570313473408e-06,
"loss": 0.1116,
"step": 53
},
{
"epoch": 0.061346208463504685,
"grad_norm": 0.9697083904957693,
"learning_rate": 4.998234994371135e-06,
"loss": 0.1051,
"step": 54
},
{
"epoch": 0.062482249360976995,
"grad_norm": 1.0880364629521353,
"learning_rate": 4.997864395968252e-06,
"loss": 0.112,
"step": 55
},
{
"epoch": 0.0636182902584493,
"grad_norm": 1.054159789623682,
"learning_rate": 4.997458523498236e-06,
"loss": 0.1146,
"step": 56
},
{
"epoch": 0.06475433115592161,
"grad_norm": 1.0291021807953078,
"learning_rate": 4.99701738269269e-06,
"loss": 0.1042,
"step": 57
},
{
"epoch": 0.06589037205339392,
"grad_norm": 1.0281626755590443,
"learning_rate": 4.996540979781269e-06,
"loss": 0.1219,
"step": 58
},
{
"epoch": 0.06702641295086623,
"grad_norm": 1.1105171034316133,
"learning_rate": 4.996029321491587e-06,
"loss": 0.1216,
"step": 59
},
{
"epoch": 0.06816245384833854,
"grad_norm": 1.034096902228184,
"learning_rate": 4.995482415049123e-06,
"loss": 0.1051,
"step": 60
},
{
"epoch": 0.06929849474581085,
"grad_norm": 0.889629538614507,
"learning_rate": 4.994900268177121e-06,
"loss": 0.1046,
"step": 61
},
{
"epoch": 0.07043453564328316,
"grad_norm": 0.9495477142186882,
"learning_rate": 4.99428288909648e-06,
"loss": 0.1018,
"step": 62
},
{
"epoch": 0.07157057654075547,
"grad_norm": 0.8696989475429725,
"learning_rate": 4.993630286525634e-06,
"loss": 0.1062,
"step": 63
},
{
"epoch": 0.07270661743822777,
"grad_norm": 0.9504849462967013,
"learning_rate": 4.992942469680437e-06,
"loss": 0.1099,
"step": 64
},
{
"epoch": 0.07384265833570008,
"grad_norm": 0.9799914580295991,
"learning_rate": 4.992219448274022e-06,
"loss": 0.1076,
"step": 65
},
{
"epoch": 0.0749786992331724,
"grad_norm": 0.9059465840472486,
"learning_rate": 4.991461232516675e-06,
"loss": 0.0946,
"step": 66
},
{
"epoch": 0.0761147401306447,
"grad_norm": 0.8918398442966295,
"learning_rate": 4.990667833115684e-06,
"loss": 0.1118,
"step": 67
},
{
"epoch": 0.07725078102811701,
"grad_norm": 1.00335150985273,
"learning_rate": 4.989839261275191e-06,
"loss": 0.1126,
"step": 68
},
{
"epoch": 0.07838682192558932,
"grad_norm": 1.019420038485551,
"learning_rate": 4.988975528696028e-06,
"loss": 0.1072,
"step": 69
},
{
"epoch": 0.07952286282306163,
"grad_norm": 0.9291595432428256,
"learning_rate": 4.988076647575562e-06,
"loss": 0.1045,
"step": 70
},
{
"epoch": 0.08065890372053394,
"grad_norm": 1.0634383716539064,
"learning_rate": 4.98714263060751e-06,
"loss": 0.1089,
"step": 71
},
{
"epoch": 0.08179494461800625,
"grad_norm": 1.0060661248475429,
"learning_rate": 4.986173490981773e-06,
"loss": 0.0987,
"step": 72
},
{
"epoch": 0.08293098551547856,
"grad_norm": 1.106124562082597,
"learning_rate": 4.9851692423842406e-06,
"loss": 0.1157,
"step": 73
},
{
"epoch": 0.08406702641295087,
"grad_norm": 0.9260108438954108,
"learning_rate": 4.984129898996599e-06,
"loss": 0.0975,
"step": 74
},
{
"epoch": 0.08520306731042318,
"grad_norm": 1.0266696611897657,
"learning_rate": 4.983055475496134e-06,
"loss": 0.0987,
"step": 75
},
{
"epoch": 0.08633910820789549,
"grad_norm": 0.8699860068340203,
"learning_rate": 4.981945987055521e-06,
"loss": 0.101,
"step": 76
},
{
"epoch": 0.0874751491053678,
"grad_norm": 1.0778844945975843,
"learning_rate": 4.9808014493426124e-06,
"loss": 0.1122,
"step": 77
},
{
"epoch": 0.0886111900028401,
"grad_norm": 0.9013095921740779,
"learning_rate": 4.979621878520217e-06,
"loss": 0.111,
"step": 78
},
{
"epoch": 0.0897472309003124,
"grad_norm": 0.9663627554637914,
"learning_rate": 4.978407291245866e-06,
"loss": 0.0949,
"step": 79
},
{
"epoch": 0.09088327179778471,
"grad_norm": 0.8666375684207491,
"learning_rate": 4.977157704671585e-06,
"loss": 0.0945,
"step": 80
},
{
"epoch": 0.09201931269525702,
"grad_norm": 0.9535753414488521,
"learning_rate": 4.975873136443649e-06,
"loss": 0.1015,
"step": 81
},
{
"epoch": 0.09315535359272933,
"grad_norm": 1.0742330242309999,
"learning_rate": 4.974553604702332e-06,
"loss": 0.0949,
"step": 82
},
{
"epoch": 0.09429139449020164,
"grad_norm": 1.0596668250713723,
"learning_rate": 4.9731991280816534e-06,
"loss": 0.1159,
"step": 83
},
{
"epoch": 0.09542743538767395,
"grad_norm": 0.9500662434392095,
"learning_rate": 4.971809725709112e-06,
"loss": 0.1006,
"step": 84
},
{
"epoch": 0.09656347628514626,
"grad_norm": 1.0736109125994615,
"learning_rate": 4.970385417205418e-06,
"loss": 0.1024,
"step": 85
},
{
"epoch": 0.09769951718261857,
"grad_norm": 0.8026706228357128,
"learning_rate": 4.968926222684213e-06,
"loss": 0.0908,
"step": 86
},
{
"epoch": 0.09883555808009088,
"grad_norm": 0.9508589273601138,
"learning_rate": 4.967432162751792e-06,
"loss": 0.0919,
"step": 87
},
{
"epoch": 0.09997159897756319,
"grad_norm": 0.927289333391325,
"learning_rate": 4.965903258506806e-06,
"loss": 0.0953,
"step": 88
},
{
"epoch": 0.1011076398750355,
"grad_norm": 0.9403610446156513,
"learning_rate": 4.964339531539967e-06,
"loss": 0.0976,
"step": 89
},
{
"epoch": 0.10224368077250781,
"grad_norm": 0.9342072851864264,
"learning_rate": 4.9627410039337426e-06,
"loss": 0.0967,
"step": 90
},
{
"epoch": 0.10337972166998012,
"grad_norm": 0.8774156748861563,
"learning_rate": 4.9611076982620445e-06,
"loss": 0.1033,
"step": 91
},
{
"epoch": 0.10451576256745243,
"grad_norm": 0.8318162500391886,
"learning_rate": 4.959439637589909e-06,
"loss": 0.1018,
"step": 92
},
{
"epoch": 0.10565180346492474,
"grad_norm": 0.8899888321299209,
"learning_rate": 4.957736845473173e-06,
"loss": 0.0941,
"step": 93
},
{
"epoch": 0.10678784436239705,
"grad_norm": 0.789140699658634,
"learning_rate": 4.9559993459581375e-06,
"loss": 0.1047,
"step": 94
},
{
"epoch": 0.10792388525986936,
"grad_norm": 0.8800492952518975,
"learning_rate": 4.954227163581234e-06,
"loss": 0.1028,
"step": 95
},
{
"epoch": 0.10905992615734167,
"grad_norm": 0.796766656234688,
"learning_rate": 4.952420323368673e-06,
"loss": 0.0942,
"step": 96
},
{
"epoch": 0.11019596705481398,
"grad_norm": 0.8613544374379629,
"learning_rate": 4.950578850836092e-06,
"loss": 0.1073,
"step": 97
},
{
"epoch": 0.11133200795228629,
"grad_norm": 0.8623738224609795,
"learning_rate": 4.948702771988195e-06,
"loss": 0.1004,
"step": 98
},
{
"epoch": 0.1124680488497586,
"grad_norm": 0.8419305666625628,
"learning_rate": 4.9467921133183864e-06,
"loss": 0.1007,
"step": 99
},
{
"epoch": 0.1136040897472309,
"grad_norm": 0.8733459281176695,
"learning_rate": 4.944846901808397e-06,
"loss": 0.0964,
"step": 100
},
{
"epoch": 0.1147401306447032,
"grad_norm": 0.8126576547036249,
"learning_rate": 4.942867164927899e-06,
"loss": 0.1115,
"step": 101
},
{
"epoch": 0.11587617154217551,
"grad_norm": 0.912151238889085,
"learning_rate": 4.940852930634126e-06,
"loss": 0.0974,
"step": 102
},
{
"epoch": 0.11701221243964782,
"grad_norm": 0.8379629358991061,
"learning_rate": 4.938804227371467e-06,
"loss": 0.0949,
"step": 103
},
{
"epoch": 0.11814825333712013,
"grad_norm": 0.8825318553017338,
"learning_rate": 4.936721084071079e-06,
"loss": 0.0995,
"step": 104
},
{
"epoch": 0.11928429423459244,
"grad_norm": 0.7887136184663985,
"learning_rate": 4.9346035301504644e-06,
"loss": 0.0856,
"step": 105
},
{
"epoch": 0.12042033513206475,
"grad_norm": 0.8694351555924742,
"learning_rate": 4.932451595513063e-06,
"loss": 0.0972,
"step": 106
},
{
"epoch": 0.12155637602953706,
"grad_norm": 0.8953649088172692,
"learning_rate": 4.930265310547829e-06,
"loss": 0.0997,
"step": 107
},
{
"epoch": 0.12269241692700937,
"grad_norm": 0.8607731259738107,
"learning_rate": 4.928044706128803e-06,
"loss": 0.0977,
"step": 108
},
{
"epoch": 0.12382845782448168,
"grad_norm": 0.8777824068539876,
"learning_rate": 4.92578981361467e-06,
"loss": 0.1021,
"step": 109
},
{
"epoch": 0.12496449872195399,
"grad_norm": 0.8266603261517322,
"learning_rate": 4.923500664848327e-06,
"loss": 0.1006,
"step": 110
},
{
"epoch": 0.1261005396194263,
"grad_norm": 0.8118026763757383,
"learning_rate": 4.9211772921564205e-06,
"loss": 0.0924,
"step": 111
},
{
"epoch": 0.1272365805168986,
"grad_norm": 0.8366614115506429,
"learning_rate": 4.918819728348901e-06,
"loss": 0.0823,
"step": 112
},
{
"epoch": 0.12837262141437092,
"grad_norm": 0.898715874913154,
"learning_rate": 4.916428006718555e-06,
"loss": 0.1027,
"step": 113
},
{
"epoch": 0.12950866231184321,
"grad_norm": 0.8937215198706769,
"learning_rate": 4.9140021610405335e-06,
"loss": 0.0783,
"step": 114
},
{
"epoch": 0.13064470320931554,
"grad_norm": 0.8093914274815599,
"learning_rate": 4.911542225571877e-06,
"loss": 0.1034,
"step": 115
},
{
"epoch": 0.13178074410678783,
"grad_norm": 0.9187764615250489,
"learning_rate": 4.909048235051033e-06,
"loss": 0.0929,
"step": 116
},
{
"epoch": 0.13291678500426016,
"grad_norm": 0.9645110159367966,
"learning_rate": 4.906520224697364e-06,
"loss": 0.1025,
"step": 117
},
{
"epoch": 0.13405282590173245,
"grad_norm": 0.8397494780778292,
"learning_rate": 4.903958230210647e-06,
"loss": 0.0902,
"step": 118
},
{
"epoch": 0.13518886679920478,
"grad_norm": 0.9810615634775558,
"learning_rate": 4.901362287770576e-06,
"loss": 0.1,
"step": 119
},
{
"epoch": 0.13632490769667707,
"grad_norm": 0.8518935095057704,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.0932,
"step": 120
},
{
"epoch": 0.1374609485941494,
"grad_norm": 0.8809729982496561,
"learning_rate": 4.896068706145632e-06,
"loss": 0.0915,
"step": 121
},
{
"epoch": 0.1385969894916217,
"grad_norm": 1.0731776458211226,
"learning_rate": 4.89337114171508e-06,
"loss": 0.098,
"step": 122
},
{
"epoch": 0.13973303038909402,
"grad_norm": 0.887795496144111,
"learning_rate": 4.890639778838757e-06,
"loss": 0.0927,
"step": 123
},
{
"epoch": 0.1408690712865663,
"grad_norm": 0.8194626039786459,
"learning_rate": 4.887874656088124e-06,
"loss": 0.1027,
"step": 124
},
{
"epoch": 0.14200511218403863,
"grad_norm": 0.8442429261448717,
"learning_rate": 4.885075812511386e-06,
"loss": 0.0961,
"step": 125
},
{
"epoch": 0.14314115308151093,
"grad_norm": 0.8395001231574464,
"learning_rate": 4.882243287632947e-06,
"loss": 0.1039,
"step": 126
},
{
"epoch": 0.14427719397898325,
"grad_norm": 0.8363673989585937,
"learning_rate": 4.879377121452844e-06,
"loss": 0.1006,
"step": 127
},
{
"epoch": 0.14541323487645555,
"grad_norm": 0.7397826384078707,
"learning_rate": 4.8764773544461895e-06,
"loss": 0.087,
"step": 128
},
{
"epoch": 0.14654927577392787,
"grad_norm": 0.8414577640896602,
"learning_rate": 4.873544027562593e-06,
"loss": 0.0891,
"step": 129
},
{
"epoch": 0.14768531667140017,
"grad_norm": 0.8972297713907886,
"learning_rate": 4.8705771822255895e-06,
"loss": 0.102,
"step": 130
},
{
"epoch": 0.1488213575688725,
"grad_norm": 0.7843924774512786,
"learning_rate": 4.867576860332048e-06,
"loss": 0.0945,
"step": 131
},
{
"epoch": 0.1499573984663448,
"grad_norm": 0.8740179298084251,
"learning_rate": 4.864543104251587e-06,
"loss": 0.097,
"step": 132
},
{
"epoch": 0.15109343936381708,
"grad_norm": 0.9423786129930344,
"learning_rate": 4.8614759568259685e-06,
"loss": 0.0975,
"step": 133
},
{
"epoch": 0.1522294802612894,
"grad_norm": 0.7928906420896884,
"learning_rate": 4.858375461368499e-06,
"loss": 0.0906,
"step": 134
},
{
"epoch": 0.1533655211587617,
"grad_norm": 0.9074949015198895,
"learning_rate": 4.855241661663413e-06,
"loss": 0.0959,
"step": 135
},
{
"epoch": 0.15450156205623403,
"grad_norm": 0.8127866420512039,
"learning_rate": 4.852074601965261e-06,
"loss": 0.0939,
"step": 136
},
{
"epoch": 0.15563760295370632,
"grad_norm": 0.8076873863138903,
"learning_rate": 4.848874326998279e-06,
"loss": 0.0977,
"step": 137
},
{
"epoch": 0.15677364385117865,
"grad_norm": 0.8129270738261459,
"learning_rate": 4.845640881955757e-06,
"loss": 0.0978,
"step": 138
},
{
"epoch": 0.15790968474865094,
"grad_norm": 0.8098881301085323,
"learning_rate": 4.842374312499405e-06,
"loss": 0.0886,
"step": 139
},
{
"epoch": 0.15904572564612326,
"grad_norm": 0.7756679586652417,
"learning_rate": 4.839074664758705e-06,
"loss": 0.0894,
"step": 140
},
{
"epoch": 0.16018176654359556,
"grad_norm": 0.8739902401352373,
"learning_rate": 4.835741985330259e-06,
"loss": 0.0915,
"step": 141
},
{
"epoch": 0.16131780744106788,
"grad_norm": 0.9169987881610346,
"learning_rate": 4.832376321277136e-06,
"loss": 0.0914,
"step": 142
},
{
"epoch": 0.16245384833854018,
"grad_norm": 0.8037543620615127,
"learning_rate": 4.828977720128198e-06,
"loss": 0.0876,
"step": 143
},
{
"epoch": 0.1635898892360125,
"grad_norm": 0.7992352843679545,
"learning_rate": 4.825546229877439e-06,
"loss": 0.0877,
"step": 144
},
{
"epoch": 0.1647259301334848,
"grad_norm": 1.0614030660867448,
"learning_rate": 4.822081898983302e-06,
"loss": 0.0941,
"step": 145
},
{
"epoch": 0.16586197103095712,
"grad_norm": 0.7970505919422197,
"learning_rate": 4.818584776367992e-06,
"loss": 0.0837,
"step": 146
},
{
"epoch": 0.16699801192842942,
"grad_norm": 0.8532009414479299,
"learning_rate": 4.815054911416795e-06,
"loss": 0.0966,
"step": 147
},
{
"epoch": 0.16813405282590174,
"grad_norm": 0.8955310251194117,
"learning_rate": 4.811492353977366e-06,
"loss": 0.0969,
"step": 148
},
{
"epoch": 0.16927009372337404,
"grad_norm": 0.8081962759766569,
"learning_rate": 4.80789715435904e-06,
"loss": 0.0953,
"step": 149
},
{
"epoch": 0.17040613462084636,
"grad_norm": 0.9554413544294639,
"learning_rate": 4.804269363332112e-06,
"loss": 0.0937,
"step": 150
},
{
"epoch": 0.17154217551831866,
"grad_norm": 0.8121987703021468,
"learning_rate": 4.800609032127123e-06,
"loss": 0.0853,
"step": 151
},
{
"epoch": 0.17267821641579098,
"grad_norm": 0.8917282328521219,
"learning_rate": 4.7969162124341354e-06,
"loss": 0.0934,
"step": 152
},
{
"epoch": 0.17381425731326328,
"grad_norm": 0.8287357539581232,
"learning_rate": 4.793190956402005e-06,
"loss": 0.0991,
"step": 153
},
{
"epoch": 0.1749502982107356,
"grad_norm": 0.7980156222346841,
"learning_rate": 4.789433316637644e-06,
"loss": 0.0876,
"step": 154
},
{
"epoch": 0.1760863391082079,
"grad_norm": 0.9256751077111557,
"learning_rate": 4.785643346205277e-06,
"loss": 0.0959,
"step": 155
},
{
"epoch": 0.1772223800056802,
"grad_norm": 0.8519817551622527,
"learning_rate": 4.781821098625691e-06,
"loss": 0.0885,
"step": 156
},
{
"epoch": 0.17835842090315251,
"grad_norm": 1.029546823822168,
"learning_rate": 4.777966627875484e-06,
"loss": 0.0979,
"step": 157
},
{
"epoch": 0.1794944618006248,
"grad_norm": 0.8471825516389808,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.1109,
"step": 158
},
{
"epoch": 0.18063050269809713,
"grad_norm": 0.8232766246509707,
"learning_rate": 4.770161235044047e-06,
"loss": 0.0906,
"step": 159
},
{
"epoch": 0.18176654359556943,
"grad_norm": 0.7744605824026991,
"learning_rate": 4.766210423188158e-06,
"loss": 0.0865,
"step": 160
},
{
"epoch": 0.18290258449304175,
"grad_norm": 0.8240753157263881,
"learning_rate": 4.7622276086107685e-06,
"loss": 0.0929,
"step": 161
},
{
"epoch": 0.18403862539051405,
"grad_norm": 1.0242521571922516,
"learning_rate": 4.758212847555953e-06,
"loss": 0.0868,
"step": 162
},
{
"epoch": 0.18517466628798637,
"grad_norm": 0.7246225269895837,
"learning_rate": 4.7541661967189225e-06,
"loss": 0.0828,
"step": 163
},
{
"epoch": 0.18631070718545867,
"grad_norm": 0.8504247119017272,
"learning_rate": 4.750087713245227e-06,
"loss": 0.0925,
"step": 164
},
{
"epoch": 0.187446748082931,
"grad_norm": 1.0186333734231328,
"learning_rate": 4.745977454729947e-06,
"loss": 0.0858,
"step": 165
},
{
"epoch": 0.1885827889804033,
"grad_norm": 0.847067525360349,
"learning_rate": 4.74183547921688e-06,
"loss": 0.092,
"step": 166
},
{
"epoch": 0.1897188298778756,
"grad_norm": 0.8536347476054195,
"learning_rate": 4.7376618451977195e-06,
"loss": 0.0972,
"step": 167
},
{
"epoch": 0.1908548707753479,
"grad_norm": 0.7658131109439913,
"learning_rate": 4.733456611611233e-06,
"loss": 0.0915,
"step": 168
},
{
"epoch": 0.19199091167282023,
"grad_norm": 0.8219276405969249,
"learning_rate": 4.729219837842427e-06,
"loss": 0.095,
"step": 169
},
{
"epoch": 0.19312695257029253,
"grad_norm": 0.7164304793044781,
"learning_rate": 4.7249515837217075e-06,
"loss": 0.083,
"step": 170
},
{
"epoch": 0.19426299346776485,
"grad_norm": 0.6962094522936668,
"learning_rate": 4.720651909524037e-06,
"loss": 0.0813,
"step": 171
},
{
"epoch": 0.19539903436523715,
"grad_norm": 0.7811734598092144,
"learning_rate": 4.716320875968081e-06,
"loss": 0.0914,
"step": 172
},
{
"epoch": 0.19653507526270947,
"grad_norm": 0.8570333273830322,
"learning_rate": 4.711958544215355e-06,
"loss": 0.0915,
"step": 173
},
{
"epoch": 0.19767111616018176,
"grad_norm": 0.8036691820553741,
"learning_rate": 4.707564975869357e-06,
"loss": 0.0986,
"step": 174
},
{
"epoch": 0.1988071570576541,
"grad_norm": 0.7930663707472128,
"learning_rate": 4.703140232974697e-06,
"loss": 0.088,
"step": 175
},
{
"epoch": 0.19994319795512638,
"grad_norm": 0.7944858503238109,
"learning_rate": 4.698684378016223e-06,
"loss": 0.0797,
"step": 176
},
{
"epoch": 0.2010792388525987,
"grad_norm": 0.7708906880680786,
"learning_rate": 4.694197473918139e-06,
"loss": 0.0891,
"step": 177
},
{
"epoch": 0.202215279750071,
"grad_norm": 0.7294391552603106,
"learning_rate": 4.6896795840431155e-06,
"loss": 0.0836,
"step": 178
},
{
"epoch": 0.2033513206475433,
"grad_norm": 0.7440970510492587,
"learning_rate": 4.685130772191392e-06,
"loss": 0.0975,
"step": 179
},
{
"epoch": 0.20448736154501562,
"grad_norm": 0.7924536059191131,
"learning_rate": 4.680551102599881e-06,
"loss": 0.0885,
"step": 180
},
{
"epoch": 0.20562340244248792,
"grad_norm": 0.7596047990400485,
"learning_rate": 4.675940639941256e-06,
"loss": 0.1072,
"step": 181
},
{
"epoch": 0.20675944333996024,
"grad_norm": 0.9330680024460848,
"learning_rate": 4.671299449323045e-06,
"loss": 0.086,
"step": 182
},
{
"epoch": 0.20789548423743254,
"grad_norm": 0.8217560932912764,
"learning_rate": 4.666627596286702e-06,
"loss": 0.0969,
"step": 183
},
{
"epoch": 0.20903152513490486,
"grad_norm": 0.8223504953929077,
"learning_rate": 4.66192514680669e-06,
"loss": 0.0875,
"step": 184
},
{
"epoch": 0.21016756603237716,
"grad_norm": 0.8436510077788845,
"learning_rate": 4.657192167289542e-06,
"loss": 0.1025,
"step": 185
},
{
"epoch": 0.21130360692984948,
"grad_norm": 0.7432485961316236,
"learning_rate": 4.652428724572929e-06,
"loss": 0.0898,
"step": 186
},
{
"epoch": 0.21243964782732178,
"grad_norm": 0.8263428490118125,
"learning_rate": 4.647634885924713e-06,
"loss": 0.0914,
"step": 187
},
{
"epoch": 0.2135756887247941,
"grad_norm": 0.7006252126044292,
"learning_rate": 4.642810719041999e-06,
"loss": 0.0769,
"step": 188
},
{
"epoch": 0.2147117296222664,
"grad_norm": 0.7046589326705855,
"learning_rate": 4.637956292050176e-06,
"loss": 0.084,
"step": 189
},
{
"epoch": 0.21584777051973872,
"grad_norm": 0.7611962581820568,
"learning_rate": 4.63307167350196e-06,
"loss": 0.0876,
"step": 190
},
{
"epoch": 0.21698381141721101,
"grad_norm": 0.7833240772094479,
"learning_rate": 4.628156932376419e-06,
"loss": 0.0873,
"step": 191
},
{
"epoch": 0.21811985231468334,
"grad_norm": 0.7377511037071964,
"learning_rate": 4.623212138078004e-06,
"loss": 0.0945,
"step": 192
},
{
"epoch": 0.21925589321215563,
"grad_norm": 0.8377382456599733,
"learning_rate": 4.61823736043557e-06,
"loss": 0.0887,
"step": 193
},
{
"epoch": 0.22039193410962796,
"grad_norm": 0.8116009894693861,
"learning_rate": 4.613232669701384e-06,
"loss": 0.1012,
"step": 194
},
{
"epoch": 0.22152797500710025,
"grad_norm": 0.7422452860486192,
"learning_rate": 4.60819813655014e-06,
"loss": 0.0838,
"step": 195
},
{
"epoch": 0.22266401590457258,
"grad_norm": 0.7828779245377948,
"learning_rate": 4.603133832077953e-06,
"loss": 0.09,
"step": 196
},
{
"epoch": 0.22380005680204487,
"grad_norm": 0.7827054714406323,
"learning_rate": 4.598039827801364e-06,
"loss": 0.0826,
"step": 197
},
{
"epoch": 0.2249360976995172,
"grad_norm": 0.7486696522988175,
"learning_rate": 4.592916195656322e-06,
"loss": 0.0776,
"step": 198
},
{
"epoch": 0.2260721385969895,
"grad_norm": 0.86990777525941,
"learning_rate": 4.587763007997173e-06,
"loss": 0.0862,
"step": 199
},
{
"epoch": 0.2272081794944618,
"grad_norm": 0.8814024837126304,
"learning_rate": 4.582580337595636e-06,
"loss": 0.0916,
"step": 200
},
{
"epoch": 0.2283442203919341,
"grad_norm": 0.7612514997934728,
"learning_rate": 4.577368257639778e-06,
"loss": 0.0883,
"step": 201
},
{
"epoch": 0.2294802612894064,
"grad_norm": 0.8819736494589635,
"learning_rate": 4.572126841732977e-06,
"loss": 0.0906,
"step": 202
},
{
"epoch": 0.23061630218687873,
"grad_norm": 0.8685465339651437,
"learning_rate": 4.566856163892884e-06,
"loss": 0.0888,
"step": 203
},
{
"epoch": 0.23175234308435103,
"grad_norm": 0.8923434811809764,
"learning_rate": 4.561556298550379e-06,
"loss": 0.0841,
"step": 204
},
{
"epoch": 0.23288838398182335,
"grad_norm": 0.8683618736939278,
"learning_rate": 4.556227320548519e-06,
"loss": 0.0848,
"step": 205
},
{
"epoch": 0.23402442487929564,
"grad_norm": 0.8133017920049119,
"learning_rate": 4.550869305141478e-06,
"loss": 0.1032,
"step": 206
},
{
"epoch": 0.23516046577676797,
"grad_norm": 1.0077751828505999,
"learning_rate": 4.5454823279934924e-06,
"loss": 0.0939,
"step": 207
},
{
"epoch": 0.23629650667424026,
"grad_norm": 0.8941773669569191,
"learning_rate": 4.5400664651777835e-06,
"loss": 0.0736,
"step": 208
},
{
"epoch": 0.2374325475717126,
"grad_norm": 0.9625846478259737,
"learning_rate": 4.534621793175488e-06,
"loss": 0.0786,
"step": 209
},
{
"epoch": 0.23856858846918488,
"grad_norm": 0.8867810392749393,
"learning_rate": 4.529148388874577e-06,
"loss": 0.0882,
"step": 210
},
{
"epoch": 0.2397046293666572,
"grad_norm": 0.8856076921546335,
"learning_rate": 4.523646329568771e-06,
"loss": 0.084,
"step": 211
},
{
"epoch": 0.2408406702641295,
"grad_norm": 0.9900335853980446,
"learning_rate": 4.518115692956445e-06,
"loss": 0.0882,
"step": 212
},
{
"epoch": 0.24197671116160183,
"grad_norm": 0.8154742650873136,
"learning_rate": 4.512556557139538e-06,
"loss": 0.0999,
"step": 213
},
{
"epoch": 0.24311275205907412,
"grad_norm": 1.0349253631685067,
"learning_rate": 4.506969000622443e-06,
"loss": 0.0914,
"step": 214
},
{
"epoch": 0.24424879295654645,
"grad_norm": 0.8592000952917487,
"learning_rate": 4.501353102310901e-06,
"loss": 0.0892,
"step": 215
},
{
"epoch": 0.24538483385401874,
"grad_norm": 0.7760366725602739,
"learning_rate": 4.49570894151089e-06,
"loss": 0.0854,
"step": 216
},
{
"epoch": 0.24652087475149106,
"grad_norm": 0.8943917514409343,
"learning_rate": 4.490036597927499e-06,
"loss": 0.0894,
"step": 217
},
{
"epoch": 0.24765691564896336,
"grad_norm": 0.8522941541639423,
"learning_rate": 4.484336151663807e-06,
"loss": 0.1003,
"step": 218
},
{
"epoch": 0.24879295654643568,
"grad_norm": 0.8967087502728155,
"learning_rate": 4.47860768321975e-06,
"loss": 0.0948,
"step": 219
},
{
"epoch": 0.24992899744390798,
"grad_norm": 0.7189838300322855,
"learning_rate": 4.472851273490985e-06,
"loss": 0.098,
"step": 220
},
{
"epoch": 0.2510650383413803,
"grad_norm": 0.7078624951530635,
"learning_rate": 4.467067003767745e-06,
"loss": 0.0829,
"step": 221
},
{
"epoch": 0.2522010792388526,
"grad_norm": 0.7348078666330551,
"learning_rate": 4.4612549557336975e-06,
"loss": 0.0854,
"step": 222
},
{
"epoch": 0.2533371201363249,
"grad_norm": 0.7586832543393554,
"learning_rate": 4.455415211464783e-06,
"loss": 0.0871,
"step": 223
},
{
"epoch": 0.2544731610337972,
"grad_norm": 0.7930770514967493,
"learning_rate": 4.449547853428061e-06,
"loss": 0.0953,
"step": 224
},
{
"epoch": 0.2556092019312695,
"grad_norm": 0.7534958358029722,
"learning_rate": 4.443652964480544e-06,
"loss": 0.0917,
"step": 225
},
{
"epoch": 0.25674524282874184,
"grad_norm": 0.760763727741596,
"learning_rate": 4.437730627868028e-06,
"loss": 0.0842,
"step": 226
},
{
"epoch": 0.25788128372621416,
"grad_norm": 0.6899506937858009,
"learning_rate": 4.4317809272239145e-06,
"loss": 0.0835,
"step": 227
},
{
"epoch": 0.25901732462368643,
"grad_norm": 0.8301938179486986,
"learning_rate": 4.425803946568033e-06,
"loss": 0.0837,
"step": 228
},
{
"epoch": 0.26015336552115875,
"grad_norm": 0.806195952391347,
"learning_rate": 4.419799770305453e-06,
"loss": 0.0939,
"step": 229
},
{
"epoch": 0.2612894064186311,
"grad_norm": 0.7874803780113002,
"learning_rate": 4.413768483225292e-06,
"loss": 0.0927,
"step": 230
},
{
"epoch": 0.2624254473161034,
"grad_norm": 0.778443263649366,
"learning_rate": 4.407710170499517e-06,
"loss": 0.0842,
"step": 231
},
{
"epoch": 0.26356148821357567,
"grad_norm": 0.7837747021199533,
"learning_rate": 4.401624917681743e-06,
"loss": 0.0932,
"step": 232
},
{
"epoch": 0.264697529111048,
"grad_norm": 0.8282894247426811,
"learning_rate": 4.395512810706026e-06,
"loss": 0.0869,
"step": 233
},
{
"epoch": 0.2658335700085203,
"grad_norm": 0.829901206986068,
"learning_rate": 4.3893739358856465e-06,
"loss": 0.0865,
"step": 234
},
{
"epoch": 0.26696961090599264,
"grad_norm": 0.7444682225944242,
"learning_rate": 4.383208379911893e-06,
"loss": 0.0897,
"step": 235
},
{
"epoch": 0.2681056518034649,
"grad_norm": 0.7012792726547673,
"learning_rate": 4.377016229852836e-06,
"loss": 0.086,
"step": 236
},
{
"epoch": 0.26924169270093723,
"grad_norm": 0.8500432213095706,
"learning_rate": 4.370797573152101e-06,
"loss": 0.0813,
"step": 237
},
{
"epoch": 0.27037773359840955,
"grad_norm": 0.7570382330116664,
"learning_rate": 4.364552497627632e-06,
"loss": 0.089,
"step": 238
},
{
"epoch": 0.2715137744958819,
"grad_norm": 0.8335036015037836,
"learning_rate": 4.35828109147045e-06,
"loss": 0.0851,
"step": 239
},
{
"epoch": 0.27264981539335414,
"grad_norm": 0.8054974855210708,
"learning_rate": 4.3519834432434095e-06,
"loss": 0.0896,
"step": 240
},
{
"epoch": 0.27378585629082647,
"grad_norm": 0.7073705410015672,
"learning_rate": 4.345659641879948e-06,
"loss": 0.0909,
"step": 241
},
{
"epoch": 0.2749218971882988,
"grad_norm": 0.8852333441267097,
"learning_rate": 4.33930977668283e-06,
"loss": 0.0927,
"step": 242
},
{
"epoch": 0.2760579380857711,
"grad_norm": 0.8425230580400942,
"learning_rate": 4.332933937322883e-06,
"loss": 0.0845,
"step": 243
},
{
"epoch": 0.2771939789832434,
"grad_norm": 0.8758257547967765,
"learning_rate": 4.326532213837735e-06,
"loss": 0.0956,
"step": 244
},
{
"epoch": 0.2783300198807157,
"grad_norm": 0.816120585942606,
"learning_rate": 4.320104696630544e-06,
"loss": 0.0847,
"step": 245
},
{
"epoch": 0.27946606077818803,
"grad_norm": 0.7280485699649355,
"learning_rate": 4.3136514764687155e-06,
"loss": 0.0835,
"step": 246
},
{
"epoch": 0.2806021016756603,
"grad_norm": 0.7641526018539088,
"learning_rate": 4.3071726444826244e-06,
"loss": 0.0737,
"step": 247
},
{
"epoch": 0.2817381425731326,
"grad_norm": 0.696074958902299,
"learning_rate": 4.300668292164329e-06,
"loss": 0.0849,
"step": 248
},
{
"epoch": 0.28287418347060495,
"grad_norm": 0.7670647555778207,
"learning_rate": 4.29413851136628e-06,
"loss": 0.0923,
"step": 249
},
{
"epoch": 0.28401022436807727,
"grad_norm": 0.7773991448116256,
"learning_rate": 4.287583394300016e-06,
"loss": 0.09,
"step": 250
},
{
"epoch": 0.28514626526554954,
"grad_norm": 0.7573657225369796,
"learning_rate": 4.28100303353487e-06,
"loss": 0.0841,
"step": 251
},
{
"epoch": 0.28628230616302186,
"grad_norm": 0.7452647302721194,
"learning_rate": 4.274397521996658e-06,
"loss": 0.0822,
"step": 252
},
{
"epoch": 0.2874183470604942,
"grad_norm": 0.677199297450376,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0786,
"step": 253
},
{
"epoch": 0.2885543879579665,
"grad_norm": 0.8159060891220854,
"learning_rate": 4.261111420078844e-06,
"loss": 0.0855,
"step": 254
},
{
"epoch": 0.2896904288554388,
"grad_norm": 0.7560132821786525,
"learning_rate": 4.2544310173214546e-06,
"loss": 0.0896,
"step": 255
},
{
"epoch": 0.2908264697529111,
"grad_norm": 0.73318307350379,
"learning_rate": 4.247725839032781e-06,
"loss": 0.0768,
"step": 256
},
{
"epoch": 0.2919625106503834,
"grad_norm": 0.6654523142661359,
"learning_rate": 4.240995979901273e-06,
"loss": 0.0788,
"step": 257
},
{
"epoch": 0.29309855154785575,
"grad_norm": 0.6869374811575061,
"learning_rate": 4.234241534963916e-06,
"loss": 0.0842,
"step": 258
},
{
"epoch": 0.294234592445328,
"grad_norm": 0.7030551454665738,
"learning_rate": 4.227462599604889e-06,
"loss": 0.0771,
"step": 259
},
{
"epoch": 0.29537063334280034,
"grad_norm": 0.758359859205596,
"learning_rate": 4.220659269554217e-06,
"loss": 0.0865,
"step": 260
},
{
"epoch": 0.29650667424027266,
"grad_norm": 0.7754592301638612,
"learning_rate": 4.21383164088642e-06,
"loss": 0.0871,
"step": 261
},
{
"epoch": 0.297642715137745,
"grad_norm": 0.6840112409540218,
"learning_rate": 4.206979810019153e-06,
"loss": 0.0773,
"step": 262
},
{
"epoch": 0.29877875603521725,
"grad_norm": 0.6848481267287457,
"learning_rate": 4.20010387371185e-06,
"loss": 0.0782,
"step": 263
},
{
"epoch": 0.2999147969326896,
"grad_norm": 0.7061222716573403,
"learning_rate": 4.1932039290643534e-06,
"loss": 0.0777,
"step": 264
},
{
"epoch": 0.3010508378301619,
"grad_norm": 0.7448997259987848,
"learning_rate": 4.186280073515543e-06,
"loss": 0.0847,
"step": 265
},
{
"epoch": 0.30218687872763417,
"grad_norm": 0.7380145268386641,
"learning_rate": 4.179332404841963e-06,
"loss": 0.0713,
"step": 266
},
{
"epoch": 0.3033229196251065,
"grad_norm": 0.7441635416413879,
"learning_rate": 4.172361021156436e-06,
"loss": 0.0827,
"step": 267
},
{
"epoch": 0.3044589605225788,
"grad_norm": 0.7528609351049764,
"learning_rate": 4.1653660209066835e-06,
"loss": 0.0877,
"step": 268
},
{
"epoch": 0.30559500142005114,
"grad_norm": 0.8275998852209175,
"learning_rate": 4.158347502873933e-06,
"loss": 0.0804,
"step": 269
},
{
"epoch": 0.3067310423175234,
"grad_norm": 0.7437237554431169,
"learning_rate": 4.151305566171521e-06,
"loss": 0.0797,
"step": 270
},
{
"epoch": 0.30786708321499573,
"grad_norm": 0.8019081711906108,
"learning_rate": 4.144240310243496e-06,
"loss": 0.0841,
"step": 271
},
{
"epoch": 0.30900312411246805,
"grad_norm": 0.7291775823706078,
"learning_rate": 4.137151834863213e-06,
"loss": 0.0873,
"step": 272
},
{
"epoch": 0.3101391650099404,
"grad_norm": 0.7571411166672053,
"learning_rate": 4.130040240131925e-06,
"loss": 0.0845,
"step": 273
},
{
"epoch": 0.31127520590741264,
"grad_norm": 0.8067283136621284,
"learning_rate": 4.122905626477371e-06,
"loss": 0.0965,
"step": 274
},
{
"epoch": 0.31241124680488497,
"grad_norm": 0.7733213314503384,
"learning_rate": 4.115748094652352e-06,
"loss": 0.0783,
"step": 275
},
{
"epoch": 0.3135472877023573,
"grad_norm": 0.8024692635641963,
"learning_rate": 4.108567745733318e-06,
"loss": 0.0814,
"step": 276
},
{
"epoch": 0.3146833285998296,
"grad_norm": 0.9972625903967001,
"learning_rate": 4.10136468111893e-06,
"loss": 0.0814,
"step": 277
},
{
"epoch": 0.3158193694973019,
"grad_norm": 0.8366578826907879,
"learning_rate": 4.094139002528635e-06,
"loss": 0.0868,
"step": 278
},
{
"epoch": 0.3169554103947742,
"grad_norm": 0.6756245861915039,
"learning_rate": 4.086890812001228e-06,
"loss": 0.0815,
"step": 279
},
{
"epoch": 0.31809145129224653,
"grad_norm": 0.865443271304209,
"learning_rate": 4.07962021189341e-06,
"loss": 0.1015,
"step": 280
},
{
"epoch": 0.31922749218971885,
"grad_norm": 0.7942215693194814,
"learning_rate": 4.0723273048783426e-06,
"loss": 0.0824,
"step": 281
},
{
"epoch": 0.3203635330871911,
"grad_norm": 0.7941199716368488,
"learning_rate": 4.065012193944201e-06,
"loss": 0.0846,
"step": 282
},
{
"epoch": 0.32149957398466344,
"grad_norm": 0.7194857749286357,
"learning_rate": 4.057674982392713e-06,
"loss": 0.0851,
"step": 283
},
{
"epoch": 0.32263561488213577,
"grad_norm": 0.7058130985999059,
"learning_rate": 4.050315773837708e-06,
"loss": 0.0828,
"step": 284
},
{
"epoch": 0.3237716557796081,
"grad_norm": 0.8582057260744234,
"learning_rate": 4.042934672203651e-06,
"loss": 0.089,
"step": 285
},
{
"epoch": 0.32490769667708036,
"grad_norm": 0.8183999060210416,
"learning_rate": 4.0355317817241705e-06,
"loss": 0.0866,
"step": 286
},
{
"epoch": 0.3260437375745527,
"grad_norm": 0.8343617132623691,
"learning_rate": 4.028107206940592e-06,
"loss": 0.0853,
"step": 287
},
{
"epoch": 0.327179778472025,
"grad_norm": 0.718035241670308,
"learning_rate": 4.020661052700462e-06,
"loss": 0.0757,
"step": 288
},
{
"epoch": 0.3283158193694973,
"grad_norm": 0.736559511530064,
"learning_rate": 4.013193424156062e-06,
"loss": 0.0846,
"step": 289
},
{
"epoch": 0.3294518602669696,
"grad_norm": 0.7881644130060897,
"learning_rate": 4.00570442676293e-06,
"loss": 0.0801,
"step": 290
},
{
"epoch": 0.3305879011644419,
"grad_norm": 0.8339054711607803,
"learning_rate": 3.9981941662783675e-06,
"loss": 0.0832,
"step": 291
},
{
"epoch": 0.33172394206191425,
"grad_norm": 0.6576881392840723,
"learning_rate": 3.990662748759946e-06,
"loss": 0.0748,
"step": 292
},
{
"epoch": 0.3328599829593865,
"grad_norm": 0.7151497791042993,
"learning_rate": 3.983110280564009e-06,
"loss": 0.0788,
"step": 293
},
{
"epoch": 0.33399602385685884,
"grad_norm": 0.803862729134459,
"learning_rate": 3.975536868344174e-06,
"loss": 0.0856,
"step": 294
},
{
"epoch": 0.33513206475433116,
"grad_norm": 0.797700895368388,
"learning_rate": 3.96794261904982e-06,
"loss": 0.085,
"step": 295
},
{
"epoch": 0.3362681056518035,
"grad_norm": 0.7063931073253098,
"learning_rate": 3.9603276399245864e-06,
"loss": 0.0874,
"step": 296
},
{
"epoch": 0.33740414654927575,
"grad_norm": 0.8196874744289926,
"learning_rate": 3.9526920385048465e-06,
"loss": 0.0858,
"step": 297
},
{
"epoch": 0.3385401874467481,
"grad_norm": 0.6616725384562748,
"learning_rate": 3.945035922618198e-06,
"loss": 0.073,
"step": 298
},
{
"epoch": 0.3396762283442204,
"grad_norm": 0.7164040094642763,
"learning_rate": 3.937359400381938e-06,
"loss": 0.0756,
"step": 299
},
{
"epoch": 0.3408122692416927,
"grad_norm": 0.7262888078379157,
"learning_rate": 3.929662580201536e-06,
"loss": 0.0865,
"step": 300
},
{
"epoch": 0.341948310139165,
"grad_norm": 0.7363762038223817,
"learning_rate": 3.9219455707691004e-06,
"loss": 0.0811,
"step": 301
},
{
"epoch": 0.3430843510366373,
"grad_norm": 0.7072534911733872,
"learning_rate": 3.91420848106185e-06,
"loss": 0.0756,
"step": 302
},
{
"epoch": 0.34422039193410964,
"grad_norm": 0.6760504100426291,
"learning_rate": 3.906451420340566e-06,
"loss": 0.0768,
"step": 303
},
{
"epoch": 0.34535643283158196,
"grad_norm": 0.6941199007978212,
"learning_rate": 3.898674498148058e-06,
"loss": 0.0831,
"step": 304
},
{
"epoch": 0.34649247372905423,
"grad_norm": 0.7638852373461658,
"learning_rate": 3.890877824307611e-06,
"loss": 0.0837,
"step": 305
},
{
"epoch": 0.34762851462652655,
"grad_norm": 0.8602638611119576,
"learning_rate": 3.883061508921439e-06,
"loss": 0.0758,
"step": 306
},
{
"epoch": 0.3487645555239989,
"grad_norm": 0.7902801585559203,
"learning_rate": 3.875225662369125e-06,
"loss": 0.0852,
"step": 307
},
{
"epoch": 0.3499005964214712,
"grad_norm": 0.7206101025736867,
"learning_rate": 3.8673703953060685e-06,
"loss": 0.0805,
"step": 308
},
{
"epoch": 0.35103663731894347,
"grad_norm": 0.8588244629042325,
"learning_rate": 3.859495818661914e-06,
"loss": 0.0759,
"step": 309
},
{
"epoch": 0.3521726782164158,
"grad_norm": 0.8040786942739016,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.0834,
"step": 310
},
{
"epoch": 0.3533087191138881,
"grad_norm": 0.7672683122350898,
"learning_rate": 3.843689181710756e-06,
"loss": 0.0826,
"step": 311
},
{
"epoch": 0.3544447600113604,
"grad_norm": 0.7239437740106832,
"learning_rate": 3.835757344620183e-06,
"loss": 0.0726,
"step": 312
},
{
"epoch": 0.3555808009088327,
"grad_norm": 0.707817772964158,
"learning_rate": 3.827806644378221e-06,
"loss": 0.0774,
"step": 313
},
{
"epoch": 0.35671684180630503,
"grad_norm": 0.7814771723692865,
"learning_rate": 3.819837193262197e-06,
"loss": 0.0816,
"step": 314
},
{
"epoch": 0.35785288270377735,
"grad_norm": 0.7018548527687214,
"learning_rate": 3.811849103814229e-06,
"loss": 0.0834,
"step": 315
},
{
"epoch": 0.3589889236012496,
"grad_norm": 0.7411725676109678,
"learning_rate": 3.803842488839642e-06,
"loss": 0.0745,
"step": 316
},
{
"epoch": 0.36012496449872194,
"grad_norm": 0.7575659597309948,
"learning_rate": 3.795817461405372e-06,
"loss": 0.0758,
"step": 317
},
{
"epoch": 0.36126100539619427,
"grad_norm": 0.7126296926711662,
"learning_rate": 3.7877741348383703e-06,
"loss": 0.0871,
"step": 318
},
{
"epoch": 0.3623970462936666,
"grad_norm": 0.7647606176666816,
"learning_rate": 3.779712622724003e-06,
"loss": 0.0813,
"step": 319
},
{
"epoch": 0.36353308719113886,
"grad_norm": 0.7394294466894481,
"learning_rate": 3.7716330389044463e-06,
"loss": 0.0781,
"step": 320
},
{
"epoch": 0.3646691280886112,
"grad_norm": 0.7442083160908898,
"learning_rate": 3.76353549747708e-06,
"loss": 0.0868,
"step": 321
},
{
"epoch": 0.3658051689860835,
"grad_norm": 0.7080436398478851,
"learning_rate": 3.7554201127928747e-06,
"loss": 0.078,
"step": 322
},
{
"epoch": 0.36694120988355583,
"grad_norm": 0.6967853094410319,
"learning_rate": 3.74728699945478e-06,
"loss": 0.0803,
"step": 323
},
{
"epoch": 0.3680772507810281,
"grad_norm": 0.7828060588699257,
"learning_rate": 3.739136272316102e-06,
"loss": 0.0845,
"step": 324
},
{
"epoch": 0.3692132916785004,
"grad_norm": 0.7825472362344883,
"learning_rate": 3.7309680464788835e-06,
"loss": 0.0741,
"step": 325
},
{
"epoch": 0.37034933257597275,
"grad_norm": 0.7968905209840789,
"learning_rate": 3.72278243729228e-06,
"loss": 0.0801,
"step": 326
},
{
"epoch": 0.37148537347344507,
"grad_norm": 0.7412746573627451,
"learning_rate": 3.7145795603509282e-06,
"loss": 0.0814,
"step": 327
},
{
"epoch": 0.37262141437091734,
"grad_norm": 0.8150346697048992,
"learning_rate": 3.706359531493316e-06,
"loss": 0.0788,
"step": 328
},
{
"epoch": 0.37375745526838966,
"grad_norm": 0.7931274032845024,
"learning_rate": 3.6981224668001427e-06,
"loss": 0.0827,
"step": 329
},
{
"epoch": 0.374893496165862,
"grad_norm": 0.7843395757066731,
"learning_rate": 3.6898684825926845e-06,
"loss": 0.0743,
"step": 330
},
{
"epoch": 0.3760295370633343,
"grad_norm": 0.7346940737464841,
"learning_rate": 3.681597695431149e-06,
"loss": 0.0786,
"step": 331
},
{
"epoch": 0.3771655779608066,
"grad_norm": 0.8796304428964914,
"learning_rate": 3.6733102221130303e-06,
"loss": 0.0812,
"step": 332
},
{
"epoch": 0.3783016188582789,
"grad_norm": 0.6688582439986495,
"learning_rate": 3.6650061796714597e-06,
"loss": 0.073,
"step": 333
},
{
"epoch": 0.3794376597557512,
"grad_norm": 0.7754423124852141,
"learning_rate": 3.656685685373552e-06,
"loss": 0.0789,
"step": 334
},
{
"epoch": 0.3805737006532235,
"grad_norm": 0.7590512262588398,
"learning_rate": 3.6483488567187473e-06,
"loss": 0.0849,
"step": 335
},
{
"epoch": 0.3817097415506958,
"grad_norm": 0.781321406354808,
"learning_rate": 3.6399958114371597e-06,
"loss": 0.083,
"step": 336
},
{
"epoch": 0.38284578244816814,
"grad_norm": 0.692794052076511,
"learning_rate": 3.631626667487906e-06,
"loss": 0.0652,
"step": 337
},
{
"epoch": 0.38398182334564046,
"grad_norm": 0.7897147258296024,
"learning_rate": 3.623241543057445e-06,
"loss": 0.0802,
"step": 338
},
{
"epoch": 0.38511786424311273,
"grad_norm": 0.7425771430389372,
"learning_rate": 3.614840556557905e-06,
"loss": 0.0809,
"step": 339
},
{
"epoch": 0.38625390514058505,
"grad_norm": 0.7333094111793276,
"learning_rate": 3.606423826625414e-06,
"loss": 0.0674,
"step": 340
},
{
"epoch": 0.3873899460380574,
"grad_norm": 0.7352398600810227,
"learning_rate": 3.5979914721184263e-06,
"loss": 0.0837,
"step": 341
},
{
"epoch": 0.3885259869355297,
"grad_norm": 0.8231872248613965,
"learning_rate": 3.5895436121160388e-06,
"loss": 0.0772,
"step": 342
},
{
"epoch": 0.38966202783300197,
"grad_norm": 0.7367149834483241,
"learning_rate": 3.5810803659163136e-06,
"loss": 0.0775,
"step": 343
},
{
"epoch": 0.3907980687304743,
"grad_norm": 0.7712185528181968,
"learning_rate": 3.5726018530345913e-06,
"loss": 0.0771,
"step": 344
},
{
"epoch": 0.3919341096279466,
"grad_norm": 0.7715161299771612,
"learning_rate": 3.564108193201804e-06,
"loss": 0.0727,
"step": 345
},
{
"epoch": 0.39307015052541894,
"grad_norm": 0.8012727862178366,
"learning_rate": 3.5555995063627842e-06,
"loss": 0.0797,
"step": 346
},
{
"epoch": 0.3942061914228912,
"grad_norm": 0.7337481236349752,
"learning_rate": 3.5470759126745726e-06,
"loss": 0.0798,
"step": 347
},
{
"epoch": 0.39534223232036353,
"grad_norm": 0.7350551910786791,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.0875,
"step": 348
},
{
"epoch": 0.39647827321783585,
"grad_norm": 0.7048639304094766,
"learning_rate": 3.5299844864295773e-06,
"loss": 0.0754,
"step": 349
},
{
"epoch": 0.3976143141153082,
"grad_norm": 1.1334161441853856,
"learning_rate": 3.5214168952326205e-06,
"loss": 0.0746,
"step": 350
},
{
"epoch": 0.39875035501278044,
"grad_norm": 0.6989505797799814,
"learning_rate": 3.5128348799027157e-06,
"loss": 0.0747,
"step": 351
},
{
"epoch": 0.39988639591025277,
"grad_norm": 0.7187861112920556,
"learning_rate": 3.5042385616324243e-06,
"loss": 0.0825,
"step": 352
},
{
"epoch": 0.4010224368077251,
"grad_norm": 0.6972138154142447,
"learning_rate": 3.4956280618162887e-06,
"loss": 0.0779,
"step": 353
},
{
"epoch": 0.4021584777051974,
"grad_norm": 0.8929740592234533,
"learning_rate": 3.4870035020491216e-06,
"loss": 0.0887,
"step": 354
},
{
"epoch": 0.4032945186026697,
"grad_norm": 0.734887534070193,
"learning_rate": 3.4783650041242823e-06,
"loss": 0.0787,
"step": 355
},
{
"epoch": 0.404430559500142,
"grad_norm": 0.6976813270054868,
"learning_rate": 3.469712690031962e-06,
"loss": 0.0745,
"step": 356
},
{
"epoch": 0.40556660039761433,
"grad_norm": 0.8271656519707578,
"learning_rate": 3.4610466819574617e-06,
"loss": 0.0762,
"step": 357
},
{
"epoch": 0.4067026412950866,
"grad_norm": 0.7662647730775376,
"learning_rate": 3.4523671022794612e-06,
"loss": 0.0764,
"step": 358
},
{
"epoch": 0.4078386821925589,
"grad_norm": 0.8034388047927019,
"learning_rate": 3.443674073568296e-06,
"loss": 0.0851,
"step": 359
},
{
"epoch": 0.40897472309003124,
"grad_norm": 0.8132216242516986,
"learning_rate": 3.4349677185842246e-06,
"loss": 0.0717,
"step": 360
},
{
"epoch": 0.41011076398750357,
"grad_norm": 0.8537641846408835,
"learning_rate": 3.4262481602756937e-06,
"loss": 0.0687,
"step": 361
},
{
"epoch": 0.41124680488497584,
"grad_norm": 0.7776722907105323,
"learning_rate": 3.4175155217776057e-06,
"loss": 0.086,
"step": 362
},
{
"epoch": 0.41238284578244816,
"grad_norm": 0.7639319056207042,
"learning_rate": 3.4087699264095746e-06,
"loss": 0.0792,
"step": 363
},
{
"epoch": 0.4135188866799205,
"grad_norm": 0.9498236411990728,
"learning_rate": 3.4000114976741905e-06,
"loss": 0.0735,
"step": 364
},
{
"epoch": 0.4146549275773928,
"grad_norm": 0.8078922681417617,
"learning_rate": 3.391240359255269e-06,
"loss": 0.0859,
"step": 365
},
{
"epoch": 0.4157909684748651,
"grad_norm": 0.701305404529687,
"learning_rate": 3.38245663501611e-06,
"loss": 0.0754,
"step": 366
},
{
"epoch": 0.4169270093723374,
"grad_norm": 0.9535861270195332,
"learning_rate": 3.3736604489977465e-06,
"loss": 0.0844,
"step": 367
},
{
"epoch": 0.4180630502698097,
"grad_norm": 0.7631817995714191,
"learning_rate": 3.3648519254171906e-06,
"loss": 0.0685,
"step": 368
},
{
"epoch": 0.41919909116728205,
"grad_norm": 0.8104822580614385,
"learning_rate": 3.3560311886656855e-06,
"loss": 0.0855,
"step": 369
},
{
"epoch": 0.4203351320647543,
"grad_norm": 0.8298476446641734,
"learning_rate": 3.3471983633069414e-06,
"loss": 0.0795,
"step": 370
},
{
"epoch": 0.42147117296222664,
"grad_norm": 0.7256692328377535,
"learning_rate": 3.3383535740753813e-06,
"loss": 0.0812,
"step": 371
},
{
"epoch": 0.42260721385969896,
"grad_norm": 0.7210661181409769,
"learning_rate": 3.32949694587438e-06,
"loss": 0.0918,
"step": 372
},
{
"epoch": 0.4237432547571713,
"grad_norm": 0.7174750225509191,
"learning_rate": 3.320628603774496e-06,
"loss": 0.0815,
"step": 373
},
{
"epoch": 0.42487929565464355,
"grad_norm": 0.7537467160289135,
"learning_rate": 3.3117486730117092e-06,
"loss": 0.0829,
"step": 374
},
{
"epoch": 0.4260153365521159,
"grad_norm": 0.7277497535392837,
"learning_rate": 3.3028572789856507e-06,
"loss": 0.0696,
"step": 375
},
{
"epoch": 0.4271513774495882,
"grad_norm": 0.6443734430583794,
"learning_rate": 3.2939545472578314e-06,
"loss": 0.069,
"step": 376
},
{
"epoch": 0.42828741834706047,
"grad_norm": 0.7695280037358156,
"learning_rate": 3.285040603549872e-06,
"loss": 0.0747,
"step": 377
},
{
"epoch": 0.4294234592445328,
"grad_norm": 0.7294107932414593,
"learning_rate": 3.276115573741724e-06,
"loss": 0.0699,
"step": 378
},
{
"epoch": 0.4305595001420051,
"grad_norm": 0.6901445691629573,
"learning_rate": 3.267179583869892e-06,
"loss": 0.0653,
"step": 379
},
{
"epoch": 0.43169554103947744,
"grad_norm": 0.7479692537488529,
"learning_rate": 3.2582327601256567e-06,
"loss": 0.0729,
"step": 380
},
{
"epoch": 0.4328315819369497,
"grad_norm": 0.8489891426282298,
"learning_rate": 3.249275228853292e-06,
"loss": 0.0671,
"step": 381
},
{
"epoch": 0.43396762283442203,
"grad_norm": 0.8342540424379653,
"learning_rate": 3.240307116548279e-06,
"loss": 0.0857,
"step": 382
},
{
"epoch": 0.43510366373189435,
"grad_norm": 0.855952003925093,
"learning_rate": 3.231328549855522e-06,
"loss": 0.0841,
"step": 383
},
{
"epoch": 0.4362397046293667,
"grad_norm": 0.8171052821070877,
"learning_rate": 3.222339655567556e-06,
"loss": 0.0724,
"step": 384
},
{
"epoch": 0.43737574552683894,
"grad_norm": 0.7191540982661123,
"learning_rate": 3.2133405606227636e-06,
"loss": 0.0705,
"step": 385
},
{
"epoch": 0.43851178642431127,
"grad_norm": 0.8227659710377888,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.0816,
"step": 386
},
{
"epoch": 0.4396478273217836,
"grad_norm": 0.7965981656056659,
"learning_rate": 3.1953122772346757e-06,
"loss": 0.0716,
"step": 387
},
{
"epoch": 0.4407838682192559,
"grad_norm": 0.7306236376054158,
"learning_rate": 3.1862833433812137e-06,
"loss": 0.07,
"step": 388
},
{
"epoch": 0.4419199091167282,
"grad_norm": 0.7687699723670527,
"learning_rate": 3.1772447180469934e-06,
"loss": 0.0787,
"step": 389
},
{
"epoch": 0.4430559500142005,
"grad_norm": 0.8313217943349993,
"learning_rate": 3.1681965288726825e-06,
"loss": 0.0716,
"step": 390
},
{
"epoch": 0.44419199091167283,
"grad_norm": 0.7173908314094287,
"learning_rate": 3.1591389036340064e-06,
"loss": 0.0742,
"step": 391
},
{
"epoch": 0.44532803180914515,
"grad_norm": 0.8407757996115441,
"learning_rate": 3.1500719702399406e-06,
"loss": 0.0734,
"step": 392
},
{
"epoch": 0.4464640727066174,
"grad_norm": 0.7682407487902735,
"learning_rate": 3.1409958567309114e-06,
"loss": 0.0895,
"step": 393
},
{
"epoch": 0.44760011360408974,
"grad_norm": 0.7226606623749933,
"learning_rate": 3.1319106912769797e-06,
"loss": 0.0757,
"step": 394
},
{
"epoch": 0.44873615450156207,
"grad_norm": 0.6935224939852427,
"learning_rate": 3.122816602176039e-06,
"loss": 0.0715,
"step": 395
},
{
"epoch": 0.4498721953990344,
"grad_norm": 0.7857347629017143,
"learning_rate": 3.1137137178519983e-06,
"loss": 0.0796,
"step": 396
},
{
"epoch": 0.45100823629650666,
"grad_norm": 0.8462391224537944,
"learning_rate": 3.1046021668529684e-06,
"loss": 0.0778,
"step": 397
},
{
"epoch": 0.452144277193979,
"grad_norm": 0.7803762723068312,
"learning_rate": 3.0954820778494516e-06,
"loss": 0.0792,
"step": 398
},
{
"epoch": 0.4532803180914513,
"grad_norm": 0.8208303464346856,
"learning_rate": 3.0863535796325173e-06,
"loss": 0.0822,
"step": 399
},
{
"epoch": 0.4544163589889236,
"grad_norm": 0.7406949289191436,
"learning_rate": 3.0772168011119894e-06,
"loss": 0.0722,
"step": 400
},
{
"epoch": 0.4555523998863959,
"grad_norm": 0.7232653457612589,
"learning_rate": 3.068071871314626e-06,
"loss": 0.0754,
"step": 401
},
{
"epoch": 0.4566884407838682,
"grad_norm": 0.6527667928894365,
"learning_rate": 3.0589189193822894e-06,
"loss": 0.0711,
"step": 402
},
{
"epoch": 0.45782448168134054,
"grad_norm": 0.767420451370671,
"learning_rate": 3.0497580745701334e-06,
"loss": 0.0834,
"step": 403
},
{
"epoch": 0.4589605225788128,
"grad_norm": 0.7645556615921287,
"learning_rate": 3.0405894662447682e-06,
"loss": 0.0789,
"step": 404
},
{
"epoch": 0.46009656347628514,
"grad_norm": 0.7810648726320831,
"learning_rate": 3.0314132238824416e-06,
"loss": 0.0818,
"step": 405
},
{
"epoch": 0.46123260437375746,
"grad_norm": 0.7115760882284773,
"learning_rate": 3.0222294770672054e-06,
"loss": 0.0705,
"step": 406
},
{
"epoch": 0.4623686452712298,
"grad_norm": 0.649480771602918,
"learning_rate": 3.013038355489086e-06,
"loss": 0.0689,
"step": 407
},
{
"epoch": 0.46350468616870205,
"grad_norm": 0.7592672186320847,
"learning_rate": 3.0038399889422553e-06,
"loss": 0.0777,
"step": 408
},
{
"epoch": 0.4646407270661744,
"grad_norm": 0.8172328817027344,
"learning_rate": 2.9946345073231964e-06,
"loss": 0.074,
"step": 409
},
{
"epoch": 0.4657767679636467,
"grad_norm": 0.7859879017666365,
"learning_rate": 2.985422040628867e-06,
"loss": 0.0769,
"step": 410
},
{
"epoch": 0.466912808861119,
"grad_norm": 0.7177354483721196,
"learning_rate": 2.976202718954869e-06,
"loss": 0.0752,
"step": 411
},
{
"epoch": 0.4680488497585913,
"grad_norm": 0.6542212761323943,
"learning_rate": 2.9669766724936074e-06,
"loss": 0.0699,
"step": 412
},
{
"epoch": 0.4691848906560636,
"grad_norm": 0.7320537820608451,
"learning_rate": 2.957744031532451e-06,
"loss": 0.0698,
"step": 413
},
{
"epoch": 0.47032093155353594,
"grad_norm": 0.7284037776279783,
"learning_rate": 2.948504926451896e-06,
"loss": 0.0735,
"step": 414
},
{
"epoch": 0.47145697245100826,
"grad_norm": 0.702970854905207,
"learning_rate": 2.9392594877237194e-06,
"loss": 0.065,
"step": 415
},
{
"epoch": 0.47259301334848053,
"grad_norm": 0.6558338370706804,
"learning_rate": 2.930007845909146e-06,
"loss": 0.0711,
"step": 416
},
{
"epoch": 0.47372905424595285,
"grad_norm": 0.667963355409311,
"learning_rate": 2.9207501316569936e-06,
"loss": 0.0669,
"step": 417
},
{
"epoch": 0.4748650951434252,
"grad_norm": 0.6856434352453329,
"learning_rate": 2.911486475701835e-06,
"loss": 0.0612,
"step": 418
},
{
"epoch": 0.4760011360408975,
"grad_norm": 0.7150424311335496,
"learning_rate": 2.9022170088621497e-06,
"loss": 0.0708,
"step": 419
},
{
"epoch": 0.47713717693836977,
"grad_norm": 0.7191644625060256,
"learning_rate": 2.892941862038475e-06,
"loss": 0.0792,
"step": 420
},
{
"epoch": 0.4782732178358421,
"grad_norm": 0.740499459461677,
"learning_rate": 2.883661166211564e-06,
"loss": 0.0729,
"step": 421
},
{
"epoch": 0.4794092587333144,
"grad_norm": 0.8211073931673529,
"learning_rate": 2.8743750524405254e-06,
"loss": 0.0759,
"step": 422
},
{
"epoch": 0.4805452996307867,
"grad_norm": 0.7278022551256821,
"learning_rate": 2.8650836518609814e-06,
"loss": 0.0649,
"step": 423
},
{
"epoch": 0.481681340528259,
"grad_norm": 0.7105414452804,
"learning_rate": 2.8557870956832135e-06,
"loss": 0.0719,
"step": 424
},
{
"epoch": 0.48281738142573133,
"grad_norm": 0.7882490477262348,
"learning_rate": 2.8464855151903065e-06,
"loss": 0.0752,
"step": 425
},
{
"epoch": 0.48395342232320365,
"grad_norm": 0.7614124126399766,
"learning_rate": 2.837179041736299e-06,
"loss": 0.0656,
"step": 426
},
{
"epoch": 0.4850894632206759,
"grad_norm": 0.7482163459741915,
"learning_rate": 2.8278678067443255e-06,
"loss": 0.0811,
"step": 427
},
{
"epoch": 0.48622550411814824,
"grad_norm": 0.7408052511491365,
"learning_rate": 2.8185519417047624e-06,
"loss": 0.0711,
"step": 428
},
{
"epoch": 0.48736154501562057,
"grad_norm": 0.7402789101662246,
"learning_rate": 2.80923157817337e-06,
"loss": 0.0691,
"step": 429
},
{
"epoch": 0.4884975859130929,
"grad_norm": 0.7071544319741676,
"learning_rate": 2.799906847769433e-06,
"loss": 0.0729,
"step": 430
},
{
"epoch": 0.48963362681056516,
"grad_norm": 0.7353065824078855,
"learning_rate": 2.790577882173906e-06,
"loss": 0.0732,
"step": 431
},
{
"epoch": 0.4907696677080375,
"grad_norm": 0.7209493364089367,
"learning_rate": 2.781244813127552e-06,
"loss": 0.0794,
"step": 432
},
{
"epoch": 0.4919057086055098,
"grad_norm": 0.7468157532688993,
"learning_rate": 2.7719077724290793e-06,
"loss": 0.0718,
"step": 433
},
{
"epoch": 0.49304174950298213,
"grad_norm": 0.6890756859527285,
"learning_rate": 2.762566891933285e-06,
"loss": 0.0753,
"step": 434
},
{
"epoch": 0.4941777904004544,
"grad_norm": 0.7788104679693914,
"learning_rate": 2.7532223035491877e-06,
"loss": 0.0716,
"step": 435
},
{
"epoch": 0.4953138312979267,
"grad_norm": 0.7723914173864724,
"learning_rate": 2.743874139238171e-06,
"loss": 0.0775,
"step": 436
},
{
"epoch": 0.49644987219539904,
"grad_norm": 0.743928252813702,
"learning_rate": 2.7345225310121155e-06,
"loss": 0.0748,
"step": 437
},
{
"epoch": 0.49758591309287137,
"grad_norm": 0.709937023384648,
"learning_rate": 2.725167610931534e-06,
"loss": 0.0731,
"step": 438
},
{
"epoch": 0.49872195399034364,
"grad_norm": 0.7257850662776226,
"learning_rate": 2.715809511103711e-06,
"loss": 0.0739,
"step": 439
},
{
"epoch": 0.49985799488781596,
"grad_norm": 0.7244919657561524,
"learning_rate": 2.7064483636808314e-06,
"loss": 0.0626,
"step": 440
},
{
"epoch": 0.5009940357852882,
"grad_norm": 0.6763467559532471,
"learning_rate": 2.69708430085812e-06,
"loss": 0.0672,
"step": 441
},
{
"epoch": 0.5021300766827606,
"grad_norm": 0.6998161361615932,
"learning_rate": 2.687717454871971e-06,
"loss": 0.066,
"step": 442
},
{
"epoch": 0.5032661175802329,
"grad_norm": 0.7167142257204601,
"learning_rate": 2.678347957998081e-06,
"loss": 0.0675,
"step": 443
},
{
"epoch": 0.5044021584777052,
"grad_norm": 0.8043133103525217,
"learning_rate": 2.6689759425495833e-06,
"loss": 0.0736,
"step": 444
},
{
"epoch": 0.5055381993751775,
"grad_norm": 0.6878575755122527,
"learning_rate": 2.659601540875174e-06,
"loss": 0.0746,
"step": 445
},
{
"epoch": 0.5066742402726498,
"grad_norm": 0.7525753170166554,
"learning_rate": 2.650224885357251e-06,
"loss": 0.0801,
"step": 446
},
{
"epoch": 0.5078102811701222,
"grad_norm": 0.7941324249115771,
"learning_rate": 2.640846108410039e-06,
"loss": 0.0685,
"step": 447
},
{
"epoch": 0.5089463220675944,
"grad_norm": 0.6913302757604484,
"learning_rate": 2.6314653424777194e-06,
"loss": 0.0705,
"step": 448
},
{
"epoch": 0.5100823629650667,
"grad_norm": 0.7174450522921005,
"learning_rate": 2.6220827200325628e-06,
"loss": 0.0759,
"step": 449
},
{
"epoch": 0.511218403862539,
"grad_norm": 0.7973535459102329,
"learning_rate": 2.612698373573056e-06,
"loss": 0.0736,
"step": 450
},
{
"epoch": 0.5123544447600114,
"grad_norm": 0.8074198728981284,
"learning_rate": 2.603312435622033e-06,
"loss": 0.0758,
"step": 451
},
{
"epoch": 0.5134904856574837,
"grad_norm": 0.7604997282120738,
"learning_rate": 2.593925038724802e-06,
"loss": 0.0737,
"step": 452
},
{
"epoch": 0.514626526554956,
"grad_norm": 0.7476276338719816,
"learning_rate": 2.5845363154472725e-06,
"loss": 0.0722,
"step": 453
},
{
"epoch": 0.5157625674524283,
"grad_norm": 0.735936044826126,
"learning_rate": 2.575146398374087e-06,
"loss": 0.0722,
"step": 454
},
{
"epoch": 0.5168986083499006,
"grad_norm": 0.7379849599595595,
"learning_rate": 2.565755420106744e-06,
"loss": 0.0861,
"step": 455
},
{
"epoch": 0.5180346492473729,
"grad_norm": 0.7802069151430526,
"learning_rate": 2.5563635132617305e-06,
"loss": 0.0699,
"step": 456
},
{
"epoch": 0.5191706901448452,
"grad_norm": 0.7850066911880272,
"learning_rate": 2.5469708104686452e-06,
"loss": 0.0786,
"step": 457
},
{
"epoch": 0.5203067310423175,
"grad_norm": 0.7173680507506982,
"learning_rate": 2.5375774443683263e-06,
"loss": 0.0662,
"step": 458
},
{
"epoch": 0.5214427719397898,
"grad_norm": 0.7215484657257588,
"learning_rate": 2.5281835476109796e-06,
"loss": 0.0691,
"step": 459
},
{
"epoch": 0.5225788128372622,
"grad_norm": 0.6809593038075415,
"learning_rate": 2.518789252854305e-06,
"loss": 0.0697,
"step": 460
},
{
"epoch": 0.5237148537347345,
"grad_norm": 0.8606317588371037,
"learning_rate": 2.5093946927616227e-06,
"loss": 0.0714,
"step": 461
},
{
"epoch": 0.5248508946322068,
"grad_norm": 0.7678392851383928,
"learning_rate": 2.5e-06,
"loss": 0.0727,
"step": 462
},
{
"epoch": 0.5259869355296791,
"grad_norm": 0.7979050686259359,
"learning_rate": 2.4906053072383773e-06,
"loss": 0.0731,
"step": 463
},
{
"epoch": 0.5271229764271513,
"grad_norm": 0.6681326332305534,
"learning_rate": 2.4812107471456958e-06,
"loss": 0.0626,
"step": 464
},
{
"epoch": 0.5282590173246237,
"grad_norm": 0.7627768324930073,
"learning_rate": 2.4718164523890212e-06,
"loss": 0.0712,
"step": 465
},
{
"epoch": 0.529395058222096,
"grad_norm": 0.7904275908906134,
"learning_rate": 2.4624225556316745e-06,
"loss": 0.0637,
"step": 466
},
{
"epoch": 0.5305310991195683,
"grad_norm": 0.7155499492449455,
"learning_rate": 2.453029189531356e-06,
"loss": 0.0703,
"step": 467
},
{
"epoch": 0.5316671400170406,
"grad_norm": 0.7250622453734636,
"learning_rate": 2.44363648673827e-06,
"loss": 0.069,
"step": 468
},
{
"epoch": 0.532803180914513,
"grad_norm": 0.7249989036503823,
"learning_rate": 2.4342445798932563e-06,
"loss": 0.063,
"step": 469
},
{
"epoch": 0.5339392218119853,
"grad_norm": 0.743744157976681,
"learning_rate": 2.4248536016259137e-06,
"loss": 0.077,
"step": 470
},
{
"epoch": 0.5350752627094575,
"grad_norm": 0.7582738981974823,
"learning_rate": 2.4154636845527284e-06,
"loss": 0.0732,
"step": 471
},
{
"epoch": 0.5362113036069298,
"grad_norm": 0.7328578027294108,
"learning_rate": 2.4060749612751987e-06,
"loss": 0.071,
"step": 472
},
{
"epoch": 0.5373473445044021,
"grad_norm": 0.7002570282552395,
"learning_rate": 2.396687564377967e-06,
"loss": 0.0659,
"step": 473
},
{
"epoch": 0.5384833854018745,
"grad_norm": 0.7359719709178979,
"learning_rate": 2.3873016264269446e-06,
"loss": 0.065,
"step": 474
},
{
"epoch": 0.5396194262993468,
"grad_norm": 0.7105844692533961,
"learning_rate": 2.3779172799674377e-06,
"loss": 0.0696,
"step": 475
},
{
"epoch": 0.5407554671968191,
"grad_norm": 0.6506193455984971,
"learning_rate": 2.368534657522281e-06,
"loss": 0.0676,
"step": 476
},
{
"epoch": 0.5418915080942914,
"grad_norm": 0.7291302216363645,
"learning_rate": 2.359153891589962e-06,
"loss": 0.0801,
"step": 477
},
{
"epoch": 0.5430275489917638,
"grad_norm": 0.7447378296478038,
"learning_rate": 2.3497751146427494e-06,
"loss": 0.0728,
"step": 478
},
{
"epoch": 0.544163589889236,
"grad_norm": 0.7252193208819527,
"learning_rate": 2.3403984591248265e-06,
"loss": 0.0754,
"step": 479
},
{
"epoch": 0.5452996307867083,
"grad_norm": 0.7138381216438716,
"learning_rate": 2.3310240574504184e-06,
"loss": 0.0722,
"step": 480
},
{
"epoch": 0.5464356716841806,
"grad_norm": 0.6530875177989481,
"learning_rate": 2.3216520420019194e-06,
"loss": 0.0638,
"step": 481
},
{
"epoch": 0.5475717125816529,
"grad_norm": 0.8473481511613769,
"learning_rate": 2.3122825451280294e-06,
"loss": 0.081,
"step": 482
},
{
"epoch": 0.5487077534791253,
"grad_norm": 0.7553901465399981,
"learning_rate": 2.30291569914188e-06,
"loss": 0.0599,
"step": 483
},
{
"epoch": 0.5498437943765976,
"grad_norm": 0.7885325178229121,
"learning_rate": 2.2935516363191695e-06,
"loss": 0.0848,
"step": 484
},
{
"epoch": 0.5509798352740699,
"grad_norm": 0.7822535293047203,
"learning_rate": 2.2841904888962903e-06,
"loss": 0.0746,
"step": 485
},
{
"epoch": 0.5521158761715422,
"grad_norm": 0.7752683853284668,
"learning_rate": 2.2748323890684664e-06,
"loss": 0.0745,
"step": 486
},
{
"epoch": 0.5532519170690144,
"grad_norm": 0.7713264960564677,
"learning_rate": 2.2654774689878862e-06,
"loss": 0.076,
"step": 487
},
{
"epoch": 0.5543879579664868,
"grad_norm": 0.629660342616562,
"learning_rate": 2.2561258607618296e-06,
"loss": 0.0712,
"step": 488
},
{
"epoch": 0.5555239988639591,
"grad_norm": 0.7243836087765395,
"learning_rate": 2.246777696450813e-06,
"loss": 0.0819,
"step": 489
},
{
"epoch": 0.5566600397614314,
"grad_norm": 0.6376006175785872,
"learning_rate": 2.2374331080667168e-06,
"loss": 0.0635,
"step": 490
},
{
"epoch": 0.5577960806589037,
"grad_norm": 0.6720742104662947,
"learning_rate": 2.2280922275709216e-06,
"loss": 0.0655,
"step": 491
},
{
"epoch": 0.5589321215563761,
"grad_norm": 0.7322632128140908,
"learning_rate": 2.2187551868724487e-06,
"loss": 0.0733,
"step": 492
},
{
"epoch": 0.5600681624538484,
"grad_norm": 0.7942425591087324,
"learning_rate": 2.209422117826094e-06,
"loss": 0.0708,
"step": 493
},
{
"epoch": 0.5612042033513206,
"grad_norm": 0.6415846065352803,
"learning_rate": 2.200093152230568e-06,
"loss": 0.0625,
"step": 494
},
{
"epoch": 0.5623402442487929,
"grad_norm": 0.791673886780985,
"learning_rate": 2.190768421826631e-06,
"loss": 0.0712,
"step": 495
},
{
"epoch": 0.5634762851462652,
"grad_norm": 0.8434695674081621,
"learning_rate": 2.1814480582952376e-06,
"loss": 0.0679,
"step": 496
},
{
"epoch": 0.5646123260437376,
"grad_norm": 0.7110416920872518,
"learning_rate": 2.1721321932556753e-06,
"loss": 0.0636,
"step": 497
},
{
"epoch": 0.5657483669412099,
"grad_norm": 0.8221680308072801,
"learning_rate": 2.1628209582637024e-06,
"loss": 0.0613,
"step": 498
},
{
"epoch": 0.5668844078386822,
"grad_norm": 0.7622218183878603,
"learning_rate": 2.1535144848096943e-06,
"loss": 0.0708,
"step": 499
},
{
"epoch": 0.5680204487361545,
"grad_norm": 0.7782722183282532,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.0757,
"step": 500
},
{
"epoch": 0.5680204487361545,
"eval_loss": 0.06981126964092255,
"eval_runtime": 11.0092,
"eval_samples_per_second": 51.775,
"eval_steps_per_second": 6.54,
"step": 500
},
{
"epoch": 0.5691564896336269,
"grad_norm": 1.0163760342592565,
"learning_rate": 2.134916348139019e-06,
"loss": 0.0686,
"step": 501
},
{
"epoch": 0.5702925305310991,
"grad_norm": 0.7322989198729203,
"learning_rate": 2.125624947559475e-06,
"loss": 0.0634,
"step": 502
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.6931871597884127,
"learning_rate": 2.116338833788437e-06,
"loss": 0.0616,
"step": 503
},
{
"epoch": 0.5725646123260437,
"grad_norm": 0.7359829849611855,
"learning_rate": 2.1070581379615253e-06,
"loss": 0.0671,
"step": 504
},
{
"epoch": 0.573700653223516,
"grad_norm": 0.6657442150083017,
"learning_rate": 2.0977829911378507e-06,
"loss": 0.0643,
"step": 505
},
{
"epoch": 0.5748366941209884,
"grad_norm": 0.6926944123781523,
"learning_rate": 2.088513524298165e-06,
"loss": 0.0735,
"step": 506
},
{
"epoch": 0.5759727350184607,
"grad_norm": 0.6648038490491777,
"learning_rate": 2.0792498683430072e-06,
"loss": 0.0642,
"step": 507
},
{
"epoch": 0.577108775915933,
"grad_norm": 0.7126968053240703,
"learning_rate": 2.0699921540908542e-06,
"loss": 0.0628,
"step": 508
},
{
"epoch": 0.5782448168134053,
"grad_norm": 0.6620429659025622,
"learning_rate": 2.0607405122762806e-06,
"loss": 0.0659,
"step": 509
},
{
"epoch": 0.5793808577108776,
"grad_norm": 0.7959716020355202,
"learning_rate": 2.0514950735481053e-06,
"loss": 0.0812,
"step": 510
},
{
"epoch": 0.5805168986083499,
"grad_norm": 0.7170097707725962,
"learning_rate": 2.0422559684675498e-06,
"loss": 0.0677,
"step": 511
},
{
"epoch": 0.5816529395058222,
"grad_norm": 0.6830917403644267,
"learning_rate": 2.033023327506393e-06,
"loss": 0.0694,
"step": 512
},
{
"epoch": 0.5827889804032945,
"grad_norm": 0.7191751035977458,
"learning_rate": 2.023797281045132e-06,
"loss": 0.0679,
"step": 513
},
{
"epoch": 0.5839250213007668,
"grad_norm": 0.7346106234856803,
"learning_rate": 2.014577959371134e-06,
"loss": 0.0718,
"step": 514
},
{
"epoch": 0.5850610621982392,
"grad_norm": 0.6813644114828487,
"learning_rate": 2.0053654926768044e-06,
"loss": 0.0593,
"step": 515
},
{
"epoch": 0.5861971030957115,
"grad_norm": 0.80336979808499,
"learning_rate": 1.996160011057746e-06,
"loss": 0.0729,
"step": 516
},
{
"epoch": 0.5873331439931837,
"grad_norm": 0.7514984326231717,
"learning_rate": 1.9869616445109146e-06,
"loss": 0.0673,
"step": 517
},
{
"epoch": 0.588469184890656,
"grad_norm": 0.7606405433072686,
"learning_rate": 1.9777705229327954e-06,
"loss": 0.0664,
"step": 518
},
{
"epoch": 0.5896052257881284,
"grad_norm": 0.7583627182956107,
"learning_rate": 1.9685867761175584e-06,
"loss": 0.0716,
"step": 519
},
{
"epoch": 0.5907412666856007,
"grad_norm": 0.700178818885243,
"learning_rate": 1.959410533755232e-06,
"loss": 0.067,
"step": 520
},
{
"epoch": 0.591877307583073,
"grad_norm": 0.7422231248355664,
"learning_rate": 1.9502419254298674e-06,
"loss": 0.0608,
"step": 521
},
{
"epoch": 0.5930133484805453,
"grad_norm": 0.6555516297292772,
"learning_rate": 1.9410810806177105e-06,
"loss": 0.0652,
"step": 522
},
{
"epoch": 0.5941493893780176,
"grad_norm": 0.8479046695169662,
"learning_rate": 1.931928128685375e-06,
"loss": 0.0778,
"step": 523
},
{
"epoch": 0.59528543027549,
"grad_norm": 0.8290532060534326,
"learning_rate": 1.922783198888011e-06,
"loss": 0.0741,
"step": 524
},
{
"epoch": 0.5964214711729622,
"grad_norm": 0.7494299014958242,
"learning_rate": 1.913646420367483e-06,
"loss": 0.0784,
"step": 525
},
{
"epoch": 0.5975575120704345,
"grad_norm": 0.7799356992873004,
"learning_rate": 1.9045179221505497e-06,
"loss": 0.0679,
"step": 526
},
{
"epoch": 0.5986935529679068,
"grad_norm": 0.669194670284561,
"learning_rate": 1.8953978331470322e-06,
"loss": 0.0667,
"step": 527
},
{
"epoch": 0.5998295938653792,
"grad_norm": 0.6788295538214086,
"learning_rate": 1.8862862821480023e-06,
"loss": 0.0727,
"step": 528
},
{
"epoch": 0.6009656347628515,
"grad_norm": 0.6939565933254241,
"learning_rate": 1.8771833978239615e-06,
"loss": 0.0694,
"step": 529
},
{
"epoch": 0.6021016756603238,
"grad_norm": 0.6537870877173654,
"learning_rate": 1.8680893087230207e-06,
"loss": 0.0653,
"step": 530
},
{
"epoch": 0.6032377165577961,
"grad_norm": 0.6637094837876996,
"learning_rate": 1.8590041432690895e-06,
"loss": 0.0614,
"step": 531
},
{
"epoch": 0.6043737574552683,
"grad_norm": 0.7093390779006853,
"learning_rate": 1.8499280297600594e-06,
"loss": 0.0617,
"step": 532
},
{
"epoch": 0.6055097983527407,
"grad_norm": 0.6755471903725316,
"learning_rate": 1.840861096365995e-06,
"loss": 0.064,
"step": 533
},
{
"epoch": 0.606645839250213,
"grad_norm": 0.774838419360385,
"learning_rate": 1.8318034711273181e-06,
"loss": 0.0637,
"step": 534
},
{
"epoch": 0.6077818801476853,
"grad_norm": 0.8065627164393625,
"learning_rate": 1.822755281953007e-06,
"loss": 0.0646,
"step": 535
},
{
"epoch": 0.6089179210451576,
"grad_norm": 0.7275113641331161,
"learning_rate": 1.813716656618788e-06,
"loss": 0.0582,
"step": 536
},
{
"epoch": 0.61005396194263,
"grad_norm": 0.7398892395016525,
"learning_rate": 1.8046877227653248e-06,
"loss": 0.0743,
"step": 537
},
{
"epoch": 0.6111900028401023,
"grad_norm": 0.7497489776755677,
"learning_rate": 1.7956686078964257e-06,
"loss": 0.07,
"step": 538
},
{
"epoch": 0.6123260437375746,
"grad_norm": 0.8216722715800929,
"learning_rate": 1.7866594393772375e-06,
"loss": 0.0686,
"step": 539
},
{
"epoch": 0.6134620846350468,
"grad_norm": 0.730615553098809,
"learning_rate": 1.7776603444324445e-06,
"loss": 0.0647,
"step": 540
},
{
"epoch": 0.6145981255325191,
"grad_norm": 0.7520686987142878,
"learning_rate": 1.7686714501444791e-06,
"loss": 0.0627,
"step": 541
},
{
"epoch": 0.6157341664299915,
"grad_norm": 0.7034476625476764,
"learning_rate": 1.759692883451721e-06,
"loss": 0.0718,
"step": 542
},
{
"epoch": 0.6168702073274638,
"grad_norm": 0.6589075115743108,
"learning_rate": 1.750724771146709e-06,
"loss": 0.0576,
"step": 543
},
{
"epoch": 0.6180062482249361,
"grad_norm": 0.7840335260584612,
"learning_rate": 1.741767239874344e-06,
"loss": 0.0628,
"step": 544
},
{
"epoch": 0.6191422891224084,
"grad_norm": 0.81310922260959,
"learning_rate": 1.7328204161301084e-06,
"loss": 0.078,
"step": 545
},
{
"epoch": 0.6202783300198808,
"grad_norm": 0.6591918365661746,
"learning_rate": 1.723884426258277e-06,
"loss": 0.0667,
"step": 546
},
{
"epoch": 0.6214143709173531,
"grad_norm": 0.7690500250019386,
"learning_rate": 1.7149593964501285e-06,
"loss": 0.0737,
"step": 547
},
{
"epoch": 0.6225504118148253,
"grad_norm": 0.708482385265233,
"learning_rate": 1.7060454527421688e-06,
"loss": 0.0659,
"step": 548
},
{
"epoch": 0.6236864527122976,
"grad_norm": 0.7262622089914782,
"learning_rate": 1.6971427210143503e-06,
"loss": 0.0649,
"step": 549
},
{
"epoch": 0.6248224936097699,
"grad_norm": 0.6756807736567372,
"learning_rate": 1.6882513269882916e-06,
"loss": 0.056,
"step": 550
},
{
"epoch": 0.6259585345072423,
"grad_norm": 0.7073460144734544,
"learning_rate": 1.6793713962255043e-06,
"loss": 0.0653,
"step": 551
},
{
"epoch": 0.6270945754047146,
"grad_norm": 0.7142037806342341,
"learning_rate": 1.6705030541256211e-06,
"loss": 0.0571,
"step": 552
},
{
"epoch": 0.6282306163021869,
"grad_norm": 0.7850076042825073,
"learning_rate": 1.661646425924619e-06,
"loss": 0.0619,
"step": 553
},
{
"epoch": 0.6293666571996592,
"grad_norm": 0.7204376620154082,
"learning_rate": 1.6528016366930594e-06,
"loss": 0.0732,
"step": 554
},
{
"epoch": 0.6305026980971314,
"grad_norm": 0.7608581987391684,
"learning_rate": 1.643968811334315e-06,
"loss": 0.0616,
"step": 555
},
{
"epoch": 0.6316387389946038,
"grad_norm": 0.8576111862705111,
"learning_rate": 1.6351480745828098e-06,
"loss": 0.0705,
"step": 556
},
{
"epoch": 0.6327747798920761,
"grad_norm": 0.6680467044643426,
"learning_rate": 1.6263395510022546e-06,
"loss": 0.0622,
"step": 557
},
{
"epoch": 0.6339108207895484,
"grad_norm": 0.8068649968829243,
"learning_rate": 1.6175433649838901e-06,
"loss": 0.0646,
"step": 558
},
{
"epoch": 0.6350468616870207,
"grad_norm": 0.6817820680144767,
"learning_rate": 1.6087596407447314e-06,
"loss": 0.0632,
"step": 559
},
{
"epoch": 0.6361829025844931,
"grad_norm": 0.7221690001860933,
"learning_rate": 1.5999885023258099e-06,
"loss": 0.0675,
"step": 560
},
{
"epoch": 0.6373189434819654,
"grad_norm": 0.7186532155986466,
"learning_rate": 1.5912300735904252e-06,
"loss": 0.0642,
"step": 561
},
{
"epoch": 0.6384549843794377,
"grad_norm": 0.6768918962633247,
"learning_rate": 1.5824844782223956e-06,
"loss": 0.065,
"step": 562
},
{
"epoch": 0.6395910252769099,
"grad_norm": 0.7455736530308621,
"learning_rate": 1.5737518397243074e-06,
"loss": 0.0606,
"step": 563
},
{
"epoch": 0.6407270661743822,
"grad_norm": 0.6548968684524288,
"learning_rate": 1.5650322814157764e-06,
"loss": 0.0636,
"step": 564
},
{
"epoch": 0.6418631070718546,
"grad_norm": 0.7155782582180678,
"learning_rate": 1.5563259264317048e-06,
"loss": 0.0717,
"step": 565
},
{
"epoch": 0.6429991479693269,
"grad_norm": 0.7240624483076444,
"learning_rate": 1.5476328977205396e-06,
"loss": 0.0673,
"step": 566
},
{
"epoch": 0.6441351888667992,
"grad_norm": 0.6718198739936723,
"learning_rate": 1.5389533180425387e-06,
"loss": 0.0663,
"step": 567
},
{
"epoch": 0.6452712297642715,
"grad_norm": 0.7380737125658104,
"learning_rate": 1.5302873099680378e-06,
"loss": 0.0703,
"step": 568
},
{
"epoch": 0.6464072706617439,
"grad_norm": 0.6948227028870951,
"learning_rate": 1.5216349958757187e-06,
"loss": 0.0643,
"step": 569
},
{
"epoch": 0.6475433115592162,
"grad_norm": 0.6963304517404344,
"learning_rate": 1.5129964979508792e-06,
"loss": 0.0671,
"step": 570
},
{
"epoch": 0.6486793524566884,
"grad_norm": 0.6972931207318768,
"learning_rate": 1.5043719381837113e-06,
"loss": 0.063,
"step": 571
},
{
"epoch": 0.6498153933541607,
"grad_norm": 0.6485613523241838,
"learning_rate": 1.495761438367577e-06,
"loss": 0.0596,
"step": 572
},
{
"epoch": 0.650951434251633,
"grad_norm": 0.7001853980454908,
"learning_rate": 1.4871651200972854e-06,
"loss": 0.067,
"step": 573
},
{
"epoch": 0.6520874751491054,
"grad_norm": 0.7272042380715532,
"learning_rate": 1.47858310476738e-06,
"loss": 0.0598,
"step": 574
},
{
"epoch": 0.6532235160465777,
"grad_norm": 0.7187945090994169,
"learning_rate": 1.470015513570424e-06,
"loss": 0.0694,
"step": 575
},
{
"epoch": 0.65435955694405,
"grad_norm": 0.7007096317128729,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.0669,
"step": 576
},
{
"epoch": 0.6554955978415223,
"grad_norm": 0.7244179388243616,
"learning_rate": 1.452924087325428e-06,
"loss": 0.067,
"step": 577
},
{
"epoch": 0.6566316387389945,
"grad_norm": 0.7033512029008061,
"learning_rate": 1.4444004936372166e-06,
"loss": 0.0621,
"step": 578
},
{
"epoch": 0.6577676796364669,
"grad_norm": 0.6990155654636586,
"learning_rate": 1.4358918067981969e-06,
"loss": 0.0596,
"step": 579
},
{
"epoch": 0.6589037205339392,
"grad_norm": 0.760212990286638,
"learning_rate": 1.4273981469654093e-06,
"loss": 0.0624,
"step": 580
},
{
"epoch": 0.6600397614314115,
"grad_norm": 0.6977342847520188,
"learning_rate": 1.4189196340836866e-06,
"loss": 0.0568,
"step": 581
},
{
"epoch": 0.6611758023288838,
"grad_norm": 0.6780400957221138,
"learning_rate": 1.4104563878839623e-06,
"loss": 0.0586,
"step": 582
},
{
"epoch": 0.6623118432263562,
"grad_norm": 0.6673962911333318,
"learning_rate": 1.4020085278815745e-06,
"loss": 0.0653,
"step": 583
},
{
"epoch": 0.6634478841238285,
"grad_norm": 0.7198642911954184,
"learning_rate": 1.3935761733745865e-06,
"loss": 0.0635,
"step": 584
},
{
"epoch": 0.6645839250213008,
"grad_norm": 0.7115497850987673,
"learning_rate": 1.3851594434420968e-06,
"loss": 0.0622,
"step": 585
},
{
"epoch": 0.665719965918773,
"grad_norm": 0.6864557026455775,
"learning_rate": 1.3767584569425562e-06,
"loss": 0.0605,
"step": 586
},
{
"epoch": 0.6668560068162454,
"grad_norm": 0.7558879903480649,
"learning_rate": 1.3683733325120934e-06,
"loss": 0.0629,
"step": 587
},
{
"epoch": 0.6679920477137177,
"grad_norm": 0.6666337739350352,
"learning_rate": 1.360004188562841e-06,
"loss": 0.0645,
"step": 588
},
{
"epoch": 0.66912808861119,
"grad_norm": 0.7137577962203598,
"learning_rate": 1.351651143281253e-06,
"loss": 0.0643,
"step": 589
},
{
"epoch": 0.6702641295086623,
"grad_norm": 0.6319040066279115,
"learning_rate": 1.3433143146264494e-06,
"loss": 0.0508,
"step": 590
},
{
"epoch": 0.6714001704061346,
"grad_norm": 0.6909411839212513,
"learning_rate": 1.3349938203285412e-06,
"loss": 0.0545,
"step": 591
},
{
"epoch": 0.672536211303607,
"grad_norm": 0.7930681686093434,
"learning_rate": 1.3266897778869704e-06,
"loss": 0.0743,
"step": 592
},
{
"epoch": 0.6736722522010793,
"grad_norm": 0.7524271746971427,
"learning_rate": 1.3184023045688515e-06,
"loss": 0.065,
"step": 593
},
{
"epoch": 0.6748082930985515,
"grad_norm": 0.7593574572396692,
"learning_rate": 1.3101315174073162e-06,
"loss": 0.071,
"step": 594
},
{
"epoch": 0.6759443339960238,
"grad_norm": 0.7307972692757313,
"learning_rate": 1.301877533199859e-06,
"loss": 0.063,
"step": 595
},
{
"epoch": 0.6770803748934962,
"grad_norm": 0.71668422907411,
"learning_rate": 1.2936404685066852e-06,
"loss": 0.0642,
"step": 596
},
{
"epoch": 0.6782164157909685,
"grad_norm": 0.6457093799654868,
"learning_rate": 1.2854204396490722e-06,
"loss": 0.0551,
"step": 597
},
{
"epoch": 0.6793524566884408,
"grad_norm": 0.719388282295858,
"learning_rate": 1.2772175627077204e-06,
"loss": 0.0611,
"step": 598
},
{
"epoch": 0.6804884975859131,
"grad_norm": 0.6880869590502872,
"learning_rate": 1.2690319535211171e-06,
"loss": 0.0609,
"step": 599
},
{
"epoch": 0.6816245384833854,
"grad_norm": 0.6516612119128604,
"learning_rate": 1.2608637276838987e-06,
"loss": 0.054,
"step": 600
},
{
"epoch": 0.6827605793808577,
"grad_norm": 0.7511572391468219,
"learning_rate": 1.2527130005452212e-06,
"loss": 0.0663,
"step": 601
},
{
"epoch": 0.68389662027833,
"grad_norm": 0.7651843581040089,
"learning_rate": 1.244579887207126e-06,
"loss": 0.0699,
"step": 602
},
{
"epoch": 0.6850326611758023,
"grad_norm": 0.7373402269865651,
"learning_rate": 1.236464502522921e-06,
"loss": 0.0684,
"step": 603
},
{
"epoch": 0.6861687020732746,
"grad_norm": 0.6786418211884958,
"learning_rate": 1.2283669610955543e-06,
"loss": 0.0671,
"step": 604
},
{
"epoch": 0.687304742970747,
"grad_norm": 0.698068956478052,
"learning_rate": 1.2202873772759983e-06,
"loss": 0.0618,
"step": 605
},
{
"epoch": 0.6884407838682193,
"grad_norm": 0.6851653623325352,
"learning_rate": 1.2122258651616305e-06,
"loss": 0.059,
"step": 606
},
{
"epoch": 0.6895768247656916,
"grad_norm": 0.6974766060397587,
"learning_rate": 1.2041825385946288e-06,
"loss": 0.0659,
"step": 607
},
{
"epoch": 0.6907128656631639,
"grad_norm": 0.7519282757382589,
"learning_rate": 1.1961575111603588e-06,
"loss": 0.067,
"step": 608
},
{
"epoch": 0.6918489065606361,
"grad_norm": 0.8372671010396519,
"learning_rate": 1.1881508961857716e-06,
"loss": 0.0726,
"step": 609
},
{
"epoch": 0.6929849474581085,
"grad_norm": 0.7072832731190405,
"learning_rate": 1.1801628067378033e-06,
"loss": 0.0656,
"step": 610
},
{
"epoch": 0.6941209883555808,
"grad_norm": 0.7352876648138319,
"learning_rate": 1.1721933556217793e-06,
"loss": 0.0733,
"step": 611
},
{
"epoch": 0.6952570292530531,
"grad_norm": 0.686736638297048,
"learning_rate": 1.1642426553798175e-06,
"loss": 0.062,
"step": 612
},
{
"epoch": 0.6963930701505254,
"grad_norm": 0.6804914622631124,
"learning_rate": 1.1563108182892447e-06,
"loss": 0.0558,
"step": 613
},
{
"epoch": 0.6975291110479978,
"grad_norm": 0.7283887982191483,
"learning_rate": 1.148397956361007e-06,
"loss": 0.0644,
"step": 614
},
{
"epoch": 0.6986651519454701,
"grad_norm": 0.6925134320068936,
"learning_rate": 1.1405041813380879e-06,
"loss": 0.0679,
"step": 615
},
{
"epoch": 0.6998011928429424,
"grad_norm": 0.7087695259064026,
"learning_rate": 1.1326296046939334e-06,
"loss": 0.0562,
"step": 616
},
{
"epoch": 0.7009372337404146,
"grad_norm": 0.6799067610660522,
"learning_rate": 1.1247743376308754e-06,
"loss": 0.0554,
"step": 617
},
{
"epoch": 0.7020732746378869,
"grad_norm": 0.650598765409007,
"learning_rate": 1.1169384910785613e-06,
"loss": 0.0512,
"step": 618
},
{
"epoch": 0.7032093155353593,
"grad_norm": 0.7180760489339335,
"learning_rate": 1.1091221756923888e-06,
"loss": 0.0676,
"step": 619
},
{
"epoch": 0.7043453564328316,
"grad_norm": 0.771129010138558,
"learning_rate": 1.1013255018519426e-06,
"loss": 0.0744,
"step": 620
},
{
"epoch": 0.7054813973303039,
"grad_norm": 0.7354640458010835,
"learning_rate": 1.0935485796594352e-06,
"loss": 0.069,
"step": 621
},
{
"epoch": 0.7066174382277762,
"grad_norm": 0.7116533320646369,
"learning_rate": 1.0857915189381512e-06,
"loss": 0.0625,
"step": 622
},
{
"epoch": 0.7077534791252486,
"grad_norm": 0.7329261029533441,
"learning_rate": 1.0780544292308998e-06,
"loss": 0.0607,
"step": 623
},
{
"epoch": 0.7088895200227208,
"grad_norm": 0.7486549384519606,
"learning_rate": 1.0703374197984654e-06,
"loss": 0.0682,
"step": 624
},
{
"epoch": 0.7100255609201931,
"grad_norm": 0.7165004116848969,
"learning_rate": 1.0626405996180628e-06,
"loss": 0.0613,
"step": 625
},
{
"epoch": 0.7111616018176654,
"grad_norm": 0.7262023853060127,
"learning_rate": 1.054964077381803e-06,
"loss": 0.0645,
"step": 626
},
{
"epoch": 0.7122976427151377,
"grad_norm": 0.7113638076418349,
"learning_rate": 1.0473079614951546e-06,
"loss": 0.0655,
"step": 627
},
{
"epoch": 0.7134336836126101,
"grad_norm": 0.7029593419874601,
"learning_rate": 1.0396723600754144e-06,
"loss": 0.0596,
"step": 628
},
{
"epoch": 0.7145697245100824,
"grad_norm": 0.6926854888222683,
"learning_rate": 1.0320573809501796e-06,
"loss": 0.0592,
"step": 629
},
{
"epoch": 0.7157057654075547,
"grad_norm": 0.677936567695018,
"learning_rate": 1.0244631316558268e-06,
"loss": 0.0575,
"step": 630
},
{
"epoch": 0.716841806305027,
"grad_norm": 0.6830938530740739,
"learning_rate": 1.0168897194359922e-06,
"loss": 0.0561,
"step": 631
},
{
"epoch": 0.7179778472024992,
"grad_norm": 0.7493058249527598,
"learning_rate": 1.009337251240055e-06,
"loss": 0.0643,
"step": 632
},
{
"epoch": 0.7191138880999716,
"grad_norm": 0.6948071010614205,
"learning_rate": 1.0018058337216327e-06,
"loss": 0.0598,
"step": 633
},
{
"epoch": 0.7202499289974439,
"grad_norm": 0.6969420266325782,
"learning_rate": 9.942955732370706e-07,
"loss": 0.0609,
"step": 634
},
{
"epoch": 0.7213859698949162,
"grad_norm": 0.7578662249147158,
"learning_rate": 9.868065758439388e-07,
"loss": 0.0665,
"step": 635
},
{
"epoch": 0.7225220107923885,
"grad_norm": 0.6911037252532237,
"learning_rate": 9.793389472995393e-07,
"loss": 0.0576,
"step": 636
},
{
"epoch": 0.7236580516898609,
"grad_norm": 0.6739642804854075,
"learning_rate": 9.718927930594087e-07,
"loss": 0.0553,
"step": 637
},
{
"epoch": 0.7247940925873332,
"grad_norm": 0.7079769503814837,
"learning_rate": 9.644682182758305e-07,
"loss": 0.0619,
"step": 638
},
{
"epoch": 0.7259301334848055,
"grad_norm": 0.7111145505806309,
"learning_rate": 9.570653277963493e-07,
"loss": 0.0641,
"step": 639
},
{
"epoch": 0.7270661743822777,
"grad_norm": 0.6878608709979896,
"learning_rate": 9.496842261622921e-07,
"loss": 0.0556,
"step": 640
},
{
"epoch": 0.72820221527975,
"grad_norm": 0.708762829460775,
"learning_rate": 9.423250176072877e-07,
"loss": 0.0615,
"step": 641
},
{
"epoch": 0.7293382561772224,
"grad_norm": 0.7077145100832107,
"learning_rate": 9.349878060557998e-07,
"loss": 0.0567,
"step": 642
},
{
"epoch": 0.7304742970746947,
"grad_norm": 0.6814892554720623,
"learning_rate": 9.276726951216572e-07,
"loss": 0.0543,
"step": 643
},
{
"epoch": 0.731610337972167,
"grad_norm": 0.6926696110783388,
"learning_rate": 9.203797881065907e-07,
"loss": 0.0562,
"step": 644
},
{
"epoch": 0.7327463788696393,
"grad_norm": 0.7485805520199306,
"learning_rate": 9.131091879987725e-07,
"loss": 0.0653,
"step": 645
},
{
"epoch": 0.7338824197671117,
"grad_norm": 0.7149870147688834,
"learning_rate": 9.058609974713655e-07,
"loss": 0.0557,
"step": 646
},
{
"epoch": 0.7350184606645839,
"grad_norm": 0.6706533054717475,
"learning_rate": 8.986353188810706e-07,
"loss": 0.0504,
"step": 647
},
{
"epoch": 0.7361545015620562,
"grad_norm": 0.6570525705060476,
"learning_rate": 8.914322542666822e-07,
"loss": 0.057,
"step": 648
},
{
"epoch": 0.7372905424595285,
"grad_norm": 0.8225152469323989,
"learning_rate": 8.842519053476476e-07,
"loss": 0.0756,
"step": 649
},
{
"epoch": 0.7384265833570008,
"grad_norm": 0.7687986898101944,
"learning_rate": 8.770943735226303e-07,
"loss": 0.077,
"step": 650
},
{
"epoch": 0.7395626242544732,
"grad_norm": 0.6760083488102284,
"learning_rate": 8.699597598680753e-07,
"loss": 0.0616,
"step": 651
},
{
"epoch": 0.7406986651519455,
"grad_norm": 0.7111445133059799,
"learning_rate": 8.628481651367876e-07,
"loss": 0.0609,
"step": 652
},
{
"epoch": 0.7418347060494178,
"grad_norm": 0.715680620040158,
"learning_rate": 8.557596897565043e-07,
"loss": 0.0638,
"step": 653
},
{
"epoch": 0.7429707469468901,
"grad_norm": 0.7033591520489146,
"learning_rate": 8.486944338284797e-07,
"loss": 0.0581,
"step": 654
},
{
"epoch": 0.7441067878443623,
"grad_norm": 0.8037728386903387,
"learning_rate": 8.416524971260673e-07,
"loss": 0.0612,
"step": 655
},
{
"epoch": 0.7452428287418347,
"grad_norm": 0.6785930994355049,
"learning_rate": 8.346339790933167e-07,
"loss": 0.0581,
"step": 656
},
{
"epoch": 0.746378869639307,
"grad_norm": 0.606320813922669,
"learning_rate": 8.276389788435648e-07,
"loss": 0.0498,
"step": 657
},
{
"epoch": 0.7475149105367793,
"grad_norm": 0.74087511760928,
"learning_rate": 8.206675951580382e-07,
"loss": 0.066,
"step": 658
},
{
"epoch": 0.7486509514342516,
"grad_norm": 0.7997387497978912,
"learning_rate": 8.137199264844572e-07,
"loss": 0.0611,
"step": 659
},
{
"epoch": 0.749786992331724,
"grad_norm": 0.6899912005637121,
"learning_rate": 8.067960709356479e-07,
"loss": 0.0598,
"step": 660
},
{
"epoch": 0.7509230332291963,
"grad_norm": 0.681907855332953,
"learning_rate": 7.998961262881507e-07,
"loss": 0.0585,
"step": 661
},
{
"epoch": 0.7520590741266686,
"grad_norm": 0.7119725078654311,
"learning_rate": 7.930201899808476e-07,
"loss": 0.0656,
"step": 662
},
{
"epoch": 0.7531951150241408,
"grad_norm": 0.6825623566073593,
"learning_rate": 7.861683591135816e-07,
"loss": 0.057,
"step": 663
},
{
"epoch": 0.7543311559216132,
"grad_norm": 0.7481916867742439,
"learning_rate": 7.793407304457836e-07,
"loss": 0.0672,
"step": 664
},
{
"epoch": 0.7554671968190855,
"grad_norm": 0.7492624248158564,
"learning_rate": 7.725374003951117e-07,
"loss": 0.0621,
"step": 665
},
{
"epoch": 0.7566032377165578,
"grad_norm": 0.7248814424445004,
"learning_rate": 7.657584650360847e-07,
"loss": 0.0584,
"step": 666
},
{
"epoch": 0.7577392786140301,
"grad_norm": 0.7043497561761057,
"learning_rate": 7.590040200987275e-07,
"loss": 0.0648,
"step": 667
},
{
"epoch": 0.7588753195115024,
"grad_norm": 0.7143918824523554,
"learning_rate": 7.522741609672194e-07,
"loss": 0.0633,
"step": 668
},
{
"epoch": 0.7600113604089748,
"grad_norm": 0.7755831116052598,
"learning_rate": 7.455689826785456e-07,
"loss": 0.069,
"step": 669
},
{
"epoch": 0.761147401306447,
"grad_norm": 0.8501109028445187,
"learning_rate": 7.388885799211573e-07,
"loss": 0.0645,
"step": 670
},
{
"epoch": 0.7622834422039193,
"grad_norm": 0.7699716418147975,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0679,
"step": 671
},
{
"epoch": 0.7634194831013916,
"grad_norm": 0.6784245576570962,
"learning_rate": 7.256024780033418e-07,
"loss": 0.0564,
"step": 672
},
{
"epoch": 0.764555523998864,
"grad_norm": 0.6756586477723381,
"learning_rate": 7.189969664651314e-07,
"loss": 0.0561,
"step": 673
},
{
"epoch": 0.7656915648963363,
"grad_norm": 0.6751139607665853,
"learning_rate": 7.124166056999854e-07,
"loss": 0.0597,
"step": 674
},
{
"epoch": 0.7668276057938086,
"grad_norm": 0.7239702050922359,
"learning_rate": 7.058614886337212e-07,
"loss": 0.0669,
"step": 675
},
{
"epoch": 0.7679636466912809,
"grad_norm": 0.8123814274934885,
"learning_rate": 6.993317078356709e-07,
"loss": 0.0645,
"step": 676
},
{
"epoch": 0.7690996875887532,
"grad_norm": 0.6566260710318882,
"learning_rate": 6.928273555173762e-07,
"loss": 0.0612,
"step": 677
},
{
"epoch": 0.7702357284862255,
"grad_norm": 0.7245239279676755,
"learning_rate": 6.863485235312853e-07,
"loss": 0.0632,
"step": 678
},
{
"epoch": 0.7713717693836978,
"grad_norm": 0.7311867799854507,
"learning_rate": 6.798953033694558e-07,
"loss": 0.0635,
"step": 679
},
{
"epoch": 0.7725078102811701,
"grad_norm": 0.6374305064275428,
"learning_rate": 6.734677861622652e-07,
"loss": 0.0506,
"step": 680
},
{
"epoch": 0.7736438511786424,
"grad_norm": 0.6990838725244343,
"learning_rate": 6.67066062677118e-07,
"loss": 0.0643,
"step": 681
},
{
"epoch": 0.7747798920761148,
"grad_norm": 0.6830770597944112,
"learning_rate": 6.60690223317171e-07,
"loss": 0.0592,
"step": 682
},
{
"epoch": 0.7759159329735871,
"grad_norm": 0.7193721168009378,
"learning_rate": 6.54340358120053e-07,
"loss": 0.0543,
"step": 683
},
{
"epoch": 0.7770519738710594,
"grad_norm": 0.6929443175270855,
"learning_rate": 6.480165567565913e-07,
"loss": 0.0632,
"step": 684
},
{
"epoch": 0.7781880147685317,
"grad_norm": 0.7544691021614793,
"learning_rate": 6.417189085295508e-07,
"loss": 0.0643,
"step": 685
},
{
"epoch": 0.7793240556660039,
"grad_norm": 0.701467040161313,
"learning_rate": 6.354475023723685e-07,
"loss": 0.0667,
"step": 686
},
{
"epoch": 0.7804600965634763,
"grad_norm": 0.7755055024328538,
"learning_rate": 6.292024268478991e-07,
"loss": 0.0637,
"step": 687
},
{
"epoch": 0.7815961374609486,
"grad_norm": 0.7413535405232983,
"learning_rate": 6.229837701471645e-07,
"loss": 0.0667,
"step": 688
},
{
"epoch": 0.7827321783584209,
"grad_norm": 0.7579433832862532,
"learning_rate": 6.167916200881085e-07,
"loss": 0.0639,
"step": 689
},
{
"epoch": 0.7838682192558932,
"grad_norm": 0.7058994563449785,
"learning_rate": 6.106260641143547e-07,
"loss": 0.0566,
"step": 690
},
{
"epoch": 0.7850042601533656,
"grad_norm": 0.7474580592638266,
"learning_rate": 6.044871892939746e-07,
"loss": 0.0585,
"step": 691
},
{
"epoch": 0.7861403010508379,
"grad_norm": 0.7095617754477044,
"learning_rate": 5.983750823182574e-07,
"loss": 0.0604,
"step": 692
},
{
"epoch": 0.7872763419483101,
"grad_norm": 0.6270813452574567,
"learning_rate": 5.922898295004842e-07,
"loss": 0.0569,
"step": 693
},
{
"epoch": 0.7884123828457824,
"grad_norm": 0.7683343290015114,
"learning_rate": 5.86231516774709e-07,
"loss": 0.0641,
"step": 694
},
{
"epoch": 0.7895484237432547,
"grad_norm": 0.6205544191304064,
"learning_rate": 5.802002296945475e-07,
"loss": 0.0521,
"step": 695
},
{
"epoch": 0.7906844646407271,
"grad_norm": 0.6855169824530946,
"learning_rate": 5.741960534319677e-07,
"loss": 0.0541,
"step": 696
},
{
"epoch": 0.7918205055381994,
"grad_norm": 0.677963115259813,
"learning_rate": 5.682190727760864e-07,
"loss": 0.061,
"step": 697
},
{
"epoch": 0.7929565464356717,
"grad_norm": 0.7157962748647665,
"learning_rate": 5.622693721319728e-07,
"loss": 0.0602,
"step": 698
},
{
"epoch": 0.794092587333144,
"grad_norm": 0.7021846226730226,
"learning_rate": 5.563470355194564e-07,
"loss": 0.0689,
"step": 699
},
{
"epoch": 0.7952286282306164,
"grad_norm": 0.6730377226852833,
"learning_rate": 5.504521465719392e-07,
"loss": 0.0624,
"step": 700
},
{
"epoch": 0.7963646691280886,
"grad_norm": 0.7117566338537665,
"learning_rate": 5.445847885352171e-07,
"loss": 0.0544,
"step": 701
},
{
"epoch": 0.7975007100255609,
"grad_norm": 0.6514495814244172,
"learning_rate": 5.387450442663026e-07,
"loss": 0.0547,
"step": 702
},
{
"epoch": 0.7986367509230332,
"grad_norm": 0.7041482161296513,
"learning_rate": 5.329329962322554e-07,
"loss": 0.0637,
"step": 703
},
{
"epoch": 0.7997727918205055,
"grad_norm": 0.662553486771611,
"learning_rate": 5.271487265090163e-07,
"loss": 0.0617,
"step": 704
},
{
"epoch": 0.8009088327179779,
"grad_norm": 0.771571552061494,
"learning_rate": 5.213923167802506e-07,
"loss": 0.068,
"step": 705
},
{
"epoch": 0.8020448736154502,
"grad_norm": 0.7413005333218508,
"learning_rate": 5.156638483361933e-07,
"loss": 0.0592,
"step": 706
},
{
"epoch": 0.8031809145129225,
"grad_norm": 0.7075709987524001,
"learning_rate": 5.099634020725012e-07,
"loss": 0.0559,
"step": 707
},
{
"epoch": 0.8043169554103948,
"grad_norm": 0.7275257681800995,
"learning_rate": 5.0429105848911e-07,
"loss": 0.0696,
"step": 708
},
{
"epoch": 0.805452996307867,
"grad_norm": 0.6828000250150597,
"learning_rate": 4.986468976890993e-07,
"loss": 0.0659,
"step": 709
},
{
"epoch": 0.8065890372053394,
"grad_norm": 0.7757907559807298,
"learning_rate": 4.930309993775578e-07,
"loss": 0.064,
"step": 710
},
{
"epoch": 0.8077250781028117,
"grad_norm": 0.6173272258086845,
"learning_rate": 4.874434428604625e-07,
"loss": 0.0437,
"step": 711
},
{
"epoch": 0.808861119000284,
"grad_norm": 0.7177886268777128,
"learning_rate": 4.818843070435561e-07,
"loss": 0.0577,
"step": 712
},
{
"epoch": 0.8099971598977563,
"grad_norm": 0.68285187824597,
"learning_rate": 4.763536704312305e-07,
"loss": 0.0567,
"step": 713
},
{
"epoch": 0.8111332007952287,
"grad_norm": 0.771227288945189,
"learning_rate": 4.708516111254238e-07,
"loss": 0.071,
"step": 714
},
{
"epoch": 0.812269241692701,
"grad_norm": 0.7283356464831592,
"learning_rate": 4.6537820682451273e-07,
"loss": 0.0641,
"step": 715
},
{
"epoch": 0.8134052825901732,
"grad_norm": 0.7110559383473583,
"learning_rate": 4.5993353482221697e-07,
"loss": 0.0663,
"step": 716
},
{
"epoch": 0.8145413234876455,
"grad_norm": 0.6511668989401792,
"learning_rate": 4.545176720065078e-07,
"loss": 0.0574,
"step": 717
},
{
"epoch": 0.8156773643851178,
"grad_norm": 0.7393080837168933,
"learning_rate": 4.4913069485852197e-07,
"loss": 0.0604,
"step": 718
},
{
"epoch": 0.8168134052825902,
"grad_norm": 0.7436931187552859,
"learning_rate": 4.437726794514824e-07,
"loss": 0.0562,
"step": 719
},
{
"epoch": 0.8179494461800625,
"grad_norm": 0.7077518355152435,
"learning_rate": 4.3844370144962153e-07,
"loss": 0.0595,
"step": 720
},
{
"epoch": 0.8190854870775348,
"grad_norm": 0.6832077000007746,
"learning_rate": 4.3314383610711633e-07,
"loss": 0.0688,
"step": 721
},
{
"epoch": 0.8202215279750071,
"grad_norm": 0.7105084721185602,
"learning_rate": 4.2787315826702396e-07,
"loss": 0.0656,
"step": 722
},
{
"epoch": 0.8213575688724795,
"grad_norm": 0.6614284120793984,
"learning_rate": 4.2263174236022245e-07,
"loss": 0.0589,
"step": 723
},
{
"epoch": 0.8224936097699517,
"grad_norm": 0.6765730172646925,
"learning_rate": 4.1741966240436446e-07,
"loss": 0.0586,
"step": 724
},
{
"epoch": 0.823629650667424,
"grad_norm": 0.7338387674973738,
"learning_rate": 4.122369920028277e-07,
"loss": 0.0659,
"step": 725
},
{
"epoch": 0.8247656915648963,
"grad_norm": 0.7223755947250412,
"learning_rate": 4.070838043436787e-07,
"loss": 0.0608,
"step": 726
},
{
"epoch": 0.8259017324623686,
"grad_norm": 0.678695491674745,
"learning_rate": 4.019601721986363e-07,
"loss": 0.0667,
"step": 727
},
{
"epoch": 0.827037773359841,
"grad_norm": 0.6696005070496583,
"learning_rate": 3.9686616792204677e-07,
"loss": 0.0584,
"step": 728
},
{
"epoch": 0.8281738142573133,
"grad_norm": 0.6789148636755675,
"learning_rate": 3.9180186344986103e-07,
"loss": 0.0604,
"step": 729
},
{
"epoch": 0.8293098551547856,
"grad_norm": 0.7335609599059305,
"learning_rate": 3.867673302986161e-07,
"loss": 0.0585,
"step": 730
},
{
"epoch": 0.8304458960522578,
"grad_norm": 0.739136862580604,
"learning_rate": 3.8176263956443056e-07,
"loss": 0.0623,
"step": 731
},
{
"epoch": 0.8315819369497301,
"grad_norm": 0.7130221754855517,
"learning_rate": 3.7678786192199695e-07,
"loss": 0.059,
"step": 732
},
{
"epoch": 0.8327179778472025,
"grad_norm": 0.7513521882454077,
"learning_rate": 3.7184306762358235e-07,
"loss": 0.0595,
"step": 733
},
{
"epoch": 0.8338540187446748,
"grad_norm": 0.716760774039175,
"learning_rate": 3.6692832649804085e-07,
"loss": 0.0586,
"step": 734
},
{
"epoch": 0.8349900596421471,
"grad_norm": 0.6871206927436988,
"learning_rate": 3.6204370794982376e-07,
"loss": 0.0674,
"step": 735
},
{
"epoch": 0.8361261005396194,
"grad_norm": 0.692975693286346,
"learning_rate": 3.571892809580013e-07,
"loss": 0.0537,
"step": 736
},
{
"epoch": 0.8372621414370918,
"grad_norm": 0.680048611874437,
"learning_rate": 3.5236511407528676e-07,
"loss": 0.0607,
"step": 737
},
{
"epoch": 0.8383981823345641,
"grad_norm": 0.7149557473910435,
"learning_rate": 3.475712754270716e-07,
"loss": 0.0581,
"step": 738
},
{
"epoch": 0.8395342232320363,
"grad_norm": 0.6836636678136013,
"learning_rate": 3.4280783271045863e-07,
"loss": 0.0563,
"step": 739
},
{
"epoch": 0.8406702641295086,
"grad_norm": 0.6678612129832644,
"learning_rate": 3.3807485319331037e-07,
"loss": 0.0637,
"step": 740
},
{
"epoch": 0.841806305026981,
"grad_norm": 0.7068459150295527,
"learning_rate": 3.333724037132977e-07,
"loss": 0.061,
"step": 741
},
{
"epoch": 0.8429423459244533,
"grad_norm": 0.6444850358901095,
"learning_rate": 3.2870055067695557e-07,
"loss": 0.0479,
"step": 742
},
{
"epoch": 0.8440783868219256,
"grad_norm": 0.6551043511985852,
"learning_rate": 3.240593600587444e-07,
"loss": 0.0459,
"step": 743
},
{
"epoch": 0.8452144277193979,
"grad_norm": 0.7533706586347682,
"learning_rate": 3.194488974001203e-07,
"loss": 0.0727,
"step": 744
},
{
"epoch": 0.8463504686168702,
"grad_norm": 0.7193017131365196,
"learning_rate": 3.148692278086088e-07,
"loss": 0.0631,
"step": 745
},
{
"epoch": 0.8474865095143426,
"grad_norm": 0.6482741683877096,
"learning_rate": 3.1032041595688514e-07,
"loss": 0.0475,
"step": 746
},
{
"epoch": 0.8486225504118148,
"grad_norm": 0.6625125560890016,
"learning_rate": 3.058025260818609e-07,
"loss": 0.0625,
"step": 747
},
{
"epoch": 0.8497585913092871,
"grad_norm": 0.6797786911558783,
"learning_rate": 3.0131562198377763e-07,
"loss": 0.061,
"step": 748
},
{
"epoch": 0.8508946322067594,
"grad_norm": 0.7285783384018145,
"learning_rate": 2.96859767025304e-07,
"loss": 0.0652,
"step": 749
},
{
"epoch": 0.8520306731042318,
"grad_norm": 0.6758942124222337,
"learning_rate": 2.9243502413064365e-07,
"loss": 0.0601,
"step": 750
},
{
"epoch": 0.8531667140017041,
"grad_norm": 0.6569659983478026,
"learning_rate": 2.8804145578464533e-07,
"loss": 0.0549,
"step": 751
},
{
"epoch": 0.8543027548991764,
"grad_norm": 0.6561252291916106,
"learning_rate": 2.8367912403191976e-07,
"loss": 0.06,
"step": 752
},
{
"epoch": 0.8554387957966487,
"grad_norm": 0.6900748230193315,
"learning_rate": 2.7934809047596436e-07,
"loss": 0.0533,
"step": 753
},
{
"epoch": 0.8565748366941209,
"grad_norm": 0.6719565408280884,
"learning_rate": 2.7504841627829293e-07,
"loss": 0.0614,
"step": 754
},
{
"epoch": 0.8577108775915933,
"grad_norm": 0.6697878517856197,
"learning_rate": 2.7078016215757343e-07,
"loss": 0.0592,
"step": 755
},
{
"epoch": 0.8588469184890656,
"grad_norm": 0.6903304793443591,
"learning_rate": 2.6654338838876664e-07,
"loss": 0.0586,
"step": 756
},
{
"epoch": 0.8599829593865379,
"grad_norm": 0.6357831598645269,
"learning_rate": 2.623381548022802e-07,
"loss": 0.0542,
"step": 757
},
{
"epoch": 0.8611190002840102,
"grad_norm": 0.7011758996549884,
"learning_rate": 2.581645207831204e-07,
"loss": 0.0595,
"step": 758
},
{
"epoch": 0.8622550411814826,
"grad_norm": 0.7179364439414111,
"learning_rate": 2.5402254527005286e-07,
"loss": 0.0634,
"step": 759
},
{
"epoch": 0.8633910820789549,
"grad_norm": 0.7201197976310817,
"learning_rate": 2.4991228675477293e-07,
"loss": 0.0629,
"step": 760
},
{
"epoch": 0.8645271229764272,
"grad_norm": 0.702925778220309,
"learning_rate": 2.458338032810781e-07,
"loss": 0.0621,
"step": 761
},
{
"epoch": 0.8656631638738994,
"grad_norm": 0.6556798289743799,
"learning_rate": 2.4178715244404796e-07,
"loss": 0.0614,
"step": 762
},
{
"epoch": 0.8667992047713717,
"grad_norm": 0.6606948072301375,
"learning_rate": 2.3777239138923214e-07,
"loss": 0.057,
"step": 763
},
{
"epoch": 0.8679352456688441,
"grad_norm": 0.6761035461766024,
"learning_rate": 2.3378957681184283e-07,
"loss": 0.0523,
"step": 764
},
{
"epoch": 0.8690712865663164,
"grad_norm": 0.644129680660782,
"learning_rate": 2.298387649559533e-07,
"loss": 0.0553,
"step": 765
},
{
"epoch": 0.8702073274637887,
"grad_norm": 0.6581351644159102,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.0544,
"step": 766
},
{
"epoch": 0.871343368361261,
"grad_norm": 0.6811793665508726,
"learning_rate": 2.2203337212451632e-07,
"loss": 0.0567,
"step": 767
},
{
"epoch": 0.8724794092587334,
"grad_norm": 0.6377187494528523,
"learning_rate": 2.1817890137430936e-07,
"loss": 0.0503,
"step": 768
},
{
"epoch": 0.8736154501562057,
"grad_norm": 0.6913242736410589,
"learning_rate": 2.1435665379472393e-07,
"loss": 0.0526,
"step": 769
},
{
"epoch": 0.8747514910536779,
"grad_norm": 0.7004250425383008,
"learning_rate": 2.1056668336235624e-07,
"loss": 0.0545,
"step": 770
},
{
"epoch": 0.8758875319511502,
"grad_norm": 0.7005696669465531,
"learning_rate": 2.0680904359799582e-07,
"loss": 0.0592,
"step": 771
},
{
"epoch": 0.8770235728486225,
"grad_norm": 0.6799341594364471,
"learning_rate": 2.0308378756586562e-07,
"loss": 0.0577,
"step": 772
},
{
"epoch": 0.8781596137460949,
"grad_norm": 0.664142756885439,
"learning_rate": 1.9939096787287783e-07,
"loss": 0.0539,
"step": 773
},
{
"epoch": 0.8792956546435672,
"grad_norm": 0.6412395921611368,
"learning_rate": 1.9573063666788878e-07,
"loss": 0.0559,
"step": 774
},
{
"epoch": 0.8804316955410395,
"grad_norm": 0.6613285197923775,
"learning_rate": 1.9210284564096042e-07,
"loss": 0.055,
"step": 775
},
{
"epoch": 0.8815677364385118,
"grad_norm": 0.6140474369508341,
"learning_rate": 1.8850764602263428e-07,
"loss": 0.0531,
"step": 776
},
{
"epoch": 0.882703777335984,
"grad_norm": 0.6692649568862223,
"learning_rate": 1.8494508858320603e-07,
"loss": 0.0571,
"step": 777
},
{
"epoch": 0.8838398182334564,
"grad_norm": 0.6425303533870481,
"learning_rate": 1.8141522363200797e-07,
"loss": 0.0561,
"step": 778
},
{
"epoch": 0.8849758591309287,
"grad_norm": 0.6768491095903121,
"learning_rate": 1.7791810101669887e-07,
"loss": 0.0648,
"step": 779
},
{
"epoch": 0.886111900028401,
"grad_norm": 0.6641297355215765,
"learning_rate": 1.7445377012256127e-07,
"loss": 0.0525,
"step": 780
},
{
"epoch": 0.8872479409258733,
"grad_norm": 0.6968334370818641,
"learning_rate": 1.710222798718028e-07,
"loss": 0.056,
"step": 781
},
{
"epoch": 0.8883839818233457,
"grad_norm": 0.6500257846017748,
"learning_rate": 1.676236787228652e-07,
"loss": 0.0599,
"step": 782
},
{
"epoch": 0.889520022720818,
"grad_norm": 0.6996758140756307,
"learning_rate": 1.6425801466974118e-07,
"loss": 0.064,
"step": 783
},
{
"epoch": 0.8906560636182903,
"grad_norm": 0.6965589508578391,
"learning_rate": 1.6092533524129623e-07,
"loss": 0.0611,
"step": 784
},
{
"epoch": 0.8917921045157625,
"grad_norm": 0.7093030606255738,
"learning_rate": 1.5762568750059604e-07,
"loss": 0.0614,
"step": 785
},
{
"epoch": 0.8929281454132348,
"grad_norm": 0.6563496411557368,
"learning_rate": 1.543591180442436e-07,
"loss": 0.0549,
"step": 786
},
{
"epoch": 0.8940641863107072,
"grad_norm": 0.6667029761797618,
"learning_rate": 1.5112567300172186e-07,
"loss": 0.0589,
"step": 787
},
{
"epoch": 0.8952002272081795,
"grad_norm": 0.6828229184240027,
"learning_rate": 1.4792539803473921e-07,
"loss": 0.0557,
"step": 788
},
{
"epoch": 0.8963362681056518,
"grad_norm": 0.650678331393596,
"learning_rate": 1.447583383365872e-07,
"loss": 0.056,
"step": 789
},
{
"epoch": 0.8974723090031241,
"grad_norm": 0.7815935941115113,
"learning_rate": 1.4162453863150183e-07,
"loss": 0.0578,
"step": 790
},
{
"epoch": 0.8986083499005965,
"grad_norm": 0.6868768630237868,
"learning_rate": 1.38524043174032e-07,
"loss": 0.0552,
"step": 791
},
{
"epoch": 0.8997443907980688,
"grad_norm": 0.6840956771248708,
"learning_rate": 1.3545689574841341e-07,
"loss": 0.065,
"step": 792
},
{
"epoch": 0.900880431695541,
"grad_norm": 0.6760255459454898,
"learning_rate": 1.3242313966795207e-07,
"loss": 0.0565,
"step": 793
},
{
"epoch": 0.9020164725930133,
"grad_norm": 0.7016923239975965,
"learning_rate": 1.2942281777441168e-07,
"loss": 0.0582,
"step": 794
},
{
"epoch": 0.9031525134904856,
"grad_norm": 0.7053617949532103,
"learning_rate": 1.2645597243740788e-07,
"loss": 0.0612,
"step": 795
},
{
"epoch": 0.904288554387958,
"grad_norm": 0.6802180277198182,
"learning_rate": 1.2352264555381134e-07,
"loss": 0.0613,
"step": 796
},
{
"epoch": 0.9054245952854303,
"grad_norm": 0.6967777911681594,
"learning_rate": 1.2062287854715638e-07,
"loss": 0.0538,
"step": 797
},
{
"epoch": 0.9065606361829026,
"grad_norm": 0.6496482286765165,
"learning_rate": 1.1775671236705366e-07,
"loss": 0.0529,
"step": 798
},
{
"epoch": 0.9076966770803749,
"grad_norm": 0.6523563127293543,
"learning_rate": 1.1492418748861422e-07,
"loss": 0.0579,
"step": 799
},
{
"epoch": 0.9088327179778471,
"grad_norm": 0.6666269373861077,
"learning_rate": 1.121253439118769e-07,
"loss": 0.0549,
"step": 800
},
{
"epoch": 0.9099687588753195,
"grad_norm": 0.6219590734666158,
"learning_rate": 1.0936022116124323e-07,
"loss": 0.0577,
"step": 801
},
{
"epoch": 0.9111047997727918,
"grad_norm": 0.6646183084981874,
"learning_rate": 1.0662885828492037e-07,
"loss": 0.0507,
"step": 802
},
{
"epoch": 0.9122408406702641,
"grad_norm": 0.667226418096009,
"learning_rate": 1.0393129385436824e-07,
"loss": 0.0554,
"step": 803
},
{
"epoch": 0.9133768815677364,
"grad_norm": 0.6177821185317497,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.0594,
"step": 804
},
{
"epoch": 0.9145129224652088,
"grad_norm": 0.6988859872273063,
"learning_rate": 9.86377122294252e-08,
"loss": 0.0563,
"step": 805
},
{
"epoch": 0.9156489633626811,
"grad_norm": 0.638466441774376,
"learning_rate": 9.604176978935342e-08,
"loss": 0.0516,
"step": 806
},
{
"epoch": 0.9167850042601534,
"grad_norm": 0.6355833397748155,
"learning_rate": 9.347977530263646e-08,
"loss": 0.0518,
"step": 807
},
{
"epoch": 0.9179210451576256,
"grad_norm": 0.6615633189629551,
"learning_rate": 9.095176494896662e-08,
"loss": 0.0551,
"step": 808
},
{
"epoch": 0.919057086055098,
"grad_norm": 0.6539242681970885,
"learning_rate": 8.845777442812314e-08,
"loss": 0.0558,
"step": 809
},
{
"epoch": 0.9201931269525703,
"grad_norm": 0.646172270844248,
"learning_rate": 8.599783895946762e-08,
"loss": 0.054,
"step": 810
},
{
"epoch": 0.9213291678500426,
"grad_norm": 0.6785332746212462,
"learning_rate": 8.357199328144577e-08,
"loss": 0.0599,
"step": 811
},
{
"epoch": 0.9224652087475149,
"grad_norm": 0.6495019596955803,
"learning_rate": 8.118027165109926e-08,
"loss": 0.0539,
"step": 812
},
{
"epoch": 0.9236012496449872,
"grad_norm": 0.6522218309663713,
"learning_rate": 7.88227078435802e-08,
"loss": 0.0545,
"step": 813
},
{
"epoch": 0.9247372905424596,
"grad_norm": 0.7053726256275266,
"learning_rate": 7.649933515167407e-08,
"loss": 0.0593,
"step": 814
},
{
"epoch": 0.9258733314399319,
"grad_norm": 0.6581258921923674,
"learning_rate": 7.421018638533006e-08,
"loss": 0.0554,
"step": 815
},
{
"epoch": 0.9270093723374041,
"grad_norm": 0.7110467493048188,
"learning_rate": 7.195529387119815e-08,
"loss": 0.0629,
"step": 816
},
{
"epoch": 0.9281454132348764,
"grad_norm": 0.61576971414751,
"learning_rate": 6.973468945217138e-08,
"loss": 0.0535,
"step": 817
},
{
"epoch": 0.9292814541323487,
"grad_norm": 0.6228873518276579,
"learning_rate": 6.75484044869379e-08,
"loss": 0.0529,
"step": 818
},
{
"epoch": 0.9304174950298211,
"grad_norm": 0.6884984566218579,
"learning_rate": 6.539646984953629e-08,
"loss": 0.0599,
"step": 819
},
{
"epoch": 0.9315535359272934,
"grad_norm": 0.6351812720863683,
"learning_rate": 6.327891592892126e-08,
"loss": 0.0532,
"step": 820
},
{
"epoch": 0.9326895768247657,
"grad_norm": 0.6412648558002986,
"learning_rate": 6.119577262853255e-08,
"loss": 0.0497,
"step": 821
},
{
"epoch": 0.933825617722238,
"grad_norm": 0.6250020595841635,
"learning_rate": 5.914706936587494e-08,
"loss": 0.0539,
"step": 822
},
{
"epoch": 0.9349616586197103,
"grad_norm": 0.7251758286477061,
"learning_rate": 5.7132835072101486e-08,
"loss": 0.0588,
"step": 823
},
{
"epoch": 0.9360976995171826,
"grad_norm": 0.6450110691416201,
"learning_rate": 5.515309819160402e-08,
"loss": 0.0563,
"step": 824
},
{
"epoch": 0.9372337404146549,
"grad_norm": 0.6976176454161298,
"learning_rate": 5.3207886681613804e-08,
"loss": 0.0568,
"step": 825
},
{
"epoch": 0.9383697813121272,
"grad_norm": 0.6806662590375777,
"learning_rate": 5.129722801180542e-08,
"loss": 0.0522,
"step": 826
},
{
"epoch": 0.9395058222095996,
"grad_norm": 0.6627861317533252,
"learning_rate": 4.942114916390822e-08,
"loss": 0.0569,
"step": 827
},
{
"epoch": 0.9406418631070719,
"grad_norm": 0.6921036678056119,
"learning_rate": 4.75796766313269e-08,
"loss": 0.0541,
"step": 828
},
{
"epoch": 0.9417779040045442,
"grad_norm": 0.685394799824403,
"learning_rate": 4.5772836418765674e-08,
"loss": 0.0585,
"step": 829
},
{
"epoch": 0.9429139449020165,
"grad_norm": 0.6764406432665999,
"learning_rate": 4.4000654041862764e-08,
"loss": 0.0608,
"step": 830
},
{
"epoch": 0.9440499857994887,
"grad_norm": 0.6631270624595927,
"learning_rate": 4.2263154526828164e-08,
"loss": 0.0574,
"step": 831
},
{
"epoch": 0.9451860266969611,
"grad_norm": 0.7131017565421759,
"learning_rate": 4.05603624100917e-08,
"loss": 0.0617,
"step": 832
},
{
"epoch": 0.9463220675944334,
"grad_norm": 0.641827746966638,
"learning_rate": 3.889230173795639e-08,
"loss": 0.0598,
"step": 833
},
{
"epoch": 0.9474581084919057,
"grad_norm": 0.6845999909211324,
"learning_rate": 3.72589960662581e-08,
"loss": 0.0624,
"step": 834
},
{
"epoch": 0.948594149389378,
"grad_norm": 0.6808398137416478,
"learning_rate": 3.56604684600334e-08,
"loss": 0.0597,
"step": 835
},
{
"epoch": 0.9497301902868504,
"grad_norm": 0.6766075948113582,
"learning_rate": 3.4096741493194196e-08,
"loss": 0.0664,
"step": 836
},
{
"epoch": 0.9508662311843227,
"grad_norm": 0.6608092476063419,
"learning_rate": 3.2567837248208e-08,
"loss": 0.0552,
"step": 837
},
{
"epoch": 0.952002272081795,
"grad_norm": 0.6801239599671042,
"learning_rate": 3.107377731578709e-08,
"loss": 0.0555,
"step": 838
},
{
"epoch": 0.9531383129792672,
"grad_norm": 0.6921131575508155,
"learning_rate": 2.9614582794582904e-08,
"loss": 0.062,
"step": 839
},
{
"epoch": 0.9542743538767395,
"grad_norm": 0.6832717944066543,
"learning_rate": 2.819027429088822e-08,
"loss": 0.0664,
"step": 840
},
{
"epoch": 0.9554103947742119,
"grad_norm": 0.7103866012859991,
"learning_rate": 2.680087191834685e-08,
"loss": 0.0607,
"step": 841
},
{
"epoch": 0.9565464356716842,
"grad_norm": 0.7120565073244881,
"learning_rate": 2.544639529766829e-08,
"loss": 0.0532,
"step": 842
},
{
"epoch": 0.9576824765691565,
"grad_norm": 0.6746499491772139,
"learning_rate": 2.4126863556351854e-08,
"loss": 0.0628,
"step": 843
},
{
"epoch": 0.9588185174666288,
"grad_norm": 0.6744896412450383,
"learning_rate": 2.284229532841603e-08,
"loss": 0.0597,
"step": 844
},
{
"epoch": 0.9599545583641012,
"grad_norm": 0.6566929119765946,
"learning_rate": 2.1592708754135105e-08,
"loss": 0.0568,
"step": 845
},
{
"epoch": 0.9610905992615734,
"grad_norm": 0.6458104835743707,
"learning_rate": 2.0378121479783798e-08,
"loss": 0.0557,
"step": 846
},
{
"epoch": 0.9622266401590457,
"grad_norm": 0.6716209159702464,
"learning_rate": 1.919855065738746e-08,
"loss": 0.0562,
"step": 847
},
{
"epoch": 0.963362681056518,
"grad_norm": 0.6679933147801366,
"learning_rate": 1.8054012944479225e-08,
"loss": 0.0554,
"step": 848
},
{
"epoch": 0.9644987219539903,
"grad_norm": 0.7113182719515749,
"learning_rate": 1.6944524503866854e-08,
"loss": 0.0564,
"step": 849
},
{
"epoch": 0.9656347628514627,
"grad_norm": 0.6608607329275047,
"learning_rate": 1.5870101003402083e-08,
"loss": 0.0548,
"step": 850
},
{
"epoch": 0.966770803748935,
"grad_norm": 0.6622351677702579,
"learning_rate": 1.483075761576025e-08,
"loss": 0.0526,
"step": 851
},
{
"epoch": 0.9679068446464073,
"grad_norm": 0.6927125282488752,
"learning_rate": 1.382650901822713e-08,
"loss": 0.0607,
"step": 852
},
{
"epoch": 0.9690428855438796,
"grad_norm": 0.7162152988014072,
"learning_rate": 1.2857369392490493e-08,
"loss": 0.0589,
"step": 853
},
{
"epoch": 0.9701789264413518,
"grad_norm": 0.7898973362889951,
"learning_rate": 1.1923352424439149e-08,
"loss": 0.0646,
"step": 854
},
{
"epoch": 0.9713149673388242,
"grad_norm": 0.6800027138060577,
"learning_rate": 1.1024471303971995e-08,
"loss": 0.0562,
"step": 855
},
{
"epoch": 0.9724510082362965,
"grad_norm": 0.7092009667813072,
"learning_rate": 1.0160738724809549e-08,
"loss": 0.0542,
"step": 856
},
{
"epoch": 0.9735870491337688,
"grad_norm": 0.6988482035788468,
"learning_rate": 9.332166884315763e-09,
"loss": 0.0664,
"step": 857
},
{
"epoch": 0.9747230900312411,
"grad_norm": 0.6887000678028918,
"learning_rate": 8.538767483325384e-09,
"loss": 0.0638,
"step": 858
},
{
"epoch": 0.9758591309287135,
"grad_norm": 0.6409429108166975,
"learning_rate": 7.78055172597908e-09,
"loss": 0.057,
"step": 859
},
{
"epoch": 0.9769951718261858,
"grad_norm": 0.6719107919151849,
"learning_rate": 7.05753031956441e-09,
"loss": 0.0557,
"step": 860
},
{
"epoch": 0.9781312127236581,
"grad_norm": 0.7066671846222795,
"learning_rate": 6.369713474366213e-09,
"loss": 0.0557,
"step": 861
},
{
"epoch": 0.9792672536211303,
"grad_norm": 0.6893439981594293,
"learning_rate": 5.717110903520617e-09,
"loss": 0.0582,
"step": 862
},
{
"epoch": 0.9804032945186026,
"grad_norm": 0.7127707639288884,
"learning_rate": 5.09973182287904e-09,
"loss": 0.0598,
"step": 863
},
{
"epoch": 0.981539335416075,
"grad_norm": 0.6688168935131097,
"learning_rate": 4.517584950877451e-09,
"loss": 0.0628,
"step": 864
},
{
"epoch": 0.9826753763135473,
"grad_norm": 0.6588910866128358,
"learning_rate": 3.970678508413983e-09,
"loss": 0.0579,
"step": 865
},
{
"epoch": 0.9838114172110196,
"grad_norm": 0.6434730119889486,
"learning_rate": 3.4590202187315124e-09,
"loss": 0.0524,
"step": 866
},
{
"epoch": 0.9849474581084919,
"grad_norm": 0.6675520230860594,
"learning_rate": 2.982617307310254e-09,
"loss": 0.0526,
"step": 867
},
{
"epoch": 0.9860834990059643,
"grad_norm": 0.7011452717713275,
"learning_rate": 2.5414765017642285e-09,
"loss": 0.0582,
"step": 868
},
{
"epoch": 0.9872195399034365,
"grad_norm": 0.6315815934268008,
"learning_rate": 2.1356040317474512e-09,
"loss": 0.0523,
"step": 869
},
{
"epoch": 0.9883555808009088,
"grad_norm": 0.6397196083044729,
"learning_rate": 1.765005628865113e-09,
"loss": 0.0534,
"step": 870
},
{
"epoch": 0.9894916216983811,
"grad_norm": 0.6499579240718064,
"learning_rate": 1.4296865265930882e-09,
"loss": 0.0488,
"step": 871
},
{
"epoch": 0.9906276625958534,
"grad_norm": 0.7131022504272738,
"learning_rate": 1.1296514602038289e-09,
"loss": 0.0604,
"step": 872
},
{
"epoch": 0.9917637034933258,
"grad_norm": 0.6575236309958269,
"learning_rate": 8.649046666994732e-10,
"loss": 0.0514,
"step": 873
},
{
"epoch": 0.9928997443907981,
"grad_norm": 0.6876333215462411,
"learning_rate": 6.354498847521706e-10,
"loss": 0.0609,
"step": 874
},
{
"epoch": 0.9940357852882704,
"grad_norm": 0.656021262952682,
"learning_rate": 4.412903546516245e-10,
"loss": 0.0535,
"step": 875
},
{
"epoch": 0.9951718261857427,
"grad_norm": 0.6554446679692325,
"learning_rate": 2.8242881825846225e-10,
"loss": 0.0533,
"step": 876
},
{
"epoch": 0.996307867083215,
"grad_norm": 0.6892726133408795,
"learning_rate": 1.5886751896565521e-10,
"loss": 0.0573,
"step": 877
},
{
"epoch": 0.9974439079806873,
"grad_norm": 0.624707525404124,
"learning_rate": 7.060820166826521e-11,
"loss": 0.0489,
"step": 878
},
{
"epoch": 0.9985799488781596,
"grad_norm": 0.685439156837803,
"learning_rate": 1.7652112736521455e-11,
"loss": 0.0603,
"step": 879
},
{
"epoch": 0.9997159897756319,
"grad_norm": 0.6532677360538005,
"learning_rate": 0.0,
"loss": 0.0627,
"step": 880
},
{
"epoch": 0.9997159897756319,
"step": 880,
"total_flos": 104387643310080.0,
"train_loss": 0.07837209747257558,
"train_runtime": 3781.2331,
"train_samples_per_second": 14.898,
"train_steps_per_second": 0.233
}
],
"logging_steps": 1,
"max_steps": 880,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 104387643310080.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}