{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 484, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01652892561983471, "grad_norm": 12.011792182922363, "learning_rate": 2.857142857142857e-05, "loss": 4.7518, "step": 2 }, { "epoch": 0.03305785123966942, "grad_norm": 12.689871788024902, "learning_rate": 5.714285714285714e-05, "loss": 4.687, "step": 4 }, { "epoch": 0.049586776859504134, "grad_norm": 11.848198890686035, "learning_rate": 8.571428571428571e-05, "loss": 4.4922, "step": 6 }, { "epoch": 0.06611570247933884, "grad_norm": 10.378408432006836, "learning_rate": 0.00011428571428571428, "loss": 3.4839, "step": 8 }, { "epoch": 0.08264462809917356, "grad_norm": 9.543920516967773, "learning_rate": 0.00014285714285714287, "loss": 2.4164, "step": 10 }, { "epoch": 0.09917355371900827, "grad_norm": 5.602456092834473, "learning_rate": 0.00017142857142857143, "loss": 1.838, "step": 12 }, { "epoch": 0.11570247933884298, "grad_norm": 1.7414193153381348, "learning_rate": 0.0002, "loss": 1.4404, "step": 14 }, { "epoch": 0.1322314049586777, "grad_norm": 1.0191770792007446, "learning_rate": 0.0001999910643210378, "loss": 1.3474, "step": 16 }, { "epoch": 0.1487603305785124, "grad_norm": 3.509352922439575, "learning_rate": 0.0001999642588810784, "loss": 1.3789, "step": 18 }, { "epoch": 0.1652892561983471, "grad_norm": 2.426680088043213, "learning_rate": 0.00019991958847061784, "loss": 1.2245, "step": 20 }, { "epoch": 0.18181818181818182, "grad_norm": 0.8060101270675659, "learning_rate": 0.00019985706107286514, "loss": 1.2259, "step": 22 }, { "epoch": 0.19834710743801653, "grad_norm": 1.6215940713882446, "learning_rate": 0.00019977668786231534, "loss": 1.1992, "step": 24 }, { "epoch": 0.21487603305785125, "grad_norm": 2.476259231567383, "learning_rate": 0.0001996784832027525, "loss": 1.2782, "step": 26 }, { "epoch": 0.23140495867768596, "grad_norm": 2.04018235206604, "learning_rate": 0.00019956246464468294, "loss": 1.23, "step": 28 }, { "epoch": 0.24793388429752067, "grad_norm": 1.8799017667770386, "learning_rate": 0.00019942865292219838, "loss": 1.2164, "step": 30 }, { "epoch": 0.2644628099173554, "grad_norm": 1.2413148880004883, "learning_rate": 0.00019927707194927066, "loss": 1.2924, "step": 32 }, { "epoch": 0.2809917355371901, "grad_norm": 1.8723937273025513, "learning_rate": 0.000199107748815478, "loss": 1.2302, "step": 34 }, { "epoch": 0.2975206611570248, "grad_norm": 1.393110752105713, "learning_rate": 0.00019892071378116376, "loss": 1.2276, "step": 36 }, { "epoch": 0.3140495867768595, "grad_norm": 1.1459721326828003, "learning_rate": 0.0001987160002720283, "loss": 1.1504, "step": 38 }, { "epoch": 0.3305785123966942, "grad_norm": 1.4680942296981812, "learning_rate": 0.00019849364487315558, "loss": 1.1623, "step": 40 }, { "epoch": 0.34710743801652894, "grad_norm": 1.8715866804122925, "learning_rate": 0.0001982536873224748, "loss": 1.2155, "step": 42 }, { "epoch": 0.36363636363636365, "grad_norm": 0.871064305305481, "learning_rate": 0.0001979961705036587, "loss": 1.1594, "step": 44 }, { "epoch": 0.38016528925619836, "grad_norm": 0.8239800930023193, "learning_rate": 0.00019772114043845965, "loss": 1.1501, "step": 46 }, { "epoch": 0.39669421487603307, "grad_norm": 0.9587319493293762, "learning_rate": 0.0001974286462784851, "loss": 1.1195, "step": 48 }, { "epoch": 0.4132231404958678, "grad_norm": 1.1645926237106323, "learning_rate": 0.0001971187402964132, "loss": 1.1417, "step": 50 }, { "epoch": 0.4297520661157025, "grad_norm": 0.576813817024231, "learning_rate": 0.00019679147787665126, "loss": 1.1445, "step": 52 }, { "epoch": 0.4462809917355372, "grad_norm": 1.0733133554458618, "learning_rate": 0.00019644691750543767, "loss": 1.0979, "step": 54 }, { "epoch": 0.4628099173553719, "grad_norm": 0.5801639556884766, "learning_rate": 0.00019608512076038962, "loss": 1.0977, "step": 56 }, { "epoch": 0.4793388429752066, "grad_norm": 1.6796538829803467, "learning_rate": 0.00019570615229949842, "loss": 1.1925, "step": 58 }, { "epoch": 0.49586776859504134, "grad_norm": 1.0563887357711792, "learning_rate": 0.00019531007984957408, "loss": 1.0657, "step": 60 }, { "epoch": 0.512396694214876, "grad_norm": 0.9109811186790466, "learning_rate": 0.00019489697419414182, "loss": 1.1098, "step": 62 }, { "epoch": 0.5289256198347108, "grad_norm": 0.7321667671203613, "learning_rate": 0.0001944669091607919, "loss": 1.0929, "step": 64 }, { "epoch": 0.5454545454545454, "grad_norm": 0.685366690158844, "learning_rate": 0.00019401996160798573, "loss": 1.1242, "step": 66 }, { "epoch": 0.5619834710743802, "grad_norm": 0.8959838151931763, "learning_rate": 0.0001935562114113202, "loss": 1.181, "step": 68 }, { "epoch": 0.5785123966942148, "grad_norm": 0.9717262983322144, "learning_rate": 0.00019307574144925287, "loss": 1.2295, "step": 70 }, { "epoch": 0.5950413223140496, "grad_norm": 1.0358582735061646, "learning_rate": 0.00019257863758829035, "loss": 1.1431, "step": 72 }, { "epoch": 0.6115702479338843, "grad_norm": 0.7998526096343994, "learning_rate": 0.00019206498866764288, "loss": 1.1032, "step": 74 }, { "epoch": 0.628099173553719, "grad_norm": 1.1496188640594482, "learning_rate": 0.0001915348864833476, "loss": 1.057, "step": 76 }, { "epoch": 0.6446280991735537, "grad_norm": 0.652406632900238, "learning_rate": 0.00019098842577186314, "loss": 1.146, "step": 78 }, { "epoch": 0.6611570247933884, "grad_norm": 0.9454944729804993, "learning_rate": 0.00019042570419313925, "loss": 1.1543, "step": 80 }, { "epoch": 0.6776859504132231, "grad_norm": 0.7456652522087097, "learning_rate": 0.00018984682231316333, "loss": 1.1189, "step": 82 }, { "epoch": 0.6942148760330579, "grad_norm": 0.7312512397766113, "learning_rate": 0.00018925188358598813, "loss": 1.0873, "step": 84 }, { "epoch": 0.7107438016528925, "grad_norm": 0.8474765419960022, "learning_rate": 0.000188640994335243, "loss": 1.1698, "step": 86 }, { "epoch": 0.7272727272727273, "grad_norm": 0.6979633569717407, "learning_rate": 0.0001880142637351325, "loss": 1.1417, "step": 88 }, { "epoch": 0.743801652892562, "grad_norm": 0.5989161133766174, "learning_rate": 0.00018737180379092537, "loss": 1.0479, "step": 90 }, { "epoch": 0.7603305785123967, "grad_norm": 0.5765272378921509, "learning_rate": 0.00018671372931893773, "loss": 1.1336, "step": 92 }, { "epoch": 0.7768595041322314, "grad_norm": 0.6709849834442139, "learning_rate": 0.00018604015792601396, "loss": 1.1157, "step": 94 }, { "epoch": 0.7933884297520661, "grad_norm": 0.8181343674659729, "learning_rate": 0.00018535120998850848, "loss": 1.0927, "step": 96 }, { "epoch": 0.8099173553719008, "grad_norm": 0.6146332621574402, "learning_rate": 0.00018464700863077312, "loss": 1.0739, "step": 98 }, { "epoch": 0.8264462809917356, "grad_norm": 0.9904415011405945, "learning_rate": 0.00018392767970315313, "loss": 1.0331, "step": 100 }, { "epoch": 0.8429752066115702, "grad_norm": 0.6186695694923401, "learning_rate": 0.0001831933517594957, "loss": 1.0513, "step": 102 }, { "epoch": 0.859504132231405, "grad_norm": 1.1912785768508911, "learning_rate": 0.00018244415603417603, "loss": 1.1567, "step": 104 }, { "epoch": 0.8760330578512396, "grad_norm": 1.3681318759918213, "learning_rate": 0.00018168022641864377, "loss": 1.1497, "step": 106 }, { "epoch": 0.8925619834710744, "grad_norm": 0.619476318359375, "learning_rate": 0.00018090169943749476, "loss": 1.1546, "step": 108 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7421219348907471, "learning_rate": 0.00018010871422407236, "loss": 1.1458, "step": 110 }, { "epoch": 0.9256198347107438, "grad_norm": 0.6569286584854126, "learning_rate": 0.00017930141249560233, "loss": 1.12, "step": 112 }, { "epoch": 0.9421487603305785, "grad_norm": 0.4168110191822052, "learning_rate": 0.0001784799385278661, "loss": 1.1682, "step": 114 }, { "epoch": 0.9586776859504132, "grad_norm": 0.5620162487030029, "learning_rate": 0.00017764443912941672, "loss": 1.1828, "step": 116 }, { "epoch": 0.9752066115702479, "grad_norm": 0.8095484375953674, "learning_rate": 0.00017679506361534215, "loss": 1.1953, "step": 118 }, { "epoch": 0.9917355371900827, "grad_norm": 0.7646257281303406, "learning_rate": 0.0001759319637805806, "loss": 1.2148, "step": 120 }, { "epoch": 1.0082644628099173, "grad_norm": 0.5254501104354858, "learning_rate": 0.00017505529387279277, "loss": 1.1359, "step": 122 }, { "epoch": 1.024793388429752, "grad_norm": 0.6001765727996826, "learning_rate": 0.00017416521056479577, "loss": 1.1336, "step": 124 }, { "epoch": 1.0413223140495869, "grad_norm": 0.35407504439353943, "learning_rate": 0.00017326187292656333, "loss": 1.1833, "step": 126 }, { "epoch": 1.0578512396694215, "grad_norm": 0.414528489112854, "learning_rate": 0.00017234544239679806, "loss": 1.1301, "step": 128 }, { "epoch": 1.0743801652892562, "grad_norm": 0.46355852484703064, "learning_rate": 0.00017141608275408006, "loss": 1.213, "step": 130 }, { "epoch": 1.0909090909090908, "grad_norm": 0.5040593147277832, "learning_rate": 0.00017047396008759754, "loss": 1.132, "step": 132 }, { "epoch": 1.1074380165289257, "grad_norm": 0.4813704192638397, "learning_rate": 0.00016951924276746425, "loss": 1.0831, "step": 134 }, { "epoch": 1.1239669421487604, "grad_norm": 0.5174686312675476, "learning_rate": 0.00016855210141462963, "loss": 1.0514, "step": 136 }, { "epoch": 1.140495867768595, "grad_norm": 0.4712466299533844, "learning_rate": 0.00016757270887038654, "loss": 1.1334, "step": 138 }, { "epoch": 1.1570247933884297, "grad_norm": 0.5912173390388489, "learning_rate": 0.00016658124016548197, "loss": 1.1011, "step": 140 }, { "epoch": 1.1735537190082646, "grad_norm": 0.6392802000045776, "learning_rate": 0.00016557787248883696, "loss": 1.1361, "step": 142 }, { "epoch": 1.1900826446280992, "grad_norm": 0.7376368045806885, "learning_rate": 0.00016456278515588024, "loss": 1.109, "step": 144 }, { "epoch": 1.2066115702479339, "grad_norm": 0.5020875930786133, "learning_rate": 0.00016353615957650236, "loss": 1.0925, "step": 146 }, { "epoch": 1.2231404958677685, "grad_norm": 0.8081740736961365, "learning_rate": 0.00016249817922263517, "loss": 1.047, "step": 148 }, { "epoch": 1.2396694214876034, "grad_norm": 0.6371219754219055, "learning_rate": 0.00016144902959546286, "loss": 1.113, "step": 150 }, { "epoch": 1.256198347107438, "grad_norm": 0.7588189840316772, "learning_rate": 0.00016038889819227045, "loss": 1.1179, "step": 152 }, { "epoch": 1.2727272727272727, "grad_norm": 0.6286205053329468, "learning_rate": 0.00015931797447293552, "loss": 1.1209, "step": 154 }, { "epoch": 1.2892561983471074, "grad_norm": 0.797656238079071, "learning_rate": 0.00015823644982606905, "loss": 1.1698, "step": 156 }, { "epoch": 1.3057851239669422, "grad_norm": 0.5368632078170776, "learning_rate": 0.00015714451753481168, "loss": 1.1973, "step": 158 }, { "epoch": 1.322314049586777, "grad_norm": 0.4135212302207947, "learning_rate": 0.00015604237274229147, "loss": 1.1452, "step": 160 }, { "epoch": 1.3388429752066116, "grad_norm": 0.5289668440818787, "learning_rate": 0.00015493021241674918, "loss": 1.1954, "step": 162 }, { "epoch": 1.3553719008264462, "grad_norm": 0.4092061221599579, "learning_rate": 0.00015380823531633729, "loss": 1.1226, "step": 164 }, { "epoch": 1.3719008264462809, "grad_norm": 0.7049645781517029, "learning_rate": 0.00015267664195359917, "loss": 1.0948, "step": 166 }, { "epoch": 1.3884297520661157, "grad_norm": 0.47164198756217957, "learning_rate": 0.00015153563455963499, "loss": 1.0977, "step": 168 }, { "epoch": 1.4049586776859504, "grad_norm": 0.7871695160865784, "learning_rate": 0.00015038541704796003, "loss": 1.1674, "step": 170 }, { "epoch": 1.421487603305785, "grad_norm": 0.5381121635437012, "learning_rate": 0.00014922619497806277, "loss": 1.1415, "step": 172 }, { "epoch": 1.43801652892562, "grad_norm": 0.39419299364089966, "learning_rate": 0.00014805817551866838, "loss": 1.0747, "step": 174 }, { "epoch": 1.4545454545454546, "grad_norm": 0.38382914662361145, "learning_rate": 0.00014688156741071514, "loss": 1.1278, "step": 176 }, { "epoch": 1.4710743801652892, "grad_norm": 0.32674962282180786, "learning_rate": 0.00014569658093004935, "loss": 0.9774, "step": 178 }, { "epoch": 1.487603305785124, "grad_norm": 0.5443088412284851, "learning_rate": 0.00014450342784984633, "loss": 1.034, "step": 180 }, { "epoch": 1.5041322314049586, "grad_norm": 0.6682401895523071, "learning_rate": 0.00014330232140276366, "loss": 1.1732, "step": 182 }, { "epoch": 1.5206611570247934, "grad_norm": 0.5696044564247131, "learning_rate": 0.0001420934762428335, "loss": 1.0384, "step": 184 }, { "epoch": 1.537190082644628, "grad_norm": 0.6782551407814026, "learning_rate": 0.0001408771084071012, "loss": 1.1107, "step": 186 }, { "epoch": 1.553719008264463, "grad_norm": 0.8336123824119568, "learning_rate": 0.00013965343527701628, "loss": 1.0737, "step": 188 }, { "epoch": 1.5702479338842976, "grad_norm": 0.539226233959198, "learning_rate": 0.00013842267553958371, "loss": 1.1665, "step": 190 }, { "epoch": 1.5867768595041323, "grad_norm": 0.566620409488678, "learning_rate": 0.00013718504914828135, "loss": 1.1333, "step": 192 }, { "epoch": 1.603305785123967, "grad_norm": 0.4735005795955658, "learning_rate": 0.00013594077728375128, "loss": 1.1709, "step": 194 }, { "epoch": 1.6198347107438016, "grad_norm": 0.534383237361908, "learning_rate": 0.00013469008231427207, "loss": 1.0783, "step": 196 }, { "epoch": 1.6363636363636362, "grad_norm": 0.8410363793373108, "learning_rate": 0.0001334331877560182, "loss": 1.0708, "step": 198 }, { "epoch": 1.6528925619834711, "grad_norm": 0.6392219662666321, "learning_rate": 0.00013217031823311488, "loss": 1.0329, "step": 200 }, { "epoch": 1.6694214876033058, "grad_norm": 0.5770404934883118, "learning_rate": 0.00013090169943749476, "loss": 1.0404, "step": 202 }, { "epoch": 1.6859504132231407, "grad_norm": 0.6814575791358948, "learning_rate": 0.00012962755808856342, "loss": 1.0702, "step": 204 }, { "epoch": 1.7024793388429753, "grad_norm": 0.673312783241272, "learning_rate": 0.0001283481218926818, "loss": 1.0529, "step": 206 }, { "epoch": 1.71900826446281, "grad_norm": 0.6180073618888855, "learning_rate": 0.0001270636195024719, "loss": 1.0257, "step": 208 }, { "epoch": 1.7355371900826446, "grad_norm": 0.5565724968910217, "learning_rate": 0.00012577428047595344, "loss": 1.1102, "step": 210 }, { "epoch": 1.7520661157024793, "grad_norm": 0.5586270689964294, "learning_rate": 0.00012448033523551865, "loss": 1.0277, "step": 212 }, { "epoch": 1.768595041322314, "grad_norm": 0.542448878288269, "learning_rate": 0.00012318201502675285, "loss": 1.0988, "step": 214 }, { "epoch": 1.7851239669421488, "grad_norm": 0.513042151927948, "learning_rate": 0.0001218795518771075, "loss": 1.0828, "step": 216 }, { "epoch": 1.8016528925619835, "grad_norm": 0.7613060474395752, "learning_rate": 0.00012057317855443395, "loss": 1.1962, "step": 218 }, { "epoch": 1.8181818181818183, "grad_norm": 0.7522129416465759, "learning_rate": 0.00011926312852538455, "loss": 1.1339, "step": 220 }, { "epoch": 1.834710743801653, "grad_norm": 0.4655594825744629, "learning_rate": 0.00011794963591368893, "loss": 1.0967, "step": 222 }, { "epoch": 1.8512396694214877, "grad_norm": 0.5036570429801941, "learning_rate": 0.00011663293545831302, "loss": 1.0361, "step": 224 }, { "epoch": 1.8677685950413223, "grad_norm": 0.43016380071640015, "learning_rate": 0.00011531326247150803, "loss": 1.1281, "step": 226 }, { "epoch": 1.884297520661157, "grad_norm": 0.5184316635131836, "learning_rate": 0.00011399085279675687, "loss": 1.2083, "step": 228 }, { "epoch": 1.9008264462809916, "grad_norm": 0.6556355357170105, "learning_rate": 0.0001126659427666257, "loss": 1.0266, "step": 230 }, { "epoch": 1.9173553719008265, "grad_norm": 0.515681803226471, "learning_rate": 0.00011133876916052821, "loss": 1.0472, "step": 232 }, { "epoch": 1.9338842975206612, "grad_norm": 0.4592064321041107, "learning_rate": 0.00011000956916240985, "loss": 1.054, "step": 234 }, { "epoch": 1.950413223140496, "grad_norm": 0.5623230338096619, "learning_rate": 0.00010867858031835975, "loss": 1.1571, "step": 236 }, { "epoch": 1.9669421487603307, "grad_norm": 0.5241667032241821, "learning_rate": 0.00010734604049415822, "loss": 1.0985, "step": 238 }, { "epoch": 1.9834710743801653, "grad_norm": 0.54905104637146, "learning_rate": 0.00010601218783276672, "loss": 1.1088, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.8823345303535461, "learning_rate": 0.00010467726071176853, "loss": 1.0991, "step": 242 }, { "epoch": 2.0165289256198347, "grad_norm": 0.5789780020713806, "learning_rate": 0.00010334149770076747, "loss": 1.1429, "step": 244 }, { "epoch": 2.0330578512396693, "grad_norm": 0.6136715412139893, "learning_rate": 0.00010200513751875227, "loss": 1.0347, "step": 246 }, { "epoch": 2.049586776859504, "grad_norm": 0.6462894082069397, "learning_rate": 0.00010066841899143425, "loss": 1.1008, "step": 248 }, { "epoch": 2.0661157024793386, "grad_norm": 0.7272156476974487, "learning_rate": 9.93315810085658e-05, "loss": 1.1642, "step": 250 }, { "epoch": 2.0826446280991737, "grad_norm": 0.5664356350898743, "learning_rate": 9.799486248124775e-05, "loss": 1.0799, "step": 252 }, { "epoch": 2.0991735537190084, "grad_norm": 0.559697687625885, "learning_rate": 9.665850229923258e-05, "loss": 1.0536, "step": 254 }, { "epoch": 2.115702479338843, "grad_norm": 0.7208287715911865, "learning_rate": 9.532273928823151e-05, "loss": 1.0595, "step": 256 }, { "epoch": 2.1322314049586777, "grad_norm": 0.6985677480697632, "learning_rate": 9.398781216723331e-05, "loss": 1.0626, "step": 258 }, { "epoch": 2.1487603305785123, "grad_norm": 0.7583006024360657, "learning_rate": 9.26539595058418e-05, "loss": 0.9919, "step": 260 }, { "epoch": 2.165289256198347, "grad_norm": 0.6283029317855835, "learning_rate": 9.132141968164026e-05, "loss": 1.1292, "step": 262 }, { "epoch": 2.1818181818181817, "grad_norm": 1.4162311553955078, "learning_rate": 8.999043083759017e-05, "loss": 1.1834, "step": 264 }, { "epoch": 2.1983471074380168, "grad_norm": 0.5169870853424072, "learning_rate": 8.866123083947182e-05, "loss": 1.049, "step": 266 }, { "epoch": 2.2148760330578514, "grad_norm": 0.9136308431625366, "learning_rate": 8.733405723337432e-05, "loss": 1.1336, "step": 268 }, { "epoch": 2.231404958677686, "grad_norm": 0.6882078647613525, "learning_rate": 8.600914720324316e-05, "loss": 1.0953, "step": 270 }, { "epoch": 2.2479338842975207, "grad_norm": 0.505447506904602, "learning_rate": 8.4686737528492e-05, "loss": 1.1014, "step": 272 }, { "epoch": 2.2644628099173554, "grad_norm": 0.40685173869132996, "learning_rate": 8.336706454168701e-05, "loss": 1.0878, "step": 274 }, { "epoch": 2.28099173553719, "grad_norm": 0.5715451836585999, "learning_rate": 8.20503640863111e-05, "loss": 1.0555, "step": 276 }, { "epoch": 2.2975206611570247, "grad_norm": 0.5256667733192444, "learning_rate": 8.073687147461547e-05, "loss": 1.0678, "step": 278 }, { "epoch": 2.3140495867768593, "grad_norm": 0.46319133043289185, "learning_rate": 7.942682144556604e-05, "loss": 1.0699, "step": 280 }, { "epoch": 2.330578512396694, "grad_norm": 0.6000749468803406, "learning_rate": 7.812044812289249e-05, "loss": 1.1389, "step": 282 }, { "epoch": 2.347107438016529, "grad_norm": 0.5024062395095825, "learning_rate": 7.681798497324716e-05, "loss": 1.1033, "step": 284 }, { "epoch": 2.3636363636363638, "grad_norm": 0.6761860251426697, "learning_rate": 7.55196647644814e-05, "loss": 1.0665, "step": 286 }, { "epoch": 2.3801652892561984, "grad_norm": 0.7175805568695068, "learning_rate": 7.422571952404663e-05, "loss": 1.0511, "step": 288 }, { "epoch": 2.396694214876033, "grad_norm": 0.6883595585823059, "learning_rate": 7.293638049752812e-05, "loss": 1.0765, "step": 290 }, { "epoch": 2.4132231404958677, "grad_norm": 0.6382430791854858, "learning_rate": 7.165187810731823e-05, "loss": 1.1036, "step": 292 }, { "epoch": 2.4297520661157024, "grad_norm": 0.9323483109474182, "learning_rate": 7.037244191143661e-05, "loss": 1.1113, "step": 294 }, { "epoch": 2.446280991735537, "grad_norm": 0.9768907427787781, "learning_rate": 6.909830056250527e-05, "loss": 1.0983, "step": 296 }, { "epoch": 2.462809917355372, "grad_norm": 0.6783613562583923, "learning_rate": 6.782968176688514e-05, "loss": 1.1141, "step": 298 }, { "epoch": 2.479338842975207, "grad_norm": 0.753887951374054, "learning_rate": 6.656681224398183e-05, "loss": 1.1234, "step": 300 }, { "epoch": 2.4958677685950414, "grad_norm": 0.6788628697395325, "learning_rate": 6.530991768572794e-05, "loss": 1.0407, "step": 302 }, { "epoch": 2.512396694214876, "grad_norm": 0.6727221012115479, "learning_rate": 6.405922271624874e-05, "loss": 1.091, "step": 304 }, { "epoch": 2.5289256198347108, "grad_norm": 0.7418019771575928, "learning_rate": 6.281495085171869e-05, "loss": 1.0884, "step": 306 }, { "epoch": 2.5454545454545454, "grad_norm": 0.7063189148902893, "learning_rate": 6.15773244604163e-05, "loss": 1.0218, "step": 308 }, { "epoch": 2.56198347107438, "grad_norm": 1.0706840753555298, "learning_rate": 6.0346564722983736e-05, "loss": 1.0873, "step": 310 }, { "epoch": 2.5785123966942147, "grad_norm": 0.683253288269043, "learning_rate": 5.912289159289883e-05, "loss": 1.0346, "step": 312 }, { "epoch": 2.5950413223140494, "grad_norm": 0.5816308856010437, "learning_rate": 5.790652375716652e-05, "loss": 1.0113, "step": 314 }, { "epoch": 2.6115702479338845, "grad_norm": 0.7935141324996948, "learning_rate": 5.6697678597236356e-05, "loss": 1.1771, "step": 316 }, { "epoch": 2.628099173553719, "grad_norm": 0.6827735900878906, "learning_rate": 5.549657215015367e-05, "loss": 1.0899, "step": 318 }, { "epoch": 2.644628099173554, "grad_norm": 0.5408580899238586, "learning_rate": 5.430341906995065e-05, "loss": 1.1056, "step": 320 }, { "epoch": 2.6611570247933884, "grad_norm": 0.6996527910232544, "learning_rate": 5.31184325892849e-05, "loss": 1.0494, "step": 322 }, { "epoch": 2.677685950413223, "grad_norm": 0.6700767874717712, "learning_rate": 5.1941824481331626e-05, "loss": 1.1206, "step": 324 }, { "epoch": 2.6942148760330578, "grad_norm": 0.7690112590789795, "learning_rate": 5.077380502193725e-05, "loss": 1.1541, "step": 326 }, { "epoch": 2.7107438016528924, "grad_norm": 0.7921670079231262, "learning_rate": 4.961458295203999e-05, "loss": 1.0684, "step": 328 }, { "epoch": 2.7272727272727275, "grad_norm": 0.6989784240722656, "learning_rate": 4.8464365440365044e-05, "loss": 1.0097, "step": 330 }, { "epoch": 2.7438016528925617, "grad_norm": 0.6038585901260376, "learning_rate": 4.7323358046400844e-05, "loss": 1.0751, "step": 332 }, { "epoch": 2.760330578512397, "grad_norm": 0.8160024881362915, "learning_rate": 4.6191764683662744e-05, "loss": 0.9734, "step": 334 }, { "epoch": 2.7768595041322315, "grad_norm": 0.7741029858589172, "learning_rate": 4.506978758325081e-05, "loss": 1.1186, "step": 336 }, { "epoch": 2.793388429752066, "grad_norm": 0.8177542090415955, "learning_rate": 4.395762725770852e-05, "loss": 1.1538, "step": 338 }, { "epoch": 2.809917355371901, "grad_norm": 0.7617602944374084, "learning_rate": 4.285548246518837e-05, "loss": 1.1162, "step": 340 }, { "epoch": 2.8264462809917354, "grad_norm": 0.6426255106925964, "learning_rate": 4.176355017393099e-05, "loss": 1.0404, "step": 342 }, { "epoch": 2.84297520661157, "grad_norm": 0.6394379138946533, "learning_rate": 4.0682025527064486e-05, "loss": 1.1107, "step": 344 }, { "epoch": 2.8595041322314048, "grad_norm": 0.6632633209228516, "learning_rate": 3.961110180772955e-05, "loss": 1.1085, "step": 346 }, { "epoch": 2.87603305785124, "grad_norm": 0.5437737703323364, "learning_rate": 3.8550970404537144e-05, "loss": 1.0072, "step": 348 }, { "epoch": 2.8925619834710745, "grad_norm": 0.8178601264953613, "learning_rate": 3.750182077736486e-05, "loss": 1.0606, "step": 350 }, { "epoch": 2.909090909090909, "grad_norm": 0.8268756866455078, "learning_rate": 3.646384042349764e-05, "loss": 0.9642, "step": 352 }, { "epoch": 2.925619834710744, "grad_norm": 1.064460277557373, "learning_rate": 3.543721484411976e-05, "loss": 1.0881, "step": 354 }, { "epoch": 2.9421487603305785, "grad_norm": 0.7875952124595642, "learning_rate": 3.442212751116305e-05, "loss": 1.0141, "step": 356 }, { "epoch": 2.958677685950413, "grad_norm": 0.6071433424949646, "learning_rate": 3.3418759834518056e-05, "loss": 1.0692, "step": 358 }, { "epoch": 2.975206611570248, "grad_norm": 0.6311193704605103, "learning_rate": 3.24272911296135e-05, "loss": 0.9481, "step": 360 }, { "epoch": 2.991735537190083, "grad_norm": 0.7501534223556519, "learning_rate": 3.1447898585370384e-05, "loss": 1.1183, "step": 362 }, { "epoch": 3.0082644628099175, "grad_norm": 0.6752927899360657, "learning_rate": 3.0480757232535772e-05, "loss": 1.0628, "step": 364 }, { "epoch": 3.024793388429752, "grad_norm": 0.9237536787986755, "learning_rate": 2.9526039912402503e-05, "loss": 1.1539, "step": 366 }, { "epoch": 3.041322314049587, "grad_norm": 0.5888502597808838, "learning_rate": 2.8583917245919945e-05, "loss": 1.0937, "step": 368 }, { "epoch": 3.0578512396694215, "grad_norm": 0.8892382383346558, "learning_rate": 2.7654557603201957e-05, "loss": 1.0385, "step": 370 }, { "epoch": 3.074380165289256, "grad_norm": 0.7525384426116943, "learning_rate": 2.673812707343669e-05, "loss": 0.9868, "step": 372 }, { "epoch": 3.090909090909091, "grad_norm": 0.8219236731529236, "learning_rate": 2.5834789435204243e-05, "loss": 1.0349, "step": 374 }, { "epoch": 3.1074380165289255, "grad_norm": 1.2317075729370117, "learning_rate": 2.494470612720725e-05, "loss": 1.0493, "step": 376 }, { "epoch": 3.12396694214876, "grad_norm": 0.9741241931915283, "learning_rate": 2.4068036219419432e-05, "loss": 1.0685, "step": 378 }, { "epoch": 3.1404958677685952, "grad_norm": 0.910832405090332, "learning_rate": 2.3204936384657872e-05, "loss": 1.0191, "step": 380 }, { "epoch": 3.15702479338843, "grad_norm": 0.8739452958106995, "learning_rate": 2.235556087058328e-05, "loss": 1.0511, "step": 382 }, { "epoch": 3.1735537190082646, "grad_norm": 1.0163301229476929, "learning_rate": 2.1520061472133902e-05, "loss": 1.0358, "step": 384 }, { "epoch": 3.190082644628099, "grad_norm": 0.8788334131240845, "learning_rate": 2.069858750439768e-05, "loss": 1.0087, "step": 386 }, { "epoch": 3.206611570247934, "grad_norm": 0.8024322390556335, "learning_rate": 1.9891285775927682e-05, "loss": 0.973, "step": 388 }, { "epoch": 3.2231404958677685, "grad_norm": 1.0479772090911865, "learning_rate": 1.9098300562505266e-05, "loss": 1.0334, "step": 390 }, { "epoch": 3.239669421487603, "grad_norm": 0.8988475203514099, "learning_rate": 1.831977358135625e-05, "loss": 1.0007, "step": 392 }, { "epoch": 3.256198347107438, "grad_norm": 0.9283938407897949, "learning_rate": 1.7555843965823992e-05, "loss": 1.0458, "step": 394 }, { "epoch": 3.2727272727272725, "grad_norm": 1.1304008960723877, "learning_rate": 1.680664824050432e-05, "loss": 1.0623, "step": 396 }, { "epoch": 3.2892561983471076, "grad_norm": 0.9807791113853455, "learning_rate": 1.6072320296846898e-05, "loss": 1.0569, "step": 398 }, { "epoch": 3.3057851239669422, "grad_norm": 1.191261887550354, "learning_rate": 1.5352991369226865e-05, "loss": 0.9944, "step": 400 }, { "epoch": 3.322314049586777, "grad_norm": 1.0459243059158325, "learning_rate": 1.4648790011491542e-05, "loss": 1.1106, "step": 402 }, { "epoch": 3.3388429752066116, "grad_norm": 1.2179893255233765, "learning_rate": 1.3959842073986085e-05, "loss": 0.8982, "step": 404 }, { "epoch": 3.355371900826446, "grad_norm": 1.219767689704895, "learning_rate": 1.3286270681062274e-05, "loss": 1.0193, "step": 406 }, { "epoch": 3.371900826446281, "grad_norm": 1.2486698627471924, "learning_rate": 1.262819620907465e-05, "loss": 0.9492, "step": 408 }, { "epoch": 3.3884297520661155, "grad_norm": 1.2574692964553833, "learning_rate": 1.1985736264867509e-05, "loss": 1.0077, "step": 410 }, { "epoch": 3.4049586776859506, "grad_norm": 1.1906545162200928, "learning_rate": 1.1359005664756994e-05, "loss": 1.014, "step": 412 }, { "epoch": 3.4214876033057853, "grad_norm": 1.3691320419311523, "learning_rate": 1.0748116414011888e-05, "loss": 1.0101, "step": 414 }, { "epoch": 3.43801652892562, "grad_norm": 1.3103783130645752, "learning_rate": 1.0153177686836691e-05, "loss": 1.0953, "step": 416 }, { "epoch": 3.4545454545454546, "grad_norm": 1.1507749557495117, "learning_rate": 9.574295806860767e-06, "loss": 1.0044, "step": 418 }, { "epoch": 3.4710743801652892, "grad_norm": 1.3561640977859497, "learning_rate": 9.011574228136865e-06, "loss": 1.1707, "step": 420 }, { "epoch": 3.487603305785124, "grad_norm": 1.31363046169281, "learning_rate": 8.465113516652424e-06, "loss": 1.0806, "step": 422 }, { "epoch": 3.5041322314049586, "grad_norm": 1.0729997158050537, "learning_rate": 7.935011332357112e-06, "loss": 1.0657, "step": 424 }, { "epoch": 3.5206611570247937, "grad_norm": 1.6132832765579224, "learning_rate": 7.4213624117096755e-06, "loss": 1.0328, "step": 426 }, { "epoch": 3.537190082644628, "grad_norm": 0.8563976287841797, "learning_rate": 6.924258550747154e-06, "loss": 0.9862, "step": 428 }, { "epoch": 3.553719008264463, "grad_norm": 1.1601394414901733, "learning_rate": 6.4437885886798224e-06, "loss": 0.957, "step": 430 }, { "epoch": 3.5702479338842976, "grad_norm": 1.1800570487976074, "learning_rate": 5.980038392014309e-06, "loss": 1.0435, "step": 432 }, { "epoch": 3.5867768595041323, "grad_norm": 1.334128737449646, "learning_rate": 5.533090839208133e-06, "loss": 1.0272, "step": 434 }, { "epoch": 3.603305785123967, "grad_norm": 1.2104339599609375, "learning_rate": 5.103025805858197e-06, "loss": 0.9882, "step": 436 }, { "epoch": 3.6198347107438016, "grad_norm": 1.1799639463424683, "learning_rate": 4.68992015042592e-06, "loss": 1.0944, "step": 438 }, { "epoch": 3.6363636363636362, "grad_norm": 1.5897610187530518, "learning_rate": 4.293847700501585e-06, "loss": 1.0863, "step": 440 }, { "epoch": 3.652892561983471, "grad_norm": 1.3954260349273682, "learning_rate": 3.914879239610392e-06, "loss": 1.0532, "step": 442 }, { "epoch": 3.669421487603306, "grad_norm": 1.1017087697982788, "learning_rate": 3.5530824945623542e-06, "loss": 1.0719, "step": 444 }, { "epoch": 3.6859504132231407, "grad_norm": 0.9582527279853821, "learning_rate": 3.2085221233487562e-06, "loss": 1.0178, "step": 446 }, { "epoch": 3.7024793388429753, "grad_norm": 1.5694618225097656, "learning_rate": 2.8812597035868137e-06, "loss": 1.0657, "step": 448 }, { "epoch": 3.71900826446281, "grad_norm": 0.8071503639221191, "learning_rate": 2.5713537215149132e-06, "loss": 1.0048, "step": 450 }, { "epoch": 3.7355371900826446, "grad_norm": 1.3347952365875244, "learning_rate": 2.2788595615403474e-06, "loss": 1.0384, "step": 452 }, { "epoch": 3.7520661157024793, "grad_norm": 1.4382898807525635, "learning_rate": 2.003829496341325e-06, "loss": 1.0574, "step": 454 }, { "epoch": 3.768595041322314, "grad_norm": 1.1163910627365112, "learning_rate": 1.7463126775252191e-06, "loss": 1.1162, "step": 456 }, { "epoch": 3.785123966942149, "grad_norm": 1.3733882904052734, "learning_rate": 1.5063551268444276e-06, "loss": 1.0597, "step": 458 }, { "epoch": 3.8016528925619832, "grad_norm": 0.9412918090820312, "learning_rate": 1.2839997279717076e-06, "loss": 1.0342, "step": 460 }, { "epoch": 3.8181818181818183, "grad_norm": 1.3428821563720703, "learning_rate": 1.0792862188362396e-06, "loss": 1.0588, "step": 462 }, { "epoch": 3.834710743801653, "grad_norm": 1.326529860496521, "learning_rate": 8.922511845219971e-07, "loss": 1.0857, "step": 464 }, { "epoch": 3.8512396694214877, "grad_norm": 1.1826655864715576, "learning_rate": 7.229280507293657e-07, "loss": 1.0606, "step": 466 }, { "epoch": 3.8677685950413223, "grad_norm": 1.266386866569519, "learning_rate": 5.713470778016538e-07, "loss": 1.0195, "step": 468 }, { "epoch": 3.884297520661157, "grad_norm": 1.2093831300735474, "learning_rate": 4.375353553170647e-07, "loss": 1.0039, "step": 470 }, { "epoch": 3.9008264462809916, "grad_norm": 1.0445263385772705, "learning_rate": 3.2151679724748975e-07, "loss": 1.0612, "step": 472 }, { "epoch": 3.9173553719008263, "grad_norm": 1.331885814666748, "learning_rate": 2.2331213768468363e-07, "loss": 1.1568, "step": 474 }, { "epoch": 3.9338842975206614, "grad_norm": 1.254831075668335, "learning_rate": 1.4293892713486134e-07, "loss": 1.0544, "step": 476 }, { "epoch": 3.950413223140496, "grad_norm": 0.9736830592155457, "learning_rate": 8.041152938216278e-08, "loss": 0.976, "step": 478 }, { "epoch": 3.9669421487603307, "grad_norm": 1.0081415176391602, "learning_rate": 3.5741118921628346e-08, "loss": 1.0152, "step": 480 }, { "epoch": 3.9834710743801653, "grad_norm": 1.2260342836380005, "learning_rate": 8.93567896219638e-09, "loss": 1.0639, "step": 482 }, { "epoch": 4.0, "grad_norm": 3.486053466796875, "learning_rate": 0.0, "loss": 1.1032, "step": 484 } ], "logging_steps": 2, "max_steps": 484, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9157352821358592.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }