| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9925611052072263, | |
| "eval_steps": 500, | |
| "global_step": 7500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002656748140276302, | |
| "grad_norm": 207.1781768798828, | |
| "learning_rate": 1.9982288345731494e-05, | |
| "loss": 6.5351, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.005313496280552604, | |
| "grad_norm": 265.0539855957031, | |
| "learning_rate": 1.9964576691462986e-05, | |
| "loss": 5.2822, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.007970244420828906, | |
| "grad_norm": 1933.7158203125, | |
| "learning_rate": 1.9946865037194475e-05, | |
| "loss": 4.7764, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.010626992561105207, | |
| "grad_norm": 896.5211791992188, | |
| "learning_rate": 1.9929153382925967e-05, | |
| "loss": 4.5617, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.013283740701381509, | |
| "grad_norm": 2147.634765625, | |
| "learning_rate": 1.991144172865746e-05, | |
| "loss": 4.5559, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.015940488841657812, | |
| "grad_norm": 1384.8623046875, | |
| "learning_rate": 1.9893730074388952e-05, | |
| "loss": 4.1671, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.018597236981934114, | |
| "grad_norm": 3381.87060546875, | |
| "learning_rate": 1.987601842012044e-05, | |
| "loss": 3.9988, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.021253985122210415, | |
| "grad_norm": 398.0505676269531, | |
| "learning_rate": 1.985830676585193e-05, | |
| "loss": 4.0844, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.023910733262486716, | |
| "grad_norm": 2040.54736328125, | |
| "learning_rate": 1.9840595111583422e-05, | |
| "loss": 3.9493, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.026567481402763018, | |
| "grad_norm": 8612.021484375, | |
| "learning_rate": 1.9822883457314914e-05, | |
| "loss": 3.6944, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02922422954303932, | |
| "grad_norm": 22271.3125, | |
| "learning_rate": 1.9805171803046406e-05, | |
| "loss": 4.1335, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.031880977683315624, | |
| "grad_norm": 5334.6806640625, | |
| "learning_rate": 1.97874601487779e-05, | |
| "loss": 3.9284, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03453772582359192, | |
| "grad_norm": 1616.4825439453125, | |
| "learning_rate": 1.9769748494509388e-05, | |
| "loss": 3.9407, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03719447396386823, | |
| "grad_norm": 137.30589294433594, | |
| "learning_rate": 1.975203684024088e-05, | |
| "loss": 3.7372, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.039851222104144525, | |
| "grad_norm": 2417.81982421875, | |
| "learning_rate": 1.9734325185972372e-05, | |
| "loss": 3.6944, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04250797024442083, | |
| "grad_norm": 7971.87451171875, | |
| "learning_rate": 1.9716613531703864e-05, | |
| "loss": 3.6615, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.04516471838469713, | |
| "grad_norm": 1645.13916015625, | |
| "learning_rate": 1.9698901877435353e-05, | |
| "loss": 3.4582, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04782146652497343, | |
| "grad_norm": 2899.1162109375, | |
| "learning_rate": 1.9681190223166846e-05, | |
| "loss": 3.4193, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05047821466524974, | |
| "grad_norm": 13782.0908203125, | |
| "learning_rate": 1.9663478568898338e-05, | |
| "loss": 3.577, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.053134962805526036, | |
| "grad_norm": 7818.07177734375, | |
| "learning_rate": 1.964576691462983e-05, | |
| "loss": 3.2082, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05579171094580234, | |
| "grad_norm": 14882.34375, | |
| "learning_rate": 1.962805526036132e-05, | |
| "loss": 3.1947, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05844845908607864, | |
| "grad_norm": 27526.642578125, | |
| "learning_rate": 1.961034360609281e-05, | |
| "loss": 3.23, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.06110520722635494, | |
| "grad_norm": 9511.650390625, | |
| "learning_rate": 1.95926319518243e-05, | |
| "loss": 3.0386, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.06376195536663125, | |
| "grad_norm": 2172.15625, | |
| "learning_rate": 1.9574920297555792e-05, | |
| "loss": 3.1756, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.06641870350690754, | |
| "grad_norm": 11950.30078125, | |
| "learning_rate": 1.9557208643287285e-05, | |
| "loss": 3.36, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06907545164718384, | |
| "grad_norm": 17726.330078125, | |
| "learning_rate": 1.9539496989018777e-05, | |
| "loss": 3.0231, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07173219978746015, | |
| "grad_norm": 4690.27587890625, | |
| "learning_rate": 1.9521785334750266e-05, | |
| "loss": 3.0029, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07438894792773645, | |
| "grad_norm": 40308.61328125, | |
| "learning_rate": 1.9504073680481758e-05, | |
| "loss": 3.688, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07704569606801276, | |
| "grad_norm": 27147.087890625, | |
| "learning_rate": 1.948636202621325e-05, | |
| "loss": 3.3881, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07970244420828905, | |
| "grad_norm": 59977.046875, | |
| "learning_rate": 1.9468650371944743e-05, | |
| "loss": 3.4571, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08235919234856535, | |
| "grad_norm": 66940.046875, | |
| "learning_rate": 1.9450938717676235e-05, | |
| "loss": 3.3864, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.08501594048884166, | |
| "grad_norm": 5094.89013671875, | |
| "learning_rate": 1.9433227063407724e-05, | |
| "loss": 3.3697, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.08767268862911796, | |
| "grad_norm": 4367.36474609375, | |
| "learning_rate": 1.9415515409139216e-05, | |
| "loss": 3.3016, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.09032943676939426, | |
| "grad_norm": 7941.5458984375, | |
| "learning_rate": 1.9397803754870705e-05, | |
| "loss": 3.0374, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.09298618490967056, | |
| "grad_norm": 3960.741943359375, | |
| "learning_rate": 1.9380092100602197e-05, | |
| "loss": 3.3324, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09564293304994687, | |
| "grad_norm": 18565.732421875, | |
| "learning_rate": 1.936238044633369e-05, | |
| "loss": 3.2402, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.09829968119022317, | |
| "grad_norm": 66859.0, | |
| "learning_rate": 1.9344668792065178e-05, | |
| "loss": 3.3142, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.10095642933049948, | |
| "grad_norm": 1521.879638671875, | |
| "learning_rate": 1.932695713779667e-05, | |
| "loss": 3.0546, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.10361317747077577, | |
| "grad_norm": 12662.775390625, | |
| "learning_rate": 1.9309245483528163e-05, | |
| "loss": 3.5396, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.10626992561105207, | |
| "grad_norm": 105807.59375, | |
| "learning_rate": 1.9291533829259655e-05, | |
| "loss": 3.5301, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10892667375132838, | |
| "grad_norm": 663547.875, | |
| "learning_rate": 1.9273822174991147e-05, | |
| "loss": 4.28, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.11158342189160468, | |
| "grad_norm": 8186676.0, | |
| "learning_rate": 1.9256110520722636e-05, | |
| "loss": 5.9807, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.11424017003188097, | |
| "grad_norm": 2142551.25, | |
| "learning_rate": 1.923839886645413e-05, | |
| "loss": 9.4764, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.11689691817215728, | |
| "grad_norm": 366486.1875, | |
| "learning_rate": 1.922068721218562e-05, | |
| "loss": 10.9151, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.11955366631243358, | |
| "grad_norm": 2276693.0, | |
| "learning_rate": 1.9202975557917113e-05, | |
| "loss": 12.5549, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.12221041445270989, | |
| "grad_norm": 2184425.5, | |
| "learning_rate": 1.9185263903648602e-05, | |
| "loss": 13.1915, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.12486716259298619, | |
| "grad_norm": 2937578.75, | |
| "learning_rate": 1.9167552249380094e-05, | |
| "loss": 14.2279, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1275239107332625, | |
| "grad_norm": 10091141.0, | |
| "learning_rate": 1.9149840595111583e-05, | |
| "loss": 13.4766, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1301806588735388, | |
| "grad_norm": 5426885.5, | |
| "learning_rate": 1.9132128940843075e-05, | |
| "loss": 14.8065, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.13283740701381508, | |
| "grad_norm": 2068535.25, | |
| "learning_rate": 1.9114417286574568e-05, | |
| "loss": 16.3781, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.13549415515409138, | |
| "grad_norm": 3599295.0, | |
| "learning_rate": 1.909670563230606e-05, | |
| "loss": 15.1519, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.1381509032943677, | |
| "grad_norm": 761431.875, | |
| "learning_rate": 1.907899397803755e-05, | |
| "loss": 15.1124, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.140807651434644, | |
| "grad_norm": 933641.375, | |
| "learning_rate": 1.906128232376904e-05, | |
| "loss": 14.1038, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1434643995749203, | |
| "grad_norm": 423861.0625, | |
| "learning_rate": 1.9043570669500533e-05, | |
| "loss": 13.7131, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.1461211477151966, | |
| "grad_norm": 5383.50537109375, | |
| "learning_rate": 1.9025859015232026e-05, | |
| "loss": 12.8075, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1487778958554729, | |
| "grad_norm": 3759.12548828125, | |
| "learning_rate": 1.9008147360963514e-05, | |
| "loss": 10.7237, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1514346439957492, | |
| "grad_norm": 2150.089111328125, | |
| "learning_rate": 1.8990435706695007e-05, | |
| "loss": 7.2887, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.15409139213602552, | |
| "grad_norm": 3893.645751953125, | |
| "learning_rate": 1.89727240524265e-05, | |
| "loss": 4.8237, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1567481402763018, | |
| "grad_norm": 11881.3046875, | |
| "learning_rate": 1.895501239815799e-05, | |
| "loss": 3.9525, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.1594048884165781, | |
| "grad_norm": 14820.740234375, | |
| "learning_rate": 1.8937300743889483e-05, | |
| "loss": 4.6401, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1620616365568544, | |
| "grad_norm": 99031.640625, | |
| "learning_rate": 1.8919589089620972e-05, | |
| "loss": 4.9725, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1647183846971307, | |
| "grad_norm": 47882.5859375, | |
| "learning_rate": 1.890187743535246e-05, | |
| "loss": 4.6917, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.16737513283740701, | |
| "grad_norm": 77129.8046875, | |
| "learning_rate": 1.8884165781083953e-05, | |
| "loss": 4.3883, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.17003188097768332, | |
| "grad_norm": 85341.125, | |
| "learning_rate": 1.8866454126815446e-05, | |
| "loss": 5.114, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.17268862911795962, | |
| "grad_norm": 34883.13671875, | |
| "learning_rate": 1.8848742472546938e-05, | |
| "loss": 4.9715, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.17534537725823593, | |
| "grad_norm": 22649.3359375, | |
| "learning_rate": 1.8831030818278427e-05, | |
| "loss": 4.9266, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.17800212539851223, | |
| "grad_norm": 59614.453125, | |
| "learning_rate": 1.881331916400992e-05, | |
| "loss": 4.3894, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.1806588735387885, | |
| "grad_norm": 13419.771484375, | |
| "learning_rate": 1.879560750974141e-05, | |
| "loss": 4.238, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.18331562167906482, | |
| "grad_norm": 26652.462890625, | |
| "learning_rate": 1.8777895855472904e-05, | |
| "loss": 4.5253, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.18597236981934112, | |
| "grad_norm": 37440.6015625, | |
| "learning_rate": 1.8760184201204396e-05, | |
| "loss": 4.0546, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.18862911795961743, | |
| "grad_norm": 43147.1796875, | |
| "learning_rate": 1.8742472546935885e-05, | |
| "loss": 4.4831, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.19128586609989373, | |
| "grad_norm": 143355.296875, | |
| "learning_rate": 1.8724760892667377e-05, | |
| "loss": 4.5257, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.19394261424017004, | |
| "grad_norm": 12484.8466796875, | |
| "learning_rate": 1.870704923839887e-05, | |
| "loss": 4.9662, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.19659936238044634, | |
| "grad_norm": 10305.0126953125, | |
| "learning_rate": 1.868933758413036e-05, | |
| "loss": 5.3629, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.19925611052072265, | |
| "grad_norm": 3247.491943359375, | |
| "learning_rate": 1.867162592986185e-05, | |
| "loss": 5.014, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.20191285866099895, | |
| "grad_norm": 2328.57470703125, | |
| "learning_rate": 1.8653914275593343e-05, | |
| "loss": 4.9864, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.20456960680127523, | |
| "grad_norm": 16007.7978515625, | |
| "learning_rate": 1.863620262132483e-05, | |
| "loss": 4.5492, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.20722635494155153, | |
| "grad_norm": 39521.5078125, | |
| "learning_rate": 1.8618490967056324e-05, | |
| "loss": 4.4608, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.20988310308182784, | |
| "grad_norm": 553922.0, | |
| "learning_rate": 1.8600779312787816e-05, | |
| "loss": 4.9998, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.21253985122210414, | |
| "grad_norm": 623164.25, | |
| "learning_rate": 1.858306765851931e-05, | |
| "loss": 4.6969, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.21519659936238045, | |
| "grad_norm": 849724.3125, | |
| "learning_rate": 1.8565356004250797e-05, | |
| "loss": 5.2992, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.21785334750265675, | |
| "grad_norm": 1883489.125, | |
| "learning_rate": 1.854764434998229e-05, | |
| "loss": 5.5446, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.22051009564293306, | |
| "grad_norm": 1473608.5, | |
| "learning_rate": 1.8529932695713782e-05, | |
| "loss": 5.6081, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.22316684378320936, | |
| "grad_norm": 6046079.5, | |
| "learning_rate": 1.8512221041445274e-05, | |
| "loss": 5.543, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.22582359192348567, | |
| "grad_norm": 3414641.75, | |
| "learning_rate": 1.8494509387176763e-05, | |
| "loss": 6.5477, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.22848034006376194, | |
| "grad_norm": 3107066.0, | |
| "learning_rate": 1.8476797732908255e-05, | |
| "loss": 6.6238, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.23113708820403825, | |
| "grad_norm": 2057658.75, | |
| "learning_rate": 1.8459086078639748e-05, | |
| "loss": 6.6566, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.23379383634431455, | |
| "grad_norm": 689954.125, | |
| "learning_rate": 1.8441374424371236e-05, | |
| "loss": 5.6908, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.23645058448459086, | |
| "grad_norm": 5757.73388671875, | |
| "learning_rate": 1.842366277010273e-05, | |
| "loss": 4.5477, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.23910733262486716, | |
| "grad_norm": 5359.6728515625, | |
| "learning_rate": 1.840595111583422e-05, | |
| "loss": 3.6785, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.24176408076514347, | |
| "grad_norm": 2013.8673095703125, | |
| "learning_rate": 1.838823946156571e-05, | |
| "loss": 3.519, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.24442082890541977, | |
| "grad_norm": 6289.10888671875, | |
| "learning_rate": 1.8370527807297202e-05, | |
| "loss": 3.6842, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.24707757704569608, | |
| "grad_norm": 3089.353759765625, | |
| "learning_rate": 1.8352816153028694e-05, | |
| "loss": 3.6535, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.24973432518597238, | |
| "grad_norm": 2002.3780517578125, | |
| "learning_rate": 1.8335104498760187e-05, | |
| "loss": 3.5385, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.25239107332624866, | |
| "grad_norm": 5194.0224609375, | |
| "learning_rate": 1.8317392844491676e-05, | |
| "loss": 3.4652, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.255047821466525, | |
| "grad_norm": 2200.886962890625, | |
| "learning_rate": 1.8299681190223168e-05, | |
| "loss": 3.6788, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.25770456960680127, | |
| "grad_norm": 10148.009765625, | |
| "learning_rate": 1.828196953595466e-05, | |
| "loss": 3.7478, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.2603613177470776, | |
| "grad_norm": 2540.3837890625, | |
| "learning_rate": 1.8264257881686152e-05, | |
| "loss": 3.4836, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.2630180658873539, | |
| "grad_norm": 2385.15625, | |
| "learning_rate": 1.8246546227417645e-05, | |
| "loss": 3.2733, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.26567481402763016, | |
| "grad_norm": 8635.650390625, | |
| "learning_rate": 1.8228834573149134e-05, | |
| "loss": 3.4935, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2683315621679065, | |
| "grad_norm": 17405.947265625, | |
| "learning_rate": 1.8211122918880626e-05, | |
| "loss": 3.3743, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.27098831030818277, | |
| "grad_norm": 2616.988037109375, | |
| "learning_rate": 1.8193411264612115e-05, | |
| "loss": 4.0444, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.2736450584484591, | |
| "grad_norm": 9487.044921875, | |
| "learning_rate": 1.8175699610343607e-05, | |
| "loss": 3.8644, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.2763018065887354, | |
| "grad_norm": 681.0313110351562, | |
| "learning_rate": 1.81579879560751e-05, | |
| "loss": 3.2198, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.2789585547290117, | |
| "grad_norm": 1654.2945556640625, | |
| "learning_rate": 1.8140276301806588e-05, | |
| "loss": 3.741, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.281615302869288, | |
| "grad_norm": 2555.9970703125, | |
| "learning_rate": 1.812256464753808e-05, | |
| "loss": 3.5377, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.2842720510095643, | |
| "grad_norm": 1187.751220703125, | |
| "learning_rate": 1.8104852993269573e-05, | |
| "loss": 3.6048, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.2869287991498406, | |
| "grad_norm": 2747.8486328125, | |
| "learning_rate": 1.8087141339001065e-05, | |
| "loss": 3.6148, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.2895855472901169, | |
| "grad_norm": 624.16650390625, | |
| "learning_rate": 1.8069429684732557e-05, | |
| "loss": 3.0917, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.2922422954303932, | |
| "grad_norm": 283.41033935546875, | |
| "learning_rate": 1.8051718030464046e-05, | |
| "loss": 3.4423, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2948990435706695, | |
| "grad_norm": 563.9237670898438, | |
| "learning_rate": 1.8034006376195538e-05, | |
| "loss": 3.1134, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.2975557917109458, | |
| "grad_norm": 419.8347473144531, | |
| "learning_rate": 1.801629472192703e-05, | |
| "loss": 3.3765, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3002125398512221, | |
| "grad_norm": 328.199462890625, | |
| "learning_rate": 1.7998583067658523e-05, | |
| "loss": 3.1981, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3028692879914984, | |
| "grad_norm": 1167.4515380859375, | |
| "learning_rate": 1.7980871413390012e-05, | |
| "loss": 2.9826, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3055260361317747, | |
| "grad_norm": 1590.5523681640625, | |
| "learning_rate": 1.7963159759121504e-05, | |
| "loss": 3.2378, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.30818278427205104, | |
| "grad_norm": 1228.88037109375, | |
| "learning_rate": 1.7945448104852993e-05, | |
| "loss": 3.2167, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3108395324123273, | |
| "grad_norm": 866.290283203125, | |
| "learning_rate": 1.7927736450584485e-05, | |
| "loss": 2.9749, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3134962805526036, | |
| "grad_norm": 326.7938537597656, | |
| "learning_rate": 1.7910024796315977e-05, | |
| "loss": 3.111, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3161530286928799, | |
| "grad_norm": 603.0250854492188, | |
| "learning_rate": 1.789231314204747e-05, | |
| "loss": 3.1647, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.3188097768331562, | |
| "grad_norm": 553.5940551757812, | |
| "learning_rate": 1.787460148777896e-05, | |
| "loss": 3.1094, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.32146652497343253, | |
| "grad_norm": 417.6220703125, | |
| "learning_rate": 1.785688983351045e-05, | |
| "loss": 3.195, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3241232731137088, | |
| "grad_norm": 745.7908935546875, | |
| "learning_rate": 1.7839178179241943e-05, | |
| "loss": 2.8119, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.32678002125398514, | |
| "grad_norm": 963.697021484375, | |
| "learning_rate": 1.7821466524973435e-05, | |
| "loss": 2.9828, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3294367693942614, | |
| "grad_norm": 3789.7373046875, | |
| "learning_rate": 1.7803754870704924e-05, | |
| "loss": 2.8971, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.33209351753453775, | |
| "grad_norm": 1777.551025390625, | |
| "learning_rate": 1.7786043216436416e-05, | |
| "loss": 2.8533, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.33475026567481403, | |
| "grad_norm": 725.1536254882812, | |
| "learning_rate": 1.776833156216791e-05, | |
| "loss": 2.6644, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.3374070138150903, | |
| "grad_norm": 2410.62060546875, | |
| "learning_rate": 1.77506199078994e-05, | |
| "loss": 3.058, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.34006376195536664, | |
| "grad_norm": 825.2067260742188, | |
| "learning_rate": 1.7732908253630893e-05, | |
| "loss": 2.7154, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.3427205100956429, | |
| "grad_norm": 835.7099609375, | |
| "learning_rate": 1.7715196599362382e-05, | |
| "loss": 3.5358, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.34537725823591925, | |
| "grad_norm": 2334.035888671875, | |
| "learning_rate": 1.769748494509387e-05, | |
| "loss": 3.2141, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3480340063761955, | |
| "grad_norm": 1089.702392578125, | |
| "learning_rate": 1.7679773290825363e-05, | |
| "loss": 2.8534, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.35069075451647186, | |
| "grad_norm": 643.6981811523438, | |
| "learning_rate": 1.7662061636556856e-05, | |
| "loss": 3.14, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.35334750265674814, | |
| "grad_norm": 927.3551025390625, | |
| "learning_rate": 1.7644349982288348e-05, | |
| "loss": 3.255, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.35600425079702447, | |
| "grad_norm": 642.1421508789062, | |
| "learning_rate": 1.7626638328019837e-05, | |
| "loss": 2.9875, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.35866099893730075, | |
| "grad_norm": 1514.4876708984375, | |
| "learning_rate": 1.760892667375133e-05, | |
| "loss": 2.7786, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.361317747077577, | |
| "grad_norm": 2913.84912109375, | |
| "learning_rate": 1.759121501948282e-05, | |
| "loss": 2.83, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.36397449521785336, | |
| "grad_norm": 1152.3695068359375, | |
| "learning_rate": 1.7573503365214314e-05, | |
| "loss": 3.316, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.36663124335812963, | |
| "grad_norm": 2364.73876953125, | |
| "learning_rate": 1.7555791710945806e-05, | |
| "loss": 3.1473, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.36928799149840597, | |
| "grad_norm": 1560.827392578125, | |
| "learning_rate": 1.7538080056677295e-05, | |
| "loss": 2.875, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.37194473963868224, | |
| "grad_norm": 672.7749633789062, | |
| "learning_rate": 1.7520368402408787e-05, | |
| "loss": 3.3416, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.3746014877789586, | |
| "grad_norm": 3212.583740234375, | |
| "learning_rate": 1.750265674814028e-05, | |
| "loss": 2.6347, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.37725823591923485, | |
| "grad_norm": 9892.419921875, | |
| "learning_rate": 1.7484945093871768e-05, | |
| "loss": 2.9356, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.3799149840595112, | |
| "grad_norm": 13098.6201171875, | |
| "learning_rate": 1.746723343960326e-05, | |
| "loss": 3.0818, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.38257173219978746, | |
| "grad_norm": 33038.46484375, | |
| "learning_rate": 1.7449521785334753e-05, | |
| "loss": 3.4073, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.38522848034006374, | |
| "grad_norm": 58945.421875, | |
| "learning_rate": 1.743181013106624e-05, | |
| "loss": 3.4505, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.38788522848034007, | |
| "grad_norm": 53823.19921875, | |
| "learning_rate": 1.7414098476797734e-05, | |
| "loss": 3.4398, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.39054197662061635, | |
| "grad_norm": 213358.46875, | |
| "learning_rate": 1.7396386822529226e-05, | |
| "loss": 3.1337, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.3931987247608927, | |
| "grad_norm": 174113.078125, | |
| "learning_rate": 1.7378675168260718e-05, | |
| "loss": 3.6872, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.39585547290116896, | |
| "grad_norm": 110265.9609375, | |
| "learning_rate": 1.7360963513992207e-05, | |
| "loss": 3.5268, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.3985122210414453, | |
| "grad_norm": 125626.78125, | |
| "learning_rate": 1.73432518597237e-05, | |
| "loss": 3.8027, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.40116896918172157, | |
| "grad_norm": 119383.8359375, | |
| "learning_rate": 1.7325540205455192e-05, | |
| "loss": 3.6381, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4038257173219979, | |
| "grad_norm": 78246.125, | |
| "learning_rate": 1.7307828551186684e-05, | |
| "loss": 3.6688, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4064824654622742, | |
| "grad_norm": 77016.8671875, | |
| "learning_rate": 1.7290116896918173e-05, | |
| "loss": 3.7796, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.40913921360255046, | |
| "grad_norm": 471759.21875, | |
| "learning_rate": 1.7272405242649665e-05, | |
| "loss": 3.738, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.4117959617428268, | |
| "grad_norm": 108969.1171875, | |
| "learning_rate": 1.7254693588381157e-05, | |
| "loss": 3.4583, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.41445270988310307, | |
| "grad_norm": 44717.91015625, | |
| "learning_rate": 1.7236981934112646e-05, | |
| "loss": 3.0156, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.4171094580233794, | |
| "grad_norm": 56418.765625, | |
| "learning_rate": 1.721927027984414e-05, | |
| "loss": 3.339, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4197662061636557, | |
| "grad_norm": 82086.234375, | |
| "learning_rate": 1.720155862557563e-05, | |
| "loss": 3.2477, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.422422954303932, | |
| "grad_norm": 38437.12890625, | |
| "learning_rate": 1.718384697130712e-05, | |
| "loss": 3.0923, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.4250797024442083, | |
| "grad_norm": 64070.26953125, | |
| "learning_rate": 1.7166135317038612e-05, | |
| "loss": 3.8784, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4277364505844846, | |
| "grad_norm": 96363.0078125, | |
| "learning_rate": 1.7148423662770104e-05, | |
| "loss": 3.1945, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.4303931987247609, | |
| "grad_norm": 101021.7578125, | |
| "learning_rate": 1.7130712008501596e-05, | |
| "loss": 2.9785, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.43304994686503717, | |
| "grad_norm": 33741.50390625, | |
| "learning_rate": 1.7113000354233085e-05, | |
| "loss": 3.0544, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.4357066950053135, | |
| "grad_norm": 18486.07421875, | |
| "learning_rate": 1.7095288699964578e-05, | |
| "loss": 3.3951, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.4383634431455898, | |
| "grad_norm": 141817.4375, | |
| "learning_rate": 1.707757704569607e-05, | |
| "loss": 3.8719, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.4410201912858661, | |
| "grad_norm": 18356.125, | |
| "learning_rate": 1.7059865391427562e-05, | |
| "loss": 3.217, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.4436769394261424, | |
| "grad_norm": 75286.890625, | |
| "learning_rate": 1.7042153737159054e-05, | |
| "loss": 3.2279, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.4463336875664187, | |
| "grad_norm": 93692.8671875, | |
| "learning_rate": 1.7024442082890543e-05, | |
| "loss": 3.3421, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.448990435706695, | |
| "grad_norm": 137171.109375, | |
| "learning_rate": 1.7006730428622032e-05, | |
| "loss": 3.4727, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.45164718384697133, | |
| "grad_norm": 143812.296875, | |
| "learning_rate": 1.6989018774353524e-05, | |
| "loss": 3.24, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.4543039319872476, | |
| "grad_norm": 35345.19921875, | |
| "learning_rate": 1.6971307120085017e-05, | |
| "loss": 3.2903, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.4569606801275239, | |
| "grad_norm": 69917.4375, | |
| "learning_rate": 1.695359546581651e-05, | |
| "loss": 3.1309, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4596174282678002, | |
| "grad_norm": 71451.5859375, | |
| "learning_rate": 1.6935883811547998e-05, | |
| "loss": 3.8151, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4622741764080765, | |
| "grad_norm": 54897.4375, | |
| "learning_rate": 1.691817215727949e-05, | |
| "loss": 3.7961, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.46493092454835283, | |
| "grad_norm": 42574.12109375, | |
| "learning_rate": 1.6900460503010982e-05, | |
| "loss": 3.3018, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.4675876726886291, | |
| "grad_norm": 118568.609375, | |
| "learning_rate": 1.6882748848742475e-05, | |
| "loss": 3.4044, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.47024442082890544, | |
| "grad_norm": 141536.96875, | |
| "learning_rate": 1.6865037194473967e-05, | |
| "loss": 3.5705, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.4729011689691817, | |
| "grad_norm": 153274.9375, | |
| "learning_rate": 1.6847325540205456e-05, | |
| "loss": 3.7034, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.47555791710945805, | |
| "grad_norm": 121872.7890625, | |
| "learning_rate": 1.6829613885936948e-05, | |
| "loss": 3.6836, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.4782146652497343, | |
| "grad_norm": 101665.6640625, | |
| "learning_rate": 1.681190223166844e-05, | |
| "loss": 3.5983, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4808714133900106, | |
| "grad_norm": 212873.5, | |
| "learning_rate": 1.6794190577399933e-05, | |
| "loss": 3.3915, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.48352816153028694, | |
| "grad_norm": 19234.345703125, | |
| "learning_rate": 1.677647892313142e-05, | |
| "loss": 3.1403, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.4861849096705632, | |
| "grad_norm": 126968.46875, | |
| "learning_rate": 1.6758767268862914e-05, | |
| "loss": 3.3559, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.48884165781083955, | |
| "grad_norm": 40483.28515625, | |
| "learning_rate": 1.6741055614594403e-05, | |
| "loss": 3.4042, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.4914984059511158, | |
| "grad_norm": 281826.84375, | |
| "learning_rate": 1.6723343960325895e-05, | |
| "loss": 3.5656, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.49415515409139216, | |
| "grad_norm": 112396.421875, | |
| "learning_rate": 1.6705632306057387e-05, | |
| "loss": 3.5217, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.49681190223166843, | |
| "grad_norm": 430567.96875, | |
| "learning_rate": 1.668792065178888e-05, | |
| "loss": 3.784, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.49946865037194477, | |
| "grad_norm": 19857.708984375, | |
| "learning_rate": 1.667020899752037e-05, | |
| "loss": 3.2844, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.502125398512221, | |
| "grad_norm": 153824.828125, | |
| "learning_rate": 1.665249734325186e-05, | |
| "loss": 3.5734, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5047821466524973, | |
| "grad_norm": 555864.875, | |
| "learning_rate": 1.6634785688983353e-05, | |
| "loss": 3.5042, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5074388947927736, | |
| "grad_norm": 1425396.625, | |
| "learning_rate": 1.6617074034714845e-05, | |
| "loss": 3.8919, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.51009564293305, | |
| "grad_norm": 1588321.5, | |
| "learning_rate": 1.6599362380446334e-05, | |
| "loss": 3.7013, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5127523910733263, | |
| "grad_norm": 843313.25, | |
| "learning_rate": 1.6581650726177826e-05, | |
| "loss": 4.0527, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5154091392136025, | |
| "grad_norm": 121270.0859375, | |
| "learning_rate": 1.656393907190932e-05, | |
| "loss": 3.6732, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5180658873538788, | |
| "grad_norm": 194603.609375, | |
| "learning_rate": 1.654622741764081e-05, | |
| "loss": 3.5416, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5207226354941552, | |
| "grad_norm": 103689.84375, | |
| "learning_rate": 1.65285157633723e-05, | |
| "loss": 3.6058, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.5233793836344315, | |
| "grad_norm": 148743.953125, | |
| "learning_rate": 1.6510804109103792e-05, | |
| "loss": 3.4376, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5260361317747078, | |
| "grad_norm": 23079.94140625, | |
| "learning_rate": 1.649309245483528e-05, | |
| "loss": 3.524, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.528692879914984, | |
| "grad_norm": 12263.953125, | |
| "learning_rate": 1.6475380800566773e-05, | |
| "loss": 3.1242, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5313496280552603, | |
| "grad_norm": 270958.5625, | |
| "learning_rate": 1.6457669146298265e-05, | |
| "loss": 3.8531, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5340063761955367, | |
| "grad_norm": 145561.640625, | |
| "learning_rate": 1.6439957492029758e-05, | |
| "loss": 3.104, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.536663124335813, | |
| "grad_norm": 104717.5625, | |
| "learning_rate": 1.6422245837761247e-05, | |
| "loss": 3.3674, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.5393198724760893, | |
| "grad_norm": 112249.3515625, | |
| "learning_rate": 1.640453418349274e-05, | |
| "loss": 3.2119, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.5419766206163655, | |
| "grad_norm": 131700.71875, | |
| "learning_rate": 1.638682252922423e-05, | |
| "loss": 3.6448, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5446333687566419, | |
| "grad_norm": 119026.4140625, | |
| "learning_rate": 1.6369110874955723e-05, | |
| "loss": 3.0097, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5472901168969182, | |
| "grad_norm": 103121.09375, | |
| "learning_rate": 1.6351399220687216e-05, | |
| "loss": 3.4205, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.5499468650371945, | |
| "grad_norm": 237787.03125, | |
| "learning_rate": 1.6333687566418704e-05, | |
| "loss": 3.349, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5526036131774708, | |
| "grad_norm": 49652.95703125, | |
| "learning_rate": 1.6315975912150197e-05, | |
| "loss": 3.1665, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.555260361317747, | |
| "grad_norm": 262178.34375, | |
| "learning_rate": 1.629826425788169e-05, | |
| "loss": 3.4743, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.5579171094580234, | |
| "grad_norm": 130814.703125, | |
| "learning_rate": 1.6280552603613178e-05, | |
| "loss": 3.4995, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5605738575982997, | |
| "grad_norm": 273671.09375, | |
| "learning_rate": 1.626284094934467e-05, | |
| "loss": 3.3983, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.563230605738576, | |
| "grad_norm": 385060.25, | |
| "learning_rate": 1.6245129295076162e-05, | |
| "loss": 3.5215, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.5658873538788523, | |
| "grad_norm": 165007.71875, | |
| "learning_rate": 1.622741764080765e-05, | |
| "loss": 3.1164, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.5685441020191286, | |
| "grad_norm": 70266.53125, | |
| "learning_rate": 1.6209705986539144e-05, | |
| "loss": 3.1971, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.5712008501594049, | |
| "grad_norm": 271687.3125, | |
| "learning_rate": 1.6191994332270636e-05, | |
| "loss": 3.4932, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5738575982996812, | |
| "grad_norm": 35143.67578125, | |
| "learning_rate": 1.6174282678002128e-05, | |
| "loss": 3.4214, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5765143464399575, | |
| "grad_norm": 1173879.625, | |
| "learning_rate": 1.6156571023733617e-05, | |
| "loss": 3.3194, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.5791710945802337, | |
| "grad_norm": 306067.03125, | |
| "learning_rate": 1.613885936946511e-05, | |
| "loss": 3.1417, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.5818278427205101, | |
| "grad_norm": 342329.0625, | |
| "learning_rate": 1.61211477151966e-05, | |
| "loss": 3.355, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.5844845908607864, | |
| "grad_norm": 50600.97265625, | |
| "learning_rate": 1.6103436060928094e-05, | |
| "loss": 3.3974, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.5871413390010627, | |
| "grad_norm": 360589.03125, | |
| "learning_rate": 1.6085724406659583e-05, | |
| "loss": 3.4514, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.589798087141339, | |
| "grad_norm": 94335.3828125, | |
| "learning_rate": 1.6068012752391075e-05, | |
| "loss": 3.2719, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.5924548352816154, | |
| "grad_norm": 53790.76953125, | |
| "learning_rate": 1.6050301098122564e-05, | |
| "loss": 3.3992, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.5951115834218916, | |
| "grad_norm": 107421.421875, | |
| "learning_rate": 1.6032589443854056e-05, | |
| "loss": 3.3512, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.5977683315621679, | |
| "grad_norm": 142487.859375, | |
| "learning_rate": 1.601487778958555e-05, | |
| "loss": 3.826, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6004250797024442, | |
| "grad_norm": 1261580.75, | |
| "learning_rate": 1.599716613531704e-05, | |
| "loss": 3.7385, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.6030818278427205, | |
| "grad_norm": 648111.0, | |
| "learning_rate": 1.597945448104853e-05, | |
| "loss": 3.2839, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.6057385759829969, | |
| "grad_norm": 326968.125, | |
| "learning_rate": 1.5961742826780022e-05, | |
| "loss": 3.7895, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.6083953241232731, | |
| "grad_norm": 808961.5625, | |
| "learning_rate": 1.5944031172511514e-05, | |
| "loss": 3.7051, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.6110520722635494, | |
| "grad_norm": 2958079.0, | |
| "learning_rate": 1.5926319518243006e-05, | |
| "loss": 3.8099, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6137088204038257, | |
| "grad_norm": 314874.03125, | |
| "learning_rate": 1.5908607863974495e-05, | |
| "loss": 3.6654, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6163655685441021, | |
| "grad_norm": 8078548.0, | |
| "learning_rate": 1.5890896209705987e-05, | |
| "loss": 4.018, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6190223166843783, | |
| "grad_norm": 135695.46875, | |
| "learning_rate": 1.587318455543748e-05, | |
| "loss": 3.6651, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6216790648246546, | |
| "grad_norm": 18501240.0, | |
| "learning_rate": 1.5855472901168972e-05, | |
| "loss": 4.0069, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6243358129649309, | |
| "grad_norm": 4980981.5, | |
| "learning_rate": 1.5837761246900464e-05, | |
| "loss": 3.9174, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6269925611052072, | |
| "grad_norm": 1297274.125, | |
| "learning_rate": 1.5820049592631953e-05, | |
| "loss": 3.3223, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6296493092454836, | |
| "grad_norm": 1378757.625, | |
| "learning_rate": 1.5802337938363442e-05, | |
| "loss": 3.7712, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6323060573857598, | |
| "grad_norm": 2027859.875, | |
| "learning_rate": 1.5784626284094934e-05, | |
| "loss": 3.5468, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.6349628055260361, | |
| "grad_norm": 157107.65625, | |
| "learning_rate": 1.5766914629826427e-05, | |
| "loss": 3.5328, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6376195536663124, | |
| "grad_norm": 1103094.75, | |
| "learning_rate": 1.574920297555792e-05, | |
| "loss": 3.6031, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6402763018065888, | |
| "grad_norm": 725449.5, | |
| "learning_rate": 1.5731491321289408e-05, | |
| "loss": 3.9276, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6429330499468651, | |
| "grad_norm": 214425.640625, | |
| "learning_rate": 1.57137796670209e-05, | |
| "loss": 3.517, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6455897980871413, | |
| "grad_norm": 876419.625, | |
| "learning_rate": 1.5696068012752392e-05, | |
| "loss": 3.4675, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6482465462274176, | |
| "grad_norm": 1504300.25, | |
| "learning_rate": 1.5678356358483884e-05, | |
| "loss": 3.4772, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6509032943676939, | |
| "grad_norm": 144657.71875, | |
| "learning_rate": 1.5660644704215377e-05, | |
| "loss": 3.42, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6535600425079703, | |
| "grad_norm": 371512.40625, | |
| "learning_rate": 1.5642933049946866e-05, | |
| "loss": 3.6802, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6562167906482466, | |
| "grad_norm": 1322714.5, | |
| "learning_rate": 1.5625221395678358e-05, | |
| "loss": 3.805, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.6588735387885228, | |
| "grad_norm": 218897.765625, | |
| "learning_rate": 1.560750974140985e-05, | |
| "loss": 3.252, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.6615302869287991, | |
| "grad_norm": 1596077.0, | |
| "learning_rate": 1.5589798087141342e-05, | |
| "loss": 3.626, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.6641870350690755, | |
| "grad_norm": 2922875.75, | |
| "learning_rate": 1.557208643287283e-05, | |
| "loss": 3.5045, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6668437832093518, | |
| "grad_norm": 96812.5859375, | |
| "learning_rate": 1.5554374778604324e-05, | |
| "loss": 3.7078, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6695005313496281, | |
| "grad_norm": 1580814.125, | |
| "learning_rate": 1.5536663124335812e-05, | |
| "loss": 3.615, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.6721572794899043, | |
| "grad_norm": 235169.53125, | |
| "learning_rate": 1.5518951470067305e-05, | |
| "loss": 3.5076, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.6748140276301806, | |
| "grad_norm": 816632.0, | |
| "learning_rate": 1.5501239815798797e-05, | |
| "loss": 4.0074, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.677470775770457, | |
| "grad_norm": 3783126.5, | |
| "learning_rate": 1.548352816153029e-05, | |
| "loss": 3.7162, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6801275239107333, | |
| "grad_norm": 1676969.875, | |
| "learning_rate": 1.5465816507261778e-05, | |
| "loss": 3.9383, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.6827842720510096, | |
| "grad_norm": 944205.0, | |
| "learning_rate": 1.544810485299327e-05, | |
| "loss": 3.6335, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.6854410201912858, | |
| "grad_norm": 532299.0, | |
| "learning_rate": 1.5430393198724763e-05, | |
| "loss": 3.776, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.6880977683315622, | |
| "grad_norm": 324683.46875, | |
| "learning_rate": 1.5412681544456255e-05, | |
| "loss": 4.0332, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.6907545164718385, | |
| "grad_norm": 371158.6875, | |
| "learning_rate": 1.5394969890187744e-05, | |
| "loss": 3.2831, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6934112646121148, | |
| "grad_norm": 626177.8125, | |
| "learning_rate": 1.5377258235919236e-05, | |
| "loss": 3.7419, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.696068012752391, | |
| "grad_norm": 489480.3125, | |
| "learning_rate": 1.535954658165073e-05, | |
| "loss": 3.9135, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.6987247608926673, | |
| "grad_norm": 840057.5625, | |
| "learning_rate": 1.534183492738222e-05, | |
| "loss": 3.6214, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.7013815090329437, | |
| "grad_norm": 641658.4375, | |
| "learning_rate": 1.532412327311371e-05, | |
| "loss": 3.9029, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.70403825717322, | |
| "grad_norm": 1129191.0, | |
| "learning_rate": 1.5306411618845202e-05, | |
| "loss": 3.6271, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7066950053134963, | |
| "grad_norm": 758676.8125, | |
| "learning_rate": 1.528869996457669e-05, | |
| "loss": 3.8411, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.7093517534537725, | |
| "grad_norm": 946755.25, | |
| "learning_rate": 1.5270988310308183e-05, | |
| "loss": 3.8184, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.7120085015940489, | |
| "grad_norm": 1282365.625, | |
| "learning_rate": 1.5253276656039675e-05, | |
| "loss": 3.8393, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7146652497343252, | |
| "grad_norm": 1212575.875, | |
| "learning_rate": 1.5235565001771166e-05, | |
| "loss": 3.6106, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7173219978746015, | |
| "grad_norm": 2197153.75, | |
| "learning_rate": 1.5217853347502658e-05, | |
| "loss": 3.5554, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7199787460148778, | |
| "grad_norm": 621252.1875, | |
| "learning_rate": 1.520014169323415e-05, | |
| "loss": 3.3832, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.722635494155154, | |
| "grad_norm": 243552.59375, | |
| "learning_rate": 1.5182430038965641e-05, | |
| "loss": 3.4785, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.7252922422954304, | |
| "grad_norm": 3559921.0, | |
| "learning_rate": 1.5164718384697133e-05, | |
| "loss": 3.7972, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7279489904357067, | |
| "grad_norm": 8816077.0, | |
| "learning_rate": 1.5147006730428624e-05, | |
| "loss": 3.6698, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.730605738575983, | |
| "grad_norm": 2959412.0, | |
| "learning_rate": 1.5129295076160116e-05, | |
| "loss": 3.9389, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7332624867162593, | |
| "grad_norm": 13276429.0, | |
| "learning_rate": 1.5111583421891607e-05, | |
| "loss": 3.6811, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7359192348565357, | |
| "grad_norm": 24583468.0, | |
| "learning_rate": 1.5093871767623095e-05, | |
| "loss": 3.9955, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7385759829968119, | |
| "grad_norm": 11388400.0, | |
| "learning_rate": 1.5076160113354588e-05, | |
| "loss": 3.4851, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7412327311370882, | |
| "grad_norm": 2901875.5, | |
| "learning_rate": 1.5058448459086078e-05, | |
| "loss": 4.0118, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7438894792773645, | |
| "grad_norm": 7893670.0, | |
| "learning_rate": 1.504073680481757e-05, | |
| "loss": 4.3674, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7465462274176408, | |
| "grad_norm": 13170602.0, | |
| "learning_rate": 1.5023025150549063e-05, | |
| "loss": 3.5882, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7492029755579172, | |
| "grad_norm": 12720932.0, | |
| "learning_rate": 1.5005313496280553e-05, | |
| "loss": 4.7013, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.7518597236981934, | |
| "grad_norm": 7461363.0, | |
| "learning_rate": 1.4987601842012046e-05, | |
| "loss": 3.5194, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7545164718384697, | |
| "grad_norm": 3747000.25, | |
| "learning_rate": 1.4969890187743536e-05, | |
| "loss": 3.9811, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.757173219978746, | |
| "grad_norm": 2111091.0, | |
| "learning_rate": 1.4952178533475028e-05, | |
| "loss": 3.3212, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7598299681190224, | |
| "grad_norm": 4919647.5, | |
| "learning_rate": 1.4934466879206519e-05, | |
| "loss": 4.0383, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.7624867162592986, | |
| "grad_norm": 3595169.25, | |
| "learning_rate": 1.4916755224938011e-05, | |
| "loss": 3.7293, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.7651434643995749, | |
| "grad_norm": 1647251.75, | |
| "learning_rate": 1.4899043570669502e-05, | |
| "loss": 4.166, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7678002125398512, | |
| "grad_norm": 4398145.0, | |
| "learning_rate": 1.4881331916400994e-05, | |
| "loss": 3.4454, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.7704569606801275, | |
| "grad_norm": 3135213.0, | |
| "learning_rate": 1.4863620262132485e-05, | |
| "loss": 4.0135, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7731137088204039, | |
| "grad_norm": 7072787.0, | |
| "learning_rate": 1.4845908607863975e-05, | |
| "loss": 3.4145, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.7757704569606801, | |
| "grad_norm": 2635511.75, | |
| "learning_rate": 1.4828196953595466e-05, | |
| "loss": 3.8201, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.7784272051009564, | |
| "grad_norm": 4616754.5, | |
| "learning_rate": 1.4810485299326958e-05, | |
| "loss": 4.1764, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.7810839532412327, | |
| "grad_norm": 877153.0, | |
| "learning_rate": 1.4792773645058449e-05, | |
| "loss": 3.9471, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.7837407013815091, | |
| "grad_norm": 569671.3125, | |
| "learning_rate": 1.4775061990789941e-05, | |
| "loss": 3.7697, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.7863974495217854, | |
| "grad_norm": 810236.125, | |
| "learning_rate": 1.4757350336521432e-05, | |
| "loss": 4.4753, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.7890541976620616, | |
| "grad_norm": 877906.875, | |
| "learning_rate": 1.4739638682252924e-05, | |
| "loss": 3.6654, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.7917109458023379, | |
| "grad_norm": 481885.46875, | |
| "learning_rate": 1.4721927027984414e-05, | |
| "loss": 4.1253, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.7943676939426142, | |
| "grad_norm": 1338787.0, | |
| "learning_rate": 1.4704215373715907e-05, | |
| "loss": 4.0294, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.7970244420828906, | |
| "grad_norm": 1250065.875, | |
| "learning_rate": 1.4686503719447397e-05, | |
| "loss": 4.7282, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7996811902231669, | |
| "grad_norm": 1604171.375, | |
| "learning_rate": 1.466879206517889e-05, | |
| "loss": 4.0439, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.8023379383634431, | |
| "grad_norm": 512070.90625, | |
| "learning_rate": 1.4651080410910382e-05, | |
| "loss": 3.5779, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.8049946865037194, | |
| "grad_norm": 312113.46875, | |
| "learning_rate": 1.4633368756641872e-05, | |
| "loss": 3.6514, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.8076514346439958, | |
| "grad_norm": 23779.923828125, | |
| "learning_rate": 1.4615657102373361e-05, | |
| "loss": 3.8136, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.8103081827842721, | |
| "grad_norm": 8204.794921875, | |
| "learning_rate": 1.4597945448104854e-05, | |
| "loss": 4.1336, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.8129649309245484, | |
| "grad_norm": 76479.1640625, | |
| "learning_rate": 1.4580233793836344e-05, | |
| "loss": 3.4411, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.8156216790648246, | |
| "grad_norm": 66624.71875, | |
| "learning_rate": 1.4562522139567836e-05, | |
| "loss": 3.8493, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.8182784272051009, | |
| "grad_norm": 22607.904296875, | |
| "learning_rate": 1.4544810485299327e-05, | |
| "loss": 3.2428, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8209351753453773, | |
| "grad_norm": 119469.640625, | |
| "learning_rate": 1.452709883103082e-05, | |
| "loss": 3.4363, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.8235919234856536, | |
| "grad_norm": 108868.203125, | |
| "learning_rate": 1.4509387176762311e-05, | |
| "loss": 3.5903, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8262486716259299, | |
| "grad_norm": 5543388.0, | |
| "learning_rate": 1.4491675522493802e-05, | |
| "loss": 3.7918, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8289054197662061, | |
| "grad_norm": 2565445.75, | |
| "learning_rate": 1.4473963868225294e-05, | |
| "loss": 3.8573, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.8315621679064825, | |
| "grad_norm": 702086.4375, | |
| "learning_rate": 1.4456252213956785e-05, | |
| "loss": 3.3944, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8342189160467588, | |
| "grad_norm": 115243.6484375, | |
| "learning_rate": 1.4438540559688277e-05, | |
| "loss": 3.2222, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8368756641870351, | |
| "grad_norm": 476268.625, | |
| "learning_rate": 1.4420828905419768e-05, | |
| "loss": 3.6144, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8395324123273114, | |
| "grad_norm": 65992.0, | |
| "learning_rate": 1.440311725115126e-05, | |
| "loss": 3.1891, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8421891604675876, | |
| "grad_norm": 1161863.375, | |
| "learning_rate": 1.438540559688275e-05, | |
| "loss": 3.6714, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.844845908607864, | |
| "grad_norm": 185466.84375, | |
| "learning_rate": 1.4367693942614241e-05, | |
| "loss": 3.4372, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8475026567481403, | |
| "grad_norm": 56940.96875, | |
| "learning_rate": 1.4349982288345732e-05, | |
| "loss": 3.7385, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8501594048884166, | |
| "grad_norm": 99763.78125, | |
| "learning_rate": 1.4332270634077224e-05, | |
| "loss": 3.5612, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8528161530286928, | |
| "grad_norm": 91525.1328125, | |
| "learning_rate": 1.4314558979808715e-05, | |
| "loss": 3.6116, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8554729011689692, | |
| "grad_norm": 23506.251953125, | |
| "learning_rate": 1.4296847325540207e-05, | |
| "loss": 3.4268, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8581296493092455, | |
| "grad_norm": 36794.52734375, | |
| "learning_rate": 1.4279135671271697e-05, | |
| "loss": 3.7912, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8607863974495218, | |
| "grad_norm": 14971.548828125, | |
| "learning_rate": 1.426142401700319e-05, | |
| "loss": 3.7623, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8634431455897981, | |
| "grad_norm": 29957.119140625, | |
| "learning_rate": 1.424371236273468e-05, | |
| "loss": 3.5765, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8660998937300743, | |
| "grad_norm": 24691.1796875, | |
| "learning_rate": 1.4226000708466172e-05, | |
| "loss": 3.4663, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.8687566418703507, | |
| "grad_norm": 21935.2734375, | |
| "learning_rate": 1.4208289054197663e-05, | |
| "loss": 3.6494, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.871413390010627, | |
| "grad_norm": 26350.591796875, | |
| "learning_rate": 1.4190577399929155e-05, | |
| "loss": 3.5611, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.8740701381509033, | |
| "grad_norm": 30286.142578125, | |
| "learning_rate": 1.4172865745660646e-05, | |
| "loss": 3.7046, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.8767268862911796, | |
| "grad_norm": 6965.02734375, | |
| "learning_rate": 1.4155154091392138e-05, | |
| "loss": 3.9012, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.879383634431456, | |
| "grad_norm": 34496.1171875, | |
| "learning_rate": 1.4137442437123627e-05, | |
| "loss": 3.5102, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.8820403825717322, | |
| "grad_norm": 15867.46875, | |
| "learning_rate": 1.411973078285512e-05, | |
| "loss": 3.9485, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.8846971307120085, | |
| "grad_norm": 8408.2509765625, | |
| "learning_rate": 1.410201912858661e-05, | |
| "loss": 4.0955, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.8873538788522848, | |
| "grad_norm": 12868.8935546875, | |
| "learning_rate": 1.4084307474318102e-05, | |
| "loss": 3.8902, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.8900106269925611, | |
| "grad_norm": 39027.8125, | |
| "learning_rate": 1.4066595820049593e-05, | |
| "loss": 3.7809, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8926673751328374, | |
| "grad_norm": 30144.494140625, | |
| "learning_rate": 1.4048884165781085e-05, | |
| "loss": 3.8368, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.8953241232731137, | |
| "grad_norm": 14916.984375, | |
| "learning_rate": 1.4031172511512576e-05, | |
| "loss": 3.8361, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.89798087141339, | |
| "grad_norm": 10657.8974609375, | |
| "learning_rate": 1.4013460857244068e-05, | |
| "loss": 3.9388, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.9006376195536663, | |
| "grad_norm": 20504.70703125, | |
| "learning_rate": 1.399574920297556e-05, | |
| "loss": 4.257, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.9032943676939427, | |
| "grad_norm": 32460.078125, | |
| "learning_rate": 1.397803754870705e-05, | |
| "loss": 4.0817, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.905951115834219, | |
| "grad_norm": 6730.14404296875, | |
| "learning_rate": 1.3960325894438543e-05, | |
| "loss": 4.2065, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.9086078639744952, | |
| "grad_norm": 17531.017578125, | |
| "learning_rate": 1.3942614240170034e-05, | |
| "loss": 3.5729, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.9112646121147715, | |
| "grad_norm": 17859.064453125, | |
| "learning_rate": 1.3924902585901526e-05, | |
| "loss": 4.3419, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.9139213602550478, | |
| "grad_norm": 99839.4296875, | |
| "learning_rate": 1.3907190931633016e-05, | |
| "loss": 3.9653, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.9165781083953242, | |
| "grad_norm": 13036.796875, | |
| "learning_rate": 1.3889479277364505e-05, | |
| "loss": 3.8463, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9192348565356004, | |
| "grad_norm": 54209.05859375, | |
| "learning_rate": 1.3871767623095998e-05, | |
| "loss": 3.9493, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.9218916046758767, | |
| "grad_norm": 227248.34375, | |
| "learning_rate": 1.385405596882749e-05, | |
| "loss": 3.7791, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.924548352816153, | |
| "grad_norm": 856476.3125, | |
| "learning_rate": 1.383634431455898e-05, | |
| "loss": 4.2208, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9272051009564294, | |
| "grad_norm": 373248.40625, | |
| "learning_rate": 1.3818632660290473e-05, | |
| "loss": 4.6665, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.9298618490967057, | |
| "grad_norm": 476773.1875, | |
| "learning_rate": 1.3800921006021963e-05, | |
| "loss": 4.3373, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9325185972369819, | |
| "grad_norm": 3948952.0, | |
| "learning_rate": 1.3783209351753455e-05, | |
| "loss": 3.9872, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.9351753453772582, | |
| "grad_norm": 131342.296875, | |
| "learning_rate": 1.3765497697484946e-05, | |
| "loss": 4.0315, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.9378320935175345, | |
| "grad_norm": 1021533.8125, | |
| "learning_rate": 1.3747786043216438e-05, | |
| "loss": 3.898, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.9404888416578109, | |
| "grad_norm": 70664288.0, | |
| "learning_rate": 1.3730074388947929e-05, | |
| "loss": 4.0261, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9431455897980872, | |
| "grad_norm": 1955257.25, | |
| "learning_rate": 1.3712362734679421e-05, | |
| "loss": 3.9837, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9458023379383634, | |
| "grad_norm": 10510368.0, | |
| "learning_rate": 1.3694651080410912e-05, | |
| "loss": 4.2089, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9484590860786397, | |
| "grad_norm": 4540049.0, | |
| "learning_rate": 1.3676939426142404e-05, | |
| "loss": 4.0757, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9511158342189161, | |
| "grad_norm": 1934832.5, | |
| "learning_rate": 1.3659227771873893e-05, | |
| "loss": 3.8116, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9537725823591924, | |
| "grad_norm": 721523.875, | |
| "learning_rate": 1.3641516117605385e-05, | |
| "loss": 3.8604, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.9564293304994687, | |
| "grad_norm": 3694456.5, | |
| "learning_rate": 1.3623804463336876e-05, | |
| "loss": 4.3438, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9590860786397449, | |
| "grad_norm": 4130751.5, | |
| "learning_rate": 1.3606092809068368e-05, | |
| "loss": 3.7722, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9617428267800212, | |
| "grad_norm": 3232915.5, | |
| "learning_rate": 1.3588381154799859e-05, | |
| "loss": 4.1108, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.9643995749202976, | |
| "grad_norm": 5608699.5, | |
| "learning_rate": 1.357066950053135e-05, | |
| "loss": 4.4695, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.9670563230605739, | |
| "grad_norm": 37526024.0, | |
| "learning_rate": 1.3552957846262841e-05, | |
| "loss": 3.8838, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.9697130712008502, | |
| "grad_norm": 11544401.0, | |
| "learning_rate": 1.3535246191994334e-05, | |
| "loss": 3.7949, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9723698193411264, | |
| "grad_norm": 1559264.75, | |
| "learning_rate": 1.3517534537725824e-05, | |
| "loss": 3.8236, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.9750265674814028, | |
| "grad_norm": 10817994.0, | |
| "learning_rate": 1.3499822883457316e-05, | |
| "loss": 4.0035, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.9776833156216791, | |
| "grad_norm": 20268342.0, | |
| "learning_rate": 1.3482111229188807e-05, | |
| "loss": 3.6612, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.9803400637619554, | |
| "grad_norm": 51181968.0, | |
| "learning_rate": 1.34643995749203e-05, | |
| "loss": 3.7019, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.9829968119022316, | |
| "grad_norm": 74098400.0, | |
| "learning_rate": 1.3446687920651792e-05, | |
| "loss": 3.779, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9856535600425079, | |
| "grad_norm": 48340468.0, | |
| "learning_rate": 1.3428976266383282e-05, | |
| "loss": 3.6759, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.9883103081827843, | |
| "grad_norm": 8802756.0, | |
| "learning_rate": 1.3411264612114771e-05, | |
| "loss": 3.6238, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.9909670563230606, | |
| "grad_norm": 3833086.75, | |
| "learning_rate": 1.3393552957846263e-05, | |
| "loss": 3.3759, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.9936238044633369, | |
| "grad_norm": 29499648.0, | |
| "learning_rate": 1.3375841303577754e-05, | |
| "loss": 3.6134, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.9962805526036131, | |
| "grad_norm": 6612167.0, | |
| "learning_rate": 1.3358129649309246e-05, | |
| "loss": 3.5491, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9989373007438895, | |
| "grad_norm": 21236494.0, | |
| "learning_rate": 1.3340417995040737e-05, | |
| "loss": 3.7831, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 3.75178599357605, | |
| "eval_runtime": 744.4128, | |
| "eval_samples_per_second": 20.225, | |
| "eval_steps_per_second": 5.056, | |
| "step": 3764 | |
| }, | |
| { | |
| "epoch": 1.0015940488841657, | |
| "grad_norm": 40179844.0, | |
| "learning_rate": 1.3322706340772229e-05, | |
| "loss": 3.711, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.004250797024442, | |
| "grad_norm": 17010662.0, | |
| "learning_rate": 1.3304994686503721e-05, | |
| "loss": 3.4946, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.0069075451647185, | |
| "grad_norm": 19932106.0, | |
| "learning_rate": 1.3287283032235212e-05, | |
| "loss": 3.5648, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.0095642933049946, | |
| "grad_norm": 5492312.0, | |
| "learning_rate": 1.3269571377966704e-05, | |
| "loss": 4.0635, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.012221041445271, | |
| "grad_norm": 192937568.0, | |
| "learning_rate": 1.3251859723698195e-05, | |
| "loss": 3.4178, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.0148777895855472, | |
| "grad_norm": 1293443.125, | |
| "learning_rate": 1.3234148069429687e-05, | |
| "loss": 3.9658, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.0175345377258236, | |
| "grad_norm": 158162096.0, | |
| "learning_rate": 1.3216436415161178e-05, | |
| "loss": 3.6695, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.0201912858661, | |
| "grad_norm": 207503072.0, | |
| "learning_rate": 1.319872476089267e-05, | |
| "loss": 4.1104, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.0228480340063761, | |
| "grad_norm": 5859501.0, | |
| "learning_rate": 1.3181013106624159e-05, | |
| "loss": 3.7423, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.0255047821466525, | |
| "grad_norm": 65099376.0, | |
| "learning_rate": 1.3163301452355651e-05, | |
| "loss": 3.8122, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.0281615302869287, | |
| "grad_norm": 13768734.0, | |
| "learning_rate": 1.3145589798087142e-05, | |
| "loss": 3.8062, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.030818278427205, | |
| "grad_norm": 24830612.0, | |
| "learning_rate": 1.3127878143818634e-05, | |
| "loss": 3.5577, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.0334750265674815, | |
| "grad_norm": 109977040.0, | |
| "learning_rate": 1.3110166489550124e-05, | |
| "loss": 3.8904, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.0361317747077576, | |
| "grad_norm": 22621510.0, | |
| "learning_rate": 1.3092454835281617e-05, | |
| "loss": 3.7924, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.038788522848034, | |
| "grad_norm": 15618693.0, | |
| "learning_rate": 1.3074743181013107e-05, | |
| "loss": 3.9009, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.0414452709883104, | |
| "grad_norm": 102296992.0, | |
| "learning_rate": 1.30570315267446e-05, | |
| "loss": 4.0488, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.0441020191285866, | |
| "grad_norm": 180104320.0, | |
| "learning_rate": 1.303931987247609e-05, | |
| "loss": 4.0832, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.046758767268863, | |
| "grad_norm": 8426886.0, | |
| "learning_rate": 1.3021608218207582e-05, | |
| "loss": 3.9811, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.0494155154091391, | |
| "grad_norm": 23817282.0, | |
| "learning_rate": 1.3003896563939073e-05, | |
| "loss": 3.5573, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.0520722635494155, | |
| "grad_norm": 34805012.0, | |
| "learning_rate": 1.2986184909670565e-05, | |
| "loss": 3.6933, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.054729011689692, | |
| "grad_norm": 27546222.0, | |
| "learning_rate": 1.2968473255402056e-05, | |
| "loss": 3.826, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.057385759829968, | |
| "grad_norm": 73101112.0, | |
| "learning_rate": 1.2950761601133548e-05, | |
| "loss": 4.3474, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.0600425079702445, | |
| "grad_norm": 60012056.0, | |
| "learning_rate": 1.2933049946865037e-05, | |
| "loss": 3.4645, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.0626992561105206, | |
| "grad_norm": 10204493.0, | |
| "learning_rate": 1.2915338292596529e-05, | |
| "loss": 3.7942, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.065356004250797, | |
| "grad_norm": 67629928.0, | |
| "learning_rate": 1.289762663832802e-05, | |
| "loss": 3.6377, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.0680127523910734, | |
| "grad_norm": 31746526.0, | |
| "learning_rate": 1.2879914984059512e-05, | |
| "loss": 3.7846, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.0706695005313496, | |
| "grad_norm": 52992448.0, | |
| "learning_rate": 1.2862203329791003e-05, | |
| "loss": 3.2981, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.073326248671626, | |
| "grad_norm": 36022592.0, | |
| "learning_rate": 1.2844491675522495e-05, | |
| "loss": 3.6733, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.0759829968119021, | |
| "grad_norm": 11422725.0, | |
| "learning_rate": 1.2826780021253985e-05, | |
| "loss": 3.5682, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.0786397449521785, | |
| "grad_norm": 77457192.0, | |
| "learning_rate": 1.2809068366985478e-05, | |
| "loss": 3.8538, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.081296493092455, | |
| "grad_norm": 109772792.0, | |
| "learning_rate": 1.279135671271697e-05, | |
| "loss": 4.0151, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.083953241232731, | |
| "grad_norm": 126942304.0, | |
| "learning_rate": 1.277364505844846e-05, | |
| "loss": 4.418, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.0866099893730075, | |
| "grad_norm": 215005632.0, | |
| "learning_rate": 1.2755933404179953e-05, | |
| "loss": 3.6302, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.0892667375132838, | |
| "grad_norm": 18895672.0, | |
| "learning_rate": 1.2738221749911443e-05, | |
| "loss": 4.2548, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.09192348565356, | |
| "grad_norm": 20576284.0, | |
| "learning_rate": 1.2720510095642936e-05, | |
| "loss": 3.9913, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.0945802337938364, | |
| "grad_norm": 90564424.0, | |
| "learning_rate": 1.2702798441374424e-05, | |
| "loss": 3.8335, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.0972369819341126, | |
| "grad_norm": 136458144.0, | |
| "learning_rate": 1.2685086787105915e-05, | |
| "loss": 4.0485, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.099893730074389, | |
| "grad_norm": 175102016.0, | |
| "learning_rate": 1.2667375132837407e-05, | |
| "loss": 4.1181, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.1025504782146653, | |
| "grad_norm": 15060149.0, | |
| "learning_rate": 1.26496634785689e-05, | |
| "loss": 3.753, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.1052072263549415, | |
| "grad_norm": 92020808.0, | |
| "learning_rate": 1.263195182430039e-05, | |
| "loss": 3.9935, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.107863974495218, | |
| "grad_norm": 133574952.0, | |
| "learning_rate": 1.2614240170031882e-05, | |
| "loss": 4.0376, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.110520722635494, | |
| "grad_norm": 69448336.0, | |
| "learning_rate": 1.2596528515763373e-05, | |
| "loss": 3.7264, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.1131774707757705, | |
| "grad_norm": 24695358.0, | |
| "learning_rate": 1.2578816861494865e-05, | |
| "loss": 3.6435, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.1158342189160468, | |
| "grad_norm": 26981000.0, | |
| "learning_rate": 1.2561105207226356e-05, | |
| "loss": 4.1867, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.118490967056323, | |
| "grad_norm": 26429450.0, | |
| "learning_rate": 1.2543393552957848e-05, | |
| "loss": 4.2308, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.1211477151965994, | |
| "grad_norm": 75864056.0, | |
| "learning_rate": 1.2525681898689339e-05, | |
| "loss": 4.1067, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.1238044633368758, | |
| "grad_norm": 53176204.0, | |
| "learning_rate": 1.2507970244420831e-05, | |
| "loss": 4.3122, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.126461211477152, | |
| "grad_norm": 27715404.0, | |
| "learning_rate": 1.2490258590152322e-05, | |
| "loss": 4.0918, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.1291179596174283, | |
| "grad_norm": 6029370.0, | |
| "learning_rate": 1.2472546935883814e-05, | |
| "loss": 4.1725, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.1317747077577045, | |
| "grad_norm": 26051718.0, | |
| "learning_rate": 1.2454835281615303e-05, | |
| "loss": 3.9757, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.134431455897981, | |
| "grad_norm": 77973728.0, | |
| "learning_rate": 1.2437123627346795e-05, | |
| "loss": 3.989, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.1370882040382573, | |
| "grad_norm": 11366385.0, | |
| "learning_rate": 1.2419411973078286e-05, | |
| "loss": 4.3978, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.1397449521785334, | |
| "grad_norm": 19926490.0, | |
| "learning_rate": 1.2401700318809778e-05, | |
| "loss": 3.7446, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.1424017003188098, | |
| "grad_norm": 66211068.0, | |
| "learning_rate": 1.2383988664541268e-05, | |
| "loss": 3.9591, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.145058448459086, | |
| "grad_norm": 7617592.5, | |
| "learning_rate": 1.236627701027276e-05, | |
| "loss": 4.2812, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.1477151965993624, | |
| "grad_norm": 47218612.0, | |
| "learning_rate": 1.2348565356004251e-05, | |
| "loss": 4.137, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.1503719447396388, | |
| "grad_norm": 115950944.0, | |
| "learning_rate": 1.2330853701735743e-05, | |
| "loss": 4.1344, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.153028692879915, | |
| "grad_norm": 27328380.0, | |
| "learning_rate": 1.2313142047467234e-05, | |
| "loss": 4.0865, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.1556854410201913, | |
| "grad_norm": 8267316.5, | |
| "learning_rate": 1.2295430393198726e-05, | |
| "loss": 4.3048, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.1583421891604675, | |
| "grad_norm": 18654644.0, | |
| "learning_rate": 1.2277718738930217e-05, | |
| "loss": 4.4512, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.1609989373007439, | |
| "grad_norm": 123494120.0, | |
| "learning_rate": 1.2260007084661709e-05, | |
| "loss": 4.1863, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.1636556854410203, | |
| "grad_norm": 87930224.0, | |
| "learning_rate": 1.2242295430393201e-05, | |
| "loss": 4.1395, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.1663124335812964, | |
| "grad_norm": 60926568.0, | |
| "learning_rate": 1.222458377612469e-05, | |
| "loss": 3.9975, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.1689691817215728, | |
| "grad_norm": 15561844.0, | |
| "learning_rate": 1.2206872121856181e-05, | |
| "loss": 4.1746, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.171625929861849, | |
| "grad_norm": 14337786.0, | |
| "learning_rate": 1.2189160467587673e-05, | |
| "loss": 4.0762, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.1742826780021254, | |
| "grad_norm": 27260074.0, | |
| "learning_rate": 1.2171448813319164e-05, | |
| "loss": 4.3436, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.1769394261424018, | |
| "grad_norm": 14445331.0, | |
| "learning_rate": 1.2153737159050656e-05, | |
| "loss": 3.9788, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.179596174282678, | |
| "grad_norm": 21041896.0, | |
| "learning_rate": 1.2136025504782147e-05, | |
| "loss": 4.3681, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.1822529224229543, | |
| "grad_norm": 15333385.0, | |
| "learning_rate": 1.2118313850513639e-05, | |
| "loss": 4.1638, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.1849096705632305, | |
| "grad_norm": 18882606.0, | |
| "learning_rate": 1.2100602196245131e-05, | |
| "loss": 3.9175, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.1875664187035069, | |
| "grad_norm": 6002330.5, | |
| "learning_rate": 1.2082890541976622e-05, | |
| "loss": 4.0274, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.1902231668437833, | |
| "grad_norm": 12174502.0, | |
| "learning_rate": 1.2065178887708114e-05, | |
| "loss": 4.0163, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.1928799149840594, | |
| "grad_norm": 3046521.75, | |
| "learning_rate": 1.2047467233439604e-05, | |
| "loss": 4.2218, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.1955366631243358, | |
| "grad_norm": 7046191.0, | |
| "learning_rate": 1.2029755579171097e-05, | |
| "loss": 3.8047, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1981934112646122, | |
| "grad_norm": 2158310.5, | |
| "learning_rate": 1.2012043924902587e-05, | |
| "loss": 4.0102, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.2008501594048884, | |
| "grad_norm": 1953139.875, | |
| "learning_rate": 1.199433227063408e-05, | |
| "loss": 3.9815, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.2035069075451648, | |
| "grad_norm": 10403948.0, | |
| "learning_rate": 1.1976620616365568e-05, | |
| "loss": 4.2106, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.206163655685441, | |
| "grad_norm": 1701127.5, | |
| "learning_rate": 1.195890896209706e-05, | |
| "loss": 4.1719, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.2088204038257173, | |
| "grad_norm": 1922839.625, | |
| "learning_rate": 1.1941197307828551e-05, | |
| "loss": 4.2, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.2114771519659937, | |
| "grad_norm": 1249251.375, | |
| "learning_rate": 1.1923485653560044e-05, | |
| "loss": 4.3854, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.2141339001062699, | |
| "grad_norm": 3677515.25, | |
| "learning_rate": 1.1905773999291534e-05, | |
| "loss": 4.1928, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.2167906482465463, | |
| "grad_norm": 1778515.5, | |
| "learning_rate": 1.1888062345023026e-05, | |
| "loss": 4.282, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.2194473963868226, | |
| "grad_norm": 2142989.75, | |
| "learning_rate": 1.1870350690754517e-05, | |
| "loss": 4.0862, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.2221041445270988, | |
| "grad_norm": 3376149.5, | |
| "learning_rate": 1.185263903648601e-05, | |
| "loss": 4.9249, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.2247608926673752, | |
| "grad_norm": 918137.0625, | |
| "learning_rate": 1.18349273822175e-05, | |
| "loss": 4.4397, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.2274176408076514, | |
| "grad_norm": 5548887.5, | |
| "learning_rate": 1.1817215727948992e-05, | |
| "loss": 4.186, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.2300743889479278, | |
| "grad_norm": 1206121.0, | |
| "learning_rate": 1.1799504073680483e-05, | |
| "loss": 4.4369, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.2327311370882041, | |
| "grad_norm": 1302905.0, | |
| "learning_rate": 1.1781792419411975e-05, | |
| "loss": 4.2492, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.2353878852284803, | |
| "grad_norm": 1243181.25, | |
| "learning_rate": 1.1764080765143466e-05, | |
| "loss": 4.3557, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.2380446333687567, | |
| "grad_norm": 1636811.25, | |
| "learning_rate": 1.1746369110874956e-05, | |
| "loss": 4.4305, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.2407013815090329, | |
| "grad_norm": 3252745.75, | |
| "learning_rate": 1.1728657456606447e-05, | |
| "loss": 4.4447, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.2433581296493093, | |
| "grad_norm": 3218180.0, | |
| "learning_rate": 1.1710945802337939e-05, | |
| "loss": 4.1695, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.2460148777895856, | |
| "grad_norm": 7251921.5, | |
| "learning_rate": 1.169323414806943e-05, | |
| "loss": 4.0679, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.2486716259298618, | |
| "grad_norm": 3886631.0, | |
| "learning_rate": 1.1675522493800922e-05, | |
| "loss": 3.9159, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.2513283740701382, | |
| "grad_norm": 2420017.75, | |
| "learning_rate": 1.1657810839532412e-05, | |
| "loss": 4.6458, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.2539851222104144, | |
| "grad_norm": 1138159.875, | |
| "learning_rate": 1.1640099185263905e-05, | |
| "loss": 4.078, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.2566418703506907, | |
| "grad_norm": 930125.875, | |
| "learning_rate": 1.1622387530995395e-05, | |
| "loss": 4.0812, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.2592986184909671, | |
| "grad_norm": 3835148.25, | |
| "learning_rate": 1.1604675876726887e-05, | |
| "loss": 4.1012, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.2619553666312433, | |
| "grad_norm": 6243373.5, | |
| "learning_rate": 1.158696422245838e-05, | |
| "loss": 3.8252, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.2646121147715197, | |
| "grad_norm": 3021652.25, | |
| "learning_rate": 1.156925256818987e-05, | |
| "loss": 3.9515, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.2672688629117959, | |
| "grad_norm": 4503118.5, | |
| "learning_rate": 1.1551540913921363e-05, | |
| "loss": 4.0478, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.2699256110520722, | |
| "grad_norm": 5867597.5, | |
| "learning_rate": 1.1533829259652853e-05, | |
| "loss": 4.0726, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.2725823591923486, | |
| "grad_norm": 23690828.0, | |
| "learning_rate": 1.1516117605384345e-05, | |
| "loss": 3.5037, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.2752391073326248, | |
| "grad_norm": 5260964.5, | |
| "learning_rate": 1.1498405951115834e-05, | |
| "loss": 4.0774, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.2778958554729012, | |
| "grad_norm": 4894551.5, | |
| "learning_rate": 1.1480694296847325e-05, | |
| "loss": 3.7113, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.2805526036131774, | |
| "grad_norm": 4784902.0, | |
| "learning_rate": 1.1462982642578817e-05, | |
| "loss": 3.886, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.2832093517534537, | |
| "grad_norm": 22511842.0, | |
| "learning_rate": 1.144527098831031e-05, | |
| "loss": 3.7413, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.2858660998937301, | |
| "grad_norm": 13445524.0, | |
| "learning_rate": 1.14275593340418e-05, | |
| "loss": 4.3171, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.2885228480340063, | |
| "grad_norm": 4879641.0, | |
| "learning_rate": 1.1409847679773292e-05, | |
| "loss": 4.0366, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.2911795961742827, | |
| "grad_norm": 5458451.0, | |
| "learning_rate": 1.1392136025504783e-05, | |
| "loss": 4.0356, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.2938363443145589, | |
| "grad_norm": 1152951.125, | |
| "learning_rate": 1.1374424371236275e-05, | |
| "loss": 3.9322, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.2964930924548352, | |
| "grad_norm": 1573109.875, | |
| "learning_rate": 1.1356712716967766e-05, | |
| "loss": 3.5684, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.2991498405951116, | |
| "grad_norm": 3557934.25, | |
| "learning_rate": 1.1339001062699258e-05, | |
| "loss": 3.8874, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.301806588735388, | |
| "grad_norm": 2637183.5, | |
| "learning_rate": 1.1321289408430748e-05, | |
| "loss": 4.0737, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.3044633368756642, | |
| "grad_norm": 1852644.25, | |
| "learning_rate": 1.130357775416224e-05, | |
| "loss": 4.4462, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.3071200850159406, | |
| "grad_norm": 7577384.5, | |
| "learning_rate": 1.1285866099893731e-05, | |
| "loss": 3.8546, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.3097768331562167, | |
| "grad_norm": 4401453.5, | |
| "learning_rate": 1.1268154445625224e-05, | |
| "loss": 4.0443, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.3124335812964931, | |
| "grad_norm": 3643839.75, | |
| "learning_rate": 1.1250442791356712e-05, | |
| "loss": 3.678, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.3150903294367695, | |
| "grad_norm": 27145024.0, | |
| "learning_rate": 1.1232731137088205e-05, | |
| "loss": 3.8589, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.3177470775770457, | |
| "grad_norm": 1982266.875, | |
| "learning_rate": 1.1215019482819695e-05, | |
| "loss": 3.587, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.320403825717322, | |
| "grad_norm": 2339293.25, | |
| "learning_rate": 1.1197307828551188e-05, | |
| "loss": 3.6116, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.3230605738575982, | |
| "grad_norm": 21441204.0, | |
| "learning_rate": 1.1179596174282678e-05, | |
| "loss": 3.4365, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.3257173219978746, | |
| "grad_norm": 3329228.0, | |
| "learning_rate": 1.116188452001417e-05, | |
| "loss": 4.184, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.328374070138151, | |
| "grad_norm": 2602702.75, | |
| "learning_rate": 1.1144172865745661e-05, | |
| "loss": 3.6095, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.3310308182784272, | |
| "grad_norm": 62917268.0, | |
| "learning_rate": 1.1126461211477153e-05, | |
| "loss": 3.4086, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.3336875664187036, | |
| "grad_norm": 9320738.0, | |
| "learning_rate": 1.1108749557208644e-05, | |
| "loss": 3.8485, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.3363443145589797, | |
| "grad_norm": 11171778.0, | |
| "learning_rate": 1.1091037902940136e-05, | |
| "loss": 3.5241, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.3390010626992561, | |
| "grad_norm": 13504690.0, | |
| "learning_rate": 1.1073326248671628e-05, | |
| "loss": 3.7951, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.3416578108395325, | |
| "grad_norm": 1940023.625, | |
| "learning_rate": 1.1055614594403119e-05, | |
| "loss": 3.938, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.3443145589798087, | |
| "grad_norm": 9250230.0, | |
| "learning_rate": 1.1037902940134611e-05, | |
| "loss": 3.6501, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.346971307120085, | |
| "grad_norm": 8658494.0, | |
| "learning_rate": 1.10201912858661e-05, | |
| "loss": 3.4101, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.3496280552603612, | |
| "grad_norm": 24788584.0, | |
| "learning_rate": 1.100247963159759e-05, | |
| "loss": 3.2665, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.3522848034006376, | |
| "grad_norm": 17288262.0, | |
| "learning_rate": 1.0984767977329083e-05, | |
| "loss": 3.9485, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.354941551540914, | |
| "grad_norm": 1679803.0, | |
| "learning_rate": 1.0967056323060574e-05, | |
| "loss": 3.7726, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.3575982996811902, | |
| "grad_norm": 14593549.0, | |
| "learning_rate": 1.0949344668792066e-05, | |
| "loss": 4.0024, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.3602550478214666, | |
| "grad_norm": 4186409.75, | |
| "learning_rate": 1.0931633014523556e-05, | |
| "loss": 3.6818, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.3629117959617427, | |
| "grad_norm": 747755.5625, | |
| "learning_rate": 1.0913921360255049e-05, | |
| "loss": 3.4717, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.365568544102019, | |
| "grad_norm": 445103.3125, | |
| "learning_rate": 1.0896209705986541e-05, | |
| "loss": 3.4684, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.3682252922422955, | |
| "grad_norm": 1250102.625, | |
| "learning_rate": 1.0878498051718031e-05, | |
| "loss": 3.2248, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.3708820403825717, | |
| "grad_norm": 532045.3125, | |
| "learning_rate": 1.0860786397449524e-05, | |
| "loss": 3.3662, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.373538788522848, | |
| "grad_norm": 454849.5625, | |
| "learning_rate": 1.0843074743181014e-05, | |
| "loss": 3.5507, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.3761955366631242, | |
| "grad_norm": 3551179.5, | |
| "learning_rate": 1.0825363088912507e-05, | |
| "loss": 3.2755, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.3788522848034006, | |
| "grad_norm": 6700418.0, | |
| "learning_rate": 1.0807651434643997e-05, | |
| "loss": 3.2751, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.381509032943677, | |
| "grad_norm": 37462192.0, | |
| "learning_rate": 1.078993978037549e-05, | |
| "loss": 3.5327, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.3841657810839532, | |
| "grad_norm": 9333666.0, | |
| "learning_rate": 1.0772228126106978e-05, | |
| "loss": 3.1278, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.3868225292242295, | |
| "grad_norm": 16026876.0, | |
| "learning_rate": 1.075451647183847e-05, | |
| "loss": 3.5275, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.3894792773645057, | |
| "grad_norm": 24360552.0, | |
| "learning_rate": 1.0736804817569961e-05, | |
| "loss": 3.6815, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.392136025504782, | |
| "grad_norm": 12289483.0, | |
| "learning_rate": 1.0719093163301453e-05, | |
| "loss": 3.1039, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.3947927736450585, | |
| "grad_norm": 1954500.625, | |
| "learning_rate": 1.0701381509032944e-05, | |
| "loss": 3.3327, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.3974495217853349, | |
| "grad_norm": 5957172.5, | |
| "learning_rate": 1.0683669854764436e-05, | |
| "loss": 3.6985, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.400106269925611, | |
| "grad_norm": 136582976.0, | |
| "learning_rate": 1.0665958200495927e-05, | |
| "loss": 3.4845, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.4027630180658874, | |
| "grad_norm": 21799228.0, | |
| "learning_rate": 1.0648246546227419e-05, | |
| "loss": 3.4648, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.4054197662061636, | |
| "grad_norm": 1183856.625, | |
| "learning_rate": 1.063053489195891e-05, | |
| "loss": 3.2929, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.40807651434644, | |
| "grad_norm": 28349394.0, | |
| "learning_rate": 1.0612823237690402e-05, | |
| "loss": 3.611, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.4107332624867164, | |
| "grad_norm": 1230487.75, | |
| "learning_rate": 1.0595111583421892e-05, | |
| "loss": 3.0602, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.4133900106269925, | |
| "grad_norm": 29549574.0, | |
| "learning_rate": 1.0577399929153385e-05, | |
| "loss": 3.6129, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.416046758767269, | |
| "grad_norm": 65607896.0, | |
| "learning_rate": 1.0559688274884875e-05, | |
| "loss": 3.305, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.418703506907545, | |
| "grad_norm": 21593944.0, | |
| "learning_rate": 1.0541976620616366e-05, | |
| "loss": 4.182, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.4213602550478215, | |
| "grad_norm": 9913192.0, | |
| "learning_rate": 1.0524264966347856e-05, | |
| "loss": 3.333, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.4240170031880979, | |
| "grad_norm": 5600408.5, | |
| "learning_rate": 1.0506553312079349e-05, | |
| "loss": 3.2001, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.426673751328374, | |
| "grad_norm": 4921900.0, | |
| "learning_rate": 1.048884165781084e-05, | |
| "loss": 3.8381, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.4293304994686504, | |
| "grad_norm": 22669404.0, | |
| "learning_rate": 1.0471130003542332e-05, | |
| "loss": 3.438, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.4319872476089266, | |
| "grad_norm": 11211402.0, | |
| "learning_rate": 1.0453418349273822e-05, | |
| "loss": 3.3608, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.434643995749203, | |
| "grad_norm": 10033162.0, | |
| "learning_rate": 1.0435706695005314e-05, | |
| "loss": 3.2148, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.4373007438894794, | |
| "grad_norm": 34627448.0, | |
| "learning_rate": 1.0417995040736805e-05, | |
| "loss": 3.3408, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.4399574920297555, | |
| "grad_norm": 19163360.0, | |
| "learning_rate": 1.0400283386468297e-05, | |
| "loss": 3.0767, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.442614240170032, | |
| "grad_norm": 11876396.0, | |
| "learning_rate": 1.038257173219979e-05, | |
| "loss": 3.8624, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.445270988310308, | |
| "grad_norm": 6485251.5, | |
| "learning_rate": 1.036486007793128e-05, | |
| "loss": 3.4212, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.4479277364505845, | |
| "grad_norm": 2855033.5, | |
| "learning_rate": 1.0347148423662772e-05, | |
| "loss": 3.5543, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.4505844845908609, | |
| "grad_norm": 39419356.0, | |
| "learning_rate": 1.0329436769394263e-05, | |
| "loss": 3.6357, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.453241232731137, | |
| "grad_norm": 8782708.0, | |
| "learning_rate": 1.0311725115125755e-05, | |
| "loss": 3.7995, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.4558979808714134, | |
| "grad_norm": 32046924.0, | |
| "learning_rate": 1.0294013460857244e-05, | |
| "loss": 3.2472, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.4585547290116896, | |
| "grad_norm": 30402538.0, | |
| "learning_rate": 1.0276301806588735e-05, | |
| "loss": 3.1715, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.461211477151966, | |
| "grad_norm": 19326186.0, | |
| "learning_rate": 1.0258590152320227e-05, | |
| "loss": 3.9161, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.4638682252922424, | |
| "grad_norm": 9990077.0, | |
| "learning_rate": 1.024087849805172e-05, | |
| "loss": 3.849, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.4665249734325185, | |
| "grad_norm": 29835254.0, | |
| "learning_rate": 1.022316684378321e-05, | |
| "loss": 3.331, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.469181721572795, | |
| "grad_norm": 84350656.0, | |
| "learning_rate": 1.0205455189514702e-05, | |
| "loss": 3.3592, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.471838469713071, | |
| "grad_norm": 5173333.5, | |
| "learning_rate": 1.0187743535246193e-05, | |
| "loss": 3.3015, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.4744952178533475, | |
| "grad_norm": 3443425.5, | |
| "learning_rate": 1.0170031880977685e-05, | |
| "loss": 3.5236, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.4771519659936239, | |
| "grad_norm": 2188022.75, | |
| "learning_rate": 1.0152320226709175e-05, | |
| "loss": 3.5614, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.4798087141339, | |
| "grad_norm": 16931794.0, | |
| "learning_rate": 1.0134608572440668e-05, | |
| "loss": 3.6685, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.4824654622741764, | |
| "grad_norm": 10456564.0, | |
| "learning_rate": 1.0116896918172158e-05, | |
| "loss": 3.4864, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.4851222104144526, | |
| "grad_norm": 27239420.0, | |
| "learning_rate": 1.009918526390365e-05, | |
| "loss": 3.5637, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.487778958554729, | |
| "grad_norm": 16616771.0, | |
| "learning_rate": 1.0081473609635141e-05, | |
| "loss": 3.6085, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.4904357066950054, | |
| "grad_norm": 10221569.0, | |
| "learning_rate": 1.0063761955366632e-05, | |
| "loss": 3.5812, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.4930924548352817, | |
| "grad_norm": 1452260.75, | |
| "learning_rate": 1.0046050301098122e-05, | |
| "loss": 3.9326, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.495749202975558, | |
| "grad_norm": 3546143.0, | |
| "learning_rate": 1.0028338646829615e-05, | |
| "loss": 3.2541, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.4984059511158343, | |
| "grad_norm": 12791246.0, | |
| "learning_rate": 1.0010626992561105e-05, | |
| "loss": 3.4152, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.5010626992561105, | |
| "grad_norm": 12529229.0, | |
| "learning_rate": 9.992915338292597e-06, | |
| "loss": 3.0508, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.5037194473963869, | |
| "grad_norm": 9755405.0, | |
| "learning_rate": 9.975203684024088e-06, | |
| "loss": 3.5064, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.5063761955366632, | |
| "grad_norm": 6901898.0, | |
| "learning_rate": 9.95749202975558e-06, | |
| "loss": 3.6654, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.5090329436769394, | |
| "grad_norm": 9542270.0, | |
| "learning_rate": 9.93978037548707e-06, | |
| "loss": 3.3481, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.5116896918172156, | |
| "grad_norm": 14570059.0, | |
| "learning_rate": 9.922068721218563e-06, | |
| "loss": 3.6342, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.514346439957492, | |
| "grad_norm": 130252984.0, | |
| "learning_rate": 9.904357066950054e-06, | |
| "loss": 3.3275, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.5170031880977684, | |
| "grad_norm": 12491921.0, | |
| "learning_rate": 9.886645412681544e-06, | |
| "loss": 3.1862, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.5196599362380447, | |
| "grad_norm": 171955248.0, | |
| "learning_rate": 9.868933758413036e-06, | |
| "loss": 3.6, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.522316684378321, | |
| "grad_norm": 67972536.0, | |
| "learning_rate": 9.851222104144527e-06, | |
| "loss": 3.5839, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.524973432518597, | |
| "grad_norm": 19312536.0, | |
| "learning_rate": 9.83351044987602e-06, | |
| "loss": 3.3906, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.5276301806588735, | |
| "grad_norm": 39636108.0, | |
| "learning_rate": 9.81579879560751e-06, | |
| "loss": 3.5388, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.5302869287991498, | |
| "grad_norm": 54133548.0, | |
| "learning_rate": 9.798087141339002e-06, | |
| "loss": 3.2938, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.5329436769394262, | |
| "grad_norm": 28021788.0, | |
| "learning_rate": 9.780375487070494e-06, | |
| "loss": 3.565, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.5356004250797024, | |
| "grad_norm": 12500334.0, | |
| "learning_rate": 9.762663832801983e-06, | |
| "loss": 3.4099, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.5382571732199788, | |
| "grad_norm": 20677724.0, | |
| "learning_rate": 9.744952178533476e-06, | |
| "loss": 3.8265, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.540913921360255, | |
| "grad_norm": 25849000.0, | |
| "learning_rate": 9.727240524264968e-06, | |
| "loss": 3.5107, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.5435706695005313, | |
| "grad_norm": 7106916.0, | |
| "learning_rate": 9.709528869996458e-06, | |
| "loss": 3.7538, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.5462274176408077, | |
| "grad_norm": 78143128.0, | |
| "learning_rate": 9.69181721572795e-06, | |
| "loss": 3.8139, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.548884165781084, | |
| "grad_norm": 124880632.0, | |
| "learning_rate": 9.674105561459441e-06, | |
| "loss": 3.4966, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.5515409139213603, | |
| "grad_norm": 16674735.0, | |
| "learning_rate": 9.656393907190934e-06, | |
| "loss": 3.7779, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.5541976620616365, | |
| "grad_norm": 36204444.0, | |
| "learning_rate": 9.638682252922424e-06, | |
| "loss": 3.5086, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.5568544102019128, | |
| "grad_norm": 7019197.5, | |
| "learning_rate": 9.620970598653915e-06, | |
| "loss": 3.3062, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.5595111583421892, | |
| "grad_norm": 14028569.0, | |
| "learning_rate": 9.603258944385407e-06, | |
| "loss": 3.4862, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.5621679064824656, | |
| "grad_norm": 24143218.0, | |
| "learning_rate": 9.585547290116898e-06, | |
| "loss": 3.388, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.5648246546227418, | |
| "grad_norm": 8635328.0, | |
| "learning_rate": 9.56783563584839e-06, | |
| "loss": 3.9959, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.567481402763018, | |
| "grad_norm": 14461347.0, | |
| "learning_rate": 9.55012398157988e-06, | |
| "loss": 3.3619, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.5701381509032943, | |
| "grad_norm": 45164232.0, | |
| "learning_rate": 9.532412327311371e-06, | |
| "loss": 3.7565, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.5727948990435707, | |
| "grad_norm": 43768708.0, | |
| "learning_rate": 9.514700673042863e-06, | |
| "loss": 3.2873, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.5754516471838471, | |
| "grad_norm": 102944216.0, | |
| "learning_rate": 9.496989018774354e-06, | |
| "loss": 3.5849, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.5781083953241233, | |
| "grad_norm": 8864102.0, | |
| "learning_rate": 9.479277364505846e-06, | |
| "loss": 3.3615, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.5807651434643994, | |
| "grad_norm": 17926040.0, | |
| "learning_rate": 9.461565710237337e-06, | |
| "loss": 3.3599, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.5834218916046758, | |
| "grad_norm": 563806208.0, | |
| "learning_rate": 9.443854055968829e-06, | |
| "loss": 3.6726, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.5860786397449522, | |
| "grad_norm": 4375813.5, | |
| "learning_rate": 9.42614240170032e-06, | |
| "loss": 3.5982, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.5887353878852286, | |
| "grad_norm": 23817932.0, | |
| "learning_rate": 9.40843074743181e-06, | |
| "loss": 3.6873, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.5913921360255048, | |
| "grad_norm": 3588041.25, | |
| "learning_rate": 9.390719093163302e-06, | |
| "loss": 3.8219, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.594048884165781, | |
| "grad_norm": 97096224.0, | |
| "learning_rate": 9.373007438894793e-06, | |
| "loss": 3.5905, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.5967056323060573, | |
| "grad_norm": 4066724.0, | |
| "learning_rate": 9.355295784626285e-06, | |
| "loss": 3.5762, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.5993623804463337, | |
| "grad_norm": 44529008.0, | |
| "learning_rate": 9.337584130357776e-06, | |
| "loss": 3.821, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.60201912858661, | |
| "grad_norm": 10141793.0, | |
| "learning_rate": 9.319872476089268e-06, | |
| "loss": 3.4989, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.6046758767268863, | |
| "grad_norm": 22102744.0, | |
| "learning_rate": 9.302160821820759e-06, | |
| "loss": 3.4363, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.6073326248671624, | |
| "grad_norm": 1421525.375, | |
| "learning_rate": 9.284449167552249e-06, | |
| "loss": 3.3543, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.6099893730074388, | |
| "grad_norm": 17624050.0, | |
| "learning_rate": 9.266737513283741e-06, | |
| "loss": 3.5835, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.6126461211477152, | |
| "grad_norm": 2787807.5, | |
| "learning_rate": 9.249025859015232e-06, | |
| "loss": 3.7715, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.6153028692879916, | |
| "grad_norm": 36419916.0, | |
| "learning_rate": 9.231314204746724e-06, | |
| "loss": 3.2874, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.6179596174282678, | |
| "grad_norm": 550304.0, | |
| "learning_rate": 9.213602550478215e-06, | |
| "loss": 3.775, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.620616365568544, | |
| "grad_norm": 13110638.0, | |
| "learning_rate": 9.195890896209707e-06, | |
| "loss": 4.0895, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.6232731137088203, | |
| "grad_norm": 153279.40625, | |
| "learning_rate": 9.1781792419412e-06, | |
| "loss": 3.5868, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.6259298618490967, | |
| "grad_norm": 274644.03125, | |
| "learning_rate": 9.160467587672688e-06, | |
| "loss": 3.4759, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.628586609989373, | |
| "grad_norm": 21545.19921875, | |
| "learning_rate": 9.14275593340418e-06, | |
| "loss": 4.0524, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.6312433581296493, | |
| "grad_norm": 27863.1015625, | |
| "learning_rate": 9.125044279135673e-06, | |
| "loss": 3.4133, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.6339001062699257, | |
| "grad_norm": 146765.640625, | |
| "learning_rate": 9.107332624867163e-06, | |
| "loss": 3.6765, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.6365568544102018, | |
| "grad_norm": 60709.375, | |
| "learning_rate": 9.089620970598656e-06, | |
| "loss": 3.8558, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.6392136025504782, | |
| "grad_norm": 290704.21875, | |
| "learning_rate": 9.071909316330146e-06, | |
| "loss": 3.3615, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.6418703506907546, | |
| "grad_norm": 198007.828125, | |
| "learning_rate": 9.054197662061637e-06, | |
| "loss": 3.6759, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.6445270988310308, | |
| "grad_norm": 30211.29296875, | |
| "learning_rate": 9.036486007793129e-06, | |
| "loss": 4.1618, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.6471838469713072, | |
| "grad_norm": 697217.3125, | |
| "learning_rate": 9.01877435352462e-06, | |
| "loss": 3.5873, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.6498405951115833, | |
| "grad_norm": 311260.34375, | |
| "learning_rate": 9.001062699256112e-06, | |
| "loss": 4.0309, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.6524973432518597, | |
| "grad_norm": 7285945.0, | |
| "learning_rate": 8.983351044987602e-06, | |
| "loss": 3.7024, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.655154091392136, | |
| "grad_norm": 238075.265625, | |
| "learning_rate": 8.965639390719095e-06, | |
| "loss": 3.7081, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.6578108395324125, | |
| "grad_norm": 104777.8828125, | |
| "learning_rate": 8.947927736450585e-06, | |
| "loss": 3.6374, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.6604675876726886, | |
| "grad_norm": 45899.98828125, | |
| "learning_rate": 8.930216082182076e-06, | |
| "loss": 3.7753, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.6631243358129648, | |
| "grad_norm": 4903258.0, | |
| "learning_rate": 8.912504427913568e-06, | |
| "loss": 3.7641, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.6657810839532412, | |
| "grad_norm": 691504.875, | |
| "learning_rate": 8.894792773645059e-06, | |
| "loss": 3.012, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.6684378320935176, | |
| "grad_norm": 7211197.0, | |
| "learning_rate": 8.877081119376551e-06, | |
| "loss": 3.278, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.671094580233794, | |
| "grad_norm": 55386.39453125, | |
| "learning_rate": 8.859369465108042e-06, | |
| "loss": 3.5972, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.6737513283740701, | |
| "grad_norm": 4803297.5, | |
| "learning_rate": 8.841657810839534e-06, | |
| "loss": 3.5168, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.6764080765143463, | |
| "grad_norm": 153394.5625, | |
| "learning_rate": 8.823946156571024e-06, | |
| "loss": 3.4884, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.6790648246546227, | |
| "grad_norm": 105014.6796875, | |
| "learning_rate": 8.806234502302515e-06, | |
| "loss": 3.5724, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.681721572794899, | |
| "grad_norm": 425531.6875, | |
| "learning_rate": 8.788522848034007e-06, | |
| "loss": 3.7171, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.6843783209351755, | |
| "grad_norm": 881638.625, | |
| "learning_rate": 8.770811193765498e-06, | |
| "loss": 3.5689, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.6870350690754516, | |
| "grad_norm": 506417.84375, | |
| "learning_rate": 8.75309953949699e-06, | |
| "loss": 3.3471, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.6896918172157278, | |
| "grad_norm": 218658.8125, | |
| "learning_rate": 8.73538788522848e-06, | |
| "loss": 3.0762, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.6923485653560042, | |
| "grad_norm": 3747502.5, | |
| "learning_rate": 8.717676230959973e-06, | |
| "loss": 3.7819, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.6950053134962806, | |
| "grad_norm": 402977.15625, | |
| "learning_rate": 8.699964576691463e-06, | |
| "loss": 3.2238, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.697662061636557, | |
| "grad_norm": 354610.0, | |
| "learning_rate": 8.682252922422954e-06, | |
| "loss": 3.5365, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.7003188097768331, | |
| "grad_norm": 737137.25, | |
| "learning_rate": 8.664541268154446e-06, | |
| "loss": 3.7334, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.7029755579171093, | |
| "grad_norm": 270020.3125, | |
| "learning_rate": 8.646829613885937e-06, | |
| "loss": 3.6183, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.7056323060573857, | |
| "grad_norm": 740626.4375, | |
| "learning_rate": 8.629117959617429e-06, | |
| "loss": 3.7487, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.708289054197662, | |
| "grad_norm": 1305229.75, | |
| "learning_rate": 8.61140630534892e-06, | |
| "loss": 3.7039, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.7109458023379385, | |
| "grad_norm": 172010.875, | |
| "learning_rate": 8.593694651080412e-06, | |
| "loss": 2.9064, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.7136025504782146, | |
| "grad_norm": 36386.55859375, | |
| "learning_rate": 8.575982996811903e-06, | |
| "loss": 3.5462, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.7162592986184908, | |
| "grad_norm": 280424.5, | |
| "learning_rate": 8.558271342543393e-06, | |
| "loss": 3.7119, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.7189160467587672, | |
| "grad_norm": 65134.73828125, | |
| "learning_rate": 8.540559688274885e-06, | |
| "loss": 4.058, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.7215727948990436, | |
| "grad_norm": 66937.53125, | |
| "learning_rate": 8.522848034006378e-06, | |
| "loss": 3.3975, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.72422954303932, | |
| "grad_norm": 131224.421875, | |
| "learning_rate": 8.505136379737868e-06, | |
| "loss": 3.4813, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.7268862911795961, | |
| "grad_norm": 108172.1640625, | |
| "learning_rate": 8.48742472546936e-06, | |
| "loss": 3.1716, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.7295430393198725, | |
| "grad_norm": 25198.029296875, | |
| "learning_rate": 8.469713071200851e-06, | |
| "loss": 3.6849, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.7321997874601487, | |
| "grad_norm": 61498.03515625, | |
| "learning_rate": 8.452001416932342e-06, | |
| "loss": 3.4036, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.734856535600425, | |
| "grad_norm": 442683.875, | |
| "learning_rate": 8.434289762663834e-06, | |
| "loss": 3.3497, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.7375132837407015, | |
| "grad_norm": 27654.84765625, | |
| "learning_rate": 8.416578108395324e-06, | |
| "loss": 3.2324, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.7401700318809776, | |
| "grad_norm": 87875.5546875, | |
| "learning_rate": 8.398866454126817e-06, | |
| "loss": 3.211, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.742826780021254, | |
| "grad_norm": 443493.65625, | |
| "learning_rate": 8.381154799858307e-06, | |
| "loss": 3.5746, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.7454835281615302, | |
| "grad_norm": 112091.3046875, | |
| "learning_rate": 8.3634431455898e-06, | |
| "loss": 3.2604, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.7481402763018066, | |
| "grad_norm": 37516.62109375, | |
| "learning_rate": 8.34573149132129e-06, | |
| "loss": 3.3058, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.750797024442083, | |
| "grad_norm": 98792.796875, | |
| "learning_rate": 8.32801983705278e-06, | |
| "loss": 3.4504, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.7534537725823593, | |
| "grad_norm": 24296.8125, | |
| "learning_rate": 8.310308182784273e-06, | |
| "loss": 3.2476, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.7561105207226355, | |
| "grad_norm": 27490.43359375, | |
| "learning_rate": 8.292596528515764e-06, | |
| "loss": 3.4551, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.7587672688629117, | |
| "grad_norm": 163381.75, | |
| "learning_rate": 8.274884874247256e-06, | |
| "loss": 3.56, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.761424017003188, | |
| "grad_norm": 5022.00244140625, | |
| "learning_rate": 8.257173219978746e-06, | |
| "loss": 3.2829, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.7640807651434645, | |
| "grad_norm": 873426.5, | |
| "learning_rate": 8.239461565710239e-06, | |
| "loss": 3.292, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.7667375132837408, | |
| "grad_norm": 48760.75390625, | |
| "learning_rate": 8.22174991144173e-06, | |
| "loss": 3.3971, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.769394261424017, | |
| "grad_norm": 22562.328125, | |
| "learning_rate": 8.20403825717322e-06, | |
| "loss": 3.4901, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.7720510095642932, | |
| "grad_norm": 110952.984375, | |
| "learning_rate": 8.186326602904712e-06, | |
| "loss": 3.5824, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.7747077577045696, | |
| "grad_norm": 11664.615234375, | |
| "learning_rate": 8.168614948636203e-06, | |
| "loss": 3.6433, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.777364505844846, | |
| "grad_norm": 296820.28125, | |
| "learning_rate": 8.150903294367695e-06, | |
| "loss": 3.4816, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.7800212539851223, | |
| "grad_norm": 28750.556640625, | |
| "learning_rate": 8.133191640099186e-06, | |
| "loss": 3.4851, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.7826780021253985, | |
| "grad_norm": 86309.7890625, | |
| "learning_rate": 8.115479985830678e-06, | |
| "loss": 3.3058, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.7853347502656747, | |
| "grad_norm": 91584.7734375, | |
| "learning_rate": 8.097768331562168e-06, | |
| "loss": 3.7495, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.787991498405951, | |
| "grad_norm": 132450.96875, | |
| "learning_rate": 8.080056677293659e-06, | |
| "loss": 3.4955, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.7906482465462275, | |
| "grad_norm": 134387.046875, | |
| "learning_rate": 8.062345023025151e-06, | |
| "loss": 3.4655, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.7933049946865038, | |
| "grad_norm": 74426.6875, | |
| "learning_rate": 8.044633368756642e-06, | |
| "loss": 3.7594, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.79596174282678, | |
| "grad_norm": 58667.3984375, | |
| "learning_rate": 8.026921714488134e-06, | |
| "loss": 3.7655, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.7986184909670562, | |
| "grad_norm": 130389.9140625, | |
| "learning_rate": 8.009210060219625e-06, | |
| "loss": 3.673, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.8012752391073326, | |
| "grad_norm": 89147.9296875, | |
| "learning_rate": 7.991498405951117e-06, | |
| "loss": 3.2874, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.803931987247609, | |
| "grad_norm": 44793.80859375, | |
| "learning_rate": 7.973786751682607e-06, | |
| "loss": 3.2517, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.8065887353878853, | |
| "grad_norm": 15245.392578125, | |
| "learning_rate": 7.956075097414098e-06, | |
| "loss": 3.5179, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.8092454835281615, | |
| "grad_norm": 15995.4912109375, | |
| "learning_rate": 7.93836344314559e-06, | |
| "loss": 3.6515, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.8119022316684377, | |
| "grad_norm": 16524.787109375, | |
| "learning_rate": 7.920651788877083e-06, | |
| "loss": 3.1618, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.814558979808714, | |
| "grad_norm": 42409.20703125, | |
| "learning_rate": 7.902940134608573e-06, | |
| "loss": 3.58, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.8172157279489904, | |
| "grad_norm": 10542.6796875, | |
| "learning_rate": 7.885228480340065e-06, | |
| "loss": 3.508, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.8198724760892668, | |
| "grad_norm": 25151.1484375, | |
| "learning_rate": 7.867516826071556e-06, | |
| "loss": 3.0635, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.822529224229543, | |
| "grad_norm": 9499.1826171875, | |
| "learning_rate": 7.849805171803047e-06, | |
| "loss": 2.9901, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.8251859723698194, | |
| "grad_norm": 54946.984375, | |
| "learning_rate": 7.832093517534539e-06, | |
| "loss": 2.9531, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.8278427205100956, | |
| "grad_norm": 10790.599609375, | |
| "learning_rate": 7.81438186326603e-06, | |
| "loss": 3.5882, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.830499468650372, | |
| "grad_norm": 13575.8759765625, | |
| "learning_rate": 7.796670208997522e-06, | |
| "loss": 3.3612, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.8331562167906483, | |
| "grad_norm": 20945.48046875, | |
| "learning_rate": 7.778958554729012e-06, | |
| "loss": 3.0764, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.8358129649309245, | |
| "grad_norm": 232869.03125, | |
| "learning_rate": 7.761246900460504e-06, | |
| "loss": 3.1716, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.8384697130712009, | |
| "grad_norm": 43791.59765625, | |
| "learning_rate": 7.743535246191995e-06, | |
| "loss": 3.2311, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.841126461211477, | |
| "grad_norm": 22579.091796875, | |
| "learning_rate": 7.725823591923486e-06, | |
| "loss": 3.4563, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.8437832093517534, | |
| "grad_norm": 28530.806640625, | |
| "learning_rate": 7.708111937654978e-06, | |
| "loss": 3.452, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.8464399574920298, | |
| "grad_norm": 12486.0390625, | |
| "learning_rate": 7.690400283386468e-06, | |
| "loss": 3.2791, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.8490967056323062, | |
| "grad_norm": 17018.11328125, | |
| "learning_rate": 7.67268862911796e-06, | |
| "loss": 3.6792, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.8517534537725824, | |
| "grad_norm": 16199.2470703125, | |
| "learning_rate": 7.654976974849451e-06, | |
| "loss": 3.2561, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.8544102019128585, | |
| "grad_norm": 10388.2470703125, | |
| "learning_rate": 7.637265320580944e-06, | |
| "loss": 3.0233, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.857066950053135, | |
| "grad_norm": 15407.7548828125, | |
| "learning_rate": 7.619553666312433e-06, | |
| "loss": 3.167, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.8597236981934113, | |
| "grad_norm": 26815.095703125, | |
| "learning_rate": 7.601842012043925e-06, | |
| "loss": 3.2972, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8623804463336877, | |
| "grad_norm": 58698.21875, | |
| "learning_rate": 7.584130357775417e-06, | |
| "loss": 3.2334, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 1.8650371944739639, | |
| "grad_norm": 27274.71875, | |
| "learning_rate": 7.566418703506908e-06, | |
| "loss": 3.1432, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.86769394261424, | |
| "grad_norm": 83316.0703125, | |
| "learning_rate": 7.5487070492384e-06, | |
| "loss": 3.1284, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 1.8703506907545164, | |
| "grad_norm": 30122.771484375, | |
| "learning_rate": 7.530995394969891e-06, | |
| "loss": 2.8904, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.8730074388947928, | |
| "grad_norm": 40200.9609375, | |
| "learning_rate": 7.513283740701383e-06, | |
| "loss": 3.3255, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.8756641870350692, | |
| "grad_norm": 16342.447265625, | |
| "learning_rate": 7.495572086432873e-06, | |
| "loss": 3.1073, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.8783209351753454, | |
| "grad_norm": 14423.703125, | |
| "learning_rate": 7.477860432164365e-06, | |
| "loss": 3.4831, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 1.8809776833156215, | |
| "grad_norm": 34366.14453125, | |
| "learning_rate": 7.460148777895856e-06, | |
| "loss": 3.2063, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.883634431455898, | |
| "grad_norm": 70803.8359375, | |
| "learning_rate": 7.4424371236273475e-06, | |
| "loss": 3.5181, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 1.8862911795961743, | |
| "grad_norm": 13800.69140625, | |
| "learning_rate": 7.424725469358839e-06, | |
| "loss": 3.4993, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.8889479277364507, | |
| "grad_norm": 48057.68359375, | |
| "learning_rate": 7.40701381509033e-06, | |
| "loss": 3.1418, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 1.8916046758767269, | |
| "grad_norm": 40145.4921875, | |
| "learning_rate": 7.389302160821822e-06, | |
| "loss": 3.3427, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.894261424017003, | |
| "grad_norm": 13148.1484375, | |
| "learning_rate": 7.371590506553312e-06, | |
| "loss": 3.2584, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 1.8969181721572794, | |
| "grad_norm": 10740.6826171875, | |
| "learning_rate": 7.353878852284804e-06, | |
| "loss": 2.8656, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.8995749202975558, | |
| "grad_norm": 7270.3818359375, | |
| "learning_rate": 7.336167198016295e-06, | |
| "loss": 3.1929, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.9022316684378322, | |
| "grad_norm": 3250.9072265625, | |
| "learning_rate": 7.318455543747787e-06, | |
| "loss": 3.3218, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.9048884165781084, | |
| "grad_norm": 40904.6484375, | |
| "learning_rate": 7.300743889479278e-06, | |
| "loss": 3.0994, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 1.9075451647183845, | |
| "grad_norm": 9426.0009765625, | |
| "learning_rate": 7.2830322352107695e-06, | |
| "loss": 3.2861, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.910201912858661, | |
| "grad_norm": 10107.427734375, | |
| "learning_rate": 7.265320580942261e-06, | |
| "loss": 3.3694, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 1.9128586609989373, | |
| "grad_norm": 25632.7734375, | |
| "learning_rate": 7.2476089266737514e-06, | |
| "loss": 3.1918, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.9155154091392137, | |
| "grad_norm": 10823.2509765625, | |
| "learning_rate": 7.229897272405243e-06, | |
| "loss": 3.1984, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 1.9181721572794899, | |
| "grad_norm": 8237.4482421875, | |
| "learning_rate": 7.212185618136734e-06, | |
| "loss": 2.7874, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.9208289054197663, | |
| "grad_norm": 4823.09716796875, | |
| "learning_rate": 7.194473963868226e-06, | |
| "loss": 3.2625, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 1.9234856535600424, | |
| "grad_norm": 6276.54150390625, | |
| "learning_rate": 7.176762309599717e-06, | |
| "loss": 3.1739, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.9261424017003188, | |
| "grad_norm": 9979.935546875, | |
| "learning_rate": 7.1590506553312085e-06, | |
| "loss": 3.34, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.9287991498405952, | |
| "grad_norm": 3373.656982421875, | |
| "learning_rate": 7.141339001062701e-06, | |
| "loss": 3.4366, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.9314558979808714, | |
| "grad_norm": 9178.9404296875, | |
| "learning_rate": 7.1236273467941905e-06, | |
| "loss": 3.245, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 1.9341126461211477, | |
| "grad_norm": 11173.3037109375, | |
| "learning_rate": 7.105915692525682e-06, | |
| "loss": 3.1306, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.936769394261424, | |
| "grad_norm": 6969.20849609375, | |
| "learning_rate": 7.088204038257173e-06, | |
| "loss": 3.5482, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 1.9394261424017003, | |
| "grad_norm": 22079.796875, | |
| "learning_rate": 7.070492383988665e-06, | |
| "loss": 3.2338, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.9420828905419767, | |
| "grad_norm": 51803.05078125, | |
| "learning_rate": 7.052780729720157e-06, | |
| "loss": 3.1844, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 1.944739638682253, | |
| "grad_norm": 17502.84375, | |
| "learning_rate": 7.0350690754516485e-06, | |
| "loss": 3.3796, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.9473963868225292, | |
| "grad_norm": 4275.10009765625, | |
| "learning_rate": 7.017357421183138e-06, | |
| "loss": 3.0306, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 1.9500531349628054, | |
| "grad_norm": 3620.85400390625, | |
| "learning_rate": 6.99964576691463e-06, | |
| "loss": 3.3635, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.9527098831030818, | |
| "grad_norm": 32547.673828125, | |
| "learning_rate": 6.981934112646122e-06, | |
| "loss": 3.1764, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.9553666312433582, | |
| "grad_norm": 5065.5751953125, | |
| "learning_rate": 6.964222458377613e-06, | |
| "loss": 3.1895, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.9580233793836346, | |
| "grad_norm": 10395.2060546875, | |
| "learning_rate": 6.946510804109105e-06, | |
| "loss": 3.1655, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 1.9606801275239107, | |
| "grad_norm": 4557.41796875, | |
| "learning_rate": 6.928799149840596e-06, | |
| "loss": 3.0581, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.963336875664187, | |
| "grad_norm": 38417.4765625, | |
| "learning_rate": 6.911087495572088e-06, | |
| "loss": 3.0893, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 1.9659936238044633, | |
| "grad_norm": 5107.16796875, | |
| "learning_rate": 6.893375841303578e-06, | |
| "loss": 3.4167, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.9686503719447397, | |
| "grad_norm": 5035.6201171875, | |
| "learning_rate": 6.87566418703507e-06, | |
| "loss": 3.0664, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 1.971307120085016, | |
| "grad_norm": 12651.587890625, | |
| "learning_rate": 6.857952532766561e-06, | |
| "loss": 3.1299, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.9739638682252922, | |
| "grad_norm": 7539.5400390625, | |
| "learning_rate": 6.840240878498052e-06, | |
| "loss": 3.0492, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 1.9766206163655684, | |
| "grad_norm": 5577.158203125, | |
| "learning_rate": 6.822529224229544e-06, | |
| "loss": 3.098, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.9792773645058448, | |
| "grad_norm": 41558.4921875, | |
| "learning_rate": 6.804817569961035e-06, | |
| "loss": 3.2933, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.9819341126461212, | |
| "grad_norm": 3775.939697265625, | |
| "learning_rate": 6.787105915692527e-06, | |
| "loss": 2.9667, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.9845908607863976, | |
| "grad_norm": 30318.9921875, | |
| "learning_rate": 6.769394261424017e-06, | |
| "loss": 3.0666, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 1.9872476089266737, | |
| "grad_norm": 21865.806640625, | |
| "learning_rate": 6.751682607155509e-06, | |
| "loss": 2.9213, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.98990435706695, | |
| "grad_norm": 10458.220703125, | |
| "learning_rate": 6.733970952887e-06, | |
| "loss": 3.2567, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 1.9925611052072263, | |
| "grad_norm": 14638.0439453125, | |
| "learning_rate": 6.7162592986184915e-06, | |
| "loss": 3.2132, | |
| "step": 7500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 11292, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7836212920320000.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
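
The object above is the `trainer_state.json` layout that the Hugging Face `transformers` `Trainer` writes alongside each checkpoint. As a minimal sketch (the filename below is the library's conventional default, assumed here rather than stated anywhere in this file), the `log_history` entries can be loaded and summarized with the standard library alone:

```python
# Minimal sketch: summarize the training-loss trace in log_history.
# Assumes the state shown above is saved as trainer_state.json, the
# default name transformers.Trainer uses inside a checkpoint directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (evaluation entries,
# if any were logged, would use different keys such as eval_loss).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

print(f"logged every {state['logging_steps']} steps; "
      f"{state['global_step']} of {state['max_steps']} steps complete")
print(f"loss at step {steps[0]}: {losses[0]:.4f}; "
      f"loss at step {steps[-1]}: {losses[-1]:.4f}")
```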