{ "best_global_step": 4062, "best_metric": 0.003324420191347599, "best_model_checkpoint": "./outputs/checkpoint-4062", "epoch": 5.0, "eval_steps": 500, "global_step": 10155, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004923682914820286, "grad_norm": 1.6549489498138428, "learning_rate": 1.9982274741506648e-05, "loss": 0.4492, "step": 10 }, { "epoch": 0.009847365829640572, "grad_norm": 1.334946870803833, "learning_rate": 1.9962580009847367e-05, "loss": 0.1576, "step": 20 }, { "epoch": 0.014771048744460856, "grad_norm": 0.6242792010307312, "learning_rate": 1.9942885278188087e-05, "loss": 0.1297, "step": 30 }, { "epoch": 0.019694731659281144, "grad_norm": 0.3603459596633911, "learning_rate": 1.9923190546528807e-05, "loss": 0.087, "step": 40 }, { "epoch": 0.024618414574101428, "grad_norm": 1.5480420589447021, "learning_rate": 1.9903495814869524e-05, "loss": 0.1001, "step": 50 }, { "epoch": 0.029542097488921712, "grad_norm": 0.46412599086761475, "learning_rate": 1.9883801083210244e-05, "loss": 0.0426, "step": 60 }, { "epoch": 0.034465780403742, "grad_norm": 0.17812378704547882, "learning_rate": 1.9864106351550964e-05, "loss": 0.0667, "step": 70 }, { "epoch": 0.03938946331856229, "grad_norm": 0.35499292612075806, "learning_rate": 1.984441161989168e-05, "loss": 0.0745, "step": 80 }, { "epoch": 0.04431314623338257, "grad_norm": 0.130273699760437, "learning_rate": 1.98247168882324e-05, "loss": 0.0377, "step": 90 }, { "epoch": 0.049236829148202856, "grad_norm": 0.12631726264953613, "learning_rate": 1.980502215657312e-05, "loss": 0.0621, "step": 100 }, { "epoch": 0.05416051206302314, "grad_norm": 0.6002771854400635, "learning_rate": 1.9785327424913836e-05, "loss": 0.0625, "step": 110 }, { "epoch": 0.059084194977843424, "grad_norm": 2.8204727172851562, "learning_rate": 1.9765632693254556e-05, "loss": 0.1003, "step": 120 }, { "epoch": 0.06400787789266371, "grad_norm": 0.7127029895782471, "learning_rate": 1.9745937961595273e-05, "loss": 0.0507, "step": 130 }, { "epoch": 0.068931560807484, "grad_norm": 0.09328364580869675, "learning_rate": 1.9726243229935993e-05, "loss": 0.0513, "step": 140 }, { "epoch": 0.07385524372230429, "grad_norm": 0.1317463517189026, "learning_rate": 1.9706548498276713e-05, "loss": 0.0321, "step": 150 }, { "epoch": 0.07877892663712457, "grad_norm": 0.0986248180270195, "learning_rate": 1.9686853766617433e-05, "loss": 0.021, "step": 160 }, { "epoch": 0.08370260955194485, "grad_norm": 0.09169679135084152, "learning_rate": 1.966715903495815e-05, "loss": 0.0197, "step": 170 }, { "epoch": 0.08862629246676514, "grad_norm": 0.11187795549631119, "learning_rate": 1.964746430329887e-05, "loss": 0.0294, "step": 180 }, { "epoch": 0.09354997538158542, "grad_norm": 0.07759550958871841, "learning_rate": 1.962776957163959e-05, "loss": 0.0413, "step": 190 }, { "epoch": 0.09847365829640571, "grad_norm": 0.18886986374855042, "learning_rate": 1.9608074839980305e-05, "loss": 0.0319, "step": 200 }, { "epoch": 0.103397341211226, "grad_norm": 0.07388991117477417, "learning_rate": 1.9588380108321025e-05, "loss": 0.0142, "step": 210 }, { "epoch": 0.10832102412604629, "grad_norm": 0.24932511150836945, "learning_rate": 1.9568685376661745e-05, "loss": 0.015, "step": 220 }, { "epoch": 0.11324470704086657, "grad_norm": 0.0711674615740776, "learning_rate": 1.9548990645002465e-05, "loss": 0.0168, "step": 230 }, { "epoch": 0.11816838995568685, "grad_norm": 0.06821451336145401, "learning_rate": 1.952929591334318e-05, "loss": 0.0121, "step": 240 }, { "epoch": 0.12309207287050714, "grad_norm": 0.06332361698150635, "learning_rate": 1.95096011816839e-05, "loss": 0.0238, "step": 250 }, { "epoch": 0.12801575578532742, "grad_norm": 0.06404373049736023, "learning_rate": 1.948990645002462e-05, "loss": 0.0158, "step": 260 }, { "epoch": 0.1329394387001477, "grad_norm": 0.06331443041563034, "learning_rate": 1.9470211718365338e-05, "loss": 0.0107, "step": 270 }, { "epoch": 0.137863121614968, "grad_norm": 0.10350039601325989, "learning_rate": 1.9450516986706058e-05, "loss": 0.0225, "step": 280 }, { "epoch": 0.14278680452978829, "grad_norm": 0.06487182527780533, "learning_rate": 1.9430822255046778e-05, "loss": 0.0094, "step": 290 }, { "epoch": 0.14771048744460857, "grad_norm": 0.1302100121974945, "learning_rate": 1.9411127523387498e-05, "loss": 0.01, "step": 300 }, { "epoch": 0.15263417035942886, "grad_norm": 0.08045461773872375, "learning_rate": 1.9391432791728214e-05, "loss": 0.0091, "step": 310 }, { "epoch": 0.15755785327424915, "grad_norm": 2.8434016704559326, "learning_rate": 1.9371738060068934e-05, "loss": 0.0408, "step": 320 }, { "epoch": 0.16248153618906944, "grad_norm": 0.061778053641319275, "learning_rate": 1.935204332840965e-05, "loss": 0.0463, "step": 330 }, { "epoch": 0.1674052191038897, "grad_norm": 0.0558575838804245, "learning_rate": 1.933234859675037e-05, "loss": 0.009, "step": 340 }, { "epoch": 0.17232890201870998, "grad_norm": 0.10792255401611328, "learning_rate": 1.931265386509109e-05, "loss": 0.0077, "step": 350 }, { "epoch": 0.17725258493353027, "grad_norm": 0.05559010058641434, "learning_rate": 1.9292959133431807e-05, "loss": 0.008, "step": 360 }, { "epoch": 0.18217626784835056, "grad_norm": 0.0584879107773304, "learning_rate": 1.9273264401772527e-05, "loss": 0.0097, "step": 370 }, { "epoch": 0.18709995076317085, "grad_norm": 0.05132288858294487, "learning_rate": 1.9253569670113247e-05, "loss": 0.0139, "step": 380 }, { "epoch": 0.19202363367799113, "grad_norm": 0.052865270525217056, "learning_rate": 1.9233874938453963e-05, "loss": 0.0073, "step": 390 }, { "epoch": 0.19694731659281142, "grad_norm": 0.06938347965478897, "learning_rate": 1.9214180206794683e-05, "loss": 0.008, "step": 400 }, { "epoch": 0.2018709995076317, "grad_norm": 0.06304794549942017, "learning_rate": 1.9194485475135403e-05, "loss": 0.0067, "step": 410 }, { "epoch": 0.206794682422452, "grad_norm": 0.06260755658149719, "learning_rate": 1.9174790743476123e-05, "loss": 0.0059, "step": 420 }, { "epoch": 0.21171836533727229, "grad_norm": 0.043609410524368286, "learning_rate": 1.915509601181684e-05, "loss": 0.0066, "step": 430 }, { "epoch": 0.21664204825209257, "grad_norm": 0.0525021031498909, "learning_rate": 1.913540128015756e-05, "loss": 0.0062, "step": 440 }, { "epoch": 0.22156573116691286, "grad_norm": 0.043818939477205276, "learning_rate": 1.911570654849828e-05, "loss": 0.006, "step": 450 }, { "epoch": 0.22648941408173315, "grad_norm": 0.05230528861284256, "learning_rate": 1.9096011816838996e-05, "loss": 0.0055, "step": 460 }, { "epoch": 0.23141309699655344, "grad_norm": 0.042306121438741684, "learning_rate": 1.9076317085179716e-05, "loss": 0.0056, "step": 470 }, { "epoch": 0.2363367799113737, "grad_norm": 4.103374481201172, "learning_rate": 1.9056622353520436e-05, "loss": 0.0389, "step": 480 }, { "epoch": 0.24126046282619398, "grad_norm": 0.04262426868081093, "learning_rate": 1.9036927621861155e-05, "loss": 0.0347, "step": 490 }, { "epoch": 0.24618414574101427, "grad_norm": 0.04224070906639099, "learning_rate": 1.9017232890201872e-05, "loss": 0.0051, "step": 500 }, { "epoch": 0.2511078286558346, "grad_norm": 0.05894336849451065, "learning_rate": 1.8997538158542592e-05, "loss": 0.1076, "step": 510 }, { "epoch": 0.25603151157065485, "grad_norm": 0.045038461685180664, "learning_rate": 1.8977843426883312e-05, "loss": 0.0072, "step": 520 }, { "epoch": 0.26095519448547516, "grad_norm": 0.04939822852611542, "learning_rate": 1.895814869522403e-05, "loss": 0.0065, "step": 530 }, { "epoch": 0.2658788774002954, "grad_norm": 0.04498062655329704, "learning_rate": 1.8938453963564748e-05, "loss": 0.0067, "step": 540 }, { "epoch": 0.2708025603151157, "grad_norm": 0.04091101884841919, "learning_rate": 1.8918759231905465e-05, "loss": 0.0053, "step": 550 }, { "epoch": 0.275726243229936, "grad_norm": 0.040018483996391296, "learning_rate": 1.8899064500246185e-05, "loss": 0.0619, "step": 560 }, { "epoch": 0.28064992614475626, "grad_norm": 2.4129717350006104, "learning_rate": 1.8879369768586905e-05, "loss": 0.0524, "step": 570 }, { "epoch": 0.28557360905957657, "grad_norm": 0.07172203063964844, "learning_rate": 1.885967503692762e-05, "loss": 0.0088, "step": 580 }, { "epoch": 0.29049729197439683, "grad_norm": 1.157906174659729, "learning_rate": 1.883998030526834e-05, "loss": 0.0067, "step": 590 }, { "epoch": 0.29542097488921715, "grad_norm": 0.04644926264882088, "learning_rate": 1.882028557360906e-05, "loss": 0.0052, "step": 600 }, { "epoch": 0.3003446578040374, "grad_norm": 0.04166566953063011, "learning_rate": 1.880059084194978e-05, "loss": 0.005, "step": 610 }, { "epoch": 0.3052683407188577, "grad_norm": 0.039835572242736816, "learning_rate": 1.8780896110290497e-05, "loss": 0.0376, "step": 620 }, { "epoch": 0.310192023633678, "grad_norm": 0.04242682084441185, "learning_rate": 1.8761201378631217e-05, "loss": 0.006, "step": 630 }, { "epoch": 0.3151157065484983, "grad_norm": 0.06742667406797409, "learning_rate": 1.8741506646971937e-05, "loss": 0.0049, "step": 640 }, { "epoch": 0.32003938946331856, "grad_norm": 0.03444899246096611, "learning_rate": 1.8721811915312657e-05, "loss": 0.0057, "step": 650 }, { "epoch": 0.3249630723781389, "grad_norm": 0.06320315599441528, "learning_rate": 1.8702117183653373e-05, "loss": 0.0049, "step": 660 }, { "epoch": 0.32988675529295913, "grad_norm": 0.03362872451543808, "learning_rate": 1.8682422451994093e-05, "loss": 0.0041, "step": 670 }, { "epoch": 0.3348104382077794, "grad_norm": 0.03198781982064247, "learning_rate": 1.8662727720334813e-05, "loss": 0.0046, "step": 680 }, { "epoch": 0.3397341211225997, "grad_norm": 0.046496450901031494, "learning_rate": 1.864303298867553e-05, "loss": 0.0189, "step": 690 }, { "epoch": 0.34465780403741997, "grad_norm": 0.03556285426020622, "learning_rate": 1.862333825701625e-05, "loss": 0.0039, "step": 700 }, { "epoch": 0.3495814869522403, "grad_norm": 0.040747638791799545, "learning_rate": 1.860364352535697e-05, "loss": 0.0054, "step": 710 }, { "epoch": 0.35450516986706054, "grad_norm": 0.030371148139238358, "learning_rate": 1.858394879369769e-05, "loss": 0.0048, "step": 720 }, { "epoch": 0.35942885278188086, "grad_norm": 0.03187814727425575, "learning_rate": 1.8564254062038406e-05, "loss": 0.0377, "step": 730 }, { "epoch": 0.3643525356967011, "grad_norm": 0.03332262486219406, "learning_rate": 1.8544559330379126e-05, "loss": 0.078, "step": 740 }, { "epoch": 0.36927621861152143, "grad_norm": 0.03905538097023964, "learning_rate": 1.8524864598719846e-05, "loss": 0.0158, "step": 750 }, { "epoch": 0.3741999015263417, "grad_norm": 0.034239765256643295, "learning_rate": 1.8505169867060562e-05, "loss": 0.0047, "step": 760 }, { "epoch": 0.379123584441162, "grad_norm": 0.029871227219700813, "learning_rate": 1.8485475135401282e-05, "loss": 0.0502, "step": 770 }, { "epoch": 0.38404726735598227, "grad_norm": 2.714010238647461, "learning_rate": 1.8465780403742e-05, "loss": 0.0311, "step": 780 }, { "epoch": 0.3889709502708026, "grad_norm": 0.030506784096360207, "learning_rate": 1.844608567208272e-05, "loss": 0.0042, "step": 790 }, { "epoch": 0.39389463318562284, "grad_norm": 0.03251843899488449, "learning_rate": 1.842639094042344e-05, "loss": 0.0043, "step": 800 }, { "epoch": 0.3988183161004431, "grad_norm": 0.8572911620140076, "learning_rate": 1.8406696208764155e-05, "loss": 0.0081, "step": 810 }, { "epoch": 0.4037419990152634, "grad_norm": 0.03934726119041443, "learning_rate": 1.8387001477104875e-05, "loss": 0.0038, "step": 820 }, { "epoch": 0.4086656819300837, "grad_norm": 0.027939526364207268, "learning_rate": 1.8367306745445595e-05, "loss": 0.0035, "step": 830 }, { "epoch": 0.413589364844904, "grad_norm": 0.0301466416567564, "learning_rate": 1.8347612013786315e-05, "loss": 0.029, "step": 840 }, { "epoch": 0.41851304775972425, "grad_norm": 0.026854408904910088, "learning_rate": 1.832791728212703e-05, "loss": 0.0035, "step": 850 }, { "epoch": 0.42343673067454457, "grad_norm": 0.02816801145672798, "learning_rate": 1.830822255046775e-05, "loss": 0.0158, "step": 860 }, { "epoch": 0.42836041358936483, "grad_norm": 0.025995774194598198, "learning_rate": 1.828852781880847e-05, "loss": 0.0032, "step": 870 }, { "epoch": 0.43328409650418515, "grad_norm": 0.02553786151111126, "learning_rate": 1.8268833087149188e-05, "loss": 0.0039, "step": 880 }, { "epoch": 0.4382077794190054, "grad_norm": 0.025080446153879166, "learning_rate": 1.8249138355489908e-05, "loss": 0.0032, "step": 890 }, { "epoch": 0.4431314623338257, "grad_norm": 0.02811121568083763, "learning_rate": 1.8229443623830627e-05, "loss": 0.0033, "step": 900 }, { "epoch": 0.448055145248646, "grad_norm": 0.025451993569731712, "learning_rate": 1.8209748892171347e-05, "loss": 0.0032, "step": 910 }, { "epoch": 0.4529788281634663, "grad_norm": 0.02798454649746418, "learning_rate": 1.8190054160512064e-05, "loss": 0.0806, "step": 920 }, { "epoch": 0.45790251107828656, "grad_norm": 0.03505368530750275, "learning_rate": 1.8170359428852784e-05, "loss": 0.0293, "step": 930 }, { "epoch": 0.46282619399310687, "grad_norm": 0.02549416944384575, "learning_rate": 1.8150664697193504e-05, "loss": 0.0037, "step": 940 }, { "epoch": 0.46774987690792713, "grad_norm": 0.0305502787232399, "learning_rate": 1.8130969965534224e-05, "loss": 0.0035, "step": 950 }, { "epoch": 0.4726735598227474, "grad_norm": 0.023494524881243706, "learning_rate": 1.811127523387494e-05, "loss": 0.0032, "step": 960 }, { "epoch": 0.4775972427375677, "grad_norm": 0.02340528555214405, "learning_rate": 1.809158050221566e-05, "loss": 0.0628, "step": 970 }, { "epoch": 0.48252092565238797, "grad_norm": 0.02350657619535923, "learning_rate": 1.8071885770556377e-05, "loss": 0.0038, "step": 980 }, { "epoch": 0.4874446085672083, "grad_norm": 0.02277735061943531, "learning_rate": 1.8052191038897096e-05, "loss": 0.0033, "step": 990 }, { "epoch": 0.49236829148202854, "grad_norm": 0.022285617887973785, "learning_rate": 1.8032496307237813e-05, "loss": 0.0032, "step": 1000 }, { "epoch": 0.49729197439684886, "grad_norm": 0.0286717526614666, "learning_rate": 1.8012801575578533e-05, "loss": 0.0034, "step": 1010 }, { "epoch": 0.5022156573116692, "grad_norm": 0.03329790011048317, "learning_rate": 1.7993106843919253e-05, "loss": 0.0032, "step": 1020 }, { "epoch": 0.5071393402264894, "grad_norm": 0.022517314180731773, "learning_rate": 1.7973412112259973e-05, "loss": 0.0028, "step": 1030 }, { "epoch": 0.5120630231413097, "grad_norm": 0.021621420979499817, "learning_rate": 1.795371738060069e-05, "loss": 0.003, "step": 1040 }, { "epoch": 0.51698670605613, "grad_norm": 0.021763555705547333, "learning_rate": 1.793402264894141e-05, "loss": 0.0027, "step": 1050 }, { "epoch": 0.5219103889709503, "grad_norm": 0.021067790687084198, "learning_rate": 1.791432791728213e-05, "loss": 0.0029, "step": 1060 }, { "epoch": 0.5268340718857706, "grad_norm": 0.02510879375040531, "learning_rate": 1.789463318562285e-05, "loss": 0.0027, "step": 1070 }, { "epoch": 0.5317577548005908, "grad_norm": 0.02315102145075798, "learning_rate": 1.7874938453963565e-05, "loss": 0.0026, "step": 1080 }, { "epoch": 0.5366814377154111, "grad_norm": 0.02186872623860836, "learning_rate": 1.7855243722304285e-05, "loss": 0.0029, "step": 1090 }, { "epoch": 0.5416051206302314, "grad_norm": 0.019837426021695137, "learning_rate": 1.7835548990645005e-05, "loss": 0.0025, "step": 1100 }, { "epoch": 0.5465288035450517, "grad_norm": 0.02163524180650711, "learning_rate": 1.7815854258985722e-05, "loss": 0.0403, "step": 1110 }, { "epoch": 0.551452486459872, "grad_norm": 0.02067442610859871, "learning_rate": 1.779615952732644e-05, "loss": 0.0252, "step": 1120 }, { "epoch": 0.5563761693746923, "grad_norm": 0.020983709022402763, "learning_rate": 1.777646479566716e-05, "loss": 0.0083, "step": 1130 }, { "epoch": 0.5612998522895125, "grad_norm": 0.01975114829838276, "learning_rate": 1.775677006400788e-05, "loss": 0.0085, "step": 1140 }, { "epoch": 0.5662235352043329, "grad_norm": 0.02342822030186653, "learning_rate": 1.7737075332348598e-05, "loss": 0.0287, "step": 1150 }, { "epoch": 0.5711472181191531, "grad_norm": 0.021341597661376, "learning_rate": 1.7717380600689318e-05, "loss": 0.0123, "step": 1160 }, { "epoch": 0.5760709010339734, "grad_norm": 1.7539845705032349, "learning_rate": 1.7697685869030038e-05, "loss": 0.0082, "step": 1170 }, { "epoch": 0.5809945839487937, "grad_norm": 0.026660829782485962, "learning_rate": 1.7677991137370754e-05, "loss": 0.0056, "step": 1180 }, { "epoch": 0.585918266863614, "grad_norm": 0.01867399923503399, "learning_rate": 1.7658296405711474e-05, "loss": 0.0024, "step": 1190 }, { "epoch": 0.5908419497784343, "grad_norm": 0.019043035805225372, "learning_rate": 1.763860167405219e-05, "loss": 0.0022, "step": 1200 }, { "epoch": 0.5957656326932546, "grad_norm": 0.018355844542384148, "learning_rate": 1.761890694239291e-05, "loss": 0.0025, "step": 1210 }, { "epoch": 0.6006893156080748, "grad_norm": 0.01786232925951481, "learning_rate": 1.759921221073363e-05, "loss": 0.0022, "step": 1220 }, { "epoch": 0.6056129985228951, "grad_norm": 0.01761608012020588, "learning_rate": 1.7579517479074347e-05, "loss": 0.0026, "step": 1230 }, { "epoch": 0.6105366814377154, "grad_norm": 0.018296098336577415, "learning_rate": 1.7559822747415067e-05, "loss": 0.0022, "step": 1240 }, { "epoch": 0.6154603643525357, "grad_norm": 0.017041673883795738, "learning_rate": 1.7540128015755787e-05, "loss": 0.0024, "step": 1250 }, { "epoch": 0.620384047267356, "grad_norm": 0.016793906688690186, "learning_rate": 1.7520433284096507e-05, "loss": 0.002, "step": 1260 }, { "epoch": 0.6253077301821762, "grad_norm": 2.5061357021331787, "learning_rate": 1.7500738552437223e-05, "loss": 0.0658, "step": 1270 }, { "epoch": 0.6302314130969966, "grad_norm": 0.024613628163933754, "learning_rate": 1.7481043820777943e-05, "loss": 0.0691, "step": 1280 }, { "epoch": 0.6351550960118169, "grad_norm": 0.06863340735435486, "learning_rate": 1.7461349089118663e-05, "loss": 0.0388, "step": 1290 }, { "epoch": 0.6400787789266371, "grad_norm": 0.02072734758257866, "learning_rate": 1.744165435745938e-05, "loss": 0.0655, "step": 1300 }, { "epoch": 0.6450024618414574, "grad_norm": 0.025477182120084763, "learning_rate": 1.74219596258001e-05, "loss": 0.0033, "step": 1310 }, { "epoch": 0.6499261447562777, "grad_norm": 0.01841667853295803, "learning_rate": 1.740226489414082e-05, "loss": 0.0085, "step": 1320 }, { "epoch": 0.654849827671098, "grad_norm": 0.024184470996260643, "learning_rate": 1.738257016248154e-05, "loss": 0.0028, "step": 1330 }, { "epoch": 0.6597735105859183, "grad_norm": 0.02624395862221718, "learning_rate": 1.7362875430822256e-05, "loss": 0.017, "step": 1340 }, { "epoch": 0.6646971935007385, "grad_norm": 0.10919595509767532, "learning_rate": 1.7343180699162976e-05, "loss": 0.0023, "step": 1350 }, { "epoch": 0.6696208764155588, "grad_norm": 0.016470756381750107, "learning_rate": 1.7323485967503696e-05, "loss": 0.0023, "step": 1360 }, { "epoch": 0.6745445593303792, "grad_norm": 0.016200121492147446, "learning_rate": 1.7303791235844412e-05, "loss": 0.0024, "step": 1370 }, { "epoch": 0.6794682422451994, "grad_norm": 0.015988627448678017, "learning_rate": 1.7284096504185132e-05, "loss": 0.0021, "step": 1380 }, { "epoch": 0.6843919251600197, "grad_norm": 0.01572404056787491, "learning_rate": 1.7264401772525852e-05, "loss": 0.0023, "step": 1390 }, { "epoch": 0.6893156080748399, "grad_norm": 9.45742130279541, "learning_rate": 1.7244707040866572e-05, "loss": 0.0434, "step": 1400 }, { "epoch": 0.6942392909896603, "grad_norm": 0.015786904841661453, "learning_rate": 1.722501230920729e-05, "loss": 0.0026, "step": 1410 }, { "epoch": 0.6991629739044806, "grad_norm": 0.01596219092607498, "learning_rate": 1.7205317577548005e-05, "loss": 0.0021, "step": 1420 }, { "epoch": 0.7040866568193008, "grad_norm": 0.01756127178668976, "learning_rate": 1.7185622845888725e-05, "loss": 0.0221, "step": 1430 }, { "epoch": 0.7090103397341211, "grad_norm": 0.0174099151045084, "learning_rate": 1.7165928114229445e-05, "loss": 0.0024, "step": 1440 }, { "epoch": 0.7139340226489415, "grad_norm": 0.01599658839404583, "learning_rate": 1.7146233382570165e-05, "loss": 0.005, "step": 1450 }, { "epoch": 0.7188577055637617, "grad_norm": 0.015532166697084904, "learning_rate": 1.712653865091088e-05, "loss": 0.002, "step": 1460 }, { "epoch": 0.723781388478582, "grad_norm": 0.05480727553367615, "learning_rate": 1.71068439192516e-05, "loss": 0.0717, "step": 1470 }, { "epoch": 0.7287050713934022, "grad_norm": 0.01733849197626114, "learning_rate": 1.708714918759232e-05, "loss": 0.0018, "step": 1480 }, { "epoch": 0.7336287543082225, "grad_norm": 0.028923368081450462, "learning_rate": 1.7067454455933037e-05, "loss": 0.002, "step": 1490 }, { "epoch": 0.7385524372230429, "grad_norm": 0.020347867161035538, "learning_rate": 1.7047759724273757e-05, "loss": 0.002, "step": 1500 }, { "epoch": 0.7434761201378631, "grad_norm": 0.01512329000979662, "learning_rate": 1.7028064992614477e-05, "loss": 0.0018, "step": 1510 }, { "epoch": 0.7483998030526834, "grad_norm": 0.014671691693365574, "learning_rate": 1.7008370260955197e-05, "loss": 0.0019, "step": 1520 }, { "epoch": 0.7533234859675036, "grad_norm": 0.015612194314599037, "learning_rate": 1.6988675529295914e-05, "loss": 0.0018, "step": 1530 }, { "epoch": 0.758247168882324, "grad_norm": 0.014378263615071774, "learning_rate": 1.6968980797636634e-05, "loss": 0.0018, "step": 1540 }, { "epoch": 0.7631708517971443, "grad_norm": 0.013822750188410282, "learning_rate": 1.6949286065977353e-05, "loss": 0.0018, "step": 1550 }, { "epoch": 0.7680945347119645, "grad_norm": 0.01382039301097393, "learning_rate": 1.6929591334318073e-05, "loss": 0.0017, "step": 1560 }, { "epoch": 0.7730182176267848, "grad_norm": 0.015447800047695637, "learning_rate": 1.690989660265879e-05, "loss": 0.0017, "step": 1570 }, { "epoch": 0.7779419005416052, "grad_norm": 0.016364755108952522, "learning_rate": 1.689020187099951e-05, "loss": 0.0679, "step": 1580 }, { "epoch": 0.7828655834564254, "grad_norm": 0.01517761405557394, "learning_rate": 1.687050713934023e-05, "loss": 0.0018, "step": 1590 }, { "epoch": 0.7877892663712457, "grad_norm": 0.014340780675411224, "learning_rate": 1.6850812407680946e-05, "loss": 0.0022, "step": 1600 }, { "epoch": 0.792712949286066, "grad_norm": 0.014129845425486565, "learning_rate": 1.6831117676021666e-05, "loss": 0.0018, "step": 1610 }, { "epoch": 0.7976366322008862, "grad_norm": 0.01870601251721382, "learning_rate": 1.6811422944362386e-05, "loss": 0.002, "step": 1620 }, { "epoch": 0.8025603151157066, "grad_norm": 0.017389826476573944, "learning_rate": 1.6791728212703103e-05, "loss": 0.0019, "step": 1630 }, { "epoch": 0.8074839980305268, "grad_norm": 0.013149221427738667, "learning_rate": 1.6772033481043822e-05, "loss": 0.0546, "step": 1640 }, { "epoch": 0.8124076809453471, "grad_norm": 0.013570933602750301, "learning_rate": 1.675233874938454e-05, "loss": 0.0043, "step": 1650 }, { "epoch": 0.8173313638601674, "grad_norm": 0.013511077500879765, "learning_rate": 1.673264401772526e-05, "loss": 0.0019, "step": 1660 }, { "epoch": 0.8222550467749877, "grad_norm": 0.015698149800300598, "learning_rate": 1.671294928606598e-05, "loss": 0.0019, "step": 1670 }, { "epoch": 0.827178729689808, "grad_norm": 0.013056913390755653, "learning_rate": 1.66932545544067e-05, "loss": 0.0018, "step": 1680 }, { "epoch": 0.8321024126046282, "grad_norm": 0.015361153520643711, "learning_rate": 1.6673559822747415e-05, "loss": 0.002, "step": 1690 }, { "epoch": 0.8370260955194485, "grad_norm": 0.013177746906876564, "learning_rate": 1.6653865091088135e-05, "loss": 0.0015, "step": 1700 }, { "epoch": 0.8419497784342689, "grad_norm": 0.05426933988928795, "learning_rate": 1.6634170359428855e-05, "loss": 0.0095, "step": 1710 }, { "epoch": 0.8468734613490891, "grad_norm": 0.01309322752058506, "learning_rate": 1.661447562776957e-05, "loss": 0.0016, "step": 1720 }, { "epoch": 0.8517971442639094, "grad_norm": 0.015039416030049324, "learning_rate": 1.659478089611029e-05, "loss": 0.0016, "step": 1730 }, { "epoch": 0.8567208271787297, "grad_norm": 0.014542197808623314, "learning_rate": 1.657508616445101e-05, "loss": 0.0018, "step": 1740 }, { "epoch": 0.8616445100935499, "grad_norm": 0.012222396209836006, "learning_rate": 1.655539143279173e-05, "loss": 0.0014, "step": 1750 }, { "epoch": 0.8665681930083703, "grad_norm": 0.012340564280748367, "learning_rate": 1.6535696701132448e-05, "loss": 0.0014, "step": 1760 }, { "epoch": 0.8714918759231906, "grad_norm": 0.011919394135475159, "learning_rate": 1.6516001969473168e-05, "loss": 0.0016, "step": 1770 }, { "epoch": 0.8764155588380108, "grad_norm": 0.01167115569114685, "learning_rate": 1.6496307237813888e-05, "loss": 0.0016, "step": 1780 }, { "epoch": 0.8813392417528311, "grad_norm": 0.013182350434362888, "learning_rate": 1.6476612506154604e-05, "loss": 0.0015, "step": 1790 }, { "epoch": 0.8862629246676514, "grad_norm": 0.013948991894721985, "learning_rate": 1.6456917774495324e-05, "loss": 0.0014, "step": 1800 }, { "epoch": 0.8911866075824717, "grad_norm": 0.01775103434920311, "learning_rate": 1.6437223042836044e-05, "loss": 0.0015, "step": 1810 }, { "epoch": 0.896110290497292, "grad_norm": 0.01155338529497385, "learning_rate": 1.6417528311176764e-05, "loss": 0.002, "step": 1820 }, { "epoch": 0.9010339734121122, "grad_norm": 0.014259099029004574, "learning_rate": 1.639783357951748e-05, "loss": 0.0016, "step": 1830 }, { "epoch": 0.9059576563269326, "grad_norm": 0.014643428847193718, "learning_rate": 1.6378138847858197e-05, "loss": 0.0014, "step": 1840 }, { "epoch": 0.9108813392417529, "grad_norm": 0.011148291639983654, "learning_rate": 1.6358444116198917e-05, "loss": 0.0706, "step": 1850 }, { "epoch": 0.9158050221565731, "grad_norm": 0.012168935500085354, "learning_rate": 1.6338749384539637e-05, "loss": 0.0014, "step": 1860 }, { "epoch": 0.9207287050713934, "grad_norm": 0.011070801876485348, "learning_rate": 1.6319054652880356e-05, "loss": 0.0018, "step": 1870 }, { "epoch": 0.9256523879862137, "grad_norm": 0.023191597312688828, "learning_rate": 1.6299359921221073e-05, "loss": 0.0023, "step": 1880 }, { "epoch": 0.930576070901034, "grad_norm": 0.0227424968034029, "learning_rate": 1.6279665189561793e-05, "loss": 0.0867, "step": 1890 }, { "epoch": 0.9354997538158543, "grad_norm": 0.023284750059247017, "learning_rate": 1.6259970457902513e-05, "loss": 0.002, "step": 1900 }, { "epoch": 0.9404234367306745, "grad_norm": 0.013480834662914276, "learning_rate": 1.624027572624323e-05, "loss": 0.0028, "step": 1910 }, { "epoch": 0.9453471196454948, "grad_norm": 0.0108991339802742, "learning_rate": 1.622058099458395e-05, "loss": 0.0084, "step": 1920 }, { "epoch": 0.9502708025603152, "grad_norm": 0.011104236356914043, "learning_rate": 1.620088626292467e-05, "loss": 0.0104, "step": 1930 }, { "epoch": 0.9551944854751354, "grad_norm": 0.011128348298370838, "learning_rate": 1.618119153126539e-05, "loss": 0.0018, "step": 1940 }, { "epoch": 0.9601181683899557, "grad_norm": 0.01104864850640297, "learning_rate": 1.6161496799606106e-05, "loss": 0.0014, "step": 1950 }, { "epoch": 0.9650418513047759, "grad_norm": 0.03145885095000267, "learning_rate": 1.6141802067946825e-05, "loss": 0.0086, "step": 1960 }, { "epoch": 0.9699655342195963, "grad_norm": 0.012039147317409515, "learning_rate": 1.6122107336287545e-05, "loss": 0.0512, "step": 1970 }, { "epoch": 0.9748892171344166, "grad_norm": 0.011414915323257446, "learning_rate": 1.6102412604628265e-05, "loss": 0.0015, "step": 1980 }, { "epoch": 0.9798129000492368, "grad_norm": 0.013891511596739292, "learning_rate": 1.6082717872968982e-05, "loss": 0.0015, "step": 1990 }, { "epoch": 0.9847365829640571, "grad_norm": 0.010472927242517471, "learning_rate": 1.60630231413097e-05, "loss": 0.0024, "step": 2000 }, { "epoch": 0.9896602658788775, "grad_norm": 0.010258992202579975, "learning_rate": 1.604332840965042e-05, "loss": 0.0015, "step": 2010 }, { "epoch": 0.9945839487936977, "grad_norm": 0.021762333810329437, "learning_rate": 1.6023633677991138e-05, "loss": 0.0017, "step": 2020 }, { "epoch": 0.999507631708518, "grad_norm": 0.009938563220202923, "learning_rate": 1.6003938946331858e-05, "loss": 0.0014, "step": 2030 }, { "epoch": 1.0, "eval_accuracy": 0.9986048133937914, "eval_loss": 0.006461690180003643, "eval_runtime": 124.9038, "eval_samples_per_second": 22.954, "eval_steps_per_second": 2.874, "step": 2031 }, { "epoch": 1.0044313146233383, "grad_norm": 0.009946716949343681, "learning_rate": 1.5984244214672578e-05, "loss": 0.0013, "step": 2040 }, { "epoch": 1.0093549975381586, "grad_norm": 0.01007277425378561, "learning_rate": 1.5964549483013298e-05, "loss": 0.0051, "step": 2050 }, { "epoch": 1.0142786804529789, "grad_norm": 0.009899957105517387, "learning_rate": 1.5944854751354014e-05, "loss": 0.0012, "step": 2060 }, { "epoch": 1.0192023633677991, "grad_norm": 0.011700589209794998, "learning_rate": 1.592516001969473e-05, "loss": 0.0014, "step": 2070 }, { "epoch": 1.0241260462826194, "grad_norm": 0.01607823744416237, "learning_rate": 1.590546528803545e-05, "loss": 0.0627, "step": 2080 }, { "epoch": 1.0290497291974396, "grad_norm": 0.010469055734574795, "learning_rate": 1.588577055637617e-05, "loss": 0.0013, "step": 2090 }, { "epoch": 1.03397341211226, "grad_norm": 0.010042490437626839, "learning_rate": 1.586607582471689e-05, "loss": 0.0014, "step": 2100 }, { "epoch": 1.0388970950270802, "grad_norm": 0.00974634476006031, "learning_rate": 1.5846381093057607e-05, "loss": 0.0015, "step": 2110 }, { "epoch": 1.0438207779419006, "grad_norm": 0.026304766535758972, "learning_rate": 1.5826686361398327e-05, "loss": 0.0017, "step": 2120 }, { "epoch": 1.048744460856721, "grad_norm": 0.022236375138163567, "learning_rate": 1.5806991629739047e-05, "loss": 0.0013, "step": 2130 }, { "epoch": 1.0536681437715412, "grad_norm": 0.025571007281541824, "learning_rate": 1.5787296898079763e-05, "loss": 0.0017, "step": 2140 }, { "epoch": 1.0585918266863614, "grad_norm": 0.009789519011974335, "learning_rate": 1.5767602166420483e-05, "loss": 0.0012, "step": 2150 }, { "epoch": 1.0635155096011817, "grad_norm": 0.009336259216070175, "learning_rate": 1.5747907434761203e-05, "loss": 0.0014, "step": 2160 }, { "epoch": 1.068439192516002, "grad_norm": 0.009336400777101517, "learning_rate": 1.5728212703101923e-05, "loss": 0.0012, "step": 2170 }, { "epoch": 1.0733628754308222, "grad_norm": 0.009125406853854656, "learning_rate": 1.570851797144264e-05, "loss": 0.0011, "step": 2180 }, { "epoch": 1.0782865583456425, "grad_norm": 0.008981688879430294, "learning_rate": 1.568882323978336e-05, "loss": 0.0012, "step": 2190 }, { "epoch": 1.0832102412604627, "grad_norm": 0.008810392580926418, "learning_rate": 1.566912850812408e-05, "loss": 0.0011, "step": 2200 }, { "epoch": 1.0881339241752832, "grad_norm": 0.02159319818019867, "learning_rate": 1.5649433776464796e-05, "loss": 0.0012, "step": 2210 }, { "epoch": 1.0930576070901035, "grad_norm": 2.4877359867095947, "learning_rate": 1.5629739044805516e-05, "loss": 0.0381, "step": 2220 }, { "epoch": 1.0979812900049237, "grad_norm": 0.011043106205761433, "learning_rate": 1.5610044313146236e-05, "loss": 0.0012, "step": 2230 }, { "epoch": 1.102904972919744, "grad_norm": 0.012737146578729153, "learning_rate": 1.5590349581486956e-05, "loss": 0.0017, "step": 2240 }, { "epoch": 1.1078286558345642, "grad_norm": 0.045515026897192, "learning_rate": 1.5570654849827672e-05, "loss": 0.0013, "step": 2250 }, { "epoch": 1.1127523387493845, "grad_norm": 0.010885367169976234, "learning_rate": 1.5550960118168392e-05, "loss": 0.0013, "step": 2260 }, { "epoch": 1.1176760216642048, "grad_norm": 0.010456659831106663, "learning_rate": 1.553126538650911e-05, "loss": 0.0011, "step": 2270 }, { "epoch": 1.122599704579025, "grad_norm": 0.008621557615697384, "learning_rate": 1.551157065484983e-05, "loss": 0.0013, "step": 2280 }, { "epoch": 1.1275233874938455, "grad_norm": 0.008608179166913033, "learning_rate": 1.549187592319055e-05, "loss": 0.001, "step": 2290 }, { "epoch": 1.1324470704086658, "grad_norm": 0.008878961205482483, "learning_rate": 1.5472181191531265e-05, "loss": 0.0599, "step": 2300 }, { "epoch": 1.137370753323486, "grad_norm": 0.008603103458881378, "learning_rate": 1.5452486459871985e-05, "loss": 0.0013, "step": 2310 }, { "epoch": 1.1422944362383063, "grad_norm": 0.008407089859247208, "learning_rate": 1.5432791728212705e-05, "loss": 0.0166, "step": 2320 }, { "epoch": 1.1472181191531265, "grad_norm": 1.1491219997406006, "learning_rate": 1.541309699655342e-05, "loss": 0.0027, "step": 2330 }, { "epoch": 1.1521418020679468, "grad_norm": 0.008799925446510315, "learning_rate": 1.539340226489414e-05, "loss": 0.0064, "step": 2340 }, { "epoch": 1.157065484982767, "grad_norm": 0.102848581969738, "learning_rate": 1.537370753323486e-05, "loss": 0.0014, "step": 2350 }, { "epoch": 1.1619891678975873, "grad_norm": 0.01129199005663395, "learning_rate": 1.535401280157558e-05, "loss": 0.0628, "step": 2360 }, { "epoch": 1.1669128508124076, "grad_norm": 0.011105939745903015, "learning_rate": 1.5334318069916297e-05, "loss": 0.0322, "step": 2370 }, { "epoch": 1.171836533727228, "grad_norm": 0.009356286376714706, "learning_rate": 1.5314623338257017e-05, "loss": 0.0023, "step": 2380 }, { "epoch": 1.1767602166420483, "grad_norm": 0.010616080835461617, "learning_rate": 1.5294928606597737e-05, "loss": 0.0016, "step": 2390 }, { "epoch": 1.1816838995568686, "grad_norm": 0.024889415130019188, "learning_rate": 1.5275233874938454e-05, "loss": 0.0012, "step": 2400 }, { "epoch": 1.1866075824716888, "grad_norm": 0.013439149595797062, "learning_rate": 1.5255539143279174e-05, "loss": 0.0054, "step": 2410 }, { "epoch": 1.191531265386509, "grad_norm": 0.008294392377138138, "learning_rate": 1.5235844411619894e-05, "loss": 0.0013, "step": 2420 }, { "epoch": 1.1964549483013294, "grad_norm": 0.008172878995537758, "learning_rate": 1.5216149679960612e-05, "loss": 0.001, "step": 2430 }, { "epoch": 1.2013786312161496, "grad_norm": 0.008062479086220264, "learning_rate": 1.5196454948301332e-05, "loss": 0.001, "step": 2440 }, { "epoch": 1.2063023141309699, "grad_norm": 0.015131563879549503, "learning_rate": 1.517676021664205e-05, "loss": 0.0011, "step": 2450 }, { "epoch": 1.2112259970457901, "grad_norm": 0.017097556963562965, "learning_rate": 1.515706548498277e-05, "loss": 0.025, "step": 2460 }, { "epoch": 1.2161496799606106, "grad_norm": 0.011799236759543419, "learning_rate": 1.5137370753323488e-05, "loss": 0.0011, "step": 2470 }, { "epoch": 1.221073362875431, "grad_norm": 0.024478154256939888, "learning_rate": 1.5117676021664208e-05, "loss": 0.0732, "step": 2480 }, { "epoch": 1.2259970457902511, "grad_norm": 0.013807065784931183, "learning_rate": 1.5097981290004924e-05, "loss": 0.0014, "step": 2490 }, { "epoch": 1.2309207287050714, "grad_norm": 0.010231712833046913, "learning_rate": 1.5078286558345643e-05, "loss": 0.0014, "step": 2500 }, { "epoch": 1.2358444116198917, "grad_norm": 0.012537546455860138, "learning_rate": 1.5058591826686363e-05, "loss": 0.0626, "step": 2510 }, { "epoch": 1.240768094534712, "grad_norm": 0.00985932070761919, "learning_rate": 1.503889709502708e-05, "loss": 0.0012, "step": 2520 }, { "epoch": 1.2456917774495322, "grad_norm": 0.01136948075145483, "learning_rate": 1.5019202363367799e-05, "loss": 0.0028, "step": 2530 }, { "epoch": 1.2506154603643527, "grad_norm": 0.009887000545859337, "learning_rate": 1.4999507631708519e-05, "loss": 0.0012, "step": 2540 }, { "epoch": 1.2555391432791727, "grad_norm": 0.009588898159563541, "learning_rate": 1.4979812900049237e-05, "loss": 0.0018, "step": 2550 }, { "epoch": 1.2604628261939932, "grad_norm": 0.00931033119559288, "learning_rate": 1.4960118168389957e-05, "loss": 0.001, "step": 2560 }, { "epoch": 1.2653865091088135, "grad_norm": 0.008880667388439178, "learning_rate": 1.4940423436730675e-05, "loss": 0.001, "step": 2570 }, { "epoch": 1.2703101920236337, "grad_norm": 0.008290020748972893, "learning_rate": 1.4920728705071395e-05, "loss": 0.0022, "step": 2580 }, { "epoch": 1.275233874938454, "grad_norm": 0.013004295527935028, "learning_rate": 1.4901033973412113e-05, "loss": 0.0011, "step": 2590 }, { "epoch": 1.2801575578532742, "grad_norm": 0.009094899520277977, "learning_rate": 1.4881339241752833e-05, "loss": 0.0011, "step": 2600 }, { "epoch": 1.2850812407680945, "grad_norm": 0.007369400467723608, "learning_rate": 1.4861644510093551e-05, "loss": 0.0011, "step": 2610 }, { "epoch": 1.2900049236829148, "grad_norm": 0.01753099076449871, "learning_rate": 1.484194977843427e-05, "loss": 0.001, "step": 2620 }, { "epoch": 1.2949286065977352, "grad_norm": 0.007790517993271351, "learning_rate": 1.482225504677499e-05, "loss": 0.001, "step": 2630 }, { "epoch": 1.2998522895125553, "grad_norm": 0.007512846030294895, "learning_rate": 1.4802560315115708e-05, "loss": 0.0013, "step": 2640 }, { "epoch": 1.3047759724273758, "grad_norm": 0.009430987760424614, "learning_rate": 1.4782865583456428e-05, "loss": 0.001, "step": 2650 }, { "epoch": 1.309699655342196, "grad_norm": 0.025956837460398674, "learning_rate": 1.4763170851797146e-05, "loss": 0.001, "step": 2660 }, { "epoch": 1.3146233382570163, "grad_norm": 0.007779216393828392, "learning_rate": 1.4743476120137866e-05, "loss": 0.006, "step": 2670 }, { "epoch": 1.3195470211718365, "grad_norm": 0.0102881183847785, "learning_rate": 1.4723781388478584e-05, "loss": 0.0027, "step": 2680 }, { "epoch": 1.3244707040866568, "grad_norm": 0.007080462761223316, "learning_rate": 1.4704086656819302e-05, "loss": 0.0011, "step": 2690 }, { "epoch": 1.329394387001477, "grad_norm": 0.0068712919019162655, "learning_rate": 1.4684391925160022e-05, "loss": 0.0009, "step": 2700 }, { "epoch": 1.3343180699162973, "grad_norm": 0.006963066756725311, "learning_rate": 1.4664697193500739e-05, "loss": 0.0008, "step": 2710 }, { "epoch": 1.3392417528311178, "grad_norm": 0.007015050854533911, "learning_rate": 1.4645002461841459e-05, "loss": 0.0068, "step": 2720 }, { "epoch": 1.3441654357459378, "grad_norm": 0.006741439923644066, "learning_rate": 1.4625307730182177e-05, "loss": 0.0008, "step": 2730 }, { "epoch": 1.3490891186607583, "grad_norm": 0.009445318952202797, "learning_rate": 1.4605612998522895e-05, "loss": 0.0008, "step": 2740 }, { "epoch": 1.3540128015755786, "grad_norm": 0.006745448801666498, "learning_rate": 1.4585918266863615e-05, "loss": 0.0009, "step": 2750 }, { "epoch": 1.3589364844903988, "grad_norm": 0.0067230272106826305, "learning_rate": 1.4566223535204333e-05, "loss": 0.0756, "step": 2760 }, { "epoch": 1.363860167405219, "grad_norm": 0.006957577541470528, "learning_rate": 1.4546528803545053e-05, "loss": 0.0009, "step": 2770 }, { "epoch": 1.3687838503200394, "grad_norm": 0.008368587121367455, "learning_rate": 1.4526834071885771e-05, "loss": 0.0271, "step": 2780 }, { "epoch": 1.3737075332348596, "grad_norm": 0.008987442590296268, "learning_rate": 1.4507139340226491e-05, "loss": 0.001, "step": 2790 }, { "epoch": 1.3786312161496799, "grad_norm": 0.009542476385831833, "learning_rate": 1.448744460856721e-05, "loss": 0.0011, "step": 2800 }, { "epoch": 1.3835548990645004, "grad_norm": 0.006845912430435419, "learning_rate": 1.4467749876907927e-05, "loss": 0.0167, "step": 2810 }, { "epoch": 1.3884785819793206, "grad_norm": 0.00742871779948473, "learning_rate": 1.4448055145248647e-05, "loss": 0.0033, "step": 2820 }, { "epoch": 1.3934022648941409, "grad_norm": 6.05401086807251, "learning_rate": 1.4428360413589366e-05, "loss": 0.0399, "step": 2830 }, { "epoch": 1.3983259478089611, "grad_norm": 0.007821030914783478, "learning_rate": 1.4408665681930086e-05, "loss": 0.0014, "step": 2840 }, { "epoch": 1.4032496307237814, "grad_norm": 0.006872816011309624, "learning_rate": 1.4388970950270804e-05, "loss": 0.0129, "step": 2850 }, { "epoch": 1.4081733136386017, "grad_norm": 0.006989907938987017, "learning_rate": 1.4369276218611524e-05, "loss": 0.02, "step": 2860 }, { "epoch": 1.413096996553422, "grad_norm": 0.017713190987706184, "learning_rate": 1.4349581486952242e-05, "loss": 0.076, "step": 2870 }, { "epoch": 1.4180206794682422, "grad_norm": 0.009753022342920303, "learning_rate": 1.4329886755292962e-05, "loss": 0.0024, "step": 2880 }, { "epoch": 1.4229443623830624, "grad_norm": 0.010295004583895206, "learning_rate": 1.431019202363368e-05, "loss": 0.0011, "step": 2890 }, { "epoch": 1.427868045297883, "grad_norm": 0.009071256965398788, "learning_rate": 1.4290497291974398e-05, "loss": 0.001, "step": 2900 }, { "epoch": 1.4327917282127032, "grad_norm": 0.009070048108696938, "learning_rate": 1.4270802560315118e-05, "loss": 0.001, "step": 2910 }, { "epoch": 1.4377154111275234, "grad_norm": 0.007653309963643551, "learning_rate": 1.4251107828655835e-05, "loss": 0.001, "step": 2920 }, { "epoch": 1.4426390940423437, "grad_norm": 0.006937779951840639, "learning_rate": 1.4231413096996553e-05, "loss": 0.0008, "step": 2930 }, { "epoch": 1.447562776957164, "grad_norm": 0.008243228308856487, "learning_rate": 1.4211718365337273e-05, "loss": 0.001, "step": 2940 }, { "epoch": 1.4524864598719842, "grad_norm": 0.010159426368772984, "learning_rate": 1.4192023633677991e-05, "loss": 0.0669, "step": 2950 }, { "epoch": 1.4574101427868045, "grad_norm": 0.009640317410230637, "learning_rate": 1.417232890201871e-05, "loss": 0.0244, "step": 2960 }, { "epoch": 1.4623338257016247, "grad_norm": 0.11323712021112442, "learning_rate": 1.4152634170359429e-05, "loss": 0.0015, "step": 2970 }, { "epoch": 1.467257508616445, "grad_norm": 0.00797420833259821, "learning_rate": 1.4132939438700149e-05, "loss": 0.0365, "step": 2980 }, { "epoch": 1.4721811915312655, "grad_norm": 0.0069627827033400536, "learning_rate": 1.4113244707040867e-05, "loss": 0.0036, "step": 2990 }, { "epoch": 1.4771048744460857, "grad_norm": 0.008956373669207096, "learning_rate": 1.4093549975381587e-05, "loss": 0.0019, "step": 3000 }, { "epoch": 1.482028557360906, "grad_norm": 0.0071487524546682835, "learning_rate": 1.4073855243722305e-05, "loss": 0.0175, "step": 3010 }, { "epoch": 1.4869522402757263, "grad_norm": 0.009181806817650795, "learning_rate": 1.4054160512063023e-05, "loss": 0.0012, "step": 3020 }, { "epoch": 1.4918759231905465, "grad_norm": 0.006655732169747353, "learning_rate": 1.4034465780403743e-05, "loss": 0.0008, "step": 3030 }, { "epoch": 1.4967996061053668, "grad_norm": 0.007243420463055372, "learning_rate": 1.4014771048744462e-05, "loss": 0.0044, "step": 3040 }, { "epoch": 1.501723289020187, "grad_norm": 0.012056315317749977, "learning_rate": 1.3995076317085181e-05, "loss": 0.0011, "step": 3050 }, { "epoch": 1.5066469719350075, "grad_norm": 0.007087447214871645, "learning_rate": 1.39753815854259e-05, "loss": 0.0008, "step": 3060 }, { "epoch": 1.5115706548498276, "grad_norm": 0.011527610942721367, "learning_rate": 1.395568685376662e-05, "loss": 0.0009, "step": 3070 }, { "epoch": 1.516494337764648, "grad_norm": 0.009305012412369251, "learning_rate": 1.3935992122107338e-05, "loss": 0.0008, "step": 3080 }, { "epoch": 1.5214180206794683, "grad_norm": 0.010509533807635307, "learning_rate": 1.3916297390448058e-05, "loss": 0.0008, "step": 3090 }, { "epoch": 1.5263417035942886, "grad_norm": 0.0072973608039319515, "learning_rate": 1.3896602658788776e-05, "loss": 0.0158, "step": 3100 }, { "epoch": 1.5312653865091088, "grad_norm": 0.00608638534322381, "learning_rate": 1.3876907927129494e-05, "loss": 0.0097, "step": 3110 }, { "epoch": 1.536189069423929, "grad_norm": 0.018107222393155098, "learning_rate": 1.3857213195470214e-05, "loss": 0.0008, "step": 3120 }, { "epoch": 1.5411127523387493, "grad_norm": 0.007667326834052801, "learning_rate": 1.3837518463810932e-05, "loss": 0.0009, "step": 3130 }, { "epoch": 1.5460364352535696, "grad_norm": 0.006599190644919872, "learning_rate": 1.3817823732151649e-05, "loss": 0.0087, "step": 3140 }, { "epoch": 1.55096011816839, "grad_norm": 0.007009952329099178, "learning_rate": 1.3798129000492369e-05, "loss": 0.0012, "step": 3150 }, { "epoch": 1.5558838010832101, "grad_norm": 0.006010917481034994, "learning_rate": 1.3778434268833087e-05, "loss": 0.0008, "step": 3160 }, { "epoch": 1.5608074839980306, "grad_norm": 0.006966378074139357, "learning_rate": 1.3758739537173807e-05, "loss": 0.0007, "step": 3170 }, { "epoch": 1.5657311669128509, "grad_norm": 0.0060759298503398895, "learning_rate": 1.3739044805514525e-05, "loss": 0.0012, "step": 3180 }, { "epoch": 1.5706548498276711, "grad_norm": 0.005752959754317999, "learning_rate": 1.3719350073855245e-05, "loss": 0.0007, "step": 3190 }, { "epoch": 1.5755785327424914, "grad_norm": 0.006781155243515968, "learning_rate": 1.3699655342195963e-05, "loss": 0.0007, "step": 3200 }, { "epoch": 1.5805022156573116, "grad_norm": 0.005668849218636751, "learning_rate": 1.3679960610536683e-05, "loss": 0.0196, "step": 3210 }, { "epoch": 1.5854258985721321, "grad_norm": 0.005629651714116335, "learning_rate": 1.3660265878877401e-05, "loss": 0.0007, "step": 3220 }, { "epoch": 1.5903495814869522, "grad_norm": 0.00564204016700387, "learning_rate": 1.364057114721812e-05, "loss": 0.0011, "step": 3230 }, { "epoch": 1.5952732644017726, "grad_norm": 0.005540564656257629, "learning_rate": 1.362087641555884e-05, "loss": 0.0008, "step": 3240 }, { "epoch": 1.6001969473165927, "grad_norm": 0.006026837043464184, "learning_rate": 1.3601181683899558e-05, "loss": 0.0425, "step": 3250 }, { "epoch": 1.6051206302314132, "grad_norm": 0.00552277360111475, "learning_rate": 1.3581486952240277e-05, "loss": 0.0008, "step": 3260 }, { "epoch": 1.6100443131462334, "grad_norm": 0.018236679956316948, "learning_rate": 1.3561792220580996e-05, "loss": 0.0007, "step": 3270 }, { "epoch": 1.6149679960610537, "grad_norm": 0.00691232131794095, "learning_rate": 1.3542097488921716e-05, "loss": 0.0007, "step": 3280 }, { "epoch": 1.619891678975874, "grad_norm": 0.006581007968634367, "learning_rate": 1.3522402757262434e-05, "loss": 0.0007, "step": 3290 }, { "epoch": 1.6248153618906942, "grad_norm": 0.0053688762709498405, "learning_rate": 1.3502708025603154e-05, "loss": 0.0008, "step": 3300 }, { "epoch": 1.6297390448055147, "grad_norm": 1.0336942672729492, "learning_rate": 1.3483013293943872e-05, "loss": 0.0076, "step": 3310 }, { "epoch": 1.6346627277203347, "grad_norm": 0.005269223358482122, "learning_rate": 1.346331856228459e-05, "loss": 0.0007, "step": 3320 }, { "epoch": 1.6395864106351552, "grad_norm": 0.008069952018558979, "learning_rate": 1.344362383062531e-05, "loss": 0.0008, "step": 3330 }, { "epoch": 1.6445100935499752, "grad_norm": 0.005610567983239889, "learning_rate": 1.3423929098966028e-05, "loss": 0.0007, "step": 3340 }, { "epoch": 1.6494337764647957, "grad_norm": 0.005232799798250198, "learning_rate": 1.3404234367306745e-05, "loss": 0.0007, "step": 3350 }, { "epoch": 1.654357459379616, "grad_norm": 0.005385094787925482, "learning_rate": 1.3384539635647465e-05, "loss": 0.0006, "step": 3360 }, { "epoch": 1.6592811422944362, "grad_norm": 0.005121603608131409, "learning_rate": 1.3364844903988183e-05, "loss": 0.0006, "step": 3370 }, { "epoch": 1.6642048252092565, "grad_norm": 0.005196818150579929, "learning_rate": 1.3345150172328903e-05, "loss": 0.0007, "step": 3380 }, { "epoch": 1.6691285081240768, "grad_norm": 0.005104544572532177, "learning_rate": 1.3325455440669621e-05, "loss": 0.0006, "step": 3390 }, { "epoch": 1.6740521910388972, "grad_norm": 0.0062180510722100735, "learning_rate": 1.330576070901034e-05, "loss": 0.0007, "step": 3400 }, { "epoch": 1.6789758739537173, "grad_norm": 0.01003600750118494, "learning_rate": 1.3286065977351059e-05, "loss": 0.0007, "step": 3410 }, { "epoch": 1.6838995568685378, "grad_norm": 0.0061742691323161125, "learning_rate": 1.3266371245691779e-05, "loss": 0.0018, "step": 3420 }, { "epoch": 1.6888232397833578, "grad_norm": 0.00500025087967515, "learning_rate": 1.3246676514032497e-05, "loss": 0.0006, "step": 3430 }, { "epoch": 1.6937469226981783, "grad_norm": 0.009734513238072395, "learning_rate": 1.3226981782373215e-05, "loss": 0.0007, "step": 3440 }, { "epoch": 1.6986706056129985, "grad_norm": 0.00670117000117898, "learning_rate": 1.3207287050713935e-05, "loss": 0.0007, "step": 3450 }, { "epoch": 1.7035942885278188, "grad_norm": 0.004964028485119343, "learning_rate": 1.3187592319054653e-05, "loss": 0.0006, "step": 3460 }, { "epoch": 1.708517971442639, "grad_norm": 0.004891118034720421, "learning_rate": 1.3167897587395373e-05, "loss": 0.0007, "step": 3470 }, { "epoch": 1.7134416543574593, "grad_norm": 0.005974843632429838, "learning_rate": 1.3148202855736092e-05, "loss": 0.0007, "step": 3480 }, { "epoch": 1.7183653372722798, "grad_norm": 0.005670204292982817, "learning_rate": 1.3128508124076811e-05, "loss": 0.0007, "step": 3490 }, { "epoch": 1.7232890201870998, "grad_norm": 0.00542798126116395, "learning_rate": 1.310881339241753e-05, "loss": 0.0007, "step": 3500 }, { "epoch": 1.7282127031019203, "grad_norm": 0.005577849689871073, "learning_rate": 1.308911866075825e-05, "loss": 0.0006, "step": 3510 }, { "epoch": 1.7331363860167404, "grad_norm": 0.02375694364309311, "learning_rate": 1.3069423929098968e-05, "loss": 0.0007, "step": 3520 }, { "epoch": 1.7380600689315608, "grad_norm": 0.004778748843818903, "learning_rate": 1.3049729197439686e-05, "loss": 0.0006, "step": 3530 }, { "epoch": 1.742983751846381, "grad_norm": 0.005341399926692247, "learning_rate": 1.3030034465780406e-05, "loss": 0.0006, "step": 3540 }, { "epoch": 1.7479074347612014, "grad_norm": 0.0069299363531172276, "learning_rate": 1.3010339734121124e-05, "loss": 0.0006, "step": 3550 }, { "epoch": 1.7528311176760216, "grad_norm": 0.004599249456077814, "learning_rate": 1.2990645002461844e-05, "loss": 0.0007, "step": 3560 }, { "epoch": 1.7577548005908419, "grad_norm": 0.004595101345330477, "learning_rate": 1.297095027080256e-05, "loss": 0.0006, "step": 3570 }, { "epoch": 1.7626784835056624, "grad_norm": 0.005347404163330793, "learning_rate": 1.2951255539143279e-05, "loss": 0.0006, "step": 3580 }, { "epoch": 1.7676021664204824, "grad_norm": 0.004594715777784586, "learning_rate": 1.2931560807483999e-05, "loss": 0.0006, "step": 3590 }, { "epoch": 1.7725258493353029, "grad_norm": 0.06393859535455704, "learning_rate": 1.2911866075824717e-05, "loss": 0.0008, "step": 3600 }, { "epoch": 1.7774495322501231, "grad_norm": 0.005369629245251417, "learning_rate": 1.2892171344165437e-05, "loss": 0.0006, "step": 3610 }, { "epoch": 1.7823732151649434, "grad_norm": 0.009649335406720638, "learning_rate": 1.2872476612506155e-05, "loss": 0.0006, "step": 3620 }, { "epoch": 1.7872968980797637, "grad_norm": 0.005992168094962835, "learning_rate": 1.2852781880846875e-05, "loss": 0.0006, "step": 3630 }, { "epoch": 1.792220580994584, "grad_norm": 0.00444134371355176, "learning_rate": 1.2833087149187593e-05, "loss": 0.0034, "step": 3640 }, { "epoch": 1.7971442639094042, "grad_norm": 0.005889700725674629, "learning_rate": 1.2813392417528311e-05, "loss": 0.0006, "step": 3650 }, { "epoch": 1.8020679468242244, "grad_norm": 0.004602220840752125, "learning_rate": 1.2793697685869031e-05, "loss": 0.0005, "step": 3660 }, { "epoch": 1.806991629739045, "grad_norm": 0.016186626628041267, "learning_rate": 1.277400295420975e-05, "loss": 0.0612, "step": 3670 }, { "epoch": 1.811915312653865, "grad_norm": 0.011738053523004055, "learning_rate": 1.275430822255047e-05, "loss": 0.0006, "step": 3680 }, { "epoch": 1.8168389955686854, "grad_norm": 0.0061439890414476395, "learning_rate": 1.2734613490891188e-05, "loss": 0.0007, "step": 3690 }, { "epoch": 1.8217626784835057, "grad_norm": 0.00608317693695426, "learning_rate": 1.2714918759231907e-05, "loss": 0.001, "step": 3700 }, { "epoch": 1.826686361398326, "grad_norm": 0.0054056947119534016, "learning_rate": 1.2695224027572626e-05, "loss": 0.0006, "step": 3710 }, { "epoch": 1.8316100443131462, "grad_norm": 0.005898363888263702, "learning_rate": 1.2675529295913344e-05, "loss": 0.0572, "step": 3720 }, { "epoch": 1.8365337272279665, "grad_norm": 0.004474004730582237, "learning_rate": 1.2655834564254064e-05, "loss": 0.0085, "step": 3730 }, { "epoch": 1.841457410142787, "grad_norm": 0.0045126210898160934, "learning_rate": 1.2636139832594782e-05, "loss": 0.0015, "step": 3740 }, { "epoch": 1.846381093057607, "grad_norm": 0.00464917765930295, "learning_rate": 1.2616445100935502e-05, "loss": 0.0536, "step": 3750 }, { "epoch": 1.8513047759724275, "grad_norm": 0.00560635793954134, "learning_rate": 1.259675036927622e-05, "loss": 0.0007, "step": 3760 }, { "epoch": 1.8562284588872475, "grad_norm": 0.0122684920206666, "learning_rate": 1.257705563761694e-05, "loss": 0.0008, "step": 3770 }, { "epoch": 1.861152141802068, "grad_norm": 0.004684649407863617, "learning_rate": 1.2557360905957658e-05, "loss": 0.0007, "step": 3780 }, { "epoch": 1.8660758247168883, "grad_norm": 0.010543436743319035, "learning_rate": 1.2537666174298375e-05, "loss": 0.0109, "step": 3790 }, { "epoch": 1.8709995076317085, "grad_norm": 0.009408768266439438, "learning_rate": 1.2517971442639095e-05, "loss": 0.0007, "step": 3800 }, { "epoch": 1.8759231905465288, "grad_norm": 0.004331331700086594, "learning_rate": 1.2498276710979813e-05, "loss": 0.0006, "step": 3810 }, { "epoch": 1.880846873461349, "grad_norm": 0.004327212926000357, "learning_rate": 1.2478581979320533e-05, "loss": 0.0006, "step": 3820 }, { "epoch": 1.8857705563761695, "grad_norm": 0.0050591351464390755, "learning_rate": 1.2458887247661251e-05, "loss": 0.0007, "step": 3830 }, { "epoch": 1.8906942392909896, "grad_norm": 0.005602705758064985, "learning_rate": 1.2439192516001969e-05, "loss": 0.0006, "step": 3840 }, { "epoch": 1.89561792220581, "grad_norm": 0.009484893642365932, "learning_rate": 1.2419497784342689e-05, "loss": 0.0005, "step": 3850 }, { "epoch": 1.90054160512063, "grad_norm": 0.004367966204881668, "learning_rate": 1.2399803052683407e-05, "loss": 0.0005, "step": 3860 }, { "epoch": 1.9054652880354506, "grad_norm": 0.004156662616878748, "learning_rate": 1.2380108321024127e-05, "loss": 0.0006, "step": 3870 }, { "epoch": 1.9103889709502708, "grad_norm": 0.00411129929125309, "learning_rate": 1.2360413589364845e-05, "loss": 0.0006, "step": 3880 }, { "epoch": 1.915312653865091, "grad_norm": 1.3656526803970337, "learning_rate": 1.2340718857705565e-05, "loss": 0.0104, "step": 3890 }, { "epoch": 1.9202363367799113, "grad_norm": 0.005277382675558329, "learning_rate": 1.2321024126046283e-05, "loss": 0.0006, "step": 3900 }, { "epoch": 1.9251600196947316, "grad_norm": 0.00482240691781044, "learning_rate": 1.2301329394387003e-05, "loss": 0.0005, "step": 3910 }, { "epoch": 1.930083702609552, "grad_norm": 0.00404377281665802, "learning_rate": 1.2281634662727722e-05, "loss": 0.0006, "step": 3920 }, { "epoch": 1.9350073855243721, "grad_norm": 0.005294101778417826, "learning_rate": 1.226193993106844e-05, "loss": 0.0297, "step": 3930 }, { "epoch": 1.9399310684391926, "grad_norm": 0.0040844217874109745, "learning_rate": 1.224224519940916e-05, "loss": 0.0005, "step": 3940 }, { "epoch": 1.9448547513540126, "grad_norm": 0.012116051279008389, "learning_rate": 1.2222550467749878e-05, "loss": 0.0012, "step": 3950 }, { "epoch": 1.9497784342688331, "grad_norm": 0.0040712859481573105, "learning_rate": 1.2202855736090598e-05, "loss": 0.0005, "step": 3960 }, { "epoch": 1.9547021171836534, "grad_norm": 0.010710555128753185, "learning_rate": 1.2183161004431316e-05, "loss": 0.0008, "step": 3970 }, { "epoch": 1.9596258000984736, "grad_norm": 0.00512111559510231, "learning_rate": 1.2163466272772036e-05, "loss": 0.0005, "step": 3980 }, { "epoch": 1.964549483013294, "grad_norm": 0.004718789830803871, "learning_rate": 1.2143771541112754e-05, "loss": 0.0006, "step": 3990 }, { "epoch": 1.9694731659281142, "grad_norm": 0.040209267288446426, "learning_rate": 1.212407680945347e-05, "loss": 0.0006, "step": 4000 }, { "epoch": 1.9743968488429346, "grad_norm": 0.004102818667888641, "learning_rate": 1.210438207779419e-05, "loss": 0.0006, "step": 4010 }, { "epoch": 1.9793205317577547, "grad_norm": 0.003849179018288851, "learning_rate": 1.2084687346134909e-05, "loss": 0.0004, "step": 4020 }, { "epoch": 1.9842442146725752, "grad_norm": 0.0038420234341174364, "learning_rate": 1.2064992614475629e-05, "loss": 0.0005, "step": 4030 }, { "epoch": 1.9891678975873952, "grad_norm": 0.004476201254874468, "learning_rate": 1.2045297882816347e-05, "loss": 0.0008, "step": 4040 }, { "epoch": 1.9940915805022157, "grad_norm": 0.0061459592543542385, "learning_rate": 1.2025603151157065e-05, "loss": 0.0005, "step": 4050 }, { "epoch": 1.999015263417036, "grad_norm": 0.00451872032135725, "learning_rate": 1.2005908419497785e-05, "loss": 0.0005, "step": 4060 }, { "epoch": 2.0, "eval_accuracy": 0.9993024066968957, "eval_loss": 0.003324420191347599, "eval_runtime": 125.6557, "eval_samples_per_second": 22.816, "eval_steps_per_second": 2.857, "step": 4062 }, { "epoch": 2.003938946331856, "grad_norm": 0.003870410844683647, "learning_rate": 1.1986213687838503e-05, "loss": 0.0004, "step": 4070 }, { "epoch": 2.0088626292466767, "grad_norm": 0.0037198015488684177, "learning_rate": 1.1966518956179223e-05, "loss": 0.0005, "step": 4080 }, { "epoch": 2.0137863121614967, "grad_norm": 0.003692836966365576, "learning_rate": 1.1946824224519941e-05, "loss": 0.0005, "step": 4090 }, { "epoch": 2.018709995076317, "grad_norm": 0.005183310713618994, "learning_rate": 1.1927129492860661e-05, "loss": 0.0005, "step": 4100 }, { "epoch": 2.0236336779911372, "grad_norm": 0.005319288466125727, "learning_rate": 1.190743476120138e-05, "loss": 0.0007, "step": 4110 }, { "epoch": 2.0285573609059577, "grad_norm": 0.003845900995656848, "learning_rate": 1.18877400295421e-05, "loss": 0.0009, "step": 4120 }, { "epoch": 2.0334810438207778, "grad_norm": 0.0036504592280834913, "learning_rate": 1.1868045297882818e-05, "loss": 0.0005, "step": 4130 }, { "epoch": 2.0384047267355982, "grad_norm": 0.005353834945708513, "learning_rate": 1.1848350566223536e-05, "loss": 0.0005, "step": 4140 }, { "epoch": 2.0433284096504187, "grad_norm": 0.003965914249420166, "learning_rate": 1.1828655834564256e-05, "loss": 0.0005, "step": 4150 }, { "epoch": 2.0482520925652388, "grad_norm": 0.0035763198975473642, "learning_rate": 1.1808961102904974e-05, "loss": 0.0005, "step": 4160 }, { "epoch": 2.0531757754800593, "grad_norm": 0.0036004288122057915, "learning_rate": 1.1789266371245694e-05, "loss": 0.0004, "step": 4170 }, { "epoch": 2.0580994583948793, "grad_norm": 0.003568575019016862, "learning_rate": 1.1769571639586412e-05, "loss": 0.0004, "step": 4180 }, { "epoch": 2.0630231413096998, "grad_norm": 0.003498220583423972, "learning_rate": 1.1749876907927132e-05, "loss": 0.0004, "step": 4190 }, { "epoch": 2.06794682422452, "grad_norm": 0.004230022896081209, "learning_rate": 1.173018217626785e-05, "loss": 0.0005, "step": 4200 }, { "epoch": 2.0728705071393403, "grad_norm": 0.004168345592916012, "learning_rate": 1.171048744460857e-05, "loss": 0.0004, "step": 4210 }, { "epoch": 2.0777941900541603, "grad_norm": 0.004782133270055056, "learning_rate": 1.1690792712949287e-05, "loss": 0.0004, "step": 4220 }, { "epoch": 2.082717872968981, "grad_norm": 0.004512319806963205, "learning_rate": 1.1671097981290005e-05, "loss": 0.0005, "step": 4230 }, { "epoch": 2.0876415558838013, "grad_norm": 0.0035142311826348305, "learning_rate": 1.1651403249630725e-05, "loss": 0.0004, "step": 4240 }, { "epoch": 2.0925652387986213, "grad_norm": 0.0034635839983820915, "learning_rate": 1.1631708517971443e-05, "loss": 0.0005, "step": 4250 }, { "epoch": 2.097488921713442, "grad_norm": 0.0034351220820099115, "learning_rate": 1.1612013786312161e-05, "loss": 0.0004, "step": 4260 }, { "epoch": 2.102412604628262, "grad_norm": 0.003419801127165556, "learning_rate": 1.1592319054652881e-05, "loss": 0.0004, "step": 4270 }, { "epoch": 2.1073362875430823, "grad_norm": 0.004152268171310425, "learning_rate": 1.15726243229936e-05, "loss": 0.001, "step": 4280 }, { "epoch": 2.1122599704579024, "grad_norm": 0.003409736789762974, "learning_rate": 1.1552929591334319e-05, "loss": 0.0004, "step": 4290 }, { "epoch": 2.117183653372723, "grad_norm": 0.0034677856601774693, "learning_rate": 1.1533234859675037e-05, "loss": 0.0004, "step": 4300 }, { "epoch": 2.122107336287543, "grad_norm": 0.0034529881086200476, "learning_rate": 1.1513540128015757e-05, "loss": 0.0004, "step": 4310 }, { "epoch": 2.1270310192023634, "grad_norm": 0.0033237591851502657, "learning_rate": 1.1493845396356475e-05, "loss": 0.0005, "step": 4320 }, { "epoch": 2.131954702117184, "grad_norm": 0.0033787621650844812, "learning_rate": 1.1474150664697195e-05, "loss": 0.0004, "step": 4330 }, { "epoch": 2.136878385032004, "grad_norm": 0.004501492716372013, "learning_rate": 1.1454455933037914e-05, "loss": 0.0004, "step": 4340 }, { "epoch": 2.1418020679468244, "grad_norm": 0.0037883783224970102, "learning_rate": 1.1434761201378632e-05, "loss": 0.0005, "step": 4350 }, { "epoch": 2.1467257508616444, "grad_norm": 0.0033413332421332598, "learning_rate": 1.1415066469719352e-05, "loss": 0.0004, "step": 4360 }, { "epoch": 2.151649433776465, "grad_norm": 0.003297879360616207, "learning_rate": 1.139537173806007e-05, "loss": 0.0004, "step": 4370 }, { "epoch": 2.156573116691285, "grad_norm": 0.0032432284206151962, "learning_rate": 1.137567700640079e-05, "loss": 0.0004, "step": 4380 }, { "epoch": 2.1614967996061054, "grad_norm": 0.0034160311333835125, "learning_rate": 1.1355982274741508e-05, "loss": 0.0004, "step": 4390 }, { "epoch": 2.1664204825209255, "grad_norm": 0.003737648716196418, "learning_rate": 1.1336287543082228e-05, "loss": 0.0004, "step": 4400 }, { "epoch": 2.171344165435746, "grad_norm": 0.003206575522199273, "learning_rate": 1.1316592811422946e-05, "loss": 0.0004, "step": 4410 }, { "epoch": 2.1762678483505664, "grad_norm": 0.0032377000898122787, "learning_rate": 1.1296898079763666e-05, "loss": 0.0004, "step": 4420 }, { "epoch": 2.1811915312653865, "grad_norm": 0.0031994974706321955, "learning_rate": 1.1277203348104384e-05, "loss": 0.0004, "step": 4430 }, { "epoch": 2.186115214180207, "grad_norm": 0.003612807020545006, "learning_rate": 1.12575086164451e-05, "loss": 0.0004, "step": 4440 }, { "epoch": 2.191038897095027, "grad_norm": 0.0038285143673419952, "learning_rate": 1.123781388478582e-05, "loss": 0.0005, "step": 4450 }, { "epoch": 2.1959625800098475, "grad_norm": 0.05885794758796692, "learning_rate": 1.1218119153126539e-05, "loss": 0.0005, "step": 4460 }, { "epoch": 2.2008862629246675, "grad_norm": 0.004322012886404991, "learning_rate": 1.1198424421467257e-05, "loss": 0.0081, "step": 4470 }, { "epoch": 2.205809945839488, "grad_norm": 0.0031240857206285, "learning_rate": 1.1178729689807977e-05, "loss": 0.0004, "step": 4480 }, { "epoch": 2.2107336287543085, "grad_norm": 0.0030879732221364975, "learning_rate": 1.1159034958148695e-05, "loss": 0.0004, "step": 4490 }, { "epoch": 2.2156573116691285, "grad_norm": 0.0031029575038701296, "learning_rate": 1.1139340226489415e-05, "loss": 0.0004, "step": 4500 }, { "epoch": 2.220580994583949, "grad_norm": 0.003118765540421009, "learning_rate": 1.1119645494830133e-05, "loss": 0.0004, "step": 4510 }, { "epoch": 2.225504677498769, "grad_norm": 0.0030926407780498266, "learning_rate": 1.1099950763170853e-05, "loss": 0.0004, "step": 4520 }, { "epoch": 2.2304283604135895, "grad_norm": 0.0031910729594528675, "learning_rate": 1.1080256031511571e-05, "loss": 0.0004, "step": 4530 }, { "epoch": 2.2353520433284095, "grad_norm": 0.0032135951332747936, "learning_rate": 1.1060561299852291e-05, "loss": 0.0004, "step": 4540 }, { "epoch": 2.24027572624323, "grad_norm": 0.00302210939116776, "learning_rate": 1.104086656819301e-05, "loss": 0.0004, "step": 4550 }, { "epoch": 2.24519940915805, "grad_norm": 0.003041194984689355, "learning_rate": 1.1021171836533728e-05, "loss": 0.0004, "step": 4560 }, { "epoch": 2.2501230920728705, "grad_norm": 0.0029863493982702494, "learning_rate": 1.1001477104874448e-05, "loss": 0.0004, "step": 4570 }, { "epoch": 2.255046774987691, "grad_norm": 0.0035601642448455095, "learning_rate": 1.0981782373215166e-05, "loss": 0.0004, "step": 4580 }, { "epoch": 2.259970457902511, "grad_norm": 0.002995297545567155, "learning_rate": 1.0962087641555886e-05, "loss": 0.0258, "step": 4590 }, { "epoch": 2.2648941408173315, "grad_norm": 0.0030106825288385153, "learning_rate": 1.0942392909896604e-05, "loss": 0.0004, "step": 4600 }, { "epoch": 2.2698178237321516, "grad_norm": 0.0030412061605602503, "learning_rate": 1.0922698178237324e-05, "loss": 0.0003, "step": 4610 }, { "epoch": 2.274741506646972, "grad_norm": 0.0032338390592485666, "learning_rate": 1.0903003446578042e-05, "loss": 0.0007, "step": 4620 }, { "epoch": 2.279665189561792, "grad_norm": 0.0029980672989040613, "learning_rate": 1.088330871491876e-05, "loss": 0.0004, "step": 4630 }, { "epoch": 2.2845888724766126, "grad_norm": 0.0030107314232736826, "learning_rate": 1.086361398325948e-05, "loss": 0.0679, "step": 4640 }, { "epoch": 2.2895125553914326, "grad_norm": 0.003009843174368143, "learning_rate": 1.0843919251600197e-05, "loss": 0.0004, "step": 4650 }, { "epoch": 2.294436238306253, "grad_norm": 0.0035064418334513903, "learning_rate": 1.0824224519940917e-05, "loss": 0.0004, "step": 4660 }, { "epoch": 2.2993599212210736, "grad_norm": 0.0035762269981205463, "learning_rate": 1.0804529788281635e-05, "loss": 0.0004, "step": 4670 }, { "epoch": 2.3042836041358936, "grad_norm": 0.003944043070077896, "learning_rate": 1.0784835056622353e-05, "loss": 0.0004, "step": 4680 }, { "epoch": 2.309207287050714, "grad_norm": 0.008321553468704224, "learning_rate": 1.0765140324963073e-05, "loss": 0.0004, "step": 4690 }, { "epoch": 2.314130969965534, "grad_norm": 0.0029584006406366825, "learning_rate": 1.0745445593303791e-05, "loss": 0.0004, "step": 4700 }, { "epoch": 2.3190546528803546, "grad_norm": 0.0038970340974628925, "learning_rate": 1.0725750861644511e-05, "loss": 0.0004, "step": 4710 }, { "epoch": 2.3239783357951747, "grad_norm": 0.0028425133787095547, "learning_rate": 1.070605612998523e-05, "loss": 0.0004, "step": 4720 }, { "epoch": 2.328902018709995, "grad_norm": 0.0031168104615062475, "learning_rate": 1.0686361398325949e-05, "loss": 0.0004, "step": 4730 }, { "epoch": 2.333825701624815, "grad_norm": 0.00459252716973424, "learning_rate": 1.0666666666666667e-05, "loss": 0.0004, "step": 4740 }, { "epoch": 2.3387493845396357, "grad_norm": 0.0028256394434720278, "learning_rate": 1.0646971935007386e-05, "loss": 0.0004, "step": 4750 }, { "epoch": 2.343673067454456, "grad_norm": 0.005775698460638523, "learning_rate": 1.0627277203348105e-05, "loss": 0.0505, "step": 4760 }, { "epoch": 2.348596750369276, "grad_norm": 0.0028328783810138702, "learning_rate": 1.0607582471688824e-05, "loss": 0.0004, "step": 4770 }, { "epoch": 2.3535204332840967, "grad_norm": 0.0029578169342130423, "learning_rate": 1.0587887740029544e-05, "loss": 0.0004, "step": 4780 }, { "epoch": 2.3584441161989167, "grad_norm": 0.0028520470950752497, "learning_rate": 1.0568193008370262e-05, "loss": 0.0746, "step": 4790 }, { "epoch": 2.363367799113737, "grad_norm": 0.0032039277721196413, "learning_rate": 1.0548498276710982e-05, "loss": 0.0004, "step": 4800 }, { "epoch": 2.368291482028557, "grad_norm": 0.004140175879001617, "learning_rate": 1.05288035450517e-05, "loss": 0.0004, "step": 4810 }, { "epoch": 2.3732151649433777, "grad_norm": 0.0033235037699341774, "learning_rate": 1.050910881339242e-05, "loss": 0.0004, "step": 4820 }, { "epoch": 2.3781388478581977, "grad_norm": 0.0031042725313454866, "learning_rate": 1.0489414081733138e-05, "loss": 0.0063, "step": 4830 }, { "epoch": 2.383062530773018, "grad_norm": 0.0032992505002766848, "learning_rate": 1.0469719350073856e-05, "loss": 0.0004, "step": 4840 }, { "epoch": 2.3879862136878387, "grad_norm": 0.0029236788395792246, "learning_rate": 1.0450024618414576e-05, "loss": 0.0376, "step": 4850 }, { "epoch": 2.3929098966026587, "grad_norm": 0.0035732307005673647, "learning_rate": 1.0430329886755294e-05, "loss": 0.0004, "step": 4860 }, { "epoch": 2.397833579517479, "grad_norm": 0.002821253379806876, "learning_rate": 1.041063515509601e-05, "loss": 0.0003, "step": 4870 }, { "epoch": 2.4027572624322993, "grad_norm": 0.002872566459700465, "learning_rate": 1.039094042343673e-05, "loss": 0.0003, "step": 4880 }, { "epoch": 2.4076809453471197, "grad_norm": 0.00280165602453053, "learning_rate": 1.0371245691777449e-05, "loss": 0.0004, "step": 4890 }, { "epoch": 2.4126046282619398, "grad_norm": 0.003701514797285199, "learning_rate": 1.0351550960118169e-05, "loss": 0.0004, "step": 4900 }, { "epoch": 2.4175283111767603, "grad_norm": 0.0033293033484369516, "learning_rate": 1.0331856228458887e-05, "loss": 0.0004, "step": 4910 }, { "epoch": 2.4224519940915803, "grad_norm": 0.0027583721093833447, "learning_rate": 1.0312161496799607e-05, "loss": 0.0004, "step": 4920 }, { "epoch": 2.427375677006401, "grad_norm": 0.0035485969856381416, "learning_rate": 1.0292466765140325e-05, "loss": 0.0004, "step": 4930 }, { "epoch": 2.4322993599212213, "grad_norm": 0.0032952686306089163, "learning_rate": 1.0272772033481045e-05, "loss": 0.0004, "step": 4940 }, { "epoch": 2.4372230428360413, "grad_norm": 0.0037070815451443195, "learning_rate": 1.0253077301821763e-05, "loss": 0.0004, "step": 4950 }, { "epoch": 2.442146725750862, "grad_norm": 0.002730543026700616, "learning_rate": 1.0233382570162481e-05, "loss": 0.0003, "step": 4960 }, { "epoch": 2.447070408665682, "grad_norm": 0.004558619111776352, "learning_rate": 1.0213687838503201e-05, "loss": 0.0004, "step": 4970 }, { "epoch": 2.4519940915805023, "grad_norm": 0.003558282507583499, "learning_rate": 1.019399310684392e-05, "loss": 0.0005, "step": 4980 }, { "epoch": 2.4569177744953223, "grad_norm": 0.012926378287374973, "learning_rate": 1.017429837518464e-05, "loss": 0.0004, "step": 4990 }, { "epoch": 2.461841457410143, "grad_norm": 0.00339103932492435, "learning_rate": 1.0154603643525358e-05, "loss": 0.0004, "step": 5000 }, { "epoch": 2.466765140324963, "grad_norm": 0.0026393327862024307, "learning_rate": 1.0134908911866078e-05, "loss": 0.0003, "step": 5010 }, { "epoch": 2.4716888232397833, "grad_norm": 0.006687171291559935, "learning_rate": 1.0115214180206796e-05, "loss": 0.0004, "step": 5020 }, { "epoch": 2.476612506154604, "grad_norm": 0.003788519883528352, "learning_rate": 1.0095519448547516e-05, "loss": 0.0003, "step": 5030 }, { "epoch": 2.481536189069424, "grad_norm": 0.0027769142761826515, "learning_rate": 1.0075824716888234e-05, "loss": 0.0004, "step": 5040 }, { "epoch": 2.4864598719842443, "grad_norm": 0.0029424901586025953, "learning_rate": 1.0056129985228952e-05, "loss": 0.0003, "step": 5050 }, { "epoch": 2.4913835548990644, "grad_norm": 0.002582251327112317, "learning_rate": 1.0036435253569672e-05, "loss": 0.0003, "step": 5060 }, { "epoch": 2.496307237813885, "grad_norm": 0.0027259523048996925, "learning_rate": 1.001674052191039e-05, "loss": 0.0003, "step": 5070 }, { "epoch": 2.5012309207287053, "grad_norm": 0.026945946738123894, "learning_rate": 9.997045790251108e-06, "loss": 0.0004, "step": 5080 }, { "epoch": 2.5061546036435254, "grad_norm": 0.002602215390652418, "learning_rate": 9.977351058591828e-06, "loss": 0.0004, "step": 5090 }, { "epoch": 2.5110782865583454, "grad_norm": 0.002513736952096224, "learning_rate": 9.957656326932547e-06, "loss": 0.0003, "step": 5100 }, { "epoch": 2.516001969473166, "grad_norm": 0.0032119974493980408, "learning_rate": 9.937961595273265e-06, "loss": 0.0003, "step": 5110 }, { "epoch": 2.5209256523879864, "grad_norm": 0.006075989454984665, "learning_rate": 9.918266863613985e-06, "loss": 0.0007, "step": 5120 }, { "epoch": 2.5258493353028064, "grad_norm": 0.02874099276959896, "learning_rate": 9.898572131954703e-06, "loss": 0.0005, "step": 5130 }, { "epoch": 2.530773018217627, "grad_norm": 0.0025109825655817986, "learning_rate": 9.878877400295421e-06, "loss": 0.0355, "step": 5140 }, { "epoch": 2.535696701132447, "grad_norm": 0.029801325872540474, "learning_rate": 9.859182668636141e-06, "loss": 0.0004, "step": 5150 }, { "epoch": 2.5406203840472674, "grad_norm": 0.00397358788177371, "learning_rate": 9.83948793697686e-06, "loss": 0.0004, "step": 5160 }, { "epoch": 2.545544066962088, "grad_norm": 0.002678680932149291, "learning_rate": 9.819793205317577e-06, "loss": 0.0003, "step": 5170 }, { "epoch": 2.550467749876908, "grad_norm": 0.0029178003314882517, "learning_rate": 9.800098473658297e-06, "loss": 0.0003, "step": 5180 }, { "epoch": 2.555391432791728, "grad_norm": 0.0024977768771350384, "learning_rate": 9.780403741999016e-06, "loss": 0.0004, "step": 5190 }, { "epoch": 2.5603151157065485, "grad_norm": 0.0030609865207225084, "learning_rate": 9.760709010339735e-06, "loss": 0.0003, "step": 5200 }, { "epoch": 2.565238798621369, "grad_norm": 0.003063684096559882, "learning_rate": 9.741014278680454e-06, "loss": 0.0303, "step": 5210 }, { "epoch": 2.570162481536189, "grad_norm": 0.003366800956428051, "learning_rate": 9.721319547021174e-06, "loss": 0.0003, "step": 5220 }, { "epoch": 2.5750861644510095, "grad_norm": 0.0025488571263849735, "learning_rate": 9.701624815361892e-06, "loss": 0.0004, "step": 5230 }, { "epoch": 2.5800098473658295, "grad_norm": 0.0029968577437102795, "learning_rate": 9.681930083702612e-06, "loss": 0.0003, "step": 5240 }, { "epoch": 2.58493353028065, "grad_norm": 0.004265849944204092, "learning_rate": 9.662235352043328e-06, "loss": 0.0004, "step": 5250 }, { "epoch": 2.5898572131954705, "grad_norm": 0.0030266179237514734, "learning_rate": 9.642540620384048e-06, "loss": 0.0004, "step": 5260 }, { "epoch": 2.5947808961102905, "grad_norm": 0.00575278652831912, "learning_rate": 9.622845888724766e-06, "loss": 0.0045, "step": 5270 }, { "epoch": 2.5997045790251105, "grad_norm": 0.004713424481451511, "learning_rate": 9.603151157065486e-06, "loss": 0.001, "step": 5280 }, { "epoch": 2.604628261939931, "grad_norm": 0.0028863553889095783, "learning_rate": 9.583456425406204e-06, "loss": 0.0003, "step": 5290 }, { "epoch": 2.6095519448547515, "grad_norm": 0.0027613062411546707, "learning_rate": 9.563761693746924e-06, "loss": 0.0003, "step": 5300 }, { "epoch": 2.6144756277695715, "grad_norm": 0.00244720047339797, "learning_rate": 9.544066962087643e-06, "loss": 0.0003, "step": 5310 }, { "epoch": 2.619399310684392, "grad_norm": 0.0027843692805618048, "learning_rate": 9.52437223042836e-06, "loss": 0.0003, "step": 5320 }, { "epoch": 2.624322993599212, "grad_norm": 0.0024719424545764923, "learning_rate": 9.50467749876908e-06, "loss": 0.0003, "step": 5330 }, { "epoch": 2.6292466765140325, "grad_norm": 0.0023956247605383396, "learning_rate": 9.484982767109799e-06, "loss": 0.0003, "step": 5340 }, { "epoch": 2.634170359428853, "grad_norm": 0.0024870047345757484, "learning_rate": 9.465288035450517e-06, "loss": 0.0003, "step": 5350 }, { "epoch": 2.639094042343673, "grad_norm": 0.006259846035391092, "learning_rate": 9.445593303791237e-06, "loss": 0.0004, "step": 5360 }, { "epoch": 2.644017725258493, "grad_norm": 0.0027619535103440285, "learning_rate": 9.425898572131955e-06, "loss": 0.0003, "step": 5370 }, { "epoch": 2.6489414081733136, "grad_norm": 0.002708840649574995, "learning_rate": 9.406203840472673e-06, "loss": 0.0003, "step": 5380 }, { "epoch": 2.653865091088134, "grad_norm": 0.0023432679008692503, "learning_rate": 9.386509108813393e-06, "loss": 0.002, "step": 5390 }, { "epoch": 2.658788774002954, "grad_norm": 0.0024096230044960976, "learning_rate": 9.366814377154112e-06, "loss": 0.0004, "step": 5400 }, { "epoch": 2.6637124569177746, "grad_norm": 0.002802999457344413, "learning_rate": 9.347119645494831e-06, "loss": 0.0003, "step": 5410 }, { "epoch": 2.6686361398325946, "grad_norm": 0.002744765719398856, "learning_rate": 9.32742491383555e-06, "loss": 0.0003, "step": 5420 }, { "epoch": 2.673559822747415, "grad_norm": 0.0027366564609110355, "learning_rate": 9.30773018217627e-06, "loss": 0.0003, "step": 5430 }, { "epoch": 2.6784835056622356, "grad_norm": 0.0023025909904390574, "learning_rate": 9.288035450516988e-06, "loss": 0.0008, "step": 5440 }, { "epoch": 2.6834071885770556, "grad_norm": 0.003953023348003626, "learning_rate": 9.268340718857708e-06, "loss": 0.0004, "step": 5450 }, { "epoch": 2.6883308714918757, "grad_norm": 0.002293068915605545, "learning_rate": 9.248645987198424e-06, "loss": 0.0003, "step": 5460 }, { "epoch": 2.693254554406696, "grad_norm": 0.0025221628602594137, "learning_rate": 9.228951255539144e-06, "loss": 0.0003, "step": 5470 }, { "epoch": 2.6981782373215166, "grad_norm": 0.0023449882864952087, "learning_rate": 9.209256523879862e-06, "loss": 0.0003, "step": 5480 }, { "epoch": 2.7031019202363367, "grad_norm": 0.002262598369270563, "learning_rate": 9.189561792220582e-06, "loss": 0.0003, "step": 5490 }, { "epoch": 2.708025603151157, "grad_norm": 0.0022516308818012476, "learning_rate": 9.1698670605613e-06, "loss": 0.0011, "step": 5500 }, { "epoch": 2.712949286065977, "grad_norm": 0.0027228447142988443, "learning_rate": 9.15017232890202e-06, "loss": 0.0003, "step": 5510 }, { "epoch": 2.7178729689807977, "grad_norm": 0.0023118378594517708, "learning_rate": 9.130477597242738e-06, "loss": 0.0003, "step": 5520 }, { "epoch": 2.722796651895618, "grad_norm": 0.002595614641904831, "learning_rate": 9.110782865583457e-06, "loss": 0.0003, "step": 5530 }, { "epoch": 2.727720334810438, "grad_norm": 0.0023607241455465555, "learning_rate": 9.091088133924177e-06, "loss": 0.0003, "step": 5540 }, { "epoch": 2.7326440177252582, "grad_norm": 0.0023618319537490606, "learning_rate": 9.071393402264895e-06, "loss": 0.0003, "step": 5550 }, { "epoch": 2.7375677006400787, "grad_norm": 0.002674200339242816, "learning_rate": 9.051698670605615e-06, "loss": 0.0003, "step": 5560 }, { "epoch": 2.742491383554899, "grad_norm": 0.002609808696433902, "learning_rate": 9.032003938946333e-06, "loss": 0.0005, "step": 5570 }, { "epoch": 2.7474150664697192, "grad_norm": 0.0023445002734661102, "learning_rate": 9.012309207287051e-06, "loss": 0.0003, "step": 5580 }, { "epoch": 2.7523387493845397, "grad_norm": 0.0025870969984680414, "learning_rate": 8.99261447562777e-06, "loss": 0.0003, "step": 5590 }, { "epoch": 2.7572624322993597, "grad_norm": 0.0021777572110295296, "learning_rate": 8.97291974396849e-06, "loss": 0.0002, "step": 5600 }, { "epoch": 2.7621861152141802, "grad_norm": 0.002778227673843503, "learning_rate": 8.953225012309207e-06, "loss": 0.0003, "step": 5610 }, { "epoch": 2.7671097981290007, "grad_norm": 0.002241934882476926, "learning_rate": 8.933530280649927e-06, "loss": 0.0005, "step": 5620 }, { "epoch": 2.7720334810438207, "grad_norm": 0.002897050231695175, "learning_rate": 8.913835548990646e-06, "loss": 0.0003, "step": 5630 }, { "epoch": 2.7769571639586412, "grad_norm": 0.0022650938481092453, "learning_rate": 8.894140817331365e-06, "loss": 0.0003, "step": 5640 }, { "epoch": 2.7818808468734613, "grad_norm": 0.002553451107814908, "learning_rate": 8.874446085672084e-06, "loss": 0.0003, "step": 5650 }, { "epoch": 2.7868045297882817, "grad_norm": 4.319761276245117, "learning_rate": 8.854751354012802e-06, "loss": 0.0084, "step": 5660 }, { "epoch": 2.791728212703102, "grad_norm": 0.002494723303243518, "learning_rate": 8.835056622353522e-06, "loss": 0.0003, "step": 5670 }, { "epoch": 2.7966518956179223, "grad_norm": 0.0021848022006452084, "learning_rate": 8.81536189069424e-06, "loss": 0.0003, "step": 5680 }, { "epoch": 2.8015755785327423, "grad_norm": 0.002222836948931217, "learning_rate": 8.795667159034958e-06, "loss": 0.0003, "step": 5690 }, { "epoch": 2.806499261447563, "grad_norm": 0.002521749120205641, "learning_rate": 8.775972427375678e-06, "loss": 0.0003, "step": 5700 }, { "epoch": 2.8114229443623833, "grad_norm": 0.0022716443054378033, "learning_rate": 8.756277695716396e-06, "loss": 0.0003, "step": 5710 }, { "epoch": 2.8163466272772033, "grad_norm": 0.0027861695270985365, "learning_rate": 8.736582964057115e-06, "loss": 0.0442, "step": 5720 }, { "epoch": 2.821270310192024, "grad_norm": 0.0026454541366547346, "learning_rate": 8.716888232397834e-06, "loss": 0.0067, "step": 5730 }, { "epoch": 2.826193993106844, "grad_norm": 0.0024635563604533672, "learning_rate": 8.697193500738553e-06, "loss": 0.0003, "step": 5740 }, { "epoch": 2.8311176760216643, "grad_norm": 0.003969325218349695, "learning_rate": 8.677498769079273e-06, "loss": 0.0003, "step": 5750 }, { "epoch": 2.8360413589364843, "grad_norm": 0.003894041758030653, "learning_rate": 8.65780403741999e-06, "loss": 0.0003, "step": 5760 }, { "epoch": 2.840965041851305, "grad_norm": 0.0021586958318948746, "learning_rate": 8.63810930576071e-06, "loss": 0.0003, "step": 5770 }, { "epoch": 2.845888724766125, "grad_norm": 0.0021715862676501274, "learning_rate": 8.618414574101429e-06, "loss": 0.0003, "step": 5780 }, { "epoch": 2.8508124076809453, "grad_norm": 0.002224855124950409, "learning_rate": 8.598719842442147e-06, "loss": 0.0713, "step": 5790 }, { "epoch": 2.855736090595766, "grad_norm": 0.01677889935672283, "learning_rate": 8.579025110782865e-06, "loss": 0.0003, "step": 5800 }, { "epoch": 2.860659773510586, "grad_norm": 0.0036739737261086702, "learning_rate": 8.559330379123585e-06, "loss": 0.0002, "step": 5810 }, { "epoch": 2.8655834564254064, "grad_norm": 0.002256969688460231, "learning_rate": 8.539635647464303e-06, "loss": 0.0003, "step": 5820 }, { "epoch": 2.8705071393402264, "grad_norm": 0.0026434718165546656, "learning_rate": 8.519940915805023e-06, "loss": 0.0003, "step": 5830 }, { "epoch": 2.875430822255047, "grad_norm": 0.002218646463006735, "learning_rate": 8.500246184145742e-06, "loss": 0.0435, "step": 5840 }, { "epoch": 2.880354505169867, "grad_norm": 0.0029450245201587677, "learning_rate": 8.480551452486461e-06, "loss": 0.0003, "step": 5850 }, { "epoch": 2.8852781880846874, "grad_norm": 0.00922380294650793, "learning_rate": 8.46085672082718e-06, "loss": 0.0004, "step": 5860 }, { "epoch": 2.8902018709995074, "grad_norm": 0.004534618929028511, "learning_rate": 8.441161989167898e-06, "loss": 0.0003, "step": 5870 }, { "epoch": 2.895125553914328, "grad_norm": 0.002629433758556843, "learning_rate": 8.421467257508618e-06, "loss": 0.0003, "step": 5880 }, { "epoch": 2.9000492368291484, "grad_norm": 0.0023545767180621624, "learning_rate": 8.401772525849336e-06, "loss": 0.0003, "step": 5890 }, { "epoch": 2.9049729197439684, "grad_norm": 0.004185693338513374, "learning_rate": 8.382077794190054e-06, "loss": 0.0003, "step": 5900 }, { "epoch": 2.909896602658789, "grad_norm": 0.004315607715398073, "learning_rate": 8.362383062530774e-06, "loss": 0.0003, "step": 5910 }, { "epoch": 2.914820285573609, "grad_norm": 0.027618886902928352, "learning_rate": 8.342688330871492e-06, "loss": 0.0003, "step": 5920 }, { "epoch": 2.9197439684884294, "grad_norm": 0.003252118593081832, "learning_rate": 8.32299359921221e-06, "loss": 0.0003, "step": 5930 }, { "epoch": 2.9246676514032495, "grad_norm": 0.002648021560162306, "learning_rate": 8.30329886755293e-06, "loss": 0.0004, "step": 5940 }, { "epoch": 2.92959133431807, "grad_norm": 0.002187141450121999, "learning_rate": 8.283604135893649e-06, "loss": 0.0003, "step": 5950 }, { "epoch": 2.93451501723289, "grad_norm": 0.0021846459712833166, "learning_rate": 8.263909404234369e-06, "loss": 0.0003, "step": 5960 }, { "epoch": 2.9394387001477105, "grad_norm": 0.0021735290065407753, "learning_rate": 8.244214672575087e-06, "loss": 0.0003, "step": 5970 }, { "epoch": 2.944362383062531, "grad_norm": 0.002656135242432356, "learning_rate": 8.224519940915807e-06, "loss": 0.0003, "step": 5980 }, { "epoch": 2.949286065977351, "grad_norm": 0.002111564390361309, "learning_rate": 8.204825209256525e-06, "loss": 0.0003, "step": 5990 }, { "epoch": 2.9542097488921715, "grad_norm": 0.002079706871882081, "learning_rate": 8.185130477597243e-06, "loss": 0.0002, "step": 6000 }, { "epoch": 2.9591334318069915, "grad_norm": 0.0020710995886474848, "learning_rate": 8.165435745937961e-06, "loss": 0.0003, "step": 6010 }, { "epoch": 2.964057114721812, "grad_norm": 0.009048682637512684, "learning_rate": 8.145741014278681e-06, "loss": 0.0003, "step": 6020 }, { "epoch": 2.9689807976366325, "grad_norm": 0.002403336577117443, "learning_rate": 8.1260462826194e-06, "loss": 0.0062, "step": 6030 }, { "epoch": 2.9739044805514525, "grad_norm": 0.026963578537106514, "learning_rate": 8.10635155096012e-06, "loss": 0.0003, "step": 6040 }, { "epoch": 2.9788281634662725, "grad_norm": 0.0021355238277465105, "learning_rate": 8.086656819300837e-06, "loss": 0.0002, "step": 6050 }, { "epoch": 2.983751846381093, "grad_norm": 0.0022524266969412565, "learning_rate": 8.066962087641557e-06, "loss": 0.0003, "step": 6060 }, { "epoch": 2.9886755292959135, "grad_norm": 0.002698551630601287, "learning_rate": 8.047267355982276e-06, "loss": 0.0007, "step": 6070 }, { "epoch": 2.9935992122107336, "grad_norm": 0.0021112014073878527, "learning_rate": 8.027572624322994e-06, "loss": 0.0002, "step": 6080 }, { "epoch": 2.998522895125554, "grad_norm": 0.0032392677385360003, "learning_rate": 8.007877892663714e-06, "loss": 0.0003, "step": 6090 }, { "epoch": 3.0, "eval_accuracy": 0.9989536100453436, "eval_loss": 0.005800504703074694, "eval_runtime": 124.8736, "eval_samples_per_second": 22.959, "eval_steps_per_second": 2.875, "step": 6093 }, { "epoch": 3.003446578040374, "grad_norm": 0.002395425923168659, "learning_rate": 7.988183161004432e-06, "loss": 0.0003, "step": 6100 }, { "epoch": 3.0083702609551946, "grad_norm": 0.011811596341431141, "learning_rate": 7.96848842934515e-06, "loss": 0.0007, "step": 6110 }, { "epoch": 3.0132939438700146, "grad_norm": 0.0019815864507108927, "learning_rate": 7.94879369768587e-06, "loss": 0.0003, "step": 6120 }, { "epoch": 3.018217626784835, "grad_norm": 0.0020032506436109543, "learning_rate": 7.929098966026588e-06, "loss": 0.0002, "step": 6130 }, { "epoch": 3.0231413096996556, "grad_norm": 0.004863973241299391, "learning_rate": 7.909404234367306e-06, "loss": 0.0464, "step": 6140 }, { "epoch": 3.0280649926144756, "grad_norm": 35.28690719604492, "learning_rate": 7.889709502708026e-06, "loss": 0.045, "step": 6150 }, { "epoch": 3.032988675529296, "grad_norm": 0.0026731633115559816, "learning_rate": 7.870014771048745e-06, "loss": 0.0002, "step": 6160 }, { "epoch": 3.037912358444116, "grad_norm": 0.00375444907695055, "learning_rate": 7.850320039389464e-06, "loss": 0.0003, "step": 6170 }, { "epoch": 3.0428360413589366, "grad_norm": 0.0023011912126094103, "learning_rate": 7.830625307730183e-06, "loss": 0.0002, "step": 6180 }, { "epoch": 3.0477597242737566, "grad_norm": 0.0019286174792796373, "learning_rate": 7.810930576070903e-06, "loss": 0.0004, "step": 6190 }, { "epoch": 3.052683407188577, "grad_norm": 0.003087955992668867, "learning_rate": 7.79123584441162e-06, "loss": 0.0005, "step": 6200 }, { "epoch": 3.057607090103397, "grad_norm": 0.0019114416791126132, "learning_rate": 7.77154111275234e-06, "loss": 0.0003, "step": 6210 }, { "epoch": 3.0625307730182176, "grad_norm": 0.0019345678156241775, "learning_rate": 7.751846381093057e-06, "loss": 0.0002, "step": 6220 }, { "epoch": 3.067454455933038, "grad_norm": 0.0028331466019153595, "learning_rate": 7.732151649433777e-06, "loss": 0.0003, "step": 6230 }, { "epoch": 3.072378138847858, "grad_norm": 0.0019752325024455786, "learning_rate": 7.712456917774495e-06, "loss": 0.0002, "step": 6240 }, { "epoch": 3.0773018217626786, "grad_norm": 0.0023958859965205193, "learning_rate": 7.692762186115215e-06, "loss": 0.0004, "step": 6250 }, { "epoch": 3.0822255046774987, "grad_norm": 0.002664359984919429, "learning_rate": 7.673067454455933e-06, "loss": 0.0002, "step": 6260 }, { "epoch": 3.087149187592319, "grad_norm": 0.0026232562959194183, "learning_rate": 7.653372722796653e-06, "loss": 0.0002, "step": 6270 }, { "epoch": 3.092072870507139, "grad_norm": 0.002290521515533328, "learning_rate": 7.633677991137372e-06, "loss": 0.0003, "step": 6280 }, { "epoch": 3.0969965534219597, "grad_norm": 0.0018943879986181855, "learning_rate": 7.613983259478091e-06, "loss": 0.0003, "step": 6290 }, { "epoch": 3.1019202363367797, "grad_norm": 0.0019178404472768307, "learning_rate": 7.59428852781881e-06, "loss": 0.0002, "step": 6300 }, { "epoch": 3.1068439192516, "grad_norm": 0.0021891624201089144, "learning_rate": 7.574593796159529e-06, "loss": 0.0003, "step": 6310 }, { "epoch": 3.1117676021664207, "grad_norm": 0.001878801267594099, "learning_rate": 7.554899064500247e-06, "loss": 0.0002, "step": 6320 }, { "epoch": 3.1166912850812407, "grad_norm": 0.006385423243045807, "learning_rate": 7.535204332840965e-06, "loss": 0.0003, "step": 6330 }, { "epoch": 3.121614967996061, "grad_norm": 0.0018304381519556046, "learning_rate": 7.515509601181684e-06, "loss": 0.0002, "step": 6340 }, { "epoch": 3.1265386509108812, "grad_norm": 0.0026562483981251717, "learning_rate": 7.495814869522403e-06, "loss": 0.0002, "step": 6350 }, { "epoch": 3.1314623338257017, "grad_norm": 0.0018779343226924539, "learning_rate": 7.476120137863122e-06, "loss": 0.0002, "step": 6360 }, { "epoch": 3.1363860167405218, "grad_norm": 0.002124248770996928, "learning_rate": 7.456425406203841e-06, "loss": 0.0002, "step": 6370 }, { "epoch": 3.1413096996553422, "grad_norm": 0.0029339792672544718, "learning_rate": 7.43673067454456e-06, "loss": 0.0003, "step": 6380 }, { "epoch": 3.1462333825701623, "grad_norm": 0.0021487076301127672, "learning_rate": 7.417035942885279e-06, "loss": 0.0002, "step": 6390 }, { "epoch": 3.1511570654849828, "grad_norm": 0.001838160096667707, "learning_rate": 7.397341211225998e-06, "loss": 0.0002, "step": 6400 }, { "epoch": 3.1560807483998032, "grad_norm": 0.004113923758268356, "learning_rate": 7.377646479566717e-06, "loss": 0.0002, "step": 6410 }, { "epoch": 3.1610044313146233, "grad_norm": 0.0024032278452068567, "learning_rate": 7.357951747907436e-06, "loss": 0.0002, "step": 6420 }, { "epoch": 3.1659281142294438, "grad_norm": 0.0024140363093465567, "learning_rate": 7.338257016248154e-06, "loss": 0.0002, "step": 6430 }, { "epoch": 3.170851797144264, "grad_norm": 0.0021326704882085323, "learning_rate": 7.318562284588872e-06, "loss": 0.0002, "step": 6440 }, { "epoch": 3.1757754800590843, "grad_norm": 0.0017906288849189878, "learning_rate": 7.298867552929591e-06, "loss": 0.0003, "step": 6450 }, { "epoch": 3.1806991629739043, "grad_norm": 0.0026058172807097435, "learning_rate": 7.27917282127031e-06, "loss": 0.0002, "step": 6460 }, { "epoch": 3.185622845888725, "grad_norm": 0.0017882023239508271, "learning_rate": 7.259478089611029e-06, "loss": 0.0002, "step": 6470 }, { "epoch": 3.1905465288035453, "grad_norm": 0.0017937802476808429, "learning_rate": 7.2397833579517485e-06, "loss": 0.0002, "step": 6480 }, { "epoch": 3.1954702117183653, "grad_norm": 0.0017613591626286507, "learning_rate": 7.2200886262924675e-06, "loss": 0.0002, "step": 6490 }, { "epoch": 3.200393894633186, "grad_norm": 0.001861646305769682, "learning_rate": 7.2003938946331866e-06, "loss": 0.0018, "step": 6500 }, { "epoch": 3.205317577548006, "grad_norm": 0.02838357537984848, "learning_rate": 7.180699162973906e-06, "loss": 0.0003, "step": 6510 }, { "epoch": 3.2102412604628263, "grad_norm": 0.00175224500708282, "learning_rate": 7.161004431314625e-06, "loss": 0.0002, "step": 6520 }, { "epoch": 3.2151649433776464, "grad_norm": 0.0017621772130951285, "learning_rate": 7.141309699655343e-06, "loss": 0.0002, "step": 6530 }, { "epoch": 3.220088626292467, "grad_norm": 0.0019370117224752903, "learning_rate": 7.121614967996061e-06, "loss": 0.0002, "step": 6540 }, { "epoch": 3.225012309207287, "grad_norm": 0.0031523550860583782, "learning_rate": 7.10192023633678e-06, "loss": 0.0002, "step": 6550 }, { "epoch": 3.2299359921221074, "grad_norm": 0.0028889975510537624, "learning_rate": 7.082225504677499e-06, "loss": 0.0002, "step": 6560 }, { "epoch": 3.234859675036928, "grad_norm": 0.0017210356891155243, "learning_rate": 7.062530773018218e-06, "loss": 0.0002, "step": 6570 }, { "epoch": 3.239783357951748, "grad_norm": 0.005474100820720196, "learning_rate": 7.042836041358937e-06, "loss": 0.0002, "step": 6580 }, { "epoch": 3.2447070408665684, "grad_norm": 0.002096637850627303, "learning_rate": 7.0231413096996555e-06, "loss": 0.0002, "step": 6590 }, { "epoch": 3.2496307237813884, "grad_norm": 0.0020583397708833218, "learning_rate": 7.003446578040375e-06, "loss": 0.0002, "step": 6600 }, { "epoch": 3.254554406696209, "grad_norm": 0.001692062127403915, "learning_rate": 6.983751846381094e-06, "loss": 0.0002, "step": 6610 }, { "epoch": 3.259478089611029, "grad_norm": 0.0017217019340023398, "learning_rate": 6.964057114721813e-06, "loss": 0.0002, "step": 6620 }, { "epoch": 3.2644017725258494, "grad_norm": 0.001690174569375813, "learning_rate": 6.944362383062532e-06, "loss": 0.0002, "step": 6630 }, { "epoch": 3.2693254554406694, "grad_norm": 0.0017117226962000132, "learning_rate": 6.924667651403251e-06, "loss": 0.0002, "step": 6640 }, { "epoch": 3.27424913835549, "grad_norm": 0.002361851977184415, "learning_rate": 6.904972919743968e-06, "loss": 0.0003, "step": 6650 }, { "epoch": 3.2791728212703104, "grad_norm": 0.0022522679064422846, "learning_rate": 6.885278188084687e-06, "loss": 0.0002, "step": 6660 }, { "epoch": 3.2840965041851304, "grad_norm": 0.001928364159539342, "learning_rate": 6.865583456425406e-06, "loss": 0.0002, "step": 6670 }, { "epoch": 3.289020187099951, "grad_norm": 0.001716680359095335, "learning_rate": 6.845888724766125e-06, "loss": 0.0002, "step": 6680 }, { "epoch": 3.293943870014771, "grad_norm": 0.10604394972324371, "learning_rate": 6.826193993106844e-06, "loss": 0.0004, "step": 6690 }, { "epoch": 3.2988675529295914, "grad_norm": 0.001663457602262497, "learning_rate": 6.8064992614475635e-06, "loss": 0.0002, "step": 6700 }, { "epoch": 3.3037912358444115, "grad_norm": 0.0023722369223833084, "learning_rate": 6.7868045297882825e-06, "loss": 0.0003, "step": 6710 }, { "epoch": 3.308714918759232, "grad_norm": 0.0020810524001717567, "learning_rate": 6.7671097981290016e-06, "loss": 0.0002, "step": 6720 }, { "epoch": 3.313638601674052, "grad_norm": 0.0019393692491576076, "learning_rate": 6.747415066469721e-06, "loss": 0.0002, "step": 6730 }, { "epoch": 3.3185622845888725, "grad_norm": 0.0016419962048530579, "learning_rate": 6.727720334810439e-06, "loss": 0.0002, "step": 6740 }, { "epoch": 3.323485967503693, "grad_norm": 0.0016390602104365826, "learning_rate": 6.708025603151158e-06, "loss": 0.0002, "step": 6750 }, { "epoch": 3.328409650418513, "grad_norm": 0.0016691131750121713, "learning_rate": 6.688330871491876e-06, "loss": 0.0002, "step": 6760 }, { "epoch": 3.3333333333333335, "grad_norm": 0.001631229417398572, "learning_rate": 6.668636139832595e-06, "loss": 0.0003, "step": 6770 }, { "epoch": 3.3382570162481535, "grad_norm": 0.001860469812527299, "learning_rate": 6.648941408173314e-06, "loss": 0.0002, "step": 6780 }, { "epoch": 3.343180699162974, "grad_norm": 0.0019199643284082413, "learning_rate": 6.629246676514033e-06, "loss": 0.0002, "step": 6790 }, { "epoch": 3.348104382077794, "grad_norm": 0.0016363976756110787, "learning_rate": 6.6095519448547515e-06, "loss": 0.0002, "step": 6800 }, { "epoch": 3.3530280649926145, "grad_norm": 0.0021544115152209997, "learning_rate": 6.5898572131954705e-06, "loss": 0.0002, "step": 6810 }, { "epoch": 3.3579517479074346, "grad_norm": 0.0016355662373825908, "learning_rate": 6.57016248153619e-06, "loss": 0.0002, "step": 6820 }, { "epoch": 3.362875430822255, "grad_norm": 0.0016973905730992556, "learning_rate": 6.550467749876909e-06, "loss": 0.0002, "step": 6830 }, { "epoch": 3.3677991137370755, "grad_norm": 0.4839552938938141, "learning_rate": 6.530773018217628e-06, "loss": 0.0025, "step": 6840 }, { "epoch": 3.3727227966518956, "grad_norm": 0.0016121608205139637, "learning_rate": 6.511078286558347e-06, "loss": 0.0002, "step": 6850 }, { "epoch": 3.377646479566716, "grad_norm": 0.0016309829661622643, "learning_rate": 6.491383554899066e-06, "loss": 0.0002, "step": 6860 }, { "epoch": 3.382570162481536, "grad_norm": 0.0015953374095261097, "learning_rate": 6.471688823239783e-06, "loss": 0.0002, "step": 6870 }, { "epoch": 3.3874938453963566, "grad_norm": 0.0015872882213443518, "learning_rate": 6.451994091580502e-06, "loss": 0.0002, "step": 6880 }, { "epoch": 3.3924175283111766, "grad_norm": 0.001950180740095675, "learning_rate": 6.432299359921221e-06, "loss": 0.0002, "step": 6890 }, { "epoch": 3.397341211225997, "grad_norm": 0.001902850461192429, "learning_rate": 6.41260462826194e-06, "loss": 0.0002, "step": 6900 }, { "epoch": 3.402264894140817, "grad_norm": 0.0018240628996863961, "learning_rate": 6.392909896602659e-06, "loss": 0.0002, "step": 6910 }, { "epoch": 3.4071885770556376, "grad_norm": 0.0015961594181135297, "learning_rate": 6.3732151649433785e-06, "loss": 0.0002, "step": 6920 }, { "epoch": 3.412112259970458, "grad_norm": 0.0015978438314050436, "learning_rate": 6.3535204332840975e-06, "loss": 0.0002, "step": 6930 }, { "epoch": 3.417035942885278, "grad_norm": 0.002185217337682843, "learning_rate": 6.333825701624816e-06, "loss": 0.0002, "step": 6940 }, { "epoch": 3.4219596258000986, "grad_norm": 0.0015885097673162818, "learning_rate": 6.314130969965535e-06, "loss": 0.0002, "step": 6950 }, { "epoch": 3.4268833087149186, "grad_norm": 0.0017295465804636478, "learning_rate": 6.294436238306254e-06, "loss": 0.0002, "step": 6960 }, { "epoch": 3.431806991629739, "grad_norm": 0.0020427897106856108, "learning_rate": 6.274741506646972e-06, "loss": 0.0002, "step": 6970 }, { "epoch": 3.436730674544559, "grad_norm": 0.001587416511029005, "learning_rate": 6.255046774987691e-06, "loss": 0.0002, "step": 6980 }, { "epoch": 3.4416543574593796, "grad_norm": 0.0015563281485810876, "learning_rate": 6.23535204332841e-06, "loss": 0.0002, "step": 6990 }, { "epoch": 3.4465780403741997, "grad_norm": 0.0019551627337932587, "learning_rate": 6.215657311669128e-06, "loss": 0.0002, "step": 7000 }, { "epoch": 3.45150172328902, "grad_norm": 0.00192571512889117, "learning_rate": 6.1959625800098474e-06, "loss": 0.0002, "step": 7010 }, { "epoch": 3.4564254062038406, "grad_norm": 0.0022392261307686567, "learning_rate": 6.1762678483505665e-06, "loss": 0.0002, "step": 7020 }, { "epoch": 3.4613490891186607, "grad_norm": 0.0015274557517841458, "learning_rate": 6.1565731166912856e-06, "loss": 0.0002, "step": 7030 }, { "epoch": 3.466272772033481, "grad_norm": 0.0025689592584967613, "learning_rate": 6.136878385032005e-06, "loss": 0.0528, "step": 7040 }, { "epoch": 3.471196454948301, "grad_norm": 0.0017388068372383714, "learning_rate": 6.117183653372724e-06, "loss": 0.0002, "step": 7050 }, { "epoch": 3.4761201378631217, "grad_norm": 0.002218118868768215, "learning_rate": 6.097488921713443e-06, "loss": 0.0002, "step": 7060 }, { "epoch": 3.4810438207779417, "grad_norm": 0.0015678524505347013, "learning_rate": 6.077794190054162e-06, "loss": 0.0002, "step": 7070 }, { "epoch": 3.485967503692762, "grad_norm": 0.001805935869924724, "learning_rate": 6.058099458394879e-06, "loss": 0.0002, "step": 7080 }, { "epoch": 3.4908911866075822, "grad_norm": 0.0015670402208343148, "learning_rate": 6.038404726735598e-06, "loss": 0.0004, "step": 7090 }, { "epoch": 3.4958148695224027, "grad_norm": 0.0016316076507791877, "learning_rate": 6.018709995076317e-06, "loss": 0.0002, "step": 7100 }, { "epoch": 3.500738552437223, "grad_norm": 0.0015326079446822405, "learning_rate": 5.999015263417036e-06, "loss": 0.0002, "step": 7110 }, { "epoch": 3.5056622353520432, "grad_norm": 0.0019040625775232911, "learning_rate": 5.979320531757755e-06, "loss": 0.0002, "step": 7120 }, { "epoch": 3.5105859182668637, "grad_norm": 0.0018364688148722053, "learning_rate": 5.9596258000984744e-06, "loss": 0.0002, "step": 7130 }, { "epoch": 3.5155096011816838, "grad_norm": 0.001652045757509768, "learning_rate": 5.9399310684391935e-06, "loss": 0.028, "step": 7140 }, { "epoch": 3.5204332840965042, "grad_norm": 0.0017825603717938066, "learning_rate": 5.920236336779912e-06, "loss": 0.0002, "step": 7150 }, { "epoch": 3.5253569670113247, "grad_norm": 0.0015865016030147672, "learning_rate": 5.900541605120631e-06, "loss": 0.0002, "step": 7160 }, { "epoch": 3.5302806499261448, "grad_norm": 0.002345743589103222, "learning_rate": 5.88084687346135e-06, "loss": 0.0002, "step": 7170 }, { "epoch": 3.535204332840965, "grad_norm": 0.001753124175593257, "learning_rate": 5.861152141802069e-06, "loss": 0.0002, "step": 7180 }, { "epoch": 3.5401280157557853, "grad_norm": 0.0016645858995616436, "learning_rate": 5.841457410142787e-06, "loss": 0.0002, "step": 7190 }, { "epoch": 3.5450516986706058, "grad_norm": 0.0017800434725359082, "learning_rate": 5.821762678483506e-06, "loss": 0.0002, "step": 7200 }, { "epoch": 3.549975381585426, "grad_norm": 0.0025877405423671007, "learning_rate": 5.802067946824224e-06, "loss": 0.0002, "step": 7210 }, { "epoch": 3.5548990645002463, "grad_norm": 0.0018426472088322043, "learning_rate": 5.782373215164943e-06, "loss": 0.0002, "step": 7220 }, { "epoch": 3.5598227474150663, "grad_norm": 0.0019229879835620522, "learning_rate": 5.7626784835056625e-06, "loss": 0.0018, "step": 7230 }, { "epoch": 3.564746430329887, "grad_norm": 0.001863017212599516, "learning_rate": 5.7429837518463815e-06, "loss": 0.0002, "step": 7240 }, { "epoch": 3.5696701132447073, "grad_norm": 0.0015497950371354818, "learning_rate": 5.7232890201871006e-06, "loss": 0.0002, "step": 7250 }, { "epoch": 3.5745937961595273, "grad_norm": 0.0015739004593342543, "learning_rate": 5.70359428852782e-06, "loss": 0.0002, "step": 7260 }, { "epoch": 3.5795174790743474, "grad_norm": 0.0018886280013248324, "learning_rate": 5.683899556868539e-06, "loss": 0.0002, "step": 7270 }, { "epoch": 3.584441161989168, "grad_norm": 0.0020600894931703806, "learning_rate": 5.664204825209258e-06, "loss": 0.0002, "step": 7280 }, { "epoch": 3.5893648449039883, "grad_norm": 0.0017813716549426317, "learning_rate": 5.644510093549976e-06, "loss": 0.0002, "step": 7290 }, { "epoch": 3.5942885278188084, "grad_norm": 0.0014752724673599005, "learning_rate": 5.624815361890694e-06, "loss": 0.0003, "step": 7300 }, { "epoch": 3.599212210733629, "grad_norm": 0.0017788780387490988, "learning_rate": 5.605120630231413e-06, "loss": 0.0002, "step": 7310 }, { "epoch": 3.604135893648449, "grad_norm": 0.0017315271543338895, "learning_rate": 5.585425898572132e-06, "loss": 0.0002, "step": 7320 }, { "epoch": 3.6090595765632694, "grad_norm": 0.0015107191866263747, "learning_rate": 5.565731166912851e-06, "loss": 0.0212, "step": 7330 }, { "epoch": 3.61398325947809, "grad_norm": 0.0015164316864684224, "learning_rate": 5.54603643525357e-06, "loss": 0.0002, "step": 7340 }, { "epoch": 3.61890694239291, "grad_norm": 0.0015602256171405315, "learning_rate": 5.526341703594289e-06, "loss": 0.0002, "step": 7350 }, { "epoch": 3.62383062530773, "grad_norm": 0.0015156749868765473, "learning_rate": 5.506646971935008e-06, "loss": 0.0002, "step": 7360 }, { "epoch": 3.6287543082225504, "grad_norm": 0.0015055168187245727, "learning_rate": 5.486952240275727e-06, "loss": 0.0002, "step": 7370 }, { "epoch": 3.633677991137371, "grad_norm": 0.0020491585601121187, "learning_rate": 5.467257508616446e-06, "loss": 0.0002, "step": 7380 }, { "epoch": 3.638601674052191, "grad_norm": 0.0016383701004087925, "learning_rate": 5.447562776957165e-06, "loss": 0.0002, "step": 7390 }, { "epoch": 3.6435253569670114, "grad_norm": 0.0015544986817985773, "learning_rate": 5.427868045297884e-06, "loss": 0.0003, "step": 7400 }, { "epoch": 3.6484490398818314, "grad_norm": 0.0018146246438845992, "learning_rate": 5.408173313638601e-06, "loss": 0.0002, "step": 7410 }, { "epoch": 3.653372722796652, "grad_norm": 0.001972957979887724, "learning_rate": 5.38847858197932e-06, "loss": 0.0002, "step": 7420 }, { "epoch": 3.6582964057114724, "grad_norm": 0.0021671319846063852, "learning_rate": 5.368783850320039e-06, "loss": 0.0002, "step": 7430 }, { "epoch": 3.6632200886262924, "grad_norm": 0.002430422930046916, "learning_rate": 5.349089118660758e-06, "loss": 0.0002, "step": 7440 }, { "epoch": 3.6681437715411125, "grad_norm": 0.0015026788460090756, "learning_rate": 5.3293943870014775e-06, "loss": 0.0002, "step": 7450 }, { "epoch": 3.673067454455933, "grad_norm": 0.0068082925863564014, "learning_rate": 5.3096996553421965e-06, "loss": 0.0002, "step": 7460 }, { "epoch": 3.6779911373707534, "grad_norm": 0.001487517962232232, "learning_rate": 5.290004923682916e-06, "loss": 0.0002, "step": 7470 }, { "epoch": 3.6829148202855735, "grad_norm": 0.006023659370839596, "learning_rate": 5.270310192023635e-06, "loss": 0.0002, "step": 7480 }, { "epoch": 3.687838503200394, "grad_norm": 0.0016626460710540414, "learning_rate": 5.250615460364354e-06, "loss": 0.0002, "step": 7490 }, { "epoch": 3.692762186115214, "grad_norm": 0.0018089113291352987, "learning_rate": 5.230920728705072e-06, "loss": 0.0002, "step": 7500 }, { "epoch": 3.6976858690300345, "grad_norm": 0.001456581405363977, "learning_rate": 5.21122599704579e-06, "loss": 0.0002, "step": 7510 }, { "epoch": 3.702609551944855, "grad_norm": 0.0014255401911213994, "learning_rate": 5.191531265386509e-06, "loss": 0.0003, "step": 7520 }, { "epoch": 3.707533234859675, "grad_norm": 0.00147194042801857, "learning_rate": 5.171836533727228e-06, "loss": 0.0002, "step": 7530 }, { "epoch": 3.712456917774495, "grad_norm": 0.001444322639144957, "learning_rate": 5.152141802067947e-06, "loss": 0.0002, "step": 7540 }, { "epoch": 3.7173806006893155, "grad_norm": 0.001467820955440402, "learning_rate": 5.132447070408666e-06, "loss": 0.0002, "step": 7550 }, { "epoch": 3.722304283604136, "grad_norm": 0.0020592319779098034, "learning_rate": 5.1127523387493846e-06, "loss": 0.0002, "step": 7560 }, { "epoch": 3.727227966518956, "grad_norm": 0.0017327726818621159, "learning_rate": 5.093057607090104e-06, "loss": 0.0002, "step": 7570 }, { "epoch": 3.7321516494337765, "grad_norm": 0.0017492349725216627, "learning_rate": 5.073362875430823e-06, "loss": 0.0002, "step": 7580 }, { "epoch": 3.7370753323485966, "grad_norm": 0.00662592425942421, "learning_rate": 5.053668143771542e-06, "loss": 0.0002, "step": 7590 }, { "epoch": 3.741999015263417, "grad_norm": 0.001524804625660181, "learning_rate": 5.033973412112261e-06, "loss": 0.0002, "step": 7600 }, { "epoch": 3.7469226981782375, "grad_norm": 0.0014649020740762353, "learning_rate": 5.01427868045298e-06, "loss": 0.0002, "step": 7610 }, { "epoch": 3.7518463810930576, "grad_norm": 0.0014713428681716323, "learning_rate": 4.994583948793698e-06, "loss": 0.0005, "step": 7620 }, { "epoch": 3.756770064007878, "grad_norm": 0.0017360311467200518, "learning_rate": 4.974889217134417e-06, "loss": 0.0049, "step": 7630 }, { "epoch": 3.761693746922698, "grad_norm": 0.0014296614099293947, "learning_rate": 4.955194485475136e-06, "loss": 0.0002, "step": 7640 }, { "epoch": 3.7666174298375186, "grad_norm": 0.13573376834392548, "learning_rate": 4.935499753815854e-06, "loss": 0.0003, "step": 7650 }, { "epoch": 3.7715411127523386, "grad_norm": 0.0024039391428232193, "learning_rate": 4.915805022156573e-06, "loss": 0.0002, "step": 7660 }, { "epoch": 3.776464795667159, "grad_norm": 0.0014649844961240888, "learning_rate": 4.8961102904972925e-06, "loss": 0.0002, "step": 7670 }, { "epoch": 3.781388478581979, "grad_norm": 0.0016867737285792828, "learning_rate": 4.8764155588380115e-06, "loss": 0.0002, "step": 7680 }, { "epoch": 3.7863121614967996, "grad_norm": 0.001405984628945589, "learning_rate": 4.856720827178731e-06, "loss": 0.0002, "step": 7690 }, { "epoch": 3.79123584441162, "grad_norm": 0.0014443190302699804, "learning_rate": 4.837026095519449e-06, "loss": 0.0002, "step": 7700 }, { "epoch": 3.79615952732644, "grad_norm": 0.0016520627541467547, "learning_rate": 4.817331363860168e-06, "loss": 0.0003, "step": 7710 }, { "epoch": 3.8010832102412606, "grad_norm": 0.01564246602356434, "learning_rate": 4.797636632200887e-06, "loss": 0.0002, "step": 7720 }, { "epoch": 3.8060068931560807, "grad_norm": 0.0014039729721844196, "learning_rate": 4.777941900541606e-06, "loss": 0.0002, "step": 7730 }, { "epoch": 3.810930576070901, "grad_norm": 0.0015944570768624544, "learning_rate": 4.758247168882324e-06, "loss": 0.0002, "step": 7740 }, { "epoch": 3.815854258985721, "grad_norm": 0.0016037479508668184, "learning_rate": 4.738552437223043e-06, "loss": 0.0002, "step": 7750 }, { "epoch": 3.8207779419005417, "grad_norm": 0.0016006861114874482, "learning_rate": 4.718857705563762e-06, "loss": 0.0003, "step": 7760 }, { "epoch": 3.8257016248153617, "grad_norm": 0.0014518728712573647, "learning_rate": 4.6991629739044805e-06, "loss": 0.0002, "step": 7770 }, { "epoch": 3.830625307730182, "grad_norm": 0.0020076241344213486, "learning_rate": 4.6794682422451996e-06, "loss": 0.0002, "step": 7780 }, { "epoch": 3.8355489906450027, "grad_norm": 0.0013705624733120203, "learning_rate": 4.659773510585919e-06, "loss": 0.0002, "step": 7790 }, { "epoch": 3.8404726735598227, "grad_norm": 0.0015762551920488477, "learning_rate": 4.640078778926638e-06, "loss": 0.0002, "step": 7800 }, { "epoch": 3.845396356474643, "grad_norm": 0.001401646644808352, "learning_rate": 4.620384047267356e-06, "loss": 0.0023, "step": 7810 }, { "epoch": 3.850320039389463, "grad_norm": 0.0014297078596428037, "learning_rate": 4.600689315608075e-06, "loss": 0.0002, "step": 7820 }, { "epoch": 3.8552437223042837, "grad_norm": 0.001564424717798829, "learning_rate": 4.580994583948794e-06, "loss": 0.0002, "step": 7830 }, { "epoch": 3.8601674052191037, "grad_norm": 0.0016091325087472796, "learning_rate": 4.561299852289513e-06, "loss": 0.0427, "step": 7840 }, { "epoch": 3.865091088133924, "grad_norm": 0.0013987331185489893, "learning_rate": 4.541605120630232e-06, "loss": 0.0002, "step": 7850 }, { "epoch": 3.8700147710487443, "grad_norm": 0.0013764946488663554, "learning_rate": 4.521910388970951e-06, "loss": 0.0041, "step": 7860 }, { "epoch": 3.8749384539635647, "grad_norm": 0.0014972817152738571, "learning_rate": 4.502215657311669e-06, "loss": 0.0002, "step": 7870 }, { "epoch": 3.879862136878385, "grad_norm": 0.002630974631756544, "learning_rate": 4.4825209256523884e-06, "loss": 0.0002, "step": 7880 }, { "epoch": 3.8847858197932053, "grad_norm": 0.0015066839987412095, "learning_rate": 4.4628261939931075e-06, "loss": 0.0002, "step": 7890 }, { "epoch": 3.8897095027080257, "grad_norm": 0.0013690165942534804, "learning_rate": 4.4431314623338265e-06, "loss": 0.0002, "step": 7900 }, { "epoch": 3.8946331856228458, "grad_norm": 0.0015422162832692266, "learning_rate": 4.423436730674545e-06, "loss": 0.0002, "step": 7910 }, { "epoch": 3.8995568685376663, "grad_norm": 0.0013625508872792125, "learning_rate": 4.403741999015264e-06, "loss": 0.0002, "step": 7920 }, { "epoch": 3.9044805514524867, "grad_norm": 0.0013615674106404185, "learning_rate": 4.384047267355983e-06, "loss": 0.0002, "step": 7930 }, { "epoch": 3.9094042343673068, "grad_norm": 0.0013642277335748076, "learning_rate": 4.364352535696701e-06, "loss": 0.0006, "step": 7940 }, { "epoch": 3.914327917282127, "grad_norm": 0.0018096421845257282, "learning_rate": 4.34465780403742e-06, "loss": 0.0002, "step": 7950 }, { "epoch": 3.9192516001969473, "grad_norm": 0.0016002668999135494, "learning_rate": 4.324963072378139e-06, "loss": 0.0002, "step": 7960 }, { "epoch": 3.9241752831117678, "grad_norm": 0.0013518129708245397, "learning_rate": 4.305268340718857e-06, "loss": 0.0002, "step": 7970 }, { "epoch": 3.929098966026588, "grad_norm": 0.0013844161294400692, "learning_rate": 4.2855736090595765e-06, "loss": 0.0002, "step": 7980 }, { "epoch": 3.9340226489414083, "grad_norm": 0.0013579302467405796, "learning_rate": 4.2658788774002955e-06, "loss": 0.0002, "step": 7990 }, { "epoch": 3.9389463318562283, "grad_norm": 0.00134462455753237, "learning_rate": 4.2461841457410146e-06, "loss": 0.0002, "step": 8000 }, { "epoch": 3.943870014771049, "grad_norm": 0.0018355045467615128, "learning_rate": 4.226489414081734e-06, "loss": 0.0002, "step": 8010 }, { "epoch": 3.9487936976858693, "grad_norm": 0.001347012585029006, "learning_rate": 4.206794682422453e-06, "loss": 0.0002, "step": 8020 }, { "epoch": 3.9537173806006893, "grad_norm": 0.0016463873907923698, "learning_rate": 4.187099950763171e-06, "loss": 0.0002, "step": 8030 }, { "epoch": 3.9586410635155094, "grad_norm": 0.0013524794485419989, "learning_rate": 4.16740521910389e-06, "loss": 0.0717, "step": 8040 }, { "epoch": 3.96356474643033, "grad_norm": 0.0015625300584360957, "learning_rate": 4.147710487444609e-06, "loss": 0.0002, "step": 8050 }, { "epoch": 3.9684884293451503, "grad_norm": 0.001937661785632372, "learning_rate": 4.128015755785328e-06, "loss": 0.0002, "step": 8060 }, { "epoch": 3.9734121122599704, "grad_norm": 0.0013714928645640612, "learning_rate": 4.108321024126047e-06, "loss": 0.0002, "step": 8070 }, { "epoch": 3.978335795174791, "grad_norm": 0.001366511220112443, "learning_rate": 4.088626292466765e-06, "loss": 0.0002, "step": 8080 }, { "epoch": 3.983259478089611, "grad_norm": 0.005632834974676371, "learning_rate": 4.068931560807484e-06, "loss": 0.0002, "step": 8090 }, { "epoch": 3.9881831610044314, "grad_norm": 0.0016822253819555044, "learning_rate": 4.0492368291482034e-06, "loss": 0.0259, "step": 8100 }, { "epoch": 3.993106843919252, "grad_norm": 0.0016814577393233776, "learning_rate": 4.0295420974889225e-06, "loss": 0.0002, "step": 8110 }, { "epoch": 3.998030526834072, "grad_norm": 0.0015251173172146082, "learning_rate": 4.009847365829641e-06, "loss": 0.0002, "step": 8120 }, { "epoch": 4.0, "eval_accuracy": 0.9982560167422393, "eval_loss": 0.004296807572245598, "eval_runtime": 127.4433, "eval_samples_per_second": 22.496, "eval_steps_per_second": 2.817, "step": 8124 }, { "epoch": 4.002954209748892, "grad_norm": 0.0015737974317744374, "learning_rate": 3.99015263417036e-06, "loss": 0.0002, "step": 8130 }, { "epoch": 4.007877892663712, "grad_norm": 0.0015378405805677176, "learning_rate": 3.970457902511079e-06, "loss": 0.0001, "step": 8140 }, { "epoch": 4.012801575578533, "grad_norm": 0.0023324452340602875, "learning_rate": 3.950763170851797e-06, "loss": 0.0002, "step": 8150 }, { "epoch": 4.017725258493353, "grad_norm": 0.0015311094466596842, "learning_rate": 3.931068439192516e-06, "loss": 0.0002, "step": 8160 }, { "epoch": 4.022648941408173, "grad_norm": 0.05710865184664726, "learning_rate": 3.911373707533235e-06, "loss": 0.0004, "step": 8170 }, { "epoch": 4.0275726243229935, "grad_norm": 0.0013750927755609155, "learning_rate": 3.891678975873954e-06, "loss": 0.0002, "step": 8180 }, { "epoch": 4.032496307237814, "grad_norm": 0.0015017867553979158, "learning_rate": 3.871984244214672e-06, "loss": 0.0002, "step": 8190 }, { "epoch": 4.037419990152634, "grad_norm": 0.0018651321297511458, "learning_rate": 3.8522895125553915e-06, "loss": 0.0002, "step": 8200 }, { "epoch": 4.042343673067455, "grad_norm": 0.0015941975871101022, "learning_rate": 3.8325947808961105e-06, "loss": 0.0002, "step": 8210 }, { "epoch": 4.0472673559822745, "grad_norm": 0.0018877830589190125, "learning_rate": 3.8129000492368296e-06, "loss": 0.0002, "step": 8220 }, { "epoch": 4.052191038897095, "grad_norm": 0.0013936322648078203, "learning_rate": 3.7932053175775486e-06, "loss": 0.0002, "step": 8230 }, { "epoch": 4.0571147218119155, "grad_norm": 0.0015832120552659035, "learning_rate": 3.773510585918267e-06, "loss": 0.0002, "step": 8240 }, { "epoch": 4.062038404726736, "grad_norm": 0.0013590282760560513, "learning_rate": 3.753815854258986e-06, "loss": 0.0002, "step": 8250 }, { "epoch": 4.0669620876415555, "grad_norm": 0.0016734645469114184, "learning_rate": 3.734121122599705e-06, "loss": 0.0002, "step": 8260 }, { "epoch": 4.071885770556376, "grad_norm": 0.001429893309250474, "learning_rate": 3.7144263909404236e-06, "loss": 0.0002, "step": 8270 }, { "epoch": 4.0768094534711965, "grad_norm": 0.0017754683503881097, "learning_rate": 3.6947316592811427e-06, "loss": 0.0002, "step": 8280 }, { "epoch": 4.081733136386017, "grad_norm": 0.0013153937179595232, "learning_rate": 3.6750369276218617e-06, "loss": 0.0002, "step": 8290 }, { "epoch": 4.0866568193008375, "grad_norm": 0.0013936875620856881, "learning_rate": 3.65534219596258e-06, "loss": 0.0002, "step": 8300 }, { "epoch": 4.091580502215657, "grad_norm": 0.007932737469673157, "learning_rate": 3.635647464303299e-06, "loss": 0.0002, "step": 8310 }, { "epoch": 4.0965041851304775, "grad_norm": 0.0017327237874269485, "learning_rate": 3.615952732644018e-06, "loss": 0.0002, "step": 8320 }, { "epoch": 4.101427868045298, "grad_norm": 0.0019220160320401192, "learning_rate": 3.596258000984737e-06, "loss": 0.0002, "step": 8330 }, { "epoch": 4.1063515509601185, "grad_norm": 0.0012822451535612345, "learning_rate": 3.576563269325456e-06, "loss": 0.0002, "step": 8340 }, { "epoch": 4.111275233874938, "grad_norm": 0.0037715635262429714, "learning_rate": 3.5568685376661744e-06, "loss": 0.0002, "step": 8350 }, { "epoch": 4.116198916789759, "grad_norm": 0.0015437414404004812, "learning_rate": 3.5371738060068934e-06, "loss": 0.0002, "step": 8360 }, { "epoch": 4.121122599704579, "grad_norm": 0.0012796347727999091, "learning_rate": 3.5174790743476125e-06, "loss": 0.0001, "step": 8370 }, { "epoch": 4.1260462826193995, "grad_norm": 0.0013989837607368827, "learning_rate": 3.497784342688331e-06, "loss": 0.0033, "step": 8380 }, { "epoch": 4.13096996553422, "grad_norm": 0.0012871460057795048, "learning_rate": 3.47808961102905e-06, "loss": 0.0001, "step": 8390 }, { "epoch": 4.13589364844904, "grad_norm": 0.0014854512410238385, "learning_rate": 3.4583948793697692e-06, "loss": 0.0002, "step": 8400 }, { "epoch": 4.14081733136386, "grad_norm": 0.0012728660367429256, "learning_rate": 3.4387001477104874e-06, "loss": 0.0002, "step": 8410 }, { "epoch": 4.145741014278681, "grad_norm": 0.0012776675866916776, "learning_rate": 3.4190054160512065e-06, "loss": 0.0002, "step": 8420 }, { "epoch": 4.150664697193501, "grad_norm": 0.0016903368523344398, "learning_rate": 3.3993106843919255e-06, "loss": 0.0002, "step": 8430 }, { "epoch": 4.155588380108321, "grad_norm": 0.004182947799563408, "learning_rate": 3.3796159527326446e-06, "loss": 0.0003, "step": 8440 }, { "epoch": 4.160512063023141, "grad_norm": 0.0012582590570673347, "learning_rate": 3.3599212210733632e-06, "loss": 0.0002, "step": 8450 }, { "epoch": 4.165435745937962, "grad_norm": 0.0017205028561875224, "learning_rate": 3.340226489414082e-06, "loss": 0.0002, "step": 8460 }, { "epoch": 4.170359428852782, "grad_norm": 0.0015602920902892947, "learning_rate": 3.320531757754801e-06, "loss": 0.0002, "step": 8470 }, { "epoch": 4.175283111767603, "grad_norm": 0.0013148763682693243, "learning_rate": 3.3008370260955196e-06, "loss": 0.0001, "step": 8480 }, { "epoch": 4.180206794682422, "grad_norm": 0.001262652687728405, "learning_rate": 3.2811422944362386e-06, "loss": 0.0002, "step": 8490 }, { "epoch": 4.185130477597243, "grad_norm": 0.00150257907807827, "learning_rate": 3.2614475627769577e-06, "loss": 0.0002, "step": 8500 }, { "epoch": 4.190054160512063, "grad_norm": 0.0014894099440425634, "learning_rate": 3.241752831117676e-06, "loss": 0.0002, "step": 8510 }, { "epoch": 4.194977843426884, "grad_norm": 0.0017964887665584683, "learning_rate": 3.222058099458395e-06, "loss": 0.0002, "step": 8520 }, { "epoch": 4.199901526341703, "grad_norm": 0.0014780315104871988, "learning_rate": 3.202363367799114e-06, "loss": 0.0001, "step": 8530 }, { "epoch": 4.204825209256524, "grad_norm": 0.0012969339732080698, "learning_rate": 3.182668636139833e-06, "loss": 0.0001, "step": 8540 }, { "epoch": 4.209748892171344, "grad_norm": 0.0014950993936508894, "learning_rate": 3.1629739044805517e-06, "loss": 0.0002, "step": 8550 }, { "epoch": 4.214672575086165, "grad_norm": 0.0013700306881219149, "learning_rate": 3.1432791728212707e-06, "loss": 0.0002, "step": 8560 }, { "epoch": 4.219596258000985, "grad_norm": 0.001990771619603038, "learning_rate": 3.1235844411619894e-06, "loss": 0.0002, "step": 8570 }, { "epoch": 4.224519940915805, "grad_norm": 0.0013008847599849105, "learning_rate": 3.103889709502708e-06, "loss": 0.0002, "step": 8580 }, { "epoch": 4.229443623830625, "grad_norm": 0.0012881122529506683, "learning_rate": 3.084194977843427e-06, "loss": 0.0002, "step": 8590 }, { "epoch": 4.234367306745446, "grad_norm": 0.0016159663209691644, "learning_rate": 3.064500246184146e-06, "loss": 0.025, "step": 8600 }, { "epoch": 4.239290989660266, "grad_norm": 0.0012733545154333115, "learning_rate": 3.044805514524865e-06, "loss": 0.0002, "step": 8610 }, { "epoch": 4.244214672575086, "grad_norm": 0.001908605801872909, "learning_rate": 3.0251107828655834e-06, "loss": 0.0002, "step": 8620 }, { "epoch": 4.249138355489906, "grad_norm": 0.0013102259254083037, "learning_rate": 3.0054160512063024e-06, "loss": 0.0002, "step": 8630 }, { "epoch": 4.254062038404727, "grad_norm": 0.0015227696858346462, "learning_rate": 2.9857213195470215e-06, "loss": 0.0002, "step": 8640 }, { "epoch": 4.258985721319547, "grad_norm": 0.001237130374647677, "learning_rate": 2.96602658788774e-06, "loss": 0.0002, "step": 8650 }, { "epoch": 4.263909404234368, "grad_norm": 0.0012896446278318763, "learning_rate": 2.946331856228459e-06, "loss": 0.0001, "step": 8660 }, { "epoch": 4.268833087149187, "grad_norm": 0.0012958886800333858, "learning_rate": 2.9266371245691782e-06, "loss": 0.0001, "step": 8670 }, { "epoch": 4.273756770064008, "grad_norm": 0.011026582680642605, "learning_rate": 2.9069423929098965e-06, "loss": 0.0002, "step": 8680 }, { "epoch": 4.278680452978828, "grad_norm": 0.0022566046100109816, "learning_rate": 2.8872476612506155e-06, "loss": 0.0002, "step": 8690 }, { "epoch": 4.283604135893649, "grad_norm": 0.0012606906238943338, "learning_rate": 2.8675529295913346e-06, "loss": 0.0002, "step": 8700 }, { "epoch": 4.288527818808468, "grad_norm": 0.0012501559685915709, "learning_rate": 2.8478581979320536e-06, "loss": 0.0002, "step": 8710 }, { "epoch": 4.293451501723289, "grad_norm": 0.0015419054543599486, "learning_rate": 2.8281634662727727e-06, "loss": 0.0002, "step": 8720 }, { "epoch": 4.298375184638109, "grad_norm": 0.0012369498144835234, "learning_rate": 2.808468734613491e-06, "loss": 0.0002, "step": 8730 }, { "epoch": 4.30329886755293, "grad_norm": 0.001716209459118545, "learning_rate": 2.78877400295421e-06, "loss": 0.0001, "step": 8740 }, { "epoch": 4.30822255046775, "grad_norm": 0.0012448432389646769, "learning_rate": 2.769079271294929e-06, "loss": 0.0001, "step": 8750 }, { "epoch": 4.31314623338257, "grad_norm": 0.0012436291435733438, "learning_rate": 2.7493845396356476e-06, "loss": 0.0002, "step": 8760 }, { "epoch": 4.31806991629739, "grad_norm": 0.0013124769320711493, "learning_rate": 2.7296898079763667e-06, "loss": 0.0002, "step": 8770 }, { "epoch": 4.322993599212211, "grad_norm": 0.0014658995205536485, "learning_rate": 2.7099950763170853e-06, "loss": 0.0002, "step": 8780 }, { "epoch": 4.327917282127031, "grad_norm": 0.00151143129914999, "learning_rate": 2.690300344657804e-06, "loss": 0.0002, "step": 8790 }, { "epoch": 4.332840965041851, "grad_norm": 0.0013197060907259583, "learning_rate": 2.670605612998523e-06, "loss": 0.0002, "step": 8800 }, { "epoch": 4.337764647956671, "grad_norm": 0.0024781296961009502, "learning_rate": 2.650910881339242e-06, "loss": 0.0006, "step": 8810 }, { "epoch": 4.342688330871492, "grad_norm": 0.002245939103886485, "learning_rate": 2.631216149679961e-06, "loss": 0.0002, "step": 8820 }, { "epoch": 4.347612013786312, "grad_norm": 0.0012940160231664777, "learning_rate": 2.6115214180206798e-06, "loss": 0.0002, "step": 8830 }, { "epoch": 4.352535696701133, "grad_norm": 0.0018242484657093883, "learning_rate": 2.5918266863613984e-06, "loss": 0.0002, "step": 8840 }, { "epoch": 4.357459379615952, "grad_norm": 0.0016134735196828842, "learning_rate": 2.5721319547021175e-06, "loss": 0.0002, "step": 8850 }, { "epoch": 4.362383062530773, "grad_norm": 0.0012313901679590344, "learning_rate": 2.552437223042836e-06, "loss": 0.0001, "step": 8860 }, { "epoch": 4.367306745445593, "grad_norm": 0.0012789260363206267, "learning_rate": 2.532742491383555e-06, "loss": 0.0002, "step": 8870 }, { "epoch": 4.372230428360414, "grad_norm": 0.00142171629704535, "learning_rate": 2.513047759724274e-06, "loss": 0.0001, "step": 8880 }, { "epoch": 4.3771541112752335, "grad_norm": 0.001423663692548871, "learning_rate": 2.493353028064993e-06, "loss": 0.0001, "step": 8890 }, { "epoch": 4.382077794190054, "grad_norm": 0.0012176020536571741, "learning_rate": 2.473658296405712e-06, "loss": 0.0002, "step": 8900 }, { "epoch": 4.387001477104874, "grad_norm": 0.3053596615791321, "learning_rate": 2.4539635647464305e-06, "loss": 0.001, "step": 8910 }, { "epoch": 4.391925160019695, "grad_norm": 0.0014659055741503835, "learning_rate": 2.4342688330871496e-06, "loss": 0.0002, "step": 8920 }, { "epoch": 4.396848842934515, "grad_norm": 0.0012392625212669373, "learning_rate": 2.4145741014278682e-06, "loss": 0.0002, "step": 8930 }, { "epoch": 4.401772525849335, "grad_norm": 0.0013947734842076898, "learning_rate": 2.3948793697685873e-06, "loss": 0.0001, "step": 8940 }, { "epoch": 4.4066962087641555, "grad_norm": 0.0014435608172789216, "learning_rate": 2.375184638109306e-06, "loss": 0.0002, "step": 8950 }, { "epoch": 4.411619891678976, "grad_norm": 0.5223457217216492, "learning_rate": 2.3554899064500245e-06, "loss": 0.0055, "step": 8960 }, { "epoch": 4.416543574593796, "grad_norm": 0.001231219619512558, "learning_rate": 2.3357951747907436e-06, "loss": 0.0001, "step": 8970 }, { "epoch": 4.421467257508617, "grad_norm": 0.001335963956080377, "learning_rate": 2.3161004431314626e-06, "loss": 0.0002, "step": 8980 }, { "epoch": 4.4263909404234365, "grad_norm": 0.0014932631747797132, "learning_rate": 2.2964057114721813e-06, "loss": 0.0183, "step": 8990 }, { "epoch": 4.431314623338257, "grad_norm": 0.0012341372203081846, "learning_rate": 2.2767109798129003e-06, "loss": 0.0001, "step": 9000 }, { "epoch": 4.4362383062530775, "grad_norm": 0.001224417588673532, "learning_rate": 2.257016248153619e-06, "loss": 0.0002, "step": 9010 }, { "epoch": 4.441161989167898, "grad_norm": 0.001415021950379014, "learning_rate": 2.237321516494338e-06, "loss": 0.0002, "step": 9020 }, { "epoch": 4.4460856720827175, "grad_norm": 0.0012212812434881926, "learning_rate": 2.217626784835057e-06, "loss": 0.0001, "step": 9030 }, { "epoch": 4.451009354997538, "grad_norm": 0.0016725438181310892, "learning_rate": 2.1979320531757757e-06, "loss": 0.0001, "step": 9040 }, { "epoch": 4.4559330379123585, "grad_norm": 0.0031123908702284098, "learning_rate": 2.1782373215164944e-06, "loss": 0.0002, "step": 9050 }, { "epoch": 4.460856720827179, "grad_norm": 0.00149108178447932, "learning_rate": 2.1585425898572134e-06, "loss": 0.0002, "step": 9060 }, { "epoch": 4.465780403741999, "grad_norm": 0.0012318805092945695, "learning_rate": 2.138847858197932e-06, "loss": 0.0001, "step": 9070 }, { "epoch": 4.470704086656819, "grad_norm": 0.001223694533109665, "learning_rate": 2.119153126538651e-06, "loss": 0.0001, "step": 9080 }, { "epoch": 4.4756277695716395, "grad_norm": 0.0015389460604637861, "learning_rate": 2.0994583948793697e-06, "loss": 0.0001, "step": 9090 }, { "epoch": 4.48055145248646, "grad_norm": 0.001753377728164196, "learning_rate": 2.079763663220089e-06, "loss": 0.0001, "step": 9100 }, { "epoch": 4.4854751354012805, "grad_norm": 0.00128873810172081, "learning_rate": 2.060068931560808e-06, "loss": 0.0001, "step": 9110 }, { "epoch": 4.4903988183161, "grad_norm": 0.0013829106464982033, "learning_rate": 2.0403741999015265e-06, "loss": 0.0001, "step": 9120 }, { "epoch": 4.495322501230921, "grad_norm": 0.0014223408652469516, "learning_rate": 2.0206794682422455e-06, "loss": 0.0002, "step": 9130 }, { "epoch": 4.500246184145741, "grad_norm": 0.0014313060091808438, "learning_rate": 2.000984736582964e-06, "loss": 0.0001, "step": 9140 }, { "epoch": 4.5051698670605616, "grad_norm": 0.004056563135236502, "learning_rate": 1.981290004923683e-06, "loss": 0.0002, "step": 9150 }, { "epoch": 4.510093549975382, "grad_norm": 0.0014529697364196181, "learning_rate": 1.961595273264402e-06, "loss": 0.0002, "step": 9160 }, { "epoch": 4.515017232890202, "grad_norm": 0.0011942709097638726, "learning_rate": 1.941900541605121e-06, "loss": 0.0001, "step": 9170 }, { "epoch": 4.519940915805022, "grad_norm": 0.0016299562994390726, "learning_rate": 1.9222058099458395e-06, "loss": 0.0002, "step": 9180 }, { "epoch": 4.524864598719843, "grad_norm": 0.0012343422276899219, "learning_rate": 1.9025110782865586e-06, "loss": 0.0001, "step": 9190 }, { "epoch": 4.529788281634663, "grad_norm": 0.001468788948841393, "learning_rate": 1.8828163466272772e-06, "loss": 0.0002, "step": 9200 }, { "epoch": 4.534711964549483, "grad_norm": 0.0012346956646069884, "learning_rate": 1.8631216149679963e-06, "loss": 0.0002, "step": 9210 }, { "epoch": 4.539635647464303, "grad_norm": 0.005765956360846758, "learning_rate": 1.8434268833087151e-06, "loss": 0.0002, "step": 9220 }, { "epoch": 4.544559330379124, "grad_norm": 0.0012338730739429593, "learning_rate": 1.8237321516494338e-06, "loss": 0.0001, "step": 9230 }, { "epoch": 4.549483013293944, "grad_norm": 0.0018139538588002324, "learning_rate": 1.8040374199901528e-06, "loss": 0.0001, "step": 9240 }, { "epoch": 4.554406696208764, "grad_norm": 0.001192125491797924, "learning_rate": 1.7843426883308717e-06, "loss": 0.0002, "step": 9250 }, { "epoch": 4.559330379123584, "grad_norm": 0.0013297711266204715, "learning_rate": 1.7646479566715905e-06, "loss": 0.0002, "step": 9260 }, { "epoch": 4.564254062038405, "grad_norm": 0.0015158694004639983, "learning_rate": 1.7449532250123094e-06, "loss": 0.0001, "step": 9270 }, { "epoch": 4.569177744953225, "grad_norm": 0.0014190117362886667, "learning_rate": 1.725258493353028e-06, "loss": 0.0003, "step": 9280 }, { "epoch": 4.574101427868046, "grad_norm": 0.0013871495611965656, "learning_rate": 1.705563761693747e-06, "loss": 0.0001, "step": 9290 }, { "epoch": 4.579025110782865, "grad_norm": 0.0016816201386973262, "learning_rate": 1.685869030034466e-06, "loss": 0.0002, "step": 9300 }, { "epoch": 4.583948793697686, "grad_norm": 0.0012660392094403505, "learning_rate": 1.6661742983751847e-06, "loss": 0.0001, "step": 9310 }, { "epoch": 4.588872476612506, "grad_norm": 0.0014960381668061018, "learning_rate": 1.6464795667159036e-06, "loss": 0.0001, "step": 9320 }, { "epoch": 4.593796159527327, "grad_norm": 0.0016527030384168029, "learning_rate": 1.6267848350566226e-06, "loss": 0.0001, "step": 9330 }, { "epoch": 4.598719842442147, "grad_norm": 0.0014885793207213283, "learning_rate": 1.6070901033973413e-06, "loss": 0.0001, "step": 9340 }, { "epoch": 4.603643525356967, "grad_norm": 0.0012881554430350661, "learning_rate": 1.5873953717380603e-06, "loss": 0.0001, "step": 9350 }, { "epoch": 4.608567208271787, "grad_norm": 0.003308955579996109, "learning_rate": 1.567700640078779e-06, "loss": 0.0002, "step": 9360 }, { "epoch": 4.613490891186608, "grad_norm": 0.0030271336436271667, "learning_rate": 1.5480059084194978e-06, "loss": 0.0002, "step": 9370 }, { "epoch": 4.618414574101428, "grad_norm": 0.0012079523876309395, "learning_rate": 1.5283111767602169e-06, "loss": 0.0001, "step": 9380 }, { "epoch": 4.623338257016248, "grad_norm": 0.0013818118022754788, "learning_rate": 1.5086164451009355e-06, "loss": 0.0002, "step": 9390 }, { "epoch": 4.628261939931068, "grad_norm": 0.0013594648335129023, "learning_rate": 1.4889217134416546e-06, "loss": 0.0002, "step": 9400 }, { "epoch": 4.633185622845889, "grad_norm": 0.0014723829226568341, "learning_rate": 1.4692269817823734e-06, "loss": 0.0002, "step": 9410 }, { "epoch": 4.638109305760709, "grad_norm": 0.001555690192617476, "learning_rate": 1.449532250123092e-06, "loss": 0.0001, "step": 9420 }, { "epoch": 4.643032988675529, "grad_norm": 0.001370370271615684, "learning_rate": 1.429837518463811e-06, "loss": 0.0001, "step": 9430 }, { "epoch": 4.647956671590349, "grad_norm": 0.0013733146479353309, "learning_rate": 1.41014278680453e-06, "loss": 0.0002, "step": 9440 }, { "epoch": 4.65288035450517, "grad_norm": 0.020692721009254456, "learning_rate": 1.3904480551452488e-06, "loss": 0.0002, "step": 9450 }, { "epoch": 4.65780403741999, "grad_norm": 0.0013894923031330109, "learning_rate": 1.3707533234859676e-06, "loss": 0.0079, "step": 9460 }, { "epoch": 4.662727720334811, "grad_norm": 0.005957514047622681, "learning_rate": 1.3510585918266863e-06, "loss": 0.0002, "step": 9470 }, { "epoch": 4.66765140324963, "grad_norm": 0.0014959081308916211, "learning_rate": 1.3313638601674053e-06, "loss": 0.0001, "step": 9480 }, { "epoch": 4.672575086164451, "grad_norm": 0.0012322113616392016, "learning_rate": 1.3116691285081244e-06, "loss": 0.0001, "step": 9490 }, { "epoch": 4.677498769079271, "grad_norm": 0.00134864985011518, "learning_rate": 1.291974396848843e-06, "loss": 0.0001, "step": 9500 }, { "epoch": 4.682422451994092, "grad_norm": 0.0011784137459471822, "learning_rate": 1.2722796651895619e-06, "loss": 0.0002, "step": 9510 }, { "epoch": 4.687346134908912, "grad_norm": 0.0011899089440703392, "learning_rate": 1.252584933530281e-06, "loss": 0.0002, "step": 9520 }, { "epoch": 4.692269817823732, "grad_norm": 0.0012023659655824304, "learning_rate": 1.2328902018709995e-06, "loss": 0.0002, "step": 9530 }, { "epoch": 4.697193500738552, "grad_norm": 0.001208995352499187, "learning_rate": 1.2131954702117186e-06, "loss": 0.0011, "step": 9540 }, { "epoch": 4.702117183653373, "grad_norm": 0.0014061863766983151, "learning_rate": 1.1935007385524372e-06, "loss": 0.0001, "step": 9550 }, { "epoch": 4.707040866568193, "grad_norm": 0.001575971720740199, "learning_rate": 1.173806006893156e-06, "loss": 0.0002, "step": 9560 }, { "epoch": 4.711964549483013, "grad_norm": 0.0012307134456932545, "learning_rate": 1.154111275233875e-06, "loss": 0.0002, "step": 9570 }, { "epoch": 4.716888232397833, "grad_norm": 0.0013830272946506739, "learning_rate": 1.134416543574594e-06, "loss": 0.0001, "step": 9580 }, { "epoch": 4.721811915312654, "grad_norm": 0.0013915746239945292, "learning_rate": 1.1147218119153128e-06, "loss": 0.0002, "step": 9590 }, { "epoch": 4.726735598227474, "grad_norm": 0.001184074324555695, "learning_rate": 1.0950270802560317e-06, "loss": 0.0001, "step": 9600 }, { "epoch": 4.731659281142294, "grad_norm": 0.0012042863527312875, "learning_rate": 1.0753323485967503e-06, "loss": 0.0001, "step": 9610 }, { "epoch": 4.736582964057114, "grad_norm": 0.0011871194001287222, "learning_rate": 1.0556376169374694e-06, "loss": 0.0001, "step": 9620 }, { "epoch": 4.741506646971935, "grad_norm": 0.0011803999077528715, "learning_rate": 1.0359428852781882e-06, "loss": 0.0001, "step": 9630 }, { "epoch": 4.746430329886755, "grad_norm": 0.001454996527172625, "learning_rate": 1.016248153618907e-06, "loss": 0.0001, "step": 9640 }, { "epoch": 4.751354012801576, "grad_norm": 0.0013643850106745958, "learning_rate": 9.96553421959626e-07, "loss": 0.0001, "step": 9650 }, { "epoch": 4.7562776957163955, "grad_norm": 0.004807002376765013, "learning_rate": 9.768586903003447e-07, "loss": 0.0002, "step": 9660 }, { "epoch": 4.761201378631216, "grad_norm": 0.010427232831716537, "learning_rate": 9.571639586410636e-07, "loss": 0.0002, "step": 9670 }, { "epoch": 4.766125061546036, "grad_norm": 0.0012309970334172249, "learning_rate": 9.374692269817824e-07, "loss": 0.0001, "step": 9680 }, { "epoch": 4.771048744460857, "grad_norm": 0.0019813096150755882, "learning_rate": 9.177744953225014e-07, "loss": 0.0002, "step": 9690 }, { "epoch": 4.775972427375677, "grad_norm": 0.0014402302913367748, "learning_rate": 8.980797636632202e-07, "loss": 0.0002, "step": 9700 }, { "epoch": 4.780896110290497, "grad_norm": 0.0014121612766757607, "learning_rate": 8.78385032003939e-07, "loss": 0.0002, "step": 9710 }, { "epoch": 4.7858197932053175, "grad_norm": 0.0011728608515113592, "learning_rate": 8.586903003446578e-07, "loss": 0.0001, "step": 9720 }, { "epoch": 4.790743476120138, "grad_norm": 0.001560199074447155, "learning_rate": 8.389955686853768e-07, "loss": 0.0002, "step": 9730 }, { "epoch": 4.795667159034958, "grad_norm": 0.001252340734936297, "learning_rate": 8.193008370260956e-07, "loss": 0.0001, "step": 9740 }, { "epoch": 4.800590841949779, "grad_norm": 0.0011785068782046437, "learning_rate": 7.996061053668145e-07, "loss": 0.0002, "step": 9750 }, { "epoch": 4.8055145248645985, "grad_norm": 0.001220080885104835, "learning_rate": 7.799113737075333e-07, "loss": 0.0001, "step": 9760 }, { "epoch": 4.810438207779419, "grad_norm": 0.0013691087951883674, "learning_rate": 7.602166420482522e-07, "loss": 0.0001, "step": 9770 }, { "epoch": 4.8153618906942395, "grad_norm": 0.003999890293926001, "learning_rate": 7.40521910388971e-07, "loss": 0.0002, "step": 9780 }, { "epoch": 4.82028557360906, "grad_norm": 0.0012147324159741402, "learning_rate": 7.208271787296898e-07, "loss": 0.0002, "step": 9790 }, { "epoch": 4.8252092565238796, "grad_norm": 0.0013871859991922975, "learning_rate": 7.011324470704087e-07, "loss": 0.0001, "step": 9800 }, { "epoch": 4.8301329394387, "grad_norm": 0.0011777483159676194, "learning_rate": 6.814377154111276e-07, "loss": 0.0001, "step": 9810 }, { "epoch": 4.8350566223535205, "grad_norm": 0.0013384807389229536, "learning_rate": 6.617429837518465e-07, "loss": 0.0001, "step": 9820 }, { "epoch": 4.839980305268341, "grad_norm": 0.00117026106454432, "learning_rate": 6.420482520925653e-07, "loss": 0.0001, "step": 9830 }, { "epoch": 4.844903988183161, "grad_norm": 0.0012068642536178231, "learning_rate": 6.223535204332842e-07, "loss": 0.0001, "step": 9840 }, { "epoch": 4.849827671097981, "grad_norm": 0.001245830673724413, "learning_rate": 6.02658788774003e-07, "loss": 0.0002, "step": 9850 }, { "epoch": 4.854751354012802, "grad_norm": 0.0014253932749852538, "learning_rate": 5.829640571147219e-07, "loss": 0.0001, "step": 9860 }, { "epoch": 4.859675036927622, "grad_norm": 0.001373025239445269, "learning_rate": 5.632693254554407e-07, "loss": 0.0001, "step": 9870 }, { "epoch": 4.8645987198424425, "grad_norm": 0.0014195754192769527, "learning_rate": 5.435745937961595e-07, "loss": 0.0002, "step": 9880 }, { "epoch": 4.869522402757262, "grad_norm": 0.0012903210008516908, "learning_rate": 5.238798621368784e-07, "loss": 0.0002, "step": 9890 }, { "epoch": 4.874446085672083, "grad_norm": 0.001158708124421537, "learning_rate": 5.041851304775973e-07, "loss": 0.0001, "step": 9900 }, { "epoch": 4.879369768586903, "grad_norm": 0.0012715465854853392, "learning_rate": 4.844903988183161e-07, "loss": 0.0001, "step": 9910 }, { "epoch": 4.884293451501724, "grad_norm": 0.0012133314739912748, "learning_rate": 4.64795667159035e-07, "loss": 0.0001, "step": 9920 }, { "epoch": 4.889217134416544, "grad_norm": 0.016124047338962555, "learning_rate": 4.451009354997538e-07, "loss": 0.0002, "step": 9930 }, { "epoch": 4.894140817331364, "grad_norm": 0.0013699685223400593, "learning_rate": 4.254062038404727e-07, "loss": 0.0001, "step": 9940 }, { "epoch": 4.899064500246184, "grad_norm": 0.0013574556214734912, "learning_rate": 4.057114721811916e-07, "loss": 0.0001, "step": 9950 }, { "epoch": 4.903988183161005, "grad_norm": 0.0013499916531145573, "learning_rate": 3.860167405219104e-07, "loss": 0.0002, "step": 9960 }, { "epoch": 4.908911866075825, "grad_norm": 0.001382304704748094, "learning_rate": 3.663220088626293e-07, "loss": 0.0001, "step": 9970 }, { "epoch": 4.913835548990645, "grad_norm": 0.0013027896638959646, "learning_rate": 3.4662727720334815e-07, "loss": 0.0001, "step": 9980 }, { "epoch": 4.918759231905465, "grad_norm": 0.0011790018761530519, "learning_rate": 3.26932545544067e-07, "loss": 0.0002, "step": 9990 }, { "epoch": 4.923682914820286, "grad_norm": 0.0012470015790313482, "learning_rate": 3.0723781388478584e-07, "loss": 0.0037, "step": 10000 }, { "epoch": 4.928606597735106, "grad_norm": 0.0012150456896051764, "learning_rate": 2.875430822255047e-07, "loss": 0.0002, "step": 10010 }, { "epoch": 4.933530280649926, "grad_norm": 0.0012021757429465652, "learning_rate": 2.6784835056622353e-07, "loss": 0.0001, "step": 10020 }, { "epoch": 4.938453963564746, "grad_norm": 0.0011834530159831047, "learning_rate": 2.4815361890694243e-07, "loss": 0.0001, "step": 10030 }, { "epoch": 4.943377646479567, "grad_norm": 0.0013943571830168366, "learning_rate": 2.2845888724766125e-07, "loss": 0.0002, "step": 10040 }, { "epoch": 4.948301329394387, "grad_norm": 0.001787975779734552, "learning_rate": 2.0876415558838012e-07, "loss": 0.0217, "step": 10050 }, { "epoch": 4.953225012309208, "grad_norm": 0.0014198760036379099, "learning_rate": 1.8906942392909896e-07, "loss": 0.0001, "step": 10060 }, { "epoch": 4.958148695224027, "grad_norm": 0.0013527346309274435, "learning_rate": 1.6937469226981783e-07, "loss": 0.0001, "step": 10070 }, { "epoch": 4.963072378138848, "grad_norm": 0.0020522738341242075, "learning_rate": 1.496799606105367e-07, "loss": 0.0002, "step": 10080 }, { "epoch": 4.967996061053668, "grad_norm": 0.0013713801745325327, "learning_rate": 1.2998522895125555e-07, "loss": 0.0001, "step": 10090 }, { "epoch": 4.972919743968489, "grad_norm": 0.001553553156554699, "learning_rate": 1.1029049729197441e-07, "loss": 0.0001, "step": 10100 }, { "epoch": 4.977843426883309, "grad_norm": 0.0011614145478233695, "learning_rate": 9.059576563269325e-08, "loss": 0.0001, "step": 10110 }, { "epoch": 4.982767109798129, "grad_norm": 0.001341148978099227, "learning_rate": 7.090103397341212e-08, "loss": 0.0001, "step": 10120 }, { "epoch": 4.987690792712949, "grad_norm": 0.0014009519945830107, "learning_rate": 5.1206302314130975e-08, "loss": 0.0001, "step": 10130 }, { "epoch": 4.99261447562777, "grad_norm": 0.0011851818999275565, "learning_rate": 3.1511570654849834e-08, "loss": 0.0001, "step": 10140 }, { "epoch": 4.99753815854259, "grad_norm": 0.001298619550652802, "learning_rate": 1.1816838995568685e-08, "loss": 0.0001, "step": 10150 }, { "epoch": 5.0, "eval_accuracy": 0.9989536100453436, "eval_loss": 0.003578549949452281, "eval_runtime": 127.0837, "eval_samples_per_second": 22.56, "eval_steps_per_second": 2.825, "step": 10155 }, { "epoch": 5.0, "step": 10155, "total_flos": 6.293899396497162e+18, "train_loss": 0.00641422240726846, "train_runtime": 5338.9651, "train_samples_per_second": 15.213, "train_steps_per_second": 1.902 } ], "logging_steps": 10, "max_steps": 10155, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.293899396497162e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }