image_segmentation_classifier / trainer_state.json
Dugerij's picture
End of training
51c73d5 verified
{
"best_global_step": 4062,
"best_metric": 0.003324420191347599,
"best_model_checkpoint": "./outputs/checkpoint-4062",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 10155,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004923682914820286,
"grad_norm": 1.6549489498138428,
"learning_rate": 1.9982274741506648e-05,
"loss": 0.4492,
"step": 10
},
{
"epoch": 0.009847365829640572,
"grad_norm": 1.334946870803833,
"learning_rate": 1.9962580009847367e-05,
"loss": 0.1576,
"step": 20
},
{
"epoch": 0.014771048744460856,
"grad_norm": 0.6242792010307312,
"learning_rate": 1.9942885278188087e-05,
"loss": 0.1297,
"step": 30
},
{
"epoch": 0.019694731659281144,
"grad_norm": 0.3603459596633911,
"learning_rate": 1.9923190546528807e-05,
"loss": 0.087,
"step": 40
},
{
"epoch": 0.024618414574101428,
"grad_norm": 1.5480420589447021,
"learning_rate": 1.9903495814869524e-05,
"loss": 0.1001,
"step": 50
},
{
"epoch": 0.029542097488921712,
"grad_norm": 0.46412599086761475,
"learning_rate": 1.9883801083210244e-05,
"loss": 0.0426,
"step": 60
},
{
"epoch": 0.034465780403742,
"grad_norm": 0.17812378704547882,
"learning_rate": 1.9864106351550964e-05,
"loss": 0.0667,
"step": 70
},
{
"epoch": 0.03938946331856229,
"grad_norm": 0.35499292612075806,
"learning_rate": 1.984441161989168e-05,
"loss": 0.0745,
"step": 80
},
{
"epoch": 0.04431314623338257,
"grad_norm": 0.130273699760437,
"learning_rate": 1.98247168882324e-05,
"loss": 0.0377,
"step": 90
},
{
"epoch": 0.049236829148202856,
"grad_norm": 0.12631726264953613,
"learning_rate": 1.980502215657312e-05,
"loss": 0.0621,
"step": 100
},
{
"epoch": 0.05416051206302314,
"grad_norm": 0.6002771854400635,
"learning_rate": 1.9785327424913836e-05,
"loss": 0.0625,
"step": 110
},
{
"epoch": 0.059084194977843424,
"grad_norm": 2.8204727172851562,
"learning_rate": 1.9765632693254556e-05,
"loss": 0.1003,
"step": 120
},
{
"epoch": 0.06400787789266371,
"grad_norm": 0.7127029895782471,
"learning_rate": 1.9745937961595273e-05,
"loss": 0.0507,
"step": 130
},
{
"epoch": 0.068931560807484,
"grad_norm": 0.09328364580869675,
"learning_rate": 1.9726243229935993e-05,
"loss": 0.0513,
"step": 140
},
{
"epoch": 0.07385524372230429,
"grad_norm": 0.1317463517189026,
"learning_rate": 1.9706548498276713e-05,
"loss": 0.0321,
"step": 150
},
{
"epoch": 0.07877892663712457,
"grad_norm": 0.0986248180270195,
"learning_rate": 1.9686853766617433e-05,
"loss": 0.021,
"step": 160
},
{
"epoch": 0.08370260955194485,
"grad_norm": 0.09169679135084152,
"learning_rate": 1.966715903495815e-05,
"loss": 0.0197,
"step": 170
},
{
"epoch": 0.08862629246676514,
"grad_norm": 0.11187795549631119,
"learning_rate": 1.964746430329887e-05,
"loss": 0.0294,
"step": 180
},
{
"epoch": 0.09354997538158542,
"grad_norm": 0.07759550958871841,
"learning_rate": 1.962776957163959e-05,
"loss": 0.0413,
"step": 190
},
{
"epoch": 0.09847365829640571,
"grad_norm": 0.18886986374855042,
"learning_rate": 1.9608074839980305e-05,
"loss": 0.0319,
"step": 200
},
{
"epoch": 0.103397341211226,
"grad_norm": 0.07388991117477417,
"learning_rate": 1.9588380108321025e-05,
"loss": 0.0142,
"step": 210
},
{
"epoch": 0.10832102412604629,
"grad_norm": 0.24932511150836945,
"learning_rate": 1.9568685376661745e-05,
"loss": 0.015,
"step": 220
},
{
"epoch": 0.11324470704086657,
"grad_norm": 0.0711674615740776,
"learning_rate": 1.9548990645002465e-05,
"loss": 0.0168,
"step": 230
},
{
"epoch": 0.11816838995568685,
"grad_norm": 0.06821451336145401,
"learning_rate": 1.952929591334318e-05,
"loss": 0.0121,
"step": 240
},
{
"epoch": 0.12309207287050714,
"grad_norm": 0.06332361698150635,
"learning_rate": 1.95096011816839e-05,
"loss": 0.0238,
"step": 250
},
{
"epoch": 0.12801575578532742,
"grad_norm": 0.06404373049736023,
"learning_rate": 1.948990645002462e-05,
"loss": 0.0158,
"step": 260
},
{
"epoch": 0.1329394387001477,
"grad_norm": 0.06331443041563034,
"learning_rate": 1.9470211718365338e-05,
"loss": 0.0107,
"step": 270
},
{
"epoch": 0.137863121614968,
"grad_norm": 0.10350039601325989,
"learning_rate": 1.9450516986706058e-05,
"loss": 0.0225,
"step": 280
},
{
"epoch": 0.14278680452978829,
"grad_norm": 0.06487182527780533,
"learning_rate": 1.9430822255046778e-05,
"loss": 0.0094,
"step": 290
},
{
"epoch": 0.14771048744460857,
"grad_norm": 0.1302100121974945,
"learning_rate": 1.9411127523387498e-05,
"loss": 0.01,
"step": 300
},
{
"epoch": 0.15263417035942886,
"grad_norm": 0.08045461773872375,
"learning_rate": 1.9391432791728214e-05,
"loss": 0.0091,
"step": 310
},
{
"epoch": 0.15755785327424915,
"grad_norm": 2.8434016704559326,
"learning_rate": 1.9371738060068934e-05,
"loss": 0.0408,
"step": 320
},
{
"epoch": 0.16248153618906944,
"grad_norm": 0.061778053641319275,
"learning_rate": 1.935204332840965e-05,
"loss": 0.0463,
"step": 330
},
{
"epoch": 0.1674052191038897,
"grad_norm": 0.0558575838804245,
"learning_rate": 1.933234859675037e-05,
"loss": 0.009,
"step": 340
},
{
"epoch": 0.17232890201870998,
"grad_norm": 0.10792255401611328,
"learning_rate": 1.931265386509109e-05,
"loss": 0.0077,
"step": 350
},
{
"epoch": 0.17725258493353027,
"grad_norm": 0.05559010058641434,
"learning_rate": 1.9292959133431807e-05,
"loss": 0.008,
"step": 360
},
{
"epoch": 0.18217626784835056,
"grad_norm": 0.0584879107773304,
"learning_rate": 1.9273264401772527e-05,
"loss": 0.0097,
"step": 370
},
{
"epoch": 0.18709995076317085,
"grad_norm": 0.05132288858294487,
"learning_rate": 1.9253569670113247e-05,
"loss": 0.0139,
"step": 380
},
{
"epoch": 0.19202363367799113,
"grad_norm": 0.052865270525217056,
"learning_rate": 1.9233874938453963e-05,
"loss": 0.0073,
"step": 390
},
{
"epoch": 0.19694731659281142,
"grad_norm": 0.06938347965478897,
"learning_rate": 1.9214180206794683e-05,
"loss": 0.008,
"step": 400
},
{
"epoch": 0.2018709995076317,
"grad_norm": 0.06304794549942017,
"learning_rate": 1.9194485475135403e-05,
"loss": 0.0067,
"step": 410
},
{
"epoch": 0.206794682422452,
"grad_norm": 0.06260755658149719,
"learning_rate": 1.9174790743476123e-05,
"loss": 0.0059,
"step": 420
},
{
"epoch": 0.21171836533727229,
"grad_norm": 0.043609410524368286,
"learning_rate": 1.915509601181684e-05,
"loss": 0.0066,
"step": 430
},
{
"epoch": 0.21664204825209257,
"grad_norm": 0.0525021031498909,
"learning_rate": 1.913540128015756e-05,
"loss": 0.0062,
"step": 440
},
{
"epoch": 0.22156573116691286,
"grad_norm": 0.043818939477205276,
"learning_rate": 1.911570654849828e-05,
"loss": 0.006,
"step": 450
},
{
"epoch": 0.22648941408173315,
"grad_norm": 0.05230528861284256,
"learning_rate": 1.9096011816838996e-05,
"loss": 0.0055,
"step": 460
},
{
"epoch": 0.23141309699655344,
"grad_norm": 0.042306121438741684,
"learning_rate": 1.9076317085179716e-05,
"loss": 0.0056,
"step": 470
},
{
"epoch": 0.2363367799113737,
"grad_norm": 4.103374481201172,
"learning_rate": 1.9056622353520436e-05,
"loss": 0.0389,
"step": 480
},
{
"epoch": 0.24126046282619398,
"grad_norm": 0.04262426868081093,
"learning_rate": 1.9036927621861155e-05,
"loss": 0.0347,
"step": 490
},
{
"epoch": 0.24618414574101427,
"grad_norm": 0.04224070906639099,
"learning_rate": 1.9017232890201872e-05,
"loss": 0.0051,
"step": 500
},
{
"epoch": 0.2511078286558346,
"grad_norm": 0.05894336849451065,
"learning_rate": 1.8997538158542592e-05,
"loss": 0.1076,
"step": 510
},
{
"epoch": 0.25603151157065485,
"grad_norm": 0.045038461685180664,
"learning_rate": 1.8977843426883312e-05,
"loss": 0.0072,
"step": 520
},
{
"epoch": 0.26095519448547516,
"grad_norm": 0.04939822852611542,
"learning_rate": 1.895814869522403e-05,
"loss": 0.0065,
"step": 530
},
{
"epoch": 0.2658788774002954,
"grad_norm": 0.04498062655329704,
"learning_rate": 1.8938453963564748e-05,
"loss": 0.0067,
"step": 540
},
{
"epoch": 0.2708025603151157,
"grad_norm": 0.04091101884841919,
"learning_rate": 1.8918759231905465e-05,
"loss": 0.0053,
"step": 550
},
{
"epoch": 0.275726243229936,
"grad_norm": 0.040018483996391296,
"learning_rate": 1.8899064500246185e-05,
"loss": 0.0619,
"step": 560
},
{
"epoch": 0.28064992614475626,
"grad_norm": 2.4129717350006104,
"learning_rate": 1.8879369768586905e-05,
"loss": 0.0524,
"step": 570
},
{
"epoch": 0.28557360905957657,
"grad_norm": 0.07172203063964844,
"learning_rate": 1.885967503692762e-05,
"loss": 0.0088,
"step": 580
},
{
"epoch": 0.29049729197439683,
"grad_norm": 1.157906174659729,
"learning_rate": 1.883998030526834e-05,
"loss": 0.0067,
"step": 590
},
{
"epoch": 0.29542097488921715,
"grad_norm": 0.04644926264882088,
"learning_rate": 1.882028557360906e-05,
"loss": 0.0052,
"step": 600
},
{
"epoch": 0.3003446578040374,
"grad_norm": 0.04166566953063011,
"learning_rate": 1.880059084194978e-05,
"loss": 0.005,
"step": 610
},
{
"epoch": 0.3052683407188577,
"grad_norm": 0.039835572242736816,
"learning_rate": 1.8780896110290497e-05,
"loss": 0.0376,
"step": 620
},
{
"epoch": 0.310192023633678,
"grad_norm": 0.04242682084441185,
"learning_rate": 1.8761201378631217e-05,
"loss": 0.006,
"step": 630
},
{
"epoch": 0.3151157065484983,
"grad_norm": 0.06742667406797409,
"learning_rate": 1.8741506646971937e-05,
"loss": 0.0049,
"step": 640
},
{
"epoch": 0.32003938946331856,
"grad_norm": 0.03444899246096611,
"learning_rate": 1.8721811915312657e-05,
"loss": 0.0057,
"step": 650
},
{
"epoch": 0.3249630723781389,
"grad_norm": 0.06320315599441528,
"learning_rate": 1.8702117183653373e-05,
"loss": 0.0049,
"step": 660
},
{
"epoch": 0.32988675529295913,
"grad_norm": 0.03362872451543808,
"learning_rate": 1.8682422451994093e-05,
"loss": 0.0041,
"step": 670
},
{
"epoch": 0.3348104382077794,
"grad_norm": 0.03198781982064247,
"learning_rate": 1.8662727720334813e-05,
"loss": 0.0046,
"step": 680
},
{
"epoch": 0.3397341211225997,
"grad_norm": 0.046496450901031494,
"learning_rate": 1.864303298867553e-05,
"loss": 0.0189,
"step": 690
},
{
"epoch": 0.34465780403741997,
"grad_norm": 0.03556285426020622,
"learning_rate": 1.862333825701625e-05,
"loss": 0.0039,
"step": 700
},
{
"epoch": 0.3495814869522403,
"grad_norm": 0.040747638791799545,
"learning_rate": 1.860364352535697e-05,
"loss": 0.0054,
"step": 710
},
{
"epoch": 0.35450516986706054,
"grad_norm": 0.030371148139238358,
"learning_rate": 1.858394879369769e-05,
"loss": 0.0048,
"step": 720
},
{
"epoch": 0.35942885278188086,
"grad_norm": 0.03187814727425575,
"learning_rate": 1.8564254062038406e-05,
"loss": 0.0377,
"step": 730
},
{
"epoch": 0.3643525356967011,
"grad_norm": 0.03332262486219406,
"learning_rate": 1.8544559330379126e-05,
"loss": 0.078,
"step": 740
},
{
"epoch": 0.36927621861152143,
"grad_norm": 0.03905538097023964,
"learning_rate": 1.8524864598719846e-05,
"loss": 0.0158,
"step": 750
},
{
"epoch": 0.3741999015263417,
"grad_norm": 0.034239765256643295,
"learning_rate": 1.8505169867060562e-05,
"loss": 0.0047,
"step": 760
},
{
"epoch": 0.379123584441162,
"grad_norm": 0.029871227219700813,
"learning_rate": 1.8485475135401282e-05,
"loss": 0.0502,
"step": 770
},
{
"epoch": 0.38404726735598227,
"grad_norm": 2.714010238647461,
"learning_rate": 1.8465780403742e-05,
"loss": 0.0311,
"step": 780
},
{
"epoch": 0.3889709502708026,
"grad_norm": 0.030506784096360207,
"learning_rate": 1.844608567208272e-05,
"loss": 0.0042,
"step": 790
},
{
"epoch": 0.39389463318562284,
"grad_norm": 0.03251843899488449,
"learning_rate": 1.842639094042344e-05,
"loss": 0.0043,
"step": 800
},
{
"epoch": 0.3988183161004431,
"grad_norm": 0.8572911620140076,
"learning_rate": 1.8406696208764155e-05,
"loss": 0.0081,
"step": 810
},
{
"epoch": 0.4037419990152634,
"grad_norm": 0.03934726119041443,
"learning_rate": 1.8387001477104875e-05,
"loss": 0.0038,
"step": 820
},
{
"epoch": 0.4086656819300837,
"grad_norm": 0.027939526364207268,
"learning_rate": 1.8367306745445595e-05,
"loss": 0.0035,
"step": 830
},
{
"epoch": 0.413589364844904,
"grad_norm": 0.0301466416567564,
"learning_rate": 1.8347612013786315e-05,
"loss": 0.029,
"step": 840
},
{
"epoch": 0.41851304775972425,
"grad_norm": 0.026854408904910088,
"learning_rate": 1.832791728212703e-05,
"loss": 0.0035,
"step": 850
},
{
"epoch": 0.42343673067454457,
"grad_norm": 0.02816801145672798,
"learning_rate": 1.830822255046775e-05,
"loss": 0.0158,
"step": 860
},
{
"epoch": 0.42836041358936483,
"grad_norm": 0.025995774194598198,
"learning_rate": 1.828852781880847e-05,
"loss": 0.0032,
"step": 870
},
{
"epoch": 0.43328409650418515,
"grad_norm": 0.02553786151111126,
"learning_rate": 1.8268833087149188e-05,
"loss": 0.0039,
"step": 880
},
{
"epoch": 0.4382077794190054,
"grad_norm": 0.025080446153879166,
"learning_rate": 1.8249138355489908e-05,
"loss": 0.0032,
"step": 890
},
{
"epoch": 0.4431314623338257,
"grad_norm": 0.02811121568083763,
"learning_rate": 1.8229443623830627e-05,
"loss": 0.0033,
"step": 900
},
{
"epoch": 0.448055145248646,
"grad_norm": 0.025451993569731712,
"learning_rate": 1.8209748892171347e-05,
"loss": 0.0032,
"step": 910
},
{
"epoch": 0.4529788281634663,
"grad_norm": 0.02798454649746418,
"learning_rate": 1.8190054160512064e-05,
"loss": 0.0806,
"step": 920
},
{
"epoch": 0.45790251107828656,
"grad_norm": 0.03505368530750275,
"learning_rate": 1.8170359428852784e-05,
"loss": 0.0293,
"step": 930
},
{
"epoch": 0.46282619399310687,
"grad_norm": 0.02549416944384575,
"learning_rate": 1.8150664697193504e-05,
"loss": 0.0037,
"step": 940
},
{
"epoch": 0.46774987690792713,
"grad_norm": 0.0305502787232399,
"learning_rate": 1.8130969965534224e-05,
"loss": 0.0035,
"step": 950
},
{
"epoch": 0.4726735598227474,
"grad_norm": 0.023494524881243706,
"learning_rate": 1.811127523387494e-05,
"loss": 0.0032,
"step": 960
},
{
"epoch": 0.4775972427375677,
"grad_norm": 0.02340528555214405,
"learning_rate": 1.809158050221566e-05,
"loss": 0.0628,
"step": 970
},
{
"epoch": 0.48252092565238797,
"grad_norm": 0.02350657619535923,
"learning_rate": 1.8071885770556377e-05,
"loss": 0.0038,
"step": 980
},
{
"epoch": 0.4874446085672083,
"grad_norm": 0.02277735061943531,
"learning_rate": 1.8052191038897096e-05,
"loss": 0.0033,
"step": 990
},
{
"epoch": 0.49236829148202854,
"grad_norm": 0.022285617887973785,
"learning_rate": 1.8032496307237813e-05,
"loss": 0.0032,
"step": 1000
},
{
"epoch": 0.49729197439684886,
"grad_norm": 0.0286717526614666,
"learning_rate": 1.8012801575578533e-05,
"loss": 0.0034,
"step": 1010
},
{
"epoch": 0.5022156573116692,
"grad_norm": 0.03329790011048317,
"learning_rate": 1.7993106843919253e-05,
"loss": 0.0032,
"step": 1020
},
{
"epoch": 0.5071393402264894,
"grad_norm": 0.022517314180731773,
"learning_rate": 1.7973412112259973e-05,
"loss": 0.0028,
"step": 1030
},
{
"epoch": 0.5120630231413097,
"grad_norm": 0.021621420979499817,
"learning_rate": 1.795371738060069e-05,
"loss": 0.003,
"step": 1040
},
{
"epoch": 0.51698670605613,
"grad_norm": 0.021763555705547333,
"learning_rate": 1.793402264894141e-05,
"loss": 0.0027,
"step": 1050
},
{
"epoch": 0.5219103889709503,
"grad_norm": 0.021067790687084198,
"learning_rate": 1.791432791728213e-05,
"loss": 0.0029,
"step": 1060
},
{
"epoch": 0.5268340718857706,
"grad_norm": 0.02510879375040531,
"learning_rate": 1.789463318562285e-05,
"loss": 0.0027,
"step": 1070
},
{
"epoch": 0.5317577548005908,
"grad_norm": 0.02315102145075798,
"learning_rate": 1.7874938453963565e-05,
"loss": 0.0026,
"step": 1080
},
{
"epoch": 0.5366814377154111,
"grad_norm": 0.02186872623860836,
"learning_rate": 1.7855243722304285e-05,
"loss": 0.0029,
"step": 1090
},
{
"epoch": 0.5416051206302314,
"grad_norm": 0.019837426021695137,
"learning_rate": 1.7835548990645005e-05,
"loss": 0.0025,
"step": 1100
},
{
"epoch": 0.5465288035450517,
"grad_norm": 0.02163524180650711,
"learning_rate": 1.7815854258985722e-05,
"loss": 0.0403,
"step": 1110
},
{
"epoch": 0.551452486459872,
"grad_norm": 0.02067442610859871,
"learning_rate": 1.779615952732644e-05,
"loss": 0.0252,
"step": 1120
},
{
"epoch": 0.5563761693746923,
"grad_norm": 0.020983709022402763,
"learning_rate": 1.777646479566716e-05,
"loss": 0.0083,
"step": 1130
},
{
"epoch": 0.5612998522895125,
"grad_norm": 0.01975114829838276,
"learning_rate": 1.775677006400788e-05,
"loss": 0.0085,
"step": 1140
},
{
"epoch": 0.5662235352043329,
"grad_norm": 0.02342822030186653,
"learning_rate": 1.7737075332348598e-05,
"loss": 0.0287,
"step": 1150
},
{
"epoch": 0.5711472181191531,
"grad_norm": 0.021341597661376,
"learning_rate": 1.7717380600689318e-05,
"loss": 0.0123,
"step": 1160
},
{
"epoch": 0.5760709010339734,
"grad_norm": 1.7539845705032349,
"learning_rate": 1.7697685869030038e-05,
"loss": 0.0082,
"step": 1170
},
{
"epoch": 0.5809945839487937,
"grad_norm": 0.026660829782485962,
"learning_rate": 1.7677991137370754e-05,
"loss": 0.0056,
"step": 1180
},
{
"epoch": 0.585918266863614,
"grad_norm": 0.01867399923503399,
"learning_rate": 1.7658296405711474e-05,
"loss": 0.0024,
"step": 1190
},
{
"epoch": 0.5908419497784343,
"grad_norm": 0.019043035805225372,
"learning_rate": 1.763860167405219e-05,
"loss": 0.0022,
"step": 1200
},
{
"epoch": 0.5957656326932546,
"grad_norm": 0.018355844542384148,
"learning_rate": 1.761890694239291e-05,
"loss": 0.0025,
"step": 1210
},
{
"epoch": 0.6006893156080748,
"grad_norm": 0.01786232925951481,
"learning_rate": 1.759921221073363e-05,
"loss": 0.0022,
"step": 1220
},
{
"epoch": 0.6056129985228951,
"grad_norm": 0.01761608012020588,
"learning_rate": 1.7579517479074347e-05,
"loss": 0.0026,
"step": 1230
},
{
"epoch": 0.6105366814377154,
"grad_norm": 0.018296098336577415,
"learning_rate": 1.7559822747415067e-05,
"loss": 0.0022,
"step": 1240
},
{
"epoch": 0.6154603643525357,
"grad_norm": 0.017041673883795738,
"learning_rate": 1.7540128015755787e-05,
"loss": 0.0024,
"step": 1250
},
{
"epoch": 0.620384047267356,
"grad_norm": 0.016793906688690186,
"learning_rate": 1.7520433284096507e-05,
"loss": 0.002,
"step": 1260
},
{
"epoch": 0.6253077301821762,
"grad_norm": 2.5061357021331787,
"learning_rate": 1.7500738552437223e-05,
"loss": 0.0658,
"step": 1270
},
{
"epoch": 0.6302314130969966,
"grad_norm": 0.024613628163933754,
"learning_rate": 1.7481043820777943e-05,
"loss": 0.0691,
"step": 1280
},
{
"epoch": 0.6351550960118169,
"grad_norm": 0.06863340735435486,
"learning_rate": 1.7461349089118663e-05,
"loss": 0.0388,
"step": 1290
},
{
"epoch": 0.6400787789266371,
"grad_norm": 0.02072734758257866,
"learning_rate": 1.744165435745938e-05,
"loss": 0.0655,
"step": 1300
},
{
"epoch": 0.6450024618414574,
"grad_norm": 0.025477182120084763,
"learning_rate": 1.74219596258001e-05,
"loss": 0.0033,
"step": 1310
},
{
"epoch": 0.6499261447562777,
"grad_norm": 0.01841667853295803,
"learning_rate": 1.740226489414082e-05,
"loss": 0.0085,
"step": 1320
},
{
"epoch": 0.654849827671098,
"grad_norm": 0.024184470996260643,
"learning_rate": 1.738257016248154e-05,
"loss": 0.0028,
"step": 1330
},
{
"epoch": 0.6597735105859183,
"grad_norm": 0.02624395862221718,
"learning_rate": 1.7362875430822256e-05,
"loss": 0.017,
"step": 1340
},
{
"epoch": 0.6646971935007385,
"grad_norm": 0.10919595509767532,
"learning_rate": 1.7343180699162976e-05,
"loss": 0.0023,
"step": 1350
},
{
"epoch": 0.6696208764155588,
"grad_norm": 0.016470756381750107,
"learning_rate": 1.7323485967503696e-05,
"loss": 0.0023,
"step": 1360
},
{
"epoch": 0.6745445593303792,
"grad_norm": 0.016200121492147446,
"learning_rate": 1.7303791235844412e-05,
"loss": 0.0024,
"step": 1370
},
{
"epoch": 0.6794682422451994,
"grad_norm": 0.015988627448678017,
"learning_rate": 1.7284096504185132e-05,
"loss": 0.0021,
"step": 1380
},
{
"epoch": 0.6843919251600197,
"grad_norm": 0.01572404056787491,
"learning_rate": 1.7264401772525852e-05,
"loss": 0.0023,
"step": 1390
},
{
"epoch": 0.6893156080748399,
"grad_norm": 9.45742130279541,
"learning_rate": 1.7244707040866572e-05,
"loss": 0.0434,
"step": 1400
},
{
"epoch": 0.6942392909896603,
"grad_norm": 0.015786904841661453,
"learning_rate": 1.722501230920729e-05,
"loss": 0.0026,
"step": 1410
},
{
"epoch": 0.6991629739044806,
"grad_norm": 0.01596219092607498,
"learning_rate": 1.7205317577548005e-05,
"loss": 0.0021,
"step": 1420
},
{
"epoch": 0.7040866568193008,
"grad_norm": 0.01756127178668976,
"learning_rate": 1.7185622845888725e-05,
"loss": 0.0221,
"step": 1430
},
{
"epoch": 0.7090103397341211,
"grad_norm": 0.0174099151045084,
"learning_rate": 1.7165928114229445e-05,
"loss": 0.0024,
"step": 1440
},
{
"epoch": 0.7139340226489415,
"grad_norm": 0.01599658839404583,
"learning_rate": 1.7146233382570165e-05,
"loss": 0.005,
"step": 1450
},
{
"epoch": 0.7188577055637617,
"grad_norm": 0.015532166697084904,
"learning_rate": 1.712653865091088e-05,
"loss": 0.002,
"step": 1460
},
{
"epoch": 0.723781388478582,
"grad_norm": 0.05480727553367615,
"learning_rate": 1.71068439192516e-05,
"loss": 0.0717,
"step": 1470
},
{
"epoch": 0.7287050713934022,
"grad_norm": 0.01733849197626114,
"learning_rate": 1.708714918759232e-05,
"loss": 0.0018,
"step": 1480
},
{
"epoch": 0.7336287543082225,
"grad_norm": 0.028923368081450462,
"learning_rate": 1.7067454455933037e-05,
"loss": 0.002,
"step": 1490
},
{
"epoch": 0.7385524372230429,
"grad_norm": 0.020347867161035538,
"learning_rate": 1.7047759724273757e-05,
"loss": 0.002,
"step": 1500
},
{
"epoch": 0.7434761201378631,
"grad_norm": 0.01512329000979662,
"learning_rate": 1.7028064992614477e-05,
"loss": 0.0018,
"step": 1510
},
{
"epoch": 0.7483998030526834,
"grad_norm": 0.014671691693365574,
"learning_rate": 1.7008370260955197e-05,
"loss": 0.0019,
"step": 1520
},
{
"epoch": 0.7533234859675036,
"grad_norm": 0.015612194314599037,
"learning_rate": 1.6988675529295914e-05,
"loss": 0.0018,
"step": 1530
},
{
"epoch": 0.758247168882324,
"grad_norm": 0.014378263615071774,
"learning_rate": 1.6968980797636634e-05,
"loss": 0.0018,
"step": 1540
},
{
"epoch": 0.7631708517971443,
"grad_norm": 0.013822750188410282,
"learning_rate": 1.6949286065977353e-05,
"loss": 0.0018,
"step": 1550
},
{
"epoch": 0.7680945347119645,
"grad_norm": 0.01382039301097393,
"learning_rate": 1.6929591334318073e-05,
"loss": 0.0017,
"step": 1560
},
{
"epoch": 0.7730182176267848,
"grad_norm": 0.015447800047695637,
"learning_rate": 1.690989660265879e-05,
"loss": 0.0017,
"step": 1570
},
{
"epoch": 0.7779419005416052,
"grad_norm": 0.016364755108952522,
"learning_rate": 1.689020187099951e-05,
"loss": 0.0679,
"step": 1580
},
{
"epoch": 0.7828655834564254,
"grad_norm": 0.01517761405557394,
"learning_rate": 1.687050713934023e-05,
"loss": 0.0018,
"step": 1590
},
{
"epoch": 0.7877892663712457,
"grad_norm": 0.014340780675411224,
"learning_rate": 1.6850812407680946e-05,
"loss": 0.0022,
"step": 1600
},
{
"epoch": 0.792712949286066,
"grad_norm": 0.014129845425486565,
"learning_rate": 1.6831117676021666e-05,
"loss": 0.0018,
"step": 1610
},
{
"epoch": 0.7976366322008862,
"grad_norm": 0.01870601251721382,
"learning_rate": 1.6811422944362386e-05,
"loss": 0.002,
"step": 1620
},
{
"epoch": 0.8025603151157066,
"grad_norm": 0.017389826476573944,
"learning_rate": 1.6791728212703103e-05,
"loss": 0.0019,
"step": 1630
},
{
"epoch": 0.8074839980305268,
"grad_norm": 0.013149221427738667,
"learning_rate": 1.6772033481043822e-05,
"loss": 0.0546,
"step": 1640
},
{
"epoch": 0.8124076809453471,
"grad_norm": 0.013570933602750301,
"learning_rate": 1.675233874938454e-05,
"loss": 0.0043,
"step": 1650
},
{
"epoch": 0.8173313638601674,
"grad_norm": 0.013511077500879765,
"learning_rate": 1.673264401772526e-05,
"loss": 0.0019,
"step": 1660
},
{
"epoch": 0.8222550467749877,
"grad_norm": 0.015698149800300598,
"learning_rate": 1.671294928606598e-05,
"loss": 0.0019,
"step": 1670
},
{
"epoch": 0.827178729689808,
"grad_norm": 0.013056913390755653,
"learning_rate": 1.66932545544067e-05,
"loss": 0.0018,
"step": 1680
},
{
"epoch": 0.8321024126046282,
"grad_norm": 0.015361153520643711,
"learning_rate": 1.6673559822747415e-05,
"loss": 0.002,
"step": 1690
},
{
"epoch": 0.8370260955194485,
"grad_norm": 0.013177746906876564,
"learning_rate": 1.6653865091088135e-05,
"loss": 0.0015,
"step": 1700
},
{
"epoch": 0.8419497784342689,
"grad_norm": 0.05426933988928795,
"learning_rate": 1.6634170359428855e-05,
"loss": 0.0095,
"step": 1710
},
{
"epoch": 0.8468734613490891,
"grad_norm": 0.01309322752058506,
"learning_rate": 1.661447562776957e-05,
"loss": 0.0016,
"step": 1720
},
{
"epoch": 0.8517971442639094,
"grad_norm": 0.015039416030049324,
"learning_rate": 1.659478089611029e-05,
"loss": 0.0016,
"step": 1730
},
{
"epoch": 0.8567208271787297,
"grad_norm": 0.014542197808623314,
"learning_rate": 1.657508616445101e-05,
"loss": 0.0018,
"step": 1740
},
{
"epoch": 0.8616445100935499,
"grad_norm": 0.012222396209836006,
"learning_rate": 1.655539143279173e-05,
"loss": 0.0014,
"step": 1750
},
{
"epoch": 0.8665681930083703,
"grad_norm": 0.012340564280748367,
"learning_rate": 1.6535696701132448e-05,
"loss": 0.0014,
"step": 1760
},
{
"epoch": 0.8714918759231906,
"grad_norm": 0.011919394135475159,
"learning_rate": 1.6516001969473168e-05,
"loss": 0.0016,
"step": 1770
},
{
"epoch": 0.8764155588380108,
"grad_norm": 0.01167115569114685,
"learning_rate": 1.6496307237813888e-05,
"loss": 0.0016,
"step": 1780
},
{
"epoch": 0.8813392417528311,
"grad_norm": 0.013182350434362888,
"learning_rate": 1.6476612506154604e-05,
"loss": 0.0015,
"step": 1790
},
{
"epoch": 0.8862629246676514,
"grad_norm": 0.013948991894721985,
"learning_rate": 1.6456917774495324e-05,
"loss": 0.0014,
"step": 1800
},
{
"epoch": 0.8911866075824717,
"grad_norm": 0.01775103434920311,
"learning_rate": 1.6437223042836044e-05,
"loss": 0.0015,
"step": 1810
},
{
"epoch": 0.896110290497292,
"grad_norm": 0.01155338529497385,
"learning_rate": 1.6417528311176764e-05,
"loss": 0.002,
"step": 1820
},
{
"epoch": 0.9010339734121122,
"grad_norm": 0.014259099029004574,
"learning_rate": 1.639783357951748e-05,
"loss": 0.0016,
"step": 1830
},
{
"epoch": 0.9059576563269326,
"grad_norm": 0.014643428847193718,
"learning_rate": 1.6378138847858197e-05,
"loss": 0.0014,
"step": 1840
},
{
"epoch": 0.9108813392417529,
"grad_norm": 0.011148291639983654,
"learning_rate": 1.6358444116198917e-05,
"loss": 0.0706,
"step": 1850
},
{
"epoch": 0.9158050221565731,
"grad_norm": 0.012168935500085354,
"learning_rate": 1.6338749384539637e-05,
"loss": 0.0014,
"step": 1860
},
{
"epoch": 0.9207287050713934,
"grad_norm": 0.011070801876485348,
"learning_rate": 1.6319054652880356e-05,
"loss": 0.0018,
"step": 1870
},
{
"epoch": 0.9256523879862137,
"grad_norm": 0.023191597312688828,
"learning_rate": 1.6299359921221073e-05,
"loss": 0.0023,
"step": 1880
},
{
"epoch": 0.930576070901034,
"grad_norm": 0.0227424968034029,
"learning_rate": 1.6279665189561793e-05,
"loss": 0.0867,
"step": 1890
},
{
"epoch": 0.9354997538158543,
"grad_norm": 0.023284750059247017,
"learning_rate": 1.6259970457902513e-05,
"loss": 0.002,
"step": 1900
},
{
"epoch": 0.9404234367306745,
"grad_norm": 0.013480834662914276,
"learning_rate": 1.624027572624323e-05,
"loss": 0.0028,
"step": 1910
},
{
"epoch": 0.9453471196454948,
"grad_norm": 0.0108991339802742,
"learning_rate": 1.622058099458395e-05,
"loss": 0.0084,
"step": 1920
},
{
"epoch": 0.9502708025603152,
"grad_norm": 0.011104236356914043,
"learning_rate": 1.620088626292467e-05,
"loss": 0.0104,
"step": 1930
},
{
"epoch": 0.9551944854751354,
"grad_norm": 0.011128348298370838,
"learning_rate": 1.618119153126539e-05,
"loss": 0.0018,
"step": 1940
},
{
"epoch": 0.9601181683899557,
"grad_norm": 0.01104864850640297,
"learning_rate": 1.6161496799606106e-05,
"loss": 0.0014,
"step": 1950
},
{
"epoch": 0.9650418513047759,
"grad_norm": 0.03145885095000267,
"learning_rate": 1.6141802067946825e-05,
"loss": 0.0086,
"step": 1960
},
{
"epoch": 0.9699655342195963,
"grad_norm": 0.012039147317409515,
"learning_rate": 1.6122107336287545e-05,
"loss": 0.0512,
"step": 1970
},
{
"epoch": 0.9748892171344166,
"grad_norm": 0.011414915323257446,
"learning_rate": 1.6102412604628265e-05,
"loss": 0.0015,
"step": 1980
},
{
"epoch": 0.9798129000492368,
"grad_norm": 0.013891511596739292,
"learning_rate": 1.6082717872968982e-05,
"loss": 0.0015,
"step": 1990
},
{
"epoch": 0.9847365829640571,
"grad_norm": 0.010472927242517471,
"learning_rate": 1.60630231413097e-05,
"loss": 0.0024,
"step": 2000
},
{
"epoch": 0.9896602658788775,
"grad_norm": 0.010258992202579975,
"learning_rate": 1.604332840965042e-05,
"loss": 0.0015,
"step": 2010
},
{
"epoch": 0.9945839487936977,
"grad_norm": 0.021762333810329437,
"learning_rate": 1.6023633677991138e-05,
"loss": 0.0017,
"step": 2020
},
{
"epoch": 0.999507631708518,
"grad_norm": 0.009938563220202923,
"learning_rate": 1.6003938946331858e-05,
"loss": 0.0014,
"step": 2030
},
{
"epoch": 1.0,
"eval_accuracy": 0.9986048133937914,
"eval_loss": 0.006461690180003643,
"eval_runtime": 124.9038,
"eval_samples_per_second": 22.954,
"eval_steps_per_second": 2.874,
"step": 2031
},
{
"epoch": 1.0044313146233383,
"grad_norm": 0.009946716949343681,
"learning_rate": 1.5984244214672578e-05,
"loss": 0.0013,
"step": 2040
},
{
"epoch": 1.0093549975381586,
"grad_norm": 0.01007277425378561,
"learning_rate": 1.5964549483013298e-05,
"loss": 0.0051,
"step": 2050
},
{
"epoch": 1.0142786804529789,
"grad_norm": 0.009899957105517387,
"learning_rate": 1.5944854751354014e-05,
"loss": 0.0012,
"step": 2060
},
{
"epoch": 1.0192023633677991,
"grad_norm": 0.011700589209794998,
"learning_rate": 1.592516001969473e-05,
"loss": 0.0014,
"step": 2070
},
{
"epoch": 1.0241260462826194,
"grad_norm": 0.01607823744416237,
"learning_rate": 1.590546528803545e-05,
"loss": 0.0627,
"step": 2080
},
{
"epoch": 1.0290497291974396,
"grad_norm": 0.010469055734574795,
"learning_rate": 1.588577055637617e-05,
"loss": 0.0013,
"step": 2090
},
{
"epoch": 1.03397341211226,
"grad_norm": 0.010042490437626839,
"learning_rate": 1.586607582471689e-05,
"loss": 0.0014,
"step": 2100
},
{
"epoch": 1.0388970950270802,
"grad_norm": 0.00974634476006031,
"learning_rate": 1.5846381093057607e-05,
"loss": 0.0015,
"step": 2110
},
{
"epoch": 1.0438207779419006,
"grad_norm": 0.026304766535758972,
"learning_rate": 1.5826686361398327e-05,
"loss": 0.0017,
"step": 2120
},
{
"epoch": 1.048744460856721,
"grad_norm": 0.022236375138163567,
"learning_rate": 1.5806991629739047e-05,
"loss": 0.0013,
"step": 2130
},
{
"epoch": 1.0536681437715412,
"grad_norm": 0.025571007281541824,
"learning_rate": 1.5787296898079763e-05,
"loss": 0.0017,
"step": 2140
},
{
"epoch": 1.0585918266863614,
"grad_norm": 0.009789519011974335,
"learning_rate": 1.5767602166420483e-05,
"loss": 0.0012,
"step": 2150
},
{
"epoch": 1.0635155096011817,
"grad_norm": 0.009336259216070175,
"learning_rate": 1.5747907434761203e-05,
"loss": 0.0014,
"step": 2160
},
{
"epoch": 1.068439192516002,
"grad_norm": 0.009336400777101517,
"learning_rate": 1.5728212703101923e-05,
"loss": 0.0012,
"step": 2170
},
{
"epoch": 1.0733628754308222,
"grad_norm": 0.009125406853854656,
"learning_rate": 1.570851797144264e-05,
"loss": 0.0011,
"step": 2180
},
{
"epoch": 1.0782865583456425,
"grad_norm": 0.008981688879430294,
"learning_rate": 1.568882323978336e-05,
"loss": 0.0012,
"step": 2190
},
{
"epoch": 1.0832102412604627,
"grad_norm": 0.008810392580926418,
"learning_rate": 1.566912850812408e-05,
"loss": 0.0011,
"step": 2200
},
{
"epoch": 1.0881339241752832,
"grad_norm": 0.02159319818019867,
"learning_rate": 1.5649433776464796e-05,
"loss": 0.0012,
"step": 2210
},
{
"epoch": 1.0930576070901035,
"grad_norm": 2.4877359867095947,
"learning_rate": 1.5629739044805516e-05,
"loss": 0.0381,
"step": 2220
},
{
"epoch": 1.0979812900049237,
"grad_norm": 0.011043106205761433,
"learning_rate": 1.5610044313146236e-05,
"loss": 0.0012,
"step": 2230
},
{
"epoch": 1.102904972919744,
"grad_norm": 0.012737146578729153,
"learning_rate": 1.5590349581486956e-05,
"loss": 0.0017,
"step": 2240
},
{
"epoch": 1.1078286558345642,
"grad_norm": 0.045515026897192,
"learning_rate": 1.5570654849827672e-05,
"loss": 0.0013,
"step": 2250
},
{
"epoch": 1.1127523387493845,
"grad_norm": 0.010885367169976234,
"learning_rate": 1.5550960118168392e-05,
"loss": 0.0013,
"step": 2260
},
{
"epoch": 1.1176760216642048,
"grad_norm": 0.010456659831106663,
"learning_rate": 1.553126538650911e-05,
"loss": 0.0011,
"step": 2270
},
{
"epoch": 1.122599704579025,
"grad_norm": 0.008621557615697384,
"learning_rate": 1.551157065484983e-05,
"loss": 0.0013,
"step": 2280
},
{
"epoch": 1.1275233874938455,
"grad_norm": 0.008608179166913033,
"learning_rate": 1.549187592319055e-05,
"loss": 0.001,
"step": 2290
},
{
"epoch": 1.1324470704086658,
"grad_norm": 0.008878961205482483,
"learning_rate": 1.5472181191531265e-05,
"loss": 0.0599,
"step": 2300
},
{
"epoch": 1.137370753323486,
"grad_norm": 0.008603103458881378,
"learning_rate": 1.5452486459871985e-05,
"loss": 0.0013,
"step": 2310
},
{
"epoch": 1.1422944362383063,
"grad_norm": 0.008407089859247208,
"learning_rate": 1.5432791728212705e-05,
"loss": 0.0166,
"step": 2320
},
{
"epoch": 1.1472181191531265,
"grad_norm": 1.1491219997406006,
"learning_rate": 1.541309699655342e-05,
"loss": 0.0027,
"step": 2330
},
{
"epoch": 1.1521418020679468,
"grad_norm": 0.008799925446510315,
"learning_rate": 1.539340226489414e-05,
"loss": 0.0064,
"step": 2340
},
{
"epoch": 1.157065484982767,
"grad_norm": 0.102848581969738,
"learning_rate": 1.537370753323486e-05,
"loss": 0.0014,
"step": 2350
},
{
"epoch": 1.1619891678975873,
"grad_norm": 0.01129199005663395,
"learning_rate": 1.535401280157558e-05,
"loss": 0.0628,
"step": 2360
},
{
"epoch": 1.1669128508124076,
"grad_norm": 0.011105939745903015,
"learning_rate": 1.5334318069916297e-05,
"loss": 0.0322,
"step": 2370
},
{
"epoch": 1.171836533727228,
"grad_norm": 0.009356286376714706,
"learning_rate": 1.5314623338257017e-05,
"loss": 0.0023,
"step": 2380
},
{
"epoch": 1.1767602166420483,
"grad_norm": 0.010616080835461617,
"learning_rate": 1.5294928606597737e-05,
"loss": 0.0016,
"step": 2390
},
{
"epoch": 1.1816838995568686,
"grad_norm": 0.024889415130019188,
"learning_rate": 1.5275233874938454e-05,
"loss": 0.0012,
"step": 2400
},
{
"epoch": 1.1866075824716888,
"grad_norm": 0.013439149595797062,
"learning_rate": 1.5255539143279174e-05,
"loss": 0.0054,
"step": 2410
},
{
"epoch": 1.191531265386509,
"grad_norm": 0.008294392377138138,
"learning_rate": 1.5235844411619894e-05,
"loss": 0.0013,
"step": 2420
},
{
"epoch": 1.1964549483013294,
"grad_norm": 0.008172878995537758,
"learning_rate": 1.5216149679960612e-05,
"loss": 0.001,
"step": 2430
},
{
"epoch": 1.2013786312161496,
"grad_norm": 0.008062479086220264,
"learning_rate": 1.5196454948301332e-05,
"loss": 0.001,
"step": 2440
},
{
"epoch": 1.2063023141309699,
"grad_norm": 0.015131563879549503,
"learning_rate": 1.517676021664205e-05,
"loss": 0.0011,
"step": 2450
},
{
"epoch": 1.2112259970457901,
"grad_norm": 0.017097556963562965,
"learning_rate": 1.515706548498277e-05,
"loss": 0.025,
"step": 2460
},
{
"epoch": 1.2161496799606106,
"grad_norm": 0.011799236759543419,
"learning_rate": 1.5137370753323488e-05,
"loss": 0.0011,
"step": 2470
},
{
"epoch": 1.221073362875431,
"grad_norm": 0.024478154256939888,
"learning_rate": 1.5117676021664208e-05,
"loss": 0.0732,
"step": 2480
},
{
"epoch": 1.2259970457902511,
"grad_norm": 0.013807065784931183,
"learning_rate": 1.5097981290004924e-05,
"loss": 0.0014,
"step": 2490
},
{
"epoch": 1.2309207287050714,
"grad_norm": 0.010231712833046913,
"learning_rate": 1.5078286558345643e-05,
"loss": 0.0014,
"step": 2500
},
{
"epoch": 1.2358444116198917,
"grad_norm": 0.012537546455860138,
"learning_rate": 1.5058591826686363e-05,
"loss": 0.0626,
"step": 2510
},
{
"epoch": 1.240768094534712,
"grad_norm": 0.00985932070761919,
"learning_rate": 1.503889709502708e-05,
"loss": 0.0012,
"step": 2520
},
{
"epoch": 1.2456917774495322,
"grad_norm": 0.01136948075145483,
"learning_rate": 1.5019202363367799e-05,
"loss": 0.0028,
"step": 2530
},
{
"epoch": 1.2506154603643527,
"grad_norm": 0.009887000545859337,
"learning_rate": 1.4999507631708519e-05,
"loss": 0.0012,
"step": 2540
},
{
"epoch": 1.2555391432791727,
"grad_norm": 0.009588898159563541,
"learning_rate": 1.4979812900049237e-05,
"loss": 0.0018,
"step": 2550
},
{
"epoch": 1.2604628261939932,
"grad_norm": 0.00931033119559288,
"learning_rate": 1.4960118168389957e-05,
"loss": 0.001,
"step": 2560
},
{
"epoch": 1.2653865091088135,
"grad_norm": 0.008880667388439178,
"learning_rate": 1.4940423436730675e-05,
"loss": 0.001,
"step": 2570
},
{
"epoch": 1.2703101920236337,
"grad_norm": 0.008290020748972893,
"learning_rate": 1.4920728705071395e-05,
"loss": 0.0022,
"step": 2580
},
{
"epoch": 1.275233874938454,
"grad_norm": 0.013004295527935028,
"learning_rate": 1.4901033973412113e-05,
"loss": 0.0011,
"step": 2590
},
{
"epoch": 1.2801575578532742,
"grad_norm": 0.009094899520277977,
"learning_rate": 1.4881339241752833e-05,
"loss": 0.0011,
"step": 2600
},
{
"epoch": 1.2850812407680945,
"grad_norm": 0.007369400467723608,
"learning_rate": 1.4861644510093551e-05,
"loss": 0.0011,
"step": 2610
},
{
"epoch": 1.2900049236829148,
"grad_norm": 0.01753099076449871,
"learning_rate": 1.484194977843427e-05,
"loss": 0.001,
"step": 2620
},
{
"epoch": 1.2949286065977352,
"grad_norm": 0.007790517993271351,
"learning_rate": 1.482225504677499e-05,
"loss": 0.001,
"step": 2630
},
{
"epoch": 1.2998522895125553,
"grad_norm": 0.007512846030294895,
"learning_rate": 1.4802560315115708e-05,
"loss": 0.0013,
"step": 2640
},
{
"epoch": 1.3047759724273758,
"grad_norm": 0.009430987760424614,
"learning_rate": 1.4782865583456428e-05,
"loss": 0.001,
"step": 2650
},
{
"epoch": 1.309699655342196,
"grad_norm": 0.025956837460398674,
"learning_rate": 1.4763170851797146e-05,
"loss": 0.001,
"step": 2660
},
{
"epoch": 1.3146233382570163,
"grad_norm": 0.007779216393828392,
"learning_rate": 1.4743476120137866e-05,
"loss": 0.006,
"step": 2670
},
{
"epoch": 1.3195470211718365,
"grad_norm": 0.0102881183847785,
"learning_rate": 1.4723781388478584e-05,
"loss": 0.0027,
"step": 2680
},
{
"epoch": 1.3244707040866568,
"grad_norm": 0.007080462761223316,
"learning_rate": 1.4704086656819302e-05,
"loss": 0.0011,
"step": 2690
},
{
"epoch": 1.329394387001477,
"grad_norm": 0.0068712919019162655,
"learning_rate": 1.4684391925160022e-05,
"loss": 0.0009,
"step": 2700
},
{
"epoch": 1.3343180699162973,
"grad_norm": 0.006963066756725311,
"learning_rate": 1.4664697193500739e-05,
"loss": 0.0008,
"step": 2710
},
{
"epoch": 1.3392417528311178,
"grad_norm": 0.007015050854533911,
"learning_rate": 1.4645002461841459e-05,
"loss": 0.0068,
"step": 2720
},
{
"epoch": 1.3441654357459378,
"grad_norm": 0.006741439923644066,
"learning_rate": 1.4625307730182177e-05,
"loss": 0.0008,
"step": 2730
},
{
"epoch": 1.3490891186607583,
"grad_norm": 0.009445318952202797,
"learning_rate": 1.4605612998522895e-05,
"loss": 0.0008,
"step": 2740
},
{
"epoch": 1.3540128015755786,
"grad_norm": 0.006745448801666498,
"learning_rate": 1.4585918266863615e-05,
"loss": 0.0009,
"step": 2750
},
{
"epoch": 1.3589364844903988,
"grad_norm": 0.0067230272106826305,
"learning_rate": 1.4566223535204333e-05,
"loss": 0.0756,
"step": 2760
},
{
"epoch": 1.363860167405219,
"grad_norm": 0.006957577541470528,
"learning_rate": 1.4546528803545053e-05,
"loss": 0.0009,
"step": 2770
},
{
"epoch": 1.3687838503200394,
"grad_norm": 0.008368587121367455,
"learning_rate": 1.4526834071885771e-05,
"loss": 0.0271,
"step": 2780
},
{
"epoch": 1.3737075332348596,
"grad_norm": 0.008987442590296268,
"learning_rate": 1.4507139340226491e-05,
"loss": 0.001,
"step": 2790
},
{
"epoch": 1.3786312161496799,
"grad_norm": 0.009542476385831833,
"learning_rate": 1.448744460856721e-05,
"loss": 0.0011,
"step": 2800
},
{
"epoch": 1.3835548990645004,
"grad_norm": 0.006845912430435419,
"learning_rate": 1.4467749876907927e-05,
"loss": 0.0167,
"step": 2810
},
{
"epoch": 1.3884785819793206,
"grad_norm": 0.00742871779948473,
"learning_rate": 1.4448055145248647e-05,
"loss": 0.0033,
"step": 2820
},
{
"epoch": 1.3934022648941409,
"grad_norm": 6.05401086807251,
"learning_rate": 1.4428360413589366e-05,
"loss": 0.0399,
"step": 2830
},
{
"epoch": 1.3983259478089611,
"grad_norm": 0.007821030914783478,
"learning_rate": 1.4408665681930086e-05,
"loss": 0.0014,
"step": 2840
},
{
"epoch": 1.4032496307237814,
"grad_norm": 0.006872816011309624,
"learning_rate": 1.4388970950270804e-05,
"loss": 0.0129,
"step": 2850
},
{
"epoch": 1.4081733136386017,
"grad_norm": 0.006989907938987017,
"learning_rate": 1.4369276218611524e-05,
"loss": 0.02,
"step": 2860
},
{
"epoch": 1.413096996553422,
"grad_norm": 0.017713190987706184,
"learning_rate": 1.4349581486952242e-05,
"loss": 0.076,
"step": 2870
},
{
"epoch": 1.4180206794682422,
"grad_norm": 0.009753022342920303,
"learning_rate": 1.4329886755292962e-05,
"loss": 0.0024,
"step": 2880
},
{
"epoch": 1.4229443623830624,
"grad_norm": 0.010295004583895206,
"learning_rate": 1.431019202363368e-05,
"loss": 0.0011,
"step": 2890
},
{
"epoch": 1.427868045297883,
"grad_norm": 0.009071256965398788,
"learning_rate": 1.4290497291974398e-05,
"loss": 0.001,
"step": 2900
},
{
"epoch": 1.4327917282127032,
"grad_norm": 0.009070048108696938,
"learning_rate": 1.4270802560315118e-05,
"loss": 0.001,
"step": 2910
},
{
"epoch": 1.4377154111275234,
"grad_norm": 0.007653309963643551,
"learning_rate": 1.4251107828655835e-05,
"loss": 0.001,
"step": 2920
},
{
"epoch": 1.4426390940423437,
"grad_norm": 0.006937779951840639,
"learning_rate": 1.4231413096996553e-05,
"loss": 0.0008,
"step": 2930
},
{
"epoch": 1.447562776957164,
"grad_norm": 0.008243228308856487,
"learning_rate": 1.4211718365337273e-05,
"loss": 0.001,
"step": 2940
},
{
"epoch": 1.4524864598719842,
"grad_norm": 0.010159426368772984,
"learning_rate": 1.4192023633677991e-05,
"loss": 0.0669,
"step": 2950
},
{
"epoch": 1.4574101427868045,
"grad_norm": 0.009640317410230637,
"learning_rate": 1.417232890201871e-05,
"loss": 0.0244,
"step": 2960
},
{
"epoch": 1.4623338257016247,
"grad_norm": 0.11323712021112442,
"learning_rate": 1.4152634170359429e-05,
"loss": 0.0015,
"step": 2970
},
{
"epoch": 1.467257508616445,
"grad_norm": 0.00797420833259821,
"learning_rate": 1.4132939438700149e-05,
"loss": 0.0365,
"step": 2980
},
{
"epoch": 1.4721811915312655,
"grad_norm": 0.0069627827033400536,
"learning_rate": 1.4113244707040867e-05,
"loss": 0.0036,
"step": 2990
},
{
"epoch": 1.4771048744460857,
"grad_norm": 0.008956373669207096,
"learning_rate": 1.4093549975381587e-05,
"loss": 0.0019,
"step": 3000
},
{
"epoch": 1.482028557360906,
"grad_norm": 0.0071487524546682835,
"learning_rate": 1.4073855243722305e-05,
"loss": 0.0175,
"step": 3010
},
{
"epoch": 1.4869522402757263,
"grad_norm": 0.009181806817650795,
"learning_rate": 1.4054160512063023e-05,
"loss": 0.0012,
"step": 3020
},
{
"epoch": 1.4918759231905465,
"grad_norm": 0.006655732169747353,
"learning_rate": 1.4034465780403743e-05,
"loss": 0.0008,
"step": 3030
},
{
"epoch": 1.4967996061053668,
"grad_norm": 0.007243420463055372,
"learning_rate": 1.4014771048744462e-05,
"loss": 0.0044,
"step": 3040
},
{
"epoch": 1.501723289020187,
"grad_norm": 0.012056315317749977,
"learning_rate": 1.3995076317085181e-05,
"loss": 0.0011,
"step": 3050
},
{
"epoch": 1.5066469719350075,
"grad_norm": 0.007087447214871645,
"learning_rate": 1.39753815854259e-05,
"loss": 0.0008,
"step": 3060
},
{
"epoch": 1.5115706548498276,
"grad_norm": 0.011527610942721367,
"learning_rate": 1.395568685376662e-05,
"loss": 0.0009,
"step": 3070
},
{
"epoch": 1.516494337764648,
"grad_norm": 0.009305012412369251,
"learning_rate": 1.3935992122107338e-05,
"loss": 0.0008,
"step": 3080
},
{
"epoch": 1.5214180206794683,
"grad_norm": 0.010509533807635307,
"learning_rate": 1.3916297390448058e-05,
"loss": 0.0008,
"step": 3090
},
{
"epoch": 1.5263417035942886,
"grad_norm": 0.0072973608039319515,
"learning_rate": 1.3896602658788776e-05,
"loss": 0.0158,
"step": 3100
},
{
"epoch": 1.5312653865091088,
"grad_norm": 0.00608638534322381,
"learning_rate": 1.3876907927129494e-05,
"loss": 0.0097,
"step": 3110
},
{
"epoch": 1.536189069423929,
"grad_norm": 0.018107222393155098,
"learning_rate": 1.3857213195470214e-05,
"loss": 0.0008,
"step": 3120
},
{
"epoch": 1.5411127523387493,
"grad_norm": 0.007667326834052801,
"learning_rate": 1.3837518463810932e-05,
"loss": 0.0009,
"step": 3130
},
{
"epoch": 1.5460364352535696,
"grad_norm": 0.006599190644919872,
"learning_rate": 1.3817823732151649e-05,
"loss": 0.0087,
"step": 3140
},
{
"epoch": 1.55096011816839,
"grad_norm": 0.007009952329099178,
"learning_rate": 1.3798129000492369e-05,
"loss": 0.0012,
"step": 3150
},
{
"epoch": 1.5558838010832101,
"grad_norm": 0.006010917481034994,
"learning_rate": 1.3778434268833087e-05,
"loss": 0.0008,
"step": 3160
},
{
"epoch": 1.5608074839980306,
"grad_norm": 0.006966378074139357,
"learning_rate": 1.3758739537173807e-05,
"loss": 0.0007,
"step": 3170
},
{
"epoch": 1.5657311669128509,
"grad_norm": 0.0060759298503398895,
"learning_rate": 1.3739044805514525e-05,
"loss": 0.0012,
"step": 3180
},
{
"epoch": 1.5706548498276711,
"grad_norm": 0.005752959754317999,
"learning_rate": 1.3719350073855245e-05,
"loss": 0.0007,
"step": 3190
},
{
"epoch": 1.5755785327424914,
"grad_norm": 0.006781155243515968,
"learning_rate": 1.3699655342195963e-05,
"loss": 0.0007,
"step": 3200
},
{
"epoch": 1.5805022156573116,
"grad_norm": 0.005668849218636751,
"learning_rate": 1.3679960610536683e-05,
"loss": 0.0196,
"step": 3210
},
{
"epoch": 1.5854258985721321,
"grad_norm": 0.005629651714116335,
"learning_rate": 1.3660265878877401e-05,
"loss": 0.0007,
"step": 3220
},
{
"epoch": 1.5903495814869522,
"grad_norm": 0.00564204016700387,
"learning_rate": 1.364057114721812e-05,
"loss": 0.0011,
"step": 3230
},
{
"epoch": 1.5952732644017726,
"grad_norm": 0.005540564656257629,
"learning_rate": 1.362087641555884e-05,
"loss": 0.0008,
"step": 3240
},
{
"epoch": 1.6001969473165927,
"grad_norm": 0.006026837043464184,
"learning_rate": 1.3601181683899558e-05,
"loss": 0.0425,
"step": 3250
},
{
"epoch": 1.6051206302314132,
"grad_norm": 0.00552277360111475,
"learning_rate": 1.3581486952240277e-05,
"loss": 0.0008,
"step": 3260
},
{
"epoch": 1.6100443131462334,
"grad_norm": 0.018236679956316948,
"learning_rate": 1.3561792220580996e-05,
"loss": 0.0007,
"step": 3270
},
{
"epoch": 1.6149679960610537,
"grad_norm": 0.00691232131794095,
"learning_rate": 1.3542097488921716e-05,
"loss": 0.0007,
"step": 3280
},
{
"epoch": 1.619891678975874,
"grad_norm": 0.006581007968634367,
"learning_rate": 1.3522402757262434e-05,
"loss": 0.0007,
"step": 3290
},
{
"epoch": 1.6248153618906942,
"grad_norm": 0.0053688762709498405,
"learning_rate": 1.3502708025603154e-05,
"loss": 0.0008,
"step": 3300
},
{
"epoch": 1.6297390448055147,
"grad_norm": 1.0336942672729492,
"learning_rate": 1.3483013293943872e-05,
"loss": 0.0076,
"step": 3310
},
{
"epoch": 1.6346627277203347,
"grad_norm": 0.005269223358482122,
"learning_rate": 1.346331856228459e-05,
"loss": 0.0007,
"step": 3320
},
{
"epoch": 1.6395864106351552,
"grad_norm": 0.008069952018558979,
"learning_rate": 1.344362383062531e-05,
"loss": 0.0008,
"step": 3330
},
{
"epoch": 1.6445100935499752,
"grad_norm": 0.005610567983239889,
"learning_rate": 1.3423929098966028e-05,
"loss": 0.0007,
"step": 3340
},
{
"epoch": 1.6494337764647957,
"grad_norm": 0.005232799798250198,
"learning_rate": 1.3404234367306745e-05,
"loss": 0.0007,
"step": 3350
},
{
"epoch": 1.654357459379616,
"grad_norm": 0.005385094787925482,
"learning_rate": 1.3384539635647465e-05,
"loss": 0.0006,
"step": 3360
},
{
"epoch": 1.6592811422944362,
"grad_norm": 0.005121603608131409,
"learning_rate": 1.3364844903988183e-05,
"loss": 0.0006,
"step": 3370
},
{
"epoch": 1.6642048252092565,
"grad_norm": 0.005196818150579929,
"learning_rate": 1.3345150172328903e-05,
"loss": 0.0007,
"step": 3380
},
{
"epoch": 1.6691285081240768,
"grad_norm": 0.005104544572532177,
"learning_rate": 1.3325455440669621e-05,
"loss": 0.0006,
"step": 3390
},
{
"epoch": 1.6740521910388972,
"grad_norm": 0.0062180510722100735,
"learning_rate": 1.330576070901034e-05,
"loss": 0.0007,
"step": 3400
},
{
"epoch": 1.6789758739537173,
"grad_norm": 0.01003600750118494,
"learning_rate": 1.3286065977351059e-05,
"loss": 0.0007,
"step": 3410
},
{
"epoch": 1.6838995568685378,
"grad_norm": 0.0061742691323161125,
"learning_rate": 1.3266371245691779e-05,
"loss": 0.0018,
"step": 3420
},
{
"epoch": 1.6888232397833578,
"grad_norm": 0.00500025087967515,
"learning_rate": 1.3246676514032497e-05,
"loss": 0.0006,
"step": 3430
},
{
"epoch": 1.6937469226981783,
"grad_norm": 0.009734513238072395,
"learning_rate": 1.3226981782373215e-05,
"loss": 0.0007,
"step": 3440
},
{
"epoch": 1.6986706056129985,
"grad_norm": 0.00670117000117898,
"learning_rate": 1.3207287050713935e-05,
"loss": 0.0007,
"step": 3450
},
{
"epoch": 1.7035942885278188,
"grad_norm": 0.004964028485119343,
"learning_rate": 1.3187592319054653e-05,
"loss": 0.0006,
"step": 3460
},
{
"epoch": 1.708517971442639,
"grad_norm": 0.004891118034720421,
"learning_rate": 1.3167897587395373e-05,
"loss": 0.0007,
"step": 3470
},
{
"epoch": 1.7134416543574593,
"grad_norm": 0.005974843632429838,
"learning_rate": 1.3148202855736092e-05,
"loss": 0.0007,
"step": 3480
},
{
"epoch": 1.7183653372722798,
"grad_norm": 0.005670204292982817,
"learning_rate": 1.3128508124076811e-05,
"loss": 0.0007,
"step": 3490
},
{
"epoch": 1.7232890201870998,
"grad_norm": 0.00542798126116395,
"learning_rate": 1.310881339241753e-05,
"loss": 0.0007,
"step": 3500
},
{
"epoch": 1.7282127031019203,
"grad_norm": 0.005577849689871073,
"learning_rate": 1.308911866075825e-05,
"loss": 0.0006,
"step": 3510
},
{
"epoch": 1.7331363860167404,
"grad_norm": 0.02375694364309311,
"learning_rate": 1.3069423929098968e-05,
"loss": 0.0007,
"step": 3520
},
{
"epoch": 1.7380600689315608,
"grad_norm": 0.004778748843818903,
"learning_rate": 1.3049729197439686e-05,
"loss": 0.0006,
"step": 3530
},
{
"epoch": 1.742983751846381,
"grad_norm": 0.005341399926692247,
"learning_rate": 1.3030034465780406e-05,
"loss": 0.0006,
"step": 3540
},
{
"epoch": 1.7479074347612014,
"grad_norm": 0.0069299363531172276,
"learning_rate": 1.3010339734121124e-05,
"loss": 0.0006,
"step": 3550
},
{
"epoch": 1.7528311176760216,
"grad_norm": 0.004599249456077814,
"learning_rate": 1.2990645002461844e-05,
"loss": 0.0007,
"step": 3560
},
{
"epoch": 1.7577548005908419,
"grad_norm": 0.004595101345330477,
"learning_rate": 1.297095027080256e-05,
"loss": 0.0006,
"step": 3570
},
{
"epoch": 1.7626784835056624,
"grad_norm": 0.005347404163330793,
"learning_rate": 1.2951255539143279e-05,
"loss": 0.0006,
"step": 3580
},
{
"epoch": 1.7676021664204824,
"grad_norm": 0.004594715777784586,
"learning_rate": 1.2931560807483999e-05,
"loss": 0.0006,
"step": 3590
},
{
"epoch": 1.7725258493353029,
"grad_norm": 0.06393859535455704,
"learning_rate": 1.2911866075824717e-05,
"loss": 0.0008,
"step": 3600
},
{
"epoch": 1.7774495322501231,
"grad_norm": 0.005369629245251417,
"learning_rate": 1.2892171344165437e-05,
"loss": 0.0006,
"step": 3610
},
{
"epoch": 1.7823732151649434,
"grad_norm": 0.009649335406720638,
"learning_rate": 1.2872476612506155e-05,
"loss": 0.0006,
"step": 3620
},
{
"epoch": 1.7872968980797637,
"grad_norm": 0.005992168094962835,
"learning_rate": 1.2852781880846875e-05,
"loss": 0.0006,
"step": 3630
},
{
"epoch": 1.792220580994584,
"grad_norm": 0.00444134371355176,
"learning_rate": 1.2833087149187593e-05,
"loss": 0.0034,
"step": 3640
},
{
"epoch": 1.7971442639094042,
"grad_norm": 0.005889700725674629,
"learning_rate": 1.2813392417528311e-05,
"loss": 0.0006,
"step": 3650
},
{
"epoch": 1.8020679468242244,
"grad_norm": 0.004602220840752125,
"learning_rate": 1.2793697685869031e-05,
"loss": 0.0005,
"step": 3660
},
{
"epoch": 1.806991629739045,
"grad_norm": 0.016186626628041267,
"learning_rate": 1.277400295420975e-05,
"loss": 0.0612,
"step": 3670
},
{
"epoch": 1.811915312653865,
"grad_norm": 0.011738053523004055,
"learning_rate": 1.275430822255047e-05,
"loss": 0.0006,
"step": 3680
},
{
"epoch": 1.8168389955686854,
"grad_norm": 0.0061439890414476395,
"learning_rate": 1.2734613490891188e-05,
"loss": 0.0007,
"step": 3690
},
{
"epoch": 1.8217626784835057,
"grad_norm": 0.00608317693695426,
"learning_rate": 1.2714918759231907e-05,
"loss": 0.001,
"step": 3700
},
{
"epoch": 1.826686361398326,
"grad_norm": 0.0054056947119534016,
"learning_rate": 1.2695224027572626e-05,
"loss": 0.0006,
"step": 3710
},
{
"epoch": 1.8316100443131462,
"grad_norm": 0.005898363888263702,
"learning_rate": 1.2675529295913344e-05,
"loss": 0.0572,
"step": 3720
},
{
"epoch": 1.8365337272279665,
"grad_norm": 0.004474004730582237,
"learning_rate": 1.2655834564254064e-05,
"loss": 0.0085,
"step": 3730
},
{
"epoch": 1.841457410142787,
"grad_norm": 0.0045126210898160934,
"learning_rate": 1.2636139832594782e-05,
"loss": 0.0015,
"step": 3740
},
{
"epoch": 1.846381093057607,
"grad_norm": 0.00464917765930295,
"learning_rate": 1.2616445100935502e-05,
"loss": 0.0536,
"step": 3750
},
{
"epoch": 1.8513047759724275,
"grad_norm": 0.00560635793954134,
"learning_rate": 1.259675036927622e-05,
"loss": 0.0007,
"step": 3760
},
{
"epoch": 1.8562284588872475,
"grad_norm": 0.0122684920206666,
"learning_rate": 1.257705563761694e-05,
"loss": 0.0008,
"step": 3770
},
{
"epoch": 1.861152141802068,
"grad_norm": 0.004684649407863617,
"learning_rate": 1.2557360905957658e-05,
"loss": 0.0007,
"step": 3780
},
{
"epoch": 1.8660758247168883,
"grad_norm": 0.010543436743319035,
"learning_rate": 1.2537666174298375e-05,
"loss": 0.0109,
"step": 3790
},
{
"epoch": 1.8709995076317085,
"grad_norm": 0.009408768266439438,
"learning_rate": 1.2517971442639095e-05,
"loss": 0.0007,
"step": 3800
},
{
"epoch": 1.8759231905465288,
"grad_norm": 0.004331331700086594,
"learning_rate": 1.2498276710979813e-05,
"loss": 0.0006,
"step": 3810
},
{
"epoch": 1.880846873461349,
"grad_norm": 0.004327212926000357,
"learning_rate": 1.2478581979320533e-05,
"loss": 0.0006,
"step": 3820
},
{
"epoch": 1.8857705563761695,
"grad_norm": 0.0050591351464390755,
"learning_rate": 1.2458887247661251e-05,
"loss": 0.0007,
"step": 3830
},
{
"epoch": 1.8906942392909896,
"grad_norm": 0.005602705758064985,
"learning_rate": 1.2439192516001969e-05,
"loss": 0.0006,
"step": 3840
},
{
"epoch": 1.89561792220581,
"grad_norm": 0.009484893642365932,
"learning_rate": 1.2419497784342689e-05,
"loss": 0.0005,
"step": 3850
},
{
"epoch": 1.90054160512063,
"grad_norm": 0.004367966204881668,
"learning_rate": 1.2399803052683407e-05,
"loss": 0.0005,
"step": 3860
},
{
"epoch": 1.9054652880354506,
"grad_norm": 0.004156662616878748,
"learning_rate": 1.2380108321024127e-05,
"loss": 0.0006,
"step": 3870
},
{
"epoch": 1.9103889709502708,
"grad_norm": 0.00411129929125309,
"learning_rate": 1.2360413589364845e-05,
"loss": 0.0006,
"step": 3880
},
{
"epoch": 1.915312653865091,
"grad_norm": 1.3656526803970337,
"learning_rate": 1.2340718857705565e-05,
"loss": 0.0104,
"step": 3890
},
{
"epoch": 1.9202363367799113,
"grad_norm": 0.005277382675558329,
"learning_rate": 1.2321024126046283e-05,
"loss": 0.0006,
"step": 3900
},
{
"epoch": 1.9251600196947316,
"grad_norm": 0.00482240691781044,
"learning_rate": 1.2301329394387003e-05,
"loss": 0.0005,
"step": 3910
},
{
"epoch": 1.930083702609552,
"grad_norm": 0.00404377281665802,
"learning_rate": 1.2281634662727722e-05,
"loss": 0.0006,
"step": 3920
},
{
"epoch": 1.9350073855243721,
"grad_norm": 0.005294101778417826,
"learning_rate": 1.226193993106844e-05,
"loss": 0.0297,
"step": 3930
},
{
"epoch": 1.9399310684391926,
"grad_norm": 0.0040844217874109745,
"learning_rate": 1.224224519940916e-05,
"loss": 0.0005,
"step": 3940
},
{
"epoch": 1.9448547513540126,
"grad_norm": 0.012116051279008389,
"learning_rate": 1.2222550467749878e-05,
"loss": 0.0012,
"step": 3950
},
{
"epoch": 1.9497784342688331,
"grad_norm": 0.0040712859481573105,
"learning_rate": 1.2202855736090598e-05,
"loss": 0.0005,
"step": 3960
},
{
"epoch": 1.9547021171836534,
"grad_norm": 0.010710555128753185,
"learning_rate": 1.2183161004431316e-05,
"loss": 0.0008,
"step": 3970
},
{
"epoch": 1.9596258000984736,
"grad_norm": 0.00512111559510231,
"learning_rate": 1.2163466272772036e-05,
"loss": 0.0005,
"step": 3980
},
{
"epoch": 1.964549483013294,
"grad_norm": 0.004718789830803871,
"learning_rate": 1.2143771541112754e-05,
"loss": 0.0006,
"step": 3990
},
{
"epoch": 1.9694731659281142,
"grad_norm": 0.040209267288446426,
"learning_rate": 1.212407680945347e-05,
"loss": 0.0006,
"step": 4000
},
{
"epoch": 1.9743968488429346,
"grad_norm": 0.004102818667888641,
"learning_rate": 1.210438207779419e-05,
"loss": 0.0006,
"step": 4010
},
{
"epoch": 1.9793205317577547,
"grad_norm": 0.003849179018288851,
"learning_rate": 1.2084687346134909e-05,
"loss": 0.0004,
"step": 4020
},
{
"epoch": 1.9842442146725752,
"grad_norm": 0.0038420234341174364,
"learning_rate": 1.2064992614475629e-05,
"loss": 0.0005,
"step": 4030
},
{
"epoch": 1.9891678975873952,
"grad_norm": 0.004476201254874468,
"learning_rate": 1.2045297882816347e-05,
"loss": 0.0008,
"step": 4040
},
{
"epoch": 1.9940915805022157,
"grad_norm": 0.0061459592543542385,
"learning_rate": 1.2025603151157065e-05,
"loss": 0.0005,
"step": 4050
},
{
"epoch": 1.999015263417036,
"grad_norm": 0.00451872032135725,
"learning_rate": 1.2005908419497785e-05,
"loss": 0.0005,
"step": 4060
},
{
"epoch": 2.0,
"eval_accuracy": 0.9993024066968957,
"eval_loss": 0.003324420191347599,
"eval_runtime": 125.6557,
"eval_samples_per_second": 22.816,
"eval_steps_per_second": 2.857,
"step": 4062
},
{
"epoch": 2.003938946331856,
"grad_norm": 0.003870410844683647,
"learning_rate": 1.1986213687838503e-05,
"loss": 0.0004,
"step": 4070
},
{
"epoch": 2.0088626292466767,
"grad_norm": 0.0037198015488684177,
"learning_rate": 1.1966518956179223e-05,
"loss": 0.0005,
"step": 4080
},
{
"epoch": 2.0137863121614967,
"grad_norm": 0.003692836966365576,
"learning_rate": 1.1946824224519941e-05,
"loss": 0.0005,
"step": 4090
},
{
"epoch": 2.018709995076317,
"grad_norm": 0.005183310713618994,
"learning_rate": 1.1927129492860661e-05,
"loss": 0.0005,
"step": 4100
},
{
"epoch": 2.0236336779911372,
"grad_norm": 0.005319288466125727,
"learning_rate": 1.190743476120138e-05,
"loss": 0.0007,
"step": 4110
},
{
"epoch": 2.0285573609059577,
"grad_norm": 0.003845900995656848,
"learning_rate": 1.18877400295421e-05,
"loss": 0.0009,
"step": 4120
},
{
"epoch": 2.0334810438207778,
"grad_norm": 0.0036504592280834913,
"learning_rate": 1.1868045297882818e-05,
"loss": 0.0005,
"step": 4130
},
{
"epoch": 2.0384047267355982,
"grad_norm": 0.005353834945708513,
"learning_rate": 1.1848350566223536e-05,
"loss": 0.0005,
"step": 4140
},
{
"epoch": 2.0433284096504187,
"grad_norm": 0.003965914249420166,
"learning_rate": 1.1828655834564256e-05,
"loss": 0.0005,
"step": 4150
},
{
"epoch": 2.0482520925652388,
"grad_norm": 0.0035763198975473642,
"learning_rate": 1.1808961102904974e-05,
"loss": 0.0005,
"step": 4160
},
{
"epoch": 2.0531757754800593,
"grad_norm": 0.0036004288122057915,
"learning_rate": 1.1789266371245694e-05,
"loss": 0.0004,
"step": 4170
},
{
"epoch": 2.0580994583948793,
"grad_norm": 0.003568575019016862,
"learning_rate": 1.1769571639586412e-05,
"loss": 0.0004,
"step": 4180
},
{
"epoch": 2.0630231413096998,
"grad_norm": 0.003498220583423972,
"learning_rate": 1.1749876907927132e-05,
"loss": 0.0004,
"step": 4190
},
{
"epoch": 2.06794682422452,
"grad_norm": 0.004230022896081209,
"learning_rate": 1.173018217626785e-05,
"loss": 0.0005,
"step": 4200
},
{
"epoch": 2.0728705071393403,
"grad_norm": 0.004168345592916012,
"learning_rate": 1.171048744460857e-05,
"loss": 0.0004,
"step": 4210
},
{
"epoch": 2.0777941900541603,
"grad_norm": 0.004782133270055056,
"learning_rate": 1.1690792712949287e-05,
"loss": 0.0004,
"step": 4220
},
{
"epoch": 2.082717872968981,
"grad_norm": 0.004512319806963205,
"learning_rate": 1.1671097981290005e-05,
"loss": 0.0005,
"step": 4230
},
{
"epoch": 2.0876415558838013,
"grad_norm": 0.0035142311826348305,
"learning_rate": 1.1651403249630725e-05,
"loss": 0.0004,
"step": 4240
},
{
"epoch": 2.0925652387986213,
"grad_norm": 0.0034635839983820915,
"learning_rate": 1.1631708517971443e-05,
"loss": 0.0005,
"step": 4250
},
{
"epoch": 2.097488921713442,
"grad_norm": 0.0034351220820099115,
"learning_rate": 1.1612013786312161e-05,
"loss": 0.0004,
"step": 4260
},
{
"epoch": 2.102412604628262,
"grad_norm": 0.003419801127165556,
"learning_rate": 1.1592319054652881e-05,
"loss": 0.0004,
"step": 4270
},
{
"epoch": 2.1073362875430823,
"grad_norm": 0.004152268171310425,
"learning_rate": 1.15726243229936e-05,
"loss": 0.001,
"step": 4280
},
{
"epoch": 2.1122599704579024,
"grad_norm": 0.003409736789762974,
"learning_rate": 1.1552929591334319e-05,
"loss": 0.0004,
"step": 4290
},
{
"epoch": 2.117183653372723,
"grad_norm": 0.0034677856601774693,
"learning_rate": 1.1533234859675037e-05,
"loss": 0.0004,
"step": 4300
},
{
"epoch": 2.122107336287543,
"grad_norm": 0.0034529881086200476,
"learning_rate": 1.1513540128015757e-05,
"loss": 0.0004,
"step": 4310
},
{
"epoch": 2.1270310192023634,
"grad_norm": 0.0033237591851502657,
"learning_rate": 1.1493845396356475e-05,
"loss": 0.0005,
"step": 4320
},
{
"epoch": 2.131954702117184,
"grad_norm": 0.0033787621650844812,
"learning_rate": 1.1474150664697195e-05,
"loss": 0.0004,
"step": 4330
},
{
"epoch": 2.136878385032004,
"grad_norm": 0.004501492716372013,
"learning_rate": 1.1454455933037914e-05,
"loss": 0.0004,
"step": 4340
},
{
"epoch": 2.1418020679468244,
"grad_norm": 0.0037883783224970102,
"learning_rate": 1.1434761201378632e-05,
"loss": 0.0005,
"step": 4350
},
{
"epoch": 2.1467257508616444,
"grad_norm": 0.0033413332421332598,
"learning_rate": 1.1415066469719352e-05,
"loss": 0.0004,
"step": 4360
},
{
"epoch": 2.151649433776465,
"grad_norm": 0.003297879360616207,
"learning_rate": 1.139537173806007e-05,
"loss": 0.0004,
"step": 4370
},
{
"epoch": 2.156573116691285,
"grad_norm": 0.0032432284206151962,
"learning_rate": 1.137567700640079e-05,
"loss": 0.0004,
"step": 4380
},
{
"epoch": 2.1614967996061054,
"grad_norm": 0.0034160311333835125,
"learning_rate": 1.1355982274741508e-05,
"loss": 0.0004,
"step": 4390
},
{
"epoch": 2.1664204825209255,
"grad_norm": 0.003737648716196418,
"learning_rate": 1.1336287543082228e-05,
"loss": 0.0004,
"step": 4400
},
{
"epoch": 2.171344165435746,
"grad_norm": 0.003206575522199273,
"learning_rate": 1.1316592811422946e-05,
"loss": 0.0004,
"step": 4410
},
{
"epoch": 2.1762678483505664,
"grad_norm": 0.0032377000898122787,
"learning_rate": 1.1296898079763666e-05,
"loss": 0.0004,
"step": 4420
},
{
"epoch": 2.1811915312653865,
"grad_norm": 0.0031994974706321955,
"learning_rate": 1.1277203348104384e-05,
"loss": 0.0004,
"step": 4430
},
{
"epoch": 2.186115214180207,
"grad_norm": 0.003612807020545006,
"learning_rate": 1.12575086164451e-05,
"loss": 0.0004,
"step": 4440
},
{
"epoch": 2.191038897095027,
"grad_norm": 0.0038285143673419952,
"learning_rate": 1.123781388478582e-05,
"loss": 0.0005,
"step": 4450
},
{
"epoch": 2.1959625800098475,
"grad_norm": 0.05885794758796692,
"learning_rate": 1.1218119153126539e-05,
"loss": 0.0005,
"step": 4460
},
{
"epoch": 2.2008862629246675,
"grad_norm": 0.004322012886404991,
"learning_rate": 1.1198424421467257e-05,
"loss": 0.0081,
"step": 4470
},
{
"epoch": 2.205809945839488,
"grad_norm": 0.0031240857206285,
"learning_rate": 1.1178729689807977e-05,
"loss": 0.0004,
"step": 4480
},
{
"epoch": 2.2107336287543085,
"grad_norm": 0.0030879732221364975,
"learning_rate": 1.1159034958148695e-05,
"loss": 0.0004,
"step": 4490
},
{
"epoch": 2.2156573116691285,
"grad_norm": 0.0031029575038701296,
"learning_rate": 1.1139340226489415e-05,
"loss": 0.0004,
"step": 4500
},
{
"epoch": 2.220580994583949,
"grad_norm": 0.003118765540421009,
"learning_rate": 1.1119645494830133e-05,
"loss": 0.0004,
"step": 4510
},
{
"epoch": 2.225504677498769,
"grad_norm": 0.0030926407780498266,
"learning_rate": 1.1099950763170853e-05,
"loss": 0.0004,
"step": 4520
},
{
"epoch": 2.2304283604135895,
"grad_norm": 0.0031910729594528675,
"learning_rate": 1.1080256031511571e-05,
"loss": 0.0004,
"step": 4530
},
{
"epoch": 2.2353520433284095,
"grad_norm": 0.0032135951332747936,
"learning_rate": 1.1060561299852291e-05,
"loss": 0.0004,
"step": 4540
},
{
"epoch": 2.24027572624323,
"grad_norm": 0.00302210939116776,
"learning_rate": 1.104086656819301e-05,
"loss": 0.0004,
"step": 4550
},
{
"epoch": 2.24519940915805,
"grad_norm": 0.003041194984689355,
"learning_rate": 1.1021171836533728e-05,
"loss": 0.0004,
"step": 4560
},
{
"epoch": 2.2501230920728705,
"grad_norm": 0.0029863493982702494,
"learning_rate": 1.1001477104874448e-05,
"loss": 0.0004,
"step": 4570
},
{
"epoch": 2.255046774987691,
"grad_norm": 0.0035601642448455095,
"learning_rate": 1.0981782373215166e-05,
"loss": 0.0004,
"step": 4580
},
{
"epoch": 2.259970457902511,
"grad_norm": 0.002995297545567155,
"learning_rate": 1.0962087641555886e-05,
"loss": 0.0258,
"step": 4590
},
{
"epoch": 2.2648941408173315,
"grad_norm": 0.0030106825288385153,
"learning_rate": 1.0942392909896604e-05,
"loss": 0.0004,
"step": 4600
},
{
"epoch": 2.2698178237321516,
"grad_norm": 0.0030412061605602503,
"learning_rate": 1.0922698178237324e-05,
"loss": 0.0003,
"step": 4610
},
{
"epoch": 2.274741506646972,
"grad_norm": 0.0032338390592485666,
"learning_rate": 1.0903003446578042e-05,
"loss": 0.0007,
"step": 4620
},
{
"epoch": 2.279665189561792,
"grad_norm": 0.0029980672989040613,
"learning_rate": 1.088330871491876e-05,
"loss": 0.0004,
"step": 4630
},
{
"epoch": 2.2845888724766126,
"grad_norm": 0.0030107314232736826,
"learning_rate": 1.086361398325948e-05,
"loss": 0.0679,
"step": 4640
},
{
"epoch": 2.2895125553914326,
"grad_norm": 0.003009843174368143,
"learning_rate": 1.0843919251600197e-05,
"loss": 0.0004,
"step": 4650
},
{
"epoch": 2.294436238306253,
"grad_norm": 0.0035064418334513903,
"learning_rate": 1.0824224519940917e-05,
"loss": 0.0004,
"step": 4660
},
{
"epoch": 2.2993599212210736,
"grad_norm": 0.0035762269981205463,
"learning_rate": 1.0804529788281635e-05,
"loss": 0.0004,
"step": 4670
},
{
"epoch": 2.3042836041358936,
"grad_norm": 0.003944043070077896,
"learning_rate": 1.0784835056622353e-05,
"loss": 0.0004,
"step": 4680
},
{
"epoch": 2.309207287050714,
"grad_norm": 0.008321553468704224,
"learning_rate": 1.0765140324963073e-05,
"loss": 0.0004,
"step": 4690
},
{
"epoch": 2.314130969965534,
"grad_norm": 0.0029584006406366825,
"learning_rate": 1.0745445593303791e-05,
"loss": 0.0004,
"step": 4700
},
{
"epoch": 2.3190546528803546,
"grad_norm": 0.0038970340974628925,
"learning_rate": 1.0725750861644511e-05,
"loss": 0.0004,
"step": 4710
},
{
"epoch": 2.3239783357951747,
"grad_norm": 0.0028425133787095547,
"learning_rate": 1.070605612998523e-05,
"loss": 0.0004,
"step": 4720
},
{
"epoch": 2.328902018709995,
"grad_norm": 0.0031168104615062475,
"learning_rate": 1.0686361398325949e-05,
"loss": 0.0004,
"step": 4730
},
{
"epoch": 2.333825701624815,
"grad_norm": 0.00459252716973424,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.0004,
"step": 4740
},
{
"epoch": 2.3387493845396357,
"grad_norm": 0.0028256394434720278,
"learning_rate": 1.0646971935007386e-05,
"loss": 0.0004,
"step": 4750
},
{
"epoch": 2.343673067454456,
"grad_norm": 0.005775698460638523,
"learning_rate": 1.0627277203348105e-05,
"loss": 0.0505,
"step": 4760
},
{
"epoch": 2.348596750369276,
"grad_norm": 0.0028328783810138702,
"learning_rate": 1.0607582471688824e-05,
"loss": 0.0004,
"step": 4770
},
{
"epoch": 2.3535204332840967,
"grad_norm": 0.0029578169342130423,
"learning_rate": 1.0587887740029544e-05,
"loss": 0.0004,
"step": 4780
},
{
"epoch": 2.3584441161989167,
"grad_norm": 0.0028520470950752497,
"learning_rate": 1.0568193008370262e-05,
"loss": 0.0746,
"step": 4790
},
{
"epoch": 2.363367799113737,
"grad_norm": 0.0032039277721196413,
"learning_rate": 1.0548498276710982e-05,
"loss": 0.0004,
"step": 4800
},
{
"epoch": 2.368291482028557,
"grad_norm": 0.004140175879001617,
"learning_rate": 1.05288035450517e-05,
"loss": 0.0004,
"step": 4810
},
{
"epoch": 2.3732151649433777,
"grad_norm": 0.0033235037699341774,
"learning_rate": 1.050910881339242e-05,
"loss": 0.0004,
"step": 4820
},
{
"epoch": 2.3781388478581977,
"grad_norm": 0.0031042725313454866,
"learning_rate": 1.0489414081733138e-05,
"loss": 0.0063,
"step": 4830
},
{
"epoch": 2.383062530773018,
"grad_norm": 0.0032992505002766848,
"learning_rate": 1.0469719350073856e-05,
"loss": 0.0004,
"step": 4840
},
{
"epoch": 2.3879862136878387,
"grad_norm": 0.0029236788395792246,
"learning_rate": 1.0450024618414576e-05,
"loss": 0.0376,
"step": 4850
},
{
"epoch": 2.3929098966026587,
"grad_norm": 0.0035732307005673647,
"learning_rate": 1.0430329886755294e-05,
"loss": 0.0004,
"step": 4860
},
{
"epoch": 2.397833579517479,
"grad_norm": 0.002821253379806876,
"learning_rate": 1.041063515509601e-05,
"loss": 0.0003,
"step": 4870
},
{
"epoch": 2.4027572624322993,
"grad_norm": 0.002872566459700465,
"learning_rate": 1.039094042343673e-05,
"loss": 0.0003,
"step": 4880
},
{
"epoch": 2.4076809453471197,
"grad_norm": 0.00280165602453053,
"learning_rate": 1.0371245691777449e-05,
"loss": 0.0004,
"step": 4890
},
{
"epoch": 2.4126046282619398,
"grad_norm": 0.003701514797285199,
"learning_rate": 1.0351550960118169e-05,
"loss": 0.0004,
"step": 4900
},
{
"epoch": 2.4175283111767603,
"grad_norm": 0.0033293033484369516,
"learning_rate": 1.0331856228458887e-05,
"loss": 0.0004,
"step": 4910
},
{
"epoch": 2.4224519940915803,
"grad_norm": 0.0027583721093833447,
"learning_rate": 1.0312161496799607e-05,
"loss": 0.0004,
"step": 4920
},
{
"epoch": 2.427375677006401,
"grad_norm": 0.0035485969856381416,
"learning_rate": 1.0292466765140325e-05,
"loss": 0.0004,
"step": 4930
},
{
"epoch": 2.4322993599212213,
"grad_norm": 0.0032952686306089163,
"learning_rate": 1.0272772033481045e-05,
"loss": 0.0004,
"step": 4940
},
{
"epoch": 2.4372230428360413,
"grad_norm": 0.0037070815451443195,
"learning_rate": 1.0253077301821763e-05,
"loss": 0.0004,
"step": 4950
},
{
"epoch": 2.442146725750862,
"grad_norm": 0.002730543026700616,
"learning_rate": 1.0233382570162481e-05,
"loss": 0.0003,
"step": 4960
},
{
"epoch": 2.447070408665682,
"grad_norm": 0.004558619111776352,
"learning_rate": 1.0213687838503201e-05,
"loss": 0.0004,
"step": 4970
},
{
"epoch": 2.4519940915805023,
"grad_norm": 0.003558282507583499,
"learning_rate": 1.019399310684392e-05,
"loss": 0.0005,
"step": 4980
},
{
"epoch": 2.4569177744953223,
"grad_norm": 0.012926378287374973,
"learning_rate": 1.017429837518464e-05,
"loss": 0.0004,
"step": 4990
},
{
"epoch": 2.461841457410143,
"grad_norm": 0.00339103932492435,
"learning_rate": 1.0154603643525358e-05,
"loss": 0.0004,
"step": 5000
},
{
"epoch": 2.466765140324963,
"grad_norm": 0.0026393327862024307,
"learning_rate": 1.0134908911866078e-05,
"loss": 0.0003,
"step": 5010
},
{
"epoch": 2.4716888232397833,
"grad_norm": 0.006687171291559935,
"learning_rate": 1.0115214180206796e-05,
"loss": 0.0004,
"step": 5020
},
{
"epoch": 2.476612506154604,
"grad_norm": 0.003788519883528352,
"learning_rate": 1.0095519448547516e-05,
"loss": 0.0003,
"step": 5030
},
{
"epoch": 2.481536189069424,
"grad_norm": 0.0027769142761826515,
"learning_rate": 1.0075824716888234e-05,
"loss": 0.0004,
"step": 5040
},
{
"epoch": 2.4864598719842443,
"grad_norm": 0.0029424901586025953,
"learning_rate": 1.0056129985228952e-05,
"loss": 0.0003,
"step": 5050
},
{
"epoch": 2.4913835548990644,
"grad_norm": 0.002582251327112317,
"learning_rate": 1.0036435253569672e-05,
"loss": 0.0003,
"step": 5060
},
{
"epoch": 2.496307237813885,
"grad_norm": 0.0027259523048996925,
"learning_rate": 1.001674052191039e-05,
"loss": 0.0003,
"step": 5070
},
{
"epoch": 2.5012309207287053,
"grad_norm": 0.026945946738123894,
"learning_rate": 9.997045790251108e-06,
"loss": 0.0004,
"step": 5080
},
{
"epoch": 2.5061546036435254,
"grad_norm": 0.002602215390652418,
"learning_rate": 9.977351058591828e-06,
"loss": 0.0004,
"step": 5090
},
{
"epoch": 2.5110782865583454,
"grad_norm": 0.002513736952096224,
"learning_rate": 9.957656326932547e-06,
"loss": 0.0003,
"step": 5100
},
{
"epoch": 2.516001969473166,
"grad_norm": 0.0032119974493980408,
"learning_rate": 9.937961595273265e-06,
"loss": 0.0003,
"step": 5110
},
{
"epoch": 2.5209256523879864,
"grad_norm": 0.006075989454984665,
"learning_rate": 9.918266863613985e-06,
"loss": 0.0007,
"step": 5120
},
{
"epoch": 2.5258493353028064,
"grad_norm": 0.02874099276959896,
"learning_rate": 9.898572131954703e-06,
"loss": 0.0005,
"step": 5130
},
{
"epoch": 2.530773018217627,
"grad_norm": 0.0025109825655817986,
"learning_rate": 9.878877400295421e-06,
"loss": 0.0355,
"step": 5140
},
{
"epoch": 2.535696701132447,
"grad_norm": 0.029801325872540474,
"learning_rate": 9.859182668636141e-06,
"loss": 0.0004,
"step": 5150
},
{
"epoch": 2.5406203840472674,
"grad_norm": 0.00397358788177371,
"learning_rate": 9.83948793697686e-06,
"loss": 0.0004,
"step": 5160
},
{
"epoch": 2.545544066962088,
"grad_norm": 0.002678680932149291,
"learning_rate": 9.819793205317577e-06,
"loss": 0.0003,
"step": 5170
},
{
"epoch": 2.550467749876908,
"grad_norm": 0.0029178003314882517,
"learning_rate": 9.800098473658297e-06,
"loss": 0.0003,
"step": 5180
},
{
"epoch": 2.555391432791728,
"grad_norm": 0.0024977768771350384,
"learning_rate": 9.780403741999016e-06,
"loss": 0.0004,
"step": 5190
},
{
"epoch": 2.5603151157065485,
"grad_norm": 0.0030609865207225084,
"learning_rate": 9.760709010339735e-06,
"loss": 0.0003,
"step": 5200
},
{
"epoch": 2.565238798621369,
"grad_norm": 0.003063684096559882,
"learning_rate": 9.741014278680454e-06,
"loss": 0.0303,
"step": 5210
},
{
"epoch": 2.570162481536189,
"grad_norm": 0.003366800956428051,
"learning_rate": 9.721319547021174e-06,
"loss": 0.0003,
"step": 5220
},
{
"epoch": 2.5750861644510095,
"grad_norm": 0.0025488571263849735,
"learning_rate": 9.701624815361892e-06,
"loss": 0.0004,
"step": 5230
},
{
"epoch": 2.5800098473658295,
"grad_norm": 0.0029968577437102795,
"learning_rate": 9.681930083702612e-06,
"loss": 0.0003,
"step": 5240
},
{
"epoch": 2.58493353028065,
"grad_norm": 0.004265849944204092,
"learning_rate": 9.662235352043328e-06,
"loss": 0.0004,
"step": 5250
},
{
"epoch": 2.5898572131954705,
"grad_norm": 0.0030266179237514734,
"learning_rate": 9.642540620384048e-06,
"loss": 0.0004,
"step": 5260
},
{
"epoch": 2.5947808961102905,
"grad_norm": 0.00575278652831912,
"learning_rate": 9.622845888724766e-06,
"loss": 0.0045,
"step": 5270
},
{
"epoch": 2.5997045790251105,
"grad_norm": 0.004713424481451511,
"learning_rate": 9.603151157065486e-06,
"loss": 0.001,
"step": 5280
},
{
"epoch": 2.604628261939931,
"grad_norm": 0.0028863553889095783,
"learning_rate": 9.583456425406204e-06,
"loss": 0.0003,
"step": 5290
},
{
"epoch": 2.6095519448547515,
"grad_norm": 0.0027613062411546707,
"learning_rate": 9.563761693746924e-06,
"loss": 0.0003,
"step": 5300
},
{
"epoch": 2.6144756277695715,
"grad_norm": 0.00244720047339797,
"learning_rate": 9.544066962087643e-06,
"loss": 0.0003,
"step": 5310
},
{
"epoch": 2.619399310684392,
"grad_norm": 0.0027843692805618048,
"learning_rate": 9.52437223042836e-06,
"loss": 0.0003,
"step": 5320
},
{
"epoch": 2.624322993599212,
"grad_norm": 0.0024719424545764923,
"learning_rate": 9.50467749876908e-06,
"loss": 0.0003,
"step": 5330
},
{
"epoch": 2.6292466765140325,
"grad_norm": 0.0023956247605383396,
"learning_rate": 9.484982767109799e-06,
"loss": 0.0003,
"step": 5340
},
{
"epoch": 2.634170359428853,
"grad_norm": 0.0024870047345757484,
"learning_rate": 9.465288035450517e-06,
"loss": 0.0003,
"step": 5350
},
{
"epoch": 2.639094042343673,
"grad_norm": 0.006259846035391092,
"learning_rate": 9.445593303791237e-06,
"loss": 0.0004,
"step": 5360
},
{
"epoch": 2.644017725258493,
"grad_norm": 0.0027619535103440285,
"learning_rate": 9.425898572131955e-06,
"loss": 0.0003,
"step": 5370
},
{
"epoch": 2.6489414081733136,
"grad_norm": 0.002708840649574995,
"learning_rate": 9.406203840472673e-06,
"loss": 0.0003,
"step": 5380
},
{
"epoch": 2.653865091088134,
"grad_norm": 0.0023432679008692503,
"learning_rate": 9.386509108813393e-06,
"loss": 0.002,
"step": 5390
},
{
"epoch": 2.658788774002954,
"grad_norm": 0.0024096230044960976,
"learning_rate": 9.366814377154112e-06,
"loss": 0.0004,
"step": 5400
},
{
"epoch": 2.6637124569177746,
"grad_norm": 0.002802999457344413,
"learning_rate": 9.347119645494831e-06,
"loss": 0.0003,
"step": 5410
},
{
"epoch": 2.6686361398325946,
"grad_norm": 0.002744765719398856,
"learning_rate": 9.32742491383555e-06,
"loss": 0.0003,
"step": 5420
},
{
"epoch": 2.673559822747415,
"grad_norm": 0.0027366564609110355,
"learning_rate": 9.30773018217627e-06,
"loss": 0.0003,
"step": 5430
},
{
"epoch": 2.6784835056622356,
"grad_norm": 0.0023025909904390574,
"learning_rate": 9.288035450516988e-06,
"loss": 0.0008,
"step": 5440
},
{
"epoch": 2.6834071885770556,
"grad_norm": 0.003953023348003626,
"learning_rate": 9.268340718857708e-06,
"loss": 0.0004,
"step": 5450
},
{
"epoch": 2.6883308714918757,
"grad_norm": 0.002293068915605545,
"learning_rate": 9.248645987198424e-06,
"loss": 0.0003,
"step": 5460
},
{
"epoch": 2.693254554406696,
"grad_norm": 0.0025221628602594137,
"learning_rate": 9.228951255539144e-06,
"loss": 0.0003,
"step": 5470
},
{
"epoch": 2.6981782373215166,
"grad_norm": 0.0023449882864952087,
"learning_rate": 9.209256523879862e-06,
"loss": 0.0003,
"step": 5480
},
{
"epoch": 2.7031019202363367,
"grad_norm": 0.002262598369270563,
"learning_rate": 9.189561792220582e-06,
"loss": 0.0003,
"step": 5490
},
{
"epoch": 2.708025603151157,
"grad_norm": 0.0022516308818012476,
"learning_rate": 9.1698670605613e-06,
"loss": 0.0011,
"step": 5500
},
{
"epoch": 2.712949286065977,
"grad_norm": 0.0027228447142988443,
"learning_rate": 9.15017232890202e-06,
"loss": 0.0003,
"step": 5510
},
{
"epoch": 2.7178729689807977,
"grad_norm": 0.0023118378594517708,
"learning_rate": 9.130477597242738e-06,
"loss": 0.0003,
"step": 5520
},
{
"epoch": 2.722796651895618,
"grad_norm": 0.002595614641904831,
"learning_rate": 9.110782865583457e-06,
"loss": 0.0003,
"step": 5530
},
{
"epoch": 2.727720334810438,
"grad_norm": 0.0023607241455465555,
"learning_rate": 9.091088133924177e-06,
"loss": 0.0003,
"step": 5540
},
{
"epoch": 2.7326440177252582,
"grad_norm": 0.0023618319537490606,
"learning_rate": 9.071393402264895e-06,
"loss": 0.0003,
"step": 5550
},
{
"epoch": 2.7375677006400787,
"grad_norm": 0.002674200339242816,
"learning_rate": 9.051698670605615e-06,
"loss": 0.0003,
"step": 5560
},
{
"epoch": 2.742491383554899,
"grad_norm": 0.002609808696433902,
"learning_rate": 9.032003938946333e-06,
"loss": 0.0005,
"step": 5570
},
{
"epoch": 2.7474150664697192,
"grad_norm": 0.0023445002734661102,
"learning_rate": 9.012309207287051e-06,
"loss": 0.0003,
"step": 5580
},
{
"epoch": 2.7523387493845397,
"grad_norm": 0.0025870969984680414,
"learning_rate": 8.99261447562777e-06,
"loss": 0.0003,
"step": 5590
},
{
"epoch": 2.7572624322993597,
"grad_norm": 0.0021777572110295296,
"learning_rate": 8.97291974396849e-06,
"loss": 0.0002,
"step": 5600
},
{
"epoch": 2.7621861152141802,
"grad_norm": 0.002778227673843503,
"learning_rate": 8.953225012309207e-06,
"loss": 0.0003,
"step": 5610
},
{
"epoch": 2.7671097981290007,
"grad_norm": 0.002241934882476926,
"learning_rate": 8.933530280649927e-06,
"loss": 0.0005,
"step": 5620
},
{
"epoch": 2.7720334810438207,
"grad_norm": 0.002897050231695175,
"learning_rate": 8.913835548990646e-06,
"loss": 0.0003,
"step": 5630
},
{
"epoch": 2.7769571639586412,
"grad_norm": 0.0022650938481092453,
"learning_rate": 8.894140817331365e-06,
"loss": 0.0003,
"step": 5640
},
{
"epoch": 2.7818808468734613,
"grad_norm": 0.002553451107814908,
"learning_rate": 8.874446085672084e-06,
"loss": 0.0003,
"step": 5650
},
{
"epoch": 2.7868045297882817,
"grad_norm": 4.319761276245117,
"learning_rate": 8.854751354012802e-06,
"loss": 0.0084,
"step": 5660
},
{
"epoch": 2.791728212703102,
"grad_norm": 0.002494723303243518,
"learning_rate": 8.835056622353522e-06,
"loss": 0.0003,
"step": 5670
},
{
"epoch": 2.7966518956179223,
"grad_norm": 0.0021848022006452084,
"learning_rate": 8.81536189069424e-06,
"loss": 0.0003,
"step": 5680
},
{
"epoch": 2.8015755785327423,
"grad_norm": 0.002222836948931217,
"learning_rate": 8.795667159034958e-06,
"loss": 0.0003,
"step": 5690
},
{
"epoch": 2.806499261447563,
"grad_norm": 0.002521749120205641,
"learning_rate": 8.775972427375678e-06,
"loss": 0.0003,
"step": 5700
},
{
"epoch": 2.8114229443623833,
"grad_norm": 0.0022716443054378033,
"learning_rate": 8.756277695716396e-06,
"loss": 0.0003,
"step": 5710
},
{
"epoch": 2.8163466272772033,
"grad_norm": 0.0027861695270985365,
"learning_rate": 8.736582964057115e-06,
"loss": 0.0442,
"step": 5720
},
{
"epoch": 2.821270310192024,
"grad_norm": 0.0026454541366547346,
"learning_rate": 8.716888232397834e-06,
"loss": 0.0067,
"step": 5730
},
{
"epoch": 2.826193993106844,
"grad_norm": 0.0024635563604533672,
"learning_rate": 8.697193500738553e-06,
"loss": 0.0003,
"step": 5740
},
{
"epoch": 2.8311176760216643,
"grad_norm": 0.003969325218349695,
"learning_rate": 8.677498769079273e-06,
"loss": 0.0003,
"step": 5750
},
{
"epoch": 2.8360413589364843,
"grad_norm": 0.003894041758030653,
"learning_rate": 8.65780403741999e-06,
"loss": 0.0003,
"step": 5760
},
{
"epoch": 2.840965041851305,
"grad_norm": 0.0021586958318948746,
"learning_rate": 8.63810930576071e-06,
"loss": 0.0003,
"step": 5770
},
{
"epoch": 2.845888724766125,
"grad_norm": 0.0021715862676501274,
"learning_rate": 8.618414574101429e-06,
"loss": 0.0003,
"step": 5780
},
{
"epoch": 2.8508124076809453,
"grad_norm": 0.002224855124950409,
"learning_rate": 8.598719842442147e-06,
"loss": 0.0713,
"step": 5790
},
{
"epoch": 2.855736090595766,
"grad_norm": 0.01677889935672283,
"learning_rate": 8.579025110782865e-06,
"loss": 0.0003,
"step": 5800
},
{
"epoch": 2.860659773510586,
"grad_norm": 0.0036739737261086702,
"learning_rate": 8.559330379123585e-06,
"loss": 0.0002,
"step": 5810
},
{
"epoch": 2.8655834564254064,
"grad_norm": 0.002256969688460231,
"learning_rate": 8.539635647464303e-06,
"loss": 0.0003,
"step": 5820
},
{
"epoch": 2.8705071393402264,
"grad_norm": 0.0026434718165546656,
"learning_rate": 8.519940915805023e-06,
"loss": 0.0003,
"step": 5830
},
{
"epoch": 2.875430822255047,
"grad_norm": 0.002218646463006735,
"learning_rate": 8.500246184145742e-06,
"loss": 0.0435,
"step": 5840
},
{
"epoch": 2.880354505169867,
"grad_norm": 0.0029450245201587677,
"learning_rate": 8.480551452486461e-06,
"loss": 0.0003,
"step": 5850
},
{
"epoch": 2.8852781880846874,
"grad_norm": 0.00922380294650793,
"learning_rate": 8.46085672082718e-06,
"loss": 0.0004,
"step": 5860
},
{
"epoch": 2.8902018709995074,
"grad_norm": 0.004534618929028511,
"learning_rate": 8.441161989167898e-06,
"loss": 0.0003,
"step": 5870
},
{
"epoch": 2.895125553914328,
"grad_norm": 0.002629433758556843,
"learning_rate": 8.421467257508618e-06,
"loss": 0.0003,
"step": 5880
},
{
"epoch": 2.9000492368291484,
"grad_norm": 0.0023545767180621624,
"learning_rate": 8.401772525849336e-06,
"loss": 0.0003,
"step": 5890
},
{
"epoch": 2.9049729197439684,
"grad_norm": 0.004185693338513374,
"learning_rate": 8.382077794190054e-06,
"loss": 0.0003,
"step": 5900
},
{
"epoch": 2.909896602658789,
"grad_norm": 0.004315607715398073,
"learning_rate": 8.362383062530774e-06,
"loss": 0.0003,
"step": 5910
},
{
"epoch": 2.914820285573609,
"grad_norm": 0.027618886902928352,
"learning_rate": 8.342688330871492e-06,
"loss": 0.0003,
"step": 5920
},
{
"epoch": 2.9197439684884294,
"grad_norm": 0.003252118593081832,
"learning_rate": 8.32299359921221e-06,
"loss": 0.0003,
"step": 5930
},
{
"epoch": 2.9246676514032495,
"grad_norm": 0.002648021560162306,
"learning_rate": 8.30329886755293e-06,
"loss": 0.0004,
"step": 5940
},
{
"epoch": 2.92959133431807,
"grad_norm": 0.002187141450121999,
"learning_rate": 8.283604135893649e-06,
"loss": 0.0003,
"step": 5950
},
{
"epoch": 2.93451501723289,
"grad_norm": 0.0021846459712833166,
"learning_rate": 8.263909404234369e-06,
"loss": 0.0003,
"step": 5960
},
{
"epoch": 2.9394387001477105,
"grad_norm": 0.0021735290065407753,
"learning_rate": 8.244214672575087e-06,
"loss": 0.0003,
"step": 5970
},
{
"epoch": 2.944362383062531,
"grad_norm": 0.002656135242432356,
"learning_rate": 8.224519940915807e-06,
"loss": 0.0003,
"step": 5980
},
{
"epoch": 2.949286065977351,
"grad_norm": 0.002111564390361309,
"learning_rate": 8.204825209256525e-06,
"loss": 0.0003,
"step": 5990
},
{
"epoch": 2.9542097488921715,
"grad_norm": 0.002079706871882081,
"learning_rate": 8.185130477597243e-06,
"loss": 0.0002,
"step": 6000
},
{
"epoch": 2.9591334318069915,
"grad_norm": 0.0020710995886474848,
"learning_rate": 8.165435745937961e-06,
"loss": 0.0003,
"step": 6010
},
{
"epoch": 2.964057114721812,
"grad_norm": 0.009048682637512684,
"learning_rate": 8.145741014278681e-06,
"loss": 0.0003,
"step": 6020
},
{
"epoch": 2.9689807976366325,
"grad_norm": 0.002403336577117443,
"learning_rate": 8.1260462826194e-06,
"loss": 0.0062,
"step": 6030
},
{
"epoch": 2.9739044805514525,
"grad_norm": 0.026963578537106514,
"learning_rate": 8.10635155096012e-06,
"loss": 0.0003,
"step": 6040
},
{
"epoch": 2.9788281634662725,
"grad_norm": 0.0021355238277465105,
"learning_rate": 8.086656819300837e-06,
"loss": 0.0002,
"step": 6050
},
{
"epoch": 2.983751846381093,
"grad_norm": 0.0022524266969412565,
"learning_rate": 8.066962087641557e-06,
"loss": 0.0003,
"step": 6060
},
{
"epoch": 2.9886755292959135,
"grad_norm": 0.002698551630601287,
"learning_rate": 8.047267355982276e-06,
"loss": 0.0007,
"step": 6070
},
{
"epoch": 2.9935992122107336,
"grad_norm": 0.0021112014073878527,
"learning_rate": 8.027572624322994e-06,
"loss": 0.0002,
"step": 6080
},
{
"epoch": 2.998522895125554,
"grad_norm": 0.0032392677385360003,
"learning_rate": 8.007877892663714e-06,
"loss": 0.0003,
"step": 6090
},
{
"epoch": 3.0,
"eval_accuracy": 0.9989536100453436,
"eval_loss": 0.005800504703074694,
"eval_runtime": 124.8736,
"eval_samples_per_second": 22.959,
"eval_steps_per_second": 2.875,
"step": 6093
},
{
"epoch": 3.003446578040374,
"grad_norm": 0.002395425923168659,
"learning_rate": 7.988183161004432e-06,
"loss": 0.0003,
"step": 6100
},
{
"epoch": 3.0083702609551946,
"grad_norm": 0.011811596341431141,
"learning_rate": 7.96848842934515e-06,
"loss": 0.0007,
"step": 6110
},
{
"epoch": 3.0132939438700146,
"grad_norm": 0.0019815864507108927,
"learning_rate": 7.94879369768587e-06,
"loss": 0.0003,
"step": 6120
},
{
"epoch": 3.018217626784835,
"grad_norm": 0.0020032506436109543,
"learning_rate": 7.929098966026588e-06,
"loss": 0.0002,
"step": 6130
},
{
"epoch": 3.0231413096996556,
"grad_norm": 0.004863973241299391,
"learning_rate": 7.909404234367306e-06,
"loss": 0.0464,
"step": 6140
},
{
"epoch": 3.0280649926144756,
"grad_norm": 35.28690719604492,
"learning_rate": 7.889709502708026e-06,
"loss": 0.045,
"step": 6150
},
{
"epoch": 3.032988675529296,
"grad_norm": 0.0026731633115559816,
"learning_rate": 7.870014771048745e-06,
"loss": 0.0002,
"step": 6160
},
{
"epoch": 3.037912358444116,
"grad_norm": 0.00375444907695055,
"learning_rate": 7.850320039389464e-06,
"loss": 0.0003,
"step": 6170
},
{
"epoch": 3.0428360413589366,
"grad_norm": 0.0023011912126094103,
"learning_rate": 7.830625307730183e-06,
"loss": 0.0002,
"step": 6180
},
{
"epoch": 3.0477597242737566,
"grad_norm": 0.0019286174792796373,
"learning_rate": 7.810930576070903e-06,
"loss": 0.0004,
"step": 6190
},
{
"epoch": 3.052683407188577,
"grad_norm": 0.003087955992668867,
"learning_rate": 7.79123584441162e-06,
"loss": 0.0005,
"step": 6200
},
{
"epoch": 3.057607090103397,
"grad_norm": 0.0019114416791126132,
"learning_rate": 7.77154111275234e-06,
"loss": 0.0003,
"step": 6210
},
{
"epoch": 3.0625307730182176,
"grad_norm": 0.0019345678156241775,
"learning_rate": 7.751846381093057e-06,
"loss": 0.0002,
"step": 6220
},
{
"epoch": 3.067454455933038,
"grad_norm": 0.0028331466019153595,
"learning_rate": 7.732151649433777e-06,
"loss": 0.0003,
"step": 6230
},
{
"epoch": 3.072378138847858,
"grad_norm": 0.0019752325024455786,
"learning_rate": 7.712456917774495e-06,
"loss": 0.0002,
"step": 6240
},
{
"epoch": 3.0773018217626786,
"grad_norm": 0.0023958859965205193,
"learning_rate": 7.692762186115215e-06,
"loss": 0.0004,
"step": 6250
},
{
"epoch": 3.0822255046774987,
"grad_norm": 0.002664359984919429,
"learning_rate": 7.673067454455933e-06,
"loss": 0.0002,
"step": 6260
},
{
"epoch": 3.087149187592319,
"grad_norm": 0.0026232562959194183,
"learning_rate": 7.653372722796653e-06,
"loss": 0.0002,
"step": 6270
},
{
"epoch": 3.092072870507139,
"grad_norm": 0.002290521515533328,
"learning_rate": 7.633677991137372e-06,
"loss": 0.0003,
"step": 6280
},
{
"epoch": 3.0969965534219597,
"grad_norm": 0.0018943879986181855,
"learning_rate": 7.613983259478091e-06,
"loss": 0.0003,
"step": 6290
},
{
"epoch": 3.1019202363367797,
"grad_norm": 0.0019178404472768307,
"learning_rate": 7.59428852781881e-06,
"loss": 0.0002,
"step": 6300
},
{
"epoch": 3.1068439192516,
"grad_norm": 0.0021891624201089144,
"learning_rate": 7.574593796159529e-06,
"loss": 0.0003,
"step": 6310
},
{
"epoch": 3.1117676021664207,
"grad_norm": 0.001878801267594099,
"learning_rate": 7.554899064500247e-06,
"loss": 0.0002,
"step": 6320
},
{
"epoch": 3.1166912850812407,
"grad_norm": 0.006385423243045807,
"learning_rate": 7.535204332840965e-06,
"loss": 0.0003,
"step": 6330
},
{
"epoch": 3.121614967996061,
"grad_norm": 0.0018304381519556046,
"learning_rate": 7.515509601181684e-06,
"loss": 0.0002,
"step": 6340
},
{
"epoch": 3.1265386509108812,
"grad_norm": 0.0026562483981251717,
"learning_rate": 7.495814869522403e-06,
"loss": 0.0002,
"step": 6350
},
{
"epoch": 3.1314623338257017,
"grad_norm": 0.0018779343226924539,
"learning_rate": 7.476120137863122e-06,
"loss": 0.0002,
"step": 6360
},
{
"epoch": 3.1363860167405218,
"grad_norm": 0.002124248770996928,
"learning_rate": 7.456425406203841e-06,
"loss": 0.0002,
"step": 6370
},
{
"epoch": 3.1413096996553422,
"grad_norm": 0.0029339792672544718,
"learning_rate": 7.43673067454456e-06,
"loss": 0.0003,
"step": 6380
},
{
"epoch": 3.1462333825701623,
"grad_norm": 0.0021487076301127672,
"learning_rate": 7.417035942885279e-06,
"loss": 0.0002,
"step": 6390
},
{
"epoch": 3.1511570654849828,
"grad_norm": 0.001838160096667707,
"learning_rate": 7.397341211225998e-06,
"loss": 0.0002,
"step": 6400
},
{
"epoch": 3.1560807483998032,
"grad_norm": 0.004113923758268356,
"learning_rate": 7.377646479566717e-06,
"loss": 0.0002,
"step": 6410
},
{
"epoch": 3.1610044313146233,
"grad_norm": 0.0024032278452068567,
"learning_rate": 7.357951747907436e-06,
"loss": 0.0002,
"step": 6420
},
{
"epoch": 3.1659281142294438,
"grad_norm": 0.0024140363093465567,
"learning_rate": 7.338257016248154e-06,
"loss": 0.0002,
"step": 6430
},
{
"epoch": 3.170851797144264,
"grad_norm": 0.0021326704882085323,
"learning_rate": 7.318562284588872e-06,
"loss": 0.0002,
"step": 6440
},
{
"epoch": 3.1757754800590843,
"grad_norm": 0.0017906288849189878,
"learning_rate": 7.298867552929591e-06,
"loss": 0.0003,
"step": 6450
},
{
"epoch": 3.1806991629739043,
"grad_norm": 0.0026058172807097435,
"learning_rate": 7.27917282127031e-06,
"loss": 0.0002,
"step": 6460
},
{
"epoch": 3.185622845888725,
"grad_norm": 0.0017882023239508271,
"learning_rate": 7.259478089611029e-06,
"loss": 0.0002,
"step": 6470
},
{
"epoch": 3.1905465288035453,
"grad_norm": 0.0017937802476808429,
"learning_rate": 7.2397833579517485e-06,
"loss": 0.0002,
"step": 6480
},
{
"epoch": 3.1954702117183653,
"grad_norm": 0.0017613591626286507,
"learning_rate": 7.2200886262924675e-06,
"loss": 0.0002,
"step": 6490
},
{
"epoch": 3.200393894633186,
"grad_norm": 0.001861646305769682,
"learning_rate": 7.2003938946331866e-06,
"loss": 0.0018,
"step": 6500
},
{
"epoch": 3.205317577548006,
"grad_norm": 0.02838357537984848,
"learning_rate": 7.180699162973906e-06,
"loss": 0.0003,
"step": 6510
},
{
"epoch": 3.2102412604628263,
"grad_norm": 0.00175224500708282,
"learning_rate": 7.161004431314625e-06,
"loss": 0.0002,
"step": 6520
},
{
"epoch": 3.2151649433776464,
"grad_norm": 0.0017621772130951285,
"learning_rate": 7.141309699655343e-06,
"loss": 0.0002,
"step": 6530
},
{
"epoch": 3.220088626292467,
"grad_norm": 0.0019370117224752903,
"learning_rate": 7.121614967996061e-06,
"loss": 0.0002,
"step": 6540
},
{
"epoch": 3.225012309207287,
"grad_norm": 0.0031523550860583782,
"learning_rate": 7.10192023633678e-06,
"loss": 0.0002,
"step": 6550
},
{
"epoch": 3.2299359921221074,
"grad_norm": 0.0028889975510537624,
"learning_rate": 7.082225504677499e-06,
"loss": 0.0002,
"step": 6560
},
{
"epoch": 3.234859675036928,
"grad_norm": 0.0017210356891155243,
"learning_rate": 7.062530773018218e-06,
"loss": 0.0002,
"step": 6570
},
{
"epoch": 3.239783357951748,
"grad_norm": 0.005474100820720196,
"learning_rate": 7.042836041358937e-06,
"loss": 0.0002,
"step": 6580
},
{
"epoch": 3.2447070408665684,
"grad_norm": 0.002096637850627303,
"learning_rate": 7.0231413096996555e-06,
"loss": 0.0002,
"step": 6590
},
{
"epoch": 3.2496307237813884,
"grad_norm": 0.0020583397708833218,
"learning_rate": 7.003446578040375e-06,
"loss": 0.0002,
"step": 6600
},
{
"epoch": 3.254554406696209,
"grad_norm": 0.001692062127403915,
"learning_rate": 6.983751846381094e-06,
"loss": 0.0002,
"step": 6610
},
{
"epoch": 3.259478089611029,
"grad_norm": 0.0017217019340023398,
"learning_rate": 6.964057114721813e-06,
"loss": 0.0002,
"step": 6620
},
{
"epoch": 3.2644017725258494,
"grad_norm": 0.001690174569375813,
"learning_rate": 6.944362383062532e-06,
"loss": 0.0002,
"step": 6630
},
{
"epoch": 3.2693254554406694,
"grad_norm": 0.0017117226962000132,
"learning_rate": 6.924667651403251e-06,
"loss": 0.0002,
"step": 6640
},
{
"epoch": 3.27424913835549,
"grad_norm": 0.002361851977184415,
"learning_rate": 6.904972919743968e-06,
"loss": 0.0003,
"step": 6650
},
{
"epoch": 3.2791728212703104,
"grad_norm": 0.0022522679064422846,
"learning_rate": 6.885278188084687e-06,
"loss": 0.0002,
"step": 6660
},
{
"epoch": 3.2840965041851304,
"grad_norm": 0.001928364159539342,
"learning_rate": 6.865583456425406e-06,
"loss": 0.0002,
"step": 6670
},
{
"epoch": 3.289020187099951,
"grad_norm": 0.001716680359095335,
"learning_rate": 6.845888724766125e-06,
"loss": 0.0002,
"step": 6680
},
{
"epoch": 3.293943870014771,
"grad_norm": 0.10604394972324371,
"learning_rate": 6.826193993106844e-06,
"loss": 0.0004,
"step": 6690
},
{
"epoch": 3.2988675529295914,
"grad_norm": 0.001663457602262497,
"learning_rate": 6.8064992614475635e-06,
"loss": 0.0002,
"step": 6700
},
{
"epoch": 3.3037912358444115,
"grad_norm": 0.0023722369223833084,
"learning_rate": 6.7868045297882825e-06,
"loss": 0.0003,
"step": 6710
},
{
"epoch": 3.308714918759232,
"grad_norm": 0.0020810524001717567,
"learning_rate": 6.7671097981290016e-06,
"loss": 0.0002,
"step": 6720
},
{
"epoch": 3.313638601674052,
"grad_norm": 0.0019393692491576076,
"learning_rate": 6.747415066469721e-06,
"loss": 0.0002,
"step": 6730
},
{
"epoch": 3.3185622845888725,
"grad_norm": 0.0016419962048530579,
"learning_rate": 6.727720334810439e-06,
"loss": 0.0002,
"step": 6740
},
{
"epoch": 3.323485967503693,
"grad_norm": 0.0016390602104365826,
"learning_rate": 6.708025603151158e-06,
"loss": 0.0002,
"step": 6750
},
{
"epoch": 3.328409650418513,
"grad_norm": 0.0016691131750121713,
"learning_rate": 6.688330871491876e-06,
"loss": 0.0002,
"step": 6760
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.001631229417398572,
"learning_rate": 6.668636139832595e-06,
"loss": 0.0003,
"step": 6770
},
{
"epoch": 3.3382570162481535,
"grad_norm": 0.001860469812527299,
"learning_rate": 6.648941408173314e-06,
"loss": 0.0002,
"step": 6780
},
{
"epoch": 3.343180699162974,
"grad_norm": 0.0019199643284082413,
"learning_rate": 6.629246676514033e-06,
"loss": 0.0002,
"step": 6790
},
{
"epoch": 3.348104382077794,
"grad_norm": 0.0016363976756110787,
"learning_rate": 6.6095519448547515e-06,
"loss": 0.0002,
"step": 6800
},
{
"epoch": 3.3530280649926145,
"grad_norm": 0.0021544115152209997,
"learning_rate": 6.5898572131954705e-06,
"loss": 0.0002,
"step": 6810
},
{
"epoch": 3.3579517479074346,
"grad_norm": 0.0016355662373825908,
"learning_rate": 6.57016248153619e-06,
"loss": 0.0002,
"step": 6820
},
{
"epoch": 3.362875430822255,
"grad_norm": 0.0016973905730992556,
"learning_rate": 6.550467749876909e-06,
"loss": 0.0002,
"step": 6830
},
{
"epoch": 3.3677991137370755,
"grad_norm": 0.4839552938938141,
"learning_rate": 6.530773018217628e-06,
"loss": 0.0025,
"step": 6840
},
{
"epoch": 3.3727227966518956,
"grad_norm": 0.0016121608205139637,
"learning_rate": 6.511078286558347e-06,
"loss": 0.0002,
"step": 6850
},
{
"epoch": 3.377646479566716,
"grad_norm": 0.0016309829661622643,
"learning_rate": 6.491383554899066e-06,
"loss": 0.0002,
"step": 6860
},
{
"epoch": 3.382570162481536,
"grad_norm": 0.0015953374095261097,
"learning_rate": 6.471688823239783e-06,
"loss": 0.0002,
"step": 6870
},
{
"epoch": 3.3874938453963566,
"grad_norm": 0.0015872882213443518,
"learning_rate": 6.451994091580502e-06,
"loss": 0.0002,
"step": 6880
},
{
"epoch": 3.3924175283111766,
"grad_norm": 0.001950180740095675,
"learning_rate": 6.432299359921221e-06,
"loss": 0.0002,
"step": 6890
},
{
"epoch": 3.397341211225997,
"grad_norm": 0.001902850461192429,
"learning_rate": 6.41260462826194e-06,
"loss": 0.0002,
"step": 6900
},
{
"epoch": 3.402264894140817,
"grad_norm": 0.0018240628996863961,
"learning_rate": 6.392909896602659e-06,
"loss": 0.0002,
"step": 6910
},
{
"epoch": 3.4071885770556376,
"grad_norm": 0.0015961594181135297,
"learning_rate": 6.3732151649433785e-06,
"loss": 0.0002,
"step": 6920
},
{
"epoch": 3.412112259970458,
"grad_norm": 0.0015978438314050436,
"learning_rate": 6.3535204332840975e-06,
"loss": 0.0002,
"step": 6930
},
{
"epoch": 3.417035942885278,
"grad_norm": 0.002185217337682843,
"learning_rate": 6.333825701624816e-06,
"loss": 0.0002,
"step": 6940
},
{
"epoch": 3.4219596258000986,
"grad_norm": 0.0015885097673162818,
"learning_rate": 6.314130969965535e-06,
"loss": 0.0002,
"step": 6950
},
{
"epoch": 3.4268833087149186,
"grad_norm": 0.0017295465804636478,
"learning_rate": 6.294436238306254e-06,
"loss": 0.0002,
"step": 6960
},
{
"epoch": 3.431806991629739,
"grad_norm": 0.0020427897106856108,
"learning_rate": 6.274741506646972e-06,
"loss": 0.0002,
"step": 6970
},
{
"epoch": 3.436730674544559,
"grad_norm": 0.001587416511029005,
"learning_rate": 6.255046774987691e-06,
"loss": 0.0002,
"step": 6980
},
{
"epoch": 3.4416543574593796,
"grad_norm": 0.0015563281485810876,
"learning_rate": 6.23535204332841e-06,
"loss": 0.0002,
"step": 6990
},
{
"epoch": 3.4465780403741997,
"grad_norm": 0.0019551627337932587,
"learning_rate": 6.215657311669128e-06,
"loss": 0.0002,
"step": 7000
},
{
"epoch": 3.45150172328902,
"grad_norm": 0.00192571512889117,
"learning_rate": 6.1959625800098474e-06,
"loss": 0.0002,
"step": 7010
},
{
"epoch": 3.4564254062038406,
"grad_norm": 0.0022392261307686567,
"learning_rate": 6.1762678483505665e-06,
"loss": 0.0002,
"step": 7020
},
{
"epoch": 3.4613490891186607,
"grad_norm": 0.0015274557517841458,
"learning_rate": 6.1565731166912856e-06,
"loss": 0.0002,
"step": 7030
},
{
"epoch": 3.466272772033481,
"grad_norm": 0.0025689592584967613,
"learning_rate": 6.136878385032005e-06,
"loss": 0.0528,
"step": 7040
},
{
"epoch": 3.471196454948301,
"grad_norm": 0.0017388068372383714,
"learning_rate": 6.117183653372724e-06,
"loss": 0.0002,
"step": 7050
},
{
"epoch": 3.4761201378631217,
"grad_norm": 0.002218118868768215,
"learning_rate": 6.097488921713443e-06,
"loss": 0.0002,
"step": 7060
},
{
"epoch": 3.4810438207779417,
"grad_norm": 0.0015678524505347013,
"learning_rate": 6.077794190054162e-06,
"loss": 0.0002,
"step": 7070
},
{
"epoch": 3.485967503692762,
"grad_norm": 0.001805935869924724,
"learning_rate": 6.058099458394879e-06,
"loss": 0.0002,
"step": 7080
},
{
"epoch": 3.4908911866075822,
"grad_norm": 0.0015670402208343148,
"learning_rate": 6.038404726735598e-06,
"loss": 0.0004,
"step": 7090
},
{
"epoch": 3.4958148695224027,
"grad_norm": 0.0016316076507791877,
"learning_rate": 6.018709995076317e-06,
"loss": 0.0002,
"step": 7100
},
{
"epoch": 3.500738552437223,
"grad_norm": 0.0015326079446822405,
"learning_rate": 5.999015263417036e-06,
"loss": 0.0002,
"step": 7110
},
{
"epoch": 3.5056622353520432,
"grad_norm": 0.0019040625775232911,
"learning_rate": 5.979320531757755e-06,
"loss": 0.0002,
"step": 7120
},
{
"epoch": 3.5105859182668637,
"grad_norm": 0.0018364688148722053,
"learning_rate": 5.9596258000984744e-06,
"loss": 0.0002,
"step": 7130
},
{
"epoch": 3.5155096011816838,
"grad_norm": 0.001652045757509768,
"learning_rate": 5.9399310684391935e-06,
"loss": 0.028,
"step": 7140
},
{
"epoch": 3.5204332840965042,
"grad_norm": 0.0017825603717938066,
"learning_rate": 5.920236336779912e-06,
"loss": 0.0002,
"step": 7150
},
{
"epoch": 3.5253569670113247,
"grad_norm": 0.0015865016030147672,
"learning_rate": 5.900541605120631e-06,
"loss": 0.0002,
"step": 7160
},
{
"epoch": 3.5302806499261448,
"grad_norm": 0.002345743589103222,
"learning_rate": 5.88084687346135e-06,
"loss": 0.0002,
"step": 7170
},
{
"epoch": 3.535204332840965,
"grad_norm": 0.001753124175593257,
"learning_rate": 5.861152141802069e-06,
"loss": 0.0002,
"step": 7180
},
{
"epoch": 3.5401280157557853,
"grad_norm": 0.0016645858995616436,
"learning_rate": 5.841457410142787e-06,
"loss": 0.0002,
"step": 7190
},
{
"epoch": 3.5450516986706058,
"grad_norm": 0.0017800434725359082,
"learning_rate": 5.821762678483506e-06,
"loss": 0.0002,
"step": 7200
},
{
"epoch": 3.549975381585426,
"grad_norm": 0.0025877405423671007,
"learning_rate": 5.802067946824224e-06,
"loss": 0.0002,
"step": 7210
},
{
"epoch": 3.5548990645002463,
"grad_norm": 0.0018426472088322043,
"learning_rate": 5.782373215164943e-06,
"loss": 0.0002,
"step": 7220
},
{
"epoch": 3.5598227474150663,
"grad_norm": 0.0019229879835620522,
"learning_rate": 5.7626784835056625e-06,
"loss": 0.0018,
"step": 7230
},
{
"epoch": 3.564746430329887,
"grad_norm": 0.001863017212599516,
"learning_rate": 5.7429837518463815e-06,
"loss": 0.0002,
"step": 7240
},
{
"epoch": 3.5696701132447073,
"grad_norm": 0.0015497950371354818,
"learning_rate": 5.7232890201871006e-06,
"loss": 0.0002,
"step": 7250
},
{
"epoch": 3.5745937961595273,
"grad_norm": 0.0015739004593342543,
"learning_rate": 5.70359428852782e-06,
"loss": 0.0002,
"step": 7260
},
{
"epoch": 3.5795174790743474,
"grad_norm": 0.0018886280013248324,
"learning_rate": 5.683899556868539e-06,
"loss": 0.0002,
"step": 7270
},
{
"epoch": 3.584441161989168,
"grad_norm": 0.0020600894931703806,
"learning_rate": 5.664204825209258e-06,
"loss": 0.0002,
"step": 7280
},
{
"epoch": 3.5893648449039883,
"grad_norm": 0.0017813716549426317,
"learning_rate": 5.644510093549976e-06,
"loss": 0.0002,
"step": 7290
},
{
"epoch": 3.5942885278188084,
"grad_norm": 0.0014752724673599005,
"learning_rate": 5.624815361890694e-06,
"loss": 0.0003,
"step": 7300
},
{
"epoch": 3.599212210733629,
"grad_norm": 0.0017788780387490988,
"learning_rate": 5.605120630231413e-06,
"loss": 0.0002,
"step": 7310
},
{
"epoch": 3.604135893648449,
"grad_norm": 0.0017315271543338895,
"learning_rate": 5.585425898572132e-06,
"loss": 0.0002,
"step": 7320
},
{
"epoch": 3.6090595765632694,
"grad_norm": 0.0015107191866263747,
"learning_rate": 5.565731166912851e-06,
"loss": 0.0212,
"step": 7330
},
{
"epoch": 3.61398325947809,
"grad_norm": 0.0015164316864684224,
"learning_rate": 5.54603643525357e-06,
"loss": 0.0002,
"step": 7340
},
{
"epoch": 3.61890694239291,
"grad_norm": 0.0015602256171405315,
"learning_rate": 5.526341703594289e-06,
"loss": 0.0002,
"step": 7350
},
{
"epoch": 3.62383062530773,
"grad_norm": 0.0015156749868765473,
"learning_rate": 5.506646971935008e-06,
"loss": 0.0002,
"step": 7360
},
{
"epoch": 3.6287543082225504,
"grad_norm": 0.0015055168187245727,
"learning_rate": 5.486952240275727e-06,
"loss": 0.0002,
"step": 7370
},
{
"epoch": 3.633677991137371,
"grad_norm": 0.0020491585601121187,
"learning_rate": 5.467257508616446e-06,
"loss": 0.0002,
"step": 7380
},
{
"epoch": 3.638601674052191,
"grad_norm": 0.0016383701004087925,
"learning_rate": 5.447562776957165e-06,
"loss": 0.0002,
"step": 7390
},
{
"epoch": 3.6435253569670114,
"grad_norm": 0.0015544986817985773,
"learning_rate": 5.427868045297884e-06,
"loss": 0.0003,
"step": 7400
},
{
"epoch": 3.6484490398818314,
"grad_norm": 0.0018146246438845992,
"learning_rate": 5.408173313638601e-06,
"loss": 0.0002,
"step": 7410
},
{
"epoch": 3.653372722796652,
"grad_norm": 0.001972957979887724,
"learning_rate": 5.38847858197932e-06,
"loss": 0.0002,
"step": 7420
},
{
"epoch": 3.6582964057114724,
"grad_norm": 0.0021671319846063852,
"learning_rate": 5.368783850320039e-06,
"loss": 0.0002,
"step": 7430
},
{
"epoch": 3.6632200886262924,
"grad_norm": 0.002430422930046916,
"learning_rate": 5.349089118660758e-06,
"loss": 0.0002,
"step": 7440
},
{
"epoch": 3.6681437715411125,
"grad_norm": 0.0015026788460090756,
"learning_rate": 5.3293943870014775e-06,
"loss": 0.0002,
"step": 7450
},
{
"epoch": 3.673067454455933,
"grad_norm": 0.0068082925863564014,
"learning_rate": 5.3096996553421965e-06,
"loss": 0.0002,
"step": 7460
},
{
"epoch": 3.6779911373707534,
"grad_norm": 0.001487517962232232,
"learning_rate": 5.290004923682916e-06,
"loss": 0.0002,
"step": 7470
},
{
"epoch": 3.6829148202855735,
"grad_norm": 0.006023659370839596,
"learning_rate": 5.270310192023635e-06,
"loss": 0.0002,
"step": 7480
},
{
"epoch": 3.687838503200394,
"grad_norm": 0.0016626460710540414,
"learning_rate": 5.250615460364354e-06,
"loss": 0.0002,
"step": 7490
},
{
"epoch": 3.692762186115214,
"grad_norm": 0.0018089113291352987,
"learning_rate": 5.230920728705072e-06,
"loss": 0.0002,
"step": 7500
},
{
"epoch": 3.6976858690300345,
"grad_norm": 0.001456581405363977,
"learning_rate": 5.21122599704579e-06,
"loss": 0.0002,
"step": 7510
},
{
"epoch": 3.702609551944855,
"grad_norm": 0.0014255401911213994,
"learning_rate": 5.191531265386509e-06,
"loss": 0.0003,
"step": 7520
},
{
"epoch": 3.707533234859675,
"grad_norm": 0.00147194042801857,
"learning_rate": 5.171836533727228e-06,
"loss": 0.0002,
"step": 7530
},
{
"epoch": 3.712456917774495,
"grad_norm": 0.001444322639144957,
"learning_rate": 5.152141802067947e-06,
"loss": 0.0002,
"step": 7540
},
{
"epoch": 3.7173806006893155,
"grad_norm": 0.001467820955440402,
"learning_rate": 5.132447070408666e-06,
"loss": 0.0002,
"step": 7550
},
{
"epoch": 3.722304283604136,
"grad_norm": 0.0020592319779098034,
"learning_rate": 5.1127523387493846e-06,
"loss": 0.0002,
"step": 7560
},
{
"epoch": 3.727227966518956,
"grad_norm": 0.0017327726818621159,
"learning_rate": 5.093057607090104e-06,
"loss": 0.0002,
"step": 7570
},
{
"epoch": 3.7321516494337765,
"grad_norm": 0.0017492349725216627,
"learning_rate": 5.073362875430823e-06,
"loss": 0.0002,
"step": 7580
},
{
"epoch": 3.7370753323485966,
"grad_norm": 0.00662592425942421,
"learning_rate": 5.053668143771542e-06,
"loss": 0.0002,
"step": 7590
},
{
"epoch": 3.741999015263417,
"grad_norm": 0.001524804625660181,
"learning_rate": 5.033973412112261e-06,
"loss": 0.0002,
"step": 7600
},
{
"epoch": 3.7469226981782375,
"grad_norm": 0.0014649020740762353,
"learning_rate": 5.01427868045298e-06,
"loss": 0.0002,
"step": 7610
},
{
"epoch": 3.7518463810930576,
"grad_norm": 0.0014713428681716323,
"learning_rate": 4.994583948793698e-06,
"loss": 0.0005,
"step": 7620
},
{
"epoch": 3.756770064007878,
"grad_norm": 0.0017360311467200518,
"learning_rate": 4.974889217134417e-06,
"loss": 0.0049,
"step": 7630
},
{
"epoch": 3.761693746922698,
"grad_norm": 0.0014296614099293947,
"learning_rate": 4.955194485475136e-06,
"loss": 0.0002,
"step": 7640
},
{
"epoch": 3.7666174298375186,
"grad_norm": 0.13573376834392548,
"learning_rate": 4.935499753815854e-06,
"loss": 0.0003,
"step": 7650
},
{
"epoch": 3.7715411127523386,
"grad_norm": 0.0024039391428232193,
"learning_rate": 4.915805022156573e-06,
"loss": 0.0002,
"step": 7660
},
{
"epoch": 3.776464795667159,
"grad_norm": 0.0014649844961240888,
"learning_rate": 4.8961102904972925e-06,
"loss": 0.0002,
"step": 7670
},
{
"epoch": 3.781388478581979,
"grad_norm": 0.0016867737285792828,
"learning_rate": 4.8764155588380115e-06,
"loss": 0.0002,
"step": 7680
},
{
"epoch": 3.7863121614967996,
"grad_norm": 0.001405984628945589,
"learning_rate": 4.856720827178731e-06,
"loss": 0.0002,
"step": 7690
},
{
"epoch": 3.79123584441162,
"grad_norm": 0.0014443190302699804,
"learning_rate": 4.837026095519449e-06,
"loss": 0.0002,
"step": 7700
},
{
"epoch": 3.79615952732644,
"grad_norm": 0.0016520627541467547,
"learning_rate": 4.817331363860168e-06,
"loss": 0.0003,
"step": 7710
},
{
"epoch": 3.8010832102412606,
"grad_norm": 0.01564246602356434,
"learning_rate": 4.797636632200887e-06,
"loss": 0.0002,
"step": 7720
},
{
"epoch": 3.8060068931560807,
"grad_norm": 0.0014039729721844196,
"learning_rate": 4.777941900541606e-06,
"loss": 0.0002,
"step": 7730
},
{
"epoch": 3.810930576070901,
"grad_norm": 0.0015944570768624544,
"learning_rate": 4.758247168882324e-06,
"loss": 0.0002,
"step": 7740
},
{
"epoch": 3.815854258985721,
"grad_norm": 0.0016037479508668184,
"learning_rate": 4.738552437223043e-06,
"loss": 0.0002,
"step": 7750
},
{
"epoch": 3.8207779419005417,
"grad_norm": 0.0016006861114874482,
"learning_rate": 4.718857705563762e-06,
"loss": 0.0003,
"step": 7760
},
{
"epoch": 3.8257016248153617,
"grad_norm": 0.0014518728712573647,
"learning_rate": 4.6991629739044805e-06,
"loss": 0.0002,
"step": 7770
},
{
"epoch": 3.830625307730182,
"grad_norm": 0.0020076241344213486,
"learning_rate": 4.6794682422451996e-06,
"loss": 0.0002,
"step": 7780
},
{
"epoch": 3.8355489906450027,
"grad_norm": 0.0013705624733120203,
"learning_rate": 4.659773510585919e-06,
"loss": 0.0002,
"step": 7790
},
{
"epoch": 3.8404726735598227,
"grad_norm": 0.0015762551920488477,
"learning_rate": 4.640078778926638e-06,
"loss": 0.0002,
"step": 7800
},
{
"epoch": 3.845396356474643,
"grad_norm": 0.001401646644808352,
"learning_rate": 4.620384047267356e-06,
"loss": 0.0023,
"step": 7810
},
{
"epoch": 3.850320039389463,
"grad_norm": 0.0014297078596428037,
"learning_rate": 4.600689315608075e-06,
"loss": 0.0002,
"step": 7820
},
{
"epoch": 3.8552437223042837,
"grad_norm": 0.001564424717798829,
"learning_rate": 4.580994583948794e-06,
"loss": 0.0002,
"step": 7830
},
{
"epoch": 3.8601674052191037,
"grad_norm": 0.0016091325087472796,
"learning_rate": 4.561299852289513e-06,
"loss": 0.0427,
"step": 7840
},
{
"epoch": 3.865091088133924,
"grad_norm": 0.0013987331185489893,
"learning_rate": 4.541605120630232e-06,
"loss": 0.0002,
"step": 7850
},
{
"epoch": 3.8700147710487443,
"grad_norm": 0.0013764946488663554,
"learning_rate": 4.521910388970951e-06,
"loss": 0.0041,
"step": 7860
},
{
"epoch": 3.8749384539635647,
"grad_norm": 0.0014972817152738571,
"learning_rate": 4.502215657311669e-06,
"loss": 0.0002,
"step": 7870
},
{
"epoch": 3.879862136878385,
"grad_norm": 0.002630974631756544,
"learning_rate": 4.4825209256523884e-06,
"loss": 0.0002,
"step": 7880
},
{
"epoch": 3.8847858197932053,
"grad_norm": 0.0015066839987412095,
"learning_rate": 4.4628261939931075e-06,
"loss": 0.0002,
"step": 7890
},
{
"epoch": 3.8897095027080257,
"grad_norm": 0.0013690165942534804,
"learning_rate": 4.4431314623338265e-06,
"loss": 0.0002,
"step": 7900
},
{
"epoch": 3.8946331856228458,
"grad_norm": 0.0015422162832692266,
"learning_rate": 4.423436730674545e-06,
"loss": 0.0002,
"step": 7910
},
{
"epoch": 3.8995568685376663,
"grad_norm": 0.0013625508872792125,
"learning_rate": 4.403741999015264e-06,
"loss": 0.0002,
"step": 7920
},
{
"epoch": 3.9044805514524867,
"grad_norm": 0.0013615674106404185,
"learning_rate": 4.384047267355983e-06,
"loss": 0.0002,
"step": 7930
},
{
"epoch": 3.9094042343673068,
"grad_norm": 0.0013642277335748076,
"learning_rate": 4.364352535696701e-06,
"loss": 0.0006,
"step": 7940
},
{
"epoch": 3.914327917282127,
"grad_norm": 0.0018096421845257282,
"learning_rate": 4.34465780403742e-06,
"loss": 0.0002,
"step": 7950
},
{
"epoch": 3.9192516001969473,
"grad_norm": 0.0016002668999135494,
"learning_rate": 4.324963072378139e-06,
"loss": 0.0002,
"step": 7960
},
{
"epoch": 3.9241752831117678,
"grad_norm": 0.0013518129708245397,
"learning_rate": 4.305268340718857e-06,
"loss": 0.0002,
"step": 7970
},
{
"epoch": 3.929098966026588,
"grad_norm": 0.0013844161294400692,
"learning_rate": 4.2855736090595765e-06,
"loss": 0.0002,
"step": 7980
},
{
"epoch": 3.9340226489414083,
"grad_norm": 0.0013579302467405796,
"learning_rate": 4.2658788774002955e-06,
"loss": 0.0002,
"step": 7990
},
{
"epoch": 3.9389463318562283,
"grad_norm": 0.00134462455753237,
"learning_rate": 4.2461841457410146e-06,
"loss": 0.0002,
"step": 8000
},
{
"epoch": 3.943870014771049,
"grad_norm": 0.0018355045467615128,
"learning_rate": 4.226489414081734e-06,
"loss": 0.0002,
"step": 8010
},
{
"epoch": 3.9487936976858693,
"grad_norm": 0.001347012585029006,
"learning_rate": 4.206794682422453e-06,
"loss": 0.0002,
"step": 8020
},
{
"epoch": 3.9537173806006893,
"grad_norm": 0.0016463873907923698,
"learning_rate": 4.187099950763171e-06,
"loss": 0.0002,
"step": 8030
},
{
"epoch": 3.9586410635155094,
"grad_norm": 0.0013524794485419989,
"learning_rate": 4.16740521910389e-06,
"loss": 0.0717,
"step": 8040
},
{
"epoch": 3.96356474643033,
"grad_norm": 0.0015625300584360957,
"learning_rate": 4.147710487444609e-06,
"loss": 0.0002,
"step": 8050
},
{
"epoch": 3.9684884293451503,
"grad_norm": 0.001937661785632372,
"learning_rate": 4.128015755785328e-06,
"loss": 0.0002,
"step": 8060
},
{
"epoch": 3.9734121122599704,
"grad_norm": 0.0013714928645640612,
"learning_rate": 4.108321024126047e-06,
"loss": 0.0002,
"step": 8070
},
{
"epoch": 3.978335795174791,
"grad_norm": 0.001366511220112443,
"learning_rate": 4.088626292466765e-06,
"loss": 0.0002,
"step": 8080
},
{
"epoch": 3.983259478089611,
"grad_norm": 0.005632834974676371,
"learning_rate": 4.068931560807484e-06,
"loss": 0.0002,
"step": 8090
},
{
"epoch": 3.9881831610044314,
"grad_norm": 0.0016822253819555044,
"learning_rate": 4.0492368291482034e-06,
"loss": 0.0259,
"step": 8100
},
{
"epoch": 3.993106843919252,
"grad_norm": 0.0016814577393233776,
"learning_rate": 4.0295420974889225e-06,
"loss": 0.0002,
"step": 8110
},
{
"epoch": 3.998030526834072,
"grad_norm": 0.0015251173172146082,
"learning_rate": 4.009847365829641e-06,
"loss": 0.0002,
"step": 8120
},
{
"epoch": 4.0,
"eval_accuracy": 0.9982560167422393,
"eval_loss": 0.004296807572245598,
"eval_runtime": 127.4433,
"eval_samples_per_second": 22.496,
"eval_steps_per_second": 2.817,
"step": 8124
},
{
"epoch": 4.002954209748892,
"grad_norm": 0.0015737974317744374,
"learning_rate": 3.99015263417036e-06,
"loss": 0.0002,
"step": 8130
},
{
"epoch": 4.007877892663712,
"grad_norm": 0.0015378405805677176,
"learning_rate": 3.970457902511079e-06,
"loss": 0.0001,
"step": 8140
},
{
"epoch": 4.012801575578533,
"grad_norm": 0.0023324452340602875,
"learning_rate": 3.950763170851797e-06,
"loss": 0.0002,
"step": 8150
},
{
"epoch": 4.017725258493353,
"grad_norm": 0.0015311094466596842,
"learning_rate": 3.931068439192516e-06,
"loss": 0.0002,
"step": 8160
},
{
"epoch": 4.022648941408173,
"grad_norm": 0.05710865184664726,
"learning_rate": 3.911373707533235e-06,
"loss": 0.0004,
"step": 8170
},
{
"epoch": 4.0275726243229935,
"grad_norm": 0.0013750927755609155,
"learning_rate": 3.891678975873954e-06,
"loss": 0.0002,
"step": 8180
},
{
"epoch": 4.032496307237814,
"grad_norm": 0.0015017867553979158,
"learning_rate": 3.871984244214672e-06,
"loss": 0.0002,
"step": 8190
},
{
"epoch": 4.037419990152634,
"grad_norm": 0.0018651321297511458,
"learning_rate": 3.8522895125553915e-06,
"loss": 0.0002,
"step": 8200
},
{
"epoch": 4.042343673067455,
"grad_norm": 0.0015941975871101022,
"learning_rate": 3.8325947808961105e-06,
"loss": 0.0002,
"step": 8210
},
{
"epoch": 4.0472673559822745,
"grad_norm": 0.0018877830589190125,
"learning_rate": 3.8129000492368296e-06,
"loss": 0.0002,
"step": 8220
},
{
"epoch": 4.052191038897095,
"grad_norm": 0.0013936322648078203,
"learning_rate": 3.7932053175775486e-06,
"loss": 0.0002,
"step": 8230
},
{
"epoch": 4.0571147218119155,
"grad_norm": 0.0015832120552659035,
"learning_rate": 3.773510585918267e-06,
"loss": 0.0002,
"step": 8240
},
{
"epoch": 4.062038404726736,
"grad_norm": 0.0013590282760560513,
"learning_rate": 3.753815854258986e-06,
"loss": 0.0002,
"step": 8250
},
{
"epoch": 4.0669620876415555,
"grad_norm": 0.0016734645469114184,
"learning_rate": 3.734121122599705e-06,
"loss": 0.0002,
"step": 8260
},
{
"epoch": 4.071885770556376,
"grad_norm": 0.001429893309250474,
"learning_rate": 3.7144263909404236e-06,
"loss": 0.0002,
"step": 8270
},
{
"epoch": 4.0768094534711965,
"grad_norm": 0.0017754683503881097,
"learning_rate": 3.6947316592811427e-06,
"loss": 0.0002,
"step": 8280
},
{
"epoch": 4.081733136386017,
"grad_norm": 0.0013153937179595232,
"learning_rate": 3.6750369276218617e-06,
"loss": 0.0002,
"step": 8290
},
{
"epoch": 4.0866568193008375,
"grad_norm": 0.0013936875620856881,
"learning_rate": 3.65534219596258e-06,
"loss": 0.0002,
"step": 8300
},
{
"epoch": 4.091580502215657,
"grad_norm": 0.007932737469673157,
"learning_rate": 3.635647464303299e-06,
"loss": 0.0002,
"step": 8310
},
{
"epoch": 4.0965041851304775,
"grad_norm": 0.0017327237874269485,
"learning_rate": 3.615952732644018e-06,
"loss": 0.0002,
"step": 8320
},
{
"epoch": 4.101427868045298,
"grad_norm": 0.0019220160320401192,
"learning_rate": 3.596258000984737e-06,
"loss": 0.0002,
"step": 8330
},
{
"epoch": 4.1063515509601185,
"grad_norm": 0.0012822451535612345,
"learning_rate": 3.576563269325456e-06,
"loss": 0.0002,
"step": 8340
},
{
"epoch": 4.111275233874938,
"grad_norm": 0.0037715635262429714,
"learning_rate": 3.5568685376661744e-06,
"loss": 0.0002,
"step": 8350
},
{
"epoch": 4.116198916789759,
"grad_norm": 0.0015437414404004812,
"learning_rate": 3.5371738060068934e-06,
"loss": 0.0002,
"step": 8360
},
{
"epoch": 4.121122599704579,
"grad_norm": 0.0012796347727999091,
"learning_rate": 3.5174790743476125e-06,
"loss": 0.0001,
"step": 8370
},
{
"epoch": 4.1260462826193995,
"grad_norm": 0.0013989837607368827,
"learning_rate": 3.497784342688331e-06,
"loss": 0.0033,
"step": 8380
},
{
"epoch": 4.13096996553422,
"grad_norm": 0.0012871460057795048,
"learning_rate": 3.47808961102905e-06,
"loss": 0.0001,
"step": 8390
},
{
"epoch": 4.13589364844904,
"grad_norm": 0.0014854512410238385,
"learning_rate": 3.4583948793697692e-06,
"loss": 0.0002,
"step": 8400
},
{
"epoch": 4.14081733136386,
"grad_norm": 0.0012728660367429256,
"learning_rate": 3.4387001477104874e-06,
"loss": 0.0002,
"step": 8410
},
{
"epoch": 4.145741014278681,
"grad_norm": 0.0012776675866916776,
"learning_rate": 3.4190054160512065e-06,
"loss": 0.0002,
"step": 8420
},
{
"epoch": 4.150664697193501,
"grad_norm": 0.0016903368523344398,
"learning_rate": 3.3993106843919255e-06,
"loss": 0.0002,
"step": 8430
},
{
"epoch": 4.155588380108321,
"grad_norm": 0.004182947799563408,
"learning_rate": 3.3796159527326446e-06,
"loss": 0.0003,
"step": 8440
},
{
"epoch": 4.160512063023141,
"grad_norm": 0.0012582590570673347,
"learning_rate": 3.3599212210733632e-06,
"loss": 0.0002,
"step": 8450
},
{
"epoch": 4.165435745937962,
"grad_norm": 0.0017205028561875224,
"learning_rate": 3.340226489414082e-06,
"loss": 0.0002,
"step": 8460
},
{
"epoch": 4.170359428852782,
"grad_norm": 0.0015602920902892947,
"learning_rate": 3.320531757754801e-06,
"loss": 0.0002,
"step": 8470
},
{
"epoch": 4.175283111767603,
"grad_norm": 0.0013148763682693243,
"learning_rate": 3.3008370260955196e-06,
"loss": 0.0001,
"step": 8480
},
{
"epoch": 4.180206794682422,
"grad_norm": 0.001262652687728405,
"learning_rate": 3.2811422944362386e-06,
"loss": 0.0002,
"step": 8490
},
{
"epoch": 4.185130477597243,
"grad_norm": 0.00150257907807827,
"learning_rate": 3.2614475627769577e-06,
"loss": 0.0002,
"step": 8500
},
{
"epoch": 4.190054160512063,
"grad_norm": 0.0014894099440425634,
"learning_rate": 3.241752831117676e-06,
"loss": 0.0002,
"step": 8510
},
{
"epoch": 4.194977843426884,
"grad_norm": 0.0017964887665584683,
"learning_rate": 3.222058099458395e-06,
"loss": 0.0002,
"step": 8520
},
{
"epoch": 4.199901526341703,
"grad_norm": 0.0014780315104871988,
"learning_rate": 3.202363367799114e-06,
"loss": 0.0001,
"step": 8530
},
{
"epoch": 4.204825209256524,
"grad_norm": 0.0012969339732080698,
"learning_rate": 3.182668636139833e-06,
"loss": 0.0001,
"step": 8540
},
{
"epoch": 4.209748892171344,
"grad_norm": 0.0014950993936508894,
"learning_rate": 3.1629739044805517e-06,
"loss": 0.0002,
"step": 8550
},
{
"epoch": 4.214672575086165,
"grad_norm": 0.0013700306881219149,
"learning_rate": 3.1432791728212707e-06,
"loss": 0.0002,
"step": 8560
},
{
"epoch": 4.219596258000985,
"grad_norm": 0.001990771619603038,
"learning_rate": 3.1235844411619894e-06,
"loss": 0.0002,
"step": 8570
},
{
"epoch": 4.224519940915805,
"grad_norm": 0.0013008847599849105,
"learning_rate": 3.103889709502708e-06,
"loss": 0.0002,
"step": 8580
},
{
"epoch": 4.229443623830625,
"grad_norm": 0.0012881122529506683,
"learning_rate": 3.084194977843427e-06,
"loss": 0.0002,
"step": 8590
},
{
"epoch": 4.234367306745446,
"grad_norm": 0.0016159663209691644,
"learning_rate": 3.064500246184146e-06,
"loss": 0.025,
"step": 8600
},
{
"epoch": 4.239290989660266,
"grad_norm": 0.0012733545154333115,
"learning_rate": 3.044805514524865e-06,
"loss": 0.0002,
"step": 8610
},
{
"epoch": 4.244214672575086,
"grad_norm": 0.001908605801872909,
"learning_rate": 3.0251107828655834e-06,
"loss": 0.0002,
"step": 8620
},
{
"epoch": 4.249138355489906,
"grad_norm": 0.0013102259254083037,
"learning_rate": 3.0054160512063024e-06,
"loss": 0.0002,
"step": 8630
},
{
"epoch": 4.254062038404727,
"grad_norm": 0.0015227696858346462,
"learning_rate": 2.9857213195470215e-06,
"loss": 0.0002,
"step": 8640
},
{
"epoch": 4.258985721319547,
"grad_norm": 0.001237130374647677,
"learning_rate": 2.96602658788774e-06,
"loss": 0.0002,
"step": 8650
},
{
"epoch": 4.263909404234368,
"grad_norm": 0.0012896446278318763,
"learning_rate": 2.946331856228459e-06,
"loss": 0.0001,
"step": 8660
},
{
"epoch": 4.268833087149187,
"grad_norm": 0.0012958886800333858,
"learning_rate": 2.9266371245691782e-06,
"loss": 0.0001,
"step": 8670
},
{
"epoch": 4.273756770064008,
"grad_norm": 0.011026582680642605,
"learning_rate": 2.9069423929098965e-06,
"loss": 0.0002,
"step": 8680
},
{
"epoch": 4.278680452978828,
"grad_norm": 0.0022566046100109816,
"learning_rate": 2.8872476612506155e-06,
"loss": 0.0002,
"step": 8690
},
{
"epoch": 4.283604135893649,
"grad_norm": 0.0012606906238943338,
"learning_rate": 2.8675529295913346e-06,
"loss": 0.0002,
"step": 8700
},
{
"epoch": 4.288527818808468,
"grad_norm": 0.0012501559685915709,
"learning_rate": 2.8478581979320536e-06,
"loss": 0.0002,
"step": 8710
},
{
"epoch": 4.293451501723289,
"grad_norm": 0.0015419054543599486,
"learning_rate": 2.8281634662727727e-06,
"loss": 0.0002,
"step": 8720
},
{
"epoch": 4.298375184638109,
"grad_norm": 0.0012369498144835234,
"learning_rate": 2.808468734613491e-06,
"loss": 0.0002,
"step": 8730
},
{
"epoch": 4.30329886755293,
"grad_norm": 0.001716209459118545,
"learning_rate": 2.78877400295421e-06,
"loss": 0.0001,
"step": 8740
},
{
"epoch": 4.30822255046775,
"grad_norm": 0.0012448432389646769,
"learning_rate": 2.769079271294929e-06,
"loss": 0.0001,
"step": 8750
},
{
"epoch": 4.31314623338257,
"grad_norm": 0.0012436291435733438,
"learning_rate": 2.7493845396356476e-06,
"loss": 0.0002,
"step": 8760
},
{
"epoch": 4.31806991629739,
"grad_norm": 0.0013124769320711493,
"learning_rate": 2.7296898079763667e-06,
"loss": 0.0002,
"step": 8770
},
{
"epoch": 4.322993599212211,
"grad_norm": 0.0014658995205536485,
"learning_rate": 2.7099950763170853e-06,
"loss": 0.0002,
"step": 8780
},
{
"epoch": 4.327917282127031,
"grad_norm": 0.00151143129914999,
"learning_rate": 2.690300344657804e-06,
"loss": 0.0002,
"step": 8790
},
{
"epoch": 4.332840965041851,
"grad_norm": 0.0013197060907259583,
"learning_rate": 2.670605612998523e-06,
"loss": 0.0002,
"step": 8800
},
{
"epoch": 4.337764647956671,
"grad_norm": 0.0024781296961009502,
"learning_rate": 2.650910881339242e-06,
"loss": 0.0006,
"step": 8810
},
{
"epoch": 4.342688330871492,
"grad_norm": 0.002245939103886485,
"learning_rate": 2.631216149679961e-06,
"loss": 0.0002,
"step": 8820
},
{
"epoch": 4.347612013786312,
"grad_norm": 0.0012940160231664777,
"learning_rate": 2.6115214180206798e-06,
"loss": 0.0002,
"step": 8830
},
{
"epoch": 4.352535696701133,
"grad_norm": 0.0018242484657093883,
"learning_rate": 2.5918266863613984e-06,
"loss": 0.0002,
"step": 8840
},
{
"epoch": 4.357459379615952,
"grad_norm": 0.0016134735196828842,
"learning_rate": 2.5721319547021175e-06,
"loss": 0.0002,
"step": 8850
},
{
"epoch": 4.362383062530773,
"grad_norm": 0.0012313901679590344,
"learning_rate": 2.552437223042836e-06,
"loss": 0.0001,
"step": 8860
},
{
"epoch": 4.367306745445593,
"grad_norm": 0.0012789260363206267,
"learning_rate": 2.532742491383555e-06,
"loss": 0.0002,
"step": 8870
},
{
"epoch": 4.372230428360414,
"grad_norm": 0.00142171629704535,
"learning_rate": 2.513047759724274e-06,
"loss": 0.0001,
"step": 8880
},
{
"epoch": 4.3771541112752335,
"grad_norm": 0.001423663692548871,
"learning_rate": 2.493353028064993e-06,
"loss": 0.0001,
"step": 8890
},
{
"epoch": 4.382077794190054,
"grad_norm": 0.0012176020536571741,
"learning_rate": 2.473658296405712e-06,
"loss": 0.0002,
"step": 8900
},
{
"epoch": 4.387001477104874,
"grad_norm": 0.3053596615791321,
"learning_rate": 2.4539635647464305e-06,
"loss": 0.001,
"step": 8910
},
{
"epoch": 4.391925160019695,
"grad_norm": 0.0014659055741503835,
"learning_rate": 2.4342688330871496e-06,
"loss": 0.0002,
"step": 8920
},
{
"epoch": 4.396848842934515,
"grad_norm": 0.0012392625212669373,
"learning_rate": 2.4145741014278682e-06,
"loss": 0.0002,
"step": 8930
},
{
"epoch": 4.401772525849335,
"grad_norm": 0.0013947734842076898,
"learning_rate": 2.3948793697685873e-06,
"loss": 0.0001,
"step": 8940
},
{
"epoch": 4.4066962087641555,
"grad_norm": 0.0014435608172789216,
"learning_rate": 2.375184638109306e-06,
"loss": 0.0002,
"step": 8950
},
{
"epoch": 4.411619891678976,
"grad_norm": 0.5223457217216492,
"learning_rate": 2.3554899064500245e-06,
"loss": 0.0055,
"step": 8960
},
{
"epoch": 4.416543574593796,
"grad_norm": 0.001231219619512558,
"learning_rate": 2.3357951747907436e-06,
"loss": 0.0001,
"step": 8970
},
{
"epoch": 4.421467257508617,
"grad_norm": 0.001335963956080377,
"learning_rate": 2.3161004431314626e-06,
"loss": 0.0002,
"step": 8980
},
{
"epoch": 4.4263909404234365,
"grad_norm": 0.0014932631747797132,
"learning_rate": 2.2964057114721813e-06,
"loss": 0.0183,
"step": 8990
},
{
"epoch": 4.431314623338257,
"grad_norm": 0.0012341372203081846,
"learning_rate": 2.2767109798129003e-06,
"loss": 0.0001,
"step": 9000
},
{
"epoch": 4.4362383062530775,
"grad_norm": 0.001224417588673532,
"learning_rate": 2.257016248153619e-06,
"loss": 0.0002,
"step": 9010
},
{
"epoch": 4.441161989167898,
"grad_norm": 0.001415021950379014,
"learning_rate": 2.237321516494338e-06,
"loss": 0.0002,
"step": 9020
},
{
"epoch": 4.4460856720827175,
"grad_norm": 0.0012212812434881926,
"learning_rate": 2.217626784835057e-06,
"loss": 0.0001,
"step": 9030
},
{
"epoch": 4.451009354997538,
"grad_norm": 0.0016725438181310892,
"learning_rate": 2.1979320531757757e-06,
"loss": 0.0001,
"step": 9040
},
{
"epoch": 4.4559330379123585,
"grad_norm": 0.0031123908702284098,
"learning_rate": 2.1782373215164944e-06,
"loss": 0.0002,
"step": 9050
},
{
"epoch": 4.460856720827179,
"grad_norm": 0.00149108178447932,
"learning_rate": 2.1585425898572134e-06,
"loss": 0.0002,
"step": 9060
},
{
"epoch": 4.465780403741999,
"grad_norm": 0.0012318805092945695,
"learning_rate": 2.138847858197932e-06,
"loss": 0.0001,
"step": 9070
},
{
"epoch": 4.470704086656819,
"grad_norm": 0.001223694533109665,
"learning_rate": 2.119153126538651e-06,
"loss": 0.0001,
"step": 9080
},
{
"epoch": 4.4756277695716395,
"grad_norm": 0.0015389460604637861,
"learning_rate": 2.0994583948793697e-06,
"loss": 0.0001,
"step": 9090
},
{
"epoch": 4.48055145248646,
"grad_norm": 0.001753377728164196,
"learning_rate": 2.079763663220089e-06,
"loss": 0.0001,
"step": 9100
},
{
"epoch": 4.4854751354012805,
"grad_norm": 0.00128873810172081,
"learning_rate": 2.060068931560808e-06,
"loss": 0.0001,
"step": 9110
},
{
"epoch": 4.4903988183161,
"grad_norm": 0.0013829106464982033,
"learning_rate": 2.0403741999015265e-06,
"loss": 0.0001,
"step": 9120
},
{
"epoch": 4.495322501230921,
"grad_norm": 0.0014223408652469516,
"learning_rate": 2.0206794682422455e-06,
"loss": 0.0002,
"step": 9130
},
{
"epoch": 4.500246184145741,
"grad_norm": 0.0014313060091808438,
"learning_rate": 2.000984736582964e-06,
"loss": 0.0001,
"step": 9140
},
{
"epoch": 4.5051698670605616,
"grad_norm": 0.004056563135236502,
"learning_rate": 1.981290004923683e-06,
"loss": 0.0002,
"step": 9150
},
{
"epoch": 4.510093549975382,
"grad_norm": 0.0014529697364196181,
"learning_rate": 1.961595273264402e-06,
"loss": 0.0002,
"step": 9160
},
{
"epoch": 4.515017232890202,
"grad_norm": 0.0011942709097638726,
"learning_rate": 1.941900541605121e-06,
"loss": 0.0001,
"step": 9170
},
{
"epoch": 4.519940915805022,
"grad_norm": 0.0016299562994390726,
"learning_rate": 1.9222058099458395e-06,
"loss": 0.0002,
"step": 9180
},
{
"epoch": 4.524864598719843,
"grad_norm": 0.0012343422276899219,
"learning_rate": 1.9025110782865586e-06,
"loss": 0.0001,
"step": 9190
},
{
"epoch": 4.529788281634663,
"grad_norm": 0.001468788948841393,
"learning_rate": 1.8828163466272772e-06,
"loss": 0.0002,
"step": 9200
},
{
"epoch": 4.534711964549483,
"grad_norm": 0.0012346956646069884,
"learning_rate": 1.8631216149679963e-06,
"loss": 0.0002,
"step": 9210
},
{
"epoch": 4.539635647464303,
"grad_norm": 0.005765956360846758,
"learning_rate": 1.8434268833087151e-06,
"loss": 0.0002,
"step": 9220
},
{
"epoch": 4.544559330379124,
"grad_norm": 0.0012338730739429593,
"learning_rate": 1.8237321516494338e-06,
"loss": 0.0001,
"step": 9230
},
{
"epoch": 4.549483013293944,
"grad_norm": 0.0018139538588002324,
"learning_rate": 1.8040374199901528e-06,
"loss": 0.0001,
"step": 9240
},
{
"epoch": 4.554406696208764,
"grad_norm": 0.001192125491797924,
"learning_rate": 1.7843426883308717e-06,
"loss": 0.0002,
"step": 9250
},
{
"epoch": 4.559330379123584,
"grad_norm": 0.0013297711266204715,
"learning_rate": 1.7646479566715905e-06,
"loss": 0.0002,
"step": 9260
},
{
"epoch": 4.564254062038405,
"grad_norm": 0.0015158694004639983,
"learning_rate": 1.7449532250123094e-06,
"loss": 0.0001,
"step": 9270
},
{
"epoch": 4.569177744953225,
"grad_norm": 0.0014190117362886667,
"learning_rate": 1.725258493353028e-06,
"loss": 0.0003,
"step": 9280
},
{
"epoch": 4.574101427868046,
"grad_norm": 0.0013871495611965656,
"learning_rate": 1.705563761693747e-06,
"loss": 0.0001,
"step": 9290
},
{
"epoch": 4.579025110782865,
"grad_norm": 0.0016816201386973262,
"learning_rate": 1.685869030034466e-06,
"loss": 0.0002,
"step": 9300
},
{
"epoch": 4.583948793697686,
"grad_norm": 0.0012660392094403505,
"learning_rate": 1.6661742983751847e-06,
"loss": 0.0001,
"step": 9310
},
{
"epoch": 4.588872476612506,
"grad_norm": 0.0014960381668061018,
"learning_rate": 1.6464795667159036e-06,
"loss": 0.0001,
"step": 9320
},
{
"epoch": 4.593796159527327,
"grad_norm": 0.0016527030384168029,
"learning_rate": 1.6267848350566226e-06,
"loss": 0.0001,
"step": 9330
},
{
"epoch": 4.598719842442147,
"grad_norm": 0.0014885793207213283,
"learning_rate": 1.6070901033973413e-06,
"loss": 0.0001,
"step": 9340
},
{
"epoch": 4.603643525356967,
"grad_norm": 0.0012881554430350661,
"learning_rate": 1.5873953717380603e-06,
"loss": 0.0001,
"step": 9350
},
{
"epoch": 4.608567208271787,
"grad_norm": 0.003308955579996109,
"learning_rate": 1.567700640078779e-06,
"loss": 0.0002,
"step": 9360
},
{
"epoch": 4.613490891186608,
"grad_norm": 0.0030271336436271667,
"learning_rate": 1.5480059084194978e-06,
"loss": 0.0002,
"step": 9370
},
{
"epoch": 4.618414574101428,
"grad_norm": 0.0012079523876309395,
"learning_rate": 1.5283111767602169e-06,
"loss": 0.0001,
"step": 9380
},
{
"epoch": 4.623338257016248,
"grad_norm": 0.0013818118022754788,
"learning_rate": 1.5086164451009355e-06,
"loss": 0.0002,
"step": 9390
},
{
"epoch": 4.628261939931068,
"grad_norm": 0.0013594648335129023,
"learning_rate": 1.4889217134416546e-06,
"loss": 0.0002,
"step": 9400
},
{
"epoch": 4.633185622845889,
"grad_norm": 0.0014723829226568341,
"learning_rate": 1.4692269817823734e-06,
"loss": 0.0002,
"step": 9410
},
{
"epoch": 4.638109305760709,
"grad_norm": 0.001555690192617476,
"learning_rate": 1.449532250123092e-06,
"loss": 0.0001,
"step": 9420
},
{
"epoch": 4.643032988675529,
"grad_norm": 0.001370370271615684,
"learning_rate": 1.429837518463811e-06,
"loss": 0.0001,
"step": 9430
},
{
"epoch": 4.647956671590349,
"grad_norm": 0.0013733146479353309,
"learning_rate": 1.41014278680453e-06,
"loss": 0.0002,
"step": 9440
},
{
"epoch": 4.65288035450517,
"grad_norm": 0.020692721009254456,
"learning_rate": 1.3904480551452488e-06,
"loss": 0.0002,
"step": 9450
},
{
"epoch": 4.65780403741999,
"grad_norm": 0.0013894923031330109,
"learning_rate": 1.3707533234859676e-06,
"loss": 0.0079,
"step": 9460
},
{
"epoch": 4.662727720334811,
"grad_norm": 0.005957514047622681,
"learning_rate": 1.3510585918266863e-06,
"loss": 0.0002,
"step": 9470
},
{
"epoch": 4.66765140324963,
"grad_norm": 0.0014959081308916211,
"learning_rate": 1.3313638601674053e-06,
"loss": 0.0001,
"step": 9480
},
{
"epoch": 4.672575086164451,
"grad_norm": 0.0012322113616392016,
"learning_rate": 1.3116691285081244e-06,
"loss": 0.0001,
"step": 9490
},
{
"epoch": 4.677498769079271,
"grad_norm": 0.00134864985011518,
"learning_rate": 1.291974396848843e-06,
"loss": 0.0001,
"step": 9500
},
{
"epoch": 4.682422451994092,
"grad_norm": 0.0011784137459471822,
"learning_rate": 1.2722796651895619e-06,
"loss": 0.0002,
"step": 9510
},
{
"epoch": 4.687346134908912,
"grad_norm": 0.0011899089440703392,
"learning_rate": 1.252584933530281e-06,
"loss": 0.0002,
"step": 9520
},
{
"epoch": 4.692269817823732,
"grad_norm": 0.0012023659655824304,
"learning_rate": 1.2328902018709995e-06,
"loss": 0.0002,
"step": 9530
},
{
"epoch": 4.697193500738552,
"grad_norm": 0.001208995352499187,
"learning_rate": 1.2131954702117186e-06,
"loss": 0.0011,
"step": 9540
},
{
"epoch": 4.702117183653373,
"grad_norm": 0.0014061863766983151,
"learning_rate": 1.1935007385524372e-06,
"loss": 0.0001,
"step": 9550
},
{
"epoch": 4.707040866568193,
"grad_norm": 0.001575971720740199,
"learning_rate": 1.173806006893156e-06,
"loss": 0.0002,
"step": 9560
},
{
"epoch": 4.711964549483013,
"grad_norm": 0.0012307134456932545,
"learning_rate": 1.154111275233875e-06,
"loss": 0.0002,
"step": 9570
},
{
"epoch": 4.716888232397833,
"grad_norm": 0.0013830272946506739,
"learning_rate": 1.134416543574594e-06,
"loss": 0.0001,
"step": 9580
},
{
"epoch": 4.721811915312654,
"grad_norm": 0.0013915746239945292,
"learning_rate": 1.1147218119153128e-06,
"loss": 0.0002,
"step": 9590
},
{
"epoch": 4.726735598227474,
"grad_norm": 0.001184074324555695,
"learning_rate": 1.0950270802560317e-06,
"loss": 0.0001,
"step": 9600
},
{
"epoch": 4.731659281142294,
"grad_norm": 0.0012042863527312875,
"learning_rate": 1.0753323485967503e-06,
"loss": 0.0001,
"step": 9610
},
{
"epoch": 4.736582964057114,
"grad_norm": 0.0011871194001287222,
"learning_rate": 1.0556376169374694e-06,
"loss": 0.0001,
"step": 9620
},
{
"epoch": 4.741506646971935,
"grad_norm": 0.0011803999077528715,
"learning_rate": 1.0359428852781882e-06,
"loss": 0.0001,
"step": 9630
},
{
"epoch": 4.746430329886755,
"grad_norm": 0.001454996527172625,
"learning_rate": 1.016248153618907e-06,
"loss": 0.0001,
"step": 9640
},
{
"epoch": 4.751354012801576,
"grad_norm": 0.0013643850106745958,
"learning_rate": 9.96553421959626e-07,
"loss": 0.0001,
"step": 9650
},
{
"epoch": 4.7562776957163955,
"grad_norm": 0.004807002376765013,
"learning_rate": 9.768586903003447e-07,
"loss": 0.0002,
"step": 9660
},
{
"epoch": 4.761201378631216,
"grad_norm": 0.010427232831716537,
"learning_rate": 9.571639586410636e-07,
"loss": 0.0002,
"step": 9670
},
{
"epoch": 4.766125061546036,
"grad_norm": 0.0012309970334172249,
"learning_rate": 9.374692269817824e-07,
"loss": 0.0001,
"step": 9680
},
{
"epoch": 4.771048744460857,
"grad_norm": 0.0019813096150755882,
"learning_rate": 9.177744953225014e-07,
"loss": 0.0002,
"step": 9690
},
{
"epoch": 4.775972427375677,
"grad_norm": 0.0014402302913367748,
"learning_rate": 8.980797636632202e-07,
"loss": 0.0002,
"step": 9700
},
{
"epoch": 4.780896110290497,
"grad_norm": 0.0014121612766757607,
"learning_rate": 8.78385032003939e-07,
"loss": 0.0002,
"step": 9710
},
{
"epoch": 4.7858197932053175,
"grad_norm": 0.0011728608515113592,
"learning_rate": 8.586903003446578e-07,
"loss": 0.0001,
"step": 9720
},
{
"epoch": 4.790743476120138,
"grad_norm": 0.001560199074447155,
"learning_rate": 8.389955686853768e-07,
"loss": 0.0002,
"step": 9730
},
{
"epoch": 4.795667159034958,
"grad_norm": 0.001252340734936297,
"learning_rate": 8.193008370260956e-07,
"loss": 0.0001,
"step": 9740
},
{
"epoch": 4.800590841949779,
"grad_norm": 0.0011785068782046437,
"learning_rate": 7.996061053668145e-07,
"loss": 0.0002,
"step": 9750
},
{
"epoch": 4.8055145248645985,
"grad_norm": 0.001220080885104835,
"learning_rate": 7.799113737075333e-07,
"loss": 0.0001,
"step": 9760
},
{
"epoch": 4.810438207779419,
"grad_norm": 0.0013691087951883674,
"learning_rate": 7.602166420482522e-07,
"loss": 0.0001,
"step": 9770
},
{
"epoch": 4.8153618906942395,
"grad_norm": 0.003999890293926001,
"learning_rate": 7.40521910388971e-07,
"loss": 0.0002,
"step": 9780
},
{
"epoch": 4.82028557360906,
"grad_norm": 0.0012147324159741402,
"learning_rate": 7.208271787296898e-07,
"loss": 0.0002,
"step": 9790
},
{
"epoch": 4.8252092565238796,
"grad_norm": 0.0013871859991922975,
"learning_rate": 7.011324470704087e-07,
"loss": 0.0001,
"step": 9800
},
{
"epoch": 4.8301329394387,
"grad_norm": 0.0011777483159676194,
"learning_rate": 6.814377154111276e-07,
"loss": 0.0001,
"step": 9810
},
{
"epoch": 4.8350566223535205,
"grad_norm": 0.0013384807389229536,
"learning_rate": 6.617429837518465e-07,
"loss": 0.0001,
"step": 9820
},
{
"epoch": 4.839980305268341,
"grad_norm": 0.00117026106454432,
"learning_rate": 6.420482520925653e-07,
"loss": 0.0001,
"step": 9830
},
{
"epoch": 4.844903988183161,
"grad_norm": 0.0012068642536178231,
"learning_rate": 6.223535204332842e-07,
"loss": 0.0001,
"step": 9840
},
{
"epoch": 4.849827671097981,
"grad_norm": 0.001245830673724413,
"learning_rate": 6.02658788774003e-07,
"loss": 0.0002,
"step": 9850
},
{
"epoch": 4.854751354012802,
"grad_norm": 0.0014253932749852538,
"learning_rate": 5.829640571147219e-07,
"loss": 0.0001,
"step": 9860
},
{
"epoch": 4.859675036927622,
"grad_norm": 0.001373025239445269,
"learning_rate": 5.632693254554407e-07,
"loss": 0.0001,
"step": 9870
},
{
"epoch": 4.8645987198424425,
"grad_norm": 0.0014195754192769527,
"learning_rate": 5.435745937961595e-07,
"loss": 0.0002,
"step": 9880
},
{
"epoch": 4.869522402757262,
"grad_norm": 0.0012903210008516908,
"learning_rate": 5.238798621368784e-07,
"loss": 0.0002,
"step": 9890
},
{
"epoch": 4.874446085672083,
"grad_norm": 0.001158708124421537,
"learning_rate": 5.041851304775973e-07,
"loss": 0.0001,
"step": 9900
},
{
"epoch": 4.879369768586903,
"grad_norm": 0.0012715465854853392,
"learning_rate": 4.844903988183161e-07,
"loss": 0.0001,
"step": 9910
},
{
"epoch": 4.884293451501724,
"grad_norm": 0.0012133314739912748,
"learning_rate": 4.64795667159035e-07,
"loss": 0.0001,
"step": 9920
},
{
"epoch": 4.889217134416544,
"grad_norm": 0.016124047338962555,
"learning_rate": 4.451009354997538e-07,
"loss": 0.0002,
"step": 9930
},
{
"epoch": 4.894140817331364,
"grad_norm": 0.0013699685223400593,
"learning_rate": 4.254062038404727e-07,
"loss": 0.0001,
"step": 9940
},
{
"epoch": 4.899064500246184,
"grad_norm": 0.0013574556214734912,
"learning_rate": 4.057114721811916e-07,
"loss": 0.0001,
"step": 9950
},
{
"epoch": 4.903988183161005,
"grad_norm": 0.0013499916531145573,
"learning_rate": 3.860167405219104e-07,
"loss": 0.0002,
"step": 9960
},
{
"epoch": 4.908911866075825,
"grad_norm": 0.001382304704748094,
"learning_rate": 3.663220088626293e-07,
"loss": 0.0001,
"step": 9970
},
{
"epoch": 4.913835548990645,
"grad_norm": 0.0013027896638959646,
"learning_rate": 3.4662727720334815e-07,
"loss": 0.0001,
"step": 9980
},
{
"epoch": 4.918759231905465,
"grad_norm": 0.0011790018761530519,
"learning_rate": 3.26932545544067e-07,
"loss": 0.0002,
"step": 9990
},
{
"epoch": 4.923682914820286,
"grad_norm": 0.0012470015790313482,
"learning_rate": 3.0723781388478584e-07,
"loss": 0.0037,
"step": 10000
},
{
"epoch": 4.928606597735106,
"grad_norm": 0.0012150456896051764,
"learning_rate": 2.875430822255047e-07,
"loss": 0.0002,
"step": 10010
},
{
"epoch": 4.933530280649926,
"grad_norm": 0.0012021757429465652,
"learning_rate": 2.6784835056622353e-07,
"loss": 0.0001,
"step": 10020
},
{
"epoch": 4.938453963564746,
"grad_norm": 0.0011834530159831047,
"learning_rate": 2.4815361890694243e-07,
"loss": 0.0001,
"step": 10030
},
{
"epoch": 4.943377646479567,
"grad_norm": 0.0013943571830168366,
"learning_rate": 2.2845888724766125e-07,
"loss": 0.0002,
"step": 10040
},
{
"epoch": 4.948301329394387,
"grad_norm": 0.001787975779734552,
"learning_rate": 2.0876415558838012e-07,
"loss": 0.0217,
"step": 10050
},
{
"epoch": 4.953225012309208,
"grad_norm": 0.0014198760036379099,
"learning_rate": 1.8906942392909896e-07,
"loss": 0.0001,
"step": 10060
},
{
"epoch": 4.958148695224027,
"grad_norm": 0.0013527346309274435,
"learning_rate": 1.6937469226981783e-07,
"loss": 0.0001,
"step": 10070
},
{
"epoch": 4.963072378138848,
"grad_norm": 0.0020522738341242075,
"learning_rate": 1.496799606105367e-07,
"loss": 0.0002,
"step": 10080
},
{
"epoch": 4.967996061053668,
"grad_norm": 0.0013713801745325327,
"learning_rate": 1.2998522895125555e-07,
"loss": 0.0001,
"step": 10090
},
{
"epoch": 4.972919743968489,
"grad_norm": 0.001553553156554699,
"learning_rate": 1.1029049729197441e-07,
"loss": 0.0001,
"step": 10100
},
{
"epoch": 4.977843426883309,
"grad_norm": 0.0011614145478233695,
"learning_rate": 9.059576563269325e-08,
"loss": 0.0001,
"step": 10110
},
{
"epoch": 4.982767109798129,
"grad_norm": 0.001341148978099227,
"learning_rate": 7.090103397341212e-08,
"loss": 0.0001,
"step": 10120
},
{
"epoch": 4.987690792712949,
"grad_norm": 0.0014009519945830107,
"learning_rate": 5.1206302314130975e-08,
"loss": 0.0001,
"step": 10130
},
{
"epoch": 4.99261447562777,
"grad_norm": 0.0011851818999275565,
"learning_rate": 3.1511570654849834e-08,
"loss": 0.0001,
"step": 10140
},
{
"epoch": 4.99753815854259,
"grad_norm": 0.001298619550652802,
"learning_rate": 1.1816838995568685e-08,
"loss": 0.0001,
"step": 10150
},
{
"epoch": 5.0,
"eval_accuracy": 0.9989536100453436,
"eval_loss": 0.003578549949452281,
"eval_runtime": 127.0837,
"eval_samples_per_second": 22.56,
"eval_steps_per_second": 2.825,
"step": 10155
},
{
"epoch": 5.0,
"step": 10155,
"total_flos": 6.293899396497162e+18,
"train_loss": 0.00641422240726846,
"train_runtime": 5338.9651,
"train_samples_per_second": 15.213,
"train_steps_per_second": 1.902
}
],
"logging_steps": 10,
"max_steps": 10155,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.293899396497162e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}