GUI-Drag-7B / trainer_state.json
lzy337's picture
Upload folder using huggingface_hub
afab03c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 10246,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019521717911176184,
"grad_norm": 3.0788934230804443,
"learning_rate": 8.78048780487805e-08,
"loss": 0.4941,
"step": 10
},
{
"epoch": 0.003904343582235237,
"grad_norm": 2.4788622856140137,
"learning_rate": 1.8536585365853658e-07,
"loss": 0.5202,
"step": 20
},
{
"epoch": 0.005856515373352855,
"grad_norm": 2.7830698490142822,
"learning_rate": 2.829268292682927e-07,
"loss": 0.4917,
"step": 30
},
{
"epoch": 0.007808687164470474,
"grad_norm": 2.2472496032714844,
"learning_rate": 3.804878048780488e-07,
"loss": 0.4798,
"step": 40
},
{
"epoch": 0.009760858955588092,
"grad_norm": 2.4603986740112305,
"learning_rate": 4.780487804878049e-07,
"loss": 0.507,
"step": 50
},
{
"epoch": 0.01171303074670571,
"grad_norm": 2.1819260120391846,
"learning_rate": 5.75609756097561e-07,
"loss": 0.4741,
"step": 60
},
{
"epoch": 0.013665202537823329,
"grad_norm": 2.283219814300537,
"learning_rate": 6.731707317073172e-07,
"loss": 0.491,
"step": 70
},
{
"epoch": 0.015617374328940947,
"grad_norm": 2.379164695739746,
"learning_rate": 7.707317073170732e-07,
"loss": 0.4963,
"step": 80
},
{
"epoch": 0.017569546120058566,
"grad_norm": 1.8835978507995605,
"learning_rate": 8.682926829268293e-07,
"loss": 0.4649,
"step": 90
},
{
"epoch": 0.019521717911176184,
"grad_norm": 2.128862142562866,
"learning_rate": 9.658536585365854e-07,
"loss": 0.4627,
"step": 100
},
{
"epoch": 0.021473889702293802,
"grad_norm": 2.55245304107666,
"learning_rate": 1.0634146341463415e-06,
"loss": 0.4943,
"step": 110
},
{
"epoch": 0.02342606149341142,
"grad_norm": 2.192894697189331,
"learning_rate": 1.1609756097560977e-06,
"loss": 0.477,
"step": 120
},
{
"epoch": 0.02537823328452904,
"grad_norm": 2.0305230617523193,
"learning_rate": 1.2585365853658538e-06,
"loss": 0.4636,
"step": 130
},
{
"epoch": 0.027330405075646658,
"grad_norm": 2.010695695877075,
"learning_rate": 1.3560975609756099e-06,
"loss": 0.4873,
"step": 140
},
{
"epoch": 0.029282576866764276,
"grad_norm": 1.8330553770065308,
"learning_rate": 1.453658536585366e-06,
"loss": 0.4764,
"step": 150
},
{
"epoch": 0.031234748657881894,
"grad_norm": 1.8834083080291748,
"learning_rate": 1.551219512195122e-06,
"loss": 0.4945,
"step": 160
},
{
"epoch": 0.03318692044899951,
"grad_norm": 1.932608962059021,
"learning_rate": 1.6487804878048783e-06,
"loss": 0.4486,
"step": 170
},
{
"epoch": 0.03513909224011713,
"grad_norm": 2.3214528560638428,
"learning_rate": 1.7463414634146341e-06,
"loss": 0.4464,
"step": 180
},
{
"epoch": 0.03709126403123475,
"grad_norm": 3.1889891624450684,
"learning_rate": 1.8439024390243904e-06,
"loss": 0.4858,
"step": 190
},
{
"epoch": 0.03904343582235237,
"grad_norm": 2.5776352882385254,
"learning_rate": 1.9414634146341465e-06,
"loss": 0.4713,
"step": 200
},
{
"epoch": 0.040995607613469986,
"grad_norm": 2.4418435096740723,
"learning_rate": 2.0390243902439023e-06,
"loss": 0.4838,
"step": 210
},
{
"epoch": 0.042947779404587605,
"grad_norm": 2.241332530975342,
"learning_rate": 2.1365853658536586e-06,
"loss": 0.429,
"step": 220
},
{
"epoch": 0.04489995119570522,
"grad_norm": 1.7465415000915527,
"learning_rate": 2.234146341463415e-06,
"loss": 0.4743,
"step": 230
},
{
"epoch": 0.04685212298682284,
"grad_norm": 2.3134076595306396,
"learning_rate": 2.331707317073171e-06,
"loss": 0.4751,
"step": 240
},
{
"epoch": 0.04880429477794046,
"grad_norm": 2.256594657897949,
"learning_rate": 2.429268292682927e-06,
"loss": 0.4773,
"step": 250
},
{
"epoch": 0.05075646656905808,
"grad_norm": 2.873642921447754,
"learning_rate": 2.5268292682926833e-06,
"loss": 0.4484,
"step": 260
},
{
"epoch": 0.0527086383601757,
"grad_norm": 2.403874635696411,
"learning_rate": 2.624390243902439e-06,
"loss": 0.4338,
"step": 270
},
{
"epoch": 0.054660810151293315,
"grad_norm": 2.0818169116973877,
"learning_rate": 2.7219512195121954e-06,
"loss": 0.4569,
"step": 280
},
{
"epoch": 0.056612981942410934,
"grad_norm": 1.8225184679031372,
"learning_rate": 2.8195121951219513e-06,
"loss": 0.4647,
"step": 290
},
{
"epoch": 0.05856515373352855,
"grad_norm": 1.9539415836334229,
"learning_rate": 2.9170731707317076e-06,
"loss": 0.4713,
"step": 300
},
{
"epoch": 0.06051732552464617,
"grad_norm": 2.6394882202148438,
"learning_rate": 3.0146341463414634e-06,
"loss": 0.4314,
"step": 310
},
{
"epoch": 0.06246949731576379,
"grad_norm": 2.3676302433013916,
"learning_rate": 3.1121951219512197e-06,
"loss": 0.4849,
"step": 320
},
{
"epoch": 0.06442166910688141,
"grad_norm": 2.2868118286132812,
"learning_rate": 3.209756097560976e-06,
"loss": 0.4461,
"step": 330
},
{
"epoch": 0.06637384089799903,
"grad_norm": 2.1315979957580566,
"learning_rate": 3.3073170731707323e-06,
"loss": 0.4411,
"step": 340
},
{
"epoch": 0.06832601268911664,
"grad_norm": 2.1080586910247803,
"learning_rate": 3.404878048780488e-06,
"loss": 0.4752,
"step": 350
},
{
"epoch": 0.07027818448023426,
"grad_norm": 2.3179149627685547,
"learning_rate": 3.502439024390244e-06,
"loss": 0.4904,
"step": 360
},
{
"epoch": 0.07223035627135188,
"grad_norm": 2.042257308959961,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.4776,
"step": 370
},
{
"epoch": 0.0741825280624695,
"grad_norm": 2.1745452880859375,
"learning_rate": 3.6975609756097565e-06,
"loss": 0.4796,
"step": 380
},
{
"epoch": 0.07613469985358712,
"grad_norm": 2.0925772190093994,
"learning_rate": 3.7951219512195124e-06,
"loss": 0.4414,
"step": 390
},
{
"epoch": 0.07808687164470474,
"grad_norm": 1.9548064470291138,
"learning_rate": 3.892682926829269e-06,
"loss": 0.4618,
"step": 400
},
{
"epoch": 0.08003904343582235,
"grad_norm": 2.2613227367401123,
"learning_rate": 3.9902439024390245e-06,
"loss": 0.4561,
"step": 410
},
{
"epoch": 0.08199121522693997,
"grad_norm": 2.3353497982025146,
"learning_rate": 4.087804878048781e-06,
"loss": 0.4661,
"step": 420
},
{
"epoch": 0.08394338701805759,
"grad_norm": 2.0971009731292725,
"learning_rate": 4.185365853658537e-06,
"loss": 0.4257,
"step": 430
},
{
"epoch": 0.08589555880917521,
"grad_norm": 2.191269636154175,
"learning_rate": 4.282926829268293e-06,
"loss": 0.4584,
"step": 440
},
{
"epoch": 0.08784773060029283,
"grad_norm": 1.9917972087860107,
"learning_rate": 4.380487804878049e-06,
"loss": 0.4402,
"step": 450
},
{
"epoch": 0.08979990239141045,
"grad_norm": 2.4279751777648926,
"learning_rate": 4.478048780487805e-06,
"loss": 0.4521,
"step": 460
},
{
"epoch": 0.09175207418252807,
"grad_norm": 2.051614284515381,
"learning_rate": 4.575609756097561e-06,
"loss": 0.462,
"step": 470
},
{
"epoch": 0.09370424597364568,
"grad_norm": 1.9167184829711914,
"learning_rate": 4.673170731707318e-06,
"loss": 0.4484,
"step": 480
},
{
"epoch": 0.0956564177647633,
"grad_norm": 3.8172707557678223,
"learning_rate": 4.770731707317073e-06,
"loss": 0.4307,
"step": 490
},
{
"epoch": 0.09760858955588092,
"grad_norm": 1.8547595739364624,
"learning_rate": 4.868292682926829e-06,
"loss": 0.4537,
"step": 500
},
{
"epoch": 0.09956076134699854,
"grad_norm": 2.314936637878418,
"learning_rate": 4.965853658536586e-06,
"loss": 0.4425,
"step": 510
},
{
"epoch": 0.10151293313811616,
"grad_norm": 1.9514055252075195,
"learning_rate": 5.063414634146342e-06,
"loss": 0.4453,
"step": 520
},
{
"epoch": 0.10346510492923378,
"grad_norm": 1.7186030149459839,
"learning_rate": 5.160975609756098e-06,
"loss": 0.3934,
"step": 530
},
{
"epoch": 0.1054172767203514,
"grad_norm": 1.8911871910095215,
"learning_rate": 5.2585365853658545e-06,
"loss": 0.4454,
"step": 540
},
{
"epoch": 0.10736944851146901,
"grad_norm": 2.092548370361328,
"learning_rate": 5.356097560975611e-06,
"loss": 0.4937,
"step": 550
},
{
"epoch": 0.10932162030258663,
"grad_norm": 1.619399905204773,
"learning_rate": 5.453658536585367e-06,
"loss": 0.445,
"step": 560
},
{
"epoch": 0.11127379209370425,
"grad_norm": 2.086709976196289,
"learning_rate": 5.551219512195123e-06,
"loss": 0.4528,
"step": 570
},
{
"epoch": 0.11322596388482187,
"grad_norm": 2.0194432735443115,
"learning_rate": 5.648780487804878e-06,
"loss": 0.4503,
"step": 580
},
{
"epoch": 0.11517813567593949,
"grad_norm": 2.06130051612854,
"learning_rate": 5.746341463414634e-06,
"loss": 0.464,
"step": 590
},
{
"epoch": 0.1171303074670571,
"grad_norm": 1.8607715368270874,
"learning_rate": 5.8439024390243904e-06,
"loss": 0.4545,
"step": 600
},
{
"epoch": 0.11908247925817472,
"grad_norm": 1.8002784252166748,
"learning_rate": 5.941463414634147e-06,
"loss": 0.4211,
"step": 610
},
{
"epoch": 0.12103465104929234,
"grad_norm": 2.372277021408081,
"learning_rate": 6.039024390243903e-06,
"loss": 0.4757,
"step": 620
},
{
"epoch": 0.12298682284040996,
"grad_norm": 1.8813685178756714,
"learning_rate": 6.136585365853659e-06,
"loss": 0.4533,
"step": 630
},
{
"epoch": 0.12493899463152758,
"grad_norm": 1.8307466506958008,
"learning_rate": 6.234146341463415e-06,
"loss": 0.4727,
"step": 640
},
{
"epoch": 0.1268911664226452,
"grad_norm": 1.9979982376098633,
"learning_rate": 6.331707317073171e-06,
"loss": 0.4198,
"step": 650
},
{
"epoch": 0.12884333821376281,
"grad_norm": 2.121293544769287,
"learning_rate": 6.429268292682927e-06,
"loss": 0.4552,
"step": 660
},
{
"epoch": 0.13079551000488043,
"grad_norm": 1.8079158067703247,
"learning_rate": 6.5268292682926836e-06,
"loss": 0.4425,
"step": 670
},
{
"epoch": 0.13274768179599805,
"grad_norm": 1.8363804817199707,
"learning_rate": 6.62439024390244e-06,
"loss": 0.4299,
"step": 680
},
{
"epoch": 0.13469985358711567,
"grad_norm": 1.9010361433029175,
"learning_rate": 6.721951219512196e-06,
"loss": 0.4432,
"step": 690
},
{
"epoch": 0.1366520253782333,
"grad_norm": 2.0648021697998047,
"learning_rate": 6.819512195121952e-06,
"loss": 0.4498,
"step": 700
},
{
"epoch": 0.1386041971693509,
"grad_norm": 1.9787408113479614,
"learning_rate": 6.917073170731707e-06,
"loss": 0.4378,
"step": 710
},
{
"epoch": 0.14055636896046853,
"grad_norm": 1.7960318326950073,
"learning_rate": 7.014634146341463e-06,
"loss": 0.4273,
"step": 720
},
{
"epoch": 0.14250854075158614,
"grad_norm": 1.6109867095947266,
"learning_rate": 7.1121951219512195e-06,
"loss": 0.4351,
"step": 730
},
{
"epoch": 0.14446071254270376,
"grad_norm": 1.8068534135818481,
"learning_rate": 7.209756097560976e-06,
"loss": 0.4522,
"step": 740
},
{
"epoch": 0.14641288433382138,
"grad_norm": 2.0688509941101074,
"learning_rate": 7.307317073170732e-06,
"loss": 0.4367,
"step": 750
},
{
"epoch": 0.148365056124939,
"grad_norm": 1.9218088388442993,
"learning_rate": 7.404878048780488e-06,
"loss": 0.4406,
"step": 760
},
{
"epoch": 0.15031722791605662,
"grad_norm": 2.0230157375335693,
"learning_rate": 7.502439024390245e-06,
"loss": 0.4339,
"step": 770
},
{
"epoch": 0.15226939970717424,
"grad_norm": 1.8556506633758545,
"learning_rate": 7.600000000000001e-06,
"loss": 0.4219,
"step": 780
},
{
"epoch": 0.15422157149829185,
"grad_norm": 1.9535895586013794,
"learning_rate": 7.697560975609756e-06,
"loss": 0.4513,
"step": 790
},
{
"epoch": 0.15617374328940947,
"grad_norm": 1.6965866088867188,
"learning_rate": 7.795121951219513e-06,
"loss": 0.4355,
"step": 800
},
{
"epoch": 0.1581259150805271,
"grad_norm": 2.014047145843506,
"learning_rate": 7.892682926829269e-06,
"loss": 0.4535,
"step": 810
},
{
"epoch": 0.1600780868716447,
"grad_norm": 1.7660433053970337,
"learning_rate": 7.990243902439025e-06,
"loss": 0.4439,
"step": 820
},
{
"epoch": 0.16203025866276233,
"grad_norm": 1.8006068468093872,
"learning_rate": 8.087804878048781e-06,
"loss": 0.4337,
"step": 830
},
{
"epoch": 0.16398243045387995,
"grad_norm": 2.9495394229888916,
"learning_rate": 8.185365853658536e-06,
"loss": 0.4596,
"step": 840
},
{
"epoch": 0.16593460224499756,
"grad_norm": 2.061260938644409,
"learning_rate": 8.282926829268292e-06,
"loss": 0.426,
"step": 850
},
{
"epoch": 0.16788677403611518,
"grad_norm": 1.7372653484344482,
"learning_rate": 8.380487804878049e-06,
"loss": 0.4445,
"step": 860
},
{
"epoch": 0.1698389458272328,
"grad_norm": 1.9551905393600464,
"learning_rate": 8.478048780487805e-06,
"loss": 0.4328,
"step": 870
},
{
"epoch": 0.17179111761835042,
"grad_norm": 1.709471583366394,
"learning_rate": 8.575609756097561e-06,
"loss": 0.4775,
"step": 880
},
{
"epoch": 0.17374328940946804,
"grad_norm": 1.6745487451553345,
"learning_rate": 8.673170731707317e-06,
"loss": 0.4428,
"step": 890
},
{
"epoch": 0.17569546120058566,
"grad_norm": 1.675772786140442,
"learning_rate": 8.770731707317074e-06,
"loss": 0.4608,
"step": 900
},
{
"epoch": 0.17764763299170327,
"grad_norm": 1.7424198389053345,
"learning_rate": 8.86829268292683e-06,
"loss": 0.4427,
"step": 910
},
{
"epoch": 0.1795998047828209,
"grad_norm": 1.6522661447525024,
"learning_rate": 8.965853658536586e-06,
"loss": 0.4242,
"step": 920
},
{
"epoch": 0.1815519765739385,
"grad_norm": 1.795058012008667,
"learning_rate": 9.063414634146343e-06,
"loss": 0.4454,
"step": 930
},
{
"epoch": 0.18350414836505613,
"grad_norm": 1.9827325344085693,
"learning_rate": 9.160975609756099e-06,
"loss": 0.445,
"step": 940
},
{
"epoch": 0.18545632015617375,
"grad_norm": 1.6691913604736328,
"learning_rate": 9.258536585365855e-06,
"loss": 0.4269,
"step": 950
},
{
"epoch": 0.18740849194729137,
"grad_norm": 2.306386947631836,
"learning_rate": 9.356097560975611e-06,
"loss": 0.4299,
"step": 960
},
{
"epoch": 0.18936066373840899,
"grad_norm": 1.7081074714660645,
"learning_rate": 9.453658536585366e-06,
"loss": 0.3928,
"step": 970
},
{
"epoch": 0.1913128355295266,
"grad_norm": 1.8791937828063965,
"learning_rate": 9.551219512195122e-06,
"loss": 0.4513,
"step": 980
},
{
"epoch": 0.19326500732064422,
"grad_norm": 1.6924887895584106,
"learning_rate": 9.648780487804879e-06,
"loss": 0.4497,
"step": 990
},
{
"epoch": 0.19521717911176184,
"grad_norm": 1.8185955286026,
"learning_rate": 9.746341463414635e-06,
"loss": 0.4211,
"step": 1000
},
{
"epoch": 0.19716935090287946,
"grad_norm": 2.200645923614502,
"learning_rate": 9.843902439024391e-06,
"loss": 0.429,
"step": 1010
},
{
"epoch": 0.19912152269399708,
"grad_norm": 1.568526268005371,
"learning_rate": 9.941463414634147e-06,
"loss": 0.432,
"step": 1020
},
{
"epoch": 0.2010736944851147,
"grad_norm": 2.0274457931518555,
"learning_rate": 9.999995356947167e-06,
"loss": 0.4331,
"step": 1030
},
{
"epoch": 0.20302586627623231,
"grad_norm": 2.0021557807922363,
"learning_rate": 9.999943122701835e-06,
"loss": 0.4312,
"step": 1040
},
{
"epoch": 0.20497803806734993,
"grad_norm": 1.8168703317642212,
"learning_rate": 9.999832851003463e-06,
"loss": 0.4233,
"step": 1050
},
{
"epoch": 0.20693020985846755,
"grad_norm": 1.9876569509506226,
"learning_rate": 9.999664543132046e-06,
"loss": 0.4387,
"step": 1060
},
{
"epoch": 0.20888238164958517,
"grad_norm": 2.8725063800811768,
"learning_rate": 9.999438201041236e-06,
"loss": 0.4465,
"step": 1070
},
{
"epoch": 0.2108345534407028,
"grad_norm": 1.7831240892410278,
"learning_rate": 9.999153827358329e-06,
"loss": 0.4516,
"step": 1080
},
{
"epoch": 0.2127867252318204,
"grad_norm": 1.7374250888824463,
"learning_rate": 9.998811425384225e-06,
"loss": 0.4228,
"step": 1090
},
{
"epoch": 0.21473889702293802,
"grad_norm": 1.8771198987960815,
"learning_rate": 9.998410999093401e-06,
"loss": 0.4281,
"step": 1100
},
{
"epoch": 0.21669106881405564,
"grad_norm": 1.6397325992584229,
"learning_rate": 9.99795255313385e-06,
"loss": 0.4169,
"step": 1110
},
{
"epoch": 0.21864324060517326,
"grad_norm": 1.6200042963027954,
"learning_rate": 9.997436092827042e-06,
"loss": 0.4306,
"step": 1120
},
{
"epoch": 0.22059541239629088,
"grad_norm": 1.7567847967147827,
"learning_rate": 9.996861624167853e-06,
"loss": 0.44,
"step": 1130
},
{
"epoch": 0.2225475841874085,
"grad_norm": 2.0347070693969727,
"learning_rate": 9.996229153824497e-06,
"loss": 0.4122,
"step": 1140
},
{
"epoch": 0.22449975597852612,
"grad_norm": 1.7771193981170654,
"learning_rate": 9.995538689138454e-06,
"loss": 0.4577,
"step": 1150
},
{
"epoch": 0.22645192776964373,
"grad_norm": 1.78081214427948,
"learning_rate": 9.994790238124374e-06,
"loss": 0.4366,
"step": 1160
},
{
"epoch": 0.22840409956076135,
"grad_norm": 1.5818620920181274,
"learning_rate": 9.993983809469995e-06,
"loss": 0.4062,
"step": 1170
},
{
"epoch": 0.23035627135187897,
"grad_norm": 1.515559196472168,
"learning_rate": 9.99311941253604e-06,
"loss": 0.4164,
"step": 1180
},
{
"epoch": 0.2323084431429966,
"grad_norm": 1.9654150009155273,
"learning_rate": 9.992197057356098e-06,
"loss": 0.407,
"step": 1190
},
{
"epoch": 0.2342606149341142,
"grad_norm": 2.0451202392578125,
"learning_rate": 9.991216754636522e-06,
"loss": 0.4049,
"step": 1200
},
{
"epoch": 0.23621278672523183,
"grad_norm": 1.8495047092437744,
"learning_rate": 9.990178515756294e-06,
"loss": 0.4389,
"step": 1210
},
{
"epoch": 0.23816495851634945,
"grad_norm": 2.012528657913208,
"learning_rate": 9.989082352766903e-06,
"loss": 0.4244,
"step": 1220
},
{
"epoch": 0.24011713030746706,
"grad_norm": 1.5398080348968506,
"learning_rate": 9.987928278392192e-06,
"loss": 0.4449,
"step": 1230
},
{
"epoch": 0.24206930209858468,
"grad_norm": 2.037444591522217,
"learning_rate": 9.986716306028222e-06,
"loss": 0.4155,
"step": 1240
},
{
"epoch": 0.2440214738897023,
"grad_norm": 1.666106104850769,
"learning_rate": 9.985446449743111e-06,
"loss": 0.4451,
"step": 1250
},
{
"epoch": 0.24597364568081992,
"grad_norm": 1.8997377157211304,
"learning_rate": 9.984118724276871e-06,
"loss": 0.4218,
"step": 1260
},
{
"epoch": 0.24792581747193754,
"grad_norm": 1.8274009227752686,
"learning_rate": 9.982733145041238e-06,
"loss": 0.4341,
"step": 1270
},
{
"epoch": 0.24987798926305516,
"grad_norm": 1.8715990781784058,
"learning_rate": 9.981289728119495e-06,
"loss": 0.4208,
"step": 1280
},
{
"epoch": 0.2518301610541728,
"grad_norm": 1.65500807762146,
"learning_rate": 9.97978849026628e-06,
"loss": 0.473,
"step": 1290
},
{
"epoch": 0.2537823328452904,
"grad_norm": 1.9954007863998413,
"learning_rate": 9.978229448907392e-06,
"loss": 0.412,
"step": 1300
},
{
"epoch": 0.255734504636408,
"grad_norm": 1.6430351734161377,
"learning_rate": 9.9766126221396e-06,
"loss": 0.4078,
"step": 1310
},
{
"epoch": 0.25768667642752563,
"grad_norm": 1.9523135423660278,
"learning_rate": 9.974938028730415e-06,
"loss": 0.4072,
"step": 1320
},
{
"epoch": 0.25963884821864325,
"grad_norm": 1.6687992811203003,
"learning_rate": 9.97320568811789e-06,
"loss": 0.4449,
"step": 1330
},
{
"epoch": 0.26159102000976087,
"grad_norm": 1.4892559051513672,
"learning_rate": 9.971415620410373e-06,
"loss": 0.4314,
"step": 1340
},
{
"epoch": 0.2635431918008785,
"grad_norm": 1.774784803390503,
"learning_rate": 9.969567846386305e-06,
"loss": 0.412,
"step": 1350
},
{
"epoch": 0.2654953635919961,
"grad_norm": 1.6988633871078491,
"learning_rate": 9.967662387493942e-06,
"loss": 0.4225,
"step": 1360
},
{
"epoch": 0.2674475353831137,
"grad_norm": 1.8404074907302856,
"learning_rate": 9.965699265851134e-06,
"loss": 0.3972,
"step": 1370
},
{
"epoch": 0.26939970717423134,
"grad_norm": 1.7147365808486938,
"learning_rate": 9.963678504245058e-06,
"loss": 0.4168,
"step": 1380
},
{
"epoch": 0.27135187896534896,
"grad_norm": 1.7342804670333862,
"learning_rate": 9.961600126131949e-06,
"loss": 0.4308,
"step": 1390
},
{
"epoch": 0.2733040507564666,
"grad_norm": 1.7366749048233032,
"learning_rate": 9.959464155636837e-06,
"loss": 0.4362,
"step": 1400
},
{
"epoch": 0.2752562225475842,
"grad_norm": 1.560364007949829,
"learning_rate": 9.957270617553263e-06,
"loss": 0.4295,
"step": 1410
},
{
"epoch": 0.2772083943387018,
"grad_norm": 1.6392924785614014,
"learning_rate": 9.955019537342988e-06,
"loss": 0.4272,
"step": 1420
},
{
"epoch": 0.27916056612981943,
"grad_norm": 1.6431063413619995,
"learning_rate": 9.952710941135702e-06,
"loss": 0.4369,
"step": 1430
},
{
"epoch": 0.28111273792093705,
"grad_norm": 1.7722136974334717,
"learning_rate": 9.950344855728717e-06,
"loss": 0.4352,
"step": 1440
},
{
"epoch": 0.28306490971205467,
"grad_norm": 1.8588448762893677,
"learning_rate": 9.947921308586663e-06,
"loss": 0.4126,
"step": 1450
},
{
"epoch": 0.2850170815031723,
"grad_norm": 1.8954261541366577,
"learning_rate": 9.945440327841159e-06,
"loss": 0.406,
"step": 1460
},
{
"epoch": 0.2869692532942899,
"grad_norm": 2.013242244720459,
"learning_rate": 9.942901942290493e-06,
"loss": 0.4159,
"step": 1470
},
{
"epoch": 0.2889214250854075,
"grad_norm": 2.03349232673645,
"learning_rate": 9.940306181399284e-06,
"loss": 0.4206,
"step": 1480
},
{
"epoch": 0.29087359687652514,
"grad_norm": 2.5981340408325195,
"learning_rate": 9.93765307529815e-06,
"loss": 0.4358,
"step": 1490
},
{
"epoch": 0.29282576866764276,
"grad_norm": 1.626177191734314,
"learning_rate": 9.934942654783343e-06,
"loss": 0.3846,
"step": 1500
},
{
"epoch": 0.2947779404587604,
"grad_norm": 1.7740366458892822,
"learning_rate": 9.932174951316401e-06,
"loss": 0.382,
"step": 1510
},
{
"epoch": 0.296730112249878,
"grad_norm": 1.7357271909713745,
"learning_rate": 9.929349997023782e-06,
"loss": 0.3819,
"step": 1520
},
{
"epoch": 0.2986822840409956,
"grad_norm": 1.6888749599456787,
"learning_rate": 9.92646782469649e-06,
"loss": 0.4121,
"step": 1530
},
{
"epoch": 0.30063445583211323,
"grad_norm": 1.6334894895553589,
"learning_rate": 9.923528467789694e-06,
"loss": 0.4173,
"step": 1540
},
{
"epoch": 0.30258662762323085,
"grad_norm": 1.8393194675445557,
"learning_rate": 9.920531960422337e-06,
"loss": 0.3917,
"step": 1550
},
{
"epoch": 0.30453879941434847,
"grad_norm": 1.558082103729248,
"learning_rate": 9.91747833737675e-06,
"loss": 0.4068,
"step": 1560
},
{
"epoch": 0.3064909712054661,
"grad_norm": 1.4404140710830688,
"learning_rate": 9.914367634098233e-06,
"loss": 0.4181,
"step": 1570
},
{
"epoch": 0.3084431429965837,
"grad_norm": 1.7500289678573608,
"learning_rate": 9.911199886694658e-06,
"loss": 0.4064,
"step": 1580
},
{
"epoch": 0.3103953147877013,
"grad_norm": 1.5437930822372437,
"learning_rate": 9.907975131936043e-06,
"loss": 0.3951,
"step": 1590
},
{
"epoch": 0.31234748657881894,
"grad_norm": 2.1010377407073975,
"learning_rate": 9.904693407254121e-06,
"loss": 0.394,
"step": 1600
},
{
"epoch": 0.31429965836993656,
"grad_norm": 1.9228754043579102,
"learning_rate": 9.901354750741915e-06,
"loss": 0.392,
"step": 1610
},
{
"epoch": 0.3162518301610542,
"grad_norm": 1.8459354639053345,
"learning_rate": 9.897959201153291e-06,
"loss": 0.3964,
"step": 1620
},
{
"epoch": 0.3182040019521718,
"grad_norm": 2.0633997917175293,
"learning_rate": 9.894506797902508e-06,
"loss": 0.3844,
"step": 1630
},
{
"epoch": 0.3201561737432894,
"grad_norm": 1.7887264490127563,
"learning_rate": 9.890997581063757e-06,
"loss": 0.4231,
"step": 1640
},
{
"epoch": 0.32210834553440704,
"grad_norm": 1.6638391017913818,
"learning_rate": 9.887431591370707e-06,
"loss": 0.3866,
"step": 1650
},
{
"epoch": 0.32406051732552466,
"grad_norm": 1.5746939182281494,
"learning_rate": 9.883808870216016e-06,
"loss": 0.4002,
"step": 1660
},
{
"epoch": 0.3260126891166423,
"grad_norm": 1.737459421157837,
"learning_rate": 9.880129459650867e-06,
"loss": 0.4179,
"step": 1670
},
{
"epoch": 0.3279648609077599,
"grad_norm": 1.6839922666549683,
"learning_rate": 9.87639340238447e-06,
"loss": 0.3991,
"step": 1680
},
{
"epoch": 0.3299170326988775,
"grad_norm": 2.228264331817627,
"learning_rate": 9.872600741783565e-06,
"loss": 0.3815,
"step": 1690
},
{
"epoch": 0.33186920448999513,
"grad_norm": 1.7104638814926147,
"learning_rate": 9.868751521871929e-06,
"loss": 0.3833,
"step": 1700
},
{
"epoch": 0.33382137628111275,
"grad_norm": 2.207789182662964,
"learning_rate": 9.864845787329851e-06,
"loss": 0.3806,
"step": 1710
},
{
"epoch": 0.33577354807223037,
"grad_norm": 1.7838051319122314,
"learning_rate": 9.860883583493624e-06,
"loss": 0.4102,
"step": 1720
},
{
"epoch": 0.337725719863348,
"grad_norm": 1.4554154872894287,
"learning_rate": 9.856864956355018e-06,
"loss": 0.402,
"step": 1730
},
{
"epoch": 0.3396778916544656,
"grad_norm": 2.0781755447387695,
"learning_rate": 9.852789952560737e-06,
"loss": 0.395,
"step": 1740
},
{
"epoch": 0.3416300634455832,
"grad_norm": 1.8114964962005615,
"learning_rate": 9.848658619411889e-06,
"loss": 0.3727,
"step": 1750
},
{
"epoch": 0.34358223523670084,
"grad_norm": 1.6250026226043701,
"learning_rate": 9.84447100486343e-06,
"loss": 0.4073,
"step": 1760
},
{
"epoch": 0.34553440702781846,
"grad_norm": 2.6292777061462402,
"learning_rate": 9.84022715752361e-06,
"loss": 0.3897,
"step": 1770
},
{
"epoch": 0.3474865788189361,
"grad_norm": 1.6362555027008057,
"learning_rate": 9.835927126653407e-06,
"loss": 0.3935,
"step": 1780
},
{
"epoch": 0.3494387506100537,
"grad_norm": 1.828399658203125,
"learning_rate": 9.831570962165955e-06,
"loss": 0.4215,
"step": 1790
},
{
"epoch": 0.3513909224011713,
"grad_norm": 2.1395528316497803,
"learning_rate": 9.827158714625971e-06,
"loss": 0.3982,
"step": 1800
},
{
"epoch": 0.35334309419228893,
"grad_norm": 1.7870949506759644,
"learning_rate": 9.822690435249157e-06,
"loss": 0.3871,
"step": 1810
},
{
"epoch": 0.35529526598340655,
"grad_norm": 1.8139511346817017,
"learning_rate": 9.818166175901613e-06,
"loss": 0.4085,
"step": 1820
},
{
"epoch": 0.35724743777452417,
"grad_norm": 1.7053240537643433,
"learning_rate": 9.813585989099237e-06,
"loss": 0.3874,
"step": 1830
},
{
"epoch": 0.3591996095656418,
"grad_norm": 1.5838854312896729,
"learning_rate": 9.808949928007108e-06,
"loss": 0.3814,
"step": 1840
},
{
"epoch": 0.3611517813567594,
"grad_norm": 1.6612449884414673,
"learning_rate": 9.804258046438875e-06,
"loss": 0.3871,
"step": 1850
},
{
"epoch": 0.363103953147877,
"grad_norm": 1.915330410003662,
"learning_rate": 9.799510398856125e-06,
"loss": 0.4036,
"step": 1860
},
{
"epoch": 0.36505612493899464,
"grad_norm": 2.415217161178589,
"learning_rate": 9.794707040367763e-06,
"loss": 0.4082,
"step": 1870
},
{
"epoch": 0.36700829673011226,
"grad_norm": 2.1422359943389893,
"learning_rate": 9.78984802672936e-06,
"loss": 0.4177,
"step": 1880
},
{
"epoch": 0.3689604685212299,
"grad_norm": 1.7647225856781006,
"learning_rate": 9.784933414342516e-06,
"loss": 0.397,
"step": 1890
},
{
"epoch": 0.3709126403123475,
"grad_norm": 1.5418081283569336,
"learning_rate": 9.779963260254193e-06,
"loss": 0.3798,
"step": 1900
},
{
"epoch": 0.3728648121034651,
"grad_norm": 1.7419403791427612,
"learning_rate": 9.774937622156065e-06,
"loss": 0.3956,
"step": 1910
},
{
"epoch": 0.37481698389458273,
"grad_norm": 1.7291524410247803,
"learning_rate": 9.769856558383845e-06,
"loss": 0.3919,
"step": 1920
},
{
"epoch": 0.37676915568570035,
"grad_norm": 1.8995845317840576,
"learning_rate": 9.764720127916601e-06,
"loss": 0.395,
"step": 1930
},
{
"epoch": 0.37872132747681797,
"grad_norm": 1.7812365293502808,
"learning_rate": 9.759528390376081e-06,
"loss": 0.3846,
"step": 1940
},
{
"epoch": 0.3806734992679356,
"grad_norm": 1.9531811475753784,
"learning_rate": 9.754281406026016e-06,
"loss": 0.4002,
"step": 1950
},
{
"epoch": 0.3826256710590532,
"grad_norm": 1.832743525505066,
"learning_rate": 9.74897923577142e-06,
"loss": 0.3889,
"step": 1960
},
{
"epoch": 0.3845778428501708,
"grad_norm": 1.8087397813796997,
"learning_rate": 9.743621941157884e-06,
"loss": 0.3731,
"step": 1970
},
{
"epoch": 0.38653001464128844,
"grad_norm": 2.172013521194458,
"learning_rate": 9.738209584370862e-06,
"loss": 0.3859,
"step": 1980
},
{
"epoch": 0.38848218643240606,
"grad_norm": 1.6979947090148926,
"learning_rate": 9.732742228234948e-06,
"loss": 0.3776,
"step": 1990
},
{
"epoch": 0.3904343582235237,
"grad_norm": 1.8701212406158447,
"learning_rate": 9.727219936213153e-06,
"loss": 0.4074,
"step": 2000
},
{
"epoch": 0.3923865300146413,
"grad_norm": 1.4129152297973633,
"learning_rate": 9.721642772406156e-06,
"loss": 0.3775,
"step": 2010
},
{
"epoch": 0.3943387018057589,
"grad_norm": 1.6878793239593506,
"learning_rate": 9.71601080155157e-06,
"loss": 0.3941,
"step": 2020
},
{
"epoch": 0.39629087359687654,
"grad_norm": 2.422609806060791,
"learning_rate": 9.710324089023188e-06,
"loss": 0.3792,
"step": 2030
},
{
"epoch": 0.39824304538799415,
"grad_norm": 1.4131423234939575,
"learning_rate": 9.704582700830223e-06,
"loss": 0.3888,
"step": 2040
},
{
"epoch": 0.4001952171791118,
"grad_norm": 1.5815601348876953,
"learning_rate": 9.698786703616543e-06,
"loss": 0.3599,
"step": 2050
},
{
"epoch": 0.4021473889702294,
"grad_norm": 1.8221068382263184,
"learning_rate": 9.692936164659897e-06,
"loss": 0.3532,
"step": 2060
},
{
"epoch": 0.404099560761347,
"grad_norm": 1.6556801795959473,
"learning_rate": 9.68703115187113e-06,
"loss": 0.3744,
"step": 2070
},
{
"epoch": 0.40605173255246463,
"grad_norm": 1.666934609413147,
"learning_rate": 9.681071733793408e-06,
"loss": 0.3976,
"step": 2080
},
{
"epoch": 0.40800390434358225,
"grad_norm": 1.9403831958770752,
"learning_rate": 9.675057979601404e-06,
"loss": 0.3837,
"step": 2090
},
{
"epoch": 0.40995607613469986,
"grad_norm": 1.6218396425247192,
"learning_rate": 9.668989959100505e-06,
"loss": 0.4007,
"step": 2100
},
{
"epoch": 0.4119082479258175,
"grad_norm": 1.9031404256820679,
"learning_rate": 9.662867742726008e-06,
"loss": 0.3719,
"step": 2110
},
{
"epoch": 0.4138604197169351,
"grad_norm": 1.7567657232284546,
"learning_rate": 9.656691401542288e-06,
"loss": 0.3878,
"step": 2120
},
{
"epoch": 0.4158125915080527,
"grad_norm": 1.6076161861419678,
"learning_rate": 9.650461007241983e-06,
"loss": 0.3982,
"step": 2130
},
{
"epoch": 0.41776476329917034,
"grad_norm": 1.894406795501709,
"learning_rate": 9.644176632145158e-06,
"loss": 0.3462,
"step": 2140
},
{
"epoch": 0.41971693509028796,
"grad_norm": 1.7494549751281738,
"learning_rate": 9.637838349198469e-06,
"loss": 0.3829,
"step": 2150
},
{
"epoch": 0.4216691068814056,
"grad_norm": 2.3885393142700195,
"learning_rate": 9.631446231974313e-06,
"loss": 0.3997,
"step": 2160
},
{
"epoch": 0.4236212786725232,
"grad_norm": 1.9122508764266968,
"learning_rate": 9.625000354669973e-06,
"loss": 0.3661,
"step": 2170
},
{
"epoch": 0.4255734504636408,
"grad_norm": 1.6724847555160522,
"learning_rate": 9.618500792106765e-06,
"loss": 0.3662,
"step": 2180
},
{
"epoch": 0.42752562225475843,
"grad_norm": 1.5858006477355957,
"learning_rate": 9.61194761972915e-06,
"loss": 0.388,
"step": 2190
},
{
"epoch": 0.42947779404587605,
"grad_norm": 1.6714067459106445,
"learning_rate": 9.60534091360389e-06,
"loss": 0.3734,
"step": 2200
},
{
"epoch": 0.43142996583699367,
"grad_norm": 1.657956838607788,
"learning_rate": 9.598680750419128e-06,
"loss": 0.3888,
"step": 2210
},
{
"epoch": 0.4333821376281113,
"grad_norm": 1.7038514614105225,
"learning_rate": 9.59196720748353e-06,
"loss": 0.3744,
"step": 2220
},
{
"epoch": 0.4353343094192289,
"grad_norm": 1.6116293668746948,
"learning_rate": 9.585200362725369e-06,
"loss": 0.401,
"step": 2230
},
{
"epoch": 0.4372864812103465,
"grad_norm": 1.8921372890472412,
"learning_rate": 9.578380294691626e-06,
"loss": 0.3734,
"step": 2240
},
{
"epoch": 0.43923865300146414,
"grad_norm": 1.7467238903045654,
"learning_rate": 9.571507082547071e-06,
"loss": 0.364,
"step": 2250
},
{
"epoch": 0.44119082479258176,
"grad_norm": 1.659144639968872,
"learning_rate": 9.564580806073365e-06,
"loss": 0.3746,
"step": 2260
},
{
"epoch": 0.4431429965836994,
"grad_norm": 2.034449577331543,
"learning_rate": 9.557601545668107e-06,
"loss": 0.363,
"step": 2270
},
{
"epoch": 0.445095168374817,
"grad_norm": 1.5256868600845337,
"learning_rate": 9.55056938234392e-06,
"loss": 0.3709,
"step": 2280
},
{
"epoch": 0.4470473401659346,
"grad_norm": 1.6245321035385132,
"learning_rate": 9.543484397727502e-06,
"loss": 0.3791,
"step": 2290
},
{
"epoch": 0.44899951195705223,
"grad_norm": 1.7325410842895508,
"learning_rate": 9.53634667405868e-06,
"loss": 0.3925,
"step": 2300
},
{
"epoch": 0.45095168374816985,
"grad_norm": 1.793959379196167,
"learning_rate": 9.529156294189459e-06,
"loss": 0.3661,
"step": 2310
},
{
"epoch": 0.45290385553928747,
"grad_norm": 1.762374758720398,
"learning_rate": 9.521913341583051e-06,
"loss": 0.3517,
"step": 2320
},
{
"epoch": 0.4548560273304051,
"grad_norm": 1.7093387842178345,
"learning_rate": 9.51461790031292e-06,
"loss": 0.3404,
"step": 2330
},
{
"epoch": 0.4568081991215227,
"grad_norm": 1.7257992029190063,
"learning_rate": 9.507270055061798e-06,
"loss": 0.376,
"step": 2340
},
{
"epoch": 0.4587603709126403,
"grad_norm": 1.5308430194854736,
"learning_rate": 9.499869891120694e-06,
"loss": 0.3713,
"step": 2350
},
{
"epoch": 0.46071254270375794,
"grad_norm": 2.258150339126587,
"learning_rate": 9.492417494387923e-06,
"loss": 0.3804,
"step": 2360
},
{
"epoch": 0.46266471449487556,
"grad_norm": 1.6879998445510864,
"learning_rate": 9.48491295136809e-06,
"loss": 0.3572,
"step": 2370
},
{
"epoch": 0.4646168862859932,
"grad_norm": 1.9355443716049194,
"learning_rate": 9.477356349171102e-06,
"loss": 0.3513,
"step": 2380
},
{
"epoch": 0.4665690580771108,
"grad_norm": 1.737768292427063,
"learning_rate": 9.469747775511147e-06,
"loss": 0.3613,
"step": 2390
},
{
"epoch": 0.4685212298682284,
"grad_norm": 2.228402614593506,
"learning_rate": 9.462087318705672e-06,
"loss": 0.3487,
"step": 2400
},
{
"epoch": 0.47047340165934604,
"grad_norm": 1.715980887413025,
"learning_rate": 9.454375067674374e-06,
"loss": 0.3774,
"step": 2410
},
{
"epoch": 0.47242557345046365,
"grad_norm": 1.7476235628128052,
"learning_rate": 9.446611111938149e-06,
"loss": 0.3557,
"step": 2420
},
{
"epoch": 0.4743777452415813,
"grad_norm": 1.7379567623138428,
"learning_rate": 9.438795541618067e-06,
"loss": 0.3673,
"step": 2430
},
{
"epoch": 0.4763299170326989,
"grad_norm": 1.634992241859436,
"learning_rate": 9.430928447434317e-06,
"loss": 0.3786,
"step": 2440
},
{
"epoch": 0.4782820888238165,
"grad_norm": 1.5183148384094238,
"learning_rate": 9.423009920705163e-06,
"loss": 0.3593,
"step": 2450
},
{
"epoch": 0.4802342606149341,
"grad_norm": 1.7934198379516602,
"learning_rate": 9.415040053345876e-06,
"loss": 0.3666,
"step": 2460
},
{
"epoch": 0.48218643240605175,
"grad_norm": 2.3873658180236816,
"learning_rate": 9.407018937867665e-06,
"loss": 0.3771,
"step": 2470
},
{
"epoch": 0.48413860419716936,
"grad_norm": 1.7866123914718628,
"learning_rate": 9.398946667376614e-06,
"loss": 0.3659,
"step": 2480
},
{
"epoch": 0.486090775988287,
"grad_norm": 1.4919376373291016,
"learning_rate": 9.390823335572591e-06,
"loss": 0.3832,
"step": 2490
},
{
"epoch": 0.4880429477794046,
"grad_norm": 1.5056414604187012,
"learning_rate": 9.382649036748168e-06,
"loss": 0.3439,
"step": 2500
},
{
"epoch": 0.4899951195705222,
"grad_norm": 1.6114498376846313,
"learning_rate": 9.374423865787521e-06,
"loss": 0.3483,
"step": 2510
},
{
"epoch": 0.49194729136163984,
"grad_norm": 1.7268636226654053,
"learning_rate": 9.36614791816533e-06,
"loss": 0.3607,
"step": 2520
},
{
"epoch": 0.49389946315275746,
"grad_norm": 1.705761432647705,
"learning_rate": 9.357821289945673e-06,
"loss": 0.3373,
"step": 2530
},
{
"epoch": 0.4958516349438751,
"grad_norm": 1.5740495920181274,
"learning_rate": 9.349444077780905e-06,
"loss": 0.336,
"step": 2540
},
{
"epoch": 0.4978038067349927,
"grad_norm": 1.6093052625656128,
"learning_rate": 9.34101637891055e-06,
"loss": 0.3538,
"step": 2550
},
{
"epoch": 0.4997559785261103,
"grad_norm": 1.5724884271621704,
"learning_rate": 9.33253829116015e-06,
"loss": 0.368,
"step": 2560
},
{
"epoch": 0.5017081503172279,
"grad_norm": 2.033222198486328,
"learning_rate": 9.324009912940151e-06,
"loss": 0.3808,
"step": 2570
},
{
"epoch": 0.5036603221083455,
"grad_norm": 1.569170594215393,
"learning_rate": 9.315431343244752e-06,
"loss": 0.3768,
"step": 2580
},
{
"epoch": 0.5056124938994632,
"grad_norm": 1.5116662979125977,
"learning_rate": 9.306802681650748e-06,
"loss": 0.3697,
"step": 2590
},
{
"epoch": 0.5075646656905808,
"grad_norm": 1.8755792379379272,
"learning_rate": 9.298124028316388e-06,
"loss": 0.3731,
"step": 2600
},
{
"epoch": 0.5095168374816984,
"grad_norm": 1.7512634992599487,
"learning_rate": 9.289395483980209e-06,
"loss": 0.3707,
"step": 2610
},
{
"epoch": 0.511469009272816,
"grad_norm": 1.724428415298462,
"learning_rate": 9.280617149959853e-06,
"loss": 0.3623,
"step": 2620
},
{
"epoch": 0.5134211810639336,
"grad_norm": 2.088242530822754,
"learning_rate": 9.271789128150916e-06,
"loss": 0.3356,
"step": 2630
},
{
"epoch": 0.5153733528550513,
"grad_norm": 1.7629714012145996,
"learning_rate": 9.262911521025738e-06,
"loss": 0.3543,
"step": 2640
},
{
"epoch": 0.5173255246461689,
"grad_norm": 1.510256052017212,
"learning_rate": 9.253984431632238e-06,
"loss": 0.342,
"step": 2650
},
{
"epoch": 0.5192776964372865,
"grad_norm": 1.78757905960083,
"learning_rate": 9.245007963592697e-06,
"loss": 0.3785,
"step": 2660
},
{
"epoch": 0.5212298682284041,
"grad_norm": 1.7818865776062012,
"learning_rate": 9.235982221102569e-06,
"loss": 0.35,
"step": 2670
},
{
"epoch": 0.5231820400195217,
"grad_norm": 1.6107079982757568,
"learning_rate": 9.226907308929268e-06,
"loss": 0.3462,
"step": 2680
},
{
"epoch": 0.5251342118106394,
"grad_norm": 1.8741298913955688,
"learning_rate": 9.21778333241095e-06,
"loss": 0.358,
"step": 2690
},
{
"epoch": 0.527086383601757,
"grad_norm": 1.697345495223999,
"learning_rate": 9.208610397455292e-06,
"loss": 0.3539,
"step": 2700
},
{
"epoch": 0.5290385553928746,
"grad_norm": 1.5936979055404663,
"learning_rate": 9.199388610538261e-06,
"loss": 0.3656,
"step": 2710
},
{
"epoch": 0.5309907271839922,
"grad_norm": 1.8244935274124146,
"learning_rate": 9.190118078702879e-06,
"loss": 0.3791,
"step": 2720
},
{
"epoch": 0.5329428989751098,
"grad_norm": 1.7835068702697754,
"learning_rate": 9.180798909557982e-06,
"loss": 0.3642,
"step": 2730
},
{
"epoch": 0.5348950707662274,
"grad_norm": 1.4222677946090698,
"learning_rate": 9.17143121127697e-06,
"loss": 0.3969,
"step": 2740
},
{
"epoch": 0.5368472425573451,
"grad_norm": 1.6140766143798828,
"learning_rate": 9.162015092596546e-06,
"loss": 0.337,
"step": 2750
},
{
"epoch": 0.5387994143484627,
"grad_norm": 1.4900151491165161,
"learning_rate": 9.152550662815468e-06,
"loss": 0.3686,
"step": 2760
},
{
"epoch": 0.5407515861395803,
"grad_norm": 1.7115081548690796,
"learning_rate": 9.143038031793259e-06,
"loss": 0.3825,
"step": 2770
},
{
"epoch": 0.5427037579306979,
"grad_norm": 1.9142796993255615,
"learning_rate": 9.133477309948956e-06,
"loss": 0.347,
"step": 2780
},
{
"epoch": 0.5446559297218155,
"grad_norm": 1.6392863988876343,
"learning_rate": 9.123868608259808e-06,
"loss": 0.3571,
"step": 2790
},
{
"epoch": 0.5466081015129332,
"grad_norm": 1.9869853258132935,
"learning_rate": 9.114212038259998e-06,
"loss": 0.3656,
"step": 2800
},
{
"epoch": 0.5485602733040508,
"grad_norm": 1.797553300857544,
"learning_rate": 9.104507712039348e-06,
"loss": 0.3563,
"step": 2810
},
{
"epoch": 0.5505124450951684,
"grad_norm": 1.4315779209136963,
"learning_rate": 9.094755742242014e-06,
"loss": 0.3608,
"step": 2820
},
{
"epoch": 0.552464616886286,
"grad_norm": 1.5433530807495117,
"learning_rate": 9.084956242065182e-06,
"loss": 0.3531,
"step": 2830
},
{
"epoch": 0.5544167886774036,
"grad_norm": 1.5521483421325684,
"learning_rate": 9.07510932525775e-06,
"loss": 0.3431,
"step": 2840
},
{
"epoch": 0.5563689604685212,
"grad_norm": 1.7205086946487427,
"learning_rate": 9.065215106119017e-06,
"loss": 0.3506,
"step": 2850
},
{
"epoch": 0.5583211322596389,
"grad_norm": 1.5988975763320923,
"learning_rate": 9.05527369949734e-06,
"loss": 0.3519,
"step": 2860
},
{
"epoch": 0.5602733040507565,
"grad_norm": 1.5551445484161377,
"learning_rate": 9.04528522078882e-06,
"loss": 0.3413,
"step": 2870
},
{
"epoch": 0.5622254758418741,
"grad_norm": 1.4696311950683594,
"learning_rate": 9.035249785935946e-06,
"loss": 0.3604,
"step": 2880
},
{
"epoch": 0.5641776476329917,
"grad_norm": 1.6763559579849243,
"learning_rate": 9.02516751142626e-06,
"loss": 0.3624,
"step": 2890
},
{
"epoch": 0.5661298194241093,
"grad_norm": 1.6215537786483765,
"learning_rate": 9.015038514290999e-06,
"loss": 0.3401,
"step": 2900
},
{
"epoch": 0.568081991215227,
"grad_norm": 1.4154938459396362,
"learning_rate": 9.00486291210374e-06,
"loss": 0.3575,
"step": 2910
},
{
"epoch": 0.5700341630063446,
"grad_norm": 1.8908969163894653,
"learning_rate": 8.994640822979036e-06,
"loss": 0.3304,
"step": 2920
},
{
"epoch": 0.5719863347974622,
"grad_norm": 1.6332852840423584,
"learning_rate": 8.984372365571036e-06,
"loss": 0.3253,
"step": 2930
},
{
"epoch": 0.5739385065885798,
"grad_norm": 1.5187574625015259,
"learning_rate": 8.974057659072121e-06,
"loss": 0.3545,
"step": 2940
},
{
"epoch": 0.5758906783796974,
"grad_norm": 1.7546658515930176,
"learning_rate": 8.963696823211512e-06,
"loss": 0.356,
"step": 2950
},
{
"epoch": 0.577842850170815,
"grad_norm": 1.621563196182251,
"learning_rate": 8.953289978253881e-06,
"loss": 0.3129,
"step": 2960
},
{
"epoch": 0.5797950219619327,
"grad_norm": 1.652968406677246,
"learning_rate": 8.942837244997959e-06,
"loss": 0.3554,
"step": 2970
},
{
"epoch": 0.5817471937530503,
"grad_norm": 1.8834774494171143,
"learning_rate": 8.932338744775128e-06,
"loss": 0.3588,
"step": 2980
},
{
"epoch": 0.5836993655441679,
"grad_norm": 2.5238523483276367,
"learning_rate": 8.921794599448015e-06,
"loss": 0.3388,
"step": 2990
},
{
"epoch": 0.5856515373352855,
"grad_norm": 1.6469742059707642,
"learning_rate": 8.911204931409084e-06,
"loss": 0.3824,
"step": 3000
},
{
"epoch": 0.5876037091264031,
"grad_norm": 1.6612112522125244,
"learning_rate": 8.900569863579203e-06,
"loss": 0.3519,
"step": 3010
},
{
"epoch": 0.5895558809175208,
"grad_norm": 1.8060604333877563,
"learning_rate": 8.889889519406227e-06,
"loss": 0.3393,
"step": 3020
},
{
"epoch": 0.5915080527086384,
"grad_norm": 1.6979930400848389,
"learning_rate": 8.879164022863562e-06,
"loss": 0.3309,
"step": 3030
},
{
"epoch": 0.593460224499756,
"grad_norm": 1.366909384727478,
"learning_rate": 8.868393498448724e-06,
"loss": 0.3309,
"step": 3040
},
{
"epoch": 0.5954123962908736,
"grad_norm": 1.539758324623108,
"learning_rate": 8.857578071181894e-06,
"loss": 0.3379,
"step": 3050
},
{
"epoch": 0.5973645680819912,
"grad_norm": 1.5678811073303223,
"learning_rate": 8.846717866604471e-06,
"loss": 0.3607,
"step": 3060
},
{
"epoch": 0.5993167398731089,
"grad_norm": 1.8162411451339722,
"learning_rate": 8.835813010777615e-06,
"loss": 0.3653,
"step": 3070
},
{
"epoch": 0.6012689116642265,
"grad_norm": 1.7801045179367065,
"learning_rate": 8.824863630280775e-06,
"loss": 0.3545,
"step": 3080
},
{
"epoch": 0.6032210834553441,
"grad_norm": 1.7757545709609985,
"learning_rate": 8.813869852210228e-06,
"loss": 0.3383,
"step": 3090
},
{
"epoch": 0.6051732552464617,
"grad_norm": 1.4703953266143799,
"learning_rate": 8.802831804177601e-06,
"loss": 0.3305,
"step": 3100
},
{
"epoch": 0.6071254270375793,
"grad_norm": 1.6324414014816284,
"learning_rate": 8.791749614308392e-06,
"loss": 0.3386,
"step": 3110
},
{
"epoch": 0.6090775988286969,
"grad_norm": 2.0556728839874268,
"learning_rate": 8.780623411240477e-06,
"loss": 0.3473,
"step": 3120
},
{
"epoch": 0.6110297706198146,
"grad_norm": 1.7078546285629272,
"learning_rate": 8.769453324122625e-06,
"loss": 0.3279,
"step": 3130
},
{
"epoch": 0.6129819424109322,
"grad_norm": 1.872077226638794,
"learning_rate": 8.758239482612992e-06,
"loss": 0.3765,
"step": 3140
},
{
"epoch": 0.6149341142020498,
"grad_norm": 2.0109453201293945,
"learning_rate": 8.746982016877616e-06,
"loss": 0.3155,
"step": 3150
},
{
"epoch": 0.6168862859931674,
"grad_norm": 1.9242374897003174,
"learning_rate": 8.735681057588914e-06,
"loss": 0.3679,
"step": 3160
},
{
"epoch": 0.618838457784285,
"grad_norm": 1.6928529739379883,
"learning_rate": 8.724336735924155e-06,
"loss": 0.3589,
"step": 3170
},
{
"epoch": 0.6207906295754027,
"grad_norm": 2.123342990875244,
"learning_rate": 8.712949183563945e-06,
"loss": 0.3265,
"step": 3180
},
{
"epoch": 0.6227428013665203,
"grad_norm": 1.4975687265396118,
"learning_rate": 8.701518532690696e-06,
"loss": 0.3466,
"step": 3190
},
{
"epoch": 0.6246949731576379,
"grad_norm": 2.0426642894744873,
"learning_rate": 8.690044915987091e-06,
"loss": 0.3662,
"step": 3200
},
{
"epoch": 0.6266471449487555,
"grad_norm": 1.6315159797668457,
"learning_rate": 8.678528466634537e-06,
"loss": 0.3118,
"step": 3210
},
{
"epoch": 0.6285993167398731,
"grad_norm": 1.6052002906799316,
"learning_rate": 8.666969318311638e-06,
"loss": 0.3499,
"step": 3220
},
{
"epoch": 0.6305514885309907,
"grad_norm": 1.821866750717163,
"learning_rate": 8.655367605192623e-06,
"loss": 0.3231,
"step": 3230
},
{
"epoch": 0.6325036603221084,
"grad_norm": 1.8277753591537476,
"learning_rate": 8.643723461945804e-06,
"loss": 0.3369,
"step": 3240
},
{
"epoch": 0.634455832113226,
"grad_norm": 1.5077341794967651,
"learning_rate": 8.632037023731997e-06,
"loss": 0.3599,
"step": 3250
},
{
"epoch": 0.6364080039043436,
"grad_norm": 1.663934350013733,
"learning_rate": 8.62030842620297e-06,
"loss": 0.3359,
"step": 3260
},
{
"epoch": 0.6383601756954612,
"grad_norm": 1.6589021682739258,
"learning_rate": 8.608537805499854e-06,
"loss": 0.3467,
"step": 3270
},
{
"epoch": 0.6403123474865788,
"grad_norm": 1.6243420839309692,
"learning_rate": 8.596725298251578e-06,
"loss": 0.3488,
"step": 3280
},
{
"epoch": 0.6422645192776965,
"grad_norm": 2.027372360229492,
"learning_rate": 8.584871041573263e-06,
"loss": 0.3394,
"step": 3290
},
{
"epoch": 0.6442166910688141,
"grad_norm": 1.519249677658081,
"learning_rate": 8.572975173064651e-06,
"loss": 0.3445,
"step": 3300
},
{
"epoch": 0.6461688628599317,
"grad_norm": 1.7421610355377197,
"learning_rate": 8.561037830808493e-06,
"loss": 0.3308,
"step": 3310
},
{
"epoch": 0.6481210346510493,
"grad_norm": 1.7053309679031372,
"learning_rate": 8.549059153368954e-06,
"loss": 0.3176,
"step": 3320
},
{
"epoch": 0.6500732064421669,
"grad_norm": 1.6674582958221436,
"learning_rate": 8.537039279790002e-06,
"loss": 0.3247,
"step": 3330
},
{
"epoch": 0.6520253782332845,
"grad_norm": 1.5092159509658813,
"learning_rate": 8.524978349593791e-06,
"loss": 0.3505,
"step": 3340
},
{
"epoch": 0.6539775500244022,
"grad_norm": 1.6612021923065186,
"learning_rate": 8.512876502779053e-06,
"loss": 0.3256,
"step": 3350
},
{
"epoch": 0.6559297218155198,
"grad_norm": 1.7003108263015747,
"learning_rate": 8.500733879819453e-06,
"loss": 0.3453,
"step": 3360
},
{
"epoch": 0.6578818936066374,
"grad_norm": 1.549048662185669,
"learning_rate": 8.488550621661982e-06,
"loss": 0.3548,
"step": 3370
},
{
"epoch": 0.659834065397755,
"grad_norm": 1.461796522140503,
"learning_rate": 8.476326869725297e-06,
"loss": 0.3154,
"step": 3380
},
{
"epoch": 0.6617862371888726,
"grad_norm": 1.7047860622406006,
"learning_rate": 8.464062765898104e-06,
"loss": 0.3232,
"step": 3390
},
{
"epoch": 0.6637384089799903,
"grad_norm": 1.7252593040466309,
"learning_rate": 8.45175845253749e-06,
"loss": 0.3207,
"step": 3400
},
{
"epoch": 0.6656905807711079,
"grad_norm": 1.7007282972335815,
"learning_rate": 8.43941407246728e-06,
"loss": 0.3419,
"step": 3410
},
{
"epoch": 0.6676427525622255,
"grad_norm": 1.7178446054458618,
"learning_rate": 8.42702976897638e-06,
"loss": 0.3348,
"step": 3420
},
{
"epoch": 0.6695949243533431,
"grad_norm": 1.611234426498413,
"learning_rate": 8.414605685817115e-06,
"loss": 0.3095,
"step": 3430
},
{
"epoch": 0.6715470961444607,
"grad_norm": 1.9325944185256958,
"learning_rate": 8.40214196720355e-06,
"loss": 0.3212,
"step": 3440
},
{
"epoch": 0.6734992679355783,
"grad_norm": 1.4901853799819946,
"learning_rate": 8.38963875780983e-06,
"loss": 0.3535,
"step": 3450
},
{
"epoch": 0.675451439726696,
"grad_norm": 1.5685844421386719,
"learning_rate": 8.37709620276849e-06,
"loss": 0.3426,
"step": 3460
},
{
"epoch": 0.6774036115178136,
"grad_norm": 1.7217646837234497,
"learning_rate": 8.364514447668777e-06,
"loss": 0.3626,
"step": 3470
},
{
"epoch": 0.6793557833089312,
"grad_norm": 1.7846200466156006,
"learning_rate": 8.351893638554957e-06,
"loss": 0.3245,
"step": 3480
},
{
"epoch": 0.6813079551000488,
"grad_norm": 1.9821772575378418,
"learning_rate": 8.339233921924619e-06,
"loss": 0.338,
"step": 3490
},
{
"epoch": 0.6832601268911664,
"grad_norm": 1.7743932008743286,
"learning_rate": 8.326535444726975e-06,
"loss": 0.3272,
"step": 3500
},
{
"epoch": 0.6852122986822841,
"grad_norm": 1.653472900390625,
"learning_rate": 8.31379835436116e-06,
"loss": 0.3434,
"step": 3510
},
{
"epoch": 0.6871644704734017,
"grad_norm": 1.4776761531829834,
"learning_rate": 8.301022798674507e-06,
"loss": 0.3148,
"step": 3520
},
{
"epoch": 0.6891166422645193,
"grad_norm": 1.6221344470977783,
"learning_rate": 8.288208925960853e-06,
"loss": 0.3091,
"step": 3530
},
{
"epoch": 0.6910688140556369,
"grad_norm": 1.911304235458374,
"learning_rate": 8.27535688495879e-06,
"loss": 0.3709,
"step": 3540
},
{
"epoch": 0.6930209858467545,
"grad_norm": 1.5734984874725342,
"learning_rate": 8.262466824849965e-06,
"loss": 0.3239,
"step": 3550
},
{
"epoch": 0.6949731576378722,
"grad_norm": 1.9672952890396118,
"learning_rate": 8.24953889525733e-06,
"loss": 0.3296,
"step": 3560
},
{
"epoch": 0.6969253294289898,
"grad_norm": 2.76518177986145,
"learning_rate": 8.236573246243414e-06,
"loss": 0.3099,
"step": 3570
},
{
"epoch": 0.6988775012201074,
"grad_norm": 1.655221700668335,
"learning_rate": 8.223570028308578e-06,
"loss": 0.3378,
"step": 3580
},
{
"epoch": 0.700829673011225,
"grad_norm": 1.7827235460281372,
"learning_rate": 8.210529392389268e-06,
"loss": 0.321,
"step": 3590
},
{
"epoch": 0.7027818448023426,
"grad_norm": 1.5071234703063965,
"learning_rate": 8.197451489856265e-06,
"loss": 0.3394,
"step": 3600
},
{
"epoch": 0.7047340165934602,
"grad_norm": 1.7633196115493774,
"learning_rate": 8.184336472512926e-06,
"loss": 0.3154,
"step": 3610
},
{
"epoch": 0.7066861883845779,
"grad_norm": 1.2931830883026123,
"learning_rate": 8.171184492593427e-06,
"loss": 0.3198,
"step": 3620
},
{
"epoch": 0.7086383601756955,
"grad_norm": 1.9217031002044678,
"learning_rate": 8.157995702760985e-06,
"loss": 0.3144,
"step": 3630
},
{
"epoch": 0.7105905319668131,
"grad_norm": 1.7861008644104004,
"learning_rate": 8.144770256106095e-06,
"loss": 0.3357,
"step": 3640
},
{
"epoch": 0.7125427037579307,
"grad_norm": 1.4293644428253174,
"learning_rate": 8.131508306144753e-06,
"loss": 0.3309,
"step": 3650
},
{
"epoch": 0.7144948755490483,
"grad_norm": 1.50441575050354,
"learning_rate": 8.118210006816669e-06,
"loss": 0.3406,
"step": 3660
},
{
"epoch": 0.716447047340166,
"grad_norm": 1.6063085794448853,
"learning_rate": 8.104875512483484e-06,
"loss": 0.3207,
"step": 3670
},
{
"epoch": 0.7183992191312836,
"grad_norm": 1.720033049583435,
"learning_rate": 8.091504977926976e-06,
"loss": 0.3316,
"step": 3680
},
{
"epoch": 0.7203513909224012,
"grad_norm": 1.5257724523544312,
"learning_rate": 8.078098558347266e-06,
"loss": 0.3261,
"step": 3690
},
{
"epoch": 0.7223035627135188,
"grad_norm": 1.4936727285385132,
"learning_rate": 8.064656409361009e-06,
"loss": 0.3263,
"step": 3700
},
{
"epoch": 0.7242557345046364,
"grad_norm": 1.685624361038208,
"learning_rate": 8.051178686999604e-06,
"loss": 0.3366,
"step": 3710
},
{
"epoch": 0.726207906295754,
"grad_norm": 1.7392653226852417,
"learning_rate": 8.037665547707362e-06,
"loss": 0.3283,
"step": 3720
},
{
"epoch": 0.7281600780868717,
"grad_norm": 1.705322265625,
"learning_rate": 8.024117148339708e-06,
"loss": 0.3,
"step": 3730
},
{
"epoch": 0.7301122498779893,
"grad_norm": 1.7675234079360962,
"learning_rate": 8.010533646161345e-06,
"loss": 0.3359,
"step": 3740
},
{
"epoch": 0.7320644216691069,
"grad_norm": 1.575836181640625,
"learning_rate": 7.996915198844446e-06,
"loss": 0.3253,
"step": 3750
},
{
"epoch": 0.7340165934602245,
"grad_norm": 1.9200180768966675,
"learning_rate": 7.983261964466805e-06,
"loss": 0.3277,
"step": 3760
},
{
"epoch": 0.7359687652513421,
"grad_norm": 1.6772791147232056,
"learning_rate": 7.969574101510014e-06,
"loss": 0.3539,
"step": 3770
},
{
"epoch": 0.7379209370424598,
"grad_norm": 1.6115095615386963,
"learning_rate": 7.955851768857624e-06,
"loss": 0.3128,
"step": 3780
},
{
"epoch": 0.7398731088335774,
"grad_norm": 1.8706231117248535,
"learning_rate": 7.942095125793293e-06,
"loss": 0.3425,
"step": 3790
},
{
"epoch": 0.741825280624695,
"grad_norm": 1.7618359327316284,
"learning_rate": 7.928304331998942e-06,
"loss": 0.3108,
"step": 3800
},
{
"epoch": 0.7437774524158126,
"grad_norm": 1.5923467874526978,
"learning_rate": 7.914479547552901e-06,
"loss": 0.3205,
"step": 3810
},
{
"epoch": 0.7457296242069302,
"grad_norm": 1.465217113494873,
"learning_rate": 7.900620932928053e-06,
"loss": 0.3386,
"step": 3820
},
{
"epoch": 0.7476817959980478,
"grad_norm": 1.7743841409683228,
"learning_rate": 7.886728648989965e-06,
"loss": 0.3384,
"step": 3830
},
{
"epoch": 0.7496339677891655,
"grad_norm": 2.0260348320007324,
"learning_rate": 7.87280285699503e-06,
"loss": 0.3561,
"step": 3840
},
{
"epoch": 0.7515861395802831,
"grad_norm": 1.6193934679031372,
"learning_rate": 7.85884371858858e-06,
"loss": 0.3094,
"step": 3850
},
{
"epoch": 0.7535383113714007,
"grad_norm": 1.3467310667037964,
"learning_rate": 7.844851395803034e-06,
"loss": 0.3093,
"step": 3860
},
{
"epoch": 0.7554904831625183,
"grad_norm": 1.415460467338562,
"learning_rate": 7.830826051055989e-06,
"loss": 0.3311,
"step": 3870
},
{
"epoch": 0.7574426549536359,
"grad_norm": 1.631778359413147,
"learning_rate": 7.816767847148358e-06,
"loss": 0.3353,
"step": 3880
},
{
"epoch": 0.7593948267447536,
"grad_norm": 1.7144395112991333,
"learning_rate": 7.802676947262466e-06,
"loss": 0.3528,
"step": 3890
},
{
"epoch": 0.7613469985358712,
"grad_norm": 1.720763921737671,
"learning_rate": 7.788553514960158e-06,
"loss": 0.34,
"step": 3900
},
{
"epoch": 0.7632991703269888,
"grad_norm": 1.3365732431411743,
"learning_rate": 7.774397714180913e-06,
"loss": 0.3309,
"step": 3910
},
{
"epoch": 0.7652513421181064,
"grad_norm": 1.425312876701355,
"learning_rate": 7.760209709239921e-06,
"loss": 0.3342,
"step": 3920
},
{
"epoch": 0.767203513909224,
"grad_norm": 1.7869527339935303,
"learning_rate": 7.74598966482619e-06,
"loss": 0.3193,
"step": 3930
},
{
"epoch": 0.7691556857003417,
"grad_norm": 1.4641571044921875,
"learning_rate": 7.731737746000631e-06,
"loss": 0.3061,
"step": 3940
},
{
"epoch": 0.7711078574914593,
"grad_norm": 1.8277581930160522,
"learning_rate": 7.717454118194138e-06,
"loss": 0.3207,
"step": 3950
},
{
"epoch": 0.7730600292825769,
"grad_norm": 2.149855613708496,
"learning_rate": 7.703138947205672e-06,
"loss": 0.3146,
"step": 3960
},
{
"epoch": 0.7750122010736945,
"grad_norm": 10.179088592529297,
"learning_rate": 7.68879239920034e-06,
"loss": 0.3279,
"step": 3970
},
{
"epoch": 0.7769643728648121,
"grad_norm": 1.7173975706100464,
"learning_rate": 7.674414640707453e-06,
"loss": 0.3352,
"step": 3980
},
{
"epoch": 0.7789165446559297,
"grad_norm": 2.3102633953094482,
"learning_rate": 7.660005838618607e-06,
"loss": 0.3247,
"step": 3990
},
{
"epoch": 0.7808687164470474,
"grad_norm": 1.6337876319885254,
"learning_rate": 7.645566160185742e-06,
"loss": 0.3013,
"step": 4000
},
{
"epoch": 0.782820888238165,
"grad_norm": 1.7631540298461914,
"learning_rate": 7.631095773019195e-06,
"loss": 0.3227,
"step": 4010
},
{
"epoch": 0.7847730600292826,
"grad_norm": 1.5217206478118896,
"learning_rate": 7.616594845085759e-06,
"loss": 0.3213,
"step": 4020
},
{
"epoch": 0.7867252318204002,
"grad_norm": 1.7165250778198242,
"learning_rate": 7.602063544706735e-06,
"loss": 0.3075,
"step": 4030
},
{
"epoch": 0.7886774036115178,
"grad_norm": 1.6861894130706787,
"learning_rate": 7.587502040555972e-06,
"loss": 0.3044,
"step": 4040
},
{
"epoch": 0.7906295754026355,
"grad_norm": 1.8273816108703613,
"learning_rate": 7.572910501657918e-06,
"loss": 0.3362,
"step": 4050
},
{
"epoch": 0.7925817471937531,
"grad_norm": 1.779876470565796,
"learning_rate": 7.55828909738565e-06,
"loss": 0.3297,
"step": 4060
},
{
"epoch": 0.7945339189848707,
"grad_norm": 1.5581871271133423,
"learning_rate": 7.54363799745891e-06,
"loss": 0.2991,
"step": 4070
},
{
"epoch": 0.7964860907759883,
"grad_norm": 1.8014543056488037,
"learning_rate": 7.528957371942139e-06,
"loss": 0.3134,
"step": 4080
},
{
"epoch": 0.7984382625671059,
"grad_norm": 1.5220543146133423,
"learning_rate": 7.5142473912424975e-06,
"loss": 0.32,
"step": 4090
},
{
"epoch": 0.8003904343582235,
"grad_norm": 1.6463323831558228,
"learning_rate": 7.499508226107889e-06,
"loss": 0.308,
"step": 4100
},
{
"epoch": 0.8023426061493412,
"grad_norm": 1.795111060142517,
"learning_rate": 7.484740047624983e-06,
"loss": 0.311,
"step": 4110
},
{
"epoch": 0.8042947779404588,
"grad_norm": 1.662244200706482,
"learning_rate": 7.469943027217222e-06,
"loss": 0.3188,
"step": 4120
},
{
"epoch": 0.8062469497315764,
"grad_norm": 1.9081573486328125,
"learning_rate": 7.4551173366428355e-06,
"loss": 0.2983,
"step": 4130
},
{
"epoch": 0.808199121522694,
"grad_norm": 1.5372672080993652,
"learning_rate": 7.440263147992844e-06,
"loss": 0.3156,
"step": 4140
},
{
"epoch": 0.8101512933138116,
"grad_norm": 1.4703730344772339,
"learning_rate": 7.425380633689065e-06,
"loss": 0.3093,
"step": 4150
},
{
"epoch": 0.8121034651049293,
"grad_norm": 1.5877699851989746,
"learning_rate": 7.4104699664821076e-06,
"loss": 0.3129,
"step": 4160
},
{
"epoch": 0.8140556368960469,
"grad_norm": 1.7468280792236328,
"learning_rate": 7.395531319449372e-06,
"loss": 0.2965,
"step": 4170
},
{
"epoch": 0.8160078086871645,
"grad_norm": 1.7475582361221313,
"learning_rate": 7.380564865993034e-06,
"loss": 0.3188,
"step": 4180
},
{
"epoch": 0.8179599804782821,
"grad_norm": 1.5171892642974854,
"learning_rate": 7.3655707798380385e-06,
"loss": 0.3098,
"step": 4190
},
{
"epoch": 0.8199121522693997,
"grad_norm": 1.7682745456695557,
"learning_rate": 7.35054923503008e-06,
"loss": 0.2925,
"step": 4200
},
{
"epoch": 0.8218643240605173,
"grad_norm": 1.6885298490524292,
"learning_rate": 7.335500405933581e-06,
"loss": 0.3171,
"step": 4210
},
{
"epoch": 0.823816495851635,
"grad_norm": 1.499431848526001,
"learning_rate": 7.320424467229673e-06,
"loss": 0.292,
"step": 4220
},
{
"epoch": 0.8257686676427526,
"grad_norm": 1.9566731452941895,
"learning_rate": 7.305321593914163e-06,
"loss": 0.3064,
"step": 4230
},
{
"epoch": 0.8277208394338702,
"grad_norm": 1.6873054504394531,
"learning_rate": 7.290191961295503e-06,
"loss": 0.3157,
"step": 4240
},
{
"epoch": 0.8296730112249878,
"grad_norm": 1.7769254446029663,
"learning_rate": 7.275035744992762e-06,
"loss": 0.3138,
"step": 4250
},
{
"epoch": 0.8316251830161054,
"grad_norm": 1.712579607963562,
"learning_rate": 7.2598531209335785e-06,
"loss": 0.3131,
"step": 4260
},
{
"epoch": 0.8335773548072231,
"grad_norm": 1.3291455507278442,
"learning_rate": 7.2446442653521235e-06,
"loss": 0.3207,
"step": 4270
},
{
"epoch": 0.8355295265983407,
"grad_norm": 1.687888503074646,
"learning_rate": 7.229409354787053e-06,
"loss": 0.3274,
"step": 4280
},
{
"epoch": 0.8374816983894583,
"grad_norm": 1.7257566452026367,
"learning_rate": 7.2141485660794605e-06,
"loss": 0.3261,
"step": 4290
},
{
"epoch": 0.8394338701805759,
"grad_norm": 3.248084545135498,
"learning_rate": 7.198862076370825e-06,
"loss": 0.3004,
"step": 4300
},
{
"epoch": 0.8413860419716935,
"grad_norm": 1.6880978345870972,
"learning_rate": 7.183550063100946e-06,
"loss": 0.2934,
"step": 4310
},
{
"epoch": 0.8433382137628112,
"grad_norm": 1.704007863998413,
"learning_rate": 7.168212704005899e-06,
"loss": 0.3134,
"step": 4320
},
{
"epoch": 0.8452903855539288,
"grad_norm": 2.11948823928833,
"learning_rate": 7.1528501771159585e-06,
"loss": 0.3172,
"step": 4330
},
{
"epoch": 0.8472425573450464,
"grad_norm": 1.5055100917816162,
"learning_rate": 7.137462660753542e-06,
"loss": 0.3174,
"step": 4340
},
{
"epoch": 0.849194729136164,
"grad_norm": 1.7868834733963013,
"learning_rate": 7.122050333531132e-06,
"loss": 0.3129,
"step": 4350
},
{
"epoch": 0.8511469009272816,
"grad_norm": 1.6094841957092285,
"learning_rate": 7.106613374349206e-06,
"loss": 0.3069,
"step": 4360
},
{
"epoch": 0.8530990727183992,
"grad_norm": 1.7572426795959473,
"learning_rate": 7.0911519623941625e-06,
"loss": 0.315,
"step": 4370
},
{
"epoch": 0.8550512445095169,
"grad_norm": 1.4624701738357544,
"learning_rate": 7.075666277136235e-06,
"loss": 0.3014,
"step": 4380
},
{
"epoch": 0.8570034163006345,
"grad_norm": 1.7297512292861938,
"learning_rate": 7.060156498327417e-06,
"loss": 0.314,
"step": 4390
},
{
"epoch": 0.8589555880917521,
"grad_norm": 1.429887294769287,
"learning_rate": 7.0446228059993675e-06,
"loss": 0.2955,
"step": 4400
},
{
"epoch": 0.8609077598828697,
"grad_norm": 1.4165374040603638,
"learning_rate": 7.029065380461324e-06,
"loss": 0.3195,
"step": 4410
},
{
"epoch": 0.8628599316739873,
"grad_norm": 1.5273165702819824,
"learning_rate": 7.013484402298014e-06,
"loss": 0.2905,
"step": 4420
},
{
"epoch": 0.864812103465105,
"grad_norm": 1.7598567008972168,
"learning_rate": 6.997880052367549e-06,
"loss": 0.3114,
"step": 4430
},
{
"epoch": 0.8667642752562226,
"grad_norm": 1.361842393875122,
"learning_rate": 6.98225251179934e-06,
"loss": 0.2928,
"step": 4440
},
{
"epoch": 0.8687164470473402,
"grad_norm": 1.6436288356781006,
"learning_rate": 6.9666019619919765e-06,
"loss": 0.321,
"step": 4450
},
{
"epoch": 0.8706686188384578,
"grad_norm": 1.7562702894210815,
"learning_rate": 6.950928584611135e-06,
"loss": 0.3194,
"step": 4460
},
{
"epoch": 0.8726207906295754,
"grad_norm": 1.9722226858139038,
"learning_rate": 6.935232561587468e-06,
"loss": 0.3087,
"step": 4470
},
{
"epoch": 0.874572962420693,
"grad_norm": 1.77361261844635,
"learning_rate": 6.9195140751144866e-06,
"loss": 0.3186,
"step": 4480
},
{
"epoch": 0.8765251342118107,
"grad_norm": 1.6900993585586548,
"learning_rate": 6.903773307646449e-06,
"loss": 0.2882,
"step": 4490
},
{
"epoch": 0.8784773060029283,
"grad_norm": 1.5208570957183838,
"learning_rate": 6.888010441896249e-06,
"loss": 0.292,
"step": 4500
},
{
"epoch": 0.8804294777940459,
"grad_norm": 1.6470943689346313,
"learning_rate": 6.872225660833278e-06,
"loss": 0.312,
"step": 4510
},
{
"epoch": 0.8823816495851635,
"grad_norm": 1.5475988388061523,
"learning_rate": 6.856419147681322e-06,
"loss": 0.3135,
"step": 4520
},
{
"epoch": 0.8843338213762811,
"grad_norm": 1.3954654932022095,
"learning_rate": 6.84059108591642e-06,
"loss": 0.2928,
"step": 4530
},
{
"epoch": 0.8862859931673988,
"grad_norm": 1.4919439554214478,
"learning_rate": 6.824741659264742e-06,
"loss": 0.2879,
"step": 4540
},
{
"epoch": 0.8882381649585164,
"grad_norm": 1.4646505117416382,
"learning_rate": 6.808871051700447e-06,
"loss": 0.3109,
"step": 4550
},
{
"epoch": 0.890190336749634,
"grad_norm": 1.4592580795288086,
"learning_rate": 6.792979447443565e-06,
"loss": 0.3197,
"step": 4560
},
{
"epoch": 0.8921425085407516,
"grad_norm": 1.6104093790054321,
"learning_rate": 6.777067030957838e-06,
"loss": 0.3099,
"step": 4570
},
{
"epoch": 0.8940946803318692,
"grad_norm": 2.3115813732147217,
"learning_rate": 6.7611339869485894e-06,
"loss": 0.3143,
"step": 4580
},
{
"epoch": 0.8960468521229868,
"grad_norm": 1.6102055311203003,
"learning_rate": 6.745180500360589e-06,
"loss": 0.2866,
"step": 4590
},
{
"epoch": 0.8979990239141045,
"grad_norm": 1.5225039720535278,
"learning_rate": 6.729206756375883e-06,
"loss": 0.2921,
"step": 4600
},
{
"epoch": 0.8999511957052221,
"grad_norm": 1.5761200189590454,
"learning_rate": 6.713212940411665e-06,
"loss": 0.3304,
"step": 4610
},
{
"epoch": 0.9019033674963397,
"grad_norm": 1.7699685096740723,
"learning_rate": 6.697199238118117e-06,
"loss": 0.2961,
"step": 4620
},
{
"epoch": 0.9038555392874573,
"grad_norm": 1.9456015825271606,
"learning_rate": 6.681165835376252e-06,
"loss": 0.3068,
"step": 4630
},
{
"epoch": 0.9058077110785749,
"grad_norm": 1.614859700202942,
"learning_rate": 6.665112918295759e-06,
"loss": 0.3192,
"step": 4640
},
{
"epoch": 0.9077598828696926,
"grad_norm": 1.859410047531128,
"learning_rate": 6.64904067321284e-06,
"loss": 0.2961,
"step": 4650
},
{
"epoch": 0.9097120546608102,
"grad_norm": 1.799694299697876,
"learning_rate": 6.632949286688053e-06,
"loss": 0.2969,
"step": 4660
},
{
"epoch": 0.9116642264519278,
"grad_norm": 1.6212577819824219,
"learning_rate": 6.6168389455041405e-06,
"loss": 0.3118,
"step": 4670
},
{
"epoch": 0.9136163982430454,
"grad_norm": 1.8194401264190674,
"learning_rate": 6.600709836663861e-06,
"loss": 0.3017,
"step": 4680
},
{
"epoch": 0.915568570034163,
"grad_norm": 1.459394097328186,
"learning_rate": 6.58456214738783e-06,
"loss": 0.3209,
"step": 4690
},
{
"epoch": 0.9175207418252807,
"grad_norm": 1.4963152408599854,
"learning_rate": 6.5683960651123234e-06,
"loss": 0.3062,
"step": 4700
},
{
"epoch": 0.9194729136163983,
"grad_norm": 1.6015785932540894,
"learning_rate": 6.55221177748713e-06,
"loss": 0.3105,
"step": 4710
},
{
"epoch": 0.9214250854075159,
"grad_norm": 1.656490445137024,
"learning_rate": 6.536009472373351e-06,
"loss": 0.3111,
"step": 4720
},
{
"epoch": 0.9233772571986335,
"grad_norm": 1.6450612545013428,
"learning_rate": 6.5197893378412295e-06,
"loss": 0.3049,
"step": 4730
},
{
"epoch": 0.9253294289897511,
"grad_norm": 1.5392154455184937,
"learning_rate": 6.503551562167969e-06,
"loss": 0.291,
"step": 4740
},
{
"epoch": 0.9272816007808687,
"grad_norm": 1.703579306602478,
"learning_rate": 6.4872963338355386e-06,
"loss": 0.293,
"step": 4750
},
{
"epoch": 0.9292337725719864,
"grad_norm": 1.695459246635437,
"learning_rate": 6.4710238415284985e-06,
"loss": 0.3142,
"step": 4760
},
{
"epoch": 0.931185944363104,
"grad_norm": 1.4692509174346924,
"learning_rate": 6.454734274131796e-06,
"loss": 0.2857,
"step": 4770
},
{
"epoch": 0.9331381161542216,
"grad_norm": 1.5995670557022095,
"learning_rate": 6.438427820728584e-06,
"loss": 0.3114,
"step": 4780
},
{
"epoch": 0.9350902879453392,
"grad_norm": 1.643405795097351,
"learning_rate": 6.422104670598021e-06,
"loss": 0.3017,
"step": 4790
},
{
"epoch": 0.9370424597364568,
"grad_norm": 1.4569945335388184,
"learning_rate": 6.405765013213073e-06,
"loss": 0.3,
"step": 4800
},
{
"epoch": 0.9389946315275745,
"grad_norm": 1.4433624744415283,
"learning_rate": 6.389409038238317e-06,
"loss": 0.303,
"step": 4810
},
{
"epoch": 0.9409468033186921,
"grad_norm": 1.7255630493164062,
"learning_rate": 6.37303693552774e-06,
"loss": 0.2974,
"step": 4820
},
{
"epoch": 0.9428989751098097,
"grad_norm": 1.4573408365249634,
"learning_rate": 6.35664889512253e-06,
"loss": 0.3217,
"step": 4830
},
{
"epoch": 0.9448511469009273,
"grad_norm": 2.016216278076172,
"learning_rate": 6.340245107248879e-06,
"loss": 0.2707,
"step": 4840
},
{
"epoch": 0.9468033186920449,
"grad_norm": 1.9193816184997559,
"learning_rate": 6.323825762315765e-06,
"loss": 0.2976,
"step": 4850
},
{
"epoch": 0.9487554904831625,
"grad_norm": 1.5443313121795654,
"learning_rate": 6.307391050912748e-06,
"loss": 0.3106,
"step": 4860
},
{
"epoch": 0.9507076622742802,
"grad_norm": 1.598844289779663,
"learning_rate": 6.290941163807756e-06,
"loss": 0.3063,
"step": 4870
},
{
"epoch": 0.9526598340653978,
"grad_norm": 1.764891505241394,
"learning_rate": 6.27447629194487e-06,
"loss": 0.3202,
"step": 4880
},
{
"epoch": 0.9546120058565154,
"grad_norm": 1.6712101697921753,
"learning_rate": 6.257996626442113e-06,
"loss": 0.3073,
"step": 4890
},
{
"epoch": 0.956564177647633,
"grad_norm": 1.5691962242126465,
"learning_rate": 6.241502358589222e-06,
"loss": 0.2932,
"step": 4900
},
{
"epoch": 0.9585163494387506,
"grad_norm": 1.6402193307876587,
"learning_rate": 6.224993679845434e-06,
"loss": 0.2886,
"step": 4910
},
{
"epoch": 0.9604685212298683,
"grad_norm": 1.7932533025741577,
"learning_rate": 6.2084707818372604e-06,
"loss": 0.3047,
"step": 4920
},
{
"epoch": 0.9624206930209859,
"grad_norm": 1.9926279783248901,
"learning_rate": 6.1919338563562705e-06,
"loss": 0.3092,
"step": 4930
},
{
"epoch": 0.9643728648121035,
"grad_norm": 1.5863287448883057,
"learning_rate": 6.175383095356852e-06,
"loss": 0.3038,
"step": 4940
},
{
"epoch": 0.9663250366032211,
"grad_norm": 1.7748479843139648,
"learning_rate": 6.158818690953991e-06,
"loss": 0.3006,
"step": 4950
},
{
"epoch": 0.9682772083943387,
"grad_norm": 1.546392560005188,
"learning_rate": 6.142240835421049e-06,
"loss": 0.3345,
"step": 4960
},
{
"epoch": 0.9702293801854563,
"grad_norm": 1.392622470855713,
"learning_rate": 6.125649721187514e-06,
"loss": 0.3015,
"step": 4970
},
{
"epoch": 0.972181551976574,
"grad_norm": 1.4012292623519897,
"learning_rate": 6.109045540836779e-06,
"loss": 0.2938,
"step": 4980
},
{
"epoch": 0.9741337237676916,
"grad_norm": 1.656556248664856,
"learning_rate": 6.0924284871039055e-06,
"loss": 0.2907,
"step": 4990
},
{
"epoch": 0.9760858955588092,
"grad_norm": 1.6998143196105957,
"learning_rate": 6.075798752873381e-06,
"loss": 0.2955,
"step": 5000
},
{
"epoch": 0.9780380673499268,
"grad_norm": 1.7401262521743774,
"learning_rate": 6.059156531176887e-06,
"loss": 0.2864,
"step": 5010
},
{
"epoch": 0.9799902391410444,
"grad_norm": 1.6069300174713135,
"learning_rate": 6.042502015191052e-06,
"loss": 0.2836,
"step": 5020
},
{
"epoch": 0.9819424109321621,
"grad_norm": 2.119791269302368,
"learning_rate": 6.025835398235213e-06,
"loss": 0.3022,
"step": 5030
},
{
"epoch": 0.9838945827232797,
"grad_norm": 1.3919116258621216,
"learning_rate": 6.009156873769172e-06,
"loss": 0.3041,
"step": 5040
},
{
"epoch": 0.9858467545143973,
"grad_norm": 1.7489839792251587,
"learning_rate": 5.992466635390945e-06,
"loss": 0.2841,
"step": 5050
},
{
"epoch": 0.9877989263055149,
"grad_norm": 1.5278687477111816,
"learning_rate": 5.975764876834522e-06,
"loss": 0.2791,
"step": 5060
},
{
"epoch": 0.9897510980966325,
"grad_norm": 1.8421188592910767,
"learning_rate": 5.959051791967612e-06,
"loss": 0.2885,
"step": 5070
},
{
"epoch": 0.9917032698877501,
"grad_norm": 1.8238730430603027,
"learning_rate": 5.942327574789402e-06,
"loss": 0.3192,
"step": 5080
},
{
"epoch": 0.9936554416788678,
"grad_norm": 1.5909425020217896,
"learning_rate": 5.925592419428291e-06,
"loss": 0.2717,
"step": 5090
},
{
"epoch": 0.9956076134699854,
"grad_norm": 1.8577344417572021,
"learning_rate": 5.908846520139646e-06,
"loss": 0.3013,
"step": 5100
},
{
"epoch": 0.997559785261103,
"grad_norm": 1.6016184091567993,
"learning_rate": 5.892090071303551e-06,
"loss": 0.3048,
"step": 5110
},
{
"epoch": 0.9995119570522206,
"grad_norm": 1.6936761140823364,
"learning_rate": 5.875323267422538e-06,
"loss": 0.2856,
"step": 5120
},
{
"epoch": 1.0013665202537823,
"grad_norm": 1.4396748542785645,
"learning_rate": 5.858546303119341e-06,
"loss": 0.2494,
"step": 5130
},
{
"epoch": 1.0033186920449,
"grad_norm": 1.4734578132629395,
"learning_rate": 5.841759373134629e-06,
"loss": 0.2256,
"step": 5140
},
{
"epoch": 1.0052708638360175,
"grad_norm": 1.3114964962005615,
"learning_rate": 5.8249626723247535e-06,
"loss": 0.2184,
"step": 5150
},
{
"epoch": 1.0072230356271352,
"grad_norm": 1.84878671169281,
"learning_rate": 5.808156395659475e-06,
"loss": 0.2266,
"step": 5160
},
{
"epoch": 1.0091752074182527,
"grad_norm": 1.5423600673675537,
"learning_rate": 5.791340738219715e-06,
"loss": 0.2255,
"step": 5170
},
{
"epoch": 1.0111273792093705,
"grad_norm": 1.4672765731811523,
"learning_rate": 5.774515895195279e-06,
"loss": 0.238,
"step": 5180
},
{
"epoch": 1.013079551000488,
"grad_norm": 1.5899931192398071,
"learning_rate": 5.757682061882596e-06,
"loss": 0.2272,
"step": 5190
},
{
"epoch": 1.0150317227916057,
"grad_norm": 1.8665298223495483,
"learning_rate": 5.740839433682449e-06,
"loss": 0.2402,
"step": 5200
},
{
"epoch": 1.0169838945827232,
"grad_norm": 1.8546682596206665,
"learning_rate": 5.723988206097712e-06,
"loss": 0.2261,
"step": 5210
},
{
"epoch": 1.018936066373841,
"grad_norm": 1.6443926095962524,
"learning_rate": 5.7071285747310755e-06,
"loss": 0.2357,
"step": 5220
},
{
"epoch": 1.0208882381649584,
"grad_norm": 1.82640540599823,
"learning_rate": 5.69026073528278e-06,
"loss": 0.2224,
"step": 5230
},
{
"epoch": 1.0228404099560762,
"grad_norm": 1.2243527173995972,
"learning_rate": 5.673384883548339e-06,
"loss": 0.2236,
"step": 5240
},
{
"epoch": 1.0247925817471937,
"grad_norm": 1.621323823928833,
"learning_rate": 5.656501215416272e-06,
"loss": 0.2343,
"step": 5250
},
{
"epoch": 1.0267447535383114,
"grad_norm": 1.4708138704299927,
"learning_rate": 5.639609926865825e-06,
"loss": 0.2246,
"step": 5260
},
{
"epoch": 1.028696925329429,
"grad_norm": 1.7992093563079834,
"learning_rate": 5.6227112139647065e-06,
"loss": 0.2215,
"step": 5270
},
{
"epoch": 1.0306490971205466,
"grad_norm": 1.545155644416809,
"learning_rate": 5.605805272866797e-06,
"loss": 0.2244,
"step": 5280
},
{
"epoch": 1.0326012689116641,
"grad_norm": 1.6702899932861328,
"learning_rate": 5.58889229980988e-06,
"loss": 0.2302,
"step": 5290
},
{
"epoch": 1.0345534407027819,
"grad_norm": 1.704221487045288,
"learning_rate": 5.571972491113364e-06,
"loss": 0.2337,
"step": 5300
},
{
"epoch": 1.0365056124938994,
"grad_norm": 1.6749101877212524,
"learning_rate": 5.555046043176008e-06,
"loss": 0.2181,
"step": 5310
},
{
"epoch": 1.0384577842850171,
"grad_norm": 1.77487313747406,
"learning_rate": 5.538113152473628e-06,
"loss": 0.2168,
"step": 5320
},
{
"epoch": 1.0404099560761346,
"grad_norm": 1.8465723991394043,
"learning_rate": 5.521174015556832e-06,
"loss": 0.2228,
"step": 5330
},
{
"epoch": 1.0423621278672524,
"grad_norm": 1.8834266662597656,
"learning_rate": 5.504228829048728e-06,
"loss": 0.2024,
"step": 5340
},
{
"epoch": 1.0443142996583699,
"grad_norm": 1.3431150913238525,
"learning_rate": 5.487277789642648e-06,
"loss": 0.2436,
"step": 5350
},
{
"epoch": 1.0462664714494876,
"grad_norm": 1.867727518081665,
"learning_rate": 5.470321094099859e-06,
"loss": 0.2316,
"step": 5360
},
{
"epoch": 1.048218643240605,
"grad_norm": 1.5301685333251953,
"learning_rate": 5.453358939247285e-06,
"loss": 0.2188,
"step": 5370
},
{
"epoch": 1.0501708150317228,
"grad_norm": 1.3902242183685303,
"learning_rate": 5.4363915219752214e-06,
"loss": 0.2096,
"step": 5380
},
{
"epoch": 1.0521229868228403,
"grad_norm": 1.7120879888534546,
"learning_rate": 5.419419039235042e-06,
"loss": 0.2264,
"step": 5390
},
{
"epoch": 1.054075158613958,
"grad_norm": 2.1603755950927734,
"learning_rate": 5.4024416880369245e-06,
"loss": 0.2179,
"step": 5400
},
{
"epoch": 1.0560273304050756,
"grad_norm": 1.4981776475906372,
"learning_rate": 5.385459665447554e-06,
"loss": 0.2334,
"step": 5410
},
{
"epoch": 1.0579795021961933,
"grad_norm": 1.601750373840332,
"learning_rate": 5.368473168587838e-06,
"loss": 0.2306,
"step": 5420
},
{
"epoch": 1.0599316739873108,
"grad_norm": 1.6521871089935303,
"learning_rate": 5.351482394630626e-06,
"loss": 0.2512,
"step": 5430
},
{
"epoch": 1.0618838457784285,
"grad_norm": 1.3969298601150513,
"learning_rate": 5.334487540798408e-06,
"loss": 0.2308,
"step": 5440
},
{
"epoch": 1.063836017569546,
"grad_norm": 1.5212770700454712,
"learning_rate": 5.317488804361035e-06,
"loss": 0.2054,
"step": 5450
},
{
"epoch": 1.0657881893606638,
"grad_norm": 1.7573391199111938,
"learning_rate": 5.300486382633428e-06,
"loss": 0.2058,
"step": 5460
},
{
"epoch": 1.0677403611517813,
"grad_norm": 1.4401905536651611,
"learning_rate": 5.283480472973278e-06,
"loss": 0.2237,
"step": 5470
},
{
"epoch": 1.069692532942899,
"grad_norm": 1.5984746217727661,
"learning_rate": 5.26647127277877e-06,
"loss": 0.2115,
"step": 5480
},
{
"epoch": 1.0716447047340165,
"grad_norm": 1.6980221271514893,
"learning_rate": 5.249458979486281e-06,
"loss": 0.2152,
"step": 5490
},
{
"epoch": 1.0735968765251342,
"grad_norm": 1.834252953529358,
"learning_rate": 5.232443790568091e-06,
"loss": 0.253,
"step": 5500
},
{
"epoch": 1.0755490483162518,
"grad_norm": 1.8841309547424316,
"learning_rate": 5.215425903530093e-06,
"loss": 0.238,
"step": 5510
},
{
"epoch": 1.0775012201073695,
"grad_norm": 1.3869953155517578,
"learning_rate": 5.198405515909497e-06,
"loss": 0.2119,
"step": 5520
},
{
"epoch": 1.079453391898487,
"grad_norm": 1.437673807144165,
"learning_rate": 5.181382825272543e-06,
"loss": 0.2126,
"step": 5530
},
{
"epoch": 1.0814055636896047,
"grad_norm": 1.5257771015167236,
"learning_rate": 5.1643580292121955e-06,
"loss": 0.2389,
"step": 5540
},
{
"epoch": 1.0833577354807222,
"grad_norm": 1.3782355785369873,
"learning_rate": 5.1473313253458654e-06,
"loss": 0.236,
"step": 5550
},
{
"epoch": 1.08530990727184,
"grad_norm": 1.6246196031570435,
"learning_rate": 5.130302911313109e-06,
"loss": 0.2055,
"step": 5560
},
{
"epoch": 1.0872620790629575,
"grad_norm": 1.501262903213501,
"learning_rate": 5.113272984773325e-06,
"loss": 0.2181,
"step": 5570
},
{
"epoch": 1.0892142508540752,
"grad_norm": 1.7623234987258911,
"learning_rate": 5.09624174340348e-06,
"loss": 0.2255,
"step": 5580
},
{
"epoch": 1.0911664226451927,
"grad_norm": 1.4687215089797974,
"learning_rate": 5.079209384895791e-06,
"loss": 0.2182,
"step": 5590
},
{
"epoch": 1.0931185944363104,
"grad_norm": 1.4029916524887085,
"learning_rate": 5.062176106955456e-06,
"loss": 0.2195,
"step": 5600
},
{
"epoch": 1.095070766227428,
"grad_norm": 1.3234246969223022,
"learning_rate": 5.04514210729833e-06,
"loss": 0.2282,
"step": 5610
},
{
"epoch": 1.0970229380185457,
"grad_norm": 1.8046473264694214,
"learning_rate": 5.028107583648659e-06,
"loss": 0.2275,
"step": 5620
},
{
"epoch": 1.0989751098096632,
"grad_norm": 1.8408907651901245,
"learning_rate": 5.011072733736764e-06,
"loss": 0.2095,
"step": 5630
},
{
"epoch": 1.100927281600781,
"grad_norm": 1.4698619842529297,
"learning_rate": 4.994037755296751e-06,
"loss": 0.2454,
"step": 5640
},
{
"epoch": 1.1028794533918984,
"grad_norm": 1.5274852514266968,
"learning_rate": 4.9770028460642274e-06,
"loss": 0.2353,
"step": 5650
},
{
"epoch": 1.1048316251830161,
"grad_norm": 1.8901482820510864,
"learning_rate": 4.959968203773987e-06,
"loss": 0.2271,
"step": 5660
},
{
"epoch": 1.1067837969741336,
"grad_norm": 1.4531559944152832,
"learning_rate": 4.942934026157734e-06,
"loss": 0.2076,
"step": 5670
},
{
"epoch": 1.1087359687652514,
"grad_norm": 1.6990450620651245,
"learning_rate": 4.925900510941769e-06,
"loss": 0.2184,
"step": 5680
},
{
"epoch": 1.1106881405563689,
"grad_norm": 1.2921063899993896,
"learning_rate": 4.908867855844709e-06,
"loss": 0.235,
"step": 5690
},
{
"epoch": 1.1126403123474866,
"grad_norm": 1.931989073753357,
"learning_rate": 4.891836258575195e-06,
"loss": 0.2289,
"step": 5700
},
{
"epoch": 1.1145924841386041,
"grad_norm": 1.7392537593841553,
"learning_rate": 4.874805916829573e-06,
"loss": 0.231,
"step": 5710
},
{
"epoch": 1.1165446559297219,
"grad_norm": 1.6657911539077759,
"learning_rate": 4.857777028289627e-06,
"loss": 0.219,
"step": 5720
},
{
"epoch": 1.1184968277208394,
"grad_norm": 1.6863371133804321,
"learning_rate": 4.840749790620268e-06,
"loss": 0.2076,
"step": 5730
},
{
"epoch": 1.120448999511957,
"grad_norm": 1.742223858833313,
"learning_rate": 4.823724401467248e-06,
"loss": 0.2114,
"step": 5740
},
{
"epoch": 1.1224011713030746,
"grad_norm": 1.8562861680984497,
"learning_rate": 4.806701058454856e-06,
"loss": 0.2187,
"step": 5750
},
{
"epoch": 1.1243533430941923,
"grad_norm": 1.945334553718567,
"learning_rate": 4.789679959183638e-06,
"loss": 0.2399,
"step": 5760
},
{
"epoch": 1.1263055148853098,
"grad_norm": 1.8333011865615845,
"learning_rate": 4.772661301228088e-06,
"loss": 0.2335,
"step": 5770
},
{
"epoch": 1.1282576866764276,
"grad_norm": 1.5760397911071777,
"learning_rate": 4.755645282134368e-06,
"loss": 0.243,
"step": 5780
},
{
"epoch": 1.130209858467545,
"grad_norm": 1.642067790031433,
"learning_rate": 4.738632099418004e-06,
"loss": 0.2297,
"step": 5790
},
{
"epoch": 1.1321620302586628,
"grad_norm": 1.6263489723205566,
"learning_rate": 4.721621950561604e-06,
"loss": 0.2145,
"step": 5800
},
{
"epoch": 1.1341142020497803,
"grad_norm": 1.5291143655776978,
"learning_rate": 4.704615033012556e-06,
"loss": 0.2263,
"step": 5810
},
{
"epoch": 1.136066373840898,
"grad_norm": 1.674197793006897,
"learning_rate": 4.687611544180741e-06,
"loss": 0.231,
"step": 5820
},
{
"epoch": 1.1380185456320155,
"grad_norm": 1.3444764614105225,
"learning_rate": 4.670611681436242e-06,
"loss": 0.2265,
"step": 5830
},
{
"epoch": 1.1399707174231333,
"grad_norm": 1.430079698562622,
"learning_rate": 4.6536156421070484e-06,
"loss": 0.1948,
"step": 5840
},
{
"epoch": 1.1419228892142508,
"grad_norm": 1.8119804859161377,
"learning_rate": 4.636623623476775e-06,
"loss": 0.2292,
"step": 5850
},
{
"epoch": 1.1438750610053685,
"grad_norm": 2.2670273780822754,
"learning_rate": 4.619635822782357e-06,
"loss": 0.2045,
"step": 5860
},
{
"epoch": 1.145827232796486,
"grad_norm": 1.7421070337295532,
"learning_rate": 4.602652437211781e-06,
"loss": 0.2306,
"step": 5870
},
{
"epoch": 1.1477794045876037,
"grad_norm": 1.8024625778198242,
"learning_rate": 4.585673663901773e-06,
"loss": 0.2136,
"step": 5880
},
{
"epoch": 1.1497315763787213,
"grad_norm": 1.860446572303772,
"learning_rate": 4.5686996999355266e-06,
"loss": 0.227,
"step": 5890
},
{
"epoch": 1.151683748169839,
"grad_norm": 1.634185552597046,
"learning_rate": 4.551730742340416e-06,
"loss": 0.227,
"step": 5900
},
{
"epoch": 1.1536359199609565,
"grad_norm": 1.6622986793518066,
"learning_rate": 4.5347669880856895e-06,
"loss": 0.2058,
"step": 5910
},
{
"epoch": 1.1555880917520742,
"grad_norm": 1.7402809858322144,
"learning_rate": 4.517808634080213e-06,
"loss": 0.2249,
"step": 5920
},
{
"epoch": 1.1575402635431917,
"grad_norm": 1.7279547452926636,
"learning_rate": 4.500855877170155e-06,
"loss": 0.2289,
"step": 5930
},
{
"epoch": 1.1594924353343095,
"grad_norm": 1.647835612297058,
"learning_rate": 4.483908914136723e-06,
"loss": 0.2232,
"step": 5940
},
{
"epoch": 1.161444607125427,
"grad_norm": 1.6166253089904785,
"learning_rate": 4.4669679416938685e-06,
"loss": 0.2209,
"step": 5950
},
{
"epoch": 1.1633967789165447,
"grad_norm": 1.8101551532745361,
"learning_rate": 4.450033156486007e-06,
"loss": 0.2251,
"step": 5960
},
{
"epoch": 1.1653489507076622,
"grad_norm": 1.6204791069030762,
"learning_rate": 4.4331047550857345e-06,
"loss": 0.1969,
"step": 5970
},
{
"epoch": 1.16730112249878,
"grad_norm": 1.601891040802002,
"learning_rate": 4.416182933991548e-06,
"loss": 0.2364,
"step": 5980
},
{
"epoch": 1.1692532942898974,
"grad_norm": 1.7196043729782104,
"learning_rate": 4.3992678896255595e-06,
"loss": 0.2312,
"step": 5990
},
{
"epoch": 1.1712054660810152,
"grad_norm": 1.5971014499664307,
"learning_rate": 4.382359818331221e-06,
"loss": 0.2291,
"step": 6000
},
{
"epoch": 1.1731576378721327,
"grad_norm": 1.803208351135254,
"learning_rate": 4.365458916371046e-06,
"loss": 0.2133,
"step": 6010
},
{
"epoch": 1.1751098096632504,
"grad_norm": 2.963254690170288,
"learning_rate": 4.348565379924324e-06,
"loss": 0.2279,
"step": 6020
},
{
"epoch": 1.177061981454368,
"grad_norm": 1.800583004951477,
"learning_rate": 4.331679405084853e-06,
"loss": 0.2223,
"step": 6030
},
{
"epoch": 1.1790141532454856,
"grad_norm": 1.5035797357559204,
"learning_rate": 4.3148011878586576e-06,
"loss": 0.2112,
"step": 6040
},
{
"epoch": 1.1809663250366031,
"grad_norm": 1.5907450914382935,
"learning_rate": 4.297930924161714e-06,
"loss": 0.2127,
"step": 6050
},
{
"epoch": 1.1829184968277209,
"grad_norm": 1.705296277999878,
"learning_rate": 4.281068809817675e-06,
"loss": 0.2242,
"step": 6060
},
{
"epoch": 1.1848706686188384,
"grad_norm": 2.0136845111846924,
"learning_rate": 4.264215040555605e-06,
"loss": 0.2279,
"step": 6070
},
{
"epoch": 1.1868228404099561,
"grad_norm": 1.5842479467391968,
"learning_rate": 4.247369812007692e-06,
"loss": 0.2281,
"step": 6080
},
{
"epoch": 1.1887750122010736,
"grad_norm": 1.4241150617599487,
"learning_rate": 4.230533319706998e-06,
"loss": 0.2223,
"step": 6090
},
{
"epoch": 1.1907271839921914,
"grad_norm": 1.6764804124832153,
"learning_rate": 4.213705759085172e-06,
"loss": 0.2191,
"step": 6100
},
{
"epoch": 1.1926793557833089,
"grad_norm": 1.4387422800064087,
"learning_rate": 4.196887325470183e-06,
"loss": 0.2173,
"step": 6110
},
{
"epoch": 1.1946315275744266,
"grad_norm": 1.4490535259246826,
"learning_rate": 4.180078214084068e-06,
"loss": 0.2182,
"step": 6120
},
{
"epoch": 1.196583699365544,
"grad_norm": 1.8612465858459473,
"learning_rate": 4.1632786200406436e-06,
"loss": 0.2179,
"step": 6130
},
{
"epoch": 1.1985358711566618,
"grad_norm": 1.6195158958435059,
"learning_rate": 4.146488738343263e-06,
"loss": 0.2113,
"step": 6140
},
{
"epoch": 1.2004880429477793,
"grad_norm": 1.405368685722351,
"learning_rate": 4.129708763882533e-06,
"loss": 0.2209,
"step": 6150
},
{
"epoch": 1.202440214738897,
"grad_norm": 1.5172168016433716,
"learning_rate": 4.112938891434069e-06,
"loss": 0.2136,
"step": 6160
},
{
"epoch": 1.2043923865300146,
"grad_norm": 1.6718852519989014,
"learning_rate": 4.096179315656219e-06,
"loss": 0.2088,
"step": 6170
},
{
"epoch": 1.2063445583211323,
"grad_norm": 1.8428574800491333,
"learning_rate": 4.079430231087815e-06,
"loss": 0.2199,
"step": 6180
},
{
"epoch": 1.2082967301122498,
"grad_norm": 1.4721224308013916,
"learning_rate": 4.062691832145913e-06,
"loss": 0.2124,
"step": 6190
},
{
"epoch": 1.2102489019033675,
"grad_norm": 1.5123246908187866,
"learning_rate": 4.045964313123528e-06,
"loss": 0.2063,
"step": 6200
},
{
"epoch": 1.212201073694485,
"grad_norm": 1.5970547199249268,
"learning_rate": 4.029247868187392e-06,
"loss": 0.2037,
"step": 6210
},
{
"epoch": 1.2141532454856028,
"grad_norm": 1.618791103363037,
"learning_rate": 4.012542691375688e-06,
"loss": 0.2214,
"step": 6220
},
{
"epoch": 1.2161054172767203,
"grad_norm": 1.6315085887908936,
"learning_rate": 3.995848976595806e-06,
"loss": 0.2201,
"step": 6230
},
{
"epoch": 1.218057589067838,
"grad_norm": 1.8346765041351318,
"learning_rate": 3.979166917622086e-06,
"loss": 0.2167,
"step": 6240
},
{
"epoch": 1.2200097608589555,
"grad_norm": 1.7975165843963623,
"learning_rate": 3.962496708093575e-06,
"loss": 0.2021,
"step": 6250
},
{
"epoch": 1.2219619326500732,
"grad_norm": 11.739858627319336,
"learning_rate": 3.945838541511773e-06,
"loss": 0.2218,
"step": 6260
},
{
"epoch": 1.2239141044411908,
"grad_norm": 1.684730887413025,
"learning_rate": 3.929192611238395e-06,
"loss": 0.2336,
"step": 6270
},
{
"epoch": 1.2258662762323085,
"grad_norm": 1.6722686290740967,
"learning_rate": 3.912559110493115e-06,
"loss": 0.1999,
"step": 6280
},
{
"epoch": 1.227818448023426,
"grad_norm": 2.155362844467163,
"learning_rate": 3.895938232351333e-06,
"loss": 0.2237,
"step": 6290
},
{
"epoch": 1.2297706198145437,
"grad_norm": 1.7131150960922241,
"learning_rate": 3.879330169741934e-06,
"loss": 0.2108,
"step": 6300
},
{
"epoch": 1.2317227916056612,
"grad_norm": 1.8110287189483643,
"learning_rate": 3.862735115445039e-06,
"loss": 0.2111,
"step": 6310
},
{
"epoch": 1.233674963396779,
"grad_norm": 1.29988694190979,
"learning_rate": 3.846153262089777e-06,
"loss": 0.2032,
"step": 6320
},
{
"epoch": 1.2356271351878965,
"grad_norm": 1.6154941320419312,
"learning_rate": 3.829584802152042e-06,
"loss": 0.2116,
"step": 6330
},
{
"epoch": 1.2375793069790142,
"grad_norm": 1.7335631847381592,
"learning_rate": 3.8130299279522696e-06,
"loss": 0.2237,
"step": 6340
},
{
"epoch": 1.2395314787701317,
"grad_norm": 1.5450068712234497,
"learning_rate": 3.796488831653187e-06,
"loss": 0.2048,
"step": 6350
},
{
"epoch": 1.2414836505612494,
"grad_norm": 1.7847269773483276,
"learning_rate": 3.779961705257605e-06,
"loss": 0.2205,
"step": 6360
},
{
"epoch": 1.243435822352367,
"grad_norm": 1.9442840814590454,
"learning_rate": 3.763448740606164e-06,
"loss": 0.2263,
"step": 6370
},
{
"epoch": 1.2453879941434847,
"grad_norm": 1.4432337284088135,
"learning_rate": 3.7469501293751277e-06,
"loss": 0.2264,
"step": 6380
},
{
"epoch": 1.2473401659346022,
"grad_norm": 1.4733388423919678,
"learning_rate": 3.730466063074154e-06,
"loss": 0.2218,
"step": 6390
},
{
"epoch": 1.24929233772572,
"grad_norm": 1.5178956985473633,
"learning_rate": 3.713996733044059e-06,
"loss": 0.2049,
"step": 6400
},
{
"epoch": 1.2512445095168374,
"grad_norm": 1.6209615468978882,
"learning_rate": 3.6975423304546142e-06,
"loss": 0.2179,
"step": 6410
},
{
"epoch": 1.2531966813079551,
"grad_norm": 1.591838002204895,
"learning_rate": 3.6811030463023133e-06,
"loss": 0.2227,
"step": 6420
},
{
"epoch": 1.2551488530990726,
"grad_norm": 1.7100849151611328,
"learning_rate": 3.664679071408166e-06,
"loss": 0.23,
"step": 6430
},
{
"epoch": 1.2571010248901904,
"grad_norm": 1.591233491897583,
"learning_rate": 3.648270596415473e-06,
"loss": 0.2248,
"step": 6440
},
{
"epoch": 1.2590531966813079,
"grad_norm": 1.4432621002197266,
"learning_rate": 3.6318778117876225e-06,
"loss": 0.2202,
"step": 6450
},
{
"epoch": 1.2610053684724256,
"grad_norm": 1.7753307819366455,
"learning_rate": 3.61550090780587e-06,
"loss": 0.2052,
"step": 6460
},
{
"epoch": 1.2629575402635431,
"grad_norm": 1.6676727533340454,
"learning_rate": 3.5991400745671384e-06,
"loss": 0.2075,
"step": 6470
},
{
"epoch": 1.2649097120546609,
"grad_norm": 1.4981262683868408,
"learning_rate": 3.5827955019818072e-06,
"loss": 0.2182,
"step": 6480
},
{
"epoch": 1.2668618838457784,
"grad_norm": 1.5867663621902466,
"learning_rate": 3.5664673797715056e-06,
"loss": 0.2183,
"step": 6490
},
{
"epoch": 1.268814055636896,
"grad_norm": 1.8115100860595703,
"learning_rate": 3.550155897466917e-06,
"loss": 0.2133,
"step": 6500
},
{
"epoch": 1.2707662274280136,
"grad_norm": 1.7083834409713745,
"learning_rate": 3.5338612444055697e-06,
"loss": 0.2025,
"step": 6510
},
{
"epoch": 1.2727183992191313,
"grad_norm": 1.6217999458312988,
"learning_rate": 3.5175836097296504e-06,
"loss": 0.2121,
"step": 6520
},
{
"epoch": 1.2746705710102488,
"grad_norm": 1.762628197669983,
"learning_rate": 3.5013231823837985e-06,
"loss": 0.2122,
"step": 6530
},
{
"epoch": 1.2766227428013666,
"grad_norm": 1.7376768589019775,
"learning_rate": 3.4850801511129205e-06,
"loss": 0.2168,
"step": 6540
},
{
"epoch": 1.278574914592484,
"grad_norm": 2.347261667251587,
"learning_rate": 3.468854704459991e-06,
"loss": 0.2058,
"step": 6550
},
{
"epoch": 1.2805270863836018,
"grad_norm": 1.9372010231018066,
"learning_rate": 3.452647030763876e-06,
"loss": 0.2009,
"step": 6560
},
{
"epoch": 1.2824792581747193,
"grad_norm": 1.5644526481628418,
"learning_rate": 3.436457318157131e-06,
"loss": 0.2066,
"step": 6570
},
{
"epoch": 1.284431429965837,
"grad_norm": 2.102677345275879,
"learning_rate": 3.4202857545638346e-06,
"loss": 0.2302,
"step": 6580
},
{
"epoch": 1.2863836017569545,
"grad_norm": 1.641678810119629,
"learning_rate": 3.4041325276973945e-06,
"loss": 0.193,
"step": 6590
},
{
"epoch": 1.2883357735480723,
"grad_norm": 1.6585208177566528,
"learning_rate": 3.38799782505837e-06,
"loss": 0.223,
"step": 6600
},
{
"epoch": 1.2902879453391898,
"grad_norm": 1.4569475650787354,
"learning_rate": 3.3718818339323058e-06,
"loss": 0.2145,
"step": 6610
},
{
"epoch": 1.2922401171303075,
"grad_norm": 1.6879363059997559,
"learning_rate": 3.355784741387539e-06,
"loss": 0.2171,
"step": 6620
},
{
"epoch": 1.294192288921425,
"grad_norm": 1.4503649473190308,
"learning_rate": 3.3397067342730504e-06,
"loss": 0.2055,
"step": 6630
},
{
"epoch": 1.2961444607125427,
"grad_norm": 1.8341500759124756,
"learning_rate": 3.323647999216278e-06,
"loss": 0.2065,
"step": 6640
},
{
"epoch": 1.2980966325036603,
"grad_norm": 1.490210771560669,
"learning_rate": 3.307608722620959e-06,
"loss": 0.1857,
"step": 6650
},
{
"epoch": 1.300048804294778,
"grad_norm": 1.4428335428237915,
"learning_rate": 3.2915890906649628e-06,
"loss": 0.2097,
"step": 6660
},
{
"epoch": 1.3020009760858955,
"grad_norm": 1.6432727575302124,
"learning_rate": 3.2755892892981323e-06,
"loss": 0.205,
"step": 6670
},
{
"epoch": 1.3039531478770132,
"grad_norm": 2.000216484069824,
"learning_rate": 3.2596095042401256e-06,
"loss": 0.2398,
"step": 6680
},
{
"epoch": 1.3059053196681307,
"grad_norm": 1.362533450126648,
"learning_rate": 3.2436499209782557e-06,
"loss": 0.2083,
"step": 6690
},
{
"epoch": 1.3078574914592485,
"grad_norm": 2.0070676803588867,
"learning_rate": 3.227710724765345e-06,
"loss": 0.2052,
"step": 6700
},
{
"epoch": 1.309809663250366,
"grad_norm": 1.47205650806427,
"learning_rate": 3.211792100617566e-06,
"loss": 0.2149,
"step": 6710
},
{
"epoch": 1.3117618350414837,
"grad_norm": 1.3983055353164673,
"learning_rate": 3.1958942333123035e-06,
"loss": 0.2337,
"step": 6720
},
{
"epoch": 1.3137140068326012,
"grad_norm": 1.6914767026901245,
"learning_rate": 3.1800173073859995e-06,
"loss": 0.2068,
"step": 6730
},
{
"epoch": 1.315666178623719,
"grad_norm": 1.351125717163086,
"learning_rate": 3.164161507132021e-06,
"loss": 0.1956,
"step": 6740
},
{
"epoch": 1.3176183504148364,
"grad_norm": 1.5504491329193115,
"learning_rate": 3.1483270165985124e-06,
"loss": 0.2372,
"step": 6750
},
{
"epoch": 1.3195705222059542,
"grad_norm": 1.9348682165145874,
"learning_rate": 3.1325140195862664e-06,
"loss": 0.2137,
"step": 6760
},
{
"epoch": 1.3215226939970717,
"grad_norm": 1.408018708229065,
"learning_rate": 3.1167226996465847e-06,
"loss": 0.1905,
"step": 6770
},
{
"epoch": 1.3234748657881894,
"grad_norm": 2.0732595920562744,
"learning_rate": 3.10095324007915e-06,
"loss": 0.2192,
"step": 6780
},
{
"epoch": 1.325427037579307,
"grad_norm": 1.3288345336914062,
"learning_rate": 3.085205823929899e-06,
"loss": 0.2122,
"step": 6790
},
{
"epoch": 1.3273792093704246,
"grad_norm": 1.5556180477142334,
"learning_rate": 3.069480633988895e-06,
"loss": 0.2007,
"step": 6800
},
{
"epoch": 1.3293313811615421,
"grad_norm": 1.4359447956085205,
"learning_rate": 3.053777852788211e-06,
"loss": 0.204,
"step": 6810
},
{
"epoch": 1.3312835529526599,
"grad_norm": 1.696527123451233,
"learning_rate": 3.0380976625998014e-06,
"loss": 0.2093,
"step": 6820
},
{
"epoch": 1.3332357247437774,
"grad_norm": 1.706363320350647,
"learning_rate": 3.022440245433403e-06,
"loss": 0.1934,
"step": 6830
},
{
"epoch": 1.3351878965348951,
"grad_norm": 1.193893551826477,
"learning_rate": 3.0068057830343998e-06,
"loss": 0.1961,
"step": 6840
},
{
"epoch": 1.3371400683260126,
"grad_norm": 1.2589608430862427,
"learning_rate": 2.991194456881737e-06,
"loss": 0.2085,
"step": 6850
},
{
"epoch": 1.3390922401171304,
"grad_norm": 1.5599446296691895,
"learning_rate": 2.9756064481857937e-06,
"loss": 0.2108,
"step": 6860
},
{
"epoch": 1.3410444119082479,
"grad_norm": 1.5846322774887085,
"learning_rate": 2.9600419378862925e-06,
"loss": 0.1868,
"step": 6870
},
{
"epoch": 1.3429965836993656,
"grad_norm": 1.6705174446105957,
"learning_rate": 2.9445011066502015e-06,
"loss": 0.2093,
"step": 6880
},
{
"epoch": 1.344948755490483,
"grad_norm": 1.7667570114135742,
"learning_rate": 2.928984134869619e-06,
"loss": 0.2148,
"step": 6890
},
{
"epoch": 1.3469009272816008,
"grad_norm": 1.7263866662979126,
"learning_rate": 2.9134912026596995e-06,
"loss": 0.1983,
"step": 6900
},
{
"epoch": 1.3488530990727183,
"grad_norm": 1.8353338241577148,
"learning_rate": 2.8980224898565555e-06,
"loss": 0.2074,
"step": 6910
},
{
"epoch": 1.350805270863836,
"grad_norm": 1.8815158605575562,
"learning_rate": 2.8825781760151693e-06,
"loss": 0.2097,
"step": 6920
},
{
"epoch": 1.3527574426549536,
"grad_norm": 1.6498256921768188,
"learning_rate": 2.8671584404073037e-06,
"loss": 0.1941,
"step": 6930
},
{
"epoch": 1.3547096144460713,
"grad_norm": 1.5063817501068115,
"learning_rate": 2.8517634620194358e-06,
"loss": 0.1977,
"step": 6940
},
{
"epoch": 1.3566617862371888,
"grad_norm": 2.3448615074157715,
"learning_rate": 2.836393419550661e-06,
"loss": 0.1946,
"step": 6950
},
{
"epoch": 1.3586139580283065,
"grad_norm": 1.4969979524612427,
"learning_rate": 2.821048491410632e-06,
"loss": 0.2039,
"step": 6960
},
{
"epoch": 1.360566129819424,
"grad_norm": 1.774389386177063,
"learning_rate": 2.8057288557174905e-06,
"loss": 0.1783,
"step": 6970
},
{
"epoch": 1.3625183016105418,
"grad_norm": 1.4157977104187012,
"learning_rate": 2.790434690295781e-06,
"loss": 0.2256,
"step": 6980
},
{
"epoch": 1.3644704734016593,
"grad_norm": 1.6641570329666138,
"learning_rate": 2.7751661726744083e-06,
"loss": 0.2009,
"step": 6990
},
{
"epoch": 1.366422645192777,
"grad_norm": 1.6484465599060059,
"learning_rate": 2.75992348008456e-06,
"loss": 0.2023,
"step": 7000
},
{
"epoch": 1.3683748169838945,
"grad_norm": 1.6529380083084106,
"learning_rate": 2.74470678945766e-06,
"loss": 0.204,
"step": 7010
},
{
"epoch": 1.3703269887750122,
"grad_norm": 1.4806594848632812,
"learning_rate": 2.729516277423313e-06,
"loss": 0.2004,
"step": 7020
},
{
"epoch": 1.3722791605661298,
"grad_norm": 1.3869143724441528,
"learning_rate": 2.714352120307252e-06,
"loss": 0.2119,
"step": 7030
},
{
"epoch": 1.3742313323572475,
"grad_norm": 1.3637787103652954,
"learning_rate": 2.699214494129286e-06,
"loss": 0.2128,
"step": 7040
},
{
"epoch": 1.376183504148365,
"grad_norm": 1.4802008867263794,
"learning_rate": 2.68410357460127e-06,
"loss": 0.2053,
"step": 7050
},
{
"epoch": 1.3781356759394827,
"grad_norm": 1.6326429843902588,
"learning_rate": 2.669019537125056e-06,
"loss": 0.2159,
"step": 7060
},
{
"epoch": 1.3800878477306002,
"grad_norm": 1.4315476417541504,
"learning_rate": 2.653962556790458e-06,
"loss": 0.2104,
"step": 7070
},
{
"epoch": 1.382040019521718,
"grad_norm": 1.6744425296783447,
"learning_rate": 2.638932808373226e-06,
"loss": 0.2073,
"step": 7080
},
{
"epoch": 1.3839921913128355,
"grad_norm": 1.5166488885879517,
"learning_rate": 2.623930466333002e-06,
"loss": 0.2004,
"step": 7090
},
{
"epoch": 1.3859443631039532,
"grad_norm": 1.4280526638031006,
"learning_rate": 2.608955704811314e-06,
"loss": 0.1921,
"step": 7100
},
{
"epoch": 1.3878965348950707,
"grad_norm": 1.697077751159668,
"learning_rate": 2.594008697629543e-06,
"loss": 0.2176,
"step": 7110
},
{
"epoch": 1.3898487066861884,
"grad_norm": 1.7205731868743896,
"learning_rate": 2.5790896182869106e-06,
"loss": 0.2108,
"step": 7120
},
{
"epoch": 1.391800878477306,
"grad_norm": 1.5982770919799805,
"learning_rate": 2.564198639958456e-06,
"loss": 0.1982,
"step": 7130
},
{
"epoch": 1.3937530502684237,
"grad_norm": 1.3759297132492065,
"learning_rate": 2.5493359354930404e-06,
"loss": 0.1954,
"step": 7140
},
{
"epoch": 1.3957052220595412,
"grad_norm": 1.6432647705078125,
"learning_rate": 2.5345016774113223e-06,
"loss": 0.2006,
"step": 7150
},
{
"epoch": 1.397657393850659,
"grad_norm": 1.5497634410858154,
"learning_rate": 2.5196960379037783e-06,
"loss": 0.1987,
"step": 7160
},
{
"epoch": 1.3996095656417764,
"grad_norm": 1.5066088438034058,
"learning_rate": 2.5049191888286846e-06,
"loss": 0.1854,
"step": 7170
},
{
"epoch": 1.4015617374328941,
"grad_norm": 1.6009175777435303,
"learning_rate": 2.490171301710125e-06,
"loss": 0.2093,
"step": 7180
},
{
"epoch": 1.4035139092240116,
"grad_norm": 1.9166560173034668,
"learning_rate": 2.475452547736013e-06,
"loss": 0.2036,
"step": 7190
},
{
"epoch": 1.4054660810151294,
"grad_norm": 1.4790300130844116,
"learning_rate": 2.460763097756086e-06,
"loss": 0.1901,
"step": 7200
},
{
"epoch": 1.4074182528062469,
"grad_norm": 1.6410057544708252,
"learning_rate": 2.446103122279938e-06,
"loss": 0.1968,
"step": 7210
},
{
"epoch": 1.4093704245973646,
"grad_norm": 1.4774519205093384,
"learning_rate": 2.431472791475033e-06,
"loss": 0.1826,
"step": 7220
},
{
"epoch": 1.4113225963884821,
"grad_norm": 1.4628077745437622,
"learning_rate": 2.416872275164732e-06,
"loss": 0.1876,
"step": 7230
},
{
"epoch": 1.4132747681795998,
"grad_norm": 1.589036464691162,
"learning_rate": 2.402301742826314e-06,
"loss": 0.2032,
"step": 7240
},
{
"epoch": 1.4152269399707174,
"grad_norm": 1.8331555128097534,
"learning_rate": 2.3877613635890233e-06,
"loss": 0.2018,
"step": 7250
},
{
"epoch": 1.417179111761835,
"grad_norm": 1.4858099222183228,
"learning_rate": 2.373251306232095e-06,
"loss": 0.1809,
"step": 7260
},
{
"epoch": 1.4191312835529526,
"grad_norm": 1.6922942399978638,
"learning_rate": 2.3587717391827997e-06,
"loss": 0.2195,
"step": 7270
},
{
"epoch": 1.4210834553440703,
"grad_norm": 1.8440519571304321,
"learning_rate": 2.344322830514489e-06,
"loss": 0.1992,
"step": 7280
},
{
"epoch": 1.4230356271351878,
"grad_norm": 1.589641809463501,
"learning_rate": 2.329904747944639e-06,
"loss": 0.1952,
"step": 7290
},
{
"epoch": 1.4249877989263056,
"grad_norm": 2.5631532669067383,
"learning_rate": 2.315517658832914e-06,
"loss": 0.2036,
"step": 7300
},
{
"epoch": 1.426939970717423,
"grad_norm": 1.4761751890182495,
"learning_rate": 2.3011617301792144e-06,
"loss": 0.2157,
"step": 7310
},
{
"epoch": 1.4288921425085408,
"grad_norm": 1.3230587244033813,
"learning_rate": 2.2868371286217458e-06,
"loss": 0.2074,
"step": 7320
},
{
"epoch": 1.4308443142996583,
"grad_norm": 1.5246955156326294,
"learning_rate": 2.272544020435073e-06,
"loss": 0.1976,
"step": 7330
},
{
"epoch": 1.432796486090776,
"grad_norm": 1.3427962064743042,
"learning_rate": 2.2582825715282043e-06,
"loss": 0.199,
"step": 7340
},
{
"epoch": 1.4347486578818935,
"grad_norm": 1.5608947277069092,
"learning_rate": 2.2440529474426576e-06,
"loss": 0.2091,
"step": 7350
},
{
"epoch": 1.4367008296730113,
"grad_norm": 1.4694558382034302,
"learning_rate": 2.229855313350539e-06,
"loss": 0.2016,
"step": 7360
},
{
"epoch": 1.4386530014641288,
"grad_norm": 1.3749107122421265,
"learning_rate": 2.21568983405263e-06,
"loss": 0.1969,
"step": 7370
},
{
"epoch": 1.4406051732552465,
"grad_norm": 1.803125262260437,
"learning_rate": 2.2015566739764647e-06,
"loss": 0.1825,
"step": 7380
},
{
"epoch": 1.442557345046364,
"grad_norm": 1.7458932399749756,
"learning_rate": 2.187455997174437e-06,
"loss": 0.2274,
"step": 7390
},
{
"epoch": 1.4445095168374817,
"grad_norm": 1.9437659978866577,
"learning_rate": 2.1733879673218754e-06,
"loss": 0.1892,
"step": 7400
},
{
"epoch": 1.4464616886285993,
"grad_norm": 1.3329747915267944,
"learning_rate": 2.15935274771517e-06,
"loss": 0.1908,
"step": 7410
},
{
"epoch": 1.448413860419717,
"grad_norm": 1.2210984230041504,
"learning_rate": 2.145350501269848e-06,
"loss": 0.2083,
"step": 7420
},
{
"epoch": 1.4503660322108345,
"grad_norm": 1.6458992958068848,
"learning_rate": 2.1313813905187057e-06,
"loss": 0.1919,
"step": 7430
},
{
"epoch": 1.4523182040019522,
"grad_norm": 1.724516749382019,
"learning_rate": 2.117445577609907e-06,
"loss": 0.2122,
"step": 7440
},
{
"epoch": 1.4542703757930697,
"grad_norm": 1.9065065383911133,
"learning_rate": 2.103543224305108e-06,
"loss": 0.1766,
"step": 7450
},
{
"epoch": 1.4562225475841875,
"grad_norm": 1.6535025835037231,
"learning_rate": 2.0896744919775857e-06,
"loss": 0.2011,
"step": 7460
},
{
"epoch": 1.458174719375305,
"grad_norm": 1.4374114274978638,
"learning_rate": 2.075839541610347e-06,
"loss": 0.1874,
"step": 7470
},
{
"epoch": 1.4601268911664227,
"grad_norm": 1.3221102952957153,
"learning_rate": 2.062038533794278e-06,
"loss": 0.1931,
"step": 7480
},
{
"epoch": 1.4620790629575402,
"grad_norm": 1.524937391281128,
"learning_rate": 2.0482716287262655e-06,
"loss": 0.2148,
"step": 7490
},
{
"epoch": 1.464031234748658,
"grad_norm": 1.4761067628860474,
"learning_rate": 2.0345389862073515e-06,
"loss": 0.1927,
"step": 7500
},
{
"epoch": 1.4659834065397754,
"grad_norm": 1.3123400211334229,
"learning_rate": 2.020840765640868e-06,
"loss": 0.2027,
"step": 7510
},
{
"epoch": 1.4679355783308932,
"grad_norm": 1.5982189178466797,
"learning_rate": 2.0071771260305917e-06,
"loss": 0.2187,
"step": 7520
},
{
"epoch": 1.4698877501220107,
"grad_norm": 1.5544073581695557,
"learning_rate": 1.993548225978892e-06,
"loss": 0.1969,
"step": 7530
},
{
"epoch": 1.4718399219131284,
"grad_norm": 1.753106951713562,
"learning_rate": 1.9799542236848996e-06,
"loss": 0.204,
"step": 7540
},
{
"epoch": 1.473792093704246,
"grad_norm": 1.5062081813812256,
"learning_rate": 1.966395276942663e-06,
"loss": 0.1828,
"step": 7550
},
{
"epoch": 1.4757442654953636,
"grad_norm": 1.4779527187347412,
"learning_rate": 1.95287154313932e-06,
"loss": 0.2011,
"step": 7560
},
{
"epoch": 1.4776964372864811,
"grad_norm": 1.6511406898498535,
"learning_rate": 1.9393831792532714e-06,
"loss": 0.1944,
"step": 7570
},
{
"epoch": 1.4796486090775989,
"grad_norm": 2.017911672592163,
"learning_rate": 1.9259303418523505e-06,
"loss": 0.1881,
"step": 7580
},
{
"epoch": 1.4816007808687164,
"grad_norm": 1.4355378150939941,
"learning_rate": 1.9125131870920212e-06,
"loss": 0.1973,
"step": 7590
},
{
"epoch": 1.4835529526598341,
"grad_norm": 1.6479721069335938,
"learning_rate": 1.8991318707135515e-06,
"loss": 0.1911,
"step": 7600
},
{
"epoch": 1.4855051244509516,
"grad_norm": 1.593674659729004,
"learning_rate": 1.8857865480422143e-06,
"loss": 0.1977,
"step": 7610
},
{
"epoch": 1.4874572962420693,
"grad_norm": 1.2697453498840332,
"learning_rate": 1.8724773739854763e-06,
"loss": 0.1861,
"step": 7620
},
{
"epoch": 1.4894094680331869,
"grad_norm": 1.7640104293823242,
"learning_rate": 1.8592045030312094e-06,
"loss": 0.2057,
"step": 7630
},
{
"epoch": 1.4913616398243046,
"grad_norm": 1.9129244089126587,
"learning_rate": 1.8459680892458932e-06,
"loss": 0.1847,
"step": 7640
},
{
"epoch": 1.493313811615422,
"grad_norm": 1.7413532733917236,
"learning_rate": 1.8327682862728174e-06,
"loss": 0.217,
"step": 7650
},
{
"epoch": 1.4952659834065398,
"grad_norm": 1.9197639226913452,
"learning_rate": 1.8196052473303227e-06,
"loss": 0.2086,
"step": 7660
},
{
"epoch": 1.4972181551976573,
"grad_norm": 1.4320746660232544,
"learning_rate": 1.8064791252099923e-06,
"loss": 0.2005,
"step": 7670
},
{
"epoch": 1.499170326988775,
"grad_norm": 1.49395751953125,
"learning_rate": 1.793390072274902e-06,
"loss": 0.1783,
"step": 7680
},
{
"epoch": 1.5011224987798926,
"grad_norm": 1.5681458711624146,
"learning_rate": 1.7803382404578356e-06,
"loss": 0.1857,
"step": 7690
},
{
"epoch": 1.5030746705710103,
"grad_norm": 1.4596507549285889,
"learning_rate": 1.7673237812595334e-06,
"loss": 0.2048,
"step": 7700
},
{
"epoch": 1.505026842362128,
"grad_norm": 1.6496530771255493,
"learning_rate": 1.7543468457469264e-06,
"loss": 0.2015,
"step": 7710
},
{
"epoch": 1.5069790141532455,
"grad_norm": 1.67415452003479,
"learning_rate": 1.741407584551388e-06,
"loss": 0.1899,
"step": 7720
},
{
"epoch": 1.508931185944363,
"grad_norm": 1.4106497764587402,
"learning_rate": 1.728506147866975e-06,
"loss": 0.1966,
"step": 7730
},
{
"epoch": 1.5108833577354808,
"grad_norm": 1.4033384323120117,
"learning_rate": 1.715642685448698e-06,
"loss": 0.1923,
"step": 7740
},
{
"epoch": 1.5128355295265985,
"grad_norm": 1.6058449745178223,
"learning_rate": 1.7028173466107756e-06,
"loss": 0.1808,
"step": 7750
},
{
"epoch": 1.514787701317716,
"grad_norm": 1.6106737852096558,
"learning_rate": 1.6900302802249002e-06,
"loss": 0.1939,
"step": 7760
},
{
"epoch": 1.5167398731088335,
"grad_norm": 1.6454293727874756,
"learning_rate": 1.6772816347185155e-06,
"loss": 0.2005,
"step": 7770
},
{
"epoch": 1.5186920448999512,
"grad_norm": 1.386349081993103,
"learning_rate": 1.6645715580730842e-06,
"loss": 0.1964,
"step": 7780
},
{
"epoch": 1.520644216691069,
"grad_norm": 1.464933156967163,
"learning_rate": 1.651900197822382e-06,
"loss": 0.1882,
"step": 7790
},
{
"epoch": 1.5225963884821865,
"grad_norm": 1.5540884733200073,
"learning_rate": 1.6392677010507768e-06,
"loss": 0.1997,
"step": 7800
},
{
"epoch": 1.524548560273304,
"grad_norm": 1.8291113376617432,
"learning_rate": 1.626674214391526e-06,
"loss": 0.2043,
"step": 7810
},
{
"epoch": 1.5265007320644217,
"grad_norm": 1.2215909957885742,
"learning_rate": 1.6141198840250672e-06,
"loss": 0.1876,
"step": 7820
},
{
"epoch": 1.5284529038555394,
"grad_norm": 1.7053108215332031,
"learning_rate": 1.6016048556773318e-06,
"loss": 0.1926,
"step": 7830
},
{
"epoch": 1.530405075646657,
"grad_norm": 2.2207212448120117,
"learning_rate": 1.5891292746180453e-06,
"loss": 0.1906,
"step": 7840
},
{
"epoch": 1.5323572474377745,
"grad_norm": 1.6570477485656738,
"learning_rate": 1.5766932856590467e-06,
"loss": 0.1955,
"step": 7850
},
{
"epoch": 1.5343094192288922,
"grad_norm": 3.652426242828369,
"learning_rate": 1.564297033152603e-06,
"loss": 0.1795,
"step": 7860
},
{
"epoch": 1.53626159102001,
"grad_norm": 1.6970198154449463,
"learning_rate": 1.5519406609897337e-06,
"loss": 0.2021,
"step": 7870
},
{
"epoch": 1.5382137628111274,
"grad_norm": 1.788756251335144,
"learning_rate": 1.5396243125985467e-06,
"loss": 0.2041,
"step": 7880
},
{
"epoch": 1.540165934602245,
"grad_norm": 1.713059902191162,
"learning_rate": 1.5273481309425614e-06,
"loss": 0.2039,
"step": 7890
},
{
"epoch": 1.5421181063933627,
"grad_norm": 1.5806763172149658,
"learning_rate": 1.5151122585190697e-06,
"loss": 0.1787,
"step": 7900
},
{
"epoch": 1.5440702781844804,
"grad_norm": 1.6198954582214355,
"learning_rate": 1.5029168373574553e-06,
"loss": 0.2025,
"step": 7910
},
{
"epoch": 1.546022449975598,
"grad_norm": 1.5160976648330688,
"learning_rate": 1.4907620090175678e-06,
"loss": 0.1842,
"step": 7920
},
{
"epoch": 1.5479746217667154,
"grad_norm": 1.5395866632461548,
"learning_rate": 1.4786479145880684e-06,
"loss": 0.1876,
"step": 7930
},
{
"epoch": 1.5499267935578331,
"grad_norm": 2.082068681716919,
"learning_rate": 1.466574694684792e-06,
"loss": 0.1865,
"step": 7940
},
{
"epoch": 1.5518789653489509,
"grad_norm": 1.5307166576385498,
"learning_rate": 1.45454248944912e-06,
"loss": 0.2111,
"step": 7950
},
{
"epoch": 1.5538311371400684,
"grad_norm": 1.3423751592636108,
"learning_rate": 1.4425514385463513e-06,
"loss": 0.2022,
"step": 7960
},
{
"epoch": 1.5557833089311859,
"grad_norm": 1.5550248622894287,
"learning_rate": 1.4306016811640804e-06,
"loss": 0.1882,
"step": 7970
},
{
"epoch": 1.5577354807223036,
"grad_norm": 1.5491834878921509,
"learning_rate": 1.4186933560105798e-06,
"loss": 0.1936,
"step": 7980
},
{
"epoch": 1.5596876525134213,
"grad_norm": 1.489673137664795,
"learning_rate": 1.4068266013131954e-06,
"loss": 0.1961,
"step": 7990
},
{
"epoch": 1.5616398243045388,
"grad_norm": 1.7777575254440308,
"learning_rate": 1.3950015548167372e-06,
"loss": 0.1868,
"step": 8000
},
{
"epoch": 1.5635919960956564,
"grad_norm": 1.5120434761047363,
"learning_rate": 1.383218353781885e-06,
"loss": 0.1861,
"step": 8010
},
{
"epoch": 1.565544167886774,
"grad_norm": 1.5217742919921875,
"learning_rate": 1.3714771349835871e-06,
"loss": 0.1991,
"step": 8020
},
{
"epoch": 1.5674963396778918,
"grad_norm": 1.8366824388504028,
"learning_rate": 1.3597780347094814e-06,
"loss": 0.1924,
"step": 8030
},
{
"epoch": 1.5694485114690093,
"grad_norm": 1.8791022300720215,
"learning_rate": 1.3481211887583101e-06,
"loss": 0.1798,
"step": 8040
},
{
"epoch": 1.5714006832601268,
"grad_norm": 1.4695504903793335,
"learning_rate": 1.3365067324383418e-06,
"loss": 0.1861,
"step": 8050
},
{
"epoch": 1.5733528550512446,
"grad_norm": 1.619231939315796,
"learning_rate": 1.3249348005658047e-06,
"loss": 0.2067,
"step": 8060
},
{
"epoch": 1.5753050268423623,
"grad_norm": 1.5700544118881226,
"learning_rate": 1.3134055274633135e-06,
"loss": 0.177,
"step": 8070
},
{
"epoch": 1.5772571986334798,
"grad_norm": 1.4808869361877441,
"learning_rate": 1.3019190469583238e-06,
"loss": 0.1878,
"step": 8080
},
{
"epoch": 1.5792093704245973,
"grad_norm": 1.8071080446243286,
"learning_rate": 1.2904754923815615e-06,
"loss": 0.1895,
"step": 8090
},
{
"epoch": 1.581161542215715,
"grad_norm": 1.6094269752502441,
"learning_rate": 1.2790749965654964e-06,
"loss": 0.2031,
"step": 8100
},
{
"epoch": 1.5831137140068328,
"grad_norm": 1.8464001417160034,
"learning_rate": 1.2677176918427769e-06,
"loss": 0.1908,
"step": 8110
},
{
"epoch": 1.5850658857979503,
"grad_norm": 1.3580669164657593,
"learning_rate": 1.256403710044713e-06,
"loss": 0.1877,
"step": 8120
},
{
"epoch": 1.5870180575890678,
"grad_norm": 1.6515955924987793,
"learning_rate": 1.245133182499737e-06,
"loss": 0.1926,
"step": 8130
},
{
"epoch": 1.5889702293801855,
"grad_norm": 1.3067126274108887,
"learning_rate": 1.2339062400318746e-06,
"loss": 0.1841,
"step": 8140
},
{
"epoch": 1.5909224011713032,
"grad_norm": 1.7133537530899048,
"learning_rate": 1.222723012959245e-06,
"loss": 0.192,
"step": 8150
},
{
"epoch": 1.5928745729624207,
"grad_norm": 1.7153127193450928,
"learning_rate": 1.2115836310925222e-06,
"loss": 0.1875,
"step": 8160
},
{
"epoch": 1.5948267447535383,
"grad_norm": 1.5071845054626465,
"learning_rate": 1.2004882237334508e-06,
"loss": 0.1881,
"step": 8170
},
{
"epoch": 1.596778916544656,
"grad_norm": 1.383007526397705,
"learning_rate": 1.1894369196733296e-06,
"loss": 0.1963,
"step": 8180
},
{
"epoch": 1.5987310883357737,
"grad_norm": 2.016113042831421,
"learning_rate": 1.1784298471915279e-06,
"loss": 0.192,
"step": 8190
},
{
"epoch": 1.6006832601268912,
"grad_norm": 1.7588931322097778,
"learning_rate": 1.1674671340539895e-06,
"loss": 0.1763,
"step": 8200
},
{
"epoch": 1.6026354319180087,
"grad_norm": 1.6065738201141357,
"learning_rate": 1.156548907511751e-06,
"loss": 0.1782,
"step": 8210
},
{
"epoch": 1.6045876037091265,
"grad_norm": 1.385704517364502,
"learning_rate": 1.1456752942994675e-06,
"loss": 0.1895,
"step": 8220
},
{
"epoch": 1.6065397755002442,
"grad_norm": 1.7642816305160522,
"learning_rate": 1.134846420633936e-06,
"loss": 0.1863,
"step": 8230
},
{
"epoch": 1.6084919472913617,
"grad_norm": 1.86152982711792,
"learning_rate": 1.1240624122126364e-06,
"loss": 0.1973,
"step": 8240
},
{
"epoch": 1.6104441190824792,
"grad_norm": 1.391201376914978,
"learning_rate": 1.11332339421227e-06,
"loss": 0.2125,
"step": 8250
},
{
"epoch": 1.612396290873597,
"grad_norm": 1.69306218624115,
"learning_rate": 1.102629491287306e-06,
"loss": 0.183,
"step": 8260
},
{
"epoch": 1.6143484626647147,
"grad_norm": 1.454978585243225,
"learning_rate": 1.0919808275685312e-06,
"loss": 0.1756,
"step": 8270
},
{
"epoch": 1.6163006344558322,
"grad_norm": 2.039059638977051,
"learning_rate": 1.0813775266616178e-06,
"loss": 0.2104,
"step": 8280
},
{
"epoch": 1.6182528062469497,
"grad_norm": 1.7883342504501343,
"learning_rate": 1.0708197116456814e-06,
"loss": 0.186,
"step": 8290
},
{
"epoch": 1.6202049780380674,
"grad_norm": 1.4627658128738403,
"learning_rate": 1.060307505071856e-06,
"loss": 0.1895,
"step": 8300
},
{
"epoch": 1.6221571498291851,
"grad_norm": 1.8400914669036865,
"learning_rate": 1.0498410289618661e-06,
"loss": 0.2073,
"step": 8310
},
{
"epoch": 1.6241093216203026,
"grad_norm": 1.562545657157898,
"learning_rate": 1.039420404806618e-06,
"loss": 0.1762,
"step": 8320
},
{
"epoch": 1.6260614934114201,
"grad_norm": 1.5337347984313965,
"learning_rate": 1.0290457535647851e-06,
"loss": 0.185,
"step": 8330
},
{
"epoch": 1.6280136652025379,
"grad_norm": 1.2989600896835327,
"learning_rate": 1.0187171956614034e-06,
"loss": 0.1979,
"step": 8340
},
{
"epoch": 1.6299658369936556,
"grad_norm": 1.4280548095703125,
"learning_rate": 1.0084348509864778e-06,
"loss": 0.1819,
"step": 8350
},
{
"epoch": 1.631918008784773,
"grad_norm": 1.5403130054473877,
"learning_rate": 9.981988388935815e-07,
"loss": 0.1998,
"step": 8360
},
{
"epoch": 1.6338701805758906,
"grad_norm": 1.5721741914749146,
"learning_rate": 9.88009278198484e-07,
"loss": 0.1946,
"step": 8370
},
{
"epoch": 1.6358223523670083,
"grad_norm": 1.484516978263855,
"learning_rate": 9.778662871777577e-07,
"loss": 0.1869,
"step": 8380
},
{
"epoch": 1.637774524158126,
"grad_norm": 1.5680903196334839,
"learning_rate": 9.677699835674165e-07,
"loss": 0.1902,
"step": 8390
},
{
"epoch": 1.6397266959492436,
"grad_norm": 2.306062936782837,
"learning_rate": 9.577204845615423e-07,
"loss": 0.1792,
"step": 8400
},
{
"epoch": 1.641678867740361,
"grad_norm": 1.3779124021530151,
"learning_rate": 9.477179068109276e-07,
"loss": 0.1749,
"step": 8410
},
{
"epoch": 1.6436310395314788,
"grad_norm": 1.7077033519744873,
"learning_rate": 9.377623664217223e-07,
"loss": 0.1803,
"step": 8420
},
{
"epoch": 1.6455832113225966,
"grad_norm": 1.6295830011367798,
"learning_rate": 9.278539789540791e-07,
"loss": 0.1791,
"step": 8430
},
{
"epoch": 1.647535383113714,
"grad_norm": 1.654883861541748,
"learning_rate": 9.179928594208226e-07,
"loss": 0.1897,
"step": 8440
},
{
"epoch": 1.6494875549048316,
"grad_norm": 1.6051833629608154,
"learning_rate": 9.08179122286107e-07,
"loss": 0.1815,
"step": 8450
},
{
"epoch": 1.6514397266959493,
"grad_norm": 1.543062448501587,
"learning_rate": 8.984128814640913e-07,
"loss": 0.1948,
"step": 8460
},
{
"epoch": 1.653391898487067,
"grad_norm": 1.3002710342407227,
"learning_rate": 8.886942503176111e-07,
"loss": 0.1682,
"step": 8470
},
{
"epoch": 1.6553440702781845,
"grad_norm": 1.279937982559204,
"learning_rate": 8.790233416568705e-07,
"loss": 0.1865,
"step": 8480
},
{
"epoch": 1.657296242069302,
"grad_norm": 1.4807121753692627,
"learning_rate": 8.694002677381275e-07,
"loss": 0.1794,
"step": 8490
},
{
"epoch": 1.6592484138604198,
"grad_norm": 1.7030433416366577,
"learning_rate": 8.598251402623936e-07,
"loss": 0.2162,
"step": 8500
},
{
"epoch": 1.6612005856515375,
"grad_norm": 1.3917113542556763,
"learning_rate": 8.502980703741365e-07,
"loss": 0.2069,
"step": 8510
},
{
"epoch": 1.663152757442655,
"grad_norm": 1.782596468925476,
"learning_rate": 8.408191686599859e-07,
"loss": 0.1957,
"step": 8520
},
{
"epoch": 1.6651049292337725,
"grad_norm": 1.685548186302185,
"learning_rate": 8.313885451474568e-07,
"loss": 0.1814,
"step": 8530
},
{
"epoch": 1.6670571010248902,
"grad_norm": 1.159106969833374,
"learning_rate": 8.22006309303669e-07,
"loss": 0.1897,
"step": 8540
},
{
"epoch": 1.669009272816008,
"grad_norm": 1.6422431468963623,
"learning_rate": 8.126725700340765e-07,
"loss": 0.1749,
"step": 8550
},
{
"epoch": 1.6709614446071255,
"grad_norm": 1.769709825515747,
"learning_rate": 8.033874356811999e-07,
"loss": 0.1887,
"step": 8560
},
{
"epoch": 1.672913616398243,
"grad_norm": 1.5047273635864258,
"learning_rate": 7.941510140233782e-07,
"loss": 0.1972,
"step": 8570
},
{
"epoch": 1.6748657881893607,
"grad_norm": 1.2721914052963257,
"learning_rate": 7.849634122735051e-07,
"loss": 0.1889,
"step": 8580
},
{
"epoch": 1.6768179599804784,
"grad_norm": 1.4038310050964355,
"learning_rate": 7.758247370777988e-07,
"loss": 0.1846,
"step": 8590
},
{
"epoch": 1.678770131771596,
"grad_norm": 2.210256576538086,
"learning_rate": 7.66735094514549e-07,
"loss": 0.1937,
"step": 8600
},
{
"epoch": 1.6807223035627135,
"grad_norm": 2.027031183242798,
"learning_rate": 7.576945900928989e-07,
"loss": 0.1819,
"step": 8610
},
{
"epoch": 1.6826744753538312,
"grad_norm": 2.145761251449585,
"learning_rate": 7.487033287516121e-07,
"loss": 0.1791,
"step": 8620
},
{
"epoch": 1.684626647144949,
"grad_norm": 1.670264720916748,
"learning_rate": 7.397614148578546e-07,
"loss": 0.1909,
"step": 8630
},
{
"epoch": 1.6865788189360664,
"grad_norm": 1.315961241722107,
"learning_rate": 7.308689522059936e-07,
"loss": 0.1618,
"step": 8640
},
{
"epoch": 1.688530990727184,
"grad_norm": 1.3299071788787842,
"learning_rate": 7.220260440163756e-07,
"loss": 0.1865,
"step": 8650
},
{
"epoch": 1.6904831625183017,
"grad_norm": 1.5446456670761108,
"learning_rate": 7.132327929341448e-07,
"loss": 0.1981,
"step": 8660
},
{
"epoch": 1.6924353343094194,
"grad_norm": 1.5094953775405884,
"learning_rate": 7.044893010280401e-07,
"loss": 0.1758,
"step": 8670
},
{
"epoch": 1.694387506100537,
"grad_norm": 1.5475033521652222,
"learning_rate": 6.95795669789216e-07,
"loss": 0.1774,
"step": 8680
},
{
"epoch": 1.6963396778916544,
"grad_norm": 1.6808737516403198,
"learning_rate": 6.871520001300641e-07,
"loss": 0.1817,
"step": 8690
},
{
"epoch": 1.6982918496827721,
"grad_norm": 1.6497918367385864,
"learning_rate": 6.785583923830403e-07,
"loss": 0.2037,
"step": 8700
},
{
"epoch": 1.7002440214738899,
"grad_norm": 1.6453701257705688,
"learning_rate": 6.70014946299501e-07,
"loss": 0.1922,
"step": 8710
},
{
"epoch": 1.7021961932650074,
"grad_norm": 1.8417551517486572,
"learning_rate": 6.615217610485425e-07,
"loss": 0.1957,
"step": 8720
},
{
"epoch": 1.7041483650561249,
"grad_norm": 1.4647294282913208,
"learning_rate": 6.530789352158556e-07,
"loss": 0.177,
"step": 8730
},
{
"epoch": 1.7061005368472426,
"grad_norm": 1.6255731582641602,
"learning_rate": 6.446865668025764e-07,
"loss": 0.1863,
"step": 8740
},
{
"epoch": 1.7080527086383603,
"grad_norm": 1.3874322175979614,
"learning_rate": 6.363447532241518e-07,
"loss": 0.1787,
"step": 8750
},
{
"epoch": 1.7100048804294778,
"grad_norm": 1.8756142854690552,
"learning_rate": 6.280535913092039e-07,
"loss": 0.2087,
"step": 8760
},
{
"epoch": 1.7119570522205954,
"grad_norm": 1.6572879552841187,
"learning_rate": 6.198131772984123e-07,
"loss": 0.1874,
"step": 8770
},
{
"epoch": 1.713909224011713,
"grad_norm": 1.7034530639648438,
"learning_rate": 6.11623606843394e-07,
"loss": 0.1744,
"step": 8780
},
{
"epoch": 1.7158613958028308,
"grad_norm": 2.1638829708099365,
"learning_rate": 6.034849750055922e-07,
"loss": 0.1873,
"step": 8790
},
{
"epoch": 1.7178135675939483,
"grad_norm": 1.5946378707885742,
"learning_rate": 5.953973762551746e-07,
"loss": 0.1879,
"step": 8800
},
{
"epoch": 1.7197657393850658,
"grad_norm": 1.7199867963790894,
"learning_rate": 5.873609044699347e-07,
"loss": 0.1881,
"step": 8810
},
{
"epoch": 1.7217179111761836,
"grad_norm": 1.6096932888031006,
"learning_rate": 5.793756529342054e-07,
"loss": 0.195,
"step": 8820
},
{
"epoch": 1.7236700829673013,
"grad_norm": 1.6395195722579956,
"learning_rate": 5.714417143377704e-07,
"loss": 0.1899,
"step": 8830
},
{
"epoch": 1.7256222547584188,
"grad_norm": 1.9110219478607178,
"learning_rate": 5.635591807747997e-07,
"loss": 0.2085,
"step": 8840
},
{
"epoch": 1.7275744265495363,
"grad_norm": 1.688049554824829,
"learning_rate": 5.557281437427647e-07,
"loss": 0.1835,
"step": 8850
},
{
"epoch": 1.729526598340654,
"grad_norm": 1.2737325429916382,
"learning_rate": 5.479486941413914e-07,
"loss": 0.1772,
"step": 8860
},
{
"epoch": 1.7314787701317718,
"grad_norm": 1.3557264804840088,
"learning_rate": 5.402209222715915e-07,
"loss": 0.1809,
"step": 8870
},
{
"epoch": 1.7334309419228893,
"grad_norm": 1.7743897438049316,
"learning_rate": 5.325449178344272e-07,
"loss": 0.1842,
"step": 8880
},
{
"epoch": 1.7353831137140068,
"grad_norm": 1.759393572807312,
"learning_rate": 5.249207699300607e-07,
"loss": 0.1829,
"step": 8890
},
{
"epoch": 1.7373352855051245,
"grad_norm": 1.3803874254226685,
"learning_rate": 5.173485670567241e-07,
"loss": 0.198,
"step": 8900
},
{
"epoch": 1.7392874572962422,
"grad_norm": 1.1466678380966187,
"learning_rate": 5.098283971096923e-07,
"loss": 0.1861,
"step": 8910
},
{
"epoch": 1.7412396290873597,
"grad_norm": 1.2843297719955444,
"learning_rate": 5.02360347380258e-07,
"loss": 0.1835,
"step": 8920
},
{
"epoch": 1.7431918008784772,
"grad_norm": 1.6574630737304688,
"learning_rate": 4.949445045547253e-07,
"loss": 0.1957,
"step": 8930
},
{
"epoch": 1.745143972669595,
"grad_norm": 1.6570775508880615,
"learning_rate": 4.875809547133991e-07,
"loss": 0.173,
"step": 8940
},
{
"epoch": 1.7470961444607127,
"grad_norm": 1.4477870464324951,
"learning_rate": 4.802697833295888e-07,
"loss": 0.1908,
"step": 8950
},
{
"epoch": 1.7490483162518302,
"grad_norm": 1.4322397708892822,
"learning_rate": 4.7301107526861125e-07,
"loss": 0.1843,
"step": 8960
},
{
"epoch": 1.7510004880429477,
"grad_norm": 1.7200080156326294,
"learning_rate": 4.65804914786811e-07,
"loss": 0.1844,
"step": 8970
},
{
"epoch": 1.7529526598340655,
"grad_norm": 1.5689972639083862,
"learning_rate": 4.5865138553057963e-07,
"loss": 0.1751,
"step": 8980
},
{
"epoch": 1.7549048316251832,
"grad_norm": 2.6789133548736572,
"learning_rate": 4.5155057053538564e-07,
"loss": 0.1781,
"step": 8990
},
{
"epoch": 1.7568570034163007,
"grad_norm": 1.8074640035629272,
"learning_rate": 4.445025522248109e-07,
"loss": 0.1972,
"step": 9000
},
{
"epoch": 1.7588091752074182,
"grad_norm": 1.4728846549987793,
"learning_rate": 4.375074124095902e-07,
"loss": 0.1837,
"step": 9010
},
{
"epoch": 1.760761346998536,
"grad_norm": 1.4168732166290283,
"learning_rate": 4.3056523228666823e-07,
"loss": 0.1889,
"step": 9020
},
{
"epoch": 1.7627135187896537,
"grad_norm": 1.5875383615493774,
"learning_rate": 4.2367609243825215e-07,
"loss": 0.1811,
"step": 9030
},
{
"epoch": 1.7646656905807712,
"grad_norm": 1.8759644031524658,
"learning_rate": 4.1684007283087803e-07,
"loss": 0.1887,
"step": 9040
},
{
"epoch": 1.7666178623718887,
"grad_norm": 1.457709789276123,
"learning_rate": 4.1005725281448083e-07,
"loss": 0.1766,
"step": 9050
},
{
"epoch": 1.7685700341630064,
"grad_norm": 1.5108357667922974,
"learning_rate": 4.033277111214778e-07,
"loss": 0.1781,
"step": 9060
},
{
"epoch": 1.7705222059541241,
"grad_norm": 1.6905869245529175,
"learning_rate": 3.966515258658465e-07,
"loss": 0.206,
"step": 9070
},
{
"epoch": 1.7724743777452416,
"grad_norm": 1.4353312253952026,
"learning_rate": 3.9002877454222767e-07,
"loss": 0.1712,
"step": 9080
},
{
"epoch": 1.7744265495363591,
"grad_norm": 1.5684956312179565,
"learning_rate": 3.834595340250208e-07,
"loss": 0.1836,
"step": 9090
},
{
"epoch": 1.7763787213274769,
"grad_norm": 1.705480933189392,
"learning_rate": 3.7694388056748966e-07,
"loss": 0.2001,
"step": 9100
},
{
"epoch": 1.7783308931185946,
"grad_norm": 2.7086293697357178,
"learning_rate": 3.704818898008811e-07,
"loss": 0.194,
"step": 9110
},
{
"epoch": 1.780283064909712,
"grad_norm": 1.3837997913360596,
"learning_rate": 3.640736367335451e-07,
"loss": 0.1961,
"step": 9120
},
{
"epoch": 1.7822352367008296,
"grad_norm": 1.6206564903259277,
"learning_rate": 3.577191957500653e-07,
"loss": 0.1808,
"step": 9130
},
{
"epoch": 1.7841874084919473,
"grad_norm": 1.6568931341171265,
"learning_rate": 3.5141864061039534e-07,
"loss": 0.1884,
"step": 9140
},
{
"epoch": 1.786139580283065,
"grad_norm": 1.385784387588501,
"learning_rate": 3.4517204444900143e-07,
"loss": 0.1764,
"step": 9150
},
{
"epoch": 1.7880917520741826,
"grad_norm": 1.6895047426223755,
"learning_rate": 3.3897947977401426e-07,
"loss": 0.1935,
"step": 9160
},
{
"epoch": 1.7900439238653,
"grad_norm": 1.6801025867462158,
"learning_rate": 3.328410184663883e-07,
"loss": 0.1953,
"step": 9170
},
{
"epoch": 1.7919960956564178,
"grad_norm": 1.7120310068130493,
"learning_rate": 3.2675673177906543e-07,
"loss": 0.1813,
"step": 9180
},
{
"epoch": 1.7939482674475355,
"grad_norm": 1.6116238832473755,
"learning_rate": 3.207266903361506e-07,
"loss": 0.1741,
"step": 9190
},
{
"epoch": 1.795900439238653,
"grad_norm": 1.5066558122634888,
"learning_rate": 3.1475096413208895e-07,
"loss": 0.1795,
"step": 9200
},
{
"epoch": 1.7978526110297706,
"grad_norm": 1.6561325788497925,
"learning_rate": 3.0882962253085513e-07,
"loss": 0.179,
"step": 9210
},
{
"epoch": 1.7998047828208883,
"grad_norm": 1.5498188734054565,
"learning_rate": 3.029627342651481e-07,
"loss": 0.1898,
"step": 9220
},
{
"epoch": 1.8017569546120058,
"grad_norm": 1.6732233762741089,
"learning_rate": 2.97150367435593e-07,
"loss": 0.1996,
"step": 9230
},
{
"epoch": 1.8037091264031235,
"grad_norm": 1.2501994371414185,
"learning_rate": 2.913925895099512e-07,
"loss": 0.1855,
"step": 9240
},
{
"epoch": 1.805661298194241,
"grad_norm": 1.7852662801742554,
"learning_rate": 2.8568946732233536e-07,
"loss": 0.1868,
"step": 9250
},
{
"epoch": 1.8076134699853585,
"grad_norm": 1.6470123529434204,
"learning_rate": 2.8004106707243685e-07,
"loss": 0.1752,
"step": 9260
},
{
"epoch": 1.8095656417764763,
"grad_norm": 1.574867606163025,
"learning_rate": 2.7444745432475217e-07,
"loss": 0.1875,
"step": 9270
},
{
"epoch": 1.811517813567594,
"grad_norm": 1.4707486629486084,
"learning_rate": 2.6890869400782893e-07,
"loss": 0.1852,
"step": 9280
},
{
"epoch": 1.8134699853587115,
"grad_norm": 1.4332098960876465,
"learning_rate": 2.6342485041350786e-07,
"loss": 0.2055,
"step": 9290
},
{
"epoch": 1.815422157149829,
"grad_norm": 1.5197958946228027,
"learning_rate": 2.579959871961746e-07,
"loss": 0.1957,
"step": 9300
},
{
"epoch": 1.8173743289409467,
"grad_norm": 1.361507773399353,
"learning_rate": 2.5262216737202526e-07,
"loss": 0.1793,
"step": 9310
},
{
"epoch": 1.8193265007320645,
"grad_norm": 1.4850951433181763,
"learning_rate": 2.4730345331833105e-07,
"loss": 0.1759,
"step": 9320
},
{
"epoch": 1.821278672523182,
"grad_norm": 1.5868844985961914,
"learning_rate": 2.4203990677272025e-07,
"loss": 0.1862,
"step": 9330
},
{
"epoch": 1.8232308443142995,
"grad_norm": 1.2263453006744385,
"learning_rate": 2.3683158883245294e-07,
"loss": 0.1712,
"step": 9340
},
{
"epoch": 1.8251830161054172,
"grad_norm": 1.3057667016983032,
"learning_rate": 2.3167855995372025e-07,
"loss": 0.1921,
"step": 9350
},
{
"epoch": 1.827135187896535,
"grad_norm": 1.3871904611587524,
"learning_rate": 2.2658087995093503e-07,
"loss": 0.1763,
"step": 9360
},
{
"epoch": 1.8290873596876525,
"grad_norm": 1.5762321949005127,
"learning_rate": 2.215386079960441e-07,
"loss": 0.1763,
"step": 9370
},
{
"epoch": 1.83103953147877,
"grad_norm": 1.703233242034912,
"learning_rate": 2.1655180261783704e-07,
"loss": 0.1844,
"step": 9380
},
{
"epoch": 1.8329917032698877,
"grad_norm": 1.7117513418197632,
"learning_rate": 2.1162052170126956e-07,
"loss": 0.1853,
"step": 9390
},
{
"epoch": 1.8349438750610054,
"grad_norm": 1.9015542268753052,
"learning_rate": 2.0674482248679018e-07,
"loss": 0.1795,
"step": 9400
},
{
"epoch": 1.836896046852123,
"grad_norm": 1.317375659942627,
"learning_rate": 2.0192476156967456e-07,
"loss": 0.1635,
"step": 9410
},
{
"epoch": 1.8388482186432404,
"grad_norm": 1.704931616783142,
"learning_rate": 1.9716039489937056e-07,
"loss": 0.1863,
"step": 9420
},
{
"epoch": 1.8408003904343582,
"grad_norm": 1.7894163131713867,
"learning_rate": 1.9245177777884983e-07,
"loss": 0.1647,
"step": 9430
},
{
"epoch": 1.842752562225476,
"grad_norm": 1.5365114212036133,
"learning_rate": 1.877989648639633e-07,
"loss": 0.1836,
"step": 9440
},
{
"epoch": 1.8447047340165934,
"grad_norm": 1.510296106338501,
"learning_rate": 1.8320201016280626e-07,
"loss": 0.181,
"step": 9450
},
{
"epoch": 1.846656905807711,
"grad_norm": 1.6183487176895142,
"learning_rate": 1.7866096703509472e-07,
"loss": 0.1735,
"step": 9460
},
{
"epoch": 1.8486090775988286,
"grad_norm": 1.6009883880615234,
"learning_rate": 1.741758881915434e-07,
"loss": 0.1989,
"step": 9470
},
{
"epoch": 1.8505612493899464,
"grad_norm": 1.6394355297088623,
"learning_rate": 1.6974682569325607e-07,
"loss": 0.1928,
"step": 9480
},
{
"epoch": 1.8525134211810639,
"grad_norm": 1.5997397899627686,
"learning_rate": 1.6537383095111882e-07,
"loss": 0.1839,
"step": 9490
},
{
"epoch": 1.8544655929721814,
"grad_norm": 1.5100394487380981,
"learning_rate": 1.6105695472520333e-07,
"loss": 0.1794,
"step": 9500
},
{
"epoch": 1.8564177647632991,
"grad_norm": 1.6228221654891968,
"learning_rate": 1.567962471241813e-07,
"loss": 0.19,
"step": 9510
},
{
"epoch": 1.8583699365544168,
"grad_norm": 1.5703927278518677,
"learning_rate": 1.52591757604737e-07,
"loss": 0.1941,
"step": 9520
},
{
"epoch": 1.8603221083455344,
"grad_norm": 1.9678072929382324,
"learning_rate": 1.4844353497100006e-07,
"loss": 0.1897,
"step": 9530
},
{
"epoch": 1.8622742801366519,
"grad_norm": 1.8721849918365479,
"learning_rate": 1.4435162737397203e-07,
"loss": 0.2055,
"step": 9540
},
{
"epoch": 1.8642264519277696,
"grad_norm": 1.1459565162658691,
"learning_rate": 1.4031608231097394e-07,
"loss": 0.2025,
"step": 9550
},
{
"epoch": 1.8661786237188873,
"grad_norm": 1.595178246498108,
"learning_rate": 1.3633694662508745e-07,
"loss": 0.1656,
"step": 9560
},
{
"epoch": 1.8681307955100048,
"grad_norm": 1.6833758354187012,
"learning_rate": 1.3241426650461964e-07,
"loss": 0.2007,
"step": 9570
},
{
"epoch": 1.8700829673011223,
"grad_norm": 2.375056505203247,
"learning_rate": 1.285480874825623e-07,
"loss": 0.1792,
"step": 9580
},
{
"epoch": 1.87203513909224,
"grad_norm": 1.5321182012557983,
"learning_rate": 1.2473845443606081e-07,
"loss": 0.1926,
"step": 9590
},
{
"epoch": 1.8739873108833578,
"grad_norm": 1.4411838054656982,
"learning_rate": 1.2098541158589883e-07,
"loss": 0.1895,
"step": 9600
},
{
"epoch": 1.8759394826744753,
"grad_norm": 2.2660744190216064,
"learning_rate": 1.1728900249598052e-07,
"loss": 0.1813,
"step": 9610
},
{
"epoch": 1.8778916544655928,
"grad_norm": 1.5801745653152466,
"learning_rate": 1.1364927007282866e-07,
"loss": 0.185,
"step": 9620
},
{
"epoch": 1.8798438262567105,
"grad_norm": 1.7089074850082397,
"learning_rate": 1.1006625656508397e-07,
"loss": 0.1918,
"step": 9630
},
{
"epoch": 1.8817959980478283,
"grad_norm": 1.8887181282043457,
"learning_rate": 1.0654000356301541e-07,
"loss": 0.1834,
"step": 9640
},
{
"epoch": 1.8837481698389458,
"grad_norm": 1.394489049911499,
"learning_rate": 1.0307055199803573e-07,
"loss": 0.1751,
"step": 9650
},
{
"epoch": 1.8857003416300633,
"grad_norm": 1.383232593536377,
"learning_rate": 9.965794214223056e-08,
"loss": 0.1878,
"step": 9660
},
{
"epoch": 1.887652513421181,
"grad_norm": 1.9068111181259155,
"learning_rate": 9.630221360788728e-08,
"loss": 0.2025,
"step": 9670
},
{
"epoch": 1.8896046852122987,
"grad_norm": 1.5991621017456055,
"learning_rate": 9.300340534703634e-08,
"loss": 0.1905,
"step": 9680
},
{
"epoch": 1.8915568570034162,
"grad_norm": 2.5463223457336426,
"learning_rate": 8.976155565099953e-08,
"loss": 0.1821,
"step": 9690
},
{
"epoch": 1.8935090287945338,
"grad_norm": 1.5754520893096924,
"learning_rate": 8.657670214994418e-08,
"loss": 0.1762,
"step": 9700
},
{
"epoch": 1.8954612005856515,
"grad_norm": 1.4129728078842163,
"learning_rate": 8.344888181244847e-08,
"loss": 0.191,
"step": 9710
},
{
"epoch": 1.8974133723767692,
"grad_norm": 2.057875633239746,
"learning_rate": 8.037813094507018e-08,
"loss": 0.1849,
"step": 9720
},
{
"epoch": 1.8993655441678867,
"grad_norm": 1.4810922145843506,
"learning_rate": 7.736448519192752e-08,
"loss": 0.1987,
"step": 9730
},
{
"epoch": 1.9013177159590042,
"grad_norm": 1.797695517539978,
"learning_rate": 7.440797953428169e-08,
"loss": 0.169,
"step": 9740
},
{
"epoch": 1.903269887750122,
"grad_norm": 1.7072638273239136,
"learning_rate": 7.150864829013616e-08,
"loss": 0.189,
"step": 9750
},
{
"epoch": 1.9052220595412397,
"grad_norm": 2.731884002685547,
"learning_rate": 6.866652511383298e-08,
"loss": 0.207,
"step": 9760
},
{
"epoch": 1.9071742313323572,
"grad_norm": 1.4508967399597168,
"learning_rate": 6.588164299566546e-08,
"loss": 0.1822,
"step": 9770
},
{
"epoch": 1.9091264031234747,
"grad_norm": 1.656947374343872,
"learning_rate": 6.315403426149558e-08,
"loss": 0.1787,
"step": 9780
},
{
"epoch": 1.9110785749145924,
"grad_norm": 1.576837182044983,
"learning_rate": 6.048373057237489e-08,
"loss": 0.1699,
"step": 9790
},
{
"epoch": 1.9130307467057102,
"grad_norm": 1.460389494895935,
"learning_rate": 5.787076292418203e-08,
"loss": 0.196,
"step": 9800
},
{
"epoch": 1.9149829184968277,
"grad_norm": 1.6283118724822998,
"learning_rate": 5.531516164725858e-08,
"loss": 0.1989,
"step": 9810
},
{
"epoch": 1.9169350902879452,
"grad_norm": 1.3542603254318237,
"learning_rate": 5.281695640605988e-08,
"loss": 0.1888,
"step": 9820
},
{
"epoch": 1.918887262079063,
"grad_norm": 1.46894109249115,
"learning_rate": 5.03761761988103e-08,
"loss": 0.1874,
"step": 9830
},
{
"epoch": 1.9208394338701806,
"grad_norm": 1.651439905166626,
"learning_rate": 4.799284935716519e-08,
"loss": 0.1935,
"step": 9840
},
{
"epoch": 1.9227916056612981,
"grad_norm": 1.5114413499832153,
"learning_rate": 4.566700354588283e-08,
"loss": 0.1844,
"step": 9850
},
{
"epoch": 1.9247437774524156,
"grad_norm": 1.7399928569793701,
"learning_rate": 4.339866576250407e-08,
"loss": 0.1715,
"step": 9860
},
{
"epoch": 1.9266959492435334,
"grad_norm": 1.6063965559005737,
"learning_rate": 4.1187862337038195e-08,
"loss": 0.1751,
"step": 9870
},
{
"epoch": 1.928648121034651,
"grad_norm": 1.688035011291504,
"learning_rate": 3.90346189316565e-08,
"loss": 0.1775,
"step": 9880
},
{
"epoch": 1.9306002928257686,
"grad_norm": 1.5691899061203003,
"learning_rate": 3.6938960540396364e-08,
"loss": 0.1785,
"step": 9890
},
{
"epoch": 1.9325524646168861,
"grad_norm": 1.4794323444366455,
"learning_rate": 3.490091148886932e-08,
"loss": 0.1911,
"step": 9900
},
{
"epoch": 1.9345046364080039,
"grad_norm": 1.5015360116958618,
"learning_rate": 3.2920495433980125e-08,
"loss": 0.1772,
"step": 9910
},
{
"epoch": 1.9364568081991216,
"grad_norm": 1.6606435775756836,
"learning_rate": 3.099773536365036e-08,
"loss": 0.1876,
"step": 9920
},
{
"epoch": 1.938408979990239,
"grad_norm": 1.2969989776611328,
"learning_rate": 2.913265359655415e-08,
"loss": 0.1781,
"step": 9930
},
{
"epoch": 1.9403611517813566,
"grad_norm": 1.765059232711792,
"learning_rate": 2.7325271781856176e-08,
"loss": 0.1958,
"step": 9940
},
{
"epoch": 1.9423133235724743,
"grad_norm": 1.4548059701919556,
"learning_rate": 2.5575610898962987e-08,
"loss": 0.1799,
"step": 9950
},
{
"epoch": 1.944265495363592,
"grad_norm": 1.309770107269287,
"learning_rate": 2.3883691257277074e-08,
"loss": 0.1742,
"step": 9960
},
{
"epoch": 1.9462176671547096,
"grad_norm": 1.6486570835113525,
"learning_rate": 2.2249532495964287e-08,
"loss": 0.1692,
"step": 9970
},
{
"epoch": 1.948169838945827,
"grad_norm": 1.840489387512207,
"learning_rate": 2.0673153583722904e-08,
"loss": 0.1858,
"step": 9980
},
{
"epoch": 1.9501220107369448,
"grad_norm": 1.6692792177200317,
"learning_rate": 1.9154572818563254e-08,
"loss": 0.1904,
"step": 9990
},
{
"epoch": 1.9520741825280625,
"grad_norm": 1.500740647315979,
"learning_rate": 1.7693807827598998e-08,
"loss": 0.1858,
"step": 10000
},
{
"epoch": 1.95402635431918,
"grad_norm": 1.4872006177902222,
"learning_rate": 1.629087556683784e-08,
"loss": 0.1755,
"step": 10010
},
{
"epoch": 1.9559785261102975,
"grad_norm": 1.3963922262191772,
"learning_rate": 1.4945792320989472e-08,
"loss": 0.1785,
"step": 10020
},
{
"epoch": 1.9579306979014153,
"grad_norm": 1.4054490327835083,
"learning_rate": 1.3658573703271282e-08,
"loss": 0.1828,
"step": 10030
},
{
"epoch": 1.959882869692533,
"grad_norm": 1.5541080236434937,
"learning_rate": 1.242923465523238e-08,
"loss": 0.184,
"step": 10040
},
{
"epoch": 1.9618350414836505,
"grad_norm": 1.6898419857025146,
"learning_rate": 1.1257789446575407e-08,
"loss": 0.1759,
"step": 10050
},
{
"epoch": 1.963787213274768,
"grad_norm": 1.7505004405975342,
"learning_rate": 1.0144251674995553e-08,
"loss": 0.1915,
"step": 10060
},
{
"epoch": 1.9657393850658857,
"grad_norm": 1.5951542854309082,
"learning_rate": 9.088634266017915e-09,
"loss": 0.1736,
"step": 10070
},
{
"epoch": 1.9676915568570035,
"grad_norm": 1.5528618097305298,
"learning_rate": 8.09094947285205e-09,
"loss": 0.1866,
"step": 10080
},
{
"epoch": 1.969643728648121,
"grad_norm": 1.4379768371582031,
"learning_rate": 7.151208876245985e-09,
"loss": 0.1918,
"step": 10090
},
{
"epoch": 1.9715959004392385,
"grad_norm": 1.6501480340957642,
"learning_rate": 6.269423384353546e-09,
"loss": 0.1802,
"step": 10100
},
{
"epoch": 1.9735480722303562,
"grad_norm": 2.3685152530670166,
"learning_rate": 5.445603232608898e-09,
"loss": 0.1853,
"step": 10110
},
{
"epoch": 1.975500244021474,
"grad_norm": 1.6372160911560059,
"learning_rate": 4.679757983604983e-09,
"loss": 0.167,
"step": 10120
},
{
"epoch": 1.9774524158125915,
"grad_norm": 1.8266721963882446,
"learning_rate": 3.971896526984709e-09,
"loss": 0.1874,
"step": 10130
},
{
"epoch": 1.979404587603709,
"grad_norm": 1.66588294506073,
"learning_rate": 3.322027079336043e-09,
"loss": 0.1794,
"step": 10140
},
{
"epoch": 1.9813567593948267,
"grad_norm": 1.63362455368042,
"learning_rate": 2.7301571840993022e-09,
"loss": 0.1829,
"step": 10150
},
{
"epoch": 1.9833089311859444,
"grad_norm": 1.626795768737793,
"learning_rate": 2.1962937114766715e-09,
"loss": 0.1969,
"step": 10160
},
{
"epoch": 1.985261102977062,
"grad_norm": 1.3988165855407715,
"learning_rate": 1.7204428583533773e-09,
"loss": 0.1714,
"step": 10170
},
{
"epoch": 1.9872132747681794,
"grad_norm": 1.5400702953338623,
"learning_rate": 1.3026101482266352e-09,
"loss": 0.1771,
"step": 10180
},
{
"epoch": 1.9891654465592972,
"grad_norm": 1.419124960899353,
"learning_rate": 9.428004311412552e-10,
"loss": 0.1691,
"step": 10190
},
{
"epoch": 1.991117618350415,
"grad_norm": 1.6745810508728027,
"learning_rate": 6.410178836324666e-10,
"loss": 0.1791,
"step": 10200
},
{
"epoch": 1.9930697901415324,
"grad_norm": 1.6668099164962769,
"learning_rate": 3.9726600867817657e-10,
"loss": 0.1849,
"step": 10210
},
{
"epoch": 1.99502196193265,
"grad_norm": 1.4950345754623413,
"learning_rate": 2.1154763565844894e-10,
"loss": 0.2081,
"step": 10220
},
{
"epoch": 1.9969741337237676,
"grad_norm": 2.625748872756958,
"learning_rate": 8.386492032164129e-11,
"loss": 0.1656,
"step": 10230
},
{
"epoch": 1.9989263055148854,
"grad_norm": 1.790960431098938,
"learning_rate": 1.4219344760535436e-11,
"loss": 0.1974,
"step": 10240
}
],
"logging_steps": 10,
"max_steps": 10246,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2355982926700924e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}