Qwen3-8B-ODA-Math-460k / trainer_state.json
GX-XinGao's picture
Initial release v1.0
4e31e9f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 8661,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034644032565390613,
"grad_norm": 24.105030711898273,
"learning_rate": 5.190311418685121e-07,
"loss": 2.8149,
"step": 10
},
{
"epoch": 0.0069288065130781226,
"grad_norm": 13.101411602211778,
"learning_rate": 1.0957324106113035e-06,
"loss": 2.6533,
"step": 20
},
{
"epoch": 0.010393209769617183,
"grad_norm": 11.8076022930929,
"learning_rate": 1.6724336793540945e-06,
"loss": 2.4007,
"step": 30
},
{
"epoch": 0.013857613026156245,
"grad_norm": 4.7580336362411515,
"learning_rate": 2.249134948096886e-06,
"loss": 1.8865,
"step": 40
},
{
"epoch": 0.017322016282695307,
"grad_norm": 2.9114980071240395,
"learning_rate": 2.825836216839677e-06,
"loss": 1.4207,
"step": 50
},
{
"epoch": 0.020786419539234366,
"grad_norm": 1.7324565160527228,
"learning_rate": 3.402537485582469e-06,
"loss": 1.0113,
"step": 60
},
{
"epoch": 0.024250822795773428,
"grad_norm": 2.1155590344472155,
"learning_rate": 3.9792387543252595e-06,
"loss": 0.7313,
"step": 70
},
{
"epoch": 0.02771522605231249,
"grad_norm": 0.6681966791277029,
"learning_rate": 4.555940023068051e-06,
"loss": 0.5804,
"step": 80
},
{
"epoch": 0.03117962930885155,
"grad_norm": 2.046887683182866,
"learning_rate": 5.132641291810842e-06,
"loss": 0.5134,
"step": 90
},
{
"epoch": 0.034644032565390614,
"grad_norm": 1.0043907474739053,
"learning_rate": 5.709342560553633e-06,
"loss": 0.4659,
"step": 100
},
{
"epoch": 0.03810843582192967,
"grad_norm": 0.6198895627168727,
"learning_rate": 6.286043829296424e-06,
"loss": 0.4432,
"step": 110
},
{
"epoch": 0.04157283907846873,
"grad_norm": 0.5789693949791498,
"learning_rate": 6.862745098039216e-06,
"loss": 0.4208,
"step": 120
},
{
"epoch": 0.0450372423350078,
"grad_norm": 0.5151956509234089,
"learning_rate": 7.439446366782007e-06,
"loss": 0.4103,
"step": 130
},
{
"epoch": 0.048501645591546856,
"grad_norm": 0.6554047963560565,
"learning_rate": 8.016147635524798e-06,
"loss": 0.3938,
"step": 140
},
{
"epoch": 0.051966048848085915,
"grad_norm": 0.504860048364916,
"learning_rate": 8.592848904267588e-06,
"loss": 0.384,
"step": 150
},
{
"epoch": 0.05543045210462498,
"grad_norm": 0.6045057320065914,
"learning_rate": 9.169550173010382e-06,
"loss": 0.3781,
"step": 160
},
{
"epoch": 0.05889485536116404,
"grad_norm": 1.055910114221381,
"learning_rate": 9.746251441753172e-06,
"loss": 0.3695,
"step": 170
},
{
"epoch": 0.0623592586177031,
"grad_norm": 0.462726140305608,
"learning_rate": 1.0322952710495964e-05,
"loss": 0.3577,
"step": 180
},
{
"epoch": 0.06582366187424216,
"grad_norm": 0.4950470445818375,
"learning_rate": 1.0899653979238756e-05,
"loss": 0.3593,
"step": 190
},
{
"epoch": 0.06928806513078123,
"grad_norm": 0.4011301599299419,
"learning_rate": 1.1476355247981546e-05,
"loss": 0.3469,
"step": 200
},
{
"epoch": 0.07275246838732029,
"grad_norm": 1.5395956991225512,
"learning_rate": 1.2053056516724338e-05,
"loss": 0.3466,
"step": 210
},
{
"epoch": 0.07621687164385935,
"grad_norm": 0.4782287164298744,
"learning_rate": 1.2629757785467128e-05,
"loss": 0.3384,
"step": 220
},
{
"epoch": 0.0796812749003984,
"grad_norm": 0.4264649009974007,
"learning_rate": 1.3206459054209918e-05,
"loss": 0.335,
"step": 230
},
{
"epoch": 0.08314567815693746,
"grad_norm": 0.5500552360743377,
"learning_rate": 1.3783160322952712e-05,
"loss": 0.3294,
"step": 240
},
{
"epoch": 0.08661008141347652,
"grad_norm": 0.5287918177337965,
"learning_rate": 1.4359861591695503e-05,
"loss": 0.328,
"step": 250
},
{
"epoch": 0.0900744846700156,
"grad_norm": 0.4510077521972178,
"learning_rate": 1.4936562860438294e-05,
"loss": 0.3294,
"step": 260
},
{
"epoch": 0.09353888792655465,
"grad_norm": 0.47410104891468646,
"learning_rate": 1.5513264129181084e-05,
"loss": 0.322,
"step": 270
},
{
"epoch": 0.09700329118309371,
"grad_norm": 0.5064048354634346,
"learning_rate": 1.6089965397923876e-05,
"loss": 0.3222,
"step": 280
},
{
"epoch": 0.10046769443963277,
"grad_norm": 0.5163925193924696,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.3104,
"step": 290
},
{
"epoch": 0.10393209769617183,
"grad_norm": 0.5854453216694896,
"learning_rate": 1.7243367935409456e-05,
"loss": 0.3108,
"step": 300
},
{
"epoch": 0.1073965009527109,
"grad_norm": 0.47989721671414387,
"learning_rate": 1.782006920415225e-05,
"loss": 0.3122,
"step": 310
},
{
"epoch": 0.11086090420924996,
"grad_norm": 0.4198890022476431,
"learning_rate": 1.8396770472895043e-05,
"loss": 0.3013,
"step": 320
},
{
"epoch": 0.11432530746578902,
"grad_norm": 0.35484886374124996,
"learning_rate": 1.897347174163783e-05,
"loss": 0.308,
"step": 330
},
{
"epoch": 0.11778971072232808,
"grad_norm": 0.6798994381335608,
"learning_rate": 1.9550173010380623e-05,
"loss": 0.3017,
"step": 340
},
{
"epoch": 0.12125411397886714,
"grad_norm": 0.5104370022480478,
"learning_rate": 2.0126874279123415e-05,
"loss": 0.3009,
"step": 350
},
{
"epoch": 0.1247185172354062,
"grad_norm": 0.43181121899772806,
"learning_rate": 2.0703575547866204e-05,
"loss": 0.2984,
"step": 360
},
{
"epoch": 0.12818292049194527,
"grad_norm": 0.44631400797502857,
"learning_rate": 2.1280276816609e-05,
"loss": 0.3011,
"step": 370
},
{
"epoch": 0.1316473237484843,
"grad_norm": 1.3618600184569662,
"learning_rate": 2.185697808535179e-05,
"loss": 0.3028,
"step": 380
},
{
"epoch": 0.13511172700502339,
"grad_norm": 0.4288866277270222,
"learning_rate": 2.243367935409458e-05,
"loss": 0.2966,
"step": 390
},
{
"epoch": 0.13857613026156246,
"grad_norm": 0.43831756556428697,
"learning_rate": 2.301038062283737e-05,
"loss": 0.2916,
"step": 400
},
{
"epoch": 0.1420405335181015,
"grad_norm": 0.422949206010911,
"learning_rate": 2.3587081891580163e-05,
"loss": 0.2979,
"step": 410
},
{
"epoch": 0.14550493677464058,
"grad_norm": 0.5022509237000374,
"learning_rate": 2.4163783160322955e-05,
"loss": 0.2895,
"step": 420
},
{
"epoch": 0.14896934003117962,
"grad_norm": 0.49397467400296585,
"learning_rate": 2.4740484429065743e-05,
"loss": 0.2898,
"step": 430
},
{
"epoch": 0.1524337432877187,
"grad_norm": 0.3494967743075288,
"learning_rate": 2.531718569780854e-05,
"loss": 0.279,
"step": 440
},
{
"epoch": 0.15589814654425777,
"grad_norm": 0.45661458601823307,
"learning_rate": 2.5893886966551327e-05,
"loss": 0.291,
"step": 450
},
{
"epoch": 0.1593625498007968,
"grad_norm": 0.5438985700428273,
"learning_rate": 2.647058823529412e-05,
"loss": 0.29,
"step": 460
},
{
"epoch": 0.16282695305733588,
"grad_norm": 0.4788529828590826,
"learning_rate": 2.704728950403691e-05,
"loss": 0.2866,
"step": 470
},
{
"epoch": 0.16629135631387493,
"grad_norm": 0.4154664374547961,
"learning_rate": 2.7623990772779702e-05,
"loss": 0.2825,
"step": 480
},
{
"epoch": 0.169755759570414,
"grad_norm": 0.45665598437011556,
"learning_rate": 2.820069204152249e-05,
"loss": 0.2839,
"step": 490
},
{
"epoch": 0.17322016282695304,
"grad_norm": 0.5602183672259513,
"learning_rate": 2.8777393310265283e-05,
"loss": 0.285,
"step": 500
},
{
"epoch": 0.17668456608349212,
"grad_norm": 0.4595470907703161,
"learning_rate": 2.9354094579008075e-05,
"loss": 0.2845,
"step": 510
},
{
"epoch": 0.1801489693400312,
"grad_norm": 0.35972117453020297,
"learning_rate": 2.9930795847750863e-05,
"loss": 0.2772,
"step": 520
},
{
"epoch": 0.18361337259657023,
"grad_norm": 0.4722911335764227,
"learning_rate": 3.0507497116493655e-05,
"loss": 0.2774,
"step": 530
},
{
"epoch": 0.1870777758531093,
"grad_norm": 0.4212365397286255,
"learning_rate": 3.108419838523645e-05,
"loss": 0.2778,
"step": 540
},
{
"epoch": 0.19054217910964835,
"grad_norm": 0.4437718583316477,
"learning_rate": 3.166089965397924e-05,
"loss": 0.2763,
"step": 550
},
{
"epoch": 0.19400658236618742,
"grad_norm": 0.5795580292549793,
"learning_rate": 3.2237600922722034e-05,
"loss": 0.2777,
"step": 560
},
{
"epoch": 0.1974709856227265,
"grad_norm": 0.38801773200166495,
"learning_rate": 3.2814302191464826e-05,
"loss": 0.2782,
"step": 570
},
{
"epoch": 0.20093538887926554,
"grad_norm": 0.39386050551197965,
"learning_rate": 3.339100346020762e-05,
"loss": 0.2756,
"step": 580
},
{
"epoch": 0.20439979213580461,
"grad_norm": 0.4094962883057876,
"learning_rate": 3.39677047289504e-05,
"loss": 0.2725,
"step": 590
},
{
"epoch": 0.20786419539234366,
"grad_norm": 0.46243563531324183,
"learning_rate": 3.4544405997693194e-05,
"loss": 0.2679,
"step": 600
},
{
"epoch": 0.21132859864888273,
"grad_norm": 0.44066593111685165,
"learning_rate": 3.5121107266435986e-05,
"loss": 0.2785,
"step": 610
},
{
"epoch": 0.2147930019054218,
"grad_norm": 0.3559776358805474,
"learning_rate": 3.569780853517878e-05,
"loss": 0.2717,
"step": 620
},
{
"epoch": 0.21825740516196085,
"grad_norm": 0.46610388128285485,
"learning_rate": 3.627450980392157e-05,
"loss": 0.2697,
"step": 630
},
{
"epoch": 0.22172180841849992,
"grad_norm": 0.40480766460689316,
"learning_rate": 3.685121107266436e-05,
"loss": 0.2703,
"step": 640
},
{
"epoch": 0.22518621167503897,
"grad_norm": 0.4307168132201555,
"learning_rate": 3.7427912341407154e-05,
"loss": 0.2732,
"step": 650
},
{
"epoch": 0.22865061493157804,
"grad_norm": 0.4259888241847833,
"learning_rate": 3.800461361014994e-05,
"loss": 0.2701,
"step": 660
},
{
"epoch": 0.23211501818811708,
"grad_norm": 0.45912634835480254,
"learning_rate": 3.858131487889274e-05,
"loss": 0.2715,
"step": 670
},
{
"epoch": 0.23557942144465616,
"grad_norm": 0.4028410357459808,
"learning_rate": 3.915801614763553e-05,
"loss": 0.2722,
"step": 680
},
{
"epoch": 0.23904382470119523,
"grad_norm": 0.48493013541150326,
"learning_rate": 3.973471741637832e-05,
"loss": 0.2671,
"step": 690
},
{
"epoch": 0.24250822795773427,
"grad_norm": 0.3565349550219391,
"learning_rate": 4.031141868512111e-05,
"loss": 0.2663,
"step": 700
},
{
"epoch": 0.24597263121427335,
"grad_norm": 0.32316152566585826,
"learning_rate": 4.0888119953863905e-05,
"loss": 0.2725,
"step": 710
},
{
"epoch": 0.2494370344708124,
"grad_norm": 0.5136690083745088,
"learning_rate": 4.146482122260669e-05,
"loss": 0.2655,
"step": 720
},
{
"epoch": 0.25290143772735146,
"grad_norm": 0.40785543102527044,
"learning_rate": 4.204152249134948e-05,
"loss": 0.2668,
"step": 730
},
{
"epoch": 0.25636584098389054,
"grad_norm": 0.38517456097463165,
"learning_rate": 4.2618223760092274e-05,
"loss": 0.257,
"step": 740
},
{
"epoch": 0.2598302442404296,
"grad_norm": 0.38202204582529214,
"learning_rate": 4.3194925028835065e-05,
"loss": 0.2657,
"step": 750
},
{
"epoch": 0.2632946474969686,
"grad_norm": 0.5057316578616624,
"learning_rate": 4.377162629757786e-05,
"loss": 0.2669,
"step": 760
},
{
"epoch": 0.2667590507535077,
"grad_norm": 0.4288116267424748,
"learning_rate": 4.434832756632065e-05,
"loss": 0.2636,
"step": 770
},
{
"epoch": 0.27022345401004677,
"grad_norm": 0.37093943217557107,
"learning_rate": 4.4925028835063434e-05,
"loss": 0.2647,
"step": 780
},
{
"epoch": 0.27368785726658584,
"grad_norm": 0.34353485608546885,
"learning_rate": 4.5501730103806226e-05,
"loss": 0.2598,
"step": 790
},
{
"epoch": 0.2771522605231249,
"grad_norm": 0.35239301626467756,
"learning_rate": 4.607843137254902e-05,
"loss": 0.2646,
"step": 800
},
{
"epoch": 0.28061666377966393,
"grad_norm": 0.3842430524450989,
"learning_rate": 4.6655132641291816e-05,
"loss": 0.2619,
"step": 810
},
{
"epoch": 0.284081067036203,
"grad_norm": 0.35682883446316815,
"learning_rate": 4.723183391003461e-05,
"loss": 0.2561,
"step": 820
},
{
"epoch": 0.2875454702927421,
"grad_norm": 0.3734875488865965,
"learning_rate": 4.78085351787774e-05,
"loss": 0.2584,
"step": 830
},
{
"epoch": 0.29100987354928115,
"grad_norm": 0.4315870557925161,
"learning_rate": 4.8385236447520185e-05,
"loss": 0.2613,
"step": 840
},
{
"epoch": 0.2944742768058202,
"grad_norm": 0.4103742391218009,
"learning_rate": 4.896193771626298e-05,
"loss": 0.2624,
"step": 850
},
{
"epoch": 0.29793868006235924,
"grad_norm": 0.3591092100235818,
"learning_rate": 4.953863898500577e-05,
"loss": 0.2597,
"step": 860
},
{
"epoch": 0.3014030833188983,
"grad_norm": 0.3623515286988262,
"learning_rate": 4.999999187639266e-05,
"loss": 0.2593,
"step": 870
},
{
"epoch": 0.3048674865754374,
"grad_norm": 0.3048594211509306,
"learning_rate": 4.999970755069012e-05,
"loss": 0.2601,
"step": 880
},
{
"epoch": 0.30833188983197646,
"grad_norm": 0.45002930011250586,
"learning_rate": 4.9999017049900046e-05,
"loss": 0.2543,
"step": 890
},
{
"epoch": 0.31179629308851553,
"grad_norm": 0.35358665328747896,
"learning_rate": 4.999792038524113e-05,
"loss": 0.2559,
"step": 900
},
{
"epoch": 0.31526069634505455,
"grad_norm": 0.2967473429143083,
"learning_rate": 4.9996417574531085e-05,
"loss": 0.2576,
"step": 910
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.38465662558696595,
"learning_rate": 4.9994508642186376e-05,
"loss": 0.2603,
"step": 920
},
{
"epoch": 0.3221895028581327,
"grad_norm": 0.31492562032892685,
"learning_rate": 4.9992193619221796e-05,
"loss": 0.2576,
"step": 930
},
{
"epoch": 0.32565390611467177,
"grad_norm": 0.34729263239739333,
"learning_rate": 4.998947254324998e-05,
"loss": 0.2547,
"step": 940
},
{
"epoch": 0.3291183093712108,
"grad_norm": 0.3488064460618837,
"learning_rate": 4.998634545848076e-05,
"loss": 0.2515,
"step": 950
},
{
"epoch": 0.33258271262774985,
"grad_norm": 0.26471687398645055,
"learning_rate": 4.9982812415720496e-05,
"loss": 0.2574,
"step": 960
},
{
"epoch": 0.3360471158842889,
"grad_norm": 0.2644922688346079,
"learning_rate": 4.997887347237122e-05,
"loss": 0.2586,
"step": 970
},
{
"epoch": 0.339511519140828,
"grad_norm": 0.7989330166644754,
"learning_rate": 4.99745286924297e-05,
"loss": 0.2552,
"step": 980
},
{
"epoch": 0.3429759223973671,
"grad_norm": 0.26650336882677617,
"learning_rate": 4.9969778146486424e-05,
"loss": 0.2524,
"step": 990
},
{
"epoch": 0.3464403256539061,
"grad_norm": 0.2899936429270715,
"learning_rate": 4.996462191172443e-05,
"loss": 0.2543,
"step": 1000
},
{
"epoch": 0.34990472891044516,
"grad_norm": 0.26123609655286284,
"learning_rate": 4.9959060071918055e-05,
"loss": 0.2533,
"step": 1010
},
{
"epoch": 0.35336913216698423,
"grad_norm": 0.33570339771203644,
"learning_rate": 4.99530927174316e-05,
"loss": 0.2563,
"step": 1020
},
{
"epoch": 0.3568335354235233,
"grad_norm": 0.3014454205427117,
"learning_rate": 4.9946719945217814e-05,
"loss": 0.2499,
"step": 1030
},
{
"epoch": 0.3602979386800624,
"grad_norm": 0.27325651600933015,
"learning_rate": 4.9939941858816366e-05,
"loss": 0.2511,
"step": 1040
},
{
"epoch": 0.3637623419366014,
"grad_norm": 0.32465788692548037,
"learning_rate": 4.9932758568352144e-05,
"loss": 0.2479,
"step": 1050
},
{
"epoch": 0.36722674519314047,
"grad_norm": 0.2652235360508281,
"learning_rate": 4.9925170190533454e-05,
"loss": 0.2517,
"step": 1060
},
{
"epoch": 0.37069114844967954,
"grad_norm": 0.3970874000599188,
"learning_rate": 4.991717684865014e-05,
"loss": 0.2476,
"step": 1070
},
{
"epoch": 0.3741555517062186,
"grad_norm": 0.2722867182843485,
"learning_rate": 4.990877867257157e-05,
"loss": 0.2529,
"step": 1080
},
{
"epoch": 0.3776199549627577,
"grad_norm": 0.2619052903321827,
"learning_rate": 4.989997579874454e-05,
"loss": 0.2469,
"step": 1090
},
{
"epoch": 0.3810843582192967,
"grad_norm": 0.2955918646015672,
"learning_rate": 4.9890768370191046e-05,
"loss": 0.2502,
"step": 1100
},
{
"epoch": 0.3845487614758358,
"grad_norm": 0.387831684494149,
"learning_rate": 4.988115653650596e-05,
"loss": 0.2425,
"step": 1110
},
{
"epoch": 0.38801316473237485,
"grad_norm": 0.32722861574894646,
"learning_rate": 4.98711404538546e-05,
"loss": 0.248,
"step": 1120
},
{
"epoch": 0.3914775679889139,
"grad_norm": 0.29851671037489946,
"learning_rate": 4.986072028497021e-05,
"loss": 0.2477,
"step": 1130
},
{
"epoch": 0.394941971245453,
"grad_norm": 0.32504507070930905,
"learning_rate": 4.984989619915128e-05,
"loss": 0.2483,
"step": 1140
},
{
"epoch": 0.398406374501992,
"grad_norm": 0.2992367844222804,
"learning_rate": 4.9838668372258844e-05,
"loss": 0.2434,
"step": 1150
},
{
"epoch": 0.4018707777585311,
"grad_norm": 0.35335395639024886,
"learning_rate": 4.982703698671356e-05,
"loss": 0.2515,
"step": 1160
},
{
"epoch": 0.40533518101507016,
"grad_norm": 0.2724105498054968,
"learning_rate": 4.9815002231492806e-05,
"loss": 0.2422,
"step": 1170
},
{
"epoch": 0.40879958427160923,
"grad_norm": 0.26081802141731797,
"learning_rate": 4.9802564302127584e-05,
"loss": 0.2477,
"step": 1180
},
{
"epoch": 0.4122639875281483,
"grad_norm": 0.26509078283121335,
"learning_rate": 4.978972340069934e-05,
"loss": 0.2428,
"step": 1190
},
{
"epoch": 0.4157283907846873,
"grad_norm": 0.3268402808784247,
"learning_rate": 4.977647973583669e-05,
"loss": 0.245,
"step": 1200
},
{
"epoch": 0.4191927940412264,
"grad_norm": 0.26931679508398965,
"learning_rate": 4.9762833522712e-05,
"loss": 0.2461,
"step": 1210
},
{
"epoch": 0.42265719729776546,
"grad_norm": 0.28796254136886007,
"learning_rate": 4.9748784983037955e-05,
"loss": 0.2464,
"step": 1220
},
{
"epoch": 0.42612160055430454,
"grad_norm": 0.2903443309315686,
"learning_rate": 4.9734334345063884e-05,
"loss": 0.2462,
"step": 1230
},
{
"epoch": 0.4295860038108436,
"grad_norm": 0.2474488914617406,
"learning_rate": 4.971948184357211e-05,
"loss": 0.241,
"step": 1240
},
{
"epoch": 0.4330504070673826,
"grad_norm": 0.255276664365212,
"learning_rate": 4.970422771987411e-05,
"loss": 0.239,
"step": 1250
},
{
"epoch": 0.4365148103239217,
"grad_norm": 0.24990399483865394,
"learning_rate": 4.968857222180656e-05,
"loss": 0.2466,
"step": 1260
},
{
"epoch": 0.43997921358046077,
"grad_norm": 0.35045726721574383,
"learning_rate": 4.9672515603727385e-05,
"loss": 0.2423,
"step": 1270
},
{
"epoch": 0.44344361683699984,
"grad_norm": 0.24435346062621818,
"learning_rate": 4.965605812651155e-05,
"loss": 0.2407,
"step": 1280
},
{
"epoch": 0.44690802009353886,
"grad_norm": 0.24411085271480595,
"learning_rate": 4.96392000575469e-05,
"loss": 0.2381,
"step": 1290
},
{
"epoch": 0.45037242335007793,
"grad_norm": 0.2774791617724539,
"learning_rate": 4.962194167072971e-05,
"loss": 0.2397,
"step": 1300
},
{
"epoch": 0.453836826606617,
"grad_norm": 0.33768453163385515,
"learning_rate": 4.960428324646036e-05,
"loss": 0.2391,
"step": 1310
},
{
"epoch": 0.4573012298631561,
"grad_norm": 0.26599849394188135,
"learning_rate": 4.958622507163868e-05,
"loss": 0.2372,
"step": 1320
},
{
"epoch": 0.46076563311969515,
"grad_norm": 0.34654816144848893,
"learning_rate": 4.9567767439659315e-05,
"loss": 0.2405,
"step": 1330
},
{
"epoch": 0.46423003637623417,
"grad_norm": 0.2964452922825174,
"learning_rate": 4.954891065040701e-05,
"loss": 0.2424,
"step": 1340
},
{
"epoch": 0.46769443963277324,
"grad_norm": 0.26622673063396735,
"learning_rate": 4.952965501025165e-05,
"loss": 0.2396,
"step": 1350
},
{
"epoch": 0.4711588428893123,
"grad_norm": 0.25967043414851687,
"learning_rate": 4.9510000832043356e-05,
"loss": 0.2421,
"step": 1360
},
{
"epoch": 0.4746232461458514,
"grad_norm": 0.22164309133223403,
"learning_rate": 4.948994843510737e-05,
"loss": 0.2429,
"step": 1370
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.3316472047464383,
"learning_rate": 4.9469498145238855e-05,
"loss": 0.2426,
"step": 1380
},
{
"epoch": 0.4815520526589295,
"grad_norm": 0.3356169844309959,
"learning_rate": 4.944865029469764e-05,
"loss": 0.2355,
"step": 1390
},
{
"epoch": 0.48501645591546855,
"grad_norm": 0.268486815716583,
"learning_rate": 4.9427405222202784e-05,
"loss": 0.2368,
"step": 1400
},
{
"epoch": 0.4884808591720076,
"grad_norm": 0.2334743696157166,
"learning_rate": 4.9405763272927086e-05,
"loss": 0.2439,
"step": 1410
},
{
"epoch": 0.4919452624285467,
"grad_norm": 0.2957491864509931,
"learning_rate": 4.938372479849149e-05,
"loss": 0.237,
"step": 1420
},
{
"epoch": 0.49540966568508576,
"grad_norm": 0.26896125577295615,
"learning_rate": 4.936129015695936e-05,
"loss": 0.2354,
"step": 1430
},
{
"epoch": 0.4988740689416248,
"grad_norm": 0.25816020668305967,
"learning_rate": 4.9338459712830656e-05,
"loss": 0.2374,
"step": 1440
},
{
"epoch": 0.5023384721981639,
"grad_norm": 0.24981495367506942,
"learning_rate": 4.9315233837036016e-05,
"loss": 0.2332,
"step": 1450
},
{
"epoch": 0.5058028754547029,
"grad_norm": 0.2554391345730699,
"learning_rate": 4.9291612906930754e-05,
"loss": 0.2383,
"step": 1460
},
{
"epoch": 0.509267278711242,
"grad_norm": 0.26653068047488826,
"learning_rate": 4.926759730628868e-05,
"loss": 0.2411,
"step": 1470
},
{
"epoch": 0.5127316819677811,
"grad_norm": 0.2402237748204006,
"learning_rate": 4.9243187425295915e-05,
"loss": 0.2332,
"step": 1480
},
{
"epoch": 0.5161960852243201,
"grad_norm": 0.24413036670385513,
"learning_rate": 4.921838366054451e-05,
"loss": 0.2396,
"step": 1490
},
{
"epoch": 0.5196604884808592,
"grad_norm": 0.22909060071304008,
"learning_rate": 4.919318641502604e-05,
"loss": 0.2349,
"step": 1500
},
{
"epoch": 0.5231248917373982,
"grad_norm": 0.22701914207506868,
"learning_rate": 4.9167596098125036e-05,
"loss": 0.2324,
"step": 1510
},
{
"epoch": 0.5265892949939373,
"grad_norm": 0.25285812755375375,
"learning_rate": 4.9141613125612316e-05,
"loss": 0.2361,
"step": 1520
},
{
"epoch": 0.5300536982504763,
"grad_norm": 0.23531009106142328,
"learning_rate": 4.911523791963828e-05,
"loss": 0.2389,
"step": 1530
},
{
"epoch": 0.5335181015070154,
"grad_norm": 0.23036998762323121,
"learning_rate": 4.908847090872599e-05,
"loss": 0.2349,
"step": 1540
},
{
"epoch": 0.5369825047635545,
"grad_norm": 0.2771142315120943,
"learning_rate": 4.906131252776426e-05,
"loss": 0.2384,
"step": 1550
},
{
"epoch": 0.5404469080200935,
"grad_norm": 0.22571300533030822,
"learning_rate": 4.9033763218000555e-05,
"loss": 0.2307,
"step": 1560
},
{
"epoch": 0.5439113112766326,
"grad_norm": 0.2351873545158354,
"learning_rate": 4.9005823427033856e-05,
"loss": 0.2353,
"step": 1570
},
{
"epoch": 0.5473757145331717,
"grad_norm": 0.2540211756984853,
"learning_rate": 4.897749360880735e-05,
"loss": 0.2324,
"step": 1580
},
{
"epoch": 0.5508401177897108,
"grad_norm": 0.25457759095229565,
"learning_rate": 4.894877422360106e-05,
"loss": 0.233,
"step": 1590
},
{
"epoch": 0.5543045210462498,
"grad_norm": 0.270030962078855,
"learning_rate": 4.8919665738024424e-05,
"loss": 0.2415,
"step": 1600
},
{
"epoch": 0.5577689243027888,
"grad_norm": 0.2570316973112016,
"learning_rate": 4.8890168625008624e-05,
"loss": 0.2342,
"step": 1610
},
{
"epoch": 0.5612333275593279,
"grad_norm": 0.27096347274503246,
"learning_rate": 4.8860283363798974e-05,
"loss": 0.2279,
"step": 1620
},
{
"epoch": 0.5646977308158669,
"grad_norm": 0.24193443897499534,
"learning_rate": 4.8830010439947096e-05,
"loss": 0.2337,
"step": 1630
},
{
"epoch": 0.568162134072406,
"grad_norm": 0.29770893374153723,
"learning_rate": 4.879935034530304e-05,
"loss": 0.2308,
"step": 1640
},
{
"epoch": 0.5716265373289451,
"grad_norm": 0.25131732918770927,
"learning_rate": 4.876830357800729e-05,
"loss": 0.2294,
"step": 1650
},
{
"epoch": 0.5750909405854842,
"grad_norm": 0.35319194823747296,
"learning_rate": 4.87368706424827e-05,
"loss": 0.231,
"step": 1660
},
{
"epoch": 0.5785553438420232,
"grad_norm": 0.23506203264124303,
"learning_rate": 4.8705052049426254e-05,
"loss": 0.2353,
"step": 1670
},
{
"epoch": 0.5820197470985623,
"grad_norm": 0.26231316656015746,
"learning_rate": 4.867284831580078e-05,
"loss": 0.2379,
"step": 1680
},
{
"epoch": 0.5854841503551014,
"grad_norm": 0.23941392513796939,
"learning_rate": 4.8640259964826584e-05,
"loss": 0.2308,
"step": 1690
},
{
"epoch": 0.5889485536116404,
"grad_norm": 0.2531735045781861,
"learning_rate": 4.860728752597291e-05,
"loss": 0.2315,
"step": 1700
},
{
"epoch": 0.5924129568681794,
"grad_norm": 0.23922451826735386,
"learning_rate": 4.8573931534949354e-05,
"loss": 0.2334,
"step": 1710
},
{
"epoch": 0.5958773601247185,
"grad_norm": 0.26345705856662766,
"learning_rate": 4.8540192533697155e-05,
"loss": 0.2326,
"step": 1720
},
{
"epoch": 0.5993417633812576,
"grad_norm": 0.25324175129413945,
"learning_rate": 4.85060710703804e-05,
"loss": 0.2333,
"step": 1730
},
{
"epoch": 0.6028061666377966,
"grad_norm": 0.25577386258261664,
"learning_rate": 4.84715676993771e-05,
"loss": 0.2362,
"step": 1740
},
{
"epoch": 0.6062705698943357,
"grad_norm": 0.27948607132517533,
"learning_rate": 4.843668298127022e-05,
"loss": 0.2304,
"step": 1750
},
{
"epoch": 0.6097349731508748,
"grad_norm": 0.2560173418476038,
"learning_rate": 4.840141748283851e-05,
"loss": 0.2362,
"step": 1760
},
{
"epoch": 0.6131993764074138,
"grad_norm": 0.24729226299066756,
"learning_rate": 4.8365771777047356e-05,
"loss": 0.2317,
"step": 1770
},
{
"epoch": 0.6166637796639529,
"grad_norm": 0.2568219765818277,
"learning_rate": 4.832974644303944e-05,
"loss": 0.2393,
"step": 1780
},
{
"epoch": 0.620128182920492,
"grad_norm": 0.2306059252401264,
"learning_rate": 4.829334206612534e-05,
"loss": 0.2367,
"step": 1790
},
{
"epoch": 0.6235925861770311,
"grad_norm": 0.31020261629615153,
"learning_rate": 4.8256559237774e-05,
"loss": 0.2326,
"step": 1800
},
{
"epoch": 0.62705698943357,
"grad_norm": 0.27321518814530843,
"learning_rate": 4.821939855560318e-05,
"loss": 0.2341,
"step": 1810
},
{
"epoch": 0.6305213926901091,
"grad_norm": 0.3177547595944014,
"learning_rate": 4.8181860623369646e-05,
"loss": 0.235,
"step": 1820
},
{
"epoch": 0.6339857959466482,
"grad_norm": 0.2631656385352305,
"learning_rate": 4.814394605095946e-05,
"loss": 0.2325,
"step": 1830
},
{
"epoch": 0.6374501992031872,
"grad_norm": 0.19501128079802074,
"learning_rate": 4.810565545437802e-05,
"loss": 0.2318,
"step": 1840
},
{
"epoch": 0.6409146024597263,
"grad_norm": 0.23539186170837376,
"learning_rate": 4.806698945574006e-05,
"loss": 0.2322,
"step": 1850
},
{
"epoch": 0.6443790057162654,
"grad_norm": 0.2474455157721665,
"learning_rate": 4.8027948683259546e-05,
"loss": 0.2319,
"step": 1860
},
{
"epoch": 0.6478434089728045,
"grad_norm": 0.21665268684721978,
"learning_rate": 4.798853377123948e-05,
"loss": 0.2277,
"step": 1870
},
{
"epoch": 0.6513078122293435,
"grad_norm": 0.24192469144287532,
"learning_rate": 4.794874536006152e-05,
"loss": 0.2263,
"step": 1880
},
{
"epoch": 0.6547722154858826,
"grad_norm": 0.2372551879205524,
"learning_rate": 4.790858409617573e-05,
"loss": 0.227,
"step": 1890
},
{
"epoch": 0.6582366187424216,
"grad_norm": 0.23445846730497275,
"learning_rate": 4.786805063208992e-05,
"loss": 0.2349,
"step": 1900
},
{
"epoch": 0.6617010219989606,
"grad_norm": 0.22188437526617993,
"learning_rate": 4.782714562635914e-05,
"loss": 0.2298,
"step": 1910
},
{
"epoch": 0.6651654252554997,
"grad_norm": 0.22756117335293902,
"learning_rate": 4.7785869743574915e-05,
"loss": 0.2245,
"step": 1920
},
{
"epoch": 0.6686298285120388,
"grad_norm": 0.20717649696487608,
"learning_rate": 4.7744223654354506e-05,
"loss": 0.2331,
"step": 1930
},
{
"epoch": 0.6720942317685779,
"grad_norm": 0.21788412171242105,
"learning_rate": 4.7702208035329996e-05,
"loss": 0.2207,
"step": 1940
},
{
"epoch": 0.6755586350251169,
"grad_norm": 0.23790582790590165,
"learning_rate": 4.765982356913728e-05,
"loss": 0.2299,
"step": 1950
},
{
"epoch": 0.679023038281656,
"grad_norm": 0.254738526683431,
"learning_rate": 4.7617070944404975e-05,
"loss": 0.2277,
"step": 1960
},
{
"epoch": 0.6824874415381951,
"grad_norm": 0.2628656210512643,
"learning_rate": 4.757395085574326e-05,
"loss": 0.2297,
"step": 1970
},
{
"epoch": 0.6859518447947341,
"grad_norm": 0.20591189751455907,
"learning_rate": 4.7530464003732545e-05,
"loss": 0.2248,
"step": 1980
},
{
"epoch": 0.6894162480512732,
"grad_norm": 0.2576351313383852,
"learning_rate": 4.7486611094912146e-05,
"loss": 0.2251,
"step": 1990
},
{
"epoch": 0.6928806513078122,
"grad_norm": 0.20856524397248571,
"learning_rate": 4.744239284176876e-05,
"loss": 0.2291,
"step": 2000
},
{
"epoch": 0.6963450545643513,
"grad_norm": 0.2186827003230637,
"learning_rate": 4.73978099627249e-05,
"loss": 0.2229,
"step": 2010
},
{
"epoch": 0.6998094578208903,
"grad_norm": 0.20914686503222382,
"learning_rate": 4.7352863182127246e-05,
"loss": 0.2206,
"step": 2020
},
{
"epoch": 0.7032738610774294,
"grad_norm": 0.22559468853582607,
"learning_rate": 4.730755323023482e-05,
"loss": 0.2319,
"step": 2030
},
{
"epoch": 0.7067382643339685,
"grad_norm": 0.23247057053881534,
"learning_rate": 4.72618808432072e-05,
"loss": 0.2261,
"step": 2040
},
{
"epoch": 0.7102026675905075,
"grad_norm": 0.22994852903066657,
"learning_rate": 4.7215846763092486e-05,
"loss": 0.2275,
"step": 2050
},
{
"epoch": 0.7136670708470466,
"grad_norm": 0.21562072555254103,
"learning_rate": 4.716945173781528e-05,
"loss": 0.2275,
"step": 2060
},
{
"epoch": 0.7171314741035857,
"grad_norm": 0.1964162535848221,
"learning_rate": 4.7122696521164564e-05,
"loss": 0.2267,
"step": 2070
},
{
"epoch": 0.7205958773601248,
"grad_norm": 0.23405157909799276,
"learning_rate": 4.7075581872781375e-05,
"loss": 0.2293,
"step": 2080
},
{
"epoch": 0.7240602806166638,
"grad_norm": 0.22767428517421429,
"learning_rate": 4.7028108558146526e-05,
"loss": 0.2273,
"step": 2090
},
{
"epoch": 0.7275246838732028,
"grad_norm": 0.28600372042549504,
"learning_rate": 4.698027734856816e-05,
"loss": 0.2297,
"step": 2100
},
{
"epoch": 0.7309890871297419,
"grad_norm": 0.2298031761231605,
"learning_rate": 4.693208902116918e-05,
"loss": 0.2227,
"step": 2110
},
{
"epoch": 0.7344534903862809,
"grad_norm": 0.2116599416195417,
"learning_rate": 4.688354435887467e-05,
"loss": 0.2248,
"step": 2120
},
{
"epoch": 0.73791789364282,
"grad_norm": 0.20160724544535827,
"learning_rate": 4.683464415039918e-05,
"loss": 0.2197,
"step": 2130
},
{
"epoch": 0.7413822968993591,
"grad_norm": 0.22040076084389237,
"learning_rate": 4.678538919023383e-05,
"loss": 0.2306,
"step": 2140
},
{
"epoch": 0.7448467001558982,
"grad_norm": 0.24384427587101223,
"learning_rate": 4.673578027863351e-05,
"loss": 0.226,
"step": 2150
},
{
"epoch": 0.7483111034124372,
"grad_norm": 0.21129176345098202,
"learning_rate": 4.6685818221603804e-05,
"loss": 0.2298,
"step": 2160
},
{
"epoch": 0.7517755066689763,
"grad_norm": 0.2763625585319092,
"learning_rate": 4.663550383088792e-05,
"loss": 0.2253,
"step": 2170
},
{
"epoch": 0.7552399099255154,
"grad_norm": 0.21704903419555124,
"learning_rate": 4.6584837923953516e-05,
"loss": 0.2215,
"step": 2180
},
{
"epoch": 0.7587043131820544,
"grad_norm": 0.21517162264960232,
"learning_rate": 4.653382132397938e-05,
"loss": 0.2251,
"step": 2190
},
{
"epoch": 0.7621687164385934,
"grad_norm": 0.2018666939738909,
"learning_rate": 4.648245485984207e-05,
"loss": 0.2239,
"step": 2200
},
{
"epoch": 0.7656331196951325,
"grad_norm": 0.18928657263154086,
"learning_rate": 4.64307393661025e-05,
"loss": 0.2222,
"step": 2210
},
{
"epoch": 0.7690975229516716,
"grad_norm": 0.23077452960308834,
"learning_rate": 4.63786756829923e-05,
"loss": 0.2254,
"step": 2220
},
{
"epoch": 0.7725619262082106,
"grad_norm": 0.19369514792151565,
"learning_rate": 4.63262646564002e-05,
"loss": 0.2214,
"step": 2230
},
{
"epoch": 0.7760263294647497,
"grad_norm": 0.19788506342985074,
"learning_rate": 4.627350713785829e-05,
"loss": 0.2199,
"step": 2240
},
{
"epoch": 0.7794907327212888,
"grad_norm": 0.1892817176323716,
"learning_rate": 4.622040398452819e-05,
"loss": 0.2209,
"step": 2250
},
{
"epoch": 0.7829551359778278,
"grad_norm": 0.21413217334371104,
"learning_rate": 4.616695605918712e-05,
"loss": 0.2259,
"step": 2260
},
{
"epoch": 0.7864195392343669,
"grad_norm": 0.2516901660820717,
"learning_rate": 4.6113164230213844e-05,
"loss": 0.2224,
"step": 2270
},
{
"epoch": 0.789883942490906,
"grad_norm": 0.23275016126252404,
"learning_rate": 4.605902937157465e-05,
"loss": 0.2269,
"step": 2280
},
{
"epoch": 0.793348345747445,
"grad_norm": 0.2121809202061214,
"learning_rate": 4.600455236280905e-05,
"loss": 0.2231,
"step": 2290
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.19630769074682533,
"learning_rate": 4.5949734089015544e-05,
"loss": 0.2207,
"step": 2300
},
{
"epoch": 0.8002771522605231,
"grad_norm": 0.21355754308769215,
"learning_rate": 4.589457544083725e-05,
"loss": 0.224,
"step": 2310
},
{
"epoch": 0.8037415555170622,
"grad_norm": 0.21050957794732314,
"learning_rate": 4.5839077314447385e-05,
"loss": 0.2238,
"step": 2320
},
{
"epoch": 0.8072059587736012,
"grad_norm": 0.22620144254655492,
"learning_rate": 4.578324061153477e-05,
"loss": 0.2252,
"step": 2330
},
{
"epoch": 0.8106703620301403,
"grad_norm": 0.19924326647557386,
"learning_rate": 4.5727066239289117e-05,
"loss": 0.2239,
"step": 2340
},
{
"epoch": 0.8141347652866794,
"grad_norm": 0.22935473461207423,
"learning_rate": 4.5670555110386316e-05,
"loss": 0.222,
"step": 2350
},
{
"epoch": 0.8175991685432185,
"grad_norm": 0.2651712827400202,
"learning_rate": 4.561370814297363e-05,
"loss": 0.2225,
"step": 2360
},
{
"epoch": 0.8210635717997575,
"grad_norm": 0.2182312756063536,
"learning_rate": 4.555652626065473e-05,
"loss": 0.2238,
"step": 2370
},
{
"epoch": 0.8245279750562966,
"grad_norm": 0.20447815035283293,
"learning_rate": 4.549901039247474e-05,
"loss": 0.2212,
"step": 2380
},
{
"epoch": 0.8279923783128356,
"grad_norm": 0.17858513767092687,
"learning_rate": 4.544116147290509e-05,
"loss": 0.223,
"step": 2390
},
{
"epoch": 0.8314567815693746,
"grad_norm": 0.21126944513637871,
"learning_rate": 4.5382980441828385e-05,
"loss": 0.2253,
"step": 2400
},
{
"epoch": 0.8349211848259137,
"grad_norm": 0.2343427045564657,
"learning_rate": 4.5324468244523086e-05,
"loss": 0.2176,
"step": 2410
},
{
"epoch": 0.8383855880824528,
"grad_norm": 0.19348098291090032,
"learning_rate": 4.52656258316482e-05,
"loss": 0.2171,
"step": 2420
},
{
"epoch": 0.8418499913389919,
"grad_norm": 0.18351830211981723,
"learning_rate": 4.5206454159227783e-05,
"loss": 0.2209,
"step": 2430
},
{
"epoch": 0.8453143945955309,
"grad_norm": 0.21496767724862215,
"learning_rate": 4.514695418863547e-05,
"loss": 0.2209,
"step": 2440
},
{
"epoch": 0.84877879785207,
"grad_norm": 0.19946299406668855,
"learning_rate": 4.508712688657879e-05,
"loss": 0.2202,
"step": 2450
},
{
"epoch": 0.8522432011086091,
"grad_norm": 0.19377744464389568,
"learning_rate": 4.50269732250835e-05,
"loss": 0.2201,
"step": 2460
},
{
"epoch": 0.8557076043651481,
"grad_norm": 0.2366384856135975,
"learning_rate": 4.496649418147778e-05,
"loss": 0.2149,
"step": 2470
},
{
"epoch": 0.8591720076216872,
"grad_norm": 0.21068619938805416,
"learning_rate": 4.490569073837636e-05,
"loss": 0.2184,
"step": 2480
},
{
"epoch": 0.8626364108782262,
"grad_norm": 0.2315187829791829,
"learning_rate": 4.4844563883664554e-05,
"loss": 0.222,
"step": 2490
},
{
"epoch": 0.8661008141347653,
"grad_norm": 0.18649456076146348,
"learning_rate": 4.478311461048219e-05,
"loss": 0.2209,
"step": 2500
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.23315042553004378,
"learning_rate": 4.472134391720751e-05,
"loss": 0.2224,
"step": 2510
},
{
"epoch": 0.8730296206478434,
"grad_norm": 0.1951271064725934,
"learning_rate": 4.465925280744094e-05,
"loss": 0.2239,
"step": 2520
},
{
"epoch": 0.8764940239043825,
"grad_norm": 0.1913261231013653,
"learning_rate": 4.459684228998873e-05,
"loss": 0.2179,
"step": 2530
},
{
"epoch": 0.8799584271609215,
"grad_norm": 0.19492329842413578,
"learning_rate": 4.453411337884666e-05,
"loss": 0.2162,
"step": 2540
},
{
"epoch": 0.8834228304174606,
"grad_norm": 0.2055009502555625,
"learning_rate": 4.4471067093183475e-05,
"loss": 0.2165,
"step": 2550
},
{
"epoch": 0.8868872336739997,
"grad_norm": 0.19549219766107842,
"learning_rate": 4.4407704457324394e-05,
"loss": 0.2158,
"step": 2560
},
{
"epoch": 0.8903516369305388,
"grad_norm": 0.1850796577943829,
"learning_rate": 4.4344026500734415e-05,
"loss": 0.2172,
"step": 2570
},
{
"epoch": 0.8938160401870777,
"grad_norm": 0.19643486062098017,
"learning_rate": 4.428003425800164e-05,
"loss": 0.2208,
"step": 2580
},
{
"epoch": 0.8972804434436168,
"grad_norm": 0.21099302341164966,
"learning_rate": 4.4215728768820406e-05,
"loss": 0.2194,
"step": 2590
},
{
"epoch": 0.9007448467001559,
"grad_norm": 0.24764922928675714,
"learning_rate": 4.415111107797445e-05,
"loss": 0.2193,
"step": 2600
},
{
"epoch": 0.9042092499566949,
"grad_norm": 0.22064705519176767,
"learning_rate": 4.4086182235319904e-05,
"loss": 0.2148,
"step": 2610
},
{
"epoch": 0.907673653213234,
"grad_norm": 0.2131199868612154,
"learning_rate": 4.402094329576825e-05,
"loss": 0.2233,
"step": 2620
},
{
"epoch": 0.9111380564697731,
"grad_norm": 0.2184325870922078,
"learning_rate": 4.395539531926914e-05,
"loss": 0.2227,
"step": 2630
},
{
"epoch": 0.9146024597263122,
"grad_norm": 0.20257196030375807,
"learning_rate": 4.388953937079327e-05,
"loss": 0.2145,
"step": 2640
},
{
"epoch": 0.9180668629828512,
"grad_norm": 0.2052608168052879,
"learning_rate": 4.3823376520314964e-05,
"loss": 0.2176,
"step": 2650
},
{
"epoch": 0.9215312662393903,
"grad_norm": 0.20116038778213127,
"learning_rate": 4.3756907842794855e-05,
"loss": 0.2209,
"step": 2660
},
{
"epoch": 0.9249956694959294,
"grad_norm": 0.2094115752703769,
"learning_rate": 4.369013441816242e-05,
"loss": 0.2186,
"step": 2670
},
{
"epoch": 0.9284600727524683,
"grad_norm": 0.21116605286596796,
"learning_rate": 4.362305733129841e-05,
"loss": 0.2177,
"step": 2680
},
{
"epoch": 0.9319244760090074,
"grad_norm": 0.2355250090651405,
"learning_rate": 4.355567767201725e-05,
"loss": 0.216,
"step": 2690
},
{
"epoch": 0.9353888792655465,
"grad_norm": 0.21712887070689368,
"learning_rate": 4.3487996535049296e-05,
"loss": 0.2194,
"step": 2700
},
{
"epoch": 0.9388532825220856,
"grad_norm": 0.21715912584157251,
"learning_rate": 4.342001502002309e-05,
"loss": 0.2187,
"step": 2710
},
{
"epoch": 0.9423176857786246,
"grad_norm": 0.23840653028574527,
"learning_rate": 4.3351734231447436e-05,
"loss": 0.2127,
"step": 2720
},
{
"epoch": 0.9457820890351637,
"grad_norm": 0.18655391680385583,
"learning_rate": 4.328315527869357e-05,
"loss": 0.2213,
"step": 2730
},
{
"epoch": 0.9492464922917028,
"grad_norm": 0.20595739324476314,
"learning_rate": 4.321427927597697e-05,
"loss": 0.2172,
"step": 2740
},
{
"epoch": 0.9527108955482418,
"grad_norm": 0.16761465601551198,
"learning_rate": 4.31451073423394e-05,
"loss": 0.2153,
"step": 2750
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.18621316019170056,
"learning_rate": 4.3075640601630664e-05,
"loss": 0.2156,
"step": 2760
},
{
"epoch": 0.95963970206132,
"grad_norm": 0.17305717604504345,
"learning_rate": 4.300588018249033e-05,
"loss": 0.2129,
"step": 2770
},
{
"epoch": 0.963104105317859,
"grad_norm": 0.18044851481990093,
"learning_rate": 4.2935827218329434e-05,
"loss": 0.2154,
"step": 2780
},
{
"epoch": 0.966568508574398,
"grad_norm": 0.20518001148705736,
"learning_rate": 4.2865482847312043e-05,
"loss": 0.2132,
"step": 2790
},
{
"epoch": 0.9700329118309371,
"grad_norm": 0.3349155151219487,
"learning_rate": 4.279484821233678e-05,
"loss": 0.2202,
"step": 2800
},
{
"epoch": 0.9734973150874762,
"grad_norm": 0.18282251380283435,
"learning_rate": 4.2723924461018225e-05,
"loss": 0.2186,
"step": 2810
},
{
"epoch": 0.9769617183440152,
"grad_norm": 0.2173702476740624,
"learning_rate": 4.265271274566829e-05,
"loss": 0.22,
"step": 2820
},
{
"epoch": 0.9804261216005543,
"grad_norm": 0.19184429102715528,
"learning_rate": 4.2581214223277495e-05,
"loss": 0.2077,
"step": 2830
},
{
"epoch": 0.9838905248570934,
"grad_norm": 0.20991477426484062,
"learning_rate": 4.250943005549618e-05,
"loss": 0.2208,
"step": 2840
},
{
"epoch": 0.9873549281136325,
"grad_norm": 0.2285331325208064,
"learning_rate": 4.2437361408615614e-05,
"loss": 0.2151,
"step": 2850
},
{
"epoch": 0.9908193313701715,
"grad_norm": 0.2148804733407693,
"learning_rate": 4.2365009453549046e-05,
"loss": 0.2216,
"step": 2860
},
{
"epoch": 0.9942837346267106,
"grad_norm": 0.23671514755116507,
"learning_rate": 4.22923753658127e-05,
"loss": 0.2197,
"step": 2870
},
{
"epoch": 0.9977481378832496,
"grad_norm": 0.21988104341412565,
"learning_rate": 4.221946032550665e-05,
"loss": 0.2143,
"step": 2880
},
{
"epoch": 1.0010393209769617,
"grad_norm": 0.20204127186990958,
"learning_rate": 4.214626551729569e-05,
"loss": 0.2095,
"step": 2890
},
{
"epoch": 1.0045037242335009,
"grad_norm": 0.19674007109910524,
"learning_rate": 4.207279213039003e-05,
"loss": 0.192,
"step": 2900
},
{
"epoch": 1.0079681274900398,
"grad_norm": 0.18771217665260514,
"learning_rate": 4.199904135852598e-05,
"loss": 0.1936,
"step": 2910
},
{
"epoch": 1.0114325307465788,
"grad_norm": 0.2045135750011863,
"learning_rate": 4.192501439994664e-05,
"loss": 0.1942,
"step": 2920
},
{
"epoch": 1.014896934003118,
"grad_norm": 0.18370214635615664,
"learning_rate": 4.185071245738231e-05,
"loss": 0.1891,
"step": 2930
},
{
"epoch": 1.018361337259657,
"grad_norm": 0.18667193497723014,
"learning_rate": 4.177613673803106e-05,
"loss": 0.1944,
"step": 2940
},
{
"epoch": 1.0218257405161961,
"grad_norm": 0.1791392276679071,
"learning_rate": 4.170128845353902e-05,
"loss": 0.189,
"step": 2950
},
{
"epoch": 1.025290143772735,
"grad_norm": 0.16725027485938548,
"learning_rate": 4.162616881998075e-05,
"loss": 0.1985,
"step": 2960
},
{
"epoch": 1.0287545470292743,
"grad_norm": 0.2029662849352073,
"learning_rate": 4.155077905783949e-05,
"loss": 0.1938,
"step": 2970
},
{
"epoch": 1.0322189502858132,
"grad_norm": 0.21676480591029273,
"learning_rate": 4.14751203919873e-05,
"loss": 0.1938,
"step": 2980
},
{
"epoch": 1.0356833535423524,
"grad_norm": 0.18194277030338057,
"learning_rate": 4.1399194051665146e-05,
"loss": 0.1943,
"step": 2990
},
{
"epoch": 1.0391477567988914,
"grad_norm": 0.22187757323234317,
"learning_rate": 4.1323001270463e-05,
"loss": 0.1956,
"step": 3000
},
{
"epoch": 1.0426121600554306,
"grad_norm": 0.17994994526393276,
"learning_rate": 4.1246543286299714e-05,
"loss": 0.196,
"step": 3010
},
{
"epoch": 1.0460765633119695,
"grad_norm": 0.2006941244750973,
"learning_rate": 4.1169821341402956e-05,
"loss": 0.1928,
"step": 3020
},
{
"epoch": 1.0495409665685085,
"grad_norm": 0.18127266652072002,
"learning_rate": 4.109283668228903e-05,
"loss": 0.1883,
"step": 3030
},
{
"epoch": 1.0530053698250477,
"grad_norm": 0.17064172550512371,
"learning_rate": 4.101559055974258e-05,
"loss": 0.1944,
"step": 3040
},
{
"epoch": 1.0564697730815866,
"grad_norm": 0.17513257115164024,
"learning_rate": 4.09380842287963e-05,
"loss": 0.1924,
"step": 3050
},
{
"epoch": 1.0599341763381258,
"grad_norm": 0.17679873359445633,
"learning_rate": 4.0860318948710574e-05,
"loss": 0.1967,
"step": 3060
},
{
"epoch": 1.0633985795946648,
"grad_norm": 0.17716657233908994,
"learning_rate": 4.0782295982952954e-05,
"loss": 0.1904,
"step": 3070
},
{
"epoch": 1.066862982851204,
"grad_norm": 0.20587418971229102,
"learning_rate": 4.0704016599177655e-05,
"loss": 0.1944,
"step": 3080
},
{
"epoch": 1.070327386107743,
"grad_norm": 0.19036738753494378,
"learning_rate": 4.062548206920499e-05,
"loss": 0.1927,
"step": 3090
},
{
"epoch": 1.073791789364282,
"grad_norm": 0.19921829291538054,
"learning_rate": 4.054669366900066e-05,
"loss": 0.1917,
"step": 3100
},
{
"epoch": 1.077256192620821,
"grad_norm": 0.20317583824797536,
"learning_rate": 4.0467652678655056e-05,
"loss": 0.1914,
"step": 3110
},
{
"epoch": 1.08072059587736,
"grad_norm": 0.1848009075000212,
"learning_rate": 4.038836038236245e-05,
"loss": 0.1868,
"step": 3120
},
{
"epoch": 1.0841849991338992,
"grad_norm": 0.2082390217876895,
"learning_rate": 4.0308818068400125e-05,
"loss": 0.1897,
"step": 3130
},
{
"epoch": 1.0876494023904382,
"grad_norm": 0.23521100374901133,
"learning_rate": 4.022902702910745e-05,
"loss": 0.1849,
"step": 3140
},
{
"epoch": 1.0911138056469774,
"grad_norm": 0.18191505448762496,
"learning_rate": 4.014898856086489e-05,
"loss": 0.1909,
"step": 3150
},
{
"epoch": 1.0945782089035163,
"grad_norm": 0.19423731854983112,
"learning_rate": 4.006870396407294e-05,
"loss": 0.1925,
"step": 3160
},
{
"epoch": 1.0980426121600555,
"grad_norm": 0.17415753971247672,
"learning_rate": 3.998817454313096e-05,
"loss": 0.1961,
"step": 3170
},
{
"epoch": 1.1015070154165945,
"grad_norm": 0.17868260694918264,
"learning_rate": 3.9907401606416054e-05,
"loss": 0.1984,
"step": 3180
},
{
"epoch": 1.1049714186731336,
"grad_norm": 0.16583150690918594,
"learning_rate": 3.9826386466261765e-05,
"loss": 0.1948,
"step": 3190
},
{
"epoch": 1.1084358219296726,
"grad_norm": 0.19154779416193093,
"learning_rate": 3.9745130438936744e-05,
"loss": 0.187,
"step": 3200
},
{
"epoch": 1.1119002251862118,
"grad_norm": 0.18741249451292638,
"learning_rate": 3.96636348446234e-05,
"loss": 0.1907,
"step": 3210
},
{
"epoch": 1.1153646284427507,
"grad_norm": 0.18322276372490287,
"learning_rate": 3.958190100739643e-05,
"loss": 0.1872,
"step": 3220
},
{
"epoch": 1.1188290316992897,
"grad_norm": 0.18729813363303588,
"learning_rate": 3.94999302552013e-05,
"loss": 0.1942,
"step": 3230
},
{
"epoch": 1.122293434955829,
"grad_norm": 0.19705653891217498,
"learning_rate": 3.941772391983271e-05,
"loss": 0.1912,
"step": 3240
},
{
"epoch": 1.1257578382123679,
"grad_norm": 0.17707126700086195,
"learning_rate": 3.9335283336912873e-05,
"loss": 0.192,
"step": 3250
},
{
"epoch": 1.129222241468907,
"grad_norm": 0.17283665381012417,
"learning_rate": 3.925260984586991e-05,
"loss": 0.1904,
"step": 3260
},
{
"epoch": 1.132686644725446,
"grad_norm": 0.18010720437594213,
"learning_rate": 3.916970478991604e-05,
"loss": 0.1943,
"step": 3270
},
{
"epoch": 1.1361510479819852,
"grad_norm": 0.15919684820270277,
"learning_rate": 3.908656951602574e-05,
"loss": 0.1897,
"step": 3280
},
{
"epoch": 1.1396154512385241,
"grad_norm": 0.1931598714015755,
"learning_rate": 3.9003205374913906e-05,
"loss": 0.1901,
"step": 3290
},
{
"epoch": 1.1430798544950633,
"grad_norm": 0.17545621617123003,
"learning_rate": 3.891961372101387e-05,
"loss": 0.1869,
"step": 3300
},
{
"epoch": 1.1465442577516023,
"grad_norm": 0.17341255082466775,
"learning_rate": 3.883579591245542e-05,
"loss": 0.1899,
"step": 3310
},
{
"epoch": 1.1500086610081413,
"grad_norm": 0.16204232239326744,
"learning_rate": 3.8751753311042704e-05,
"loss": 0.1897,
"step": 3320
},
{
"epoch": 1.1534730642646804,
"grad_norm": 0.17553623491312426,
"learning_rate": 3.8667487282232144e-05,
"loss": 0.187,
"step": 3330
},
{
"epoch": 1.1569374675212194,
"grad_norm": 0.1774831508823347,
"learning_rate": 3.8582999195110215e-05,
"loss": 0.1943,
"step": 3340
},
{
"epoch": 1.1604018707777586,
"grad_norm": 0.17946778025767848,
"learning_rate": 3.849829042237123e-05,
"loss": 0.1929,
"step": 3350
},
{
"epoch": 1.1638662740342975,
"grad_norm": 0.17173672286352726,
"learning_rate": 3.841336234029501e-05,
"loss": 0.195,
"step": 3360
},
{
"epoch": 1.1673306772908367,
"grad_norm": 0.17994745465860756,
"learning_rate": 3.832821632872454e-05,
"loss": 0.19,
"step": 3370
},
{
"epoch": 1.1707950805473757,
"grad_norm": 0.1850088732651661,
"learning_rate": 3.8242853771043566e-05,
"loss": 0.1957,
"step": 3380
},
{
"epoch": 1.1742594838039149,
"grad_norm": 0.15908666104639177,
"learning_rate": 3.815727605415406e-05,
"loss": 0.1915,
"step": 3390
},
{
"epoch": 1.1777238870604538,
"grad_norm": 0.18156692156906576,
"learning_rate": 3.807148456845378e-05,
"loss": 0.188,
"step": 3400
},
{
"epoch": 1.1811882903169928,
"grad_norm": 0.17113960858415064,
"learning_rate": 3.798548070781357e-05,
"loss": 0.1893,
"step": 3410
},
{
"epoch": 1.184652693573532,
"grad_norm": 0.18236211522160944,
"learning_rate": 3.789926586955484e-05,
"loss": 0.1859,
"step": 3420
},
{
"epoch": 1.188117096830071,
"grad_norm": 0.16797194145110902,
"learning_rate": 3.7812841454426715e-05,
"loss": 0.1901,
"step": 3430
},
{
"epoch": 1.1915815000866101,
"grad_norm": 0.1988755262539599,
"learning_rate": 3.772620886658342e-05,
"loss": 0.1942,
"step": 3440
},
{
"epoch": 1.195045903343149,
"grad_norm": 0.18276504002195174,
"learning_rate": 3.7639369513561374e-05,
"loss": 0.1901,
"step": 3450
},
{
"epoch": 1.1985103065996883,
"grad_norm": 0.19420509251762302,
"learning_rate": 3.7552324806256356e-05,
"loss": 0.1893,
"step": 3460
},
{
"epoch": 1.2019747098562272,
"grad_norm": 0.1856528154175482,
"learning_rate": 3.7465076158900565e-05,
"loss": 0.1926,
"step": 3470
},
{
"epoch": 1.2054391131127664,
"grad_norm": 0.15954596519502212,
"learning_rate": 3.737762498903967e-05,
"loss": 0.1928,
"step": 3480
},
{
"epoch": 1.2089035163693054,
"grad_norm": 0.16035794848815701,
"learning_rate": 3.728997271750975e-05,
"loss": 0.1911,
"step": 3490
},
{
"epoch": 1.2123679196258443,
"grad_norm": 0.1860891805383301,
"learning_rate": 3.720212076841424e-05,
"loss": 0.1906,
"step": 3500
},
{
"epoch": 1.2158323228823835,
"grad_norm": 0.18968549769475154,
"learning_rate": 3.7114070569100745e-05,
"loss": 0.1915,
"step": 3510
},
{
"epoch": 1.2192967261389225,
"grad_norm": 0.18632674935712812,
"learning_rate": 3.70258235501379e-05,
"loss": 0.2005,
"step": 3520
},
{
"epoch": 1.2227611293954617,
"grad_norm": 0.17630114385410808,
"learning_rate": 3.693738114529211e-05,
"loss": 0.1932,
"step": 3530
},
{
"epoch": 1.2262255326520006,
"grad_norm": 0.17065976632694624,
"learning_rate": 3.6848744791504244e-05,
"loss": 0.1924,
"step": 3540
},
{
"epoch": 1.2296899359085398,
"grad_norm": 0.16123026277843103,
"learning_rate": 3.675991592886629e-05,
"loss": 0.1921,
"step": 3550
},
{
"epoch": 1.2331543391650788,
"grad_norm": 0.15903672080165102,
"learning_rate": 3.667089600059799e-05,
"loss": 0.1872,
"step": 3560
},
{
"epoch": 1.236618742421618,
"grad_norm": 0.17243581096797103,
"learning_rate": 3.658168645302333e-05,
"loss": 0.1933,
"step": 3570
},
{
"epoch": 1.240083145678157,
"grad_norm": 0.18100700093073868,
"learning_rate": 3.6492288735547104e-05,
"loss": 0.1951,
"step": 3580
},
{
"epoch": 1.2435475489346959,
"grad_norm": 0.17490828663332014,
"learning_rate": 3.640270430063133e-05,
"loss": 0.1914,
"step": 3590
},
{
"epoch": 1.247011952191235,
"grad_norm": 0.15940099515313444,
"learning_rate": 3.6312934603771674e-05,
"loss": 0.1894,
"step": 3600
},
{
"epoch": 1.2504763554477742,
"grad_norm": 0.18052826346513526,
"learning_rate": 3.622298110347377e-05,
"loss": 0.1891,
"step": 3610
},
{
"epoch": 1.2539407587043132,
"grad_norm": 0.16612752065480388,
"learning_rate": 3.613284526122954e-05,
"loss": 0.1908,
"step": 3620
},
{
"epoch": 1.2574051619608522,
"grad_norm": 0.18743922356428885,
"learning_rate": 3.604252854149347e-05,
"loss": 0.1883,
"step": 3630
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.181478054238508,
"learning_rate": 3.595203241165878e-05,
"loss": 0.1878,
"step": 3640
},
{
"epoch": 1.2643339684739303,
"grad_norm": 0.1804892389398887,
"learning_rate": 3.586135834203362e-05,
"loss": 0.1893,
"step": 3650
},
{
"epoch": 1.2677983717304695,
"grad_norm": 0.1741226930201246,
"learning_rate": 3.5770507805817135e-05,
"loss": 0.1912,
"step": 3660
},
{
"epoch": 1.2712627749870085,
"grad_norm": 0.16334377723582452,
"learning_rate": 3.5679482279075584e-05,
"loss": 0.1892,
"step": 3670
},
{
"epoch": 1.2747271782435474,
"grad_norm": 0.17047126134525575,
"learning_rate": 3.558828324071831e-05,
"loss": 0.1907,
"step": 3680
},
{
"epoch": 1.2781915815000866,
"grad_norm": 0.17298916476177462,
"learning_rate": 3.549691217247375e-05,
"loss": 0.1906,
"step": 3690
},
{
"epoch": 1.2816559847566258,
"grad_norm": 0.15717399662492984,
"learning_rate": 3.540537055886533e-05,
"loss": 0.1934,
"step": 3700
},
{
"epoch": 1.2851203880131647,
"grad_norm": 0.17584831578937637,
"learning_rate": 3.531365988718736e-05,
"loss": 0.1851,
"step": 3710
},
{
"epoch": 1.2885847912697037,
"grad_norm": 0.16205306891044194,
"learning_rate": 3.522178164748089e-05,
"loss": 0.1861,
"step": 3720
},
{
"epoch": 1.292049194526243,
"grad_norm": 0.15057670492815758,
"learning_rate": 3.5129737332509456e-05,
"loss": 0.1906,
"step": 3730
},
{
"epoch": 1.2955135977827819,
"grad_norm": 0.15822993127615093,
"learning_rate": 3.503752843773486e-05,
"loss": 0.1838,
"step": 3740
},
{
"epoch": 1.298978001039321,
"grad_norm": 0.15864785027235162,
"learning_rate": 3.4945156461292854e-05,
"loss": 0.1952,
"step": 3750
},
{
"epoch": 1.30244240429586,
"grad_norm": 0.18227602995981887,
"learning_rate": 3.485262290396883e-05,
"loss": 0.1898,
"step": 3760
},
{
"epoch": 1.305906807552399,
"grad_norm": 0.16200311831559636,
"learning_rate": 3.475992926917341e-05,
"loss": 0.1929,
"step": 3770
},
{
"epoch": 1.3093712108089381,
"grad_norm": 0.15259911553775118,
"learning_rate": 3.4667077062918e-05,
"loss": 0.1872,
"step": 3780
},
{
"epoch": 1.3128356140654773,
"grad_norm": 0.18125600690337332,
"learning_rate": 3.457406779379039e-05,
"loss": 0.1925,
"step": 3790
},
{
"epoch": 1.3163000173220163,
"grad_norm": 0.192188947967991,
"learning_rate": 3.448090297293016e-05,
"loss": 0.1871,
"step": 3800
},
{
"epoch": 1.3197644205785553,
"grad_norm": 0.17328021183720274,
"learning_rate": 3.438758411400421e-05,
"loss": 0.189,
"step": 3810
},
{
"epoch": 1.3232288238350944,
"grad_norm": 0.16788815861788564,
"learning_rate": 3.4294112733182084e-05,
"loss": 0.1916,
"step": 3820
},
{
"epoch": 1.3266932270916334,
"grad_norm": 0.15688398565420658,
"learning_rate": 3.420049034911139e-05,
"loss": 0.1882,
"step": 3830
},
{
"epoch": 1.3301576303481726,
"grad_norm": 0.1714798078013055,
"learning_rate": 3.410671848289315e-05,
"loss": 0.1882,
"step": 3840
},
{
"epoch": 1.3336220336047115,
"grad_norm": 0.16309203688542842,
"learning_rate": 3.401279865805702e-05,
"loss": 0.1914,
"step": 3850
},
{
"epoch": 1.3370864368612507,
"grad_norm": 0.15748111773147097,
"learning_rate": 3.391873240053656e-05,
"loss": 0.1866,
"step": 3860
},
{
"epoch": 1.3405508401177897,
"grad_norm": 0.1820518471519293,
"learning_rate": 3.382452123864448e-05,
"loss": 0.1865,
"step": 3870
},
{
"epoch": 1.3440152433743289,
"grad_norm": 0.15772861765330992,
"learning_rate": 3.373016670304774e-05,
"loss": 0.1928,
"step": 3880
},
{
"epoch": 1.3474796466308678,
"grad_norm": 0.15938918092133955,
"learning_rate": 3.3635670326742755e-05,
"loss": 0.1913,
"step": 3890
},
{
"epoch": 1.3509440498874068,
"grad_norm": 0.1721339531229463,
"learning_rate": 3.354103364503045e-05,
"loss": 0.1907,
"step": 3900
},
{
"epoch": 1.354408453143946,
"grad_norm": 0.15410885348307193,
"learning_rate": 3.3446258195491305e-05,
"loss": 0.1879,
"step": 3910
},
{
"epoch": 1.357872856400485,
"grad_norm": 0.16165063244129996,
"learning_rate": 3.3351345517960386e-05,
"loss": 0.1898,
"step": 3920
},
{
"epoch": 1.3613372596570241,
"grad_norm": 0.16569690135925177,
"learning_rate": 3.325629715450235e-05,
"loss": 0.1873,
"step": 3930
},
{
"epoch": 1.364801662913563,
"grad_norm": 0.16487442057352042,
"learning_rate": 3.3161114649386335e-05,
"loss": 0.1888,
"step": 3940
},
{
"epoch": 1.3682660661701023,
"grad_norm": 0.17731395997046875,
"learning_rate": 3.306579954906095e-05,
"loss": 0.1881,
"step": 3950
},
{
"epoch": 1.3717304694266412,
"grad_norm": 0.18039964361054459,
"learning_rate": 3.2970353402129065e-05,
"loss": 0.1873,
"step": 3960
},
{
"epoch": 1.3751948726831804,
"grad_norm": 0.17793927736310747,
"learning_rate": 3.287477775932271e-05,
"loss": 0.1912,
"step": 3970
},
{
"epoch": 1.3786592759397194,
"grad_norm": 0.1790070950599728,
"learning_rate": 3.2779074173477845e-05,
"loss": 0.1837,
"step": 3980
},
{
"epoch": 1.3821236791962583,
"grad_norm": 0.16536456635439983,
"learning_rate": 3.2683244199509164e-05,
"loss": 0.1822,
"step": 3990
},
{
"epoch": 1.3855880824527975,
"grad_norm": 0.17630304243331663,
"learning_rate": 3.258728939438479e-05,
"loss": 0.1875,
"step": 4000
},
{
"epoch": 1.3890524857093367,
"grad_norm": 0.17420428081742365,
"learning_rate": 3.249121131710102e-05,
"loss": 0.1883,
"step": 4010
},
{
"epoch": 1.3925168889658757,
"grad_norm": 0.17543770138618733,
"learning_rate": 3.239501152865698e-05,
"loss": 0.1871,
"step": 4020
},
{
"epoch": 1.3959812922224146,
"grad_norm": 0.1592954498035186,
"learning_rate": 3.229869159202925e-05,
"loss": 0.1845,
"step": 4030
},
{
"epoch": 1.3994456954789538,
"grad_norm": 0.1969202150910442,
"learning_rate": 3.2202253072146485e-05,
"loss": 0.1899,
"step": 4040
},
{
"epoch": 1.4029100987354928,
"grad_norm": 0.16611324585892465,
"learning_rate": 3.2105697535863974e-05,
"loss": 0.1884,
"step": 4050
},
{
"epoch": 1.406374501992032,
"grad_norm": 0.16159007802640257,
"learning_rate": 3.200902655193822e-05,
"loss": 0.185,
"step": 4060
},
{
"epoch": 1.409838905248571,
"grad_norm": 0.1580252831987218,
"learning_rate": 3.1912241691001396e-05,
"loss": 0.1883,
"step": 4070
},
{
"epoch": 1.4133033085051099,
"grad_norm": 0.17023457596293415,
"learning_rate": 3.181534452553589e-05,
"loss": 0.1864,
"step": 4080
},
{
"epoch": 1.416767711761649,
"grad_norm": 0.16337679260042998,
"learning_rate": 3.1718336629848674e-05,
"loss": 0.1877,
"step": 4090
},
{
"epoch": 1.4202321150181882,
"grad_norm": 0.1899260224945417,
"learning_rate": 3.162121958004584e-05,
"loss": 0.188,
"step": 4100
},
{
"epoch": 1.4236965182747272,
"grad_norm": 0.1901828231396822,
"learning_rate": 3.1523994954006875e-05,
"loss": 0.1868,
"step": 4110
},
{
"epoch": 1.4271609215312662,
"grad_norm": 0.1791821381475306,
"learning_rate": 3.142666433135911e-05,
"loss": 0.1889,
"step": 4120
},
{
"epoch": 1.4306253247878054,
"grad_norm": 0.16547869947285684,
"learning_rate": 3.132922929345199e-05,
"loss": 0.1884,
"step": 4130
},
{
"epoch": 1.4340897280443443,
"grad_norm": 0.1549076050063216,
"learning_rate": 3.123169142333145e-05,
"loss": 0.1884,
"step": 4140
},
{
"epoch": 1.4375541313008835,
"grad_norm": 0.15295140029133414,
"learning_rate": 3.1134052305714146e-05,
"loss": 0.1852,
"step": 4150
},
{
"epoch": 1.4410185345574225,
"grad_norm": 0.16777477030335766,
"learning_rate": 3.1036313526961716e-05,
"loss": 0.1849,
"step": 4160
},
{
"epoch": 1.4444829378139614,
"grad_norm": 0.14671702288800448,
"learning_rate": 3.093847667505502e-05,
"loss": 0.1918,
"step": 4170
},
{
"epoch": 1.4479473410705006,
"grad_norm": 0.15525539418252593,
"learning_rate": 3.084054333956833e-05,
"loss": 0.1866,
"step": 4180
},
{
"epoch": 1.4514117443270398,
"grad_norm": 0.16462043975916238,
"learning_rate": 3.0742515111643496e-05,
"loss": 0.1863,
"step": 4190
},
{
"epoch": 1.4548761475835787,
"grad_norm": 0.15447538862682672,
"learning_rate": 3.064439358396412e-05,
"loss": 0.1871,
"step": 4200
},
{
"epoch": 1.4583405508401177,
"grad_norm": 0.19391772756705172,
"learning_rate": 3.0546180350729646e-05,
"loss": 0.1907,
"step": 4210
},
{
"epoch": 1.461804954096657,
"grad_norm": 0.16839531563433688,
"learning_rate": 3.0447877007629494e-05,
"loss": 0.1872,
"step": 4220
},
{
"epoch": 1.4652693573531959,
"grad_norm": 0.1667638250714929,
"learning_rate": 3.0349485151817104e-05,
"loss": 0.1891,
"step": 4230
},
{
"epoch": 1.468733760609735,
"grad_norm": 0.15657114860167654,
"learning_rate": 3.0251006381884e-05,
"loss": 0.1888,
"step": 4240
},
{
"epoch": 1.472198163866274,
"grad_norm": 0.1616644177415703,
"learning_rate": 3.0152442297833817e-05,
"loss": 0.1866,
"step": 4250
},
{
"epoch": 1.475662567122813,
"grad_norm": 0.16936489968998167,
"learning_rate": 3.005379450105631e-05,
"loss": 0.1861,
"step": 4260
},
{
"epoch": 1.4791269703793521,
"grad_norm": 0.17783057212322842,
"learning_rate": 2.995506459430133e-05,
"loss": 0.1921,
"step": 4270
},
{
"epoch": 1.4825913736358913,
"grad_norm": 0.18253028978511793,
"learning_rate": 2.9856254181652777e-05,
"loss": 0.1931,
"step": 4280
},
{
"epoch": 1.4860557768924303,
"grad_norm": 0.17108679951308503,
"learning_rate": 2.9757364868502558e-05,
"loss": 0.187,
"step": 4290
},
{
"epoch": 1.4895201801489693,
"grad_norm": 0.17127968110442396,
"learning_rate": 2.9658398261524477e-05,
"loss": 0.1856,
"step": 4300
},
{
"epoch": 1.4929845834055084,
"grad_norm": 0.15243800455802872,
"learning_rate": 2.9559355968648163e-05,
"loss": 0.1878,
"step": 4310
},
{
"epoch": 1.4964489866620474,
"grad_norm": 0.16208265788754178,
"learning_rate": 2.9460239599032898e-05,
"loss": 0.1831,
"step": 4320
},
{
"epoch": 1.4999133899185866,
"grad_norm": 0.17721043816157045,
"learning_rate": 2.9361050763041552e-05,
"loss": 0.1855,
"step": 4330
},
{
"epoch": 1.5033777931751255,
"grad_norm": 0.1624251601306164,
"learning_rate": 2.926179107221433e-05,
"loss": 0.1905,
"step": 4340
},
{
"epoch": 1.5068421964316645,
"grad_norm": 0.17242171402814246,
"learning_rate": 2.916246213924263e-05,
"loss": 0.1892,
"step": 4350
},
{
"epoch": 1.5103065996882037,
"grad_norm": 0.18060628796516595,
"learning_rate": 2.9063065577942873e-05,
"loss": 0.1868,
"step": 4360
},
{
"epoch": 1.5137710029447429,
"grad_norm": 0.16420679560340506,
"learning_rate": 2.896360300323022e-05,
"loss": 0.189,
"step": 4370
},
{
"epoch": 1.5172354062012818,
"grad_norm": 0.1657190366825487,
"learning_rate": 2.8864076031092375e-05,
"loss": 0.1862,
"step": 4380
},
{
"epoch": 1.5206998094578208,
"grad_norm": 0.14827671598571107,
"learning_rate": 2.8764486278563313e-05,
"loss": 0.187,
"step": 4390
},
{
"epoch": 1.52416421271436,
"grad_norm": 0.1696723630570198,
"learning_rate": 2.8664835363697028e-05,
"loss": 0.1878,
"step": 4400
},
{
"epoch": 1.5276286159708992,
"grad_norm": 0.16415531283137735,
"learning_rate": 2.8565124905541224e-05,
"loss": 0.1884,
"step": 4410
},
{
"epoch": 1.5310930192274381,
"grad_norm": 0.1711666868251127,
"learning_rate": 2.8465356524111014e-05,
"loss": 0.1885,
"step": 4420
},
{
"epoch": 1.534557422483977,
"grad_norm": 0.15183248647521377,
"learning_rate": 2.8365531840362586e-05,
"loss": 0.185,
"step": 4430
},
{
"epoch": 1.538021825740516,
"grad_norm": 0.14509942126397887,
"learning_rate": 2.826565247616692e-05,
"loss": 0.1882,
"step": 4440
},
{
"epoch": 1.5414862289970552,
"grad_norm": 0.1638920162743926,
"learning_rate": 2.816572005428337e-05,
"loss": 0.1868,
"step": 4450
},
{
"epoch": 1.5449506322535944,
"grad_norm": 0.15578153131291148,
"learning_rate": 2.8065736198333337e-05,
"loss": 0.1811,
"step": 4460
},
{
"epoch": 1.5484150355101334,
"grad_norm": 0.16655001845904682,
"learning_rate": 2.796570253277389e-05,
"loss": 0.1867,
"step": 4470
},
{
"epoch": 1.5518794387666723,
"grad_norm": 0.16373974199509916,
"learning_rate": 2.786562068287134e-05,
"loss": 0.1841,
"step": 4480
},
{
"epoch": 1.5553438420232115,
"grad_norm": 0.17150797676546983,
"learning_rate": 2.7765492274674887e-05,
"loss": 0.1885,
"step": 4490
},
{
"epoch": 1.5588082452797507,
"grad_norm": 0.1477537468546959,
"learning_rate": 2.7665318934990153e-05,
"loss": 0.1836,
"step": 4500
},
{
"epoch": 1.5622726485362897,
"grad_norm": 0.1553596996678754,
"learning_rate": 2.7565102291352785e-05,
"loss": 0.1828,
"step": 4510
},
{
"epoch": 1.5657370517928286,
"grad_norm": 0.1653908672889608,
"learning_rate": 2.7464843972001985e-05,
"loss": 0.1841,
"step": 4520
},
{
"epoch": 1.5692014550493676,
"grad_norm": 0.16834209700469813,
"learning_rate": 2.7364545605854077e-05,
"loss": 0.1868,
"step": 4530
},
{
"epoch": 1.5726658583059068,
"grad_norm": 0.16296690623854307,
"learning_rate": 2.7264208822476016e-05,
"loss": 0.1871,
"step": 4540
},
{
"epoch": 1.576130261562446,
"grad_norm": 0.1465435846295834,
"learning_rate": 2.716383525205896e-05,
"loss": 0.1845,
"step": 4550
},
{
"epoch": 1.579594664818985,
"grad_norm": 0.16909003341569917,
"learning_rate": 2.7063426525391732e-05,
"loss": 0.1818,
"step": 4560
},
{
"epoch": 1.5830590680755239,
"grad_norm": 0.16069669006075715,
"learning_rate": 2.6962984273834346e-05,
"loss": 0.1844,
"step": 4570
},
{
"epoch": 1.586523471332063,
"grad_norm": 0.1750416978664114,
"learning_rate": 2.686251012929151e-05,
"loss": 0.1876,
"step": 4580
},
{
"epoch": 1.5899878745886022,
"grad_norm": 0.16259236460228596,
"learning_rate": 2.6762005724186084e-05,
"loss": 0.1817,
"step": 4590
},
{
"epoch": 1.5934522778451412,
"grad_norm": 0.14258206554047775,
"learning_rate": 2.6661472691432614e-05,
"loss": 0.1908,
"step": 4600
},
{
"epoch": 1.5969166811016802,
"grad_norm": 0.15573453304245355,
"learning_rate": 2.6560912664410724e-05,
"loss": 0.1847,
"step": 4610
},
{
"epoch": 1.6003810843582191,
"grad_norm": 0.1598455887435374,
"learning_rate": 2.646032727693864e-05,
"loss": 0.1883,
"step": 4620
},
{
"epoch": 1.6038454876147583,
"grad_norm": 0.16653009190610318,
"learning_rate": 2.6359718163246627e-05,
"loss": 0.1817,
"step": 4630
},
{
"epoch": 1.6073098908712975,
"grad_norm": 0.16139062116102534,
"learning_rate": 2.6259086957950434e-05,
"loss": 0.186,
"step": 4640
},
{
"epoch": 1.6107742941278365,
"grad_norm": 0.1408890193988467,
"learning_rate": 2.615843529602472e-05,
"loss": 0.1883,
"step": 4650
},
{
"epoch": 1.6142386973843754,
"grad_norm": 0.15912928421327938,
"learning_rate": 2.6057764812776524e-05,
"loss": 0.1818,
"step": 4660
},
{
"epoch": 1.6177031006409146,
"grad_norm": 0.1531498559224487,
"learning_rate": 2.595707714381867e-05,
"loss": 0.1815,
"step": 4670
},
{
"epoch": 1.6211675038974538,
"grad_norm": 0.15818993699570458,
"learning_rate": 2.585637392504321e-05,
"loss": 0.189,
"step": 4680
},
{
"epoch": 1.6246319071539927,
"grad_norm": 0.1550856346527227,
"learning_rate": 2.575565679259483e-05,
"loss": 0.1851,
"step": 4690
},
{
"epoch": 1.6280963104105317,
"grad_norm": 0.17234204432744546,
"learning_rate": 2.5654927382844274e-05,
"loss": 0.1856,
"step": 4700
},
{
"epoch": 1.631560713667071,
"grad_norm": 0.14720447626557742,
"learning_rate": 2.555418733236176e-05,
"loss": 0.1849,
"step": 4710
},
{
"epoch": 1.6350251169236099,
"grad_norm": 0.15155258081793999,
"learning_rate": 2.545343827789039e-05,
"loss": 0.185,
"step": 4720
},
{
"epoch": 1.638489520180149,
"grad_norm": 0.1585948164278231,
"learning_rate": 2.5352681856319556e-05,
"loss": 0.1818,
"step": 4730
},
{
"epoch": 1.641953923436688,
"grad_norm": 0.16020278019866965,
"learning_rate": 2.5251919704658323e-05,
"loss": 0.1847,
"step": 4740
},
{
"epoch": 1.645418326693227,
"grad_norm": 0.1619196729529339,
"learning_rate": 2.5151153460008898e-05,
"loss": 0.1851,
"step": 4750
},
{
"epoch": 1.6488827299497661,
"grad_norm": 0.17892607432188418,
"learning_rate": 2.5050384759539946e-05,
"loss": 0.1884,
"step": 4760
},
{
"epoch": 1.6523471332063053,
"grad_norm": 0.1453101210669921,
"learning_rate": 2.4949615240460053e-05,
"loss": 0.1829,
"step": 4770
},
{
"epoch": 1.6558115364628443,
"grad_norm": 0.15218576172212772,
"learning_rate": 2.4848846539991108e-05,
"loss": 0.1815,
"step": 4780
},
{
"epoch": 1.6592759397193833,
"grad_norm": 0.14916349890801867,
"learning_rate": 2.474808029534168e-05,
"loss": 0.1831,
"step": 4790
},
{
"epoch": 1.6627403429759224,
"grad_norm": 0.16152070439341162,
"learning_rate": 2.464731814368045e-05,
"loss": 0.1845,
"step": 4800
},
{
"epoch": 1.6662047462324616,
"grad_norm": 0.2549593516440478,
"learning_rate": 2.4546561722109614e-05,
"loss": 0.1821,
"step": 4810
},
{
"epoch": 1.6696691494890006,
"grad_norm": 0.1423882724472122,
"learning_rate": 2.4445812667638242e-05,
"loss": 0.1824,
"step": 4820
},
{
"epoch": 1.6731335527455395,
"grad_norm": 0.16756009716869377,
"learning_rate": 2.4345072617155732e-05,
"loss": 0.1861,
"step": 4830
},
{
"epoch": 1.6765979560020785,
"grad_norm": 0.16461215951788935,
"learning_rate": 2.424434320740518e-05,
"loss": 0.1832,
"step": 4840
},
{
"epoch": 1.6800623592586177,
"grad_norm": 0.14517098753523155,
"learning_rate": 2.4143626074956796e-05,
"loss": 0.1785,
"step": 4850
},
{
"epoch": 1.6835267625151569,
"grad_norm": 0.15108292412387223,
"learning_rate": 2.4042922856181337e-05,
"loss": 0.1827,
"step": 4860
},
{
"epoch": 1.6869911657716958,
"grad_norm": 0.15525515390004765,
"learning_rate": 2.394223518722348e-05,
"loss": 0.1838,
"step": 4870
},
{
"epoch": 1.6904555690282348,
"grad_norm": 0.15512655551886048,
"learning_rate": 2.3841564703975287e-05,
"loss": 0.1812,
"step": 4880
},
{
"epoch": 1.693919972284774,
"grad_norm": 0.16187683902086444,
"learning_rate": 2.374091304204958e-05,
"loss": 0.1832,
"step": 4890
},
{
"epoch": 1.6973843755413132,
"grad_norm": 0.15579000956703476,
"learning_rate": 2.364028183675337e-05,
"loss": 0.1806,
"step": 4900
},
{
"epoch": 1.7008487787978521,
"grad_norm": 0.15600046355547195,
"learning_rate": 2.353967272306137e-05,
"loss": 0.1844,
"step": 4910
},
{
"epoch": 1.704313182054391,
"grad_norm": 0.15757986710095487,
"learning_rate": 2.3439087335589285e-05,
"loss": 0.1841,
"step": 4920
},
{
"epoch": 1.70777758531093,
"grad_norm": 0.1452879429548844,
"learning_rate": 2.333852730856739e-05,
"loss": 0.1846,
"step": 4930
},
{
"epoch": 1.7112419885674692,
"grad_norm": 0.14279806262433956,
"learning_rate": 2.3237994275813918e-05,
"loss": 0.1846,
"step": 4940
},
{
"epoch": 1.7147063918240084,
"grad_norm": 0.168480732012115,
"learning_rate": 2.3137489870708494e-05,
"loss": 0.1854,
"step": 4950
},
{
"epoch": 1.7181707950805474,
"grad_norm": 0.14771704725144452,
"learning_rate": 2.303701572616566e-05,
"loss": 0.1781,
"step": 4960
},
{
"epoch": 1.7216351983370863,
"grad_norm": 0.15399297657379268,
"learning_rate": 2.2936573474608274e-05,
"loss": 0.1851,
"step": 4970
},
{
"epoch": 1.7250996015936255,
"grad_norm": 0.14268904873654373,
"learning_rate": 2.283616474794104e-05,
"loss": 0.1793,
"step": 4980
},
{
"epoch": 1.7285640048501647,
"grad_norm": 0.1458152221590982,
"learning_rate": 2.273579117752399e-05,
"loss": 0.18,
"step": 4990
},
{
"epoch": 1.7320284081067037,
"grad_norm": 0.157752398066733,
"learning_rate": 2.2635454394145926e-05,
"loss": 0.1804,
"step": 5000
},
{
"epoch": 1.7354928113632426,
"grad_norm": 0.16354623197521176,
"learning_rate": 2.253515602799802e-05,
"loss": 0.18,
"step": 5010
},
{
"epoch": 1.7389572146197816,
"grad_norm": 0.14987547413644828,
"learning_rate": 2.2434897708647225e-05,
"loss": 0.1884,
"step": 5020
},
{
"epoch": 1.7424216178763208,
"grad_norm": 0.13264921207181166,
"learning_rate": 2.233468106500985e-05,
"loss": 0.1817,
"step": 5030
},
{
"epoch": 1.74588602113286,
"grad_norm": 0.16074846687523298,
"learning_rate": 2.2234507725325115e-05,
"loss": 0.1821,
"step": 5040
},
{
"epoch": 1.749350424389399,
"grad_norm": 0.14912108468148355,
"learning_rate": 2.2134379317128666e-05,
"loss": 0.1831,
"step": 5050
},
{
"epoch": 1.7528148276459379,
"grad_norm": 0.15108125406874282,
"learning_rate": 2.2034297467226117e-05,
"loss": 0.1849,
"step": 5060
},
{
"epoch": 1.756279230902477,
"grad_norm": 0.1841322575834389,
"learning_rate": 2.193426380166667e-05,
"loss": 0.1814,
"step": 5070
},
{
"epoch": 1.7597436341590162,
"grad_norm": 0.15923619065201655,
"learning_rate": 2.183427994571663e-05,
"loss": 0.1826,
"step": 5080
},
{
"epoch": 1.7632080374155552,
"grad_norm": 0.15869671284509257,
"learning_rate": 2.1734347523833088e-05,
"loss": 0.1825,
"step": 5090
},
{
"epoch": 1.7666724406720942,
"grad_norm": 0.15221224358826752,
"learning_rate": 2.163446815963742e-05,
"loss": 0.182,
"step": 5100
},
{
"epoch": 1.7701368439286331,
"grad_norm": 0.1460640105447675,
"learning_rate": 2.1534643475888995e-05,
"loss": 0.1823,
"step": 5110
},
{
"epoch": 1.7736012471851723,
"grad_norm": 0.1521505743886422,
"learning_rate": 2.1434875094458785e-05,
"loss": 0.1874,
"step": 5120
},
{
"epoch": 1.7770656504417115,
"grad_norm": 0.14416303035440375,
"learning_rate": 2.133516463630297e-05,
"loss": 0.1808,
"step": 5130
},
{
"epoch": 1.7805300536982505,
"grad_norm": 0.1586845052440452,
"learning_rate": 2.1235513721436693e-05,
"loss": 0.1841,
"step": 5140
},
{
"epoch": 1.7839944569547894,
"grad_norm": 0.14468498059776927,
"learning_rate": 2.113592396890764e-05,
"loss": 0.181,
"step": 5150
},
{
"epoch": 1.7874588602113286,
"grad_norm": 0.14200788749614593,
"learning_rate": 2.1036396996769785e-05,
"loss": 0.1806,
"step": 5160
},
{
"epoch": 1.7909232634678678,
"grad_norm": 0.14145049327765216,
"learning_rate": 2.093693442205713e-05,
"loss": 0.1844,
"step": 5170
},
{
"epoch": 1.7943876667244067,
"grad_norm": 0.14047484844001326,
"learning_rate": 2.0837537860757378e-05,
"loss": 0.1856,
"step": 5180
},
{
"epoch": 1.7978520699809457,
"grad_norm": 0.15798548962919395,
"learning_rate": 2.073820892778568e-05,
"loss": 0.1781,
"step": 5190
},
{
"epoch": 1.801316473237485,
"grad_norm": 0.15599305625176324,
"learning_rate": 2.063894923695846e-05,
"loss": 0.1846,
"step": 5200
},
{
"epoch": 1.8047808764940239,
"grad_norm": 0.15451286542161505,
"learning_rate": 2.0539760400967105e-05,
"loss": 0.1814,
"step": 5210
},
{
"epoch": 1.808245279750563,
"grad_norm": 0.15151680436099774,
"learning_rate": 2.0440644031351846e-05,
"loss": 0.187,
"step": 5220
},
{
"epoch": 1.811709683007102,
"grad_norm": 0.16740847608877021,
"learning_rate": 2.0341601738475532e-05,
"loss": 0.1788,
"step": 5230
},
{
"epoch": 1.815174086263641,
"grad_norm": 0.15255748452386272,
"learning_rate": 2.0242635131497444e-05,
"loss": 0.1799,
"step": 5240
},
{
"epoch": 1.8186384895201801,
"grad_norm": 0.21370458116557772,
"learning_rate": 2.0143745818347226e-05,
"loss": 0.1859,
"step": 5250
},
{
"epoch": 1.8221028927767193,
"grad_norm": 0.15178882291540086,
"learning_rate": 2.004493540569867e-05,
"loss": 0.1816,
"step": 5260
},
{
"epoch": 1.8255672960332583,
"grad_norm": 0.1381809582876828,
"learning_rate": 1.9946205498943693e-05,
"loss": 0.1782,
"step": 5270
},
{
"epoch": 1.8290316992897973,
"grad_norm": 0.15231248860338162,
"learning_rate": 1.9847557702166185e-05,
"loss": 0.182,
"step": 5280
},
{
"epoch": 1.8324961025463364,
"grad_norm": 0.13540143942421462,
"learning_rate": 1.9748993618116003e-05,
"loss": 0.1802,
"step": 5290
},
{
"epoch": 1.8359605058028756,
"grad_norm": 0.15057686984095897,
"learning_rate": 1.9650514848182902e-05,
"loss": 0.1845,
"step": 5300
},
{
"epoch": 1.8394249090594146,
"grad_norm": 0.15657239979982757,
"learning_rate": 1.9552122992370515e-05,
"loss": 0.1816,
"step": 5310
},
{
"epoch": 1.8428893123159535,
"grad_norm": 0.14942811146448692,
"learning_rate": 1.9453819649270356e-05,
"loss": 0.1881,
"step": 5320
},
{
"epoch": 1.8463537155724925,
"grad_norm": 0.14206236299677405,
"learning_rate": 1.9355606416035893e-05,
"loss": 0.1798,
"step": 5330
},
{
"epoch": 1.8498181188290317,
"grad_norm": 0.15512685603221288,
"learning_rate": 1.925748488835651e-05,
"loss": 0.1787,
"step": 5340
},
{
"epoch": 1.8532825220855709,
"grad_norm": 0.14951858713327895,
"learning_rate": 1.9159456660431675e-05,
"loss": 0.1808,
"step": 5350
},
{
"epoch": 1.8567469253421098,
"grad_norm": 0.1518891829482899,
"learning_rate": 1.906152332494499e-05,
"loss": 0.1847,
"step": 5360
},
{
"epoch": 1.8602113285986488,
"grad_norm": 0.1599605482136836,
"learning_rate": 1.8963686473038286e-05,
"loss": 0.1758,
"step": 5370
},
{
"epoch": 1.863675731855188,
"grad_norm": 0.14097789058459068,
"learning_rate": 1.8865947694285863e-05,
"loss": 0.1814,
"step": 5380
},
{
"epoch": 1.8671401351117272,
"grad_norm": 0.13734664780231215,
"learning_rate": 1.876830857666855e-05,
"loss": 0.1813,
"step": 5390
},
{
"epoch": 1.8706045383682661,
"grad_norm": 0.15463681734647824,
"learning_rate": 1.867077070654802e-05,
"loss": 0.1807,
"step": 5400
},
{
"epoch": 1.874068941624805,
"grad_norm": 0.16737868012866194,
"learning_rate": 1.85733356686409e-05,
"loss": 0.1791,
"step": 5410
},
{
"epoch": 1.877533344881344,
"grad_norm": 0.1422554567768386,
"learning_rate": 1.847600504599312e-05,
"loss": 0.1812,
"step": 5420
},
{
"epoch": 1.8809977481378832,
"grad_norm": 0.1499775475287378,
"learning_rate": 1.8378780419954168e-05,
"loss": 0.1791,
"step": 5430
},
{
"epoch": 1.8844621513944224,
"grad_norm": 0.1620107234869922,
"learning_rate": 1.828166337015133e-05,
"loss": 0.1798,
"step": 5440
},
{
"epoch": 1.8879265546509614,
"grad_norm": 0.14943512022105485,
"learning_rate": 1.8184655474464122e-05,
"loss": 0.1769,
"step": 5450
},
{
"epoch": 1.8913909579075003,
"grad_norm": 0.1636002811165978,
"learning_rate": 1.8087758308998607e-05,
"loss": 0.1828,
"step": 5460
},
{
"epoch": 1.8948553611640395,
"grad_norm": 0.1488164351792101,
"learning_rate": 1.7990973448061788e-05,
"loss": 0.1793,
"step": 5470
},
{
"epoch": 1.8983197644205787,
"grad_norm": 0.1495335119087624,
"learning_rate": 1.7894302464136028e-05,
"loss": 0.1804,
"step": 5480
},
{
"epoch": 1.9017841676771177,
"grad_norm": 0.13881655138176263,
"learning_rate": 1.7797746927853524e-05,
"loss": 0.1808,
"step": 5490
},
{
"epoch": 1.9052485709336566,
"grad_norm": 0.14655156873185612,
"learning_rate": 1.770130840797075e-05,
"loss": 0.1824,
"step": 5500
},
{
"epoch": 1.9087129741901956,
"grad_norm": 0.1401693117484928,
"learning_rate": 1.7604988471343026e-05,
"loss": 0.1836,
"step": 5510
},
{
"epoch": 1.9121773774467348,
"grad_norm": 0.13663044985397266,
"learning_rate": 1.750878868289898e-05,
"loss": 0.179,
"step": 5520
},
{
"epoch": 1.915641780703274,
"grad_norm": 0.1440844464161243,
"learning_rate": 1.741271060561522e-05,
"loss": 0.1794,
"step": 5530
},
{
"epoch": 1.919106183959813,
"grad_norm": 0.1579545402684227,
"learning_rate": 1.731675580049085e-05,
"loss": 0.1762,
"step": 5540
},
{
"epoch": 1.9225705872163519,
"grad_norm": 0.14451711020259372,
"learning_rate": 1.7220925826522158e-05,
"loss": 0.179,
"step": 5550
},
{
"epoch": 1.926034990472891,
"grad_norm": 0.16318851480604618,
"learning_rate": 1.71252222406773e-05,
"loss": 0.1794,
"step": 5560
},
{
"epoch": 1.9294993937294302,
"grad_norm": 0.16754335602378248,
"learning_rate": 1.7029646597870934e-05,
"loss": 0.1809,
"step": 5570
},
{
"epoch": 1.9329637969859692,
"grad_norm": 0.14425404689789675,
"learning_rate": 1.693420045093905e-05,
"loss": 0.181,
"step": 5580
},
{
"epoch": 1.9364282002425082,
"grad_norm": 0.14950486896828924,
"learning_rate": 1.6838885350613664e-05,
"loss": 0.1834,
"step": 5590
},
{
"epoch": 1.9398926034990471,
"grad_norm": 0.15401643917378996,
"learning_rate": 1.674370284549765e-05,
"loss": 0.18,
"step": 5600
},
{
"epoch": 1.9433570067555863,
"grad_norm": 0.1457933918907984,
"learning_rate": 1.6648654482039616e-05,
"loss": 0.1798,
"step": 5610
},
{
"epoch": 1.9468214100121255,
"grad_norm": 0.15104837558607415,
"learning_rate": 1.6553741804508704e-05,
"loss": 0.1808,
"step": 5620
},
{
"epoch": 1.9502858132686645,
"grad_norm": 0.15234854679585655,
"learning_rate": 1.6458966354969553e-05,
"loss": 0.183,
"step": 5630
},
{
"epoch": 1.9537502165252034,
"grad_norm": 0.15054697844231169,
"learning_rate": 1.6364329673257244e-05,
"loss": 0.1782,
"step": 5640
},
{
"epoch": 1.9572146197817426,
"grad_norm": 0.19519516147127766,
"learning_rate": 1.6269833296952267e-05,
"loss": 0.1779,
"step": 5650
},
{
"epoch": 1.9606790230382818,
"grad_norm": 0.14953949146669174,
"learning_rate": 1.617547876135553e-05,
"loss": 0.1817,
"step": 5660
},
{
"epoch": 1.9641434262948207,
"grad_norm": 0.14098620367788547,
"learning_rate": 1.6081267599463446e-05,
"loss": 0.1795,
"step": 5670
},
{
"epoch": 1.9676078295513597,
"grad_norm": 0.1516762253034418,
"learning_rate": 1.598720134194298e-05,
"loss": 0.175,
"step": 5680
},
{
"epoch": 1.971072232807899,
"grad_norm": 0.14592317747183597,
"learning_rate": 1.5893281517106852e-05,
"loss": 0.1817,
"step": 5690
},
{
"epoch": 1.9745366360644379,
"grad_norm": 0.1367156708009888,
"learning_rate": 1.5799509650888605e-05,
"loss": 0.1792,
"step": 5700
},
{
"epoch": 1.978001039320977,
"grad_norm": 0.15222974333942974,
"learning_rate": 1.5705887266817926e-05,
"loss": 0.1838,
"step": 5710
},
{
"epoch": 1.981465442577516,
"grad_norm": 0.15063980268483795,
"learning_rate": 1.5612415885995803e-05,
"loss": 0.1798,
"step": 5720
},
{
"epoch": 1.984929845834055,
"grad_norm": 0.14361029580183732,
"learning_rate": 1.551909702706984e-05,
"loss": 0.1775,
"step": 5730
},
{
"epoch": 1.9883942490905941,
"grad_norm": 0.14905006323407718,
"learning_rate": 1.5425932206209617e-05,
"loss": 0.1853,
"step": 5740
},
{
"epoch": 1.9918586523471333,
"grad_norm": 0.1473397517873399,
"learning_rate": 1.533292293708201e-05,
"loss": 0.1857,
"step": 5750
},
{
"epoch": 1.9953230556036723,
"grad_norm": 0.14626760437882622,
"learning_rate": 1.52400707308266e-05,
"loss": 0.1741,
"step": 5760
},
{
"epoch": 1.9987874588602113,
"grad_norm": 0.1405339530408193,
"learning_rate": 1.5147377096031173e-05,
"loss": 0.1757,
"step": 5770
},
{
"epoch": 2.0020786419539234,
"grad_norm": 0.17093438476339481,
"learning_rate": 1.5054843538707147e-05,
"loss": 0.1598,
"step": 5780
},
{
"epoch": 2.0055430452104623,
"grad_norm": 0.15448133744376852,
"learning_rate": 1.4962471562265151e-05,
"loss": 0.1509,
"step": 5790
},
{
"epoch": 2.0090074484670017,
"grad_norm": 0.15883932121601194,
"learning_rate": 1.4870262667490553e-05,
"loss": 0.1508,
"step": 5800
},
{
"epoch": 2.0124718517235407,
"grad_norm": 0.15799483695764974,
"learning_rate": 1.4778218352519113e-05,
"loss": 0.1514,
"step": 5810
},
{
"epoch": 2.0159362549800797,
"grad_norm": 0.1397582105470391,
"learning_rate": 1.4686340112812644e-05,
"loss": 0.1513,
"step": 5820
},
{
"epoch": 2.0194006582366186,
"grad_norm": 0.13988381555476553,
"learning_rate": 1.4594629441134674e-05,
"loss": 0.1516,
"step": 5830
},
{
"epoch": 2.0228650614931576,
"grad_norm": 0.15357440805298642,
"learning_rate": 1.4503087827526257e-05,
"loss": 0.1537,
"step": 5840
},
{
"epoch": 2.026329464749697,
"grad_norm": 0.140105602913093,
"learning_rate": 1.4411716759281701e-05,
"loss": 0.1489,
"step": 5850
},
{
"epoch": 2.029793868006236,
"grad_norm": 0.1371171850215917,
"learning_rate": 1.4320517720924423e-05,
"loss": 0.1478,
"step": 5860
},
{
"epoch": 2.033258271262775,
"grad_norm": 0.1415040646234407,
"learning_rate": 1.4229492194182864e-05,
"loss": 0.1511,
"step": 5870
},
{
"epoch": 2.036722674519314,
"grad_norm": 0.1455346789180852,
"learning_rate": 1.4138641657966387e-05,
"loss": 0.1541,
"step": 5880
},
{
"epoch": 2.0401870777758533,
"grad_norm": 0.14764527095741217,
"learning_rate": 1.4047967588341216e-05,
"loss": 0.1528,
"step": 5890
},
{
"epoch": 2.0436514810323922,
"grad_norm": 0.13949032357196248,
"learning_rate": 1.3957471458506536e-05,
"loss": 0.1539,
"step": 5900
},
{
"epoch": 2.047115884288931,
"grad_norm": 0.13705465172650183,
"learning_rate": 1.386715473877046e-05,
"loss": 0.1457,
"step": 5910
},
{
"epoch": 2.05058028754547,
"grad_norm": 0.12956466312791168,
"learning_rate": 1.3777018896526236e-05,
"loss": 0.1473,
"step": 5920
},
{
"epoch": 2.0540446908020096,
"grad_norm": 0.15368302078148194,
"learning_rate": 1.3687065396228332e-05,
"loss": 0.1497,
"step": 5930
},
{
"epoch": 2.0575090940585485,
"grad_norm": 0.1428450265623596,
"learning_rate": 1.3597295699368668e-05,
"loss": 0.1514,
"step": 5940
},
{
"epoch": 2.0609734973150875,
"grad_norm": 0.1281258398579668,
"learning_rate": 1.3507711264452905e-05,
"loss": 0.1483,
"step": 5950
},
{
"epoch": 2.0644379005716265,
"grad_norm": 0.15262843729207556,
"learning_rate": 1.3418313546976676e-05,
"loss": 0.1466,
"step": 5960
},
{
"epoch": 2.0679023038281654,
"grad_norm": 0.14904369630104097,
"learning_rate": 1.332910399940202e-05,
"loss": 0.1454,
"step": 5970
},
{
"epoch": 2.071366707084705,
"grad_norm": 0.1423441662005472,
"learning_rate": 1.324008407113371e-05,
"loss": 0.1501,
"step": 5980
},
{
"epoch": 2.074831110341244,
"grad_norm": 0.1422975070772413,
"learning_rate": 1.3151255208495755e-05,
"loss": 0.154,
"step": 5990
},
{
"epoch": 2.0782955135977828,
"grad_norm": 0.1352228664513086,
"learning_rate": 1.306261885470789e-05,
"loss": 0.1506,
"step": 6000
},
{
"epoch": 2.0817599168543217,
"grad_norm": 0.13493941433636564,
"learning_rate": 1.2974176449862101e-05,
"loss": 0.148,
"step": 6010
},
{
"epoch": 2.085224320110861,
"grad_norm": 0.13664044137951584,
"learning_rate": 1.2885929430899258e-05,
"loss": 0.1479,
"step": 6020
},
{
"epoch": 2.0886887233674,
"grad_norm": 0.13427253141306963,
"learning_rate": 1.279787923158577e-05,
"loss": 0.1451,
"step": 6030
},
{
"epoch": 2.092153126623939,
"grad_norm": 0.18500494498072673,
"learning_rate": 1.2710027282490247e-05,
"loss": 0.1504,
"step": 6040
},
{
"epoch": 2.095617529880478,
"grad_norm": 0.1384525206350356,
"learning_rate": 1.2622375010960335e-05,
"loss": 0.1502,
"step": 6050
},
{
"epoch": 2.099081933137017,
"grad_norm": 0.1417115574643469,
"learning_rate": 1.2534923841099445e-05,
"loss": 0.1522,
"step": 6060
},
{
"epoch": 2.1025463363935564,
"grad_norm": 0.14117258627489582,
"learning_rate": 1.2447675193743651e-05,
"loss": 0.149,
"step": 6070
},
{
"epoch": 2.1060107396500953,
"grad_norm": 0.13976875251148813,
"learning_rate": 1.2360630486438635e-05,
"loss": 0.1479,
"step": 6080
},
{
"epoch": 2.1094751429066343,
"grad_norm": 0.13438983639128466,
"learning_rate": 1.2273791133416584e-05,
"loss": 0.1493,
"step": 6090
},
{
"epoch": 2.1129395461631733,
"grad_norm": 0.12916426405356554,
"learning_rate": 1.2187158545573295e-05,
"loss": 0.1462,
"step": 6100
},
{
"epoch": 2.1164039494197127,
"grad_norm": 0.14884278506870566,
"learning_rate": 1.2100734130445173e-05,
"loss": 0.1534,
"step": 6110
},
{
"epoch": 2.1198683526762516,
"grad_norm": 0.14763268804961824,
"learning_rate": 1.2014519292186428e-05,
"loss": 0.1504,
"step": 6120
},
{
"epoch": 2.1233327559327906,
"grad_norm": 0.14546584264873522,
"learning_rate": 1.1928515431546233e-05,
"loss": 0.1549,
"step": 6130
},
{
"epoch": 2.1267971591893295,
"grad_norm": 0.13000521630439743,
"learning_rate": 1.1842723945845948e-05,
"loss": 0.1515,
"step": 6140
},
{
"epoch": 2.1302615624458685,
"grad_norm": 0.1380689347039955,
"learning_rate": 1.1757146228956445e-05,
"loss": 0.1534,
"step": 6150
},
{
"epoch": 2.133725965702408,
"grad_norm": 0.13885554681042553,
"learning_rate": 1.1671783671275467e-05,
"loss": 0.1477,
"step": 6160
},
{
"epoch": 2.137190368958947,
"grad_norm": 0.1413517568795959,
"learning_rate": 1.1586637659704994e-05,
"loss": 0.1489,
"step": 6170
},
{
"epoch": 2.140654772215486,
"grad_norm": 0.1445567017190447,
"learning_rate": 1.1501709577628777e-05,
"loss": 0.1527,
"step": 6180
},
{
"epoch": 2.144119175472025,
"grad_norm": 0.13842122845453322,
"learning_rate": 1.1417000804889793e-05,
"loss": 0.149,
"step": 6190
},
{
"epoch": 2.147583578728564,
"grad_norm": 0.1351342875816243,
"learning_rate": 1.1332512717767862e-05,
"loss": 0.1499,
"step": 6200
},
{
"epoch": 2.151047981985103,
"grad_norm": 0.19203561762793064,
"learning_rate": 1.1248246688957307e-05,
"loss": 0.1516,
"step": 6210
},
{
"epoch": 2.154512385241642,
"grad_norm": 0.13275502783261534,
"learning_rate": 1.1164204087544589e-05,
"loss": 0.1496,
"step": 6220
},
{
"epoch": 2.157976788498181,
"grad_norm": 0.14124554657127336,
"learning_rate": 1.108038627898613e-05,
"loss": 0.1469,
"step": 6230
},
{
"epoch": 2.16144119175472,
"grad_norm": 0.13662855483900213,
"learning_rate": 1.0996794625086102e-05,
"loss": 0.1533,
"step": 6240
},
{
"epoch": 2.1649055950112595,
"grad_norm": 0.13136794147532813,
"learning_rate": 1.091343048397426e-05,
"loss": 0.1479,
"step": 6250
},
{
"epoch": 2.1683699982677984,
"grad_norm": 0.14289580428302312,
"learning_rate": 1.0830295210083968e-05,
"loss": 0.1523,
"step": 6260
},
{
"epoch": 2.1718344015243374,
"grad_norm": 0.14336144046136406,
"learning_rate": 1.0747390154130097e-05,
"loss": 0.1482,
"step": 6270
},
{
"epoch": 2.1752988047808763,
"grad_norm": 0.12351594136318876,
"learning_rate": 1.0664716663087132e-05,
"loss": 0.148,
"step": 6280
},
{
"epoch": 2.1787632080374157,
"grad_norm": 0.1263105200697639,
"learning_rate": 1.0582276080167299e-05,
"loss": 0.1488,
"step": 6290
},
{
"epoch": 2.1822276112939547,
"grad_norm": 0.1409302104392171,
"learning_rate": 1.0500069744798696e-05,
"loss": 0.1493,
"step": 6300
},
{
"epoch": 2.1856920145504937,
"grad_norm": 0.13878678763750232,
"learning_rate": 1.0418098992603576e-05,
"loss": 0.1467,
"step": 6310
},
{
"epoch": 2.1891564178070326,
"grad_norm": 0.13130670045208906,
"learning_rate": 1.033636515537661e-05,
"loss": 0.15,
"step": 6320
},
{
"epoch": 2.1926208210635716,
"grad_norm": 0.13037720388849533,
"learning_rate": 1.0254869561063263e-05,
"loss": 0.1514,
"step": 6330
},
{
"epoch": 2.196085224320111,
"grad_norm": 0.1392467131479532,
"learning_rate": 1.0173613533738238e-05,
"loss": 0.1497,
"step": 6340
},
{
"epoch": 2.19954962757665,
"grad_norm": 0.13583441746880653,
"learning_rate": 1.0092598393583949e-05,
"loss": 0.1533,
"step": 6350
},
{
"epoch": 2.203014030833189,
"grad_norm": 0.138478347457145,
"learning_rate": 1.001182545686904e-05,
"loss": 0.1474,
"step": 6360
},
{
"epoch": 2.206478434089728,
"grad_norm": 0.13196884066043799,
"learning_rate": 9.931296035927068e-06,
"loss": 0.1485,
"step": 6370
},
{
"epoch": 2.2099428373462673,
"grad_norm": 0.12628759927748884,
"learning_rate": 9.851011439135105e-06,
"loss": 0.1512,
"step": 6380
},
{
"epoch": 2.2134072406028062,
"grad_norm": 0.12720352197837026,
"learning_rate": 9.770972970892553e-06,
"loss": 0.1492,
"step": 6390
},
{
"epoch": 2.216871643859345,
"grad_norm": 0.1288529643249653,
"learning_rate": 9.691181931599886e-06,
"loss": 0.1463,
"step": 6400
},
{
"epoch": 2.220336047115884,
"grad_norm": 0.12600916062409626,
"learning_rate": 9.611639617637558e-06,
"loss": 0.1488,
"step": 6410
},
{
"epoch": 2.2238004503724236,
"grad_norm": 0.14329418696142437,
"learning_rate": 9.532347321344956e-06,
"loss": 0.1521,
"step": 6420
},
{
"epoch": 2.2272648536289625,
"grad_norm": 0.13819893815319562,
"learning_rate": 9.453306330999349e-06,
"loss": 0.1501,
"step": 6430
},
{
"epoch": 2.2307292568855015,
"grad_norm": 0.13277309165484377,
"learning_rate": 9.37451793079502e-06,
"loss": 0.1515,
"step": 6440
},
{
"epoch": 2.2341936601420405,
"grad_norm": 0.1304498393414478,
"learning_rate": 9.29598340082236e-06,
"loss": 0.1498,
"step": 6450
},
{
"epoch": 2.2376580633985794,
"grad_norm": 0.13297679247054883,
"learning_rate": 9.217704017047057e-06,
"loss": 0.151,
"step": 6460
},
{
"epoch": 2.241122466655119,
"grad_norm": 0.12871920957838523,
"learning_rate": 9.139681051289425e-06,
"loss": 0.1507,
"step": 6470
},
{
"epoch": 2.244586869911658,
"grad_norm": 0.1377467169551894,
"learning_rate": 9.061915771203695e-06,
"loss": 0.1477,
"step": 6480
},
{
"epoch": 2.2480512731681968,
"grad_norm": 0.14354881718167706,
"learning_rate": 8.984409440257427e-06,
"loss": 0.1496,
"step": 6490
},
{
"epoch": 2.2515156764247357,
"grad_norm": 0.13146580616283166,
"learning_rate": 8.907163317710976e-06,
"loss": 0.1499,
"step": 6500
},
{
"epoch": 2.2549800796812747,
"grad_norm": 0.14028927070593003,
"learning_rate": 8.830178658597038e-06,
"loss": 0.1482,
"step": 6510
},
{
"epoch": 2.258444482937814,
"grad_norm": 0.13406176038514053,
"learning_rate": 8.75345671370029e-06,
"loss": 0.1513,
"step": 6520
},
{
"epoch": 2.261908886194353,
"grad_norm": 0.1281531685378758,
"learning_rate": 8.676998729537009e-06,
"loss": 0.1487,
"step": 6530
},
{
"epoch": 2.265373289450892,
"grad_norm": 0.1334136953967956,
"learning_rate": 8.600805948334858e-06,
"loss": 0.1461,
"step": 6540
},
{
"epoch": 2.268837692707431,
"grad_norm": 0.13263104957255284,
"learning_rate": 8.524879608012714e-06,
"loss": 0.1494,
"step": 6550
},
{
"epoch": 2.2723020959639704,
"grad_norm": 0.14074464264886974,
"learning_rate": 8.449220942160512e-06,
"loss": 0.1492,
"step": 6560
},
{
"epoch": 2.2757664992205093,
"grad_norm": 0.13291686489725904,
"learning_rate": 8.373831180019256e-06,
"loss": 0.1468,
"step": 6570
},
{
"epoch": 2.2792309024770483,
"grad_norm": 0.13968516954362226,
"learning_rate": 8.298711546460986e-06,
"loss": 0.148,
"step": 6580
},
{
"epoch": 2.2826953057335873,
"grad_norm": 0.13617881495577386,
"learning_rate": 8.223863261968945e-06,
"loss": 0.1514,
"step": 6590
},
{
"epoch": 2.2861597089901267,
"grad_norm": 0.13589641843226832,
"learning_rate": 8.149287542617686e-06,
"loss": 0.147,
"step": 6600
},
{
"epoch": 2.2896241122466656,
"grad_norm": 0.13498192446917104,
"learning_rate": 8.074985600053361e-06,
"loss": 0.1559,
"step": 6610
},
{
"epoch": 2.2930885155032046,
"grad_norm": 0.13109156975966305,
"learning_rate": 8.000958641474021e-06,
"loss": 0.1524,
"step": 6620
},
{
"epoch": 2.2965529187597435,
"grad_norm": 0.1320540630523605,
"learning_rate": 7.927207869609984e-06,
"loss": 0.1493,
"step": 6630
},
{
"epoch": 2.3000173220162825,
"grad_norm": 0.14127205971076925,
"learning_rate": 7.853734482704309e-06,
"loss": 0.1511,
"step": 6640
},
{
"epoch": 2.303481725272822,
"grad_norm": 0.13466133744323935,
"learning_rate": 7.780539674493345e-06,
"loss": 0.1506,
"step": 6650
},
{
"epoch": 2.306946128529361,
"grad_norm": 0.1286157583033884,
"learning_rate": 7.707624634187308e-06,
"loss": 0.1527,
"step": 6660
},
{
"epoch": 2.3104105317859,
"grad_norm": 0.1380982881943608,
"learning_rate": 7.63499054645096e-06,
"loss": 0.1484,
"step": 6670
},
{
"epoch": 2.313874935042439,
"grad_norm": 0.15156686175057582,
"learning_rate": 7.562638591384396e-06,
"loss": 0.1461,
"step": 6680
},
{
"epoch": 2.3173393382989778,
"grad_norm": 0.12803039542374958,
"learning_rate": 7.4905699445038255e-06,
"loss": 0.1483,
"step": 6690
},
{
"epoch": 2.320803741555517,
"grad_norm": 0.13309461688414237,
"learning_rate": 7.418785776722514e-06,
"loss": 0.151,
"step": 6700
},
{
"epoch": 2.324268144812056,
"grad_norm": 0.12574985983373116,
"learning_rate": 7.34728725433172e-06,
"loss": 0.1466,
"step": 6710
},
{
"epoch": 2.327732548068595,
"grad_norm": 0.13375589775606558,
"learning_rate": 7.276075538981778e-06,
"loss": 0.1511,
"step": 6720
},
{
"epoch": 2.3311969513251345,
"grad_norm": 0.13021533231229976,
"learning_rate": 7.205151787663222e-06,
"loss": 0.1486,
"step": 6730
},
{
"epoch": 2.3346613545816735,
"grad_norm": 0.1238991538662092,
"learning_rate": 7.134517152687953e-06,
"loss": 0.1467,
"step": 6740
},
{
"epoch": 2.3381257578382124,
"grad_norm": 0.13319985151049268,
"learning_rate": 7.064172781670569e-06,
"loss": 0.1504,
"step": 6750
},
{
"epoch": 2.3415901610947514,
"grad_norm": 0.13427753958581112,
"learning_rate": 6.994119817509678e-06,
"loss": 0.1454,
"step": 6760
},
{
"epoch": 2.3450545643512903,
"grad_norm": 0.1329147728324323,
"learning_rate": 6.924359398369342e-06,
"loss": 0.1487,
"step": 6770
},
{
"epoch": 2.3485189676078297,
"grad_norm": 0.11940132635278913,
"learning_rate": 6.854892657660605e-06,
"loss": 0.1476,
"step": 6780
},
{
"epoch": 2.3519833708643687,
"grad_norm": 0.13086880472397885,
"learning_rate": 6.785720724023042e-06,
"loss": 0.1483,
"step": 6790
},
{
"epoch": 2.3554477741209077,
"grad_norm": 0.12994412754505075,
"learning_rate": 6.716844721306443e-06,
"loss": 0.1496,
"step": 6800
},
{
"epoch": 2.3589121773774466,
"grad_norm": 0.13309217588951705,
"learning_rate": 6.648265768552569e-06,
"loss": 0.1469,
"step": 6810
},
{
"epoch": 2.3623765806339856,
"grad_norm": 0.1353366584822473,
"learning_rate": 6.579984979976925e-06,
"loss": 0.151,
"step": 6820
},
{
"epoch": 2.365840983890525,
"grad_norm": 0.13072638016170673,
"learning_rate": 6.512003464950706e-06,
"loss": 0.1498,
"step": 6830
},
{
"epoch": 2.369305387147064,
"grad_norm": 0.14086411924508258,
"learning_rate": 6.444322327982752e-06,
"loss": 0.1488,
"step": 6840
},
{
"epoch": 2.372769790403603,
"grad_norm": 0.13022025324073322,
"learning_rate": 6.376942668701586e-06,
"loss": 0.1505,
"step": 6850
},
{
"epoch": 2.376234193660142,
"grad_norm": 0.1270304545640671,
"learning_rate": 6.309865581837584e-06,
"loss": 0.1467,
"step": 6860
},
{
"epoch": 2.3796985969166813,
"grad_norm": 0.13824906046376823,
"learning_rate": 6.243092157205146e-06,
"loss": 0.1479,
"step": 6870
},
{
"epoch": 2.3831630001732202,
"grad_norm": 0.13322342141123356,
"learning_rate": 6.1766234796850426e-06,
"loss": 0.1532,
"step": 6880
},
{
"epoch": 2.386627403429759,
"grad_norm": 0.13622336055549258,
"learning_rate": 6.110460629206735e-06,
"loss": 0.1494,
"step": 6890
},
{
"epoch": 2.390091806686298,
"grad_norm": 0.13141145530513035,
"learning_rate": 6.044604680730856e-06,
"loss": 0.1478,
"step": 6900
},
{
"epoch": 2.3935562099428376,
"grad_norm": 0.12753936723220544,
"learning_rate": 5.979056704231759e-06,
"loss": 0.1508,
"step": 6910
},
{
"epoch": 2.3970206131993765,
"grad_norm": 0.1282678691982685,
"learning_rate": 5.9138177646800934e-06,
"loss": 0.1494,
"step": 6920
},
{
"epoch": 2.4004850164559155,
"grad_norm": 0.12550541746949806,
"learning_rate": 5.848888922025553e-06,
"loss": 0.1474,
"step": 6930
},
{
"epoch": 2.4039494197124545,
"grad_norm": 0.13234115800985993,
"learning_rate": 5.7842712311796025e-06,
"loss": 0.1463,
"step": 6940
},
{
"epoch": 2.4074138229689934,
"grad_norm": 0.12467209107241967,
"learning_rate": 5.719965741998368e-06,
"loss": 0.1491,
"step": 6950
},
{
"epoch": 2.410878226225533,
"grad_norm": 0.13573330603028008,
"learning_rate": 5.655973499265582e-06,
"loss": 0.1513,
"step": 6960
},
{
"epoch": 2.414342629482072,
"grad_norm": 0.13240853708299855,
"learning_rate": 5.59229554267561e-06,
"loss": 0.1516,
"step": 6970
},
{
"epoch": 2.4178070327386108,
"grad_norm": 0.1329534082302391,
"learning_rate": 5.528932906816522e-06,
"loss": 0.1517,
"step": 6980
},
{
"epoch": 2.4212714359951497,
"grad_norm": 0.12911430987167502,
"learning_rate": 5.465886621153346e-06,
"loss": 0.1457,
"step": 6990
},
{
"epoch": 2.4247358392516887,
"grad_norm": 0.1334215432640577,
"learning_rate": 5.403157710011267e-06,
"loss": 0.1534,
"step": 7000
},
{
"epoch": 2.428200242508228,
"grad_norm": 0.12233095847822573,
"learning_rate": 5.340747192559064e-06,
"loss": 0.1443,
"step": 7010
},
{
"epoch": 2.431664645764767,
"grad_norm": 0.12214722447020973,
"learning_rate": 5.278656082792488e-06,
"loss": 0.1506,
"step": 7020
},
{
"epoch": 2.435129049021306,
"grad_norm": 0.1312038495892653,
"learning_rate": 5.216885389517808e-06,
"loss": 0.1494,
"step": 7030
},
{
"epoch": 2.438593452277845,
"grad_norm": 0.12877996640447992,
"learning_rate": 5.155436116335455e-06,
"loss": 0.1498,
"step": 7040
},
{
"epoch": 2.4420578555343844,
"grad_norm": 0.13785796047700058,
"learning_rate": 5.094309261623642e-06,
"loss": 0.1493,
"step": 7050
},
{
"epoch": 2.4455222587909233,
"grad_norm": 0.12689388560349374,
"learning_rate": 5.0335058185222245e-06,
"loss": 0.148,
"step": 7060
},
{
"epoch": 2.4489866620474623,
"grad_norm": 0.13349070966216434,
"learning_rate": 4.973026774916504e-06,
"loss": 0.1491,
"step": 7070
},
{
"epoch": 2.4524510653040013,
"grad_norm": 0.13124780154477322,
"learning_rate": 4.912873113421215e-06,
"loss": 0.1472,
"step": 7080
},
{
"epoch": 2.4559154685605407,
"grad_norm": 0.12289322814948568,
"learning_rate": 4.853045811364532e-06,
"loss": 0.151,
"step": 7090
},
{
"epoch": 2.4593798718170796,
"grad_norm": 0.1231683131497469,
"learning_rate": 4.793545840772221e-06,
"loss": 0.1477,
"step": 7100
},
{
"epoch": 2.4628442750736186,
"grad_norm": 0.12204021116853747,
"learning_rate": 4.734374168351807e-06,
"loss": 0.1464,
"step": 7110
},
{
"epoch": 2.4663086783301575,
"grad_norm": 0.12705870894938584,
"learning_rate": 4.675531755476922e-06,
"loss": 0.1487,
"step": 7120
},
{
"epoch": 2.4697730815866965,
"grad_norm": 0.13311385467368475,
"learning_rate": 4.617019558171623e-06,
"loss": 0.1518,
"step": 7130
},
{
"epoch": 2.473237484843236,
"grad_norm": 0.12633200495803268,
"learning_rate": 4.558838527094916e-06,
"loss": 0.1487,
"step": 7140
},
{
"epoch": 2.476701888099775,
"grad_norm": 0.12821618157907513,
"learning_rate": 4.500989607525271e-06,
"loss": 0.1513,
"step": 7150
},
{
"epoch": 2.480166291356314,
"grad_norm": 0.12368299726159533,
"learning_rate": 4.443473739345275e-06,
"loss": 0.1486,
"step": 7160
},
{
"epoch": 2.483630694612853,
"grad_norm": 0.13459364837403084,
"learning_rate": 4.386291857026381e-06,
"loss": 0.1507,
"step": 7170
},
{
"epoch": 2.4870950978693918,
"grad_norm": 0.12921146609560327,
"learning_rate": 4.329444889613687e-06,
"loss": 0.1506,
"step": 7180
},
{
"epoch": 2.490559501125931,
"grad_norm": 0.13011095251066665,
"learning_rate": 4.272933760710893e-06,
"loss": 0.1472,
"step": 7190
},
{
"epoch": 2.49402390438247,
"grad_norm": 0.13394025522445424,
"learning_rate": 4.2167593884652325e-06,
"loss": 0.1482,
"step": 7200
},
{
"epoch": 2.497488307639009,
"grad_norm": 0.12341718153206974,
"learning_rate": 4.160922685552612e-06,
"loss": 0.1492,
"step": 7210
},
{
"epoch": 2.5009527108955485,
"grad_norm": 0.12332806421502722,
"learning_rate": 4.105424559162754e-06,
"loss": 0.151,
"step": 7220
},
{
"epoch": 2.5044171141520875,
"grad_norm": 0.12451895183900567,
"learning_rate": 4.05026591098446e-06,
"loss": 0.1477,
"step": 7230
},
{
"epoch": 2.5078815174086264,
"grad_norm": 0.12237447886665374,
"learning_rate": 3.995447637190955e-06,
"loss": 0.1434,
"step": 7240
},
{
"epoch": 2.5113459206651654,
"grad_norm": 0.12468896676434474,
"learning_rate": 3.940970628425353e-06,
"loss": 0.1509,
"step": 7250
},
{
"epoch": 2.5148103239217043,
"grad_norm": 0.12229284415257566,
"learning_rate": 3.886835769786154e-06,
"loss": 0.1508,
"step": 7260
},
{
"epoch": 2.5182747271782437,
"grad_norm": 0.12284739761632071,
"learning_rate": 3.833043940812889e-06,
"loss": 0.1469,
"step": 7270
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.12171858269842417,
"learning_rate": 3.7795960154718175e-06,
"loss": 0.148,
"step": 7280
},
{
"epoch": 2.5252035336913217,
"grad_norm": 0.1279096545024547,
"learning_rate": 3.726492862141717e-06,
"loss": 0.1492,
"step": 7290
},
{
"epoch": 2.5286679369478606,
"grad_norm": 0.1261825689932827,
"learning_rate": 3.67373534359981e-06,
"loss": 0.1485,
"step": 7300
},
{
"epoch": 2.5321323402043996,
"grad_norm": 0.12731710139244515,
"learning_rate": 3.621324317007704e-06,
"loss": 0.1485,
"step": 7310
},
{
"epoch": 2.535596743460939,
"grad_norm": 0.1224027049400114,
"learning_rate": 3.569260633897495e-06,
"loss": 0.1476,
"step": 7320
},
{
"epoch": 2.539061146717478,
"grad_norm": 0.12095976657546359,
"learning_rate": 3.517545140157927e-06,
"loss": 0.1466,
"step": 7330
},
{
"epoch": 2.542525549974017,
"grad_norm": 0.12165268564942827,
"learning_rate": 3.466178676020626e-06,
"loss": 0.1463,
"step": 7340
},
{
"epoch": 2.545989953230556,
"grad_norm": 0.12226265951849115,
"learning_rate": 3.415162076046488e-06,
"loss": 0.1485,
"step": 7350
},
{
"epoch": 2.549454356487095,
"grad_norm": 0.12637110596902743,
"learning_rate": 3.364496169112083e-06,
"loss": 0.1487,
"step": 7360
},
{
"epoch": 2.5529187597436342,
"grad_norm": 0.12810226008624373,
"learning_rate": 3.3141817783962e-06,
"loss": 0.146,
"step": 7370
},
{
"epoch": 2.556383163000173,
"grad_norm": 0.1252993130112932,
"learning_rate": 3.264219721366496e-06,
"loss": 0.1465,
"step": 7380
},
{
"epoch": 2.559847566256712,
"grad_norm": 0.1217860788587577,
"learning_rate": 3.2146108097661746e-06,
"loss": 0.1465,
"step": 7390
},
{
"epoch": 2.5633119695132516,
"grad_norm": 0.1306358146817434,
"learning_rate": 3.165355849600829e-06,
"loss": 0.1488,
"step": 7400
},
{
"epoch": 2.5667763727697905,
"grad_norm": 0.1243046537823559,
"learning_rate": 3.116455641125332e-06,
"loss": 0.1457,
"step": 7410
},
{
"epoch": 2.5702407760263295,
"grad_norm": 0.13155702663844496,
"learning_rate": 3.0679109788308293e-06,
"loss": 0.1507,
"step": 7420
},
{
"epoch": 2.5737051792828685,
"grad_norm": 0.12565136055486759,
"learning_rate": 3.0197226514318527e-06,
"loss": 0.1447,
"step": 7430
},
{
"epoch": 2.5771695825394074,
"grad_norm": 0.12227279079368177,
"learning_rate": 2.9718914418534747e-06,
"loss": 0.1481,
"step": 7440
},
{
"epoch": 2.580633985795947,
"grad_norm": 0.12446579351502253,
"learning_rate": 2.9244181272186257e-06,
"loss": 0.1508,
"step": 7450
},
{
"epoch": 2.584098389052486,
"grad_norm": 0.12176081031880796,
"learning_rate": 2.8773034788354384e-06,
"loss": 0.1516,
"step": 7460
},
{
"epoch": 2.5875627923090248,
"grad_norm": 0.12394944777774412,
"learning_rate": 2.8305482621847152e-06,
"loss": 0.146,
"step": 7470
},
{
"epoch": 2.5910271955655637,
"grad_norm": 0.12488028679932459,
"learning_rate": 2.784153236907522e-06,
"loss": 0.1473,
"step": 7480
},
{
"epoch": 2.5944915988221027,
"grad_norm": 0.12228989346664527,
"learning_rate": 2.7381191567928064e-06,
"loss": 0.1481,
"step": 7490
},
{
"epoch": 2.597956002078642,
"grad_norm": 0.12775499380593486,
"learning_rate": 2.6924467697651778e-06,
"loss": 0.1489,
"step": 7500
},
{
"epoch": 2.601420405335181,
"grad_norm": 0.123666079771719,
"learning_rate": 2.6471368178727583e-06,
"loss": 0.1457,
"step": 7510
},
{
"epoch": 2.60488480859172,
"grad_norm": 0.12267432282437238,
"learning_rate": 2.6021900372750956e-06,
"loss": 0.1457,
"step": 7520
},
{
"epoch": 2.6083492118482594,
"grad_norm": 0.12433547335629408,
"learning_rate": 2.5576071582312428e-06,
"loss": 0.1461,
"step": 7530
},
{
"epoch": 2.611813615104798,
"grad_norm": 0.12415914526721374,
"learning_rate": 2.51338890508786e-06,
"loss": 0.1503,
"step": 7540
},
{
"epoch": 2.6152780183613373,
"grad_norm": 0.1228630751843417,
"learning_rate": 2.4695359962674608e-06,
"loss": 0.1478,
"step": 7550
},
{
"epoch": 2.6187424216178763,
"grad_norm": 0.12239232127572298,
"learning_rate": 2.4260491442567506e-06,
"loss": 0.148,
"step": 7560
},
{
"epoch": 2.6222068248744153,
"grad_norm": 0.13105355595668536,
"learning_rate": 2.3829290555950264e-06,
"loss": 0.1463,
"step": 7570
},
{
"epoch": 2.6256712281309547,
"grad_norm": 0.12268630715908427,
"learning_rate": 2.340176430862723e-06,
"loss": 0.1458,
"step": 7580
},
{
"epoch": 2.6291356313874936,
"grad_norm": 0.12235644638631886,
"learning_rate": 2.2977919646700068e-06,
"loss": 0.1455,
"step": 7590
},
{
"epoch": 2.6326000346440326,
"grad_norm": 0.11798600614702796,
"learning_rate": 2.255776345645494e-06,
"loss": 0.142,
"step": 7600
},
{
"epoch": 2.6360644379005715,
"grad_norm": 0.12407906704297328,
"learning_rate": 2.2141302564250926e-06,
"loss": 0.1502,
"step": 7610
},
{
"epoch": 2.6395288411571105,
"grad_norm": 0.11753331905833096,
"learning_rate": 2.17285437364087e-06,
"loss": 0.1428,
"step": 7620
},
{
"epoch": 2.64299324441365,
"grad_norm": 0.12007274689026741,
"learning_rate": 2.131949367910077e-06,
"loss": 0.1465,
"step": 7630
},
{
"epoch": 2.646457647670189,
"grad_norm": 0.125472794336859,
"learning_rate": 2.0914159038242704e-06,
"loss": 0.1423,
"step": 7640
},
{
"epoch": 2.649922050926728,
"grad_norm": 0.12784584316147593,
"learning_rate": 2.051254639938477e-06,
"loss": 0.1476,
"step": 7650
},
{
"epoch": 2.653386454183267,
"grad_norm": 0.11853028227441036,
"learning_rate": 2.0114662287605335e-06,
"loss": 0.1486,
"step": 7660
},
{
"epoch": 2.6568508574398058,
"grad_norm": 0.1191674746816265,
"learning_rate": 1.97205131674045e-06,
"loss": 0.1492,
"step": 7670
},
{
"epoch": 2.660315260696345,
"grad_norm": 0.1250727768460615,
"learning_rate": 1.933010544259939e-06,
"loss": 0.1454,
"step": 7680
},
{
"epoch": 2.663779663952884,
"grad_norm": 0.12320213371283693,
"learning_rate": 1.8943445456219815e-06,
"loss": 0.1472,
"step": 7690
},
{
"epoch": 2.667244067209423,
"grad_norm": 0.11891666023211327,
"learning_rate": 1.8560539490405399e-06,
"loss": 0.1492,
"step": 7700
},
{
"epoch": 2.6707084704659625,
"grad_norm": 0.125346930924718,
"learning_rate": 1.8181393766303595e-06,
"loss": 0.1462,
"step": 7710
},
{
"epoch": 2.6741728737225015,
"grad_norm": 0.1321032171582539,
"learning_rate": 1.7806014443968289e-06,
"loss": 0.1436,
"step": 7720
},
{
"epoch": 2.6776372769790404,
"grad_norm": 0.12117133427851705,
"learning_rate": 1.7434407622259951e-06,
"loss": 0.1466,
"step": 7730
},
{
"epoch": 2.6811016802355794,
"grad_norm": 0.11718318211141446,
"learning_rate": 1.7066579338746668e-06,
"loss": 0.1456,
"step": 7740
},
{
"epoch": 2.6845660834921183,
"grad_norm": 0.1283761135380607,
"learning_rate": 1.670253556960563e-06,
"loss": 0.1464,
"step": 7750
},
{
"epoch": 2.6880304867486577,
"grad_norm": 0.12456996594619513,
"learning_rate": 1.6342282229526468e-06,
"loss": 0.1482,
"step": 7760
},
{
"epoch": 2.6914948900051967,
"grad_norm": 0.11703659504283041,
"learning_rate": 1.5985825171614953e-06,
"loss": 0.1455,
"step": 7770
},
{
"epoch": 2.6949592932617357,
"grad_norm": 0.11962329640139921,
"learning_rate": 1.5633170187297846e-06,
"loss": 0.1468,
"step": 7780
},
{
"epoch": 2.6984236965182746,
"grad_norm": 0.12759004303067803,
"learning_rate": 1.5284323006229035e-06,
"loss": 0.1457,
"step": 7790
},
{
"epoch": 2.7018880997748136,
"grad_norm": 0.12164262479291138,
"learning_rate": 1.4939289296196063e-06,
"loss": 0.1451,
"step": 7800
},
{
"epoch": 2.705352503031353,
"grad_norm": 0.11841849076099206,
"learning_rate": 1.4598074663028483e-06,
"loss": 0.1442,
"step": 7810
},
{
"epoch": 2.708816906287892,
"grad_norm": 0.1279272278852782,
"learning_rate": 1.4260684650506478e-06,
"loss": 0.1466,
"step": 7820
},
{
"epoch": 2.712281309544431,
"grad_norm": 0.11952115983833288,
"learning_rate": 1.3927124740270885e-06,
"loss": 0.1457,
"step": 7830
},
{
"epoch": 2.71574571280097,
"grad_norm": 0.12162884781857089,
"learning_rate": 1.3597400351734151e-06,
"loss": 0.1427,
"step": 7840
},
{
"epoch": 2.719210116057509,
"grad_norm": 0.13471370194895713,
"learning_rate": 1.327151684199221e-06,
"loss": 0.1442,
"step": 7850
},
{
"epoch": 2.7226745193140482,
"grad_norm": 0.12907986475959193,
"learning_rate": 1.2949479505737494e-06,
"loss": 0.1455,
"step": 7860
},
{
"epoch": 2.726138922570587,
"grad_norm": 0.12566383969973927,
"learning_rate": 1.263129357517301e-06,
"loss": 0.1463,
"step": 7870
},
{
"epoch": 2.729603325827126,
"grad_norm": 0.1195570593454166,
"learning_rate": 1.2316964219927119e-06,
"loss": 0.1422,
"step": 7880
},
{
"epoch": 2.7330677290836656,
"grad_norm": 0.12438741593202014,
"learning_rate": 1.2006496546969642e-06,
"loss": 0.1506,
"step": 7890
},
{
"epoch": 2.7365321323402045,
"grad_norm": 0.13311421492284156,
"learning_rate": 1.1699895600529087e-06,
"loss": 0.1484,
"step": 7900
},
{
"epoch": 2.7399965355967435,
"grad_norm": 0.13353566400942274,
"learning_rate": 1.1397166362010243e-06,
"loss": 0.1472,
"step": 7910
},
{
"epoch": 2.7434609388532825,
"grad_norm": 0.12303792368683632,
"learning_rate": 1.109831374991377e-06,
"loss": 0.1455,
"step": 7920
},
{
"epoch": 2.7469253421098214,
"grad_norm": 0.1164932218834871,
"learning_rate": 1.080334261975577e-06,
"loss": 0.1456,
"step": 7930
},
{
"epoch": 2.750389745366361,
"grad_norm": 0.12413166278304175,
"learning_rate": 1.051225776398937e-06,
"loss": 0.1475,
"step": 7940
},
{
"epoch": 2.7538541486229,
"grad_norm": 0.11917463944574379,
"learning_rate": 1.0225063911926597e-06,
"loss": 0.1473,
"step": 7950
},
{
"epoch": 2.7573185518794388,
"grad_norm": 0.21002619780579418,
"learning_rate": 9.94176572966149e-07,
"loss": 0.1499,
"step": 7960
},
{
"epoch": 2.7607829551359777,
"grad_norm": 0.12190061305916731,
"learning_rate": 9.662367819994467e-07,
"loss": 0.1448,
"step": 7970
},
{
"epoch": 2.7642473583925167,
"grad_norm": 0.11828260636790151,
"learning_rate": 9.386874722357469e-07,
"loss": 0.1492,
"step": 7980
},
{
"epoch": 2.767711761649056,
"grad_norm": 0.11503639232862127,
"learning_rate": 9.115290912740132e-07,
"loss": 0.1448,
"step": 7990
},
{
"epoch": 2.771176164905595,
"grad_norm": 0.1189533885403222,
"learning_rate": 8.847620803617257e-07,
"loss": 0.1467,
"step": 8000
},
{
"epoch": 2.774640568162134,
"grad_norm": 0.12701672502597688,
"learning_rate": 8.583868743876844e-07,
"loss": 0.1462,
"step": 8010
},
{
"epoch": 2.7781049714186734,
"grad_norm": 0.1220861131545684,
"learning_rate": 8.324039018749674e-07,
"loss": 0.1453,
"step": 8020
},
{
"epoch": 2.781569374675212,
"grad_norm": 0.12031457421341588,
"learning_rate": 8.068135849739617e-07,
"loss": 0.1437,
"step": 8030
},
{
"epoch": 2.7850337779317513,
"grad_norm": 0.12017875343248997,
"learning_rate": 7.816163394554932e-07,
"loss": 0.1478,
"step": 8040
},
{
"epoch": 2.7884981811882903,
"grad_norm": 0.1192917350903116,
"learning_rate": 7.56812574704091e-07,
"loss": 0.1436,
"step": 8050
},
{
"epoch": 2.7919625844448293,
"grad_norm": 0.11524716943825414,
"learning_rate": 7.32402693711326e-07,
"loss": 0.1451,
"step": 8060
},
{
"epoch": 2.7954269877013687,
"grad_norm": 0.12406089861167721,
"learning_rate": 7.083870930692516e-07,
"loss": 0.1459,
"step": 8070
},
{
"epoch": 2.7988913909579076,
"grad_norm": 0.12315834818935076,
"learning_rate": 6.847661629639873e-07,
"loss": 0.1451,
"step": 8080
},
{
"epoch": 2.8023557942144466,
"grad_norm": 0.11732007572900043,
"learning_rate": 6.615402871693487e-07,
"loss": 0.1473,
"step": 8090
},
{
"epoch": 2.8058201974709855,
"grad_norm": 0.11711430732640442,
"learning_rate": 6.387098430406441e-07,
"loss": 0.1481,
"step": 8100
},
{
"epoch": 2.8092846007275245,
"grad_norm": 0.11662597413638771,
"learning_rate": 6.162752015085122e-07,
"loss": 0.1462,
"step": 8110
},
{
"epoch": 2.812749003984064,
"grad_norm": 0.11644716676700212,
"learning_rate": 5.942367270729165e-07,
"loss": 0.1454,
"step": 8120
},
{
"epoch": 2.816213407240603,
"grad_norm": 0.1210826937584845,
"learning_rate": 5.725947777972224e-07,
"loss": 0.1482,
"step": 8130
},
{
"epoch": 2.819677810497142,
"grad_norm": 0.11717664884081884,
"learning_rate": 5.513497053023647e-07,
"loss": 0.1446,
"step": 8140
},
{
"epoch": 2.823142213753681,
"grad_norm": 0.11782433267361038,
"learning_rate": 5.305018547611451e-07,
"loss": 0.1479,
"step": 8150
},
{
"epoch": 2.8266066170102198,
"grad_norm": 0.12844697710634922,
"learning_rate": 5.100515648926329e-07,
"loss": 0.1457,
"step": 8160
},
{
"epoch": 2.830071020266759,
"grad_norm": 0.1259382993927401,
"learning_rate": 4.899991679566423e-07,
"loss": 0.1482,
"step": 8170
},
{
"epoch": 2.833535423523298,
"grad_norm": 0.12112661112999973,
"learning_rate": 4.703449897483503e-07,
"loss": 0.1468,
"step": 8180
},
{
"epoch": 2.836999826779837,
"grad_norm": 0.11907259047343774,
"learning_rate": 4.5108934959299243e-07,
"loss": 0.1412,
"step": 8190
},
{
"epoch": 2.8404642300363765,
"grad_norm": 0.11046949957522073,
"learning_rate": 4.322325603406813e-07,
"loss": 0.1457,
"step": 8200
},
{
"epoch": 2.8439286332929155,
"grad_norm": 0.11728964170089871,
"learning_rate": 4.137749283613268e-07,
"loss": 0.148,
"step": 8210
},
{
"epoch": 2.8473930365494544,
"grad_norm": 0.12458638671812242,
"learning_rate": 3.9571675353964053e-07,
"loss": 0.1477,
"step": 8220
},
{
"epoch": 2.8508574398059934,
"grad_norm": 0.12603956939519112,
"learning_rate": 3.780583292702894e-07,
"loss": 0.1431,
"step": 8230
},
{
"epoch": 2.8543218430625323,
"grad_norm": 0.11737849831673626,
"learning_rate": 3.607999424531078e-07,
"loss": 0.1459,
"step": 8240
},
{
"epoch": 2.8577862463190717,
"grad_norm": 0.11861750859649005,
"learning_rate": 3.4394187348844866e-07,
"loss": 0.1484,
"step": 8250
},
{
"epoch": 2.8612506495756107,
"grad_norm": 0.12007382110446226,
"learning_rate": 3.274843962726204e-07,
"loss": 0.1461,
"step": 8260
},
{
"epoch": 2.8647150528321497,
"grad_norm": 0.1114021812553446,
"learning_rate": 3.114277781934433e-07,
"loss": 0.1408,
"step": 8270
},
{
"epoch": 2.8681794560886886,
"grad_norm": 0.12459777820858844,
"learning_rate": 2.957722801258944e-07,
"loss": 0.1469,
"step": 8280
},
{
"epoch": 2.8716438593452276,
"grad_norm": 0.12059709971421617,
"learning_rate": 2.805181564278864e-07,
"loss": 0.1476,
"step": 8290
},
{
"epoch": 2.875108262601767,
"grad_norm": 0.11316078775210853,
"learning_rate": 2.6566565493611475e-07,
"loss": 0.1447,
"step": 8300
},
{
"epoch": 2.878572665858306,
"grad_norm": 0.12343927697834793,
"learning_rate": 2.512150169620503e-07,
"loss": 0.1499,
"step": 8310
},
{
"epoch": 2.882037069114845,
"grad_norm": 0.1264438568086182,
"learning_rate": 2.371664772880061e-07,
"loss": 0.1475,
"step": 8320
},
{
"epoch": 2.885501472371384,
"grad_norm": 0.11834929752982455,
"learning_rate": 2.2352026416331829e-07,
"loss": 0.1484,
"step": 8330
},
{
"epoch": 2.888965875627923,
"grad_norm": 0.12074802080378956,
"learning_rate": 2.1027659930066e-07,
"loss": 0.1464,
"step": 8340
},
{
"epoch": 2.8924302788844622,
"grad_norm": 0.11880030905569529,
"learning_rate": 1.97435697872414e-07,
"loss": 0.1465,
"step": 8350
},
{
"epoch": 2.895894682141001,
"grad_norm": 0.1286519420227983,
"learning_rate": 1.8499776850719463e-07,
"loss": 0.1466,
"step": 8360
},
{
"epoch": 2.89935908539754,
"grad_norm": 0.12408525205621782,
"learning_rate": 1.7296301328644516e-07,
"loss": 0.1467,
"step": 8370
},
{
"epoch": 2.9028234886540796,
"grad_norm": 0.11413939977206788,
"learning_rate": 1.613316277411625e-07,
"loss": 0.1432,
"step": 8380
},
{
"epoch": 2.9062878919106185,
"grad_norm": 0.12094926364512971,
"learning_rate": 1.5010380084871933e-07,
"loss": 0.1482,
"step": 8390
},
{
"epoch": 2.9097522951671575,
"grad_norm": 0.1173931691129058,
"learning_rate": 1.392797150297942e-07,
"loss": 0.1476,
"step": 8400
},
{
"epoch": 2.9132166984236965,
"grad_norm": 0.11719997871862918,
"learning_rate": 1.2885954614540175e-07,
"loss": 0.1463,
"step": 8410
},
{
"epoch": 2.9166811016802354,
"grad_norm": 0.11982420345633238,
"learning_rate": 1.1884346349404774e-07,
"loss": 0.1508,
"step": 8420
},
{
"epoch": 2.920145504936775,
"grad_norm": 0.12475530168973786,
"learning_rate": 1.0923162980896185e-07,
"loss": 0.15,
"step": 8430
},
{
"epoch": 2.923609908193314,
"grad_norm": 0.11892325463016702,
"learning_rate": 1.000242012554664e-07,
"loss": 0.1476,
"step": 8440
},
{
"epoch": 2.9270743114498528,
"grad_norm": 0.1192790735851098,
"learning_rate": 9.122132742843681e-08,
"loss": 0.1454,
"step": 8450
},
{
"epoch": 2.9305387147063917,
"grad_norm": 0.1170936103181331,
"learning_rate": 8.28231513498673e-08,
"loss": 0.1482,
"step": 8460
},
{
"epoch": 2.9340031179629307,
"grad_norm": 0.11575536846669715,
"learning_rate": 7.48298094665506e-08,
"loss": 0.1482,
"step": 8470
},
{
"epoch": 2.93746752121947,
"grad_norm": 0.10817448466285368,
"learning_rate": 6.724143164785757e-08,
"loss": 0.1454,
"step": 8480
},
{
"epoch": 2.940931924476009,
"grad_norm": 0.11464554359428555,
"learning_rate": 6.005814118363317e-08,
"loss": 0.1438,
"step": 8490
},
{
"epoch": 2.944396327732548,
"grad_norm": 0.12385487233495297,
"learning_rate": 5.328005478218989e-08,
"loss": 0.1463,
"step": 8500
},
{
"epoch": 2.947860730989087,
"grad_norm": 0.11878438059832382,
"learning_rate": 4.69072825684036e-08,
"loss": 0.1464,
"step": 8510
},
{
"epoch": 2.951325134245626,
"grad_norm": 0.11382553656848247,
"learning_rate": 4.093992808194558e-08,
"loss": 0.1427,
"step": 8520
},
{
"epoch": 2.9547895375021653,
"grad_norm": 0.1221402366685275,
"learning_rate": 3.537808827557276e-08,
"loss": 0.1469,
"step": 8530
},
{
"epoch": 2.9582539407587043,
"grad_norm": 0.11836429164824389,
"learning_rate": 3.0221853513576207e-08,
"loss": 0.1471,
"step": 8540
},
{
"epoch": 2.9617183440152433,
"grad_norm": 0.11592227533291538,
"learning_rate": 2.5471307570298918e-08,
"loss": 0.1436,
"step": 8550
},
{
"epoch": 2.9651827472717827,
"grad_norm": 0.12005026306688542,
"learning_rate": 2.112652762878142e-08,
"loss": 0.1466,
"step": 8560
},
{
"epoch": 2.9686471505283216,
"grad_norm": 0.12062875615569613,
"learning_rate": 1.71875842795044e-08,
"loss": 0.1457,
"step": 8570
},
{
"epoch": 2.9721115537848606,
"grad_norm": 0.12033280556169802,
"learning_rate": 1.3654541519242392e-08,
"loss": 0.1437,
"step": 8580
},
{
"epoch": 2.9755759570413995,
"grad_norm": 0.12337008384975914,
"learning_rate": 1.0527456750025755e-08,
"loss": 0.1468,
"step": 8590
},
{
"epoch": 2.9790403602979385,
"grad_norm": 0.12167133579723895,
"learning_rate": 7.80638077820528e-09,
"loss": 0.1519,
"step": 8600
},
{
"epoch": 2.982504763554478,
"grad_norm": 0.11751314218922883,
"learning_rate": 5.491357813627862e-09,
"loss": 0.1494,
"step": 8610
},
{
"epoch": 2.985969166811017,
"grad_norm": 0.12293930219223904,
"learning_rate": 3.582425468920403e-09,
"loss": 0.1483,
"step": 8620
},
{
"epoch": 2.989433570067556,
"grad_norm": 0.12371328514211692,
"learning_rate": 2.0796147588791894e-09,
"loss": 0.1446,
"step": 8630
},
{
"epoch": 2.992897973324095,
"grad_norm": 0.1283635993974664,
"learning_rate": 9.829500999564144e-10,
"loss": 0.148,
"step": 8640
},
{
"epoch": 2.9963623765806338,
"grad_norm": 0.11829545701564251,
"learning_rate": 2.924493098743764e-10,
"loss": 0.1419,
"step": 8650
},
{
"epoch": 2.999826779837173,
"grad_norm": 0.1191095821300158,
"learning_rate": 8.123607339594053e-12,
"loss": 0.1497,
"step": 8660
},
{
"epoch": 3.0,
"step": 8661,
"total_flos": 1.6970304064115966e+19,
"train_loss": 0.20868314632318158,
"train_runtime": 603947.8558,
"train_samples_per_second": 0.459,
"train_steps_per_second": 0.014
}
],
"logging_steps": 10,
"max_steps": 8661,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6970304064115966e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}