diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6105 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 8661, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0034644032565390613, + "grad_norm": 24.105030711898273, + "learning_rate": 5.190311418685121e-07, + "loss": 2.8149, + "step": 10 + }, + { + "epoch": 0.0069288065130781226, + "grad_norm": 13.101411602211778, + "learning_rate": 1.0957324106113035e-06, + "loss": 2.6533, + "step": 20 + }, + { + "epoch": 0.010393209769617183, + "grad_norm": 11.8076022930929, + "learning_rate": 1.6724336793540945e-06, + "loss": 2.4007, + "step": 30 + }, + { + "epoch": 0.013857613026156245, + "grad_norm": 4.7580336362411515, + "learning_rate": 2.249134948096886e-06, + "loss": 1.8865, + "step": 40 + }, + { + "epoch": 0.017322016282695307, + "grad_norm": 2.9114980071240395, + "learning_rate": 2.825836216839677e-06, + "loss": 1.4207, + "step": 50 + }, + { + "epoch": 0.020786419539234366, + "grad_norm": 1.7324565160527228, + "learning_rate": 3.402537485582469e-06, + "loss": 1.0113, + "step": 60 + }, + { + "epoch": 0.024250822795773428, + "grad_norm": 2.1155590344472155, + "learning_rate": 3.9792387543252595e-06, + "loss": 0.7313, + "step": 70 + }, + { + "epoch": 0.02771522605231249, + "grad_norm": 0.6681966791277029, + "learning_rate": 4.555940023068051e-06, + "loss": 0.5804, + "step": 80 + }, + { + "epoch": 0.03117962930885155, + "grad_norm": 2.046887683182866, + "learning_rate": 5.132641291810842e-06, + "loss": 0.5134, + "step": 90 + }, + { + "epoch": 0.034644032565390614, + "grad_norm": 1.0043907474739053, + "learning_rate": 5.709342560553633e-06, + "loss": 0.4659, + "step": 100 + }, + { + "epoch": 0.03810843582192967, + "grad_norm": 0.6198895627168727, + "learning_rate": 6.286043829296424e-06, + "loss": 0.4432, + "step": 110 + }, + { + "epoch": 0.04157283907846873, + "grad_norm": 0.5789693949791498, + "learning_rate": 6.862745098039216e-06, + "loss": 0.4208, + "step": 120 + }, + { + "epoch": 0.0450372423350078, + "grad_norm": 0.5151956509234089, + "learning_rate": 7.439446366782007e-06, + "loss": 0.4103, + "step": 130 + }, + { + "epoch": 0.048501645591546856, + "grad_norm": 0.6554047963560565, + "learning_rate": 8.016147635524798e-06, + "loss": 0.3938, + "step": 140 + }, + { + "epoch": 0.051966048848085915, + "grad_norm": 0.504860048364916, + "learning_rate": 8.592848904267588e-06, + "loss": 0.384, + "step": 150 + }, + { + "epoch": 0.05543045210462498, + "grad_norm": 0.6045057320065914, + "learning_rate": 9.169550173010382e-06, + "loss": 0.3781, + "step": 160 + }, + { + "epoch": 0.05889485536116404, + "grad_norm": 1.055910114221381, + "learning_rate": 9.746251441753172e-06, + "loss": 0.3695, + "step": 170 + }, + { + "epoch": 0.0623592586177031, + "grad_norm": 0.462726140305608, + "learning_rate": 1.0322952710495964e-05, + "loss": 0.3577, + "step": 180 + }, + { + "epoch": 0.06582366187424216, + "grad_norm": 0.4950470445818375, + "learning_rate": 1.0899653979238756e-05, + "loss": 0.3593, + "step": 190 + }, + { + "epoch": 0.06928806513078123, + "grad_norm": 0.4011301599299419, + "learning_rate": 1.1476355247981546e-05, + "loss": 0.3469, + "step": 200 + }, + { + "epoch": 0.07275246838732029, + "grad_norm": 1.5395956991225512, + "learning_rate": 1.2053056516724338e-05, + "loss": 0.3466, + "step": 210 + }, + { + "epoch": 0.07621687164385935, + "grad_norm": 0.4782287164298744, + "learning_rate": 1.2629757785467128e-05, + "loss": 0.3384, + "step": 220 + }, + { + "epoch": 0.0796812749003984, + "grad_norm": 0.4264649009974007, + "learning_rate": 1.3206459054209918e-05, + "loss": 0.335, + "step": 230 + }, + { + "epoch": 0.08314567815693746, + "grad_norm": 0.5500552360743377, + "learning_rate": 1.3783160322952712e-05, + "loss": 0.3294, + "step": 240 + }, + { + "epoch": 0.08661008141347652, + "grad_norm": 0.5287918177337965, + "learning_rate": 1.4359861591695503e-05, + "loss": 0.328, + "step": 250 + }, + { + "epoch": 0.0900744846700156, + "grad_norm": 0.4510077521972178, + "learning_rate": 1.4936562860438294e-05, + "loss": 0.3294, + "step": 260 + }, + { + "epoch": 0.09353888792655465, + "grad_norm": 0.47410104891468646, + "learning_rate": 1.5513264129181084e-05, + "loss": 0.322, + "step": 270 + }, + { + "epoch": 0.09700329118309371, + "grad_norm": 0.5064048354634346, + "learning_rate": 1.6089965397923876e-05, + "loss": 0.3222, + "step": 280 + }, + { + "epoch": 0.10046769443963277, + "grad_norm": 0.5163925193924696, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.3104, + "step": 290 + }, + { + "epoch": 0.10393209769617183, + "grad_norm": 0.5854453216694896, + "learning_rate": 1.7243367935409456e-05, + "loss": 0.3108, + "step": 300 + }, + { + "epoch": 0.1073965009527109, + "grad_norm": 0.47989721671414387, + "learning_rate": 1.782006920415225e-05, + "loss": 0.3122, + "step": 310 + }, + { + "epoch": 0.11086090420924996, + "grad_norm": 0.4198890022476431, + "learning_rate": 1.8396770472895043e-05, + "loss": 0.3013, + "step": 320 + }, + { + "epoch": 0.11432530746578902, + "grad_norm": 0.35484886374124996, + "learning_rate": 1.897347174163783e-05, + "loss": 0.308, + "step": 330 + }, + { + "epoch": 0.11778971072232808, + "grad_norm": 0.6798994381335608, + "learning_rate": 1.9550173010380623e-05, + "loss": 0.3017, + "step": 340 + }, + { + "epoch": 0.12125411397886714, + "grad_norm": 0.5104370022480478, + "learning_rate": 2.0126874279123415e-05, + "loss": 0.3009, + "step": 350 + }, + { + "epoch": 0.1247185172354062, + "grad_norm": 0.43181121899772806, + "learning_rate": 2.0703575547866204e-05, + "loss": 0.2984, + "step": 360 + }, + { + "epoch": 0.12818292049194527, + "grad_norm": 0.44631400797502857, + "learning_rate": 2.1280276816609e-05, + "loss": 0.3011, + "step": 370 + }, + { + "epoch": 0.1316473237484843, + "grad_norm": 1.3618600184569662, + "learning_rate": 2.185697808535179e-05, + "loss": 0.3028, + "step": 380 + }, + { + "epoch": 0.13511172700502339, + "grad_norm": 0.4288866277270222, + "learning_rate": 2.243367935409458e-05, + "loss": 0.2966, + "step": 390 + }, + { + "epoch": 0.13857613026156246, + "grad_norm": 0.43831756556428697, + "learning_rate": 2.301038062283737e-05, + "loss": 0.2916, + "step": 400 + }, + { + "epoch": 0.1420405335181015, + "grad_norm": 0.422949206010911, + "learning_rate": 2.3587081891580163e-05, + "loss": 0.2979, + "step": 410 + }, + { + "epoch": 0.14550493677464058, + "grad_norm": 0.5022509237000374, + "learning_rate": 2.4163783160322955e-05, + "loss": 0.2895, + "step": 420 + }, + { + "epoch": 0.14896934003117962, + "grad_norm": 0.49397467400296585, + "learning_rate": 2.4740484429065743e-05, + "loss": 0.2898, + "step": 430 + }, + { + "epoch": 0.1524337432877187, + "grad_norm": 0.3494967743075288, + "learning_rate": 2.531718569780854e-05, + "loss": 0.279, + "step": 440 + }, + { + "epoch": 0.15589814654425777, + "grad_norm": 0.45661458601823307, + "learning_rate": 2.5893886966551327e-05, + "loss": 0.291, + "step": 450 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.5438985700428273, + "learning_rate": 2.647058823529412e-05, + "loss": 0.29, + "step": 460 + }, + { + "epoch": 0.16282695305733588, + "grad_norm": 0.4788529828590826, + "learning_rate": 2.704728950403691e-05, + "loss": 0.2866, + "step": 470 + }, + { + "epoch": 0.16629135631387493, + "grad_norm": 0.4154664374547961, + "learning_rate": 2.7623990772779702e-05, + "loss": 0.2825, + "step": 480 + }, + { + "epoch": 0.169755759570414, + "grad_norm": 0.45665598437011556, + "learning_rate": 2.820069204152249e-05, + "loss": 0.2839, + "step": 490 + }, + { + "epoch": 0.17322016282695304, + "grad_norm": 0.5602183672259513, + "learning_rate": 2.8777393310265283e-05, + "loss": 0.285, + "step": 500 + }, + { + "epoch": 0.17668456608349212, + "grad_norm": 0.4595470907703161, + "learning_rate": 2.9354094579008075e-05, + "loss": 0.2845, + "step": 510 + }, + { + "epoch": 0.1801489693400312, + "grad_norm": 0.35972117453020297, + "learning_rate": 2.9930795847750863e-05, + "loss": 0.2772, + "step": 520 + }, + { + "epoch": 0.18361337259657023, + "grad_norm": 0.4722911335764227, + "learning_rate": 3.0507497116493655e-05, + "loss": 0.2774, + "step": 530 + }, + { + "epoch": 0.1870777758531093, + "grad_norm": 0.4212365397286255, + "learning_rate": 3.108419838523645e-05, + "loss": 0.2778, + "step": 540 + }, + { + "epoch": 0.19054217910964835, + "grad_norm": 0.4437718583316477, + "learning_rate": 3.166089965397924e-05, + "loss": 0.2763, + "step": 550 + }, + { + "epoch": 0.19400658236618742, + "grad_norm": 0.5795580292549793, + "learning_rate": 3.2237600922722034e-05, + "loss": 0.2777, + "step": 560 + }, + { + "epoch": 0.1974709856227265, + "grad_norm": 0.38801773200166495, + "learning_rate": 3.2814302191464826e-05, + "loss": 0.2782, + "step": 570 + }, + { + "epoch": 0.20093538887926554, + "grad_norm": 0.39386050551197965, + "learning_rate": 3.339100346020762e-05, + "loss": 0.2756, + "step": 580 + }, + { + "epoch": 0.20439979213580461, + "grad_norm": 0.4094962883057876, + "learning_rate": 3.39677047289504e-05, + "loss": 0.2725, + "step": 590 + }, + { + "epoch": 0.20786419539234366, + "grad_norm": 0.46243563531324183, + "learning_rate": 3.4544405997693194e-05, + "loss": 0.2679, + "step": 600 + }, + { + "epoch": 0.21132859864888273, + "grad_norm": 0.44066593111685165, + "learning_rate": 3.5121107266435986e-05, + "loss": 0.2785, + "step": 610 + }, + { + "epoch": 0.2147930019054218, + "grad_norm": 0.3559776358805474, + "learning_rate": 3.569780853517878e-05, + "loss": 0.2717, + "step": 620 + }, + { + "epoch": 0.21825740516196085, + "grad_norm": 0.46610388128285485, + "learning_rate": 3.627450980392157e-05, + "loss": 0.2697, + "step": 630 + }, + { + "epoch": 0.22172180841849992, + "grad_norm": 0.40480766460689316, + "learning_rate": 3.685121107266436e-05, + "loss": 0.2703, + "step": 640 + }, + { + "epoch": 0.22518621167503897, + "grad_norm": 0.4307168132201555, + "learning_rate": 3.7427912341407154e-05, + "loss": 0.2732, + "step": 650 + }, + { + "epoch": 0.22865061493157804, + "grad_norm": 0.4259888241847833, + "learning_rate": 3.800461361014994e-05, + "loss": 0.2701, + "step": 660 + }, + { + "epoch": 0.23211501818811708, + "grad_norm": 0.45912634835480254, + "learning_rate": 3.858131487889274e-05, + "loss": 0.2715, + "step": 670 + }, + { + "epoch": 0.23557942144465616, + "grad_norm": 0.4028410357459808, + "learning_rate": 3.915801614763553e-05, + "loss": 0.2722, + "step": 680 + }, + { + "epoch": 0.23904382470119523, + "grad_norm": 0.48493013541150326, + "learning_rate": 3.973471741637832e-05, + "loss": 0.2671, + "step": 690 + }, + { + "epoch": 0.24250822795773427, + "grad_norm": 0.3565349550219391, + "learning_rate": 4.031141868512111e-05, + "loss": 0.2663, + "step": 700 + }, + { + "epoch": 0.24597263121427335, + "grad_norm": 0.32316152566585826, + "learning_rate": 4.0888119953863905e-05, + "loss": 0.2725, + "step": 710 + }, + { + "epoch": 0.2494370344708124, + "grad_norm": 0.5136690083745088, + "learning_rate": 4.146482122260669e-05, + "loss": 0.2655, + "step": 720 + }, + { + "epoch": 0.25290143772735146, + "grad_norm": 0.40785543102527044, + "learning_rate": 4.204152249134948e-05, + "loss": 0.2668, + "step": 730 + }, + { + "epoch": 0.25636584098389054, + "grad_norm": 0.38517456097463165, + "learning_rate": 4.2618223760092274e-05, + "loss": 0.257, + "step": 740 + }, + { + "epoch": 0.2598302442404296, + "grad_norm": 0.38202204582529214, + "learning_rate": 4.3194925028835065e-05, + "loss": 0.2657, + "step": 750 + }, + { + "epoch": 0.2632946474969686, + "grad_norm": 0.5057316578616624, + "learning_rate": 4.377162629757786e-05, + "loss": 0.2669, + "step": 760 + }, + { + "epoch": 0.2667590507535077, + "grad_norm": 0.4288116267424748, + "learning_rate": 4.434832756632065e-05, + "loss": 0.2636, + "step": 770 + }, + { + "epoch": 0.27022345401004677, + "grad_norm": 0.37093943217557107, + "learning_rate": 4.4925028835063434e-05, + "loss": 0.2647, + "step": 780 + }, + { + "epoch": 0.27368785726658584, + "grad_norm": 0.34353485608546885, + "learning_rate": 4.5501730103806226e-05, + "loss": 0.2598, + "step": 790 + }, + { + "epoch": 0.2771522605231249, + "grad_norm": 0.35239301626467756, + "learning_rate": 4.607843137254902e-05, + "loss": 0.2646, + "step": 800 + }, + { + "epoch": 0.28061666377966393, + "grad_norm": 0.3842430524450989, + "learning_rate": 4.6655132641291816e-05, + "loss": 0.2619, + "step": 810 + }, + { + "epoch": 0.284081067036203, + "grad_norm": 0.35682883446316815, + "learning_rate": 4.723183391003461e-05, + "loss": 0.2561, + "step": 820 + }, + { + "epoch": 0.2875454702927421, + "grad_norm": 0.3734875488865965, + "learning_rate": 4.78085351787774e-05, + "loss": 0.2584, + "step": 830 + }, + { + "epoch": 0.29100987354928115, + "grad_norm": 0.4315870557925161, + "learning_rate": 4.8385236447520185e-05, + "loss": 0.2613, + "step": 840 + }, + { + "epoch": 0.2944742768058202, + "grad_norm": 0.4103742391218009, + "learning_rate": 4.896193771626298e-05, + "loss": 0.2624, + "step": 850 + }, + { + "epoch": 0.29793868006235924, + "grad_norm": 0.3591092100235818, + "learning_rate": 4.953863898500577e-05, + "loss": 0.2597, + "step": 860 + }, + { + "epoch": 0.3014030833188983, + "grad_norm": 0.3623515286988262, + "learning_rate": 4.999999187639266e-05, + "loss": 0.2593, + "step": 870 + }, + { + "epoch": 0.3048674865754374, + "grad_norm": 0.3048594211509306, + "learning_rate": 4.999970755069012e-05, + "loss": 0.2601, + "step": 880 + }, + { + "epoch": 0.30833188983197646, + "grad_norm": 0.45002930011250586, + "learning_rate": 4.9999017049900046e-05, + "loss": 0.2543, + "step": 890 + }, + { + "epoch": 0.31179629308851553, + "grad_norm": 0.35358665328747896, + "learning_rate": 4.999792038524113e-05, + "loss": 0.2559, + "step": 900 + }, + { + "epoch": 0.31526069634505455, + "grad_norm": 0.2967473429143083, + "learning_rate": 4.9996417574531085e-05, + "loss": 0.2576, + "step": 910 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.38465662558696595, + "learning_rate": 4.9994508642186376e-05, + "loss": 0.2603, + "step": 920 + }, + { + "epoch": 0.3221895028581327, + "grad_norm": 0.31492562032892685, + "learning_rate": 4.9992193619221796e-05, + "loss": 0.2576, + "step": 930 + }, + { + "epoch": 0.32565390611467177, + "grad_norm": 0.34729263239739333, + "learning_rate": 4.998947254324998e-05, + "loss": 0.2547, + "step": 940 + }, + { + "epoch": 0.3291183093712108, + "grad_norm": 0.3488064460618837, + "learning_rate": 4.998634545848076e-05, + "loss": 0.2515, + "step": 950 + }, + { + "epoch": 0.33258271262774985, + "grad_norm": 0.26471687398645055, + "learning_rate": 4.9982812415720496e-05, + "loss": 0.2574, + "step": 960 + }, + { + "epoch": 0.3360471158842889, + "grad_norm": 0.2644922688346079, + "learning_rate": 4.997887347237122e-05, + "loss": 0.2586, + "step": 970 + }, + { + "epoch": 0.339511519140828, + "grad_norm": 0.7989330166644754, + "learning_rate": 4.99745286924297e-05, + "loss": 0.2552, + "step": 980 + }, + { + "epoch": 0.3429759223973671, + "grad_norm": 0.26650336882677617, + "learning_rate": 4.9969778146486424e-05, + "loss": 0.2524, + "step": 990 + }, + { + "epoch": 0.3464403256539061, + "grad_norm": 0.2899936429270715, + "learning_rate": 4.996462191172443e-05, + "loss": 0.2543, + "step": 1000 + }, + { + "epoch": 0.34990472891044516, + "grad_norm": 0.26123609655286284, + "learning_rate": 4.9959060071918055e-05, + "loss": 0.2533, + "step": 1010 + }, + { + "epoch": 0.35336913216698423, + "grad_norm": 0.33570339771203644, + "learning_rate": 4.99530927174316e-05, + "loss": 0.2563, + "step": 1020 + }, + { + "epoch": 0.3568335354235233, + "grad_norm": 0.3014454205427117, + "learning_rate": 4.9946719945217814e-05, + "loss": 0.2499, + "step": 1030 + }, + { + "epoch": 0.3602979386800624, + "grad_norm": 0.27325651600933015, + "learning_rate": 4.9939941858816366e-05, + "loss": 0.2511, + "step": 1040 + }, + { + "epoch": 0.3637623419366014, + "grad_norm": 0.32465788692548037, + "learning_rate": 4.9932758568352144e-05, + "loss": 0.2479, + "step": 1050 + }, + { + "epoch": 0.36722674519314047, + "grad_norm": 0.2652235360508281, + "learning_rate": 4.9925170190533454e-05, + "loss": 0.2517, + "step": 1060 + }, + { + "epoch": 0.37069114844967954, + "grad_norm": 0.3970874000599188, + "learning_rate": 4.991717684865014e-05, + "loss": 0.2476, + "step": 1070 + }, + { + "epoch": 0.3741555517062186, + "grad_norm": 0.2722867182843485, + "learning_rate": 4.990877867257157e-05, + "loss": 0.2529, + "step": 1080 + }, + { + "epoch": 0.3776199549627577, + "grad_norm": 0.2619052903321827, + "learning_rate": 4.989997579874454e-05, + "loss": 0.2469, + "step": 1090 + }, + { + "epoch": 0.3810843582192967, + "grad_norm": 0.2955918646015672, + "learning_rate": 4.9890768370191046e-05, + "loss": 0.2502, + "step": 1100 + }, + { + "epoch": 0.3845487614758358, + "grad_norm": 0.387831684494149, + "learning_rate": 4.988115653650596e-05, + "loss": 0.2425, + "step": 1110 + }, + { + "epoch": 0.38801316473237485, + "grad_norm": 0.32722861574894646, + "learning_rate": 4.98711404538546e-05, + "loss": 0.248, + "step": 1120 + }, + { + "epoch": 0.3914775679889139, + "grad_norm": 0.29851671037489946, + "learning_rate": 4.986072028497021e-05, + "loss": 0.2477, + "step": 1130 + }, + { + "epoch": 0.394941971245453, + "grad_norm": 0.32504507070930905, + "learning_rate": 4.984989619915128e-05, + "loss": 0.2483, + "step": 1140 + }, + { + "epoch": 0.398406374501992, + "grad_norm": 0.2992367844222804, + "learning_rate": 4.9838668372258844e-05, + "loss": 0.2434, + "step": 1150 + }, + { + "epoch": 0.4018707777585311, + "grad_norm": 0.35335395639024886, + "learning_rate": 4.982703698671356e-05, + "loss": 0.2515, + "step": 1160 + }, + { + "epoch": 0.40533518101507016, + "grad_norm": 0.2724105498054968, + "learning_rate": 4.9815002231492806e-05, + "loss": 0.2422, + "step": 1170 + }, + { + "epoch": 0.40879958427160923, + "grad_norm": 0.26081802141731797, + "learning_rate": 4.9802564302127584e-05, + "loss": 0.2477, + "step": 1180 + }, + { + "epoch": 0.4122639875281483, + "grad_norm": 0.26509078283121335, + "learning_rate": 4.978972340069934e-05, + "loss": 0.2428, + "step": 1190 + }, + { + "epoch": 0.4157283907846873, + "grad_norm": 0.3268402808784247, + "learning_rate": 4.977647973583669e-05, + "loss": 0.245, + "step": 1200 + }, + { + "epoch": 0.4191927940412264, + "grad_norm": 0.26931679508398965, + "learning_rate": 4.9762833522712e-05, + "loss": 0.2461, + "step": 1210 + }, + { + "epoch": 0.42265719729776546, + "grad_norm": 0.28796254136886007, + "learning_rate": 4.9748784983037955e-05, + "loss": 0.2464, + "step": 1220 + }, + { + "epoch": 0.42612160055430454, + "grad_norm": 0.2903443309315686, + "learning_rate": 4.9734334345063884e-05, + "loss": 0.2462, + "step": 1230 + }, + { + "epoch": 0.4295860038108436, + "grad_norm": 0.2474488914617406, + "learning_rate": 4.971948184357211e-05, + "loss": 0.241, + "step": 1240 + }, + { + "epoch": 0.4330504070673826, + "grad_norm": 0.255276664365212, + "learning_rate": 4.970422771987411e-05, + "loss": 0.239, + "step": 1250 + }, + { + "epoch": 0.4365148103239217, + "grad_norm": 0.24990399483865394, + "learning_rate": 4.968857222180656e-05, + "loss": 0.2466, + "step": 1260 + }, + { + "epoch": 0.43997921358046077, + "grad_norm": 0.35045726721574383, + "learning_rate": 4.9672515603727385e-05, + "loss": 0.2423, + "step": 1270 + }, + { + "epoch": 0.44344361683699984, + "grad_norm": 0.24435346062621818, + "learning_rate": 4.965605812651155e-05, + "loss": 0.2407, + "step": 1280 + }, + { + "epoch": 0.44690802009353886, + "grad_norm": 0.24411085271480595, + "learning_rate": 4.96392000575469e-05, + "loss": 0.2381, + "step": 1290 + }, + { + "epoch": 0.45037242335007793, + "grad_norm": 0.2774791617724539, + "learning_rate": 4.962194167072971e-05, + "loss": 0.2397, + "step": 1300 + }, + { + "epoch": 0.453836826606617, + "grad_norm": 0.33768453163385515, + "learning_rate": 4.960428324646036e-05, + "loss": 0.2391, + "step": 1310 + }, + { + "epoch": 0.4573012298631561, + "grad_norm": 0.26599849394188135, + "learning_rate": 4.958622507163868e-05, + "loss": 0.2372, + "step": 1320 + }, + { + "epoch": 0.46076563311969515, + "grad_norm": 0.34654816144848893, + "learning_rate": 4.9567767439659315e-05, + "loss": 0.2405, + "step": 1330 + }, + { + "epoch": 0.46423003637623417, + "grad_norm": 0.2964452922825174, + "learning_rate": 4.954891065040701e-05, + "loss": 0.2424, + "step": 1340 + }, + { + "epoch": 0.46769443963277324, + "grad_norm": 0.26622673063396735, + "learning_rate": 4.952965501025165e-05, + "loss": 0.2396, + "step": 1350 + }, + { + "epoch": 0.4711588428893123, + "grad_norm": 0.25967043414851687, + "learning_rate": 4.9510000832043356e-05, + "loss": 0.2421, + "step": 1360 + }, + { + "epoch": 0.4746232461458514, + "grad_norm": 0.22164309133223403, + "learning_rate": 4.948994843510737e-05, + "loss": 0.2429, + "step": 1370 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.3316472047464383, + "learning_rate": 4.9469498145238855e-05, + "loss": 0.2426, + "step": 1380 + }, + { + "epoch": 0.4815520526589295, + "grad_norm": 0.3356169844309959, + "learning_rate": 4.944865029469764e-05, + "loss": 0.2355, + "step": 1390 + }, + { + "epoch": 0.48501645591546855, + "grad_norm": 0.268486815716583, + "learning_rate": 4.9427405222202784e-05, + "loss": 0.2368, + "step": 1400 + }, + { + "epoch": 0.4884808591720076, + "grad_norm": 0.2334743696157166, + "learning_rate": 4.9405763272927086e-05, + "loss": 0.2439, + "step": 1410 + }, + { + "epoch": 0.4919452624285467, + "grad_norm": 0.2957491864509931, + "learning_rate": 4.938372479849149e-05, + "loss": 0.237, + "step": 1420 + }, + { + "epoch": 0.49540966568508576, + "grad_norm": 0.26896125577295615, + "learning_rate": 4.936129015695936e-05, + "loss": 0.2354, + "step": 1430 + }, + { + "epoch": 0.4988740689416248, + "grad_norm": 0.25816020668305967, + "learning_rate": 4.9338459712830656e-05, + "loss": 0.2374, + "step": 1440 + }, + { + "epoch": 0.5023384721981639, + "grad_norm": 0.24981495367506942, + "learning_rate": 4.9315233837036016e-05, + "loss": 0.2332, + "step": 1450 + }, + { + "epoch": 0.5058028754547029, + "grad_norm": 0.2554391345730699, + "learning_rate": 4.9291612906930754e-05, + "loss": 0.2383, + "step": 1460 + }, + { + "epoch": 0.509267278711242, + "grad_norm": 0.26653068047488826, + "learning_rate": 4.926759730628868e-05, + "loss": 0.2411, + "step": 1470 + }, + { + "epoch": 0.5127316819677811, + "grad_norm": 0.2402237748204006, + "learning_rate": 4.9243187425295915e-05, + "loss": 0.2332, + "step": 1480 + }, + { + "epoch": 0.5161960852243201, + "grad_norm": 0.24413036670385513, + "learning_rate": 4.921838366054451e-05, + "loss": 0.2396, + "step": 1490 + }, + { + "epoch": 0.5196604884808592, + "grad_norm": 0.22909060071304008, + "learning_rate": 4.919318641502604e-05, + "loss": 0.2349, + "step": 1500 + }, + { + "epoch": 0.5231248917373982, + "grad_norm": 0.22701914207506868, + "learning_rate": 4.9167596098125036e-05, + "loss": 0.2324, + "step": 1510 + }, + { + "epoch": 0.5265892949939373, + "grad_norm": 0.25285812755375375, + "learning_rate": 4.9141613125612316e-05, + "loss": 0.2361, + "step": 1520 + }, + { + "epoch": 0.5300536982504763, + "grad_norm": 0.23531009106142328, + "learning_rate": 4.911523791963828e-05, + "loss": 0.2389, + "step": 1530 + }, + { + "epoch": 0.5335181015070154, + "grad_norm": 0.23036998762323121, + "learning_rate": 4.908847090872599e-05, + "loss": 0.2349, + "step": 1540 + }, + { + "epoch": 0.5369825047635545, + "grad_norm": 0.2771142315120943, + "learning_rate": 4.906131252776426e-05, + "loss": 0.2384, + "step": 1550 + }, + { + "epoch": 0.5404469080200935, + "grad_norm": 0.22571300533030822, + "learning_rate": 4.9033763218000555e-05, + "loss": 0.2307, + "step": 1560 + }, + { + "epoch": 0.5439113112766326, + "grad_norm": 0.2351873545158354, + "learning_rate": 4.9005823427033856e-05, + "loss": 0.2353, + "step": 1570 + }, + { + "epoch": 0.5473757145331717, + "grad_norm": 0.2540211756984853, + "learning_rate": 4.897749360880735e-05, + "loss": 0.2324, + "step": 1580 + }, + { + "epoch": 0.5508401177897108, + "grad_norm": 0.25457759095229565, + "learning_rate": 4.894877422360106e-05, + "loss": 0.233, + "step": 1590 + }, + { + "epoch": 0.5543045210462498, + "grad_norm": 0.270030962078855, + "learning_rate": 4.8919665738024424e-05, + "loss": 0.2415, + "step": 1600 + }, + { + "epoch": 0.5577689243027888, + "grad_norm": 0.2570316973112016, + "learning_rate": 4.8890168625008624e-05, + "loss": 0.2342, + "step": 1610 + }, + { + "epoch": 0.5612333275593279, + "grad_norm": 0.27096347274503246, + "learning_rate": 4.8860283363798974e-05, + "loss": 0.2279, + "step": 1620 + }, + { + "epoch": 0.5646977308158669, + "grad_norm": 0.24193443897499534, + "learning_rate": 4.8830010439947096e-05, + "loss": 0.2337, + "step": 1630 + }, + { + "epoch": 0.568162134072406, + "grad_norm": 0.29770893374153723, + "learning_rate": 4.879935034530304e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.5716265373289451, + "grad_norm": 0.25131732918770927, + "learning_rate": 4.876830357800729e-05, + "loss": 0.2294, + "step": 1650 + }, + { + "epoch": 0.5750909405854842, + "grad_norm": 0.35319194823747296, + "learning_rate": 4.87368706424827e-05, + "loss": 0.231, + "step": 1660 + }, + { + "epoch": 0.5785553438420232, + "grad_norm": 0.23506203264124303, + "learning_rate": 4.8705052049426254e-05, + "loss": 0.2353, + "step": 1670 + }, + { + "epoch": 0.5820197470985623, + "grad_norm": 0.26231316656015746, + "learning_rate": 4.867284831580078e-05, + "loss": 0.2379, + "step": 1680 + }, + { + "epoch": 0.5854841503551014, + "grad_norm": 0.23941392513796939, + "learning_rate": 4.8640259964826584e-05, + "loss": 0.2308, + "step": 1690 + }, + { + "epoch": 0.5889485536116404, + "grad_norm": 0.2531735045781861, + "learning_rate": 4.860728752597291e-05, + "loss": 0.2315, + "step": 1700 + }, + { + "epoch": 0.5924129568681794, + "grad_norm": 0.23922451826735386, + "learning_rate": 4.8573931534949354e-05, + "loss": 0.2334, + "step": 1710 + }, + { + "epoch": 0.5958773601247185, + "grad_norm": 0.26345705856662766, + "learning_rate": 4.8540192533697155e-05, + "loss": 0.2326, + "step": 1720 + }, + { + "epoch": 0.5993417633812576, + "grad_norm": 0.25324175129413945, + "learning_rate": 4.85060710703804e-05, + "loss": 0.2333, + "step": 1730 + }, + { + "epoch": 0.6028061666377966, + "grad_norm": 0.25577386258261664, + "learning_rate": 4.84715676993771e-05, + "loss": 0.2362, + "step": 1740 + }, + { + "epoch": 0.6062705698943357, + "grad_norm": 0.27948607132517533, + "learning_rate": 4.843668298127022e-05, + "loss": 0.2304, + "step": 1750 + }, + { + "epoch": 0.6097349731508748, + "grad_norm": 0.2560173418476038, + "learning_rate": 4.840141748283851e-05, + "loss": 0.2362, + "step": 1760 + }, + { + "epoch": 0.6131993764074138, + "grad_norm": 0.24729226299066756, + "learning_rate": 4.8365771777047356e-05, + "loss": 0.2317, + "step": 1770 + }, + { + "epoch": 0.6166637796639529, + "grad_norm": 0.2568219765818277, + "learning_rate": 4.832974644303944e-05, + "loss": 0.2393, + "step": 1780 + }, + { + "epoch": 0.620128182920492, + "grad_norm": 0.2306059252401264, + "learning_rate": 4.829334206612534e-05, + "loss": 0.2367, + "step": 1790 + }, + { + "epoch": 0.6235925861770311, + "grad_norm": 0.31020261629615153, + "learning_rate": 4.8256559237774e-05, + "loss": 0.2326, + "step": 1800 + }, + { + "epoch": 0.62705698943357, + "grad_norm": 0.27321518814530843, + "learning_rate": 4.821939855560318e-05, + "loss": 0.2341, + "step": 1810 + }, + { + "epoch": 0.6305213926901091, + "grad_norm": 0.3177547595944014, + "learning_rate": 4.8181860623369646e-05, + "loss": 0.235, + "step": 1820 + }, + { + "epoch": 0.6339857959466482, + "grad_norm": 0.2631656385352305, + "learning_rate": 4.814394605095946e-05, + "loss": 0.2325, + "step": 1830 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.19501128079802074, + "learning_rate": 4.810565545437802e-05, + "loss": 0.2318, + "step": 1840 + }, + { + "epoch": 0.6409146024597263, + "grad_norm": 0.23539186170837376, + "learning_rate": 4.806698945574006e-05, + "loss": 0.2322, + "step": 1850 + }, + { + "epoch": 0.6443790057162654, + "grad_norm": 0.2474455157721665, + "learning_rate": 4.8027948683259546e-05, + "loss": 0.2319, + "step": 1860 + }, + { + "epoch": 0.6478434089728045, + "grad_norm": 0.21665268684721978, + "learning_rate": 4.798853377123948e-05, + "loss": 0.2277, + "step": 1870 + }, + { + "epoch": 0.6513078122293435, + "grad_norm": 0.24192469144287532, + "learning_rate": 4.794874536006152e-05, + "loss": 0.2263, + "step": 1880 + }, + { + "epoch": 0.6547722154858826, + "grad_norm": 0.2372551879205524, + "learning_rate": 4.790858409617573e-05, + "loss": 0.227, + "step": 1890 + }, + { + "epoch": 0.6582366187424216, + "grad_norm": 0.23445846730497275, + "learning_rate": 4.786805063208992e-05, + "loss": 0.2349, + "step": 1900 + }, + { + "epoch": 0.6617010219989606, + "grad_norm": 0.22188437526617993, + "learning_rate": 4.782714562635914e-05, + "loss": 0.2298, + "step": 1910 + }, + { + "epoch": 0.6651654252554997, + "grad_norm": 0.22756117335293902, + "learning_rate": 4.7785869743574915e-05, + "loss": 0.2245, + "step": 1920 + }, + { + "epoch": 0.6686298285120388, + "grad_norm": 0.20717649696487608, + "learning_rate": 4.7744223654354506e-05, + "loss": 0.2331, + "step": 1930 + }, + { + "epoch": 0.6720942317685779, + "grad_norm": 0.21788412171242105, + "learning_rate": 4.7702208035329996e-05, + "loss": 0.2207, + "step": 1940 + }, + { + "epoch": 0.6755586350251169, + "grad_norm": 0.23790582790590165, + "learning_rate": 4.765982356913728e-05, + "loss": 0.2299, + "step": 1950 + }, + { + "epoch": 0.679023038281656, + "grad_norm": 0.254738526683431, + "learning_rate": 4.7617070944404975e-05, + "loss": 0.2277, + "step": 1960 + }, + { + "epoch": 0.6824874415381951, + "grad_norm": 0.2628656210512643, + "learning_rate": 4.757395085574326e-05, + "loss": 0.2297, + "step": 1970 + }, + { + "epoch": 0.6859518447947341, + "grad_norm": 0.20591189751455907, + "learning_rate": 4.7530464003732545e-05, + "loss": 0.2248, + "step": 1980 + }, + { + "epoch": 0.6894162480512732, + "grad_norm": 0.2576351313383852, + "learning_rate": 4.7486611094912146e-05, + "loss": 0.2251, + "step": 1990 + }, + { + "epoch": 0.6928806513078122, + "grad_norm": 0.20856524397248571, + "learning_rate": 4.744239284176876e-05, + "loss": 0.2291, + "step": 2000 + }, + { + "epoch": 0.6963450545643513, + "grad_norm": 0.2186827003230637, + "learning_rate": 4.73978099627249e-05, + "loss": 0.2229, + "step": 2010 + }, + { + "epoch": 0.6998094578208903, + "grad_norm": 0.20914686503222382, + "learning_rate": 4.7352863182127246e-05, + "loss": 0.2206, + "step": 2020 + }, + { + "epoch": 0.7032738610774294, + "grad_norm": 0.22559468853582607, + "learning_rate": 4.730755323023482e-05, + "loss": 0.2319, + "step": 2030 + }, + { + "epoch": 0.7067382643339685, + "grad_norm": 0.23247057053881534, + "learning_rate": 4.72618808432072e-05, + "loss": 0.2261, + "step": 2040 + }, + { + "epoch": 0.7102026675905075, + "grad_norm": 0.22994852903066657, + "learning_rate": 4.7215846763092486e-05, + "loss": 0.2275, + "step": 2050 + }, + { + "epoch": 0.7136670708470466, + "grad_norm": 0.21562072555254103, + "learning_rate": 4.716945173781528e-05, + "loss": 0.2275, + "step": 2060 + }, + { + "epoch": 0.7171314741035857, + "grad_norm": 0.1964162535848221, + "learning_rate": 4.7122696521164564e-05, + "loss": 0.2267, + "step": 2070 + }, + { + "epoch": 0.7205958773601248, + "grad_norm": 0.23405157909799276, + "learning_rate": 4.7075581872781375e-05, + "loss": 0.2293, + "step": 2080 + }, + { + "epoch": 0.7240602806166638, + "grad_norm": 0.22767428517421429, + "learning_rate": 4.7028108558146526e-05, + "loss": 0.2273, + "step": 2090 + }, + { + "epoch": 0.7275246838732028, + "grad_norm": 0.28600372042549504, + "learning_rate": 4.698027734856816e-05, + "loss": 0.2297, + "step": 2100 + }, + { + "epoch": 0.7309890871297419, + "grad_norm": 0.2298031761231605, + "learning_rate": 4.693208902116918e-05, + "loss": 0.2227, + "step": 2110 + }, + { + "epoch": 0.7344534903862809, + "grad_norm": 0.2116599416195417, + "learning_rate": 4.688354435887467e-05, + "loss": 0.2248, + "step": 2120 + }, + { + "epoch": 0.73791789364282, + "grad_norm": 0.20160724544535827, + "learning_rate": 4.683464415039918e-05, + "loss": 0.2197, + "step": 2130 + }, + { + "epoch": 0.7413822968993591, + "grad_norm": 0.22040076084389237, + "learning_rate": 4.678538919023383e-05, + "loss": 0.2306, + "step": 2140 + }, + { + "epoch": 0.7448467001558982, + "grad_norm": 0.24384427587101223, + "learning_rate": 4.673578027863351e-05, + "loss": 0.226, + "step": 2150 + }, + { + "epoch": 0.7483111034124372, + "grad_norm": 0.21129176345098202, + "learning_rate": 4.6685818221603804e-05, + "loss": 0.2298, + "step": 2160 + }, + { + "epoch": 0.7517755066689763, + "grad_norm": 0.2763625585319092, + "learning_rate": 4.663550383088792e-05, + "loss": 0.2253, + "step": 2170 + }, + { + "epoch": 0.7552399099255154, + "grad_norm": 0.21704903419555124, + "learning_rate": 4.6584837923953516e-05, + "loss": 0.2215, + "step": 2180 + }, + { + "epoch": 0.7587043131820544, + "grad_norm": 0.21517162264960232, + "learning_rate": 4.653382132397938e-05, + "loss": 0.2251, + "step": 2190 + }, + { + "epoch": 0.7621687164385934, + "grad_norm": 0.2018666939738909, + "learning_rate": 4.648245485984207e-05, + "loss": 0.2239, + "step": 2200 + }, + { + "epoch": 0.7656331196951325, + "grad_norm": 0.18928657263154086, + "learning_rate": 4.64307393661025e-05, + "loss": 0.2222, + "step": 2210 + }, + { + "epoch": 0.7690975229516716, + "grad_norm": 0.23077452960308834, + "learning_rate": 4.63786756829923e-05, + "loss": 0.2254, + "step": 2220 + }, + { + "epoch": 0.7725619262082106, + "grad_norm": 0.19369514792151565, + "learning_rate": 4.63262646564002e-05, + "loss": 0.2214, + "step": 2230 + }, + { + "epoch": 0.7760263294647497, + "grad_norm": 0.19788506342985074, + "learning_rate": 4.627350713785829e-05, + "loss": 0.2199, + "step": 2240 + }, + { + "epoch": 0.7794907327212888, + "grad_norm": 0.1892817176323716, + "learning_rate": 4.622040398452819e-05, + "loss": 0.2209, + "step": 2250 + }, + { + "epoch": 0.7829551359778278, + "grad_norm": 0.21413217334371104, + "learning_rate": 4.616695605918712e-05, + "loss": 0.2259, + "step": 2260 + }, + { + "epoch": 0.7864195392343669, + "grad_norm": 0.2516901660820717, + "learning_rate": 4.6113164230213844e-05, + "loss": 0.2224, + "step": 2270 + }, + { + "epoch": 0.789883942490906, + "grad_norm": 0.23275016126252404, + "learning_rate": 4.605902937157465e-05, + "loss": 0.2269, + "step": 2280 + }, + { + "epoch": 0.793348345747445, + "grad_norm": 0.2121809202061214, + "learning_rate": 4.600455236280905e-05, + "loss": 0.2231, + "step": 2290 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.19630769074682533, + "learning_rate": 4.5949734089015544e-05, + "loss": 0.2207, + "step": 2300 + }, + { + "epoch": 0.8002771522605231, + "grad_norm": 0.21355754308769215, + "learning_rate": 4.589457544083725e-05, + "loss": 0.224, + "step": 2310 + }, + { + "epoch": 0.8037415555170622, + "grad_norm": 0.21050957794732314, + "learning_rate": 4.5839077314447385e-05, + "loss": 0.2238, + "step": 2320 + }, + { + "epoch": 0.8072059587736012, + "grad_norm": 0.22620144254655492, + "learning_rate": 4.578324061153477e-05, + "loss": 0.2252, + "step": 2330 + }, + { + "epoch": 0.8106703620301403, + "grad_norm": 0.19924326647557386, + "learning_rate": 4.5727066239289117e-05, + "loss": 0.2239, + "step": 2340 + }, + { + "epoch": 0.8141347652866794, + "grad_norm": 0.22935473461207423, + "learning_rate": 4.5670555110386316e-05, + "loss": 0.222, + "step": 2350 + }, + { + "epoch": 0.8175991685432185, + "grad_norm": 0.2651712827400202, + "learning_rate": 4.561370814297363e-05, + "loss": 0.2225, + "step": 2360 + }, + { + "epoch": 0.8210635717997575, + "grad_norm": 0.2182312756063536, + "learning_rate": 4.555652626065473e-05, + "loss": 0.2238, + "step": 2370 + }, + { + "epoch": 0.8245279750562966, + "grad_norm": 0.20447815035283293, + "learning_rate": 4.549901039247474e-05, + "loss": 0.2212, + "step": 2380 + }, + { + "epoch": 0.8279923783128356, + "grad_norm": 0.17858513767092687, + "learning_rate": 4.544116147290509e-05, + "loss": 0.223, + "step": 2390 + }, + { + "epoch": 0.8314567815693746, + "grad_norm": 0.21126944513637871, + "learning_rate": 4.5382980441828385e-05, + "loss": 0.2253, + "step": 2400 + }, + { + "epoch": 0.8349211848259137, + "grad_norm": 0.2343427045564657, + "learning_rate": 4.5324468244523086e-05, + "loss": 0.2176, + "step": 2410 + }, + { + "epoch": 0.8383855880824528, + "grad_norm": 0.19348098291090032, + "learning_rate": 4.52656258316482e-05, + "loss": 0.2171, + "step": 2420 + }, + { + "epoch": 0.8418499913389919, + "grad_norm": 0.18351830211981723, + "learning_rate": 4.5206454159227783e-05, + "loss": 0.2209, + "step": 2430 + }, + { + "epoch": 0.8453143945955309, + "grad_norm": 0.21496767724862215, + "learning_rate": 4.514695418863547e-05, + "loss": 0.2209, + "step": 2440 + }, + { + "epoch": 0.84877879785207, + "grad_norm": 0.19946299406668855, + "learning_rate": 4.508712688657879e-05, + "loss": 0.2202, + "step": 2450 + }, + { + "epoch": 0.8522432011086091, + "grad_norm": 0.19377744464389568, + "learning_rate": 4.50269732250835e-05, + "loss": 0.2201, + "step": 2460 + }, + { + "epoch": 0.8557076043651481, + "grad_norm": 0.2366384856135975, + "learning_rate": 4.496649418147778e-05, + "loss": 0.2149, + "step": 2470 + }, + { + "epoch": 0.8591720076216872, + "grad_norm": 0.21068619938805416, + "learning_rate": 4.490569073837636e-05, + "loss": 0.2184, + "step": 2480 + }, + { + "epoch": 0.8626364108782262, + "grad_norm": 0.2315187829791829, + "learning_rate": 4.4844563883664554e-05, + "loss": 0.222, + "step": 2490 + }, + { + "epoch": 0.8661008141347653, + "grad_norm": 0.18649456076146348, + "learning_rate": 4.478311461048219e-05, + "loss": 0.2209, + "step": 2500 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.23315042553004378, + "learning_rate": 4.472134391720751e-05, + "loss": 0.2224, + "step": 2510 + }, + { + "epoch": 0.8730296206478434, + "grad_norm": 0.1951271064725934, + "learning_rate": 4.465925280744094e-05, + "loss": 0.2239, + "step": 2520 + }, + { + "epoch": 0.8764940239043825, + "grad_norm": 0.1913261231013653, + "learning_rate": 4.459684228998873e-05, + "loss": 0.2179, + "step": 2530 + }, + { + "epoch": 0.8799584271609215, + "grad_norm": 0.19492329842413578, + "learning_rate": 4.453411337884666e-05, + "loss": 0.2162, + "step": 2540 + }, + { + "epoch": 0.8834228304174606, + "grad_norm": 0.2055009502555625, + "learning_rate": 4.4471067093183475e-05, + "loss": 0.2165, + "step": 2550 + }, + { + "epoch": 0.8868872336739997, + "grad_norm": 0.19549219766107842, + "learning_rate": 4.4407704457324394e-05, + "loss": 0.2158, + "step": 2560 + }, + { + "epoch": 0.8903516369305388, + "grad_norm": 0.1850796577943829, + "learning_rate": 4.4344026500734415e-05, + "loss": 0.2172, + "step": 2570 + }, + { + "epoch": 0.8938160401870777, + "grad_norm": 0.19643486062098017, + "learning_rate": 4.428003425800164e-05, + "loss": 0.2208, + "step": 2580 + }, + { + "epoch": 0.8972804434436168, + "grad_norm": 0.21099302341164966, + "learning_rate": 4.4215728768820406e-05, + "loss": 0.2194, + "step": 2590 + }, + { + "epoch": 0.9007448467001559, + "grad_norm": 0.24764922928675714, + "learning_rate": 4.415111107797445e-05, + "loss": 0.2193, + "step": 2600 + }, + { + "epoch": 0.9042092499566949, + "grad_norm": 0.22064705519176767, + "learning_rate": 4.4086182235319904e-05, + "loss": 0.2148, + "step": 2610 + }, + { + "epoch": 0.907673653213234, + "grad_norm": 0.2131199868612154, + "learning_rate": 4.402094329576825e-05, + "loss": 0.2233, + "step": 2620 + }, + { + "epoch": 0.9111380564697731, + "grad_norm": 0.2184325870922078, + "learning_rate": 4.395539531926914e-05, + "loss": 0.2227, + "step": 2630 + }, + { + "epoch": 0.9146024597263122, + "grad_norm": 0.20257196030375807, + "learning_rate": 4.388953937079327e-05, + "loss": 0.2145, + "step": 2640 + }, + { + "epoch": 0.9180668629828512, + "grad_norm": 0.2052608168052879, + "learning_rate": 4.3823376520314964e-05, + "loss": 0.2176, + "step": 2650 + }, + { + "epoch": 0.9215312662393903, + "grad_norm": 0.20116038778213127, + "learning_rate": 4.3756907842794855e-05, + "loss": 0.2209, + "step": 2660 + }, + { + "epoch": 0.9249956694959294, + "grad_norm": 0.2094115752703769, + "learning_rate": 4.369013441816242e-05, + "loss": 0.2186, + "step": 2670 + }, + { + "epoch": 0.9284600727524683, + "grad_norm": 0.21116605286596796, + "learning_rate": 4.362305733129841e-05, + "loss": 0.2177, + "step": 2680 + }, + { + "epoch": 0.9319244760090074, + "grad_norm": 0.2355250090651405, + "learning_rate": 4.355567767201725e-05, + "loss": 0.216, + "step": 2690 + }, + { + "epoch": 0.9353888792655465, + "grad_norm": 0.21712887070689368, + "learning_rate": 4.3487996535049296e-05, + "loss": 0.2194, + "step": 2700 + }, + { + "epoch": 0.9388532825220856, + "grad_norm": 0.21715912584157251, + "learning_rate": 4.342001502002309e-05, + "loss": 0.2187, + "step": 2710 + }, + { + "epoch": 0.9423176857786246, + "grad_norm": 0.23840653028574527, + "learning_rate": 4.3351734231447436e-05, + "loss": 0.2127, + "step": 2720 + }, + { + "epoch": 0.9457820890351637, + "grad_norm": 0.18655391680385583, + "learning_rate": 4.328315527869357e-05, + "loss": 0.2213, + "step": 2730 + }, + { + "epoch": 0.9492464922917028, + "grad_norm": 0.20595739324476314, + "learning_rate": 4.321427927597697e-05, + "loss": 0.2172, + "step": 2740 + }, + { + "epoch": 0.9527108955482418, + "grad_norm": 0.16761465601551198, + "learning_rate": 4.31451073423394e-05, + "loss": 0.2153, + "step": 2750 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.18621316019170056, + "learning_rate": 4.3075640601630664e-05, + "loss": 0.2156, + "step": 2760 + }, + { + "epoch": 0.95963970206132, + "grad_norm": 0.17305717604504345, + "learning_rate": 4.300588018249033e-05, + "loss": 0.2129, + "step": 2770 + }, + { + "epoch": 0.963104105317859, + "grad_norm": 0.18044851481990093, + "learning_rate": 4.2935827218329434e-05, + "loss": 0.2154, + "step": 2780 + }, + { + "epoch": 0.966568508574398, + "grad_norm": 0.20518001148705736, + "learning_rate": 4.2865482847312043e-05, + "loss": 0.2132, + "step": 2790 + }, + { + "epoch": 0.9700329118309371, + "grad_norm": 0.3349155151219487, + "learning_rate": 4.279484821233678e-05, + "loss": 0.2202, + "step": 2800 + }, + { + "epoch": 0.9734973150874762, + "grad_norm": 0.18282251380283435, + "learning_rate": 4.2723924461018225e-05, + "loss": 0.2186, + "step": 2810 + }, + { + "epoch": 0.9769617183440152, + "grad_norm": 0.2173702476740624, + "learning_rate": 4.265271274566829e-05, + "loss": 0.22, + "step": 2820 + }, + { + "epoch": 0.9804261216005543, + "grad_norm": 0.19184429102715528, + "learning_rate": 4.2581214223277495e-05, + "loss": 0.2077, + "step": 2830 + }, + { + "epoch": 0.9838905248570934, + "grad_norm": 0.20991477426484062, + "learning_rate": 4.250943005549618e-05, + "loss": 0.2208, + "step": 2840 + }, + { + "epoch": 0.9873549281136325, + "grad_norm": 0.2285331325208064, + "learning_rate": 4.2437361408615614e-05, + "loss": 0.2151, + "step": 2850 + }, + { + "epoch": 0.9908193313701715, + "grad_norm": 0.2148804733407693, + "learning_rate": 4.2365009453549046e-05, + "loss": 0.2216, + "step": 2860 + }, + { + "epoch": 0.9942837346267106, + "grad_norm": 0.23671514755116507, + "learning_rate": 4.22923753658127e-05, + "loss": 0.2197, + "step": 2870 + }, + { + "epoch": 0.9977481378832496, + "grad_norm": 0.21988104341412565, + "learning_rate": 4.221946032550665e-05, + "loss": 0.2143, + "step": 2880 + }, + { + "epoch": 1.0010393209769617, + "grad_norm": 0.20204127186990958, + "learning_rate": 4.214626551729569e-05, + "loss": 0.2095, + "step": 2890 + }, + { + "epoch": 1.0045037242335009, + "grad_norm": 0.19674007109910524, + "learning_rate": 4.207279213039003e-05, + "loss": 0.192, + "step": 2900 + }, + { + "epoch": 1.0079681274900398, + "grad_norm": 0.18771217665260514, + "learning_rate": 4.199904135852598e-05, + "loss": 0.1936, + "step": 2910 + }, + { + "epoch": 1.0114325307465788, + "grad_norm": 0.2045135750011863, + "learning_rate": 4.192501439994664e-05, + "loss": 0.1942, + "step": 2920 + }, + { + "epoch": 1.014896934003118, + "grad_norm": 0.18370214635615664, + "learning_rate": 4.185071245738231e-05, + "loss": 0.1891, + "step": 2930 + }, + { + "epoch": 1.018361337259657, + "grad_norm": 0.18667193497723014, + "learning_rate": 4.177613673803106e-05, + "loss": 0.1944, + "step": 2940 + }, + { + "epoch": 1.0218257405161961, + "grad_norm": 0.1791392276679071, + "learning_rate": 4.170128845353902e-05, + "loss": 0.189, + "step": 2950 + }, + { + "epoch": 1.025290143772735, + "grad_norm": 0.16725027485938548, + "learning_rate": 4.162616881998075e-05, + "loss": 0.1985, + "step": 2960 + }, + { + "epoch": 1.0287545470292743, + "grad_norm": 0.2029662849352073, + "learning_rate": 4.155077905783949e-05, + "loss": 0.1938, + "step": 2970 + }, + { + "epoch": 1.0322189502858132, + "grad_norm": 0.21676480591029273, + "learning_rate": 4.14751203919873e-05, + "loss": 0.1938, + "step": 2980 + }, + { + "epoch": 1.0356833535423524, + "grad_norm": 0.18194277030338057, + "learning_rate": 4.1399194051665146e-05, + "loss": 0.1943, + "step": 2990 + }, + { + "epoch": 1.0391477567988914, + "grad_norm": 0.22187757323234317, + "learning_rate": 4.1323001270463e-05, + "loss": 0.1956, + "step": 3000 + }, + { + "epoch": 1.0426121600554306, + "grad_norm": 0.17994994526393276, + "learning_rate": 4.1246543286299714e-05, + "loss": 0.196, + "step": 3010 + }, + { + "epoch": 1.0460765633119695, + "grad_norm": 0.2006941244750973, + "learning_rate": 4.1169821341402956e-05, + "loss": 0.1928, + "step": 3020 + }, + { + "epoch": 1.0495409665685085, + "grad_norm": 0.18127266652072002, + "learning_rate": 4.109283668228903e-05, + "loss": 0.1883, + "step": 3030 + }, + { + "epoch": 1.0530053698250477, + "grad_norm": 0.17064172550512371, + "learning_rate": 4.101559055974258e-05, + "loss": 0.1944, + "step": 3040 + }, + { + "epoch": 1.0564697730815866, + "grad_norm": 0.17513257115164024, + "learning_rate": 4.09380842287963e-05, + "loss": 0.1924, + "step": 3050 + }, + { + "epoch": 1.0599341763381258, + "grad_norm": 0.17679873359445633, + "learning_rate": 4.0860318948710574e-05, + "loss": 0.1967, + "step": 3060 + }, + { + "epoch": 1.0633985795946648, + "grad_norm": 0.17716657233908994, + "learning_rate": 4.0782295982952954e-05, + "loss": 0.1904, + "step": 3070 + }, + { + "epoch": 1.066862982851204, + "grad_norm": 0.20587418971229102, + "learning_rate": 4.0704016599177655e-05, + "loss": 0.1944, + "step": 3080 + }, + { + "epoch": 1.070327386107743, + "grad_norm": 0.19036738753494378, + "learning_rate": 4.062548206920499e-05, + "loss": 0.1927, + "step": 3090 + }, + { + "epoch": 1.073791789364282, + "grad_norm": 0.19921829291538054, + "learning_rate": 4.054669366900066e-05, + "loss": 0.1917, + "step": 3100 + }, + { + "epoch": 1.077256192620821, + "grad_norm": 0.20317583824797536, + "learning_rate": 4.0467652678655056e-05, + "loss": 0.1914, + "step": 3110 + }, + { + "epoch": 1.08072059587736, + "grad_norm": 0.1848009075000212, + "learning_rate": 4.038836038236245e-05, + "loss": 0.1868, + "step": 3120 + }, + { + "epoch": 1.0841849991338992, + "grad_norm": 0.2082390217876895, + "learning_rate": 4.0308818068400125e-05, + "loss": 0.1897, + "step": 3130 + }, + { + "epoch": 1.0876494023904382, + "grad_norm": 0.23521100374901133, + "learning_rate": 4.022902702910745e-05, + "loss": 0.1849, + "step": 3140 + }, + { + "epoch": 1.0911138056469774, + "grad_norm": 0.18191505448762496, + "learning_rate": 4.014898856086489e-05, + "loss": 0.1909, + "step": 3150 + }, + { + "epoch": 1.0945782089035163, + "grad_norm": 0.19423731854983112, + "learning_rate": 4.006870396407294e-05, + "loss": 0.1925, + "step": 3160 + }, + { + "epoch": 1.0980426121600555, + "grad_norm": 0.17415753971247672, + "learning_rate": 3.998817454313096e-05, + "loss": 0.1961, + "step": 3170 + }, + { + "epoch": 1.1015070154165945, + "grad_norm": 0.17868260694918264, + "learning_rate": 3.9907401606416054e-05, + "loss": 0.1984, + "step": 3180 + }, + { + "epoch": 1.1049714186731336, + "grad_norm": 0.16583150690918594, + "learning_rate": 3.9826386466261765e-05, + "loss": 0.1948, + "step": 3190 + }, + { + "epoch": 1.1084358219296726, + "grad_norm": 0.19154779416193093, + "learning_rate": 3.9745130438936744e-05, + "loss": 0.187, + "step": 3200 + }, + { + "epoch": 1.1119002251862118, + "grad_norm": 0.18741249451292638, + "learning_rate": 3.96636348446234e-05, + "loss": 0.1907, + "step": 3210 + }, + { + "epoch": 1.1153646284427507, + "grad_norm": 0.18322276372490287, + "learning_rate": 3.958190100739643e-05, + "loss": 0.1872, + "step": 3220 + }, + { + "epoch": 1.1188290316992897, + "grad_norm": 0.18729813363303588, + "learning_rate": 3.94999302552013e-05, + "loss": 0.1942, + "step": 3230 + }, + { + "epoch": 1.122293434955829, + "grad_norm": 0.19705653891217498, + "learning_rate": 3.941772391983271e-05, + "loss": 0.1912, + "step": 3240 + }, + { + "epoch": 1.1257578382123679, + "grad_norm": 0.17707126700086195, + "learning_rate": 3.9335283336912873e-05, + "loss": 0.192, + "step": 3250 + }, + { + "epoch": 1.129222241468907, + "grad_norm": 0.17283665381012417, + "learning_rate": 3.925260984586991e-05, + "loss": 0.1904, + "step": 3260 + }, + { + "epoch": 1.132686644725446, + "grad_norm": 0.18010720437594213, + "learning_rate": 3.916970478991604e-05, + "loss": 0.1943, + "step": 3270 + }, + { + "epoch": 1.1361510479819852, + "grad_norm": 0.15919684820270277, + "learning_rate": 3.908656951602574e-05, + "loss": 0.1897, + "step": 3280 + }, + { + "epoch": 1.1396154512385241, + "grad_norm": 0.1931598714015755, + "learning_rate": 3.9003205374913906e-05, + "loss": 0.1901, + "step": 3290 + }, + { + "epoch": 1.1430798544950633, + "grad_norm": 0.17545621617123003, + "learning_rate": 3.891961372101387e-05, + "loss": 0.1869, + "step": 3300 + }, + { + "epoch": 1.1465442577516023, + "grad_norm": 0.17341255082466775, + "learning_rate": 3.883579591245542e-05, + "loss": 0.1899, + "step": 3310 + }, + { + "epoch": 1.1500086610081413, + "grad_norm": 0.16204232239326744, + "learning_rate": 3.8751753311042704e-05, + "loss": 0.1897, + "step": 3320 + }, + { + "epoch": 1.1534730642646804, + "grad_norm": 0.17553623491312426, + "learning_rate": 3.8667487282232144e-05, + "loss": 0.187, + "step": 3330 + }, + { + "epoch": 1.1569374675212194, + "grad_norm": 0.1774831508823347, + "learning_rate": 3.8582999195110215e-05, + "loss": 0.1943, + "step": 3340 + }, + { + "epoch": 1.1604018707777586, + "grad_norm": 0.17946778025767848, + "learning_rate": 3.849829042237123e-05, + "loss": 0.1929, + "step": 3350 + }, + { + "epoch": 1.1638662740342975, + "grad_norm": 0.17173672286352726, + "learning_rate": 3.841336234029501e-05, + "loss": 0.195, + "step": 3360 + }, + { + "epoch": 1.1673306772908367, + "grad_norm": 0.17994745465860756, + "learning_rate": 3.832821632872454e-05, + "loss": 0.19, + "step": 3370 + }, + { + "epoch": 1.1707950805473757, + "grad_norm": 0.1850088732651661, + "learning_rate": 3.8242853771043566e-05, + "loss": 0.1957, + "step": 3380 + }, + { + "epoch": 1.1742594838039149, + "grad_norm": 0.15908666104639177, + "learning_rate": 3.815727605415406e-05, + "loss": 0.1915, + "step": 3390 + }, + { + "epoch": 1.1777238870604538, + "grad_norm": 0.18156692156906576, + "learning_rate": 3.807148456845378e-05, + "loss": 0.188, + "step": 3400 + }, + { + "epoch": 1.1811882903169928, + "grad_norm": 0.17113960858415064, + "learning_rate": 3.798548070781357e-05, + "loss": 0.1893, + "step": 3410 + }, + { + "epoch": 1.184652693573532, + "grad_norm": 0.18236211522160944, + "learning_rate": 3.789926586955484e-05, + "loss": 0.1859, + "step": 3420 + }, + { + "epoch": 1.188117096830071, + "grad_norm": 0.16797194145110902, + "learning_rate": 3.7812841454426715e-05, + "loss": 0.1901, + "step": 3430 + }, + { + "epoch": 1.1915815000866101, + "grad_norm": 0.1988755262539599, + "learning_rate": 3.772620886658342e-05, + "loss": 0.1942, + "step": 3440 + }, + { + "epoch": 1.195045903343149, + "grad_norm": 0.18276504002195174, + "learning_rate": 3.7639369513561374e-05, + "loss": 0.1901, + "step": 3450 + }, + { + "epoch": 1.1985103065996883, + "grad_norm": 0.19420509251762302, + "learning_rate": 3.7552324806256356e-05, + "loss": 0.1893, + "step": 3460 + }, + { + "epoch": 1.2019747098562272, + "grad_norm": 0.1856528154175482, + "learning_rate": 3.7465076158900565e-05, + "loss": 0.1926, + "step": 3470 + }, + { + "epoch": 1.2054391131127664, + "grad_norm": 0.15954596519502212, + "learning_rate": 3.737762498903967e-05, + "loss": 0.1928, + "step": 3480 + }, + { + "epoch": 1.2089035163693054, + "grad_norm": 0.16035794848815701, + "learning_rate": 3.728997271750975e-05, + "loss": 0.1911, + "step": 3490 + }, + { + "epoch": 1.2123679196258443, + "grad_norm": 0.1860891805383301, + "learning_rate": 3.720212076841424e-05, + "loss": 0.1906, + "step": 3500 + }, + { + "epoch": 1.2158323228823835, + "grad_norm": 0.18968549769475154, + "learning_rate": 3.7114070569100745e-05, + "loss": 0.1915, + "step": 3510 + }, + { + "epoch": 1.2192967261389225, + "grad_norm": 0.18632674935712812, + "learning_rate": 3.70258235501379e-05, + "loss": 0.2005, + "step": 3520 + }, + { + "epoch": 1.2227611293954617, + "grad_norm": 0.17630114385410808, + "learning_rate": 3.693738114529211e-05, + "loss": 0.1932, + "step": 3530 + }, + { + "epoch": 1.2262255326520006, + "grad_norm": 0.17065976632694624, + "learning_rate": 3.6848744791504244e-05, + "loss": 0.1924, + "step": 3540 + }, + { + "epoch": 1.2296899359085398, + "grad_norm": 0.16123026277843103, + "learning_rate": 3.675991592886629e-05, + "loss": 0.1921, + "step": 3550 + }, + { + "epoch": 1.2331543391650788, + "grad_norm": 0.15903672080165102, + "learning_rate": 3.667089600059799e-05, + "loss": 0.1872, + "step": 3560 + }, + { + "epoch": 1.236618742421618, + "grad_norm": 0.17243581096797103, + "learning_rate": 3.658168645302333e-05, + "loss": 0.1933, + "step": 3570 + }, + { + "epoch": 1.240083145678157, + "grad_norm": 0.18100700093073868, + "learning_rate": 3.6492288735547104e-05, + "loss": 0.1951, + "step": 3580 + }, + { + "epoch": 1.2435475489346959, + "grad_norm": 0.17490828663332014, + "learning_rate": 3.640270430063133e-05, + "loss": 0.1914, + "step": 3590 + }, + { + "epoch": 1.247011952191235, + "grad_norm": 0.15940099515313444, + "learning_rate": 3.6312934603771674e-05, + "loss": 0.1894, + "step": 3600 + }, + { + "epoch": 1.2504763554477742, + "grad_norm": 0.18052826346513526, + "learning_rate": 3.622298110347377e-05, + "loss": 0.1891, + "step": 3610 + }, + { + "epoch": 1.2539407587043132, + "grad_norm": 0.16612752065480388, + "learning_rate": 3.613284526122954e-05, + "loss": 0.1908, + "step": 3620 + }, + { + "epoch": 1.2574051619608522, + "grad_norm": 0.18743922356428885, + "learning_rate": 3.604252854149347e-05, + "loss": 0.1883, + "step": 3630 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.181478054238508, + "learning_rate": 3.595203241165878e-05, + "loss": 0.1878, + "step": 3640 + }, + { + "epoch": 1.2643339684739303, + "grad_norm": 0.1804892389398887, + "learning_rate": 3.586135834203362e-05, + "loss": 0.1893, + "step": 3650 + }, + { + "epoch": 1.2677983717304695, + "grad_norm": 0.1741226930201246, + "learning_rate": 3.5770507805817135e-05, + "loss": 0.1912, + "step": 3660 + }, + { + "epoch": 1.2712627749870085, + "grad_norm": 0.16334377723582452, + "learning_rate": 3.5679482279075584e-05, + "loss": 0.1892, + "step": 3670 + }, + { + "epoch": 1.2747271782435474, + "grad_norm": 0.17047126134525575, + "learning_rate": 3.558828324071831e-05, + "loss": 0.1907, + "step": 3680 + }, + { + "epoch": 1.2781915815000866, + "grad_norm": 0.17298916476177462, + "learning_rate": 3.549691217247375e-05, + "loss": 0.1906, + "step": 3690 + }, + { + "epoch": 1.2816559847566258, + "grad_norm": 0.15717399662492984, + "learning_rate": 3.540537055886533e-05, + "loss": 0.1934, + "step": 3700 + }, + { + "epoch": 1.2851203880131647, + "grad_norm": 0.17584831578937637, + "learning_rate": 3.531365988718736e-05, + "loss": 0.1851, + "step": 3710 + }, + { + "epoch": 1.2885847912697037, + "grad_norm": 0.16205306891044194, + "learning_rate": 3.522178164748089e-05, + "loss": 0.1861, + "step": 3720 + }, + { + "epoch": 1.292049194526243, + "grad_norm": 0.15057670492815758, + "learning_rate": 3.5129737332509456e-05, + "loss": 0.1906, + "step": 3730 + }, + { + "epoch": 1.2955135977827819, + "grad_norm": 0.15822993127615093, + "learning_rate": 3.503752843773486e-05, + "loss": 0.1838, + "step": 3740 + }, + { + "epoch": 1.298978001039321, + "grad_norm": 0.15864785027235162, + "learning_rate": 3.4945156461292854e-05, + "loss": 0.1952, + "step": 3750 + }, + { + "epoch": 1.30244240429586, + "grad_norm": 0.18227602995981887, + "learning_rate": 3.485262290396883e-05, + "loss": 0.1898, + "step": 3760 + }, + { + "epoch": 1.305906807552399, + "grad_norm": 0.16200311831559636, + "learning_rate": 3.475992926917341e-05, + "loss": 0.1929, + "step": 3770 + }, + { + "epoch": 1.3093712108089381, + "grad_norm": 0.15259911553775118, + "learning_rate": 3.4667077062918e-05, + "loss": 0.1872, + "step": 3780 + }, + { + "epoch": 1.3128356140654773, + "grad_norm": 0.18125600690337332, + "learning_rate": 3.457406779379039e-05, + "loss": 0.1925, + "step": 3790 + }, + { + "epoch": 1.3163000173220163, + "grad_norm": 0.192188947967991, + "learning_rate": 3.448090297293016e-05, + "loss": 0.1871, + "step": 3800 + }, + { + "epoch": 1.3197644205785553, + "grad_norm": 0.17328021183720274, + "learning_rate": 3.438758411400421e-05, + "loss": 0.189, + "step": 3810 + }, + { + "epoch": 1.3232288238350944, + "grad_norm": 0.16788815861788564, + "learning_rate": 3.4294112733182084e-05, + "loss": 0.1916, + "step": 3820 + }, + { + "epoch": 1.3266932270916334, + "grad_norm": 0.15688398565420658, + "learning_rate": 3.420049034911139e-05, + "loss": 0.1882, + "step": 3830 + }, + { + "epoch": 1.3301576303481726, + "grad_norm": 0.1714798078013055, + "learning_rate": 3.410671848289315e-05, + "loss": 0.1882, + "step": 3840 + }, + { + "epoch": 1.3336220336047115, + "grad_norm": 0.16309203688542842, + "learning_rate": 3.401279865805702e-05, + "loss": 0.1914, + "step": 3850 + }, + { + "epoch": 1.3370864368612507, + "grad_norm": 0.15748111773147097, + "learning_rate": 3.391873240053656e-05, + "loss": 0.1866, + "step": 3860 + }, + { + "epoch": 1.3405508401177897, + "grad_norm": 0.1820518471519293, + "learning_rate": 3.382452123864448e-05, + "loss": 0.1865, + "step": 3870 + }, + { + "epoch": 1.3440152433743289, + "grad_norm": 0.15772861765330992, + "learning_rate": 3.373016670304774e-05, + "loss": 0.1928, + "step": 3880 + }, + { + "epoch": 1.3474796466308678, + "grad_norm": 0.15938918092133955, + "learning_rate": 3.3635670326742755e-05, + "loss": 0.1913, + "step": 3890 + }, + { + "epoch": 1.3509440498874068, + "grad_norm": 0.1721339531229463, + "learning_rate": 3.354103364503045e-05, + "loss": 0.1907, + "step": 3900 + }, + { + "epoch": 1.354408453143946, + "grad_norm": 0.15410885348307193, + "learning_rate": 3.3446258195491305e-05, + "loss": 0.1879, + "step": 3910 + }, + { + "epoch": 1.357872856400485, + "grad_norm": 0.16165063244129996, + "learning_rate": 3.3351345517960386e-05, + "loss": 0.1898, + "step": 3920 + }, + { + "epoch": 1.3613372596570241, + "grad_norm": 0.16569690135925177, + "learning_rate": 3.325629715450235e-05, + "loss": 0.1873, + "step": 3930 + }, + { + "epoch": 1.364801662913563, + "grad_norm": 0.16487442057352042, + "learning_rate": 3.3161114649386335e-05, + "loss": 0.1888, + "step": 3940 + }, + { + "epoch": 1.3682660661701023, + "grad_norm": 0.17731395997046875, + "learning_rate": 3.306579954906095e-05, + "loss": 0.1881, + "step": 3950 + }, + { + "epoch": 1.3717304694266412, + "grad_norm": 0.18039964361054459, + "learning_rate": 3.2970353402129065e-05, + "loss": 0.1873, + "step": 3960 + }, + { + "epoch": 1.3751948726831804, + "grad_norm": 0.17793927736310747, + "learning_rate": 3.287477775932271e-05, + "loss": 0.1912, + "step": 3970 + }, + { + "epoch": 1.3786592759397194, + "grad_norm": 0.1790070950599728, + "learning_rate": 3.2779074173477845e-05, + "loss": 0.1837, + "step": 3980 + }, + { + "epoch": 1.3821236791962583, + "grad_norm": 0.16536456635439983, + "learning_rate": 3.2683244199509164e-05, + "loss": 0.1822, + "step": 3990 + }, + { + "epoch": 1.3855880824527975, + "grad_norm": 0.17630304243331663, + "learning_rate": 3.258728939438479e-05, + "loss": 0.1875, + "step": 4000 + }, + { + "epoch": 1.3890524857093367, + "grad_norm": 0.17420428081742365, + "learning_rate": 3.249121131710102e-05, + "loss": 0.1883, + "step": 4010 + }, + { + "epoch": 1.3925168889658757, + "grad_norm": 0.17543770138618733, + "learning_rate": 3.239501152865698e-05, + "loss": 0.1871, + "step": 4020 + }, + { + "epoch": 1.3959812922224146, + "grad_norm": 0.1592954498035186, + "learning_rate": 3.229869159202925e-05, + "loss": 0.1845, + "step": 4030 + }, + { + "epoch": 1.3994456954789538, + "grad_norm": 0.1969202150910442, + "learning_rate": 3.2202253072146485e-05, + "loss": 0.1899, + "step": 4040 + }, + { + "epoch": 1.4029100987354928, + "grad_norm": 0.16611324585892465, + "learning_rate": 3.2105697535863974e-05, + "loss": 0.1884, + "step": 4050 + }, + { + "epoch": 1.406374501992032, + "grad_norm": 0.16159007802640257, + "learning_rate": 3.200902655193822e-05, + "loss": 0.185, + "step": 4060 + }, + { + "epoch": 1.409838905248571, + "grad_norm": 0.1580252831987218, + "learning_rate": 3.1912241691001396e-05, + "loss": 0.1883, + "step": 4070 + }, + { + "epoch": 1.4133033085051099, + "grad_norm": 0.17023457596293415, + "learning_rate": 3.181534452553589e-05, + "loss": 0.1864, + "step": 4080 + }, + { + "epoch": 1.416767711761649, + "grad_norm": 0.16337679260042998, + "learning_rate": 3.1718336629848674e-05, + "loss": 0.1877, + "step": 4090 + }, + { + "epoch": 1.4202321150181882, + "grad_norm": 0.1899260224945417, + "learning_rate": 3.162121958004584e-05, + "loss": 0.188, + "step": 4100 + }, + { + "epoch": 1.4236965182747272, + "grad_norm": 0.1901828231396822, + "learning_rate": 3.1523994954006875e-05, + "loss": 0.1868, + "step": 4110 + }, + { + "epoch": 1.4271609215312662, + "grad_norm": 0.1791821381475306, + "learning_rate": 3.142666433135911e-05, + "loss": 0.1889, + "step": 4120 + }, + { + "epoch": 1.4306253247878054, + "grad_norm": 0.16547869947285684, + "learning_rate": 3.132922929345199e-05, + "loss": 0.1884, + "step": 4130 + }, + { + "epoch": 1.4340897280443443, + "grad_norm": 0.1549076050063216, + "learning_rate": 3.123169142333145e-05, + "loss": 0.1884, + "step": 4140 + }, + { + "epoch": 1.4375541313008835, + "grad_norm": 0.15295140029133414, + "learning_rate": 3.1134052305714146e-05, + "loss": 0.1852, + "step": 4150 + }, + { + "epoch": 1.4410185345574225, + "grad_norm": 0.16777477030335766, + "learning_rate": 3.1036313526961716e-05, + "loss": 0.1849, + "step": 4160 + }, + { + "epoch": 1.4444829378139614, + "grad_norm": 0.14671702288800448, + "learning_rate": 3.093847667505502e-05, + "loss": 0.1918, + "step": 4170 + }, + { + "epoch": 1.4479473410705006, + "grad_norm": 0.15525539418252593, + "learning_rate": 3.084054333956833e-05, + "loss": 0.1866, + "step": 4180 + }, + { + "epoch": 1.4514117443270398, + "grad_norm": 0.16462043975916238, + "learning_rate": 3.0742515111643496e-05, + "loss": 0.1863, + "step": 4190 + }, + { + "epoch": 1.4548761475835787, + "grad_norm": 0.15447538862682672, + "learning_rate": 3.064439358396412e-05, + "loss": 0.1871, + "step": 4200 + }, + { + "epoch": 1.4583405508401177, + "grad_norm": 0.19391772756705172, + "learning_rate": 3.0546180350729646e-05, + "loss": 0.1907, + "step": 4210 + }, + { + "epoch": 1.461804954096657, + "grad_norm": 0.16839531563433688, + "learning_rate": 3.0447877007629494e-05, + "loss": 0.1872, + "step": 4220 + }, + { + "epoch": 1.4652693573531959, + "grad_norm": 0.1667638250714929, + "learning_rate": 3.0349485151817104e-05, + "loss": 0.1891, + "step": 4230 + }, + { + "epoch": 1.468733760609735, + "grad_norm": 0.15657114860167654, + "learning_rate": 3.0251006381884e-05, + "loss": 0.1888, + "step": 4240 + }, + { + "epoch": 1.472198163866274, + "grad_norm": 0.1616644177415703, + "learning_rate": 3.0152442297833817e-05, + "loss": 0.1866, + "step": 4250 + }, + { + "epoch": 1.475662567122813, + "grad_norm": 0.16936489968998167, + "learning_rate": 3.005379450105631e-05, + "loss": 0.1861, + "step": 4260 + }, + { + "epoch": 1.4791269703793521, + "grad_norm": 0.17783057212322842, + "learning_rate": 2.995506459430133e-05, + "loss": 0.1921, + "step": 4270 + }, + { + "epoch": 1.4825913736358913, + "grad_norm": 0.18253028978511793, + "learning_rate": 2.9856254181652777e-05, + "loss": 0.1931, + "step": 4280 + }, + { + "epoch": 1.4860557768924303, + "grad_norm": 0.17108679951308503, + "learning_rate": 2.9757364868502558e-05, + "loss": 0.187, + "step": 4290 + }, + { + "epoch": 1.4895201801489693, + "grad_norm": 0.17127968110442396, + "learning_rate": 2.9658398261524477e-05, + "loss": 0.1856, + "step": 4300 + }, + { + "epoch": 1.4929845834055084, + "grad_norm": 0.15243800455802872, + "learning_rate": 2.9559355968648163e-05, + "loss": 0.1878, + "step": 4310 + }, + { + "epoch": 1.4964489866620474, + "grad_norm": 0.16208265788754178, + "learning_rate": 2.9460239599032898e-05, + "loss": 0.1831, + "step": 4320 + }, + { + "epoch": 1.4999133899185866, + "grad_norm": 0.17721043816157045, + "learning_rate": 2.9361050763041552e-05, + "loss": 0.1855, + "step": 4330 + }, + { + "epoch": 1.5033777931751255, + "grad_norm": 0.1624251601306164, + "learning_rate": 2.926179107221433e-05, + "loss": 0.1905, + "step": 4340 + }, + { + "epoch": 1.5068421964316645, + "grad_norm": 0.17242171402814246, + "learning_rate": 2.916246213924263e-05, + "loss": 0.1892, + "step": 4350 + }, + { + "epoch": 1.5103065996882037, + "grad_norm": 0.18060628796516595, + "learning_rate": 2.9063065577942873e-05, + "loss": 0.1868, + "step": 4360 + }, + { + "epoch": 1.5137710029447429, + "grad_norm": 0.16420679560340506, + "learning_rate": 2.896360300323022e-05, + "loss": 0.189, + "step": 4370 + }, + { + "epoch": 1.5172354062012818, + "grad_norm": 0.1657190366825487, + "learning_rate": 2.8864076031092375e-05, + "loss": 0.1862, + "step": 4380 + }, + { + "epoch": 1.5206998094578208, + "grad_norm": 0.14827671598571107, + "learning_rate": 2.8764486278563313e-05, + "loss": 0.187, + "step": 4390 + }, + { + "epoch": 1.52416421271436, + "grad_norm": 0.1696723630570198, + "learning_rate": 2.8664835363697028e-05, + "loss": 0.1878, + "step": 4400 + }, + { + "epoch": 1.5276286159708992, + "grad_norm": 0.16415531283137735, + "learning_rate": 2.8565124905541224e-05, + "loss": 0.1884, + "step": 4410 + }, + { + "epoch": 1.5310930192274381, + "grad_norm": 0.1711666868251127, + "learning_rate": 2.8465356524111014e-05, + "loss": 0.1885, + "step": 4420 + }, + { + "epoch": 1.534557422483977, + "grad_norm": 0.15183248647521377, + "learning_rate": 2.8365531840362586e-05, + "loss": 0.185, + "step": 4430 + }, + { + "epoch": 1.538021825740516, + "grad_norm": 0.14509942126397887, + "learning_rate": 2.826565247616692e-05, + "loss": 0.1882, + "step": 4440 + }, + { + "epoch": 1.5414862289970552, + "grad_norm": 0.1638920162743926, + "learning_rate": 2.816572005428337e-05, + "loss": 0.1868, + "step": 4450 + }, + { + "epoch": 1.5449506322535944, + "grad_norm": 0.15578153131291148, + "learning_rate": 2.8065736198333337e-05, + "loss": 0.1811, + "step": 4460 + }, + { + "epoch": 1.5484150355101334, + "grad_norm": 0.16655001845904682, + "learning_rate": 2.796570253277389e-05, + "loss": 0.1867, + "step": 4470 + }, + { + "epoch": 1.5518794387666723, + "grad_norm": 0.16373974199509916, + "learning_rate": 2.786562068287134e-05, + "loss": 0.1841, + "step": 4480 + }, + { + "epoch": 1.5553438420232115, + "grad_norm": 0.17150797676546983, + "learning_rate": 2.7765492274674887e-05, + "loss": 0.1885, + "step": 4490 + }, + { + "epoch": 1.5588082452797507, + "grad_norm": 0.1477537468546959, + "learning_rate": 2.7665318934990153e-05, + "loss": 0.1836, + "step": 4500 + }, + { + "epoch": 1.5622726485362897, + "grad_norm": 0.1553596996678754, + "learning_rate": 2.7565102291352785e-05, + "loss": 0.1828, + "step": 4510 + }, + { + "epoch": 1.5657370517928286, + "grad_norm": 0.1653908672889608, + "learning_rate": 2.7464843972001985e-05, + "loss": 0.1841, + "step": 4520 + }, + { + "epoch": 1.5692014550493676, + "grad_norm": 0.16834209700469813, + "learning_rate": 2.7364545605854077e-05, + "loss": 0.1868, + "step": 4530 + }, + { + "epoch": 1.5726658583059068, + "grad_norm": 0.16296690623854307, + "learning_rate": 2.7264208822476016e-05, + "loss": 0.1871, + "step": 4540 + }, + { + "epoch": 1.576130261562446, + "grad_norm": 0.1465435846295834, + "learning_rate": 2.716383525205896e-05, + "loss": 0.1845, + "step": 4550 + }, + { + "epoch": 1.579594664818985, + "grad_norm": 0.16909003341569917, + "learning_rate": 2.7063426525391732e-05, + "loss": 0.1818, + "step": 4560 + }, + { + "epoch": 1.5830590680755239, + "grad_norm": 0.16069669006075715, + "learning_rate": 2.6962984273834346e-05, + "loss": 0.1844, + "step": 4570 + }, + { + "epoch": 1.586523471332063, + "grad_norm": 0.1750416978664114, + "learning_rate": 2.686251012929151e-05, + "loss": 0.1876, + "step": 4580 + }, + { + "epoch": 1.5899878745886022, + "grad_norm": 0.16259236460228596, + "learning_rate": 2.6762005724186084e-05, + "loss": 0.1817, + "step": 4590 + }, + { + "epoch": 1.5934522778451412, + "grad_norm": 0.14258206554047775, + "learning_rate": 2.6661472691432614e-05, + "loss": 0.1908, + "step": 4600 + }, + { + "epoch": 1.5969166811016802, + "grad_norm": 0.15573453304245355, + "learning_rate": 2.6560912664410724e-05, + "loss": 0.1847, + "step": 4610 + }, + { + "epoch": 1.6003810843582191, + "grad_norm": 0.1598455887435374, + "learning_rate": 2.646032727693864e-05, + "loss": 0.1883, + "step": 4620 + }, + { + "epoch": 1.6038454876147583, + "grad_norm": 0.16653009190610318, + "learning_rate": 2.6359718163246627e-05, + "loss": 0.1817, + "step": 4630 + }, + { + "epoch": 1.6073098908712975, + "grad_norm": 0.16139062116102534, + "learning_rate": 2.6259086957950434e-05, + "loss": 0.186, + "step": 4640 + }, + { + "epoch": 1.6107742941278365, + "grad_norm": 0.1408890193988467, + "learning_rate": 2.615843529602472e-05, + "loss": 0.1883, + "step": 4650 + }, + { + "epoch": 1.6142386973843754, + "grad_norm": 0.15912928421327938, + "learning_rate": 2.6057764812776524e-05, + "loss": 0.1818, + "step": 4660 + }, + { + "epoch": 1.6177031006409146, + "grad_norm": 0.1531498559224487, + "learning_rate": 2.595707714381867e-05, + "loss": 0.1815, + "step": 4670 + }, + { + "epoch": 1.6211675038974538, + "grad_norm": 0.15818993699570458, + "learning_rate": 2.585637392504321e-05, + "loss": 0.189, + "step": 4680 + }, + { + "epoch": 1.6246319071539927, + "grad_norm": 0.1550856346527227, + "learning_rate": 2.575565679259483e-05, + "loss": 0.1851, + "step": 4690 + }, + { + "epoch": 1.6280963104105317, + "grad_norm": 0.17234204432744546, + "learning_rate": 2.5654927382844274e-05, + "loss": 0.1856, + "step": 4700 + }, + { + "epoch": 1.631560713667071, + "grad_norm": 0.14720447626557742, + "learning_rate": 2.555418733236176e-05, + "loss": 0.1849, + "step": 4710 + }, + { + "epoch": 1.6350251169236099, + "grad_norm": 0.15155258081793999, + "learning_rate": 2.545343827789039e-05, + "loss": 0.185, + "step": 4720 + }, + { + "epoch": 1.638489520180149, + "grad_norm": 0.1585948164278231, + "learning_rate": 2.5352681856319556e-05, + "loss": 0.1818, + "step": 4730 + }, + { + "epoch": 1.641953923436688, + "grad_norm": 0.16020278019866965, + "learning_rate": 2.5251919704658323e-05, + "loss": 0.1847, + "step": 4740 + }, + { + "epoch": 1.645418326693227, + "grad_norm": 0.1619196729529339, + "learning_rate": 2.5151153460008898e-05, + "loss": 0.1851, + "step": 4750 + }, + { + "epoch": 1.6488827299497661, + "grad_norm": 0.17892607432188418, + "learning_rate": 2.5050384759539946e-05, + "loss": 0.1884, + "step": 4760 + }, + { + "epoch": 1.6523471332063053, + "grad_norm": 0.1453101210669921, + "learning_rate": 2.4949615240460053e-05, + "loss": 0.1829, + "step": 4770 + }, + { + "epoch": 1.6558115364628443, + "grad_norm": 0.15218576172212772, + "learning_rate": 2.4848846539991108e-05, + "loss": 0.1815, + "step": 4780 + }, + { + "epoch": 1.6592759397193833, + "grad_norm": 0.14916349890801867, + "learning_rate": 2.474808029534168e-05, + "loss": 0.1831, + "step": 4790 + }, + { + "epoch": 1.6627403429759224, + "grad_norm": 0.16152070439341162, + "learning_rate": 2.464731814368045e-05, + "loss": 0.1845, + "step": 4800 + }, + { + "epoch": 1.6662047462324616, + "grad_norm": 0.2549593516440478, + "learning_rate": 2.4546561722109614e-05, + "loss": 0.1821, + "step": 4810 + }, + { + "epoch": 1.6696691494890006, + "grad_norm": 0.1423882724472122, + "learning_rate": 2.4445812667638242e-05, + "loss": 0.1824, + "step": 4820 + }, + { + "epoch": 1.6731335527455395, + "grad_norm": 0.16756009716869377, + "learning_rate": 2.4345072617155732e-05, + "loss": 0.1861, + "step": 4830 + }, + { + "epoch": 1.6765979560020785, + "grad_norm": 0.16461215951788935, + "learning_rate": 2.424434320740518e-05, + "loss": 0.1832, + "step": 4840 + }, + { + "epoch": 1.6800623592586177, + "grad_norm": 0.14517098753523155, + "learning_rate": 2.4143626074956796e-05, + "loss": 0.1785, + "step": 4850 + }, + { + "epoch": 1.6835267625151569, + "grad_norm": 0.15108292412387223, + "learning_rate": 2.4042922856181337e-05, + "loss": 0.1827, + "step": 4860 + }, + { + "epoch": 1.6869911657716958, + "grad_norm": 0.15525515390004765, + "learning_rate": 2.394223518722348e-05, + "loss": 0.1838, + "step": 4870 + }, + { + "epoch": 1.6904555690282348, + "grad_norm": 0.15512655551886048, + "learning_rate": 2.3841564703975287e-05, + "loss": 0.1812, + "step": 4880 + }, + { + "epoch": 1.693919972284774, + "grad_norm": 0.16187683902086444, + "learning_rate": 2.374091304204958e-05, + "loss": 0.1832, + "step": 4890 + }, + { + "epoch": 1.6973843755413132, + "grad_norm": 0.15579000956703476, + "learning_rate": 2.364028183675337e-05, + "loss": 0.1806, + "step": 4900 + }, + { + "epoch": 1.7008487787978521, + "grad_norm": 0.15600046355547195, + "learning_rate": 2.353967272306137e-05, + "loss": 0.1844, + "step": 4910 + }, + { + "epoch": 1.704313182054391, + "grad_norm": 0.15757986710095487, + "learning_rate": 2.3439087335589285e-05, + "loss": 0.1841, + "step": 4920 + }, + { + "epoch": 1.70777758531093, + "grad_norm": 0.1452879429548844, + "learning_rate": 2.333852730856739e-05, + "loss": 0.1846, + "step": 4930 + }, + { + "epoch": 1.7112419885674692, + "grad_norm": 0.14279806262433956, + "learning_rate": 2.3237994275813918e-05, + "loss": 0.1846, + "step": 4940 + }, + { + "epoch": 1.7147063918240084, + "grad_norm": 0.168480732012115, + "learning_rate": 2.3137489870708494e-05, + "loss": 0.1854, + "step": 4950 + }, + { + "epoch": 1.7181707950805474, + "grad_norm": 0.14771704725144452, + "learning_rate": 2.303701572616566e-05, + "loss": 0.1781, + "step": 4960 + }, + { + "epoch": 1.7216351983370863, + "grad_norm": 0.15399297657379268, + "learning_rate": 2.2936573474608274e-05, + "loss": 0.1851, + "step": 4970 + }, + { + "epoch": 1.7250996015936255, + "grad_norm": 0.14268904873654373, + "learning_rate": 2.283616474794104e-05, + "loss": 0.1793, + "step": 4980 + }, + { + "epoch": 1.7285640048501647, + "grad_norm": 0.1458152221590982, + "learning_rate": 2.273579117752399e-05, + "loss": 0.18, + "step": 4990 + }, + { + "epoch": 1.7320284081067037, + "grad_norm": 0.157752398066733, + "learning_rate": 2.2635454394145926e-05, + "loss": 0.1804, + "step": 5000 + }, + { + "epoch": 1.7354928113632426, + "grad_norm": 0.16354623197521176, + "learning_rate": 2.253515602799802e-05, + "loss": 0.18, + "step": 5010 + }, + { + "epoch": 1.7389572146197816, + "grad_norm": 0.14987547413644828, + "learning_rate": 2.2434897708647225e-05, + "loss": 0.1884, + "step": 5020 + }, + { + "epoch": 1.7424216178763208, + "grad_norm": 0.13264921207181166, + "learning_rate": 2.233468106500985e-05, + "loss": 0.1817, + "step": 5030 + }, + { + "epoch": 1.74588602113286, + "grad_norm": 0.16074846687523298, + "learning_rate": 2.2234507725325115e-05, + "loss": 0.1821, + "step": 5040 + }, + { + "epoch": 1.749350424389399, + "grad_norm": 0.14912108468148355, + "learning_rate": 2.2134379317128666e-05, + "loss": 0.1831, + "step": 5050 + }, + { + "epoch": 1.7528148276459379, + "grad_norm": 0.15108125406874282, + "learning_rate": 2.2034297467226117e-05, + "loss": 0.1849, + "step": 5060 + }, + { + "epoch": 1.756279230902477, + "grad_norm": 0.1841322575834389, + "learning_rate": 2.193426380166667e-05, + "loss": 0.1814, + "step": 5070 + }, + { + "epoch": 1.7597436341590162, + "grad_norm": 0.15923619065201655, + "learning_rate": 2.183427994571663e-05, + "loss": 0.1826, + "step": 5080 + }, + { + "epoch": 1.7632080374155552, + "grad_norm": 0.15869671284509257, + "learning_rate": 2.1734347523833088e-05, + "loss": 0.1825, + "step": 5090 + }, + { + "epoch": 1.7666724406720942, + "grad_norm": 0.15221224358826752, + "learning_rate": 2.163446815963742e-05, + "loss": 0.182, + "step": 5100 + }, + { + "epoch": 1.7701368439286331, + "grad_norm": 0.1460640105447675, + "learning_rate": 2.1534643475888995e-05, + "loss": 0.1823, + "step": 5110 + }, + { + "epoch": 1.7736012471851723, + "grad_norm": 0.1521505743886422, + "learning_rate": 2.1434875094458785e-05, + "loss": 0.1874, + "step": 5120 + }, + { + "epoch": 1.7770656504417115, + "grad_norm": 0.14416303035440375, + "learning_rate": 2.133516463630297e-05, + "loss": 0.1808, + "step": 5130 + }, + { + "epoch": 1.7805300536982505, + "grad_norm": 0.1586845052440452, + "learning_rate": 2.1235513721436693e-05, + "loss": 0.1841, + "step": 5140 + }, + { + "epoch": 1.7839944569547894, + "grad_norm": 0.14468498059776927, + "learning_rate": 2.113592396890764e-05, + "loss": 0.181, + "step": 5150 + }, + { + "epoch": 1.7874588602113286, + "grad_norm": 0.14200788749614593, + "learning_rate": 2.1036396996769785e-05, + "loss": 0.1806, + "step": 5160 + }, + { + "epoch": 1.7909232634678678, + "grad_norm": 0.14145049327765216, + "learning_rate": 2.093693442205713e-05, + "loss": 0.1844, + "step": 5170 + }, + { + "epoch": 1.7943876667244067, + "grad_norm": 0.14047484844001326, + "learning_rate": 2.0837537860757378e-05, + "loss": 0.1856, + "step": 5180 + }, + { + "epoch": 1.7978520699809457, + "grad_norm": 0.15798548962919395, + "learning_rate": 2.073820892778568e-05, + "loss": 0.1781, + "step": 5190 + }, + { + "epoch": 1.801316473237485, + "grad_norm": 0.15599305625176324, + "learning_rate": 2.063894923695846e-05, + "loss": 0.1846, + "step": 5200 + }, + { + "epoch": 1.8047808764940239, + "grad_norm": 0.15451286542161505, + "learning_rate": 2.0539760400967105e-05, + "loss": 0.1814, + "step": 5210 + }, + { + "epoch": 1.808245279750563, + "grad_norm": 0.15151680436099774, + "learning_rate": 2.0440644031351846e-05, + "loss": 0.187, + "step": 5220 + }, + { + "epoch": 1.811709683007102, + "grad_norm": 0.16740847608877021, + "learning_rate": 2.0341601738475532e-05, + "loss": 0.1788, + "step": 5230 + }, + { + "epoch": 1.815174086263641, + "grad_norm": 0.15255748452386272, + "learning_rate": 2.0242635131497444e-05, + "loss": 0.1799, + "step": 5240 + }, + { + "epoch": 1.8186384895201801, + "grad_norm": 0.21370458116557772, + "learning_rate": 2.0143745818347226e-05, + "loss": 0.1859, + "step": 5250 + }, + { + "epoch": 1.8221028927767193, + "grad_norm": 0.15178882291540086, + "learning_rate": 2.004493540569867e-05, + "loss": 0.1816, + "step": 5260 + }, + { + "epoch": 1.8255672960332583, + "grad_norm": 0.1381809582876828, + "learning_rate": 1.9946205498943693e-05, + "loss": 0.1782, + "step": 5270 + }, + { + "epoch": 1.8290316992897973, + "grad_norm": 0.15231248860338162, + "learning_rate": 1.9847557702166185e-05, + "loss": 0.182, + "step": 5280 + }, + { + "epoch": 1.8324961025463364, + "grad_norm": 0.13540143942421462, + "learning_rate": 1.9748993618116003e-05, + "loss": 0.1802, + "step": 5290 + }, + { + "epoch": 1.8359605058028756, + "grad_norm": 0.15057686984095897, + "learning_rate": 1.9650514848182902e-05, + "loss": 0.1845, + "step": 5300 + }, + { + "epoch": 1.8394249090594146, + "grad_norm": 0.15657239979982757, + "learning_rate": 1.9552122992370515e-05, + "loss": 0.1816, + "step": 5310 + }, + { + "epoch": 1.8428893123159535, + "grad_norm": 0.14942811146448692, + "learning_rate": 1.9453819649270356e-05, + "loss": 0.1881, + "step": 5320 + }, + { + "epoch": 1.8463537155724925, + "grad_norm": 0.14206236299677405, + "learning_rate": 1.9355606416035893e-05, + "loss": 0.1798, + "step": 5330 + }, + { + "epoch": 1.8498181188290317, + "grad_norm": 0.15512685603221288, + "learning_rate": 1.925748488835651e-05, + "loss": 0.1787, + "step": 5340 + }, + { + "epoch": 1.8532825220855709, + "grad_norm": 0.14951858713327895, + "learning_rate": 1.9159456660431675e-05, + "loss": 0.1808, + "step": 5350 + }, + { + "epoch": 1.8567469253421098, + "grad_norm": 0.1518891829482899, + "learning_rate": 1.906152332494499e-05, + "loss": 0.1847, + "step": 5360 + }, + { + "epoch": 1.8602113285986488, + "grad_norm": 0.1599605482136836, + "learning_rate": 1.8963686473038286e-05, + "loss": 0.1758, + "step": 5370 + }, + { + "epoch": 1.863675731855188, + "grad_norm": 0.14097789058459068, + "learning_rate": 1.8865947694285863e-05, + "loss": 0.1814, + "step": 5380 + }, + { + "epoch": 1.8671401351117272, + "grad_norm": 0.13734664780231215, + "learning_rate": 1.876830857666855e-05, + "loss": 0.1813, + "step": 5390 + }, + { + "epoch": 1.8706045383682661, + "grad_norm": 0.15463681734647824, + "learning_rate": 1.867077070654802e-05, + "loss": 0.1807, + "step": 5400 + }, + { + "epoch": 1.874068941624805, + "grad_norm": 0.16737868012866194, + "learning_rate": 1.85733356686409e-05, + "loss": 0.1791, + "step": 5410 + }, + { + "epoch": 1.877533344881344, + "grad_norm": 0.1422554567768386, + "learning_rate": 1.847600504599312e-05, + "loss": 0.1812, + "step": 5420 + }, + { + "epoch": 1.8809977481378832, + "grad_norm": 0.1499775475287378, + "learning_rate": 1.8378780419954168e-05, + "loss": 0.1791, + "step": 5430 + }, + { + "epoch": 1.8844621513944224, + "grad_norm": 0.1620107234869922, + "learning_rate": 1.828166337015133e-05, + "loss": 0.1798, + "step": 5440 + }, + { + "epoch": 1.8879265546509614, + "grad_norm": 0.14943512022105485, + "learning_rate": 1.8184655474464122e-05, + "loss": 0.1769, + "step": 5450 + }, + { + "epoch": 1.8913909579075003, + "grad_norm": 0.1636002811165978, + "learning_rate": 1.8087758308998607e-05, + "loss": 0.1828, + "step": 5460 + }, + { + "epoch": 1.8948553611640395, + "grad_norm": 0.1488164351792101, + "learning_rate": 1.7990973448061788e-05, + "loss": 0.1793, + "step": 5470 + }, + { + "epoch": 1.8983197644205787, + "grad_norm": 0.1495335119087624, + "learning_rate": 1.7894302464136028e-05, + "loss": 0.1804, + "step": 5480 + }, + { + "epoch": 1.9017841676771177, + "grad_norm": 0.13881655138176263, + "learning_rate": 1.7797746927853524e-05, + "loss": 0.1808, + "step": 5490 + }, + { + "epoch": 1.9052485709336566, + "grad_norm": 0.14655156873185612, + "learning_rate": 1.770130840797075e-05, + "loss": 0.1824, + "step": 5500 + }, + { + "epoch": 1.9087129741901956, + "grad_norm": 0.1401693117484928, + "learning_rate": 1.7604988471343026e-05, + "loss": 0.1836, + "step": 5510 + }, + { + "epoch": 1.9121773774467348, + "grad_norm": 0.13663044985397266, + "learning_rate": 1.750878868289898e-05, + "loss": 0.179, + "step": 5520 + }, + { + "epoch": 1.915641780703274, + "grad_norm": 0.1440844464161243, + "learning_rate": 1.741271060561522e-05, + "loss": 0.1794, + "step": 5530 + }, + { + "epoch": 1.919106183959813, + "grad_norm": 0.1579545402684227, + "learning_rate": 1.731675580049085e-05, + "loss": 0.1762, + "step": 5540 + }, + { + "epoch": 1.9225705872163519, + "grad_norm": 0.14451711020259372, + "learning_rate": 1.7220925826522158e-05, + "loss": 0.179, + "step": 5550 + }, + { + "epoch": 1.926034990472891, + "grad_norm": 0.16318851480604618, + "learning_rate": 1.71252222406773e-05, + "loss": 0.1794, + "step": 5560 + }, + { + "epoch": 1.9294993937294302, + "grad_norm": 0.16754335602378248, + "learning_rate": 1.7029646597870934e-05, + "loss": 0.1809, + "step": 5570 + }, + { + "epoch": 1.9329637969859692, + "grad_norm": 0.14425404689789675, + "learning_rate": 1.693420045093905e-05, + "loss": 0.181, + "step": 5580 + }, + { + "epoch": 1.9364282002425082, + "grad_norm": 0.14950486896828924, + "learning_rate": 1.6838885350613664e-05, + "loss": 0.1834, + "step": 5590 + }, + { + "epoch": 1.9398926034990471, + "grad_norm": 0.15401643917378996, + "learning_rate": 1.674370284549765e-05, + "loss": 0.18, + "step": 5600 + }, + { + "epoch": 1.9433570067555863, + "grad_norm": 0.1457933918907984, + "learning_rate": 1.6648654482039616e-05, + "loss": 0.1798, + "step": 5610 + }, + { + "epoch": 1.9468214100121255, + "grad_norm": 0.15104837558607415, + "learning_rate": 1.6553741804508704e-05, + "loss": 0.1808, + "step": 5620 + }, + { + "epoch": 1.9502858132686645, + "grad_norm": 0.15234854679585655, + "learning_rate": 1.6458966354969553e-05, + "loss": 0.183, + "step": 5630 + }, + { + "epoch": 1.9537502165252034, + "grad_norm": 0.15054697844231169, + "learning_rate": 1.6364329673257244e-05, + "loss": 0.1782, + "step": 5640 + }, + { + "epoch": 1.9572146197817426, + "grad_norm": 0.19519516147127766, + "learning_rate": 1.6269833296952267e-05, + "loss": 0.1779, + "step": 5650 + }, + { + "epoch": 1.9606790230382818, + "grad_norm": 0.14953949146669174, + "learning_rate": 1.617547876135553e-05, + "loss": 0.1817, + "step": 5660 + }, + { + "epoch": 1.9641434262948207, + "grad_norm": 0.14098620367788547, + "learning_rate": 1.6081267599463446e-05, + "loss": 0.1795, + "step": 5670 + }, + { + "epoch": 1.9676078295513597, + "grad_norm": 0.1516762253034418, + "learning_rate": 1.598720134194298e-05, + "loss": 0.175, + "step": 5680 + }, + { + "epoch": 1.971072232807899, + "grad_norm": 0.14592317747183597, + "learning_rate": 1.5893281517106852e-05, + "loss": 0.1817, + "step": 5690 + }, + { + "epoch": 1.9745366360644379, + "grad_norm": 0.1367156708009888, + "learning_rate": 1.5799509650888605e-05, + "loss": 0.1792, + "step": 5700 + }, + { + "epoch": 1.978001039320977, + "grad_norm": 0.15222974333942974, + "learning_rate": 1.5705887266817926e-05, + "loss": 0.1838, + "step": 5710 + }, + { + "epoch": 1.981465442577516, + "grad_norm": 0.15063980268483795, + "learning_rate": 1.5612415885995803e-05, + "loss": 0.1798, + "step": 5720 + }, + { + "epoch": 1.984929845834055, + "grad_norm": 0.14361029580183732, + "learning_rate": 1.551909702706984e-05, + "loss": 0.1775, + "step": 5730 + }, + { + "epoch": 1.9883942490905941, + "grad_norm": 0.14905006323407718, + "learning_rate": 1.5425932206209617e-05, + "loss": 0.1853, + "step": 5740 + }, + { + "epoch": 1.9918586523471333, + "grad_norm": 0.1473397517873399, + "learning_rate": 1.533292293708201e-05, + "loss": 0.1857, + "step": 5750 + }, + { + "epoch": 1.9953230556036723, + "grad_norm": 0.14626760437882622, + "learning_rate": 1.52400707308266e-05, + "loss": 0.1741, + "step": 5760 + }, + { + "epoch": 1.9987874588602113, + "grad_norm": 0.1405339530408193, + "learning_rate": 1.5147377096031173e-05, + "loss": 0.1757, + "step": 5770 + }, + { + "epoch": 2.0020786419539234, + "grad_norm": 0.17093438476339481, + "learning_rate": 1.5054843538707147e-05, + "loss": 0.1598, + "step": 5780 + }, + { + "epoch": 2.0055430452104623, + "grad_norm": 0.15448133744376852, + "learning_rate": 1.4962471562265151e-05, + "loss": 0.1509, + "step": 5790 + }, + { + "epoch": 2.0090074484670017, + "grad_norm": 0.15883932121601194, + "learning_rate": 1.4870262667490553e-05, + "loss": 0.1508, + "step": 5800 + }, + { + "epoch": 2.0124718517235407, + "grad_norm": 0.15799483695764974, + "learning_rate": 1.4778218352519113e-05, + "loss": 0.1514, + "step": 5810 + }, + { + "epoch": 2.0159362549800797, + "grad_norm": 0.1397582105470391, + "learning_rate": 1.4686340112812644e-05, + "loss": 0.1513, + "step": 5820 + }, + { + "epoch": 2.0194006582366186, + "grad_norm": 0.13988381555476553, + "learning_rate": 1.4594629441134674e-05, + "loss": 0.1516, + "step": 5830 + }, + { + "epoch": 2.0228650614931576, + "grad_norm": 0.15357440805298642, + "learning_rate": 1.4503087827526257e-05, + "loss": 0.1537, + "step": 5840 + }, + { + "epoch": 2.026329464749697, + "grad_norm": 0.140105602913093, + "learning_rate": 1.4411716759281701e-05, + "loss": 0.1489, + "step": 5850 + }, + { + "epoch": 2.029793868006236, + "grad_norm": 0.1371171850215917, + "learning_rate": 1.4320517720924423e-05, + "loss": 0.1478, + "step": 5860 + }, + { + "epoch": 2.033258271262775, + "grad_norm": 0.1415040646234407, + "learning_rate": 1.4229492194182864e-05, + "loss": 0.1511, + "step": 5870 + }, + { + "epoch": 2.036722674519314, + "grad_norm": 0.1455346789180852, + "learning_rate": 1.4138641657966387e-05, + "loss": 0.1541, + "step": 5880 + }, + { + "epoch": 2.0401870777758533, + "grad_norm": 0.14764527095741217, + "learning_rate": 1.4047967588341216e-05, + "loss": 0.1528, + "step": 5890 + }, + { + "epoch": 2.0436514810323922, + "grad_norm": 0.13949032357196248, + "learning_rate": 1.3957471458506536e-05, + "loss": 0.1539, + "step": 5900 + }, + { + "epoch": 2.047115884288931, + "grad_norm": 0.13705465172650183, + "learning_rate": 1.386715473877046e-05, + "loss": 0.1457, + "step": 5910 + }, + { + "epoch": 2.05058028754547, + "grad_norm": 0.12956466312791168, + "learning_rate": 1.3777018896526236e-05, + "loss": 0.1473, + "step": 5920 + }, + { + "epoch": 2.0540446908020096, + "grad_norm": 0.15368302078148194, + "learning_rate": 1.3687065396228332e-05, + "loss": 0.1497, + "step": 5930 + }, + { + "epoch": 2.0575090940585485, + "grad_norm": 0.1428450265623596, + "learning_rate": 1.3597295699368668e-05, + "loss": 0.1514, + "step": 5940 + }, + { + "epoch": 2.0609734973150875, + "grad_norm": 0.1281258398579668, + "learning_rate": 1.3507711264452905e-05, + "loss": 0.1483, + "step": 5950 + }, + { + "epoch": 2.0644379005716265, + "grad_norm": 0.15262843729207556, + "learning_rate": 1.3418313546976676e-05, + "loss": 0.1466, + "step": 5960 + }, + { + "epoch": 2.0679023038281654, + "grad_norm": 0.14904369630104097, + "learning_rate": 1.332910399940202e-05, + "loss": 0.1454, + "step": 5970 + }, + { + "epoch": 2.071366707084705, + "grad_norm": 0.1423441662005472, + "learning_rate": 1.324008407113371e-05, + "loss": 0.1501, + "step": 5980 + }, + { + "epoch": 2.074831110341244, + "grad_norm": 0.1422975070772413, + "learning_rate": 1.3151255208495755e-05, + "loss": 0.154, + "step": 5990 + }, + { + "epoch": 2.0782955135977828, + "grad_norm": 0.1352228664513086, + "learning_rate": 1.306261885470789e-05, + "loss": 0.1506, + "step": 6000 + }, + { + "epoch": 2.0817599168543217, + "grad_norm": 0.13493941433636564, + "learning_rate": 1.2974176449862101e-05, + "loss": 0.148, + "step": 6010 + }, + { + "epoch": 2.085224320110861, + "grad_norm": 0.13664044137951584, + "learning_rate": 1.2885929430899258e-05, + "loss": 0.1479, + "step": 6020 + }, + { + "epoch": 2.0886887233674, + "grad_norm": 0.13427253141306963, + "learning_rate": 1.279787923158577e-05, + "loss": 0.1451, + "step": 6030 + }, + { + "epoch": 2.092153126623939, + "grad_norm": 0.18500494498072673, + "learning_rate": 1.2710027282490247e-05, + "loss": 0.1504, + "step": 6040 + }, + { + "epoch": 2.095617529880478, + "grad_norm": 0.1384525206350356, + "learning_rate": 1.2622375010960335e-05, + "loss": 0.1502, + "step": 6050 + }, + { + "epoch": 2.099081933137017, + "grad_norm": 0.1417115574643469, + "learning_rate": 1.2534923841099445e-05, + "loss": 0.1522, + "step": 6060 + }, + { + "epoch": 2.1025463363935564, + "grad_norm": 0.14117258627489582, + "learning_rate": 1.2447675193743651e-05, + "loss": 0.149, + "step": 6070 + }, + { + "epoch": 2.1060107396500953, + "grad_norm": 0.13976875251148813, + "learning_rate": 1.2360630486438635e-05, + "loss": 0.1479, + "step": 6080 + }, + { + "epoch": 2.1094751429066343, + "grad_norm": 0.13438983639128466, + "learning_rate": 1.2273791133416584e-05, + "loss": 0.1493, + "step": 6090 + }, + { + "epoch": 2.1129395461631733, + "grad_norm": 0.12916426405356554, + "learning_rate": 1.2187158545573295e-05, + "loss": 0.1462, + "step": 6100 + }, + { + "epoch": 2.1164039494197127, + "grad_norm": 0.14884278506870566, + "learning_rate": 1.2100734130445173e-05, + "loss": 0.1534, + "step": 6110 + }, + { + "epoch": 2.1198683526762516, + "grad_norm": 0.14763268804961824, + "learning_rate": 1.2014519292186428e-05, + "loss": 0.1504, + "step": 6120 + }, + { + "epoch": 2.1233327559327906, + "grad_norm": 0.14546584264873522, + "learning_rate": 1.1928515431546233e-05, + "loss": 0.1549, + "step": 6130 + }, + { + "epoch": 2.1267971591893295, + "grad_norm": 0.13000521630439743, + "learning_rate": 1.1842723945845948e-05, + "loss": 0.1515, + "step": 6140 + }, + { + "epoch": 2.1302615624458685, + "grad_norm": 0.1380689347039955, + "learning_rate": 1.1757146228956445e-05, + "loss": 0.1534, + "step": 6150 + }, + { + "epoch": 2.133725965702408, + "grad_norm": 0.13885554681042553, + "learning_rate": 1.1671783671275467e-05, + "loss": 0.1477, + "step": 6160 + }, + { + "epoch": 2.137190368958947, + "grad_norm": 0.1413517568795959, + "learning_rate": 1.1586637659704994e-05, + "loss": 0.1489, + "step": 6170 + }, + { + "epoch": 2.140654772215486, + "grad_norm": 0.1445567017190447, + "learning_rate": 1.1501709577628777e-05, + "loss": 0.1527, + "step": 6180 + }, + { + "epoch": 2.144119175472025, + "grad_norm": 0.13842122845453322, + "learning_rate": 1.1417000804889793e-05, + "loss": 0.149, + "step": 6190 + }, + { + "epoch": 2.147583578728564, + "grad_norm": 0.1351342875816243, + "learning_rate": 1.1332512717767862e-05, + "loss": 0.1499, + "step": 6200 + }, + { + "epoch": 2.151047981985103, + "grad_norm": 0.19203561762793064, + "learning_rate": 1.1248246688957307e-05, + "loss": 0.1516, + "step": 6210 + }, + { + "epoch": 2.154512385241642, + "grad_norm": 0.13275502783261534, + "learning_rate": 1.1164204087544589e-05, + "loss": 0.1496, + "step": 6220 + }, + { + "epoch": 2.157976788498181, + "grad_norm": 0.14124554657127336, + "learning_rate": 1.108038627898613e-05, + "loss": 0.1469, + "step": 6230 + }, + { + "epoch": 2.16144119175472, + "grad_norm": 0.13662855483900213, + "learning_rate": 1.0996794625086102e-05, + "loss": 0.1533, + "step": 6240 + }, + { + "epoch": 2.1649055950112595, + "grad_norm": 0.13136794147532813, + "learning_rate": 1.091343048397426e-05, + "loss": 0.1479, + "step": 6250 + }, + { + "epoch": 2.1683699982677984, + "grad_norm": 0.14289580428302312, + "learning_rate": 1.0830295210083968e-05, + "loss": 0.1523, + "step": 6260 + }, + { + "epoch": 2.1718344015243374, + "grad_norm": 0.14336144046136406, + "learning_rate": 1.0747390154130097e-05, + "loss": 0.1482, + "step": 6270 + }, + { + "epoch": 2.1752988047808763, + "grad_norm": 0.12351594136318876, + "learning_rate": 1.0664716663087132e-05, + "loss": 0.148, + "step": 6280 + }, + { + "epoch": 2.1787632080374157, + "grad_norm": 0.1263105200697639, + "learning_rate": 1.0582276080167299e-05, + "loss": 0.1488, + "step": 6290 + }, + { + "epoch": 2.1822276112939547, + "grad_norm": 0.1409302104392171, + "learning_rate": 1.0500069744798696e-05, + "loss": 0.1493, + "step": 6300 + }, + { + "epoch": 2.1856920145504937, + "grad_norm": 0.13878678763750232, + "learning_rate": 1.0418098992603576e-05, + "loss": 0.1467, + "step": 6310 + }, + { + "epoch": 2.1891564178070326, + "grad_norm": 0.13130670045208906, + "learning_rate": 1.033636515537661e-05, + "loss": 0.15, + "step": 6320 + }, + { + "epoch": 2.1926208210635716, + "grad_norm": 0.13037720388849533, + "learning_rate": 1.0254869561063263e-05, + "loss": 0.1514, + "step": 6330 + }, + { + "epoch": 2.196085224320111, + "grad_norm": 0.1392467131479532, + "learning_rate": 1.0173613533738238e-05, + "loss": 0.1497, + "step": 6340 + }, + { + "epoch": 2.19954962757665, + "grad_norm": 0.13583441746880653, + "learning_rate": 1.0092598393583949e-05, + "loss": 0.1533, + "step": 6350 + }, + { + "epoch": 2.203014030833189, + "grad_norm": 0.138478347457145, + "learning_rate": 1.001182545686904e-05, + "loss": 0.1474, + "step": 6360 + }, + { + "epoch": 2.206478434089728, + "grad_norm": 0.13196884066043799, + "learning_rate": 9.931296035927068e-06, + "loss": 0.1485, + "step": 6370 + }, + { + "epoch": 2.2099428373462673, + "grad_norm": 0.12628759927748884, + "learning_rate": 9.851011439135105e-06, + "loss": 0.1512, + "step": 6380 + }, + { + "epoch": 2.2134072406028062, + "grad_norm": 0.12720352197837026, + "learning_rate": 9.770972970892553e-06, + "loss": 0.1492, + "step": 6390 + }, + { + "epoch": 2.216871643859345, + "grad_norm": 0.1288529643249653, + "learning_rate": 9.691181931599886e-06, + "loss": 0.1463, + "step": 6400 + }, + { + "epoch": 2.220336047115884, + "grad_norm": 0.12600916062409626, + "learning_rate": 9.611639617637558e-06, + "loss": 0.1488, + "step": 6410 + }, + { + "epoch": 2.2238004503724236, + "grad_norm": 0.14329418696142437, + "learning_rate": 9.532347321344956e-06, + "loss": 0.1521, + "step": 6420 + }, + { + "epoch": 2.2272648536289625, + "grad_norm": 0.13819893815319562, + "learning_rate": 9.453306330999349e-06, + "loss": 0.1501, + "step": 6430 + }, + { + "epoch": 2.2307292568855015, + "grad_norm": 0.13277309165484377, + "learning_rate": 9.37451793079502e-06, + "loss": 0.1515, + "step": 6440 + }, + { + "epoch": 2.2341936601420405, + "grad_norm": 0.1304498393414478, + "learning_rate": 9.29598340082236e-06, + "loss": 0.1498, + "step": 6450 + }, + { + "epoch": 2.2376580633985794, + "grad_norm": 0.13297679247054883, + "learning_rate": 9.217704017047057e-06, + "loss": 0.151, + "step": 6460 + }, + { + "epoch": 2.241122466655119, + "grad_norm": 0.12871920957838523, + "learning_rate": 9.139681051289425e-06, + "loss": 0.1507, + "step": 6470 + }, + { + "epoch": 2.244586869911658, + "grad_norm": 0.1377467169551894, + "learning_rate": 9.061915771203695e-06, + "loss": 0.1477, + "step": 6480 + }, + { + "epoch": 2.2480512731681968, + "grad_norm": 0.14354881718167706, + "learning_rate": 8.984409440257427e-06, + "loss": 0.1496, + "step": 6490 + }, + { + "epoch": 2.2515156764247357, + "grad_norm": 0.13146580616283166, + "learning_rate": 8.907163317710976e-06, + "loss": 0.1499, + "step": 6500 + }, + { + "epoch": 2.2549800796812747, + "grad_norm": 0.14028927070593003, + "learning_rate": 8.830178658597038e-06, + "loss": 0.1482, + "step": 6510 + }, + { + "epoch": 2.258444482937814, + "grad_norm": 0.13406176038514053, + "learning_rate": 8.75345671370029e-06, + "loss": 0.1513, + "step": 6520 + }, + { + "epoch": 2.261908886194353, + "grad_norm": 0.1281531685378758, + "learning_rate": 8.676998729537009e-06, + "loss": 0.1487, + "step": 6530 + }, + { + "epoch": 2.265373289450892, + "grad_norm": 0.1334136953967956, + "learning_rate": 8.600805948334858e-06, + "loss": 0.1461, + "step": 6540 + }, + { + "epoch": 2.268837692707431, + "grad_norm": 0.13263104957255284, + "learning_rate": 8.524879608012714e-06, + "loss": 0.1494, + "step": 6550 + }, + { + "epoch": 2.2723020959639704, + "grad_norm": 0.14074464264886974, + "learning_rate": 8.449220942160512e-06, + "loss": 0.1492, + "step": 6560 + }, + { + "epoch": 2.2757664992205093, + "grad_norm": 0.13291686489725904, + "learning_rate": 8.373831180019256e-06, + "loss": 0.1468, + "step": 6570 + }, + { + "epoch": 2.2792309024770483, + "grad_norm": 0.13968516954362226, + "learning_rate": 8.298711546460986e-06, + "loss": 0.148, + "step": 6580 + }, + { + "epoch": 2.2826953057335873, + "grad_norm": 0.13617881495577386, + "learning_rate": 8.223863261968945e-06, + "loss": 0.1514, + "step": 6590 + }, + { + "epoch": 2.2861597089901267, + "grad_norm": 0.13589641843226832, + "learning_rate": 8.149287542617686e-06, + "loss": 0.147, + "step": 6600 + }, + { + "epoch": 2.2896241122466656, + "grad_norm": 0.13498192446917104, + "learning_rate": 8.074985600053361e-06, + "loss": 0.1559, + "step": 6610 + }, + { + "epoch": 2.2930885155032046, + "grad_norm": 0.13109156975966305, + "learning_rate": 8.000958641474021e-06, + "loss": 0.1524, + "step": 6620 + }, + { + "epoch": 2.2965529187597435, + "grad_norm": 0.1320540630523605, + "learning_rate": 7.927207869609984e-06, + "loss": 0.1493, + "step": 6630 + }, + { + "epoch": 2.3000173220162825, + "grad_norm": 0.14127205971076925, + "learning_rate": 7.853734482704309e-06, + "loss": 0.1511, + "step": 6640 + }, + { + "epoch": 2.303481725272822, + "grad_norm": 0.13466133744323935, + "learning_rate": 7.780539674493345e-06, + "loss": 0.1506, + "step": 6650 + }, + { + "epoch": 2.306946128529361, + "grad_norm": 0.1286157583033884, + "learning_rate": 7.707624634187308e-06, + "loss": 0.1527, + "step": 6660 + }, + { + "epoch": 2.3104105317859, + "grad_norm": 0.1380982881943608, + "learning_rate": 7.63499054645096e-06, + "loss": 0.1484, + "step": 6670 + }, + { + "epoch": 2.313874935042439, + "grad_norm": 0.15156686175057582, + "learning_rate": 7.562638591384396e-06, + "loss": 0.1461, + "step": 6680 + }, + { + "epoch": 2.3173393382989778, + "grad_norm": 0.12803039542374958, + "learning_rate": 7.4905699445038255e-06, + "loss": 0.1483, + "step": 6690 + }, + { + "epoch": 2.320803741555517, + "grad_norm": 0.13309461688414237, + "learning_rate": 7.418785776722514e-06, + "loss": 0.151, + "step": 6700 + }, + { + "epoch": 2.324268144812056, + "grad_norm": 0.12574985983373116, + "learning_rate": 7.34728725433172e-06, + "loss": 0.1466, + "step": 6710 + }, + { + "epoch": 2.327732548068595, + "grad_norm": 0.13375589775606558, + "learning_rate": 7.276075538981778e-06, + "loss": 0.1511, + "step": 6720 + }, + { + "epoch": 2.3311969513251345, + "grad_norm": 0.13021533231229976, + "learning_rate": 7.205151787663222e-06, + "loss": 0.1486, + "step": 6730 + }, + { + "epoch": 2.3346613545816735, + "grad_norm": 0.1238991538662092, + "learning_rate": 7.134517152687953e-06, + "loss": 0.1467, + "step": 6740 + }, + { + "epoch": 2.3381257578382124, + "grad_norm": 0.13319985151049268, + "learning_rate": 7.064172781670569e-06, + "loss": 0.1504, + "step": 6750 + }, + { + "epoch": 2.3415901610947514, + "grad_norm": 0.13427753958581112, + "learning_rate": 6.994119817509678e-06, + "loss": 0.1454, + "step": 6760 + }, + { + "epoch": 2.3450545643512903, + "grad_norm": 0.1329147728324323, + "learning_rate": 6.924359398369342e-06, + "loss": 0.1487, + "step": 6770 + }, + { + "epoch": 2.3485189676078297, + "grad_norm": 0.11940132635278913, + "learning_rate": 6.854892657660605e-06, + "loss": 0.1476, + "step": 6780 + }, + { + "epoch": 2.3519833708643687, + "grad_norm": 0.13086880472397885, + "learning_rate": 6.785720724023042e-06, + "loss": 0.1483, + "step": 6790 + }, + { + "epoch": 2.3554477741209077, + "grad_norm": 0.12994412754505075, + "learning_rate": 6.716844721306443e-06, + "loss": 0.1496, + "step": 6800 + }, + { + "epoch": 2.3589121773774466, + "grad_norm": 0.13309217588951705, + "learning_rate": 6.648265768552569e-06, + "loss": 0.1469, + "step": 6810 + }, + { + "epoch": 2.3623765806339856, + "grad_norm": 0.1353366584822473, + "learning_rate": 6.579984979976925e-06, + "loss": 0.151, + "step": 6820 + }, + { + "epoch": 2.365840983890525, + "grad_norm": 0.13072638016170673, + "learning_rate": 6.512003464950706e-06, + "loss": 0.1498, + "step": 6830 + }, + { + "epoch": 2.369305387147064, + "grad_norm": 0.14086411924508258, + "learning_rate": 6.444322327982752e-06, + "loss": 0.1488, + "step": 6840 + }, + { + "epoch": 2.372769790403603, + "grad_norm": 0.13022025324073322, + "learning_rate": 6.376942668701586e-06, + "loss": 0.1505, + "step": 6850 + }, + { + "epoch": 2.376234193660142, + "grad_norm": 0.1270304545640671, + "learning_rate": 6.309865581837584e-06, + "loss": 0.1467, + "step": 6860 + }, + { + "epoch": 2.3796985969166813, + "grad_norm": 0.13824906046376823, + "learning_rate": 6.243092157205146e-06, + "loss": 0.1479, + "step": 6870 + }, + { + "epoch": 2.3831630001732202, + "grad_norm": 0.13322342141123356, + "learning_rate": 6.1766234796850426e-06, + "loss": 0.1532, + "step": 6880 + }, + { + "epoch": 2.386627403429759, + "grad_norm": 0.13622336055549258, + "learning_rate": 6.110460629206735e-06, + "loss": 0.1494, + "step": 6890 + }, + { + "epoch": 2.390091806686298, + "grad_norm": 0.13141145530513035, + "learning_rate": 6.044604680730856e-06, + "loss": 0.1478, + "step": 6900 + }, + { + "epoch": 2.3935562099428376, + "grad_norm": 0.12753936723220544, + "learning_rate": 5.979056704231759e-06, + "loss": 0.1508, + "step": 6910 + }, + { + "epoch": 2.3970206131993765, + "grad_norm": 0.1282678691982685, + "learning_rate": 5.9138177646800934e-06, + "loss": 0.1494, + "step": 6920 + }, + { + "epoch": 2.4004850164559155, + "grad_norm": 0.12550541746949806, + "learning_rate": 5.848888922025553e-06, + "loss": 0.1474, + "step": 6930 + }, + { + "epoch": 2.4039494197124545, + "grad_norm": 0.13234115800985993, + "learning_rate": 5.7842712311796025e-06, + "loss": 0.1463, + "step": 6940 + }, + { + "epoch": 2.4074138229689934, + "grad_norm": 0.12467209107241967, + "learning_rate": 5.719965741998368e-06, + "loss": 0.1491, + "step": 6950 + }, + { + "epoch": 2.410878226225533, + "grad_norm": 0.13573330603028008, + "learning_rate": 5.655973499265582e-06, + "loss": 0.1513, + "step": 6960 + }, + { + "epoch": 2.414342629482072, + "grad_norm": 0.13240853708299855, + "learning_rate": 5.59229554267561e-06, + "loss": 0.1516, + "step": 6970 + }, + { + "epoch": 2.4178070327386108, + "grad_norm": 0.1329534082302391, + "learning_rate": 5.528932906816522e-06, + "loss": 0.1517, + "step": 6980 + }, + { + "epoch": 2.4212714359951497, + "grad_norm": 0.12911430987167502, + "learning_rate": 5.465886621153346e-06, + "loss": 0.1457, + "step": 6990 + }, + { + "epoch": 2.4247358392516887, + "grad_norm": 0.1334215432640577, + "learning_rate": 5.403157710011267e-06, + "loss": 0.1534, + "step": 7000 + }, + { + "epoch": 2.428200242508228, + "grad_norm": 0.12233095847822573, + "learning_rate": 5.340747192559064e-06, + "loss": 0.1443, + "step": 7010 + }, + { + "epoch": 2.431664645764767, + "grad_norm": 0.12214722447020973, + "learning_rate": 5.278656082792488e-06, + "loss": 0.1506, + "step": 7020 + }, + { + "epoch": 2.435129049021306, + "grad_norm": 0.1312038495892653, + "learning_rate": 5.216885389517808e-06, + "loss": 0.1494, + "step": 7030 + }, + { + "epoch": 2.438593452277845, + "grad_norm": 0.12877996640447992, + "learning_rate": 5.155436116335455e-06, + "loss": 0.1498, + "step": 7040 + }, + { + "epoch": 2.4420578555343844, + "grad_norm": 0.13785796047700058, + "learning_rate": 5.094309261623642e-06, + "loss": 0.1493, + "step": 7050 + }, + { + "epoch": 2.4455222587909233, + "grad_norm": 0.12689388560349374, + "learning_rate": 5.0335058185222245e-06, + "loss": 0.148, + "step": 7060 + }, + { + "epoch": 2.4489866620474623, + "grad_norm": 0.13349070966216434, + "learning_rate": 4.973026774916504e-06, + "loss": 0.1491, + "step": 7070 + }, + { + "epoch": 2.4524510653040013, + "grad_norm": 0.13124780154477322, + "learning_rate": 4.912873113421215e-06, + "loss": 0.1472, + "step": 7080 + }, + { + "epoch": 2.4559154685605407, + "grad_norm": 0.12289322814948568, + "learning_rate": 4.853045811364532e-06, + "loss": 0.151, + "step": 7090 + }, + { + "epoch": 2.4593798718170796, + "grad_norm": 0.1231683131497469, + "learning_rate": 4.793545840772221e-06, + "loss": 0.1477, + "step": 7100 + }, + { + "epoch": 2.4628442750736186, + "grad_norm": 0.12204021116853747, + "learning_rate": 4.734374168351807e-06, + "loss": 0.1464, + "step": 7110 + }, + { + "epoch": 2.4663086783301575, + "grad_norm": 0.12705870894938584, + "learning_rate": 4.675531755476922e-06, + "loss": 0.1487, + "step": 7120 + }, + { + "epoch": 2.4697730815866965, + "grad_norm": 0.13311385467368475, + "learning_rate": 4.617019558171623e-06, + "loss": 0.1518, + "step": 7130 + }, + { + "epoch": 2.473237484843236, + "grad_norm": 0.12633200495803268, + "learning_rate": 4.558838527094916e-06, + "loss": 0.1487, + "step": 7140 + }, + { + "epoch": 2.476701888099775, + "grad_norm": 0.12821618157907513, + "learning_rate": 4.500989607525271e-06, + "loss": 0.1513, + "step": 7150 + }, + { + "epoch": 2.480166291356314, + "grad_norm": 0.12368299726159533, + "learning_rate": 4.443473739345275e-06, + "loss": 0.1486, + "step": 7160 + }, + { + "epoch": 2.483630694612853, + "grad_norm": 0.13459364837403084, + "learning_rate": 4.386291857026381e-06, + "loss": 0.1507, + "step": 7170 + }, + { + "epoch": 2.4870950978693918, + "grad_norm": 0.12921146609560327, + "learning_rate": 4.329444889613687e-06, + "loss": 0.1506, + "step": 7180 + }, + { + "epoch": 2.490559501125931, + "grad_norm": 0.13011095251066665, + "learning_rate": 4.272933760710893e-06, + "loss": 0.1472, + "step": 7190 + }, + { + "epoch": 2.49402390438247, + "grad_norm": 0.13394025522445424, + "learning_rate": 4.2167593884652325e-06, + "loss": 0.1482, + "step": 7200 + }, + { + "epoch": 2.497488307639009, + "grad_norm": 0.12341718153206974, + "learning_rate": 4.160922685552612e-06, + "loss": 0.1492, + "step": 7210 + }, + { + "epoch": 2.5009527108955485, + "grad_norm": 0.12332806421502722, + "learning_rate": 4.105424559162754e-06, + "loss": 0.151, + "step": 7220 + }, + { + "epoch": 2.5044171141520875, + "grad_norm": 0.12451895183900567, + "learning_rate": 4.05026591098446e-06, + "loss": 0.1477, + "step": 7230 + }, + { + "epoch": 2.5078815174086264, + "grad_norm": 0.12237447886665374, + "learning_rate": 3.995447637190955e-06, + "loss": 0.1434, + "step": 7240 + }, + { + "epoch": 2.5113459206651654, + "grad_norm": 0.12468896676434474, + "learning_rate": 3.940970628425353e-06, + "loss": 0.1509, + "step": 7250 + }, + { + "epoch": 2.5148103239217043, + "grad_norm": 0.12229284415257566, + "learning_rate": 3.886835769786154e-06, + "loss": 0.1508, + "step": 7260 + }, + { + "epoch": 2.5182747271782437, + "grad_norm": 0.12284739761632071, + "learning_rate": 3.833043940812889e-06, + "loss": 0.1469, + "step": 7270 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.12171858269842417, + "learning_rate": 3.7795960154718175e-06, + "loss": 0.148, + "step": 7280 + }, + { + "epoch": 2.5252035336913217, + "grad_norm": 0.1279096545024547, + "learning_rate": 3.726492862141717e-06, + "loss": 0.1492, + "step": 7290 + }, + { + "epoch": 2.5286679369478606, + "grad_norm": 0.1261825689932827, + "learning_rate": 3.67373534359981e-06, + "loss": 0.1485, + "step": 7300 + }, + { + "epoch": 2.5321323402043996, + "grad_norm": 0.12731710139244515, + "learning_rate": 3.621324317007704e-06, + "loss": 0.1485, + "step": 7310 + }, + { + "epoch": 2.535596743460939, + "grad_norm": 0.1224027049400114, + "learning_rate": 3.569260633897495e-06, + "loss": 0.1476, + "step": 7320 + }, + { + "epoch": 2.539061146717478, + "grad_norm": 0.12095976657546359, + "learning_rate": 3.517545140157927e-06, + "loss": 0.1466, + "step": 7330 + }, + { + "epoch": 2.542525549974017, + "grad_norm": 0.12165268564942827, + "learning_rate": 3.466178676020626e-06, + "loss": 0.1463, + "step": 7340 + }, + { + "epoch": 2.545989953230556, + "grad_norm": 0.12226265951849115, + "learning_rate": 3.415162076046488e-06, + "loss": 0.1485, + "step": 7350 + }, + { + "epoch": 2.549454356487095, + "grad_norm": 0.12637110596902743, + "learning_rate": 3.364496169112083e-06, + "loss": 0.1487, + "step": 7360 + }, + { + "epoch": 2.5529187597436342, + "grad_norm": 0.12810226008624373, + "learning_rate": 3.3141817783962e-06, + "loss": 0.146, + "step": 7370 + }, + { + "epoch": 2.556383163000173, + "grad_norm": 0.1252993130112932, + "learning_rate": 3.264219721366496e-06, + "loss": 0.1465, + "step": 7380 + }, + { + "epoch": 2.559847566256712, + "grad_norm": 0.1217860788587577, + "learning_rate": 3.2146108097661746e-06, + "loss": 0.1465, + "step": 7390 + }, + { + "epoch": 2.5633119695132516, + "grad_norm": 0.1306358146817434, + "learning_rate": 3.165355849600829e-06, + "loss": 0.1488, + "step": 7400 + }, + { + "epoch": 2.5667763727697905, + "grad_norm": 0.1243046537823559, + "learning_rate": 3.116455641125332e-06, + "loss": 0.1457, + "step": 7410 + }, + { + "epoch": 2.5702407760263295, + "grad_norm": 0.13155702663844496, + "learning_rate": 3.0679109788308293e-06, + "loss": 0.1507, + "step": 7420 + }, + { + "epoch": 2.5737051792828685, + "grad_norm": 0.12565136055486759, + "learning_rate": 3.0197226514318527e-06, + "loss": 0.1447, + "step": 7430 + }, + { + "epoch": 2.5771695825394074, + "grad_norm": 0.12227279079368177, + "learning_rate": 2.9718914418534747e-06, + "loss": 0.1481, + "step": 7440 + }, + { + "epoch": 2.580633985795947, + "grad_norm": 0.12446579351502253, + "learning_rate": 2.9244181272186257e-06, + "loss": 0.1508, + "step": 7450 + }, + { + "epoch": 2.584098389052486, + "grad_norm": 0.12176081031880796, + "learning_rate": 2.8773034788354384e-06, + "loss": 0.1516, + "step": 7460 + }, + { + "epoch": 2.5875627923090248, + "grad_norm": 0.12394944777774412, + "learning_rate": 2.8305482621847152e-06, + "loss": 0.146, + "step": 7470 + }, + { + "epoch": 2.5910271955655637, + "grad_norm": 0.12488028679932459, + "learning_rate": 2.784153236907522e-06, + "loss": 0.1473, + "step": 7480 + }, + { + "epoch": 2.5944915988221027, + "grad_norm": 0.12228989346664527, + "learning_rate": 2.7381191567928064e-06, + "loss": 0.1481, + "step": 7490 + }, + { + "epoch": 2.597956002078642, + "grad_norm": 0.12775499380593486, + "learning_rate": 2.6924467697651778e-06, + "loss": 0.1489, + "step": 7500 + }, + { + "epoch": 2.601420405335181, + "grad_norm": 0.123666079771719, + "learning_rate": 2.6471368178727583e-06, + "loss": 0.1457, + "step": 7510 + }, + { + "epoch": 2.60488480859172, + "grad_norm": 0.12267432282437238, + "learning_rate": 2.6021900372750956e-06, + "loss": 0.1457, + "step": 7520 + }, + { + "epoch": 2.6083492118482594, + "grad_norm": 0.12433547335629408, + "learning_rate": 2.5576071582312428e-06, + "loss": 0.1461, + "step": 7530 + }, + { + "epoch": 2.611813615104798, + "grad_norm": 0.12415914526721374, + "learning_rate": 2.51338890508786e-06, + "loss": 0.1503, + "step": 7540 + }, + { + "epoch": 2.6152780183613373, + "grad_norm": 0.1228630751843417, + "learning_rate": 2.4695359962674608e-06, + "loss": 0.1478, + "step": 7550 + }, + { + "epoch": 2.6187424216178763, + "grad_norm": 0.12239232127572298, + "learning_rate": 2.4260491442567506e-06, + "loss": 0.148, + "step": 7560 + }, + { + "epoch": 2.6222068248744153, + "grad_norm": 0.13105355595668536, + "learning_rate": 2.3829290555950264e-06, + "loss": 0.1463, + "step": 7570 + }, + { + "epoch": 2.6256712281309547, + "grad_norm": 0.12268630715908427, + "learning_rate": 2.340176430862723e-06, + "loss": 0.1458, + "step": 7580 + }, + { + "epoch": 2.6291356313874936, + "grad_norm": 0.12235644638631886, + "learning_rate": 2.2977919646700068e-06, + "loss": 0.1455, + "step": 7590 + }, + { + "epoch": 2.6326000346440326, + "grad_norm": 0.11798600614702796, + "learning_rate": 2.255776345645494e-06, + "loss": 0.142, + "step": 7600 + }, + { + "epoch": 2.6360644379005715, + "grad_norm": 0.12407906704297328, + "learning_rate": 2.2141302564250926e-06, + "loss": 0.1502, + "step": 7610 + }, + { + "epoch": 2.6395288411571105, + "grad_norm": 0.11753331905833096, + "learning_rate": 2.17285437364087e-06, + "loss": 0.1428, + "step": 7620 + }, + { + "epoch": 2.64299324441365, + "grad_norm": 0.12007274689026741, + "learning_rate": 2.131949367910077e-06, + "loss": 0.1465, + "step": 7630 + }, + { + "epoch": 2.646457647670189, + "grad_norm": 0.125472794336859, + "learning_rate": 2.0914159038242704e-06, + "loss": 0.1423, + "step": 7640 + }, + { + "epoch": 2.649922050926728, + "grad_norm": 0.12784584316147593, + "learning_rate": 2.051254639938477e-06, + "loss": 0.1476, + "step": 7650 + }, + { + "epoch": 2.653386454183267, + "grad_norm": 0.11853028227441036, + "learning_rate": 2.0114662287605335e-06, + "loss": 0.1486, + "step": 7660 + }, + { + "epoch": 2.6568508574398058, + "grad_norm": 0.1191674746816265, + "learning_rate": 1.97205131674045e-06, + "loss": 0.1492, + "step": 7670 + }, + { + "epoch": 2.660315260696345, + "grad_norm": 0.1250727768460615, + "learning_rate": 1.933010544259939e-06, + "loss": 0.1454, + "step": 7680 + }, + { + "epoch": 2.663779663952884, + "grad_norm": 0.12320213371283693, + "learning_rate": 1.8943445456219815e-06, + "loss": 0.1472, + "step": 7690 + }, + { + "epoch": 2.667244067209423, + "grad_norm": 0.11891666023211327, + "learning_rate": 1.8560539490405399e-06, + "loss": 0.1492, + "step": 7700 + }, + { + "epoch": 2.6707084704659625, + "grad_norm": 0.125346930924718, + "learning_rate": 1.8181393766303595e-06, + "loss": 0.1462, + "step": 7710 + }, + { + "epoch": 2.6741728737225015, + "grad_norm": 0.1321032171582539, + "learning_rate": 1.7806014443968289e-06, + "loss": 0.1436, + "step": 7720 + }, + { + "epoch": 2.6776372769790404, + "grad_norm": 0.12117133427851705, + "learning_rate": 1.7434407622259951e-06, + "loss": 0.1466, + "step": 7730 + }, + { + "epoch": 2.6811016802355794, + "grad_norm": 0.11718318211141446, + "learning_rate": 1.7066579338746668e-06, + "loss": 0.1456, + "step": 7740 + }, + { + "epoch": 2.6845660834921183, + "grad_norm": 0.1283761135380607, + "learning_rate": 1.670253556960563e-06, + "loss": 0.1464, + "step": 7750 + }, + { + "epoch": 2.6880304867486577, + "grad_norm": 0.12456996594619513, + "learning_rate": 1.6342282229526468e-06, + "loss": 0.1482, + "step": 7760 + }, + { + "epoch": 2.6914948900051967, + "grad_norm": 0.11703659504283041, + "learning_rate": 1.5985825171614953e-06, + "loss": 0.1455, + "step": 7770 + }, + { + "epoch": 2.6949592932617357, + "grad_norm": 0.11962329640139921, + "learning_rate": 1.5633170187297846e-06, + "loss": 0.1468, + "step": 7780 + }, + { + "epoch": 2.6984236965182746, + "grad_norm": 0.12759004303067803, + "learning_rate": 1.5284323006229035e-06, + "loss": 0.1457, + "step": 7790 + }, + { + "epoch": 2.7018880997748136, + "grad_norm": 0.12164262479291138, + "learning_rate": 1.4939289296196063e-06, + "loss": 0.1451, + "step": 7800 + }, + { + "epoch": 2.705352503031353, + "grad_norm": 0.11841849076099206, + "learning_rate": 1.4598074663028483e-06, + "loss": 0.1442, + "step": 7810 + }, + { + "epoch": 2.708816906287892, + "grad_norm": 0.1279272278852782, + "learning_rate": 1.4260684650506478e-06, + "loss": 0.1466, + "step": 7820 + }, + { + "epoch": 2.712281309544431, + "grad_norm": 0.11952115983833288, + "learning_rate": 1.3927124740270885e-06, + "loss": 0.1457, + "step": 7830 + }, + { + "epoch": 2.71574571280097, + "grad_norm": 0.12162884781857089, + "learning_rate": 1.3597400351734151e-06, + "loss": 0.1427, + "step": 7840 + }, + { + "epoch": 2.719210116057509, + "grad_norm": 0.13471370194895713, + "learning_rate": 1.327151684199221e-06, + "loss": 0.1442, + "step": 7850 + }, + { + "epoch": 2.7226745193140482, + "grad_norm": 0.12907986475959193, + "learning_rate": 1.2949479505737494e-06, + "loss": 0.1455, + "step": 7860 + }, + { + "epoch": 2.726138922570587, + "grad_norm": 0.12566383969973927, + "learning_rate": 1.263129357517301e-06, + "loss": 0.1463, + "step": 7870 + }, + { + "epoch": 2.729603325827126, + "grad_norm": 0.1195570593454166, + "learning_rate": 1.2316964219927119e-06, + "loss": 0.1422, + "step": 7880 + }, + { + "epoch": 2.7330677290836656, + "grad_norm": 0.12438741593202014, + "learning_rate": 1.2006496546969642e-06, + "loss": 0.1506, + "step": 7890 + }, + { + "epoch": 2.7365321323402045, + "grad_norm": 0.13311421492284156, + "learning_rate": 1.1699895600529087e-06, + "loss": 0.1484, + "step": 7900 + }, + { + "epoch": 2.7399965355967435, + "grad_norm": 0.13353566400942274, + "learning_rate": 1.1397166362010243e-06, + "loss": 0.1472, + "step": 7910 + }, + { + "epoch": 2.7434609388532825, + "grad_norm": 0.12303792368683632, + "learning_rate": 1.109831374991377e-06, + "loss": 0.1455, + "step": 7920 + }, + { + "epoch": 2.7469253421098214, + "grad_norm": 0.1164932218834871, + "learning_rate": 1.080334261975577e-06, + "loss": 0.1456, + "step": 7930 + }, + { + "epoch": 2.750389745366361, + "grad_norm": 0.12413166278304175, + "learning_rate": 1.051225776398937e-06, + "loss": 0.1475, + "step": 7940 + }, + { + "epoch": 2.7538541486229, + "grad_norm": 0.11917463944574379, + "learning_rate": 1.0225063911926597e-06, + "loss": 0.1473, + "step": 7950 + }, + { + "epoch": 2.7573185518794388, + "grad_norm": 0.21002619780579418, + "learning_rate": 9.94176572966149e-07, + "loss": 0.1499, + "step": 7960 + }, + { + "epoch": 2.7607829551359777, + "grad_norm": 0.12190061305916731, + "learning_rate": 9.662367819994467e-07, + "loss": 0.1448, + "step": 7970 + }, + { + "epoch": 2.7642473583925167, + "grad_norm": 0.11828260636790151, + "learning_rate": 9.386874722357469e-07, + "loss": 0.1492, + "step": 7980 + }, + { + "epoch": 2.767711761649056, + "grad_norm": 0.11503639232862127, + "learning_rate": 9.115290912740132e-07, + "loss": 0.1448, + "step": 7990 + }, + { + "epoch": 2.771176164905595, + "grad_norm": 0.1189533885403222, + "learning_rate": 8.847620803617257e-07, + "loss": 0.1467, + "step": 8000 + }, + { + "epoch": 2.774640568162134, + "grad_norm": 0.12701672502597688, + "learning_rate": 8.583868743876844e-07, + "loss": 0.1462, + "step": 8010 + }, + { + "epoch": 2.7781049714186734, + "grad_norm": 0.1220861131545684, + "learning_rate": 8.324039018749674e-07, + "loss": 0.1453, + "step": 8020 + }, + { + "epoch": 2.781569374675212, + "grad_norm": 0.12031457421341588, + "learning_rate": 8.068135849739617e-07, + "loss": 0.1437, + "step": 8030 + }, + { + "epoch": 2.7850337779317513, + "grad_norm": 0.12017875343248997, + "learning_rate": 7.816163394554932e-07, + "loss": 0.1478, + "step": 8040 + }, + { + "epoch": 2.7884981811882903, + "grad_norm": 0.1192917350903116, + "learning_rate": 7.56812574704091e-07, + "loss": 0.1436, + "step": 8050 + }, + { + "epoch": 2.7919625844448293, + "grad_norm": 0.11524716943825414, + "learning_rate": 7.32402693711326e-07, + "loss": 0.1451, + "step": 8060 + }, + { + "epoch": 2.7954269877013687, + "grad_norm": 0.12406089861167721, + "learning_rate": 7.083870930692516e-07, + "loss": 0.1459, + "step": 8070 + }, + { + "epoch": 2.7988913909579076, + "grad_norm": 0.12315834818935076, + "learning_rate": 6.847661629639873e-07, + "loss": 0.1451, + "step": 8080 + }, + { + "epoch": 2.8023557942144466, + "grad_norm": 0.11732007572900043, + "learning_rate": 6.615402871693487e-07, + "loss": 0.1473, + "step": 8090 + }, + { + "epoch": 2.8058201974709855, + "grad_norm": 0.11711430732640442, + "learning_rate": 6.387098430406441e-07, + "loss": 0.1481, + "step": 8100 + }, + { + "epoch": 2.8092846007275245, + "grad_norm": 0.11662597413638771, + "learning_rate": 6.162752015085122e-07, + "loss": 0.1462, + "step": 8110 + }, + { + "epoch": 2.812749003984064, + "grad_norm": 0.11644716676700212, + "learning_rate": 5.942367270729165e-07, + "loss": 0.1454, + "step": 8120 + }, + { + "epoch": 2.816213407240603, + "grad_norm": 0.1210826937584845, + "learning_rate": 5.725947777972224e-07, + "loss": 0.1482, + "step": 8130 + }, + { + "epoch": 2.819677810497142, + "grad_norm": 0.11717664884081884, + "learning_rate": 5.513497053023647e-07, + "loss": 0.1446, + "step": 8140 + }, + { + "epoch": 2.823142213753681, + "grad_norm": 0.11782433267361038, + "learning_rate": 5.305018547611451e-07, + "loss": 0.1479, + "step": 8150 + }, + { + "epoch": 2.8266066170102198, + "grad_norm": 0.12844697710634922, + "learning_rate": 5.100515648926329e-07, + "loss": 0.1457, + "step": 8160 + }, + { + "epoch": 2.830071020266759, + "grad_norm": 0.1259382993927401, + "learning_rate": 4.899991679566423e-07, + "loss": 0.1482, + "step": 8170 + }, + { + "epoch": 2.833535423523298, + "grad_norm": 0.12112661112999973, + "learning_rate": 4.703449897483503e-07, + "loss": 0.1468, + "step": 8180 + }, + { + "epoch": 2.836999826779837, + "grad_norm": 0.11907259047343774, + "learning_rate": 4.5108934959299243e-07, + "loss": 0.1412, + "step": 8190 + }, + { + "epoch": 2.8404642300363765, + "grad_norm": 0.11046949957522073, + "learning_rate": 4.322325603406813e-07, + "loss": 0.1457, + "step": 8200 + }, + { + "epoch": 2.8439286332929155, + "grad_norm": 0.11728964170089871, + "learning_rate": 4.137749283613268e-07, + "loss": 0.148, + "step": 8210 + }, + { + "epoch": 2.8473930365494544, + "grad_norm": 0.12458638671812242, + "learning_rate": 3.9571675353964053e-07, + "loss": 0.1477, + "step": 8220 + }, + { + "epoch": 2.8508574398059934, + "grad_norm": 0.12603956939519112, + "learning_rate": 3.780583292702894e-07, + "loss": 0.1431, + "step": 8230 + }, + { + "epoch": 2.8543218430625323, + "grad_norm": 0.11737849831673626, + "learning_rate": 3.607999424531078e-07, + "loss": 0.1459, + "step": 8240 + }, + { + "epoch": 2.8577862463190717, + "grad_norm": 0.11861750859649005, + "learning_rate": 3.4394187348844866e-07, + "loss": 0.1484, + "step": 8250 + }, + { + "epoch": 2.8612506495756107, + "grad_norm": 0.12007382110446226, + "learning_rate": 3.274843962726204e-07, + "loss": 0.1461, + "step": 8260 + }, + { + "epoch": 2.8647150528321497, + "grad_norm": 0.1114021812553446, + "learning_rate": 3.114277781934433e-07, + "loss": 0.1408, + "step": 8270 + }, + { + "epoch": 2.8681794560886886, + "grad_norm": 0.12459777820858844, + "learning_rate": 2.957722801258944e-07, + "loss": 0.1469, + "step": 8280 + }, + { + "epoch": 2.8716438593452276, + "grad_norm": 0.12059709971421617, + "learning_rate": 2.805181564278864e-07, + "loss": 0.1476, + "step": 8290 + }, + { + "epoch": 2.875108262601767, + "grad_norm": 0.11316078775210853, + "learning_rate": 2.6566565493611475e-07, + "loss": 0.1447, + "step": 8300 + }, + { + "epoch": 2.878572665858306, + "grad_norm": 0.12343927697834793, + "learning_rate": 2.512150169620503e-07, + "loss": 0.1499, + "step": 8310 + }, + { + "epoch": 2.882037069114845, + "grad_norm": 0.1264438568086182, + "learning_rate": 2.371664772880061e-07, + "loss": 0.1475, + "step": 8320 + }, + { + "epoch": 2.885501472371384, + "grad_norm": 0.11834929752982455, + "learning_rate": 2.2352026416331829e-07, + "loss": 0.1484, + "step": 8330 + }, + { + "epoch": 2.888965875627923, + "grad_norm": 0.12074802080378956, + "learning_rate": 2.1027659930066e-07, + "loss": 0.1464, + "step": 8340 + }, + { + "epoch": 2.8924302788844622, + "grad_norm": 0.11880030905569529, + "learning_rate": 1.97435697872414e-07, + "loss": 0.1465, + "step": 8350 + }, + { + "epoch": 2.895894682141001, + "grad_norm": 0.1286519420227983, + "learning_rate": 1.8499776850719463e-07, + "loss": 0.1466, + "step": 8360 + }, + { + "epoch": 2.89935908539754, + "grad_norm": 0.12408525205621782, + "learning_rate": 1.7296301328644516e-07, + "loss": 0.1467, + "step": 8370 + }, + { + "epoch": 2.9028234886540796, + "grad_norm": 0.11413939977206788, + "learning_rate": 1.613316277411625e-07, + "loss": 0.1432, + "step": 8380 + }, + { + "epoch": 2.9062878919106185, + "grad_norm": 0.12094926364512971, + "learning_rate": 1.5010380084871933e-07, + "loss": 0.1482, + "step": 8390 + }, + { + "epoch": 2.9097522951671575, + "grad_norm": 0.1173931691129058, + "learning_rate": 1.392797150297942e-07, + "loss": 0.1476, + "step": 8400 + }, + { + "epoch": 2.9132166984236965, + "grad_norm": 0.11719997871862918, + "learning_rate": 1.2885954614540175e-07, + "loss": 0.1463, + "step": 8410 + }, + { + "epoch": 2.9166811016802354, + "grad_norm": 0.11982420345633238, + "learning_rate": 1.1884346349404774e-07, + "loss": 0.1508, + "step": 8420 + }, + { + "epoch": 2.920145504936775, + "grad_norm": 0.12475530168973786, + "learning_rate": 1.0923162980896185e-07, + "loss": 0.15, + "step": 8430 + }, + { + "epoch": 2.923609908193314, + "grad_norm": 0.11892325463016702, + "learning_rate": 1.000242012554664e-07, + "loss": 0.1476, + "step": 8440 + }, + { + "epoch": 2.9270743114498528, + "grad_norm": 0.1192790735851098, + "learning_rate": 9.122132742843681e-08, + "loss": 0.1454, + "step": 8450 + }, + { + "epoch": 2.9305387147063917, + "grad_norm": 0.1170936103181331, + "learning_rate": 8.28231513498673e-08, + "loss": 0.1482, + "step": 8460 + }, + { + "epoch": 2.9340031179629307, + "grad_norm": 0.11575536846669715, + "learning_rate": 7.48298094665506e-08, + "loss": 0.1482, + "step": 8470 + }, + { + "epoch": 2.93746752121947, + "grad_norm": 0.10817448466285368, + "learning_rate": 6.724143164785757e-08, + "loss": 0.1454, + "step": 8480 + }, + { + "epoch": 2.940931924476009, + "grad_norm": 0.11464554359428555, + "learning_rate": 6.005814118363317e-08, + "loss": 0.1438, + "step": 8490 + }, + { + "epoch": 2.944396327732548, + "grad_norm": 0.12385487233495297, + "learning_rate": 5.328005478218989e-08, + "loss": 0.1463, + "step": 8500 + }, + { + "epoch": 2.947860730989087, + "grad_norm": 0.11878438059832382, + "learning_rate": 4.69072825684036e-08, + "loss": 0.1464, + "step": 8510 + }, + { + "epoch": 2.951325134245626, + "grad_norm": 0.11382553656848247, + "learning_rate": 4.093992808194558e-08, + "loss": 0.1427, + "step": 8520 + }, + { + "epoch": 2.9547895375021653, + "grad_norm": 0.1221402366685275, + "learning_rate": 3.537808827557276e-08, + "loss": 0.1469, + "step": 8530 + }, + { + "epoch": 2.9582539407587043, + "grad_norm": 0.11836429164824389, + "learning_rate": 3.0221853513576207e-08, + "loss": 0.1471, + "step": 8540 + }, + { + "epoch": 2.9617183440152433, + "grad_norm": 0.11592227533291538, + "learning_rate": 2.5471307570298918e-08, + "loss": 0.1436, + "step": 8550 + }, + { + "epoch": 2.9651827472717827, + "grad_norm": 0.12005026306688542, + "learning_rate": 2.112652762878142e-08, + "loss": 0.1466, + "step": 8560 + }, + { + "epoch": 2.9686471505283216, + "grad_norm": 0.12062875615569613, + "learning_rate": 1.71875842795044e-08, + "loss": 0.1457, + "step": 8570 + }, + { + "epoch": 2.9721115537848606, + "grad_norm": 0.12033280556169802, + "learning_rate": 1.3654541519242392e-08, + "loss": 0.1437, + "step": 8580 + }, + { + "epoch": 2.9755759570413995, + "grad_norm": 0.12337008384975914, + "learning_rate": 1.0527456750025755e-08, + "loss": 0.1468, + "step": 8590 + }, + { + "epoch": 2.9790403602979385, + "grad_norm": 0.12167133579723895, + "learning_rate": 7.80638077820528e-09, + "loss": 0.1519, + "step": 8600 + }, + { + "epoch": 2.982504763554478, + "grad_norm": 0.11751314218922883, + "learning_rate": 5.491357813627862e-09, + "loss": 0.1494, + "step": 8610 + }, + { + "epoch": 2.985969166811017, + "grad_norm": 0.12293930219223904, + "learning_rate": 3.582425468920403e-09, + "loss": 0.1483, + "step": 8620 + }, + { + "epoch": 2.989433570067556, + "grad_norm": 0.12371328514211692, + "learning_rate": 2.0796147588791894e-09, + "loss": 0.1446, + "step": 8630 + }, + { + "epoch": 2.992897973324095, + "grad_norm": 0.1283635993974664, + "learning_rate": 9.829500999564144e-10, + "loss": 0.148, + "step": 8640 + }, + { + "epoch": 2.9963623765806338, + "grad_norm": 0.11829545701564251, + "learning_rate": 2.924493098743764e-10, + "loss": 0.1419, + "step": 8650 + }, + { + "epoch": 2.999826779837173, + "grad_norm": 0.1191095821300158, + "learning_rate": 8.123607339594053e-12, + "loss": 0.1497, + "step": 8660 + }, + { + "epoch": 3.0, + "step": 8661, + "total_flos": 1.6970304064115966e+19, + "train_loss": 0.20868314632318158, + "train_runtime": 603947.8558, + "train_samples_per_second": 0.459, + "train_steps_per_second": 0.014 + } + ], + "logging_steps": 10, + "max_steps": 8661, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6970304064115966e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}