diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13846 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9996578337071815, + "eval_steps": 500, + "global_step": 19725, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015207390791924875, + "grad_norm": 7.920117204038963, + "learning_rate": 2.534211860111505e-07, + "loss": 0.4324, + "step": 10 + }, + { + "epoch": 0.003041478158384975, + "grad_norm": 3.2371878377445746, + "learning_rate": 5.06842372022301e-07, + "loss": 0.4101, + "step": 20 + }, + { + "epoch": 0.004562217237577462, + "grad_norm": 4.287064463182669, + "learning_rate": 7.602635580334516e-07, + "loss": 0.3245, + "step": 30 + }, + { + "epoch": 0.00608295631676995, + "grad_norm": 3.4432643629680473, + "learning_rate": 1.013684744044602e-06, + "loss": 0.3225, + "step": 40 + }, + { + "epoch": 0.007603695395962438, + "grad_norm": 3.434644181212606, + "learning_rate": 1.2671059300557528e-06, + "loss": 0.3009, + "step": 50 + }, + { + "epoch": 0.009124434475154925, + "grad_norm": 2.8694168864714866, + "learning_rate": 1.520527116066903e-06, + "loss": 0.3489, + "step": 60 + }, + { + "epoch": 0.010645173554347413, + "grad_norm": 3.678337642146407, + "learning_rate": 1.7739483020780538e-06, + "loss": 0.2779, + "step": 70 + }, + { + "epoch": 0.0121659126335399, + "grad_norm": 1.9666916605256644, + "learning_rate": 2.027369488089204e-06, + "loss": 0.3357, + "step": 80 + }, + { + "epoch": 0.013686651712732388, + "grad_norm": 3.888847421977736, + "learning_rate": 2.280790674100355e-06, + "loss": 0.3296, + "step": 90 + }, + { + "epoch": 0.015207390791924876, + "grad_norm": 3.424017018832768, + "learning_rate": 2.5342118601115056e-06, + "loss": 0.3148, + "step": 100 + }, + { + "epoch": 0.016728129871117364, + "grad_norm": 3.8454291097588618, + "learning_rate": 2.787633046122656e-06, + "loss": 0.347, + "step": 110 + }, + { + "epoch": 0.01824886895030985, + "grad_norm": 2.1823036100543254, + "learning_rate": 3.041054232133806e-06, + "loss": 0.2602, + "step": 120 + }, + { + "epoch": 0.01976960802950234, + "grad_norm": 3.507590329557954, + "learning_rate": 3.2944754181449574e-06, + "loss": 0.2611, + "step": 130 + }, + { + "epoch": 0.021290347108694825, + "grad_norm": 2.409802367134588, + "learning_rate": 3.5478966041561077e-06, + "loss": 0.2734, + "step": 140 + }, + { + "epoch": 0.022811086187887315, + "grad_norm": 2.912034510716539, + "learning_rate": 3.801317790167258e-06, + "loss": 0.2606, + "step": 150 + }, + { + "epoch": 0.0243318252670798, + "grad_norm": 3.168615304492601, + "learning_rate": 4.054738976178408e-06, + "loss": 0.2932, + "step": 160 + }, + { + "epoch": 0.025852564346272287, + "grad_norm": 1.8963015357670776, + "learning_rate": 4.308160162189559e-06, + "loss": 0.2355, + "step": 170 + }, + { + "epoch": 0.027373303425464776, + "grad_norm": 2.4755983063673415, + "learning_rate": 4.56158134820071e-06, + "loss": 0.3456, + "step": 180 + }, + { + "epoch": 0.028894042504657262, + "grad_norm": 1.6314601451103148, + "learning_rate": 4.8150025342118605e-06, + "loss": 0.2596, + "step": 190 + }, + { + "epoch": 0.03041478158384975, + "grad_norm": 2.4119688907773478, + "learning_rate": 5.068423720223011e-06, + "loss": 0.2917, + "step": 200 + }, + { + "epoch": 0.03193552066304224, + "grad_norm": 2.8174073356075944, + "learning_rate": 5.321844906234161e-06, + "loss": 0.1707, + "step": 210 + }, + { + "epoch": 0.03345625974223473, + "grad_norm": 2.3365114447087265, + "learning_rate": 5.575266092245312e-06, + "loss": 0.2638, + "step": 220 + }, + { + "epoch": 0.03497699882142721, + "grad_norm": 1.8538131901006343, + "learning_rate": 5.8286872782564625e-06, + "loss": 0.2592, + "step": 230 + }, + { + "epoch": 0.0364977379006197, + "grad_norm": 2.3286015139012344, + "learning_rate": 6.082108464267612e-06, + "loss": 0.2251, + "step": 240 + }, + { + "epoch": 0.03801847697981219, + "grad_norm": 2.0703385448813387, + "learning_rate": 6.335529650278764e-06, + "loss": 0.2212, + "step": 250 + }, + { + "epoch": 0.03953921605900468, + "grad_norm": 2.3452792522041572, + "learning_rate": 6.588950836289915e-06, + "loss": 0.1738, + "step": 260 + }, + { + "epoch": 0.041059955138197164, + "grad_norm": 1.9706026111359962, + "learning_rate": 6.842372022301065e-06, + "loss": 0.2121, + "step": 270 + }, + { + "epoch": 0.04258069421738965, + "grad_norm": 3.4374737998995952, + "learning_rate": 7.095793208312215e-06, + "loss": 0.2547, + "step": 280 + }, + { + "epoch": 0.044101433296582136, + "grad_norm": 1.6907681676775899, + "learning_rate": 7.349214394323366e-06, + "loss": 0.2256, + "step": 290 + }, + { + "epoch": 0.04562217237577463, + "grad_norm": 1.7193038175885995, + "learning_rate": 7.602635580334516e-06, + "loss": 0.2818, + "step": 300 + }, + { + "epoch": 0.047142911454967115, + "grad_norm": 2.032149644063285, + "learning_rate": 7.856056766345667e-06, + "loss": 0.2373, + "step": 310 + }, + { + "epoch": 0.0486636505341596, + "grad_norm": 2.382840800605271, + "learning_rate": 8.109477952356817e-06, + "loss": 0.2276, + "step": 320 + }, + { + "epoch": 0.05018438961335209, + "grad_norm": 2.518309085848073, + "learning_rate": 8.362899138367968e-06, + "loss": 0.1993, + "step": 330 + }, + { + "epoch": 0.05170512869254457, + "grad_norm": 3.2293405091493774, + "learning_rate": 8.616320324379118e-06, + "loss": 0.2344, + "step": 340 + }, + { + "epoch": 0.053225867771737066, + "grad_norm": 2.1856599331380306, + "learning_rate": 8.869741510390268e-06, + "loss": 0.2518, + "step": 350 + }, + { + "epoch": 0.05474660685092955, + "grad_norm": 2.5745150998899264, + "learning_rate": 9.12316269640142e-06, + "loss": 0.2264, + "step": 360 + }, + { + "epoch": 0.05626734593012204, + "grad_norm": 2.3059962966937544, + "learning_rate": 9.376583882412571e-06, + "loss": 0.2915, + "step": 370 + }, + { + "epoch": 0.057788085009314524, + "grad_norm": 2.8307230402198966, + "learning_rate": 9.630005068423721e-06, + "loss": 0.2917, + "step": 380 + }, + { + "epoch": 0.05930882408850702, + "grad_norm": 1.627973369140701, + "learning_rate": 9.88342625443487e-06, + "loss": 0.2234, + "step": 390 + }, + { + "epoch": 0.0608295631676995, + "grad_norm": 2.2765038272901066, + "learning_rate": 1.0136847440446022e-05, + "loss": 0.2102, + "step": 400 + }, + { + "epoch": 0.06235030224689199, + "grad_norm": 2.130331047651815, + "learning_rate": 1.0390268626457172e-05, + "loss": 0.2204, + "step": 410 + }, + { + "epoch": 0.06387104132608448, + "grad_norm": 2.0554775112936596, + "learning_rate": 1.0643689812468322e-05, + "loss": 0.2054, + "step": 420 + }, + { + "epoch": 0.06539178040527696, + "grad_norm": 1.3602114204429487, + "learning_rate": 1.0897110998479474e-05, + "loss": 0.1953, + "step": 430 + }, + { + "epoch": 0.06691251948446945, + "grad_norm": 3.3227202901316004, + "learning_rate": 1.1150532184490624e-05, + "loss": 0.1974, + "step": 440 + }, + { + "epoch": 0.06843325856366193, + "grad_norm": 2.233814011092518, + "learning_rate": 1.1403953370501773e-05, + "loss": 0.2004, + "step": 450 + }, + { + "epoch": 0.06995399764285443, + "grad_norm": 2.794021420039594, + "learning_rate": 1.1657374556512925e-05, + "loss": 0.1851, + "step": 460 + }, + { + "epoch": 0.07147473672204692, + "grad_norm": 1.7792982852138908, + "learning_rate": 1.1910795742524075e-05, + "loss": 0.2027, + "step": 470 + }, + { + "epoch": 0.0729954758012394, + "grad_norm": 1.6112141271455942, + "learning_rate": 1.2164216928535225e-05, + "loss": 0.2307, + "step": 480 + }, + { + "epoch": 0.07451621488043189, + "grad_norm": 1.7987356868356992, + "learning_rate": 1.2417638114546376e-05, + "loss": 0.1754, + "step": 490 + }, + { + "epoch": 0.07603695395962438, + "grad_norm": 1.7944296854021997, + "learning_rate": 1.2671059300557528e-05, + "loss": 0.2184, + "step": 500 + }, + { + "epoch": 0.07755769303881686, + "grad_norm": 2.9372263399292247, + "learning_rate": 1.2924480486568676e-05, + "loss": 0.2473, + "step": 510 + }, + { + "epoch": 0.07907843211800936, + "grad_norm": 2.504506544051458, + "learning_rate": 1.317790167257983e-05, + "loss": 0.2002, + "step": 520 + }, + { + "epoch": 0.08059917119720184, + "grad_norm": 2.51380250456987, + "learning_rate": 1.3431322858590978e-05, + "loss": 0.2329, + "step": 530 + }, + { + "epoch": 0.08211991027639433, + "grad_norm": 2.9915935834168845, + "learning_rate": 1.368474404460213e-05, + "loss": 0.288, + "step": 540 + }, + { + "epoch": 0.08364064935558682, + "grad_norm": 2.5971490080381625, + "learning_rate": 1.3938165230613279e-05, + "loss": 0.2177, + "step": 550 + }, + { + "epoch": 0.0851613884347793, + "grad_norm": 2.119019730725757, + "learning_rate": 1.419158641662443e-05, + "loss": 0.1767, + "step": 560 + }, + { + "epoch": 0.0866821275139718, + "grad_norm": 3.322214667726387, + "learning_rate": 1.4445007602635582e-05, + "loss": 0.2175, + "step": 570 + }, + { + "epoch": 0.08820286659316427, + "grad_norm": 1.7893539806696637, + "learning_rate": 1.4698428788646732e-05, + "loss": 0.2089, + "step": 580 + }, + { + "epoch": 0.08972360567235677, + "grad_norm": 1.9120300159336259, + "learning_rate": 1.4951849974657884e-05, + "loss": 0.2739, + "step": 590 + }, + { + "epoch": 0.09124434475154926, + "grad_norm": 2.2300972862834927, + "learning_rate": 1.5205271160669032e-05, + "loss": 0.2304, + "step": 600 + }, + { + "epoch": 0.09276508383074174, + "grad_norm": 2.066071390622535, + "learning_rate": 1.5458692346680183e-05, + "loss": 0.2305, + "step": 610 + }, + { + "epoch": 0.09428582290993423, + "grad_norm": 1.4318165390362705, + "learning_rate": 1.5712113532691333e-05, + "loss": 0.2379, + "step": 620 + }, + { + "epoch": 0.09580656198912671, + "grad_norm": 2.2697820262564026, + "learning_rate": 1.5965534718702487e-05, + "loss": 0.2339, + "step": 630 + }, + { + "epoch": 0.0973273010683192, + "grad_norm": 2.8166379711528844, + "learning_rate": 1.6218955904713633e-05, + "loss": 0.2132, + "step": 640 + }, + { + "epoch": 0.0988480401475117, + "grad_norm": 2.810524853699334, + "learning_rate": 1.6472377090724786e-05, + "loss": 0.2472, + "step": 650 + }, + { + "epoch": 0.10036877922670417, + "grad_norm": 2.066128107039832, + "learning_rate": 1.6725798276735936e-05, + "loss": 0.25, + "step": 660 + }, + { + "epoch": 0.10188951830589667, + "grad_norm": 2.435606189612046, + "learning_rate": 1.6979219462747086e-05, + "loss": 0.239, + "step": 670 + }, + { + "epoch": 0.10341025738508915, + "grad_norm": 1.9096992866300264, + "learning_rate": 1.7232640648758236e-05, + "loss": 0.2084, + "step": 680 + }, + { + "epoch": 0.10493099646428164, + "grad_norm": 2.2969777071890167, + "learning_rate": 1.748606183476939e-05, + "loss": 0.1933, + "step": 690 + }, + { + "epoch": 0.10645173554347413, + "grad_norm": 1.9122345339858153, + "learning_rate": 1.7739483020780536e-05, + "loss": 0.2385, + "step": 700 + }, + { + "epoch": 0.10797247462266661, + "grad_norm": 2.2128587436807003, + "learning_rate": 1.799290420679169e-05, + "loss": 0.2015, + "step": 710 + }, + { + "epoch": 0.1094932137018591, + "grad_norm": 2.0805746996608985, + "learning_rate": 1.824632539280284e-05, + "loss": 0.2317, + "step": 720 + }, + { + "epoch": 0.1110139527810516, + "grad_norm": 2.538481601513248, + "learning_rate": 1.849974657881399e-05, + "loss": 0.2475, + "step": 730 + }, + { + "epoch": 0.11253469186024408, + "grad_norm": 3.9441785949556167, + "learning_rate": 1.8753167764825142e-05, + "loss": 0.1774, + "step": 740 + }, + { + "epoch": 0.11405543093943657, + "grad_norm": 1.707545824552262, + "learning_rate": 1.9006588950836292e-05, + "loss": 0.2101, + "step": 750 + }, + { + "epoch": 0.11557617001862905, + "grad_norm": 1.972874300745295, + "learning_rate": 1.9260010136847442e-05, + "loss": 0.2461, + "step": 760 + }, + { + "epoch": 0.11709690909782154, + "grad_norm": 2.3804875175385343, + "learning_rate": 1.9513431322858592e-05, + "loss": 0.2488, + "step": 770 + }, + { + "epoch": 0.11861764817701403, + "grad_norm": 2.2229738232333287, + "learning_rate": 1.976685250886974e-05, + "loss": 0.2118, + "step": 780 + }, + { + "epoch": 0.12013838725620651, + "grad_norm": 2.594415792608808, + "learning_rate": 2.002027369488089e-05, + "loss": 0.2129, + "step": 790 + }, + { + "epoch": 0.121659126335399, + "grad_norm": 2.402432562805599, + "learning_rate": 2.0273694880892045e-05, + "loss": 0.2387, + "step": 800 + }, + { + "epoch": 0.12317986541459149, + "grad_norm": 2.726165952951196, + "learning_rate": 2.0527116066903195e-05, + "loss": 0.2548, + "step": 810 + }, + { + "epoch": 0.12470060449378398, + "grad_norm": 2.5069103341469456, + "learning_rate": 2.0780537252914344e-05, + "loss": 0.2164, + "step": 820 + }, + { + "epoch": 0.12622134357297646, + "grad_norm": 2.8860878102804612, + "learning_rate": 2.1033958438925494e-05, + "loss": 0.2505, + "step": 830 + }, + { + "epoch": 0.12774208265216896, + "grad_norm": 2.510420405028582, + "learning_rate": 2.1287379624936644e-05, + "loss": 0.228, + "step": 840 + }, + { + "epoch": 0.12926282173136144, + "grad_norm": 2.932281200718965, + "learning_rate": 2.1540800810947794e-05, + "loss": 0.2276, + "step": 850 + }, + { + "epoch": 0.13078356081055392, + "grad_norm": 2.467362998251219, + "learning_rate": 2.1794221996958947e-05, + "loss": 0.251, + "step": 860 + }, + { + "epoch": 0.13230429988974643, + "grad_norm": 2.8197481587473026, + "learning_rate": 2.2047643182970097e-05, + "loss": 0.2626, + "step": 870 + }, + { + "epoch": 0.1338250389689389, + "grad_norm": 2.9743202941425637, + "learning_rate": 2.2301064368981247e-05, + "loss": 0.2132, + "step": 880 + }, + { + "epoch": 0.1353457780481314, + "grad_norm": 2.8885353371177698, + "learning_rate": 2.25544855549924e-05, + "loss": 0.2643, + "step": 890 + }, + { + "epoch": 0.13686651712732387, + "grad_norm": 2.2370714458535788, + "learning_rate": 2.2807906741003547e-05, + "loss": 0.2191, + "step": 900 + }, + { + "epoch": 0.13838725620651637, + "grad_norm": 2.9120916782256567, + "learning_rate": 2.30613279270147e-05, + "loss": 0.2313, + "step": 910 + }, + { + "epoch": 0.13990799528570885, + "grad_norm": 2.3730413955397354, + "learning_rate": 2.331474911302585e-05, + "loss": 0.2916, + "step": 920 + }, + { + "epoch": 0.14142873436490133, + "grad_norm": 3.1022942600752126, + "learning_rate": 2.3568170299037e-05, + "loss": 0.2553, + "step": 930 + }, + { + "epoch": 0.14294947344409384, + "grad_norm": 3.2486745221168754, + "learning_rate": 2.382159148504815e-05, + "loss": 0.2208, + "step": 940 + }, + { + "epoch": 0.14447021252328632, + "grad_norm": 3.5614367451449858, + "learning_rate": 2.4075012671059303e-05, + "loss": 0.2667, + "step": 950 + }, + { + "epoch": 0.1459909516024788, + "grad_norm": 2.6683500953166086, + "learning_rate": 2.432843385707045e-05, + "loss": 0.2721, + "step": 960 + }, + { + "epoch": 0.1475116906816713, + "grad_norm": 3.25444067117784, + "learning_rate": 2.4581855043081603e-05, + "loss": 0.3067, + "step": 970 + }, + { + "epoch": 0.14903242976086378, + "grad_norm": 2.2374708843712887, + "learning_rate": 2.4835276229092753e-05, + "loss": 0.2919, + "step": 980 + }, + { + "epoch": 0.15055316884005626, + "grad_norm": 2.869841015853382, + "learning_rate": 2.5088697415103903e-05, + "loss": 0.2147, + "step": 990 + }, + { + "epoch": 0.15207390791924877, + "grad_norm": 2.1321670302129796, + "learning_rate": 2.5342118601115056e-05, + "loss": 0.2381, + "step": 1000 + }, + { + "epoch": 0.15359464699844125, + "grad_norm": 2.3325200197335714, + "learning_rate": 2.5595539787126206e-05, + "loss": 0.2512, + "step": 1010 + }, + { + "epoch": 0.15511538607763373, + "grad_norm": 3.5327084783930687, + "learning_rate": 2.5848960973137352e-05, + "loss": 0.2388, + "step": 1020 + }, + { + "epoch": 0.1566361251568262, + "grad_norm": 2.6237782244989147, + "learning_rate": 2.610238215914851e-05, + "loss": 0.2598, + "step": 1030 + }, + { + "epoch": 0.1581568642360187, + "grad_norm": 3.398535735508933, + "learning_rate": 2.635580334515966e-05, + "loss": 0.2438, + "step": 1040 + }, + { + "epoch": 0.1596776033152112, + "grad_norm": 3.0572180685811543, + "learning_rate": 2.6609224531170805e-05, + "loss": 0.2501, + "step": 1050 + }, + { + "epoch": 0.16119834239440367, + "grad_norm": 1.9644577673671377, + "learning_rate": 2.6862645717181955e-05, + "loss": 0.2531, + "step": 1060 + }, + { + "epoch": 0.16271908147359618, + "grad_norm": 3.2897743179797607, + "learning_rate": 2.7116066903193112e-05, + "loss": 0.2968, + "step": 1070 + }, + { + "epoch": 0.16423982055278866, + "grad_norm": 2.5955725619239263, + "learning_rate": 2.736948808920426e-05, + "loss": 0.2514, + "step": 1080 + }, + { + "epoch": 0.16576055963198114, + "grad_norm": 3.280716307751303, + "learning_rate": 2.7622909275215408e-05, + "loss": 0.2789, + "step": 1090 + }, + { + "epoch": 0.16728129871117364, + "grad_norm": 2.493367397696881, + "learning_rate": 2.7876330461226558e-05, + "loss": 0.2615, + "step": 1100 + }, + { + "epoch": 0.16880203779036612, + "grad_norm": 2.2300948328654973, + "learning_rate": 2.812975164723771e-05, + "loss": 0.2823, + "step": 1110 + }, + { + "epoch": 0.1703227768695586, + "grad_norm": 5.9105834950678116, + "learning_rate": 2.838317283324886e-05, + "loss": 0.2637, + "step": 1120 + }, + { + "epoch": 0.17184351594875108, + "grad_norm": 2.4156598526309656, + "learning_rate": 2.863659401926001e-05, + "loss": 0.2101, + "step": 1130 + }, + { + "epoch": 0.1733642550279436, + "grad_norm": 3.066402694606584, + "learning_rate": 2.8890015205271164e-05, + "loss": 0.2605, + "step": 1140 + }, + { + "epoch": 0.17488499410713607, + "grad_norm": 2.25179865182719, + "learning_rate": 2.9143436391282314e-05, + "loss": 0.2383, + "step": 1150 + }, + { + "epoch": 0.17640573318632854, + "grad_norm": 2.2186791513803414, + "learning_rate": 2.9396857577293464e-05, + "loss": 0.2506, + "step": 1160 + }, + { + "epoch": 0.17792647226552105, + "grad_norm": 2.4436209216967164, + "learning_rate": 2.965027876330461e-05, + "loss": 0.251, + "step": 1170 + }, + { + "epoch": 0.17944721134471353, + "grad_norm": 2.4642727793923354, + "learning_rate": 2.9903699949315767e-05, + "loss": 0.2768, + "step": 1180 + }, + { + "epoch": 0.180967950423906, + "grad_norm": 2.280073283652799, + "learning_rate": 3.0157121135326917e-05, + "loss": 0.2674, + "step": 1190 + }, + { + "epoch": 0.18248868950309852, + "grad_norm": 2.184894983604127, + "learning_rate": 3.0410542321338064e-05, + "loss": 0.2626, + "step": 1200 + }, + { + "epoch": 0.184009428582291, + "grad_norm": 2.6359546529683455, + "learning_rate": 3.066396350734922e-05, + "loss": 0.2956, + "step": 1210 + }, + { + "epoch": 0.18553016766148347, + "grad_norm": 2.614335071515979, + "learning_rate": 3.091738469336037e-05, + "loss": 0.2869, + "step": 1220 + }, + { + "epoch": 0.18705090674067598, + "grad_norm": 3.3507555662597994, + "learning_rate": 3.117080587937152e-05, + "loss": 0.2956, + "step": 1230 + }, + { + "epoch": 0.18857164581986846, + "grad_norm": 3.0682412105854713, + "learning_rate": 3.142422706538267e-05, + "loss": 0.2698, + "step": 1240 + }, + { + "epoch": 0.19009238489906094, + "grad_norm": 2.137330695252669, + "learning_rate": 3.1677648251393816e-05, + "loss": 0.2587, + "step": 1250 + }, + { + "epoch": 0.19161312397825342, + "grad_norm": 3.719498720763188, + "learning_rate": 3.193106943740497e-05, + "loss": 0.2464, + "step": 1260 + }, + { + "epoch": 0.19313386305744593, + "grad_norm": 2.940930267295928, + "learning_rate": 3.2184490623416116e-05, + "loss": 0.2764, + "step": 1270 + }, + { + "epoch": 0.1946546021366384, + "grad_norm": 2.8159842315218455, + "learning_rate": 3.2437911809427266e-05, + "loss": 0.3308, + "step": 1280 + }, + { + "epoch": 0.19617534121583088, + "grad_norm": 2.6799248203458417, + "learning_rate": 3.269133299543842e-05, + "loss": 0.3227, + "step": 1290 + }, + { + "epoch": 0.1976960802950234, + "grad_norm": 2.3009991809159955, + "learning_rate": 3.294475418144957e-05, + "loss": 0.2485, + "step": 1300 + }, + { + "epoch": 0.19921681937421587, + "grad_norm": 2.739984314302177, + "learning_rate": 3.319817536746072e-05, + "loss": 0.283, + "step": 1310 + }, + { + "epoch": 0.20073755845340835, + "grad_norm": 1.7504361967775135, + "learning_rate": 3.345159655347187e-05, + "loss": 0.2862, + "step": 1320 + }, + { + "epoch": 0.20225829753260086, + "grad_norm": 2.62411910629099, + "learning_rate": 3.370501773948302e-05, + "loss": 0.2838, + "step": 1330 + }, + { + "epoch": 0.20377903661179333, + "grad_norm": 2.4208165226051026, + "learning_rate": 3.395843892549417e-05, + "loss": 0.2606, + "step": 1340 + }, + { + "epoch": 0.20529977569098581, + "grad_norm": 3.0720742321766683, + "learning_rate": 3.421186011150532e-05, + "loss": 0.2745, + "step": 1350 + }, + { + "epoch": 0.2068205147701783, + "grad_norm": 2.617424000183995, + "learning_rate": 3.446528129751647e-05, + "loss": 0.2907, + "step": 1360 + }, + { + "epoch": 0.2083412538493708, + "grad_norm": 2.5811137906115866, + "learning_rate": 3.471870248352763e-05, + "loss": 0.3061, + "step": 1370 + }, + { + "epoch": 0.20986199292856328, + "grad_norm": 2.647757365428539, + "learning_rate": 3.497212366953878e-05, + "loss": 0.3072, + "step": 1380 + }, + { + "epoch": 0.21138273200775576, + "grad_norm": 2.814783989811939, + "learning_rate": 3.522554485554992e-05, + "loss": 0.2705, + "step": 1390 + }, + { + "epoch": 0.21290347108694826, + "grad_norm": 2.5967283216133277, + "learning_rate": 3.547896604156107e-05, + "loss": 0.3037, + "step": 1400 + }, + { + "epoch": 0.21442421016614074, + "grad_norm": 2.6227809954011545, + "learning_rate": 3.573238722757223e-05, + "loss": 0.3465, + "step": 1410 + }, + { + "epoch": 0.21594494924533322, + "grad_norm": 2.862740276411002, + "learning_rate": 3.598580841358338e-05, + "loss": 0.3128, + "step": 1420 + }, + { + "epoch": 0.21746568832452573, + "grad_norm": 3.1055741833779695, + "learning_rate": 3.623922959959453e-05, + "loss": 0.3017, + "step": 1430 + }, + { + "epoch": 0.2189864274037182, + "grad_norm": 4.2296133198373465, + "learning_rate": 3.649265078560568e-05, + "loss": 0.3288, + "step": 1440 + }, + { + "epoch": 0.2205071664829107, + "grad_norm": 3.160973787623607, + "learning_rate": 3.674607197161683e-05, + "loss": 0.3134, + "step": 1450 + }, + { + "epoch": 0.2220279055621032, + "grad_norm": 3.5344424318250725, + "learning_rate": 3.699949315762798e-05, + "loss": 0.3091, + "step": 1460 + }, + { + "epoch": 0.22354864464129567, + "grad_norm": 4.3719797092667845, + "learning_rate": 3.725291434363913e-05, + "loss": 0.3015, + "step": 1470 + }, + { + "epoch": 0.22506938372048815, + "grad_norm": 2.9106034618048082, + "learning_rate": 3.7506335529650284e-05, + "loss": 0.3466, + "step": 1480 + }, + { + "epoch": 0.22659012279968063, + "grad_norm": 2.8175328228524816, + "learning_rate": 3.7759756715661434e-05, + "loss": 0.3084, + "step": 1490 + }, + { + "epoch": 0.22811086187887314, + "grad_norm": 2.9905706138623795, + "learning_rate": 3.8013177901672584e-05, + "loss": 0.3445, + "step": 1500 + }, + { + "epoch": 0.22963160095806562, + "grad_norm": 3.0632305300729437, + "learning_rate": 3.826659908768373e-05, + "loss": 0.3443, + "step": 1510 + }, + { + "epoch": 0.2311523400372581, + "grad_norm": 2.422186828253968, + "learning_rate": 3.8520020273694884e-05, + "loss": 0.3432, + "step": 1520 + }, + { + "epoch": 0.2326730791164506, + "grad_norm": 3.02544912116432, + "learning_rate": 3.8773441459706034e-05, + "loss": 0.3141, + "step": 1530 + }, + { + "epoch": 0.23419381819564308, + "grad_norm": 2.533611343357827, + "learning_rate": 3.9026862645717183e-05, + "loss": 0.3243, + "step": 1540 + }, + { + "epoch": 0.23571455727483556, + "grad_norm": 2.63030810864126, + "learning_rate": 3.928028383172833e-05, + "loss": 0.356, + "step": 1550 + }, + { + "epoch": 0.23723529635402807, + "grad_norm": 4.354130627920429, + "learning_rate": 3.953370501773948e-05, + "loss": 0.321, + "step": 1560 + }, + { + "epoch": 0.23875603543322055, + "grad_norm": 3.2297226856566996, + "learning_rate": 3.978712620375063e-05, + "loss": 0.3165, + "step": 1570 + }, + { + "epoch": 0.24027677451241303, + "grad_norm": 1.6722767876551345, + "learning_rate": 4.004054738976178e-05, + "loss": 0.2574, + "step": 1580 + }, + { + "epoch": 0.24179751359160553, + "grad_norm": 2.4249921912866337, + "learning_rate": 4.029396857577294e-05, + "loss": 0.3241, + "step": 1590 + }, + { + "epoch": 0.243318252670798, + "grad_norm": 3.101097942115159, + "learning_rate": 4.054738976178409e-05, + "loss": 0.2996, + "step": 1600 + }, + { + "epoch": 0.2448389917499905, + "grad_norm": 3.218388428024808, + "learning_rate": 4.080081094779524e-05, + "loss": 0.347, + "step": 1610 + }, + { + "epoch": 0.24635973082918297, + "grad_norm": 2.4725135236656977, + "learning_rate": 4.105423213380639e-05, + "loss": 0.2961, + "step": 1620 + }, + { + "epoch": 0.24788046990837548, + "grad_norm": 2.4239535182870955, + "learning_rate": 4.130765331981754e-05, + "loss": 0.3234, + "step": 1630 + }, + { + "epoch": 0.24940120898756796, + "grad_norm": 3.174631014170229, + "learning_rate": 4.156107450582869e-05, + "loss": 0.3565, + "step": 1640 + }, + { + "epoch": 0.25092194806676044, + "grad_norm": 2.3861366450839014, + "learning_rate": 4.181449569183984e-05, + "loss": 0.3299, + "step": 1650 + }, + { + "epoch": 0.2524426871459529, + "grad_norm": 2.8547579254040696, + "learning_rate": 4.206791687785099e-05, + "loss": 0.3017, + "step": 1660 + }, + { + "epoch": 0.2539634262251454, + "grad_norm": 2.548627780068986, + "learning_rate": 4.2321338063862145e-05, + "loss": 0.3271, + "step": 1670 + }, + { + "epoch": 0.25548416530433793, + "grad_norm": 2.3856254824905148, + "learning_rate": 4.257475924987329e-05, + "loss": 0.3113, + "step": 1680 + }, + { + "epoch": 0.2570049043835304, + "grad_norm": 2.77556026188186, + "learning_rate": 4.282818043588444e-05, + "loss": 0.3923, + "step": 1690 + }, + { + "epoch": 0.2585256434627229, + "grad_norm": 3.1540046730034854, + "learning_rate": 4.308160162189559e-05, + "loss": 0.3347, + "step": 1700 + }, + { + "epoch": 0.26004638254191537, + "grad_norm": 3.370386051829158, + "learning_rate": 4.3335022807906745e-05, + "loss": 0.3231, + "step": 1710 + }, + { + "epoch": 0.26156712162110785, + "grad_norm": 3.6506497179186757, + "learning_rate": 4.3588443993917895e-05, + "loss": 0.3048, + "step": 1720 + }, + { + "epoch": 0.2630878607003003, + "grad_norm": 2.186173992106316, + "learning_rate": 4.3841865179929045e-05, + "loss": 0.3648, + "step": 1730 + }, + { + "epoch": 0.26460859977949286, + "grad_norm": 2.7960562338826707, + "learning_rate": 4.4095286365940195e-05, + "loss": 0.3077, + "step": 1740 + }, + { + "epoch": 0.26612933885868534, + "grad_norm": 3.1195787893544504, + "learning_rate": 4.4348707551951344e-05, + "loss": 0.3749, + "step": 1750 + }, + { + "epoch": 0.2676500779378778, + "grad_norm": 3.051887423826739, + "learning_rate": 4.4602128737962494e-05, + "loss": 0.3494, + "step": 1760 + }, + { + "epoch": 0.2691708170170703, + "grad_norm": 3.432706201494327, + "learning_rate": 4.4855549923973644e-05, + "loss": 0.3895, + "step": 1770 + }, + { + "epoch": 0.2706915560962628, + "grad_norm": 2.1100808212386886, + "learning_rate": 4.51089711099848e-05, + "loss": 0.3632, + "step": 1780 + }, + { + "epoch": 0.27221229517545525, + "grad_norm": 2.9413102934224047, + "learning_rate": 4.536239229599595e-05, + "loss": 0.3659, + "step": 1790 + }, + { + "epoch": 0.27373303425464773, + "grad_norm": 3.360383132887778, + "learning_rate": 4.5615813482007094e-05, + "loss": 0.3826, + "step": 1800 + }, + { + "epoch": 0.27525377333384027, + "grad_norm": 2.547739784450618, + "learning_rate": 4.5869234668018244e-05, + "loss": 0.3569, + "step": 1810 + }, + { + "epoch": 0.27677451241303275, + "grad_norm": 2.796135581415711, + "learning_rate": 4.61226558540294e-05, + "loss": 0.4077, + "step": 1820 + }, + { + "epoch": 0.2782952514922252, + "grad_norm": 3.130522433821404, + "learning_rate": 4.637607704004055e-05, + "loss": 0.3045, + "step": 1830 + }, + { + "epoch": 0.2798159905714177, + "grad_norm": 2.643366387437177, + "learning_rate": 4.66294982260517e-05, + "loss": 0.3946, + "step": 1840 + }, + { + "epoch": 0.2813367296506102, + "grad_norm": 2.801843777188606, + "learning_rate": 4.688291941206285e-05, + "loss": 0.3891, + "step": 1850 + }, + { + "epoch": 0.28285746872980266, + "grad_norm": 3.4869561958022506, + "learning_rate": 4.7136340598074e-05, + "loss": 0.3679, + "step": 1860 + }, + { + "epoch": 0.2843782078089952, + "grad_norm": 2.8438443965366482, + "learning_rate": 4.738976178408515e-05, + "loss": 0.3623, + "step": 1870 + }, + { + "epoch": 0.2858989468881877, + "grad_norm": 3.1165100287877454, + "learning_rate": 4.76431829700963e-05, + "loss": 0.3609, + "step": 1880 + }, + { + "epoch": 0.28741968596738016, + "grad_norm": 2.8598159347445056, + "learning_rate": 4.7896604156107456e-05, + "loss": 0.3498, + "step": 1890 + }, + { + "epoch": 0.28894042504657264, + "grad_norm": 3.7398012700339227, + "learning_rate": 4.8150025342118606e-05, + "loss": 0.3677, + "step": 1900 + }, + { + "epoch": 0.2904611641257651, + "grad_norm": 3.3481512822821364, + "learning_rate": 4.8403446528129756e-05, + "loss": 0.3575, + "step": 1910 + }, + { + "epoch": 0.2919819032049576, + "grad_norm": 2.9236950043434344, + "learning_rate": 4.86568677141409e-05, + "loss": 0.3719, + "step": 1920 + }, + { + "epoch": 0.2935026422841501, + "grad_norm": 3.415115008806836, + "learning_rate": 4.8910288900152056e-05, + "loss": 0.3659, + "step": 1930 + }, + { + "epoch": 0.2950233813633426, + "grad_norm": 2.8037300578412023, + "learning_rate": 4.9163710086163206e-05, + "loss": 0.4294, + "step": 1940 + }, + { + "epoch": 0.2965441204425351, + "grad_norm": 2.4846511118832924, + "learning_rate": 4.9417131272174356e-05, + "loss": 0.3785, + "step": 1950 + }, + { + "epoch": 0.29806485952172757, + "grad_norm": 2.28166171169232, + "learning_rate": 4.9670552458185506e-05, + "loss": 0.4149, + "step": 1960 + }, + { + "epoch": 0.29958559860092004, + "grad_norm": 3.425645674336139, + "learning_rate": 4.992397364419666e-05, + "loss": 0.3995, + "step": 1970 + }, + { + "epoch": 0.3011063376801125, + "grad_norm": 3.4306846968505433, + "learning_rate": 4.9999980817235095e-05, + "loss": 0.3897, + "step": 1980 + }, + { + "epoch": 0.302627076759305, + "grad_norm": 2.9768092074360055, + "learning_rate": 4.9999886860906395e-05, + "loss": 0.41, + "step": 1990 + }, + { + "epoch": 0.30414781583849754, + "grad_norm": 2.2455394711109573, + "learning_rate": 4.9999714607942834e-05, + "loss": 0.409, + "step": 2000 + }, + { + "epoch": 0.30566855491769, + "grad_norm": 3.2078843086982305, + "learning_rate": 4.999946405888386e-05, + "loss": 0.3592, + "step": 2010 + }, + { + "epoch": 0.3071892939968825, + "grad_norm": 3.476146745628206, + "learning_rate": 4.9999135214514194e-05, + "loss": 0.3886, + "step": 2020 + }, + { + "epoch": 0.308710033076075, + "grad_norm": 2.5193696641935603, + "learning_rate": 4.999872807586371e-05, + "loss": 0.3942, + "step": 2030 + }, + { + "epoch": 0.31023077215526745, + "grad_norm": 3.404402332718992, + "learning_rate": 4.999824264420753e-05, + "loss": 0.4277, + "step": 2040 + }, + { + "epoch": 0.31175151123445993, + "grad_norm": 3.11029933504406, + "learning_rate": 4.999767892106596e-05, + "loss": 0.4167, + "step": 2050 + }, + { + "epoch": 0.3132722503136524, + "grad_norm": 1.982601494083213, + "learning_rate": 4.999703690820452e-05, + "loss": 0.4093, + "step": 2060 + }, + { + "epoch": 0.31479298939284495, + "grad_norm": 3.220049656489309, + "learning_rate": 4.999631660763392e-05, + "loss": 0.3989, + "step": 2070 + }, + { + "epoch": 0.3163137284720374, + "grad_norm": 2.649013238948908, + "learning_rate": 4.999551802161004e-05, + "loss": 0.3871, + "step": 2080 + }, + { + "epoch": 0.3178344675512299, + "grad_norm": 2.7464338977347573, + "learning_rate": 4.999464115263397e-05, + "loss": 0.383, + "step": 2090 + }, + { + "epoch": 0.3193552066304224, + "grad_norm": 4.032560938361417, + "learning_rate": 4.9993686003451955e-05, + "loss": 0.4492, + "step": 2100 + }, + { + "epoch": 0.32087594570961486, + "grad_norm": 3.475628045051091, + "learning_rate": 4.99926525770554e-05, + "loss": 0.459, + "step": 2110 + }, + { + "epoch": 0.32239668478880734, + "grad_norm": 3.640091873399105, + "learning_rate": 4.9991540876680876e-05, + "loss": 0.4171, + "step": 2120 + }, + { + "epoch": 0.3239174238679998, + "grad_norm": 3.455646810495871, + "learning_rate": 4.99903509058101e-05, + "loss": 0.4054, + "step": 2130 + }, + { + "epoch": 0.32543816294719236, + "grad_norm": 2.889736037386252, + "learning_rate": 4.998908266816991e-05, + "loss": 0.477, + "step": 2140 + }, + { + "epoch": 0.32695890202638483, + "grad_norm": 2.8505277354499636, + "learning_rate": 4.998773616773228e-05, + "loss": 0.4305, + "step": 2150 + }, + { + "epoch": 0.3284796411055773, + "grad_norm": 2.841869726791348, + "learning_rate": 4.998631140871428e-05, + "loss": 0.4167, + "step": 2160 + }, + { + "epoch": 0.3300003801847698, + "grad_norm": 2.8300676409750705, + "learning_rate": 4.998480839557808e-05, + "loss": 0.4196, + "step": 2170 + }, + { + "epoch": 0.33152111926396227, + "grad_norm": 3.538674455891556, + "learning_rate": 4.998322713303095e-05, + "loss": 0.4072, + "step": 2180 + }, + { + "epoch": 0.33304185834315475, + "grad_norm": 3.2901141096922117, + "learning_rate": 4.998156762602521e-05, + "loss": 0.4097, + "step": 2190 + }, + { + "epoch": 0.3345625974223473, + "grad_norm": 2.7618156879603304, + "learning_rate": 4.997982987975823e-05, + "loss": 0.3835, + "step": 2200 + }, + { + "epoch": 0.33608333650153976, + "grad_norm": 2.010194762939355, + "learning_rate": 4.9978013899672423e-05, + "loss": 0.3914, + "step": 2210 + }, + { + "epoch": 0.33760407558073224, + "grad_norm": 2.229450235166853, + "learning_rate": 4.997611969145524e-05, + "loss": 0.3854, + "step": 2220 + }, + { + "epoch": 0.3391248146599247, + "grad_norm": 3.858697957575173, + "learning_rate": 4.997414726103907e-05, + "loss": 0.3814, + "step": 2230 + }, + { + "epoch": 0.3406455537391172, + "grad_norm": 2.5844536166128895, + "learning_rate": 4.997209661460137e-05, + "loss": 0.383, + "step": 2240 + }, + { + "epoch": 0.3421662928183097, + "grad_norm": 3.9848611249601817, + "learning_rate": 4.99699677585645e-05, + "loss": 0.4462, + "step": 2250 + }, + { + "epoch": 0.34368703189750216, + "grad_norm": 3.269957241628345, + "learning_rate": 4.996776069959577e-05, + "loss": 0.4303, + "step": 2260 + }, + { + "epoch": 0.3452077709766947, + "grad_norm": 3.3800727934842434, + "learning_rate": 4.996547544460744e-05, + "loss": 0.4221, + "step": 2270 + }, + { + "epoch": 0.3467285100558872, + "grad_norm": 3.9000703635515874, + "learning_rate": 4.9963112000756653e-05, + "loss": 0.3782, + "step": 2280 + }, + { + "epoch": 0.34824924913507965, + "grad_norm": 3.078863651939886, + "learning_rate": 4.996067037544542e-05, + "loss": 0.3985, + "step": 2290 + }, + { + "epoch": 0.34976998821427213, + "grad_norm": 2.6703057893185536, + "learning_rate": 4.9958150576320616e-05, + "loss": 0.3929, + "step": 2300 + }, + { + "epoch": 0.3512907272934646, + "grad_norm": 3.209226050549795, + "learning_rate": 4.9955552611273966e-05, + "loss": 0.4106, + "step": 2310 + }, + { + "epoch": 0.3528114663726571, + "grad_norm": 2.6678616185582014, + "learning_rate": 4.995287648844197e-05, + "loss": 0.3925, + "step": 2320 + }, + { + "epoch": 0.3543322054518496, + "grad_norm": 3.571028393306744, + "learning_rate": 4.995012221620592e-05, + "loss": 0.4236, + "step": 2330 + }, + { + "epoch": 0.3558529445310421, + "grad_norm": 2.489815442822639, + "learning_rate": 4.994728980319187e-05, + "loss": 0.4356, + "step": 2340 + }, + { + "epoch": 0.3573736836102346, + "grad_norm": 3.092753426543583, + "learning_rate": 4.99443792582706e-05, + "loss": 0.4102, + "step": 2350 + }, + { + "epoch": 0.35889442268942706, + "grad_norm": 3.9100196454169986, + "learning_rate": 4.994139059055758e-05, + "loss": 0.4202, + "step": 2360 + }, + { + "epoch": 0.36041516176861954, + "grad_norm": 3.10769833590166, + "learning_rate": 4.993832380941297e-05, + "loss": 0.4252, + "step": 2370 + }, + { + "epoch": 0.361935900847812, + "grad_norm": 4.10969966048515, + "learning_rate": 4.993517892444154e-05, + "loss": 0.4086, + "step": 2380 + }, + { + "epoch": 0.3634566399270045, + "grad_norm": 3.5235815597441063, + "learning_rate": 4.99319559454927e-05, + "loss": 0.4342, + "step": 2390 + }, + { + "epoch": 0.36497737900619703, + "grad_norm": 2.7044850500010345, + "learning_rate": 4.992865488266043e-05, + "loss": 0.436, + "step": 2400 + }, + { + "epoch": 0.3664981180853895, + "grad_norm": 2.994101348186802, + "learning_rate": 4.992527574628326e-05, + "loss": 0.3706, + "step": 2410 + }, + { + "epoch": 0.368018857164582, + "grad_norm": 3.2633411996986563, + "learning_rate": 4.9921818546944254e-05, + "loss": 0.4883, + "step": 2420 + }, + { + "epoch": 0.36953959624377447, + "grad_norm": 3.1258932013534144, + "learning_rate": 4.991828329547092e-05, + "loss": 0.4159, + "step": 2430 + }, + { + "epoch": 0.37106033532296695, + "grad_norm": 3.6349807032087775, + "learning_rate": 4.991467000293526e-05, + "loss": 0.4113, + "step": 2440 + }, + { + "epoch": 0.37258107440215943, + "grad_norm": 4.051392284798637, + "learning_rate": 4.991097868065366e-05, + "loss": 0.4663, + "step": 2450 + }, + { + "epoch": 0.37410181348135196, + "grad_norm": 3.3459356285619615, + "learning_rate": 4.99072093401869e-05, + "loss": 0.4304, + "step": 2460 + }, + { + "epoch": 0.37562255256054444, + "grad_norm": 3.10279064111015, + "learning_rate": 4.9903361993340095e-05, + "loss": 0.3989, + "step": 2470 + }, + { + "epoch": 0.3771432916397369, + "grad_norm": 2.883726668395835, + "learning_rate": 4.989943665216268e-05, + "loss": 0.4166, + "step": 2480 + }, + { + "epoch": 0.3786640307189294, + "grad_norm": 3.502062488085139, + "learning_rate": 4.989543332894834e-05, + "loss": 0.3837, + "step": 2490 + }, + { + "epoch": 0.3801847697981219, + "grad_norm": 3.32520582741293, + "learning_rate": 4.9891352036235016e-05, + "loss": 0.4109, + "step": 2500 + }, + { + "epoch": 0.38170550887731436, + "grad_norm": 3.134102480753488, + "learning_rate": 4.9887192786804816e-05, + "loss": 0.501, + "step": 2510 + }, + { + "epoch": 0.38322624795650684, + "grad_norm": 2.8576569104948475, + "learning_rate": 4.988295559368401e-05, + "loss": 0.4143, + "step": 2520 + }, + { + "epoch": 0.3847469870356994, + "grad_norm": 3.1285248740722813, + "learning_rate": 4.987864047014298e-05, + "loss": 0.4536, + "step": 2530 + }, + { + "epoch": 0.38626772611489185, + "grad_norm": 2.8344236950702824, + "learning_rate": 4.987424742969616e-05, + "loss": 0.4531, + "step": 2540 + }, + { + "epoch": 0.38778846519408433, + "grad_norm": 2.6631674692124565, + "learning_rate": 4.9869776486102047e-05, + "loss": 0.4131, + "step": 2550 + }, + { + "epoch": 0.3893092042732768, + "grad_norm": 3.5484938419380296, + "learning_rate": 4.986522765336308e-05, + "loss": 0.4281, + "step": 2560 + }, + { + "epoch": 0.3908299433524693, + "grad_norm": 2.3892402020030494, + "learning_rate": 4.986060094572566e-05, + "loss": 0.4043, + "step": 2570 + }, + { + "epoch": 0.39235068243166177, + "grad_norm": 3.343193924546747, + "learning_rate": 4.9855896377680066e-05, + "loss": 0.4502, + "step": 2580 + }, + { + "epoch": 0.3938714215108543, + "grad_norm": 2.917278692762873, + "learning_rate": 4.985111396396046e-05, + "loss": 0.3944, + "step": 2590 + }, + { + "epoch": 0.3953921605900468, + "grad_norm": 2.8995173190402515, + "learning_rate": 4.984625371954478e-05, + "loss": 0.383, + "step": 2600 + }, + { + "epoch": 0.39691289966923926, + "grad_norm": 3.6756162176204614, + "learning_rate": 4.984131565965472e-05, + "loss": 0.4624, + "step": 2610 + }, + { + "epoch": 0.39843363874843174, + "grad_norm": 3.075571807178465, + "learning_rate": 4.9836299799755695e-05, + "loss": 0.4168, + "step": 2620 + }, + { + "epoch": 0.3999543778276242, + "grad_norm": 2.9061275255474253, + "learning_rate": 4.983120615555676e-05, + "loss": 0.4163, + "step": 2630 + }, + { + "epoch": 0.4014751169068167, + "grad_norm": 3.3867519335530316, + "learning_rate": 4.9826034743010606e-05, + "loss": 0.4847, + "step": 2640 + }, + { + "epoch": 0.4029958559860092, + "grad_norm": 3.0362461204116467, + "learning_rate": 4.982078557831348e-05, + "loss": 0.4117, + "step": 2650 + }, + { + "epoch": 0.4045165950652017, + "grad_norm": 3.349029775934006, + "learning_rate": 4.981545867790512e-05, + "loss": 0.4701, + "step": 2660 + }, + { + "epoch": 0.4060373341443942, + "grad_norm": 3.350714632961027, + "learning_rate": 4.981005405846876e-05, + "loss": 0.4089, + "step": 2670 + }, + { + "epoch": 0.40755807322358667, + "grad_norm": 3.400036077774985, + "learning_rate": 4.980457173693099e-05, + "loss": 0.4347, + "step": 2680 + }, + { + "epoch": 0.40907881230277915, + "grad_norm": 2.703610261587994, + "learning_rate": 4.97990117304618e-05, + "loss": 0.4125, + "step": 2690 + }, + { + "epoch": 0.41059955138197163, + "grad_norm": 2.952061192662502, + "learning_rate": 4.979337405647444e-05, + "loss": 0.451, + "step": 2700 + }, + { + "epoch": 0.4121202904611641, + "grad_norm": 3.43083970008645, + "learning_rate": 4.978765873262545e-05, + "loss": 0.426, + "step": 2710 + }, + { + "epoch": 0.4136410295403566, + "grad_norm": 2.9398584204126768, + "learning_rate": 4.9781865776814515e-05, + "loss": 0.4463, + "step": 2720 + }, + { + "epoch": 0.4151617686195491, + "grad_norm": 4.053147954790478, + "learning_rate": 4.977599520718449e-05, + "loss": 0.4318, + "step": 2730 + }, + { + "epoch": 0.4166825076987416, + "grad_norm": 3.5337252640971375, + "learning_rate": 4.977004704212129e-05, + "loss": 0.4431, + "step": 2740 + }, + { + "epoch": 0.4182032467779341, + "grad_norm": 4.494032222491364, + "learning_rate": 4.9764021300253844e-05, + "loss": 0.4272, + "step": 2750 + }, + { + "epoch": 0.41972398585712656, + "grad_norm": 3.520268308260917, + "learning_rate": 4.975791800045406e-05, + "loss": 0.46, + "step": 2760 + }, + { + "epoch": 0.42124472493631904, + "grad_norm": 2.5616860817136975, + "learning_rate": 4.975173716183672e-05, + "loss": 0.4651, + "step": 2770 + }, + { + "epoch": 0.4227654640155115, + "grad_norm": 3.135442455134988, + "learning_rate": 4.974547880375949e-05, + "loss": 0.4149, + "step": 2780 + }, + { + "epoch": 0.42428620309470405, + "grad_norm": 3.720240753054115, + "learning_rate": 4.973914294582278e-05, + "loss": 0.4481, + "step": 2790 + }, + { + "epoch": 0.42580694217389653, + "grad_norm": 2.677355550665423, + "learning_rate": 4.973272960786973e-05, + "loss": 0.406, + "step": 2800 + }, + { + "epoch": 0.427327681253089, + "grad_norm": 3.0650578666356636, + "learning_rate": 4.9726238809986154e-05, + "loss": 0.4331, + "step": 2810 + }, + { + "epoch": 0.4288484203322815, + "grad_norm": 2.660429855500457, + "learning_rate": 4.9719670572500444e-05, + "loss": 0.4259, + "step": 2820 + }, + { + "epoch": 0.43036915941147397, + "grad_norm": 3.469158723835534, + "learning_rate": 4.9713024915983525e-05, + "loss": 0.456, + "step": 2830 + }, + { + "epoch": 0.43188989849066645, + "grad_norm": 3.9103910852429395, + "learning_rate": 4.9706301861248794e-05, + "loss": 0.4479, + "step": 2840 + }, + { + "epoch": 0.4334106375698589, + "grad_norm": 3.576914764309182, + "learning_rate": 4.969950142935206e-05, + "loss": 0.4508, + "step": 2850 + }, + { + "epoch": 0.43493137664905146, + "grad_norm": 3.973919785255315, + "learning_rate": 4.969262364159144e-05, + "loss": 0.4676, + "step": 2860 + }, + { + "epoch": 0.43645211572824394, + "grad_norm": 2.362099617631692, + "learning_rate": 4.9685668519507346e-05, + "loss": 0.4786, + "step": 2870 + }, + { + "epoch": 0.4379728548074364, + "grad_norm": 4.254638669306579, + "learning_rate": 4.9678636084882385e-05, + "loss": 0.4137, + "step": 2880 + }, + { + "epoch": 0.4394935938866289, + "grad_norm": 3.082447203810843, + "learning_rate": 4.967152635974129e-05, + "loss": 0.4699, + "step": 2890 + }, + { + "epoch": 0.4410143329658214, + "grad_norm": 2.5093472424560948, + "learning_rate": 4.966433936635086e-05, + "loss": 0.4949, + "step": 2900 + }, + { + "epoch": 0.44253507204501386, + "grad_norm": 2.1004375655244014, + "learning_rate": 4.965707512721991e-05, + "loss": 0.4101, + "step": 2910 + }, + { + "epoch": 0.4440558111242064, + "grad_norm": 3.076161114909028, + "learning_rate": 4.9649733665099144e-05, + "loss": 0.4553, + "step": 2920 + }, + { + "epoch": 0.44557655020339887, + "grad_norm": 3.2228401091362193, + "learning_rate": 4.964231500298114e-05, + "loss": 0.4634, + "step": 2930 + }, + { + "epoch": 0.44709728928259135, + "grad_norm": 3.059368064594295, + "learning_rate": 4.963481916410026e-05, + "loss": 0.4761, + "step": 2940 + }, + { + "epoch": 0.4486180283617838, + "grad_norm": 4.5845729493653495, + "learning_rate": 4.9627246171932574e-05, + "loss": 0.4525, + "step": 2950 + }, + { + "epoch": 0.4501387674409763, + "grad_norm": 3.5696008084786612, + "learning_rate": 4.961959605019576e-05, + "loss": 0.4354, + "step": 2960 + }, + { + "epoch": 0.4516595065201688, + "grad_norm": 3.5796152808733166, + "learning_rate": 4.9611868822849104e-05, + "loss": 0.4849, + "step": 2970 + }, + { + "epoch": 0.45318024559936126, + "grad_norm": 4.49427736732011, + "learning_rate": 4.9604064514093335e-05, + "loss": 0.4692, + "step": 2980 + }, + { + "epoch": 0.4547009846785538, + "grad_norm": 3.2146923499273936, + "learning_rate": 4.959618314837061e-05, + "loss": 0.4504, + "step": 2990 + }, + { + "epoch": 0.4562217237577463, + "grad_norm": 3.1886056531350007, + "learning_rate": 4.958822475036443e-05, + "loss": 0.4367, + "step": 3000 + }, + { + "epoch": 0.45774246283693876, + "grad_norm": 3.050916374055843, + "learning_rate": 4.9580189344999515e-05, + "loss": 0.4683, + "step": 3010 + }, + { + "epoch": 0.45926320191613124, + "grad_norm": 4.297091689394695, + "learning_rate": 4.9572076957441815e-05, + "loss": 0.486, + "step": 3020 + }, + { + "epoch": 0.4607839409953237, + "grad_norm": 3.417294093567773, + "learning_rate": 4.956388761309832e-05, + "loss": 0.4725, + "step": 3030 + }, + { + "epoch": 0.4623046800745162, + "grad_norm": 3.0432069429643316, + "learning_rate": 4.955562133761711e-05, + "loss": 0.4723, + "step": 3040 + }, + { + "epoch": 0.46382541915370873, + "grad_norm": 2.30435742594806, + "learning_rate": 4.954727815688712e-05, + "loss": 0.4461, + "step": 3050 + }, + { + "epoch": 0.4653461582329012, + "grad_norm": 3.4233448102070443, + "learning_rate": 4.953885809703822e-05, + "loss": 0.4486, + "step": 3060 + }, + { + "epoch": 0.4668668973120937, + "grad_norm": 2.8734308998843168, + "learning_rate": 4.953036118444101e-05, + "loss": 0.4498, + "step": 3070 + }, + { + "epoch": 0.46838763639128617, + "grad_norm": 2.586239364869869, + "learning_rate": 4.95217874457068e-05, + "loss": 0.4357, + "step": 3080 + }, + { + "epoch": 0.46990837547047865, + "grad_norm": 2.7464814291429174, + "learning_rate": 4.951313690768751e-05, + "loss": 0.4524, + "step": 3090 + }, + { + "epoch": 0.4714291145496711, + "grad_norm": 4.421472706275767, + "learning_rate": 4.9504409597475565e-05, + "loss": 0.5416, + "step": 3100 + }, + { + "epoch": 0.4729498536288636, + "grad_norm": 2.4521803364829706, + "learning_rate": 4.949560554240388e-05, + "loss": 0.4775, + "step": 3110 + }, + { + "epoch": 0.47447059270805614, + "grad_norm": 3.2923635008316112, + "learning_rate": 4.948672477004567e-05, + "loss": 0.4859, + "step": 3120 + }, + { + "epoch": 0.4759913317872486, + "grad_norm": 3.076456255553935, + "learning_rate": 4.947776730821445e-05, + "loss": 0.4331, + "step": 3130 + }, + { + "epoch": 0.4775120708664411, + "grad_norm": 3.763185660714678, + "learning_rate": 4.946873318496392e-05, + "loss": 0.474, + "step": 3140 + }, + { + "epoch": 0.4790328099456336, + "grad_norm": 2.966168343061985, + "learning_rate": 4.945962242858787e-05, + "loss": 0.4478, + "step": 3150 + }, + { + "epoch": 0.48055354902482605, + "grad_norm": 2.2489301569540796, + "learning_rate": 4.945043506762007e-05, + "loss": 0.4086, + "step": 3160 + }, + { + "epoch": 0.48207428810401853, + "grad_norm": 3.649146285588346, + "learning_rate": 4.9441171130834245e-05, + "loss": 0.4941, + "step": 3170 + }, + { + "epoch": 0.48359502718321107, + "grad_norm": 3.2324705134646257, + "learning_rate": 4.943183064724393e-05, + "loss": 0.4982, + "step": 3180 + }, + { + "epoch": 0.48511576626240355, + "grad_norm": 2.850478859005078, + "learning_rate": 4.9422413646102385e-05, + "loss": 0.4518, + "step": 3190 + }, + { + "epoch": 0.486636505341596, + "grad_norm": 2.83444440962313, + "learning_rate": 4.941292015690253e-05, + "loss": 0.4202, + "step": 3200 + }, + { + "epoch": 0.4881572444207885, + "grad_norm": 2.8915243714372196, + "learning_rate": 4.940335020937683e-05, + "loss": 0.4345, + "step": 3210 + }, + { + "epoch": 0.489677983499981, + "grad_norm": 3.3863600716264886, + "learning_rate": 4.939370383349721e-05, + "loss": 0.4446, + "step": 3220 + }, + { + "epoch": 0.49119872257917346, + "grad_norm": 2.502867630291183, + "learning_rate": 4.938398105947495e-05, + "loss": 0.4759, + "step": 3230 + }, + { + "epoch": 0.49271946165836594, + "grad_norm": 3.502753196172659, + "learning_rate": 4.937418191776061e-05, + "loss": 0.4236, + "step": 3240 + }, + { + "epoch": 0.4942402007375585, + "grad_norm": 2.8928201413418324, + "learning_rate": 4.936430643904392e-05, + "loss": 0.4561, + "step": 3250 + }, + { + "epoch": 0.49576093981675096, + "grad_norm": 4.0814808750833915, + "learning_rate": 4.93543546542537e-05, + "loss": 0.4878, + "step": 3260 + }, + { + "epoch": 0.49728167889594344, + "grad_norm": 3.1105097444073544, + "learning_rate": 4.9344326594557734e-05, + "loss": 0.4301, + "step": 3270 + }, + { + "epoch": 0.4988024179751359, + "grad_norm": 2.975444637557291, + "learning_rate": 4.93342222913627e-05, + "loss": 0.4715, + "step": 3280 + }, + { + "epoch": 0.5003231570543284, + "grad_norm": 4.77999882731304, + "learning_rate": 4.932404177631405e-05, + "loss": 0.4594, + "step": 3290 + }, + { + "epoch": 0.5018438961335209, + "grad_norm": 2.990912480551215, + "learning_rate": 4.9313785081295946e-05, + "loss": 0.4364, + "step": 3300 + }, + { + "epoch": 0.5033646352127134, + "grad_norm": 3.191786915410823, + "learning_rate": 4.93034522384311e-05, + "loss": 0.4783, + "step": 3310 + }, + { + "epoch": 0.5048853742919058, + "grad_norm": 2.756197803889167, + "learning_rate": 4.9293043280080756e-05, + "loss": 0.4583, + "step": 3320 + }, + { + "epoch": 0.5064061133710983, + "grad_norm": 3.0181247696624602, + "learning_rate": 4.9282558238844514e-05, + "loss": 0.4331, + "step": 3330 + }, + { + "epoch": 0.5079268524502908, + "grad_norm": 4.093247026516489, + "learning_rate": 4.927199714756024e-05, + "loss": 0.4375, + "step": 3340 + }, + { + "epoch": 0.5094475915294834, + "grad_norm": 3.6598816351826793, + "learning_rate": 4.9261360039304016e-05, + "loss": 0.4391, + "step": 3350 + }, + { + "epoch": 0.5109683306086759, + "grad_norm": 2.9071090207731722, + "learning_rate": 4.9250646947389985e-05, + "loss": 0.4449, + "step": 3360 + }, + { + "epoch": 0.5124890696878683, + "grad_norm": 3.3450512593962842, + "learning_rate": 4.923985790537025e-05, + "loss": 0.496, + "step": 3370 + }, + { + "epoch": 0.5140098087670608, + "grad_norm": 3.4456654874197232, + "learning_rate": 4.92289929470348e-05, + "loss": 0.4557, + "step": 3380 + }, + { + "epoch": 0.5155305478462533, + "grad_norm": 3.250902815040035, + "learning_rate": 4.9218052106411385e-05, + "loss": 0.4661, + "step": 3390 + }, + { + "epoch": 0.5170512869254458, + "grad_norm": 2.9182253678199386, + "learning_rate": 4.920703541776538e-05, + "loss": 0.4619, + "step": 3400 + }, + { + "epoch": 0.5185720260046383, + "grad_norm": 3.983071016449622, + "learning_rate": 4.919594291559974e-05, + "loss": 0.4203, + "step": 3410 + }, + { + "epoch": 0.5200927650838307, + "grad_norm": 2.81809578351843, + "learning_rate": 4.918477463465484e-05, + "loss": 0.4391, + "step": 3420 + }, + { + "epoch": 0.5216135041630232, + "grad_norm": 4.357613697441156, + "learning_rate": 4.917353060990839e-05, + "loss": 0.4979, + "step": 3430 + }, + { + "epoch": 0.5231342432422157, + "grad_norm": 3.674065240274498, + "learning_rate": 4.916221087657533e-05, + "loss": 0.4292, + "step": 3440 + }, + { + "epoch": 0.5246549823214082, + "grad_norm": 3.2329879326400657, + "learning_rate": 4.915081547010769e-05, + "loss": 0.4429, + "step": 3450 + }, + { + "epoch": 0.5261757214006006, + "grad_norm": 2.8422261228820394, + "learning_rate": 4.913934442619453e-05, + "loss": 0.5289, + "step": 3460 + }, + { + "epoch": 0.5276964604797931, + "grad_norm": 3.74398658027105, + "learning_rate": 4.912779778076175e-05, + "loss": 0.4523, + "step": 3470 + }, + { + "epoch": 0.5292171995589857, + "grad_norm": 2.97658521542873, + "learning_rate": 4.9116175569972076e-05, + "loss": 0.4736, + "step": 3480 + }, + { + "epoch": 0.5307379386381782, + "grad_norm": 2.438644363872717, + "learning_rate": 4.910447783022487e-05, + "loss": 0.4697, + "step": 3490 + }, + { + "epoch": 0.5322586777173707, + "grad_norm": 3.65454025674319, + "learning_rate": 4.909270459815602e-05, + "loss": 0.4921, + "step": 3500 + }, + { + "epoch": 0.5337794167965632, + "grad_norm": 3.5593419781198374, + "learning_rate": 4.9080855910637915e-05, + "loss": 0.4733, + "step": 3510 + }, + { + "epoch": 0.5353001558757556, + "grad_norm": 3.1976378818028475, + "learning_rate": 4.9068931804779175e-05, + "loss": 0.4777, + "step": 3520 + }, + { + "epoch": 0.5368208949549481, + "grad_norm": 3.837345402766518, + "learning_rate": 4.905693231792468e-05, + "loss": 0.4644, + "step": 3530 + }, + { + "epoch": 0.5383416340341406, + "grad_norm": 3.4390505438183796, + "learning_rate": 4.9044857487655385e-05, + "loss": 0.4604, + "step": 3540 + }, + { + "epoch": 0.5398623731133331, + "grad_norm": 3.0956675756744936, + "learning_rate": 4.9032707351788194e-05, + "loss": 0.4243, + "step": 3550 + }, + { + "epoch": 0.5413831121925256, + "grad_norm": 2.956206218228375, + "learning_rate": 4.9020481948375876e-05, + "loss": 0.4438, + "step": 3560 + }, + { + "epoch": 0.542903851271718, + "grad_norm": 3.334267826189304, + "learning_rate": 4.900818131570691e-05, + "loss": 0.459, + "step": 3570 + }, + { + "epoch": 0.5444245903509105, + "grad_norm": 3.26016049001583, + "learning_rate": 4.899580549230541e-05, + "loss": 0.4344, + "step": 3580 + }, + { + "epoch": 0.545945329430103, + "grad_norm": 4.249791893824641, + "learning_rate": 4.898335451693096e-05, + "loss": 0.4594, + "step": 3590 + }, + { + "epoch": 0.5474660685092955, + "grad_norm": 2.802750462997298, + "learning_rate": 4.897082842857851e-05, + "loss": 0.4636, + "step": 3600 + }, + { + "epoch": 0.5489868075884881, + "grad_norm": 3.4554211099484435, + "learning_rate": 4.8958227266478275e-05, + "loss": 0.4027, + "step": 3610 + }, + { + "epoch": 0.5505075466676805, + "grad_norm": 2.4952583194598694, + "learning_rate": 4.894555107009556e-05, + "loss": 0.443, + "step": 3620 + }, + { + "epoch": 0.552028285746873, + "grad_norm": 2.518308743381062, + "learning_rate": 4.8932799879130696e-05, + "loss": 0.5164, + "step": 3630 + }, + { + "epoch": 0.5535490248260655, + "grad_norm": 3.2114335689716156, + "learning_rate": 4.891997373351887e-05, + "loss": 0.4333, + "step": 3640 + }, + { + "epoch": 0.555069763905258, + "grad_norm": 3.353508397798323, + "learning_rate": 4.890707267343003e-05, + "loss": 0.5647, + "step": 3650 + }, + { + "epoch": 0.5565905029844505, + "grad_norm": 3.585989659703909, + "learning_rate": 4.8894096739268746e-05, + "loss": 0.4393, + "step": 3660 + }, + { + "epoch": 0.5581112420636429, + "grad_norm": 3.1491475686880754, + "learning_rate": 4.8881045971674074e-05, + "loss": 0.5292, + "step": 3670 + }, + { + "epoch": 0.5596319811428354, + "grad_norm": 3.099918487926915, + "learning_rate": 4.8867920411519446e-05, + "loss": 0.4156, + "step": 3680 + }, + { + "epoch": 0.5611527202220279, + "grad_norm": 2.849940533795096, + "learning_rate": 4.8854720099912543e-05, + "loss": 0.4488, + "step": 3690 + }, + { + "epoch": 0.5626734593012204, + "grad_norm": 3.030616089081656, + "learning_rate": 4.884144507819515e-05, + "loss": 0.4865, + "step": 3700 + }, + { + "epoch": 0.5641941983804128, + "grad_norm": 3.0158331071227407, + "learning_rate": 4.882809538794303e-05, + "loss": 0.4734, + "step": 3710 + }, + { + "epoch": 0.5657149374596053, + "grad_norm": 2.7183386810265526, + "learning_rate": 4.881467107096581e-05, + "loss": 0.4457, + "step": 3720 + }, + { + "epoch": 0.5672356765387978, + "grad_norm": 2.49575834619917, + "learning_rate": 4.880117216930683e-05, + "loss": 0.4309, + "step": 3730 + }, + { + "epoch": 0.5687564156179904, + "grad_norm": 2.9700309711315924, + "learning_rate": 4.878759872524302e-05, + "loss": 0.5135, + "step": 3740 + }, + { + "epoch": 0.5702771546971829, + "grad_norm": 3.5806533253821113, + "learning_rate": 4.8773950781284794e-05, + "loss": 0.4492, + "step": 3750 + }, + { + "epoch": 0.5717978937763754, + "grad_norm": 2.9915032792689056, + "learning_rate": 4.876022838017584e-05, + "loss": 0.4463, + "step": 3760 + }, + { + "epoch": 0.5733186328555678, + "grad_norm": 3.7538038059725922, + "learning_rate": 4.874643156489309e-05, + "loss": 0.4593, + "step": 3770 + }, + { + "epoch": 0.5748393719347603, + "grad_norm": 2.666060431274113, + "learning_rate": 4.8732560378646494e-05, + "loss": 0.4306, + "step": 3780 + }, + { + "epoch": 0.5763601110139528, + "grad_norm": 2.991293097238099, + "learning_rate": 4.8718614864878945e-05, + "loss": 0.4799, + "step": 3790 + }, + { + "epoch": 0.5778808500931453, + "grad_norm": 2.94924480180075, + "learning_rate": 4.87045950672661e-05, + "loss": 0.4794, + "step": 3800 + }, + { + "epoch": 0.5794015891723377, + "grad_norm": 3.516381710163782, + "learning_rate": 4.869050102971629e-05, + "loss": 0.4919, + "step": 3810 + }, + { + "epoch": 0.5809223282515302, + "grad_norm": 3.031511345457987, + "learning_rate": 4.8676332796370336e-05, + "loss": 0.4579, + "step": 3820 + }, + { + "epoch": 0.5824430673307227, + "grad_norm": 3.1217612271167137, + "learning_rate": 4.866209041160144e-05, + "loss": 0.4684, + "step": 3830 + }, + { + "epoch": 0.5839638064099152, + "grad_norm": 3.425050370638793, + "learning_rate": 4.864777392001504e-05, + "loss": 0.4794, + "step": 3840 + }, + { + "epoch": 0.5854845454891077, + "grad_norm": 3.340354786286474, + "learning_rate": 4.863338336644866e-05, + "loss": 0.4208, + "step": 3850 + }, + { + "epoch": 0.5870052845683001, + "grad_norm": 3.3876202053653475, + "learning_rate": 4.8618918795971774e-05, + "loss": 0.5053, + "step": 3860 + }, + { + "epoch": 0.5885260236474927, + "grad_norm": 2.968501789107022, + "learning_rate": 4.860438025388568e-05, + "loss": 0.432, + "step": 3870 + }, + { + "epoch": 0.5900467627266852, + "grad_norm": 2.593085815518715, + "learning_rate": 4.858976778572335e-05, + "loss": 0.4613, + "step": 3880 + }, + { + "epoch": 0.5915675018058777, + "grad_norm": 3.1688053779871064, + "learning_rate": 4.8575081437249266e-05, + "loss": 0.4973, + "step": 3890 + }, + { + "epoch": 0.5930882408850702, + "grad_norm": 2.5672161391347905, + "learning_rate": 4.8560321254459296e-05, + "loss": 0.4351, + "step": 3900 + }, + { + "epoch": 0.5946089799642627, + "grad_norm": 2.656446511277839, + "learning_rate": 4.854548728358057e-05, + "loss": 0.4543, + "step": 3910 + }, + { + "epoch": 0.5961297190434551, + "grad_norm": 3.311013103731686, + "learning_rate": 4.853057957107129e-05, + "loss": 0.462, + "step": 3920 + }, + { + "epoch": 0.5976504581226476, + "grad_norm": 3.544516955846144, + "learning_rate": 4.851559816362061e-05, + "loss": 0.4473, + "step": 3930 + }, + { + "epoch": 0.5991711972018401, + "grad_norm": 3.0957817342666, + "learning_rate": 4.850054310814851e-05, + "loss": 0.5022, + "step": 3940 + }, + { + "epoch": 0.6006919362810326, + "grad_norm": 3.3589109163252813, + "learning_rate": 4.848541445180559e-05, + "loss": 0.4863, + "step": 3950 + }, + { + "epoch": 0.602212675360225, + "grad_norm": 3.0451264253048085, + "learning_rate": 4.8470212241973e-05, + "loss": 0.4957, + "step": 3960 + }, + { + "epoch": 0.6037334144394175, + "grad_norm": 3.5872097466950006, + "learning_rate": 4.8454936526262215e-05, + "loss": 0.5131, + "step": 3970 + }, + { + "epoch": 0.60525415351861, + "grad_norm": 2.3561930755343656, + "learning_rate": 4.843958735251495e-05, + "loss": 0.4732, + "step": 3980 + }, + { + "epoch": 0.6067748925978025, + "grad_norm": 2.805536991964577, + "learning_rate": 4.842416476880296e-05, + "loss": 0.4692, + "step": 3990 + }, + { + "epoch": 0.6082956316769951, + "grad_norm": 3.321127496471528, + "learning_rate": 4.840866882342792e-05, + "loss": 0.4435, + "step": 4000 + }, + { + "epoch": 0.6098163707561876, + "grad_norm": 3.131358327110808, + "learning_rate": 4.8393099564921265e-05, + "loss": 0.4495, + "step": 4010 + }, + { + "epoch": 0.61133710983538, + "grad_norm": 3.672620997758315, + "learning_rate": 4.8377457042044044e-05, + "loss": 0.408, + "step": 4020 + }, + { + "epoch": 0.6128578489145725, + "grad_norm": 2.752457009145662, + "learning_rate": 4.836174130378675e-05, + "loss": 0.4528, + "step": 4030 + }, + { + "epoch": 0.614378587993765, + "grad_norm": 3.227082089135796, + "learning_rate": 4.834595239936917e-05, + "loss": 0.466, + "step": 4040 + }, + { + "epoch": 0.6158993270729575, + "grad_norm": 3.209531219724546, + "learning_rate": 4.833009037824028e-05, + "loss": 0.5086, + "step": 4050 + }, + { + "epoch": 0.61742006615215, + "grad_norm": 3.2158817941095323, + "learning_rate": 4.8314155290078e-05, + "loss": 0.4563, + "step": 4060 + }, + { + "epoch": 0.6189408052313424, + "grad_norm": 3.574873081779174, + "learning_rate": 4.829814718478911e-05, + "loss": 0.489, + "step": 4070 + }, + { + "epoch": 0.6204615443105349, + "grad_norm": 3.323681244843775, + "learning_rate": 4.828206611250906e-05, + "loss": 0.5022, + "step": 4080 + }, + { + "epoch": 0.6219822833897274, + "grad_norm": 2.47984810432678, + "learning_rate": 4.8265912123601854e-05, + "loss": 0.4018, + "step": 4090 + }, + { + "epoch": 0.6235030224689199, + "grad_norm": 2.941766206084687, + "learning_rate": 4.8249685268659815e-05, + "loss": 0.4555, + "step": 4100 + }, + { + "epoch": 0.6250237615481123, + "grad_norm": 2.893125584353245, + "learning_rate": 4.823338559850351e-05, + "loss": 0.478, + "step": 4110 + }, + { + "epoch": 0.6265445006273048, + "grad_norm": 2.9976112410415414, + "learning_rate": 4.8217013164181546e-05, + "loss": 0.4527, + "step": 4120 + }, + { + "epoch": 0.6280652397064973, + "grad_norm": 2.191934501498237, + "learning_rate": 4.820056801697041e-05, + "loss": 0.4435, + "step": 4130 + }, + { + "epoch": 0.6295859787856899, + "grad_norm": 3.379771869364421, + "learning_rate": 4.818405020837433e-05, + "loss": 0.4687, + "step": 4140 + }, + { + "epoch": 0.6311067178648824, + "grad_norm": 2.8027121615751516, + "learning_rate": 4.816745979012508e-05, + "loss": 0.4526, + "step": 4150 + }, + { + "epoch": 0.6326274569440749, + "grad_norm": 3.7850638979127558, + "learning_rate": 4.815079681418187e-05, + "loss": 0.4841, + "step": 4160 + }, + { + "epoch": 0.6341481960232673, + "grad_norm": 2.4537601131071933, + "learning_rate": 4.813406133273111e-05, + "loss": 0.4378, + "step": 4170 + }, + { + "epoch": 0.6356689351024598, + "grad_norm": 2.547932463003559, + "learning_rate": 4.811725339818633e-05, + "loss": 0.4487, + "step": 4180 + }, + { + "epoch": 0.6371896741816523, + "grad_norm": 3.8957786182096927, + "learning_rate": 4.810037306318795e-05, + "loss": 0.4524, + "step": 4190 + }, + { + "epoch": 0.6387104132608448, + "grad_norm": 3.649088004867976, + "learning_rate": 4.8083420380603165e-05, + "loss": 0.459, + "step": 4200 + }, + { + "epoch": 0.6402311523400372, + "grad_norm": 2.6474149147687385, + "learning_rate": 4.80663954035257e-05, + "loss": 0.5525, + "step": 4210 + }, + { + "epoch": 0.6417518914192297, + "grad_norm": 2.8460577658948214, + "learning_rate": 4.804929818527576e-05, + "loss": 0.4642, + "step": 4220 + }, + { + "epoch": 0.6432726304984222, + "grad_norm": 2.2834250665494036, + "learning_rate": 4.803212877939977e-05, + "loss": 0.4234, + "step": 4230 + }, + { + "epoch": 0.6447933695776147, + "grad_norm": 2.715136104909643, + "learning_rate": 4.8014887239670233e-05, + "loss": 0.4544, + "step": 4240 + }, + { + "epoch": 0.6463141086568072, + "grad_norm": 3.547930077314492, + "learning_rate": 4.7997573620085576e-05, + "loss": 0.4374, + "step": 4250 + }, + { + "epoch": 0.6478348477359996, + "grad_norm": 2.5161321352085997, + "learning_rate": 4.798018797486998e-05, + "loss": 0.498, + "step": 4260 + }, + { + "epoch": 0.6493555868151922, + "grad_norm": 2.9736016338211826, + "learning_rate": 4.796273035847318e-05, + "loss": 0.498, + "step": 4270 + }, + { + "epoch": 0.6508763258943847, + "grad_norm": 2.945996766212601, + "learning_rate": 4.7945200825570335e-05, + "loss": 0.4522, + "step": 4280 + }, + { + "epoch": 0.6523970649735772, + "grad_norm": 4.44259056849006, + "learning_rate": 4.792759943106183e-05, + "loss": 0.51, + "step": 4290 + }, + { + "epoch": 0.6539178040527697, + "grad_norm": 2.848673639424329, + "learning_rate": 4.790992623007312e-05, + "loss": 0.492, + "step": 4300 + }, + { + "epoch": 0.6554385431319621, + "grad_norm": 3.18289063918082, + "learning_rate": 4.789218127795453e-05, + "loss": 0.4662, + "step": 4310 + }, + { + "epoch": 0.6569592822111546, + "grad_norm": 3.791707672716069, + "learning_rate": 4.787436463028111e-05, + "loss": 0.4509, + "step": 4320 + }, + { + "epoch": 0.6584800212903471, + "grad_norm": 3.0787742391863695, + "learning_rate": 4.7856476342852475e-05, + "loss": 0.4573, + "step": 4330 + }, + { + "epoch": 0.6600007603695396, + "grad_norm": 2.7041382458996885, + "learning_rate": 4.783851647169256e-05, + "loss": 0.4402, + "step": 4340 + }, + { + "epoch": 0.6615214994487321, + "grad_norm": 3.0586818300071132, + "learning_rate": 4.7820485073049544e-05, + "loss": 0.4678, + "step": 4350 + }, + { + "epoch": 0.6630422385279245, + "grad_norm": 3.0109404421666874, + "learning_rate": 4.780238220339558e-05, + "loss": 0.4586, + "step": 4360 + }, + { + "epoch": 0.664562977607117, + "grad_norm": 3.007176972550797, + "learning_rate": 4.778420791942668e-05, + "loss": 0.503, + "step": 4370 + }, + { + "epoch": 0.6660837166863095, + "grad_norm": 3.3501318710279495, + "learning_rate": 4.776596227806252e-05, + "loss": 0.4589, + "step": 4380 + }, + { + "epoch": 0.667604455765502, + "grad_norm": 3.3452990295599987, + "learning_rate": 4.7747645336446237e-05, + "loss": 0.4878, + "step": 4390 + }, + { + "epoch": 0.6691251948446946, + "grad_norm": 2.9415666914380503, + "learning_rate": 4.772925715194429e-05, + "loss": 0.4905, + "step": 4400 + }, + { + "epoch": 0.670645933923887, + "grad_norm": 2.0963394608887986, + "learning_rate": 4.771079778214627e-05, + "loss": 0.4309, + "step": 4410 + }, + { + "epoch": 0.6721666730030795, + "grad_norm": 2.4162552445823606, + "learning_rate": 4.769226728486469e-05, + "loss": 0.4711, + "step": 4420 + }, + { + "epoch": 0.673687412082272, + "grad_norm": 3.069913833418239, + "learning_rate": 4.767366571813484e-05, + "loss": 0.4321, + "step": 4430 + }, + { + "epoch": 0.6752081511614645, + "grad_norm": 2.8326378928076044, + "learning_rate": 4.765499314021458e-05, + "loss": 0.4613, + "step": 4440 + }, + { + "epoch": 0.676728890240657, + "grad_norm": 3.241832775747423, + "learning_rate": 4.7636249609584185e-05, + "loss": 0.4569, + "step": 4450 + }, + { + "epoch": 0.6782496293198494, + "grad_norm": 3.187250483244871, + "learning_rate": 4.7617435184946125e-05, + "loss": 0.4432, + "step": 4460 + }, + { + "epoch": 0.6797703683990419, + "grad_norm": 2.508060152550878, + "learning_rate": 4.759854992522492e-05, + "loss": 0.436, + "step": 4470 + }, + { + "epoch": 0.6812911074782344, + "grad_norm": 2.6989304729155683, + "learning_rate": 4.757959388956693e-05, + "loss": 0.49, + "step": 4480 + }, + { + "epoch": 0.6828118465574269, + "grad_norm": 2.458753567708561, + "learning_rate": 4.7560567137340175e-05, + "loss": 0.4525, + "step": 4490 + }, + { + "epoch": 0.6843325856366194, + "grad_norm": 3.2346137590822512, + "learning_rate": 4.7541469728134133e-05, + "loss": 0.4973, + "step": 4500 + }, + { + "epoch": 0.6858533247158118, + "grad_norm": 1.9014444826365784, + "learning_rate": 4.752230172175962e-05, + "loss": 0.4123, + "step": 4510 + }, + { + "epoch": 0.6873740637950043, + "grad_norm": 2.6470741339543795, + "learning_rate": 4.750306317824851e-05, + "loss": 0.4532, + "step": 4520 + }, + { + "epoch": 0.6888948028741969, + "grad_norm": 3.2531525859216286, + "learning_rate": 4.74837541578536e-05, + "loss": 0.4527, + "step": 4530 + }, + { + "epoch": 0.6904155419533894, + "grad_norm": 3.4578148212872835, + "learning_rate": 4.746437472104842e-05, + "loss": 0.5209, + "step": 4540 + }, + { + "epoch": 0.6919362810325819, + "grad_norm": 3.224905088278856, + "learning_rate": 4.7444924928527033e-05, + "loss": 0.4546, + "step": 4550 + }, + { + "epoch": 0.6934570201117743, + "grad_norm": 3.6406256431648765, + "learning_rate": 4.742540484120385e-05, + "loss": 0.4286, + "step": 4560 + }, + { + "epoch": 0.6949777591909668, + "grad_norm": 2.6694328881529144, + "learning_rate": 4.740581452021343e-05, + "loss": 0.4499, + "step": 4570 + }, + { + "epoch": 0.6964984982701593, + "grad_norm": 2.4099047975932506, + "learning_rate": 4.738615402691029e-05, + "loss": 0.4606, + "step": 4580 + }, + { + "epoch": 0.6980192373493518, + "grad_norm": 2.5476297943516437, + "learning_rate": 4.736642342286874e-05, + "loss": 0.4704, + "step": 4590 + }, + { + "epoch": 0.6995399764285443, + "grad_norm": 3.0123907539967334, + "learning_rate": 4.734662276988265e-05, + "loss": 0.4852, + "step": 4600 + }, + { + "epoch": 0.7010607155077367, + "grad_norm": 3.1932446281268927, + "learning_rate": 4.732675212996529e-05, + "loss": 0.4408, + "step": 4610 + }, + { + "epoch": 0.7025814545869292, + "grad_norm": 3.0138581076520614, + "learning_rate": 4.7306811565349094e-05, + "loss": 0.4104, + "step": 4620 + }, + { + "epoch": 0.7041021936661217, + "grad_norm": 2.6384517492999877, + "learning_rate": 4.728680113848553e-05, + "loss": 0.455, + "step": 4630 + }, + { + "epoch": 0.7056229327453142, + "grad_norm": 2.7954806796561074, + "learning_rate": 4.726672091204483e-05, + "loss": 0.46, + "step": 4640 + }, + { + "epoch": 0.7071436718245067, + "grad_norm": 3.050300428846164, + "learning_rate": 4.7246570948915846e-05, + "loss": 0.4189, + "step": 4650 + }, + { + "epoch": 0.7086644109036992, + "grad_norm": 2.718898360041631, + "learning_rate": 4.722635131220583e-05, + "loss": 0.457, + "step": 4660 + }, + { + "epoch": 0.7101851499828917, + "grad_norm": 3.8307235689636325, + "learning_rate": 4.720606206524027e-05, + "loss": 0.44, + "step": 4670 + }, + { + "epoch": 0.7117058890620842, + "grad_norm": 2.8414990342537667, + "learning_rate": 4.7185703271562625e-05, + "loss": 0.4645, + "step": 4680 + }, + { + "epoch": 0.7132266281412767, + "grad_norm": 2.864070117661034, + "learning_rate": 4.716527499493419e-05, + "loss": 0.462, + "step": 4690 + }, + { + "epoch": 0.7147473672204692, + "grad_norm": 4.015865057641727, + "learning_rate": 4.7144777299333875e-05, + "loss": 0.4274, + "step": 4700 + }, + { + "epoch": 0.7162681062996616, + "grad_norm": 2.82155523991619, + "learning_rate": 4.712421024895799e-05, + "loss": 0.4887, + "step": 4710 + }, + { + "epoch": 0.7177888453788541, + "grad_norm": 2.4814048508019875, + "learning_rate": 4.7103573908220046e-05, + "loss": 0.4785, + "step": 4720 + }, + { + "epoch": 0.7193095844580466, + "grad_norm": 2.884853459231838, + "learning_rate": 4.7082868341750596e-05, + "loss": 0.5046, + "step": 4730 + }, + { + "epoch": 0.7208303235372391, + "grad_norm": 2.6703151852461513, + "learning_rate": 4.706209361439697e-05, + "loss": 0.4807, + "step": 4740 + }, + { + "epoch": 0.7223510626164316, + "grad_norm": 2.46610802334064, + "learning_rate": 4.704124979122312e-05, + "loss": 0.4557, + "step": 4750 + }, + { + "epoch": 0.723871801695624, + "grad_norm": 2.3749951265131015, + "learning_rate": 4.702033693750938e-05, + "loss": 0.4913, + "step": 4760 + }, + { + "epoch": 0.7253925407748165, + "grad_norm": 2.720512825485018, + "learning_rate": 4.69993551187523e-05, + "loss": 0.4758, + "step": 4770 + }, + { + "epoch": 0.726913279854009, + "grad_norm": 3.324510944776037, + "learning_rate": 4.6978304400664394e-05, + "loss": 0.4608, + "step": 4780 + }, + { + "epoch": 0.7284340189332016, + "grad_norm": 2.9379378635290743, + "learning_rate": 4.695718484917399e-05, + "loss": 0.4906, + "step": 4790 + }, + { + "epoch": 0.7299547580123941, + "grad_norm": 2.710299319504829, + "learning_rate": 4.6935996530424976e-05, + "loss": 0.4722, + "step": 4800 + }, + { + "epoch": 0.7314754970915865, + "grad_norm": 3.575653010394522, + "learning_rate": 4.6914739510776615e-05, + "loss": 0.504, + "step": 4810 + }, + { + "epoch": 0.732996236170779, + "grad_norm": 3.2888535956645626, + "learning_rate": 4.689341385680333e-05, + "loss": 0.4566, + "step": 4820 + }, + { + "epoch": 0.7345169752499715, + "grad_norm": 4.878303253483945, + "learning_rate": 4.6872019635294504e-05, + "loss": 0.408, + "step": 4830 + }, + { + "epoch": 0.736037714329164, + "grad_norm": 2.518227559739081, + "learning_rate": 4.685055691325426e-05, + "loss": 0.4366, + "step": 4840 + }, + { + "epoch": 0.7375584534083565, + "grad_norm": 3.2706048558436174, + "learning_rate": 4.682902575790126e-05, + "loss": 0.4742, + "step": 4850 + }, + { + "epoch": 0.7390791924875489, + "grad_norm": 2.755945611319533, + "learning_rate": 4.6807426236668486e-05, + "loss": 0.4481, + "step": 4860 + }, + { + "epoch": 0.7405999315667414, + "grad_norm": 2.9649725094521644, + "learning_rate": 4.678575841720305e-05, + "loss": 0.4565, + "step": 4870 + }, + { + "epoch": 0.7421206706459339, + "grad_norm": 3.29200532495098, + "learning_rate": 4.6764022367365936e-05, + "loss": 0.495, + "step": 4880 + }, + { + "epoch": 0.7436414097251264, + "grad_norm": 2.7028292999304697, + "learning_rate": 4.6742218155231836e-05, + "loss": 0.4616, + "step": 4890 + }, + { + "epoch": 0.7451621488043189, + "grad_norm": 2.8764222484492157, + "learning_rate": 4.672034584908893e-05, + "loss": 0.4447, + "step": 4900 + }, + { + "epoch": 0.7466828878835113, + "grad_norm": 3.163154407849368, + "learning_rate": 4.669840551743864e-05, + "loss": 0.4766, + "step": 4910 + }, + { + "epoch": 0.7482036269627039, + "grad_norm": 3.8542317891453006, + "learning_rate": 4.6676397228995436e-05, + "loss": 0.4808, + "step": 4920 + }, + { + "epoch": 0.7497243660418964, + "grad_norm": 3.792089071745496, + "learning_rate": 4.665432105268663e-05, + "loss": 0.5018, + "step": 4930 + }, + { + "epoch": 0.7512451051210889, + "grad_norm": 2.586407431540178, + "learning_rate": 4.663217705765216e-05, + "loss": 0.4416, + "step": 4940 + }, + { + "epoch": 0.7527658442002814, + "grad_norm": 2.650176003001353, + "learning_rate": 4.660996531324433e-05, + "loss": 0.498, + "step": 4950 + }, + { + "epoch": 0.7542865832794738, + "grad_norm": 3.391657965995569, + "learning_rate": 4.658768588902767e-05, + "loss": 0.4772, + "step": 4960 + }, + { + "epoch": 0.7558073223586663, + "grad_norm": 2.4114827718494056, + "learning_rate": 4.656533885477864e-05, + "loss": 0.4735, + "step": 4970 + }, + { + "epoch": 0.7573280614378588, + "grad_norm": 3.4350109394387256, + "learning_rate": 4.654292428048546e-05, + "loss": 0.4379, + "step": 4980 + }, + { + "epoch": 0.7588488005170513, + "grad_norm": 2.537815957167368, + "learning_rate": 4.6520442236347885e-05, + "loss": 0.4685, + "step": 4990 + }, + { + "epoch": 0.7603695395962438, + "grad_norm": 3.6021415220203656, + "learning_rate": 4.6497892792776955e-05, + "loss": 0.4715, + "step": 5000 + }, + { + "epoch": 0.7618902786754362, + "grad_norm": 2.4699909787594168, + "learning_rate": 4.647527602039483e-05, + "loss": 0.4304, + "step": 5010 + }, + { + "epoch": 0.7634110177546287, + "grad_norm": 2.9637623354254248, + "learning_rate": 4.645259199003451e-05, + "loss": 0.4233, + "step": 5020 + }, + { + "epoch": 0.7649317568338212, + "grad_norm": 4.685700669403354, + "learning_rate": 4.642984077273964e-05, + "loss": 0.5218, + "step": 5030 + }, + { + "epoch": 0.7664524959130137, + "grad_norm": 2.423393639695851, + "learning_rate": 4.6407022439764305e-05, + "loss": 0.5202, + "step": 5040 + }, + { + "epoch": 0.7679732349922063, + "grad_norm": 2.6126533816022555, + "learning_rate": 4.6384137062572767e-05, + "loss": 0.5016, + "step": 5050 + }, + { + "epoch": 0.7694939740713987, + "grad_norm": 3.2601116027596917, + "learning_rate": 4.636118471283927e-05, + "loss": 0.4868, + "step": 5060 + }, + { + "epoch": 0.7710147131505912, + "grad_norm": 2.5939985298517296, + "learning_rate": 4.6338165462447816e-05, + "loss": 0.4697, + "step": 5070 + }, + { + "epoch": 0.7725354522297837, + "grad_norm": 2.964211942479611, + "learning_rate": 4.631507938349192e-05, + "loss": 0.4474, + "step": 5080 + }, + { + "epoch": 0.7740561913089762, + "grad_norm": 2.4908558751072385, + "learning_rate": 4.62919265482744e-05, + "loss": 0.4568, + "step": 5090 + }, + { + "epoch": 0.7755769303881687, + "grad_norm": 3.044301223668193, + "learning_rate": 4.6268707029307156e-05, + "loss": 0.4706, + "step": 5100 + }, + { + "epoch": 0.7770976694673611, + "grad_norm": 2.8387381770191347, + "learning_rate": 4.624542089931091e-05, + "loss": 0.4038, + "step": 5110 + }, + { + "epoch": 0.7786184085465536, + "grad_norm": 2.8435680566211414, + "learning_rate": 4.622206823121503e-05, + "loss": 0.4882, + "step": 5120 + }, + { + "epoch": 0.7801391476257461, + "grad_norm": 3.0194643535859234, + "learning_rate": 4.619864909815726e-05, + "loss": 0.4759, + "step": 5130 + }, + { + "epoch": 0.7816598867049386, + "grad_norm": 2.891342639096013, + "learning_rate": 4.617516357348349e-05, + "loss": 0.5088, + "step": 5140 + }, + { + "epoch": 0.7831806257841311, + "grad_norm": 3.200196420226114, + "learning_rate": 4.615161173074757e-05, + "loss": 0.4279, + "step": 5150 + }, + { + "epoch": 0.7847013648633235, + "grad_norm": 4.442763916393894, + "learning_rate": 4.6127993643711034e-05, + "loss": 0.512, + "step": 5160 + }, + { + "epoch": 0.786222103942516, + "grad_norm": 2.9047343834130346, + "learning_rate": 4.6104309386342884e-05, + "loss": 0.4742, + "step": 5170 + }, + { + "epoch": 0.7877428430217086, + "grad_norm": 2.61668813869526, + "learning_rate": 4.608055903281935e-05, + "loss": 0.4537, + "step": 5180 + }, + { + "epoch": 0.7892635821009011, + "grad_norm": 3.253496309332458, + "learning_rate": 4.605674265752369e-05, + "loss": 0.4649, + "step": 5190 + }, + { + "epoch": 0.7907843211800936, + "grad_norm": 3.5048636819107055, + "learning_rate": 4.6032860335045924e-05, + "loss": 0.4892, + "step": 5200 + }, + { + "epoch": 0.792305060259286, + "grad_norm": 2.8012067427222216, + "learning_rate": 4.6008912140182617e-05, + "loss": 0.4631, + "step": 5210 + }, + { + "epoch": 0.7938257993384785, + "grad_norm": 3.406760585253113, + "learning_rate": 4.598489814793661e-05, + "loss": 0.4743, + "step": 5220 + }, + { + "epoch": 0.795346538417671, + "grad_norm": 2.9804451729495844, + "learning_rate": 4.596081843351685e-05, + "loss": 0.4812, + "step": 5230 + }, + { + "epoch": 0.7968672774968635, + "grad_norm": 2.961819902411992, + "learning_rate": 4.593667307233811e-05, + "loss": 0.4259, + "step": 5240 + }, + { + "epoch": 0.798388016576056, + "grad_norm": 3.810154135060747, + "learning_rate": 4.591246214002073e-05, + "loss": 0.4937, + "step": 5250 + }, + { + "epoch": 0.7999087556552484, + "grad_norm": 2.408879519704649, + "learning_rate": 4.5888185712390444e-05, + "loss": 0.4651, + "step": 5260 + }, + { + "epoch": 0.8014294947344409, + "grad_norm": 2.1951506956266784, + "learning_rate": 4.586384386547811e-05, + "loss": 0.4703, + "step": 5270 + }, + { + "epoch": 0.8029502338136334, + "grad_norm": 2.6966674107097353, + "learning_rate": 4.5839436675519454e-05, + "loss": 0.4844, + "step": 5280 + }, + { + "epoch": 0.8044709728928259, + "grad_norm": 2.4866097756367953, + "learning_rate": 4.581496421895486e-05, + "loss": 0.4858, + "step": 5290 + }, + { + "epoch": 0.8059917119720184, + "grad_norm": 2.704963366869489, + "learning_rate": 4.57904265724291e-05, + "loss": 0.4961, + "step": 5300 + }, + { + "epoch": 0.8075124510512108, + "grad_norm": 2.3968592839538307, + "learning_rate": 4.576582381279114e-05, + "loss": 0.4607, + "step": 5310 + }, + { + "epoch": 0.8090331901304034, + "grad_norm": 2.9244157844444505, + "learning_rate": 4.574115601709386e-05, + "loss": 0.4412, + "step": 5320 + }, + { + "epoch": 0.8105539292095959, + "grad_norm": 2.947964009650227, + "learning_rate": 4.5716423262593825e-05, + "loss": 0.4555, + "step": 5330 + }, + { + "epoch": 0.8120746682887884, + "grad_norm": 3.2293151572875436, + "learning_rate": 4.569162562675104e-05, + "loss": 0.4587, + "step": 5340 + }, + { + "epoch": 0.8135954073679809, + "grad_norm": 2.822812608679261, + "learning_rate": 4.566676318722872e-05, + "loss": 0.476, + "step": 5350 + }, + { + "epoch": 0.8151161464471733, + "grad_norm": 2.757717524454528, + "learning_rate": 4.564183602189302e-05, + "loss": 0.4914, + "step": 5360 + }, + { + "epoch": 0.8166368855263658, + "grad_norm": 2.8212686419770465, + "learning_rate": 4.5616844208812826e-05, + "loss": 0.4145, + "step": 5370 + }, + { + "epoch": 0.8181576246055583, + "grad_norm": 2.6990622259235946, + "learning_rate": 4.5591787826259495e-05, + "loss": 0.5029, + "step": 5380 + }, + { + "epoch": 0.8196783636847508, + "grad_norm": 3.3442714600420542, + "learning_rate": 4.5566666952706595e-05, + "loss": 0.4634, + "step": 5390 + }, + { + "epoch": 0.8211991027639433, + "grad_norm": 2.671875475754362, + "learning_rate": 4.554148166682967e-05, + "loss": 0.4826, + "step": 5400 + }, + { + "epoch": 0.8227198418431357, + "grad_norm": 2.6960102280740026, + "learning_rate": 4.5516232047506e-05, + "loss": 0.4728, + "step": 5410 + }, + { + "epoch": 0.8242405809223282, + "grad_norm": 3.245896464017568, + "learning_rate": 4.5490918173814364e-05, + "loss": 0.482, + "step": 5420 + }, + { + "epoch": 0.8257613200015207, + "grad_norm": 2.299524695820454, + "learning_rate": 4.546554012503476e-05, + "loss": 0.4137, + "step": 5430 + }, + { + "epoch": 0.8272820590807132, + "grad_norm": 2.7315211370899286, + "learning_rate": 4.544009798064818e-05, + "loss": 0.4561, + "step": 5440 + }, + { + "epoch": 0.8288027981599058, + "grad_norm": 2.9803554905467116, + "learning_rate": 4.541459182033635e-05, + "loss": 0.4527, + "step": 5450 + }, + { + "epoch": 0.8303235372390982, + "grad_norm": 3.3072919752090053, + "learning_rate": 4.538902172398151e-05, + "loss": 0.4467, + "step": 5460 + }, + { + "epoch": 0.8318442763182907, + "grad_norm": 2.752221002247515, + "learning_rate": 4.53633877716661e-05, + "loss": 0.5052, + "step": 5470 + }, + { + "epoch": 0.8333650153974832, + "grad_norm": 3.3301027419884135, + "learning_rate": 4.5337690043672596e-05, + "loss": 0.4632, + "step": 5480 + }, + { + "epoch": 0.8348857544766757, + "grad_norm": 2.9617013929303804, + "learning_rate": 4.531192862048316e-05, + "loss": 0.4603, + "step": 5490 + }, + { + "epoch": 0.8364064935558682, + "grad_norm": 2.833080477650345, + "learning_rate": 4.528610358277949e-05, + "loss": 0.4753, + "step": 5500 + }, + { + "epoch": 0.8379272326350606, + "grad_norm": 3.02141151642277, + "learning_rate": 4.5260215011442485e-05, + "loss": 0.4838, + "step": 5510 + }, + { + "epoch": 0.8394479717142531, + "grad_norm": 3.1692008114922054, + "learning_rate": 4.523426298755203e-05, + "loss": 0.4543, + "step": 5520 + }, + { + "epoch": 0.8409687107934456, + "grad_norm": 2.885944432530924, + "learning_rate": 4.520824759238674e-05, + "loss": 0.4061, + "step": 5530 + }, + { + "epoch": 0.8424894498726381, + "grad_norm": 3.36401816712044, + "learning_rate": 4.518216890742371e-05, + "loss": 0.4753, + "step": 5540 + }, + { + "epoch": 0.8440101889518306, + "grad_norm": 2.342834496594284, + "learning_rate": 4.515602701433822e-05, + "loss": 0.4441, + "step": 5550 + }, + { + "epoch": 0.845530928031023, + "grad_norm": 2.5261454544271533, + "learning_rate": 4.512982199500354e-05, + "loss": 0.4539, + "step": 5560 + }, + { + "epoch": 0.8470516671102155, + "grad_norm": 2.691992473720647, + "learning_rate": 4.510355393149064e-05, + "loss": 0.4878, + "step": 5570 + }, + { + "epoch": 0.8485724061894081, + "grad_norm": 3.157163713867332, + "learning_rate": 4.5077222906067935e-05, + "loss": 0.4418, + "step": 5580 + }, + { + "epoch": 0.8500931452686006, + "grad_norm": 2.5917304954690437, + "learning_rate": 4.505082900120101e-05, + "loss": 0.4691, + "step": 5590 + }, + { + "epoch": 0.8516138843477931, + "grad_norm": 2.6381539233296367, + "learning_rate": 4.502437229955241e-05, + "loss": 0.4846, + "step": 5600 + }, + { + "epoch": 0.8531346234269855, + "grad_norm": 2.9756687449936265, + "learning_rate": 4.499785288398133e-05, + "loss": 0.4338, + "step": 5610 + }, + { + "epoch": 0.854655362506178, + "grad_norm": 3.590281873551749, + "learning_rate": 4.497127083754339e-05, + "loss": 0.4938, + "step": 5620 + }, + { + "epoch": 0.8561761015853705, + "grad_norm": 3.515369396641824, + "learning_rate": 4.494462624349036e-05, + "loss": 0.4728, + "step": 5630 + }, + { + "epoch": 0.857696840664563, + "grad_norm": 2.46632658256501, + "learning_rate": 4.4917919185269895e-05, + "loss": 0.5013, + "step": 5640 + }, + { + "epoch": 0.8592175797437555, + "grad_norm": 2.832388750175382, + "learning_rate": 4.4891149746525294e-05, + "loss": 0.4919, + "step": 5650 + }, + { + "epoch": 0.8607383188229479, + "grad_norm": 3.3835500841823447, + "learning_rate": 4.4864318011095196e-05, + "loss": 0.4701, + "step": 5660 + }, + { + "epoch": 0.8622590579021404, + "grad_norm": 2.5787552820373976, + "learning_rate": 4.483742406301339e-05, + "loss": 0.4815, + "step": 5670 + }, + { + "epoch": 0.8637797969813329, + "grad_norm": 2.937858271690391, + "learning_rate": 4.481046798650846e-05, + "loss": 0.4687, + "step": 5680 + }, + { + "epoch": 0.8653005360605254, + "grad_norm": 2.874079427895431, + "learning_rate": 4.47834498660036e-05, + "loss": 0.4625, + "step": 5690 + }, + { + "epoch": 0.8668212751397179, + "grad_norm": 2.963576910508528, + "learning_rate": 4.4756369786116315e-05, + "loss": 0.4645, + "step": 5700 + }, + { + "epoch": 0.8683420142189104, + "grad_norm": 2.4242734324413213, + "learning_rate": 4.4729227831658146e-05, + "loss": 0.4975, + "step": 5710 + }, + { + "epoch": 0.8698627532981029, + "grad_norm": 3.7520501687093635, + "learning_rate": 4.4702024087634434e-05, + "loss": 0.472, + "step": 5720 + }, + { + "epoch": 0.8713834923772954, + "grad_norm": 3.1323861031683395, + "learning_rate": 4.467475863924402e-05, + "loss": 0.4901, + "step": 5730 + }, + { + "epoch": 0.8729042314564879, + "grad_norm": 2.5988600376011703, + "learning_rate": 4.464743157187901e-05, + "loss": 0.4669, + "step": 5740 + }, + { + "epoch": 0.8744249705356804, + "grad_norm": 3.3888954899287276, + "learning_rate": 4.4620042971124485e-05, + "loss": 0.491, + "step": 5750 + }, + { + "epoch": 0.8759457096148728, + "grad_norm": 2.6401813306603525, + "learning_rate": 4.459259292275825e-05, + "loss": 0.461, + "step": 5760 + }, + { + "epoch": 0.8774664486940653, + "grad_norm": 3.4263156404926547, + "learning_rate": 4.4565081512750554e-05, + "loss": 0.4977, + "step": 5770 + }, + { + "epoch": 0.8789871877732578, + "grad_norm": 3.472536302453718, + "learning_rate": 4.4537508827263795e-05, + "loss": 0.5038, + "step": 5780 + }, + { + "epoch": 0.8805079268524503, + "grad_norm": 3.356501625013648, + "learning_rate": 4.450987495265233e-05, + "loss": 0.4823, + "step": 5790 + }, + { + "epoch": 0.8820286659316428, + "grad_norm": 3.02520735512741, + "learning_rate": 4.448217997546212e-05, + "loss": 0.4292, + "step": 5800 + }, + { + "epoch": 0.8835494050108352, + "grad_norm": 2.5849512780911925, + "learning_rate": 4.4454423982430495e-05, + "loss": 0.4138, + "step": 5810 + }, + { + "epoch": 0.8850701440900277, + "grad_norm": 2.9807449243616433, + "learning_rate": 4.4426607060485876e-05, + "loss": 0.469, + "step": 5820 + }, + { + "epoch": 0.8865908831692202, + "grad_norm": 2.7148140982105633, + "learning_rate": 4.439872929674752e-05, + "loss": 0.4885, + "step": 5830 + }, + { + "epoch": 0.8881116222484128, + "grad_norm": 3.0001870998466478, + "learning_rate": 4.4370790778525225e-05, + "loss": 0.4474, + "step": 5840 + }, + { + "epoch": 0.8896323613276053, + "grad_norm": 2.755199472374878, + "learning_rate": 4.4342791593319075e-05, + "loss": 0.4399, + "step": 5850 + }, + { + "epoch": 0.8911531004067977, + "grad_norm": 1.979375659184468, + "learning_rate": 4.4314731828819126e-05, + "loss": 0.4687, + "step": 5860 + }, + { + "epoch": 0.8926738394859902, + "grad_norm": 3.3414032675089347, + "learning_rate": 4.4286611572905195e-05, + "loss": 0.4412, + "step": 5870 + }, + { + "epoch": 0.8941945785651827, + "grad_norm": 2.44699956144009, + "learning_rate": 4.425843091364654e-05, + "loss": 0.4192, + "step": 5880 + }, + { + "epoch": 0.8957153176443752, + "grad_norm": 2.8437873742970297, + "learning_rate": 4.423018993930157e-05, + "loss": 0.4092, + "step": 5890 + }, + { + "epoch": 0.8972360567235677, + "grad_norm": 3.802906450549967, + "learning_rate": 4.4201888738317646e-05, + "loss": 0.4363, + "step": 5900 + }, + { + "epoch": 0.8987567958027601, + "grad_norm": 2.1216497827232965, + "learning_rate": 4.41735273993307e-05, + "loss": 0.4513, + "step": 5910 + }, + { + "epoch": 0.9002775348819526, + "grad_norm": 2.1757255212034554, + "learning_rate": 4.414510601116504e-05, + "loss": 0.4666, + "step": 5920 + }, + { + "epoch": 0.9017982739611451, + "grad_norm": 2.683714283658416, + "learning_rate": 4.411662466283302e-05, + "loss": 0.4383, + "step": 5930 + }, + { + "epoch": 0.9033190130403376, + "grad_norm": 3.0108177056507524, + "learning_rate": 4.4088083443534806e-05, + "loss": 0.4548, + "step": 5940 + }, + { + "epoch": 0.90483975211953, + "grad_norm": 2.5551922287006903, + "learning_rate": 4.4059482442658054e-05, + "loss": 0.44, + "step": 5950 + }, + { + "epoch": 0.9063604911987225, + "grad_norm": 2.725152163862961, + "learning_rate": 4.403082174977765e-05, + "loss": 0.4535, + "step": 5960 + }, + { + "epoch": 0.9078812302779151, + "grad_norm": 2.9082566644406147, + "learning_rate": 4.4002101454655444e-05, + "loss": 0.4564, + "step": 5970 + }, + { + "epoch": 0.9094019693571076, + "grad_norm": 3.2358422833677407, + "learning_rate": 4.397332164723992e-05, + "loss": 0.4541, + "step": 5980 + }, + { + "epoch": 0.9109227084363001, + "grad_norm": 2.371090420490626, + "learning_rate": 4.3944482417665986e-05, + "loss": 0.466, + "step": 5990 + }, + { + "epoch": 0.9124434475154926, + "grad_norm": 3.2688576374956035, + "learning_rate": 4.3915583856254646e-05, + "loss": 0.46, + "step": 6000 + }, + { + "epoch": 0.913964186594685, + "grad_norm": 3.470651307428788, + "learning_rate": 4.3886626053512694e-05, + "loss": 0.5075, + "step": 6010 + }, + { + "epoch": 0.9154849256738775, + "grad_norm": 2.6352202729101446, + "learning_rate": 4.385760910013248e-05, + "loss": 0.4101, + "step": 6020 + }, + { + "epoch": 0.91700566475307, + "grad_norm": 2.2278217203497084, + "learning_rate": 4.382853308699162e-05, + "loss": 0.4656, + "step": 6030 + }, + { + "epoch": 0.9185264038322625, + "grad_norm": 3.1696287871841293, + "learning_rate": 4.379939810515268e-05, + "loss": 0.4463, + "step": 6040 + }, + { + "epoch": 0.920047142911455, + "grad_norm": 2.284711857421693, + "learning_rate": 4.3770204245862915e-05, + "loss": 0.4324, + "step": 6050 + }, + { + "epoch": 0.9215678819906474, + "grad_norm": 2.6816471619508704, + "learning_rate": 4.374095160055398e-05, + "loss": 0.5003, + "step": 6060 + }, + { + "epoch": 0.9230886210698399, + "grad_norm": 2.303082413814844, + "learning_rate": 4.371164026084163e-05, + "loss": 0.457, + "step": 6070 + }, + { + "epoch": 0.9246093601490324, + "grad_norm": 3.2219581191846203, + "learning_rate": 4.368227031852545e-05, + "loss": 0.483, + "step": 6080 + }, + { + "epoch": 0.9261300992282249, + "grad_norm": 2.8200763691964115, + "learning_rate": 4.365284186558858e-05, + "loss": 0.4738, + "step": 6090 + }, + { + "epoch": 0.9276508383074175, + "grad_norm": 2.2059517726516273, + "learning_rate": 4.362335499419736e-05, + "loss": 0.438, + "step": 6100 + }, + { + "epoch": 0.9291715773866099, + "grad_norm": 3.1161792288961947, + "learning_rate": 4.3593809796701146e-05, + "loss": 0.4482, + "step": 6110 + }, + { + "epoch": 0.9306923164658024, + "grad_norm": 2.389377488590752, + "learning_rate": 4.356420636563193e-05, + "loss": 0.4608, + "step": 6120 + }, + { + "epoch": 0.9322130555449949, + "grad_norm": 2.1121677112663275, + "learning_rate": 4.3534544793704093e-05, + "loss": 0.4336, + "step": 6130 + }, + { + "epoch": 0.9337337946241874, + "grad_norm": 3.2344050660000714, + "learning_rate": 4.35048251738141e-05, + "loss": 0.4686, + "step": 6140 + }, + { + "epoch": 0.9352545337033799, + "grad_norm": 3.3233386962513025, + "learning_rate": 4.347504759904023e-05, + "loss": 0.403, + "step": 6150 + }, + { + "epoch": 0.9367752727825723, + "grad_norm": 2.8393190252745, + "learning_rate": 4.3445212162642254e-05, + "loss": 0.4852, + "step": 6160 + }, + { + "epoch": 0.9382960118617648, + "grad_norm": 3.224354472612487, + "learning_rate": 4.341531895806118e-05, + "loss": 0.4998, + "step": 6170 + }, + { + "epoch": 0.9398167509409573, + "grad_norm": 2.489967934098991, + "learning_rate": 4.3385368078918906e-05, + "loss": 0.4429, + "step": 6180 + }, + { + "epoch": 0.9413374900201498, + "grad_norm": 3.4277391106528707, + "learning_rate": 4.335535961901799e-05, + "loss": 0.4556, + "step": 6190 + }, + { + "epoch": 0.9428582290993422, + "grad_norm": 2.9005513112389325, + "learning_rate": 4.3325293672341303e-05, + "loss": 0.4791, + "step": 6200 + }, + { + "epoch": 0.9443789681785347, + "grad_norm": 2.178886857683573, + "learning_rate": 4.329517033305178e-05, + "loss": 0.4518, + "step": 6210 + }, + { + "epoch": 0.9458997072577272, + "grad_norm": 2.506566458756905, + "learning_rate": 4.3264989695492095e-05, + "loss": 0.4679, + "step": 6220 + }, + { + "epoch": 0.9474204463369198, + "grad_norm": 3.373603874796829, + "learning_rate": 4.3234751854184366e-05, + "loss": 0.4257, + "step": 6230 + }, + { + "epoch": 0.9489411854161123, + "grad_norm": 2.511945815322995, + "learning_rate": 4.3204456903829856e-05, + "loss": 0.4571, + "step": 6240 + }, + { + "epoch": 0.9504619244953048, + "grad_norm": 2.54930670410594, + "learning_rate": 4.3174104939308725e-05, + "loss": 0.4501, + "step": 6250 + }, + { + "epoch": 0.9519826635744972, + "grad_norm": 3.1389539343604786, + "learning_rate": 4.314369605567966e-05, + "loss": 0.4356, + "step": 6260 + }, + { + "epoch": 0.9535034026536897, + "grad_norm": 2.568856717815106, + "learning_rate": 4.311323034817961e-05, + "loss": 0.4438, + "step": 6270 + }, + { + "epoch": 0.9550241417328822, + "grad_norm": 3.4183427612510866, + "learning_rate": 4.308270791222352e-05, + "loss": 0.4633, + "step": 6280 + }, + { + "epoch": 0.9565448808120747, + "grad_norm": 2.6606861443908874, + "learning_rate": 4.3052128843403984e-05, + "loss": 0.4488, + "step": 6290 + }, + { + "epoch": 0.9580656198912672, + "grad_norm": 2.6617292006212137, + "learning_rate": 4.3021493237490944e-05, + "loss": 0.4787, + "step": 6300 + }, + { + "epoch": 0.9595863589704596, + "grad_norm": 2.0087825352990047, + "learning_rate": 4.299080119043144e-05, + "loss": 0.4539, + "step": 6310 + }, + { + "epoch": 0.9611070980496521, + "grad_norm": 3.0282297007319126, + "learning_rate": 4.296005279834928e-05, + "loss": 0.4644, + "step": 6320 + }, + { + "epoch": 0.9626278371288446, + "grad_norm": 3.465152636811805, + "learning_rate": 4.29292481575447e-05, + "loss": 0.4527, + "step": 6330 + }, + { + "epoch": 0.9641485762080371, + "grad_norm": 2.7104346692604753, + "learning_rate": 4.289838736449414e-05, + "loss": 0.4626, + "step": 6340 + }, + { + "epoch": 0.9656693152872295, + "grad_norm": 2.714955759421443, + "learning_rate": 4.286747051584989e-05, + "loss": 0.4272, + "step": 6350 + }, + { + "epoch": 0.9671900543664221, + "grad_norm": 3.1173699584714627, + "learning_rate": 4.2836497708439784e-05, + "loss": 0.4859, + "step": 6360 + }, + { + "epoch": 0.9687107934456146, + "grad_norm": 1.8592027242486544, + "learning_rate": 4.2805469039266934e-05, + "loss": 0.4599, + "step": 6370 + }, + { + "epoch": 0.9702315325248071, + "grad_norm": 2.4034178405385096, + "learning_rate": 4.2774384605509395e-05, + "loss": 0.4405, + "step": 6380 + }, + { + "epoch": 0.9717522716039996, + "grad_norm": 2.2929810582228933, + "learning_rate": 4.274324450451986e-05, + "loss": 0.4369, + "step": 6390 + }, + { + "epoch": 0.973273010683192, + "grad_norm": 3.7757171952239754, + "learning_rate": 4.271204883382539e-05, + "loss": 0.4651, + "step": 6400 + }, + { + "epoch": 0.9747937497623845, + "grad_norm": 2.577236436546833, + "learning_rate": 4.268079769112706e-05, + "loss": 0.4635, + "step": 6410 + }, + { + "epoch": 0.976314488841577, + "grad_norm": 2.646306736933796, + "learning_rate": 4.264949117429968e-05, + "loss": 0.4462, + "step": 6420 + }, + { + "epoch": 0.9778352279207695, + "grad_norm": 2.9957257975311764, + "learning_rate": 4.261812938139151e-05, + "loss": 0.4541, + "step": 6430 + }, + { + "epoch": 0.979355966999962, + "grad_norm": 2.698706233581602, + "learning_rate": 4.258671241062388e-05, + "loss": 0.4114, + "step": 6440 + }, + { + "epoch": 0.9808767060791544, + "grad_norm": 2.9199555532288524, + "learning_rate": 4.255524036039098e-05, + "loss": 0.4723, + "step": 6450 + }, + { + "epoch": 0.9823974451583469, + "grad_norm": 2.37198984593648, + "learning_rate": 4.2523713329259484e-05, + "loss": 0.5112, + "step": 6460 + }, + { + "epoch": 0.9839181842375394, + "grad_norm": 2.929893514403328, + "learning_rate": 4.249213141596824e-05, + "loss": 0.4793, + "step": 6470 + }, + { + "epoch": 0.9854389233167319, + "grad_norm": 2.3354155151916807, + "learning_rate": 4.246049471942801e-05, + "loss": 0.4294, + "step": 6480 + }, + { + "epoch": 0.9869596623959244, + "grad_norm": 2.770808272180371, + "learning_rate": 4.2428803338721114e-05, + "loss": 0.4716, + "step": 6490 + }, + { + "epoch": 0.988480401475117, + "grad_norm": 2.758112780042773, + "learning_rate": 4.239705737310114e-05, + "loss": 0.4305, + "step": 6500 + }, + { + "epoch": 0.9900011405543094, + "grad_norm": 3.028712682657119, + "learning_rate": 4.236525692199261e-05, + "loss": 0.4185, + "step": 6510 + }, + { + "epoch": 0.9915218796335019, + "grad_norm": 2.8941293673672166, + "learning_rate": 4.233340208499074e-05, + "loss": 0.4557, + "step": 6520 + }, + { + "epoch": 0.9930426187126944, + "grad_norm": 2.870957351688323, + "learning_rate": 4.230149296186102e-05, + "loss": 0.4599, + "step": 6530 + }, + { + "epoch": 0.9945633577918869, + "grad_norm": 3.400601856741602, + "learning_rate": 4.2269529652538995e-05, + "loss": 0.457, + "step": 6540 + }, + { + "epoch": 0.9960840968710793, + "grad_norm": 2.831974714760935, + "learning_rate": 4.22375122571299e-05, + "loss": 0.433, + "step": 6550 + }, + { + "epoch": 0.9976048359502718, + "grad_norm": 2.891754786335497, + "learning_rate": 4.2205440875908345e-05, + "loss": 0.4783, + "step": 6560 + }, + { + "epoch": 0.9991255750294643, + "grad_norm": 2.9556624793001993, + "learning_rate": 4.217331560931804e-05, + "loss": 0.483, + "step": 6570 + }, + { + "epoch": 1.0006463141086568, + "grad_norm": 2.6326617881109406, + "learning_rate": 4.214113655797146e-05, + "loss": 0.3874, + "step": 6580 + }, + { + "epoch": 1.0021670531878493, + "grad_norm": 2.3484848386790613, + "learning_rate": 4.210890382264952e-05, + "loss": 0.3006, + "step": 6590 + }, + { + "epoch": 1.0036877922670417, + "grad_norm": 2.677898750672153, + "learning_rate": 4.2076617504301254e-05, + "loss": 0.3039, + "step": 6600 + }, + { + "epoch": 1.0052085313462342, + "grad_norm": 10.813699814463863, + "learning_rate": 4.2044277704043523e-05, + "loss": 0.2944, + "step": 6610 + }, + { + "epoch": 1.0067292704254267, + "grad_norm": 2.2780846544936626, + "learning_rate": 4.201188452316069e-05, + "loss": 0.3083, + "step": 6620 + }, + { + "epoch": 1.0082500095046192, + "grad_norm": 2.7917265597956296, + "learning_rate": 4.1979438063104304e-05, + "loss": 0.3158, + "step": 6630 + }, + { + "epoch": 1.0097707485838117, + "grad_norm": 2.7936035179112473, + "learning_rate": 4.1946938425492765e-05, + "loss": 0.2928, + "step": 6640 + }, + { + "epoch": 1.0112914876630041, + "grad_norm": 2.890227286734542, + "learning_rate": 4.1914385712111026e-05, + "loss": 0.2758, + "step": 6650 + }, + { + "epoch": 1.0128122267421966, + "grad_norm": 4.5585252411174695, + "learning_rate": 4.188178002491025e-05, + "loss": 0.2664, + "step": 6660 + }, + { + "epoch": 1.014332965821389, + "grad_norm": 2.3466804741210865, + "learning_rate": 4.184912146600754e-05, + "loss": 0.3214, + "step": 6670 + }, + { + "epoch": 1.0158537049005816, + "grad_norm": 2.111379772439604, + "learning_rate": 4.181641013768557e-05, + "loss": 0.2996, + "step": 6680 + }, + { + "epoch": 1.0173744439797743, + "grad_norm": 3.020141596153829, + "learning_rate": 4.1783646142392266e-05, + "loss": 0.3155, + "step": 6690 + }, + { + "epoch": 1.0188951830589668, + "grad_norm": 3.420581674984006, + "learning_rate": 4.175082958274053e-05, + "loss": 0.3106, + "step": 6700 + }, + { + "epoch": 1.0204159221381592, + "grad_norm": 2.3686855381907574, + "learning_rate": 4.171796056150786e-05, + "loss": 0.2566, + "step": 6710 + }, + { + "epoch": 1.0219366612173517, + "grad_norm": 4.488712640618961, + "learning_rate": 4.168503918163608e-05, + "loss": 0.3244, + "step": 6720 + }, + { + "epoch": 1.0234574002965442, + "grad_norm": 2.3267273702038023, + "learning_rate": 4.1652065546231e-05, + "loss": 0.2723, + "step": 6730 + }, + { + "epoch": 1.0249781393757367, + "grad_norm": 1.9614603401141328, + "learning_rate": 4.161903975856205e-05, + "loss": 0.2861, + "step": 6740 + }, + { + "epoch": 1.0264988784549292, + "grad_norm": 2.5473894947699995, + "learning_rate": 4.1585961922062046e-05, + "loss": 0.2931, + "step": 6750 + }, + { + "epoch": 1.0280196175341216, + "grad_norm": 2.890318389429616, + "learning_rate": 4.155283214032676e-05, + "loss": 0.3175, + "step": 6760 + }, + { + "epoch": 1.0295403566133141, + "grad_norm": 2.333986344892157, + "learning_rate": 4.1519650517114703e-05, + "loss": 0.2912, + "step": 6770 + }, + { + "epoch": 1.0310610956925066, + "grad_norm": 2.4948276090486043, + "learning_rate": 4.148641715634671e-05, + "loss": 0.2732, + "step": 6780 + }, + { + "epoch": 1.032581834771699, + "grad_norm": 2.773494977629346, + "learning_rate": 4.1453132162105666e-05, + "loss": 0.2939, + "step": 6790 + }, + { + "epoch": 1.0341025738508915, + "grad_norm": 2.3044251833253715, + "learning_rate": 4.141979563863617e-05, + "loss": 0.296, + "step": 6800 + }, + { + "epoch": 1.035623312930084, + "grad_norm": 1.7004098552219704, + "learning_rate": 4.138640769034419e-05, + "loss": 0.2765, + "step": 6810 + }, + { + "epoch": 1.0371440520092765, + "grad_norm": 2.312925554823324, + "learning_rate": 4.1352968421796774e-05, + "loss": 0.2474, + "step": 6820 + }, + { + "epoch": 1.038664791088469, + "grad_norm": 2.7146336744848196, + "learning_rate": 4.131947793772166e-05, + "loss": 0.3513, + "step": 6830 + }, + { + "epoch": 1.0401855301676615, + "grad_norm": 2.4967294835731852, + "learning_rate": 4.128593634300704e-05, + "loss": 0.2932, + "step": 6840 + }, + { + "epoch": 1.041706269246854, + "grad_norm": 2.745315579775212, + "learning_rate": 4.1252343742701117e-05, + "loss": 0.2906, + "step": 6850 + }, + { + "epoch": 1.0432270083260464, + "grad_norm": 2.289650741540806, + "learning_rate": 4.12187002420119e-05, + "loss": 0.2798, + "step": 6860 + }, + { + "epoch": 1.044747747405239, + "grad_norm": 2.762698151193697, + "learning_rate": 4.1185005946306754e-05, + "loss": 0.2782, + "step": 6870 + }, + { + "epoch": 1.0462684864844314, + "grad_norm": 2.4704815867357985, + "learning_rate": 4.115126096111218e-05, + "loss": 0.2709, + "step": 6880 + }, + { + "epoch": 1.0477892255636239, + "grad_norm": 2.793118016559259, + "learning_rate": 4.111746539211339e-05, + "loss": 0.2838, + "step": 6890 + }, + { + "epoch": 1.0493099646428163, + "grad_norm": 2.430159684686915, + "learning_rate": 4.108361934515405e-05, + "loss": 0.281, + "step": 6900 + }, + { + "epoch": 1.0508307037220088, + "grad_norm": 2.4219024805843175, + "learning_rate": 4.104972292623589e-05, + "loss": 0.31, + "step": 6910 + }, + { + "epoch": 1.0523514428012013, + "grad_norm": 1.9020584220977228, + "learning_rate": 4.1015776241518416e-05, + "loss": 0.2725, + "step": 6920 + }, + { + "epoch": 1.0538721818803938, + "grad_norm": 2.3643624139189003, + "learning_rate": 4.098177939731856e-05, + "loss": 0.2986, + "step": 6930 + }, + { + "epoch": 1.0553929209595863, + "grad_norm": 2.870750777061021, + "learning_rate": 4.0947732500110345e-05, + "loss": 0.3116, + "step": 6940 + }, + { + "epoch": 1.056913660038779, + "grad_norm": 2.4253297391385695, + "learning_rate": 4.091363565652455e-05, + "loss": 0.2663, + "step": 6950 + }, + { + "epoch": 1.0584343991179714, + "grad_norm": 2.201903500574825, + "learning_rate": 4.0879488973348376e-05, + "loss": 0.2713, + "step": 6960 + }, + { + "epoch": 1.059955138197164, + "grad_norm": 2.1321285823771516, + "learning_rate": 4.084529255752513e-05, + "loss": 0.2575, + "step": 6970 + }, + { + "epoch": 1.0614758772763564, + "grad_norm": 3.5677859994747827, + "learning_rate": 4.0811046516153864e-05, + "loss": 0.3086, + "step": 6980 + }, + { + "epoch": 1.0629966163555489, + "grad_norm": 2.8279969470057806, + "learning_rate": 4.077675095648906e-05, + "loss": 0.3084, + "step": 6990 + }, + { + "epoch": 1.0645173554347414, + "grad_norm": 2.4595888359022524, + "learning_rate": 4.074240598594028e-05, + "loss": 0.3067, + "step": 7000 + }, + { + "epoch": 1.0660380945139338, + "grad_norm": 3.1126523682810734, + "learning_rate": 4.0708011712071834e-05, + "loss": 0.2934, + "step": 7010 + }, + { + "epoch": 1.0675588335931263, + "grad_norm": 2.358952169754945, + "learning_rate": 4.067356824260244e-05, + "loss": 0.3216, + "step": 7020 + }, + { + "epoch": 1.0690795726723188, + "grad_norm": 2.644865712521293, + "learning_rate": 4.063907568540491e-05, + "loss": 0.2937, + "step": 7030 + }, + { + "epoch": 1.0706003117515113, + "grad_norm": 3.0696659945463622, + "learning_rate": 4.060453414850577e-05, + "loss": 0.3, + "step": 7040 + }, + { + "epoch": 1.0721210508307037, + "grad_norm": 2.4298838582676057, + "learning_rate": 4.056994374008495e-05, + "loss": 0.268, + "step": 7050 + }, + { + "epoch": 1.0736417899098962, + "grad_norm": 2.651331900415168, + "learning_rate": 4.053530456847545e-05, + "loss": 0.268, + "step": 7060 + }, + { + "epoch": 1.0751625289890887, + "grad_norm": 1.7436125380356489, + "learning_rate": 4.0500616742162984e-05, + "loss": 0.3049, + "step": 7070 + }, + { + "epoch": 1.0766832680682812, + "grad_norm": 2.7694128507644287, + "learning_rate": 4.046588036978564e-05, + "loss": 0.3074, + "step": 7080 + }, + { + "epoch": 1.0782040071474737, + "grad_norm": 2.2340818356991266, + "learning_rate": 4.043109556013356e-05, + "loss": 0.2706, + "step": 7090 + }, + { + "epoch": 1.0797247462266661, + "grad_norm": 2.801192116432972, + "learning_rate": 4.039626242214858e-05, + "loss": 0.3111, + "step": 7100 + }, + { + "epoch": 1.0812454853058586, + "grad_norm": 2.1621888258586646, + "learning_rate": 4.0361381064923874e-05, + "loss": 0.2822, + "step": 7110 + }, + { + "epoch": 1.082766224385051, + "grad_norm": 2.9769885560226053, + "learning_rate": 4.032645159770368e-05, + "loss": 0.3146, + "step": 7120 + }, + { + "epoch": 1.0842869634642436, + "grad_norm": 2.8281421158925557, + "learning_rate": 4.0291474129882874e-05, + "loss": 0.3254, + "step": 7130 + }, + { + "epoch": 1.085807702543436, + "grad_norm": 2.5159243142653476, + "learning_rate": 4.025644877100666e-05, + "loss": 0.2806, + "step": 7140 + }, + { + "epoch": 1.0873284416226285, + "grad_norm": 2.2694241907589023, + "learning_rate": 4.0221375630770256e-05, + "loss": 0.2941, + "step": 7150 + }, + { + "epoch": 1.088849180701821, + "grad_norm": 2.5009996629807665, + "learning_rate": 4.0186254819018504e-05, + "loss": 0.2694, + "step": 7160 + }, + { + "epoch": 1.0903699197810135, + "grad_norm": 3.0379324105298253, + "learning_rate": 4.015108644574557e-05, + "loss": 0.312, + "step": 7170 + }, + { + "epoch": 1.091890658860206, + "grad_norm": 1.545917443985011, + "learning_rate": 4.011587062109456e-05, + "loss": 0.2855, + "step": 7180 + }, + { + "epoch": 1.0934113979393985, + "grad_norm": 2.7326990494901287, + "learning_rate": 4.008060745535718e-05, + "loss": 0.2958, + "step": 7190 + }, + { + "epoch": 1.094932137018591, + "grad_norm": 3.082055478849537, + "learning_rate": 4.004529705897343e-05, + "loss": 0.3018, + "step": 7200 + }, + { + "epoch": 1.0964528760977834, + "grad_norm": 2.203404739678904, + "learning_rate": 4.000993954253122e-05, + "loss": 0.2794, + "step": 7210 + }, + { + "epoch": 1.0979736151769761, + "grad_norm": 2.474576997932723, + "learning_rate": 3.9974535016766045e-05, + "loss": 0.3197, + "step": 7220 + }, + { + "epoch": 1.0994943542561686, + "grad_norm": 2.3611522585632696, + "learning_rate": 3.9939083592560586e-05, + "loss": 0.2809, + "step": 7230 + }, + { + "epoch": 1.101015093335361, + "grad_norm": 3.0124336080791574, + "learning_rate": 3.990358538094446e-05, + "loss": 0.2721, + "step": 7240 + }, + { + "epoch": 1.1025358324145536, + "grad_norm": 3.420009548470919, + "learning_rate": 3.986804049309379e-05, + "loss": 0.2706, + "step": 7250 + }, + { + "epoch": 1.104056571493746, + "grad_norm": 2.6798268999972925, + "learning_rate": 3.983244904033087e-05, + "loss": 0.3119, + "step": 7260 + }, + { + "epoch": 1.1055773105729385, + "grad_norm": 3.344934438773394, + "learning_rate": 3.979681113412385e-05, + "loss": 0.3052, + "step": 7270 + }, + { + "epoch": 1.107098049652131, + "grad_norm": 2.877554575784066, + "learning_rate": 3.9761126886086356e-05, + "loss": 0.303, + "step": 7280 + }, + { + "epoch": 1.1086187887313235, + "grad_norm": 2.388565366487381, + "learning_rate": 3.9725396407977166e-05, + "loss": 0.2972, + "step": 7290 + }, + { + "epoch": 1.110139527810516, + "grad_norm": 2.2712321325089406, + "learning_rate": 3.968961981169983e-05, + "loss": 0.2528, + "step": 7300 + }, + { + "epoch": 1.1116602668897084, + "grad_norm": 2.643024464957487, + "learning_rate": 3.965379720930233e-05, + "loss": 0.3306, + "step": 7310 + }, + { + "epoch": 1.113181005968901, + "grad_norm": 1.7687715931469454, + "learning_rate": 3.961792871297675e-05, + "loss": 0.3129, + "step": 7320 + }, + { + "epoch": 1.1147017450480934, + "grad_norm": 2.955445614627385, + "learning_rate": 3.95820144350589e-05, + "loss": 0.2855, + "step": 7330 + }, + { + "epoch": 1.1162224841272859, + "grad_norm": 2.85984961447341, + "learning_rate": 3.954605448802798e-05, + "loss": 0.2678, + "step": 7340 + }, + { + "epoch": 1.1177432232064783, + "grad_norm": 2.1859356200772435, + "learning_rate": 3.95100489845062e-05, + "loss": 0.2859, + "step": 7350 + }, + { + "epoch": 1.1192639622856708, + "grad_norm": 2.344592445395288, + "learning_rate": 3.9473998037258475e-05, + "loss": 0.2938, + "step": 7360 + }, + { + "epoch": 1.1207847013648633, + "grad_norm": 2.6778883749837354, + "learning_rate": 3.943790175919201e-05, + "loss": 0.2926, + "step": 7370 + }, + { + "epoch": 1.1223054404440558, + "grad_norm": 2.9528934772761417, + "learning_rate": 3.9401760263356025e-05, + "loss": 0.3133, + "step": 7380 + }, + { + "epoch": 1.1238261795232483, + "grad_norm": 2.2006345359705097, + "learning_rate": 3.936557366294132e-05, + "loss": 0.2875, + "step": 7390 + }, + { + "epoch": 1.1253469186024407, + "grad_norm": 10.942153256767018, + "learning_rate": 3.9329342071279975e-05, + "loss": 0.2847, + "step": 7400 + }, + { + "epoch": 1.1268676576816332, + "grad_norm": 2.7728633391027335, + "learning_rate": 3.929306560184495e-05, + "loss": 0.2419, + "step": 7410 + }, + { + "epoch": 1.1283883967608257, + "grad_norm": 2.3707264401548054, + "learning_rate": 3.925674436824981e-05, + "loss": 0.2731, + "step": 7420 + }, + { + "epoch": 1.1299091358400182, + "grad_norm": 2.8118448588737324, + "learning_rate": 3.922037848424826e-05, + "loss": 0.274, + "step": 7430 + }, + { + "epoch": 1.1314298749192107, + "grad_norm": 3.0985190856750995, + "learning_rate": 3.918396806373389e-05, + "loss": 0.2877, + "step": 7440 + }, + { + "epoch": 1.1329506139984031, + "grad_norm": 2.4639649486154322, + "learning_rate": 3.914751322073974e-05, + "loss": 0.303, + "step": 7450 + }, + { + "epoch": 1.1344713530775956, + "grad_norm": 2.4943565102540894, + "learning_rate": 3.911101406943798e-05, + "loss": 0.2567, + "step": 7460 + }, + { + "epoch": 1.1359920921567883, + "grad_norm": 2.5391776003288715, + "learning_rate": 3.907447072413958e-05, + "loss": 0.3071, + "step": 7470 + }, + { + "epoch": 1.1375128312359806, + "grad_norm": 2.498032098366562, + "learning_rate": 3.903788329929386e-05, + "loss": 0.2699, + "step": 7480 + }, + { + "epoch": 1.1390335703151733, + "grad_norm": 2.607227874502264, + "learning_rate": 3.9001251909488266e-05, + "loss": 0.313, + "step": 7490 + }, + { + "epoch": 1.1405543093943658, + "grad_norm": 2.0619990281628837, + "learning_rate": 3.8964576669447886e-05, + "loss": 0.304, + "step": 7500 + }, + { + "epoch": 1.1420750484735582, + "grad_norm": 2.485540166086768, + "learning_rate": 3.892785769403514e-05, + "loss": 0.2759, + "step": 7510 + }, + { + "epoch": 1.1435957875527507, + "grad_norm": 2.4842749762844494, + "learning_rate": 3.889109509824946e-05, + "loss": 0.2972, + "step": 7520 + }, + { + "epoch": 1.1451165266319432, + "grad_norm": 2.6184891925692777, + "learning_rate": 3.8854288997226856e-05, + "loss": 0.293, + "step": 7530 + }, + { + "epoch": 1.1466372657111357, + "grad_norm": 2.700487515560249, + "learning_rate": 3.881743950623961e-05, + "loss": 0.282, + "step": 7540 + }, + { + "epoch": 1.1481580047903281, + "grad_norm": 2.0608493424437517, + "learning_rate": 3.8780546740695874e-05, + "loss": 0.3015, + "step": 7550 + }, + { + "epoch": 1.1496787438695206, + "grad_norm": 2.2477767289461403, + "learning_rate": 3.8743610816139366e-05, + "loss": 0.3185, + "step": 7560 + }, + { + "epoch": 1.151199482948713, + "grad_norm": 1.8485982236120895, + "learning_rate": 3.870663184824894e-05, + "loss": 0.2879, + "step": 7570 + }, + { + "epoch": 1.1527202220279056, + "grad_norm": 2.488928176797832, + "learning_rate": 3.8669609952838284e-05, + "loss": 0.3118, + "step": 7580 + }, + { + "epoch": 1.154240961107098, + "grad_norm": 1.993075849947279, + "learning_rate": 3.8632545245855504e-05, + "loss": 0.3002, + "step": 7590 + }, + { + "epoch": 1.1557617001862905, + "grad_norm": 2.666660892675978, + "learning_rate": 3.85954378433828e-05, + "loss": 0.2946, + "step": 7600 + }, + { + "epoch": 1.157282439265483, + "grad_norm": 2.640371352995608, + "learning_rate": 3.8558287861636084e-05, + "loss": 0.3197, + "step": 7610 + }, + { + "epoch": 1.1588031783446755, + "grad_norm": 2.600142726659339, + "learning_rate": 3.852109541696464e-05, + "loss": 0.286, + "step": 7620 + }, + { + "epoch": 1.160323917423868, + "grad_norm": 2.5984125538897658, + "learning_rate": 3.8483860625850706e-05, + "loss": 0.249, + "step": 7630 + }, + { + "epoch": 1.1618446565030605, + "grad_norm": 3.1731314596771165, + "learning_rate": 3.844658360490918e-05, + "loss": 0.3128, + "step": 7640 + }, + { + "epoch": 1.163365395582253, + "grad_norm": 2.2037268109627535, + "learning_rate": 3.8409264470887204e-05, + "loss": 0.2897, + "step": 7650 + }, + { + "epoch": 1.1648861346614454, + "grad_norm": 2.752818441511239, + "learning_rate": 3.8371903340663796e-05, + "loss": 0.3076, + "step": 7660 + }, + { + "epoch": 1.166406873740638, + "grad_norm": 2.4426583975833176, + "learning_rate": 3.8334500331249524e-05, + "loss": 0.2545, + "step": 7670 + }, + { + "epoch": 1.1679276128198304, + "grad_norm": 2.568136368452352, + "learning_rate": 3.829705555978611e-05, + "loss": 0.3104, + "step": 7680 + }, + { + "epoch": 1.1694483518990229, + "grad_norm": 2.8737053540852835, + "learning_rate": 3.825956914354607e-05, + "loss": 0.3106, + "step": 7690 + }, + { + "epoch": 1.1709690909782153, + "grad_norm": 2.151015653637833, + "learning_rate": 3.8222041199932335e-05, + "loss": 0.2622, + "step": 7700 + }, + { + "epoch": 1.1724898300574078, + "grad_norm": 2.375732216648787, + "learning_rate": 3.818447184647792e-05, + "loss": 0.3217, + "step": 7710 + }, + { + "epoch": 1.1740105691366005, + "grad_norm": 2.1958673028989995, + "learning_rate": 3.81468612008455e-05, + "loss": 0.2396, + "step": 7720 + }, + { + "epoch": 1.1755313082157928, + "grad_norm": 2.261786628523854, + "learning_rate": 3.810920938082709e-05, + "loss": 0.2626, + "step": 7730 + }, + { + "epoch": 1.1770520472949855, + "grad_norm": 2.3156584569427023, + "learning_rate": 3.807151650434367e-05, + "loss": 0.2692, + "step": 7740 + }, + { + "epoch": 1.1785727863741777, + "grad_norm": 2.592659448019222, + "learning_rate": 3.8033782689444784e-05, + "loss": 0.2794, + "step": 7750 + }, + { + "epoch": 1.1800935254533704, + "grad_norm": 2.14703377700402, + "learning_rate": 3.799600805430819e-05, + "loss": 0.2971, + "step": 7760 + }, + { + "epoch": 1.181614264532563, + "grad_norm": 2.9737381399934164, + "learning_rate": 3.79581927172395e-05, + "loss": 0.2886, + "step": 7770 + }, + { + "epoch": 1.1831350036117554, + "grad_norm": 2.3727303304477414, + "learning_rate": 3.792033679667179e-05, + "loss": 0.2871, + "step": 7780 + }, + { + "epoch": 1.1846557426909479, + "grad_norm": 3.499203890965845, + "learning_rate": 3.788244041116525e-05, + "loss": 0.2717, + "step": 7790 + }, + { + "epoch": 1.1861764817701403, + "grad_norm": 2.693961115799628, + "learning_rate": 3.7844503679406795e-05, + "loss": 0.2895, + "step": 7800 + }, + { + "epoch": 1.1876972208493328, + "grad_norm": 2.3977040906778537, + "learning_rate": 3.78065267202097e-05, + "loss": 0.3129, + "step": 7810 + }, + { + "epoch": 1.1892179599285253, + "grad_norm": 1.8211655914039755, + "learning_rate": 3.776850965251323e-05, + "loss": 0.2972, + "step": 7820 + }, + { + "epoch": 1.1907386990077178, + "grad_norm": 3.1482627978218285, + "learning_rate": 3.7730452595382246e-05, + "loss": 0.3254, + "step": 7830 + }, + { + "epoch": 1.1922594380869103, + "grad_norm": 2.6531255885346257, + "learning_rate": 3.76923556680069e-05, + "loss": 0.2938, + "step": 7840 + }, + { + "epoch": 1.1937801771661027, + "grad_norm": 1.9477743698950827, + "learning_rate": 3.765421898970215e-05, + "loss": 0.2972, + "step": 7850 + }, + { + "epoch": 1.1953009162452952, + "grad_norm": 3.0567428322551833, + "learning_rate": 3.7616042679907494e-05, + "loss": 0.2786, + "step": 7860 + }, + { + "epoch": 1.1968216553244877, + "grad_norm": 1.9420653153949747, + "learning_rate": 3.7577826858186524e-05, + "loss": 0.2838, + "step": 7870 + }, + { + "epoch": 1.1983423944036802, + "grad_norm": 2.705150162809909, + "learning_rate": 3.753957164422661e-05, + "loss": 0.2657, + "step": 7880 + }, + { + "epoch": 1.1998631334828727, + "grad_norm": 3.206876624779621, + "learning_rate": 3.7501277157838474e-05, + "loss": 0.3401, + "step": 7890 + }, + { + "epoch": 1.2013838725620651, + "grad_norm": 2.3022311694482425, + "learning_rate": 3.746294351895582e-05, + "loss": 0.2976, + "step": 7900 + }, + { + "epoch": 1.2029046116412576, + "grad_norm": 2.8216012183671157, + "learning_rate": 3.742457084763499e-05, + "loss": 0.3273, + "step": 7910 + }, + { + "epoch": 1.20442535072045, + "grad_norm": 2.196718691332546, + "learning_rate": 3.738615926405458e-05, + "loss": 0.2822, + "step": 7920 + }, + { + "epoch": 1.2059460897996426, + "grad_norm": 2.75972001903708, + "learning_rate": 3.734770888851504e-05, + "loss": 0.31, + "step": 7930 + }, + { + "epoch": 1.207466828878835, + "grad_norm": 2.67842779801343, + "learning_rate": 3.730921984143831e-05, + "loss": 0.3035, + "step": 7940 + }, + { + "epoch": 1.2089875679580275, + "grad_norm": 2.6350769106843077, + "learning_rate": 3.727069224336747e-05, + "loss": 0.2889, + "step": 7950 + }, + { + "epoch": 1.21050830703722, + "grad_norm": 2.7683493677073345, + "learning_rate": 3.7232126214966286e-05, + "loss": 0.28, + "step": 7960 + }, + { + "epoch": 1.2120290461164125, + "grad_norm": 2.2023346969252167, + "learning_rate": 3.719352187701893e-05, + "loss": 0.29, + "step": 7970 + }, + { + "epoch": 1.213549785195605, + "grad_norm": 3.0372586300740263, + "learning_rate": 3.715487935042955e-05, + "loss": 0.2862, + "step": 7980 + }, + { + "epoch": 1.2150705242747977, + "grad_norm": 2.036134832189659, + "learning_rate": 3.7116198756221864e-05, + "loss": 0.2773, + "step": 7990 + }, + { + "epoch": 1.21659126335399, + "grad_norm": 2.9632261542423626, + "learning_rate": 3.7077480215538854e-05, + "loss": 0.3041, + "step": 8000 + }, + { + "epoch": 1.2181120024331826, + "grad_norm": 2.76544126054891, + "learning_rate": 3.703872384964231e-05, + "loss": 0.3158, + "step": 8010 + }, + { + "epoch": 1.219632741512375, + "grad_norm": 2.2885642936439257, + "learning_rate": 3.6999929779912516e-05, + "loss": 0.2877, + "step": 8020 + }, + { + "epoch": 1.2211534805915676, + "grad_norm": 2.1075502618528263, + "learning_rate": 3.696109812784782e-05, + "loss": 0.2927, + "step": 8030 + }, + { + "epoch": 1.22267421967076, + "grad_norm": 2.409419031659738, + "learning_rate": 3.692222901506428e-05, + "loss": 0.2678, + "step": 8040 + }, + { + "epoch": 1.2241949587499525, + "grad_norm": 2.2857069340907756, + "learning_rate": 3.688332256329528e-05, + "loss": 0.2911, + "step": 8050 + }, + { + "epoch": 1.225715697829145, + "grad_norm": 2.4658095513311373, + "learning_rate": 3.684437889439114e-05, + "loss": 0.2729, + "step": 8060 + }, + { + "epoch": 1.2272364369083375, + "grad_norm": 1.8688872989834802, + "learning_rate": 3.6805398130318736e-05, + "loss": 0.3067, + "step": 8070 + }, + { + "epoch": 1.22875717598753, + "grad_norm": 2.53037801470446, + "learning_rate": 3.676638039316112e-05, + "loss": 0.3051, + "step": 8080 + }, + { + "epoch": 1.2302779150667225, + "grad_norm": 2.288626632712959, + "learning_rate": 3.672732580511717e-05, + "loss": 0.2832, + "step": 8090 + }, + { + "epoch": 1.231798654145915, + "grad_norm": 2.4895338516060783, + "learning_rate": 3.668823448850113e-05, + "loss": 0.2889, + "step": 8100 + }, + { + "epoch": 1.2333193932251074, + "grad_norm": 2.3065520873802177, + "learning_rate": 3.664910656574231e-05, + "loss": 0.2505, + "step": 8110 + }, + { + "epoch": 1.2348401323043, + "grad_norm": 3.030261180222603, + "learning_rate": 3.660994215938465e-05, + "loss": 0.2728, + "step": 8120 + }, + { + "epoch": 1.2363608713834924, + "grad_norm": 3.740960153047791, + "learning_rate": 3.657074139208633e-05, + "loss": 0.3012, + "step": 8130 + }, + { + "epoch": 1.2378816104626849, + "grad_norm": 2.4849409890552456, + "learning_rate": 3.6531504386619466e-05, + "loss": 0.3013, + "step": 8140 + }, + { + "epoch": 1.2394023495418773, + "grad_norm": 2.9576430174515376, + "learning_rate": 3.649223126586962e-05, + "loss": 0.2932, + "step": 8150 + }, + { + "epoch": 1.2409230886210698, + "grad_norm": 2.516659589443617, + "learning_rate": 3.645292215283548e-05, + "loss": 0.2905, + "step": 8160 + }, + { + "epoch": 1.2424438277002623, + "grad_norm": 3.1270087729129856, + "learning_rate": 3.6413577170628435e-05, + "loss": 0.2968, + "step": 8170 + }, + { + "epoch": 1.2439645667794548, + "grad_norm": 1.7256782600804983, + "learning_rate": 3.6374196442472255e-05, + "loss": 0.3402, + "step": 8180 + }, + { + "epoch": 1.2454853058586473, + "grad_norm": 2.3994544583081603, + "learning_rate": 3.633478009170263e-05, + "loss": 0.305, + "step": 8190 + }, + { + "epoch": 1.2470060449378397, + "grad_norm": 2.6361277051999568, + "learning_rate": 3.6295328241766826e-05, + "loss": 0.2771, + "step": 8200 + }, + { + "epoch": 1.2485267840170322, + "grad_norm": 1.9228515944188007, + "learning_rate": 3.625584101622328e-05, + "loss": 0.2699, + "step": 8210 + }, + { + "epoch": 1.2500475230962247, + "grad_norm": 2.277769357168177, + "learning_rate": 3.621631853874122e-05, + "loss": 0.2706, + "step": 8220 + }, + { + "epoch": 1.2515682621754172, + "grad_norm": 3.0925495664862703, + "learning_rate": 3.61767609331003e-05, + "loss": 0.3159, + "step": 8230 + }, + { + "epoch": 1.2530890012546099, + "grad_norm": 1.5040094826213677, + "learning_rate": 3.613716832319019e-05, + "loss": 0.2964, + "step": 8240 + }, + { + "epoch": 1.2546097403338021, + "grad_norm": 2.6512928022634825, + "learning_rate": 3.6097540833010135e-05, + "loss": 0.28, + "step": 8250 + }, + { + "epoch": 1.2561304794129948, + "grad_norm": 4.193322412459563, + "learning_rate": 3.605787858666869e-05, + "loss": 0.2882, + "step": 8260 + }, + { + "epoch": 1.257651218492187, + "grad_norm": 3.058933307428843, + "learning_rate": 3.601818170838323e-05, + "loss": 0.285, + "step": 8270 + }, + { + "epoch": 1.2591719575713798, + "grad_norm": 4.398463300043078, + "learning_rate": 3.5978450322479596e-05, + "loss": 0.3047, + "step": 8280 + }, + { + "epoch": 1.260692696650572, + "grad_norm": 2.8947539298519556, + "learning_rate": 3.59386845533917e-05, + "loss": 0.2801, + "step": 8290 + }, + { + "epoch": 1.2622134357297647, + "grad_norm": 1.8824547748463556, + "learning_rate": 3.589888452566115e-05, + "loss": 0.2407, + "step": 8300 + }, + { + "epoch": 1.2637341748089572, + "grad_norm": 1.7926263765679695, + "learning_rate": 3.5859050363936834e-05, + "loss": 0.3028, + "step": 8310 + }, + { + "epoch": 1.2652549138881497, + "grad_norm": 3.1309122142790216, + "learning_rate": 3.581918219297455e-05, + "loss": 0.2967, + "step": 8320 + }, + { + "epoch": 1.2667756529673422, + "grad_norm": 1.9295142556815468, + "learning_rate": 3.57792801376366e-05, + "loss": 0.2664, + "step": 8330 + }, + { + "epoch": 1.2682963920465347, + "grad_norm": 2.5565223450386143, + "learning_rate": 3.573934432289144e-05, + "loss": 0.2563, + "step": 8340 + }, + { + "epoch": 1.2698171311257271, + "grad_norm": 2.1868367873818544, + "learning_rate": 3.569937487381321e-05, + "loss": 0.3057, + "step": 8350 + }, + { + "epoch": 1.2713378702049196, + "grad_norm": 1.7560657812339475, + "learning_rate": 3.5659371915581406e-05, + "loss": 0.2892, + "step": 8360 + }, + { + "epoch": 1.272858609284112, + "grad_norm": 2.527689407532655, + "learning_rate": 3.5619335573480494e-05, + "loss": 0.2921, + "step": 8370 + }, + { + "epoch": 1.2743793483633046, + "grad_norm": 2.7878471899123114, + "learning_rate": 3.557926597289947e-05, + "loss": 0.2888, + "step": 8380 + }, + { + "epoch": 1.275900087442497, + "grad_norm": 2.5129692358771596, + "learning_rate": 3.55391632393315e-05, + "loss": 0.2596, + "step": 8390 + }, + { + "epoch": 1.2774208265216895, + "grad_norm": 2.2521142550939435, + "learning_rate": 3.5499027498373506e-05, + "loss": 0.2519, + "step": 8400 + }, + { + "epoch": 1.278941565600882, + "grad_norm": 2.818756454991938, + "learning_rate": 3.54588588757258e-05, + "loss": 0.2562, + "step": 8410 + }, + { + "epoch": 1.2804623046800745, + "grad_norm": 2.576771256059629, + "learning_rate": 3.541865749719167e-05, + "loss": 0.3075, + "step": 8420 + }, + { + "epoch": 1.281983043759267, + "grad_norm": 2.0346330660249263, + "learning_rate": 3.537842348867701e-05, + "loss": 0.2759, + "step": 8430 + }, + { + "epoch": 1.2835037828384595, + "grad_norm": 2.341486788869301, + "learning_rate": 3.533815697618986e-05, + "loss": 0.295, + "step": 8440 + }, + { + "epoch": 1.285024521917652, + "grad_norm": 2.2631771728426453, + "learning_rate": 3.52978580858401e-05, + "loss": 0.3014, + "step": 8450 + }, + { + "epoch": 1.2865452609968444, + "grad_norm": 2.2868045622031454, + "learning_rate": 3.5257526943839e-05, + "loss": 0.2648, + "step": 8460 + }, + { + "epoch": 1.2880660000760369, + "grad_norm": 2.1061817657490063, + "learning_rate": 3.5217163676498846e-05, + "loss": 0.2719, + "step": 8470 + }, + { + "epoch": 1.2895867391552294, + "grad_norm": 2.386225017842193, + "learning_rate": 3.517676841023252e-05, + "loss": 0.3012, + "step": 8480 + }, + { + "epoch": 1.2911074782344218, + "grad_norm": 3.1316754850912303, + "learning_rate": 3.513634127155314e-05, + "loss": 0.28, + "step": 8490 + }, + { + "epoch": 1.2926282173136143, + "grad_norm": 2.1283435500520307, + "learning_rate": 3.5095882387073635e-05, + "loss": 0.2683, + "step": 8500 + }, + { + "epoch": 1.294148956392807, + "grad_norm": 2.2456390691030017, + "learning_rate": 3.5055391883506354e-05, + "loss": 0.3087, + "step": 8510 + }, + { + "epoch": 1.2956696954719993, + "grad_norm": 3.6881150728137184, + "learning_rate": 3.501486988766268e-05, + "loss": 0.2928, + "step": 8520 + }, + { + "epoch": 1.297190434551192, + "grad_norm": 2.3713517475024237, + "learning_rate": 3.4974316526452626e-05, + "loss": 0.283, + "step": 8530 + }, + { + "epoch": 1.2987111736303842, + "grad_norm": 2.9653274418009783, + "learning_rate": 3.4933731926884436e-05, + "loss": 0.2769, + "step": 8540 + }, + { + "epoch": 1.300231912709577, + "grad_norm": 2.884079734277004, + "learning_rate": 3.4893116216064176e-05, + "loss": 0.2692, + "step": 8550 + }, + { + "epoch": 1.3017526517887694, + "grad_norm": 2.941810387259709, + "learning_rate": 3.485246952119537e-05, + "loss": 0.3057, + "step": 8560 + }, + { + "epoch": 1.303273390867962, + "grad_norm": 2.938875235577459, + "learning_rate": 3.4811791969578584e-05, + "loss": 0.2818, + "step": 8570 + }, + { + "epoch": 1.3047941299471544, + "grad_norm": 2.4323172504079147, + "learning_rate": 3.477108368861099e-05, + "loss": 0.27, + "step": 8580 + }, + { + "epoch": 1.3063148690263469, + "grad_norm": 3.8689532326722524, + "learning_rate": 3.473034480578603e-05, + "loss": 0.2636, + "step": 8590 + }, + { + "epoch": 1.3078356081055393, + "grad_norm": 5.66743239022323, + "learning_rate": 3.4689575448692975e-05, + "loss": 0.286, + "step": 8600 + }, + { + "epoch": 1.3093563471847318, + "grad_norm": 2.7054918042796334, + "learning_rate": 3.4648775745016554e-05, + "loss": 0.3271, + "step": 8610 + }, + { + "epoch": 1.3108770862639243, + "grad_norm": 1.9837404018057043, + "learning_rate": 3.460794582253651e-05, + "loss": 0.3085, + "step": 8620 + }, + { + "epoch": 1.3123978253431168, + "grad_norm": 2.1020396569109265, + "learning_rate": 3.456708580912725e-05, + "loss": 0.2987, + "step": 8630 + }, + { + "epoch": 1.3139185644223093, + "grad_norm": 3.32077144877692, + "learning_rate": 3.4526195832757414e-05, + "loss": 0.2983, + "step": 8640 + }, + { + "epoch": 1.3154393035015017, + "grad_norm": 2.5449140748548573, + "learning_rate": 3.448527602148948e-05, + "loss": 0.2525, + "step": 8650 + }, + { + "epoch": 1.3169600425806942, + "grad_norm": 2.695730399903775, + "learning_rate": 3.444432650347938e-05, + "loss": 0.3143, + "step": 8660 + }, + { + "epoch": 1.3184807816598867, + "grad_norm": 2.4262837257361607, + "learning_rate": 3.440334740697606e-05, + "loss": 0.2921, + "step": 8670 + }, + { + "epoch": 1.3200015207390792, + "grad_norm": 2.937061423075909, + "learning_rate": 3.4362338860321124e-05, + "loss": 0.3022, + "step": 8680 + }, + { + "epoch": 1.3215222598182716, + "grad_norm": 2.131811782453632, + "learning_rate": 3.4321300991948405e-05, + "loss": 0.2731, + "step": 8690 + }, + { + "epoch": 1.3230429988974641, + "grad_norm": 1.7495019307353712, + "learning_rate": 3.428023393038355e-05, + "loss": 0.2638, + "step": 8700 + }, + { + "epoch": 1.3245637379766566, + "grad_norm": 2.2245864519286416, + "learning_rate": 3.423913780424366e-05, + "loss": 0.2276, + "step": 8710 + }, + { + "epoch": 1.326084477055849, + "grad_norm": 2.907121835301637, + "learning_rate": 3.4198012742236845e-05, + "loss": 0.275, + "step": 8720 + }, + { + "epoch": 1.3276052161350416, + "grad_norm": 2.699874002034777, + "learning_rate": 3.415685887316186e-05, + "loss": 0.2953, + "step": 8730 + }, + { + "epoch": 1.329125955214234, + "grad_norm": 1.9406304740599793, + "learning_rate": 3.4115676325907646e-05, + "loss": 0.2547, + "step": 8740 + }, + { + "epoch": 1.3306466942934265, + "grad_norm": 2.783833478708033, + "learning_rate": 3.4074465229453004e-05, + "loss": 0.2863, + "step": 8750 + }, + { + "epoch": 1.3321674333726192, + "grad_norm": 2.8329758136005374, + "learning_rate": 3.403322571286611e-05, + "loss": 0.2947, + "step": 8760 + }, + { + "epoch": 1.3336881724518115, + "grad_norm": 2.32605398822267, + "learning_rate": 3.3991957905304184e-05, + "loss": 0.2986, + "step": 8770 + }, + { + "epoch": 1.3352089115310042, + "grad_norm": 2.4350922642211277, + "learning_rate": 3.395066193601299e-05, + "loss": 0.2863, + "step": 8780 + }, + { + "epoch": 1.3367296506101964, + "grad_norm": 2.7527802400230983, + "learning_rate": 3.3909337934326576e-05, + "loss": 0.3115, + "step": 8790 + }, + { + "epoch": 1.3382503896893891, + "grad_norm": 2.7740915140834645, + "learning_rate": 3.386798602966671e-05, + "loss": 0.2932, + "step": 8800 + }, + { + "epoch": 1.3397711287685814, + "grad_norm": 2.2925772958130577, + "learning_rate": 3.3826606351542575e-05, + "loss": 0.2654, + "step": 8810 + }, + { + "epoch": 1.341291867847774, + "grad_norm": 2.446175059797116, + "learning_rate": 3.378519902955035e-05, + "loss": 0.2826, + "step": 8820 + }, + { + "epoch": 1.3428126069269666, + "grad_norm": 3.3637350167800375, + "learning_rate": 3.3743764193372786e-05, + "loss": 0.321, + "step": 8830 + }, + { + "epoch": 1.344333346006159, + "grad_norm": 1.9601507472175537, + "learning_rate": 3.37023019727788e-05, + "loss": 0.2724, + "step": 8840 + }, + { + "epoch": 1.3458540850853515, + "grad_norm": 2.6883730954402325, + "learning_rate": 3.366081249762305e-05, + "loss": 0.2897, + "step": 8850 + }, + { + "epoch": 1.347374824164544, + "grad_norm": 1.9088335775122864, + "learning_rate": 3.36192958978456e-05, + "loss": 0.2887, + "step": 8860 + }, + { + "epoch": 1.3488955632437365, + "grad_norm": 2.2652234692435522, + "learning_rate": 3.357775230347143e-05, + "loss": 0.2797, + "step": 8870 + }, + { + "epoch": 1.350416302322929, + "grad_norm": 2.729328446630985, + "learning_rate": 3.353618184461008e-05, + "loss": 0.2622, + "step": 8880 + }, + { + "epoch": 1.3519370414021215, + "grad_norm": 2.167176230497568, + "learning_rate": 3.349458465145522e-05, + "loss": 0.2869, + "step": 8890 + }, + { + "epoch": 1.353457780481314, + "grad_norm": 2.818663369003388, + "learning_rate": 3.3452960854284244e-05, + "loss": 0.2664, + "step": 8900 + }, + { + "epoch": 1.3549785195605064, + "grad_norm": 2.962271187638817, + "learning_rate": 3.341131058345788e-05, + "loss": 0.2617, + "step": 8910 + }, + { + "epoch": 1.356499258639699, + "grad_norm": 2.344929327911788, + "learning_rate": 3.3369633969419745e-05, + "loss": 0.2723, + "step": 8920 + }, + { + "epoch": 1.3580199977188914, + "grad_norm": 2.7941671452214267, + "learning_rate": 3.3327931142695996e-05, + "loss": 0.2885, + "step": 8930 + }, + { + "epoch": 1.3595407367980838, + "grad_norm": 2.547188026736848, + "learning_rate": 3.3286202233894845e-05, + "loss": 0.2683, + "step": 8940 + }, + { + "epoch": 1.3610614758772763, + "grad_norm": 2.4702701135824356, + "learning_rate": 3.3244447373706225e-05, + "loss": 0.2914, + "step": 8950 + }, + { + "epoch": 1.3625822149564688, + "grad_norm": 2.114693769789113, + "learning_rate": 3.3202666692901316e-05, + "loss": 0.2661, + "step": 8960 + }, + { + "epoch": 1.3641029540356613, + "grad_norm": 7.67912229167745, + "learning_rate": 3.3160860322332195e-05, + "loss": 0.2833, + "step": 8970 + }, + { + "epoch": 1.3656236931148538, + "grad_norm": 2.572482053920299, + "learning_rate": 3.311902839293136e-05, + "loss": 0.2718, + "step": 8980 + }, + { + "epoch": 1.3671444321940462, + "grad_norm": 3.258014975149785, + "learning_rate": 3.3077171035711386e-05, + "loss": 0.3098, + "step": 8990 + }, + { + "epoch": 1.3686651712732387, + "grad_norm": 2.801871358299987, + "learning_rate": 3.303528838176447e-05, + "loss": 0.2935, + "step": 9000 + }, + { + "epoch": 1.3701859103524312, + "grad_norm": 2.1542198150739345, + "learning_rate": 3.299338056226205e-05, + "loss": 0.3, + "step": 9010 + }, + { + "epoch": 1.3717066494316237, + "grad_norm": 2.7392082968766607, + "learning_rate": 3.2951447708454365e-05, + "loss": 0.2774, + "step": 9020 + }, + { + "epoch": 1.3732273885108164, + "grad_norm": 2.006108638584907, + "learning_rate": 3.2909489951670055e-05, + "loss": 0.246, + "step": 9030 + }, + { + "epoch": 1.3747481275900086, + "grad_norm": 2.6396907760404504, + "learning_rate": 3.286750742331578e-05, + "loss": 0.2635, + "step": 9040 + }, + { + "epoch": 1.3762688666692013, + "grad_norm": 2.441395642334279, + "learning_rate": 3.2825500254875744e-05, + "loss": 0.2869, + "step": 9050 + }, + { + "epoch": 1.3777896057483936, + "grad_norm": 3.0523466093368974, + "learning_rate": 3.278346857791135e-05, + "loss": 0.2908, + "step": 9060 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 2.4473641202001337, + "learning_rate": 3.2741412524060746e-05, + "loss": 0.3155, + "step": 9070 + }, + { + "epoch": 1.3808310839067786, + "grad_norm": 3.8992286001708742, + "learning_rate": 3.269933222503842e-05, + "loss": 0.255, + "step": 9080 + }, + { + "epoch": 1.3823518229859713, + "grad_norm": 2.5531694774005866, + "learning_rate": 3.265722781263481e-05, + "loss": 0.2892, + "step": 9090 + }, + { + "epoch": 1.3838725620651637, + "grad_norm": 3.4162045032794426, + "learning_rate": 3.261509941871586e-05, + "loss": 0.2827, + "step": 9100 + }, + { + "epoch": 1.3853933011443562, + "grad_norm": 2.0326175931546566, + "learning_rate": 3.257294717522263e-05, + "loss": 0.2833, + "step": 9110 + }, + { + "epoch": 1.3869140402235487, + "grad_norm": 2.1237533794999375, + "learning_rate": 3.2530771214170876e-05, + "loss": 0.287, + "step": 9120 + }, + { + "epoch": 1.3884347793027412, + "grad_norm": 2.9529723881467183, + "learning_rate": 3.2488571667650614e-05, + "loss": 0.2606, + "step": 9130 + }, + { + "epoch": 1.3899555183819337, + "grad_norm": 1.757641186244348, + "learning_rate": 3.2446348667825745e-05, + "loss": 0.2723, + "step": 9140 + }, + { + "epoch": 1.3914762574611261, + "grad_norm": 3.141996519260948, + "learning_rate": 3.240410234693363e-05, + "loss": 0.2751, + "step": 9150 + }, + { + "epoch": 1.3929969965403186, + "grad_norm": 2.584648826063317, + "learning_rate": 3.236183283728465e-05, + "loss": 0.3208, + "step": 9160 + }, + { + "epoch": 1.394517735619511, + "grad_norm": 2.3370718889172903, + "learning_rate": 3.2319540271261824e-05, + "loss": 0.3136, + "step": 9170 + }, + { + "epoch": 1.3960384746987036, + "grad_norm": 2.3756774408630443, + "learning_rate": 3.2277224781320364e-05, + "loss": 0.3035, + "step": 9180 + }, + { + "epoch": 1.397559213777896, + "grad_norm": 2.2864073224809527, + "learning_rate": 3.22348864999873e-05, + "loss": 0.268, + "step": 9190 + }, + { + "epoch": 1.3990799528570885, + "grad_norm": 2.464041673438244, + "learning_rate": 3.219252555986103e-05, + "loss": 0.273, + "step": 9200 + }, + { + "epoch": 1.400600691936281, + "grad_norm": 2.261446441317804, + "learning_rate": 3.215014209361091e-05, + "loss": 0.2782, + "step": 9210 + }, + { + "epoch": 1.4021214310154735, + "grad_norm": 2.6127010457578628, + "learning_rate": 3.2107736233976856e-05, + "loss": 0.2784, + "step": 9220 + }, + { + "epoch": 1.403642170094666, + "grad_norm": 2.0288963686403, + "learning_rate": 3.206530811376891e-05, + "loss": 0.2787, + "step": 9230 + }, + { + "epoch": 1.4051629091738584, + "grad_norm": 2.4331143759219818, + "learning_rate": 3.202285786586684e-05, + "loss": 0.2417, + "step": 9240 + }, + { + "epoch": 1.406683648253051, + "grad_norm": 2.572337020866105, + "learning_rate": 3.198038562321971e-05, + "loss": 0.2926, + "step": 9250 + }, + { + "epoch": 1.4082043873322434, + "grad_norm": 1.9038057838024613, + "learning_rate": 3.193789151884546e-05, + "loss": 0.2959, + "step": 9260 + }, + { + "epoch": 1.4097251264114359, + "grad_norm": 1.9732719924423563, + "learning_rate": 3.1895375685830513e-05, + "loss": 0.2748, + "step": 9270 + }, + { + "epoch": 1.4112458654906286, + "grad_norm": 2.597966720146178, + "learning_rate": 3.185283825732934e-05, + "loss": 0.2939, + "step": 9280 + }, + { + "epoch": 1.4127666045698208, + "grad_norm": 2.2317352547058538, + "learning_rate": 3.181027936656406e-05, + "loss": 0.2826, + "step": 9290 + }, + { + "epoch": 1.4142873436490135, + "grad_norm": 1.584431662358437, + "learning_rate": 3.1767699146823954e-05, + "loss": 0.2676, + "step": 9300 + }, + { + "epoch": 1.4158080827282058, + "grad_norm": 2.133635880323741, + "learning_rate": 3.172509773146517e-05, + "loss": 0.2606, + "step": 9310 + }, + { + "epoch": 1.4173288218073985, + "grad_norm": 3.3483020313403165, + "learning_rate": 3.168247525391021e-05, + "loss": 0.2826, + "step": 9320 + }, + { + "epoch": 1.4188495608865908, + "grad_norm": 2.399160608323548, + "learning_rate": 3.1639831847647534e-05, + "loss": 0.2819, + "step": 9330 + }, + { + "epoch": 1.4203702999657835, + "grad_norm": 2.5695423683823675, + "learning_rate": 3.1597167646231156e-05, + "loss": 0.2325, + "step": 9340 + }, + { + "epoch": 1.421891039044976, + "grad_norm": 2.7040724988700036, + "learning_rate": 3.1554482783280215e-05, + "loss": 0.2824, + "step": 9350 + }, + { + "epoch": 1.4234117781241684, + "grad_norm": 2.6172787921984946, + "learning_rate": 3.1511777392478567e-05, + "loss": 0.2772, + "step": 9360 + }, + { + "epoch": 1.424932517203361, + "grad_norm": 2.5340004381285537, + "learning_rate": 3.146905160757434e-05, + "loss": 0.2841, + "step": 9370 + }, + { + "epoch": 1.4264532562825534, + "grad_norm": 3.053081844806618, + "learning_rate": 3.1426305562379565e-05, + "loss": 0.3016, + "step": 9380 + }, + { + "epoch": 1.4279739953617459, + "grad_norm": 2.5508445658270653, + "learning_rate": 3.1383539390769694e-05, + "loss": 0.2851, + "step": 9390 + }, + { + "epoch": 1.4294947344409383, + "grad_norm": 2.3703211389621615, + "learning_rate": 3.134075322668323e-05, + "loss": 0.2973, + "step": 9400 + }, + { + "epoch": 1.4310154735201308, + "grad_norm": 3.9026952078020303, + "learning_rate": 3.1297947204121285e-05, + "loss": 0.2988, + "step": 9410 + }, + { + "epoch": 1.4325362125993233, + "grad_norm": 2.505544359193364, + "learning_rate": 3.1255121457147174e-05, + "loss": 0.2763, + "step": 9420 + }, + { + "epoch": 1.4340569516785158, + "grad_norm": 1.585447969325035, + "learning_rate": 3.121227611988597e-05, + "loss": 0.2768, + "step": 9430 + }, + { + "epoch": 1.4355776907577082, + "grad_norm": 2.1041708667562724, + "learning_rate": 3.116941132652411e-05, + "loss": 0.261, + "step": 9440 + }, + { + "epoch": 1.4370984298369007, + "grad_norm": 2.41203066176811, + "learning_rate": 3.112652721130897e-05, + "loss": 0.2732, + "step": 9450 + }, + { + "epoch": 1.4386191689160932, + "grad_norm": 3.239612631064481, + "learning_rate": 3.108362390854843e-05, + "loss": 0.2706, + "step": 9460 + }, + { + "epoch": 1.4401399079952857, + "grad_norm": 2.2898212925195276, + "learning_rate": 3.1040701552610466e-05, + "loss": 0.286, + "step": 9470 + }, + { + "epoch": 1.4416606470744782, + "grad_norm": 2.0807613495810675, + "learning_rate": 3.099776027792273e-05, + "loss": 0.257, + "step": 9480 + }, + { + "epoch": 1.4431813861536706, + "grad_norm": 2.532230606363068, + "learning_rate": 3.095480021897213e-05, + "loss": 0.2739, + "step": 9490 + }, + { + "epoch": 1.4447021252328631, + "grad_norm": 1.7689163104082777, + "learning_rate": 3.0911821510304386e-05, + "loss": 0.2873, + "step": 9500 + }, + { + "epoch": 1.4462228643120556, + "grad_norm": 2.6892552550402806, + "learning_rate": 3.086882428652364e-05, + "loss": 0.277, + "step": 9510 + }, + { + "epoch": 1.447743603391248, + "grad_norm": 2.4211732212160935, + "learning_rate": 3.0825808682292026e-05, + "loss": 0.2604, + "step": 9520 + }, + { + "epoch": 1.4492643424704406, + "grad_norm": 1.9343445667547632, + "learning_rate": 3.078277483232922e-05, + "loss": 0.302, + "step": 9530 + }, + { + "epoch": 1.450785081549633, + "grad_norm": 2.253520335275868, + "learning_rate": 3.0739722871412067e-05, + "loss": 0.2716, + "step": 9540 + }, + { + "epoch": 1.4523058206288257, + "grad_norm": 2.215733436357561, + "learning_rate": 3.069665293437413e-05, + "loss": 0.2262, + "step": 9550 + }, + { + "epoch": 1.453826559708018, + "grad_norm": 2.4778654850251103, + "learning_rate": 3.0653565156105263e-05, + "loss": 0.2848, + "step": 9560 + }, + { + "epoch": 1.4553472987872107, + "grad_norm": 2.5392544292707737, + "learning_rate": 3.061045967155119e-05, + "loss": 0.2959, + "step": 9570 + }, + { + "epoch": 1.456868037866403, + "grad_norm": 2.380040944823623, + "learning_rate": 3.05673366157131e-05, + "loss": 0.2883, + "step": 9580 + }, + { + "epoch": 1.4583887769455957, + "grad_norm": 2.544682557835092, + "learning_rate": 3.052419612364722e-05, + "loss": 0.2928, + "step": 9590 + }, + { + "epoch": 1.459909516024788, + "grad_norm": 2.55215723838331, + "learning_rate": 3.0481038330464357e-05, + "loss": 0.2731, + "step": 9600 + }, + { + "epoch": 1.4614302551039806, + "grad_norm": 3.507719641532427, + "learning_rate": 3.0437863371329538e-05, + "loss": 0.2867, + "step": 9610 + }, + { + "epoch": 1.462950994183173, + "grad_norm": 2.426409569627035, + "learning_rate": 3.0394671381461538e-05, + "loss": 0.2582, + "step": 9620 + }, + { + "epoch": 1.4644717332623656, + "grad_norm": 3.7259760673829008, + "learning_rate": 3.0351462496132447e-05, + "loss": 0.2866, + "step": 9630 + }, + { + "epoch": 1.465992472341558, + "grad_norm": 2.3559043874919077, + "learning_rate": 3.0308236850667306e-05, + "loss": 0.2652, + "step": 9640 + }, + { + "epoch": 1.4675132114207505, + "grad_norm": 2.404718898730075, + "learning_rate": 3.0264994580443635e-05, + "loss": 0.2735, + "step": 9650 + }, + { + "epoch": 1.469033950499943, + "grad_norm": 2.8257618972416902, + "learning_rate": 3.0221735820891e-05, + "loss": 0.2649, + "step": 9660 + }, + { + "epoch": 1.4705546895791355, + "grad_norm": 2.091173238788161, + "learning_rate": 3.0178460707490642e-05, + "loss": 0.2721, + "step": 9670 + }, + { + "epoch": 1.472075428658328, + "grad_norm": 2.378182060062904, + "learning_rate": 3.013516937577499e-05, + "loss": 0.2788, + "step": 9680 + }, + { + "epoch": 1.4735961677375204, + "grad_norm": 2.7344828452189205, + "learning_rate": 3.00918619613273e-05, + "loss": 0.2895, + "step": 9690 + }, + { + "epoch": 1.475116906816713, + "grad_norm": 1.8682268713077461, + "learning_rate": 3.0048538599781163e-05, + "loss": 0.3054, + "step": 9700 + }, + { + "epoch": 1.4766376458959054, + "grad_norm": 2.5891155233425973, + "learning_rate": 3.000519942682013e-05, + "loss": 0.2922, + "step": 9710 + }, + { + "epoch": 1.4781583849750979, + "grad_norm": 1.7680895535635703, + "learning_rate": 2.996184457817728e-05, + "loss": 0.2713, + "step": 9720 + }, + { + "epoch": 1.4796791240542904, + "grad_norm": 2.697963714947113, + "learning_rate": 2.991847418963477e-05, + "loss": 0.2806, + "step": 9730 + }, + { + "epoch": 1.4811998631334828, + "grad_norm": 2.0953092921793095, + "learning_rate": 2.987508839702345e-05, + "loss": 0.2512, + "step": 9740 + }, + { + "epoch": 1.4827206022126753, + "grad_norm": 2.1533904362272103, + "learning_rate": 2.9831687336222375e-05, + "loss": 0.292, + "step": 9750 + }, + { + "epoch": 1.4842413412918678, + "grad_norm": 2.5560159383323233, + "learning_rate": 2.9788271143158446e-05, + "loss": 0.3045, + "step": 9760 + }, + { + "epoch": 1.4857620803710603, + "grad_norm": 1.9944786895309443, + "learning_rate": 2.9744839953805963e-05, + "loss": 0.2683, + "step": 9770 + }, + { + "epoch": 1.4872828194502528, + "grad_norm": 1.9751716378057818, + "learning_rate": 2.9701393904186174e-05, + "loss": 0.2713, + "step": 9780 + }, + { + "epoch": 1.4888035585294452, + "grad_norm": 2.112388139775092, + "learning_rate": 2.9657933130366868e-05, + "loss": 0.2761, + "step": 9790 + }, + { + "epoch": 1.4903242976086377, + "grad_norm": 2.6225323954974935, + "learning_rate": 2.9614457768461963e-05, + "loss": 0.2702, + "step": 9800 + }, + { + "epoch": 1.4918450366878302, + "grad_norm": 2.08620312831736, + "learning_rate": 2.9570967954631046e-05, + "loss": 0.2409, + "step": 9810 + }, + { + "epoch": 1.493365775767023, + "grad_norm": 2.794813673941386, + "learning_rate": 2.952746382507899e-05, + "loss": 0.2527, + "step": 9820 + }, + { + "epoch": 1.4948865148462152, + "grad_norm": 2.2538207497412244, + "learning_rate": 2.9483945516055488e-05, + "loss": 0.2663, + "step": 9830 + }, + { + "epoch": 1.4964072539254079, + "grad_norm": 2.154986725299355, + "learning_rate": 2.944041316385463e-05, + "loss": 0.2748, + "step": 9840 + }, + { + "epoch": 1.4979279930046001, + "grad_norm": 2.0470839008166437, + "learning_rate": 2.9396866904814506e-05, + "loss": 0.2956, + "step": 9850 + }, + { + "epoch": 1.4994487320837928, + "grad_norm": 2.984325526044332, + "learning_rate": 2.9353306875316772e-05, + "loss": 0.2539, + "step": 9860 + }, + { + "epoch": 1.500969471162985, + "grad_norm": 2.3493316694828756, + "learning_rate": 2.9309733211786167e-05, + "loss": 0.2702, + "step": 9870 + }, + { + "epoch": 1.5024902102421778, + "grad_norm": 2.7370399649255552, + "learning_rate": 2.9266146050690175e-05, + "loss": 0.261, + "step": 9880 + }, + { + "epoch": 1.50401094932137, + "grad_norm": 3.239246783011574, + "learning_rate": 2.922254552853853e-05, + "loss": 0.298, + "step": 9890 + }, + { + "epoch": 1.5055316884005627, + "grad_norm": 2.533540123651865, + "learning_rate": 2.917893178188282e-05, + "loss": 0.2797, + "step": 9900 + }, + { + "epoch": 1.5070524274797552, + "grad_norm": 2.687122195756101, + "learning_rate": 2.913530494731604e-05, + "loss": 0.2718, + "step": 9910 + }, + { + "epoch": 1.5085731665589477, + "grad_norm": 2.856855895983984, + "learning_rate": 2.9091665161472187e-05, + "loss": 0.2426, + "step": 9920 + }, + { + "epoch": 1.5100939056381402, + "grad_norm": 1.9263103193698976, + "learning_rate": 2.904801256102581e-05, + "loss": 0.2572, + "step": 9930 + }, + { + "epoch": 1.5116146447173326, + "grad_norm": 2.215017796348092, + "learning_rate": 2.9004347282691606e-05, + "loss": 0.2518, + "step": 9940 + }, + { + "epoch": 1.5131353837965251, + "grad_norm": 1.6841828013347673, + "learning_rate": 2.896066946322396e-05, + "loss": 0.2903, + "step": 9950 + }, + { + "epoch": 1.5146561228757176, + "grad_norm": 3.467200608098527, + "learning_rate": 2.8916979239416546e-05, + "loss": 0.2884, + "step": 9960 + }, + { + "epoch": 1.51617686195491, + "grad_norm": 2.4131773340571145, + "learning_rate": 2.887327674810188e-05, + "loss": 0.2694, + "step": 9970 + }, + { + "epoch": 1.5176976010341026, + "grad_norm": 2.6159737638726557, + "learning_rate": 2.8829562126150906e-05, + "loss": 0.282, + "step": 9980 + }, + { + "epoch": 1.519218340113295, + "grad_norm": 2.210893848093061, + "learning_rate": 2.8785835510472548e-05, + "loss": 0.2791, + "step": 9990 + }, + { + "epoch": 1.5207390791924875, + "grad_norm": 2.416941539597356, + "learning_rate": 2.8742097038013306e-05, + "loss": 0.2956, + "step": 10000 + }, + { + "epoch": 1.52225981827168, + "grad_norm": 2.8722901832735332, + "learning_rate": 2.8698346845756813e-05, + "loss": 0.2942, + "step": 10010 + }, + { + "epoch": 1.5237805573508725, + "grad_norm": 3.008398895672437, + "learning_rate": 2.8654585070723393e-05, + "loss": 0.2882, + "step": 10020 + }, + { + "epoch": 1.525301296430065, + "grad_norm": 2.655798765302728, + "learning_rate": 2.8610811849969655e-05, + "loss": 0.2416, + "step": 10030 + }, + { + "epoch": 1.5268220355092574, + "grad_norm": 3.4103606775939044, + "learning_rate": 2.8567027320588062e-05, + "loss": 0.2524, + "step": 10040 + }, + { + "epoch": 1.5283427745884501, + "grad_norm": 2.2631742216805284, + "learning_rate": 2.8523231619706485e-05, + "loss": 0.2821, + "step": 10050 + }, + { + "epoch": 1.5298635136676424, + "grad_norm": 3.7026912115095563, + "learning_rate": 2.847942488448778e-05, + "loss": 0.2602, + "step": 10060 + }, + { + "epoch": 1.531384252746835, + "grad_norm": 2.429287094561136, + "learning_rate": 2.8435607252129377e-05, + "loss": 0.2544, + "step": 10070 + }, + { + "epoch": 1.5329049918260274, + "grad_norm": 2.511086273788543, + "learning_rate": 2.8391778859862804e-05, + "loss": 0.2516, + "step": 10080 + }, + { + "epoch": 1.53442573090522, + "grad_norm": 2.2338327227771995, + "learning_rate": 2.834793984495333e-05, + "loss": 0.263, + "step": 10090 + }, + { + "epoch": 1.5359464699844123, + "grad_norm": 3.5518619791489323, + "learning_rate": 2.8304090344699453e-05, + "loss": 0.2549, + "step": 10100 + }, + { + "epoch": 1.537467209063605, + "grad_norm": 2.342372453317764, + "learning_rate": 2.8260230496432532e-05, + "loss": 0.2746, + "step": 10110 + }, + { + "epoch": 1.5389879481427973, + "grad_norm": 2.950173043090453, + "learning_rate": 2.8216360437516333e-05, + "loss": 0.2899, + "step": 10120 + }, + { + "epoch": 1.54050868722199, + "grad_norm": 2.724200907617944, + "learning_rate": 2.8172480305346592e-05, + "loss": 0.2708, + "step": 10130 + }, + { + "epoch": 1.5420294263011822, + "grad_norm": 2.9619321553786238, + "learning_rate": 2.8128590237350612e-05, + "loss": 0.2382, + "step": 10140 + }, + { + "epoch": 1.543550165380375, + "grad_norm": 2.1997366725974064, + "learning_rate": 2.808469037098678e-05, + "loss": 0.2819, + "step": 10150 + }, + { + "epoch": 1.5450709044595672, + "grad_norm": 2.250072115672658, + "learning_rate": 2.8040780843744202e-05, + "loss": 0.2987, + "step": 10160 + }, + { + "epoch": 1.5465916435387599, + "grad_norm": 2.64269671373306, + "learning_rate": 2.7996861793142227e-05, + "loss": 0.2729, + "step": 10170 + }, + { + "epoch": 1.5481123826179524, + "grad_norm": 1.959647860513098, + "learning_rate": 2.7952933356730028e-05, + "loss": 0.239, + "step": 10180 + }, + { + "epoch": 1.5496331216971448, + "grad_norm": 2.215920028858252, + "learning_rate": 2.7908995672086193e-05, + "loss": 0.216, + "step": 10190 + }, + { + "epoch": 1.5511538607763373, + "grad_norm": 2.9493928287581603, + "learning_rate": 2.7865048876818234e-05, + "loss": 0.3138, + "step": 10200 + }, + { + "epoch": 1.5526745998555298, + "grad_norm": 2.1312435853516436, + "learning_rate": 2.7821093108562242e-05, + "loss": 0.2736, + "step": 10210 + }, + { + "epoch": 1.5541953389347223, + "grad_norm": 2.242399939895815, + "learning_rate": 2.777712850498238e-05, + "loss": 0.2619, + "step": 10220 + }, + { + "epoch": 1.5557160780139148, + "grad_norm": 2.904992525360724, + "learning_rate": 2.77331552037705e-05, + "loss": 0.2801, + "step": 10230 + }, + { + "epoch": 1.5572368170931072, + "grad_norm": 2.1672914232791434, + "learning_rate": 2.768917334264568e-05, + "loss": 0.2426, + "step": 10240 + }, + { + "epoch": 1.5587575561722997, + "grad_norm": 2.23751126823739, + "learning_rate": 2.7645183059353802e-05, + "loss": 0.2779, + "step": 10250 + }, + { + "epoch": 1.5602782952514922, + "grad_norm": 1.7931125272045993, + "learning_rate": 2.7601184491667154e-05, + "loss": 0.2552, + "step": 10260 + }, + { + "epoch": 1.5617990343306847, + "grad_norm": 2.178017751308904, + "learning_rate": 2.755717777738394e-05, + "loss": 0.256, + "step": 10270 + }, + { + "epoch": 1.5633197734098772, + "grad_norm": 2.076214311015082, + "learning_rate": 2.7513163054327895e-05, + "loss": 0.2479, + "step": 10280 + }, + { + "epoch": 1.5648405124890696, + "grad_norm": 2.7206328576731984, + "learning_rate": 2.7469140460347825e-05, + "loss": 0.2724, + "step": 10290 + }, + { + "epoch": 1.5663612515682623, + "grad_norm": 2.632117958898784, + "learning_rate": 2.7425110133317196e-05, + "loss": 0.257, + "step": 10300 + }, + { + "epoch": 1.5678819906474546, + "grad_norm": 2.8515827043302977, + "learning_rate": 2.7381072211133683e-05, + "loss": 0.27, + "step": 10310 + }, + { + "epoch": 1.5694027297266473, + "grad_norm": 2.7212668135123765, + "learning_rate": 2.733702683171877e-05, + "loss": 0.2751, + "step": 10320 + }, + { + "epoch": 1.5709234688058396, + "grad_norm": 2.234757937035352, + "learning_rate": 2.7292974133017268e-05, + "loss": 0.265, + "step": 10330 + }, + { + "epoch": 1.5724442078850323, + "grad_norm": 2.276031276530949, + "learning_rate": 2.7248914252996928e-05, + "loss": 0.2566, + "step": 10340 + }, + { + "epoch": 1.5739649469642245, + "grad_norm": 2.661189453629107, + "learning_rate": 2.7204847329647987e-05, + "loss": 0.2848, + "step": 10350 + }, + { + "epoch": 1.5754856860434172, + "grad_norm": 1.6706418514574084, + "learning_rate": 2.716077350098275e-05, + "loss": 0.2538, + "step": 10360 + }, + { + "epoch": 1.5770064251226095, + "grad_norm": 2.6580412041318278, + "learning_rate": 2.7116692905035123e-05, + "loss": 0.2922, + "step": 10370 + }, + { + "epoch": 1.5785271642018022, + "grad_norm": 2.691277535899845, + "learning_rate": 2.707260567986024e-05, + "loss": 0.2829, + "step": 10380 + }, + { + "epoch": 1.5800479032809944, + "grad_norm": 2.057779788726583, + "learning_rate": 2.702851196353397e-05, + "loss": 0.2653, + "step": 10390 + }, + { + "epoch": 1.5815686423601871, + "grad_norm": 2.3450166858186847, + "learning_rate": 2.6984411894152528e-05, + "loss": 0.2484, + "step": 10400 + }, + { + "epoch": 1.5830893814393794, + "grad_norm": 2.4147279650755133, + "learning_rate": 2.6940305609832023e-05, + "loss": 0.2581, + "step": 10410 + }, + { + "epoch": 1.584610120518572, + "grad_norm": 2.357665974385496, + "learning_rate": 2.6896193248708022e-05, + "loss": 0.2541, + "step": 10420 + }, + { + "epoch": 1.5861308595977646, + "grad_norm": 2.055480346978913, + "learning_rate": 2.685207494893513e-05, + "loss": 0.231, + "step": 10430 + }, + { + "epoch": 1.587651598676957, + "grad_norm": 2.1753138881770107, + "learning_rate": 2.6807950848686537e-05, + "loss": 0.2692, + "step": 10440 + }, + { + "epoch": 1.5891723377561495, + "grad_norm": 2.7305142812015726, + "learning_rate": 2.6763821086153622e-05, + "loss": 0.2708, + "step": 10450 + }, + { + "epoch": 1.590693076835342, + "grad_norm": 2.844913376475673, + "learning_rate": 2.6719685799545495e-05, + "loss": 0.2947, + "step": 10460 + }, + { + "epoch": 1.5922138159145345, + "grad_norm": 3.045396287194931, + "learning_rate": 2.667554512708854e-05, + "loss": 0.2826, + "step": 10470 + }, + { + "epoch": 1.593734554993727, + "grad_norm": 2.2385196154404885, + "learning_rate": 2.663139920702603e-05, + "loss": 0.2685, + "step": 10480 + }, + { + "epoch": 1.5952552940729194, + "grad_norm": 2.4300878583049252, + "learning_rate": 2.6587248177617686e-05, + "loss": 0.2602, + "step": 10490 + }, + { + "epoch": 1.596776033152112, + "grad_norm": 2.1883256140610143, + "learning_rate": 2.6543092177139202e-05, + "loss": 0.2512, + "step": 10500 + }, + { + "epoch": 1.5982967722313044, + "grad_norm": 3.3864943137315158, + "learning_rate": 2.649893134388185e-05, + "loss": 0.249, + "step": 10510 + }, + { + "epoch": 1.5998175113104969, + "grad_norm": 3.645957849021312, + "learning_rate": 2.6454765816152054e-05, + "loss": 0.2665, + "step": 10520 + }, + { + "epoch": 1.6013382503896894, + "grad_norm": 3.130416712177351, + "learning_rate": 2.6410595732270914e-05, + "loss": 0.2707, + "step": 10530 + }, + { + "epoch": 1.6028589894688818, + "grad_norm": 2.1777089910654794, + "learning_rate": 2.6366421230573833e-05, + "loss": 0.2529, + "step": 10540 + }, + { + "epoch": 1.6043797285480743, + "grad_norm": 2.4346557387036167, + "learning_rate": 2.6322242449410005e-05, + "loss": 0.269, + "step": 10550 + }, + { + "epoch": 1.6059004676272668, + "grad_norm": 2.2774149779519606, + "learning_rate": 2.6278059527142074e-05, + "loss": 0.2677, + "step": 10560 + }, + { + "epoch": 1.6074212067064595, + "grad_norm": 2.2271175831604384, + "learning_rate": 2.6233872602145615e-05, + "loss": 0.2537, + "step": 10570 + }, + { + "epoch": 1.6089419457856518, + "grad_norm": 2.5253708219365576, + "learning_rate": 2.618968181280877e-05, + "loss": 0.2433, + "step": 10580 + }, + { + "epoch": 1.6104626848648445, + "grad_norm": 1.911842238808494, + "learning_rate": 2.614548729753176e-05, + "loss": 0.2288, + "step": 10590 + }, + { + "epoch": 1.6119834239440367, + "grad_norm": 1.7501223048709273, + "learning_rate": 2.610128919472649e-05, + "loss": 0.2733, + "step": 10600 + }, + { + "epoch": 1.6135041630232294, + "grad_norm": 2.6346656223436695, + "learning_rate": 2.6057087642816085e-05, + "loss": 0.2608, + "step": 10610 + }, + { + "epoch": 1.6150249021024217, + "grad_norm": 2.406634453779971, + "learning_rate": 2.6012882780234493e-05, + "loss": 0.2425, + "step": 10620 + }, + { + "epoch": 1.6165456411816144, + "grad_norm": 2.2112813243388967, + "learning_rate": 2.5968674745426026e-05, + "loss": 0.2324, + "step": 10630 + }, + { + "epoch": 1.6180663802608066, + "grad_norm": 2.788441404839204, + "learning_rate": 2.59244636768449e-05, + "loss": 0.2404, + "step": 10640 + }, + { + "epoch": 1.6195871193399993, + "grad_norm": 2.3524859402433407, + "learning_rate": 2.588024971295488e-05, + "loss": 0.2485, + "step": 10650 + }, + { + "epoch": 1.6211078584191916, + "grad_norm": 1.3784188070548162, + "learning_rate": 2.5836032992228766e-05, + "loss": 0.2428, + "step": 10660 + }, + { + "epoch": 1.6226285974983843, + "grad_norm": 2.3965161587626382, + "learning_rate": 2.5791813653148005e-05, + "loss": 0.2747, + "step": 10670 + }, + { + "epoch": 1.6241493365775765, + "grad_norm": 3.201800153547138, + "learning_rate": 2.5747591834202244e-05, + "loss": 0.2572, + "step": 10680 + }, + { + "epoch": 1.6256700756567692, + "grad_norm": 2.2567169116555563, + "learning_rate": 2.5703367673888894e-05, + "loss": 0.2766, + "step": 10690 + }, + { + "epoch": 1.6271908147359617, + "grad_norm": 2.445750280800234, + "learning_rate": 2.5659141310712693e-05, + "loss": 0.256, + "step": 10700 + }, + { + "epoch": 1.6287115538151542, + "grad_norm": 2.321898447968121, + "learning_rate": 2.5614912883185293e-05, + "loss": 0.2861, + "step": 10710 + }, + { + "epoch": 1.6302322928943467, + "grad_norm": 2.35256040763746, + "learning_rate": 2.557068252982479e-05, + "loss": 0.2644, + "step": 10720 + }, + { + "epoch": 1.6317530319735392, + "grad_norm": 2.040625870679742, + "learning_rate": 2.5526450389155342e-05, + "loss": 0.2662, + "step": 10730 + }, + { + "epoch": 1.6332737710527316, + "grad_norm": 2.5167100545447205, + "learning_rate": 2.548221659970667e-05, + "loss": 0.2432, + "step": 10740 + }, + { + "epoch": 1.6347945101319241, + "grad_norm": 2.1618879065018013, + "learning_rate": 2.543798130001368e-05, + "loss": 0.2527, + "step": 10750 + }, + { + "epoch": 1.6363152492111166, + "grad_norm": 2.150593452737521, + "learning_rate": 2.5393744628616006e-05, + "loss": 0.2384, + "step": 10760 + }, + { + "epoch": 1.637835988290309, + "grad_norm": 2.0312781028875877, + "learning_rate": 2.534950672405758e-05, + "loss": 0.2534, + "step": 10770 + }, + { + "epoch": 1.6393567273695016, + "grad_norm": 2.592119815812875, + "learning_rate": 2.5305267724886185e-05, + "loss": 0.2547, + "step": 10780 + }, + { + "epoch": 1.640877466448694, + "grad_norm": 2.651700627075668, + "learning_rate": 2.5261027769653033e-05, + "loss": 0.255, + "step": 10790 + }, + { + "epoch": 1.6423982055278865, + "grad_norm": 1.6339889222310704, + "learning_rate": 2.5216786996912345e-05, + "loss": 0.3032, + "step": 10800 + }, + { + "epoch": 1.643918944607079, + "grad_norm": 2.4502026474269694, + "learning_rate": 2.5172545545220895e-05, + "loss": 0.2646, + "step": 10810 + }, + { + "epoch": 1.6454396836862717, + "grad_norm": 2.6907530182726145, + "learning_rate": 2.5128303553137573e-05, + "loss": 0.2692, + "step": 10820 + }, + { + "epoch": 1.646960422765464, + "grad_norm": 1.7573571204356377, + "learning_rate": 2.5084061159222972e-05, + "loss": 0.263, + "step": 10830 + }, + { + "epoch": 1.6484811618446567, + "grad_norm": 1.5536158454165618, + "learning_rate": 2.503981850203894e-05, + "loss": 0.2468, + "step": 10840 + }, + { + "epoch": 1.650001900923849, + "grad_norm": 2.414108027915641, + "learning_rate": 2.499557572014817e-05, + "loss": 0.2736, + "step": 10850 + }, + { + "epoch": 1.6515226400030416, + "grad_norm": 1.9061310459490142, + "learning_rate": 2.4951332952113704e-05, + "loss": 0.2755, + "step": 10860 + }, + { + "epoch": 1.6530433790822339, + "grad_norm": 2.2706542644135186, + "learning_rate": 2.4907090336498567e-05, + "loss": 0.2705, + "step": 10870 + }, + { + "epoch": 1.6545641181614266, + "grad_norm": 1.909814765091512, + "learning_rate": 2.486284801186531e-05, + "loss": 0.2439, + "step": 10880 + }, + { + "epoch": 1.6560848572406188, + "grad_norm": 1.8043457614358276, + "learning_rate": 2.481860611677556e-05, + "loss": 0.2302, + "step": 10890 + }, + { + "epoch": 1.6576055963198115, + "grad_norm": 2.237083970338692, + "learning_rate": 2.477436478978961e-05, + "loss": 0.2478, + "step": 10900 + }, + { + "epoch": 1.6591263353990038, + "grad_norm": 2.572478346505799, + "learning_rate": 2.473012416946596e-05, + "loss": 0.2557, + "step": 10910 + }, + { + "epoch": 1.6606470744781965, + "grad_norm": 2.1631987423901644, + "learning_rate": 2.4685884394360915e-05, + "loss": 0.2384, + "step": 10920 + }, + { + "epoch": 1.6621678135573887, + "grad_norm": 2.5998039619428193, + "learning_rate": 2.4641645603028127e-05, + "loss": 0.2815, + "step": 10930 + }, + { + "epoch": 1.6636885526365814, + "grad_norm": 2.355642385371739, + "learning_rate": 2.459740793401814e-05, + "loss": 0.262, + "step": 10940 + }, + { + "epoch": 1.665209291715774, + "grad_norm": 2.1017096167845337, + "learning_rate": 2.4553171525878018e-05, + "loss": 0.2424, + "step": 10950 + }, + { + "epoch": 1.6667300307949664, + "grad_norm": 2.7207614028304454, + "learning_rate": 2.4508936517150867e-05, + "loss": 0.2435, + "step": 10960 + }, + { + "epoch": 1.6682507698741589, + "grad_norm": 2.2905689551552295, + "learning_rate": 2.4464703046375408e-05, + "loss": 0.2543, + "step": 10970 + }, + { + "epoch": 1.6697715089533514, + "grad_norm": 2.1087074637571646, + "learning_rate": 2.442047125208554e-05, + "loss": 0.2633, + "step": 10980 + }, + { + "epoch": 1.6712922480325438, + "grad_norm": 2.5461443648917177, + "learning_rate": 2.4376241272809916e-05, + "loss": 0.2665, + "step": 10990 + }, + { + "epoch": 1.6728129871117363, + "grad_norm": 3.044161381887142, + "learning_rate": 2.4332013247071504e-05, + "loss": 0.256, + "step": 11000 + }, + { + "epoch": 1.6743337261909288, + "grad_norm": 2.677851194205806, + "learning_rate": 2.4287787313387157e-05, + "loss": 0.2757, + "step": 11010 + }, + { + "epoch": 1.6758544652701213, + "grad_norm": 2.1714269227898204, + "learning_rate": 2.4243563610267172e-05, + "loss": 0.2422, + "step": 11020 + }, + { + "epoch": 1.6773752043493138, + "grad_norm": 2.004342074903223, + "learning_rate": 2.4199342276214868e-05, + "loss": 0.2467, + "step": 11030 + }, + { + "epoch": 1.6788959434285062, + "grad_norm": 2.5999851389621336, + "learning_rate": 2.415512344972612e-05, + "loss": 0.275, + "step": 11040 + }, + { + "epoch": 1.6804166825076987, + "grad_norm": 2.6562881536741467, + "learning_rate": 2.4110907269288978e-05, + "loss": 0.2666, + "step": 11050 + }, + { + "epoch": 1.6819374215868912, + "grad_norm": 2.412796871054908, + "learning_rate": 2.4066693873383196e-05, + "loss": 0.2497, + "step": 11060 + }, + { + "epoch": 1.6834581606660837, + "grad_norm": 1.9916582635520819, + "learning_rate": 2.40224834004798e-05, + "loss": 0.2466, + "step": 11070 + }, + { + "epoch": 1.6849788997452761, + "grad_norm": 2.5983213196385, + "learning_rate": 2.3978275989040676e-05, + "loss": 0.2762, + "step": 11080 + }, + { + "epoch": 1.6864996388244688, + "grad_norm": 1.8296535496748179, + "learning_rate": 2.39340717775181e-05, + "loss": 0.2761, + "step": 11090 + }, + { + "epoch": 1.688020377903661, + "grad_norm": 1.9518211483741656, + "learning_rate": 2.3889870904354348e-05, + "loss": 0.2422, + "step": 11100 + }, + { + "epoch": 1.6895411169828538, + "grad_norm": 3.7631105734715735, + "learning_rate": 2.3845673507981238e-05, + "loss": 0.248, + "step": 11110 + }, + { + "epoch": 1.691061856062046, + "grad_norm": 2.6912137904948574, + "learning_rate": 2.3801479726819675e-05, + "loss": 0.2471, + "step": 11120 + }, + { + "epoch": 1.6925825951412388, + "grad_norm": 2.105268584706402, + "learning_rate": 2.375728969927927e-05, + "loss": 0.2487, + "step": 11130 + }, + { + "epoch": 1.694103334220431, + "grad_norm": 1.902390829926803, + "learning_rate": 2.3713103563757862e-05, + "loss": 0.2282, + "step": 11140 + }, + { + "epoch": 1.6956240732996237, + "grad_norm": 2.6860360066099447, + "learning_rate": 2.366892145864111e-05, + "loss": 0.2751, + "step": 11150 + }, + { + "epoch": 1.697144812378816, + "grad_norm": 2.3467095602956185, + "learning_rate": 2.3624743522302045e-05, + "loss": 0.2663, + "step": 11160 + }, + { + "epoch": 1.6986655514580087, + "grad_norm": 2.307778687596221, + "learning_rate": 2.3580569893100644e-05, + "loss": 0.2466, + "step": 11170 + }, + { + "epoch": 1.700186290537201, + "grad_norm": 2.221375890948776, + "learning_rate": 2.3536400709383395e-05, + "loss": 0.2343, + "step": 11180 + }, + { + "epoch": 1.7017070296163936, + "grad_norm": 2.723608193538271, + "learning_rate": 2.349223610948286e-05, + "loss": 0.258, + "step": 11190 + }, + { + "epoch": 1.703227768695586, + "grad_norm": 1.6541204119347275, + "learning_rate": 2.344807623171726e-05, + "loss": 0.2461, + "step": 11200 + }, + { + "epoch": 1.7047485077747786, + "grad_norm": 2.1113024504571714, + "learning_rate": 2.3403921214389994e-05, + "loss": 0.2375, + "step": 11210 + }, + { + "epoch": 1.706269246853971, + "grad_norm": 2.448243082772575, + "learning_rate": 2.335977119578926e-05, + "loss": 0.2697, + "step": 11220 + }, + { + "epoch": 1.7077899859331636, + "grad_norm": 2.7435100377531363, + "learning_rate": 2.331562631418761e-05, + "loss": 0.2772, + "step": 11230 + }, + { + "epoch": 1.709310725012356, + "grad_norm": 2.397326786861537, + "learning_rate": 2.3271486707841487e-05, + "loss": 0.2466, + "step": 11240 + }, + { + "epoch": 1.7108314640915485, + "grad_norm": 2.52964156878347, + "learning_rate": 2.3227352514990837e-05, + "loss": 0.2623, + "step": 11250 + }, + { + "epoch": 1.712352203170741, + "grad_norm": 1.8354253252005148, + "learning_rate": 2.318322387385862e-05, + "loss": 0.2274, + "step": 11260 + }, + { + "epoch": 1.7138729422499335, + "grad_norm": 2.3988541410370803, + "learning_rate": 2.3139100922650435e-05, + "loss": 0.2307, + "step": 11270 + }, + { + "epoch": 1.715393681329126, + "grad_norm": 2.2211186154223856, + "learning_rate": 2.309498379955405e-05, + "loss": 0.2249, + "step": 11280 + }, + { + "epoch": 1.7169144204083184, + "grad_norm": 2.2707395819260054, + "learning_rate": 2.3050872642738985e-05, + "loss": 0.2614, + "step": 11290 + }, + { + "epoch": 1.718435159487511, + "grad_norm": 2.128860786343451, + "learning_rate": 2.3006767590356066e-05, + "loss": 0.245, + "step": 11300 + }, + { + "epoch": 1.7199558985667034, + "grad_norm": 1.9328529137128245, + "learning_rate": 2.2962668780537e-05, + "loss": 0.2255, + "step": 11310 + }, + { + "epoch": 1.7214766376458959, + "grad_norm": 2.6804145354900397, + "learning_rate": 2.2918576351393955e-05, + "loss": 0.2604, + "step": 11320 + }, + { + "epoch": 1.7229973767250883, + "grad_norm": 2.5385568743945406, + "learning_rate": 2.287449044101911e-05, + "loss": 0.2667, + "step": 11330 + }, + { + "epoch": 1.7245181158042808, + "grad_norm": 1.551920385501737, + "learning_rate": 2.2830411187484225e-05, + "loss": 0.2522, + "step": 11340 + }, + { + "epoch": 1.7260388548834733, + "grad_norm": 1.795129543960983, + "learning_rate": 2.2786338728840212e-05, + "loss": 0.218, + "step": 11350 + }, + { + "epoch": 1.727559593962666, + "grad_norm": 2.096611780165575, + "learning_rate": 2.2742273203116705e-05, + "loss": 0.2593, + "step": 11360 + }, + { + "epoch": 1.7290803330418583, + "grad_norm": 1.6566632381956612, + "learning_rate": 2.269821474832163e-05, + "loss": 0.265, + "step": 11370 + }, + { + "epoch": 1.730601072121051, + "grad_norm": 2.449352477124583, + "learning_rate": 2.265416350244076e-05, + "loss": 0.2421, + "step": 11380 + }, + { + "epoch": 1.7321218112002432, + "grad_norm": 2.7953136260094404, + "learning_rate": 2.261011960343728e-05, + "loss": 0.2663, + "step": 11390 + }, + { + "epoch": 1.733642550279436, + "grad_norm": 2.7581616044085298, + "learning_rate": 2.2566083189251386e-05, + "loss": 0.2519, + "step": 11400 + }, + { + "epoch": 1.7351632893586282, + "grad_norm": 1.9035545964406477, + "learning_rate": 2.252205439779982e-05, + "loss": 0.2409, + "step": 11410 + }, + { + "epoch": 1.7366840284378209, + "grad_norm": 1.4810272712575074, + "learning_rate": 2.247803336697546e-05, + "loss": 0.2847, + "step": 11420 + }, + { + "epoch": 1.7382047675170131, + "grad_norm": 2.205953593094286, + "learning_rate": 2.243402023464687e-05, + "loss": 0.2647, + "step": 11430 + }, + { + "epoch": 1.7397255065962058, + "grad_norm": 2.82504834593873, + "learning_rate": 2.239001513865788e-05, + "loss": 0.2222, + "step": 11440 + }, + { + "epoch": 1.741246245675398, + "grad_norm": 2.816012547286073, + "learning_rate": 2.234601821682715e-05, + "loss": 0.2328, + "step": 11450 + }, + { + "epoch": 1.7427669847545908, + "grad_norm": 2.2038957942792505, + "learning_rate": 2.2302029606947754e-05, + "loss": 0.2378, + "step": 11460 + }, + { + "epoch": 1.744287723833783, + "grad_norm": 2.101604753223921, + "learning_rate": 2.2258049446786698e-05, + "loss": 0.2537, + "step": 11470 + }, + { + "epoch": 1.7458084629129758, + "grad_norm": 1.9712806097818185, + "learning_rate": 2.221407787408456e-05, + "loss": 0.2654, + "step": 11480 + }, + { + "epoch": 1.7473292019921682, + "grad_norm": 2.5747164358762737, + "learning_rate": 2.2170115026555e-05, + "loss": 0.2688, + "step": 11490 + }, + { + "epoch": 1.7488499410713607, + "grad_norm": 1.5427178077578, + "learning_rate": 2.2126161041884375e-05, + "loss": 0.2466, + "step": 11500 + }, + { + "epoch": 1.7503706801505532, + "grad_norm": 2.5771139670673815, + "learning_rate": 2.2082216057731266e-05, + "loss": 0.2664, + "step": 11510 + }, + { + "epoch": 1.7518914192297457, + "grad_norm": 2.1113147618898838, + "learning_rate": 2.2038280211726067e-05, + "loss": 0.2266, + "step": 11520 + }, + { + "epoch": 1.7534121583089382, + "grad_norm": 2.8575762480640443, + "learning_rate": 2.199435364147057e-05, + "loss": 0.24, + "step": 11530 + }, + { + "epoch": 1.7549328973881306, + "grad_norm": 2.4407179630203193, + "learning_rate": 2.1950436484537484e-05, + "loss": 0.2328, + "step": 11540 + }, + { + "epoch": 1.756453636467323, + "grad_norm": 2.3797274133785424, + "learning_rate": 2.1906528878470075e-05, + "loss": 0.2514, + "step": 11550 + }, + { + "epoch": 1.7579743755465156, + "grad_norm": 3.2425061957262695, + "learning_rate": 2.186263096078166e-05, + "loss": 0.292, + "step": 11560 + }, + { + "epoch": 1.759495114625708, + "grad_norm": 2.4836485605993905, + "learning_rate": 2.1818742868955237e-05, + "loss": 0.2528, + "step": 11570 + }, + { + "epoch": 1.7610158537049005, + "grad_norm": 1.6608413414219843, + "learning_rate": 2.1774864740443028e-05, + "loss": 0.2352, + "step": 11580 + }, + { + "epoch": 1.762536592784093, + "grad_norm": 2.7146457517492575, + "learning_rate": 2.1730996712666042e-05, + "loss": 0.2457, + "step": 11590 + }, + { + "epoch": 1.7640573318632855, + "grad_norm": 2.536213354264096, + "learning_rate": 2.168713892301366e-05, + "loss": 0.2442, + "step": 11600 + }, + { + "epoch": 1.7655780709424782, + "grad_norm": 1.990038741190813, + "learning_rate": 2.16432915088432e-05, + "loss": 0.2339, + "step": 11610 + }, + { + "epoch": 1.7670988100216705, + "grad_norm": 1.8411939650720006, + "learning_rate": 2.159945460747948e-05, + "loss": 0.2386, + "step": 11620 + }, + { + "epoch": 1.7686195491008632, + "grad_norm": 2.752824692767241, + "learning_rate": 2.1555628356214394e-05, + "loss": 0.27, + "step": 11630 + }, + { + "epoch": 1.7701402881800554, + "grad_norm": 1.9572427846616054, + "learning_rate": 2.1511812892306498e-05, + "loss": 0.2659, + "step": 11640 + }, + { + "epoch": 1.7716610272592481, + "grad_norm": 2.085846662700279, + "learning_rate": 2.1468008352980527e-05, + "loss": 0.2573, + "step": 11650 + }, + { + "epoch": 1.7731817663384404, + "grad_norm": 2.6139980617671035, + "learning_rate": 2.142421487542703e-05, + "loss": 0.242, + "step": 11660 + }, + { + "epoch": 1.774702505417633, + "grad_norm": 2.527762981235945, + "learning_rate": 2.1380432596801902e-05, + "loss": 0.2525, + "step": 11670 + }, + { + "epoch": 1.7762232444968253, + "grad_norm": 1.7407626684377822, + "learning_rate": 2.1336661654225977e-05, + "loss": 0.2149, + "step": 11680 + }, + { + "epoch": 1.777743983576018, + "grad_norm": 2.1406163736407646, + "learning_rate": 2.1292902184784564e-05, + "loss": 0.2339, + "step": 11690 + }, + { + "epoch": 1.7792647226552103, + "grad_norm": 1.9760492766991442, + "learning_rate": 2.124915432552706e-05, + "loss": 0.232, + "step": 11700 + }, + { + "epoch": 1.780785461734403, + "grad_norm": 2.332006014350402, + "learning_rate": 2.1205418213466487e-05, + "loss": 0.2482, + "step": 11710 + }, + { + "epoch": 1.7823062008135953, + "grad_norm": 2.7393527949478167, + "learning_rate": 2.1161693985579086e-05, + "loss": 0.2623, + "step": 11720 + }, + { + "epoch": 1.783826939892788, + "grad_norm": 2.367173780786629, + "learning_rate": 2.1117981778803876e-05, + "loss": 0.2245, + "step": 11730 + }, + { + "epoch": 1.7853476789719804, + "grad_norm": 2.6961193279434306, + "learning_rate": 2.1074281730042207e-05, + "loss": 0.2309, + "step": 11740 + }, + { + "epoch": 1.786868418051173, + "grad_norm": 1.969335029340796, + "learning_rate": 2.103059397615738e-05, + "loss": 0.234, + "step": 11750 + }, + { + "epoch": 1.7883891571303654, + "grad_norm": 1.797466293551152, + "learning_rate": 2.0986918653974176e-05, + "loss": 0.2201, + "step": 11760 + }, + { + "epoch": 1.7899098962095579, + "grad_norm": 3.1873586926830995, + "learning_rate": 2.0943255900278448e-05, + "loss": 0.2509, + "step": 11770 + }, + { + "epoch": 1.7914306352887504, + "grad_norm": 1.874906880458549, + "learning_rate": 2.089960585181668e-05, + "loss": 0.23, + "step": 11780 + }, + { + "epoch": 1.7929513743679428, + "grad_norm": 2.587171334136062, + "learning_rate": 2.0855968645295568e-05, + "loss": 0.2686, + "step": 11790 + }, + { + "epoch": 1.7944721134471353, + "grad_norm": 1.90711534381791, + "learning_rate": 2.0812344417381595e-05, + "loss": 0.2362, + "step": 11800 + }, + { + "epoch": 1.7959928525263278, + "grad_norm": 2.6253779865506757, + "learning_rate": 2.076873330470057e-05, + "loss": 0.2334, + "step": 11810 + }, + { + "epoch": 1.7975135916055203, + "grad_norm": 2.4535956091526896, + "learning_rate": 2.0725135443837267e-05, + "loss": 0.2268, + "step": 11820 + }, + { + "epoch": 1.7990343306847127, + "grad_norm": 2.5421814642780696, + "learning_rate": 2.068155097133492e-05, + "loss": 0.2292, + "step": 11830 + }, + { + "epoch": 1.8005550697639052, + "grad_norm": 1.9322802765489833, + "learning_rate": 2.063798002369485e-05, + "loss": 0.2743, + "step": 11840 + }, + { + "epoch": 1.8020758088430977, + "grad_norm": 2.07106587817283, + "learning_rate": 2.0594422737376025e-05, + "loss": 0.244, + "step": 11850 + }, + { + "epoch": 1.8035965479222902, + "grad_norm": 2.3384528070468273, + "learning_rate": 2.0550879248794616e-05, + "loss": 0.2184, + "step": 11860 + }, + { + "epoch": 1.8051172870014827, + "grad_norm": 2.013268021967202, + "learning_rate": 2.0507349694323583e-05, + "loss": 0.2148, + "step": 11870 + }, + { + "epoch": 1.8066380260806754, + "grad_norm": 2.09703427663534, + "learning_rate": 2.046383421029225e-05, + "loss": 0.2329, + "step": 11880 + }, + { + "epoch": 1.8081587651598676, + "grad_norm": 2.74159939463849, + "learning_rate": 2.0420332932985876e-05, + "loss": 0.2753, + "step": 11890 + }, + { + "epoch": 1.8096795042390603, + "grad_norm": 2.1988031900552336, + "learning_rate": 2.0376845998645215e-05, + "loss": 0.2231, + "step": 11900 + }, + { + "epoch": 1.8112002433182526, + "grad_norm": 1.9116863579143428, + "learning_rate": 2.0333373543466128e-05, + "loss": 0.2342, + "step": 11910 + }, + { + "epoch": 1.8127209823974453, + "grad_norm": 2.3634132534019576, + "learning_rate": 2.0289915703599083e-05, + "loss": 0.2341, + "step": 11920 + }, + { + "epoch": 1.8142417214766375, + "grad_norm": 1.7681255541932372, + "learning_rate": 2.0246472615148808e-05, + "loss": 0.2473, + "step": 11930 + }, + { + "epoch": 1.8157624605558302, + "grad_norm": 1.8675297536330524, + "learning_rate": 2.0203044414173832e-05, + "loss": 0.2477, + "step": 11940 + }, + { + "epoch": 1.8172831996350225, + "grad_norm": 2.6334846477666978, + "learning_rate": 2.0159631236686044e-05, + "loss": 0.2302, + "step": 11950 + }, + { + "epoch": 1.8188039387142152, + "grad_norm": 2.028253434904097, + "learning_rate": 2.011623321865029e-05, + "loss": 0.2627, + "step": 11960 + }, + { + "epoch": 1.8203246777934075, + "grad_norm": 2.392098119086575, + "learning_rate": 2.0072850495983936e-05, + "loss": 0.2742, + "step": 11970 + }, + { + "epoch": 1.8218454168726002, + "grad_norm": 2.1829524648659664, + "learning_rate": 2.0029483204556443e-05, + "loss": 0.2344, + "step": 11980 + }, + { + "epoch": 1.8233661559517924, + "grad_norm": 1.7211651787977147, + "learning_rate": 1.998613148018895e-05, + "loss": 0.2461, + "step": 11990 + }, + { + "epoch": 1.8248868950309851, + "grad_norm": 1.7085252787817737, + "learning_rate": 1.994279545865384e-05, + "loss": 0.2341, + "step": 12000 + }, + { + "epoch": 1.8264076341101776, + "grad_norm": 1.8688446109676775, + "learning_rate": 1.98994752756743e-05, + "loss": 0.2271, + "step": 12010 + }, + { + "epoch": 1.82792837318937, + "grad_norm": 1.9199304482223822, + "learning_rate": 1.985617106692393e-05, + "loss": 0.2095, + "step": 12020 + }, + { + "epoch": 1.8294491122685625, + "grad_norm": 2.0864548013897624, + "learning_rate": 1.9812882968026297e-05, + "loss": 0.2481, + "step": 12030 + }, + { + "epoch": 1.830969851347755, + "grad_norm": 3.098278609950291, + "learning_rate": 1.976961111455452e-05, + "loss": 0.2477, + "step": 12040 + }, + { + "epoch": 1.8324905904269475, + "grad_norm": 2.725389049496119, + "learning_rate": 1.9726355642030826e-05, + "loss": 0.2259, + "step": 12050 + }, + { + "epoch": 1.83401132950614, + "grad_norm": 2.1049418090396004, + "learning_rate": 1.968311668592615e-05, + "loss": 0.2204, + "step": 12060 + }, + { + "epoch": 1.8355320685853325, + "grad_norm": 2.8185947393194306, + "learning_rate": 1.96398943816597e-05, + "loss": 0.2491, + "step": 12070 + }, + { + "epoch": 1.837052807664525, + "grad_norm": 2.0075482477362057, + "learning_rate": 1.9596688864598512e-05, + "loss": 0.232, + "step": 12080 + }, + { + "epoch": 1.8385735467437174, + "grad_norm": 2.1284462818425705, + "learning_rate": 1.9553500270057077e-05, + "loss": 0.2223, + "step": 12090 + }, + { + "epoch": 1.84009428582291, + "grad_norm": 2.882566716326108, + "learning_rate": 1.951032873329686e-05, + "loss": 0.2453, + "step": 12100 + }, + { + "epoch": 1.8416150249021024, + "grad_norm": 3.6725058894833826, + "learning_rate": 1.9467174389525923e-05, + "loss": 0.2577, + "step": 12110 + }, + { + "epoch": 1.8431357639812949, + "grad_norm": 2.616693309498326, + "learning_rate": 1.9424037373898476e-05, + "loss": 0.2233, + "step": 12120 + }, + { + "epoch": 1.8446565030604876, + "grad_norm": 2.0578110932923055, + "learning_rate": 1.938091782151445e-05, + "loss": 0.2281, + "step": 12130 + }, + { + "epoch": 1.8461772421396798, + "grad_norm": 1.8476982465885867, + "learning_rate": 1.9337815867419086e-05, + "loss": 0.2623, + "step": 12140 + }, + { + "epoch": 1.8476979812188725, + "grad_norm": 1.9191261161224569, + "learning_rate": 1.9294731646602527e-05, + "loss": 0.1986, + "step": 12150 + }, + { + "epoch": 1.8492187202980648, + "grad_norm": 1.730046030498853, + "learning_rate": 1.925166529399935e-05, + "loss": 0.2188, + "step": 12160 + }, + { + "epoch": 1.8507394593772575, + "grad_norm": 1.6931282649120674, + "learning_rate": 1.9208616944488196e-05, + "loss": 0.2307, + "step": 12170 + }, + { + "epoch": 1.8522601984564497, + "grad_norm": 2.834906088084818, + "learning_rate": 1.9165586732891317e-05, + "loss": 0.2191, + "step": 12180 + }, + { + "epoch": 1.8537809375356424, + "grad_norm": 2.3773812191277965, + "learning_rate": 1.912257479397413e-05, + "loss": 0.2371, + "step": 12190 + }, + { + "epoch": 1.8553016766148347, + "grad_norm": 2.2044816205735938, + "learning_rate": 1.9079581262444865e-05, + "loss": 0.2239, + "step": 12200 + }, + { + "epoch": 1.8568224156940274, + "grad_norm": 2.901375877174323, + "learning_rate": 1.9036606272954077e-05, + "loss": 0.2263, + "step": 12210 + }, + { + "epoch": 1.8583431547732197, + "grad_norm": 2.4778524033888902, + "learning_rate": 1.8993649960094266e-05, + "loss": 0.257, + "step": 12220 + }, + { + "epoch": 1.8598638938524124, + "grad_norm": 2.937216246307808, + "learning_rate": 1.8950712458399422e-05, + "loss": 0.2229, + "step": 12230 + }, + { + "epoch": 1.8613846329316046, + "grad_norm": 1.751564726206306, + "learning_rate": 1.8907793902344635e-05, + "loss": 0.257, + "step": 12240 + }, + { + "epoch": 1.8629053720107973, + "grad_norm": 1.7822335315616866, + "learning_rate": 1.886489442634565e-05, + "loss": 0.2518, + "step": 12250 + }, + { + "epoch": 1.8644261110899898, + "grad_norm": 2.0847425865455524, + "learning_rate": 1.8822014164758468e-05, + "loss": 0.2451, + "step": 12260 + }, + { + "epoch": 1.8659468501691823, + "grad_norm": 2.701851418993436, + "learning_rate": 1.8779153251878904e-05, + "loss": 0.2291, + "step": 12270 + }, + { + "epoch": 1.8674675892483747, + "grad_norm": 1.9345123292319792, + "learning_rate": 1.8736311821942157e-05, + "loss": 0.249, + "step": 12280 + }, + { + "epoch": 1.8689883283275672, + "grad_norm": 2.597727296474687, + "learning_rate": 1.869349000912244e-05, + "loss": 0.2288, + "step": 12290 + }, + { + "epoch": 1.8705090674067597, + "grad_norm": 2.6567722002061904, + "learning_rate": 1.8650687947532514e-05, + "loss": 0.1993, + "step": 12300 + }, + { + "epoch": 1.8720298064859522, + "grad_norm": 2.3753455087042763, + "learning_rate": 1.8607905771223274e-05, + "loss": 0.2555, + "step": 12310 + }, + { + "epoch": 1.8735505455651447, + "grad_norm": 3.3337392986552588, + "learning_rate": 1.856514361418335e-05, + "loss": 0.2477, + "step": 12320 + }, + { + "epoch": 1.8750712846443371, + "grad_norm": 2.823429250404052, + "learning_rate": 1.852240161033867e-05, + "loss": 0.2432, + "step": 12330 + }, + { + "epoch": 1.8765920237235296, + "grad_norm": 3.144831772066807, + "learning_rate": 1.8479679893552042e-05, + "loss": 0.2275, + "step": 12340 + }, + { + "epoch": 1.878112762802722, + "grad_norm": 1.93465763289895, + "learning_rate": 1.8436978597622734e-05, + "loss": 0.2123, + "step": 12350 + }, + { + "epoch": 1.8796335018819146, + "grad_norm": 2.478959296090728, + "learning_rate": 1.8394297856286067e-05, + "loss": 0.2279, + "step": 12360 + }, + { + "epoch": 1.881154240961107, + "grad_norm": 2.4608111514287865, + "learning_rate": 1.8351637803212972e-05, + "loss": 0.2178, + "step": 12370 + }, + { + "epoch": 1.8826749800402995, + "grad_norm": 1.7225456772765662, + "learning_rate": 1.8308998572009604e-05, + "loss": 0.2171, + "step": 12380 + }, + { + "epoch": 1.884195719119492, + "grad_norm": 1.9621932409736234, + "learning_rate": 1.8266380296216905e-05, + "loss": 0.2119, + "step": 12390 + }, + { + "epoch": 1.8857164581986847, + "grad_norm": 2.2352325854642086, + "learning_rate": 1.822378310931018e-05, + "loss": 0.2328, + "step": 12400 + }, + { + "epoch": 1.887237197277877, + "grad_norm": 1.553642622000155, + "learning_rate": 1.8181207144698686e-05, + "loss": 0.2453, + "step": 12410 + }, + { + "epoch": 1.8887579363570697, + "grad_norm": 2.8591715099488275, + "learning_rate": 1.8138652535725213e-05, + "loss": 0.2305, + "step": 12420 + }, + { + "epoch": 1.890278675436262, + "grad_norm": 2.0416326049262334, + "learning_rate": 1.809611941566568e-05, + "loss": 0.2126, + "step": 12430 + }, + { + "epoch": 1.8917994145154546, + "grad_norm": 2.13199150392611, + "learning_rate": 1.8053607917728687e-05, + "loss": 0.2124, + "step": 12440 + }, + { + "epoch": 1.893320153594647, + "grad_norm": 1.9840212605683112, + "learning_rate": 1.8011118175055148e-05, + "loss": 0.2241, + "step": 12450 + }, + { + "epoch": 1.8948408926738396, + "grad_norm": 2.566732160670393, + "learning_rate": 1.7968650320717787e-05, + "loss": 0.2229, + "step": 12460 + }, + { + "epoch": 1.8963616317530319, + "grad_norm": 3.4687560288885106, + "learning_rate": 1.7926204487720828e-05, + "loss": 0.2223, + "step": 12470 + }, + { + "epoch": 1.8978823708322246, + "grad_norm": 2.0150702055919507, + "learning_rate": 1.78837808089995e-05, + "loss": 0.2323, + "step": 12480 + }, + { + "epoch": 1.8994031099114168, + "grad_norm": 2.940491453283728, + "learning_rate": 1.7841379417419667e-05, + "loss": 0.2282, + "step": 12490 + }, + { + "epoch": 1.9009238489906095, + "grad_norm": 1.7334842367759842, + "learning_rate": 1.7799000445777374e-05, + "loss": 0.2368, + "step": 12500 + }, + { + "epoch": 1.9024445880698018, + "grad_norm": 2.404853410569919, + "learning_rate": 1.7756644026798458e-05, + "loss": 0.2245, + "step": 12510 + }, + { + "epoch": 1.9039653271489945, + "grad_norm": 2.2673730076337772, + "learning_rate": 1.771431029313812e-05, + "loss": 0.25, + "step": 12520 + }, + { + "epoch": 1.905486066228187, + "grad_norm": 1.6115442949367513, + "learning_rate": 1.767199937738052e-05, + "loss": 0.2305, + "step": 12530 + }, + { + "epoch": 1.9070068053073794, + "grad_norm": 2.585824658607386, + "learning_rate": 1.762971141203836e-05, + "loss": 0.216, + "step": 12540 + }, + { + "epoch": 1.908527544386572, + "grad_norm": 2.398246204853989, + "learning_rate": 1.7587446529552425e-05, + "loss": 0.2169, + "step": 12550 + }, + { + "epoch": 1.9100482834657644, + "grad_norm": 2.2389709106553806, + "learning_rate": 1.754520486229126e-05, + "loss": 0.2221, + "step": 12560 + }, + { + "epoch": 1.9115690225449569, + "grad_norm": 2.581476999924307, + "learning_rate": 1.750298654255067e-05, + "loss": 0.2232, + "step": 12570 + }, + { + "epoch": 1.9130897616241493, + "grad_norm": 3.0012050407720885, + "learning_rate": 1.7460791702553354e-05, + "loss": 0.2351, + "step": 12580 + }, + { + "epoch": 1.9146105007033418, + "grad_norm": 2.650749181311306, + "learning_rate": 1.7418620474448467e-05, + "loss": 0.2248, + "step": 12590 + }, + { + "epoch": 1.9161312397825343, + "grad_norm": 2.6456959587150393, + "learning_rate": 1.7376472990311215e-05, + "loss": 0.2205, + "step": 12600 + }, + { + "epoch": 1.9176519788617268, + "grad_norm": 2.025927578038237, + "learning_rate": 1.733434938214245e-05, + "loss": 0.2159, + "step": 12610 + }, + { + "epoch": 1.9191727179409193, + "grad_norm": 1.7122836081407693, + "learning_rate": 1.7292249781868237e-05, + "loss": 0.2523, + "step": 12620 + }, + { + "epoch": 1.9206934570201117, + "grad_norm": 1.9252100413762843, + "learning_rate": 1.725017432133945e-05, + "loss": 0.2489, + "step": 12630 + }, + { + "epoch": 1.9222141960993042, + "grad_norm": 2.3373785745793576, + "learning_rate": 1.720812313233136e-05, + "loss": 0.1945, + "step": 12640 + }, + { + "epoch": 1.9237349351784967, + "grad_norm": 1.458729846363728, + "learning_rate": 1.7166096346543233e-05, + "loss": 0.1934, + "step": 12650 + }, + { + "epoch": 1.9252556742576892, + "grad_norm": 1.899795446609367, + "learning_rate": 1.7124094095597898e-05, + "loss": 0.2185, + "step": 12660 + }, + { + "epoch": 1.9267764133368819, + "grad_norm": 2.3625282945458292, + "learning_rate": 1.7082116511041354e-05, + "loss": 0.2259, + "step": 12670 + }, + { + "epoch": 1.9282971524160741, + "grad_norm": 2.388459664870278, + "learning_rate": 1.7040163724342328e-05, + "loss": 0.2489, + "step": 12680 + }, + { + "epoch": 1.9298178914952668, + "grad_norm": 2.1328292385622056, + "learning_rate": 1.6998235866891908e-05, + "loss": 0.2193, + "step": 12690 + }, + { + "epoch": 1.931338630574459, + "grad_norm": 2.1698400056796983, + "learning_rate": 1.6956333070003085e-05, + "loss": 0.227, + "step": 12700 + }, + { + "epoch": 1.9328593696536518, + "grad_norm": 2.367986063351272, + "learning_rate": 1.6914455464910373e-05, + "loss": 0.2312, + "step": 12710 + }, + { + "epoch": 1.934380108732844, + "grad_norm": 2.145006394233981, + "learning_rate": 1.68726031827694e-05, + "loss": 0.2431, + "step": 12720 + }, + { + "epoch": 1.9359008478120368, + "grad_norm": 1.8698846562747071, + "learning_rate": 1.6830776354656462e-05, + "loss": 0.2424, + "step": 12730 + }, + { + "epoch": 1.937421586891229, + "grad_norm": 2.2540669175899737, + "learning_rate": 1.6788975111568144e-05, + "loss": 0.2144, + "step": 12740 + }, + { + "epoch": 1.9389423259704217, + "grad_norm": 2.1436845079666553, + "learning_rate": 1.6747199584420907e-05, + "loss": 0.2379, + "step": 12750 + }, + { + "epoch": 1.940463065049614, + "grad_norm": 2.4017679318494407, + "learning_rate": 1.670544990405068e-05, + "loss": 0.2094, + "step": 12760 + }, + { + "epoch": 1.9419838041288067, + "grad_norm": 3.110281133223647, + "learning_rate": 1.6663726201212425e-05, + "loss": 0.2529, + "step": 12770 + }, + { + "epoch": 1.943504543207999, + "grad_norm": 2.302117208930645, + "learning_rate": 1.662202860657976e-05, + "loss": 0.2182, + "step": 12780 + }, + { + "epoch": 1.9450252822871916, + "grad_norm": 2.2040623126010717, + "learning_rate": 1.6580357250744524e-05, + "loss": 0.2445, + "step": 12790 + }, + { + "epoch": 1.946546021366384, + "grad_norm": 2.312437507349849, + "learning_rate": 1.653871226421639e-05, + "loss": 0.1974, + "step": 12800 + }, + { + "epoch": 1.9480667604455766, + "grad_norm": 1.3102810474448412, + "learning_rate": 1.649709377742245e-05, + "loss": 0.1992, + "step": 12810 + }, + { + "epoch": 1.949587499524769, + "grad_norm": 2.865904236698949, + "learning_rate": 1.645550192070677e-05, + "loss": 0.2236, + "step": 12820 + }, + { + "epoch": 1.9511082386039615, + "grad_norm": 1.8258768453471192, + "learning_rate": 1.641393682433005e-05, + "loss": 0.2084, + "step": 12830 + }, + { + "epoch": 1.952628977683154, + "grad_norm": 2.2482196187397236, + "learning_rate": 1.637239861846916e-05, + "loss": 0.2292, + "step": 12840 + }, + { + "epoch": 1.9541497167623465, + "grad_norm": 2.2171921689302123, + "learning_rate": 1.6330887433216764e-05, + "loss": 0.2217, + "step": 12850 + }, + { + "epoch": 1.955670455841539, + "grad_norm": 2.1441975676748855, + "learning_rate": 1.628940339858089e-05, + "loss": 0.194, + "step": 12860 + }, + { + "epoch": 1.9571911949207315, + "grad_norm": 2.0653988216929653, + "learning_rate": 1.6247946644484544e-05, + "loss": 0.2236, + "step": 12870 + }, + { + "epoch": 1.958711933999924, + "grad_norm": 1.8271519771315685, + "learning_rate": 1.6206517300765284e-05, + "loss": 0.2139, + "step": 12880 + }, + { + "epoch": 1.9602326730791164, + "grad_norm": 1.875161896511899, + "learning_rate": 1.616511549717483e-05, + "loss": 0.2345, + "step": 12890 + }, + { + "epoch": 1.961753412158309, + "grad_norm": 2.5006304196411047, + "learning_rate": 1.612374136337864e-05, + "loss": 0.2453, + "step": 12900 + }, + { + "epoch": 1.9632741512375014, + "grad_norm": 2.7188046949741365, + "learning_rate": 1.6082395028955516e-05, + "loss": 0.2254, + "step": 12910 + }, + { + "epoch": 1.964794890316694, + "grad_norm": 2.6131202947458974, + "learning_rate": 1.60410766233972e-05, + "loss": 0.2267, + "step": 12920 + }, + { + "epoch": 1.9663156293958863, + "grad_norm": 2.2558759471915195, + "learning_rate": 1.5999786276107957e-05, + "loss": 0.2271, + "step": 12930 + }, + { + "epoch": 1.967836368475079, + "grad_norm": 2.04673318982872, + "learning_rate": 1.59585241164042e-05, + "loss": 0.2026, + "step": 12940 + }, + { + "epoch": 1.9693571075542713, + "grad_norm": 1.945596017311589, + "learning_rate": 1.5917290273514023e-05, + "loss": 0.2435, + "step": 12950 + }, + { + "epoch": 1.970877846633464, + "grad_norm": 2.251713793391793, + "learning_rate": 1.587608487657687e-05, + "loss": 0.1965, + "step": 12960 + }, + { + "epoch": 1.9723985857126562, + "grad_norm": 1.8083972872206615, + "learning_rate": 1.5834908054643073e-05, + "loss": 0.2369, + "step": 12970 + }, + { + "epoch": 1.973919324791849, + "grad_norm": 2.656379303746841, + "learning_rate": 1.579375993667348e-05, + "loss": 0.2445, + "step": 12980 + }, + { + "epoch": 1.9754400638710412, + "grad_norm": 2.158566456424086, + "learning_rate": 1.575264065153906e-05, + "loss": 0.2189, + "step": 12990 + }, + { + "epoch": 1.976960802950234, + "grad_norm": 3.0820274727385772, + "learning_rate": 1.571155032802043e-05, + "loss": 0.232, + "step": 13000 + }, + { + "epoch": 1.9784815420294262, + "grad_norm": 1.7087679226343748, + "learning_rate": 1.567048909480755e-05, + "loss": 0.2016, + "step": 13010 + }, + { + "epoch": 1.9800022811086189, + "grad_norm": 2.673504142365056, + "learning_rate": 1.562945708049926e-05, + "loss": 0.2222, + "step": 13020 + }, + { + "epoch": 1.9815230201878111, + "grad_norm": 2.132595809347763, + "learning_rate": 1.5588454413602887e-05, + "loss": 0.2499, + "step": 13030 + }, + { + "epoch": 1.9830437592670038, + "grad_norm": 2.5547922073682554, + "learning_rate": 1.5547481222533846e-05, + "loss": 0.2232, + "step": 13040 + }, + { + "epoch": 1.9845644983461963, + "grad_norm": 2.7824513999139087, + "learning_rate": 1.550653763561523e-05, + "loss": 0.2141, + "step": 13050 + }, + { + "epoch": 1.9860852374253888, + "grad_norm": 2.9855216580029147, + "learning_rate": 1.546562378107743e-05, + "loss": 0.2484, + "step": 13060 + }, + { + "epoch": 1.9876059765045813, + "grad_norm": 2.6834998243614137, + "learning_rate": 1.5424739787057707e-05, + "loss": 0.222, + "step": 13070 + }, + { + "epoch": 1.9891267155837737, + "grad_norm": 1.9591876666340196, + "learning_rate": 1.5383885781599817e-05, + "loss": 0.2034, + "step": 13080 + }, + { + "epoch": 1.9906474546629662, + "grad_norm": 1.61381613135114, + "learning_rate": 1.5343061892653565e-05, + "loss": 0.2133, + "step": 13090 + }, + { + "epoch": 1.9921681937421587, + "grad_norm": 1.665878946079043, + "learning_rate": 1.530226824807447e-05, + "loss": 0.2105, + "step": 13100 + }, + { + "epoch": 1.9936889328213512, + "grad_norm": 1.6101264429798263, + "learning_rate": 1.5261504975623306e-05, + "loss": 0.2154, + "step": 13110 + }, + { + "epoch": 1.9952096719005437, + "grad_norm": 1.6551517962016868, + "learning_rate": 1.5220772202965738e-05, + "loss": 0.2503, + "step": 13120 + }, + { + "epoch": 1.9967304109797361, + "grad_norm": 1.8736572589402436, + "learning_rate": 1.5180070057671902e-05, + "loss": 0.2396, + "step": 13130 + }, + { + "epoch": 1.9982511500589286, + "grad_norm": 1.7452806762718773, + "learning_rate": 1.5139398667216015e-05, + "loss": 0.1948, + "step": 13140 + }, + { + "epoch": 1.999771889138121, + "grad_norm": 1.727734261761001, + "learning_rate": 1.5098758158975979e-05, + "loss": 0.2153, + "step": 13150 + }, + { + "epoch": 2.0012926282173136, + "grad_norm": 0.8799754360526506, + "learning_rate": 1.5058148660232969e-05, + "loss": 0.1161, + "step": 13160 + }, + { + "epoch": 2.0028133672965063, + "grad_norm": 1.5801140844573125, + "learning_rate": 1.5017570298171047e-05, + "loss": 0.0852, + "step": 13170 + }, + { + "epoch": 2.0043341063756985, + "grad_norm": 1.3725613102588519, + "learning_rate": 1.4977023199876743e-05, + "loss": 0.0784, + "step": 13180 + }, + { + "epoch": 2.0058548454548912, + "grad_norm": 1.3411146994763936, + "learning_rate": 1.4936507492338698e-05, + "loss": 0.0908, + "step": 13190 + }, + { + "epoch": 2.0073755845340835, + "grad_norm": 1.320596544869458, + "learning_rate": 1.4896023302447221e-05, + "loss": 0.0854, + "step": 13200 + }, + { + "epoch": 2.008896323613276, + "grad_norm": 1.122282953775995, + "learning_rate": 1.4855570756993931e-05, + "loss": 0.0839, + "step": 13210 + }, + { + "epoch": 2.0104170626924684, + "grad_norm": 1.4538239142395946, + "learning_rate": 1.4815149982671317e-05, + "loss": 0.0846, + "step": 13220 + }, + { + "epoch": 2.011937801771661, + "grad_norm": 1.027437499176305, + "learning_rate": 1.4774761106072377e-05, + "loss": 0.0795, + "step": 13230 + }, + { + "epoch": 2.0134585408508534, + "grad_norm": 0.9417995527886438, + "learning_rate": 1.4734404253690215e-05, + "loss": 0.0787, + "step": 13240 + }, + { + "epoch": 2.014979279930046, + "grad_norm": 1.0523592672319342, + "learning_rate": 1.4694079551917629e-05, + "loss": 0.0747, + "step": 13250 + }, + { + "epoch": 2.0165000190092384, + "grad_norm": 1.8192152177014613, + "learning_rate": 1.465378712704673e-05, + "loss": 0.0708, + "step": 13260 + }, + { + "epoch": 2.018020758088431, + "grad_norm": 1.598689957872877, + "learning_rate": 1.4613527105268544e-05, + "loss": 0.084, + "step": 13270 + }, + { + "epoch": 2.0195414971676233, + "grad_norm": 1.5700608512630276, + "learning_rate": 1.4573299612672592e-05, + "loss": 0.0927, + "step": 13280 + }, + { + "epoch": 2.021062236246816, + "grad_norm": 2.927288935793796, + "learning_rate": 1.4533104775246558e-05, + "loss": 0.0943, + "step": 13290 + }, + { + "epoch": 2.0225829753260083, + "grad_norm": 1.4404956140620406, + "learning_rate": 1.4492942718875824e-05, + "loss": 0.0854, + "step": 13300 + }, + { + "epoch": 2.024103714405201, + "grad_norm": 1.302555990080665, + "learning_rate": 1.445281356934311e-05, + "loss": 0.0833, + "step": 13310 + }, + { + "epoch": 2.0256244534843932, + "grad_norm": 1.3202990830551773, + "learning_rate": 1.4412717452328084e-05, + "loss": 0.0788, + "step": 13320 + }, + { + "epoch": 2.027145192563586, + "grad_norm": 1.1208692714315232, + "learning_rate": 1.4372654493406968e-05, + "loss": 0.0951, + "step": 13330 + }, + { + "epoch": 2.028665931642778, + "grad_norm": 1.3984905985167944, + "learning_rate": 1.4332624818052099e-05, + "loss": 0.0772, + "step": 13340 + }, + { + "epoch": 2.030186670721971, + "grad_norm": 1.647760028960037, + "learning_rate": 1.4292628551631609e-05, + "loss": 0.0917, + "step": 13350 + }, + { + "epoch": 2.031707409801163, + "grad_norm": 1.0263830054724326, + "learning_rate": 1.4252665819408995e-05, + "loss": 0.0814, + "step": 13360 + }, + { + "epoch": 2.033228148880356, + "grad_norm": 1.84829888177863, + "learning_rate": 1.4212736746542716e-05, + "loss": 0.0999, + "step": 13370 + }, + { + "epoch": 2.0347488879595486, + "grad_norm": 2.4624465665717716, + "learning_rate": 1.4172841458085828e-05, + "loss": 0.0715, + "step": 13380 + }, + { + "epoch": 2.036269627038741, + "grad_norm": 0.8708291031446286, + "learning_rate": 1.4132980078985553e-05, + "loss": 0.0874, + "step": 13390 + }, + { + "epoch": 2.0377903661179335, + "grad_norm": 1.4087806607632896, + "learning_rate": 1.409315273408294e-05, + "loss": 0.0889, + "step": 13400 + }, + { + "epoch": 2.0393111051971258, + "grad_norm": 1.0861826052566301, + "learning_rate": 1.4053359548112435e-05, + "loss": 0.0699, + "step": 13410 + }, + { + "epoch": 2.0408318442763185, + "grad_norm": 2.1559600401892216, + "learning_rate": 1.4013600645701506e-05, + "loss": 0.0905, + "step": 13420 + }, + { + "epoch": 2.0423525833555107, + "grad_norm": 2.2051634623809586, + "learning_rate": 1.3973876151370244e-05, + "loss": 0.1073, + "step": 13430 + }, + { + "epoch": 2.0438733224347034, + "grad_norm": 1.5415523865825944, + "learning_rate": 1.3934186189530996e-05, + "loss": 0.0806, + "step": 13440 + }, + { + "epoch": 2.0453940615138957, + "grad_norm": 1.5835731687068992, + "learning_rate": 1.3894530884487917e-05, + "loss": 0.0901, + "step": 13450 + }, + { + "epoch": 2.0469148005930884, + "grad_norm": 1.5319752915061746, + "learning_rate": 1.3854910360436657e-05, + "loss": 0.0759, + "step": 13460 + }, + { + "epoch": 2.0484355396722806, + "grad_norm": 1.9387924300296389, + "learning_rate": 1.3815324741463931e-05, + "loss": 0.0761, + "step": 13470 + }, + { + "epoch": 2.0499562787514733, + "grad_norm": 1.3034855665746299, + "learning_rate": 1.3775774151547127e-05, + "loss": 0.0787, + "step": 13480 + }, + { + "epoch": 2.0514770178306656, + "grad_norm": 2.5376763224034646, + "learning_rate": 1.3736258714553929e-05, + "loss": 0.096, + "step": 13490 + }, + { + "epoch": 2.0529977569098583, + "grad_norm": 1.570633984909571, + "learning_rate": 1.3696778554241929e-05, + "loss": 0.0783, + "step": 13500 + }, + { + "epoch": 2.0545184959890506, + "grad_norm": 2.3250714257587615, + "learning_rate": 1.3657333794258233e-05, + "loss": 0.0968, + "step": 13510 + }, + { + "epoch": 2.0560392350682433, + "grad_norm": 1.1707694924697984, + "learning_rate": 1.3617924558139079e-05, + "loss": 0.0835, + "step": 13520 + }, + { + "epoch": 2.0575599741474355, + "grad_norm": 1.6574278041981088, + "learning_rate": 1.3578550969309459e-05, + "loss": 0.088, + "step": 13530 + }, + { + "epoch": 2.0590807132266282, + "grad_norm": 1.8970987039825085, + "learning_rate": 1.3539213151082688e-05, + "loss": 0.0999, + "step": 13540 + }, + { + "epoch": 2.0606014523058205, + "grad_norm": 1.658139282087872, + "learning_rate": 1.3499911226660089e-05, + "loss": 0.0714, + "step": 13550 + }, + { + "epoch": 2.062122191385013, + "grad_norm": 1.954094171628233, + "learning_rate": 1.3460645319130555e-05, + "loss": 0.0823, + "step": 13560 + }, + { + "epoch": 2.0636429304642054, + "grad_norm": 1.5567624267638749, + "learning_rate": 1.3421415551470181e-05, + "loss": 0.0739, + "step": 13570 + }, + { + "epoch": 2.065163669543398, + "grad_norm": 1.2722876527409706, + "learning_rate": 1.3382222046541872e-05, + "loss": 0.0945, + "step": 13580 + }, + { + "epoch": 2.0666844086225904, + "grad_norm": 1.9920820805151715, + "learning_rate": 1.3343064927094967e-05, + "loss": 0.0858, + "step": 13590 + }, + { + "epoch": 2.068205147701783, + "grad_norm": 1.6175307921310331, + "learning_rate": 1.3303944315764848e-05, + "loss": 0.0933, + "step": 13600 + }, + { + "epoch": 2.0697258867809754, + "grad_norm": 0.8039728118420898, + "learning_rate": 1.3264860335072559e-05, + "loss": 0.0809, + "step": 13610 + }, + { + "epoch": 2.071246625860168, + "grad_norm": 1.1117015933199734, + "learning_rate": 1.3225813107424429e-05, + "loss": 0.0717, + "step": 13620 + }, + { + "epoch": 2.0727673649393603, + "grad_norm": 0.941002702385723, + "learning_rate": 1.3186802755111652e-05, + "loss": 0.0714, + "step": 13630 + }, + { + "epoch": 2.074288104018553, + "grad_norm": 1.2539170235002541, + "learning_rate": 1.3147829400309965e-05, + "loss": 0.0828, + "step": 13640 + }, + { + "epoch": 2.0758088430977457, + "grad_norm": 2.2044104942877634, + "learning_rate": 1.3108893165079222e-05, + "loss": 0.0857, + "step": 13650 + }, + { + "epoch": 2.077329582176938, + "grad_norm": 4.1281705531243285, + "learning_rate": 1.3069994171363026e-05, + "loss": 0.0899, + "step": 13660 + }, + { + "epoch": 2.0788503212561307, + "grad_norm": 1.3417232162814456, + "learning_rate": 1.3031132540988331e-05, + "loss": 0.0947, + "step": 13670 + }, + { + "epoch": 2.080371060335323, + "grad_norm": 1.2593725245475709, + "learning_rate": 1.299230839566509e-05, + "loss": 0.0778, + "step": 13680 + }, + { + "epoch": 2.0818917994145156, + "grad_norm": 1.9336127523996036, + "learning_rate": 1.295352185698585e-05, + "loss": 0.0855, + "step": 13690 + }, + { + "epoch": 2.083412538493708, + "grad_norm": 1.5135239306180184, + "learning_rate": 1.2914773046425378e-05, + "loss": 0.07, + "step": 13700 + }, + { + "epoch": 2.0849332775729006, + "grad_norm": 1.3790978156470939, + "learning_rate": 1.2876062085340294e-05, + "loss": 0.0666, + "step": 13710 + }, + { + "epoch": 2.086454016652093, + "grad_norm": 1.557050048281088, + "learning_rate": 1.2837389094968643e-05, + "loss": 0.0849, + "step": 13720 + }, + { + "epoch": 2.0879747557312855, + "grad_norm": 1.24056517735308, + "learning_rate": 1.2798754196429586e-05, + "loss": 0.0878, + "step": 13730 + }, + { + "epoch": 2.089495494810478, + "grad_norm": 2.30514101534121, + "learning_rate": 1.276015751072297e-05, + "loss": 0.0991, + "step": 13740 + }, + { + "epoch": 2.0910162338896705, + "grad_norm": 2.004835354962667, + "learning_rate": 1.2721599158728964e-05, + "loss": 0.0744, + "step": 13750 + }, + { + "epoch": 2.0925369729688628, + "grad_norm": 1.9444636280163927, + "learning_rate": 1.2683079261207691e-05, + "loss": 0.0853, + "step": 13760 + }, + { + "epoch": 2.0940577120480555, + "grad_norm": 1.6084336533520782, + "learning_rate": 1.2644597938798827e-05, + "loss": 0.0997, + "step": 13770 + }, + { + "epoch": 2.0955784511272477, + "grad_norm": 1.201026396448179, + "learning_rate": 1.2606155312021239e-05, + "loss": 0.0657, + "step": 13780 + }, + { + "epoch": 2.0970991902064404, + "grad_norm": 2.0574375693079214, + "learning_rate": 1.2567751501272602e-05, + "loss": 0.0834, + "step": 13790 + }, + { + "epoch": 2.0986199292856327, + "grad_norm": 1.7447302344743607, + "learning_rate": 1.2529386626829037e-05, + "loss": 0.0906, + "step": 13800 + }, + { + "epoch": 2.1001406683648254, + "grad_norm": 1.4856590841073205, + "learning_rate": 1.2491060808844696e-05, + "loss": 0.0796, + "step": 13810 + }, + { + "epoch": 2.1016614074440176, + "grad_norm": 1.5055591539842343, + "learning_rate": 1.2452774167351428e-05, + "loss": 0.0835, + "step": 13820 + }, + { + "epoch": 2.1031821465232103, + "grad_norm": 2.1946284246524397, + "learning_rate": 1.2414526822258388e-05, + "loss": 0.0585, + "step": 13830 + }, + { + "epoch": 2.1047028856024026, + "grad_norm": 1.4726327351851431, + "learning_rate": 1.2376318893351654e-05, + "loss": 0.0921, + "step": 13840 + }, + { + "epoch": 2.1062236246815953, + "grad_norm": 1.3564759766359273, + "learning_rate": 1.2338150500293854e-05, + "loss": 0.0814, + "step": 13850 + }, + { + "epoch": 2.1077443637607876, + "grad_norm": 1.2248913338303713, + "learning_rate": 1.2300021762623803e-05, + "loss": 0.0801, + "step": 13860 + }, + { + "epoch": 2.1092651028399803, + "grad_norm": 1.369916304002182, + "learning_rate": 1.2261932799756123e-05, + "loss": 0.0653, + "step": 13870 + }, + { + "epoch": 2.1107858419191725, + "grad_norm": 2.0949384948732455, + "learning_rate": 1.2223883730980843e-05, + "loss": 0.0679, + "step": 13880 + }, + { + "epoch": 2.112306580998365, + "grad_norm": 1.5748154134533177, + "learning_rate": 1.2185874675463066e-05, + "loss": 0.085, + "step": 13890 + }, + { + "epoch": 2.113827320077558, + "grad_norm": 1.815919663613788, + "learning_rate": 1.2147905752242582e-05, + "loss": 0.0877, + "step": 13900 + }, + { + "epoch": 2.11534805915675, + "grad_norm": 1.6050058314408222, + "learning_rate": 1.2109977080233486e-05, + "loss": 0.0882, + "step": 13910 + }, + { + "epoch": 2.116868798235943, + "grad_norm": 1.3039524288568387, + "learning_rate": 1.2072088778223819e-05, + "loss": 0.0649, + "step": 13920 + }, + { + "epoch": 2.118389537315135, + "grad_norm": 2.4629271488702718, + "learning_rate": 1.2034240964875163e-05, + "loss": 0.0828, + "step": 13930 + }, + { + "epoch": 2.119910276394328, + "grad_norm": 1.444328611181193, + "learning_rate": 1.1996433758722317e-05, + "loss": 0.0788, + "step": 13940 + }, + { + "epoch": 2.12143101547352, + "grad_norm": 1.498340413213286, + "learning_rate": 1.1958667278172897e-05, + "loss": 0.0917, + "step": 13950 + }, + { + "epoch": 2.122951754552713, + "grad_norm": 1.1273225291524156, + "learning_rate": 1.1920941641506977e-05, + "loss": 0.0803, + "step": 13960 + }, + { + "epoch": 2.124472493631905, + "grad_norm": 1.0842674630893712, + "learning_rate": 1.1883256966876705e-05, + "loss": 0.0866, + "step": 13970 + }, + { + "epoch": 2.1259932327110977, + "grad_norm": 2.1024998797741934, + "learning_rate": 1.1845613372305952e-05, + "loss": 0.0838, + "step": 13980 + }, + { + "epoch": 2.12751397179029, + "grad_norm": 1.843662919366999, + "learning_rate": 1.1808010975689907e-05, + "loss": 0.0715, + "step": 13990 + }, + { + "epoch": 2.1290347108694827, + "grad_norm": 1.7383168207990838, + "learning_rate": 1.1770449894794752e-05, + "loss": 0.0952, + "step": 14000 + }, + { + "epoch": 2.130555449948675, + "grad_norm": 1.559631355809642, + "learning_rate": 1.1732930247257274e-05, + "loss": 0.0781, + "step": 14010 + }, + { + "epoch": 2.1320761890278677, + "grad_norm": 2.0154364763862573, + "learning_rate": 1.1695452150584484e-05, + "loss": 0.0739, + "step": 14020 + }, + { + "epoch": 2.13359692810706, + "grad_norm": 1.731899280985556, + "learning_rate": 1.1658015722153276e-05, + "loss": 0.0747, + "step": 14030 + }, + { + "epoch": 2.1351176671862526, + "grad_norm": 1.3471281915282276, + "learning_rate": 1.1620621079210031e-05, + "loss": 0.0614, + "step": 14040 + }, + { + "epoch": 2.136638406265445, + "grad_norm": 1.9274069732383192, + "learning_rate": 1.1583268338870264e-05, + "loss": 0.0721, + "step": 14050 + }, + { + "epoch": 2.1381591453446376, + "grad_norm": 1.4371215010749696, + "learning_rate": 1.1545957618118263e-05, + "loss": 0.0748, + "step": 14060 + }, + { + "epoch": 2.13967988442383, + "grad_norm": 1.960550659259583, + "learning_rate": 1.1508689033806725e-05, + "loss": 0.0927, + "step": 14070 + }, + { + "epoch": 2.1412006235030225, + "grad_norm": 1.2128788450490324, + "learning_rate": 1.1471462702656339e-05, + "loss": 0.0817, + "step": 14080 + }, + { + "epoch": 2.142721362582215, + "grad_norm": 1.289549232536518, + "learning_rate": 1.1434278741255508e-05, + "loss": 0.0856, + "step": 14090 + }, + { + "epoch": 2.1442421016614075, + "grad_norm": 1.6253800590969019, + "learning_rate": 1.139713726605992e-05, + "loss": 0.0765, + "step": 14100 + }, + { + "epoch": 2.1457628407405998, + "grad_norm": 1.5273949387250885, + "learning_rate": 1.13600383933922e-05, + "loss": 0.0732, + "step": 14110 + }, + { + "epoch": 2.1472835798197925, + "grad_norm": 1.3955099640385364, + "learning_rate": 1.1322982239441551e-05, + "loss": 0.0826, + "step": 14120 + }, + { + "epoch": 2.1488043188989847, + "grad_norm": 0.9616657208099416, + "learning_rate": 1.1285968920263385e-05, + "loss": 0.0853, + "step": 14130 + }, + { + "epoch": 2.1503250579781774, + "grad_norm": 1.2109060282475896, + "learning_rate": 1.1248998551778957e-05, + "loss": 0.0823, + "step": 14140 + }, + { + "epoch": 2.15184579705737, + "grad_norm": 2.1010760190681808, + "learning_rate": 1.1212071249775013e-05, + "loss": 0.0847, + "step": 14150 + }, + { + "epoch": 2.1533665361365624, + "grad_norm": 1.1186763348128428, + "learning_rate": 1.1175187129903423e-05, + "loss": 0.0783, + "step": 14160 + }, + { + "epoch": 2.1548872752157546, + "grad_norm": 1.2845629207714808, + "learning_rate": 1.1138346307680788e-05, + "loss": 0.087, + "step": 14170 + }, + { + "epoch": 2.1564080142949473, + "grad_norm": 1.9809210904387808, + "learning_rate": 1.1101548898488137e-05, + "loss": 0.0899, + "step": 14180 + }, + { + "epoch": 2.15792875337414, + "grad_norm": 1.9793693695192063, + "learning_rate": 1.1064795017570525e-05, + "loss": 0.0808, + "step": 14190 + }, + { + "epoch": 2.1594494924533323, + "grad_norm": 1.0088262197588462, + "learning_rate": 1.1028084780036681e-05, + "loss": 0.0923, + "step": 14200 + }, + { + "epoch": 2.160970231532525, + "grad_norm": 1.4455385491089217, + "learning_rate": 1.0991418300858648e-05, + "loss": 0.0788, + "step": 14210 + }, + { + "epoch": 2.1624909706117172, + "grad_norm": 1.554710714168596, + "learning_rate": 1.095479569487142e-05, + "loss": 0.0794, + "step": 14220 + }, + { + "epoch": 2.16401170969091, + "grad_norm": 1.0831579015333144, + "learning_rate": 1.0918217076772591e-05, + "loss": 0.0604, + "step": 14230 + }, + { + "epoch": 2.165532448770102, + "grad_norm": 1.1294992634713061, + "learning_rate": 1.0881682561121989e-05, + "loss": 0.0866, + "step": 14240 + }, + { + "epoch": 2.167053187849295, + "grad_norm": 0.9949832926852767, + "learning_rate": 1.0845192262341322e-05, + "loss": 0.0823, + "step": 14250 + }, + { + "epoch": 2.168573926928487, + "grad_norm": 1.106081857275583, + "learning_rate": 1.0808746294713795e-05, + "loss": 0.0713, + "step": 14260 + }, + { + "epoch": 2.17009466600768, + "grad_norm": 0.9476585258010604, + "learning_rate": 1.0772344772383799e-05, + "loss": 0.1073, + "step": 14270 + }, + { + "epoch": 2.171615405086872, + "grad_norm": 1.0137858864627292, + "learning_rate": 1.0735987809356518e-05, + "loss": 0.0716, + "step": 14280 + }, + { + "epoch": 2.173136144166065, + "grad_norm": 1.3406177210227552, + "learning_rate": 1.0699675519497582e-05, + "loss": 0.0726, + "step": 14290 + }, + { + "epoch": 2.174656883245257, + "grad_norm": 1.8388932151980188, + "learning_rate": 1.0663408016532708e-05, + "loss": 0.0866, + "step": 14300 + }, + { + "epoch": 2.17617762232445, + "grad_norm": 2.1876140814812652, + "learning_rate": 1.062718541404735e-05, + "loss": 0.0995, + "step": 14310 + }, + { + "epoch": 2.177698361403642, + "grad_norm": 1.3865360749796154, + "learning_rate": 1.0591007825486335e-05, + "loss": 0.0697, + "step": 14320 + }, + { + "epoch": 2.1792191004828347, + "grad_norm": 1.045501975781109, + "learning_rate": 1.0554875364153512e-05, + "loss": 0.0674, + "step": 14330 + }, + { + "epoch": 2.180739839562027, + "grad_norm": 1.4094883511858418, + "learning_rate": 1.0518788143211413e-05, + "loss": 0.075, + "step": 14340 + }, + { + "epoch": 2.1822605786412197, + "grad_norm": 1.2788471594232682, + "learning_rate": 1.0482746275680843e-05, + "loss": 0.084, + "step": 14350 + }, + { + "epoch": 2.183781317720412, + "grad_norm": 1.834211081727854, + "learning_rate": 1.0446749874440603e-05, + "loss": 0.0855, + "step": 14360 + }, + { + "epoch": 2.1853020567996047, + "grad_norm": 2.231091031043873, + "learning_rate": 1.0410799052227089e-05, + "loss": 0.0799, + "step": 14370 + }, + { + "epoch": 2.186822795878797, + "grad_norm": 1.963534651077372, + "learning_rate": 1.0374893921633941e-05, + "loss": 0.0834, + "step": 14380 + }, + { + "epoch": 2.1883435349579896, + "grad_norm": 1.364606602553275, + "learning_rate": 1.0339034595111707e-05, + "loss": 0.0868, + "step": 14390 + }, + { + "epoch": 2.189864274037182, + "grad_norm": 1.3606665252565466, + "learning_rate": 1.030322118496748e-05, + "loss": 0.0723, + "step": 14400 + }, + { + "epoch": 2.1913850131163746, + "grad_norm": 0.9967535642573017, + "learning_rate": 1.0267453803364546e-05, + "loss": 0.082, + "step": 14410 + }, + { + "epoch": 2.192905752195567, + "grad_norm": 2.160880489095367, + "learning_rate": 1.0231732562322046e-05, + "loss": 0.0927, + "step": 14420 + }, + { + "epoch": 2.1944264912747595, + "grad_norm": 0.8613915409836699, + "learning_rate": 1.0196057573714585e-05, + "loss": 0.0744, + "step": 14430 + }, + { + "epoch": 2.1959472303539522, + "grad_norm": 1.7920159374040712, + "learning_rate": 1.0160428949271943e-05, + "loss": 0.0861, + "step": 14440 + }, + { + "epoch": 2.1974679694331445, + "grad_norm": 1.5191651724128894, + "learning_rate": 1.0124846800578683e-05, + "loss": 0.0918, + "step": 14450 + }, + { + "epoch": 2.198988708512337, + "grad_norm": 0.6149657119770483, + "learning_rate": 1.0089311239073804e-05, + "loss": 0.0795, + "step": 14460 + }, + { + "epoch": 2.2005094475915294, + "grad_norm": 1.6808841483016506, + "learning_rate": 1.0053822376050415e-05, + "loss": 0.0789, + "step": 14470 + }, + { + "epoch": 2.202030186670722, + "grad_norm": 0.930658857381887, + "learning_rate": 1.0018380322655344e-05, + "loss": 0.0795, + "step": 14480 + }, + { + "epoch": 2.2035509257499144, + "grad_norm": 2.340315905389011, + "learning_rate": 9.982985189888847e-06, + "loss": 0.0848, + "step": 14490 + }, + { + "epoch": 2.205071664829107, + "grad_norm": 2.0334396769154695, + "learning_rate": 9.947637088604209e-06, + "loss": 0.0692, + "step": 14500 + }, + { + "epoch": 2.2065924039082994, + "grad_norm": 0.9378570796760217, + "learning_rate": 9.912336129507433e-06, + "loss": 0.0719, + "step": 14510 + }, + { + "epoch": 2.208113142987492, + "grad_norm": 0.7360573801745955, + "learning_rate": 9.877082423156877e-06, + "loss": 0.0759, + "step": 14520 + }, + { + "epoch": 2.2096338820666843, + "grad_norm": 1.1352239299928393, + "learning_rate": 9.84187607996289e-06, + "loss": 0.0664, + "step": 14530 + }, + { + "epoch": 2.211154621145877, + "grad_norm": 1.6185751341791907, + "learning_rate": 9.8067172101875e-06, + "loss": 0.0738, + "step": 14540 + }, + { + "epoch": 2.2126753602250693, + "grad_norm": 1.2043177633882376, + "learning_rate": 9.77160592394406e-06, + "loss": 0.0616, + "step": 14550 + }, + { + "epoch": 2.214196099304262, + "grad_norm": 1.541292406998891, + "learning_rate": 9.736542331196888e-06, + "loss": 0.0803, + "step": 14560 + }, + { + "epoch": 2.2157168383834542, + "grad_norm": 1.1292714520847973, + "learning_rate": 9.701526541760933e-06, + "loss": 0.0727, + "step": 14570 + }, + { + "epoch": 2.217237577462647, + "grad_norm": 1.2281766235822928, + "learning_rate": 9.66655866530143e-06, + "loss": 0.0774, + "step": 14580 + }, + { + "epoch": 2.218758316541839, + "grad_norm": 1.3330423214241516, + "learning_rate": 9.63163881133356e-06, + "loss": 0.0743, + "step": 14590 + }, + { + "epoch": 2.220279055621032, + "grad_norm": 1.3419972126158786, + "learning_rate": 9.596767089222094e-06, + "loss": 0.0896, + "step": 14600 + }, + { + "epoch": 2.221799794700224, + "grad_norm": 1.0939077925032277, + "learning_rate": 9.561943608181081e-06, + "loss": 0.0842, + "step": 14610 + }, + { + "epoch": 2.223320533779417, + "grad_norm": 1.3938709532688216, + "learning_rate": 9.527168477273448e-06, + "loss": 0.0925, + "step": 14620 + }, + { + "epoch": 2.224841272858609, + "grad_norm": 1.15694178350779, + "learning_rate": 9.492441805410732e-06, + "loss": 0.0759, + "step": 14630 + }, + { + "epoch": 2.226362011937802, + "grad_norm": 1.4236211409289892, + "learning_rate": 9.457763701352679e-06, + "loss": 0.0777, + "step": 14640 + }, + { + "epoch": 2.227882751016994, + "grad_norm": 1.4677772508994706, + "learning_rate": 9.423134273706941e-06, + "loss": 0.0846, + "step": 14650 + }, + { + "epoch": 2.2294034900961868, + "grad_norm": 1.7261599587336731, + "learning_rate": 9.38855363092871e-06, + "loss": 0.0776, + "step": 14660 + }, + { + "epoch": 2.230924229175379, + "grad_norm": 1.6087557690137484, + "learning_rate": 9.354021881320397e-06, + "loss": 0.07, + "step": 14670 + }, + { + "epoch": 2.2324449682545717, + "grad_norm": 1.4705390691753955, + "learning_rate": 9.319539133031282e-06, + "loss": 0.0845, + "step": 14680 + }, + { + "epoch": 2.2339657073337644, + "grad_norm": 1.7112684967904457, + "learning_rate": 9.285105494057182e-06, + "loss": 0.0756, + "step": 14690 + }, + { + "epoch": 2.2354864464129567, + "grad_norm": 1.869469030430834, + "learning_rate": 9.250721072240115e-06, + "loss": 0.0862, + "step": 14700 + }, + { + "epoch": 2.2370071854921494, + "grad_norm": 2.3779313921731053, + "learning_rate": 9.21638597526793e-06, + "loss": 0.0805, + "step": 14710 + }, + { + "epoch": 2.2385279245713416, + "grad_norm": 1.920040966478369, + "learning_rate": 9.182100310674028e-06, + "loss": 0.0768, + "step": 14720 + }, + { + "epoch": 2.2400486636505343, + "grad_norm": 2.2564202464944367, + "learning_rate": 9.147864185836976e-06, + "loss": 0.0796, + "step": 14730 + }, + { + "epoch": 2.2415694027297266, + "grad_norm": 0.8909745046011474, + "learning_rate": 9.113677707980201e-06, + "loss": 0.0745, + "step": 14740 + }, + { + "epoch": 2.2430901418089193, + "grad_norm": 1.3658929399065485, + "learning_rate": 9.079540984171628e-06, + "loss": 0.07, + "step": 14750 + }, + { + "epoch": 2.2446108808881116, + "grad_norm": 1.497039097428028, + "learning_rate": 9.04545412132337e-06, + "loss": 0.0854, + "step": 14760 + }, + { + "epoch": 2.2461316199673043, + "grad_norm": 1.4318791528550605, + "learning_rate": 9.011417226191371e-06, + "loss": 0.0823, + "step": 14770 + }, + { + "epoch": 2.2476523590464965, + "grad_norm": 1.7142848344409756, + "learning_rate": 8.977430405375093e-06, + "loss": 0.0899, + "step": 14780 + }, + { + "epoch": 2.249173098125689, + "grad_norm": 1.415102386789473, + "learning_rate": 8.94349376531717e-06, + "loss": 0.0743, + "step": 14790 + }, + { + "epoch": 2.2506938372048815, + "grad_norm": 1.3147727706911585, + "learning_rate": 8.909607412303056e-06, + "loss": 0.0831, + "step": 14800 + }, + { + "epoch": 2.252214576284074, + "grad_norm": 2.0470768251652776, + "learning_rate": 8.875771452460738e-06, + "loss": 0.0714, + "step": 14810 + }, + { + "epoch": 2.2537353153632664, + "grad_norm": 1.375767902720006, + "learning_rate": 8.841985991760363e-06, + "loss": 0.0712, + "step": 14820 + }, + { + "epoch": 2.255256054442459, + "grad_norm": 1.8901055717658843, + "learning_rate": 8.80825113601393e-06, + "loss": 0.0749, + "step": 14830 + }, + { + "epoch": 2.2567767935216514, + "grad_norm": 1.7256294818547173, + "learning_rate": 8.774566990874942e-06, + "loss": 0.0872, + "step": 14840 + }, + { + "epoch": 2.258297532600844, + "grad_norm": 1.788326867734765, + "learning_rate": 8.740933661838082e-06, + "loss": 0.0679, + "step": 14850 + }, + { + "epoch": 2.2598182716800364, + "grad_norm": 0.9692118760848188, + "learning_rate": 8.70735125423889e-06, + "loss": 0.0746, + "step": 14860 + }, + { + "epoch": 2.261339010759229, + "grad_norm": 2.6886103238880157, + "learning_rate": 8.673819873253425e-06, + "loss": 0.0765, + "step": 14870 + }, + { + "epoch": 2.2628597498384213, + "grad_norm": 1.577808220397971, + "learning_rate": 8.640339623897936e-06, + "loss": 0.0642, + "step": 14880 + }, + { + "epoch": 2.264380488917614, + "grad_norm": 1.423862075456549, + "learning_rate": 8.606910611028517e-06, + "loss": 0.0791, + "step": 14890 + }, + { + "epoch": 2.2659012279968063, + "grad_norm": 1.1984404853865682, + "learning_rate": 8.57353293934082e-06, + "loss": 0.0692, + "step": 14900 + }, + { + "epoch": 2.267421967075999, + "grad_norm": 1.147883650867586, + "learning_rate": 8.540206713369694e-06, + "loss": 0.0762, + "step": 14910 + }, + { + "epoch": 2.2689427061551912, + "grad_norm": 1.9455538406474022, + "learning_rate": 8.506932037488862e-06, + "loss": 0.0826, + "step": 14920 + }, + { + "epoch": 2.270463445234384, + "grad_norm": 1.4399193919628854, + "learning_rate": 8.473709015910605e-06, + "loss": 0.0848, + "step": 14930 + }, + { + "epoch": 2.2719841843135766, + "grad_norm": 1.5281482079537057, + "learning_rate": 8.440537752685426e-06, + "loss": 0.0841, + "step": 14940 + }, + { + "epoch": 2.273504923392769, + "grad_norm": 1.071668107447664, + "learning_rate": 8.407418351701721e-06, + "loss": 0.0767, + "step": 14950 + }, + { + "epoch": 2.275025662471961, + "grad_norm": 1.3303327771287778, + "learning_rate": 8.374350916685478e-06, + "loss": 0.077, + "step": 14960 + }, + { + "epoch": 2.276546401551154, + "grad_norm": 1.216868278503513, + "learning_rate": 8.341335551199902e-06, + "loss": 0.0703, + "step": 14970 + }, + { + "epoch": 2.2780671406303465, + "grad_norm": 1.145695281026218, + "learning_rate": 8.308372358645156e-06, + "loss": 0.0767, + "step": 14980 + }, + { + "epoch": 2.279587879709539, + "grad_norm": 1.489962551668301, + "learning_rate": 8.275461442257981e-06, + "loss": 0.0853, + "step": 14990 + }, + { + "epoch": 2.2811086187887315, + "grad_norm": 1.0877242581060296, + "learning_rate": 8.242602905111408e-06, + "loss": 0.0888, + "step": 15000 + }, + { + "epoch": 2.2826293578679238, + "grad_norm": 1.5397867354402757, + "learning_rate": 8.209796850114424e-06, + "loss": 0.0758, + "step": 15010 + }, + { + "epoch": 2.2841500969471165, + "grad_norm": 2.159941061670381, + "learning_rate": 8.177043380011629e-06, + "loss": 0.074, + "step": 15020 + }, + { + "epoch": 2.2856708360263087, + "grad_norm": 1.531081959820059, + "learning_rate": 8.144342597382953e-06, + "loss": 0.0691, + "step": 15030 + }, + { + "epoch": 2.2871915751055014, + "grad_norm": 1.2192787415903035, + "learning_rate": 8.111694604643311e-06, + "loss": 0.0682, + "step": 15040 + }, + { + "epoch": 2.2887123141846937, + "grad_norm": 2.003703920479949, + "learning_rate": 8.079099504042284e-06, + "loss": 0.0888, + "step": 15050 + }, + { + "epoch": 2.2902330532638864, + "grad_norm": 1.025868534013282, + "learning_rate": 8.046557397663797e-06, + "loss": 0.0785, + "step": 15060 + }, + { + "epoch": 2.2917537923430786, + "grad_norm": 1.5694801849305755, + "learning_rate": 8.014068387425824e-06, + "loss": 0.0772, + "step": 15070 + }, + { + "epoch": 2.2932745314222713, + "grad_norm": 0.9972724353761387, + "learning_rate": 7.981632575080008e-06, + "loss": 0.0817, + "step": 15080 + }, + { + "epoch": 2.2947952705014636, + "grad_norm": 2.6943774884046063, + "learning_rate": 7.949250062211423e-06, + "loss": 0.0878, + "step": 15090 + }, + { + "epoch": 2.2963160095806563, + "grad_norm": 1.1678232585063222, + "learning_rate": 7.916920950238191e-06, + "loss": 0.0728, + "step": 15100 + }, + { + "epoch": 2.2978367486598485, + "grad_norm": 2.748751943602176, + "learning_rate": 7.884645340411207e-06, + "loss": 0.0757, + "step": 15110 + }, + { + "epoch": 2.2993574877390413, + "grad_norm": 2.474778175392585, + "learning_rate": 7.852423333813785e-06, + "loss": 0.0888, + "step": 15120 + }, + { + "epoch": 2.3008782268182335, + "grad_norm": 0.9637202260330902, + "learning_rate": 7.820255031361373e-06, + "loss": 0.0775, + "step": 15130 + }, + { + "epoch": 2.302398965897426, + "grad_norm": 1.5359496885021875, + "learning_rate": 7.788140533801219e-06, + "loss": 0.0856, + "step": 15140 + }, + { + "epoch": 2.3039197049766185, + "grad_norm": 1.6996739424155856, + "learning_rate": 7.756079941712074e-06, + "loss": 0.0829, + "step": 15150 + }, + { + "epoch": 2.305440444055811, + "grad_norm": 1.0594329860059375, + "learning_rate": 7.72407335550383e-06, + "loss": 0.0783, + "step": 15160 + }, + { + "epoch": 2.3069611831350034, + "grad_norm": 1.3984790801102625, + "learning_rate": 7.692120875417272e-06, + "loss": 0.0749, + "step": 15170 + }, + { + "epoch": 2.308481922214196, + "grad_norm": 1.1530359588054873, + "learning_rate": 7.660222601523717e-06, + "loss": 0.0677, + "step": 15180 + }, + { + "epoch": 2.310002661293389, + "grad_norm": 2.2936764096716815, + "learning_rate": 7.628378633724717e-06, + "loss": 0.0905, + "step": 15190 + }, + { + "epoch": 2.311523400372581, + "grad_norm": 1.5438777173386609, + "learning_rate": 7.596589071751748e-06, + "loss": 0.0758, + "step": 15200 + }, + { + "epoch": 2.3130441394517733, + "grad_norm": 1.504174577516658, + "learning_rate": 7.564854015165887e-06, + "loss": 0.0745, + "step": 15210 + }, + { + "epoch": 2.314564878530966, + "grad_norm": 1.6011094407288464, + "learning_rate": 7.533173563357506e-06, + "loss": 0.0789, + "step": 15220 + }, + { + "epoch": 2.3160856176101587, + "grad_norm": 1.0023368952044847, + "learning_rate": 7.5015478155459725e-06, + "loss": 0.0587, + "step": 15230 + }, + { + "epoch": 2.317606356689351, + "grad_norm": 1.559100180887192, + "learning_rate": 7.469976870779322e-06, + "loss": 0.1002, + "step": 15240 + }, + { + "epoch": 2.3191270957685437, + "grad_norm": 1.3973032161689798, + "learning_rate": 7.4384608279339355e-06, + "loss": 0.0727, + "step": 15250 + }, + { + "epoch": 2.320647834847736, + "grad_norm": 1.4421948906013595, + "learning_rate": 7.406999785714275e-06, + "loss": 0.0698, + "step": 15260 + }, + { + "epoch": 2.3221685739269287, + "grad_norm": 1.901457156002241, + "learning_rate": 7.375593842652534e-06, + "loss": 0.0836, + "step": 15270 + }, + { + "epoch": 2.323689313006121, + "grad_norm": 1.3868881247334626, + "learning_rate": 7.344243097108341e-06, + "loss": 0.0781, + "step": 15280 + }, + { + "epoch": 2.3252100520853136, + "grad_norm": 1.6570426223001582, + "learning_rate": 7.312947647268462e-06, + "loss": 0.0874, + "step": 15290 + }, + { + "epoch": 2.326730791164506, + "grad_norm": 1.5949448587696051, + "learning_rate": 7.2817075911464725e-06, + "loss": 0.0771, + "step": 15300 + }, + { + "epoch": 2.3282515302436986, + "grad_norm": 2.0819049937521523, + "learning_rate": 7.2505230265824645e-06, + "loss": 0.0842, + "step": 15310 + }, + { + "epoch": 2.329772269322891, + "grad_norm": 1.2022239996152702, + "learning_rate": 7.219394051242742e-06, + "loss": 0.0768, + "step": 15320 + }, + { + "epoch": 2.3312930084020835, + "grad_norm": 1.9403258656958948, + "learning_rate": 7.188320762619513e-06, + "loss": 0.0785, + "step": 15330 + }, + { + "epoch": 2.332813747481276, + "grad_norm": 1.5789142977086315, + "learning_rate": 7.1573032580305625e-06, + "loss": 0.0767, + "step": 15340 + }, + { + "epoch": 2.3343344865604685, + "grad_norm": 1.4197018089430316, + "learning_rate": 7.126341634618983e-06, + "loss": 0.0837, + "step": 15350 + }, + { + "epoch": 2.3358552256396607, + "grad_norm": 1.7941130994782886, + "learning_rate": 7.09543598935285e-06, + "loss": 0.0842, + "step": 15360 + }, + { + "epoch": 2.3373759647188534, + "grad_norm": 1.7750412870393282, + "learning_rate": 7.0645864190249265e-06, + "loss": 0.0749, + "step": 15370 + }, + { + "epoch": 2.3388967037980457, + "grad_norm": 1.096770018982687, + "learning_rate": 7.03379302025235e-06, + "loss": 0.063, + "step": 15380 + }, + { + "epoch": 2.3404174428772384, + "grad_norm": 1.5459861837921394, + "learning_rate": 7.003055889476337e-06, + "loss": 0.0764, + "step": 15390 + }, + { + "epoch": 2.3419381819564307, + "grad_norm": 0.9068458235398982, + "learning_rate": 6.972375122961878e-06, + "loss": 0.0692, + "step": 15400 + }, + { + "epoch": 2.3434589210356234, + "grad_norm": 1.9835826512456294, + "learning_rate": 6.9417508167974405e-06, + "loss": 0.0731, + "step": 15410 + }, + { + "epoch": 2.3449796601148156, + "grad_norm": 1.6930352621338667, + "learning_rate": 6.9111830668946694e-06, + "loss": 0.0748, + "step": 15420 + }, + { + "epoch": 2.3465003991940083, + "grad_norm": 0.9910420585944557, + "learning_rate": 6.880671968988061e-06, + "loss": 0.0756, + "step": 15430 + }, + { + "epoch": 2.348021138273201, + "grad_norm": 2.5544920757078593, + "learning_rate": 6.8502176186347074e-06, + "loss": 0.0799, + "step": 15440 + }, + { + "epoch": 2.3495418773523933, + "grad_norm": 1.4304151550206872, + "learning_rate": 6.819820111213967e-06, + "loss": 0.0691, + "step": 15450 + }, + { + "epoch": 2.3510626164315855, + "grad_norm": 1.5700987923148602, + "learning_rate": 6.789479541927171e-06, + "loss": 0.0707, + "step": 15460 + }, + { + "epoch": 2.3525833555107782, + "grad_norm": 1.499917823839197, + "learning_rate": 6.75919600579733e-06, + "loss": 0.0714, + "step": 15470 + }, + { + "epoch": 2.354104094589971, + "grad_norm": 1.7622186889189837, + "learning_rate": 6.7289695976688325e-06, + "loss": 0.0678, + "step": 15480 + }, + { + "epoch": 2.355624833669163, + "grad_norm": 1.3008448165627606, + "learning_rate": 6.698800412207146e-06, + "loss": 0.0669, + "step": 15490 + }, + { + "epoch": 2.3571455727483555, + "grad_norm": 0.8868549184971497, + "learning_rate": 6.668688543898524e-06, + "loss": 0.0669, + "step": 15500 + }, + { + "epoch": 2.358666311827548, + "grad_norm": 1.3234575912770161, + "learning_rate": 6.6386340870497275e-06, + "loss": 0.09, + "step": 15510 + }, + { + "epoch": 2.360187050906741, + "grad_norm": 1.4535570240534006, + "learning_rate": 6.608637135787674e-06, + "loss": 0.0797, + "step": 15520 + }, + { + "epoch": 2.361707789985933, + "grad_norm": 1.2995322599584196, + "learning_rate": 6.57869778405921e-06, + "loss": 0.0814, + "step": 15530 + }, + { + "epoch": 2.363228529065126, + "grad_norm": 0.9577561399649751, + "learning_rate": 6.54881612563078e-06, + "loss": 0.0636, + "step": 15540 + }, + { + "epoch": 2.364749268144318, + "grad_norm": 1.4417329110344148, + "learning_rate": 6.518992254088144e-06, + "loss": 0.0768, + "step": 15550 + }, + { + "epoch": 2.3662700072235108, + "grad_norm": 1.8317043742088905, + "learning_rate": 6.4892262628360785e-06, + "loss": 0.0664, + "step": 15560 + }, + { + "epoch": 2.367790746302703, + "grad_norm": 0.9889417624175595, + "learning_rate": 6.45951824509807e-06, + "loss": 0.0616, + "step": 15570 + }, + { + "epoch": 2.3693114853818957, + "grad_norm": 1.1479575308283045, + "learning_rate": 6.429868293916058e-06, + "loss": 0.0715, + "step": 15580 + }, + { + "epoch": 2.370832224461088, + "grad_norm": 2.2121976279079827, + "learning_rate": 6.400276502150121e-06, + "loss": 0.0821, + "step": 15590 + }, + { + "epoch": 2.3723529635402807, + "grad_norm": 1.1836697414898847, + "learning_rate": 6.370742962478188e-06, + "loss": 0.0581, + "step": 15600 + }, + { + "epoch": 2.373873702619473, + "grad_norm": 1.665066241266222, + "learning_rate": 6.3412677673957535e-06, + "loss": 0.075, + "step": 15610 + }, + { + "epoch": 2.3753944416986656, + "grad_norm": 1.2414904371210682, + "learning_rate": 6.311851009215563e-06, + "loss": 0.0705, + "step": 15620 + }, + { + "epoch": 2.376915180777858, + "grad_norm": 1.1199650889023747, + "learning_rate": 6.28249278006737e-06, + "loss": 0.062, + "step": 15630 + }, + { + "epoch": 2.3784359198570506, + "grad_norm": 1.47088931267413, + "learning_rate": 6.253193171897611e-06, + "loss": 0.0604, + "step": 15640 + }, + { + "epoch": 2.379956658936243, + "grad_norm": 1.2312613268036232, + "learning_rate": 6.223952276469136e-06, + "loss": 0.0538, + "step": 15650 + }, + { + "epoch": 2.3814773980154356, + "grad_norm": 1.149484743993302, + "learning_rate": 6.194770185360899e-06, + "loss": 0.0731, + "step": 15660 + }, + { + "epoch": 2.382998137094628, + "grad_norm": 1.1406424960983295, + "learning_rate": 6.1656469899676995e-06, + "loss": 0.0796, + "step": 15670 + }, + { + "epoch": 2.3845188761738205, + "grad_norm": 1.3177456539057788, + "learning_rate": 6.136582781499878e-06, + "loss": 0.0732, + "step": 15680 + }, + { + "epoch": 2.386039615253013, + "grad_norm": 1.248773717089184, + "learning_rate": 6.1075776509830425e-06, + "loss": 0.0591, + "step": 15690 + }, + { + "epoch": 2.3875603543322055, + "grad_norm": 1.39860627911742, + "learning_rate": 6.078631689257755e-06, + "loss": 0.0845, + "step": 15700 + }, + { + "epoch": 2.3890810934113977, + "grad_norm": 1.5091880679435803, + "learning_rate": 6.049744986979286e-06, + "loss": 0.067, + "step": 15710 + }, + { + "epoch": 2.3906018324905904, + "grad_norm": 1.0464924832744245, + "learning_rate": 6.02091763461731e-06, + "loss": 0.0753, + "step": 15720 + }, + { + "epoch": 2.392122571569783, + "grad_norm": 1.4134897663537427, + "learning_rate": 5.992149722455623e-06, + "loss": 0.0811, + "step": 15730 + }, + { + "epoch": 2.3936433106489754, + "grad_norm": 1.3540990601452845, + "learning_rate": 5.963441340591863e-06, + "loss": 0.0605, + "step": 15740 + }, + { + "epoch": 2.3951640497281677, + "grad_norm": 1.734180531596699, + "learning_rate": 5.93479257893722e-06, + "loss": 0.0845, + "step": 15750 + }, + { + "epoch": 2.3966847888073604, + "grad_norm": 1.6421366255830065, + "learning_rate": 5.90620352721617e-06, + "loss": 0.0693, + "step": 15760 + }, + { + "epoch": 2.398205527886553, + "grad_norm": 1.0441533763224742, + "learning_rate": 5.877674274966174e-06, + "loss": 0.0753, + "step": 15770 + }, + { + "epoch": 2.3997262669657453, + "grad_norm": 1.243015349489903, + "learning_rate": 5.849204911537428e-06, + "loss": 0.0584, + "step": 15780 + }, + { + "epoch": 2.401247006044938, + "grad_norm": 0.7472478987465554, + "learning_rate": 5.8207955260925344e-06, + "loss": 0.065, + "step": 15790 + }, + { + "epoch": 2.4027677451241303, + "grad_norm": 1.1686754261639394, + "learning_rate": 5.79244620760627e-06, + "loss": 0.0833, + "step": 15800 + }, + { + "epoch": 2.404288484203323, + "grad_norm": 1.403166052111952, + "learning_rate": 5.764157044865287e-06, + "loss": 0.0912, + "step": 15810 + }, + { + "epoch": 2.4058092232825152, + "grad_norm": 1.3251062521019288, + "learning_rate": 5.73592812646784e-06, + "loss": 0.0823, + "step": 15820 + }, + { + "epoch": 2.407329962361708, + "grad_norm": 1.0192166437026122, + "learning_rate": 5.7077595408235015e-06, + "loss": 0.0747, + "step": 15830 + }, + { + "epoch": 2.4088507014409, + "grad_norm": 1.7439301278463135, + "learning_rate": 5.679651376152883e-06, + "loss": 0.0732, + "step": 15840 + }, + { + "epoch": 2.410371440520093, + "grad_norm": 1.2873383796281948, + "learning_rate": 5.651603720487378e-06, + "loss": 0.066, + "step": 15850 + }, + { + "epoch": 2.411892179599285, + "grad_norm": 1.5984735415430806, + "learning_rate": 5.623616661668862e-06, + "loss": 0.0817, + "step": 15860 + }, + { + "epoch": 2.413412918678478, + "grad_norm": 1.728301273152382, + "learning_rate": 5.595690287349445e-06, + "loss": 0.0828, + "step": 15870 + }, + { + "epoch": 2.41493365775767, + "grad_norm": 1.419316408267827, + "learning_rate": 5.567824684991147e-06, + "loss": 0.0848, + "step": 15880 + }, + { + "epoch": 2.416454396836863, + "grad_norm": 1.389145281230967, + "learning_rate": 5.54001994186569e-06, + "loss": 0.0638, + "step": 15890 + }, + { + "epoch": 2.417975135916055, + "grad_norm": 1.439010591732099, + "learning_rate": 5.512276145054182e-06, + "loss": 0.0891, + "step": 15900 + }, + { + "epoch": 2.4194958749952478, + "grad_norm": 1.011786780906434, + "learning_rate": 5.484593381446851e-06, + "loss": 0.0715, + "step": 15910 + }, + { + "epoch": 2.42101661407444, + "grad_norm": 2.2433154646160673, + "learning_rate": 5.4569717377427775e-06, + "loss": 0.0774, + "step": 15920 + }, + { + "epoch": 2.4225373531536327, + "grad_norm": 1.695400636503717, + "learning_rate": 5.429411300449627e-06, + "loss": 0.0689, + "step": 15930 + }, + { + "epoch": 2.424058092232825, + "grad_norm": 1.2197059040273677, + "learning_rate": 5.401912155883365e-06, + "loss": 0.0744, + "step": 15940 + }, + { + "epoch": 2.4255788313120177, + "grad_norm": 1.3469202874104786, + "learning_rate": 5.374474390168005e-06, + "loss": 0.074, + "step": 15950 + }, + { + "epoch": 2.42709957039121, + "grad_norm": 1.403339963598999, + "learning_rate": 5.3470980892353305e-06, + "loss": 0.0787, + "step": 15960 + }, + { + "epoch": 2.4286203094704026, + "grad_norm": 0.821750090737834, + "learning_rate": 5.319783338824605e-06, + "loss": 0.0705, + "step": 15970 + }, + { + "epoch": 2.4301410485495953, + "grad_norm": 1.3466543540493576, + "learning_rate": 5.292530224482345e-06, + "loss": 0.0709, + "step": 15980 + }, + { + "epoch": 2.4316617876287876, + "grad_norm": 1.048793613501932, + "learning_rate": 5.265338831562019e-06, + "loss": 0.0696, + "step": 15990 + }, + { + "epoch": 2.43318252670798, + "grad_norm": 1.3827520028231233, + "learning_rate": 5.238209245223799e-06, + "loss": 0.0837, + "step": 16000 + }, + { + "epoch": 2.4347032657871726, + "grad_norm": 1.3808874366818278, + "learning_rate": 5.211141550434279e-06, + "loss": 0.0789, + "step": 16010 + }, + { + "epoch": 2.4362240048663653, + "grad_norm": 1.3233616805659318, + "learning_rate": 5.184135831966222e-06, + "loss": 0.0856, + "step": 16020 + }, + { + "epoch": 2.4377447439455575, + "grad_norm": 1.215215399289992, + "learning_rate": 5.157192174398284e-06, + "loss": 0.0757, + "step": 16030 + }, + { + "epoch": 2.43926548302475, + "grad_norm": 1.5193048377702127, + "learning_rate": 5.130310662114757e-06, + "loss": 0.067, + "step": 16040 + }, + { + "epoch": 2.4407862221039425, + "grad_norm": 2.478646997253683, + "learning_rate": 5.103491379305306e-06, + "loss": 0.0644, + "step": 16050 + }, + { + "epoch": 2.442306961183135, + "grad_norm": 1.63556433117381, + "learning_rate": 5.076734409964681e-06, + "loss": 0.0768, + "step": 16060 + }, + { + "epoch": 2.4438277002623274, + "grad_norm": 0.8728111609285438, + "learning_rate": 5.050039837892495e-06, + "loss": 0.0641, + "step": 16070 + }, + { + "epoch": 2.44534843934152, + "grad_norm": 1.2784903996986088, + "learning_rate": 5.023407746692932e-06, + "loss": 0.0661, + "step": 16080 + }, + { + "epoch": 2.4468691784207124, + "grad_norm": 0.8817262382469994, + "learning_rate": 4.9968382197744964e-06, + "loss": 0.0688, + "step": 16090 + }, + { + "epoch": 2.448389917499905, + "grad_norm": 1.8324648230482137, + "learning_rate": 4.970331340349746e-06, + "loss": 0.0668, + "step": 16100 + }, + { + "epoch": 2.4499106565790973, + "grad_norm": 1.0959669619868995, + "learning_rate": 4.9438871914350395e-06, + "loss": 0.0619, + "step": 16110 + }, + { + "epoch": 2.45143139565829, + "grad_norm": 1.256014325789025, + "learning_rate": 4.917505855850255e-06, + "loss": 0.0693, + "step": 16120 + }, + { + "epoch": 2.4529521347374823, + "grad_norm": 1.5664130355913324, + "learning_rate": 4.891187416218565e-06, + "loss": 0.0631, + "step": 16130 + }, + { + "epoch": 2.454472873816675, + "grad_norm": 0.9708455720215153, + "learning_rate": 4.864931954966151e-06, + "loss": 0.0608, + "step": 16140 + }, + { + "epoch": 2.4559936128958673, + "grad_norm": 1.6306436321335107, + "learning_rate": 4.838739554321961e-06, + "loss": 0.0758, + "step": 16150 + }, + { + "epoch": 2.45751435197506, + "grad_norm": 0.9167054488048461, + "learning_rate": 4.812610296317438e-06, + "loss": 0.0706, + "step": 16160 + }, + { + "epoch": 2.459035091054252, + "grad_norm": 1.4251104866007889, + "learning_rate": 4.7865442627862665e-06, + "loss": 0.0708, + "step": 16170 + }, + { + "epoch": 2.460555830133445, + "grad_norm": 0.9201196496229148, + "learning_rate": 4.760541535364124e-06, + "loss": 0.0712, + "step": 16180 + }, + { + "epoch": 2.462076569212637, + "grad_norm": 1.5334023751703192, + "learning_rate": 4.734602195488425e-06, + "loss": 0.0946, + "step": 16190 + }, + { + "epoch": 2.46359730829183, + "grad_norm": 1.0898215263030644, + "learning_rate": 4.708726324398055e-06, + "loss": 0.0609, + "step": 16200 + }, + { + "epoch": 2.465118047371022, + "grad_norm": 1.852641075562358, + "learning_rate": 4.682914003133132e-06, + "loss": 0.0837, + "step": 16210 + }, + { + "epoch": 2.466638786450215, + "grad_norm": 1.9021523546705155, + "learning_rate": 4.657165312534731e-06, + "loss": 0.0807, + "step": 16220 + }, + { + "epoch": 2.4681595255294075, + "grad_norm": 1.3150728483620808, + "learning_rate": 4.631480333244659e-06, + "loss": 0.0683, + "step": 16230 + }, + { + "epoch": 2.4696802646086, + "grad_norm": 1.0940708319493337, + "learning_rate": 4.605859145705166e-06, + "loss": 0.086, + "step": 16240 + }, + { + "epoch": 2.471201003687792, + "grad_norm": 0.875255757472059, + "learning_rate": 4.5803018301587315e-06, + "loss": 0.0585, + "step": 16250 + }, + { + "epoch": 2.4727217427669848, + "grad_norm": 1.1331172913328724, + "learning_rate": 4.554808466647792e-06, + "loss": 0.072, + "step": 16260 + }, + { + "epoch": 2.4742424818461775, + "grad_norm": 0.8697940669219488, + "learning_rate": 4.5293791350144855e-06, + "loss": 0.0616, + "step": 16270 + }, + { + "epoch": 2.4757632209253697, + "grad_norm": 1.2634518562709949, + "learning_rate": 4.5040139149004204e-06, + "loss": 0.0728, + "step": 16280 + }, + { + "epoch": 2.477283960004562, + "grad_norm": 1.9305682645908826, + "learning_rate": 4.478712885746414e-06, + "loss": 0.0635, + "step": 16290 + }, + { + "epoch": 2.4788046990837547, + "grad_norm": 1.1562482447063303, + "learning_rate": 4.453476126792239e-06, + "loss": 0.0604, + "step": 16300 + }, + { + "epoch": 2.4803254381629474, + "grad_norm": 1.3553157750633347, + "learning_rate": 4.428303717076385e-06, + "loss": 0.068, + "step": 16310 + }, + { + "epoch": 2.4818461772421396, + "grad_norm": 1.4394403872567403, + "learning_rate": 4.403195735435814e-06, + "loss": 0.0764, + "step": 16320 + }, + { + "epoch": 2.4833669163213323, + "grad_norm": 1.2945407742293265, + "learning_rate": 4.378152260505691e-06, + "loss": 0.0752, + "step": 16330 + }, + { + "epoch": 2.4848876554005246, + "grad_norm": 0.9199590088464841, + "learning_rate": 4.3531733707191655e-06, + "loss": 0.0647, + "step": 16340 + }, + { + "epoch": 2.4864083944797173, + "grad_norm": 2.874604505168445, + "learning_rate": 4.328259144307114e-06, + "loss": 0.0929, + "step": 16350 + }, + { + "epoch": 2.4879291335589095, + "grad_norm": 1.8773532409530773, + "learning_rate": 4.303409659297897e-06, + "loss": 0.0972, + "step": 16360 + }, + { + "epoch": 2.4894498726381022, + "grad_norm": 1.3981208564920704, + "learning_rate": 4.278624993517102e-06, + "loss": 0.0832, + "step": 16370 + }, + { + "epoch": 2.4909706117172945, + "grad_norm": 1.749483198634528, + "learning_rate": 4.253905224587324e-06, + "loss": 0.0774, + "step": 16380 + }, + { + "epoch": 2.492491350796487, + "grad_norm": 1.4026533898540186, + "learning_rate": 4.229250429927895e-06, + "loss": 0.0612, + "step": 16390 + }, + { + "epoch": 2.4940120898756795, + "grad_norm": 1.5034768263878815, + "learning_rate": 4.204660686754669e-06, + "loss": 0.0762, + "step": 16400 + }, + { + "epoch": 2.495532828954872, + "grad_norm": 1.5904693038935027, + "learning_rate": 4.180136072079763e-06, + "loss": 0.0743, + "step": 16410 + }, + { + "epoch": 2.4970535680340644, + "grad_norm": 1.2915411043379237, + "learning_rate": 4.155676662711305e-06, + "loss": 0.0645, + "step": 16420 + }, + { + "epoch": 2.498574307113257, + "grad_norm": 1.513933286573302, + "learning_rate": 4.131282535253228e-06, + "loss": 0.0799, + "step": 16430 + }, + { + "epoch": 2.5000950461924494, + "grad_norm": 2.3889805277350478, + "learning_rate": 4.106953766104998e-06, + "loss": 0.0718, + "step": 16440 + }, + { + "epoch": 2.501615785271642, + "grad_norm": 1.5831048257943319, + "learning_rate": 4.082690431461389e-06, + "loss": 0.0614, + "step": 16450 + }, + { + "epoch": 2.5031365243508343, + "grad_norm": 1.099067722378326, + "learning_rate": 4.058492607312248e-06, + "loss": 0.0494, + "step": 16460 + }, + { + "epoch": 2.504657263430027, + "grad_norm": 0.9295981369608421, + "learning_rate": 4.034360369442245e-06, + "loss": 0.0839, + "step": 16470 + }, + { + "epoch": 2.5061780025092197, + "grad_norm": 1.4305809058715393, + "learning_rate": 4.010293793430636e-06, + "loss": 0.065, + "step": 16480 + }, + { + "epoch": 2.507698741588412, + "grad_norm": 1.423191345671265, + "learning_rate": 3.98629295465105e-06, + "loss": 0.0645, + "step": 16490 + }, + { + "epoch": 2.5092194806676043, + "grad_norm": 0.9270541481073462, + "learning_rate": 3.9623579282712185e-06, + "loss": 0.0659, + "step": 16500 + }, + { + "epoch": 2.510740219746797, + "grad_norm": 0.9689438578167608, + "learning_rate": 3.938488789252756e-06, + "loss": 0.0695, + "step": 16510 + }, + { + "epoch": 2.5122609588259897, + "grad_norm": 1.3874251120626608, + "learning_rate": 3.9146856123509375e-06, + "loss": 0.0662, + "step": 16520 + }, + { + "epoch": 2.513781697905182, + "grad_norm": 1.1577671522994104, + "learning_rate": 3.890948472114442e-06, + "loss": 0.0784, + "step": 16530 + }, + { + "epoch": 2.515302436984374, + "grad_norm": 1.7823197500439127, + "learning_rate": 3.86727744288514e-06, + "loss": 0.0697, + "step": 16540 + }, + { + "epoch": 2.516823176063567, + "grad_norm": 1.1618994976327301, + "learning_rate": 3.843672598797837e-06, + "loss": 0.0726, + "step": 16550 + }, + { + "epoch": 2.5183439151427596, + "grad_norm": 1.132623482430385, + "learning_rate": 3.820134013780069e-06, + "loss": 0.0631, + "step": 16560 + }, + { + "epoch": 2.519864654221952, + "grad_norm": 1.2339998674473187, + "learning_rate": 3.796661761551845e-06, + "loss": 0.0721, + "step": 16570 + }, + { + "epoch": 2.521385393301144, + "grad_norm": 0.6912389004307292, + "learning_rate": 3.7732559156254345e-06, + "loss": 0.0653, + "step": 16580 + }, + { + "epoch": 2.522906132380337, + "grad_norm": 1.5243192474441012, + "learning_rate": 3.7499165493051323e-06, + "loss": 0.0757, + "step": 16590 + }, + { + "epoch": 2.5244268714595295, + "grad_norm": 0.9713463801374472, + "learning_rate": 3.726643735687013e-06, + "loss": 0.0639, + "step": 16600 + }, + { + "epoch": 2.5259476105387217, + "grad_norm": 2.6354343592136145, + "learning_rate": 3.7034375476587307e-06, + "loss": 0.0717, + "step": 16610 + }, + { + "epoch": 2.5274683496179144, + "grad_norm": 1.7552772846407914, + "learning_rate": 3.68029805789927e-06, + "loss": 0.0698, + "step": 16620 + }, + { + "epoch": 2.5289890886971067, + "grad_norm": 0.8416039871368718, + "learning_rate": 3.6572253388787307e-06, + "loss": 0.0645, + "step": 16630 + }, + { + "epoch": 2.5305098277762994, + "grad_norm": 1.9756814496183763, + "learning_rate": 3.634219462858088e-06, + "loss": 0.0763, + "step": 16640 + }, + { + "epoch": 2.5320305668554917, + "grad_norm": 0.8695612545016982, + "learning_rate": 3.611280501888978e-06, + "loss": 0.0721, + "step": 16650 + }, + { + "epoch": 2.5335513059346844, + "grad_norm": 1.0042363396180785, + "learning_rate": 3.5884085278134527e-06, + "loss": 0.0652, + "step": 16660 + }, + { + "epoch": 2.5350720450138766, + "grad_norm": 1.3878813555232412, + "learning_rate": 3.5656036122637904e-06, + "loss": 0.0559, + "step": 16670 + }, + { + "epoch": 2.5365927840930693, + "grad_norm": 1.49662077449546, + "learning_rate": 3.5428658266622365e-06, + "loss": 0.0775, + "step": 16680 + }, + { + "epoch": 2.5381135231722616, + "grad_norm": 1.4865317648073033, + "learning_rate": 3.5201952422208e-06, + "loss": 0.0584, + "step": 16690 + }, + { + "epoch": 2.5396342622514543, + "grad_norm": 1.2057297380771546, + "learning_rate": 3.4975919299410244e-06, + "loss": 0.072, + "step": 16700 + }, + { + "epoch": 2.5411550013306465, + "grad_norm": 1.8849535357863731, + "learning_rate": 3.475055960613749e-06, + "loss": 0.072, + "step": 16710 + }, + { + "epoch": 2.5426757404098392, + "grad_norm": 1.4492950185045972, + "learning_rate": 3.4525874048189244e-06, + "loss": 0.0804, + "step": 16720 + }, + { + "epoch": 2.544196479489032, + "grad_norm": 1.2730162342579492, + "learning_rate": 3.4301863329253612e-06, + "loss": 0.0664, + "step": 16730 + }, + { + "epoch": 2.545717218568224, + "grad_norm": 1.606226827351996, + "learning_rate": 3.4078528150905174e-06, + "loss": 0.0744, + "step": 16740 + }, + { + "epoch": 2.5472379576474165, + "grad_norm": 1.4298167806811466, + "learning_rate": 3.385586921260281e-06, + "loss": 0.0741, + "step": 16750 + }, + { + "epoch": 2.548758696726609, + "grad_norm": 1.2382665885705888, + "learning_rate": 3.3633887211687505e-06, + "loss": 0.0691, + "step": 16760 + }, + { + "epoch": 2.550279435805802, + "grad_norm": 1.66468121903113, + "learning_rate": 3.3412582843380224e-06, + "loss": 0.0616, + "step": 16770 + }, + { + "epoch": 2.551800174884994, + "grad_norm": 1.8747117316881012, + "learning_rate": 3.319195680077947e-06, + "loss": 0.073, + "step": 16780 + }, + { + "epoch": 2.5533209139641864, + "grad_norm": 1.6278549840143888, + "learning_rate": 3.297200977485948e-06, + "loss": 0.0605, + "step": 16790 + }, + { + "epoch": 2.554841653043379, + "grad_norm": 0.999474424195142, + "learning_rate": 3.275274245446788e-06, + "loss": 0.0593, + "step": 16800 + }, + { + "epoch": 2.5563623921225718, + "grad_norm": 1.5899311646727736, + "learning_rate": 3.2534155526323496e-06, + "loss": 0.0744, + "step": 16810 + }, + { + "epoch": 2.557883131201764, + "grad_norm": 1.4125020961997823, + "learning_rate": 3.2316249675014233e-06, + "loss": 0.0584, + "step": 16820 + }, + { + "epoch": 2.5594038702809563, + "grad_norm": 1.300502514473648, + "learning_rate": 3.2099025582995046e-06, + "loss": 0.0691, + "step": 16830 + }, + { + "epoch": 2.560924609360149, + "grad_norm": 1.0170316424176329, + "learning_rate": 3.188248393058557e-06, + "loss": 0.0618, + "step": 16840 + }, + { + "epoch": 2.5624453484393417, + "grad_norm": 0.9958747500069167, + "learning_rate": 3.166662539596818e-06, + "loss": 0.0655, + "step": 16850 + }, + { + "epoch": 2.563966087518534, + "grad_norm": 1.6728860243985662, + "learning_rate": 3.14514506551859e-06, + "loss": 0.0652, + "step": 16860 + }, + { + "epoch": 2.5654868265977266, + "grad_norm": 1.282414864865698, + "learning_rate": 3.1236960382139996e-06, + "loss": 0.058, + "step": 16870 + }, + { + "epoch": 2.567007565676919, + "grad_norm": 1.744735108008295, + "learning_rate": 3.10231552485882e-06, + "loss": 0.0612, + "step": 16880 + }, + { + "epoch": 2.5685283047561116, + "grad_norm": 2.4688695519532673, + "learning_rate": 3.0810035924142446e-06, + "loss": 0.065, + "step": 16890 + }, + { + "epoch": 2.570049043835304, + "grad_norm": 2.264162744634141, + "learning_rate": 3.05976030762668e-06, + "loss": 0.0798, + "step": 16900 + }, + { + "epoch": 2.5715697829144966, + "grad_norm": 1.469172818041532, + "learning_rate": 3.0385857370275356e-06, + "loss": 0.0677, + "step": 16910 + }, + { + "epoch": 2.573090521993689, + "grad_norm": 1.528312744695235, + "learning_rate": 3.0174799469330163e-06, + "loss": 0.0688, + "step": 16920 + }, + { + "epoch": 2.5746112610728815, + "grad_norm": 1.294770892350271, + "learning_rate": 2.996443003443916e-06, + "loss": 0.0694, + "step": 16930 + }, + { + "epoch": 2.5761320001520738, + "grad_norm": 1.0242486727116262, + "learning_rate": 2.975474972445405e-06, + "loss": 0.0694, + "step": 16940 + }, + { + "epoch": 2.5776527392312665, + "grad_norm": 1.54798173891042, + "learning_rate": 2.9545759196068416e-06, + "loss": 0.0586, + "step": 16950 + }, + { + "epoch": 2.5791734783104587, + "grad_norm": 1.9946225285574046, + "learning_rate": 2.933745910381527e-06, + "loss": 0.0771, + "step": 16960 + }, + { + "epoch": 2.5806942173896514, + "grad_norm": 0.9300345139508825, + "learning_rate": 2.9129850100065496e-06, + "loss": 0.0798, + "step": 16970 + }, + { + "epoch": 2.5822149564688437, + "grad_norm": 1.3374982985594406, + "learning_rate": 2.8922932835025477e-06, + "loss": 0.0631, + "step": 16980 + }, + { + "epoch": 2.5837356955480364, + "grad_norm": 1.3199204717711885, + "learning_rate": 2.8716707956735144e-06, + "loss": 0.0737, + "step": 16990 + }, + { + "epoch": 2.5852564346272286, + "grad_norm": 2.1558083033083606, + "learning_rate": 2.8511176111066048e-06, + "loss": 0.0631, + "step": 17000 + }, + { + "epoch": 2.5867771737064214, + "grad_norm": 1.8265804353401853, + "learning_rate": 2.8306337941719134e-06, + "loss": 0.0763, + "step": 17010 + }, + { + "epoch": 2.588297912785614, + "grad_norm": 0.83177600564757, + "learning_rate": 2.8102194090222856e-06, + "loss": 0.0657, + "step": 17020 + }, + { + "epoch": 2.5898186518648063, + "grad_norm": 2.1367131281709484, + "learning_rate": 2.7898745195931274e-06, + "loss": 0.0769, + "step": 17030 + }, + { + "epoch": 2.5913393909439986, + "grad_norm": 1.2633939149662006, + "learning_rate": 2.7695991896021676e-06, + "loss": 0.0534, + "step": 17040 + }, + { + "epoch": 2.5928601300231913, + "grad_norm": 2.211531308582087, + "learning_rate": 2.749393482549306e-06, + "loss": 0.0631, + "step": 17050 + }, + { + "epoch": 2.594380869102384, + "grad_norm": 1.1637012243687697, + "learning_rate": 2.7292574617163836e-06, + "loss": 0.0876, + "step": 17060 + }, + { + "epoch": 2.5959016081815762, + "grad_norm": 1.5485383113636906, + "learning_rate": 2.7091911901669903e-06, + "loss": 0.0701, + "step": 17070 + }, + { + "epoch": 2.5974223472607685, + "grad_norm": 1.2160974113484384, + "learning_rate": 2.689194730746275e-06, + "loss": 0.0623, + "step": 17080 + }, + { + "epoch": 2.598943086339961, + "grad_norm": 1.0273449406690973, + "learning_rate": 2.669268146080739e-06, + "loss": 0.0606, + "step": 17090 + }, + { + "epoch": 2.600463825419154, + "grad_norm": 1.4900259728821708, + "learning_rate": 2.6494114985780445e-06, + "loss": 0.067, + "step": 17100 + }, + { + "epoch": 2.601984564498346, + "grad_norm": 2.417492321083565, + "learning_rate": 2.6296248504268207e-06, + "loss": 0.0665, + "step": 17110 + }, + { + "epoch": 2.603505303577539, + "grad_norm": 1.478942366014328, + "learning_rate": 2.6099082635964723e-06, + "loss": 0.0624, + "step": 17120 + }, + { + "epoch": 2.605026042656731, + "grad_norm": 1.6484687819324104, + "learning_rate": 2.5902617998369656e-06, + "loss": 0.0585, + "step": 17130 + }, + { + "epoch": 2.606546781735924, + "grad_norm": 1.6749847210300874, + "learning_rate": 2.5706855206786646e-06, + "loss": 0.0628, + "step": 17140 + }, + { + "epoch": 2.608067520815116, + "grad_norm": 2.390516670881179, + "learning_rate": 2.5511794874321122e-06, + "loss": 0.0828, + "step": 17150 + }, + { + "epoch": 2.6095882598943088, + "grad_norm": 1.9319362892890666, + "learning_rate": 2.531743761187863e-06, + "loss": 0.0767, + "step": 17160 + }, + { + "epoch": 2.611108998973501, + "grad_norm": 1.1849823746191523, + "learning_rate": 2.512378402816268e-06, + "loss": 0.0622, + "step": 17170 + }, + { + "epoch": 2.6126297380526937, + "grad_norm": 1.119859476423327, + "learning_rate": 2.4930834729672948e-06, + "loss": 0.0718, + "step": 17180 + }, + { + "epoch": 2.614150477131886, + "grad_norm": 1.2884112069652924, + "learning_rate": 2.4738590320703416e-06, + "loss": 0.0826, + "step": 17190 + }, + { + "epoch": 2.6156712162110787, + "grad_norm": 1.9712609962988357, + "learning_rate": 2.4547051403340486e-06, + "loss": 0.073, + "step": 17200 + }, + { + "epoch": 2.617191955290271, + "grad_norm": 2.4558895487189734, + "learning_rate": 2.43562185774609e-06, + "loss": 0.0584, + "step": 17210 + }, + { + "epoch": 2.6187126943694636, + "grad_norm": 2.235882709139369, + "learning_rate": 2.4166092440730066e-06, + "loss": 0.0772, + "step": 17220 + }, + { + "epoch": 2.620233433448656, + "grad_norm": 1.9881243611269888, + "learning_rate": 2.39766735886002e-06, + "loss": 0.0677, + "step": 17230 + }, + { + "epoch": 2.6217541725278486, + "grad_norm": 1.7152255402446284, + "learning_rate": 2.378796261430832e-06, + "loss": 0.0838, + "step": 17240 + }, + { + "epoch": 2.623274911607041, + "grad_norm": 1.0289364556469438, + "learning_rate": 2.3599960108874525e-06, + "loss": 0.066, + "step": 17250 + }, + { + "epoch": 2.6247956506862336, + "grad_norm": 0.9147955724122397, + "learning_rate": 2.3412666661099927e-06, + "loss": 0.0573, + "step": 17260 + }, + { + "epoch": 2.6263163897654263, + "grad_norm": 1.5185166747900805, + "learning_rate": 2.3226082857565083e-06, + "loss": 0.0769, + "step": 17270 + }, + { + "epoch": 2.6278371288446185, + "grad_norm": 1.6002636303663504, + "learning_rate": 2.3040209282628013e-06, + "loss": 0.0838, + "step": 17280 + }, + { + "epoch": 2.6293578679238108, + "grad_norm": 1.9739594485493668, + "learning_rate": 2.2855046518422354e-06, + "loss": 0.0809, + "step": 17290 + }, + { + "epoch": 2.6308786070030035, + "grad_norm": 1.7154259904561626, + "learning_rate": 2.267059514485562e-06, + "loss": 0.0665, + "step": 17300 + }, + { + "epoch": 2.632399346082196, + "grad_norm": 1.8652145131768123, + "learning_rate": 2.2486855739607237e-06, + "loss": 0.0722, + "step": 17310 + }, + { + "epoch": 2.6339200851613884, + "grad_norm": 0.7409870938854066, + "learning_rate": 2.23038288781269e-06, + "loss": 0.0726, + "step": 17320 + }, + { + "epoch": 2.6354408242405807, + "grad_norm": 1.7852088490100544, + "learning_rate": 2.2121515133632754e-06, + "loss": 0.0841, + "step": 17330 + }, + { + "epoch": 2.6369615633197734, + "grad_norm": 1.4394040965091337, + "learning_rate": 2.1939915077109434e-06, + "loss": 0.0649, + "step": 17340 + }, + { + "epoch": 2.638482302398966, + "grad_norm": 1.8955376441883045, + "learning_rate": 2.1759029277306497e-06, + "loss": 0.0745, + "step": 17350 + }, + { + "epoch": 2.6400030414781583, + "grad_norm": 1.0426218029205343, + "learning_rate": 2.157885830073644e-06, + "loss": 0.0782, + "step": 17360 + }, + { + "epoch": 2.6415237805573506, + "grad_norm": 1.3990393715927554, + "learning_rate": 2.1399402711673065e-06, + "loss": 0.0645, + "step": 17370 + }, + { + "epoch": 2.6430445196365433, + "grad_norm": 2.104932952488623, + "learning_rate": 2.1220663072149704e-06, + "loss": 0.063, + "step": 17380 + }, + { + "epoch": 2.644565258715736, + "grad_norm": 1.3750844003437537, + "learning_rate": 2.1042639941957404e-06, + "loss": 0.0757, + "step": 17390 + }, + { + "epoch": 2.6460859977949283, + "grad_norm": 1.042089782664088, + "learning_rate": 2.086533387864306e-06, + "loss": 0.0674, + "step": 17400 + }, + { + "epoch": 2.647606736874121, + "grad_norm": 1.7587890275088534, + "learning_rate": 2.0688745437507984e-06, + "loss": 0.0711, + "step": 17410 + }, + { + "epoch": 2.649127475953313, + "grad_norm": 1.2972487861888373, + "learning_rate": 2.051287517160591e-06, + "loss": 0.0791, + "step": 17420 + }, + { + "epoch": 2.650648215032506, + "grad_norm": 1.0566373032831995, + "learning_rate": 2.03377236317413e-06, + "loss": 0.0547, + "step": 17430 + }, + { + "epoch": 2.652168954111698, + "grad_norm": 0.9990829242166553, + "learning_rate": 2.016329136646769e-06, + "loss": 0.056, + "step": 17440 + }, + { + "epoch": 2.653689693190891, + "grad_norm": 1.2090416778736495, + "learning_rate": 1.9989578922085945e-06, + "loss": 0.0757, + "step": 17450 + }, + { + "epoch": 2.655210432270083, + "grad_norm": 1.7319770905497094, + "learning_rate": 1.981658684264251e-06, + "loss": 0.0729, + "step": 17460 + }, + { + "epoch": 2.656731171349276, + "grad_norm": 2.605049995731011, + "learning_rate": 1.964431566992775e-06, + "loss": 0.0642, + "step": 17470 + }, + { + "epoch": 2.658251910428468, + "grad_norm": 1.2483657944135544, + "learning_rate": 1.9472765943474275e-06, + "loss": 0.0775, + "step": 17480 + }, + { + "epoch": 2.659772649507661, + "grad_norm": 1.323492579317446, + "learning_rate": 1.9301938200555063e-06, + "loss": 0.0685, + "step": 17490 + }, + { + "epoch": 2.661293388586853, + "grad_norm": 1.0255804859312434, + "learning_rate": 1.9131832976182125e-06, + "loss": 0.0648, + "step": 17500 + }, + { + "epoch": 2.6628141276660457, + "grad_norm": 1.2437136261797794, + "learning_rate": 1.8962450803104465e-06, + "loss": 0.0652, + "step": 17510 + }, + { + "epoch": 2.6643348667452385, + "grad_norm": 1.5873658510877269, + "learning_rate": 1.8793792211806688e-06, + "loss": 0.0638, + "step": 17520 + }, + { + "epoch": 2.6658556058244307, + "grad_norm": 0.7878927603706403, + "learning_rate": 1.8625857730507207e-06, + "loss": 0.0775, + "step": 17530 + }, + { + "epoch": 2.667376344903623, + "grad_norm": 2.0844581325576077, + "learning_rate": 1.8458647885156583e-06, + "loss": 0.0706, + "step": 17540 + }, + { + "epoch": 2.6688970839828157, + "grad_norm": 1.55454404952492, + "learning_rate": 1.8292163199435886e-06, + "loss": 0.0704, + "step": 17550 + }, + { + "epoch": 2.6704178230620084, + "grad_norm": 1.1740692231002052, + "learning_rate": 1.8126404194755114e-06, + "loss": 0.0586, + "step": 17560 + }, + { + "epoch": 2.6719385621412006, + "grad_norm": 1.4657459760801541, + "learning_rate": 1.7961371390251586e-06, + "loss": 0.0604, + "step": 17570 + }, + { + "epoch": 2.673459301220393, + "grad_norm": 1.60973409729932, + "learning_rate": 1.7797065302788046e-06, + "loss": 0.0753, + "step": 17580 + }, + { + "epoch": 2.6749800402995856, + "grad_norm": 1.525799710613245, + "learning_rate": 1.7633486446951426e-06, + "loss": 0.0853, + "step": 17590 + }, + { + "epoch": 2.6765007793787783, + "grad_norm": 0.8311947350680909, + "learning_rate": 1.7470635335051e-06, + "loss": 0.0655, + "step": 17600 + }, + { + "epoch": 2.6780215184579705, + "grad_norm": 1.4452059462905902, + "learning_rate": 1.7308512477116816e-06, + "loss": 0.0786, + "step": 17610 + }, + { + "epoch": 2.679542257537163, + "grad_norm": 0.7771281493315982, + "learning_rate": 1.7147118380898192e-06, + "loss": 0.0649, + "step": 17620 + }, + { + "epoch": 2.6810629966163555, + "grad_norm": 1.339404106405533, + "learning_rate": 1.6986453551861986e-06, + "loss": 0.0621, + "step": 17630 + }, + { + "epoch": 2.682583735695548, + "grad_norm": 1.2586133318374326, + "learning_rate": 1.6826518493191056e-06, + "loss": 0.0607, + "step": 17640 + }, + { + "epoch": 2.6841044747747405, + "grad_norm": 1.3012083442709566, + "learning_rate": 1.6667313705782782e-06, + "loss": 0.0692, + "step": 17650 + }, + { + "epoch": 2.685625213853933, + "grad_norm": 1.410187013142057, + "learning_rate": 1.6508839688247401e-06, + "loss": 0.0633, + "step": 17660 + }, + { + "epoch": 2.6871459529331254, + "grad_norm": 2.150952944625462, + "learning_rate": 1.6351096936906397e-06, + "loss": 0.0584, + "step": 17670 + }, + { + "epoch": 2.688666692012318, + "grad_norm": 0.8651517651063503, + "learning_rate": 1.6194085945791081e-06, + "loss": 0.06, + "step": 17680 + }, + { + "epoch": 2.6901874310915104, + "grad_norm": 1.3593356781561372, + "learning_rate": 1.6037807206640998e-06, + "loss": 0.0822, + "step": 17690 + }, + { + "epoch": 2.691708170170703, + "grad_norm": 1.3945393673612487, + "learning_rate": 1.5882261208902322e-06, + "loss": 0.065, + "step": 17700 + }, + { + "epoch": 2.6932289092498953, + "grad_norm": 1.6904571212326305, + "learning_rate": 1.5727448439726373e-06, + "loss": 0.0672, + "step": 17710 + }, + { + "epoch": 2.694749648329088, + "grad_norm": 0.7855101514843933, + "learning_rate": 1.557336938396811e-06, + "loss": 0.0553, + "step": 17720 + }, + { + "epoch": 2.6962703874082803, + "grad_norm": 1.0721921158990866, + "learning_rate": 1.542002452418459e-06, + "loss": 0.0658, + "step": 17730 + }, + { + "epoch": 2.697791126487473, + "grad_norm": 0.6114183109670661, + "learning_rate": 1.5267414340633474e-06, + "loss": 0.0593, + "step": 17740 + }, + { + "epoch": 2.6993118655666652, + "grad_norm": 1.8935950610888679, + "learning_rate": 1.511553931127141e-06, + "loss": 0.0774, + "step": 17750 + }, + { + "epoch": 2.700832604645858, + "grad_norm": 0.880446447876442, + "learning_rate": 1.496439991175272e-06, + "loss": 0.0686, + "step": 17760 + }, + { + "epoch": 2.7023533437250506, + "grad_norm": 0.9573852504219756, + "learning_rate": 1.4813996615427817e-06, + "loss": 0.0648, + "step": 17770 + }, + { + "epoch": 2.703874082804243, + "grad_norm": 1.2755397732605782, + "learning_rate": 1.466432989334171e-06, + "loss": 0.077, + "step": 17780 + }, + { + "epoch": 2.705394821883435, + "grad_norm": 1.3480254849407687, + "learning_rate": 1.451540021423256e-06, + "loss": 0.0701, + "step": 17790 + }, + { + "epoch": 2.706915560962628, + "grad_norm": 0.8184097359630846, + "learning_rate": 1.436720804453015e-06, + "loss": 0.0604, + "step": 17800 + }, + { + "epoch": 2.7084363000418206, + "grad_norm": 1.6137649473157811, + "learning_rate": 1.4219753848354557e-06, + "loss": 0.0611, + "step": 17810 + }, + { + "epoch": 2.709957039121013, + "grad_norm": 1.3206506392107076, + "learning_rate": 1.4073038087514507e-06, + "loss": 0.0704, + "step": 17820 + }, + { + "epoch": 2.711477778200205, + "grad_norm": 1.0459664957916708, + "learning_rate": 1.3927061221506139e-06, + "loss": 0.0637, + "step": 17830 + }, + { + "epoch": 2.712998517279398, + "grad_norm": 1.5594390669412024, + "learning_rate": 1.3781823707511382e-06, + "loss": 0.0569, + "step": 17840 + }, + { + "epoch": 2.7145192563585905, + "grad_norm": 1.0604577496906515, + "learning_rate": 1.3637326000396717e-06, + "loss": 0.071, + "step": 17850 + }, + { + "epoch": 2.7160399954377827, + "grad_norm": 0.820051994958119, + "learning_rate": 1.3493568552711471e-06, + "loss": 0.0629, + "step": 17860 + }, + { + "epoch": 2.717560734516975, + "grad_norm": 1.2357171446226995, + "learning_rate": 1.3350551814686668e-06, + "loss": 0.0731, + "step": 17870 + }, + { + "epoch": 2.7190814735961677, + "grad_norm": 1.0659338105631653, + "learning_rate": 1.3208276234233541e-06, + "loss": 0.0731, + "step": 17880 + }, + { + "epoch": 2.7206022126753604, + "grad_norm": 1.7800658789109485, + "learning_rate": 1.3066742256942071e-06, + "loss": 0.072, + "step": 17890 + }, + { + "epoch": 2.7221229517545527, + "grad_norm": 1.6213664727570085, + "learning_rate": 1.29259503260796e-06, + "loss": 0.0618, + "step": 17900 + }, + { + "epoch": 2.7236436908337454, + "grad_norm": 1.3396779805069436, + "learning_rate": 1.2785900882589547e-06, + "loss": 0.0724, + "step": 17910 + }, + { + "epoch": 2.7251644299129376, + "grad_norm": 1.1285915641877668, + "learning_rate": 1.2646594365089859e-06, + "loss": 0.0669, + "step": 17920 + }, + { + "epoch": 2.7266851689921303, + "grad_norm": 1.1960492876428621, + "learning_rate": 1.2508031209871818e-06, + "loss": 0.074, + "step": 17930 + }, + { + "epoch": 2.7282059080713226, + "grad_norm": 1.3184483957486965, + "learning_rate": 1.2370211850898482e-06, + "loss": 0.0735, + "step": 17940 + }, + { + "epoch": 2.7297266471505153, + "grad_norm": 1.1400818311418341, + "learning_rate": 1.223313671980353e-06, + "loss": 0.0602, + "step": 17950 + }, + { + "epoch": 2.7312473862297075, + "grad_norm": 1.749853388749553, + "learning_rate": 1.2096806245889713e-06, + "loss": 0.0581, + "step": 17960 + }, + { + "epoch": 2.7327681253089002, + "grad_norm": 1.844473290866694, + "learning_rate": 1.196122085612772e-06, + "loss": 0.0693, + "step": 17970 + }, + { + "epoch": 2.7342888643880925, + "grad_norm": 1.39737591376742, + "learning_rate": 1.1826380975154623e-06, + "loss": 0.0689, + "step": 17980 + }, + { + "epoch": 2.735809603467285, + "grad_norm": 1.0598753801875533, + "learning_rate": 1.1692287025272696e-06, + "loss": 0.069, + "step": 17990 + }, + { + "epoch": 2.7373303425464774, + "grad_norm": 1.4156713198374042, + "learning_rate": 1.1558939426448062e-06, + "loss": 0.0624, + "step": 18000 + }, + { + "epoch": 2.73885108162567, + "grad_norm": 1.1998688658630212, + "learning_rate": 1.1426338596309315e-06, + "loss": 0.0633, + "step": 18010 + }, + { + "epoch": 2.7403718207048624, + "grad_norm": 1.7598474759416387, + "learning_rate": 1.1294484950146344e-06, + "loss": 0.0711, + "step": 18020 + }, + { + "epoch": 2.741892559784055, + "grad_norm": 1.300469281247238, + "learning_rate": 1.1163378900908795e-06, + "loss": 0.0697, + "step": 18030 + }, + { + "epoch": 2.7434132988632474, + "grad_norm": 1.480010157505714, + "learning_rate": 1.1033020859205073e-06, + "loss": 0.0585, + "step": 18040 + }, + { + "epoch": 2.74493403794244, + "grad_norm": 1.2969029153724783, + "learning_rate": 1.0903411233300853e-06, + "loss": 0.0708, + "step": 18050 + }, + { + "epoch": 2.7464547770216328, + "grad_norm": 1.6473179577476063, + "learning_rate": 1.0774550429117901e-06, + "loss": 0.069, + "step": 18060 + }, + { + "epoch": 2.747975516100825, + "grad_norm": 1.5359741184428621, + "learning_rate": 1.0646438850232726e-06, + "loss": 0.064, + "step": 18070 + }, + { + "epoch": 2.7494962551800173, + "grad_norm": 1.37588652122152, + "learning_rate": 1.0519076897875347e-06, + "loss": 0.0608, + "step": 18080 + }, + { + "epoch": 2.75101699425921, + "grad_norm": 0.9813993390163018, + "learning_rate": 1.0392464970928079e-06, + "loss": 0.0563, + "step": 18090 + }, + { + "epoch": 2.7525377333384027, + "grad_norm": 1.5981387949744281, + "learning_rate": 1.0266603465924229e-06, + "loss": 0.0554, + "step": 18100 + }, + { + "epoch": 2.754058472417595, + "grad_norm": 1.8988262492146903, + "learning_rate": 1.0141492777046875e-06, + "loss": 0.0693, + "step": 18110 + }, + { + "epoch": 2.755579211496787, + "grad_norm": 2.17510979310219, + "learning_rate": 1.0017133296127606e-06, + "loss": 0.0655, + "step": 18120 + }, + { + "epoch": 2.75709995057598, + "grad_norm": 1.7425765907633803, + "learning_rate": 9.893525412645349e-07, + "loss": 0.0536, + "step": 18130 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.8667153704801879, + "learning_rate": 9.770669513725128e-07, + "loss": 0.0634, + "step": 18140 + }, + { + "epoch": 2.760141428734365, + "grad_norm": 0.7654718901871912, + "learning_rate": 9.648565984136798e-07, + "loss": 0.0741, + "step": 18150 + }, + { + "epoch": 2.761662167813557, + "grad_norm": 1.082641080593386, + "learning_rate": 9.527215206293905e-07, + "loss": 0.0641, + "step": 18160 + }, + { + "epoch": 2.76318290689275, + "grad_norm": 1.2279341886215938, + "learning_rate": 9.40661756025249e-07, + "loss": 0.0635, + "step": 18170 + }, + { + "epoch": 2.7647036459719425, + "grad_norm": 1.5836785569529248, + "learning_rate": 9.286773423709872e-07, + "loss": 0.0754, + "step": 18180 + }, + { + "epoch": 2.7662243850511348, + "grad_norm": 1.107676898662823, + "learning_rate": 9.167683172003394e-07, + "loss": 0.0664, + "step": 18190 + }, + { + "epoch": 2.7677451241303275, + "grad_norm": 2.604956214147783, + "learning_rate": 9.049347178109486e-07, + "loss": 0.08, + "step": 18200 + }, + { + "epoch": 2.7692658632095197, + "grad_norm": 0.9458359867522632, + "learning_rate": 8.931765812642102e-07, + "loss": 0.0612, + "step": 18210 + }, + { + "epoch": 2.7707866022887124, + "grad_norm": 2.0684340922970117, + "learning_rate": 8.814939443852005e-07, + "loss": 0.0741, + "step": 18220 + }, + { + "epoch": 2.7723073413679047, + "grad_norm": 1.7603541432784737, + "learning_rate": 8.698868437625269e-07, + "loss": 0.0717, + "step": 18230 + }, + { + "epoch": 2.7738280804470974, + "grad_norm": 0.8204681688645085, + "learning_rate": 8.583553157482327e-07, + "loss": 0.0489, + "step": 18240 + }, + { + "epoch": 2.7753488195262896, + "grad_norm": 1.3975464935357333, + "learning_rate": 8.468993964576761e-07, + "loss": 0.0775, + "step": 18250 + }, + { + "epoch": 2.7768695586054823, + "grad_norm": 2.233248599862039, + "learning_rate": 8.355191217694153e-07, + "loss": 0.0727, + "step": 18260 + }, + { + "epoch": 2.7783902976846746, + "grad_norm": 0.8714645554324814, + "learning_rate": 8.242145273250984e-07, + "loss": 0.0833, + "step": 18270 + }, + { + "epoch": 2.7799110367638673, + "grad_norm": 1.5460867876425883, + "learning_rate": 8.129856485293574e-07, + "loss": 0.0634, + "step": 18280 + }, + { + "epoch": 2.7814317758430596, + "grad_norm": 1.113090641456699, + "learning_rate": 8.018325205496863e-07, + "loss": 0.0686, + "step": 18290 + }, + { + "epoch": 2.7829525149222523, + "grad_norm": 1.5685679894443727, + "learning_rate": 7.907551783163381e-07, + "loss": 0.0639, + "step": 18300 + }, + { + "epoch": 2.784473254001445, + "grad_norm": 1.0963898639727545, + "learning_rate": 7.797536565222141e-07, + "loss": 0.0572, + "step": 18310 + }, + { + "epoch": 2.785993993080637, + "grad_norm": 0.9809730418128052, + "learning_rate": 7.688279896227585e-07, + "loss": 0.0576, + "step": 18320 + }, + { + "epoch": 2.7875147321598295, + "grad_norm": 1.7262120844603204, + "learning_rate": 7.579782118358413e-07, + "loss": 0.0639, + "step": 18330 + }, + { + "epoch": 2.789035471239022, + "grad_norm": 1.5764251263125348, + "learning_rate": 7.472043571416648e-07, + "loss": 0.0622, + "step": 18340 + }, + { + "epoch": 2.790556210318215, + "grad_norm": 1.6497853345629714, + "learning_rate": 7.365064592826376e-07, + "loss": 0.0759, + "step": 18350 + }, + { + "epoch": 2.792076949397407, + "grad_norm": 1.7815533026657195, + "learning_rate": 7.258845517632923e-07, + "loss": 0.0671, + "step": 18360 + }, + { + "epoch": 2.7935976884765994, + "grad_norm": 1.5751362488908474, + "learning_rate": 7.1533866785016e-07, + "loss": 0.0622, + "step": 18370 + }, + { + "epoch": 2.795118427555792, + "grad_norm": 1.2418555106908011, + "learning_rate": 7.048688405716819e-07, + "loss": 0.0657, + "step": 18380 + }, + { + "epoch": 2.796639166634985, + "grad_norm": 1.0130942662331341, + "learning_rate": 6.944751027180979e-07, + "loss": 0.0587, + "step": 18390 + }, + { + "epoch": 2.798159905714177, + "grad_norm": 1.327467373323979, + "learning_rate": 6.841574868413387e-07, + "loss": 0.0834, + "step": 18400 + }, + { + "epoch": 2.7996806447933693, + "grad_norm": 1.7094348287717562, + "learning_rate": 6.739160252549393e-07, + "loss": 0.0659, + "step": 18410 + }, + { + "epoch": 2.801201383872562, + "grad_norm": 0.8422842416789422, + "learning_rate": 6.637507500339174e-07, + "loss": 0.0564, + "step": 18420 + }, + { + "epoch": 2.8027221229517547, + "grad_norm": 1.51867815972037, + "learning_rate": 6.536616930146983e-07, + "loss": 0.0596, + "step": 18430 + }, + { + "epoch": 2.804242862030947, + "grad_norm": 1.2589942270225711, + "learning_rate": 6.436488857949896e-07, + "loss": 0.0648, + "step": 18440 + }, + { + "epoch": 2.8057636011101397, + "grad_norm": 1.6853008030645784, + "learning_rate": 6.337123597336958e-07, + "loss": 0.0534, + "step": 18450 + }, + { + "epoch": 2.807284340189332, + "grad_norm": 0.7989528603827765, + "learning_rate": 6.238521459508262e-07, + "loss": 0.0675, + "step": 18460 + }, + { + "epoch": 2.8088050792685246, + "grad_norm": 0.9814294851101155, + "learning_rate": 6.140682753273785e-07, + "loss": 0.0722, + "step": 18470 + }, + { + "epoch": 2.810325818347717, + "grad_norm": 0.9966318070850247, + "learning_rate": 6.043607785052557e-07, + "loss": 0.0575, + "step": 18480 + }, + { + "epoch": 2.8118465574269096, + "grad_norm": 1.5784366877326221, + "learning_rate": 5.947296858871715e-07, + "loss": 0.0598, + "step": 18490 + }, + { + "epoch": 2.813367296506102, + "grad_norm": 1.4107887923187672, + "learning_rate": 5.851750276365476e-07, + "loss": 0.0624, + "step": 18500 + }, + { + "epoch": 2.8148880355852945, + "grad_norm": 1.5184766568569745, + "learning_rate": 5.756968336774221e-07, + "loss": 0.0601, + "step": 18510 + }, + { + "epoch": 2.816408774664487, + "grad_norm": 1.9178866751098347, + "learning_rate": 5.662951336943583e-07, + "loss": 0.0738, + "step": 18520 + }, + { + "epoch": 2.8179295137436795, + "grad_norm": 1.5867064672423505, + "learning_rate": 5.569699571323472e-07, + "loss": 0.0721, + "step": 18530 + }, + { + "epoch": 2.8194502528228718, + "grad_norm": 0.7892333431018899, + "learning_rate": 5.477213331967185e-07, + "loss": 0.0669, + "step": 18540 + }, + { + "epoch": 2.8209709919020645, + "grad_norm": 1.3525563028992478, + "learning_rate": 5.38549290853052e-07, + "loss": 0.0579, + "step": 18550 + }, + { + "epoch": 2.822491730981257, + "grad_norm": 1.421626032029119, + "learning_rate": 5.294538588270754e-07, + "loss": 0.0629, + "step": 18560 + }, + { + "epoch": 2.8240124700604494, + "grad_norm": 0.8979587123652093, + "learning_rate": 5.204350656045881e-07, + "loss": 0.0633, + "step": 18570 + }, + { + "epoch": 2.8255332091396417, + "grad_norm": 0.9307182538350249, + "learning_rate": 5.114929394313628e-07, + "loss": 0.0538, + "step": 18580 + }, + { + "epoch": 2.8270539482188344, + "grad_norm": 2.0356405058267244, + "learning_rate": 5.026275083130638e-07, + "loss": 0.0661, + "step": 18590 + }, + { + "epoch": 2.828574687298027, + "grad_norm": 1.1842587071532091, + "learning_rate": 4.938388000151534e-07, + "loss": 0.0666, + "step": 18600 + }, + { + "epoch": 2.8300954263772193, + "grad_norm": 1.3521092796741017, + "learning_rate": 4.85126842062808e-07, + "loss": 0.0644, + "step": 18610 + }, + { + "epoch": 2.8316161654564116, + "grad_norm": 1.0071242528407522, + "learning_rate": 4.7649166174082713e-07, + "loss": 0.0599, + "step": 18620 + }, + { + "epoch": 2.8331369045356043, + "grad_norm": 1.5460748038922985, + "learning_rate": 4.679332860935609e-07, + "loss": 0.0708, + "step": 18630 + }, + { + "epoch": 2.834657643614797, + "grad_norm": 2.7249257185035134, + "learning_rate": 4.594517419248073e-07, + "loss": 0.071, + "step": 18640 + }, + { + "epoch": 2.8361783826939893, + "grad_norm": 1.3598228360508484, + "learning_rate": 4.5104705579774586e-07, + "loss": 0.068, + "step": 18650 + }, + { + "epoch": 2.8376991217731815, + "grad_norm": 0.9176529403239523, + "learning_rate": 4.4271925403483463e-07, + "loss": 0.0651, + "step": 18660 + }, + { + "epoch": 2.839219860852374, + "grad_norm": 2.0312501808960883, + "learning_rate": 4.3446836271775204e-07, + "loss": 0.0815, + "step": 18670 + }, + { + "epoch": 2.840740599931567, + "grad_norm": 1.387863292518313, + "learning_rate": 4.2629440768728877e-07, + "loss": 0.0785, + "step": 18680 + }, + { + "epoch": 2.842261339010759, + "grad_norm": 1.6178631476261067, + "learning_rate": 4.181974145432949e-07, + "loss": 0.0925, + "step": 18690 + }, + { + "epoch": 2.843782078089952, + "grad_norm": 1.18288041529664, + "learning_rate": 4.1017740864457424e-07, + "loss": 0.0555, + "step": 18700 + }, + { + "epoch": 2.845302817169144, + "grad_norm": 1.0386446280212949, + "learning_rate": 4.0223441510882086e-07, + "loss": 0.0568, + "step": 18710 + }, + { + "epoch": 2.846823556248337, + "grad_norm": 1.8132509606972071, + "learning_rate": 3.9436845881253295e-07, + "loss": 0.066, + "step": 18720 + }, + { + "epoch": 2.848344295327529, + "grad_norm": 2.704866909748142, + "learning_rate": 3.8657956439093754e-07, + "loss": 0.0656, + "step": 18730 + }, + { + "epoch": 2.849865034406722, + "grad_norm": 1.4316924973317835, + "learning_rate": 3.7886775623791617e-07, + "loss": 0.0657, + "step": 18740 + }, + { + "epoch": 2.851385773485914, + "grad_norm": 1.4488161612335226, + "learning_rate": 3.712330585059209e-07, + "loss": 0.0659, + "step": 18750 + }, + { + "epoch": 2.8529065125651067, + "grad_norm": 1.1963191462171934, + "learning_rate": 3.636754951059057e-07, + "loss": 0.067, + "step": 18760 + }, + { + "epoch": 2.854427251644299, + "grad_norm": 1.5618875864671342, + "learning_rate": 3.561950897072508e-07, + "loss": 0.0659, + "step": 18770 + }, + { + "epoch": 2.8559479907234917, + "grad_norm": 1.5429232409368998, + "learning_rate": 3.487918657376854e-07, + "loss": 0.0556, + "step": 18780 + }, + { + "epoch": 2.857468729802684, + "grad_norm": 1.5207181513487522, + "learning_rate": 3.4146584638321265e-07, + "loss": 0.053, + "step": 18790 + }, + { + "epoch": 2.8589894688818767, + "grad_norm": 1.1439076357363198, + "learning_rate": 3.3421705458804573e-07, + "loss": 0.0767, + "step": 18800 + }, + { + "epoch": 2.860510207961069, + "grad_norm": 0.7954729013462755, + "learning_rate": 3.270455130545247e-07, + "loss": 0.0693, + "step": 18810 + }, + { + "epoch": 2.8620309470402616, + "grad_norm": 1.6909840440004344, + "learning_rate": 3.199512442430552e-07, + "loss": 0.0704, + "step": 18820 + }, + { + "epoch": 2.863551686119454, + "grad_norm": 1.633456511437107, + "learning_rate": 3.129342703720367e-07, + "loss": 0.0561, + "step": 18830 + }, + { + "epoch": 2.8650724251986466, + "grad_norm": 1.5563239802446716, + "learning_rate": 3.059946134177788e-07, + "loss": 0.0694, + "step": 18840 + }, + { + "epoch": 2.8665931642778393, + "grad_norm": 1.3962469036283447, + "learning_rate": 2.9913229511445704e-07, + "loss": 0.0669, + "step": 18850 + }, + { + "epoch": 2.8681139033570315, + "grad_norm": 1.0955304282229135, + "learning_rate": 2.923473369540242e-07, + "loss": 0.0605, + "step": 18860 + }, + { + "epoch": 2.869634642436224, + "grad_norm": 2.775915235026873, + "learning_rate": 2.856397601861488e-07, + "loss": 0.0714, + "step": 18870 + }, + { + "epoch": 2.8711553815154165, + "grad_norm": 1.8391555120383567, + "learning_rate": 2.7900958581816017e-07, + "loss": 0.0659, + "step": 18880 + }, + { + "epoch": 2.872676120594609, + "grad_norm": 2.127824165905573, + "learning_rate": 2.7245683461496196e-07, + "loss": 0.0665, + "step": 18890 + }, + { + "epoch": 2.8741968596738015, + "grad_norm": 2.5565078649308024, + "learning_rate": 2.659815270989796e-07, + "loss": 0.0643, + "step": 18900 + }, + { + "epoch": 2.8757175987529937, + "grad_norm": 0.9924163310693394, + "learning_rate": 2.5958368355010197e-07, + "loss": 0.068, + "step": 18910 + }, + { + "epoch": 2.8772383378321864, + "grad_norm": 2.259834503067782, + "learning_rate": 2.5326332400559817e-07, + "loss": 0.0666, + "step": 18920 + }, + { + "epoch": 2.878759076911379, + "grad_norm": 1.700459340405849, + "learning_rate": 2.470204682600785e-07, + "loss": 0.0707, + "step": 18930 + }, + { + "epoch": 2.8802798159905714, + "grad_norm": 0.8850275923180811, + "learning_rate": 2.408551358654171e-07, + "loss": 0.051, + "step": 18940 + }, + { + "epoch": 2.881800555069764, + "grad_norm": 1.8178368814203103, + "learning_rate": 2.3476734613068486e-07, + "loss": 0.0675, + "step": 18950 + }, + { + "epoch": 2.8833212941489563, + "grad_norm": 2.001154950555922, + "learning_rate": 2.287571181221082e-07, + "loss": 0.0793, + "step": 18960 + }, + { + "epoch": 2.884842033228149, + "grad_norm": 1.7074326323674216, + "learning_rate": 2.2282447066300228e-07, + "loss": 0.0672, + "step": 18970 + }, + { + "epoch": 2.8863627723073413, + "grad_norm": 0.7734952888918128, + "learning_rate": 2.1696942233370153e-07, + "loss": 0.0538, + "step": 18980 + }, + { + "epoch": 2.887883511386534, + "grad_norm": 1.5686788728726144, + "learning_rate": 2.111919914715099e-07, + "loss": 0.0623, + "step": 18990 + }, + { + "epoch": 2.8894042504657262, + "grad_norm": 1.8209721668026781, + "learning_rate": 2.0549219617064796e-07, + "loss": 0.0699, + "step": 19000 + }, + { + "epoch": 2.890924989544919, + "grad_norm": 1.0288824514118928, + "learning_rate": 1.9987005428218907e-07, + "loss": 0.0722, + "step": 19010 + }, + { + "epoch": 2.892445728624111, + "grad_norm": 1.69763137958439, + "learning_rate": 1.943255834140012e-07, + "loss": 0.0648, + "step": 19020 + }, + { + "epoch": 2.893966467703304, + "grad_norm": 1.2159424936121752, + "learning_rate": 1.888588009307024e-07, + "loss": 0.0568, + "step": 19030 + }, + { + "epoch": 2.895487206782496, + "grad_norm": 0.7702024609871306, + "learning_rate": 1.8346972395359429e-07, + "loss": 0.0612, + "step": 19040 + }, + { + "epoch": 2.897007945861689, + "grad_norm": 1.1463584626993126, + "learning_rate": 1.7815836936062035e-07, + "loss": 0.0723, + "step": 19050 + }, + { + "epoch": 2.898528684940881, + "grad_norm": 0.8690857625487675, + "learning_rate": 1.7292475378629936e-07, + "loss": 0.0632, + "step": 19060 + }, + { + "epoch": 2.900049424020074, + "grad_norm": 1.1699374294147835, + "learning_rate": 1.6776889362168646e-07, + "loss": 0.0754, + "step": 19070 + }, + { + "epoch": 2.901570163099266, + "grad_norm": 1.0588054696322655, + "learning_rate": 1.6269080501431776e-07, + "loss": 0.0594, + "step": 19080 + }, + { + "epoch": 2.9030909021784588, + "grad_norm": 1.5987399884390379, + "learning_rate": 1.57690503868152e-07, + "loss": 0.0545, + "step": 19090 + }, + { + "epoch": 2.9046116412576515, + "grad_norm": 1.5395769375543062, + "learning_rate": 1.5276800584353158e-07, + "loss": 0.0685, + "step": 19100 + }, + { + "epoch": 2.9061323803368437, + "grad_norm": 1.692374068025694, + "learning_rate": 1.4792332635712723e-07, + "loss": 0.0568, + "step": 19110 + }, + { + "epoch": 2.907653119416036, + "grad_norm": 0.8629390515588464, + "learning_rate": 1.4315648058189079e-07, + "loss": 0.0658, + "step": 19120 + }, + { + "epoch": 2.9091738584952287, + "grad_norm": 1.6306210949721018, + "learning_rate": 1.3846748344701065e-07, + "loss": 0.0674, + "step": 19130 + }, + { + "epoch": 2.9106945975744214, + "grad_norm": 1.0146375059642367, + "learning_rate": 1.338563496378592e-07, + "loss": 0.0576, + "step": 19140 + }, + { + "epoch": 2.9122153366536137, + "grad_norm": 1.6827794282870436, + "learning_rate": 1.2932309359595385e-07, + "loss": 0.078, + "step": 19150 + }, + { + "epoch": 2.913736075732806, + "grad_norm": 1.9343229404914606, + "learning_rate": 1.2486772951890435e-07, + "loss": 0.0758, + "step": 19160 + }, + { + "epoch": 2.9152568148119986, + "grad_norm": 1.6121855476346139, + "learning_rate": 1.204902713603795e-07, + "loss": 0.0696, + "step": 19170 + }, + { + "epoch": 2.9167775538911913, + "grad_norm": 1.9380635233795818, + "learning_rate": 1.1619073283004323e-07, + "loss": 0.0667, + "step": 19180 + }, + { + "epoch": 2.9182982929703836, + "grad_norm": 1.747561976486993, + "learning_rate": 1.1196912739354082e-07, + "loss": 0.0598, + "step": 19190 + }, + { + "epoch": 2.919819032049576, + "grad_norm": 1.4265869752909142, + "learning_rate": 1.0782546827242667e-07, + "loss": 0.0624, + "step": 19200 + }, + { + "epoch": 2.9213397711287685, + "grad_norm": 1.347871394345266, + "learning_rate": 1.0375976844414492e-07, + "loss": 0.0622, + "step": 19210 + }, + { + "epoch": 2.9228605102079612, + "grad_norm": 0.924835556274283, + "learning_rate": 9.977204064197387e-08, + "loss": 0.0596, + "step": 19220 + }, + { + "epoch": 2.9243812492871535, + "grad_norm": 1.8799049469329432, + "learning_rate": 9.586229735500107e-08, + "loss": 0.0615, + "step": 19230 + }, + { + "epoch": 2.925901988366346, + "grad_norm": 0.9535706593700133, + "learning_rate": 9.203055082806777e-08, + "loss": 0.0635, + "step": 19240 + }, + { + "epoch": 2.9274227274455384, + "grad_norm": 1.0524294115158686, + "learning_rate": 8.827681306174673e-08, + "loss": 0.0556, + "step": 19250 + }, + { + "epoch": 2.928943466524731, + "grad_norm": 1.9865887849131574, + "learning_rate": 8.460109581228947e-08, + "loss": 0.065, + "step": 19260 + }, + { + "epoch": 2.9304642056039234, + "grad_norm": 1.9164663044711834, + "learning_rate": 8.100341059160133e-08, + "loss": 0.0608, + "step": 19270 + }, + { + "epoch": 2.931984944683116, + "grad_norm": 1.1124826077603518, + "learning_rate": 7.748376866719975e-08, + "loss": 0.0638, + "step": 19280 + }, + { + "epoch": 2.9335056837623084, + "grad_norm": 0.9970257919359131, + "learning_rate": 7.404218106217831e-08, + "loss": 0.0538, + "step": 19290 + }, + { + "epoch": 2.935026422841501, + "grad_norm": 1.7494234730916183, + "learning_rate": 7.06786585551733e-08, + "loss": 0.08, + "step": 19300 + }, + { + "epoch": 2.9365471619206933, + "grad_norm": 2.295850762393038, + "learning_rate": 6.73932116803333e-08, + "loss": 0.0594, + "step": 19310 + }, + { + "epoch": 2.938067900999886, + "grad_norm": 0.8354007254085288, + "learning_rate": 6.418585072728578e-08, + "loss": 0.0604, + "step": 19320 + }, + { + "epoch": 2.9395886400790783, + "grad_norm": 1.7754769675235187, + "learning_rate": 6.105658574109274e-08, + "loss": 0.0844, + "step": 19330 + }, + { + "epoch": 2.941109379158271, + "grad_norm": 1.2509714603477, + "learning_rate": 5.800542652223961e-08, + "loss": 0.0804, + "step": 19340 + }, + { + "epoch": 2.9426301182374637, + "grad_norm": 1.1735968272835666, + "learning_rate": 5.503238262658805e-08, + "loss": 0.0607, + "step": 19350 + }, + { + "epoch": 2.944150857316656, + "grad_norm": 1.2833566957389388, + "learning_rate": 5.213746336535374e-08, + "loss": 0.07, + "step": 19360 + }, + { + "epoch": 2.945671596395848, + "grad_norm": 0.856492509446977, + "learning_rate": 4.932067780507588e-08, + "loss": 0.0686, + "step": 19370 + }, + { + "epoch": 2.947192335475041, + "grad_norm": 1.2776770452816817, + "learning_rate": 4.65820347675866e-08, + "loss": 0.0614, + "step": 19380 + }, + { + "epoch": 2.9487130745542336, + "grad_norm": 1.050420781518197, + "learning_rate": 4.392154282998606e-08, + "loss": 0.0724, + "step": 19390 + }, + { + "epoch": 2.950233813633426, + "grad_norm": 2.748729516148674, + "learning_rate": 4.133921032461463e-08, + "loss": 0.077, + "step": 19400 + }, + { + "epoch": 2.951754552712618, + "grad_norm": 1.0042626630975255, + "learning_rate": 3.883504533902793e-08, + "loss": 0.0649, + "step": 19410 + }, + { + "epoch": 2.953275291791811, + "grad_norm": 0.7428874171128743, + "learning_rate": 3.6409055715969084e-08, + "loss": 0.07, + "step": 19420 + }, + { + "epoch": 2.9547960308710035, + "grad_norm": 1.9797509015373627, + "learning_rate": 3.406124905334373e-08, + "loss": 0.0711, + "step": 19430 + }, + { + "epoch": 2.9563167699501958, + "grad_norm": 1.1816812529298226, + "learning_rate": 3.17916327042006e-08, + "loss": 0.0723, + "step": 19440 + }, + { + "epoch": 2.957837509029388, + "grad_norm": 1.0187846419056599, + "learning_rate": 2.9600213776703744e-08, + "loss": 0.0723, + "step": 19450 + }, + { + "epoch": 2.9593582481085807, + "grad_norm": 1.0777184431914886, + "learning_rate": 2.7486999134115898e-08, + "loss": 0.0718, + "step": 19460 + }, + { + "epoch": 2.9608789871877734, + "grad_norm": 1.2440381815463568, + "learning_rate": 2.5451995394767947e-08, + "loss": 0.0621, + "step": 19470 + }, + { + "epoch": 2.9623997262669657, + "grad_norm": 1.0752833708467975, + "learning_rate": 2.3495208932047818e-08, + "loss": 0.0591, + "step": 19480 + }, + { + "epoch": 2.9639204653461584, + "grad_norm": 1.3610558019905572, + "learning_rate": 2.161664587437551e-08, + "loss": 0.0717, + "step": 19490 + }, + { + "epoch": 2.9654412044253506, + "grad_norm": 0.719131774641052, + "learning_rate": 1.9816312105183642e-08, + "loss": 0.0609, + "step": 19500 + }, + { + "epoch": 2.9669619435045433, + "grad_norm": 1.0255112156664457, + "learning_rate": 1.8094213262898064e-08, + "loss": 0.0572, + "step": 19510 + }, + { + "epoch": 2.9684826825837356, + "grad_norm": 1.5829743611389515, + "learning_rate": 1.6450354740932262e-08, + "loss": 0.0768, + "step": 19520 + }, + { + "epoch": 2.9700034216629283, + "grad_norm": 1.482495169424652, + "learning_rate": 1.4884741687645754e-08, + "loss": 0.0649, + "step": 19530 + }, + { + "epoch": 2.9715241607421206, + "grad_norm": 0.9782828840538833, + "learning_rate": 1.3397379006352405e-08, + "loss": 0.0645, + "step": 19540 + }, + { + "epoch": 2.9730448998213133, + "grad_norm": 1.3728316975633654, + "learning_rate": 1.1988271355295456e-08, + "loss": 0.0604, + "step": 19550 + }, + { + "epoch": 2.9745656389005055, + "grad_norm": 0.8774366466348487, + "learning_rate": 1.0657423147628077e-08, + "loss": 0.0549, + "step": 19560 + }, + { + "epoch": 2.976086377979698, + "grad_norm": 1.4420351284912958, + "learning_rate": 9.404838551407834e-09, + "loss": 0.0588, + "step": 19570 + }, + { + "epoch": 2.9776071170588905, + "grad_norm": 2.377578036145292, + "learning_rate": 8.230521489577259e-09, + "loss": 0.0723, + "step": 19580 + }, + { + "epoch": 2.979127856138083, + "grad_norm": 1.5719860819042877, + "learning_rate": 7.134475639955507e-09, + "loss": 0.0724, + "step": 19590 + }, + { + "epoch": 2.9806485952172754, + "grad_norm": 1.5525563056250178, + "learning_rate": 6.116704435227272e-09, + "loss": 0.0748, + "step": 19600 + }, + { + "epoch": 2.982169334296468, + "grad_norm": 1.0664002038631866, + "learning_rate": 5.177211062931675e-09, + "loss": 0.0621, + "step": 19610 + }, + { + "epoch": 2.9836900733756604, + "grad_norm": 0.813074377361075, + "learning_rate": 4.315998465445614e-09, + "loss": 0.0606, + "step": 19620 + }, + { + "epoch": 2.985210812454853, + "grad_norm": 1.2963485986713008, + "learning_rate": 3.5330693399837633e-09, + "loss": 0.0673, + "step": 19630 + }, + { + "epoch": 2.986731551534046, + "grad_norm": 0.95360691597918, + "learning_rate": 2.82842613858747e-09, + "loss": 0.0634, + "step": 19640 + }, + { + "epoch": 2.988252290613238, + "grad_norm": 1.6935378265906058, + "learning_rate": 2.202071068110878e-09, + "loss": 0.086, + "step": 19650 + }, + { + "epoch": 2.9897730296924303, + "grad_norm": 2.3021179934908766, + "learning_rate": 1.6540060902264787e-09, + "loss": 0.0623, + "step": 19660 + }, + { + "epoch": 2.991293768771623, + "grad_norm": 1.7207167131074257, + "learning_rate": 1.1842329214056813e-09, + "loss": 0.0645, + "step": 19670 + }, + { + "epoch": 2.9928145078508157, + "grad_norm": 2.0308470709030177, + "learning_rate": 7.927530329215893e-10, + "loss": 0.0781, + "step": 19680 + }, + { + "epoch": 2.994335246930008, + "grad_norm": 1.018103853789418, + "learning_rate": 4.795676508434488e-10, + "loss": 0.0679, + "step": 19690 + }, + { + "epoch": 2.9958559860092002, + "grad_norm": 1.141817091831521, + "learning_rate": 2.4467775603109754e-10, + "loss": 0.0662, + "step": 19700 + }, + { + "epoch": 2.997376725088393, + "grad_norm": 1.199006392405564, + "learning_rate": 8.808408412663838e-11, + "loss": 0.0616, + "step": 19710 + }, + { + "epoch": 2.9988974641675856, + "grad_norm": 2.0928764138864304, + "learning_rate": 9.787125568316846e-12, + "loss": 0.0637, + "step": 19720 + }, + { + "epoch": 2.9996578337071815, + "step": 19725, + "total_flos": 1225331877437440.0, + "train_loss": 0.2470260431361591, + "train_runtime": 59939.3079, + "train_samples_per_second": 10.532, + "train_steps_per_second": 0.329 + } + ], + "logging_steps": 10, + "max_steps": 19725, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1225331877437440.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}