diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17530 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 15.929184960782006, + "learning_rate": 2.666666666666667e-07, + "loss": 12.7783, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 15.874932389885064, + "learning_rate": 5.333333333333335e-07, + "loss": 12.7829, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 16.638474194681965, + "learning_rate": 8.000000000000001e-07, + "loss": 12.7492, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 15.691613562458222, + "learning_rate": 1.066666666666667e-06, + "loss": 12.7644, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 16.183637154093034, + "learning_rate": 1.3333333333333334e-06, + "loss": 12.7361, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 15.394137566345053, + "learning_rate": 1.6000000000000001e-06, + "loss": 12.7412, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 16.41810330919611, + "learning_rate": 1.8666666666666669e-06, + "loss": 12.7497, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 15.703173654632728, + "learning_rate": 2.133333333333334e-06, + "loss": 12.6923, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 15.28258963587572, + "learning_rate": 2.4000000000000003e-06, + "loss": 12.633, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 14.979699480878217, + "learning_rate": 2.666666666666667e-06, + "loss": 12.4999, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 13.901151346983722, + "learning_rate": 2.9333333333333338e-06, + "loss": 12.4535, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 14.29923785648717, + "learning_rate": 3.2000000000000003e-06, + "loss": 12.3986, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 11.471178506450912, + "learning_rate": 3.4666666666666672e-06, + "loss": 11.9871, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 11.838514649534746, + "learning_rate": 3.7333333333333337e-06, + "loss": 11.9267, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 10.978160035469458, + "learning_rate": 4.000000000000001e-06, + "loss": 11.8394, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 11.141794873591074, + "learning_rate": 4.266666666666668e-06, + "loss": 11.7715, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 11.298617438327794, + "learning_rate": 4.533333333333334e-06, + "loss": 11.3915, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 10.25498056051837, + "learning_rate": 4.800000000000001e-06, + "loss": 11.17, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 10.283997955198105, + "learning_rate": 5.0666666666666676e-06, + "loss": 11.0347, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 9.780727278149014, + "learning_rate": 5.333333333333334e-06, + "loss": 10.8716, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 7.854956693311563, + "learning_rate": 5.600000000000001e-06, + "loss": 10.8734, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 7.375861574845129, + "learning_rate": 5.8666666666666675e-06, + "loss": 10.7295, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 7.478804987639091, + "learning_rate": 6.133333333333334e-06, + "loss": 10.5077, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 7.061475152313659, + "learning_rate": 
6.4000000000000006e-06, + "loss": 10.4395, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 7.26564612763676, + "learning_rate": 6.666666666666667e-06, + "loss": 10.2625, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 6.538214638535141, + "learning_rate": 6.9333333333333344e-06, + "loss": 10.1915, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 6.337218899132541, + "learning_rate": 7.2000000000000005e-06, + "loss": 10.1794, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 5.415165410790723, + "learning_rate": 7.4666666666666675e-06, + "loss": 10.2069, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 6.187915970859848, + "learning_rate": 7.733333333333334e-06, + "loss": 9.895, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 4.631030212687751, + "learning_rate": 8.000000000000001e-06, + "loss": 9.9977, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 4.631218434317244, + "learning_rate": 8.266666666666667e-06, + "loss": 9.8149, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 4.8158165880323365, + "learning_rate": 8.533333333333335e-06, + "loss": 9.7459, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 4.841399667684455, + "learning_rate": 8.8e-06, + "loss": 9.7156, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 4.610263893321706, + "learning_rate": 9.066666666666667e-06, + "loss": 9.6304, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 4.235066753616497, + "learning_rate": 9.333333333333334e-06, + "loss": 9.41, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 5.452655010828091, + "learning_rate": 9.600000000000001e-06, + "loss": 9.498, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 4.938743934202386, + "learning_rate": 9.866666666666668e-06, + "loss": 9.3042, + "step": 37 + }, + { + "epoch": 0.02, + "grad_norm": 3.9544713684597945, + "learning_rate": 1.0133333333333335e-05, + "loss": 9.4645, + "step": 38 + }, + { + "epoch": 0.02, + "grad_norm": 3.6572227241816835, + "learning_rate": 1.04e-05, + "loss": 9.2902, + "step": 39 + }, + { + "epoch": 0.02, + "grad_norm": 6.466505871627913, + "learning_rate": 1.0666666666666667e-05, + "loss": 9.3163, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 3.778564965485367, + "learning_rate": 1.0933333333333334e-05, + "loss": 9.1139, + "step": 41 + }, + { + "epoch": 0.02, + "grad_norm": 5.410075586798703, + "learning_rate": 1.1200000000000001e-05, + "loss": 9.0775, + "step": 42 + }, + { + "epoch": 0.02, + "grad_norm": 4.122133392766589, + "learning_rate": 1.1466666666666668e-05, + "loss": 9.146, + "step": 43 + }, + { + "epoch": 0.02, + "grad_norm": 3.6810042264298595, + "learning_rate": 1.1733333333333335e-05, + "loss": 9.0396, + "step": 44 + }, + { + "epoch": 0.02, + "grad_norm": 4.227042286632877, + "learning_rate": 1.2e-05, + "loss": 9.0946, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 5.629133823957898, + "learning_rate": 1.2266666666666667e-05, + "loss": 8.9797, + "step": 46 + }, + { + "epoch": 0.02, + "grad_norm": 3.7271542497913503, + "learning_rate": 1.2533333333333336e-05, + "loss": 8.9485, + "step": 47 + }, + { + "epoch": 0.02, + "grad_norm": 5.242646810059511, + "learning_rate": 1.2800000000000001e-05, + "loss": 8.8412, + "step": 48 + }, + { + "epoch": 0.02, + "grad_norm": 7.076863398153946, + "learning_rate": 1.3066666666666668e-05, + "loss": 8.9591, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 4.744253371158508, + "learning_rate": 1.3333333333333333e-05, + "loss": 8.849, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 4.4165606869895475, + 
"learning_rate": 1.3600000000000002e-05, + "loss": 8.8491, + "step": 51 + }, + { + "epoch": 0.02, + "grad_norm": 6.857593202175524, + "learning_rate": 1.3866666666666669e-05, + "loss": 8.8226, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 4.475861448921801, + "learning_rate": 1.4133333333333334e-05, + "loss": 8.808, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 4.8910608136482905, + "learning_rate": 1.4400000000000001e-05, + "loss": 8.6174, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 3.759581203767945, + "learning_rate": 1.4666666666666666e-05, + "loss": 8.7544, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 5.430056602810506, + "learning_rate": 1.4933333333333335e-05, + "loss": 8.7811, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 3.6362047197084126, + "learning_rate": 1.5200000000000002e-05, + "loss": 8.6588, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 6.307397557457014, + "learning_rate": 1.546666666666667e-05, + "loss": 8.6764, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 5.163868735280828, + "learning_rate": 1.5733333333333334e-05, + "loss": 8.5804, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 5.926174882030974, + "learning_rate": 1.6000000000000003e-05, + "loss": 8.4866, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 4.622342586255422, + "learning_rate": 1.6266666666666668e-05, + "loss": 8.4828, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 4.589183178299784, + "learning_rate": 1.6533333333333333e-05, + "loss": 8.5785, + "step": 62 + }, + { + "epoch": 0.03, + "grad_norm": 4.736115426468919, + "learning_rate": 1.6800000000000002e-05, + "loss": 8.3527, + "step": 63 + }, + { + "epoch": 0.03, + "grad_norm": 3.6673697167117676, + "learning_rate": 1.706666666666667e-05, + "loss": 8.4629, + "step": 64 + }, + { + "epoch": 0.03, + "grad_norm": 4.672070519476944, + "learning_rate": 1.7333333333333336e-05, + "loss": 8.2062, + "step": 65 + }, + { + "epoch": 0.03, + "grad_norm": 5.187720892130302, + "learning_rate": 1.76e-05, + "loss": 8.3856, + "step": 66 + }, + { + "epoch": 0.03, + "grad_norm": 4.15197703033149, + "learning_rate": 1.7866666666666666e-05, + "loss": 8.2769, + "step": 67 + }, + { + "epoch": 0.03, + "grad_norm": 5.912600166722876, + "learning_rate": 1.8133333333333335e-05, + "loss": 8.3477, + "step": 68 + }, + { + "epoch": 0.03, + "grad_norm": 4.244802476034463, + "learning_rate": 1.8400000000000003e-05, + "loss": 8.1459, + "step": 69 + }, + { + "epoch": 0.03, + "grad_norm": 5.722980011319047, + "learning_rate": 1.866666666666667e-05, + "loss": 8.2767, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 4.174831807901733, + "learning_rate": 1.8933333333333334e-05, + "loss": 8.1762, + "step": 71 + }, + { + "epoch": 0.03, + "grad_norm": 8.528943545214611, + "learning_rate": 1.9200000000000003e-05, + "loss": 8.2168, + "step": 72 + }, + { + "epoch": 0.03, + "grad_norm": 3.639645536558215, + "learning_rate": 1.9466666666666668e-05, + "loss": 8.1628, + "step": 73 + }, + { + "epoch": 0.03, + "grad_norm": 5.52886572716252, + "learning_rate": 1.9733333333333336e-05, + "loss": 8.1529, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 6.075612772806507, + "learning_rate": 2e-05, + "loss": 7.8176, + "step": 75 + }, + { + "epoch": 0.03, + "grad_norm": 6.783051863921666, + "learning_rate": 1.9999991608372392e-05, + "loss": 8.028, + "step": 76 + }, + { + "epoch": 0.03, + "grad_norm": 3.4072414013421133, + "learning_rate": 1.999996643350365e-05, + "loss": 8.018, + "step": 77 + }, + { + "epoch": 0.03, + 
"grad_norm": 4.2518888973752365, + "learning_rate": 1.999992447543603e-05, + "loss": 8.0281, + "step": 78 + }, + { + "epoch": 0.03, + "grad_norm": 5.342242469543838, + "learning_rate": 1.999986573423995e-05, + "loss": 7.7778, + "step": 79 + }, + { + "epoch": 0.03, + "grad_norm": 4.492869195214892, + "learning_rate": 1.999979021001399e-05, + "loss": 7.9055, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 5.969553752122128, + "learning_rate": 1.999969790288491e-05, + "loss": 7.8983, + "step": 81 + }, + { + "epoch": 0.03, + "grad_norm": 5.683302910481509, + "learning_rate": 1.999958881300763e-05, + "loss": 7.7587, + "step": 82 + }, + { + "epoch": 0.03, + "grad_norm": 4.970945676308067, + "learning_rate": 1.9999462940565242e-05, + "loss": 7.7445, + "step": 83 + }, + { + "epoch": 0.03, + "grad_norm": 3.102972572272466, + "learning_rate": 1.9999320285769e-05, + "loss": 7.73, + "step": 84 + }, + { + "epoch": 0.03, + "grad_norm": 5.501542226169796, + "learning_rate": 1.999916084885832e-05, + "loss": 7.7589, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 3.2240889084532633, + "learning_rate": 1.999898463010079e-05, + "loss": 7.7714, + "step": 86 + }, + { + "epoch": 0.03, + "grad_norm": 6.2224615585033245, + "learning_rate": 1.9998791629792172e-05, + "loss": 7.863, + "step": 87 + }, + { + "epoch": 0.04, + "grad_norm": 4.918041691814854, + "learning_rate": 1.999858184825637e-05, + "loss": 7.6648, + "step": 88 + }, + { + "epoch": 0.04, + "grad_norm": 5.96906876524204, + "learning_rate": 1.9998355285845473e-05, + "loss": 7.6013, + "step": 89 + }, + { + "epoch": 0.04, + "grad_norm": 5.462241702890193, + "learning_rate": 1.9998111942939727e-05, + "loss": 7.6095, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 4.324181915141825, + "learning_rate": 1.9997851819947537e-05, + "loss": 7.5812, + "step": 91 + }, + { + "epoch": 0.04, + "grad_norm": 4.031985651604528, + "learning_rate": 1.999757491730548e-05, + "loss": 7.6367, + "step": 92 + }, + { + "epoch": 0.04, + "grad_norm": 4.638146112260251, + "learning_rate": 1.999728123547828e-05, + "loss": 7.5123, + "step": 93 + }, + { + "epoch": 0.04, + "grad_norm": 5.290147893102006, + "learning_rate": 1.9996970774958836e-05, + "loss": 7.3667, + "step": 94 + }, + { + "epoch": 0.04, + "grad_norm": 3.6217639310893253, + "learning_rate": 1.9996643536268202e-05, + "loss": 7.3957, + "step": 95 + }, + { + "epoch": 0.04, + "grad_norm": 4.565797170343041, + "learning_rate": 1.999629951995559e-05, + "loss": 7.4662, + "step": 96 + }, + { + "epoch": 0.04, + "grad_norm": 3.3508669824426702, + "learning_rate": 1.9995938726598374e-05, + "loss": 7.378, + "step": 97 + }, + { + "epoch": 0.04, + "grad_norm": 5.424192916253537, + "learning_rate": 1.999556115680208e-05, + "loss": 7.3639, + "step": 98 + }, + { + "epoch": 0.04, + "grad_norm": 4.232652541715236, + "learning_rate": 1.999516681120039e-05, + "loss": 7.2653, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 4.129496340056508, + "learning_rate": 1.9994755690455154e-05, + "loss": 7.4364, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 4.233080193988463, + "learning_rate": 1.999432779525635e-05, + "loss": 7.2178, + "step": 101 + }, + { + "epoch": 0.04, + "grad_norm": 4.176827501719459, + "learning_rate": 1.9993883126322142e-05, + "loss": 7.254, + "step": 102 + }, + { + "epoch": 0.04, + "grad_norm": 3.758141329766479, + "learning_rate": 1.9993421684398825e-05, + "loss": 7.1993, + "step": 103 + }, + { + "epoch": 0.04, + "grad_norm": 4.405265927651899, + "learning_rate": 1.9992943470260845e-05, + "loss": 
7.3139, + "step": 104 + }, + { + "epoch": 0.04, + "grad_norm": 4.152161371033381, + "learning_rate": 1.99924484847108e-05, + "loss": 7.0201, + "step": 105 + }, + { + "epoch": 0.04, + "grad_norm": 3.7267363140868968, + "learning_rate": 1.9991936728579438e-05, + "loss": 7.0296, + "step": 106 + }, + { + "epoch": 0.04, + "grad_norm": 4.3497962003683925, + "learning_rate": 1.999140820272566e-05, + "loss": 7.2447, + "step": 107 + }, + { + "epoch": 0.04, + "grad_norm": 4.298332761550472, + "learning_rate": 1.9990862908036492e-05, + "loss": 7.0496, + "step": 108 + }, + { + "epoch": 0.04, + "grad_norm": 2.846041734386933, + "learning_rate": 1.9990300845427123e-05, + "loss": 7.015, + "step": 109 + }, + { + "epoch": 0.04, + "grad_norm": 5.155174975783645, + "learning_rate": 1.998972201584088e-05, + "loss": 6.952, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 3.420517963314204, + "learning_rate": 1.998912642024922e-05, + "loss": 7.0024, + "step": 111 + }, + { + "epoch": 0.04, + "grad_norm": 4.174524593326725, + "learning_rate": 1.998851405965175e-05, + "loss": 6.9717, + "step": 112 + }, + { + "epoch": 0.05, + "grad_norm": 4.633064351625967, + "learning_rate": 1.9987884935076213e-05, + "loss": 7.0441, + "step": 113 + }, + { + "epoch": 0.05, + "grad_norm": 3.841624516136794, + "learning_rate": 1.9987239047578482e-05, + "loss": 6.9816, + "step": 114 + }, + { + "epoch": 0.05, + "grad_norm": 5.012980808319145, + "learning_rate": 1.9986576398242566e-05, + "loss": 6.8687, + "step": 115 + }, + { + "epoch": 0.05, + "grad_norm": 4.412369426332821, + "learning_rate": 1.9985896988180607e-05, + "loss": 6.9009, + "step": 116 + }, + { + "epoch": 0.05, + "grad_norm": 3.597355208644285, + "learning_rate": 1.9985200818532873e-05, + "loss": 6.9828, + "step": 117 + }, + { + "epoch": 0.05, + "grad_norm": 2.7175005237062635, + "learning_rate": 1.9984487890467773e-05, + "loss": 7.0573, + "step": 118 + }, + { + "epoch": 0.05, + "grad_norm": 4.03043168020476, + "learning_rate": 1.9983758205181824e-05, + "loss": 6.8331, + "step": 119 + }, + { + "epoch": 0.05, + "grad_norm": 4.88783774497073, + "learning_rate": 1.9983011763899674e-05, + "loss": 6.6711, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 4.280680262363997, + "learning_rate": 1.9982248567874098e-05, + "loss": 6.8023, + "step": 121 + }, + { + "epoch": 0.05, + "grad_norm": 4.577251349440423, + "learning_rate": 1.998146861838599e-05, + "loss": 6.8457, + "step": 122 + }, + { + "epoch": 0.05, + "grad_norm": 3.5891117859872472, + "learning_rate": 1.9980671916744356e-05, + "loss": 6.8008, + "step": 123 + }, + { + "epoch": 0.05, + "grad_norm": 4.099854105360618, + "learning_rate": 1.9979858464286317e-05, + "loss": 6.726, + "step": 124 + }, + { + "epoch": 0.05, + "grad_norm": 3.9821886115445677, + "learning_rate": 1.997902826237712e-05, + "loss": 6.6006, + "step": 125 + }, + { + "epoch": 0.05, + "grad_norm": 4.681735516522654, + "learning_rate": 1.9978181312410104e-05, + "loss": 6.7371, + "step": 126 + }, + { + "epoch": 0.05, + "grad_norm": 3.4290275825448147, + "learning_rate": 1.9977317615806738e-05, + "loss": 6.7282, + "step": 127 + }, + { + "epoch": 0.05, + "grad_norm": 3.803532374626138, + "learning_rate": 1.9976437174016575e-05, + "loss": 6.6815, + "step": 128 + }, + { + "epoch": 0.05, + "grad_norm": 4.32888767020378, + "learning_rate": 1.997553998851729e-05, + "loss": 6.6643, + "step": 129 + }, + { + "epoch": 0.05, + "grad_norm": 3.066789919091347, + "learning_rate": 1.997462606081465e-05, + "loss": 6.6853, + "step": 130 + }, + { + "epoch": 0.05, + 
"grad_norm": 4.708099567003482, + "learning_rate": 1.997369539244252e-05, + "loss": 6.7651, + "step": 131 + }, + { + "epoch": 0.05, + "grad_norm": 4.245589097217936, + "learning_rate": 1.997274798496287e-05, + "loss": 6.6351, + "step": 132 + }, + { + "epoch": 0.05, + "grad_norm": 6.240512001177884, + "learning_rate": 1.9971783839965756e-05, + "loss": 6.509, + "step": 133 + }, + { + "epoch": 0.05, + "grad_norm": 3.9243143342593974, + "learning_rate": 1.997080295906933e-05, + "loss": 6.6422, + "step": 134 + }, + { + "epoch": 0.05, + "grad_norm": 3.9373753907930666, + "learning_rate": 1.9969805343919822e-05, + "loss": 6.6452, + "step": 135 + }, + { + "epoch": 0.05, + "grad_norm": 3.29054775888337, + "learning_rate": 1.996879099619156e-05, + "loss": 6.5095, + "step": 136 + }, + { + "epoch": 0.05, + "grad_norm": 4.39805253052477, + "learning_rate": 1.9967759917586953e-05, + "loss": 6.4897, + "step": 137 + }, + { + "epoch": 0.06, + "grad_norm": 3.7120504701706247, + "learning_rate": 1.9966712109836476e-05, + "loss": 6.4993, + "step": 138 + }, + { + "epoch": 0.06, + "grad_norm": 5.288830803394844, + "learning_rate": 1.9965647574698705e-05, + "loss": 6.5248, + "step": 139 + }, + { + "epoch": 0.06, + "grad_norm": 4.908954938672954, + "learning_rate": 1.9964566313960265e-05, + "loss": 6.382, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 6.400649085165072, + "learning_rate": 1.9963468329435872e-05, + "loss": 6.6161, + "step": 141 + }, + { + "epoch": 0.06, + "grad_norm": 3.6352934691721006, + "learning_rate": 1.9962353622968296e-05, + "loss": 6.2676, + "step": 142 + }, + { + "epoch": 0.06, + "grad_norm": 4.869261630058483, + "learning_rate": 1.996122219642838e-05, + "loss": 6.6562, + "step": 143 + }, + { + "epoch": 0.06, + "grad_norm": 4.386275014923958, + "learning_rate": 1.9960074051715022e-05, + "loss": 6.4506, + "step": 144 + }, + { + "epoch": 0.06, + "grad_norm": 4.209204783609756, + "learning_rate": 1.995890919075519e-05, + "loss": 6.3668, + "step": 145 + }, + { + "epoch": 0.06, + "grad_norm": 6.247052362466022, + "learning_rate": 1.995772761550389e-05, + "loss": 6.3787, + "step": 146 + }, + { + "epoch": 0.06, + "grad_norm": 4.27883778192868, + "learning_rate": 1.9956529327944198e-05, + "loss": 6.3488, + "step": 147 + }, + { + "epoch": 0.06, + "grad_norm": 5.635114569082134, + "learning_rate": 1.9955314330087225e-05, + "loss": 6.343, + "step": 148 + }, + { + "epoch": 0.06, + "grad_norm": 3.8840203593424962, + "learning_rate": 1.9954082623972143e-05, + "loss": 6.3235, + "step": 149 + }, + { + "epoch": 0.06, + "grad_norm": 3.5382476230277056, + "learning_rate": 1.995283421166614e-05, + "loss": 6.3011, + "step": 150 + }, + { + "epoch": 0.06, + "grad_norm": 3.234026714115879, + "learning_rate": 1.9951569095264473e-05, + "loss": 6.2582, + "step": 151 + }, + { + "epoch": 0.06, + "grad_norm": 3.4587502541675947, + "learning_rate": 1.995028727689041e-05, + "loss": 6.3841, + "step": 152 + }, + { + "epoch": 0.06, + "grad_norm": 6.061472086983153, + "learning_rate": 1.9948988758695263e-05, + "loss": 6.1556, + "step": 153 + }, + { + "epoch": 0.06, + "grad_norm": 5.147311138116654, + "learning_rate": 1.994767354285837e-05, + "loss": 6.1378, + "step": 154 + }, + { + "epoch": 0.06, + "grad_norm": 7.231891466089776, + "learning_rate": 1.9946341631587086e-05, + "loss": 6.2871, + "step": 155 + }, + { + "epoch": 0.06, + "grad_norm": 4.434419768405808, + "learning_rate": 1.9944993027116798e-05, + "loss": 6.332, + "step": 156 + }, + { + "epoch": 0.06, + "grad_norm": 5.531188775167551, + "learning_rate": 
1.9943627731710896e-05, + "loss": 6.0622, + "step": 157 + }, + { + "epoch": 0.06, + "grad_norm": 5.843702023801363, + "learning_rate": 1.9942245747660797e-05, + "loss": 6.2306, + "step": 158 + }, + { + "epoch": 0.06, + "grad_norm": 4.766495021175679, + "learning_rate": 1.9940847077285918e-05, + "loss": 6.1657, + "step": 159 + }, + { + "epoch": 0.06, + "grad_norm": 4.226745630752804, + "learning_rate": 1.9939431722933678e-05, + "loss": 6.2713, + "step": 160 + }, + { + "epoch": 0.06, + "grad_norm": 4.213643944441643, + "learning_rate": 1.993799968697951e-05, + "loss": 6.2101, + "step": 161 + }, + { + "epoch": 0.06, + "grad_norm": 4.320563316825058, + "learning_rate": 1.9936550971826835e-05, + "loss": 6.1438, + "step": 162 + }, + { + "epoch": 0.07, + "grad_norm": 3.816127954551905, + "learning_rate": 1.9935085579907064e-05, + "loss": 6.2491, + "step": 163 + }, + { + "epoch": 0.07, + "grad_norm": 5.6329522483416845, + "learning_rate": 1.9933603513679604e-05, + "loss": 6.1279, + "step": 164 + }, + { + "epoch": 0.07, + "grad_norm": 3.027487870638996, + "learning_rate": 1.9932104775631847e-05, + "loss": 6.0602, + "step": 165 + }, + { + "epoch": 0.07, + "grad_norm": 4.98578715574418, + "learning_rate": 1.993058936827916e-05, + "loss": 6.0202, + "step": 166 + }, + { + "epoch": 0.07, + "grad_norm": 5.604110505866004, + "learning_rate": 1.9929057294164894e-05, + "loss": 6.0824, + "step": 167 + }, + { + "epoch": 0.07, + "grad_norm": 5.246568749376813, + "learning_rate": 1.992750855586036e-05, + "loss": 5.9831, + "step": 168 + }, + { + "epoch": 0.07, + "grad_norm": 2.7019710360614204, + "learning_rate": 1.9925943155964857e-05, + "loss": 6.0617, + "step": 169 + }, + { + "epoch": 0.07, + "grad_norm": 5.276255882489964, + "learning_rate": 1.9924361097105624e-05, + "loss": 5.8458, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 10.92369718317991, + "learning_rate": 1.992276238193788e-05, + "loss": 5.9862, + "step": 171 + }, + { + "epoch": 0.07, + "grad_norm": 7.178133635023751, + "learning_rate": 1.9921147013144782e-05, + "loss": 5.8947, + "step": 172 + }, + { + "epoch": 0.07, + "grad_norm": 4.894193449640341, + "learning_rate": 1.9919514993437445e-05, + "loss": 5.864, + "step": 173 + }, + { + "epoch": 0.07, + "grad_norm": 6.701018088179099, + "learning_rate": 1.9917866325554936e-05, + "loss": 6.0865, + "step": 174 + }, + { + "epoch": 0.07, + "grad_norm": 6.066248382914103, + "learning_rate": 1.9916201012264255e-05, + "loss": 6.032, + "step": 175 + }, + { + "epoch": 0.07, + "grad_norm": 6.103277009648441, + "learning_rate": 1.991451905636033e-05, + "loss": 6.1079, + "step": 176 + }, + { + "epoch": 0.07, + "grad_norm": 5.025146053384431, + "learning_rate": 1.9912820460666046e-05, + "loss": 5.9556, + "step": 177 + }, + { + "epoch": 0.07, + "grad_norm": 4.297603739475917, + "learning_rate": 1.9911105228032186e-05, + "loss": 5.9033, + "step": 178 + }, + { + "epoch": 0.07, + "grad_norm": 5.973721423418755, + "learning_rate": 1.9909373361337475e-05, + "loss": 5.9913, + "step": 179 + }, + { + "epoch": 0.07, + "grad_norm": 3.7597853707129434, + "learning_rate": 1.990762486348855e-05, + "loss": 5.91, + "step": 180 + }, + { + "epoch": 0.07, + "grad_norm": 6.429544778679046, + "learning_rate": 1.990585973741996e-05, + "loss": 5.9034, + "step": 181 + }, + { + "epoch": 0.07, + "grad_norm": 5.746428539982332, + "learning_rate": 1.9904077986094153e-05, + "loss": 5.8531, + "step": 182 + }, + { + "epoch": 0.07, + "grad_norm": 4.325501317331493, + "learning_rate": 1.9902279612501494e-05, + "loss": 5.8167, + "step": 
183 + }, + { + "epoch": 0.07, + "grad_norm": 4.410086017324782, + "learning_rate": 1.9900464619660243e-05, + "loss": 5.7331, + "step": 184 + }, + { + "epoch": 0.07, + "grad_norm": 3.99166656509949, + "learning_rate": 1.989863301061654e-05, + "loss": 5.9225, + "step": 185 + }, + { + "epoch": 0.07, + "grad_norm": 4.535795454860607, + "learning_rate": 1.989678478844443e-05, + "loss": 5.8929, + "step": 186 + }, + { + "epoch": 0.07, + "grad_norm": 3.547069866912832, + "learning_rate": 1.9894919956245825e-05, + "loss": 5.9094, + "step": 187 + }, + { + "epoch": 0.08, + "grad_norm": 3.4283860612781245, + "learning_rate": 1.9893038517150526e-05, + "loss": 5.9043, + "step": 188 + }, + { + "epoch": 0.08, + "grad_norm": 4.162844234054413, + "learning_rate": 1.9891140474316197e-05, + "loss": 5.7997, + "step": 189 + }, + { + "epoch": 0.08, + "grad_norm": 3.305168114018011, + "learning_rate": 1.9889225830928365e-05, + "loss": 5.7349, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 4.055746785153127, + "learning_rate": 1.9887294590200437e-05, + "loss": 5.8111, + "step": 191 + }, + { + "epoch": 0.08, + "grad_norm": 2.7328825091394324, + "learning_rate": 1.988534675537366e-05, + "loss": 5.8384, + "step": 192 + }, + { + "epoch": 0.08, + "grad_norm": 3.1019661386474735, + "learning_rate": 1.988338232971713e-05, + "loss": 5.8723, + "step": 193 + }, + { + "epoch": 0.08, + "grad_norm": 3.497989489932904, + "learning_rate": 1.9881401316527795e-05, + "loss": 5.9128, + "step": 194 + }, + { + "epoch": 0.08, + "grad_norm": 3.8831242401657087, + "learning_rate": 1.987940371913044e-05, + "loss": 5.8428, + "step": 195 + }, + { + "epoch": 0.08, + "grad_norm": 3.174688702002974, + "learning_rate": 1.9877389540877686e-05, + "loss": 5.7467, + "step": 196 + }, + { + "epoch": 0.08, + "grad_norm": 3.055915757767483, + "learning_rate": 1.9875358785149982e-05, + "loss": 5.7186, + "step": 197 + }, + { + "epoch": 0.08, + "grad_norm": 5.782137439007435, + "learning_rate": 1.987331145535559e-05, + "loss": 5.8389, + "step": 198 + }, + { + "epoch": 0.08, + "grad_norm": 3.2379181329566777, + "learning_rate": 1.98712475549306e-05, + "loss": 5.7215, + "step": 199 + }, + { + "epoch": 0.08, + "grad_norm": 3.6456344440328845, + "learning_rate": 1.9869167087338908e-05, + "loss": 5.618, + "step": 200 + }, + { + "epoch": 0.08, + "grad_norm": 3.7619772961608255, + "learning_rate": 1.9867070056072215e-05, + "loss": 5.8516, + "step": 201 + }, + { + "epoch": 0.08, + "grad_norm": 3.5740327111798322, + "learning_rate": 1.9864956464650027e-05, + "loss": 5.7066, + "step": 202 + }, + { + "epoch": 0.08, + "grad_norm": 4.033968317687985, + "learning_rate": 1.986282631661963e-05, + "loss": 5.6463, + "step": 203 + }, + { + "epoch": 0.08, + "grad_norm": 3.644873695688273, + "learning_rate": 1.9860679615556112e-05, + "loss": 5.5472, + "step": 204 + }, + { + "epoch": 0.08, + "grad_norm": 3.2695708458307617, + "learning_rate": 1.9858516365062334e-05, + "loss": 5.7738, + "step": 205 + }, + { + "epoch": 0.08, + "grad_norm": 4.43428089723817, + "learning_rate": 1.9856336568768936e-05, + "loss": 5.6885, + "step": 206 + }, + { + "epoch": 0.08, + "grad_norm": 4.203310963319522, + "learning_rate": 1.9854140230334323e-05, + "loss": 5.6778, + "step": 207 + }, + { + "epoch": 0.08, + "grad_norm": 3.4114108774306633, + "learning_rate": 1.985192735344467e-05, + "loss": 5.9011, + "step": 208 + }, + { + "epoch": 0.08, + "grad_norm": 4.125731707197647, + "learning_rate": 1.98496979418139e-05, + "loss": 5.5251, + "step": 209 + }, + { + "epoch": 0.08, + "grad_norm": 
3.558175716829582, + "learning_rate": 1.9847451999183692e-05, + "loss": 5.5834, + "step": 210 + }, + { + "epoch": 0.08, + "grad_norm": 3.1705697609316523, + "learning_rate": 1.9845189529323473e-05, + "loss": 5.6806, + "step": 211 + }, + { + "epoch": 0.08, + "grad_norm": 5.375356341487538, + "learning_rate": 1.98429105360304e-05, + "loss": 5.6382, + "step": 212 + }, + { + "epoch": 0.09, + "grad_norm": 4.886412279702093, + "learning_rate": 1.9840615023129372e-05, + "loss": 5.5464, + "step": 213 + }, + { + "epoch": 0.09, + "grad_norm": 4.003221004452615, + "learning_rate": 1.9838302994473e-05, + "loss": 5.5527, + "step": 214 + }, + { + "epoch": 0.09, + "grad_norm": 3.2142950313355976, + "learning_rate": 1.9835974453941623e-05, + "loss": 5.4318, + "step": 215 + }, + { + "epoch": 0.09, + "grad_norm": 4.124707418757366, + "learning_rate": 1.9833629405443283e-05, + "loss": 5.4974, + "step": 216 + }, + { + "epoch": 0.09, + "grad_norm": 5.162365896102337, + "learning_rate": 1.983126785291375e-05, + "loss": 5.5908, + "step": 217 + }, + { + "epoch": 0.09, + "grad_norm": 3.5671782583753178, + "learning_rate": 1.9828889800316467e-05, + "loss": 5.5886, + "step": 218 + }, + { + "epoch": 0.09, + "grad_norm": 5.552839596348838, + "learning_rate": 1.982649525164258e-05, + "loss": 5.527, + "step": 219 + }, + { + "epoch": 0.09, + "grad_norm": 5.338000185805563, + "learning_rate": 1.9824084210910924e-05, + "loss": 5.6058, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 4.314810218621173, + "learning_rate": 1.9821656682168013e-05, + "loss": 5.6443, + "step": 221 + }, + { + "epoch": 0.09, + "grad_norm": 4.124905233622171, + "learning_rate": 1.9819212669488026e-05, + "loss": 5.8652, + "step": 222 + }, + { + "epoch": 0.09, + "grad_norm": 5.601461559913127, + "learning_rate": 1.9816752176972815e-05, + "loss": 5.6487, + "step": 223 + }, + { + "epoch": 0.09, + "grad_norm": 4.682447349841513, + "learning_rate": 1.9814275208751882e-05, + "loss": 5.4415, + "step": 224 + }, + { + "epoch": 0.09, + "grad_norm": 4.092388938276944, + "learning_rate": 1.9811781768982392e-05, + "loss": 5.4588, + "step": 225 + }, + { + "epoch": 0.09, + "grad_norm": 5.5748442841522134, + "learning_rate": 1.9809271861849147e-05, + "loss": 5.4366, + "step": 226 + }, + { + "epoch": 0.09, + "grad_norm": 3.6795744789993554, + "learning_rate": 1.9806745491564588e-05, + "loss": 5.5378, + "step": 227 + }, + { + "epoch": 0.09, + "grad_norm": 4.236814926224578, + "learning_rate": 1.9804202662368782e-05, + "loss": 5.5051, + "step": 228 + }, + { + "epoch": 0.09, + "grad_norm": 3.8825245603921315, + "learning_rate": 1.980164337852943e-05, + "loss": 5.4982, + "step": 229 + }, + { + "epoch": 0.09, + "grad_norm": 4.689216752889219, + "learning_rate": 1.9799067644341844e-05, + "loss": 5.4037, + "step": 230 + }, + { + "epoch": 0.09, + "grad_norm": 4.498751852842296, + "learning_rate": 1.9796475464128943e-05, + "loss": 5.5379, + "step": 231 + }, + { + "epoch": 0.09, + "grad_norm": 3.2926041035928995, + "learning_rate": 1.9793866842241245e-05, + "loss": 5.6656, + "step": 232 + }, + { + "epoch": 0.09, + "grad_norm": 4.563969254045201, + "learning_rate": 1.9791241783056874e-05, + "loss": 5.5352, + "step": 233 + }, + { + "epoch": 0.09, + "grad_norm": 3.9349353489105567, + "learning_rate": 1.9788600290981525e-05, + "loss": 5.6132, + "step": 234 + }, + { + "epoch": 0.09, + "grad_norm": 4.461095251707113, + "learning_rate": 1.978594237044849e-05, + "loss": 5.5776, + "step": 235 + }, + { + "epoch": 0.09, + "grad_norm": 4.128877934256824, + "learning_rate": 
1.9783268025918622e-05, + "loss": 5.3886, + "step": 236 + }, + { + "epoch": 0.09, + "grad_norm": 4.340131228111977, + "learning_rate": 1.9780577261880336e-05, + "loss": 5.4318, + "step": 237 + }, + { + "epoch": 0.1, + "grad_norm": 3.5405936438145473, + "learning_rate": 1.977787008284962e-05, + "loss": 5.612, + "step": 238 + }, + { + "epoch": 0.1, + "grad_norm": 3.9277621763337125, + "learning_rate": 1.9775146493369996e-05, + "loss": 5.5956, + "step": 239 + }, + { + "epoch": 0.1, + "grad_norm": 4.189175924053646, + "learning_rate": 1.977240649801253e-05, + "loss": 5.4193, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 3.7261802456797044, + "learning_rate": 1.9769650101375835e-05, + "loss": 5.4232, + "step": 241 + }, + { + "epoch": 0.1, + "grad_norm": 5.461796387404502, + "learning_rate": 1.9766877308086038e-05, + "loss": 5.5134, + "step": 242 + }, + { + "epoch": 0.1, + "grad_norm": 4.061775651187604, + "learning_rate": 1.9764088122796785e-05, + "loss": 5.4679, + "step": 243 + }, + { + "epoch": 0.1, + "grad_norm": 5.965102040882132, + "learning_rate": 1.976128255018924e-05, + "loss": 5.5467, + "step": 244 + }, + { + "epoch": 0.1, + "grad_norm": 4.676829205300839, + "learning_rate": 1.9758460594972068e-05, + "loss": 5.4451, + "step": 245 + }, + { + "epoch": 0.1, + "grad_norm": 4.911400602977502, + "learning_rate": 1.975562226188143e-05, + "loss": 5.4803, + "step": 246 + }, + { + "epoch": 0.1, + "grad_norm": 6.282418377084345, + "learning_rate": 1.9752767555680967e-05, + "loss": 5.286, + "step": 247 + }, + { + "epoch": 0.1, + "grad_norm": 4.308118562484986, + "learning_rate": 1.9749896481161807e-05, + "loss": 5.4824, + "step": 248 + }, + { + "epoch": 0.1, + "grad_norm": 7.263589886497265, + "learning_rate": 1.9747009043142556e-05, + "loss": 5.5256, + "step": 249 + }, + { + "epoch": 0.1, + "grad_norm": 3.827111939511474, + "learning_rate": 1.9744105246469264e-05, + "loss": 5.5244, + "step": 250 + }, + { + "epoch": 0.1, + "grad_norm": 5.403518844265166, + "learning_rate": 1.974118509601545e-05, + "loss": 5.3989, + "step": 251 + }, + { + "epoch": 0.1, + "grad_norm": 3.720678665989358, + "learning_rate": 1.9738248596682078e-05, + "loss": 5.4725, + "step": 252 + }, + { + "epoch": 0.1, + "grad_norm": 5.631390734909404, + "learning_rate": 1.973529575339755e-05, + "loss": 5.4297, + "step": 253 + }, + { + "epoch": 0.1, + "grad_norm": 3.629825341493495, + "learning_rate": 1.9732326571117703e-05, + "loss": 5.3291, + "step": 254 + }, + { + "epoch": 0.1, + "grad_norm": 4.527878508159066, + "learning_rate": 1.9729341054825783e-05, + "loss": 5.4482, + "step": 255 + }, + { + "epoch": 0.1, + "grad_norm": 5.041074384352786, + "learning_rate": 1.9726339209532462e-05, + "loss": 5.4306, + "step": 256 + }, + { + "epoch": 0.1, + "grad_norm": 3.7876647301759494, + "learning_rate": 1.9723321040275816e-05, + "loss": 5.3598, + "step": 257 + }, + { + "epoch": 0.1, + "grad_norm": 3.9349814631193314, + "learning_rate": 1.972028655212131e-05, + "loss": 5.2161, + "step": 258 + }, + { + "epoch": 0.1, + "grad_norm": 4.196007883835419, + "learning_rate": 1.9717235750161808e-05, + "loss": 5.1868, + "step": 259 + }, + { + "epoch": 0.1, + "grad_norm": 4.128921591316096, + "learning_rate": 1.9714168639517543e-05, + "loss": 5.274, + "step": 260 + }, + { + "epoch": 0.1, + "grad_norm": 3.42696847333355, + "learning_rate": 1.971108522533613e-05, + "loss": 5.4778, + "step": 261 + }, + { + "epoch": 0.1, + "grad_norm": 3.0050007534041336, + "learning_rate": 1.9707985512792544e-05, + "loss": 5.5596, + "step": 262 + }, + { + "epoch": 
0.11, + "grad_norm": 3.669616234270172, + "learning_rate": 1.9704869507089105e-05, + "loss": 5.3656, + "step": 263 + }, + { + "epoch": 0.11, + "grad_norm": 3.7645184234700584, + "learning_rate": 1.970173721345549e-05, + "loss": 5.3022, + "step": 264 + }, + { + "epoch": 0.11, + "grad_norm": 3.0530948203414745, + "learning_rate": 1.9698588637148705e-05, + "loss": 5.374, + "step": 265 + }, + { + "epoch": 0.11, + "grad_norm": 3.8200253090291936, + "learning_rate": 1.9695423783453086e-05, + "loss": 5.2075, + "step": 266 + }, + { + "epoch": 0.11, + "grad_norm": 4.315490825813043, + "learning_rate": 1.9692242657680286e-05, + "loss": 5.2312, + "step": 267 + }, + { + "epoch": 0.11, + "grad_norm": 4.5436673528211955, + "learning_rate": 1.9689045265169272e-05, + "loss": 5.3477, + "step": 268 + }, + { + "epoch": 0.11, + "grad_norm": 3.6433231953876364, + "learning_rate": 1.9685831611286312e-05, + "loss": 5.3378, + "step": 269 + }, + { + "epoch": 0.11, + "grad_norm": 4.116580849675184, + "learning_rate": 1.9682601701424958e-05, + "loss": 5.3044, + "step": 270 + }, + { + "epoch": 0.11, + "grad_norm": 3.3227233442460795, + "learning_rate": 1.9679355541006056e-05, + "loss": 5.3657, + "step": 271 + }, + { + "epoch": 0.11, + "grad_norm": 3.7533500222827096, + "learning_rate": 1.9676093135477713e-05, + "loss": 5.255, + "step": 272 + }, + { + "epoch": 0.11, + "grad_norm": 4.564992658943727, + "learning_rate": 1.9672814490315312e-05, + "loss": 5.3761, + "step": 273 + }, + { + "epoch": 0.11, + "grad_norm": 3.8094013405705556, + "learning_rate": 1.9669519611021485e-05, + "loss": 5.2762, + "step": 274 + }, + { + "epoch": 0.11, + "grad_norm": 3.43100443242049, + "learning_rate": 1.9666208503126115e-05, + "loss": 5.1598, + "step": 275 + }, + { + "epoch": 0.11, + "grad_norm": 4.2699612429758895, + "learning_rate": 1.9662881172186313e-05, + "loss": 5.3879, + "step": 276 + }, + { + "epoch": 0.11, + "grad_norm": 3.922986884818375, + "learning_rate": 1.9659537623786428e-05, + "loss": 5.4821, + "step": 277 + }, + { + "epoch": 0.11, + "grad_norm": 5.4516156643641995, + "learning_rate": 1.9656177863538025e-05, + "loss": 5.3939, + "step": 278 + }, + { + "epoch": 0.11, + "grad_norm": 4.939693355525761, + "learning_rate": 1.965280189707987e-05, + "loss": 5.2649, + "step": 279 + }, + { + "epoch": 0.11, + "grad_norm": 3.3627846721747887, + "learning_rate": 1.9649409730077934e-05, + "loss": 5.3007, + "step": 280 + }, + { + "epoch": 0.11, + "grad_norm": 5.506360077258328, + "learning_rate": 1.9646001368225382e-05, + "loss": 5.4097, + "step": 281 + }, + { + "epoch": 0.11, + "grad_norm": 6.214193129635226, + "learning_rate": 1.9642576817242553e-05, + "loss": 5.316, + "step": 282 + }, + { + "epoch": 0.11, + "grad_norm": 5.2863221250018135, + "learning_rate": 1.9639136082876954e-05, + "loss": 5.3198, + "step": 283 + }, + { + "epoch": 0.11, + "grad_norm": 5.826690727773217, + "learning_rate": 1.9635679170903258e-05, + "loss": 5.3665, + "step": 284 + }, + { + "epoch": 0.11, + "grad_norm": 6.521409795613998, + "learning_rate": 1.9632206087123296e-05, + "loss": 5.1958, + "step": 285 + }, + { + "epoch": 0.11, + "grad_norm": 6.195903307493117, + "learning_rate": 1.962871683736603e-05, + "loss": 5.1238, + "step": 286 + }, + { + "epoch": 0.11, + "grad_norm": 5.701677642348548, + "learning_rate": 1.962521142748755e-05, + "loss": 5.1754, + "step": 287 + }, + { + "epoch": 0.12, + "grad_norm": 5.650371964203301, + "learning_rate": 1.9621689863371083e-05, + "loss": 5.2856, + "step": 288 + }, + { + "epoch": 0.12, + "grad_norm": 4.643759454410139, 
+ "learning_rate": 1.9618152150926953e-05, + "loss": 5.4458, + "step": 289 + }, + { + "epoch": 0.12, + "grad_norm": 5.0810750026652345, + "learning_rate": 1.9614598296092603e-05, + "loss": 5.3881, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 4.540921451453481, + "learning_rate": 1.9611028304832547e-05, + "loss": 5.3133, + "step": 291 + }, + { + "epoch": 0.12, + "grad_norm": 4.449089021608378, + "learning_rate": 1.9607442183138403e-05, + "loss": 5.2165, + "step": 292 + }, + { + "epoch": 0.12, + "grad_norm": 4.303653173381438, + "learning_rate": 1.960383993702884e-05, + "loss": 5.2999, + "step": 293 + }, + { + "epoch": 0.12, + "grad_norm": 4.777311506166449, + "learning_rate": 1.9600221572549607e-05, + "loss": 5.1426, + "step": 294 + }, + { + "epoch": 0.12, + "grad_norm": 5.756333904685148, + "learning_rate": 1.9596587095773496e-05, + "loss": 5.3915, + "step": 295 + }, + { + "epoch": 0.12, + "grad_norm": 6.378184736178621, + "learning_rate": 1.959293651280034e-05, + "loss": 5.138, + "step": 296 + }, + { + "epoch": 0.12, + "grad_norm": 4.883143470991173, + "learning_rate": 1.958926982975701e-05, + "loss": 5.1153, + "step": 297 + }, + { + "epoch": 0.12, + "grad_norm": 6.929401125683602, + "learning_rate": 1.958558705279739e-05, + "loss": 5.2267, + "step": 298 + }, + { + "epoch": 0.12, + "grad_norm": 5.781275244606163, + "learning_rate": 1.9581888188102375e-05, + "loss": 5.2871, + "step": 299 + }, + { + "epoch": 0.12, + "grad_norm": 4.234221317741716, + "learning_rate": 1.957817324187987e-05, + "loss": 5.1617, + "step": 300 + }, + { + "epoch": 0.12, + "grad_norm": 7.159465777574264, + "learning_rate": 1.9574442220364768e-05, + "loss": 5.1299, + "step": 301 + }, + { + "epoch": 0.12, + "grad_norm": 4.256329018916455, + "learning_rate": 1.9570695129818928e-05, + "loss": 5.2352, + "step": 302 + }, + { + "epoch": 0.12, + "grad_norm": 5.133018933678203, + "learning_rate": 1.956693197653119e-05, + "loss": 5.2031, + "step": 303 + }, + { + "epoch": 0.12, + "grad_norm": 3.87755014809663, + "learning_rate": 1.9563152766817356e-05, + "loss": 5.2025, + "step": 304 + }, + { + "epoch": 0.12, + "grad_norm": 3.9463633578914727, + "learning_rate": 1.9559357507020163e-05, + "loss": 5.0225, + "step": 305 + }, + { + "epoch": 0.12, + "grad_norm": 4.274298024612477, + "learning_rate": 1.9555546203509297e-05, + "loss": 5.1675, + "step": 306 + }, + { + "epoch": 0.12, + "grad_norm": 4.571969693205734, + "learning_rate": 1.9551718862681363e-05, + "loss": 5.1387, + "step": 307 + }, + { + "epoch": 0.12, + "grad_norm": 4.441878767657563, + "learning_rate": 1.9547875490959884e-05, + "loss": 5.3539, + "step": 308 + }, + { + "epoch": 0.12, + "grad_norm": 3.627572689101822, + "learning_rate": 1.9544016094795294e-05, + "loss": 5.1321, + "step": 309 + }, + { + "epoch": 0.12, + "grad_norm": 4.781514761663345, + "learning_rate": 1.9540140680664915e-05, + "loss": 5.1977, + "step": 310 + }, + { + "epoch": 0.12, + "grad_norm": 4.140560769424146, + "learning_rate": 1.953624925507295e-05, + "loss": 5.0422, + "step": 311 + }, + { + "epoch": 0.12, + "grad_norm": 4.907850959965522, + "learning_rate": 1.953234182455048e-05, + "loss": 5.3369, + "step": 312 + }, + { + "epoch": 0.13, + "grad_norm": 4.691324420893443, + "learning_rate": 1.9528418395655443e-05, + "loss": 5.2007, + "step": 313 + }, + { + "epoch": 0.13, + "grad_norm": 5.0983085273471955, + "learning_rate": 1.952447897497263e-05, + "loss": 5.1451, + "step": 314 + }, + { + "epoch": 0.13, + "grad_norm": 5.354850165008846, + "learning_rate": 1.952052356911368e-05, + "loss": 
5.1415, + "step": 315 + }, + { + "epoch": 0.13, + "grad_norm": 3.8975948202258897, + "learning_rate": 1.9516552184717036e-05, + "loss": 5.0287, + "step": 316 + }, + { + "epoch": 0.13, + "grad_norm": 4.873422769676822, + "learning_rate": 1.951256482844799e-05, + "loss": 4.9794, + "step": 317 + }, + { + "epoch": 0.13, + "grad_norm": 4.683604880793774, + "learning_rate": 1.9508561506998613e-05, + "loss": 5.105, + "step": 318 + }, + { + "epoch": 0.13, + "grad_norm": 7.194250897520733, + "learning_rate": 1.950454222708778e-05, + "loss": 5.2785, + "step": 319 + }, + { + "epoch": 0.13, + "grad_norm": 6.560112427184304, + "learning_rate": 1.950050699546116e-05, + "loss": 4.9859, + "step": 320 + }, + { + "epoch": 0.13, + "grad_norm": 5.652738444026526, + "learning_rate": 1.949645581889118e-05, + "loss": 5.0616, + "step": 321 + }, + { + "epoch": 0.13, + "grad_norm": 3.8360003224169783, + "learning_rate": 1.9492388704177036e-05, + "loss": 5.0376, + "step": 322 + }, + { + "epoch": 0.13, + "grad_norm": 4.744616157206043, + "learning_rate": 1.9488305658144666e-05, + "loss": 5.0505, + "step": 323 + }, + { + "epoch": 0.13, + "grad_norm": 5.197730198094315, + "learning_rate": 1.9484206687646753e-05, + "loss": 5.2067, + "step": 324 + }, + { + "epoch": 0.13, + "grad_norm": 4.344506328513772, + "learning_rate": 1.9480091799562706e-05, + "loss": 5.2547, + "step": 325 + }, + { + "epoch": 0.13, + "grad_norm": 5.033536417957443, + "learning_rate": 1.9475961000798645e-05, + "loss": 5.1837, + "step": 326 + }, + { + "epoch": 0.13, + "grad_norm": 4.220408063420163, + "learning_rate": 1.947181429828739e-05, + "loss": 5.0591, + "step": 327 + }, + { + "epoch": 0.13, + "grad_norm": 4.20562324255212, + "learning_rate": 1.9467651698988464e-05, + "loss": 5.2423, + "step": 328 + }, + { + "epoch": 0.13, + "grad_norm": 4.741171703804873, + "learning_rate": 1.9463473209888063e-05, + "loss": 5.0776, + "step": 329 + }, + { + "epoch": 0.13, + "grad_norm": 3.352390084872122, + "learning_rate": 1.9459278837999048e-05, + "loss": 5.1908, + "step": 330 + }, + { + "epoch": 0.13, + "grad_norm": 7.336766536879142, + "learning_rate": 1.9455068590360943e-05, + "loss": 5.0963, + "step": 331 + }, + { + "epoch": 0.13, + "grad_norm": 5.323080508118235, + "learning_rate": 1.9450842474039914e-05, + "loss": 5.0653, + "step": 332 + }, + { + "epoch": 0.13, + "grad_norm": 7.748250010976563, + "learning_rate": 1.944660049612876e-05, + "loss": 5.1332, + "step": 333 + }, + { + "epoch": 0.13, + "grad_norm": 4.7527834896436865, + "learning_rate": 1.9442342663746903e-05, + "loss": 4.9813, + "step": 334 + }, + { + "epoch": 0.13, + "grad_norm": 5.5313121566876555, + "learning_rate": 1.9438068984040366e-05, + "loss": 4.8612, + "step": 335 + }, + { + "epoch": 0.13, + "grad_norm": 4.769650373642717, + "learning_rate": 1.943377946418178e-05, + "loss": 5.1038, + "step": 336 + }, + { + "epoch": 0.13, + "grad_norm": 4.254447255879978, + "learning_rate": 1.942947411137035e-05, + "loss": 4.9938, + "step": 337 + }, + { + "epoch": 0.14, + "grad_norm": 7.232316770724755, + "learning_rate": 1.942515293283187e-05, + "loss": 4.9693, + "step": 338 + }, + { + "epoch": 0.14, + "grad_norm": 3.84144956221707, + "learning_rate": 1.9420815935818673e-05, + "loss": 5.0987, + "step": 339 + }, + { + "epoch": 0.14, + "grad_norm": 4.639198534380874, + "learning_rate": 1.9416463127609655e-05, + "loss": 5.0796, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 4.293798925443441, + "learning_rate": 1.941209451551025e-05, + "loss": 4.9292, + "step": 341 + }, + { + "epoch": 0.14, + 
"grad_norm": 4.903633350612866, + "learning_rate": 1.9407710106852405e-05, + "loss": 5.129, + "step": 342 + }, + { + "epoch": 0.14, + "grad_norm": 5.126542780485656, + "learning_rate": 1.940330990899459e-05, + "loss": 4.9603, + "step": 343 + }, + { + "epoch": 0.14, + "grad_norm": 4.288069212982325, + "learning_rate": 1.9398893929321763e-05, + "loss": 4.9396, + "step": 344 + }, + { + "epoch": 0.14, + "grad_norm": 3.9523613385551686, + "learning_rate": 1.9394462175245382e-05, + "loss": 5.1768, + "step": 345 + }, + { + "epoch": 0.14, + "grad_norm": 3.996899954396938, + "learning_rate": 1.939001465420337e-05, + "loss": 4.9787, + "step": 346 + }, + { + "epoch": 0.14, + "grad_norm": 4.569032903501148, + "learning_rate": 1.9385551373660113e-05, + "loss": 4.9764, + "step": 347 + }, + { + "epoch": 0.14, + "grad_norm": 4.176747620302349, + "learning_rate": 1.9381072341106453e-05, + "loss": 5.1336, + "step": 348 + }, + { + "epoch": 0.14, + "grad_norm": 4.154232238794425, + "learning_rate": 1.937657756405966e-05, + "loss": 5.058, + "step": 349 + }, + { + "epoch": 0.14, + "grad_norm": 3.8984462605590084, + "learning_rate": 1.937206705006344e-05, + "loss": 4.9333, + "step": 350 + }, + { + "epoch": 0.14, + "grad_norm": 4.690682787780671, + "learning_rate": 1.9367540806687894e-05, + "loss": 4.974, + "step": 351 + }, + { + "epoch": 0.14, + "grad_norm": 4.031124140257661, + "learning_rate": 1.9362998841529542e-05, + "loss": 5.1886, + "step": 352 + }, + { + "epoch": 0.14, + "grad_norm": 3.992297500197864, + "learning_rate": 1.935844116221127e-05, + "loss": 5.0888, + "step": 353 + }, + { + "epoch": 0.14, + "grad_norm": 3.818973294829996, + "learning_rate": 1.9353867776382357e-05, + "loss": 5.1499, + "step": 354 + }, + { + "epoch": 0.14, + "grad_norm": 3.619804549551641, + "learning_rate": 1.9349278691718426e-05, + "loss": 4.9794, + "step": 355 + }, + { + "epoch": 0.14, + "grad_norm": 3.592935767722761, + "learning_rate": 1.934467391592146e-05, + "loss": 4.8134, + "step": 356 + }, + { + "epoch": 0.14, + "grad_norm": 3.796223984937865, + "learning_rate": 1.9340053456719768e-05, + "loss": 5.0538, + "step": 357 + }, + { + "epoch": 0.14, + "grad_norm": 3.2741434670658767, + "learning_rate": 1.9335417321867988e-05, + "loss": 4.8695, + "step": 358 + }, + { + "epoch": 0.14, + "grad_norm": 3.4290531934102613, + "learning_rate": 1.9330765519147058e-05, + "loss": 4.8899, + "step": 359 + }, + { + "epoch": 0.14, + "grad_norm": 3.8359637299600364, + "learning_rate": 1.9326098056364224e-05, + "loss": 5.0173, + "step": 360 + }, + { + "epoch": 0.14, + "grad_norm": 5.249562964456709, + "learning_rate": 1.9321414941353006e-05, + "loss": 4.9638, + "step": 361 + }, + { + "epoch": 0.14, + "grad_norm": 3.552155369652054, + "learning_rate": 1.931671618197319e-05, + "loss": 4.8706, + "step": 362 + }, + { + "epoch": 0.15, + "grad_norm": 4.94673706390727, + "learning_rate": 1.931200178611083e-05, + "loss": 4.9603, + "step": 363 + }, + { + "epoch": 0.15, + "grad_norm": 3.587398864725715, + "learning_rate": 1.9307271761678214e-05, + "loss": 5.1801, + "step": 364 + }, + { + "epoch": 0.15, + "grad_norm": 3.786932404767004, + "learning_rate": 1.9302526116613863e-05, + "loss": 4.8651, + "step": 365 + }, + { + "epoch": 0.15, + "grad_norm": 4.4252520799983905, + "learning_rate": 1.9297764858882516e-05, + "loss": 5.1099, + "step": 366 + }, + { + "epoch": 0.15, + "grad_norm": 3.1820317143483927, + "learning_rate": 1.9292987996475113e-05, + "loss": 4.8881, + "step": 367 + }, + { + "epoch": 0.15, + "grad_norm": 4.807420527910623, + 
"learning_rate": 1.928819553740878e-05, + "loss": 4.9814, + "step": 368 + }, + { + "epoch": 0.15, + "grad_norm": 4.2681398731258975, + "learning_rate": 1.9283387489726827e-05, + "loss": 4.8342, + "step": 369 + }, + { + "epoch": 0.15, + "grad_norm": 4.2039721487863915, + "learning_rate": 1.9278563861498726e-05, + "loss": 4.9355, + "step": 370 + }, + { + "epoch": 0.15, + "grad_norm": 4.125766177541317, + "learning_rate": 1.9273724660820086e-05, + "loss": 5.0716, + "step": 371 + }, + { + "epoch": 0.15, + "grad_norm": 3.872774946960404, + "learning_rate": 1.9268869895812673e-05, + "loss": 4.8714, + "step": 372 + }, + { + "epoch": 0.15, + "grad_norm": 5.4800006941199735, + "learning_rate": 1.9263999574624357e-05, + "loss": 4.7984, + "step": 373 + }, + { + "epoch": 0.15, + "grad_norm": 4.493398067066022, + "learning_rate": 1.925911370542912e-05, + "loss": 5.1133, + "step": 374 + }, + { + "epoch": 0.15, + "grad_norm": 5.493654763578316, + "learning_rate": 1.9254212296427043e-05, + "loss": 4.7568, + "step": 375 + }, + { + "epoch": 0.15, + "grad_norm": 4.658644058543485, + "learning_rate": 1.9249295355844286e-05, + "loss": 4.8862, + "step": 376 + }, + { + "epoch": 0.15, + "grad_norm": 4.74943938464457, + "learning_rate": 1.9244362891933077e-05, + "loss": 4.9593, + "step": 377 + }, + { + "epoch": 0.15, + "grad_norm": 4.3633638724612815, + "learning_rate": 1.9239414912971697e-05, + "loss": 4.8014, + "step": 378 + }, + { + "epoch": 0.15, + "grad_norm": 3.8515410115588504, + "learning_rate": 1.923445142726446e-05, + "loss": 5.0227, + "step": 379 + }, + { + "epoch": 0.15, + "grad_norm": 4.555780430950132, + "learning_rate": 1.922947244314172e-05, + "loss": 4.8137, + "step": 380 + }, + { + "epoch": 0.15, + "grad_norm": 5.809535670084992, + "learning_rate": 1.922447796895982e-05, + "loss": 4.9212, + "step": 381 + }, + { + "epoch": 0.15, + "grad_norm": 5.688606257617834, + "learning_rate": 1.9219468013101123e-05, + "loss": 5.0151, + "step": 382 + }, + { + "epoch": 0.15, + "grad_norm": 6.349121714522054, + "learning_rate": 1.9214442583973965e-05, + "loss": 4.8788, + "step": 383 + }, + { + "epoch": 0.15, + "grad_norm": 5.863385327006022, + "learning_rate": 1.920940169001265e-05, + "loss": 4.8467, + "step": 384 + }, + { + "epoch": 0.15, + "grad_norm": 6.852291860004504, + "learning_rate": 1.9204345339677442e-05, + "loss": 5.0244, + "step": 385 + }, + { + "epoch": 0.15, + "grad_norm": 5.064040378679961, + "learning_rate": 1.919927354145454e-05, + "loss": 4.8919, + "step": 386 + }, + { + "epoch": 0.15, + "grad_norm": 4.802953333476715, + "learning_rate": 1.919418630385607e-05, + "loss": 4.9746, + "step": 387 + }, + { + "epoch": 0.16, + "grad_norm": 4.651834945637443, + "learning_rate": 1.9189083635420077e-05, + "loss": 4.7957, + "step": 388 + }, + { + "epoch": 0.16, + "grad_norm": 4.565966174891365, + "learning_rate": 1.9183965544710495e-05, + "loss": 4.9191, + "step": 389 + }, + { + "epoch": 0.16, + "grad_norm": 4.027565446582875, + "learning_rate": 1.9178832040317153e-05, + "loss": 4.9894, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 5.785579850054768, + "learning_rate": 1.9173683130855737e-05, + "loss": 4.8868, + "step": 391 + }, + { + "epoch": 0.16, + "grad_norm": 4.764008160209594, + "learning_rate": 1.9168518824967797e-05, + "loss": 4.9743, + "step": 392 + }, + { + "epoch": 0.16, + "grad_norm": 4.849385545583676, + "learning_rate": 1.916333913132072e-05, + "loss": 4.9855, + "step": 393 + }, + { + "epoch": 0.16, + "grad_norm": 5.172608067530522, + "learning_rate": 1.915814405860771e-05, + "loss": 
4.8692, + "step": 394 + }, + { + "epoch": 0.16, + "grad_norm": 6.252142640840125, + "learning_rate": 1.91529336155478e-05, + "loss": 4.7972, + "step": 395 + }, + { + "epoch": 0.16, + "grad_norm": 3.7879672350261075, + "learning_rate": 1.9147707810885798e-05, + "loss": 4.8605, + "step": 396 + }, + { + "epoch": 0.16, + "grad_norm": 5.846876109888715, + "learning_rate": 1.9142466653392317e-05, + "loss": 4.7777, + "step": 397 + }, + { + "epoch": 0.16, + "grad_norm": 4.271047397565751, + "learning_rate": 1.913721015186372e-05, + "loss": 4.7847, + "step": 398 + }, + { + "epoch": 0.16, + "grad_norm": 3.792822143066731, + "learning_rate": 1.913193831512213e-05, + "loss": 4.8928, + "step": 399 + }, + { + "epoch": 0.16, + "grad_norm": 3.818952980145011, + "learning_rate": 1.9126651152015404e-05, + "loss": 4.7874, + "step": 400 + }, + { + "epoch": 0.16, + "grad_norm": 4.5004007269500335, + "learning_rate": 1.912134867141712e-05, + "loss": 4.9484, + "step": 401 + }, + { + "epoch": 0.16, + "grad_norm": 3.8619583600345053, + "learning_rate": 1.911603088222657e-05, + "loss": 4.8863, + "step": 402 + }, + { + "epoch": 0.16, + "grad_norm": 4.068544846748943, + "learning_rate": 1.9110697793368733e-05, + "loss": 4.9838, + "step": 403 + }, + { + "epoch": 0.16, + "grad_norm": 4.678329405936819, + "learning_rate": 1.9105349413794272e-05, + "loss": 4.8077, + "step": 404 + }, + { + "epoch": 0.16, + "grad_norm": 4.1845179577894065, + "learning_rate": 1.9099985752479505e-05, + "loss": 4.9151, + "step": 405 + }, + { + "epoch": 0.16, + "grad_norm": 4.25200584743188, + "learning_rate": 1.9094606818426403e-05, + "loss": 4.9883, + "step": 406 + }, + { + "epoch": 0.16, + "grad_norm": 4.469433610330476, + "learning_rate": 1.908921262066257e-05, + "loss": 4.9338, + "step": 407 + }, + { + "epoch": 0.16, + "grad_norm": 3.9569422357282615, + "learning_rate": 1.9083803168241225e-05, + "loss": 4.7066, + "step": 408 + }, + { + "epoch": 0.16, + "grad_norm": 4.162760399856056, + "learning_rate": 1.9078378470241183e-05, + "loss": 4.8367, + "step": 409 + }, + { + "epoch": 0.16, + "grad_norm": 3.5660794253037156, + "learning_rate": 1.9072938535766864e-05, + "loss": 4.7543, + "step": 410 + }, + { + "epoch": 0.16, + "grad_norm": 4.797315689316765, + "learning_rate": 1.9067483373948245e-05, + "loss": 4.9331, + "step": 411 + }, + { + "epoch": 0.16, + "grad_norm": 4.111914062880509, + "learning_rate": 1.906201299394086e-05, + "loss": 4.7696, + "step": 412 + }, + { + "epoch": 0.17, + "grad_norm": 4.539315392333889, + "learning_rate": 1.9056527404925788e-05, + "loss": 5.0399, + "step": 413 + }, + { + "epoch": 0.17, + "grad_norm": 4.059585069189535, + "learning_rate": 1.9051026616109637e-05, + "loss": 4.757, + "step": 414 + }, + { + "epoch": 0.17, + "grad_norm": 5.3260168539805735, + "learning_rate": 1.904551063672452e-05, + "loss": 4.702, + "step": 415 + }, + { + "epoch": 0.17, + "grad_norm": 4.549834876237694, + "learning_rate": 1.9039979476028044e-05, + "loss": 4.6862, + "step": 416 + }, + { + "epoch": 0.17, + "grad_norm": 3.6787257685993997, + "learning_rate": 1.90344331433033e-05, + "loss": 4.7645, + "step": 417 + }, + { + "epoch": 0.17, + "grad_norm": 4.4154234577817055, + "learning_rate": 1.9028871647858836e-05, + "loss": 4.7376, + "step": 418 + }, + { + "epoch": 0.17, + "grad_norm": 3.9661339259733364, + "learning_rate": 1.9023294999028654e-05, + "loss": 4.7945, + "step": 419 + }, + { + "epoch": 0.17, + "grad_norm": 4.932704209466543, + "learning_rate": 1.9017703206172187e-05, + "loss": 4.7688, + "step": 420 + }, + { + "epoch": 0.17, 
+ "grad_norm": 3.861972016862522, + "learning_rate": 1.9012096278674283e-05, + "loss": 4.8772, + "step": 421 + }, + { + "epoch": 0.17, + "grad_norm": 3.548438857043438, + "learning_rate": 1.900647422594519e-05, + "loss": 4.8837, + "step": 422 + }, + { + "epoch": 0.17, + "grad_norm": 4.6790725280248475, + "learning_rate": 1.900083705742054e-05, + "loss": 4.8103, + "step": 423 + }, + { + "epoch": 0.17, + "grad_norm": 5.539654010468228, + "learning_rate": 1.8995184782561343e-05, + "loss": 4.893, + "step": 424 + }, + { + "epoch": 0.17, + "grad_norm": 3.68762323673155, + "learning_rate": 1.8989517410853956e-05, + "loss": 4.8705, + "step": 425 + }, + { + "epoch": 0.17, + "grad_norm": 6.36298700012937, + "learning_rate": 1.8983834951810068e-05, + "loss": 4.8289, + "step": 426 + }, + { + "epoch": 0.17, + "grad_norm": 4.5203690816614435, + "learning_rate": 1.89781374149667e-05, + "loss": 4.7811, + "step": 427 + }, + { + "epoch": 0.17, + "grad_norm": 4.99630344415343, + "learning_rate": 1.897242480988617e-05, + "loss": 4.6708, + "step": 428 + }, + { + "epoch": 0.17, + "grad_norm": 4.297946514995462, + "learning_rate": 1.8966697146156092e-05, + "loss": 4.7412, + "step": 429 + }, + { + "epoch": 0.17, + "grad_norm": 4.807089182475005, + "learning_rate": 1.896095443338935e-05, + "loss": 4.8245, + "step": 430 + }, + { + "epoch": 0.17, + "grad_norm": 4.442015701587801, + "learning_rate": 1.895519668122408e-05, + "loss": 4.741, + "step": 431 + }, + { + "epoch": 0.17, + "grad_norm": 4.627904445348331, + "learning_rate": 1.894942389932367e-05, + "loss": 4.9728, + "step": 432 + }, + { + "epoch": 0.17, + "grad_norm": 4.3819001573345915, + "learning_rate": 1.8943636097376728e-05, + "loss": 4.4982, + "step": 433 + }, + { + "epoch": 0.17, + "grad_norm": 3.9790732584475466, + "learning_rate": 1.8937833285097067e-05, + "loss": 4.9344, + "step": 434 + }, + { + "epoch": 0.17, + "grad_norm": 4.418435773767789, + "learning_rate": 1.8932015472223692e-05, + "loss": 4.7223, + "step": 435 + }, + { + "epoch": 0.17, + "grad_norm": 4.379821395443412, + "learning_rate": 1.8926182668520794e-05, + "loss": 4.6574, + "step": 436 + }, + { + "epoch": 0.17, + "grad_norm": 4.002954466901974, + "learning_rate": 1.892033488377771e-05, + "loss": 4.7822, + "step": 437 + }, + { + "epoch": 0.18, + "grad_norm": 5.36170107372927, + "learning_rate": 1.891447212780893e-05, + "loss": 4.6219, + "step": 438 + }, + { + "epoch": 0.18, + "grad_norm": 4.555620399748319, + "learning_rate": 1.8908594410454068e-05, + "loss": 4.8075, + "step": 439 + }, + { + "epoch": 0.18, + "grad_norm": 4.765240769452751, + "learning_rate": 1.8902701741577844e-05, + "loss": 4.8486, + "step": 440 + }, + { + "epoch": 0.18, + "grad_norm": 4.479000049549065, + "learning_rate": 1.8896794131070073e-05, + "loss": 4.8357, + "step": 441 + }, + { + "epoch": 0.18, + "grad_norm": 4.846180547802962, + "learning_rate": 1.8890871588845653e-05, + "loss": 4.8586, + "step": 442 + }, + { + "epoch": 0.18, + "grad_norm": 5.4394887924737105, + "learning_rate": 1.8884934124844534e-05, + "loss": 4.8759, + "step": 443 + }, + { + "epoch": 0.18, + "grad_norm": 5.771942523822463, + "learning_rate": 1.8878981749031718e-05, + "loss": 4.6933, + "step": 444 + }, + { + "epoch": 0.18, + "grad_norm": 5.222815576090352, + "learning_rate": 1.8873014471397225e-05, + "loss": 4.6762, + "step": 445 + }, + { + "epoch": 0.18, + "grad_norm": 3.8429763054526256, + "learning_rate": 1.886703230195609e-05, + "loss": 4.963, + "step": 446 + }, + { + "epoch": 0.18, + "grad_norm": 4.934157371040181, + "learning_rate": 
1.8861035250748343e-05, + "loss": 4.8002, + "step": 447 + }, + { + "epoch": 0.18, + "grad_norm": 4.717891500942569, + "learning_rate": 1.8855023327838984e-05, + "loss": 4.7741, + "step": 448 + }, + { + "epoch": 0.18, + "grad_norm": 4.000372249774135, + "learning_rate": 1.8848996543317982e-05, + "loss": 4.6798, + "step": 449 + }, + { + "epoch": 0.18, + "grad_norm": 3.7911306427773557, + "learning_rate": 1.8842954907300236e-05, + "loss": 4.7683, + "step": 450 + }, + { + "epoch": 0.18, + "grad_norm": 5.059160382982438, + "learning_rate": 1.8836898429925586e-05, + "loss": 4.7103, + "step": 451 + }, + { + "epoch": 0.18, + "grad_norm": 4.154690931845284, + "learning_rate": 1.883082712135877e-05, + "loss": 4.6826, + "step": 452 + }, + { + "epoch": 0.18, + "grad_norm": 7.718997675526316, + "learning_rate": 1.8824740991789417e-05, + "loss": 4.6789, + "step": 453 + }, + { + "epoch": 0.18, + "grad_norm": 3.9877134941447205, + "learning_rate": 1.8818640051432036e-05, + "loss": 4.7378, + "step": 454 + }, + { + "epoch": 0.18, + "grad_norm": 6.986902956873382, + "learning_rate": 1.881252431052599e-05, + "loss": 4.8052, + "step": 455 + }, + { + "epoch": 0.18, + "grad_norm": 3.581422466169945, + "learning_rate": 1.8806393779335483e-05, + "loss": 4.7852, + "step": 456 + }, + { + "epoch": 0.18, + "grad_norm": 4.657535097217237, + "learning_rate": 1.8800248468149545e-05, + "loss": 4.8474, + "step": 457 + }, + { + "epoch": 0.18, + "grad_norm": 4.965672082104629, + "learning_rate": 1.8794088387282e-05, + "loss": 4.7586, + "step": 458 + }, + { + "epoch": 0.18, + "grad_norm": 4.277918670720924, + "learning_rate": 1.8787913547071485e-05, + "loss": 4.629, + "step": 459 + }, + { + "epoch": 0.18, + "grad_norm": 7.159016365365489, + "learning_rate": 1.8781723957881374e-05, + "loss": 4.7081, + "step": 460 + }, + { + "epoch": 0.18, + "grad_norm": 4.816953315704852, + "learning_rate": 1.8775519630099822e-05, + "loss": 4.5669, + "step": 461 + }, + { + "epoch": 0.18, + "grad_norm": 5.130734125757001, + "learning_rate": 1.876930057413971e-05, + "loss": 4.598, + "step": 462 + }, + { + "epoch": 0.19, + "grad_norm": 4.014308605597524, + "learning_rate": 1.8763066800438638e-05, + "loss": 4.8269, + "step": 463 + }, + { + "epoch": 0.19, + "grad_norm": 4.896624107406017, + "learning_rate": 1.875681831945891e-05, + "loss": 4.5851, + "step": 464 + }, + { + "epoch": 0.19, + "grad_norm": 4.525332134586662, + "learning_rate": 1.87505551416875e-05, + "loss": 4.662, + "step": 465 + }, + { + "epoch": 0.19, + "grad_norm": 4.06267120156479, + "learning_rate": 1.874427727763607e-05, + "loss": 4.694, + "step": 466 + }, + { + "epoch": 0.19, + "grad_norm": 3.9585474393739455, + "learning_rate": 1.873798473784092e-05, + "loss": 4.7157, + "step": 467 + }, + { + "epoch": 0.19, + "grad_norm": 3.831951830757437, + "learning_rate": 1.8731677532862975e-05, + "loss": 4.8873, + "step": 468 + }, + { + "epoch": 0.19, + "grad_norm": 4.762319748001275, + "learning_rate": 1.872535567328778e-05, + "loss": 4.5554, + "step": 469 + }, + { + "epoch": 0.19, + "grad_norm": 3.9742662708177727, + "learning_rate": 1.871901916972547e-05, + "loss": 4.8751, + "step": 470 + }, + { + "epoch": 0.19, + "grad_norm": 3.8286378361260334, + "learning_rate": 1.8712668032810767e-05, + "loss": 4.67, + "step": 471 + }, + { + "epoch": 0.19, + "grad_norm": 3.917732126677209, + "learning_rate": 1.870630227320294e-05, + "loss": 4.8138, + "step": 472 + }, + { + "epoch": 0.19, + "grad_norm": 3.9483842076440188, + "learning_rate": 1.8699921901585814e-05, + "loss": 4.6845, + "step": 473 + 
}, + { + "epoch": 0.19, + "grad_norm": 3.8546046730369565, + "learning_rate": 1.8693526928667724e-05, + "loss": 4.5114, + "step": 474 + }, + { + "epoch": 0.19, + "grad_norm": 4.077329695550013, + "learning_rate": 1.8687117365181514e-05, + "loss": 4.4996, + "step": 475 + }, + { + "epoch": 0.19, + "grad_norm": 4.131904872521195, + "learning_rate": 1.868069322188452e-05, + "loss": 4.6157, + "step": 476 + }, + { + "epoch": 0.19, + "grad_norm": 4.167070219544355, + "learning_rate": 1.8674254509558544e-05, + "loss": 4.635, + "step": 477 + }, + { + "epoch": 0.19, + "grad_norm": 5.080283219503045, + "learning_rate": 1.8667801239009845e-05, + "loss": 4.7456, + "step": 478 + }, + { + "epoch": 0.19, + "grad_norm": 4.617084882050906, + "learning_rate": 1.866133342106911e-05, + "loss": 4.6322, + "step": 479 + }, + { + "epoch": 0.19, + "grad_norm": 3.877045676966755, + "learning_rate": 1.865485106659145e-05, + "loss": 4.6529, + "step": 480 + }, + { + "epoch": 0.19, + "grad_norm": 4.8546137617071095, + "learning_rate": 1.864835418645635e-05, + "loss": 4.5812, + "step": 481 + }, + { + "epoch": 0.19, + "grad_norm": 5.024800269628755, + "learning_rate": 1.86418427915677e-05, + "loss": 4.7275, + "step": 482 + }, + { + "epoch": 0.19, + "grad_norm": 4.104346808360058, + "learning_rate": 1.863531689285374e-05, + "loss": 4.4275, + "step": 483 + }, + { + "epoch": 0.19, + "grad_norm": 5.443182810817108, + "learning_rate": 1.8628776501267052e-05, + "loss": 4.5006, + "step": 484 + }, + { + "epoch": 0.19, + "grad_norm": 3.7630454146072916, + "learning_rate": 1.862222162778454e-05, + "loss": 4.6852, + "step": 485 + }, + { + "epoch": 0.19, + "grad_norm": 5.377530807213343, + "learning_rate": 1.861565228340742e-05, + "loss": 4.4133, + "step": 486 + }, + { + "epoch": 0.19, + "grad_norm": 5.074254813435036, + "learning_rate": 1.8609068479161182e-05, + "loss": 4.7537, + "step": 487 + }, + { + "epoch": 0.2, + "grad_norm": 7.8222231116074195, + "learning_rate": 1.8602470226095602e-05, + "loss": 4.5551, + "step": 488 + }, + { + "epoch": 0.2, + "grad_norm": 5.37608449638734, + "learning_rate": 1.8595857535284692e-05, + "loss": 4.53, + "step": 489 + }, + { + "epoch": 0.2, + "grad_norm": 5.128267316594894, + "learning_rate": 1.85892304178267e-05, + "loss": 4.6295, + "step": 490 + }, + { + "epoch": 0.2, + "grad_norm": 4.860631729644693, + "learning_rate": 1.8582588884844086e-05, + "loss": 4.4502, + "step": 491 + }, + { + "epoch": 0.2, + "grad_norm": 4.9510406392102295, + "learning_rate": 1.8575932947483503e-05, + "loss": 4.7843, + "step": 492 + }, + { + "epoch": 0.2, + "grad_norm": 3.948182711081336, + "learning_rate": 1.8569262616915784e-05, + "loss": 4.7088, + "step": 493 + }, + { + "epoch": 0.2, + "grad_norm": 4.802694876664713, + "learning_rate": 1.8562577904335913e-05, + "loss": 4.4809, + "step": 494 + }, + { + "epoch": 0.2, + "grad_norm": 4.871236430723091, + "learning_rate": 1.8555878820963014e-05, + "loss": 4.5609, + "step": 495 + }, + { + "epoch": 0.2, + "grad_norm": 4.297328298890337, + "learning_rate": 1.8549165378040328e-05, + "loss": 4.5167, + "step": 496 + }, + { + "epoch": 0.2, + "grad_norm": 4.604119686459511, + "learning_rate": 1.8542437586835202e-05, + "loss": 4.7448, + "step": 497 + }, + { + "epoch": 0.2, + "grad_norm": 4.557903280049274, + "learning_rate": 1.8535695458639056e-05, + "loss": 4.6444, + "step": 498 + }, + { + "epoch": 0.2, + "grad_norm": 3.8352904144837736, + "learning_rate": 1.8528939004767377e-05, + "loss": 4.5181, + "step": 499 + }, + { + "epoch": 0.2, + "grad_norm": 4.4315903519140605, + 
"learning_rate": 1.8522168236559693e-05, + "loss": 4.5232, + "step": 500 + }, + { + "epoch": 0.2, + "grad_norm": 4.5003343526651225, + "learning_rate": 1.851538316537956e-05, + "loss": 4.5658, + "step": 501 + }, + { + "epoch": 0.2, + "grad_norm": 4.655561193479631, + "learning_rate": 1.8508583802614534e-05, + "loss": 4.6049, + "step": 502 + }, + { + "epoch": 0.2, + "grad_norm": 4.336819493633404, + "learning_rate": 1.8501770159676157e-05, + "loss": 4.385, + "step": 503 + }, + { + "epoch": 0.2, + "grad_norm": 4.213922156906962, + "learning_rate": 1.849494224799994e-05, + "loss": 4.5663, + "step": 504 + }, + { + "epoch": 0.2, + "grad_norm": 3.8801853326394733, + "learning_rate": 1.8488100079045345e-05, + "loss": 4.4578, + "step": 505 + }, + { + "epoch": 0.2, + "grad_norm": 5.638341688055788, + "learning_rate": 1.848124366429576e-05, + "loss": 4.4374, + "step": 506 + }, + { + "epoch": 0.2, + "grad_norm": 4.1259886569912405, + "learning_rate": 1.8474373015258472e-05, + "loss": 4.5498, + "step": 507 + }, + { + "epoch": 0.2, + "grad_norm": 5.335260758484502, + "learning_rate": 1.846748814346468e-05, + "loss": 4.4976, + "step": 508 + }, + { + "epoch": 0.2, + "grad_norm": 3.5227862557003005, + "learning_rate": 1.846058906046943e-05, + "loss": 4.4723, + "step": 509 + }, + { + "epoch": 0.2, + "grad_norm": 4.391719508660204, + "learning_rate": 1.8453675777851627e-05, + "loss": 4.6382, + "step": 510 + }, + { + "epoch": 0.2, + "grad_norm": 4.119916316504547, + "learning_rate": 1.844674830721402e-05, + "loss": 4.5931, + "step": 511 + }, + { + "epoch": 0.2, + "grad_norm": 3.5972094158028476, + "learning_rate": 1.843980666018315e-05, + "loss": 4.4852, + "step": 512 + }, + { + "epoch": 0.21, + "grad_norm": 5.437761861563065, + "learning_rate": 1.8432850848409367e-05, + "loss": 4.5205, + "step": 513 + }, + { + "epoch": 0.21, + "grad_norm": 3.4619487319523112, + "learning_rate": 1.8425880883566784e-05, + "loss": 4.5162, + "step": 514 + }, + { + "epoch": 0.21, + "grad_norm": 4.246560330909532, + "learning_rate": 1.8418896777353272e-05, + "loss": 4.6419, + "step": 515 + }, + { + "epoch": 0.21, + "grad_norm": 4.504108212476313, + "learning_rate": 1.8411898541490433e-05, + "loss": 4.5368, + "step": 516 + }, + { + "epoch": 0.21, + "grad_norm": 4.7748515692704, + "learning_rate": 1.840488618772359e-05, + "loss": 4.325, + "step": 517 + }, + { + "epoch": 0.21, + "grad_norm": 4.400533553399993, + "learning_rate": 1.8397859727821747e-05, + "loss": 4.7751, + "step": 518 + }, + { + "epoch": 0.21, + "grad_norm": 3.796804590072757, + "learning_rate": 1.83908191735776e-05, + "loss": 4.6476, + "step": 519 + }, + { + "epoch": 0.21, + "grad_norm": 5.318892580222769, + "learning_rate": 1.8383764536807486e-05, + "loss": 4.7816, + "step": 520 + }, + { + "epoch": 0.21, + "grad_norm": 3.755398814310214, + "learning_rate": 1.8376695829351378e-05, + "loss": 4.4902, + "step": 521 + }, + { + "epoch": 0.21, + "grad_norm": 4.66174991561927, + "learning_rate": 1.8369613063072875e-05, + "loss": 4.5982, + "step": 522 + }, + { + "epoch": 0.21, + "grad_norm": 5.16713458203689, + "learning_rate": 1.8362516249859164e-05, + "loss": 4.4873, + "step": 523 + }, + { + "epoch": 0.21, + "grad_norm": 4.714662973498544, + "learning_rate": 1.8355405401621e-05, + "loss": 4.5149, + "step": 524 + }, + { + "epoch": 0.21, + "grad_norm": 4.496778637749604, + "learning_rate": 1.8348280530292712e-05, + "loss": 4.5553, + "step": 525 + }, + { + "epoch": 0.21, + "grad_norm": 3.908462628514215, + "learning_rate": 1.834114164783215e-05, + "loss": 4.5851, + "step": 
526 + }, + { + "epoch": 0.21, + "grad_norm": 4.0522977320295945, + "learning_rate": 1.8333988766220676e-05, + "loss": 4.526, + "step": 527 + }, + { + "epoch": 0.21, + "grad_norm": 5.000357193034575, + "learning_rate": 1.832682189746316e-05, + "loss": 4.4757, + "step": 528 + }, + { + "epoch": 0.21, + "grad_norm": 4.2730152597229, + "learning_rate": 1.831964105358794e-05, + "loss": 4.5307, + "step": 529 + }, + { + "epoch": 0.21, + "grad_norm": 6.0852057209225, + "learning_rate": 1.831244624664681e-05, + "loss": 4.5722, + "step": 530 + }, + { + "epoch": 0.21, + "grad_norm": 5.9744696789409755, + "learning_rate": 1.8305237488714995e-05, + "loss": 4.4842, + "step": 531 + }, + { + "epoch": 0.21, + "grad_norm": 7.004849793326939, + "learning_rate": 1.8298014791891138e-05, + "loss": 4.3618, + "step": 532 + }, + { + "epoch": 0.21, + "grad_norm": 4.7948194156817605, + "learning_rate": 1.829077816829728e-05, + "loss": 4.7279, + "step": 533 + }, + { + "epoch": 0.21, + "grad_norm": 6.8258386178589605, + "learning_rate": 1.8283527630078827e-05, + "loss": 4.4468, + "step": 534 + }, + { + "epoch": 0.21, + "grad_norm": 5.047435784425765, + "learning_rate": 1.827626318940454e-05, + "loss": 4.242, + "step": 535 + }, + { + "epoch": 0.21, + "grad_norm": 4.245582218556195, + "learning_rate": 1.8268984858466524e-05, + "loss": 4.4191, + "step": 536 + }, + { + "epoch": 0.21, + "grad_norm": 5.212873965419334, + "learning_rate": 1.8261692649480174e-05, + "loss": 4.3846, + "step": 537 + }, + { + "epoch": 0.22, + "grad_norm": 3.973443022794257, + "learning_rate": 1.8254386574684205e-05, + "loss": 4.4775, + "step": 538 + }, + { + "epoch": 0.22, + "grad_norm": 4.655778663416384, + "learning_rate": 1.824706664634058e-05, + "loss": 4.6469, + "step": 539 + }, + { + "epoch": 0.22, + "grad_norm": 3.7745155639127477, + "learning_rate": 1.8239732876734525e-05, + "loss": 4.4211, + "step": 540 + }, + { + "epoch": 0.22, + "grad_norm": 3.8928105110404876, + "learning_rate": 1.823238527817449e-05, + "loss": 4.5191, + "step": 541 + }, + { + "epoch": 0.22, + "grad_norm": 4.012209741102411, + "learning_rate": 1.822502386299214e-05, + "loss": 4.7298, + "step": 542 + }, + { + "epoch": 0.22, + "grad_norm": 4.09491355504511, + "learning_rate": 1.8217648643542326e-05, + "loss": 4.4712, + "step": 543 + }, + { + "epoch": 0.22, + "grad_norm": 3.9623455755711756, + "learning_rate": 1.8210259632203063e-05, + "loss": 4.5201, + "step": 544 + }, + { + "epoch": 0.22, + "grad_norm": 3.278338142567551, + "learning_rate": 1.8202856841375517e-05, + "loss": 4.5629, + "step": 545 + }, + { + "epoch": 0.22, + "grad_norm": 3.470833412471676, + "learning_rate": 1.819544028348399e-05, + "loss": 4.5509, + "step": 546 + }, + { + "epoch": 0.22, + "grad_norm": 3.557667082733272, + "learning_rate": 1.818800997097587e-05, + "loss": 4.4987, + "step": 547 + }, + { + "epoch": 0.22, + "grad_norm": 3.307435410054455, + "learning_rate": 1.8180565916321646e-05, + "loss": 4.5655, + "step": 548 + }, + { + "epoch": 0.22, + "grad_norm": 3.8047001558686695, + "learning_rate": 1.817310813201486e-05, + "loss": 4.647, + "step": 549 + }, + { + "epoch": 0.22, + "grad_norm": 4.232758498965093, + "learning_rate": 1.816563663057211e-05, + "loss": 4.5642, + "step": 550 + }, + { + "epoch": 0.22, + "grad_norm": 3.9694403182709084, + "learning_rate": 1.8158151424533002e-05, + "loss": 4.3808, + "step": 551 + }, + { + "epoch": 0.22, + "grad_norm": 4.511990232061259, + "learning_rate": 1.8150652526460146e-05, + "loss": 4.6089, + "step": 552 + }, + { + "epoch": 0.22, + "grad_norm": 
5.589363784630558, + "learning_rate": 1.8143139948939138e-05, + "loss": 4.424, + "step": 553 + }, + { + "epoch": 0.22, + "grad_norm": 5.391501846213585, + "learning_rate": 1.8135613704578525e-05, + "loss": 4.6266, + "step": 554 + }, + { + "epoch": 0.22, + "grad_norm": 4.081782646799652, + "learning_rate": 1.81280738060098e-05, + "loss": 4.5131, + "step": 555 + }, + { + "epoch": 0.22, + "grad_norm": 4.701357773481821, + "learning_rate": 1.8120520265887364e-05, + "loss": 4.7093, + "step": 556 + }, + { + "epoch": 0.22, + "grad_norm": 4.560945380217301, + "learning_rate": 1.8112953096888517e-05, + "loss": 4.5045, + "step": 557 + }, + { + "epoch": 0.22, + "grad_norm": 4.7367764771999035, + "learning_rate": 1.810537231171343e-05, + "loss": 4.4186, + "step": 558 + }, + { + "epoch": 0.22, + "grad_norm": 4.383419123230545, + "learning_rate": 1.809777792308513e-05, + "loss": 4.4929, + "step": 559 + }, + { + "epoch": 0.22, + "grad_norm": 5.130267653307888, + "learning_rate": 1.8090169943749477e-05, + "loss": 4.4062, + "step": 560 + }, + { + "epoch": 0.22, + "grad_norm": 4.531928301060983, + "learning_rate": 1.808254838647513e-05, + "loss": 4.3487, + "step": 561 + }, + { + "epoch": 0.22, + "grad_norm": 5.1958588839876905, + "learning_rate": 1.8074913264053547e-05, + "loss": 4.5398, + "step": 562 + }, + { + "epoch": 0.23, + "grad_norm": 4.145503981737899, + "learning_rate": 1.8067264589298945e-05, + "loss": 4.6086, + "step": 563 + }, + { + "epoch": 0.23, + "grad_norm": 5.874804141420842, + "learning_rate": 1.8059602375048294e-05, + "loss": 4.3948, + "step": 564 + }, + { + "epoch": 0.23, + "grad_norm": 3.710773936481747, + "learning_rate": 1.8051926634161282e-05, + "loss": 4.4046, + "step": 565 + }, + { + "epoch": 0.23, + "grad_norm": 3.7682629786468107, + "learning_rate": 1.8044237379520305e-05, + "loss": 4.5396, + "step": 566 + }, + { + "epoch": 0.23, + "grad_norm": 3.9988341208652693, + "learning_rate": 1.8036534624030428e-05, + "loss": 4.5059, + "step": 567 + }, + { + "epoch": 0.23, + "grad_norm": 4.393107693200345, + "learning_rate": 1.802881838061939e-05, + "loss": 4.3796, + "step": 568 + }, + { + "epoch": 0.23, + "grad_norm": 5.574902532810956, + "learning_rate": 1.802108866223755e-05, + "loss": 4.3632, + "step": 569 + }, + { + "epoch": 0.23, + "grad_norm": 3.8128396404739546, + "learning_rate": 1.8013345481857903e-05, + "loss": 4.3907, + "step": 570 + }, + { + "epoch": 0.23, + "grad_norm": 4.33414503434105, + "learning_rate": 1.8005588852476018e-05, + "loss": 4.4354, + "step": 571 + }, + { + "epoch": 0.23, + "grad_norm": 5.301332379657507, + "learning_rate": 1.7997818787110043e-05, + "loss": 4.4156, + "step": 572 + }, + { + "epoch": 0.23, + "grad_norm": 3.718201275219629, + "learning_rate": 1.7990035298800682e-05, + "loss": 4.6335, + "step": 573 + }, + { + "epoch": 0.23, + "grad_norm": 4.82328878051179, + "learning_rate": 1.798223840061116e-05, + "loss": 4.3818, + "step": 574 + }, + { + "epoch": 0.23, + "grad_norm": 4.009302341530525, + "learning_rate": 1.797442810562721e-05, + "loss": 4.4935, + "step": 575 + }, + { + "epoch": 0.23, + "grad_norm": 4.470726269769983, + "learning_rate": 1.796660442695705e-05, + "loss": 4.4081, + "step": 576 + }, + { + "epoch": 0.23, + "grad_norm": 4.144773016409501, + "learning_rate": 1.795876737773136e-05, + "loss": 4.5555, + "step": 577 + }, + { + "epoch": 0.23, + "grad_norm": 4.540527520249997, + "learning_rate": 1.795091697110326e-05, + "loss": 4.4273, + "step": 578 + }, + { + "epoch": 0.23, + "grad_norm": 4.539566055605331, + "learning_rate": 
1.7943053220248284e-05, + "loss": 4.583, + "step": 579 + }, + { + "epoch": 0.23, + "grad_norm": 4.932252647457211, + "learning_rate": 1.793517613836437e-05, + "loss": 4.4988, + "step": 580 + }, + { + "epoch": 0.23, + "grad_norm": 4.217312041012894, + "learning_rate": 1.7927285738671825e-05, + "loss": 4.4821, + "step": 581 + }, + { + "epoch": 0.23, + "grad_norm": 4.185334769654126, + "learning_rate": 1.7919382034413306e-05, + "loss": 4.558, + "step": 582 + }, + { + "epoch": 0.23, + "grad_norm": 4.583654330854395, + "learning_rate": 1.7911465038853805e-05, + "loss": 4.4156, + "step": 583 + }, + { + "epoch": 0.23, + "grad_norm": 5.437316693787005, + "learning_rate": 1.7903534765280616e-05, + "loss": 4.3659, + "step": 584 + }, + { + "epoch": 0.23, + "grad_norm": 5.442733348394627, + "learning_rate": 1.7895591227003316e-05, + "loss": 4.2313, + "step": 585 + }, + { + "epoch": 0.23, + "grad_norm": 4.898831347856921, + "learning_rate": 1.7887634437353754e-05, + "loss": 4.3839, + "step": 586 + }, + { + "epoch": 0.23, + "grad_norm": 4.187477019121502, + "learning_rate": 1.7879664409686007e-05, + "loss": 4.5723, + "step": 587 + }, + { + "epoch": 0.24, + "grad_norm": 5.317936443725569, + "learning_rate": 1.7871681157376382e-05, + "loss": 4.2695, + "step": 588 + }, + { + "epoch": 0.24, + "grad_norm": 4.427780735865234, + "learning_rate": 1.7863684693823375e-05, + "loss": 4.5148, + "step": 589 + }, + { + "epoch": 0.24, + "grad_norm": 4.2148392567243835, + "learning_rate": 1.7855675032447648e-05, + "loss": 4.3736, + "step": 590 + }, + { + "epoch": 0.24, + "grad_norm": 4.085066691271566, + "learning_rate": 1.7847652186692025e-05, + "loss": 4.4033, + "step": 591 + }, + { + "epoch": 0.24, + "grad_norm": 4.106345509099876, + "learning_rate": 1.7839616170021452e-05, + "loss": 4.3073, + "step": 592 + }, + { + "epoch": 0.24, + "grad_norm": 3.795043396096179, + "learning_rate": 1.7831566995922983e-05, + "loss": 4.4571, + "step": 593 + }, + { + "epoch": 0.24, + "grad_norm": 5.040391570866654, + "learning_rate": 1.782350467790575e-05, + "loss": 4.5967, + "step": 594 + }, + { + "epoch": 0.24, + "grad_norm": 5.172887527977414, + "learning_rate": 1.7815429229500946e-05, + "loss": 4.4391, + "step": 595 + }, + { + "epoch": 0.24, + "grad_norm": 3.8558696182852272, + "learning_rate": 1.78073406642618e-05, + "loss": 4.5532, + "step": 596 + }, + { + "epoch": 0.24, + "grad_norm": 5.324716483505234, + "learning_rate": 1.779923899576357e-05, + "loss": 4.3905, + "step": 597 + }, + { + "epoch": 0.24, + "grad_norm": 3.876541883027683, + "learning_rate": 1.7791124237603477e-05, + "loss": 4.32, + "step": 598 + }, + { + "epoch": 0.24, + "grad_norm": 3.7679255531997913, + "learning_rate": 1.7782996403400737e-05, + "loss": 4.3041, + "step": 599 + }, + { + "epoch": 0.24, + "grad_norm": 5.222611374687213, + "learning_rate": 1.7774855506796497e-05, + "loss": 4.4244, + "step": 600 + }, + { + "epoch": 0.24, + "grad_norm": 3.304244924899713, + "learning_rate": 1.776670156145383e-05, + "loss": 4.5282, + "step": 601 + }, + { + "epoch": 0.24, + "grad_norm": 4.981595214348723, + "learning_rate": 1.775853458105772e-05, + "loss": 4.4572, + "step": 602 + }, + { + "epoch": 0.24, + "grad_norm": 4.183377523359722, + "learning_rate": 1.7750354579315004e-05, + "loss": 4.6287, + "step": 603 + }, + { + "epoch": 0.24, + "grad_norm": 3.5751970777478586, + "learning_rate": 1.77421615699544e-05, + "loss": 4.2279, + "step": 604 + }, + { + "epoch": 0.24, + "grad_norm": 3.8162581902884876, + "learning_rate": 1.7733955566726438e-05, + "loss": 4.2113, + "step": 
605 + }, + { + "epoch": 0.24, + "grad_norm": 4.545507736912509, + "learning_rate": 1.772573658340347e-05, + "loss": 4.3691, + "step": 606 + }, + { + "epoch": 0.24, + "grad_norm": 4.224629578143067, + "learning_rate": 1.7717504633779618e-05, + "loss": 4.3486, + "step": 607 + }, + { + "epoch": 0.24, + "grad_norm": 4.610588665876807, + "learning_rate": 1.7709259731670774e-05, + "loss": 4.38, + "step": 608 + }, + { + "epoch": 0.24, + "grad_norm": 5.70075196848019, + "learning_rate": 1.770100189091457e-05, + "loss": 4.1483, + "step": 609 + }, + { + "epoch": 0.24, + "grad_norm": 3.2360932038131613, + "learning_rate": 1.7692731125370355e-05, + "loss": 4.3603, + "step": 610 + }, + { + "epoch": 0.24, + "grad_norm": 3.8433889218472244, + "learning_rate": 1.7684447448919156e-05, + "loss": 4.4584, + "step": 611 + }, + { + "epoch": 0.24, + "grad_norm": 4.286041363522565, + "learning_rate": 1.7676150875463688e-05, + "loss": 4.3522, + "step": 612 + }, + { + "epoch": 0.25, + "grad_norm": 3.324312507834647, + "learning_rate": 1.7667841418928292e-05, + "loss": 4.34, + "step": 613 + }, + { + "epoch": 0.25, + "grad_norm": 3.826648128672422, + "learning_rate": 1.765951909325895e-05, + "loss": 4.3366, + "step": 614 + }, + { + "epoch": 0.25, + "grad_norm": 4.326699626627627, + "learning_rate": 1.7651183912423228e-05, + "loss": 4.22, + "step": 615 + }, + { + "epoch": 0.25, + "grad_norm": 3.502208687825821, + "learning_rate": 1.764283589041028e-05, + "loss": 4.434, + "step": 616 + }, + { + "epoch": 0.25, + "grad_norm": 3.9726600059699697, + "learning_rate": 1.7634475041230796e-05, + "loss": 4.4077, + "step": 617 + }, + { + "epoch": 0.25, + "grad_norm": 4.794767934344185, + "learning_rate": 1.7626101378917004e-05, + "loss": 4.3801, + "step": 618 + }, + { + "epoch": 0.25, + "grad_norm": 3.828415872023352, + "learning_rate": 1.761771491752264e-05, + "loss": 4.5012, + "step": 619 + }, + { + "epoch": 0.25, + "grad_norm": 3.864618059858188, + "learning_rate": 1.7609315671122912e-05, + "loss": 4.1654, + "step": 620 + }, + { + "epoch": 0.25, + "grad_norm": 3.5444444341424144, + "learning_rate": 1.760090365381449e-05, + "loss": 4.2813, + "step": 621 + }, + { + "epoch": 0.25, + "grad_norm": 3.0754856971618674, + "learning_rate": 1.759247887971548e-05, + "loss": 4.4173, + "step": 622 + }, + { + "epoch": 0.25, + "grad_norm": 4.051093782169358, + "learning_rate": 1.7584041362965397e-05, + "loss": 4.2801, + "step": 623 + }, + { + "epoch": 0.25, + "grad_norm": 3.703146776128272, + "learning_rate": 1.7575591117725132e-05, + "loss": 4.2089, + "step": 624 + }, + { + "epoch": 0.25, + "grad_norm": 3.194884286403555, + "learning_rate": 1.7567128158176955e-05, + "loss": 4.4067, + "step": 625 + }, + { + "epoch": 0.25, + "grad_norm": 3.7197087523579073, + "learning_rate": 1.7558652498524464e-05, + "loss": 4.5624, + "step": 626 + }, + { + "epoch": 0.25, + "grad_norm": 3.9997767290885187, + "learning_rate": 1.7550164152992573e-05, + "loss": 3.9206, + "step": 627 + }, + { + "epoch": 0.25, + "grad_norm": 3.723995563236504, + "learning_rate": 1.7541663135827493e-05, + "loss": 4.2779, + "step": 628 + }, + { + "epoch": 0.25, + "grad_norm": 3.8566179329855492, + "learning_rate": 1.75331494612967e-05, + "loss": 4.4607, + "step": 629 + }, + { + "epoch": 0.25, + "grad_norm": 3.6837802961843953, + "learning_rate": 1.7524623143688905e-05, + "loss": 4.3756, + "step": 630 + }, + { + "epoch": 0.25, + "grad_norm": 3.3643578361540993, + "learning_rate": 1.7516084197314044e-05, + "loss": 4.3226, + "step": 631 + }, + { + "epoch": 0.25, + "grad_norm": 
4.13921138996773, + "learning_rate": 1.7507532636503256e-05, + "loss": 3.9717, + "step": 632 + }, + { + "epoch": 0.25, + "grad_norm": 3.7857046012154534, + "learning_rate": 1.749896847560884e-05, + "loss": 4.2997, + "step": 633 + }, + { + "epoch": 0.25, + "grad_norm": 3.4312849218009567, + "learning_rate": 1.7490391729004242e-05, + "loss": 4.3715, + "step": 634 + }, + { + "epoch": 0.25, + "grad_norm": 5.031659106872237, + "learning_rate": 1.748180241108404e-05, + "loss": 4.4076, + "step": 635 + }, + { + "epoch": 0.25, + "grad_norm": 3.29646662187367, + "learning_rate": 1.7473200536263905e-05, + "loss": 4.3464, + "step": 636 + }, + { + "epoch": 0.25, + "grad_norm": 4.453734263665285, + "learning_rate": 1.746458611898058e-05, + "loss": 4.497, + "step": 637 + }, + { + "epoch": 0.26, + "grad_norm": 4.267571577595243, + "learning_rate": 1.7455959173691863e-05, + "loss": 4.326, + "step": 638 + }, + { + "epoch": 0.26, + "grad_norm": 4.037988035302162, + "learning_rate": 1.744731971487658e-05, + "loss": 4.2036, + "step": 639 + }, + { + "epoch": 0.26, + "grad_norm": 5.6499546934773, + "learning_rate": 1.7438667757034547e-05, + "loss": 4.1384, + "step": 640 + }, + { + "epoch": 0.26, + "grad_norm": 4.193992253945382, + "learning_rate": 1.743000331468657e-05, + "loss": 4.1671, + "step": 641 + }, + { + "epoch": 0.26, + "grad_norm": 3.9238693224223846, + "learning_rate": 1.7421326402374406e-05, + "loss": 4.2942, + "step": 642 + }, + { + "epoch": 0.26, + "grad_norm": 3.7221881431043866, + "learning_rate": 1.7412637034660735e-05, + "loss": 4.2449, + "step": 643 + }, + { + "epoch": 0.26, + "grad_norm": 3.33969588119697, + "learning_rate": 1.740393522612915e-05, + "loss": 4.397, + "step": 644 + }, + { + "epoch": 0.26, + "grad_norm": 3.0812195963488773, + "learning_rate": 1.739522099138411e-05, + "loss": 4.3136, + "step": 645 + }, + { + "epoch": 0.26, + "grad_norm": 4.248841778221737, + "learning_rate": 1.7386494345050944e-05, + "loss": 4.2977, + "step": 646 + }, + { + "epoch": 0.26, + "grad_norm": 3.4512523414792726, + "learning_rate": 1.73777553017758e-05, + "loss": 4.181, + "step": 647 + }, + { + "epoch": 0.26, + "grad_norm": 4.056102712659825, + "learning_rate": 1.7369003876225644e-05, + "loss": 4.262, + "step": 648 + }, + { + "epoch": 0.26, + "grad_norm": 3.7592475887210575, + "learning_rate": 1.7360240083088213e-05, + "loss": 4.2013, + "step": 649 + }, + { + "epoch": 0.26, + "grad_norm": 3.661785505723803, + "learning_rate": 1.7351463937072008e-05, + "loss": 4.4768, + "step": 650 + }, + { + "epoch": 0.26, + "grad_norm": 3.6526178793870967, + "learning_rate": 1.734267545290625e-05, + "loss": 4.1434, + "step": 651 + }, + { + "epoch": 0.26, + "grad_norm": 3.362778318040067, + "learning_rate": 1.7333874645340886e-05, + "loss": 4.5816, + "step": 652 + }, + { + "epoch": 0.26, + "grad_norm": 3.6785134226263883, + "learning_rate": 1.7325061529146528e-05, + "loss": 4.1977, + "step": 653 + }, + { + "epoch": 0.26, + "grad_norm": 3.6203914812532583, + "learning_rate": 1.7316236119114466e-05, + "loss": 4.1502, + "step": 654 + }, + { + "epoch": 0.26, + "grad_norm": 3.909042527107989, + "learning_rate": 1.7307398430056595e-05, + "loss": 4.3402, + "step": 655 + }, + { + "epoch": 0.26, + "grad_norm": 4.343420425268675, + "learning_rate": 1.7298548476805446e-05, + "loss": 4.1416, + "step": 656 + }, + { + "epoch": 0.26, + "grad_norm": 3.8231798578591825, + "learning_rate": 1.7289686274214116e-05, + "loss": 4.4234, + "step": 657 + }, + { + "epoch": 0.26, + "grad_norm": 4.104953811214553, + "learning_rate": 
1.7280811837156268e-05, + "loss": 4.3791, + "step": 658 + }, + { + "epoch": 0.26, + "grad_norm": 4.006233314995586, + "learning_rate": 1.7271925180526094e-05, + "loss": 4.1491, + "step": 659 + }, + { + "epoch": 0.26, + "grad_norm": 3.493796073974652, + "learning_rate": 1.72630263192383e-05, + "loss": 4.0866, + "step": 660 + }, + { + "epoch": 0.26, + "grad_norm": 5.5861790784869365, + "learning_rate": 1.7254115268228073e-05, + "loss": 4.1643, + "step": 661 + }, + { + "epoch": 0.26, + "grad_norm": 4.094790170581724, + "learning_rate": 1.724519204245105e-05, + "loss": 4.3313, + "step": 662 + }, + { + "epoch": 0.27, + "grad_norm": 4.8169362681688614, + "learning_rate": 1.723625665688331e-05, + "loss": 4.2777, + "step": 663 + }, + { + "epoch": 0.27, + "grad_norm": 3.5415226420284696, + "learning_rate": 1.7227309126521347e-05, + "loss": 4.2133, + "step": 664 + }, + { + "epoch": 0.27, + "grad_norm": 3.447252681075696, + "learning_rate": 1.7218349466382024e-05, + "loss": 4.4371, + "step": 665 + }, + { + "epoch": 0.27, + "grad_norm": 3.857931839153315, + "learning_rate": 1.7209377691502565e-05, + "loss": 4.3564, + "step": 666 + }, + { + "epoch": 0.27, + "grad_norm": 3.868644542943814, + "learning_rate": 1.720039381694053e-05, + "loss": 4.3347, + "step": 667 + }, + { + "epoch": 0.27, + "grad_norm": 3.9872569129304427, + "learning_rate": 1.7191397857773787e-05, + "loss": 4.1822, + "step": 668 + }, + { + "epoch": 0.27, + "grad_norm": 4.377738459178347, + "learning_rate": 1.7182389829100484e-05, + "loss": 4.2361, + "step": 669 + }, + { + "epoch": 0.27, + "grad_norm": 4.526630194902284, + "learning_rate": 1.7173369746039026e-05, + "loss": 4.2334, + "step": 670 + }, + { + "epoch": 0.27, + "grad_norm": 3.903151798762359, + "learning_rate": 1.7164337623728044e-05, + "loss": 4.2507, + "step": 671 + }, + { + "epoch": 0.27, + "grad_norm": 3.416704913811221, + "learning_rate": 1.7155293477326385e-05, + "loss": 4.3865, + "step": 672 + }, + { + "epoch": 0.27, + "grad_norm": 4.668815207748128, + "learning_rate": 1.714623732201307e-05, + "loss": 4.1203, + "step": 673 + }, + { + "epoch": 0.27, + "grad_norm": 4.835149742451225, + "learning_rate": 1.713716917298727e-05, + "loss": 4.1392, + "step": 674 + }, + { + "epoch": 0.27, + "grad_norm": 4.881418868309846, + "learning_rate": 1.7128089045468294e-05, + "loss": 4.2875, + "step": 675 + }, + { + "epoch": 0.27, + "grad_norm": 5.034412844093579, + "learning_rate": 1.7118996954695553e-05, + "loss": 4.0848, + "step": 676 + }, + { + "epoch": 0.27, + "grad_norm": 4.518013043332243, + "learning_rate": 1.7109892915928535e-05, + "loss": 4.3151, + "step": 677 + }, + { + "epoch": 0.27, + "grad_norm": 4.478304353081878, + "learning_rate": 1.7100776944446783e-05, + "loss": 4.3224, + "step": 678 + }, + { + "epoch": 0.27, + "grad_norm": 5.048591652502835, + "learning_rate": 1.709164905554986e-05, + "loss": 4.0666, + "step": 679 + }, + { + "epoch": 0.27, + "grad_norm": 5.0357109087365535, + "learning_rate": 1.7082509264557333e-05, + "loss": 4.0443, + "step": 680 + }, + { + "epoch": 0.27, + "grad_norm": 4.3409258985175585, + "learning_rate": 1.7073357586808753e-05, + "loss": 4.2371, + "step": 681 + }, + { + "epoch": 0.27, + "grad_norm": 4.582253486092752, + "learning_rate": 1.706419403766361e-05, + "loss": 4.077, + "step": 682 + }, + { + "epoch": 0.27, + "grad_norm": 4.801114373409387, + "learning_rate": 1.7055018632501326e-05, + "loss": 4.2553, + "step": 683 + }, + { + "epoch": 0.27, + "grad_norm": 3.8747080745349978, + "learning_rate": 1.7045831386721213e-05, + "loss": 4.1561, + 
"step": 684 + }, + { + "epoch": 0.27, + "grad_norm": 4.696948918314867, + "learning_rate": 1.7036632315742464e-05, + "loss": 4.2312, + "step": 685 + }, + { + "epoch": 0.27, + "grad_norm": 5.258212104642618, + "learning_rate": 1.7027421435004114e-05, + "loss": 4.1866, + "step": 686 + }, + { + "epoch": 0.27, + "grad_norm": 5.388331452291105, + "learning_rate": 1.7018198759965018e-05, + "loss": 4.1992, + "step": 687 + }, + { + "epoch": 0.28, + "grad_norm": 5.647950158243207, + "learning_rate": 1.7008964306103823e-05, + "loss": 4.3468, + "step": 688 + }, + { + "epoch": 0.28, + "grad_norm": 5.168886137147408, + "learning_rate": 1.6999718088918956e-05, + "loss": 4.0308, + "step": 689 + }, + { + "epoch": 0.28, + "grad_norm": 4.357151028206129, + "learning_rate": 1.6990460123928577e-05, + "loss": 4.3588, + "step": 690 + }, + { + "epoch": 0.28, + "grad_norm": 4.530147627269544, + "learning_rate": 1.698119042667056e-05, + "loss": 4.3144, + "step": 691 + }, + { + "epoch": 0.28, + "grad_norm": 4.817151116073827, + "learning_rate": 1.6971909012702483e-05, + "loss": 4.2138, + "step": 692 + }, + { + "epoch": 0.28, + "grad_norm": 5.092987342888455, + "learning_rate": 1.6962615897601573e-05, + "loss": 4.2911, + "step": 693 + }, + { + "epoch": 0.28, + "grad_norm": 5.613134484273773, + "learning_rate": 1.6953311096964706e-05, + "loss": 4.3282, + "step": 694 + }, + { + "epoch": 0.28, + "grad_norm": 5.173274414167209, + "learning_rate": 1.6943994626408365e-05, + "loss": 4.0758, + "step": 695 + }, + { + "epoch": 0.28, + "grad_norm": 4.190457374578632, + "learning_rate": 1.6934666501568618e-05, + "loss": 4.3804, + "step": 696 + }, + { + "epoch": 0.28, + "grad_norm": 5.601597668292631, + "learning_rate": 1.69253267381011e-05, + "loss": 4.3887, + "step": 697 + }, + { + "epoch": 0.28, + "grad_norm": 4.210265170570191, + "learning_rate": 1.6915975351680968e-05, + "loss": 4.1695, + "step": 698 + }, + { + "epoch": 0.28, + "grad_norm": 4.137670168126125, + "learning_rate": 1.69066123580029e-05, + "loss": 4.3003, + "step": 699 + }, + { + "epoch": 0.28, + "grad_norm": 3.747988286996686, + "learning_rate": 1.6897237772781046e-05, + "loss": 4.4487, + "step": 700 + }, + { + "epoch": 0.28, + "grad_norm": 4.594341934110022, + "learning_rate": 1.6887851611749005e-05, + "loss": 4.1977, + "step": 701 + }, + { + "epoch": 0.28, + "grad_norm": 4.693710767713602, + "learning_rate": 1.6878453890659815e-05, + "loss": 4.4077, + "step": 702 + }, + { + "epoch": 0.28, + "grad_norm": 3.9460120139260026, + "learning_rate": 1.686904462528591e-05, + "loss": 4.3575, + "step": 703 + }, + { + "epoch": 0.28, + "grad_norm": 3.903534992356641, + "learning_rate": 1.68596238314191e-05, + "loss": 4.2751, + "step": 704 + }, + { + "epoch": 0.28, + "grad_norm": 3.6197148558188004, + "learning_rate": 1.6850191524870548e-05, + "loss": 4.3067, + "step": 705 + }, + { + "epoch": 0.28, + "grad_norm": 6.013695095794239, + "learning_rate": 1.6840747721470733e-05, + "loss": 4.3629, + "step": 706 + }, + { + "epoch": 0.28, + "grad_norm": 5.793940986645831, + "learning_rate": 1.6831292437069425e-05, + "loss": 4.1589, + "step": 707 + }, + { + "epoch": 0.28, + "grad_norm": 5.068061320502639, + "learning_rate": 1.6821825687535675e-05, + "loss": 4.3191, + "step": 708 + }, + { + "epoch": 0.28, + "grad_norm": 4.835071559565869, + "learning_rate": 1.6812347488757774e-05, + "loss": 4.3855, + "step": 709 + }, + { + "epoch": 0.28, + "grad_norm": 5.352222600656303, + "learning_rate": 1.6802857856643214e-05, + "loss": 4.1151, + "step": 710 + }, + { + "epoch": 0.28, + 
"grad_norm": 5.223271549995184, + "learning_rate": 1.6793356807118695e-05, + "loss": 4.2358, + "step": 711 + }, + { + "epoch": 0.28, + "grad_norm": 5.697294907663005, + "learning_rate": 1.6783844356130073e-05, + "loss": 4.0528, + "step": 712 + }, + { + "epoch": 0.29, + "grad_norm": 5.019657412383279, + "learning_rate": 1.677432051964233e-05, + "loss": 4.3906, + "step": 713 + }, + { + "epoch": 0.29, + "grad_norm": 4.330982781472389, + "learning_rate": 1.6764785313639568e-05, + "loss": 4.2145, + "step": 714 + }, + { + "epoch": 0.29, + "grad_norm": 5.159400687537892, + "learning_rate": 1.6755238754124965e-05, + "loss": 4.15, + "step": 715 + }, + { + "epoch": 0.29, + "grad_norm": 4.743019702246863, + "learning_rate": 1.6745680857120757e-05, + "loss": 4.3182, + "step": 716 + }, + { + "epoch": 0.29, + "grad_norm": 5.376804285660271, + "learning_rate": 1.6736111638668203e-05, + "loss": 4.1689, + "step": 717 + }, + { + "epoch": 0.29, + "grad_norm": 4.358221275967151, + "learning_rate": 1.6726531114827572e-05, + "loss": 4.1588, + "step": 718 + }, + { + "epoch": 0.29, + "grad_norm": 4.0442497727339966, + "learning_rate": 1.6716939301678098e-05, + "loss": 4.4582, + "step": 719 + }, + { + "epoch": 0.29, + "grad_norm": 6.046366033000383, + "learning_rate": 1.6707336215317968e-05, + "loss": 4.2036, + "step": 720 + }, + { + "epoch": 0.29, + "grad_norm": 4.445621373056556, + "learning_rate": 1.6697721871864286e-05, + "loss": 4.348, + "step": 721 + }, + { + "epoch": 0.29, + "grad_norm": 3.8971967069923776, + "learning_rate": 1.6688096287453048e-05, + "loss": 4.1313, + "step": 722 + }, + { + "epoch": 0.29, + "grad_norm": 3.9511536886764964, + "learning_rate": 1.6678459478239116e-05, + "loss": 4.1277, + "step": 723 + }, + { + "epoch": 0.29, + "grad_norm": 3.7444284748312713, + "learning_rate": 1.6668811460396202e-05, + "loss": 4.3639, + "step": 724 + }, + { + "epoch": 0.29, + "grad_norm": 4.535042203472182, + "learning_rate": 1.665915225011681e-05, + "loss": 4.2301, + "step": 725 + }, + { + "epoch": 0.29, + "grad_norm": 4.344050025912771, + "learning_rate": 1.664948186361225e-05, + "loss": 4.084, + "step": 726 + }, + { + "epoch": 0.29, + "grad_norm": 4.583266578664799, + "learning_rate": 1.663980031711257e-05, + "loss": 4.1723, + "step": 727 + }, + { + "epoch": 0.29, + "grad_norm": 4.094665681041583, + "learning_rate": 1.6630107626866558e-05, + "loss": 4.3757, + "step": 728 + }, + { + "epoch": 0.29, + "grad_norm": 4.904919081388526, + "learning_rate": 1.6620403809141707e-05, + "loss": 4.1186, + "step": 729 + }, + { + "epoch": 0.29, + "grad_norm": 4.399804648357315, + "learning_rate": 1.6610688880224178e-05, + "loss": 4.0947, + "step": 730 + }, + { + "epoch": 0.29, + "grad_norm": 3.923219829805552, + "learning_rate": 1.6600962856418782e-05, + "loss": 4.335, + "step": 731 + }, + { + "epoch": 0.29, + "grad_norm": 4.427663133020312, + "learning_rate": 1.6591225754048963e-05, + "loss": 4.1266, + "step": 732 + }, + { + "epoch": 0.29, + "grad_norm": 5.097746227281494, + "learning_rate": 1.6581477589456737e-05, + "loss": 4.1285, + "step": 733 + }, + { + "epoch": 0.29, + "grad_norm": 4.745457635840251, + "learning_rate": 1.6571718379002705e-05, + "loss": 4.3592, + "step": 734 + }, + { + "epoch": 0.29, + "grad_norm": 3.758424791186076, + "learning_rate": 1.6561948139065997e-05, + "loss": 3.9558, + "step": 735 + }, + { + "epoch": 0.29, + "grad_norm": 4.6602131909735185, + "learning_rate": 1.6552166886044253e-05, + "loss": 4.2786, + "step": 736 + }, + { + "epoch": 0.29, + "grad_norm": 4.331991964126386, + 
"learning_rate": 1.6542374636353605e-05, + "loss": 4.205, + "step": 737 + }, + { + "epoch": 0.3, + "grad_norm": 4.742271026876533, + "learning_rate": 1.653257140642863e-05, + "loss": 4.1134, + "step": 738 + }, + { + "epoch": 0.3, + "grad_norm": 4.128592064049964, + "learning_rate": 1.6522757212722346e-05, + "loss": 4.2645, + "step": 739 + }, + { + "epoch": 0.3, + "grad_norm": 4.128026840094599, + "learning_rate": 1.6512932071706153e-05, + "loss": 4.1295, + "step": 740 + }, + { + "epoch": 0.3, + "grad_norm": 3.73216243413116, + "learning_rate": 1.650309599986985e-05, + "loss": 4.3763, + "step": 741 + }, + { + "epoch": 0.3, + "grad_norm": 4.406532996935937, + "learning_rate": 1.6493249013721558e-05, + "loss": 4.2893, + "step": 742 + }, + { + "epoch": 0.3, + "grad_norm": 4.892317898810144, + "learning_rate": 1.6483391129787725e-05, + "loss": 4.0252, + "step": 743 + }, + { + "epoch": 0.3, + "grad_norm": 3.655969865077262, + "learning_rate": 1.64735223646131e-05, + "loss": 4.0342, + "step": 744 + }, + { + "epoch": 0.3, + "grad_norm": 4.237746678866634, + "learning_rate": 1.646364273476067e-05, + "loss": 4.2777, + "step": 745 + }, + { + "epoch": 0.3, + "grad_norm": 4.7276325571184055, + "learning_rate": 1.6453752256811676e-05, + "loss": 4.1562, + "step": 746 + }, + { + "epoch": 0.3, + "grad_norm": 5.040221659089892, + "learning_rate": 1.644385094736556e-05, + "loss": 4.0892, + "step": 747 + }, + { + "epoch": 0.3, + "grad_norm": 4.219234733858174, + "learning_rate": 1.6433938823039942e-05, + "loss": 4.2405, + "step": 748 + }, + { + "epoch": 0.3, + "grad_norm": 4.014880781194069, + "learning_rate": 1.642401590047059e-05, + "loss": 4.1244, + "step": 749 + }, + { + "epoch": 0.3, + "grad_norm": 5.603922662946277, + "learning_rate": 1.6414082196311402e-05, + "loss": 4.1623, + "step": 750 + }, + { + "epoch": 0.3, + "grad_norm": 4.169752318365569, + "learning_rate": 1.6404137727234366e-05, + "loss": 4.1791, + "step": 751 + }, + { + "epoch": 0.3, + "grad_norm": 4.800322679915261, + "learning_rate": 1.639418250992954e-05, + "loss": 4.0497, + "step": 752 + }, + { + "epoch": 0.3, + "grad_norm": 4.819362617310608, + "learning_rate": 1.6384216561105014e-05, + "loss": 4.2569, + "step": 753 + }, + { + "epoch": 0.3, + "grad_norm": 3.8943247801354315, + "learning_rate": 1.63742398974869e-05, + "loss": 4.2013, + "step": 754 + }, + { + "epoch": 0.3, + "grad_norm": 4.237703478343301, + "learning_rate": 1.6364252535819284e-05, + "loss": 4.2323, + "step": 755 + }, + { + "epoch": 0.3, + "grad_norm": 4.289135372736193, + "learning_rate": 1.635425449286421e-05, + "loss": 4.2226, + "step": 756 + }, + { + "epoch": 0.3, + "grad_norm": 4.03279421005735, + "learning_rate": 1.6344245785401653e-05, + "loss": 4.0047, + "step": 757 + }, + { + "epoch": 0.3, + "grad_norm": 4.12143068811916, + "learning_rate": 1.6334226430229475e-05, + "loss": 4.1518, + "step": 758 + }, + { + "epoch": 0.3, + "grad_norm": 4.045018027459975, + "learning_rate": 1.632419644416342e-05, + "loss": 3.9894, + "step": 759 + }, + { + "epoch": 0.3, + "grad_norm": 3.3634370875446935, + "learning_rate": 1.6314155844037074e-05, + "loss": 4.1409, + "step": 760 + }, + { + "epoch": 0.3, + "grad_norm": 4.424185767990525, + "learning_rate": 1.6304104646701818e-05, + "loss": 4.0657, + "step": 761 + }, + { + "epoch": 0.3, + "grad_norm": 3.8362370349072434, + "learning_rate": 1.629404286902685e-05, + "loss": 4.2315, + "step": 762 + }, + { + "epoch": 0.31, + "grad_norm": 4.054880336997564, + "learning_rate": 1.62839705278991e-05, + "loss": 4.1762, + "step": 763 + }, + { + 
"epoch": 0.31, + "grad_norm": 3.455945336863922, + "learning_rate": 1.627388764022323e-05, + "loss": 3.9945, + "step": 764 + }, + { + "epoch": 0.31, + "grad_norm": 4.089347168700492, + "learning_rate": 1.626379422292162e-05, + "loss": 4.1632, + "step": 765 + }, + { + "epoch": 0.31, + "grad_norm": 3.9432572091307634, + "learning_rate": 1.6253690292934303e-05, + "loss": 4.1629, + "step": 766 + }, + { + "epoch": 0.31, + "grad_norm": 3.0237380868373296, + "learning_rate": 1.624357586721896e-05, + "loss": 4.3338, + "step": 767 + }, + { + "epoch": 0.31, + "grad_norm": 3.6907839068821535, + "learning_rate": 1.6233450962750895e-05, + "loss": 4.0344, + "step": 768 + }, + { + "epoch": 0.31, + "grad_norm": 4.241780550240553, + "learning_rate": 1.622331559652299e-05, + "loss": 4.1167, + "step": 769 + }, + { + "epoch": 0.31, + "grad_norm": 3.354524584576279, + "learning_rate": 1.6213169785545688e-05, + "loss": 4.3221, + "step": 770 + }, + { + "epoch": 0.31, + "grad_norm": 3.787194573086892, + "learning_rate": 1.6203013546846967e-05, + "loss": 4.0784, + "step": 771 + }, + { + "epoch": 0.31, + "grad_norm": 4.173642942286069, + "learning_rate": 1.61928468974723e-05, + "loss": 4.1752, + "step": 772 + }, + { + "epoch": 0.31, + "grad_norm": 3.9260357041238607, + "learning_rate": 1.618266985448463e-05, + "loss": 4.2805, + "step": 773 + }, + { + "epoch": 0.31, + "grad_norm": 5.923440227675206, + "learning_rate": 1.6172482434964353e-05, + "loss": 4.0797, + "step": 774 + }, + { + "epoch": 0.31, + "grad_norm": 5.2466075816715, + "learning_rate": 1.6162284656009276e-05, + "loss": 4.2478, + "step": 775 + }, + { + "epoch": 0.31, + "grad_norm": 4.969458945548505, + "learning_rate": 1.6152076534734585e-05, + "loss": 4.2096, + "step": 776 + }, + { + "epoch": 0.31, + "grad_norm": 6.042108075929687, + "learning_rate": 1.6141858088272838e-05, + "loss": 3.8777, + "step": 777 + }, + { + "epoch": 0.31, + "grad_norm": 3.9542589642669697, + "learning_rate": 1.6131629333773908e-05, + "loss": 4.1506, + "step": 778 + }, + { + "epoch": 0.31, + "grad_norm": 4.219621112468623, + "learning_rate": 1.612139028840498e-05, + "loss": 4.0092, + "step": 779 + }, + { + "epoch": 0.31, + "grad_norm": 3.6474863522574874, + "learning_rate": 1.6111140969350504e-05, + "loss": 4.1307, + "step": 780 + }, + { + "epoch": 0.31, + "grad_norm": 3.5346835812296193, + "learning_rate": 1.610088139381217e-05, + "loss": 4.1175, + "step": 781 + }, + { + "epoch": 0.31, + "grad_norm": 3.725366186832991, + "learning_rate": 1.609061157900889e-05, + "loss": 4.0711, + "step": 782 + }, + { + "epoch": 0.31, + "grad_norm": 4.667673965148235, + "learning_rate": 1.6080331542176754e-05, + "loss": 3.918, + "step": 783 + }, + { + "epoch": 0.31, + "grad_norm": 4.085941511402021, + "learning_rate": 1.6070041300569014e-05, + "loss": 3.9859, + "step": 784 + }, + { + "epoch": 0.31, + "grad_norm": 4.663966021746003, + "learning_rate": 1.6059740871456035e-05, + "loss": 3.9296, + "step": 785 + }, + { + "epoch": 0.31, + "grad_norm": 4.171734077729142, + "learning_rate": 1.60494302721253e-05, + "loss": 4.25, + "step": 786 + }, + { + "epoch": 0.31, + "grad_norm": 3.470608583836035, + "learning_rate": 1.603910951988135e-05, + "loss": 4.1841, + "step": 787 + }, + { + "epoch": 0.32, + "grad_norm": 4.944785987346097, + "learning_rate": 1.602877863204576e-05, + "loss": 3.9134, + "step": 788 + }, + { + "epoch": 0.32, + "grad_norm": 3.679669019856182, + "learning_rate": 1.6018437625957135e-05, + "loss": 4.1185, + "step": 789 + }, + { + "epoch": 0.32, + "grad_norm": 4.8456129680158755, + 
"learning_rate": 1.6008086518971037e-05, + "loss": 3.9203, + "step": 790 + }, + { + "epoch": 0.32, + "grad_norm": 4.038185112299229, + "learning_rate": 1.599772532846e-05, + "loss": 4.1326, + "step": 791 + }, + { + "epoch": 0.32, + "grad_norm": 4.110740634200455, + "learning_rate": 1.598735407181347e-05, + "loss": 4.188, + "step": 792 + }, + { + "epoch": 0.32, + "grad_norm": 4.088496773536847, + "learning_rate": 1.5976972766437796e-05, + "loss": 3.9539, + "step": 793 + }, + { + "epoch": 0.32, + "grad_norm": 3.548087741743393, + "learning_rate": 1.596658142975618e-05, + "loss": 4.0481, + "step": 794 + }, + { + "epoch": 0.32, + "grad_norm": 3.368884980792822, + "learning_rate": 1.5956180079208684e-05, + "loss": 4.162, + "step": 795 + }, + { + "epoch": 0.32, + "grad_norm": 3.320305188255113, + "learning_rate": 1.5945768732252144e-05, + "loss": 4.143, + "step": 796 + }, + { + "epoch": 0.32, + "grad_norm": 3.9640549005931587, + "learning_rate": 1.5935347406360192e-05, + "loss": 3.9122, + "step": 797 + }, + { + "epoch": 0.32, + "grad_norm": 4.222874057950338, + "learning_rate": 1.5924916119023214e-05, + "loss": 4.2344, + "step": 798 + }, + { + "epoch": 0.32, + "grad_norm": 3.5720174004747354, + "learning_rate": 1.5914474887748297e-05, + "loss": 4.1743, + "step": 799 + }, + { + "epoch": 0.32, + "grad_norm": 4.231737588287248, + "learning_rate": 1.5904023730059227e-05, + "loss": 4.0582, + "step": 800 + }, + { + "epoch": 0.32, + "grad_norm": 3.5855444742625484, + "learning_rate": 1.589356266349645e-05, + "loss": 4.1918, + "step": 801 + }, + { + "epoch": 0.32, + "grad_norm": 3.6077052786289086, + "learning_rate": 1.5883091705617045e-05, + "loss": 3.9826, + "step": 802 + }, + { + "epoch": 0.32, + "grad_norm": 3.462258780982009, + "learning_rate": 1.5872610873994685e-05, + "loss": 4.2167, + "step": 803 + }, + { + "epoch": 0.32, + "grad_norm": 3.915782876006761, + "learning_rate": 1.5862120186219614e-05, + "loss": 4.0085, + "step": 804 + }, + { + "epoch": 0.32, + "grad_norm": 4.072359468444531, + "learning_rate": 1.5851619659898623e-05, + "loss": 3.8627, + "step": 805 + }, + { + "epoch": 0.32, + "grad_norm": 3.9217692589848068, + "learning_rate": 1.5841109312655017e-05, + "loss": 3.9516, + "step": 806 + }, + { + "epoch": 0.32, + "grad_norm": 4.086637237247661, + "learning_rate": 1.5830589162128574e-05, + "loss": 4.0158, + "step": 807 + }, + { + "epoch": 0.32, + "grad_norm": 4.674894390048435, + "learning_rate": 1.582005922597553e-05, + "loss": 3.9575, + "step": 808 + }, + { + "epoch": 0.32, + "grad_norm": 4.18175058284887, + "learning_rate": 1.580951952186856e-05, + "loss": 4.0599, + "step": 809 + }, + { + "epoch": 0.32, + "grad_norm": 3.8499748638554805, + "learning_rate": 1.57989700674967e-05, + "loss": 4.0656, + "step": 810 + }, + { + "epoch": 0.32, + "grad_norm": 3.716337060023653, + "learning_rate": 1.578841088056538e-05, + "loss": 4.157, + "step": 811 + }, + { + "epoch": 0.32, + "grad_norm": 4.305254563957067, + "learning_rate": 1.5777841978796348e-05, + "loss": 4.1538, + "step": 812 + }, + { + "epoch": 0.33, + "grad_norm": 4.559347319241137, + "learning_rate": 1.5767263379927663e-05, + "loss": 3.9864, + "step": 813 + }, + { + "epoch": 0.33, + "grad_norm": 3.9159846433046575, + "learning_rate": 1.5756675101713657e-05, + "loss": 4.1094, + "step": 814 + }, + { + "epoch": 0.33, + "grad_norm": 4.299425354443105, + "learning_rate": 1.5746077161924905e-05, + "loss": 4.1367, + "step": 815 + }, + { + "epoch": 0.33, + "grad_norm": 4.316883826806145, + "learning_rate": 1.573546957834821e-05, + "loss": 
4.0427, + "step": 816 + }, + { + "epoch": 0.33, + "grad_norm": 3.878824079534932, + "learning_rate": 1.572485236878654e-05, + "loss": 4.1057, + "step": 817 + }, + { + "epoch": 0.33, + "grad_norm": 5.495346619133286, + "learning_rate": 1.5714225551059027e-05, + "loss": 4.0545, + "step": 818 + }, + { + "epoch": 0.33, + "grad_norm": 4.9758999194804545, + "learning_rate": 1.570358914300094e-05, + "loss": 4.0423, + "step": 819 + }, + { + "epoch": 0.33, + "grad_norm": 3.604552161772746, + "learning_rate": 1.5692943162463628e-05, + "loss": 3.9757, + "step": 820 + }, + { + "epoch": 0.33, + "grad_norm": 4.851176884663419, + "learning_rate": 1.5682287627314513e-05, + "loss": 3.9113, + "step": 821 + }, + { + "epoch": 0.33, + "grad_norm": 4.289288179124551, + "learning_rate": 1.5671622555437055e-05, + "loss": 4.2302, + "step": 822 + }, + { + "epoch": 0.33, + "grad_norm": 3.7811955845085796, + "learning_rate": 1.566094796473071e-05, + "loss": 4.0358, + "step": 823 + }, + { + "epoch": 0.33, + "grad_norm": 4.674141666896367, + "learning_rate": 1.565026387311092e-05, + "loss": 4.1475, + "step": 824 + }, + { + "epoch": 0.33, + "grad_norm": 3.5027858102986764, + "learning_rate": 1.5639570298509067e-05, + "loss": 4.053, + "step": 825 + }, + { + "epoch": 0.33, + "grad_norm": 3.6916507627982593, + "learning_rate": 1.562886725887245e-05, + "loss": 4.2153, + "step": 826 + }, + { + "epoch": 0.33, + "grad_norm": 3.561565009887269, + "learning_rate": 1.5618154772164257e-05, + "loss": 4.1343, + "step": 827 + }, + { + "epoch": 0.33, + "grad_norm": 3.8244028110454558, + "learning_rate": 1.5607432856363523e-05, + "loss": 3.8806, + "step": 828 + }, + { + "epoch": 0.33, + "grad_norm": 4.460749430124214, + "learning_rate": 1.559670152946512e-05, + "loss": 3.9809, + "step": 829 + }, + { + "epoch": 0.33, + "grad_norm": 3.755014824313866, + "learning_rate": 1.5585960809479698e-05, + "loss": 4.0223, + "step": 830 + }, + { + "epoch": 0.33, + "grad_norm": 4.104852010995523, + "learning_rate": 1.5575210714433687e-05, + "loss": 4.0399, + "step": 831 + }, + { + "epoch": 0.33, + "grad_norm": 3.7760475308616637, + "learning_rate": 1.5564451262369247e-05, + "loss": 4.0801, + "step": 832 + }, + { + "epoch": 0.33, + "grad_norm": 4.2793544157021355, + "learning_rate": 1.5553682471344237e-05, + "loss": 4.0501, + "step": 833 + }, + { + "epoch": 0.33, + "grad_norm": 4.629557175983418, + "learning_rate": 1.5542904359432198e-05, + "loss": 4.0355, + "step": 834 + }, + { + "epoch": 0.33, + "grad_norm": 3.7883624814021895, + "learning_rate": 1.5532116944722308e-05, + "loss": 4.1627, + "step": 835 + }, + { + "epoch": 0.33, + "grad_norm": 3.7953856863825313, + "learning_rate": 1.5521320245319364e-05, + "loss": 3.9935, + "step": 836 + }, + { + "epoch": 0.33, + "grad_norm": 4.788743717332434, + "learning_rate": 1.5510514279343736e-05, + "loss": 4.0584, + "step": 837 + }, + { + "epoch": 0.34, + "grad_norm": 4.138431050489037, + "learning_rate": 1.5499699064931354e-05, + "loss": 3.9832, + "step": 838 + }, + { + "epoch": 0.34, + "grad_norm": 3.2686778021991447, + "learning_rate": 1.5488874620233674e-05, + "loss": 4.0388, + "step": 839 + }, + { + "epoch": 0.34, + "grad_norm": 4.340524461273947, + "learning_rate": 1.547804096341763e-05, + "loss": 4.1873, + "step": 840 + }, + { + "epoch": 0.34, + "grad_norm": 3.9077568345088065, + "learning_rate": 1.5467198112665632e-05, + "loss": 4.0613, + "step": 841 + }, + { + "epoch": 0.34, + "grad_norm": 4.12493499904231, + "learning_rate": 1.5456346086175508e-05, + "loss": 4.2309, + "step": 842 + }, + { + "epoch": 
0.34, + "grad_norm": 4.341152194190719, + "learning_rate": 1.5445484902160494e-05, + "loss": 3.8967, + "step": 843 + }, + { + "epoch": 0.34, + "grad_norm": 3.8927081089261724, + "learning_rate": 1.543461457884919e-05, + "loss": 3.9914, + "step": 844 + }, + { + "epoch": 0.34, + "grad_norm": 4.07972545980503, + "learning_rate": 1.5423735134485537e-05, + "loss": 3.8737, + "step": 845 + }, + { + "epoch": 0.34, + "grad_norm": 4.153125798186397, + "learning_rate": 1.541284658732878e-05, + "loss": 4.1366, + "step": 846 + }, + { + "epoch": 0.34, + "grad_norm": 3.9414313278213275, + "learning_rate": 1.540194895565346e-05, + "loss": 4.0546, + "step": 847 + }, + { + "epoch": 0.34, + "grad_norm": 5.007997536551065, + "learning_rate": 1.5391042257749338e-05, + "loss": 4.0203, + "step": 848 + }, + { + "epoch": 0.34, + "grad_norm": 3.7302176266174256, + "learning_rate": 1.5380126511921404e-05, + "loss": 4.0391, + "step": 849 + }, + { + "epoch": 0.34, + "grad_norm": 3.785687015338851, + "learning_rate": 1.536920173648984e-05, + "loss": 4.0755, + "step": 850 + }, + { + "epoch": 0.34, + "grad_norm": 4.949326413164564, + "learning_rate": 1.5358267949789968e-05, + "loss": 3.9168, + "step": 851 + }, + { + "epoch": 0.34, + "grad_norm": 4.339362469992984, + "learning_rate": 1.5347325170172246e-05, + "loss": 4.044, + "step": 852 + }, + { + "epoch": 0.34, + "grad_norm": 3.894035836096662, + "learning_rate": 1.533637341600221e-05, + "loss": 3.9281, + "step": 853 + }, + { + "epoch": 0.34, + "grad_norm": 4.585643645017433, + "learning_rate": 1.532541270566049e-05, + "loss": 3.958, + "step": 854 + }, + { + "epoch": 0.34, + "grad_norm": 4.369189203074978, + "learning_rate": 1.5314443057542703e-05, + "loss": 4.1421, + "step": 855 + }, + { + "epoch": 0.34, + "grad_norm": 3.6856868676372194, + "learning_rate": 1.5303464490059506e-05, + "loss": 4.0606, + "step": 856 + }, + { + "epoch": 0.34, + "grad_norm": 3.7601211848512306, + "learning_rate": 1.5292477021636498e-05, + "loss": 3.9721, + "step": 857 + }, + { + "epoch": 0.34, + "grad_norm": 3.8571531489482274, + "learning_rate": 1.528148067071423e-05, + "loss": 3.8854, + "step": 858 + }, + { + "epoch": 0.34, + "grad_norm": 5.285596884849708, + "learning_rate": 1.5270475455748165e-05, + "loss": 3.9812, + "step": 859 + }, + { + "epoch": 0.34, + "grad_norm": 4.5621322826993085, + "learning_rate": 1.5259461395208628e-05, + "loss": 4.0867, + "step": 860 + }, + { + "epoch": 0.34, + "grad_norm": 3.7099938172783467, + "learning_rate": 1.5248438507580806e-05, + "loss": 4.1262, + "step": 861 + }, + { + "epoch": 0.34, + "grad_norm": 4.822370952909795, + "learning_rate": 1.5237406811364682e-05, + "loss": 3.8992, + "step": 862 + }, + { + "epoch": 0.35, + "grad_norm": 4.615930344249073, + "learning_rate": 1.5226366325075042e-05, + "loss": 3.9119, + "step": 863 + }, + { + "epoch": 0.35, + "grad_norm": 4.141109925790369, + "learning_rate": 1.5215317067241415e-05, + "loss": 3.922, + "step": 864 + }, + { + "epoch": 0.35, + "grad_norm": 4.91182469534649, + "learning_rate": 1.5204259056408046e-05, + "loss": 4.1266, + "step": 865 + }, + { + "epoch": 0.35, + "grad_norm": 3.739014813917722, + "learning_rate": 1.5193192311133884e-05, + "loss": 4.1846, + "step": 866 + }, + { + "epoch": 0.35, + "grad_norm": 3.733700050825112, + "learning_rate": 1.5182116849992528e-05, + "loss": 4.1189, + "step": 867 + }, + { + "epoch": 0.35, + "grad_norm": 4.577400393069631, + "learning_rate": 1.5171032691572207e-05, + "loss": 3.8833, + "step": 868 + }, + { + "epoch": 0.35, + "grad_norm": 3.9216623914448743, + 
"learning_rate": 1.5159939854475743e-05, + "loss": 3.8559, + "step": 869 + }, + { + "epoch": 0.35, + "grad_norm": 4.248938111614794, + "learning_rate": 1.5148838357320537e-05, + "loss": 3.9905, + "step": 870 + }, + { + "epoch": 0.35, + "grad_norm": 3.7734583568786384, + "learning_rate": 1.5137728218738504e-05, + "loss": 4.1165, + "step": 871 + }, + { + "epoch": 0.35, + "grad_norm": 3.3465062079897963, + "learning_rate": 1.512660945737608e-05, + "loss": 4.1162, + "step": 872 + }, + { + "epoch": 0.35, + "grad_norm": 4.716306651251567, + "learning_rate": 1.5115482091894164e-05, + "loss": 4.1707, + "step": 873 + }, + { + "epoch": 0.35, + "grad_norm": 3.776023574626814, + "learning_rate": 1.5104346140968096e-05, + "loss": 3.9443, + "step": 874 + }, + { + "epoch": 0.35, + "grad_norm": 3.3972508182391574, + "learning_rate": 1.5093201623287631e-05, + "loss": 4.0875, + "step": 875 + }, + { + "epoch": 0.35, + "grad_norm": 4.557313751621321, + "learning_rate": 1.5082048557556892e-05, + "loss": 3.9307, + "step": 876 + }, + { + "epoch": 0.35, + "grad_norm": 4.009659574014854, + "learning_rate": 1.507088696249436e-05, + "loss": 3.9834, + "step": 877 + }, + { + "epoch": 0.35, + "grad_norm": 3.4421878630309575, + "learning_rate": 1.505971685683282e-05, + "loss": 3.9976, + "step": 878 + }, + { + "epoch": 0.35, + "grad_norm": 4.090750290145485, + "learning_rate": 1.5048538259319347e-05, + "loss": 3.7983, + "step": 879 + }, + { + "epoch": 0.35, + "grad_norm": 4.084453101262391, + "learning_rate": 1.5037351188715265e-05, + "loss": 3.8341, + "step": 880 + }, + { + "epoch": 0.35, + "grad_norm": 3.7389574740374556, + "learning_rate": 1.5026155663796123e-05, + "loss": 3.6981, + "step": 881 + }, + { + "epoch": 0.35, + "grad_norm": 3.2705691585414263, + "learning_rate": 1.5014951703351655e-05, + "loss": 3.9908, + "step": 882 + }, + { + "epoch": 0.35, + "grad_norm": 3.6909533401956316, + "learning_rate": 1.500373932618575e-05, + "loss": 3.8345, + "step": 883 + }, + { + "epoch": 0.35, + "grad_norm": 3.8291194048755024, + "learning_rate": 1.4992518551116436e-05, + "loss": 4.0007, + "step": 884 + }, + { + "epoch": 0.35, + "grad_norm": 2.9216771533735, + "learning_rate": 1.4981289396975818e-05, + "loss": 4.0789, + "step": 885 + }, + { + "epoch": 0.35, + "grad_norm": 3.5513257467131147, + "learning_rate": 1.4970051882610073e-05, + "loss": 4.0285, + "step": 886 + }, + { + "epoch": 0.35, + "grad_norm": 3.938085563815545, + "learning_rate": 1.4958806026879411e-05, + "loss": 3.9651, + "step": 887 + }, + { + "epoch": 0.36, + "grad_norm": 3.200770892095531, + "learning_rate": 1.4947551848658036e-05, + "loss": 4.0959, + "step": 888 + }, + { + "epoch": 0.36, + "grad_norm": 3.9964458964377174, + "learning_rate": 1.4936289366834123e-05, + "loss": 3.8196, + "step": 889 + }, + { + "epoch": 0.36, + "grad_norm": 3.3213563792790763, + "learning_rate": 1.4925018600309784e-05, + "loss": 4.109, + "step": 890 + }, + { + "epoch": 0.36, + "grad_norm": 4.041337943923999, + "learning_rate": 1.4913739568001034e-05, + "loss": 4.0301, + "step": 891 + }, + { + "epoch": 0.36, + "grad_norm": 3.4764085757009324, + "learning_rate": 1.4902452288837761e-05, + "loss": 3.8225, + "step": 892 + }, + { + "epoch": 0.36, + "grad_norm": 3.592918007172055, + "learning_rate": 1.4891156781763692e-05, + "loss": 3.8364, + "step": 893 + }, + { + "epoch": 0.36, + "grad_norm": 3.592437141265646, + "learning_rate": 1.4879853065736366e-05, + "loss": 4.0147, + "step": 894 + }, + { + "epoch": 0.36, + "grad_norm": 3.7811593209134333, + "learning_rate": 
1.4868541159727097e-05, + "loss": 3.8975, + "step": 895 + }, + { + "epoch": 0.36, + "grad_norm": 3.17581172791539, + "learning_rate": 1.485722108272095e-05, + "loss": 4.0324, + "step": 896 + }, + { + "epoch": 0.36, + "grad_norm": 3.631807623212032, + "learning_rate": 1.4845892853716692e-05, + "loss": 3.9237, + "step": 897 + }, + { + "epoch": 0.36, + "grad_norm": 3.7247974116244467, + "learning_rate": 1.4834556491726781e-05, + "loss": 3.9782, + "step": 898 + }, + { + "epoch": 0.36, + "grad_norm": 4.953397859539653, + "learning_rate": 1.482321201577733e-05, + "loss": 4.0207, + "step": 899 + }, + { + "epoch": 0.36, + "grad_norm": 3.286067218324245, + "learning_rate": 1.4811859444908053e-05, + "loss": 3.9184, + "step": 900 + }, + { + "epoch": 0.36, + "grad_norm": 3.9689525465502546, + "learning_rate": 1.4800498798172263e-05, + "loss": 3.9761, + "step": 901 + }, + { + "epoch": 0.36, + "grad_norm": 3.9432696950576682, + "learning_rate": 1.478913009463682e-05, + "loss": 3.8415, + "step": 902 + }, + { + "epoch": 0.36, + "grad_norm": 3.784967591738362, + "learning_rate": 1.4777753353382121e-05, + "loss": 3.9511, + "step": 903 + }, + { + "epoch": 0.36, + "grad_norm": 3.649524601044959, + "learning_rate": 1.4766368593502028e-05, + "loss": 3.9241, + "step": 904 + }, + { + "epoch": 0.36, + "grad_norm": 4.125027971868248, + "learning_rate": 1.4754975834103877e-05, + "loss": 3.8264, + "step": 905 + }, + { + "epoch": 0.36, + "grad_norm": 4.167521116754602, + "learning_rate": 1.474357509430843e-05, + "loss": 3.9653, + "step": 906 + }, + { + "epoch": 0.36, + "grad_norm": 4.694796156291042, + "learning_rate": 1.473216639324984e-05, + "loss": 3.8908, + "step": 907 + }, + { + "epoch": 0.36, + "grad_norm": 3.880655396365558, + "learning_rate": 1.472074975007562e-05, + "loss": 3.8734, + "step": 908 + }, + { + "epoch": 0.36, + "grad_norm": 3.7017895815934265, + "learning_rate": 1.4709325183946613e-05, + "loss": 3.9442, + "step": 909 + }, + { + "epoch": 0.36, + "grad_norm": 4.055857844353811, + "learning_rate": 1.4697892714036959e-05, + "loss": 3.9686, + "step": 910 + }, + { + "epoch": 0.36, + "grad_norm": 4.938238712811652, + "learning_rate": 1.4686452359534067e-05, + "loss": 3.9718, + "step": 911 + }, + { + "epoch": 0.36, + "grad_norm": 3.633536668094118, + "learning_rate": 1.467500413963857e-05, + "loss": 4.0176, + "step": 912 + }, + { + "epoch": 0.37, + "grad_norm": 5.039475965540521, + "learning_rate": 1.4663548073564316e-05, + "loss": 3.8204, + "step": 913 + }, + { + "epoch": 0.37, + "grad_norm": 4.586728291833797, + "learning_rate": 1.4652084180538304e-05, + "loss": 3.9394, + "step": 914 + }, + { + "epoch": 0.37, + "grad_norm": 3.6290231319278643, + "learning_rate": 1.4640612479800686e-05, + "loss": 3.9651, + "step": 915 + }, + { + "epoch": 0.37, + "grad_norm": 4.523810100980663, + "learning_rate": 1.4629132990604706e-05, + "loss": 3.9046, + "step": 916 + }, + { + "epoch": 0.37, + "grad_norm": 3.3352008780739277, + "learning_rate": 1.4617645732216686e-05, + "loss": 3.9908, + "step": 917 + }, + { + "epoch": 0.37, + "grad_norm": 4.216385007164741, + "learning_rate": 1.4606150723915984e-05, + "loss": 4.1919, + "step": 918 + }, + { + "epoch": 0.37, + "grad_norm": 3.3379333114880017, + "learning_rate": 1.4594647984994966e-05, + "loss": 3.8451, + "step": 919 + }, + { + "epoch": 0.37, + "grad_norm": 4.326212397572884, + "learning_rate": 1.4583137534758968e-05, + "loss": 4.0293, + "step": 920 + }, + { + "epoch": 0.37, + "grad_norm": 4.104354270670977, + "learning_rate": 1.4571619392526279e-05, + "loss": 4.0689, + 
"step": 921 + }, + { + "epoch": 0.37, + "grad_norm": 4.855954672044245, + "learning_rate": 1.456009357762809e-05, + "loss": 3.8103, + "step": 922 + }, + { + "epoch": 0.37, + "grad_norm": 3.4625438644586413, + "learning_rate": 1.4548560109408465e-05, + "loss": 3.9354, + "step": 923 + }, + { + "epoch": 0.37, + "grad_norm": 3.244228225779234, + "learning_rate": 1.4537019007224324e-05, + "loss": 4.1405, + "step": 924 + }, + { + "epoch": 0.37, + "grad_norm": 4.736402044407656, + "learning_rate": 1.4525470290445392e-05, + "loss": 3.9756, + "step": 925 + }, + { + "epoch": 0.37, + "grad_norm": 3.804301068297979, + "learning_rate": 1.4513913978454169e-05, + "loss": 4.1198, + "step": 926 + }, + { + "epoch": 0.37, + "grad_norm": 4.157084681702177, + "learning_rate": 1.4502350090645919e-05, + "loss": 3.8946, + "step": 927 + }, + { + "epoch": 0.37, + "grad_norm": 3.4439326017172562, + "learning_rate": 1.4490778646428601e-05, + "loss": 3.9797, + "step": 928 + }, + { + "epoch": 0.37, + "grad_norm": 3.855937635878492, + "learning_rate": 1.4479199665222869e-05, + "loss": 3.8229, + "step": 929 + }, + { + "epoch": 0.37, + "grad_norm": 3.52379856141619, + "learning_rate": 1.4467613166462024e-05, + "loss": 4.1031, + "step": 930 + }, + { + "epoch": 0.37, + "grad_norm": 4.343080438472811, + "learning_rate": 1.445601916959198e-05, + "loss": 3.8626, + "step": 931 + }, + { + "epoch": 0.37, + "grad_norm": 3.3607075037651586, + "learning_rate": 1.4444417694071242e-05, + "loss": 4.0184, + "step": 932 + }, + { + "epoch": 0.37, + "grad_norm": 3.7952187164245066, + "learning_rate": 1.4432808759370853e-05, + "loss": 3.7856, + "step": 933 + }, + { + "epoch": 0.37, + "grad_norm": 4.346280524599407, + "learning_rate": 1.4421192384974396e-05, + "loss": 3.8906, + "step": 934 + }, + { + "epoch": 0.37, + "grad_norm": 3.4646318285115116, + "learning_rate": 1.4409568590377918e-05, + "loss": 4.0531, + "step": 935 + }, + { + "epoch": 0.37, + "grad_norm": 4.182963241175465, + "learning_rate": 1.439793739508994e-05, + "loss": 3.8542, + "step": 936 + }, + { + "epoch": 0.37, + "grad_norm": 3.7121887516870933, + "learning_rate": 1.4386298818631388e-05, + "loss": 3.9009, + "step": 937 + }, + { + "epoch": 0.38, + "grad_norm": 3.281667995792813, + "learning_rate": 1.437465288053558e-05, + "loss": 3.9757, + "step": 938 + }, + { + "epoch": 0.38, + "grad_norm": 3.356495016960309, + "learning_rate": 1.4362999600348198e-05, + "loss": 3.8267, + "step": 939 + }, + { + "epoch": 0.38, + "grad_norm": 4.711219010948904, + "learning_rate": 1.4351338997627233e-05, + "loss": 3.9867, + "step": 940 + }, + { + "epoch": 0.38, + "grad_norm": 3.556481836187361, + "learning_rate": 1.433967109194298e-05, + "loss": 3.9451, + "step": 941 + }, + { + "epoch": 0.38, + "grad_norm": 4.265019987798413, + "learning_rate": 1.4327995902877972e-05, + "loss": 3.8217, + "step": 942 + }, + { + "epoch": 0.38, + "grad_norm": 3.703379886530379, + "learning_rate": 1.4316313450026986e-05, + "loss": 3.8829, + "step": 943 + }, + { + "epoch": 0.38, + "grad_norm": 3.5889354627304004, + "learning_rate": 1.4304623752996974e-05, + "loss": 4.1259, + "step": 944 + }, + { + "epoch": 0.38, + "grad_norm": 4.097019004750559, + "learning_rate": 1.429292683140706e-05, + "loss": 3.7848, + "step": 945 + }, + { + "epoch": 0.38, + "grad_norm": 3.7604395396181123, + "learning_rate": 1.428122270488848e-05, + "loss": 4.0124, + "step": 946 + }, + { + "epoch": 0.38, + "grad_norm": 3.1999883377450113, + "learning_rate": 1.4269511393084572e-05, + "loss": 4.0831, + "step": 947 + }, + { + "epoch": 0.38, + 
"grad_norm": 3.382731690898, + "learning_rate": 1.4257792915650728e-05, + "loss": 3.8673, + "step": 948 + }, + { + "epoch": 0.38, + "grad_norm": 3.513739284257866, + "learning_rate": 1.4246067292254367e-05, + "loss": 3.8318, + "step": 949 + }, + { + "epoch": 0.38, + "grad_norm": 3.723969538475621, + "learning_rate": 1.4234334542574906e-05, + "loss": 3.9158, + "step": 950 + }, + { + "epoch": 0.38, + "grad_norm": 3.389218087656833, + "learning_rate": 1.4222594686303707e-05, + "loss": 3.9276, + "step": 951 + }, + { + "epoch": 0.38, + "grad_norm": 4.042149116988585, + "learning_rate": 1.4210847743144087e-05, + "loss": 3.8478, + "step": 952 + }, + { + "epoch": 0.38, + "grad_norm": 3.823734180174385, + "learning_rate": 1.4199093732811227e-05, + "loss": 3.8644, + "step": 953 + }, + { + "epoch": 0.38, + "grad_norm": 3.4666325491254772, + "learning_rate": 1.4187332675032189e-05, + "loss": 3.9167, + "step": 954 + }, + { + "epoch": 0.38, + "grad_norm": 4.565609657289887, + "learning_rate": 1.4175564589545853e-05, + "loss": 3.7594, + "step": 955 + }, + { + "epoch": 0.38, + "grad_norm": 4.279210813025605, + "learning_rate": 1.4163789496102902e-05, + "loss": 3.9157, + "step": 956 + }, + { + "epoch": 0.38, + "grad_norm": 3.6109420515584296, + "learning_rate": 1.4152007414465771e-05, + "loss": 3.775, + "step": 957 + }, + { + "epoch": 0.38, + "grad_norm": 3.5183027997433562, + "learning_rate": 1.4140218364408634e-05, + "loss": 3.8613, + "step": 958 + }, + { + "epoch": 0.38, + "grad_norm": 4.7680112519128635, + "learning_rate": 1.4128422365717346e-05, + "loss": 3.8931, + "step": 959 + }, + { + "epoch": 0.38, + "grad_norm": 4.065893326610565, + "learning_rate": 1.411661943818944e-05, + "loss": 3.926, + "step": 960 + }, + { + "epoch": 0.38, + "grad_norm": 4.062397562509862, + "learning_rate": 1.4104809601634069e-05, + "loss": 3.8562, + "step": 961 + }, + { + "epoch": 0.38, + "grad_norm": 4.143634258192227, + "learning_rate": 1.409299287587198e-05, + "loss": 3.7753, + "step": 962 + }, + { + "epoch": 0.39, + "grad_norm": 4.103196921879452, + "learning_rate": 1.4081169280735488e-05, + "loss": 4.0006, + "step": 963 + }, + { + "epoch": 0.39, + "grad_norm": 3.7702584463005606, + "learning_rate": 1.4069338836068434e-05, + "loss": 3.8562, + "step": 964 + }, + { + "epoch": 0.39, + "grad_norm": 3.664516815419518, + "learning_rate": 1.4057501561726157e-05, + "loss": 4.1171, + "step": 965 + }, + { + "epoch": 0.39, + "grad_norm": 4.196260439411363, + "learning_rate": 1.404565747757545e-05, + "loss": 3.872, + "step": 966 + }, + { + "epoch": 0.39, + "grad_norm": 3.837246468651308, + "learning_rate": 1.403380660349455e-05, + "loss": 4.0388, + "step": 967 + }, + { + "epoch": 0.39, + "grad_norm": 3.665536340443557, + "learning_rate": 1.4021948959373075e-05, + "loss": 3.9043, + "step": 968 + }, + { + "epoch": 0.39, + "grad_norm": 4.561871423373831, + "learning_rate": 1.4010084565112018e-05, + "loss": 4.0049, + "step": 969 + }, + { + "epoch": 0.39, + "grad_norm": 4.31227711238914, + "learning_rate": 1.3998213440623691e-05, + "loss": 3.9867, + "step": 970 + }, + { + "epoch": 0.39, + "grad_norm": 3.3638177448238977, + "learning_rate": 1.3986335605831707e-05, + "loss": 4.2255, + "step": 971 + }, + { + "epoch": 0.39, + "grad_norm": 3.7103949192821384, + "learning_rate": 1.3974451080670934e-05, + "loss": 3.9219, + "step": 972 + }, + { + "epoch": 0.39, + "grad_norm": 3.981342474110125, + "learning_rate": 1.3962559885087482e-05, + "loss": 3.8157, + "step": 973 + }, + { + "epoch": 0.39, + "grad_norm": 4.679894552303274, + 
"learning_rate": 1.3950662039038643e-05, + "loss": 3.9717, + "step": 974 + }, + { + "epoch": 0.39, + "grad_norm": 4.367030262675531, + "learning_rate": 1.3938757562492873e-05, + "loss": 3.8756, + "step": 975 + }, + { + "epoch": 0.39, + "grad_norm": 3.7171472220445456, + "learning_rate": 1.3926846475429767e-05, + "loss": 3.8459, + "step": 976 + }, + { + "epoch": 0.39, + "grad_norm": 4.508351826758392, + "learning_rate": 1.3914928797839996e-05, + "loss": 3.6319, + "step": 977 + }, + { + "epoch": 0.39, + "grad_norm": 4.474974790718888, + "learning_rate": 1.3903004549725313e-05, + "loss": 3.8134, + "step": 978 + }, + { + "epoch": 0.39, + "grad_norm": 3.555949869772792, + "learning_rate": 1.3891073751098481e-05, + "loss": 3.917, + "step": 979 + }, + { + "epoch": 0.39, + "grad_norm": 4.225391171204873, + "learning_rate": 1.3879136421983265e-05, + "loss": 3.9551, + "step": 980 + }, + { + "epoch": 0.39, + "grad_norm": 3.666887533774541, + "learning_rate": 1.3867192582414393e-05, + "loss": 3.8632, + "step": 981 + }, + { + "epoch": 0.39, + "grad_norm": 3.802718705221923, + "learning_rate": 1.3855242252437511e-05, + "loss": 3.9409, + "step": 982 + }, + { + "epoch": 0.39, + "grad_norm": 3.5413928934272914, + "learning_rate": 1.3843285452109166e-05, + "loss": 3.8817, + "step": 983 + }, + { + "epoch": 0.39, + "grad_norm": 4.159489305465269, + "learning_rate": 1.3831322201496757e-05, + "loss": 3.857, + "step": 984 + }, + { + "epoch": 0.39, + "grad_norm": 4.823310889696872, + "learning_rate": 1.3819352520678519e-05, + "loss": 4.1045, + "step": 985 + }, + { + "epoch": 0.39, + "grad_norm": 3.3447101805626867, + "learning_rate": 1.3807376429743467e-05, + "loss": 3.9642, + "step": 986 + }, + { + "epoch": 0.39, + "grad_norm": 3.869554340354115, + "learning_rate": 1.3795393948791382e-05, + "loss": 3.8103, + "step": 987 + }, + { + "epoch": 0.4, + "grad_norm": 4.624295111996602, + "learning_rate": 1.3783405097932772e-05, + "loss": 3.7571, + "step": 988 + }, + { + "epoch": 0.4, + "grad_norm": 3.8202389870193754, + "learning_rate": 1.3771409897288823e-05, + "loss": 3.7456, + "step": 989 + }, + { + "epoch": 0.4, + "grad_norm": 3.4182725294539593, + "learning_rate": 1.3759408366991391e-05, + "loss": 3.9806, + "step": 990 + }, + { + "epoch": 0.4, + "grad_norm": 3.899990351720217, + "learning_rate": 1.3747400527182952e-05, + "loss": 4.1462, + "step": 991 + }, + { + "epoch": 0.4, + "grad_norm": 3.2462833506393514, + "learning_rate": 1.373538639801657e-05, + "loss": 4.1141, + "step": 992 + }, + { + "epoch": 0.4, + "grad_norm": 3.778151533518385, + "learning_rate": 1.3723365999655859e-05, + "loss": 3.8378, + "step": 993 + }, + { + "epoch": 0.4, + "grad_norm": 3.4612512836444447, + "learning_rate": 1.3711339352274969e-05, + "loss": 3.961, + "step": 994 + }, + { + "epoch": 0.4, + "grad_norm": 3.4892336558699677, + "learning_rate": 1.3699306476058523e-05, + "loss": 3.8107, + "step": 995 + }, + { + "epoch": 0.4, + "grad_norm": 3.667163737052047, + "learning_rate": 1.3687267391201604e-05, + "loss": 3.8194, + "step": 996 + }, + { + "epoch": 0.4, + "grad_norm": 3.7753395478216945, + "learning_rate": 1.3675222117909716e-05, + "loss": 3.8419, + "step": 997 + }, + { + "epoch": 0.4, + "grad_norm": 4.210105830897785, + "learning_rate": 1.366317067639875e-05, + "loss": 3.8023, + "step": 998 + }, + { + "epoch": 0.4, + "grad_norm": 3.723917366613343, + "learning_rate": 1.3651113086894951e-05, + "loss": 4.0149, + "step": 999 + }, + { + "epoch": 0.4, + "grad_norm": 4.244202977967538, + "learning_rate": 1.3639049369634878e-05, + "loss": 
4.0083, + "step": 1000 + }, + { + "epoch": 0.4, + "grad_norm": 4.02953633918017, + "learning_rate": 1.3626979544865369e-05, + "loss": 3.6242, + "step": 1001 + }, + { + "epoch": 0.4, + "grad_norm": 3.8798268517813534, + "learning_rate": 1.3614903632843523e-05, + "loss": 3.8001, + "step": 1002 + }, + { + "epoch": 0.4, + "grad_norm": 3.9228004237258767, + "learning_rate": 1.3602821653836654e-05, + "loss": 3.8272, + "step": 1003 + }, + { + "epoch": 0.4, + "grad_norm": 3.904727851654214, + "learning_rate": 1.3590733628122253e-05, + "loss": 3.9567, + "step": 1004 + }, + { + "epoch": 0.4, + "grad_norm": 3.7882268070089697, + "learning_rate": 1.357863957598796e-05, + "loss": 3.8891, + "step": 1005 + }, + { + "epoch": 0.4, + "grad_norm": 5.706967784285879, + "learning_rate": 1.3566539517731536e-05, + "loss": 3.8622, + "step": 1006 + }, + { + "epoch": 0.4, + "grad_norm": 4.748018650903142, + "learning_rate": 1.3554433473660818e-05, + "loss": 3.7923, + "step": 1007 + }, + { + "epoch": 0.4, + "grad_norm": 3.4758174617619013, + "learning_rate": 1.354232146409368e-05, + "loss": 3.8247, + "step": 1008 + }, + { + "epoch": 0.4, + "grad_norm": 4.646296934730249, + "learning_rate": 1.353020350935803e-05, + "loss": 3.828, + "step": 1009 + }, + { + "epoch": 0.4, + "grad_norm": 4.470041626731096, + "learning_rate": 1.3518079629791725e-05, + "loss": 3.7678, + "step": 1010 + }, + { + "epoch": 0.4, + "grad_norm": 4.079568258122262, + "learning_rate": 1.3505949845742599e-05, + "loss": 3.7364, + "step": 1011 + }, + { + "epoch": 0.4, + "grad_norm": 3.8456370828497772, + "learning_rate": 1.3493814177568365e-05, + "loss": 3.9458, + "step": 1012 + }, + { + "epoch": 0.41, + "grad_norm": 3.6309196422021865, + "learning_rate": 1.3481672645636627e-05, + "loss": 3.8194, + "step": 1013 + }, + { + "epoch": 0.41, + "grad_norm": 4.163434543886125, + "learning_rate": 1.3469525270324835e-05, + "loss": 3.728, + "step": 1014 + }, + { + "epoch": 0.41, + "grad_norm": 4.21450380310228, + "learning_rate": 1.345737207202023e-05, + "loss": 3.9219, + "step": 1015 + }, + { + "epoch": 0.41, + "grad_norm": 3.974894908073055, + "learning_rate": 1.3445213071119841e-05, + "loss": 3.795, + "step": 1016 + }, + { + "epoch": 0.41, + "grad_norm": 3.7309967078096324, + "learning_rate": 1.3433048288030424e-05, + "loss": 3.9291, + "step": 1017 + }, + { + "epoch": 0.41, + "grad_norm": 4.882915305390519, + "learning_rate": 1.342087774316845e-05, + "loss": 3.9465, + "step": 1018 + }, + { + "epoch": 0.41, + "grad_norm": 5.298426712564282, + "learning_rate": 1.3408701456960052e-05, + "loss": 3.9188, + "step": 1019 + }, + { + "epoch": 0.41, + "grad_norm": 4.075499444985152, + "learning_rate": 1.3396519449841006e-05, + "loss": 3.8499, + "step": 1020 + }, + { + "epoch": 0.41, + "grad_norm": 4.058588213246766, + "learning_rate": 1.338433174225668e-05, + "loss": 3.9857, + "step": 1021 + }, + { + "epoch": 0.41, + "grad_norm": 4.441001425059062, + "learning_rate": 1.3372138354662018e-05, + "loss": 3.6495, + "step": 1022 + }, + { + "epoch": 0.41, + "grad_norm": 4.737403468138393, + "learning_rate": 1.3359939307521494e-05, + "loss": 3.5714, + "step": 1023 + }, + { + "epoch": 0.41, + "grad_norm": 4.489644852015869, + "learning_rate": 1.3347734621309076e-05, + "loss": 3.7954, + "step": 1024 + }, + { + "epoch": 0.41, + "grad_norm": 4.698554077016783, + "learning_rate": 1.3335524316508208e-05, + "loss": 3.7721, + "step": 1025 + }, + { + "epoch": 0.41, + "grad_norm": 4.537685736192624, + "learning_rate": 1.3323308413611748e-05, + "loss": 3.6806, + "step": 1026 + }, + { + 
"epoch": 0.41, + "grad_norm": 4.6844980885590415, + "learning_rate": 1.3311086933121961e-05, + "loss": 3.95, + "step": 1027 + }, + { + "epoch": 0.41, + "grad_norm": 3.7048407750955152, + "learning_rate": 1.3298859895550473e-05, + "loss": 3.9469, + "step": 1028 + }, + { + "epoch": 0.41, + "grad_norm": 3.66973118936307, + "learning_rate": 1.3286627321418229e-05, + "loss": 3.8629, + "step": 1029 + }, + { + "epoch": 0.41, + "grad_norm": 3.8653046493517436, + "learning_rate": 1.3274389231255466e-05, + "loss": 3.8781, + "step": 1030 + }, + { + "epoch": 0.41, + "grad_norm": 4.60833520162824, + "learning_rate": 1.3262145645601693e-05, + "loss": 3.786, + "step": 1031 + }, + { + "epoch": 0.41, + "grad_norm": 3.5817623342333627, + "learning_rate": 1.3249896585005628e-05, + "loss": 3.877, + "step": 1032 + }, + { + "epoch": 0.41, + "grad_norm": 4.387212721078502, + "learning_rate": 1.3237642070025183e-05, + "loss": 3.8994, + "step": 1033 + }, + { + "epoch": 0.41, + "grad_norm": 4.159428683537451, + "learning_rate": 1.322538212122742e-05, + "loss": 3.7944, + "step": 1034 + }, + { + "epoch": 0.41, + "grad_norm": 3.6203299759058987, + "learning_rate": 1.3213116759188525e-05, + "loss": 3.985, + "step": 1035 + }, + { + "epoch": 0.41, + "grad_norm": 4.1657884558644, + "learning_rate": 1.320084600449377e-05, + "loss": 3.7315, + "step": 1036 + }, + { + "epoch": 0.41, + "grad_norm": 4.281052074286255, + "learning_rate": 1.3188569877737474e-05, + "loss": 3.7459, + "step": 1037 + }, + { + "epoch": 0.42, + "grad_norm": 3.7677672464400835, + "learning_rate": 1.3176288399522975e-05, + "loss": 3.5716, + "step": 1038 + }, + { + "epoch": 0.42, + "grad_norm": 4.428918038754764, + "learning_rate": 1.3164001590462592e-05, + "loss": 3.8393, + "step": 1039 + }, + { + "epoch": 0.42, + "grad_norm": 3.298395603507693, + "learning_rate": 1.3151709471177589e-05, + "loss": 3.8122, + "step": 1040 + }, + { + "epoch": 0.42, + "grad_norm": 4.280164110278819, + "learning_rate": 1.3139412062298141e-05, + "loss": 3.5995, + "step": 1041 + }, + { + "epoch": 0.42, + "grad_norm": 3.500876125985926, + "learning_rate": 1.312710938446331e-05, + "loss": 3.8619, + "step": 1042 + }, + { + "epoch": 0.42, + "grad_norm": 3.6210925789968, + "learning_rate": 1.3114801458320988e-05, + "loss": 3.6235, + "step": 1043 + }, + { + "epoch": 0.42, + "grad_norm": 3.7574064252584147, + "learning_rate": 1.3102488304527883e-05, + "loss": 3.8639, + "step": 1044 + }, + { + "epoch": 0.42, + "grad_norm": 4.1328056061368095, + "learning_rate": 1.3090169943749475e-05, + "loss": 3.987, + "step": 1045 + }, + { + "epoch": 0.42, + "grad_norm": 4.01823522119479, + "learning_rate": 1.3077846396659986e-05, + "loss": 3.7406, + "step": 1046 + }, + { + "epoch": 0.42, + "grad_norm": 4.1690337753538556, + "learning_rate": 1.3065517683942339e-05, + "loss": 3.8761, + "step": 1047 + }, + { + "epoch": 0.42, + "grad_norm": 4.118559827596486, + "learning_rate": 1.3053183826288124e-05, + "loss": 3.8011, + "step": 1048 + }, + { + "epoch": 0.42, + "grad_norm": 4.382326156860178, + "learning_rate": 1.3040844844397573e-05, + "loss": 3.7709, + "step": 1049 + }, + { + "epoch": 0.42, + "grad_norm": 5.4798186185746225, + "learning_rate": 1.3028500758979507e-05, + "loss": 3.9834, + "step": 1050 + }, + { + "epoch": 0.42, + "grad_norm": 4.40810523021627, + "learning_rate": 1.3016151590751332e-05, + "loss": 3.7962, + "step": 1051 + }, + { + "epoch": 0.42, + "grad_norm": 3.8039902850152925, + "learning_rate": 1.3003797360438961e-05, + "loss": 3.9683, + "step": 1052 + }, + { + "epoch": 0.42, + 
"grad_norm": 3.894557739361905, + "learning_rate": 1.2991438088776818e-05, + "loss": 3.6801, + "step": 1053 + }, + { + "epoch": 0.42, + "grad_norm": 5.239443292560934, + "learning_rate": 1.2979073796507786e-05, + "loss": 3.8762, + "step": 1054 + }, + { + "epoch": 0.42, + "grad_norm": 3.5021977746058317, + "learning_rate": 1.296670450438317e-05, + "loss": 3.774, + "step": 1055 + }, + { + "epoch": 0.42, + "grad_norm": 4.1109819520202056, + "learning_rate": 1.2954330233162669e-05, + "loss": 3.7251, + "step": 1056 + }, + { + "epoch": 0.42, + "grad_norm": 3.8759541874915167, + "learning_rate": 1.2941951003614337e-05, + "loss": 3.7179, + "step": 1057 + }, + { + "epoch": 0.42, + "grad_norm": 4.6162051772187995, + "learning_rate": 1.2929566836514556e-05, + "loss": 3.7607, + "step": 1058 + }, + { + "epoch": 0.42, + "grad_norm": 3.801727408834567, + "learning_rate": 1.291717775264798e-05, + "loss": 3.8958, + "step": 1059 + }, + { + "epoch": 0.42, + "grad_norm": 4.407215697298154, + "learning_rate": 1.2904783772807534e-05, + "loss": 3.8602, + "step": 1060 + }, + { + "epoch": 0.42, + "grad_norm": 3.560458114043935, + "learning_rate": 1.2892384917794347e-05, + "loss": 3.9323, + "step": 1061 + }, + { + "epoch": 0.42, + "grad_norm": 3.8926615511691383, + "learning_rate": 1.2879981208417735e-05, + "loss": 3.6922, + "step": 1062 + }, + { + "epoch": 0.43, + "grad_norm": 4.022272943906542, + "learning_rate": 1.2867572665495156e-05, + "loss": 3.8297, + "step": 1063 + }, + { + "epoch": 0.43, + "grad_norm": 4.174392840890098, + "learning_rate": 1.285515930985219e-05, + "loss": 3.7822, + "step": 1064 + }, + { + "epoch": 0.43, + "grad_norm": 4.48534689846332, + "learning_rate": 1.2842741162322487e-05, + "loss": 3.9492, + "step": 1065 + }, + { + "epoch": 0.43, + "grad_norm": 3.7857352405447555, + "learning_rate": 1.2830318243747736e-05, + "loss": 3.8072, + "step": 1066 + }, + { + "epoch": 0.43, + "grad_norm": 3.607574037963277, + "learning_rate": 1.2817890574977648e-05, + "loss": 3.9189, + "step": 1067 + }, + { + "epoch": 0.43, + "grad_norm": 3.4592415194671173, + "learning_rate": 1.2805458176869885e-05, + "loss": 3.8008, + "step": 1068 + }, + { + "epoch": 0.43, + "grad_norm": 3.7386853887528777, + "learning_rate": 1.2793021070290065e-05, + "loss": 3.8409, + "step": 1069 + }, + { + "epoch": 0.43, + "grad_norm": 4.53759324100156, + "learning_rate": 1.2780579276111702e-05, + "loss": 3.6507, + "step": 1070 + }, + { + "epoch": 0.43, + "grad_norm": 4.945952450142829, + "learning_rate": 1.2768132815216174e-05, + "loss": 3.8486, + "step": 1071 + }, + { + "epoch": 0.43, + "grad_norm": 3.8818735387168606, + "learning_rate": 1.2755681708492696e-05, + "loss": 3.751, + "step": 1072 + }, + { + "epoch": 0.43, + "grad_norm": 4.159697782524046, + "learning_rate": 1.2743225976838277e-05, + "loss": 3.7202, + "step": 1073 + }, + { + "epoch": 0.43, + "grad_norm": 3.688463314558805, + "learning_rate": 1.2730765641157689e-05, + "loss": 3.8645, + "step": 1074 + }, + { + "epoch": 0.43, + "grad_norm": 3.8295978281953693, + "learning_rate": 1.2718300722363431e-05, + "loss": 3.7975, + "step": 1075 + }, + { + "epoch": 0.43, + "grad_norm": 3.924531322625543, + "learning_rate": 1.2705831241375695e-05, + "loss": 3.5147, + "step": 1076 + }, + { + "epoch": 0.43, + "grad_norm": 4.318083146218096, + "learning_rate": 1.2693357219122331e-05, + "loss": 3.9557, + "step": 1077 + }, + { + "epoch": 0.43, + "grad_norm": 3.677130448505102, + "learning_rate": 1.2680878676538804e-05, + "loss": 3.7197, + "step": 1078 + }, + { + "epoch": 0.43, + "grad_norm": 
4.8463986020512575, + "learning_rate": 1.2668395634568175e-05, + "loss": 3.777, + "step": 1079 + }, + { + "epoch": 0.43, + "grad_norm": 5.058312752920507, + "learning_rate": 1.2655908114161053e-05, + "loss": 4.0007, + "step": 1080 + }, + { + "epoch": 0.43, + "grad_norm": 3.638632419006762, + "learning_rate": 1.2643416136275557e-05, + "loss": 3.7884, + "step": 1081 + }, + { + "epoch": 0.43, + "grad_norm": 3.5887543927846846, + "learning_rate": 1.2630919721877299e-05, + "loss": 3.8393, + "step": 1082 + }, + { + "epoch": 0.43, + "grad_norm": 4.149681723639608, + "learning_rate": 1.261841889193932e-05, + "loss": 3.7256, + "step": 1083 + }, + { + "epoch": 0.43, + "grad_norm": 4.897842387997689, + "learning_rate": 1.2605913667442096e-05, + "loss": 3.817, + "step": 1084 + }, + { + "epoch": 0.43, + "grad_norm": 4.088190503564847, + "learning_rate": 1.2593404069373452e-05, + "loss": 3.7328, + "step": 1085 + }, + { + "epoch": 0.43, + "grad_norm": 4.346641744766435, + "learning_rate": 1.2580890118728572e-05, + "loss": 3.6659, + "step": 1086 + }, + { + "epoch": 0.43, + "grad_norm": 3.9172265316872497, + "learning_rate": 1.2568371836509936e-05, + "loss": 3.6883, + "step": 1087 + }, + { + "epoch": 0.44, + "grad_norm": 4.0186942695070895, + "learning_rate": 1.2555849243727298e-05, + "loss": 3.756, + "step": 1088 + }, + { + "epoch": 0.44, + "grad_norm": 4.2788739628746395, + "learning_rate": 1.2543322361397648e-05, + "loss": 3.8754, + "step": 1089 + }, + { + "epoch": 0.44, + "grad_norm": 4.187805152986359, + "learning_rate": 1.2530791210545163e-05, + "loss": 3.7108, + "step": 1090 + }, + { + "epoch": 0.44, + "grad_norm": 4.32176195886882, + "learning_rate": 1.2518255812201203e-05, + "loss": 3.648, + "step": 1091 + }, + { + "epoch": 0.44, + "grad_norm": 3.967619501198415, + "learning_rate": 1.2505716187404242e-05, + "loss": 3.9049, + "step": 1092 + }, + { + "epoch": 0.44, + "grad_norm": 4.8773142998279955, + "learning_rate": 1.2493172357199856e-05, + "loss": 3.6919, + "step": 1093 + }, + { + "epoch": 0.44, + "grad_norm": 3.6144010815109398, + "learning_rate": 1.2480624342640673e-05, + "loss": 3.7658, + "step": 1094 + }, + { + "epoch": 0.44, + "grad_norm": 4.2419265154497285, + "learning_rate": 1.2468072164786342e-05, + "loss": 3.8289, + "step": 1095 + }, + { + "epoch": 0.44, + "grad_norm": 3.481230443413657, + "learning_rate": 1.2455515844703512e-05, + "loss": 3.8465, + "step": 1096 + }, + { + "epoch": 0.44, + "grad_norm": 4.165489059771342, + "learning_rate": 1.2442955403465768e-05, + "loss": 3.6346, + "step": 1097 + }, + { + "epoch": 0.44, + "grad_norm": 3.8231188657999677, + "learning_rate": 1.2430390862153625e-05, + "loss": 3.8528, + "step": 1098 + }, + { + "epoch": 0.44, + "grad_norm": 3.896380824909673, + "learning_rate": 1.2417822241854466e-05, + "loss": 3.785, + "step": 1099 + }, + { + "epoch": 0.44, + "grad_norm": 3.190572844245639, + "learning_rate": 1.2405249563662539e-05, + "loss": 3.9942, + "step": 1100 + }, + { + "epoch": 0.44, + "grad_norm": 4.049608251353112, + "learning_rate": 1.2392672848678877e-05, + "loss": 3.8083, + "step": 1101 + }, + { + "epoch": 0.44, + "grad_norm": 3.1717023508880833, + "learning_rate": 1.238009211801131e-05, + "loss": 3.8426, + "step": 1102 + }, + { + "epoch": 0.44, + "grad_norm": 3.570540640509857, + "learning_rate": 1.2367507392774398e-05, + "loss": 3.5437, + "step": 1103 + }, + { + "epoch": 0.44, + "grad_norm": 3.5967168685654696, + "learning_rate": 1.2354918694089406e-05, + "loss": 3.7432, + "step": 1104 + }, + { + "epoch": 0.44, + "grad_norm": 
3.6697195244996283, + "learning_rate": 1.2342326043084268e-05, + "loss": 3.7574, + "step": 1105 + }, + { + "epoch": 0.44, + "grad_norm": 3.536043300169957, + "learning_rate": 1.2329729460893552e-05, + "loss": 3.6837, + "step": 1106 + }, + { + "epoch": 0.44, + "grad_norm": 3.736616862422548, + "learning_rate": 1.2317128968658424e-05, + "loss": 3.9007, + "step": 1107 + }, + { + "epoch": 0.44, + "grad_norm": 3.6992203544786433, + "learning_rate": 1.2304524587526609e-05, + "loss": 3.9669, + "step": 1108 + }, + { + "epoch": 0.44, + "grad_norm": 3.499394891696174, + "learning_rate": 1.2291916338652365e-05, + "loss": 3.7316, + "step": 1109 + }, + { + "epoch": 0.44, + "grad_norm": 3.8376821119143663, + "learning_rate": 1.2279304243196438e-05, + "loss": 3.952, + "step": 1110 + }, + { + "epoch": 0.44, + "grad_norm": 4.12009679545591, + "learning_rate": 1.2266688322326024e-05, + "loss": 3.8621, + "step": 1111 + }, + { + "epoch": 0.44, + "grad_norm": 3.411885666351011, + "learning_rate": 1.225406859721475e-05, + "loss": 3.8947, + "step": 1112 + }, + { + "epoch": 0.45, + "grad_norm": 3.975578622769097, + "learning_rate": 1.2241445089042623e-05, + "loss": 3.636, + "step": 1113 + }, + { + "epoch": 0.45, + "grad_norm": 4.414469682007408, + "learning_rate": 1.2228817818995998e-05, + "loss": 3.806, + "step": 1114 + }, + { + "epoch": 0.45, + "grad_norm": 3.831971776801647, + "learning_rate": 1.2216186808267544e-05, + "loss": 3.9733, + "step": 1115 + }, + { + "epoch": 0.45, + "grad_norm": 3.810781796334439, + "learning_rate": 1.2203552078056209e-05, + "loss": 3.772, + "step": 1116 + }, + { + "epoch": 0.45, + "grad_norm": 3.884988279276643, + "learning_rate": 1.2190913649567185e-05, + "loss": 3.932, + "step": 1117 + }, + { + "epoch": 0.45, + "grad_norm": 3.5364222816956867, + "learning_rate": 1.2178271544011864e-05, + "loss": 3.8833, + "step": 1118 + }, + { + "epoch": 0.45, + "grad_norm": 4.086827186062164, + "learning_rate": 1.2165625782607817e-05, + "loss": 3.8387, + "step": 1119 + }, + { + "epoch": 0.45, + "grad_norm": 4.2399315656735865, + "learning_rate": 1.215297638657875e-05, + "loss": 3.7389, + "step": 1120 + }, + { + "epoch": 0.45, + "grad_norm": 3.317299756870864, + "learning_rate": 1.2140323377154467e-05, + "loss": 3.9934, + "step": 1121 + }, + { + "epoch": 0.45, + "grad_norm": 3.755435455736925, + "learning_rate": 1.2127666775570837e-05, + "loss": 3.6894, + "step": 1122 + }, + { + "epoch": 0.45, + "grad_norm": 3.702006572771475, + "learning_rate": 1.211500660306975e-05, + "loss": 3.7638, + "step": 1123 + }, + { + "epoch": 0.45, + "grad_norm": 3.6732674412435555, + "learning_rate": 1.210234288089911e-05, + "loss": 3.5788, + "step": 1124 + }, + { + "epoch": 0.45, + "grad_norm": 3.733951889792287, + "learning_rate": 1.2089675630312755e-05, + "loss": 3.7181, + "step": 1125 + }, + { + "epoch": 0.45, + "grad_norm": 4.5632865766418425, + "learning_rate": 1.2077004872570454e-05, + "loss": 3.7373, + "step": 1126 + }, + { + "epoch": 0.45, + "grad_norm": 3.682464186358648, + "learning_rate": 1.206433062893787e-05, + "loss": 3.7892, + "step": 1127 + }, + { + "epoch": 0.45, + "grad_norm": 3.8827651672073524, + "learning_rate": 1.2051652920686505e-05, + "loss": 3.6032, + "step": 1128 + }, + { + "epoch": 0.45, + "grad_norm": 4.079285241985473, + "learning_rate": 1.2038971769093685e-05, + "loss": 3.8538, + "step": 1129 + }, + { + "epoch": 0.45, + "grad_norm": 4.306943261397105, + "learning_rate": 1.2026287195442503e-05, + "loss": 3.7325, + "step": 1130 + }, + { + "epoch": 0.45, + "grad_norm": 4.058850469182537, + 
"learning_rate": 1.201359922102181e-05, + "loss": 3.7228, + "step": 1131 + }, + { + "epoch": 0.45, + "grad_norm": 3.8833854615216725, + "learning_rate": 1.200090786712615e-05, + "loss": 3.9011, + "step": 1132 + }, + { + "epoch": 0.45, + "grad_norm": 4.401892254030929, + "learning_rate": 1.1988213155055754e-05, + "loss": 3.7903, + "step": 1133 + }, + { + "epoch": 0.45, + "grad_norm": 4.239794919660805, + "learning_rate": 1.1975515106116472e-05, + "loss": 3.7108, + "step": 1134 + }, + { + "epoch": 0.45, + "grad_norm": 3.857128357765898, + "learning_rate": 1.1962813741619777e-05, + "loss": 3.8152, + "step": 1135 + }, + { + "epoch": 0.45, + "grad_norm": 3.7166704461674835, + "learning_rate": 1.1950109082882681e-05, + "loss": 3.692, + "step": 1136 + }, + { + "epoch": 0.45, + "grad_norm": 3.529483667781026, + "learning_rate": 1.193740115122774e-05, + "loss": 4.073, + "step": 1137 + }, + { + "epoch": 0.46, + "grad_norm": 3.79176223989985, + "learning_rate": 1.1924689967983006e-05, + "loss": 3.8253, + "step": 1138 + }, + { + "epoch": 0.46, + "grad_norm": 3.603835067055338, + "learning_rate": 1.191197555448197e-05, + "loss": 3.8131, + "step": 1139 + }, + { + "epoch": 0.46, + "grad_norm": 3.6152376513863014, + "learning_rate": 1.189925793206357e-05, + "loss": 3.7461, + "step": 1140 + }, + { + "epoch": 0.46, + "grad_norm": 3.410975002301851, + "learning_rate": 1.1886537122072106e-05, + "loss": 3.901, + "step": 1141 + }, + { + "epoch": 0.46, + "grad_norm": 3.3186385039853183, + "learning_rate": 1.187381314585725e-05, + "loss": 3.9929, + "step": 1142 + }, + { + "epoch": 0.46, + "grad_norm": 3.7670461549530887, + "learning_rate": 1.1861086024773963e-05, + "loss": 3.7552, + "step": 1143 + }, + { + "epoch": 0.46, + "grad_norm": 3.564484757844791, + "learning_rate": 1.1848355780182502e-05, + "loss": 3.8756, + "step": 1144 + }, + { + "epoch": 0.46, + "grad_norm": 3.719641849142067, + "learning_rate": 1.1835622433448361e-05, + "loss": 3.6392, + "step": 1145 + }, + { + "epoch": 0.46, + "grad_norm": 3.699068447698048, + "learning_rate": 1.1822886005942244e-05, + "loss": 3.6641, + "step": 1146 + }, + { + "epoch": 0.46, + "grad_norm": 3.7519943682708763, + "learning_rate": 1.1810146519040023e-05, + "loss": 3.8413, + "step": 1147 + }, + { + "epoch": 0.46, + "grad_norm": 3.5090371131676132, + "learning_rate": 1.1797403994122698e-05, + "loss": 3.7514, + "step": 1148 + }, + { + "epoch": 0.46, + "grad_norm": 3.483919091866205, + "learning_rate": 1.178465845257638e-05, + "loss": 3.8969, + "step": 1149 + }, + { + "epoch": 0.46, + "grad_norm": 3.9152410753151496, + "learning_rate": 1.177190991579223e-05, + "loss": 3.7882, + "step": 1150 + }, + { + "epoch": 0.46, + "grad_norm": 3.7833928552124285, + "learning_rate": 1.1759158405166446e-05, + "loss": 3.8664, + "step": 1151 + }, + { + "epoch": 0.46, + "grad_norm": 3.217725364864304, + "learning_rate": 1.1746403942100215e-05, + "loss": 3.807, + "step": 1152 + }, + { + "epoch": 0.46, + "grad_norm": 3.4669536263663465, + "learning_rate": 1.1733646547999678e-05, + "loss": 3.7389, + "step": 1153 + }, + { + "epoch": 0.46, + "grad_norm": 3.4928261841706183, + "learning_rate": 1.1720886244275893e-05, + "loss": 3.7921, + "step": 1154 + }, + { + "epoch": 0.46, + "grad_norm": 3.989073154700081, + "learning_rate": 1.1708123052344803e-05, + "loss": 3.6648, + "step": 1155 + }, + { + "epoch": 0.46, + "grad_norm": 3.693225989563176, + "learning_rate": 1.1695356993627203e-05, + "loss": 3.9464, + "step": 1156 + }, + { + "epoch": 0.46, + "grad_norm": 4.5615429339832385, + "learning_rate": 
1.1682588089548692e-05, + "loss": 3.6917, + "step": 1157 + }, + { + "epoch": 0.46, + "grad_norm": 3.680963349085811, + "learning_rate": 1.1669816361539647e-05, + "loss": 3.7782, + "step": 1158 + }, + { + "epoch": 0.46, + "grad_norm": 3.598393270654093, + "learning_rate": 1.1657041831035186e-05, + "loss": 3.7299, + "step": 1159 + }, + { + "epoch": 0.46, + "grad_norm": 4.466531188320706, + "learning_rate": 1.164426451947513e-05, + "loss": 3.6484, + "step": 1160 + }, + { + "epoch": 0.46, + "grad_norm": 4.035699717288, + "learning_rate": 1.1631484448303964e-05, + "loss": 3.62, + "step": 1161 + }, + { + "epoch": 0.46, + "grad_norm": 4.121097309892934, + "learning_rate": 1.1618701638970815e-05, + "loss": 3.6402, + "step": 1162 + }, + { + "epoch": 0.47, + "grad_norm": 4.323877433118681, + "learning_rate": 1.1605916112929388e-05, + "loss": 3.6339, + "step": 1163 + }, + { + "epoch": 0.47, + "grad_norm": 4.3841070809544185, + "learning_rate": 1.1593127891637968e-05, + "loss": 3.8707, + "step": 1164 + }, + { + "epoch": 0.47, + "grad_norm": 3.361551441812719, + "learning_rate": 1.1580336996559343e-05, + "loss": 3.702, + "step": 1165 + }, + { + "epoch": 0.47, + "grad_norm": 3.073065467204595, + "learning_rate": 1.156754344916081e-05, + "loss": 3.8247, + "step": 1166 + }, + { + "epoch": 0.47, + "grad_norm": 4.520324161669763, + "learning_rate": 1.1554747270914098e-05, + "loss": 3.5686, + "step": 1167 + }, + { + "epoch": 0.47, + "grad_norm": 4.654913388365398, + "learning_rate": 1.1541948483295358e-05, + "loss": 3.8035, + "step": 1168 + }, + { + "epoch": 0.47, + "grad_norm": 3.2461626059038178, + "learning_rate": 1.1529147107785129e-05, + "loss": 3.7129, + "step": 1169 + }, + { + "epoch": 0.47, + "grad_norm": 4.180515399607351, + "learning_rate": 1.151634316586828e-05, + "loss": 3.8563, + "step": 1170 + }, + { + "epoch": 0.47, + "grad_norm": 4.5441403758051315, + "learning_rate": 1.1503536679034e-05, + "loss": 3.6948, + "step": 1171 + }, + { + "epoch": 0.47, + "grad_norm": 4.078399152054442, + "learning_rate": 1.1490727668775735e-05, + "loss": 3.7647, + "step": 1172 + }, + { + "epoch": 0.47, + "grad_norm": 4.17839698744333, + "learning_rate": 1.147791615659118e-05, + "loss": 3.6863, + "step": 1173 + }, + { + "epoch": 0.47, + "grad_norm": 3.411722230592826, + "learning_rate": 1.1465102163982218e-05, + "loss": 3.7029, + "step": 1174 + }, + { + "epoch": 0.47, + "grad_norm": 4.572334871578432, + "learning_rate": 1.1452285712454905e-05, + "loss": 3.7514, + "step": 1175 + }, + { + "epoch": 0.47, + "grad_norm": 4.010392827821349, + "learning_rate": 1.1439466823519414e-05, + "loss": 3.7983, + "step": 1176 + }, + { + "epoch": 0.47, + "grad_norm": 5.15222057397017, + "learning_rate": 1.1426645518690015e-05, + "loss": 3.6849, + "step": 1177 + }, + { + "epoch": 0.47, + "grad_norm": 4.519535347933383, + "learning_rate": 1.1413821819485035e-05, + "loss": 3.6528, + "step": 1178 + }, + { + "epoch": 0.47, + "grad_norm": 4.14465571981964, + "learning_rate": 1.140099574742681e-05, + "loss": 3.8408, + "step": 1179 + }, + { + "epoch": 0.47, + "grad_norm": 3.8855608293576704, + "learning_rate": 1.138816732404167e-05, + "loss": 3.7254, + "step": 1180 + }, + { + "epoch": 0.47, + "grad_norm": 3.66606300571839, + "learning_rate": 1.1375336570859877e-05, + "loss": 3.6236, + "step": 1181 + }, + { + "epoch": 0.47, + "grad_norm": 4.159327154865983, + "learning_rate": 1.136250350941562e-05, + "loss": 3.6226, + "step": 1182 + }, + { + "epoch": 0.47, + "grad_norm": 3.8058559454023757, + "learning_rate": 1.1349668161246945e-05, + "loss": 
3.7024, + "step": 1183 + }, + { + "epoch": 0.47, + "grad_norm": 4.583400400011528, + "learning_rate": 1.1336830547895752e-05, + "loss": 3.5792, + "step": 1184 + }, + { + "epoch": 0.47, + "grad_norm": 3.7655923073287294, + "learning_rate": 1.1323990690907734e-05, + "loss": 3.7676, + "step": 1185 + }, + { + "epoch": 0.47, + "grad_norm": 4.466605227178359, + "learning_rate": 1.1311148611832346e-05, + "loss": 3.7292, + "step": 1186 + }, + { + "epoch": 0.47, + "grad_norm": 3.7647284604385653, + "learning_rate": 1.129830433222278e-05, + "loss": 3.9269, + "step": 1187 + }, + { + "epoch": 0.48, + "grad_norm": 4.733028020978156, + "learning_rate": 1.128545787363592e-05, + "loss": 3.6541, + "step": 1188 + }, + { + "epoch": 0.48, + "grad_norm": 5.784316104630818, + "learning_rate": 1.1272609257632305e-05, + "loss": 3.7619, + "step": 1189 + }, + { + "epoch": 0.48, + "grad_norm": 3.6535503281391337, + "learning_rate": 1.1259758505776092e-05, + "loss": 4.0232, + "step": 1190 + }, + { + "epoch": 0.48, + "grad_norm": 3.9417157274952443, + "learning_rate": 1.1246905639635029e-05, + "loss": 3.8453, + "step": 1191 + }, + { + "epoch": 0.48, + "grad_norm": 3.8028096902179827, + "learning_rate": 1.1234050680780407e-05, + "loss": 3.7516, + "step": 1192 + }, + { + "epoch": 0.48, + "grad_norm": 5.259264025052849, + "learning_rate": 1.1221193650787032e-05, + "loss": 3.823, + "step": 1193 + }, + { + "epoch": 0.48, + "grad_norm": 4.238159290220217, + "learning_rate": 1.1208334571233186e-05, + "loss": 3.9508, + "step": 1194 + }, + { + "epoch": 0.48, + "grad_norm": 3.763516705043185, + "learning_rate": 1.119547346370059e-05, + "loss": 3.6778, + "step": 1195 + }, + { + "epoch": 0.48, + "grad_norm": 3.801301507686709, + "learning_rate": 1.118261034977437e-05, + "loss": 3.5536, + "step": 1196 + }, + { + "epoch": 0.48, + "grad_norm": 4.0995292505809795, + "learning_rate": 1.116974525104302e-05, + "loss": 3.5847, + "step": 1197 + }, + { + "epoch": 0.48, + "grad_norm": 4.052881858623986, + "learning_rate": 1.1156878189098357e-05, + "loss": 3.7869, + "step": 1198 + }, + { + "epoch": 0.48, + "grad_norm": 3.759889351369552, + "learning_rate": 1.114400918553551e-05, + "loss": 3.7752, + "step": 1199 + }, + { + "epoch": 0.48, + "grad_norm": 3.641606335216125, + "learning_rate": 1.1131138261952845e-05, + "loss": 3.7229, + "step": 1200 + }, + { + "epoch": 0.48, + "grad_norm": 3.2624956103832945, + "learning_rate": 1.1118265439951968e-05, + "loss": 3.5944, + "step": 1201 + }, + { + "epoch": 0.48, + "grad_norm": 4.1450940216557415, + "learning_rate": 1.110539074113766e-05, + "loss": 3.7048, + "step": 1202 + }, + { + "epoch": 0.48, + "grad_norm": 3.3447326327359326, + "learning_rate": 1.1092514187117865e-05, + "loss": 3.6985, + "step": 1203 + }, + { + "epoch": 0.48, + "grad_norm": 3.5461055814418447, + "learning_rate": 1.1079635799503625e-05, + "loss": 3.7647, + "step": 1204 + }, + { + "epoch": 0.48, + "grad_norm": 3.3556065313219903, + "learning_rate": 1.1066755599909065e-05, + "loss": 3.734, + "step": 1205 + }, + { + "epoch": 0.48, + "grad_norm": 3.8317618472216592, + "learning_rate": 1.1053873609951362e-05, + "loss": 3.5126, + "step": 1206 + }, + { + "epoch": 0.48, + "grad_norm": 3.8356744970212726, + "learning_rate": 1.1040989851250678e-05, + "loss": 3.8545, + "step": 1207 + }, + { + "epoch": 0.48, + "grad_norm": 3.8078493058296803, + "learning_rate": 1.1028104345430161e-05, + "loss": 3.6943, + "step": 1208 + }, + { + "epoch": 0.48, + "grad_norm": 3.806203159509349, + "learning_rate": 1.1015217114115884e-05, + "loss": 3.5924, + 
"step": 1209 + }, + { + "epoch": 0.48, + "grad_norm": 3.4329163591887006, + "learning_rate": 1.1002328178936813e-05, + "loss": 3.5889, + "step": 1210 + }, + { + "epoch": 0.48, + "grad_norm": 3.2311989382515263, + "learning_rate": 1.0989437561524776e-05, + "loss": 3.8365, + "step": 1211 + }, + { + "epoch": 0.48, + "grad_norm": 3.7876161541916766, + "learning_rate": 1.097654528351443e-05, + "loss": 3.7401, + "step": 1212 + }, + { + "epoch": 0.49, + "grad_norm": 4.450794072226459, + "learning_rate": 1.0963651366543214e-05, + "loss": 3.6452, + "step": 1213 + }, + { + "epoch": 0.49, + "grad_norm": 3.4025224897119974, + "learning_rate": 1.095075583225131e-05, + "loss": 3.7985, + "step": 1214 + }, + { + "epoch": 0.49, + "grad_norm": 4.266609502557084, + "learning_rate": 1.0937858702281631e-05, + "loss": 3.653, + "step": 1215 + }, + { + "epoch": 0.49, + "grad_norm": 4.352530022697998, + "learning_rate": 1.0924959998279754e-05, + "loss": 3.5712, + "step": 1216 + }, + { + "epoch": 0.49, + "grad_norm": 4.7920904023153685, + "learning_rate": 1.0912059741893908e-05, + "loss": 3.5353, + "step": 1217 + }, + { + "epoch": 0.49, + "grad_norm": 4.27009447990584, + "learning_rate": 1.089915795477492e-05, + "loss": 3.6817, + "step": 1218 + }, + { + "epoch": 0.49, + "grad_norm": 4.093104784383785, + "learning_rate": 1.0886254658576186e-05, + "loss": 3.5539, + "step": 1219 + }, + { + "epoch": 0.49, + "grad_norm": 5.121766404390022, + "learning_rate": 1.087334987495364e-05, + "loss": 3.6973, + "step": 1220 + }, + { + "epoch": 0.49, + "grad_norm": 3.6097287264286777, + "learning_rate": 1.0860443625565712e-05, + "loss": 3.7054, + "step": 1221 + }, + { + "epoch": 0.49, + "grad_norm": 4.099101696343389, + "learning_rate": 1.0847535932073288e-05, + "loss": 3.821, + "step": 1222 + }, + { + "epoch": 0.49, + "grad_norm": 3.7239613892649963, + "learning_rate": 1.0834626816139678e-05, + "loss": 3.8649, + "step": 1223 + }, + { + "epoch": 0.49, + "grad_norm": 3.927013781158877, + "learning_rate": 1.0821716299430577e-05, + "loss": 3.7919, + "step": 1224 + }, + { + "epoch": 0.49, + "grad_norm": 3.926068050669173, + "learning_rate": 1.0808804403614044e-05, + "loss": 3.7943, + "step": 1225 + }, + { + "epoch": 0.49, + "grad_norm": 3.9343822616688335, + "learning_rate": 1.0795891150360435e-05, + "loss": 3.6685, + "step": 1226 + }, + { + "epoch": 0.49, + "grad_norm": 3.816742376232799, + "learning_rate": 1.0782976561342398e-05, + "loss": 3.5803, + "step": 1227 + }, + { + "epoch": 0.49, + "grad_norm": 3.3950844811577676, + "learning_rate": 1.0770060658234815e-05, + "loss": 3.6176, + "step": 1228 + }, + { + "epoch": 0.49, + "grad_norm": 3.9333804501497704, + "learning_rate": 1.0757143462714777e-05, + "loss": 3.5919, + "step": 1229 + }, + { + "epoch": 0.49, + "grad_norm": 4.418557233366642, + "learning_rate": 1.0744224996461541e-05, + "loss": 3.6716, + "step": 1230 + }, + { + "epoch": 0.49, + "grad_norm": 3.7898331498735236, + "learning_rate": 1.0731305281156499e-05, + "loss": 3.6412, + "step": 1231 + }, + { + "epoch": 0.49, + "grad_norm": 3.7969582329269076, + "learning_rate": 1.0718384338483141e-05, + "loss": 3.7543, + "step": 1232 + }, + { + "epoch": 0.49, + "grad_norm": 4.148636874878519, + "learning_rate": 1.0705462190127011e-05, + "loss": 3.7497, + "step": 1233 + }, + { + "epoch": 0.49, + "grad_norm": 3.3035604956864364, + "learning_rate": 1.0692538857775685e-05, + "loss": 3.8701, + "step": 1234 + }, + { + "epoch": 0.49, + "grad_norm": 4.008854288704277, + "learning_rate": 1.0679614363118718e-05, + "loss": 3.4629, + "step": 1235 
+ }, + { + "epoch": 0.49, + "grad_norm": 3.8217933247978877, + "learning_rate": 1.066668872784762e-05, + "loss": 3.599, + "step": 1236 + }, + { + "epoch": 0.49, + "grad_norm": 3.505699888668807, + "learning_rate": 1.0653761973655819e-05, + "loss": 3.61, + "step": 1237 + }, + { + "epoch": 0.5, + "grad_norm": 3.630452722607848, + "learning_rate": 1.0640834122238606e-05, + "loss": 3.6772, + "step": 1238 + }, + { + "epoch": 0.5, + "grad_norm": 3.4696551759013063, + "learning_rate": 1.0627905195293135e-05, + "loss": 3.8177, + "step": 1239 + }, + { + "epoch": 0.5, + "grad_norm": 3.600962380822384, + "learning_rate": 1.061497521451835e-05, + "loss": 3.7467, + "step": 1240 + }, + { + "epoch": 0.5, + "grad_norm": 3.5615429684192623, + "learning_rate": 1.0602044201614965e-05, + "loss": 3.7444, + "step": 1241 + }, + { + "epoch": 0.5, + "grad_norm": 3.6893702737397125, + "learning_rate": 1.0589112178285432e-05, + "loss": 3.7873, + "step": 1242 + }, + { + "epoch": 0.5, + "grad_norm": 4.088874847636349, + "learning_rate": 1.0576179166233895e-05, + "loss": 3.7347, + "step": 1243 + }, + { + "epoch": 0.5, + "grad_norm": 3.6194124320453978, + "learning_rate": 1.056324518716616e-05, + "loss": 3.6616, + "step": 1244 + }, + { + "epoch": 0.5, + "grad_norm": 3.709738290667363, + "learning_rate": 1.055031026278965e-05, + "loss": 3.5815, + "step": 1245 + }, + { + "epoch": 0.5, + "grad_norm": 4.3126884197713675, + "learning_rate": 1.0537374414813384e-05, + "loss": 3.7129, + "step": 1246 + }, + { + "epoch": 0.5, + "grad_norm": 4.126610206839559, + "learning_rate": 1.0524437664947918e-05, + "loss": 3.8666, + "step": 1247 + }, + { + "epoch": 0.5, + "grad_norm": 4.228485912654699, + "learning_rate": 1.051150003490534e-05, + "loss": 3.8475, + "step": 1248 + }, + { + "epoch": 0.5, + "grad_norm": 4.414002348692545, + "learning_rate": 1.0498561546399194e-05, + "loss": 3.7105, + "step": 1249 + }, + { + "epoch": 0.5, + "grad_norm": 3.544218783563698, + "learning_rate": 1.0485622221144485e-05, + "loss": 3.6683, + "step": 1250 + }, + { + "epoch": 0.5, + "grad_norm": 3.6293603164028023, + "learning_rate": 1.0472682080857606e-05, + "loss": 3.7261, + "step": 1251 + }, + { + "epoch": 0.5, + "grad_norm": 4.119603916787359, + "learning_rate": 1.0459741147256325e-05, + "loss": 3.645, + "step": 1252 + }, + { + "epoch": 0.5, + "grad_norm": 3.309856421704084, + "learning_rate": 1.044679944205975e-05, + "loss": 3.7916, + "step": 1253 + }, + { + "epoch": 0.5, + "grad_norm": 3.3126208630112717, + "learning_rate": 1.043385698698826e-05, + "loss": 3.5428, + "step": 1254 + }, + { + "epoch": 0.5, + "grad_norm": 3.435020152190706, + "learning_rate": 1.0420913803763522e-05, + "loss": 3.6746, + "step": 1255 + }, + { + "epoch": 0.5, + "grad_norm": 3.442250663980389, + "learning_rate": 1.04079699141084e-05, + "loss": 3.8303, + "step": 1256 + }, + { + "epoch": 0.5, + "grad_norm": 3.6876733935381405, + "learning_rate": 1.0395025339746965e-05, + "loss": 3.7032, + "step": 1257 + }, + { + "epoch": 0.5, + "grad_norm": 3.680657228628987, + "learning_rate": 1.0382080102404417e-05, + "loss": 3.6895, + "step": 1258 + }, + { + "epoch": 0.5, + "grad_norm": 3.4319284847026, + "learning_rate": 1.0369134223807082e-05, + "loss": 3.5902, + "step": 1259 + }, + { + "epoch": 0.5, + "grad_norm": 3.146045404231594, + "learning_rate": 1.0356187725682359e-05, + "loss": 3.7994, + "step": 1260 + }, + { + "epoch": 0.5, + "grad_norm": 3.279354700190127, + "learning_rate": 1.0343240629758683e-05, + "loss": 3.6323, + "step": 1261 + }, + { + "epoch": 0.5, + "grad_norm": 
3.6423447713494674, + "learning_rate": 1.0330292957765502e-05, + "loss": 3.7522, + "step": 1262 + }, + { + "epoch": 0.51, + "grad_norm": 3.9929601866704547, + "learning_rate": 1.0317344731433217e-05, + "loss": 3.6607, + "step": 1263 + }, + { + "epoch": 0.51, + "grad_norm": 3.884660944168367, + "learning_rate": 1.0304395972493172e-05, + "loss": 3.9169, + "step": 1264 + }, + { + "epoch": 0.51, + "grad_norm": 3.4149693145698077, + "learning_rate": 1.0291446702677598e-05, + "loss": 3.6962, + "step": 1265 + }, + { + "epoch": 0.51, + "grad_norm": 3.45271671752356, + "learning_rate": 1.0278496943719585e-05, + "loss": 3.7441, + "step": 1266 + }, + { + "epoch": 0.51, + "grad_norm": 3.6133398318052428, + "learning_rate": 1.0265546717353041e-05, + "loss": 3.683, + "step": 1267 + }, + { + "epoch": 0.51, + "grad_norm": 3.371173249364838, + "learning_rate": 1.0252596045312666e-05, + "loss": 3.6712, + "step": 1268 + }, + { + "epoch": 0.51, + "grad_norm": 3.83144634459696, + "learning_rate": 1.02396449493339e-05, + "loss": 3.6621, + "step": 1269 + }, + { + "epoch": 0.51, + "grad_norm": 3.5042655204060793, + "learning_rate": 1.02266934511529e-05, + "loss": 3.6081, + "step": 1270 + }, + { + "epoch": 0.51, + "grad_norm": 3.901785995386197, + "learning_rate": 1.0213741572506497e-05, + "loss": 3.5902, + "step": 1271 + }, + { + "epoch": 0.51, + "grad_norm": 3.597912776915605, + "learning_rate": 1.0200789335132157e-05, + "loss": 3.6509, + "step": 1272 + }, + { + "epoch": 0.51, + "grad_norm": 3.576944142026537, + "learning_rate": 1.0187836760767954e-05, + "loss": 3.6438, + "step": 1273 + }, + { + "epoch": 0.51, + "grad_norm": 4.171711826080111, + "learning_rate": 1.0174883871152517e-05, + "loss": 3.6199, + "step": 1274 + }, + { + "epoch": 0.51, + "grad_norm": 3.997960682784523, + "learning_rate": 1.0161930688025018e-05, + "loss": 3.597, + "step": 1275 + }, + { + "epoch": 0.51, + "grad_norm": 4.118551729202294, + "learning_rate": 1.014897723312511e-05, + "loss": 3.7327, + "step": 1276 + }, + { + "epoch": 0.51, + "grad_norm": 4.115281655837042, + "learning_rate": 1.013602352819291e-05, + "loss": 3.6384, + "step": 1277 + }, + { + "epoch": 0.51, + "grad_norm": 3.8326653216963527, + "learning_rate": 1.0123069594968952e-05, + "loss": 3.6624, + "step": 1278 + }, + { + "epoch": 0.51, + "grad_norm": 6.420306455859883, + "learning_rate": 1.0110115455194157e-05, + "loss": 3.6849, + "step": 1279 + }, + { + "epoch": 0.51, + "grad_norm": 3.8078583089194438, + "learning_rate": 1.0097161130609774e-05, + "loss": 3.6237, + "step": 1280 + }, + { + "epoch": 0.51, + "grad_norm": 3.4296770703932693, + "learning_rate": 1.0084206642957393e-05, + "loss": 3.772, + "step": 1281 + }, + { + "epoch": 0.51, + "grad_norm": 4.127642158329446, + "learning_rate": 1.0071252013978852e-05, + "loss": 3.571, + "step": 1282 + }, + { + "epoch": 0.51, + "grad_norm": 3.7661312311309287, + "learning_rate": 1.0058297265416234e-05, + "loss": 3.6234, + "step": 1283 + }, + { + "epoch": 0.51, + "grad_norm": 3.543027913063287, + "learning_rate": 1.0045342419011832e-05, + "loss": 3.5259, + "step": 1284 + }, + { + "epoch": 0.51, + "grad_norm": 4.157795877165396, + "learning_rate": 1.003238749650809e-05, + "loss": 3.5826, + "step": 1285 + }, + { + "epoch": 0.51, + "grad_norm": 3.9799792666825966, + "learning_rate": 1.0019432519647585e-05, + "loss": 3.6644, + "step": 1286 + }, + { + "epoch": 0.51, + "grad_norm": 3.922820795521154, + "learning_rate": 1.0006477510172984e-05, + "loss": 3.7645, + "step": 1287 + }, + { + "epoch": 0.52, + "grad_norm": 3.2795127075695625, + 
"learning_rate": 9.993522489827016e-06, + "loss": 3.7082, + "step": 1288 + }, + { + "epoch": 0.52, + "grad_norm": 4.160932501828583, + "learning_rate": 9.980567480352417e-06, + "loss": 3.7132, + "step": 1289 + }, + { + "epoch": 0.52, + "grad_norm": 3.9551672926611743, + "learning_rate": 9.967612503491915e-06, + "loss": 3.6536, + "step": 1290 + }, + { + "epoch": 0.52, + "grad_norm": 3.567688655247914, + "learning_rate": 9.954657580988171e-06, + "loss": 3.6158, + "step": 1291 + }, + { + "epoch": 0.52, + "grad_norm": 4.504476394633079, + "learning_rate": 9.941702734583771e-06, + "loss": 3.6681, + "step": 1292 + }, + { + "epoch": 0.52, + "grad_norm": 3.8738546664048155, + "learning_rate": 9.928747986021153e-06, + "loss": 3.9168, + "step": 1293 + }, + { + "epoch": 0.52, + "grad_norm": 3.681866936469431, + "learning_rate": 9.91579335704261e-06, + "loss": 3.5911, + "step": 1294 + }, + { + "epoch": 0.52, + "grad_norm": 3.8543107213799708, + "learning_rate": 9.90283886939023e-06, + "loss": 3.649, + "step": 1295 + }, + { + "epoch": 0.52, + "grad_norm": 4.213819491800523, + "learning_rate": 9.88988454480585e-06, + "loss": 3.6681, + "step": 1296 + }, + { + "epoch": 0.52, + "grad_norm": 3.672911779908602, + "learning_rate": 9.876930405031047e-06, + "loss": 3.4539, + "step": 1297 + }, + { + "epoch": 0.52, + "grad_norm": 4.242243787102731, + "learning_rate": 9.86397647180709e-06, + "loss": 3.6487, + "step": 1298 + }, + { + "epoch": 0.52, + "grad_norm": 3.2004720405539255, + "learning_rate": 9.851022766874892e-06, + "loss": 3.8127, + "step": 1299 + }, + { + "epoch": 0.52, + "grad_norm": 4.09162308157697, + "learning_rate": 9.838069311974986e-06, + "loss": 3.6898, + "step": 1300 + }, + { + "epoch": 0.52, + "grad_norm": 3.548750561025053, + "learning_rate": 9.825116128847488e-06, + "loss": 3.6147, + "step": 1301 + }, + { + "epoch": 0.52, + "grad_norm": 3.8328336344463603, + "learning_rate": 9.812163239232051e-06, + "loss": 3.5539, + "step": 1302 + }, + { + "epoch": 0.52, + "grad_norm": 3.2443184038824855, + "learning_rate": 9.799210664867844e-06, + "loss": 3.6883, + "step": 1303 + }, + { + "epoch": 0.52, + "grad_norm": 3.878823223982976, + "learning_rate": 9.786258427493505e-06, + "loss": 3.823, + "step": 1304 + }, + { + "epoch": 0.52, + "grad_norm": 3.4116361274295186, + "learning_rate": 9.773306548847102e-06, + "loss": 3.578, + "step": 1305 + }, + { + "epoch": 0.52, + "grad_norm": 4.24034110874559, + "learning_rate": 9.760355050666102e-06, + "loss": 3.727, + "step": 1306 + }, + { + "epoch": 0.52, + "grad_norm": 4.634306245889952, + "learning_rate": 9.747403954687334e-06, + "loss": 3.7736, + "step": 1307 + }, + { + "epoch": 0.52, + "grad_norm": 3.749776046989676, + "learning_rate": 9.734453282646962e-06, + "loss": 3.6961, + "step": 1308 + }, + { + "epoch": 0.52, + "grad_norm": 3.932417095522838, + "learning_rate": 9.721503056280418e-06, + "loss": 3.6418, + "step": 1309 + }, + { + "epoch": 0.52, + "grad_norm": 4.542879382334435, + "learning_rate": 9.708553297322407e-06, + "loss": 3.5859, + "step": 1310 + }, + { + "epoch": 0.52, + "grad_norm": 3.5258395671961127, + "learning_rate": 9.69560402750683e-06, + "loss": 3.7562, + "step": 1311 + }, + { + "epoch": 0.52, + "grad_norm": 4.135406118551599, + "learning_rate": 9.682655268566783e-06, + "loss": 3.5696, + "step": 1312 + }, + { + "epoch": 0.53, + "grad_norm": 3.8528419671684597, + "learning_rate": 9.669707042234502e-06, + "loss": 3.6807, + "step": 1313 + }, + { + "epoch": 0.53, + "grad_norm": 4.201555676577082, + "learning_rate": 9.656759370241318e-06, + 
"loss": 3.5825, + "step": 1314 + }, + { + "epoch": 0.53, + "grad_norm": 3.64562077172273, + "learning_rate": 9.643812274317644e-06, + "loss": 3.4712, + "step": 1315 + }, + { + "epoch": 0.53, + "grad_norm": 3.8396792440839103, + "learning_rate": 9.630865776192918e-06, + "loss": 3.7105, + "step": 1316 + }, + { + "epoch": 0.53, + "grad_norm": 4.4577250263526444, + "learning_rate": 9.617919897595586e-06, + "loss": 3.5812, + "step": 1317 + }, + { + "epoch": 0.53, + "grad_norm": 3.39166849313604, + "learning_rate": 9.604974660253039e-06, + "loss": 3.5843, + "step": 1318 + }, + { + "epoch": 0.53, + "grad_norm": 3.1918890159235995, + "learning_rate": 9.592030085891602e-06, + "loss": 3.6049, + "step": 1319 + }, + { + "epoch": 0.53, + "grad_norm": 3.664646495100284, + "learning_rate": 9.579086196236483e-06, + "loss": 3.6161, + "step": 1320 + }, + { + "epoch": 0.53, + "grad_norm": 4.318782755233096, + "learning_rate": 9.56614301301174e-06, + "loss": 3.8907, + "step": 1321 + }, + { + "epoch": 0.53, + "grad_norm": 3.1588859391321322, + "learning_rate": 9.553200557940254e-06, + "loss": 3.7873, + "step": 1322 + }, + { + "epoch": 0.53, + "grad_norm": 3.967215300253095, + "learning_rate": 9.540258852743676e-06, + "loss": 3.5864, + "step": 1323 + }, + { + "epoch": 0.53, + "grad_norm": 3.941485118974778, + "learning_rate": 9.527317919142398e-06, + "loss": 3.9642, + "step": 1324 + }, + { + "epoch": 0.53, + "grad_norm": 3.592458924548226, + "learning_rate": 9.514377778855521e-06, + "loss": 3.3537, + "step": 1325 + }, + { + "epoch": 0.53, + "grad_norm": 3.7732706931466202, + "learning_rate": 9.501438453600808e-06, + "loss": 3.6944, + "step": 1326 + }, + { + "epoch": 0.53, + "grad_norm": 3.7412543307988915, + "learning_rate": 9.488499965094664e-06, + "loss": 3.6244, + "step": 1327 + }, + { + "epoch": 0.53, + "grad_norm": 3.9239890999288694, + "learning_rate": 9.475562335052086e-06, + "loss": 3.5095, + "step": 1328 + }, + { + "epoch": 0.53, + "grad_norm": 3.5112467552110247, + "learning_rate": 9.462625585186621e-06, + "loss": 3.7438, + "step": 1329 + }, + { + "epoch": 0.53, + "grad_norm": 3.6543160157821206, + "learning_rate": 9.449689737210352e-06, + "loss": 3.6419, + "step": 1330 + }, + { + "epoch": 0.53, + "grad_norm": 3.988862136866014, + "learning_rate": 9.436754812833843e-06, + "loss": 3.6131, + "step": 1331 + }, + { + "epoch": 0.53, + "grad_norm": 3.3299291715841157, + "learning_rate": 9.423820833766108e-06, + "loss": 3.7091, + "step": 1332 + }, + { + "epoch": 0.53, + "grad_norm": 3.9403033777529255, + "learning_rate": 9.410887821714571e-06, + "loss": 3.6045, + "step": 1333 + }, + { + "epoch": 0.53, + "grad_norm": 3.8157108095673604, + "learning_rate": 9.39795579838504e-06, + "loss": 3.5397, + "step": 1334 + }, + { + "epoch": 0.53, + "grad_norm": 3.2338840832210667, + "learning_rate": 9.385024785481653e-06, + "loss": 3.6673, + "step": 1335 + }, + { + "epoch": 0.53, + "grad_norm": 3.6445781586174437, + "learning_rate": 9.372094804706867e-06, + "loss": 3.5268, + "step": 1336 + }, + { + "epoch": 0.53, + "grad_norm": 3.9066716946435434, + "learning_rate": 9.359165877761396e-06, + "loss": 3.6854, + "step": 1337 + }, + { + "epoch": 0.54, + "grad_norm": 3.2966925584112547, + "learning_rate": 9.346238026344186e-06, + "loss": 3.5206, + "step": 1338 + }, + { + "epoch": 0.54, + "grad_norm": 3.299893845106976, + "learning_rate": 9.333311272152385e-06, + "loss": 3.6807, + "step": 1339 + }, + { + "epoch": 0.54, + "grad_norm": 3.899885401635834, + "learning_rate": 9.320385636881283e-06, + "loss": 3.5119, + "step": 1340 + 
}, + { + "epoch": 0.54, + "grad_norm": 4.026905163814134, + "learning_rate": 9.307461142224318e-06, + "loss": 3.442, + "step": 1341 + }, + { + "epoch": 0.54, + "grad_norm": 3.880216888977279, + "learning_rate": 9.29453780987299e-06, + "loss": 3.5532, + "step": 1342 + }, + { + "epoch": 0.54, + "grad_norm": 4.387280773854988, + "learning_rate": 9.281615661516866e-06, + "loss": 3.7189, + "step": 1343 + }, + { + "epoch": 0.54, + "grad_norm": 3.6555244517963943, + "learning_rate": 9.268694718843503e-06, + "loss": 3.6117, + "step": 1344 + }, + { + "epoch": 0.54, + "grad_norm": 4.277666315779683, + "learning_rate": 9.255775003538462e-06, + "loss": 3.4797, + "step": 1345 + }, + { + "epoch": 0.54, + "grad_norm": 3.6156687964961898, + "learning_rate": 9.242856537285227e-06, + "loss": 3.6104, + "step": 1346 + }, + { + "epoch": 0.54, + "grad_norm": 3.5831485134788332, + "learning_rate": 9.229939341765188e-06, + "loss": 3.7844, + "step": 1347 + }, + { + "epoch": 0.54, + "grad_norm": 3.6448623997034795, + "learning_rate": 9.217023438657606e-06, + "loss": 3.483, + "step": 1348 + }, + { + "epoch": 0.54, + "grad_norm": 4.37271213698544, + "learning_rate": 9.204108849639565e-06, + "loss": 3.7646, + "step": 1349 + }, + { + "epoch": 0.54, + "grad_norm": 3.945295172360582, + "learning_rate": 9.19119559638596e-06, + "loss": 3.5386, + "step": 1350 + }, + { + "epoch": 0.54, + "grad_norm": 3.2435047963682893, + "learning_rate": 9.178283700569424e-06, + "loss": 3.5083, + "step": 1351 + }, + { + "epoch": 0.54, + "grad_norm": 3.6441695000819863, + "learning_rate": 9.165373183860329e-06, + "loss": 3.6747, + "step": 1352 + }, + { + "epoch": 0.54, + "grad_norm": 3.758895378834221, + "learning_rate": 9.152464067926717e-06, + "loss": 3.5133, + "step": 1353 + }, + { + "epoch": 0.54, + "grad_norm": 3.8045119099664175, + "learning_rate": 9.139556374434288e-06, + "loss": 3.6009, + "step": 1354 + }, + { + "epoch": 0.54, + "grad_norm": 3.862260842586843, + "learning_rate": 9.126650125046361e-06, + "loss": 3.4762, + "step": 1355 + }, + { + "epoch": 0.54, + "grad_norm": 3.5145809371669103, + "learning_rate": 9.113745341423816e-06, + "loss": 3.6713, + "step": 1356 + }, + { + "epoch": 0.54, + "grad_norm": 3.828220691263425, + "learning_rate": 9.100842045225084e-06, + "loss": 3.7191, + "step": 1357 + }, + { + "epoch": 0.54, + "grad_norm": 3.774746236211917, + "learning_rate": 9.087940258106093e-06, + "loss": 3.4438, + "step": 1358 + }, + { + "epoch": 0.54, + "grad_norm": 3.7795138083033657, + "learning_rate": 9.075040001720247e-06, + "loss": 3.4842, + "step": 1359 + }, + { + "epoch": 0.54, + "grad_norm": 3.94093397333636, + "learning_rate": 9.062141297718372e-06, + "loss": 3.6428, + "step": 1360 + }, + { + "epoch": 0.54, + "grad_norm": 4.471037661246279, + "learning_rate": 9.049244167748694e-06, + "loss": 3.5629, + "step": 1361 + }, + { + "epoch": 0.54, + "grad_norm": 3.392239101547643, + "learning_rate": 9.036348633456791e-06, + "loss": 3.5099, + "step": 1362 + }, + { + "epoch": 0.55, + "grad_norm": 3.437481937306805, + "learning_rate": 9.023454716485572e-06, + "loss": 3.6523, + "step": 1363 + }, + { + "epoch": 0.55, + "grad_norm": 4.716482112384828, + "learning_rate": 9.010562438475225e-06, + "loss": 3.6393, + "step": 1364 + }, + { + "epoch": 0.55, + "grad_norm": 4.20173417003461, + "learning_rate": 8.99767182106319e-06, + "loss": 3.4431, + "step": 1365 + }, + { + "epoch": 0.55, + "grad_norm": 3.601247359299966, + "learning_rate": 8.984782885884119e-06, + "loss": 3.5896, + "step": 1366 + }, + { + "epoch": 0.55, + "grad_norm": 
3.4135121264981185, + "learning_rate": 8.971895654569842e-06, + "loss": 3.7265, + "step": 1367 + }, + { + "epoch": 0.55, + "grad_norm": 3.9001461297693223, + "learning_rate": 8.959010148749324e-06, + "loss": 3.5086, + "step": 1368 + }, + { + "epoch": 0.55, + "grad_norm": 4.03454895841195, + "learning_rate": 8.94612639004864e-06, + "loss": 3.677, + "step": 1369 + }, + { + "epoch": 0.55, + "grad_norm": 3.650697251071297, + "learning_rate": 8.933244400090937e-06, + "loss": 3.6443, + "step": 1370 + }, + { + "epoch": 0.55, + "grad_norm": 3.46434005555104, + "learning_rate": 8.92036420049638e-06, + "loss": 3.5778, + "step": 1371 + }, + { + "epoch": 0.55, + "grad_norm": 3.93551298155282, + "learning_rate": 8.907485812882137e-06, + "loss": 3.444, + "step": 1372 + }, + { + "epoch": 0.55, + "grad_norm": 4.137595514523684, + "learning_rate": 8.89460925886234e-06, + "loss": 3.5835, + "step": 1373 + }, + { + "epoch": 0.55, + "grad_norm": 3.1221207733061735, + "learning_rate": 8.881734560048037e-06, + "loss": 3.4763, + "step": 1374 + }, + { + "epoch": 0.55, + "grad_norm": 3.6063693650856203, + "learning_rate": 8.868861738047158e-06, + "loss": 3.512, + "step": 1375 + }, + { + "epoch": 0.55, + "grad_norm": 3.8006663498045854, + "learning_rate": 8.855990814464497e-06, + "loss": 3.6161, + "step": 1376 + }, + { + "epoch": 0.55, + "grad_norm": 3.689346473741649, + "learning_rate": 8.843121810901643e-06, + "loss": 3.4761, + "step": 1377 + }, + { + "epoch": 0.55, + "grad_norm": 3.5443205545427614, + "learning_rate": 8.830254748956983e-06, + "loss": 3.5129, + "step": 1378 + }, + { + "epoch": 0.55, + "grad_norm": 4.461742310621672, + "learning_rate": 8.817389650225631e-06, + "loss": 3.5642, + "step": 1379 + }, + { + "epoch": 0.55, + "grad_norm": 4.0981378966007, + "learning_rate": 8.804526536299413e-06, + "loss": 3.5141, + "step": 1380 + }, + { + "epoch": 0.55, + "grad_norm": 3.8550298590996155, + "learning_rate": 8.79166542876682e-06, + "loss": 3.5965, + "step": 1381 + }, + { + "epoch": 0.55, + "grad_norm": 4.045038429383355, + "learning_rate": 8.778806349212968e-06, + "loss": 3.6084, + "step": 1382 + }, + { + "epoch": 0.55, + "grad_norm": 3.955366018201757, + "learning_rate": 8.765949319219595e-06, + "loss": 3.5752, + "step": 1383 + }, + { + "epoch": 0.55, + "grad_norm": 3.457754380490334, + "learning_rate": 8.753094360364973e-06, + "loss": 3.7728, + "step": 1384 + }, + { + "epoch": 0.55, + "grad_norm": 3.786851175476542, + "learning_rate": 8.740241494223911e-06, + "loss": 3.5675, + "step": 1385 + }, + { + "epoch": 0.55, + "grad_norm": 3.660440730641834, + "learning_rate": 8.727390742367698e-06, + "loss": 3.6431, + "step": 1386 + }, + { + "epoch": 0.55, + "grad_norm": 4.084737719287929, + "learning_rate": 8.71454212636408e-06, + "loss": 3.5924, + "step": 1387 + }, + { + "epoch": 0.56, + "grad_norm": 3.2622905116568877, + "learning_rate": 8.701695667777221e-06, + "loss": 3.7183, + "step": 1388 + }, + { + "epoch": 0.56, + "grad_norm": 3.55476228024875, + "learning_rate": 8.688851388167658e-06, + "loss": 3.4861, + "step": 1389 + }, + { + "epoch": 0.56, + "grad_norm": 3.6473137503472546, + "learning_rate": 8.676009309092273e-06, + "loss": 3.6158, + "step": 1390 + }, + { + "epoch": 0.56, + "grad_norm": 3.7945829817862524, + "learning_rate": 8.663169452104248e-06, + "loss": 3.6302, + "step": 1391 + }, + { + "epoch": 0.56, + "grad_norm": 3.640805329322732, + "learning_rate": 8.650331838753057e-06, + "loss": 3.7547, + "step": 1392 + }, + { + "epoch": 0.56, + "grad_norm": 3.8124210910003455, + "learning_rate": 
8.637496490584385e-06, + "loss": 3.5456, + "step": 1393 + }, + { + "epoch": 0.56, + "grad_norm": 3.9704591066995767, + "learning_rate": 8.624663429140128e-06, + "loss": 3.4268, + "step": 1394 + }, + { + "epoch": 0.56, + "grad_norm": 4.049723146662677, + "learning_rate": 8.611832675958335e-06, + "loss": 3.5364, + "step": 1395 + }, + { + "epoch": 0.56, + "grad_norm": 3.533547042548108, + "learning_rate": 8.599004252573191e-06, + "loss": 3.6708, + "step": 1396 + }, + { + "epoch": 0.56, + "grad_norm": 4.58267549077645, + "learning_rate": 8.586178180514968e-06, + "loss": 3.5749, + "step": 1397 + }, + { + "epoch": 0.56, + "grad_norm": 4.9116869644342795, + "learning_rate": 8.573354481309986e-06, + "loss": 3.5463, + "step": 1398 + }, + { + "epoch": 0.56, + "grad_norm": 4.055182017276201, + "learning_rate": 8.560533176480588e-06, + "loss": 3.8375, + "step": 1399 + }, + { + "epoch": 0.56, + "grad_norm": 3.627516942271804, + "learning_rate": 8.5477142875451e-06, + "loss": 3.5717, + "step": 1400 + }, + { + "epoch": 0.56, + "grad_norm": 4.812255836731826, + "learning_rate": 8.534897836017784e-06, + "loss": 3.5476, + "step": 1401 + }, + { + "epoch": 0.56, + "grad_norm": 5.078113719579199, + "learning_rate": 8.522083843408823e-06, + "loss": 3.6681, + "step": 1402 + }, + { + "epoch": 0.56, + "grad_norm": 3.5598802470842936, + "learning_rate": 8.50927233122427e-06, + "loss": 3.3416, + "step": 1403 + }, + { + "epoch": 0.56, + "grad_norm": 3.648309157965994, + "learning_rate": 8.496463320966004e-06, + "loss": 3.5791, + "step": 1404 + }, + { + "epoch": 0.56, + "grad_norm": 3.7101771856350076, + "learning_rate": 8.48365683413172e-06, + "loss": 3.674, + "step": 1405 + }, + { + "epoch": 0.56, + "grad_norm": 4.9313354547428485, + "learning_rate": 8.470852892214875e-06, + "loss": 3.6009, + "step": 1406 + }, + { + "epoch": 0.56, + "grad_norm": 3.395437754771472, + "learning_rate": 8.458051516704644e-06, + "loss": 3.6776, + "step": 1407 + }, + { + "epoch": 0.56, + "grad_norm": 3.269641226339698, + "learning_rate": 8.445252729085907e-06, + "loss": 3.5882, + "step": 1408 + }, + { + "epoch": 0.56, + "grad_norm": 3.238326516283703, + "learning_rate": 8.432456550839196e-06, + "loss": 3.8776, + "step": 1409 + }, + { + "epoch": 0.56, + "grad_norm": 3.850496269302347, + "learning_rate": 8.419663003440657e-06, + "loss": 3.5365, + "step": 1410 + }, + { + "epoch": 0.56, + "grad_norm": 3.651812598740248, + "learning_rate": 8.406872108362034e-06, + "loss": 3.6866, + "step": 1411 + }, + { + "epoch": 0.56, + "grad_norm": 4.310329473688571, + "learning_rate": 8.394083887070614e-06, + "loss": 3.5812, + "step": 1412 + }, + { + "epoch": 0.57, + "grad_norm": 3.1603001545864196, + "learning_rate": 8.38129836102919e-06, + "loss": 3.6525, + "step": 1413 + }, + { + "epoch": 0.57, + "grad_norm": 4.259331885962401, + "learning_rate": 8.36851555169604e-06, + "loss": 3.3805, + "step": 1414 + }, + { + "epoch": 0.57, + "grad_norm": 3.6620436971814896, + "learning_rate": 8.355735480524874e-06, + "loss": 3.5445, + "step": 1415 + }, + { + "epoch": 0.57, + "grad_norm": 4.124009902890339, + "learning_rate": 8.342958168964816e-06, + "loss": 3.6279, + "step": 1416 + }, + { + "epoch": 0.57, + "grad_norm": 3.515479094471101, + "learning_rate": 8.330183638460356e-06, + "loss": 3.3126, + "step": 1417 + }, + { + "epoch": 0.57, + "grad_norm": 3.510007177496425, + "learning_rate": 8.317411910451313e-06, + "loss": 3.5948, + "step": 1418 + }, + { + "epoch": 0.57, + "grad_norm": 3.9870743550694567, + "learning_rate": 8.304643006372797e-06, + "loss": 3.5617, + 
"step": 1419 + }, + { + "epoch": 0.57, + "grad_norm": 3.31871331046679, + "learning_rate": 8.291876947655197e-06, + "loss": 3.6182, + "step": 1420 + }, + { + "epoch": 0.57, + "grad_norm": 3.657244073140862, + "learning_rate": 8.27911375572411e-06, + "loss": 3.4358, + "step": 1421 + }, + { + "epoch": 0.57, + "grad_norm": 3.6885031118944838, + "learning_rate": 8.266353452000326e-06, + "loss": 3.4941, + "step": 1422 + }, + { + "epoch": 0.57, + "grad_norm": 3.4968539016285725, + "learning_rate": 8.253596057899788e-06, + "loss": 3.4569, + "step": 1423 + }, + { + "epoch": 0.57, + "grad_norm": 4.241081464957698, + "learning_rate": 8.240841594833554e-06, + "loss": 3.5976, + "step": 1424 + }, + { + "epoch": 0.57, + "grad_norm": 3.992209502684881, + "learning_rate": 8.228090084207773e-06, + "loss": 3.5895, + "step": 1425 + }, + { + "epoch": 0.57, + "grad_norm": 3.6248505694886615, + "learning_rate": 8.215341547423624e-06, + "loss": 3.7637, + "step": 1426 + }, + { + "epoch": 0.57, + "grad_norm": 3.548230202941458, + "learning_rate": 8.202596005877307e-06, + "loss": 3.6086, + "step": 1427 + }, + { + "epoch": 0.57, + "grad_norm": 3.8281555139288903, + "learning_rate": 8.189853480959982e-06, + "loss": 3.6207, + "step": 1428 + }, + { + "epoch": 0.57, + "grad_norm": 3.5927761035877155, + "learning_rate": 8.177113994057756e-06, + "loss": 3.5961, + "step": 1429 + }, + { + "epoch": 0.57, + "grad_norm": 3.296788483429141, + "learning_rate": 8.16437756655164e-06, + "loss": 3.6734, + "step": 1430 + }, + { + "epoch": 0.57, + "grad_norm": 3.7427103365741834, + "learning_rate": 8.1516442198175e-06, + "loss": 3.6577, + "step": 1431 + }, + { + "epoch": 0.57, + "grad_norm": 3.7415928654401127, + "learning_rate": 8.138913975226044e-06, + "loss": 3.4335, + "step": 1432 + }, + { + "epoch": 0.57, + "grad_norm": 3.7793528215077097, + "learning_rate": 8.126186854142752e-06, + "loss": 3.4863, + "step": 1433 + }, + { + "epoch": 0.57, + "grad_norm": 3.2918021656616068, + "learning_rate": 8.113462877927893e-06, + "loss": 3.4836, + "step": 1434 + }, + { + "epoch": 0.57, + "grad_norm": 3.658566957717168, + "learning_rate": 8.100742067936432e-06, + "loss": 3.3866, + "step": 1435 + }, + { + "epoch": 0.57, + "grad_norm": 3.6849277344716795, + "learning_rate": 8.088024445518033e-06, + "loss": 3.6564, + "step": 1436 + }, + { + "epoch": 0.57, + "grad_norm": 3.935693476386245, + "learning_rate": 8.075310032017e-06, + "loss": 3.6148, + "step": 1437 + }, + { + "epoch": 0.58, + "grad_norm": 3.899449265414525, + "learning_rate": 8.062598848772261e-06, + "loss": 3.5728, + "step": 1438 + }, + { + "epoch": 0.58, + "grad_norm": 4.121515481621947, + "learning_rate": 8.049890917117322e-06, + "loss": 3.6736, + "step": 1439 + }, + { + "epoch": 0.58, + "grad_norm": 3.806565706148004, + "learning_rate": 8.037186258380226e-06, + "loss": 3.6357, + "step": 1440 + }, + { + "epoch": 0.58, + "grad_norm": 5.195115835009729, + "learning_rate": 8.02448489388353e-06, + "loss": 3.5017, + "step": 1441 + }, + { + "epoch": 0.58, + "grad_norm": 4.586544394139627, + "learning_rate": 8.01178684494425e-06, + "loss": 3.685, + "step": 1442 + }, + { + "epoch": 0.58, + "grad_norm": 3.966560123803234, + "learning_rate": 7.999092132873851e-06, + "loss": 3.5741, + "step": 1443 + }, + { + "epoch": 0.58, + "grad_norm": 4.172759572645984, + "learning_rate": 7.986400778978192e-06, + "loss": 3.6334, + "step": 1444 + }, + { + "epoch": 0.58, + "grad_norm": 4.244342787839209, + "learning_rate": 7.9737128045575e-06, + "loss": 3.8758, + "step": 1445 + }, + { + "epoch": 0.58, + 
"grad_norm": 5.992017579746834, + "learning_rate": 7.96102823090632e-06, + "loss": 3.6978, + "step": 1446 + }, + { + "epoch": 0.58, + "grad_norm": 4.134386091152547, + "learning_rate": 7.948347079313494e-06, + "loss": 3.6387, + "step": 1447 + }, + { + "epoch": 0.58, + "grad_norm": 4.282185418813217, + "learning_rate": 7.935669371062132e-06, + "loss": 3.7487, + "step": 1448 + }, + { + "epoch": 0.58, + "grad_norm": 4.552626650054134, + "learning_rate": 7.922995127429547e-06, + "loss": 3.7081, + "step": 1449 + }, + { + "epoch": 0.58, + "grad_norm": 4.447248945720088, + "learning_rate": 7.91032436968725e-06, + "loss": 3.3661, + "step": 1450 + }, + { + "epoch": 0.58, + "grad_norm": 4.113127895541361, + "learning_rate": 7.897657119100896e-06, + "loss": 3.349, + "step": 1451 + }, + { + "epoch": 0.58, + "grad_norm": 3.3860791856708996, + "learning_rate": 7.88499339693025e-06, + "loss": 3.7081, + "step": 1452 + }, + { + "epoch": 0.58, + "grad_norm": 4.2905573209490475, + "learning_rate": 7.872333224429166e-06, + "loss": 3.6622, + "step": 1453 + }, + { + "epoch": 0.58, + "grad_norm": 4.364258205258447, + "learning_rate": 7.859676622845535e-06, + "loss": 3.6177, + "step": 1454 + }, + { + "epoch": 0.58, + "grad_norm": 3.9009187557616913, + "learning_rate": 7.847023613421251e-06, + "loss": 3.6914, + "step": 1455 + }, + { + "epoch": 0.58, + "grad_norm": 3.5785002847394134, + "learning_rate": 7.834374217392188e-06, + "loss": 3.544, + "step": 1456 + }, + { + "epoch": 0.58, + "grad_norm": 3.5018286056581798, + "learning_rate": 7.82172845598814e-06, + "loss": 3.5752, + "step": 1457 + }, + { + "epoch": 0.58, + "grad_norm": 4.181096789262609, + "learning_rate": 7.80908635043282e-06, + "loss": 3.495, + "step": 1458 + }, + { + "epoch": 0.58, + "grad_norm": 3.9920855934196773, + "learning_rate": 7.796447921943793e-06, + "loss": 3.529, + "step": 1459 + }, + { + "epoch": 0.58, + "grad_norm": 4.293480957964132, + "learning_rate": 7.78381319173246e-06, + "loss": 3.4376, + "step": 1460 + }, + { + "epoch": 0.58, + "grad_norm": 3.9040102950110485, + "learning_rate": 7.771182181004005e-06, + "loss": 3.5162, + "step": 1461 + }, + { + "epoch": 0.58, + "grad_norm": 3.635380000175011, + "learning_rate": 7.758554910957378e-06, + "loss": 3.8755, + "step": 1462 + }, + { + "epoch": 0.59, + "grad_norm": 4.32258353400662, + "learning_rate": 7.745931402785252e-06, + "loss": 3.5296, + "step": 1463 + }, + { + "epoch": 0.59, + "grad_norm": 3.6660451711288147, + "learning_rate": 7.733311677673979e-06, + "loss": 3.8214, + "step": 1464 + }, + { + "epoch": 0.59, + "grad_norm": 3.6681150611869375, + "learning_rate": 7.720695756803569e-06, + "loss": 3.52, + "step": 1465 + }, + { + "epoch": 0.59, + "grad_norm": 3.419776479875391, + "learning_rate": 7.708083661347637e-06, + "loss": 3.3692, + "step": 1466 + }, + { + "epoch": 0.59, + "grad_norm": 3.8962339569352924, + "learning_rate": 7.695475412473393e-06, + "loss": 3.4717, + "step": 1467 + }, + { + "epoch": 0.59, + "grad_norm": 3.8587481606461775, + "learning_rate": 7.682871031341579e-06, + "loss": 3.7205, + "step": 1468 + }, + { + "epoch": 0.59, + "grad_norm": 3.0586472108646032, + "learning_rate": 7.670270539106452e-06, + "loss": 3.4851, + "step": 1469 + }, + { + "epoch": 0.59, + "grad_norm": 3.455954380269373, + "learning_rate": 7.657673956915735e-06, + "loss": 3.6466, + "step": 1470 + }, + { + "epoch": 0.59, + "grad_norm": 4.047129327789186, + "learning_rate": 7.645081305910596e-06, + "loss": 3.267, + "step": 1471 + }, + { + "epoch": 0.59, + "grad_norm": 3.8169951162740348, + 
"learning_rate": 7.632492607225604e-06, + "loss": 3.6871, + "step": 1472 + }, + { + "epoch": 0.59, + "grad_norm": 3.8305487121163644, + "learning_rate": 7.619907881988692e-06, + "loss": 3.5829, + "step": 1473 + }, + { + "epoch": 0.59, + "grad_norm": 3.2735811073421788, + "learning_rate": 7.607327151321127e-06, + "loss": 3.6891, + "step": 1474 + }, + { + "epoch": 0.59, + "grad_norm": 3.9136084760331418, + "learning_rate": 7.594750436337467e-06, + "loss": 3.4515, + "step": 1475 + }, + { + "epoch": 0.59, + "grad_norm": 3.828639513589231, + "learning_rate": 7.582177758145532e-06, + "loss": 3.5019, + "step": 1476 + }, + { + "epoch": 0.59, + "grad_norm": 3.9437593119820415, + "learning_rate": 7.569609137846376e-06, + "loss": 3.7731, + "step": 1477 + }, + { + "epoch": 0.59, + "grad_norm": 4.164958834406506, + "learning_rate": 7.557044596534234e-06, + "loss": 3.7246, + "step": 1478 + }, + { + "epoch": 0.59, + "grad_norm": 3.7055840109179368, + "learning_rate": 7.544484155296492e-06, + "loss": 3.6791, + "step": 1479 + }, + { + "epoch": 0.59, + "grad_norm": 3.7527652506423284, + "learning_rate": 7.531927835213657e-06, + "loss": 3.6093, + "step": 1480 + }, + { + "epoch": 0.59, + "grad_norm": 4.014478944178257, + "learning_rate": 7.519375657359331e-06, + "loss": 3.5312, + "step": 1481 + }, + { + "epoch": 0.59, + "grad_norm": 4.246264132846788, + "learning_rate": 7.506827642800146e-06, + "loss": 3.7168, + "step": 1482 + }, + { + "epoch": 0.59, + "grad_norm": 4.241663502305347, + "learning_rate": 7.49428381259576e-06, + "loss": 3.4936, + "step": 1483 + }, + { + "epoch": 0.59, + "grad_norm": 3.985732421048951, + "learning_rate": 7.4817441877988005e-06, + "loss": 3.4383, + "step": 1484 + }, + { + "epoch": 0.59, + "grad_norm": 3.520163919263216, + "learning_rate": 7.469208789454838e-06, + "loss": 3.5112, + "step": 1485 + }, + { + "epoch": 0.59, + "grad_norm": 4.100364501529123, + "learning_rate": 7.456677638602355e-06, + "loss": 3.3978, + "step": 1486 + }, + { + "epoch": 0.59, + "grad_norm": 3.906400338438948, + "learning_rate": 7.444150756272704e-06, + "loss": 3.5787, + "step": 1487 + }, + { + "epoch": 0.6, + "grad_norm": 4.142247742585712, + "learning_rate": 7.431628163490067e-06, + "loss": 3.5451, + "step": 1488 + }, + { + "epoch": 0.6, + "grad_norm": 3.8484953594054607, + "learning_rate": 7.419109881271434e-06, + "loss": 3.5031, + "step": 1489 + }, + { + "epoch": 0.6, + "grad_norm": 3.5376977157395064, + "learning_rate": 7.40659593062655e-06, + "loss": 3.5643, + "step": 1490 + }, + { + "epoch": 0.6, + "grad_norm": 3.684148878346785, + "learning_rate": 7.394086332557907e-06, + "loss": 3.7448, + "step": 1491 + }, + { + "epoch": 0.6, + "grad_norm": 3.7070400002530035, + "learning_rate": 7.38158110806068e-06, + "loss": 3.5748, + "step": 1492 + }, + { + "epoch": 0.6, + "grad_norm": 3.0700666156104046, + "learning_rate": 7.3690802781227056e-06, + "loss": 3.6341, + "step": 1493 + }, + { + "epoch": 0.6, + "grad_norm": 3.8456034836325275, + "learning_rate": 7.356583863724442e-06, + "loss": 3.5115, + "step": 1494 + }, + { + "epoch": 0.6, + "grad_norm": 3.789836055161562, + "learning_rate": 7.344091885838949e-06, + "loss": 3.5078, + "step": 1495 + }, + { + "epoch": 0.6, + "grad_norm": 3.782648371746039, + "learning_rate": 7.331604365431826e-06, + "loss": 3.4992, + "step": 1496 + }, + { + "epoch": 0.6, + "grad_norm": 3.399890932099315, + "learning_rate": 7.319121323461198e-06, + "loss": 3.5788, + "step": 1497 + }, + { + "epoch": 0.6, + "grad_norm": 3.519272517962314, + "learning_rate": 7.3066427808776754e-06, + 
"loss": 3.5133, + "step": 1498 + }, + { + "epoch": 0.6, + "grad_norm": 3.607027156933682, + "learning_rate": 7.294168758624307e-06, + "loss": 3.5752, + "step": 1499 + }, + { + "epoch": 0.6, + "grad_norm": 3.6015766335135573, + "learning_rate": 7.2816992776365714e-06, + "loss": 3.8539, + "step": 1500 + }, + { + "epoch": 0.6, + "grad_norm": 3.7429477576654637, + "learning_rate": 7.269234358842314e-06, + "loss": 3.6593, + "step": 1501 + }, + { + "epoch": 0.6, + "grad_norm": 3.417196523965054, + "learning_rate": 7.256774023161728e-06, + "loss": 3.8102, + "step": 1502 + }, + { + "epoch": 0.6, + "grad_norm": 3.902532703962836, + "learning_rate": 7.244318291507308e-06, + "loss": 3.4636, + "step": 1503 + }, + { + "epoch": 0.6, + "grad_norm": 3.1780987980769613, + "learning_rate": 7.231867184783826e-06, + "loss": 3.5413, + "step": 1504 + }, + { + "epoch": 0.6, + "grad_norm": 3.195603868320716, + "learning_rate": 7.219420723888301e-06, + "loss": 3.6185, + "step": 1505 + }, + { + "epoch": 0.6, + "grad_norm": 3.5099189748150685, + "learning_rate": 7.2069789297099355e-06, + "loss": 3.6203, + "step": 1506 + }, + { + "epoch": 0.6, + "grad_norm": 3.7368146967887084, + "learning_rate": 7.194541823130119e-06, + "loss": 3.4034, + "step": 1507 + }, + { + "epoch": 0.6, + "grad_norm": 3.2477208654607175, + "learning_rate": 7.182109425022357e-06, + "loss": 3.5462, + "step": 1508 + }, + { + "epoch": 0.6, + "grad_norm": 3.954509777924059, + "learning_rate": 7.169681756252265e-06, + "loss": 3.3927, + "step": 1509 + }, + { + "epoch": 0.6, + "grad_norm": 3.44799095991217, + "learning_rate": 7.157258837677514e-06, + "loss": 3.3569, + "step": 1510 + }, + { + "epoch": 0.6, + "grad_norm": 3.7064912121782094, + "learning_rate": 7.144840690147812e-06, + "loss": 3.5461, + "step": 1511 + }, + { + "epoch": 0.6, + "grad_norm": 3.8854217452893565, + "learning_rate": 7.132427334504846e-06, + "loss": 3.8537, + "step": 1512 + }, + { + "epoch": 0.61, + "grad_norm": 2.9978972776658903, + "learning_rate": 7.120018791582266e-06, + "loss": 3.6769, + "step": 1513 + }, + { + "epoch": 0.61, + "grad_norm": 4.105775013442488, + "learning_rate": 7.107615082205654e-06, + "loss": 3.6836, + "step": 1514 + }, + { + "epoch": 0.61, + "grad_norm": 3.594521170717367, + "learning_rate": 7.095216227192467e-06, + "loss": 3.5434, + "step": 1515 + }, + { + "epoch": 0.61, + "grad_norm": 4.083671392728415, + "learning_rate": 7.082822247352024e-06, + "loss": 3.7165, + "step": 1516 + }, + { + "epoch": 0.61, + "grad_norm": 3.282797887972504, + "learning_rate": 7.070433163485451e-06, + "loss": 3.6366, + "step": 1517 + }, + { + "epoch": 0.61, + "grad_norm": 3.372384984933006, + "learning_rate": 7.0580489963856646e-06, + "loss": 3.5777, + "step": 1518 + }, + { + "epoch": 0.61, + "grad_norm": 4.160592298870379, + "learning_rate": 7.045669766837333e-06, + "loss": 3.3398, + "step": 1519 + }, + { + "epoch": 0.61, + "grad_norm": 4.020626122432115, + "learning_rate": 7.033295495616834e-06, + "loss": 3.3516, + "step": 1520 + }, + { + "epoch": 0.61, + "grad_norm": 4.17132157189251, + "learning_rate": 7.020926203492218e-06, + "loss": 3.4481, + "step": 1521 + }, + { + "epoch": 0.61, + "grad_norm": 3.648358936658591, + "learning_rate": 7.008561911223186e-06, + "loss": 3.6298, + "step": 1522 + }, + { + "epoch": 0.61, + "grad_norm": 5.03872622210649, + "learning_rate": 6.9962026395610416e-06, + "loss": 3.6469, + "step": 1523 + }, + { + "epoch": 0.61, + "grad_norm": 3.796123861492065, + "learning_rate": 6.983848409248672e-06, + "loss": 3.625, + "step": 1524 + }, + { + 
"epoch": 0.61, + "grad_norm": 4.01401433098644, + "learning_rate": 6.971499241020495e-06, + "loss": 3.4491, + "step": 1525 + }, + { + "epoch": 0.61, + "grad_norm": 3.600759214686406, + "learning_rate": 6.959155155602433e-06, + "loss": 3.6289, + "step": 1526 + }, + { + "epoch": 0.61, + "grad_norm": 4.2385099838211495, + "learning_rate": 6.946816173711878e-06, + "loss": 3.5884, + "step": 1527 + }, + { + "epoch": 0.61, + "grad_norm": 4.017534211523324, + "learning_rate": 6.934482316057663e-06, + "loss": 3.6964, + "step": 1528 + }, + { + "epoch": 0.61, + "grad_norm": 4.284559410654498, + "learning_rate": 6.922153603340016e-06, + "loss": 3.509, + "step": 1529 + }, + { + "epoch": 0.61, + "grad_norm": 3.624699831426992, + "learning_rate": 6.909830056250527e-06, + "loss": 3.315, + "step": 1530 + }, + { + "epoch": 0.61, + "grad_norm": 3.9957231893286465, + "learning_rate": 6.897511695472124e-06, + "loss": 3.3653, + "step": 1531 + }, + { + "epoch": 0.61, + "grad_norm": 3.7559802871043244, + "learning_rate": 6.885198541679016e-06, + "loss": 3.5622, + "step": 1532 + }, + { + "epoch": 0.61, + "grad_norm": 3.515750722822639, + "learning_rate": 6.872890615536694e-06, + "loss": 3.6934, + "step": 1533 + }, + { + "epoch": 0.61, + "grad_norm": 3.361687296630023, + "learning_rate": 6.860587937701862e-06, + "loss": 3.7604, + "step": 1534 + }, + { + "epoch": 0.61, + "grad_norm": 3.665985775304246, + "learning_rate": 6.848290528822417e-06, + "loss": 3.6467, + "step": 1535 + }, + { + "epoch": 0.61, + "grad_norm": 3.5742810689464894, + "learning_rate": 6.835998409537412e-06, + "loss": 3.6129, + "step": 1536 + }, + { + "epoch": 0.61, + "grad_norm": 3.1398746962171953, + "learning_rate": 6.823711600477025e-06, + "loss": 3.5571, + "step": 1537 + }, + { + "epoch": 0.62, + "grad_norm": 3.8744598615958283, + "learning_rate": 6.811430122262529e-06, + "loss": 3.4432, + "step": 1538 + }, + { + "epoch": 0.62, + "grad_norm": 3.680958440614546, + "learning_rate": 6.799153995506234e-06, + "loss": 3.4893, + "step": 1539 + }, + { + "epoch": 0.62, + "grad_norm": 4.251229349989716, + "learning_rate": 6.786883240811479e-06, + "loss": 3.421, + "step": 1540 + }, + { + "epoch": 0.62, + "grad_norm": 3.363805693156403, + "learning_rate": 6.774617878772582e-06, + "loss": 3.7277, + "step": 1541 + }, + { + "epoch": 0.62, + "grad_norm": 3.8060970867616395, + "learning_rate": 6.76235792997482e-06, + "loss": 3.5927, + "step": 1542 + }, + { + "epoch": 0.62, + "grad_norm": 4.121071467859133, + "learning_rate": 6.750103414994374e-06, + "loss": 3.6922, + "step": 1543 + }, + { + "epoch": 0.62, + "grad_norm": 3.5635624234510095, + "learning_rate": 6.737854354398308e-06, + "loss": 3.3097, + "step": 1544 + }, + { + "epoch": 0.62, + "grad_norm": 3.3704037106511224, + "learning_rate": 6.725610768744535e-06, + "loss": 3.5523, + "step": 1545 + }, + { + "epoch": 0.62, + "grad_norm": 5.6183604752793395, + "learning_rate": 6.713372678581773e-06, + "loss": 3.6057, + "step": 1546 + }, + { + "epoch": 0.62, + "grad_norm": 3.8491054462248853, + "learning_rate": 6.7011401044495304e-06, + "loss": 3.4293, + "step": 1547 + }, + { + "epoch": 0.62, + "grad_norm": 3.4848739882554973, + "learning_rate": 6.68891306687804e-06, + "loss": 3.6692, + "step": 1548 + }, + { + "epoch": 0.62, + "grad_norm": 3.7199367426632524, + "learning_rate": 6.676691586388255e-06, + "loss": 3.449, + "step": 1549 + }, + { + "epoch": 0.62, + "grad_norm": 3.580681658335725, + "learning_rate": 6.664475683491797e-06, + "loss": 3.5791, + "step": 1550 + }, + { + "epoch": 0.62, + "grad_norm": 
3.5016383532515483, + "learning_rate": 6.652265378690923e-06, + "loss": 3.6628, + "step": 1551 + }, + { + "epoch": 0.62, + "grad_norm": 3.2121234727237225, + "learning_rate": 6.6400606924785095e-06, + "loss": 3.3646, + "step": 1552 + }, + { + "epoch": 0.62, + "grad_norm": 3.9771811074622305, + "learning_rate": 6.627861645337984e-06, + "loss": 3.4899, + "step": 1553 + }, + { + "epoch": 0.62, + "grad_norm": 3.424344005443578, + "learning_rate": 6.615668257743322e-06, + "loss": 3.6783, + "step": 1554 + }, + { + "epoch": 0.62, + "grad_norm": 3.9008718425597486, + "learning_rate": 6.603480550158995e-06, + "loss": 3.4927, + "step": 1555 + }, + { + "epoch": 0.62, + "grad_norm": 3.5524895975215767, + "learning_rate": 6.591298543039949e-06, + "loss": 3.6305, + "step": 1556 + }, + { + "epoch": 0.62, + "grad_norm": 3.199136364840206, + "learning_rate": 6.579122256831551e-06, + "loss": 3.469, + "step": 1557 + }, + { + "epoch": 0.62, + "grad_norm": 3.136039927950006, + "learning_rate": 6.566951711969581e-06, + "loss": 3.8279, + "step": 1558 + }, + { + "epoch": 0.62, + "grad_norm": 3.5719592951558004, + "learning_rate": 6.554786928880165e-06, + "loss": 3.6708, + "step": 1559 + }, + { + "epoch": 0.62, + "grad_norm": 3.265159798599461, + "learning_rate": 6.542627927979772e-06, + "loss": 3.3514, + "step": 1560 + }, + { + "epoch": 0.62, + "grad_norm": 3.705950892355711, + "learning_rate": 6.530474729675167e-06, + "loss": 3.6671, + "step": 1561 + }, + { + "epoch": 0.62, + "grad_norm": 4.27802567413278, + "learning_rate": 6.518327354363374e-06, + "loss": 3.4778, + "step": 1562 + }, + { + "epoch": 0.63, + "grad_norm": 3.4155897157303494, + "learning_rate": 6.506185822431639e-06, + "loss": 3.6436, + "step": 1563 + }, + { + "epoch": 0.63, + "grad_norm": 3.4645941258945476, + "learning_rate": 6.494050154257408e-06, + "loss": 3.5203, + "step": 1564 + }, + { + "epoch": 0.63, + "grad_norm": 3.5880428732526735, + "learning_rate": 6.481920370208274e-06, + "loss": 3.292, + "step": 1565 + }, + { + "epoch": 0.63, + "grad_norm": 3.453501383628343, + "learning_rate": 6.469796490641974e-06, + "loss": 3.8199, + "step": 1566 + }, + { + "epoch": 0.63, + "grad_norm": 3.7078788389918738, + "learning_rate": 6.4576785359063225e-06, + "loss": 3.629, + "step": 1567 + }, + { + "epoch": 0.63, + "grad_norm": 4.268044566714057, + "learning_rate": 6.445566526339187e-06, + "loss": 3.4953, + "step": 1568 + }, + { + "epoch": 0.63, + "grad_norm": 3.1001497410904943, + "learning_rate": 6.4334604822684645e-06, + "loss": 3.6877, + "step": 1569 + }, + { + "epoch": 0.63, + "grad_norm": 4.245953209024202, + "learning_rate": 6.421360424012039e-06, + "loss": 3.4786, + "step": 1570 + }, + { + "epoch": 0.63, + "grad_norm": 3.2821730226566808, + "learning_rate": 6.409266371877751e-06, + "loss": 3.4555, + "step": 1571 + }, + { + "epoch": 0.63, + "grad_norm": 4.182794902814907, + "learning_rate": 6.397178346163348e-06, + "loss": 3.3958, + "step": 1572 + }, + { + "epoch": 0.63, + "grad_norm": 3.766092498437075, + "learning_rate": 6.38509636715648e-06, + "loss": 3.4831, + "step": 1573 + }, + { + "epoch": 0.63, + "grad_norm": 3.3471215501567446, + "learning_rate": 6.373020455134633e-06, + "loss": 3.3831, + "step": 1574 + }, + { + "epoch": 0.63, + "grad_norm": 3.9241836485526003, + "learning_rate": 6.360950630365126e-06, + "loss": 3.6346, + "step": 1575 + }, + { + "epoch": 0.63, + "grad_norm": 3.3124279763521547, + "learning_rate": 6.3488869131050505e-06, + "loss": 3.3606, + "step": 1576 + }, + { + "epoch": 0.63, + "grad_norm": 3.8207348660033094, + 
"learning_rate": 6.33682932360125e-06, + "loss": 3.4873, + "step": 1577 + }, + { + "epoch": 0.63, + "grad_norm": 4.313411276985893, + "learning_rate": 6.324777882090287e-06, + "loss": 3.547, + "step": 1578 + }, + { + "epoch": 0.63, + "grad_norm": 3.8492618965358716, + "learning_rate": 6.3127326087983974e-06, + "loss": 3.4882, + "step": 1579 + }, + { + "epoch": 0.63, + "grad_norm": 3.7440406276540643, + "learning_rate": 6.300693523941481e-06, + "loss": 3.5546, + "step": 1580 + }, + { + "epoch": 0.63, + "grad_norm": 4.256194385654137, + "learning_rate": 6.2886606477250345e-06, + "loss": 3.542, + "step": 1581 + }, + { + "epoch": 0.63, + "grad_norm": 4.100847067766242, + "learning_rate": 6.276634000344144e-06, + "loss": 3.4619, + "step": 1582 + }, + { + "epoch": 0.63, + "grad_norm": 3.863838035866122, + "learning_rate": 6.264613601983435e-06, + "loss": 3.5449, + "step": 1583 + }, + { + "epoch": 0.63, + "grad_norm": 3.4931068818315585, + "learning_rate": 6.2525994728170495e-06, + "loss": 3.5324, + "step": 1584 + }, + { + "epoch": 0.63, + "grad_norm": 3.3210210896064334, + "learning_rate": 6.2405916330086106e-06, + "loss": 3.4511, + "step": 1585 + }, + { + "epoch": 0.63, + "grad_norm": 4.435646053815424, + "learning_rate": 6.2285901027111806e-06, + "loss": 3.2911, + "step": 1586 + }, + { + "epoch": 0.63, + "grad_norm": 3.954806179069221, + "learning_rate": 6.216594902067233e-06, + "loss": 3.5649, + "step": 1587 + }, + { + "epoch": 0.64, + "grad_norm": 3.220561982446002, + "learning_rate": 6.204606051208617e-06, + "loss": 3.4769, + "step": 1588 + }, + { + "epoch": 0.64, + "grad_norm": 3.5975537055749225, + "learning_rate": 6.192623570256535e-06, + "loss": 3.5477, + "step": 1589 + }, + { + "epoch": 0.64, + "grad_norm": 3.8078229439619187, + "learning_rate": 6.180647479321484e-06, + "loss": 3.534, + "step": 1590 + }, + { + "epoch": 0.64, + "grad_norm": 3.5466201216437394, + "learning_rate": 6.168677798503246e-06, + "loss": 3.5194, + "step": 1591 + }, + { + "epoch": 0.64, + "grad_norm": 3.6779273264121106, + "learning_rate": 6.156714547890838e-06, + "loss": 3.4331, + "step": 1592 + }, + { + "epoch": 0.64, + "grad_norm": 3.7255338440199077, + "learning_rate": 6.14475774756249e-06, + "loss": 3.5986, + "step": 1593 + }, + { + "epoch": 0.64, + "grad_norm": 3.626610821101966, + "learning_rate": 6.13280741758561e-06, + "loss": 3.497, + "step": 1594 + }, + { + "epoch": 0.64, + "grad_norm": 4.147161579623455, + "learning_rate": 6.120863578016736e-06, + "loss": 3.7874, + "step": 1595 + }, + { + "epoch": 0.64, + "grad_norm": 3.6261110954731866, + "learning_rate": 6.108926248901521e-06, + "loss": 3.4446, + "step": 1596 + }, + { + "epoch": 0.64, + "grad_norm": 3.4428710403247615, + "learning_rate": 6.0969954502746916e-06, + "loss": 3.4673, + "step": 1597 + }, + { + "epoch": 0.64, + "grad_norm": 3.5732663274462007, + "learning_rate": 6.0850712021600044e-06, + "loss": 3.4875, + "step": 1598 + }, + { + "epoch": 0.64, + "grad_norm": 3.507148824224458, + "learning_rate": 6.073153524570236e-06, + "loss": 3.5483, + "step": 1599 + }, + { + "epoch": 0.64, + "grad_norm": 3.5971890810464346, + "learning_rate": 6.061242437507131e-06, + "loss": 3.3854, + "step": 1600 + }, + { + "epoch": 0.64, + "grad_norm": 4.158234085592182, + "learning_rate": 6.049337960961362e-06, + "loss": 3.3238, + "step": 1601 + }, + { + "epoch": 0.64, + "grad_norm": 3.8319143470427215, + "learning_rate": 6.037440114912521e-06, + "loss": 3.5227, + "step": 1602 + }, + { + "epoch": 0.64, + "grad_norm": 3.3015817963676706, + "learning_rate": 
6.0255489193290675e-06, + "loss": 3.6485, + "step": 1603 + }, + { + "epoch": 0.64, + "grad_norm": 3.6210568854004803, + "learning_rate": 6.013664394168297e-06, + "loss": 3.5156, + "step": 1604 + }, + { + "epoch": 0.64, + "grad_norm": 3.5389233431815312, + "learning_rate": 6.00178655937631e-06, + "loss": 3.5009, + "step": 1605 + }, + { + "epoch": 0.64, + "grad_norm": 3.6576189067512854, + "learning_rate": 5.989915434887985e-06, + "loss": 3.3947, + "step": 1606 + }, + { + "epoch": 0.64, + "grad_norm": 3.339883862987916, + "learning_rate": 5.9780510406269245e-06, + "loss": 3.4721, + "step": 1607 + }, + { + "epoch": 0.64, + "grad_norm": 4.52954677971464, + "learning_rate": 5.966193396505452e-06, + "loss": 3.3105, + "step": 1608 + }, + { + "epoch": 0.64, + "grad_norm": 3.797901067597484, + "learning_rate": 5.954342522424553e-06, + "loss": 3.5331, + "step": 1609 + }, + { + "epoch": 0.64, + "grad_norm": 4.025259163522358, + "learning_rate": 5.942498438273849e-06, + "loss": 3.4357, + "step": 1610 + }, + { + "epoch": 0.64, + "grad_norm": 3.883933535536824, + "learning_rate": 5.930661163931572e-06, + "loss": 3.5552, + "step": 1611 + }, + { + "epoch": 0.64, + "grad_norm": 3.9176809770017296, + "learning_rate": 5.918830719264514e-06, + "loss": 3.6888, + "step": 1612 + }, + { + "epoch": 0.65, + "grad_norm": 4.158931640923236, + "learning_rate": 5.9070071241280235e-06, + "loss": 3.745, + "step": 1613 + }, + { + "epoch": 0.65, + "grad_norm": 4.087354007066473, + "learning_rate": 5.895190398365935e-06, + "loss": 3.6546, + "step": 1614 + }, + { + "epoch": 0.65, + "grad_norm": 4.238168845466041, + "learning_rate": 5.8833805618105635e-06, + "loss": 3.6311, + "step": 1615 + }, + { + "epoch": 0.65, + "grad_norm": 4.921391574264145, + "learning_rate": 5.871577634282655e-06, + "loss": 3.3619, + "step": 1616 + }, + { + "epoch": 0.65, + "grad_norm": 5.222025825827338, + "learning_rate": 5.8597816355913685e-06, + "loss": 3.5381, + "step": 1617 + }, + { + "epoch": 0.65, + "grad_norm": 4.526603701604458, + "learning_rate": 5.84799258553423e-06, + "loss": 3.2635, + "step": 1618 + }, + { + "epoch": 0.65, + "grad_norm": 4.328920440404449, + "learning_rate": 5.836210503897099e-06, + "loss": 3.3941, + "step": 1619 + }, + { + "epoch": 0.65, + "grad_norm": 3.694703294976491, + "learning_rate": 5.82443541045415e-06, + "loss": 3.6353, + "step": 1620 + }, + { + "epoch": 0.65, + "grad_norm": 4.046321767564445, + "learning_rate": 5.812667324967813e-06, + "loss": 3.56, + "step": 1621 + }, + { + "epoch": 0.65, + "grad_norm": 4.7512923075952695, + "learning_rate": 5.800906267188773e-06, + "loss": 3.5772, + "step": 1622 + }, + { + "epoch": 0.65, + "grad_norm": 3.919553279950237, + "learning_rate": 5.789152256855917e-06, + "loss": 3.4645, + "step": 1623 + }, + { + "epoch": 0.65, + "grad_norm": 3.9665901288915806, + "learning_rate": 5.777405313696294e-06, + "loss": 3.4751, + "step": 1624 + }, + { + "epoch": 0.65, + "grad_norm": 3.9438397686222832, + "learning_rate": 5.765665457425102e-06, + "loss": 3.5324, + "step": 1625 + }, + { + "epoch": 0.65, + "grad_norm": 3.720691178895891, + "learning_rate": 5.753932707745635e-06, + "loss": 3.3672, + "step": 1626 + }, + { + "epoch": 0.65, + "grad_norm": 3.4053190270389604, + "learning_rate": 5.742207084349274e-06, + "loss": 3.5297, + "step": 1627 + }, + { + "epoch": 0.65, + "grad_norm": 3.8384090142645753, + "learning_rate": 5.73048860691543e-06, + "loss": 3.6019, + "step": 1628 + }, + { + "epoch": 0.65, + "grad_norm": 4.155590506087914, + "learning_rate": 5.718777295111524e-06, + "loss": 
3.541, + "step": 1629 + }, + { + "epoch": 0.65, + "grad_norm": 3.935875106073164, + "learning_rate": 5.707073168592943e-06, + "loss": 3.4477, + "step": 1630 + }, + { + "epoch": 0.65, + "grad_norm": 4.0601087611821365, + "learning_rate": 5.695376247003025e-06, + "loss": 3.5035, + "step": 1631 + }, + { + "epoch": 0.65, + "grad_norm": 4.005956841096344, + "learning_rate": 5.683686549973018e-06, + "loss": 3.6031, + "step": 1632 + }, + { + "epoch": 0.65, + "grad_norm": 4.018384680043002, + "learning_rate": 5.672004097122033e-06, + "loss": 3.4305, + "step": 1633 + }, + { + "epoch": 0.65, + "grad_norm": 4.089213051550755, + "learning_rate": 5.6603289080570274e-06, + "loss": 3.4664, + "step": 1634 + }, + { + "epoch": 0.65, + "grad_norm": 4.0324336465554556, + "learning_rate": 5.648661002372769e-06, + "loss": 3.5037, + "step": 1635 + }, + { + "epoch": 0.65, + "grad_norm": 3.8702545614500976, + "learning_rate": 5.637000399651804e-06, + "loss": 3.3685, + "step": 1636 + }, + { + "epoch": 0.65, + "grad_norm": 4.044818800764605, + "learning_rate": 5.625347119464422e-06, + "loss": 3.4022, + "step": 1637 + }, + { + "epoch": 0.66, + "grad_norm": 3.4606581789411743, + "learning_rate": 5.613701181368618e-06, + "loss": 3.7145, + "step": 1638 + }, + { + "epoch": 0.66, + "grad_norm": 4.083316905669718, + "learning_rate": 5.602062604910064e-06, + "loss": 3.6496, + "step": 1639 + }, + { + "epoch": 0.66, + "grad_norm": 4.959151798357818, + "learning_rate": 5.590431409622081e-06, + "loss": 3.8787, + "step": 1640 + }, + { + "epoch": 0.66, + "grad_norm": 4.588318519706414, + "learning_rate": 5.5788076150256075e-06, + "loss": 3.6376, + "step": 1641 + }, + { + "epoch": 0.66, + "grad_norm": 4.939429818845424, + "learning_rate": 5.567191240629151e-06, + "loss": 3.3604, + "step": 1642 + }, + { + "epoch": 0.66, + "grad_norm": 4.024057438227261, + "learning_rate": 5.555582305928766e-06, + "loss": 3.3678, + "step": 1643 + }, + { + "epoch": 0.66, + "grad_norm": 3.702973125759723, + "learning_rate": 5.5439808304080225e-06, + "loss": 3.6029, + "step": 1644 + }, + { + "epoch": 0.66, + "grad_norm": 3.553114471083393, + "learning_rate": 5.5323868335379775e-06, + "loss": 3.4881, + "step": 1645 + }, + { + "epoch": 0.66, + "grad_norm": 4.531936177964785, + "learning_rate": 5.520800334777132e-06, + "loss": 3.7625, + "step": 1646 + }, + { + "epoch": 0.66, + "grad_norm": 3.7173152360158865, + "learning_rate": 5.509221353571404e-06, + "loss": 3.2961, + "step": 1647 + }, + { + "epoch": 0.66, + "grad_norm": 3.9439784637905064, + "learning_rate": 5.497649909354084e-06, + "loss": 3.3936, + "step": 1648 + }, + { + "epoch": 0.66, + "grad_norm": 3.78139801390855, + "learning_rate": 5.486086021545829e-06, + "loss": 3.5477, + "step": 1649 + }, + { + "epoch": 0.66, + "grad_norm": 3.8954021579269464, + "learning_rate": 5.4745297095546125e-06, + "loss": 3.3632, + "step": 1650 + }, + { + "epoch": 0.66, + "grad_norm": 3.9087623151938984, + "learning_rate": 5.4629809927756794e-06, + "loss": 3.4839, + "step": 1651 + }, + { + "epoch": 0.66, + "grad_norm": 3.635931087210787, + "learning_rate": 5.451439890591539e-06, + "loss": 3.4566, + "step": 1652 + }, + { + "epoch": 0.66, + "grad_norm": 3.58703249038166, + "learning_rate": 5.439906422371914e-06, + "loss": 3.6018, + "step": 1653 + }, + { + "epoch": 0.66, + "grad_norm": 3.3918410665471743, + "learning_rate": 5.42838060747372e-06, + "loss": 3.3492, + "step": 1654 + }, + { + "epoch": 0.66, + "grad_norm": 4.307113537697726, + "learning_rate": 5.416862465241033e-06, + "loss": 3.4002, + "step": 1655 + }, + { + 
"epoch": 0.66, + "grad_norm": 3.4614695792966033, + "learning_rate": 5.405352015005039e-06, + "loss": 3.5605, + "step": 1656 + }, + { + "epoch": 0.66, + "grad_norm": 3.5235364563297984, + "learning_rate": 5.3938492760840176e-06, + "loss": 3.4043, + "step": 1657 + }, + { + "epoch": 0.66, + "grad_norm": 3.6270246557742647, + "learning_rate": 5.382354267783316e-06, + "loss": 3.6153, + "step": 1658 + }, + { + "epoch": 0.66, + "grad_norm": 3.7458961603016636, + "learning_rate": 5.370867009395294e-06, + "loss": 3.3846, + "step": 1659 + }, + { + "epoch": 0.66, + "grad_norm": 3.9253003162704854, + "learning_rate": 5.359387520199317e-06, + "loss": 3.5766, + "step": 1660 + }, + { + "epoch": 0.66, + "grad_norm": 3.66451016542722, + "learning_rate": 5.3479158194617e-06, + "loss": 3.3781, + "step": 1661 + }, + { + "epoch": 0.66, + "grad_norm": 3.913591319050972, + "learning_rate": 5.336451926435688e-06, + "loss": 3.5858, + "step": 1662 + }, + { + "epoch": 0.67, + "grad_norm": 3.6297933301525984, + "learning_rate": 5.32499586036143e-06, + "loss": 3.3996, + "step": 1663 + }, + { + "epoch": 0.67, + "grad_norm": 3.409251034623019, + "learning_rate": 5.313547640465937e-06, + "loss": 3.6274, + "step": 1664 + }, + { + "epoch": 0.67, + "grad_norm": 3.5973973468416776, + "learning_rate": 5.302107285963045e-06, + "loss": 3.3889, + "step": 1665 + }, + { + "epoch": 0.67, + "grad_norm": 3.1962049167523157, + "learning_rate": 5.2906748160533895e-06, + "loss": 3.4495, + "step": 1666 + }, + { + "epoch": 0.67, + "grad_norm": 4.0059543728084375, + "learning_rate": 5.279250249924384e-06, + "loss": 3.5386, + "step": 1667 + }, + { + "epoch": 0.67, + "grad_norm": 3.7194107001174936, + "learning_rate": 5.26783360675016e-06, + "loss": 3.5265, + "step": 1668 + }, + { + "epoch": 0.67, + "grad_norm": 3.7921834437433484, + "learning_rate": 5.2564249056915704e-06, + "loss": 3.5549, + "step": 1669 + }, + { + "epoch": 0.67, + "grad_norm": 4.403776989679724, + "learning_rate": 5.245024165896126e-06, + "loss": 3.3479, + "step": 1670 + }, + { + "epoch": 0.67, + "grad_norm": 3.8704830411141518, + "learning_rate": 5.2336314064979766e-06, + "loss": 3.3499, + "step": 1671 + }, + { + "epoch": 0.67, + "grad_norm": 3.7989300337853664, + "learning_rate": 5.222246646617886e-06, + "loss": 3.5178, + "step": 1672 + }, + { + "epoch": 0.67, + "grad_norm": 3.734045664578303, + "learning_rate": 5.210869905363178e-06, + "loss": 3.464, + "step": 1673 + }, + { + "epoch": 0.67, + "grad_norm": 3.832535478682915, + "learning_rate": 5.199501201827741e-06, + "loss": 3.7964, + "step": 1674 + }, + { + "epoch": 0.67, + "grad_norm": 3.7729187124032797, + "learning_rate": 5.18814055509195e-06, + "loss": 3.4932, + "step": 1675 + }, + { + "epoch": 0.67, + "grad_norm": 3.7973719375176183, + "learning_rate": 5.1767879842226745e-06, + "loss": 3.4163, + "step": 1676 + }, + { + "epoch": 0.67, + "grad_norm": 4.2259816427355, + "learning_rate": 5.165443508273218e-06, + "loss": 3.3001, + "step": 1677 + }, + { + "epoch": 0.67, + "grad_norm": 4.251724589557688, + "learning_rate": 5.154107146283311e-06, + "loss": 3.1482, + "step": 1678 + }, + { + "epoch": 0.67, + "grad_norm": 3.6809587313873013, + "learning_rate": 5.1427789172790565e-06, + "loss": 3.6134, + "step": 1679 + }, + { + "epoch": 0.67, + "grad_norm": 3.951286411251701, + "learning_rate": 5.131458840272905e-06, + "loss": 3.4604, + "step": 1680 + }, + { + "epoch": 0.67, + "grad_norm": 4.002690650992529, + "learning_rate": 5.120146934263638e-06, + "loss": 3.4269, + "step": 1681 + }, + { + "epoch": 0.67, + "grad_norm": 
3.587007733749125, + "learning_rate": 5.10884321823631e-06, + "loss": 3.4809, + "step": 1682 + }, + { + "epoch": 0.67, + "grad_norm": 4.033586550699142, + "learning_rate": 5.097547711162243e-06, + "loss": 3.495, + "step": 1683 + }, + { + "epoch": 0.67, + "grad_norm": 4.358904503933148, + "learning_rate": 5.086260431998967e-06, + "loss": 3.4295, + "step": 1684 + }, + { + "epoch": 0.67, + "grad_norm": 3.092011940657899, + "learning_rate": 5.074981399690219e-06, + "loss": 3.4061, + "step": 1685 + }, + { + "epoch": 0.67, + "grad_norm": 3.509859863620395, + "learning_rate": 5.0637106331658815e-06, + "loss": 3.3268, + "step": 1686 + }, + { + "epoch": 0.67, + "grad_norm": 3.4988932754363944, + "learning_rate": 5.0524481513419675e-06, + "loss": 3.5498, + "step": 1687 + }, + { + "epoch": 0.68, + "grad_norm": 3.5371819602716257, + "learning_rate": 5.041193973120595e-06, + "loss": 3.6162, + "step": 1688 + }, + { + "epoch": 0.68, + "grad_norm": 3.7999904525150257, + "learning_rate": 5.02994811738993e-06, + "loss": 3.4791, + "step": 1689 + }, + { + "epoch": 0.68, + "grad_norm": 4.34916869882989, + "learning_rate": 5.018710603024187e-06, + "loss": 3.4989, + "step": 1690 + }, + { + "epoch": 0.68, + "grad_norm": 4.3696963886037965, + "learning_rate": 5.007481448883567e-06, + "loss": 3.458, + "step": 1691 + }, + { + "epoch": 0.68, + "grad_norm": 4.2633588151069315, + "learning_rate": 4.99626067381425e-06, + "loss": 3.4841, + "step": 1692 + }, + { + "epoch": 0.68, + "grad_norm": 3.6953948473727296, + "learning_rate": 4.985048296648346e-06, + "loss": 3.3859, + "step": 1693 + }, + { + "epoch": 0.68, + "grad_norm": 3.7639597092063384, + "learning_rate": 4.973844336203879e-06, + "loss": 3.5064, + "step": 1694 + }, + { + "epoch": 0.68, + "grad_norm": 3.6696256156495695, + "learning_rate": 4.9626488112847384e-06, + "loss": 3.4375, + "step": 1695 + }, + { + "epoch": 0.68, + "grad_norm": 4.165077667236303, + "learning_rate": 4.951461740680655e-06, + "loss": 3.4031, + "step": 1696 + }, + { + "epoch": 0.68, + "grad_norm": 3.6209308184846605, + "learning_rate": 4.9402831431671834e-06, + "loss": 3.4249, + "step": 1697 + }, + { + "epoch": 0.68, + "grad_norm": 3.4591739980588274, + "learning_rate": 4.929113037505642e-06, + "loss": 3.3922, + "step": 1698 + }, + { + "epoch": 0.68, + "grad_norm": 4.2079804852931355, + "learning_rate": 4.91795144244311e-06, + "loss": 3.4299, + "step": 1699 + }, + { + "epoch": 0.68, + "grad_norm": 3.873718125760766, + "learning_rate": 4.9067983767123736e-06, + "loss": 3.5628, + "step": 1700 + }, + { + "epoch": 0.68, + "grad_norm": 3.1974630418683976, + "learning_rate": 4.895653859031906e-06, + "loss": 3.6201, + "step": 1701 + }, + { + "epoch": 0.68, + "grad_norm": 3.2799431494589464, + "learning_rate": 4.884517908105837e-06, + "loss": 3.43, + "step": 1702 + }, + { + "epoch": 0.68, + "grad_norm": 3.6073965976368547, + "learning_rate": 4.873390542623922e-06, + "loss": 3.4691, + "step": 1703 + }, + { + "epoch": 0.68, + "grad_norm": 3.6439670250442, + "learning_rate": 4.8622717812615e-06, + "loss": 3.2538, + "step": 1704 + }, + { + "epoch": 0.68, + "grad_norm": 3.884889324274793, + "learning_rate": 4.851161642679466e-06, + "loss": 3.3603, + "step": 1705 + }, + { + "epoch": 0.68, + "grad_norm": 3.371748248373679, + "learning_rate": 4.840060145524254e-06, + "loss": 3.4982, + "step": 1706 + }, + { + "epoch": 0.68, + "grad_norm": 4.07931994670271, + "learning_rate": 4.828967308427795e-06, + "loss": 3.2928, + "step": 1707 + }, + { + "epoch": 0.68, + "grad_norm": 3.704691505269074, + "learning_rate": 
4.817883150007474e-06, + "loss": 3.3268, + "step": 1708 + }, + { + "epoch": 0.68, + "grad_norm": 3.609481060520609, + "learning_rate": 4.806807688866119e-06, + "loss": 3.5597, + "step": 1709 + }, + { + "epoch": 0.68, + "grad_norm": 4.287120337678626, + "learning_rate": 4.795740943591955e-06, + "loss": 3.7014, + "step": 1710 + }, + { + "epoch": 0.68, + "grad_norm": 3.2417233501828395, + "learning_rate": 4.784682932758588e-06, + "loss": 3.4061, + "step": 1711 + }, + { + "epoch": 0.68, + "grad_norm": 4.099954943224899, + "learning_rate": 4.77363367492496e-06, + "loss": 3.3982, + "step": 1712 + }, + { + "epoch": 0.69, + "grad_norm": 3.4674350398982248, + "learning_rate": 4.7625931886353215e-06, + "loss": 3.5787, + "step": 1713 + }, + { + "epoch": 0.69, + "grad_norm": 3.8299806212591014, + "learning_rate": 4.7515614924192026e-06, + "loss": 3.2252, + "step": 1714 + }, + { + "epoch": 0.69, + "grad_norm": 3.3829512664027224, + "learning_rate": 4.740538604791371e-06, + "loss": 3.585, + "step": 1715 + }, + { + "epoch": 0.69, + "grad_norm": 3.5722636561234493, + "learning_rate": 4.729524544251837e-06, + "loss": 3.4258, + "step": 1716 + }, + { + "epoch": 0.69, + "grad_norm": 4.050179515902923, + "learning_rate": 4.718519329285771e-06, + "loss": 3.3031, + "step": 1717 + }, + { + "epoch": 0.69, + "grad_norm": 4.128925748003555, + "learning_rate": 4.707522978363508e-06, + "loss": 3.5009, + "step": 1718 + }, + { + "epoch": 0.69, + "grad_norm": 4.002009858329216, + "learning_rate": 4.696535509940499e-06, + "loss": 3.4921, + "step": 1719 + }, + { + "epoch": 0.69, + "grad_norm": 3.6431409914843242, + "learning_rate": 4.685556942457296e-06, + "loss": 3.5246, + "step": 1720 + }, + { + "epoch": 0.69, + "grad_norm": 3.8368553209068117, + "learning_rate": 4.674587294339513e-06, + "loss": 3.6919, + "step": 1721 + }, + { + "epoch": 0.69, + "grad_norm": 3.9066283355125186, + "learning_rate": 4.663626583997789e-06, + "loss": 3.388, + "step": 1722 + }, + { + "epoch": 0.69, + "grad_norm": 3.337474491676217, + "learning_rate": 4.652674829827762e-06, + "loss": 3.6723, + "step": 1723 + }, + { + "epoch": 0.69, + "grad_norm": 3.8716302465293615, + "learning_rate": 4.641732050210032e-06, + "loss": 3.5052, + "step": 1724 + }, + { + "epoch": 0.69, + "grad_norm": 3.5067245303171237, + "learning_rate": 4.630798263510162e-06, + "loss": 3.6115, + "step": 1725 + }, + { + "epoch": 0.69, + "grad_norm": 3.7223703218552573, + "learning_rate": 4.619873488078597e-06, + "loss": 3.3937, + "step": 1726 + }, + { + "epoch": 0.69, + "grad_norm": 4.510075235746018, + "learning_rate": 4.608957742250667e-06, + "loss": 3.5372, + "step": 1727 + }, + { + "epoch": 0.69, + "grad_norm": 3.9874952880001366, + "learning_rate": 4.598051044346542e-06, + "loss": 3.3717, + "step": 1728 + }, + { + "epoch": 0.69, + "grad_norm": 3.885816752060376, + "learning_rate": 4.587153412671217e-06, + "loss": 3.3987, + "step": 1729 + }, + { + "epoch": 0.69, + "grad_norm": 3.7827071572281445, + "learning_rate": 4.576264865514467e-06, + "loss": 3.1763, + "step": 1730 + }, + { + "epoch": 0.69, + "grad_norm": 3.890063791368322, + "learning_rate": 4.565385421150817e-06, + "loss": 3.5614, + "step": 1731 + }, + { + "epoch": 0.69, + "grad_norm": 3.783650330588731, + "learning_rate": 4.554515097839511e-06, + "loss": 3.4039, + "step": 1732 + }, + { + "epoch": 0.69, + "grad_norm": 3.973268905334733, + "learning_rate": 4.543653913824496e-06, + "loss": 3.3783, + "step": 1733 + }, + { + "epoch": 0.69, + "grad_norm": 3.422375253114076, + "learning_rate": 4.53280188733437e-06, + "loss": 
3.4258, + "step": 1734 + }, + { + "epoch": 0.69, + "grad_norm": 3.6091058977682224, + "learning_rate": 4.521959036582372e-06, + "loss": 3.5038, + "step": 1735 + }, + { + "epoch": 0.69, + "grad_norm": 3.449899993245845, + "learning_rate": 4.511125379766332e-06, + "loss": 3.3051, + "step": 1736 + }, + { + "epoch": 0.69, + "grad_norm": 3.3716372041379747, + "learning_rate": 4.500300935068647e-06, + "loss": 3.3973, + "step": 1737 + }, + { + "epoch": 0.7, + "grad_norm": 3.5533894107507735, + "learning_rate": 4.489485720656266e-06, + "loss": 3.4855, + "step": 1738 + }, + { + "epoch": 0.7, + "grad_norm": 3.559773371573033, + "learning_rate": 4.478679754680639e-06, + "loss": 3.4554, + "step": 1739 + }, + { + "epoch": 0.7, + "grad_norm": 4.138358956281304, + "learning_rate": 4.467883055277696e-06, + "loss": 3.5556, + "step": 1740 + }, + { + "epoch": 0.7, + "grad_norm": 3.049507644251867, + "learning_rate": 4.457095640567804e-06, + "loss": 3.5686, + "step": 1741 + }, + { + "epoch": 0.7, + "grad_norm": 3.11767434244862, + "learning_rate": 4.4463175286557654e-06, + "loss": 3.6089, + "step": 1742 + }, + { + "epoch": 0.7, + "grad_norm": 3.3567108518742557, + "learning_rate": 4.435548737630756e-06, + "loss": 3.4967, + "step": 1743 + }, + { + "epoch": 0.7, + "grad_norm": 3.6967941537174314, + "learning_rate": 4.4247892855663164e-06, + "loss": 3.3797, + "step": 1744 + }, + { + "epoch": 0.7, + "grad_norm": 3.179141600245913, + "learning_rate": 4.414039190520308e-06, + "loss": 3.5925, + "step": 1745 + }, + { + "epoch": 0.7, + "grad_norm": 3.453056516550807, + "learning_rate": 4.403298470534885e-06, + "loss": 3.6349, + "step": 1746 + }, + { + "epoch": 0.7, + "grad_norm": 3.5712888486599774, + "learning_rate": 4.39256714363648e-06, + "loss": 3.6258, + "step": 1747 + }, + { + "epoch": 0.7, + "grad_norm": 3.31493960973444, + "learning_rate": 4.3818452278357445e-06, + "loss": 3.4372, + "step": 1748 + }, + { + "epoch": 0.7, + "grad_norm": 4.171432917668506, + "learning_rate": 4.371132741127553e-06, + "loss": 3.3394, + "step": 1749 + }, + { + "epoch": 0.7, + "grad_norm": 3.656724448043528, + "learning_rate": 4.360429701490935e-06, + "loss": 3.3471, + "step": 1750 + }, + { + "epoch": 0.7, + "grad_norm": 4.186499874648426, + "learning_rate": 4.349736126889084e-06, + "loss": 3.3913, + "step": 1751 + }, + { + "epoch": 0.7, + "grad_norm": 3.8208946244534188, + "learning_rate": 4.339052035269291e-06, + "loss": 3.6172, + "step": 1752 + }, + { + "epoch": 0.7, + "grad_norm": 4.116100858707486, + "learning_rate": 4.328377444562948e-06, + "loss": 3.4454, + "step": 1753 + }, + { + "epoch": 0.7, + "grad_norm": 4.161503924768385, + "learning_rate": 4.31771237268549e-06, + "loss": 3.6466, + "step": 1754 + }, + { + "epoch": 0.7, + "grad_norm": 4.494899046455282, + "learning_rate": 4.307056837536373e-06, + "loss": 3.4494, + "step": 1755 + }, + { + "epoch": 0.7, + "grad_norm": 3.767898841687319, + "learning_rate": 4.296410856999062e-06, + "loss": 3.3613, + "step": 1756 + }, + { + "epoch": 0.7, + "grad_norm": 3.9982391161187314, + "learning_rate": 4.2857744489409725e-06, + "loss": 3.6314, + "step": 1757 + }, + { + "epoch": 0.7, + "grad_norm": 3.5058775605789996, + "learning_rate": 4.2751476312134655e-06, + "loss": 3.4211, + "step": 1758 + }, + { + "epoch": 0.7, + "grad_norm": 3.697058493781567, + "learning_rate": 4.264530421651792e-06, + "loss": 3.5555, + "step": 1759 + }, + { + "epoch": 0.7, + "grad_norm": 4.011432271491016, + "learning_rate": 4.2539228380750955e-06, + "loss": 3.241, + "step": 1760 + }, + { + "epoch": 0.7, + 
"grad_norm": 3.9185907235318886, + "learning_rate": 4.243324898286349e-06, + "loss": 3.4969, + "step": 1761 + }, + { + "epoch": 0.7, + "grad_norm": 3.5614235766322233, + "learning_rate": 4.2327366200723404e-06, + "loss": 3.3976, + "step": 1762 + }, + { + "epoch": 0.71, + "grad_norm": 3.8839560924349317, + "learning_rate": 4.222158021203657e-06, + "loss": 3.4789, + "step": 1763 + }, + { + "epoch": 0.71, + "grad_norm": 4.671243277730213, + "learning_rate": 4.211589119434622e-06, + "loss": 3.5638, + "step": 1764 + }, + { + "epoch": 0.71, + "grad_norm": 3.3993535836751265, + "learning_rate": 4.201029932503303e-06, + "loss": 3.5107, + "step": 1765 + }, + { + "epoch": 0.71, + "grad_norm": 3.013661425202177, + "learning_rate": 4.190480478131443e-06, + "loss": 3.2718, + "step": 1766 + }, + { + "epoch": 0.71, + "grad_norm": 3.3719456559452223, + "learning_rate": 4.179940774024469e-06, + "loss": 3.5625, + "step": 1767 + }, + { + "epoch": 0.71, + "grad_norm": 3.2931148487027095, + "learning_rate": 4.169410837871427e-06, + "loss": 3.4839, + "step": 1768 + }, + { + "epoch": 0.71, + "grad_norm": 3.2833663912883853, + "learning_rate": 4.158890687344986e-06, + "loss": 3.3587, + "step": 1769 + }, + { + "epoch": 0.71, + "grad_norm": 3.8882103322283528, + "learning_rate": 4.14838034010138e-06, + "loss": 3.3663, + "step": 1770 + }, + { + "epoch": 0.71, + "grad_norm": 3.8955310931768823, + "learning_rate": 4.137879813780388e-06, + "loss": 3.4513, + "step": 1771 + }, + { + "epoch": 0.71, + "grad_norm": 3.636956183849931, + "learning_rate": 4.127389126005319e-06, + "loss": 3.5105, + "step": 1772 + }, + { + "epoch": 0.71, + "grad_norm": 3.6484577281266577, + "learning_rate": 4.116908294382956e-06, + "loss": 3.6188, + "step": 1773 + }, + { + "epoch": 0.71, + "grad_norm": 4.0266571534966005, + "learning_rate": 4.10643733650355e-06, + "loss": 3.304, + "step": 1774 + }, + { + "epoch": 0.71, + "grad_norm": 3.88428003537644, + "learning_rate": 4.095976269940777e-06, + "loss": 3.3975, + "step": 1775 + }, + { + "epoch": 0.71, + "grad_norm": 3.6192466745520893, + "learning_rate": 4.085525112251706e-06, + "loss": 3.5155, + "step": 1776 + }, + { + "epoch": 0.71, + "grad_norm": 3.704237885509808, + "learning_rate": 4.0750838809767875e-06, + "loss": 3.5143, + "step": 1777 + }, + { + "epoch": 0.71, + "grad_norm": 3.678360702162961, + "learning_rate": 4.0646525936398086e-06, + "loss": 3.542, + "step": 1778 + }, + { + "epoch": 0.71, + "grad_norm": 4.389314698208116, + "learning_rate": 4.054231267747862e-06, + "loss": 3.5115, + "step": 1779 + }, + { + "epoch": 0.71, + "grad_norm": 3.701937280915647, + "learning_rate": 4.043819920791322e-06, + "loss": 3.4142, + "step": 1780 + }, + { + "epoch": 0.71, + "grad_norm": 3.9218694258931435, + "learning_rate": 4.033418570243819e-06, + "loss": 3.4976, + "step": 1781 + }, + { + "epoch": 0.71, + "grad_norm": 3.721482825872019, + "learning_rate": 4.0230272335622065e-06, + "loss": 3.4643, + "step": 1782 + }, + { + "epoch": 0.71, + "grad_norm": 3.6470247224867243, + "learning_rate": 4.012645928186533e-06, + "loss": 3.5088, + "step": 1783 + }, + { + "epoch": 0.71, + "grad_norm": 3.7696088401191385, + "learning_rate": 4.002274671540006e-06, + "loss": 3.2971, + "step": 1784 + }, + { + "epoch": 0.71, + "grad_norm": 3.7052611261579202, + "learning_rate": 3.991913481028965e-06, + "loss": 3.5161, + "step": 1785 + }, + { + "epoch": 0.71, + "grad_norm": 3.7585081410257684, + "learning_rate": 3.981562374042867e-06, + "loss": 3.1938, + "step": 1786 + }, + { + "epoch": 0.71, + "grad_norm": 
3.9585861280160684, + "learning_rate": 3.971221367954239e-06, + "loss": 3.4439, + "step": 1787 + }, + { + "epoch": 0.72, + "grad_norm": 3.283636045591956, + "learning_rate": 3.960890480118653e-06, + "loss": 3.6489, + "step": 1788 + }, + { + "epoch": 0.72, + "grad_norm": 3.654578635777536, + "learning_rate": 3.950569727874704e-06, + "loss": 3.4375, + "step": 1789 + }, + { + "epoch": 0.72, + "grad_norm": 4.169022532820428, + "learning_rate": 3.940259128543967e-06, + "loss": 3.499, + "step": 1790 + }, + { + "epoch": 0.72, + "grad_norm": 3.3344543139240477, + "learning_rate": 3.9299586994309905e-06, + "loss": 3.4579, + "step": 1791 + }, + { + "epoch": 0.72, + "grad_norm": 3.746352581841832, + "learning_rate": 3.919668457823248e-06, + "loss": 3.4583, + "step": 1792 + }, + { + "epoch": 0.72, + "grad_norm": 3.6192979524953657, + "learning_rate": 3.909388420991113e-06, + "loss": 3.4426, + "step": 1793 + }, + { + "epoch": 0.72, + "grad_norm": 3.736628337328482, + "learning_rate": 3.899118606187832e-06, + "loss": 3.5413, + "step": 1794 + }, + { + "epoch": 0.72, + "grad_norm": 3.4436637095613287, + "learning_rate": 3.888859030649498e-06, + "loss": 3.4048, + "step": 1795 + }, + { + "epoch": 0.72, + "grad_norm": 3.2072769723867838, + "learning_rate": 3.878609711595022e-06, + "loss": 3.3558, + "step": 1796 + }, + { + "epoch": 0.72, + "grad_norm": 3.7640368299387825, + "learning_rate": 3.8683706662260945e-06, + "loss": 3.4847, + "step": 1797 + }, + { + "epoch": 0.72, + "grad_norm": 3.436055272932363, + "learning_rate": 3.858141911727168e-06, + "loss": 3.2878, + "step": 1798 + }, + { + "epoch": 0.72, + "grad_norm": 3.529156547637179, + "learning_rate": 3.847923465265418e-06, + "loss": 3.7081, + "step": 1799 + }, + { + "epoch": 0.72, + "grad_norm": 3.468301049973078, + "learning_rate": 3.837715343990727e-06, + "loss": 3.3457, + "step": 1800 + }, + { + "epoch": 0.72, + "grad_norm": 3.5741394728587204, + "learning_rate": 3.8275175650356485e-06, + "loss": 3.523, + "step": 1801 + }, + { + "epoch": 0.72, + "grad_norm": 3.6469623554292134, + "learning_rate": 3.817330145515374e-06, + "loss": 3.3227, + "step": 1802 + }, + { + "epoch": 0.72, + "grad_norm": 3.332529000908109, + "learning_rate": 3.807153102527704e-06, + "loss": 3.3699, + "step": 1803 + }, + { + "epoch": 0.72, + "grad_norm": 4.208393616053263, + "learning_rate": 3.7969864531530344e-06, + "loss": 3.4386, + "step": 1804 + }, + { + "epoch": 0.72, + "grad_norm": 3.512302703646855, + "learning_rate": 3.7868302144543146e-06, + "loss": 3.6729, + "step": 1805 + }, + { + "epoch": 0.72, + "grad_norm": 3.7091168641858503, + "learning_rate": 3.7766844034770155e-06, + "loss": 3.7557, + "step": 1806 + }, + { + "epoch": 0.72, + "grad_norm": 4.132021104400233, + "learning_rate": 3.766549037249112e-06, + "loss": 3.3382, + "step": 1807 + }, + { + "epoch": 0.72, + "grad_norm": 4.002942906418598, + "learning_rate": 3.7564241327810436e-06, + "loss": 3.4619, + "step": 1808 + }, + { + "epoch": 0.72, + "grad_norm": 3.9271576290560906, + "learning_rate": 3.7463097070656995e-06, + "loss": 3.5258, + "step": 1809 + }, + { + "epoch": 0.72, + "grad_norm": 3.98998238474593, + "learning_rate": 3.736205777078381e-06, + "loss": 3.4328, + "step": 1810 + }, + { + "epoch": 0.72, + "grad_norm": 4.765986861107831, + "learning_rate": 3.72611235977677e-06, + "loss": 3.371, + "step": 1811 + }, + { + "epoch": 0.72, + "grad_norm": 4.13175319308379, + "learning_rate": 3.7160294721009026e-06, + "loss": 3.2895, + "step": 1812 + }, + { + "epoch": 0.73, + "grad_norm": 4.5676925128829895, + 
"learning_rate": 3.705957130973149e-06, + "loss": 3.3525, + "step": 1813 + }, + { + "epoch": 0.73, + "grad_norm": 4.065579675390456, + "learning_rate": 3.69589535329818e-06, + "loss": 3.4717, + "step": 1814 + }, + { + "epoch": 0.73, + "grad_norm": 3.6964342085032156, + "learning_rate": 3.685844155962931e-06, + "loss": 3.374, + "step": 1815 + }, + { + "epoch": 0.73, + "grad_norm": 3.5454709243268656, + "learning_rate": 3.675803555836582e-06, + "loss": 3.2699, + "step": 1816 + }, + { + "epoch": 0.73, + "grad_norm": 3.6910173645241513, + "learning_rate": 3.6657735697705267e-06, + "loss": 3.2236, + "step": 1817 + }, + { + "epoch": 0.73, + "grad_norm": 3.588311501677217, + "learning_rate": 3.6557542145983495e-06, + "loss": 3.5066, + "step": 1818 + }, + { + "epoch": 0.73, + "grad_norm": 4.651925132300316, + "learning_rate": 3.6457455071357918e-06, + "loss": 3.3634, + "step": 1819 + }, + { + "epoch": 0.73, + "grad_norm": 4.250683821727388, + "learning_rate": 3.63574746418072e-06, + "loss": 3.267, + "step": 1820 + }, + { + "epoch": 0.73, + "grad_norm": 3.429181473137611, + "learning_rate": 3.625760102513103e-06, + "loss": 3.5255, + "step": 1821 + }, + { + "epoch": 0.73, + "grad_norm": 3.1222267069199283, + "learning_rate": 3.6157834388949907e-06, + "loss": 3.5681, + "step": 1822 + }, + { + "epoch": 0.73, + "grad_norm": 3.43599969407311, + "learning_rate": 3.6058174900704646e-06, + "loss": 3.5532, + "step": 1823 + }, + { + "epoch": 0.73, + "grad_norm": 3.9924024367280544, + "learning_rate": 3.595862272765638e-06, + "loss": 3.3557, + "step": 1824 + }, + { + "epoch": 0.73, + "grad_norm": 4.335774679391719, + "learning_rate": 3.585917803688603e-06, + "loss": 3.1922, + "step": 1825 + }, + { + "epoch": 0.73, + "grad_norm": 3.317027772024, + "learning_rate": 3.5759840995294136e-06, + "loss": 3.3783, + "step": 1826 + }, + { + "epoch": 0.73, + "grad_norm": 3.805915177072342, + "learning_rate": 3.5660611769600604e-06, + "loss": 3.5855, + "step": 1827 + }, + { + "epoch": 0.73, + "grad_norm": 3.9192673343873063, + "learning_rate": 3.556149052634443e-06, + "loss": 3.4866, + "step": 1828 + }, + { + "epoch": 0.73, + "grad_norm": 3.484350084902121, + "learning_rate": 3.546247743188328e-06, + "loss": 3.5675, + "step": 1829 + }, + { + "epoch": 0.73, + "grad_norm": 3.103281813841342, + "learning_rate": 3.536357265239333e-06, + "loss": 3.5552, + "step": 1830 + }, + { + "epoch": 0.73, + "grad_norm": 3.450382295757395, + "learning_rate": 3.5264776353869046e-06, + "loss": 3.4087, + "step": 1831 + }, + { + "epoch": 0.73, + "grad_norm": 4.437727669001523, + "learning_rate": 3.5166088702122738e-06, + "loss": 3.3265, + "step": 1832 + }, + { + "epoch": 0.73, + "grad_norm": 4.346381212758512, + "learning_rate": 3.5067509862784455e-06, + "loss": 3.4703, + "step": 1833 + }, + { + "epoch": 0.73, + "grad_norm": 3.587343604875265, + "learning_rate": 3.4969040001301513e-06, + "loss": 3.6085, + "step": 1834 + }, + { + "epoch": 0.73, + "grad_norm": 3.854682116713474, + "learning_rate": 3.487067928293848e-06, + "loss": 3.4987, + "step": 1835 + }, + { + "epoch": 0.73, + "grad_norm": 3.9170447966343542, + "learning_rate": 3.4772427872776606e-06, + "loss": 3.3871, + "step": 1836 + }, + { + "epoch": 0.73, + "grad_norm": 3.652408689543764, + "learning_rate": 3.4674285935713715e-06, + "loss": 3.5386, + "step": 1837 + }, + { + "epoch": 0.74, + "grad_norm": 3.6076480873164796, + "learning_rate": 3.4576253636463996e-06, + "loss": 3.4289, + "step": 1838 + }, + { + "epoch": 0.74, + "grad_norm": 3.633468327300971, + "learning_rate": 
3.4478331139557475e-06, + "loss": 3.4698, + "step": 1839 + }, + { + "epoch": 0.74, + "grad_norm": 3.584011854682265, + "learning_rate": 3.4380518609340076e-06, + "loss": 3.3809, + "step": 1840 + }, + { + "epoch": 0.74, + "grad_norm": 3.4086643945235453, + "learning_rate": 3.428281620997296e-06, + "loss": 3.4472, + "step": 1841 + }, + { + "epoch": 0.74, + "grad_norm": 3.7236703417587798, + "learning_rate": 3.418522410543266e-06, + "loss": 3.3683, + "step": 1842 + }, + { + "epoch": 0.74, + "grad_norm": 4.516193849667952, + "learning_rate": 3.4087742459510396e-06, + "loss": 3.3648, + "step": 1843 + }, + { + "epoch": 0.74, + "grad_norm": 4.0536722168971995, + "learning_rate": 3.3990371435812185e-06, + "loss": 3.3732, + "step": 1844 + }, + { + "epoch": 0.74, + "grad_norm": 3.7285338094616565, + "learning_rate": 3.3893111197758276e-06, + "loss": 3.3925, + "step": 1845 + }, + { + "epoch": 0.74, + "grad_norm": 3.7303715346811033, + "learning_rate": 3.3795961908582965e-06, + "loss": 3.3005, + "step": 1846 + }, + { + "epoch": 0.74, + "grad_norm": 3.9474329011980913, + "learning_rate": 3.3698923731334453e-06, + "loss": 3.4754, + "step": 1847 + }, + { + "epoch": 0.74, + "grad_norm": 3.5606404505123774, + "learning_rate": 3.360199682887433e-06, + "loss": 3.675, + "step": 1848 + }, + { + "epoch": 0.74, + "grad_norm": 3.7856691217724734, + "learning_rate": 3.3505181363877536e-06, + "loss": 3.342, + "step": 1849 + }, + { + "epoch": 0.74, + "grad_norm": 3.300592236425659, + "learning_rate": 3.3408477498831917e-06, + "loss": 3.2336, + "step": 1850 + }, + { + "epoch": 0.74, + "grad_norm": 3.511252927635383, + "learning_rate": 3.3311885396038002e-06, + "loss": 3.47, + "step": 1851 + }, + { + "epoch": 0.74, + "grad_norm": 3.496345516082245, + "learning_rate": 3.321540521760883e-06, + "loss": 3.4671, + "step": 1852 + }, + { + "epoch": 0.74, + "grad_norm": 3.20909351432755, + "learning_rate": 3.3119037125469553e-06, + "loss": 3.6159, + "step": 1853 + }, + { + "epoch": 0.74, + "grad_norm": 3.5867603857436006, + "learning_rate": 3.3022781281357184e-06, + "loss": 3.7095, + "step": 1854 + }, + { + "epoch": 0.74, + "grad_norm": 3.4734426795727376, + "learning_rate": 3.2926637846820366e-06, + "loss": 3.5205, + "step": 1855 + }, + { + "epoch": 0.74, + "grad_norm": 4.055599777957748, + "learning_rate": 3.2830606983219038e-06, + "loss": 3.4546, + "step": 1856 + }, + { + "epoch": 0.74, + "grad_norm": 4.196656750316166, + "learning_rate": 3.2734688851724273e-06, + "loss": 3.3104, + "step": 1857 + }, + { + "epoch": 0.74, + "grad_norm": 3.761411862813457, + "learning_rate": 3.2638883613317974e-06, + "loss": 3.4606, + "step": 1858 + }, + { + "epoch": 0.74, + "grad_norm": 3.2224622867474353, + "learning_rate": 3.2543191428792466e-06, + "loss": 3.6587, + "step": 1859 + }, + { + "epoch": 0.74, + "grad_norm": 3.252120608521762, + "learning_rate": 3.2447612458750365e-06, + "loss": 3.3622, + "step": 1860 + }, + { + "epoch": 0.74, + "grad_norm": 3.4210000097935085, + "learning_rate": 3.2352146863604317e-06, + "loss": 3.3387, + "step": 1861 + }, + { + "epoch": 0.74, + "grad_norm": 3.9081317042211747, + "learning_rate": 3.2256794803576707e-06, + "loss": 3.5707, + "step": 1862 + }, + { + "epoch": 0.75, + "grad_norm": 4.2730047394313795, + "learning_rate": 3.2161556438699303e-06, + "loss": 3.5105, + "step": 1863 + }, + { + "epoch": 0.75, + "grad_norm": 3.566834764286657, + "learning_rate": 3.2066431928813068e-06, + "loss": 3.5039, + "step": 1864 + }, + { + "epoch": 0.75, + "grad_norm": 3.4615537663089855, + "learning_rate": 
3.197142143356787e-06, + "loss": 3.2387, + "step": 1865 + }, + { + "epoch": 0.75, + "grad_norm": 3.7727034614513606, + "learning_rate": 3.1876525112422283e-06, + "loss": 3.4536, + "step": 1866 + }, + { + "epoch": 0.75, + "grad_norm": 3.850992472141217, + "learning_rate": 3.178174312464326e-06, + "loss": 3.4271, + "step": 1867 + }, + { + "epoch": 0.75, + "grad_norm": 3.4404782089592967, + "learning_rate": 3.1687075629305787e-06, + "loss": 3.5452, + "step": 1868 + }, + { + "epoch": 0.75, + "grad_norm": 3.4309534748380246, + "learning_rate": 3.1592522785292714e-06, + "loss": 3.5336, + "step": 1869 + }, + { + "epoch": 0.75, + "grad_norm": 3.9067620491946853, + "learning_rate": 3.1498084751294523e-06, + "loss": 3.598, + "step": 1870 + }, + { + "epoch": 0.75, + "grad_norm": 3.448718948350417, + "learning_rate": 3.1403761685809007e-06, + "loss": 3.3707, + "step": 1871 + }, + { + "epoch": 0.75, + "grad_norm": 3.3574537673903415, + "learning_rate": 3.130955374714094e-06, + "loss": 3.4151, + "step": 1872 + }, + { + "epoch": 0.75, + "grad_norm": 3.542471370777354, + "learning_rate": 3.1215461093401913e-06, + "loss": 3.2864, + "step": 1873 + }, + { + "epoch": 0.75, + "grad_norm": 3.96498352917791, + "learning_rate": 3.1121483882509996e-06, + "loss": 3.4146, + "step": 1874 + }, + { + "epoch": 0.75, + "grad_norm": 2.9306184568657394, + "learning_rate": 3.1027622272189572e-06, + "loss": 3.6675, + "step": 1875 + }, + { + "epoch": 0.75, + "grad_norm": 3.563712587378033, + "learning_rate": 3.0933876419971008e-06, + "loss": 3.5339, + "step": 1876 + }, + { + "epoch": 0.75, + "grad_norm": 3.656991955040363, + "learning_rate": 3.0840246483190338e-06, + "loss": 3.4484, + "step": 1877 + }, + { + "epoch": 0.75, + "grad_norm": 3.8382126444965428, + "learning_rate": 3.074673261898903e-06, + "loss": 3.4853, + "step": 1878 + }, + { + "epoch": 0.75, + "grad_norm": 3.775782565231201, + "learning_rate": 3.065333498431381e-06, + "loss": 3.193, + "step": 1879 + }, + { + "epoch": 0.75, + "grad_norm": 4.55715379265718, + "learning_rate": 3.0560053735916372e-06, + "loss": 3.3536, + "step": 1880 + }, + { + "epoch": 0.75, + "grad_norm": 3.852296770838666, + "learning_rate": 3.0466889030352976e-06, + "loss": 3.5737, + "step": 1881 + }, + { + "epoch": 0.75, + "grad_norm": 3.9813297344776477, + "learning_rate": 3.037384102398431e-06, + "loss": 3.6859, + "step": 1882 + }, + { + "epoch": 0.75, + "grad_norm": 3.7225704836822513, + "learning_rate": 3.0280909872975194e-06, + "loss": 3.4207, + "step": 1883 + }, + { + "epoch": 0.75, + "grad_norm": 3.424724501437131, + "learning_rate": 3.0188095733294388e-06, + "loss": 3.4869, + "step": 1884 + }, + { + "epoch": 0.75, + "grad_norm": 3.5295304976839144, + "learning_rate": 3.009539876071427e-06, + "loss": 3.3601, + "step": 1885 + }, + { + "epoch": 0.75, + "grad_norm": 3.5881747955842727, + "learning_rate": 3.0002819110810475e-06, + "loss": 3.6793, + "step": 1886 + }, + { + "epoch": 0.75, + "grad_norm": 3.9583141250494607, + "learning_rate": 2.9910356938961782e-06, + "loss": 3.0941, + "step": 1887 + }, + { + "epoch": 0.76, + "grad_norm": 3.912358664094153, + "learning_rate": 2.981801240034985e-06, + "loss": 3.3709, + "step": 1888 + }, + { + "epoch": 0.76, + "grad_norm": 3.263720825562735, + "learning_rate": 2.9725785649958895e-06, + "loss": 3.3977, + "step": 1889 + }, + { + "epoch": 0.76, + "grad_norm": 3.509980044518029, + "learning_rate": 2.9633676842575386e-06, + "loss": 3.3369, + "step": 1890 + }, + { + "epoch": 0.76, + "grad_norm": 3.1011270995474804, + "learning_rate": 
2.9541686132787907e-06, + "loss": 3.4079, + "step": 1891 + }, + { + "epoch": 0.76, + "grad_norm": 3.0679947729345174, + "learning_rate": 2.944981367498677e-06, + "loss": 3.3624, + "step": 1892 + }, + { + "epoch": 0.76, + "grad_norm": 3.288148926844505, + "learning_rate": 2.93580596233639e-06, + "loss": 3.4671, + "step": 1893 + }, + { + "epoch": 0.76, + "grad_norm": 3.6214058326443292, + "learning_rate": 2.9266424131912495e-06, + "loss": 3.3457, + "step": 1894 + }, + { + "epoch": 0.76, + "grad_norm": 3.53592212281632, + "learning_rate": 2.9174907354426696e-06, + "loss": 3.3712, + "step": 1895 + }, + { + "epoch": 0.76, + "grad_norm": 3.5987696162272527, + "learning_rate": 2.9083509444501433e-06, + "loss": 3.3665, + "step": 1896 + }, + { + "epoch": 0.76, + "grad_norm": 3.085596638238194, + "learning_rate": 2.899223055553221e-06, + "loss": 3.6694, + "step": 1897 + }, + { + "epoch": 0.76, + "grad_norm": 3.4340867270724083, + "learning_rate": 2.890107084071465e-06, + "loss": 3.316, + "step": 1898 + }, + { + "epoch": 0.76, + "grad_norm": 4.285488215778594, + "learning_rate": 2.881003045304448e-06, + "loss": 3.4162, + "step": 1899 + }, + { + "epoch": 0.76, + "grad_norm": 3.6478133045438454, + "learning_rate": 2.8719109545317102e-06, + "loss": 3.4196, + "step": 1900 + }, + { + "epoch": 0.76, + "grad_norm": 3.4821976881960612, + "learning_rate": 2.8628308270127335e-06, + "loss": 3.2749, + "step": 1901 + }, + { + "epoch": 0.76, + "grad_norm": 3.8625654362844153, + "learning_rate": 2.853762677986932e-06, + "loss": 3.1237, + "step": 1902 + }, + { + "epoch": 0.76, + "grad_norm": 3.9299865920722934, + "learning_rate": 2.844706522673616e-06, + "loss": 3.5478, + "step": 1903 + }, + { + "epoch": 0.76, + "grad_norm": 3.410719964347605, + "learning_rate": 2.835662376271957e-06, + "loss": 3.7158, + "step": 1904 + }, + { + "epoch": 0.76, + "grad_norm": 3.873022927865709, + "learning_rate": 2.8266302539609747e-06, + "loss": 3.4461, + "step": 1905 + }, + { + "epoch": 0.76, + "grad_norm": 3.67026539624037, + "learning_rate": 2.8176101708995174e-06, + "loss": 3.4867, + "step": 1906 + }, + { + "epoch": 0.76, + "grad_norm": 3.9182061324797237, + "learning_rate": 2.808602142226212e-06, + "loss": 3.2738, + "step": 1907 + }, + { + "epoch": 0.76, + "grad_norm": 4.124918425721532, + "learning_rate": 2.7996061830594714e-06, + "loss": 3.4517, + "step": 1908 + }, + { + "epoch": 0.76, + "grad_norm": 4.105538309675829, + "learning_rate": 2.7906223084974405e-06, + "loss": 3.2271, + "step": 1909 + }, + { + "epoch": 0.76, + "grad_norm": 3.3859102048877254, + "learning_rate": 2.78165053361798e-06, + "loss": 3.3504, + "step": 1910 + }, + { + "epoch": 0.76, + "grad_norm": 3.7425966269377486, + "learning_rate": 2.772690873478656e-06, + "loss": 3.3253, + "step": 1911 + }, + { + "epoch": 0.76, + "grad_norm": 3.310424851528983, + "learning_rate": 2.7637433431166903e-06, + "loss": 3.2082, + "step": 1912 + }, + { + "epoch": 0.77, + "grad_norm": 3.221875373344164, + "learning_rate": 2.754807957548955e-06, + "loss": 3.6851, + "step": 1913 + }, + { + "epoch": 0.77, + "grad_norm": 3.5123775223492832, + "learning_rate": 2.745884731771931e-06, + "loss": 3.5482, + "step": 1914 + }, + { + "epoch": 0.77, + "grad_norm": 3.044351047721393, + "learning_rate": 2.736973680761702e-06, + "loss": 3.4826, + "step": 1915 + }, + { + "epoch": 0.77, + "grad_norm": 4.1297812159014615, + "learning_rate": 2.728074819473908e-06, + "loss": 3.4117, + "step": 1916 + }, + { + "epoch": 0.77, + "grad_norm": 3.8714590716432395, + "learning_rate": 2.7191881628437335e-06, 
+ "loss": 3.479, + "step": 1917 + }, + { + "epoch": 0.77, + "grad_norm": 3.5456711355596324, + "learning_rate": 2.7103137257858867e-06, + "loss": 3.4652, + "step": 1918 + }, + { + "epoch": 0.77, + "grad_norm": 3.8344813934172226, + "learning_rate": 2.7014515231945557e-06, + "loss": 3.4678, + "step": 1919 + }, + { + "epoch": 0.77, + "grad_norm": 3.4176638962488086, + "learning_rate": 2.692601569943407e-06, + "loss": 3.5222, + "step": 1920 + }, + { + "epoch": 0.77, + "grad_norm": 4.1479220590942765, + "learning_rate": 2.683763880885538e-06, + "loss": 3.5278, + "step": 1921 + }, + { + "epoch": 0.77, + "grad_norm": 3.7676113482167803, + "learning_rate": 2.674938470853472e-06, + "loss": 3.5333, + "step": 1922 + }, + { + "epoch": 0.77, + "grad_norm": 3.506946240907651, + "learning_rate": 2.6661253546591158e-06, + "loss": 3.6749, + "step": 1923 + }, + { + "epoch": 0.77, + "grad_norm": 3.6055160050017987, + "learning_rate": 2.6573245470937527e-06, + "loss": 3.518, + "step": 1924 + }, + { + "epoch": 0.77, + "grad_norm": 3.7106478573047106, + "learning_rate": 2.648536062927999e-06, + "loss": 3.54, + "step": 1925 + }, + { + "epoch": 0.77, + "grad_norm": 3.5451801376739276, + "learning_rate": 2.639759916911788e-06, + "loss": 3.5237, + "step": 1926 + }, + { + "epoch": 0.77, + "grad_norm": 3.8798623404933728, + "learning_rate": 2.6309961237743587e-06, + "loss": 3.491, + "step": 1927 + }, + { + "epoch": 0.77, + "grad_norm": 3.5768159862331284, + "learning_rate": 2.6222446982242e-06, + "loss": 3.2704, + "step": 1928 + }, + { + "epoch": 0.77, + "grad_norm": 4.269484671851689, + "learning_rate": 2.61350565494906e-06, + "loss": 3.4937, + "step": 1929 + }, + { + "epoch": 0.77, + "grad_norm": 3.332641827711261, + "learning_rate": 2.604779008615895e-06, + "loss": 3.5345, + "step": 1930 + }, + { + "epoch": 0.77, + "grad_norm": 3.829641692493816, + "learning_rate": 2.5960647738708553e-06, + "loss": 3.3914, + "step": 1931 + }, + { + "epoch": 0.77, + "grad_norm": 3.231876182233534, + "learning_rate": 2.5873629653392653e-06, + "loss": 3.5457, + "step": 1932 + }, + { + "epoch": 0.77, + "grad_norm": 3.92942692841208, + "learning_rate": 2.578673597625597e-06, + "loss": 3.4388, + "step": 1933 + }, + { + "epoch": 0.77, + "grad_norm": 4.207353567728463, + "learning_rate": 2.569996685313434e-06, + "loss": 3.4373, + "step": 1934 + }, + { + "epoch": 0.77, + "grad_norm": 3.81926065122999, + "learning_rate": 2.5613322429654573e-06, + "loss": 3.4003, + "step": 1935 + }, + { + "epoch": 0.77, + "grad_norm": 3.736117305431284, + "learning_rate": 2.5526802851234268e-06, + "loss": 3.259, + "step": 1936 + }, + { + "epoch": 0.77, + "grad_norm": 3.8281798178776683, + "learning_rate": 2.5440408263081385e-06, + "loss": 3.5264, + "step": 1937 + }, + { + "epoch": 0.78, + "grad_norm": 4.418895548861695, + "learning_rate": 2.535413881019423e-06, + "loss": 3.3591, + "step": 1938 + }, + { + "epoch": 0.78, + "grad_norm": 3.4143628309124843, + "learning_rate": 2.526799463736099e-06, + "loss": 3.3624, + "step": 1939 + }, + { + "epoch": 0.78, + "grad_norm": 4.041022539593813, + "learning_rate": 2.5181975889159615e-06, + "loss": 3.5832, + "step": 1940 + }, + { + "epoch": 0.78, + "grad_norm": 3.6739434254765775, + "learning_rate": 2.509608270995758e-06, + "loss": 3.3057, + "step": 1941 + }, + { + "epoch": 0.78, + "grad_norm": 4.493683119459877, + "learning_rate": 2.501031524391163e-06, + "loss": 3.1586, + "step": 1942 + }, + { + "epoch": 0.78, + "grad_norm": 3.8597452477414396, + "learning_rate": 2.492467363496747e-06, + "loss": 3.3849, + "step": 
1943 + }, + { + "epoch": 0.78, + "grad_norm": 3.817698743749816, + "learning_rate": 2.483915802685959e-06, + "loss": 3.46, + "step": 1944 + }, + { + "epoch": 0.78, + "grad_norm": 4.0309323747808055, + "learning_rate": 2.475376856311097e-06, + "loss": 3.5345, + "step": 1945 + }, + { + "epoch": 0.78, + "grad_norm": 3.4469163019967795, + "learning_rate": 2.4668505387033025e-06, + "loss": 3.3929, + "step": 1946 + }, + { + "epoch": 0.78, + "grad_norm": 3.761660769645593, + "learning_rate": 2.458336864172508e-06, + "loss": 3.4265, + "step": 1947 + }, + { + "epoch": 0.78, + "grad_norm": 3.534763224235543, + "learning_rate": 2.44983584700743e-06, + "loss": 3.696, + "step": 1948 + }, + { + "epoch": 0.78, + "grad_norm": 3.3341157071801257, + "learning_rate": 2.4413475014755396e-06, + "loss": 3.385, + "step": 1949 + }, + { + "epoch": 0.78, + "grad_norm": 4.9798835388607525, + "learning_rate": 2.432871841823047e-06, + "loss": 3.2967, + "step": 1950 + }, + { + "epoch": 0.78, + "grad_norm": 3.9600306056181904, + "learning_rate": 2.42440888227487e-06, + "loss": 3.2849, + "step": 1951 + }, + { + "epoch": 0.78, + "grad_norm": 4.104498375827391, + "learning_rate": 2.415958637034609e-06, + "loss": 3.4203, + "step": 1952 + }, + { + "epoch": 0.78, + "grad_norm": 3.7502083975036826, + "learning_rate": 2.407521120284523e-06, + "loss": 3.3641, + "step": 1953 + }, + { + "epoch": 0.78, + "grad_norm": 3.5881591927638516, + "learning_rate": 2.3990963461855075e-06, + "loss": 3.2882, + "step": 1954 + }, + { + "epoch": 0.78, + "grad_norm": 3.428645383919292, + "learning_rate": 2.390684328877089e-06, + "loss": 3.4263, + "step": 1955 + }, + { + "epoch": 0.78, + "grad_norm": 3.721307523264979, + "learning_rate": 2.3822850824773623e-06, + "loss": 3.4291, + "step": 1956 + }, + { + "epoch": 0.78, + "grad_norm": 3.6339505062874577, + "learning_rate": 2.3738986210829997e-06, + "loss": 3.5408, + "step": 1957 + }, + { + "epoch": 0.78, + "grad_norm": 3.7015671534693193, + "learning_rate": 2.3655249587692073e-06, + "loss": 3.367, + "step": 1958 + }, + { + "epoch": 0.78, + "grad_norm": 3.4071277961892066, + "learning_rate": 2.3571641095897223e-06, + "loss": 3.1734, + "step": 1959 + }, + { + "epoch": 0.78, + "grad_norm": 3.704927639099572, + "learning_rate": 2.3488160875767717e-06, + "loss": 3.4119, + "step": 1960 + }, + { + "epoch": 0.78, + "grad_norm": 3.1341363150404877, + "learning_rate": 2.340480906741053e-06, + "loss": 3.2445, + "step": 1961 + }, + { + "epoch": 0.78, + "grad_norm": 3.58868146610442, + "learning_rate": 2.332158581071712e-06, + "loss": 3.5284, + "step": 1962 + }, + { + "epoch": 0.79, + "grad_norm": 3.31203736033518, + "learning_rate": 2.323849124536315e-06, + "loss": 3.4336, + "step": 1963 + }, + { + "epoch": 0.79, + "grad_norm": 4.813832593935863, + "learning_rate": 2.3155525510808453e-06, + "loss": 3.509, + "step": 1964 + }, + { + "epoch": 0.79, + "grad_norm": 4.0666930929331615, + "learning_rate": 2.307268874629649e-06, + "loss": 3.3478, + "step": 1965 + }, + { + "epoch": 0.79, + "grad_norm": 3.479016340534677, + "learning_rate": 2.2989981090854306e-06, + "loss": 3.5218, + "step": 1966 + }, + { + "epoch": 0.79, + "grad_norm": 4.087492566542144, + "learning_rate": 2.2907402683292268e-06, + "loss": 3.4544, + "step": 1967 + }, + { + "epoch": 0.79, + "grad_norm": 3.227066132714872, + "learning_rate": 2.2824953662203832e-06, + "loss": 3.4573, + "step": 1968 + }, + { + "epoch": 0.79, + "grad_norm": 3.859067379027617, + "learning_rate": 2.2742634165965317e-06, + "loss": 3.4539, + "step": 1969 + }, + { + "epoch": 
0.79, + "grad_norm": 3.413802073964399, + "learning_rate": 2.266044433273562e-06, + "loss": 3.3942, + "step": 1970 + }, + { + "epoch": 0.79, + "grad_norm": 3.086380726334998, + "learning_rate": 2.2578384300456014e-06, + "loss": 3.5544, + "step": 1971 + }, + { + "epoch": 0.79, + "grad_norm": 3.591331242244212, + "learning_rate": 2.249645420684998e-06, + "loss": 3.1801, + "step": 1972 + }, + { + "epoch": 0.79, + "grad_norm": 3.990643884284416, + "learning_rate": 2.2414654189422845e-06, + "loss": 3.3328, + "step": 1973 + }, + { + "epoch": 0.79, + "grad_norm": 3.7177062831411893, + "learning_rate": 2.233298438546172e-06, + "loss": 3.5575, + "step": 1974 + }, + { + "epoch": 0.79, + "grad_norm": 4.057689293818614, + "learning_rate": 2.2251444932035094e-06, + "loss": 3.5605, + "step": 1975 + }, + { + "epoch": 0.79, + "grad_norm": 3.5873009841450383, + "learning_rate": 2.2170035965992674e-06, + "loss": 3.5157, + "step": 1976 + }, + { + "epoch": 0.79, + "grad_norm": 3.539103717358677, + "learning_rate": 2.2088757623965263e-06, + "loss": 3.3443, + "step": 1977 + }, + { + "epoch": 0.79, + "grad_norm": 4.0314317779393285, + "learning_rate": 2.2007610042364337e-06, + "loss": 3.4664, + "step": 1978 + }, + { + "epoch": 0.79, + "grad_norm": 3.319751629440479, + "learning_rate": 2.1926593357382e-06, + "loss": 3.3711, + "step": 1979 + }, + { + "epoch": 0.79, + "grad_norm": 3.5932358520391667, + "learning_rate": 2.184570770499056e-06, + "loss": 3.4833, + "step": 1980 + }, + { + "epoch": 0.79, + "grad_norm": 3.367779707746548, + "learning_rate": 2.176495322094254e-06, + "loss": 3.3852, + "step": 1981 + }, + { + "epoch": 0.79, + "grad_norm": 3.3471031613965927, + "learning_rate": 2.1684330040770183e-06, + "loss": 3.5292, + "step": 1982 + }, + { + "epoch": 0.79, + "grad_norm": 3.3645699007477976, + "learning_rate": 2.1603838299785486e-06, + "loss": 3.3293, + "step": 1983 + }, + { + "epoch": 0.79, + "grad_norm": 4.175500406999358, + "learning_rate": 2.1523478133079776e-06, + "loss": 3.4913, + "step": 1984 + }, + { + "epoch": 0.79, + "grad_norm": 3.455711229312703, + "learning_rate": 2.1443249675523536e-06, + "loss": 3.4397, + "step": 1985 + }, + { + "epoch": 0.79, + "grad_norm": 3.3649792008508643, + "learning_rate": 2.1363153061766297e-06, + "loss": 3.2848, + "step": 1986 + }, + { + "epoch": 0.79, + "grad_norm": 4.389255501639541, + "learning_rate": 2.128318842623618e-06, + "loss": 3.3366, + "step": 1987 + }, + { + "epoch": 0.8, + "grad_norm": 3.9054400348160923, + "learning_rate": 2.1203355903139934e-06, + "loss": 3.3767, + "step": 1988 + }, + { + "epoch": 0.8, + "grad_norm": 3.202178172426877, + "learning_rate": 2.112365562646248e-06, + "loss": 3.3604, + "step": 1989 + }, + { + "epoch": 0.8, + "grad_norm": 3.6981929021416575, + "learning_rate": 2.1044087729966856e-06, + "loss": 3.5228, + "step": 1990 + }, + { + "epoch": 0.8, + "grad_norm": 2.9855355748316836, + "learning_rate": 2.0964652347193894e-06, + "loss": 3.4482, + "step": 1991 + }, + { + "epoch": 0.8, + "grad_norm": 3.522924579103291, + "learning_rate": 2.088534961146197e-06, + "loss": 3.4379, + "step": 1992 + }, + { + "epoch": 0.8, + "grad_norm": 3.2621294434046266, + "learning_rate": 2.0806179655866964e-06, + "loss": 3.4508, + "step": 1993 + }, + { + "epoch": 0.8, + "grad_norm": 3.5103053461158957, + "learning_rate": 2.072714261328177e-06, + "loss": 3.2115, + "step": 1994 + }, + { + "epoch": 0.8, + "grad_norm": 3.6445225225407483, + "learning_rate": 2.064823861635633e-06, + "loss": 3.4337, + "step": 1995 + }, + { + "epoch": 0.8, + "grad_norm": 
3.760005549912783, + "learning_rate": 2.0569467797517173e-06, + "loss": 3.3243, + "step": 1996 + }, + { + "epoch": 0.8, + "grad_norm": 3.2248284331492862, + "learning_rate": 2.0490830288967443e-06, + "loss": 3.1467, + "step": 1997 + }, + { + "epoch": 0.8, + "grad_norm": 4.170608568490723, + "learning_rate": 2.041232622268642e-06, + "loss": 3.1809, + "step": 1998 + }, + { + "epoch": 0.8, + "grad_norm": 3.6936094239380752, + "learning_rate": 2.033395573042952e-06, + "loss": 3.3979, + "step": 1999 + }, + { + "epoch": 0.8, + "grad_norm": 3.0024131234462748, + "learning_rate": 2.025571894372794e-06, + "loss": 3.473, + "step": 2000 + }, + { + "epoch": 0.8, + "grad_norm": 3.5589140858814954, + "learning_rate": 2.017761599388842e-06, + "loss": 3.3617, + "step": 2001 + }, + { + "epoch": 0.8, + "grad_norm": 3.517662937978228, + "learning_rate": 2.0099647011993217e-06, + "loss": 3.3516, + "step": 2002 + }, + { + "epoch": 0.8, + "grad_norm": 3.243366018931923, + "learning_rate": 2.00218121288996e-06, + "loss": 3.3382, + "step": 2003 + }, + { + "epoch": 0.8, + "grad_norm": 3.5996319864021356, + "learning_rate": 1.994411147523987e-06, + "loss": 3.4044, + "step": 2004 + }, + { + "epoch": 0.8, + "grad_norm": 3.931460370263261, + "learning_rate": 1.9866545181421016e-06, + "loss": 3.394, + "step": 2005 + }, + { + "epoch": 0.8, + "grad_norm": 3.521015795117093, + "learning_rate": 1.97891133776245e-06, + "loss": 3.5298, + "step": 2006 + }, + { + "epoch": 0.8, + "grad_norm": 3.625159681693936, + "learning_rate": 1.971181619380611e-06, + "loss": 3.192, + "step": 2007 + }, + { + "epoch": 0.8, + "grad_norm": 3.8264105777364743, + "learning_rate": 1.963465375969572e-06, + "loss": 3.3629, + "step": 2008 + }, + { + "epoch": 0.8, + "grad_norm": 3.676670729415169, + "learning_rate": 1.955762620479699e-06, + "loss": 3.4384, + "step": 2009 + }, + { + "epoch": 0.8, + "grad_norm": 3.394143019524009, + "learning_rate": 1.9480733658387175e-06, + "loss": 3.3489, + "step": 2010 + }, + { + "epoch": 0.8, + "grad_norm": 3.2006120148546935, + "learning_rate": 1.940397624951709e-06, + "loss": 3.4182, + "step": 2011 + }, + { + "epoch": 0.8, + "grad_norm": 3.6169611140886224, + "learning_rate": 1.9327354107010566e-06, + "loss": 3.4577, + "step": 2012 + }, + { + "epoch": 0.81, + "grad_norm": 3.518943092013396, + "learning_rate": 1.9250867359464575e-06, + "loss": 3.3019, + "step": 2013 + }, + { + "epoch": 0.81, + "grad_norm": 3.257401642789152, + "learning_rate": 1.9174516135248745e-06, + "loss": 3.5141, + "step": 2014 + }, + { + "epoch": 0.81, + "grad_norm": 3.6641653222885147, + "learning_rate": 1.9098300562505266e-06, + "loss": 3.3903, + "step": 2015 + }, + { + "epoch": 0.81, + "grad_norm": 3.7826868132289713, + "learning_rate": 1.902222076914869e-06, + "loss": 3.4909, + "step": 2016 + }, + { + "epoch": 0.81, + "grad_norm": 3.5520282434078796, + "learning_rate": 1.894627688286571e-06, + "loss": 3.312, + "step": 2017 + }, + { + "epoch": 0.81, + "grad_norm": 3.6540670643755684, + "learning_rate": 1.8870469031114868e-06, + "loss": 3.3058, + "step": 2018 + }, + { + "epoch": 0.81, + "grad_norm": 3.735593509692127, + "learning_rate": 1.8794797341126403e-06, + "loss": 3.423, + "step": 2019 + }, + { + "epoch": 0.81, + "grad_norm": 3.328850503165228, + "learning_rate": 1.8719261939902023e-06, + "loss": 3.3835, + "step": 2020 + }, + { + "epoch": 0.81, + "grad_norm": 3.5312614490842464, + "learning_rate": 1.8643862954214754e-06, + "loss": 3.4535, + "step": 2021 + }, + { + "epoch": 0.81, + "grad_norm": 3.530286273299819, + "learning_rate": 
1.8568600510608659e-06, + "loss": 3.1957, + "step": 2022 + }, + { + "epoch": 0.81, + "grad_norm": 3.3364923534174658, + "learning_rate": 1.8493474735398575e-06, + "loss": 3.4549, + "step": 2023 + }, + { + "epoch": 0.81, + "grad_norm": 3.495691119616844, + "learning_rate": 1.8418485754670013e-06, + "loss": 3.4042, + "step": 2024 + }, + { + "epoch": 0.81, + "grad_norm": 3.8025310219035995, + "learning_rate": 1.8343633694278895e-06, + "loss": 3.7479, + "step": 2025 + }, + { + "epoch": 0.81, + "grad_norm": 3.8728585066365886, + "learning_rate": 1.8268918679851388e-06, + "loss": 3.4093, + "step": 2026 + }, + { + "epoch": 0.81, + "grad_norm": 3.7564618280991113, + "learning_rate": 1.8194340836783565e-06, + "loss": 3.3054, + "step": 2027 + }, + { + "epoch": 0.81, + "grad_norm": 3.681863260494089, + "learning_rate": 1.8119900290241331e-06, + "loss": 3.5343, + "step": 2028 + }, + { + "epoch": 0.81, + "grad_norm": 3.5455921958019068, + "learning_rate": 1.8045597165160134e-06, + "loss": 3.3716, + "step": 2029 + }, + { + "epoch": 0.81, + "grad_norm": 3.356563608206486, + "learning_rate": 1.7971431586244814e-06, + "loss": 3.4031, + "step": 2030 + }, + { + "epoch": 0.81, + "grad_norm": 3.2452422437785047, + "learning_rate": 1.7897403677969405e-06, + "loss": 3.3848, + "step": 2031 + }, + { + "epoch": 0.81, + "grad_norm": 2.914647472140249, + "learning_rate": 1.7823513564576788e-06, + "loss": 3.3433, + "step": 2032 + }, + { + "epoch": 0.81, + "grad_norm": 3.856800399645909, + "learning_rate": 1.774976137007861e-06, + "loss": 3.3345, + "step": 2033 + }, + { + "epoch": 0.81, + "grad_norm": 3.45709066528513, + "learning_rate": 1.7676147218255092e-06, + "loss": 3.4962, + "step": 2034 + }, + { + "epoch": 0.81, + "grad_norm": 3.766310680972499, + "learning_rate": 1.7602671232654755e-06, + "loss": 3.4752, + "step": 2035 + }, + { + "epoch": 0.81, + "grad_norm": 3.1432477155905585, + "learning_rate": 1.7529333536594217e-06, + "loss": 3.4079, + "step": 2036 + }, + { + "epoch": 0.81, + "grad_norm": 4.178186217696704, + "learning_rate": 1.7456134253157976e-06, + "loss": 3.4548, + "step": 2037 + }, + { + "epoch": 0.82, + "grad_norm": 4.075392019420518, + "learning_rate": 1.7383073505198255e-06, + "loss": 3.387, + "step": 2038 + }, + { + "epoch": 0.82, + "grad_norm": 3.822195351239234, + "learning_rate": 1.7310151415334798e-06, + "loss": 3.4997, + "step": 2039 + }, + { + "epoch": 0.82, + "grad_norm": 3.4265966877599485, + "learning_rate": 1.723736810595461e-06, + "loss": 3.484, + "step": 2040 + }, + { + "epoch": 0.82, + "grad_norm": 2.9330877865873854, + "learning_rate": 1.7164723699211782e-06, + "loss": 3.4072, + "step": 2041 + }, + { + "epoch": 0.82, + "grad_norm": 3.25944471322417, + "learning_rate": 1.709221831702723e-06, + "loss": 3.2468, + "step": 2042 + }, + { + "epoch": 0.82, + "grad_norm": 3.6246985984462836, + "learning_rate": 1.7019852081088616e-06, + "loss": 3.429, + "step": 2043 + }, + { + "epoch": 0.82, + "grad_norm": 3.455585489352441, + "learning_rate": 1.6947625112850074e-06, + "loss": 3.5424, + "step": 2044 + }, + { + "epoch": 0.82, + "grad_norm": 3.443555610888772, + "learning_rate": 1.687553753353195e-06, + "loss": 3.5144, + "step": 2045 + }, + { + "epoch": 0.82, + "grad_norm": 3.818876261129042, + "learning_rate": 1.680358946412064e-06, + "loss": 3.454, + "step": 2046 + }, + { + "epoch": 0.82, + "grad_norm": 3.343897423268437, + "learning_rate": 1.6731781025368422e-06, + "loss": 3.42, + "step": 2047 + }, + { + "epoch": 0.82, + "grad_norm": 3.4004073563010193, + "learning_rate": 
1.6660112337793256e-06, + "loss": 3.5292, + "step": 2048 + }, + { + "epoch": 0.82, + "grad_norm": 3.6316960690058284, + "learning_rate": 1.6588583521678536e-06, + "loss": 3.571, + "step": 2049 + }, + { + "epoch": 0.82, + "grad_norm": 3.5002122945295335, + "learning_rate": 1.6517194697072903e-06, + "loss": 3.5268, + "step": 2050 + }, + { + "epoch": 0.82, + "grad_norm": 3.2795958724183882, + "learning_rate": 1.644594598378999e-06, + "loss": 3.3957, + "step": 2051 + }, + { + "epoch": 0.82, + "grad_norm": 3.6550159003093974, + "learning_rate": 1.6374837501408403e-06, + "loss": 3.7017, + "step": 2052 + }, + { + "epoch": 0.82, + "grad_norm": 4.488224452276052, + "learning_rate": 1.6303869369271264e-06, + "loss": 3.3991, + "step": 2053 + }, + { + "epoch": 0.82, + "grad_norm": 3.357249271462046, + "learning_rate": 1.6233041706486253e-06, + "loss": 3.5832, + "step": 2054 + }, + { + "epoch": 0.82, + "grad_norm": 3.266031195079106, + "learning_rate": 1.6162354631925203e-06, + "loss": 3.3827, + "step": 2055 + }, + { + "epoch": 0.82, + "grad_norm": 3.6682563446568577, + "learning_rate": 1.609180826422404e-06, + "loss": 3.412, + "step": 2056 + }, + { + "epoch": 0.82, + "grad_norm": 4.033994422891126, + "learning_rate": 1.602140272178253e-06, + "loss": 3.5786, + "step": 2057 + }, + { + "epoch": 0.82, + "grad_norm": 3.9632696082851995, + "learning_rate": 1.5951138122764132e-06, + "loss": 3.2678, + "step": 2058 + }, + { + "epoch": 0.82, + "grad_norm": 3.8403714532035282, + "learning_rate": 1.58810145850957e-06, + "loss": 3.4687, + "step": 2059 + }, + { + "epoch": 0.82, + "grad_norm": 4.193247452585058, + "learning_rate": 1.5811032226467304e-06, + "loss": 3.6549, + "step": 2060 + }, + { + "epoch": 0.82, + "grad_norm": 3.2398053957952713, + "learning_rate": 1.5741191164332192e-06, + "loss": 3.2968, + "step": 2061 + }, + { + "epoch": 0.82, + "grad_norm": 3.6586197558812015, + "learning_rate": 1.5671491515906355e-06, + "loss": 3.4885, + "step": 2062 + }, + { + "epoch": 0.83, + "grad_norm": 3.61379309237948, + "learning_rate": 1.5601933398168523e-06, + "loss": 3.5474, + "step": 2063 + }, + { + "epoch": 0.83, + "grad_norm": 3.558606413314661, + "learning_rate": 1.5532516927859853e-06, + "loss": 3.3917, + "step": 2064 + }, + { + "epoch": 0.83, + "grad_norm": 4.051200426686327, + "learning_rate": 1.5463242221483742e-06, + "loss": 3.1919, + "step": 2065 + }, + { + "epoch": 0.83, + "grad_norm": 3.502973571132373, + "learning_rate": 1.5394109395305757e-06, + "loss": 3.2573, + "step": 2066 + }, + { + "epoch": 0.83, + "grad_norm": 3.683744743231853, + "learning_rate": 1.5325118565353237e-06, + "loss": 3.5408, + "step": 2067 + }, + { + "epoch": 0.83, + "grad_norm": 3.5358865096945524, + "learning_rate": 1.5256269847415283e-06, + "loss": 3.3673, + "step": 2068 + }, + { + "epoch": 0.83, + "grad_norm": 3.8122495212068617, + "learning_rate": 1.5187563357042423e-06, + "loss": 3.436, + "step": 2069 + }, + { + "epoch": 0.83, + "grad_norm": 3.406497985667074, + "learning_rate": 1.511899920954656e-06, + "loss": 3.6248, + "step": 2070 + }, + { + "epoch": 0.83, + "grad_norm": 3.690777983132295, + "learning_rate": 1.5050577520000608e-06, + "loss": 3.3869, + "step": 2071 + }, + { + "epoch": 0.83, + "grad_norm": 3.171874530283066, + "learning_rate": 1.498229840323847e-06, + "loss": 3.4189, + "step": 2072 + }, + { + "epoch": 0.83, + "grad_norm": 3.492803640863754, + "learning_rate": 1.4914161973854714e-06, + "loss": 3.2746, + "step": 2073 + }, + { + "epoch": 0.83, + "grad_norm": 3.4706085858486273, + "learning_rate": 
1.4846168346204425e-06, + "loss": 3.388, + "step": 2074 + }, + { + "epoch": 0.83, + "grad_norm": 4.0469632561414475, + "learning_rate": 1.4778317634403082e-06, + "loss": 3.2526, + "step": 2075 + }, + { + "epoch": 0.83, + "grad_norm": 3.3027608809487523, + "learning_rate": 1.4710609952326239e-06, + "loss": 3.4284, + "step": 2076 + }, + { + "epoch": 0.83, + "grad_norm": 3.4405453383957854, + "learning_rate": 1.464304541360946e-06, + "loss": 3.4081, + "step": 2077 + }, + { + "epoch": 0.83, + "grad_norm": 3.6129503858999756, + "learning_rate": 1.457562413164799e-06, + "loss": 3.6202, + "step": 2078 + }, + { + "epoch": 0.83, + "grad_norm": 3.508733737407462, + "learning_rate": 1.4508346219596725e-06, + "loss": 3.3593, + "step": 2079 + }, + { + "epoch": 0.83, + "grad_norm": 3.5745164993431753, + "learning_rate": 1.4441211790369892e-06, + "loss": 3.4305, + "step": 2080 + }, + { + "epoch": 0.83, + "grad_norm": 4.000893397603999, + "learning_rate": 1.4374220956640895e-06, + "loss": 3.3896, + "step": 2081 + }, + { + "epoch": 0.83, + "grad_norm": 3.482970629880628, + "learning_rate": 1.4307373830842174e-06, + "loss": 3.2945, + "step": 2082 + }, + { + "epoch": 0.83, + "grad_norm": 3.9659815944283183, + "learning_rate": 1.424067052516499e-06, + "loss": 3.3155, + "step": 2083 + }, + { + "epoch": 0.83, + "grad_norm": 3.3102034971632275, + "learning_rate": 1.4174111151559188e-06, + "loss": 3.3685, + "step": 2084 + }, + { + "epoch": 0.83, + "grad_norm": 3.1837029622658832, + "learning_rate": 1.4107695821733026e-06, + "loss": 3.2757, + "step": 2085 + }, + { + "epoch": 0.83, + "grad_norm": 3.267635574821952, + "learning_rate": 1.4041424647153112e-06, + "loss": 3.3863, + "step": 2086 + }, + { + "epoch": 0.83, + "grad_norm": 3.4560576602866955, + "learning_rate": 1.3975297739043992e-06, + "loss": 3.435, + "step": 2087 + }, + { + "epoch": 0.84, + "grad_norm": 3.989060712448598, + "learning_rate": 1.3909315208388185e-06, + "loss": 3.419, + "step": 2088 + }, + { + "epoch": 0.84, + "grad_norm": 3.30029757789045, + "learning_rate": 1.3843477165925846e-06, + "loss": 3.5192, + "step": 2089 + }, + { + "epoch": 0.84, + "grad_norm": 3.4550946449825046, + "learning_rate": 1.3777783722154603e-06, + "loss": 3.5344, + "step": 2090 + }, + { + "epoch": 0.84, + "grad_norm": 4.44034970355981, + "learning_rate": 1.3712234987329486e-06, + "loss": 3.6119, + "step": 2091 + }, + { + "epoch": 0.84, + "grad_norm": 3.3075509781316645, + "learning_rate": 1.3646831071462606e-06, + "loss": 3.4391, + "step": 2092 + }, + { + "epoch": 0.84, + "grad_norm": 3.4657392511161556, + "learning_rate": 1.3581572084323014e-06, + "loss": 3.4166, + "step": 2093 + }, + { + "epoch": 0.84, + "grad_norm": 3.1882512927254556, + "learning_rate": 1.3516458135436539e-06, + "loss": 3.3661, + "step": 2094 + }, + { + "epoch": 0.84, + "grad_norm": 4.174656242832297, + "learning_rate": 1.3451489334085555e-06, + "loss": 3.2017, + "step": 2095 + }, + { + "epoch": 0.84, + "grad_norm": 3.757745275768527, + "learning_rate": 1.3386665789308885e-06, + "loss": 3.4389, + "step": 2096 + }, + { + "epoch": 0.84, + "grad_norm": 3.4372378669298747, + "learning_rate": 1.3321987609901553e-06, + "loss": 3.4804, + "step": 2097 + }, + { + "epoch": 0.84, + "grad_norm": 3.961745026644603, + "learning_rate": 1.325745490441458e-06, + "loss": 3.3249, + "step": 2098 + }, + { + "epoch": 0.84, + "grad_norm": 3.9095905978056797, + "learning_rate": 1.3193067781154835e-06, + "loss": 3.3281, + "step": 2099 + }, + { + "epoch": 0.84, + "grad_norm": 3.546582952524952, + "learning_rate": 
1.3128826348184886e-06, + "loss": 3.1577, + "step": 2100 + }, + { + "epoch": 0.84, + "grad_norm": 3.161408052005003, + "learning_rate": 1.3064730713322793e-06, + "loss": 3.3952, + "step": 2101 + }, + { + "epoch": 0.84, + "grad_norm": 3.4427792034132265, + "learning_rate": 1.3000780984141881e-06, + "loss": 3.4078, + "step": 2102 + }, + { + "epoch": 0.84, + "grad_norm": 3.8874612168561593, + "learning_rate": 1.2936977267970597e-06, + "loss": 3.4509, + "step": 2103 + }, + { + "epoch": 0.84, + "grad_norm": 4.018729625106304, + "learning_rate": 1.2873319671892337e-06, + "loss": 3.2307, + "step": 2104 + }, + { + "epoch": 0.84, + "grad_norm": 3.1584635393788365, + "learning_rate": 1.2809808302745298e-06, + "loss": 3.4466, + "step": 2105 + }, + { + "epoch": 0.84, + "grad_norm": 3.5165182729856266, + "learning_rate": 1.2746443267122233e-06, + "loss": 3.4201, + "step": 2106 + }, + { + "epoch": 0.84, + "grad_norm": 3.6148373495878054, + "learning_rate": 1.2683224671370286e-06, + "loss": 3.3471, + "step": 2107 + }, + { + "epoch": 0.84, + "grad_norm": 3.520082081357915, + "learning_rate": 1.262015262159082e-06, + "loss": 3.2257, + "step": 2108 + }, + { + "epoch": 0.84, + "grad_norm": 3.5479686710776432, + "learning_rate": 1.255722722363929e-06, + "loss": 3.2691, + "step": 2109 + }, + { + "epoch": 0.84, + "grad_norm": 2.956232806174273, + "learning_rate": 1.249444858312502e-06, + "loss": 3.4043, + "step": 2110 + }, + { + "epoch": 0.84, + "grad_norm": 2.855373187723047, + "learning_rate": 1.2431816805410968e-06, + "loss": 3.4917, + "step": 2111 + }, + { + "epoch": 0.84, + "grad_norm": 3.1965735034591716, + "learning_rate": 1.2369331995613664e-06, + "loss": 3.3842, + "step": 2112 + }, + { + "epoch": 0.85, + "grad_norm": 3.028949144756169, + "learning_rate": 1.2306994258602922e-06, + "loss": 3.3098, + "step": 2113 + }, + { + "epoch": 0.85, + "grad_norm": 4.362284334661307, + "learning_rate": 1.2244803699001785e-06, + "loss": 3.2978, + "step": 2114 + }, + { + "epoch": 0.85, + "grad_norm": 3.779755730336405, + "learning_rate": 1.218276042118629e-06, + "loss": 3.3034, + "step": 2115 + }, + { + "epoch": 0.85, + "grad_norm": 3.5439051586156336, + "learning_rate": 1.2120864529285203e-06, + "loss": 3.3475, + "step": 2116 + }, + { + "epoch": 0.85, + "grad_norm": 3.3052105946171926, + "learning_rate": 1.2059116127179993e-06, + "loss": 3.3798, + "step": 2117 + }, + { + "epoch": 0.85, + "grad_norm": 3.218353846479608, + "learning_rate": 1.199751531850457e-06, + "loss": 3.2223, + "step": 2118 + }, + { + "epoch": 0.85, + "grad_norm": 3.300669411980882, + "learning_rate": 1.1936062206645183e-06, + "loss": 3.4321, + "step": 2119 + }, + { + "epoch": 0.85, + "grad_norm": 3.5403763231563574, + "learning_rate": 1.1874756894740137e-06, + "loss": 3.2812, + "step": 2120 + }, + { + "epoch": 0.85, + "grad_norm": 3.2628393837539784, + "learning_rate": 1.1813599485679684e-06, + "loss": 3.5328, + "step": 2121 + }, + { + "epoch": 0.85, + "grad_norm": 3.507796005006337, + "learning_rate": 1.1752590082105863e-06, + "loss": 3.2593, + "step": 2122 + }, + { + "epoch": 0.85, + "grad_norm": 3.77010141147154, + "learning_rate": 1.1691728786412315e-06, + "loss": 3.4443, + "step": 2123 + }, + { + "epoch": 0.85, + "grad_norm": 3.2616859923362456, + "learning_rate": 1.1631015700744153e-06, + "loss": 3.5765, + "step": 2124 + }, + { + "epoch": 0.85, + "grad_norm": 3.345595498088119, + "learning_rate": 1.1570450926997657e-06, + "loss": 3.4364, + "step": 2125 + }, + { + "epoch": 0.85, + "grad_norm": 3.502868345524852, + "learning_rate": 
1.1510034566820205e-06, + "loss": 3.2767, + "step": 2126 + }, + { + "epoch": 0.85, + "grad_norm": 3.7828869424110594, + "learning_rate": 1.144976672161019e-06, + "loss": 3.379, + "step": 2127 + }, + { + "epoch": 0.85, + "grad_norm": 3.4442099961092265, + "learning_rate": 1.1389647492516598e-06, + "loss": 3.4932, + "step": 2128 + }, + { + "epoch": 0.85, + "grad_norm": 3.5130471167919657, + "learning_rate": 1.132967698043913e-06, + "loss": 3.1723, + "step": 2129 + }, + { + "epoch": 0.85, + "grad_norm": 3.7375709878896384, + "learning_rate": 1.1269855286027798e-06, + "loss": 3.3686, + "step": 2130 + }, + { + "epoch": 0.85, + "grad_norm": 3.6802875325445674, + "learning_rate": 1.1210182509682854e-06, + "loss": 3.2742, + "step": 2131 + }, + { + "epoch": 0.85, + "grad_norm": 3.3259916423660383, + "learning_rate": 1.1150658751554667e-06, + "loss": 3.4796, + "step": 2132 + }, + { + "epoch": 0.85, + "grad_norm": 3.234854692123814, + "learning_rate": 1.1091284111543499e-06, + "loss": 3.3168, + "step": 2133 + }, + { + "epoch": 0.85, + "grad_norm": 3.632315148384429, + "learning_rate": 1.1032058689299297e-06, + "loss": 3.4312, + "step": 2134 + }, + { + "epoch": 0.85, + "grad_norm": 3.1620238621266283, + "learning_rate": 1.0972982584221592e-06, + "loss": 3.5919, + "step": 2135 + }, + { + "epoch": 0.85, + "grad_norm": 3.3617634363233004, + "learning_rate": 1.0914055895459353e-06, + "loss": 3.383, + "step": 2136 + }, + { + "epoch": 0.85, + "grad_norm": 3.922553259067895, + "learning_rate": 1.08552787219107e-06, + "loss": 3.2105, + "step": 2137 + }, + { + "epoch": 0.86, + "grad_norm": 3.37654662519368, + "learning_rate": 1.0796651162222916e-06, + "loss": 3.4856, + "step": 2138 + }, + { + "epoch": 0.86, + "grad_norm": 3.7966863859829796, + "learning_rate": 1.07381733147921e-06, + "loss": 3.501, + "step": 2139 + }, + { + "epoch": 0.86, + "grad_norm": 3.840564360081184, + "learning_rate": 1.067984527776309e-06, + "loss": 3.4146, + "step": 2140 + }, + { + "epoch": 0.86, + "grad_norm": 3.4357237633421547, + "learning_rate": 1.062166714902938e-06, + "loss": 3.5836, + "step": 2141 + }, + { + "epoch": 0.86, + "grad_norm": 3.2728261173976994, + "learning_rate": 1.0563639026232742e-06, + "loss": 3.214, + "step": 2142 + }, + { + "epoch": 0.86, + "grad_norm": 3.915665784391846, + "learning_rate": 1.0505761006763315e-06, + "loss": 3.342, + "step": 2143 + }, + { + "epoch": 0.86, + "grad_norm": 3.2202663116738095, + "learning_rate": 1.044803318775922e-06, + "loss": 3.3531, + "step": 2144 + }, + { + "epoch": 0.86, + "grad_norm": 3.65403499358109, + "learning_rate": 1.0390455666106547e-06, + "loss": 3.4521, + "step": 2145 + }, + { + "epoch": 0.86, + "grad_norm": 3.662658535408528, + "learning_rate": 1.0333028538439093e-06, + "loss": 3.3767, + "step": 2146 + }, + { + "epoch": 0.86, + "grad_norm": 3.3917540438518996, + "learning_rate": 1.027575190113832e-06, + "loss": 3.5931, + "step": 2147 + }, + { + "epoch": 0.86, + "grad_norm": 3.397753021489659, + "learning_rate": 1.021862585033304e-06, + "loss": 3.3456, + "step": 2148 + }, + { + "epoch": 0.86, + "grad_norm": 3.2907386279763573, + "learning_rate": 1.0161650481899344e-06, + "loss": 3.4717, + "step": 2149 + }, + { + "epoch": 0.86, + "grad_norm": 3.316699462123251, + "learning_rate": 1.010482589146048e-06, + "loss": 3.4836, + "step": 2150 + }, + { + "epoch": 0.86, + "grad_norm": 3.5992791259273185, + "learning_rate": 1.0048152174386584e-06, + "loss": 3.5198, + "step": 2151 + }, + { + "epoch": 0.86, + "grad_norm": 3.897952562761025, + "learning_rate": 9.991629425794624e-07, 
+ "loss": 3.5266, + "step": 2152 + }, + { + "epoch": 0.86, + "grad_norm": 3.645336662391037, + "learning_rate": 9.935257740548143e-07, + "loss": 3.6137, + "step": 2153 + }, + { + "epoch": 0.86, + "grad_norm": 4.079152642547377, + "learning_rate": 9.879037213257214e-07, + "loss": 3.3159, + "step": 2154 + }, + { + "epoch": 0.86, + "grad_norm": 3.9483279351701177, + "learning_rate": 9.822967938278172e-07, + "loss": 3.2368, + "step": 2155 + }, + { + "epoch": 0.86, + "grad_norm": 3.4047908302860526, + "learning_rate": 9.767050009713476e-07, + "loss": 3.1575, + "step": 2156 + }, + { + "epoch": 0.86, + "grad_norm": 3.1730170838584604, + "learning_rate": 9.711283521411674e-07, + "loss": 3.5803, + "step": 2157 + }, + { + "epoch": 0.86, + "grad_norm": 3.5454281828947, + "learning_rate": 9.655668566967026e-07, + "loss": 3.4175, + "step": 2158 + }, + { + "epoch": 0.86, + "grad_norm": 3.6667236181520173, + "learning_rate": 9.600205239719584e-07, + "loss": 3.4001, + "step": 2159 + }, + { + "epoch": 0.86, + "grad_norm": 3.3294844387626874, + "learning_rate": 9.544893632754816e-07, + "loss": 3.3551, + "step": 2160 + }, + { + "epoch": 0.86, + "grad_norm": 3.654141927545411, + "learning_rate": 9.489733838903648e-07, + "loss": 3.6266, + "step": 2161 + }, + { + "epoch": 0.86, + "grad_norm": 3.8045320643118186, + "learning_rate": 9.434725950742119e-07, + "loss": 3.4512, + "step": 2162 + }, + { + "epoch": 0.87, + "grad_norm": 3.6024403263194764, + "learning_rate": 9.379870060591434e-07, + "loss": 3.3326, + "step": 2163 + }, + { + "epoch": 0.87, + "grad_norm": 3.386820474420187, + "learning_rate": 9.325166260517593e-07, + "loss": 3.428, + "step": 2164 + }, + { + "epoch": 0.87, + "grad_norm": 3.651991887916642, + "learning_rate": 9.270614642331377e-07, + "loss": 3.6325, + "step": 2165 + }, + { + "epoch": 0.87, + "grad_norm": 3.3168626955308005, + "learning_rate": 9.216215297588182e-07, + "loss": 3.3719, + "step": 2166 + }, + { + "epoch": 0.87, + "grad_norm": 3.808956179858258, + "learning_rate": 9.161968317587788e-07, + "loss": 3.1597, + "step": 2167 + }, + { + "epoch": 0.87, + "grad_norm": 3.5534169108074485, + "learning_rate": 9.107873793374322e-07, + "loss": 3.4365, + "step": 2168 + }, + { + "epoch": 0.87, + "grad_norm": 3.713641504637213, + "learning_rate": 9.053931815735995e-07, + "loss": 3.3859, + "step": 2169 + }, + { + "epoch": 0.87, + "grad_norm": 3.291244265002922, + "learning_rate": 9.000142475204965e-07, + "loss": 3.2342, + "step": 2170 + }, + { + "epoch": 0.87, + "grad_norm": 3.6304683284911143, + "learning_rate": 8.946505862057286e-07, + "loss": 3.3497, + "step": 2171 + }, + { + "epoch": 0.87, + "grad_norm": 3.036041707321972, + "learning_rate": 8.893022066312674e-07, + "loss": 3.297, + "step": 2172 + }, + { + "epoch": 0.87, + "grad_norm": 3.5995359541359333, + "learning_rate": 8.839691177734322e-07, + "loss": 3.3432, + "step": 2173 + }, + { + "epoch": 0.87, + "grad_norm": 3.5355815132434922, + "learning_rate": 8.786513285828835e-07, + "loss": 3.4101, + "step": 2174 + }, + { + "epoch": 0.87, + "grad_norm": 3.2445046978749734, + "learning_rate": 8.733488479845997e-07, + "loss": 3.3971, + "step": 2175 + }, + { + "epoch": 0.87, + "grad_norm": 3.3838590014717567, + "learning_rate": 8.680616848778711e-07, + "loss": 3.3631, + "step": 2176 + }, + { + "epoch": 0.87, + "grad_norm": 3.808898429237898, + "learning_rate": 8.627898481362817e-07, + "loss": 3.5121, + "step": 2177 + }, + { + "epoch": 0.87, + "grad_norm": 4.148682068047069, + "learning_rate": 8.575333466076863e-07, + "loss": 3.5631, + "step": 2178 + 
}, + { + "epoch": 0.87, + "grad_norm": 3.349487356821742, + "learning_rate": 8.522921891142034e-07, + "loss": 3.3379, + "step": 2179 + }, + { + "epoch": 0.87, + "grad_norm": 3.335460985598822, + "learning_rate": 8.470663844522053e-07, + "loss": 3.2859, + "step": 2180 + }, + { + "epoch": 0.87, + "grad_norm": 3.5146350091888077, + "learning_rate": 8.418559413922933e-07, + "loss": 3.4165, + "step": 2181 + }, + { + "epoch": 0.87, + "grad_norm": 3.0467244644452345, + "learning_rate": 8.366608686792854e-07, + "loss": 3.491, + "step": 2182 + }, + { + "epoch": 0.87, + "grad_norm": 3.37795744884803, + "learning_rate": 8.31481175032206e-07, + "loss": 3.3244, + "step": 2183 + }, + { + "epoch": 0.87, + "grad_norm": 3.8806468735539363, + "learning_rate": 8.263168691442624e-07, + "loss": 3.2478, + "step": 2184 + }, + { + "epoch": 0.87, + "grad_norm": 3.399507866382731, + "learning_rate": 8.211679596828481e-07, + "loss": 3.493, + "step": 2185 + }, + { + "epoch": 0.87, + "grad_norm": 3.6180760047276843, + "learning_rate": 8.160344552895061e-07, + "loss": 3.4451, + "step": 2186 + }, + { + "epoch": 0.87, + "grad_norm": 3.975696190654819, + "learning_rate": 8.109163645799267e-07, + "loss": 3.5058, + "step": 2187 + }, + { + "epoch": 0.88, + "grad_norm": 3.290590938027743, + "learning_rate": 8.058136961439333e-07, + "loss": 3.4879, + "step": 2188 + }, + { + "epoch": 0.88, + "grad_norm": 2.9777779041564334, + "learning_rate": 8.007264585454632e-07, + "loss": 3.3098, + "step": 2189 + }, + { + "epoch": 0.88, + "grad_norm": 3.7754034334706503, + "learning_rate": 7.956546603225601e-07, + "loss": 3.3984, + "step": 2190 + }, + { + "epoch": 0.88, + "grad_norm": 4.098136120772462, + "learning_rate": 7.905983099873504e-07, + "loss": 3.3151, + "step": 2191 + }, + { + "epoch": 0.88, + "grad_norm": 3.840743601541193, + "learning_rate": 7.855574160260371e-07, + "loss": 3.492, + "step": 2192 + }, + { + "epoch": 0.88, + "grad_norm": 3.6265418650700787, + "learning_rate": 7.805319868988759e-07, + "loss": 3.4926, + "step": 2193 + }, + { + "epoch": 0.88, + "grad_norm": 3.85235771264861, + "learning_rate": 7.755220310401812e-07, + "loss": 3.4783, + "step": 2194 + }, + { + "epoch": 0.88, + "grad_norm": 3.5818307264323272, + "learning_rate": 7.705275568582848e-07, + "loss": 3.2884, + "step": 2195 + }, + { + "epoch": 0.88, + "grad_norm": 3.270453873784988, + "learning_rate": 7.655485727355416e-07, + "loss": 3.2947, + "step": 2196 + }, + { + "epoch": 0.88, + "grad_norm": 3.6452666439703183, + "learning_rate": 7.60585087028305e-07, + "loss": 3.1528, + "step": 2197 + }, + { + "epoch": 0.88, + "grad_norm": 2.9837151835317623, + "learning_rate": 7.556371080669222e-07, + "loss": 3.3995, + "step": 2198 + }, + { + "epoch": 0.88, + "grad_norm": 3.505327878248056, + "learning_rate": 7.507046441557142e-07, + "loss": 3.3998, + "step": 2199 + }, + { + "epoch": 0.88, + "grad_norm": 3.1591895177001366, + "learning_rate": 7.457877035729588e-07, + "loss": 3.317, + "step": 2200 + }, + { + "epoch": 0.88, + "grad_norm": 3.5187197845554508, + "learning_rate": 7.408862945708839e-07, + "loss": 3.0716, + "step": 2201 + }, + { + "epoch": 0.88, + "grad_norm": 3.4083125288162908, + "learning_rate": 7.360004253756459e-07, + "loss": 3.3461, + "step": 2202 + }, + { + "epoch": 0.88, + "grad_norm": 3.0850483605956924, + "learning_rate": 7.311301041873276e-07, + "loss": 3.432, + "step": 2203 + }, + { + "epoch": 0.88, + "grad_norm": 3.8955991470826112, + "learning_rate": 7.262753391799127e-07, + "loss": 3.3223, + "step": 2204 + }, + { + "epoch": 0.88, + "grad_norm": 
3.171206654714713, + "learning_rate": 7.21436138501278e-07, + "loss": 3.5402, + "step": 2205 + }, + { + "epoch": 0.88, + "grad_norm": 3.3432430917547724, + "learning_rate": 7.166125102731735e-07, + "loss": 3.3176, + "step": 2206 + }, + { + "epoch": 0.88, + "grad_norm": 3.694322018316045, + "learning_rate": 7.118044625912213e-07, + "loss": 3.5505, + "step": 2207 + }, + { + "epoch": 0.88, + "grad_norm": 3.359500371733772, + "learning_rate": 7.070120035248906e-07, + "loss": 3.4174, + "step": 2208 + }, + { + "epoch": 0.88, + "grad_norm": 4.010062042226531, + "learning_rate": 7.022351411174866e-07, + "loss": 3.0835, + "step": 2209 + }, + { + "epoch": 0.88, + "grad_norm": 3.5776919424054485, + "learning_rate": 6.974738833861383e-07, + "loss": 3.5002, + "step": 2210 + }, + { + "epoch": 0.88, + "grad_norm": 3.0916541703552607, + "learning_rate": 6.927282383217893e-07, + "loss": 3.3724, + "step": 2211 + }, + { + "epoch": 0.88, + "grad_norm": 3.483857548891637, + "learning_rate": 6.879982138891717e-07, + "loss": 3.2362, + "step": 2212 + }, + { + "epoch": 0.89, + "grad_norm": 3.783788104808641, + "learning_rate": 6.83283818026812e-07, + "loss": 3.4115, + "step": 2213 + }, + { + "epoch": 0.89, + "grad_norm": 3.924748913054718, + "learning_rate": 6.785850586469989e-07, + "loss": 3.3712, + "step": 2214 + }, + { + "epoch": 0.89, + "grad_norm": 3.2175573933479846, + "learning_rate": 6.739019436357774e-07, + "loss": 3.4548, + "step": 2215 + }, + { + "epoch": 0.89, + "grad_norm": 3.843047627078977, + "learning_rate": 6.692344808529427e-07, + "loss": 3.4471, + "step": 2216 + }, + { + "epoch": 0.89, + "grad_norm": 3.746682602581423, + "learning_rate": 6.645826781320141e-07, + "loss": 3.2257, + "step": 2217 + }, + { + "epoch": 0.89, + "grad_norm": 3.632430383751459, + "learning_rate": 6.599465432802332e-07, + "loss": 3.5574, + "step": 2218 + }, + { + "epoch": 0.89, + "grad_norm": 3.591676174959998, + "learning_rate": 6.553260840785414e-07, + "loss": 3.2546, + "step": 2219 + }, + { + "epoch": 0.89, + "grad_norm": 3.739713355457886, + "learning_rate": 6.507213082815745e-07, + "loss": 3.3196, + "step": 2220 + }, + { + "epoch": 0.89, + "grad_norm": 3.5008219313024265, + "learning_rate": 6.461322236176438e-07, + "loss": 3.465, + "step": 2221 + }, + { + "epoch": 0.89, + "grad_norm": 3.310600462482619, + "learning_rate": 6.415588377887305e-07, + "loss": 3.433, + "step": 2222 + }, + { + "epoch": 0.89, + "grad_norm": 3.4841994321670917, + "learning_rate": 6.370011584704617e-07, + "loss": 3.3722, + "step": 2223 + }, + { + "epoch": 0.89, + "grad_norm": 3.6842707477566856, + "learning_rate": 6.324591933121072e-07, + "loss": 3.3919, + "step": 2224 + }, + { + "epoch": 0.89, + "grad_norm": 3.8566346491530994, + "learning_rate": 6.279329499365649e-07, + "loss": 3.6028, + "step": 2225 + }, + { + "epoch": 0.89, + "grad_norm": 3.3556102440025666, + "learning_rate": 6.234224359403407e-07, + "loss": 3.2679, + "step": 2226 + }, + { + "epoch": 0.89, + "grad_norm": 3.248424488867721, + "learning_rate": 6.1892765889355e-07, + "loss": 3.3181, + "step": 2227 + }, + { + "epoch": 0.89, + "grad_norm": 3.433953805154286, + "learning_rate": 6.144486263398886e-07, + "loss": 3.3922, + "step": 2228 + }, + { + "epoch": 0.89, + "grad_norm": 3.606794192325152, + "learning_rate": 6.099853457966342e-07, + "loss": 3.4089, + "step": 2229 + }, + { + "epoch": 0.89, + "grad_norm": 3.2863483621832756, + "learning_rate": 6.055378247546217e-07, + "loss": 3.3874, + "step": 2230 + }, + { + "epoch": 0.89, + "grad_norm": 3.5462763362220646, + "learning_rate": 
6.01106070678239e-07, + "loss": 3.2217, + "step": 2231 + }, + { + "epoch": 0.89, + "grad_norm": 3.18770247047478, + "learning_rate": 5.966900910054141e-07, + "loss": 3.414, + "step": 2232 + }, + { + "epoch": 0.89, + "grad_norm": 3.3584454420282794, + "learning_rate": 5.922898931475973e-07, + "loss": 3.548, + "step": 2233 + }, + { + "epoch": 0.89, + "grad_norm": 3.2032427721789545, + "learning_rate": 5.879054844897536e-07, + "loss": 3.4794, + "step": 2234 + }, + { + "epoch": 0.89, + "grad_norm": 3.172170956432926, + "learning_rate": 5.835368723903456e-07, + "loss": 3.478, + "step": 2235 + }, + { + "epoch": 0.89, + "grad_norm": 3.8577441327169724, + "learning_rate": 5.791840641813295e-07, + "loss": 3.4523, + "step": 2236 + }, + { + "epoch": 0.89, + "grad_norm": 3.5517894641461263, + "learning_rate": 5.748470671681328e-07, + "loss": 3.367, + "step": 2237 + }, + { + "epoch": 0.9, + "grad_norm": 3.3612955774508038, + "learning_rate": 5.705258886296494e-07, + "loss": 3.2858, + "step": 2238 + }, + { + "epoch": 0.9, + "grad_norm": 3.620965017753635, + "learning_rate": 5.662205358182226e-07, + "loss": 3.5161, + "step": 2239 + }, + { + "epoch": 0.9, + "grad_norm": 3.306799867032343, + "learning_rate": 5.619310159596358e-07, + "loss": 3.3049, + "step": 2240 + }, + { + "epoch": 0.9, + "grad_norm": 3.892165227199948, + "learning_rate": 5.576573362531001e-07, + "loss": 3.2364, + "step": 2241 + }, + { + "epoch": 0.9, + "grad_norm": 3.418545935942755, + "learning_rate": 5.533995038712403e-07, + "loss": 3.5193, + "step": 2242 + }, + { + "epoch": 0.9, + "grad_norm": 3.605324441619377, + "learning_rate": 5.491575259600879e-07, + "loss": 3.2923, + "step": 2243 + }, + { + "epoch": 0.9, + "grad_norm": 3.2932202213739408, + "learning_rate": 5.449314096390601e-07, + "loss": 3.3651, + "step": 2244 + }, + { + "epoch": 0.9, + "grad_norm": 3.7819035993265215, + "learning_rate": 5.407211620009545e-07, + "loss": 3.1878, + "step": 2245 + }, + { + "epoch": 0.9, + "grad_norm": 3.342693554550334, + "learning_rate": 5.365267901119398e-07, + "loss": 3.497, + "step": 2246 + }, + { + "epoch": 0.9, + "grad_norm": 3.0434406319244784, + "learning_rate": 5.323483010115382e-07, + "loss": 3.4321, + "step": 2247 + }, + { + "epoch": 0.9, + "grad_norm": 3.9546557643627587, + "learning_rate": 5.281857017126124e-07, + "loss": 3.4684, + "step": 2248 + }, + { + "epoch": 0.9, + "grad_norm": 3.202307405158531, + "learning_rate": 5.240389992013606e-07, + "loss": 3.4182, + "step": 2249 + }, + { + "epoch": 0.9, + "grad_norm": 3.6682108167874126, + "learning_rate": 5.199082004372958e-07, + "loss": 3.5151, + "step": 2250 + }, + { + "epoch": 0.9, + "grad_norm": 3.517041176431839, + "learning_rate": 5.157933123532466e-07, + "loss": 3.5793, + "step": 2251 + }, + { + "epoch": 0.9, + "grad_norm": 4.060991767391451, + "learning_rate": 5.116943418553355e-07, + "loss": 3.1333, + "step": 2252 + }, + { + "epoch": 0.9, + "grad_norm": 3.4617925682458828, + "learning_rate": 5.076112958229673e-07, + "loss": 3.4533, + "step": 2253 + }, + { + "epoch": 0.9, + "grad_norm": 3.8325002574788374, + "learning_rate": 5.035441811088204e-07, + "loss": 3.3571, + "step": 2254 + }, + { + "epoch": 0.9, + "grad_norm": 3.87591566298437, + "learning_rate": 4.994930045388414e-07, + "loss": 3.2436, + "step": 2255 + }, + { + "epoch": 0.9, + "grad_norm": 3.2740730847041695, + "learning_rate": 4.954577729122212e-07, + "loss": 3.363, + "step": 2256 + }, + { + "epoch": 0.9, + "grad_norm": 3.9086146571113733, + "learning_rate": 4.914384930013927e-07, + "loss": 3.3876, + "step": 2257 + 
}, + { + "epoch": 0.9, + "grad_norm": 2.9553725583407164, + "learning_rate": 4.874351715520154e-07, + "loss": 3.4666, + "step": 2258 + }, + { + "epoch": 0.9, + "grad_norm": 3.409566328810979, + "learning_rate": 4.834478152829658e-07, + "loss": 3.3281, + "step": 2259 + }, + { + "epoch": 0.9, + "grad_norm": 3.6652751960125713, + "learning_rate": 4.794764308863242e-07, + "loss": 3.3374, + "step": 2260 + }, + { + "epoch": 0.9, + "grad_norm": 3.7376935408690137, + "learning_rate": 4.755210250273701e-07, + "loss": 3.3913, + "step": 2261 + }, + { + "epoch": 0.9, + "grad_norm": 3.5403464753418685, + "learning_rate": 4.715816043445609e-07, + "loss": 3.3002, + "step": 2262 + }, + { + "epoch": 0.91, + "grad_norm": 3.488337281386899, + "learning_rate": 4.676581754495235e-07, + "loss": 3.4845, + "step": 2263 + }, + { + "epoch": 0.91, + "grad_norm": 3.2385604249717814, + "learning_rate": 4.6375074492705173e-07, + "loss": 3.4416, + "step": 2264 + }, + { + "epoch": 0.91, + "grad_norm": 3.8998958534577404, + "learning_rate": 4.5985931933508757e-07, + "loss": 3.2112, + "step": 2265 + }, + { + "epoch": 0.91, + "grad_norm": 3.4381788905534654, + "learning_rate": 4.559839052047066e-07, + "loss": 3.24, + "step": 2266 + }, + { + "epoch": 0.91, + "grad_norm": 2.9753181370969592, + "learning_rate": 4.521245090401172e-07, + "loss": 3.372, + "step": 2267 + }, + { + "epoch": 0.91, + "grad_norm": 3.220562802271603, + "learning_rate": 4.482811373186402e-07, + "loss": 3.4169, + "step": 2268 + }, + { + "epoch": 0.91, + "grad_norm": 3.4575228301184597, + "learning_rate": 4.4445379649070587e-07, + "loss": 3.3371, + "step": 2269 + }, + { + "epoch": 0.91, + "grad_norm": 3.4887598974576735, + "learning_rate": 4.406424929798403e-07, + "loss": 3.2589, + "step": 2270 + }, + { + "epoch": 0.91, + "grad_norm": 3.285211766969659, + "learning_rate": 4.368472331826479e-07, + "loss": 3.2891, + "step": 2271 + }, + { + "epoch": 0.91, + "grad_norm": 3.2239734105128406, + "learning_rate": 4.3306802346881116e-07, + "loss": 3.3581, + "step": 2272 + }, + { + "epoch": 0.91, + "grad_norm": 3.484669516554814, + "learning_rate": 4.2930487018107425e-07, + "loss": 3.4378, + "step": 2273 + }, + { + "epoch": 0.91, + "grad_norm": 3.487229626033901, + "learning_rate": 4.2555777963523506e-07, + "loss": 3.4464, + "step": 2274 + }, + { + "epoch": 0.91, + "grad_norm": 3.1419345073346125, + "learning_rate": 4.218267581201296e-07, + "loss": 3.4621, + "step": 2275 + }, + { + "epoch": 0.91, + "grad_norm": 3.219933497970078, + "learning_rate": 4.1811181189762684e-07, + "loss": 3.2655, + "step": 2276 + }, + { + "epoch": 0.91, + "grad_norm": 3.594623370400675, + "learning_rate": 4.1441294720261373e-07, + "loss": 3.3844, + "step": 2277 + }, + { + "epoch": 0.91, + "grad_norm": 3.7548340359274093, + "learning_rate": 4.107301702429922e-07, + "loss": 3.3651, + "step": 2278 + }, + { + "epoch": 0.91, + "grad_norm": 3.3666014345227353, + "learning_rate": 4.070634871996615e-07, + "loss": 3.4483, + "step": 2279 + }, + { + "epoch": 0.91, + "grad_norm": 3.2939439702731037, + "learning_rate": 4.034129042265067e-07, + "loss": 3.1616, + "step": 2280 + }, + { + "epoch": 0.91, + "grad_norm": 3.7606232331352465, + "learning_rate": 3.9977842745039464e-07, + "loss": 3.2778, + "step": 2281 + }, + { + "epoch": 0.91, + "grad_norm": 3.4635150898245977, + "learning_rate": 3.961600629711615e-07, + "loss": 3.2342, + "step": 2282 + }, + { + "epoch": 0.91, + "grad_norm": 3.5295595342864443, + "learning_rate": 3.925578168616007e-07, + "loss": 3.319, + "step": 2283 + }, + { + "epoch": 0.91, + 
"grad_norm": 3.792507271237906, + "learning_rate": 3.889716951674549e-07, + "loss": 3.3114, + "step": 2284 + }, + { + "epoch": 0.91, + "grad_norm": 4.228923211595772, + "learning_rate": 3.8540170390740097e-07, + "loss": 3.5547, + "step": 2285 + }, + { + "epoch": 0.91, + "grad_norm": 3.442052688731442, + "learning_rate": 3.8184784907304704e-07, + "loss": 3.457, + "step": 2286 + }, + { + "epoch": 0.91, + "grad_norm": 3.3026272696031045, + "learning_rate": 3.783101366289199e-07, + "loss": 3.3291, + "step": 2287 + }, + { + "epoch": 0.92, + "grad_norm": 3.436857110291015, + "learning_rate": 3.747885725124523e-07, + "loss": 3.2545, + "step": 2288 + }, + { + "epoch": 0.92, + "grad_norm": 3.569007423588782, + "learning_rate": 3.712831626339752e-07, + "loss": 3.5329, + "step": 2289 + }, + { + "epoch": 0.92, + "grad_norm": 3.413741939457016, + "learning_rate": 3.67793912876705e-07, + "loss": 3.4219, + "step": 2290 + }, + { + "epoch": 0.92, + "grad_norm": 3.9608789183114315, + "learning_rate": 3.643208290967415e-07, + "loss": 3.4362, + "step": 2291 + }, + { + "epoch": 0.92, + "grad_norm": 4.037967550302035, + "learning_rate": 3.608639171230488e-07, + "loss": 3.4947, + "step": 2292 + }, + { + "epoch": 0.92, + "grad_norm": 2.963217206035839, + "learning_rate": 3.5742318275745147e-07, + "loss": 3.4222, + "step": 2293 + }, + { + "epoch": 0.92, + "grad_norm": 4.404616702862916, + "learning_rate": 3.5399863177462024e-07, + "loss": 3.5048, + "step": 2294 + }, + { + "epoch": 0.92, + "grad_norm": 3.5130147839058874, + "learning_rate": 3.5059026992206645e-07, + "loss": 3.5803, + "step": 2295 + }, + { + "epoch": 0.92, + "grad_norm": 3.145610428854735, + "learning_rate": 3.4719810292013214e-07, + "loss": 3.3987, + "step": 2296 + }, + { + "epoch": 0.92, + "grad_norm": 3.1675486907915675, + "learning_rate": 3.438221364619776e-07, + "loss": 3.3274, + "step": 2297 + }, + { + "epoch": 0.92, + "grad_norm": 3.5141499328252555, + "learning_rate": 3.404623762135728e-07, + "loss": 3.2946, + "step": 2298 + }, + { + "epoch": 0.92, + "grad_norm": 3.5931881815304108, + "learning_rate": 3.371188278136883e-07, + "loss": 3.4986, + "step": 2299 + }, + { + "epoch": 0.92, + "grad_norm": 3.237549412583975, + "learning_rate": 3.3379149687388866e-07, + "loss": 3.4054, + "step": 2300 + }, + { + "epoch": 0.92, + "grad_norm": 3.4046964323023268, + "learning_rate": 3.3048038897851576e-07, + "loss": 3.3034, + "step": 2301 + }, + { + "epoch": 0.92, + "grad_norm": 3.288139297934954, + "learning_rate": 3.271855096846899e-07, + "loss": 3.3689, + "step": 2302 + }, + { + "epoch": 0.92, + "grad_norm": 3.512789471000361, + "learning_rate": 3.2390686452228983e-07, + "loss": 3.4543, + "step": 2303 + }, + { + "epoch": 0.92, + "grad_norm": 3.843382156078518, + "learning_rate": 3.2064445899394723e-07, + "loss": 3.1137, + "step": 2304 + }, + { + "epoch": 0.92, + "grad_norm": 4.054753404859892, + "learning_rate": 3.1739829857504235e-07, + "loss": 3.2675, + "step": 2305 + }, + { + "epoch": 0.92, + "grad_norm": 3.567866951457264, + "learning_rate": 3.1416838871368925e-07, + "loss": 3.3668, + "step": 2306 + }, + { + "epoch": 0.92, + "grad_norm": 3.588050596796032, + "learning_rate": 3.1095473483072733e-07, + "loss": 3.5633, + "step": 2307 + }, + { + "epoch": 0.92, + "grad_norm": 3.3237287020633057, + "learning_rate": 3.0775734231971443e-07, + "loss": 3.498, + "step": 2308 + }, + { + "epoch": 0.92, + "grad_norm": 3.5006180356875145, + "learning_rate": 3.045762165469168e-07, + "loss": 3.351, + "step": 2309 + }, + { + "epoch": 0.92, + "grad_norm": 
3.783894516690254, + "learning_rate": 3.0141136285129825e-07, + "loss": 3.3718, + "step": 2310 + }, + { + "epoch": 0.92, + "grad_norm": 3.8365546836380733, + "learning_rate": 2.982627865445109e-07, + "loss": 3.3811, + "step": 2311 + }, + { + "epoch": 0.92, + "grad_norm": 3.6036011049940466, + "learning_rate": 2.951304929108956e-07, + "loss": 3.3192, + "step": 2312 + }, + { + "epoch": 0.93, + "grad_norm": 4.117468304701286, + "learning_rate": 2.9201448720745706e-07, + "loss": 3.3399, + "step": 2313 + }, + { + "epoch": 0.93, + "grad_norm": 3.360859819170253, + "learning_rate": 2.8891477466386987e-07, + "loss": 3.6341, + "step": 2314 + }, + { + "epoch": 0.93, + "grad_norm": 3.7695749961891507, + "learning_rate": 2.8583136048245697e-07, + "loss": 3.2734, + "step": 2315 + }, + { + "epoch": 0.93, + "grad_norm": 3.6133036111449046, + "learning_rate": 2.827642498381955e-07, + "loss": 3.2883, + "step": 2316 + }, + { + "epoch": 0.93, + "grad_norm": 3.4475207107442603, + "learning_rate": 2.7971344787869114e-07, + "loss": 3.231, + "step": 2317 + }, + { + "epoch": 0.93, + "grad_norm": 3.569809544449258, + "learning_rate": 2.76678959724187e-07, + "loss": 3.1757, + "step": 2318 + }, + { + "epoch": 0.93, + "grad_norm": 3.310205615755986, + "learning_rate": 2.7366079046753925e-07, + "loss": 3.4045, + "step": 2319 + }, + { + "epoch": 0.93, + "grad_norm": 3.043808757165406, + "learning_rate": 2.706589451742181e-07, + "loss": 3.341, + "step": 2320 + }, + { + "epoch": 0.93, + "grad_norm": 3.4378814841858576, + "learning_rate": 2.6767342888229907e-07, + "loss": 3.4172, + "step": 2321 + }, + { + "epoch": 0.93, + "grad_norm": 3.3665108545388387, + "learning_rate": 2.647042466024485e-07, + "loss": 3.4747, + "step": 2322 + }, + { + "epoch": 0.93, + "grad_norm": 3.0931178224328666, + "learning_rate": 2.617514033179236e-07, + "loss": 3.3271, + "step": 2323 + }, + { + "epoch": 0.93, + "grad_norm": 3.4596327927003623, + "learning_rate": 2.588149039845533e-07, + "loss": 3.3217, + "step": 2324 + }, + { + "epoch": 0.93, + "grad_norm": 3.611293101153788, + "learning_rate": 2.5589475353073987e-07, + "loss": 3.5053, + "step": 2325 + }, + { + "epoch": 0.93, + "grad_norm": 3.4077089320299736, + "learning_rate": 2.5299095685744734e-07, + "loss": 3.5835, + "step": 2326 + }, + { + "epoch": 0.93, + "grad_norm": 3.362519772264958, + "learning_rate": 2.5010351883819283e-07, + "loss": 3.441, + "step": 2327 + }, + { + "epoch": 0.93, + "grad_norm": 3.6639173507916767, + "learning_rate": 2.472324443190355e-07, + "loss": 3.5446, + "step": 2328 + }, + { + "epoch": 0.93, + "grad_norm": 3.5669895297852423, + "learning_rate": 2.4437773811857304e-07, + "loss": 3.3679, + "step": 2329 + }, + { + "epoch": 0.93, + "grad_norm": 3.9325205560382717, + "learning_rate": 2.4153940502793185e-07, + "loss": 3.3625, + "step": 2330 + }, + { + "epoch": 0.93, + "grad_norm": 3.4262239886835975, + "learning_rate": 2.387174498107614e-07, + "loss": 3.1708, + "step": 2331 + }, + { + "epoch": 0.93, + "grad_norm": 3.382901267977727, + "learning_rate": 2.359118772032176e-07, + "loss": 3.6955, + "step": 2332 + }, + { + "epoch": 0.93, + "grad_norm": 3.8769220455422917, + "learning_rate": 2.3312269191396619e-07, + "loss": 3.4398, + "step": 2333 + }, + { + "epoch": 0.93, + "grad_norm": 3.2645249976733792, + "learning_rate": 2.30349898624167e-07, + "loss": 3.5903, + "step": 2334 + }, + { + "epoch": 0.93, + "grad_norm": 3.2437369619688132, + "learning_rate": 2.2759350198746978e-07, + "loss": 3.2821, + "step": 2335 + }, + { + "epoch": 0.93, + "grad_norm": 3.585064031562485, 
+ "learning_rate": 2.2485350663000727e-07, + "loss": 3.3699, + "step": 2336 + }, + { + "epoch": 0.93, + "grad_norm": 3.529688315884326, + "learning_rate": 2.2212991715038324e-07, + "loss": 3.3273, + "step": 2337 + }, + { + "epoch": 0.94, + "grad_norm": 3.5455952725918034, + "learning_rate": 2.1942273811966563e-07, + "loss": 3.3869, + "step": 2338 + }, + { + "epoch": 0.94, + "grad_norm": 3.7542843609631973, + "learning_rate": 2.1673197408138115e-07, + "loss": 3.5519, + "step": 2339 + }, + { + "epoch": 0.94, + "grad_norm": 3.7024764625310995, + "learning_rate": 2.1405762955151178e-07, + "loss": 3.3642, + "step": 2340 + }, + { + "epoch": 0.94, + "grad_norm": 3.2576909980611903, + "learning_rate": 2.1139970901847607e-07, + "loss": 3.1324, + "step": 2341 + }, + { + "epoch": 0.94, + "grad_norm": 3.291794267112212, + "learning_rate": 2.0875821694313014e-07, + "loss": 3.492, + "step": 2342 + }, + { + "epoch": 0.94, + "grad_norm": 3.851766738786476, + "learning_rate": 2.0613315775875665e-07, + "loss": 3.4191, + "step": 2343 + }, + { + "epoch": 0.94, + "grad_norm": 3.3503396542169863, + "learning_rate": 2.0352453587105914e-07, + "loss": 3.4286, + "step": 2344 + }, + { + "epoch": 0.94, + "grad_norm": 3.0832706870783757, + "learning_rate": 2.009323556581566e-07, + "loss": 3.4376, + "step": 2345 + }, + { + "epoch": 0.94, + "grad_norm": 3.2176834985322738, + "learning_rate": 1.9835662147057012e-07, + "loss": 3.7042, + "step": 2346 + }, + { + "epoch": 0.94, + "grad_norm": 3.508572081745489, + "learning_rate": 1.9579733763121943e-07, + "loss": 3.2487, + "step": 2347 + }, + { + "epoch": 0.94, + "grad_norm": 3.3815565196920945, + "learning_rate": 1.932545084354154e-07, + "loss": 3.3544, + "step": 2348 + }, + { + "epoch": 0.94, + "grad_norm": 3.224405710797126, + "learning_rate": 1.9072813815085523e-07, + "loss": 3.3338, + "step": 2349 + }, + { + "epoch": 0.94, + "grad_norm": 3.587434527550172, + "learning_rate": 1.8821823101760949e-07, + "loss": 3.2779, + "step": 2350 + }, + { + "epoch": 0.94, + "grad_norm": 4.0105448519410976, + "learning_rate": 1.857247912481197e-07, + "loss": 3.5875, + "step": 2351 + }, + { + "epoch": 0.94, + "grad_norm": 3.3545775574783443, + "learning_rate": 1.8324782302718835e-07, + "loss": 3.6504, + "step": 2352 + }, + { + "epoch": 0.94, + "grad_norm": 3.772545381786392, + "learning_rate": 1.8078733051197561e-07, + "loss": 3.2604, + "step": 2353 + }, + { + "epoch": 0.94, + "grad_norm": 3.514296507565377, + "learning_rate": 1.7834331783198933e-07, + "loss": 3.4913, + "step": 2354 + }, + { + "epoch": 0.94, + "grad_norm": 3.2130027457938604, + "learning_rate": 1.7591578908907724e-07, + "loss": 3.2063, + "step": 2355 + }, + { + "epoch": 0.94, + "grad_norm": 3.473408546211247, + "learning_rate": 1.735047483574215e-07, + "loss": 3.4455, + "step": 2356 + }, + { + "epoch": 0.94, + "grad_norm": 4.611810751851434, + "learning_rate": 1.7111019968353625e-07, + "loss": 3.4056, + "step": 2357 + }, + { + "epoch": 0.94, + "grad_norm": 3.448733256723853, + "learning_rate": 1.687321470862524e-07, + "loss": 3.4959, + "step": 2358 + }, + { + "epoch": 0.94, + "grad_norm": 3.1270683895468423, + "learning_rate": 1.6637059455671623e-07, + "loss": 3.4274, + "step": 2359 + }, + { + "epoch": 0.94, + "grad_norm": 3.7016371313782153, + "learning_rate": 1.6402554605838173e-07, + "loss": 3.5068, + "step": 2360 + }, + { + "epoch": 0.94, + "grad_norm": 3.7952189658057955, + "learning_rate": 1.6169700552700284e-07, + "loss": 3.3875, + "step": 2361 + }, + { + "epoch": 0.94, + "grad_norm": 3.6167241598160005, + 
"learning_rate": 1.5938497687062905e-07, + "loss": 3.4302, + "step": 2362 + }, + { + "epoch": 0.95, + "grad_norm": 3.899382383808761, + "learning_rate": 1.5708946396959856e-07, + "loss": 3.112, + "step": 2363 + }, + { + "epoch": 0.95, + "grad_norm": 3.61072616759305, + "learning_rate": 1.5481047067652744e-07, + "loss": 3.0877, + "step": 2364 + }, + { + "epoch": 0.95, + "grad_norm": 3.6528911414970993, + "learning_rate": 1.5254800081630828e-07, + "loss": 3.1509, + "step": 2365 + }, + { + "epoch": 0.95, + "grad_norm": 3.7677655630715092, + "learning_rate": 1.5030205818610255e-07, + "loss": 3.286, + "step": 2366 + }, + { + "epoch": 0.95, + "grad_norm": 3.0290147877844635, + "learning_rate": 1.4807264655533282e-07, + "loss": 3.4737, + "step": 2367 + }, + { + "epoch": 0.95, + "grad_norm": 3.3276357529009775, + "learning_rate": 1.4585976966567826e-07, + "loss": 3.292, + "step": 2368 + }, + { + "epoch": 0.95, + "grad_norm": 3.5897484848827896, + "learning_rate": 1.4366343123106697e-07, + "loss": 3.1764, + "step": 2369 + }, + { + "epoch": 0.95, + "grad_norm": 3.931259962842155, + "learning_rate": 1.4148363493766803e-07, + "loss": 3.2836, + "step": 2370 + }, + { + "epoch": 0.95, + "grad_norm": 2.8715543592371913, + "learning_rate": 1.3932038444389063e-07, + "loss": 3.456, + "step": 2371 + }, + { + "epoch": 0.95, + "grad_norm": 3.6658305690245285, + "learning_rate": 1.3717368338037163e-07, + "loss": 3.5232, + "step": 2372 + }, + { + "epoch": 0.95, + "grad_norm": 3.5552091059925846, + "learning_rate": 1.3504353534997682e-07, + "loss": 3.4265, + "step": 2373 + }, + { + "epoch": 0.95, + "grad_norm": 4.060853993648709, + "learning_rate": 1.3292994392778535e-07, + "loss": 3.6187, + "step": 2374 + }, + { + "epoch": 0.95, + "grad_norm": 3.4127774044560337, + "learning_rate": 1.30832912661093e-07, + "loss": 3.4672, + "step": 2375 + }, + { + "epoch": 0.95, + "grad_norm": 3.3298217946063042, + "learning_rate": 1.287524450694011e-07, + "loss": 3.3182, + "step": 2376 + }, + { + "epoch": 0.95, + "grad_norm": 3.765897310420544, + "learning_rate": 1.2668854464441104e-07, + "loss": 3.5001, + "step": 2377 + }, + { + "epoch": 0.95, + "grad_norm": 3.6799767075818983, + "learning_rate": 1.246412148500198e-07, + "loss": 3.3275, + "step": 2378 + }, + { + "epoch": 0.95, + "grad_norm": 3.271650758753963, + "learning_rate": 1.2261045912231318e-07, + "loss": 3.4898, + "step": 2379 + }, + { + "epoch": 0.95, + "grad_norm": 3.4900191809890475, + "learning_rate": 1.2059628086956044e-07, + "loss": 3.3488, + "step": 2380 + }, + { + "epoch": 0.95, + "grad_norm": 3.432667422028782, + "learning_rate": 1.1859868347220749e-07, + "loss": 3.3059, + "step": 2381 + }, + { + "epoch": 0.95, + "grad_norm": 3.1095534136728507, + "learning_rate": 1.1661767028287363e-07, + "loss": 3.3028, + "step": 2382 + }, + { + "epoch": 0.95, + "grad_norm": 3.2825282377319884, + "learning_rate": 1.1465324462634375e-07, + "loss": 3.2163, + "step": 2383 + }, + { + "epoch": 0.95, + "grad_norm": 3.348291541570649, + "learning_rate": 1.1270540979956501e-07, + "loss": 3.4357, + "step": 2384 + }, + { + "epoch": 0.95, + "grad_norm": 3.8133420709248758, + "learning_rate": 1.1077416907163573e-07, + "loss": 3.3492, + "step": 2385 + }, + { + "epoch": 0.95, + "grad_norm": 4.416116463150174, + "learning_rate": 1.0885952568380764e-07, + "loss": 3.2941, + "step": 2386 + }, + { + "epoch": 0.95, + "grad_norm": 3.2336397899674707, + "learning_rate": 1.0696148284947694e-07, + "loss": 3.2592, + "step": 2387 + }, + { + "epoch": 0.96, + "grad_norm": 3.290423744841851, + 
"learning_rate": 1.0508004375417546e-07, + "loss": 3.4197, + "step": 2388 + }, + { + "epoch": 0.96, + "grad_norm": 3.4718862466754437, + "learning_rate": 1.032152115555718e-07, + "loss": 3.1517, + "step": 2389 + }, + { + "epoch": 0.96, + "grad_norm": 3.4536745523599475, + "learning_rate": 1.0136698938346012e-07, + "loss": 3.3263, + "step": 2390 + }, + { + "epoch": 0.96, + "grad_norm": 3.4892773617678046, + "learning_rate": 9.953538033975918e-08, + "loss": 3.548, + "step": 2391 + }, + { + "epoch": 0.96, + "grad_norm": 3.274808744119864, + "learning_rate": 9.772038749850665e-08, + "loss": 3.4609, + "step": 2392 + }, + { + "epoch": 0.96, + "grad_norm": 3.1066954963040736, + "learning_rate": 9.59220139058492e-08, + "loss": 3.5136, + "step": 2393 + }, + { + "epoch": 0.96, + "grad_norm": 3.819428406700386, + "learning_rate": 9.414026258004583e-08, + "loss": 3.3541, + "step": 2394 + }, + { + "epoch": 0.96, + "grad_norm": 3.5996884033585483, + "learning_rate": 9.237513651145224e-08, + "loss": 3.5706, + "step": 2395 + }, + { + "epoch": 0.96, + "grad_norm": 4.280278075970925, + "learning_rate": 9.062663866252541e-08, + "loss": 3.422, + "step": 2396 + }, + { + "epoch": 0.96, + "grad_norm": 3.6535216978039577, + "learning_rate": 8.889477196781571e-08, + "loss": 3.2123, + "step": 2397 + }, + { + "epoch": 0.96, + "grad_norm": 3.6186688736381183, + "learning_rate": 8.717953933395695e-08, + "loss": 3.5313, + "step": 2398 + }, + { + "epoch": 0.96, + "grad_norm": 3.4552877616197284, + "learning_rate": 8.548094363966974e-08, + "loss": 3.2186, + "step": 2399 + }, + { + "epoch": 0.96, + "grad_norm": 3.614807339334542, + "learning_rate": 8.379898773574924e-08, + "loss": 3.4121, + "step": 2400 + }, + { + "epoch": 0.96, + "grad_norm": 3.560606455212878, + "learning_rate": 8.213367444506515e-08, + "loss": 3.4588, + "step": 2401 + }, + { + "epoch": 0.96, + "grad_norm": 3.894001650217336, + "learning_rate": 8.04850065625551e-08, + "loss": 3.3264, + "step": 2402 + }, + { + "epoch": 0.96, + "grad_norm": 3.7294219687904273, + "learning_rate": 7.885298685522235e-08, + "loss": 3.268, + "step": 2403 + }, + { + "epoch": 0.96, + "grad_norm": 3.3727386747414427, + "learning_rate": 7.723761806212371e-08, + "loss": 3.5093, + "step": 2404 + }, + { + "epoch": 0.96, + "grad_norm": 3.6980302225240456, + "learning_rate": 7.563890289437825e-08, + "loss": 3.2537, + "step": 2405 + }, + { + "epoch": 0.96, + "grad_norm": 3.3141221393707134, + "learning_rate": 7.405684403514635e-08, + "loss": 3.3421, + "step": 2406 + }, + { + "epoch": 0.96, + "grad_norm": 3.943541524063672, + "learning_rate": 7.24914441396396e-08, + "loss": 3.3136, + "step": 2407 + }, + { + "epoch": 0.96, + "grad_norm": 3.621160257306377, + "learning_rate": 7.094270583510976e-08, + "loss": 3.4558, + "step": 2408 + }, + { + "epoch": 0.96, + "grad_norm": 3.611032570898534, + "learning_rate": 6.941063172084094e-08, + "loss": 3.5023, + "step": 2409 + }, + { + "epoch": 0.96, + "grad_norm": 3.069405550611829, + "learning_rate": 6.78952243681541e-08, + "loss": 3.5456, + "step": 2410 + }, + { + "epoch": 0.96, + "grad_norm": 3.2993895937672195, + "learning_rate": 6.639648632039697e-08, + "loss": 3.5449, + "step": 2411 + }, + { + "epoch": 0.96, + "grad_norm": 3.673634537765846, + "learning_rate": 6.491442009293858e-08, + "loss": 3.4421, + "step": 2412 + }, + { + "epoch": 0.97, + "grad_norm": 3.7532147515910252, + "learning_rate": 6.344902817316811e-08, + "loss": 3.3273, + "step": 2413 + }, + { + "epoch": 0.97, + "grad_norm": 3.7110501713964834, + "learning_rate": 
6.200031302049048e-08, + "loss": 3.2886, + "step": 2414 + }, + { + "epoch": 0.97, + "grad_norm": 3.438293669662373, + "learning_rate": 6.056827706632185e-08, + "loss": 3.5594, + "step": 2415 + }, + { + "epoch": 0.97, + "grad_norm": 3.508072088402561, + "learning_rate": 5.915292271408524e-08, + "loss": 3.3602, + "step": 2416 + }, + { + "epoch": 0.97, + "grad_norm": 3.5684197432611096, + "learning_rate": 5.7754252339204955e-08, + "loss": 3.414, + "step": 2417 + }, + { + "epoch": 0.97, + "grad_norm": 4.319063366225016, + "learning_rate": 5.637226828910436e-08, + "loss": 3.3698, + "step": 2418 + }, + { + "epoch": 0.97, + "grad_norm": 3.8336668797631637, + "learning_rate": 5.5006972883204776e-08, + "loss": 3.3801, + "step": 2419 + }, + { + "epoch": 0.97, + "grad_norm": 3.6058057493735585, + "learning_rate": 5.365836841291439e-08, + "loss": 3.4924, + "step": 2420 + }, + { + "epoch": 0.97, + "grad_norm": 3.4600163464627554, + "learning_rate": 5.232645714163265e-08, + "loss": 3.4741, + "step": 2421 + }, + { + "epoch": 0.97, + "grad_norm": 3.480906233508525, + "learning_rate": 5.1011241304738115e-08, + "loss": 3.3203, + "step": 2422 + }, + { + "epoch": 0.97, + "grad_norm": 3.3298471961537905, + "learning_rate": 4.9712723109590636e-08, + "loss": 3.4202, + "step": 2423 + }, + { + "epoch": 0.97, + "grad_norm": 3.337131802937764, + "learning_rate": 4.843090473552914e-08, + "loss": 3.4015, + "step": 2424 + }, + { + "epoch": 0.97, + "grad_norm": 3.2352102980270843, + "learning_rate": 4.716578833386054e-08, + "loss": 3.2217, + "step": 2425 + }, + { + "epoch": 0.97, + "grad_norm": 3.436029228983509, + "learning_rate": 4.5917376027861945e-08, + "loss": 3.2638, + "step": 2426 + }, + { + "epoch": 0.97, + "grad_norm": 3.8479008982912006, + "learning_rate": 4.468566991277512e-08, + "loss": 3.2157, + "step": 2427 + }, + { + "epoch": 0.97, + "grad_norm": 3.500510734627635, + "learning_rate": 4.347067205580424e-08, + "loss": 3.4505, + "step": 2428 + }, + { + "epoch": 0.97, + "grad_norm": 4.057314345010156, + "learning_rate": 4.2272384496112597e-08, + "loss": 3.4177, + "step": 2429 + }, + { + "epoch": 0.97, + "grad_norm": 3.5742843904617185, + "learning_rate": 4.109080924481479e-08, + "loss": 3.2047, + "step": 2430 + }, + { + "epoch": 0.97, + "grad_norm": 3.980513447383297, + "learning_rate": 3.9925948284980086e-08, + "loss": 3.3224, + "step": 2431 + }, + { + "epoch": 0.97, + "grad_norm": 3.523660677303886, + "learning_rate": 3.877780357162353e-08, + "loss": 3.1595, + "step": 2432 + }, + { + "epoch": 0.97, + "grad_norm": 3.7222719612091675, + "learning_rate": 3.764637703170593e-08, + "loss": 3.4171, + "step": 2433 + }, + { + "epoch": 0.97, + "grad_norm": 3.294507134265397, + "learning_rate": 3.653167056413054e-08, + "loss": 3.4806, + "step": 2434 + }, + { + "epoch": 0.97, + "grad_norm": 3.4418301658508055, + "learning_rate": 3.543368603973529e-08, + "loss": 3.2865, + "step": 2435 + }, + { + "epoch": 0.97, + "grad_norm": 3.3616588149744073, + "learning_rate": 3.435242530129723e-08, + "loss": 3.4426, + "step": 2436 + }, + { + "epoch": 0.97, + "grad_norm": 3.9010966248921273, + "learning_rate": 3.3287890163523626e-08, + "loss": 3.5075, + "step": 2437 + }, + { + "epoch": 0.98, + "grad_norm": 3.116521733307449, + "learning_rate": 3.224008241304977e-08, + "loss": 3.2828, + "step": 2438 + }, + { + "epoch": 0.98, + "grad_norm": 3.285127613209865, + "learning_rate": 3.120900380844116e-08, + "loss": 3.3661, + "step": 2439 + }, + { + "epoch": 0.98, + "grad_norm": 3.5071884384835306, + "learning_rate": 3.019465608018024e-08, + 
"loss": 3.5639, + "step": 2440 + }, + { + "epoch": 0.98, + "grad_norm": 3.1870037421829043, + "learning_rate": 2.9197040930674102e-08, + "loss": 3.1974, + "step": 2441 + }, + { + "epoch": 0.98, + "grad_norm": 3.509012040156903, + "learning_rate": 2.8216160034244544e-08, + "loss": 3.3545, + "step": 2442 + }, + { + "epoch": 0.98, + "grad_norm": 3.70637322606787, + "learning_rate": 2.7252015037131373e-08, + "loss": 3.3309, + "step": 2443 + }, + { + "epoch": 0.98, + "grad_norm": 3.8258205503720206, + "learning_rate": 2.6304607557481322e-08, + "loss": 3.3279, + "step": 2444 + }, + { + "epoch": 0.98, + "grad_norm": 3.3015147026301306, + "learning_rate": 2.537393918535358e-08, + "loss": 3.3331, + "step": 2445 + }, + { + "epoch": 0.98, + "grad_norm": 3.558951915848977, + "learning_rate": 2.4460011482713153e-08, + "loss": 3.2497, + "step": 2446 + }, + { + "epoch": 0.98, + "grad_norm": 3.989173866915556, + "learning_rate": 2.3562825983427517e-08, + "loss": 3.5516, + "step": 2447 + }, + { + "epoch": 0.98, + "grad_norm": 3.639158558560462, + "learning_rate": 2.2682384193266625e-08, + "loss": 3.4531, + "step": 2448 + }, + { + "epoch": 0.98, + "grad_norm": 4.331617858251365, + "learning_rate": 2.1818687589896248e-08, + "loss": 3.4148, + "step": 2449 + }, + { + "epoch": 0.98, + "grad_norm": 3.5011814560248915, + "learning_rate": 2.0971737622883515e-08, + "loss": 3.2685, + "step": 2450 + }, + { + "epoch": 0.98, + "grad_norm": 3.4780309027220215, + "learning_rate": 2.01415357136836e-08, + "loss": 3.2917, + "step": 2451 + }, + { + "epoch": 0.98, + "grad_norm": 4.40132084644993, + "learning_rate": 1.93280832556475e-08, + "loss": 3.3335, + "step": 2452 + }, + { + "epoch": 0.98, + "grad_norm": 3.359641935311238, + "learning_rate": 1.8531381614013133e-08, + "loss": 3.4068, + "step": 2453 + }, + { + "epoch": 0.98, + "grad_norm": 3.1694399725333087, + "learning_rate": 1.7751432125903133e-08, + "loss": 3.3822, + "step": 2454 + }, + { + "epoch": 0.98, + "grad_norm": 3.587399184824128, + "learning_rate": 1.698823610032929e-08, + "loss": 3.4473, + "step": 2455 + }, + { + "epoch": 0.98, + "grad_norm": 3.302820237000699, + "learning_rate": 1.6241794818180333e-08, + "loss": 3.2511, + "step": 2456 + }, + { + "epoch": 0.98, + "grad_norm": 3.546048636973119, + "learning_rate": 1.5512109532229703e-08, + "loss": 3.3763, + "step": 2457 + }, + { + "epoch": 0.98, + "grad_norm": 3.6815940730038124, + "learning_rate": 1.4799181467125557e-08, + "loss": 3.5656, + "step": 2458 + }, + { + "epoch": 0.98, + "grad_norm": 3.23628539901506, + "learning_rate": 1.4103011819395218e-08, + "loss": 3.652, + "step": 2459 + }, + { + "epoch": 0.98, + "grad_norm": 4.047804798596946, + "learning_rate": 1.3423601757436289e-08, + "loss": 3.2655, + "step": 2460 + }, + { + "epoch": 0.98, + "grad_norm": 3.528437058663026, + "learning_rate": 1.276095242151998e-08, + "loss": 3.1583, + "step": 2461 + }, + { + "epoch": 0.98, + "grad_norm": 3.4972696524475113, + "learning_rate": 1.2115064923787778e-08, + "loss": 3.344, + "step": 2462 + }, + { + "epoch": 0.99, + "grad_norm": 2.9816851292985413, + "learning_rate": 1.1485940348249235e-08, + "loss": 3.2459, + "step": 2463 + }, + { + "epoch": 0.99, + "grad_norm": 3.5442232507559304, + "learning_rate": 1.087357975078085e-08, + "loss": 3.352, + "step": 2464 + }, + { + "epoch": 0.99, + "grad_norm": 3.4841578124819246, + "learning_rate": 1.0277984159122734e-08, + "loss": 3.423, + "step": 2465 + }, + { + "epoch": 0.99, + "grad_norm": 3.5554413670728247, + "learning_rate": 9.699154572877511e-09, + "loss": 3.3301, + 
"step": 2466 + }, + { + "epoch": 0.99, + "grad_norm": 3.6871360033669887, + "learning_rate": 9.137091963510314e-09, + "loss": 3.2715, + "step": 2467 + }, + { + "epoch": 0.99, + "grad_norm": 3.165684460893977, + "learning_rate": 8.591797274344338e-09, + "loss": 3.4842, + "step": 2468 + }, + { + "epoch": 0.99, + "grad_norm": 2.9791606895583, + "learning_rate": 8.063271420563068e-09, + "loss": 3.4481, + "step": 2469 + }, + { + "epoch": 0.99, + "grad_norm": 2.9882821622236264, + "learning_rate": 7.551515289203615e-09, + "loss": 3.5619, + "step": 2470 + }, + { + "epoch": 0.99, + "grad_norm": 3.7516938084672558, + "learning_rate": 7.056529739158935e-09, + "loss": 3.488, + "step": 2471 + }, + { + "epoch": 0.99, + "grad_norm": 3.80255477519852, + "learning_rate": 6.5783156011778315e-09, + "loss": 3.3546, + "step": 2472 + }, + { + "epoch": 0.99, + "grad_norm": 4.145429658546217, + "learning_rate": 6.116873677858293e-09, + "loss": 3.3526, + "step": 2473 + }, + { + "epoch": 0.99, + "grad_norm": 3.6512344972109707, + "learning_rate": 5.6722047436497115e-09, + "loss": 3.3866, + "step": 2474 + }, + { + "epoch": 0.99, + "grad_norm": 3.9255532464199225, + "learning_rate": 5.2443095448506674e-09, + "loss": 3.4207, + "step": 2475 + }, + { + "epoch": 0.99, + "grad_norm": 3.500865341292617, + "learning_rate": 4.833188799610033e-09, + "loss": 3.5687, + "step": 2476 + }, + { + "epoch": 0.99, + "grad_norm": 3.4307967357678644, + "learning_rate": 4.438843197922538e-09, + "loss": 3.3044, + "step": 2477 + }, + { + "epoch": 0.99, + "grad_norm": 3.3415100972592366, + "learning_rate": 4.061273401627653e-09, + "loss": 3.3692, + "step": 2478 + }, + { + "epoch": 0.99, + "grad_norm": 3.3171125128650845, + "learning_rate": 3.7004800444095933e-09, + "loss": 3.3511, + "step": 2479 + }, + { + "epoch": 0.99, + "grad_norm": 3.563985957009853, + "learning_rate": 3.3564637317984318e-09, + "loss": 3.2359, + "step": 2480 + }, + { + "epoch": 0.99, + "grad_norm": 3.2396809393878967, + "learning_rate": 3.0292250411645406e-09, + "loss": 3.5031, + "step": 2481 + }, + { + "epoch": 0.99, + "grad_norm": 3.487495476298278, + "learning_rate": 2.7187645217219283e-09, + "loss": 3.2537, + "step": 2482 + }, + { + "epoch": 0.99, + "grad_norm": 3.462183993392574, + "learning_rate": 2.4250826945226847e-09, + "loss": 3.4758, + "step": 2483 + }, + { + "epoch": 0.99, + "grad_norm": 3.45474989584317, + "learning_rate": 2.148180052462534e-09, + "loss": 3.4833, + "step": 2484 + }, + { + "epoch": 0.99, + "grad_norm": 3.5726004000973304, + "learning_rate": 1.888057060274173e-09, + "loss": 3.227, + "step": 2485 + }, + { + "epoch": 0.99, + "grad_norm": 3.2144747119002104, + "learning_rate": 1.6447141545272717e-09, + "loss": 3.168, + "step": 2486 + }, + { + "epoch": 0.99, + "grad_norm": 3.608690469923541, + "learning_rate": 1.4181517436306913e-09, + "loss": 3.2924, + "step": 2487 + }, + { + "epoch": 1.0, + "grad_norm": 3.523006771698765, + "learning_rate": 1.2083702078302673e-09, + "loss": 3.4771, + "step": 2488 + }, + { + "epoch": 1.0, + "grad_norm": 3.393916464258931, + "learning_rate": 1.0153698992088069e-09, + "loss": 3.4871, + "step": 2489 + }, + { + "epoch": 1.0, + "grad_norm": 3.5257507681162927, + "learning_rate": 8.391511416816489e-10, + "loss": 3.3328, + "step": 2490 + }, + { + "epoch": 1.0, + "grad_norm": 3.5827728588499856, + "learning_rate": 6.797142310022154e-10, + "loss": 3.0827, + "step": 2491 + }, + { + "epoch": 1.0, + "grad_norm": 3.412401860436348, + "learning_rate": 5.370594347575697e-10, + "loss": 3.4849, + "step": 2492 + }, + { + 
"epoch": 1.0, + "grad_norm": 3.3902248151074974, + "learning_rate": 4.1118699236841753e-10, + "loss": 3.478, + "step": 2493 + }, + { + "epoch": 1.0, + "grad_norm": 3.1063910939480706, + "learning_rate": 3.0209711509132657e-10, + "loss": 3.4109, + "step": 2494 + }, + { + "epoch": 1.0, + "grad_norm": 3.5735345416169753, + "learning_rate": 2.0978998601206558e-10, + "loss": 3.3083, + "step": 2495 + }, + { + "epoch": 1.0, + "grad_norm": 3.4576775876063928, + "learning_rate": 1.342657600544861e-10, + "loss": 3.51, + "step": 2496 + }, + { + "epoch": 1.0, + "grad_norm": 3.835368532804796, + "learning_rate": 7.552456397053042e-11, + "loss": 3.3847, + "step": 2497 + }, + { + "epoch": 1.0, + "grad_norm": 2.9813514920634563, + "learning_rate": 3.3566496349113355e-11, + "loss": 3.5756, + "step": 2498 + }, + { + "epoch": 1.0, + "grad_norm": 3.2882708884674345, + "learning_rate": 8.39162760835066e-12, + "loss": 3.4625, + "step": 2499 + }, + { + "epoch": 1.0, + "grad_norm": 3.528204116206639, + "learning_rate": 0.0, + "loss": 3.2533, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 5.835554469802148e+17, + "train_loss": 4.1818853558540345, + "train_runtime": 17904.7495, + "train_samples_per_second": 4.468, + "train_steps_per_second": 0.14 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 240, + "total_flos": 5.835554469802148e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}