diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22962 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998473981382573, + "eval_steps": 500, + "global_step": 3276, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003052037234854265, + "grad_norm": 19.476922880741295, + "learning_rate": 1.0101010101010103e-07, + "loss": 1.1728, + "step": 1 + }, + { + "epoch": 0.000610407446970853, + "grad_norm": 29.879020388476594, + "learning_rate": 2.0202020202020205e-07, + "loss": 1.0955, + "step": 2 + }, + { + "epoch": 0.0009156111704562796, + "grad_norm": 24.931945947136526, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.9541, + "step": 3 + }, + { + "epoch": 0.001220814893941706, + "grad_norm": 27.83214939667906, + "learning_rate": 4.040404040404041e-07, + "loss": 1.0735, + "step": 4 + }, + { + "epoch": 0.0015260186174271325, + "grad_norm": 21.219233961021736, + "learning_rate": 5.05050505050505e-07, + "loss": 1.0455, + "step": 5 + }, + { + "epoch": 0.0018312223409125592, + "grad_norm": 20.022707446211225, + "learning_rate": 6.060606060606061e-07, + "loss": 0.9675, + "step": 6 + }, + { + "epoch": 0.0021364260643979855, + "grad_norm": 26.532427830157193, + "learning_rate": 7.070707070707071e-07, + "loss": 1.1393, + "step": 7 + }, + { + "epoch": 0.002441629787883412, + "grad_norm": 27.89728780710031, + "learning_rate": 8.080808080808082e-07, + "loss": 1.0952, + "step": 8 + }, + { + "epoch": 0.0027468335113688385, + "grad_norm": 20.346264005570532, + "learning_rate": 9.090909090909091e-07, + "loss": 0.9626, + "step": 9 + }, + { + "epoch": 0.003052037234854265, + "grad_norm": 18.804489508720884, + "learning_rate": 1.01010101010101e-06, + "loss": 1.0255, + "step": 10 + }, + { + "epoch": 0.003357240958339692, + "grad_norm": 19.776534785573535, + "learning_rate": 1.111111111111111e-06, + "loss": 0.7399, + "step": 11 + }, + { + "epoch": 0.0036624446818251184, + "grad_norm": 21.16130386460154, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.5413, + "step": 12 + }, + { + "epoch": 0.0039676484053105445, + "grad_norm": 16.482713371526263, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5773, + "step": 13 + }, + { + "epoch": 0.004272852128795971, + "grad_norm": 10.780528168770594, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.6782, + "step": 14 + }, + { + "epoch": 0.0045780558522813975, + "grad_norm": 7.0900135030469915, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.9153, + "step": 15 + }, + { + "epoch": 0.004883259575766824, + "grad_norm": 8.490445320662754, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.4798, + "step": 16 + }, + { + "epoch": 0.0051884632992522505, + "grad_norm": 6.677142812986669, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.4782, + "step": 17 + }, + { + "epoch": 0.005493667022737677, + "grad_norm": 5.9204247946017485, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.3191, + "step": 18 + }, + { + "epoch": 0.0057988707462231035, + "grad_norm": 5.012462343754674, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.4115, + "step": 19 + }, + { + "epoch": 0.00610407446970853, + "grad_norm": 3.9095937836899113, + "learning_rate": 2.02020202020202e-06, + "loss": 0.6158, + "step": 20 + }, + { + "epoch": 0.006409278193193957, + "grad_norm": 4.438163815129716, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.7388, + "step": 21 + }, + { + "epoch": 0.006714481916679384, + "grad_norm": 3.62875198348435, + "learning_rate": 2.222222222222222e-06, + "loss": 0.2875, + "step": 22 + }, + { + "epoch": 0.00701968564016481, + "grad_norm": 4.963543929599541, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.4662, + "step": 23 + }, + { + "epoch": 0.007324889363650237, + "grad_norm": 4.274904100558248, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.5171, + "step": 24 + }, + { + "epoch": 0.007630093087135663, + "grad_norm": 2.670885047669819, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.4488, + "step": 25 + }, + { + "epoch": 0.007935296810621089, + "grad_norm": 2.6864388610994014, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.372, + "step": 26 + }, + { + "epoch": 0.008240500534106516, + "grad_norm": 3.804357369452407, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.2646, + "step": 27 + }, + { + "epoch": 0.008545704257591942, + "grad_norm": 4.059008227452532, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.5907, + "step": 28 + }, + { + "epoch": 0.008850907981077369, + "grad_norm": 4.9062443629918855, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.2972, + "step": 29 + }, + { + "epoch": 0.009156111704562795, + "grad_norm": 3.5391495380267064, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.3821, + "step": 30 + }, + { + "epoch": 0.009461315428048222, + "grad_norm": 2.5896920322264854, + "learning_rate": 3.131313131313132e-06, + "loss": 0.4164, + "step": 31 + }, + { + "epoch": 0.009766519151533648, + "grad_norm": 3.0230775761822937, + "learning_rate": 3.232323232323233e-06, + "loss": 0.4237, + "step": 32 + }, + { + "epoch": 0.010071722875019075, + "grad_norm": 2.8417717057519423, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3353, + "step": 33 + }, + { + "epoch": 0.010376926598504501, + "grad_norm": 2.5789157463945878, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.3769, + "step": 34 + }, + { + "epoch": 0.010682130321989928, + "grad_norm": 2.5222241581850096, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.519, + "step": 35 + }, + { + "epoch": 0.010987334045475354, + "grad_norm": 2.8704682168269127, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.2829, + "step": 36 + }, + { + "epoch": 0.01129253776896078, + "grad_norm": 3.24684532820184, + "learning_rate": 3.737373737373738e-06, + "loss": 0.3586, + "step": 37 + }, + { + "epoch": 0.011597741492446207, + "grad_norm": 5.24792475783676, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.402, + "step": 38 + }, + { + "epoch": 0.011902945215931634, + "grad_norm": 3.111184671834165, + "learning_rate": 3.93939393939394e-06, + "loss": 0.466, + "step": 39 + }, + { + "epoch": 0.01220814893941706, + "grad_norm": 3.165565566985893, + "learning_rate": 4.04040404040404e-06, + "loss": 0.2678, + "step": 40 + }, + { + "epoch": 0.012513352662902488, + "grad_norm": 2.5486933296193257, + "learning_rate": 4.141414141414142e-06, + "loss": 0.5457, + "step": 41 + }, + { + "epoch": 0.012818556386387915, + "grad_norm": 3.4373721012250438, + "learning_rate": 4.242424242424243e-06, + "loss": 0.3862, + "step": 42 + }, + { + "epoch": 0.013123760109873341, + "grad_norm": 2.863317221380458, + "learning_rate": 4.343434343434344e-06, + "loss": 0.3601, + "step": 43 + }, + { + "epoch": 0.013428963833358768, + "grad_norm": 2.1041128573446035, + "learning_rate": 4.444444444444444e-06, + "loss": 0.3693, + "step": 44 + }, + { + "epoch": 0.013734167556844194, + "grad_norm": 2.286990324679626, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.2513, + "step": 45 + }, + { + "epoch": 0.01403937128032962, + "grad_norm": 8.793466778432636, + "learning_rate": 4.646464646464647e-06, + "loss": 0.4343, + "step": 46 + }, + { + "epoch": 0.014344575003815047, + "grad_norm": 1.8648737533834159, + "learning_rate": 4.747474747474748e-06, + "loss": 0.2631, + "step": 47 + }, + { + "epoch": 0.014649778727300474, + "grad_norm": 2.3081781364995324, + "learning_rate": 4.848484848484849e-06, + "loss": 0.2755, + "step": 48 + }, + { + "epoch": 0.0149549824507859, + "grad_norm": 2.284005369243557, + "learning_rate": 4.94949494949495e-06, + "loss": 0.4186, + "step": 49 + }, + { + "epoch": 0.015260186174271327, + "grad_norm": 2.6759709423238096, + "learning_rate": 5.0505050505050515e-06, + "loss": 0.6459, + "step": 50 + }, + { + "epoch": 0.015565389897756753, + "grad_norm": 2.8773749120652523, + "learning_rate": 5.151515151515152e-06, + "loss": 0.3324, + "step": 51 + }, + { + "epoch": 0.015870593621242178, + "grad_norm": 2.8060164424498786, + "learning_rate": 5.252525252525253e-06, + "loss": 0.3608, + "step": 52 + }, + { + "epoch": 0.016175797344727606, + "grad_norm": 2.3060494229726793, + "learning_rate": 5.353535353535354e-06, + "loss": 0.3818, + "step": 53 + }, + { + "epoch": 0.01648100106821303, + "grad_norm": 2.073464811557714, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.2667, + "step": 54 + }, + { + "epoch": 0.01678620479169846, + "grad_norm": 2.3474749655399245, + "learning_rate": 5.555555555555557e-06, + "loss": 0.35, + "step": 55 + }, + { + "epoch": 0.017091408515183884, + "grad_norm": 3.6988890036672086, + "learning_rate": 5.656565656565657e-06, + "loss": 0.284, + "step": 56 + }, + { + "epoch": 0.017396612238669312, + "grad_norm": 2.313501192849839, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.3308, + "step": 57 + }, + { + "epoch": 0.017701815962154737, + "grad_norm": 2.411936098122121, + "learning_rate": 5.858585858585859e-06, + "loss": 0.3982, + "step": 58 + }, + { + "epoch": 0.018007019685640165, + "grad_norm": 2.724660127775508, + "learning_rate": 5.95959595959596e-06, + "loss": 0.3587, + "step": 59 + }, + { + "epoch": 0.01831222340912559, + "grad_norm": 3.130895013540925, + "learning_rate": 6.060606060606061e-06, + "loss": 0.3427, + "step": 60 + }, + { + "epoch": 0.01861742713261102, + "grad_norm": 3.4261489723004614, + "learning_rate": 6.1616161616161615e-06, + "loss": 0.4578, + "step": 61 + }, + { + "epoch": 0.018922630856096443, + "grad_norm": 2.413871881063889, + "learning_rate": 6.262626262626264e-06, + "loss": 0.2067, + "step": 62 + }, + { + "epoch": 0.01922783457958187, + "grad_norm": 2.0941348505038366, + "learning_rate": 6.363636363636364e-06, + "loss": 0.27, + "step": 63 + }, + { + "epoch": 0.019533038303067296, + "grad_norm": 2.2153240133926153, + "learning_rate": 6.464646464646466e-06, + "loss": 0.3298, + "step": 64 + }, + { + "epoch": 0.019838242026552724, + "grad_norm": 2.422022070572305, + "learning_rate": 6.565656565656566e-06, + "loss": 0.4894, + "step": 65 + }, + { + "epoch": 0.02014344575003815, + "grad_norm": 2.45442660843552, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3684, + "step": 66 + }, + { + "epoch": 0.020448649473523577, + "grad_norm": 3.5398238081108304, + "learning_rate": 6.767676767676769e-06, + "loss": 0.4233, + "step": 67 + }, + { + "epoch": 0.020753853197009002, + "grad_norm": 2.530397719080883, + "learning_rate": 6.868686868686869e-06, + "loss": 0.2676, + "step": 68 + }, + { + "epoch": 0.02105905692049443, + "grad_norm": 2.259346305696615, + "learning_rate": 6.969696969696971e-06, + "loss": 0.4409, + "step": 69 + }, + { + "epoch": 0.021364260643979855, + "grad_norm": 2.3339543424453764, + "learning_rate": 7.070707070707071e-06, + "loss": 0.3882, + "step": 70 + }, + { + "epoch": 0.021669464367465283, + "grad_norm": 2.348843038116063, + "learning_rate": 7.171717171717172e-06, + "loss": 0.3904, + "step": 71 + }, + { + "epoch": 0.021974668090950708, + "grad_norm": 2.7011363922899965, + "learning_rate": 7.272727272727273e-06, + "loss": 0.3586, + "step": 72 + }, + { + "epoch": 0.022279871814436136, + "grad_norm": 2.6923381814173486, + "learning_rate": 7.373737373737374e-06, + "loss": 0.4331, + "step": 73 + }, + { + "epoch": 0.02258507553792156, + "grad_norm": 2.0435337430530924, + "learning_rate": 7.474747474747476e-06, + "loss": 0.2739, + "step": 74 + }, + { + "epoch": 0.02289027926140699, + "grad_norm": 2.257183264462076, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.4554, + "step": 75 + }, + { + "epoch": 0.023195482984892414, + "grad_norm": 2.5384248372961626, + "learning_rate": 7.676767676767677e-06, + "loss": 0.4934, + "step": 76 + }, + { + "epoch": 0.023500686708377842, + "grad_norm": 2.1578730127908488, + "learning_rate": 7.77777777777778e-06, + "loss": 0.3519, + "step": 77 + }, + { + "epoch": 0.023805890431863267, + "grad_norm": 2.1316764516757476, + "learning_rate": 7.87878787878788e-06, + "loss": 0.3268, + "step": 78 + }, + { + "epoch": 0.024111094155348695, + "grad_norm": 2.095996278024237, + "learning_rate": 7.97979797979798e-06, + "loss": 0.3318, + "step": 79 + }, + { + "epoch": 0.02441629787883412, + "grad_norm": 1.9985574049541877, + "learning_rate": 8.08080808080808e-06, + "loss": 0.1852, + "step": 80 + }, + { + "epoch": 0.02472150160231955, + "grad_norm": 1.7092921737326583, + "learning_rate": 8.181818181818183e-06, + "loss": 0.2412, + "step": 81 + }, + { + "epoch": 0.025026705325804977, + "grad_norm": 1.9609482601524066, + "learning_rate": 8.282828282828283e-06, + "loss": 0.3349, + "step": 82 + }, + { + "epoch": 0.0253319090492904, + "grad_norm": 2.5619254980161412, + "learning_rate": 8.383838383838384e-06, + "loss": 0.3327, + "step": 83 + }, + { + "epoch": 0.02563711277277583, + "grad_norm": 2.1734116421771827, + "learning_rate": 8.484848484848486e-06, + "loss": 0.5005, + "step": 84 + }, + { + "epoch": 0.025942316496261254, + "grad_norm": 2.4612836321871785, + "learning_rate": 8.585858585858587e-06, + "loss": 0.5919, + "step": 85 + }, + { + "epoch": 0.026247520219746683, + "grad_norm": 2.050264187978962, + "learning_rate": 8.686868686868687e-06, + "loss": 0.2654, + "step": 86 + }, + { + "epoch": 0.026552723943232107, + "grad_norm": 1.7466792206761999, + "learning_rate": 8.787878787878788e-06, + "loss": 0.2875, + "step": 87 + }, + { + "epoch": 0.026857927666717536, + "grad_norm": 1.9114055019911376, + "learning_rate": 8.888888888888888e-06, + "loss": 0.3317, + "step": 88 + }, + { + "epoch": 0.02716313139020296, + "grad_norm": 2.136028617695754, + "learning_rate": 8.98989898989899e-06, + "loss": 0.4322, + "step": 89 + }, + { + "epoch": 0.02746833511368839, + "grad_norm": 2.0559196693817303, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3372, + "step": 90 + }, + { + "epoch": 0.027773538837173813, + "grad_norm": 1.6053810559753854, + "learning_rate": 9.191919191919193e-06, + "loss": 0.2833, + "step": 91 + }, + { + "epoch": 0.02807874256065924, + "grad_norm": 1.9190338968500587, + "learning_rate": 9.292929292929294e-06, + "loss": 0.2358, + "step": 92 + }, + { + "epoch": 0.028383946284144666, + "grad_norm": 1.7424429804531956, + "learning_rate": 9.393939393939396e-06, + "loss": 0.2805, + "step": 93 + }, + { + "epoch": 0.028689150007630095, + "grad_norm": 1.5616301594921251, + "learning_rate": 9.494949494949497e-06, + "loss": 0.326, + "step": 94 + }, + { + "epoch": 0.02899435373111552, + "grad_norm": 2.6517363851490297, + "learning_rate": 9.595959595959597e-06, + "loss": 0.5839, + "step": 95 + }, + { + "epoch": 0.029299557454600948, + "grad_norm": 1.9068377479857994, + "learning_rate": 9.696969696969698e-06, + "loss": 0.4213, + "step": 96 + }, + { + "epoch": 0.029604761178086372, + "grad_norm": 2.147263972819766, + "learning_rate": 9.797979797979798e-06, + "loss": 0.3776, + "step": 97 + }, + { + "epoch": 0.0299099649015718, + "grad_norm": 2.3466004395170685, + "learning_rate": 9.8989898989899e-06, + "loss": 0.4828, + "step": 98 + }, + { + "epoch": 0.030215168625057225, + "grad_norm": 1.9328188798162316, + "learning_rate": 1e-05, + "loss": 0.3816, + "step": 99 + }, + { + "epoch": 0.030520372348542654, + "grad_norm": 2.120656679761712, + "learning_rate": 9.999997555414177e-06, + "loss": 0.287, + "step": 100 + }, + { + "epoch": 0.03082557607202808, + "grad_norm": 1.8272767014289886, + "learning_rate": 9.999990221659095e-06, + "loss": 0.2529, + "step": 101 + }, + { + "epoch": 0.031130779795513507, + "grad_norm": 2.108876035097533, + "learning_rate": 9.999977998741925e-06, + "loss": 0.4, + "step": 102 + }, + { + "epoch": 0.031435983518998935, + "grad_norm": 2.611227326027621, + "learning_rate": 9.999960886674623e-06, + "loss": 0.5577, + "step": 103 + }, + { + "epoch": 0.031741187242484356, + "grad_norm": 2.012760226088087, + "learning_rate": 9.999938885473916e-06, + "loss": 0.2397, + "step": 104 + }, + { + "epoch": 0.032046390965969784, + "grad_norm": 3.4069313977643088, + "learning_rate": 9.999911995161323e-06, + "loss": 0.3074, + "step": 105 + }, + { + "epoch": 0.03235159468945521, + "grad_norm": 1.5281487804348939, + "learning_rate": 9.999880215763133e-06, + "loss": 0.306, + "step": 106 + }, + { + "epoch": 0.03265679841294064, + "grad_norm": 1.5733903167529437, + "learning_rate": 9.999843547310427e-06, + "loss": 0.3123, + "step": 107 + }, + { + "epoch": 0.03296200213642606, + "grad_norm": 2.2084260837102776, + "learning_rate": 9.999801989839055e-06, + "loss": 0.2686, + "step": 108 + }, + { + "epoch": 0.03326720585991149, + "grad_norm": 2.0235527329790477, + "learning_rate": 9.999755543389658e-06, + "loss": 0.362, + "step": 109 + }, + { + "epoch": 0.03357240958339692, + "grad_norm": 1.4126246608311444, + "learning_rate": 9.999704208007647e-06, + "loss": 0.1868, + "step": 110 + }, + { + "epoch": 0.03387761330688235, + "grad_norm": 1.9363750145032863, + "learning_rate": 9.999647983743227e-06, + "loss": 0.4674, + "step": 111 + }, + { + "epoch": 0.03418281703036777, + "grad_norm": 2.306492812857686, + "learning_rate": 9.999586870651372e-06, + "loss": 0.7454, + "step": 112 + }, + { + "epoch": 0.034488020753853196, + "grad_norm": 1.9927578577114744, + "learning_rate": 9.999520868791839e-06, + "loss": 0.2964, + "step": 113 + }, + { + "epoch": 0.034793224477338625, + "grad_norm": 2.897230200199283, + "learning_rate": 9.99944997822917e-06, + "loss": 0.3507, + "step": 114 + }, + { + "epoch": 0.03509842820082405, + "grad_norm": 1.7040567211820554, + "learning_rate": 9.999374199032682e-06, + "loss": 0.358, + "step": 115 + }, + { + "epoch": 0.035403631924309474, + "grad_norm": 1.7684725864001616, + "learning_rate": 9.999293531276475e-06, + "loss": 0.469, + "step": 116 + }, + { + "epoch": 0.0357088356477949, + "grad_norm": 2.151331613378997, + "learning_rate": 9.999207975039429e-06, + "loss": 0.4007, + "step": 117 + }, + { + "epoch": 0.03601403937128033, + "grad_norm": 2.1827006415812678, + "learning_rate": 9.999117530405205e-06, + "loss": 0.373, + "step": 118 + }, + { + "epoch": 0.03631924309476576, + "grad_norm": 2.0424756244526283, + "learning_rate": 9.99902219746224e-06, + "loss": 0.4664, + "step": 119 + }, + { + "epoch": 0.03662444681825118, + "grad_norm": 2.4438750213097014, + "learning_rate": 9.998921976303757e-06, + "loss": 0.5884, + "step": 120 + }, + { + "epoch": 0.03692965054173661, + "grad_norm": 1.6168805259489245, + "learning_rate": 9.998816867027753e-06, + "loss": 0.3874, + "step": 121 + }, + { + "epoch": 0.03723485426522204, + "grad_norm": 2.4836564854380914, + "learning_rate": 9.99870686973701e-06, + "loss": 0.3865, + "step": 122 + }, + { + "epoch": 0.037540057988707465, + "grad_norm": 2.187549263535683, + "learning_rate": 9.998591984539085e-06, + "loss": 0.4419, + "step": 123 + }, + { + "epoch": 0.037845261712192886, + "grad_norm": 2.3145724108896366, + "learning_rate": 9.998472211546317e-06, + "loss": 0.5048, + "step": 124 + }, + { + "epoch": 0.038150465435678314, + "grad_norm": 2.6043824271784377, + "learning_rate": 9.998347550875825e-06, + "loss": 0.4323, + "step": 125 + }, + { + "epoch": 0.03845566915916374, + "grad_norm": 1.7266964407358079, + "learning_rate": 9.998218002649507e-06, + "loss": 0.3093, + "step": 126 + }, + { + "epoch": 0.03876087288264917, + "grad_norm": 2.3091863655820397, + "learning_rate": 9.99808356699404e-06, + "loss": 0.5394, + "step": 127 + }, + { + "epoch": 0.03906607660613459, + "grad_norm": 2.178584103245907, + "learning_rate": 9.997944244040877e-06, + "loss": 0.562, + "step": 128 + }, + { + "epoch": 0.03937128032962002, + "grad_norm": 1.4762803065381216, + "learning_rate": 9.997800033926252e-06, + "loss": 0.3012, + "step": 129 + }, + { + "epoch": 0.03967648405310545, + "grad_norm": 1.6768704233807339, + "learning_rate": 9.997650936791183e-06, + "loss": 0.3314, + "step": 130 + }, + { + "epoch": 0.03998168777659088, + "grad_norm": 1.8423584681568375, + "learning_rate": 9.997496952781461e-06, + "loss": 0.5373, + "step": 131 + }, + { + "epoch": 0.0402868915000763, + "grad_norm": 1.4926628434179245, + "learning_rate": 9.997338082047656e-06, + "loss": 0.1992, + "step": 132 + }, + { + "epoch": 0.040592095223561726, + "grad_norm": 1.6323074947028773, + "learning_rate": 9.997174324745117e-06, + "loss": 0.4872, + "step": 133 + }, + { + "epoch": 0.040897298947047155, + "grad_norm": 2.159688005520465, + "learning_rate": 9.997005681033973e-06, + "loss": 0.5076, + "step": 134 + }, + { + "epoch": 0.04120250267053258, + "grad_norm": 2.207163038792008, + "learning_rate": 9.996832151079127e-06, + "loss": 0.2677, + "step": 135 + }, + { + "epoch": 0.041507706394018004, + "grad_norm": 1.3990677420334965, + "learning_rate": 9.996653735050265e-06, + "loss": 0.2526, + "step": 136 + }, + { + "epoch": 0.04181291011750343, + "grad_norm": 1.7368886105229604, + "learning_rate": 9.996470433121847e-06, + "loss": 0.2874, + "step": 137 + }, + { + "epoch": 0.04211811384098886, + "grad_norm": 1.8138446424045762, + "learning_rate": 9.996282245473113e-06, + "loss": 0.2986, + "step": 138 + }, + { + "epoch": 0.04242331756447429, + "grad_norm": 1.8564789601928355, + "learning_rate": 9.996089172288078e-06, + "loss": 0.3954, + "step": 139 + }, + { + "epoch": 0.04272852128795971, + "grad_norm": 1.9085920361180522, + "learning_rate": 9.995891213755536e-06, + "loss": 0.2739, + "step": 140 + }, + { + "epoch": 0.04303372501144514, + "grad_norm": 1.8924678931794556, + "learning_rate": 9.99568837006906e-06, + "loss": 0.2766, + "step": 141 + }, + { + "epoch": 0.04333892873493057, + "grad_norm": 1.8418836037208652, + "learning_rate": 9.995480641426992e-06, + "loss": 0.488, + "step": 142 + }, + { + "epoch": 0.043644132458415995, + "grad_norm": 1.6305125707231247, + "learning_rate": 9.99526802803246e-06, + "loss": 0.3045, + "step": 143 + }, + { + "epoch": 0.043949336181901416, + "grad_norm": 2.143051665423358, + "learning_rate": 9.995050530093366e-06, + "loss": 0.3567, + "step": 144 + }, + { + "epoch": 0.044254539905386844, + "grad_norm": 1.994194545633334, + "learning_rate": 9.994828147822387e-06, + "loss": 0.3655, + "step": 145 + }, + { + "epoch": 0.04455974362887227, + "grad_norm": 1.8553346605537173, + "learning_rate": 9.994600881436972e-06, + "loss": 0.3249, + "step": 146 + }, + { + "epoch": 0.0448649473523577, + "grad_norm": 2.1613773805709857, + "learning_rate": 9.994368731159351e-06, + "loss": 0.4863, + "step": 147 + }, + { + "epoch": 0.04517015107584312, + "grad_norm": 2.199571706523493, + "learning_rate": 9.99413169721653e-06, + "loss": 0.465, + "step": 148 + }, + { + "epoch": 0.04547535479932855, + "grad_norm": 1.681707967900651, + "learning_rate": 9.99388977984029e-06, + "loss": 0.3472, + "step": 149 + }, + { + "epoch": 0.04578055852281398, + "grad_norm": 1.6586587053140593, + "learning_rate": 9.993642979267184e-06, + "loss": 0.3626, + "step": 150 + }, + { + "epoch": 0.04608576224629941, + "grad_norm": 2.12592721793332, + "learning_rate": 9.993391295738542e-06, + "loss": 0.3218, + "step": 151 + }, + { + "epoch": 0.04639096596978483, + "grad_norm": 1.6765944279655143, + "learning_rate": 9.99313472950047e-06, + "loss": 0.3402, + "step": 152 + }, + { + "epoch": 0.046696169693270256, + "grad_norm": 1.6019038139070678, + "learning_rate": 9.992873280803848e-06, + "loss": 0.4554, + "step": 153 + }, + { + "epoch": 0.047001373416755685, + "grad_norm": 1.6429860881882794, + "learning_rate": 9.99260694990433e-06, + "loss": 0.4086, + "step": 154 + }, + { + "epoch": 0.04730657714024111, + "grad_norm": 1.98592334325083, + "learning_rate": 9.992335737062338e-06, + "loss": 0.5733, + "step": 155 + }, + { + "epoch": 0.047611780863726534, + "grad_norm": 1.5624846648417388, + "learning_rate": 9.992059642543076e-06, + "loss": 0.2524, + "step": 156 + }, + { + "epoch": 0.04791698458721196, + "grad_norm": 1.4438198320418865, + "learning_rate": 9.991778666616523e-06, + "loss": 0.1756, + "step": 157 + }, + { + "epoch": 0.04822218831069739, + "grad_norm": 1.6284817295660008, + "learning_rate": 9.991492809557424e-06, + "loss": 0.4144, + "step": 158 + }, + { + "epoch": 0.04852739203418282, + "grad_norm": 1.2236340789910145, + "learning_rate": 9.991202071645298e-06, + "loss": 0.1664, + "step": 159 + }, + { + "epoch": 0.04883259575766824, + "grad_norm": 1.4874398163232816, + "learning_rate": 9.99090645316444e-06, + "loss": 0.3323, + "step": 160 + }, + { + "epoch": 0.04913779948115367, + "grad_norm": 2.5394515927833403, + "learning_rate": 9.990605954403917e-06, + "loss": 0.27, + "step": 161 + }, + { + "epoch": 0.0494430032046391, + "grad_norm": 1.7966332314422868, + "learning_rate": 9.990300575657565e-06, + "loss": 0.4453, + "step": 162 + }, + { + "epoch": 0.049748206928124525, + "grad_norm": 1.825976682624809, + "learning_rate": 9.989990317223995e-06, + "loss": 0.2646, + "step": 163 + }, + { + "epoch": 0.05005341065160995, + "grad_norm": 1.6554541925183588, + "learning_rate": 9.989675179406588e-06, + "loss": 0.445, + "step": 164 + }, + { + "epoch": 0.050358614375095374, + "grad_norm": 1.6711133844293076, + "learning_rate": 9.989355162513496e-06, + "loss": 0.3685, + "step": 165 + }, + { + "epoch": 0.0506638180985808, + "grad_norm": 1.8033315345252203, + "learning_rate": 9.989030266857644e-06, + "loss": 0.2566, + "step": 166 + }, + { + "epoch": 0.05096902182206623, + "grad_norm": 1.6879852444966537, + "learning_rate": 9.988700492756726e-06, + "loss": 0.4086, + "step": 167 + }, + { + "epoch": 0.05127422554555166, + "grad_norm": 1.6855038740169574, + "learning_rate": 9.988365840533204e-06, + "loss": 0.3081, + "step": 168 + }, + { + "epoch": 0.05157942926903708, + "grad_norm": 2.245121010490438, + "learning_rate": 9.988026310514316e-06, + "loss": 0.5646, + "step": 169 + }, + { + "epoch": 0.05188463299252251, + "grad_norm": 1.531117336209479, + "learning_rate": 9.987681903032065e-06, + "loss": 0.3598, + "step": 170 + }, + { + "epoch": 0.05218983671600794, + "grad_norm": 1.4368727600956301, + "learning_rate": 9.987332618423221e-06, + "loss": 0.3864, + "step": 171 + }, + { + "epoch": 0.052495040439493365, + "grad_norm": 2.039026486601271, + "learning_rate": 9.98697845702933e-06, + "loss": 0.2728, + "step": 172 + }, + { + "epoch": 0.052800244162978786, + "grad_norm": 1.5481974795842472, + "learning_rate": 9.986619419196704e-06, + "loss": 0.2376, + "step": 173 + }, + { + "epoch": 0.053105447886464215, + "grad_norm": 1.583025735121783, + "learning_rate": 9.986255505276418e-06, + "loss": 0.3941, + "step": 174 + }, + { + "epoch": 0.05341065160994964, + "grad_norm": 2.025610033619695, + "learning_rate": 9.985886715624326e-06, + "loss": 0.432, + "step": 175 + }, + { + "epoch": 0.05371585533343507, + "grad_norm": 1.9370365819159912, + "learning_rate": 9.985513050601037e-06, + "loss": 0.3311, + "step": 176 + }, + { + "epoch": 0.05402105905692049, + "grad_norm": 1.534591376747653, + "learning_rate": 9.985134510571936e-06, + "loss": 0.3804, + "step": 177 + }, + { + "epoch": 0.05432626278040592, + "grad_norm": 1.5627980520171343, + "learning_rate": 9.984751095907175e-06, + "loss": 0.3991, + "step": 178 + }, + { + "epoch": 0.05463146650389135, + "grad_norm": 1.858760828475349, + "learning_rate": 9.984362806981665e-06, + "loss": 0.4124, + "step": 179 + }, + { + "epoch": 0.05493667022737678, + "grad_norm": 1.4922057145689682, + "learning_rate": 9.983969644175092e-06, + "loss": 0.2571, + "step": 180 + }, + { + "epoch": 0.0552418739508622, + "grad_norm": 1.4358215484460224, + "learning_rate": 9.983571607871903e-06, + "loss": 0.3351, + "step": 181 + }, + { + "epoch": 0.05554707767434763, + "grad_norm": 1.7105120125454414, + "learning_rate": 9.983168698461312e-06, + "loss": 0.4374, + "step": 182 + }, + { + "epoch": 0.055852281397833055, + "grad_norm": 1.4100459259074987, + "learning_rate": 9.982760916337296e-06, + "loss": 0.3958, + "step": 183 + }, + { + "epoch": 0.05615748512131848, + "grad_norm": 1.667173817085955, + "learning_rate": 9.982348261898598e-06, + "loss": 0.2867, + "step": 184 + }, + { + "epoch": 0.056462688844803904, + "grad_norm": 1.8278737995984025, + "learning_rate": 9.981930735548731e-06, + "loss": 0.3738, + "step": 185 + }, + { + "epoch": 0.05676789256828933, + "grad_norm": 1.806852289121097, + "learning_rate": 9.98150833769596e-06, + "loss": 0.5608, + "step": 186 + }, + { + "epoch": 0.05707309629177476, + "grad_norm": 1.6986308867720055, + "learning_rate": 9.981081068753324e-06, + "loss": 0.4253, + "step": 187 + }, + { + "epoch": 0.05737830001526019, + "grad_norm": 1.6392088091109513, + "learning_rate": 9.98064892913862e-06, + "loss": 0.2444, + "step": 188 + }, + { + "epoch": 0.05768350373874561, + "grad_norm": 1.7762995408711126, + "learning_rate": 9.980211919274407e-06, + "loss": 0.3866, + "step": 189 + }, + { + "epoch": 0.05798870746223104, + "grad_norm": 1.7144647062044762, + "learning_rate": 9.979770039588013e-06, + "loss": 0.4504, + "step": 190 + }, + { + "epoch": 0.05829391118571647, + "grad_norm": 1.9069269572943617, + "learning_rate": 9.979323290511517e-06, + "loss": 0.4972, + "step": 191 + }, + { + "epoch": 0.058599114909201895, + "grad_norm": 1.831943664409223, + "learning_rate": 9.978871672481774e-06, + "loss": 0.3884, + "step": 192 + }, + { + "epoch": 0.058904318632687316, + "grad_norm": 1.60483584957947, + "learning_rate": 9.978415185940383e-06, + "loss": 0.3366, + "step": 193 + }, + { + "epoch": 0.059209522356172745, + "grad_norm": 2.041633475935638, + "learning_rate": 9.977953831333718e-06, + "loss": 0.4928, + "step": 194 + }, + { + "epoch": 0.05951472607965817, + "grad_norm": 2.1574861604284243, + "learning_rate": 9.977487609112904e-06, + "loss": 0.7092, + "step": 195 + }, + { + "epoch": 0.0598199298031436, + "grad_norm": 1.5382345073334531, + "learning_rate": 9.97701651973383e-06, + "loss": 0.2236, + "step": 196 + }, + { + "epoch": 0.06012513352662902, + "grad_norm": 2.1479787995768014, + "learning_rate": 9.976540563657143e-06, + "loss": 0.5182, + "step": 197 + }, + { + "epoch": 0.06043033725011445, + "grad_norm": 1.8579437774142544, + "learning_rate": 9.976059741348252e-06, + "loss": 0.3093, + "step": 198 + }, + { + "epoch": 0.06073554097359988, + "grad_norm": 1.5409701380525285, + "learning_rate": 9.975574053277317e-06, + "loss": 0.2877, + "step": 199 + }, + { + "epoch": 0.06104074469708531, + "grad_norm": 1.5474598097011698, + "learning_rate": 9.975083499919264e-06, + "loss": 0.2981, + "step": 200 + }, + { + "epoch": 0.06134594842057073, + "grad_norm": 1.9202152932180157, + "learning_rate": 9.974588081753773e-06, + "loss": 0.5369, + "step": 201 + }, + { + "epoch": 0.06165115214405616, + "grad_norm": 1.4598442515817716, + "learning_rate": 9.974087799265279e-06, + "loss": 0.3696, + "step": 202 + }, + { + "epoch": 0.061956355867541585, + "grad_norm": 1.48078814360119, + "learning_rate": 9.973582652942975e-06, + "loss": 0.284, + "step": 203 + }, + { + "epoch": 0.06226155959102701, + "grad_norm": 2.100326004155181, + "learning_rate": 9.973072643280813e-06, + "loss": 0.5681, + "step": 204 + }, + { + "epoch": 0.06256676331451244, + "grad_norm": 1.976128330719915, + "learning_rate": 9.972557770777496e-06, + "loss": 0.3655, + "step": 205 + }, + { + "epoch": 0.06287196703799787, + "grad_norm": 1.2103730393566896, + "learning_rate": 9.972038035936483e-06, + "loss": 0.2471, + "step": 206 + }, + { + "epoch": 0.06317717076148328, + "grad_norm": 1.670449906238349, + "learning_rate": 9.971513439265992e-06, + "loss": 0.2184, + "step": 207 + }, + { + "epoch": 0.06348237448496871, + "grad_norm": 1.5020544764497652, + "learning_rate": 9.970983981278989e-06, + "loss": 0.3196, + "step": 208 + }, + { + "epoch": 0.06378757820845414, + "grad_norm": 1.7833251911345853, + "learning_rate": 9.970449662493195e-06, + "loss": 0.4122, + "step": 209 + }, + { + "epoch": 0.06409278193193957, + "grad_norm": 1.4149595334362772, + "learning_rate": 9.96991048343109e-06, + "loss": 0.2947, + "step": 210 + }, + { + "epoch": 0.064397985655425, + "grad_norm": 1.5991867680932033, + "learning_rate": 9.969366444619898e-06, + "loss": 0.1902, + "step": 211 + }, + { + "epoch": 0.06470318937891043, + "grad_norm": 1.4132064841734169, + "learning_rate": 9.968817546591601e-06, + "loss": 0.3389, + "step": 212 + }, + { + "epoch": 0.06500839310239585, + "grad_norm": 1.7671902900221814, + "learning_rate": 9.968263789882926e-06, + "loss": 0.4294, + "step": 213 + }, + { + "epoch": 0.06531359682588128, + "grad_norm": 1.5709821497329826, + "learning_rate": 9.96770517503536e-06, + "loss": 0.2765, + "step": 214 + }, + { + "epoch": 0.0656188005493667, + "grad_norm": 1.5211731343844295, + "learning_rate": 9.967141702595134e-06, + "loss": 0.387, + "step": 215 + }, + { + "epoch": 0.06592400427285212, + "grad_norm": 1.5499265222668686, + "learning_rate": 9.96657337311323e-06, + "loss": 0.4535, + "step": 216 + }, + { + "epoch": 0.06622920799633755, + "grad_norm": 1.4736546539447488, + "learning_rate": 9.966000187145383e-06, + "loss": 0.3834, + "step": 217 + }, + { + "epoch": 0.06653441171982298, + "grad_norm": 1.3306288958233108, + "learning_rate": 9.965422145252072e-06, + "loss": 0.3172, + "step": 218 + }, + { + "epoch": 0.06683961544330841, + "grad_norm": 1.5745937005003143, + "learning_rate": 9.964839247998524e-06, + "loss": 0.2725, + "step": 219 + }, + { + "epoch": 0.06714481916679384, + "grad_norm": 1.7546511557153388, + "learning_rate": 9.96425149595472e-06, + "loss": 0.3577, + "step": 220 + }, + { + "epoch": 0.06745002289027927, + "grad_norm": 2.0422588449754286, + "learning_rate": 9.96365888969538e-06, + "loss": 0.4976, + "step": 221 + }, + { + "epoch": 0.0677552266137647, + "grad_norm": 1.4661824124133862, + "learning_rate": 9.963061429799979e-06, + "loss": 0.3672, + "step": 222 + }, + { + "epoch": 0.06806043033725011, + "grad_norm": 2.0959067552369666, + "learning_rate": 9.96245911685273e-06, + "loss": 0.5381, + "step": 223 + }, + { + "epoch": 0.06836563406073554, + "grad_norm": 1.3296813372997014, + "learning_rate": 9.961851951442599e-06, + "loss": 0.2799, + "step": 224 + }, + { + "epoch": 0.06867083778422096, + "grad_norm": 1.7385807765114274, + "learning_rate": 9.96123993416329e-06, + "loss": 0.5183, + "step": 225 + }, + { + "epoch": 0.06897604150770639, + "grad_norm": 1.5190119701865645, + "learning_rate": 9.960623065613254e-06, + "loss": 0.4608, + "step": 226 + }, + { + "epoch": 0.06928124523119182, + "grad_norm": 1.4393894383331207, + "learning_rate": 9.96000134639569e-06, + "loss": 0.3455, + "step": 227 + }, + { + "epoch": 0.06958644895467725, + "grad_norm": 1.7132863682619555, + "learning_rate": 9.959374777118533e-06, + "loss": 0.316, + "step": 228 + }, + { + "epoch": 0.06989165267816268, + "grad_norm": 1.3227120889592454, + "learning_rate": 9.958743358394464e-06, + "loss": 0.2467, + "step": 229 + }, + { + "epoch": 0.0701968564016481, + "grad_norm": 1.5331153407144422, + "learning_rate": 9.95810709084091e-06, + "loss": 0.3138, + "step": 230 + }, + { + "epoch": 0.07050206012513352, + "grad_norm": 1.7990748995190806, + "learning_rate": 9.957465975080031e-06, + "loss": 0.4747, + "step": 231 + }, + { + "epoch": 0.07080726384861895, + "grad_norm": 1.1638981235859056, + "learning_rate": 9.956820011738736e-06, + "loss": 0.2265, + "step": 232 + }, + { + "epoch": 0.07111246757210438, + "grad_norm": 1.5739388418179414, + "learning_rate": 9.956169201448665e-06, + "loss": 0.5066, + "step": 233 + }, + { + "epoch": 0.0714176712955898, + "grad_norm": 1.6803933013620869, + "learning_rate": 9.955513544846205e-06, + "loss": 0.4415, + "step": 234 + }, + { + "epoch": 0.07172287501907523, + "grad_norm": 1.4014872110785643, + "learning_rate": 9.954853042572479e-06, + "loss": 0.3271, + "step": 235 + }, + { + "epoch": 0.07202807874256066, + "grad_norm": 1.5310222689941932, + "learning_rate": 9.954187695273352e-06, + "loss": 0.3289, + "step": 236 + }, + { + "epoch": 0.07233328246604609, + "grad_norm": 2.166268226472017, + "learning_rate": 9.953517503599419e-06, + "loss": 0.622, + "step": 237 + }, + { + "epoch": 0.07263848618953152, + "grad_norm": 2.258081862277545, + "learning_rate": 9.952842468206019e-06, + "loss": 0.5071, + "step": 238 + }, + { + "epoch": 0.07294368991301693, + "grad_norm": 1.7322119894263104, + "learning_rate": 9.952162589753224e-06, + "loss": 0.5097, + "step": 239 + }, + { + "epoch": 0.07324889363650236, + "grad_norm": 1.9966284228033864, + "learning_rate": 9.951477868905843e-06, + "loss": 0.2263, + "step": 240 + }, + { + "epoch": 0.07355409735998779, + "grad_norm": 1.6793267860774614, + "learning_rate": 9.95078830633342e-06, + "loss": 0.2065, + "step": 241 + }, + { + "epoch": 0.07385930108347322, + "grad_norm": 2.122564153881175, + "learning_rate": 9.95009390271023e-06, + "loss": 0.2665, + "step": 242 + }, + { + "epoch": 0.07416450480695864, + "grad_norm": 1.5852282963187305, + "learning_rate": 9.949394658715289e-06, + "loss": 0.4453, + "step": 243 + }, + { + "epoch": 0.07446970853044407, + "grad_norm": 1.7534712016120517, + "learning_rate": 9.948690575032338e-06, + "loss": 0.3628, + "step": 244 + }, + { + "epoch": 0.0747749122539295, + "grad_norm": 1.351810586905304, + "learning_rate": 9.947981652349854e-06, + "loss": 0.3984, + "step": 245 + }, + { + "epoch": 0.07508011597741493, + "grad_norm": 1.8377506474408298, + "learning_rate": 9.947267891361051e-06, + "loss": 0.3677, + "step": 246 + }, + { + "epoch": 0.07538531970090036, + "grad_norm": 1.4655632998364951, + "learning_rate": 9.946549292763865e-06, + "loss": 0.3516, + "step": 247 + }, + { + "epoch": 0.07569052342438577, + "grad_norm": 3.240838121636416, + "learning_rate": 9.945825857260967e-06, + "loss": 0.2627, + "step": 248 + }, + { + "epoch": 0.0759957271478712, + "grad_norm": 1.4085823215183912, + "learning_rate": 9.945097585559757e-06, + "loss": 0.2716, + "step": 249 + }, + { + "epoch": 0.07630093087135663, + "grad_norm": 1.6361471921651585, + "learning_rate": 9.944364478372364e-06, + "loss": 0.3595, + "step": 250 + }, + { + "epoch": 0.07660613459484206, + "grad_norm": 1.0912978886499554, + "learning_rate": 9.943626536415647e-06, + "loss": 0.1968, + "step": 251 + }, + { + "epoch": 0.07691133831832749, + "grad_norm": 1.9515717700893849, + "learning_rate": 9.942883760411188e-06, + "loss": 0.374, + "step": 252 + }, + { + "epoch": 0.07721654204181291, + "grad_norm": 1.5560755068838334, + "learning_rate": 9.942136151085302e-06, + "loss": 0.44, + "step": 253 + }, + { + "epoch": 0.07752174576529834, + "grad_norm": 1.4843235207715992, + "learning_rate": 9.941383709169024e-06, + "loss": 0.3175, + "step": 254 + }, + { + "epoch": 0.07782694948878377, + "grad_norm": 1.5210960196158274, + "learning_rate": 9.94062643539812e-06, + "loss": 0.3722, + "step": 255 + }, + { + "epoch": 0.07813215321226918, + "grad_norm": 1.6656094376801425, + "learning_rate": 9.939864330513079e-06, + "loss": 0.3511, + "step": 256 + }, + { + "epoch": 0.07843735693575461, + "grad_norm": 1.2732857455769802, + "learning_rate": 9.939097395259108e-06, + "loss": 0.2619, + "step": 257 + }, + { + "epoch": 0.07874256065924004, + "grad_norm": 1.8947301386622588, + "learning_rate": 9.938325630386149e-06, + "loss": 0.3933, + "step": 258 + }, + { + "epoch": 0.07904776438272547, + "grad_norm": 1.5625416559388712, + "learning_rate": 9.937549036648857e-06, + "loss": 0.4491, + "step": 259 + }, + { + "epoch": 0.0793529681062109, + "grad_norm": 1.5125179888703784, + "learning_rate": 9.936767614806612e-06, + "loss": 0.3674, + "step": 260 + }, + { + "epoch": 0.07965817182969633, + "grad_norm": 1.5026525250547669, + "learning_rate": 9.935981365623516e-06, + "loss": 0.4103, + "step": 261 + }, + { + "epoch": 0.07996337555318175, + "grad_norm": 2.3948536293362115, + "learning_rate": 9.93519028986839e-06, + "loss": 0.4009, + "step": 262 + }, + { + "epoch": 0.08026857927666718, + "grad_norm": 2.416554371647352, + "learning_rate": 9.934394388314775e-06, + "loss": 0.4265, + "step": 263 + }, + { + "epoch": 0.0805737830001526, + "grad_norm": 1.560923734953618, + "learning_rate": 9.933593661740933e-06, + "loss": 0.303, + "step": 264 + }, + { + "epoch": 0.08087898672363802, + "grad_norm": 1.6053945705234087, + "learning_rate": 9.932788110929837e-06, + "loss": 0.3295, + "step": 265 + }, + { + "epoch": 0.08118419044712345, + "grad_norm": 1.7775437462596928, + "learning_rate": 9.931977736669185e-06, + "loss": 0.2197, + "step": 266 + }, + { + "epoch": 0.08148939417060888, + "grad_norm": 1.701318325041301, + "learning_rate": 9.931162539751392e-06, + "loss": 0.3581, + "step": 267 + }, + { + "epoch": 0.08179459789409431, + "grad_norm": 1.5974548511363529, + "learning_rate": 9.93034252097358e-06, + "loss": 0.3432, + "step": 268 + }, + { + "epoch": 0.08209980161757974, + "grad_norm": 1.8669593065073864, + "learning_rate": 9.929517681137594e-06, + "loss": 0.4133, + "step": 269 + }, + { + "epoch": 0.08240500534106517, + "grad_norm": 1.4895827642408586, + "learning_rate": 9.928688021049991e-06, + "loss": 0.3111, + "step": 270 + }, + { + "epoch": 0.0827102090645506, + "grad_norm": 1.4317804244871846, + "learning_rate": 9.927853541522041e-06, + "loss": 0.2915, + "step": 271 + }, + { + "epoch": 0.08301541278803601, + "grad_norm": 1.252478145781798, + "learning_rate": 9.927014243369727e-06, + "loss": 0.2794, + "step": 272 + }, + { + "epoch": 0.08332061651152144, + "grad_norm": 1.6973954865497314, + "learning_rate": 9.926170127413743e-06, + "loss": 0.6183, + "step": 273 + }, + { + "epoch": 0.08362582023500686, + "grad_norm": 1.4723277244112698, + "learning_rate": 9.925321194479494e-06, + "loss": 0.2815, + "step": 274 + }, + { + "epoch": 0.08393102395849229, + "grad_norm": 1.7075555550514414, + "learning_rate": 9.924467445397097e-06, + "loss": 0.4178, + "step": 275 + }, + { + "epoch": 0.08423622768197772, + "grad_norm": 1.5354808046910606, + "learning_rate": 9.923608881001377e-06, + "loss": 0.2355, + "step": 276 + }, + { + "epoch": 0.08454143140546315, + "grad_norm": 1.1795750747565834, + "learning_rate": 9.922745502131865e-06, + "loss": 0.3404, + "step": 277 + }, + { + "epoch": 0.08484663512894858, + "grad_norm": 1.427067758888222, + "learning_rate": 9.921877309632805e-06, + "loss": 0.3141, + "step": 278 + }, + { + "epoch": 0.085151838852434, + "grad_norm": 1.3691564278772157, + "learning_rate": 9.921004304353147e-06, + "loss": 0.287, + "step": 279 + }, + { + "epoch": 0.08545704257591942, + "grad_norm": 1.9220775714586407, + "learning_rate": 9.920126487146544e-06, + "loss": 0.6617, + "step": 280 + }, + { + "epoch": 0.08576224629940485, + "grad_norm": 1.6761030408371134, + "learning_rate": 9.919243858871355e-06, + "loss": 0.466, + "step": 281 + }, + { + "epoch": 0.08606745002289028, + "grad_norm": 1.6120747264173168, + "learning_rate": 9.918356420390645e-06, + "loss": 0.5351, + "step": 282 + }, + { + "epoch": 0.0863726537463757, + "grad_norm": 1.5236961732014556, + "learning_rate": 9.91746417257218e-06, + "loss": 0.33, + "step": 283 + }, + { + "epoch": 0.08667785746986113, + "grad_norm": 1.6328635321860312, + "learning_rate": 9.916567116288434e-06, + "loss": 0.4301, + "step": 284 + }, + { + "epoch": 0.08698306119334656, + "grad_norm": 1.4120804188821041, + "learning_rate": 9.915665252416577e-06, + "loss": 0.3025, + "step": 285 + }, + { + "epoch": 0.08728826491683199, + "grad_norm": 1.8410843798908767, + "learning_rate": 9.914758581838482e-06, + "loss": 0.5415, + "step": 286 + }, + { + "epoch": 0.08759346864031742, + "grad_norm": 1.1807475096034001, + "learning_rate": 9.913847105440725e-06, + "loss": 0.3184, + "step": 287 + }, + { + "epoch": 0.08789867236380283, + "grad_norm": 1.52681276111022, + "learning_rate": 9.912930824114577e-06, + "loss": 0.4266, + "step": 288 + }, + { + "epoch": 0.08820387608728826, + "grad_norm": 1.4904538614169496, + "learning_rate": 9.91200973875601e-06, + "loss": 0.3404, + "step": 289 + }, + { + "epoch": 0.08850907981077369, + "grad_norm": 1.7385111110311349, + "learning_rate": 9.911083850265692e-06, + "loss": 0.3371, + "step": 290 + }, + { + "epoch": 0.08881428353425912, + "grad_norm": 1.6013762575114376, + "learning_rate": 9.91015315954899e-06, + "loss": 0.4475, + "step": 291 + }, + { + "epoch": 0.08911948725774455, + "grad_norm": 1.5474202900018152, + "learning_rate": 9.909217667515964e-06, + "loss": 0.4162, + "step": 292 + }, + { + "epoch": 0.08942469098122997, + "grad_norm": 1.875769203080621, + "learning_rate": 9.908277375081371e-06, + "loss": 0.4446, + "step": 293 + }, + { + "epoch": 0.0897298947047154, + "grad_norm": 1.4914731218024286, + "learning_rate": 9.907332283164663e-06, + "loss": 0.4274, + "step": 294 + }, + { + "epoch": 0.09003509842820083, + "grad_norm": 1.6551811079983538, + "learning_rate": 9.90638239268998e-06, + "loss": 0.4883, + "step": 295 + }, + { + "epoch": 0.09034030215168624, + "grad_norm": 1.645510927644492, + "learning_rate": 9.905427704586158e-06, + "loss": 0.4885, + "step": 296 + }, + { + "epoch": 0.09064550587517167, + "grad_norm": 1.6759165462483547, + "learning_rate": 9.904468219786727e-06, + "loss": 0.3878, + "step": 297 + }, + { + "epoch": 0.0909507095986571, + "grad_norm": 1.596800484010474, + "learning_rate": 9.903503939229901e-06, + "loss": 0.2725, + "step": 298 + }, + { + "epoch": 0.09125591332214253, + "grad_norm": 1.4035704196730787, + "learning_rate": 9.902534863858588e-06, + "loss": 0.2147, + "step": 299 + }, + { + "epoch": 0.09156111704562796, + "grad_norm": 1.7460761357385464, + "learning_rate": 9.90156099462038e-06, + "loss": 0.3495, + "step": 300 + }, + { + "epoch": 0.09186632076911339, + "grad_norm": 1.3373562156184522, + "learning_rate": 9.900582332467566e-06, + "loss": 0.342, + "step": 301 + }, + { + "epoch": 0.09217152449259881, + "grad_norm": 1.1466755748188362, + "learning_rate": 9.89959887835711e-06, + "loss": 0.1737, + "step": 302 + }, + { + "epoch": 0.09247672821608424, + "grad_norm": 1.8078659273922337, + "learning_rate": 9.898610633250669e-06, + "loss": 0.3111, + "step": 303 + }, + { + "epoch": 0.09278193193956966, + "grad_norm": 1.5400638324339648, + "learning_rate": 9.897617598114584e-06, + "loss": 0.4746, + "step": 304 + }, + { + "epoch": 0.09308713566305508, + "grad_norm": 1.558728128630052, + "learning_rate": 9.896619773919878e-06, + "loss": 0.3085, + "step": 305 + }, + { + "epoch": 0.09339233938654051, + "grad_norm": 4.094736926672729, + "learning_rate": 9.895617161642257e-06, + "loss": 0.4664, + "step": 306 + }, + { + "epoch": 0.09369754311002594, + "grad_norm": 1.63116898024897, + "learning_rate": 9.89460976226211e-06, + "loss": 0.3878, + "step": 307 + }, + { + "epoch": 0.09400274683351137, + "grad_norm": 1.7238364123731507, + "learning_rate": 9.893597576764508e-06, + "loss": 0.2989, + "step": 308 + }, + { + "epoch": 0.0943079505569968, + "grad_norm": 1.2496662648050174, + "learning_rate": 9.8925806061392e-06, + "loss": 0.3054, + "step": 309 + }, + { + "epoch": 0.09461315428048223, + "grad_norm": 0.8807197003313585, + "learning_rate": 9.891558851380614e-06, + "loss": 0.1904, + "step": 310 + }, + { + "epoch": 0.09491835800396765, + "grad_norm": 1.5076918479598347, + "learning_rate": 9.890532313487858e-06, + "loss": 0.2679, + "step": 311 + }, + { + "epoch": 0.09522356172745307, + "grad_norm": 1.8465691043660122, + "learning_rate": 9.889500993464716e-06, + "loss": 0.5002, + "step": 312 + }, + { + "epoch": 0.0955287654509385, + "grad_norm": 1.9183643810942494, + "learning_rate": 9.888464892319647e-06, + "loss": 0.4869, + "step": 313 + }, + { + "epoch": 0.09583396917442392, + "grad_norm": 1.6515373264151805, + "learning_rate": 9.887424011065788e-06, + "loss": 0.4507, + "step": 314 + }, + { + "epoch": 0.09613917289790935, + "grad_norm": 1.6223391241834122, + "learning_rate": 9.886378350720945e-06, + "loss": 0.3445, + "step": 315 + }, + { + "epoch": 0.09644437662139478, + "grad_norm": 1.4416645097808285, + "learning_rate": 9.885327912307604e-06, + "loss": 0.2808, + "step": 316 + }, + { + "epoch": 0.09674958034488021, + "grad_norm": 1.4777192121308136, + "learning_rate": 9.88427269685292e-06, + "loss": 0.4335, + "step": 317 + }, + { + "epoch": 0.09705478406836564, + "grad_norm": 1.6934694740555867, + "learning_rate": 9.883212705388715e-06, + "loss": 0.4299, + "step": 318 + }, + { + "epoch": 0.09735998779185107, + "grad_norm": 1.9031284601590377, + "learning_rate": 9.882147938951489e-06, + "loss": 0.5364, + "step": 319 + }, + { + "epoch": 0.09766519151533648, + "grad_norm": 1.990035566558448, + "learning_rate": 9.881078398582406e-06, + "loss": 0.6476, + "step": 320 + }, + { + "epoch": 0.09797039523882191, + "grad_norm": 1.4458600630840748, + "learning_rate": 9.8800040853273e-06, + "loss": 0.268, + "step": 321 + }, + { + "epoch": 0.09827559896230734, + "grad_norm": 1.473557254783057, + "learning_rate": 9.878925000236667e-06, + "loss": 0.3889, + "step": 322 + }, + { + "epoch": 0.09858080268579276, + "grad_norm": 1.429462352597184, + "learning_rate": 9.877841144365681e-06, + "loss": 0.3348, + "step": 323 + }, + { + "epoch": 0.0988860064092782, + "grad_norm": 1.9126483909533352, + "learning_rate": 9.876752518774167e-06, + "loss": 0.5004, + "step": 324 + }, + { + "epoch": 0.09919121013276362, + "grad_norm": 1.528278815830415, + "learning_rate": 9.875659124526622e-06, + "loss": 0.1931, + "step": 325 + }, + { + "epoch": 0.09949641385624905, + "grad_norm": 1.6064809314060318, + "learning_rate": 9.874560962692207e-06, + "loss": 0.2627, + "step": 326 + }, + { + "epoch": 0.09980161757973448, + "grad_norm": 1.8583002911468363, + "learning_rate": 9.873458034344741e-06, + "loss": 0.4795, + "step": 327 + }, + { + "epoch": 0.1001068213032199, + "grad_norm": 2.180040993961252, + "learning_rate": 9.872350340562704e-06, + "loss": 0.3502, + "step": 328 + }, + { + "epoch": 0.10041202502670532, + "grad_norm": 2.2760944374886334, + "learning_rate": 9.871237882429237e-06, + "loss": 0.5504, + "step": 329 + }, + { + "epoch": 0.10071722875019075, + "grad_norm": 1.599604903553732, + "learning_rate": 9.87012066103214e-06, + "loss": 0.4642, + "step": 330 + }, + { + "epoch": 0.10102243247367618, + "grad_norm": 1.165592902920284, + "learning_rate": 9.868998677463874e-06, + "loss": 0.2118, + "step": 331 + }, + { + "epoch": 0.1013276361971616, + "grad_norm": 1.562687671834839, + "learning_rate": 9.867871932821549e-06, + "loss": 0.3389, + "step": 332 + }, + { + "epoch": 0.10163283992064703, + "grad_norm": 1.5690938291630006, + "learning_rate": 9.866740428206935e-06, + "loss": 0.3084, + "step": 333 + }, + { + "epoch": 0.10193804364413246, + "grad_norm": 1.6392469885959746, + "learning_rate": 9.865604164726456e-06, + "loss": 0.3935, + "step": 334 + }, + { + "epoch": 0.10224324736761789, + "grad_norm": 1.4807871775535164, + "learning_rate": 9.864463143491192e-06, + "loss": 0.4445, + "step": 335 + }, + { + "epoch": 0.10254845109110332, + "grad_norm": 1.1926217304533164, + "learning_rate": 9.86331736561687e-06, + "loss": 0.2623, + "step": 336 + }, + { + "epoch": 0.10285365481458873, + "grad_norm": 1.5461262133304665, + "learning_rate": 9.862166832223871e-06, + "loss": 0.4161, + "step": 337 + }, + { + "epoch": 0.10315885853807416, + "grad_norm": 1.5135662608063911, + "learning_rate": 9.861011544437226e-06, + "loss": 0.2864, + "step": 338 + }, + { + "epoch": 0.10346406226155959, + "grad_norm": 1.2723458882899108, + "learning_rate": 9.85985150338662e-06, + "loss": 0.3208, + "step": 339 + }, + { + "epoch": 0.10376926598504502, + "grad_norm": 1.5555768560283307, + "learning_rate": 9.858686710206373e-06, + "loss": 0.3341, + "step": 340 + }, + { + "epoch": 0.10407446970853045, + "grad_norm": 1.7562240544367693, + "learning_rate": 9.857517166035466e-06, + "loss": 0.4637, + "step": 341 + }, + { + "epoch": 0.10437967343201587, + "grad_norm": 3.9759170817857044, + "learning_rate": 9.856342872017515e-06, + "loss": 0.6559, + "step": 342 + }, + { + "epoch": 0.1046848771555013, + "grad_norm": 1.6864581247305628, + "learning_rate": 9.855163829300789e-06, + "loss": 0.4889, + "step": 343 + }, + { + "epoch": 0.10499008087898673, + "grad_norm": 1.5660173118774432, + "learning_rate": 9.853980039038193e-06, + "loss": 0.2512, + "step": 344 + }, + { + "epoch": 0.10529528460247214, + "grad_norm": 1.747919617181712, + "learning_rate": 9.85279150238728e-06, + "loss": 0.4796, + "step": 345 + }, + { + "epoch": 0.10560048832595757, + "grad_norm": 2.0064830609007496, + "learning_rate": 9.85159822051024e-06, + "loss": 0.6876, + "step": 346 + }, + { + "epoch": 0.105905692049443, + "grad_norm": 3.232705832740987, + "learning_rate": 9.850400194573908e-06, + "loss": 0.5043, + "step": 347 + }, + { + "epoch": 0.10621089577292843, + "grad_norm": 1.6750365694523466, + "learning_rate": 9.849197425749753e-06, + "loss": 0.4426, + "step": 348 + }, + { + "epoch": 0.10651609949641386, + "grad_norm": 1.2407353388150502, + "learning_rate": 9.847989915213883e-06, + "loss": 0.2728, + "step": 349 + }, + { + "epoch": 0.10682130321989929, + "grad_norm": 1.145770915094197, + "learning_rate": 9.846777664147046e-06, + "loss": 0.1828, + "step": 350 + }, + { + "epoch": 0.10712650694338471, + "grad_norm": 1.0259526919339048, + "learning_rate": 9.845560673734617e-06, + "loss": 0.1815, + "step": 351 + }, + { + "epoch": 0.10743171066687014, + "grad_norm": 1.5927868344920564, + "learning_rate": 9.844338945166619e-06, + "loss": 0.4563, + "step": 352 + }, + { + "epoch": 0.10773691439035556, + "grad_norm": 1.862511817843432, + "learning_rate": 9.843112479637692e-06, + "loss": 0.4873, + "step": 353 + }, + { + "epoch": 0.10804211811384098, + "grad_norm": 1.3346380863318066, + "learning_rate": 9.841881278347122e-06, + "loss": 0.344, + "step": 354 + }, + { + "epoch": 0.10834732183732641, + "grad_norm": 1.2558059799685006, + "learning_rate": 9.840645342498817e-06, + "loss": 0.3489, + "step": 355 + }, + { + "epoch": 0.10865252556081184, + "grad_norm": 1.734496310003891, + "learning_rate": 9.839404673301318e-06, + "loss": 0.4161, + "step": 356 + }, + { + "epoch": 0.10895772928429727, + "grad_norm": 1.5538063811233147, + "learning_rate": 9.838159271967795e-06, + "loss": 0.4971, + "step": 357 + }, + { + "epoch": 0.1092629330077827, + "grad_norm": 1.51967394631939, + "learning_rate": 9.836909139716044e-06, + "loss": 0.3422, + "step": 358 + }, + { + "epoch": 0.10956813673126813, + "grad_norm": 1.8153898943745974, + "learning_rate": 9.835654277768487e-06, + "loss": 0.5826, + "step": 359 + }, + { + "epoch": 0.10987334045475355, + "grad_norm": 1.3956832615282058, + "learning_rate": 9.834394687352168e-06, + "loss": 0.3444, + "step": 360 + }, + { + "epoch": 0.11017854417823897, + "grad_norm": 1.7028097158357791, + "learning_rate": 9.833130369698764e-06, + "loss": 0.4653, + "step": 361 + }, + { + "epoch": 0.1104837479017244, + "grad_norm": 1.984086744142622, + "learning_rate": 9.831861326044564e-06, + "loss": 0.7268, + "step": 362 + }, + { + "epoch": 0.11078895162520982, + "grad_norm": 1.6010962575202456, + "learning_rate": 9.830587557630481e-06, + "loss": 0.4979, + "step": 363 + }, + { + "epoch": 0.11109415534869525, + "grad_norm": 1.163967768763629, + "learning_rate": 9.829309065702054e-06, + "loss": 0.2721, + "step": 364 + }, + { + "epoch": 0.11139935907218068, + "grad_norm": 1.5116526665647354, + "learning_rate": 9.828025851509433e-06, + "loss": 0.4504, + "step": 365 + }, + { + "epoch": 0.11170456279566611, + "grad_norm": 1.3126613426949747, + "learning_rate": 9.82673791630739e-06, + "loss": 0.2915, + "step": 366 + }, + { + "epoch": 0.11200976651915154, + "grad_norm": 1.2919896962667308, + "learning_rate": 9.825445261355313e-06, + "loss": 0.2862, + "step": 367 + }, + { + "epoch": 0.11231497024263697, + "grad_norm": 1.7465438774619029, + "learning_rate": 9.824147887917201e-06, + "loss": 0.5347, + "step": 368 + }, + { + "epoch": 0.11262017396612238, + "grad_norm": 1.5226131458552274, + "learning_rate": 9.822845797261676e-06, + "loss": 0.4764, + "step": 369 + }, + { + "epoch": 0.11292537768960781, + "grad_norm": 1.157991129368488, + "learning_rate": 9.82153899066196e-06, + "loss": 0.2002, + "step": 370 + }, + { + "epoch": 0.11323058141309324, + "grad_norm": 0.9301400886155634, + "learning_rate": 9.8202274693959e-06, + "loss": 0.1684, + "step": 371 + }, + { + "epoch": 0.11353578513657867, + "grad_norm": 1.502487900950231, + "learning_rate": 9.818911234745942e-06, + "loss": 0.3093, + "step": 372 + }, + { + "epoch": 0.1138409888600641, + "grad_norm": 1.9784385010881411, + "learning_rate": 9.817590287999149e-06, + "loss": 0.4803, + "step": 373 + }, + { + "epoch": 0.11414619258354952, + "grad_norm": 0.9195545540370847, + "learning_rate": 9.816264630447186e-06, + "loss": 0.2159, + "step": 374 + }, + { + "epoch": 0.11445139630703495, + "grad_norm": 1.9332444000616873, + "learning_rate": 9.814934263386324e-06, + "loss": 0.704, + "step": 375 + }, + { + "epoch": 0.11475660003052038, + "grad_norm": 1.2982271338494382, + "learning_rate": 9.813599188117447e-06, + "loss": 0.2579, + "step": 376 + }, + { + "epoch": 0.11506180375400579, + "grad_norm": 1.4400207145843447, + "learning_rate": 9.812259405946033e-06, + "loss": 0.2581, + "step": 377 + }, + { + "epoch": 0.11536700747749122, + "grad_norm": 1.6214016909138647, + "learning_rate": 9.810914918182168e-06, + "loss": 0.3348, + "step": 378 + }, + { + "epoch": 0.11567221120097665, + "grad_norm": 1.3037284021243603, + "learning_rate": 9.80956572614054e-06, + "loss": 0.2009, + "step": 379 + }, + { + "epoch": 0.11597741492446208, + "grad_norm": 1.4597999101205241, + "learning_rate": 9.808211831140434e-06, + "loss": 0.4417, + "step": 380 + }, + { + "epoch": 0.1162826186479475, + "grad_norm": 1.3133108268461724, + "learning_rate": 9.806853234505736e-06, + "loss": 0.3305, + "step": 381 + }, + { + "epoch": 0.11658782237143293, + "grad_norm": 1.455145466929522, + "learning_rate": 9.805489937564926e-06, + "loss": 0.4611, + "step": 382 + }, + { + "epoch": 0.11689302609491836, + "grad_norm": 1.3560358010182432, + "learning_rate": 9.804121941651085e-06, + "loss": 0.2624, + "step": 383 + }, + { + "epoch": 0.11719822981840379, + "grad_norm": 1.5730489544680661, + "learning_rate": 9.802749248101885e-06, + "loss": 0.5959, + "step": 384 + }, + { + "epoch": 0.1175034335418892, + "grad_norm": 1.6080011966995038, + "learning_rate": 9.801371858259594e-06, + "loss": 0.3077, + "step": 385 + }, + { + "epoch": 0.11780863726537463, + "grad_norm": 1.3452073998773173, + "learning_rate": 9.799989773471071e-06, + "loss": 0.3877, + "step": 386 + }, + { + "epoch": 0.11811384098886006, + "grad_norm": 1.521531541360139, + "learning_rate": 9.798602995087764e-06, + "loss": 0.2978, + "step": 387 + }, + { + "epoch": 0.11841904471234549, + "grad_norm": 1.705542362457564, + "learning_rate": 9.797211524465715e-06, + "loss": 0.4298, + "step": 388 + }, + { + "epoch": 0.11872424843583092, + "grad_norm": 2.110762528312395, + "learning_rate": 9.79581536296555e-06, + "loss": 0.4766, + "step": 389 + }, + { + "epoch": 0.11902945215931635, + "grad_norm": 1.8442245131411212, + "learning_rate": 9.794414511952484e-06, + "loss": 0.2988, + "step": 390 + }, + { + "epoch": 0.11933465588280177, + "grad_norm": 1.2950053698748747, + "learning_rate": 9.793008972796318e-06, + "loss": 0.2907, + "step": 391 + }, + { + "epoch": 0.1196398596062872, + "grad_norm": 1.1692504994324242, + "learning_rate": 9.791598746871438e-06, + "loss": 0.2584, + "step": 392 + }, + { + "epoch": 0.11994506332977262, + "grad_norm": 1.7847769770487698, + "learning_rate": 9.790183835556806e-06, + "loss": 0.4874, + "step": 393 + }, + { + "epoch": 0.12025026705325804, + "grad_norm": 1.2562325547558533, + "learning_rate": 9.788764240235976e-06, + "loss": 0.3739, + "step": 394 + }, + { + "epoch": 0.12055547077674347, + "grad_norm": 1.340465345857484, + "learning_rate": 9.787339962297076e-06, + "loss": 0.1911, + "step": 395 + }, + { + "epoch": 0.1208606745002289, + "grad_norm": 1.5667496682899116, + "learning_rate": 9.785911003132811e-06, + "loss": 0.4243, + "step": 396 + }, + { + "epoch": 0.12116587822371433, + "grad_norm": 1.81118055527733, + "learning_rate": 9.78447736414047e-06, + "loss": 0.5246, + "step": 397 + }, + { + "epoch": 0.12147108194719976, + "grad_norm": 1.4053388398772566, + "learning_rate": 9.783039046721912e-06, + "loss": 0.2964, + "step": 398 + }, + { + "epoch": 0.12177628567068519, + "grad_norm": 1.5253960954907508, + "learning_rate": 9.781596052283573e-06, + "loss": 0.3939, + "step": 399 + }, + { + "epoch": 0.12208148939417061, + "grad_norm": 1.4250712259795966, + "learning_rate": 9.780148382236465e-06, + "loss": 0.2338, + "step": 400 + }, + { + "epoch": 0.12238669311765603, + "grad_norm": 1.6445234212439845, + "learning_rate": 9.778696037996167e-06, + "loss": 0.6218, + "step": 401 + }, + { + "epoch": 0.12269189684114146, + "grad_norm": 1.625655408880631, + "learning_rate": 9.777239020982834e-06, + "loss": 0.4565, + "step": 402 + }, + { + "epoch": 0.12299710056462689, + "grad_norm": 1.2734470484088918, + "learning_rate": 9.775777332621184e-06, + "loss": 0.3673, + "step": 403 + }, + { + "epoch": 0.12330230428811231, + "grad_norm": 1.5775928166525761, + "learning_rate": 9.774310974340506e-06, + "loss": 0.3673, + "step": 404 + }, + { + "epoch": 0.12360750801159774, + "grad_norm": 1.2252627733176171, + "learning_rate": 9.772839947574658e-06, + "loss": 0.4064, + "step": 405 + }, + { + "epoch": 0.12391271173508317, + "grad_norm": 1.7189364245488037, + "learning_rate": 9.77136425376206e-06, + "loss": 0.4633, + "step": 406 + }, + { + "epoch": 0.1242179154585686, + "grad_norm": 1.7558994988767551, + "learning_rate": 9.769883894345693e-06, + "loss": 0.5655, + "step": 407 + }, + { + "epoch": 0.12452311918205403, + "grad_norm": 1.4802133948901528, + "learning_rate": 9.768398870773108e-06, + "loss": 0.4175, + "step": 408 + }, + { + "epoch": 0.12482832290553945, + "grad_norm": 1.3302751078782091, + "learning_rate": 9.766909184496408e-06, + "loss": 0.3468, + "step": 409 + }, + { + "epoch": 0.12513352662902488, + "grad_norm": 1.704721594870751, + "learning_rate": 9.765414836972262e-06, + "loss": 0.4898, + "step": 410 + }, + { + "epoch": 0.1254387303525103, + "grad_norm": 1.2131375907547226, + "learning_rate": 9.763915829661891e-06, + "loss": 0.3886, + "step": 411 + }, + { + "epoch": 0.12574393407599574, + "grad_norm": 1.3401723742285034, + "learning_rate": 9.76241216403108e-06, + "loss": 0.4102, + "step": 412 + }, + { + "epoch": 0.12604913779948115, + "grad_norm": 1.6513004120286505, + "learning_rate": 9.760903841550164e-06, + "loss": 0.433, + "step": 413 + }, + { + "epoch": 0.12635434152296657, + "grad_norm": 2.2390646637016327, + "learning_rate": 9.75939086369403e-06, + "loss": 0.4213, + "step": 414 + }, + { + "epoch": 0.126659545246452, + "grad_norm": 1.585281461881457, + "learning_rate": 9.757873231942122e-06, + "loss": 0.2145, + "step": 415 + }, + { + "epoch": 0.12696474896993742, + "grad_norm": 1.616278056338578, + "learning_rate": 9.756350947778431e-06, + "loss": 0.3786, + "step": 416 + }, + { + "epoch": 0.12726995269342287, + "grad_norm": 1.3896314522086528, + "learning_rate": 9.754824012691499e-06, + "loss": 0.3886, + "step": 417 + }, + { + "epoch": 0.12757515641690828, + "grad_norm": 1.5388381717065547, + "learning_rate": 9.753292428174416e-06, + "loss": 0.4901, + "step": 418 + }, + { + "epoch": 0.12788036014039372, + "grad_norm": 1.2099556172325527, + "learning_rate": 9.75175619572482e-06, + "loss": 0.2379, + "step": 419 + }, + { + "epoch": 0.12818556386387914, + "grad_norm": 2.578673148403812, + "learning_rate": 9.750215316844886e-06, + "loss": 0.3093, + "step": 420 + }, + { + "epoch": 0.12849076758736458, + "grad_norm": 1.3557950817043143, + "learning_rate": 9.748669793041345e-06, + "loss": 0.2901, + "step": 421 + }, + { + "epoch": 0.12879597131085, + "grad_norm": 1.8058846206435177, + "learning_rate": 9.747119625825459e-06, + "loss": 0.5222, + "step": 422 + }, + { + "epoch": 0.1291011750343354, + "grad_norm": 1.449090093366816, + "learning_rate": 9.745564816713034e-06, + "loss": 0.3738, + "step": 423 + }, + { + "epoch": 0.12940637875782085, + "grad_norm": 1.3181509799847857, + "learning_rate": 9.74400536722442e-06, + "loss": 0.2624, + "step": 424 + }, + { + "epoch": 0.12971158248130626, + "grad_norm": 2.1599699357377675, + "learning_rate": 9.742441278884496e-06, + "loss": 0.4838, + "step": 425 + }, + { + "epoch": 0.1300167862047917, + "grad_norm": 1.6258096878519581, + "learning_rate": 9.740872553222685e-06, + "loss": 0.4999, + "step": 426 + }, + { + "epoch": 0.13032198992827712, + "grad_norm": 1.5046387312958875, + "learning_rate": 9.739299191772937e-06, + "loss": 0.3095, + "step": 427 + }, + { + "epoch": 0.13062719365176256, + "grad_norm": 3.2565177704876653, + "learning_rate": 9.737721196073742e-06, + "loss": 0.4886, + "step": 428 + }, + { + "epoch": 0.13093239737524798, + "grad_norm": 5.072257540244327, + "learning_rate": 9.736138567668117e-06, + "loss": 0.2433, + "step": 429 + }, + { + "epoch": 0.1312376010987334, + "grad_norm": 1.5984597272623309, + "learning_rate": 9.734551308103607e-06, + "loss": 0.3274, + "step": 430 + }, + { + "epoch": 0.13154280482221883, + "grad_norm": 1.9766310728941525, + "learning_rate": 9.732959418932297e-06, + "loss": 0.5601, + "step": 431 + }, + { + "epoch": 0.13184800854570425, + "grad_norm": 1.8500535183975242, + "learning_rate": 9.731362901710783e-06, + "loss": 0.8055, + "step": 432 + }, + { + "epoch": 0.1321532122691897, + "grad_norm": 1.5082490121163843, + "learning_rate": 9.7297617580002e-06, + "loss": 0.3715, + "step": 433 + }, + { + "epoch": 0.1324584159926751, + "grad_norm": 1.3484742708817405, + "learning_rate": 9.728155989366198e-06, + "loss": 0.2316, + "step": 434 + }, + { + "epoch": 0.13276361971616055, + "grad_norm": 1.4064281774599339, + "learning_rate": 9.726545597378953e-06, + "loss": 0.2529, + "step": 435 + }, + { + "epoch": 0.13306882343964596, + "grad_norm": 1.3619428913058238, + "learning_rate": 9.724930583613164e-06, + "loss": 0.2675, + "step": 436 + }, + { + "epoch": 0.1333740271631314, + "grad_norm": 1.8072201350967787, + "learning_rate": 9.723310949648044e-06, + "loss": 0.4832, + "step": 437 + }, + { + "epoch": 0.13367923088661682, + "grad_norm": 1.376837369774298, + "learning_rate": 9.721686697067328e-06, + "loss": 0.3855, + "step": 438 + }, + { + "epoch": 0.13398443461010223, + "grad_norm": 1.577084491012061, + "learning_rate": 9.720057827459264e-06, + "loss": 0.3307, + "step": 439 + }, + { + "epoch": 0.13428963833358767, + "grad_norm": 1.7335930513549882, + "learning_rate": 9.718424342416619e-06, + "loss": 0.3794, + "step": 440 + }, + { + "epoch": 0.1345948420570731, + "grad_norm": 1.3979759520207362, + "learning_rate": 9.716786243536672e-06, + "loss": 0.17, + "step": 441 + }, + { + "epoch": 0.13490004578055853, + "grad_norm": 1.5166020341173607, + "learning_rate": 9.715143532421208e-06, + "loss": 0.2782, + "step": 442 + }, + { + "epoch": 0.13520524950404395, + "grad_norm": 1.5547284451560954, + "learning_rate": 9.71349621067653e-06, + "loss": 0.2398, + "step": 443 + }, + { + "epoch": 0.1355104532275294, + "grad_norm": 1.4706416022210265, + "learning_rate": 9.711844279913443e-06, + "loss": 0.3486, + "step": 444 + }, + { + "epoch": 0.1358156569510148, + "grad_norm": 1.7726813901729983, + "learning_rate": 9.710187741747264e-06, + "loss": 0.4938, + "step": 445 + }, + { + "epoch": 0.13612086067450022, + "grad_norm": 1.3682846898358798, + "learning_rate": 9.708526597797812e-06, + "loss": 0.3399, + "step": 446 + }, + { + "epoch": 0.13642606439798566, + "grad_norm": 1.173206742476641, + "learning_rate": 9.70686084968941e-06, + "loss": 0.209, + "step": 447 + }, + { + "epoch": 0.13673126812147107, + "grad_norm": 1.5006667880365117, + "learning_rate": 9.705190499050885e-06, + "loss": 0.3918, + "step": 448 + }, + { + "epoch": 0.13703647184495651, + "grad_norm": 1.4081281623516984, + "learning_rate": 9.70351554751556e-06, + "loss": 0.3478, + "step": 449 + }, + { + "epoch": 0.13734167556844193, + "grad_norm": 1.3521857616183288, + "learning_rate": 9.701835996721267e-06, + "loss": 0.2527, + "step": 450 + }, + { + "epoch": 0.13764687929192737, + "grad_norm": 1.609701105059192, + "learning_rate": 9.70015184831032e-06, + "loss": 0.4019, + "step": 451 + }, + { + "epoch": 0.13795208301541279, + "grad_norm": 1.5247656094886828, + "learning_rate": 9.698463103929542e-06, + "loss": 0.3393, + "step": 452 + }, + { + "epoch": 0.13825728673889823, + "grad_norm": 1.7521263555747508, + "learning_rate": 9.696769765230244e-06, + "loss": 0.4711, + "step": 453 + }, + { + "epoch": 0.13856249046238364, + "grad_norm": 1.1750952661731386, + "learning_rate": 9.695071833868233e-06, + "loss": 0.3209, + "step": 454 + }, + { + "epoch": 0.13886769418586906, + "grad_norm": 1.1097718724387464, + "learning_rate": 9.693369311503801e-06, + "loss": 0.2077, + "step": 455 + }, + { + "epoch": 0.1391728979093545, + "grad_norm": 1.3477077308441543, + "learning_rate": 9.691662199801735e-06, + "loss": 0.3387, + "step": 456 + }, + { + "epoch": 0.1394781016328399, + "grad_norm": 1.5926509134118427, + "learning_rate": 9.689950500431306e-06, + "loss": 0.4737, + "step": 457 + }, + { + "epoch": 0.13978330535632535, + "grad_norm": 1.238821976588628, + "learning_rate": 9.688234215066274e-06, + "loss": 0.2941, + "step": 458 + }, + { + "epoch": 0.14008850907981077, + "grad_norm": 1.283103486116252, + "learning_rate": 9.68651334538488e-06, + "loss": 0.2298, + "step": 459 + }, + { + "epoch": 0.1403937128032962, + "grad_norm": 1.3964480095243228, + "learning_rate": 9.684787893069852e-06, + "loss": 0.2755, + "step": 460 + }, + { + "epoch": 0.14069891652678163, + "grad_norm": 4.5768637107742185, + "learning_rate": 9.683057859808394e-06, + "loss": 0.3969, + "step": 461 + }, + { + "epoch": 0.14100412025026704, + "grad_norm": 1.238974584705195, + "learning_rate": 9.681323247292193e-06, + "loss": 0.302, + "step": 462 + }, + { + "epoch": 0.14130932397375248, + "grad_norm": 1.484367852444757, + "learning_rate": 9.679584057217412e-06, + "loss": 0.3771, + "step": 463 + }, + { + "epoch": 0.1416145276972379, + "grad_norm": 1.8892302609803566, + "learning_rate": 9.677840291284693e-06, + "loss": 0.4296, + "step": 464 + }, + { + "epoch": 0.14191973142072334, + "grad_norm": 1.9264735588835349, + "learning_rate": 9.676091951199147e-06, + "loss": 0.1789, + "step": 465 + }, + { + "epoch": 0.14222493514420875, + "grad_norm": 1.6779527799580054, + "learning_rate": 9.674339038670362e-06, + "loss": 0.3394, + "step": 466 + }, + { + "epoch": 0.1425301388676942, + "grad_norm": 1.4269803792009585, + "learning_rate": 9.672581555412396e-06, + "loss": 0.3436, + "step": 467 + }, + { + "epoch": 0.1428353425911796, + "grad_norm": 1.6997561953129157, + "learning_rate": 9.67081950314378e-06, + "loss": 0.4244, + "step": 468 + }, + { + "epoch": 0.14314054631466505, + "grad_norm": 1.5847658235861504, + "learning_rate": 9.669052883587503e-06, + "loss": 0.4061, + "step": 469 + }, + { + "epoch": 0.14344575003815047, + "grad_norm": 1.5000132719088555, + "learning_rate": 9.667281698471032e-06, + "loss": 0.2349, + "step": 470 + }, + { + "epoch": 0.14375095376163588, + "grad_norm": 1.4700398564287758, + "learning_rate": 9.665505949526288e-06, + "loss": 0.4265, + "step": 471 + }, + { + "epoch": 0.14405615748512132, + "grad_norm": 1.1929108671224367, + "learning_rate": 9.663725638489662e-06, + "loss": 0.3107, + "step": 472 + }, + { + "epoch": 0.14436136120860674, + "grad_norm": 1.1577689014622543, + "learning_rate": 9.661940767102001e-06, + "loss": 0.1722, + "step": 473 + }, + { + "epoch": 0.14466656493209218, + "grad_norm": 2.093772204836488, + "learning_rate": 9.660151337108617e-06, + "loss": 0.4278, + "step": 474 + }, + { + "epoch": 0.1449717686555776, + "grad_norm": 1.6654665758895533, + "learning_rate": 9.658357350259274e-06, + "loss": 0.3394, + "step": 475 + }, + { + "epoch": 0.14527697237906304, + "grad_norm": 1.2202451289258887, + "learning_rate": 9.656558808308193e-06, + "loss": 0.3433, + "step": 476 + }, + { + "epoch": 0.14558217610254845, + "grad_norm": 1.5418473816537024, + "learning_rate": 9.654755713014052e-06, + "loss": 0.4099, + "step": 477 + }, + { + "epoch": 0.14588737982603386, + "grad_norm": 1.867172943441559, + "learning_rate": 9.652948066139978e-06, + "loss": 0.4445, + "step": 478 + }, + { + "epoch": 0.1461925835495193, + "grad_norm": 1.2274320399473075, + "learning_rate": 9.651135869453552e-06, + "loss": 0.3687, + "step": 479 + }, + { + "epoch": 0.14649778727300472, + "grad_norm": 1.3632570280967384, + "learning_rate": 9.649319124726799e-06, + "loss": 0.3554, + "step": 480 + }, + { + "epoch": 0.14680299099649016, + "grad_norm": 1.9368328668689925, + "learning_rate": 9.647497833736197e-06, + "loss": 0.433, + "step": 481 + }, + { + "epoch": 0.14710819471997558, + "grad_norm": 1.7750440538339176, + "learning_rate": 9.645671998262668e-06, + "loss": 0.4275, + "step": 482 + }, + { + "epoch": 0.14741339844346102, + "grad_norm": 1.7620981827052555, + "learning_rate": 9.643841620091572e-06, + "loss": 0.4327, + "step": 483 + }, + { + "epoch": 0.14771860216694643, + "grad_norm": 1.705808013569081, + "learning_rate": 9.642006701012719e-06, + "loss": 0.2914, + "step": 484 + }, + { + "epoch": 0.14802380589043188, + "grad_norm": 1.4490666476731855, + "learning_rate": 9.640167242820356e-06, + "loss": 0.4171, + "step": 485 + }, + { + "epoch": 0.1483290096139173, + "grad_norm": 1.4017898188106575, + "learning_rate": 9.638323247313167e-06, + "loss": 0.3609, + "step": 486 + }, + { + "epoch": 0.1486342133374027, + "grad_norm": 1.273585773097182, + "learning_rate": 9.636474716294275e-06, + "loss": 0.2781, + "step": 487 + }, + { + "epoch": 0.14893941706088815, + "grad_norm": 1.4426438722313946, + "learning_rate": 9.634621651571235e-06, + "loss": 0.3446, + "step": 488 + }, + { + "epoch": 0.14924462078437356, + "grad_norm": 1.3396568501630033, + "learning_rate": 9.632764054956042e-06, + "loss": 0.3209, + "step": 489 + }, + { + "epoch": 0.149549824507859, + "grad_norm": 1.869987463373602, + "learning_rate": 9.630901928265113e-06, + "loss": 0.6676, + "step": 490 + }, + { + "epoch": 0.14985502823134442, + "grad_norm": 1.3995020559881168, + "learning_rate": 9.6290352733193e-06, + "loss": 0.443, + "step": 491 + }, + { + "epoch": 0.15016023195482986, + "grad_norm": 1.5646801044524754, + "learning_rate": 9.627164091943886e-06, + "loss": 0.4808, + "step": 492 + }, + { + "epoch": 0.15046543567831527, + "grad_norm": 1.4064811516565765, + "learning_rate": 9.625288385968572e-06, + "loss": 0.3281, + "step": 493 + }, + { + "epoch": 0.15077063940180072, + "grad_norm": 1.3603824959747954, + "learning_rate": 9.623408157227493e-06, + "loss": 0.4203, + "step": 494 + }, + { + "epoch": 0.15107584312528613, + "grad_norm": 1.4648922518012057, + "learning_rate": 9.621523407559193e-06, + "loss": 0.4691, + "step": 495 + }, + { + "epoch": 0.15138104684877154, + "grad_norm": 1.2898880598011702, + "learning_rate": 9.619634138806653e-06, + "loss": 0.22, + "step": 496 + }, + { + "epoch": 0.151686250572257, + "grad_norm": 1.629312149082348, + "learning_rate": 9.61774035281726e-06, + "loss": 0.3852, + "step": 497 + }, + { + "epoch": 0.1519914542957424, + "grad_norm": 1.4209940770003642, + "learning_rate": 9.615842051442825e-06, + "loss": 0.3434, + "step": 498 + }, + { + "epoch": 0.15229665801922784, + "grad_norm": 1.5981471419786573, + "learning_rate": 9.613939236539571e-06, + "loss": 0.4405, + "step": 499 + }, + { + "epoch": 0.15260186174271326, + "grad_norm": 1.6345273007096384, + "learning_rate": 9.612031909968138e-06, + "loss": 0.4621, + "step": 500 + }, + { + "epoch": 0.1529070654661987, + "grad_norm": 1.7946527422515466, + "learning_rate": 9.610120073593574e-06, + "loss": 0.4215, + "step": 501 + }, + { + "epoch": 0.1532122691896841, + "grad_norm": 1.9696007964079152, + "learning_rate": 9.608203729285337e-06, + "loss": 0.2416, + "step": 502 + }, + { + "epoch": 0.15351747291316953, + "grad_norm": 1.186483550480542, + "learning_rate": 9.606282878917296e-06, + "loss": 0.1656, + "step": 503 + }, + { + "epoch": 0.15382267663665497, + "grad_norm": 1.3709781521921298, + "learning_rate": 9.604357524367723e-06, + "loss": 0.3374, + "step": 504 + }, + { + "epoch": 0.15412788036014038, + "grad_norm": 1.4744363645402312, + "learning_rate": 9.602427667519297e-06, + "loss": 0.3472, + "step": 505 + }, + { + "epoch": 0.15443308408362583, + "grad_norm": 1.7032963377834875, + "learning_rate": 9.600493310259098e-06, + "loss": 0.4352, + "step": 506 + }, + { + "epoch": 0.15473828780711124, + "grad_norm": 1.487020684601837, + "learning_rate": 9.598554454478608e-06, + "loss": 0.2661, + "step": 507 + }, + { + "epoch": 0.15504349153059668, + "grad_norm": 1.2499312381905126, + "learning_rate": 9.596611102073703e-06, + "loss": 0.2785, + "step": 508 + }, + { + "epoch": 0.1553486952540821, + "grad_norm": 1.529878897767237, + "learning_rate": 9.594663254944664e-06, + "loss": 0.3768, + "step": 509 + }, + { + "epoch": 0.15565389897756754, + "grad_norm": 1.5214931502474798, + "learning_rate": 9.592710914996162e-06, + "loss": 0.5126, + "step": 510 + }, + { + "epoch": 0.15595910270105295, + "grad_norm": 1.3836939529329817, + "learning_rate": 9.590754084137259e-06, + "loss": 0.3011, + "step": 511 + }, + { + "epoch": 0.15626430642453837, + "grad_norm": 1.4833094737816435, + "learning_rate": 9.588792764281413e-06, + "loss": 0.4761, + "step": 512 + }, + { + "epoch": 0.1565695101480238, + "grad_norm": 1.2346664760598378, + "learning_rate": 9.586826957346473e-06, + "loss": 0.2454, + "step": 513 + }, + { + "epoch": 0.15687471387150922, + "grad_norm": 1.6476636760719772, + "learning_rate": 9.584856665254667e-06, + "loss": 0.2867, + "step": 514 + }, + { + "epoch": 0.15717991759499467, + "grad_norm": 1.5637210652382973, + "learning_rate": 9.58288188993262e-06, + "loss": 0.2899, + "step": 515 + }, + { + "epoch": 0.15748512131848008, + "grad_norm": 1.3205463270086828, + "learning_rate": 9.580902633311331e-06, + "loss": 0.3756, + "step": 516 + }, + { + "epoch": 0.15779032504196552, + "grad_norm": 1.3975127161911243, + "learning_rate": 9.578918897326186e-06, + "loss": 0.5111, + "step": 517 + }, + { + "epoch": 0.15809552876545094, + "grad_norm": 1.4459383872321914, + "learning_rate": 9.57693068391695e-06, + "loss": 0.4283, + "step": 518 + }, + { + "epoch": 0.15840073248893635, + "grad_norm": 1.7061785001760192, + "learning_rate": 9.574937995027767e-06, + "loss": 0.3702, + "step": 519 + }, + { + "epoch": 0.1587059362124218, + "grad_norm": 1.822247438656905, + "learning_rate": 9.572940832607157e-06, + "loss": 0.3406, + "step": 520 + }, + { + "epoch": 0.1590111399359072, + "grad_norm": 2.44932543751886, + "learning_rate": 9.570939198608013e-06, + "loss": 0.3118, + "step": 521 + }, + { + "epoch": 0.15931634365939265, + "grad_norm": 1.6119202421775476, + "learning_rate": 9.5689330949876e-06, + "loss": 0.3903, + "step": 522 + }, + { + "epoch": 0.15962154738287807, + "grad_norm": 1.4848330945324375, + "learning_rate": 9.56692252370756e-06, + "loss": 0.2336, + "step": 523 + }, + { + "epoch": 0.1599267511063635, + "grad_norm": 1.410632362194396, + "learning_rate": 9.564907486733893e-06, + "loss": 0.2749, + "step": 524 + }, + { + "epoch": 0.16023195482984892, + "grad_norm": 1.596052638125191, + "learning_rate": 9.562887986036975e-06, + "loss": 0.4752, + "step": 525 + }, + { + "epoch": 0.16053715855333436, + "grad_norm": 1.7668740909494465, + "learning_rate": 9.560864023591541e-06, + "loss": 0.4457, + "step": 526 + }, + { + "epoch": 0.16084236227681978, + "grad_norm": 1.4009268145182425, + "learning_rate": 9.558835601376692e-06, + "loss": 0.2615, + "step": 527 + }, + { + "epoch": 0.1611475660003052, + "grad_norm": 1.7299333351168085, + "learning_rate": 9.55680272137589e-06, + "loss": 0.5216, + "step": 528 + }, + { + "epoch": 0.16145276972379063, + "grad_norm": 1.398003196407042, + "learning_rate": 9.554765385576951e-06, + "loss": 0.2917, + "step": 529 + }, + { + "epoch": 0.16175797344727605, + "grad_norm": 1.4037115710357768, + "learning_rate": 9.552723595972055e-06, + "loss": 0.2794, + "step": 530 + }, + { + "epoch": 0.1620631771707615, + "grad_norm": 1.4104804936912443, + "learning_rate": 9.550677354557734e-06, + "loss": 0.3294, + "step": 531 + }, + { + "epoch": 0.1623683808942469, + "grad_norm": 1.3043707731550427, + "learning_rate": 9.548626663334872e-06, + "loss": 0.3542, + "step": 532 + }, + { + "epoch": 0.16267358461773235, + "grad_norm": 1.4523817232860987, + "learning_rate": 9.546571524308707e-06, + "loss": 0.4394, + "step": 533 + }, + { + "epoch": 0.16297878834121776, + "grad_norm": 1.2378417959119585, + "learning_rate": 9.544511939488823e-06, + "loss": 0.2859, + "step": 534 + }, + { + "epoch": 0.16328399206470318, + "grad_norm": 1.1623856416463947, + "learning_rate": 9.542447910889152e-06, + "loss": 0.2682, + "step": 535 + }, + { + "epoch": 0.16358919578818862, + "grad_norm": 1.503292443987416, + "learning_rate": 9.540379440527974e-06, + "loss": 0.4513, + "step": 536 + }, + { + "epoch": 0.16389439951167403, + "grad_norm": 1.278183220840744, + "learning_rate": 9.538306530427908e-06, + "loss": 0.2486, + "step": 537 + }, + { + "epoch": 0.16419960323515947, + "grad_norm": 1.477438530587252, + "learning_rate": 9.536229182615919e-06, + "loss": 0.4748, + "step": 538 + }, + { + "epoch": 0.1645048069586449, + "grad_norm": 1.161000468008389, + "learning_rate": 9.534147399123308e-06, + "loss": 0.3166, + "step": 539 + }, + { + "epoch": 0.16481001068213033, + "grad_norm": 1.3151690275104762, + "learning_rate": 9.532061181985713e-06, + "loss": 0.3547, + "step": 540 + }, + { + "epoch": 0.16511521440561575, + "grad_norm": 1.750297725419665, + "learning_rate": 9.529970533243112e-06, + "loss": 0.4156, + "step": 541 + }, + { + "epoch": 0.1654204181291012, + "grad_norm": 1.3465531852012238, + "learning_rate": 9.52787545493981e-06, + "loss": 0.3366, + "step": 542 + }, + { + "epoch": 0.1657256218525866, + "grad_norm": 1.5440141830188223, + "learning_rate": 9.525775949124447e-06, + "loss": 0.3376, + "step": 543 + }, + { + "epoch": 0.16603082557607202, + "grad_norm": 1.5415130315298482, + "learning_rate": 9.523672017849995e-06, + "loss": 0.541, + "step": 544 + }, + { + "epoch": 0.16633602929955746, + "grad_norm": 1.7579856956776627, + "learning_rate": 9.521563663173746e-06, + "loss": 0.4806, + "step": 545 + }, + { + "epoch": 0.16664123302304287, + "grad_norm": 1.7818866729621587, + "learning_rate": 9.519450887157324e-06, + "loss": 0.5464, + "step": 546 + }, + { + "epoch": 0.16694643674652831, + "grad_norm": 1.6064088023016758, + "learning_rate": 9.517333691866672e-06, + "loss": 0.43, + "step": 547 + }, + { + "epoch": 0.16725164047001373, + "grad_norm": 1.4778043177300115, + "learning_rate": 9.515212079372059e-06, + "loss": 0.4399, + "step": 548 + }, + { + "epoch": 0.16755684419349917, + "grad_norm": 2.027608100840915, + "learning_rate": 9.513086051748069e-06, + "loss": 0.4069, + "step": 549 + }, + { + "epoch": 0.16786204791698459, + "grad_norm": 1.5850802204517391, + "learning_rate": 9.510955611073605e-06, + "loss": 0.3827, + "step": 550 + }, + { + "epoch": 0.16816725164047, + "grad_norm": 1.293889481401633, + "learning_rate": 9.508820759431883e-06, + "loss": 0.2572, + "step": 551 + }, + { + "epoch": 0.16847245536395544, + "grad_norm": 1.842230801320139, + "learning_rate": 9.506681498910437e-06, + "loss": 0.5275, + "step": 552 + }, + { + "epoch": 0.16877765908744086, + "grad_norm": 1.1873629641518748, + "learning_rate": 9.50453783160111e-06, + "loss": 0.3282, + "step": 553 + }, + { + "epoch": 0.1690828628109263, + "grad_norm": 1.6093453710106354, + "learning_rate": 9.50238975960005e-06, + "loss": 0.5784, + "step": 554 + }, + { + "epoch": 0.1693880665344117, + "grad_norm": 1.8900657646403543, + "learning_rate": 9.500237285007719e-06, + "loss": 0.5224, + "step": 555 + }, + { + "epoch": 0.16969327025789716, + "grad_norm": 1.4803576264157936, + "learning_rate": 9.498080409928878e-06, + "loss": 0.3726, + "step": 556 + }, + { + "epoch": 0.16999847398138257, + "grad_norm": 1.7675886459458987, + "learning_rate": 9.495919136472595e-06, + "loss": 0.4656, + "step": 557 + }, + { + "epoch": 0.170303677704868, + "grad_norm": 1.7910944298366285, + "learning_rate": 9.493753466752236e-06, + "loss": 0.8076, + "step": 558 + }, + { + "epoch": 0.17060888142835343, + "grad_norm": 1.4678825901279975, + "learning_rate": 9.49158340288547e-06, + "loss": 0.3575, + "step": 559 + }, + { + "epoch": 0.17091408515183884, + "grad_norm": 1.411596350121475, + "learning_rate": 9.489408946994256e-06, + "loss": 0.3316, + "step": 560 + }, + { + "epoch": 0.17121928887532428, + "grad_norm": 1.2961064094904746, + "learning_rate": 9.487230101204855e-06, + "loss": 0.3634, + "step": 561 + }, + { + "epoch": 0.1715244925988097, + "grad_norm": 1.2525457208629842, + "learning_rate": 9.485046867647816e-06, + "loss": 0.368, + "step": 562 + }, + { + "epoch": 0.17182969632229514, + "grad_norm": 1.5857071732762902, + "learning_rate": 9.48285924845798e-06, + "loss": 0.4546, + "step": 563 + }, + { + "epoch": 0.17213490004578055, + "grad_norm": 1.4510716523054648, + "learning_rate": 9.480667245774474e-06, + "loss": 0.2739, + "step": 564 + }, + { + "epoch": 0.172440103769266, + "grad_norm": 1.7890268513821783, + "learning_rate": 9.478470861740716e-06, + "loss": 0.4085, + "step": 565 + }, + { + "epoch": 0.1727453074927514, + "grad_norm": 1.2143129127325427, + "learning_rate": 9.476270098504405e-06, + "loss": 0.2669, + "step": 566 + }, + { + "epoch": 0.17305051121623685, + "grad_norm": 1.4725539275626134, + "learning_rate": 9.474064958217524e-06, + "loss": 0.3474, + "step": 567 + }, + { + "epoch": 0.17335571493972227, + "grad_norm": 1.6648347880329453, + "learning_rate": 9.471855443036333e-06, + "loss": 0.3059, + "step": 568 + }, + { + "epoch": 0.17366091866320768, + "grad_norm": 1.4826208089202084, + "learning_rate": 9.469641555121372e-06, + "loss": 0.3309, + "step": 569 + }, + { + "epoch": 0.17396612238669312, + "grad_norm": 1.2087588985869038, + "learning_rate": 9.467423296637458e-06, + "loss": 0.2765, + "step": 570 + }, + { + "epoch": 0.17427132611017854, + "grad_norm": 1.5271503296745377, + "learning_rate": 9.465200669753678e-06, + "loss": 0.4388, + "step": 571 + }, + { + "epoch": 0.17457652983366398, + "grad_norm": 1.720167996940521, + "learning_rate": 9.462973676643395e-06, + "loss": 0.4693, + "step": 572 + }, + { + "epoch": 0.1748817335571494, + "grad_norm": 1.4666930059033054, + "learning_rate": 9.46074231948424e-06, + "loss": 0.354, + "step": 573 + }, + { + "epoch": 0.17518693728063484, + "grad_norm": 1.8597430337784902, + "learning_rate": 9.458506600458106e-06, + "loss": 0.3892, + "step": 574 + }, + { + "epoch": 0.17549214100412025, + "grad_norm": 0.9696512337091734, + "learning_rate": 9.456266521751162e-06, + "loss": 0.2294, + "step": 575 + }, + { + "epoch": 0.17579734472760566, + "grad_norm": 1.781498807963985, + "learning_rate": 9.454022085553829e-06, + "loss": 0.4873, + "step": 576 + }, + { + "epoch": 0.1761025484510911, + "grad_norm": 1.3337716733106453, + "learning_rate": 9.451773294060797e-06, + "loss": 0.3031, + "step": 577 + }, + { + "epoch": 0.17640775217457652, + "grad_norm": 1.7055986970891146, + "learning_rate": 9.449520149471008e-06, + "loss": 0.6405, + "step": 578 + }, + { + "epoch": 0.17671295589806196, + "grad_norm": 1.5188407211523098, + "learning_rate": 9.447262653987668e-06, + "loss": 0.3739, + "step": 579 + }, + { + "epoch": 0.17701815962154738, + "grad_norm": 0.9308569789252137, + "learning_rate": 9.445000809818231e-06, + "loss": 0.2505, + "step": 580 + }, + { + "epoch": 0.17732336334503282, + "grad_norm": 1.5000700139554115, + "learning_rate": 9.442734619174408e-06, + "loss": 0.4158, + "step": 581 + }, + { + "epoch": 0.17762856706851823, + "grad_norm": 1.3548714588751447, + "learning_rate": 9.440464084272157e-06, + "loss": 0.3911, + "step": 582 + }, + { + "epoch": 0.17793377079200368, + "grad_norm": 1.3241166157002833, + "learning_rate": 9.438189207331684e-06, + "loss": 0.371, + "step": 583 + }, + { + "epoch": 0.1782389745154891, + "grad_norm": 1.251287417238196, + "learning_rate": 9.435909990577442e-06, + "loss": 0.4543, + "step": 584 + }, + { + "epoch": 0.1785441782389745, + "grad_norm": 1.6243898273186124, + "learning_rate": 9.433626436238128e-06, + "loss": 0.3955, + "step": 585 + }, + { + "epoch": 0.17884938196245995, + "grad_norm": 1.5933520250274826, + "learning_rate": 9.43133854654668e-06, + "loss": 0.4232, + "step": 586 + }, + { + "epoch": 0.17915458568594536, + "grad_norm": 1.271604048902552, + "learning_rate": 9.429046323740275e-06, + "loss": 0.186, + "step": 587 + }, + { + "epoch": 0.1794597894094308, + "grad_norm": 1.7480757897399084, + "learning_rate": 9.426749770060325e-06, + "loss": 0.3198, + "step": 588 + }, + { + "epoch": 0.17976499313291622, + "grad_norm": 1.6582908292655634, + "learning_rate": 9.424448887752485e-06, + "loss": 0.4497, + "step": 589 + }, + { + "epoch": 0.18007019685640166, + "grad_norm": 1.514808198729056, + "learning_rate": 9.42214367906663e-06, + "loss": 0.3135, + "step": 590 + }, + { + "epoch": 0.18037540057988707, + "grad_norm": 1.33735933544563, + "learning_rate": 9.419834146256875e-06, + "loss": 0.1512, + "step": 591 + }, + { + "epoch": 0.1806806043033725, + "grad_norm": 1.7983955915325747, + "learning_rate": 9.417520291581562e-06, + "loss": 0.397, + "step": 592 + }, + { + "epoch": 0.18098580802685793, + "grad_norm": 1.8357965942853254, + "learning_rate": 9.415202117303253e-06, + "loss": 0.3479, + "step": 593 + }, + { + "epoch": 0.18129101175034334, + "grad_norm": 2.1098502294891084, + "learning_rate": 9.412879625688742e-06, + "loss": 0.6081, + "step": 594 + }, + { + "epoch": 0.1815962154738288, + "grad_norm": 1.7002717361934219, + "learning_rate": 9.410552819009041e-06, + "loss": 0.2335, + "step": 595 + }, + { + "epoch": 0.1819014191973142, + "grad_norm": 1.6858392243118179, + "learning_rate": 9.408221699539381e-06, + "loss": 0.3502, + "step": 596 + }, + { + "epoch": 0.18220662292079964, + "grad_norm": 1.9779389304442994, + "learning_rate": 9.40588626955921e-06, + "loss": 0.5023, + "step": 597 + }, + { + "epoch": 0.18251182664428506, + "grad_norm": 1.984831269441273, + "learning_rate": 9.403546531352192e-06, + "loss": 0.1808, + "step": 598 + }, + { + "epoch": 0.1828170303677705, + "grad_norm": 1.7825668553552305, + "learning_rate": 9.401202487206205e-06, + "loss": 0.2451, + "step": 599 + }, + { + "epoch": 0.18312223409125591, + "grad_norm": 1.7395479880147604, + "learning_rate": 9.398854139413332e-06, + "loss": 0.4586, + "step": 600 + }, + { + "epoch": 0.18342743781474133, + "grad_norm": 1.7910877075998561, + "learning_rate": 9.396501490269871e-06, + "loss": 0.4334, + "step": 601 + }, + { + "epoch": 0.18373264153822677, + "grad_norm": 1.671153260357796, + "learning_rate": 9.394144542076321e-06, + "loss": 0.3457, + "step": 602 + }, + { + "epoch": 0.18403784526171219, + "grad_norm": 2.2927735747628057, + "learning_rate": 9.391783297137392e-06, + "loss": 0.5006, + "step": 603 + }, + { + "epoch": 0.18434304898519763, + "grad_norm": 1.4375155904173251, + "learning_rate": 9.389417757761983e-06, + "loss": 0.3412, + "step": 604 + }, + { + "epoch": 0.18464825270868304, + "grad_norm": 1.0876679937459988, + "learning_rate": 9.387047926263205e-06, + "loss": 0.2323, + "step": 605 + }, + { + "epoch": 0.18495345643216848, + "grad_norm": 1.5691166969698962, + "learning_rate": 9.384673804958357e-06, + "loss": 0.3929, + "step": 606 + }, + { + "epoch": 0.1852586601556539, + "grad_norm": 1.2508041656129554, + "learning_rate": 9.38229539616894e-06, + "loss": 0.2123, + "step": 607 + }, + { + "epoch": 0.1855638638791393, + "grad_norm": 1.6014009719897135, + "learning_rate": 9.379912702220641e-06, + "loss": 0.234, + "step": 608 + }, + { + "epoch": 0.18586906760262475, + "grad_norm": 1.4804611004553776, + "learning_rate": 9.377525725443341e-06, + "loss": 0.3951, + "step": 609 + }, + { + "epoch": 0.18617427132611017, + "grad_norm": 1.4589508747304376, + "learning_rate": 9.375134468171108e-06, + "loss": 0.2887, + "step": 610 + }, + { + "epoch": 0.1864794750495956, + "grad_norm": 1.9199770728371568, + "learning_rate": 9.372738932742193e-06, + "loss": 0.5627, + "step": 611 + }, + { + "epoch": 0.18678467877308103, + "grad_norm": 1.5903295576095668, + "learning_rate": 9.370339121499039e-06, + "loss": 0.4379, + "step": 612 + }, + { + "epoch": 0.18708988249656647, + "grad_norm": 1.6986262549100166, + "learning_rate": 9.367935036788257e-06, + "loss": 0.4873, + "step": 613 + }, + { + "epoch": 0.18739508622005188, + "grad_norm": 1.2194659257752518, + "learning_rate": 9.365526680960645e-06, + "loss": 0.3571, + "step": 614 + }, + { + "epoch": 0.18770028994353732, + "grad_norm": 1.3049138038493902, + "learning_rate": 9.363114056371178e-06, + "loss": 0.3114, + "step": 615 + }, + { + "epoch": 0.18800549366702274, + "grad_norm": 1.468685879220778, + "learning_rate": 9.360697165379004e-06, + "loss": 0.5043, + "step": 616 + }, + { + "epoch": 0.18831069739050815, + "grad_norm": 2.131454426646245, + "learning_rate": 9.35827601034744e-06, + "loss": 0.5165, + "step": 617 + }, + { + "epoch": 0.1886159011139936, + "grad_norm": 1.5563065422847613, + "learning_rate": 9.355850593643974e-06, + "loss": 0.4707, + "step": 618 + }, + { + "epoch": 0.188921104837479, + "grad_norm": 1.3992788671446874, + "learning_rate": 9.353420917640264e-06, + "loss": 0.3905, + "step": 619 + }, + { + "epoch": 0.18922630856096445, + "grad_norm": 1.2635210704880713, + "learning_rate": 9.350986984712128e-06, + "loss": 0.2399, + "step": 620 + }, + { + "epoch": 0.18953151228444987, + "grad_norm": 1.4071894580574145, + "learning_rate": 9.348548797239551e-06, + "loss": 0.3689, + "step": 621 + }, + { + "epoch": 0.1898367160079353, + "grad_norm": 1.8460324916935194, + "learning_rate": 9.346106357606675e-06, + "loss": 0.3337, + "step": 622 + }, + { + "epoch": 0.19014191973142072, + "grad_norm": 1.2199756132877848, + "learning_rate": 9.343659668201803e-06, + "loss": 0.3707, + "step": 623 + }, + { + "epoch": 0.19044712345490614, + "grad_norm": 1.3352934416971625, + "learning_rate": 9.34120873141739e-06, + "loss": 0.3332, + "step": 624 + }, + { + "epoch": 0.19075232717839158, + "grad_norm": 1.5847999740161538, + "learning_rate": 9.33875354965005e-06, + "loss": 0.4658, + "step": 625 + }, + { + "epoch": 0.191057530901877, + "grad_norm": 1.59267718540602, + "learning_rate": 9.336294125300538e-06, + "loss": 0.5138, + "step": 626 + }, + { + "epoch": 0.19136273462536244, + "grad_norm": 1.0558131110089597, + "learning_rate": 9.333830460773767e-06, + "loss": 0.2512, + "step": 627 + }, + { + "epoch": 0.19166793834884785, + "grad_norm": 1.6912848096424418, + "learning_rate": 9.331362558478793e-06, + "loss": 0.3129, + "step": 628 + }, + { + "epoch": 0.1919731420723333, + "grad_norm": 1.7565199783626735, + "learning_rate": 9.328890420828817e-06, + "loss": 0.2625, + "step": 629 + }, + { + "epoch": 0.1922783457958187, + "grad_norm": 1.6287483120468187, + "learning_rate": 9.326414050241176e-06, + "loss": 0.4631, + "step": 630 + }, + { + "epoch": 0.19258354951930415, + "grad_norm": 1.5343127080699748, + "learning_rate": 9.323933449137353e-06, + "loss": 0.35, + "step": 631 + }, + { + "epoch": 0.19288875324278956, + "grad_norm": 1.2901421851525343, + "learning_rate": 9.321448619942963e-06, + "loss": 0.3191, + "step": 632 + }, + { + "epoch": 0.19319395696627498, + "grad_norm": 1.3651871352420322, + "learning_rate": 9.318959565087761e-06, + "loss": 0.3063, + "step": 633 + }, + { + "epoch": 0.19349916068976042, + "grad_norm": 1.4900191765389657, + "learning_rate": 9.316466287005625e-06, + "loss": 0.3621, + "step": 634 + }, + { + "epoch": 0.19380436441324583, + "grad_norm": 1.836926327897149, + "learning_rate": 9.313968788134572e-06, + "loss": 0.6273, + "step": 635 + }, + { + "epoch": 0.19410956813673128, + "grad_norm": 1.6283757285797815, + "learning_rate": 9.311467070916743e-06, + "loss": 0.3191, + "step": 636 + }, + { + "epoch": 0.1944147718602167, + "grad_norm": 1.7047955780313857, + "learning_rate": 9.308961137798398e-06, + "loss": 0.5581, + "step": 637 + }, + { + "epoch": 0.19471997558370213, + "grad_norm": 1.4826549000090183, + "learning_rate": 9.306450991229927e-06, + "loss": 0.3157, + "step": 638 + }, + { + "epoch": 0.19502517930718755, + "grad_norm": 1.435361017145943, + "learning_rate": 9.30393663366584e-06, + "loss": 0.3084, + "step": 639 + }, + { + "epoch": 0.19533038303067296, + "grad_norm": 1.402358583674702, + "learning_rate": 9.301418067564758e-06, + "loss": 0.3351, + "step": 640 + }, + { + "epoch": 0.1956355867541584, + "grad_norm": 1.2933654176691274, + "learning_rate": 9.298895295389423e-06, + "loss": 0.2585, + "step": 641 + }, + { + "epoch": 0.19594079047764382, + "grad_norm": 1.5346090103364156, + "learning_rate": 9.29636831960669e-06, + "loss": 0.33, + "step": 642 + }, + { + "epoch": 0.19624599420112926, + "grad_norm": 1.3251725534327445, + "learning_rate": 9.293837142687522e-06, + "loss": 0.2104, + "step": 643 + }, + { + "epoch": 0.19655119792461467, + "grad_norm": 1.583285740923444, + "learning_rate": 9.291301767106986e-06, + "loss": 0.4326, + "step": 644 + }, + { + "epoch": 0.19685640164810012, + "grad_norm": 1.0905371753559963, + "learning_rate": 9.288762195344266e-06, + "loss": 0.274, + "step": 645 + }, + { + "epoch": 0.19716160537158553, + "grad_norm": 1.7263838041187525, + "learning_rate": 9.28621842988264e-06, + "loss": 0.5011, + "step": 646 + }, + { + "epoch": 0.19746680909507097, + "grad_norm": 1.4838510492072716, + "learning_rate": 9.283670473209488e-06, + "loss": 0.1956, + "step": 647 + }, + { + "epoch": 0.1977720128185564, + "grad_norm": 1.2036114489558822, + "learning_rate": 9.28111832781629e-06, + "loss": 0.2346, + "step": 648 + }, + { + "epoch": 0.1980772165420418, + "grad_norm": 1.3853687861302435, + "learning_rate": 9.278561996198622e-06, + "loss": 0.2313, + "step": 649 + }, + { + "epoch": 0.19838242026552724, + "grad_norm": 1.2708592907281826, + "learning_rate": 9.276001480856152e-06, + "loss": 0.3717, + "step": 650 + }, + { + "epoch": 0.19868762398901266, + "grad_norm": 1.6407943638167721, + "learning_rate": 9.273436784292641e-06, + "loss": 0.5533, + "step": 651 + }, + { + "epoch": 0.1989928277124981, + "grad_norm": 1.9614104046455092, + "learning_rate": 9.270867909015936e-06, + "loss": 0.4552, + "step": 652 + }, + { + "epoch": 0.1992980314359835, + "grad_norm": 1.6038724290911757, + "learning_rate": 9.268294857537973e-06, + "loss": 0.477, + "step": 653 + }, + { + "epoch": 0.19960323515946896, + "grad_norm": 1.9838192379267598, + "learning_rate": 9.26571763237477e-06, + "loss": 0.7118, + "step": 654 + }, + { + "epoch": 0.19990843888295437, + "grad_norm": 1.4066170955047037, + "learning_rate": 9.263136236046422e-06, + "loss": 0.4222, + "step": 655 + }, + { + "epoch": 0.2002136426064398, + "grad_norm": 1.800111592330467, + "learning_rate": 9.260550671077113e-06, + "loss": 0.4969, + "step": 656 + }, + { + "epoch": 0.20051884632992523, + "grad_norm": 1.450700768131887, + "learning_rate": 9.257960939995093e-06, + "loss": 0.3938, + "step": 657 + }, + { + "epoch": 0.20082405005341064, + "grad_norm": 1.6208519445035616, + "learning_rate": 9.255367045332693e-06, + "loss": 0.5284, + "step": 658 + }, + { + "epoch": 0.20112925377689608, + "grad_norm": 1.4936718972189802, + "learning_rate": 9.25276898962631e-06, + "loss": 0.3314, + "step": 659 + }, + { + "epoch": 0.2014344575003815, + "grad_norm": 1.8465064182825872, + "learning_rate": 9.250166775416412e-06, + "loss": 0.5858, + "step": 660 + }, + { + "epoch": 0.20173966122386694, + "grad_norm": 1.7205660848788409, + "learning_rate": 9.247560405247535e-06, + "loss": 0.4412, + "step": 661 + }, + { + "epoch": 0.20204486494735235, + "grad_norm": 1.3534767838887214, + "learning_rate": 9.244949881668276e-06, + "loss": 0.2469, + "step": 662 + }, + { + "epoch": 0.2023500686708378, + "grad_norm": 1.283164106498675, + "learning_rate": 9.242335207231297e-06, + "loss": 0.2035, + "step": 663 + }, + { + "epoch": 0.2026552723943232, + "grad_norm": 1.6915673582245812, + "learning_rate": 9.239716384493313e-06, + "loss": 0.2989, + "step": 664 + }, + { + "epoch": 0.20296047611780862, + "grad_norm": 2.2410065734371956, + "learning_rate": 9.2370934160151e-06, + "loss": 0.4781, + "step": 665 + }, + { + "epoch": 0.20326567984129407, + "grad_norm": 1.3632387520798106, + "learning_rate": 9.234466304361487e-06, + "loss": 0.2167, + "step": 666 + }, + { + "epoch": 0.20357088356477948, + "grad_norm": 1.249315694419475, + "learning_rate": 9.231835052101352e-06, + "loss": 0.323, + "step": 667 + }, + { + "epoch": 0.20387608728826492, + "grad_norm": 1.2758684884196045, + "learning_rate": 9.229199661807628e-06, + "loss": 0.2939, + "step": 668 + }, + { + "epoch": 0.20418129101175034, + "grad_norm": 1.6065093517583355, + "learning_rate": 9.226560136057286e-06, + "loss": 0.5757, + "step": 669 + }, + { + "epoch": 0.20448649473523578, + "grad_norm": 1.4713850530122647, + "learning_rate": 9.223916477431348e-06, + "loss": 0.1891, + "step": 670 + }, + { + "epoch": 0.2047916984587212, + "grad_norm": 1.2115833160052654, + "learning_rate": 9.221268688514872e-06, + "loss": 0.2403, + "step": 671 + }, + { + "epoch": 0.20509690218220664, + "grad_norm": 1.2023485328623802, + "learning_rate": 9.218616771896958e-06, + "loss": 0.2996, + "step": 672 + }, + { + "epoch": 0.20540210590569205, + "grad_norm": 1.489677195048235, + "learning_rate": 9.21596073017074e-06, + "loss": 0.4833, + "step": 673 + }, + { + "epoch": 0.20570730962917746, + "grad_norm": 1.3322943821875144, + "learning_rate": 9.213300565933387e-06, + "loss": 0.4981, + "step": 674 + }, + { + "epoch": 0.2060125133526629, + "grad_norm": 1.657242243821286, + "learning_rate": 9.210636281786101e-06, + "loss": 0.4101, + "step": 675 + }, + { + "epoch": 0.20631771707614832, + "grad_norm": 1.5710052935767878, + "learning_rate": 9.207967880334107e-06, + "loss": 0.5269, + "step": 676 + }, + { + "epoch": 0.20662292079963376, + "grad_norm": 1.7349735636167414, + "learning_rate": 9.205295364186664e-06, + "loss": 0.4501, + "step": 677 + }, + { + "epoch": 0.20692812452311918, + "grad_norm": 1.2131120415162016, + "learning_rate": 9.202618735957044e-06, + "loss": 0.2405, + "step": 678 + }, + { + "epoch": 0.20723332824660462, + "grad_norm": 1.873473817856673, + "learning_rate": 9.199937998262553e-06, + "loss": 0.534, + "step": 679 + }, + { + "epoch": 0.20753853197009003, + "grad_norm": 1.5561433809212002, + "learning_rate": 9.197253153724504e-06, + "loss": 0.3787, + "step": 680 + }, + { + "epoch": 0.20784373569357545, + "grad_norm": 1.656918804520659, + "learning_rate": 9.194564204968231e-06, + "loss": 0.5373, + "step": 681 + }, + { + "epoch": 0.2081489394170609, + "grad_norm": 1.2717305075559726, + "learning_rate": 9.19187115462308e-06, + "loss": 0.3069, + "step": 682 + }, + { + "epoch": 0.2084541431405463, + "grad_norm": 1.6153063314109386, + "learning_rate": 9.189174005322408e-06, + "loss": 0.5305, + "step": 683 + }, + { + "epoch": 0.20875934686403175, + "grad_norm": 1.336223638576054, + "learning_rate": 9.18647275970358e-06, + "loss": 0.3889, + "step": 684 + }, + { + "epoch": 0.20906455058751716, + "grad_norm": 1.8056451147641948, + "learning_rate": 9.18376742040797e-06, + "loss": 0.5624, + "step": 685 + }, + { + "epoch": 0.2093697543110026, + "grad_norm": 1.4375442829774812, + "learning_rate": 9.181057990080945e-06, + "loss": 0.4177, + "step": 686 + }, + { + "epoch": 0.20967495803448802, + "grad_norm": 1.4711699333864836, + "learning_rate": 9.178344471371886e-06, + "loss": 0.3406, + "step": 687 + }, + { + "epoch": 0.20998016175797346, + "grad_norm": 1.3838222287012096, + "learning_rate": 9.17562686693416e-06, + "loss": 0.3066, + "step": 688 + }, + { + "epoch": 0.21028536548145887, + "grad_norm": 1.4022212067938096, + "learning_rate": 9.172905179425136e-06, + "loss": 0.2756, + "step": 689 + }, + { + "epoch": 0.2105905692049443, + "grad_norm": 1.3536281346270633, + "learning_rate": 9.170179411506175e-06, + "loss": 0.4636, + "step": 690 + }, + { + "epoch": 0.21089577292842973, + "grad_norm": 1.8927154333736904, + "learning_rate": 9.167449565842622e-06, + "loss": 0.4806, + "step": 691 + }, + { + "epoch": 0.21120097665191515, + "grad_norm": 1.3316444887041978, + "learning_rate": 9.164715645103818e-06, + "loss": 0.3236, + "step": 692 + }, + { + "epoch": 0.2115061803754006, + "grad_norm": 1.5414957862521723, + "learning_rate": 9.161977651963082e-06, + "loss": 0.3174, + "step": 693 + }, + { + "epoch": 0.211811384098886, + "grad_norm": 1.3526532837600678, + "learning_rate": 9.15923558909772e-06, + "loss": 0.3286, + "step": 694 + }, + { + "epoch": 0.21211658782237144, + "grad_norm": 1.6741032947314405, + "learning_rate": 9.156489459189011e-06, + "loss": 0.5021, + "step": 695 + }, + { + "epoch": 0.21242179154585686, + "grad_norm": 1.3755135010833581, + "learning_rate": 9.153739264922221e-06, + "loss": 0.3659, + "step": 696 + }, + { + "epoch": 0.21272699526934227, + "grad_norm": 1.1828934010664012, + "learning_rate": 9.150985008986579e-06, + "loss": 0.273, + "step": 697 + }, + { + "epoch": 0.21303219899282771, + "grad_norm": 1.6368043280832338, + "learning_rate": 9.148226694075295e-06, + "loss": 0.3457, + "step": 698 + }, + { + "epoch": 0.21333740271631313, + "grad_norm": 1.7649363297171154, + "learning_rate": 9.14546432288554e-06, + "loss": 0.504, + "step": 699 + }, + { + "epoch": 0.21364260643979857, + "grad_norm": 1.5869344950607664, + "learning_rate": 9.14269789811846e-06, + "loss": 0.2746, + "step": 700 + }, + { + "epoch": 0.21394781016328399, + "grad_norm": 2.1981528156810266, + "learning_rate": 9.139927422479157e-06, + "loss": 0.3494, + "step": 701 + }, + { + "epoch": 0.21425301388676943, + "grad_norm": 1.5364449020919386, + "learning_rate": 9.137152898676698e-06, + "loss": 0.259, + "step": 702 + }, + { + "epoch": 0.21455821761025484, + "grad_norm": 1.5976641591815994, + "learning_rate": 9.134374329424107e-06, + "loss": 0.3714, + "step": 703 + }, + { + "epoch": 0.21486342133374028, + "grad_norm": 1.2686502539601563, + "learning_rate": 9.131591717438366e-06, + "loss": 0.3223, + "step": 704 + }, + { + "epoch": 0.2151686250572257, + "grad_norm": 1.246057539745606, + "learning_rate": 9.128805065440405e-06, + "loss": 0.2376, + "step": 705 + }, + { + "epoch": 0.2154738287807111, + "grad_norm": 1.7361721316316239, + "learning_rate": 9.126014376155113e-06, + "loss": 0.4981, + "step": 706 + }, + { + "epoch": 0.21577903250419656, + "grad_norm": 1.5341841942267511, + "learning_rate": 9.12321965231132e-06, + "loss": 0.3948, + "step": 707 + }, + { + "epoch": 0.21608423622768197, + "grad_norm": 1.564805123380453, + "learning_rate": 9.1204208966418e-06, + "loss": 0.3948, + "step": 708 + }, + { + "epoch": 0.2163894399511674, + "grad_norm": 1.3738642249897421, + "learning_rate": 9.117618111883276e-06, + "loss": 0.3615, + "step": 709 + }, + { + "epoch": 0.21669464367465283, + "grad_norm": 1.2994164991334436, + "learning_rate": 9.114811300776405e-06, + "loss": 0.2186, + "step": 710 + }, + { + "epoch": 0.21699984739813827, + "grad_norm": 1.7972182236315297, + "learning_rate": 9.112000466065785e-06, + "loss": 0.5655, + "step": 711 + }, + { + "epoch": 0.21730505112162368, + "grad_norm": 1.278135561724537, + "learning_rate": 9.109185610499943e-06, + "loss": 0.3853, + "step": 712 + }, + { + "epoch": 0.2176102548451091, + "grad_norm": 1.213829377522313, + "learning_rate": 9.106366736831347e-06, + "loss": 0.3143, + "step": 713 + }, + { + "epoch": 0.21791545856859454, + "grad_norm": 1.5786296679622351, + "learning_rate": 9.103543847816384e-06, + "loss": 0.3748, + "step": 714 + }, + { + "epoch": 0.21822066229207995, + "grad_norm": 1.1980380231780912, + "learning_rate": 9.100716946215374e-06, + "loss": 0.3257, + "step": 715 + }, + { + "epoch": 0.2185258660155654, + "grad_norm": 1.5748488156857616, + "learning_rate": 9.097886034792557e-06, + "loss": 0.3859, + "step": 716 + }, + { + "epoch": 0.2188310697390508, + "grad_norm": 1.4962734980956909, + "learning_rate": 9.095051116316095e-06, + "loss": 0.4612, + "step": 717 + }, + { + "epoch": 0.21913627346253625, + "grad_norm": 1.4269067789650183, + "learning_rate": 9.092212193558072e-06, + "loss": 0.3177, + "step": 718 + }, + { + "epoch": 0.21944147718602167, + "grad_norm": 1.2842950689313655, + "learning_rate": 9.089369269294483e-06, + "loss": 0.238, + "step": 719 + }, + { + "epoch": 0.2197466809095071, + "grad_norm": 1.3869072584515276, + "learning_rate": 9.086522346305233e-06, + "loss": 0.3534, + "step": 720 + }, + { + "epoch": 0.22005188463299252, + "grad_norm": 1.5861416389361607, + "learning_rate": 9.083671427374144e-06, + "loss": 0.404, + "step": 721 + }, + { + "epoch": 0.22035708835647794, + "grad_norm": 1.3575612826767118, + "learning_rate": 9.080816515288944e-06, + "loss": 0.2564, + "step": 722 + }, + { + "epoch": 0.22066229207996338, + "grad_norm": 1.6224055486381046, + "learning_rate": 9.077957612841262e-06, + "loss": 0.5335, + "step": 723 + }, + { + "epoch": 0.2209674958034488, + "grad_norm": 1.1424184312646575, + "learning_rate": 9.07509472282663e-06, + "loss": 0.2248, + "step": 724 + }, + { + "epoch": 0.22127269952693424, + "grad_norm": 0.9653403668195691, + "learning_rate": 9.07222784804448e-06, + "loss": 0.1492, + "step": 725 + }, + { + "epoch": 0.22157790325041965, + "grad_norm": 1.7736448595259362, + "learning_rate": 9.069356991298145e-06, + "loss": 0.3365, + "step": 726 + }, + { + "epoch": 0.2218831069739051, + "grad_norm": 1.5382460544138177, + "learning_rate": 9.066482155394843e-06, + "loss": 0.3641, + "step": 727 + }, + { + "epoch": 0.2221883106973905, + "grad_norm": 1.5490656282179018, + "learning_rate": 9.063603343145689e-06, + "loss": 0.451, + "step": 728 + }, + { + "epoch": 0.22249351442087595, + "grad_norm": 1.6477503496903847, + "learning_rate": 9.060720557365683e-06, + "loss": 0.3152, + "step": 729 + }, + { + "epoch": 0.22279871814436136, + "grad_norm": 1.3334962278440752, + "learning_rate": 9.057833800873712e-06, + "loss": 0.3514, + "step": 730 + }, + { + "epoch": 0.22310392186784678, + "grad_norm": 1.3285303282716252, + "learning_rate": 9.054943076492548e-06, + "loss": 0.2835, + "step": 731 + }, + { + "epoch": 0.22340912559133222, + "grad_norm": 1.275406047266298, + "learning_rate": 9.05204838704884e-06, + "loss": 0.2324, + "step": 732 + }, + { + "epoch": 0.22371432931481763, + "grad_norm": 1.6512597679062968, + "learning_rate": 9.04914973537311e-06, + "loss": 0.6219, + "step": 733 + }, + { + "epoch": 0.22401953303830308, + "grad_norm": 1.2059290028925398, + "learning_rate": 9.046247124299766e-06, + "loss": 0.3296, + "step": 734 + }, + { + "epoch": 0.2243247367617885, + "grad_norm": 1.5740026131575533, + "learning_rate": 9.043340556667076e-06, + "loss": 0.4929, + "step": 735 + }, + { + "epoch": 0.22462994048527393, + "grad_norm": 1.525534179809715, + "learning_rate": 9.040430035317184e-06, + "loss": 0.3855, + "step": 736 + }, + { + "epoch": 0.22493514420875935, + "grad_norm": 1.5007721185282292, + "learning_rate": 9.037515563096096e-06, + "loss": 0.2867, + "step": 737 + }, + { + "epoch": 0.22524034793224476, + "grad_norm": 1.7439351574891226, + "learning_rate": 9.034597142853685e-06, + "loss": 0.5206, + "step": 738 + }, + { + "epoch": 0.2255455516557302, + "grad_norm": 1.3832916895025305, + "learning_rate": 9.031674777443679e-06, + "loss": 0.3525, + "step": 739 + }, + { + "epoch": 0.22585075537921562, + "grad_norm": 1.689345092847421, + "learning_rate": 9.028748469723671e-06, + "loss": 0.605, + "step": 740 + }, + { + "epoch": 0.22615595910270106, + "grad_norm": 1.5414939302466633, + "learning_rate": 9.025818222555105e-06, + "loss": 0.4616, + "step": 741 + }, + { + "epoch": 0.22646116282618647, + "grad_norm": 1.572475405138349, + "learning_rate": 9.022884038803275e-06, + "loss": 0.4125, + "step": 742 + }, + { + "epoch": 0.22676636654967192, + "grad_norm": 1.6244915386812042, + "learning_rate": 9.019945921337328e-06, + "loss": 0.3892, + "step": 743 + }, + { + "epoch": 0.22707157027315733, + "grad_norm": 1.109983250603302, + "learning_rate": 9.017003873030254e-06, + "loss": 0.2775, + "step": 744 + }, + { + "epoch": 0.22737677399664277, + "grad_norm": 1.2908690769048217, + "learning_rate": 9.014057896758892e-06, + "loss": 0.2799, + "step": 745 + }, + { + "epoch": 0.2276819777201282, + "grad_norm": 1.3869598608885343, + "learning_rate": 9.011107995403917e-06, + "loss": 0.3109, + "step": 746 + }, + { + "epoch": 0.2279871814436136, + "grad_norm": 1.9493518241147973, + "learning_rate": 9.008154171849843e-06, + "loss": 0.3222, + "step": 747 + }, + { + "epoch": 0.22829238516709904, + "grad_norm": 1.1260119760970688, + "learning_rate": 9.005196428985024e-06, + "loss": 0.2015, + "step": 748 + }, + { + "epoch": 0.22859758889058446, + "grad_norm": 1.3395279047737458, + "learning_rate": 9.002234769701637e-06, + "loss": 0.3174, + "step": 749 + }, + { + "epoch": 0.2289027926140699, + "grad_norm": 1.127299131698768, + "learning_rate": 8.999269196895698e-06, + "loss": 0.3164, + "step": 750 + }, + { + "epoch": 0.22920799633755531, + "grad_norm": 1.1374371018678644, + "learning_rate": 8.996299713467044e-06, + "loss": 0.3252, + "step": 751 + }, + { + "epoch": 0.22951320006104076, + "grad_norm": 1.4696910359597422, + "learning_rate": 8.993326322319339e-06, + "loss": 0.4006, + "step": 752 + }, + { + "epoch": 0.22981840378452617, + "grad_norm": 1.862503625238974, + "learning_rate": 8.990349026360065e-06, + "loss": 0.5648, + "step": 753 + }, + { + "epoch": 0.23012360750801159, + "grad_norm": 1.768109354521003, + "learning_rate": 8.987367828500528e-06, + "loss": 0.4002, + "step": 754 + }, + { + "epoch": 0.23042881123149703, + "grad_norm": 1.6108609348753753, + "learning_rate": 8.984382731655842e-06, + "loss": 0.3703, + "step": 755 + }, + { + "epoch": 0.23073401495498244, + "grad_norm": 2.0879815191569153, + "learning_rate": 8.98139373874494e-06, + "loss": 0.4966, + "step": 756 + }, + { + "epoch": 0.23103921867846788, + "grad_norm": 1.627248748628783, + "learning_rate": 8.978400852690557e-06, + "loss": 0.4972, + "step": 757 + }, + { + "epoch": 0.2313444224019533, + "grad_norm": 1.6245326323749618, + "learning_rate": 8.975404076419244e-06, + "loss": 0.2992, + "step": 758 + }, + { + "epoch": 0.23164962612543874, + "grad_norm": 1.295122180020727, + "learning_rate": 8.972403412861354e-06, + "loss": 0.1872, + "step": 759 + }, + { + "epoch": 0.23195482984892415, + "grad_norm": 1.5772423116895944, + "learning_rate": 8.969398864951034e-06, + "loss": 0.3683, + "step": 760 + }, + { + "epoch": 0.2322600335724096, + "grad_norm": 1.2952086072241094, + "learning_rate": 8.966390435626233e-06, + "loss": 0.3407, + "step": 761 + }, + { + "epoch": 0.232565237295895, + "grad_norm": 1.350730242849987, + "learning_rate": 8.963378127828703e-06, + "loss": 0.3012, + "step": 762 + }, + { + "epoch": 0.23287044101938043, + "grad_norm": 1.757344365832517, + "learning_rate": 8.960361944503976e-06, + "loss": 0.4483, + "step": 763 + }, + { + "epoch": 0.23317564474286587, + "grad_norm": 1.4971687299475023, + "learning_rate": 8.957341888601382e-06, + "loss": 0.3751, + "step": 764 + }, + { + "epoch": 0.23348084846635128, + "grad_norm": 0.9570785510771707, + "learning_rate": 8.954317963074035e-06, + "loss": 0.1987, + "step": 765 + }, + { + "epoch": 0.23378605218983672, + "grad_norm": 1.8380667817910563, + "learning_rate": 8.951290170878834e-06, + "loss": 0.4731, + "step": 766 + }, + { + "epoch": 0.23409125591332214, + "grad_norm": 1.112944434897937, + "learning_rate": 8.948258514976456e-06, + "loss": 0.2407, + "step": 767 + }, + { + "epoch": 0.23439645963680758, + "grad_norm": 1.309366755129546, + "learning_rate": 8.94522299833136e-06, + "loss": 0.319, + "step": 768 + }, + { + "epoch": 0.234701663360293, + "grad_norm": 1.1451808921483004, + "learning_rate": 8.94218362391178e-06, + "loss": 0.2171, + "step": 769 + }, + { + "epoch": 0.2350068670837784, + "grad_norm": 1.428457846554008, + "learning_rate": 8.939140394689716e-06, + "loss": 0.294, + "step": 770 + }, + { + "epoch": 0.23531207080726385, + "grad_norm": 1.6548235526629411, + "learning_rate": 8.936093313640947e-06, + "loss": 0.4041, + "step": 771 + }, + { + "epoch": 0.23561727453074927, + "grad_norm": 1.4221218858946143, + "learning_rate": 8.93304238374501e-06, + "loss": 0.2095, + "step": 772 + }, + { + "epoch": 0.2359224782542347, + "grad_norm": 1.5383523764762757, + "learning_rate": 8.929987607985212e-06, + "loss": 0.478, + "step": 773 + }, + { + "epoch": 0.23622768197772012, + "grad_norm": 1.5292649150056814, + "learning_rate": 8.926928989348612e-06, + "loss": 0.4167, + "step": 774 + }, + { + "epoch": 0.23653288570120556, + "grad_norm": 1.6476214228386405, + "learning_rate": 8.923866530826037e-06, + "loss": 0.4843, + "step": 775 + }, + { + "epoch": 0.23683808942469098, + "grad_norm": 1.5924467106022067, + "learning_rate": 8.920800235412067e-06, + "loss": 0.407, + "step": 776 + }, + { + "epoch": 0.23714329314817642, + "grad_norm": 1.5594856941518516, + "learning_rate": 8.917730106105022e-06, + "loss": 0.3124, + "step": 777 + }, + { + "epoch": 0.23744849687166183, + "grad_norm": 1.386797749592837, + "learning_rate": 8.914656145906988e-06, + "loss": 0.3016, + "step": 778 + }, + { + "epoch": 0.23775370059514725, + "grad_norm": 1.3692351348081182, + "learning_rate": 8.911578357823784e-06, + "loss": 0.3401, + "step": 779 + }, + { + "epoch": 0.2380589043186327, + "grad_norm": 1.389706254561915, + "learning_rate": 8.90849674486498e-06, + "loss": 0.3724, + "step": 780 + }, + { + "epoch": 0.2383641080421181, + "grad_norm": 1.05409970328404, + "learning_rate": 8.90541131004388e-06, + "loss": 0.2848, + "step": 781 + }, + { + "epoch": 0.23866931176560355, + "grad_norm": 1.5742105715707222, + "learning_rate": 8.90232205637753e-06, + "loss": 0.6582, + "step": 782 + }, + { + "epoch": 0.23897451548908896, + "grad_norm": 1.4335601553241863, + "learning_rate": 8.899228986886709e-06, + "loss": 0.4478, + "step": 783 + }, + { + "epoch": 0.2392797192125744, + "grad_norm": 1.692276024021936, + "learning_rate": 8.896132104595925e-06, + "loss": 0.2317, + "step": 784 + }, + { + "epoch": 0.23958492293605982, + "grad_norm": 1.4666941437892842, + "learning_rate": 8.893031412533415e-06, + "loss": 0.4121, + "step": 785 + }, + { + "epoch": 0.23989012665954523, + "grad_norm": 1.7867397905919786, + "learning_rate": 8.889926913731144e-06, + "loss": 0.3591, + "step": 786 + }, + { + "epoch": 0.24019533038303068, + "grad_norm": 1.5710332933779532, + "learning_rate": 8.886818611224798e-06, + "loss": 0.3421, + "step": 787 + }, + { + "epoch": 0.2405005341065161, + "grad_norm": 1.0993405381050079, + "learning_rate": 8.883706508053778e-06, + "loss": 0.1907, + "step": 788 + }, + { + "epoch": 0.24080573783000153, + "grad_norm": 1.5126727738496022, + "learning_rate": 8.88059060726121e-06, + "loss": 0.5117, + "step": 789 + }, + { + "epoch": 0.24111094155348695, + "grad_norm": 1.0050194650159943, + "learning_rate": 8.877470911893927e-06, + "loss": 0.2927, + "step": 790 + }, + { + "epoch": 0.2414161452769724, + "grad_norm": 1.549032075208785, + "learning_rate": 8.874347425002474e-06, + "loss": 0.4823, + "step": 791 + }, + { + "epoch": 0.2417213490004578, + "grad_norm": 1.4199913878918462, + "learning_rate": 8.871220149641101e-06, + "loss": 0.372, + "step": 792 + }, + { + "epoch": 0.24202655272394324, + "grad_norm": 1.2204976389548388, + "learning_rate": 8.868089088867772e-06, + "loss": 0.4162, + "step": 793 + }, + { + "epoch": 0.24233175644742866, + "grad_norm": 1.2773847194249226, + "learning_rate": 8.86495424574414e-06, + "loss": 0.3559, + "step": 794 + }, + { + "epoch": 0.24263696017091407, + "grad_norm": 1.5146172702910354, + "learning_rate": 8.861815623335564e-06, + "loss": 0.4853, + "step": 795 + }, + { + "epoch": 0.24294216389439952, + "grad_norm": 1.2421815500176878, + "learning_rate": 8.858673224711097e-06, + "loss": 0.2322, + "step": 796 + }, + { + "epoch": 0.24324736761788493, + "grad_norm": 1.573303107817239, + "learning_rate": 8.855527052943482e-06, + "loss": 0.4221, + "step": 797 + }, + { + "epoch": 0.24355257134137037, + "grad_norm": 1.3348685555629807, + "learning_rate": 8.852377111109158e-06, + "loss": 0.4016, + "step": 798 + }, + { + "epoch": 0.2438577750648558, + "grad_norm": 1.4724771810683823, + "learning_rate": 8.849223402288244e-06, + "loss": 0.3673, + "step": 799 + }, + { + "epoch": 0.24416297878834123, + "grad_norm": 1.4577243169053915, + "learning_rate": 8.846065929564542e-06, + "loss": 0.3661, + "step": 800 + }, + { + "epoch": 0.24446818251182664, + "grad_norm": 1.2504498625919398, + "learning_rate": 8.842904696025542e-06, + "loss": 0.3142, + "step": 801 + }, + { + "epoch": 0.24477338623531206, + "grad_norm": 1.4501852015903842, + "learning_rate": 8.839739704762404e-06, + "loss": 0.4218, + "step": 802 + }, + { + "epoch": 0.2450785899587975, + "grad_norm": 1.814308645288012, + "learning_rate": 8.836570958869966e-06, + "loss": 0.2846, + "step": 803 + }, + { + "epoch": 0.2453837936822829, + "grad_norm": 1.4130518552602014, + "learning_rate": 8.833398461446736e-06, + "loss": 0.3661, + "step": 804 + }, + { + "epoch": 0.24568899740576836, + "grad_norm": 1.207043545854772, + "learning_rate": 8.83022221559489e-06, + "loss": 0.3083, + "step": 805 + }, + { + "epoch": 0.24599420112925377, + "grad_norm": 1.3695879979407652, + "learning_rate": 8.827042224420273e-06, + "loss": 0.3913, + "step": 806 + }, + { + "epoch": 0.2462994048527392, + "grad_norm": 1.509429933800135, + "learning_rate": 8.823858491032388e-06, + "loss": 0.4189, + "step": 807 + }, + { + "epoch": 0.24660460857622463, + "grad_norm": 1.5440694625132374, + "learning_rate": 8.820671018544398e-06, + "loss": 0.4917, + "step": 808 + }, + { + "epoch": 0.24690981229971007, + "grad_norm": 1.2648979736165928, + "learning_rate": 8.817479810073124e-06, + "loss": 0.2272, + "step": 809 + }, + { + "epoch": 0.24721501602319548, + "grad_norm": 1.5676001662475934, + "learning_rate": 8.814284868739038e-06, + "loss": 0.3194, + "step": 810 + }, + { + "epoch": 0.2475202197466809, + "grad_norm": 1.5083433807769027, + "learning_rate": 8.811086197666266e-06, + "loss": 0.4213, + "step": 811 + }, + { + "epoch": 0.24782542347016634, + "grad_norm": 1.428992764464654, + "learning_rate": 8.807883799982574e-06, + "loss": 0.3399, + "step": 812 + }, + { + "epoch": 0.24813062719365175, + "grad_norm": 1.0166441279138219, + "learning_rate": 8.80467767881938e-06, + "loss": 0.2195, + "step": 813 + }, + { + "epoch": 0.2484358309171372, + "grad_norm": 1.2422483417317973, + "learning_rate": 8.801467837311738e-06, + "loss": 0.2906, + "step": 814 + }, + { + "epoch": 0.2487410346406226, + "grad_norm": 1.419560487453445, + "learning_rate": 8.798254278598345e-06, + "loss": 0.4495, + "step": 815 + }, + { + "epoch": 0.24904623836410805, + "grad_norm": 1.16665115800268, + "learning_rate": 8.795037005821521e-06, + "loss": 0.2503, + "step": 816 + }, + { + "epoch": 0.24935144208759347, + "grad_norm": 1.4481353411677762, + "learning_rate": 8.791816022127233e-06, + "loss": 0.3363, + "step": 817 + }, + { + "epoch": 0.2496566458110789, + "grad_norm": 1.338264713753052, + "learning_rate": 8.788591330665065e-06, + "loss": 0.2325, + "step": 818 + }, + { + "epoch": 0.24996184953456432, + "grad_norm": 1.3941866237825566, + "learning_rate": 8.785362934588234e-06, + "loss": 0.4524, + "step": 819 + }, + { + "epoch": 0.25026705325804977, + "grad_norm": 1.508098753010285, + "learning_rate": 8.782130837053575e-06, + "loss": 0.3796, + "step": 820 + }, + { + "epoch": 0.2505722569815352, + "grad_norm": 1.3573838043362958, + "learning_rate": 8.778895041221544e-06, + "loss": 0.2623, + "step": 821 + }, + { + "epoch": 0.2508774607050206, + "grad_norm": 1.3709683700574562, + "learning_rate": 8.775655550256214e-06, + "loss": 0.3998, + "step": 822 + }, + { + "epoch": 0.251182664428506, + "grad_norm": 1.5964366482889705, + "learning_rate": 8.772412367325269e-06, + "loss": 0.455, + "step": 823 + }, + { + "epoch": 0.2514878681519915, + "grad_norm": 1.6611709904306184, + "learning_rate": 8.769165495600007e-06, + "loss": 0.5094, + "step": 824 + }, + { + "epoch": 0.2517930718754769, + "grad_norm": 1.1400080018013656, + "learning_rate": 8.76591493825533e-06, + "loss": 0.3997, + "step": 825 + }, + { + "epoch": 0.2520982755989623, + "grad_norm": 1.862472646627605, + "learning_rate": 8.76266069846974e-06, + "loss": 0.3198, + "step": 826 + }, + { + "epoch": 0.2524034793224477, + "grad_norm": 1.6623689872345544, + "learning_rate": 8.75940277942535e-06, + "loss": 0.5344, + "step": 827 + }, + { + "epoch": 0.25270868304593314, + "grad_norm": 1.4234557534815584, + "learning_rate": 8.756141184307865e-06, + "loss": 0.3674, + "step": 828 + }, + { + "epoch": 0.2530138867694186, + "grad_norm": 1.7406127381363719, + "learning_rate": 8.75287591630658e-06, + "loss": 0.4348, + "step": 829 + }, + { + "epoch": 0.253319090492904, + "grad_norm": 1.4903918741773559, + "learning_rate": 8.749606978614391e-06, + "loss": 0.2545, + "step": 830 + }, + { + "epoch": 0.25362429421638943, + "grad_norm": 1.8076324214942547, + "learning_rate": 8.746334374427774e-06, + "loss": 0.4559, + "step": 831 + }, + { + "epoch": 0.25392949793987485, + "grad_norm": 1.3532948940123553, + "learning_rate": 8.743058106946798e-06, + "loss": 0.2642, + "step": 832 + }, + { + "epoch": 0.2542347016633603, + "grad_norm": 1.6339304623976283, + "learning_rate": 8.739778179375105e-06, + "loss": 0.3866, + "step": 833 + }, + { + "epoch": 0.25453990538684573, + "grad_norm": 1.2532848113555968, + "learning_rate": 8.736494594919922e-06, + "loss": 0.2563, + "step": 834 + }, + { + "epoch": 0.25484510911033115, + "grad_norm": 1.062909552234949, + "learning_rate": 8.733207356792053e-06, + "loss": 0.2942, + "step": 835 + }, + { + "epoch": 0.25515031283381656, + "grad_norm": 1.3812270046063335, + "learning_rate": 8.72991646820587e-06, + "loss": 0.4591, + "step": 836 + }, + { + "epoch": 0.255455516557302, + "grad_norm": 1.401063935434944, + "learning_rate": 8.726621932379319e-06, + "loss": 0.3184, + "step": 837 + }, + { + "epoch": 0.25576072028078745, + "grad_norm": 1.394771394969279, + "learning_rate": 8.723323752533907e-06, + "loss": 0.2373, + "step": 838 + }, + { + "epoch": 0.25606592400427286, + "grad_norm": 1.4771850726658662, + "learning_rate": 8.72002193189471e-06, + "loss": 0.384, + "step": 839 + }, + { + "epoch": 0.2563711277277583, + "grad_norm": 1.278173381384045, + "learning_rate": 8.716716473690361e-06, + "loss": 0.2333, + "step": 840 + }, + { + "epoch": 0.2566763314512437, + "grad_norm": 1.5231335458997086, + "learning_rate": 8.713407381153052e-06, + "loss": 0.4792, + "step": 841 + }, + { + "epoch": 0.25698153517472916, + "grad_norm": 1.2138032535770222, + "learning_rate": 8.710094657518524e-06, + "loss": 0.1745, + "step": 842 + }, + { + "epoch": 0.2572867388982146, + "grad_norm": 1.6203262902374684, + "learning_rate": 8.706778306026073e-06, + "loss": 0.392, + "step": 843 + }, + { + "epoch": 0.2575919426217, + "grad_norm": 1.5129887571452891, + "learning_rate": 8.703458329918541e-06, + "loss": 0.3458, + "step": 844 + }, + { + "epoch": 0.2578971463451854, + "grad_norm": 1.579903535191777, + "learning_rate": 8.700134732442318e-06, + "loss": 0.4183, + "step": 845 + }, + { + "epoch": 0.2582023500686708, + "grad_norm": 1.6503110245689208, + "learning_rate": 8.69680751684733e-06, + "loss": 0.5822, + "step": 846 + }, + { + "epoch": 0.2585075537921563, + "grad_norm": 1.4889049274535986, + "learning_rate": 8.693476686387038e-06, + "loss": 0.4368, + "step": 847 + }, + { + "epoch": 0.2588127575156417, + "grad_norm": 1.1464358161897683, + "learning_rate": 8.69014224431845e-06, + "loss": 0.2515, + "step": 848 + }, + { + "epoch": 0.2591179612391271, + "grad_norm": 1.759904896160663, + "learning_rate": 8.686804193902092e-06, + "loss": 0.3729, + "step": 849 + }, + { + "epoch": 0.25942316496261253, + "grad_norm": 1.6474376376075388, + "learning_rate": 8.683462538402026e-06, + "loss": 0.2528, + "step": 850 + }, + { + "epoch": 0.25972836868609794, + "grad_norm": 1.6153335633088148, + "learning_rate": 8.680117281085839e-06, + "loss": 0.4734, + "step": 851 + }, + { + "epoch": 0.2600335724095834, + "grad_norm": 1.7594579836313058, + "learning_rate": 8.676768425224636e-06, + "loss": 0.3666, + "step": 852 + }, + { + "epoch": 0.2603387761330688, + "grad_norm": 2.7725868317444973, + "learning_rate": 8.673415974093046e-06, + "loss": 0.3024, + "step": 853 + }, + { + "epoch": 0.26064397985655424, + "grad_norm": 1.3139609425748013, + "learning_rate": 8.67005993096921e-06, + "loss": 0.2145, + "step": 854 + }, + { + "epoch": 0.26094918358003966, + "grad_norm": 1.5222687762088085, + "learning_rate": 8.66670029913478e-06, + "loss": 0.306, + "step": 855 + }, + { + "epoch": 0.2612543873035251, + "grad_norm": 1.225174136338348, + "learning_rate": 8.663337081874921e-06, + "loss": 0.1945, + "step": 856 + }, + { + "epoch": 0.26155959102701054, + "grad_norm": 1.5844026100416482, + "learning_rate": 8.659970282478303e-06, + "loss": 0.5302, + "step": 857 + }, + { + "epoch": 0.26186479475049596, + "grad_norm": 1.4779095030916518, + "learning_rate": 8.656599904237097e-06, + "loss": 0.492, + "step": 858 + }, + { + "epoch": 0.26216999847398137, + "grad_norm": 1.6248924513219443, + "learning_rate": 8.653225950446973e-06, + "loss": 0.5415, + "step": 859 + }, + { + "epoch": 0.2624752021974668, + "grad_norm": 1.3916992153237053, + "learning_rate": 8.649848424407103e-06, + "loss": 0.2716, + "step": 860 + }, + { + "epoch": 0.26278040592095225, + "grad_norm": 1.5314790982505428, + "learning_rate": 8.646467329420145e-06, + "loss": 0.486, + "step": 861 + }, + { + "epoch": 0.26308560964443767, + "grad_norm": 1.474685202158443, + "learning_rate": 8.64308266879225e-06, + "loss": 0.4944, + "step": 862 + }, + { + "epoch": 0.2633908133679231, + "grad_norm": 2.089424447750544, + "learning_rate": 8.639694445833056e-06, + "loss": 0.5607, + "step": 863 + }, + { + "epoch": 0.2636960170914085, + "grad_norm": 1.6425002507132653, + "learning_rate": 8.636302663855682e-06, + "loss": 0.3179, + "step": 864 + }, + { + "epoch": 0.26400122081489397, + "grad_norm": 1.2523110959560677, + "learning_rate": 8.632907326176732e-06, + "loss": 0.3304, + "step": 865 + }, + { + "epoch": 0.2643064245383794, + "grad_norm": 1.9713067301422675, + "learning_rate": 8.629508436116281e-06, + "loss": 0.3482, + "step": 866 + }, + { + "epoch": 0.2646116282618648, + "grad_norm": 1.574438794377994, + "learning_rate": 8.626105996997881e-06, + "loss": 0.3681, + "step": 867 + }, + { + "epoch": 0.2649168319853502, + "grad_norm": 1.975886896641686, + "learning_rate": 8.622700012148554e-06, + "loss": 0.3096, + "step": 868 + }, + { + "epoch": 0.2652220357088356, + "grad_norm": 1.205037763961463, + "learning_rate": 8.619290484898791e-06, + "loss": 0.2542, + "step": 869 + }, + { + "epoch": 0.2655272394323211, + "grad_norm": 1.410569402556539, + "learning_rate": 8.615877418582541e-06, + "loss": 0.3157, + "step": 870 + }, + { + "epoch": 0.2658324431558065, + "grad_norm": 1.4086097839086094, + "learning_rate": 8.61246081653722e-06, + "loss": 0.2501, + "step": 871 + }, + { + "epoch": 0.2661376468792919, + "grad_norm": 1.8258092836473052, + "learning_rate": 8.609040682103697e-06, + "loss": 0.5621, + "step": 872 + }, + { + "epoch": 0.26644285060277734, + "grad_norm": 1.5606382340865774, + "learning_rate": 8.6056170186263e-06, + "loss": 0.4155, + "step": 873 + }, + { + "epoch": 0.2667480543262628, + "grad_norm": 1.4738547712233099, + "learning_rate": 8.6021898294528e-06, + "loss": 0.3736, + "step": 874 + }, + { + "epoch": 0.2670532580497482, + "grad_norm": 1.2521166999589783, + "learning_rate": 8.598759117934424e-06, + "loss": 0.2707, + "step": 875 + }, + { + "epoch": 0.26735846177323364, + "grad_norm": 1.854567909402273, + "learning_rate": 8.595324887425839e-06, + "loss": 0.6055, + "step": 876 + }, + { + "epoch": 0.26766366549671905, + "grad_norm": 1.3156991751956812, + "learning_rate": 8.59188714128515e-06, + "loss": 0.1861, + "step": 877 + }, + { + "epoch": 0.26796886922020446, + "grad_norm": 1.2377078296039872, + "learning_rate": 8.58844588287391e-06, + "loss": 0.3001, + "step": 878 + }, + { + "epoch": 0.26827407294368993, + "grad_norm": 1.2660582471980335, + "learning_rate": 8.585001115557093e-06, + "loss": 0.3137, + "step": 879 + }, + { + "epoch": 0.26857927666717535, + "grad_norm": 1.6544970466723443, + "learning_rate": 8.581552842703114e-06, + "loss": 0.518, + "step": 880 + }, + { + "epoch": 0.26888448039066076, + "grad_norm": 1.7821640881601462, + "learning_rate": 8.57810106768381e-06, + "loss": 0.3717, + "step": 881 + }, + { + "epoch": 0.2691896841141462, + "grad_norm": 1.4045088267172914, + "learning_rate": 8.57464579387445e-06, + "loss": 0.507, + "step": 882 + }, + { + "epoch": 0.2694948878376316, + "grad_norm": 1.2562243938208713, + "learning_rate": 8.571187024653715e-06, + "loss": 0.2897, + "step": 883 + }, + { + "epoch": 0.26980009156111706, + "grad_norm": 1.647258608185494, + "learning_rate": 8.567724763403709e-06, + "loss": 0.4629, + "step": 884 + }, + { + "epoch": 0.2701052952846025, + "grad_norm": 1.4132863649325282, + "learning_rate": 8.564259013509952e-06, + "loss": 0.2711, + "step": 885 + }, + { + "epoch": 0.2704104990080879, + "grad_norm": 1.0546579427462797, + "learning_rate": 8.560789778361372e-06, + "loss": 0.1637, + "step": 886 + }, + { + "epoch": 0.2707157027315733, + "grad_norm": 1.8546260807501882, + "learning_rate": 8.557317061350303e-06, + "loss": 0.3391, + "step": 887 + }, + { + "epoch": 0.2710209064550588, + "grad_norm": 1.23880229781815, + "learning_rate": 8.553840865872492e-06, + "loss": 0.1623, + "step": 888 + }, + { + "epoch": 0.2713261101785442, + "grad_norm": 1.5924230549187588, + "learning_rate": 8.550361195327079e-06, + "loss": 0.3142, + "step": 889 + }, + { + "epoch": 0.2716313139020296, + "grad_norm": 1.3037915837064973, + "learning_rate": 8.546878053116609e-06, + "loss": 0.2978, + "step": 890 + }, + { + "epoch": 0.271936517625515, + "grad_norm": 1.2055323902273756, + "learning_rate": 8.543391442647013e-06, + "loss": 0.2708, + "step": 891 + }, + { + "epoch": 0.27224172134900043, + "grad_norm": 1.1232141519442895, + "learning_rate": 8.539901367327622e-06, + "loss": 0.2381, + "step": 892 + }, + { + "epoch": 0.2725469250724859, + "grad_norm": 1.6705142749495014, + "learning_rate": 8.53640783057115e-06, + "loss": 0.3586, + "step": 893 + }, + { + "epoch": 0.2728521287959713, + "grad_norm": 1.1042663181369634, + "learning_rate": 8.532910835793697e-06, + "loss": 0.142, + "step": 894 + }, + { + "epoch": 0.27315733251945673, + "grad_norm": 1.4442013980293982, + "learning_rate": 8.529410386414747e-06, + "loss": 0.3902, + "step": 895 + }, + { + "epoch": 0.27346253624294214, + "grad_norm": 1.1510191464990058, + "learning_rate": 8.525906485857156e-06, + "loss": 0.2468, + "step": 896 + }, + { + "epoch": 0.2737677399664276, + "grad_norm": 1.5572115857341773, + "learning_rate": 8.522399137547162e-06, + "loss": 0.3039, + "step": 897 + }, + { + "epoch": 0.27407294368991303, + "grad_norm": 1.7209680290555225, + "learning_rate": 8.518888344914365e-06, + "loss": 0.5024, + "step": 898 + }, + { + "epoch": 0.27437814741339844, + "grad_norm": 1.3970675444552212, + "learning_rate": 8.515374111391745e-06, + "loss": 0.3327, + "step": 899 + }, + { + "epoch": 0.27468335113688386, + "grad_norm": 1.5862907429686486, + "learning_rate": 8.511856440415635e-06, + "loss": 0.4007, + "step": 900 + }, + { + "epoch": 0.27498855486036927, + "grad_norm": 1.5069995329007524, + "learning_rate": 8.508335335425739e-06, + "loss": 0.415, + "step": 901 + }, + { + "epoch": 0.27529375858385474, + "grad_norm": 1.3954829540134184, + "learning_rate": 8.50481079986511e-06, + "loss": 0.2788, + "step": 902 + }, + { + "epoch": 0.27559896230734016, + "grad_norm": 1.8025662725066627, + "learning_rate": 8.501282837180165e-06, + "loss": 0.3723, + "step": 903 + }, + { + "epoch": 0.27590416603082557, + "grad_norm": 1.520821384052033, + "learning_rate": 8.49775145082066e-06, + "loss": 0.2988, + "step": 904 + }, + { + "epoch": 0.276209369754311, + "grad_norm": 1.604241860679225, + "learning_rate": 8.494216644239712e-06, + "loss": 0.2951, + "step": 905 + }, + { + "epoch": 0.27651457347779645, + "grad_norm": 1.3350559025036193, + "learning_rate": 8.490678420893773e-06, + "loss": 0.2941, + "step": 906 + }, + { + "epoch": 0.27681977720128187, + "grad_norm": 1.3989463178848165, + "learning_rate": 8.487136784242641e-06, + "loss": 0.3683, + "step": 907 + }, + { + "epoch": 0.2771249809247673, + "grad_norm": 1.6081399715718512, + "learning_rate": 8.483591737749448e-06, + "loss": 0.4503, + "step": 908 + }, + { + "epoch": 0.2774301846482527, + "grad_norm": 1.6045658886374856, + "learning_rate": 8.480043284880666e-06, + "loss": 0.3672, + "step": 909 + }, + { + "epoch": 0.2777353883717381, + "grad_norm": 1.5365693477891538, + "learning_rate": 8.476491429106088e-06, + "loss": 0.5018, + "step": 910 + }, + { + "epoch": 0.2780405920952236, + "grad_norm": 1.306165577473847, + "learning_rate": 8.472936173898846e-06, + "loss": 0.3336, + "step": 911 + }, + { + "epoch": 0.278345795818709, + "grad_norm": 1.2410262335833262, + "learning_rate": 8.469377522735387e-06, + "loss": 0.3175, + "step": 912 + }, + { + "epoch": 0.2786509995421944, + "grad_norm": 0.9618771928411501, + "learning_rate": 8.465815479095485e-06, + "loss": 0.2373, + "step": 913 + }, + { + "epoch": 0.2789562032656798, + "grad_norm": 2.139593836663555, + "learning_rate": 8.462250046462226e-06, + "loss": 0.4121, + "step": 914 + }, + { + "epoch": 0.2792614069891653, + "grad_norm": 1.3283828036351077, + "learning_rate": 8.458681228322014e-06, + "loss": 0.38, + "step": 915 + }, + { + "epoch": 0.2795666107126507, + "grad_norm": 1.4749702062668928, + "learning_rate": 8.455109028164561e-06, + "loss": 0.4707, + "step": 916 + }, + { + "epoch": 0.2798718144361361, + "grad_norm": 1.2064530552545678, + "learning_rate": 8.45153344948289e-06, + "loss": 0.3217, + "step": 917 + }, + { + "epoch": 0.28017701815962154, + "grad_norm": 1.6256035637532993, + "learning_rate": 8.44795449577332e-06, + "loss": 0.4455, + "step": 918 + }, + { + "epoch": 0.28048222188310695, + "grad_norm": 1.3626337563191635, + "learning_rate": 8.444372170535478e-06, + "loss": 0.3391, + "step": 919 + }, + { + "epoch": 0.2807874256065924, + "grad_norm": 1.2635973004795618, + "learning_rate": 8.440786477272282e-06, + "loss": 0.2515, + "step": 920 + }, + { + "epoch": 0.28109262933007784, + "grad_norm": 1.1911478731743688, + "learning_rate": 8.437197419489948e-06, + "loss": 0.2891, + "step": 921 + }, + { + "epoch": 0.28139783305356325, + "grad_norm": 1.3587095012788262, + "learning_rate": 8.43360500069798e-06, + "loss": 0.416, + "step": 922 + }, + { + "epoch": 0.28170303677704867, + "grad_norm": 1.3537329737331374, + "learning_rate": 8.430009224409167e-06, + "loss": 0.2851, + "step": 923 + }, + { + "epoch": 0.2820082405005341, + "grad_norm": 1.0905607319343713, + "learning_rate": 8.426410094139582e-06, + "loss": 0.219, + "step": 924 + }, + { + "epoch": 0.28231344422401955, + "grad_norm": 1.2957621936139718, + "learning_rate": 8.422807613408583e-06, + "loss": 0.2757, + "step": 925 + }, + { + "epoch": 0.28261864794750496, + "grad_norm": 1.5464703579715704, + "learning_rate": 8.419201785738794e-06, + "loss": 0.488, + "step": 926 + }, + { + "epoch": 0.2829238516709904, + "grad_norm": 1.5027192422666622, + "learning_rate": 8.415592614656119e-06, + "loss": 0.298, + "step": 927 + }, + { + "epoch": 0.2832290553944758, + "grad_norm": 1.5041494197684437, + "learning_rate": 8.411980103689727e-06, + "loss": 0.2037, + "step": 928 + }, + { + "epoch": 0.28353425911796126, + "grad_norm": 1.1718395482104975, + "learning_rate": 8.40836425637206e-06, + "loss": 0.1936, + "step": 929 + }, + { + "epoch": 0.2838394628414467, + "grad_norm": 1.4034578790843537, + "learning_rate": 8.404745076238816e-06, + "loss": 0.3402, + "step": 930 + }, + { + "epoch": 0.2841446665649321, + "grad_norm": 1.471548215059981, + "learning_rate": 8.40112256682895e-06, + "loss": 0.3633, + "step": 931 + }, + { + "epoch": 0.2844498702884175, + "grad_norm": 2.534571034931892, + "learning_rate": 8.39749673168468e-06, + "loss": 0.3617, + "step": 932 + }, + { + "epoch": 0.2847550740119029, + "grad_norm": 1.5477637618039504, + "learning_rate": 8.39386757435147e-06, + "loss": 0.4332, + "step": 933 + }, + { + "epoch": 0.2850602777353884, + "grad_norm": 1.058523928884, + "learning_rate": 8.390235098378037e-06, + "loss": 0.2018, + "step": 934 + }, + { + "epoch": 0.2853654814588738, + "grad_norm": 1.051963849726592, + "learning_rate": 8.38659930731634e-06, + "loss": 0.2024, + "step": 935 + }, + { + "epoch": 0.2856706851823592, + "grad_norm": 1.4307030578090003, + "learning_rate": 8.382960204721575e-06, + "loss": 0.3186, + "step": 936 + }, + { + "epoch": 0.28597588890584463, + "grad_norm": 1.2191946734542334, + "learning_rate": 8.37931779415219e-06, + "loss": 0.2907, + "step": 937 + }, + { + "epoch": 0.2862810926293301, + "grad_norm": 1.4629392730406188, + "learning_rate": 8.375672079169854e-06, + "loss": 0.3628, + "step": 938 + }, + { + "epoch": 0.2865862963528155, + "grad_norm": 1.2671740435957073, + "learning_rate": 8.372023063339472e-06, + "loss": 0.3895, + "step": 939 + }, + { + "epoch": 0.28689150007630093, + "grad_norm": 1.2843029586747219, + "learning_rate": 8.36837075022918e-06, + "loss": 0.2448, + "step": 940 + }, + { + "epoch": 0.28719670379978635, + "grad_norm": 1.225493384282322, + "learning_rate": 8.36471514341033e-06, + "loss": 0.3751, + "step": 941 + }, + { + "epoch": 0.28750190752327176, + "grad_norm": 1.118820597376438, + "learning_rate": 8.361056246457509e-06, + "loss": 0.2959, + "step": 942 + }, + { + "epoch": 0.28780711124675723, + "grad_norm": 1.5826287542337327, + "learning_rate": 8.357394062948501e-06, + "loss": 0.1865, + "step": 943 + }, + { + "epoch": 0.28811231497024264, + "grad_norm": 1.7322615384777234, + "learning_rate": 8.353728596464324e-06, + "loss": 0.3319, + "step": 944 + }, + { + "epoch": 0.28841751869372806, + "grad_norm": 1.1819946263218575, + "learning_rate": 8.350059850589189e-06, + "loss": 0.258, + "step": 945 + }, + { + "epoch": 0.2887227224172135, + "grad_norm": 1.2768421179691603, + "learning_rate": 8.34638782891053e-06, + "loss": 0.3505, + "step": 946 + }, + { + "epoch": 0.28902792614069894, + "grad_norm": 1.6765785010321739, + "learning_rate": 8.342712535018968e-06, + "loss": 0.3988, + "step": 947 + }, + { + "epoch": 0.28933312986418436, + "grad_norm": 1.6460901695759294, + "learning_rate": 8.339033972508335e-06, + "loss": 0.5964, + "step": 948 + }, + { + "epoch": 0.28963833358766977, + "grad_norm": 1.3645220250147991, + "learning_rate": 8.335352144975657e-06, + "loss": 0.3997, + "step": 949 + }, + { + "epoch": 0.2899435373111552, + "grad_norm": 1.565124139219312, + "learning_rate": 8.331667056021151e-06, + "loss": 0.2581, + "step": 950 + }, + { + "epoch": 0.2902487410346406, + "grad_norm": 1.2163034972670224, + "learning_rate": 8.32797870924822e-06, + "loss": 0.3549, + "step": 951 + }, + { + "epoch": 0.29055394475812607, + "grad_norm": 1.9963649856615253, + "learning_rate": 8.324287108263462e-06, + "loss": 0.4499, + "step": 952 + }, + { + "epoch": 0.2908591484816115, + "grad_norm": 1.5738812787701257, + "learning_rate": 8.320592256676646e-06, + "loss": 0.4303, + "step": 953 + }, + { + "epoch": 0.2911643522050969, + "grad_norm": 1.51492812662889, + "learning_rate": 8.316894158100728e-06, + "loss": 0.3071, + "step": 954 + }, + { + "epoch": 0.2914695559285823, + "grad_norm": 1.5701047086118967, + "learning_rate": 8.313192816151832e-06, + "loss": 0.3527, + "step": 955 + }, + { + "epoch": 0.29177475965206773, + "grad_norm": 1.6660195604391899, + "learning_rate": 8.309488234449261e-06, + "loss": 0.4016, + "step": 956 + }, + { + "epoch": 0.2920799633755532, + "grad_norm": 1.5308116363222675, + "learning_rate": 8.30578041661548e-06, + "loss": 0.3991, + "step": 957 + }, + { + "epoch": 0.2923851670990386, + "grad_norm": 1.5118616317530167, + "learning_rate": 8.302069366276121e-06, + "loss": 0.4968, + "step": 958 + }, + { + "epoch": 0.292690370822524, + "grad_norm": 1.6934474407382258, + "learning_rate": 8.298355087059978e-06, + "loss": 0.5735, + "step": 959 + }, + { + "epoch": 0.29299557454600944, + "grad_norm": 1.4097627634984986, + "learning_rate": 8.294637582598998e-06, + "loss": 0.3816, + "step": 960 + }, + { + "epoch": 0.2933007782694949, + "grad_norm": 1.4214156530194402, + "learning_rate": 8.290916856528288e-06, + "loss": 0.3706, + "step": 961 + }, + { + "epoch": 0.2936059819929803, + "grad_norm": 1.3762680622131698, + "learning_rate": 8.287192912486098e-06, + "loss": 0.2482, + "step": 962 + }, + { + "epoch": 0.29391118571646574, + "grad_norm": 1.6238119477399238, + "learning_rate": 8.283465754113832e-06, + "loss": 0.483, + "step": 963 + }, + { + "epoch": 0.29421638943995115, + "grad_norm": 1.580806305043778, + "learning_rate": 8.279735385056028e-06, + "loss": 0.3277, + "step": 964 + }, + { + "epoch": 0.29452159316343657, + "grad_norm": 1.9306479469286622, + "learning_rate": 8.276001808960374e-06, + "loss": 0.448, + "step": 965 + }, + { + "epoch": 0.29482679688692204, + "grad_norm": 1.5497884575246978, + "learning_rate": 8.272265029477686e-06, + "loss": 0.3917, + "step": 966 + }, + { + "epoch": 0.29513200061040745, + "grad_norm": 1.5393309383304865, + "learning_rate": 8.26852505026192e-06, + "loss": 0.3033, + "step": 967 + }, + { + "epoch": 0.29543720433389287, + "grad_norm": 1.6980365016594954, + "learning_rate": 8.26478187497015e-06, + "loss": 0.2664, + "step": 968 + }, + { + "epoch": 0.2957424080573783, + "grad_norm": 1.363460408765191, + "learning_rate": 8.261035507262582e-06, + "loss": 0.254, + "step": 969 + }, + { + "epoch": 0.29604761178086375, + "grad_norm": 1.6405225449314822, + "learning_rate": 8.257285950802547e-06, + "loss": 0.4717, + "step": 970 + }, + { + "epoch": 0.29635281550434917, + "grad_norm": 1.4402490121062985, + "learning_rate": 8.253533209256486e-06, + "loss": 0.2763, + "step": 971 + }, + { + "epoch": 0.2966580192278346, + "grad_norm": 1.8581821237579783, + "learning_rate": 8.24977728629396e-06, + "loss": 0.5867, + "step": 972 + }, + { + "epoch": 0.29696322295132, + "grad_norm": 1.6644196462856506, + "learning_rate": 8.246018185587642e-06, + "loss": 0.6931, + "step": 973 + }, + { + "epoch": 0.2972684266748054, + "grad_norm": 1.7226101616124239, + "learning_rate": 8.242255910813308e-06, + "loss": 0.4707, + "step": 974 + }, + { + "epoch": 0.2975736303982909, + "grad_norm": 1.559017343729282, + "learning_rate": 8.238490465649837e-06, + "loss": 0.3913, + "step": 975 + }, + { + "epoch": 0.2978788341217763, + "grad_norm": 2.355733929284259, + "learning_rate": 8.234721853779212e-06, + "loss": 0.3725, + "step": 976 + }, + { + "epoch": 0.2981840378452617, + "grad_norm": 1.800766090445105, + "learning_rate": 8.230950078886512e-06, + "loss": 0.3071, + "step": 977 + }, + { + "epoch": 0.2984892415687471, + "grad_norm": 2.5023327434487097, + "learning_rate": 8.227175144659908e-06, + "loss": 0.5576, + "step": 978 + }, + { + "epoch": 0.2987944452922326, + "grad_norm": 1.657243682467184, + "learning_rate": 8.22339705479066e-06, + "loss": 0.3635, + "step": 979 + }, + { + "epoch": 0.299099649015718, + "grad_norm": 1.4497278352698486, + "learning_rate": 8.219615812973111e-06, + "loss": 0.3942, + "step": 980 + }, + { + "epoch": 0.2994048527392034, + "grad_norm": 1.4676463261815744, + "learning_rate": 8.215831422904694e-06, + "loss": 0.3701, + "step": 981 + }, + { + "epoch": 0.29971005646268883, + "grad_norm": 1.4714044973041538, + "learning_rate": 8.212043888285913e-06, + "loss": 0.3646, + "step": 982 + }, + { + "epoch": 0.30001526018617425, + "grad_norm": 1.8962188440528207, + "learning_rate": 8.208253212820349e-06, + "loss": 0.459, + "step": 983 + }, + { + "epoch": 0.3003204639096597, + "grad_norm": 1.5089138139920484, + "learning_rate": 8.204459400214657e-06, + "loss": 0.3636, + "step": 984 + }, + { + "epoch": 0.30062566763314513, + "grad_norm": 1.5444120585905727, + "learning_rate": 8.200662454178553e-06, + "loss": 0.3727, + "step": 985 + }, + { + "epoch": 0.30093087135663055, + "grad_norm": 1.3153314475890103, + "learning_rate": 8.196862378424826e-06, + "loss": 0.2808, + "step": 986 + }, + { + "epoch": 0.30123607508011596, + "grad_norm": 1.645327268934486, + "learning_rate": 8.193059176669317e-06, + "loss": 0.4536, + "step": 987 + }, + { + "epoch": 0.30154127880360143, + "grad_norm": 1.343818307959044, + "learning_rate": 8.189252852630927e-06, + "loss": 0.3137, + "step": 988 + }, + { + "epoch": 0.30184648252708685, + "grad_norm": 1.079353461920414, + "learning_rate": 8.185443410031613e-06, + "loss": 0.1587, + "step": 989 + }, + { + "epoch": 0.30215168625057226, + "grad_norm": 1.3923715489039352, + "learning_rate": 8.181630852596378e-06, + "loss": 0.5496, + "step": 990 + }, + { + "epoch": 0.3024568899740577, + "grad_norm": 1.3696798223323383, + "learning_rate": 8.17781518405327e-06, + "loss": 0.2037, + "step": 991 + }, + { + "epoch": 0.3027620936975431, + "grad_norm": 1.6749786375590874, + "learning_rate": 8.173996408133382e-06, + "loss": 0.523, + "step": 992 + }, + { + "epoch": 0.30306729742102856, + "grad_norm": 1.129331462466354, + "learning_rate": 8.170174528570845e-06, + "loss": 0.2696, + "step": 993 + }, + { + "epoch": 0.303372501144514, + "grad_norm": 1.4335553322615482, + "learning_rate": 8.16634954910282e-06, + "loss": 0.3102, + "step": 994 + }, + { + "epoch": 0.3036777048679994, + "grad_norm": 1.9054489954275935, + "learning_rate": 8.162521473469509e-06, + "loss": 0.4552, + "step": 995 + }, + { + "epoch": 0.3039829085914848, + "grad_norm": 1.8579421733648953, + "learning_rate": 8.158690305414132e-06, + "loss": 0.3536, + "step": 996 + }, + { + "epoch": 0.3042881123149702, + "grad_norm": 2.2709514225577223, + "learning_rate": 8.154856048682938e-06, + "loss": 0.4632, + "step": 997 + }, + { + "epoch": 0.3045933160384557, + "grad_norm": 1.5737905369537974, + "learning_rate": 8.151018707025194e-06, + "loss": 0.2503, + "step": 998 + }, + { + "epoch": 0.3048985197619411, + "grad_norm": 1.298904393760468, + "learning_rate": 8.147178284193185e-06, + "loss": 0.3482, + "step": 999 + }, + { + "epoch": 0.3052037234854265, + "grad_norm": 2.3561598742827132, + "learning_rate": 8.143334783942207e-06, + "loss": 0.3419, + "step": 1000 + }, + { + "epoch": 0.30550892720891193, + "grad_norm": 1.1397332664109803, + "learning_rate": 8.13948821003057e-06, + "loss": 0.264, + "step": 1001 + }, + { + "epoch": 0.3058141309323974, + "grad_norm": 1.664228189305695, + "learning_rate": 8.135638566219581e-06, + "loss": 0.6021, + "step": 1002 + }, + { + "epoch": 0.3061193346558828, + "grad_norm": 1.2828873662440865, + "learning_rate": 8.131785856273558e-06, + "loss": 0.3529, + "step": 1003 + }, + { + "epoch": 0.3064245383793682, + "grad_norm": 1.54799572968202, + "learning_rate": 8.127930083959813e-06, + "loss": 0.41, + "step": 1004 + }, + { + "epoch": 0.30672974210285364, + "grad_norm": 1.6015124057285708, + "learning_rate": 8.124071253048648e-06, + "loss": 0.5602, + "step": 1005 + }, + { + "epoch": 0.30703494582633906, + "grad_norm": 1.5019375684946967, + "learning_rate": 8.120209367313364e-06, + "loss": 0.5064, + "step": 1006 + }, + { + "epoch": 0.3073401495498245, + "grad_norm": 1.4841104372065577, + "learning_rate": 8.116344430530248e-06, + "loss": 0.2734, + "step": 1007 + }, + { + "epoch": 0.30764535327330994, + "grad_norm": 1.4979575079530576, + "learning_rate": 8.112476446478561e-06, + "loss": 0.4476, + "step": 1008 + }, + { + "epoch": 0.30795055699679535, + "grad_norm": 1.48012915266827, + "learning_rate": 8.108605418940555e-06, + "loss": 0.3478, + "step": 1009 + }, + { + "epoch": 0.30825576072028077, + "grad_norm": 1.7441608560206838, + "learning_rate": 8.104731351701456e-06, + "loss": 0.4198, + "step": 1010 + }, + { + "epoch": 0.30856096444376624, + "grad_norm": 1.6603083282826114, + "learning_rate": 8.100854248549453e-06, + "loss": 0.2977, + "step": 1011 + }, + { + "epoch": 0.30886616816725165, + "grad_norm": 1.4905130468269074, + "learning_rate": 8.096974113275716e-06, + "loss": 0.4249, + "step": 1012 + }, + { + "epoch": 0.30917137189073707, + "grad_norm": 1.3913148176340926, + "learning_rate": 8.093090949674373e-06, + "loss": 0.3232, + "step": 1013 + }, + { + "epoch": 0.3094765756142225, + "grad_norm": 1.896351174136801, + "learning_rate": 8.089204761542515e-06, + "loss": 0.6316, + "step": 1014 + }, + { + "epoch": 0.3097817793377079, + "grad_norm": 1.3098254883257765, + "learning_rate": 8.08531555268019e-06, + "loss": 0.3526, + "step": 1015 + }, + { + "epoch": 0.31008698306119337, + "grad_norm": 1.2414711860204384, + "learning_rate": 8.081423326890399e-06, + "loss": 0.2527, + "step": 1016 + }, + { + "epoch": 0.3103921867846788, + "grad_norm": 1.651694240025547, + "learning_rate": 8.077528087979095e-06, + "loss": 0.4595, + "step": 1017 + }, + { + "epoch": 0.3106973905081642, + "grad_norm": 1.2490722074076572, + "learning_rate": 8.073629839755177e-06, + "loss": 0.3143, + "step": 1018 + }, + { + "epoch": 0.3110025942316496, + "grad_norm": 1.4922248875592679, + "learning_rate": 8.069728586030483e-06, + "loss": 0.4735, + "step": 1019 + }, + { + "epoch": 0.3113077979551351, + "grad_norm": 1.3134160251209652, + "learning_rate": 8.065824330619796e-06, + "loss": 0.3399, + "step": 1020 + }, + { + "epoch": 0.3116130016786205, + "grad_norm": 1.3819445456159205, + "learning_rate": 8.061917077340828e-06, + "loss": 0.4378, + "step": 1021 + }, + { + "epoch": 0.3119182054021059, + "grad_norm": 1.3067268833489771, + "learning_rate": 8.05800683001423e-06, + "loss": 0.3642, + "step": 1022 + }, + { + "epoch": 0.3122234091255913, + "grad_norm": 1.2386206994963245, + "learning_rate": 8.054093592463572e-06, + "loss": 0.3116, + "step": 1023 + }, + { + "epoch": 0.31252861284907674, + "grad_norm": 1.5821932486019623, + "learning_rate": 8.050177368515353e-06, + "loss": 0.3499, + "step": 1024 + }, + { + "epoch": 0.3128338165725622, + "grad_norm": 1.2768808628569455, + "learning_rate": 8.046258161998993e-06, + "loss": 0.1659, + "step": 1025 + }, + { + "epoch": 0.3131390202960476, + "grad_norm": 1.2575220756033998, + "learning_rate": 8.042335976746822e-06, + "loss": 0.2405, + "step": 1026 + }, + { + "epoch": 0.31344422401953304, + "grad_norm": 1.8245135116139561, + "learning_rate": 8.038410816594093e-06, + "loss": 0.4366, + "step": 1027 + }, + { + "epoch": 0.31374942774301845, + "grad_norm": 1.4466165793811705, + "learning_rate": 8.034482685378959e-06, + "loss": 0.4082, + "step": 1028 + }, + { + "epoch": 0.31405463146650386, + "grad_norm": 1.5442738868035615, + "learning_rate": 8.030551586942484e-06, + "loss": 0.4539, + "step": 1029 + }, + { + "epoch": 0.31435983518998933, + "grad_norm": 1.2964103969559602, + "learning_rate": 8.026617525128628e-06, + "loss": 0.3801, + "step": 1030 + }, + { + "epoch": 0.31466503891347475, + "grad_norm": 1.872123164314918, + "learning_rate": 8.022680503784252e-06, + "loss": 0.3698, + "step": 1031 + }, + { + "epoch": 0.31497024263696016, + "grad_norm": 1.3752871993705764, + "learning_rate": 8.018740526759115e-06, + "loss": 0.3825, + "step": 1032 + }, + { + "epoch": 0.3152754463604456, + "grad_norm": 0.9331829492009497, + "learning_rate": 8.014797597905856e-06, + "loss": 0.2683, + "step": 1033 + }, + { + "epoch": 0.31558065008393105, + "grad_norm": 1.5771299945709583, + "learning_rate": 8.01085172108001e-06, + "loss": 0.5906, + "step": 1034 + }, + { + "epoch": 0.31588585380741646, + "grad_norm": 1.239300714325851, + "learning_rate": 8.006902900139989e-06, + "loss": 0.3648, + "step": 1035 + }, + { + "epoch": 0.3161910575309019, + "grad_norm": 1.058310326503882, + "learning_rate": 8.002951138947085e-06, + "loss": 0.228, + "step": 1036 + }, + { + "epoch": 0.3164962612543873, + "grad_norm": 1.1824475268280543, + "learning_rate": 7.99899644136547e-06, + "loss": 0.2105, + "step": 1037 + }, + { + "epoch": 0.3168014649778727, + "grad_norm": 1.5959459771356375, + "learning_rate": 7.995038811262176e-06, + "loss": 0.4822, + "step": 1038 + }, + { + "epoch": 0.3171066687013582, + "grad_norm": 1.0890532877681574, + "learning_rate": 7.991078252507117e-06, + "loss": 0.301, + "step": 1039 + }, + { + "epoch": 0.3174118724248436, + "grad_norm": 1.5858524092975068, + "learning_rate": 7.987114768973059e-06, + "loss": 0.3329, + "step": 1040 + }, + { + "epoch": 0.317717076148329, + "grad_norm": 1.2871444766825402, + "learning_rate": 7.983148364535633e-06, + "loss": 0.3842, + "step": 1041 + }, + { + "epoch": 0.3180222798718144, + "grad_norm": 1.6762774250879164, + "learning_rate": 7.979179043073327e-06, + "loss": 0.6428, + "step": 1042 + }, + { + "epoch": 0.3183274835952999, + "grad_norm": 1.1260557519041783, + "learning_rate": 7.975206808467477e-06, + "loss": 0.2912, + "step": 1043 + }, + { + "epoch": 0.3186326873187853, + "grad_norm": 1.4814440250076806, + "learning_rate": 7.971231664602273e-06, + "loss": 0.428, + "step": 1044 + }, + { + "epoch": 0.3189378910422707, + "grad_norm": 1.5470069289542376, + "learning_rate": 7.967253615364746e-06, + "loss": 0.326, + "step": 1045 + }, + { + "epoch": 0.31924309476575613, + "grad_norm": 1.6492330791767074, + "learning_rate": 7.963272664644765e-06, + "loss": 0.4851, + "step": 1046 + }, + { + "epoch": 0.31954829848924154, + "grad_norm": 1.1387453660755529, + "learning_rate": 7.95928881633505e-06, + "loss": 0.1819, + "step": 1047 + }, + { + "epoch": 0.319853502212727, + "grad_norm": 1.528342152399863, + "learning_rate": 7.955302074331136e-06, + "loss": 0.2465, + "step": 1048 + }, + { + "epoch": 0.32015870593621243, + "grad_norm": 1.3953796712292619, + "learning_rate": 7.9513124425314e-06, + "loss": 0.3372, + "step": 1049 + }, + { + "epoch": 0.32046390965969784, + "grad_norm": 1.5878443787469112, + "learning_rate": 7.94731992483704e-06, + "loss": 0.3593, + "step": 1050 + }, + { + "epoch": 0.32076911338318326, + "grad_norm": 1.5355544022885335, + "learning_rate": 7.943324525152077e-06, + "loss": 0.3395, + "step": 1051 + }, + { + "epoch": 0.3210743171066687, + "grad_norm": 1.2530592675223244, + "learning_rate": 7.93932624738335e-06, + "loss": 0.2164, + "step": 1052 + }, + { + "epoch": 0.32137952083015414, + "grad_norm": 4.526521517561823, + "learning_rate": 7.935325095440511e-06, + "loss": 0.2893, + "step": 1053 + }, + { + "epoch": 0.32168472455363956, + "grad_norm": 1.4025910565877562, + "learning_rate": 7.931321073236024e-06, + "loss": 0.3703, + "step": 1054 + }, + { + "epoch": 0.32198992827712497, + "grad_norm": 1.6381535397997637, + "learning_rate": 7.927314184685164e-06, + "loss": 0.3823, + "step": 1055 + }, + { + "epoch": 0.3222951320006104, + "grad_norm": 1.6423873881343087, + "learning_rate": 7.923304433705999e-06, + "loss": 0.3951, + "step": 1056 + }, + { + "epoch": 0.32260033572409585, + "grad_norm": 1.8735606709031154, + "learning_rate": 7.919291824219402e-06, + "loss": 0.2828, + "step": 1057 + }, + { + "epoch": 0.32290553944758127, + "grad_norm": 1.3931902680953308, + "learning_rate": 7.91527636014904e-06, + "loss": 0.2629, + "step": 1058 + }, + { + "epoch": 0.3232107431710667, + "grad_norm": 1.3616683731784345, + "learning_rate": 7.911258045421374e-06, + "loss": 0.3049, + "step": 1059 + }, + { + "epoch": 0.3235159468945521, + "grad_norm": 1.8477138002235685, + "learning_rate": 7.90723688396565e-06, + "loss": 0.5091, + "step": 1060 + }, + { + "epoch": 0.32382115061803757, + "grad_norm": 2.1715762804280048, + "learning_rate": 7.903212879713894e-06, + "loss": 0.3296, + "step": 1061 + }, + { + "epoch": 0.324126354341523, + "grad_norm": 1.2259722218688094, + "learning_rate": 7.89918603660092e-06, + "loss": 0.2084, + "step": 1062 + }, + { + "epoch": 0.3244315580650084, + "grad_norm": 1.5260897510578109, + "learning_rate": 7.895156358564308e-06, + "loss": 0.3144, + "step": 1063 + }, + { + "epoch": 0.3247367617884938, + "grad_norm": 1.2101539291645254, + "learning_rate": 7.891123849544421e-06, + "loss": 0.2139, + "step": 1064 + }, + { + "epoch": 0.3250419655119792, + "grad_norm": 1.4587532074802312, + "learning_rate": 7.887088513484383e-06, + "loss": 0.3954, + "step": 1065 + }, + { + "epoch": 0.3253471692354647, + "grad_norm": 1.6684881111981207, + "learning_rate": 7.883050354330085e-06, + "loss": 0.3655, + "step": 1066 + }, + { + "epoch": 0.3256523729589501, + "grad_norm": 1.307546988348603, + "learning_rate": 7.879009376030173e-06, + "loss": 0.2016, + "step": 1067 + }, + { + "epoch": 0.3259575766824355, + "grad_norm": 1.382846724092064, + "learning_rate": 7.87496558253606e-06, + "loss": 0.3516, + "step": 1068 + }, + { + "epoch": 0.32626278040592094, + "grad_norm": 1.420119155129035, + "learning_rate": 7.870918977801902e-06, + "loss": 0.4247, + "step": 1069 + }, + { + "epoch": 0.32656798412940635, + "grad_norm": 2.0170283200834227, + "learning_rate": 7.866869565784611e-06, + "loss": 0.4074, + "step": 1070 + }, + { + "epoch": 0.3268731878528918, + "grad_norm": 1.3132837770926757, + "learning_rate": 7.86281735044384e-06, + "loss": 0.3554, + "step": 1071 + }, + { + "epoch": 0.32717839157637724, + "grad_norm": 1.491650229990386, + "learning_rate": 7.858762335741984e-06, + "loss": 0.4311, + "step": 1072 + }, + { + "epoch": 0.32748359529986265, + "grad_norm": 1.3015830744734889, + "learning_rate": 7.854704525644175e-06, + "loss": 0.4368, + "step": 1073 + }, + { + "epoch": 0.32778879902334807, + "grad_norm": 1.240655590227942, + "learning_rate": 7.85064392411828e-06, + "loss": 0.3503, + "step": 1074 + }, + { + "epoch": 0.32809400274683354, + "grad_norm": 1.0370573041527442, + "learning_rate": 7.846580535134895e-06, + "loss": 0.2173, + "step": 1075 + }, + { + "epoch": 0.32839920647031895, + "grad_norm": 1.490198377421961, + "learning_rate": 7.842514362667341e-06, + "loss": 0.4797, + "step": 1076 + }, + { + "epoch": 0.32870441019380436, + "grad_norm": 1.2483676742868937, + "learning_rate": 7.83844541069166e-06, + "loss": 0.3117, + "step": 1077 + }, + { + "epoch": 0.3290096139172898, + "grad_norm": 1.1118596151916136, + "learning_rate": 7.834373683186614e-06, + "loss": 0.3425, + "step": 1078 + }, + { + "epoch": 0.3293148176407752, + "grad_norm": 1.1498236044067436, + "learning_rate": 7.830299184133676e-06, + "loss": 0.3116, + "step": 1079 + }, + { + "epoch": 0.32962002136426066, + "grad_norm": 1.330890961706903, + "learning_rate": 7.826221917517034e-06, + "loss": 0.326, + "step": 1080 + }, + { + "epoch": 0.3299252250877461, + "grad_norm": 1.1391107096366442, + "learning_rate": 7.822141887323577e-06, + "loss": 0.2253, + "step": 1081 + }, + { + "epoch": 0.3302304288112315, + "grad_norm": 1.0899804689249892, + "learning_rate": 7.8180590975429e-06, + "loss": 0.3972, + "step": 1082 + }, + { + "epoch": 0.3305356325347169, + "grad_norm": 1.1421995399769407, + "learning_rate": 7.813973552167293e-06, + "loss": 0.1951, + "step": 1083 + }, + { + "epoch": 0.3308408362582024, + "grad_norm": 1.6301828043180877, + "learning_rate": 7.809885255191745e-06, + "loss": 0.5203, + "step": 1084 + }, + { + "epoch": 0.3311460399816878, + "grad_norm": 1.1015959389663672, + "learning_rate": 7.805794210613934e-06, + "loss": 0.1868, + "step": 1085 + }, + { + "epoch": 0.3314512437051732, + "grad_norm": 1.0719373857232206, + "learning_rate": 7.801700422434218e-06, + "loss": 0.3546, + "step": 1086 + }, + { + "epoch": 0.3317564474286586, + "grad_norm": 1.7069619729935679, + "learning_rate": 7.79760389465565e-06, + "loss": 0.2883, + "step": 1087 + }, + { + "epoch": 0.33206165115214403, + "grad_norm": 1.1374914944508905, + "learning_rate": 7.793504631283952e-06, + "loss": 0.2634, + "step": 1088 + }, + { + "epoch": 0.3323668548756295, + "grad_norm": 1.407791140920363, + "learning_rate": 7.789402636327526e-06, + "loss": 0.4067, + "step": 1089 + }, + { + "epoch": 0.3326720585991149, + "grad_norm": 1.1160900354196726, + "learning_rate": 7.785297913797441e-06, + "loss": 0.2304, + "step": 1090 + }, + { + "epoch": 0.33297726232260033, + "grad_norm": 1.2290853807544064, + "learning_rate": 7.78119046770744e-06, + "loss": 0.3382, + "step": 1091 + }, + { + "epoch": 0.33328246604608575, + "grad_norm": 1.582064705954116, + "learning_rate": 7.777080302073922e-06, + "loss": 0.4928, + "step": 1092 + }, + { + "epoch": 0.3335876697695712, + "grad_norm": 1.1020461299123707, + "learning_rate": 7.772967420915948e-06, + "loss": 0.2539, + "step": 1093 + }, + { + "epoch": 0.33389287349305663, + "grad_norm": 1.2964060291640898, + "learning_rate": 7.768851828255233e-06, + "loss": 0.3398, + "step": 1094 + }, + { + "epoch": 0.33419807721654204, + "grad_norm": 1.2035385015753215, + "learning_rate": 7.764733528116148e-06, + "loss": 0.2841, + "step": 1095 + }, + { + "epoch": 0.33450328094002746, + "grad_norm": 1.4026090323213516, + "learning_rate": 7.760612524525708e-06, + "loss": 0.3719, + "step": 1096 + }, + { + "epoch": 0.3348084846635129, + "grad_norm": 1.44136903296084, + "learning_rate": 7.75648882151357e-06, + "loss": 0.3998, + "step": 1097 + }, + { + "epoch": 0.33511368838699834, + "grad_norm": 1.491339317646278, + "learning_rate": 7.752362423112032e-06, + "loss": 0.4328, + "step": 1098 + }, + { + "epoch": 0.33541889211048376, + "grad_norm": 1.0421144222506495, + "learning_rate": 7.74823333335603e-06, + "loss": 0.1908, + "step": 1099 + }, + { + "epoch": 0.33572409583396917, + "grad_norm": 1.2708409522823558, + "learning_rate": 7.744101556283129e-06, + "loss": 0.3129, + "step": 1100 + }, + { + "epoch": 0.3360292995574546, + "grad_norm": 1.6982843745883154, + "learning_rate": 7.73996709593352e-06, + "loss": 0.4516, + "step": 1101 + }, + { + "epoch": 0.33633450328094, + "grad_norm": 1.3745112417433676, + "learning_rate": 7.735829956350026e-06, + "loss": 0.3835, + "step": 1102 + }, + { + "epoch": 0.33663970700442547, + "grad_norm": 1.0289081222992171, + "learning_rate": 7.731690141578076e-06, + "loss": 0.23, + "step": 1103 + }, + { + "epoch": 0.3369449107279109, + "grad_norm": 1.3309142499739386, + "learning_rate": 7.72754765566573e-06, + "loss": 0.2879, + "step": 1104 + }, + { + "epoch": 0.3372501144513963, + "grad_norm": 1.7026299273281889, + "learning_rate": 7.72340250266365e-06, + "loss": 0.6118, + "step": 1105 + }, + { + "epoch": 0.3375553181748817, + "grad_norm": 1.3708586314610167, + "learning_rate": 7.71925468662511e-06, + "loss": 0.3695, + "step": 1106 + }, + { + "epoch": 0.3378605218983672, + "grad_norm": 1.138227058539724, + "learning_rate": 7.715104211605987e-06, + "loss": 0.3156, + "step": 1107 + }, + { + "epoch": 0.3381657256218526, + "grad_norm": 1.6312680765955443, + "learning_rate": 7.710951081664758e-06, + "loss": 0.3716, + "step": 1108 + }, + { + "epoch": 0.338470929345338, + "grad_norm": 1.43186060111883, + "learning_rate": 7.706795300862493e-06, + "loss": 0.4303, + "step": 1109 + }, + { + "epoch": 0.3387761330688234, + "grad_norm": 1.295302897418865, + "learning_rate": 7.702636873262859e-06, + "loss": 0.2334, + "step": 1110 + }, + { + "epoch": 0.33908133679230884, + "grad_norm": 1.2674378022051007, + "learning_rate": 7.69847580293211e-06, + "loss": 0.3056, + "step": 1111 + }, + { + "epoch": 0.3393865405157943, + "grad_norm": 1.326251694716828, + "learning_rate": 7.694312093939084e-06, + "loss": 0.2555, + "step": 1112 + }, + { + "epoch": 0.3396917442392797, + "grad_norm": 1.525503000572152, + "learning_rate": 7.690145750355198e-06, + "loss": 0.3448, + "step": 1113 + }, + { + "epoch": 0.33999694796276514, + "grad_norm": 1.3733356980578115, + "learning_rate": 7.685976776254446e-06, + "loss": 0.2901, + "step": 1114 + }, + { + "epoch": 0.34030215168625055, + "grad_norm": 1.23597237288206, + "learning_rate": 7.681805175713392e-06, + "loss": 0.2144, + "step": 1115 + }, + { + "epoch": 0.340607355409736, + "grad_norm": 1.5777935019830391, + "learning_rate": 7.677630952811172e-06, + "loss": 0.5106, + "step": 1116 + }, + { + "epoch": 0.34091255913322144, + "grad_norm": 1.171188713025467, + "learning_rate": 7.673454111629486e-06, + "loss": 0.1928, + "step": 1117 + }, + { + "epoch": 0.34121776285670685, + "grad_norm": 1.3533361626972884, + "learning_rate": 7.669274656252588e-06, + "loss": 0.2716, + "step": 1118 + }, + { + "epoch": 0.34152296658019227, + "grad_norm": 1.3706061645549534, + "learning_rate": 7.665092590767298e-06, + "loss": 0.2415, + "step": 1119 + }, + { + "epoch": 0.3418281703036777, + "grad_norm": 1.444083686347945, + "learning_rate": 7.66090791926298e-06, + "loss": 0.2384, + "step": 1120 + }, + { + "epoch": 0.34213337402716315, + "grad_norm": 1.7114127557234347, + "learning_rate": 7.65672064583155e-06, + "loss": 0.422, + "step": 1121 + }, + { + "epoch": 0.34243857775064857, + "grad_norm": 1.5259644507270844, + "learning_rate": 7.652530774567468e-06, + "loss": 0.5226, + "step": 1122 + }, + { + "epoch": 0.342743781474134, + "grad_norm": 0.9911746164065847, + "learning_rate": 7.648338309567735e-06, + "loss": 0.2267, + "step": 1123 + }, + { + "epoch": 0.3430489851976194, + "grad_norm": 1.0166151648645703, + "learning_rate": 7.644143254931887e-06, + "loss": 0.2104, + "step": 1124 + }, + { + "epoch": 0.34335418892110486, + "grad_norm": 1.6720832623689992, + "learning_rate": 7.63994561476199e-06, + "loss": 0.5396, + "step": 1125 + }, + { + "epoch": 0.3436593926445903, + "grad_norm": 1.5678839432039253, + "learning_rate": 7.635745393162643e-06, + "loss": 0.4492, + "step": 1126 + }, + { + "epoch": 0.3439645963680757, + "grad_norm": 1.8116112207083335, + "learning_rate": 7.631542594240968e-06, + "loss": 0.3977, + "step": 1127 + }, + { + "epoch": 0.3442698000915611, + "grad_norm": 1.473465027051407, + "learning_rate": 7.627337222106602e-06, + "loss": 0.3924, + "step": 1128 + }, + { + "epoch": 0.3445750038150465, + "grad_norm": 1.8018368091168058, + "learning_rate": 7.623129280871706e-06, + "loss": 0.6324, + "step": 1129 + }, + { + "epoch": 0.344880207538532, + "grad_norm": 1.2871463289874823, + "learning_rate": 7.618918774650947e-06, + "loss": 0.317, + "step": 1130 + }, + { + "epoch": 0.3451854112620174, + "grad_norm": 1.3104766508523527, + "learning_rate": 7.614705707561505e-06, + "loss": 0.1865, + "step": 1131 + }, + { + "epoch": 0.3454906149855028, + "grad_norm": 1.4958005772283107, + "learning_rate": 7.610490083723059e-06, + "loss": 0.3286, + "step": 1132 + }, + { + "epoch": 0.34579581870898823, + "grad_norm": 1.3334807622385083, + "learning_rate": 7.606271907257793e-06, + "loss": 0.1803, + "step": 1133 + }, + { + "epoch": 0.3461010224324737, + "grad_norm": 1.58655509762167, + "learning_rate": 7.602051182290382e-06, + "loss": 0.428, + "step": 1134 + }, + { + "epoch": 0.3464062261559591, + "grad_norm": 1.6285506850668747, + "learning_rate": 7.597827912947998e-06, + "loss": 0.3046, + "step": 1135 + }, + { + "epoch": 0.34671142987944453, + "grad_norm": 1.3766123681447917, + "learning_rate": 7.593602103360298e-06, + "loss": 0.2579, + "step": 1136 + }, + { + "epoch": 0.34701663360292995, + "grad_norm": 1.7542194180322948, + "learning_rate": 7.589373757659424e-06, + "loss": 0.2563, + "step": 1137 + }, + { + "epoch": 0.34732183732641536, + "grad_norm": 1.1682456208270866, + "learning_rate": 7.585142879979998e-06, + "loss": 0.3068, + "step": 1138 + }, + { + "epoch": 0.34762704104990083, + "grad_norm": 1.1214377908787607, + "learning_rate": 7.580909474459117e-06, + "loss": 0.1778, + "step": 1139 + }, + { + "epoch": 0.34793224477338625, + "grad_norm": 1.5446511669234666, + "learning_rate": 7.576673545236349e-06, + "loss": 0.2251, + "step": 1140 + }, + { + "epoch": 0.34823744849687166, + "grad_norm": 1.2705911290898333, + "learning_rate": 7.572435096453734e-06, + "loss": 0.2938, + "step": 1141 + }, + { + "epoch": 0.3485426522203571, + "grad_norm": 1.493297862812399, + "learning_rate": 7.5681941322557685e-06, + "loss": 0.3066, + "step": 1142 + }, + { + "epoch": 0.3488478559438425, + "grad_norm": 1.0677076510295394, + "learning_rate": 7.563950656789416e-06, + "loss": 0.2338, + "step": 1143 + }, + { + "epoch": 0.34915305966732796, + "grad_norm": 1.2516527217201623, + "learning_rate": 7.559704674204091e-06, + "loss": 0.3347, + "step": 1144 + }, + { + "epoch": 0.3494582633908134, + "grad_norm": 1.702080362504898, + "learning_rate": 7.555456188651665e-06, + "loss": 0.4124, + "step": 1145 + }, + { + "epoch": 0.3497634671142988, + "grad_norm": 1.539828192304715, + "learning_rate": 7.551205204286447e-06, + "loss": 0.462, + "step": 1146 + }, + { + "epoch": 0.3500686708377842, + "grad_norm": 1.3087519763541762, + "learning_rate": 7.546951725265198e-06, + "loss": 0.2483, + "step": 1147 + }, + { + "epoch": 0.35037387456126967, + "grad_norm": 1.43063223220777, + "learning_rate": 7.542695755747116e-06, + "loss": 0.3615, + "step": 1148 + }, + { + "epoch": 0.3506790782847551, + "grad_norm": 1.5523078380591047, + "learning_rate": 7.538437299893836e-06, + "loss": 0.5231, + "step": 1149 + }, + { + "epoch": 0.3509842820082405, + "grad_norm": 1.2101188107160756, + "learning_rate": 7.534176361869418e-06, + "loss": 0.3176, + "step": 1150 + }, + { + "epoch": 0.3512894857317259, + "grad_norm": 1.4239347860105078, + "learning_rate": 7.529912945840359e-06, + "loss": 0.3347, + "step": 1151 + }, + { + "epoch": 0.35159468945521133, + "grad_norm": 1.3610805029033388, + "learning_rate": 7.5256470559755665e-06, + "loss": 0.3266, + "step": 1152 + }, + { + "epoch": 0.3518998931786968, + "grad_norm": 1.6132233172860933, + "learning_rate": 7.521378696446381e-06, + "loss": 0.3327, + "step": 1153 + }, + { + "epoch": 0.3522050969021822, + "grad_norm": 1.2930810424354338, + "learning_rate": 7.517107871426548e-06, + "loss": 0.1709, + "step": 1154 + }, + { + "epoch": 0.3525103006256676, + "grad_norm": 1.6807461972866704, + "learning_rate": 7.512834585092226e-06, + "loss": 0.3168, + "step": 1155 + }, + { + "epoch": 0.35281550434915304, + "grad_norm": 1.7732192926139976, + "learning_rate": 7.508558841621981e-06, + "loss": 0.5005, + "step": 1156 + }, + { + "epoch": 0.3531207080726385, + "grad_norm": 1.4750885241824656, + "learning_rate": 7.504280645196784e-06, + "loss": 0.3684, + "step": 1157 + }, + { + "epoch": 0.3534259117961239, + "grad_norm": 1.5202750221957153, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1899, + "step": 1158 + }, + { + "epoch": 0.35373111551960934, + "grad_norm": 1.1595814112433622, + "learning_rate": 7.495716910217393e-06, + "loss": 0.3105, + "step": 1159 + }, + { + "epoch": 0.35403631924309475, + "grad_norm": 1.4534366232509097, + "learning_rate": 7.491431380037113e-06, + "loss": 0.3465, + "step": 1160 + }, + { + "epoch": 0.35434152296658017, + "grad_norm": 1.156548538845248, + "learning_rate": 7.4871434136497e-06, + "loss": 0.2893, + "step": 1161 + }, + { + "epoch": 0.35464672669006564, + "grad_norm": 1.4538056615012076, + "learning_rate": 7.482853015248075e-06, + "loss": 0.3667, + "step": 1162 + }, + { + "epoch": 0.35495193041355105, + "grad_norm": 1.6556975684961233, + "learning_rate": 7.478560189027536e-06, + "loss": 0.2493, + "step": 1163 + }, + { + "epoch": 0.35525713413703647, + "grad_norm": 1.6844460438765754, + "learning_rate": 7.474264939185756e-06, + "loss": 0.1798, + "step": 1164 + }, + { + "epoch": 0.3555623378605219, + "grad_norm": 1.9475210131391785, + "learning_rate": 7.469967269922777e-06, + "loss": 0.5204, + "step": 1165 + }, + { + "epoch": 0.35586754158400735, + "grad_norm": 2.793715839902533, + "learning_rate": 7.465667185441009e-06, + "loss": 0.3077, + "step": 1166 + }, + { + "epoch": 0.35617274530749277, + "grad_norm": 1.4888770645456522, + "learning_rate": 7.4613646899452205e-06, + "loss": 0.4135, + "step": 1167 + }, + { + "epoch": 0.3564779490309782, + "grad_norm": 1.1987516506211549, + "learning_rate": 7.457059787642541e-06, + "loss": 0.2382, + "step": 1168 + }, + { + "epoch": 0.3567831527544636, + "grad_norm": 1.4813551853352487, + "learning_rate": 7.452752482742452e-06, + "loss": 0.217, + "step": 1169 + }, + { + "epoch": 0.357088356477949, + "grad_norm": 1.6930510736367361, + "learning_rate": 7.448442779456781e-06, + "loss": 0.5214, + "step": 1170 + }, + { + "epoch": 0.3573935602014345, + "grad_norm": 2.190573141270535, + "learning_rate": 7.444130681999708e-06, + "loss": 0.2428, + "step": 1171 + }, + { + "epoch": 0.3576987639249199, + "grad_norm": 1.3779387713699958, + "learning_rate": 7.439816194587748e-06, + "loss": 0.3164, + "step": 1172 + }, + { + "epoch": 0.3580039676484053, + "grad_norm": 1.092055070131237, + "learning_rate": 7.435499321439754e-06, + "loss": 0.2772, + "step": 1173 + }, + { + "epoch": 0.3583091713718907, + "grad_norm": 1.3792533977854593, + "learning_rate": 7.431180066776913e-06, + "loss": 0.3562, + "step": 1174 + }, + { + "epoch": 0.35861437509537614, + "grad_norm": 1.2656833023017504, + "learning_rate": 7.426858434822742e-06, + "loss": 0.2438, + "step": 1175 + }, + { + "epoch": 0.3589195788188616, + "grad_norm": 1.4217271832774543, + "learning_rate": 7.42253442980308e-06, + "loss": 0.4452, + "step": 1176 + }, + { + "epoch": 0.359224782542347, + "grad_norm": 1.2647369471621235, + "learning_rate": 7.418208055946088e-06, + "loss": 0.3769, + "step": 1177 + }, + { + "epoch": 0.35952998626583244, + "grad_norm": 1.5573904799259919, + "learning_rate": 7.413879317482242e-06, + "loss": 0.2497, + "step": 1178 + }, + { + "epoch": 0.35983518998931785, + "grad_norm": 1.1158649645978311, + "learning_rate": 7.409548218644332e-06, + "loss": 0.3706, + "step": 1179 + }, + { + "epoch": 0.3601403937128033, + "grad_norm": 1.325219870264579, + "learning_rate": 7.4052147636674545e-06, + "loss": 0.2988, + "step": 1180 + }, + { + "epoch": 0.36044559743628873, + "grad_norm": 1.339413943736962, + "learning_rate": 7.400878956789011e-06, + "loss": 0.3791, + "step": 1181 + }, + { + "epoch": 0.36075080115977415, + "grad_norm": 1.3236780103464618, + "learning_rate": 7.396540802248704e-06, + "loss": 0.3051, + "step": 1182 + }, + { + "epoch": 0.36105600488325956, + "grad_norm": 1.4621900605001774, + "learning_rate": 7.392200304288526e-06, + "loss": 0.4029, + "step": 1183 + }, + { + "epoch": 0.361361208606745, + "grad_norm": 1.3004193418247747, + "learning_rate": 7.387857467152767e-06, + "loss": 0.3573, + "step": 1184 + }, + { + "epoch": 0.36166641233023045, + "grad_norm": 1.3403584017356904, + "learning_rate": 7.383512295088002e-06, + "loss": 0.301, + "step": 1185 + }, + { + "epoch": 0.36197161605371586, + "grad_norm": 1.3326412828848118, + "learning_rate": 7.379164792343091e-06, + "loss": 0.2346, + "step": 1186 + }, + { + "epoch": 0.3622768197772013, + "grad_norm": 1.3231894845074195, + "learning_rate": 7.37481496316917e-06, + "loss": 0.3924, + "step": 1187 + }, + { + "epoch": 0.3625820235006867, + "grad_norm": 1.2230476508538641, + "learning_rate": 7.370462811819651e-06, + "loss": 0.3316, + "step": 1188 + }, + { + "epoch": 0.36288722722417216, + "grad_norm": 1.6324841474899627, + "learning_rate": 7.366108342550217e-06, + "loss": 0.2683, + "step": 1189 + }, + { + "epoch": 0.3631924309476576, + "grad_norm": 1.1650127142511193, + "learning_rate": 7.361751559618819e-06, + "loss": 0.3151, + "step": 1190 + }, + { + "epoch": 0.363497634671143, + "grad_norm": 1.7617352605413967, + "learning_rate": 7.357392467285667e-06, + "loss": 0.3985, + "step": 1191 + }, + { + "epoch": 0.3638028383946284, + "grad_norm": 1.6177177849776165, + "learning_rate": 7.353031069813231e-06, + "loss": 0.3853, + "step": 1192 + }, + { + "epoch": 0.3641080421181138, + "grad_norm": 1.4672814189844252, + "learning_rate": 7.348667371466238e-06, + "loss": 0.3228, + "step": 1193 + }, + { + "epoch": 0.3644132458415993, + "grad_norm": 1.429990475762935, + "learning_rate": 7.344301376511659e-06, + "loss": 0.3216, + "step": 1194 + }, + { + "epoch": 0.3647184495650847, + "grad_norm": 1.2518101936436028, + "learning_rate": 7.339933089218716e-06, + "loss": 0.3541, + "step": 1195 + }, + { + "epoch": 0.3650236532885701, + "grad_norm": 1.5100278244233902, + "learning_rate": 7.335562513858868e-06, + "loss": 0.3777, + "step": 1196 + }, + { + "epoch": 0.36532885701205553, + "grad_norm": 1.3864156902279452, + "learning_rate": 7.331189654705816e-06, + "loss": 0.3451, + "step": 1197 + }, + { + "epoch": 0.365634060735541, + "grad_norm": 1.3148051409795818, + "learning_rate": 7.326814516035491e-06, + "loss": 0.2568, + "step": 1198 + }, + { + "epoch": 0.3659392644590264, + "grad_norm": 1.1783966682135776, + "learning_rate": 7.322437102126052e-06, + "loss": 0.409, + "step": 1199 + }, + { + "epoch": 0.36624446818251183, + "grad_norm": 1.3482269405611969, + "learning_rate": 7.318057417257886e-06, + "loss": 0.4714, + "step": 1200 + }, + { + "epoch": 0.36654967190599724, + "grad_norm": 1.6310707549202617, + "learning_rate": 7.3136754657136e-06, + "loss": 0.2499, + "step": 1201 + }, + { + "epoch": 0.36685487562948266, + "grad_norm": 1.039768581410541, + "learning_rate": 7.309291251778016e-06, + "loss": 0.272, + "step": 1202 + }, + { + "epoch": 0.3671600793529681, + "grad_norm": 0.9895520991699935, + "learning_rate": 7.304904779738169e-06, + "loss": 0.2435, + "step": 1203 + }, + { + "epoch": 0.36746528307645354, + "grad_norm": 1.229493544913369, + "learning_rate": 7.300516053883301e-06, + "loss": 0.3142, + "step": 1204 + }, + { + "epoch": 0.36777048679993896, + "grad_norm": 1.4306220663631386, + "learning_rate": 7.29612507850486e-06, + "loss": 0.3279, + "step": 1205 + }, + { + "epoch": 0.36807569052342437, + "grad_norm": 1.6062925874875447, + "learning_rate": 7.291731857896493e-06, + "loss": 0.4255, + "step": 1206 + }, + { + "epoch": 0.3683808942469098, + "grad_norm": 1.268726927448817, + "learning_rate": 7.287336396354039e-06, + "loss": 0.3437, + "step": 1207 + }, + { + "epoch": 0.36868609797039525, + "grad_norm": 1.6882520342147596, + "learning_rate": 7.282938698175535e-06, + "loss": 0.5873, + "step": 1208 + }, + { + "epoch": 0.36899130169388067, + "grad_norm": 1.0404068703903802, + "learning_rate": 7.278538767661198e-06, + "loss": 0.2389, + "step": 1209 + }, + { + "epoch": 0.3692965054173661, + "grad_norm": 1.1653062972796002, + "learning_rate": 7.274136609113433e-06, + "loss": 0.2512, + "step": 1210 + }, + { + "epoch": 0.3696017091408515, + "grad_norm": 1.3873057667640636, + "learning_rate": 7.269732226836821e-06, + "loss": 0.3267, + "step": 1211 + }, + { + "epoch": 0.36990691286433697, + "grad_norm": 1.5224889687825975, + "learning_rate": 7.265325625138119e-06, + "loss": 0.3719, + "step": 1212 + }, + { + "epoch": 0.3702121165878224, + "grad_norm": 1.2204715600675486, + "learning_rate": 7.260916808326252e-06, + "loss": 0.2686, + "step": 1213 + }, + { + "epoch": 0.3705173203113078, + "grad_norm": 1.3842524558223457, + "learning_rate": 7.256505780712314e-06, + "loss": 0.2467, + "step": 1214 + }, + { + "epoch": 0.3708225240347932, + "grad_norm": 1.7657099340484863, + "learning_rate": 7.252092546609558e-06, + "loss": 0.2798, + "step": 1215 + }, + { + "epoch": 0.3711277277582786, + "grad_norm": 1.3329438693740925, + "learning_rate": 7.247677110333397e-06, + "loss": 0.2828, + "step": 1216 + }, + { + "epoch": 0.3714329314817641, + "grad_norm": 1.104320779277888, + "learning_rate": 7.2432594762013945e-06, + "loss": 0.3003, + "step": 1217 + }, + { + "epoch": 0.3717381352052495, + "grad_norm": 1.2445465814923657, + "learning_rate": 7.238839648533266e-06, + "loss": 0.2837, + "step": 1218 + }, + { + "epoch": 0.3720433389287349, + "grad_norm": 1.540336275418335, + "learning_rate": 7.234417631650872e-06, + "loss": 0.5497, + "step": 1219 + }, + { + "epoch": 0.37234854265222034, + "grad_norm": 1.2167853145142045, + "learning_rate": 7.2299934298782105e-06, + "loss": 0.2327, + "step": 1220 + }, + { + "epoch": 0.3726537463757058, + "grad_norm": 1.6897903723185226, + "learning_rate": 7.225567047541418e-06, + "loss": 0.4309, + "step": 1221 + }, + { + "epoch": 0.3729589500991912, + "grad_norm": 1.3675420682807005, + "learning_rate": 7.221138488968763e-06, + "loss": 0.3553, + "step": 1222 + }, + { + "epoch": 0.37326415382267664, + "grad_norm": 1.1874100500472131, + "learning_rate": 7.2167077584906416e-06, + "loss": 0.2332, + "step": 1223 + }, + { + "epoch": 0.37356935754616205, + "grad_norm": 1.7310995153583675, + "learning_rate": 7.2122748604395765e-06, + "loss": 0.3075, + "step": 1224 + }, + { + "epoch": 0.37387456126964747, + "grad_norm": 1.3940442942063533, + "learning_rate": 7.207839799150206e-06, + "loss": 0.43, + "step": 1225 + }, + { + "epoch": 0.37417976499313294, + "grad_norm": 1.6425564976559994, + "learning_rate": 7.203402578959285e-06, + "loss": 0.5069, + "step": 1226 + }, + { + "epoch": 0.37448496871661835, + "grad_norm": 0.9576849748308918, + "learning_rate": 7.1989632042056816e-06, + "loss": 0.2351, + "step": 1227 + }, + { + "epoch": 0.37479017244010376, + "grad_norm": 1.3701544258348275, + "learning_rate": 7.1945216792303666e-06, + "loss": 0.3381, + "step": 1228 + }, + { + "epoch": 0.3750953761635892, + "grad_norm": 1.140854485823984, + "learning_rate": 7.190078008376415e-06, + "loss": 0.2751, + "step": 1229 + }, + { + "epoch": 0.37540057988707465, + "grad_norm": 1.3323249381262718, + "learning_rate": 7.185632195989005e-06, + "loss": 0.3349, + "step": 1230 + }, + { + "epoch": 0.37570578361056006, + "grad_norm": 1.0885163567382525, + "learning_rate": 7.1811842464154e-06, + "loss": 0.2448, + "step": 1231 + }, + { + "epoch": 0.3760109873340455, + "grad_norm": 1.4080823193078549, + "learning_rate": 7.1767341640049595e-06, + "loss": 0.3132, + "step": 1232 + }, + { + "epoch": 0.3763161910575309, + "grad_norm": 1.353912867702153, + "learning_rate": 7.172281953109128e-06, + "loss": 0.3432, + "step": 1233 + }, + { + "epoch": 0.3766213947810163, + "grad_norm": 1.3812789171739044, + "learning_rate": 7.167827618081426e-06, + "loss": 0.2137, + "step": 1234 + }, + { + "epoch": 0.3769265985045018, + "grad_norm": 1.2478616544202277, + "learning_rate": 7.1633711632774605e-06, + "loss": 0.2565, + "step": 1235 + }, + { + "epoch": 0.3772318022279872, + "grad_norm": 1.4006239352207315, + "learning_rate": 7.158912593054904e-06, + "loss": 0.4235, + "step": 1236 + }, + { + "epoch": 0.3775370059514726, + "grad_norm": 1.2649826963093111, + "learning_rate": 7.154451911773499e-06, + "loss": 0.3147, + "step": 1237 + }, + { + "epoch": 0.377842209674958, + "grad_norm": 1.9419141139672966, + "learning_rate": 7.149989123795054e-06, + "loss": 0.2825, + "step": 1238 + }, + { + "epoch": 0.3781474133984435, + "grad_norm": 1.4118815933485622, + "learning_rate": 7.145524233483434e-06, + "loss": 0.252, + "step": 1239 + }, + { + "epoch": 0.3784526171219289, + "grad_norm": 1.55813966795455, + "learning_rate": 7.1410572452045635e-06, + "loss": 0.3432, + "step": 1240 + }, + { + "epoch": 0.3787578208454143, + "grad_norm": 0.9637101653261684, + "learning_rate": 7.136588163326417e-06, + "loss": 0.2123, + "step": 1241 + }, + { + "epoch": 0.37906302456889973, + "grad_norm": 1.4156183777374252, + "learning_rate": 7.1321169922190144e-06, + "loss": 0.4448, + "step": 1242 + }, + { + "epoch": 0.37936822829238515, + "grad_norm": 1.2011574845111053, + "learning_rate": 7.127643736254424e-06, + "loss": 0.1914, + "step": 1243 + }, + { + "epoch": 0.3796734320158706, + "grad_norm": 1.2719810910228972, + "learning_rate": 7.123168399806747e-06, + "loss": 0.2365, + "step": 1244 + }, + { + "epoch": 0.37997863573935603, + "grad_norm": 1.1627952282839493, + "learning_rate": 7.118690987252121e-06, + "loss": 0.2535, + "step": 1245 + }, + { + "epoch": 0.38028383946284144, + "grad_norm": 1.526033273441638, + "learning_rate": 7.114211502968712e-06, + "loss": 0.563, + "step": 1246 + }, + { + "epoch": 0.38058904318632686, + "grad_norm": 1.6215492435897827, + "learning_rate": 7.1097299513367166e-06, + "loss": 0.238, + "step": 1247 + }, + { + "epoch": 0.3808942469098123, + "grad_norm": 1.689691321719832, + "learning_rate": 7.105246336738348e-06, + "loss": 0.1984, + "step": 1248 + }, + { + "epoch": 0.38119945063329774, + "grad_norm": 3.3706702256948455, + "learning_rate": 7.100760663557841e-06, + "loss": 0.2772, + "step": 1249 + }, + { + "epoch": 0.38150465435678316, + "grad_norm": 1.5153837503559604, + "learning_rate": 7.0962729361814355e-06, + "loss": 0.2683, + "step": 1250 + }, + { + "epoch": 0.38180985808026857, + "grad_norm": 1.269653690009095, + "learning_rate": 7.0917831589973895e-06, + "loss": 0.2611, + "step": 1251 + }, + { + "epoch": 0.382115061803754, + "grad_norm": 1.409968651632736, + "learning_rate": 7.0872913363959614e-06, + "loss": 0.3647, + "step": 1252 + }, + { + "epoch": 0.38242026552723946, + "grad_norm": 1.5218215588345414, + "learning_rate": 7.082797472769408e-06, + "loss": 0.466, + "step": 1253 + }, + { + "epoch": 0.38272546925072487, + "grad_norm": 2.3890410517745835, + "learning_rate": 7.078301572511984e-06, + "loss": 0.3646, + "step": 1254 + }, + { + "epoch": 0.3830306729742103, + "grad_norm": 1.641992489517636, + "learning_rate": 7.073803640019936e-06, + "loss": 0.3105, + "step": 1255 + }, + { + "epoch": 0.3833358766976957, + "grad_norm": 1.299025900561255, + "learning_rate": 7.0693036796914945e-06, + "loss": 0.3399, + "step": 1256 + }, + { + "epoch": 0.3836410804211811, + "grad_norm": 1.5380435417024074, + "learning_rate": 7.064801695926877e-06, + "loss": 0.2588, + "step": 1257 + }, + { + "epoch": 0.3839462841446666, + "grad_norm": 1.2372269820552038, + "learning_rate": 7.060297693128277e-06, + "loss": 0.3599, + "step": 1258 + }, + { + "epoch": 0.384251487868152, + "grad_norm": 1.1614198321930482, + "learning_rate": 7.055791675699863e-06, + "loss": 0.1642, + "step": 1259 + }, + { + "epoch": 0.3845566915916374, + "grad_norm": 1.4934775483085228, + "learning_rate": 7.051283648047775e-06, + "loss": 0.4615, + "step": 1260 + }, + { + "epoch": 0.3848618953151228, + "grad_norm": 1.4582552843552379, + "learning_rate": 7.046773614580116e-06, + "loss": 0.3002, + "step": 1261 + }, + { + "epoch": 0.3851670990386083, + "grad_norm": 4.333102342366705, + "learning_rate": 7.042261579706951e-06, + "loss": 0.3606, + "step": 1262 + }, + { + "epoch": 0.3854723027620937, + "grad_norm": 1.5981293143466662, + "learning_rate": 7.037747547840303e-06, + "loss": 0.3758, + "step": 1263 + }, + { + "epoch": 0.3857775064855791, + "grad_norm": 1.1060333427728648, + "learning_rate": 7.033231523394147e-06, + "loss": 0.1347, + "step": 1264 + }, + { + "epoch": 0.38608271020906454, + "grad_norm": 1.7104158346504792, + "learning_rate": 7.028713510784409e-06, + "loss": 0.6244, + "step": 1265 + }, + { + "epoch": 0.38638791393254995, + "grad_norm": 1.2041816530098084, + "learning_rate": 7.024193514428953e-06, + "loss": 0.2692, + "step": 1266 + }, + { + "epoch": 0.3866931176560354, + "grad_norm": 1.2271375340838422, + "learning_rate": 7.0196715387475885e-06, + "loss": 0.2536, + "step": 1267 + }, + { + "epoch": 0.38699832137952084, + "grad_norm": 1.3208524965326127, + "learning_rate": 7.015147588162061e-06, + "loss": 0.2425, + "step": 1268 + }, + { + "epoch": 0.38730352510300625, + "grad_norm": 1.6679415198075516, + "learning_rate": 7.010621667096041e-06, + "loss": 0.3849, + "step": 1269 + }, + { + "epoch": 0.38760872882649167, + "grad_norm": 1.336956505558018, + "learning_rate": 7.0060937799751316e-06, + "loss": 0.2577, + "step": 1270 + }, + { + "epoch": 0.38791393254997714, + "grad_norm": 1.1363423042029048, + "learning_rate": 7.0015639312268555e-06, + "loss": 0.2241, + "step": 1271 + }, + { + "epoch": 0.38821913627346255, + "grad_norm": 1.283661967883495, + "learning_rate": 6.997032125280655e-06, + "loss": 0.2903, + "step": 1272 + }, + { + "epoch": 0.38852433999694796, + "grad_norm": 1.6001822129445207, + "learning_rate": 6.992498366567884e-06, + "loss": 0.2261, + "step": 1273 + }, + { + "epoch": 0.3888295437204334, + "grad_norm": 1.3026652853973728, + "learning_rate": 6.987962659521808e-06, + "loss": 0.3057, + "step": 1274 + }, + { + "epoch": 0.3891347474439188, + "grad_norm": 1.4690995307739332, + "learning_rate": 6.983425008577598e-06, + "loss": 0.5416, + "step": 1275 + }, + { + "epoch": 0.38943995116740426, + "grad_norm": 1.8375650537094952, + "learning_rate": 6.978885418172325e-06, + "loss": 0.3456, + "step": 1276 + }, + { + "epoch": 0.3897451548908897, + "grad_norm": 2.194246189266676, + "learning_rate": 6.974343892744954e-06, + "loss": 0.2907, + "step": 1277 + }, + { + "epoch": 0.3900503586143751, + "grad_norm": 1.6774755773457783, + "learning_rate": 6.969800436736347e-06, + "loss": 0.3058, + "step": 1278 + }, + { + "epoch": 0.3903555623378605, + "grad_norm": 1.4187678667867623, + "learning_rate": 6.965255054589252e-06, + "loss": 0.4065, + "step": 1279 + }, + { + "epoch": 0.3906607660613459, + "grad_norm": 1.2370997427013408, + "learning_rate": 6.960707750748296e-06, + "loss": 0.276, + "step": 1280 + }, + { + "epoch": 0.3909659697848314, + "grad_norm": 1.2143352462379313, + "learning_rate": 6.956158529659991e-06, + "loss": 0.2169, + "step": 1281 + }, + { + "epoch": 0.3912711735083168, + "grad_norm": 1.3152454363292938, + "learning_rate": 6.951607395772721e-06, + "loss": 0.3729, + "step": 1282 + }, + { + "epoch": 0.3915763772318022, + "grad_norm": 1.7624342452965842, + "learning_rate": 6.947054353536742e-06, + "loss": 0.3491, + "step": 1283 + }, + { + "epoch": 0.39188158095528763, + "grad_norm": 1.0107668961163137, + "learning_rate": 6.942499407404175e-06, + "loss": 0.2528, + "step": 1284 + }, + { + "epoch": 0.3921867846787731, + "grad_norm": 1.3640381643739714, + "learning_rate": 6.937942561829001e-06, + "loss": 0.2922, + "step": 1285 + }, + { + "epoch": 0.3924919884022585, + "grad_norm": 1.3678373263857668, + "learning_rate": 6.933383821267062e-06, + "loss": 0.4763, + "step": 1286 + }, + { + "epoch": 0.39279719212574393, + "grad_norm": 1.4067856510214773, + "learning_rate": 6.928823190176051e-06, + "loss": 0.441, + "step": 1287 + }, + { + "epoch": 0.39310239584922935, + "grad_norm": 1.1456457967166047, + "learning_rate": 6.924260673015507e-06, + "loss": 0.2726, + "step": 1288 + }, + { + "epoch": 0.39340759957271476, + "grad_norm": 1.2577366776447958, + "learning_rate": 6.919696274246818e-06, + "loss": 0.2356, + "step": 1289 + }, + { + "epoch": 0.39371280329620023, + "grad_norm": 1.5018478773369082, + "learning_rate": 6.91512999833321e-06, + "loss": 0.3858, + "step": 1290 + }, + { + "epoch": 0.39401800701968565, + "grad_norm": 1.4382055458480731, + "learning_rate": 6.910561849739743e-06, + "loss": 0.4623, + "step": 1291 + }, + { + "epoch": 0.39432321074317106, + "grad_norm": 1.4150013204507765, + "learning_rate": 6.905991832933312e-06, + "loss": 0.2598, + "step": 1292 + }, + { + "epoch": 0.3946284144666565, + "grad_norm": 1.313619408763559, + "learning_rate": 6.901419952382633e-06, + "loss": 0.1856, + "step": 1293 + }, + { + "epoch": 0.39493361819014194, + "grad_norm": 1.1626329284469816, + "learning_rate": 6.89684621255825e-06, + "loss": 0.2655, + "step": 1294 + }, + { + "epoch": 0.39523882191362736, + "grad_norm": 1.4286438514880226, + "learning_rate": 6.892270617932523e-06, + "loss": 0.3111, + "step": 1295 + }, + { + "epoch": 0.3955440256371128, + "grad_norm": 1.467025068592859, + "learning_rate": 6.887693172979624e-06, + "loss": 0.1943, + "step": 1296 + }, + { + "epoch": 0.3958492293605982, + "grad_norm": 1.5302799032399699, + "learning_rate": 6.883113882175536e-06, + "loss": 0.2896, + "step": 1297 + }, + { + "epoch": 0.3961544330840836, + "grad_norm": 1.4221040310691428, + "learning_rate": 6.878532749998049e-06, + "loss": 0.2751, + "step": 1298 + }, + { + "epoch": 0.39645963680756907, + "grad_norm": 1.377710099164584, + "learning_rate": 6.873949780926747e-06, + "loss": 0.2793, + "step": 1299 + }, + { + "epoch": 0.3967648405310545, + "grad_norm": 1.3417904782871544, + "learning_rate": 6.869364979443019e-06, + "loss": 0.2602, + "step": 1300 + }, + { + "epoch": 0.3970700442545399, + "grad_norm": 1.3467916175186017, + "learning_rate": 6.86477835003004e-06, + "loss": 0.2857, + "step": 1301 + }, + { + "epoch": 0.3973752479780253, + "grad_norm": 1.4529951304165758, + "learning_rate": 6.860189897172772e-06, + "loss": 0.5097, + "step": 1302 + }, + { + "epoch": 0.3976804517015108, + "grad_norm": 1.3839926990470812, + "learning_rate": 6.8555996253579645e-06, + "loss": 0.4489, + "step": 1303 + }, + { + "epoch": 0.3979856554249962, + "grad_norm": 1.4801954354304339, + "learning_rate": 6.85100753907414e-06, + "loss": 0.5139, + "step": 1304 + }, + { + "epoch": 0.3982908591484816, + "grad_norm": 1.0192216424990155, + "learning_rate": 6.846413642811598e-06, + "loss": 0.231, + "step": 1305 + }, + { + "epoch": 0.398596062871967, + "grad_norm": 1.4352380952459303, + "learning_rate": 6.841817941062412e-06, + "loss": 0.3393, + "step": 1306 + }, + { + "epoch": 0.39890126659545244, + "grad_norm": 1.5851110550635767, + "learning_rate": 6.837220438320411e-06, + "loss": 0.41, + "step": 1307 + }, + { + "epoch": 0.3992064703189379, + "grad_norm": 1.569272508271095, + "learning_rate": 6.832621139081196e-06, + "loss": 0.3163, + "step": 1308 + }, + { + "epoch": 0.3995116740424233, + "grad_norm": 1.3018066671809359, + "learning_rate": 6.8280200478421185e-06, + "loss": 0.2329, + "step": 1309 + }, + { + "epoch": 0.39981687776590874, + "grad_norm": 1.660594639958246, + "learning_rate": 6.823417169102282e-06, + "loss": 0.3668, + "step": 1310 + }, + { + "epoch": 0.40012208148939415, + "grad_norm": 1.522549727556404, + "learning_rate": 6.81881250736254e-06, + "loss": 0.3363, + "step": 1311 + }, + { + "epoch": 0.4004272852128796, + "grad_norm": 1.497940397892529, + "learning_rate": 6.8142060671254905e-06, + "loss": 0.393, + "step": 1312 + }, + { + "epoch": 0.40073248893636504, + "grad_norm": 1.4706728908070485, + "learning_rate": 6.8095978528954665e-06, + "loss": 0.4207, + "step": 1313 + }, + { + "epoch": 0.40103769265985045, + "grad_norm": 1.2561900413969935, + "learning_rate": 6.80498786917854e-06, + "loss": 0.2689, + "step": 1314 + }, + { + "epoch": 0.40134289638333587, + "grad_norm": 2.2206434310633494, + "learning_rate": 6.800376120482508e-06, + "loss": 0.3934, + "step": 1315 + }, + { + "epoch": 0.4016481001068213, + "grad_norm": 1.180675774729955, + "learning_rate": 6.795762611316901e-06, + "loss": 0.1996, + "step": 1316 + }, + { + "epoch": 0.40195330383030675, + "grad_norm": 1.2644690898256274, + "learning_rate": 6.791147346192965e-06, + "loss": 0.3932, + "step": 1317 + }, + { + "epoch": 0.40225850755379217, + "grad_norm": 1.9397161791331279, + "learning_rate": 6.786530329623663e-06, + "loss": 0.2068, + "step": 1318 + }, + { + "epoch": 0.4025637112772776, + "grad_norm": 1.2683770669890404, + "learning_rate": 6.7819115661236745e-06, + "loss": 0.3456, + "step": 1319 + }, + { + "epoch": 0.402868915000763, + "grad_norm": 1.2741738279318962, + "learning_rate": 6.7772910602093845e-06, + "loss": 0.226, + "step": 1320 + }, + { + "epoch": 0.4031741187242484, + "grad_norm": 1.5848234426074979, + "learning_rate": 6.772668816398882e-06, + "loss": 0.4086, + "step": 1321 + }, + { + "epoch": 0.4034793224477339, + "grad_norm": 1.1692837868067145, + "learning_rate": 6.768044839211955e-06, + "loss": 0.2752, + "step": 1322 + }, + { + "epoch": 0.4037845261712193, + "grad_norm": 1.3967415526137874, + "learning_rate": 6.763419133170089e-06, + "loss": 0.4226, + "step": 1323 + }, + { + "epoch": 0.4040897298947047, + "grad_norm": 1.4254602358278508, + "learning_rate": 6.758791702796454e-06, + "loss": 0.3899, + "step": 1324 + }, + { + "epoch": 0.4043949336181901, + "grad_norm": 1.5572617271717888, + "learning_rate": 6.754162552615916e-06, + "loss": 0.4574, + "step": 1325 + }, + { + "epoch": 0.4047001373416756, + "grad_norm": 1.7167631282679396, + "learning_rate": 6.749531687155014e-06, + "loss": 0.298, + "step": 1326 + }, + { + "epoch": 0.405005341065161, + "grad_norm": 1.2196148469218278, + "learning_rate": 6.744899110941967e-06, + "loss": 0.3353, + "step": 1327 + }, + { + "epoch": 0.4053105447886464, + "grad_norm": 1.425829742861132, + "learning_rate": 6.740264828506668e-06, + "loss": 0.3501, + "step": 1328 + }, + { + "epoch": 0.40561574851213184, + "grad_norm": 1.3938860001518678, + "learning_rate": 6.7356288443806765e-06, + "loss": 0.2435, + "step": 1329 + }, + { + "epoch": 0.40592095223561725, + "grad_norm": 1.2919006084530176, + "learning_rate": 6.730991163097217e-06, + "loss": 0.419, + "step": 1330 + }, + { + "epoch": 0.4062261559591027, + "grad_norm": 1.1955154346409425, + "learning_rate": 6.7263517891911745e-06, + "loss": 0.2766, + "step": 1331 + }, + { + "epoch": 0.40653135968258813, + "grad_norm": 1.272620752447219, + "learning_rate": 6.721710727199087e-06, + "loss": 0.3691, + "step": 1332 + }, + { + "epoch": 0.40683656340607355, + "grad_norm": 1.2911916084787372, + "learning_rate": 6.717067981659145e-06, + "loss": 0.2437, + "step": 1333 + }, + { + "epoch": 0.40714176712955896, + "grad_norm": 1.3485651903926066, + "learning_rate": 6.712423557111184e-06, + "loss": 0.3076, + "step": 1334 + }, + { + "epoch": 0.40744697085304443, + "grad_norm": 1.1329951698085787, + "learning_rate": 6.707777458096682e-06, + "loss": 0.2218, + "step": 1335 + }, + { + "epoch": 0.40775217457652985, + "grad_norm": 1.667913860338061, + "learning_rate": 6.703129689158755e-06, + "loss": 0.3711, + "step": 1336 + }, + { + "epoch": 0.40805737830001526, + "grad_norm": 1.145899868969619, + "learning_rate": 6.698480254842148e-06, + "loss": 0.2636, + "step": 1337 + }, + { + "epoch": 0.4083625820235007, + "grad_norm": 1.1099719738214417, + "learning_rate": 6.693829159693241e-06, + "loss": 0.2096, + "step": 1338 + }, + { + "epoch": 0.4086677857469861, + "grad_norm": 2.3227758208085665, + "learning_rate": 6.689176408260033e-06, + "loss": 0.4818, + "step": 1339 + }, + { + "epoch": 0.40897298947047156, + "grad_norm": 1.1176996724319626, + "learning_rate": 6.684522005092142e-06, + "loss": 0.3045, + "step": 1340 + }, + { + "epoch": 0.409278193193957, + "grad_norm": 1.2902343971744434, + "learning_rate": 6.679865954740808e-06, + "loss": 0.2739, + "step": 1341 + }, + { + "epoch": 0.4095833969174424, + "grad_norm": 1.24221657772939, + "learning_rate": 6.675208261758874e-06, + "loss": 0.248, + "step": 1342 + }, + { + "epoch": 0.4098886006409278, + "grad_norm": 1.466634078458504, + "learning_rate": 6.670548930700791e-06, + "loss": 0.2954, + "step": 1343 + }, + { + "epoch": 0.4101938043644133, + "grad_norm": 1.4954932379638273, + "learning_rate": 6.665887966122616e-06, + "loss": 0.4283, + "step": 1344 + }, + { + "epoch": 0.4104990080878987, + "grad_norm": 1.3273678584881572, + "learning_rate": 6.661225372581996e-06, + "loss": 0.2329, + "step": 1345 + }, + { + "epoch": 0.4108042118113841, + "grad_norm": 1.2699849887771746, + "learning_rate": 6.6565611546381794e-06, + "loss": 0.1797, + "step": 1346 + }, + { + "epoch": 0.4111094155348695, + "grad_norm": 1.2890458539408176, + "learning_rate": 6.651895316851996e-06, + "loss": 0.2587, + "step": 1347 + }, + { + "epoch": 0.41141461925835493, + "grad_norm": 1.216750093543553, + "learning_rate": 6.6472278637858615e-06, + "loss": 0.3274, + "step": 1348 + }, + { + "epoch": 0.4117198229818404, + "grad_norm": 1.3058663712935152, + "learning_rate": 6.642558800003775e-06, + "loss": 0.2467, + "step": 1349 + }, + { + "epoch": 0.4120250267053258, + "grad_norm": 1.4706561117776265, + "learning_rate": 6.637888130071304e-06, + "loss": 0.4008, + "step": 1350 + }, + { + "epoch": 0.41233023042881123, + "grad_norm": 1.1481689346703796, + "learning_rate": 6.633215858555592e-06, + "loss": 0.2391, + "step": 1351 + }, + { + "epoch": 0.41263543415229664, + "grad_norm": 1.2458895811994175, + "learning_rate": 6.628541990025346e-06, + "loss": 0.2592, + "step": 1352 + }, + { + "epoch": 0.41294063787578206, + "grad_norm": 2.065000463305553, + "learning_rate": 6.623866529050834e-06, + "loss": 0.5728, + "step": 1353 + }, + { + "epoch": 0.4132458415992675, + "grad_norm": 1.626428050000369, + "learning_rate": 6.6191894802038826e-06, + "loss": 0.495, + "step": 1354 + }, + { + "epoch": 0.41355104532275294, + "grad_norm": 1.2186318364692617, + "learning_rate": 6.614510848057874e-06, + "loss": 0.3359, + "step": 1355 + }, + { + "epoch": 0.41385624904623836, + "grad_norm": 1.3483193797759696, + "learning_rate": 6.609830637187729e-06, + "loss": 0.3744, + "step": 1356 + }, + { + "epoch": 0.41416145276972377, + "grad_norm": 1.2559337920214608, + "learning_rate": 6.6051488521699235e-06, + "loss": 0.3737, + "step": 1357 + }, + { + "epoch": 0.41446665649320924, + "grad_norm": 1.3874775704297853, + "learning_rate": 6.600465497582466e-06, + "loss": 0.1995, + "step": 1358 + }, + { + "epoch": 0.41477186021669465, + "grad_norm": 1.6050931393702568, + "learning_rate": 6.595780578004901e-06, + "loss": 0.5452, + "step": 1359 + }, + { + "epoch": 0.41507706394018007, + "grad_norm": 1.4120118673382418, + "learning_rate": 6.591094098018306e-06, + "loss": 0.3246, + "step": 1360 + }, + { + "epoch": 0.4153822676636655, + "grad_norm": 1.1696817855430393, + "learning_rate": 6.586406062205278e-06, + "loss": 0.1567, + "step": 1361 + }, + { + "epoch": 0.4156874713871509, + "grad_norm": 1.4850231161148664, + "learning_rate": 6.581716475149943e-06, + "loss": 0.3303, + "step": 1362 + }, + { + "epoch": 0.41599267511063637, + "grad_norm": 1.4803085844420174, + "learning_rate": 6.577025341437939e-06, + "loss": 0.3144, + "step": 1363 + }, + { + "epoch": 0.4162978788341218, + "grad_norm": 1.314596318555117, + "learning_rate": 6.572332665656417e-06, + "loss": 0.3133, + "step": 1364 + }, + { + "epoch": 0.4166030825576072, + "grad_norm": 1.2386593406906814, + "learning_rate": 6.567638452394036e-06, + "loss": 0.2295, + "step": 1365 + }, + { + "epoch": 0.4169082862810926, + "grad_norm": 1.5178670129535232, + "learning_rate": 6.56294270624096e-06, + "loss": 0.3895, + "step": 1366 + }, + { + "epoch": 0.4172134900045781, + "grad_norm": 1.1161753197408095, + "learning_rate": 6.558245431788851e-06, + "loss": 0.2753, + "step": 1367 + }, + { + "epoch": 0.4175186937280635, + "grad_norm": 1.6916589702312108, + "learning_rate": 6.553546633630865e-06, + "loss": 0.5491, + "step": 1368 + }, + { + "epoch": 0.4178238974515489, + "grad_norm": 1.2497529739433018, + "learning_rate": 6.548846316361648e-06, + "loss": 0.275, + "step": 1369 + }, + { + "epoch": 0.4181291011750343, + "grad_norm": 1.3872597941831253, + "learning_rate": 6.544144484577331e-06, + "loss": 0.4461, + "step": 1370 + }, + { + "epoch": 0.41843430489851974, + "grad_norm": 1.290246777845889, + "learning_rate": 6.539441142875528e-06, + "loss": 0.3839, + "step": 1371 + }, + { + "epoch": 0.4187395086220052, + "grad_norm": 1.9737467857948305, + "learning_rate": 6.534736295855325e-06, + "loss": 0.2692, + "step": 1372 + }, + { + "epoch": 0.4190447123454906, + "grad_norm": 1.4088078442454266, + "learning_rate": 6.530029948117284e-06, + "loss": 0.4093, + "step": 1373 + }, + { + "epoch": 0.41934991606897604, + "grad_norm": 1.2700736897033225, + "learning_rate": 6.525322104263439e-06, + "loss": 0.4136, + "step": 1374 + }, + { + "epoch": 0.41965511979246145, + "grad_norm": 1.7721019538973242, + "learning_rate": 6.520612768897272e-06, + "loss": 0.3371, + "step": 1375 + }, + { + "epoch": 0.4199603235159469, + "grad_norm": 1.2889345047723617, + "learning_rate": 6.51590194662374e-06, + "loss": 0.303, + "step": 1376 + }, + { + "epoch": 0.42026552723943233, + "grad_norm": 1.1153113343630081, + "learning_rate": 6.511189642049244e-06, + "loss": 0.2793, + "step": 1377 + }, + { + "epoch": 0.42057073096291775, + "grad_norm": 1.3638733722885776, + "learning_rate": 6.506475859781637e-06, + "loss": 0.374, + "step": 1378 + }, + { + "epoch": 0.42087593468640316, + "grad_norm": 1.4684349696171455, + "learning_rate": 6.501760604430218e-06, + "loss": 0.3168, + "step": 1379 + }, + { + "epoch": 0.4211811384098886, + "grad_norm": 1.1514319999384148, + "learning_rate": 6.497043880605725e-06, + "loss": 0.2929, + "step": 1380 + }, + { + "epoch": 0.42148634213337405, + "grad_norm": 1.169458160460905, + "learning_rate": 6.492325692920329e-06, + "loss": 0.2506, + "step": 1381 + }, + { + "epoch": 0.42179154585685946, + "grad_norm": 1.7740829570344059, + "learning_rate": 6.487606045987645e-06, + "loss": 0.647, + "step": 1382 + }, + { + "epoch": 0.4220967495803449, + "grad_norm": 1.5764040486432587, + "learning_rate": 6.482884944422696e-06, + "loss": 0.403, + "step": 1383 + }, + { + "epoch": 0.4224019533038303, + "grad_norm": 1.4963369146995207, + "learning_rate": 6.478162392841942e-06, + "loss": 0.4148, + "step": 1384 + }, + { + "epoch": 0.42270715702731576, + "grad_norm": 1.4435630313971244, + "learning_rate": 6.473438395863259e-06, + "loss": 0.4409, + "step": 1385 + }, + { + "epoch": 0.4230123607508012, + "grad_norm": 1.3799044100508142, + "learning_rate": 6.468712958105926e-06, + "loss": 0.2855, + "step": 1386 + }, + { + "epoch": 0.4233175644742866, + "grad_norm": 1.160815119293516, + "learning_rate": 6.463986084190644e-06, + "loss": 0.2924, + "step": 1387 + }, + { + "epoch": 0.423622768197772, + "grad_norm": 1.4073095780293283, + "learning_rate": 6.459257778739511e-06, + "loss": 0.4324, + "step": 1388 + }, + { + "epoch": 0.4239279719212574, + "grad_norm": 1.3577032223044758, + "learning_rate": 6.4545280463760226e-06, + "loss": 0.2674, + "step": 1389 + }, + { + "epoch": 0.4242331756447429, + "grad_norm": 1.2994968614411917, + "learning_rate": 6.449796891725081e-06, + "loss": 0.2802, + "step": 1390 + }, + { + "epoch": 0.4245383793682283, + "grad_norm": 1.343062113855798, + "learning_rate": 6.445064319412966e-06, + "loss": 0.218, + "step": 1391 + }, + { + "epoch": 0.4248435830917137, + "grad_norm": 1.907900174162606, + "learning_rate": 6.44033033406735e-06, + "loss": 0.3629, + "step": 1392 + }, + { + "epoch": 0.42514878681519913, + "grad_norm": 1.3076657321115293, + "learning_rate": 6.43559494031729e-06, + "loss": 0.4244, + "step": 1393 + }, + { + "epoch": 0.42545399053868455, + "grad_norm": 1.393965833570882, + "learning_rate": 6.430858142793213e-06, + "loss": 0.2388, + "step": 1394 + }, + { + "epoch": 0.42575919426217, + "grad_norm": 1.4458587439041957, + "learning_rate": 6.426119946126922e-06, + "loss": 0.5452, + "step": 1395 + }, + { + "epoch": 0.42606439798565543, + "grad_norm": 1.2918032092044962, + "learning_rate": 6.4213803549515896e-06, + "loss": 0.3516, + "step": 1396 + }, + { + "epoch": 0.42636960170914084, + "grad_norm": 1.4058848012801475, + "learning_rate": 6.41663937390175e-06, + "loss": 0.2097, + "step": 1397 + }, + { + "epoch": 0.42667480543262626, + "grad_norm": 1.6133906071062372, + "learning_rate": 6.4118970076132995e-06, + "loss": 0.5124, + "step": 1398 + }, + { + "epoch": 0.42698000915611173, + "grad_norm": 1.3377311221640538, + "learning_rate": 6.407153260723484e-06, + "loss": 0.315, + "step": 1399 + }, + { + "epoch": 0.42728521287959714, + "grad_norm": 1.5917236311274028, + "learning_rate": 6.402408137870903e-06, + "loss": 0.4754, + "step": 1400 + }, + { + "epoch": 0.42759041660308256, + "grad_norm": 1.7638143522011498, + "learning_rate": 6.397661643695502e-06, + "loss": 0.3344, + "step": 1401 + }, + { + "epoch": 0.42789562032656797, + "grad_norm": 1.3144896730327111, + "learning_rate": 6.392913782838563e-06, + "loss": 0.2085, + "step": 1402 + }, + { + "epoch": 0.4282008240500534, + "grad_norm": 1.1905378835451088, + "learning_rate": 6.388164559942709e-06, + "loss": 0.2553, + "step": 1403 + }, + { + "epoch": 0.42850602777353886, + "grad_norm": 2.0174215427728277, + "learning_rate": 6.383413979651893e-06, + "loss": 0.396, + "step": 1404 + }, + { + "epoch": 0.42881123149702427, + "grad_norm": 1.1441372658734617, + "learning_rate": 6.378662046611397e-06, + "loss": 0.2705, + "step": 1405 + }, + { + "epoch": 0.4291164352205097, + "grad_norm": 1.5452084281598708, + "learning_rate": 6.373908765467821e-06, + "loss": 0.3395, + "step": 1406 + }, + { + "epoch": 0.4294216389439951, + "grad_norm": 1.0332304693111856, + "learning_rate": 6.369154140869088e-06, + "loss": 0.2871, + "step": 1407 + }, + { + "epoch": 0.42972684266748057, + "grad_norm": 1.6720861141214347, + "learning_rate": 6.364398177464435e-06, + "loss": 0.3322, + "step": 1408 + }, + { + "epoch": 0.430032046390966, + "grad_norm": 1.3643272344010366, + "learning_rate": 6.359640879904405e-06, + "loss": 0.3161, + "step": 1409 + }, + { + "epoch": 0.4303372501144514, + "grad_norm": 1.1447056451932383, + "learning_rate": 6.354882252840845e-06, + "loss": 0.2654, + "step": 1410 + }, + { + "epoch": 0.4306424538379368, + "grad_norm": 1.1667911883159097, + "learning_rate": 6.350122300926906e-06, + "loss": 0.2105, + "step": 1411 + }, + { + "epoch": 0.4309476575614222, + "grad_norm": 1.3662993243668309, + "learning_rate": 6.3453610288170335e-06, + "loss": 0.3964, + "step": 1412 + }, + { + "epoch": 0.4312528612849077, + "grad_norm": 1.5696957264190432, + "learning_rate": 6.340598441166958e-06, + "loss": 0.3675, + "step": 1413 + }, + { + "epoch": 0.4315580650083931, + "grad_norm": 1.2281182014723664, + "learning_rate": 6.335834542633707e-06, + "loss": 0.2694, + "step": 1414 + }, + { + "epoch": 0.4318632687318785, + "grad_norm": 1.4302582982340493, + "learning_rate": 6.331069337875581e-06, + "loss": 0.2888, + "step": 1415 + }, + { + "epoch": 0.43216847245536394, + "grad_norm": 0.9906163729704169, + "learning_rate": 6.32630283155216e-06, + "loss": 0.22, + "step": 1416 + }, + { + "epoch": 0.4324736761788494, + "grad_norm": 1.4635044332996883, + "learning_rate": 6.321535028324302e-06, + "loss": 0.4671, + "step": 1417 + }, + { + "epoch": 0.4327788799023348, + "grad_norm": 1.4378547230792813, + "learning_rate": 6.316765932854122e-06, + "loss": 0.3855, + "step": 1418 + }, + { + "epoch": 0.43308408362582024, + "grad_norm": 1.5956913213398347, + "learning_rate": 6.31199554980501e-06, + "loss": 0.3164, + "step": 1419 + }, + { + "epoch": 0.43338928734930565, + "grad_norm": 1.3379057277745874, + "learning_rate": 6.307223883841609e-06, + "loss": 0.3991, + "step": 1420 + }, + { + "epoch": 0.43369449107279107, + "grad_norm": 1.0448334558458165, + "learning_rate": 6.302450939629818e-06, + "loss": 0.2132, + "step": 1421 + }, + { + "epoch": 0.43399969479627654, + "grad_norm": 1.6769341191161795, + "learning_rate": 6.297676721836785e-06, + "loss": 0.4639, + "step": 1422 + }, + { + "epoch": 0.43430489851976195, + "grad_norm": 1.508740075559931, + "learning_rate": 6.292901235130904e-06, + "loss": 0.435, + "step": 1423 + }, + { + "epoch": 0.43461010224324736, + "grad_norm": 1.6165557954399556, + "learning_rate": 6.288124484181811e-06, + "loss": 0.3716, + "step": 1424 + }, + { + "epoch": 0.4349153059667328, + "grad_norm": 1.4933007366732682, + "learning_rate": 6.283346473660377e-06, + "loss": 0.4222, + "step": 1425 + }, + { + "epoch": 0.4352205096902182, + "grad_norm": 1.31895828771684, + "learning_rate": 6.278567208238705e-06, + "loss": 0.3421, + "step": 1426 + }, + { + "epoch": 0.43552571341370366, + "grad_norm": 1.6584626299167318, + "learning_rate": 6.273786692590122e-06, + "loss": 0.5482, + "step": 1427 + }, + { + "epoch": 0.4358309171371891, + "grad_norm": 1.4286268291630235, + "learning_rate": 6.269004931389184e-06, + "loss": 0.2923, + "step": 1428 + }, + { + "epoch": 0.4361361208606745, + "grad_norm": 1.330259694437434, + "learning_rate": 6.264221929311659e-06, + "loss": 0.228, + "step": 1429 + }, + { + "epoch": 0.4364413245841599, + "grad_norm": 0.9297451354040828, + "learning_rate": 6.259437691034531e-06, + "loss": 0.1884, + "step": 1430 + }, + { + "epoch": 0.4367465283076454, + "grad_norm": 1.2652771083096295, + "learning_rate": 6.2546522212359925e-06, + "loss": 0.3444, + "step": 1431 + }, + { + "epoch": 0.4370517320311308, + "grad_norm": 1.6100392868745599, + "learning_rate": 6.2498655245954396e-06, + "loss": 0.5201, + "step": 1432 + }, + { + "epoch": 0.4373569357546162, + "grad_norm": 1.2278762390726963, + "learning_rate": 6.245077605793471e-06, + "loss": 0.2888, + "step": 1433 + }, + { + "epoch": 0.4376621394781016, + "grad_norm": 1.386480821426643, + "learning_rate": 6.2402884695118745e-06, + "loss": 0.3163, + "step": 1434 + }, + { + "epoch": 0.43796734320158703, + "grad_norm": 1.0406764982843713, + "learning_rate": 6.235498120433635e-06, + "loss": 0.2533, + "step": 1435 + }, + { + "epoch": 0.4382725469250725, + "grad_norm": 1.5763703213054279, + "learning_rate": 6.230706563242919e-06, + "loss": 0.4144, + "step": 1436 + }, + { + "epoch": 0.4385777506485579, + "grad_norm": 1.639255933375323, + "learning_rate": 6.225913802625076e-06, + "loss": 0.3643, + "step": 1437 + }, + { + "epoch": 0.43888295437204333, + "grad_norm": 1.7824581131688386, + "learning_rate": 6.22111984326663e-06, + "loss": 0.2619, + "step": 1438 + }, + { + "epoch": 0.43918815809552875, + "grad_norm": 1.6342850757025045, + "learning_rate": 6.216324689855283e-06, + "loss": 0.4316, + "step": 1439 + }, + { + "epoch": 0.4394933618190142, + "grad_norm": 1.3924162825479758, + "learning_rate": 6.211528347079896e-06, + "loss": 0.5224, + "step": 1440 + }, + { + "epoch": 0.43979856554249963, + "grad_norm": 1.4266758450691195, + "learning_rate": 6.206730819630502e-06, + "loss": 0.5211, + "step": 1441 + }, + { + "epoch": 0.44010376926598505, + "grad_norm": 1.3623827402595172, + "learning_rate": 6.201932112198285e-06, + "loss": 0.3893, + "step": 1442 + }, + { + "epoch": 0.44040897298947046, + "grad_norm": 1.026045410355969, + "learning_rate": 6.197132229475586e-06, + "loss": 0.2376, + "step": 1443 + }, + { + "epoch": 0.4407141767129559, + "grad_norm": 1.1848190562207745, + "learning_rate": 6.192331176155897e-06, + "loss": 0.3109, + "step": 1444 + }, + { + "epoch": 0.44101938043644134, + "grad_norm": 1.5310615112137054, + "learning_rate": 6.1875289569338505e-06, + "loss": 0.5635, + "step": 1445 + }, + { + "epoch": 0.44132458415992676, + "grad_norm": 1.414171583488937, + "learning_rate": 6.182725576505222e-06, + "loss": 0.5202, + "step": 1446 + }, + { + "epoch": 0.4416297878834122, + "grad_norm": 1.881243926890763, + "learning_rate": 6.177921039566925e-06, + "loss": 0.4076, + "step": 1447 + }, + { + "epoch": 0.4419349916068976, + "grad_norm": 1.3255828762960546, + "learning_rate": 6.173115350816995e-06, + "loss": 0.3769, + "step": 1448 + }, + { + "epoch": 0.44224019533038306, + "grad_norm": 1.5119486789167886, + "learning_rate": 6.168308514954603e-06, + "loss": 0.4031, + "step": 1449 + }, + { + "epoch": 0.44254539905386847, + "grad_norm": 2.0489266865517695, + "learning_rate": 6.1635005366800385e-06, + "loss": 0.3346, + "step": 1450 + }, + { + "epoch": 0.4428506027773539, + "grad_norm": 1.4973886329367188, + "learning_rate": 6.158691420694706e-06, + "loss": 0.4739, + "step": 1451 + }, + { + "epoch": 0.4431558065008393, + "grad_norm": 1.0605172000337302, + "learning_rate": 6.153881171701124e-06, + "loss": 0.2526, + "step": 1452 + }, + { + "epoch": 0.4434610102243247, + "grad_norm": 1.118949194708475, + "learning_rate": 6.149069794402922e-06, + "loss": 0.2583, + "step": 1453 + }, + { + "epoch": 0.4437662139478102, + "grad_norm": 1.5444477187990262, + "learning_rate": 6.144257293504826e-06, + "loss": 0.4095, + "step": 1454 + }, + { + "epoch": 0.4440714176712956, + "grad_norm": 1.2399450249944732, + "learning_rate": 6.139443673712666e-06, + "loss": 0.3351, + "step": 1455 + }, + { + "epoch": 0.444376621394781, + "grad_norm": 1.5587921398740923, + "learning_rate": 6.134628939733367e-06, + "loss": 0.4804, + "step": 1456 + }, + { + "epoch": 0.4446818251182664, + "grad_norm": 1.3042968347851616, + "learning_rate": 6.129813096274937e-06, + "loss": 0.37, + "step": 1457 + }, + { + "epoch": 0.4449870288417519, + "grad_norm": 1.3791281113704872, + "learning_rate": 6.124996148046478e-06, + "loss": 0.3613, + "step": 1458 + }, + { + "epoch": 0.4452922325652373, + "grad_norm": 1.2376992094205213, + "learning_rate": 6.120178099758163e-06, + "loss": 0.2888, + "step": 1459 + }, + { + "epoch": 0.4455974362887227, + "grad_norm": 1.5950209094264465, + "learning_rate": 6.1153589561212465e-06, + "loss": 0.4905, + "step": 1460 + }, + { + "epoch": 0.44590264001220814, + "grad_norm": 1.144267549829832, + "learning_rate": 6.110538721848055e-06, + "loss": 0.2428, + "step": 1461 + }, + { + "epoch": 0.44620784373569355, + "grad_norm": 1.8243651239455192, + "learning_rate": 6.1057174016519746e-06, + "loss": 0.4422, + "step": 1462 + }, + { + "epoch": 0.446513047459179, + "grad_norm": 1.4471328406739752, + "learning_rate": 6.100895000247461e-06, + "loss": 0.365, + "step": 1463 + }, + { + "epoch": 0.44681825118266444, + "grad_norm": 1.1681201032981883, + "learning_rate": 6.096071522350023e-06, + "loss": 0.3077, + "step": 1464 + }, + { + "epoch": 0.44712345490614985, + "grad_norm": 1.0017560798569047, + "learning_rate": 6.091246972676221e-06, + "loss": 0.2612, + "step": 1465 + }, + { + "epoch": 0.44742865862963527, + "grad_norm": 1.1198051570109302, + "learning_rate": 6.086421355943669e-06, + "loss": 0.2661, + "step": 1466 + }, + { + "epoch": 0.4477338623531207, + "grad_norm": 1.5027878603280684, + "learning_rate": 6.081594676871018e-06, + "loss": 0.3801, + "step": 1467 + }, + { + "epoch": 0.44803906607660615, + "grad_norm": 1.5399520549494288, + "learning_rate": 6.076766940177961e-06, + "loss": 0.472, + "step": 1468 + }, + { + "epoch": 0.44834426980009157, + "grad_norm": 1.0842776340009068, + "learning_rate": 6.071938150585225e-06, + "loss": 0.2661, + "step": 1469 + }, + { + "epoch": 0.448649473523577, + "grad_norm": 1.5030565908492635, + "learning_rate": 6.067108312814565e-06, + "loss": 0.4655, + "step": 1470 + }, + { + "epoch": 0.4489546772470624, + "grad_norm": 1.538417777942964, + "learning_rate": 6.062277431588764e-06, + "loss": 0.4456, + "step": 1471 + }, + { + "epoch": 0.44925988097054786, + "grad_norm": 1.4993872980628737, + "learning_rate": 6.0574455116316226e-06, + "loss": 0.4185, + "step": 1472 + }, + { + "epoch": 0.4495650846940333, + "grad_norm": 1.5907228930614425, + "learning_rate": 6.052612557667958e-06, + "loss": 0.3011, + "step": 1473 + }, + { + "epoch": 0.4498702884175187, + "grad_norm": 1.2701054140331611, + "learning_rate": 6.0477785744236e-06, + "loss": 0.3505, + "step": 1474 + }, + { + "epoch": 0.4501754921410041, + "grad_norm": 1.195165338370715, + "learning_rate": 6.042943566625381e-06, + "loss": 0.342, + "step": 1475 + }, + { + "epoch": 0.4504806958644895, + "grad_norm": 1.751726797192751, + "learning_rate": 6.038107539001139e-06, + "loss": 0.3494, + "step": 1476 + }, + { + "epoch": 0.450785899587975, + "grad_norm": 1.6552717810878512, + "learning_rate": 6.033270496279707e-06, + "loss": 0.2819, + "step": 1477 + }, + { + "epoch": 0.4510911033114604, + "grad_norm": 1.3423149963148238, + "learning_rate": 6.0284324431909126e-06, + "loss": 0.4002, + "step": 1478 + }, + { + "epoch": 0.4513963070349458, + "grad_norm": 1.5006613862735463, + "learning_rate": 6.023593384465569e-06, + "loss": 0.4365, + "step": 1479 + }, + { + "epoch": 0.45170151075843124, + "grad_norm": 1.4902782908677048, + "learning_rate": 6.018753324835474e-06, + "loss": 0.1634, + "step": 1480 + }, + { + "epoch": 0.4520067144819167, + "grad_norm": 1.1448537743161962, + "learning_rate": 6.013912269033405e-06, + "loss": 0.196, + "step": 1481 + }, + { + "epoch": 0.4523119182054021, + "grad_norm": 1.4541649327454527, + "learning_rate": 6.009070221793113e-06, + "loss": 0.413, + "step": 1482 + }, + { + "epoch": 0.45261712192888753, + "grad_norm": 1.1981770218692536, + "learning_rate": 6.004227187849315e-06, + "loss": 0.2623, + "step": 1483 + }, + { + "epoch": 0.45292232565237295, + "grad_norm": 1.0048017849150068, + "learning_rate": 5.999383171937699e-06, + "loss": 0.2374, + "step": 1484 + }, + { + "epoch": 0.45322752937585836, + "grad_norm": 1.3030578120266822, + "learning_rate": 5.99453817879491e-06, + "loss": 0.32, + "step": 1485 + }, + { + "epoch": 0.45353273309934383, + "grad_norm": 1.7326148096661058, + "learning_rate": 5.989692213158547e-06, + "loss": 0.4677, + "step": 1486 + }, + { + "epoch": 0.45383793682282925, + "grad_norm": 1.6077233885750193, + "learning_rate": 5.984845279767161e-06, + "loss": 0.5707, + "step": 1487 + }, + { + "epoch": 0.45414314054631466, + "grad_norm": 1.290704871562614, + "learning_rate": 5.979997383360252e-06, + "loss": 0.3204, + "step": 1488 + }, + { + "epoch": 0.4544483442698001, + "grad_norm": 1.4568229301374171, + "learning_rate": 5.975148528678259e-06, + "loss": 0.3551, + "step": 1489 + }, + { + "epoch": 0.45475354799328555, + "grad_norm": 1.3095427303268419, + "learning_rate": 5.970298720462557e-06, + "loss": 0.2769, + "step": 1490 + }, + { + "epoch": 0.45505875171677096, + "grad_norm": 1.1544985522380489, + "learning_rate": 5.965447963455457e-06, + "loss": 0.3078, + "step": 1491 + }, + { + "epoch": 0.4553639554402564, + "grad_norm": 1.5958447621121368, + "learning_rate": 5.960596262400195e-06, + "loss": 0.2531, + "step": 1492 + }, + { + "epoch": 0.4556691591637418, + "grad_norm": 1.5606312830397104, + "learning_rate": 5.95574362204093e-06, + "loss": 0.3338, + "step": 1493 + }, + { + "epoch": 0.4559743628872272, + "grad_norm": 1.2261609920664052, + "learning_rate": 5.950890047122742e-06, + "loss": 0.3511, + "step": 1494 + }, + { + "epoch": 0.4562795666107127, + "grad_norm": 1.434002311342896, + "learning_rate": 5.9460355423916205e-06, + "loss": 0.3288, + "step": 1495 + }, + { + "epoch": 0.4565847703341981, + "grad_norm": 0.9862685988187032, + "learning_rate": 5.941180112594469e-06, + "loss": 0.2005, + "step": 1496 + }, + { + "epoch": 0.4568899740576835, + "grad_norm": 1.464340245497521, + "learning_rate": 5.936323762479093e-06, + "loss": 0.2452, + "step": 1497 + }, + { + "epoch": 0.4571951777811689, + "grad_norm": 1.554706899243958, + "learning_rate": 5.9314664967942e-06, + "loss": 0.33, + "step": 1498 + }, + { + "epoch": 0.45750038150465433, + "grad_norm": 1.4569953320859805, + "learning_rate": 5.926608320289388e-06, + "loss": 0.3072, + "step": 1499 + }, + { + "epoch": 0.4578055852281398, + "grad_norm": 1.4440211945507897, + "learning_rate": 5.92174923771515e-06, + "loss": 0.4295, + "step": 1500 + }, + { + "epoch": 0.4581107889516252, + "grad_norm": 1.669630451316585, + "learning_rate": 5.916889253822865e-06, + "loss": 0.5226, + "step": 1501 + }, + { + "epoch": 0.45841599267511063, + "grad_norm": 1.4250182736212258, + "learning_rate": 5.912028373364791e-06, + "loss": 0.3333, + "step": 1502 + }, + { + "epoch": 0.45872119639859604, + "grad_norm": 1.0328058008851029, + "learning_rate": 5.907166601094063e-06, + "loss": 0.2765, + "step": 1503 + }, + { + "epoch": 0.4590264001220815, + "grad_norm": 1.6987471002662056, + "learning_rate": 5.9023039417646895e-06, + "loss": 0.3082, + "step": 1504 + }, + { + "epoch": 0.4593316038455669, + "grad_norm": 1.3004454216451125, + "learning_rate": 5.897440400131546e-06, + "loss": 0.3204, + "step": 1505 + }, + { + "epoch": 0.45963680756905234, + "grad_norm": 1.4760594394056248, + "learning_rate": 5.8925759809503725e-06, + "loss": 0.4925, + "step": 1506 + }, + { + "epoch": 0.45994201129253776, + "grad_norm": 1.0003988543450875, + "learning_rate": 5.887710688977762e-06, + "loss": 0.2494, + "step": 1507 + }, + { + "epoch": 0.46024721501602317, + "grad_norm": 1.4474390829441592, + "learning_rate": 5.882844528971164e-06, + "loss": 0.421, + "step": 1508 + }, + { + "epoch": 0.46055241873950864, + "grad_norm": 1.4506829363533515, + "learning_rate": 5.8779775056888795e-06, + "loss": 0.3237, + "step": 1509 + }, + { + "epoch": 0.46085762246299405, + "grad_norm": 1.102194097730089, + "learning_rate": 5.873109623890047e-06, + "loss": 0.3195, + "step": 1510 + }, + { + "epoch": 0.46116282618647947, + "grad_norm": 1.4653483831299812, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.4283, + "step": 1511 + }, + { + "epoch": 0.4614680299099649, + "grad_norm": 1.5580902433592914, + "learning_rate": 5.863371303783511e-06, + "loss": 0.4983, + "step": 1512 + }, + { + "epoch": 0.46177323363345035, + "grad_norm": 1.859287163719373, + "learning_rate": 5.858500874998268e-06, + "loss": 0.4035, + "step": 1513 + }, + { + "epoch": 0.46207843735693577, + "grad_norm": 1.3899736057492478, + "learning_rate": 5.853629606741398e-06, + "loss": 0.3503, + "step": 1514 + }, + { + "epoch": 0.4623836410804212, + "grad_norm": 1.9654382371342167, + "learning_rate": 5.848757503776194e-06, + "loss": 0.2825, + "step": 1515 + }, + { + "epoch": 0.4626888448039066, + "grad_norm": 1.749083142563443, + "learning_rate": 5.843884570866764e-06, + "loss": 0.2898, + "step": 1516 + }, + { + "epoch": 0.462994048527392, + "grad_norm": 2.06586332345819, + "learning_rate": 5.839010812778033e-06, + "loss": 0.4441, + "step": 1517 + }, + { + "epoch": 0.4632992522508775, + "grad_norm": 1.416665899987107, + "learning_rate": 5.834136234275725e-06, + "loss": 0.2629, + "step": 1518 + }, + { + "epoch": 0.4636044559743629, + "grad_norm": 1.491807979233761, + "learning_rate": 5.829260840126371e-06, + "loss": 0.3957, + "step": 1519 + }, + { + "epoch": 0.4639096596978483, + "grad_norm": 1.823810409162459, + "learning_rate": 5.824384635097299e-06, + "loss": 0.3113, + "step": 1520 + }, + { + "epoch": 0.4642148634213337, + "grad_norm": 1.43257042224072, + "learning_rate": 5.819507623956631e-06, + "loss": 0.3604, + "step": 1521 + }, + { + "epoch": 0.4645200671448192, + "grad_norm": 1.6999961039554605, + "learning_rate": 5.814629811473274e-06, + "loss": 0.3102, + "step": 1522 + }, + { + "epoch": 0.4648252708683046, + "grad_norm": 1.35742920798098, + "learning_rate": 5.809751202416922e-06, + "loss": 0.2888, + "step": 1523 + }, + { + "epoch": 0.46513047459179, + "grad_norm": 1.7134434015762743, + "learning_rate": 5.804871801558045e-06, + "loss": 0.4024, + "step": 1524 + }, + { + "epoch": 0.46543567831527544, + "grad_norm": 1.1985209171155202, + "learning_rate": 5.79999161366789e-06, + "loss": 0.2442, + "step": 1525 + }, + { + "epoch": 0.46574088203876085, + "grad_norm": 1.3407548748143259, + "learning_rate": 5.795110643518473e-06, + "loss": 0.1733, + "step": 1526 + }, + { + "epoch": 0.4660460857622463, + "grad_norm": 1.5247299580605493, + "learning_rate": 5.790228895882571e-06, + "loss": 0.3787, + "step": 1527 + }, + { + "epoch": 0.46635128948573173, + "grad_norm": 1.535528006931427, + "learning_rate": 5.785346375533726e-06, + "loss": 0.4336, + "step": 1528 + }, + { + "epoch": 0.46665649320921715, + "grad_norm": 1.3568775430354145, + "learning_rate": 5.7804630872462354e-06, + "loss": 0.1561, + "step": 1529 + }, + { + "epoch": 0.46696169693270256, + "grad_norm": 1.5873546563844085, + "learning_rate": 5.775579035795145e-06, + "loss": 0.4776, + "step": 1530 + }, + { + "epoch": 0.467266900656188, + "grad_norm": 1.4509069270195842, + "learning_rate": 5.770694225956248e-06, + "loss": 0.3115, + "step": 1531 + }, + { + "epoch": 0.46757210437967345, + "grad_norm": 1.4283327448223122, + "learning_rate": 5.765808662506079e-06, + "loss": 0.3693, + "step": 1532 + }, + { + "epoch": 0.46787730810315886, + "grad_norm": 1.6978039694364109, + "learning_rate": 5.760922350221909e-06, + "loss": 0.2827, + "step": 1533 + }, + { + "epoch": 0.4681825118266443, + "grad_norm": 1.6065097227778868, + "learning_rate": 5.756035293881744e-06, + "loss": 0.2631, + "step": 1534 + }, + { + "epoch": 0.4684877155501297, + "grad_norm": 1.5416742616758474, + "learning_rate": 5.751147498264314e-06, + "loss": 0.5121, + "step": 1535 + }, + { + "epoch": 0.46879291927361516, + "grad_norm": 1.4642775598491673, + "learning_rate": 5.746258968149073e-06, + "loss": 0.411, + "step": 1536 + }, + { + "epoch": 0.4690981229971006, + "grad_norm": 1.8272083951571823, + "learning_rate": 5.7413697083161944e-06, + "loss": 0.3571, + "step": 1537 + }, + { + "epoch": 0.469403326720586, + "grad_norm": 1.4317064048986723, + "learning_rate": 5.736479723546564e-06, + "loss": 0.3125, + "step": 1538 + }, + { + "epoch": 0.4697085304440714, + "grad_norm": 1.314235538878401, + "learning_rate": 5.731589018621777e-06, + "loss": 0.1967, + "step": 1539 + }, + { + "epoch": 0.4700137341675568, + "grad_norm": 1.5035862966267712, + "learning_rate": 5.726697598324132e-06, + "loss": 0.3313, + "step": 1540 + }, + { + "epoch": 0.4703189378910423, + "grad_norm": 1.3404934923098695, + "learning_rate": 5.721805467436628e-06, + "loss": 0.3265, + "step": 1541 + }, + { + "epoch": 0.4706241416145277, + "grad_norm": 1.1548514284630382, + "learning_rate": 5.71691263074296e-06, + "loss": 0.1728, + "step": 1542 + }, + { + "epoch": 0.4709293453380131, + "grad_norm": 1.3183575665496856, + "learning_rate": 5.71201909302751e-06, + "loss": 0.431, + "step": 1543 + }, + { + "epoch": 0.47123454906149853, + "grad_norm": 1.4548282280692988, + "learning_rate": 5.707124859075346e-06, + "loss": 0.4382, + "step": 1544 + }, + { + "epoch": 0.471539752784984, + "grad_norm": 1.3806205661714575, + "learning_rate": 5.70222993367222e-06, + "loss": 0.1854, + "step": 1545 + }, + { + "epoch": 0.4718449565084694, + "grad_norm": 1.6537668579349547, + "learning_rate": 5.697334321604557e-06, + "loss": 0.4702, + "step": 1546 + }, + { + "epoch": 0.47215016023195483, + "grad_norm": 1.3352342505798407, + "learning_rate": 5.692438027659457e-06, + "loss": 0.4055, + "step": 1547 + }, + { + "epoch": 0.47245536395544024, + "grad_norm": 1.288274350677023, + "learning_rate": 5.68754105662468e-06, + "loss": 0.3408, + "step": 1548 + }, + { + "epoch": 0.47276056767892566, + "grad_norm": 1.4151500924241693, + "learning_rate": 5.682643413288656e-06, + "loss": 0.4746, + "step": 1549 + }, + { + "epoch": 0.47306577140241113, + "grad_norm": 1.2957954969534842, + "learning_rate": 5.677745102440468e-06, + "loss": 0.3128, + "step": 1550 + }, + { + "epoch": 0.47337097512589654, + "grad_norm": 1.4892185093430534, + "learning_rate": 5.672846128869853e-06, + "loss": 0.1999, + "step": 1551 + }, + { + "epoch": 0.47367617884938196, + "grad_norm": 1.159495669766031, + "learning_rate": 5.667946497367191e-06, + "loss": 0.2935, + "step": 1552 + }, + { + "epoch": 0.47398138257286737, + "grad_norm": 1.3909682803839567, + "learning_rate": 5.663046212723516e-06, + "loss": 0.4137, + "step": 1553 + }, + { + "epoch": 0.47428658629635284, + "grad_norm": 1.2459299583771315, + "learning_rate": 5.658145279730491e-06, + "loss": 0.255, + "step": 1554 + }, + { + "epoch": 0.47459179001983826, + "grad_norm": 1.3887686179326861, + "learning_rate": 5.65324370318042e-06, + "loss": 0.3186, + "step": 1555 + }, + { + "epoch": 0.47489699374332367, + "grad_norm": 1.6068867827569175, + "learning_rate": 5.648341487866228e-06, + "loss": 0.3965, + "step": 1556 + }, + { + "epoch": 0.4752021974668091, + "grad_norm": 1.5957802201770763, + "learning_rate": 5.643438638581472e-06, + "loss": 0.527, + "step": 1557 + }, + { + "epoch": 0.4755074011902945, + "grad_norm": 1.7927880289589633, + "learning_rate": 5.638535160120328e-06, + "loss": 0.5276, + "step": 1558 + }, + { + "epoch": 0.47581260491377997, + "grad_norm": 1.473208619804551, + "learning_rate": 5.633631057277582e-06, + "loss": 0.3348, + "step": 1559 + }, + { + "epoch": 0.4761178086372654, + "grad_norm": 1.1516190658610226, + "learning_rate": 5.6287263348486356e-06, + "loss": 0.2193, + "step": 1560 + }, + { + "epoch": 0.4764230123607508, + "grad_norm": 1.4141650926667002, + "learning_rate": 5.6238209976294955e-06, + "loss": 0.2721, + "step": 1561 + }, + { + "epoch": 0.4767282160842362, + "grad_norm": 1.342084606698652, + "learning_rate": 5.618915050416768e-06, + "loss": 0.3196, + "step": 1562 + }, + { + "epoch": 0.4770334198077217, + "grad_norm": 1.3503394795013224, + "learning_rate": 5.614008498007658e-06, + "loss": 0.3637, + "step": 1563 + }, + { + "epoch": 0.4773386235312071, + "grad_norm": 1.328424891104901, + "learning_rate": 5.60910134519996e-06, + "loss": 0.3218, + "step": 1564 + }, + { + "epoch": 0.4776438272546925, + "grad_norm": 1.3485172340980611, + "learning_rate": 5.604193596792054e-06, + "loss": 0.3297, + "step": 1565 + }, + { + "epoch": 0.4779490309781779, + "grad_norm": 1.2738033775398612, + "learning_rate": 5.599285257582911e-06, + "loss": 0.2263, + "step": 1566 + }, + { + "epoch": 0.47825423470166334, + "grad_norm": 1.3885241723681556, + "learning_rate": 5.594376332372068e-06, + "loss": 0.3373, + "step": 1567 + }, + { + "epoch": 0.4785594384251488, + "grad_norm": 1.1021865808381979, + "learning_rate": 5.589466825959642e-06, + "loss": 0.2387, + "step": 1568 + }, + { + "epoch": 0.4788646421486342, + "grad_norm": 1.428554398571414, + "learning_rate": 5.5845567431463175e-06, + "loss": 0.2956, + "step": 1569 + }, + { + "epoch": 0.47916984587211964, + "grad_norm": 1.3897546337563043, + "learning_rate": 5.579646088733343e-06, + "loss": 0.2354, + "step": 1570 + }, + { + "epoch": 0.47947504959560505, + "grad_norm": 1.4542362517109815, + "learning_rate": 5.574734867522523e-06, + "loss": 0.3921, + "step": 1571 + }, + { + "epoch": 0.47978025331909047, + "grad_norm": 1.7263399558968118, + "learning_rate": 5.569823084316219e-06, + "loss": 0.3029, + "step": 1572 + }, + { + "epoch": 0.48008545704257594, + "grad_norm": 1.621517558023657, + "learning_rate": 5.564910743917341e-06, + "loss": 0.3932, + "step": 1573 + }, + { + "epoch": 0.48039066076606135, + "grad_norm": 2.035620579678143, + "learning_rate": 5.559997851129346e-06, + "loss": 0.1695, + "step": 1574 + }, + { + "epoch": 0.48069586448954676, + "grad_norm": 1.3107818984028823, + "learning_rate": 5.555084410756226e-06, + "loss": 0.2542, + "step": 1575 + }, + { + "epoch": 0.4810010682130322, + "grad_norm": 1.6572997886831133, + "learning_rate": 5.550170427602513e-06, + "loss": 0.2124, + "step": 1576 + }, + { + "epoch": 0.48130627193651765, + "grad_norm": 2.748635560446825, + "learning_rate": 5.54525590647327e-06, + "loss": 0.3052, + "step": 1577 + }, + { + "epoch": 0.48161147566000306, + "grad_norm": 1.995315190898183, + "learning_rate": 5.540340852174083e-06, + "loss": 0.3363, + "step": 1578 + }, + { + "epoch": 0.4819166793834885, + "grad_norm": 1.5715192267042497, + "learning_rate": 5.53542526951106e-06, + "loss": 0.2548, + "step": 1579 + }, + { + "epoch": 0.4822218831069739, + "grad_norm": 1.4893926040435048, + "learning_rate": 5.530509163290829e-06, + "loss": 0.371, + "step": 1580 + }, + { + "epoch": 0.4825270868304593, + "grad_norm": 1.6286037539259315, + "learning_rate": 5.525592538320526e-06, + "loss": 0.376, + "step": 1581 + }, + { + "epoch": 0.4828322905539448, + "grad_norm": 1.7629836588949888, + "learning_rate": 5.520675399407795e-06, + "loss": 0.5786, + "step": 1582 + }, + { + "epoch": 0.4831374942774302, + "grad_norm": 1.4728271223346432, + "learning_rate": 5.515757751360784e-06, + "loss": 0.3431, + "step": 1583 + }, + { + "epoch": 0.4834426980009156, + "grad_norm": 1.1580318948342616, + "learning_rate": 5.510839598988137e-06, + "loss": 0.3826, + "step": 1584 + }, + { + "epoch": 0.483747901724401, + "grad_norm": 1.337925018091232, + "learning_rate": 5.505920947098995e-06, + "loss": 0.2725, + "step": 1585 + }, + { + "epoch": 0.4840531054478865, + "grad_norm": 1.6461638187220182, + "learning_rate": 5.50100180050298e-06, + "loss": 0.4827, + "step": 1586 + }, + { + "epoch": 0.4843583091713719, + "grad_norm": 1.4549773520578322, + "learning_rate": 5.496082164010207e-06, + "loss": 0.2952, + "step": 1587 + }, + { + "epoch": 0.4846635128948573, + "grad_norm": 1.4982046668394888, + "learning_rate": 5.491162042431264e-06, + "loss": 0.46, + "step": 1588 + }, + { + "epoch": 0.48496871661834273, + "grad_norm": 1.7000123023541676, + "learning_rate": 5.486241440577214e-06, + "loss": 0.2381, + "step": 1589 + }, + { + "epoch": 0.48527392034182815, + "grad_norm": 1.4018936497489025, + "learning_rate": 5.48132036325959e-06, + "loss": 0.3872, + "step": 1590 + }, + { + "epoch": 0.4855791240653136, + "grad_norm": 1.4151807546921091, + "learning_rate": 5.476398815290394e-06, + "loss": 0.381, + "step": 1591 + }, + { + "epoch": 0.48588432778879903, + "grad_norm": 0.9876963577129196, + "learning_rate": 5.471476801482079e-06, + "loss": 0.1868, + "step": 1592 + }, + { + "epoch": 0.48618953151228445, + "grad_norm": 1.6726232975328992, + "learning_rate": 5.466554326647564e-06, + "loss": 0.3994, + "step": 1593 + }, + { + "epoch": 0.48649473523576986, + "grad_norm": 1.2261133525745127, + "learning_rate": 5.461631395600208e-06, + "loss": 0.3083, + "step": 1594 + }, + { + "epoch": 0.48679993895925533, + "grad_norm": 2.0566916319054735, + "learning_rate": 5.456708013153829e-06, + "loss": 0.4756, + "step": 1595 + }, + { + "epoch": 0.48710514268274074, + "grad_norm": 1.4786102788703337, + "learning_rate": 5.451784184122676e-06, + "loss": 0.5202, + "step": 1596 + }, + { + "epoch": 0.48741034640622616, + "grad_norm": 1.3638063310410486, + "learning_rate": 5.446859913321437e-06, + "loss": 0.316, + "step": 1597 + }, + { + "epoch": 0.4877155501297116, + "grad_norm": 1.4931742815624802, + "learning_rate": 5.4419352055652345e-06, + "loss": 0.3344, + "step": 1598 + }, + { + "epoch": 0.488020753853197, + "grad_norm": 1.5569878051025439, + "learning_rate": 5.437010065669618e-06, + "loss": 0.3727, + "step": 1599 + }, + { + "epoch": 0.48832595757668246, + "grad_norm": 1.1973477657449398, + "learning_rate": 5.432084498450555e-06, + "loss": 0.253, + "step": 1600 + }, + { + "epoch": 0.48863116130016787, + "grad_norm": 1.4804547392250123, + "learning_rate": 5.427158508724436e-06, + "loss": 0.1928, + "step": 1601 + }, + { + "epoch": 0.4889363650236533, + "grad_norm": 1.5889388374815445, + "learning_rate": 5.422232101308064e-06, + "loss": 0.3242, + "step": 1602 + }, + { + "epoch": 0.4892415687471387, + "grad_norm": 1.4732208383918126, + "learning_rate": 5.417305281018648e-06, + "loss": 0.3179, + "step": 1603 + }, + { + "epoch": 0.4895467724706241, + "grad_norm": 1.5778071772757023, + "learning_rate": 5.4123780526738025e-06, + "loss": 0.3705, + "step": 1604 + }, + { + "epoch": 0.4898519761941096, + "grad_norm": 1.5952772414956238, + "learning_rate": 5.4074504210915405e-06, + "loss": 0.4752, + "step": 1605 + }, + { + "epoch": 0.490157179917595, + "grad_norm": 1.5327834411417531, + "learning_rate": 5.402522391090268e-06, + "loss": 0.2701, + "step": 1606 + }, + { + "epoch": 0.4904623836410804, + "grad_norm": 1.4174549863957768, + "learning_rate": 5.397593967488784e-06, + "loss": 0.305, + "step": 1607 + }, + { + "epoch": 0.4907675873645658, + "grad_norm": 1.4105364426993232, + "learning_rate": 5.3926651551062696e-06, + "loss": 0.2822, + "step": 1608 + }, + { + "epoch": 0.4910727910880513, + "grad_norm": 1.496829496898576, + "learning_rate": 5.387735958762284e-06, + "loss": 0.2455, + "step": 1609 + }, + { + "epoch": 0.4913779948115367, + "grad_norm": 1.3744912073153184, + "learning_rate": 5.38280638327677e-06, + "loss": 0.222, + "step": 1610 + }, + { + "epoch": 0.4916831985350221, + "grad_norm": 2.181978942265985, + "learning_rate": 5.377876433470031e-06, + "loss": 0.2868, + "step": 1611 + }, + { + "epoch": 0.49198840225850754, + "grad_norm": 1.6941287922726327, + "learning_rate": 5.372946114162745e-06, + "loss": 0.334, + "step": 1612 + }, + { + "epoch": 0.49229360598199295, + "grad_norm": 1.2726505398841743, + "learning_rate": 5.3680154301759435e-06, + "loss": 0.258, + "step": 1613 + }, + { + "epoch": 0.4925988097054784, + "grad_norm": 1.6061588484137834, + "learning_rate": 5.363084386331021e-06, + "loss": 0.4377, + "step": 1614 + }, + { + "epoch": 0.49290401342896384, + "grad_norm": 1.318008850970961, + "learning_rate": 5.358152987449722e-06, + "loss": 0.2085, + "step": 1615 + }, + { + "epoch": 0.49320921715244925, + "grad_norm": 1.2747270104863346, + "learning_rate": 5.353221238354136e-06, + "loss": 0.2869, + "step": 1616 + }, + { + "epoch": 0.49351442087593467, + "grad_norm": 1.6264464469792288, + "learning_rate": 5.348289143866696e-06, + "loss": 0.5463, + "step": 1617 + }, + { + "epoch": 0.49381962459942014, + "grad_norm": 1.455135472338877, + "learning_rate": 5.343356708810176e-06, + "loss": 0.4526, + "step": 1618 + }, + { + "epoch": 0.49412482832290555, + "grad_norm": 1.1443888604966979, + "learning_rate": 5.338423938007677e-06, + "loss": 0.2367, + "step": 1619 + }, + { + "epoch": 0.49443003204639097, + "grad_norm": 1.351694662875841, + "learning_rate": 5.3334908362826355e-06, + "loss": 0.1923, + "step": 1620 + }, + { + "epoch": 0.4947352357698764, + "grad_norm": 1.3722275613293962, + "learning_rate": 5.328557408458805e-06, + "loss": 0.2233, + "step": 1621 + }, + { + "epoch": 0.4950404394933618, + "grad_norm": 1.4038649360296689, + "learning_rate": 5.323623659360261e-06, + "loss": 0.2664, + "step": 1622 + }, + { + "epoch": 0.49534564321684726, + "grad_norm": 1.3069014353110344, + "learning_rate": 5.3186895938113934e-06, + "loss": 0.3407, + "step": 1623 + }, + { + "epoch": 0.4956508469403327, + "grad_norm": 1.2914138618332602, + "learning_rate": 5.3137552166369e-06, + "loss": 0.2242, + "step": 1624 + }, + { + "epoch": 0.4959560506638181, + "grad_norm": 1.4023515706388823, + "learning_rate": 5.3088205326617846e-06, + "loss": 0.432, + "step": 1625 + }, + { + "epoch": 0.4962612543873035, + "grad_norm": 4.034863885145975, + "learning_rate": 5.303885546711352e-06, + "loss": 0.3231, + "step": 1626 + }, + { + "epoch": 0.496566458110789, + "grad_norm": 1.3258072319474576, + "learning_rate": 5.298950263611197e-06, + "loss": 0.4098, + "step": 1627 + }, + { + "epoch": 0.4968716618342744, + "grad_norm": 1.4994044711169068, + "learning_rate": 5.294014688187213e-06, + "loss": 0.1783, + "step": 1628 + }, + { + "epoch": 0.4971768655577598, + "grad_norm": 1.0794348016638573, + "learning_rate": 5.289078825265573e-06, + "loss": 0.2446, + "step": 1629 + }, + { + "epoch": 0.4974820692812452, + "grad_norm": 1.2325800145854007, + "learning_rate": 5.284142679672733e-06, + "loss": 0.3066, + "step": 1630 + }, + { + "epoch": 0.49778727300473063, + "grad_norm": 1.5888945724293768, + "learning_rate": 5.279206256235429e-06, + "loss": 0.2676, + "step": 1631 + }, + { + "epoch": 0.4980924767282161, + "grad_norm": 1.5276496756436473, + "learning_rate": 5.27426955978066e-06, + "loss": 0.3483, + "step": 1632 + }, + { + "epoch": 0.4983976804517015, + "grad_norm": 1.621231110050268, + "learning_rate": 5.2693325951357e-06, + "loss": 0.4674, + "step": 1633 + }, + { + "epoch": 0.49870288417518693, + "grad_norm": 1.625742375918665, + "learning_rate": 5.264395367128085e-06, + "loss": 0.5141, + "step": 1634 + }, + { + "epoch": 0.49900808789867235, + "grad_norm": 1.3288540802436513, + "learning_rate": 5.259457880585599e-06, + "loss": 0.2809, + "step": 1635 + }, + { + "epoch": 0.4993132916221578, + "grad_norm": 1.4498227241274997, + "learning_rate": 5.254520140336293e-06, + "loss": 0.3109, + "step": 1636 + }, + { + "epoch": 0.49961849534564323, + "grad_norm": 1.411890754290964, + "learning_rate": 5.249582151208456e-06, + "loss": 0.3035, + "step": 1637 + }, + { + "epoch": 0.49992369906912865, + "grad_norm": 1.827351333060968, + "learning_rate": 5.244643918030623e-06, + "loss": 0.2952, + "step": 1638 + }, + { + "epoch": 0.5002289027926141, + "grad_norm": 1.4941225138046528, + "learning_rate": 5.23970544563157e-06, + "loss": 0.349, + "step": 1639 + }, + { + "epoch": 0.5005341065160995, + "grad_norm": 1.359880901338943, + "learning_rate": 5.234766738840301e-06, + "loss": 0.3015, + "step": 1640 + }, + { + "epoch": 0.500839310239585, + "grad_norm": 1.0754563848784184, + "learning_rate": 5.229827802486057e-06, + "loss": 0.2577, + "step": 1641 + }, + { + "epoch": 0.5011445139630704, + "grad_norm": 1.1574633649791777, + "learning_rate": 5.224888641398298e-06, + "loss": 0.2131, + "step": 1642 + }, + { + "epoch": 0.5014497176865558, + "grad_norm": 1.3888886144425863, + "learning_rate": 5.219949260406702e-06, + "loss": 0.2927, + "step": 1643 + }, + { + "epoch": 0.5017549214100412, + "grad_norm": 1.2080626787509892, + "learning_rate": 5.215009664341172e-06, + "loss": 0.1779, + "step": 1644 + }, + { + "epoch": 0.5020601251335266, + "grad_norm": 1.5390181946549573, + "learning_rate": 5.210069858031809e-06, + "loss": 0.4169, + "step": 1645 + }, + { + "epoch": 0.502365328857012, + "grad_norm": 1.6666794379062893, + "learning_rate": 5.205129846308927e-06, + "loss": 0.5059, + "step": 1646 + }, + { + "epoch": 0.5026705325804974, + "grad_norm": 1.1965539506640186, + "learning_rate": 5.200189634003041e-06, + "loss": 0.2838, + "step": 1647 + }, + { + "epoch": 0.502975736303983, + "grad_norm": 1.5427230446178664, + "learning_rate": 5.195249225944858e-06, + "loss": 0.5243, + "step": 1648 + }, + { + "epoch": 0.5032809400274684, + "grad_norm": 1.713220684130789, + "learning_rate": 5.190308626965278e-06, + "loss": 0.4358, + "step": 1649 + }, + { + "epoch": 0.5035861437509538, + "grad_norm": 1.972570557196598, + "learning_rate": 5.18536784189539e-06, + "loss": 0.1848, + "step": 1650 + }, + { + "epoch": 0.5038913474744392, + "grad_norm": 1.6571295939699278, + "learning_rate": 5.180426875566461e-06, + "loss": 0.6615, + "step": 1651 + }, + { + "epoch": 0.5041965511979246, + "grad_norm": 1.437005165181072, + "learning_rate": 5.175485732809941e-06, + "loss": 0.308, + "step": 1652 + }, + { + "epoch": 0.50450175492141, + "grad_norm": 1.0623456057400325, + "learning_rate": 5.170544418457447e-06, + "loss": 0.1621, + "step": 1653 + }, + { + "epoch": 0.5048069586448954, + "grad_norm": 1.7579201898436299, + "learning_rate": 5.165602937340765e-06, + "loss": 0.3202, + "step": 1654 + }, + { + "epoch": 0.5051121623683809, + "grad_norm": 1.5000292457272657, + "learning_rate": 5.160661294291845e-06, + "loss": 0.2291, + "step": 1655 + }, + { + "epoch": 0.5054173660918663, + "grad_norm": 0.9777395860180313, + "learning_rate": 5.155719494142799e-06, + "loss": 0.1873, + "step": 1656 + }, + { + "epoch": 0.5057225698153518, + "grad_norm": 1.503059921917863, + "learning_rate": 5.150777541725884e-06, + "loss": 0.2805, + "step": 1657 + }, + { + "epoch": 0.5060277735388372, + "grad_norm": 1.672907929102723, + "learning_rate": 5.145835441873514e-06, + "loss": 0.3044, + "step": 1658 + }, + { + "epoch": 0.5063329772623226, + "grad_norm": 2.0294129500910034, + "learning_rate": 5.140893199418238e-06, + "loss": 0.3157, + "step": 1659 + }, + { + "epoch": 0.506638180985808, + "grad_norm": 1.3093133570470568, + "learning_rate": 5.1359508191927585e-06, + "loss": 0.2408, + "step": 1660 + }, + { + "epoch": 0.5069433847092935, + "grad_norm": 1.3018779083966656, + "learning_rate": 5.131008306029901e-06, + "loss": 0.3438, + "step": 1661 + }, + { + "epoch": 0.5072485884327789, + "grad_norm": 1.5384066970844072, + "learning_rate": 5.126065664762623e-06, + "loss": 0.5485, + "step": 1662 + }, + { + "epoch": 0.5075537921562643, + "grad_norm": 1.249830902106062, + "learning_rate": 5.12112290022401e-06, + "loss": 0.2663, + "step": 1663 + }, + { + "epoch": 0.5078589958797497, + "grad_norm": 1.2530061812921935, + "learning_rate": 5.11618001724727e-06, + "loss": 0.2481, + "step": 1664 + }, + { + "epoch": 0.5081641996032351, + "grad_norm": 1.344296410824843, + "learning_rate": 5.111237020665718e-06, + "loss": 0.225, + "step": 1665 + }, + { + "epoch": 0.5084694033267206, + "grad_norm": 1.423868312190065, + "learning_rate": 5.106293915312788e-06, + "loss": 0.2623, + "step": 1666 + }, + { + "epoch": 0.508774607050206, + "grad_norm": 1.3149812496139412, + "learning_rate": 5.10135070602202e-06, + "loss": 0.2228, + "step": 1667 + }, + { + "epoch": 0.5090798107736915, + "grad_norm": 1.4409444425833837, + "learning_rate": 5.096407397627051e-06, + "loss": 0.5072, + "step": 1668 + }, + { + "epoch": 0.5093850144971769, + "grad_norm": 1.348503222582244, + "learning_rate": 5.09146399496162e-06, + "loss": 0.388, + "step": 1669 + }, + { + "epoch": 0.5096902182206623, + "grad_norm": 1.2483916903268428, + "learning_rate": 5.086520502859556e-06, + "loss": 0.2071, + "step": 1670 + }, + { + "epoch": 0.5099954219441477, + "grad_norm": 1.3998356654583803, + "learning_rate": 5.081576926154772e-06, + "loss": 0.2387, + "step": 1671 + }, + { + "epoch": 0.5103006256676331, + "grad_norm": 1.5289407561496389, + "learning_rate": 5.07663326968127e-06, + "loss": 0.5692, + "step": 1672 + }, + { + "epoch": 0.5106058293911185, + "grad_norm": 1.234600239737849, + "learning_rate": 5.071689538273126e-06, + "loss": 0.2844, + "step": 1673 + }, + { + "epoch": 0.510911033114604, + "grad_norm": 1.2635731960722638, + "learning_rate": 5.0667457367644904e-06, + "loss": 0.3551, + "step": 1674 + }, + { + "epoch": 0.5112162368380895, + "grad_norm": 1.3538142063487089, + "learning_rate": 5.061801869989582e-06, + "loss": 0.3053, + "step": 1675 + }, + { + "epoch": 0.5115214405615749, + "grad_norm": 1.2172009821134098, + "learning_rate": 5.056857942782682e-06, + "loss": 0.2138, + "step": 1676 + }, + { + "epoch": 0.5118266442850603, + "grad_norm": 1.5879432509969746, + "learning_rate": 5.051913959978136e-06, + "loss": 0.4392, + "step": 1677 + }, + { + "epoch": 0.5121318480085457, + "grad_norm": 1.9377433870094682, + "learning_rate": 5.046969926410336e-06, + "loss": 0.3714, + "step": 1678 + }, + { + "epoch": 0.5124370517320311, + "grad_norm": 1.2846490850622998, + "learning_rate": 5.042025846913731e-06, + "loss": 0.3023, + "step": 1679 + }, + { + "epoch": 0.5127422554555165, + "grad_norm": 1.584798018284521, + "learning_rate": 5.03708172632281e-06, + "loss": 0.5201, + "step": 1680 + }, + { + "epoch": 0.513047459179002, + "grad_norm": 1.3353130822121289, + "learning_rate": 5.032137569472103e-06, + "loss": 0.2252, + "step": 1681 + }, + { + "epoch": 0.5133526629024874, + "grad_norm": 1.2506857898127242, + "learning_rate": 5.027193381196178e-06, + "loss": 0.3043, + "step": 1682 + }, + { + "epoch": 0.5136578666259728, + "grad_norm": 1.7478079009136256, + "learning_rate": 5.022249166329633e-06, + "loss": 0.2608, + "step": 1683 + }, + { + "epoch": 0.5139630703494583, + "grad_norm": 1.4253991019869026, + "learning_rate": 5.017304929707085e-06, + "loss": 0.3544, + "step": 1684 + }, + { + "epoch": 0.5142682740729437, + "grad_norm": 1.0280421678748557, + "learning_rate": 5.012360676163186e-06, + "loss": 0.1793, + "step": 1685 + }, + { + "epoch": 0.5145734777964291, + "grad_norm": 1.1098929257342118, + "learning_rate": 5.007416410532593e-06, + "loss": 0.1695, + "step": 1686 + }, + { + "epoch": 0.5148786815199146, + "grad_norm": 1.192133647057363, + "learning_rate": 5.0024721376499775e-06, + "loss": 0.2837, + "step": 1687 + }, + { + "epoch": 0.5151838852434, + "grad_norm": 1.5272365054360038, + "learning_rate": 4.997527862350023e-06, + "loss": 0.4632, + "step": 1688 + }, + { + "epoch": 0.5154890889668854, + "grad_norm": 1.7459738602751647, + "learning_rate": 4.992583589467409e-06, + "loss": 0.3587, + "step": 1689 + }, + { + "epoch": 0.5157942926903708, + "grad_norm": 1.59178534193483, + "learning_rate": 4.987639323836815e-06, + "loss": 0.4888, + "step": 1690 + }, + { + "epoch": 0.5160994964138562, + "grad_norm": 1.452831197590841, + "learning_rate": 4.982695070292916e-06, + "loss": 0.3795, + "step": 1691 + }, + { + "epoch": 0.5164047001373416, + "grad_norm": 1.4771465781176318, + "learning_rate": 4.9777508336703705e-06, + "loss": 0.3817, + "step": 1692 + }, + { + "epoch": 0.516709903860827, + "grad_norm": 1.6254273366231557, + "learning_rate": 4.972806618803823e-06, + "loss": 0.4285, + "step": 1693 + }, + { + "epoch": 0.5170151075843126, + "grad_norm": 1.4842668995896693, + "learning_rate": 4.967862430527899e-06, + "loss": 0.2605, + "step": 1694 + }, + { + "epoch": 0.517320311307798, + "grad_norm": 1.4032108102229814, + "learning_rate": 4.962918273677192e-06, + "loss": 0.3153, + "step": 1695 + }, + { + "epoch": 0.5176255150312834, + "grad_norm": 1.5402384491648182, + "learning_rate": 4.957974153086269e-06, + "loss": 0.4933, + "step": 1696 + }, + { + "epoch": 0.5179307187547688, + "grad_norm": 1.361335436939541, + "learning_rate": 4.953030073589663e-06, + "loss": 0.3291, + "step": 1697 + }, + { + "epoch": 0.5182359224782542, + "grad_norm": 1.6322832481817982, + "learning_rate": 4.948086040021865e-06, + "loss": 0.4752, + "step": 1698 + }, + { + "epoch": 0.5185411262017396, + "grad_norm": 1.4430096400012284, + "learning_rate": 4.943142057217318e-06, + "loss": 0.2248, + "step": 1699 + }, + { + "epoch": 0.5188463299252251, + "grad_norm": 1.4209700838358612, + "learning_rate": 4.938198130010419e-06, + "loss": 0.2256, + "step": 1700 + }, + { + "epoch": 0.5191515336487105, + "grad_norm": 1.3308099870220456, + "learning_rate": 4.933254263235511e-06, + "loss": 0.2792, + "step": 1701 + }, + { + "epoch": 0.5194567373721959, + "grad_norm": 1.435096555910569, + "learning_rate": 4.928310461726876e-06, + "loss": 0.2201, + "step": 1702 + }, + { + "epoch": 0.5197619410956814, + "grad_norm": 1.2795476536614092, + "learning_rate": 4.923366730318731e-06, + "loss": 0.3959, + "step": 1703 + }, + { + "epoch": 0.5200671448191668, + "grad_norm": 1.445568000273355, + "learning_rate": 4.91842307384523e-06, + "loss": 0.3239, + "step": 1704 + }, + { + "epoch": 0.5203723485426522, + "grad_norm": 1.3882780522624698, + "learning_rate": 4.913479497140447e-06, + "loss": 0.3565, + "step": 1705 + }, + { + "epoch": 0.5206775522661377, + "grad_norm": 1.4812796996153128, + "learning_rate": 4.9085360050383805e-06, + "loss": 0.3752, + "step": 1706 + }, + { + "epoch": 0.5209827559896231, + "grad_norm": 1.257425047155564, + "learning_rate": 4.90359260237295e-06, + "loss": 0.1718, + "step": 1707 + }, + { + "epoch": 0.5212879597131085, + "grad_norm": 1.6987322933124722, + "learning_rate": 4.898649293977982e-06, + "loss": 0.349, + "step": 1708 + }, + { + "epoch": 0.5215931634365939, + "grad_norm": 1.1895206726638172, + "learning_rate": 4.893706084687213e-06, + "loss": 0.2822, + "step": 1709 + }, + { + "epoch": 0.5218983671600793, + "grad_norm": 1.3543095537752436, + "learning_rate": 4.888762979334285e-06, + "loss": 0.2793, + "step": 1710 + }, + { + "epoch": 0.5222035708835647, + "grad_norm": 1.6001037653415573, + "learning_rate": 4.883819982752733e-06, + "loss": 0.5458, + "step": 1711 + }, + { + "epoch": 0.5225087746070503, + "grad_norm": 1.5693779437099424, + "learning_rate": 4.878877099775991e-06, + "loss": 0.3162, + "step": 1712 + }, + { + "epoch": 0.5228139783305357, + "grad_norm": 1.4074639908298856, + "learning_rate": 4.8739343352373765e-06, + "loss": 0.3206, + "step": 1713 + }, + { + "epoch": 0.5231191820540211, + "grad_norm": 1.087432984227466, + "learning_rate": 4.868991693970099e-06, + "loss": 0.2167, + "step": 1714 + }, + { + "epoch": 0.5234243857775065, + "grad_norm": 1.4493333303392464, + "learning_rate": 4.864049180807242e-06, + "loss": 0.3923, + "step": 1715 + }, + { + "epoch": 0.5237295895009919, + "grad_norm": 1.6976032864998114, + "learning_rate": 4.859106800581762e-06, + "loss": 0.3282, + "step": 1716 + }, + { + "epoch": 0.5240347932244773, + "grad_norm": 1.1794888158561472, + "learning_rate": 4.854164558126489e-06, + "loss": 0.2228, + "step": 1717 + }, + { + "epoch": 0.5243399969479627, + "grad_norm": 1.2969136060001154, + "learning_rate": 4.849222458274118e-06, + "loss": 0.2679, + "step": 1718 + }, + { + "epoch": 0.5246452006714482, + "grad_norm": 1.7402174549586045, + "learning_rate": 4.844280505857203e-06, + "loss": 0.2213, + "step": 1719 + }, + { + "epoch": 0.5249504043949336, + "grad_norm": 1.604727543792755, + "learning_rate": 4.839338705708156e-06, + "loss": 0.4238, + "step": 1720 + }, + { + "epoch": 0.5252556081184191, + "grad_norm": 1.2421408590521474, + "learning_rate": 4.834397062659236e-06, + "loss": 0.3278, + "step": 1721 + }, + { + "epoch": 0.5255608118419045, + "grad_norm": 1.724100143577579, + "learning_rate": 4.829455581542555e-06, + "loss": 0.3619, + "step": 1722 + }, + { + "epoch": 0.5258660155653899, + "grad_norm": 1.3189581973354423, + "learning_rate": 4.824514267190061e-06, + "loss": 0.2873, + "step": 1723 + }, + { + "epoch": 0.5261712192888753, + "grad_norm": 1.2491657334100055, + "learning_rate": 4.81957312443354e-06, + "loss": 0.2806, + "step": 1724 + }, + { + "epoch": 0.5264764230123608, + "grad_norm": 1.2884089807374433, + "learning_rate": 4.814632158104612e-06, + "loss": 0.2944, + "step": 1725 + }, + { + "epoch": 0.5267816267358462, + "grad_norm": 1.5336454688828358, + "learning_rate": 4.809691373034725e-06, + "loss": 0.3756, + "step": 1726 + }, + { + "epoch": 0.5270868304593316, + "grad_norm": 1.1703041738460964, + "learning_rate": 4.804750774055145e-06, + "loss": 0.3417, + "step": 1727 + }, + { + "epoch": 0.527392034182817, + "grad_norm": 1.4976332589595074, + "learning_rate": 4.799810365996961e-06, + "loss": 0.4417, + "step": 1728 + }, + { + "epoch": 0.5276972379063024, + "grad_norm": 1.65105783191874, + "learning_rate": 4.794870153691072e-06, + "loss": 0.2322, + "step": 1729 + }, + { + "epoch": 0.5280024416297879, + "grad_norm": 1.122193650919566, + "learning_rate": 4.789930141968191e-06, + "loss": 0.2484, + "step": 1730 + }, + { + "epoch": 0.5283076453532733, + "grad_norm": 1.3721395563321508, + "learning_rate": 4.784990335658829e-06, + "loss": 0.2416, + "step": 1731 + }, + { + "epoch": 0.5286128490767588, + "grad_norm": 2.1794325156426564, + "learning_rate": 4.780050739593298e-06, + "loss": 0.3152, + "step": 1732 + }, + { + "epoch": 0.5289180528002442, + "grad_norm": 1.4591445937013092, + "learning_rate": 4.7751113586017036e-06, + "loss": 0.3311, + "step": 1733 + }, + { + "epoch": 0.5292232565237296, + "grad_norm": 1.3029702586412188, + "learning_rate": 4.770172197513944e-06, + "loss": 0.2442, + "step": 1734 + }, + { + "epoch": 0.529528460247215, + "grad_norm": 1.4553221632341957, + "learning_rate": 4.7652332611596996e-06, + "loss": 0.2228, + "step": 1735 + }, + { + "epoch": 0.5298336639707004, + "grad_norm": 1.7556590813614814, + "learning_rate": 4.760294554368432e-06, + "loss": 0.4196, + "step": 1736 + }, + { + "epoch": 0.5301388676941858, + "grad_norm": 1.518317279472581, + "learning_rate": 4.755356081969378e-06, + "loss": 0.3037, + "step": 1737 + }, + { + "epoch": 0.5304440714176712, + "grad_norm": 1.6122834673847743, + "learning_rate": 4.750417848791545e-06, + "loss": 0.3145, + "step": 1738 + }, + { + "epoch": 0.5307492751411568, + "grad_norm": 1.4647223256965953, + "learning_rate": 4.7454798596637075e-06, + "loss": 0.4074, + "step": 1739 + }, + { + "epoch": 0.5310544788646422, + "grad_norm": 1.6148928519923968, + "learning_rate": 4.740542119414402e-06, + "loss": 0.3707, + "step": 1740 + }, + { + "epoch": 0.5313596825881276, + "grad_norm": 1.7828794020487633, + "learning_rate": 4.735604632871919e-06, + "loss": 0.3663, + "step": 1741 + }, + { + "epoch": 0.531664886311613, + "grad_norm": 1.7313254412247303, + "learning_rate": 4.730667404864301e-06, + "loss": 0.3853, + "step": 1742 + }, + { + "epoch": 0.5319700900350984, + "grad_norm": 1.6644547407620942, + "learning_rate": 4.725730440219342e-06, + "loss": 0.3523, + "step": 1743 + }, + { + "epoch": 0.5322752937585838, + "grad_norm": 1.491477837500527, + "learning_rate": 4.720793743764574e-06, + "loss": 0.293, + "step": 1744 + }, + { + "epoch": 0.5325804974820693, + "grad_norm": 1.649985215091933, + "learning_rate": 4.715857320327267e-06, + "loss": 0.3803, + "step": 1745 + }, + { + "epoch": 0.5328857012055547, + "grad_norm": 1.331924120938343, + "learning_rate": 4.7109211747344276e-06, + "loss": 0.2943, + "step": 1746 + }, + { + "epoch": 0.5331909049290401, + "grad_norm": 1.651195514943811, + "learning_rate": 4.705985311812788e-06, + "loss": 0.3402, + "step": 1747 + }, + { + "epoch": 0.5334961086525256, + "grad_norm": 1.858088007872116, + "learning_rate": 4.701049736388804e-06, + "loss": 0.757, + "step": 1748 + }, + { + "epoch": 0.533801312376011, + "grad_norm": 1.3341596049865094, + "learning_rate": 4.69611445328865e-06, + "loss": 0.3517, + "step": 1749 + }, + { + "epoch": 0.5341065160994964, + "grad_norm": 1.349613438115718, + "learning_rate": 4.691179467338217e-06, + "loss": 0.239, + "step": 1750 + }, + { + "epoch": 0.5344117198229819, + "grad_norm": 1.3311127211303257, + "learning_rate": 4.6862447833631015e-06, + "loss": 0.1743, + "step": 1751 + }, + { + "epoch": 0.5347169235464673, + "grad_norm": 1.6728820619848093, + "learning_rate": 4.681310406188607e-06, + "loss": 0.3636, + "step": 1752 + }, + { + "epoch": 0.5350221272699527, + "grad_norm": 1.340753007659583, + "learning_rate": 4.67637634063974e-06, + "loss": 0.3137, + "step": 1753 + }, + { + "epoch": 0.5353273309934381, + "grad_norm": 1.6789840289703037, + "learning_rate": 4.671442591541196e-06, + "loss": 0.6145, + "step": 1754 + }, + { + "epoch": 0.5356325347169235, + "grad_norm": 1.3813820892281865, + "learning_rate": 4.666509163717365e-06, + "loss": 0.3854, + "step": 1755 + }, + { + "epoch": 0.5359377384404089, + "grad_norm": 1.519369945447102, + "learning_rate": 4.6615760619923235e-06, + "loss": 0.4617, + "step": 1756 + }, + { + "epoch": 0.5362429421638945, + "grad_norm": 1.2407672848532267, + "learning_rate": 4.656643291189826e-06, + "loss": 0.2891, + "step": 1757 + }, + { + "epoch": 0.5365481458873799, + "grad_norm": 1.6907484083561595, + "learning_rate": 4.6517108561333055e-06, + "loss": 0.439, + "step": 1758 + }, + { + "epoch": 0.5368533496108653, + "grad_norm": 1.307296793447627, + "learning_rate": 4.646778761645867e-06, + "loss": 0.3114, + "step": 1759 + }, + { + "epoch": 0.5371585533343507, + "grad_norm": 1.4060761238262538, + "learning_rate": 4.641847012550281e-06, + "loss": 0.3102, + "step": 1760 + }, + { + "epoch": 0.5374637570578361, + "grad_norm": 1.1585526091871678, + "learning_rate": 4.63691561366898e-06, + "loss": 0.344, + "step": 1761 + }, + { + "epoch": 0.5377689607813215, + "grad_norm": 1.2551860517157176, + "learning_rate": 4.6319845698240564e-06, + "loss": 0.3574, + "step": 1762 + }, + { + "epoch": 0.5380741645048069, + "grad_norm": 1.057066149533723, + "learning_rate": 4.627053885837257e-06, + "loss": 0.2527, + "step": 1763 + }, + { + "epoch": 0.5383793682282924, + "grad_norm": 1.3899142559966882, + "learning_rate": 4.622123566529969e-06, + "loss": 0.4238, + "step": 1764 + }, + { + "epoch": 0.5386845719517778, + "grad_norm": 1.5136220222385257, + "learning_rate": 4.617193616723231e-06, + "loss": 0.4436, + "step": 1765 + }, + { + "epoch": 0.5389897756752632, + "grad_norm": 1.439164731912172, + "learning_rate": 4.612264041237717e-06, + "loss": 0.3895, + "step": 1766 + }, + { + "epoch": 0.5392949793987487, + "grad_norm": 1.2109877052974534, + "learning_rate": 4.607334844893732e-06, + "loss": 0.2577, + "step": 1767 + }, + { + "epoch": 0.5396001831222341, + "grad_norm": 1.6738329679225448, + "learning_rate": 4.602406032511217e-06, + "loss": 0.5504, + "step": 1768 + }, + { + "epoch": 0.5399053868457195, + "grad_norm": 1.3467379772470351, + "learning_rate": 4.597477608909733e-06, + "loss": 0.2681, + "step": 1769 + }, + { + "epoch": 0.540210590569205, + "grad_norm": 1.3966677245240011, + "learning_rate": 4.592549578908462e-06, + "loss": 0.3906, + "step": 1770 + }, + { + "epoch": 0.5405157942926904, + "grad_norm": 1.305199574012114, + "learning_rate": 4.587621947326198e-06, + "loss": 0.2883, + "step": 1771 + }, + { + "epoch": 0.5408209980161758, + "grad_norm": 1.028638133088597, + "learning_rate": 4.582694718981354e-06, + "loss": 0.1888, + "step": 1772 + }, + { + "epoch": 0.5411262017396612, + "grad_norm": 1.511774973846459, + "learning_rate": 4.577767898691937e-06, + "loss": 0.3098, + "step": 1773 + }, + { + "epoch": 0.5414314054631466, + "grad_norm": 1.6115715074241228, + "learning_rate": 4.572841491275565e-06, + "loss": 0.4813, + "step": 1774 + }, + { + "epoch": 0.541736609186632, + "grad_norm": 1.1577809482656338, + "learning_rate": 4.5679155015494475e-06, + "loss": 0.1947, + "step": 1775 + }, + { + "epoch": 0.5420418129101175, + "grad_norm": 1.4664633787039183, + "learning_rate": 4.562989934330385e-06, + "loss": 0.2036, + "step": 1776 + }, + { + "epoch": 0.542347016633603, + "grad_norm": 1.2041716543620122, + "learning_rate": 4.558064794434766e-06, + "loss": 0.2272, + "step": 1777 + }, + { + "epoch": 0.5426522203570884, + "grad_norm": 1.3641306682040004, + "learning_rate": 4.553140086678563e-06, + "loss": 0.3755, + "step": 1778 + }, + { + "epoch": 0.5429574240805738, + "grad_norm": 1.3739568047542712, + "learning_rate": 4.548215815877324e-06, + "loss": 0.2787, + "step": 1779 + }, + { + "epoch": 0.5432626278040592, + "grad_norm": 2.6505197986898854, + "learning_rate": 4.543291986846172e-06, + "loss": 0.4521, + "step": 1780 + }, + { + "epoch": 0.5435678315275446, + "grad_norm": 1.4710712352129807, + "learning_rate": 4.5383686043997916e-06, + "loss": 0.3916, + "step": 1781 + }, + { + "epoch": 0.54387303525103, + "grad_norm": 1.1925169723230629, + "learning_rate": 4.533445673352438e-06, + "loss": 0.4002, + "step": 1782 + }, + { + "epoch": 0.5441782389745154, + "grad_norm": 1.251342815115979, + "learning_rate": 4.528523198517922e-06, + "loss": 0.2429, + "step": 1783 + }, + { + "epoch": 0.5444834426980009, + "grad_norm": 1.5975295485269585, + "learning_rate": 4.523601184709608e-06, + "loss": 0.2684, + "step": 1784 + }, + { + "epoch": 0.5447886464214864, + "grad_norm": 1.6643586947322488, + "learning_rate": 4.518679636740411e-06, + "loss": 0.3698, + "step": 1785 + }, + { + "epoch": 0.5450938501449718, + "grad_norm": 1.3920856048936912, + "learning_rate": 4.513758559422788e-06, + "loss": 0.2586, + "step": 1786 + }, + { + "epoch": 0.5453990538684572, + "grad_norm": 1.7161184105155805, + "learning_rate": 4.508837957568737e-06, + "loss": 0.4346, + "step": 1787 + }, + { + "epoch": 0.5457042575919426, + "grad_norm": 1.9392455451877773, + "learning_rate": 4.503917835989795e-06, + "loss": 0.3078, + "step": 1788 + }, + { + "epoch": 0.546009461315428, + "grad_norm": 1.5955708891333396, + "learning_rate": 4.4989981994970215e-06, + "loss": 0.3537, + "step": 1789 + }, + { + "epoch": 0.5463146650389135, + "grad_norm": 1.6369794760823198, + "learning_rate": 4.494079052901008e-06, + "loss": 0.3795, + "step": 1790 + }, + { + "epoch": 0.5466198687623989, + "grad_norm": 1.3368239111142433, + "learning_rate": 4.489160401011865e-06, + "loss": 0.3277, + "step": 1791 + }, + { + "epoch": 0.5469250724858843, + "grad_norm": 1.53161274252946, + "learning_rate": 4.484242248639219e-06, + "loss": 0.3846, + "step": 1792 + }, + { + "epoch": 0.5472302762093697, + "grad_norm": 1.847090459090323, + "learning_rate": 4.479324600592207e-06, + "loss": 0.5765, + "step": 1793 + }, + { + "epoch": 0.5475354799328552, + "grad_norm": 1.2508761196625122, + "learning_rate": 4.474407461679474e-06, + "loss": 0.262, + "step": 1794 + }, + { + "epoch": 0.5478406836563406, + "grad_norm": 1.4004402524276096, + "learning_rate": 4.46949083670917e-06, + "loss": 0.2882, + "step": 1795 + }, + { + "epoch": 0.5481458873798261, + "grad_norm": 1.5857922717718818, + "learning_rate": 4.46457473048894e-06, + "loss": 0.239, + "step": 1796 + }, + { + "epoch": 0.5484510911033115, + "grad_norm": 1.6062020439380993, + "learning_rate": 4.459659147825918e-06, + "loss": 0.3717, + "step": 1797 + }, + { + "epoch": 0.5487562948267969, + "grad_norm": 1.353016506717472, + "learning_rate": 4.45474409352673e-06, + "loss": 0.3397, + "step": 1798 + }, + { + "epoch": 0.5490614985502823, + "grad_norm": 1.8447201003228528, + "learning_rate": 4.4498295723974874e-06, + "loss": 0.4021, + "step": 1799 + }, + { + "epoch": 0.5493667022737677, + "grad_norm": 1.5434900103493834, + "learning_rate": 4.444915589243775e-06, + "loss": 0.4346, + "step": 1800 + }, + { + "epoch": 0.5496719059972531, + "grad_norm": 1.4692641636605677, + "learning_rate": 4.4400021488706554e-06, + "loss": 0.4139, + "step": 1801 + }, + { + "epoch": 0.5499771097207385, + "grad_norm": 1.7164322223877362, + "learning_rate": 4.43508925608266e-06, + "loss": 0.2815, + "step": 1802 + }, + { + "epoch": 0.5502823134442241, + "grad_norm": 1.5813457746488064, + "learning_rate": 4.430176915683782e-06, + "loss": 0.296, + "step": 1803 + }, + { + "epoch": 0.5505875171677095, + "grad_norm": 1.6394357639908674, + "learning_rate": 4.425265132477478e-06, + "loss": 0.5515, + "step": 1804 + }, + { + "epoch": 0.5508927208911949, + "grad_norm": 2.6314268002948626, + "learning_rate": 4.420353911266659e-06, + "loss": 0.3059, + "step": 1805 + }, + { + "epoch": 0.5511979246146803, + "grad_norm": 1.3058880975401832, + "learning_rate": 4.415443256853683e-06, + "loss": 0.3403, + "step": 1806 + }, + { + "epoch": 0.5515031283381657, + "grad_norm": 1.377286483777208, + "learning_rate": 4.410533174040359e-06, + "loss": 0.3375, + "step": 1807 + }, + { + "epoch": 0.5518083320616511, + "grad_norm": 0.8967185784549286, + "learning_rate": 4.4056236676279345e-06, + "loss": 0.1389, + "step": 1808 + }, + { + "epoch": 0.5521135357851366, + "grad_norm": 1.3009551794026946, + "learning_rate": 4.400714742417091e-06, + "loss": 0.3164, + "step": 1809 + }, + { + "epoch": 0.552418739508622, + "grad_norm": 1.0558516117873247, + "learning_rate": 4.395806403207947e-06, + "loss": 0.1837, + "step": 1810 + }, + { + "epoch": 0.5527239432321074, + "grad_norm": 1.5935819948692123, + "learning_rate": 4.3908986548000406e-06, + "loss": 0.4112, + "step": 1811 + }, + { + "epoch": 0.5530291469555929, + "grad_norm": 1.633330413595172, + "learning_rate": 4.3859915019923436e-06, + "loss": 0.3029, + "step": 1812 + }, + { + "epoch": 0.5533343506790783, + "grad_norm": 1.5346955440421504, + "learning_rate": 4.381084949583233e-06, + "loss": 0.3364, + "step": 1813 + }, + { + "epoch": 0.5536395544025637, + "grad_norm": 1.346352430252736, + "learning_rate": 4.376179002370505e-06, + "loss": 0.217, + "step": 1814 + }, + { + "epoch": 0.5539447581260492, + "grad_norm": 1.262614354535023, + "learning_rate": 4.371273665151366e-06, + "loss": 0.2082, + "step": 1815 + }, + { + "epoch": 0.5542499618495346, + "grad_norm": 1.5174607632889023, + "learning_rate": 4.36636894272242e-06, + "loss": 0.4906, + "step": 1816 + }, + { + "epoch": 0.55455516557302, + "grad_norm": 1.3534777967663216, + "learning_rate": 4.361464839879674e-06, + "loss": 0.2799, + "step": 1817 + }, + { + "epoch": 0.5548603692965054, + "grad_norm": 1.4373919404920255, + "learning_rate": 4.356561361418529e-06, + "loss": 0.2769, + "step": 1818 + }, + { + "epoch": 0.5551655730199908, + "grad_norm": 1.6256631818406408, + "learning_rate": 4.351658512133773e-06, + "loss": 0.3239, + "step": 1819 + }, + { + "epoch": 0.5554707767434762, + "grad_norm": 1.5206222736100585, + "learning_rate": 4.346756296819582e-06, + "loss": 0.3495, + "step": 1820 + }, + { + "epoch": 0.5557759804669617, + "grad_norm": 1.6079779922535562, + "learning_rate": 4.34185472026951e-06, + "loss": 0.404, + "step": 1821 + }, + { + "epoch": 0.5560811841904472, + "grad_norm": 1.3156486617835121, + "learning_rate": 4.3369537872764854e-06, + "loss": 0.2266, + "step": 1822 + }, + { + "epoch": 0.5563863879139326, + "grad_norm": 1.4051731649097803, + "learning_rate": 4.3320535026328095e-06, + "loss": 0.377, + "step": 1823 + }, + { + "epoch": 0.556691591637418, + "grad_norm": 1.5976076748279182, + "learning_rate": 4.327153871130151e-06, + "loss": 0.4271, + "step": 1824 + }, + { + "epoch": 0.5569967953609034, + "grad_norm": 1.5932764584008285, + "learning_rate": 4.322254897559534e-06, + "loss": 0.272, + "step": 1825 + }, + { + "epoch": 0.5573019990843888, + "grad_norm": 1.2476323593040315, + "learning_rate": 4.317356586711345e-06, + "loss": 0.3005, + "step": 1826 + }, + { + "epoch": 0.5576072028078742, + "grad_norm": 1.4480223948712203, + "learning_rate": 4.312458943375319e-06, + "loss": 0.5113, + "step": 1827 + }, + { + "epoch": 0.5579124065313597, + "grad_norm": 1.736339659164446, + "learning_rate": 4.307561972340545e-06, + "loss": 0.3381, + "step": 1828 + }, + { + "epoch": 0.5582176102548451, + "grad_norm": 1.5027439927263628, + "learning_rate": 4.302665678395444e-06, + "loss": 0.2734, + "step": 1829 + }, + { + "epoch": 0.5585228139783306, + "grad_norm": 1.7299477371965306, + "learning_rate": 4.297770066327781e-06, + "loss": 0.3715, + "step": 1830 + }, + { + "epoch": 0.558828017701816, + "grad_norm": 1.5375612944503692, + "learning_rate": 4.292875140924656e-06, + "loss": 0.2312, + "step": 1831 + }, + { + "epoch": 0.5591332214253014, + "grad_norm": 1.6590692529416964, + "learning_rate": 4.287980906972492e-06, + "loss": 0.5114, + "step": 1832 + }, + { + "epoch": 0.5594384251487868, + "grad_norm": 1.6149299823711942, + "learning_rate": 4.2830873692570406e-06, + "loss": 0.5849, + "step": 1833 + }, + { + "epoch": 0.5597436288722722, + "grad_norm": 1.2257140803122564, + "learning_rate": 4.278194532563373e-06, + "loss": 0.2985, + "step": 1834 + }, + { + "epoch": 0.5600488325957577, + "grad_norm": 1.4168339331371003, + "learning_rate": 4.273302401675869e-06, + "loss": 0.3212, + "step": 1835 + }, + { + "epoch": 0.5603540363192431, + "grad_norm": 1.24088703914615, + "learning_rate": 4.2684109813782235e-06, + "loss": 0.2954, + "step": 1836 + }, + { + "epoch": 0.5606592400427285, + "grad_norm": 1.2898878287553455, + "learning_rate": 4.263520276453438e-06, + "loss": 0.3189, + "step": 1837 + }, + { + "epoch": 0.5609644437662139, + "grad_norm": 1.2371405994749598, + "learning_rate": 4.258630291683807e-06, + "loss": 0.2873, + "step": 1838 + }, + { + "epoch": 0.5612696474896993, + "grad_norm": 1.6176357661582368, + "learning_rate": 4.253741031850928e-06, + "loss": 0.3103, + "step": 1839 + }, + { + "epoch": 0.5615748512131848, + "grad_norm": 1.7539077452927228, + "learning_rate": 4.248852501735688e-06, + "loss": 0.3436, + "step": 1840 + }, + { + "epoch": 0.5618800549366703, + "grad_norm": 1.464784829544187, + "learning_rate": 4.2439647061182585e-06, + "loss": 0.3883, + "step": 1841 + }, + { + "epoch": 0.5621852586601557, + "grad_norm": 1.0819455329738283, + "learning_rate": 4.2390776497780924e-06, + "loss": 0.2173, + "step": 1842 + }, + { + "epoch": 0.5624904623836411, + "grad_norm": 1.5239350900118207, + "learning_rate": 4.234191337493924e-06, + "loss": 0.2023, + "step": 1843 + }, + { + "epoch": 0.5627956661071265, + "grad_norm": 1.78640496384631, + "learning_rate": 4.229305774043753e-06, + "loss": 0.4053, + "step": 1844 + }, + { + "epoch": 0.5631008698306119, + "grad_norm": 1.7446265147225073, + "learning_rate": 4.224420964204856e-06, + "loss": 0.3202, + "step": 1845 + }, + { + "epoch": 0.5634060735540973, + "grad_norm": 1.409118270328708, + "learning_rate": 4.219536912753765e-06, + "loss": 0.3765, + "step": 1846 + }, + { + "epoch": 0.5637112772775827, + "grad_norm": 1.8518819412683283, + "learning_rate": 4.2146536244662755e-06, + "loss": 0.3897, + "step": 1847 + }, + { + "epoch": 0.5640164810010682, + "grad_norm": 1.418487033756496, + "learning_rate": 4.209771104117431e-06, + "loss": 0.2562, + "step": 1848 + }, + { + "epoch": 0.5643216847245537, + "grad_norm": 2.3571698323477404, + "learning_rate": 4.204889356481529e-06, + "loss": 0.3302, + "step": 1849 + }, + { + "epoch": 0.5646268884480391, + "grad_norm": 1.6473829322170688, + "learning_rate": 4.200008386332112e-06, + "loss": 0.4377, + "step": 1850 + }, + { + "epoch": 0.5649320921715245, + "grad_norm": 1.6536132408794952, + "learning_rate": 4.195128198441956e-06, + "loss": 0.3028, + "step": 1851 + }, + { + "epoch": 0.5652372958950099, + "grad_norm": 1.655984317415034, + "learning_rate": 4.19024879758308e-06, + "loss": 0.4111, + "step": 1852 + }, + { + "epoch": 0.5655424996184953, + "grad_norm": 1.3841085018145172, + "learning_rate": 4.185370188526729e-06, + "loss": 0.3595, + "step": 1853 + }, + { + "epoch": 0.5658477033419808, + "grad_norm": 1.5438987536712128, + "learning_rate": 4.180492376043371e-06, + "loss": 0.3239, + "step": 1854 + }, + { + "epoch": 0.5661529070654662, + "grad_norm": 1.4582024742633912, + "learning_rate": 4.175615364902702e-06, + "loss": 0.2405, + "step": 1855 + }, + { + "epoch": 0.5664581107889516, + "grad_norm": 1.563635680413248, + "learning_rate": 4.1707391598736315e-06, + "loss": 0.3507, + "step": 1856 + }, + { + "epoch": 0.566763314512437, + "grad_norm": 1.415615851433773, + "learning_rate": 4.1658637657242775e-06, + "loss": 0.3625, + "step": 1857 + }, + { + "epoch": 0.5670685182359225, + "grad_norm": 1.3449463065679088, + "learning_rate": 4.160989187221969e-06, + "loss": 0.3278, + "step": 1858 + }, + { + "epoch": 0.5673737219594079, + "grad_norm": 1.2270830349794986, + "learning_rate": 4.156115429133237e-06, + "loss": 0.2514, + "step": 1859 + }, + { + "epoch": 0.5676789256828934, + "grad_norm": 1.4667141379274422, + "learning_rate": 4.151242496223807e-06, + "loss": 0.318, + "step": 1860 + }, + { + "epoch": 0.5679841294063788, + "grad_norm": 1.6246601629699675, + "learning_rate": 4.146370393258603e-06, + "loss": 0.5131, + "step": 1861 + }, + { + "epoch": 0.5682893331298642, + "grad_norm": 1.614142901328195, + "learning_rate": 4.141499125001733e-06, + "loss": 0.5237, + "step": 1862 + }, + { + "epoch": 0.5685945368533496, + "grad_norm": 1.4567239964083918, + "learning_rate": 4.13662869621649e-06, + "loss": 0.2776, + "step": 1863 + }, + { + "epoch": 0.568899740576835, + "grad_norm": 1.2668036165155088, + "learning_rate": 4.131759111665349e-06, + "loss": 0.2393, + "step": 1864 + }, + { + "epoch": 0.5692049443003204, + "grad_norm": 1.8092268624549248, + "learning_rate": 4.126890376109953e-06, + "loss": 0.317, + "step": 1865 + }, + { + "epoch": 0.5695101480238058, + "grad_norm": 1.7242479647679296, + "learning_rate": 4.122022494311122e-06, + "loss": 0.3034, + "step": 1866 + }, + { + "epoch": 0.5698153517472914, + "grad_norm": 1.1713728273815491, + "learning_rate": 4.117155471028837e-06, + "loss": 0.1709, + "step": 1867 + }, + { + "epoch": 0.5701205554707768, + "grad_norm": 1.3046020205700666, + "learning_rate": 4.112289311022239e-06, + "loss": 0.2232, + "step": 1868 + }, + { + "epoch": 0.5704257591942622, + "grad_norm": 1.4203182543479589, + "learning_rate": 4.107424019049628e-06, + "loss": 0.3207, + "step": 1869 + }, + { + "epoch": 0.5707309629177476, + "grad_norm": 1.469352436289963, + "learning_rate": 4.102559599868454e-06, + "loss": 0.4105, + "step": 1870 + }, + { + "epoch": 0.571036166641233, + "grad_norm": 1.32659963638931, + "learning_rate": 4.097696058235311e-06, + "loss": 0.3715, + "step": 1871 + }, + { + "epoch": 0.5713413703647184, + "grad_norm": 1.351981831146365, + "learning_rate": 4.092833398905939e-06, + "loss": 0.413, + "step": 1872 + }, + { + "epoch": 0.5716465740882039, + "grad_norm": 1.1676271889652488, + "learning_rate": 4.087971626635212e-06, + "loss": 0.2227, + "step": 1873 + }, + { + "epoch": 0.5719517778116893, + "grad_norm": 1.395861378667927, + "learning_rate": 4.083110746177137e-06, + "loss": 0.3457, + "step": 1874 + }, + { + "epoch": 0.5722569815351747, + "grad_norm": 1.7052386406046318, + "learning_rate": 4.078250762284852e-06, + "loss": 0.3004, + "step": 1875 + }, + { + "epoch": 0.5725621852586602, + "grad_norm": 1.5593654471737393, + "learning_rate": 4.073391679710613e-06, + "loss": 0.1962, + "step": 1876 + }, + { + "epoch": 0.5728673889821456, + "grad_norm": 1.453389215546987, + "learning_rate": 4.068533503205801e-06, + "loss": 0.2628, + "step": 1877 + }, + { + "epoch": 0.573172592705631, + "grad_norm": 1.4620964634717861, + "learning_rate": 4.063676237520907e-06, + "loss": 0.3129, + "step": 1878 + }, + { + "epoch": 0.5734777964291164, + "grad_norm": 1.4098166270766606, + "learning_rate": 4.058819887405532e-06, + "loss": 0.2979, + "step": 1879 + }, + { + "epoch": 0.5737830001526019, + "grad_norm": 1.405420779416245, + "learning_rate": 4.053964457608381e-06, + "loss": 0.3358, + "step": 1880 + }, + { + "epoch": 0.5740882038760873, + "grad_norm": 1.3379669835930859, + "learning_rate": 4.049109952877261e-06, + "loss": 0.2897, + "step": 1881 + }, + { + "epoch": 0.5743934075995727, + "grad_norm": 1.527556109482659, + "learning_rate": 4.044256377959072e-06, + "loss": 0.4784, + "step": 1882 + }, + { + "epoch": 0.5746986113230581, + "grad_norm": 1.3768514825628073, + "learning_rate": 4.039403737599807e-06, + "loss": 0.3001, + "step": 1883 + }, + { + "epoch": 0.5750038150465435, + "grad_norm": 1.2508930830606682, + "learning_rate": 4.034552036544544e-06, + "loss": 0.3061, + "step": 1884 + }, + { + "epoch": 0.575309018770029, + "grad_norm": 1.1647278593095605, + "learning_rate": 4.029701279537444e-06, + "loss": 0.1376, + "step": 1885 + }, + { + "epoch": 0.5756142224935145, + "grad_norm": 1.2338571548694106, + "learning_rate": 4.024851471321744e-06, + "loss": 0.2517, + "step": 1886 + }, + { + "epoch": 0.5759194262169999, + "grad_norm": 1.5621581657327868, + "learning_rate": 4.02000261663975e-06, + "loss": 0.3779, + "step": 1887 + }, + { + "epoch": 0.5762246299404853, + "grad_norm": 1.496075264042124, + "learning_rate": 4.01515472023284e-06, + "loss": 0.22, + "step": 1888 + }, + { + "epoch": 0.5765298336639707, + "grad_norm": 1.9209284738850185, + "learning_rate": 4.010307786841456e-06, + "loss": 0.2617, + "step": 1889 + }, + { + "epoch": 0.5768350373874561, + "grad_norm": 1.4182439699870193, + "learning_rate": 4.005461821205092e-06, + "loss": 0.2029, + "step": 1890 + }, + { + "epoch": 0.5771402411109415, + "grad_norm": 1.632558848560275, + "learning_rate": 4.000616828062302e-06, + "loss": 0.339, + "step": 1891 + }, + { + "epoch": 0.577445444834427, + "grad_norm": 1.5939429671208698, + "learning_rate": 3.995772812150687e-06, + "loss": 0.343, + "step": 1892 + }, + { + "epoch": 0.5777506485579124, + "grad_norm": 1.86362055921937, + "learning_rate": 3.990929778206889e-06, + "loss": 0.3299, + "step": 1893 + }, + { + "epoch": 0.5780558522813979, + "grad_norm": 1.669041308861478, + "learning_rate": 3.986087730966596e-06, + "loss": 0.4438, + "step": 1894 + }, + { + "epoch": 0.5783610560048833, + "grad_norm": 1.9885482758668145, + "learning_rate": 3.981246675164526e-06, + "loss": 0.3986, + "step": 1895 + }, + { + "epoch": 0.5786662597283687, + "grad_norm": 1.4523473319942632, + "learning_rate": 3.976406615534433e-06, + "loss": 0.2753, + "step": 1896 + }, + { + "epoch": 0.5789714634518541, + "grad_norm": 1.7124036664389317, + "learning_rate": 3.971567556809089e-06, + "loss": 0.2557, + "step": 1897 + }, + { + "epoch": 0.5792766671753395, + "grad_norm": 1.3490546890467419, + "learning_rate": 3.966729503720294e-06, + "loss": 0.3153, + "step": 1898 + }, + { + "epoch": 0.579581870898825, + "grad_norm": 1.711569891289086, + "learning_rate": 3.961892460998863e-06, + "loss": 0.4145, + "step": 1899 + }, + { + "epoch": 0.5798870746223104, + "grad_norm": 1.6758973408215243, + "learning_rate": 3.95705643337462e-06, + "loss": 0.3659, + "step": 1900 + }, + { + "epoch": 0.5801922783457958, + "grad_norm": 1.5917092515664464, + "learning_rate": 3.952221425576401e-06, + "loss": 0.4488, + "step": 1901 + }, + { + "epoch": 0.5804974820692812, + "grad_norm": 1.4033121574658662, + "learning_rate": 3.947387442332043e-06, + "loss": 0.2452, + "step": 1902 + }, + { + "epoch": 0.5808026857927667, + "grad_norm": 1.473499329969696, + "learning_rate": 3.942554488368378e-06, + "loss": 0.3778, + "step": 1903 + }, + { + "epoch": 0.5811078895162521, + "grad_norm": 1.3946820345871305, + "learning_rate": 3.937722568411237e-06, + "loss": 0.238, + "step": 1904 + }, + { + "epoch": 0.5814130932397376, + "grad_norm": 1.735278332506307, + "learning_rate": 3.932891687185438e-06, + "loss": 0.3191, + "step": 1905 + }, + { + "epoch": 0.581718296963223, + "grad_norm": 1.728510245621926, + "learning_rate": 3.928061849414778e-06, + "loss": 0.4613, + "step": 1906 + }, + { + "epoch": 0.5820235006867084, + "grad_norm": 1.7322903038045538, + "learning_rate": 3.923233059822041e-06, + "loss": 0.4713, + "step": 1907 + }, + { + "epoch": 0.5823287044101938, + "grad_norm": 1.6221554140736476, + "learning_rate": 3.918405323128986e-06, + "loss": 0.3462, + "step": 1908 + }, + { + "epoch": 0.5826339081336792, + "grad_norm": 1.4204907228891313, + "learning_rate": 3.913578644056332e-06, + "loss": 0.2559, + "step": 1909 + }, + { + "epoch": 0.5829391118571646, + "grad_norm": 1.4410886337055444, + "learning_rate": 3.90875302732378e-06, + "loss": 0.3235, + "step": 1910 + }, + { + "epoch": 0.58324431558065, + "grad_norm": 1.3315830763234417, + "learning_rate": 3.903928477649978e-06, + "loss": 0.2323, + "step": 1911 + }, + { + "epoch": 0.5835495193041355, + "grad_norm": 1.4552983264589225, + "learning_rate": 3.899104999752541e-06, + "loss": 0.2523, + "step": 1912 + }, + { + "epoch": 0.583854723027621, + "grad_norm": 1.4988653341908547, + "learning_rate": 3.894282598348026e-06, + "loss": 0.3828, + "step": 1913 + }, + { + "epoch": 0.5841599267511064, + "grad_norm": 1.894276143625022, + "learning_rate": 3.889461278151946e-06, + "loss": 0.2661, + "step": 1914 + }, + { + "epoch": 0.5844651304745918, + "grad_norm": 1.2550040219915009, + "learning_rate": 3.884641043878754e-06, + "loss": 0.2631, + "step": 1915 + }, + { + "epoch": 0.5847703341980772, + "grad_norm": 1.5480246847190493, + "learning_rate": 3.879821900241838e-06, + "loss": 0.556, + "step": 1916 + }, + { + "epoch": 0.5850755379215626, + "grad_norm": 1.4156532402674349, + "learning_rate": 3.8750038519535235e-06, + "loss": 0.3244, + "step": 1917 + }, + { + "epoch": 0.585380741645048, + "grad_norm": 1.5455677404105437, + "learning_rate": 3.870186903725064e-06, + "loss": 0.5752, + "step": 1918 + }, + { + "epoch": 0.5856859453685335, + "grad_norm": 1.4636896495284506, + "learning_rate": 3.865371060266635e-06, + "loss": 0.3238, + "step": 1919 + }, + { + "epoch": 0.5859911490920189, + "grad_norm": 1.4740107047366455, + "learning_rate": 3.860556326287335e-06, + "loss": 0.3049, + "step": 1920 + }, + { + "epoch": 0.5862963528155043, + "grad_norm": 1.6439367384050791, + "learning_rate": 3.855742706495177e-06, + "loss": 0.4124, + "step": 1921 + }, + { + "epoch": 0.5866015565389898, + "grad_norm": 1.2368282132313706, + "learning_rate": 3.850930205597081e-06, + "loss": 0.3479, + "step": 1922 + }, + { + "epoch": 0.5869067602624752, + "grad_norm": 1.3429435261592815, + "learning_rate": 3.846118828298878e-06, + "loss": 0.2947, + "step": 1923 + }, + { + "epoch": 0.5872119639859606, + "grad_norm": 1.6180408087819254, + "learning_rate": 3.841308579305298e-06, + "loss": 0.4857, + "step": 1924 + }, + { + "epoch": 0.5875171677094461, + "grad_norm": 1.2893210411892835, + "learning_rate": 3.836499463319962e-06, + "loss": 0.4012, + "step": 1925 + }, + { + "epoch": 0.5878223714329315, + "grad_norm": 1.4427577353418675, + "learning_rate": 3.831691485045397e-06, + "loss": 0.5265, + "step": 1926 + }, + { + "epoch": 0.5881275751564169, + "grad_norm": 1.241609594881084, + "learning_rate": 3.826884649183006e-06, + "loss": 0.2704, + "step": 1927 + }, + { + "epoch": 0.5884327788799023, + "grad_norm": 1.424738756292409, + "learning_rate": 3.822078960433076e-06, + "loss": 0.3, + "step": 1928 + }, + { + "epoch": 0.5887379826033877, + "grad_norm": 1.7578843843808665, + "learning_rate": 3.817274423494778e-06, + "loss": 0.3955, + "step": 1929 + }, + { + "epoch": 0.5890431863268731, + "grad_norm": 1.5726966788282515, + "learning_rate": 3.8124710430661503e-06, + "loss": 0.2467, + "step": 1930 + }, + { + "epoch": 0.5893483900503587, + "grad_norm": 1.1631106382833136, + "learning_rate": 3.807668823844104e-06, + "loss": 0.3261, + "step": 1931 + }, + { + "epoch": 0.5896535937738441, + "grad_norm": 1.5045848553943075, + "learning_rate": 3.8028677705244153e-06, + "loss": 0.3358, + "step": 1932 + }, + { + "epoch": 0.5899587974973295, + "grad_norm": 1.8852065445237391, + "learning_rate": 3.798067887801717e-06, + "loss": 0.6849, + "step": 1933 + }, + { + "epoch": 0.5902640012208149, + "grad_norm": 1.3753020214954932, + "learning_rate": 3.7932691803694992e-06, + "loss": 0.4061, + "step": 1934 + }, + { + "epoch": 0.5905692049443003, + "grad_norm": 1.563977115522218, + "learning_rate": 3.7884716529201047e-06, + "loss": 0.2681, + "step": 1935 + }, + { + "epoch": 0.5908744086677857, + "grad_norm": 1.3038261905545871, + "learning_rate": 3.783675310144719e-06, + "loss": 0.3416, + "step": 1936 + }, + { + "epoch": 0.5911796123912711, + "grad_norm": 1.422085924538324, + "learning_rate": 3.7788801567333712e-06, + "loss": 0.3074, + "step": 1937 + }, + { + "epoch": 0.5914848161147566, + "grad_norm": 1.3398381447188235, + "learning_rate": 3.774086197374926e-06, + "loss": 0.3046, + "step": 1938 + }, + { + "epoch": 0.591790019838242, + "grad_norm": 1.215628514615742, + "learning_rate": 3.7692934367570823e-06, + "loss": 0.2865, + "step": 1939 + }, + { + "epoch": 0.5920952235617275, + "grad_norm": 1.0941402284130506, + "learning_rate": 3.7645018795663667e-06, + "loss": 0.2081, + "step": 1940 + }, + { + "epoch": 0.5924004272852129, + "grad_norm": 1.5331642486114183, + "learning_rate": 3.759711530488127e-06, + "loss": 0.1809, + "step": 1941 + }, + { + "epoch": 0.5927056310086983, + "grad_norm": 1.501084412551385, + "learning_rate": 3.7549223942065303e-06, + "loss": 0.3311, + "step": 1942 + }, + { + "epoch": 0.5930108347321837, + "grad_norm": 1.6530406735695713, + "learning_rate": 3.7501344754045604e-06, + "loss": 0.3101, + "step": 1943 + }, + { + "epoch": 0.5933160384556692, + "grad_norm": 1.6341490317636014, + "learning_rate": 3.745347778764008e-06, + "loss": 0.4414, + "step": 1944 + }, + { + "epoch": 0.5936212421791546, + "grad_norm": 1.45040770751033, + "learning_rate": 3.740562308965471e-06, + "loss": 0.2298, + "step": 1945 + }, + { + "epoch": 0.59392644590264, + "grad_norm": 1.67431355829628, + "learning_rate": 3.7357780706883423e-06, + "loss": 0.3863, + "step": 1946 + }, + { + "epoch": 0.5942316496261254, + "grad_norm": 1.2794014225529522, + "learning_rate": 3.730995068610817e-06, + "loss": 0.3324, + "step": 1947 + }, + { + "epoch": 0.5945368533496108, + "grad_norm": 1.325879701050942, + "learning_rate": 3.7262133074098793e-06, + "loss": 0.1979, + "step": 1948 + }, + { + "epoch": 0.5948420570730963, + "grad_norm": 1.251910133040228, + "learning_rate": 3.721432791761297e-06, + "loss": 0.2215, + "step": 1949 + }, + { + "epoch": 0.5951472607965818, + "grad_norm": 1.5840427750100448, + "learning_rate": 3.716653526339623e-06, + "loss": 0.3227, + "step": 1950 + }, + { + "epoch": 0.5954524645200672, + "grad_norm": 1.5473806922537334, + "learning_rate": 3.7118755158181896e-06, + "loss": 0.3352, + "step": 1951 + }, + { + "epoch": 0.5957576682435526, + "grad_norm": 1.521976650845937, + "learning_rate": 3.707098764869097e-06, + "loss": 0.4004, + "step": 1952 + }, + { + "epoch": 0.596062871967038, + "grad_norm": 1.5737464518124529, + "learning_rate": 3.702323278163217e-06, + "loss": 0.2717, + "step": 1953 + }, + { + "epoch": 0.5963680756905234, + "grad_norm": 1.680802228214579, + "learning_rate": 3.6975490603701846e-06, + "loss": 0.3889, + "step": 1954 + }, + { + "epoch": 0.5966732794140088, + "grad_norm": 1.3239251995767745, + "learning_rate": 3.692776116158393e-06, + "loss": 0.2214, + "step": 1955 + }, + { + "epoch": 0.5969784831374942, + "grad_norm": 1.571247334990142, + "learning_rate": 3.688004450194992e-06, + "loss": 0.1805, + "step": 1956 + }, + { + "epoch": 0.5972836868609797, + "grad_norm": 1.5040931486442533, + "learning_rate": 3.6832340671458804e-06, + "loss": 0.4679, + "step": 1957 + }, + { + "epoch": 0.5975888905844652, + "grad_norm": 1.1553720285455236, + "learning_rate": 3.6784649716757003e-06, + "loss": 0.1948, + "step": 1958 + }, + { + "epoch": 0.5978940943079506, + "grad_norm": 1.5530514183498014, + "learning_rate": 3.6736971684478403e-06, + "loss": 0.4196, + "step": 1959 + }, + { + "epoch": 0.598199298031436, + "grad_norm": 1.360045695098193, + "learning_rate": 3.6689306621244196e-06, + "loss": 0.3248, + "step": 1960 + }, + { + "epoch": 0.5985045017549214, + "grad_norm": 1.6265003907056328, + "learning_rate": 3.6641654573662943e-06, + "loss": 0.4469, + "step": 1961 + }, + { + "epoch": 0.5988097054784068, + "grad_norm": 1.3520633893925043, + "learning_rate": 3.659401558833042e-06, + "loss": 0.3628, + "step": 1962 + }, + { + "epoch": 0.5991149092018923, + "grad_norm": 1.3981669073598408, + "learning_rate": 3.654638971182969e-06, + "loss": 0.2122, + "step": 1963 + }, + { + "epoch": 0.5994201129253777, + "grad_norm": 1.204357061326749, + "learning_rate": 3.6498776990730956e-06, + "loss": 0.3035, + "step": 1964 + }, + { + "epoch": 0.5997253166488631, + "grad_norm": 1.5117911388009846, + "learning_rate": 3.6451177471591564e-06, + "loss": 0.3495, + "step": 1965 + }, + { + "epoch": 0.6000305203723485, + "grad_norm": 1.350551955098285, + "learning_rate": 3.640359120095597e-06, + "loss": 0.2792, + "step": 1966 + }, + { + "epoch": 0.600335724095834, + "grad_norm": 0.9937686486563697, + "learning_rate": 3.635601822535567e-06, + "loss": 0.1301, + "step": 1967 + }, + { + "epoch": 0.6006409278193194, + "grad_norm": 1.2543699173760188, + "learning_rate": 3.6308458591309124e-06, + "loss": 0.3023, + "step": 1968 + }, + { + "epoch": 0.6009461315428049, + "grad_norm": 1.5915859714577205, + "learning_rate": 3.6260912345321807e-06, + "loss": 0.4957, + "step": 1969 + }, + { + "epoch": 0.6012513352662903, + "grad_norm": 1.2913315194645163, + "learning_rate": 3.6213379533886063e-06, + "loss": 0.1803, + "step": 1970 + }, + { + "epoch": 0.6015565389897757, + "grad_norm": 1.4490209069122582, + "learning_rate": 3.6165860203481084e-06, + "loss": 0.4276, + "step": 1971 + }, + { + "epoch": 0.6018617427132611, + "grad_norm": 1.4139380637367027, + "learning_rate": 3.611835440057293e-06, + "loss": 0.386, + "step": 1972 + }, + { + "epoch": 0.6021669464367465, + "grad_norm": 1.4566161354402127, + "learning_rate": 3.6070862171614394e-06, + "loss": 0.5218, + "step": 1973 + }, + { + "epoch": 0.6024721501602319, + "grad_norm": 1.38118497255283, + "learning_rate": 3.602338356304501e-06, + "loss": 0.2619, + "step": 1974 + }, + { + "epoch": 0.6027773538837173, + "grad_norm": 1.365310688641884, + "learning_rate": 3.597591862129097e-06, + "loss": 0.3383, + "step": 1975 + }, + { + "epoch": 0.6030825576072029, + "grad_norm": 1.2206504871463535, + "learning_rate": 3.5928467392765164e-06, + "loss": 0.2515, + "step": 1976 + }, + { + "epoch": 0.6033877613306883, + "grad_norm": 1.0845150310236251, + "learning_rate": 3.5881029923867017e-06, + "loss": 0.2363, + "step": 1977 + }, + { + "epoch": 0.6036929650541737, + "grad_norm": 1.4588781519857335, + "learning_rate": 3.5833606260982503e-06, + "loss": 0.436, + "step": 1978 + }, + { + "epoch": 0.6039981687776591, + "grad_norm": 1.4518968868371211, + "learning_rate": 3.5786196450484112e-06, + "loss": 0.3958, + "step": 1979 + }, + { + "epoch": 0.6043033725011445, + "grad_norm": 1.3211666256268693, + "learning_rate": 3.5738800538730802e-06, + "loss": 0.2995, + "step": 1980 + }, + { + "epoch": 0.6046085762246299, + "grad_norm": 1.0657353229411386, + "learning_rate": 3.5691418572067895e-06, + "loss": 0.2288, + "step": 1981 + }, + { + "epoch": 0.6049137799481153, + "grad_norm": 1.3448858561527488, + "learning_rate": 3.5644050596827106e-06, + "loss": 0.3239, + "step": 1982 + }, + { + "epoch": 0.6052189836716008, + "grad_norm": 1.3294545418957675, + "learning_rate": 3.55966966593265e-06, + "loss": 0.3452, + "step": 1983 + }, + { + "epoch": 0.6055241873950862, + "grad_norm": 1.3759155693099883, + "learning_rate": 3.554935680587035e-06, + "loss": 0.2258, + "step": 1984 + }, + { + "epoch": 0.6058293911185716, + "grad_norm": 1.5070104491764489, + "learning_rate": 3.55020310827492e-06, + "loss": 0.2878, + "step": 1985 + }, + { + "epoch": 0.6061345948420571, + "grad_norm": 1.879756425844785, + "learning_rate": 3.545471953623978e-06, + "loss": 0.2489, + "step": 1986 + }, + { + "epoch": 0.6064397985655425, + "grad_norm": 1.4696428547590852, + "learning_rate": 3.540742221260492e-06, + "loss": 0.2002, + "step": 1987 + }, + { + "epoch": 0.606745002289028, + "grad_norm": 1.494970951117057, + "learning_rate": 3.5360139158093576e-06, + "loss": 0.4269, + "step": 1988 + }, + { + "epoch": 0.6070502060125134, + "grad_norm": 1.6030926349786003, + "learning_rate": 3.5312870418940758e-06, + "loss": 0.4339, + "step": 1989 + }, + { + "epoch": 0.6073554097359988, + "grad_norm": 1.5754902818308054, + "learning_rate": 3.5265616041367445e-06, + "loss": 0.3722, + "step": 1990 + }, + { + "epoch": 0.6076606134594842, + "grad_norm": 1.5153266377103214, + "learning_rate": 3.5218376071580572e-06, + "loss": 0.352, + "step": 1991 + }, + { + "epoch": 0.6079658171829696, + "grad_norm": 1.1711146621923718, + "learning_rate": 3.5171150555773036e-06, + "loss": 0.2412, + "step": 1992 + }, + { + "epoch": 0.608271020906455, + "grad_norm": 1.4680707456552144, + "learning_rate": 3.5123939540123564e-06, + "loss": 0.3318, + "step": 1993 + }, + { + "epoch": 0.6085762246299404, + "grad_norm": 1.4459308023423367, + "learning_rate": 3.50767430707967e-06, + "loss": 0.4519, + "step": 1994 + }, + { + "epoch": 0.608881428353426, + "grad_norm": 1.3206132625233349, + "learning_rate": 3.5029561193942772e-06, + "loss": 0.4462, + "step": 1995 + }, + { + "epoch": 0.6091866320769114, + "grad_norm": 2.04442630508687, + "learning_rate": 3.4982393955697835e-06, + "loss": 0.3718, + "step": 1996 + }, + { + "epoch": 0.6094918358003968, + "grad_norm": 1.5146144368766, + "learning_rate": 3.4935241402183647e-06, + "loss": 0.2436, + "step": 1997 + }, + { + "epoch": 0.6097970395238822, + "grad_norm": 1.5272208942456742, + "learning_rate": 3.488810357950757e-06, + "loss": 0.3587, + "step": 1998 + }, + { + "epoch": 0.6101022432473676, + "grad_norm": 1.3609917333157049, + "learning_rate": 3.484098053376261e-06, + "loss": 0.45, + "step": 1999 + }, + { + "epoch": 0.610407446970853, + "grad_norm": 1.5254999529433142, + "learning_rate": 3.4793872311027286e-06, + "loss": 0.4317, + "step": 2000 + }, + { + "epoch": 0.6107126506943384, + "grad_norm": 1.1504364263616889, + "learning_rate": 3.4746778957365633e-06, + "loss": 0.2089, + "step": 2001 + }, + { + "epoch": 0.6110178544178239, + "grad_norm": 1.5797256057625808, + "learning_rate": 3.469970051882716e-06, + "loss": 0.3793, + "step": 2002 + }, + { + "epoch": 0.6113230581413093, + "grad_norm": 1.5176225849644855, + "learning_rate": 3.4652637041446768e-06, + "loss": 0.4404, + "step": 2003 + }, + { + "epoch": 0.6116282618647948, + "grad_norm": 1.3316765366062373, + "learning_rate": 3.4605588571244754e-06, + "loss": 0.2728, + "step": 2004 + }, + { + "epoch": 0.6119334655882802, + "grad_norm": 1.4620157434526462, + "learning_rate": 3.455855515422672e-06, + "loss": 0.1825, + "step": 2005 + }, + { + "epoch": 0.6122386693117656, + "grad_norm": 1.3467097842675653, + "learning_rate": 3.451153683638355e-06, + "loss": 0.1818, + "step": 2006 + }, + { + "epoch": 0.612543873035251, + "grad_norm": 1.5570818229152807, + "learning_rate": 3.4464533663691357e-06, + "loss": 0.3795, + "step": 2007 + }, + { + "epoch": 0.6128490767587365, + "grad_norm": 1.1553422096400865, + "learning_rate": 3.44175456821115e-06, + "loss": 0.317, + "step": 2008 + }, + { + "epoch": 0.6131542804822219, + "grad_norm": 1.585321917649714, + "learning_rate": 3.4370572937590394e-06, + "loss": 0.4378, + "step": 2009 + }, + { + "epoch": 0.6134594842057073, + "grad_norm": 1.5798636952287084, + "learning_rate": 3.432361547605965e-06, + "loss": 0.5201, + "step": 2010 + }, + { + "epoch": 0.6137646879291927, + "grad_norm": 1.7181031830680185, + "learning_rate": 3.4276673343435846e-06, + "loss": 0.2838, + "step": 2011 + }, + { + "epoch": 0.6140698916526781, + "grad_norm": 1.2099924152704025, + "learning_rate": 3.422974658562062e-06, + "loss": 0.3456, + "step": 2012 + }, + { + "epoch": 0.6143750953761636, + "grad_norm": 1.416064620650194, + "learning_rate": 3.4182835248500584e-06, + "loss": 0.2049, + "step": 2013 + }, + { + "epoch": 0.614680299099649, + "grad_norm": 1.4845303102857086, + "learning_rate": 3.4135939377947225e-06, + "loss": 0.4289, + "step": 2014 + }, + { + "epoch": 0.6149855028231345, + "grad_norm": 1.6544476184824486, + "learning_rate": 3.4089059019816957e-06, + "loss": 0.4024, + "step": 2015 + }, + { + "epoch": 0.6152907065466199, + "grad_norm": 1.2671491134981838, + "learning_rate": 3.4042194219951002e-06, + "loss": 0.3065, + "step": 2016 + }, + { + "epoch": 0.6155959102701053, + "grad_norm": 1.4331331682533894, + "learning_rate": 3.3995345024175354e-06, + "loss": 0.3072, + "step": 2017 + }, + { + "epoch": 0.6159011139935907, + "grad_norm": 1.2838500089236313, + "learning_rate": 3.394851147830078e-06, + "loss": 0.2258, + "step": 2018 + }, + { + "epoch": 0.6162063177170761, + "grad_norm": 1.6058392990442283, + "learning_rate": 3.3901693628122735e-06, + "loss": 0.5359, + "step": 2019 + }, + { + "epoch": 0.6165115214405615, + "grad_norm": 1.433938299450454, + "learning_rate": 3.3854891519421295e-06, + "loss": 0.4852, + "step": 2020 + }, + { + "epoch": 0.616816725164047, + "grad_norm": 1.5971858071124367, + "learning_rate": 3.3808105197961183e-06, + "loss": 0.3521, + "step": 2021 + }, + { + "epoch": 0.6171219288875325, + "grad_norm": 1.5979296909615484, + "learning_rate": 3.376133470949169e-06, + "loss": 0.3326, + "step": 2022 + }, + { + "epoch": 0.6174271326110179, + "grad_norm": 1.6015247619648738, + "learning_rate": 3.3714580099746565e-06, + "loss": 0.2547, + "step": 2023 + }, + { + "epoch": 0.6177323363345033, + "grad_norm": 1.7584158984983258, + "learning_rate": 3.366784141444409e-06, + "loss": 0.2062, + "step": 2024 + }, + { + "epoch": 0.6180375400579887, + "grad_norm": 2.0900618679983007, + "learning_rate": 3.362111869928696e-06, + "loss": 0.1623, + "step": 2025 + }, + { + "epoch": 0.6183427437814741, + "grad_norm": 1.4144306862691312, + "learning_rate": 3.3574411999962262e-06, + "loss": 0.2689, + "step": 2026 + }, + { + "epoch": 0.6186479475049596, + "grad_norm": 1.083391952151385, + "learning_rate": 3.3527721362141384e-06, + "loss": 0.2124, + "step": 2027 + }, + { + "epoch": 0.618953151228445, + "grad_norm": 1.5982387385586947, + "learning_rate": 3.3481046831480045e-06, + "loss": 0.4185, + "step": 2028 + }, + { + "epoch": 0.6192583549519304, + "grad_norm": 1.5038134261380083, + "learning_rate": 3.343438845361822e-06, + "loss": 0.3102, + "step": 2029 + }, + { + "epoch": 0.6195635586754158, + "grad_norm": 1.6651988559686899, + "learning_rate": 3.3387746274180045e-06, + "loss": 0.4605, + "step": 2030 + }, + { + "epoch": 0.6198687623989013, + "grad_norm": 1.4493202523521755, + "learning_rate": 3.3341120338773854e-06, + "loss": 0.2189, + "step": 2031 + }, + { + "epoch": 0.6201739661223867, + "grad_norm": 1.3263552824004858, + "learning_rate": 3.32945106929921e-06, + "loss": 0.2189, + "step": 2032 + }, + { + "epoch": 0.6204791698458721, + "grad_norm": 1.103331639979082, + "learning_rate": 3.3247917382411276e-06, + "loss": 0.2551, + "step": 2033 + }, + { + "epoch": 0.6207843735693576, + "grad_norm": 1.7271412703578817, + "learning_rate": 3.3201340452591924e-06, + "loss": 0.3365, + "step": 2034 + }, + { + "epoch": 0.621089577292843, + "grad_norm": 1.4270357648434941, + "learning_rate": 3.3154779949078587e-06, + "loss": 0.2515, + "step": 2035 + }, + { + "epoch": 0.6213947810163284, + "grad_norm": 1.561423579417306, + "learning_rate": 3.310823591739969e-06, + "loss": 0.4633, + "step": 2036 + }, + { + "epoch": 0.6216999847398138, + "grad_norm": 1.4440339903172814, + "learning_rate": 3.3061708403067605e-06, + "loss": 0.2857, + "step": 2037 + }, + { + "epoch": 0.6220051884632992, + "grad_norm": 1.3703992339885327, + "learning_rate": 3.301519745157854e-06, + "loss": 0.3527, + "step": 2038 + }, + { + "epoch": 0.6223103921867846, + "grad_norm": 2.3498289492425277, + "learning_rate": 3.296870310841248e-06, + "loss": 0.3941, + "step": 2039 + }, + { + "epoch": 0.6226155959102702, + "grad_norm": 1.755215094880646, + "learning_rate": 3.2922225419033182e-06, + "loss": 0.3497, + "step": 2040 + }, + { + "epoch": 0.6229207996337556, + "grad_norm": 1.751629343382298, + "learning_rate": 3.287576442888816e-06, + "loss": 0.2313, + "step": 2041 + }, + { + "epoch": 0.623226003357241, + "grad_norm": 1.4654791916014522, + "learning_rate": 3.2829320183408565e-06, + "loss": 0.4198, + "step": 2042 + }, + { + "epoch": 0.6235312070807264, + "grad_norm": 1.445719068924257, + "learning_rate": 3.278289272800914e-06, + "loss": 0.2312, + "step": 2043 + }, + { + "epoch": 0.6238364108042118, + "grad_norm": 1.323658332280364, + "learning_rate": 3.2736482108088263e-06, + "loss": 0.1885, + "step": 2044 + }, + { + "epoch": 0.6241416145276972, + "grad_norm": 1.3388854400481305, + "learning_rate": 3.2690088369027845e-06, + "loss": 0.2285, + "step": 2045 + }, + { + "epoch": 0.6244468182511826, + "grad_norm": 1.6613302948823414, + "learning_rate": 3.264371155619325e-06, + "loss": 0.3752, + "step": 2046 + }, + { + "epoch": 0.6247520219746681, + "grad_norm": 1.319712304928442, + "learning_rate": 3.2597351714933333e-06, + "loss": 0.3389, + "step": 2047 + }, + { + "epoch": 0.6250572256981535, + "grad_norm": 1.7970433031956077, + "learning_rate": 3.255100889058035e-06, + "loss": 0.2808, + "step": 2048 + }, + { + "epoch": 0.625362429421639, + "grad_norm": 1.344036604013848, + "learning_rate": 3.2504683128449877e-06, + "loss": 0.2621, + "step": 2049 + }, + { + "epoch": 0.6256676331451244, + "grad_norm": 1.1089242368639294, + "learning_rate": 3.245837447384085e-06, + "loss": 0.2054, + "step": 2050 + }, + { + "epoch": 0.6259728368686098, + "grad_norm": 1.2462381022200077, + "learning_rate": 3.2412082972035474e-06, + "loss": 0.207, + "step": 2051 + }, + { + "epoch": 0.6262780405920952, + "grad_norm": 2.099058566922468, + "learning_rate": 3.236580866829914e-06, + "loss": 0.2684, + "step": 2052 + }, + { + "epoch": 0.6265832443155807, + "grad_norm": 1.5959101977981203, + "learning_rate": 3.2319551607880465e-06, + "loss": 0.1538, + "step": 2053 + }, + { + "epoch": 0.6268884480390661, + "grad_norm": 1.5808452163488298, + "learning_rate": 3.2273311836011212e-06, + "loss": 0.4676, + "step": 2054 + }, + { + "epoch": 0.6271936517625515, + "grad_norm": 1.5783441976772852, + "learning_rate": 3.2227089397906176e-06, + "loss": 0.2694, + "step": 2055 + }, + { + "epoch": 0.6274988554860369, + "grad_norm": 1.3233154276606143, + "learning_rate": 3.2180884338763263e-06, + "loss": 0.2815, + "step": 2056 + }, + { + "epoch": 0.6278040592095223, + "grad_norm": 1.3683772986237586, + "learning_rate": 3.213469670376337e-06, + "loss": 0.4691, + "step": 2057 + }, + { + "epoch": 0.6281092629330077, + "grad_norm": 1.3814349020532382, + "learning_rate": 3.208852653807035e-06, + "loss": 0.3432, + "step": 2058 + }, + { + "epoch": 0.6284144666564933, + "grad_norm": 1.7053781007379099, + "learning_rate": 3.2042373886830997e-06, + "loss": 0.1941, + "step": 2059 + }, + { + "epoch": 0.6287196703799787, + "grad_norm": 1.8130342419647294, + "learning_rate": 3.1996238795174915e-06, + "loss": 0.3539, + "step": 2060 + }, + { + "epoch": 0.6290248741034641, + "grad_norm": 1.5884229605259044, + "learning_rate": 3.195012130821461e-06, + "loss": 0.2894, + "step": 2061 + }, + { + "epoch": 0.6293300778269495, + "grad_norm": 1.5727476151253794, + "learning_rate": 3.1904021471045343e-06, + "loss": 0.2491, + "step": 2062 + }, + { + "epoch": 0.6296352815504349, + "grad_norm": 1.3468078596546862, + "learning_rate": 3.1857939328745103e-06, + "loss": 0.2732, + "step": 2063 + }, + { + "epoch": 0.6299404852739203, + "grad_norm": 1.6412286283753263, + "learning_rate": 3.181187492637461e-06, + "loss": 0.3381, + "step": 2064 + }, + { + "epoch": 0.6302456889974057, + "grad_norm": 1.2694648607941608, + "learning_rate": 3.1765828308977194e-06, + "loss": 0.1918, + "step": 2065 + }, + { + "epoch": 0.6305508927208912, + "grad_norm": 1.6855358419576623, + "learning_rate": 3.1719799521578827e-06, + "loss": 0.3533, + "step": 2066 + }, + { + "epoch": 0.6308560964443766, + "grad_norm": 1.5392790511318661, + "learning_rate": 3.167378860918805e-06, + "loss": 0.4304, + "step": 2067 + }, + { + "epoch": 0.6311613001678621, + "grad_norm": 1.1533721472892073, + "learning_rate": 3.1627795616795897e-06, + "loss": 0.2677, + "step": 2068 + }, + { + "epoch": 0.6314665038913475, + "grad_norm": 2.1180949501028907, + "learning_rate": 3.1581820589375905e-06, + "loss": 0.3112, + "step": 2069 + }, + { + "epoch": 0.6317717076148329, + "grad_norm": 1.5671800333683827, + "learning_rate": 3.153586357188404e-06, + "loss": 0.325, + "step": 2070 + }, + { + "epoch": 0.6320769113383183, + "grad_norm": 2.097007040082022, + "learning_rate": 3.1489924609258625e-06, + "loss": 0.4118, + "step": 2071 + }, + { + "epoch": 0.6323821150618038, + "grad_norm": 1.4393021440971188, + "learning_rate": 3.144400374642038e-06, + "loss": 0.1812, + "step": 2072 + }, + { + "epoch": 0.6326873187852892, + "grad_norm": 1.3994869790842874, + "learning_rate": 3.1398101028272276e-06, + "loss": 0.2627, + "step": 2073 + }, + { + "epoch": 0.6329925225087746, + "grad_norm": 1.5025259049351598, + "learning_rate": 3.13522164996996e-06, + "loss": 0.5017, + "step": 2074 + }, + { + "epoch": 0.63329772623226, + "grad_norm": 1.4470049047960356, + "learning_rate": 3.1306350205569815e-06, + "loss": 0.2555, + "step": 2075 + }, + { + "epoch": 0.6336029299557454, + "grad_norm": 1.2392322727033596, + "learning_rate": 3.1260502190732526e-06, + "loss": 0.2755, + "step": 2076 + }, + { + "epoch": 0.6339081336792309, + "grad_norm": 1.511805726538404, + "learning_rate": 3.121467250001952e-06, + "loss": 0.3459, + "step": 2077 + }, + { + "epoch": 0.6342133374027163, + "grad_norm": 1.3525250485792497, + "learning_rate": 3.1168861178244647e-06, + "loss": 0.3096, + "step": 2078 + }, + { + "epoch": 0.6345185411262018, + "grad_norm": 1.307945705908503, + "learning_rate": 3.112306827020377e-06, + "loss": 0.2578, + "step": 2079 + }, + { + "epoch": 0.6348237448496872, + "grad_norm": 1.5394497301963965, + "learning_rate": 3.107729382067478e-06, + "loss": 0.3444, + "step": 2080 + }, + { + "epoch": 0.6351289485731726, + "grad_norm": 1.598444737094519, + "learning_rate": 3.1031537874417518e-06, + "loss": 0.4928, + "step": 2081 + }, + { + "epoch": 0.635434152296658, + "grad_norm": 1.5971914795206672, + "learning_rate": 3.0985800476173684e-06, + "loss": 0.4915, + "step": 2082 + }, + { + "epoch": 0.6357393560201434, + "grad_norm": 1.6527534139352973, + "learning_rate": 3.09400816706669e-06, + "loss": 0.4085, + "step": 2083 + }, + { + "epoch": 0.6360445597436288, + "grad_norm": 1.6401130967569915, + "learning_rate": 3.089438150260259e-06, + "loss": 0.3713, + "step": 2084 + }, + { + "epoch": 0.6363497634671142, + "grad_norm": 1.541484340849266, + "learning_rate": 3.084870001666792e-06, + "loss": 0.4459, + "step": 2085 + }, + { + "epoch": 0.6366549671905998, + "grad_norm": 1.0952222452240987, + "learning_rate": 3.0803037257531832e-06, + "loss": 0.2727, + "step": 2086 + }, + { + "epoch": 0.6369601709140852, + "grad_norm": 1.5642281702733645, + "learning_rate": 3.075739326984496e-06, + "loss": 0.3154, + "step": 2087 + }, + { + "epoch": 0.6372653746375706, + "grad_norm": 1.7115958702034013, + "learning_rate": 3.071176809823952e-06, + "loss": 0.3098, + "step": 2088 + }, + { + "epoch": 0.637570578361056, + "grad_norm": 1.4996254771292976, + "learning_rate": 3.0666161787329385e-06, + "loss": 0.3766, + "step": 2089 + }, + { + "epoch": 0.6378757820845414, + "grad_norm": 1.6333493167069202, + "learning_rate": 3.062057438170999e-06, + "loss": 0.2892, + "step": 2090 + }, + { + "epoch": 0.6381809858080268, + "grad_norm": 1.5987303464634444, + "learning_rate": 3.0575005925958262e-06, + "loss": 0.3212, + "step": 2091 + }, + { + "epoch": 0.6384861895315123, + "grad_norm": 1.4072617176331819, + "learning_rate": 3.0529456464632582e-06, + "loss": 0.1471, + "step": 2092 + }, + { + "epoch": 0.6387913932549977, + "grad_norm": 1.571466126550538, + "learning_rate": 3.0483926042272794e-06, + "loss": 0.3683, + "step": 2093 + }, + { + "epoch": 0.6390965969784831, + "grad_norm": 1.2227746095740997, + "learning_rate": 3.043841470340011e-06, + "loss": 0.2164, + "step": 2094 + }, + { + "epoch": 0.6394018007019686, + "grad_norm": 1.7729233319896496, + "learning_rate": 3.0392922492517056e-06, + "loss": 0.6999, + "step": 2095 + }, + { + "epoch": 0.639707004425454, + "grad_norm": 1.5388646652175832, + "learning_rate": 3.0347449454107502e-06, + "loss": 0.3488, + "step": 2096 + }, + { + "epoch": 0.6400122081489394, + "grad_norm": 1.8500513275861752, + "learning_rate": 3.030199563263654e-06, + "loss": 0.245, + "step": 2097 + }, + { + "epoch": 0.6403174118724249, + "grad_norm": 1.4391914036577291, + "learning_rate": 3.0256561072550474e-06, + "loss": 0.3902, + "step": 2098 + }, + { + "epoch": 0.6406226155959103, + "grad_norm": 1.3225074896212763, + "learning_rate": 3.0211145818276766e-06, + "loss": 0.2795, + "step": 2099 + }, + { + "epoch": 0.6409278193193957, + "grad_norm": 1.4008863095885327, + "learning_rate": 3.016574991422404e-06, + "loss": 0.4512, + "step": 2100 + }, + { + "epoch": 0.6412330230428811, + "grad_norm": 1.3077431718505468, + "learning_rate": 3.0120373404781934e-06, + "loss": 0.345, + "step": 2101 + }, + { + "epoch": 0.6415382267663665, + "grad_norm": 1.410336892347925, + "learning_rate": 3.0075016334321176e-06, + "loss": 0.1603, + "step": 2102 + }, + { + "epoch": 0.6418434304898519, + "grad_norm": 1.296606564603937, + "learning_rate": 3.002967874719348e-06, + "loss": 0.2145, + "step": 2103 + }, + { + "epoch": 0.6421486342133375, + "grad_norm": 2.160768722154708, + "learning_rate": 2.998436068773146e-06, + "loss": 0.2316, + "step": 2104 + }, + { + "epoch": 0.6424538379368229, + "grad_norm": 1.6235477120050208, + "learning_rate": 2.9939062200248697e-06, + "loss": 0.4334, + "step": 2105 + }, + { + "epoch": 0.6427590416603083, + "grad_norm": 1.586342896010167, + "learning_rate": 2.989378332903959e-06, + "loss": 0.2349, + "step": 2106 + }, + { + "epoch": 0.6430642453837937, + "grad_norm": 1.4002403274390032, + "learning_rate": 2.9848524118379403e-06, + "loss": 0.3905, + "step": 2107 + }, + { + "epoch": 0.6433694491072791, + "grad_norm": 1.5982996011356052, + "learning_rate": 2.9803284612524115e-06, + "loss": 0.41, + "step": 2108 + }, + { + "epoch": 0.6436746528307645, + "grad_norm": 2.297067231937714, + "learning_rate": 2.9758064855710477e-06, + "loss": 0.3621, + "step": 2109 + }, + { + "epoch": 0.6439798565542499, + "grad_norm": 2.318012369052118, + "learning_rate": 2.9712864892155934e-06, + "loss": 0.3104, + "step": 2110 + }, + { + "epoch": 0.6442850602777354, + "grad_norm": 2.142990253492185, + "learning_rate": 2.966768476605854e-06, + "loss": 0.3539, + "step": 2111 + }, + { + "epoch": 0.6445902640012208, + "grad_norm": 1.4203788515210105, + "learning_rate": 2.9622524521596986e-06, + "loss": 0.4049, + "step": 2112 + }, + { + "epoch": 0.6448954677247063, + "grad_norm": 1.46424450633585, + "learning_rate": 2.9577384202930515e-06, + "loss": 0.3637, + "step": 2113 + }, + { + "epoch": 0.6452006714481917, + "grad_norm": 1.3445167128438134, + "learning_rate": 2.9532263854198864e-06, + "loss": 0.2368, + "step": 2114 + }, + { + "epoch": 0.6455058751716771, + "grad_norm": 1.4888215773741194, + "learning_rate": 2.9487163519522268e-06, + "loss": 0.3003, + "step": 2115 + }, + { + "epoch": 0.6458110788951625, + "grad_norm": 1.7424695252828115, + "learning_rate": 2.9442083243001383e-06, + "loss": 0.4063, + "step": 2116 + }, + { + "epoch": 0.646116282618648, + "grad_norm": 1.5240331023468454, + "learning_rate": 2.9397023068717245e-06, + "loss": 0.3378, + "step": 2117 + }, + { + "epoch": 0.6464214863421334, + "grad_norm": 1.449502099811024, + "learning_rate": 2.935198304073125e-06, + "loss": 0.4407, + "step": 2118 + }, + { + "epoch": 0.6467266900656188, + "grad_norm": 1.5464626494768008, + "learning_rate": 2.9306963203085076e-06, + "loss": 0.4029, + "step": 2119 + }, + { + "epoch": 0.6470318937891042, + "grad_norm": 1.4483921541854292, + "learning_rate": 2.9261963599800663e-06, + "loss": 0.5365, + "step": 2120 + }, + { + "epoch": 0.6473370975125896, + "grad_norm": 1.6113587532600442, + "learning_rate": 2.9216984274880174e-06, + "loss": 0.3575, + "step": 2121 + }, + { + "epoch": 0.6476423012360751, + "grad_norm": 1.3748376056834286, + "learning_rate": 2.917202527230592e-06, + "loss": 0.3129, + "step": 2122 + }, + { + "epoch": 0.6479475049595605, + "grad_norm": 1.446286260072825, + "learning_rate": 2.912708663604039e-06, + "loss": 0.3331, + "step": 2123 + }, + { + "epoch": 0.648252708683046, + "grad_norm": 1.3725686685307052, + "learning_rate": 2.90821684100261e-06, + "loss": 0.1887, + "step": 2124 + }, + { + "epoch": 0.6485579124065314, + "grad_norm": 1.2658837136411039, + "learning_rate": 2.903727063818565e-06, + "loss": 0.181, + "step": 2125 + }, + { + "epoch": 0.6488631161300168, + "grad_norm": 1.6770300846847512, + "learning_rate": 2.899239336442162e-06, + "loss": 0.3232, + "step": 2126 + }, + { + "epoch": 0.6491683198535022, + "grad_norm": 1.5126111454755953, + "learning_rate": 2.8947536632616514e-06, + "loss": 0.3019, + "step": 2127 + }, + { + "epoch": 0.6494735235769876, + "grad_norm": 1.9696856500616016, + "learning_rate": 2.890270048663284e-06, + "loss": 0.2439, + "step": 2128 + }, + { + "epoch": 0.649778727300473, + "grad_norm": 1.7741220640229292, + "learning_rate": 2.8857884970312895e-06, + "loss": 0.3048, + "step": 2129 + }, + { + "epoch": 0.6500839310239585, + "grad_norm": 1.3726947707371107, + "learning_rate": 2.8813090127478805e-06, + "loss": 0.2802, + "step": 2130 + }, + { + "epoch": 0.6503891347474439, + "grad_norm": 1.3405879764735342, + "learning_rate": 2.8768316001932544e-06, + "loss": 0.3458, + "step": 2131 + }, + { + "epoch": 0.6506943384709294, + "grad_norm": 1.4428183815455966, + "learning_rate": 2.8723562637455775e-06, + "loss": 0.3397, + "step": 2132 + }, + { + "epoch": 0.6509995421944148, + "grad_norm": 1.4755108029659387, + "learning_rate": 2.8678830077809855e-06, + "loss": 0.2699, + "step": 2133 + }, + { + "epoch": 0.6513047459179002, + "grad_norm": 5.025887231644773, + "learning_rate": 2.8634118366735853e-06, + "loss": 0.3069, + "step": 2134 + }, + { + "epoch": 0.6516099496413856, + "grad_norm": 1.7675406942039875, + "learning_rate": 2.8589427547954395e-06, + "loss": 0.3095, + "step": 2135 + }, + { + "epoch": 0.651915153364871, + "grad_norm": 1.6515770975584783, + "learning_rate": 2.854475766516568e-06, + "loss": 0.4581, + "step": 2136 + }, + { + "epoch": 0.6522203570883565, + "grad_norm": 1.5412767621738763, + "learning_rate": 2.850010876204949e-06, + "loss": 0.426, + "step": 2137 + }, + { + "epoch": 0.6525255608118419, + "grad_norm": 1.6086969521289778, + "learning_rate": 2.8455480882265013e-06, + "loss": 0.5286, + "step": 2138 + }, + { + "epoch": 0.6528307645353273, + "grad_norm": 1.2747819975932901, + "learning_rate": 2.8410874069450965e-06, + "loss": 0.1708, + "step": 2139 + }, + { + "epoch": 0.6531359682588127, + "grad_norm": 1.6086675329642104, + "learning_rate": 2.836628836722539e-06, + "loss": 0.3442, + "step": 2140 + }, + { + "epoch": 0.6534411719822982, + "grad_norm": 1.9070796724428367, + "learning_rate": 2.8321723819185745e-06, + "loss": 0.5537, + "step": 2141 + }, + { + "epoch": 0.6537463757057836, + "grad_norm": 1.3332865329317232, + "learning_rate": 2.827718046890875e-06, + "loss": 0.2816, + "step": 2142 + }, + { + "epoch": 0.6540515794292691, + "grad_norm": 1.730187662342969, + "learning_rate": 2.8232658359950413e-06, + "loss": 0.2651, + "step": 2143 + }, + { + "epoch": 0.6543567831527545, + "grad_norm": 1.9822012341616055, + "learning_rate": 2.8188157535846017e-06, + "loss": 0.3211, + "step": 2144 + }, + { + "epoch": 0.6546619868762399, + "grad_norm": 1.5474652021744844, + "learning_rate": 2.8143678040109977e-06, + "loss": 0.3248, + "step": 2145 + }, + { + "epoch": 0.6549671905997253, + "grad_norm": 1.5206826366158495, + "learning_rate": 2.8099219916235846e-06, + "loss": 0.2097, + "step": 2146 + }, + { + "epoch": 0.6552723943232107, + "grad_norm": 1.3269303727241766, + "learning_rate": 2.8054783207696355e-06, + "loss": 0.2827, + "step": 2147 + }, + { + "epoch": 0.6555775980466961, + "grad_norm": 1.6297771220740915, + "learning_rate": 2.801036795794321e-06, + "loss": 0.3147, + "step": 2148 + }, + { + "epoch": 0.6558828017701815, + "grad_norm": 1.3416452558163536, + "learning_rate": 2.7965974210407155e-06, + "loss": 0.4029, + "step": 2149 + }, + { + "epoch": 0.6561880054936671, + "grad_norm": 2.1646703424182663, + "learning_rate": 2.7921602008497957e-06, + "loss": 0.4962, + "step": 2150 + }, + { + "epoch": 0.6564932092171525, + "grad_norm": 1.4042343953416563, + "learning_rate": 2.7877251395604256e-06, + "loss": 0.3924, + "step": 2151 + }, + { + "epoch": 0.6567984129406379, + "grad_norm": 1.6602055082306306, + "learning_rate": 2.783292241509359e-06, + "loss": 0.2895, + "step": 2152 + }, + { + "epoch": 0.6571036166641233, + "grad_norm": 1.643207080368195, + "learning_rate": 2.77886151103124e-06, + "loss": 0.3346, + "step": 2153 + }, + { + "epoch": 0.6574088203876087, + "grad_norm": 1.9432752121379677, + "learning_rate": 2.7744329524585855e-06, + "loss": 0.3341, + "step": 2154 + }, + { + "epoch": 0.6577140241110941, + "grad_norm": 1.724986120527517, + "learning_rate": 2.7700065701217915e-06, + "loss": 0.2615, + "step": 2155 + }, + { + "epoch": 0.6580192278345796, + "grad_norm": 1.7374368395405981, + "learning_rate": 2.7655823683491286e-06, + "loss": 0.3564, + "step": 2156 + }, + { + "epoch": 0.658324431558065, + "grad_norm": 1.3937644239286564, + "learning_rate": 2.761160351466734e-06, + "loss": 0.3398, + "step": 2157 + }, + { + "epoch": 0.6586296352815504, + "grad_norm": 1.5512659963171873, + "learning_rate": 2.7567405237986067e-06, + "loss": 0.2583, + "step": 2158 + }, + { + "epoch": 0.6589348390050359, + "grad_norm": 1.7238301193948424, + "learning_rate": 2.7523228896666037e-06, + "loss": 0.4185, + "step": 2159 + }, + { + "epoch": 0.6592400427285213, + "grad_norm": 1.493719782184466, + "learning_rate": 2.747907453390443e-06, + "loss": 0.3503, + "step": 2160 + }, + { + "epoch": 0.6595452464520067, + "grad_norm": 1.6238057076028836, + "learning_rate": 2.743494219287688e-06, + "loss": 0.3759, + "step": 2161 + }, + { + "epoch": 0.6598504501754922, + "grad_norm": 1.7162390660926063, + "learning_rate": 2.7390831916737485e-06, + "loss": 0.3259, + "step": 2162 + }, + { + "epoch": 0.6601556538989776, + "grad_norm": 1.5950502066049759, + "learning_rate": 2.734674374861883e-06, + "loss": 0.5201, + "step": 2163 + }, + { + "epoch": 0.660460857622463, + "grad_norm": 1.306778973153302, + "learning_rate": 2.730267773163181e-06, + "loss": 0.2982, + "step": 2164 + }, + { + "epoch": 0.6607660613459484, + "grad_norm": 1.496021876588278, + "learning_rate": 2.725863390886568e-06, + "loss": 0.4278, + "step": 2165 + }, + { + "epoch": 0.6610712650694338, + "grad_norm": 1.227396687435002, + "learning_rate": 2.721461232338804e-06, + "loss": 0.2761, + "step": 2166 + }, + { + "epoch": 0.6613764687929192, + "grad_norm": 1.7441506038435841, + "learning_rate": 2.7170613018244683e-06, + "loss": 0.555, + "step": 2167 + }, + { + "epoch": 0.6616816725164048, + "grad_norm": 1.6919704844062524, + "learning_rate": 2.712663603645962e-06, + "loss": 0.371, + "step": 2168 + }, + { + "epoch": 0.6619868762398902, + "grad_norm": 1.4900654188298506, + "learning_rate": 2.7082681421035094e-06, + "loss": 0.3146, + "step": 2169 + }, + { + "epoch": 0.6622920799633756, + "grad_norm": 1.1272813130706185, + "learning_rate": 2.703874921495142e-06, + "loss": 0.2102, + "step": 2170 + }, + { + "epoch": 0.662597283686861, + "grad_norm": 1.4795927410136391, + "learning_rate": 2.6994839461167e-06, + "loss": 0.2223, + "step": 2171 + }, + { + "epoch": 0.6629024874103464, + "grad_norm": 1.4827372299722401, + "learning_rate": 2.6950952202618313e-06, + "loss": 0.4663, + "step": 2172 + }, + { + "epoch": 0.6632076911338318, + "grad_norm": 1.861175690980772, + "learning_rate": 2.6907087482219848e-06, + "loss": 0.4224, + "step": 2173 + }, + { + "epoch": 0.6635128948573172, + "grad_norm": 2.5564241718377256, + "learning_rate": 2.6863245342864008e-06, + "loss": 0.1718, + "step": 2174 + }, + { + "epoch": 0.6638180985808027, + "grad_norm": 1.352942583505333, + "learning_rate": 2.681942582742114e-06, + "loss": 0.3322, + "step": 2175 + }, + { + "epoch": 0.6641233023042881, + "grad_norm": 1.4334146247404056, + "learning_rate": 2.677562897873949e-06, + "loss": 0.3147, + "step": 2176 + }, + { + "epoch": 0.6644285060277736, + "grad_norm": 1.3552893264263521, + "learning_rate": 2.673185483964512e-06, + "loss": 0.3708, + "step": 2177 + }, + { + "epoch": 0.664733709751259, + "grad_norm": 1.8684004670444918, + "learning_rate": 2.668810345294185e-06, + "loss": 0.4266, + "step": 2178 + }, + { + "epoch": 0.6650389134747444, + "grad_norm": 1.4768731330410994, + "learning_rate": 2.6644374861411327e-06, + "loss": 0.3179, + "step": 2179 + }, + { + "epoch": 0.6653441171982298, + "grad_norm": 2.1939141106150575, + "learning_rate": 2.660066910781286e-06, + "loss": 0.3913, + "step": 2180 + }, + { + "epoch": 0.6656493209217152, + "grad_norm": 1.7390756913307748, + "learning_rate": 2.655698623488341e-06, + "loss": 0.332, + "step": 2181 + }, + { + "epoch": 0.6659545246452007, + "grad_norm": 1.6594161953455198, + "learning_rate": 2.651332628533764e-06, + "loss": 0.4666, + "step": 2182 + }, + { + "epoch": 0.6662597283686861, + "grad_norm": 1.6759364627548272, + "learning_rate": 2.6469689301867707e-06, + "loss": 0.2634, + "step": 2183 + }, + { + "epoch": 0.6665649320921715, + "grad_norm": 1.4169243783848449, + "learning_rate": 2.6426075327143348e-06, + "loss": 0.327, + "step": 2184 + }, + { + "epoch": 0.6668701358156569, + "grad_norm": 1.6951437307987798, + "learning_rate": 2.638248440381184e-06, + "loss": 0.3784, + "step": 2185 + }, + { + "epoch": 0.6671753395391424, + "grad_norm": 1.4855364622342466, + "learning_rate": 2.633891657449785e-06, + "loss": 0.4026, + "step": 2186 + }, + { + "epoch": 0.6674805432626278, + "grad_norm": 1.6989385264177839, + "learning_rate": 2.6295371881803505e-06, + "loss": 0.3153, + "step": 2187 + }, + { + "epoch": 0.6677857469861133, + "grad_norm": 1.6589449424000402, + "learning_rate": 2.6251850368308307e-06, + "loss": 0.3181, + "step": 2188 + }, + { + "epoch": 0.6680909507095987, + "grad_norm": 1.69333748104809, + "learning_rate": 2.62083520765691e-06, + "loss": 0.3024, + "step": 2189 + }, + { + "epoch": 0.6683961544330841, + "grad_norm": 1.596754047867073, + "learning_rate": 2.6164877049119984e-06, + "loss": 0.1607, + "step": 2190 + }, + { + "epoch": 0.6687013581565695, + "grad_norm": 1.5331805768146902, + "learning_rate": 2.6121425328472333e-06, + "loss": 0.3948, + "step": 2191 + }, + { + "epoch": 0.6690065618800549, + "grad_norm": 1.3316754623873768, + "learning_rate": 2.607799695711476e-06, + "loss": 0.2833, + "step": 2192 + }, + { + "epoch": 0.6693117656035403, + "grad_norm": 1.4953180996621576, + "learning_rate": 2.603459197751299e-06, + "loss": 0.3014, + "step": 2193 + }, + { + "epoch": 0.6696169693270257, + "grad_norm": 1.5357227784615837, + "learning_rate": 2.599121043210989e-06, + "loss": 0.3256, + "step": 2194 + }, + { + "epoch": 0.6699221730505113, + "grad_norm": 1.3627066726724995, + "learning_rate": 2.5947852363325463e-06, + "loss": 0.2163, + "step": 2195 + }, + { + "epoch": 0.6702273767739967, + "grad_norm": 1.6218927459627959, + "learning_rate": 2.5904517813556703e-06, + "loss": 0.257, + "step": 2196 + }, + { + "epoch": 0.6705325804974821, + "grad_norm": 1.537367996503828, + "learning_rate": 2.5861206825177586e-06, + "loss": 0.4094, + "step": 2197 + }, + { + "epoch": 0.6708377842209675, + "grad_norm": 2.2108715034043542, + "learning_rate": 2.581791944053914e-06, + "loss": 0.3225, + "step": 2198 + }, + { + "epoch": 0.6711429879444529, + "grad_norm": 1.31507339693338, + "learning_rate": 2.5774655701969226e-06, + "loss": 0.2566, + "step": 2199 + }, + { + "epoch": 0.6714481916679383, + "grad_norm": 1.1941951940145592, + "learning_rate": 2.5731415651772597e-06, + "loss": 0.2822, + "step": 2200 + }, + { + "epoch": 0.6717533953914238, + "grad_norm": 1.5930828119126677, + "learning_rate": 2.568819933223089e-06, + "loss": 0.3602, + "step": 2201 + }, + { + "epoch": 0.6720585991149092, + "grad_norm": 1.5396556969732447, + "learning_rate": 2.564500678560249e-06, + "loss": 0.4319, + "step": 2202 + }, + { + "epoch": 0.6723638028383946, + "grad_norm": 1.2802585394346762, + "learning_rate": 2.560183805412254e-06, + "loss": 0.2013, + "step": 2203 + }, + { + "epoch": 0.67266900656188, + "grad_norm": 1.4333977422362991, + "learning_rate": 2.5558693180002925e-06, + "loss": 0.3595, + "step": 2204 + }, + { + "epoch": 0.6729742102853655, + "grad_norm": 1.539821534405401, + "learning_rate": 2.5515572205432193e-06, + "loss": 0.3156, + "step": 2205 + }, + { + "epoch": 0.6732794140088509, + "grad_norm": 1.5023269882496997, + "learning_rate": 2.5472475172575508e-06, + "loss": 0.4035, + "step": 2206 + }, + { + "epoch": 0.6735846177323364, + "grad_norm": 1.4676758104903433, + "learning_rate": 2.5429402123574593e-06, + "loss": 0.2568, + "step": 2207 + }, + { + "epoch": 0.6738898214558218, + "grad_norm": 1.4353821121434278, + "learning_rate": 2.5386353100547807e-06, + "loss": 0.3064, + "step": 2208 + }, + { + "epoch": 0.6741950251793072, + "grad_norm": 2.7468215607358695, + "learning_rate": 2.5343328145589933e-06, + "loss": 0.3209, + "step": 2209 + }, + { + "epoch": 0.6745002289027926, + "grad_norm": 1.6208529741654805, + "learning_rate": 2.5300327300772243e-06, + "loss": 0.345, + "step": 2210 + }, + { + "epoch": 0.674805432626278, + "grad_norm": 1.7500265664400212, + "learning_rate": 2.525735060814246e-06, + "loss": 0.2432, + "step": 2211 + }, + { + "epoch": 0.6751106363497634, + "grad_norm": 1.5826410319323236, + "learning_rate": 2.5214398109724666e-06, + "loss": 0.3547, + "step": 2212 + }, + { + "epoch": 0.6754158400732488, + "grad_norm": 1.6468757354329329, + "learning_rate": 2.5171469847519257e-06, + "loss": 0.4582, + "step": 2213 + }, + { + "epoch": 0.6757210437967344, + "grad_norm": 1.524464891699435, + "learning_rate": 2.5128565863503006e-06, + "loss": 0.2804, + "step": 2214 + }, + { + "epoch": 0.6760262475202198, + "grad_norm": 1.757213236067236, + "learning_rate": 2.508568619962889e-06, + "loss": 0.3666, + "step": 2215 + }, + { + "epoch": 0.6763314512437052, + "grad_norm": 1.6424191791501594, + "learning_rate": 2.5042830897826083e-06, + "loss": 0.5182, + "step": 2216 + }, + { + "epoch": 0.6766366549671906, + "grad_norm": 1.5743705733094293, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.3995, + "step": 2217 + }, + { + "epoch": 0.676941858690676, + "grad_norm": 1.9861037166016167, + "learning_rate": 2.4957193548032187e-06, + "loss": 0.4346, + "step": 2218 + }, + { + "epoch": 0.6772470624141614, + "grad_norm": 1.4295765151007014, + "learning_rate": 2.49144115837802e-06, + "loss": 0.2974, + "step": 2219 + }, + { + "epoch": 0.6775522661376469, + "grad_norm": 1.2637981841035508, + "learning_rate": 2.4871654149077747e-06, + "loss": 0.1472, + "step": 2220 + }, + { + "epoch": 0.6778574698611323, + "grad_norm": 1.55571410056159, + "learning_rate": 2.482892128573454e-06, + "loss": 0.3477, + "step": 2221 + }, + { + "epoch": 0.6781626735846177, + "grad_norm": 1.5650642715760328, + "learning_rate": 2.4786213035536205e-06, + "loss": 0.3266, + "step": 2222 + }, + { + "epoch": 0.6784678773081032, + "grad_norm": 1.4957203212256605, + "learning_rate": 2.474352944024433e-06, + "loss": 0.3351, + "step": 2223 + }, + { + "epoch": 0.6787730810315886, + "grad_norm": 1.693859550058235, + "learning_rate": 2.4700870541596434e-06, + "loss": 0.2397, + "step": 2224 + }, + { + "epoch": 0.679078284755074, + "grad_norm": 1.1375829813106715, + "learning_rate": 2.4658236381305834e-06, + "loss": 0.2042, + "step": 2225 + }, + { + "epoch": 0.6793834884785594, + "grad_norm": 1.5914490489420068, + "learning_rate": 2.4615627001061655e-06, + "loss": 0.3779, + "step": 2226 + }, + { + "epoch": 0.6796886922020449, + "grad_norm": 1.747681989893777, + "learning_rate": 2.457304244252885e-06, + "loss": 0.3251, + "step": 2227 + }, + { + "epoch": 0.6799938959255303, + "grad_norm": 1.236007287216926, + "learning_rate": 2.4530482747348045e-06, + "loss": 0.2769, + "step": 2228 + }, + { + "epoch": 0.6802990996490157, + "grad_norm": 1.765434187309303, + "learning_rate": 2.448794795713555e-06, + "loss": 0.4044, + "step": 2229 + }, + { + "epoch": 0.6806043033725011, + "grad_norm": 1.3105969483407403, + "learning_rate": 2.444543811348338e-06, + "loss": 0.3482, + "step": 2230 + }, + { + "epoch": 0.6809095070959865, + "grad_norm": 2.476701221130886, + "learning_rate": 2.4402953257959107e-06, + "loss": 0.2611, + "step": 2231 + }, + { + "epoch": 0.681214710819472, + "grad_norm": 1.3629428476314511, + "learning_rate": 2.436049343210585e-06, + "loss": 0.198, + "step": 2232 + }, + { + "epoch": 0.6815199145429575, + "grad_norm": 1.2075854760475797, + "learning_rate": 2.4318058677442336e-06, + "loss": 0.261, + "step": 2233 + }, + { + "epoch": 0.6818251182664429, + "grad_norm": 1.5747515437022757, + "learning_rate": 2.4275649035462695e-06, + "loss": 0.3806, + "step": 2234 + }, + { + "epoch": 0.6821303219899283, + "grad_norm": 1.7471483702517483, + "learning_rate": 2.423326454763652e-06, + "loss": 0.3672, + "step": 2235 + }, + { + "epoch": 0.6824355257134137, + "grad_norm": 1.3886323003052499, + "learning_rate": 2.4190905255408853e-06, + "loss": 0.3586, + "step": 2236 + }, + { + "epoch": 0.6827407294368991, + "grad_norm": 1.6604409366430954, + "learning_rate": 2.4148571200200025e-06, + "loss": 0.2567, + "step": 2237 + }, + { + "epoch": 0.6830459331603845, + "grad_norm": 1.0963401642522104, + "learning_rate": 2.4106262423405754e-06, + "loss": 0.2167, + "step": 2238 + }, + { + "epoch": 0.68335113688387, + "grad_norm": 1.5618556410622801, + "learning_rate": 2.4063978966397017e-06, + "loss": 0.3866, + "step": 2239 + }, + { + "epoch": 0.6836563406073554, + "grad_norm": 1.7270739043944339, + "learning_rate": 2.4021720870520033e-06, + "loss": 0.2361, + "step": 2240 + }, + { + "epoch": 0.6839615443308409, + "grad_norm": 1.773884452256481, + "learning_rate": 2.397948817709618e-06, + "loss": 0.4827, + "step": 2241 + }, + { + "epoch": 0.6842667480543263, + "grad_norm": 1.6337555712580818, + "learning_rate": 2.3937280927422087e-06, + "loss": 0.2982, + "step": 2242 + }, + { + "epoch": 0.6845719517778117, + "grad_norm": 1.6348042728499839, + "learning_rate": 2.3895099162769426e-06, + "loss": 0.536, + "step": 2243 + }, + { + "epoch": 0.6848771555012971, + "grad_norm": 1.9449403092705355, + "learning_rate": 2.3852942924384953e-06, + "loss": 0.4189, + "step": 2244 + }, + { + "epoch": 0.6851823592247825, + "grad_norm": 1.6119447210408322, + "learning_rate": 2.3810812253490533e-06, + "loss": 0.464, + "step": 2245 + }, + { + "epoch": 0.685487562948268, + "grad_norm": 1.6223325109804345, + "learning_rate": 2.3768707191282957e-06, + "loss": 0.2066, + "step": 2246 + }, + { + "epoch": 0.6857927666717534, + "grad_norm": 1.5158096672896404, + "learning_rate": 2.3726627778933997e-06, + "loss": 0.2889, + "step": 2247 + }, + { + "epoch": 0.6860979703952388, + "grad_norm": 1.5524089736175863, + "learning_rate": 2.3684574057590337e-06, + "loss": 0.2266, + "step": 2248 + }, + { + "epoch": 0.6864031741187242, + "grad_norm": 1.7073229926701183, + "learning_rate": 2.364254606837358e-06, + "loss": 0.3773, + "step": 2249 + }, + { + "epoch": 0.6867083778422097, + "grad_norm": 1.7264873781734367, + "learning_rate": 2.360054385238012e-06, + "loss": 0.2424, + "step": 2250 + }, + { + "epoch": 0.6870135815656951, + "grad_norm": 1.8255696491964866, + "learning_rate": 2.355856745068115e-06, + "loss": 0.4534, + "step": 2251 + }, + { + "epoch": 0.6873187852891806, + "grad_norm": 1.826256343091619, + "learning_rate": 2.351661690432267e-06, + "loss": 0.276, + "step": 2252 + }, + { + "epoch": 0.687623989012666, + "grad_norm": 1.6053853628217218, + "learning_rate": 2.3474692254325328e-06, + "loss": 0.282, + "step": 2253 + }, + { + "epoch": 0.6879291927361514, + "grad_norm": 1.5373106925225748, + "learning_rate": 2.3432793541684502e-06, + "loss": 0.3329, + "step": 2254 + }, + { + "epoch": 0.6882343964596368, + "grad_norm": 2.2342562343804526, + "learning_rate": 2.339092080737021e-06, + "loss": 0.2579, + "step": 2255 + }, + { + "epoch": 0.6885396001831222, + "grad_norm": 1.5812283963020564, + "learning_rate": 2.334907409232704e-06, + "loss": 0.3708, + "step": 2256 + }, + { + "epoch": 0.6888448039066076, + "grad_norm": 1.5494931345932486, + "learning_rate": 2.330725343747412e-06, + "loss": 0.2758, + "step": 2257 + }, + { + "epoch": 0.689150007630093, + "grad_norm": 1.6512441741586468, + "learning_rate": 2.326545888370516e-06, + "loss": 0.2963, + "step": 2258 + }, + { + "epoch": 0.6894552113535786, + "grad_norm": 1.396566065819781, + "learning_rate": 2.3223690471888287e-06, + "loss": 0.14, + "step": 2259 + }, + { + "epoch": 0.689760415077064, + "grad_norm": 1.6348208984364616, + "learning_rate": 2.318194824286608e-06, + "loss": 0.2872, + "step": 2260 + }, + { + "epoch": 0.6900656188005494, + "grad_norm": 1.3921842084817904, + "learning_rate": 2.3140232237455557e-06, + "loss": 0.2908, + "step": 2261 + }, + { + "epoch": 0.6903708225240348, + "grad_norm": 1.5516133802355774, + "learning_rate": 2.3098542496448035e-06, + "loss": 0.3063, + "step": 2262 + }, + { + "epoch": 0.6906760262475202, + "grad_norm": 1.1592235990721869, + "learning_rate": 2.3056879060609154e-06, + "loss": 0.253, + "step": 2263 + }, + { + "epoch": 0.6909812299710056, + "grad_norm": 1.4434332099581564, + "learning_rate": 2.3015241970678904e-06, + "loss": 0.4221, + "step": 2264 + }, + { + "epoch": 0.691286433694491, + "grad_norm": 1.4893591474232895, + "learning_rate": 2.297363126737143e-06, + "loss": 0.2535, + "step": 2265 + }, + { + "epoch": 0.6915916374179765, + "grad_norm": 1.583421487194619, + "learning_rate": 2.2932046991375083e-06, + "loss": 0.417, + "step": 2266 + }, + { + "epoch": 0.6918968411414619, + "grad_norm": 1.5192571946940916, + "learning_rate": 2.289048918335245e-06, + "loss": 0.3871, + "step": 2267 + }, + { + "epoch": 0.6922020448649474, + "grad_norm": 1.4427150996998601, + "learning_rate": 2.2848957883940155e-06, + "loss": 0.1737, + "step": 2268 + }, + { + "epoch": 0.6925072485884328, + "grad_norm": 1.5122006458988937, + "learning_rate": 2.2807453133748904e-06, + "loss": 0.4822, + "step": 2269 + }, + { + "epoch": 0.6928124523119182, + "grad_norm": 1.6984069989930315, + "learning_rate": 2.276597497336349e-06, + "loss": 0.3834, + "step": 2270 + }, + { + "epoch": 0.6931176560354037, + "grad_norm": 1.3572682245688865, + "learning_rate": 2.27245234433427e-06, + "loss": 0.2541, + "step": 2271 + }, + { + "epoch": 0.6934228597588891, + "grad_norm": 1.8251110565674988, + "learning_rate": 2.268309858421925e-06, + "loss": 0.3142, + "step": 2272 + }, + { + "epoch": 0.6937280634823745, + "grad_norm": 1.3607310404566657, + "learning_rate": 2.264170043649976e-06, + "loss": 0.2144, + "step": 2273 + }, + { + "epoch": 0.6940332672058599, + "grad_norm": 1.7300473770093052, + "learning_rate": 2.2600329040664805e-06, + "loss": 0.3721, + "step": 2274 + }, + { + "epoch": 0.6943384709293453, + "grad_norm": 1.7843543323716198, + "learning_rate": 2.2558984437168735e-06, + "loss": 0.4995, + "step": 2275 + }, + { + "epoch": 0.6946436746528307, + "grad_norm": 1.2786625111949597, + "learning_rate": 2.251766666643971e-06, + "loss": 0.2811, + "step": 2276 + }, + { + "epoch": 0.6949488783763161, + "grad_norm": 1.6238076163532298, + "learning_rate": 2.2476375768879686e-06, + "loss": 0.3075, + "step": 2277 + }, + { + "epoch": 0.6952540820998017, + "grad_norm": 1.5830049508861397, + "learning_rate": 2.2435111784864323e-06, + "loss": 0.3474, + "step": 2278 + }, + { + "epoch": 0.6955592858232871, + "grad_norm": 1.847689670639536, + "learning_rate": 2.239387475474293e-06, + "loss": 0.2911, + "step": 2279 + }, + { + "epoch": 0.6958644895467725, + "grad_norm": 1.5764459421342933, + "learning_rate": 2.2352664718838522e-06, + "loss": 0.3912, + "step": 2280 + }, + { + "epoch": 0.6961696932702579, + "grad_norm": 1.6883475682950468, + "learning_rate": 2.231148171744768e-06, + "loss": 0.2597, + "step": 2281 + }, + { + "epoch": 0.6964748969937433, + "grad_norm": 1.5012572106181077, + "learning_rate": 2.2270325790840538e-06, + "loss": 0.3866, + "step": 2282 + }, + { + "epoch": 0.6967801007172287, + "grad_norm": 1.8193213912210353, + "learning_rate": 2.2229196979260803e-06, + "loss": 0.4478, + "step": 2283 + }, + { + "epoch": 0.6970853044407141, + "grad_norm": 1.578617887670166, + "learning_rate": 2.2188095322925623e-06, + "loss": 0.1706, + "step": 2284 + }, + { + "epoch": 0.6973905081641996, + "grad_norm": 1.355551417575489, + "learning_rate": 2.2147020862025598e-06, + "loss": 0.3604, + "step": 2285 + }, + { + "epoch": 0.697695711887685, + "grad_norm": 1.638895267359216, + "learning_rate": 2.2105973636724746e-06, + "loss": 0.291, + "step": 2286 + }, + { + "epoch": 0.6980009156111705, + "grad_norm": 1.4815833895565786, + "learning_rate": 2.2064953687160496e-06, + "loss": 0.2214, + "step": 2287 + }, + { + "epoch": 0.6983061193346559, + "grad_norm": 1.660687242747742, + "learning_rate": 2.202396105344352e-06, + "loss": 0.3362, + "step": 2288 + }, + { + "epoch": 0.6986113230581413, + "grad_norm": 1.3490028619474306, + "learning_rate": 2.1982995775657823e-06, + "loss": 0.1904, + "step": 2289 + }, + { + "epoch": 0.6989165267816267, + "grad_norm": 1.3466313542246475, + "learning_rate": 2.194205789386069e-06, + "loss": 0.278, + "step": 2290 + }, + { + "epoch": 0.6992217305051122, + "grad_norm": 1.631021932392484, + "learning_rate": 2.1901147448082568e-06, + "loss": 0.2299, + "step": 2291 + }, + { + "epoch": 0.6995269342285976, + "grad_norm": 1.347854555439276, + "learning_rate": 2.186026447832707e-06, + "loss": 0.3104, + "step": 2292 + }, + { + "epoch": 0.699832137952083, + "grad_norm": 1.9764058047217288, + "learning_rate": 2.181940902457102e-06, + "loss": 0.3163, + "step": 2293 + }, + { + "epoch": 0.7001373416755684, + "grad_norm": 1.5671620816634946, + "learning_rate": 2.1778581126764253e-06, + "loss": 0.5232, + "step": 2294 + }, + { + "epoch": 0.7004425453990538, + "grad_norm": 1.4410796170129498, + "learning_rate": 2.1737780824829673e-06, + "loss": 0.2524, + "step": 2295 + }, + { + "epoch": 0.7007477491225393, + "grad_norm": 1.6871822199250446, + "learning_rate": 2.1697008158663252e-06, + "loss": 0.3339, + "step": 2296 + }, + { + "epoch": 0.7010529528460248, + "grad_norm": 1.6588896822957648, + "learning_rate": 2.165626316813389e-06, + "loss": 0.2508, + "step": 2297 + }, + { + "epoch": 0.7013581565695102, + "grad_norm": 1.6510372542874876, + "learning_rate": 2.1615545893083413e-06, + "loss": 0.4034, + "step": 2298 + }, + { + "epoch": 0.7016633602929956, + "grad_norm": 1.581491938842722, + "learning_rate": 2.1574856373326607e-06, + "loss": 0.4049, + "step": 2299 + }, + { + "epoch": 0.701968564016481, + "grad_norm": 1.453233694297758, + "learning_rate": 2.153419464865107e-06, + "loss": 0.3468, + "step": 2300 + }, + { + "epoch": 0.7022737677399664, + "grad_norm": 1.808627365162433, + "learning_rate": 2.14935607588172e-06, + "loss": 0.303, + "step": 2301 + }, + { + "epoch": 0.7025789714634518, + "grad_norm": 1.4415485756344446, + "learning_rate": 2.1452954743558245e-06, + "loss": 0.3154, + "step": 2302 + }, + { + "epoch": 0.7028841751869372, + "grad_norm": 1.592144175830078, + "learning_rate": 2.141237664258017e-06, + "loss": 0.3909, + "step": 2303 + }, + { + "epoch": 0.7031893789104227, + "grad_norm": 1.4403194475823704, + "learning_rate": 2.1371826495561615e-06, + "loss": 0.2841, + "step": 2304 + }, + { + "epoch": 0.7034945826339082, + "grad_norm": 1.6852591435866713, + "learning_rate": 2.133130434215389e-06, + "loss": 0.512, + "step": 2305 + }, + { + "epoch": 0.7037997863573936, + "grad_norm": 1.5938343698945718, + "learning_rate": 2.1290810221980986e-06, + "loss": 0.2011, + "step": 2306 + }, + { + "epoch": 0.704104990080879, + "grad_norm": 1.5326866105983223, + "learning_rate": 2.125034417463942e-06, + "loss": 0.345, + "step": 2307 + }, + { + "epoch": 0.7044101938043644, + "grad_norm": 2.587166577662469, + "learning_rate": 2.1209906239698274e-06, + "loss": 0.3219, + "step": 2308 + }, + { + "epoch": 0.7047153975278498, + "grad_norm": 1.4756136474431192, + "learning_rate": 2.1169496456699175e-06, + "loss": 0.2406, + "step": 2309 + }, + { + "epoch": 0.7050206012513353, + "grad_norm": 1.613624517761201, + "learning_rate": 2.1129114865156187e-06, + "loss": 0.2712, + "step": 2310 + }, + { + "epoch": 0.7053258049748207, + "grad_norm": 1.5627764647993212, + "learning_rate": 2.1088761504555787e-06, + "loss": 0.2583, + "step": 2311 + }, + { + "epoch": 0.7056310086983061, + "grad_norm": 1.4190231900460788, + "learning_rate": 2.104843641435692e-06, + "loss": 0.3631, + "step": 2312 + }, + { + "epoch": 0.7059362124217915, + "grad_norm": 1.120858835736163, + "learning_rate": 2.100813963399083e-06, + "loss": 0.2264, + "step": 2313 + }, + { + "epoch": 0.706241416145277, + "grad_norm": 5.592583902383602, + "learning_rate": 2.096787120286107e-06, + "loss": 0.3734, + "step": 2314 + }, + { + "epoch": 0.7065466198687624, + "grad_norm": 1.4254198426262648, + "learning_rate": 2.092763116034352e-06, + "loss": 0.2948, + "step": 2315 + }, + { + "epoch": 0.7068518235922479, + "grad_norm": 1.4066840455637433, + "learning_rate": 2.0887419545786276e-06, + "loss": 0.3381, + "step": 2316 + }, + { + "epoch": 0.7071570273157333, + "grad_norm": 1.58796329496228, + "learning_rate": 2.0847236398509602e-06, + "loss": 0.3611, + "step": 2317 + }, + { + "epoch": 0.7074622310392187, + "grad_norm": 1.6806456206316778, + "learning_rate": 2.080708175780598e-06, + "loss": 0.2421, + "step": 2318 + }, + { + "epoch": 0.7077674347627041, + "grad_norm": 1.2883188125380933, + "learning_rate": 2.0766955662940023e-06, + "loss": 0.2541, + "step": 2319 + }, + { + "epoch": 0.7080726384861895, + "grad_norm": 1.7411635601553608, + "learning_rate": 2.072685815314838e-06, + "loss": 0.3755, + "step": 2320 + }, + { + "epoch": 0.7083778422096749, + "grad_norm": 1.3239666634134253, + "learning_rate": 2.0686789267639744e-06, + "loss": 0.3451, + "step": 2321 + }, + { + "epoch": 0.7086830459331603, + "grad_norm": 1.983676997898493, + "learning_rate": 2.0646749045594906e-06, + "loss": 0.42, + "step": 2322 + }, + { + "epoch": 0.7089882496566459, + "grad_norm": 1.0878535320868636, + "learning_rate": 2.060673752616653e-06, + "loss": 0.2291, + "step": 2323 + }, + { + "epoch": 0.7092934533801313, + "grad_norm": 1.2600542551045928, + "learning_rate": 2.0566754748479244e-06, + "loss": 0.2867, + "step": 2324 + }, + { + "epoch": 0.7095986571036167, + "grad_norm": 1.570453238941518, + "learning_rate": 2.0526800751629616e-06, + "loss": 0.2137, + "step": 2325 + }, + { + "epoch": 0.7099038608271021, + "grad_norm": 1.3212249131288691, + "learning_rate": 2.048687557468602e-06, + "loss": 0.3396, + "step": 2326 + }, + { + "epoch": 0.7102090645505875, + "grad_norm": 1.6117839376605163, + "learning_rate": 2.0446979256688647e-06, + "loss": 0.4791, + "step": 2327 + }, + { + "epoch": 0.7105142682740729, + "grad_norm": 1.2685593861176456, + "learning_rate": 2.0407111836649524e-06, + "loss": 0.3699, + "step": 2328 + }, + { + "epoch": 0.7108194719975583, + "grad_norm": 1.2768400639904869, + "learning_rate": 2.036727335355235e-06, + "loss": 0.2184, + "step": 2329 + }, + { + "epoch": 0.7111246757210438, + "grad_norm": 1.2921283676105053, + "learning_rate": 2.0327463846352562e-06, + "loss": 0.2455, + "step": 2330 + }, + { + "epoch": 0.7114298794445292, + "grad_norm": 1.6655785187324519, + "learning_rate": 2.0287683353977294e-06, + "loss": 0.3405, + "step": 2331 + }, + { + "epoch": 0.7117350831680147, + "grad_norm": 1.9153355938351166, + "learning_rate": 2.0247931915325254e-06, + "loss": 0.3889, + "step": 2332 + }, + { + "epoch": 0.7120402868915001, + "grad_norm": 1.4741604759613922, + "learning_rate": 2.0208209569266744e-06, + "loss": 0.2703, + "step": 2333 + }, + { + "epoch": 0.7123454906149855, + "grad_norm": 1.9778531651588995, + "learning_rate": 2.016851635464368e-06, + "loss": 0.2542, + "step": 2334 + }, + { + "epoch": 0.712650694338471, + "grad_norm": 1.590275061585144, + "learning_rate": 2.0128852310269413e-06, + "loss": 0.403, + "step": 2335 + }, + { + "epoch": 0.7129558980619564, + "grad_norm": 1.70148536079918, + "learning_rate": 2.008921747492884e-06, + "loss": 0.2813, + "step": 2336 + }, + { + "epoch": 0.7132611017854418, + "grad_norm": 1.5707428755209287, + "learning_rate": 2.004961188737823e-06, + "loss": 0.3936, + "step": 2337 + }, + { + "epoch": 0.7135663055089272, + "grad_norm": 1.3604719624909132, + "learning_rate": 2.0010035586345323e-06, + "loss": 0.2977, + "step": 2338 + }, + { + "epoch": 0.7138715092324126, + "grad_norm": 1.311140764412078, + "learning_rate": 1.997048861052916e-06, + "loss": 0.2976, + "step": 2339 + }, + { + "epoch": 0.714176712955898, + "grad_norm": 1.5495168302002966, + "learning_rate": 1.9930970998600122e-06, + "loss": 0.3235, + "step": 2340 + }, + { + "epoch": 0.7144819166793834, + "grad_norm": 1.55097318369583, + "learning_rate": 1.989148278919992e-06, + "loss": 0.2715, + "step": 2341 + }, + { + "epoch": 0.714787120402869, + "grad_norm": 1.5986632365767137, + "learning_rate": 1.985202402094146e-06, + "loss": 0.3065, + "step": 2342 + }, + { + "epoch": 0.7150923241263544, + "grad_norm": 1.5353380246081771, + "learning_rate": 1.9812594732408867e-06, + "loss": 0.278, + "step": 2343 + }, + { + "epoch": 0.7153975278498398, + "grad_norm": 1.5029875884822175, + "learning_rate": 1.9773194962157483e-06, + "loss": 0.3471, + "step": 2344 + }, + { + "epoch": 0.7157027315733252, + "grad_norm": 1.5119065752079335, + "learning_rate": 1.9733824748713747e-06, + "loss": 0.1674, + "step": 2345 + }, + { + "epoch": 0.7160079352968106, + "grad_norm": 1.4828013058914185, + "learning_rate": 1.9694484130575177e-06, + "loss": 0.3072, + "step": 2346 + }, + { + "epoch": 0.716313139020296, + "grad_norm": 1.3822579861494915, + "learning_rate": 1.965517314621042e-06, + "loss": 0.4649, + "step": 2347 + }, + { + "epoch": 0.7166183427437814, + "grad_norm": 1.458170018815355, + "learning_rate": 1.961589183405909e-06, + "loss": 0.3916, + "step": 2348 + }, + { + "epoch": 0.7169235464672669, + "grad_norm": 1.3859819200831016, + "learning_rate": 1.9576640232531785e-06, + "loss": 0.1681, + "step": 2349 + }, + { + "epoch": 0.7172287501907523, + "grad_norm": 1.515528410360513, + "learning_rate": 1.9537418380010096e-06, + "loss": 0.27, + "step": 2350 + }, + { + "epoch": 0.7175339539142378, + "grad_norm": 1.4128114424484064, + "learning_rate": 1.9498226314846473e-06, + "loss": 0.3298, + "step": 2351 + }, + { + "epoch": 0.7178391576377232, + "grad_norm": 1.2879356313934298, + "learning_rate": 1.9459064075364297e-06, + "loss": 0.2601, + "step": 2352 + }, + { + "epoch": 0.7181443613612086, + "grad_norm": 1.2039121554636452, + "learning_rate": 1.94199316998577e-06, + "loss": 0.2421, + "step": 2353 + }, + { + "epoch": 0.718449565084694, + "grad_norm": 1.4974907709515553, + "learning_rate": 1.938082922659172e-06, + "loss": 0.2043, + "step": 2354 + }, + { + "epoch": 0.7187547688081795, + "grad_norm": 1.6285164272848978, + "learning_rate": 1.934175669380206e-06, + "loss": 0.4642, + "step": 2355 + }, + { + "epoch": 0.7190599725316649, + "grad_norm": 1.7399320500358815, + "learning_rate": 1.9302714139695176e-06, + "loss": 0.4048, + "step": 2356 + }, + { + "epoch": 0.7193651762551503, + "grad_norm": 1.4081572420684747, + "learning_rate": 1.926370160244825e-06, + "loss": 0.2177, + "step": 2357 + }, + { + "epoch": 0.7196703799786357, + "grad_norm": 1.4367667276773715, + "learning_rate": 1.922471912020907e-06, + "loss": 0.3391, + "step": 2358 + }, + { + "epoch": 0.7199755837021211, + "grad_norm": 1.5369055490455528, + "learning_rate": 1.9185766731096015e-06, + "loss": 0.4479, + "step": 2359 + }, + { + "epoch": 0.7202807874256066, + "grad_norm": 1.5895659932815216, + "learning_rate": 1.9146844473198118e-06, + "loss": 0.396, + "step": 2360 + }, + { + "epoch": 0.720585991149092, + "grad_norm": 1.5298209236412752, + "learning_rate": 1.910795238457487e-06, + "loss": 0.4029, + "step": 2361 + }, + { + "epoch": 0.7208911948725775, + "grad_norm": 1.4132651931753095, + "learning_rate": 1.9069090503256277e-06, + "loss": 0.5109, + "step": 2362 + }, + { + "epoch": 0.7211963985960629, + "grad_norm": 1.0504476592048764, + "learning_rate": 1.903025886724285e-06, + "loss": 0.2154, + "step": 2363 + }, + { + "epoch": 0.7215016023195483, + "grad_norm": 1.5375095025986196, + "learning_rate": 1.8991457514505491e-06, + "loss": 0.3494, + "step": 2364 + }, + { + "epoch": 0.7218068060430337, + "grad_norm": 1.4249222349397213, + "learning_rate": 1.8952686482985466e-06, + "loss": 0.298, + "step": 2365 + }, + { + "epoch": 0.7221120097665191, + "grad_norm": 1.3483580600151477, + "learning_rate": 1.8913945810594458e-06, + "loss": 0.3369, + "step": 2366 + }, + { + "epoch": 0.7224172134900045, + "grad_norm": 1.6537907894810377, + "learning_rate": 1.8875235535214415e-06, + "loss": 0.4102, + "step": 2367 + }, + { + "epoch": 0.72272241721349, + "grad_norm": 1.7329477847290056, + "learning_rate": 1.8836555694697528e-06, + "loss": 0.3852, + "step": 2368 + }, + { + "epoch": 0.7230276209369755, + "grad_norm": 0.9438474667978664, + "learning_rate": 1.8797906326866355e-06, + "loss": 0.2417, + "step": 2369 + }, + { + "epoch": 0.7233328246604609, + "grad_norm": 1.3714424841819393, + "learning_rate": 1.875928746951353e-06, + "loss": 0.3425, + "step": 2370 + }, + { + "epoch": 0.7236380283839463, + "grad_norm": 1.5856665769786433, + "learning_rate": 1.8720699160401883e-06, + "loss": 0.417, + "step": 2371 + }, + { + "epoch": 0.7239432321074317, + "grad_norm": 1.617833989608368, + "learning_rate": 1.8682141437264423e-06, + "loss": 0.2736, + "step": 2372 + }, + { + "epoch": 0.7242484358309171, + "grad_norm": 1.3707649727949278, + "learning_rate": 1.8643614337804194e-06, + "loss": 0.2649, + "step": 2373 + }, + { + "epoch": 0.7245536395544026, + "grad_norm": 1.7067003431325545, + "learning_rate": 1.8605117899694325e-06, + "loss": 0.357, + "step": 2374 + }, + { + "epoch": 0.724858843277888, + "grad_norm": 1.4575250111130236, + "learning_rate": 1.856665216057793e-06, + "loss": 0.2518, + "step": 2375 + }, + { + "epoch": 0.7251640470013734, + "grad_norm": 1.8392255842033116, + "learning_rate": 1.8528217158068168e-06, + "loss": 0.3795, + "step": 2376 + }, + { + "epoch": 0.7254692507248588, + "grad_norm": 1.2997742383501494, + "learning_rate": 1.848981292974808e-06, + "loss": 0.2735, + "step": 2377 + }, + { + "epoch": 0.7257744544483443, + "grad_norm": 1.513637379899284, + "learning_rate": 1.8451439513170633e-06, + "loss": 0.408, + "step": 2378 + }, + { + "epoch": 0.7260796581718297, + "grad_norm": 1.5385949058679458, + "learning_rate": 1.8413096945858695e-06, + "loss": 0.3891, + "step": 2379 + }, + { + "epoch": 0.7263848618953151, + "grad_norm": 2.232541699335092, + "learning_rate": 1.837478526530493e-06, + "loss": 0.4276, + "step": 2380 + }, + { + "epoch": 0.7266900656188006, + "grad_norm": 1.6072156919899125, + "learning_rate": 1.8336504508971798e-06, + "loss": 0.4644, + "step": 2381 + }, + { + "epoch": 0.726995269342286, + "grad_norm": 1.7098489721428152, + "learning_rate": 1.8298254714291575e-06, + "loss": 0.4162, + "step": 2382 + }, + { + "epoch": 0.7273004730657714, + "grad_norm": 1.5804989022260485, + "learning_rate": 1.8260035918666203e-06, + "loss": 0.4063, + "step": 2383 + }, + { + "epoch": 0.7276056767892568, + "grad_norm": 1.2030976032258889, + "learning_rate": 1.8221848159467293e-06, + "loss": 0.3434, + "step": 2384 + }, + { + "epoch": 0.7279108805127422, + "grad_norm": 1.7012053704276349, + "learning_rate": 1.8183691474036224e-06, + "loss": 0.3693, + "step": 2385 + }, + { + "epoch": 0.7282160842362276, + "grad_norm": 1.488585433512296, + "learning_rate": 1.8145565899683875e-06, + "loss": 0.3843, + "step": 2386 + }, + { + "epoch": 0.7285212879597132, + "grad_norm": 1.5594380704245687, + "learning_rate": 1.8107471473690725e-06, + "loss": 0.2597, + "step": 2387 + }, + { + "epoch": 0.7288264916831986, + "grad_norm": 1.4362419262319646, + "learning_rate": 1.8069408233306846e-06, + "loss": 0.2128, + "step": 2388 + }, + { + "epoch": 0.729131695406684, + "grad_norm": 1.453994982767172, + "learning_rate": 1.8031376215751762e-06, + "loss": 0.4371, + "step": 2389 + }, + { + "epoch": 0.7294368991301694, + "grad_norm": 1.545719061618233, + "learning_rate": 1.7993375458214468e-06, + "loss": 0.2536, + "step": 2390 + }, + { + "epoch": 0.7297421028536548, + "grad_norm": 1.398782740764591, + "learning_rate": 1.7955405997853448e-06, + "loss": 0.2635, + "step": 2391 + }, + { + "epoch": 0.7300473065771402, + "grad_norm": 1.281366482533797, + "learning_rate": 1.7917467871796518e-06, + "loss": 0.25, + "step": 2392 + }, + { + "epoch": 0.7303525103006256, + "grad_norm": 1.7206023205122598, + "learning_rate": 1.7879561117140875e-06, + "loss": 0.1821, + "step": 2393 + }, + { + "epoch": 0.7306577140241111, + "grad_norm": 1.2659729373353488, + "learning_rate": 1.7841685770953071e-06, + "loss": 0.2269, + "step": 2394 + }, + { + "epoch": 0.7309629177475965, + "grad_norm": 1.7040532233697345, + "learning_rate": 1.7803841870268902e-06, + "loss": 0.5313, + "step": 2395 + }, + { + "epoch": 0.731268121471082, + "grad_norm": 1.7637367616801418, + "learning_rate": 1.776602945209342e-06, + "loss": 0.407, + "step": 2396 + }, + { + "epoch": 0.7315733251945674, + "grad_norm": 1.6319978132607265, + "learning_rate": 1.772824855340094e-06, + "loss": 0.4076, + "step": 2397 + }, + { + "epoch": 0.7318785289180528, + "grad_norm": 1.2482438148484345, + "learning_rate": 1.7690499211134898e-06, + "loss": 0.2887, + "step": 2398 + }, + { + "epoch": 0.7321837326415382, + "grad_norm": 1.9050410455001443, + "learning_rate": 1.7652781462207908e-06, + "loss": 0.3476, + "step": 2399 + }, + { + "epoch": 0.7324889363650237, + "grad_norm": 1.5632417062831914, + "learning_rate": 1.7615095343501637e-06, + "loss": 0.3904, + "step": 2400 + }, + { + "epoch": 0.7327941400885091, + "grad_norm": 1.2516479120148944, + "learning_rate": 1.7577440891866938e-06, + "loss": 0.3661, + "step": 2401 + }, + { + "epoch": 0.7330993438119945, + "grad_norm": 1.4281327595906879, + "learning_rate": 1.7539818144123589e-06, + "loss": 0.2554, + "step": 2402 + }, + { + "epoch": 0.7334045475354799, + "grad_norm": 1.9938065476006872, + "learning_rate": 1.7502227137060385e-06, + "loss": 0.211, + "step": 2403 + }, + { + "epoch": 0.7337097512589653, + "grad_norm": 1.5656770069370214, + "learning_rate": 1.7464667907435145e-06, + "loss": 0.3249, + "step": 2404 + }, + { + "epoch": 0.7340149549824508, + "grad_norm": 1.4857685975873842, + "learning_rate": 1.7427140491974553e-06, + "loss": 0.3922, + "step": 2405 + }, + { + "epoch": 0.7343201587059363, + "grad_norm": 1.5962279261031407, + "learning_rate": 1.7389644927374188e-06, + "loss": 0.4693, + "step": 2406 + }, + { + "epoch": 0.7346253624294217, + "grad_norm": 1.3060063075487254, + "learning_rate": 1.7352181250298527e-06, + "loss": 0.3797, + "step": 2407 + }, + { + "epoch": 0.7349305661529071, + "grad_norm": 1.6413985109214866, + "learning_rate": 1.7314749497380835e-06, + "loss": 0.4944, + "step": 2408 + }, + { + "epoch": 0.7352357698763925, + "grad_norm": 1.5695092723289383, + "learning_rate": 1.7277349705223133e-06, + "loss": 0.3746, + "step": 2409 + }, + { + "epoch": 0.7355409735998779, + "grad_norm": 1.694154475715984, + "learning_rate": 1.7239981910396274e-06, + "loss": 0.4035, + "step": 2410 + }, + { + "epoch": 0.7358461773233633, + "grad_norm": 1.4321803471625723, + "learning_rate": 1.720264614943974e-06, + "loss": 0.3191, + "step": 2411 + }, + { + "epoch": 0.7361513810468487, + "grad_norm": 1.3771418016095573, + "learning_rate": 1.7165342458861706e-06, + "loss": 0.2529, + "step": 2412 + }, + { + "epoch": 0.7364565847703342, + "grad_norm": 1.6635343761280355, + "learning_rate": 1.712807087513903e-06, + "loss": 0.3062, + "step": 2413 + }, + { + "epoch": 0.7367617884938196, + "grad_norm": 0.9998690400201703, + "learning_rate": 1.7090831434717142e-06, + "loss": 0.1487, + "step": 2414 + }, + { + "epoch": 0.7370669922173051, + "grad_norm": 1.5637897508967649, + "learning_rate": 1.7053624174010019e-06, + "loss": 0.434, + "step": 2415 + }, + { + "epoch": 0.7373721959407905, + "grad_norm": 1.4102781459716118, + "learning_rate": 1.7016449129400232e-06, + "loss": 0.2888, + "step": 2416 + }, + { + "epoch": 0.7376773996642759, + "grad_norm": 1.1743694967533793, + "learning_rate": 1.6979306337238787e-06, + "loss": 0.2099, + "step": 2417 + }, + { + "epoch": 0.7379826033877613, + "grad_norm": 1.7745821447548338, + "learning_rate": 1.694219583384521e-06, + "loss": 0.3803, + "step": 2418 + }, + { + "epoch": 0.7382878071112468, + "grad_norm": 1.6354627501520815, + "learning_rate": 1.690511765550739e-06, + "loss": 0.4103, + "step": 2419 + }, + { + "epoch": 0.7385930108347322, + "grad_norm": 1.5297920916047307, + "learning_rate": 1.6868071838481686e-06, + "loss": 0.3585, + "step": 2420 + }, + { + "epoch": 0.7388982145582176, + "grad_norm": 1.6670000537581684, + "learning_rate": 1.6831058418992747e-06, + "loss": 0.3463, + "step": 2421 + }, + { + "epoch": 0.739203418281703, + "grad_norm": 1.377165649462315, + "learning_rate": 1.6794077433233542e-06, + "loss": 0.2459, + "step": 2422 + }, + { + "epoch": 0.7395086220051884, + "grad_norm": 1.254668387413534, + "learning_rate": 1.675712891736539e-06, + "loss": 0.3237, + "step": 2423 + }, + { + "epoch": 0.7398138257286739, + "grad_norm": 1.6620188597654555, + "learning_rate": 1.6720212907517803e-06, + "loss": 0.4299, + "step": 2424 + }, + { + "epoch": 0.7401190294521593, + "grad_norm": 1.6523645277948942, + "learning_rate": 1.6683329439788498e-06, + "loss": 0.3996, + "step": 2425 + }, + { + "epoch": 0.7404242331756448, + "grad_norm": 1.6187362257912699, + "learning_rate": 1.6646478550243433e-06, + "loss": 0.3153, + "step": 2426 + }, + { + "epoch": 0.7407294368991302, + "grad_norm": 1.243329081404492, + "learning_rate": 1.6609660274916662e-06, + "loss": 0.203, + "step": 2427 + }, + { + "epoch": 0.7410346406226156, + "grad_norm": 1.4617906822883615, + "learning_rate": 1.6572874649810334e-06, + "loss": 0.3921, + "step": 2428 + }, + { + "epoch": 0.741339844346101, + "grad_norm": 1.4772782785804854, + "learning_rate": 1.653612171089473e-06, + "loss": 0.2399, + "step": 2429 + }, + { + "epoch": 0.7416450480695864, + "grad_norm": 1.516653989731904, + "learning_rate": 1.6499401494108124e-06, + "loss": 0.325, + "step": 2430 + }, + { + "epoch": 0.7419502517930718, + "grad_norm": 1.726246317224904, + "learning_rate": 1.646271403535678e-06, + "loss": 0.4074, + "step": 2431 + }, + { + "epoch": 0.7422554555165572, + "grad_norm": 1.6689973112145233, + "learning_rate": 1.6426059370515002e-06, + "loss": 0.2204, + "step": 2432 + }, + { + "epoch": 0.7425606592400428, + "grad_norm": 1.13846425278691, + "learning_rate": 1.638943753542493e-06, + "loss": 0.1386, + "step": 2433 + }, + { + "epoch": 0.7428658629635282, + "grad_norm": 1.4215926843752273, + "learning_rate": 1.6352848565896696e-06, + "loss": 0.2048, + "step": 2434 + }, + { + "epoch": 0.7431710666870136, + "grad_norm": 1.5202445192418594, + "learning_rate": 1.6316292497708208e-06, + "loss": 0.421, + "step": 2435 + }, + { + "epoch": 0.743476270410499, + "grad_norm": 1.6762691756708405, + "learning_rate": 1.6279769366605287e-06, + "loss": 0.4006, + "step": 2436 + }, + { + "epoch": 0.7437814741339844, + "grad_norm": 1.7012411775717595, + "learning_rate": 1.6243279208301483e-06, + "loss": 0.2386, + "step": 2437 + }, + { + "epoch": 0.7440866778574698, + "grad_norm": 1.4287681750645878, + "learning_rate": 1.620682205847811e-06, + "loss": 0.2714, + "step": 2438 + }, + { + "epoch": 0.7443918815809553, + "grad_norm": 1.6610026277708503, + "learning_rate": 1.617039795278425e-06, + "loss": 0.3636, + "step": 2439 + }, + { + "epoch": 0.7446970853044407, + "grad_norm": 2.0639538120528167, + "learning_rate": 1.6134006926836636e-06, + "loss": 0.247, + "step": 2440 + }, + { + "epoch": 0.7450022890279261, + "grad_norm": 1.5850709699512209, + "learning_rate": 1.609764901621964e-06, + "loss": 0.4474, + "step": 2441 + }, + { + "epoch": 0.7453074927514116, + "grad_norm": 1.6495669143346545, + "learning_rate": 1.6061324256485305e-06, + "loss": 0.4433, + "step": 2442 + }, + { + "epoch": 0.745612696474897, + "grad_norm": 1.7739809522756622, + "learning_rate": 1.6025032683153219e-06, + "loss": 0.3559, + "step": 2443 + }, + { + "epoch": 0.7459179001983824, + "grad_norm": 1.2253523767428416, + "learning_rate": 1.5988774331710511e-06, + "loss": 0.2543, + "step": 2444 + }, + { + "epoch": 0.7462231039218679, + "grad_norm": 1.267811099891557, + "learning_rate": 1.5952549237611864e-06, + "loss": 0.211, + "step": 2445 + }, + { + "epoch": 0.7465283076453533, + "grad_norm": 1.8148702710661058, + "learning_rate": 1.5916357436279416e-06, + "loss": 0.4958, + "step": 2446 + }, + { + "epoch": 0.7468335113688387, + "grad_norm": 1.4019785966150915, + "learning_rate": 1.5880198963102727e-06, + "loss": 0.3548, + "step": 2447 + }, + { + "epoch": 0.7471387150923241, + "grad_norm": 1.514170863443681, + "learning_rate": 1.5844073853438835e-06, + "loss": 0.3233, + "step": 2448 + }, + { + "epoch": 0.7474439188158095, + "grad_norm": 2.458336027327101, + "learning_rate": 1.5807982142612072e-06, + "loss": 0.3463, + "step": 2449 + }, + { + "epoch": 0.7477491225392949, + "grad_norm": 1.8685681972292403, + "learning_rate": 1.5771923865914186e-06, + "loss": 0.4082, + "step": 2450 + }, + { + "epoch": 0.7480543262627805, + "grad_norm": 1.4607828639804248, + "learning_rate": 1.5735899058604165e-06, + "loss": 0.2455, + "step": 2451 + }, + { + "epoch": 0.7483595299862659, + "grad_norm": 1.5437532772384637, + "learning_rate": 1.569990775590834e-06, + "loss": 0.2209, + "step": 2452 + }, + { + "epoch": 0.7486647337097513, + "grad_norm": 1.5998717524867132, + "learning_rate": 1.5663949993020216e-06, + "loss": 0.3259, + "step": 2453 + }, + { + "epoch": 0.7489699374332367, + "grad_norm": 1.5833261723942027, + "learning_rate": 1.5628025805100527e-06, + "loss": 0.3406, + "step": 2454 + }, + { + "epoch": 0.7492751411567221, + "grad_norm": 1.5819079692514202, + "learning_rate": 1.5592135227277194e-06, + "loss": 0.4312, + "step": 2455 + }, + { + "epoch": 0.7495803448802075, + "grad_norm": 2.0089884479621603, + "learning_rate": 1.5556278294645243e-06, + "loss": 0.3901, + "step": 2456 + }, + { + "epoch": 0.7498855486036929, + "grad_norm": 1.381615503205179, + "learning_rate": 1.5520455042266809e-06, + "loss": 0.2985, + "step": 2457 + }, + { + "epoch": 0.7501907523271784, + "grad_norm": 1.3001022628763312, + "learning_rate": 1.548466550517112e-06, + "loss": 0.2003, + "step": 2458 + }, + { + "epoch": 0.7504959560506638, + "grad_norm": 1.4329564811556512, + "learning_rate": 1.5448909718354398e-06, + "loss": 0.2839, + "step": 2459 + }, + { + "epoch": 0.7508011597741493, + "grad_norm": 1.5403711013032622, + "learning_rate": 1.5413187716779865e-06, + "loss": 0.3578, + "step": 2460 + }, + { + "epoch": 0.7511063634976347, + "grad_norm": 1.4433461602913331, + "learning_rate": 1.5377499535377755e-06, + "loss": 0.3038, + "step": 2461 + }, + { + "epoch": 0.7514115672211201, + "grad_norm": 1.3903299940281486, + "learning_rate": 1.5341845209045175e-06, + "loss": 0.3495, + "step": 2462 + }, + { + "epoch": 0.7517167709446055, + "grad_norm": 1.777219998589263, + "learning_rate": 1.5306224772646138e-06, + "loss": 0.3205, + "step": 2463 + }, + { + "epoch": 0.752021974668091, + "grad_norm": 1.3511000266469209, + "learning_rate": 1.527063826101156e-06, + "loss": 0.2054, + "step": 2464 + }, + { + "epoch": 0.7523271783915764, + "grad_norm": 1.4184063533061904, + "learning_rate": 1.523508570893914e-06, + "loss": 0.2443, + "step": 2465 + }, + { + "epoch": 0.7526323821150618, + "grad_norm": 1.4611404196422997, + "learning_rate": 1.5199567151193362e-06, + "loss": 0.2695, + "step": 2466 + }, + { + "epoch": 0.7529375858385472, + "grad_norm": 1.3855112755769896, + "learning_rate": 1.516408262250551e-06, + "loss": 0.2605, + "step": 2467 + }, + { + "epoch": 0.7532427895620326, + "grad_norm": 1.3116126921299953, + "learning_rate": 1.5128632157573597e-06, + "loss": 0.2025, + "step": 2468 + }, + { + "epoch": 0.7535479932855181, + "grad_norm": 1.4796773360325253, + "learning_rate": 1.5093215791062282e-06, + "loss": 0.378, + "step": 2469 + }, + { + "epoch": 0.7538531970090036, + "grad_norm": 1.432234949087124, + "learning_rate": 1.5057833557602886e-06, + "loss": 0.3638, + "step": 2470 + }, + { + "epoch": 0.754158400732489, + "grad_norm": 1.4684559142132647, + "learning_rate": 1.5022485491793404e-06, + "loss": 0.3034, + "step": 2471 + }, + { + "epoch": 0.7544636044559744, + "grad_norm": 1.712348391093848, + "learning_rate": 1.4987171628198376e-06, + "loss": 0.6474, + "step": 2472 + }, + { + "epoch": 0.7547688081794598, + "grad_norm": 1.2683167267429447, + "learning_rate": 1.49518920013489e-06, + "loss": 0.2659, + "step": 2473 + }, + { + "epoch": 0.7550740119029452, + "grad_norm": 1.4067118416141682, + "learning_rate": 1.4916646645742623e-06, + "loss": 0.3122, + "step": 2474 + }, + { + "epoch": 0.7553792156264306, + "grad_norm": 1.3294614462999204, + "learning_rate": 1.4881435595843658e-06, + "loss": 0.2014, + "step": 2475 + }, + { + "epoch": 0.755684419349916, + "grad_norm": 1.5514381996604707, + "learning_rate": 1.4846258886082559e-06, + "loss": 0.4586, + "step": 2476 + }, + { + "epoch": 0.7559896230734015, + "grad_norm": 1.2414279270860267, + "learning_rate": 1.4811116550856358e-06, + "loss": 0.1905, + "step": 2477 + }, + { + "epoch": 0.756294826796887, + "grad_norm": 1.5658974903410336, + "learning_rate": 1.4776008624528416e-06, + "loss": 0.1939, + "step": 2478 + }, + { + "epoch": 0.7566000305203724, + "grad_norm": 1.7196002591058608, + "learning_rate": 1.4740935141428447e-06, + "loss": 0.3984, + "step": 2479 + }, + { + "epoch": 0.7569052342438578, + "grad_norm": 1.7639793896757774, + "learning_rate": 1.470589613585255e-06, + "loss": 0.2218, + "step": 2480 + }, + { + "epoch": 0.7572104379673432, + "grad_norm": 1.3225571551889375, + "learning_rate": 1.4670891642063046e-06, + "loss": 0.177, + "step": 2481 + }, + { + "epoch": 0.7575156416908286, + "grad_norm": 1.6469842371461352, + "learning_rate": 1.4635921694288512e-06, + "loss": 0.3551, + "step": 2482 + }, + { + "epoch": 0.757820845414314, + "grad_norm": 1.6689084552871396, + "learning_rate": 1.4600986326723783e-06, + "loss": 0.3513, + "step": 2483 + }, + { + "epoch": 0.7581260491377995, + "grad_norm": 1.6027332771701015, + "learning_rate": 1.4566085573529876e-06, + "loss": 0.4021, + "step": 2484 + }, + { + "epoch": 0.7584312528612849, + "grad_norm": 1.583972182973655, + "learning_rate": 1.453121946883393e-06, + "loss": 0.297, + "step": 2485 + }, + { + "epoch": 0.7587364565847703, + "grad_norm": 1.758010242783844, + "learning_rate": 1.4496388046729205e-06, + "loss": 0.4719, + "step": 2486 + }, + { + "epoch": 0.7590416603082557, + "grad_norm": 1.4587979076269033, + "learning_rate": 1.446159134127509e-06, + "loss": 0.2238, + "step": 2487 + }, + { + "epoch": 0.7593468640317412, + "grad_norm": 1.516910361878622, + "learning_rate": 1.4426829386496983e-06, + "loss": 0.2117, + "step": 2488 + }, + { + "epoch": 0.7596520677552266, + "grad_norm": 1.027697194150736, + "learning_rate": 1.43921022163863e-06, + "loss": 0.1316, + "step": 2489 + }, + { + "epoch": 0.7599572714787121, + "grad_norm": 1.7174808324552195, + "learning_rate": 1.4357409864900495e-06, + "loss": 0.309, + "step": 2490 + }, + { + "epoch": 0.7602624752021975, + "grad_norm": 1.54123979105666, + "learning_rate": 1.4322752365962921e-06, + "loss": 0.3638, + "step": 2491 + }, + { + "epoch": 0.7605676789256829, + "grad_norm": 1.3864146154300274, + "learning_rate": 1.4288129753462858e-06, + "loss": 0.2661, + "step": 2492 + }, + { + "epoch": 0.7608728826491683, + "grad_norm": 1.7307300241219294, + "learning_rate": 1.4253542061255515e-06, + "loss": 0.3089, + "step": 2493 + }, + { + "epoch": 0.7611780863726537, + "grad_norm": 1.332318406461796, + "learning_rate": 1.4218989323161909e-06, + "loss": 0.3856, + "step": 2494 + }, + { + "epoch": 0.7614832900961391, + "grad_norm": 1.4346339638172307, + "learning_rate": 1.4184471572968873e-06, + "loss": 0.3232, + "step": 2495 + }, + { + "epoch": 0.7617884938196245, + "grad_norm": 1.3945504408939717, + "learning_rate": 1.4149988844429086e-06, + "loss": 0.2484, + "step": 2496 + }, + { + "epoch": 0.7620936975431101, + "grad_norm": 1.3073203652361365, + "learning_rate": 1.4115541171260928e-06, + "loss": 0.1359, + "step": 2497 + }, + { + "epoch": 0.7623989012665955, + "grad_norm": 1.2954470574924586, + "learning_rate": 1.4081128587148496e-06, + "loss": 0.2196, + "step": 2498 + }, + { + "epoch": 0.7627041049900809, + "grad_norm": 1.7836561181237647, + "learning_rate": 1.4046751125741615e-06, + "loss": 0.2989, + "step": 2499 + }, + { + "epoch": 0.7630093087135663, + "grad_norm": 1.6338560429242976, + "learning_rate": 1.4012408820655765e-06, + "loss": 0.2952, + "step": 2500 + }, + { + "epoch": 0.7633145124370517, + "grad_norm": 1.8965014723096463, + "learning_rate": 1.3978101705471992e-06, + "loss": 0.3649, + "step": 2501 + }, + { + "epoch": 0.7636197161605371, + "grad_norm": 1.403675435260702, + "learning_rate": 1.394382981373701e-06, + "loss": 0.2878, + "step": 2502 + }, + { + "epoch": 0.7639249198840226, + "grad_norm": 1.9008802859293734, + "learning_rate": 1.3909593178963032e-06, + "loss": 0.559, + "step": 2503 + }, + { + "epoch": 0.764230123607508, + "grad_norm": 1.6882297915798834, + "learning_rate": 1.3875391834627815e-06, + "loss": 0.2705, + "step": 2504 + }, + { + "epoch": 0.7645353273309934, + "grad_norm": 1.3132465146910093, + "learning_rate": 1.3841225814174592e-06, + "loss": 0.2154, + "step": 2505 + }, + { + "epoch": 0.7648405310544789, + "grad_norm": 1.503727255734048, + "learning_rate": 1.3807095151012101e-06, + "loss": 0.3739, + "step": 2506 + }, + { + "epoch": 0.7651457347779643, + "grad_norm": 1.8532239967908612, + "learning_rate": 1.3772999878514464e-06, + "loss": 0.3879, + "step": 2507 + }, + { + "epoch": 0.7654509385014497, + "grad_norm": 1.5263100172234858, + "learning_rate": 1.3738940030021191e-06, + "loss": 0.2597, + "step": 2508 + }, + { + "epoch": 0.7657561422249352, + "grad_norm": 1.5510558070282685, + "learning_rate": 1.37049156388372e-06, + "loss": 0.1667, + "step": 2509 + }, + { + "epoch": 0.7660613459484206, + "grad_norm": 1.7420929339963271, + "learning_rate": 1.3670926738232699e-06, + "loss": 0.5169, + "step": 2510 + }, + { + "epoch": 0.766366549671906, + "grad_norm": 1.5027245573035968, + "learning_rate": 1.3636973361443184e-06, + "loss": 0.4286, + "step": 2511 + }, + { + "epoch": 0.7666717533953914, + "grad_norm": 1.3230944336327843, + "learning_rate": 1.3603055541669457e-06, + "loss": 0.2573, + "step": 2512 + }, + { + "epoch": 0.7669769571188768, + "grad_norm": 2.2833142282099708, + "learning_rate": 1.3569173312077522e-06, + "loss": 0.3053, + "step": 2513 + }, + { + "epoch": 0.7672821608423622, + "grad_norm": 1.4527561170731167, + "learning_rate": 1.3535326705798563e-06, + "loss": 0.2039, + "step": 2514 + }, + { + "epoch": 0.7675873645658478, + "grad_norm": 1.1661107010154332, + "learning_rate": 1.3501515755928968e-06, + "loss": 0.1814, + "step": 2515 + }, + { + "epoch": 0.7678925682893332, + "grad_norm": 1.745056731933207, + "learning_rate": 1.346774049553027e-06, + "loss": 0.2475, + "step": 2516 + }, + { + "epoch": 0.7681977720128186, + "grad_norm": 1.7971733177340519, + "learning_rate": 1.3434000957629035e-06, + "loss": 0.2691, + "step": 2517 + }, + { + "epoch": 0.768502975736304, + "grad_norm": 1.9578135862598618, + "learning_rate": 1.3400297175216982e-06, + "loss": 0.5148, + "step": 2518 + }, + { + "epoch": 0.7688081794597894, + "grad_norm": 1.843697886619509, + "learning_rate": 1.33666291812508e-06, + "loss": 0.3523, + "step": 2519 + }, + { + "epoch": 0.7691133831832748, + "grad_norm": 1.1772101454940205, + "learning_rate": 1.3332997008652204e-06, + "loss": 0.2361, + "step": 2520 + }, + { + "epoch": 0.7694185869067602, + "grad_norm": 1.7516590156472773, + "learning_rate": 1.3299400690307917e-06, + "loss": 0.2935, + "step": 2521 + }, + { + "epoch": 0.7697237906302457, + "grad_norm": 1.538795564226342, + "learning_rate": 1.3265840259069551e-06, + "loss": 0.4383, + "step": 2522 + }, + { + "epoch": 0.7700289943537311, + "grad_norm": 1.415002162918217, + "learning_rate": 1.3232315747753639e-06, + "loss": 0.258, + "step": 2523 + }, + { + "epoch": 0.7703341980772166, + "grad_norm": 1.6090919467930516, + "learning_rate": 1.3198827189141627e-06, + "loss": 0.4418, + "step": 2524 + }, + { + "epoch": 0.770639401800702, + "grad_norm": 1.825167684820497, + "learning_rate": 1.3165374615979759e-06, + "loss": 0.2328, + "step": 2525 + }, + { + "epoch": 0.7709446055241874, + "grad_norm": 1.7249947589297079, + "learning_rate": 1.313195806097911e-06, + "loss": 0.4641, + "step": 2526 + }, + { + "epoch": 0.7712498092476728, + "grad_norm": 1.4212044612463415, + "learning_rate": 1.309857755681553e-06, + "loss": 0.2251, + "step": 2527 + }, + { + "epoch": 0.7715550129711582, + "grad_norm": 2.061260197546575, + "learning_rate": 1.3065233136129635e-06, + "loss": 0.4621, + "step": 2528 + }, + { + "epoch": 0.7718602166946437, + "grad_norm": 1.6672609064589439, + "learning_rate": 1.3031924831526737e-06, + "loss": 0.3449, + "step": 2529 + }, + { + "epoch": 0.7721654204181291, + "grad_norm": 1.4680252722090177, + "learning_rate": 1.2998652675576835e-06, + "loss": 0.3795, + "step": 2530 + }, + { + "epoch": 0.7724706241416145, + "grad_norm": 1.7029476248392246, + "learning_rate": 1.296541670081458e-06, + "loss": 0.3317, + "step": 2531 + }, + { + "epoch": 0.7727758278650999, + "grad_norm": 1.4895697995440238, + "learning_rate": 1.2932216939739284e-06, + "loss": 0.4766, + "step": 2532 + }, + { + "epoch": 0.7730810315885854, + "grad_norm": 1.7170191285718017, + "learning_rate": 1.2899053424814766e-06, + "loss": 0.1474, + "step": 2533 + }, + { + "epoch": 0.7733862353120708, + "grad_norm": 1.7030782430687559, + "learning_rate": 1.2865926188469497e-06, + "loss": 0.3246, + "step": 2534 + }, + { + "epoch": 0.7736914390355563, + "grad_norm": 1.6892554018601282, + "learning_rate": 1.2832835263096393e-06, + "loss": 0.4806, + "step": 2535 + }, + { + "epoch": 0.7739966427590417, + "grad_norm": 1.958577107076153, + "learning_rate": 1.2799780681052892e-06, + "loss": 0.4276, + "step": 2536 + }, + { + "epoch": 0.7743018464825271, + "grad_norm": 1.4386387336719104, + "learning_rate": 1.2766762474660931e-06, + "loss": 0.3012, + "step": 2537 + }, + { + "epoch": 0.7746070502060125, + "grad_norm": 1.4934041919813408, + "learning_rate": 1.2733780676206826e-06, + "loss": 0.3041, + "step": 2538 + }, + { + "epoch": 0.7749122539294979, + "grad_norm": 1.4600687963136778, + "learning_rate": 1.2700835317941296e-06, + "loss": 0.281, + "step": 2539 + }, + { + "epoch": 0.7752174576529833, + "grad_norm": 1.888694218658229, + "learning_rate": 1.2667926432079474e-06, + "loss": 0.2526, + "step": 2540 + }, + { + "epoch": 0.7755226613764687, + "grad_norm": 1.2069766331633431, + "learning_rate": 1.2635054050800788e-06, + "loss": 0.189, + "step": 2541 + }, + { + "epoch": 0.7758278650999543, + "grad_norm": 1.7914796184516408, + "learning_rate": 1.2602218206248962e-06, + "loss": 0.3041, + "step": 2542 + }, + { + "epoch": 0.7761330688234397, + "grad_norm": 1.6355628982899726, + "learning_rate": 1.256941893053204e-06, + "loss": 0.3401, + "step": 2543 + }, + { + "epoch": 0.7764382725469251, + "grad_norm": 1.5394860473922316, + "learning_rate": 1.2536656255722268e-06, + "loss": 0.2876, + "step": 2544 + }, + { + "epoch": 0.7767434762704105, + "grad_norm": 1.6669326172673098, + "learning_rate": 1.2503930213856098e-06, + "loss": 0.3818, + "step": 2545 + }, + { + "epoch": 0.7770486799938959, + "grad_norm": 1.9309726404813077, + "learning_rate": 1.2471240836934207e-06, + "loss": 0.3303, + "step": 2546 + }, + { + "epoch": 0.7773538837173813, + "grad_norm": 1.789663488734216, + "learning_rate": 1.2438588156921378e-06, + "loss": 0.595, + "step": 2547 + }, + { + "epoch": 0.7776590874408668, + "grad_norm": 1.8420347867118425, + "learning_rate": 1.2405972205746503e-06, + "loss": 0.2228, + "step": 2548 + }, + { + "epoch": 0.7779642911643522, + "grad_norm": 1.9279536438509446, + "learning_rate": 1.23733930153026e-06, + "loss": 0.2951, + "step": 2549 + }, + { + "epoch": 0.7782694948878376, + "grad_norm": 1.7373150486929299, + "learning_rate": 1.2340850617446725e-06, + "loss": 0.3925, + "step": 2550 + }, + { + "epoch": 0.7785746986113231, + "grad_norm": 1.5091105195836365, + "learning_rate": 1.230834504399994e-06, + "loss": 0.3783, + "step": 2551 + }, + { + "epoch": 0.7788799023348085, + "grad_norm": 1.6718016991634157, + "learning_rate": 1.2275876326747305e-06, + "loss": 0.2561, + "step": 2552 + }, + { + "epoch": 0.7791851060582939, + "grad_norm": 1.5749814289754123, + "learning_rate": 1.2243444497437867e-06, + "loss": 0.3802, + "step": 2553 + }, + { + "epoch": 0.7794903097817794, + "grad_norm": 1.4463613467598617, + "learning_rate": 1.2211049587784574e-06, + "loss": 0.2736, + "step": 2554 + }, + { + "epoch": 0.7797955135052648, + "grad_norm": 1.6206257707657987, + "learning_rate": 1.2178691629464257e-06, + "loss": 0.4952, + "step": 2555 + }, + { + "epoch": 0.7801007172287502, + "grad_norm": 1.4752179033349433, + "learning_rate": 1.2146370654117674e-06, + "loss": 0.3609, + "step": 2556 + }, + { + "epoch": 0.7804059209522356, + "grad_norm": 1.8063204424892583, + "learning_rate": 1.2114086693349365e-06, + "loss": 0.4268, + "step": 2557 + }, + { + "epoch": 0.780711124675721, + "grad_norm": 1.5945172426542091, + "learning_rate": 1.2081839778727683e-06, + "loss": 0.455, + "step": 2558 + }, + { + "epoch": 0.7810163283992064, + "grad_norm": 2.4498786584817176, + "learning_rate": 1.2049629941784801e-06, + "loss": 0.4397, + "step": 2559 + }, + { + "epoch": 0.7813215321226918, + "grad_norm": 1.3785621278506959, + "learning_rate": 1.2017457214016582e-06, + "loss": 0.2884, + "step": 2560 + }, + { + "epoch": 0.7816267358461774, + "grad_norm": 1.2967205185400668, + "learning_rate": 1.1985321626882617e-06, + "loss": 0.3354, + "step": 2561 + }, + { + "epoch": 0.7819319395696628, + "grad_norm": 2.1178940160744526, + "learning_rate": 1.1953223211806209e-06, + "loss": 0.2787, + "step": 2562 + }, + { + "epoch": 0.7822371432931482, + "grad_norm": 1.650534430564501, + "learning_rate": 1.192116200017428e-06, + "loss": 0.3282, + "step": 2563 + }, + { + "epoch": 0.7825423470166336, + "grad_norm": 1.6846552288380163, + "learning_rate": 1.1889138023337365e-06, + "loss": 0.1773, + "step": 2564 + }, + { + "epoch": 0.782847550740119, + "grad_norm": 1.586187483906093, + "learning_rate": 1.1857151312609622e-06, + "loss": 0.2041, + "step": 2565 + }, + { + "epoch": 0.7831527544636044, + "grad_norm": 1.3146422024869324, + "learning_rate": 1.1825201899268774e-06, + "loss": 0.359, + "step": 2566 + }, + { + "epoch": 0.7834579581870899, + "grad_norm": 1.6338815064392982, + "learning_rate": 1.1793289814556036e-06, + "loss": 0.3058, + "step": 2567 + }, + { + "epoch": 0.7837631619105753, + "grad_norm": 1.7511584670919065, + "learning_rate": 1.1761415089676126e-06, + "loss": 0.2958, + "step": 2568 + }, + { + "epoch": 0.7840683656340607, + "grad_norm": 1.475500219191894, + "learning_rate": 1.1729577755797278e-06, + "loss": 0.3746, + "step": 2569 + }, + { + "epoch": 0.7843735693575462, + "grad_norm": 1.4671615777046416, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3707, + "step": 2570 + }, + { + "epoch": 0.7846787730810316, + "grad_norm": 1.4370461452351335, + "learning_rate": 1.1666015385532648e-06, + "loss": 0.322, + "step": 2571 + }, + { + "epoch": 0.784983976804517, + "grad_norm": 1.4175708727468457, + "learning_rate": 1.1634290411300354e-06, + "loss": 0.2171, + "step": 2572 + }, + { + "epoch": 0.7852891805280025, + "grad_norm": 1.5392806000287926, + "learning_rate": 1.160260295237598e-06, + "loss": 0.3984, + "step": 2573 + }, + { + "epoch": 0.7855943842514879, + "grad_norm": 1.4241266552501894, + "learning_rate": 1.1570953039744592e-06, + "loss": 0.2882, + "step": 2574 + }, + { + "epoch": 0.7858995879749733, + "grad_norm": 1.7183588449833307, + "learning_rate": 1.153934070435459e-06, + "loss": 0.2638, + "step": 2575 + }, + { + "epoch": 0.7862047916984587, + "grad_norm": 1.4766861798193391, + "learning_rate": 1.1507765977117596e-06, + "loss": 0.2883, + "step": 2576 + }, + { + "epoch": 0.7865099954219441, + "grad_norm": 1.4461283262829343, + "learning_rate": 1.1476228888908437e-06, + "loss": 0.3625, + "step": 2577 + }, + { + "epoch": 0.7868151991454295, + "grad_norm": 1.5577204272457295, + "learning_rate": 1.1444729470565191e-06, + "loss": 0.3019, + "step": 2578 + }, + { + "epoch": 0.787120402868915, + "grad_norm": 2.044754210906538, + "learning_rate": 1.1413267752889062e-06, + "loss": 0.2757, + "step": 2579 + }, + { + "epoch": 0.7874256065924005, + "grad_norm": 1.5613863218607207, + "learning_rate": 1.1381843766644379e-06, + "loss": 0.1602, + "step": 2580 + }, + { + "epoch": 0.7877308103158859, + "grad_norm": 1.3626170466832803, + "learning_rate": 1.1350457542558608e-06, + "loss": 0.2227, + "step": 2581 + }, + { + "epoch": 0.7880360140393713, + "grad_norm": 1.2994120955756945, + "learning_rate": 1.131910911132229e-06, + "loss": 0.2259, + "step": 2582 + }, + { + "epoch": 0.7883412177628567, + "grad_norm": 1.5530753666950647, + "learning_rate": 1.1287798503588994e-06, + "loss": 0.3569, + "step": 2583 + }, + { + "epoch": 0.7886464214863421, + "grad_norm": 1.5007414574736635, + "learning_rate": 1.1256525749975278e-06, + "loss": 0.2333, + "step": 2584 + }, + { + "epoch": 0.7889516252098275, + "grad_norm": 1.6337499528343469, + "learning_rate": 1.1225290881060747e-06, + "loss": 0.3932, + "step": 2585 + }, + { + "epoch": 0.789256828933313, + "grad_norm": 1.46669837021519, + "learning_rate": 1.1194093927387917e-06, + "loss": 0.3024, + "step": 2586 + }, + { + "epoch": 0.7895620326567984, + "grad_norm": 1.610861230889205, + "learning_rate": 1.1162934919462226e-06, + "loss": 0.3625, + "step": 2587 + }, + { + "epoch": 0.7898672363802839, + "grad_norm": 1.3004736550965181, + "learning_rate": 1.1131813887752046e-06, + "loss": 0.2931, + "step": 2588 + }, + { + "epoch": 0.7901724401037693, + "grad_norm": 1.5664590650677594, + "learning_rate": 1.110073086268858e-06, + "loss": 0.3256, + "step": 2589 + }, + { + "epoch": 0.7904776438272547, + "grad_norm": 1.3693046525986896, + "learning_rate": 1.1069685874665858e-06, + "loss": 0.3568, + "step": 2590 + }, + { + "epoch": 0.7907828475507401, + "grad_norm": 1.4960672162046136, + "learning_rate": 1.103867895404077e-06, + "loss": 0.2358, + "step": 2591 + }, + { + "epoch": 0.7910880512742255, + "grad_norm": 1.5834616893473803, + "learning_rate": 1.100771013113293e-06, + "loss": 0.3178, + "step": 2592 + }, + { + "epoch": 0.791393254997711, + "grad_norm": 1.2548552157562565, + "learning_rate": 1.0976779436224706e-06, + "loss": 0.2395, + "step": 2593 + }, + { + "epoch": 0.7916984587211964, + "grad_norm": 1.581735088742958, + "learning_rate": 1.0945886899561215e-06, + "loss": 0.4003, + "step": 2594 + }, + { + "epoch": 0.7920036624446818, + "grad_norm": 1.6164845583620946, + "learning_rate": 1.0915032551350225e-06, + "loss": 0.3452, + "step": 2595 + }, + { + "epoch": 0.7923088661681672, + "grad_norm": 1.665959311424618, + "learning_rate": 1.0884216421762167e-06, + "loss": 0.3503, + "step": 2596 + }, + { + "epoch": 0.7926140698916527, + "grad_norm": 1.3877622305018942, + "learning_rate": 1.0853438540930127e-06, + "loss": 0.2358, + "step": 2597 + }, + { + "epoch": 0.7929192736151381, + "grad_norm": 1.6745241514089109, + "learning_rate": 1.0822698938949783e-06, + "loss": 0.4222, + "step": 2598 + }, + { + "epoch": 0.7932244773386236, + "grad_norm": 1.5809693339955015, + "learning_rate": 1.079199764587936e-06, + "loss": 0.3156, + "step": 2599 + }, + { + "epoch": 0.793529681062109, + "grad_norm": 1.9384104527651729, + "learning_rate": 1.0761334691739623e-06, + "loss": 0.6036, + "step": 2600 + }, + { + "epoch": 0.7938348847855944, + "grad_norm": 1.3628061772973745, + "learning_rate": 1.0730710106513887e-06, + "loss": 0.2319, + "step": 2601 + }, + { + "epoch": 0.7941400885090798, + "grad_norm": 1.5729552847393045, + "learning_rate": 1.0700123920147915e-06, + "loss": 0.3517, + "step": 2602 + }, + { + "epoch": 0.7944452922325652, + "grad_norm": 1.5823186919378267, + "learning_rate": 1.0669576162549905e-06, + "loss": 0.3823, + "step": 2603 + }, + { + "epoch": 0.7947504959560506, + "grad_norm": 1.5621083340900117, + "learning_rate": 1.063906686359054e-06, + "loss": 0.2805, + "step": 2604 + }, + { + "epoch": 0.795055699679536, + "grad_norm": 1.5411503760694267, + "learning_rate": 1.0608596053102849e-06, + "loss": 0.1962, + "step": 2605 + }, + { + "epoch": 0.7953609034030216, + "grad_norm": 1.6821783005946354, + "learning_rate": 1.057816376088221e-06, + "loss": 0.4098, + "step": 2606 + }, + { + "epoch": 0.795666107126507, + "grad_norm": 1.7612649878790774, + "learning_rate": 1.0547770016686403e-06, + "loss": 0.3115, + "step": 2607 + }, + { + "epoch": 0.7959713108499924, + "grad_norm": 1.9945711125156906, + "learning_rate": 1.0517414850235452e-06, + "loss": 0.4042, + "step": 2608 + }, + { + "epoch": 0.7962765145734778, + "grad_norm": 1.6001281269747107, + "learning_rate": 1.0487098291211678e-06, + "loss": 0.3009, + "step": 2609 + }, + { + "epoch": 0.7965817182969632, + "grad_norm": 1.2878584353560845, + "learning_rate": 1.0456820369259667e-06, + "loss": 0.1765, + "step": 2610 + }, + { + "epoch": 0.7968869220204486, + "grad_norm": 1.6267063278824643, + "learning_rate": 1.04265811139862e-06, + "loss": 0.328, + "step": 2611 + }, + { + "epoch": 0.797192125743934, + "grad_norm": 1.5193173765477481, + "learning_rate": 1.0396380554960255e-06, + "loss": 0.2889, + "step": 2612 + }, + { + "epoch": 0.7974973294674195, + "grad_norm": 1.604876629493802, + "learning_rate": 1.0366218721712983e-06, + "loss": 0.3451, + "step": 2613 + }, + { + "epoch": 0.7978025331909049, + "grad_norm": 1.5038961831288478, + "learning_rate": 1.033609564373767e-06, + "loss": 0.2933, + "step": 2614 + }, + { + "epoch": 0.7981077369143904, + "grad_norm": 1.467630324807371, + "learning_rate": 1.0306011350489686e-06, + "loss": 0.2042, + "step": 2615 + }, + { + "epoch": 0.7984129406378758, + "grad_norm": 1.5211283079825, + "learning_rate": 1.0275965871386474e-06, + "loss": 0.2607, + "step": 2616 + }, + { + "epoch": 0.7987181443613612, + "grad_norm": 1.6333004163162896, + "learning_rate": 1.024595923580755e-06, + "loss": 0.29, + "step": 2617 + }, + { + "epoch": 0.7990233480848467, + "grad_norm": 1.6102677619000663, + "learning_rate": 1.0215991473094439e-06, + "loss": 0.2565, + "step": 2618 + }, + { + "epoch": 0.7993285518083321, + "grad_norm": 1.552044332470833, + "learning_rate": 1.0186062612550618e-06, + "loss": 0.2537, + "step": 2619 + }, + { + "epoch": 0.7996337555318175, + "grad_norm": 2.716022745905198, + "learning_rate": 1.015617268344159e-06, + "loss": 0.4498, + "step": 2620 + }, + { + "epoch": 0.7999389592553029, + "grad_norm": 1.854019016287683, + "learning_rate": 1.0126321714994741e-06, + "loss": 0.5988, + "step": 2621 + }, + { + "epoch": 0.8002441629787883, + "grad_norm": 1.546455711798656, + "learning_rate": 1.009650973639935e-06, + "loss": 0.3001, + "step": 2622 + }, + { + "epoch": 0.8005493667022737, + "grad_norm": 1.732303654057111, + "learning_rate": 1.006673677680663e-06, + "loss": 0.4049, + "step": 2623 + }, + { + "epoch": 0.8008545704257592, + "grad_norm": 1.6992859574981316, + "learning_rate": 1.0037002865329582e-06, + "loss": 0.499, + "step": 2624 + }, + { + "epoch": 0.8011597741492447, + "grad_norm": 1.5594763682716168, + "learning_rate": 1.0007308031043035e-06, + "loss": 0.4562, + "step": 2625 + }, + { + "epoch": 0.8014649778727301, + "grad_norm": 1.6350669081877012, + "learning_rate": 9.97765230298365e-07, + "loss": 0.4341, + "step": 2626 + }, + { + "epoch": 0.8017701815962155, + "grad_norm": 1.6414271255756192, + "learning_rate": 9.948035710149788e-07, + "loss": 0.3881, + "step": 2627 + }, + { + "epoch": 0.8020753853197009, + "grad_norm": 1.6181320159630346, + "learning_rate": 9.91845828150157e-07, + "loss": 0.4622, + "step": 2628 + }, + { + "epoch": 0.8023805890431863, + "grad_norm": 1.6530374283842706, + "learning_rate": 9.888920045960848e-07, + "loss": 0.4549, + "step": 2629 + }, + { + "epoch": 0.8026857927666717, + "grad_norm": 1.409891753607114, + "learning_rate": 9.859421032411087e-07, + "loss": 0.1985, + "step": 2630 + }, + { + "epoch": 0.8029909964901571, + "grad_norm": 3.1747841108318884, + "learning_rate": 9.82996126969747e-07, + "loss": 0.299, + "step": 2631 + }, + { + "epoch": 0.8032962002136426, + "grad_norm": 1.8120883605751632, + "learning_rate": 9.800540786626733e-07, + "loss": 0.5064, + "step": 2632 + }, + { + "epoch": 0.803601403937128, + "grad_norm": 1.5712675160696425, + "learning_rate": 9.771159611967263e-07, + "loss": 0.4155, + "step": 2633 + }, + { + "epoch": 0.8039066076606135, + "grad_norm": 1.4720110301920981, + "learning_rate": 9.741817774448964e-07, + "loss": 0.2089, + "step": 2634 + }, + { + "epoch": 0.8042118113840989, + "grad_norm": 1.4869879762975362, + "learning_rate": 9.712515302763287e-07, + "loss": 0.1924, + "step": 2635 + }, + { + "epoch": 0.8045170151075843, + "grad_norm": 1.7374620886309813, + "learning_rate": 9.683252225563212e-07, + "loss": 0.3555, + "step": 2636 + }, + { + "epoch": 0.8048222188310697, + "grad_norm": 1.4234749312771073, + "learning_rate": 9.654028571463169e-07, + "loss": 0.2412, + "step": 2637 + }, + { + "epoch": 0.8051274225545552, + "grad_norm": 1.4787099249497095, + "learning_rate": 9.624844369039048e-07, + "loss": 0.4092, + "step": 2638 + }, + { + "epoch": 0.8054326262780406, + "grad_norm": 1.699621814847962, + "learning_rate": 9.595699646828178e-07, + "loss": 0.2133, + "step": 2639 + }, + { + "epoch": 0.805737830001526, + "grad_norm": 1.5439514121430349, + "learning_rate": 9.566594433329257e-07, + "loss": 0.2268, + "step": 2640 + }, + { + "epoch": 0.8060430337250114, + "grad_norm": 1.7387800208324073, + "learning_rate": 9.537528757002357e-07, + "loss": 0.3704, + "step": 2641 + }, + { + "epoch": 0.8063482374484968, + "grad_norm": 1.8876369603468202, + "learning_rate": 9.508502646268908e-07, + "loss": 0.2616, + "step": 2642 + }, + { + "epoch": 0.8066534411719823, + "grad_norm": 1.3815542412463353, + "learning_rate": 9.479516129511635e-07, + "loss": 0.1867, + "step": 2643 + }, + { + "epoch": 0.8069586448954678, + "grad_norm": 1.4669823256287975, + "learning_rate": 9.450569235074531e-07, + "loss": 0.3232, + "step": 2644 + }, + { + "epoch": 0.8072638486189532, + "grad_norm": 1.6840137214554676, + "learning_rate": 9.421661991262893e-07, + "loss": 0.3368, + "step": 2645 + }, + { + "epoch": 0.8075690523424386, + "grad_norm": 1.7677239963764457, + "learning_rate": 9.392794426343189e-07, + "loss": 0.4677, + "step": 2646 + }, + { + "epoch": 0.807874256065924, + "grad_norm": 1.592833461133337, + "learning_rate": 9.36396656854312e-07, + "loss": 0.2972, + "step": 2647 + }, + { + "epoch": 0.8081794597894094, + "grad_norm": 1.3250285685356402, + "learning_rate": 9.335178446051579e-07, + "loss": 0.2016, + "step": 2648 + }, + { + "epoch": 0.8084846635128948, + "grad_norm": 1.8020969309505135, + "learning_rate": 9.306430087018565e-07, + "loss": 0.3618, + "step": 2649 + }, + { + "epoch": 0.8087898672363802, + "grad_norm": 1.6753558706039484, + "learning_rate": 9.277721519555194e-07, + "loss": 0.3168, + "step": 2650 + }, + { + "epoch": 0.8090950709598657, + "grad_norm": 1.9916679752084858, + "learning_rate": 9.249052771733713e-07, + "loss": 0.4444, + "step": 2651 + }, + { + "epoch": 0.8094002746833512, + "grad_norm": 1.558934683546651, + "learning_rate": 9.220423871587403e-07, + "loss": 0.373, + "step": 2652 + }, + { + "epoch": 0.8097054784068366, + "grad_norm": 1.9602276858604508, + "learning_rate": 9.19183484711057e-07, + "loss": 0.3209, + "step": 2653 + }, + { + "epoch": 0.810010682130322, + "grad_norm": 1.6310920960452386, + "learning_rate": 9.163285726258563e-07, + "loss": 0.2726, + "step": 2654 + }, + { + "epoch": 0.8103158858538074, + "grad_norm": 1.4738232266496092, + "learning_rate": 9.134776536947687e-07, + "loss": 0.3224, + "step": 2655 + }, + { + "epoch": 0.8106210895772928, + "grad_norm": 1.258303528657992, + "learning_rate": 9.106307307055201e-07, + "loss": 0.2699, + "step": 2656 + }, + { + "epoch": 0.8109262933007783, + "grad_norm": 2.01664885310773, + "learning_rate": 9.077878064419283e-07, + "loss": 0.3761, + "step": 2657 + }, + { + "epoch": 0.8112314970242637, + "grad_norm": 2.1735184852545903, + "learning_rate": 9.049488836839049e-07, + "loss": 0.2646, + "step": 2658 + }, + { + "epoch": 0.8115367007477491, + "grad_norm": 1.2358409048111307, + "learning_rate": 9.021139652074451e-07, + "loss": 0.1907, + "step": 2659 + }, + { + "epoch": 0.8118419044712345, + "grad_norm": 1.9648143849539013, + "learning_rate": 8.992830537846275e-07, + "loss": 0.4223, + "step": 2660 + }, + { + "epoch": 0.81214710819472, + "grad_norm": 1.99769292327075, + "learning_rate": 8.964561521836179e-07, + "loss": 0.304, + "step": 2661 + }, + { + "epoch": 0.8124523119182054, + "grad_norm": 1.4146455858753013, + "learning_rate": 8.936332631686545e-07, + "loss": 0.1802, + "step": 2662 + }, + { + "epoch": 0.8127575156416909, + "grad_norm": 1.703787018785887, + "learning_rate": 8.908143895000565e-07, + "loss": 0.3692, + "step": 2663 + }, + { + "epoch": 0.8130627193651763, + "grad_norm": 1.8186227708079596, + "learning_rate": 8.879995339342168e-07, + "loss": 0.1969, + "step": 2664 + }, + { + "epoch": 0.8133679230886617, + "grad_norm": 1.5891890243827214, + "learning_rate": 8.851886992235964e-07, + "loss": 0.3539, + "step": 2665 + }, + { + "epoch": 0.8136731268121471, + "grad_norm": 1.8341778052946631, + "learning_rate": 8.823818881167246e-07, + "loss": 0.2046, + "step": 2666 + }, + { + "epoch": 0.8139783305356325, + "grad_norm": 1.6571117534443234, + "learning_rate": 8.795791033582007e-07, + "loss": 0.1426, + "step": 2667 + }, + { + "epoch": 0.8142835342591179, + "grad_norm": 1.6438546499147626, + "learning_rate": 8.767803476886821e-07, + "loss": 0.2573, + "step": 2668 + }, + { + "epoch": 0.8145887379826033, + "grad_norm": 1.6007966978669126, + "learning_rate": 8.739856238448868e-07, + "loss": 0.3303, + "step": 2669 + }, + { + "epoch": 0.8148939417060889, + "grad_norm": 1.881959840806181, + "learning_rate": 8.711949345595949e-07, + "loss": 0.2803, + "step": 2670 + }, + { + "epoch": 0.8151991454295743, + "grad_norm": 1.6541498652581024, + "learning_rate": 8.684082825616363e-07, + "loss": 0.2841, + "step": 2671 + }, + { + "epoch": 0.8155043491530597, + "grad_norm": 1.9023082189355414, + "learning_rate": 8.656256705758936e-07, + "loss": 0.3925, + "step": 2672 + }, + { + "epoch": 0.8158095528765451, + "grad_norm": 1.3554873946169075, + "learning_rate": 8.628471013233036e-07, + "loss": 0.2266, + "step": 2673 + }, + { + "epoch": 0.8161147566000305, + "grad_norm": 1.4971356063509114, + "learning_rate": 8.600725775208446e-07, + "loss": 0.2315, + "step": 2674 + }, + { + "epoch": 0.8164199603235159, + "grad_norm": 1.628357280189394, + "learning_rate": 8.573021018815409e-07, + "loss": 0.3426, + "step": 2675 + }, + { + "epoch": 0.8167251640470014, + "grad_norm": 1.588611622088212, + "learning_rate": 8.545356771144603e-07, + "loss": 0.2657, + "step": 2676 + }, + { + "epoch": 0.8170303677704868, + "grad_norm": 1.5671981369981116, + "learning_rate": 8.517733059247074e-07, + "loss": 0.3451, + "step": 2677 + }, + { + "epoch": 0.8173355714939722, + "grad_norm": 1.4950183160482764, + "learning_rate": 8.490149910134233e-07, + "loss": 0.1883, + "step": 2678 + }, + { + "epoch": 0.8176407752174577, + "grad_norm": 1.733033632386829, + "learning_rate": 8.462607350777796e-07, + "loss": 0.2291, + "step": 2679 + }, + { + "epoch": 0.8179459789409431, + "grad_norm": 1.503043980308405, + "learning_rate": 8.435105408109889e-07, + "loss": 0.2581, + "step": 2680 + }, + { + "epoch": 0.8182511826644285, + "grad_norm": 1.518032325124027, + "learning_rate": 8.407644109022822e-07, + "loss": 0.2657, + "step": 2681 + }, + { + "epoch": 0.818556386387914, + "grad_norm": 1.5121513753077411, + "learning_rate": 8.380223480369182e-07, + "loss": 0.2234, + "step": 2682 + }, + { + "epoch": 0.8188615901113994, + "grad_norm": 1.6772203646134276, + "learning_rate": 8.352843548961831e-07, + "loss": 0.318, + "step": 2683 + }, + { + "epoch": 0.8191667938348848, + "grad_norm": 1.541296484896679, + "learning_rate": 8.325504341573792e-07, + "loss": 0.2169, + "step": 2684 + }, + { + "epoch": 0.8194719975583702, + "grad_norm": 1.6917734077991005, + "learning_rate": 8.298205884938265e-07, + "loss": 0.2167, + "step": 2685 + }, + { + "epoch": 0.8197772012818556, + "grad_norm": 1.5189316848902874, + "learning_rate": 8.270948205748642e-07, + "loss": 0.3513, + "step": 2686 + }, + { + "epoch": 0.820082405005341, + "grad_norm": 1.4605678151811097, + "learning_rate": 8.243731330658411e-07, + "loss": 0.2117, + "step": 2687 + }, + { + "epoch": 0.8203876087288265, + "grad_norm": 1.6154899348869585, + "learning_rate": 8.216555286281153e-07, + "loss": 0.221, + "step": 2688 + }, + { + "epoch": 0.820692812452312, + "grad_norm": 1.513179260818861, + "learning_rate": 8.189420099190559e-07, + "loss": 0.2856, + "step": 2689 + }, + { + "epoch": 0.8209980161757974, + "grad_norm": 1.922241858227103, + "learning_rate": 8.162325795920334e-07, + "loss": 0.4977, + "step": 2690 + }, + { + "epoch": 0.8213032198992828, + "grad_norm": 1.6229844431340152, + "learning_rate": 8.135272402964211e-07, + "loss": 0.2974, + "step": 2691 + }, + { + "epoch": 0.8216084236227682, + "grad_norm": 2.1257005546532133, + "learning_rate": 8.108259946775943e-07, + "loss": 0.4415, + "step": 2692 + }, + { + "epoch": 0.8219136273462536, + "grad_norm": 1.654955231905261, + "learning_rate": 8.081288453769226e-07, + "loss": 0.3712, + "step": 2693 + }, + { + "epoch": 0.822218831069739, + "grad_norm": 1.4861717994079104, + "learning_rate": 8.054357950317709e-07, + "loss": 0.2555, + "step": 2694 + }, + { + "epoch": 0.8225240347932244, + "grad_norm": 1.8518695174363622, + "learning_rate": 8.027468462754967e-07, + "loss": 0.3779, + "step": 2695 + }, + { + "epoch": 0.8228292385167099, + "grad_norm": 1.7312777933582089, + "learning_rate": 8.000620017374472e-07, + "loss": 0.3539, + "step": 2696 + }, + { + "epoch": 0.8231344422401954, + "grad_norm": 1.2929073445931385, + "learning_rate": 7.973812640429557e-07, + "loss": 0.1687, + "step": 2697 + }, + { + "epoch": 0.8234396459636808, + "grad_norm": 1.7167304919084254, + "learning_rate": 7.94704635813337e-07, + "loss": 0.3658, + "step": 2698 + }, + { + "epoch": 0.8237448496871662, + "grad_norm": 1.4959828308262608, + "learning_rate": 7.920321196658931e-07, + "loss": 0.3476, + "step": 2699 + }, + { + "epoch": 0.8240500534106516, + "grad_norm": 1.3697764271322341, + "learning_rate": 7.893637182139002e-07, + "loss": 0.2692, + "step": 2700 + }, + { + "epoch": 0.824355257134137, + "grad_norm": 1.5335971204552559, + "learning_rate": 7.866994340666129e-07, + "loss": 0.2137, + "step": 2701 + }, + { + "epoch": 0.8246604608576225, + "grad_norm": 1.657534119497938, + "learning_rate": 7.840392698292615e-07, + "loss": 0.371, + "step": 2702 + }, + { + "epoch": 0.8249656645811079, + "grad_norm": 1.8020708013844835, + "learning_rate": 7.813832281030442e-07, + "loss": 0.3245, + "step": 2703 + }, + { + "epoch": 0.8252708683045933, + "grad_norm": 1.6667883669607235, + "learning_rate": 7.787313114851292e-07, + "loss": 0.2716, + "step": 2704 + }, + { + "epoch": 0.8255760720280787, + "grad_norm": 1.2478884983014769, + "learning_rate": 7.760835225686536e-07, + "loss": 0.2216, + "step": 2705 + }, + { + "epoch": 0.8258812757515641, + "grad_norm": 1.5816860999845315, + "learning_rate": 7.734398639427154e-07, + "loss": 0.2371, + "step": 2706 + }, + { + "epoch": 0.8261864794750496, + "grad_norm": 1.5788263707479415, + "learning_rate": 7.708003381923734e-07, + "loss": 0.3195, + "step": 2707 + }, + { + "epoch": 0.826491683198535, + "grad_norm": 1.4651010516210756, + "learning_rate": 7.681649478986486e-07, + "loss": 0.3439, + "step": 2708 + }, + { + "epoch": 0.8267968869220205, + "grad_norm": 1.6232033113632907, + "learning_rate": 7.655336956385156e-07, + "loss": 0.2262, + "step": 2709 + }, + { + "epoch": 0.8271020906455059, + "grad_norm": 1.7164045108999924, + "learning_rate": 7.629065839849015e-07, + "loss": 0.2985, + "step": 2710 + }, + { + "epoch": 0.8274072943689913, + "grad_norm": 1.6014691581507452, + "learning_rate": 7.60283615506689e-07, + "loss": 0.4344, + "step": 2711 + }, + { + "epoch": 0.8277124980924767, + "grad_norm": 1.443248285088709, + "learning_rate": 7.576647927687041e-07, + "loss": 0.3966, + "step": 2712 + }, + { + "epoch": 0.8280177018159621, + "grad_norm": 1.4925582984260413, + "learning_rate": 7.550501183317238e-07, + "loss": 0.2209, + "step": 2713 + }, + { + "epoch": 0.8283229055394475, + "grad_norm": 1.6487842936271624, + "learning_rate": 7.524395947524648e-07, + "loss": 0.4352, + "step": 2714 + }, + { + "epoch": 0.828628109262933, + "grad_norm": 1.3713077041321657, + "learning_rate": 7.498332245835882e-07, + "loss": 0.3803, + "step": 2715 + }, + { + "epoch": 0.8289333129864185, + "grad_norm": 1.7482914076955032, + "learning_rate": 7.472310103736913e-07, + "loss": 0.4693, + "step": 2716 + }, + { + "epoch": 0.8292385167099039, + "grad_norm": 1.550940671168507, + "learning_rate": 7.446329546673075e-07, + "loss": 0.2152, + "step": 2717 + }, + { + "epoch": 0.8295437204333893, + "grad_norm": 1.4576270800196935, + "learning_rate": 7.420390600049077e-07, + "loss": 0.3354, + "step": 2718 + }, + { + "epoch": 0.8298489241568747, + "grad_norm": 1.8982575884682047, + "learning_rate": 7.394493289228887e-07, + "loss": 0.2404, + "step": 2719 + }, + { + "epoch": 0.8301541278803601, + "grad_norm": 1.691515066685838, + "learning_rate": 7.368637639535786e-07, + "loss": 0.4312, + "step": 2720 + }, + { + "epoch": 0.8304593316038456, + "grad_norm": 1.4694249651618714, + "learning_rate": 7.342823676252331e-07, + "loss": 0.2509, + "step": 2721 + }, + { + "epoch": 0.830764535327331, + "grad_norm": 1.546455249285673, + "learning_rate": 7.317051424620292e-07, + "loss": 0.2546, + "step": 2722 + }, + { + "epoch": 0.8310697390508164, + "grad_norm": 1.7556179334661928, + "learning_rate": 7.291320909840649e-07, + "loss": 0.371, + "step": 2723 + }, + { + "epoch": 0.8313749427743018, + "grad_norm": 1.8313804397736553, + "learning_rate": 7.265632157073604e-07, + "loss": 0.3004, + "step": 2724 + }, + { + "epoch": 0.8316801464977873, + "grad_norm": 1.5378078245585296, + "learning_rate": 7.239985191438498e-07, + "loss": 0.2243, + "step": 2725 + }, + { + "epoch": 0.8319853502212727, + "grad_norm": 1.5004354480661528, + "learning_rate": 7.214380038013797e-07, + "loss": 0.2521, + "step": 2726 + }, + { + "epoch": 0.8322905539447581, + "grad_norm": 1.7167295197520305, + "learning_rate": 7.188816721837116e-07, + "loss": 0.3701, + "step": 2727 + }, + { + "epoch": 0.8325957576682436, + "grad_norm": 1.4893673915884809, + "learning_rate": 7.163295267905135e-07, + "loss": 0.4652, + "step": 2728 + }, + { + "epoch": 0.832900961391729, + "grad_norm": 1.9899160562954143, + "learning_rate": 7.137815701173617e-07, + "loss": 0.3553, + "step": 2729 + }, + { + "epoch": 0.8332061651152144, + "grad_norm": 1.9035527893233544, + "learning_rate": 7.112378046557339e-07, + "loss": 0.2428, + "step": 2730 + }, + { + "epoch": 0.8335113688386998, + "grad_norm": 1.7219634100895571, + "learning_rate": 7.086982328930142e-07, + "loss": 0.4137, + "step": 2731 + }, + { + "epoch": 0.8338165725621852, + "grad_norm": 1.6987942570976622, + "learning_rate": 7.061628573124807e-07, + "loss": 0.3738, + "step": 2732 + }, + { + "epoch": 0.8341217762856706, + "grad_norm": 1.5903429002305511, + "learning_rate": 7.036316803933107e-07, + "loss": 0.4614, + "step": 2733 + }, + { + "epoch": 0.8344269800091562, + "grad_norm": 1.5979300639731906, + "learning_rate": 7.011047046105773e-07, + "loss": 0.3753, + "step": 2734 + }, + { + "epoch": 0.8347321837326416, + "grad_norm": 1.4488125049184104, + "learning_rate": 6.985819324352438e-07, + "loss": 0.1889, + "step": 2735 + }, + { + "epoch": 0.835037387456127, + "grad_norm": 1.6759672617434724, + "learning_rate": 6.960633663341615e-07, + "loss": 0.1517, + "step": 2736 + }, + { + "epoch": 0.8353425911796124, + "grad_norm": 1.625179574287605, + "learning_rate": 6.935490087700736e-07, + "loss": 0.3266, + "step": 2737 + }, + { + "epoch": 0.8356477949030978, + "grad_norm": 1.4067868373637953, + "learning_rate": 6.91038862201604e-07, + "loss": 0.1633, + "step": 2738 + }, + { + "epoch": 0.8359529986265832, + "grad_norm": 1.8386286065687836, + "learning_rate": 6.885329290832593e-07, + "loss": 0.2931, + "step": 2739 + }, + { + "epoch": 0.8362582023500686, + "grad_norm": 1.6406484693029308, + "learning_rate": 6.860312118654283e-07, + "loss": 0.4988, + "step": 2740 + }, + { + "epoch": 0.8365634060735541, + "grad_norm": 1.3974679324270254, + "learning_rate": 6.835337129943759e-07, + "loss": 0.2291, + "step": 2741 + }, + { + "epoch": 0.8368686097970395, + "grad_norm": 1.683103485593189, + "learning_rate": 6.810404349122407e-07, + "loss": 0.3066, + "step": 2742 + }, + { + "epoch": 0.837173813520525, + "grad_norm": 1.6876120353524549, + "learning_rate": 6.785513800570376e-07, + "loss": 0.4332, + "step": 2743 + }, + { + "epoch": 0.8374790172440104, + "grad_norm": 1.5257702305049623, + "learning_rate": 6.76066550862648e-07, + "loss": 0.1981, + "step": 2744 + }, + { + "epoch": 0.8377842209674958, + "grad_norm": 1.6135370453517366, + "learning_rate": 6.735859497588254e-07, + "loss": 0.1967, + "step": 2745 + }, + { + "epoch": 0.8380894246909812, + "grad_norm": 1.7321804011662434, + "learning_rate": 6.711095791711841e-07, + "loss": 0.2733, + "step": 2746 + }, + { + "epoch": 0.8383946284144667, + "grad_norm": 1.8118392792385045, + "learning_rate": 6.686374415212066e-07, + "loss": 0.2028, + "step": 2747 + }, + { + "epoch": 0.8386998321379521, + "grad_norm": 2.0445068871019902, + "learning_rate": 6.661695392262335e-07, + "loss": 0.4171, + "step": 2748 + }, + { + "epoch": 0.8390050358614375, + "grad_norm": 2.0394915717592106, + "learning_rate": 6.637058746994629e-07, + "loss": 0.3445, + "step": 2749 + }, + { + "epoch": 0.8393102395849229, + "grad_norm": 1.8957888489698793, + "learning_rate": 6.612464503499521e-07, + "loss": 0.4731, + "step": 2750 + }, + { + "epoch": 0.8396154433084083, + "grad_norm": 4.581169704342479, + "learning_rate": 6.587912685826104e-07, + "loss": 0.3414, + "step": 2751 + }, + { + "epoch": 0.8399206470318938, + "grad_norm": 1.8018825911654097, + "learning_rate": 6.563403317981975e-07, + "loss": 0.1865, + "step": 2752 + }, + { + "epoch": 0.8402258507553793, + "grad_norm": 1.8405863889002556, + "learning_rate": 6.538936423933257e-07, + "loss": 0.4019, + "step": 2753 + }, + { + "epoch": 0.8405310544788647, + "grad_norm": 1.3557187600114369, + "learning_rate": 6.514512027604508e-07, + "loss": 0.2392, + "step": 2754 + }, + { + "epoch": 0.8408362582023501, + "grad_norm": 1.6872412871795575, + "learning_rate": 6.490130152878727e-07, + "loss": 0.4288, + "step": 2755 + }, + { + "epoch": 0.8411414619258355, + "grad_norm": 1.643255033052677, + "learning_rate": 6.465790823597373e-07, + "loss": 0.4054, + "step": 2756 + }, + { + "epoch": 0.8414466656493209, + "grad_norm": 1.7998639002315395, + "learning_rate": 6.441494063560272e-07, + "loss": 0.4727, + "step": 2757 + }, + { + "epoch": 0.8417518693728063, + "grad_norm": 1.5893664944174015, + "learning_rate": 6.417239896525607e-07, + "loss": 0.2931, + "step": 2758 + }, + { + "epoch": 0.8420570730962917, + "grad_norm": 2.1674995533239834, + "learning_rate": 6.393028346209968e-07, + "loss": 0.376, + "step": 2759 + }, + { + "epoch": 0.8423622768197772, + "grad_norm": 3.081440490653117, + "learning_rate": 6.368859436288227e-07, + "loss": 0.4837, + "step": 2760 + }, + { + "epoch": 0.8426674805432627, + "grad_norm": 1.4416792282850845, + "learning_rate": 6.344733190393554e-07, + "loss": 0.2417, + "step": 2761 + }, + { + "epoch": 0.8429726842667481, + "grad_norm": 1.525500656242817, + "learning_rate": 6.320649632117443e-07, + "loss": 0.219, + "step": 2762 + }, + { + "epoch": 0.8432778879902335, + "grad_norm": 1.6337898651139318, + "learning_rate": 6.296608785009634e-07, + "loss": 0.437, + "step": 2763 + }, + { + "epoch": 0.8435830917137189, + "grad_norm": 1.6056735982404093, + "learning_rate": 6.272610672578072e-07, + "loss": 0.3207, + "step": 2764 + }, + { + "epoch": 0.8438882954372043, + "grad_norm": 1.7759983210340933, + "learning_rate": 6.248655318288932e-07, + "loss": 0.3323, + "step": 2765 + }, + { + "epoch": 0.8441934991606898, + "grad_norm": 1.433669667741416, + "learning_rate": 6.224742745566603e-07, + "loss": 0.2073, + "step": 2766 + }, + { + "epoch": 0.8444987028841752, + "grad_norm": 1.7109539431822582, + "learning_rate": 6.200872977793604e-07, + "loss": 0.4429, + "step": 2767 + }, + { + "epoch": 0.8448039066076606, + "grad_norm": 1.5934193586060046, + "learning_rate": 6.177046038310608e-07, + "loss": 0.3145, + "step": 2768 + }, + { + "epoch": 0.845109110331146, + "grad_norm": 1.6459467724042265, + "learning_rate": 6.153261950416433e-07, + "loss": 0.2244, + "step": 2769 + }, + { + "epoch": 0.8454143140546315, + "grad_norm": 2.6281456945377912, + "learning_rate": 6.129520737367972e-07, + "loss": 0.2321, + "step": 2770 + }, + { + "epoch": 0.8457195177781169, + "grad_norm": 1.1481705958782344, + "learning_rate": 6.105822422380181e-07, + "loss": 0.1813, + "step": 2771 + }, + { + "epoch": 0.8460247215016024, + "grad_norm": 2.063357319525523, + "learning_rate": 6.082167028626102e-07, + "loss": 0.2958, + "step": 2772 + }, + { + "epoch": 0.8463299252250878, + "grad_norm": 1.5858495528166288, + "learning_rate": 6.058554579236791e-07, + "loss": 0.228, + "step": 2773 + }, + { + "epoch": 0.8466351289485732, + "grad_norm": 1.5093068046249114, + "learning_rate": 6.034985097301299e-07, + "loss": 0.3244, + "step": 2774 + }, + { + "epoch": 0.8469403326720586, + "grad_norm": 1.69358407084056, + "learning_rate": 6.011458605866694e-07, + "loss": 0.4273, + "step": 2775 + }, + { + "epoch": 0.847245536395544, + "grad_norm": 1.5431909328757925, + "learning_rate": 5.987975127937973e-07, + "loss": 0.459, + "step": 2776 + }, + { + "epoch": 0.8475507401190294, + "grad_norm": 1.510231015426264, + "learning_rate": 5.964534686478074e-07, + "loss": 0.1519, + "step": 2777 + }, + { + "epoch": 0.8478559438425148, + "grad_norm": 1.6896869475566794, + "learning_rate": 5.941137304407901e-07, + "loss": 0.2564, + "step": 2778 + }, + { + "epoch": 0.8481611475660003, + "grad_norm": 1.7422526394422695, + "learning_rate": 5.917783004606192e-07, + "loss": 0.2066, + "step": 2779 + }, + { + "epoch": 0.8484663512894858, + "grad_norm": 1.9404947455240935, + "learning_rate": 5.894471809909585e-07, + "loss": 0.2178, + "step": 2780 + }, + { + "epoch": 0.8487715550129712, + "grad_norm": 1.551273604097384, + "learning_rate": 5.87120374311258e-07, + "loss": 0.3228, + "step": 2781 + }, + { + "epoch": 0.8490767587364566, + "grad_norm": 1.492001906432092, + "learning_rate": 5.847978826967482e-07, + "loss": 0.3276, + "step": 2782 + }, + { + "epoch": 0.849381962459942, + "grad_norm": 2.0452200918982286, + "learning_rate": 5.824797084184409e-07, + "loss": 0.127, + "step": 2783 + }, + { + "epoch": 0.8496871661834274, + "grad_norm": 1.866882235420747, + "learning_rate": 5.801658537431259e-07, + "loss": 0.392, + "step": 2784 + }, + { + "epoch": 0.8499923699069128, + "grad_norm": 1.5550305165427303, + "learning_rate": 5.778563209333715e-07, + "loss": 0.3541, + "step": 2785 + }, + { + "epoch": 0.8502975736303983, + "grad_norm": 1.5586397169022426, + "learning_rate": 5.755511122475172e-07, + "loss": 0.255, + "step": 2786 + }, + { + "epoch": 0.8506027773538837, + "grad_norm": 1.7528566478080552, + "learning_rate": 5.732502299396742e-07, + "loss": 0.2115, + "step": 2787 + }, + { + "epoch": 0.8509079810773691, + "grad_norm": 2.717382152707819, + "learning_rate": 5.709536762597262e-07, + "loss": 0.2892, + "step": 2788 + }, + { + "epoch": 0.8512131848008546, + "grad_norm": 1.9158547243459207, + "learning_rate": 5.686614534533213e-07, + "loss": 0.2301, + "step": 2789 + }, + { + "epoch": 0.85151838852434, + "grad_norm": 1.4491063817306546, + "learning_rate": 5.663735637618728e-07, + "loss": 0.1835, + "step": 2790 + }, + { + "epoch": 0.8518235922478254, + "grad_norm": 1.4648970530666563, + "learning_rate": 5.640900094225593e-07, + "loss": 0.357, + "step": 2791 + }, + { + "epoch": 0.8521287959713109, + "grad_norm": 1.7272508034615215, + "learning_rate": 5.61810792668318e-07, + "loss": 0.218, + "step": 2792 + }, + { + "epoch": 0.8524339996947963, + "grad_norm": 2.0758367574819987, + "learning_rate": 5.595359157278435e-07, + "loss": 0.4254, + "step": 2793 + }, + { + "epoch": 0.8527392034182817, + "grad_norm": 1.5052228440725954, + "learning_rate": 5.572653808255918e-07, + "loss": 0.3254, + "step": 2794 + }, + { + "epoch": 0.8530444071417671, + "grad_norm": 1.9500607065385915, + "learning_rate": 5.549991901817686e-07, + "loss": 0.2944, + "step": 2795 + }, + { + "epoch": 0.8533496108652525, + "grad_norm": 1.445294024375155, + "learning_rate": 5.527373460123314e-07, + "loss": 0.2834, + "step": 2796 + }, + { + "epoch": 0.8536548145887379, + "grad_norm": 1.546931256128644, + "learning_rate": 5.504798505289916e-07, + "loss": 0.3251, + "step": 2797 + }, + { + "epoch": 0.8539600183122235, + "grad_norm": 1.7930685443573522, + "learning_rate": 5.482267059392049e-07, + "loss": 0.2334, + "step": 2798 + }, + { + "epoch": 0.8542652220357089, + "grad_norm": 1.4881243285105927, + "learning_rate": 5.459779144461713e-07, + "loss": 0.3803, + "step": 2799 + }, + { + "epoch": 0.8545704257591943, + "grad_norm": 1.9628258841546715, + "learning_rate": 5.437334782488396e-07, + "loss": 0.3018, + "step": 2800 + }, + { + "epoch": 0.8548756294826797, + "grad_norm": 1.6535321374289942, + "learning_rate": 5.414933995418947e-07, + "loss": 0.324, + "step": 2801 + }, + { + "epoch": 0.8551808332061651, + "grad_norm": 2.0209536827929493, + "learning_rate": 5.392576805157617e-07, + "loss": 0.3422, + "step": 2802 + }, + { + "epoch": 0.8554860369296505, + "grad_norm": 1.6412923999431686, + "learning_rate": 5.370263233566053e-07, + "loss": 0.3786, + "step": 2803 + }, + { + "epoch": 0.8557912406531359, + "grad_norm": 1.9568745746447607, + "learning_rate": 5.347993302463228e-07, + "loss": 0.3732, + "step": 2804 + }, + { + "epoch": 0.8560964443766214, + "grad_norm": 1.6531349253942424, + "learning_rate": 5.325767033625429e-07, + "loss": 0.332, + "step": 2805 + }, + { + "epoch": 0.8564016481001068, + "grad_norm": 1.479312979039126, + "learning_rate": 5.303584448786286e-07, + "loss": 0.2376, + "step": 2806 + }, + { + "epoch": 0.8567068518235923, + "grad_norm": 1.6752264353727613, + "learning_rate": 5.281445569636684e-07, + "loss": 0.207, + "step": 2807 + }, + { + "epoch": 0.8570120555470777, + "grad_norm": 1.7475890854815774, + "learning_rate": 5.259350417824777e-07, + "loss": 0.3528, + "step": 2808 + }, + { + "epoch": 0.8573172592705631, + "grad_norm": 1.6438561727950982, + "learning_rate": 5.237299014955955e-07, + "loss": 0.164, + "step": 2809 + }, + { + "epoch": 0.8576224629940485, + "grad_norm": 1.9847062202427255, + "learning_rate": 5.215291382592841e-07, + "loss": 0.218, + "step": 2810 + }, + { + "epoch": 0.857927666717534, + "grad_norm": 2.064912252339862, + "learning_rate": 5.193327542255266e-07, + "loss": 0.3854, + "step": 2811 + }, + { + "epoch": 0.8582328704410194, + "grad_norm": 1.8378487236605714, + "learning_rate": 5.171407515420213e-07, + "loss": 0.3145, + "step": 2812 + }, + { + "epoch": 0.8585380741645048, + "grad_norm": 1.5437728911027, + "learning_rate": 5.14953132352185e-07, + "loss": 0.1919, + "step": 2813 + }, + { + "epoch": 0.8588432778879902, + "grad_norm": 1.9243326987424028, + "learning_rate": 5.127698987951457e-07, + "loss": 0.3098, + "step": 2814 + }, + { + "epoch": 0.8591484816114756, + "grad_norm": 1.6447618916260383, + "learning_rate": 5.105910530057439e-07, + "loss": 0.227, + "step": 2815 + }, + { + "epoch": 0.8594536853349611, + "grad_norm": 1.622402609560446, + "learning_rate": 5.084165971145316e-07, + "loss": 0.1718, + "step": 2816 + }, + { + "epoch": 0.8597588890584466, + "grad_norm": 1.7108956945877176, + "learning_rate": 5.06246533247765e-07, + "loss": 0.3257, + "step": 2817 + }, + { + "epoch": 0.860064092781932, + "grad_norm": 1.89074485966348, + "learning_rate": 5.040808635274063e-07, + "loss": 0.3174, + "step": 2818 + }, + { + "epoch": 0.8603692965054174, + "grad_norm": 2.2435927822106434, + "learning_rate": 5.019195900711233e-07, + "loss": 0.214, + "step": 2819 + }, + { + "epoch": 0.8606745002289028, + "grad_norm": 2.2042387183178556, + "learning_rate": 4.997627149922829e-07, + "loss": 0.3325, + "step": 2820 + }, + { + "epoch": 0.8609797039523882, + "grad_norm": 2.001996950739807, + "learning_rate": 4.976102403999511e-07, + "loss": 0.362, + "step": 2821 + }, + { + "epoch": 0.8612849076758736, + "grad_norm": 1.688043577454316, + "learning_rate": 4.954621683988919e-07, + "loss": 0.3231, + "step": 2822 + }, + { + "epoch": 0.861590111399359, + "grad_norm": 1.8101403233630515, + "learning_rate": 4.933185010895641e-07, + "loss": 0.3479, + "step": 2823 + }, + { + "epoch": 0.8618953151228445, + "grad_norm": 1.6240100779760942, + "learning_rate": 4.911792405681182e-07, + "loss": 0.2984, + "step": 2824 + }, + { + "epoch": 0.86220051884633, + "grad_norm": 1.349047354727378, + "learning_rate": 4.890443889263974e-07, + "loss": 0.2936, + "step": 2825 + }, + { + "epoch": 0.8625057225698154, + "grad_norm": 1.9930665713311155, + "learning_rate": 4.869139482519325e-07, + "loss": 0.2475, + "step": 2826 + }, + { + "epoch": 0.8628109262933008, + "grad_norm": 2.0426209483761584, + "learning_rate": 4.847879206279421e-07, + "loss": 0.3413, + "step": 2827 + }, + { + "epoch": 0.8631161300167862, + "grad_norm": 1.6504839447544135, + "learning_rate": 4.826663081333283e-07, + "loss": 0.291, + "step": 2828 + }, + { + "epoch": 0.8634213337402716, + "grad_norm": 1.6082023867071948, + "learning_rate": 4.805491128426776e-07, + "loss": 0.2673, + "step": 2829 + }, + { + "epoch": 0.863726537463757, + "grad_norm": 2.147514006066084, + "learning_rate": 4.784363368262557e-07, + "loss": 0.4531, + "step": 2830 + }, + { + "epoch": 0.8640317411872425, + "grad_norm": 1.4923935193747502, + "learning_rate": 4.763279821500061e-07, + "loss": 0.2804, + "step": 2831 + }, + { + "epoch": 0.8643369449107279, + "grad_norm": 1.5802741948758767, + "learning_rate": 4.74224050875553e-07, + "loss": 0.298, + "step": 2832 + }, + { + "epoch": 0.8646421486342133, + "grad_norm": 1.9097476826521518, + "learning_rate": 4.721245450601913e-07, + "loss": 0.4915, + "step": 2833 + }, + { + "epoch": 0.8649473523576988, + "grad_norm": 1.323127589594884, + "learning_rate": 4.7002946675688943e-07, + "loss": 0.2669, + "step": 2834 + }, + { + "epoch": 0.8652525560811842, + "grad_norm": 1.3578549798744834, + "learning_rate": 4.6793881801428796e-07, + "loss": 0.3498, + "step": 2835 + }, + { + "epoch": 0.8655577598046696, + "grad_norm": 2.281989513356405, + "learning_rate": 4.6585260087669394e-07, + "loss": 0.2315, + "step": 2836 + }, + { + "epoch": 0.8658629635281551, + "grad_norm": 1.5256072414416366, + "learning_rate": 4.6377081738408214e-07, + "loss": 0.3382, + "step": 2837 + }, + { + "epoch": 0.8661681672516405, + "grad_norm": 1.5990002488928738, + "learning_rate": 4.6169346957209313e-07, + "loss": 0.407, + "step": 2838 + }, + { + "epoch": 0.8664733709751259, + "grad_norm": 1.5824569317831327, + "learning_rate": 4.596205594720282e-07, + "loss": 0.1483, + "step": 2839 + }, + { + "epoch": 0.8667785746986113, + "grad_norm": 1.8002191357207018, + "learning_rate": 4.5755208911084927e-07, + "loss": 0.389, + "step": 2840 + }, + { + "epoch": 0.8670837784220967, + "grad_norm": 1.5460094766264687, + "learning_rate": 4.554880605111789e-07, + "loss": 0.3566, + "step": 2841 + }, + { + "epoch": 0.8673889821455821, + "grad_norm": 1.8934339353984617, + "learning_rate": 4.534284756912943e-07, + "loss": 0.2947, + "step": 2842 + }, + { + "epoch": 0.8676941858690677, + "grad_norm": 1.8798336032520344, + "learning_rate": 4.5137333666512794e-07, + "loss": 0.562, + "step": 2843 + }, + { + "epoch": 0.8679993895925531, + "grad_norm": 1.6991690091273968, + "learning_rate": 4.493226454422661e-07, + "loss": 0.3847, + "step": 2844 + }, + { + "epoch": 0.8683045933160385, + "grad_norm": 1.6244784765393927, + "learning_rate": 4.4727640402794536e-07, + "loss": 0.381, + "step": 2845 + }, + { + "epoch": 0.8686097970395239, + "grad_norm": 1.8197468545946518, + "learning_rate": 4.4523461442305016e-07, + "loss": 0.4407, + "step": 2846 + }, + { + "epoch": 0.8689150007630093, + "grad_norm": 1.7345323920789553, + "learning_rate": 4.431972786241118e-07, + "loss": 0.3216, + "step": 2847 + }, + { + "epoch": 0.8692202044864947, + "grad_norm": 1.4723264289459734, + "learning_rate": 4.4116439862330887e-07, + "loss": 0.3166, + "step": 2848 + }, + { + "epoch": 0.8695254082099801, + "grad_norm": 1.7425234346715797, + "learning_rate": 4.3913597640846016e-07, + "loss": 0.2847, + "step": 2849 + }, + { + "epoch": 0.8698306119334656, + "grad_norm": 1.7469750554837415, + "learning_rate": 4.371120139630258e-07, + "loss": 0.222, + "step": 2850 + }, + { + "epoch": 0.870135815656951, + "grad_norm": 1.5602645141141633, + "learning_rate": 4.350925132661077e-07, + "loss": 0.3852, + "step": 2851 + }, + { + "epoch": 0.8704410193804364, + "grad_norm": 2.6057014418616484, + "learning_rate": 4.330774762924417e-07, + "loss": 0.364, + "step": 2852 + }, + { + "epoch": 0.8707462231039219, + "grad_norm": 1.6420251593666035, + "learning_rate": 4.3106690501239945e-07, + "loss": 0.2929, + "step": 2853 + }, + { + "epoch": 0.8710514268274073, + "grad_norm": 1.6694773656836528, + "learning_rate": 4.290608013919889e-07, + "loss": 0.2865, + "step": 2854 + }, + { + "epoch": 0.8713566305508927, + "grad_norm": 1.5571008860954931, + "learning_rate": 4.270591673928448e-07, + "loss": 0.2187, + "step": 2855 + }, + { + "epoch": 0.8716618342743782, + "grad_norm": 1.8049190752481343, + "learning_rate": 4.250620049722337e-07, + "loss": 0.2377, + "step": 2856 + }, + { + "epoch": 0.8719670379978636, + "grad_norm": 1.2194201265448192, + "learning_rate": 4.2306931608305126e-07, + "loss": 0.196, + "step": 2857 + }, + { + "epoch": 0.872272241721349, + "grad_norm": 1.5577095602262914, + "learning_rate": 4.2108110267381596e-07, + "loss": 0.3266, + "step": 2858 + }, + { + "epoch": 0.8725774454448344, + "grad_norm": 1.8261355799367787, + "learning_rate": 4.190973666886705e-07, + "loss": 0.4057, + "step": 2859 + }, + { + "epoch": 0.8728826491683198, + "grad_norm": 1.7034846634774858, + "learning_rate": 4.17118110067381e-07, + "loss": 0.272, + "step": 2860 + }, + { + "epoch": 0.8731878528918052, + "grad_norm": 1.8111684446224605, + "learning_rate": 4.1514333474533263e-07, + "loss": 0.2453, + "step": 2861 + }, + { + "epoch": 0.8734930566152908, + "grad_norm": 1.5276556062497046, + "learning_rate": 4.1317304265352855e-07, + "loss": 0.3509, + "step": 2862 + }, + { + "epoch": 0.8737982603387762, + "grad_norm": 1.8832433057327023, + "learning_rate": 4.1120723571858645e-07, + "loss": 0.1538, + "step": 2863 + }, + { + "epoch": 0.8741034640622616, + "grad_norm": 1.9218201047919379, + "learning_rate": 4.092459158627421e-07, + "loss": 0.43, + "step": 2864 + }, + { + "epoch": 0.874408667785747, + "grad_norm": 1.895909765470433, + "learning_rate": 4.0728908500384024e-07, + "loss": 0.3998, + "step": 2865 + }, + { + "epoch": 0.8747138715092324, + "grad_norm": 1.494713208557475, + "learning_rate": 4.053367450553364e-07, + "loss": 0.2971, + "step": 2866 + }, + { + "epoch": 0.8750190752327178, + "grad_norm": 1.8677719031637467, + "learning_rate": 4.033888979262973e-07, + "loss": 0.3805, + "step": 2867 + }, + { + "epoch": 0.8753242789562032, + "grad_norm": 1.849333697261009, + "learning_rate": 4.014455455213939e-07, + "loss": 0.262, + "step": 2868 + }, + { + "epoch": 0.8756294826796887, + "grad_norm": 1.534919079812837, + "learning_rate": 3.995066897409022e-07, + "loss": 0.2033, + "step": 2869 + }, + { + "epoch": 0.8759346864031741, + "grad_norm": 1.834784872349212, + "learning_rate": 3.9757233248070403e-07, + "loss": 0.4, + "step": 2870 + }, + { + "epoch": 0.8762398901266596, + "grad_norm": 1.9071103015159971, + "learning_rate": 3.9564247563227854e-07, + "loss": 0.1831, + "step": 2871 + }, + { + "epoch": 0.876545093850145, + "grad_norm": 1.592166637699512, + "learning_rate": 3.9371712108270567e-07, + "loss": 0.2476, + "step": 2872 + }, + { + "epoch": 0.8768502975736304, + "grad_norm": 2.0964013009468907, + "learning_rate": 3.9179627071466443e-07, + "loss": 0.6438, + "step": 2873 + }, + { + "epoch": 0.8771555012971158, + "grad_norm": 1.5601353872677923, + "learning_rate": 3.898799264064279e-07, + "loss": 0.3256, + "step": 2874 + }, + { + "epoch": 0.8774607050206013, + "grad_norm": 2.2184170755256747, + "learning_rate": 3.8796809003186274e-07, + "loss": 0.3217, + "step": 2875 + }, + { + "epoch": 0.8777659087440867, + "grad_norm": 1.5778596862760628, + "learning_rate": 3.8606076346042855e-07, + "loss": 0.2969, + "step": 2876 + }, + { + "epoch": 0.8780711124675721, + "grad_norm": 1.5917046081388069, + "learning_rate": 3.8415794855717556e-07, + "loss": 0.4407, + "step": 2877 + }, + { + "epoch": 0.8783763161910575, + "grad_norm": 1.6609436322132212, + "learning_rate": 3.8225964718274114e-07, + "loss": 0.3823, + "step": 2878 + }, + { + "epoch": 0.8786815199145429, + "grad_norm": 1.6035716782319969, + "learning_rate": 3.8036586119334806e-07, + "loss": 0.2148, + "step": 2879 + }, + { + "epoch": 0.8789867236380284, + "grad_norm": 1.758931121934974, + "learning_rate": 3.784765924408074e-07, + "loss": 0.3072, + "step": 2880 + }, + { + "epoch": 0.8792919273615138, + "grad_norm": 1.7537087921934966, + "learning_rate": 3.7659184277251017e-07, + "loss": 0.4087, + "step": 2881 + }, + { + "epoch": 0.8795971310849993, + "grad_norm": 1.656573066152407, + "learning_rate": 3.747116140314283e-07, + "loss": 0.2921, + "step": 2882 + }, + { + "epoch": 0.8799023348084847, + "grad_norm": 1.6821172129718753, + "learning_rate": 3.728359080561156e-07, + "loss": 0.3312, + "step": 2883 + }, + { + "epoch": 0.8802075385319701, + "grad_norm": 2.029673624908971, + "learning_rate": 3.7096472668070095e-07, + "loss": 0.3149, + "step": 2884 + }, + { + "epoch": 0.8805127422554555, + "grad_norm": 1.4923235446767098, + "learning_rate": 3.690980717348885e-07, + "loss": 0.4473, + "step": 2885 + }, + { + "epoch": 0.8808179459789409, + "grad_norm": 1.884813820020141, + "learning_rate": 3.6723594504396054e-07, + "loss": 0.2947, + "step": 2886 + }, + { + "epoch": 0.8811231497024263, + "grad_norm": 2.0386964900644418, + "learning_rate": 3.6537834842876596e-07, + "loss": 0.2797, + "step": 2887 + }, + { + "epoch": 0.8814283534259117, + "grad_norm": 1.4766396800058812, + "learning_rate": 3.635252837057274e-07, + "loss": 0.2852, + "step": 2888 + }, + { + "epoch": 0.8817335571493973, + "grad_norm": 1.2583942844879477, + "learning_rate": 3.616767526868353e-07, + "loss": 0.1721, + "step": 2889 + }, + { + "epoch": 0.8820387608728827, + "grad_norm": 1.7116168341336353, + "learning_rate": 3.598327571796467e-07, + "loss": 0.3355, + "step": 2890 + }, + { + "epoch": 0.8823439645963681, + "grad_norm": 1.7783296973427543, + "learning_rate": 3.5799329898728254e-07, + "loss": 0.4913, + "step": 2891 + }, + { + "epoch": 0.8826491683198535, + "grad_norm": 1.660842789455405, + "learning_rate": 3.5615837990842894e-07, + "loss": 0.207, + "step": 2892 + }, + { + "epoch": 0.8829543720433389, + "grad_norm": 1.9116154189531032, + "learning_rate": 3.5432800173733464e-07, + "loss": 0.366, + "step": 2893 + }, + { + "epoch": 0.8832595757668243, + "grad_norm": 2.2885818758039442, + "learning_rate": 3.5250216626380395e-07, + "loss": 0.3826, + "step": 2894 + }, + { + "epoch": 0.8835647794903098, + "grad_norm": 3.0762827050967285, + "learning_rate": 3.506808752732016e-07, + "loss": 0.3125, + "step": 2895 + }, + { + "epoch": 0.8838699832137952, + "grad_norm": 1.5947558650859617, + "learning_rate": 3.488641305464496e-07, + "loss": 0.374, + "step": 2896 + }, + { + "epoch": 0.8841751869372806, + "grad_norm": 1.588661672926618, + "learning_rate": 3.470519338600231e-07, + "loss": 0.3316, + "step": 2897 + }, + { + "epoch": 0.8844803906607661, + "grad_norm": 1.91229278339448, + "learning_rate": 3.45244286985949e-07, + "loss": 0.3497, + "step": 2898 + }, + { + "epoch": 0.8847855943842515, + "grad_norm": 2.162437627697701, + "learning_rate": 3.434411916918079e-07, + "loss": 0.2772, + "step": 2899 + }, + { + "epoch": 0.8850907981077369, + "grad_norm": 2.1766614246548697, + "learning_rate": 3.4164264974072767e-07, + "loss": 0.3205, + "step": 2900 + }, + { + "epoch": 0.8853960018312224, + "grad_norm": 1.766540155139548, + "learning_rate": 3.3984866289138384e-07, + "loss": 0.3762, + "step": 2901 + }, + { + "epoch": 0.8857012055547078, + "grad_norm": 1.656932762057204, + "learning_rate": 3.3805923289799925e-07, + "loss": 0.2464, + "step": 2902 + }, + { + "epoch": 0.8860064092781932, + "grad_norm": 1.9518060121489198, + "learning_rate": 3.3627436151033976e-07, + "loss": 0.355, + "step": 2903 + }, + { + "epoch": 0.8863116130016786, + "grad_norm": 1.2265407293051998, + "learning_rate": 3.344940504737132e-07, + "loss": 0.2153, + "step": 2904 + }, + { + "epoch": 0.886616816725164, + "grad_norm": 2.1094249578140785, + "learning_rate": 3.3271830152897034e-07, + "loss": 0.3913, + "step": 2905 + }, + { + "epoch": 0.8869220204486494, + "grad_norm": 1.8163593901465602, + "learning_rate": 3.309471164124978e-07, + "loss": 0.1928, + "step": 2906 + }, + { + "epoch": 0.887227224172135, + "grad_norm": 1.611692741062144, + "learning_rate": 3.2918049685622176e-07, + "loss": 0.2319, + "step": 2907 + }, + { + "epoch": 0.8875324278956204, + "grad_norm": 1.7094278163526146, + "learning_rate": 3.274184445876033e-07, + "loss": 0.4093, + "step": 2908 + }, + { + "epoch": 0.8878376316191058, + "grad_norm": 2.231266019533923, + "learning_rate": 3.2566096132963855e-07, + "loss": 0.3184, + "step": 2909 + }, + { + "epoch": 0.8881428353425912, + "grad_norm": 1.7527878217750263, + "learning_rate": 3.2390804880085446e-07, + "loss": 0.2561, + "step": 2910 + }, + { + "epoch": 0.8884480390660766, + "grad_norm": 1.4361536936264583, + "learning_rate": 3.2215970871530833e-07, + "loss": 0.2711, + "step": 2911 + }, + { + "epoch": 0.888753242789562, + "grad_norm": 1.6304932691615903, + "learning_rate": 3.2041594278258814e-07, + "loss": 0.3421, + "step": 2912 + }, + { + "epoch": 0.8890584465130474, + "grad_norm": 1.6973082577670313, + "learning_rate": 3.1867675270780775e-07, + "loss": 0.4015, + "step": 2913 + }, + { + "epoch": 0.8893636502365329, + "grad_norm": 1.6377867352985322, + "learning_rate": 3.1694214019160617e-07, + "loss": 0.1819, + "step": 2914 + }, + { + "epoch": 0.8896688539600183, + "grad_norm": 2.070385280265544, + "learning_rate": 3.1521210693014895e-07, + "loss": 0.2974, + "step": 2915 + }, + { + "epoch": 0.8899740576835038, + "grad_norm": 1.3208479839358807, + "learning_rate": 3.1348665461511995e-07, + "loss": 0.2575, + "step": 2916 + }, + { + "epoch": 0.8902792614069892, + "grad_norm": 1.8611178524534187, + "learning_rate": 3.1176578493372634e-07, + "loss": 0.5982, + "step": 2917 + }, + { + "epoch": 0.8905844651304746, + "grad_norm": 1.8773806399667003, + "learning_rate": 3.1004949956869425e-07, + "loss": 0.3908, + "step": 2918 + }, + { + "epoch": 0.89088966885396, + "grad_norm": 1.9302383439732715, + "learning_rate": 3.083378001982662e-07, + "loss": 0.3582, + "step": 2919 + }, + { + "epoch": 0.8911948725774455, + "grad_norm": 1.632939018932989, + "learning_rate": 3.0663068849619947e-07, + "loss": 0.2021, + "step": 2920 + }, + { + "epoch": 0.8915000763009309, + "grad_norm": 1.7351259547529747, + "learning_rate": 3.0492816613176824e-07, + "loss": 0.2321, + "step": 2921 + }, + { + "epoch": 0.8918052800244163, + "grad_norm": 2.35469591611334, + "learning_rate": 3.0323023476975633e-07, + "loss": 0.3903, + "step": 2922 + }, + { + "epoch": 0.8921104837479017, + "grad_norm": 1.8756225823879709, + "learning_rate": 3.015368960704584e-07, + "loss": 0.3714, + "step": 2923 + }, + { + "epoch": 0.8924156874713871, + "grad_norm": 1.5504329107606214, + "learning_rate": 2.9984815168968005e-07, + "loss": 0.2628, + "step": 2924 + }, + { + "epoch": 0.8927208911948725, + "grad_norm": 1.5052450982942016, + "learning_rate": 2.9816400327873475e-07, + "loss": 0.311, + "step": 2925 + }, + { + "epoch": 0.893026094918358, + "grad_norm": 1.6022165591136466, + "learning_rate": 2.96484452484439e-07, + "loss": 0.4598, + "step": 2926 + }, + { + "epoch": 0.8933312986418435, + "grad_norm": 2.724851142247225, + "learning_rate": 2.948095009491164e-07, + "loss": 0.3631, + "step": 2927 + }, + { + "epoch": 0.8936365023653289, + "grad_norm": 1.5278134611567196, + "learning_rate": 2.9313915031059124e-07, + "loss": 0.1995, + "step": 2928 + }, + { + "epoch": 0.8939417060888143, + "grad_norm": 1.6693975328119404, + "learning_rate": 2.914734022021892e-07, + "loss": 0.3837, + "step": 2929 + }, + { + "epoch": 0.8942469098122997, + "grad_norm": 1.8591924625843064, + "learning_rate": 2.8981225825273685e-07, + "loss": 0.477, + "step": 2930 + }, + { + "epoch": 0.8945521135357851, + "grad_norm": 1.4329841835389625, + "learning_rate": 2.8815572008655813e-07, + "loss": 0.1959, + "step": 2931 + }, + { + "epoch": 0.8948573172592705, + "grad_norm": 1.4974132326746141, + "learning_rate": 2.865037893234718e-07, + "loss": 0.1497, + "step": 2932 + }, + { + "epoch": 0.895162520982756, + "grad_norm": 1.881911572696749, + "learning_rate": 2.8485646757879346e-07, + "loss": 0.3683, + "step": 2933 + }, + { + "epoch": 0.8954677247062414, + "grad_norm": 1.8397958024371963, + "learning_rate": 2.832137564633303e-07, + "loss": 0.3345, + "step": 2934 + }, + { + "epoch": 0.8957729284297269, + "grad_norm": 1.296334671153873, + "learning_rate": 2.815756575833822e-07, + "loss": 0.266, + "step": 2935 + }, + { + "epoch": 0.8960781321532123, + "grad_norm": 1.6612062252985271, + "learning_rate": 2.79942172540737e-07, + "loss": 0.4569, + "step": 2936 + }, + { + "epoch": 0.8963833358766977, + "grad_norm": 1.5904279005922972, + "learning_rate": 2.7831330293267456e-07, + "loss": 0.2959, + "step": 2937 + }, + { + "epoch": 0.8966885396001831, + "grad_norm": 1.961044430191955, + "learning_rate": 2.7668905035195814e-07, + "loss": 0.4133, + "step": 2938 + }, + { + "epoch": 0.8969937433236685, + "grad_norm": 1.6163743043142387, + "learning_rate": 2.750694163868373e-07, + "loss": 0.3433, + "step": 2939 + }, + { + "epoch": 0.897298947047154, + "grad_norm": 1.722356168945633, + "learning_rate": 2.734544026210473e-07, + "loss": 0.3398, + "step": 2940 + }, + { + "epoch": 0.8976041507706394, + "grad_norm": 1.5628068241229134, + "learning_rate": 2.7184401063380274e-07, + "loss": 0.1959, + "step": 2941 + }, + { + "epoch": 0.8979093544941248, + "grad_norm": 1.3166091717266857, + "learning_rate": 2.702382419998001e-07, + "loss": 0.1876, + "step": 2942 + }, + { + "epoch": 0.8982145582176102, + "grad_norm": 1.4971338546018425, + "learning_rate": 2.6863709828921613e-07, + "loss": 0.298, + "step": 2943 + }, + { + "epoch": 0.8985197619410957, + "grad_norm": 2.1887573988008957, + "learning_rate": 2.67040581067704e-07, + "loss": 0.2459, + "step": 2944 + }, + { + "epoch": 0.8988249656645811, + "grad_norm": 1.9489520851849282, + "learning_rate": 2.654486918963922e-07, + "loss": 0.3013, + "step": 2945 + }, + { + "epoch": 0.8991301693880666, + "grad_norm": 1.8230958541721705, + "learning_rate": 2.6386143233188555e-07, + "loss": 0.1776, + "step": 2946 + }, + { + "epoch": 0.899435373111552, + "grad_norm": 1.7492627907133111, + "learning_rate": 2.6227880392626027e-07, + "loss": 0.4107, + "step": 2947 + }, + { + "epoch": 0.8997405768350374, + "grad_norm": 1.5771168425144115, + "learning_rate": 2.60700808227064e-07, + "loss": 0.2886, + "step": 2948 + }, + { + "epoch": 0.9000457805585228, + "grad_norm": 1.432403735775991, + "learning_rate": 2.591274467773169e-07, + "loss": 0.3455, + "step": 2949 + }, + { + "epoch": 0.9003509842820082, + "grad_norm": 1.6271872105899479, + "learning_rate": 2.5755872111550507e-07, + "loss": 0.3225, + "step": 2950 + }, + { + "epoch": 0.9006561880054936, + "grad_norm": 1.551260924442995, + "learning_rate": 2.5599463277558135e-07, + "loss": 0.3506, + "step": 2951 + }, + { + "epoch": 0.900961391728979, + "grad_norm": 1.6455674338929827, + "learning_rate": 2.544351832869668e-07, + "loss": 0.3607, + "step": 2952 + }, + { + "epoch": 0.9012665954524646, + "grad_norm": 1.7086200783597483, + "learning_rate": 2.5288037417454337e-07, + "loss": 0.2798, + "step": 2953 + }, + { + "epoch": 0.90157179917595, + "grad_norm": 1.3498086299184577, + "learning_rate": 2.513302069586565e-07, + "loss": 0.1468, + "step": 2954 + }, + { + "epoch": 0.9018770028994354, + "grad_norm": 1.6086819091472078, + "learning_rate": 2.497846831551143e-07, + "loss": 0.2781, + "step": 2955 + }, + { + "epoch": 0.9021822066229208, + "grad_norm": 1.7869933710855876, + "learning_rate": 2.4824380427518225e-07, + "loss": 0.2661, + "step": 2956 + }, + { + "epoch": 0.9024874103464062, + "grad_norm": 1.8348454250746127, + "learning_rate": 2.46707571825584e-07, + "loss": 0.1668, + "step": 2957 + }, + { + "epoch": 0.9027926140698916, + "grad_norm": 1.7159862145001317, + "learning_rate": 2.4517598730850076e-07, + "loss": 0.3124, + "step": 2958 + }, + { + "epoch": 0.9030978177933771, + "grad_norm": 1.8220207483217812, + "learning_rate": 2.436490522215695e-07, + "loss": 0.2875, + "step": 2959 + }, + { + "epoch": 0.9034030215168625, + "grad_norm": 1.892282941255574, + "learning_rate": 2.421267680578787e-07, + "loss": 0.288, + "step": 2960 + }, + { + "epoch": 0.9037082252403479, + "grad_norm": 1.5044806634351904, + "learning_rate": 2.406091363059698e-07, + "loss": 0.2717, + "step": 2961 + }, + { + "epoch": 0.9040134289638334, + "grad_norm": 1.379994986497369, + "learning_rate": 2.3909615844983633e-07, + "loss": 0.281, + "step": 2962 + }, + { + "epoch": 0.9043186326873188, + "grad_norm": 1.6561392261240486, + "learning_rate": 2.3758783596891988e-07, + "loss": 0.3448, + "step": 2963 + }, + { + "epoch": 0.9046238364108042, + "grad_norm": 1.666811253316016, + "learning_rate": 2.360841703381084e-07, + "loss": 0.3559, + "step": 2964 + }, + { + "epoch": 0.9049290401342897, + "grad_norm": 1.77491503767383, + "learning_rate": 2.3458516302773905e-07, + "loss": 0.4251, + "step": 2965 + }, + { + "epoch": 0.9052342438577751, + "grad_norm": 2.1718214289144027, + "learning_rate": 2.3309081550359324e-07, + "loss": 0.288, + "step": 2966 + }, + { + "epoch": 0.9055394475812605, + "grad_norm": 2.085574000543535, + "learning_rate": 2.3160112922689326e-07, + "loss": 0.3233, + "step": 2967 + }, + { + "epoch": 0.9058446513047459, + "grad_norm": 2.1440465827621273, + "learning_rate": 2.301161056543072e-07, + "loss": 0.2738, + "step": 2968 + }, + { + "epoch": 0.9061498550282313, + "grad_norm": 1.9519208934810752, + "learning_rate": 2.2863574623794181e-07, + "loss": 0.4609, + "step": 2969 + }, + { + "epoch": 0.9064550587517167, + "grad_norm": 1.6276575512078413, + "learning_rate": 2.2716005242534255e-07, + "loss": 0.4467, + "step": 2970 + }, + { + "epoch": 0.9067602624752022, + "grad_norm": 1.7591843062973673, + "learning_rate": 2.2568902565949457e-07, + "loss": 0.2674, + "step": 2971 + }, + { + "epoch": 0.9070654661986877, + "grad_norm": 1.9227596820909427, + "learning_rate": 2.2422266737881838e-07, + "loss": 0.2263, + "step": 2972 + }, + { + "epoch": 0.9073706699221731, + "grad_norm": 1.827078756270843, + "learning_rate": 2.2276097901716753e-07, + "loss": 0.3184, + "step": 2973 + }, + { + "epoch": 0.9076758736456585, + "grad_norm": 1.7897558076617492, + "learning_rate": 2.2130396200383264e-07, + "loss": 0.2699, + "step": 2974 + }, + { + "epoch": 0.9079810773691439, + "grad_norm": 1.4481076816647764, + "learning_rate": 2.1985161776353513e-07, + "loss": 0.3225, + "step": 2975 + }, + { + "epoch": 0.9082862810926293, + "grad_norm": 1.751007198952909, + "learning_rate": 2.1840394771642737e-07, + "loss": 0.5209, + "step": 2976 + }, + { + "epoch": 0.9085914848161147, + "grad_norm": 1.678714346091653, + "learning_rate": 2.169609532780892e-07, + "loss": 0.3714, + "step": 2977 + }, + { + "epoch": 0.9088966885396002, + "grad_norm": 1.6436678325582152, + "learning_rate": 2.1552263585953136e-07, + "loss": 0.2791, + "step": 2978 + }, + { + "epoch": 0.9092018922630856, + "grad_norm": 1.752840733728362, + "learning_rate": 2.1408899686719e-07, + "loss": 0.2642, + "step": 2979 + }, + { + "epoch": 0.9095070959865711, + "grad_norm": 1.539056923173589, + "learning_rate": 2.1266003770292533e-07, + "loss": 0.1958, + "step": 2980 + }, + { + "epoch": 0.9098122997100565, + "grad_norm": 1.3348877111637576, + "learning_rate": 2.1123575976402467e-07, + "loss": 0.217, + "step": 2981 + }, + { + "epoch": 0.9101175034335419, + "grad_norm": 1.5200835256465404, + "learning_rate": 2.0981616444319452e-07, + "loss": 0.2368, + "step": 2982 + }, + { + "epoch": 0.9104227071570273, + "grad_norm": 1.5057324703588917, + "learning_rate": 2.0840125312856396e-07, + "loss": 0.1525, + "step": 2983 + }, + { + "epoch": 0.9107279108805127, + "grad_norm": 1.518402857411774, + "learning_rate": 2.0699102720368236e-07, + "loss": 0.4341, + "step": 2984 + }, + { + "epoch": 0.9110331146039982, + "grad_norm": 2.0636749388863187, + "learning_rate": 2.0558548804751666e-07, + "loss": 0.3891, + "step": 2985 + }, + { + "epoch": 0.9113383183274836, + "grad_norm": 1.8381990828593886, + "learning_rate": 2.041846370344508e-07, + "loss": 0.2212, + "step": 2986 + }, + { + "epoch": 0.911643522050969, + "grad_norm": 1.6369897440529062, + "learning_rate": 2.0278847553428682e-07, + "loss": 0.2761, + "step": 2987 + }, + { + "epoch": 0.9119487257744544, + "grad_norm": 1.956432319408319, + "learning_rate": 2.0139700491223768e-07, + "loss": 0.3502, + "step": 2988 + }, + { + "epoch": 0.9122539294979398, + "grad_norm": 1.4843700208078496, + "learning_rate": 2.000102265289311e-07, + "loss": 0.2639, + "step": 2989 + }, + { + "epoch": 0.9125591332214253, + "grad_norm": 1.8702280992730664, + "learning_rate": 1.9862814174040678e-07, + "loss": 0.2676, + "step": 2990 + }, + { + "epoch": 0.9128643369449108, + "grad_norm": 1.6415192846117646, + "learning_rate": 1.9725075189811594e-07, + "loss": 0.2689, + "step": 2991 + }, + { + "epoch": 0.9131695406683962, + "grad_norm": 1.7393193606196415, + "learning_rate": 1.9587805834891617e-07, + "loss": 0.3546, + "step": 2992 + }, + { + "epoch": 0.9134747443918816, + "grad_norm": 1.6258605732371942, + "learning_rate": 1.9451006243507487e-07, + "loss": 0.2548, + "step": 2993 + }, + { + "epoch": 0.913779948115367, + "grad_norm": 1.6662873631242507, + "learning_rate": 1.9314676549426593e-07, + "loss": 0.2709, + "step": 2994 + }, + { + "epoch": 0.9140851518388524, + "grad_norm": 3.438359239868652, + "learning_rate": 1.9178816885956742e-07, + "loss": 0.2072, + "step": 2995 + }, + { + "epoch": 0.9143903555623378, + "grad_norm": 1.552789881451876, + "learning_rate": 1.9043427385946057e-07, + "loss": 0.3087, + "step": 2996 + }, + { + "epoch": 0.9146955592858232, + "grad_norm": 1.6744143729085579, + "learning_rate": 1.8908508181783247e-07, + "loss": 0.282, + "step": 2997 + }, + { + "epoch": 0.9150007630093087, + "grad_norm": 1.862795592539928, + "learning_rate": 1.8774059405396894e-07, + "loss": 0.2934, + "step": 2998 + }, + { + "epoch": 0.9153059667327942, + "grad_norm": 1.5699772263351266, + "learning_rate": 1.8640081188255444e-07, + "loss": 0.2005, + "step": 2999 + }, + { + "epoch": 0.9156111704562796, + "grad_norm": 1.4547961889979046, + "learning_rate": 1.8506573661367655e-07, + "loss": 0.231, + "step": 3000 + }, + { + "epoch": 0.915916374179765, + "grad_norm": 1.9805592172038058, + "learning_rate": 1.83735369552816e-07, + "loss": 0.4442, + "step": 3001 + }, + { + "epoch": 0.9162215779032504, + "grad_norm": 1.6046233910046397, + "learning_rate": 1.8240971200085166e-07, + "loss": 0.3761, + "step": 3002 + }, + { + "epoch": 0.9165267816267358, + "grad_norm": 1.8319659841661153, + "learning_rate": 1.8108876525405772e-07, + "loss": 0.4478, + "step": 3003 + }, + { + "epoch": 0.9168319853502213, + "grad_norm": 1.522913681354371, + "learning_rate": 1.7977253060410093e-07, + "loss": 0.1907, + "step": 3004 + }, + { + "epoch": 0.9171371890737067, + "grad_norm": 1.6945324346838082, + "learning_rate": 1.7846100933803955e-07, + "loss": 0.27, + "step": 3005 + }, + { + "epoch": 0.9174423927971921, + "grad_norm": 1.6314053837810694, + "learning_rate": 1.771542027383255e-07, + "loss": 0.2314, + "step": 3006 + }, + { + "epoch": 0.9177475965206775, + "grad_norm": 1.6985445636319214, + "learning_rate": 1.7585211208279884e-07, + "loss": 0.3697, + "step": 3007 + }, + { + "epoch": 0.918052800244163, + "grad_norm": 1.5613154689723483, + "learning_rate": 1.7455473864468887e-07, + "loss": 0.2834, + "step": 3008 + }, + { + "epoch": 0.9183580039676484, + "grad_norm": 1.6859199756351442, + "learning_rate": 1.7326208369261023e-07, + "loss": 0.464, + "step": 3009 + }, + { + "epoch": 0.9186632076911339, + "grad_norm": 1.7305503808738993, + "learning_rate": 1.7197414849056793e-07, + "loss": 0.3251, + "step": 3010 + }, + { + "epoch": 0.9189684114146193, + "grad_norm": 1.7233544143749935, + "learning_rate": 1.706909342979468e-07, + "loss": 0.4452, + "step": 3011 + }, + { + "epoch": 0.9192736151381047, + "grad_norm": 1.6595896034336173, + "learning_rate": 1.694124423695187e-07, + "loss": 0.3588, + "step": 3012 + }, + { + "epoch": 0.9195788188615901, + "grad_norm": 1.6874345837382116, + "learning_rate": 1.681386739554375e-07, + "loss": 0.3651, + "step": 3013 + }, + { + "epoch": 0.9198840225850755, + "grad_norm": 1.8692354598745753, + "learning_rate": 1.6686963030123683e-07, + "loss": 0.4362, + "step": 3014 + }, + { + "epoch": 0.9201892263085609, + "grad_norm": 1.5592937379506653, + "learning_rate": 1.656053126478313e-07, + "loss": 0.2329, + "step": 3015 + }, + { + "epoch": 0.9204944300320463, + "grad_norm": 1.9022517563902894, + "learning_rate": 1.6434572223151423e-07, + "loss": 0.2243, + "step": 3016 + }, + { + "epoch": 0.9207996337555319, + "grad_norm": 1.464763181378052, + "learning_rate": 1.6309086028395648e-07, + "loss": 0.2692, + "step": 3017 + }, + { + "epoch": 0.9211048374790173, + "grad_norm": 1.8338574902859095, + "learning_rate": 1.6184072803220485e-07, + "loss": 0.332, + "step": 3018 + }, + { + "epoch": 0.9214100412025027, + "grad_norm": 2.0532311909937064, + "learning_rate": 1.6059532669868204e-07, + "loss": 0.3979, + "step": 3019 + }, + { + "epoch": 0.9217152449259881, + "grad_norm": 1.6680209220927975, + "learning_rate": 1.5935465750118396e-07, + "loss": 0.1827, + "step": 3020 + }, + { + "epoch": 0.9220204486494735, + "grad_norm": 1.593221314947013, + "learning_rate": 1.58118721652879e-07, + "loss": 0.2762, + "step": 3021 + }, + { + "epoch": 0.9223256523729589, + "grad_norm": 1.5754246033424293, + "learning_rate": 1.5688752036230881e-07, + "loss": 0.2798, + "step": 3022 + }, + { + "epoch": 0.9226308560964444, + "grad_norm": 1.6539477776029077, + "learning_rate": 1.556610548333831e-07, + "loss": 0.3146, + "step": 3023 + }, + { + "epoch": 0.9229360598199298, + "grad_norm": 1.7674358163103439, + "learning_rate": 1.5443932626538317e-07, + "loss": 0.3525, + "step": 3024 + }, + { + "epoch": 0.9232412635434152, + "grad_norm": 1.6245734315139786, + "learning_rate": 1.5322233585295608e-07, + "loss": 0.3127, + "step": 3025 + }, + { + "epoch": 0.9235464672669007, + "grad_norm": 1.5218617432817847, + "learning_rate": 1.5201008478611723e-07, + "loss": 0.3303, + "step": 3026 + }, + { + "epoch": 0.9238516709903861, + "grad_norm": 1.4627888418728086, + "learning_rate": 1.508025742502478e-07, + "loss": 0.3914, + "step": 3027 + }, + { + "epoch": 0.9241568747138715, + "grad_norm": 1.7738867371365525, + "learning_rate": 1.495998054260922e-07, + "loss": 0.3212, + "step": 3028 + }, + { + "epoch": 0.924462078437357, + "grad_norm": 1.8511057101111497, + "learning_rate": 1.4840177948975964e-07, + "loss": 0.3011, + "step": 3029 + }, + { + "epoch": 0.9247672821608424, + "grad_norm": 1.8829133572612307, + "learning_rate": 1.4720849761272138e-07, + "loss": 0.4673, + "step": 3030 + }, + { + "epoch": 0.9250724858843278, + "grad_norm": 1.8364459693162152, + "learning_rate": 1.4601996096180792e-07, + "loss": 0.2928, + "step": 3031 + }, + { + "epoch": 0.9253776896078132, + "grad_norm": 1.8570840008809846, + "learning_rate": 1.4483617069921296e-07, + "loss": 0.4844, + "step": 3032 + }, + { + "epoch": 0.9256828933312986, + "grad_norm": 1.6856272867234632, + "learning_rate": 1.436571279824861e-07, + "loss": 0.2067, + "step": 3033 + }, + { + "epoch": 0.925988097054784, + "grad_norm": 1.8831068260007167, + "learning_rate": 1.4248283396453566e-07, + "loss": 0.3718, + "step": 3034 + }, + { + "epoch": 0.9262933007782695, + "grad_norm": 1.5416696221950486, + "learning_rate": 1.4131328979362813e-07, + "loss": 0.2602, + "step": 3035 + }, + { + "epoch": 0.926598504501755, + "grad_norm": 1.3032519729192462, + "learning_rate": 1.4014849661338258e-07, + "loss": 0.1346, + "step": 3036 + }, + { + "epoch": 0.9269037082252404, + "grad_norm": 1.5945893115883587, + "learning_rate": 1.3898845556277352e-07, + "loss": 0.2103, + "step": 3037 + }, + { + "epoch": 0.9272089119487258, + "grad_norm": 2.267391384429593, + "learning_rate": 1.378331677761302e-07, + "loss": 0.3795, + "step": 3038 + }, + { + "epoch": 0.9275141156722112, + "grad_norm": 1.667049109767016, + "learning_rate": 1.3668263438313177e-07, + "loss": 0.3487, + "step": 3039 + }, + { + "epoch": 0.9278193193956966, + "grad_norm": 1.8954270594757712, + "learning_rate": 1.3553685650880998e-07, + "loss": 0.3996, + "step": 3040 + }, + { + "epoch": 0.928124523119182, + "grad_norm": 1.7837118570036214, + "learning_rate": 1.3439583527354417e-07, + "loss": 0.2491, + "step": 3041 + }, + { + "epoch": 0.9284297268426674, + "grad_norm": 1.8231666030295597, + "learning_rate": 1.3325957179306625e-07, + "loss": 0.1648, + "step": 3042 + }, + { + "epoch": 0.9287349305661529, + "grad_norm": 1.771683485119366, + "learning_rate": 1.321280671784525e-07, + "loss": 0.3665, + "step": 3043 + }, + { + "epoch": 0.9290401342896384, + "grad_norm": 1.8934447643684014, + "learning_rate": 1.3100132253612675e-07, + "loss": 0.1926, + "step": 3044 + }, + { + "epoch": 0.9293453380131238, + "grad_norm": 1.907266439635279, + "learning_rate": 1.2987933896785932e-07, + "loss": 0.3228, + "step": 3045 + }, + { + "epoch": 0.9296505417366092, + "grad_norm": 2.2551799064647726, + "learning_rate": 1.2876211757076373e-07, + "loss": 0.3462, + "step": 3046 + }, + { + "epoch": 0.9299557454600946, + "grad_norm": 1.7869325977714636, + "learning_rate": 1.2764965943729724e-07, + "loss": 0.2372, + "step": 3047 + }, + { + "epoch": 0.93026094918358, + "grad_norm": 1.7434704765049673, + "learning_rate": 1.2654196565526077e-07, + "loss": 0.3657, + "step": 3048 + }, + { + "epoch": 0.9305661529070655, + "grad_norm": 2.217071758149955, + "learning_rate": 1.2543903730779406e-07, + "loss": 0.305, + "step": 3049 + }, + { + "epoch": 0.9308713566305509, + "grad_norm": 1.9085185201264485, + "learning_rate": 1.2434087547337824e-07, + "loss": 0.1456, + "step": 3050 + }, + { + "epoch": 0.9311765603540363, + "grad_norm": 1.8140467916592178, + "learning_rate": 1.232474812258344e-07, + "loss": 0.2688, + "step": 3051 + }, + { + "epoch": 0.9314817640775217, + "grad_norm": 3.537591961315723, + "learning_rate": 1.2215885563432117e-07, + "loss": 0.2682, + "step": 3052 + }, + { + "epoch": 0.9317869678010072, + "grad_norm": 1.68609744525017, + "learning_rate": 1.2107499976333314e-07, + "loss": 0.3311, + "step": 3053 + }, + { + "epoch": 0.9320921715244926, + "grad_norm": 1.7453938218503213, + "learning_rate": 1.1999591467270255e-07, + "loss": 0.2957, + "step": 3054 + }, + { + "epoch": 0.932397375247978, + "grad_norm": 1.8908716466504132, + "learning_rate": 1.1892160141759479e-07, + "loss": 0.4493, + "step": 3055 + }, + { + "epoch": 0.9327025789714635, + "grad_norm": 1.4688529932150731, + "learning_rate": 1.1785206104851122e-07, + "loss": 0.1949, + "step": 3056 + }, + { + "epoch": 0.9330077826949489, + "grad_norm": 1.7005755740395718, + "learning_rate": 1.1678729461128524e-07, + "loss": 0.4439, + "step": 3057 + }, + { + "epoch": 0.9333129864184343, + "grad_norm": 1.62211918398401, + "learning_rate": 1.1572730314708181e-07, + "loss": 0.1903, + "step": 3058 + }, + { + "epoch": 0.9336181901419197, + "grad_norm": 1.4639416167390211, + "learning_rate": 1.1467208769239624e-07, + "loss": 0.1994, + "step": 3059 + }, + { + "epoch": 0.9339233938654051, + "grad_norm": 1.7931644773342565, + "learning_rate": 1.1362164927905595e-07, + "loss": 0.4354, + "step": 3060 + }, + { + "epoch": 0.9342285975888905, + "grad_norm": 2.07586259951712, + "learning_rate": 1.1257598893421429e-07, + "loss": 0.4668, + "step": 3061 + }, + { + "epoch": 0.934533801312376, + "grad_norm": 1.7073213169330927, + "learning_rate": 1.1153510768035447e-07, + "loss": 0.2002, + "step": 3062 + }, + { + "epoch": 0.9348390050358615, + "grad_norm": 1.294379579617933, + "learning_rate": 1.1049900653528512e-07, + "loss": 0.3173, + "step": 3063 + }, + { + "epoch": 0.9351442087593469, + "grad_norm": 1.7475252364141403, + "learning_rate": 1.0946768651214245e-07, + "loss": 0.1948, + "step": 3064 + }, + { + "epoch": 0.9354494124828323, + "grad_norm": 1.772644403383591, + "learning_rate": 1.0844114861938648e-07, + "loss": 0.4511, + "step": 3065 + }, + { + "epoch": 0.9357546162063177, + "grad_norm": 1.7161844699737987, + "learning_rate": 1.0741939386080091e-07, + "loss": 0.3736, + "step": 3066 + }, + { + "epoch": 0.9360598199298031, + "grad_norm": 1.6309186547749837, + "learning_rate": 1.0640242323549266e-07, + "loss": 0.2649, + "step": 3067 + }, + { + "epoch": 0.9363650236532886, + "grad_norm": 1.7457238814732614, + "learning_rate": 1.0539023773789069e-07, + "loss": 0.5385, + "step": 3068 + }, + { + "epoch": 0.936670227376774, + "grad_norm": 1.4956914694554349, + "learning_rate": 1.0438283835774387e-07, + "loss": 0.2701, + "step": 3069 + }, + { + "epoch": 0.9369754311002594, + "grad_norm": 1.4626932456651007, + "learning_rate": 1.0338022608012365e-07, + "loss": 0.2981, + "step": 3070 + }, + { + "epoch": 0.9372806348237448, + "grad_norm": 1.6248818868113604, + "learning_rate": 1.0238240188541748e-07, + "loss": 0.5032, + "step": 3071 + }, + { + "epoch": 0.9375858385472303, + "grad_norm": 1.458176886023303, + "learning_rate": 1.0138936674933098e-07, + "loss": 0.2469, + "step": 3072 + }, + { + "epoch": 0.9378910422707157, + "grad_norm": 1.741700724522758, + "learning_rate": 1.0040112164289073e-07, + "loss": 0.3523, + "step": 3073 + }, + { + "epoch": 0.9381962459942012, + "grad_norm": 2.2307663166219363, + "learning_rate": 9.941766753243543e-08, + "loss": 0.3739, + "step": 3074 + }, + { + "epoch": 0.9385014497176866, + "grad_norm": 1.6894420469064177, + "learning_rate": 9.84390053796197e-08, + "loss": 0.2126, + "step": 3075 + }, + { + "epoch": 0.938806653441172, + "grad_norm": 1.8517504890392837, + "learning_rate": 9.746513614141362e-08, + "loss": 0.1941, + "step": 3076 + }, + { + "epoch": 0.9391118571646574, + "grad_norm": 1.7329188927338595, + "learning_rate": 9.649606077009988e-08, + "loss": 0.5339, + "step": 3077 + }, + { + "epoch": 0.9394170608881428, + "grad_norm": 1.5848102039576724, + "learning_rate": 9.553178021327326e-08, + "loss": 0.1557, + "step": 3078 + }, + { + "epoch": 0.9397222646116282, + "grad_norm": 1.8113953743059201, + "learning_rate": 9.457229541384117e-08, + "loss": 0.1842, + "step": 3079 + }, + { + "epoch": 0.9400274683351136, + "grad_norm": 1.6202392532140553, + "learning_rate": 9.361760731002035e-08, + "loss": 0.336, + "step": 3080 + }, + { + "epoch": 0.9403326720585992, + "grad_norm": 1.6042187504755832, + "learning_rate": 9.266771683533738e-08, + "loss": 0.3475, + "step": 3081 + }, + { + "epoch": 0.9406378757820846, + "grad_norm": 1.3922444889550674, + "learning_rate": 9.172262491862816e-08, + "loss": 0.2794, + "step": 3082 + }, + { + "epoch": 0.94094307950557, + "grad_norm": 2.1203607518574707, + "learning_rate": 9.078233248403568e-08, + "loss": 0.1426, + "step": 3083 + }, + { + "epoch": 0.9412482832290554, + "grad_norm": 1.6426230480024866, + "learning_rate": 8.984684045101055e-08, + "loss": 0.2625, + "step": 3084 + }, + { + "epoch": 0.9415534869525408, + "grad_norm": 1.6979257158221603, + "learning_rate": 8.891614973430829e-08, + "loss": 0.3273, + "step": 3085 + }, + { + "epoch": 0.9418586906760262, + "grad_norm": 1.9710500693423525, + "learning_rate": 8.799026124399146e-08, + "loss": 0.274, + "step": 3086 + }, + { + "epoch": 0.9421638943995116, + "grad_norm": 1.870458825680719, + "learning_rate": 8.706917588542418e-08, + "loss": 0.174, + "step": 3087 + }, + { + "epoch": 0.9424690981229971, + "grad_norm": 1.8847617668151897, + "learning_rate": 8.6152894559276e-08, + "loss": 0.3027, + "step": 3088 + }, + { + "epoch": 0.9427743018464825, + "grad_norm": 1.881554084103904, + "learning_rate": 8.524141816151854e-08, + "loss": 0.1893, + "step": 3089 + }, + { + "epoch": 0.943079505569968, + "grad_norm": 1.739369461100097, + "learning_rate": 8.433474758342441e-08, + "loss": 0.2814, + "step": 3090 + }, + { + "epoch": 0.9433847092934534, + "grad_norm": 1.6545810567967778, + "learning_rate": 8.343288371156665e-08, + "loss": 0.2362, + "step": 3091 + }, + { + "epoch": 0.9436899130169388, + "grad_norm": 1.619856102286329, + "learning_rate": 8.253582742782041e-08, + "loss": 0.3785, + "step": 3092 + }, + { + "epoch": 0.9439951167404242, + "grad_norm": 1.651335785317375, + "learning_rate": 8.164357960935675e-08, + "loss": 0.4398, + "step": 3093 + }, + { + "epoch": 0.9443003204639097, + "grad_norm": 1.7539490012754981, + "learning_rate": 8.07561411286456e-08, + "loss": 0.3187, + "step": 3094 + }, + { + "epoch": 0.9446055241873951, + "grad_norm": 1.8747356228407501, + "learning_rate": 7.987351285345668e-08, + "loss": 0.3133, + "step": 3095 + }, + { + "epoch": 0.9449107279108805, + "grad_norm": 2.0530899857738967, + "learning_rate": 7.899569564685294e-08, + "loss": 0.1711, + "step": 3096 + }, + { + "epoch": 0.9452159316343659, + "grad_norm": 1.5156446632604146, + "learning_rate": 7.812269036719444e-08, + "loss": 0.3256, + "step": 3097 + }, + { + "epoch": 0.9455211353578513, + "grad_norm": 1.529266707901095, + "learning_rate": 7.725449786813554e-08, + "loss": 0.2665, + "step": 3098 + }, + { + "epoch": 0.9458263390813368, + "grad_norm": 1.7932842035311636, + "learning_rate": 7.63911189986255e-08, + "loss": 0.4121, + "step": 3099 + }, + { + "epoch": 0.9461315428048223, + "grad_norm": 1.753857720262275, + "learning_rate": 7.553255460290399e-08, + "loss": 0.2428, + "step": 3100 + }, + { + "epoch": 0.9464367465283077, + "grad_norm": 1.9349085797539933, + "learning_rate": 7.467880552050721e-08, + "loss": 0.4053, + "step": 3101 + }, + { + "epoch": 0.9467419502517931, + "grad_norm": 1.716596329121474, + "learning_rate": 7.382987258625851e-08, + "loss": 0.173, + "step": 3102 + }, + { + "epoch": 0.9470471539752785, + "grad_norm": 1.974453249229349, + "learning_rate": 7.298575663027385e-08, + "loss": 0.313, + "step": 3103 + }, + { + "epoch": 0.9473523576987639, + "grad_norm": 1.6405383859205218, + "learning_rate": 7.214645847795909e-08, + "loss": 0.4044, + "step": 3104 + }, + { + "epoch": 0.9476575614222493, + "grad_norm": 1.7514331943448285, + "learning_rate": 7.131197895000941e-08, + "loss": 0.2446, + "step": 3105 + }, + { + "epoch": 0.9479627651457347, + "grad_norm": 1.4290549958459764, + "learning_rate": 7.048231886240653e-08, + "loss": 0.1826, + "step": 3106 + }, + { + "epoch": 0.9482679688692202, + "grad_norm": 1.2450984220399632, + "learning_rate": 6.965747902642095e-08, + "loss": 0.2012, + "step": 3107 + }, + { + "epoch": 0.9485731725927057, + "grad_norm": 1.4945845598921765, + "learning_rate": 6.883746024860971e-08, + "loss": 0.2961, + "step": 3108 + }, + { + "epoch": 0.9488783763161911, + "grad_norm": 1.728767333588549, + "learning_rate": 6.802226333081474e-08, + "loss": 0.4283, + "step": 3109 + }, + { + "epoch": 0.9491835800396765, + "grad_norm": 1.7245224167187219, + "learning_rate": 6.721188907016396e-08, + "loss": 0.3148, + "step": 3110 + }, + { + "epoch": 0.9494887837631619, + "grad_norm": 1.5365620551148431, + "learning_rate": 6.640633825906906e-08, + "loss": 0.2251, + "step": 3111 + }, + { + "epoch": 0.9497939874866473, + "grad_norm": 1.5399204709213705, + "learning_rate": 6.56056116852255e-08, + "loss": 0.3458, + "step": 3112 + }, + { + "epoch": 0.9500991912101328, + "grad_norm": 1.4310328089740316, + "learning_rate": 6.48097101316103e-08, + "loss": 0.1783, + "step": 3113 + }, + { + "epoch": 0.9504043949336182, + "grad_norm": 1.6172332020074571, + "learning_rate": 6.40186343764848e-08, + "loss": 0.1566, + "step": 3114 + }, + { + "epoch": 0.9507095986571036, + "grad_norm": 1.8201053485143524, + "learning_rate": 6.32323851933886e-08, + "loss": 0.4466, + "step": 3115 + }, + { + "epoch": 0.951014802380589, + "grad_norm": 1.608563412958756, + "learning_rate": 6.245096335114387e-08, + "loss": 0.2211, + "step": 3116 + }, + { + "epoch": 0.9513200061040745, + "grad_norm": 1.6555680364455492, + "learning_rate": 6.16743696138522e-08, + "loss": 0.2064, + "step": 3117 + }, + { + "epoch": 0.9516252098275599, + "grad_norm": 2.2928571787631715, + "learning_rate": 6.090260474089227e-08, + "loss": 0.3908, + "step": 3118 + }, + { + "epoch": 0.9519304135510454, + "grad_norm": 1.796168710970393, + "learning_rate": 6.013566948692317e-08, + "loss": 0.4037, + "step": 3119 + }, + { + "epoch": 0.9522356172745308, + "grad_norm": 1.7894655798058297, + "learning_rate": 5.9373564601880595e-08, + "loss": 0.3374, + "step": 3120 + }, + { + "epoch": 0.9525408209980162, + "grad_norm": 1.1926506172885767, + "learning_rate": 5.8616290830976194e-08, + "loss": 0.1459, + "step": 3121 + }, + { + "epoch": 0.9528460247215016, + "grad_norm": 1.9998501483091524, + "learning_rate": 5.786384891469932e-08, + "loss": 0.3278, + "step": 3122 + }, + { + "epoch": 0.953151228444987, + "grad_norm": 1.780004366965777, + "learning_rate": 5.711623958881196e-08, + "loss": 0.2719, + "step": 3123 + }, + { + "epoch": 0.9534564321684724, + "grad_norm": 1.649335065432672, + "learning_rate": 5.6373463584353806e-08, + "loss": 0.203, + "step": 3124 + }, + { + "epoch": 0.9537616358919578, + "grad_norm": 1.96435364816284, + "learning_rate": 5.563552162763608e-08, + "loss": 0.4157, + "step": 3125 + }, + { + "epoch": 0.9540668396154434, + "grad_norm": 1.6124475278375099, + "learning_rate": 5.490241444024325e-08, + "loss": 0.2539, + "step": 3126 + }, + { + "epoch": 0.9543720433389288, + "grad_norm": 1.9688108525635462, + "learning_rate": 5.4174142739033545e-08, + "loss": 0.2505, + "step": 3127 + }, + { + "epoch": 0.9546772470624142, + "grad_norm": 1.9030120114515199, + "learning_rate": 5.3450707236135656e-08, + "loss": 0.393, + "step": 3128 + }, + { + "epoch": 0.9549824507858996, + "grad_norm": 2.1274750824846382, + "learning_rate": 5.2732108638949285e-08, + "loss": 0.3988, + "step": 3129 + }, + { + "epoch": 0.955287654509385, + "grad_norm": 1.9063736922359853, + "learning_rate": 5.201834765014568e-08, + "loss": 0.2716, + "step": 3130 + }, + { + "epoch": 0.9555928582328704, + "grad_norm": 1.746942983543036, + "learning_rate": 5.130942496766433e-08, + "loss": 0.3204, + "step": 3131 + }, + { + "epoch": 0.9558980619563558, + "grad_norm": 1.8650127179323246, + "learning_rate": 5.0605341284713504e-08, + "loss": 0.3323, + "step": 3132 + }, + { + "epoch": 0.9562032656798413, + "grad_norm": 1.8057055223573084, + "learning_rate": 4.990609728977136e-08, + "loss": 0.2977, + "step": 3133 + }, + { + "epoch": 0.9565084694033267, + "grad_norm": 1.8039711855851885, + "learning_rate": 4.9211693666582625e-08, + "loss": 0.2697, + "step": 3134 + }, + { + "epoch": 0.9568136731268121, + "grad_norm": 1.7222014711353566, + "learning_rate": 4.85221310941586e-08, + "loss": 0.2201, + "step": 3135 + }, + { + "epoch": 0.9571188768502976, + "grad_norm": 1.473317450672231, + "learning_rate": 4.7837410246777125e-08, + "loss": 0.2856, + "step": 3136 + }, + { + "epoch": 0.957424080573783, + "grad_norm": 1.6241791926305946, + "learning_rate": 4.715753179398208e-08, + "loss": 0.2776, + "step": 3137 + }, + { + "epoch": 0.9577292842972684, + "grad_norm": 1.5922351444442016, + "learning_rate": 4.648249640058222e-08, + "loss": 0.2124, + "step": 3138 + }, + { + "epoch": 0.9580344880207539, + "grad_norm": 1.724860409603394, + "learning_rate": 4.5812304726649546e-08, + "loss": 0.3272, + "step": 3139 + }, + { + "epoch": 0.9583396917442393, + "grad_norm": 1.6747404836910174, + "learning_rate": 4.514695742752151e-08, + "loss": 0.3928, + "step": 3140 + }, + { + "epoch": 0.9586448954677247, + "grad_norm": 1.6626282721345953, + "learning_rate": 4.448645515379657e-08, + "loss": 0.3282, + "step": 3141 + }, + { + "epoch": 0.9589500991912101, + "grad_norm": 1.4388094204216442, + "learning_rate": 4.383079855133699e-08, + "loss": 0.2552, + "step": 3142 + }, + { + "epoch": 0.9592553029146955, + "grad_norm": 1.6750453942823442, + "learning_rate": 4.317998826126601e-08, + "loss": 0.4144, + "step": 3143 + }, + { + "epoch": 0.9595605066381809, + "grad_norm": 1.691395819017345, + "learning_rate": 4.253402491996905e-08, + "loss": 0.3391, + "step": 3144 + }, + { + "epoch": 0.9598657103616665, + "grad_norm": 1.6872227758940446, + "learning_rate": 4.1892909159090275e-08, + "loss": 0.2, + "step": 3145 + }, + { + "epoch": 0.9601709140851519, + "grad_norm": 1.603624161312316, + "learning_rate": 4.125664160553544e-08, + "loss": 0.3261, + "step": 3146 + }, + { + "epoch": 0.9604761178086373, + "grad_norm": 1.6339332348223208, + "learning_rate": 4.062522288146853e-08, + "loss": 0.228, + "step": 3147 + }, + { + "epoch": 0.9607813215321227, + "grad_norm": 1.4318885744634444, + "learning_rate": 3.9998653604311763e-08, + "loss": 0.3181, + "step": 3148 + }, + { + "epoch": 0.9610865252556081, + "grad_norm": 1.5052302093852121, + "learning_rate": 3.937693438674728e-08, + "loss": 0.2756, + "step": 3149 + }, + { + "epoch": 0.9613917289790935, + "grad_norm": 1.8201246696782578, + "learning_rate": 3.876006583671266e-08, + "loss": 0.4108, + "step": 3150 + }, + { + "epoch": 0.9616969327025789, + "grad_norm": 1.5679880275025952, + "learning_rate": 3.8148048557403174e-08, + "loss": 0.3295, + "step": 3151 + }, + { + "epoch": 0.9620021364260644, + "grad_norm": 1.647552976089387, + "learning_rate": 3.754088314727067e-08, + "loss": 0.4898, + "step": 3152 + }, + { + "epoch": 0.9623073401495498, + "grad_norm": 1.9339834726015996, + "learning_rate": 3.693857020002245e-08, + "loss": 0.3003, + "step": 3153 + }, + { + "epoch": 0.9626125438730353, + "grad_norm": 1.5214906430390787, + "learning_rate": 3.634111030462073e-08, + "loss": 0.1723, + "step": 3154 + }, + { + "epoch": 0.9629177475965207, + "grad_norm": 1.947860519769557, + "learning_rate": 3.574850404528152e-08, + "loss": 0.2321, + "step": 3155 + }, + { + "epoch": 0.9632229513200061, + "grad_norm": 1.377300115923031, + "learning_rate": 3.5160752001476286e-08, + "loss": 0.2842, + "step": 3156 + }, + { + "epoch": 0.9635281550434915, + "grad_norm": 1.7866288336382636, + "learning_rate": 3.457785474792974e-08, + "loss": 0.3875, + "step": 3157 + }, + { + "epoch": 0.963833358766977, + "grad_norm": 2.1990459367465216, + "learning_rate": 3.399981285461706e-08, + "loss": 0.3652, + "step": 3158 + }, + { + "epoch": 0.9641385624904624, + "grad_norm": 1.47797661131238, + "learning_rate": 3.342662688676945e-08, + "loss": 0.417, + "step": 3159 + }, + { + "epoch": 0.9644437662139478, + "grad_norm": 1.8380294244712645, + "learning_rate": 3.2858297404866344e-08, + "loss": 0.3075, + "step": 3160 + }, + { + "epoch": 0.9647489699374332, + "grad_norm": 1.9320740555812037, + "learning_rate": 3.2294824964640424e-08, + "loss": 0.2917, + "step": 3161 + }, + { + "epoch": 0.9650541736609186, + "grad_norm": 1.5679615699136098, + "learning_rate": 3.173621011707484e-08, + "loss": 0.3805, + "step": 3162 + }, + { + "epoch": 0.9653593773844041, + "grad_norm": 1.8590137547501744, + "learning_rate": 3.118245340840154e-08, + "loss": 0.2933, + "step": 3163 + }, + { + "epoch": 0.9656645811078896, + "grad_norm": 1.5404897355750176, + "learning_rate": 3.0633555380102933e-08, + "loss": 0.1512, + "step": 3164 + }, + { + "epoch": 0.965969784831375, + "grad_norm": 1.943192149879615, + "learning_rate": 3.0089516568910795e-08, + "loss": 0.5274, + "step": 3165 + }, + { + "epoch": 0.9662749885548604, + "grad_norm": 2.6836705092734, + "learning_rate": 2.9550337506804583e-08, + "loss": 0.2219, + "step": 3166 + }, + { + "epoch": 0.9665801922783458, + "grad_norm": 1.5473289979487448, + "learning_rate": 2.9016018721012562e-08, + "loss": 0.2946, + "step": 3167 + }, + { + "epoch": 0.9668853960018312, + "grad_norm": 1.492487772343577, + "learning_rate": 2.8486560734009573e-08, + "loss": 0.2715, + "step": 3168 + }, + { + "epoch": 0.9671905997253166, + "grad_norm": 1.6104415766610036, + "learning_rate": 2.796196406351759e-08, + "loss": 0.4105, + "step": 3169 + }, + { + "epoch": 0.967495803448802, + "grad_norm": 1.5281427737335482, + "learning_rate": 2.7442229222505724e-08, + "loss": 0.3069, + "step": 3170 + }, + { + "epoch": 0.9678010071722875, + "grad_norm": 1.7751502953956564, + "learning_rate": 2.692735671918856e-08, + "loss": 0.3891, + "step": 3171 + }, + { + "epoch": 0.968106210895773, + "grad_norm": 1.7123520111962385, + "learning_rate": 2.641734705702559e-08, + "loss": 0.47, + "step": 3172 + }, + { + "epoch": 0.9684114146192584, + "grad_norm": 1.3369111645529905, + "learning_rate": 2.5912200734722336e-08, + "loss": 0.316, + "step": 3173 + }, + { + "epoch": 0.9687166183427438, + "grad_norm": 1.6674558757794788, + "learning_rate": 2.541191824622813e-08, + "loss": 0.1658, + "step": 3174 + }, + { + "epoch": 0.9690218220662292, + "grad_norm": 1.827558772111376, + "learning_rate": 2.4916500080736094e-08, + "loss": 0.2408, + "step": 3175 + }, + { + "epoch": 0.9693270257897146, + "grad_norm": 1.5216273582175108, + "learning_rate": 2.4425946722683725e-08, + "loss": 0.3275, + "step": 3176 + }, + { + "epoch": 0.9696322295132, + "grad_norm": 3.150742512944137, + "learning_rate": 2.394025865174954e-08, + "loss": 0.3721, + "step": 3177 + }, + { + "epoch": 0.9699374332366855, + "grad_norm": 1.7535139634941797, + "learning_rate": 2.345943634285752e-08, + "loss": 0.1866, + "step": 3178 + }, + { + "epoch": 0.9702426369601709, + "grad_norm": 1.4921787121898464, + "learning_rate": 2.2983480266171586e-08, + "loss": 0.3459, + "step": 3179 + }, + { + "epoch": 0.9705478406836563, + "grad_norm": 1.3992680475442059, + "learning_rate": 2.251239088709778e-08, + "loss": 0.246, + "step": 3180 + }, + { + "epoch": 0.9708530444071418, + "grad_norm": 1.5859029230567245, + "learning_rate": 2.2046168666284284e-08, + "loss": 0.3011, + "step": 3181 + }, + { + "epoch": 0.9711582481306272, + "grad_norm": 1.7028286177302412, + "learning_rate": 2.1584814059618098e-08, + "loss": 0.377, + "step": 3182 + }, + { + "epoch": 0.9714634518541126, + "grad_norm": 2.0593667940430684, + "learning_rate": 2.1128327518227797e-08, + "loss": 0.3509, + "step": 3183 + }, + { + "epoch": 0.9717686555775981, + "grad_norm": 1.6996737167043607, + "learning_rate": 2.067670948848244e-08, + "loss": 0.5195, + "step": 3184 + }, + { + "epoch": 0.9720738593010835, + "grad_norm": 1.247752027949812, + "learning_rate": 2.022996041198877e-08, + "loss": 0.1139, + "step": 3185 + }, + { + "epoch": 0.9723790630245689, + "grad_norm": 1.5635366434232196, + "learning_rate": 1.978808072559346e-08, + "loss": 0.2432, + "step": 3186 + }, + { + "epoch": 0.9726842667480543, + "grad_norm": 1.9556147461058806, + "learning_rate": 1.9351070861381995e-08, + "loss": 0.3601, + "step": 3187 + }, + { + "epoch": 0.9729894704715397, + "grad_norm": 1.507737627295227, + "learning_rate": 1.891893124667754e-08, + "loss": 0.3307, + "step": 3188 + }, + { + "epoch": 0.9732946741950251, + "grad_norm": 1.8446244574541686, + "learning_rate": 1.8491662304041536e-08, + "loss": 0.2839, + "step": 3189 + }, + { + "epoch": 0.9735998779185107, + "grad_norm": 1.459507369709913, + "learning_rate": 1.8069264451270884e-08, + "loss": 0.2402, + "step": 3190 + }, + { + "epoch": 0.9739050816419961, + "grad_norm": 1.7408320195966298, + "learning_rate": 1.7651738101401862e-08, + "loss": 0.2565, + "step": 3191 + }, + { + "epoch": 0.9742102853654815, + "grad_norm": 1.7614963839448081, + "learning_rate": 1.7239083662705657e-08, + "loss": 0.422, + "step": 3192 + }, + { + "epoch": 0.9745154890889669, + "grad_norm": 1.9687097938912876, + "learning_rate": 1.6831301538689503e-08, + "loss": 0.2555, + "step": 3193 + }, + { + "epoch": 0.9748206928124523, + "grad_norm": 1.791694936603218, + "learning_rate": 1.642839212809777e-08, + "loss": 0.2699, + "step": 3194 + }, + { + "epoch": 0.9751258965359377, + "grad_norm": 1.8067707406692137, + "learning_rate": 1.6030355824908638e-08, + "loss": 0.3262, + "step": 3195 + }, + { + "epoch": 0.9754311002594231, + "grad_norm": 2.2498554607165793, + "learning_rate": 1.5637193018335218e-08, + "loss": 0.2446, + "step": 3196 + }, + { + "epoch": 0.9757363039829086, + "grad_norm": 1.5439681667347085, + "learning_rate": 1.5248904092826088e-08, + "loss": 0.3062, + "step": 3197 + }, + { + "epoch": 0.976041507706394, + "grad_norm": 1.9424093866999719, + "learning_rate": 1.486548942806365e-08, + "loss": 0.3704, + "step": 3198 + }, + { + "epoch": 0.9763467114298795, + "grad_norm": 1.6517890738670324, + "learning_rate": 1.4486949398963557e-08, + "loss": 0.3108, + "step": 3199 + }, + { + "epoch": 0.9766519151533649, + "grad_norm": 1.5416845458085093, + "learning_rate": 1.4113284375675273e-08, + "loss": 0.2964, + "step": 3200 + }, + { + "epoch": 0.9769571188768503, + "grad_norm": 1.5593385374893778, + "learning_rate": 1.3744494723581526e-08, + "loss": 0.3386, + "step": 3201 + }, + { + "epoch": 0.9772623226003357, + "grad_norm": 1.6733622130653938, + "learning_rate": 1.3380580803297183e-08, + "loss": 0.3067, + "step": 3202 + }, + { + "epoch": 0.9775675263238212, + "grad_norm": 1.5936790151308489, + "learning_rate": 1.302154297066982e-08, + "loss": 0.1751, + "step": 3203 + }, + { + "epoch": 0.9778727300473066, + "grad_norm": 1.8938042887847097, + "learning_rate": 1.2667381576779714e-08, + "loss": 0.3145, + "step": 3204 + }, + { + "epoch": 0.978177933770792, + "grad_norm": 1.985342675905468, + "learning_rate": 1.231809696793651e-08, + "loss": 0.3656, + "step": 3205 + }, + { + "epoch": 0.9784831374942774, + "grad_norm": 1.7338526171051585, + "learning_rate": 1.1973689485684226e-08, + "loss": 0.2738, + "step": 3206 + }, + { + "epoch": 0.9787883412177628, + "grad_norm": 2.329815449261681, + "learning_rate": 1.1634159466795691e-08, + "loss": 0.4249, + "step": 3207 + }, + { + "epoch": 0.9790935449412482, + "grad_norm": 1.5805277897757888, + "learning_rate": 1.1299507243274222e-08, + "loss": 0.2954, + "step": 3208 + }, + { + "epoch": 0.9793987486647338, + "grad_norm": 1.850668453903233, + "learning_rate": 1.0969733142355832e-08, + "loss": 0.3293, + "step": 3209 + }, + { + "epoch": 0.9797039523882192, + "grad_norm": 1.7705096977547878, + "learning_rate": 1.064483748650369e-08, + "loss": 0.3052, + "step": 3210 + }, + { + "epoch": 0.9800091561117046, + "grad_norm": 2.1184759407252858, + "learning_rate": 1.0324820593412554e-08, + "loss": 0.2946, + "step": 3211 + }, + { + "epoch": 0.98031435983519, + "grad_norm": 1.6815254978375824, + "learning_rate": 1.0009682776005447e-08, + "loss": 0.4507, + "step": 3212 + }, + { + "epoch": 0.9806195635586754, + "grad_norm": 1.815273399253701, + "learning_rate": 9.69942434243587e-09, + "loss": 0.3258, + "step": 3213 + }, + { + "epoch": 0.9809247672821608, + "grad_norm": 1.7076222250047222, + "learning_rate": 9.394045596083923e-09, + "loss": 0.2681, + "step": 3214 + }, + { + "epoch": 0.9812299710056462, + "grad_norm": 1.7427164138755578, + "learning_rate": 9.093546835560185e-09, + "loss": 0.3873, + "step": 3215 + }, + { + "epoch": 0.9815351747291317, + "grad_norm": 1.5944511516812474, + "learning_rate": 8.79792835470239e-09, + "loss": 0.2452, + "step": 3216 + }, + { + "epoch": 0.9818403784526171, + "grad_norm": 1.9765614747056073, + "learning_rate": 8.507190442577085e-09, + "loss": 0.2779, + "step": 3217 + }, + { + "epoch": 0.9821455821761026, + "grad_norm": 1.9152803867141897, + "learning_rate": 8.221333383476859e-09, + "loss": 0.3015, + "step": 3218 + }, + { + "epoch": 0.982450785899588, + "grad_norm": 1.8976746584555748, + "learning_rate": 7.94035745692312e-09, + "loss": 0.3744, + "step": 3219 + }, + { + "epoch": 0.9827559896230734, + "grad_norm": 1.6438120086925618, + "learning_rate": 7.664262937663314e-09, + "loss": 0.223, + "step": 3220 + }, + { + "epoch": 0.9830611933465588, + "grad_norm": 2.0132378925455385, + "learning_rate": 7.393050095672594e-09, + "loss": 0.2209, + "step": 3221 + }, + { + "epoch": 0.9833663970700443, + "grad_norm": 1.4970602792900094, + "learning_rate": 7.126719196152154e-09, + "loss": 0.2667, + "step": 3222 + }, + { + "epoch": 0.9836716007935297, + "grad_norm": 1.9208437626266621, + "learning_rate": 6.865270499529786e-09, + "loss": 0.2817, + "step": 3223 + }, + { + "epoch": 0.9839768045170151, + "grad_norm": 1.9665921207890467, + "learning_rate": 6.608704261457655e-09, + "loss": 0.2463, + "step": 3224 + }, + { + "epoch": 0.9842820082405005, + "grad_norm": 1.7756663025534054, + "learning_rate": 6.357020732816743e-09, + "loss": 0.2913, + "step": 3225 + }, + { + "epoch": 0.9845872119639859, + "grad_norm": 1.478284205437885, + "learning_rate": 6.110220159710744e-09, + "loss": 0.2409, + "step": 3226 + }, + { + "epoch": 0.9848924156874714, + "grad_norm": 1.9575325634076046, + "learning_rate": 5.868302783469948e-09, + "loss": 0.45, + "step": 3227 + }, + { + "epoch": 0.9851976194109568, + "grad_norm": 1.889011356491274, + "learning_rate": 5.631268840650128e-09, + "loss": 0.2935, + "step": 3228 + }, + { + "epoch": 0.9855028231344423, + "grad_norm": 1.7479822242758298, + "learning_rate": 5.399118563030325e-09, + "loss": 0.3301, + "step": 3229 + }, + { + "epoch": 0.9858080268579277, + "grad_norm": 1.6485662184083025, + "learning_rate": 5.1718521776150665e-09, + "loss": 0.3427, + "step": 3230 + }, + { + "epoch": 0.9861132305814131, + "grad_norm": 1.6028818046415272, + "learning_rate": 4.9494699066338084e-09, + "loss": 0.196, + "step": 3231 + }, + { + "epoch": 0.9864184343048985, + "grad_norm": 2.015249646153553, + "learning_rate": 4.731971967539828e-09, + "loss": 0.439, + "step": 3232 + }, + { + "epoch": 0.9867236380283839, + "grad_norm": 1.834696313638363, + "learning_rate": 4.519358573009114e-09, + "loss": 0.2964, + "step": 3233 + }, + { + "epoch": 0.9870288417518693, + "grad_norm": 2.1060195796813526, + "learning_rate": 4.3116299309425845e-09, + "loss": 0.437, + "step": 3234 + }, + { + "epoch": 0.9873340454753547, + "grad_norm": 1.6522688611963112, + "learning_rate": 4.108786244464979e-09, + "loss": 0.3729, + "step": 3235 + }, + { + "epoch": 0.9876392491988403, + "grad_norm": 1.8505254489001164, + "learning_rate": 3.910827711923193e-09, + "loss": 0.4158, + "step": 3236 + }, + { + "epoch": 0.9879444529223257, + "grad_norm": 1.5994443703529546, + "learning_rate": 3.71775452688794e-09, + "loss": 0.2584, + "step": 3237 + }, + { + "epoch": 0.9882496566458111, + "grad_norm": 2.123646024299586, + "learning_rate": 3.529566878153756e-09, + "loss": 0.2636, + "step": 3238 + }, + { + "epoch": 0.9885548603692965, + "grad_norm": 1.6746975611466164, + "learning_rate": 3.346264949735667e-09, + "loss": 0.2883, + "step": 3239 + }, + { + "epoch": 0.9888600640927819, + "grad_norm": 1.7888428665675282, + "learning_rate": 3.1678489208736286e-09, + "loss": 0.3256, + "step": 3240 + }, + { + "epoch": 0.9891652678162673, + "grad_norm": 1.7498548992492244, + "learning_rate": 2.994318966028087e-09, + "loss": 0.2499, + "step": 3241 + }, + { + "epoch": 0.9894704715397528, + "grad_norm": 1.6301950894956327, + "learning_rate": 2.825675254883309e-09, + "loss": 0.4248, + "step": 3242 + }, + { + "epoch": 0.9897756752632382, + "grad_norm": 1.6474807638570632, + "learning_rate": 2.661917952344606e-09, + "loss": 0.4375, + "step": 3243 + }, + { + "epoch": 0.9900808789867236, + "grad_norm": 1.5130833816949905, + "learning_rate": 2.503047218539445e-09, + "loss": 0.2229, + "step": 3244 + }, + { + "epoch": 0.9903860827102091, + "grad_norm": 1.710721633740625, + "learning_rate": 2.349063208816893e-09, + "loss": 0.3004, + "step": 3245 + }, + { + "epoch": 0.9906912864336945, + "grad_norm": 1.4238009142793833, + "learning_rate": 2.199966073748172e-09, + "loss": 0.1267, + "step": 3246 + }, + { + "epoch": 0.9909964901571799, + "grad_norm": 2.179942563901439, + "learning_rate": 2.0557559591255495e-09, + "loss": 0.3234, + "step": 3247 + }, + { + "epoch": 0.9913016938806654, + "grad_norm": 2.6007007168155325, + "learning_rate": 1.916433005962337e-09, + "loss": 0.3738, + "step": 3248 + }, + { + "epoch": 0.9916068976041508, + "grad_norm": 2.0242736738285108, + "learning_rate": 1.7819973504940024e-09, + "loss": 0.1664, + "step": 3249 + }, + { + "epoch": 0.9919121013276362, + "grad_norm": 1.610472147760261, + "learning_rate": 1.6524491241753927e-09, + "loss": 0.3363, + "step": 3250 + }, + { + "epoch": 0.9922173050511216, + "grad_norm": 2.081484724675542, + "learning_rate": 1.5277884536835098e-09, + "loss": 0.2283, + "step": 3251 + }, + { + "epoch": 0.992522508774607, + "grad_norm": 1.7297711139376675, + "learning_rate": 1.408015460916401e-09, + "loss": 0.2762, + "step": 3252 + }, + { + "epoch": 0.9928277124980924, + "grad_norm": 1.5367615050441692, + "learning_rate": 1.2931302629914933e-09, + "loss": 0.2469, + "step": 3253 + }, + { + "epoch": 0.993132916221578, + "grad_norm": 1.7292033578434607, + "learning_rate": 1.1831329722478135e-09, + "loss": 0.3093, + "step": 3254 + }, + { + "epoch": 0.9934381199450634, + "grad_norm": 1.6099734628753108, + "learning_rate": 1.0780236962448787e-09, + "loss": 0.2235, + "step": 3255 + }, + { + "epoch": 0.9937433236685488, + "grad_norm": 1.4622875646213087, + "learning_rate": 9.778025377610301e-10, + "loss": 0.2812, + "step": 3256 + }, + { + "epoch": 0.9940485273920342, + "grad_norm": 1.5434701611866395, + "learning_rate": 8.824695947967643e-10, + "loss": 0.4045, + "step": 3257 + }, + { + "epoch": 0.9943537311155196, + "grad_norm": 1.786815381491549, + "learning_rate": 7.920249605719576e-10, + "loss": 0.3155, + "step": 3258 + }, + { + "epoch": 0.994658934839005, + "grad_norm": 1.9817553672905523, + "learning_rate": 7.064687235264211e-10, + "loss": 0.2771, + "step": 3259 + }, + { + "epoch": 0.9949641385624904, + "grad_norm": 1.6066809020592328, + "learning_rate": 6.258009673199006e-10, + "loss": 0.2929, + "step": 3260 + }, + { + "epoch": 0.9952693422859759, + "grad_norm": 2.435620194396852, + "learning_rate": 5.500217708320765e-10, + "loss": 0.3177, + "step": 3261 + }, + { + "epoch": 0.9955745460094613, + "grad_norm": 1.328916605483517, + "learning_rate": 4.791312081620092e-10, + "loss": 0.322, + "step": 3262 + }, + { + "epoch": 0.9958797497329468, + "grad_norm": 1.7408370185083828, + "learning_rate": 4.131293486298038e-10, + "loss": 0.1873, + "step": 3263 + }, + { + "epoch": 0.9961849534564322, + "grad_norm": 1.4655986010811306, + "learning_rate": 3.52016256773835e-10, + "loss": 0.3255, + "step": 3264 + }, + { + "epoch": 0.9964901571799176, + "grad_norm": 1.5263478185751251, + "learning_rate": 2.9579199235241216e-10, + "loss": 0.2787, + "step": 3265 + }, + { + "epoch": 0.996795360903403, + "grad_norm": 1.676123239615716, + "learning_rate": 2.4445661034377957e-10, + "loss": 0.3825, + "step": 3266 + }, + { + "epoch": 0.9971005646268885, + "grad_norm": 1.8073283108414324, + "learning_rate": 1.9801016094556092e-10, + "loss": 0.2098, + "step": 3267 + }, + { + "epoch": 0.9974057683503739, + "grad_norm": 1.7580545894492703, + "learning_rate": 1.5645268957420467e-10, + "loss": 0.4099, + "step": 3268 + }, + { + "epoch": 0.9977109720738593, + "grad_norm": 1.900901796255548, + "learning_rate": 1.1978423686664908e-10, + "loss": 0.4666, + "step": 3269 + }, + { + "epoch": 0.9980161757973447, + "grad_norm": 1.5234889095141848, + "learning_rate": 8.800483867865694e-11, + "loss": 0.3132, + "step": 3270 + }, + { + "epoch": 0.9983213795208301, + "grad_norm": 1.4627559992084171, + "learning_rate": 6.111452608426049e-11, + "loss": 0.2896, + "step": 3271 + }, + { + "epoch": 0.9986265832443156, + "grad_norm": 2.0341419467364257, + "learning_rate": 3.911332537853696e-11, + "loss": 0.3179, + "step": 3272 + }, + { + "epoch": 0.998931786967801, + "grad_norm": 1.884623562691759, + "learning_rate": 2.200125807538811e-11, + "loss": 0.2754, + "step": 3273 + }, + { + "epoch": 0.9992369906912865, + "grad_norm": 1.8192708059580842, + "learning_rate": 9.77834090643004e-12, + "loss": 0.3777, + "step": 3274 + }, + { + "epoch": 0.9995421944147719, + "grad_norm": 1.852195020067947, + "learning_rate": 2.4445858243238305e-12, + "loss": 0.4301, + "step": 3275 + }, + { + "epoch": 0.9998473981382573, + "grad_norm": 1.7682018549462064, + "learning_rate": 0.0, + "loss": 0.3041, + "step": 3276 + }, + { + "epoch": 0.9998473981382573, + "step": 3276, + "total_flos": 407180904620032.0, + "train_loss": 0.3420985525812822, + "train_runtime": 61899.4198, + "train_samples_per_second": 1.694, + "train_steps_per_second": 0.053 + } + ], + "logging_steps": 1.0, + "max_steps": 3276, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 328, + "total_flos": 407180904620032.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}