diff --git "a/checkpoint-648/trainer_state.json" "b/checkpoint-648/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-648/trainer_state.json" @@ -0,0 +1,4569 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 648, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015432098765432098, + "grad_norm": 3.0899453163146973, + "learning_rate": 3.125e-06, + "loss": 2.8755, + "step": 1 + }, + { + "epoch": 0.0030864197530864196, + "grad_norm": 1.8920949697494507, + "learning_rate": 6.25e-06, + "loss": 2.4752, + "step": 2 + }, + { + "epoch": 0.004629629629629629, + "grad_norm": 1.8928515911102295, + "learning_rate": 9.375000000000001e-06, + "loss": 2.3472, + "step": 3 + }, + { + "epoch": 0.006172839506172839, + "grad_norm": 2.1841516494750977, + "learning_rate": 1.25e-05, + "loss": 2.5111, + "step": 4 + }, + { + "epoch": 0.007716049382716049, + "grad_norm": 1.9916893243789673, + "learning_rate": 1.5625e-05, + "loss": 2.1544, + "step": 5 + }, + { + "epoch": 0.009259259259259259, + "grad_norm": 1.8125377893447876, + "learning_rate": 1.8750000000000002e-05, + "loss": 2.2085, + "step": 6 + }, + { + "epoch": 0.010802469135802469, + "grad_norm": 2.27866530418396, + "learning_rate": 2.1875e-05, + "loss": 2.9925, + "step": 7 + }, + { + "epoch": 0.012345679012345678, + "grad_norm": 2.621497392654419, + "learning_rate": 2.5e-05, + "loss": 2.6221, + "step": 8 + }, + { + "epoch": 0.013888888888888888, + "grad_norm": 2.2457780838012695, + "learning_rate": 2.8125000000000003e-05, + "loss": 2.2464, + "step": 9 + }, + { + "epoch": 0.015432098765432098, + "grad_norm": 1.88410484790802, + "learning_rate": 3.125e-05, + "loss": 1.8552, + "step": 10 + }, + { + "epoch": 0.016975308641975308, + "grad_norm": 2.6654891967773438, + "learning_rate": 3.4375e-05, + "loss": 2.473, + "step": 11 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 2.3718037605285645, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.1753, + "step": 12 + }, + { + "epoch": 0.020061728395061727, + "grad_norm": 3.3965818881988525, + "learning_rate": 4.0625000000000005e-05, + "loss": 2.1306, + "step": 13 + }, + { + "epoch": 0.021604938271604937, + "grad_norm": 1.0532786846160889, + "learning_rate": 4.375e-05, + "loss": 1.7796, + "step": 14 + }, + { + "epoch": 0.023148148148148147, + "grad_norm": 2.0709316730499268, + "learning_rate": 4.6875e-05, + "loss": 1.9083, + "step": 15 + }, + { + "epoch": 0.024691358024691357, + "grad_norm": 2.6936681270599365, + "learning_rate": 5e-05, + "loss": 1.5981, + "step": 16 + }, + { + "epoch": 0.026234567901234566, + "grad_norm": 1.5314834117889404, + "learning_rate": 5.3125000000000004e-05, + "loss": 1.3591, + "step": 17 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 2.252319097518921, + "learning_rate": 5.6250000000000005e-05, + "loss": 1.1998, + "step": 18 + }, + { + "epoch": 0.029320987654320986, + "grad_norm": 1.2936722040176392, + "learning_rate": 5.9375e-05, + "loss": 1.2072, + "step": 19 + }, + { + "epoch": 0.030864197530864196, + "grad_norm": 1.1747729778289795, + "learning_rate": 6.25e-05, + "loss": 1.2882, + "step": 20 + }, + { + "epoch": 0.032407407407407406, + "grad_norm": 1.117002248764038, + "learning_rate": 6.562500000000001e-05, + "loss": 1.2283, + "step": 21 + }, + { + "epoch": 0.033950617283950615, + "grad_norm": 1.3000463247299194, + "learning_rate": 6.875e-05, + "loss": 0.9186, + "step": 22 + }, + { + "epoch": 0.035493827160493825, + "grad_norm": 1.3994625806808472, + "learning_rate": 7.1875e-05, + "loss": 1.2302, + "step": 23 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 1.2795201539993286, + "learning_rate": 7.500000000000001e-05, + "loss": 0.9952, + "step": 24 + }, + { + "epoch": 0.038580246913580245, + "grad_norm": 1.110682725906372, + "learning_rate": 7.8125e-05, + "loss": 0.8482, + "step": 25 + }, + { + "epoch": 0.040123456790123455, + "grad_norm": 0.8379050493240356, + "learning_rate": 8.125000000000001e-05, + "loss": 0.861, + "step": 26 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.6166597604751587, + "learning_rate": 8.4375e-05, + "loss": 1.1564, + "step": 27 + }, + { + "epoch": 0.043209876543209874, + "grad_norm": 0.46166491508483887, + "learning_rate": 8.75e-05, + "loss": 0.8898, + "step": 28 + }, + { + "epoch": 0.044753086419753084, + "grad_norm": 0.6219871640205383, + "learning_rate": 9.062500000000001e-05, + "loss": 0.6526, + "step": 29 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.9887904524803162, + "learning_rate": 9.375e-05, + "loss": 1.1225, + "step": 30 + }, + { + "epoch": 0.047839506172839504, + "grad_norm": 0.9143044948577881, + "learning_rate": 9.687500000000001e-05, + "loss": 0.3937, + "step": 31 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 0.7605987787246704, + "learning_rate": 0.0001, + "loss": 0.7167, + "step": 32 + }, + { + "epoch": 0.05092592592592592, + "grad_norm": 0.5838854312896729, + "learning_rate": 0.000103125, + "loss": 0.7857, + "step": 33 + }, + { + "epoch": 0.05246913580246913, + "grad_norm": 0.534980297088623, + "learning_rate": 0.00010625000000000001, + "loss": 0.5638, + "step": 34 + }, + { + "epoch": 0.05401234567901234, + "grad_norm": 0.9353653192520142, + "learning_rate": 0.000109375, + "loss": 0.6857, + "step": 35 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.5896846055984497, + "learning_rate": 0.00011250000000000001, + "loss": 0.279, + "step": 36 + }, + { + "epoch": 0.05709876543209876, + "grad_norm": 0.46166813373565674, + "learning_rate": 0.000115625, + "loss": 0.2871, + "step": 37 + }, + { + "epoch": 0.05864197530864197, + "grad_norm": 0.4007885456085205, + "learning_rate": 0.00011875, + "loss": 0.4575, + "step": 38 + }, + { + "epoch": 0.06018518518518518, + "grad_norm": 0.6551805734634399, + "learning_rate": 0.00012187500000000001, + "loss": 0.8253, + "step": 39 + }, + { + "epoch": 0.06172839506172839, + "grad_norm": 0.6135735511779785, + "learning_rate": 0.000125, + "loss": 0.1605, + "step": 40 + }, + { + "epoch": 0.06327160493827161, + "grad_norm": 0.5066695809364319, + "learning_rate": 0.000128125, + "loss": 0.7487, + "step": 41 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.8444607853889465, + "learning_rate": 0.00013125000000000002, + "loss": 0.5555, + "step": 42 + }, + { + "epoch": 0.06635802469135803, + "grad_norm": 0.6031410098075867, + "learning_rate": 0.000134375, + "loss": 0.7679, + "step": 43 + }, + { + "epoch": 0.06790123456790123, + "grad_norm": 0.46991780400276184, + "learning_rate": 0.0001375, + "loss": 0.4194, + "step": 44 + }, + { + "epoch": 0.06944444444444445, + "grad_norm": 0.40292972326278687, + "learning_rate": 0.00014062500000000002, + "loss": 0.1963, + "step": 45 + }, + { + "epoch": 0.07098765432098765, + "grad_norm": 0.6669200658798218, + "learning_rate": 0.00014375, + "loss": 0.8144, + "step": 46 + }, + { + "epoch": 0.07253086419753087, + "grad_norm": 0.3419376313686371, + "learning_rate": 0.000146875, + "loss": 0.6851, + "step": 47 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.5096787810325623, + "learning_rate": 0.00015000000000000001, + "loss": 0.5774, + "step": 48 + }, + { + "epoch": 0.07561728395061729, + "grad_norm": 0.37812817096710205, + "learning_rate": 0.000153125, + "loss": 0.4033, + "step": 49 + }, + { + "epoch": 0.07716049382716049, + "grad_norm": 0.6551339626312256, + "learning_rate": 0.00015625, + "loss": 1.2183, + "step": 50 + }, + { + "epoch": 0.0787037037037037, + "grad_norm": 0.2642059624195099, + "learning_rate": 0.000159375, + "loss": 0.1036, + "step": 51 + }, + { + "epoch": 0.08024691358024691, + "grad_norm": 0.6203199028968811, + "learning_rate": 0.00016250000000000002, + "loss": 0.7805, + "step": 52 + }, + { + "epoch": 0.08179012345679013, + "grad_norm": 0.4128793478012085, + "learning_rate": 0.000165625, + "loss": 0.469, + "step": 53 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.327178031206131, + "learning_rate": 0.00016875, + "loss": 1.1053, + "step": 54 + }, + { + "epoch": 0.08487654320987655, + "grad_norm": 0.25028762221336365, + "learning_rate": 0.00017187500000000002, + "loss": 0.458, + "step": 55 + }, + { + "epoch": 0.08641975308641975, + "grad_norm": 0.3341365456581116, + "learning_rate": 0.000175, + "loss": 0.5775, + "step": 56 + }, + { + "epoch": 0.08796296296296297, + "grad_norm": 0.4268285930156708, + "learning_rate": 0.000178125, + "loss": 0.2152, + "step": 57 + }, + { + "epoch": 0.08950617283950617, + "grad_norm": 0.3492703437805176, + "learning_rate": 0.00018125000000000001, + "loss": 1.2513, + "step": 58 + }, + { + "epoch": 0.09104938271604938, + "grad_norm": 0.49452951550483704, + "learning_rate": 0.000184375, + "loss": 0.8177, + "step": 59 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.2636454403400421, + "learning_rate": 0.0001875, + "loss": 0.7231, + "step": 60 + }, + { + "epoch": 0.0941358024691358, + "grad_norm": 0.2863665223121643, + "learning_rate": 0.000190625, + "loss": 0.1892, + "step": 61 + }, + { + "epoch": 0.09567901234567901, + "grad_norm": 1.0529048442840576, + "learning_rate": 0.00019375000000000002, + "loss": 0.4621, + "step": 62 + }, + { + "epoch": 0.09722222222222222, + "grad_norm": 0.3274974226951599, + "learning_rate": 0.000196875, + "loss": 0.5643, + "step": 63 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.2704814672470093, + "learning_rate": 0.0002, + "loss": 0.3614, + "step": 64 + }, + { + "epoch": 0.10030864197530864, + "grad_norm": 0.3830932676792145, + "learning_rate": 0.00019999855308503586, + "loss": 0.618, + "step": 65 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.290043443441391, + "learning_rate": 0.00019999421238201467, + "loss": 0.7141, + "step": 66 + }, + { + "epoch": 0.10339506172839506, + "grad_norm": 0.3828456699848175, + "learning_rate": 0.000199986978016549, + "loss": 0.5034, + "step": 67 + }, + { + "epoch": 0.10493827160493827, + "grad_norm": 0.326385498046875, + "learning_rate": 0.00019997685019798912, + "loss": 0.5742, + "step": 68 + }, + { + "epoch": 0.10648148148148148, + "grad_norm": 0.34104934334754944, + "learning_rate": 0.0001999638292194168, + "loss": 0.4535, + "step": 69 + }, + { + "epoch": 0.10802469135802469, + "grad_norm": 0.33013594150543213, + "learning_rate": 0.00019994791545763704, + "loss": 0.1104, + "step": 70 + }, + { + "epoch": 0.1095679012345679, + "grad_norm": 0.24851077795028687, + "learning_rate": 0.0001999291093731671, + "loss": 0.1701, + "step": 71 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.39759957790374756, + "learning_rate": 0.00019990741151022301, + "loss": 0.9863, + "step": 72 + }, + { + "epoch": 0.11265432098765432, + "grad_norm": 0.46530836820602417, + "learning_rate": 0.00019988282249670407, + "loss": 1.0482, + "step": 73 + }, + { + "epoch": 0.11419753086419752, + "grad_norm": 0.41774773597717285, + "learning_rate": 0.00019985534304417445, + "loss": 0.746, + "step": 74 + }, + { + "epoch": 0.11574074074074074, + "grad_norm": 0.3779691457748413, + "learning_rate": 0.00019982497394784284, + "loss": 0.7121, + "step": 75 + }, + { + "epoch": 0.11728395061728394, + "grad_norm": 0.27572616934776306, + "learning_rate": 0.00019979171608653924, + "loss": 0.4452, + "step": 76 + }, + { + "epoch": 0.11882716049382716, + "grad_norm": 0.4704704284667969, + "learning_rate": 0.00019975557042268953, + "loss": 0.7798, + "step": 77 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.35982805490493774, + "learning_rate": 0.0001997165380022878, + "loss": 0.8498, + "step": 78 + }, + { + "epoch": 0.12191358024691358, + "grad_norm": 0.4288599193096161, + "learning_rate": 0.00019967461995486587, + "loss": 0.7976, + "step": 79 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.3367895483970642, + "learning_rate": 0.00019962981749346078, + "loss": 0.1945, + "step": 80 + }, + { + "epoch": 0.125, + "grad_norm": 0.4507409632205963, + "learning_rate": 0.00019958213191457956, + "loss": 0.4913, + "step": 81 + }, + { + "epoch": 0.12654320987654322, + "grad_norm": 0.3567584753036499, + "learning_rate": 0.00019953156459816179, + "loss": 0.66, + "step": 82 + }, + { + "epoch": 0.12808641975308643, + "grad_norm": 0.3726179599761963, + "learning_rate": 0.00019947811700753954, + "loss": 0.4015, + "step": 83 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.2937561273574829, + "learning_rate": 0.0001994217906893952, + "loss": 0.4398, + "step": 84 + }, + { + "epoch": 0.13117283950617284, + "grad_norm": 0.3103060722351074, + "learning_rate": 0.00019936258727371666, + "loss": 0.1651, + "step": 85 + }, + { + "epoch": 0.13271604938271606, + "grad_norm": 0.36687710881233215, + "learning_rate": 0.00019930050847375008, + "loss": 0.4967, + "step": 86 + }, + { + "epoch": 0.13425925925925927, + "grad_norm": 0.7049203515052795, + "learning_rate": 0.0001992355560859503, + "loss": 0.9613, + "step": 87 + }, + { + "epoch": 0.13580246913580246, + "grad_norm": 0.2752722203731537, + "learning_rate": 0.000199167731989929, + "loss": 0.7139, + "step": 88 + }, + { + "epoch": 0.13734567901234568, + "grad_norm": 0.4479522705078125, + "learning_rate": 0.00019909703814840018, + "loss": 0.2407, + "step": 89 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.38699865341186523, + "learning_rate": 0.00019902347660712334, + "loss": 0.8037, + "step": 90 + }, + { + "epoch": 0.1404320987654321, + "grad_norm": 0.39821603894233704, + "learning_rate": 0.00019894704949484444, + "loss": 0.4912, + "step": 91 + }, + { + "epoch": 0.1419753086419753, + "grad_norm": 0.31964239478111267, + "learning_rate": 0.00019886775902323405, + "loss": 0.7093, + "step": 92 + }, + { + "epoch": 0.14351851851851852, + "grad_norm": 0.2825027108192444, + "learning_rate": 0.0001987856074868236, + "loss": 0.3929, + "step": 93 + }, + { + "epoch": 0.14506172839506173, + "grad_norm": 0.2643113136291504, + "learning_rate": 0.0001987005972629389, + "loss": 0.2032, + "step": 94 + }, + { + "epoch": 0.14660493827160495, + "grad_norm": 0.398742139339447, + "learning_rate": 0.00019861273081163113, + "loss": 0.3768, + "step": 95 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.3663339912891388, + "learning_rate": 0.00019852201067560606, + "loss": 0.5911, + "step": 96 + }, + { + "epoch": 0.14969135802469136, + "grad_norm": 0.14621390402317047, + "learning_rate": 0.0001984284394801501, + "loss": 0.0728, + "step": 97 + }, + { + "epoch": 0.15123456790123457, + "grad_norm": 0.4680364727973938, + "learning_rate": 0.0001983320199330545, + "loss": 0.5213, + "step": 98 + }, + { + "epoch": 0.1527777777777778, + "grad_norm": 0.3623412549495697, + "learning_rate": 0.00019823275482453698, + "loss": 0.635, + "step": 99 + }, + { + "epoch": 0.15432098765432098, + "grad_norm": 0.14720472693443298, + "learning_rate": 0.00019813064702716094, + "loss": 0.0862, + "step": 100 + }, + { + "epoch": 0.1558641975308642, + "grad_norm": 0.6224125027656555, + "learning_rate": 0.0001980256994957524, + "loss": 0.9097, + "step": 101 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.2866581082344055, + "learning_rate": 0.00019791791526731445, + "loss": 0.2315, + "step": 102 + }, + { + "epoch": 0.15895061728395063, + "grad_norm": 0.31025055050849915, + "learning_rate": 0.0001978072974609393, + "loss": 0.411, + "step": 103 + }, + { + "epoch": 0.16049382716049382, + "grad_norm": 0.20884157717227936, + "learning_rate": 0.0001976938492777182, + "loss": 0.2956, + "step": 104 + }, + { + "epoch": 0.16203703703703703, + "grad_norm": 0.2588401138782501, + "learning_rate": 0.00019757757400064855, + "loss": 0.1735, + "step": 105 + }, + { + "epoch": 0.16358024691358025, + "grad_norm": 0.3220229744911194, + "learning_rate": 0.0001974584749945392, + "loss": 0.5662, + "step": 106 + }, + { + "epoch": 0.16512345679012347, + "grad_norm": 0.2857765853404999, + "learning_rate": 0.00019733655570591276, + "loss": 0.3836, + "step": 107 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.5752654075622559, + "learning_rate": 0.00019721181966290613, + "loss": 0.4568, + "step": 108 + }, + { + "epoch": 0.16820987654320987, + "grad_norm": 0.7308361530303955, + "learning_rate": 0.00019708427047516826, + "loss": 0.7291, + "step": 109 + }, + { + "epoch": 0.1697530864197531, + "grad_norm": 0.27257639169692993, + "learning_rate": 0.00019695391183375573, + "loss": 0.3602, + "step": 110 + }, + { + "epoch": 0.1712962962962963, + "grad_norm": 0.320433109998703, + "learning_rate": 0.00019682074751102587, + "loss": 0.7321, + "step": 111 + }, + { + "epoch": 0.1728395061728395, + "grad_norm": 0.28458547592163086, + "learning_rate": 0.00019668478136052774, + "loss": 0.53, + "step": 112 + }, + { + "epoch": 0.1743827160493827, + "grad_norm": 0.5635747909545898, + "learning_rate": 0.00019654601731689054, + "loss": 0.5709, + "step": 113 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.3742586672306061, + "learning_rate": 0.00019640445939570958, + "loss": 0.7744, + "step": 114 + }, + { + "epoch": 0.17746913580246915, + "grad_norm": 0.3344044089317322, + "learning_rate": 0.00019626011169343043, + "loss": 0.7734, + "step": 115 + }, + { + "epoch": 0.17901234567901234, + "grad_norm": 0.2581966817378998, + "learning_rate": 0.0001961129783872301, + "loss": 0.4266, + "step": 116 + }, + { + "epoch": 0.18055555555555555, + "grad_norm": 0.37699440121650696, + "learning_rate": 0.0001959630637348962, + "loss": 0.4068, + "step": 117 + }, + { + "epoch": 0.18209876543209877, + "grad_norm": 0.29423436522483826, + "learning_rate": 0.00019581037207470382, + "loss": 0.7209, + "step": 118 + }, + { + "epoch": 0.18364197530864199, + "grad_norm": 0.2644597589969635, + "learning_rate": 0.00019565490782528995, + "loss": 0.3857, + "step": 119 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.24177905917167664, + "learning_rate": 0.00019549667548552556, + "loss": 0.3959, + "step": 120 + }, + { + "epoch": 0.1867283950617284, + "grad_norm": 0.20570896565914154, + "learning_rate": 0.0001953356796343854, + "loss": 0.4221, + "step": 121 + }, + { + "epoch": 0.1882716049382716, + "grad_norm": 0.30008113384246826, + "learning_rate": 0.00019517192493081565, + "loss": 0.1873, + "step": 122 + }, + { + "epoch": 0.18981481481481483, + "grad_norm": 0.31316304206848145, + "learning_rate": 0.0001950054161135989, + "loss": 0.6517, + "step": 123 + }, + { + "epoch": 0.19135802469135801, + "grad_norm": 0.5588698983192444, + "learning_rate": 0.00019483615800121716, + "loss": 0.8951, + "step": 124 + }, + { + "epoch": 0.19290123456790123, + "grad_norm": 0.2321777492761612, + "learning_rate": 0.00019466415549171235, + "loss": 0.4064, + "step": 125 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.2301969975233078, + "learning_rate": 0.00019448941356254454, + "loss": 0.3123, + "step": 126 + }, + { + "epoch": 0.19598765432098766, + "grad_norm": 0.253325879573822, + "learning_rate": 0.00019431193727044796, + "loss": 0.1832, + "step": 127 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.33053892850875854, + "learning_rate": 0.00019413173175128473, + "loss": 0.8404, + "step": 128 + }, + { + "epoch": 0.19907407407407407, + "grad_norm": 0.2916980981826782, + "learning_rate": 0.00019394880221989603, + "loss": 0.4437, + "step": 129 + }, + { + "epoch": 0.2006172839506173, + "grad_norm": 0.27841269969940186, + "learning_rate": 0.0001937631539699514, + "loss": 0.7476, + "step": 130 + }, + { + "epoch": 0.2021604938271605, + "grad_norm": 0.21882717311382294, + "learning_rate": 0.0001935747923737955, + "loss": 0.445, + "step": 131 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.3397207260131836, + "learning_rate": 0.0001933837228822925, + "loss": 0.6737, + "step": 132 + }, + { + "epoch": 0.2052469135802469, + "grad_norm": 0.19676990807056427, + "learning_rate": 0.00019318995102466863, + "loss": 0.2219, + "step": 133 + }, + { + "epoch": 0.20679012345679013, + "grad_norm": 0.31804370880126953, + "learning_rate": 0.00019299348240835182, + "loss": 0.6305, + "step": 134 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.30603691935539246, + "learning_rate": 0.0001927943227188097, + "loss": 0.3504, + "step": 135 + }, + { + "epoch": 0.20987654320987653, + "grad_norm": 0.2855909764766693, + "learning_rate": 0.000192592477719385, + "loss": 0.441, + "step": 136 + }, + { + "epoch": 0.21141975308641975, + "grad_norm": 0.3355270028114319, + "learning_rate": 0.0001923879532511287, + "loss": 0.557, + "step": 137 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.24465712904930115, + "learning_rate": 0.00019218075523263104, + "loss": 0.4268, + "step": 138 + }, + { + "epoch": 0.21450617283950618, + "grad_norm": 0.17175762355327606, + "learning_rate": 0.00019197088965985034, + "loss": 0.1277, + "step": 139 + }, + { + "epoch": 0.21604938271604937, + "grad_norm": 0.43302610516548157, + "learning_rate": 0.00019175836260593938, + "loss": 0.5849, + "step": 140 + }, + { + "epoch": 0.2175925925925926, + "grad_norm": 0.301595002412796, + "learning_rate": 0.0001915431802210696, + "loss": 0.4466, + "step": 141 + }, + { + "epoch": 0.2191358024691358, + "grad_norm": 0.25868454575538635, + "learning_rate": 0.00019132534873225323, + "loss": 0.198, + "step": 142 + }, + { + "epoch": 0.22067901234567902, + "grad_norm": 0.24638496339321136, + "learning_rate": 0.00019110487444316316, + "loss": 0.4116, + "step": 143 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.23242460191249847, + "learning_rate": 0.0001908817637339503, + "loss": 0.8272, + "step": 144 + }, + { + "epoch": 0.22376543209876543, + "grad_norm": 0.35410597920417786, + "learning_rate": 0.00019065602306105915, + "loss": 0.9215, + "step": 145 + }, + { + "epoch": 0.22530864197530864, + "grad_norm": 0.2640796899795532, + "learning_rate": 0.00019042765895704085, + "loss": 0.7417, + "step": 146 + }, + { + "epoch": 0.22685185185185186, + "grad_norm": 0.3106350898742676, + "learning_rate": 0.00019019667803036417, + "loss": 0.7736, + "step": 147 + }, + { + "epoch": 0.22839506172839505, + "grad_norm": 0.2846841812133789, + "learning_rate": 0.00018996308696522433, + "loss": 0.9721, + "step": 148 + }, + { + "epoch": 0.22993827160493827, + "grad_norm": 0.2250329554080963, + "learning_rate": 0.0001897268925213495, + "loss": 0.5244, + "step": 149 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.23303017020225525, + "learning_rate": 0.00018948810153380513, + "loss": 0.27, + "step": 150 + }, + { + "epoch": 0.2330246913580247, + "grad_norm": 0.2550167143344879, + "learning_rate": 0.0001892467209127963, + "loss": 0.3703, + "step": 151 + }, + { + "epoch": 0.2345679012345679, + "grad_norm": 0.24124033749103546, + "learning_rate": 0.00018900275764346768, + "loss": 0.1151, + "step": 152 + }, + { + "epoch": 0.2361111111111111, + "grad_norm": 0.4303573966026306, + "learning_rate": 0.00018875621878570135, + "loss": 0.4389, + "step": 153 + }, + { + "epoch": 0.23765432098765432, + "grad_norm": 0.26327794790267944, + "learning_rate": 0.00018850711147391257, + "loss": 0.5479, + "step": 154 + }, + { + "epoch": 0.23919753086419754, + "grad_norm": 0.2795560359954834, + "learning_rate": 0.0001882554429168433, + "loss": 0.1773, + "step": 155 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.35413095355033875, + "learning_rate": 0.00018800122039735358, + "loss": 0.6134, + "step": 156 + }, + { + "epoch": 0.24228395061728394, + "grad_norm": 0.15214578807353973, + "learning_rate": 0.0001877444512722107, + "loss": 0.1121, + "step": 157 + }, + { + "epoch": 0.24382716049382716, + "grad_norm": 0.3649609088897705, + "learning_rate": 0.00018748514297187648, + "loss": 0.4455, + "step": 158 + }, + { + "epoch": 0.24537037037037038, + "grad_norm": 0.28007131814956665, + "learning_rate": 0.00018722330300029213, + "loss": 0.7281, + "step": 159 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 1.0286228656768799, + "learning_rate": 0.0001869589389346611, + "loss": 0.8545, + "step": 160 + }, + { + "epoch": 0.24845679012345678, + "grad_norm": 0.30538541078567505, + "learning_rate": 0.00018669205842522984, + "loss": 0.621, + "step": 161 + }, + { + "epoch": 0.25, + "grad_norm": 0.34907880425453186, + "learning_rate": 0.00018642266919506644, + "loss": 0.9946, + "step": 162 + }, + { + "epoch": 0.2515432098765432, + "grad_norm": 0.2580735683441162, + "learning_rate": 0.00018615077903983703, + "loss": 0.5254, + "step": 163 + }, + { + "epoch": 0.25308641975308643, + "grad_norm": 0.2873341143131256, + "learning_rate": 0.00018587639582758031, + "loss": 0.4463, + "step": 164 + }, + { + "epoch": 0.25462962962962965, + "grad_norm": 0.34044674038887024, + "learning_rate": 0.00018559952749847976, + "loss": 0.3363, + "step": 165 + }, + { + "epoch": 0.25617283950617287, + "grad_norm": 0.27595579624176025, + "learning_rate": 0.000185320182064634, + "loss": 0.7043, + "step": 166 + }, + { + "epoch": 0.25771604938271603, + "grad_norm": 0.23352336883544922, + "learning_rate": 0.00018503836760982477, + "loss": 0.3758, + "step": 167 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.35870033502578735, + "learning_rate": 0.00018475409228928312, + "loss": 0.7854, + "step": 168 + }, + { + "epoch": 0.26080246913580246, + "grad_norm": 0.3221752941608429, + "learning_rate": 0.00018446736432945336, + "loss": 0.7738, + "step": 169 + }, + { + "epoch": 0.2623456790123457, + "grad_norm": 0.20870216190814972, + "learning_rate": 0.00018417819202775495, + "loss": 0.1124, + "step": 170 + }, + { + "epoch": 0.2638888888888889, + "grad_norm": 0.3498900830745697, + "learning_rate": 0.00018388658375234255, + "loss": 0.6496, + "step": 171 + }, + { + "epoch": 0.2654320987654321, + "grad_norm": 0.18623869121074677, + "learning_rate": 0.0001835925479418637, + "loss": 0.5837, + "step": 172 + }, + { + "epoch": 0.26697530864197533, + "grad_norm": 0.3215937316417694, + "learning_rate": 0.0001832960931052147, + "loss": 0.3791, + "step": 173 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.2414589524269104, + "learning_rate": 0.00018299722782129428, + "loss": 0.3661, + "step": 174 + }, + { + "epoch": 0.2700617283950617, + "grad_norm": 0.3455985486507416, + "learning_rate": 0.00018269596073875556, + "loss": 0.9395, + "step": 175 + }, + { + "epoch": 0.2716049382716049, + "grad_norm": 0.3490588366985321, + "learning_rate": 0.00018239230057575542, + "loss": 0.7401, + "step": 176 + }, + { + "epoch": 0.27314814814814814, + "grad_norm": 0.4741373658180237, + "learning_rate": 0.00018208625611970268, + "loss": 0.8191, + "step": 177 + }, + { + "epoch": 0.27469135802469136, + "grad_norm": 0.21905654668807983, + "learning_rate": 0.00018177783622700327, + "loss": 0.1344, + "step": 178 + }, + { + "epoch": 0.2762345679012346, + "grad_norm": 0.32941365242004395, + "learning_rate": 0.00018146704982280443, + "loss": 0.2385, + "step": 179 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.2891600430011749, + "learning_rate": 0.0001811539059007361, + "loss": 1.0453, + "step": 180 + }, + { + "epoch": 0.279320987654321, + "grad_norm": 0.23251338303089142, + "learning_rate": 0.00018083841352265086, + "loss": 0.2319, + "step": 181 + }, + { + "epoch": 0.2808641975308642, + "grad_norm": 0.2583518922328949, + "learning_rate": 0.00018052058181836151, + "loss": 0.3412, + "step": 182 + }, + { + "epoch": 0.2824074074074074, + "grad_norm": 0.24608013033866882, + "learning_rate": 0.00018020041998537709, + "loss": 0.3973, + "step": 183 + }, + { + "epoch": 0.2839506172839506, + "grad_norm": 0.2832016944885254, + "learning_rate": 0.00017987793728863651, + "loss": 0.1913, + "step": 184 + }, + { + "epoch": 0.2854938271604938, + "grad_norm": 0.20638220012187958, + "learning_rate": 0.00017955314306024054, + "loss": 0.5474, + "step": 185 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.18994338810443878, + "learning_rate": 0.0001792260466991818, + "loss": 0.1202, + "step": 186 + }, + { + "epoch": 0.28858024691358025, + "grad_norm": 0.29948702454566956, + "learning_rate": 0.00017889665767107266, + "loss": 0.3723, + "step": 187 + }, + { + "epoch": 0.29012345679012347, + "grad_norm": 0.2140231728553772, + "learning_rate": 0.00017856498550787144, + "loss": 0.3537, + "step": 188 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.15519015491008759, + "learning_rate": 0.0001782310398076064, + "loss": 0.3244, + "step": 189 + }, + { + "epoch": 0.2932098765432099, + "grad_norm": 0.20095421373844147, + "learning_rate": 0.00017789483023409824, + "loss": 0.1491, + "step": 190 + }, + { + "epoch": 0.29475308641975306, + "grad_norm": 0.29168039560317993, + "learning_rate": 0.00017755636651668012, + "loss": 0.7804, + "step": 191 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.2059430629014969, + "learning_rate": 0.00017721565844991643, + "loss": 0.3672, + "step": 192 + }, + { + "epoch": 0.2978395061728395, + "grad_norm": 0.332441121339798, + "learning_rate": 0.00017687271589331922, + "loss": 0.8577, + "step": 193 + }, + { + "epoch": 0.2993827160493827, + "grad_norm": 0.3359125256538391, + "learning_rate": 0.00017652754877106274, + "loss": 1.0497, + "step": 194 + }, + { + "epoch": 0.30092592592592593, + "grad_norm": 0.20881088078022003, + "learning_rate": 0.00017618016707169658, + "loss": 0.4744, + "step": 195 + }, + { + "epoch": 0.30246913580246915, + "grad_norm": 0.26887422800064087, + "learning_rate": 0.00017583058084785625, + "loss": 0.4659, + "step": 196 + }, + { + "epoch": 0.30401234567901236, + "grad_norm": 0.2714681327342987, + "learning_rate": 0.00017547880021597255, + "loss": 0.2655, + "step": 197 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.3164792060852051, + "learning_rate": 0.00017512483535597867, + "loss": 0.3755, + "step": 198 + }, + { + "epoch": 0.30709876543209874, + "grad_norm": 0.2882292866706848, + "learning_rate": 0.0001747686965110157, + "loss": 0.7344, + "step": 199 + }, + { + "epoch": 0.30864197530864196, + "grad_norm": 0.3346741199493408, + "learning_rate": 0.00017441039398713608, + "loss": 0.7048, + "step": 200 + }, + { + "epoch": 0.3101851851851852, + "grad_norm": 0.222006693482399, + "learning_rate": 0.0001740499381530055, + "loss": 0.3291, + "step": 201 + }, + { + "epoch": 0.3117283950617284, + "grad_norm": 0.3937174081802368, + "learning_rate": 0.00017368733943960276, + "loss": 0.6937, + "step": 202 + }, + { + "epoch": 0.3132716049382716, + "grad_norm": 0.3658754229545593, + "learning_rate": 0.00017332260833991794, + "loss": 0.3381, + "step": 203 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.4009222388267517, + "learning_rate": 0.00017295575540864877, + "loss": 0.5015, + "step": 204 + }, + { + "epoch": 0.31635802469135804, + "grad_norm": 0.3426806330680847, + "learning_rate": 0.00017258679126189516, + "loss": 0.8474, + "step": 205 + }, + { + "epoch": 0.31790123456790126, + "grad_norm": 0.16572719812393188, + "learning_rate": 0.00017221572657685205, + "loss": 0.5244, + "step": 206 + }, + { + "epoch": 0.3194444444444444, + "grad_norm": 0.15256325900554657, + "learning_rate": 0.00017184257209150027, + "loss": 0.1753, + "step": 207 + }, + { + "epoch": 0.32098765432098764, + "grad_norm": 0.20548087358474731, + "learning_rate": 0.00017146733860429612, + "loss": 0.3968, + "step": 208 + }, + { + "epoch": 0.32253086419753085, + "grad_norm": 0.3938625454902649, + "learning_rate": 0.00017109003697385843, + "loss": 0.7633, + "step": 209 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.46723672747612, + "learning_rate": 0.00017071067811865476, + "loss": 0.5869, + "step": 210 + }, + { + "epoch": 0.3256172839506173, + "grad_norm": 0.3297702670097351, + "learning_rate": 0.0001703292730166852, + "loss": 0.3302, + "step": 211 + }, + { + "epoch": 0.3271604938271605, + "grad_norm": 0.22976566851139069, + "learning_rate": 0.0001699458327051647, + "loss": 0.3393, + "step": 212 + }, + { + "epoch": 0.3287037037037037, + "grad_norm": 0.24704602360725403, + "learning_rate": 0.00016956036828020383, + "loss": 0.5944, + "step": 213 + }, + { + "epoch": 0.33024691358024694, + "grad_norm": 0.28030627965927124, + "learning_rate": 0.0001691728908964874, + "loss": 0.675, + "step": 214 + }, + { + "epoch": 0.3317901234567901, + "grad_norm": 0.3028554320335388, + "learning_rate": 0.00016878341176695196, + "loss": 0.4749, + "step": 215 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.42839252948760986, + "learning_rate": 0.00016839194216246108, + "loss": 0.7486, + "step": 216 + }, + { + "epoch": 0.33487654320987653, + "grad_norm": 0.47900381684303284, + "learning_rate": 0.00016799849341147939, + "loss": 1.1869, + "step": 217 + }, + { + "epoch": 0.33641975308641975, + "grad_norm": 0.480977326631546, + "learning_rate": 0.0001676030768997445, + "loss": 0.5782, + "step": 218 + }, + { + "epoch": 0.33796296296296297, + "grad_norm": 0.42303532361984253, + "learning_rate": 0.00016720570406993787, + "loss": 1.0646, + "step": 219 + }, + { + "epoch": 0.3395061728395062, + "grad_norm": 0.1927005499601364, + "learning_rate": 0.00016680638642135336, + "loss": 0.6238, + "step": 220 + }, + { + "epoch": 0.3410493827160494, + "grad_norm": 0.2654666602611542, + "learning_rate": 0.00016640513550956456, + "loss": 0.3746, + "step": 221 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.11220613867044449, + "learning_rate": 0.00016600196294609045, + "loss": 0.0814, + "step": 222 + }, + { + "epoch": 0.3441358024691358, + "grad_norm": 0.22100581228733063, + "learning_rate": 0.00016559688039805936, + "loss": 0.3315, + "step": 223 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 0.2173229157924652, + "learning_rate": 0.00016518989958787126, + "loss": 0.3058, + "step": 224 + }, + { + "epoch": 0.3472222222222222, + "grad_norm": 0.2455485761165619, + "learning_rate": 0.00016478103229285867, + "loss": 0.1495, + "step": 225 + }, + { + "epoch": 0.3487654320987654, + "grad_norm": 0.23965489864349365, + "learning_rate": 0.00016437029034494573, + "loss": 0.1245, + "step": 226 + }, + { + "epoch": 0.35030864197530864, + "grad_norm": 0.2711711823940277, + "learning_rate": 0.00016395768563030584, + "loss": 0.6607, + "step": 227 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.4605553150177002, + "learning_rate": 0.00016354323008901776, + "loss": 0.7805, + "step": 228 + }, + { + "epoch": 0.3533950617283951, + "grad_norm": 0.47439953684806824, + "learning_rate": 0.00016312693571471994, + "loss": 0.5941, + "step": 229 + }, + { + "epoch": 0.3549382716049383, + "grad_norm": 0.2883051335811615, + "learning_rate": 0.0001627088145542636, + "loss": 0.7769, + "step": 230 + }, + { + "epoch": 0.35648148148148145, + "grad_norm": 0.17247015237808228, + "learning_rate": 0.00016228887870736397, + "loss": 0.1281, + "step": 231 + }, + { + "epoch": 0.35802469135802467, + "grad_norm": 0.3834638297557831, + "learning_rate": 0.00016186714032625035, + "loss": 0.7709, + "step": 232 + }, + { + "epoch": 0.3595679012345679, + "grad_norm": 0.38721486926078796, + "learning_rate": 0.00016144361161531419, + "loss": 0.1536, + "step": 233 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.18458816409111023, + "learning_rate": 0.00016101830483075604, + "loss": 0.1103, + "step": 234 + }, + { + "epoch": 0.3626543209876543, + "grad_norm": 0.4738491475582123, + "learning_rate": 0.000160591232280231, + "loss": 0.6744, + "step": 235 + }, + { + "epoch": 0.36419753086419754, + "grad_norm": 0.5933437943458557, + "learning_rate": 0.00016016240632249224, + "loss": 0.3035, + "step": 236 + }, + { + "epoch": 0.36574074074074076, + "grad_norm": 0.2928576171398163, + "learning_rate": 0.0001597318393670338, + "loss": 0.1703, + "step": 237 + }, + { + "epoch": 0.36728395061728397, + "grad_norm": 0.21918344497680664, + "learning_rate": 0.00015929954387373103, + "loss": 0.1423, + "step": 238 + }, + { + "epoch": 0.36882716049382713, + "grad_norm": 0.23297619819641113, + "learning_rate": 0.00015886553235248027, + "loss": 0.373, + "step": 239 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.28735318779945374, + "learning_rate": 0.00015842981736283686, + "loss": 0.7291, + "step": 240 + }, + { + "epoch": 0.37191358024691357, + "grad_norm": 0.24087819457054138, + "learning_rate": 0.00015799241151365155, + "loss": 0.1987, + "step": 241 + }, + { + "epoch": 0.3734567901234568, + "grad_norm": 0.23208248615264893, + "learning_rate": 0.00015755332746270572, + "loss": 0.3301, + "step": 242 + }, + { + "epoch": 0.375, + "grad_norm": 0.1925646811723709, + "learning_rate": 0.000157112577916345, + "loss": 0.4609, + "step": 243 + }, + { + "epoch": 0.3765432098765432, + "grad_norm": 0.2823876142501831, + "learning_rate": 0.00015667017562911176, + "loss": 0.1751, + "step": 244 + }, + { + "epoch": 0.37808641975308643, + "grad_norm": 0.3314320743083954, + "learning_rate": 0.00015622613340337575, + "loss": 0.9271, + "step": 245 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.2851734161376953, + "learning_rate": 0.00015578046408896377, + "loss": 0.3541, + "step": 246 + }, + { + "epoch": 0.38117283950617287, + "grad_norm": 0.32121387124061584, + "learning_rate": 0.00015533318058278786, + "loss": 0.6937, + "step": 247 + }, + { + "epoch": 0.38271604938271603, + "grad_norm": 0.5950232744216919, + "learning_rate": 0.00015488429582847192, + "loss": 0.9107, + "step": 248 + }, + { + "epoch": 0.38425925925925924, + "grad_norm": 0.27574267983436584, + "learning_rate": 0.0001544338228159774, + "loss": 0.2329, + "step": 249 + }, + { + "epoch": 0.38580246913580246, + "grad_norm": 0.3125919699668884, + "learning_rate": 0.00015398177458122712, + "loss": 0.5895, + "step": 250 + }, + { + "epoch": 0.3873456790123457, + "grad_norm": 0.23169316351413727, + "learning_rate": 0.00015352816420572814, + "loss": 0.3733, + "step": 251 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.3215522766113281, + "learning_rate": 0.00015307300481619333, + "loss": 0.6273, + "step": 252 + }, + { + "epoch": 0.3904320987654321, + "grad_norm": 0.39781826734542847, + "learning_rate": 0.00015261630958416132, + "loss": 0.74, + "step": 253 + }, + { + "epoch": 0.39197530864197533, + "grad_norm": 0.3889845907688141, + "learning_rate": 0.0001521580917256154, + "loss": 0.4141, + "step": 254 + }, + { + "epoch": 0.39351851851851855, + "grad_norm": 0.37642401456832886, + "learning_rate": 0.00015169836450060106, + "loss": 0.6577, + "step": 255 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.4370867609977722, + "learning_rate": 0.0001512371412128424, + "loss": 0.7391, + "step": 256 + }, + { + "epoch": 0.3966049382716049, + "grad_norm": 0.38796278834342957, + "learning_rate": 0.0001507744352093569, + "loss": 0.4856, + "step": 257 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.2830154299736023, + "learning_rate": 0.00015031025988006936, + "loss": 0.8789, + "step": 258 + }, + { + "epoch": 0.39969135802469136, + "grad_norm": 0.1836373209953308, + "learning_rate": 0.0001498446286574244, + "loss": 0.1673, + "step": 259 + }, + { + "epoch": 0.4012345679012346, + "grad_norm": 0.21524512767791748, + "learning_rate": 0.00014937755501599772, + "loss": 0.3101, + "step": 260 + }, + { + "epoch": 0.4027777777777778, + "grad_norm": 0.3035656213760376, + "learning_rate": 0.00014890905247210613, + "loss": 0.9059, + "step": 261 + }, + { + "epoch": 0.404320987654321, + "grad_norm": 0.47792062163352966, + "learning_rate": 0.00014843913458341645, + "loss": 0.6366, + "step": 262 + }, + { + "epoch": 0.4058641975308642, + "grad_norm": 0.5084688067436218, + "learning_rate": 0.00014796781494855324, + "loss": 0.5508, + "step": 263 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.4174124598503113, + "learning_rate": 0.00014749510720670506, + "loss": 0.5291, + "step": 264 + }, + { + "epoch": 0.4089506172839506, + "grad_norm": 0.44806426763534546, + "learning_rate": 0.00014702102503723002, + "loss": 0.6322, + "step": 265 + }, + { + "epoch": 0.4104938271604938, + "grad_norm": 0.4298565685749054, + "learning_rate": 0.00014654558215925989, + "loss": 0.5524, + "step": 266 + }, + { + "epoch": 0.41203703703703703, + "grad_norm": 0.22846965491771698, + "learning_rate": 0.00014606879233130293, + "loss": 0.3397, + "step": 267 + }, + { + "epoch": 0.41358024691358025, + "grad_norm": 0.20276018977165222, + "learning_rate": 0.00014559066935084588, + "loss": 0.3727, + "step": 268 + }, + { + "epoch": 0.41512345679012347, + "grad_norm": 0.24021472036838531, + "learning_rate": 0.00014511122705395464, + "loss": 0.4209, + "step": 269 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.5035699009895325, + "learning_rate": 0.0001446304793148739, + "loss": 0.7499, + "step": 270 + }, + { + "epoch": 0.4182098765432099, + "grad_norm": 0.13195392489433289, + "learning_rate": 0.00014414844004562562, + "loss": 0.2793, + "step": 271 + }, + { + "epoch": 0.41975308641975306, + "grad_norm": 0.5532562136650085, + "learning_rate": 0.0001436651231956064, + "loss": 0.8478, + "step": 272 + }, + { + "epoch": 0.4212962962962963, + "grad_norm": 0.39383968710899353, + "learning_rate": 0.00014318054275118394, + "loss": 0.5646, + "step": 273 + }, + { + "epoch": 0.4228395061728395, + "grad_norm": 0.2708488404750824, + "learning_rate": 0.0001426947127352921, + "loss": 0.1098, + "step": 274 + }, + { + "epoch": 0.4243827160493827, + "grad_norm": 0.29492953419685364, + "learning_rate": 0.00014220764720702535, + "loss": 0.7384, + "step": 275 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.2666418254375458, + "learning_rate": 0.00014171936026123168, + "loss": 0.5572, + "step": 276 + }, + { + "epoch": 0.42746913580246915, + "grad_norm": 0.16895538568496704, + "learning_rate": 0.00014122986602810487, + "loss": 0.3171, + "step": 277 + }, + { + "epoch": 0.42901234567901236, + "grad_norm": 0.31443119049072266, + "learning_rate": 0.00014073917867277557, + "loss": 0.9125, + "step": 278 + }, + { + "epoch": 0.4305555555555556, + "grad_norm": 0.20100273191928864, + "learning_rate": 0.0001402473123949013, + "loss": 0.3965, + "step": 279 + }, + { + "epoch": 0.43209876543209874, + "grad_norm": 0.20125439763069153, + "learning_rate": 0.0001397542814282556, + "loss": 0.3162, + "step": 280 + }, + { + "epoch": 0.43364197530864196, + "grad_norm": 0.28735461831092834, + "learning_rate": 0.00013926010004031616, + "loss": 0.4995, + "step": 281 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.2023731768131256, + "learning_rate": 0.00013876478253185183, + "loss": 0.4746, + "step": 282 + }, + { + "epoch": 0.4367283950617284, + "grad_norm": 0.18414375185966492, + "learning_rate": 0.000138268343236509, + "loss": 0.4516, + "step": 283 + }, + { + "epoch": 0.4382716049382716, + "grad_norm": 0.3256647288799286, + "learning_rate": 0.0001377707965203965, + "loss": 0.9199, + "step": 284 + }, + { + "epoch": 0.4398148148148148, + "grad_norm": 0.2532738447189331, + "learning_rate": 0.00013727215678167014, + "loss": 0.5943, + "step": 285 + }, + { + "epoch": 0.44135802469135804, + "grad_norm": 0.19116981327533722, + "learning_rate": 0.00013677243845011588, + "loss": 0.2064, + "step": 286 + }, + { + "epoch": 0.44290123456790126, + "grad_norm": 0.291351854801178, + "learning_rate": 0.00013627165598673247, + "loss": 0.6035, + "step": 287 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.27666908502578735, + "learning_rate": 0.0001357698238833126, + "loss": 0.3954, + "step": 288 + }, + { + "epoch": 0.44598765432098764, + "grad_norm": 0.18012510240077972, + "learning_rate": 0.00013526695666202384, + "loss": 0.5363, + "step": 289 + }, + { + "epoch": 0.44753086419753085, + "grad_norm": 0.33386459946632385, + "learning_rate": 0.0001347630688749884, + "loss": 0.8915, + "step": 290 + }, + { + "epoch": 0.44907407407407407, + "grad_norm": 0.22583869099617004, + "learning_rate": 0.00013425817510386189, + "loss": 0.5176, + "step": 291 + }, + { + "epoch": 0.4506172839506173, + "grad_norm": 0.2687043845653534, + "learning_rate": 0.00013375228995941133, + "loss": 0.5344, + "step": 292 + }, + { + "epoch": 0.4521604938271605, + "grad_norm": 0.20422233641147614, + "learning_rate": 0.00013324542808109243, + "loss": 0.1604, + "step": 293 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.36622607707977295, + "learning_rate": 0.00013273760413662594, + "loss": 0.9199, + "step": 294 + }, + { + "epoch": 0.45524691358024694, + "grad_norm": 0.3127472400665283, + "learning_rate": 0.00013222883282157315, + "loss": 1.0857, + "step": 295 + }, + { + "epoch": 0.4567901234567901, + "grad_norm": 0.5156943202018738, + "learning_rate": 0.00013171912885891063, + "loss": 0.5367, + "step": 296 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.28158655762672424, + "learning_rate": 0.00013120850699860416, + "loss": 0.7872, + "step": 297 + }, + { + "epoch": 0.45987654320987653, + "grad_norm": 0.32909318804740906, + "learning_rate": 0.000130696982017182, + "loss": 0.8201, + "step": 298 + }, + { + "epoch": 0.46141975308641975, + "grad_norm": 0.3001207411289215, + "learning_rate": 0.00013018456871730715, + "loss": 0.504, + "step": 299 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.15215009450912476, + "learning_rate": 0.00012967128192734902, + "loss": 0.1207, + "step": 300 + }, + { + "epoch": 0.4645061728395062, + "grad_norm": 0.13983775675296783, + "learning_rate": 0.0001291571365009544, + "loss": 0.1454, + "step": 301 + }, + { + "epoch": 0.4660493827160494, + "grad_norm": 0.682342529296875, + "learning_rate": 0.00012864214731661742, + "loss": 0.8195, + "step": 302 + }, + { + "epoch": 0.4675925925925926, + "grad_norm": 0.2677520215511322, + "learning_rate": 0.00012812632927724935, + "loss": 0.7024, + "step": 303 + }, + { + "epoch": 0.4691358024691358, + "grad_norm": 0.2707967460155487, + "learning_rate": 0.00012760969730974694, + "loss": 0.3668, + "step": 304 + }, + { + "epoch": 0.470679012345679, + "grad_norm": 0.27531251311302185, + "learning_rate": 0.0001270922663645606, + "loss": 0.6344, + "step": 305 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.4020010530948639, + "learning_rate": 0.00012657405141526195, + "loss": 0.7611, + "step": 306 + }, + { + "epoch": 0.4737654320987654, + "grad_norm": 0.38070714473724365, + "learning_rate": 0.0001260550674581103, + "loss": 0.8608, + "step": 307 + }, + { + "epoch": 0.47530864197530864, + "grad_norm": 0.19036996364593506, + "learning_rate": 0.0001255353295116187, + "loss": 0.3872, + "step": 308 + }, + { + "epoch": 0.47685185185185186, + "grad_norm": 0.25352945923805237, + "learning_rate": 0.00012501485261611942, + "loss": 0.3644, + "step": 309 + }, + { + "epoch": 0.4783950617283951, + "grad_norm": 0.20836561918258667, + "learning_rate": 0.00012449365183332862, + "loss": 0.6056, + "step": 310 + }, + { + "epoch": 0.4799382716049383, + "grad_norm": 0.20088402926921844, + "learning_rate": 0.00012397174224591052, + "loss": 0.3417, + "step": 311 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.24147239327430725, + "learning_rate": 0.00012344913895704097, + "loss": 0.1839, + "step": 312 + }, + { + "epoch": 0.48302469135802467, + "grad_norm": 0.9085943102836609, + "learning_rate": 0.0001229258570899704, + "loss": 0.7375, + "step": 313 + }, + { + "epoch": 0.4845679012345679, + "grad_norm": 0.27051451802253723, + "learning_rate": 0.00012240191178758598, + "loss": 0.4131, + "step": 314 + }, + { + "epoch": 0.4861111111111111, + "grad_norm": 0.5605129599571228, + "learning_rate": 0.00012187731821197376, + "loss": 0.4446, + "step": 315 + }, + { + "epoch": 0.4876543209876543, + "grad_norm": 0.28277111053466797, + "learning_rate": 0.00012135209154397962, + "loss": 0.5936, + "step": 316 + }, + { + "epoch": 0.48919753086419754, + "grad_norm": 0.18896357715129852, + "learning_rate": 0.00012082624698277005, + "loss": 0.4057, + "step": 317 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.20177483558654785, + "learning_rate": 0.00012029979974539234, + "loss": 0.2258, + "step": 318 + }, + { + "epoch": 0.49228395061728397, + "grad_norm": 0.33871597051620483, + "learning_rate": 0.00011977276506633426, + "loss": 0.4368, + "step": 319 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.2399907261133194, + "learning_rate": 0.000119245158197083, + "loss": 0.1201, + "step": 320 + }, + { + "epoch": 0.49537037037037035, + "grad_norm": 0.26176193356513977, + "learning_rate": 0.00011871699440568412, + "loss": 0.4732, + "step": 321 + }, + { + "epoch": 0.49691358024691357, + "grad_norm": 0.24419881403446198, + "learning_rate": 0.0001181882889762994, + "loss": 0.6795, + "step": 322 + }, + { + "epoch": 0.4984567901234568, + "grad_norm": 0.2023530900478363, + "learning_rate": 0.00011765905720876487, + "loss": 0.372, + "step": 323 + }, + { + "epoch": 0.5, + "grad_norm": 0.1733013391494751, + "learning_rate": 0.00011712931441814776, + "loss": 0.4262, + "step": 324 + }, + { + "epoch": 0.5015432098765432, + "grad_norm": 0.39326512813568115, + "learning_rate": 0.00011659907593430352, + "loss": 0.1849, + "step": 325 + }, + { + "epoch": 0.5030864197530864, + "grad_norm": 0.2365029901266098, + "learning_rate": 0.00011606835710143207, + "loss": 0.402, + "step": 326 + }, + { + "epoch": 0.5046296296296297, + "grad_norm": 0.23560036718845367, + "learning_rate": 0.00011553717327763385, + "loss": 0.3979, + "step": 327 + }, + { + "epoch": 0.5061728395061729, + "grad_norm": 0.33829402923583984, + "learning_rate": 0.00011500553983446527, + "loss": 0.5577, + "step": 328 + }, + { + "epoch": 0.5077160493827161, + "grad_norm": 0.1754661649465561, + "learning_rate": 0.0001144734721564941, + "loss": 0.4206, + "step": 329 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.13841289281845093, + "learning_rate": 0.00011394098564085398, + "loss": 0.1164, + "step": 330 + }, + { + "epoch": 0.5108024691358025, + "grad_norm": 0.3422022759914398, + "learning_rate": 0.00011340809569679906, + "loss": 0.1931, + "step": 331 + }, + { + "epoch": 0.5123456790123457, + "grad_norm": 0.3109457194805145, + "learning_rate": 0.0001128748177452581, + "loss": 0.4279, + "step": 332 + }, + { + "epoch": 0.5138888888888888, + "grad_norm": 0.19185097515583038, + "learning_rate": 0.00011234116721838797, + "loss": 0.4782, + "step": 333 + }, + { + "epoch": 0.5154320987654321, + "grad_norm": 0.14880049228668213, + "learning_rate": 0.0001118071595591274, + "loss": 0.3131, + "step": 334 + }, + { + "epoch": 0.5169753086419753, + "grad_norm": 0.29228848218917847, + "learning_rate": 0.0001112728102207498, + "loss": 0.3152, + "step": 335 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.33393582701683044, + "learning_rate": 0.00011073813466641632, + "loss": 0.6484, + "step": 336 + }, + { + "epoch": 0.5200617283950617, + "grad_norm": 0.4790665805339813, + "learning_rate": 0.00011020314836872811, + "loss": 1.154, + "step": 337 + }, + { + "epoch": 0.5216049382716049, + "grad_norm": 0.5605064630508423, + "learning_rate": 0.00010966786680927874, + "loss": 0.8431, + "step": 338 + }, + { + "epoch": 0.5231481481481481, + "grad_norm": 0.20457619428634644, + "learning_rate": 0.00010913230547820623, + "loss": 0.3279, + "step": 339 + }, + { + "epoch": 0.5246913580246914, + "grad_norm": 0.35587751865386963, + "learning_rate": 0.00010859647987374467, + "loss": 0.8932, + "step": 340 + }, + { + "epoch": 0.5262345679012346, + "grad_norm": 0.1639918088912964, + "learning_rate": 0.00010806040550177572, + "loss": 0.3907, + "step": 341 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.20673660933971405, + "learning_rate": 0.00010752409787538, + "loss": 0.1562, + "step": 342 + }, + { + "epoch": 0.529320987654321, + "grad_norm": 0.17603667080402374, + "learning_rate": 0.00010698757251438816, + "loss": 0.3784, + "step": 343 + }, + { + "epoch": 0.5308641975308642, + "grad_norm": 0.3864747881889343, + "learning_rate": 0.00010645084494493165, + "loss": 0.5215, + "step": 344 + }, + { + "epoch": 0.5324074074074074, + "grad_norm": 0.16058044135570526, + "learning_rate": 0.00010591393069899349, + "loss": 0.3122, + "step": 345 + }, + { + "epoch": 0.5339506172839507, + "grad_norm": 0.2520942986011505, + "learning_rate": 0.00010537684531395879, + "loss": 0.5314, + "step": 346 + }, + { + "epoch": 0.5354938271604939, + "grad_norm": 0.17930999398231506, + "learning_rate": 0.00010483960433216522, + "loss": 0.1235, + "step": 347 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.48774784803390503, + "learning_rate": 0.00010430222330045304, + "loss": 0.5822, + "step": 348 + }, + { + "epoch": 0.5385802469135802, + "grad_norm": 0.10608334839344025, + "learning_rate": 0.00010376471776971545, + "loss": 0.0839, + "step": 349 + }, + { + "epoch": 0.5401234567901234, + "grad_norm": 0.189545139670372, + "learning_rate": 0.00010322710329444832, + "loss": 0.1508, + "step": 350 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.20387907326221466, + "learning_rate": 0.0001026893954323002, + "loss": 0.4793, + "step": 351 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 0.42547720670700073, + "learning_rate": 0.00010215160974362223, + "loss": 0.3155, + "step": 352 + }, + { + "epoch": 0.5447530864197531, + "grad_norm": 0.20840230584144592, + "learning_rate": 0.00010161376179101753, + "loss": 0.5614, + "step": 353 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.2521035969257355, + "learning_rate": 0.00010107586713889117, + "loss": 0.1982, + "step": 354 + }, + { + "epoch": 0.5478395061728395, + "grad_norm": 0.2233658730983734, + "learning_rate": 0.00010053794135299951, + "loss": 0.4786, + "step": 355 + }, + { + "epoch": 0.5493827160493827, + "grad_norm": 0.45647042989730835, + "learning_rate": 0.0001, + "loss": 0.5133, + "step": 356 + }, + { + "epoch": 0.5509259259259259, + "grad_norm": 0.34752780199050903, + "learning_rate": 9.946205864700052e-05, + "loss": 0.7293, + "step": 357 + }, + { + "epoch": 0.5524691358024691, + "grad_norm": 0.2195957899093628, + "learning_rate": 9.892413286110886e-05, + "loss": 0.3478, + "step": 358 + }, + { + "epoch": 0.5540123456790124, + "grad_norm": 0.2130778431892395, + "learning_rate": 9.838623820898249e-05, + "loss": 0.6372, + "step": 359 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.2345610111951828, + "learning_rate": 9.784839025637778e-05, + "loss": 0.4609, + "step": 360 + }, + { + "epoch": 0.5570987654320988, + "grad_norm": 0.2765243649482727, + "learning_rate": 9.73106045676998e-05, + "loss": 0.5018, + "step": 361 + }, + { + "epoch": 0.558641975308642, + "grad_norm": 0.33230382204055786, + "learning_rate": 9.677289670555169e-05, + "loss": 0.1977, + "step": 362 + }, + { + "epoch": 0.5601851851851852, + "grad_norm": 0.4939884841442108, + "learning_rate": 9.623528223028457e-05, + "loss": 0.8983, + "step": 363 + }, + { + "epoch": 0.5617283950617284, + "grad_norm": 0.25707146525382996, + "learning_rate": 9.569777669954694e-05, + "loss": 0.7765, + "step": 364 + }, + { + "epoch": 0.5632716049382716, + "grad_norm": 0.31705909967422485, + "learning_rate": 9.516039566783482e-05, + "loss": 0.4167, + "step": 365 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.2889512777328491, + "learning_rate": 9.462315468604124e-05, + "loss": 0.1233, + "step": 366 + }, + { + "epoch": 0.566358024691358, + "grad_norm": 0.2776125967502594, + "learning_rate": 9.408606930100654e-05, + "loss": 0.7083, + "step": 367 + }, + { + "epoch": 0.5679012345679012, + "grad_norm": 0.35557475686073303, + "learning_rate": 9.354915505506839e-05, + "loss": 0.3129, + "step": 368 + }, + { + "epoch": 0.5694444444444444, + "grad_norm": 0.31051987409591675, + "learning_rate": 9.301242748561185e-05, + "loss": 0.4522, + "step": 369 + }, + { + "epoch": 0.5709876543209876, + "grad_norm": 0.180246040225029, + "learning_rate": 9.247590212462001e-05, + "loss": 0.129, + "step": 370 + }, + { + "epoch": 0.5725308641975309, + "grad_norm": 0.31589406728744507, + "learning_rate": 9.193959449822429e-05, + "loss": 1.0016, + "step": 371 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.13801825046539307, + "learning_rate": 9.140352012625537e-05, + "loss": 0.1339, + "step": 372 + }, + { + "epoch": 0.5756172839506173, + "grad_norm": 0.15938417613506317, + "learning_rate": 9.08676945217938e-05, + "loss": 0.3158, + "step": 373 + }, + { + "epoch": 0.5771604938271605, + "grad_norm": 0.5883560180664062, + "learning_rate": 9.033213319072127e-05, + "loss": 0.4438, + "step": 374 + }, + { + "epoch": 0.5787037037037037, + "grad_norm": 0.1496354639530182, + "learning_rate": 8.979685163127194e-05, + "loss": 0.3505, + "step": 375 + }, + { + "epoch": 0.5802469135802469, + "grad_norm": 0.3306485414505005, + "learning_rate": 8.92618653335837e-05, + "loss": 0.1942, + "step": 376 + }, + { + "epoch": 0.5817901234567902, + "grad_norm": 0.19068115949630737, + "learning_rate": 8.872718977925022e-05, + "loss": 0.1478, + "step": 377 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.2563306987285614, + "learning_rate": 8.81928404408726e-05, + "loss": 0.52, + "step": 378 + }, + { + "epoch": 0.5848765432098766, + "grad_norm": 0.2779485881328583, + "learning_rate": 8.765883278161204e-05, + "loss": 0.5592, + "step": 379 + }, + { + "epoch": 0.5864197530864198, + "grad_norm": 0.24331793189048767, + "learning_rate": 8.712518225474191e-05, + "loss": 0.7972, + "step": 380 + }, + { + "epoch": 0.5879629629629629, + "grad_norm": 0.22886143624782562, + "learning_rate": 8.659190430320095e-05, + "loss": 0.1885, + "step": 381 + }, + { + "epoch": 0.5895061728395061, + "grad_norm": 0.2801979184150696, + "learning_rate": 8.605901435914607e-05, + "loss": 0.4509, + "step": 382 + }, + { + "epoch": 0.5910493827160493, + "grad_norm": 0.28682655096054077, + "learning_rate": 8.552652784350592e-05, + "loss": 0.6643, + "step": 383 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.3022245466709137, + "learning_rate": 8.499446016553474e-05, + "loss": 0.2217, + "step": 384 + }, + { + "epoch": 0.5941358024691358, + "grad_norm": 0.34305545687675476, + "learning_rate": 8.446282672236619e-05, + "loss": 0.4366, + "step": 385 + }, + { + "epoch": 0.595679012345679, + "grad_norm": 0.22857120633125305, + "learning_rate": 8.393164289856797e-05, + "loss": 0.3639, + "step": 386 + }, + { + "epoch": 0.5972222222222222, + "grad_norm": 0.2164112627506256, + "learning_rate": 8.340092406569649e-05, + "loss": 0.3924, + "step": 387 + }, + { + "epoch": 0.5987654320987654, + "grad_norm": 0.25482821464538574, + "learning_rate": 8.287068558185225e-05, + "loss": 0.4117, + "step": 388 + }, + { + "epoch": 0.6003086419753086, + "grad_norm": 0.13518688082695007, + "learning_rate": 8.234094279123516e-05, + "loss": 0.3268, + "step": 389 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.2003636211156845, + "learning_rate": 8.18117110237006e-05, + "loss": 0.545, + "step": 390 + }, + { + "epoch": 0.6033950617283951, + "grad_norm": 0.24785707890987396, + "learning_rate": 8.128300559431593e-05, + "loss": 0.9276, + "step": 391 + }, + { + "epoch": 0.6049382716049383, + "grad_norm": 0.275576651096344, + "learning_rate": 8.075484180291701e-05, + "loss": 0.4245, + "step": 392 + }, + { + "epoch": 0.6064814814814815, + "grad_norm": 0.23772022128105164, + "learning_rate": 8.022723493366576e-05, + "loss": 0.5176, + "step": 393 + }, + { + "epoch": 0.6080246913580247, + "grad_norm": 0.18755260109901428, + "learning_rate": 7.970020025460765e-05, + "loss": 0.8175, + "step": 394 + }, + { + "epoch": 0.6095679012345679, + "grad_norm": 0.2976040840148926, + "learning_rate": 7.917375301722997e-05, + "loss": 0.8663, + "step": 395 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.21300390362739563, + "learning_rate": 7.864790845602039e-05, + "loss": 0.1562, + "step": 396 + }, + { + "epoch": 0.6126543209876543, + "grad_norm": 0.18343684077262878, + "learning_rate": 7.812268178802626e-05, + "loss": 0.2527, + "step": 397 + }, + { + "epoch": 0.6141975308641975, + "grad_norm": 0.1589261144399643, + "learning_rate": 7.759808821241406e-05, + "loss": 0.1314, + "step": 398 + }, + { + "epoch": 0.6157407407407407, + "grad_norm": 0.2383323311805725, + "learning_rate": 7.707414291002964e-05, + "loss": 0.6418, + "step": 399 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 0.26811686158180237, + "learning_rate": 7.655086104295904e-05, + "loss": 0.4181, + "step": 400 + }, + { + "epoch": 0.6188271604938271, + "grad_norm": 0.2511789798736572, + "learning_rate": 7.60282577540895e-05, + "loss": 0.6246, + "step": 401 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.13311168551445007, + "learning_rate": 7.550634816667142e-05, + "loss": 0.0995, + "step": 402 + }, + { + "epoch": 0.6219135802469136, + "grad_norm": 0.19049036502838135, + "learning_rate": 7.498514738388059e-05, + "loss": 0.2354, + "step": 403 + }, + { + "epoch": 0.6234567901234568, + "grad_norm": 0.27821457386016846, + "learning_rate": 7.446467048838131e-05, + "loss": 0.4461, + "step": 404 + }, + { + "epoch": 0.625, + "grad_norm": 0.20892822742462158, + "learning_rate": 7.394493254188975e-05, + "loss": 0.2575, + "step": 405 + }, + { + "epoch": 0.6265432098765432, + "grad_norm": 0.22397580742835999, + "learning_rate": 7.342594858473807e-05, + "loss": 0.1657, + "step": 406 + }, + { + "epoch": 0.6280864197530864, + "grad_norm": 0.27712884545326233, + "learning_rate": 7.290773363543945e-05, + "loss": 0.6845, + "step": 407 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.264371395111084, + "learning_rate": 7.239030269025311e-05, + "loss": 0.5493, + "step": 408 + }, + { + "epoch": 0.6311728395061729, + "grad_norm": 0.19453969597816467, + "learning_rate": 7.187367072275066e-05, + "loss": 0.1357, + "step": 409 + }, + { + "epoch": 0.6327160493827161, + "grad_norm": 0.26357677578926086, + "learning_rate": 7.135785268338256e-05, + "loss": 0.8577, + "step": 410 + }, + { + "epoch": 0.6342592592592593, + "grad_norm": 0.2632603645324707, + "learning_rate": 7.084286349904563e-05, + "loss": 0.4733, + "step": 411 + }, + { + "epoch": 0.6358024691358025, + "grad_norm": 0.4994662404060364, + "learning_rate": 7.032871807265096e-05, + "loss": 0.2367, + "step": 412 + }, + { + "epoch": 0.6373456790123457, + "grad_norm": 0.12755128741264343, + "learning_rate": 6.981543128269286e-05, + "loss": 0.0988, + "step": 413 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.5753490924835205, + "learning_rate": 6.930301798281803e-05, + "loss": 0.6723, + "step": 414 + }, + { + "epoch": 0.6404320987654321, + "grad_norm": 0.24338620901107788, + "learning_rate": 6.879149300139585e-05, + "loss": 0.7552, + "step": 415 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 0.2144060581922531, + "learning_rate": 6.82808711410894e-05, + "loss": 0.3832, + "step": 416 + }, + { + "epoch": 0.6435185185185185, + "grad_norm": 0.2806336581707001, + "learning_rate": 6.777116717842686e-05, + "loss": 0.6669, + "step": 417 + }, + { + "epoch": 0.6450617283950617, + "grad_norm": 0.16209986805915833, + "learning_rate": 6.726239586337408e-05, + "loss": 0.1573, + "step": 418 + }, + { + "epoch": 0.6466049382716049, + "grad_norm": 0.36021384596824646, + "learning_rate": 6.675457191890756e-05, + "loss": 0.3975, + "step": 419 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.19047600030899048, + "learning_rate": 6.624771004058868e-05, + "loss": 0.4896, + "step": 420 + }, + { + "epoch": 0.6496913580246914, + "grad_norm": 0.44073987007141113, + "learning_rate": 6.574182489613814e-05, + "loss": 0.5972, + "step": 421 + }, + { + "epoch": 0.6512345679012346, + "grad_norm": 0.21274137496948242, + "learning_rate": 6.52369311250116e-05, + "loss": 0.1259, + "step": 422 + }, + { + "epoch": 0.6527777777777778, + "grad_norm": 0.25200140476226807, + "learning_rate": 6.473304333797621e-05, + "loss": 0.2554, + "step": 423 + }, + { + "epoch": 0.654320987654321, + "grad_norm": 0.26650434732437134, + "learning_rate": 6.423017611668745e-05, + "loss": 0.9587, + "step": 424 + }, + { + "epoch": 0.6558641975308642, + "grad_norm": 0.4036225378513336, + "learning_rate": 6.372834401326758e-05, + "loss": 0.8472, + "step": 425 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.3750089108943939, + "learning_rate": 6.322756154988409e-05, + "loss": 0.6928, + "step": 426 + }, + { + "epoch": 0.6589506172839507, + "grad_norm": 0.17716793715953827, + "learning_rate": 6.27278432183299e-05, + "loss": 0.423, + "step": 427 + }, + { + "epoch": 0.6604938271604939, + "grad_norm": 0.22470588982105255, + "learning_rate": 6.22292034796035e-05, + "loss": 0.5848, + "step": 428 + }, + { + "epoch": 0.6620370370370371, + "grad_norm": 0.3201143443584442, + "learning_rate": 6.173165676349103e-05, + "loss": 0.6933, + "step": 429 + }, + { + "epoch": 0.6635802469135802, + "grad_norm": 0.26013293862342834, + "learning_rate": 6.12352174681482e-05, + "loss": 0.837, + "step": 430 + }, + { + "epoch": 0.6651234567901234, + "grad_norm": 0.1394704431295395, + "learning_rate": 6.073989995968388e-05, + "loss": 0.3476, + "step": 431 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.21686775982379913, + "learning_rate": 6.024571857174443e-05, + "loss": 0.6275, + "step": 432 + }, + { + "epoch": 0.6682098765432098, + "grad_norm": 0.31212687492370605, + "learning_rate": 5.9752687605098714e-05, + "loss": 0.7522, + "step": 433 + }, + { + "epoch": 0.6697530864197531, + "grad_norm": 0.4370562434196472, + "learning_rate": 5.9260821327224444e-05, + "loss": 0.5916, + "step": 434 + }, + { + "epoch": 0.6712962962962963, + "grad_norm": 0.14993952214717865, + "learning_rate": 5.87701339718951e-05, + "loss": 0.1059, + "step": 435 + }, + { + "epoch": 0.6728395061728395, + "grad_norm": 0.1285599172115326, + "learning_rate": 5.828063973876834e-05, + "loss": 0.0888, + "step": 436 + }, + { + "epoch": 0.6743827160493827, + "grad_norm": 0.360960453748703, + "learning_rate": 5.779235279297467e-05, + "loss": 0.3833, + "step": 437 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.3488157093524933, + "learning_rate": 5.730528726470792e-05, + "loss": 0.3812, + "step": 438 + }, + { + "epoch": 0.6774691358024691, + "grad_norm": 0.40559062361717224, + "learning_rate": 5.6819457248816134e-05, + "loss": 0.7262, + "step": 439 + }, + { + "epoch": 0.6790123456790124, + "grad_norm": 0.15350504219532013, + "learning_rate": 5.633487680439361e-05, + "loss": 0.1537, + "step": 440 + }, + { + "epoch": 0.6805555555555556, + "grad_norm": 0.2758656442165375, + "learning_rate": 5.585155995437443e-05, + "loss": 0.4091, + "step": 441 + }, + { + "epoch": 0.6820987654320988, + "grad_norm": 0.18555998802185059, + "learning_rate": 5.536952068512608e-05, + "loss": 0.1681, + "step": 442 + }, + { + "epoch": 0.683641975308642, + "grad_norm": 0.12097325176000595, + "learning_rate": 5.488877294604537e-05, + "loss": 0.095, + "step": 443 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.3211557865142822, + "learning_rate": 5.440933064915414e-05, + "loss": 0.6539, + "step": 444 + }, + { + "epoch": 0.6867283950617284, + "grad_norm": 0.23512816429138184, + "learning_rate": 5.393120766869708e-05, + "loss": 0.4303, + "step": 445 + }, + { + "epoch": 0.6882716049382716, + "grad_norm": 0.26071473956108093, + "learning_rate": 5.3454417840740124e-05, + "loss": 0.5466, + "step": 446 + }, + { + "epoch": 0.6898148148148148, + "grad_norm": 0.24297121167182922, + "learning_rate": 5.2978974962769975e-05, + "loss": 0.4057, + "step": 447 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 0.3228786587715149, + "learning_rate": 5.2504892793295e-05, + "loss": 0.7604, + "step": 448 + }, + { + "epoch": 0.6929012345679012, + "grad_norm": 0.14784720540046692, + "learning_rate": 5.203218505144678e-05, + "loss": 0.1139, + "step": 449 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.7648014426231384, + "learning_rate": 5.156086541658356e-05, + "loss": 0.6502, + "step": 450 + }, + { + "epoch": 0.6959876543209876, + "grad_norm": 0.2782875895500183, + "learning_rate": 5.109094752789384e-05, + "loss": 0.4311, + "step": 451 + }, + { + "epoch": 0.6975308641975309, + "grad_norm": 0.15378238260746002, + "learning_rate": 5.062244498400228e-05, + "loss": 0.3567, + "step": 452 + }, + { + "epoch": 0.6990740740740741, + "grad_norm": 0.29984399676322937, + "learning_rate": 5.0155371342575616e-05, + "loss": 1.0182, + "step": 453 + }, + { + "epoch": 0.7006172839506173, + "grad_norm": 0.22591163218021393, + "learning_rate": 4.968974011993067e-05, + "loss": 0.2338, + "step": 454 + }, + { + "epoch": 0.7021604938271605, + "grad_norm": 0.2846401035785675, + "learning_rate": 4.9225564790643186e-05, + "loss": 0.5705, + "step": 455 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.24510538578033447, + "learning_rate": 4.876285878715764e-05, + "loss": 0.2205, + "step": 456 + }, + { + "epoch": 0.7052469135802469, + "grad_norm": 0.22834031283855438, + "learning_rate": 4.830163549939899e-05, + "loss": 0.249, + "step": 457 + }, + { + "epoch": 0.7067901234567902, + "grad_norm": 0.2675427496433258, + "learning_rate": 4.7841908274384616e-05, + "loss": 0.4518, + "step": 458 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.25624194741249084, + "learning_rate": 4.7383690415838714e-05, + "loss": 0.7167, + "step": 459 + }, + { + "epoch": 0.7098765432098766, + "grad_norm": 0.28292757272720337, + "learning_rate": 4.6926995183806644e-05, + "loss": 0.7126, + "step": 460 + }, + { + "epoch": 0.7114197530864198, + "grad_norm": 0.32363831996917725, + "learning_rate": 4.647183579427187e-05, + "loss": 0.8124, + "step": 461 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.3364168405532837, + "learning_rate": 4.601822541877291e-05, + "loss": 0.9296, + "step": 462 + }, + { + "epoch": 0.7145061728395061, + "grad_norm": 0.47601252794265747, + "learning_rate": 4.5566177184022605e-05, + "loss": 0.3506, + "step": 463 + }, + { + "epoch": 0.7160493827160493, + "grad_norm": 0.2587848901748657, + "learning_rate": 4.5115704171528105e-05, + "loss": 0.6646, + "step": 464 + }, + { + "epoch": 0.7175925925925926, + "grad_norm": 0.3272380232810974, + "learning_rate": 4.466681941721216e-05, + "loss": 0.4113, + "step": 465 + }, + { + "epoch": 0.7191358024691358, + "grad_norm": 0.12207739800214767, + "learning_rate": 4.421953591103627e-05, + "loss": 0.0998, + "step": 466 + }, + { + "epoch": 0.720679012345679, + "grad_norm": 0.2772601544857025, + "learning_rate": 4.377386659662425e-05, + "loss": 0.436, + "step": 467 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.18643896281719208, + "learning_rate": 4.332982437088825e-05, + "loss": 0.1212, + "step": 468 + }, + { + "epoch": 0.7237654320987654, + "grad_norm": 0.16902408003807068, + "learning_rate": 4.2887422083655006e-05, + "loss": 0.1363, + "step": 469 + }, + { + "epoch": 0.7253086419753086, + "grad_norm": 0.30154481530189514, + "learning_rate": 4.244667253729431e-05, + "loss": 0.5263, + "step": 470 + }, + { + "epoch": 0.7268518518518519, + "grad_norm": 0.18358105421066284, + "learning_rate": 4.2007588486348505e-05, + "loss": 0.5956, + "step": 471 + }, + { + "epoch": 0.7283950617283951, + "grad_norm": 0.26860612630844116, + "learning_rate": 4.1570182637163155e-05, + "loss": 0.678, + "step": 472 + }, + { + "epoch": 0.7299382716049383, + "grad_norm": 0.18593661487102509, + "learning_rate": 4.113446764751977e-05, + "loss": 0.559, + "step": 473 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.21515703201293945, + "learning_rate": 4.0700456126268984e-05, + "loss": 0.2778, + "step": 474 + }, + { + "epoch": 0.7330246913580247, + "grad_norm": 0.25489386916160583, + "learning_rate": 4.0268160632966226e-05, + "loss": 0.6518, + "step": 475 + }, + { + "epoch": 0.7345679012345679, + "grad_norm": 0.40696218609809875, + "learning_rate": 3.9837593677507726e-05, + "loss": 0.7037, + "step": 476 + }, + { + "epoch": 0.7361111111111112, + "grad_norm": 0.19291771948337555, + "learning_rate": 3.940876771976904e-05, + "loss": 0.4879, + "step": 477 + }, + { + "epoch": 0.7376543209876543, + "grad_norm": 0.45950472354888916, + "learning_rate": 3.898169516924398e-05, + "loss": 0.5043, + "step": 478 + }, + { + "epoch": 0.7391975308641975, + "grad_norm": 0.1111774668097496, + "learning_rate": 3.855638838468584e-05, + "loss": 0.0913, + "step": 479 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.17390532791614532, + "learning_rate": 3.813285967374969e-05, + "loss": 0.1564, + "step": 480 + }, + { + "epoch": 0.7422839506172839, + "grad_norm": 0.3220868408679962, + "learning_rate": 3.771112129263602e-05, + "loss": 0.8852, + "step": 481 + }, + { + "epoch": 0.7438271604938271, + "grad_norm": 0.3040640652179718, + "learning_rate": 3.7291185445736444e-05, + "loss": 0.5647, + "step": 482 + }, + { + "epoch": 0.7453703703703703, + "grad_norm": 0.2612684667110443, + "learning_rate": 3.687306428528006e-05, + "loss": 0.5195, + "step": 483 + }, + { + "epoch": 0.7469135802469136, + "grad_norm": 0.33559906482696533, + "learning_rate": 3.645676991098227e-05, + "loss": 0.6323, + "step": 484 + }, + { + "epoch": 0.7484567901234568, + "grad_norm": 0.19310182332992554, + "learning_rate": 3.6042314369694174e-05, + "loss": 0.3997, + "step": 485 + }, + { + "epoch": 0.75, + "grad_norm": 0.15787935256958008, + "learning_rate": 3.562970965505429e-05, + "loss": 0.1704, + "step": 486 + }, + { + "epoch": 0.7515432098765432, + "grad_norm": 0.27092063426971436, + "learning_rate": 3.5218967707141335e-05, + "loss": 0.5351, + "step": 487 + }, + { + "epoch": 0.7530864197530864, + "grad_norm": 0.2954064607620239, + "learning_rate": 3.4810100412128747e-05, + "loss": 0.6346, + "step": 488 + }, + { + "epoch": 0.7546296296296297, + "grad_norm": 0.20427364110946655, + "learning_rate": 3.440311960194068e-05, + "loss": 0.5094, + "step": 489 + }, + { + "epoch": 0.7561728395061729, + "grad_norm": 0.22827818989753723, + "learning_rate": 3.399803705390955e-05, + "loss": 0.3811, + "step": 490 + }, + { + "epoch": 0.7577160493827161, + "grad_norm": 0.19012290239334106, + "learning_rate": 3.359486449043547e-05, + "loss": 0.513, + "step": 491 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 0.22560228407382965, + "learning_rate": 3.319361357864663e-05, + "loss": 0.5542, + "step": 492 + }, + { + "epoch": 0.7608024691358025, + "grad_norm": 0.19475916028022766, + "learning_rate": 3.2794295930062144e-05, + "loss": 0.3721, + "step": 493 + }, + { + "epoch": 0.7623456790123457, + "grad_norm": 0.2412104308605194, + "learning_rate": 3.2396923100255515e-05, + "loss": 0.8091, + "step": 494 + }, + { + "epoch": 0.7638888888888888, + "grad_norm": 0.42671138048171997, + "learning_rate": 3.200150658852066e-05, + "loss": 0.7731, + "step": 495 + }, + { + "epoch": 0.7654320987654321, + "grad_norm": 0.2477133870124817, + "learning_rate": 3.160805783753897e-05, + "loss": 0.2803, + "step": 496 + }, + { + "epoch": 0.7669753086419753, + "grad_norm": 0.23327584564685822, + "learning_rate": 3.121658823304806e-05, + "loss": 0.3302, + "step": 497 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.14538195729255676, + "learning_rate": 3.0827109103512643e-05, + "loss": 0.1777, + "step": 498 + }, + { + "epoch": 0.7700617283950617, + "grad_norm": 0.22030945122241974, + "learning_rate": 3.043963171979618e-05, + "loss": 0.3754, + "step": 499 + }, + { + "epoch": 0.7716049382716049, + "grad_norm": 0.2599882185459137, + "learning_rate": 3.005416729483531e-05, + "loss": 0.6149, + "step": 500 + }, + { + "epoch": 0.7731481481481481, + "grad_norm": 0.2789295017719269, + "learning_rate": 2.9670726983314824e-05, + "loss": 0.6605, + "step": 501 + }, + { + "epoch": 0.7746913580246914, + "grad_norm": 0.3327292799949646, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.7938, + "step": 502 + }, + { + "epoch": 0.7762345679012346, + "grad_norm": 0.21114224195480347, + "learning_rate": 2.8909963026141583e-05, + "loss": 0.5268, + "step": 503 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.26455628871917725, + "learning_rate": 2.853266139570391e-05, + "loss": 0.6946, + "step": 504 + }, + { + "epoch": 0.779320987654321, + "grad_norm": 0.2472044825553894, + "learning_rate": 2.8157427908499746e-05, + "loss": 0.8173, + "step": 505 + }, + { + "epoch": 0.7808641975308642, + "grad_norm": 0.18958550691604614, + "learning_rate": 2.7784273423147967e-05, + "loss": 0.1577, + "step": 506 + }, + { + "epoch": 0.7824074074074074, + "grad_norm": 0.14402198791503906, + "learning_rate": 2.7413208738104868e-05, + "loss": 0.1015, + "step": 507 + }, + { + "epoch": 0.7839506172839507, + "grad_norm": 0.219883531332016, + "learning_rate": 2.7044244591351232e-05, + "loss": 0.6058, + "step": 508 + }, + { + "epoch": 0.7854938271604939, + "grad_norm": 0.21849100291728973, + "learning_rate": 2.6677391660082095e-05, + "loss": 0.3994, + "step": 509 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.28365975618362427, + "learning_rate": 2.6312660560397273e-05, + "loss": 0.6389, + "step": 510 + }, + { + "epoch": 0.7885802469135802, + "grad_norm": 0.24026718735694885, + "learning_rate": 2.5950061846994523e-05, + "loss": 0.4837, + "step": 511 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.28315937519073486, + "learning_rate": 2.5589606012863963e-05, + "loss": 0.5397, + "step": 512 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.15122969448566437, + "learning_rate": 2.5231303488984302e-05, + "loss": 0.1659, + "step": 513 + }, + { + "epoch": 0.7932098765432098, + "grad_norm": 0.21417337656021118, + "learning_rate": 2.4875164644021342e-05, + "loss": 0.1563, + "step": 514 + }, + { + "epoch": 0.7947530864197531, + "grad_norm": 0.20191837847232819, + "learning_rate": 2.4521199784027437e-05, + "loss": 0.3679, + "step": 515 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.3548601269721985, + "learning_rate": 2.4169419152143768e-05, + "loss": 0.3196, + "step": 516 + }, + { + "epoch": 0.7978395061728395, + "grad_norm": 0.21967338025569916, + "learning_rate": 2.381983292830343e-05, + "loss": 0.5358, + "step": 517 + }, + { + "epoch": 0.7993827160493827, + "grad_norm": 0.4001113176345825, + "learning_rate": 2.3472451228937253e-05, + "loss": 0.5449, + "step": 518 + }, + { + "epoch": 0.8009259259259259, + "grad_norm": 0.2046147882938385, + "learning_rate": 2.3127284106680812e-05, + "loss": 0.3095, + "step": 519 + }, + { + "epoch": 0.8024691358024691, + "grad_norm": 0.5111254453659058, + "learning_rate": 2.2784341550083576e-05, + "loss": 0.4179, + "step": 520 + }, + { + "epoch": 0.8040123456790124, + "grad_norm": 0.2834477722644806, + "learning_rate": 2.2443633483319927e-05, + "loss": 0.5246, + "step": 521 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.5373508930206299, + "learning_rate": 2.210516976590179e-05, + "loss": 0.2949, + "step": 522 + }, + { + "epoch": 0.8070987654320988, + "grad_norm": 0.20721107721328735, + "learning_rate": 2.1768960192393605e-05, + "loss": 0.3177, + "step": 523 + }, + { + "epoch": 0.808641975308642, + "grad_norm": 0.3565925061702728, + "learning_rate": 2.1435014492128547e-05, + "loss": 0.5407, + "step": 524 + }, + { + "epoch": 0.8101851851851852, + "grad_norm": 0.30221205949783325, + "learning_rate": 2.1103342328927332e-05, + "loss": 0.4413, + "step": 525 + }, + { + "epoch": 0.8117283950617284, + "grad_norm": 0.13516120612621307, + "learning_rate": 2.0773953300818204e-05, + "loss": 0.1066, + "step": 526 + }, + { + "epoch": 0.8132716049382716, + "grad_norm": 0.17537999153137207, + "learning_rate": 2.0446856939759472e-05, + "loss": 0.3488, + "step": 527 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.14302706718444824, + "learning_rate": 2.0122062711363532e-05, + "loss": 0.2962, + "step": 528 + }, + { + "epoch": 0.816358024691358, + "grad_norm": 0.38475722074508667, + "learning_rate": 1.979958001462291e-05, + "loss": 1.0018, + "step": 529 + }, + { + "epoch": 0.8179012345679012, + "grad_norm": 0.2256091684103012, + "learning_rate": 1.947941818163851e-05, + "loss": 0.6261, + "step": 530 + }, + { + "epoch": 0.8194444444444444, + "grad_norm": 0.25418537855148315, + "learning_rate": 1.9161586477349146e-05, + "loss": 0.8302, + "step": 531 + }, + { + "epoch": 0.8209876543209876, + "grad_norm": 0.2072589248418808, + "learning_rate": 1.8846094099263912e-05, + "loss": 0.3685, + "step": 532 + }, + { + "epoch": 0.8225308641975309, + "grad_norm": 0.25289809703826904, + "learning_rate": 1.8532950177195562e-05, + "loss": 0.7993, + "step": 533 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.2528473734855652, + "learning_rate": 1.8222163772996747e-05, + "loss": 0.2886, + "step": 534 + }, + { + "epoch": 0.8256172839506173, + "grad_norm": 0.25779202580451965, + "learning_rate": 1.7913743880297363e-05, + "loss": 0.4408, + "step": 535 + }, + { + "epoch": 0.8271604938271605, + "grad_norm": 0.24646838009357452, + "learning_rate": 1.7607699424244585e-05, + "loss": 0.7013, + "step": 536 + }, + { + "epoch": 0.8287037037037037, + "grad_norm": 0.20384298264980316, + "learning_rate": 1.73040392612445e-05, + "loss": 0.3492, + "step": 537 + }, + { + "epoch": 0.8302469135802469, + "grad_norm": 0.2533794045448303, + "learning_rate": 1.7002772178705716e-05, + "loss": 0.8325, + "step": 538 + }, + { + "epoch": 0.8317901234567902, + "grad_norm": 0.27576497197151184, + "learning_rate": 1.670390689478535e-05, + "loss": 0.915, + "step": 539 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.11635521054267883, + "learning_rate": 1.6407452058136296e-05, + "loss": 0.0965, + "step": 540 + }, + { + "epoch": 0.8348765432098766, + "grad_norm": 0.28577181696891785, + "learning_rate": 1.6113416247657476e-05, + "loss": 0.9393, + "step": 541 + }, + { + "epoch": 0.8364197530864198, + "grad_norm": 0.21558605134487152, + "learning_rate": 1.582180797224507e-05, + "loss": 0.6051, + "step": 542 + }, + { + "epoch": 0.8379629629629629, + "grad_norm": 0.19419488310813904, + "learning_rate": 1.553263567054668e-05, + "loss": 0.3612, + "step": 543 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 0.1557004600763321, + "learning_rate": 1.5245907710716911e-05, + "loss": 0.1561, + "step": 544 + }, + { + "epoch": 0.8410493827160493, + "grad_norm": 0.17826461791992188, + "learning_rate": 1.4961632390175229e-05, + "loss": 0.5126, + "step": 545 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.288781076669693, + "learning_rate": 1.4679817935366013e-05, + "loss": 0.7864, + "step": 546 + }, + { + "epoch": 0.8441358024691358, + "grad_norm": 0.21908049285411835, + "learning_rate": 1.4400472501520223e-05, + "loss": 0.3944, + "step": 547 + }, + { + "epoch": 0.845679012345679, + "grad_norm": 0.23368942737579346, + "learning_rate": 1.4123604172419713e-05, + "loss": 0.488, + "step": 548 + }, + { + "epoch": 0.8472222222222222, + "grad_norm": 0.231543630361557, + "learning_rate": 1.3849220960162957e-05, + "loss": 0.454, + "step": 549 + }, + { + "epoch": 0.8487654320987654, + "grad_norm": 0.27065229415893555, + "learning_rate": 1.3577330804933563e-05, + "loss": 0.4608, + "step": 550 + }, + { + "epoch": 0.8503086419753086, + "grad_norm": 0.19953985512256622, + "learning_rate": 1.3307941574770155e-05, + "loss": 0.3485, + "step": 551 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.20189432799816132, + "learning_rate": 1.30410610653389e-05, + "loss": 0.3875, + "step": 552 + }, + { + "epoch": 0.8533950617283951, + "grad_norm": 0.18729227781295776, + "learning_rate": 1.2776696999707905e-05, + "loss": 0.4264, + "step": 553 + }, + { + "epoch": 0.8549382716049383, + "grad_norm": 0.17767773568630219, + "learning_rate": 1.2514857028123529e-05, + "loss": 0.239, + "step": 554 + }, + { + "epoch": 0.8564814814814815, + "grad_norm": 0.27451393008232117, + "learning_rate": 1.2255548727789334e-05, + "loss": 1.1173, + "step": 555 + }, + { + "epoch": 0.8580246913580247, + "grad_norm": 0.16227801144123077, + "learning_rate": 1.1998779602646437e-05, + "loss": 0.335, + "step": 556 + }, + { + "epoch": 0.8595679012345679, + "grad_norm": 0.2681660056114197, + "learning_rate": 1.1744557083156704e-05, + "loss": 0.6286, + "step": 557 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.2587696611881256, + "learning_rate": 1.149288852608743e-05, + "loss": 0.8631, + "step": 558 + }, + { + "epoch": 0.8626543209876543, + "grad_norm": 0.2387055605649948, + "learning_rate": 1.1243781214298655e-05, + "loss": 0.4798, + "step": 559 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 0.19629858434200287, + "learning_rate": 1.0997242356532334e-05, + "loss": 0.1778, + "step": 560 + }, + { + "epoch": 0.8657407407407407, + "grad_norm": 0.20090347528457642, + "learning_rate": 1.0753279087203716e-05, + "loss": 0.1647, + "step": 561 + }, + { + "epoch": 0.8672839506172839, + "grad_norm": 0.25092846155166626, + "learning_rate": 1.0511898466194903e-05, + "loss": 0.3911, + "step": 562 + }, + { + "epoch": 0.8688271604938271, + "grad_norm": 0.23149822652339935, + "learning_rate": 1.0273107478650511e-05, + "loss": 0.6488, + "step": 563 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.14098748564720154, + "learning_rate": 1.0036913034775674e-05, + "loss": 0.1005, + "step": 564 + }, + { + "epoch": 0.8719135802469136, + "grad_norm": 0.1766311526298523, + "learning_rate": 9.803321969635815e-06, + "loss": 0.1402, + "step": 565 + }, + { + "epoch": 0.8734567901234568, + "grad_norm": 0.2351858913898468, + "learning_rate": 9.572341042959177e-06, + "loss": 0.4199, + "step": 566 + }, + { + "epoch": 0.875, + "grad_norm": 0.14286702871322632, + "learning_rate": 9.343976938940869e-06, + "loss": 0.1752, + "step": 567 + }, + { + "epoch": 0.8765432098765432, + "grad_norm": 0.19101577997207642, + "learning_rate": 9.118236266049707e-06, + "loss": 0.1785, + "step": 568 + }, + { + "epoch": 0.8780864197530864, + "grad_norm": 0.12843865156173706, + "learning_rate": 8.89512555683687e-06, + "loss": 0.3428, + "step": 569 + }, + { + "epoch": 0.8796296296296297, + "grad_norm": 0.2435745745897293, + "learning_rate": 8.67465126774677e-06, + "loss": 0.601, + "step": 570 + }, + { + "epoch": 0.8811728395061729, + "grad_norm": 0.2568911015987396, + "learning_rate": 8.456819778930437e-06, + "loss": 0.6286, + "step": 571 + }, + { + "epoch": 0.8827160493827161, + "grad_norm": 0.20903518795967102, + "learning_rate": 8.24163739406062e-06, + "loss": 0.3462, + "step": 572 + }, + { + "epoch": 0.8842592592592593, + "grad_norm": 0.17575101554393768, + "learning_rate": 8.029110340149648e-06, + "loss": 0.1926, + "step": 573 + }, + { + "epoch": 0.8858024691358025, + "grad_norm": 0.22315870225429535, + "learning_rate": 7.81924476736896e-06, + "loss": 0.1337, + "step": 574 + }, + { + "epoch": 0.8873456790123457, + "grad_norm": 0.48400798439979553, + "learning_rate": 7.612046748871327e-06, + "loss": 0.524, + "step": 575 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.25745725631713867, + "learning_rate": 7.40752228061502e-06, + "loss": 0.4573, + "step": 576 + }, + { + "epoch": 0.8904320987654321, + "grad_norm": 0.1676391363143921, + "learning_rate": 7.205677281190304e-06, + "loss": 0.432, + "step": 577 + }, + { + "epoch": 0.8919753086419753, + "grad_norm": 0.5424337983131409, + "learning_rate": 7.0065175916482095e-06, + "loss": 0.7182, + "step": 578 + }, + { + "epoch": 0.8935185185185185, + "grad_norm": 0.34882697463035583, + "learning_rate": 6.810048975331373e-06, + "loss": 0.6585, + "step": 579 + }, + { + "epoch": 0.8950617283950617, + "grad_norm": 0.10848943889141083, + "learning_rate": 6.616277117707492e-06, + "loss": 0.0957, + "step": 580 + }, + { + "epoch": 0.8966049382716049, + "grad_norm": 0.2503252327442169, + "learning_rate": 6.4252076262045104e-06, + "loss": 0.2147, + "step": 581 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.17907798290252686, + "learning_rate": 6.236846030048604e-06, + "loss": 0.4813, + "step": 582 + }, + { + "epoch": 0.8996913580246914, + "grad_norm": 0.2369259148836136, + "learning_rate": 6.05119778010399e-06, + "loss": 0.4512, + "step": 583 + }, + { + "epoch": 0.9012345679012346, + "grad_norm": 0.2982903718948364, + "learning_rate": 5.868268248715292e-06, + "loss": 0.4739, + "step": 584 + }, + { + "epoch": 0.9027777777777778, + "grad_norm": 0.23582804203033447, + "learning_rate": 5.6880627295520576e-06, + "loss": 0.5283, + "step": 585 + }, + { + "epoch": 0.904320987654321, + "grad_norm": 0.33620336651802063, + "learning_rate": 5.5105864374554785e-06, + "loss": 0.7431, + "step": 586 + }, + { + "epoch": 0.9058641975308642, + "grad_norm": 0.23844951391220093, + "learning_rate": 5.335844508287679e-06, + "loss": 0.6828, + "step": 587 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.27209237217903137, + "learning_rate": 5.163841998782837e-06, + "loss": 0.4757, + "step": 588 + }, + { + "epoch": 0.9089506172839507, + "grad_norm": 0.2599264085292816, + "learning_rate": 4.9945838864011165e-06, + "loss": 0.3456, + "step": 589 + }, + { + "epoch": 0.9104938271604939, + "grad_norm": 0.4429294466972351, + "learning_rate": 4.828075069184379e-06, + "loss": 0.7822, + "step": 590 + }, + { + "epoch": 0.9120370370370371, + "grad_norm": 0.2338528037071228, + "learning_rate": 4.664320365614627e-06, + "loss": 0.5011, + "step": 591 + }, + { + "epoch": 0.9135802469135802, + "grad_norm": 0.2366255223751068, + "learning_rate": 4.503324514474483e-06, + "loss": 0.6117, + "step": 592 + }, + { + "epoch": 0.9151234567901234, + "grad_norm": 0.27656567096710205, + "learning_rate": 4.345092174710063e-06, + "loss": 0.8685, + "step": 593 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.24723798036575317, + "learning_rate": 4.189627925296202e-06, + "loss": 0.4732, + "step": 594 + }, + { + "epoch": 0.9182098765432098, + "grad_norm": 0.24122512340545654, + "learning_rate": 4.036936265103819e-06, + "loss": 0.3992, + "step": 595 + }, + { + "epoch": 0.9197530864197531, + "grad_norm": 0.2776406407356262, + "learning_rate": 3.887021612769936e-06, + "loss": 0.4025, + "step": 596 + }, + { + "epoch": 0.9212962962962963, + "grad_norm": 0.24000807106494904, + "learning_rate": 3.739888306569572e-06, + "loss": 0.338, + "step": 597 + }, + { + "epoch": 0.9228395061728395, + "grad_norm": 0.28136980533599854, + "learning_rate": 3.595540604290437e-06, + "loss": 0.4468, + "step": 598 + }, + { + "epoch": 0.9243827160493827, + "grad_norm": 0.4682718813419342, + "learning_rate": 3.453982683109491e-06, + "loss": 0.5882, + "step": 599 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.3717178702354431, + "learning_rate": 3.3152186394722505e-06, + "loss": 0.7905, + "step": 600 + }, + { + "epoch": 0.9274691358024691, + "grad_norm": 0.36897924542427063, + "learning_rate": 3.179252488974138e-06, + "loss": 0.3149, + "step": 601 + }, + { + "epoch": 0.9290123456790124, + "grad_norm": 0.22970488667488098, + "learning_rate": 3.0460881662442763e-06, + "loss": 0.385, + "step": 602 + }, + { + "epoch": 0.9305555555555556, + "grad_norm": 0.1425536572933197, + "learning_rate": 2.915729524831745e-06, + "loss": 0.1535, + "step": 603 + }, + { + "epoch": 0.9320987654320988, + "grad_norm": 0.2239391803741455, + "learning_rate": 2.7881803370938597e-06, + "loss": 0.1139, + "step": 604 + }, + { + "epoch": 0.933641975308642, + "grad_norm": 0.19181783497333527, + "learning_rate": 2.6634442940872585e-06, + "loss": 0.2644, + "step": 605 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.5377497673034668, + "learning_rate": 2.5415250054608208e-06, + "loss": 0.5667, + "step": 606 + }, + { + "epoch": 0.9367283950617284, + "grad_norm": 0.23963221907615662, + "learning_rate": 2.422425999351463e-06, + "loss": 0.4221, + "step": 607 + }, + { + "epoch": 0.9382716049382716, + "grad_norm": 0.16822920739650726, + "learning_rate": 2.30615072228183e-06, + "loss": 0.3039, + "step": 608 + }, + { + "epoch": 0.9398148148148148, + "grad_norm": 0.16344808042049408, + "learning_rate": 2.192702539060709e-06, + "loss": 0.3314, + "step": 609 + }, + { + "epoch": 0.941358024691358, + "grad_norm": 0.5780327916145325, + "learning_rate": 2.082084732685574e-06, + "loss": 0.4843, + "step": 610 + }, + { + "epoch": 0.9429012345679012, + "grad_norm": 0.1630149781703949, + "learning_rate": 1.974300504247595e-06, + "loss": 0.3143, + "step": 611 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.34857067465782166, + "learning_rate": 1.869352972839067e-06, + "loss": 0.4956, + "step": 612 + }, + { + "epoch": 0.9459876543209876, + "grad_norm": 0.23466090857982635, + "learning_rate": 1.767245175463028e-06, + "loss": 0.4269, + "step": 613 + }, + { + "epoch": 0.9475308641975309, + "grad_norm": 0.4323621094226837, + "learning_rate": 1.667980066945507e-06, + "loss": 0.6628, + "step": 614 + }, + { + "epoch": 0.9490740740740741, + "grad_norm": 0.26573505997657776, + "learning_rate": 1.5715605198499128e-06, + "loss": 0.3901, + "step": 615 + }, + { + "epoch": 0.9506172839506173, + "grad_norm": 0.2463800609111786, + "learning_rate": 1.4779893243939359e-06, + "loss": 0.4483, + "step": 616 + }, + { + "epoch": 0.9521604938271605, + "grad_norm": 0.2444482296705246, + "learning_rate": 1.387269188368867e-06, + "loss": 0.4454, + "step": 617 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.3542223572731018, + "learning_rate": 1.2994027370611173e-06, + "loss": 0.6263, + "step": 618 + }, + { + "epoch": 0.9552469135802469, + "grad_norm": 0.32985541224479675, + "learning_rate": 1.2143925131763879e-06, + "loss": 0.8199, + "step": 619 + }, + { + "epoch": 0.9567901234567902, + "grad_norm": 0.20038342475891113, + "learning_rate": 1.1322409767659525e-06, + "loss": 0.4273, + "step": 620 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.20644037425518036, + "learning_rate": 1.0529505051555922e-06, + "loss": 0.1741, + "step": 621 + }, + { + "epoch": 0.9598765432098766, + "grad_norm": 0.2300005555152893, + "learning_rate": 9.765233928766493e-07, + "loss": 0.4326, + "step": 622 + }, + { + "epoch": 0.9614197530864198, + "grad_norm": 0.2371928095817566, + "learning_rate": 9.029618515998373e-07, + "loss": 0.1923, + "step": 623 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 1.8070638179779053, + "learning_rate": 8.322680100710023e-07, + "loss": 0.5084, + "step": 624 + }, + { + "epoch": 0.9645061728395061, + "grad_norm": 0.22745661437511444, + "learning_rate": 7.644439140497062e-07, + "loss": 0.1559, + "step": 625 + }, + { + "epoch": 0.9660493827160493, + "grad_norm": 0.22576944530010223, + "learning_rate": 6.994915262499513e-07, + "loss": 0.4001, + "step": 626 + }, + { + "epoch": 0.9675925925925926, + "grad_norm": 0.2861001491546631, + "learning_rate": 6.374127262833484e-07, + "loss": 0.2301, + "step": 627 + }, + { + "epoch": 0.9691358024691358, + "grad_norm": 0.21590173244476318, + "learning_rate": 5.782093106048159e-07, + "loss": 0.2035, + "step": 628 + }, + { + "epoch": 0.970679012345679, + "grad_norm": 0.1466875821352005, + "learning_rate": 5.218829924604873e-07, + "loss": 0.1674, + "step": 629 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.3603959381580353, + "learning_rate": 4.684354018382409e-07, + "loss": 0.6233, + "step": 630 + }, + { + "epoch": 0.9737654320987654, + "grad_norm": 0.3605661690235138, + "learning_rate": 4.178680854204475e-07, + "loss": 0.5026, + "step": 631 + }, + { + "epoch": 0.9753086419753086, + "grad_norm": 0.2252633422613144, + "learning_rate": 3.701825065392184e-07, + "loss": 0.4773, + "step": 632 + }, + { + "epoch": 0.9768518518518519, + "grad_norm": 0.24521510303020477, + "learning_rate": 3.253800451341382e-07, + "loss": 0.4941, + "step": 633 + }, + { + "epoch": 0.9783950617283951, + "grad_norm": 0.1830521523952484, + "learning_rate": 2.8346199771221994e-07, + "loss": 0.3713, + "step": 634 + }, + { + "epoch": 0.9799382716049383, + "grad_norm": 0.22394660115242004, + "learning_rate": 2.4442957731048985e-07, + "loss": 0.4073, + "step": 635 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.20932500064373016, + "learning_rate": 2.082839134607828e-07, + "loss": 0.2292, + "step": 636 + }, + { + "epoch": 0.9830246913580247, + "grad_norm": 0.18378764390945435, + "learning_rate": 1.7502605215715672e-07, + "loss": 0.323, + "step": 637 + }, + { + "epoch": 0.9845679012345679, + "grad_norm": 0.20239287614822388, + "learning_rate": 1.446569558255395e-07, + "loss": 0.3659, + "step": 638 + }, + { + "epoch": 0.9861111111111112, + "grad_norm": 0.16685689985752106, + "learning_rate": 1.1717750329595101e-07, + "loss": 0.384, + "step": 639 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.28053098917007446, + "learning_rate": 9.258848977700129e-08, + "loss": 0.4211, + "step": 640 + }, + { + "epoch": 0.9891975308641975, + "grad_norm": 0.16193103790283203, + "learning_rate": 7.089062683292014e-08, + "loss": 0.3354, + "step": 641 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.10545664280653, + "learning_rate": 5.208454236296234e-08, + "loss": 0.1249, + "step": 642 + }, + { + "epoch": 0.9922839506172839, + "grad_norm": 0.2873446047306061, + "learning_rate": 3.617078058322232e-08, + "loss": 0.6613, + "step": 643 + }, + { + "epoch": 0.9938271604938271, + "grad_norm": 0.27240923047065735, + "learning_rate": 2.3149802010913323e-08, + "loss": 0.854, + "step": 644 + }, + { + "epoch": 0.9953703703703703, + "grad_norm": 0.3831271231174469, + "learning_rate": 1.3021983451000364e-08, + "loss": 0.3838, + "step": 645 + }, + { + "epoch": 0.9969135802469136, + "grad_norm": 0.24650080502033234, + "learning_rate": 5.78761798534222e-09, + "loss": 0.175, + "step": 646 + }, + { + "epoch": 0.9984567901234568, + "grad_norm": 0.2070116400718689, + "learning_rate": 1.4469149641538337e-09, + "loss": 0.4032, + "step": 647 + }, + { + "epoch": 1.0, + "grad_norm": 0.24193182587623596, + "learning_rate": 0.0, + "loss": 0.7447, + "step": 648 + } + ], + "logging_steps": 1, + "max_steps": 648, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.620525789529495e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}